mirror of
https://gitlab.com/scemama/qp_plugins_scemama.git
synced 2025-01-05 02:48:42 +01:00
Allocate managed memory only if full
This commit is contained in:
parent
a08342f60c
commit
29903111f4
@ -138,15 +138,15 @@ void compute_h_oo_chol_gpu(gpu_data* data, int igpu)
|
|||||||
double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;
|
double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;
|
||||||
|
|
||||||
double* d_tau_kau;
|
double* d_tau_kau;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_tau_kau, cholesky_mo_num*nV*nO * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_tau_kau, cholesky_mo_num*nV*nO * sizeof(double));
|
||||||
assert(cudaStat == cudaSuccess);
|
assert(cudaStat == cudaSuccess);
|
||||||
|
|
||||||
double* d_tmp_ovv;
|
double* d_tmp_ovv;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_tmp_ovv, nO*nV*nV * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_tmp_ovv, nO*nV*nV * sizeof(double));
|
||||||
assert(cudaStat == cudaSuccess);
|
assert(cudaStat == cudaSuccess);
|
||||||
|
|
||||||
double* d_tmp_vov;
|
double* d_tmp_vov;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_tmp_vov, nV*nO*nV * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_tmp_vov, nV*nO*nV * sizeof(double));
|
||||||
assert(cudaStat == cudaSuccess);
|
assert(cudaStat == cudaSuccess);
|
||||||
|
|
||||||
for (size_t i=0 ; i<nV ; ++i) {
|
for (size_t i=0 ; i<nV ; ++i) {
|
||||||
@ -239,7 +239,7 @@ void compute_h_vo_chol_gpu(gpu_data* data, int igpu)
|
|||||||
cublasDcopy(handle, nV*nO, d_cc_space_f_vo, 1, d_H_vo, 1);
|
cublasDcopy(handle, nV*nO, d_cc_space_f_vo, 1, d_H_vo, 1);
|
||||||
|
|
||||||
double* d_tmp_k;
|
double* d_tmp_k;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_tmp_k, cholesky_mo_num * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_tmp_k, cholesky_mo_num * sizeof(double));
|
||||||
assert(cudaStat == cudaSuccess);
|
assert(cudaStat == cudaSuccess);
|
||||||
|
|
||||||
alpha = 2.0;
|
alpha = 2.0;
|
||||||
@ -261,7 +261,7 @@ void compute_h_vo_chol_gpu(gpu_data* data, int igpu)
|
|||||||
cudaFree(d_tmp_k);
|
cudaFree(d_tmp_k);
|
||||||
|
|
||||||
double* d_tmp;
|
double* d_tmp;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_tmp, cholesky_mo_num*nO*nO * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_tmp, cholesky_mo_num*nO*nO * sizeof(double));
|
||||||
assert(cudaStat == cudaSuccess);
|
assert(cudaStat == cudaSuccess);
|
||||||
|
|
||||||
alpha = 1.0;
|
alpha = 1.0;
|
||||||
@ -273,7 +273,7 @@ void compute_h_vo_chol_gpu(gpu_data* data, int igpu)
|
|||||||
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
|
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
|
||||||
|
|
||||||
double* d_tmp2;
|
double* d_tmp2;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_tmp2, cholesky_mo_num*nO*nO * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_tmp2, cholesky_mo_num*nO*nO * sizeof(double));
|
||||||
assert(cudaStat == cudaSuccess);
|
assert(cudaStat == cudaSuccess);
|
||||||
|
|
||||||
for (size_t i=0 ; i<nV ; ++i) {
|
for (size_t i=0 ; i<nV ; ++i) {
|
||||||
@ -346,11 +346,11 @@ void compute_h_vv_chol_gpu(gpu_data* data, int igpu)
|
|||||||
double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;
|
double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;
|
||||||
|
|
||||||
double* d_tau_kia;
|
double* d_tau_kia;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_tau_kia, cholesky_mo_num*nO*nV * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_tau_kia, cholesky_mo_num*nO*nV * sizeof(double));
|
||||||
assert(cudaStat == cudaSuccess);
|
assert(cudaStat == cudaSuccess);
|
||||||
|
|
||||||
double* d_tmp_oov;
|
double* d_tmp_oov;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_tmp_oov, nO*nO*nV * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_tmp_oov, nO*nO*nV * sizeof(double));
|
||||||
assert(cudaStat == cudaSuccess);
|
assert(cudaStat == cudaSuccess);
|
||||||
|
|
||||||
alpha = 1.0;
|
alpha = 1.0;
|
||||||
@ -435,7 +435,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
|
|
||||||
double* d_r2;
|
double* d_r2;
|
||||||
lda = nO * nO;
|
lda = nO * nO;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_r2, lda * nV * nV * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_r2, lda * nV * nV * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
cudaMemset(d_r2, 0, nO*nO*nV*nV*sizeof(double));
|
cudaMemset(d_r2, 0, nO*nO*nV*nV*sizeof(double));
|
||||||
memset(r2, 0, nO*nO*nV*nV*sizeof(double));
|
memset(r2, 0, nO*nO*nV*nV*sizeof(double));
|
||||||
@ -460,7 +460,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
double* d_H_vv = data[igpu].H_vv;
|
double* d_H_vv = data[igpu].H_vv;
|
||||||
|
|
||||||
double* d_K1;
|
double* d_K1;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_K1, nO*nV*nO*nV * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_K1, nO*nV*nO*nV * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
#pragma omp sections
|
#pragma omp sections
|
||||||
@ -469,7 +469,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
#pragma omp section
|
#pragma omp section
|
||||||
{
|
{
|
||||||
double* d_J1;
|
double* d_J1;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_J1, nO*nV*nV*nO * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_J1, nO*nV*nV*nO * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
alpha = 1.0;
|
alpha = 1.0;
|
||||||
@ -481,7 +481,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
|
|
||||||
|
|
||||||
double* d_X_ovoo;
|
double* d_X_ovoo;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_X_ovoo, nO*nV*nO*nO * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_X_ovoo, nO*nV*nO*nO * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
alpha = 0.0;
|
alpha = 0.0;
|
||||||
beta = 1.0;
|
beta = 1.0;
|
||||||
@ -504,7 +504,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
|
|
||||||
|
|
||||||
double* d_Y_ovov;
|
double* d_Y_ovov;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_Y_ovov, nO*nV*nO*nV * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_Y_ovov, nO*nV*nO*nV * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
alpha = 1.0;
|
alpha = 1.0;
|
||||||
@ -536,7 +536,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
cublasSetStream(handle, NULL);
|
cublasSetStream(handle, NULL);
|
||||||
|
|
||||||
double* d_tmp_cc;
|
double* d_tmp_cc;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_tmp_cc, cholesky_mo_num*nV*nO * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_tmp_cc, cholesky_mo_num*nV*nO * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
alpha = 1.0;
|
alpha = 1.0;
|
||||||
@ -548,7 +548,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
|
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
|
||||||
|
|
||||||
double* d_J1_tmp;
|
double* d_J1_tmp;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_J1_tmp, nV*nO*nV*nO * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_J1_tmp, nV*nO*nV*nO * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
alpha = 1.0;
|
alpha = 1.0;
|
||||||
@ -580,7 +580,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
cudaFree(d_J1_tmp);
|
cudaFree(d_J1_tmp);
|
||||||
|
|
||||||
double* d_X_voov;
|
double* d_X_voov;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_X_voov, nV*nO*nO*nV * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_X_voov, nV*nO*nO*nV * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
for (size_t i=0 ; i<nV ; ++i) {
|
for (size_t i=0 ; i<nV ; ++i) {
|
||||||
@ -614,7 +614,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
cublasSetStream(handle, NULL);
|
cublasSetStream(handle, NULL);
|
||||||
|
|
||||||
double* d_Z_ovvo;
|
double* d_Z_ovvo;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_Z_ovvo, nO*nV*nV*nO * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_Z_ovvo, nO*nV*nV*nO * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
alpha = -1.0;
|
alpha = -1.0;
|
||||||
@ -643,7 +643,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
}
|
}
|
||||||
|
|
||||||
double* d_Y_vovo;
|
double* d_Y_vovo;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_Y_vovo, nV*nO*nV*nO * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_Y_vovo, nV*nO*nV*nO * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
alpha = 1.0;
|
alpha = 1.0;
|
||||||
@ -659,7 +659,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
}
|
}
|
||||||
|
|
||||||
double* d_X_ovvo;
|
double* d_X_ovvo;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_X_ovvo, nO*nV*nV*nO * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_X_ovvo, nO*nV*nV*nO * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
alpha = 1.0;
|
alpha = 1.0;
|
||||||
@ -734,11 +734,11 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
|
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
|
||||||
|
|
||||||
double* d_X;
|
double* d_X;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_X, nV*nO*nV*nO * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_X, nV*nO*nV*nO * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
double* d_Y;
|
double* d_Y;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_Y, nO*nV*nV*nO * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_Y, nO*nV*nV*nO * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
for (size_t i=0 ; i<nV ; ++i) {
|
for (size_t i=0 ; i<nV ; ++i) {
|
||||||
@ -773,7 +773,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
cublasSetStream(handle, NULL);
|
cublasSetStream(handle, NULL);
|
||||||
|
|
||||||
double* d_Z;
|
double* d_Z;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_Z, nO*nV*nV*nO * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_Z, nO*nV*nV*nO * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
alpha = 1.0;
|
alpha = 1.0;
|
||||||
@ -789,7 +789,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
|
|
||||||
|
|
||||||
double* d_t1v;
|
double* d_t1v;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_t1v, cholesky_mo_num*nO*nO * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_t1v, cholesky_mo_num*nO*nO * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
alpha = 1.0;
|
alpha = 1.0;
|
||||||
@ -801,7 +801,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
|
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
|
||||||
|
|
||||||
double* d_K1tmp;
|
double* d_K1tmp;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_K1tmp, nO*nO*nV*nV * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_K1tmp, nO*nO*nV*nV * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
alpha = 1.0;
|
alpha = 1.0;
|
||||||
@ -854,7 +854,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
#pragma omp section
|
#pragma omp section
|
||||||
{
|
{
|
||||||
double* d_Y_oooo;
|
double* d_Y_oooo;
|
||||||
cudaStat = cudaMallocManaged((void**)&d_Y_oooo, nO*nO*nO*nO*sizeof(double), 1);
|
cudaStat = gpu_malloc((void**)&d_Y_oooo, nO*nO*nO*nO*sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
alpha = 1.0;
|
alpha = 1.0;
|
||||||
@ -866,7 +866,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
|
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
|
||||||
|
|
||||||
double* d_A1;
|
double* d_A1;
|
||||||
cudaStat = cudaMallocManaged((void**)&d_A1, nO*nO*nO*nO*sizeof(double), 1);
|
cudaStat = gpu_malloc((void**)&d_A1, nO*nO*nO*nO*sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
alpha = 1.0;
|
alpha = 1.0;
|
||||||
@ -918,7 +918,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
#pragma omp section
|
#pragma omp section
|
||||||
{
|
{
|
||||||
double* d_g_vir;
|
double* d_g_vir;
|
||||||
cudaStat = cudaMallocManaged((void**)&d_g_vir, nV*nV*sizeof(double), 1);
|
cudaStat = gpu_malloc((void**)&d_g_vir, nV*nV*sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasDcopy(handle, nV*nV, d_H_vv, 1, d_g_vir, 1);
|
cublasDcopy(handle, nV*nV, d_H_vv, 1, d_g_vir, 1);
|
||||||
|
|
||||||
@ -931,7 +931,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
|
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
|
||||||
|
|
||||||
double* d_tmp_k;
|
double* d_tmp_k;
|
||||||
cudaStat = cudaMallocManaged((void**)&d_tmp_k, cholesky_mo_num*sizeof(double), 1);
|
cudaStat = gpu_malloc((void**)&d_tmp_k, cholesky_mo_num*sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
alpha = 1.0;
|
alpha = 1.0;
|
||||||
beta = 0.0;
|
beta = 0.0;
|
||||||
@ -951,7 +951,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
cudaFree(d_tmp_k);
|
cudaFree(d_tmp_k);
|
||||||
|
|
||||||
double* d_tmp_vo;
|
double* d_tmp_vo;
|
||||||
cudaStat = cudaMallocManaged((void**)&d_tmp_vo, cholesky_mo_num*nV*nO*sizeof(double), 1);
|
cudaStat = gpu_malloc((void**)&d_tmp_vo, cholesky_mo_num*nV*nO*sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
alpha = 1.0;
|
alpha = 1.0;
|
||||||
beta = 0.0;
|
beta = 0.0;
|
||||||
@ -962,7 +962,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
|
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
|
||||||
|
|
||||||
double* d_tmp_vo2;
|
double* d_tmp_vo2;
|
||||||
cudaStat = cudaMallocManaged((void**)&d_tmp_vo2, cholesky_mo_num*nV*nO*sizeof(double), 1);
|
cudaStat = gpu_malloc((void**)&d_tmp_vo2, cholesky_mo_num*nV*nO*sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
for (size_t i=0 ; i<nO ; ++i) {
|
for (size_t i=0 ; i<nO ; ++i) {
|
||||||
cudaStreamCreate(&(stream[i]));
|
cudaStreamCreate(&(stream[i]));
|
||||||
@ -992,7 +992,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
cudaFree(d_tmp_vo2);
|
cudaFree(d_tmp_vo2);
|
||||||
|
|
||||||
double* d_Y_oovv;
|
double* d_Y_oovv;
|
||||||
cudaStat = cudaMallocManaged((void**)&d_Y_oovv, nO*nO*nV*nV*sizeof(double), 1);
|
cudaStat = gpu_malloc((void**)&d_Y_oovv, nO*nO*nV*nV*sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
alpha = 1.0;
|
alpha = 1.0;
|
||||||
beta = 0.0;
|
beta = 0.0;
|
||||||
@ -1037,12 +1037,12 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
{
|
{
|
||||||
double* d_g_occ;
|
double* d_g_occ;
|
||||||
lda = nO;
|
lda = nO;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_g_occ, nO*nO * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_g_occ, nO*nO * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasDcopy(handle, nO*nO, d_H_oo, 1, d_g_occ, 1);
|
cublasDcopy(handle, nO*nO, d_H_oo, 1, d_g_occ, 1);
|
||||||
|
|
||||||
double* d_X;
|
double* d_X;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_X, cholesky_mo_num*sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_X, cholesky_mo_num*sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
alpha = 2.0;
|
alpha = 2.0;
|
||||||
@ -1079,7 +1079,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
|
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
|
||||||
|
|
||||||
double* d_X_oovv;
|
double* d_X_oovv;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_X_oovv, nO*nO*nV*nV * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_X_oovv, nO*nO*nV*nV * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
alpha = 1.0;
|
alpha = 1.0;
|
||||||
@ -1132,11 +1132,11 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
#pragma omp section
|
#pragma omp section
|
||||||
{
|
{
|
||||||
double* d_X_vovv;
|
double* d_X_vovv;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_X_vovv, nV*nO*nV*BLOCK_SIZE * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_X_vovv, nV*nO*nV*BLOCK_SIZE * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
double* d_Y_oovv;
|
double* d_Y_oovv;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_Y_oovv, nO*nO*nV*nV * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_Y_oovv, nO*nO*nV*nV * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
for (size_t iblock=0 ; iblock<nV ; iblock += BLOCK_SIZE) {
|
for (size_t iblock=0 ; iblock<nV ; iblock += BLOCK_SIZE) {
|
||||||
@ -1199,7 +1199,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
#pragma omp section
|
#pragma omp section
|
||||||
{
|
{
|
||||||
double* d_tcc2;
|
double* d_tcc2;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_tcc2, cholesky_mo_num*nV*nO * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_tcc2, cholesky_mo_num*nV*nO * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
alpha = 1.0;
|
alpha = 1.0;
|
||||||
@ -1211,7 +1211,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
|
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
|
||||||
|
|
||||||
double* d_tcc;
|
double* d_tcc;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_tcc, cholesky_mo_num*nO*nV * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_tcc, cholesky_mo_num*nO*nV * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
alpha = 1.0;
|
alpha = 1.0;
|
||||||
@ -1223,7 +1223,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
|
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
|
||||||
|
|
||||||
double* d_X_ovvo;
|
double* d_X_ovvo;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_X_ovvo, nO*nV*nV*nO * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_X_ovvo, nO*nV*nV*nO * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
alpha = 1.0;
|
alpha = 1.0;
|
||||||
@ -1267,7 +1267,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
#pragma omp section
|
#pragma omp section
|
||||||
{
|
{
|
||||||
double* d_X_oovv;
|
double* d_X_oovv;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_X_oovv, nO*nO*nV*nV * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_X_oovv, nO*nO*nV*nV * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
alpha = 1.0;
|
alpha = 1.0;
|
||||||
@ -1298,7 +1298,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
}
|
}
|
||||||
|
|
||||||
double* d_X_vovo;
|
double* d_X_vovo;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_X_vovo, nV*nO*nV*nO * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_X_vovo, nV*nO*nV*nO * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
alpha = 0.0;
|
alpha = 0.0;
|
||||||
@ -1318,7 +1318,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
cublasSetStream(handle, NULL);
|
cublasSetStream(handle, NULL);
|
||||||
|
|
||||||
double* d_Y_oovo;
|
double* d_Y_oovo;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_Y_oovo, nO*nO*nV*nO * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_Y_oovo, nO*nO*nV*nO * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
alpha = 1.0;
|
alpha = 1.0;
|
||||||
@ -1384,12 +1384,12 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
{
|
{
|
||||||
double* d_J1;
|
double* d_J1;
|
||||||
lda = nO*nV;
|
lda = nO*nV;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_J1, nO*nV*nV*nO * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_J1, nO*nV*nV*nO * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(lda, nV*nO, sizeof(double), J1, lda, d_J1, lda);
|
cublasSetMatrix(lda, nV*nO, sizeof(double), J1, lda, d_J1, lda);
|
||||||
|
|
||||||
double* d_X_ovvo;
|
double* d_X_ovvo;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_X_ovvo, nO*nV*nV*nO * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_X_ovvo, nO*nV*nV*nO * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
for (size_t i=0 ; i<nV ; ++i) {
|
for (size_t i=0 ; i<nV ; ++i) {
|
||||||
@ -1413,7 +1413,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
cudaFree(d_J1);
|
cudaFree(d_J1);
|
||||||
|
|
||||||
double* d_Y_voov;
|
double* d_Y_voov;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_Y_voov, nV*nO*nO*nV * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_Y_voov, nV*nO*nO*nV * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
for (size_t i=0 ; i<nV ; ++i) {
|
for (size_t i=0 ; i<nV ; ++i) {
|
||||||
@ -1436,7 +1436,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
cublasSetStream(handle, NULL);
|
cublasSetStream(handle, NULL);
|
||||||
|
|
||||||
double* d_Z_ovov;
|
double* d_Z_ovov;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_Z_ovov, nO*nV*nO*nV * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_Z_ovov, nO*nV*nO*nV * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
alpha = 1.0;
|
alpha = 1.0;
|
||||||
@ -1481,11 +1481,11 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
#pragma omp section
|
#pragma omp section
|
||||||
{
|
{
|
||||||
double* d_X_ovov;
|
double* d_X_ovov;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_X_ovov, nO*nV*nO*nV * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_X_ovov, nO*nV*nO*nV * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
double* d_Y_ovov;
|
double* d_Y_ovov;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_Y_ovov, nO*nV*nO*nV * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_Y_ovov, nO*nV*nO*nV * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
for (size_t i=0 ; i<nV ; ++i) {
|
for (size_t i=0 ; i<nV ; ++i) {
|
||||||
@ -1518,7 +1518,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
cublasSetStream(handle, NULL);
|
cublasSetStream(handle, NULL);
|
||||||
|
|
||||||
double* d_Z_ovov;
|
double* d_Z_ovov;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_Z_ovov, nO*nV*nO*nV * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_Z_ovov, nO*nV*nO*nV * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
alpha = 1.0;
|
alpha = 1.0;
|
||||||
@ -1564,11 +1564,11 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
#pragma omp section
|
#pragma omp section
|
||||||
{
|
{
|
||||||
double* d_X_ovov;
|
double* d_X_ovov;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_X_ovov, nO*nV*nO*nV * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_X_ovov, nO*nV*nO*nV * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
double* d_Y_ovov;
|
double* d_Y_ovov;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_Y_ovov, nO*nV*nO*nV * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_Y_ovov, nO*nV*nO*nV * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
|
|
||||||
@ -1602,7 +1602,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
cublasSetStream(handle, NULL);
|
cublasSetStream(handle, NULL);
|
||||||
|
|
||||||
double* d_Z_ovov;
|
double* d_Z_ovov;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_Z_ovov, nO*nV*nO*nV * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_Z_ovov, nO*nV*nO*nV * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
alpha = 1.0;
|
alpha = 1.0;
|
||||||
@ -1650,7 +1650,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
|
|
||||||
double* d_tmp_cc;
|
double* d_tmp_cc;
|
||||||
lda = cholesky_mo_num * nV;
|
lda = cholesky_mo_num * nV;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_tmp_cc, lda * nV * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_tmp_cc, lda * nV * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
alpha=1.0; beta=0.0;
|
alpha=1.0; beta=0.0;
|
||||||
@ -1659,15 +1659,15 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, m, B, k, &beta, C, m);
|
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, m, B, k, &beta, C, m);
|
||||||
|
|
||||||
double* d_tmp_cc2;
|
double* d_tmp_cc2;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_tmp_cc2, cholesky_mo_num*nV*sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_tmp_cc2, cholesky_mo_num*nV*sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
double* d_B1;
|
double* d_B1;
|
||||||
cudaStat = cudaMallocManaged((void**)&d_B1, nV*nV*BLOCK_SIZE*sizeof(double), 1);
|
cudaStat = gpu_malloc((void**)&d_B1, nV*nV*BLOCK_SIZE*sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
double* d_tmpB1;
|
double* d_tmpB1;
|
||||||
cudaStat = cudaMallocManaged((void**)&d_tmpB1, nV*BLOCK_SIZE*nV*sizeof(double), 1);
|
cudaStat = gpu_malloc((void**)&d_tmpB1, nV*BLOCK_SIZE*nV*sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
#pragma omp for
|
#pragma omp for
|
||||||
@ -1783,7 +1783,7 @@ void compute_r1_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
|
|
||||||
double* d_r1;
|
double* d_r1;
|
||||||
lda = nO ;
|
lda = nO ;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_r1, lda * nV * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_r1, lda * nV * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
cudaMemset(d_r1, 0, nO*nV*sizeof(double));
|
cudaMemset(d_r1, 0, nO*nV*sizeof(double));
|
||||||
memset(r1, 0, nO*nV*sizeof(double));
|
memset(r1, 0, nO*nV*sizeof(double));
|
||||||
@ -1810,7 +1810,7 @@ void compute_r1_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
cublasDcopy(handle, nO*nV, d_cc_space_f_ov, 1, d_r1, 1);
|
cublasDcopy(handle, nO*nV, d_cc_space_f_ov, 1, d_r1, 1);
|
||||||
|
|
||||||
double* d_X_oo;
|
double* d_X_oo;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_X_oo, nO*nO * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_X_oo, nO*nO * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
alpha = -2.0;
|
alpha = -2.0;
|
||||||
@ -1857,7 +1857,7 @@ void compute_r1_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
#pragma omp section
|
#pragma omp section
|
||||||
{
|
{
|
||||||
double* d_X_voov;
|
double* d_X_voov;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_X_voov, nV* nO* nO* nV * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_X_voov, nV* nO* nO* nV * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
for (size_t i=0 ; i<nV ; ++i) {
|
for (size_t i=0 ; i<nV ; ++i) {
|
||||||
@ -1903,7 +1903,7 @@ void compute_r1_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
#pragma omp section
|
#pragma omp section
|
||||||
{
|
{
|
||||||
double* d_X_ovov;
|
double* d_X_ovov;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_X_ovov, nO* nV* nO* nV * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_X_ovov, nO* nV* nO* nV * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasDcopy(handle, nO*nV*nO*nV, d_cc_space_v_ovov, 1, d_X_ovov, 1);
|
cublasDcopy(handle, nO*nV*nO*nV, d_cc_space_v_ovov, 1, d_X_ovov, 1);
|
||||||
|
|
||||||
@ -1941,7 +1941,7 @@ void compute_r1_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
#pragma omp section
|
#pragma omp section
|
||||||
{
|
{
|
||||||
double* d_T_vvoo;
|
double* d_T_vvoo;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_T_vvoo, nV*nV*nO*nO * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_T_vvoo, nV*nV*nO*nO * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
alpha = 0.0;
|
alpha = 0.0;
|
||||||
@ -1952,11 +1952,11 @@ void compute_r1_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nV*nV, nO*nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
|
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nV*nV, nO*nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
|
||||||
|
|
||||||
double* d_W_vvov;
|
double* d_W_vvov;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_W_vvov, nV*nV*nO*BLOCK_SIZE * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_W_vvov, nV*nV*nO*BLOCK_SIZE * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
double* d_W_vvov_tmp;
|
double* d_W_vvov_tmp;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_W_vvov_tmp, nV*nO*nV*BLOCK_SIZE * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_W_vvov_tmp, nV*nO*nV*BLOCK_SIZE * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
|
|
||||||
@ -2010,7 +2010,7 @@ void compute_r1_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
|
|||||||
#pragma omp section
|
#pragma omp section
|
||||||
{
|
{
|
||||||
double* d_W_oovo;
|
double* d_W_oovo;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_W_oovo, nO*nO*nV*nO * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_W_oovo, nO*nO*nV*nO * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
alpha = 2.0;
|
alpha = 2.0;
|
||||||
|
@ -60,3 +60,16 @@ typedef struct {
|
|||||||
int cholesky_mo_num;
|
int cholesky_mo_num;
|
||||||
} gpu_data_sp;
|
} gpu_data_sp;
|
||||||
|
|
||||||
|
|
||||||
|
static cudaError_t gpu_malloc(void** ptr, size_t size) {
|
||||||
|
size_t free, total;
|
||||||
|
cudaError_t rc = cudaMemGetInfo( &free, &total );
|
||||||
|
if (rc != cudaSuccess) return rc;
|
||||||
|
|
||||||
|
if (size < free && size < total/10) {
|
||||||
|
rc= cudaMalloc(ptr, size);
|
||||||
|
} else {
|
||||||
|
rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
|
||||||
|
}
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
@ -36,125 +36,125 @@ gpu_data* gpu_init(
|
|||||||
|
|
||||||
double* d_cc_space_v_oo_chol;
|
double* d_cc_space_v_oo_chol;
|
||||||
lda = cholesky_mo_num * nO;
|
lda = cholesky_mo_num * nO;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_cc_space_v_oo_chol, lda * nO * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_cc_space_v_oo_chol, lda * nO * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(cholesky_mo_num*nO, nO, sizeof(double), cc_space_v_oo_chol, lda, d_cc_space_v_oo_chol, lda);
|
cublasSetMatrix(cholesky_mo_num*nO, nO, sizeof(double), cc_space_v_oo_chol, lda, d_cc_space_v_oo_chol, lda);
|
||||||
|
|
||||||
double* d_cc_space_v_ov_chol;
|
double* d_cc_space_v_ov_chol;
|
||||||
lda = cholesky_mo_num * nO;
|
lda = cholesky_mo_num * nO;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_cc_space_v_ov_chol, lda * nV * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_cc_space_v_ov_chol, lda * nV * sizeof(double) );
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(cholesky_mo_num*nO, nV, sizeof(double), cc_space_v_ov_chol, lda, d_cc_space_v_ov_chol, lda);
|
cublasSetMatrix(cholesky_mo_num*nO, nV, sizeof(double), cc_space_v_ov_chol, lda, d_cc_space_v_ov_chol, lda);
|
||||||
|
|
||||||
double* d_cc_space_v_vo_chol;
|
double* d_cc_space_v_vo_chol;
|
||||||
lda = cholesky_mo_num * nV;
|
lda = cholesky_mo_num * nV;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_cc_space_v_vo_chol, lda * nO * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_cc_space_v_vo_chol, lda * nO * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(cholesky_mo_num*nV, nO, sizeof(double), cc_space_v_vo_chol, lda, d_cc_space_v_vo_chol, lda);
|
cublasSetMatrix(cholesky_mo_num*nV, nO, sizeof(double), cc_space_v_vo_chol, lda, d_cc_space_v_vo_chol, lda);
|
||||||
|
|
||||||
double* d_cc_space_v_vv_chol;
|
double* d_cc_space_v_vv_chol;
|
||||||
lda = cholesky_mo_num * nV;
|
lda = cholesky_mo_num * nV;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_cc_space_v_vv_chol, lda * nV * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_cc_space_v_vv_chol, lda * nV * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(cholesky_mo_num*nV, nV, sizeof(double), cc_space_v_vv_chol, lda, d_cc_space_v_vv_chol, lda);
|
cublasSetMatrix(cholesky_mo_num*nV, nV, sizeof(double), cc_space_v_vv_chol, lda, d_cc_space_v_vv_chol, lda);
|
||||||
|
|
||||||
double* d_cc_space_v_oooo;
|
double* d_cc_space_v_oooo;
|
||||||
cudaStat = cudaMallocManaged((void**)&d_cc_space_v_oooo, nO*nO*nO*nO*sizeof(double), 1);
|
cudaStat = gpu_malloc((void**)&d_cc_space_v_oooo, nO*nO*nO*nO*sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(nO*nO, nO*nO, sizeof(double), cc_space_v_oooo, nO*nO, d_cc_space_v_oooo, nO*nO);
|
cublasSetMatrix(nO*nO, nO*nO, sizeof(double), cc_space_v_oooo, nO*nO, d_cc_space_v_oooo, nO*nO);
|
||||||
|
|
||||||
double* d_cc_space_v_vooo;
|
double* d_cc_space_v_vooo;
|
||||||
cudaStat = cudaMallocManaged((void**)&d_cc_space_v_vooo, nV*nO*nO*nO*sizeof(double), 1);
|
cudaStat = gpu_malloc((void**)&d_cc_space_v_vooo, nV*nO*nO*nO*sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(nV*nO, nO*nO, sizeof(double), cc_space_v_vooo, nV*nO, d_cc_space_v_vooo, nV*nO);
|
cublasSetMatrix(nV*nO, nO*nO, sizeof(double), cc_space_v_vooo, nV*nO, d_cc_space_v_vooo, nV*nO);
|
||||||
|
|
||||||
double* d_cc_space_v_voov;
|
double* d_cc_space_v_voov;
|
||||||
cudaStat = cudaMallocManaged((void**)&d_cc_space_v_voov, nV*nO*nO*nV*sizeof(double), 1);
|
cudaStat = gpu_malloc((void**)&d_cc_space_v_voov, nV*nO*nO*nV*sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(nV*nO, nO*nV, sizeof(double), cc_space_v_voov, nV*nO, d_cc_space_v_voov, nV*nO);
|
cublasSetMatrix(nV*nO, nO*nV, sizeof(double), cc_space_v_voov, nV*nO, d_cc_space_v_voov, nV*nO);
|
||||||
|
|
||||||
double* d_cc_space_v_oovv;
|
double* d_cc_space_v_oovv;
|
||||||
cudaStat = cudaMallocManaged((void**)&d_cc_space_v_oovv, nO*nO*nV*nV*sizeof(double), 1);
|
cudaStat = gpu_malloc((void**)&d_cc_space_v_oovv, nO*nO*nV*nV*sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), cc_space_v_oovv, nO*nO, d_cc_space_v_oovv, nO*nO);
|
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), cc_space_v_oovv, nO*nO, d_cc_space_v_oovv, nO*nO);
|
||||||
|
|
||||||
double* d_cc_space_v_vvoo;
|
double* d_cc_space_v_vvoo;
|
||||||
cudaStat = cudaMallocManaged((void**)&d_cc_space_v_vvoo, nV*nV*nO*nO*sizeof(double), 1);
|
cudaStat = gpu_malloc((void**)&d_cc_space_v_vvoo, nV*nV*nO*nO*sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(nV*nV, nO*nO, sizeof(double), cc_space_v_vvoo, nV*nV, d_cc_space_v_vvoo, nV*nV);
|
cublasSetMatrix(nV*nV, nO*nO, sizeof(double), cc_space_v_vvoo, nV*nV, d_cc_space_v_vvoo, nV*nV);
|
||||||
|
|
||||||
double* d_cc_space_v_oovo;
|
double* d_cc_space_v_oovo;
|
||||||
lda = nO*nO;
|
lda = nO*nO;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_cc_space_v_oovo, nO*nO*nV*nO * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_cc_space_v_oovo, nO*nO*nV*nO * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_oovo, lda, d_cc_space_v_oovo, lda);
|
cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_oovo, lda, d_cc_space_v_oovo, lda);
|
||||||
|
|
||||||
double* d_cc_space_v_ovvo;
|
double* d_cc_space_v_ovvo;
|
||||||
lda = nO*nV;
|
lda = nO*nV;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_cc_space_v_ovvo, nO*nV*nV*nO * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_cc_space_v_ovvo, nO*nV*nV*nO * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_ovvo, lda, d_cc_space_v_ovvo, lda);
|
cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_ovvo, lda, d_cc_space_v_ovvo, lda);
|
||||||
|
|
||||||
double* d_cc_space_v_ovov;
|
double* d_cc_space_v_ovov;
|
||||||
lda = nO*nV;
|
lda = nO*nV;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_cc_space_v_ovov, nO*nV*nV*nO * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_cc_space_v_ovov, nO*nV*nV*nO * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_ovov, lda, d_cc_space_v_ovov, lda);
|
cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_ovov, lda, d_cc_space_v_ovov, lda);
|
||||||
|
|
||||||
double* d_cc_space_v_ovoo;
|
double* d_cc_space_v_ovoo;
|
||||||
lda = nO*nV;
|
lda = nO*nV;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_cc_space_v_ovoo, nO*nV*nO*nO * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_cc_space_v_ovoo, nO*nV*nO*nO * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(lda, nO*nO, sizeof(double), cc_space_v_ovoo, lda, d_cc_space_v_ovoo, lda);
|
cublasSetMatrix(lda, nO*nO, sizeof(double), cc_space_v_ovoo, lda, d_cc_space_v_ovoo, lda);
|
||||||
|
|
||||||
double* d_cc_space_f_oo;
|
double* d_cc_space_f_oo;
|
||||||
cudaStat = cudaMallocManaged((void**)&d_cc_space_f_oo, nO*nO*sizeof(double), 1);
|
cudaStat = gpu_malloc((void**)&d_cc_space_f_oo, nO*nO*sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(nO, nO, sizeof(double), cc_space_f_oo, nO, d_cc_space_f_oo, nO);
|
cublasSetMatrix(nO, nO, sizeof(double), cc_space_f_oo, nO, d_cc_space_f_oo, nO);
|
||||||
|
|
||||||
double* d_cc_space_f_vo;
|
double* d_cc_space_f_vo;
|
||||||
cudaStat = cudaMallocManaged((void**)&d_cc_space_f_vo, nV*nO*sizeof(double), 1);
|
cudaStat = gpu_malloc((void**)&d_cc_space_f_vo, nV*nO*sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(nV, nO, sizeof(double), cc_space_f_vo, nV, d_cc_space_f_vo, nV);
|
cublasSetMatrix(nV, nO, sizeof(double), cc_space_f_vo, nV, d_cc_space_f_vo, nV);
|
||||||
|
|
||||||
double* d_cc_space_f_ov;
|
double* d_cc_space_f_ov;
|
||||||
cudaStat = cudaMallocManaged((void**)&d_cc_space_f_ov, nV*nO*sizeof(double), 1);
|
cudaStat = gpu_malloc((void**)&d_cc_space_f_ov, nV*nO*sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(nO, nV, sizeof(double), cc_space_f_ov, nO, d_cc_space_f_ov, nO);
|
cublasSetMatrix(nO, nV, sizeof(double), cc_space_f_ov, nO, d_cc_space_f_ov, nO);
|
||||||
|
|
||||||
double* d_cc_space_f_vv;
|
double* d_cc_space_f_vv;
|
||||||
cudaStat = cudaMallocManaged((void**)&d_cc_space_f_vv, nV*nV*sizeof(double), 1);
|
cudaStat = gpu_malloc((void**)&d_cc_space_f_vv, nV*nV*sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(nV, nV, sizeof(double), cc_space_f_vv, nV, d_cc_space_f_vv, nV);
|
cublasSetMatrix(nV, nV, sizeof(double), cc_space_f_vv, nV, d_cc_space_f_vv, nV);
|
||||||
|
|
||||||
double* d_tau;
|
double* d_tau;
|
||||||
lda = nO * nO;
|
lda = nO * nO;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_tau, lda * nV * nV * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_tau, lda * nV * nV * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
double* d_tau_x;
|
double* d_tau_x;
|
||||||
lda = nO * nO;
|
lda = nO * nO;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_tau_x, lda * nV * nV * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_tau_x, lda * nV * nV * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
double* d_t1;
|
double* d_t1;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_t1, nO * nV * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_t1, nO * nV * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
double* d_t2;
|
double* d_t2;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_t2, nO*nO*nV*nV * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_t2, nO*nO*nV*nV * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
double* d_H_oo;
|
double* d_H_oo;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_H_oo, nO * nO * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_H_oo, nO * nO * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
double* d_H_vo;
|
double* d_H_vo;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_H_vo, nV * nO * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_H_vo, nV * nO * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
double* d_H_vv;
|
double* d_H_vv;
|
||||||
cudaStat = cudaMallocManaged((void **)&d_H_vv, nV * nV * sizeof(double), 1);
|
cudaStat = gpu_malloc((void **)&d_H_vv, nV * nV * sizeof(double));
|
||||||
assert (cudaStat == cudaSuccess);
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
data[igpu].cc_space_v_oo_chol = d_cc_space_v_oo_chol;
|
data[igpu].cc_space_v_oo_chol = d_cc_space_v_oo_chol;
|
||||||
@ -202,30 +202,30 @@ void gpu_deinit(gpu_data* data)
|
|||||||
int igpu = omp_get_thread_num();
|
int igpu = omp_get_thread_num();
|
||||||
cudaSetDevice(igpu);
|
cudaSetDevice(igpu);
|
||||||
|
|
||||||
free(data[igpu].cc_space_v_oo_chol);
|
cudaFree(data[igpu].cc_space_v_oo_chol);
|
||||||
free(data[igpu].cc_space_v_ov_chol);
|
cudaFree(data[igpu].cc_space_v_ov_chol);
|
||||||
free(data[igpu].cc_space_v_vo_chol);
|
cudaFree(data[igpu].cc_space_v_vo_chol);
|
||||||
free(data[igpu].cc_space_v_vv_chol);
|
cudaFree(data[igpu].cc_space_v_vv_chol);
|
||||||
free(data[igpu].cc_space_v_oooo);
|
cudaFree(data[igpu].cc_space_v_oooo);
|
||||||
free(data[igpu].cc_space_v_vooo);
|
cudaFree(data[igpu].cc_space_v_vooo);
|
||||||
free(data[igpu].cc_space_v_voov);
|
cudaFree(data[igpu].cc_space_v_voov);
|
||||||
free(data[igpu].cc_space_v_oovv);
|
cudaFree(data[igpu].cc_space_v_oovv);
|
||||||
free(data[igpu].cc_space_v_vvoo);
|
cudaFree(data[igpu].cc_space_v_vvoo);
|
||||||
free(data[igpu].cc_space_v_oovo);
|
cudaFree(data[igpu].cc_space_v_oovo);
|
||||||
free(data[igpu].cc_space_v_ovvo);
|
cudaFree(data[igpu].cc_space_v_ovvo);
|
||||||
free(data[igpu].cc_space_v_ovov);
|
cudaFree(data[igpu].cc_space_v_ovov);
|
||||||
free(data[igpu].cc_space_v_ovoo);
|
cudaFree(data[igpu].cc_space_v_ovoo);
|
||||||
free(data[igpu].cc_space_f_oo);
|
cudaFree(data[igpu].cc_space_f_oo);
|
||||||
free(data[igpu].cc_space_f_ov);
|
cudaFree(data[igpu].cc_space_f_ov);
|
||||||
free(data[igpu].cc_space_f_vo);
|
cudaFree(data[igpu].cc_space_f_vo);
|
||||||
free(data[igpu].cc_space_f_vv);
|
cudaFree(data[igpu].cc_space_f_vv);
|
||||||
free(data[igpu].tau);
|
cudaFree(data[igpu].tau);
|
||||||
free(data[igpu].tau_x);
|
cudaFree(data[igpu].tau_x);
|
||||||
free(data[igpu].t1);
|
cudaFree(data[igpu].t1);
|
||||||
free(data[igpu].t2);
|
cudaFree(data[igpu].t2);
|
||||||
free(data[igpu].H_oo);
|
cudaFree(data[igpu].H_oo);
|
||||||
free(data[igpu].H_vo);
|
cudaFree(data[igpu].H_vo);
|
||||||
free(data[igpu].H_vv);
|
cudaFree(data[igpu].H_vv);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user