From 0d53d4fcaa2457bb32fe53bfb24b1dbc0a957657 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Mon, 21 Aug 2023 12:34:55 +0200 Subject: [PATCH] Fix malloc on GPU --- devel/ccsd_gpu/ccsd_space_orb_sub.irp.f | 66 ++- devel/ccsd_gpu/gpu.c | 583 ++++++++++++++---------- devel/ccsd_gpu/gpu.h | 33 +- devel/ccsd_gpu/gpu_dgemm.c | 42 +- devel/ccsd_gpu/gpu_init.c | 119 +++-- devel/ccsd_gpu/gpu_module.f90 | 81 +++- 6 files changed, 617 insertions(+), 307 deletions(-) diff --git a/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f b/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f index 33ab63b..069244d 100644 --- a/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f +++ b/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f @@ -94,12 +94,21 @@ subroutine run_ccsd_space_orb ! Init type(c_ptr) :: gpu_data + logical :: do_sp = .False. - gpu_data = gpu_init(nO, nV, cholesky_mo_num, & + if (do_sp) then + gpu_data = gpu_init_sp(nO, nV, cholesky_mo_num, & cc_space_v_oo_chol, cc_space_v_ov_chol, cc_space_v_vo_chol, cc_space_v_vv_chol, & cc_space_v_oooo, cc_space_v_vooo, cc_space_v_voov, cc_space_v_oovv, cc_space_v_vvoo, & cc_space_v_oovo, cc_space_v_ovvo, cc_space_v_ovov, cc_space_v_ovoo, & cc_space_f_oo, cc_space_f_ov, cc_space_f_vo, cc_space_f_vv) + else + gpu_data = gpu_init(nO, nV, cholesky_mo_num, & + cc_space_v_oo_chol, cc_space_v_ov_chol, cc_space_v_vo_chol, cc_space_v_vv_chol, & + cc_space_v_oooo, cc_space_v_vooo, cc_space_v_voov, cc_space_v_oovv, cc_space_v_vvoo, & + cc_space_v_oovo, cc_space_v_ovvo, cc_space_v_ovov, cc_space_v_ovoo, & + cc_space_f_oo, cc_space_f_ov, cc_space_f_vo, cc_space_f_vv) + endif if (.not.do_ao_cholesky) then print *, 'ao_choleky is required' @@ -109,12 +118,20 @@ subroutine run_ccsd_space_orb call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,t1) call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,t2) - call gpu_upload(gpu_data, nO, nV, t1, t2); + if (do_sp) then + call gpu_upload_sp(gpu_data, nO, nV, t1, t2); + else + call gpu_upload(gpu_data, nO, nV, t1, t2); + endif !print*,'hf_energy', hf_energy call det_energy(det,uncorr_energy) print*,'Det energy', uncorr_energy - energy = ccsd_energy_space_gpu(gpu_data) + if (do_sp) then + energy = ccsd_energy_space_gpu_sp(gpu_data) + else + energy = ccsd_energy_space_gpu(gpu_data) + endif print*,'Guess energy', uncorr_energy+energy, energy nb_iter = 0 @@ -133,18 +150,39 @@ subroutine run_ccsd_space_orb ! Residue !$OMP PARALLEL SECTIONS !$OMP SECTION - call compute_H_oo_chol_gpu(gpu_data,0) + if (do_sp) then + call compute_H_oo_chol_gpu_sp(gpu_data,0) + else + call compute_H_oo_chol_gpu(gpu_data,0) + endif !$OMP SECTION - call compute_H_vo_chol_gpu(gpu_data,1) + if (do_sp) then + call compute_H_vo_chol_gpu_sp(gpu_data,1) + else + call compute_H_vo_chol_gpu(gpu_data,1) + endif !$OMP SECTION - call compute_H_vv_chol_gpu(gpu_data,2) + if (do_sp) then + call compute_H_vv_chol_gpu_sp(gpu_data,2) + else + call compute_H_vv_chol_gpu(gpu_data,2) + endif !$OMP END PARALLEL SECTIONS - call compute_r1_space_chol_gpu(gpu_data, nO, nV, t1, r1, max_r1) - call compute_r2_space_chol_gpu(gpu_data, nO, nV, t1, r2, max_r2) + if (do_sp) then + call compute_r1_space_chol_gpu_sp(gpu_data, nO, nV, t1, r1, max_r1) + else + call compute_r1_space_chol_gpu(gpu_data, nO, nV, t1, r1, max_r1) + endif + + if (do_sp) then + call compute_r2_space_chol_gpu_sp(gpu_data, nO, nV, t1, r2, max_r2) + else + call compute_r2_space_chol_gpu(gpu_data, nO, nV, t1, r2, max_r2) + endif max_r = max(max_r1,max_r2) @@ -162,10 +200,18 @@ subroutine run_ccsd_space_orb print*,'Unkown cc_method_method: '//cc_update_method endif - call gpu_upload(gpu_data, nO, nV, t1, t2); + if (do_sp) then + call gpu_upload_sp(gpu_data, nO, nV, t1, t2); + else + call gpu_upload(gpu_data, nO, nV, t1, t2); + endif ! Energy - energy = ccsd_energy_space_gpu(gpu_data) + if (do_sp) then + energy = ccsd_energy_space_gpu_sp(gpu_data) + else + energy = ccsd_energy_space_gpu(gpu_data) + endif write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |' nb_iter = nb_iter + 1 diff --git a/devel/ccsd_gpu/gpu.c b/devel/ccsd_gpu/gpu.c index ab2c8c6..fc118e9 100644 --- a/devel/ccsd_gpu/gpu.c +++ b/devel/ccsd_gpu/gpu.c @@ -5,34 +5,41 @@ #include #include #include "gpu.h" +#include "assert.h" void gpu_upload(gpu_data* data, int nO, int nV, double* t1, double* t2) { - int lda; + size_t lda; int ngpus = 1; if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus); double * tau = malloc(nO*nO*nV*nV * sizeof(double)); + assert (tau != NULL); + double * tau_x = malloc(nO*nO*nV*nV * sizeof(double)); + assert (tau_x != NULL); #pragma omp parallel num_threads(ngpus) { - int igpu = omp_get_thread_num(); + cudaError_t cudaStat = cudaSuccess; + size_t igpu = omp_get_thread_num(); cudaSetDevice(igpu); double* d_t1 = data[igpu].t1; lda = nO; - cublasSetMatrix(nO, nV, sizeof(double), t1, lda, d_t1, lda); + cudaStat = cublasSetMatrix(nO, nV, sizeof(double), t1, lda, d_t1, lda); + assert (cudaStat == cudaSuccess); double* d_t2 = data[igpu].t2; lda = nO*nO; - cublasSetMatrix(nO*nO, nV*nV, sizeof(double), t2, lda, d_t2, lda); + cudaStat = cublasSetMatrix(nO*nO, nV*nV, sizeof(double), t2, lda, d_t2, lda); + assert (cudaStat == cudaSuccess); - int lda, ldb, ldc; + size_t lda, ldb, ldc; double alpha, beta; double* A; double* B; @@ -46,16 +53,17 @@ void gpu_upload(gpu_data* data, double* d_tau_x = data[igpu].tau_x; lda = nO * nO; - cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau_x, lda, d_tau_x, lda); + cudaStat = cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau_x, lda, d_tau_x, lda); + assert (cudaStat == cudaSuccess); if (igpu == 0) { - for (int i=0 ; icholesky_mo_num; + const size_t cholesky_mo_num = data->cholesky_mo_num; int ngpus = 1; if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus); @@ -397,14 +417,15 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl #pragma omp parallel num_threads(ngpus) { - int m,n,k, lda, ldb, ldc; + cudaError_t cudaStat; + size_t m,n,k, lda, ldb, ldc; double alpha, beta; double* A; double* B; double* C; cudaStream_t stream[nV]; - int igpu = omp_get_thread_num(); + size_t igpu = omp_get_thread_num(); cudaSetDevice(igpu); cublasHandle_t handle; @@ -412,7 +433,8 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl double* d_r2; lda = nO * nO; - cudaMalloc((void **)&d_r2, lda * nV * nV * sizeof(double)); + cudaStat = cudaMalloc((void **)&d_r2, lda * nV * nV * sizeof(double)); + assert (cudaStat == cudaSuccess); cudaMemset(d_r2, 0, nO*nO*nV*nV*sizeof(double)); memset(r2, 0, nO*nO*nV*nV*sizeof(double)); @@ -436,7 +458,8 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl double* d_H_vv = data[igpu].H_vv; double* d_K1; - cudaMalloc((void **)&d_K1, nO*nV*nO*nV * sizeof(double)); + cudaStat = cudaMalloc((void **)&d_K1, nO*nV*nO*nV * sizeof(double)); + assert (cudaStat == cudaSuccess); #pragma omp sections { @@ -444,7 +467,8 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl #pragma omp section { double* d_J1; - cudaMalloc((void **)&d_J1, nO*nV*nV*nO * sizeof(double)); + cudaStat = cudaMalloc((void **)&d_J1, nO*nV*nV*nO * sizeof(double)); + assert (cudaStat == cudaSuccess); alpha = 1.0; beta = 0.0; @@ -455,14 +479,15 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl double* d_X_ovoo; - cudaMalloc((void **)&d_X_ovoo, nO*nV*nO*nO * sizeof(double)); + cudaStat = cudaMalloc((void **)&d_X_ovoo, nO*nV*nO*nO * sizeof(double)); + assert (cudaStat == cudaSuccess); alpha = 0.0; beta = 1.0; - for (int i=0 ; icholesky_mo_num; + const size_t cholesky_mo_num = data->cholesky_mo_num; int ngpus = 1; if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus); #pragma omp parallel num_threads(ngpus) { - int m,n,k, lda, ldb, ldc; + cudaError_t cudaStat; + size_t m,n,k, lda, ldb, ldc; double alpha, beta; double* A; double* B; double* C; cudaStream_t stream[nV]; - int igpu = omp_get_thread_num(); + size_t igpu = omp_get_thread_num(); cudaSetDevice(igpu); cublasHandle_t handle; @@ -1711,7 +1781,8 @@ void compute_r1_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl double* d_r1; lda = nO ; - cudaMalloc((void **)&d_r1, lda * nV * sizeof(double)); + cudaStat = cudaMalloc((void **)&d_r1, lda * nV * sizeof(double)); + assert (cudaStat == cudaSuccess); cudaMemset(d_r1, 0, nO*nV*sizeof(double)); memset(r1, 0, nO*nV*sizeof(double)); @@ -1737,7 +1808,8 @@ void compute_r1_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl cublasDcopy(handle, nO*nV, d_cc_space_f_ov, 1, d_r1, 1); double* d_X_oo; - cudaMalloc((void **)&d_X_oo, nO*nO * sizeof(double)); + cudaStat = cudaMalloc((void **)&d_X_oo, nO*nO * sizeof(double)); + assert (cudaStat == cudaSuccess); alpha = -2.0; beta = 0.0; @@ -1783,14 +1855,15 @@ void compute_r1_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl #pragma omp section { double* d_X_voov; - cudaMalloc((void **)&d_X_voov, nV* nO* nO* nV * sizeof(double)); + cudaStat = cudaMalloc((void **)&d_X_voov, nV* nO* nO* nV * sizeof(double)); + assert (cudaStat == cudaSuccess); - for (int i=0 ; i= nV) kk = 0; @@ -1909,7 +1986,7 @@ void compute_r1_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nV, nV, &alpha, A, lda, &beta, B, ldb, C, ldc); } } - for (int i=0 ; inO; - const int nV = data->nV; + const size_t nO = data->nO; + const size_t nV = data->nV; int ngpus = 1; if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus); #pragma omp parallel num_threads(ngpus) { - int igpu = omp_get_thread_num(); + cudaError_t cudaStat; + size_t igpu = omp_get_thread_num(); cudaSetDevice(igpu); cublasHandle_t handle; diff --git a/devel/ccsd_gpu/gpu.h b/devel/ccsd_gpu/gpu.h index f32bc7a..a74c54d 100644 --- a/devel/ccsd_gpu/gpu.h +++ b/devel/ccsd_gpu/gpu.h @@ -1,3 +1,5 @@ +#define MULTIGPU 1 + typedef struct { double* cc_space_v_oo_chol; double* cc_space_v_ov_chol; @@ -28,4 +30,33 @@ typedef struct { int cholesky_mo_num; } gpu_data; -#define MULTIGPU 1 +typedef struct { + float* cc_space_v_oo_chol; + float* cc_space_v_ov_chol; + float* cc_space_v_vo_chol; + float* cc_space_v_vv_chol; + float* cc_space_v_oooo; + float* cc_space_v_vooo; + float* cc_space_v_voov; + float* cc_space_v_oovv; + float* cc_space_v_vvoo; + float* cc_space_v_oovo; + float* cc_space_v_ovvo; + float* cc_space_v_ovov; + float* cc_space_v_ovoo; + float* cc_space_f_oo; + float* cc_space_f_ov; + float* cc_space_f_vo; + float* cc_space_f_vv; + float* tau; + float* tau_x; + float* t1; + float* t2; + float* H_oo; + float* H_vo; + float* H_vv; + int nO; + int nV; + int cholesky_mo_num; +} gpu_data_sp; + diff --git a/devel/ccsd_gpu/gpu_dgemm.c b/devel/ccsd_gpu/gpu_dgemm.c index 6530b2b..9ade048 100644 --- a/devel/ccsd_gpu/gpu_dgemm.c +++ b/devel/ccsd_gpu/gpu_dgemm.c @@ -4,6 +4,7 @@ #include #include #include +#include #define BLOCK_SIZE 16 @@ -16,6 +17,7 @@ void dgemm_(char*, char*, int*, int*, int*, double*, double*, int*, double*, int void gpu_dgemm(char transa, char transb, int m, int n, int k, double alpha, double* A, int lda, double* B, int ldb, double beta, double* C, int ldc) { + cudaError_t cudaStat = cudaSuccess; cublasHandle_t handle; cublasCreate(&handle); @@ -25,36 +27,48 @@ void gpu_dgemm(char transa, char transb, int m, int n, int k, double alpha, cublasOperation_t ta, tb; if (transa == 'N') { - cudaMalloc((void**)&d_A, lda*k*sizeof(double)); - cublasSetMatrix(m, k, sizeof(double), A, lda, d_A, lda); + cudaStat = cudaMalloc((void**)&d_A, (size_t) lda*k*sizeof(double)); + assert(cudaStat == cudaSuccess); + cudaStat = cublasSetMatrix(m, k, sizeof(double), A, lda, d_A, lda); + assert(cudaStat == cudaSuccess); ta = CUBLAS_OP_N; } else { - cudaMalloc((void**)&d_A, lda*m*sizeof(double)); - cublasSetMatrix(k, m, sizeof(double), A, lda, d_A, lda); + cudaStat = cudaMalloc((void**)&d_A, (size_t) lda*m*sizeof(double)); + assert(cudaStat == cudaSuccess); + cudaStat = cublasSetMatrix(k, m, sizeof(double), A, lda, d_A, lda); + assert(cudaStat == cudaSuccess); ta = CUBLAS_OP_T; } if (transb == 'N') { - cudaMalloc((void**)&d_B, ldb*n*sizeof(double)); - cublasSetMatrix(k, n, sizeof(double), B, ldb, d_B, ldb); + cudaStat = cudaMalloc((void**)&d_B, (size_t) ldb*n*sizeof(double)); + assert(cudaStat == cudaSuccess); + cudaStat = cublasSetMatrix(k, n, sizeof(double), B, ldb, d_B, ldb); + assert(cudaStat == cudaSuccess); tb = CUBLAS_OP_N; } else { - cudaMalloc((void**)&d_B, ldb*k*sizeof(double)); - cublasSetMatrix(n, k, sizeof(double), B, ldb, d_B, ldb); + cudaStat = cudaMalloc((void**)&d_B, (size_t) ldb*k*sizeof(double)); + assert(cudaStat == cudaSuccess); + cudaStat = cublasSetMatrix(n, k, sizeof(double), B, ldb, d_B, ldb); + assert(cudaStat == cudaSuccess); tb = CUBLAS_OP_T; } - cudaMalloc((void**)&d_C, ldc*n*sizeof(double)); + cudaStat = cudaMalloc((void**)&d_C, (size_t) ldc*n*sizeof(double)); + assert(cudaStat == cudaSuccess); if (beta != 0.) { - cublasSetMatrix(m, n, sizeof(double), C, ldc, d_C, ldc); + cudaStat = cublasSetMatrix(m, n, sizeof(double), C, ldc, d_C, ldc); + assert(cudaStat == cudaSuccess); } - cublasDgemm(handle, ta, tb, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, ldc); - - cublasGetMatrix(m, n, sizeof(double), d_C, ldc, C, ldc); - + cudaStat = cublasDgemm(handle, ta, tb, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, ldc); + assert(cudaStat == cudaSuccess); cudaFree(d_A); cudaFree(d_B); + + cudaStat = cublasGetMatrix(m, n, sizeof(double), d_C, ldc, C, ldc); + assert(cudaStat == cudaSuccess); + cudaFree(d_C); cublasDestroy(handle); } diff --git a/devel/ccsd_gpu/gpu_init.c b/devel/ccsd_gpu/gpu_init.c index 0882e95..d23ae48 100644 --- a/devel/ccsd_gpu/gpu_init.c +++ b/devel/ccsd_gpu/gpu_init.c @@ -5,6 +5,7 @@ #include #include #include "gpu.h" +#include "assert.h" gpu_data* gpu_init( int nO, int nV, int cholesky_mo_num, @@ -18,117 +19,143 @@ gpu_data* gpu_init( double* cc_space_f_vo, double* cc_space_f_vv) { int ngpus = 1; - cudaGetDeviceCount(&ngpus); + if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus); gpu_data* data = (gpu_data*) malloc (ngpus*sizeof(gpu_data)); + assert (data != NULL); #pragma omp parallel num_threads(ngpus) { - int lda; + cudaError_t cudaStat = cudaSuccess; + size_t lda; + int igpu = omp_get_thread_num(); cudaSetDevice(igpu); cublasHandle_t handle; - cublasCreate(&handle); double* d_cc_space_v_oo_chol; lda = cholesky_mo_num * nO; - cudaMalloc((void **)&d_cc_space_v_oo_chol, lda * nO * sizeof(double)); + cudaStat = cudaMalloc((void **)&d_cc_space_v_oo_chol, lda * nO * sizeof(double)); + assert (cudaStat == cudaSuccess); cublasSetMatrix(cholesky_mo_num*nO, nO, sizeof(double), cc_space_v_oo_chol, lda, d_cc_space_v_oo_chol, lda); double* d_cc_space_v_ov_chol; lda = cholesky_mo_num * nO; - cudaMalloc((void **)&d_cc_space_v_ov_chol, lda * nV * sizeof(double)); + cudaStat = cudaMalloc((void **)&d_cc_space_v_ov_chol, lda * nV * sizeof(double)); + assert (cudaStat == cudaSuccess); cublasSetMatrix(cholesky_mo_num*nO, nV, sizeof(double), cc_space_v_ov_chol, lda, d_cc_space_v_ov_chol, lda); double* d_cc_space_v_vo_chol; lda = cholesky_mo_num * nV; - cudaMalloc((void **)&d_cc_space_v_vo_chol, lda * nO * sizeof(double)); + cudaStat = cudaMalloc((void **)&d_cc_space_v_vo_chol, lda * nO * sizeof(double)); + assert (cudaStat == cudaSuccess); cublasSetMatrix(cholesky_mo_num*nV, nO, sizeof(double), cc_space_v_vo_chol, lda, d_cc_space_v_vo_chol, lda); double* d_cc_space_v_vv_chol; lda = cholesky_mo_num * nV; - cudaMalloc((void **)&d_cc_space_v_vv_chol, lda * nV * sizeof(double)); + cudaStat = cudaMalloc((void **)&d_cc_space_v_vv_chol, lda * nV * sizeof(double)); + assert (cudaStat == cudaSuccess); cublasSetMatrix(cholesky_mo_num*nV, nV, sizeof(double), cc_space_v_vv_chol, lda, d_cc_space_v_vv_chol, lda); double* d_cc_space_v_oooo; - cudaMalloc((void**)&d_cc_space_v_oooo, nO*nO*nO*nO*sizeof(double)); + cudaStat = cudaMalloc((void**)&d_cc_space_v_oooo, nO*nO*nO*nO*sizeof(double)); + assert (cudaStat == cudaSuccess); cublasSetMatrix(nO*nO, nO*nO, sizeof(double), cc_space_v_oooo, nO*nO, d_cc_space_v_oooo, nO*nO); double* d_cc_space_v_vooo; - cudaMalloc((void**)&d_cc_space_v_vooo, nV*nO*nO*nO*sizeof(double)); + cudaStat = cudaMalloc((void**)&d_cc_space_v_vooo, nV*nO*nO*nO*sizeof(double)); + assert (cudaStat == cudaSuccess); cublasSetMatrix(nV*nO, nO*nO, sizeof(double), cc_space_v_vooo, nV*nO, d_cc_space_v_vooo, nV*nO); double* d_cc_space_v_voov; - cudaMalloc((void**)&d_cc_space_v_voov, nV*nO*nO*nV*sizeof(double)); + cudaStat = cudaMalloc((void**)&d_cc_space_v_voov, nV*nO*nO*nV*sizeof(double)); + assert (cudaStat == cudaSuccess); cublasSetMatrix(nV*nO, nO*nV, sizeof(double), cc_space_v_voov, nV*nO, d_cc_space_v_voov, nV*nO); double* d_cc_space_v_oovv; - cudaMalloc((void**)&d_cc_space_v_oovv, nO*nO*nV*nV*sizeof(double)); + cudaStat = cudaMalloc((void**)&d_cc_space_v_oovv, nO*nO*nV*nV*sizeof(double)); + assert (cudaStat == cudaSuccess); cublasSetMatrix(nO*nO, nV*nV, sizeof(double), cc_space_v_oovv, nO*nO, d_cc_space_v_oovv, nO*nO); double* d_cc_space_v_vvoo; - cudaMalloc((void**)&d_cc_space_v_vvoo, nV*nV*nO*nO*sizeof(double)); + cudaStat = cudaMalloc((void**)&d_cc_space_v_vvoo, nV*nV*nO*nO*sizeof(double)); + assert (cudaStat == cudaSuccess); cublasSetMatrix(nV*nV, nO*nO, sizeof(double), cc_space_v_vvoo, nV*nV, d_cc_space_v_vvoo, nV*nV); double* d_cc_space_v_oovo; lda = nO*nO; - cudaMalloc((void **)&d_cc_space_v_oovo, nO*nO*nV*nO * sizeof(double)); + cudaStat = cudaMalloc((void **)&d_cc_space_v_oovo, nO*nO*nV*nO * sizeof(double)); + assert (cudaStat == cudaSuccess); cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_oovo, lda, d_cc_space_v_oovo, lda); double* d_cc_space_v_ovvo; lda = nO*nV; - cudaMalloc((void **)&d_cc_space_v_ovvo, nO*nV*nV*nO * sizeof(double)); + cudaStat = cudaMalloc((void **)&d_cc_space_v_ovvo, nO*nV*nV*nO * sizeof(double)); + assert (cudaStat == cudaSuccess); cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_ovvo, lda, d_cc_space_v_ovvo, lda); double* d_cc_space_v_ovov; lda = nO*nV; - cudaMalloc((void **)&d_cc_space_v_ovov, nO*nV*nV*nO * sizeof(double)); + cudaStat = cudaMalloc((void **)&d_cc_space_v_ovov, nO*nV*nV*nO * sizeof(double)); + assert (cudaStat == cudaSuccess); cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_ovov, lda, d_cc_space_v_ovov, lda); double* d_cc_space_v_ovoo; lda = nO*nV; - cudaMalloc((void **)&d_cc_space_v_ovoo, nO*nV*nO*nO * sizeof(double)); + cudaStat = cudaMalloc((void **)&d_cc_space_v_ovoo, nO*nV*nO*nO * sizeof(double)); + assert (cudaStat == cudaSuccess); cublasSetMatrix(lda, nO*nO, sizeof(double), cc_space_v_ovoo, lda, d_cc_space_v_ovoo, lda); double* d_cc_space_f_oo; - cudaMalloc((void**)&d_cc_space_f_oo, nO*nO*sizeof(double)); + cudaStat = cudaMalloc((void**)&d_cc_space_f_oo, nO*nO*sizeof(double)); + assert (cudaStat == cudaSuccess); cublasSetMatrix(nO, nO, sizeof(double), cc_space_f_oo, nO, d_cc_space_f_oo, nO); double* d_cc_space_f_vo; - cudaMalloc((void**)&d_cc_space_f_vo, nV*nO*sizeof(double)); + cudaStat = cudaMalloc((void**)&d_cc_space_f_vo, nV*nO*sizeof(double)); + assert (cudaStat == cudaSuccess); cublasSetMatrix(nV, nO, sizeof(double), cc_space_f_vo, nV, d_cc_space_f_vo, nV); double* d_cc_space_f_ov; - cudaMalloc((void**)&d_cc_space_f_ov, nV*nO*sizeof(double)); + cudaStat = cudaMalloc((void**)&d_cc_space_f_ov, nV*nO*sizeof(double)); + assert (cudaStat == cudaSuccess); cublasSetMatrix(nO, nV, sizeof(double), cc_space_f_ov, nO, d_cc_space_f_ov, nO); double* d_cc_space_f_vv; - cudaMalloc((void**)&d_cc_space_f_vv, nV*nV*sizeof(double)); + cudaStat = cudaMalloc((void**)&d_cc_space_f_vv, nV*nV*sizeof(double)); + assert (cudaStat == cudaSuccess); cublasSetMatrix(nV, nV, sizeof(double), cc_space_f_vv, nV, d_cc_space_f_vv, nV); double* d_tau; lda = nO * nO; - cudaMalloc((void **)&d_tau, lda * nV * nV * sizeof(double)); + cudaStat = cudaMalloc((void **)&d_tau, lda * nV * nV * sizeof(double)); + assert (cudaStat == cudaSuccess); double* d_tau_x; lda = nO * nO; - cudaMalloc((void **)&d_tau_x, lda * nV * nV * sizeof(double)); + cudaStat = cudaMalloc((void **)&d_tau_x, lda * nV * nV * sizeof(double)); + assert (cudaStat == cudaSuccess); double* d_t1; - cudaMalloc((void **)&d_t1, nO * nV * sizeof(double)); + cudaStat = cudaMalloc((void **)&d_t1, nO * nV * sizeof(double)); + assert (cudaStat == cudaSuccess); double* d_t2; - cudaMalloc((void **)&d_t2, nO*nO*nV*nV * sizeof(double)); + cudaStat = cudaMalloc((void **)&d_t2, nO*nO*nV*nV * sizeof(double)); + assert (cudaStat == cudaSuccess); double* d_H_oo; - cudaMalloc((void **)&d_H_oo, nO * nO * sizeof(double)); + cudaStat = cudaMalloc((void **)&d_H_oo, nO * nO * sizeof(double)); + assert (cudaStat == cudaSuccess); double* d_H_vo; - cudaMalloc((void **)&d_H_vo, nV * nO * sizeof(double)); + cudaStat = cudaMalloc((void **)&d_H_vo, nV * nO * sizeof(double)); + assert (cudaStat == cudaSuccess); double* d_H_vv; - cudaMalloc((void **)&d_H_vv, nV * nV * sizeof(double)); + cudaStat = cudaMalloc((void **)&d_H_vv, nV * nV * sizeof(double)); + assert (cudaStat == cudaSuccess); data[igpu].cc_space_v_oo_chol = d_cc_space_v_oo_chol; data[igpu].cc_space_v_ov_chol = d_cc_space_v_ov_chol; @@ -164,3 +191,41 @@ gpu_data* gpu_init( } +void gpu_deinit(gpu_data* data) +{ + int ngpus = 1; + if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus); + + #pragma omp parallel num_threads(ngpus) + { + size_t lda; + int igpu = omp_get_thread_num(); + cudaSetDevice(igpu); + + free(data[igpu].cc_space_v_oo_chol); + free(data[igpu].cc_space_v_ov_chol); + free(data[igpu].cc_space_v_vo_chol); + free(data[igpu].cc_space_v_vv_chol); + free(data[igpu].cc_space_v_oooo); + free(data[igpu].cc_space_v_vooo); + free(data[igpu].cc_space_v_voov); + free(data[igpu].cc_space_v_oovv); + free(data[igpu].cc_space_v_vvoo); + free(data[igpu].cc_space_v_oovo); + free(data[igpu].cc_space_v_ovvo); + free(data[igpu].cc_space_v_ovov); + free(data[igpu].cc_space_v_ovoo); + free(data[igpu].cc_space_f_oo); + free(data[igpu].cc_space_f_ov); + free(data[igpu].cc_space_f_vo); + free(data[igpu].cc_space_f_vv); + free(data[igpu].tau); + free(data[igpu].tau_x); + free(data[igpu].t1); + free(data[igpu].t2); + free(data[igpu].H_oo); + free(data[igpu].H_vo); + free(data[igpu].H_vv); + } +} + diff --git a/devel/ccsd_gpu/gpu_module.f90 b/devel/ccsd_gpu/gpu_module.f90 index 2dceffc..abd81f0 100644 --- a/devel/ccsd_gpu/gpu_module.f90 +++ b/devel/ccsd_gpu/gpu_module.f90 @@ -30,6 +30,32 @@ module gpu_module real(c_double), intent(in) :: cc_space_f_vv(nV,nV) end function + type(c_ptr) function gpu_init_sp(nO, nV, cholesky_mo_num, & + cc_space_v_oo_chol, cc_space_v_ov_chol, cc_space_v_vo_chol, cc_space_v_vv_chol, & + cc_space_v_oooo, cc_space_v_vooo, cc_space_v_voov, cc_space_v_oovv, cc_space_v_vvoo, & + cc_space_v_oovo, cc_space_v_ovvo, cc_space_v_ovov, cc_space_v_ovoo, & + cc_space_f_oo, cc_space_f_ov, cc_space_f_vo, cc_space_f_vv) bind(C) + import c_int, c_double, c_ptr + integer(c_int), intent(in), value :: nO, nV, cholesky_mo_num + real(c_double), intent(in) :: cc_space_v_oo_chol(cholesky_mo_num,nO,nO) + real(c_double), intent(in) :: cc_space_v_ov_chol(cholesky_mo_num,nO,nV) + real(c_double), intent(in) :: cc_space_v_vo_chol(cholesky_mo_num,nV,nO) + real(c_double), intent(in) :: cc_space_v_vv_chol(cholesky_mo_num,nV,nV) + real(c_double), intent(in) :: cc_space_v_oooo(nO,nO,nO,nO) + real(c_double), intent(in) :: cc_space_v_vooo(nV,nO,nO,nO) + real(c_double), intent(in) :: cc_space_v_voov(nV,nO,nO,nV) + real(c_double), intent(in) :: cc_space_v_oovv(nO,nO,nV,nV) + real(c_double), intent(in) :: cc_space_v_vvoo(nV,nV,nO,nO) + real(c_double), intent(in) :: cc_space_v_oovo(nO,nO,nV,nO) + real(c_double), intent(in) :: cc_space_v_ovvo(nO,nV,nV,nO) + real(c_double), intent(in) :: cc_space_v_ovov(nO,nV,nO,nV) + real(c_double), intent(in) :: cc_space_v_ovoo(nO,nV,nO,nO) + real(c_double), intent(in) :: cc_space_f_oo(nO,nO) + real(c_double), intent(in) :: cc_space_f_ov(nO,nV) + real(c_double), intent(in) :: cc_space_f_vo(nV,nO) + real(c_double), intent(in) :: cc_space_f_vv(nV,nV) + end function + subroutine gpu_upload(gpu_data, nO, nV, t1, t2) bind(C) import c_int, c_double, c_ptr type(c_ptr), value :: gpu_data @@ -38,21 +64,29 @@ module gpu_module real(c_double), intent(in) :: t2(nO,nO,nV,nV) end subroutine + subroutine gpu_upload_sp(gpu_data, nO, nV, t1, t2) bind(C) + import c_int, c_double, c_ptr + type(c_ptr), value :: gpu_data + integer(c_int), intent(in), value :: nO, nV + real(c_double), intent(in) :: t1(nO,nV) + real(c_double), intent(in) :: t2(nO,nO,nV,nV) + end subroutine + subroutine compute_H_oo_chol_gpu(gpu_data, igpu) bind(C) - import c_int, c_double, c_ptr + import c_int, c_ptr type(c_ptr), value :: gpu_data integer(c_int), intent(in), value :: igpu end subroutine subroutine compute_H_vo_chol_gpu(gpu_data, igpu) bind(C) - import c_int, c_double, c_ptr + import c_int, c_ptr type(c_ptr), value :: gpu_data integer(c_int), intent(in), value :: igpu end subroutine subroutine compute_H_vv_chol_gpu(gpu_data, igpu) bind(C) - import c_int, c_double, c_ptr + import c_int, c_ptr type(c_ptr), value :: gpu_data integer(c_int), intent(in), value :: igpu end subroutine @@ -81,6 +115,47 @@ module gpu_module end function + subroutine compute_H_oo_chol_gpu_sp(gpu_data, igpu) bind(C) + import c_int, c_ptr + type(c_ptr), value :: gpu_data + integer(c_int), intent(in), value :: igpu + end subroutine + + subroutine compute_H_vo_chol_gpu_sp(gpu_data, igpu) bind(C) + import c_int, c_ptr + type(c_ptr), value :: gpu_data + integer(c_int), intent(in), value :: igpu + end subroutine + + subroutine compute_H_vv_chol_gpu_sp(gpu_data, igpu) bind(C) + import c_int, c_ptr + type(c_ptr), value :: gpu_data + integer(c_int), intent(in), value :: igpu + end subroutine + + subroutine compute_r1_space_chol_gpu_sp(gpu_data, nO, nV, t1, r1, max_r1) bind(C) + import c_int, c_double, c_ptr + type(c_ptr), value :: gpu_data + integer(c_int), intent(in), value :: nO, nV + real(c_double), intent(in) :: t1(nO,nV) + real(c_double), intent(out) :: r1(nO,nO,nV,nV) + real(c_double), intent(out) :: max_r1 + end subroutine + + subroutine compute_r2_space_chol_gpu_sp(gpu_data, nO, nV, t1, r2, max_r2) bind(C) + import c_int, c_double, c_ptr + type(c_ptr), value :: gpu_data + integer(c_int), intent(in), value :: nO, nV + real(c_double), intent(in) :: t1(nO,nV) + real(c_double), intent(out) :: r2(nO,nO,nV,nV) + real(c_double), intent(out) :: max_r2 + end subroutine + + double precision function ccsd_energy_space_gpu_sp(gpu_data) bind(C) + import c_ptr + type(c_ptr), value :: gpu_data + end function + subroutine gpu_dgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) bind(C) import c_int, c_double, c_char character(c_char), value :: transa, transb