From 5da8e2cba41db652006ccac4c14239a3af63f336 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Wed, 2 Aug 2023 16:17:43 +0200 Subject: [PATCH] Upload integrals only once --- devel/ccsd_gpu/ccsd_space_orb_sub.irp.f | 11 +- devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f | 12 +- devel/ccsd_gpu/gpu.c | 156 ++++--------------- devel/ccsd_gpu/gpu.h | 15 ++ devel/ccsd_gpu/gpu_dgemm.c | 61 ++++++++ devel/ccsd_gpu/gpu_init.c | 105 +++++++++++++ devel/ccsd_gpu/gpu_module.f90 | 26 ++-- 7 files changed, 241 insertions(+), 145 deletions(-) create mode 100644 devel/ccsd_gpu/gpu.h create mode 100644 devel/ccsd_gpu/gpu_dgemm.c create mode 100644 devel/ccsd_gpu/gpu_init.c diff --git a/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f b/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f index b48ca7d..3d5969f 100644 --- a/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f +++ b/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f @@ -1,5 +1,5 @@ subroutine run_ccsd_space_orb - + use gpu_module implicit none integer :: i,j,k,l,a,b,c,d,tmp_a,tmp_b,tmp_c,tmp_d @@ -115,6 +115,13 @@ subroutine run_ccsd_space_orb write(*,'(A77)') ' -----------------------------------------------------------------------------' call wall_time(ta) + type(c_ptr) :: gpu_data + + gpu_data = gpu_init(nO, nV, cholesky_mo_num, & + cc_space_v_oo_chol, cc_space_v_ov_chol, cc_space_v_vo_chol, cc_space_v_vv_chol, & + cc_space_v_oooo, cc_space_v_vooo, cc_space_v_oovv, cc_space_v_vvoo, & + cc_space_v_oovo, cc_space_v_ovvo, cc_space_v_ovoo, cc_space_f_vo) + do while (not_converged) ! Residue @@ -125,7 +132,7 @@ subroutine run_ccsd_space_orb call compute_H_vo_chol(nO,nV,t1,H_vo) call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) - call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) + call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2,gpu_data) else call compute_H_oo(nO,nV,t1,t2,tau,H_oo) call compute_H_vv(nO,nV,t1,t2,tau,H_vv) diff --git a/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f b/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f index 0de043a..3cc3d0c 100644 --- a/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f +++ b/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f @@ -441,7 +441,7 @@ end ! R2 -subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) +subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2,gpu_data) use gpu_module implicit none @@ -449,6 +449,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) integer, intent(in) :: nO, nV double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV), tau(nO,nO,nV,nV) double precision, intent(in) :: H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO) + type(c_ptr), intent(in) :: gpu_data ! out double precision, intent(out) :: r2(nO,nO,nV,nV), max_r2 @@ -493,11 +494,8 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) allocate(J1(nO,nV,nV,nO)) J1 = 0.d0 - call compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num,t1,t2,tau, & - cc_space_v_oo_chol, cc_space_v_ov_chol, cc_space_v_vo_chol, cc_space_v_vv_chol, & - cc_space_v_oooo, cc_space_v_vooo, cc_space_v_oovv, cc_space_v_vvoo, & - cc_space_v_oovo, cc_space_v_ovvo, cc_space_v_ovoo, & - cc_space_f_vo, H_vv, g_occ, J1, r2) + call compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num,gpu_data,t1,t2,tau, & + H_vv, g_occ, J1, r2) !--- @@ -747,6 +745,8 @@ subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvoo,J1) double precision, allocatable :: tmp_cc(:,:,:), J1_tmp(:,:,:,:) +!TODO: I am here + !- cc_space_v_vvoo(a,b,i,j) * (0.5d0 * t2(u,j,b,beta) + t1(u,b) * t1(j,beta)) & double precision, allocatable :: X_voov(:,:,:,:), Z_ovvo(:,:,:,:) allocate(X_voov(nV,nO,nO,nV), Z_ovvo(nO,nV,nV,nO)) diff --git a/devel/ccsd_gpu/gpu.c b/devel/ccsd_gpu/gpu.c index 3deb9b9..345d3fe 100644 --- a/devel/ccsd_gpu/gpu.c +++ b/devel/ccsd_gpu/gpu.c @@ -4,85 +4,25 @@ #include #include #include +#include "gpu.h" #define BLOCK_SIZE 16 -void dgemm_(char*, char*, int*, int*, int*, double*, double*, int*, double*, int*, - double*, double*, int*); - - - - -void gpu_dgemm(char transa, char transb, int m, int n, int k, double alpha, - double* A, int lda, double* B, int ldb, double beta, double* C, int ldc) -{ - cublasHandle_t handle; - cublasCreate(&handle); - - double * d_A; - double * d_B; - double * d_C; - cublasOperation_t ta, tb; - - if (transa == 'N') { - cudaMalloc((void**)&d_A, lda*k*sizeof(double)); - cublasSetMatrix(m, k, sizeof(double), A, lda, d_A, lda); - ta = CUBLAS_OP_N; - } else { - cudaMalloc((void**)&d_A, lda*m*sizeof(double)); - cublasSetMatrix(k, m, sizeof(double), A, lda, d_A, lda); - ta = CUBLAS_OP_T; - } - - if (transb == 'N') { - cudaMalloc((void**)&d_B, ldb*n*sizeof(double)); - cublasSetMatrix(k, n, sizeof(double), B, ldb, d_B, ldb); - tb = CUBLAS_OP_N; - } else { - cudaMalloc((void**)&d_B, ldb*k*sizeof(double)); - cublasSetMatrix(n, k, sizeof(double), B, ldb, d_B, ldb); - tb = CUBLAS_OP_T; - } - - cudaMalloc((void**)&d_C, ldc*n*sizeof(double)); - if (beta != 0.) { - cublasSetMatrix(m, n, sizeof(double), C, ldc, d_C, ldc); - } - - cublasDgemm(handle, ta, tb, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, ldc); - - cublasGetMatrix(m, n, sizeof(double), d_C, ldc, C, ldc); - - cudaFree(d_A); - cudaFree(d_B); - cudaFree(d_C); - cublasDestroy(handle); -} void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo_num, + gpu_data* data, double* t1, double* t2, double* tau, - double* cc_space_v_oo_chol, - double* cc_space_v_ov_chol, - double* cc_space_v_vo_chol, - double* cc_space_v_vv_chol, - double* cc_space_v_oooo, - double* cc_space_v_vooo, - double* cc_space_v_oovv, - double* cc_space_v_vvoo, - double* cc_space_v_oovo, - double* cc_space_v_ovvo, - double* cc_space_v_ovoo, - double* cc_space_f_vo, double* H_vv, double* g_occ, double* J1, double* r2) { - int ngpus; + int ngpus = 1; cudaGetDeviceCount(&ngpus); + #pragma omp parallel num_threads(ngpus) { int m,n,k, lda, ldb, ldc; @@ -91,20 +31,28 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo double* B; double* C; - int ithread = omp_get_thread_num(); - int igpu = ithread ; -//igpu=1; + int igpu = omp_get_thread_num(); + cudaSetDevice(igpu); cublasHandle_t handle; cublasCreate(&handle); + double* d_cc_space_v_oo_chol = data[igpu].cc_space_v_oo_chol; + double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol; + double* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol; + double* d_cc_space_v_vv_chol = data[igpu].cc_space_v_vv_chol; + double* d_cc_space_v_oooo = data[igpu].cc_space_v_oooo; + double* d_cc_space_v_vooo = data[igpu].cc_space_v_vooo; + double* d_cc_space_v_oovv = data[igpu].cc_space_v_oovv; + double* d_cc_space_v_vvoo = data[igpu].cc_space_v_vvoo; + double* d_cc_space_v_oovo = data[igpu].cc_space_v_oovo; + double* d_cc_space_v_ovvo = data[igpu].cc_space_v_ovvo; + double* d_cc_space_v_ovoo = data[igpu].cc_space_v_ovoo; + double* d_cc_space_f_vo = data[igpu].cc_space_f_vo; + double* d_tau; double* d_r2; - double* d_cc_space_v_oo_chol; - double* d_cc_space_v_ov_chol; - double* d_cc_space_v_vo_chol; - double* d_cc_space_v_vv_chol; double* d_t1; double* d_t2; double* d_tmp_cc; @@ -118,27 +66,6 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo memset(r2, 0, nO*nO*nV*nV*sizeof(double)); cublasSetMatrix(nO*nO, nV*nV, sizeof(double), r2, lda, d_r2, lda); - lda = cholesky_mo_num * nV; - cudaMalloc((void **)&d_cc_space_v_vv_chol, lda * nV * sizeof(double)); - cublasSetMatrix(cholesky_mo_num*nV, nV, sizeof(double), cc_space_v_vv_chol, lda, d_cc_space_v_vv_chol, lda); - - lda = cholesky_mo_num * nO; - cudaMalloc((void **)&d_cc_space_v_oo_chol, lda * nO * sizeof(double)); - cublasSetMatrix(cholesky_mo_num*nO, nO, sizeof(double), cc_space_v_oo_chol, lda, d_cc_space_v_oo_chol, lda); - - lda = cholesky_mo_num * nO; - cudaMalloc((void **)&d_cc_space_v_ov_chol, lda * nV * sizeof(double)); - cublasSetMatrix(cholesky_mo_num*nO, nV, sizeof(double), cc_space_v_ov_chol, lda, d_cc_space_v_ov_chol, lda); - - lda = cholesky_mo_num * nV; - cudaMalloc((void **)&d_cc_space_v_vo_chol, lda * nO * sizeof(double)); - cublasSetMatrix(cholesky_mo_num*nV, nO, sizeof(double), cc_space_v_vo_chol, lda, d_cc_space_v_vo_chol, lda); - - double* d_cc_space_v_ovvo; - lda = nO*nV; - cudaMalloc((void **)&d_cc_space_v_ovvo, nO*nV*nV*nO * sizeof(double)); - cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_ovvo, lda, d_cc_space_v_ovvo, lda); - lda = nO; cudaMalloc((void **)&d_t1, nO * nV * sizeof(double)); cublasSetMatrix(nO, nV, sizeof(double), t1, lda, d_t1, lda); @@ -147,19 +74,11 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo cudaMalloc((void **)&d_t2, nO*nO*nV*nV * sizeof(double)); cublasSetMatrix(nO*nO, nV*nV, sizeof(double), t2, lda, d_t2, lda); - double* d_cc_space_f_vo; - cudaMalloc((void**)&d_cc_space_f_vo, nV*nO*sizeof(double)); - cublasSetMatrix(nV, nO, sizeof(double), cc_space_f_vo, nV, d_cc_space_f_vo, nV); - #pragma omp sections { #pragma omp section { - double* d_cc_space_v_vooo; - cudaMalloc((void**)&d_cc_space_v_vooo, nV*nO*nO*nO*sizeof(double)); - cublasSetMatrix(nV*nO, nO*nO, sizeof(double), cc_space_v_vooo, nV*nO, d_cc_space_v_vooo, nV*nO); - double* d_Y_oooo; cudaMalloc((void**)&d_Y_oooo, nO*nO*nO*nO*sizeof(double)); @@ -170,15 +89,10 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo B = d_cc_space_v_vooo ; ldb = nV; C = d_Y_oooo; ldc = nO; cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc); - cudaFree(d_cc_space_v_vooo); double* d_A1; cudaMalloc((void**)&d_A1, nO*nO*nO*nO*sizeof(double)); - double* d_cc_space_v_oooo; - cudaMalloc((void**)&d_cc_space_v_oooo, nO*nO*nO*nO*sizeof(double)); - cublasSetMatrix(nO*nO, nO*nO, sizeof(double), cc_space_v_oooo, nO*nO, d_cc_space_v_oooo, nO*nO); - alpha = 1.0; beta = 1.0; A = d_cc_space_v_oooo; lda = nO*nO; @@ -198,10 +112,6 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo } cudaFree(d_Y_oooo); - double* d_cc_space_v_vvoo; - cudaMalloc((void**)&d_cc_space_v_vvoo, nV*nV*nO*nO*sizeof(double)); - cublasSetMatrix(nV*nV, nO*nO, sizeof(double), cc_space_v_vvoo, nV*nV, d_cc_space_v_vvoo, nV*nV); - alpha = 1.0; beta = 1.0; m=nO*nO ; n=nO*nO; k=nV*nV; @@ -209,7 +119,6 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo B = d_cc_space_v_vvoo ; ldb = nV*nV; C = d_A1; ldc = nO*nO; cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc); - cudaFree(d_cc_space_v_vvoo); alpha = 1.0; beta = 0.0; @@ -529,11 +438,6 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo #pragma omp section { - double* d_cc_space_v_oovo; - lda = nO*nO; - cudaMalloc((void **)&d_cc_space_v_oovo, nO*nO*nV*nO * sizeof(double)); - cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_oovo, lda, d_cc_space_v_oovo, lda); - double* d_X_oovv; lda = nO*nO; cudaMalloc((void **)&d_X_oovv, nO*nO*nV*nV * sizeof(double)); @@ -562,7 +466,6 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc); } } - cudaFree(d_cc_space_v_oovo); double* d_X_vovo; lda = nV*nO; @@ -618,6 +521,16 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo cudaFree(d_X_oovv); } + #pragma omp section + { + alpha = 1.0; + beta = 1.0; + A = d_r2; lda = nO*nO; + B = d_cc_space_v_oovv; ldb = nO*nO; + C = d_r2; ldc = nO*nO; + cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO*nO, nV*nV, &alpha, A, lda, &beta, B, ldb, C, ldc); + + } #pragma omp section { @@ -634,11 +547,6 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO*nV, nV*nO, &alpha, A, lda, &beta, B, ldb, C, ldc); - double* d_cc_space_v_ovoo; - lda = nO*nV; - cudaMalloc((void **)&d_cc_space_v_ovoo, nO*nV*nO*nO * sizeof(double)); - cublasSetMatrix(lda, nO*nO, sizeof(double), cc_space_v_ovoo, lda, d_cc_space_v_ovoo, lda); - double* d_X_ovoo; lda = nO*nV; cudaMalloc((void **)&d_X_ovoo, nO*nV*nO*nO * sizeof(double)); @@ -795,8 +703,6 @@ cublasGetMatrix(nO*nV, nV*nO, sizeof(double), d_J1, lda, J1, lda); cudaFree(d_B1); cudaFree(d_tmp_cc2); - cudaFree(d_cc_space_v_vo_chol); - cudaFree(d_cc_space_v_vv_chol); cudaFree(d_tau); cudaFree(d_t1); cudaFree(d_tmp_cc); @@ -815,10 +721,6 @@ cublasGetMatrix(nO*nV, nV*nO, sizeof(double), d_J1, lda, J1, lda); cublasDestroy(handle); } - for (size_t i=0 ; i<(size_t) nO*nO*nV*nV ; ++i) - { - r2[i] += cc_space_v_oovv[i]; - } } diff --git a/devel/ccsd_gpu/gpu.h b/devel/ccsd_gpu/gpu.h new file mode 100644 index 0000000..93dd5e2 --- /dev/null +++ b/devel/ccsd_gpu/gpu.h @@ -0,0 +1,15 @@ +typedef struct { + double* cc_space_v_oo_chol; + double* cc_space_v_ov_chol; + double* cc_space_v_vo_chol; + double* cc_space_v_vv_chol; + double* cc_space_v_oooo; + double* cc_space_v_vooo; + double* cc_space_v_oovv; + double* cc_space_v_vvoo; + double* cc_space_v_oovo; + double* cc_space_v_ovvo; + double* cc_space_v_ovoo; + double* cc_space_f_vo; +} gpu_data; + diff --git a/devel/ccsd_gpu/gpu_dgemm.c b/devel/ccsd_gpu/gpu_dgemm.c new file mode 100644 index 0000000..6530b2b --- /dev/null +++ b/devel/ccsd_gpu/gpu_dgemm.c @@ -0,0 +1,61 @@ +#include +#include +#include +#include +#include +#include + +#define BLOCK_SIZE 16 + +void dgemm_(char*, char*, int*, int*, int*, double*, double*, int*, double*, int*, + double*, double*, int*); + + + + +void gpu_dgemm(char transa, char transb, int m, int n, int k, double alpha, + double* A, int lda, double* B, int ldb, double beta, double* C, int ldc) +{ + cublasHandle_t handle; + cublasCreate(&handle); + + double * d_A; + double * d_B; + double * d_C; + cublasOperation_t ta, tb; + + if (transa == 'N') { + cudaMalloc((void**)&d_A, lda*k*sizeof(double)); + cublasSetMatrix(m, k, sizeof(double), A, lda, d_A, lda); + ta = CUBLAS_OP_N; + } else { + cudaMalloc((void**)&d_A, lda*m*sizeof(double)); + cublasSetMatrix(k, m, sizeof(double), A, lda, d_A, lda); + ta = CUBLAS_OP_T; + } + + if (transb == 'N') { + cudaMalloc((void**)&d_B, ldb*n*sizeof(double)); + cublasSetMatrix(k, n, sizeof(double), B, ldb, d_B, ldb); + tb = CUBLAS_OP_N; + } else { + cudaMalloc((void**)&d_B, ldb*k*sizeof(double)); + cublasSetMatrix(n, k, sizeof(double), B, ldb, d_B, ldb); + tb = CUBLAS_OP_T; + } + + cudaMalloc((void**)&d_C, ldc*n*sizeof(double)); + if (beta != 0.) { + cublasSetMatrix(m, n, sizeof(double), C, ldc, d_C, ldc); + } + + cublasDgemm(handle, ta, tb, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, ldc); + + cublasGetMatrix(m, n, sizeof(double), d_C, ldc, C, ldc); + + cudaFree(d_A); + cudaFree(d_B); + cudaFree(d_C); + cublasDestroy(handle); +} + diff --git a/devel/ccsd_gpu/gpu_init.c b/devel/ccsd_gpu/gpu_init.c new file mode 100644 index 0000000..dc58a73 --- /dev/null +++ b/devel/ccsd_gpu/gpu_init.c @@ -0,0 +1,105 @@ +#include +#include +#include +#include +#include +#include +#include "gpu.h" + +#define BLOCK_SIZE 16 + +gpu_data* gpu_init( + int nO, int nV, int cholesky_mo_num, + double* cc_space_v_oo_chol, double* cc_space_v_ov_chol, + double* cc_space_v_vo_chol, double* cc_space_v_vv_chol, + double* cc_space_v_oooo, double* cc_space_v_vooo, + double* cc_space_v_oovv, double* cc_space_v_vvoo, + double* cc_space_v_oovo, double* cc_space_v_ovvo, + double* cc_space_v_ovoo, double* cc_space_f_vo) +{ + int ngpus = 1; + cudaGetDeviceCount(&ngpus); + + gpu_data* data = (gpu_data*) malloc (ngpus*sizeof(gpu_data)); + + #pragma omp parallel num_threads(ngpus) + { + int lda; + int igpu = omp_get_thread_num(); + cudaSetDevice(igpu); + cublasHandle_t handle; + + cublasCreate(&handle); + + double* d_cc_space_v_oo_chol; + lda = cholesky_mo_num * nO; + cudaMalloc((void **)&d_cc_space_v_oo_chol, lda * nO * sizeof(double)); + cublasSetMatrix(cholesky_mo_num*nO, nO, sizeof(double), cc_space_v_oo_chol, lda, d_cc_space_v_oo_chol, lda); + + double* d_cc_space_v_ov_chol; + lda = cholesky_mo_num * nO; + cudaMalloc((void **)&d_cc_space_v_ov_chol, lda * nV * sizeof(double)); + cublasSetMatrix(cholesky_mo_num*nO, nV, sizeof(double), cc_space_v_ov_chol, lda, d_cc_space_v_ov_chol, lda); + + double* d_cc_space_v_vo_chol; + lda = cholesky_mo_num * nV; + cudaMalloc((void **)&d_cc_space_v_vo_chol, lda * nO * sizeof(double)); + cublasSetMatrix(cholesky_mo_num*nV, nO, sizeof(double), cc_space_v_vo_chol, lda, d_cc_space_v_vo_chol, lda); + + double* d_cc_space_v_vv_chol; + lda = cholesky_mo_num * nV; + cudaMalloc((void **)&d_cc_space_v_vv_chol, lda * nV * sizeof(double)); + cublasSetMatrix(cholesky_mo_num*nV, nV, sizeof(double), cc_space_v_vv_chol, lda, d_cc_space_v_vv_chol, lda); + + double* d_cc_space_v_oooo; + cudaMalloc((void**)&d_cc_space_v_oooo, nO*nO*nO*nO*sizeof(double)); + cublasSetMatrix(nO*nO, nO*nO, sizeof(double), cc_space_v_oooo, nO*nO, d_cc_space_v_oooo, nO*nO); + + double* d_cc_space_v_vooo; + cudaMalloc((void**)&d_cc_space_v_vooo, nV*nO*nO*nO*sizeof(double)); + cublasSetMatrix(nV*nO, nO*nO, sizeof(double), cc_space_v_vooo, nV*nO, d_cc_space_v_vooo, nV*nO); + + double* d_cc_space_v_oovv; + cudaMalloc((void**)&d_cc_space_v_oovv, nO*nO*nV*nV*sizeof(double)); + cublasSetMatrix(nO*nO, nV*nV, sizeof(double), cc_space_v_oovv, nO*nO, d_cc_space_v_oovv, nO*nO); + + double* d_cc_space_v_vvoo; + cudaMalloc((void**)&d_cc_space_v_vvoo, nV*nV*nO*nO*sizeof(double)); + cublasSetMatrix(nV*nV, nO*nO, sizeof(double), cc_space_v_vvoo, nV*nV, d_cc_space_v_vvoo, nV*nV); + + double* d_cc_space_v_oovo; + lda = nO*nO; + cudaMalloc((void **)&d_cc_space_v_oovo, nO*nO*nV*nO * sizeof(double)); + cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_oovo, lda, d_cc_space_v_oovo, lda); + + double* d_cc_space_v_ovvo; + lda = nO*nV; + cudaMalloc((void **)&d_cc_space_v_ovvo, nO*nV*nV*nO * sizeof(double)); + cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_ovvo, lda, d_cc_space_v_ovvo, lda); + + double* d_cc_space_v_ovoo; + lda = nO*nV; + cudaMalloc((void **)&d_cc_space_v_ovoo, nO*nV*nO*nO * sizeof(double)); + cublasSetMatrix(lda, nO*nO, sizeof(double), cc_space_v_ovoo, lda, d_cc_space_v_ovoo, lda); + + double* d_cc_space_f_vo; + cudaMalloc((void**)&d_cc_space_f_vo, nV*nO*sizeof(double)); + cublasSetMatrix(nV, nO, sizeof(double), cc_space_f_vo, nV, d_cc_space_f_vo, nV); + + data[igpu].cc_space_v_oo_chol = d_cc_space_v_oo_chol; + data[igpu].cc_space_v_ov_chol = d_cc_space_v_ov_chol; + data[igpu].cc_space_v_vo_chol = d_cc_space_v_vo_chol; + data[igpu].cc_space_v_vv_chol = d_cc_space_v_vv_chol; + data[igpu].cc_space_v_oooo = d_cc_space_v_oooo; + data[igpu].cc_space_v_vooo = d_cc_space_v_vooo; + data[igpu].cc_space_v_oovv = d_cc_space_v_oovv; + data[igpu].cc_space_v_vvoo = d_cc_space_v_vvoo; + data[igpu].cc_space_v_oovo = d_cc_space_v_oovo; + data[igpu].cc_space_v_ovvo = d_cc_space_v_ovvo; + data[igpu].cc_space_v_ovoo = d_cc_space_v_ovoo; + data[igpu].cc_space_f_vo = d_cc_space_f_vo; + } + return data; +} + + diff --git a/devel/ccsd_gpu/gpu_module.f90 b/devel/ccsd_gpu/gpu_module.f90 index 014f803..3653a03 100644 --- a/devel/ccsd_gpu/gpu_module.f90 +++ b/devel/ccsd_gpu/gpu_module.f90 @@ -4,16 +4,12 @@ module gpu_module implicit none interface - subroutine compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num, t1, t2, tau,& - cc_space_v_oo_chol, cc_space_v_ov_chol, cc_space_v_vo_chol, cc_space_v_vv_chol, & - cc_space_v_oooo, cc_space_v_vooo, cc_space_v_oovv, cc_space_v_vvoo, & - cc_space_v_oovo, cc_space_v_ovvo, cc_space_v_ovoo, & - cc_space_f_vo, H_vv, g_occ, J1, r2) bind(C) - import c_int, c_double - integer(c_int), value :: nO, nV, cholesky_mo_num - real(c_double), intent(in) :: t1(nO,nV) - real(c_double), intent(in) :: t2(nO,nO,nV,nV) - real(c_double), intent(in) :: tau(nO,nO,nV,nV) + type(c_ptr) function gpu_init(nO, nV, cholesky_mo_num, & + cc_space_v_oo_chol, cc_space_v_ov_chol, cc_space_v_vo_chol, cc_space_v_vv_chol, & + cc_space_v_oooo, cc_space_v_vooo, cc_space_v_oovv, cc_space_v_vvoo, & + cc_space_v_oovo, cc_space_v_ovvo, cc_space_v_ovoo, cc_space_f_vo) bind(C) + import c_int, c_double, c_ptr + integer(c_int), intent(in), value :: nO, nV, cholesky_mo_num real(c_double), intent(in) :: cc_space_v_oo_chol(cholesky_mo_num,nO,nO) real(c_double), intent(in) :: cc_space_v_ov_chol(cholesky_mo_num,nO,nV) real(c_double), intent(in) :: cc_space_v_vo_chol(cholesky_mo_num,nV,nO) @@ -26,6 +22,16 @@ module gpu_module real(c_double), intent(in) :: cc_space_v_ovvo(nO,nV,nV,nO) real(c_double), intent(in) :: cc_space_v_ovoo(nO,nV,nO,nO) real(c_double), intent(in) :: cc_space_f_vo(nV,nO) + end function + + subroutine compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num, gpu_data, t1, t2, tau,& + H_vv, g_occ, J1, r2) bind(C) + import c_int, c_double, c_ptr + integer(c_int), intent(in), value :: nO, nV, cholesky_mo_num + type(c_ptr), value :: gpu_data + real(c_double), intent(in) :: t1(nO,nV) + real(c_double), intent(in) :: t2(nO,nO,nV,nV) + real(c_double), intent(in) :: tau(nO,nO,nV,nV) real(c_double), intent(in) :: H_vv(nV,nV) real(c_double), intent(in) :: g_occ(nO,nO) real(c_double) :: J1(nO,nV,nV,nO)