From ed440c16a2f5f9541eb22003e64d7e6c5afc0842 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Sun, 16 Jul 2023 17:27:14 +0200 Subject: [PATCH] Working --- devel/ccsd_gpu/gpu.c | 272 +++++++++++++++++++++++++------------------ 1 file changed, 157 insertions(+), 115 deletions(-) diff --git a/devel/ccsd_gpu/gpu.c b/devel/ccsd_gpu/gpu.c index 47cf01b..71a2c09 100644 --- a/devel/ccsd_gpu/gpu.c +++ b/devel/ccsd_gpu/gpu.c @@ -1,9 +1,10 @@ #include #include +#include #include #include - +#define NGPUS 2 #define BLOCK_SIZE 16 void dgemm_(char*, char*, int*, int*, int*, double*, double*, int*, double*, int*, @@ -65,134 +66,175 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo double* cc_space_v_vv_chol, double* r2) { - int m,n,k, lda, ldb, ldc; - double alpha, beta; - double* A; - double* B; - double* C; + int m,n,k, lda, ldb, ldc; + double alpha, beta; + double* A; + double* B; + double* C; - cublasHandle_t handle; - cublasCreate(&handle); - - double* d_tau; - lda = nO * nO; - cudaMalloc((void **)&d_tau, lda * nV * nV * sizeof(double)); - cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau, lda, d_tau, lda); - - double* d_r2; - lda = nO * nO; - cudaMalloc((void **)&d_r2, lda * nV * nV * sizeof(double)); - - double* d_cc_space_v_vv_chol; - lda = cholesky_mo_num * nV; - cudaMalloc((void **)&d_cc_space_v_vv_chol, lda * nV * sizeof(double)); - cublasSetMatrix(cholesky_mo_num*nV, nV, sizeof(double), cc_space_v_vv_chol, lda, d_cc_space_v_vv_chol, lda); - - double* d_cc_space_v_vo_chol; - lda = cholesky_mo_num * nV; - cudaMalloc((void **)&d_cc_space_v_vo_chol, lda * nO * sizeof(double)); - cublasSetMatrix(cholesky_mo_num*nV, nO, sizeof(double), cc_space_v_vo_chol, lda, d_cc_space_v_vo_chol, lda); - - double* d_t1; - lda = nO; - cudaMalloc((void **)&d_t1, nO * nV * sizeof(double)); - cublasSetMatrix(nO, nV, sizeof(double), t1, lda, d_t1, lda); - - double* d_tmp_cc; - lda = cholesky_mo_num * nV; - cudaMalloc((void **)&d_tmp_cc, lda * nV * sizeof(double)); - - alpha=1.0; beta=0.0; - m=cholesky_mo_num*nV; n=nV; k=nO; - A = d_cc_space_v_vo_chol; B = d_t1; C = d_tmp_cc; - cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, m, B, k, &beta, C, m); - cublasDestroy(handle); + double* d_taus[NGPUS]; + double* d_r2s[NGPUS]; + double* d_cc_space_v_vv_chols[NGPUS]; + double* d_cc_space_v_vo_chols[NGPUS]; + double* d_t1s[NGPUS]; + double* d_tmp_ccs[NGPUS]; + cublasHandle_t handles[NGPUS]; #pragma omp parallel { - cublasHandle_t handle; - cublasCreate(&handle); - double* d_tmp_cc2; - cudaMalloc((void **)&d_tmp_cc2, cholesky_mo_num*nV*sizeof(double)); + int ithread = omp_get_thread_num(); + int igpu = ithread % NGPUS; + cudaSetDevice(igpu); - double* d_B1; - cudaMalloc((void**)&d_B1, nV*nV*BLOCK_SIZE*sizeof(double)); + if (ithread < NGPUS) { + cublasCreate(&handles[ithread]); + } - double* d_tmpB1; - cudaMalloc((void**)&d_tmpB1, nV*BLOCK_SIZE*nV*sizeof(double)); + #pragma omp barrier - #pragma omp for - for (size_t gam=0 ; gam