diff --git a/devel/ccsd_gpu/gpu.c b/devel/ccsd_gpu/gpu.c index 71a2c09..2a4a0c3 100644 --- a/devel/ccsd_gpu/gpu.c +++ b/devel/ccsd_gpu/gpu.c @@ -4,7 +4,6 @@ #include #include -#define NGPUS 2 #define BLOCK_SIZE 16 void dgemm_(char*, char*, int*, int*, int*, double*, double*, int*, double*, int*, @@ -66,34 +65,31 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo double* cc_space_v_vv_chol, double* r2) { - int m,n,k, lda, ldb, ldc; - double alpha, beta; - double* A; - double* B; - double* C; + double* d_tau; + double* d_r2; + double* d_cc_space_v_vv_chol; + double* d_cc_space_v_vo_chol; + double* d_t1; + double* d_tmp_cc; - double* d_taus[NGPUS]; - double* d_r2s[NGPUS]; - double* d_cc_space_v_vv_chols[NGPUS]; - double* d_cc_space_v_vo_chols[NGPUS]; - double* d_t1s[NGPUS]; - double* d_tmp_ccs[NGPUS]; + int ngpus; + cudaGetDeviceCount(&ngpus); - cublasHandle_t handles[NGPUS]; - #pragma omp parallel + #pragma omp parallel num_threads(ngpus) { + int m,n,k, lda, ldb, ldc; + double alpha, beta; + double* A; + double* B; + double* C; int ithread = omp_get_thread_num(); - int igpu = ithread % NGPUS; + int igpu = ithread ; + cudaSetDevice(igpu); + cublasHandle_t handle; - if (ithread < NGPUS) { - cublasCreate(&handles[ithread]); - } - - #pragma omp barrier - - cublasHandle_t handle = handles[igpu]; + cublasCreate(&handle); double* d_tau; double* d_r2; @@ -102,49 +98,32 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo double* d_t1; double* d_tmp_cc; - if (ithread < NGPUS) { - lda = nO * nO; - cudaMalloc((void **)&d_tau, lda * nV * nV * sizeof(double)); - cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau, lda, d_tau, lda); - d_taus[igpu] = d_tau; + lda = nO * nO; + cudaMalloc((void **)&d_tau, lda * nV * nV * sizeof(double)); + cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau, lda, d_tau, lda); - lda = nO * nO; - cudaMalloc((void **)&d_r2, lda * nV * nV * sizeof(double)); - d_r2s[igpu] = d_r2; + lda = nO * nO; + cudaMalloc((void **)&d_r2, lda * nV * nV * sizeof(double)); - lda = cholesky_mo_num * nV; - cudaMalloc((void **)&d_cc_space_v_vv_chol, lda * nV * sizeof(double)); - cublasSetMatrix(cholesky_mo_num*nV, nV, sizeof(double), cc_space_v_vv_chol, lda, d_cc_space_v_vv_chol, lda); - d_cc_space_v_vv_chols[igpu] = d_cc_space_v_vv_chol; + lda = cholesky_mo_num * nV; + cudaMalloc((void **)&d_cc_space_v_vv_chol, lda * nV * sizeof(double)); + cublasSetMatrix(cholesky_mo_num*nV, nV, sizeof(double), cc_space_v_vv_chol, lda, d_cc_space_v_vv_chol, lda); - lda = cholesky_mo_num * nV; - cudaMalloc((void **)&d_cc_space_v_vo_chol, lda * nO * sizeof(double)); - cublasSetMatrix(cholesky_mo_num*nV, nO, sizeof(double), cc_space_v_vo_chol, lda, d_cc_space_v_vo_chol, lda); - d_cc_space_v_vo_chols[igpu] = d_cc_space_v_vo_chol; + lda = cholesky_mo_num * nV; + cudaMalloc((void **)&d_cc_space_v_vo_chol, lda * nO * sizeof(double)); + cublasSetMatrix(cholesky_mo_num*nV, nO, sizeof(double), cc_space_v_vo_chol, lda, d_cc_space_v_vo_chol, lda); - lda = nO; - cudaMalloc((void **)&d_t1, nO * nV * sizeof(double)); - cublasSetMatrix(nO, nV, sizeof(double), t1, lda, d_t1, lda); - d_t1s[igpu] = d_t1; + lda = nO; + cudaMalloc((void **)&d_t1, nO * nV * sizeof(double)); + cublasSetMatrix(nO, nV, sizeof(double), t1, lda, d_t1, lda); - lda = cholesky_mo_num * nV; - cudaMalloc((void **)&d_tmp_cc, lda * nV * sizeof(double)); - d_tmp_ccs[igpu] = d_tmp_cc; + lda = cholesky_mo_num * nV; + cudaMalloc((void **)&d_tmp_cc, lda * nV * sizeof(double)); - alpha=1.0; beta=0.0; - m=cholesky_mo_num*nV; n=nV; k=nO; - A = d_cc_space_v_vo_chol; B = d_t1; C = d_tmp_cc; - cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, m, B, k, &beta, C, m); - } - - #pragma omp barrier - - d_tau = d_taus[igpu] ; - d_r2 = d_r2s[igpu] ; - d_cc_space_v_vv_chol = d_cc_space_v_vv_chols[igpu] ; - d_cc_space_v_vo_chol = d_cc_space_v_vo_chols[igpu] ; - d_t1 = d_t1s[igpu] ; - d_tmp_cc = d_tmp_ccs[igpu] ; + alpha=1.0; beta=0.0; + m=cholesky_mo_num*nV; n=nV; k=nO; + A = d_cc_space_v_vo_chol; B = d_t1; C = d_tmp_cc; + cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, m, B, k, &beta, C, m); double* d_tmp_cc2; cudaMalloc((void **)&d_tmp_cc2, cholesky_mo_num*nV*sizeof(double)); @@ -213,27 +192,24 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo cudaFree(d_B1); cudaFree(d_tmp_cc2); - if (igpu < NGPUS) { - cudaFree(d_cc_space_v_vo_chol); - cudaFree(d_cc_space_v_vv_chol); - cudaFree(d_tau); - cudaFree(d_t1); - cudaFree(d_tmp_cc); - double * r2_tmp = malloc(nO*nO*nV*nV*sizeof(double)); - lda=nO*nO; - cublasGetMatrix(nO*nO, nV*nV, sizeof(double), d_r2, lda, r2_tmp, lda); - #pragma omp critical - { - for (size_t i=0 ; i