diff --git a/devel/ccsd_gpu/LIB b/devel/ccsd_gpu/LIB new file mode 100644 index 0000000..91f54e9 --- /dev/null +++ b/devel/ccsd_gpu/LIB @@ -0,0 +1 @@ +-lcudart -lcublas -lcublasLt diff --git a/devel/ccsd_gpu/gpu.c b/devel/ccsd_gpu/gpu.c index 234faf2..7cee7f7 100644 --- a/devel/ccsd_gpu/gpu.c +++ b/devel/ccsd_gpu/gpu.c @@ -1,11 +1,8 @@ #include #include -/* -#include -#include -#include #include -*/ +#include + #define BLOCK_SIZE 16 @@ -14,7 +11,7 @@ void dgemm_(char*, char*, int*, int*, int*, double*, double*, int*, double*, int -void* compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo_num, +void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo_num, double* t1, double* tau, double* cc_space_v_vo_chol, @@ -29,51 +26,126 @@ void* compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_m double* tmp_cc = malloc(cholesky_mo_num*nV*nV*sizeof(double)); + cublasHandle_t handle; + cublasCreate(&handle); + double* d_tau; + lda = nO * nO; + cudaMalloc((void **)&d_tau, lda * nV * nV * sizeof(double)); + cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau, lda, d_tau, lda); + + double* d_r2; + lda = nO * nO; + cudaMalloc((void **)&d_r2, lda * nV * nV * sizeof(double)); + cublasSetMatrix(nO*nO, nV*nV, sizeof(double), r2, lda, d_r2, lda); + + double* d_cc_space_v_vv_chol; + lda = cholesky_mo_num * nV; + cudaMalloc((void **)&d_cc_space_v_vv_chol, lda * nV * sizeof(double)); + cublasSetMatrix(cholesky_mo_num*nV, nV, sizeof(double), cc_space_v_vv_chol, lda, d_cc_space_v_vv_chol, lda); + + double* d_cc_space_v_vo_chol; + lda = cholesky_mo_num * nV; + cudaMalloc((void **)&d_cc_space_v_vo_chol, lda * nO * sizeof(double)); + cublasSetMatrix(cholesky_mo_num*nV, nO, sizeof(double), cc_space_v_vo_chol, lda, d_cc_space_v_vo_chol, lda); + + double* d_t1; + lda = nO; + cudaMalloc((void **)&d_t1, nO * nV * sizeof(double)); + cublasSetMatrix(nO, nV, sizeof(double), t1, lda, d_t1, lda); + + double* d_tmp_cc; + lda = cholesky_mo_num * nV; + cudaMalloc((void **)&d_tmp_cc, lda * nV * sizeof(double)); + + alpha=1.0; beta=0.0; + m=cholesky_mo_num*nV; n=nV; k=nO; + A = d_cc_space_v_vo_chol; B = d_t1; C = d_tmp_cc; + cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, m, B, k, &beta, C, m); + cublasGetMatrix(m, n, sizeof(double), d_tmp_cc, lda, tmp_cc, lda); + +// --- + +/* m=cholesky_mo_num*nV; n=nV; k=nO; alpha=1.0; beta=0.0; lda=m ; ldb=k ; ldc=m; A=cc_space_v_vo_chol; B=t1; C=tmp_cc; dgemm_("N","N", &m, &n, &k, &alpha, A, &lda, B, &ldb, &beta, C, &ldc); +*/ #pragma omp parallel { - double* tmp_cc2 = malloc(cholesky_mo_num*nV*sizeof(double)); + double* d_tmp_cc2; + cudaMalloc((void **)&d_tmp_cc2, cholesky_mo_num*nV*sizeof(double)); + double* B1 = malloc(nV*nV*BLOCK_SIZE*sizeof(double)); + double* d_B1; + cudaMalloc((void**)&d_B1, nV*nV*BLOCK_SIZE*sizeof(double)); + double* tmpB1 = malloc(nV*BLOCK_SIZE*nV*sizeof(double)); + double* d_tmpB1; + cudaMalloc((void**)&d_tmpB1, nV*BLOCK_SIZE*nV*sizeof(double)); #pragma omp for for (size_t gam=0 ; gam