From 095053333a85f9ed7bde695f1b19844809b86c51 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Mon, 21 Aug 2023 13:17:39 +0200 Subject: [PATCH] Added SP files --- devel/ccsd_gpu/gpu_init_sp.c | 344 ++++++ devel/ccsd_gpu/gpu_sp.c | 2134 ++++++++++++++++++++++++++++++++++ 2 files changed, 2478 insertions(+) create mode 100644 devel/ccsd_gpu/gpu_init_sp.c create mode 100644 devel/ccsd_gpu/gpu_sp.c diff --git a/devel/ccsd_gpu/gpu_init_sp.c b/devel/ccsd_gpu/gpu_init_sp.c new file mode 100644 index 0000000..709f817 --- /dev/null +++ b/devel/ccsd_gpu/gpu_init_sp.c @@ -0,0 +1,344 @@ +#include +#include +#include +#include +#include +#include +#include +#include "gpu.h" + +gpu_data_sp* gpu_init_sp( + int nO, int nV, int cholesky_mo_num, + double* cc_space_v_oo_chol_, double* cc_space_v_ov_chol_, + double* cc_space_v_vo_chol_, double* cc_space_v_vv_chol_, + double* cc_space_v_oooo_, double* cc_space_v_vooo_, double* cc_space_v_voov_, + double* cc_space_v_oovv_, double* cc_space_v_vvoo_, + double* cc_space_v_oovo_, double* cc_space_v_ovvo_, + double* cc_space_v_ovov_, double* cc_space_v_ovoo_, + double* cc_space_f_oo_, double* cc_space_f_ov_, + double* cc_space_f_vo_, double* cc_space_f_vv_) +{ + int ngpus = 1; + if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus); + + gpu_data_sp* data = (gpu_data_sp*) malloc ((size_t) ngpus*sizeof(gpu_data_sp)); + assert (data != NULL); + + #pragma omp parallel num_threads(ngpus) + { + cudaError_t cudaStat = cudaSuccess; + size_t lda; + + int igpu = omp_get_thread_num(); + cudaSetDevice(igpu); + cublasHandle_t handle; + cublasCreate(&handle); + +/* + size_t limit; + size_t mtotal, mfree; + cudaStat = cudaMemGetInfo(&mfree, &mtotal); + fprintf(stderr, "%s\nTotal:%lu\nFree :%lu\n", cudaGetErrorString(cudaStat), mtotal, mfree); +*/ + + float* d_cc_space_v_oo_chol = NULL; + lda = cholesky_mo_num * nO; + cudaStat = cudaMalloc((void **)&d_cc_space_v_oo_chol, lda * nO * sizeof(float)); + + assert (cudaStat == cudaSuccess); + float* cc_space_v_oo_chol = malloc((size_t) lda * nO * sizeof(float)); + assert (cc_space_v_oo_chol != NULL); + for (size_t i=0 ; i<(size_t) lda * nO ; ++i) { + cc_space_v_oo_chol[i] = cc_space_v_oo_chol_[i]; + } + cublasSetMatrix(cholesky_mo_num*nO, nO, sizeof(float), cc_space_v_oo_chol, lda, d_cc_space_v_oo_chol, lda); + free(cc_space_v_oo_chol); + + float* d_cc_space_v_ov_chol = NULL; + lda = cholesky_mo_num * nO; + cudaStat = cudaMalloc((void **)&d_cc_space_v_ov_chol, lda * nV * sizeof(float)); + assert (cudaStat == cudaSuccess); + float* cc_space_v_ov_chol = malloc((size_t) lda * nV * sizeof(float)); + assert (cc_space_v_ov_chol != NULL); + for (size_t i=0 ; i<(size_t) lda * nV ; ++i) { + cc_space_v_ov_chol[i] = cc_space_v_ov_chol_[i]; + } + cublasSetMatrix(cholesky_mo_num*nO, nV, sizeof(float), cc_space_v_ov_chol, lda, d_cc_space_v_ov_chol, lda); + free(cc_space_v_ov_chol); + + float* d_cc_space_v_vo_chol = NULL; + lda = cholesky_mo_num * nV; + cudaStat = cudaMalloc((void **)&d_cc_space_v_vo_chol, lda * nO * sizeof(float)); + assert (cudaStat == cudaSuccess); + float* cc_space_v_vo_chol = malloc((size_t) lda * nO * sizeof(float)); + assert (cc_space_v_vo_chol != NULL); + for (size_t i=0 ; i<(size_t) lda * nO ; ++i) { + cc_space_v_vo_chol[i] = cc_space_v_vo_chol_[i]; + } + cublasSetMatrix(cholesky_mo_num*nV, nO, sizeof(float), cc_space_v_vo_chol, lda, d_cc_space_v_vo_chol, lda); + free(cc_space_v_vo_chol); + + float* d_cc_space_v_vv_chol = NULL; + lda = cholesky_mo_num * nV; + cudaStat = cudaMalloc((void **)&d_cc_space_v_vv_chol, lda * nV * sizeof(float)); + assert (cudaStat == cudaSuccess); + float* cc_space_v_vv_chol = malloc((size_t) lda * nV * sizeof(float)); + assert (cc_space_v_vv_chol != NULL); + for (size_t i=0 ; i< (size_t) lda * nV ; ++i) { + cc_space_v_vv_chol[i] = cc_space_v_vv_chol_[i]; + } + cublasSetMatrix(cholesky_mo_num*nV, nV, sizeof(float), cc_space_v_vv_chol, lda, d_cc_space_v_vv_chol, lda); + free(cc_space_v_vv_chol); + + float* d_cc_space_v_oooo = NULL; + cudaStat = cudaMalloc((void**)&d_cc_space_v_oooo, nO*nO*nO*nO*sizeof(float)); + assert (cudaStat == cudaSuccess); + float* cc_space_v_oooo = malloc((size_t) nO*nO*nO*nO*sizeof(float)); + assert (cc_space_v_oooo != NULL); + for (size_t i=0 ; i<(size_t) nO*nO*nO*nO ; ++i) { + cc_space_v_oooo[i] = cc_space_v_oooo_[i]; + } + cublasSetMatrix(nO*nO, nO*nO, sizeof(float), cc_space_v_oooo, nO*nO, d_cc_space_v_oooo, nO*nO); + free(cc_space_v_oooo); + + float* d_cc_space_v_vooo = NULL; + cudaStat = cudaMalloc((void**)&d_cc_space_v_vooo, nV*nO*nO*nO*sizeof(float)); + assert (cudaStat == cudaSuccess); + float* cc_space_v_vooo = malloc((size_t) nV*nO*nO*nO*sizeof(float)); + assert (cc_space_v_vooo != NULL); + for (size_t i=0 ; i<(size_t) nV*nO*nO*nO; ++i) { + cc_space_v_vooo[i] = cc_space_v_vooo_[i]; + } + cublasSetMatrix(nV*nO, nO*nO, sizeof(float), cc_space_v_vooo, nV*nO, d_cc_space_v_vooo, nV*nO); + free(cc_space_v_vooo); + + float* d_cc_space_v_voov = NULL; + cudaStat = cudaMalloc((void**)&d_cc_space_v_voov, nV*nO*nO*nV*sizeof(float)); + assert (cudaStat == cudaSuccess); + float* cc_space_v_voov = malloc((size_t) nV*nO*nO*nV*sizeof(float)); + assert (cc_space_v_voov != NULL); + for (size_t i=0 ; i<(size_t) nV*nO*nO*nV; ++i) { + cc_space_v_voov[i] = cc_space_v_voov_[i]; + } + cublasSetMatrix(nV*nO, nO*nV, sizeof(float), cc_space_v_voov, nV*nO, d_cc_space_v_voov, nV*nO); + free(cc_space_v_voov); + + float* d_cc_space_v_oovv = NULL; + cudaStat = cudaMalloc((void**)&d_cc_space_v_oovv, nO*nO*nV*nV*sizeof(float)); + assert (cudaStat == cudaSuccess); + float* cc_space_v_oovv = malloc((size_t) nO*nO*nV*nV*sizeof(float)); + assert (cc_space_v_oovv != NULL); + for (size_t i=0 ; i<(size_t) nO*nO*nV*nV; ++i) { + cc_space_v_oovv[i] = cc_space_v_oovv_[i]; + } + cublasSetMatrix(nO*nO, nV*nV, sizeof(float), cc_space_v_oovv, nO*nO, d_cc_space_v_oovv, nO*nO); + free(cc_space_v_oovv); + + float* d_cc_space_v_vvoo = NULL; + cudaStat = cudaMalloc((void**)&d_cc_space_v_vvoo, nV*nV*nO*nO*sizeof(float)); + assert (cudaStat == cudaSuccess); + float* cc_space_v_vvoo = malloc((size_t) nV*nV*nO*nO*sizeof(float)); + assert (cc_space_v_vvoo != NULL); + for (size_t i=0 ; i<(size_t) nV*nV*nO*nO; ++i) { + cc_space_v_vvoo[i] = cc_space_v_vvoo_[i]; + } + cublasSetMatrix(nV*nV, nO*nO, sizeof(float), cc_space_v_vvoo, nV*nV, d_cc_space_v_vvoo, nV*nV); + free(cc_space_v_vvoo); + + float* d_cc_space_v_oovo = NULL; + lda = nO*nO; + cudaStat = cudaMalloc((void **)&d_cc_space_v_oovo, nO*nO*nV*nO * sizeof(float)); + assert (cudaStat == cudaSuccess); + float* cc_space_v_oovo = malloc((size_t) nO*nO*nV*nO * sizeof(float)); + assert (cc_space_v_oovo != NULL); + for (size_t i=0 ; i<(size_t) nO*nO*nV*nO ; ++i) { + cc_space_v_oovo[i] = cc_space_v_oovo_[i]; + } + cublasSetMatrix(lda, nV*nO, sizeof(float), cc_space_v_oovo, lda, d_cc_space_v_oovo, lda); + free(cc_space_v_oovo); + + float* d_cc_space_v_ovvo = NULL; + lda = nO*nV; + cudaStat = cudaMalloc((void **)&d_cc_space_v_ovvo, nO*nV*nV*nO * sizeof(float)); + assert (cudaStat == cudaSuccess); + float* cc_space_v_ovvo = malloc((size_t) nO*nV*nV*nO * sizeof(float)); + assert (cc_space_v_ovvo != NULL); + for (size_t i=0 ; i<(size_t) nO*nV*nV*nO ; ++i) { + cc_space_v_ovvo[i] = cc_space_v_ovvo_[i]; + } + cublasSetMatrix(lda, nV*nO, sizeof(float), cc_space_v_ovvo, lda, d_cc_space_v_ovvo, lda); + free(cc_space_v_ovvo); + + float* d_cc_space_v_ovov = NULL; + lda = nO*nV; + cudaStat = cudaMalloc((void **)&d_cc_space_v_ovov, nO*nV*nV*nO * sizeof(float)); + assert (cudaStat == cudaSuccess); + float* cc_space_v_ovov = malloc((size_t) nO*nV*nV*nO * sizeof(float)); + assert (cc_space_v_ovov != NULL); + for (size_t i=0 ; i<(size_t) nO*nV*nV*nO ; ++i) { + cc_space_v_ovov[i] = cc_space_v_ovov_[i]; + } + cublasSetMatrix(lda, nV*nO, sizeof(float), cc_space_v_ovov, lda, d_cc_space_v_ovov, lda); + free(cc_space_v_ovov); + + float* d_cc_space_v_ovoo = NULL; + lda = nO*nV; + cudaStat = cudaMalloc((void **)&d_cc_space_v_ovoo, nO*nV*nO*nO * sizeof(float)); + assert (cudaStat == cudaSuccess); + float* cc_space_v_ovoo = malloc((size_t) nO*nV*nO*nO * sizeof(float)); + assert (cc_space_v_ovoo != NULL); + for (size_t i=0 ; i<(size_t) nO*nV*nO*nO ; ++i) { + cc_space_v_ovoo[i] = cc_space_v_ovoo_[i]; + } + cublasSetMatrix(lda, nO*nO, sizeof(float), cc_space_v_ovoo, lda, d_cc_space_v_ovoo, lda); + free(cc_space_v_ovoo); + + float* d_cc_space_f_oo = NULL; + cudaStat = cudaMalloc((void**)&d_cc_space_f_oo, nO*nO*sizeof(float)); + assert (cudaStat == cudaSuccess); + float* cc_space_f_oo = malloc((size_t) nO*nO*sizeof(float)); + assert (cc_space_f_oo != NULL); + for (size_t i=0 ; i<(size_t) nO*nO; ++i) { + cc_space_f_oo[i] = cc_space_f_oo_[i]; + } + cublasSetMatrix(nO, nO, sizeof(float), cc_space_f_oo, nO, d_cc_space_f_oo, nO); + free(cc_space_f_oo); + + float* d_cc_space_f_vo = NULL; + + cudaStat = cudaMalloc((void**)&d_cc_space_f_vo, nV*nO*sizeof(float)); + assert (cudaStat == cudaSuccess); + float* cc_space_f_vo = malloc((size_t) nV*nO*sizeof(float)); + assert (cc_space_f_vo != NULL); + for (size_t i=0 ; i<(size_t) nV*nO; ++i) { + cc_space_f_vo[i] = cc_space_f_vo_[i]; + } + cublasSetMatrix(nV, nO, sizeof(float), cc_space_f_vo, nV, d_cc_space_f_vo, nV); + free(cc_space_f_vo); + + float* d_cc_space_f_ov = NULL; + + cudaStat = cudaMalloc((void**)&d_cc_space_f_ov, nV*nO*sizeof(float)); + assert (cudaStat == cudaSuccess); + float* cc_space_f_ov = malloc((size_t) nV*nO*sizeof(float)); + assert (cc_space_f_ov != NULL); + for (size_t i=0 ; i<(size_t) nV*nO; ++i) { + cc_space_f_ov[i] = cc_space_f_ov_[i]; + } + cublasSetMatrix(nO, nV, sizeof(float), cc_space_f_ov, nO, d_cc_space_f_ov, nO); + free(cc_space_f_ov); + + float* d_cc_space_f_vv = NULL; + + cudaStat = cudaMalloc((void**)&d_cc_space_f_vv, nV*nV*sizeof(float)); + assert (cudaStat == cudaSuccess); + float* cc_space_f_vv = malloc((size_t) nV*nV*sizeof(float)); + assert (cc_space_f_vv != NULL); + for (size_t i=0 ; i<(size_t) nV*nV; ++i) { + cc_space_f_vv[i] = cc_space_f_vv_[i]; + } + cublasSetMatrix(nV, nV, sizeof(float), cc_space_f_vv, nV, d_cc_space_f_vv, nV); + free(cc_space_f_vv); + + float* d_tau = NULL; + lda = nO * nO; + cudaStat = cudaMalloc((void **)&d_tau, lda * nV * nV * sizeof(float)); + assert (cudaStat == cudaSuccess); + + float* d_tau_x = NULL; + lda = nO * nO; + cudaStat = cudaMalloc((void **)&d_tau_x, lda * nV * nV * sizeof(float)); + assert (cudaStat == cudaSuccess); + + float* d_t1 = NULL; + cudaStat = cudaMalloc((void **)&d_t1, nO * nV * sizeof(float)); + assert (cudaStat == cudaSuccess); + + float* d_t2 = NULL; + cudaStat = cudaMalloc((void **)&d_t2, nO*nO*nV*nV * sizeof(float)); + assert (cudaStat == cudaSuccess); + + float* d_H_oo = NULL; + cudaStat = cudaMalloc((void **)&d_H_oo, nO * nO * sizeof(float)); + assert (cudaStat == cudaSuccess); + + float* d_H_vo = NULL; + cudaStat = cudaMalloc((void **)&d_H_vo, nV * nO * sizeof(float)); + assert (cudaStat == cudaSuccess); + + float* d_H_vv = NULL; + cudaStat = cudaMalloc((void **)&d_H_vv, nV * nV * sizeof(float)); + assert (cudaStat == cudaSuccess); + + data[igpu].cc_space_v_oo_chol = d_cc_space_v_oo_chol; + data[igpu].cc_space_v_ov_chol = d_cc_space_v_ov_chol; + data[igpu].cc_space_v_vo_chol = d_cc_space_v_vo_chol; + data[igpu].cc_space_v_vv_chol = d_cc_space_v_vv_chol; + data[igpu].cc_space_v_oooo = d_cc_space_v_oooo; + data[igpu].cc_space_v_vooo = d_cc_space_v_vooo; + data[igpu].cc_space_v_voov = d_cc_space_v_voov; + data[igpu].cc_space_v_oovv = d_cc_space_v_oovv; + data[igpu].cc_space_v_vvoo = d_cc_space_v_vvoo; + data[igpu].cc_space_v_oovo = d_cc_space_v_oovo; + data[igpu].cc_space_v_ovvo = d_cc_space_v_ovvo; + data[igpu].cc_space_v_ovov = d_cc_space_v_ovov; + data[igpu].cc_space_v_ovoo = d_cc_space_v_ovoo; + data[igpu].cc_space_f_oo = d_cc_space_f_oo; + data[igpu].cc_space_f_ov = d_cc_space_f_ov; + data[igpu].cc_space_f_vo = d_cc_space_f_vo; + data[igpu].cc_space_f_vv = d_cc_space_f_vv; + data[igpu].tau = d_tau; + data[igpu].tau_x = d_tau_x; + data[igpu].t1 = d_t1; + data[igpu].t2 = d_t2; + data[igpu].H_oo = d_H_oo; + data[igpu].H_vo = d_H_vo; + data[igpu].H_vv = d_H_vv; + + data[igpu].nO = nO; + data[igpu].nV = nV; + data[igpu].cholesky_mo_num = cholesky_mo_num; + + } + return data; +} + + +void gpu_deinit_sp(gpu_data_sp* data) +{ + int ngpus = 1; + if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus); + + #pragma omp parallel num_threads(ngpus) + { + size_t lda; + int igpu = omp_get_thread_num(); + cudaSetDevice(igpu); + + free(data[igpu].cc_space_v_oo_chol); + free(data[igpu].cc_space_v_ov_chol); + free(data[igpu].cc_space_v_vo_chol); + free(data[igpu].cc_space_v_vv_chol); + free(data[igpu].cc_space_v_oooo); + free(data[igpu].cc_space_v_vooo); + free(data[igpu].cc_space_v_voov); + free(data[igpu].cc_space_v_oovv); + free(data[igpu].cc_space_v_vvoo); + free(data[igpu].cc_space_v_oovo); + free(data[igpu].cc_space_v_ovvo); + free(data[igpu].cc_space_v_ovov); + free(data[igpu].cc_space_v_ovoo); + free(data[igpu].cc_space_f_oo); + free(data[igpu].cc_space_f_ov); + free(data[igpu].cc_space_f_vo); + free(data[igpu].cc_space_f_vv); + free(data[igpu].tau); + free(data[igpu].tau_x); + free(data[igpu].t1); + free(data[igpu].t2); + free(data[igpu].H_oo); + free(data[igpu].H_vo); + free(data[igpu].H_vv); + } +} + diff --git a/devel/ccsd_gpu/gpu_sp.c b/devel/ccsd_gpu/gpu_sp.c new file mode 100644 index 0000000..2dc84f3 --- /dev/null +++ b/devel/ccsd_gpu/gpu_sp.c @@ -0,0 +1,2134 @@ +#include +#include +#include +#include +#include +#include +#include +#include "gpu.h" + +void gpu_upload_sp(gpu_data_sp* data, + int nO, int nV, + double* t1, + double* t2) +{ + size_t lda; + + int ngpus = 1; + if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus); + + float* tau = malloc((size_t) nO*nO*nV*nV * sizeof(float)); + assert (tau != NULL); + + float* tau_x = malloc((size_t) nO*nO*nV*nV * sizeof(float)); + assert (tau_x != NULL); + + #pragma omp parallel num_threads(ngpus) + { + cudaError_t cudaStat = cudaSuccess; + float* AA; + size_t igpu = omp_get_thread_num(); + cudaSetDevice(igpu); + + float* d_t1 = data[igpu].t1; + lda = nO; + AA = malloc((size_t) nO*nV*sizeof(float)); + assert (AA != NULL); + for (size_t i=0 ; i<(size_t) nO*nV ; ++i) { + AA[i] = t1[i]; + } + cudaStat = cublasSetMatrix(nO, nV, sizeof(float), AA, lda, d_t1, lda); + assert (cudaStat == cudaSuccess); + free(AA); + + float* d_t2 = data[igpu].t2; + lda = nO*nO; + AA = malloc((size_t) nO*nO*nV*nV*sizeof(float)); + assert (AA != NULL); + for (size_t i=0 ; i<(size_t) nO*nO*nV*nV ; ++i) { + AA[i] = t2[i]; + } + cudaStat = cublasSetMatrix(nO*nO, nV*nV, sizeof(float), AA, lda, d_t2, lda); + assert (cudaStat == cudaSuccess); + free(AA); + + size_t lda, ldb, ldc; + float alpha, beta; + float* A; + float* B; + float* C; + + cublasHandle_t handle; + cublasCreate(&handle); + cudaStream_t stream[nV]; + + float* d_tau = data[igpu].tau; + + float* d_tau_x = data[igpu].tau_x; + lda = nO * nO; + cublasSetMatrix(nO*nO, nV*nV, sizeof(float), tau_x, lda, d_tau_x, lda); + + if (igpu == 0) { + + for (size_t i=0 ; i 0) { + lda = nO * nO; + cublasSetMatrix(nO*nO, nV*nV, sizeof(float), tau, lda, d_tau, lda); + cublasSetMatrix(nO*nO, nV*nV, sizeof(float), tau_x, lda, d_tau_x, lda); + } + cublasDestroy(handle); + } + free(tau); + free(tau_x); +} + + + + +void compute_h_oo_chol_gpu_sp(gpu_data_sp* data, int igpu) +{ + cudaError_t cudaStat; + int ngpus = 1; + if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus); + igpu = igpu % ngpus; + + const size_t cholesky_mo_num = data[igpu].cholesky_mo_num; + const size_t nO = data[igpu].nO; + const size_t nV = data[igpu].nV; + cudaSetDevice(igpu); + + size_t m,n,k, lda, ldb, ldc; + float alpha, beta; + float* A; + float* B; + float* C; + cudaStream_t stream[nV]; + + cublasHandle_t handle; + cublasCreate(&handle); + + float* d_H_oo = data[igpu].H_oo; + float* d_tau_x = data[igpu].tau_x; + float* d_cc_space_f_oo = data[igpu].cc_space_f_oo; + float* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol; + float* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol; + + float* d_tau_kau; + cudaStat = cudaMalloc((void **)&d_tau_kau, cholesky_mo_num*nV*nO * sizeof(float)); + assert(cudaStat == cudaSuccess); + + float* d_tmp_ovv; + cudaStat = cudaMalloc((void **)&d_tmp_ovv, nO*nV*nV * sizeof(float)); + assert(cudaStat == cudaSuccess); + + float* d_tmp_vov; + cudaStat = cudaMalloc((void **)&d_tmp_vov, nV*nO*nV * sizeof(float)); + assert(cudaStat == cudaSuccess); + + for (size_t i=0 ; icholesky_mo_num; + + int ngpus = 1; + if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus); + + float* J1 = malloc((size_t) nO*nV*nV*nO*sizeof(float)); + assert (J1 != NULL); + + float* K1 = malloc((size_t) nO*nV*nV*nO*sizeof(float)); + assert (K1 != NULL); + + #pragma omp parallel num_threads(ngpus) + { + cudaError_t cudaStat; + size_t m,n,k, lda, ldb, ldc; + float alpha, beta; + float* A; + float* B; + float* C; + cudaStream_t stream[nV]; + + size_t igpu = omp_get_thread_num(); + cudaSetDevice(igpu); + + cublasHandle_t handle; + cublasCreate(&handle); + + float* d_r2; + lda = nO * nO; + cudaStat = cudaMalloc((void **)&d_r2, lda * nV * nV * sizeof(float)); + assert(cudaStat == cudaSuccess); + cudaMemset(d_r2, 0, nO*nO*nV*nV*sizeof(float)); + memset(r2, 0, nO*nO*nV*nV*sizeof(double)); + + float* d_cc_space_v_oo_chol = data[igpu].cc_space_v_oo_chol; + float* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol; + float* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol; + float* d_cc_space_v_vv_chol = data[igpu].cc_space_v_vv_chol; + float* d_cc_space_v_oooo = data[igpu].cc_space_v_oooo; + float* d_cc_space_v_vooo = data[igpu].cc_space_v_vooo; + float* d_cc_space_v_oovv = data[igpu].cc_space_v_oovv; + float* d_cc_space_v_vvoo = data[igpu].cc_space_v_vvoo; + float* d_cc_space_v_oovo = data[igpu].cc_space_v_oovo; + float* d_cc_space_v_ovvo = data[igpu].cc_space_v_ovvo; + float* d_cc_space_v_ovov = data[igpu].cc_space_v_ovov; + float* d_cc_space_v_ovoo = data[igpu].cc_space_v_ovoo; + float* d_cc_space_f_vo = data[igpu].cc_space_f_vo; + float* d_tau = data[igpu].tau; + float* d_t1 = data[igpu].t1; + float* d_t2 = data[igpu].t2; + float* d_H_oo = data[igpu].H_oo; + float* d_H_vv = data[igpu].H_vv; + + float* d_K1; + cudaStat = cudaMalloc((void **)&d_K1, nO*nV*nO*nV * sizeof(float)); + assert(cudaStat == cudaSuccess); + + #pragma omp sections + { + + #pragma omp section + { + float* d_J1; + cudaStat = cudaMalloc((void **)&d_J1, nO*nV*nV*nO * sizeof(float)); + assert(cudaStat == cudaSuccess); + + alpha = 1.0; + beta = 0.0; + A = d_cc_space_v_ovvo; lda = nO*nV; + B = d_cc_space_v_ovvo; ldb = nO*nV; + C = d_J1; ldc = nO*nV; + cublasSgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO*nV, nV*nO, &alpha, A, lda, &beta, B, ldb, C, ldc); + + + float* d_X_ovoo; + cudaStat = cudaMalloc((void **)&d_X_ovoo, nO*nV*nO*nO * sizeof(float)); + assert(cudaStat == cudaSuccess); + alpha = 0.0; + beta = 1.0; + for (size_t i=0 ; i 0. ? r2[i] : -r2[i]; + *max_r2 = *max_r2 > x ? *max_r2 : x; + } + +} + + + + + + +void compute_r1_space_chol_gpu_sp(gpu_data_sp* data, int nO, int nV, double* t1, double* r1, double* max_r1) +{ + const size_t cholesky_mo_num = data->cholesky_mo_num; + + int ngpus = 1; + if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus); + + #pragma omp parallel num_threads(ngpus) + { + cudaError_t cudaStat; + size_t m,n,k, lda, ldb, ldc; + float alpha, beta; + float* A; + float* B; + float* C; + cudaStream_t stream[nV]; + + size_t igpu = omp_get_thread_num(); + cudaSetDevice(igpu); + + cublasHandle_t handle; + cublasCreate(&handle); + + float* d_r1; + lda = nO ; + cudaStat = cudaMalloc((void **)&d_r1, lda * nV * sizeof(float)); + assert(cudaStat == cudaSuccess); + cudaMemset(d_r1, 0, nO*nV*sizeof(float)); + memset(r1, 0, nO*nV*sizeof(double)); + + float* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol; + float* d_cc_space_v_vv_chol = data[igpu].cc_space_v_vv_chol; + float* d_cc_space_v_oovo = data[igpu].cc_space_v_oovo; + float* d_cc_space_v_ovov = data[igpu].cc_space_v_ovov; + float* d_cc_space_v_voov = data[igpu].cc_space_v_voov; + float* d_cc_space_f_ov = data[igpu].cc_space_f_ov; + float* d_cc_space_f_vo = data[igpu].cc_space_f_vo; + float* d_tau = data[igpu].tau; + float* d_t1 = data[igpu].t1; + float* d_t2 = data[igpu].t2; + float* d_H_oo = data[igpu].H_oo; + float* d_H_vo = data[igpu].H_vo; + float* d_H_vv = data[igpu].H_vv; + + #pragma omp sections + { + + #pragma omp section + { + cublasScopy(handle, nO*nV, d_cc_space_f_ov, 1, d_r1, 1); + + float* d_X_oo; + cudaStat = cudaMalloc((void **)&d_X_oo, nO*nO * sizeof(float)); + assert(cudaStat == cudaSuccess); + + alpha = -2.0; + beta = 0.0; + m=nO; n=nO; k=nV; + A=d_t1; lda=nO; + B=d_cc_space_f_vo; ldb=nV; + C=d_X_oo; ldc=nO; + cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc); + + alpha = 1.0; + beta = 1.0; + m=nO; n=nV; k=nO; + A=d_X_oo; lda=nO; + B=d_t1; ldb=nO; + C=d_r1; ldc=nO; + cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc); + + cudaFree(d_X_oo); + } + + #pragma omp section + { + alpha = 1.0; + beta = 1.0; + m=nO; n=nV; k=nV; + A=d_t1; lda=nO; + B=d_H_vv; ldb=nV; + C=d_r1; ldc=nO; + cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc); + } + + #pragma omp section + { + alpha = -1.0; + beta = 1.0; + m=nO; n=nV; k=nO; + A=d_H_oo; lda=nO; + B=d_t1; ldb=nO; + C=d_r1; ldc=nO; + cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc); + } + + #pragma omp section + { + float* d_X_voov; + cudaStat = cudaMalloc((void **)&d_X_voov, nV* nO* nO* nV * sizeof(float)); + assert(cudaStat == cudaSuccess); + + for (size_t i=0 ; i= nV) kk = 0; + A = &(d_W_vvov_tmp[nV*(i+nO*nV*bet)]); lda = nV*nO; + B = &(d_W_vvov_tmp[nV*(i+nO*nV*bet)]); ldb = nV*nO; + C = &(d_W_vvov[nV*nV*(i+nO*bet)]); ldc = nV; + cublasSgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nV, nV, &alpha, A, lda, &beta, B, ldb, C, ldc); + } + } + for (size_t i=0 ; i 0. ? r1[i] : -r1[i]; + *max_r1 = *max_r1 > x ? *max_r1 : x; + } +} + + + +double ccsd_energy_space_gpu_sp(gpu_data_sp* data) +{ + double result = 0.0; + + const size_t nO = data->nO; + const size_t nV = data->nV; + + int ngpus = 1; + if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus); + + #pragma omp parallel num_threads(ngpus) + { + cudaError_t cudaStat; + size_t igpu = omp_get_thread_num(); + cudaSetDevice(igpu); + + cublasHandle_t handle; + cublasCreate(&handle); + + double result_local = 0.0; + #pragma omp sections + { + #pragma omp section + { + float* d_cc_space_f_ov = data[igpu].cc_space_f_ov; + float* d_t1 = data[igpu].t1; + float x; + cublasSdot(handle, nO*nV, d_cc_space_f_ov, 1, d_t1, 1, &x); + result_local += 2.0*x; + } + #pragma omp section + { + float* d_tau_x = data[igpu].tau_x; + float* d_cc_space_v_oovv = data[igpu].cc_space_v_oovv; + float x; + cublasSdot(handle, nO*nO*nV*nV, d_tau_x, 1, d_cc_space_v_oovv, 1, &x); + result_local += x; + } + } + cublasDestroy(handle); + #pragma omp critical + { + result += result_local; + } + } + return result; +}