#include #include #include #include #include #include #include "gpu.h" #define BLOCK_SIZE 16 void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo_num, gpu_data* data, double* t1, double* t2, double* tau, double* H_vv, double* g_occ, double* r2) { int ngpus = 1; cudaGetDeviceCount(&ngpus); double* J1 = malloc(nO*nV*nV*nO*sizeof(double)); double* K1 = malloc(nO*nV*nV*nO*sizeof(double)); #pragma omp parallel num_threads(ngpus) { int m,n,k, lda, ldb, ldc; double alpha, beta; double* A; double* B; double* C; int igpu = omp_get_thread_num(); cudaSetDevice(igpu); cublasHandle_t handle; cublasCreate(&handle); cudaStream_t stream[nV]; double* d_cc_space_v_oo_chol = data[igpu].cc_space_v_oo_chol; double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol; double* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol; double* d_cc_space_v_vv_chol = data[igpu].cc_space_v_vv_chol; double* d_cc_space_v_oooo = data[igpu].cc_space_v_oooo; double* d_cc_space_v_vooo = data[igpu].cc_space_v_vooo; double* d_cc_space_v_oovv = data[igpu].cc_space_v_oovv; double* d_cc_space_v_vvoo = data[igpu].cc_space_v_vvoo; double* d_cc_space_v_oovo = data[igpu].cc_space_v_oovo; double* d_cc_space_v_ovvo = data[igpu].cc_space_v_ovvo; double* d_cc_space_v_ovov = data[igpu].cc_space_v_ovov; double* d_cc_space_v_ovoo = data[igpu].cc_space_v_ovoo; double* d_cc_space_f_vo = data[igpu].cc_space_f_vo; double* d_tau; double* d_r2; double* d_t1; double* d_t2; double* d_tmp_cc; double* d_K1; cudaMalloc((void **)&d_K1, nO*nV*nO*nV * sizeof(double)); lda = nO * nO; cudaMalloc((void **)&d_tau, lda * nV * nV * sizeof(double)); cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau, lda, d_tau, lda); lda = nO * nO; cudaMalloc((void **)&d_r2, lda * nV * nV * sizeof(double)); memset(r2, 0, nO*nO*nV*nV*sizeof(double)); cublasSetMatrix(nO*nO, nV*nV, sizeof(double), r2, lda, d_r2, lda); lda = nO; cudaMalloc((void **)&d_t1, nO * nV * sizeof(double)); cublasSetMatrix(nO, nV, sizeof(double), t1, lda, d_t1, lda); lda = nO*nO; cudaMalloc((void **)&d_t2, nO*nO*nV*nV * sizeof(double)); cublasSetMatrix(nO*nO, nV*nV, sizeof(double), t2, lda, d_t2, lda); #pragma omp sections { #pragma omp section { double* d_J1; cudaMalloc((void **)&d_J1, nO*nV*nV*nO * sizeof(double)); alpha = 1.0; beta = 0.0; A = d_cc_space_v_ovvo; lda = nO*nV; B = d_cc_space_v_ovvo; ldb = nO*nV; C = d_J1; ldc = nO*nV; cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO*nV, nV*nO, &alpha, A, lda, &beta, B, ldb, C, ldc); double* d_X_ovoo; cudaMalloc((void **)&d_X_ovoo, nO*nV*nO*nO * sizeof(double)); alpha = 0.0; beta = 1.0; for (int i=0 ; i