#include #include #include #include #include #include #define BLOCK_SIZE 16 void dgemm_(char*, char*, int*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int*); void gpu_dgemm(char transa, char transb, int m, int n, int k, double alpha, double* A, int lda, double* B, int ldb, double beta, double* C, int ldc) { cublasHandle_t handle; cublasCreate(&handle); double * d_A; double * d_B; double * d_C; cublasOperation_t ta, tb; if (transa == 'N') { cudaMalloc((void**)&d_A, lda*k*sizeof(double)); cublasSetMatrix(m, k, sizeof(double), A, lda, d_A, lda); ta = CUBLAS_OP_N; } else { cudaMalloc((void**)&d_A, lda*m*sizeof(double)); cublasSetMatrix(k, m, sizeof(double), A, lda, d_A, lda); ta = CUBLAS_OP_T; } if (transb == 'N') { cudaMalloc((void**)&d_B, ldb*n*sizeof(double)); cublasSetMatrix(k, n, sizeof(double), B, ldb, d_B, ldb); tb = CUBLAS_OP_N; } else { cudaMalloc((void**)&d_B, ldb*k*sizeof(double)); cublasSetMatrix(n, k, sizeof(double), B, ldb, d_B, ldb); tb = CUBLAS_OP_T; } cudaMalloc((void**)&d_C, ldc*n*sizeof(double)); if (beta != 0.) { cublasSetMatrix(m, n, sizeof(double), C, ldc, d_C, ldc); } cublasDgemm(handle, ta, tb, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, ldc); cublasGetMatrix(m, n, sizeof(double), d_C, ldc, C, ldc); cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); cublasDestroy(handle); } void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo_num, double* t1, double* t2, double* tau, double* cc_space_v_oo_chol, double* cc_space_v_ov_chol, double* cc_space_v_vo_chol, double* cc_space_v_vv_chol, double* cc_space_v_oooo, double* cc_space_v_vooo, double* cc_space_v_oovv, double* cc_space_v_vvoo, double* cc_space_v_oovo, double* cc_space_v_ovvo, double* cc_space_f_vo, double* H_vv, double* g_occ, double* r2) { int ngpus; cudaGetDeviceCount(&ngpus); #pragma omp parallel num_threads(ngpus) { int m,n,k, lda, ldb, ldc; double alpha, beta; double* A; double* B; double* C; int ithread = omp_get_thread_num(); int igpu = ithread ; //igpu=1; cudaSetDevice(igpu); cublasHandle_t handle; cublasCreate(&handle); double* d_tau; double* d_r2; double* d_cc_space_v_oo_chol; double* d_cc_space_v_ov_chol; double* d_cc_space_v_vo_chol; double* d_cc_space_v_vv_chol; double* d_t1; double* d_t2; double* d_tmp_cc; lda = nO * nO; cudaMalloc((void **)&d_tau, lda * nV * nV * sizeof(double)); cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau, lda, d_tau, lda); lda = nO * nO; cudaMalloc((void **)&d_r2, lda * nV * nV * sizeof(double)); memset(r2, 0, nO*nO*nV*nV*sizeof(double)); cublasSetMatrix(nO*nO, nV*nV, sizeof(double), r2, lda, d_r2, lda); lda = cholesky_mo_num * nV; cudaMalloc((void **)&d_cc_space_v_vv_chol, lda * nV * sizeof(double)); cublasSetMatrix(cholesky_mo_num*nV, nV, sizeof(double), cc_space_v_vv_chol, lda, d_cc_space_v_vv_chol, lda); lda = cholesky_mo_num * nO; cudaMalloc((void **)&d_cc_space_v_oo_chol, lda * nO * sizeof(double)); cublasSetMatrix(cholesky_mo_num*nO, nO, sizeof(double), cc_space_v_oo_chol, lda, d_cc_space_v_oo_chol, lda); lda = cholesky_mo_num * nO; cudaMalloc((void **)&d_cc_space_v_ov_chol, lda * nV * sizeof(double)); cublasSetMatrix(cholesky_mo_num*nO, nV, sizeof(double), cc_space_v_ov_chol, lda, d_cc_space_v_ov_chol, lda); lda = cholesky_mo_num * nV; cudaMalloc((void **)&d_cc_space_v_vo_chol, lda * nO * sizeof(double)); cublasSetMatrix(cholesky_mo_num*nV, nO, sizeof(double), cc_space_v_vo_chol, lda, d_cc_space_v_vo_chol, lda); lda = nO; cudaMalloc((void **)&d_t1, nO * nV * sizeof(double)); cublasSetMatrix(nO, nV, sizeof(double), t1, lda, d_t1, lda); lda = nO*nO; cudaMalloc((void **)&d_t2, nO*nO*nV*nV * sizeof(double)); cublasSetMatrix(nO*nO, nV*nV, sizeof(double), t2, lda, d_t2, lda); double* d_cc_space_f_vo; cudaMalloc((void**)&d_cc_space_f_vo, nV*nO*sizeof(double)); cublasSetMatrix(nV, nO, sizeof(double), cc_space_f_vo, nV, d_cc_space_f_vo, nV); #pragma omp sections { #pragma omp section { double* d_cc_space_v_vooo; cudaMalloc((void**)&d_cc_space_v_vooo, nV*nO*nO*nO*sizeof(double)); cublasSetMatrix(nV*nO, nO*nO, sizeof(double), cc_space_v_vooo, nV*nO, d_cc_space_v_vooo, nV*nO); double* d_Y_oooo; cudaMalloc((void**)&d_Y_oooo, nO*nO*nO*nO*sizeof(double)); alpha = 1.0; beta = 0.0; m=nO ; n=nO*nO*nO; k=nV; A = d_t1 ; lda = nO; B = d_cc_space_v_vooo ; ldb = nV; C = d_Y_oooo; ldc = nO; cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc); cudaFree(d_cc_space_v_vooo); double* d_A1; cudaMalloc((void**)&d_A1, nO*nO*nO*nO*sizeof(double)); double* d_cc_space_v_oooo; cudaMalloc((void**)&d_cc_space_v_oooo, nO*nO*nO*nO*sizeof(double)); cublasSetMatrix(nO*nO, nO*nO, sizeof(double), cc_space_v_oooo, nO*nO, d_cc_space_v_oooo, nO*nO); alpha = 1.0; beta = 1.0; A = d_cc_space_v_oooo; lda = nO*nO; B = d_Y_oooo; ldb = nO*nO; C = d_A1; ldc = nO*nO; cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO*nO, nO*nO, &alpha, A, lda, &beta, B, ldb, C, ldc); for (int j=0 ; j