#include #include #include #include #define BLOCK_SIZE 16 void dgemm_(char*, char*, int*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int*); void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo_num, double* t1, double* tau, double* cc_space_v_vo_chol, double* cc_space_v_vv_chol, double* r2) { int m,n,k, lda, ldb, ldc; double alpha, beta; double* A; double* B; double* C; double* tmp_cc = malloc(cholesky_mo_num*nV*nV*sizeof(double)); cublasHandle_t handle; cublasCreate(&handle); double* d_tau; lda = nO * nO; cudaMalloc((void **)&d_tau, lda * nV * nV * sizeof(double)); cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau, lda, d_tau, lda); double* d_r2; lda = nO * nO; cudaMalloc((void **)&d_r2, lda * nV * nV * sizeof(double)); cublasSetMatrix(nO*nO, nV*nV, sizeof(double), r2, lda, d_r2, lda); double* d_cc_space_v_vv_chol; lda = cholesky_mo_num * nV; cudaMalloc((void **)&d_cc_space_v_vv_chol, lda * nV * sizeof(double)); cublasSetMatrix(cholesky_mo_num*nV, nV, sizeof(double), cc_space_v_vv_chol, lda, d_cc_space_v_vv_chol, lda); double* d_cc_space_v_vo_chol; lda = cholesky_mo_num * nV; cudaMalloc((void **)&d_cc_space_v_vo_chol, lda * nO * sizeof(double)); cublasSetMatrix(cholesky_mo_num*nV, nO, sizeof(double), cc_space_v_vo_chol, lda, d_cc_space_v_vo_chol, lda); double* d_t1; lda = nO; cudaMalloc((void **)&d_t1, nO * nV * sizeof(double)); cublasSetMatrix(nO, nV, sizeof(double), t1, lda, d_t1, lda); double* d_tmp_cc; lda = cholesky_mo_num * nV; cudaMalloc((void **)&d_tmp_cc, lda * nV * sizeof(double)); alpha=1.0; beta=0.0; m=cholesky_mo_num*nV; n=nV; k=nO; A = d_cc_space_v_vo_chol; B = d_t1; C = d_tmp_cc; cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, m, B, k, &beta, C, m); cublasGetMatrix(m, n, sizeof(double), d_tmp_cc, lda, tmp_cc, lda); // --- /* m=cholesky_mo_num*nV; n=nV; k=nO; alpha=1.0; beta=0.0; lda=m ; ldb=k ; ldc=m; A=cc_space_v_vo_chol; B=t1; C=tmp_cc; dgemm_("N","N", &m, &n, &k, &alpha, A, &lda, B, &ldb, &beta, C, &ldc); */ #pragma omp parallel { double* tmp_cc2 = malloc(cholesky_mo_num*nV*sizeof(double)); double* d_tmp_cc2; cudaMalloc((void **)&d_tmp_cc2, cholesky_mo_num*nV*sizeof(double)); double* B1 = malloc(nV*nV*BLOCK_SIZE*sizeof(double)); double* d_B1; cudaMalloc((void**)&d_B1, nV*nV*BLOCK_SIZE*sizeof(double)); double* tmpB1 = malloc(nV*BLOCK_SIZE*nV*sizeof(double)); double* d_tmpB1; cudaMalloc((void**)&d_tmpB1, nV*BLOCK_SIZE*nV*sizeof(double)); #pragma omp for for (size_t gam=0 ; gam