#include #include /* #include #include #include #include */ #define BLOCK_SIZE 16 void dgemm_(char*, char*, int*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int*); void* compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo_num, double* t1, double* tau, double* cc_space_v_vo_chol, double* cc_space_v_vv_chol, double* r2) { int m,n,k, lda, ldb, ldc; double alpha, beta; double* A; double* B; double* C; double* tmp_cc = malloc(cholesky_mo_num*nV*nV*sizeof(double)); m=cholesky_mo_num*nV; n=nV; k=nO; alpha=1.0; beta=0.0; lda=m ; ldb=k ; ldc=m; A=cc_space_v_vo_chol; B=t1; C=tmp_cc; dgemm_("N","N", &m, &n, &k, &alpha, A, &lda, B, &ldb, &beta, C, &ldc); #pragma omp parallel { double* tmp_cc2 = malloc(cholesky_mo_num*nV*sizeof(double)); double* B1 = malloc(nV*nV*BLOCK_SIZE*sizeof(double)); double* tmpB1 = malloc(nV*BLOCK_SIZE*nV*sizeof(double)); #pragma omp for for (size_t gam=0 ; gam