1
0
mirror of https://gitlab.com/scemama/qp_plugins_scemama.git synced 2024-06-02 11:25:23 +02:00

Introduced cuda streams

This commit is contained in:
Anthony Scemama 2023-08-02 18:31:44 +02:00
parent 5da8e2cba4
commit b376fe685c

View File

@ -38,6 +38,8 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
cublasCreate(&handle); cublasCreate(&handle);
cudaStream_t stream[nV];
double* d_cc_space_v_oo_chol = data[igpu].cc_space_v_oo_chol; double* d_cc_space_v_oo_chol = data[igpu].cc_space_v_oo_chol;
double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol; double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;
double* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol; double* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol;
@ -74,6 +76,7 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
cudaMalloc((void **)&d_t2, nO*nO*nV*nV * sizeof(double)); cudaMalloc((void **)&d_t2, nO*nO*nV*nV * sizeof(double));
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), t2, lda, d_t2, lda); cublasSetMatrix(nO*nO, nV*nV, sizeof(double), t2, lda, d_t2, lda);
#pragma omp sections #pragma omp sections
{ {
@ -99,8 +102,12 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
B = d_Y_oooo; ldb = nO*nO; B = d_Y_oooo; ldb = nO*nO;
C = d_A1; ldc = nO*nO; C = d_A1; ldc = nO*nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO*nO, nO*nO, &alpha, A, lda, &beta, B, ldb, C, ldc); cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO*nO, nO*nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
for (int i=0 ; i<nO ; ++i) {
cudaStreamCreate(&(stream[i]));
}
for (int j=0 ; j<nO ; ++j) { for (int j=0 ; j<nO ; ++j) {
for (int i=0 ; i<nO ; ++i) { for (int i=0 ; i<nO ; ++i) {
cublasSetStream(handle, stream[i]);
alpha = 1.0; alpha = 1.0;
beta = 1.0; beta = 1.0;
A = &(d_A1[nO*nO*(i+nO*j)]); lda = nO; A = &(d_A1[nO*nO*(i+nO*j)]); lda = nO;
@ -110,6 +117,10 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
} }
} }
for (int i=0 ; i<nO ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
cudaFree(d_Y_oooo); cudaFree(d_Y_oooo);
alpha = 1.0; alpha = 1.0;
@ -177,6 +188,10 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
double* d_tmp_vo2; double* d_tmp_vo2;
cudaMalloc((void**)&d_tmp_vo2, cholesky_mo_num*nV*nO*sizeof(double)); cudaMalloc((void**)&d_tmp_vo2, cholesky_mo_num*nV*nO*sizeof(double));
for (int i=0 ; i<nO ; ++i) { for (int i=0 ; i<nO ; ++i) {
cudaStreamCreate(&(stream[i]));
}
for (int i=0 ; i<nO ; ++i) {
cublasSetStream(handle, stream[i]);
alpha = -1.0; alpha = -1.0;
beta = 0.0; beta = 0.0;
A = &(d_tmp_vo[cholesky_mo_num*nV*i]); lda = cholesky_mo_num; A = &(d_tmp_vo[cholesky_mo_num*nV*i]); lda = cholesky_mo_num;
@ -184,6 +199,10 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
C = &(d_tmp_vo2[cholesky_mo_num*i]); ldc = cholesky_mo_num*nO; C = &(d_tmp_vo2[cholesky_mo_num*i]); ldc = cholesky_mo_num*nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, cholesky_mo_num, nV, &alpha, A, lda, &beta, B, ldb, C, ldc); cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, cholesky_mo_num, nV, &alpha, A, lda, &beta, B, ldb, C, ldc);
} }
for (int i=0 ; i<nO ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
cudaFree(d_tmp_vo); cudaFree(d_tmp_vo);
alpha = 1.0; alpha = 1.0;
@ -213,6 +232,7 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
C = d_r2; ldc = nO*nO; C = d_r2; ldc = nO*nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO*nO, nV*nV, &alpha, A, lda, &beta, B, ldb, C, ldc); cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO*nO, nV*nV, &alpha, A, lda, &beta, B, ldb, C, ldc);
/*
double * Y_oovv = malloc(nO*nO*nV*nV*sizeof(double)); double * Y_oovv = malloc(nO*nO*nV*nV*sizeof(double));
lda=nO*nO; lda=nO*nO;
cublasGetMatrix(nO*nO, nV*nV, sizeof(double), d_Y_oovv, lda, Y_oovv, lda); cublasGetMatrix(nO*nO, nV*nV, sizeof(double), d_Y_oovv, lda, Y_oovv, lda);
@ -239,9 +259,15 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), r2_tmp, lda, d_r2, lda); cublasSetMatrix(nO*nO, nV*nV, sizeof(double), r2_tmp, lda, d_r2, lda);
free(r2_tmp); free(r2_tmp);
/* */
//--
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
for (int j=0 ; j<nV ; ++j) { for (int j=0 ; j<nV ; ++j) {
for (int i=0 ; i<nV ; ++i) { for (int i=0 ; i<nV ; ++i) {
cublasSetStream(handle, stream[i]);
alpha = 1.0; alpha = 1.0;
beta = 1.0; beta = 1.0;
A = &(d_r2[nO*nO*(i+nV*j)]); lda = nO; A = &(d_r2[nO*nO*(i+nV*j)]); lda = nO;
@ -251,8 +277,12 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
} }
} }
for (int i=0 ; i<nV ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
cudaFree(d_Y_oovv); cudaFree(d_Y_oovv);
*/ //--
} }
// g_occ // g_occ
@ -291,6 +321,7 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
C = d_r2; ldc = nO*nO; C = d_r2; ldc = nO*nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO*nO, nV*nV, &alpha, A, lda, &beta, B, ldb, C, ldc); cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO*nO, nV*nV, &alpha, A, lda, &beta, B, ldb, C, ldc);
/*
double * X_oovv = malloc(nO*nO*nV*nV*sizeof(double)); double * X_oovv = malloc(nO*nO*nV*nV*sizeof(double));
lda=nO*nO; lda=nO*nO;
cublasGetMatrix(nO*nO, nV*nV, sizeof(double), d_X_oovv, lda, X_oovv, lda); cublasGetMatrix(nO*nO, nV*nV, sizeof(double), d_X_oovv, lda, X_oovv, lda);
@ -316,10 +347,15 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
lda=nO*nO; lda=nO*nO;
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), r2_tmp, lda, d_r2, lda); cublasSetMatrix(nO*nO, nV*nV, sizeof(double), r2_tmp, lda, d_r2, lda);
free(r2_tmp); free(r2_tmp);
*/
/* //--
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
for (int j=0 ; j<nV ; ++j) { for (int j=0 ; j<nV ; ++j) {
for (int i=0 ; i<nV ; ++i) { for (int i=0 ; i<nV ; ++i) {
cublasSetStream(handle, stream[i]);
alpha = 1.0; alpha = 1.0;
beta = -1.0; beta = -1.0;
A = &(d_r2[nO*nO*(i+nV*j)]); lda = nO; A = &(d_r2[nO*nO*(i+nV*j)]); lda = nO;
@ -328,8 +364,13 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc); cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
} }
} }
for (int i=0 ; i<nV ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
cudaFree(d_X_oovv); cudaFree(d_X_oovv);
*/ //--
} }
#pragma omp section #pragma omp section
@ -345,6 +386,10 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
for (int iblock=0 ; iblock<nV ; iblock += BLOCK_SIZE) { for (int iblock=0 ; iblock<nV ; iblock += BLOCK_SIZE) {
int mbs = nV < iblock+BLOCK_SIZE ? nV : iblock+BLOCK_SIZE; int mbs = nV < iblock+BLOCK_SIZE ? nV : iblock+BLOCK_SIZE;
for (int gam=iblock ; gam<mbs ; ++gam) { for (int gam=iblock ; gam<mbs ; ++gam) {
cudaStreamCreate(&(stream[gam]));
}
for (int gam=iblock ; gam<mbs ; ++gam) {
cublasSetStream(handle, stream[gam]);
alpha = 1.0; alpha = 1.0;
beta = 0.0; beta = 0.0;
m=nV; n=nO*nV; k=cholesky_mo_num; m=nV; n=nO*nV; k=cholesky_mo_num;
@ -353,6 +398,10 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
C=&(d_X_vovv[nV*nO*nV*(gam-iblock)]); ldc=nV; C=&(d_X_vovv[nV*nO*nV*(gam-iblock)]); ldc=nV;
cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc); cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
} }
for (int gam=iblock ; gam<mbs ; ++gam) {
cudaStreamDestroy(stream[gam]);
}
cublasSetStream(handle, NULL);
mbs = BLOCK_SIZE < nV-iblock ? BLOCK_SIZE : nV-iblock; mbs = BLOCK_SIZE < nV-iblock ? BLOCK_SIZE : nV-iblock;
alpha = 1.0; alpha = 1.0;
beta = 0.0; beta = 0.0;
@ -363,20 +412,31 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc); cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
} }
cudaFree(d_X_vovv); cudaFree(d_X_vovv);
alpha = 1.0;
beta = 1.0;
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
for (int j=0 ; j<nV ; ++j) { for (int j=0 ; j<nV ; ++j) {
for (int i=0 ; i<nV ; ++i) { for (int i=0 ; i<nV ; ++i) {
alpha = 1.0; cublasSetStream(handle, stream[i]);
beta = 1.0;
A = &(d_r2[nO*nO*(i+nV*j)]); lda = nO; A = &(d_r2[nO*nO*(i+nV*j)]); lda = nO;
B = &(d_Y_oovv[nO*nO*(i+nV*j)]); ldb = nO; B = &(d_Y_oovv[nO*nO*(i+nV*j)]); ldb = nO;
C = &(d_r2[nO*nO*(i+nV*j)]); ldc = nO; C = &(d_r2[nO*nO*(i+nV*j)]); ldc = nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc); cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
for (int i=0 ; i<nV ; ++i) {
cublasSetStream(handle, stream[i]);
A = &(d_r2[nO*nO*(i+nV*j)]); lda = nO; A = &(d_r2[nO*nO*(i+nV*j)]); lda = nO;
B = &(d_Y_oovv[nO*nO*(j+nV*i)]); ldb = nO; B = &(d_Y_oovv[nO*nO*(j+nV*i)]); ldb = nO;
C = &(d_r2[nO*nO*(i+nV*j)]); ldc = nO; C = &(d_r2[nO*nO*(i+nV*j)]); ldc = nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc); cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
} }
} }
for (int i=0 ; i<nV ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
cudaFree(d_Y_oovv); cudaFree(d_Y_oovv);
} }
@ -420,19 +480,31 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
cudaFree(d_tcc); cudaFree(d_tcc);
cudaFree(d_tcc2); cudaFree(d_tcc2);
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
alpha = 1.0;
beta = -1.0;
for(int gam = 0; gam < nV; gam++){ for(int gam = 0; gam < nV; gam++){
for(int bet = 0; bet < nV; bet++){ for(int bet = 0; bet < nV; bet++){
alpha = 1.0; cublasSetStream(handle, stream[bet]);
beta = -1.0;
A = &(d_r2[nO*nO*(bet+nV*gam)]); lda = nO; A = &(d_r2[nO*nO*(bet+nV*gam)]); lda = nO;
B = &(d_X_ovvo[nO*(bet+nV*gam)]); ldb = nO*nV*nV; B = &(d_X_ovvo[nO*(bet+nV*gam)]); ldb = nO*nV*nV;
C = &(d_r2[nO*nO*(bet+nV*gam)]); ldc = nO; C = &(d_r2[nO*nO*(bet+nV*gam)]); ldc = nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc); cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
for(int bet = 0; bet < nV; bet++){
cublasSetStream(handle, stream[bet]);
A = &(d_r2[nO*nO*(bet+nV*gam)]); lda = nO;
B = &(d_X_ovvo[nO*(gam+nV*bet)]); ldb = nO*nV*nV; B = &(d_X_ovvo[nO*(gam+nV*bet)]); ldb = nO*nV*nV;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc); C = &(d_r2[nO*nO*(bet+nV*gam)]); ldc = nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
} }
} }
for (int i=0 ; i<nV ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
cudaFree(d_X_ovvo); cudaFree(d_X_ovvo);
} }
@ -456,16 +528,22 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
B = d_X_oovv; ldb = nO*nO; B = d_X_oovv; ldb = nO*nO;
C = d_r2; ldc = nO*nO; C = d_r2; ldc = nO*nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO*nO, nV*nV, &alpha, A, lda, &beta, B, ldb, C, ldc); cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO*nO, nV*nV, &alpha, A, lda, &beta, B, ldb, C, ldc);
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
for (int j=0 ; j<nV ; ++j) { for (int j=0 ; j<nV ; ++j) {
for (int i=0 ; i<nV ; ++i) { for (int i=0 ; i<nV ; ++i) {
alpha = 1.0; cublasSetStream(handle, stream[i]);
beta = -1.0;
A = &(d_r2[nO*nO*(i+nV*j)]); lda = nO; A = &(d_r2[nO*nO*(i+nV*j)]); lda = nO;
B = &(d_X_oovv[nO*nO*(j+nV*i)]); ldb = nO; B = &(d_X_oovv[nO*nO*(j+nV*i)]); ldb = nO;
C = &(d_r2[nO*nO*(i+nV*j)]); ldc = nO; C = &(d_r2[nO*nO*(i+nV*j)]); ldc = nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc); cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
} }
} }
for (int i=0 ; i<nV ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
double* d_X_vovo; double* d_X_vovo;
lda = nV*nO; lda = nV*nO;
@ -473,14 +551,22 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
alpha = 0.0; alpha = 0.0;
beta = 1.0; beta = 1.0;
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
for (int i=0 ; i<nO ; ++i) { for (int i=0 ; i<nO ; ++i) {
for (int gam=0 ; gam<nV ; ++gam) { for (int gam=0 ; gam<nV ; ++gam) {
cublasSetStream(handle, stream[gam]);
A = &(d_X_vovo[nV*nO*(gam+nV*i)]); lda = nV; A = &(d_X_vovo[nV*nO*(gam+nV*i)]); lda = nV;
B = &(d_cc_space_v_ovvo[nO*nV*(gam+nV*i)]); ldb = nO; B = &(d_cc_space_v_ovvo[nO*nV*(gam+nV*i)]); ldb = nO;
C = &(d_X_vovo[nV*nO*(gam+nV*i)]); ldc = nV; C = &(d_X_vovo[nV*nO*(gam+nV*i)]); ldc = nV;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nV, nO, &alpha, A, lda, &beta, B, ldb, C, ldc); cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nV, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
} }
} }
for (int i=0 ; i<nV ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
double* d_Y_oovo; double* d_Y_oovo;
lda = nO*nO; lda = nO*nO;
@ -506,18 +592,31 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
cudaFree(d_Y_oovo); cudaFree(d_Y_oovo);
alpha = 1.0;
beta = -1.0;
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
for (int j=0 ; j<nV ; ++j) { for (int j=0 ; j<nV ; ++j) {
for (int i=0 ; i<nV ; ++i) { for (int i=0 ; i<nV ; ++i) {
alpha = 1.0; cublasSetStream(handle, stream[i]);
beta = -1.0;
A = &(d_r2[nO*nO*(i+nV*j)]); lda = nO; A = &(d_r2[nO*nO*(i+nV*j)]); lda = nO;
B = &(d_X_oovv[nO*nO*(i+nV*j)]); ldb = nO; B = &(d_X_oovv[nO*nO*(i+nV*j)]); ldb = nO;
C = &(d_r2[nO*nO*(i+nV*j)]); ldc = nO; C = &(d_r2[nO*nO*(i+nV*j)]); ldc = nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc); cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
for (int i=0 ; i<nV ; ++i) {
cublasSetStream(handle, stream[i]);
A = &(d_r2[nO*nO*(i+nV*j)]); lda = nO;
B = &(d_X_oovv[nO*nO*(j+nV*i)]); ldb = nO; B = &(d_X_oovv[nO*nO*(j+nV*i)]); ldb = nO;
C = &(d_r2[nO*nO*(i+nV*j)]); ldc = nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc); cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
} }
} }
for (int i=0 ; i<nV ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
cudaFree(d_X_oovv); cudaFree(d_X_oovv);
} }
@ -550,16 +649,24 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
double* d_X_ovoo; double* d_X_ovoo;
lda = nO*nV; lda = nO*nV;
cudaMalloc((void **)&d_X_ovoo, nO*nV*nO*nO * sizeof(double)); cudaMalloc((void **)&d_X_ovoo, nO*nV*nO*nO * sizeof(double));
alpha = 0.0;
beta = 1.0;
for (int i=0 ; i<nO ; ++i) {
cudaStreamCreate(&(stream[i]));
}
for (int j=0 ; j<nO ; ++j) { for (int j=0 ; j<nO ; ++j) {
for (int i=0 ; i<nO ; ++i) { for (int i=0 ; i<nO ; ++i) {
alpha = 0.0; cublasSetStream(handle, stream[i]);
beta = 1.0;
A = &(d_X_ovoo[nO*nV*(i+nO*j)]); lda = nO; A = &(d_X_ovoo[nO*nV*(i+nO*j)]); lda = nO;
B = &(d_cc_space_v_ovoo[nO*nV*(j+nO*i)]); ldb = nO; B = &(d_cc_space_v_ovoo[nO*nV*(j+nO*i)]); ldb = nO;
C = &(d_X_ovoo[nO*nV*(i+nO*j)]); ldc = nO; C = &(d_X_ovoo[nO*nV*(i+nO*j)]); ldc = nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO, nV, &alpha, A, lda, &beta, B, ldb, C, ldc); cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO, nV, &alpha, A, lda, &beta, B, ldb, C, ldc);
} }
} }
for (int i=0 ; i<nO ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
double* d_Y_ovov; double* d_Y_ovov;
@ -574,16 +681,24 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
C=d_Y_ovov; ldc=nO*nV*nO; C=d_Y_ovov; ldc=nO*nV*nO;
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc); cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
alpha = 1.0;
beta = -1.0;
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
for (int j=0 ; j<nO ; ++j) { for (int j=0 ; j<nO ; ++j) {
for (int i=0 ; i<nV ; ++i) { for (int i=0 ; i<nV ; ++i) {
alpha = 1.0; cublasSetStream(handle, stream[i]);
beta = -1.0;
A = &(d_J1[nO*nV*(i+nV*j)]); lda = nO; A = &(d_J1[nO*nV*(i+nV*j)]); lda = nO;
B = &(d_Y_ovov[nO*nV*(j+nO*i)]); ldb = nO; B = &(d_Y_ovov[nO*nV*(j+nO*i)]); ldb = nO;
C = &(d_J1[nO*nV*(i+nV*j)]); ldc = nO; C = &(d_J1[nO*nV*(i+nV*j)]); ldc = nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO, nV, &alpha, A, lda, &beta, B, ldb, C, ldc); cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO, nV, &alpha, A, lda, &beta, B, ldb, C, ldc);
} }
} }
for (int i=0 ; i<nV ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
double* d_tmp_cc; double* d_tmp_cc;
lda = cholesky_mo_num; lda = cholesky_mo_num;
@ -611,22 +726,35 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
cudaFree(d_tmp_cc); cudaFree(d_tmp_cc);
alpha = 1.0;
beta = 1.0;
for (int i=0 ; i<nO ; ++i) { for (int i=0 ; i<nO ; ++i) {
alpha = 1.0; cudaStreamCreate(&(stream[i]));
beta = 1.0; }
for (int i=0 ; i<nO ; ++i) {
cublasSetStream(handle, stream[i]);
A = &(d_J1[nO*nV*nV*i]); lda = nO*nV; A = &(d_J1[nO*nV*nV*i]); lda = nO*nV;
B = &(d_J1_tmp[nV*nO*nV*i]); ldb = nV; B = &(d_J1_tmp[nV*nO*nV*i]); ldb = nV;
C = &(d_J1[nO*nV*nV*i]); ldc = nO*nV; C = &(d_J1[nO*nV*nV*i]); ldc = nO*nV;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nO*nV, nV, &alpha, A, lda, &beta, B, ldb, C, ldc); cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nO*nV, nV, &alpha, A, lda, &beta, B, ldb, C, ldc);
} }
for (int i=0 ; i<nO ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
cudaFree(d_J1_tmp); cudaFree(d_J1_tmp);
lda = nO*nV; lda = nO*nV;
cublasGetMatrix(nO*nV, nV*nO, sizeof(double), d_J1, lda, J1, lda); cublasGetMatrix(nO*nV, nV*nO, sizeof(double), d_J1, lda, J1, lda);
cudaFree(d_J1); cudaFree(d_J1);
} }
}
#pragma omp section
{
}
} // end sections
lda = cholesky_mo_num * nV; lda = cholesky_mo_num * nV;
@ -718,6 +846,7 @@ cublasGetMatrix(nO*nV, nV*nO, sizeof(double), d_J1, lda, J1, lda);
free(r2_tmp); free(r2_tmp);
cudaFree(d_r2); cudaFree(d_r2);
cublasDestroy(handle); cublasDestroy(handle);
} }