Introduced cuda streams

This commit is contained in:
Anthony Scemama 2023-08-02 18:31:44 +02:00
parent 5da8e2cba4
commit b376fe685c
1 changed files with 152 additions and 23 deletions

View File

@ -38,6 +38,8 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
cublasCreate(&handle);
cudaStream_t stream[nV];
double* d_cc_space_v_oo_chol = data[igpu].cc_space_v_oo_chol;
double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;
double* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol;
@ -74,6 +76,7 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
cudaMalloc((void **)&d_t2, nO*nO*nV*nV * sizeof(double));
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), t2, lda, d_t2, lda);
#pragma omp sections
{
@ -99,8 +102,12 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
B = d_Y_oooo; ldb = nO*nO;
C = d_A1; ldc = nO*nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO*nO, nO*nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
for (int i=0 ; i<nO ; ++i) {
cudaStreamCreate(&(stream[i]));
}
for (int j=0 ; j<nO ; ++j) {
for (int i=0 ; i<nO ; ++i) {
cublasSetStream(handle, stream[i]);
alpha = 1.0;
beta = 1.0;
A = &(d_A1[nO*nO*(i+nO*j)]); lda = nO;
@ -110,6 +117,10 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
}
}
for (int i=0 ; i<nO ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
cudaFree(d_Y_oooo);
alpha = 1.0;
@ -177,6 +188,10 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
double* d_tmp_vo2;
cudaMalloc((void**)&d_tmp_vo2, cholesky_mo_num*nV*nO*sizeof(double));
for (int i=0 ; i<nO ; ++i) {
cudaStreamCreate(&(stream[i]));
}
for (int i=0 ; i<nO ; ++i) {
cublasSetStream(handle, stream[i]);
alpha = -1.0;
beta = 0.0;
A = &(d_tmp_vo[cholesky_mo_num*nV*i]); lda = cholesky_mo_num;
@ -184,6 +199,10 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
C = &(d_tmp_vo2[cholesky_mo_num*i]); ldc = cholesky_mo_num*nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, cholesky_mo_num, nV, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
for (int i=0 ; i<nO ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
cudaFree(d_tmp_vo);
alpha = 1.0;
@ -213,6 +232,7 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
C = d_r2; ldc = nO*nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO*nO, nV*nV, &alpha, A, lda, &beta, B, ldb, C, ldc);
/*
double * Y_oovv = malloc(nO*nO*nV*nV*sizeof(double));
lda=nO*nO;
cublasGetMatrix(nO*nO, nV*nV, sizeof(double), d_Y_oovv, lda, Y_oovv, lda);
@ -239,9 +259,15 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), r2_tmp, lda, d_r2, lda);
free(r2_tmp);
/*
*/
//--
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
for (int j=0 ; j<nV ; ++j) {
for (int i=0 ; i<nV ; ++i) {
cublasSetStream(handle, stream[i]);
alpha = 1.0;
beta = 1.0;
A = &(d_r2[nO*nO*(i+nV*j)]); lda = nO;
@ -251,8 +277,12 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
}
}
for (int i=0 ; i<nV ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
cudaFree(d_Y_oovv);
*/
//--
}
// g_occ
@ -291,6 +321,7 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
C = d_r2; ldc = nO*nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO*nO, nV*nV, &alpha, A, lda, &beta, B, ldb, C, ldc);
/*
double * X_oovv = malloc(nO*nO*nV*nV*sizeof(double));
lda=nO*nO;
cublasGetMatrix(nO*nO, nV*nV, sizeof(double), d_X_oovv, lda, X_oovv, lda);
@ -316,10 +347,15 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
lda=nO*nO;
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), r2_tmp, lda, d_r2, lda);
free(r2_tmp);
*/
/*
//--
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
for (int j=0 ; j<nV ; ++j) {
for (int i=0 ; i<nV ; ++i) {
cublasSetStream(handle, stream[i]);
alpha = 1.0;
beta = -1.0;
A = &(d_r2[nO*nO*(i+nV*j)]); lda = nO;
@ -328,8 +364,13 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
}
for (int i=0 ; i<nV ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
cudaFree(d_X_oovv);
*/
//--
}
#pragma omp section
@ -345,6 +386,10 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
for (int iblock=0 ; iblock<nV ; iblock += BLOCK_SIZE) {
int mbs = nV < iblock+BLOCK_SIZE ? nV : iblock+BLOCK_SIZE;
for (int gam=iblock ; gam<mbs ; ++gam) {
cudaStreamCreate(&(stream[gam]));
}
for (int gam=iblock ; gam<mbs ; ++gam) {
cublasSetStream(handle, stream[gam]);
alpha = 1.0;
beta = 0.0;
m=nV; n=nO*nV; k=cholesky_mo_num;
@ -353,6 +398,10 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
C=&(d_X_vovv[nV*nO*nV*(gam-iblock)]); ldc=nV;
cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
}
for (int gam=iblock ; gam<mbs ; ++gam) {
cudaStreamDestroy(stream[gam]);
}
cublasSetStream(handle, NULL);
mbs = BLOCK_SIZE < nV-iblock ? BLOCK_SIZE : nV-iblock;
alpha = 1.0;
beta = 0.0;
@ -363,20 +412,31 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
}
cudaFree(d_X_vovv);
alpha = 1.0;
beta = 1.0;
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
for (int j=0 ; j<nV ; ++j) {
for (int i=0 ; i<nV ; ++i) {
alpha = 1.0;
beta = 1.0;
cublasSetStream(handle, stream[i]);
A = &(d_r2[nO*nO*(i+nV*j)]); lda = nO;
B = &(d_Y_oovv[nO*nO*(i+nV*j)]); ldb = nO;
C = &(d_r2[nO*nO*(i+nV*j)]); ldc = nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
for (int i=0 ; i<nV ; ++i) {
cublasSetStream(handle, stream[i]);
A = &(d_r2[nO*nO*(i+nV*j)]); lda = nO;
B = &(d_Y_oovv[nO*nO*(j+nV*i)]); ldb = nO;
C = &(d_r2[nO*nO*(i+nV*j)]); ldc = nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
}
for (int i=0 ; i<nV ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
cudaFree(d_Y_oovv);
}
@ -420,19 +480,31 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
cudaFree(d_tcc);
cudaFree(d_tcc2);
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
alpha = 1.0;
beta = -1.0;
for(int gam = 0; gam < nV; gam++){
for(int bet = 0; bet < nV; bet++){
alpha = 1.0;
beta = -1.0;
cublasSetStream(handle, stream[bet]);
A = &(d_r2[nO*nO*(bet+nV*gam)]); lda = nO;
B = &(d_X_ovvo[nO*(bet+nV*gam)]); ldb = nO*nV*nV;
C = &(d_r2[nO*nO*(bet+nV*gam)]); ldc = nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
for(int bet = 0; bet < nV; bet++){
cublasSetStream(handle, stream[bet]);
A = &(d_r2[nO*nO*(bet+nV*gam)]); lda = nO;
B = &(d_X_ovvo[nO*(gam+nV*bet)]); ldb = nO*nV*nV;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
C = &(d_r2[nO*nO*(bet+nV*gam)]); ldc = nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
}
for (int i=0 ; i<nV ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
cudaFree(d_X_ovvo);
}
@ -456,16 +528,22 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
B = d_X_oovv; ldb = nO*nO;
C = d_r2; ldc = nO*nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO*nO, nV*nV, &alpha, A, lda, &beta, B, ldb, C, ldc);
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
for (int j=0 ; j<nV ; ++j) {
for (int i=0 ; i<nV ; ++i) {
alpha = 1.0;
beta = -1.0;
cublasSetStream(handle, stream[i]);
A = &(d_r2[nO*nO*(i+nV*j)]); lda = nO;
B = &(d_X_oovv[nO*nO*(j+nV*i)]); ldb = nO;
C = &(d_r2[nO*nO*(i+nV*j)]); ldc = nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
}
for (int i=0 ; i<nV ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
double* d_X_vovo;
lda = nV*nO;
@ -473,14 +551,22 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
alpha = 0.0;
beta = 1.0;
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
for (int i=0 ; i<nO ; ++i) {
for (int gam=0 ; gam<nV ; ++gam) {
cublasSetStream(handle, stream[gam]);
A = &(d_X_vovo[nV*nO*(gam+nV*i)]); lda = nV;
B = &(d_cc_space_v_ovvo[nO*nV*(gam+nV*i)]); ldb = nO;
C = &(d_X_vovo[nV*nO*(gam+nV*i)]); ldc = nV;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nV, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
}
for (int i=0 ; i<nV ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
double* d_Y_oovo;
lda = nO*nO;
@ -506,18 +592,31 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
cudaFree(d_Y_oovo);
alpha = 1.0;
beta = -1.0;
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
for (int j=0 ; j<nV ; ++j) {
for (int i=0 ; i<nV ; ++i) {
alpha = 1.0;
beta = -1.0;
cublasSetStream(handle, stream[i]);
A = &(d_r2[nO*nO*(i+nV*j)]); lda = nO;
B = &(d_X_oovv[nO*nO*(i+nV*j)]); ldb = nO;
C = &(d_r2[nO*nO*(i+nV*j)]); ldc = nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
for (int i=0 ; i<nV ; ++i) {
cublasSetStream(handle, stream[i]);
A = &(d_r2[nO*nO*(i+nV*j)]); lda = nO;
B = &(d_X_oovv[nO*nO*(j+nV*i)]); ldb = nO;
C = &(d_r2[nO*nO*(i+nV*j)]); ldc = nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
}
for (int i=0 ; i<nV ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
cudaFree(d_X_oovv);
}
@ -550,16 +649,24 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
double* d_X_ovoo;
lda = nO*nV;
cudaMalloc((void **)&d_X_ovoo, nO*nV*nO*nO * sizeof(double));
alpha = 0.0;
beta = 1.0;
for (int i=0 ; i<nO ; ++i) {
cudaStreamCreate(&(stream[i]));
}
for (int j=0 ; j<nO ; ++j) {
for (int i=0 ; i<nO ; ++i) {
alpha = 0.0;
beta = 1.0;
cublasSetStream(handle, stream[i]);
A = &(d_X_ovoo[nO*nV*(i+nO*j)]); lda = nO;
B = &(d_cc_space_v_ovoo[nO*nV*(j+nO*i)]); ldb = nO;
C = &(d_X_ovoo[nO*nV*(i+nO*j)]); ldc = nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO, nV, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
}
for (int i=0 ; i<nO ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
double* d_Y_ovov;
@ -574,16 +681,24 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
C=d_Y_ovov; ldc=nO*nV*nO;
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
alpha = 1.0;
beta = -1.0;
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
for (int j=0 ; j<nO ; ++j) {
for (int i=0 ; i<nV ; ++i) {
alpha = 1.0;
beta = -1.0;
cublasSetStream(handle, stream[i]);
A = &(d_J1[nO*nV*(i+nV*j)]); lda = nO;
B = &(d_Y_ovov[nO*nV*(j+nO*i)]); ldb = nO;
C = &(d_J1[nO*nV*(i+nV*j)]); ldc = nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO, nV, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
}
for (int i=0 ; i<nV ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
double* d_tmp_cc;
lda = cholesky_mo_num;
@ -611,22 +726,35 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
cudaFree(d_tmp_cc);
alpha = 1.0;
beta = 1.0;
for (int i=0 ; i<nO ; ++i) {
alpha = 1.0;
beta = 1.0;
cudaStreamCreate(&(stream[i]));
}
for (int i=0 ; i<nO ; ++i) {
cublasSetStream(handle, stream[i]);
A = &(d_J1[nO*nV*nV*i]); lda = nO*nV;
B = &(d_J1_tmp[nV*nO*nV*i]); ldb = nV;
C = &(d_J1[nO*nV*nV*i]); ldc = nO*nV;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nO*nV, nV, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
for (int i=0 ; i<nO ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
cudaFree(d_J1_tmp);
lda = nO*nV;
cublasGetMatrix(nO*nV, nV*nO, sizeof(double), d_J1, lda, J1, lda);
lda = nO*nV;
cublasGetMatrix(nO*nV, nV*nO, sizeof(double), d_J1, lda, J1, lda);
cudaFree(d_J1);
}
}
#pragma omp section
{
}
} // end sections
lda = cholesky_mo_num * nV;
@ -718,6 +846,7 @@ cublasGetMatrix(nO*nV, nV*nO, sizeof(double), d_J1, lda, J1, lda);
free(r2_tmp);
cudaFree(d_r2);
cublasDestroy(handle);
}