More on GPU

This commit is contained in:
Anthony Scemama 2023-07-20 14:53:40 +02:00
parent 493f2cf7d8
commit 898d9e04d2
3 changed files with 102 additions and 76 deletions

View File

@ -492,6 +492,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
call compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num,t1,t2,tau, &
cc_space_v_oo_chol, cc_space_v_ov_chol, cc_space_v_vo_chol, cc_space_v_vv_chol, &
cc_space_v_oooo, cc_space_v_vooo, cc_space_v_oovv, cc_space_v_vvoo, &
cc_space_v_oovo, cc_space_v_ovvo, &
cc_space_f_vo, H_vv, g_occ, r2)
@ -501,83 +502,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
double precision, allocatable :: X_ovvo(:,:,:,:)
double precision, allocatable :: tcc(:,:,:), tcc2(:,:,:)
allocate(X_oovv(nO,nO,nV,nV))
call dgemm('N','N',nO*nO*nV,nV,nO, &
1d0, cc_space_v_oovo, size(cc_space_v_oovo,1) * size(cc_space_v_oovo,2) * size(cc_space_v_oovo,3), &
t1 , size(t1,1), &
0d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3))
!$omp parallel &
!$omp shared(nO,nV,r2,X_oovv) &
!$omp private(u,v,gam,beta) &
!$omp default(none)
!$omp do
do gam = 1, nV
do beta = 1, nV
do v = 1, nO
do u = 1, nO
r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(u,v,beta,gam) - X_oovv(v,u,gam,beta)
enddo
enddo
enddo
enddo
!$omp end do
!$omp end parallel
deallocate(X_oovv)
double precision, allocatable :: X_vovo(:,:,:,:), Y_oovo(:,:,:,:)
allocate(X_vovo(nV,nO,nV,nO))
!$omp parallel &
!$omp shared(nO,nV,X_vovo,cc_space_v_ovvo) &
!$omp private(a,v,gam,i) &
!$omp default(none)
do i = 1, nO
!$omp do
do gam = 1, nV
do v = 1, nO
do a = 1, nV
X_vovo(a,v,gam,i) = cc_space_v_ovvo(v,a,gam,i)
enddo
enddo
enddo
!$omp end do nowait
enddo
!$omp end parallel
allocate(Y_oovo(nO,nO,nV,nO))
call dgemm('N','N',nO,nO*nV*nO,nV, &
1d0, t1, size(t1,1), &
X_vovo, size(X_vovo,1), &
0d0, Y_oovo, size(Y_oovo,1))
deallocate(X_vovo)
allocate(X_oovv(nO,nO,nV,nV))
call dgemm('N','N',nO*nO*nV, nV, nO, &
1d0, Y_oovo, size(Y_oovo,1) * size(Y_oovo,2) * size(Y_oovo,3), &
t1 , size(t1,1), &
0d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3))
deallocate(Y_oovo)
!$omp parallel &
!$omp shared(nO,nV,r2,X_oovv) &
!$omp private(u,v,gam,beta) &
!$omp default(none)
!$omp do
do gam = 1, nV
do beta = 1, nV
do v = 1, nO
do u = 1, nO
r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(u,v,gam,beta) - X_oovv(v,u,beta,gam)
enddo
enddo
enddo
enddo
!$omp end do
!$omp end parallel
deallocate(X_oovv)
double precision, allocatable :: J1(:,:,:,:)
allocate(J1(nO,nV,nV,nO))

View File

@ -71,6 +71,8 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
double* cc_space_v_vooo,
double* cc_space_v_oovv,
double* cc_space_v_vvoo,
double* cc_space_v_oovo,
double* cc_space_v_ovvo,
double* cc_space_f_vo,
double* H_vv,
double* g_occ,
@ -460,6 +462,102 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
cudaFree(d_X_ovvo);
}
#pragma omp section
{
double* d_cc_space_v_oovo;
lda = nO*nO;
cudaMalloc((void **)&d_cc_space_v_oovo, nO*nO*nV*nO * sizeof(double));
cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_oovo, lda, d_cc_space_v_oovo, lda);
double* d_X_oovv;
lda = nO*nO;
cudaMalloc((void **)&d_X_oovv, nO*nO*nV*nV * sizeof(double));
alpha = 1.0;
beta = 0.0;
m=nO*nO*nV; n=nV; k=nO;
A=d_cc_space_v_oovo; lda=nO*nO*nV;
B=d_t1; ldb=nO;
C=d_X_oovv; ldc=nO*nO*nV;
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
alpha = 1.0;
beta = -1.0;
A = d_r2; lda = nO*nO;
B = d_X_oovv; ldb = nO*nO;
C = d_r2; ldc = nO*nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO*nO, nV*nV, &alpha, A, lda, &beta, B, ldb, C, ldc);
for (int j=0 ; j<nV ; ++j) {
for (int i=0 ; i<nV ; ++i) {
alpha = 1.0;
beta = -1.0;
A = &(d_r2[nO*nO*(i+nV*j)]); lda = nO;
B = &(d_X_oovv[nO*nO*(j+nV*i)]); ldb = nO;
C = &(d_r2[nO*nO*(i+nV*j)]); ldc = nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
}
cudaFree(d_cc_space_v_oovo);
double* d_cc_space_v_ovvo;
lda = nO*nV;
cudaMalloc((void **)&d_cc_space_v_ovvo, nO*nV*nV*nO * sizeof(double));
cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_ovvo, lda, d_cc_space_v_ovvo, lda);
double* d_X_vovo;
lda = nV*nO;
cudaMalloc((void **)&d_X_vovo, nV*nO*nV*nO * sizeof(double));
alpha = 0.0;
beta = 1.0;
for (int i=0 ; i<nO ; ++i) {
for (int gam=0 ; gam<nV ; ++gam) {
A = &(d_X_vovo[nV*nO*(gam+nV*i)]); lda = nV;
B = &(d_cc_space_v_ovvo[nO*nV*(gam+nV*i)]); ldb = nO;
C = &(d_X_vovo[nV*nO*(gam+nV*i)]); ldc = nV;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nV, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
}
cudaFree(d_cc_space_v_ovvo);
double* d_Y_oovo;
lda = nO*nO;
cudaMalloc((void **)&d_Y_oovo, nO*nO*nV*nO * sizeof(double));
alpha = 1.0;
beta = 0.0;
m=nO; n=nO*nV*nO; k=nV;
A=d_t1; lda=nO;
B=d_X_vovo; ldb=nV;
C=d_Y_oovo; ldc=nO;
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
cudaFree(d_X_vovo);
alpha = 1.0;
beta = 0.0;
m=nO*nO*nV; n=nV; k=nO;
A=d_Y_oovo; lda=nO*nO*nV;
B=d_t1; ldb=nO;
C=d_X_oovv; ldc=nO*nO*nV;
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
cudaFree(d_Y_oovo);
for (int j=0 ; j<nV ; ++j) {
for (int i=0 ; i<nV ; ++i) {
alpha = 1.0;
beta = -1.0;
A = &(d_r2[nO*nO*(i+nV*j)]); lda = nO;
B = &(d_X_oovv[nO*nO*(i+nV*j)]); ldb = nO;
C = &(d_r2[nO*nO*(i+nV*j)]); ldc = nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
B = &(d_X_oovv[nO*nO*(j+nV*i)]); ldb = nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
}
cudaFree(d_X_oovv);
}
}

View File

@ -7,6 +7,7 @@ module gpu_module
subroutine compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num, t1, t2, tau,&
cc_space_v_oo_chol, cc_space_v_ov_chol, cc_space_v_vo_chol, cc_space_v_vv_chol, &
cc_space_v_oooo, cc_space_v_vooo, cc_space_v_oovv, cc_space_v_vvoo, &
cc_space_v_oovo, cc_space_v_ovvo, &
cc_space_f_vo, H_vv, g_occ, r2) bind(C)
import c_int, c_double
integer(c_int), value :: nO, nV, cholesky_mo_num
@ -21,6 +22,8 @@ module gpu_module
real(c_double), intent(in) :: cc_space_v_vooo(nV,nO,nO,nO)
real(c_double), intent(in) :: cc_space_v_oovv(nO,nO,nV,nV)
real(c_double), intent(in) :: cc_space_v_vvoo(nV,nV,nO,nO)
real(c_double), intent(in) :: cc_space_v_oovo(nO,nO,nV,nO)
real(c_double), intent(in) :: cc_space_v_ovvo(nO,nV,nV,nO)
real(c_double), intent(in) :: cc_space_f_vo(nV,nO)
real(c_double), intent(in) :: H_vv(nV,nV)
real(c_double), intent(in) :: g_occ(nO,nO)