Added loop

This commit is contained in:
Anthony Scemama 2023-07-17 14:21:22 +02:00
parent eaa7160884
commit 79060ff6e1
2 changed files with 85 additions and 36 deletions

View File

@ -499,41 +499,41 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
double precision, allocatable :: X_oovv(:,:,:,:)
double precision, allocatable :: X_vovv(:,:,:,:)
allocate(X_vovv(nV,nO,nV,block_size))
allocate(Y_oovv(nO,nO,nV,nV))
do iblock = 1, nV, block_size
do gam = iblock, min(nV, iblock+block_size-1)
call dgemm('T','N',nV, nO*nV, cholesky_mo_num, 1.d0, &
cc_space_v_vv_chol(1,1,gam), cholesky_mo_num, cc_space_v_ov_chol, &
cholesky_mo_num, 0.d0, X_vovv(1,1,1,gam-iblock+1), nV)
enddo
call dgemm('N','N',nO,nO*nV*min(block_size, nV-iblock+1),nV, &
1d0, t1 , size(t1,1), &
X_vovv, size(X_vovv,1), &
0d0, Y_oovv(1,1,1,iblock), size(Y_oovv,1))
enddo
deallocate(X_vovv)
!$omp parallel &
!$omp shared(nO,nV,r2,Y_oovv) &
!$omp private(u,v,gam,beta) &
!$omp default(none)
!$omp do
do gam = 1, nV
do beta = 1, nV
do v = 1, nO
do u = 1, nO
r2(u,v,beta,gam) = r2(u,v,beta,gam) + Y_oovv(v,u,beta,gam) + Y_oovv(u,v,gam,beta)
enddo
enddo
enddo
enddo
!$omp end do
!$omp end parallel
deallocate(Y_oovv)
! allocate(X_vovv(nV,nO,nV,block_size))
! allocate(Y_oovv(nO,nO,nV,nV))
!
! do iblock = 1, nV, block_size
! do gam = iblock, min(nV, iblock+block_size-1)
! call dgemm('T','N',nV, nO*nV, cholesky_mo_num, 1.d0, &
! cc_space_v_vv_chol(1,1,gam), cholesky_mo_num, cc_space_v_ov_chol, &
! cholesky_mo_num, 0.d0, X_vovv(1,1,1,gam-iblock+1), nV)
!
! enddo
! call dgemm('N','N',nO,nO*nV*min(block_size, nV-iblock+1),nV, &
! 1d0, t1 , size(t1,1), &
! X_vovv, size(X_vovv,1), &
! 0d0, Y_oovv(1,1,1,iblock), size(Y_oovv,1))
!
! enddo
! deallocate(X_vovv)
!
! !$omp parallel &
! !$omp shared(nO,nV,r2,Y_oovv) &
! !$omp private(u,v,gam,beta) &
! !$omp default(none)
! !$omp do
! do gam = 1, nV
! do beta = 1, nV
! do v = 1, nO
! do u = 1, nO
! r2(u,v,beta,gam) = r2(u,v,beta,gam) + Y_oovv(v,u,beta,gam) + Y_oovv(u,v,gam,beta)
! enddo
! enddo
! enddo
! enddo
! !$omp end do
! !$omp end parallel
! deallocate(Y_oovv)
double precision, allocatable :: X_ovvo(:,:,:,:)
double precision, allocatable :: tcc(:,:,:), tcc2(:,:,:)
@ -884,7 +884,6 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
end
! g_occ
subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvoo,J1)

View File

@ -350,8 +350,58 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
cudaFree(d_X_oovv);
}
#pragma omp section
{
double* d_X_vovv;
lda = nV*nO;
cudaMalloc((void **)&d_X_vovv, nV*nO*nV*BLOCK_SIZE * sizeof(double));
double* d_Y_oovv;
lda = nO*nO;
cudaMalloc((void **)&d_Y_oovv, nO*nO*nV*nV * sizeof(double));
for (size_t iblock=0 ; iblock<nV ; iblock += BLOCK_SIZE) {
size_t mbs = nV < iblock+BLOCK_SIZE ? nV : iblock+BLOCK_SIZE;
for (size_t gam=iblock ; gam<mbs ; ++gam) {
alpha = 1.0;
beta = 0.0;
m=nV; n=nO*nV; k=cholesky_mo_num;
A=&(d_cc_space_v_vv_chol[cholesky_mo_num*nV*gam]); lda=cholesky_mo_num;
B=d_cc_space_v_ov_chol; ldb=cholesky_mo_num;
C=&(d_X_vovv[nV*nO*nV*(gam-iblock)]); ldc=nV;
cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
}
mbs = BLOCK_SIZE < nV-iblock ? BLOCK_SIZE : nV-iblock;
alpha = 1.0;
beta = 0.0;
m=nO; n=nO*nV*mbs; k=nV;
A=d_t1; lda=nO;
B=d_X_vovv; ldb=nV;
C=&(d_Y_oovv[nO*nO*nV*iblock]); ldc=nO;
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
}
cudaFree(d_X_vovv);
for (int j=0 ; j<nV ; ++j) {
for (int i=0 ; i<nV ; ++i) {
alpha = 1.0;
beta = 1.0;
A = &(d_r2[nO*nO*(i+nV*j)]); lda = nO;
B = &(d_Y_oovv[nO*nO*(i+nV*j)]); ldb = nO;
C = &(d_r2[nO*nO*(i+nV*j)]); ldc = nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
A = &(d_r2[nO*nO*(i+nV*j)]); lda = nO;
B = &(d_Y_oovv[nO*nO*(j+nV*i)]); ldb = nO;
C = &(d_r2[nO*nO*(i+nV*j)]); ldc = nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
}
cudaFree(d_Y_oovv);
}
}
lda = cholesky_mo_num * nV;
cudaMalloc((void **)&d_tmp_cc, lda * nV * sizeof(double));