From fcb11d662ca8f0789f3c0184e2f02c078c3fef09 Mon Sep 17 00:00:00 2001 From: AbdAmmar Date: Fri, 29 Nov 2024 22:02:59 +0100 Subject: [PATCH] fixed bug in CUDA implement of dRPA-sing --- src/RPA/phRRPA.f90 | 8 ++++++-- src/RPA/phRRPA_GPU.f90 | 8 ++++++-- src/cuda/src/elementwise_dsqrt_inplace.cu | 10 +++++----- src/cuda/src/ph_drpa_amb_sing.cu | 5 ++++- src/cuda/src/ph_drpa_apb_sing.cu | 5 ++++- src/cuda/src/ph_drpa_sing.c | 15 ++++++++++----- 6 files changed, 35 insertions(+), 16 deletions(-) diff --git a/src/RPA/phRRPA.f90 b/src/RPA/phRRPA.f90 index 3ddeb03..ebfa742 100644 --- a/src/RPA/phRRPA.f90 +++ b/src/RPA/phRRPA.f90 @@ -29,7 +29,7 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO, ! Local variables - integer :: i + integer :: ia integer :: ispin logical :: dRPA double precision :: t1, t2 @@ -81,8 +81,12 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO, !call wall_time(t1) call phLR(TDA,nS,Aph,Bph,EcRPA(ispin),Om,XpY,XmY) - !call wall_time(t2) + call wall_time(t2) !print *, "wall time diag A on CPU (sec) = ", t2 - t1 + !do ia = 1, nS + ! write(112, *) Om(ia) + !enddo + !stop call print_excitation_energies('phRPA@RHF','singlet',nS,Om) call phLR_transition_vectors(.true.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY) diff --git a/src/RPA/phRRPA_GPU.f90 b/src/RPA/phRRPA_GPU.f90 index f5c27c0..ed85749 100644 --- a/src/RPA/phRRPA_GPU.f90 +++ b/src/RPA/phRRPA_GPU.f90 @@ -29,7 +29,7 @@ subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC double precision,intent(in) :: dipole_int(nBas,nBas,ncart) - integer :: i + integer :: i, a, ia integer :: ispin logical :: dRPA double precision :: t1, t2 @@ -78,6 +78,10 @@ subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC call ph_drpa_sing(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om(1), XpY(1,1), XmY(1,1)) !call wall_time(t2) !print*, 'diag time on GPU (sec):', t2 - t1 + !do ia = 1, nS + ! write(111, *) Om(ia) + !enddo + !stop endif @@ -98,7 +102,7 @@ subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC do a = nO+1, nBas-nR ia = ia + 1 iorder(ia) = ia - Om(ia) = e(a) - e(i) + Om(ia) = eHF(a) - eHF(i) XpY(ia,ia) = 1.d0 enddo enddo diff --git a/src/cuda/src/elementwise_dsqrt_inplace.cu b/src/cuda/src/elementwise_dsqrt_inplace.cu index 8e0202f..533aeaa 100644 --- a/src/cuda/src/elementwise_dsqrt_inplace.cu +++ b/src/cuda/src/elementwise_dsqrt_inplace.cu @@ -2,14 +2,14 @@ #include -__global__ void elementwise_dsqrt_inplace_kernel(int nS, double *A) { +__global__ void elementwise_dsqrt_inplace_kernel(int n, double *A) { int i; i = blockIdx.x * blockDim.x + threadIdx.x; - while(i < nS) { + while(i < n) { if(A[i] > 0.0) { @@ -30,10 +30,10 @@ __global__ void elementwise_dsqrt_inplace_kernel(int nS, double *A) { -extern "C" void elementwise_dsqrt_inplace(int nS, double *A) { +extern "C" void elementwise_dsqrt_inplace(int n, double *A) { int sBlocks = 32; - int nBlocks = (nS + sBlocks - 1) / sBlocks; + int nBlocks = (n + sBlocks - 1) / sBlocks; dim3 dimGrid(nBlocks, 1, 1); dim3 dimBlock(sBlocks, 1, 1); @@ -42,7 +42,7 @@ extern "C" void elementwise_dsqrt_inplace(int nS, double *A) { nBlocks, sBlocks); - elementwise_dsqrt_inplace_kernel<<>>(nS, A); + elementwise_dsqrt_inplace_kernel<<>>(n, A); } diff --git a/src/cuda/src/ph_drpa_amb_sing.cu b/src/cuda/src/ph_drpa_amb_sing.cu index 4f88b4b..0f99bc3 100644 --- a/src/cuda/src/ph_drpa_amb_sing.cu +++ b/src/cuda/src/ph_drpa_amb_sing.cu @@ -9,6 +9,7 @@ __global__ void ph_dRPA_AmB_sing_kernel(int nO, int nV, int nBas, int nS, double int nBas2, nBas3; int i_A0, i_A1, i_A2; int i_I0, i_I1, i_I2; + int i_J1, i_J2; bool a_eq_b; @@ -33,17 +34,19 @@ __global__ void ph_dRPA_AmB_sing_kernel(int nO, int nV, int nBas, int nS, double i_A1 = i_A0 + bb; i_I1 = i_I0 + b * nBas; + i_J1 = i_I0 + b * nBas3; i = 0; while(i < nO) { i_A2 = i_A1 + i * nVS; i_I2 = i_I1 + i; + i_J2 = i_J1 + i; j = 0; while(j < nO) { - AmB[i_A2 + j * nV] = 2.0 * (ERI[i_I2 + j * nBas3] - ERI[i_I2 + j * nBas]); + AmB[i_A2 + j * nV] = 2.0 * (ERI[i_I2 + j * nBas3] - ERI[i_J2 + j * nBas]); if(a_eq_b && (i==j)) { AmB[i_A2 + j * nV] += eps[a] - eps[i]; } diff --git a/src/cuda/src/ph_drpa_apb_sing.cu b/src/cuda/src/ph_drpa_apb_sing.cu index d7de329..0d25bd0 100644 --- a/src/cuda/src/ph_drpa_apb_sing.cu +++ b/src/cuda/src/ph_drpa_apb_sing.cu @@ -9,6 +9,7 @@ __global__ void ph_dRPA_ApB_sing_kernel(int nO, int nV, int nBas, int nS, double int nBas2, nBas3; int i_A0, i_A1, i_A2; int i_I0, i_I1, i_I2; + int i_J1, i_J2; bool a_eq_b; @@ -33,17 +34,19 @@ __global__ void ph_dRPA_ApB_sing_kernel(int nO, int nV, int nBas, int nS, double i_A1 = i_A0 + bb; i_I1 = i_I0 + b * nBas; + i_J1 = i_I0 + b * nBas3; i = 0; while(i < nO) { i_A2 = i_A1 + i * nVS; i_I2 = i_I1 + i; + i_J2 = i_J1 + i; j = 0; while(j < nO) { - ApB[i_A2 + j * nV] = 2.0 * (ERI[i_I2 + j * nBas3] + ERI[i_I2 + j * nBas]); + ApB[i_A2 + j * nV] = 2.0 * (ERI[i_I2 + j * nBas3] + ERI[i_J2 + j * nBas]); if(a_eq_b && (i==j)) { ApB[i_A2 + j * nV] += eps[a] - eps[i]; } diff --git a/src/cuda/src/ph_drpa_sing.c b/src/cuda/src/ph_drpa_sing.c index d1cba17..3d720e7 100644 --- a/src/cuda/src/ph_drpa_sing.c +++ b/src/cuda/src/ph_drpa_sing.c @@ -80,8 +80,10 @@ void ph_drpa_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, // diagonalize A-B int *d_info1 = NULL; check_Cuda_Errors(cudaMalloc((void**)&d_info1, sizeof(int)), "cudaMalloc", __FILE__, __LINE__); + double *d_Omega = NULL; check_Cuda_Errors(cudaMalloc((void**)&d_Omega, nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); + cudaEventRecord(start, 0); diag_dn_dsyevd(nS, d_info1, d_Omega, d_AmB); check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); @@ -91,6 +93,7 @@ void ph_drpa_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, printf("Time elapsed on diag AmB = %f msec\n", elapsedTime); + // d_Omega <-- d_Omega^{0.5} // TODO: nb of <= 0 elements cudaEventRecord(start, 0); @@ -101,15 +104,17 @@ void ph_drpa_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, printf("Time elapsed on elementwise_dsqrt_inplace %f msec\n", elapsedTime); - // d_AmBSq = d_AmB (d_Omega)^{+0.5} (d_AmB)^T - // d_AmBSqInv = d_AmB (d_Omega)^{-0.5} (d_AmB)^T - cudaEventRecord(start, 0); + // d_AmBSq = d_AmB (d_Omega)^{+0.5} (d_AmB)^T double *d_AmBSq = NULL; - check_Cuda_Errors(cudaMalloc((void**)&d_AmBSq, nS * sizeof(double)), + check_Cuda_Errors(cudaMalloc((void**)&d_AmBSq, nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); + + // d_AmBSqInv = d_AmB (d_Omega)^{-0.5} (d_AmB)^T double *d_AmBSqInv = NULL; - check_Cuda_Errors(cudaMalloc((void**)&d_AmBSqInv, nS * sizeof(double)), + check_Cuda_Errors(cudaMalloc((void**)&d_AmBSqInv, nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); + + cudaEventRecord(start, 0); A_D_At(nS, d_AmB, d_Omega, d_AmBSq); A_Dinv_At(nS, d_AmB, d_Omega, d_AmBSqInv); check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__);