diff --git a/src/RPA/phRRPA.f90 b/src/RPA/phRRPA.f90 index 94c6576..7d00061 100644 --- a/src/RPA/phRRPA.f90 +++ b/src/RPA/phRRPA.f90 @@ -80,9 +80,10 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO, ! DEBUG allocate(Om_gpu(nS), XpY_gpu(nS,nS), XmY_gpu(nS,nS)) - call ph_drpa(nO, nBas, eHF(1), ERI(1,1,1,1), Om_gpu(1), XpY_gpu(1,1), XmY_gpu(1,1)) + call ph_drpa(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om_gpu(1), XpY_gpu(1,1), XmY_gpu(1,1)) print *, ' CPU:', Aph(1,1) print *, ' GPU:', XpY_gpu(1,1) + print *, ' GPU:', XmY_gpu(1,1) stop call phLR(TDA,nS,Aph,Bph,EcRPA(ispin),Om,XpY,XmY) diff --git a/src/cuda/src/ph_drpa.c b/src/cuda/src/ph_drpa.c index 61afcbe..0e0f671 100644 --- a/src/cuda/src/ph_drpa.c +++ b/src/cuda/src/ph_drpa.c @@ -7,7 +7,7 @@ #include "ph_drpa.h" -int ph_drpa(int nO, int nBas, double *h_eps, double *h_ERI, +int ph_drpa(int nO, int nBas, int nS, double *h_eps, double *h_ERI, double *h_Omega, double *h_XpY, double *h_XmY) { double *d_eps; @@ -16,25 +16,23 @@ int ph_drpa(int nO, int nBas, double *h_eps, double *h_ERI, int nBas2 = nBas * nBas; int nBas4 = nBas2 * nBas2; - int ia, jb; - int nS = nO * (nBas - nO); for (ia = 0; ia < nS; ia++) { h_Omega[ia] = 0.0; for (jb = 0; jb < nS; jb++) { - h_XmY[jb + nO * nBas * ia] = 0.0; - h_XpY[jb + nO * nBas * ia] = 0.0; + h_XmY[jb + ia * nS] = 4.0; + //h_XpY[jb + ia * nS] = 5.0; } } - check_Cuda_Errors(cudaMalloc((void**)&d_eps, nO * sizeof(double)), + check_Cuda_Errors(cudaMalloc((void**)&d_eps, nBas * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); check_Cuda_Errors(cudaMalloc((void**)&d_ERI, nBas4 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); - check_Cuda_Errors(cudaMemcpy(d_eps, h_eps, nO * sizeof(double), cudaMemcpyHostToDevice), + check_Cuda_Errors(cudaMemcpy(d_eps, h_eps, nBas * sizeof(double), cudaMemcpyHostToDevice), "cudaMemcpy", __FILE__, __LINE__); check_Cuda_Errors(cudaMemcpy(d_ERI, h_ERI, nBas4 * sizeof(double), cudaMemcpyHostToDevice), "cudaMemcpy", __FILE__, __LINE__); @@ -42,8 +40,8 @@ int ph_drpa(int nO, int nBas, double *h_eps, double *h_ERI, // construct A matrix double *d_A; check_Cuda_Errors(cudaMalloc((void**)&d_A, nS * nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); -// phLR_dRPA_A_sing(nO, nBas, d_eps, d_ERI, d_A); -// check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); + phLR_dRPA_A_sing(nO, nBas, d_eps, d_ERI, d_A); + check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); check_Cuda_Errors(cudaMemcpy(h_XpY, d_A, nS * nS * sizeof(double), cudaMemcpyDeviceToHost), diff --git a/src/cuda/src/phlr_drpa_a_sing.cu b/src/cuda/src/phlr_drpa_a_sing.cu index 3cbc556..7844d66 100644 --- a/src/cuda/src/phlr_drpa_a_sing.cu +++ b/src/cuda/src/phlr_drpa_a_sing.cu @@ -4,39 +4,46 @@ __global__ void phLR_dRPA_A_sing_kernel(int nO, int nBas, double *eps, double *E int i, j, a, b; - int ia, jb, jb_off; + int aa, bb; + int nV, nS, nVS; + int nBas2, nBas3; + int i_A0, i_A1, i_A2; + int i_I0, i_I1, i_I2; - int ij_off0, ij_off; + nV = nBas - nO; + nS = nO * nV; + nVS = nV * nS; - int aa_max = nBas - nO; - int ia_max = aa_max * nO; + nBas2 = nBas * nBas; + nBas3 = nBas2 * nBas; - int nBas2 = nBas * nBas; - int nBas3 = nBas2 * nBas; + aa = blockIdx.x * blockDim.x + threadIdx.x; + bb = blockIdx.y * blockDim.y + threadIdx.y; - int aa = blockIdx.x * blockDim.x + threadIdx.x; - int bb = blockIdx.y * blockDim.y + threadIdx.y; - - while(aa < aa_max) { + while(aa < nV) { a = aa + nO; - ij_off0 = a * nBas2; + i_A0 = aa * nS; + i_I0 = a * nBas2; - while(bb < aa_max) { + while(bb < nV) { b = bb + nO; - ij_off = ij_off0 + b * nBas; + i_A1 = i_A0 + bb; + i_I1 = i_I0 + b * nBas; + i = 0; while(i < nO) { - ia = i * aa_max + aa; - jb_off = ia * ia_max; - - while(j < nO) { - jb = j * aa_max + bb; - A[jb + jb_off] = 2.0 * ERI[i + j * nBas3 + ij_off]; - if(a==b && i==j) { - A[jb + jb_off] += eps[a] - eps[i]; + i_A2 = i_A1 + i * nVS; + i_I2 = i_I1 + i; + + j = 0; + while(j < nO) { + + A[i_A2 + j * nV] = 2.0 * ERI[i_I2 + j * nBas3]; + if((a==b) && (i==j)) { + A[i_A2 + j * nV] += eps[a] - eps[i]; } j ++; diff --git a/src/mod/cu_quack_module.f90 b/src/mod/cu_quack_module.f90 index f73cc6c..a786587 100644 --- a/src/mod/cu_quack_module.f90 +++ b/src/mod/cu_quack_module.f90 @@ -8,16 +8,16 @@ module cu_quack_module interface - subroutine ph_drpa(nO, nBas, eps, ERI, & + subroutine ph_drpa(nO, nBas, nS, eps, ERI, & Omega, XpY, XmY) bind(C, name = "ph_drpa") import c_int, c_double - integer(c_int), intent(in), value :: nO, nBas + integer(c_int), intent(in), value :: nO, nBas, nS real(c_double), intent(in) :: eps(nBas) real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas) - real(c_double), intent(out) :: Omega(nO*nBas) - real(c_double), intent(out) :: XpY(nO*nBas,nO*nBas) - real(c_double), intent(out) :: XmY(nO*nBas,nO*nBas) + real(c_double), intent(out) :: Omega(nS) + real(c_double), intent(out) :: XpY(nS,nS) + real(c_double), intent(out) :: XmY(nS,nS) end subroutine ph_drpa