added Aph_dRPA_sing kernel

2025-05-06 15:24:43 +02:00 · 2024-11-26 23:00:15 +01:00 · 2024-11-26 23:00:15 +01:00 · 66566a8ce7
commit 66566a8ce7
parent 9c42437746
4 changed files with 42 additions and 36 deletions
--- a/src/RPA/phRRPA.f90
+++ b/src/RPA/phRRPA.f90
@ -80,9 +80,10 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,

    ! DEBUG
    allocate(Om_gpu(nS), XpY_gpu(nS,nS), XmY_gpu(nS,nS))
-    call ph_drpa(nO, nBas, eHF(1), ERI(1,1,1,1), Om_gpu(1), XpY_gpu(1,1), XmY_gpu(1,1))
+    call ph_drpa(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om_gpu(1), XpY_gpu(1,1), XmY_gpu(1,1))
    print *, ' CPU:', Aph(1,1)
    print *, ' GPU:', XpY_gpu(1,1)
+    print *, ' GPU:', XmY_gpu(1,1)
    stop

    call phLR(TDA,nS,Aph,Bph,EcRPA(ispin),Om,XpY,XmY)
--- a/src/cuda/src/ph_drpa.c
+++ b/src/cuda/src/ph_drpa.c
@ -7,7 +7,7 @@

 #include "ph_drpa.h"

-int ph_drpa(int nO, int nBas, double *h_eps, double *h_ERI,
+int ph_drpa(int nO, int nBas, int nS, double *h_eps, double *h_ERI,
            double *h_Omega, double *h_XpY, double *h_XmY) {

    double *d_eps;
@ -16,25 +16,23 @@ int ph_drpa(int nO, int nBas, double *h_eps, double *h_ERI,
    int nBas2 = nBas * nBas;
    int nBas4 = nBas2 * nBas2;

-
    int ia, jb;
-    int nS = nO * (nBas - nO);
    for (ia = 0; ia < nS; ia++) {
        h_Omega[ia] = 0.0;
        for (jb = 0; jb < nS; jb++) {
-            h_XmY[jb + nO * nBas * ia] = 0.0;
-            h_XpY[jb + nO * nBas * ia] = 0.0;
+            h_XmY[jb + ia * nS] = 4.0;
+            //h_XpY[jb + ia * nS] = 5.0;
        }
    }


-    check_Cuda_Errors(cudaMalloc((void**)&d_eps, nO * sizeof(double)),
+    check_Cuda_Errors(cudaMalloc((void**)&d_eps, nBas * sizeof(double)),
        "cudaMalloc", __FILE__, __LINE__);
    check_Cuda_Errors(cudaMalloc((void**)&d_ERI, nBas4 * sizeof(double)),
        "cudaMalloc", __FILE__, __LINE__);


-    check_Cuda_Errors(cudaMemcpy(d_eps, h_eps, nO * sizeof(double), cudaMemcpyHostToDevice), 
+    check_Cuda_Errors(cudaMemcpy(d_eps, h_eps, nBas * sizeof(double), cudaMemcpyHostToDevice), 
        "cudaMemcpy", __FILE__, __LINE__);
    check_Cuda_Errors(cudaMemcpy(d_ERI, h_ERI, nBas4 * sizeof(double), cudaMemcpyHostToDevice), 
        "cudaMemcpy", __FILE__, __LINE__);
@ -42,8 +40,8 @@ int ph_drpa(int nO, int nBas, double *h_eps, double *h_ERI,
    // construct A matrix
    double *d_A;
    check_Cuda_Errors(cudaMalloc((void**)&d_A, nS * nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__);
-//    phLR_dRPA_A_sing(nO, nBas, d_eps, d_ERI, d_A);
-//    check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__);
+    phLR_dRPA_A_sing(nO, nBas, d_eps, d_ERI, d_A);
+    check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__);


    check_Cuda_Errors(cudaMemcpy(h_XpY, d_A, nS * nS * sizeof(double), cudaMemcpyDeviceToHost), 
--- a/src/cuda/src/phlr_drpa_a_sing.cu
+++ b/src/cuda/src/phlr_drpa_a_sing.cu
@ -4,39 +4,46 @@ __global__ void phLR_dRPA_A_sing_kernel(int nO, int nBas, double *eps, double *E


    int i, j, a, b;
-    int ia, jb, jb_off;
+    int aa, bb;
+    int nV, nS, nVS;
+    int nBas2, nBas3;
+    int i_A0, i_A1, i_A2;
+    int i_I0, i_I1, i_I2;

-    int ij_off0, ij_off;
+    nV = nBas - nO;
+    nS = nO * nV;
+    nVS = nV * nS;

-    int aa_max = nBas - nO;
-    int ia_max = aa_max * nO;
+    nBas2 = nBas * nBas;
+    nBas3 = nBas2 * nBas;

-    int nBas2 = nBas * nBas;
-    int nBas3 = nBas2 * nBas;
+    aa = blockIdx.x * blockDim.x + threadIdx.x;
+    bb = blockIdx.y * blockDim.y + threadIdx.y;

-    int aa = blockIdx.x * blockDim.x + threadIdx.x;
-    int bb = blockIdx.y * blockDim.y + threadIdx.y;
-
-    while(aa < aa_max) {
+    while(aa < nV) {
        a = aa + nO;

-        ij_off0 = a * nBas2;
+        i_A0 = aa * nS;
+        i_I0 = a * nBas2;

-        while(bb < aa_max) {
+        while(bb < nV) {
            b = bb + nO;

-            ij_off = ij_off0 + b * nBas;
+            i_A1 = i_A0 + bb;
+            i_I1 = i_I0 + b * nBas;

+            i = 0;
            while(i < nO) {
-                ia = i * aa_max + aa;
-                jb_off = ia * ia_max;
- 
-                while(j < nO) {
-                    jb = j * aa_max + bb;

-                    A[jb + jb_off] = 2.0 * ERI[i + j * nBas3 + ij_off];
-                    if(a==b && i==j) {
-                        A[jb + jb_off] += eps[a] - eps[i];
+                i_A2 = i_A1 + i * nVS;
+                i_I2 = i_I1 + i;
+ 
+                j = 0;
+                while(j < nO) {
+
+                    A[i_A2 + j * nV] = 2.0 * ERI[i_I2 + j * nBas3];
+                    if((a==b) && (i==j)) {
+                        A[i_A2 + j * nV] += eps[a] - eps[i];
                    }

                    j ++;
--- a/src/mod/cu_quack_module.f90
+++ b/src/mod/cu_quack_module.f90
@ -8,16 +8,16 @@ module cu_quack_module

  interface

-    subroutine ph_drpa(nO, nBas, eps, ERI, &
+    subroutine ph_drpa(nO, nBas, nS, eps, ERI, &
                       Omega, XpY, XmY) bind(C, name = "ph_drpa")

      import c_int, c_double
-      integer(c_int), intent(in), value :: nO, nBas
+      integer(c_int), intent(in), value :: nO, nBas, nS
      real(c_double), intent(in)        :: eps(nBas)
      real(c_double), intent(in)        :: ERI(nBas,nBas,nBas,nBas)
-      real(c_double), intent(out)       :: Omega(nO*nBas)
-      real(c_double), intent(out)       :: XpY(nO*nBas,nO*nBas)
-      real(c_double), intent(out)       :: XmY(nO*nBas,nO*nBas)
+      real(c_double), intent(out)       :: Omega(nS)
+      real(c_double), intent(out)       :: XpY(nS,nS)
+      real(c_double), intent(out)       :: XmY(nS,nS)

    end subroutine ph_drpa