
Refactored the direct-RPA GPU code & fixed a bug in reading the dipole integrals

Abdallah Ammar 2024-11-28 16:10:05 +01:00
parent 6ff3fc2905
commit e43a56e042
10 changed files with 258 additions and 38 deletions

src/GPU/phRRPA_GPU.f90 (new file, 156 lines)

@@ -0,0 +1,156 @@
subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF)
use cu_quack_module
! Perform a direct random phase approximation calculation
implicit none
include 'parameters.h'
include 'quadrature.h'
! Input variables
logical,intent(in) :: dotest
logical,intent(in) :: TDA
logical,intent(in) :: doACFDT
logical,intent(in) :: exchange_kernel
logical,intent(in) :: singlet
logical,intent(in) :: triplet
integer,intent(in) :: nBas
integer,intent(in) :: nC
integer,intent(in) :: nO
integer,intent(in) :: nV
integer,intent(in) :: nR
integer,intent(in) :: nS
double precision,intent(in) :: ENuc
double precision,intent(in) :: ERHF
double precision,intent(in) :: eHF(nBas)
double precision,intent(in) :: ERI(nBas,nBas,nBas,nBas)
double precision,intent(in) :: dipole_int(nBas,nBas,ncart)
! Local variables
integer :: i
integer :: ispin
logical :: dRPA
double precision :: t1, t2
double precision :: lambda
double precision,allocatable :: Aph(:,:)
double precision,allocatable :: Bph(:,:)
double precision,allocatable :: Om(:)
double precision,allocatable :: XpY(:,:)
double precision,allocatable :: XmY(:,:)
! DEBUG
!double precision, allocatable :: XpY_gpu(:,:), XmY_gpu(:,:), Om_gpu(:)
double precision :: EcRPA(nspin)
! Hello world
write(*,*)
write(*,*)'*********************************'
write(*,*)'* Restricted ph-RPA Calculation *'
write(*,*)'*********************************'
write(*,*)
! TDA
if(TDA) then
write(*,*) 'Tamm-Dancoff approximation activated!'
write(*,*)
end if
! Initialization
dRPA = .true.
EcRPA(:) = 0d0
lambda = 1d0
! Memory allocation
allocate(Om(nS),XpY(nS,nS),XmY(nS,nS),Aph(nS,nS),Bph(nS,nS))
! Singlet manifold
if(singlet) then
if(TDA) then
call wall_time(t1)
call ph_drpa_tda_sing(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om(1), XpY(1,1))
call wall_time(t2)
print*, 'diag time on GPU (sec):', t2 - t1
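! NOTE: debug-only hard stop after timing the GPU path; while it is active,
! XmY below is never set and the singlet excitation analysis is skipped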
stop
XmY(:,:) = XpY(:,:)
else
! TODO
!call ph_drpa_sing(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om(1), XpY(1,1))
!XmY(:,:) = XpY(:,:)
endif
call print_excitation_energies('phRPA@RHF','singlet',nS,Om)
call phLR_transition_vectors(.true.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY)
end if
! Triplet manifold
if(triplet) then
ispin = 2
call phLR_A(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,eHF,ERI,Aph)
if(.not.TDA) call phLR_B(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,ERI,Bph)
call phLR(TDA,nS,Aph,Bph,EcRPA(ispin),Om,XpY,XmY)
call print_excitation_energies('phRPA@RHF','triplet',nS,Om)
call phLR_transition_vectors(.false.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY)
end if
if(exchange_kernel) then
EcRPA(1) = 0.5d0*EcRPA(1)
EcRPA(2) = 1.5d0*EcRPA(2)
end if
write(*,*)
write(*,*)'-------------------------------------------------------------------------------'
write(*,'(2X,A50,F20.10,A3)') 'Tr@phRPA@RHF correlation energy (singlet) = ',EcRPA(1),' au'
write(*,'(2X,A50,F20.10,A3)') 'Tr@phRPA@RHF correlation energy (triplet) = ',EcRPA(2),' au'
write(*,'(2X,A50,F20.10,A3)') 'Tr@phRPA@RHF correlation energy = ',sum(EcRPA),' au'
write(*,'(2X,A50,F20.10,A3)') 'Tr@phRPA@RHF total energy = ',ENuc + ERHF + sum(EcRPA),' au'
write(*,*)'-------------------------------------------------------------------------------'
write(*,*)
deallocate(Om,XpY,XmY,Aph,Bph)
! Compute the correlation energy via the adiabatic connection
if(doACFDT) then
call phACFDT(exchange_kernel,dRPA,TDA,singlet,triplet,nBas,nC,nO,nV,nR,nS,ERI,eHF,EcRPA)
write(*,*)
write(*,*)'-------------------------------------------------------------------------------'
write(*,'(2X,A50,F20.10,A3)') 'AC@phRPA@RHF correlation energy (singlet) = ',EcRPA(1),' au'
write(*,'(2X,A50,F20.10,A3)') 'AC@phRPA@RHF correlation energy (triplet) = ',EcRPA(2),' au'
write(*,'(2X,A50,F20.10,A3)') 'AC@phRPA@RHF correlation energy = ',sum(EcRPA),' au'
write(*,'(2X,A50,F20.10,A3)') 'AC@phRPA@RHF total energy = ',ENuc + ERHF + sum(EcRPA),' au'
write(*,*)'-------------------------------------------------------------------------------'
write(*,*)
end if
if(dotest) then
call dump_test_value('R','phRPA correlation energy',sum(EcRPA))
end if
end subroutine

@@ -15,6 +15,7 @@ subroutine phLR(TDA,nS,Aph,Bph,EcRPA,Om,XpY,XmY)
! Local variables
double precision :: trace_matrix
double precision :: t1, t2
double precision,allocatable :: ApB(:,:)
double precision,allocatable :: AmB(:,:)
double precision,allocatable :: AmBSq(:,:)
@@ -38,7 +39,10 @@ subroutine phLR(TDA,nS,Aph,Bph,EcRPA,Om,XpY,XmY)
if(TDA) then
XpY(:,:) = Aph(:,:)
!call wall_time(t1)
call diagonalize_matrix(nS,XpY,Om)
!call wall_time(t2)
!print*, 'diag time on CPU (sec):', t2 - t1
XpY(:,:) = transpose(XpY(:,:))
XmY(:,:) = XpY(:,:)

@@ -177,7 +177,6 @@ program QuAcK
call read_integrals(working_dir,nBas,S,T,V,Hc,ERI_AO)
call read_dipole_integrals(working_dir,nBas,dipole_int_AO)
call wall_time(end_int)
t_int = end_int - start_int

@@ -50,6 +50,8 @@ subroutine RRPA(dotest,dophRPA,dophRPAx,docrRPA,doppRPA,TDA,doACFDT,exchange_ker
write(*,'(A65,1X,F9.3,A8)') 'Total wall time for RPA = ',t_RPA,' seconds'
write(*,*)
!call phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF)
end if
!------------------------------------------------------------------------

@@ -1,7 +1,5 @@
subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF)
! use cu_quack_module
! Perform a direct random phase approximation calculation
implicit none
@@ -40,8 +38,6 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,
double precision,allocatable :: Om(:)
double precision,allocatable :: XpY(:,:)
double precision,allocatable :: XmY(:,:)
! DEBUG
!double precision, allocatable :: XpY_gpu(:,:), XmY_gpu(:,:), Om_gpu(:)
double precision :: EcRPA(nspin)
@@ -80,20 +76,6 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,
if(.not.TDA) call phLR_B(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,ERI,Bph)
call phLR(TDA,nS,Aph,Bph,EcRPA(ispin),Om,XpY,XmY)
!! DEBUG
!allocate(Om_gpu(nS), XpY_gpu(nS,nS), XmY_gpu(nS,nS))
!call ph_drpa_tda(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om_gpu(1), XpY_gpu(1,1))
!do i = 1, nS
! print *, i, Om(i), Om_gpu(i)
! if(dabs(Om(i) - Om_gpu(i)) .gt. 1d-13) then
! print *, 'GPU FAILED!'
! stop
! endif
!enddo
!print *, 'GPU DONE!'
!stop
call print_excitation_energies('phRPA@RHF','singlet',nS,Om)
call phLR_transition_vectors(.true.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY)

@@ -10,6 +10,8 @@ __global__ void ph_dRPA_A_sing_kernel(int nO, int nV, int nBas, int nS, double *
int i_A0, i_A1, i_A2;
int i_I0, i_I1, i_I2;
bool a_eq_b;
nVS = nV * nS;
nBas2 = nBas * nBas;
@@ -27,6 +29,8 @@ __global__ void ph_dRPA_A_sing_kernel(int nO, int nV, int nBas, int nS, double *
while(bb < nV) {
b = bb + nO;
a_eq_b = a == b;
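// cache the a == b test once per (a,b) pair so the inner i/j loop below does not re-evaluate it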
i_A1 = i_A0 + bb;
i_I1 = i_I0 + b * nBas;
@@ -40,7 +44,7 @@ __global__ void ph_dRPA_A_sing_kernel(int nO, int nV, int nBas, int nS, double *
while(j < nO) {
A[i_A2 + j * nV] = 2.0 * ERI[i_I2 + j * nBas3];
if((a==b) && (i==j)) {
if(a_eq_b && (i==j)) {
A[i_A2 + j * nV] += eps[a] - eps[i];
}

@@ -9,8 +9,8 @@
#include "utils.h"
#include "ph_rpa.h"
void ph_drpa_tda(int nO, int nBas, int nS, double *h_eps, double *h_ERI,
double *h_Omega, double *h_X) {
void ph_drpa_tda_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI,
double *h_Omega, double *h_X) {
double *d_eps = NULL;
double *d_ERI = NULL;
@@ -20,23 +20,39 @@ void ph_drpa_tda(int nO, int nBas, int nS, double *h_eps, double *h_ERI,
int nBas2 = nBas * nBas;
int nBas4 = nBas2 * nBas2;
float elapsedTime;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
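// CUDA events time the host->device copies, the A-matrix build kernel,
// the diagonalization, and the device->host copy back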
check_Cuda_Errors(cudaMalloc((void**)&d_eps, nBas * sizeof(double)),
"cudaMalloc", __FILE__, __LINE__);
check_Cuda_Errors(cudaMalloc((void**)&d_ERI, nBas4 * sizeof(double)),
"cudaMalloc", __FILE__, __LINE__);
cudaEventRecord(start, 0);
check_Cuda_Errors(cudaMemcpy(d_eps, h_eps, nBas * sizeof(double), cudaMemcpyHostToDevice),
"cudaMemcpy", __FILE__, __LINE__);
check_Cuda_Errors(cudaMemcpy(d_ERI, h_ERI, nBas4 * sizeof(double), cudaMemcpyHostToDevice),
"cudaMemcpy", __FILE__, __LINE__);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
printf("Time elapsed on CPU->GPU transfer = %f msec\n", elapsedTime);
// construct A
double *d_A = NULL;
check_Cuda_Errors(cudaMalloc((void**)&d_A, nS * nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__);
cudaEventRecord(start, 0);
ph_dRPA_A_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_A);
check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
printf("Time elapsed on A kernel = %f msec\n", elapsedTime);
// diagonalize A
@@ -47,24 +63,35 @@ void ph_drpa_tda(int nO, int nBas, int nS, double *h_eps, double *h_ERI,
check_Cuda_Errors(cudaMalloc((void**)&d_Omega, nS * sizeof(double)),
"cudaMalloc", __FILE__, __LINE__);
cudaEventRecord(start, 0);
diag_dn_dsyevd(nS, d_info, d_Omega, d_A);
check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
printf("Time elapsed on diagonalization = %f msec\n", elapsedTime);
int info_gpu = 0;
check_Cuda_Errors(cudaMemcpy(&info_gpu, d_info, sizeof(int), cudaMemcpyDeviceToHost),
"cudaMemcpy", __FILE__, __LINE__);
if (info_gpu != 0) {
printf("Error: diag_dn_dsyevd returned error code %d\n", info_gpu);
exit(EXIT_FAILURE);
}
//int info_gpu = 0;
cudaEventRecord(start, 0);
//check_Cuda_Errors(cudaMemcpy(&info_gpu, d_info, sizeof(int), cudaMemcpyDeviceToHost),
// "cudaMemcpy", __FILE__, __LINE__);
//if (info_gpu != 0) {
// printf("Error: diag_dn_dsyevd returned error code %d\n", info_gpu);
// exit(EXIT_FAILURE);
//}
check_Cuda_Errors(cudaMemcpy(h_X, d_A, nS * nS * sizeof(double), cudaMemcpyDeviceToHost),
"cudaMemcpy", __FILE__, __LINE__);
check_Cuda_Errors(cudaMemcpy(h_Omega, d_Omega, nS * sizeof(double), cudaMemcpyDeviceToHost),
"cudaMemcpy", __FILE__, __LINE__);
cudaEventRecord(start, 0);
diag_dn_dsyevd(nS, d_info, d_Omega, d_A);
check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
printf("Time elapsed on GPU -> CPU transfer = %f msec\n", elapsedTime);
check_Cuda_Errors(cudaFree(d_info), "cudaFree", __FILE__, __LINE__);
check_Cuda_Errors(cudaFree(d_eps), "cudaFree", __FILE__, __LINE__);
check_Cuda_Errors(cudaFree(d_ERI), "cudaFree", __FILE__, __LINE__);

@@ -197,6 +197,8 @@ lib_dirs = list(filter(lambda x: os.path.isdir(x) and \
x not in exe_dirs, os.listdir(".")))
i = lib_dirs.index("mod")
lib_dirs[0], lib_dirs[i] = lib_dirs[i], lib_dirs[0]
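# when building without CUDA support, drop the GPU sources from the library list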
if not USE_GPU:
lib_dirs.remove("GPU")
def create_ninja_in_libdir(directory):
def write_rule(f, source_file, replace):

@@ -8,8 +8,8 @@ module cu_quack_module
interface
subroutine ph_drpa_tda(nO, nBas, nS, eps, ERI, &
Omega, X) bind(C, name = "ph_drpa_tda")
subroutine ph_drpa_tda_sing(nO, nBas, nS, eps, ERI, &
Omega, X) bind(C, name = "ph_drpa_tda_sing")
import c_int, c_double
integer(c_int), intent(in), value :: nO, nBas, nS
@@ -18,7 +18,51 @@ module cu_quack_module
real(c_double), intent(out) :: Omega(nS)
real(c_double), intent(out) :: X(nS,nS)
end subroutine ph_drpa_tda
end subroutine ph_drpa_tda_sing
! ---
subroutine ph_drpa_tda_trip(nO, nBas, nS, eps, ERI, &
Omega, X) bind(C, name = "ph_drpa_tda_trip")
import c_int, c_double
integer(c_int), intent(in), value :: nO, nBas, nS
real(c_double), intent(in) :: eps(nBas)
real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas)
real(c_double), intent(out) :: Omega(nS)
real(c_double), intent(out) :: X(nS,nS)
end subroutine ph_drpa_tda_trip
! ---
subroutine ph_drpa_sing(nO, nBas, nS, eps, ERI, &
Omega, X) bind(C, name = "ph_drpa_sing")
import c_int, c_double
integer(c_int), intent(in), value :: nO, nBas, nS
real(c_double), intent(in) :: eps(nBas)
real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas)
real(c_double), intent(out) :: Omega(nS)
real(c_double), intent(out) :: X(nS,nS)
end subroutine ph_drpa_sing
! ---
subroutine ph_drpa_trip(nO, nBas, nS, eps, ERI, &
Omega, X) bind(C, name = "ph_drpa_trip")
import c_int, c_double
integer(c_int), intent(in), value :: nO, nBas, nS
real(c_double), intent(in) :: eps(nBas)
real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas)
real(c_double), intent(out) :: Omega(nS)
real(c_double), intent(out) :: X(nS,nS)
end subroutine ph_drpa_trip
! ---
end interface

@@ -39,7 +39,7 @@ subroutine read_dipole_integrals(working_dir,nBas,R)
else
do
read(21,*,iostat=ios) mu,nu,Dip
read(21, '(I5, I5, E25.17)', iostat=ios) mu, nu, Dip
if(ios /= 0) exit
R(mu,nu,1) = Dip
R(nu,mu,1) = Dip
@@ -62,7 +62,7 @@ subroutine read_dipole_integrals(working_dir,nBas,R)
else
do
read(22,*,iostat=ios) mu,nu,Dip
read(22, '(I5, I5, E25.17)', iostat=ios) mu, nu, Dip
if(ios /= 0) exit
R(mu,nu,2) = Dip
R(nu,mu,2) = Dip
@@ -85,7 +85,7 @@ subroutine read_dipole_integrals(working_dir,nBas,R)
else
do
read(23,*,iostat=ios) mu,nu,Dip
read(23, '(I5, I5, E25.17)', iostat=ios) mu, nu, Dip
if(ios /= 0) exit
R(mu,nu,3) = Dip
R(nu,mu,3) = Dip
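
For reference, a minimal standalone sketch of the fixed-width read introduced above. Only the '(I5, I5, E25.17)' record layout comes from the diff; the file name, unit number, and basis-set size used here are illustrative assumptions.

program read_dipole_component
  implicit none
  integer            :: mu, nu, ios
  double precision   :: Dip
  integer, parameter :: nBas = 10          ! assumed basis size for this sketch
  double precision   :: R(nBas,nBas)

  R(:,:) = 0d0
  open(unit=21, file='x.dat', status='old') ! assumed file name
  do
    ! each record: two 5-column integer indices followed by a 25-column real in exponent form
    read(21, '(I5, I5, E25.17)', iostat=ios) mu, nu, Dip
    if(ios /= 0) exit
    R(mu,nu) = Dip
    R(nu,mu) = Dip                          ! the dipole matrices are symmetric
  end do
  close(21)
end program read_dipole_component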