From e43a56e0420b4af023f6a26b4adbcf1e90e96af2 Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Thu, 28 Nov 2024 16:10:05 +0100 Subject: [PATCH] refact in GPU direct & fixed bug in read dipole integrals --- src/GPU/phRRPA_GPU.f90 | 156 ++++++++++++++++++ src/LR/phLR.f90 | 4 + src/QuAcK/QuAcK.f90 | 1 - src/RPA/RRPA.f90 | 2 + src/RPA/phRRPA.f90 | 18 -- src/cuda/src/ph_drpa_a_sing.cu | 6 +- .../src/{ph_drpa_tda.c => ph_drpa_tda_sing.c} | 51 ++++-- src/make_ninja.py | 2 + src/mod/cu_quack_module.f90 | 50 +++++- src/utils/read_dipole_integrals.f90 | 6 +- 10 files changed, 258 insertions(+), 38 deletions(-) create mode 100644 src/GPU/phRRPA_GPU.f90 rename src/cuda/src/{ph_drpa_tda.c => ph_drpa_tda_sing.c} (58%) diff --git a/src/GPU/phRRPA_GPU.f90 b/src/GPU/phRRPA_GPU.f90 new file mode 100644 index 0000000..a4c81b1 --- /dev/null +++ b/src/GPU/phRRPA_GPU.f90 @@ -0,0 +1,156 @@ +subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF) + + use cu_quack_module + +! Perform a direct random phase approximation calculation + + implicit none + include 'parameters.h' + include 'quadrature.h' + +! Input variables + + logical,intent(in) :: dotest + + logical,intent(in) :: TDA + logical,intent(in) :: doACFDT + logical,intent(in) :: exchange_kernel + logical,intent(in) :: singlet + logical,intent(in) :: triplet + integer,intent(in) :: nBas + integer,intent(in) :: nC + integer,intent(in) :: nO + integer,intent(in) :: nV + integer,intent(in) :: nR + integer,intent(in) :: nS + double precision,intent(in) :: ENuc + double precision,intent(in) :: ERHF + double precision,intent(in) :: eHF(nBas) + double precision,intent(in) :: ERI(nBas,nBas,nBas,nBas) + double precision,intent(in) :: dipole_int(nBas,nBas,ncart) + +! Local variables + + integer :: i + integer :: ispin + logical :: dRPA + double precision :: t1, t2 + double precision :: lambda + double precision,allocatable :: Aph(:,:) + double precision,allocatable :: Bph(:,:) + double precision,allocatable :: Om(:) + double precision,allocatable :: XpY(:,:) + double precision,allocatable :: XmY(:,:) + ! DEBUG + !double precision, allocatable :: XpY_gpu(:,:), XmY_gpu(:,:), Om_gpu(:) + + double precision :: EcRPA(nspin) + +! Hello world + + write(*,*) + write(*,*)'*********************************' + write(*,*)'* Restricted ph-RPA Calculation *' + write(*,*)'*********************************' + write(*,*) + +! TDA + + if(TDA) then + write(*,*) 'Tamm-Dancoff approximation activated!' + write(*,*) + end if + +! Initialization + + dRPA = .true. + EcRPA(:) = 0d0 + lambda = 1d0 + +! Memory allocation + + allocate(Om(nS),XpY(nS,nS),XmY(nS,nS),Aph(nS,nS),Bph(nS,nS)) + +! Singlet manifold + + if(singlet) then + + if(TDA) then + + call wall_time(t1) + call ph_drpa_tda_sing(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om(1), XpY(1,1)) + call wall_time(t2) + print*, 'diag time on GPU (sec):', t2 - t1 + stop + XmY(:,:) = XpY(:,:) + + else + + ! TODO + !call ph_drpa_sing(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om(1), XpY(1,1)) + !XmY(:,:) = XpY(:,:) + + endif + + call print_excitation_energies('phRPA@RHF','singlet',nS,Om) + call phLR_transition_vectors(.true.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY) + + end if + +! Triplet manifold + + if(triplet) then + + ispin = 2 + + call phLR_A(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,eHF,ERI,Aph) + if(.not.TDA) call phLR_B(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,ERI,Bph) + + call phLR(TDA,nS,Aph,Bph,EcRPA(ispin),Om,XpY,XmY) + call print_excitation_energies('phRPA@RHF','triplet',nS,Om) + call phLR_transition_vectors(.false.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY) + + end if + + if(exchange_kernel) then + + EcRPA(1) = 0.5d0*EcRPA(1) + EcRPA(2) = 1.5d0*EcRPA(2) + + end if + + write(*,*) + write(*,*)'-------------------------------------------------------------------------------' + write(*,'(2X,A50,F20.10,A3)') 'Tr@phRPA@RHF correlation energy (singlet) = ',EcRPA(1),' au' + write(*,'(2X,A50,F20.10,A3)') 'Tr@phRPA@RHF correlation energy (triplet) = ',EcRPA(2),' au' + write(*,'(2X,A50,F20.10,A3)') 'Tr@phRPA@RHF correlation energy = ',sum(EcRPA),' au' + write(*,'(2X,A50,F20.10,A3)') 'Tr@phRPA@RHF total energy = ',ENuc + ERHF + sum(EcRPA),' au' + write(*,*)'-------------------------------------------------------------------------------' + write(*,*) + + deallocate(Om,XpY,XmY,Aph,Bph) + +! Compute the correlation energy via the adiabatic connection + + if(doACFDT) then + + call phACFDT(exchange_kernel,dRPA,TDA,singlet,triplet,nBas,nC,nO,nV,nR,nS,ERI,eHF,EcRPA) + + write(*,*) + write(*,*)'-------------------------------------------------------------------------------' + write(*,'(2X,A50,F20.10,A3)') 'AC@phRPA@RHF correlation energy (singlet) = ',EcRPA(1),' au' + write(*,'(2X,A50,F20.10,A3)') 'AC@phRPA@RHF correlation energy (triplet) = ',EcRPA(2),' au' + write(*,'(2X,A50,F20.10,A3)') 'AC@phRPA@RHF correlation energy = ',sum(EcRPA),' au' + write(*,'(2X,A50,F20.10,A3)') 'AC@phRPA@RHF total energy = ',ENuc + ERHF + sum(EcRPA),' au' + write(*,*)'-------------------------------------------------------------------------------' + write(*,*) + + end if + + if(dotest) then + + call dump_test_value('R','phRPA correlation energy',sum(EcRPA)) + + end if + +end subroutine diff --git a/src/LR/phLR.f90 b/src/LR/phLR.f90 index ca49344..f3cb274 100644 --- a/src/LR/phLR.f90 +++ b/src/LR/phLR.f90 @@ -15,6 +15,7 @@ subroutine phLR(TDA,nS,Aph,Bph,EcRPA,Om,XpY,XmY) ! Local variables double precision :: trace_matrix + double precision :: t1, t2 double precision,allocatable :: ApB(:,:) double precision,allocatable :: AmB(:,:) double precision,allocatable :: AmBSq(:,:) @@ -38,7 +39,10 @@ subroutine phLR(TDA,nS,Aph,Bph,EcRPA,Om,XpY,XmY) if(TDA) then XpY(:,:) = Aph(:,:) + !call wall_time(t1) call diagonalize_matrix(nS,XpY,Om) + !call wall_time(t2) + !print*, 'diag time on CPU (sec):', t2 - t1 XpY(:,:) = transpose(XpY(:,:)) XmY(:,:) = XpY(:,:) diff --git a/src/QuAcK/QuAcK.f90 b/src/QuAcK/QuAcK.f90 index 586bfff..e6e5865 100644 --- a/src/QuAcK/QuAcK.f90 +++ b/src/QuAcK/QuAcK.f90 @@ -177,7 +177,6 @@ program QuAcK call read_integrals(working_dir,nBas,S,T,V,Hc,ERI_AO) call read_dipole_integrals(working_dir,nBas,dipole_int_AO) - call wall_time(end_int) t_int = end_int - start_int diff --git a/src/RPA/RRPA.f90 b/src/RPA/RRPA.f90 index ab38932..861a0a2 100644 --- a/src/RPA/RRPA.f90 +++ b/src/RPA/RRPA.f90 @@ -50,6 +50,8 @@ subroutine RRPA(dotest,dophRPA,dophRPAx,docrRPA,doppRPA,TDA,doACFDT,exchange_ker write(*,'(A65,1X,F9.3,A8)') 'Total wall time for RPA = ',t_RPA,' seconds' write(*,*) + !call phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF) + end if !------------------------------------------------------------------------ diff --git a/src/RPA/phRRPA.f90 b/src/RPA/phRRPA.f90 index 99b96ae..4cf004c 100644 --- a/src/RPA/phRRPA.f90 +++ b/src/RPA/phRRPA.f90 @@ -1,7 +1,5 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF) -! use cu_quack_module - ! Perform a direct random phase approximation calculation implicit none @@ -40,8 +38,6 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO, double precision,allocatable :: Om(:) double precision,allocatable :: XpY(:,:) double precision,allocatable :: XmY(:,:) - ! DEBUG - !double precision, allocatable :: XpY_gpu(:,:), XmY_gpu(:,:), Om_gpu(:) double precision :: EcRPA(nspin) @@ -80,20 +76,6 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO, if(.not.TDA) call phLR_B(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,ERI,Bph) call phLR(TDA,nS,Aph,Bph,EcRPA(ispin),Om,XpY,XmY) - - !! DEBUG - !allocate(Om_gpu(nS), XpY_gpu(nS,nS), XmY_gpu(nS,nS)) - !call ph_drpa_tda(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om_gpu(1), XpY_gpu(1,1)) - !do i = 1, nS - ! print *, i, Om(i), Om_gpu(i) - ! if(dabs(Om(i) - Om_gpu(i)) .gt. 1d-13) then - ! print *, 'GPU FAILED!' - ! stop - ! endif - !enddo - !print *, 'GPU DONE!' - !stop - call print_excitation_energies('phRPA@RHF','singlet',nS,Om) call phLR_transition_vectors(.true.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY) diff --git a/src/cuda/src/ph_drpa_a_sing.cu b/src/cuda/src/ph_drpa_a_sing.cu index 917cdfb..27678e0 100644 --- a/src/cuda/src/ph_drpa_a_sing.cu +++ b/src/cuda/src/ph_drpa_a_sing.cu @@ -10,6 +10,8 @@ __global__ void ph_dRPA_A_sing_kernel(int nO, int nV, int nBas, int nS, double * int i_A0, i_A1, i_A2; int i_I0, i_I1, i_I2; + bool a_eq_b; + nVS = nV * nS; nBas2 = nBas * nBas; @@ -27,6 +29,8 @@ __global__ void ph_dRPA_A_sing_kernel(int nO, int nV, int nBas, int nS, double * while(bb < nV) { b = bb + nO; + a_eq_b = a == b; + i_A1 = i_A0 + bb; i_I1 = i_I0 + b * nBas; @@ -40,7 +44,7 @@ __global__ void ph_dRPA_A_sing_kernel(int nO, int nV, int nBas, int nS, double * while(j < nO) { A[i_A2 + j * nV] = 2.0 * ERI[i_I2 + j * nBas3]; - if((a==b) && (i==j)) { + if(a_eq_b && (i==j)) { A[i_A2 + j * nV] += eps[a] - eps[i]; } diff --git a/src/cuda/src/ph_drpa_tda.c b/src/cuda/src/ph_drpa_tda_sing.c similarity index 58% rename from src/cuda/src/ph_drpa_tda.c rename to src/cuda/src/ph_drpa_tda_sing.c index 2d6bccf..7b6e476 100644 --- a/src/cuda/src/ph_drpa_tda.c +++ b/src/cuda/src/ph_drpa_tda_sing.c @@ -9,8 +9,8 @@ #include "utils.h" #include "ph_rpa.h" -void ph_drpa_tda(int nO, int nBas, int nS, double *h_eps, double *h_ERI, - double *h_Omega, double *h_X) { +void ph_drpa_tda_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, + double *h_Omega, double *h_X) { double *d_eps = NULL; double *d_ERI = NULL; @@ -20,23 +20,39 @@ void ph_drpa_tda(int nO, int nBas, int nS, double *h_eps, double *h_ERI, int nBas2 = nBas * nBas; int nBas4 = nBas2 * nBas2; + float elapsedTime; + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + check_Cuda_Errors(cudaMalloc((void**)&d_eps, nBas * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); check_Cuda_Errors(cudaMalloc((void**)&d_ERI, nBas4 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); + cudaEventRecord(start, 0); check_Cuda_Errors(cudaMemcpy(d_eps, h_eps, nBas * sizeof(double), cudaMemcpyHostToDevice), "cudaMemcpy", __FILE__, __LINE__); check_Cuda_Errors(cudaMemcpy(d_ERI, h_ERI, nBas4 * sizeof(double), cudaMemcpyHostToDevice), "cudaMemcpy", __FILE__, __LINE__); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsedTime, start, stop); + printf("Time elapsed on CPU->GPU transfer = %f msec\n", elapsedTime); // construct A double *d_A = NULL; check_Cuda_Errors(cudaMalloc((void**)&d_A, nS * nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); + cudaEventRecord(start, 0); ph_dRPA_A_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_A); check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsedTime, start, stop); + printf("Time elapsed on A kernel = %f msec\n", elapsedTime); // diagonalize A @@ -47,24 +63,35 @@ void ph_drpa_tda(int nO, int nBas, int nS, double *h_eps, double *h_ERI, check_Cuda_Errors(cudaMalloc((void**)&d_Omega, nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); + cudaEventRecord(start, 0); diag_dn_dsyevd(nS, d_info, d_Omega, d_A); check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsedTime, start, stop); + printf("Time elapsed on diagonalization = %f msec\n", elapsedTime); - int info_gpu = 0; - check_Cuda_Errors(cudaMemcpy(&info_gpu, d_info, sizeof(int), cudaMemcpyDeviceToHost), - "cudaMemcpy", __FILE__, __LINE__); - if (info_gpu != 0) { - printf("Error: diag_dn_dsyevd returned error code %d\n", info_gpu); - exit(EXIT_FAILURE); - } - - + //int info_gpu = 0; + cudaEventRecord(start, 0); + //check_Cuda_Errors(cudaMemcpy(&info_gpu, d_info, sizeof(int), cudaMemcpyDeviceToHost), + // "cudaMemcpy", __FILE__, __LINE__); + //if (info_gpu != 0) { + // printf("Error: diag_dn_dsyevd returned error code %d\n", info_gpu); + // exit(EXIT_FAILURE); + //} check_Cuda_Errors(cudaMemcpy(h_X, d_A, nS * nS * sizeof(double), cudaMemcpyDeviceToHost), "cudaMemcpy", __FILE__, __LINE__); - check_Cuda_Errors(cudaMemcpy(h_Omega, d_Omega, nS * sizeof(double), cudaMemcpyDeviceToHost), "cudaMemcpy", __FILE__, __LINE__); + cudaEventRecord(start, 0); + diag_dn_dsyevd(nS, d_info, d_Omega, d_A); + check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsedTime, start, stop); + printf("Time elapsed on GPU -> CPU transfer = %f msec\n", elapsedTime); + check_Cuda_Errors(cudaFree(d_info), "cudaFree", __FILE__, __LINE__); check_Cuda_Errors(cudaFree(d_eps), "cudaFree", __FILE__, __LINE__); check_Cuda_Errors(cudaFree(d_ERI), "cudaFree", __FILE__, __LINE__); diff --git a/src/make_ninja.py b/src/make_ninja.py index fc8e819..712c9b9 100755 --- a/src/make_ninja.py +++ b/src/make_ninja.py @@ -197,6 +197,8 @@ lib_dirs = list(filter(lambda x: os.path.isdir(x) and \ x not in exe_dirs, os.listdir("."))) i = lib_dirs.index("mod") lib_dirs[0], lib_dirs[i] = lib_dirs[i], lib_dirs[0] +if not USE_GPU: + lib_dirs.remove("GPU") def create_ninja_in_libdir(directory): def write_rule(f, source_file, replace): diff --git a/src/mod/cu_quack_module.f90 b/src/mod/cu_quack_module.f90 index cf974bb..978fc0c 100644 --- a/src/mod/cu_quack_module.f90 +++ b/src/mod/cu_quack_module.f90 @@ -8,8 +8,8 @@ module cu_quack_module interface - subroutine ph_drpa_tda(nO, nBas, nS, eps, ERI, & - Omega, X) bind(C, name = "ph_drpa_tda") + subroutine ph_drpa_tda_sing(nO, nBas, nS, eps, ERI, & + Omega, X) bind(C, name = "ph_drpa_tda_sing") import c_int, c_double integer(c_int), intent(in), value :: nO, nBas, nS @@ -18,7 +18,51 @@ module cu_quack_module real(c_double), intent(out) :: Omega(nS) real(c_double), intent(out) :: X(nS,nS) - end subroutine ph_drpa_tda + end subroutine ph_drpa_tda_sing + + ! --- + + subroutine ph_drpa_tda_trip(nO, nBas, nS, eps, ERI, & + Omega, X) bind(C, name = "ph_drpa_tda_trip") + + import c_int, c_double + integer(c_int), intent(in), value :: nO, nBas, nS + real(c_double), intent(in) :: eps(nBas) + real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas) + real(c_double), intent(out) :: Omega(nS) + real(c_double), intent(out) :: X(nS,nS) + + end subroutine ph_drpa_tda_trip + + ! --- + + subroutine ph_drpa_sing(nO, nBas, nS, eps, ERI, & + Omega, X) bind(C, name = "ph_drpa_sing") + + import c_int, c_double + integer(c_int), intent(in), value :: nO, nBas, nS + real(c_double), intent(in) :: eps(nBas) + real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas) + real(c_double), intent(out) :: Omega(nS) + real(c_double), intent(out) :: X(nS,nS) + + end subroutine ph_drpa_sing + + ! --- + + subroutine ph_drpa_trip(nO, nBas, nS, eps, ERI, & + Omega, X) bind(C, name = "ph_drpa_trip") + + import c_int, c_double + integer(c_int), intent(in), value :: nO, nBas, nS + real(c_double), intent(in) :: eps(nBas) + real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas) + real(c_double), intent(out) :: Omega(nS) + real(c_double), intent(out) :: X(nS,nS) + + end subroutine ph_drpa_trip + + ! --- end interface diff --git a/src/utils/read_dipole_integrals.f90 b/src/utils/read_dipole_integrals.f90 index 15af2cb..8f56354 100644 --- a/src/utils/read_dipole_integrals.f90 +++ b/src/utils/read_dipole_integrals.f90 @@ -39,7 +39,7 @@ subroutine read_dipole_integrals(working_dir,nBas,R) else do - read(21,*,iostat=ios) mu,nu,Dip + read(21, '(I5, I5, E25.17)', iostat=ios) mu, nu, Dip if(ios /= 0) exit R(mu,nu,1) = Dip R(nu,mu,1) = Dip @@ -62,7 +62,7 @@ subroutine read_dipole_integrals(working_dir,nBas,R) else do - read(22,*,iostat=ios) mu,nu,Dip + read(22, '(I5, I5, E25.17)', iostat=ios) mu, nu, Dip if(ios /= 0) exit R(mu,nu,2) = Dip R(nu,mu,2) = Dip @@ -85,7 +85,7 @@ subroutine read_dipole_integrals(working_dir,nBas,R) else do - read(23,*,iostat=ios) mu,nu,Dip + read(23, '(I5, I5, E25.17)', iostat=ios) mu, nu, Dip if(ios /= 0) exit R(mu,nu,3) = Dip R(nu,mu,3) = Dip