4
1
mirror of https://github.com/pfloos/quack synced 2024-12-22 04:14:26 +01:00

refact for GPU

This commit is contained in:
AbdAmmar 2024-11-29 03:07:18 +01:00
parent 1a091a0707
commit 1235823334
13 changed files with 190 additions and 107 deletions

2
input/hardware Normal file
View File

@ -0,0 +1,2 @@
# if True (T), use GPU
F

View File

@ -0,0 +1,53 @@
! Fortran-side declarations for routines of the external C/CUDA library.
! The module only declares explicit interfaces; it contains no executable code.
module cu_quack_module
use, intrinsic :: iso_c_binding
implicit none
! NOTE(review): the commented-out block below is an earlier variant that chose
! between a bind(C) interface and a plain Fortran interface depending on the
! USE_GPU preprocessor macro. It is inactive; only the bind(C) interface further
! down is compiled.
!#ifdef USE_GPU
! interface
! subroutine ph_drpa_tda_sing(nO, nBas, nS, eps, ERI, &
! Omega, X) bind(C, name = "ph_drpa_tda_sing")
!
! import c_int, c_double
! integer(c_int), intent(in), value :: nO, nBas, nS
! real(c_double), intent(in) :: eps(nBas)
! real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas)
! real(c_double), intent(out) :: Omega(nS)
! real(c_double), intent(out) :: X(nS,nS)
!
! end subroutine ph_drpa_tda_sing
! end interface
!#else
! interface
! subroutine ph_drpa_tda_sing(nO, nBas, nS, eps, ERI, Omega, X)
! integer, intent(in) :: nO, nBas, nS
! double precision, intent(in) :: eps(nBas)
! double precision, intent(in) :: ERI(nBas,nBas,nBas,nBas)
! double precision, intent(out) :: Omega(nS)
! double precision, intent(out) :: X(nS,nS)
! end subroutine ph_drpa_tda_sing
! end interface
!#endif
interface
! Explicit interface to the C symbol "ph_drpa_tda_sing".
!   nO, nBas, nS : integer sizes, passed by value (C int)
!   eps          : input array of length nBas
!   ERI          : input rank-4 array, nBas in every dimension
!   Omega        : output array of length nS
!   X            : output nS x nS array
! NOTE(review): judging by the name, this presumably solves the singlet
! ph-dRPA problem in the TDA on the GPU -- confirm against the CUDA source.
subroutine ph_drpa_tda_sing(nO, nBas, nS, eps, ERI, &
Omega, X) bind(C, name = "ph_drpa_tda_sing")
import c_int, c_double
integer(c_int), intent(in), value :: nO, nBas, nS
real(c_double), intent(in) :: eps(nBas)
real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas)
real(c_double), intent(out) :: Omega(nS)
real(c_double), intent(out) :: X(nS,nS)
end subroutine ph_drpa_tda_sing
end interface
! ---
end module cu_quack_module

View File

@ -39,10 +39,7 @@ subroutine phLR(TDA,nS,Aph,Bph,EcRPA,Om,XpY,XmY)
if(TDA) then
XpY(:,:) = Aph(:,:)
!call wall_time(t1)
call diagonalize_matrix(nS,XpY,Om)
!call wall_time(t2)
!print*, 'diag time on CPU (sec):', t2 - t1
XpY(:,:) = transpose(XpY(:,:))
XmY(:,:) = XpY(:,:)

View File

@ -44,6 +44,8 @@ program QuAcK
logical :: reg_MP
logical :: use_gpu
integer :: maxSCF_CC,max_diis_CC
double precision :: thresh_CC
@ -134,6 +136,12 @@ program QuAcK
doACFDT,exchange_kernel,doXBS, &
dophBSE,dophBSE2,doppBSE,dBSE,dTDA)
!------------------!
! Hardware !
!------------------!
call read_hardware(working_dir,use_gpu)
!------------------------------------!
! Read input information !
!------------------------------------!
@ -218,7 +226,7 @@ program QuAcK
!-------------------------!
if(doRQuAcK) &
call RQuAcK(doRtest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT, &
call RQuAcK(use_gpu,doRtest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT, &
dodrCCD,dorCCD,docrCCD,dolCCD,doCIS,doCIS_D,doCID,doCISD,doFCI,dophRPA,dophRPAx,docrRPA,doppRPA, &
doG0F2,doevGF2,doqsGF2,doufG0F02,doG0F3,doevGF3,doG0W0,doevGW,doqsGW,doufG0W0,doufGW, &
doG0T0pp,doevGTpp,doqsGTpp,doufG0T0pp,doG0T0eh,doevGTeh,doqsGTeh, &

View File

@ -1,4 +1,4 @@
subroutine RQuAcK(dotest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT, &
subroutine RQuAcK(use_gpu,dotest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT, &
dodrCCD,dorCCD,docrCCD,dolCCD,doCIS,doCIS_D,doCID,doCISD,doFCI,dophRPA,dophRPAx,docrRPA,doppRPA, &
doG0F2,doevGF2,doqsGF2,doufG0F02,doG0F3,doevGF3,doG0W0,doevGW,doqsGW,doufG0W0,doufGW, &
doG0T0pp,doevGTpp,doqsGTpp,doufG0T0pp,doG0T0eh,doevGTeh,doqsGTeh, &
@ -14,6 +14,8 @@ subroutine RQuAcK(dotest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,d
implicit none
include 'parameters.h'
logical,intent(in) :: use_gpu
logical,intent(in) :: dotest
logical,intent(in) :: doRHF,doROHF
@ -274,7 +276,7 @@ subroutine RQuAcK(dotest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,d
if(doRPA) then
call wall_time(start_RPA)
call RRPA(dotest,dophRPA,dophRPAx,docrRPA,doppRPA,TDA,doACFDT,exchange_kernel,singlet,triplet, &
call RRPA(use_gpu,dotest,dophRPA,dophRPAx,docrRPA,doppRPA,TDA,doACFDT,exchange_kernel,singlet,triplet, &
nOrb,nC,nO,nV,nR,nS,ENuc,ERHF,ERI_MO,dipole_int_MO,eHF)
call wall_time(end_RPA)

View File

@ -0,0 +1,45 @@
subroutine read_hardware(working_dir,use_gpu)

! Read the hardware selection (CPU or GPU) from <working_dir>/input/hardware
!
! The file is expected to contain one header line followed by a line holding
! the flag: 'T' requests the GPU, anything else selects the CPU.
! A missing, unreadable or truncated file also selects the CPU.

implicit none

! Input variables

character(len=256),intent(in) :: working_dir

! Output variables

logical,intent(out) :: use_gpu

! Local variables

character(len=1) :: ans
integer :: iunit
integer :: ios

! Default to the CPU; only an explicit 'T' in the file switches to the GPU

use_gpu = .false.

! Open file with hardware specification
! newunit avoids clashing with unit numbers owned by other parts of the code,
! and building the path in place avoids truncating it to a fixed length

open(newunit=iunit, file=trim(working_dir)//'/input/hardware', &
     status='old', action='read', iostat=ios)

! Nothing was opened, so there is nothing to read or close

if(ios /= 0) return

! Skip the header line, then read the flag;
! iostat keeps a short or malformed file from aborting the program

read(iunit,*,iostat=ios)
if(ios == 0) read(iunit,*,iostat=ios) ans

if(ios == 0) use_gpu = (ans == 'T')

! Close file with hardware specification

close(unit=iunit)

end subroutine read_hardware

View File

@ -1,4 +1,4 @@
subroutine RRPA(dotest,dophRPA,dophRPAx,docrRPA,doppRPA,TDA,doACFDT,exchange_kernel,singlet,triplet, &
subroutine RRPA(use_gpu,dotest,dophRPA,dophRPAx,docrRPA,doppRPA,TDA,doACFDT,exchange_kernel,singlet,triplet, &
nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF)
! Random-phase approximation module
@ -8,6 +8,8 @@ subroutine RRPA(dotest,dophRPA,dophRPAx,docrRPA,doppRPA,TDA,doACFDT,exchange_ker
! Input variables
logical,intent(in) :: use_gpu
logical,intent(in) :: dotest
logical,intent(in) :: dophRPA
@ -43,15 +45,17 @@ subroutine RRPA(dotest,dophRPA,dophRPAx,docrRPA,doppRPA,TDA,doACFDT,exchange_ker
if(dophRPA) then
call wall_time(start_RPA)
call phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF)
if (use_gpu) then
call phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF)
else
call phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF)
endif
call wall_time(end_RPA)
t_RPA = end_RPA - start_RPA
write(*,'(A65,1X,F9.3,A8)') 'Total wall time for RPA = ',t_RPA,' seconds'
write(*,*)
!call phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF)
end if
!------------------------------------------------------------------------

View File

@ -83,6 +83,7 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,
call phLR(TDA,nS,Aph,Bph,EcRPA(ispin),Om,XpY,XmY)
!call wall_time(t2)
!print *, "wall time diag A on CPU (sec) = ", t2 - t1
!stop
call print_excitation_energies('phRPA@RHF','singlet',nS,Om)
call phLR_transition_vectors(.true.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY)

View File

@ -1,3 +1,5 @@
#ifdef USE_GPU
subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF)
use cu_quack_module
@ -69,7 +71,8 @@ subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC
! Memory allocation
allocate(Om(nS),XpY(nS,nS),XmY(nS,nS),Aph(nS,nS),Bph(nS,nS))
allocate(Om(nS),XpY(nS,nS),XmY(nS,nS))
!allocate(Aph(nS,nS),Bph(nS,nS))
! Singlet manifold
@ -77,6 +80,7 @@ subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC
if(TDA) then
print*, 'start diag on GPU:'
call wall_time(t1)
call ph_drpa_tda_sing(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om(1), XpY(1,1))
call wall_time(t2)
@ -154,3 +158,34 @@ subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC
end if
end subroutine
#else
subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF)

! Stub compiled when the USE_GPU preprocessor macro is not defined.
!
! It mirrors the argument list of the GPU implementation so that callers
! still link, but it performs no calculation: it prints a hint and
! terminates the program with an error status.

implicit none
include 'parameters.h'
include 'quadrature.h'

! Input variables (all unused; declared only to match the GPU version)

logical,intent(in) :: dotest
logical,intent(in) :: TDA
logical,intent(in) :: doACFDT
logical,intent(in) :: exchange_kernel
logical,intent(in) :: singlet
logical,intent(in) :: triplet
integer,intent(in) :: nBas
integer,intent(in) :: nC
integer,intent(in) :: nO
integer,intent(in) :: nV
integer,intent(in) :: nR
integer,intent(in) :: nS
double precision,intent(in) :: ENuc
double precision,intent(in) :: ERHF
double precision,intent(in) :: eHF(nBas)
double precision,intent(in) :: ERI(nBas,nBas,nBas,nBas)
double precision,intent(in) :: dipole_int(nBas,nBas,ncart)

print*, "compile with USE_GPU FLAG!"

! Requesting the GPU path in a CPU-only build is a fatal configuration error:
! use error termination so that the failure is visible to calling scripts

error stop

end subroutine phRRPA_GPU
#endif

View File

@ -75,6 +75,8 @@ extern "C" void ph_dRPA_A_sing(int nO, int nV, int nBas, int nS, double *eps, do
dim3 dimGrid(nBlocks, nBlocks, 1);
dim3 dimBlock(sBlocks, sBlocks, 1);
//dim3 dimGrid(nBlocks, 1, 1);
//dim3 dimBlock(sBlocks, 1, 1);
printf("lunching ph_dRPA_A_sing_kernel with %dx%d blocks and %dx%d threads/block\n",
nBlocks, nBlocks, sBlocks, sBlocks);

View File

@ -17,14 +17,16 @@ void ph_drpa_tda_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI,
int nV = nBas - nO;
int nBas2 = nBas * nBas;
int nBas4 = nBas2 * nBas2;
long long nBas_long = (long long) nBas;
long long nBas4 = nBas_long * nBas_long * nBas_long * nBas_long;
float elapsedTime;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
//printf("nO = %d, nBas = %d, nS = %d\n", nO, nBas, nS);
//printf("nBas4 = %lld\n", nBas4);
check_Cuda_Errors(cudaMalloc((void**)&d_eps, nBas * sizeof(double)),
@ -32,6 +34,7 @@ void ph_drpa_tda_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI,
check_Cuda_Errors(cudaMalloc((void**)&d_ERI, nBas4 * sizeof(double)),
"cudaMalloc", __FILE__, __LINE__);
printf("CPU->GPU transfer..\n");
cudaEventRecord(start, 0);
check_Cuda_Errors(cudaMemcpy(d_eps, h_eps, nBas * sizeof(double), cudaMemcpyHostToDevice),
"cudaMemcpy", __FILE__, __LINE__);
@ -55,6 +58,10 @@ void ph_drpa_tda_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI,
printf("Time elapsed on A kernel = %f msec\n", elapsedTime);
check_Cuda_Errors(cudaFree(d_eps), "cudaFree", __FILE__, __LINE__);
check_Cuda_Errors(cudaFree(d_ERI), "cudaFree", __FILE__, __LINE__);
// diagonalize A
int *d_info = NULL;
double *d_Omega = NULL;
@ -89,8 +96,6 @@ void ph_drpa_tda_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI,
printf("Time elapsed on GPU -> CPU transfer = %f msec\n", elapsedTime);
check_Cuda_Errors(cudaFree(d_info), "cudaFree", __FILE__, __LINE__);
check_Cuda_Errors(cudaFree(d_eps), "cudaFree", __FILE__, __LINE__);
check_Cuda_Errors(cudaFree(d_ERI), "cudaFree", __FILE__, __LINE__);
check_Cuda_Errors(cudaFree(d_A), "cudaFree", __FILE__, __LINE__);
check_Cuda_Errors(cudaFree(d_Omega), "cudaFree", __FILE__, __LINE__);

View File

@ -36,7 +36,7 @@ def check_compiler_exists(compiler):
compile_gfortran_mac = """
FC = gfortran
AR = libtool -static -o
FFLAGS = -I$IDIR -J$IDIR -fbacktrace -g -Wall -Wno-unused-variable -Wno-unused -Wno-unused-dummy-argument -Wuninitialized -Wmaybe-uninitialized -O3 -march=native
FFLAGS = -I$IDIR -J$IDIR -cpp -fbacktrace -g -Wall -Wno-unused-variable -Wno-unused -Wno-unused-dummy-argument -Wuninitialized -Wmaybe-uninitialized -O3 -march=native
CC = gcc
CXX = g++
LAPACK=-lblas -llapack
@ -47,7 +47,7 @@ FIX_ORDER_OF_LIBS=
compile_gfortran_mac_debug = """
FC = gfortran
AR = libtool -static -o
FFLAGS = -I$IDIR -J$IDIR -fbacktrace -Wall -Wno-unused-variable -g -fcheck=all -Waliasing -Wampersand -Wconversion -Wsurprising -Wintrinsics-std -Wno-tabs -Wintrinsic-shadow -Wline-truncation -Wreal-q-constant
FFLAGS = -I$IDIR -J$IDIR -cpp -fbacktrace -Wall -Wno-unused-variable -g -fcheck=all -Waliasing -Wampersand -Wconversion -Wsurprising -Wintrinsics-std -Wno-tabs -Wintrinsic-shadow -Wline-truncation -Wreal-q-constant
CC = gcc
CXX = g++
LAPACK=-lblas -llapack
@ -58,7 +58,7 @@ FIX_ORDER_OF_LIBS=
compile_gfortran_linux_debug = """
FC = gfortran
AR = ar crs
FFLAGS = -I$IDIR -J$IDIR -fbacktrace -Wall -g -fcheck=all -Waliasing -Wampersand -Wconversion -Wsurprising -Wintrinsics-std -Wno-tabs -Wintrinsic-shadow -Wline-truncation -Wreal-q-constant
FFLAGS = -I$IDIR -J$IDIR -cpp -fbacktrace -Wall -g -fcheck=all -Waliasing -Wampersand -Wconversion -Wsurprising -Wintrinsics-std -Wno-tabs -Wintrinsic-shadow -Wline-truncation -Wreal-q-constant
CC = gcc
CXX = g++
LAPACK=-lblas -llapack
@ -83,7 +83,7 @@ elif sys.platform.lower() == "linux" or os.path.exists('/proc/version'):
compiler = """
FC = ifort -mkl=parallel -qopenmp
AR = ar crs
FFLAGS = -I$IDIR -module $IDIR -traceback -g -Ofast -xHost
FFLAGS = -I$IDIR -module $IDIR -fpp -traceback -g -Ofast -xHost
CC = icc
CXX = icpc
LAPACK=
@ -94,10 +94,12 @@ FIX_ORDER_OF_LIBS=-Wl,--start-group
compiler = """
FC = gfortran -fopenmp
AR = ar crs
FFLAGS = -I$IDIR -J$IDIR -fbacktrace -g -Wall -Wno-unused-variable -Wno-unused -Wno-unused-dummy-argument -Wuninitialized -Wmaybe-uninitialized -O3 -march=native
FFLAGS = -I$IDIR -J$IDIR -cpp -fbacktrace -g -Wall -Wno-unused-variable -Wno-unused -Wno-unused-dummy-argument -Wuninitialized -Wmaybe-uninitialized -O3 -march=native
CC = gcc
CXX = g++
LAPACK=-lblas -llapack
# uncomment for TURPAN
#LAPACK=-larmpl_lp64_mp
STDCXX=-lstdc++
FIX_ORDER_OF_LIBS=-Wl,--start-group
"""
@ -113,8 +115,16 @@ if USE_GPU:
compiler_tmp = compiler.strip().split('\n')
compiler_tmp[0] += " -L{}/src/cuda/build -lcuquack -lcudart -lcublas -lcusolver".format(QUACK_ROOT)
compiler_exe = '\n'.join(compiler_tmp)
compiler_tmp = compiler.strip().split('\n')
compiler_tmp[2] += " -DUSE_GPU"
compiler_lib = '\n'.join(compiler_tmp)
compiler_main = compiler_tmp
else:
compiler_exe = compiler
compiler_lib = compiler
compiler_main = compiler
@ -172,7 +182,7 @@ rule git_clone
build_in_lib_dir = "\n".join([
header,
compiler,
compiler_lib,
rule_fortran,
rule_build_lib,
])
@ -187,7 +197,7 @@ build_in_exe_dir = "\n".join([
build_main = "\n".join([
header,
compiler,
compiler_main,
rule_git_clone,
])
@ -195,9 +205,10 @@ exe_dirs = ["QuAcK"]
lib_dirs = list(filter(lambda x: os.path.isdir(x) and \
x not in ["cuda"] and \
x not in exe_dirs, os.listdir(".")))
i = lib_dirs.index("mod")
lib_dirs[0], lib_dirs[i] = lib_dirs[i], lib_dirs[0]
if not USE_GPU:
if(USE_GPU):
i = lib_dirs.index("GPU")
lib_dirs[0], lib_dirs[i] = lib_dirs[i], lib_dirs[0]
else:
lib_dirs.remove("GPU")
def create_ninja_in_libdir(directory):

View File

@ -1,82 +0,0 @@
! Fortran-side declarations for routines of the external C/CUDA library,
! plus a small test routine.
module cu_quack_module
use, intrinsic :: iso_c_binding
implicit none
! ---
! All four interfaces below share the same argument list:
!   nO, nBas, nS : integer sizes, passed by value (C int)
!   eps          : input array of length nBas
!   ERI          : input rank-4 array, nBas in every dimension
!   Omega        : output array of length nS
!   X            : output nS x nS array
! NOTE(review): judging by the names, "sing"/"trip" presumably select the
! singlet/triplet manifold and "tda" the Tamm-Dancoff variant -- confirm
! against the CUDA sources.
interface
! Binds to the C symbol "ph_drpa_tda_sing"
subroutine ph_drpa_tda_sing(nO, nBas, nS, eps, ERI, &
Omega, X) bind(C, name = "ph_drpa_tda_sing")
import c_int, c_double
integer(c_int), intent(in), value :: nO, nBas, nS
real(c_double), intent(in) :: eps(nBas)
real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas)
real(c_double), intent(out) :: Omega(nS)
real(c_double), intent(out) :: X(nS,nS)
end subroutine ph_drpa_tda_sing
! ---
! Binds to the C symbol "ph_drpa_tda_trip"
subroutine ph_drpa_tda_trip(nO, nBas, nS, eps, ERI, &
Omega, X) bind(C, name = "ph_drpa_tda_trip")
import c_int, c_double
integer(c_int), intent(in), value :: nO, nBas, nS
real(c_double), intent(in) :: eps(nBas)
real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas)
real(c_double), intent(out) :: Omega(nS)
real(c_double), intent(out) :: X(nS,nS)
end subroutine ph_drpa_tda_trip
! ---
! Binds to the C symbol "ph_drpa_sing"
subroutine ph_drpa_sing(nO, nBas, nS, eps, ERI, &
Omega, X) bind(C, name = "ph_drpa_sing")
import c_int, c_double
integer(c_int), intent(in), value :: nO, nBas, nS
real(c_double), intent(in) :: eps(nBas)
real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas)
real(c_double), intent(out) :: Omega(nS)
real(c_double), intent(out) :: X(nS,nS)
end subroutine ph_drpa_sing
! ---
! Binds to the C symbol "ph_drpa_trip"
subroutine ph_drpa_trip(nO, nBas, nS, eps, ERI, &
Omega, X) bind(C, name = "ph_drpa_trip")
import c_int, c_double
integer(c_int), intent(in), value :: nO, nBas, nS
real(c_double), intent(in) :: eps(nBas)
real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas)
real(c_double), intent(out) :: Omega(nS)
real(c_double), intent(out) :: X(nS,nS)
end subroutine ph_drpa_trip
! ---
end interface
! ---
contains
! Print a fixed greeting; takes no arguments and has no other effect.
subroutine cu_quack_module_test()
implicit none
print*, ' hello from cu_quack_module'
end subroutine cu_quack_module_test
! ---
end module cu_quack_module