From cff0232eb59cc54262370f7d6751518958a67cd2 Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Tue, 26 Nov 2024 09:37:46 +0100 Subject: [PATCH 01/39] workin on dRPA with CUDA --- src/cuda/Makefile | 63 ++++++++++++++++++++++++ src/cuda/include/ph_drpa.h | 10 ++++ src/cuda/include/utils.cuh | 9 ++++ src/cuda/src/ph_drpa.c | 47 ++++++++++++++++++ src/cuda/src/phlr_drpa_a_sing.cu | 82 ++++++++++++++++++++++++++++++++ src/cuda/src/utils.cu | 53 +++++++++++++++++++++ 6 files changed, 264 insertions(+) create mode 100644 src/cuda/Makefile create mode 100644 src/cuda/include/ph_drpa.h create mode 100644 src/cuda/include/utils.cuh create mode 100644 src/cuda/src/ph_drpa.c create mode 100644 src/cuda/src/phlr_drpa_a_sing.cu create mode 100644 src/cuda/src/utils.cu diff --git a/src/cuda/Makefile b/src/cuda/Makefile new file mode 100644 index 0000000..d76e96d --- /dev/null +++ b/src/cuda/Makefile @@ -0,0 +1,63 @@ +NVCC = nvcc +NFLAGS = -O2 --compiler-options '-fPIC' +NDFLAGS = --shared + +CC = gcc +CFLAGS = -fPIC -O2 -Wall -g + +FC = gfortran +FFLAGS = -O2 -Wall -g + +SRC_DIR = src +INC_DIR = include + +BIN_DIR = bin +BLD_DIR = build +$(shell mkdir -p $(BIN_DIR)) +$(shell mkdir -p $(BLD_DIR)) + +CU_SRC = $(wildcard $(SRC_DIR)/*.cu) +CU_OBJ = $(CU_SRC:$(SRC_DIR)/%.cu=$(BLD_DIR)/%.o) + +C_SRC = $(SRC_DIR)/ph_drpa.c +C_OBJ = $(BLD_DIR)/ph_drpa.o + +F_SRC = $(SRC_DIR)/cu_quack_module.f90 +F_OBJ = $(BLD_DIR)/cu_quack_module.f90 + +MAIN_SRC = $(SRC_DIR)/cu_quack.f90 +MAIN_OBJ = $(BLD_DIR)/cu_quack.o + +OUTPUT_LIB = $(BLD_DIR)/libcuquack.so + +CUDA_LIBS = -lcudart -lcublas + + +all: $(OUTPUT_LIB) + +$(OUTPUT_LIB): $(CU_OBJ) $(C_OBJ) + $(NVCC) $(NFLAGS) $(NLDFLAGS) $^ -o $@ $(CUDA_LIBS) -I$(INC_DIR) + +$(BLD_DIR)/%.o: $(SRC_DIR)/%.cu + $(NVCC) $(NFLAGS) -c $< -o $@ -I$(INC_DIR) + +$(C_OBJ): $(C_SRC) + @for src in $(C_SRC); do \ + obj=$(BLD_DIR)/$$(basename $${src} .c).o; \ + echo "$(CC) $(CFLAGS) -c $$src -o $$obj -I$(INC_DIR)"; \ + $(CC) $(CFLAGS) -c $$src -o $$obj -I$(INC_DIR); \ + done + +$(F_OBJ): $(F_SRC) + $(FC) $(FFLAGS) -c $< -o $@ -J$(BLD_DIR) + +$(MAIN_OBJ): $(MAIN_SRC) + $(FC) $(FFLAGS) -c $< -o $@ -J$(BLD_DIR) + + + +.PHONY: clean +clean: + rm -f $(BLD_DIR)/*.o $(BLD_DIR)/*.so $(BLD_DIR)/*.mod $(BIN_DIR)/* + + diff --git a/src/cuda/include/ph_drpa.h b/src/cuda/include/ph_drpa.h new file mode 100644 index 0000000..6f1ed07 --- /dev/null +++ b/src/cuda/include/ph_drpa.h @@ -0,0 +1,10 @@ +#ifndef PH_DRPA + +#define PH_DRPA + +extern void check_Cuda_Errors(cudaError_t err, const char * msg, const char * file, int line); +extern void check_Cublas_Errors(cublasStatus_t status, const char * msg, const char * file, int line); + +extern void phLR_dRPA_A_sing(int nO, int nBas, double *eps, double *ERI, double *A); + +#endif diff --git a/src/cuda/include/utils.cuh b/src/cuda/include/utils.cuh new file mode 100644 index 0000000..1a91732 --- /dev/null +++ b/src/cuda/include/utils.cuh @@ -0,0 +1,9 @@ +#ifndef UTILS +#define UTILS + +extern "C" void check_Cuda_Errors(cudaError_t err, const char* msg, const char* file, int line); + +extern "C" void check_Cublas_Errors(cublasStatus_t status, const char* msg, const char* file, int line); + + +#endif diff --git a/src/cuda/src/ph_drpa.c b/src/cuda/src/ph_drpa.c new file mode 100644 index 0000000..888abaa --- /dev/null +++ b/src/cuda/src/ph_drpa.c @@ -0,0 +1,47 @@ +#include +#include +#include +#include +#include +#include + +#include "ph_drpa.h" + +int ph_drpa(int nO, int nBas, double *h_eps, double *h_ERI) { + + + + double *d_eps; + double *d_ERI; + + int nBas2 = nBas * nBas; + int nBas4 = nBas2 * nBas2; + + + check_Cuda_Errors(cudaMalloc((void**)&d_eps, nO * sizeof(double)), + "cudaMalloc", __FILE__, __LINE__); + check_Cuda_Errors(cudaMalloc((void**)&d_ERI, nBas4 * sizeof(double)), + "cudaMalloc", __FILE__, __LINE__); + + + check_Cuda_Errors(cudaMemcpy(d_eps, h_eps, nO * sizeof(double), cudaMemcpyHostToDevice), + "cudaMemcpy", __FILE__, __LINE__); + check_Cuda_Errors(cudaMemcpy(d_ERI, h_ERI, nBas4 * sizeof(double), cudaMemcpyHostToDevice), + "cudaMemcpy", __FILE__, __LINE__); + + // construct A matrix + int nS = nO * (nBas * nO); + double *d_A; + check_Cuda_Errors(cudaMalloc((void**)&d_A, nS * nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); + phLR_dRPA_A_sing(nO, nBas, d_eps, d_ERI, d_A); + check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); + + + check_Cuda_Errors(cudaFree(d_eps), "cudaFree", __FILE__, __LINE__); + check_Cuda_Errors(cudaFree(d_ERI), "cudaFree", __FILE__, __LINE__); + check_Cuda_Errors(cudaFree(d_A), "cudaFree", __FILE__, __LINE__); + + + return 0; +} + diff --git a/src/cuda/src/phlr_drpa_a_sing.cu b/src/cuda/src/phlr_drpa_a_sing.cu new file mode 100644 index 0000000..3cbc556 --- /dev/null +++ b/src/cuda/src/phlr_drpa_a_sing.cu @@ -0,0 +1,82 @@ +#include + +__global__ void phLR_dRPA_A_sing_kernel(int nO, int nBas, double *eps, double *ERI, double *A) { + + + int i, j, a, b; + int ia, jb, jb_off; + + int ij_off0, ij_off; + + int aa_max = nBas - nO; + int ia_max = aa_max * nO; + + int nBas2 = nBas * nBas; + int nBas3 = nBas2 * nBas; + + int aa = blockIdx.x * blockDim.x + threadIdx.x; + int bb = blockIdx.y * blockDim.y + threadIdx.y; + + while(aa < aa_max) { + a = aa + nO; + + ij_off0 = a * nBas2; + + while(bb < aa_max) { + b = bb + nO; + + ij_off = ij_off0 + b * nBas; + + while(i < nO) { + ia = i * aa_max + aa; + jb_off = ia * ia_max; + + while(j < nO) { + jb = j * aa_max + bb; + + A[jb + jb_off] = 2.0 * ERI[i + j * nBas3 + ij_off]; + if(a==b && i==j) { + A[jb + jb_off] += eps[a] - eps[i]; + } + + j ++; + } // j + + i ++; + } // i + + bb += blockDim.y * gridDim.y; + } // bb + + aa += blockDim.x * gridDim.x; + } // aa + +} + + + + + +extern "C" void phLR_dRPA_A_sing(int nO, int nBas, double *eps, double *ERI, double *A) { + + + int size = nBas - nO; + + int sBlocks = 32; + int nBlocks = (size + sBlocks - 1) / sBlocks; + + dim3 dimGrid(nBlocks, nBlocks, 1); + dim3 dimBlock(sBlocks, sBlocks, 1); + + + printf("lunching phLR_dRPA_A_sing_kernel with %dx%d blocks and %dx%d threads/block\n", + nBlocks, nBlocks, sBlocks, sBlocks); + + + phLR_dRPA_A_sing_kernel<<>>(nO, nBas, eps, ERI, A); + +} + + + + diff --git a/src/cuda/src/utils.cu b/src/cuda/src/utils.cu new file mode 100644 index 0000000..b20c52d --- /dev/null +++ b/src/cuda/src/utils.cu @@ -0,0 +1,53 @@ +#include +#include +#include +#include + + +extern "C" void check_Cuda_Errors(cudaError_t err, const char* msg, const char* file, int line) { + if (err != cudaSuccess) { + printf("CUDA Error in %s at line %d\n", file, line); + printf("%s - %s\n", msg, cudaGetErrorString(err)); + exit(0); + } +} + + +const char* cublasGetErrorString(cublasStatus_t status) { + switch (status) { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; + case CUBLAS_STATUS_NOT_SUPPORTED: + return "CUBLAS_STATUS_NOT_SUPPORTED"; + case CUBLAS_STATUS_LICENSE_ERROR: + return "CUBLAS_STATUS_LICENSE_ERROR"; + } + return "UNKNOWN CUBLAS ERROR"; +} + +extern "C" void check_Cublas_Errors(cublasStatus_t status, const char* msg, const char* file, int line) { + + const char* err = cublasGetErrorString(status); + + if (err != "CUBLAS_STATUS_SUCCESS") { + printf("CUBLAS Error in %s at line %d\n", file, line); + printf("%s - %s\n", msg, err); + exit(0); + } +} + + From 3009ffe8f791809c01923e9249c2437107a93d2f Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Tue, 26 Nov 2024 14:36:42 +0100 Subject: [PATCH 02/39] dRPA-CUDA: saving --- src/cuda/Makefile | 44 ++++++++++---------------------- src/cuda/src/cu_quack_module.f90 | 26 +++++++++++++++++++ src/cuda/src/ph_drpa.c | 2 -- src/make_ninja.py | 7 ++--- 4 files changed, 43 insertions(+), 36 deletions(-) create mode 100644 src/cuda/src/cu_quack_module.f90 diff --git a/src/cuda/Makefile b/src/cuda/Makefile index d76e96d..04ab17a 100644 --- a/src/cuda/Makefile +++ b/src/cuda/Makefile @@ -1,63 +1,45 @@ NVCC = nvcc -NFLAGS = -O2 --compiler-options '-fPIC' -NDFLAGS = --shared +NVFLAGS = -O2 --compiler-options '-O2 -Wall' CC = gcc -CFLAGS = -fPIC -O2 -Wall -g +CFLAGS = -O2 -Wall -g FC = gfortran FFLAGS = -O2 -Wall -g SRC_DIR = src INC_DIR = include - -BIN_DIR = bin BLD_DIR = build -$(shell mkdir -p $(BIN_DIR)) $(shell mkdir -p $(BLD_DIR)) CU_SRC = $(wildcard $(SRC_DIR)/*.cu) CU_OBJ = $(CU_SRC:$(SRC_DIR)/%.cu=$(BLD_DIR)/%.o) -C_SRC = $(SRC_DIR)/ph_drpa.c -C_OBJ = $(BLD_DIR)/ph_drpa.o +C_SRC = $(wildcard $(SRC_DIR)/*.c) +C_OBJ = $(CU_SRC:$(SRC_DIR)/%.c=$(BLD_DIR)/%.o) F_SRC = $(SRC_DIR)/cu_quack_module.f90 -F_OBJ = $(BLD_DIR)/cu_quack_module.f90 - -MAIN_SRC = $(SRC_DIR)/cu_quack.f90 -MAIN_OBJ = $(BLD_DIR)/cu_quack.o - -OUTPUT_LIB = $(BLD_DIR)/libcuquack.so - -CUDA_LIBS = -lcudart -lcublas +F_OBJ = $(BLD_DIR)/cu_quack_module.o +OUTPUT_LIB = $(BLD_DIR)/cuda.a all: $(OUTPUT_LIB) -$(OUTPUT_LIB): $(CU_OBJ) $(C_OBJ) - $(NVCC) $(NFLAGS) $(NLDFLAGS) $^ -o $@ $(CUDA_LIBS) -I$(INC_DIR) +$(OUTPUT_LIB): $(CU_OBJ) $(C_OBJ) $(F_OBJ) + ar rcs $(OUTPUT_LIB) $(CU_OBJ) $(C_OBJ) $(F_OBJ) -$(BLD_DIR)/%.o: $(SRC_DIR)/%.cu - $(NVCC) $(NFLAGS) -c $< -o $@ -I$(INC_DIR) +$(CU_OBJ): $(CU_SRC) + $(NVCC) $(NVFLAGS) -c -o $@ $< -I$(INC_DIR) $(C_OBJ): $(C_SRC) - @for src in $(C_SRC); do \ - obj=$(BLD_DIR)/$$(basename $${src} .c).o; \ - echo "$(CC) $(CFLAGS) -c $$src -o $$obj -I$(INC_DIR)"; \ - $(CC) $(CFLAGS) -c $$src -o $$obj -I$(INC_DIR); \ - done + $(CC) $(CFLAGS) -c -o $@ $< -I$(INC_DIR) $(F_OBJ): $(F_SRC) - $(FC) $(FFLAGS) -c $< -o $@ -J$(BLD_DIR) - -$(MAIN_OBJ): $(MAIN_SRC) - $(FC) $(FFLAGS) -c $< -o $@ -J$(BLD_DIR) - + $(FC) $(FFLAGS) -c -o $@ $< -J$(BLD_DIR) .PHONY: clean clean: - rm -f $(BLD_DIR)/*.o $(BLD_DIR)/*.so $(BLD_DIR)/*.mod $(BIN_DIR)/* + rm -f $(BLD_DIR)/* diff --git a/src/cuda/src/cu_quack_module.f90 b/src/cuda/src/cu_quack_module.f90 new file mode 100644 index 0000000..5b5ea1a --- /dev/null +++ b/src/cuda/src/cu_quack_module.f90 @@ -0,0 +1,26 @@ +module cu_quack_module + + use, intrinsic :: iso_c_binding + + implicit none + + interface + + ! --- + + subroutine ph_drpa(nO, nBas, eps, ERI) bind(C, name = "cutc_int") + + import c_int, c_double + integer(c_int), intent(in), value :: nO, nBas + real(c_double), intent(in) :: eps(nBas) + real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas) + + end subroutine ph_drpa + + ! --- + + end interface + +end module cu_quack_module + + diff --git a/src/cuda/src/ph_drpa.c b/src/cuda/src/ph_drpa.c index 888abaa..aa8d357 100644 --- a/src/cuda/src/ph_drpa.c +++ b/src/cuda/src/ph_drpa.c @@ -9,8 +9,6 @@ int ph_drpa(int nO, int nBas, double *h_eps, double *h_ERI) { - - double *d_eps; double *d_ERI; diff --git a/src/make_ninja.py b/src/make_ninja.py index 09d3a72..ff67c3c 100755 --- a/src/make_ninja.py +++ b/src/make_ninja.py @@ -81,7 +81,7 @@ elif sys.platform.lower() == "linux" or os.path.exists('/proc/version'): else: if check_compiler_exists('ifort'): compiler = """ -FC = ifort -qmkl=parallel -qopenmp +FC = ifort -mkl=parallel -qopenmp AR = ar crs FFLAGS = -I$IDIR -module $IDIR -traceback -g -Ofast -xHost CC = icc @@ -182,9 +182,10 @@ build_main = "\n".join([ rule_git_clone, ]) -exe_dirs = [ "QuAcK"] +exe_dirs = ["QuAcK"] lib_dirs = list(filter(lambda x: os.path.isdir(x) and \ - x not in exe_dirs, os.listdir("."))) + x not in ["cuda"] and \ + x not in exe_dirs, os.listdir("."))) def create_ninja_in_libdir(directory): def write_rule(f, source_file, replace): From a10e4c8c2312f9ce04b60779ee6ca8c108e0da29 Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Tue, 26 Nov 2024 15:26:03 +0100 Subject: [PATCH 03/39] update make_ninja file for cuda objects --- src/make_ninja.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/make_ninja.py b/src/make_ninja.py index ff67c3c..49ac43b 100755 --- a/src/make_ninja.py +++ b/src/make_ninja.py @@ -3,15 +3,15 @@ import os import sys import subprocess +import argparse +parser = argparse.ArgumentParser(description='This script generate the compilation files for QuAcK.') +parser.add_argument('-d', '--debug', action='store_true', help='Debug mode. Default is false.') +parser.add_argument('-u', '--use-gpu', action='store_true', help='Use GPU. Default is false.') +args = parser.parse_args() +DEBUG = args.debug +USE_GPU = args.use_gpu -DEBUG=False -try: - DEBUG = sys.argv[1] == "debug" -except: - pass - - if "QUACK_ROOT" not in os.environ: os.chdir("..") print("") @@ -120,6 +120,7 @@ IDIR=$QUACK_ROOT/include LDIR=$QUACK_ROOT/lib BDIR=$QUACK_ROOT/bin SDIR=$QUACK_ROOT/src +CUDA_DIR=$QUACK_ROOT/src/cuda/build LIBXC_VERSION=5.0.0 @@ -248,7 +249,10 @@ rule build_lib sources = [ "$SDIR/{0}/{1}".format(exe_dir,x) for x in os.listdir(exe_dir) ] sources = filter(lambda x: x.endswith(".f") or x.endswith(".f90"), sources) sources = " ".join(sources) - f.write("build $BDIR/{0}: build_exe {1} {2}\n".format(exe_dir,libs,sources)) + if USE_GPU: + f.write("build $BDIR/{0}: build_exe $CUDA_DIR/cuda.a {1} {2}\n".format(exe_dir,libs,sources)) + else: + f.write("build $BDIR/{0}: build_exe {1} {2}\n".format(exe_dir,libs,sources)) f.write(" dir = {0} \n".format(exe_dir) ) for libname in lib_dirs: From 41dd532d0ea9c13669c0560bb8b3fcf666e6af16 Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Tue, 26 Nov 2024 16:33:06 +0100 Subject: [PATCH 04/39] linked CUDA module with QuAcK properley --- src/cuda/Makefile | 21 ++++++++++----------- src/cuda/src/cu_quack_module.f90 | 26 -------------------------- src/cuda/src/utils.cu | 8 +++++--- src/make_ninja.py | 12 +++++++----- 4 files changed, 22 insertions(+), 45 deletions(-) delete mode 100644 src/cuda/src/cu_quack_module.f90 diff --git a/src/cuda/Makefile b/src/cuda/Makefile index 04ab17a..61ac1d2 100644 --- a/src/cuda/Makefile +++ b/src/cuda/Makefile @@ -1,11 +1,11 @@ NVCC = nvcc -NVFLAGS = -O2 --compiler-options '-O2 -Wall' +NVFLAGS = -O2 --compiler-options '-O2 -Wall -fPIC' CC = gcc -CFLAGS = -O2 -Wall -g +CFLAGS = -O2 -Wall -g -fPIC FC = gfortran -FFLAGS = -O2 -Wall -g +FFLAGS = -O2 -Wall -g -fPIC SRC_DIR = src INC_DIR = include @@ -16,28 +16,27 @@ CU_SRC = $(wildcard $(SRC_DIR)/*.cu) CU_OBJ = $(CU_SRC:$(SRC_DIR)/%.cu=$(BLD_DIR)/%.o) C_SRC = $(wildcard $(SRC_DIR)/*.c) -C_OBJ = $(CU_SRC:$(SRC_DIR)/%.c=$(BLD_DIR)/%.o) +C_OBJ = $(C_SRC:$(SRC_DIR)/%.c=$(BLD_DIR)/%.o) -F_SRC = $(SRC_DIR)/cu_quack_module.f90 -F_OBJ = $(BLD_DIR)/cu_quack_module.o +F_SRC = #$(SRC_DIR)/cu_quack_module.f90 +F_OBJ = #$(BLD_DIR)/cu_quack_module.o -OUTPUT_LIB = $(BLD_DIR)/cuda.a +OUTPUT_LIB = $(BLD_DIR)/libcuquack.so all: $(OUTPUT_LIB) $(OUTPUT_LIB): $(CU_OBJ) $(C_OBJ) $(F_OBJ) - ar rcs $(OUTPUT_LIB) $(CU_OBJ) $(C_OBJ) $(F_OBJ) + $(CC) -shared -o $(OUTPUT_LIB) $(CU_OBJ) $(C_OBJ) $(F_OBJ) -$(CU_OBJ): $(CU_SRC) +$(BLD_DIR)/%.o: $(SRC_DIR)/%.cu $(NVCC) $(NVFLAGS) -c -o $@ $< -I$(INC_DIR) -$(C_OBJ): $(C_SRC) +$(BLD_DIR)/%.o: $(SRC_DIR)/%.c $(CC) $(CFLAGS) -c -o $@ $< -I$(INC_DIR) $(F_OBJ): $(F_SRC) $(FC) $(FFLAGS) -c -o $@ $< -J$(BLD_DIR) - .PHONY: clean clean: rm -f $(BLD_DIR)/* diff --git a/src/cuda/src/cu_quack_module.f90 b/src/cuda/src/cu_quack_module.f90 deleted file mode 100644 index 5b5ea1a..0000000 --- a/src/cuda/src/cu_quack_module.f90 +++ /dev/null @@ -1,26 +0,0 @@ -module cu_quack_module - - use, intrinsic :: iso_c_binding - - implicit none - - interface - - ! --- - - subroutine ph_drpa(nO, nBas, eps, ERI) bind(C, name = "cutc_int") - - import c_int, c_double - integer(c_int), intent(in), value :: nO, nBas - real(c_double), intent(in) :: eps(nBas) - real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas) - - end subroutine ph_drpa - - ! --- - - end interface - -end module cu_quack_module - - diff --git a/src/cuda/src/utils.cu b/src/cuda/src/utils.cu index b20c52d..ff5d83d 100644 --- a/src/cuda/src/utils.cu +++ b/src/cuda/src/utils.cu @@ -2,6 +2,8 @@ #include #include #include +#include + extern "C" void check_Cuda_Errors(cudaError_t err, const char* msg, const char* file, int line) { @@ -13,7 +15,7 @@ extern "C" void check_Cuda_Errors(cudaError_t err, const char* msg, const char* } -const char* cublasGetErrorString(cublasStatus_t status) { +const char* cublas_Get_Error_String(cublasStatus_t status) { switch (status) { case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; @@ -41,9 +43,9 @@ const char* cublasGetErrorString(cublasStatus_t status) { extern "C" void check_Cublas_Errors(cublasStatus_t status, const char* msg, const char* file, int line) { - const char* err = cublasGetErrorString(status); + const char* err = cublas_Get_Error_String(status); - if (err != "CUBLAS_STATUS_SUCCESS") { + if (strcmp(err, "CUBLAS_STATUS_SUCCESS") != 0) { printf("CUBLAS Error in %s at line %d\n", file, line); printf("%s - %s\n", msg, err); exit(0); diff --git a/src/make_ninja.py b/src/make_ninja.py index 49ac43b..c78bc7d 100755 --- a/src/make_ninja.py +++ b/src/make_ninja.py @@ -120,7 +120,6 @@ IDIR=$QUACK_ROOT/include LDIR=$QUACK_ROOT/lib BDIR=$QUACK_ROOT/bin SDIR=$QUACK_ROOT/src -CUDA_DIR=$QUACK_ROOT/src/cuda/build LIBXC_VERSION=5.0.0 @@ -187,6 +186,12 @@ exe_dirs = ["QuAcK"] lib_dirs = list(filter(lambda x: os.path.isdir(x) and \ x not in ["cuda"] and \ x not in exe_dirs, os.listdir("."))) +if USE_GPU: + i = lib_dirs.index("mod") + lib_dirs[0], lib_dirs[i] = lib_dirs[i], lib_dirs[0] +else: + lib_dirs.remove("mod") +print(lib_dirs) def create_ninja_in_libdir(directory): def write_rule(f, source_file, replace): @@ -249,10 +254,7 @@ rule build_lib sources = [ "$SDIR/{0}/{1}".format(exe_dir,x) for x in os.listdir(exe_dir) ] sources = filter(lambda x: x.endswith(".f") or x.endswith(".f90"), sources) sources = " ".join(sources) - if USE_GPU: - f.write("build $BDIR/{0}: build_exe $CUDA_DIR/cuda.a {1} {2}\n".format(exe_dir,libs,sources)) - else: - f.write("build $BDIR/{0}: build_exe {1} {2}\n".format(exe_dir,libs,sources)) + f.write("build $BDIR/{0}: build_exe {1} {2}\n".format(exe_dir,libs,sources)) f.write(" dir = {0} \n".format(exe_dir) ) for libname in lib_dirs: From 9c42437746b3f0f60651e239cd64e6f159e9d60f Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Tue, 26 Nov 2024 19:31:44 +0100 Subject: [PATCH 05/39] linking with quack-cuda: OK --- .gitignore | 3 +++ quack.rc | 2 ++ src/RPA/phRRPA.f90 | 11 +++++++++++ src/cuda/src/ph_drpa.c | 22 +++++++++++++++++---- src/make_ninja.py | 12 ++++++++++-- src/mod/cu_quack_module.f90 | 39 +++++++++++++++++++++++++++++++++++++ 6 files changed, 83 insertions(+), 6 deletions(-) create mode 100644 src/mod/cu_quack_module.f90 diff --git a/.gitignore b/.gitignore index 899b091..4a6ab19 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +*.slurm +*.mod +*.so *.o *. __pycache__ diff --git a/quack.rc b/quack.rc index 77ca7ff..bede54b 100644 --- a/quack.rc +++ b/quack.rc @@ -13,3 +13,5 @@ esac export QUACK_ROOT="$( cd $QUACK_ROOT; pwd -P )" export PATH="${QUACK_ROOT}/bin:$PATH" +export LD_LIBRARY_PATH="${QUACK_ROOT}/src/cuda/build:$LD_LIBRARY_PATH" + diff --git a/src/RPA/phRRPA.f90 b/src/RPA/phRRPA.f90 index 7a13d8f..94c6576 100644 --- a/src/RPA/phRRPA.f90 +++ b/src/RPA/phRRPA.f90 @@ -1,5 +1,7 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF) + use cu_quack_module + ! Perform a direct random phase approximation calculation implicit none @@ -37,6 +39,8 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO, double precision,allocatable :: Om(:) double precision,allocatable :: XpY(:,:) double precision,allocatable :: XmY(:,:) + ! DEBUG + double precision, allocatable :: XpY_gpu(:,:), XmY_gpu(:,:), Om_gpu(:) double precision :: EcRPA(nspin) @@ -74,6 +78,13 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO, call phLR_A(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,eHF,ERI,Aph) if(.not.TDA) call phLR_B(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,ERI,Bph) + ! DEBUG + allocate(Om_gpu(nS), XpY_gpu(nS,nS), XmY_gpu(nS,nS)) + call ph_drpa(nO, nBas, eHF(1), ERI(1,1,1,1), Om_gpu(1), XpY_gpu(1,1), XmY_gpu(1,1)) + print *, ' CPU:', Aph(1,1) + print *, ' GPU:', XpY_gpu(1,1) + stop + call phLR(TDA,nS,Aph,Bph,EcRPA(ispin),Om,XpY,XmY) call print_excitation_energies('phRPA@RHF','singlet',nS,Om) call phLR_transition_vectors(.true.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY) diff --git a/src/cuda/src/ph_drpa.c b/src/cuda/src/ph_drpa.c index aa8d357..61afcbe 100644 --- a/src/cuda/src/ph_drpa.c +++ b/src/cuda/src/ph_drpa.c @@ -7,7 +7,8 @@ #include "ph_drpa.h" -int ph_drpa(int nO, int nBas, double *h_eps, double *h_ERI) { +int ph_drpa(int nO, int nBas, double *h_eps, double *h_ERI, + double *h_Omega, double *h_XpY, double *h_XmY) { double *d_eps; double *d_ERI; @@ -16,6 +17,17 @@ int ph_drpa(int nO, int nBas, double *h_eps, double *h_ERI) { int nBas4 = nBas2 * nBas2; + int ia, jb; + int nS = nO * (nBas - nO); + for (ia = 0; ia < nS; ia++) { + h_Omega[ia] = 0.0; + for (jb = 0; jb < nS; jb++) { + h_XmY[jb + nO * nBas * ia] = 0.0; + h_XpY[jb + nO * nBas * ia] = 0.0; + } + } + + check_Cuda_Errors(cudaMalloc((void**)&d_eps, nO * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); check_Cuda_Errors(cudaMalloc((void**)&d_ERI, nBas4 * sizeof(double)), @@ -28,13 +40,15 @@ int ph_drpa(int nO, int nBas, double *h_eps, double *h_ERI) { "cudaMemcpy", __FILE__, __LINE__); // construct A matrix - int nS = nO * (nBas * nO); double *d_A; check_Cuda_Errors(cudaMalloc((void**)&d_A, nS * nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); - phLR_dRPA_A_sing(nO, nBas, d_eps, d_ERI, d_A); - check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); +// phLR_dRPA_A_sing(nO, nBas, d_eps, d_ERI, d_A); +// check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); + check_Cuda_Errors(cudaMemcpy(h_XpY, d_A, nS * nS * sizeof(double), cudaMemcpyDeviceToHost), + "cudaMemcpy", __FILE__, __LINE__); + check_Cuda_Errors(cudaFree(d_eps), "cudaFree", __FILE__, __LINE__); check_Cuda_Errors(cudaFree(d_ERI), "cudaFree", __FILE__, __LINE__); check_Cuda_Errors(cudaFree(d_A), "cudaFree", __FILE__, __LINE__); diff --git a/src/make_ninja.py b/src/make_ninja.py index c78bc7d..3d558d3 100755 --- a/src/make_ninja.py +++ b/src/make_ninja.py @@ -109,6 +109,15 @@ else: print("Unknown platform. Only Linux and Darwin are supported.") sys.exit(-1) +if USE_GPU: + compiler_tmp = compiler.strip().split('\n') + compiler_tmp[0] += " -L{}/src/cuda/build -lcuquack -lcudart -lcublas".format(QUACK_ROOT) + compiler_exe = '\n'.join(compiler_tmp) +else: + compiler_exe = compiler + + + header = """# # This file was automatically generated. Do not modify this file. # To change compiling options, make the modifications in @@ -171,7 +180,7 @@ build_in_lib_dir = "\n".join([ build_in_exe_dir = "\n".join([ header, - compiler, + compiler_exe, rule_fortran, rule_build_exe, ]) @@ -191,7 +200,6 @@ if USE_GPU: lib_dirs[0], lib_dirs[i] = lib_dirs[i], lib_dirs[0] else: lib_dirs.remove("mod") -print(lib_dirs) def create_ninja_in_libdir(directory): def write_rule(f, source_file, replace): diff --git a/src/mod/cu_quack_module.f90 b/src/mod/cu_quack_module.f90 new file mode 100644 index 0000000..f73cc6c --- /dev/null +++ b/src/mod/cu_quack_module.f90 @@ -0,0 +1,39 @@ +module cu_quack_module + + use, intrinsic :: iso_c_binding + + implicit none + + ! --- + + interface + + subroutine ph_drpa(nO, nBas, eps, ERI, & + Omega, XpY, XmY) bind(C, name = "ph_drpa") + + import c_int, c_double + integer(c_int), intent(in), value :: nO, nBas + real(c_double), intent(in) :: eps(nBas) + real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas) + real(c_double), intent(out) :: Omega(nO*nBas) + real(c_double), intent(out) :: XpY(nO*nBas,nO*nBas) + real(c_double), intent(out) :: XmY(nO*nBas,nO*nBas) + + end subroutine ph_drpa + + end interface + + ! --- + + contains + + subroutine cu_quack_module_test() + implicit none + print*, ' hello from mod_test' + end subroutine cu_quack_module_test + + ! --- + +end module cu_quack_module + + From 66566a8ce790d5206760b2c1fe5d3b457c616c7d Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Tue, 26 Nov 2024 23:00:15 +0100 Subject: [PATCH 06/39] added Aph_dRPA_sing kernel --- src/RPA/phRRPA.f90 | 3 +- src/cuda/src/ph_drpa.c | 16 +++++------ src/cuda/src/phlr_drpa_a_sing.cu | 49 ++++++++++++++++++-------------- src/mod/cu_quack_module.f90 | 10 +++---- 4 files changed, 42 insertions(+), 36 deletions(-) diff --git a/src/RPA/phRRPA.f90 b/src/RPA/phRRPA.f90 index 94c6576..7d00061 100644 --- a/src/RPA/phRRPA.f90 +++ b/src/RPA/phRRPA.f90 @@ -80,9 +80,10 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO, ! DEBUG allocate(Om_gpu(nS), XpY_gpu(nS,nS), XmY_gpu(nS,nS)) - call ph_drpa(nO, nBas, eHF(1), ERI(1,1,1,1), Om_gpu(1), XpY_gpu(1,1), XmY_gpu(1,1)) + call ph_drpa(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om_gpu(1), XpY_gpu(1,1), XmY_gpu(1,1)) print *, ' CPU:', Aph(1,1) print *, ' GPU:', XpY_gpu(1,1) + print *, ' GPU:', XmY_gpu(1,1) stop call phLR(TDA,nS,Aph,Bph,EcRPA(ispin),Om,XpY,XmY) diff --git a/src/cuda/src/ph_drpa.c b/src/cuda/src/ph_drpa.c index 61afcbe..0e0f671 100644 --- a/src/cuda/src/ph_drpa.c +++ b/src/cuda/src/ph_drpa.c @@ -7,7 +7,7 @@ #include "ph_drpa.h" -int ph_drpa(int nO, int nBas, double *h_eps, double *h_ERI, +int ph_drpa(int nO, int nBas, int nS, double *h_eps, double *h_ERI, double *h_Omega, double *h_XpY, double *h_XmY) { double *d_eps; @@ -16,25 +16,23 @@ int ph_drpa(int nO, int nBas, double *h_eps, double *h_ERI, int nBas2 = nBas * nBas; int nBas4 = nBas2 * nBas2; - int ia, jb; - int nS = nO * (nBas - nO); for (ia = 0; ia < nS; ia++) { h_Omega[ia] = 0.0; for (jb = 0; jb < nS; jb++) { - h_XmY[jb + nO * nBas * ia] = 0.0; - h_XpY[jb + nO * nBas * ia] = 0.0; + h_XmY[jb + ia * nS] = 4.0; + //h_XpY[jb + ia * nS] = 5.0; } } - check_Cuda_Errors(cudaMalloc((void**)&d_eps, nO * sizeof(double)), + check_Cuda_Errors(cudaMalloc((void**)&d_eps, nBas * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); check_Cuda_Errors(cudaMalloc((void**)&d_ERI, nBas4 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); - check_Cuda_Errors(cudaMemcpy(d_eps, h_eps, nO * sizeof(double), cudaMemcpyHostToDevice), + check_Cuda_Errors(cudaMemcpy(d_eps, h_eps, nBas * sizeof(double), cudaMemcpyHostToDevice), "cudaMemcpy", __FILE__, __LINE__); check_Cuda_Errors(cudaMemcpy(d_ERI, h_ERI, nBas4 * sizeof(double), cudaMemcpyHostToDevice), "cudaMemcpy", __FILE__, __LINE__); @@ -42,8 +40,8 @@ int ph_drpa(int nO, int nBas, double *h_eps, double *h_ERI, // construct A matrix double *d_A; check_Cuda_Errors(cudaMalloc((void**)&d_A, nS * nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); -// phLR_dRPA_A_sing(nO, nBas, d_eps, d_ERI, d_A); -// check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); + phLR_dRPA_A_sing(nO, nBas, d_eps, d_ERI, d_A); + check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); check_Cuda_Errors(cudaMemcpy(h_XpY, d_A, nS * nS * sizeof(double), cudaMemcpyDeviceToHost), diff --git a/src/cuda/src/phlr_drpa_a_sing.cu b/src/cuda/src/phlr_drpa_a_sing.cu index 3cbc556..7844d66 100644 --- a/src/cuda/src/phlr_drpa_a_sing.cu +++ b/src/cuda/src/phlr_drpa_a_sing.cu @@ -4,39 +4,46 @@ __global__ void phLR_dRPA_A_sing_kernel(int nO, int nBas, double *eps, double *E int i, j, a, b; - int ia, jb, jb_off; + int aa, bb; + int nV, nS, nVS; + int nBas2, nBas3; + int i_A0, i_A1, i_A2; + int i_I0, i_I1, i_I2; - int ij_off0, ij_off; + nV = nBas - nO; + nS = nO * nV; + nVS = nV * nS; - int aa_max = nBas - nO; - int ia_max = aa_max * nO; + nBas2 = nBas * nBas; + nBas3 = nBas2 * nBas; - int nBas2 = nBas * nBas; - int nBas3 = nBas2 * nBas; + aa = blockIdx.x * blockDim.x + threadIdx.x; + bb = blockIdx.y * blockDim.y + threadIdx.y; - int aa = blockIdx.x * blockDim.x + threadIdx.x; - int bb = blockIdx.y * blockDim.y + threadIdx.y; - - while(aa < aa_max) { + while(aa < nV) { a = aa + nO; - ij_off0 = a * nBas2; + i_A0 = aa * nS; + i_I0 = a * nBas2; - while(bb < aa_max) { + while(bb < nV) { b = bb + nO; - ij_off = ij_off0 + b * nBas; + i_A1 = i_A0 + bb; + i_I1 = i_I0 + b * nBas; + i = 0; while(i < nO) { - ia = i * aa_max + aa; - jb_off = ia * ia_max; - - while(j < nO) { - jb = j * aa_max + bb; - A[jb + jb_off] = 2.0 * ERI[i + j * nBas3 + ij_off]; - if(a==b && i==j) { - A[jb + jb_off] += eps[a] - eps[i]; + i_A2 = i_A1 + i * nVS; + i_I2 = i_I1 + i; + + j = 0; + while(j < nO) { + + A[i_A2 + j * nV] = 2.0 * ERI[i_I2 + j * nBas3]; + if((a==b) && (i==j)) { + A[i_A2 + j * nV] += eps[a] - eps[i]; } j ++; diff --git a/src/mod/cu_quack_module.f90 b/src/mod/cu_quack_module.f90 index f73cc6c..a786587 100644 --- a/src/mod/cu_quack_module.f90 +++ b/src/mod/cu_quack_module.f90 @@ -8,16 +8,16 @@ module cu_quack_module interface - subroutine ph_drpa(nO, nBas, eps, ERI, & + subroutine ph_drpa(nO, nBas, nS, eps, ERI, & Omega, XpY, XmY) bind(C, name = "ph_drpa") import c_int, c_double - integer(c_int), intent(in), value :: nO, nBas + integer(c_int), intent(in), value :: nO, nBas, nS real(c_double), intent(in) :: eps(nBas) real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas) - real(c_double), intent(out) :: Omega(nO*nBas) - real(c_double), intent(out) :: XpY(nO*nBas,nO*nBas) - real(c_double), intent(out) :: XmY(nO*nBas,nO*nBas) + real(c_double), intent(out) :: Omega(nS) + real(c_double), intent(out) :: XpY(nS,nS) + real(c_double), intent(out) :: XmY(nS,nS) end subroutine ph_drpa From b2e82bff2ec054ef1abc8e6e5a6bf72696bff1f3 Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Wed, 27 Nov 2024 08:51:04 +0100 Subject: [PATCH 07/39] added cuda kernel for A-trip-dRPA --- src/cuda/include/ph_drpa.h | 2 +- src/cuda/src/ph_drpa.c | 2 +- ...{phlr_drpa_a_sing.cu => ph_drpa_a_sing.cu} | 8 +- src/cuda/src/ph_drpa_a_trip.cu | 89 +++++++++++++++++++ 4 files changed, 95 insertions(+), 6 deletions(-) rename src/cuda/src/{phlr_drpa_a_sing.cu => ph_drpa_a_sing.cu} (79%) create mode 100644 src/cuda/src/ph_drpa_a_trip.cu diff --git a/src/cuda/include/ph_drpa.h b/src/cuda/include/ph_drpa.h index 6f1ed07..1f68857 100644 --- a/src/cuda/include/ph_drpa.h +++ b/src/cuda/include/ph_drpa.h @@ -5,6 +5,6 @@ extern void check_Cuda_Errors(cudaError_t err, const char * msg, const char * file, int line); extern void check_Cublas_Errors(cublasStatus_t status, const char * msg, const char * file, int line); -extern void phLR_dRPA_A_sing(int nO, int nBas, double *eps, double *ERI, double *A); +extern void ph_dRPA_A_sing(int nO, int nBas, double *eps, double *ERI, double *A); #endif diff --git a/src/cuda/src/ph_drpa.c b/src/cuda/src/ph_drpa.c index 0e0f671..304f1c1 100644 --- a/src/cuda/src/ph_drpa.c +++ b/src/cuda/src/ph_drpa.c @@ -40,7 +40,7 @@ int ph_drpa(int nO, int nBas, int nS, double *h_eps, double *h_ERI, // construct A matrix double *d_A; check_Cuda_Errors(cudaMalloc((void**)&d_A, nS * nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); - phLR_dRPA_A_sing(nO, nBas, d_eps, d_ERI, d_A); + ph_dRPA_A_sing(nO, nBas, d_eps, d_ERI, d_A); check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); diff --git a/src/cuda/src/phlr_drpa_a_sing.cu b/src/cuda/src/ph_drpa_a_sing.cu similarity index 79% rename from src/cuda/src/phlr_drpa_a_sing.cu rename to src/cuda/src/ph_drpa_a_sing.cu index 7844d66..5308c1d 100644 --- a/src/cuda/src/phlr_drpa_a_sing.cu +++ b/src/cuda/src/ph_drpa_a_sing.cu @@ -1,6 +1,6 @@ #include -__global__ void phLR_dRPA_A_sing_kernel(int nO, int nBas, double *eps, double *ERI, double *A) { +__global__ void ph_dRPA_A_sing_kernel(int nO, int nBas, double *eps, double *ERI, double *A) { int i, j, a, b; @@ -64,7 +64,7 @@ __global__ void phLR_dRPA_A_sing_kernel(int nO, int nBas, double *eps, double *E -extern "C" void phLR_dRPA_A_sing(int nO, int nBas, double *eps, double *ERI, double *A) { +extern "C" void ph_dRPA_A_sing(int nO, int nBas, double *eps, double *ERI, double *A) { int size = nBas - nO; @@ -76,11 +76,11 @@ extern "C" void phLR_dRPA_A_sing(int nO, int nBas, double *eps, double *ERI, dou dim3 dimBlock(sBlocks, sBlocks, 1); - printf("lunching phLR_dRPA_A_sing_kernel with %dx%d blocks and %dx%d threads/block\n", + printf("lunching ph_dRPA_A_sing_kernel with %dx%d blocks and %dx%d threads/block\n", nBlocks, nBlocks, sBlocks, sBlocks); - phLR_dRPA_A_sing_kernel<<>>(nO, nBas, eps, ERI, A); + ph_dRPA_A_sing_kernel<<>>(nO, nBas, eps, ERI, A); } diff --git a/src/cuda/src/ph_drpa_a_trip.cu b/src/cuda/src/ph_drpa_a_trip.cu new file mode 100644 index 0000000..a58b5a2 --- /dev/null +++ b/src/cuda/src/ph_drpa_a_trip.cu @@ -0,0 +1,89 @@ +#include + +__global__ void ph_dRPA_A_trip_kernel(int nO, int nBas, double *eps, double *A) { + + + int i, j, a, b; + int aa, bb; + int nV, nS, nVS; + int nBas2, nBas3; + int i_A0, i_A1, i_A2; + int i_I0, i_I1, i_I2; + + nV = nBas - nO; + nS = nO * nV; + nVS = nV * nS; + + nBas2 = nBas * nBas; + nBas3 = nBas2 * nBas; + + aa = blockIdx.x * blockDim.x + threadIdx.x; + bb = blockIdx.y * blockDim.y + threadIdx.y; + + while(aa < nV) { + a = aa + nO; + + i_A0 = aa * nS; + i_I0 = a * nBas2; + + while(bb < nV) { + b = bb + nO; + + i_A1 = i_A0 + bb; + i_I1 = i_I0 + b * nBas; + + i = 0; + while(i < nO) { + + i_A2 = i_A1 + i * nVS; + i_I2 = i_I1 + i; + + j = 0; + while(j < nO) { + + A[i_A2 + j * nV] = 0.0; + if((a==b) && (i==j)) { + A[i_A2 + j * nV] += eps[a] - eps[i]; + } + + j ++; + } // j + + i ++; + } // i + + bb += blockDim.y * gridDim.y; + } // bb + + aa += blockDim.x * gridDim.x; + } // aa + +} + + + + + +extern "C" void ph_dRPA_A_trip(int nO, int nBas, double *eps, double *A) { + + + int size = nBas - nO; + + int sBlocks = 32; + int nBlocks = (size + sBlocks - 1) / sBlocks; + + dim3 dimGrid(nBlocks, nBlocks, 1); + dim3 dimBlock(sBlocks, sBlocks, 1); + + + printf("lunching ph_dRPA_A_trip_kernel with %dx%d blocks and %dx%d threads/block\n", + nBlocks, nBlocks, sBlocks, sBlocks); + + + ph_dRPA_A_trip_kernel<<>>(nO, nBas, eps, A); + +} + + + + From a7fa8fd49a2defa72b3a7c86b12ceb16a8e0c454 Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Wed, 27 Nov 2024 09:03:33 +0100 Subject: [PATCH 08/39] added cuda kernel for B-sing-dRPA --- src/cuda/src/ph_drpa_b_sing.cu | 82 ++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 src/cuda/src/ph_drpa_b_sing.cu diff --git a/src/cuda/src/ph_drpa_b_sing.cu b/src/cuda/src/ph_drpa_b_sing.cu new file mode 100644 index 0000000..2a59142 --- /dev/null +++ b/src/cuda/src/ph_drpa_b_sing.cu @@ -0,0 +1,82 @@ +#include + +__global__ void ph_dRPA_B_sing_kernel(int nO, int nV, int nBas, int nS, double *ERI, double *B) { + + + int i, j, a, b; + int aa, bb; + int nVS; + int nBas2, nBas3; + int i_B0, i_B1, i_B2; + int i_I0, i_I1, i_I2; + + nVS = nV * nS; + + nBas2 = nBas * nBas; + nBas3 = nBas2 * nBas; + + aa = blockIdx.x * blockDim.x + threadIdx.x; + bb = blockIdx.y * blockDim.y + threadIdx.y; + + while(aa < nV) { + a = aa + nO; + + i_B0 = aa * nS; + i_I0 = a * nBas2; + + while(bb < nV) { + b = bb + nO; + + i_B1 = i_B0 + bb; + i_I1 = i_I0 + b * nBas3; + + i = 0; + while(i < nO) { + + i_B2 = i_B1 + i * nVS; + i_I2 = i_I1 + i; + + j = 0; + while(j < nO) { + + B[i_B2 + j * nV] = 2.0 * ERI[i_I2 + j * nBas]; + + j ++; + } // j + + i ++; + } // i + + bb += blockDim.y * gridDim.y; + } // bb + + aa += blockDim.x * gridDim.x; + } // aa + +} + + + + + +extern "C" void ph_dRPA_B_sing(int nO, int nV, int nBas, int nS, double *ERI, double *B) { + + + int sBlocks = 32; + int nBlocks = (nV + sBlocks - 1) / sBlocks; + + dim3 dimGrid(nBlocks, nBlocks, 1); + dim3 dimBlock(sBlocks, sBlocks, 1); + + + printf("lunching ph_dRPA_B_sing_kernel with %dx%d blocks and %dx%d threads/block\n", + nBlocks, nBlocks, sBlocks, sBlocks); + + + ph_dRPA_B_sing_kernel<<>>(nO, nV, nBas, nS, ERI, B); + +} + + + + From 0df9203eb3aa80749528cb956968da383f69e8be Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Wed, 27 Nov 2024 09:08:22 +0100 Subject: [PATCH 09/39] dded cuda kernel for B-trip-dRPA --- src/cuda/Makefile | 2 +- src/cuda/include/ph_drpa.h | 2 +- src/cuda/src/ph_drpa.c | 4 +- src/cuda/src/ph_drpa_a_sing.cu | 14 +++---- src/cuda/src/ph_drpa_a_trip.cu | 22 +++-------- src/cuda/src/ph_drpa_b_trip.cu | 72 ++++++++++++++++++++++++++++++++++ 6 files changed, 87 insertions(+), 29 deletions(-) create mode 100644 src/cuda/src/ph_drpa_b_trip.cu diff --git a/src/cuda/Makefile b/src/cuda/Makefile index 61ac1d2..32c2101 100644 --- a/src/cuda/Makefile +++ b/src/cuda/Makefile @@ -39,6 +39,6 @@ $(F_OBJ): $(F_SRC) .PHONY: clean clean: - rm -f $(BLD_DIR)/* + rm $(BLD_DIR)/* diff --git a/src/cuda/include/ph_drpa.h b/src/cuda/include/ph_drpa.h index 1f68857..1786a2f 100644 --- a/src/cuda/include/ph_drpa.h +++ b/src/cuda/include/ph_drpa.h @@ -5,6 +5,6 @@ extern void check_Cuda_Errors(cudaError_t err, const char * msg, const char * file, int line); extern void check_Cublas_Errors(cublasStatus_t status, const char * msg, const char * file, int line); -extern void ph_dRPA_A_sing(int nO, int nBas, double *eps, double *ERI, double *A); +extern void ph_dRPA_A_sing(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *A); #endif diff --git a/src/cuda/src/ph_drpa.c b/src/cuda/src/ph_drpa.c index 304f1c1..c90bfab 100644 --- a/src/cuda/src/ph_drpa.c +++ b/src/cuda/src/ph_drpa.c @@ -13,6 +13,8 @@ int ph_drpa(int nO, int nBas, int nS, double *h_eps, double *h_ERI, double *d_eps; double *d_ERI; + int nV = nBas - nO; + int nBas2 = nBas * nBas; int nBas4 = nBas2 * nBas2; @@ -40,7 +42,7 @@ int ph_drpa(int nO, int nBas, int nS, double *h_eps, double *h_ERI, // construct A matrix double *d_A; check_Cuda_Errors(cudaMalloc((void**)&d_A, nS * nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); - ph_dRPA_A_sing(nO, nBas, d_eps, d_ERI, d_A); + ph_dRPA_A_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_A); check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); diff --git a/src/cuda/src/ph_drpa_a_sing.cu b/src/cuda/src/ph_drpa_a_sing.cu index 5308c1d..917cdfb 100644 --- a/src/cuda/src/ph_drpa_a_sing.cu +++ b/src/cuda/src/ph_drpa_a_sing.cu @@ -1,17 +1,15 @@ #include -__global__ void ph_dRPA_A_sing_kernel(int nO, int nBas, double *eps, double *ERI, double *A) { +__global__ void ph_dRPA_A_sing_kernel(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *A) { int i, j, a, b; int aa, bb; - int nV, nS, nVS; + int nVS; int nBas2, nBas3; int i_A0, i_A1, i_A2; int i_I0, i_I1, i_I2; - nV = nBas - nO; - nS = nO * nV; nVS = nV * nS; nBas2 = nBas * nBas; @@ -64,13 +62,11 @@ __global__ void ph_dRPA_A_sing_kernel(int nO, int nBas, double *eps, double *ERI -extern "C" void ph_dRPA_A_sing(int nO, int nBas, double *eps, double *ERI, double *A) { +extern "C" void ph_dRPA_A_sing(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *A) { - int size = nBas - nO; - int sBlocks = 32; - int nBlocks = (size + sBlocks - 1) / sBlocks; + int nBlocks = (nV + sBlocks - 1) / sBlocks; dim3 dimGrid(nBlocks, nBlocks, 1); dim3 dimBlock(sBlocks, sBlocks, 1); @@ -80,7 +76,7 @@ extern "C" void ph_dRPA_A_sing(int nO, int nBas, double *eps, double *ERI, doubl nBlocks, nBlocks, sBlocks, sBlocks); - ph_dRPA_A_sing_kernel<<>>(nO, nBas, eps, ERI, A); + ph_dRPA_A_sing_kernel<<>>(nO, nV, nBas, nS, eps, ERI, A); } diff --git a/src/cuda/src/ph_drpa_a_trip.cu b/src/cuda/src/ph_drpa_a_trip.cu index a58b5a2..b28e7b8 100644 --- a/src/cuda/src/ph_drpa_a_trip.cu +++ b/src/cuda/src/ph_drpa_a_trip.cu @@ -1,22 +1,15 @@ #include -__global__ void ph_dRPA_A_trip_kernel(int nO, int nBas, double *eps, double *A) { +__global__ void ph_dRPA_A_trip_kernel(int nO, int nV, int nBas, int nS, double *eps, double *A) { int i, j, a, b; int aa, bb; - int nV, nS, nVS; - int nBas2, nBas3; + int nVS; int i_A0, i_A1, i_A2; - int i_I0, i_I1, i_I2; - nV = nBas - nO; - nS = nO * nV; nVS = nV * nS; - nBas2 = nBas * nBas; - nBas3 = nBas2 * nBas; - aa = blockIdx.x * blockDim.x + threadIdx.x; bb = blockIdx.y * blockDim.y + threadIdx.y; @@ -24,19 +17,16 @@ __global__ void ph_dRPA_A_trip_kernel(int nO, int nBas, double *eps, double *A) a = aa + nO; i_A0 = aa * nS; - i_I0 = a * nBas2; while(bb < nV) { b = bb + nO; i_A1 = i_A0 + bb; - i_I1 = i_I0 + b * nBas; i = 0; while(i < nO) { i_A2 = i_A1 + i * nVS; - i_I2 = i_I1 + i; j = 0; while(j < nO) { @@ -64,13 +54,11 @@ __global__ void ph_dRPA_A_trip_kernel(int nO, int nBas, double *eps, double *A) -extern "C" void ph_dRPA_A_trip(int nO, int nBas, double *eps, double *A) { +extern "C" void ph_dRPA_A_trip(int nO, int nV, int nBas, int nS, double *eps, double *A) { - int size = nBas - nO; - int sBlocks = 32; - int nBlocks = (size + sBlocks - 1) / sBlocks; + int nBlocks = (nV + sBlocks - 1) / sBlocks; dim3 dimGrid(nBlocks, nBlocks, 1); dim3 dimBlock(sBlocks, sBlocks, 1); @@ -80,7 +68,7 @@ extern "C" void ph_dRPA_A_trip(int nO, int nBas, double *eps, double *A) { nBlocks, nBlocks, sBlocks, sBlocks); - ph_dRPA_A_trip_kernel<<>>(nO, nBas, eps, A); + ph_dRPA_A_trip_kernel<<>>(nO, nV, nBas, nS, eps, A); } diff --git a/src/cuda/src/ph_drpa_b_trip.cu b/src/cuda/src/ph_drpa_b_trip.cu new file mode 100644 index 0000000..6122164 --- /dev/null +++ b/src/cuda/src/ph_drpa_b_trip.cu @@ -0,0 +1,72 @@ +#include + +__global__ void ph_dRPA_B_trip_kernel(int nO, int nV, int nBas, int nS, double *B) { + + + int i, j; + int aa, bb; + int nVS; + int i_B0, i_B1, i_B2; + + nVS = nV * nS; + + aa = blockIdx.x * blockDim.x + threadIdx.x; + bb = blockIdx.y * blockDim.y + threadIdx.y; + + while(aa < nV) { + + i_B0 = aa * nS; + + while(bb < nV) { + + i_B1 = i_B0 + bb; + + i = 0; + while(i < nO) { + + i_B2 = i_B1 + i * nVS; + + j = 0; + while(j < nO) { + + B[i_B2 + j * nV] = 0.0; + + j ++; + } // j + + i ++; + } // i + + bb += blockDim.y * gridDim.y; + } // bb + + aa += blockDim.x * gridDim.x; + } // aa + +} + + + + + +extern "C" void ph_dRPA_B_trip(int nO, int nV, int nBas, int nS, double *B) { + + + int sBlocks = 32; + int nBlocks = (nV + sBlocks - 1) / sBlocks; + + dim3 dimGrid(nBlocks, nBlocks, 1); + dim3 dimBlock(sBlocks, sBlocks, 1); + + + printf("lunching ph_dRPA_B_trip_kernel with %dx%d blocks and %dx%d threads/block\n", + nBlocks, nBlocks, sBlocks, sBlocks); + + + ph_dRPA_B_trip_kernel<<>>(nO, nV, nBas, nS, B); + +} + + + + From 10cb7f931de11f2b55eef08d928473d0019ce05d Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Wed, 27 Nov 2024 18:29:30 +0100 Subject: [PATCH 10/39] added & tested diagonalyzation with Cusolver! --- src/RPA/phRRPA.f90 | 25 ++++++---- src/cuda/include/ph_drpa.h | 10 ---- src/cuda/include/ph_rpa.h | 10 ++++ src/cuda/include/utils.cuh | 9 ---- src/cuda/include/utils.h | 9 ++++ src/cuda/src/diag_dense_dsy_mat.cu | 41 ++++++++++++++++ src/cuda/src/ph_drpa.c | 59 ----------------------- src/cuda/src/ph_drpa_tda.c | 76 ++++++++++++++++++++++++++++++ src/cuda/src/utils.cu | 70 +++++++++++++++++++++++++++ src/make_ninja.py | 2 +- src/mod/cu_quack_module.f90 | 11 ++--- 11 files changed, 228 insertions(+), 94 deletions(-) delete mode 100644 src/cuda/include/ph_drpa.h create mode 100644 src/cuda/include/ph_rpa.h delete mode 100644 src/cuda/include/utils.cuh create mode 100644 src/cuda/include/utils.h create mode 100644 src/cuda/src/diag_dense_dsy_mat.cu delete mode 100644 src/cuda/src/ph_drpa.c create mode 100644 src/cuda/src/ph_drpa_tda.c diff --git a/src/RPA/phRRPA.f90 b/src/RPA/phRRPA.f90 index 7d00061..577ab6c 100644 --- a/src/RPA/phRRPA.f90 +++ b/src/RPA/phRRPA.f90 @@ -31,6 +31,7 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO, ! Local variables + integer :: i integer :: ispin logical :: dRPA double precision :: lambda @@ -40,7 +41,7 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO, double precision,allocatable :: XpY(:,:) double precision,allocatable :: XmY(:,:) ! DEBUG - double precision, allocatable :: XpY_gpu(:,:), XmY_gpu(:,:), Om_gpu(:) + !double precision, allocatable :: XpY_gpu(:,:), XmY_gpu(:,:), Om_gpu(:) double precision :: EcRPA(nspin) @@ -78,15 +79,21 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO, call phLR_A(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,eHF,ERI,Aph) if(.not.TDA) call phLR_B(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,ERI,Bph) - ! DEBUG - allocate(Om_gpu(nS), XpY_gpu(nS,nS), XmY_gpu(nS,nS)) - call ph_drpa(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om_gpu(1), XpY_gpu(1,1), XmY_gpu(1,1)) - print *, ' CPU:', Aph(1,1) - print *, ' GPU:', XpY_gpu(1,1) - print *, ' GPU:', XmY_gpu(1,1) - stop - call phLR(TDA,nS,Aph,Bph,EcRPA(ispin),Om,XpY,XmY) + + !! DEBUG + !allocate(Om_gpu(nS), XpY_gpu(nS,nS), XmY_gpu(nS,nS)) + !call ph_drpa_tda(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om_gpu(1), XpY_gpu(1,1)) + !do i = 1, nS + ! print *, i, Om(i), Om_gpu(i) + ! if(dabs(Om(i) - Om_gpu(i)) .gt. 1d-13) then + ! print *, 'GPU FAILED!' + ! stop + ! endif + !enddo + !print *, 'GPU DONE!' + !stop + call print_excitation_energies('phRPA@RHF','singlet',nS,Om) call phLR_transition_vectors(.true.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY) diff --git a/src/cuda/include/ph_drpa.h b/src/cuda/include/ph_drpa.h deleted file mode 100644 index 1786a2f..0000000 --- a/src/cuda/include/ph_drpa.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef PH_DRPA - -#define PH_DRPA - -extern void check_Cuda_Errors(cudaError_t err, const char * msg, const char * file, int line); -extern void check_Cublas_Errors(cublasStatus_t status, const char * msg, const char * file, int line); - -extern void ph_dRPA_A_sing(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *A); - -#endif diff --git a/src/cuda/include/ph_rpa.h b/src/cuda/include/ph_rpa.h new file mode 100644 index 0000000..36f111f --- /dev/null +++ b/src/cuda/include/ph_rpa.h @@ -0,0 +1,10 @@ +#ifndef PH_RPA + +#define PH_RPA + +extern void ph_dRPA_A_sing(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *A); +extern void ph_dRPA_B_sing(int nO, int nV, int nBas, int nS, double *ERI, double *B); + +extern void diag_dn_dsyevd(int n, int *info, double *W, double *A); + +#endif diff --git a/src/cuda/include/utils.cuh b/src/cuda/include/utils.cuh deleted file mode 100644 index 1a91732..0000000 --- a/src/cuda/include/utils.cuh +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef UTILS -#define UTILS - -extern "C" void check_Cuda_Errors(cudaError_t err, const char* msg, const char* file, int line); - -extern "C" void check_Cublas_Errors(cublasStatus_t status, const char* msg, const char* file, int line); - - -#endif diff --git a/src/cuda/include/utils.h b/src/cuda/include/utils.h new file mode 100644 index 0000000..f5a6403 --- /dev/null +++ b/src/cuda/include/utils.h @@ -0,0 +1,9 @@ +#ifndef UTILS + +#define UTILS + +extern void check_Cuda_Errors(cudaError_t err, const char *msg, const char *file, int line); +extern void check_Cublas_Errors(cublasStatus_t status, const char *msg, const char *file, int line); +extern void check_Cusolver_Errors(cusolverStatus_t status, const char *msg, const char *file, int line); + +#endif diff --git a/src/cuda/src/diag_dense_dsy_mat.cu b/src/cuda/src/diag_dense_dsy_mat.cu new file mode 100644 index 0000000..b800501 --- /dev/null +++ b/src/cuda/src/diag_dense_dsy_mat.cu @@ -0,0 +1,41 @@ +#include +#include +#include +#include + + + +extern "C" void diag_dn_dsyevd(int n, int *info, double *W, double *A) { + + cusolverDnHandle_t cusolverH = NULL; + cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; // Compute eigenvalues and eigenvectors + cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER; // Upper triangular part of the matrix is stored + + int lwork = 0; + double *work = NULL; + + //check_Cusolver_Errors(cusolverDnCreate(&cusolverH), "cusolverDnCreate", __FILE__, __LINE__); + cusolverDnCreate(&cusolverH); + + // Query workspace size + //check_Cusolver_Errors(cusolverDnDsyevd_bufferSize(cusolverH, jobz, uplo, n, A, n, W, &lwork), + // "cusolverDnDsyevd_bufferSize", __FILE__, __LINE__); + //check_Cuda_Errors(cudaMalloc((void**)&work, sizeof(double) * lwork), + // "cudaMemcpy", __FILE__, __LINE__); + cusolverDnDsyevd_bufferSize(cusolverH, jobz, uplo, n, A, n, W, &lwork); + cudaMalloc((void**)&work, sizeof(double) * lwork); + + // Compute eigenvalues and eigenvectors + //check_Cusolver_Errors(cusolverDnDsyevd(cusolverH, jobz, uplo, n, A, n, W, work, lwork, info), + // "cusolverDnDsyevd", __FILE__, __LINE__); + cusolverDnDsyevd(cusolverH, jobz, uplo, n, A, n, W, work, lwork, info); + + // Clean up + //check_Cuda_Errors(cudaFree(work), "cudaFree", __FILE__, __LINE__); + //check_Cusolver_Errors(cusolverDnDestroy(cusolverH), "cusolverDnDestroy", __FILE__, __LINE__); + + cudaFree(work); + cusolverDnDestroy(cusolverH); + +} + diff --git a/src/cuda/src/ph_drpa.c b/src/cuda/src/ph_drpa.c deleted file mode 100644 index c90bfab..0000000 --- a/src/cuda/src/ph_drpa.c +++ /dev/null @@ -1,59 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "ph_drpa.h" - -int ph_drpa(int nO, int nBas, int nS, double *h_eps, double *h_ERI, - double *h_Omega, double *h_XpY, double *h_XmY) { - - double *d_eps; - double *d_ERI; - - int nV = nBas - nO; - - int nBas2 = nBas * nBas; - int nBas4 = nBas2 * nBas2; - - int ia, jb; - for (ia = 0; ia < nS; ia++) { - h_Omega[ia] = 0.0; - for (jb = 0; jb < nS; jb++) { - h_XmY[jb + ia * nS] = 4.0; - //h_XpY[jb + ia * nS] = 5.0; - } - } - - - check_Cuda_Errors(cudaMalloc((void**)&d_eps, nBas * sizeof(double)), - "cudaMalloc", __FILE__, __LINE__); - check_Cuda_Errors(cudaMalloc((void**)&d_ERI, nBas4 * sizeof(double)), - "cudaMalloc", __FILE__, __LINE__); - - - check_Cuda_Errors(cudaMemcpy(d_eps, h_eps, nBas * sizeof(double), cudaMemcpyHostToDevice), - "cudaMemcpy", __FILE__, __LINE__); - check_Cuda_Errors(cudaMemcpy(d_ERI, h_ERI, nBas4 * sizeof(double), cudaMemcpyHostToDevice), - "cudaMemcpy", __FILE__, __LINE__); - - // construct A matrix - double *d_A; - check_Cuda_Errors(cudaMalloc((void**)&d_A, nS * nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); - ph_dRPA_A_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_A); - check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); - - - check_Cuda_Errors(cudaMemcpy(h_XpY, d_A, nS * nS * sizeof(double), cudaMemcpyDeviceToHost), - "cudaMemcpy", __FILE__, __LINE__); - - check_Cuda_Errors(cudaFree(d_eps), "cudaFree", __FILE__, __LINE__); - check_Cuda_Errors(cudaFree(d_ERI), "cudaFree", __FILE__, __LINE__); - check_Cuda_Errors(cudaFree(d_A), "cudaFree", __FILE__, __LINE__); - - - return 0; -} - diff --git a/src/cuda/src/ph_drpa_tda.c b/src/cuda/src/ph_drpa_tda.c new file mode 100644 index 0000000..2d6bccf --- /dev/null +++ b/src/cuda/src/ph_drpa_tda.c @@ -0,0 +1,76 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "utils.h" +#include "ph_rpa.h" + +void ph_drpa_tda(int nO, int nBas, int nS, double *h_eps, double *h_ERI, + double *h_Omega, double *h_X) { + + double *d_eps = NULL; + double *d_ERI = NULL; + + int nV = nBas - nO; + + int nBas2 = nBas * nBas; + int nBas4 = nBas2 * nBas2; + + + check_Cuda_Errors(cudaMalloc((void**)&d_eps, nBas * sizeof(double)), + "cudaMalloc", __FILE__, __LINE__); + check_Cuda_Errors(cudaMalloc((void**)&d_ERI, nBas4 * sizeof(double)), + "cudaMalloc", __FILE__, __LINE__); + + check_Cuda_Errors(cudaMemcpy(d_eps, h_eps, nBas * sizeof(double), cudaMemcpyHostToDevice), + "cudaMemcpy", __FILE__, __LINE__); + check_Cuda_Errors(cudaMemcpy(d_ERI, h_ERI, nBas4 * sizeof(double), cudaMemcpyHostToDevice), + "cudaMemcpy", __FILE__, __LINE__); + + // construct A + double *d_A = NULL; + check_Cuda_Errors(cudaMalloc((void**)&d_A, nS * nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); + + ph_dRPA_A_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_A); + check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); + + + // diagonalize A + int *d_info = NULL; + double *d_Omega = NULL; + check_Cuda_Errors(cudaMalloc((void**)&d_info, sizeof(int)), + "cudaMalloc", __FILE__, __LINE__); + check_Cuda_Errors(cudaMalloc((void**)&d_Omega, nS * sizeof(double)), + "cudaMalloc", __FILE__, __LINE__); + + diag_dn_dsyevd(nS, d_info, d_Omega, d_A); + check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); + + int info_gpu = 0; + check_Cuda_Errors(cudaMemcpy(&info_gpu, d_info, sizeof(int), cudaMemcpyDeviceToHost), + "cudaMemcpy", __FILE__, __LINE__); + if (info_gpu != 0) { + printf("Error: diag_dn_dsyevd returned error code %d\n", info_gpu); + exit(EXIT_FAILURE); + } + + + check_Cuda_Errors(cudaMemcpy(h_X, d_A, nS * nS * sizeof(double), cudaMemcpyDeviceToHost), + "cudaMemcpy", __FILE__, __LINE__); + + check_Cuda_Errors(cudaMemcpy(h_Omega, d_Omega, nS * sizeof(double), cudaMemcpyDeviceToHost), + "cudaMemcpy", __FILE__, __LINE__); + + check_Cuda_Errors(cudaFree(d_info), "cudaFree", __FILE__, __LINE__); + check_Cuda_Errors(cudaFree(d_eps), "cudaFree", __FILE__, __LINE__); + check_Cuda_Errors(cudaFree(d_ERI), "cudaFree", __FILE__, __LINE__); + check_Cuda_Errors(cudaFree(d_A), "cudaFree", __FILE__, __LINE__); + check_Cuda_Errors(cudaFree(d_Omega), "cudaFree", __FILE__, __LINE__); + + +} + diff --git a/src/cuda/src/utils.cu b/src/cuda/src/utils.cu index ff5d83d..673b21e 100644 --- a/src/cuda/src/utils.cu +++ b/src/cuda/src/utils.cu @@ -3,6 +3,7 @@ #include #include #include +#include @@ -53,3 +54,72 @@ extern "C" void check_Cublas_Errors(cublasStatus_t status, const char* msg, cons } + +const char* cusolver_Get_Error_String(cusolverStatus_t status) { + switch (status) { + case CUSOLVER_STATUS_SUCCESS: + return "CUSOLVER_STATUS_SUCCESS"; + case CUSOLVER_STATUS_NOT_INITIALIZED: + return "CUSOLVER_STATUS_NOT_INITIALIZED"; + case CUSOLVER_STATUS_ALLOC_FAILED: + return "CUSOLVER_STATUS_ALLOC_FAILED"; + case CUSOLVER_STATUS_INVALID_VALUE: + return "CUSOLVER_STATUS_INVALID_VALUE"; + case CUSOLVER_STATUS_ARCH_MISMATCH: + return "CUSOLVER_STATUS_ARCH_MISMATCH"; + case CUSOLVER_STATUS_MAPPING_ERROR: + return "CUSOLVER_STATUS_MAPPING_ERROR"; + case CUSOLVER_STATUS_EXECUTION_FAILED: + return "CUSOLVER_STATUS_EXECUTION_FAILED"; + case CUSOLVER_STATUS_INTERNAL_ERROR: + return "CUSOLVER_STATUS_INTERNAL_ERROR"; + case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + case CUSOLVER_STATUS_NOT_SUPPORTED: + return "CUSOLVER_STATUS_NOT_SUPPORTED"; + case CUSOLVER_STATUS_ZERO_PIVOT: + return "CUSOLVER_STATUS_ZERO_PIVOT"; + case CUSOLVER_STATUS_INVALID_LICENSE: + return "CUSOLVER_STATUS_INVALID_LICENSE"; + case CUSOLVER_STATUS_IRS_PARAMS_NOT_INITIALIZED: + return "CUSOLVER_STATUS_IRS_PARAMS_NOT_INITIALIZED"; + case CUSOLVER_STATUS_IRS_PARAMS_INVALID: + return "CUSOLVER_STATUS_IRS_PARAMS_INVALID"; + case CUSOLVER_STATUS_IRS_PARAMS_INVALID_PREC: + return "CUSOLVER_STATUS_IRS_PARAMS_INVALID_PREC"; + case CUSOLVER_STATUS_IRS_PARAMS_INVALID_REFINE: + return "CUSOLVER_STATUS_IRS_PARAMS_INVALID_REFINE"; + case CUSOLVER_STATUS_IRS_PARAMS_INVALID_MAXITER: + return "CUSOLVER_STATUS_IRS_PARAMS_INVALID_MAXITER"; + case CUSOLVER_STATUS_IRS_INTERNAL_ERROR: + return "CUSOLVER_STATUS_IRS_INTERNAL_ERROR"; + case CUSOLVER_STATUS_IRS_NOT_SUPPORTED: + return "CUSOLVER_STATUS_IRS_NOT_SUPPORTED"; + case CUSOLVER_STATUS_IRS_OUT_OF_RANGE: + return "CUSOLVER_STATUS_IRS_OUT_OF_RANGE"; + case CUSOLVER_STATUS_IRS_NRHS_NOT_SUPPORTED_FOR_REFINE_GMRES: + return "CUSOLVER_STATUS_IRS_NRHS_NOT_SUPPORTED_FOR_REFINE_GMRES"; + case CUSOLVER_STATUS_IRS_INFOS_NOT_INITIALIZED: + return "CUSOLVER_STATUS_IRS_INFOS_NOT_INITIALIZED"; + case CUSOLVER_STATUS_IRS_INFOS_NOT_DESTROYED: + return "CUSOLVER_STATUS_IRS_INFOS_NOT_DESTROYED"; + case CUSOLVER_STATUS_IRS_MATRIX_SINGULAR: + return "CUSOLVER_STATUS_IRS_MATRIX_SINGULAR"; + case CUSOLVER_STATUS_INVALID_WORKSPACE: + return "CUSOLVER_STATUS_INVALID_WORKSPACE"; + default: + return "UNKNOWN CUSOLVER ERROR"; + } +} + +extern "C" void check_Cusolver_Errors(cusolverStatus_t status, const char* msg, const char* file, int line) { + + const char* err = cusolver_Get_Error_String(status); + + if (status != CUSOLVER_STATUS_SUCCESS) { + printf("CUSOLVER Error in %s at line %d\n", file, line); + printf("%s - %s\n", msg, err); + exit(EXIT_FAILURE); + } +} + diff --git a/src/make_ninja.py b/src/make_ninja.py index 3d558d3..770c405 100755 --- a/src/make_ninja.py +++ b/src/make_ninja.py @@ -111,7 +111,7 @@ else: if USE_GPU: compiler_tmp = compiler.strip().split('\n') - compiler_tmp[0] += " -L{}/src/cuda/build -lcuquack -lcudart -lcublas".format(QUACK_ROOT) + compiler_tmp[0] += " -L{}/src/cuda/build -lcuquack -lcudart -lcublas -lcusolver".format(QUACK_ROOT) compiler_exe = '\n'.join(compiler_tmp) else: compiler_exe = compiler diff --git a/src/mod/cu_quack_module.f90 b/src/mod/cu_quack_module.f90 index a786587..cf974bb 100644 --- a/src/mod/cu_quack_module.f90 +++ b/src/mod/cu_quack_module.f90 @@ -8,18 +8,17 @@ module cu_quack_module interface - subroutine ph_drpa(nO, nBas, nS, eps, ERI, & - Omega, XpY, XmY) bind(C, name = "ph_drpa") + subroutine ph_drpa_tda(nO, nBas, nS, eps, ERI, & + Omega, X) bind(C, name = "ph_drpa_tda") import c_int, c_double integer(c_int), intent(in), value :: nO, nBas, nS real(c_double), intent(in) :: eps(nBas) real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas) real(c_double), intent(out) :: Omega(nS) - real(c_double), intent(out) :: XpY(nS,nS) - real(c_double), intent(out) :: XmY(nS,nS) + real(c_double), intent(out) :: X(nS,nS) - end subroutine ph_drpa + end subroutine ph_drpa_tda end interface @@ -29,7 +28,7 @@ module cu_quack_module subroutine cu_quack_module_test() implicit none - print*, ' hello from mod_test' + print*, ' hello from cu_quack_module' end subroutine cu_quack_module_test ! --- From 2bdd48be2e396240e3b8363a3066a893d18b864b Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Thu, 28 Nov 2024 08:55:19 +0100 Subject: [PATCH 11/39] include mod for CPU & GPU --- src/make_ninja.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/make_ninja.py b/src/make_ninja.py index 770c405..fc8e819 100755 --- a/src/make_ninja.py +++ b/src/make_ninja.py @@ -195,11 +195,8 @@ exe_dirs = ["QuAcK"] lib_dirs = list(filter(lambda x: os.path.isdir(x) and \ x not in ["cuda"] and \ x not in exe_dirs, os.listdir("."))) -if USE_GPU: - i = lib_dirs.index("mod") - lib_dirs[0], lib_dirs[i] = lib_dirs[i], lib_dirs[0] -else: - lib_dirs.remove("mod") +i = lib_dirs.index("mod") +lib_dirs[0], lib_dirs[i] = lib_dirs[i], lib_dirs[0] def create_ninja_in_libdir(directory): def write_rule(f, source_file, replace): From 6ff3fc2905b2334a267fc38ddfab81759153a182 Mon Sep 17 00:00:00 2001 From: Pierre-Francois Loos Date: Thu, 28 Nov 2024 09:40:14 +0100 Subject: [PATCH 12/39] comment use --- src/LR/ppLR_C.f90 | 4 ++-- src/LR/ppLR_D.f90 | 4 ++-- src/RPA/phRRPA.f90 | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/LR/ppLR_C.f90 b/src/LR/ppLR_C.f90 index fbc3900..e9551d4 100644 --- a/src/LR/ppLR_C.f90 +++ b/src/LR/ppLR_C.f90 @@ -32,8 +32,8 @@ subroutine ppLR_C(ispin,nOrb,nC,nO,nV,nR,nVV,lambda,e,ERI,Cpp) ! Define the chemical potential -! eF = e(nO) + e(nO+1) - eF = 0d0 + eF = e(nO) + e(nO+1) +! eF = 0d0 ! Build C matrix for the singlet manifold diff --git a/src/LR/ppLR_D.f90 b/src/LR/ppLR_D.f90 index 3744f5c..3b7ff0d 100644 --- a/src/LR/ppLR_D.f90 +++ b/src/LR/ppLR_D.f90 @@ -30,8 +30,8 @@ subroutine ppLR_D(ispin,nOrb,nC,nO,nV,nR,nOO,lambda,e,ERI,Dpp) ! Define the chemical potential -! eF = e(nO) + e(nO+1) - eF = 0d0 + eF = e(nO) + e(nO+1) +! eF = 0d0 ! Build the D matrix for the singlet manifold diff --git a/src/RPA/phRRPA.f90 b/src/RPA/phRRPA.f90 index 577ab6c..99b96ae 100644 --- a/src/RPA/phRRPA.f90 +++ b/src/RPA/phRRPA.f90 @@ -1,6 +1,6 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF) - use cu_quack_module +! use cu_quack_module ! Perform a direct random phase approximation calculation From e43a56e0420b4af023f6a26b4adbcf1e90e96af2 Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Thu, 28 Nov 2024 16:10:05 +0100 Subject: [PATCH 13/39] refact in GPU direct & fixed bug in read dipole integrals --- src/GPU/phRRPA_GPU.f90 | 156 ++++++++++++++++++ src/LR/phLR.f90 | 4 + src/QuAcK/QuAcK.f90 | 1 - src/RPA/RRPA.f90 | 2 + src/RPA/phRRPA.f90 | 18 -- src/cuda/src/ph_drpa_a_sing.cu | 6 +- .../src/{ph_drpa_tda.c => ph_drpa_tda_sing.c} | 51 ++++-- src/make_ninja.py | 2 + src/mod/cu_quack_module.f90 | 50 +++++- src/utils/read_dipole_integrals.f90 | 6 +- 10 files changed, 258 insertions(+), 38 deletions(-) create mode 100644 src/GPU/phRRPA_GPU.f90 rename src/cuda/src/{ph_drpa_tda.c => ph_drpa_tda_sing.c} (58%) diff --git a/src/GPU/phRRPA_GPU.f90 b/src/GPU/phRRPA_GPU.f90 new file mode 100644 index 0000000..a4c81b1 --- /dev/null +++ b/src/GPU/phRRPA_GPU.f90 @@ -0,0 +1,156 @@ +subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF) + + use cu_quack_module + +! Perform a direct random phase approximation calculation + + implicit none + include 'parameters.h' + include 'quadrature.h' + +! Input variables + + logical,intent(in) :: dotest + + logical,intent(in) :: TDA + logical,intent(in) :: doACFDT + logical,intent(in) :: exchange_kernel + logical,intent(in) :: singlet + logical,intent(in) :: triplet + integer,intent(in) :: nBas + integer,intent(in) :: nC + integer,intent(in) :: nO + integer,intent(in) :: nV + integer,intent(in) :: nR + integer,intent(in) :: nS + double precision,intent(in) :: ENuc + double precision,intent(in) :: ERHF + double precision,intent(in) :: eHF(nBas) + double precision,intent(in) :: ERI(nBas,nBas,nBas,nBas) + double precision,intent(in) :: dipole_int(nBas,nBas,ncart) + +! Local variables + + integer :: i + integer :: ispin + logical :: dRPA + double precision :: t1, t2 + double precision :: lambda + double precision,allocatable :: Aph(:,:) + double precision,allocatable :: Bph(:,:) + double precision,allocatable :: Om(:) + double precision,allocatable :: XpY(:,:) + double precision,allocatable :: XmY(:,:) + ! DEBUG + !double precision, allocatable :: XpY_gpu(:,:), XmY_gpu(:,:), Om_gpu(:) + + double precision :: EcRPA(nspin) + +! Hello world + + write(*,*) + write(*,*)'*********************************' + write(*,*)'* Restricted ph-RPA Calculation *' + write(*,*)'*********************************' + write(*,*) + +! TDA + + if(TDA) then + write(*,*) 'Tamm-Dancoff approximation activated!' + write(*,*) + end if + +! Initialization + + dRPA = .true. + EcRPA(:) = 0d0 + lambda = 1d0 + +! Memory allocation + + allocate(Om(nS),XpY(nS,nS),XmY(nS,nS),Aph(nS,nS),Bph(nS,nS)) + +! Singlet manifold + + if(singlet) then + + if(TDA) then + + call wall_time(t1) + call ph_drpa_tda_sing(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om(1), XpY(1,1)) + call wall_time(t2) + print*, 'diag time on GPU (sec):', t2 - t1 + stop + XmY(:,:) = XpY(:,:) + + else + + ! TODO + !call ph_drpa_sing(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om(1), XpY(1,1)) + !XmY(:,:) = XpY(:,:) + + endif + + call print_excitation_energies('phRPA@RHF','singlet',nS,Om) + call phLR_transition_vectors(.true.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY) + + end if + +! Triplet manifold + + if(triplet) then + + ispin = 2 + + call phLR_A(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,eHF,ERI,Aph) + if(.not.TDA) call phLR_B(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,ERI,Bph) + + call phLR(TDA,nS,Aph,Bph,EcRPA(ispin),Om,XpY,XmY) + call print_excitation_energies('phRPA@RHF','triplet',nS,Om) + call phLR_transition_vectors(.false.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY) + + end if + + if(exchange_kernel) then + + EcRPA(1) = 0.5d0*EcRPA(1) + EcRPA(2) = 1.5d0*EcRPA(2) + + end if + + write(*,*) + write(*,*)'-------------------------------------------------------------------------------' + write(*,'(2X,A50,F20.10,A3)') 'Tr@phRPA@RHF correlation energy (singlet) = ',EcRPA(1),' au' + write(*,'(2X,A50,F20.10,A3)') 'Tr@phRPA@RHF correlation energy (triplet) = ',EcRPA(2),' au' + write(*,'(2X,A50,F20.10,A3)') 'Tr@phRPA@RHF correlation energy = ',sum(EcRPA),' au' + write(*,'(2X,A50,F20.10,A3)') 'Tr@phRPA@RHF total energy = ',ENuc + ERHF + sum(EcRPA),' au' + write(*,*)'-------------------------------------------------------------------------------' + write(*,*) + + deallocate(Om,XpY,XmY,Aph,Bph) + +! Compute the correlation energy via the adiabatic connection + + if(doACFDT) then + + call phACFDT(exchange_kernel,dRPA,TDA,singlet,triplet,nBas,nC,nO,nV,nR,nS,ERI,eHF,EcRPA) + + write(*,*) + write(*,*)'-------------------------------------------------------------------------------' + write(*,'(2X,A50,F20.10,A3)') 'AC@phRPA@RHF correlation energy (singlet) = ',EcRPA(1),' au' + write(*,'(2X,A50,F20.10,A3)') 'AC@phRPA@RHF correlation energy (triplet) = ',EcRPA(2),' au' + write(*,'(2X,A50,F20.10,A3)') 'AC@phRPA@RHF correlation energy = ',sum(EcRPA),' au' + write(*,'(2X,A50,F20.10,A3)') 'AC@phRPA@RHF total energy = ',ENuc + ERHF + sum(EcRPA),' au' + write(*,*)'-------------------------------------------------------------------------------' + write(*,*) + + end if + + if(dotest) then + + call dump_test_value('R','phRPA correlation energy',sum(EcRPA)) + + end if + +end subroutine diff --git a/src/LR/phLR.f90 b/src/LR/phLR.f90 index ca49344..f3cb274 100644 --- a/src/LR/phLR.f90 +++ b/src/LR/phLR.f90 @@ -15,6 +15,7 @@ subroutine phLR(TDA,nS,Aph,Bph,EcRPA,Om,XpY,XmY) ! Local variables double precision :: trace_matrix + double precision :: t1, t2 double precision,allocatable :: ApB(:,:) double precision,allocatable :: AmB(:,:) double precision,allocatable :: AmBSq(:,:) @@ -38,7 +39,10 @@ subroutine phLR(TDA,nS,Aph,Bph,EcRPA,Om,XpY,XmY) if(TDA) then XpY(:,:) = Aph(:,:) + !call wall_time(t1) call diagonalize_matrix(nS,XpY,Om) + !call wall_time(t2) + !print*, 'diag time on CPU (sec):', t2 - t1 XpY(:,:) = transpose(XpY(:,:)) XmY(:,:) = XpY(:,:) diff --git a/src/QuAcK/QuAcK.f90 b/src/QuAcK/QuAcK.f90 index 586bfff..e6e5865 100644 --- a/src/QuAcK/QuAcK.f90 +++ b/src/QuAcK/QuAcK.f90 @@ -177,7 +177,6 @@ program QuAcK call read_integrals(working_dir,nBas,S,T,V,Hc,ERI_AO) call read_dipole_integrals(working_dir,nBas,dipole_int_AO) - call wall_time(end_int) t_int = end_int - start_int diff --git a/src/RPA/RRPA.f90 b/src/RPA/RRPA.f90 index ab38932..861a0a2 100644 --- a/src/RPA/RRPA.f90 +++ b/src/RPA/RRPA.f90 @@ -50,6 +50,8 @@ subroutine RRPA(dotest,dophRPA,dophRPAx,docrRPA,doppRPA,TDA,doACFDT,exchange_ker write(*,'(A65,1X,F9.3,A8)') 'Total wall time for RPA = ',t_RPA,' seconds' write(*,*) + !call phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF) + end if !------------------------------------------------------------------------ diff --git a/src/RPA/phRRPA.f90 b/src/RPA/phRRPA.f90 index 99b96ae..4cf004c 100644 --- a/src/RPA/phRRPA.f90 +++ b/src/RPA/phRRPA.f90 @@ -1,7 +1,5 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF) -! use cu_quack_module - ! Perform a direct random phase approximation calculation implicit none @@ -40,8 +38,6 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO, double precision,allocatable :: Om(:) double precision,allocatable :: XpY(:,:) double precision,allocatable :: XmY(:,:) - ! DEBUG - !double precision, allocatable :: XpY_gpu(:,:), XmY_gpu(:,:), Om_gpu(:) double precision :: EcRPA(nspin) @@ -80,20 +76,6 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO, if(.not.TDA) call phLR_B(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,ERI,Bph) call phLR(TDA,nS,Aph,Bph,EcRPA(ispin),Om,XpY,XmY) - - !! DEBUG - !allocate(Om_gpu(nS), XpY_gpu(nS,nS), XmY_gpu(nS,nS)) - !call ph_drpa_tda(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om_gpu(1), XpY_gpu(1,1)) - !do i = 1, nS - ! print *, i, Om(i), Om_gpu(i) - ! if(dabs(Om(i) - Om_gpu(i)) .gt. 1d-13) then - ! print *, 'GPU FAILED!' - ! stop - ! endif - !enddo - !print *, 'GPU DONE!' - !stop - call print_excitation_energies('phRPA@RHF','singlet',nS,Om) call phLR_transition_vectors(.true.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY) diff --git a/src/cuda/src/ph_drpa_a_sing.cu b/src/cuda/src/ph_drpa_a_sing.cu index 917cdfb..27678e0 100644 --- a/src/cuda/src/ph_drpa_a_sing.cu +++ b/src/cuda/src/ph_drpa_a_sing.cu @@ -10,6 +10,8 @@ __global__ void ph_dRPA_A_sing_kernel(int nO, int nV, int nBas, int nS, double * int i_A0, i_A1, i_A2; int i_I0, i_I1, i_I2; + bool a_eq_b; + nVS = nV * nS; nBas2 = nBas * nBas; @@ -27,6 +29,8 @@ __global__ void ph_dRPA_A_sing_kernel(int nO, int nV, int nBas, int nS, double * while(bb < nV) { b = bb + nO; + a_eq_b = a == b; + i_A1 = i_A0 + bb; i_I1 = i_I0 + b * nBas; @@ -40,7 +44,7 @@ __global__ void ph_dRPA_A_sing_kernel(int nO, int nV, int nBas, int nS, double * while(j < nO) { A[i_A2 + j * nV] = 2.0 * ERI[i_I2 + j * nBas3]; - if((a==b) && (i==j)) { + if(a_eq_b && (i==j)) { A[i_A2 + j * nV] += eps[a] - eps[i]; } diff --git a/src/cuda/src/ph_drpa_tda.c b/src/cuda/src/ph_drpa_tda_sing.c similarity index 58% rename from src/cuda/src/ph_drpa_tda.c rename to src/cuda/src/ph_drpa_tda_sing.c index 2d6bccf..7b6e476 100644 --- a/src/cuda/src/ph_drpa_tda.c +++ b/src/cuda/src/ph_drpa_tda_sing.c @@ -9,8 +9,8 @@ #include "utils.h" #include "ph_rpa.h" -void ph_drpa_tda(int nO, int nBas, int nS, double *h_eps, double *h_ERI, - double *h_Omega, double *h_X) { +void ph_drpa_tda_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, + double *h_Omega, double *h_X) { double *d_eps = NULL; double *d_ERI = NULL; @@ -20,23 +20,39 @@ void ph_drpa_tda(int nO, int nBas, int nS, double *h_eps, double *h_ERI, int nBas2 = nBas * nBas; int nBas4 = nBas2 * nBas2; + float elapsedTime; + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + check_Cuda_Errors(cudaMalloc((void**)&d_eps, nBas * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); check_Cuda_Errors(cudaMalloc((void**)&d_ERI, nBas4 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); + cudaEventRecord(start, 0); check_Cuda_Errors(cudaMemcpy(d_eps, h_eps, nBas * sizeof(double), cudaMemcpyHostToDevice), "cudaMemcpy", __FILE__, __LINE__); check_Cuda_Errors(cudaMemcpy(d_ERI, h_ERI, nBas4 * sizeof(double), cudaMemcpyHostToDevice), "cudaMemcpy", __FILE__, __LINE__); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsedTime, start, stop); + printf("Time elapsed on CPU->GPU transfer = %f msec\n", elapsedTime); // construct A double *d_A = NULL; check_Cuda_Errors(cudaMalloc((void**)&d_A, nS * nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); + cudaEventRecord(start, 0); ph_dRPA_A_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_A); check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsedTime, start, stop); + printf("Time elapsed on A kernel = %f msec\n", elapsedTime); // diagonalize A @@ -47,24 +63,35 @@ void ph_drpa_tda(int nO, int nBas, int nS, double *h_eps, double *h_ERI, check_Cuda_Errors(cudaMalloc((void**)&d_Omega, nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); + cudaEventRecord(start, 0); diag_dn_dsyevd(nS, d_info, d_Omega, d_A); check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsedTime, start, stop); + printf("Time elapsed on diagonalization = %f msec\n", elapsedTime); - int info_gpu = 0; - check_Cuda_Errors(cudaMemcpy(&info_gpu, d_info, sizeof(int), cudaMemcpyDeviceToHost), - "cudaMemcpy", __FILE__, __LINE__); - if (info_gpu != 0) { - printf("Error: diag_dn_dsyevd returned error code %d\n", info_gpu); - exit(EXIT_FAILURE); - } - - + //int info_gpu = 0; + cudaEventRecord(start, 0); + //check_Cuda_Errors(cudaMemcpy(&info_gpu, d_info, sizeof(int), cudaMemcpyDeviceToHost), + // "cudaMemcpy", __FILE__, __LINE__); + //if (info_gpu != 0) { + // printf("Error: diag_dn_dsyevd returned error code %d\n", info_gpu); + // exit(EXIT_FAILURE); + //} check_Cuda_Errors(cudaMemcpy(h_X, d_A, nS * nS * sizeof(double), cudaMemcpyDeviceToHost), "cudaMemcpy", __FILE__, __LINE__); - check_Cuda_Errors(cudaMemcpy(h_Omega, d_Omega, nS * sizeof(double), cudaMemcpyDeviceToHost), "cudaMemcpy", __FILE__, __LINE__); + cudaEventRecord(start, 0); + diag_dn_dsyevd(nS, d_info, d_Omega, d_A); + check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsedTime, start, stop); + printf("Time elapsed on GPU -> CPU transfer = %f msec\n", elapsedTime); + check_Cuda_Errors(cudaFree(d_info), "cudaFree", __FILE__, __LINE__); check_Cuda_Errors(cudaFree(d_eps), "cudaFree", __FILE__, __LINE__); check_Cuda_Errors(cudaFree(d_ERI), "cudaFree", __FILE__, __LINE__); diff --git a/src/make_ninja.py b/src/make_ninja.py index fc8e819..712c9b9 100755 --- a/src/make_ninja.py +++ b/src/make_ninja.py @@ -197,6 +197,8 @@ lib_dirs = list(filter(lambda x: os.path.isdir(x) and \ x not in exe_dirs, os.listdir("."))) i = lib_dirs.index("mod") lib_dirs[0], lib_dirs[i] = lib_dirs[i], lib_dirs[0] +if not USE_GPU: + lib_dirs.remove("GPU") def create_ninja_in_libdir(directory): def write_rule(f, source_file, replace): diff --git a/src/mod/cu_quack_module.f90 b/src/mod/cu_quack_module.f90 index cf974bb..978fc0c 100644 --- a/src/mod/cu_quack_module.f90 +++ b/src/mod/cu_quack_module.f90 @@ -8,8 +8,8 @@ module cu_quack_module interface - subroutine ph_drpa_tda(nO, nBas, nS, eps, ERI, & - Omega, X) bind(C, name = "ph_drpa_tda") + subroutine ph_drpa_tda_sing(nO, nBas, nS, eps, ERI, & + Omega, X) bind(C, name = "ph_drpa_tda_sing") import c_int, c_double integer(c_int), intent(in), value :: nO, nBas, nS @@ -18,7 +18,51 @@ module cu_quack_module real(c_double), intent(out) :: Omega(nS) real(c_double), intent(out) :: X(nS,nS) - end subroutine ph_drpa_tda + end subroutine ph_drpa_tda_sing + + ! --- + + subroutine ph_drpa_tda_trip(nO, nBas, nS, eps, ERI, & + Omega, X) bind(C, name = "ph_drpa_tda_trip") + + import c_int, c_double + integer(c_int), intent(in), value :: nO, nBas, nS + real(c_double), intent(in) :: eps(nBas) + real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas) + real(c_double), intent(out) :: Omega(nS) + real(c_double), intent(out) :: X(nS,nS) + + end subroutine ph_drpa_tda_trip + + ! --- + + subroutine ph_drpa_sing(nO, nBas, nS, eps, ERI, & + Omega, X) bind(C, name = "ph_drpa_sing") + + import c_int, c_double + integer(c_int), intent(in), value :: nO, nBas, nS + real(c_double), intent(in) :: eps(nBas) + real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas) + real(c_double), intent(out) :: Omega(nS) + real(c_double), intent(out) :: X(nS,nS) + + end subroutine ph_drpa_sing + + ! --- + + subroutine ph_drpa_trip(nO, nBas, nS, eps, ERI, & + Omega, X) bind(C, name = "ph_drpa_trip") + + import c_int, c_double + integer(c_int), intent(in), value :: nO, nBas, nS + real(c_double), intent(in) :: eps(nBas) + real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas) + real(c_double), intent(out) :: Omega(nS) + real(c_double), intent(out) :: X(nS,nS) + + end subroutine ph_drpa_trip + + ! --- end interface diff --git a/src/utils/read_dipole_integrals.f90 b/src/utils/read_dipole_integrals.f90 index 15af2cb..8f56354 100644 --- a/src/utils/read_dipole_integrals.f90 +++ b/src/utils/read_dipole_integrals.f90 @@ -39,7 +39,7 @@ subroutine read_dipole_integrals(working_dir,nBas,R) else do - read(21,*,iostat=ios) mu,nu,Dip + read(21, '(I5, I5, E25.17)', iostat=ios) mu, nu, Dip if(ios /= 0) exit R(mu,nu,1) = Dip R(nu,mu,1) = Dip @@ -62,7 +62,7 @@ subroutine read_dipole_integrals(working_dir,nBas,R) else do - read(22,*,iostat=ios) mu,nu,Dip + read(22, '(I5, I5, E25.17)', iostat=ios) mu, nu, Dip if(ios /= 0) exit R(mu,nu,2) = Dip R(nu,mu,2) = Dip @@ -85,7 +85,7 @@ subroutine read_dipole_integrals(working_dir,nBas,R) else do - read(23,*,iostat=ios) mu,nu,Dip + read(23, '(I5, I5, E25.17)', iostat=ios) mu, nu, Dip if(ios /= 0) exit R(mu,nu,3) = Dip R(nu,mu,3) = Dip From 1a091a0707471c649df51bb6f4f6c7c1158b5bf1 Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Thu, 28 Nov 2024 18:41:00 +0100 Subject: [PATCH 14/39] few modifs --- src/RPA/phRRPA.f90 | 7 +++++++ src/cuda/src/diag_dense_dsy_mat.cu | 1 + src/cuda/src/ph_drpa_tda_sing.c | 4 ---- src/utils/read_dipole_integrals.f90 | 20 ++++++++++---------- 4 files changed, 18 insertions(+), 14 deletions(-) diff --git a/src/RPA/phRRPA.f90 b/src/RPA/phRRPA.f90 index 4cf004c..3ddeb03 100644 --- a/src/RPA/phRRPA.f90 +++ b/src/RPA/phRRPA.f90 @@ -32,6 +32,7 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO, integer :: i integer :: ispin logical :: dRPA + double precision :: t1, t2 double precision :: lambda double precision,allocatable :: Aph(:,:) double precision,allocatable :: Bph(:,:) @@ -72,10 +73,16 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO, ispin = 1 + !call wall_time(t1) call phLR_A(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,eHF,ERI,Aph) + !call wall_time(t2) + !print *, "wall time for A on CPU (sec) = ", t2 - t1 if(.not.TDA) call phLR_B(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,ERI,Bph) + !call wall_time(t1) call phLR(TDA,nS,Aph,Bph,EcRPA(ispin),Om,XpY,XmY) + !call wall_time(t2) + !print *, "wall time diag A on CPU (sec) = ", t2 - t1 call print_excitation_energies('phRPA@RHF','singlet',nS,Om) call phLR_transition_vectors(.true.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY) diff --git a/src/cuda/src/diag_dense_dsy_mat.cu b/src/cuda/src/diag_dense_dsy_mat.cu index b800501..4e3b0d3 100644 --- a/src/cuda/src/diag_dense_dsy_mat.cu +++ b/src/cuda/src/diag_dense_dsy_mat.cu @@ -10,6 +10,7 @@ extern "C" void diag_dn_dsyevd(int n, int *info, double *W, double *A) { cusolverDnHandle_t cusolverH = NULL; cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; // Compute eigenvalues and eigenvectors cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER; // Upper triangular part of the matrix is stored + //cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER; // Upper triangular part of the matrix is stored int lwork = 0; double *work = NULL; diff --git a/src/cuda/src/ph_drpa_tda_sing.c b/src/cuda/src/ph_drpa_tda_sing.c index 7b6e476..52ae341 100644 --- a/src/cuda/src/ph_drpa_tda_sing.c +++ b/src/cuda/src/ph_drpa_tda_sing.c @@ -83,10 +83,6 @@ void ph_drpa_tda_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, "cudaMemcpy", __FILE__, __LINE__); check_Cuda_Errors(cudaMemcpy(h_Omega, d_Omega, nS * sizeof(double), cudaMemcpyDeviceToHost), "cudaMemcpy", __FILE__, __LINE__); - - cudaEventRecord(start, 0); - diag_dn_dsyevd(nS, d_info, d_Omega, d_A); - check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&elapsedTime, start, stop); diff --git a/src/utils/read_dipole_integrals.f90 b/src/utils/read_dipole_integrals.f90 index 8f56354..e3ea558 100644 --- a/src/utils/read_dipole_integrals.f90 +++ b/src/utils/read_dipole_integrals.f90 @@ -20,7 +20,7 @@ subroutine read_dipole_integrals(working_dir,nBas,R) double precision,intent(out) :: R(nBas,nBas,ncart) - integer :: status, ios + integer :: ios character(len=256) :: file_path @@ -29,9 +29,9 @@ subroutine read_dipole_integrals(working_dir,nBas,R) R(:,:,:) = 0d0 file_path = trim(working_dir) // '/int/x.dat' - open(unit=21, file=file_path, status='old', action='read', iostat=status) + open(unit=21, file=file_path, status='old', action='read', iostat=ios) - if(status /= 0) then + if(ios /= 0) then print *, "Error opening file: ", file_path stop @@ -39,7 +39,7 @@ subroutine read_dipole_integrals(working_dir,nBas,R) else do - read(21, '(I5, I5, E25.17)', iostat=ios) mu, nu, Dip + read(21, '(I7, I7, E25.17)', iostat=ios) mu, nu, Dip if(ios /= 0) exit R(mu,nu,1) = Dip R(nu,mu,1) = Dip @@ -52,9 +52,9 @@ subroutine read_dipole_integrals(working_dir,nBas,R) ! --- file_path = trim(working_dir) // '/int/y.dat' - open(unit=22, file=file_path, status='old', action='read', iostat=status) + open(unit=22, file=file_path, status='old', action='read', iostat=ios) - if(status /= 0) then + if(ios /= 0) then print *, "Error opening file: ", file_path stop @@ -62,7 +62,7 @@ subroutine read_dipole_integrals(working_dir,nBas,R) else do - read(22, '(I5, I5, E25.17)', iostat=ios) mu, nu, Dip + read(22, '(I7, I7, E25.17)', iostat=ios) mu, nu, Dip if(ios /= 0) exit R(mu,nu,2) = Dip R(nu,mu,2) = Dip @@ -75,9 +75,9 @@ subroutine read_dipole_integrals(working_dir,nBas,R) ! --- file_path = trim(working_dir) // '/int/z.dat' - open(unit=23, file=file_path, status='old', action='read', iostat=status) + open(unit=23, file=file_path, status='old', action='read', iostat=ios) - if(status /= 0) then + if(ios /= 0) then print *, "Error opening file: ", file_path stop @@ -85,7 +85,7 @@ subroutine read_dipole_integrals(working_dir,nBas,R) else do - read(23, '(I5, I5, E25.17)', iostat=ios) mu, nu, Dip + read(23, '(I7, I7, E25.17)', iostat=ios) mu, nu, Dip if(ios /= 0) exit R(mu,nu,3) = Dip R(nu,mu,3) = Dip From 1235823334de743e9b12e7e86087dc1c2b5c4fa3 Mon Sep 17 00:00:00 2001 From: AbdAmmar Date: Fri, 29 Nov 2024 03:07:18 +0100 Subject: [PATCH 15/39] refact for GPU --- input/hardware | 2 + src/GPU/cu_quack_module.f90 | 53 +++++++++++++++++++++ src/LR/phLR.f90 | 3 -- src/QuAcK/QuAcK.f90 | 10 +++- src/QuAcK/RQuAcK.f90 | 6 ++- src/QuAcK/read_hardware.f90 | 45 ++++++++++++++++++ src/RPA/RRPA.f90 | 12 +++-- src/RPA/phRRPA.f90 | 1 + src/{GPU => RPA}/phRRPA_GPU.f90 | 37 ++++++++++++++- src/cuda/src/ph_drpa_a_sing.cu | 2 + src/cuda/src/ph_drpa_tda_sing.c | 13 ++++-- src/make_ninja.py | 31 +++++++++---- src/mod/cu_quack_module.f90 | 82 --------------------------------- 13 files changed, 190 insertions(+), 107 deletions(-) create mode 100644 input/hardware create mode 100644 src/GPU/cu_quack_module.f90 create mode 100644 src/QuAcK/read_hardware.f90 rename src/{GPU => RPA}/phRRPA_GPU.f90 (80%) delete mode 100644 src/mod/cu_quack_module.f90 diff --git a/input/hardware b/input/hardware new file mode 100644 index 0000000..99c395a --- /dev/null +++ b/input/hardware @@ -0,0 +1,2 @@ +# if True (T), use GPU + F diff --git a/src/GPU/cu_quack_module.f90 b/src/GPU/cu_quack_module.f90 new file mode 100644 index 0000000..eba54a0 --- /dev/null +++ b/src/GPU/cu_quack_module.f90 @@ -0,0 +1,53 @@ +module cu_quack_module + + use, intrinsic :: iso_c_binding + + implicit none + +!#ifdef USE_GPU +! interface +! subroutine ph_drpa_tda_sing(nO, nBas, nS, eps, ERI, & +! Omega, X) bind(C, name = "ph_drpa_tda_sing") +! +! import c_int, c_double +! integer(c_int), intent(in), value :: nO, nBas, nS +! real(c_double), intent(in) :: eps(nBas) +! real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas) +! real(c_double), intent(out) :: Omega(nS) +! real(c_double), intent(out) :: X(nS,nS) +! +! end subroutine ph_drpa_tda_sing +! end interface +!#else +! interface +! subroutine ph_drpa_tda_sing(nO, nBas, nS, eps, ERI, Omega, X) +! integer, intent(in) :: nO, nBas, nS +! double precision, intent(in) :: eps(nBas) +! double precision, intent(in) :: ERI(nBas,nBas,nBas,nBas) +! double precision, intent(out) :: Omega(nS) +! double precision, intent(out) :: X(nS,nS) +! end subroutine ph_drpa_tda_sing +! end interface +!#endif + + interface + + subroutine ph_drpa_tda_sing(nO, nBas, nS, eps, ERI, & + Omega, X) bind(C, name = "ph_drpa_tda_sing") + + import c_int, c_double + integer(c_int), intent(in), value :: nO, nBas, nS + real(c_double), intent(in) :: eps(nBas) + real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas) + real(c_double), intent(out) :: Omega(nS) + real(c_double), intent(out) :: X(nS,nS) + + end subroutine ph_drpa_tda_sing + + end interface + + ! --- + +end module cu_quack_module + + diff --git a/src/LR/phLR.f90 b/src/LR/phLR.f90 index f3cb274..6457e27 100644 --- a/src/LR/phLR.f90 +++ b/src/LR/phLR.f90 @@ -39,10 +39,7 @@ subroutine phLR(TDA,nS,Aph,Bph,EcRPA,Om,XpY,XmY) if(TDA) then XpY(:,:) = Aph(:,:) - !call wall_time(t1) call diagonalize_matrix(nS,XpY,Om) - !call wall_time(t2) - !print*, 'diag time on CPU (sec):', t2 - t1 XpY(:,:) = transpose(XpY(:,:)) XmY(:,:) = XpY(:,:) diff --git a/src/QuAcK/QuAcK.f90 b/src/QuAcK/QuAcK.f90 index e6e5865..9072904 100644 --- a/src/QuAcK/QuAcK.f90 +++ b/src/QuAcK/QuAcK.f90 @@ -44,6 +44,8 @@ program QuAcK logical :: reg_MP + logical :: use_gpu + integer :: maxSCF_CC,max_diis_CC double precision :: thresh_CC @@ -134,6 +136,12 @@ program QuAcK doACFDT,exchange_kernel,doXBS, & dophBSE,dophBSE2,doppBSE,dBSE,dTDA) +!------------------! +! Hardware ! +!------------------! + + call read_hardware(working_dir,use_gpu) + !------------------------------------! ! Read input information ! !------------------------------------! @@ -218,7 +226,7 @@ program QuAcK !-------------------------! if(doRQuAcK) & - call RQuAcK(doRtest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT, & + call RQuAcK(use_gpu,doRtest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT, & dodrCCD,dorCCD,docrCCD,dolCCD,doCIS,doCIS_D,doCID,doCISD,doFCI,dophRPA,dophRPAx,docrRPA,doppRPA, & doG0F2,doevGF2,doqsGF2,doufG0F02,doG0F3,doevGF3,doG0W0,doevGW,doqsGW,doufG0W0,doufGW, & doG0T0pp,doevGTpp,doqsGTpp,doufG0T0pp,doG0T0eh,doevGTeh,doqsGTeh, & diff --git a/src/QuAcK/RQuAcK.f90 b/src/QuAcK/RQuAcK.f90 index 57f0fe5..4a03be8 100644 --- a/src/QuAcK/RQuAcK.f90 +++ b/src/QuAcK/RQuAcK.f90 @@ -1,4 +1,4 @@ -subroutine RQuAcK(dotest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT, & +subroutine RQuAcK(use_gpu,dotest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT, & dodrCCD,dorCCD,docrCCD,dolCCD,doCIS,doCIS_D,doCID,doCISD,doFCI,dophRPA,dophRPAx,docrRPA,doppRPA, & doG0F2,doevGF2,doqsGF2,doufG0F02,doG0F3,doevGF3,doG0W0,doevGW,doqsGW,doufG0W0,doufGW, & doG0T0pp,doevGTpp,doqsGTpp,doufG0T0pp,doG0T0eh,doevGTeh,doqsGTeh, & @@ -14,6 +14,8 @@ subroutine RQuAcK(dotest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,d implicit none include 'parameters.h' + logical,intent(in) :: use_gpu + logical,intent(in) :: dotest logical,intent(in) :: doRHF,doROHF @@ -274,7 +276,7 @@ subroutine RQuAcK(dotest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,d if(doRPA) then call wall_time(start_RPA) - call RRPA(dotest,dophRPA,dophRPAx,docrRPA,doppRPA,TDA,doACFDT,exchange_kernel,singlet,triplet, & + call RRPA(use_gpu,dotest,dophRPA,dophRPAx,docrRPA,doppRPA,TDA,doACFDT,exchange_kernel,singlet,triplet, & nOrb,nC,nO,nV,nR,nS,ENuc,ERHF,ERI_MO,dipole_int_MO,eHF) call wall_time(end_RPA) diff --git a/src/QuAcK/read_hardware.f90 b/src/QuAcK/read_hardware.f90 new file mode 100644 index 0000000..c014d44 --- /dev/null +++ b/src/QuAcK/read_hardware.f90 @@ -0,0 +1,45 @@ +subroutine read_hardware(working_dir,use_gpu) + +! Read desired methods + + implicit none + +! Input variables + + character(len=256),intent(in) :: working_dir + +! Output variables + + logical,intent(out) :: use_gpu + +! Local variables + + character(len=1) :: ans + integer :: ios + character(len=256) :: file_path + +! Open file with method specification + + file_path = trim(working_dir) // '/input/hardware' + open(unit=1, file=file_path, status='old', action='read', iostat=ios) + + if(ios /= 0) then + + use_gpu = .False. + + else + + read(1,*) + read(1,*) ans + if(ans == 'T') then + use_gpu = .true. + else + use_gpu = .False. + endif + + endif + + ! Close file with options + close(unit=1) + +end subroutine diff --git a/src/RPA/RRPA.f90 b/src/RPA/RRPA.f90 index 861a0a2..5ab652f 100644 --- a/src/RPA/RRPA.f90 +++ b/src/RPA/RRPA.f90 @@ -1,4 +1,4 @@ -subroutine RRPA(dotest,dophRPA,dophRPAx,docrRPA,doppRPA,TDA,doACFDT,exchange_kernel,singlet,triplet, & +subroutine RRPA(use_gpu,dotest,dophRPA,dophRPAx,docrRPA,doppRPA,TDA,doACFDT,exchange_kernel,singlet,triplet, & nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF) ! Random-phase approximation module @@ -8,6 +8,8 @@ subroutine RRPA(dotest,dophRPA,dophRPAx,docrRPA,doppRPA,TDA,doACFDT,exchange_ker ! Input variables + logical,intent(in) :: use_gpu + logical,intent(in) :: dotest logical,intent(in) :: dophRPA @@ -43,15 +45,17 @@ subroutine RRPA(dotest,dophRPA,dophRPAx,docrRPA,doppRPA,TDA,doACFDT,exchange_ker if(dophRPA) then call wall_time(start_RPA) - call phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF) + if (use_gpu) then + call phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF) + else + call phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF) + endif call wall_time(end_RPA) t_RPA = end_RPA - start_RPA write(*,'(A65,1X,F9.3,A8)') 'Total wall time for RPA = ',t_RPA,' seconds' write(*,*) - !call phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF) - end if !------------------------------------------------------------------------ diff --git a/src/RPA/phRRPA.f90 b/src/RPA/phRRPA.f90 index 3ddeb03..31e7854 100644 --- a/src/RPA/phRRPA.f90 +++ b/src/RPA/phRRPA.f90 @@ -83,6 +83,7 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO, call phLR(TDA,nS,Aph,Bph,EcRPA(ispin),Om,XpY,XmY) !call wall_time(t2) !print *, "wall time diag A on CPU (sec) = ", t2 - t1 + !stop call print_excitation_energies('phRPA@RHF','singlet',nS,Om) call phLR_transition_vectors(.true.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY) diff --git a/src/GPU/phRRPA_GPU.f90 b/src/RPA/phRRPA_GPU.f90 similarity index 80% rename from src/GPU/phRRPA_GPU.f90 rename to src/RPA/phRRPA_GPU.f90 index a4c81b1..839c7be 100644 --- a/src/GPU/phRRPA_GPU.f90 +++ b/src/RPA/phRRPA_GPU.f90 @@ -1,3 +1,5 @@ +#ifdef USE_GPU + subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF) use cu_quack_module @@ -69,7 +71,8 @@ subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC ! Memory allocation - allocate(Om(nS),XpY(nS,nS),XmY(nS,nS),Aph(nS,nS),Bph(nS,nS)) + allocate(Om(nS),XpY(nS,nS),XmY(nS,nS)) + !allocate(Aph(nS,nS),Bph(nS,nS)) ! Singlet manifold @@ -77,6 +80,7 @@ subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC if(TDA) then + print*, 'start diag on GPU:' call wall_time(t1) call ph_drpa_tda_sing(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om(1), XpY(1,1)) call wall_time(t2) @@ -154,3 +158,34 @@ subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC end if end subroutine + +#else + +subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF) + + implicit none + include 'parameters.h' + include 'quadrature.h' + + logical,intent(in) :: dotest + logical,intent(in) :: TDA + logical,intent(in) :: doACFDT + logical,intent(in) :: exchange_kernel + logical,intent(in) :: singlet + logical,intent(in) :: triplet + integer,intent(in) :: nBas + integer,intent(in) :: nC + integer,intent(in) :: nO + integer,intent(in) :: nV + integer,intent(in) :: nR + integer,intent(in) :: nS + double precision,intent(in) :: ENuc + double precision,intent(in) :: ERHF + double precision,intent(in) :: eHF(nBas) + double precision,intent(in) :: ERI(nBas,nBas,nBas,nBas) + double precision,intent(in) :: dipole_int(nBas,nBas,ncart) + print*, "compile with USE_GPU FLAG!" + stop +end + +#endif diff --git a/src/cuda/src/ph_drpa_a_sing.cu b/src/cuda/src/ph_drpa_a_sing.cu index 27678e0..be5e7af 100644 --- a/src/cuda/src/ph_drpa_a_sing.cu +++ b/src/cuda/src/ph_drpa_a_sing.cu @@ -75,6 +75,8 @@ extern "C" void ph_dRPA_A_sing(int nO, int nV, int nBas, int nS, double *eps, do dim3 dimGrid(nBlocks, nBlocks, 1); dim3 dimBlock(sBlocks, sBlocks, 1); + //dim3 dimGrid(nBlocks, 1, 1); + //dim3 dimBlock(sBlocks, 1, 1); printf("lunching ph_dRPA_A_sing_kernel with %dx%d blocks and %dx%d threads/block\n", nBlocks, nBlocks, sBlocks, sBlocks); diff --git a/src/cuda/src/ph_drpa_tda_sing.c b/src/cuda/src/ph_drpa_tda_sing.c index 52ae341..5cfa9ef 100644 --- a/src/cuda/src/ph_drpa_tda_sing.c +++ b/src/cuda/src/ph_drpa_tda_sing.c @@ -17,14 +17,16 @@ void ph_drpa_tda_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, int nV = nBas - nO; - int nBas2 = nBas * nBas; - int nBas4 = nBas2 * nBas2; + long long nBas_long = (long long) nBas; + long long nBas4 = nBas_long * nBas_long * nBas_long * nBas_long; float elapsedTime; cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); + //printf("nO = %d, nBas = %d, nS = %d\n", nO, nBas, nS); + //printf("nBas4 = %lld\n", nBas4); check_Cuda_Errors(cudaMalloc((void**)&d_eps, nBas * sizeof(double)), @@ -32,6 +34,7 @@ void ph_drpa_tda_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, check_Cuda_Errors(cudaMalloc((void**)&d_ERI, nBas4 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); + printf("CPU->GPU transfer..\n"); cudaEventRecord(start, 0); check_Cuda_Errors(cudaMemcpy(d_eps, h_eps, nBas * sizeof(double), cudaMemcpyHostToDevice), "cudaMemcpy", __FILE__, __LINE__); @@ -55,6 +58,10 @@ void ph_drpa_tda_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, printf("Time elapsed on A kernel = %f msec\n", elapsedTime); + check_Cuda_Errors(cudaFree(d_eps), "cudaFree", __FILE__, __LINE__); + check_Cuda_Errors(cudaFree(d_ERI), "cudaFree", __FILE__, __LINE__); + + // diagonalize A int *d_info = NULL; double *d_Omega = NULL; @@ -89,8 +96,6 @@ void ph_drpa_tda_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, printf("Time elapsed on GPU -> CPU transfer = %f msec\n", elapsedTime); check_Cuda_Errors(cudaFree(d_info), "cudaFree", __FILE__, __LINE__); - check_Cuda_Errors(cudaFree(d_eps), "cudaFree", __FILE__, __LINE__); - check_Cuda_Errors(cudaFree(d_ERI), "cudaFree", __FILE__, __LINE__); check_Cuda_Errors(cudaFree(d_A), "cudaFree", __FILE__, __LINE__); check_Cuda_Errors(cudaFree(d_Omega), "cudaFree", __FILE__, __LINE__); diff --git a/src/make_ninja.py b/src/make_ninja.py index 712c9b9..79a04c7 100755 --- a/src/make_ninja.py +++ b/src/make_ninja.py @@ -36,7 +36,7 @@ def check_compiler_exists(compiler): compile_gfortran_mac = """ FC = gfortran AR = libtool -static -o -FFLAGS = -I$IDIR -J$IDIR -fbacktrace -g -Wall -Wno-unused-variable -Wno-unused -Wno-unused-dummy-argument -Wuninitialized -Wmaybe-uninitialized -O3 -march=native +FFLAGS = -I$IDIR -J$IDIR -cpp -fbacktrace -g -Wall -Wno-unused-variable -Wno-unused -Wno-unused-dummy-argument -Wuninitialized -Wmaybe-uninitialized -O3 -march=native CC = gcc CXX = g++ LAPACK=-lblas -llapack @@ -47,7 +47,7 @@ FIX_ORDER_OF_LIBS= compile_gfortran_mac_debug = """ FC = gfortran AR = libtool -static -o -FFLAGS = -I$IDIR -J$IDIR -fbacktrace -Wall -Wno-unused-variable -g -fcheck=all -Waliasing -Wampersand -Wconversion -Wsurprising -Wintrinsics-std -Wno-tabs -Wintrinsic-shadow -Wline-truncation -Wreal-q-constant +FFLAGS = -I$IDIR -J$IDIR -cpp -fbacktrace -Wall -Wno-unused-variable -g -fcheck=all -Waliasing -Wampersand -Wconversion -Wsurprising -Wintrinsics-std -Wno-tabs -Wintrinsic-shadow -Wline-truncation -Wreal-q-constant CC = gcc CXX = g++ LAPACK=-lblas -llapack @@ -58,7 +58,7 @@ FIX_ORDER_OF_LIBS= compile_gfortran_linux_debug = """ FC = gfortran AR = ar crs -FFLAGS = -I$IDIR -J$IDIR -fbacktrace -Wall -g -fcheck=all -Waliasing -Wampersand -Wconversion -Wsurprising -Wintrinsics-std -Wno-tabs -Wintrinsic-shadow -Wline-truncation -Wreal-q-constant +FFLAGS = -I$IDIR -J$IDIR -cpp -fbacktrace -Wall -g -fcheck=all -Waliasing -Wampersand -Wconversion -Wsurprising -Wintrinsics-std -Wno-tabs -Wintrinsic-shadow -Wline-truncation -Wreal-q-constant CC = gcc CXX = g++ LAPACK=-lblas -llapack @@ -83,7 +83,7 @@ elif sys.platform.lower() == "linux" or os.path.exists('/proc/version'): compiler = """ FC = ifort -mkl=parallel -qopenmp AR = ar crs -FFLAGS = -I$IDIR -module $IDIR -traceback -g -Ofast -xHost +FFLAGS = -I$IDIR -module $IDIR -fpp -traceback -g -Ofast -xHost CC = icc CXX = icpc LAPACK= @@ -94,10 +94,12 @@ FIX_ORDER_OF_LIBS=-Wl,--start-group compiler = """ FC = gfortran -fopenmp AR = ar crs -FFLAGS = -I$IDIR -J$IDIR -fbacktrace -g -Wall -Wno-unused-variable -Wno-unused -Wno-unused-dummy-argument -Wuninitialized -Wmaybe-uninitialized -O3 -march=native +FFLAGS = -I$IDIR -J$IDIR -cpp -fbacktrace -g -Wall -Wno-unused-variable -Wno-unused -Wno-unused-dummy-argument -Wuninitialized -Wmaybe-uninitialized -O3 -march=native CC = gcc CXX = g++ LAPACK=-lblas -llapack +# uncomment for TURPAN +#LAPACK=-larmpl_lp64_mp STDCXX=-lstdc++ FIX_ORDER_OF_LIBS=-Wl,--start-group """ @@ -113,8 +115,16 @@ if USE_GPU: compiler_tmp = compiler.strip().split('\n') compiler_tmp[0] += " -L{}/src/cuda/build -lcuquack -lcudart -lcublas -lcusolver".format(QUACK_ROOT) compiler_exe = '\n'.join(compiler_tmp) + + compiler_tmp = compiler.strip().split('\n') + compiler_tmp[2] += " -DUSE_GPU" + compiler_lib = '\n'.join(compiler_tmp) + + compiler_main = compiler_tmp else: compiler_exe = compiler + compiler_lib = compiler + compiler_main = compiler @@ -172,7 +182,7 @@ rule git_clone build_in_lib_dir = "\n".join([ header, - compiler, + compiler_lib, rule_fortran, rule_build_lib, ]) @@ -187,7 +197,7 @@ build_in_exe_dir = "\n".join([ build_main = "\n".join([ header, - compiler, + compiler_main, rule_git_clone, ]) @@ -195,9 +205,10 @@ exe_dirs = ["QuAcK"] lib_dirs = list(filter(lambda x: os.path.isdir(x) and \ x not in ["cuda"] and \ x not in exe_dirs, os.listdir("."))) -i = lib_dirs.index("mod") -lib_dirs[0], lib_dirs[i] = lib_dirs[i], lib_dirs[0] -if not USE_GPU: +if(USE_GPU): + i = lib_dirs.index("GPU") + lib_dirs[0], lib_dirs[i] = lib_dirs[i], lib_dirs[0] +else: lib_dirs.remove("GPU") def create_ninja_in_libdir(directory): diff --git a/src/mod/cu_quack_module.f90 b/src/mod/cu_quack_module.f90 deleted file mode 100644 index 978fc0c..0000000 --- a/src/mod/cu_quack_module.f90 +++ /dev/null @@ -1,82 +0,0 @@ -module cu_quack_module - - use, intrinsic :: iso_c_binding - - implicit none - - ! --- - - interface - - subroutine ph_drpa_tda_sing(nO, nBas, nS, eps, ERI, & - Omega, X) bind(C, name = "ph_drpa_tda_sing") - - import c_int, c_double - integer(c_int), intent(in), value :: nO, nBas, nS - real(c_double), intent(in) :: eps(nBas) - real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas) - real(c_double), intent(out) :: Omega(nS) - real(c_double), intent(out) :: X(nS,nS) - - end subroutine ph_drpa_tda_sing - - ! --- - - subroutine ph_drpa_tda_trip(nO, nBas, nS, eps, ERI, & - Omega, X) bind(C, name = "ph_drpa_tda_trip") - - import c_int, c_double - integer(c_int), intent(in), value :: nO, nBas, nS - real(c_double), intent(in) :: eps(nBas) - real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas) - real(c_double), intent(out) :: Omega(nS) - real(c_double), intent(out) :: X(nS,nS) - - end subroutine ph_drpa_tda_trip - - ! --- - - subroutine ph_drpa_sing(nO, nBas, nS, eps, ERI, & - Omega, X) bind(C, name = "ph_drpa_sing") - - import c_int, c_double - integer(c_int), intent(in), value :: nO, nBas, nS - real(c_double), intent(in) :: eps(nBas) - real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas) - real(c_double), intent(out) :: Omega(nS) - real(c_double), intent(out) :: X(nS,nS) - - end subroutine ph_drpa_sing - - ! --- - - subroutine ph_drpa_trip(nO, nBas, nS, eps, ERI, & - Omega, X) bind(C, name = "ph_drpa_trip") - - import c_int, c_double - integer(c_int), intent(in), value :: nO, nBas, nS - real(c_double), intent(in) :: eps(nBas) - real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas) - real(c_double), intent(out) :: Omega(nS) - real(c_double), intent(out) :: X(nS,nS) - - end subroutine ph_drpa_trip - - ! --- - - end interface - - ! --- - - contains - - subroutine cu_quack_module_test() - implicit none - print*, ' hello from cu_quack_module' - end subroutine cu_quack_module_test - - ! --- - -end module cu_quack_module - - From 3c8f8291bd061aa5e22ffb617aefb0d73cd4b3b0 Mon Sep 17 00:00:00 2001 From: AbdAmmar Date: Fri, 29 Nov 2024 13:58:52 +0100 Subject: [PATCH 16/39] working on dRPA (with no TDA) on GPU: saving --- src/GPU/cu_quack_module.f90 | 51 +++++------ src/RPA/phRRPA.f90 | 1 - src/RPA/phRRPA_GPU.f90 | 144 +++++++++++++++---------------- src/cuda/src/ph_drpa_amb_sing.cu | 88 +++++++++++++++++++ src/cuda/src/ph_drpa_apb_sing.cu | 88 +++++++++++++++++++ src/cuda/src/ph_drpa_sing.c | 114 ++++++++++++++++++++++++ src/cuda/src/ph_drpa_tda_sing.c | 12 ++- 7 files changed, 391 insertions(+), 107 deletions(-) create mode 100644 src/cuda/src/ph_drpa_amb_sing.cu create mode 100644 src/cuda/src/ph_drpa_apb_sing.cu create mode 100644 src/cuda/src/ph_drpa_sing.c diff --git a/src/GPU/cu_quack_module.f90 b/src/GPU/cu_quack_module.f90 index eba54a0..bb022c2 100644 --- a/src/GPU/cu_quack_module.f90 +++ b/src/GPU/cu_quack_module.f90 @@ -4,49 +4,40 @@ module cu_quack_module implicit none -!#ifdef USE_GPU -! interface -! subroutine ph_drpa_tda_sing(nO, nBas, nS, eps, ERI, & -! Omega, X) bind(C, name = "ph_drpa_tda_sing") -! -! import c_int, c_double -! integer(c_int), intent(in), value :: nO, nBas, nS -! real(c_double), intent(in) :: eps(nBas) -! real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas) -! real(c_double), intent(out) :: Omega(nS) -! real(c_double), intent(out) :: X(nS,nS) -! -! end subroutine ph_drpa_tda_sing -! end interface -!#else -! interface -! subroutine ph_drpa_tda_sing(nO, nBas, nS, eps, ERI, Omega, X) -! integer, intent(in) :: nO, nBas, nS -! double precision, intent(in) :: eps(nBas) -! double precision, intent(in) :: ERI(nBas,nBas,nBas,nBas) -! double precision, intent(out) :: Omega(nS) -! double precision, intent(out) :: X(nS,nS) -! end subroutine ph_drpa_tda_sing -! end interface -!#endif - interface + ! --- + subroutine ph_drpa_tda_sing(nO, nBas, nS, eps, ERI, & - Omega, X) bind(C, name = "ph_drpa_tda_sing") + Omega, XpY) bind(C, name = "ph_drpa_tda_sing") import c_int, c_double integer(c_int), intent(in), value :: nO, nBas, nS real(c_double), intent(in) :: eps(nBas) real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas) real(c_double), intent(out) :: Omega(nS) - real(c_double), intent(out) :: X(nS,nS) + real(c_double), intent(out) :: XpY(nS,nS) end subroutine ph_drpa_tda_sing - end interface + ! --- - ! --- + subroutine ph_drpa_sing(nO, nBas, nS, eps, ERI, & + Omega, XpY, XmY) bind(C, name = "ph_drpa_sing") + + import c_int, c_double + integer(c_int), intent(in), value :: nO, nBas, nS + real(c_double), intent(in) :: eps(nBas) + real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas) + real(c_double), intent(out) :: Omega(nS) + real(c_double), intent(out) :: XpY(nS,nS) + real(c_double), intent(out) :: XmY(nS,nS) + + end subroutine ph_drpa_sing + + ! --- + + end interface end module cu_quack_module diff --git a/src/RPA/phRRPA.f90 b/src/RPA/phRRPA.f90 index 31e7854..3ddeb03 100644 --- a/src/RPA/phRRPA.f90 +++ b/src/RPA/phRRPA.f90 @@ -83,7 +83,6 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO, call phLR(TDA,nS,Aph,Bph,EcRPA(ispin),Om,XpY,XmY) !call wall_time(t2) !print *, "wall time diag A on CPU (sec) = ", t2 - t1 - !stop call print_excitation_energies('phRPA@RHF','singlet',nS,Om) call phLR_transition_vectors(.true.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY) diff --git a/src/RPA/phRRPA_GPU.f90 b/src/RPA/phRRPA_GPU.f90 index 839c7be..f5c27c0 100644 --- a/src/RPA/phRRPA_GPU.f90 +++ b/src/RPA/phRRPA_GPU.f90 @@ -2,62 +2,51 @@ subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF) - use cu_quack_module + use cu_quack_module -! Perform a direct random phase approximation calculation implicit none include 'parameters.h' include 'quadrature.h' -! Input variables - logical,intent(in) :: dotest + logical,intent(in) :: dotest + logical,intent(in) :: TDA + logical,intent(in) :: doACFDT + logical,intent(in) :: exchange_kernel + logical,intent(in) :: singlet + logical,intent(in) :: triplet + integer,intent(in) :: nBas + integer,intent(in) :: nC + integer,intent(in) :: nO + integer,intent(in) :: nV + integer,intent(in) :: nR + integer,intent(in) :: nS + double precision,intent(in) :: ENuc + double precision,intent(in) :: ERHF + double precision,intent(in) :: eHF(nBas) + double precision,intent(in) :: ERI(nBas,nBas,nBas,nBas) + double precision,intent(in) :: dipole_int(nBas,nBas,ncart) - logical,intent(in) :: TDA - logical,intent(in) :: doACFDT - logical,intent(in) :: exchange_kernel - logical,intent(in) :: singlet - logical,intent(in) :: triplet - integer,intent(in) :: nBas - integer,intent(in) :: nC - integer,intent(in) :: nO - integer,intent(in) :: nV - integer,intent(in) :: nR - integer,intent(in) :: nS - double precision,intent(in) :: ENuc - double precision,intent(in) :: ERHF - double precision,intent(in) :: eHF(nBas) - double precision,intent(in) :: ERI(nBas,nBas,nBas,nBas) - double precision,intent(in) :: dipole_int(nBas,nBas,ncart) -! Local variables + integer :: i + integer :: ispin + logical :: dRPA + double precision :: t1, t2 + integer, allocatable :: iorder(:) + double precision,allocatable :: Om(:) + double precision,allocatable :: XpY(:,:) + double precision,allocatable :: XmY(:,:) - integer :: i - integer :: ispin - logical :: dRPA - double precision :: t1, t2 - double precision :: lambda - double precision,allocatable :: Aph(:,:) - double precision,allocatable :: Bph(:,:) - double precision,allocatable :: Om(:) - double precision,allocatable :: XpY(:,:) - double precision,allocatable :: XmY(:,:) - ! DEBUG - !double precision, allocatable :: XpY_gpu(:,:), XmY_gpu(:,:), Om_gpu(:) + double precision :: EcRPA(nspin) - double precision :: EcRPA(nspin) - -! Hello world write(*,*) - write(*,*)'*********************************' - write(*,*)'* Restricted ph-RPA Calculation *' - write(*,*)'*********************************' + write(*,*)'******************************************' + write(*,*)'* Restricted ph-RPA Calculation (on GPU) *' + write(*,*)'******************************************' write(*,*) -! TDA - if(TDA) then write(*,*) 'Tamm-Dancoff approximation activated!' write(*,*) @@ -67,61 +56,71 @@ subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC dRPA = .true. EcRPA(:) = 0d0 - lambda = 1d0 -! Memory allocation - allocate(Om(nS),XpY(nS,nS),XmY(nS,nS)) - !allocate(Aph(nS,nS),Bph(nS,nS)) - -! Singlet manifold + allocate(Om(nS), XpY(nS,nS), XmY(nS,nS)) if(singlet) then if(TDA) then - print*, 'start diag on GPU:' - call wall_time(t1) + !print*, 'start diag on GPU:' + !call wall_time(t1) call ph_drpa_tda_sing(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om(1), XpY(1,1)) - call wall_time(t2) - print*, 'diag time on GPU (sec):', t2 - t1 - stop + !call wall_time(t2) + !print*, 'diag time on GPU (sec):', t2 - t1 XmY(:,:) = XpY(:,:) else - ! TODO - !call ph_drpa_sing(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om(1), XpY(1,1)) - !XmY(:,:) = XpY(:,:) + !print*, 'start diag on GPU:' + !call wall_time(t1) + call ph_drpa_sing(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om(1), XpY(1,1), XmY(1,1)) + !call wall_time(t2) + !print*, 'diag time on GPU (sec):', t2 - t1 endif + ! TODO + XpY(:,:) = transpose(XpY(:,:)) + XmY(:,:) = transpose(XmY(:,:)) + call print_excitation_energies('phRPA@RHF','singlet',nS,Om) call phLR_transition_vectors(.true.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY) - - end if - -! Triplet manifold + endif if(triplet) then - ispin = 2 + XpY(:,:) = 0.d0 + allocate(iorder(nS)) + ia = 0 + do i = nC+1, nO + do a = nO+1, nBas-nR + ia = ia + 1 + iorder(ia) = ia + Om(ia) = e(a) - e(i) + XpY(ia,ia) = 1.d0 + enddo + enddo - call phLR_A(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,eHF,ERI,Aph) - if(.not.TDA) call phLR_B(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,ERI,Bph) + call quick_sort(Om(1), iorder(1), nS) + deallocate(iorder) + + XmY(:,:) = XpY(:,:) - call phLR(TDA,nS,Aph,Bph,EcRPA(ispin),Om,XpY,XmY) call print_excitation_energies('phRPA@RHF','triplet',nS,Om) call phLR_transition_vectors(.false.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY) + endif - end if + deallocate(Om, XpY, XmY) + + ! TODO + ! init EcRPA if(exchange_kernel) then - EcRPA(1) = 0.5d0*EcRPA(1) EcRPA(2) = 1.5d0*EcRPA(2) - - end if + endif write(*,*) write(*,*)'-------------------------------------------------------------------------------' @@ -132,12 +131,11 @@ subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC write(*,*)'-------------------------------------------------------------------------------' write(*,*) - deallocate(Om,XpY,XmY,Aph,Bph) - ! Compute the correlation energy via the adiabatic connection if(doACFDT) then + ! TODO call phACFDT(exchange_kernel,dRPA,TDA,singlet,triplet,nBas,nC,nO,nV,nR,nS,ERI,eHF,EcRPA) write(*,*) @@ -149,13 +147,11 @@ subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC write(*,*)'-------------------------------------------------------------------------------' write(*,*) - end if + endif if(dotest) then - - call dump_test_value('R','phRPA correlation energy',sum(EcRPA)) - - end if + call dump_test_value('R','phRPA correlation energy (on GPU)',sum(EcRPA)) + endif end subroutine diff --git a/src/cuda/src/ph_drpa_amb_sing.cu b/src/cuda/src/ph_drpa_amb_sing.cu new file mode 100644 index 0000000..4f88b4b --- /dev/null +++ b/src/cuda/src/ph_drpa_amb_sing.cu @@ -0,0 +1,88 @@ +#include + +__global__ void ph_dRPA_AmB_sing_kernel(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *AmB) { + + + int i, j, a, b; + int aa, bb; + int nVS; + int nBas2, nBas3; + int i_A0, i_A1, i_A2; + int i_I0, i_I1, i_I2; + + bool a_eq_b; + + nVS = nV * nS; + + nBas2 = nBas * nBas; + nBas3 = nBas2 * nBas; + + aa = blockIdx.x * blockDim.x + threadIdx.x; + bb = blockIdx.y * blockDim.y + threadIdx.y; + + while(aa < nV) { + a = aa + nO; + + i_A0 = aa * nS; + i_I0 = a * nBas2; + + while(bb < nV) { + b = bb + nO; + + a_eq_b = a == b; + + i_A1 = i_A0 + bb; + i_I1 = i_I0 + b * nBas; + + i = 0; + while(i < nO) { + + i_A2 = i_A1 + i * nVS; + i_I2 = i_I1 + i; + + j = 0; + while(j < nO) { + + AmB[i_A2 + j * nV] = 2.0 * (ERI[i_I2 + j * nBas3] - ERI[i_I2 + j * nBas]); + if(a_eq_b && (i==j)) { + AmB[i_A2 + j * nV] += eps[a] - eps[i]; + } + + j ++; + } // j + + i ++; + } // i + + bb += blockDim.y * gridDim.y; + } // bb + + aa += blockDim.x * gridDim.x; + } // aa + +} + + + + + +extern "C" void ph_dRPA_AmB_sing(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *AmB) { + + + int sBlocks = 32; + int nBlocks = (nV + sBlocks - 1) / sBlocks; + + dim3 dimGrid(nBlocks, nBlocks, 1); + dim3 dimBlock(sBlocks, sBlocks, 1); + + printf("lunching ph_dRPA_AmB_sing_kernel with %dx%d blocks and %dx%d threads/block\n", + nBlocks, nBlocks, sBlocks, sBlocks); + + + ph_dRPA_AmB_sing_kernel<<>>(nO, nV, nBas, nS, eps, ERI, AmB); + +} + + + + diff --git a/src/cuda/src/ph_drpa_apb_sing.cu b/src/cuda/src/ph_drpa_apb_sing.cu new file mode 100644 index 0000000..d7de329 --- /dev/null +++ b/src/cuda/src/ph_drpa_apb_sing.cu @@ -0,0 +1,88 @@ +#include + +__global__ void ph_dRPA_ApB_sing_kernel(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *ApB) { + + + int i, j, a, b; + int aa, bb; + int nVS; + int nBas2, nBas3; + int i_A0, i_A1, i_A2; + int i_I0, i_I1, i_I2; + + bool a_eq_b; + + nVS = nV * nS; + + nBas2 = nBas * nBas; + nBas3 = nBas2 * nBas; + + aa = blockIdx.x * blockDim.x + threadIdx.x; + bb = blockIdx.y * blockDim.y + threadIdx.y; + + while(aa < nV) { + a = aa + nO; + + i_A0 = aa * nS; + i_I0 = a * nBas2; + + while(bb < nV) { + b = bb + nO; + + a_eq_b = a == b; + + i_A1 = i_A0 + bb; + i_I1 = i_I0 + b * nBas; + + i = 0; + while(i < nO) { + + i_A2 = i_A1 + i * nVS; + i_I2 = i_I1 + i; + + j = 0; + while(j < nO) { + + ApB[i_A2 + j * nV] = 2.0 * (ERI[i_I2 + j * nBas3] + ERI[i_I2 + j * nBas]); + if(a_eq_b && (i==j)) { + ApB[i_A2 + j * nV] += eps[a] - eps[i]; + } + + j ++; + } // j + + i ++; + } // i + + bb += blockDim.y * gridDim.y; + } // bb + + aa += blockDim.x * gridDim.x; + } // aa + +} + + + + + +extern "C" void ph_dRPA_ApB_sing(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *ApB) { + + + int sBlocks = 32; + int nBlocks = (nV + sBlocks - 1) / sBlocks; + + dim3 dimGrid(nBlocks, nBlocks, 1); + dim3 dimBlock(sBlocks, sBlocks, 1); + + printf("lunching ph_dRPA_ApB_sing_kernel with %dx%d blocks and %dx%d threads/block\n", + nBlocks, nBlocks, sBlocks, sBlocks); + + + ph_dRPA_ApB_sing_kernel<<>>(nO, nV, nBas, nS, eps, ERI, ApB); + +} + + + + diff --git a/src/cuda/src/ph_drpa_sing.c b/src/cuda/src/ph_drpa_sing.c new file mode 100644 index 0000000..f3f8244 --- /dev/null +++ b/src/cuda/src/ph_drpa_sing.c @@ -0,0 +1,114 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "utils.h" +#include "ph_rpa.h" + +void ph_drpa_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, + double *h_Omega, double *h_XpY, double *h_XmY) { + + double *d_eps = NULL; + double *d_ERI = NULL; + + int nV = nBas - nO; + + long long nBas_long = (long long) nBas; + long long nBas4 = nBas_long * nBas_long * nBas_long * nBas_long; + + long long nS_long = (long long) nS; + long long nS2 = nS_long * nS_long; + + float elapsedTime; + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + + check_Cuda_Errors(cudaMalloc((void**)&d_eps, nBas * sizeof(double)), + "cudaMalloc", __FILE__, __LINE__); + check_Cuda_Errors(cudaMalloc((void**)&d_ERI, nBas4 * sizeof(double)), + "cudaMalloc", __FILE__, __LINE__); + + printf("CPU->GPU transfer..\n"); + cudaEventRecord(start, 0); + check_Cuda_Errors(cudaMemcpy(d_eps, h_eps, nBas * sizeof(double), cudaMemcpyHostToDevice), + "cudaMemcpy", __FILE__, __LINE__); + check_Cuda_Errors(cudaMemcpy(d_ERI, h_ERI, nBas4 * sizeof(double), cudaMemcpyHostToDevice), + "cudaMemcpy", __FILE__, __LINE__); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsedTime, start, stop); + printf("Time elapsed on CPU->GPU transfer = %f msec\n", elapsedTime); + + // construct A+B & A-B + double *d_ApB = NULL; + double *d_AmB = NULL; + check_Cuda_Errors(cudaMalloc((void**)&d_ApB, nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); + check_Cuda_Errors(cudaMalloc((void**)&d_A-B, nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); + + cudaEventRecord(start, 0); + ph_dRPA_ApB_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_ApB); + ph_dRPA_AmB_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_AmB); + check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsedTime, start, stop); + printf("Time elapsed on A & B kernels = %f msec\n", elapsedTime); + + + // free memory + check_Cuda_Errors(cudaFree(d_eps), "cudaFree", __FILE__, __LINE__); + check_Cuda_Errors(cudaFree(d_ERI), "cudaFree", __FILE__, __LINE__); + + + // TODO + // diagonalize A+B and A-B + int *d_info = NULL; + double *d_Omega = NULL; + check_Cuda_Errors(cudaMalloc((void**)&d_info, sizeof(int)), + "cudaMalloc", __FILE__, __LINE__); + check_Cuda_Errors(cudaMalloc((void**)&d_Omega, nS * sizeof(double)), + "cudaMalloc", __FILE__, __LINE__); + + cudaEventRecord(start, 0); + diag_dn_dsyevd(nS, d_info, d_Omega, d_A); + check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsedTime, start, stop); + printf("Time elapsed on diagonalization = %f msec\n", elapsedTime); + + + // transfer data to CPU + cudaEventRecord(start, 0); + //int info_gpu = 0; + //check_Cuda_Errors(cudaMemcpy(&info_gpu, d_info, sizeof(int), cudaMemcpyDeviceToHost), + // "cudaMemcpy", __FILE__, __LINE__); + //if (info_gpu != 0) { + // printf("Error: diag_dn_dsyevd returned error code %d\n", info_gpu); + // exit(EXIT_FAILURE); + //} + check_Cuda_Errors(cudaMemcpy(h_XpY, d_, nS2 * sizeof(double), cudaMemcpyDeviceToHost), + "cudaMemcpy", __FILE__, __LINE__); + check_Cuda_Errors(cudaMemcpy(h_XmY, d_, nS2 * sizeof(double), cudaMemcpyDeviceToHost), + "cudaMemcpy", __FILE__, __LINE__); + check_Cuda_Errors(cudaMemcpy(h_Omega, d_Omega, nS * sizeof(double), cudaMemcpyDeviceToHost), + "cudaMemcpy", __FILE__, __LINE__); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsedTime, start, stop); + printf("Time elapsed on GPU -> CPU transfer = %f msec\n", elapsedTime); + + check_Cuda_Errors(cudaFree(d_info), "cudaFree", __FILE__, __LINE__); + check_Cuda_Errors(cudaFree(d_A), "cudaFree", __FILE__, __LINE__); + check_Cuda_Errors(cudaFree(d_B), "cudaFree", __FILE__, __LINE__); + check_Cuda_Errors(cudaFree(d_Omega), "cudaFree", __FILE__, __LINE__); + + +} + diff --git a/src/cuda/src/ph_drpa_tda_sing.c b/src/cuda/src/ph_drpa_tda_sing.c index 5cfa9ef..31d0725 100644 --- a/src/cuda/src/ph_drpa_tda_sing.c +++ b/src/cuda/src/ph_drpa_tda_sing.c @@ -9,6 +9,11 @@ #include "utils.h" #include "ph_rpa.h" +/* + * + * Y = 0 ==> X+Y = X-Y = X + * +*/ void ph_drpa_tda_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, double *h_Omega, double *h_X) { @@ -16,6 +21,9 @@ void ph_drpa_tda_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, double *d_ERI = NULL; int nV = nBas - nO; + + long long nS_long = (long long) nS; + long long nS2 = nS_long * nS_long; long long nBas_long = (long long) nBas; long long nBas4 = nBas_long * nBas_long * nBas_long * nBas_long; @@ -47,7 +55,7 @@ void ph_drpa_tda_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, // construct A double *d_A = NULL; - check_Cuda_Errors(cudaMalloc((void**)&d_A, nS * nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); + check_Cuda_Errors(cudaMalloc((void**)&d_A, nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); cudaEventRecord(start, 0); ph_dRPA_A_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_A); @@ -86,7 +94,7 @@ void ph_drpa_tda_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, // printf("Error: diag_dn_dsyevd returned error code %d\n", info_gpu); // exit(EXIT_FAILURE); //} - check_Cuda_Errors(cudaMemcpy(h_X, d_A, nS * nS * sizeof(double), cudaMemcpyDeviceToHost), + check_Cuda_Errors(cudaMemcpy(h_X, d_A, nS2 * sizeof(double), cudaMemcpyDeviceToHost), "cudaMemcpy", __FILE__, __LINE__); check_Cuda_Errors(cudaMemcpy(h_Omega, d_Omega, nS * sizeof(double), cudaMemcpyDeviceToHost), "cudaMemcpy", __FILE__, __LINE__); From fd4dc5b77ea63f0f3f8309030aa1a64aeb49a256 Mon Sep 17 00:00:00 2001 From: AbdAmmar Date: Fri, 29 Nov 2024 19:10:24 +0100 Subject: [PATCH 17/39] fixed memory leak in phRPA --- src/LR/phLR.f90 | 8 +-- src/cuda/src/a_d_at.cu | 64 +++++++++++++++++++++++ src/cuda/src/a_dinv_at.cu | 64 +++++++++++++++++++++++ src/cuda/src/elementwise_dsqrt_inplace.cu | 52 ++++++++++++++++++ src/cuda/src/ph_drpa_sing.c | 63 +++++++++++++++++++--- 5 files changed, 242 insertions(+), 9 deletions(-) create mode 100644 src/cuda/src/a_d_at.cu create mode 100644 src/cuda/src/a_dinv_at.cu create mode 100644 src/cuda/src/elementwise_dsqrt_inplace.cu diff --git a/src/LR/phLR.f90 b/src/LR/phLR.f90 index 6457e27..6040ad7 100644 --- a/src/LR/phLR.f90 +++ b/src/LR/phLR.f90 @@ -30,14 +30,12 @@ subroutine phLR(TDA,nS,Aph,Bph,EcRPA,Om,XpY,XmY) double precision,intent(out) :: XpY(nS,nS) double precision,intent(out) :: XmY(nS,nS) -! Memory allocation - allocate(ApB(nS,nS),AmB(nS,nS),AmBSq(nS,nS),AmBIv(nS,nS),Z(nS,nS),tmp(nS,nS)) ! Tamm-Dancoff approximation if(TDA) then - + XpY(:,:) = Aph(:,:) call diagonalize_matrix(nS,XpY,Om) XpY(:,:) = transpose(XpY(:,:)) @@ -45,6 +43,8 @@ subroutine phLR(TDA,nS,Aph,Bph,EcRPA,Om,XpY,XmY) else + allocate(ApB(nS,nS),AmB(nS,nS),AmBSq(nS,nS),AmBIv(nS,nS),Z(nS,nS),tmp(nS,nS)) + ApB(:,:) = Aph(:,:) + Bph(:,:) AmB(:,:) = Aph(:,:) - Bph(:,:) @@ -81,6 +81,8 @@ subroutine phLR(TDA,nS,Aph,Bph,EcRPA,Om,XpY,XmY) ! XmY = matmul(transpose(Z),AmBIv) ! call DA(nS,1d0*sqrt(Om),XmY) + + deallocate(ApB,AmB,AmBSq,AmBIv,Z,tmp) end if diff --git a/src/cuda/src/a_d_at.cu b/src/cuda/src/a_d_at.cu new file mode 100644 index 0000000..0f190eb --- /dev/null +++ b/src/cuda/src/a_d_at.cu @@ -0,0 +1,64 @@ +#include + + +__global__ void A_D_At_kernel(int n, double *A, double *D, double *R) { + + + int i, j; + int k; + int in, ij; + int kn; + + i = blockIdx.x * blockDim.x + threadIdx.x; + j = blockIdx.y * blockDim.y + threadIdx.y; + + while(i < n) { + + in = i * n; + + while(j < n) { + + ij = in + j; + + R[ij] = 0.0; + k = 0; + while(k < n) { + + kn = k * n; + R[ij] += D[k] * U[i + kn] * U[j + kn]; + + k ++; + } // k + + j += blockDim.y * gridDim.y; + } // j + + i += blockDim.x * gridDim.x; + } // i + +} + + + + + +extern "C" void A_D_At(int n, double *A, double *D, double *R) { + + + int sBlocks = 32; + int nBlocks = (n + sBlocks - 1) / sBlocks; + + dim3 dimGrid(nBlocks, nBlocks, 1); + dim3 dimBlock(sBlocks, sBlocks, 1); + + printf("lunching A_D_At_kernel with %dx%d blocks and %dx%d threads/block\n", + nBlocks, nBlocks, sBlocks, sBlocks); + + + A_D_At_kernel<<>>(n, A, D, R); + +} + + + + diff --git a/src/cuda/src/a_dinv_at.cu b/src/cuda/src/a_dinv_at.cu new file mode 100644 index 0000000..450b866 --- /dev/null +++ b/src/cuda/src/a_dinv_at.cu @@ -0,0 +1,64 @@ +#include + + +__global__ void A_Dinv_At_kernel(int n, double *A, double *D, double *R) { + + + int i, j; + int k; + int in, ij; + int kn; + + i = blockIdx.x * blockDim.x + threadIdx.x; + j = blockIdx.y * blockDim.y + threadIdx.y; + + while(i < n) { + + in = i * n; + + while(j < n) { + + ij = in + j; + + R[ij] = 0.0; + k = 0; + while(k < n) { + + kn = k * n; + R[ij] += D[k] * U[i + kn] * U[j + kn] / (D[k] + 1e-12); + + k ++; + } // k + + j += blockDim.y * gridDim.y; + } // j + + i += blockDim.x * gridDim.x; + } // i + +} + + + + + +extern "C" void A_Dinv_At(int n, double *A, double *D, double *R) { + + + int sBlocks = 32; + int nBlocks = (n + sBlocks - 1) / sBlocks; + + dim3 dimGrid(nBlocks, nBlocks, 1); + dim3 dimBlock(sBlocks, sBlocks, 1); + + printf("lunching A_Dinv_At_kernel with %dx%d blocks and %dx%d threads/block\n", + nBlocks, nBlocks, sBlocks, sBlocks); + + + A_Dinv_At_kernel<<>>(n, A, D, R); + +} + + + + diff --git a/src/cuda/src/elementwise_dsqrt_inplace.cu b/src/cuda/src/elementwise_dsqrt_inplace.cu new file mode 100644 index 0000000..f053dd9 --- /dev/null +++ b/src/cuda/src/elementwise_dsqrt_inplace.cu @@ -0,0 +1,52 @@ +#include +#include + + +__global__ void elementwise_dsqrt_inplace_kernel(int nS, double *A, int *nb_neg_sqrt) { + + + int i; + + i = blockIdx.x * blockDim.x + threadIdx.x; + nb_neg_sqrt = 0; + + while(i < nS) { + + if(A[i] > 0.0) { + + A[i] = sqrt(A[i]); + + } else { + + A[i] = sqrt(-A[i]); + + } + + i += blockDim.x * gridDim.x; + } // i + +} + + + + + +extern "C" void elementwise_dsqrt_inplace(int nS, double *A, int *nb_neg_sqrt) { + + int sBlocks = 32; + int nBlocks = (nS + sBlocks - 1) / sBlocks; + + dim3 dimGrid(nBlocks, 1, 1); + dim3 dimBlock(sBlocks, 1, 1); + + printf("lunching elementwise_dsqrt_inplace_kernel with %d blocks and %d threads/block\n", + nBlocks, sBlocks); + + + elementwise_dsqrt_inplace_kernel<<>>(nS, A, nb_neg_sqrt); + +} + + + + diff --git a/src/cuda/src/ph_drpa_sing.c b/src/cuda/src/ph_drpa_sing.c index f3f8244..95f8693 100644 --- a/src/cuda/src/ph_drpa_sing.c +++ b/src/cuda/src/ph_drpa_sing.c @@ -49,7 +49,7 @@ void ph_drpa_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, double *d_ApB = NULL; double *d_AmB = NULL; check_Cuda_Errors(cudaMalloc((void**)&d_ApB, nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); - check_Cuda_Errors(cudaMalloc((void**)&d_A-B, nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); + check_Cuda_Errors(cudaMalloc((void**)&d_AmB, nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); cudaEventRecord(start, 0); ph_dRPA_ApB_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_ApB); @@ -58,7 +58,7 @@ void ph_drpa_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&elapsedTime, start, stop); - printf("Time elapsed on A & B kernels = %f msec\n", elapsedTime); + printf("Time elapsed on AmB & ApB = %f msec\n", elapsedTime); // free memory @@ -66,8 +66,7 @@ void ph_drpa_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, check_Cuda_Errors(cudaFree(d_ERI), "cudaFree", __FILE__, __LINE__); - // TODO - // diagonalize A+B and A-B + // diagonalize A-B int *d_info = NULL; double *d_Omega = NULL; check_Cuda_Errors(cudaMalloc((void**)&d_info, sizeof(int)), @@ -76,12 +75,64 @@ void ph_drpa_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, "cudaMalloc", __FILE__, __LINE__); cudaEventRecord(start, 0); - diag_dn_dsyevd(nS, d_info, d_Omega, d_A); + diag_dn_dsyevd(nS, d_info, d_Omega, d_AmB); check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&elapsedTime, start, stop); - printf("Time elapsed on diagonalization = %f msec\n", elapsedTime); + printf("Time elapsed on diag AmB = %f msec\n", elapsedTime); + + + // d_Omega <-- d_Omega^{0.5} + elementwise_dsqrt_inplace(nS, d_Omega); + // TODO + //int *d_nb_neg_sqrt = NULL; + //check_Cuda_Errors(cudaMalloc((void**)&d_nb_neg_sqrt, sizeof(int)), + // "cudaMalloc", __FILE__, __LINE__); + //int nb_neg_sqrt = 0; + //check_Cuda_Errors(cudaMemcpy(&nb_neg_sqrt, d_nb_neg_sqrt, sizeof(int), cudaMemcpyDeviceToHost), + // "cudaMemcpy", __FILE__, __LINE__); + //if (nb_neg_sqrt > 0) { + // printf("You may have instabilities in linear response: A-B is not positive definite!!\n"); + // printf("nb of <= 0 elements = %d\n", nb_neg_sqrt); + //} + + + // TODO + // d_AmB (d_Omega)^{+0.5} (d_AmB)^T + // d_AmB (d_Omega)^{-0.5} (d_AmB)^T + double *d_AmBSq = NULL; + check_Cuda_Errors(cudaMalloc((void**)&d_AmBSq, nS * sizeof(double)), + "cudaMalloc", __FILE__, __LINE__); + double *d_AmBSqInv = NULL; + check_Cuda_Errors(cudaMalloc((void**)&d_AmBSqInv, nS * sizeof(double)), + "cudaMalloc", __FILE__, __LINE__); + + cudaEventRecord(start, 0); + A_D_At(nS, d_AmB, d_Omega, d_AmBSq); + A_Dinv_At(nS, d_AmB, d_Omega, d_AmBSqInv); + check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsedTime, start, stop); + printf("Time elapsed on d_AmBSq & d_AmBSqInv = %f msec\n", elapsedTime); + + + // TODO + //call dgemm('N','N',nS,nS,nS,1d0,ApB,size(ApB,1),AmBSq,size(AmBSq,1),0d0,tmp,size(tmp,1)) + //call dgemm('N','N',nS,nS,nS,1d0,AmBSq,size(AmBSq,1),tmp,size(tmp,1),0d0,Z,size(Z,1)) + //call diagonalize_matrix(nS,Z,Om) + //if(minval(Om) < 0d0) & + // call print_warning('You may have instabilities in linear response: negative excitations!!') + //Om = sqrt(Om) + //call dgemm('T','N',nS,nS,nS,1d0,Z,size(Z,1),AmBSq,size(AmBSq,1),0d0,XpY,size(XpY,1)) + //call DA(nS,1d0/dsqrt(Om),XpY) + //call dgemm('T','N',nS,nS,nS,1d0,Z,size(Z,1),AmBIv,size(AmBIv,1),0d0,XmY,size(XmY,1)) + //call DA(nS,1d0*dsqrt(Om),XmY) + + + + // transfer data to CPU From 542cce2da922d77db6c87c67bcb8e1caa9b4855b Mon Sep 17 00:00:00 2001 From: AbdAmmar Date: Fri, 29 Nov 2024 20:32:19 +0100 Subject: [PATCH 18/39] dRPA (with no TDA) on GPU: V0 --- src/cuda/include/my_linalg.h | 16 ++ src/cuda/include/ph_rpa.h | 3 +- src/cuda/src/a_d_at.cu | 2 +- src/cuda/src/a_d_inplace.cu | 57 +++++++ src/cuda/src/a_dinv_at.cu | 2 +- src/cuda/src/a_dinv_inplace.cu | 57 +++++++ src/cuda/src/elementwise_dsqrt.cu | 51 ++++++ src/cuda/src/elementwise_dsqrt_inplace.cu | 7 +- src/cuda/src/ph_drpa_sing.c | 194 ++++++++++++++++------ src/cuda/src/ph_drpa_tda_sing.c | 4 +- 10 files changed, 337 insertions(+), 56 deletions(-) create mode 100644 src/cuda/include/my_linalg.h create mode 100644 src/cuda/src/a_d_inplace.cu create mode 100644 src/cuda/src/a_dinv_inplace.cu create mode 100644 src/cuda/src/elementwise_dsqrt.cu diff --git a/src/cuda/include/my_linalg.h b/src/cuda/include/my_linalg.h new file mode 100644 index 0000000..457ab99 --- /dev/null +++ b/src/cuda/include/my_linalg.h @@ -0,0 +1,16 @@ +#ifndef MY_LINALG + +#define MY_LINALG + +extern void A_D_At(int n, double *A, double *D, double *R); +extern void A_Dinv_At(int n, double *A, double *D, double *R); + +extern void A_D_inplace(int n, double *A, double *D); +extern void A_Dinv_inplace(int n, double *A, double *D); + +extern void elementwise_dsqrt(int nS, double *A, double *A_Sq); +extern void elementwise_dsqrt_inplace(int nS, double *A); + +extern void diag_dn_dsyevd(int n, int *info, double *W, double *A); + +#endif diff --git a/src/cuda/include/ph_rpa.h b/src/cuda/include/ph_rpa.h index 36f111f..abce320 100644 --- a/src/cuda/include/ph_rpa.h +++ b/src/cuda/include/ph_rpa.h @@ -5,6 +5,7 @@ extern void ph_dRPA_A_sing(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *A); extern void ph_dRPA_B_sing(int nO, int nV, int nBas, int nS, double *ERI, double *B); -extern void diag_dn_dsyevd(int n, int *info, double *W, double *A); +extern void ph_dRPA_ApB_sing(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *ApB); +extern void ph_dRPA_AmB_sing(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *AmB); #endif diff --git a/src/cuda/src/a_d_at.cu b/src/cuda/src/a_d_at.cu index 0f190eb..f47b797 100644 --- a/src/cuda/src/a_d_at.cu +++ b/src/cuda/src/a_d_at.cu @@ -25,7 +25,7 @@ __global__ void A_D_At_kernel(int n, double *A, double *D, double *R) { while(k < n) { kn = k * n; - R[ij] += D[k] * U[i + kn] * U[j + kn]; + R[ij] += D[k] * A[i + kn] * A[j + kn]; k ++; } // k diff --git a/src/cuda/src/a_d_inplace.cu b/src/cuda/src/a_d_inplace.cu new file mode 100644 index 0000000..3aa233f --- /dev/null +++ b/src/cuda/src/a_d_inplace.cu @@ -0,0 +1,57 @@ +#include + + +__global__ void A_D_inplace_kernel(int n, double *A, double *D) { + + + int i, j; + int in, ji; + + double tmp; + + i = blockIdx.x * blockDim.x + threadIdx.x; + j = blockIdx.y * blockDim.y + threadIdx.y; + + while(i < n) { + + in = i * n; + + tmp = D[i]; + + while(j < n) { + + ji = in + j; + + A[ji] = A[ji] * tmp; + + j += blockDim.y * gridDim.y; + } // j + + i += blockDim.x * gridDim.x; + } // i + +} + + + + + +extern "C" void A_D_inplace(int n, double *A, double *D) { + + + int sBlocks = 32; + int nBlocks = (n + sBlocks - 1) / sBlocks; + + dim3 dimGrid(nBlocks, nBlocks, 1); + dim3 dimBlock(sBlocks, sBlocks, 1); + + printf("lunching A_D_inplace_kernel with %dx%d blocks and %dx%d threads/block\n", + nBlocks, nBlocks, sBlocks, sBlocks); + + + A_D_inplace_kernel<<>>(n, A, D); + +} + + + diff --git a/src/cuda/src/a_dinv_at.cu b/src/cuda/src/a_dinv_at.cu index 450b866..880816b 100644 --- a/src/cuda/src/a_dinv_at.cu +++ b/src/cuda/src/a_dinv_at.cu @@ -25,7 +25,7 @@ __global__ void A_Dinv_At_kernel(int n, double *A, double *D, double *R) { while(k < n) { kn = k * n; - R[ij] += D[k] * U[i + kn] * U[j + kn] / (D[k] + 1e-12); + R[ij] += D[k] * A[i + kn] * A[j + kn] / (D[k] + 1e-12); k ++; } // k diff --git a/src/cuda/src/a_dinv_inplace.cu b/src/cuda/src/a_dinv_inplace.cu new file mode 100644 index 0000000..be2f567 --- /dev/null +++ b/src/cuda/src/a_dinv_inplace.cu @@ -0,0 +1,57 @@ +#include + + +__global__ void A_Dinv_inplace_kernel(int n, double *A, double *D) { + + + int i, j; + int in, ji; + + double tmp; + + i = blockIdx.x * blockDim.x + threadIdx.x; + j = blockIdx.y * blockDim.y + threadIdx.y; + + while(i < n) { + + in = i * n; + + tmp = 1.0 / (1e-12 + D[i]); + + while(j < n) { + + ji = in + j; + + A[ji] = A[ji] * tmp; + + j += blockDim.y * gridDim.y; + } // j + + i += blockDim.x * gridDim.x; + } // i + +} + + + + + +extern "C" void A_Dinv_inplace(int n, double *A, double *D) { + + + int sBlocks = 32; + int nBlocks = (n + sBlocks - 1) / sBlocks; + + dim3 dimGrid(nBlocks, nBlocks, 1); + dim3 dimBlock(sBlocks, sBlocks, 1); + + printf("lunching A_Dinv_inplace_kernel with %dx%d blocks and %dx%d threads/block\n", + nBlocks, nBlocks, sBlocks, sBlocks); + + + A_Dinv_inplace_kernel<<>>(n, A, D); + +} + + + diff --git a/src/cuda/src/elementwise_dsqrt.cu b/src/cuda/src/elementwise_dsqrt.cu new file mode 100644 index 0000000..990be52 --- /dev/null +++ b/src/cuda/src/elementwise_dsqrt.cu @@ -0,0 +1,51 @@ +#include +#include + + +__global__ void elementwise_dsqrt_kernel(int nS, double *A, double *A_Sq) { + + + int i; + + i = blockIdx.x * blockDim.x + threadIdx.x; + + while(i < nS) { + + if(A[i] > 0.0) { + + A_Sq[i] = sqrt(A[i]); + + } else { + + A_Sq[i] = sqrt(-A[i]); + + } + + i += blockDim.x * gridDim.x; + } // i + +} + + + + + +extern "C" void elementwise_dsqrt(int nS, double *A, double *A_Sq) { + + int sBlocks = 32; + int nBlocks = (nS + sBlocks - 1) / sBlocks; + + dim3 dimGrid(nBlocks, 1, 1); + dim3 dimBlock(sBlocks, 1, 1); + + printf("lunching elementwise_dsqrt_kernel with %d blocks and %d threads/block\n", + nBlocks, sBlocks); + + + elementwise_dsqrt_kernel<<>>(nS, A, A_Sq); + +} + + + + diff --git a/src/cuda/src/elementwise_dsqrt_inplace.cu b/src/cuda/src/elementwise_dsqrt_inplace.cu index f053dd9..8e0202f 100644 --- a/src/cuda/src/elementwise_dsqrt_inplace.cu +++ b/src/cuda/src/elementwise_dsqrt_inplace.cu @@ -2,13 +2,12 @@ #include -__global__ void elementwise_dsqrt_inplace_kernel(int nS, double *A, int *nb_neg_sqrt) { +__global__ void elementwise_dsqrt_inplace_kernel(int nS, double *A) { int i; i = blockIdx.x * blockDim.x + threadIdx.x; - nb_neg_sqrt = 0; while(i < nS) { @@ -31,7 +30,7 @@ __global__ void elementwise_dsqrt_inplace_kernel(int nS, double *A, int *nb_neg_ -extern "C" void elementwise_dsqrt_inplace(int nS, double *A, int *nb_neg_sqrt) { +extern "C" void elementwise_dsqrt_inplace(int nS, double *A) { int sBlocks = 32; int nBlocks = (nS + sBlocks - 1) / sBlocks; @@ -43,7 +42,7 @@ extern "C" void elementwise_dsqrt_inplace(int nS, double *A, int *nb_neg_sqrt) { nBlocks, sBlocks); - elementwise_dsqrt_inplace_kernel<<>>(nS, A, nb_neg_sqrt); + elementwise_dsqrt_inplace_kernel<<>>(nS, A); } diff --git a/src/cuda/src/ph_drpa_sing.c b/src/cuda/src/ph_drpa_sing.c index 95f8693..d1cba17 100644 --- a/src/cuda/src/ph_drpa_sing.c +++ b/src/cuda/src/ph_drpa_sing.c @@ -8,9 +8,15 @@ #include "utils.h" #include "ph_rpa.h" +#include "my_linalg.h" + + + + void ph_drpa_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, - double *h_Omega, double *h_XpY, double *h_XmY) { + double *h_Omega, double *h_XpY, double *h_XmY) { + double *d_eps = NULL; double *d_ERI = NULL; @@ -23,18 +29,23 @@ void ph_drpa_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, long long nS_long = (long long) nS; long long nS2 = nS_long * nS_long; + + cublasHandle_t handle; + const double alpha=1.0, beta=0.0; + + float elapsedTime; cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); + check_Cuda_Errors(cudaMalloc((void**)&d_eps, nBas * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); check_Cuda_Errors(cudaMalloc((void**)&d_ERI, nBas4 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); - printf("CPU->GPU transfer..\n"); cudaEventRecord(start, 0); check_Cuda_Errors(cudaMemcpy(d_eps, h_eps, nBas * sizeof(double), cudaMemcpyHostToDevice), "cudaMemcpy", __FILE__, __LINE__); @@ -67,15 +78,12 @@ void ph_drpa_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, // diagonalize A-B - int *d_info = NULL; + int *d_info1 = NULL; + check_Cuda_Errors(cudaMalloc((void**)&d_info1, sizeof(int)), "cudaMalloc", __FILE__, __LINE__); double *d_Omega = NULL; - check_Cuda_Errors(cudaMalloc((void**)&d_info, sizeof(int)), - "cudaMalloc", __FILE__, __LINE__); - check_Cuda_Errors(cudaMalloc((void**)&d_Omega, nS * sizeof(double)), - "cudaMalloc", __FILE__, __LINE__); - + check_Cuda_Errors(cudaMalloc((void**)&d_Omega, nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); cudaEventRecord(start, 0); - diag_dn_dsyevd(nS, d_info, d_Omega, d_AmB); + diag_dn_dsyevd(nS, d_info1, d_Omega, d_AmB); check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); cudaEventRecord(stop, 0); cudaEventSynchronize(stop); @@ -84,31 +92,24 @@ void ph_drpa_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, // d_Omega <-- d_Omega^{0.5} + // TODO: nb of <= 0 elements + cudaEventRecord(start, 0); elementwise_dsqrt_inplace(nS, d_Omega); - // TODO - //int *d_nb_neg_sqrt = NULL; - //check_Cuda_Errors(cudaMalloc((void**)&d_nb_neg_sqrt, sizeof(int)), - // "cudaMalloc", __FILE__, __LINE__); - //int nb_neg_sqrt = 0; - //check_Cuda_Errors(cudaMemcpy(&nb_neg_sqrt, d_nb_neg_sqrt, sizeof(int), cudaMemcpyDeviceToHost), - // "cudaMemcpy", __FILE__, __LINE__); - //if (nb_neg_sqrt > 0) { - // printf("You may have instabilities in linear response: A-B is not positive definite!!\n"); - // printf("nb of <= 0 elements = %d\n", nb_neg_sqrt); - //} + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsedTime, start, stop); + printf("Time elapsed on elementwise_dsqrt_inplace %f msec\n", elapsedTime); - // TODO - // d_AmB (d_Omega)^{+0.5} (d_AmB)^T - // d_AmB (d_Omega)^{-0.5} (d_AmB)^T + // d_AmBSq = d_AmB (d_Omega)^{+0.5} (d_AmB)^T + // d_AmBSqInv = d_AmB (d_Omega)^{-0.5} (d_AmB)^T + cudaEventRecord(start, 0); double *d_AmBSq = NULL; check_Cuda_Errors(cudaMalloc((void**)&d_AmBSq, nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); double *d_AmBSqInv = NULL; check_Cuda_Errors(cudaMalloc((void**)&d_AmBSqInv, nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); - - cudaEventRecord(start, 0); A_D_At(nS, d_AmB, d_Omega, d_AmBSq); A_Dinv_At(nS, d_AmB, d_Omega, d_AmBSqInv); check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); @@ -118,35 +119,128 @@ void ph_drpa_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, printf("Time elapsed on d_AmBSq & d_AmBSqInv = %f msec\n", elapsedTime); - // TODO - //call dgemm('N','N',nS,nS,nS,1d0,ApB,size(ApB,1),AmBSq,size(AmBSq,1),0d0,tmp,size(tmp,1)) - //call dgemm('N','N',nS,nS,nS,1d0,AmBSq,size(AmBSq,1),tmp,size(tmp,1),0d0,Z,size(Z,1)) - //call diagonalize_matrix(nS,Z,Om) - //if(minval(Om) < 0d0) & - // call print_warning('You may have instabilities in linear response: negative excitations!!') - //Om = sqrt(Om) - //call dgemm('T','N',nS,nS,nS,1d0,Z,size(Z,1),AmBSq,size(AmBSq,1),0d0,XpY,size(XpY,1)) - //call DA(nS,1d0/dsqrt(Om),XpY) - //call dgemm('T','N',nS,nS,nS,1d0,Z,size(Z,1),AmBIv,size(AmBIv,1),0d0,XmY,size(XmY,1)) - //call DA(nS,1d0*dsqrt(Om),XmY) + // Dgemm + cudaEventRecord(start, 0); + check_Cublas_Errors(cublasCreate(&handle), "cublasCreate", __FILE__, __LINE__); + + // X + Y + check_Cublas_Errors(cublasDgemm(handle, + CUBLAS_OP_N, CUBLAS_OP_N, + nS, nS, nS, + &alpha, + d_ApB, nS, + d_AmBSq, nS, + &beta, + d_AmB, nS), + "cublasDgemm", __FILE__, __LINE__); + + check_Cuda_Errors(cudaDeviceSynchronize(), "cudaDeviceSynchronize", __FILE__, __LINE__); + + // X - Y + check_Cublas_Errors(cublasDgemm(handle, + CUBLAS_OP_N, CUBLAS_OP_N, + nS, nS, nS, + &alpha, + d_AmBSq, nS, + d_AmB, nS, + &beta, + d_ApB, nS), + "cublasDgemm", __FILE__, __LINE__); + + check_Cublas_Errors(cublasDestroy(handle), "cublasDestroy", __FILE__, __LINE__); + + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsedTime, start, stop); + printf("Time elapsed on cublasDgemm = %f msec\n", elapsedTime); + + + + + // diagonalize + int *d_info2 = NULL; + check_Cuda_Errors(cudaMalloc((void**)&d_info2, sizeof(int)), "cudaMalloc", __FILE__, __LINE__); + cudaEventRecord(start, 0); + diag_dn_dsyevd(nS, d_info2, d_Omega, d_ApB); + check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsedTime, start, stop); + printf("Time elapsed on diag ApB = %f msec\n", elapsedTime); + + + + + // d_Omega <-- d_Omega^{0.5} + // TODO: nb of <= 0 elements + cudaEventRecord(start, 0); + elementwise_dsqrt_inplace(nS, d_Omega); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsedTime, start, stop); + printf("Time elapsed on elementwise_dsqrt_inplace %f msec\n", elapsedTime); + + + + + + // Dgemm + cudaEventRecord(start, 0); + check_Cublas_Errors(cublasCreate(&handle), "cublasCreate", __FILE__, __LINE__); + + // X + Y + check_Cublas_Errors(cublasDgemm(handle, + CUBLAS_OP_T, CUBLAS_OP_N, + nS, nS, nS, + &alpha, + d_ApB, nS, + d_AmBSq, nS, + &beta, + d_AmB, nS), + "cublasDgemm", __FILE__, __LINE__); + + check_Cuda_Errors(cudaDeviceSynchronize(), "cudaDeviceSynchronize", __FILE__, __LINE__); + + // X - Y + check_Cublas_Errors(cublasDgemm(handle, + CUBLAS_OP_T, CUBLAS_OP_N, + nS, nS, nS, + &alpha, + d_ApB, nS, + d_AmBSqInv, nS, + &beta, + d_AmBSq, nS), + "cublasDgemm", __FILE__, __LINE__); + + check_Cublas_Errors(cublasDestroy(handle), "cublasDestroy", __FILE__, __LINE__); + + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsedTime, start, stop); + printf("Time elapsed on cublasDgemm = %f msec\n", elapsedTime); + + + + cudaEventRecord(start, 0); + elementwise_dsqrt(nS, d_Omega, d_AmBSq); // avoid addition memory allocation + A_Dinv_inplace(nS, d_AmB, d_AmBSq); // X + Y + A_D_inplace(nS, d_ApB, d_AmBSq); // X - Y + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsedTime, start, stop); + printf("Time elapsed on final X+Y and X-Y trans = %f msec\n", elapsedTime); + // transfer data to CPU cudaEventRecord(start, 0); - //int info_gpu = 0; - //check_Cuda_Errors(cudaMemcpy(&info_gpu, d_info, sizeof(int), cudaMemcpyDeviceToHost), - // "cudaMemcpy", __FILE__, __LINE__); - //if (info_gpu != 0) { - // printf("Error: diag_dn_dsyevd returned error code %d\n", info_gpu); - // exit(EXIT_FAILURE); - //} - check_Cuda_Errors(cudaMemcpy(h_XpY, d_, nS2 * sizeof(double), cudaMemcpyDeviceToHost), + check_Cuda_Errors(cudaMemcpy(h_XpY, d_AmB, nS2 * sizeof(double), cudaMemcpyDeviceToHost), "cudaMemcpy", __FILE__, __LINE__); - check_Cuda_Errors(cudaMemcpy(h_XmY, d_, nS2 * sizeof(double), cudaMemcpyDeviceToHost), + check_Cuda_Errors(cudaMemcpy(h_XmY, d_ApB, nS2 * sizeof(double), cudaMemcpyDeviceToHost), "cudaMemcpy", __FILE__, __LINE__); check_Cuda_Errors(cudaMemcpy(h_Omega, d_Omega, nS * sizeof(double), cudaMemcpyDeviceToHost), "cudaMemcpy", __FILE__, __LINE__); @@ -155,9 +249,13 @@ void ph_drpa_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, cudaEventElapsedTime(&elapsedTime, start, stop); printf("Time elapsed on GPU -> CPU transfer = %f msec\n", elapsedTime); - check_Cuda_Errors(cudaFree(d_info), "cudaFree", __FILE__, __LINE__); - check_Cuda_Errors(cudaFree(d_A), "cudaFree", __FILE__, __LINE__); - check_Cuda_Errors(cudaFree(d_B), "cudaFree", __FILE__, __LINE__); + + check_Cuda_Errors(cudaFree(d_info1), "cudaFree", __FILE__, __LINE__); + check_Cuda_Errors(cudaFree(d_info2), "cudaFree", __FILE__, __LINE__); + check_Cuda_Errors(cudaFree(d_ApB), "cudaFree", __FILE__, __LINE__); + check_Cuda_Errors(cudaFree(d_AmB), "cudaFree", __FILE__, __LINE__); + check_Cuda_Errors(cudaFree(d_AmBSq), "cudaFree", __FILE__, __LINE__); + check_Cuda_Errors(cudaFree(d_AmBSqInv), "cudaFree", __FILE__, __LINE__); check_Cuda_Errors(cudaFree(d_Omega), "cudaFree", __FILE__, __LINE__); diff --git a/src/cuda/src/ph_drpa_tda_sing.c b/src/cuda/src/ph_drpa_tda_sing.c index 31d0725..b98535c 100644 --- a/src/cuda/src/ph_drpa_tda_sing.c +++ b/src/cuda/src/ph_drpa_tda_sing.c @@ -8,6 +8,9 @@ #include "utils.h" #include "ph_rpa.h" +#include "my_linalg.h" + + /* * @@ -42,7 +45,6 @@ void ph_drpa_tda_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, check_Cuda_Errors(cudaMalloc((void**)&d_ERI, nBas4 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); - printf("CPU->GPU transfer..\n"); cudaEventRecord(start, 0); check_Cuda_Errors(cudaMemcpy(d_eps, h_eps, nBas * sizeof(double), cudaMemcpyHostToDevice), "cudaMemcpy", __FILE__, __LINE__); From 956ab28f58aed81d22b98d89b4254e970b0a7c66 Mon Sep 17 00:00:00 2001 From: AbdAmmar Date: Fri, 29 Nov 2024 21:33:57 +0100 Subject: [PATCH 19/39] fixed small bug with -u flag --- src/make_ninja.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/make_ninja.py b/src/make_ninja.py index 79a04c7..349b12f 100755 --- a/src/make_ninja.py +++ b/src/make_ninja.py @@ -120,7 +120,7 @@ if USE_GPU: compiler_tmp[2] += " -DUSE_GPU" compiler_lib = '\n'.join(compiler_tmp) - compiler_main = compiler_tmp + compiler_main = compiler_lib else: compiler_exe = compiler compiler_lib = compiler From fcb11d662ca8f0789f3c0184e2f02c078c3fef09 Mon Sep 17 00:00:00 2001 From: AbdAmmar Date: Fri, 29 Nov 2024 22:02:59 +0100 Subject: [PATCH 20/39] fixed bug in CUDA implement of dRPA-sing --- src/RPA/phRRPA.f90 | 8 ++++++-- src/RPA/phRRPA_GPU.f90 | 8 ++++++-- src/cuda/src/elementwise_dsqrt_inplace.cu | 10 +++++----- src/cuda/src/ph_drpa_amb_sing.cu | 5 ++++- src/cuda/src/ph_drpa_apb_sing.cu | 5 ++++- src/cuda/src/ph_drpa_sing.c | 15 ++++++++++----- 6 files changed, 35 insertions(+), 16 deletions(-) diff --git a/src/RPA/phRRPA.f90 b/src/RPA/phRRPA.f90 index 3ddeb03..ebfa742 100644 --- a/src/RPA/phRRPA.f90 +++ b/src/RPA/phRRPA.f90 @@ -29,7 +29,7 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO, ! Local variables - integer :: i + integer :: ia integer :: ispin logical :: dRPA double precision :: t1, t2 @@ -81,8 +81,12 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO, !call wall_time(t1) call phLR(TDA,nS,Aph,Bph,EcRPA(ispin),Om,XpY,XmY) - !call wall_time(t2) + call wall_time(t2) !print *, "wall time diag A on CPU (sec) = ", t2 - t1 + !do ia = 1, nS + ! write(112, *) Om(ia) + !enddo + !stop call print_excitation_energies('phRPA@RHF','singlet',nS,Om) call phLR_transition_vectors(.true.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY) diff --git a/src/RPA/phRRPA_GPU.f90 b/src/RPA/phRRPA_GPU.f90 index f5c27c0..ed85749 100644 --- a/src/RPA/phRRPA_GPU.f90 +++ b/src/RPA/phRRPA_GPU.f90 @@ -29,7 +29,7 @@ subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC double precision,intent(in) :: dipole_int(nBas,nBas,ncart) - integer :: i + integer :: i, a, ia integer :: ispin logical :: dRPA double precision :: t1, t2 @@ -78,6 +78,10 @@ subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC call ph_drpa_sing(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om(1), XpY(1,1), XmY(1,1)) !call wall_time(t2) !print*, 'diag time on GPU (sec):', t2 - t1 + !do ia = 1, nS + ! write(111, *) Om(ia) + !enddo + !stop endif @@ -98,7 +102,7 @@ subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC do a = nO+1, nBas-nR ia = ia + 1 iorder(ia) = ia - Om(ia) = e(a) - e(i) + Om(ia) = eHF(a) - eHF(i) XpY(ia,ia) = 1.d0 enddo enddo diff --git a/src/cuda/src/elementwise_dsqrt_inplace.cu b/src/cuda/src/elementwise_dsqrt_inplace.cu index 8e0202f..533aeaa 100644 --- a/src/cuda/src/elementwise_dsqrt_inplace.cu +++ b/src/cuda/src/elementwise_dsqrt_inplace.cu @@ -2,14 +2,14 @@ #include -__global__ void elementwise_dsqrt_inplace_kernel(int nS, double *A) { +__global__ void elementwise_dsqrt_inplace_kernel(int n, double *A) { int i; i = blockIdx.x * blockDim.x + threadIdx.x; - while(i < nS) { + while(i < n) { if(A[i] > 0.0) { @@ -30,10 +30,10 @@ __global__ void elementwise_dsqrt_inplace_kernel(int nS, double *A) { -extern "C" void elementwise_dsqrt_inplace(int nS, double *A) { +extern "C" void elementwise_dsqrt_inplace(int n, double *A) { int sBlocks = 32; - int nBlocks = (nS + sBlocks - 1) / sBlocks; + int nBlocks = (n + sBlocks - 1) / sBlocks; dim3 dimGrid(nBlocks, 1, 1); dim3 dimBlock(sBlocks, 1, 1); @@ -42,7 +42,7 @@ extern "C" void elementwise_dsqrt_inplace(int nS, double *A) { nBlocks, sBlocks); - elementwise_dsqrt_inplace_kernel<<>>(nS, A); + elementwise_dsqrt_inplace_kernel<<>>(n, A); } diff --git a/src/cuda/src/ph_drpa_amb_sing.cu b/src/cuda/src/ph_drpa_amb_sing.cu index 4f88b4b..0f99bc3 100644 --- a/src/cuda/src/ph_drpa_amb_sing.cu +++ b/src/cuda/src/ph_drpa_amb_sing.cu @@ -9,6 +9,7 @@ __global__ void ph_dRPA_AmB_sing_kernel(int nO, int nV, int nBas, int nS, double int nBas2, nBas3; int i_A0, i_A1, i_A2; int i_I0, i_I1, i_I2; + int i_J1, i_J2; bool a_eq_b; @@ -33,17 +34,19 @@ __global__ void ph_dRPA_AmB_sing_kernel(int nO, int nV, int nBas, int nS, double i_A1 = i_A0 + bb; i_I1 = i_I0 + b * nBas; + i_J1 = i_I0 + b * nBas3; i = 0; while(i < nO) { i_A2 = i_A1 + i * nVS; i_I2 = i_I1 + i; + i_J2 = i_J1 + i; j = 0; while(j < nO) { - AmB[i_A2 + j * nV] = 2.0 * (ERI[i_I2 + j * nBas3] - ERI[i_I2 + j * nBas]); + AmB[i_A2 + j * nV] = 2.0 * (ERI[i_I2 + j * nBas3] - ERI[i_J2 + j * nBas]); if(a_eq_b && (i==j)) { AmB[i_A2 + j * nV] += eps[a] - eps[i]; } diff --git a/src/cuda/src/ph_drpa_apb_sing.cu b/src/cuda/src/ph_drpa_apb_sing.cu index d7de329..0d25bd0 100644 --- a/src/cuda/src/ph_drpa_apb_sing.cu +++ b/src/cuda/src/ph_drpa_apb_sing.cu @@ -9,6 +9,7 @@ __global__ void ph_dRPA_ApB_sing_kernel(int nO, int nV, int nBas, int nS, double int nBas2, nBas3; int i_A0, i_A1, i_A2; int i_I0, i_I1, i_I2; + int i_J1, i_J2; bool a_eq_b; @@ -33,17 +34,19 @@ __global__ void ph_dRPA_ApB_sing_kernel(int nO, int nV, int nBas, int nS, double i_A1 = i_A0 + bb; i_I1 = i_I0 + b * nBas; + i_J1 = i_I0 + b * nBas3; i = 0; while(i < nO) { i_A2 = i_A1 + i * nVS; i_I2 = i_I1 + i; + i_J2 = i_J1 + i; j = 0; while(j < nO) { - ApB[i_A2 + j * nV] = 2.0 * (ERI[i_I2 + j * nBas3] + ERI[i_I2 + j * nBas]); + ApB[i_A2 + j * nV] = 2.0 * (ERI[i_I2 + j * nBas3] + ERI[i_J2 + j * nBas]); if(a_eq_b && (i==j)) { ApB[i_A2 + j * nV] += eps[a] - eps[i]; } diff --git a/src/cuda/src/ph_drpa_sing.c b/src/cuda/src/ph_drpa_sing.c index d1cba17..3d720e7 100644 --- a/src/cuda/src/ph_drpa_sing.c +++ b/src/cuda/src/ph_drpa_sing.c @@ -80,8 +80,10 @@ void ph_drpa_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, // diagonalize A-B int *d_info1 = NULL; check_Cuda_Errors(cudaMalloc((void**)&d_info1, sizeof(int)), "cudaMalloc", __FILE__, __LINE__); + double *d_Omega = NULL; check_Cuda_Errors(cudaMalloc((void**)&d_Omega, nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); + cudaEventRecord(start, 0); diag_dn_dsyevd(nS, d_info1, d_Omega, d_AmB); check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); @@ -91,6 +93,7 @@ void ph_drpa_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, printf("Time elapsed on diag AmB = %f msec\n", elapsedTime); + // d_Omega <-- d_Omega^{0.5} // TODO: nb of <= 0 elements cudaEventRecord(start, 0); @@ -101,15 +104,17 @@ void ph_drpa_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, printf("Time elapsed on elementwise_dsqrt_inplace %f msec\n", elapsedTime); - // d_AmBSq = d_AmB (d_Omega)^{+0.5} (d_AmB)^T - // d_AmBSqInv = d_AmB (d_Omega)^{-0.5} (d_AmB)^T - cudaEventRecord(start, 0); + // d_AmBSq = d_AmB (d_Omega)^{+0.5} (d_AmB)^T double *d_AmBSq = NULL; - check_Cuda_Errors(cudaMalloc((void**)&d_AmBSq, nS * sizeof(double)), + check_Cuda_Errors(cudaMalloc((void**)&d_AmBSq, nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); + + // d_AmBSqInv = d_AmB (d_Omega)^{-0.5} (d_AmB)^T double *d_AmBSqInv = NULL; - check_Cuda_Errors(cudaMalloc((void**)&d_AmBSqInv, nS * sizeof(double)), + check_Cuda_Errors(cudaMalloc((void**)&d_AmBSqInv, nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); + + cudaEventRecord(start, 0); A_D_At(nS, d_AmB, d_Omega, d_AmBSq); A_Dinv_At(nS, d_AmB, d_Omega, d_AmBSqInv); check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); From da4f8df0f8a59eb2af0ba9d78e4c88cb6a5a2ad6 Mon Sep 17 00:00:00 2001 From: AbdAmmar Date: Sat, 30 Nov 2024 00:38:32 +0100 Subject: [PATCH 21/39] optim A x D x A.T and A x Dinv x A.T on GPU --- src/cuda/include/my_linalg.h | 6 ++++ src/cuda/src/a_d_in_b.cu | 57 +++++++++++++++++++++++++++++ src/cuda/src/a_dinv_in_b.cu | 57 +++++++++++++++++++++++++++++ src/cuda/src/a_minus_twob_in_b.cu | 52 +++++++++++++++++++++++++++ src/cuda/src/a_plus_b_in_a.cu | 52 +++++++++++++++++++++++++++ src/cuda/src/ph_drpa_a_sing.cu | 34 ++++++++++-------- src/cuda/src/ph_drpa_amb_sing.cu | 46 ++++++++++++++---------- src/cuda/src/ph_drpa_apb_sing.cu | 33 +++++++++++------ src/cuda/src/ph_drpa_b_sing.cu | 33 +++++++++-------- src/cuda/src/ph_drpa_sing.c | 59 +++++++++++++++++++++++++------ 10 files changed, 360 insertions(+), 69 deletions(-) create mode 100644 src/cuda/src/a_d_in_b.cu create mode 100644 src/cuda/src/a_dinv_in_b.cu create mode 100644 src/cuda/src/a_minus_twob_in_b.cu create mode 100644 src/cuda/src/a_plus_b_in_a.cu diff --git a/src/cuda/include/my_linalg.h b/src/cuda/include/my_linalg.h index 457ab99..6ad5bf5 100644 --- a/src/cuda/include/my_linalg.h +++ b/src/cuda/include/my_linalg.h @@ -2,12 +2,18 @@ #define MY_LINALG +extern void A_plus_B_in_A(int n, double *A, double *B); +extern void A_minus_twoB_in_B(int n, double *A, double *B); + extern void A_D_At(int n, double *A, double *D, double *R); extern void A_Dinv_At(int n, double *A, double *D, double *R); extern void A_D_inplace(int n, double *A, double *D); extern void A_Dinv_inplace(int n, double *A, double *D); +extern void A_D_in_B(int n, double *A, double *D, double *B); +extern void A_Dinv_in_B(int n, double *A, double *D, double *B); + extern void elementwise_dsqrt(int nS, double *A, double *A_Sq); extern void elementwise_dsqrt_inplace(int nS, double *A); diff --git a/src/cuda/src/a_d_in_b.cu b/src/cuda/src/a_d_in_b.cu new file mode 100644 index 0000000..10a7e89 --- /dev/null +++ b/src/cuda/src/a_d_in_b.cu @@ -0,0 +1,57 @@ +#include + + +__global__ void A_D_in_B_kernel(int n, double *A, double *D, double *B) { + + + int i, j; + int in, ji; + + double tmp; + + i = blockIdx.x * blockDim.x + threadIdx.x; + j = blockIdx.y * blockDim.y + threadIdx.y; + + while(i < n) { + + in = i * n; + + tmp = D[i]; + + while(j < n) { + + ji = in + j; + + B[ji] = A[ji] * tmp; + + j += blockDim.y * gridDim.y; + } // j + + i += blockDim.x * gridDim.x; + } // i + +} + + + + + +extern "C" void A_D_in_B(int n, double *A, double *D, double *B) { + + + int sBlocks = 32; + int nBlocks = (n + sBlocks - 1) / sBlocks; + + dim3 dimGrid(nBlocks, nBlocks, 1); + dim3 dimBlock(sBlocks, sBlocks, 1); + + printf("lunching A_D_in_B_kernel with %dx%d blocks and %dx%d threads/block\n", + nBlocks, nBlocks, sBlocks, sBlocks); + + + A_D_in_B_kernel<<>>(n, A, D, B); + +} + + + diff --git a/src/cuda/src/a_dinv_in_b.cu b/src/cuda/src/a_dinv_in_b.cu new file mode 100644 index 0000000..c73ecd2 --- /dev/null +++ b/src/cuda/src/a_dinv_in_b.cu @@ -0,0 +1,57 @@ +#include + + +__global__ void A_Dinv_in_B_kernel(int n, double *A, double *D, double *B) { + + + int i, j; + int in, ji; + + double tmp; + + i = blockIdx.x * blockDim.x + threadIdx.x; + j = blockIdx.y * blockDim.y + threadIdx.y; + + while(i < n) { + + in = i * n; + + tmp = 1.0 / D[i]; + + while(j < n) { + + ji = in + j; + + B[ji] = A[ji] * tmp; + + j += blockDim.y * gridDim.y; + } // j + + i += blockDim.x * gridDim.x; + } // i + +} + + + + + +extern "C" void A_Dinv_in_B(int n, double *A, double *D, double *B) { + + + int sBlocks = 32; + int nBlocks = (n + sBlocks - 1) / sBlocks; + + dim3 dimGrid(nBlocks, nBlocks, 1); + dim3 dimBlock(sBlocks, sBlocks, 1); + + printf("lunching A_Dinv_in_B_kernel with %dx%d blocks and %dx%d threads/block\n", + nBlocks, nBlocks, sBlocks, sBlocks); + + + A_Dinv_in_B_kernel<<>>(n, A, D, B); + +} + + + diff --git a/src/cuda/src/a_minus_twob_in_b.cu b/src/cuda/src/a_minus_twob_in_b.cu new file mode 100644 index 0000000..0acdb70 --- /dev/null +++ b/src/cuda/src/a_minus_twob_in_b.cu @@ -0,0 +1,52 @@ +#include + + +__global__ void A_minus_twoB_in_B_kernel(int n, double *A, double *B) { + + + int i, j; + int in, ji; + + i = blockIdx.x * blockDim.x + threadIdx.x; + j = blockIdx.y * blockDim.y + threadIdx.y; + + while(i < n) { + + in = i * n; + + while(j < n) { + + ji = in + j; + + B[ji] = A[ji] - 2.0 * B[ji]; + + j += blockDim.y * gridDim.y; + } // j + + i += blockDim.x * gridDim.x; + } // i + +} + + + + +extern "C" void A_minus_twoB_in_B(int n, double *A, double *B) { + + + int sBlocks = 32; + int nBlocks = (n + sBlocks - 1) / sBlocks; + + dim3 dimGrid(nBlocks, nBlocks, 1); + dim3 dimBlock(sBlocks, sBlocks, 1); + + printf("lunching A_minus_twoB_in_B_kernel with %dx%d blocks and %dx%d threads/block\n", + nBlocks, nBlocks, sBlocks, sBlocks); + + + A_minus_twoB_in_B_kernel<<>>(n, A, B); + +} + + + diff --git a/src/cuda/src/a_plus_b_in_a.cu b/src/cuda/src/a_plus_b_in_a.cu new file mode 100644 index 0000000..d1868e3 --- /dev/null +++ b/src/cuda/src/a_plus_b_in_a.cu @@ -0,0 +1,52 @@ +#include + + +__global__ void A_plus_B_in_A_kernel(int n, double *A, double *B) { + + + int i, j; + int in, ji; + + i = blockIdx.x * blockDim.x + threadIdx.x; + j = blockIdx.y * blockDim.y + threadIdx.y; + + while(i < n) { + + in = i * n; + + while(j < n) { + + ji = in + j; + + A[ji] = A[ji] + B[ji]; + + j += blockDim.y * gridDim.y; + } // j + + i += blockDim.x * gridDim.x; + } // i + +} + + + + +extern "C" void A_plus_B_in_A(int n, double *A, double *B) { + + + int sBlocks = 32; + int nBlocks = (n + sBlocks - 1) / sBlocks; + + dim3 dimGrid(nBlocks, nBlocks, 1); + dim3 dimBlock(sBlocks, sBlocks, 1); + + printf("lunching A_plus_B_in_A_kernel with %dx%d blocks and %dx%d threads/block\n", + nBlocks, nBlocks, sBlocks, sBlocks); + + + A_plus_B_in_A_kernel<<>>(n, A, B); + +} + + + diff --git a/src/cuda/src/ph_drpa_a_sing.cu b/src/cuda/src/ph_drpa_a_sing.cu index be5e7af..5711d56 100644 --- a/src/cuda/src/ph_drpa_a_sing.cu +++ b/src/cuda/src/ph_drpa_a_sing.cu @@ -5,17 +5,18 @@ __global__ void ph_dRPA_A_sing_kernel(int nO, int nV, int nBas, int nS, double * int i, j, a, b; int aa, bb; - int nVS; - int nBas2, nBas3; - int i_A0, i_A1, i_A2; - int i_I0, i_I1, i_I2; + + long long nVS; + long long nBas2, nBas3; + long long i_A0, i_A1, i_A2, i_A3; + long long i_I0, i_I1, i_I2, i_I3; bool a_eq_b; - nVS = nV * nS; + nVS = (long long) nV * (long long) nS; - nBas2 = nBas * nBas; - nBas3 = nBas2 * nBas; + nBas2 = (long long) nBas * (long long) nBas; + nBas3 = nBas2 * (long long) nBas; aa = blockIdx.x * blockDim.x + threadIdx.x; bb = blockIdx.y * blockDim.y + threadIdx.y; @@ -23,29 +24,32 @@ __global__ void ph_dRPA_A_sing_kernel(int nO, int nV, int nBas, int nS, double * while(aa < nV) { a = aa + nO; - i_A0 = aa * nS; - i_I0 = a * nBas2; + i_A0 = (long long) aa * (long long) nS; + i_I0 = (long long) a * nBas2; while(bb < nV) { b = bb + nO; a_eq_b = a == b; - i_A1 = i_A0 + bb; - i_I1 = i_I0 + b * nBas; + i_A1 = i_A0 + (long long) bb; + i_I1 = i_I0 + (long long) b * (long long) nBas; i = 0; while(i < nO) { - i_A2 = i_A1 + i * nVS; - i_I2 = i_I1 + i; + i_A2 = i_A1 + (long long) i * nVS; + i_I2 = i_I1 + (long long) i; j = 0; while(j < nO) { - A[i_A2 + j * nV] = 2.0 * ERI[i_I2 + j * nBas3]; + i_A3 = i_A2 + (long long) j * (long long) nV; + i_I3 = i_I2 + (long long) j * nBas3; + + A[i_A3] = 2.0 * ERI[i_I3]; if(a_eq_b && (i==j)) { - A[i_A2 + j * nV] += eps[a] - eps[i]; + A[i_A3] += eps[a] - eps[i]; } j ++; diff --git a/src/cuda/src/ph_drpa_amb_sing.cu b/src/cuda/src/ph_drpa_amb_sing.cu index 0f99bc3..d09642b 100644 --- a/src/cuda/src/ph_drpa_amb_sing.cu +++ b/src/cuda/src/ph_drpa_amb_sing.cu @@ -1,22 +1,26 @@ #include -__global__ void ph_dRPA_AmB_sing_kernel(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *AmB) { + +__global__ void ph_dRPA_AmB_sing_kernel(int nO, int nV, int nBas, int nS, + double *eps, double *ERI, double *AmB) { int i, j, a, b; int aa, bb; - int nVS; - int nBas2, nBas3; - int i_A0, i_A1, i_A2; - int i_I0, i_I1, i_I2; - int i_J1, i_J2; + + long long i_A0, i_A1, i_A2, i_A3; + long long i_I0, i_I1, i_I2, i_I3; + long long i_J1, i_J2, i_J3; + + long long nVS; + long long nBas2, nBas3; bool a_eq_b; - nVS = nV * nS; + nVS = (long long) nV * (long long) nS; - nBas2 = nBas * nBas; - nBas3 = nBas2 * nBas; + nBas2 = (long long) nBas * (long long) nBas; + nBas3 = nBas2 * (long long) nBas; aa = blockIdx.x * blockDim.x + threadIdx.x; bb = blockIdx.y * blockDim.y + threadIdx.y; @@ -24,31 +28,35 @@ __global__ void ph_dRPA_AmB_sing_kernel(int nO, int nV, int nBas, int nS, double while(aa < nV) { a = aa + nO; - i_A0 = aa * nS; - i_I0 = a * nBas2; + i_A0 = (long long) aa * (long long) nS; + i_I0 = (long long) a * nBas2; while(bb < nV) { b = bb + nO; a_eq_b = a == b; - i_A1 = i_A0 + bb; - i_I1 = i_I0 + b * nBas; - i_J1 = i_I0 + b * nBas3; + i_A1 = i_A0 + (long long) bb; + i_I1 = i_I0 + (long long) b * (long long) nBas; + i_J1 = i_I0 + (long long) b * nBas3; i = 0; while(i < nO) { - i_A2 = i_A1 + i * nVS; - i_I2 = i_I1 + i; - i_J2 = i_J1 + i; + i_A2 = i_A1 + (long long) i * nVS; + i_I2 = i_I1 + (long long) i; + i_J2 = i_J1 + (long long) i; j = 0; while(j < nO) { - AmB[i_A2 + j * nV] = 2.0 * (ERI[i_I2 + j * nBas3] - ERI[i_J2 + j * nBas]); + i_A3 = i_A2 + (long long) j * nV; + i_I3 = i_I2 + (long long) j * nBas3; + i_J3 = i_J2 + (long long) j * (long long) nBas; + + AmB[i_A3] = 2.0 * (ERI[i_I3] - ERI[i_J3]); if(a_eq_b && (i==j)) { - AmB[i_A2 + j * nV] += eps[a] - eps[i]; + AmB[i_A3] += eps[a] - eps[i]; } j ++; diff --git a/src/cuda/src/ph_drpa_apb_sing.cu b/src/cuda/src/ph_drpa_apb_sing.cu index 0d25bd0..d317fb4 100644 --- a/src/cuda/src/ph_drpa_apb_sing.cu +++ b/src/cuda/src/ph_drpa_apb_sing.cu @@ -1,22 +1,29 @@ #include -__global__ void ph_dRPA_ApB_sing_kernel(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *ApB) { + +__global__ void ph_dRPA_ApB_sing_kernel(int nO, int nV, int nBas, int nS, + double *eps, double *ERI, double *ApB) { - int i, j, a, b; - int aa, bb; - int nVS; - int nBas2, nBas3; - int i_A0, i_A1, i_A2; + long i, j, a, b; + long aa, bb; + + int i_A0, i_A1, i_A2, i_A3; int i_I0, i_I1, i_I2; int i_J1, i_J2; + int nVS; + int nBas2; + + long long i_I3, i_J3; + long long nBas3; + bool a_eq_b; nVS = nV * nS; nBas2 = nBas * nBas; - nBas3 = nBas2 * nBas; + nBas3 = (long long) nBas2 * (long long) nBas; aa = blockIdx.x * blockDim.x + threadIdx.x; bb = blockIdx.y * blockDim.y + threadIdx.y; @@ -34,21 +41,25 @@ __global__ void ph_dRPA_ApB_sing_kernel(int nO, int nV, int nBas, int nS, double i_A1 = i_A0 + bb; i_I1 = i_I0 + b * nBas; - i_J1 = i_I0 + b * nBas3; + i_J1 = a + b * nBas; i = 0; while(i < nO) { i_A2 = i_A1 + i * nVS; i_I2 = i_I1 + i; - i_J2 = i_J1 + i; + i_J2 = i_J1 + i * nBas2; j = 0; while(j < nO) { - ApB[i_A2 + j * nV] = 2.0 * (ERI[i_I2 + j * nBas3] + ERI[i_J2 + j * nBas]); + i_A3 = i_A2 + j * nV; + i_I3 = i_I2 + (long long) j * nBas3; + i_J3 = i_J2 + (long long) j * nBas3; + + ApB[i_A3] = 2.0 * (ERI[i_I3] + ERI[i_J3]); if(a_eq_b && (i==j)) { - ApB[i_A2 + j * nV] += eps[a] - eps[i]; + ApB[i_A3] += eps[a] - eps[i]; } j ++; diff --git a/src/cuda/src/ph_drpa_b_sing.cu b/src/cuda/src/ph_drpa_b_sing.cu index 2a59142..b8ec06d 100644 --- a/src/cuda/src/ph_drpa_b_sing.cu +++ b/src/cuda/src/ph_drpa_b_sing.cu @@ -5,15 +5,17 @@ __global__ void ph_dRPA_B_sing_kernel(int nO, int nV, int nBas, int nS, double * int i, j, a, b; int aa, bb; - int nVS; - int nBas2, nBas3; - int i_B0, i_B1, i_B2; - int i_I0, i_I1, i_I2; - nVS = nV * nS; + long long nVS; + long long nBas2, nBas3; + long long i_B0, i_B1, i_B2, i_B3; + long long i_I0, i_I1, i_I2, i_I3; - nBas2 = nBas * nBas; - nBas3 = nBas2 * nBas; + + nVS = (long long) nV * (long long) nS; + + nBas2 = (long long) nBas * (long long) nBas; + nBas3 = nBas2 * (long long) nBas; aa = blockIdx.x * blockDim.x + threadIdx.x; bb = blockIdx.y * blockDim.y + threadIdx.y; @@ -21,25 +23,28 @@ __global__ void ph_dRPA_B_sing_kernel(int nO, int nV, int nBas, int nS, double * while(aa < nV) { a = aa + nO; - i_B0 = aa * nS; - i_I0 = a * nBas2; + i_B0 = (long long) aa * (long long) nS; + i_I0 = (long long) a * nBas2; while(bb < nV) { b = bb + nO; - i_B1 = i_B0 + bb; - i_I1 = i_I0 + b * nBas3; + i_B1 = i_B0 + (long long) bb; + i_I1 = i_I0 + (long long) b * nBas3; i = 0; while(i < nO) { - i_B2 = i_B1 + i * nVS; - i_I2 = i_I1 + i; + i_B2 = i_B1 + (long long) i * nVS; + i_I2 = i_I1 + (long long) i; j = 0; while(j < nO) { - B[i_B2 + j * nV] = 2.0 * ERI[i_I2 + j * nBas]; + i_B3 = i_B2 + (long long) j * (long long) nV; + i_I3 = i_I2 + (long long) j * (long long) nBas; + + B[i_B3] = 2.0 * ERI[i_I3]; j ++; } // j diff --git a/src/cuda/src/ph_drpa_sing.c b/src/cuda/src/ph_drpa_sing.c index 3d720e7..85329e0 100644 --- a/src/cuda/src/ph_drpa_sing.c +++ b/src/cuda/src/ph_drpa_sing.c @@ -65,6 +65,12 @@ void ph_drpa_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, cudaEventRecord(start, 0); ph_dRPA_ApB_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_ApB); ph_dRPA_AmB_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_AmB); + //ph_dRPA_A_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_ApB); + //ph_dRPA_B_sing(nO, nV, nBas, nS, d_ERI, d_AmB); + //check_Cuda_Errors(cudaDeviceSynchronize(), "cudaDeviceSynchronize", __FILE__, __LINE__); + //A_plus_B_in_A(nS, d_ApB, d_AmB); + //check_Cuda_Errors(cudaDeviceSynchronize(), "cudaDeviceSynchronize", __FILE__, __LINE__); + //A_minus_twoB_in_B(nS, d_ApB, d_AmB); check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); cudaEventRecord(stop, 0); cudaEventSynchronize(stop); @@ -73,6 +79,7 @@ void ph_drpa_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, // free memory + check_Cuda_Errors(cudaDeviceSynchronize(), "cudaDeviceSynchronize", __FILE__, __LINE__); check_Cuda_Errors(cudaFree(d_eps), "cudaFree", __FILE__, __LINE__); check_Cuda_Errors(cudaFree(d_ERI), "cudaFree", __FILE__, __LINE__); @@ -105,30 +112,62 @@ void ph_drpa_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, // d_AmBSq = d_AmB (d_Omega)^{+0.5} (d_AmB)^T - double *d_AmBSq = NULL; - check_Cuda_Errors(cudaMalloc((void**)&d_AmBSq, nS2 * sizeof(double)), - "cudaMalloc", __FILE__, __LINE__); - // d_AmBSqInv = d_AmB (d_Omega)^{-0.5} (d_AmB)^T + double *d_AmBSq = NULL; double *d_AmBSqInv = NULL; - check_Cuda_Errors(cudaMalloc((void**)&d_AmBSqInv, nS2 * sizeof(double)), - "cudaMalloc", __FILE__, __LINE__); + double *d_tmp1 = NULL; + double *d_tmp2 = NULL; + check_Cuda_Errors(cudaMalloc((void**)&d_AmBSq, nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); + check_Cuda_Errors(cudaMalloc((void**)&d_AmBSqInv, nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); + check_Cuda_Errors(cudaMalloc((void**)&d_tmp1, nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); + check_Cuda_Errors(cudaMalloc((void**)&d_tmp2, nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); + + check_Cublas_Errors(cublasCreate(&handle), "cublasCreate", __FILE__, __LINE__); cudaEventRecord(start, 0); - A_D_At(nS, d_AmB, d_Omega, d_AmBSq); - A_Dinv_At(nS, d_AmB, d_Omega, d_AmBSqInv); + // naive way + //A_D_At(nS, d_AmB, d_Omega, d_AmBSq); + //A_Dinv_At(nS, d_AmB, d_Omega, d_AmBSqInv); + + A_D_in_B(nS, d_AmB, d_Omega, d_tmp1); + A_Dinv_in_B(nS, d_AmB, d_Omega, d_tmp2); + + check_Cuda_Errors(cudaDeviceSynchronize(), "cudaDeviceSynchronize", __FILE__, __LINE__); + + check_Cublas_Errors(cublasDgemm(handle, + CUBLAS_OP_N, CUBLAS_OP_T, + nS, nS, nS, + &alpha, + d_tmp1, nS, + d_AmB, nS, + &beta, + d_AmBSq, nS), + "cublasDgemm", __FILE__, __LINE__); + + check_Cublas_Errors(cublasDgemm(handle, + CUBLAS_OP_N, CUBLAS_OP_T, + nS, nS, nS, + &alpha, + d_tmp2, nS, + d_AmB, nS, + &beta, + d_AmBSqInv, nS), + "cublasDgemm", __FILE__, __LINE__); + check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); + cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&elapsedTime, start, stop); printf("Time elapsed on d_AmBSq & d_AmBSqInv = %f msec\n", elapsedTime); - + check_Cuda_Errors(cudaDeviceSynchronize(), "cudaDeviceSynchronize", __FILE__, __LINE__); + check_Cuda_Errors(cudaFree(d_tmp1), "cudaFree", __FILE__, __LINE__); + check_Cuda_Errors(cudaFree(d_tmp2), "cudaFree", __FILE__, __LINE__); // Dgemm cudaEventRecord(start, 0); - check_Cublas_Errors(cublasCreate(&handle), "cublasCreate", __FILE__, __LINE__); // X + Y check_Cublas_Errors(cublasDgemm(handle, From 6cfe4a5dece390784ddcc1b6c2f70afe7bac7671 Mon Sep 17 00:00:00 2001 From: AbdAmmar Date: Sun, 1 Dec 2024 11:22:29 +0100 Subject: [PATCH 22/39] OMP for phLR --- src/LR/phLR_A.f90 | 116 ++++++++++++++++++++++++++++++++++------------ src/LR/phLR_B.f90 | 107 ++++++++++++++++++++++++++++++------------ 2 files changed, 163 insertions(+), 60 deletions(-) diff --git a/src/LR/phLR_A.f90 b/src/LR/phLR_A.f90 index 2826fdd..4b44aa7 100644 --- a/src/LR/phLR_A.f90 +++ b/src/LR/phLR_A.f90 @@ -25,6 +25,9 @@ subroutine phLR_A(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,e,ERI,Aph) double precision,external :: Kronecker_delta integer :: i,j,a,b,ia,jb + integer :: nn,jb0 + logical :: i_eq_j + double precision :: ct1,ct2 ! Output variables @@ -39,22 +42,49 @@ subroutine phLR_A(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,e,ERI,Aph) if(ispin == 1) then - ia = 0 - do i=nC+1,nO - do a=nO+1,nBas-nR - ia = ia + 1 - jb = 0 - do j=nC+1,nO - do b=nO+1,nBas-nR - jb = jb + 1 - - Aph(ia,jb) = (e(a) - e(i))*Kronecker_delta(i,j)*Kronecker_delta(a,b) & - + 2d0*lambda*ERI(i,b,a,j) - (1d0 - delta_dRPA)*lambda*ERI(i,b,j,a) + nn = nBas - nR - nO + ct1 = 2d0 * lambda + ct2 = - (1d0 - delta_dRPA) * lambda + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP PRIVATE (i, a, j, b, i_eq_j, ia, jb0, jb) & + !$OMP SHARED (nC, nO, nR, nBas, nn, ct1, ct2, e, ERI, Aph) + !$OMP DO COLLAPSE(2) + do i = nC+1, nO + do a = nO+1, nBas-nR + ia = a - nO + (i - nC - 1) * nn - end do - end do - end do - end do + do j = nC+1, nO + i_eq_j = i == j + jb0 = (j - nC - 1) * nn - nO + + do b = nO+1, nBas-nR + jb = b + jb0 + + Aph(ia,jb) = ct1 * ERI(i,b,a,j) + ct2 * ERI(i,b,j,a) + if(i_eq_j) then + if(a == b) Aph(ia,jb) = Aph(ia,jb) + e(a) - e(i) + endif + enddo + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + !ia = 0 + !do i=nC+1,nO + ! do a=nO+1,nBas-nR + ! ia = ia + 1 + ! jb = 0 + ! do j=nC+1,nO + ! do b=nO+1,nBas-nR + ! jb = jb + 1 + ! Aph(ia,jb) = (e(a) - e(i))*Kronecker_delta(i,j)*Kronecker_delta(a,b) & + ! + 2d0*lambda*ERI(i,b,a,j) - (1d0 - delta_dRPA)*lambda*ERI(i,b,j,a) + ! end do + ! end do + ! end do + !end do end if @@ -62,22 +92,48 @@ subroutine phLR_A(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,e,ERI,Aph) if(ispin == 2) then - ia = 0 - do i=nC+1,nO - do a=nO+1,nBas-nR - ia = ia + 1 - jb = 0 - do j=nC+1,nO - do b=nO+1,nBas-nR - jb = jb + 1 - - Aph(ia,jb) = (e(a) - e(i))*Kronecker_delta(i,j)*Kronecker_delta(a,b) & - - (1d0 - delta_dRPA)*lambda*ERI(i,b,j,a) + nn = nBas - nR - nO + ct2 = - (1d0 - delta_dRPA) * lambda + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP PRIVATE (i, a, j, b, i_eq_j, ia, jb0, jb) & + !$OMP SHARED (nC, nO, nR, nBas, nn, ct2, e, ERI, Aph) + !$OMP DO COLLAPSE(2) + do i = nC+1, nO + do a = nO+1, nBas-nR + ia = a - nO + (i - nC - 1) * nn - end do - end do - end do - end do + do j = nC+1, nO + i_eq_j = i == j + jb0 = (j - nC - 1) * nn - nO + + do b = nO+1, nBas-nR + jb = b + jb0 + + Aph(ia,jb) = ct2 * ERI(i,b,j,a) + if(i_eq_j) then + if(a == b) Aph(ia,jb) = Aph(ia,jb) + e(a) - e(i) + endif + enddo + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + +! ia = 0 +! do i=nC+1,nO +! do a=nO+1,nBas-nR +! ia = ia + 1 +! jb = 0 +! do j=nC+1,nO +! do b=nO+1,nBas-nR +! jb = jb + 1 +! Aph(ia,jb) = (e(a) - e(i))*Kronecker_delta(i,j)*Kronecker_delta(a,b) & +! - (1d0 - delta_dRPA)*lambda*ERI(i,b,j,a) +! end do +! end do +! end do +! end do end if diff --git a/src/LR/phLR_B.f90 b/src/LR/phLR_B.f90 index 51a4a01..954065a 100644 --- a/src/LR/phLR_B.f90 +++ b/src/LR/phLR_B.f90 @@ -17,6 +17,8 @@ subroutine phLR_B(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,ERI,Bph) double precision :: delta_dRPA integer :: i,j,a,b,ia,jb + integer :: nn,jb0 + double precision :: ct1,ct2 ! Output variables @@ -31,21 +33,44 @@ subroutine phLR_B(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,ERI,Bph) if(ispin == 1) then - ia = 0 - do i=nC+1,nO - do a=nO+1,nBas-nR - ia = ia + 1 - jb = 0 - do j=nC+1,nO - do b=nO+1,nBas-nR - jb = jb + 1 - - Bph(ia,jb) = 2d0*lambda*ERI(i,j,a,b) - (1d0 - delta_dRPA)*lambda*ERI(i,j,b,a) - - end do - end do - end do - end do + nn = nBas - nR - nO + ct1 = 2d0 * lambda + ct2 = - (1d0 - delta_dRPA) * lambda + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP PRIVATE (i, a, j, b, ia, jb0, jb) & + !$OMP SHARED (nC, nO, nR, nBas, nn, ct1, ct2, ERI, Bph) + !$OMP DO COLLAPSE(2) + do i = nC+1, nO + do a = nO+1, nBas-nR + ia = a - nO + (i - nC - 1) * nn + + do j = nC+1, nO + jb0 = (j - nC - 1) * nn - nO + + do b = nO+1, nBas-nR + jb = b + jb0 + + Bph(ia,jb) = ct1 * ERI(i,j,a,b) + ct2 * ERI(i,j,b,a) + enddo + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + !ia = 0 + !do i=nC+1,nO + ! do a=nO+1,nBas-nR + ! ia = ia + 1 + ! jb = 0 + ! do j=nC+1,nO + ! do b=nO+1,nBas-nR + ! jb = jb + 1 + ! Bph(ia,jb) = 2d0*lambda*ERI(i,j,a,b) - (1d0 - delta_dRPA)*lambda*ERI(i,j,b,a) + ! end do + ! end do + ! end do + !end do end if @@ -53,21 +78,43 @@ subroutine phLR_B(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,ERI,Bph) if(ispin == 2) then - ia = 0 - do i=nC+1,nO - do a=nO+1,nBas-nR - ia = ia + 1 - jb = 0 - do j=nC+1,nO - do b=nO+1,nBas-nR - jb = jb + 1 - - Bph(ia,jb) = - (1d0 - delta_dRPA)*lambda*ERI(i,j,b,a) - - end do - end do - end do - end do + nn = nBas - nR - nO + ct2 = - (1d0 - delta_dRPA) * lambda + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP PRIVATE (i, a, j, b, ia, jb0, jb) & + !$OMP SHARED (nC, nO, nR, nBas, nn, ct2, ERI, Bph) + !$OMP DO COLLAPSE(2) + do i = nC+1, nO + do a = nO+1, nBas-nR + ia = a - nO + (i - nC - 1) * nn + + do j = nC+1, nO + jb0 = (j - nC - 1) * nn - nO + + do b = nO+1, nBas-nR + jb = b + jb0 + + Bph(ia,jb) = ct2 * ERI(i,j,b,a) + enddo + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + +! ia = 0 +! do i=nC+1,nO +! do a=nO+1,nBas-nR +! ia = ia + 1 +! jb = 0 +! do j=nC+1,nO +! do b=nO+1,nBas-nR +! jb = jb + 1 +! Bph(ia,jb) = - (1d0 - delta_dRPA)*lambda*ERI(i,j,b,a) +! end do +! end do +! end do +! end do end if From 20900979c4eef5ec4042be9e0b158e8ad5c32a0c Mon Sep 17 00:00:00 2001 From: AbdAmmar Date: Sun, 1 Dec 2024 11:38:55 +0100 Subject: [PATCH 23/39] opt memory access in Aph & Bph --- src/LR/phLR_A.f90 | 40 ++++++++++++++++++++-------------------- src/LR/phLR_B.f90 | 22 +++++++++++----------- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/src/LR/phLR_A.f90 b/src/LR/phLR_A.f90 index 4b44aa7..708799a 100644 --- a/src/LR/phLR_A.f90 +++ b/src/LR/phLR_A.f90 @@ -7,31 +7,31 @@ subroutine phLR_A(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,e,ERI,Aph) ! Input variables - logical,intent(in) :: dRPA - integer,intent(in) :: ispin - integer,intent(in) :: nBas - integer,intent(in) :: nC - integer,intent(in) :: nO - integer,intent(in) :: nV - integer,intent(in) :: nR - integer,intent(in) :: nS - double precision,intent(in) :: lambda - double precision,intent(in) :: e(nBas) - double precision,intent(in) :: ERI(nBas,nBas,nBas,nBas) + logical,intent(in) :: dRPA + integer,intent(in) :: ispin + integer,intent(in) :: nBas + integer,intent(in) :: nC + integer,intent(in) :: nO + integer,intent(in) :: nV + integer,intent(in) :: nR + integer,intent(in) :: nS + double precision,intent(in) :: lambda + double precision,intent(in) :: e(nBas) + double precision,intent(in) :: ERI(nBas,nBas,nBas,nBas) ! Local variables - double precision :: delta_dRPA - double precision,external :: Kronecker_delta + double precision :: delta_dRPA + double precision,external :: Kronecker_delta - integer :: i,j,a,b,ia,jb - integer :: nn,jb0 - logical :: i_eq_j - double precision :: ct1,ct2 + integer :: i,j,a,b,ia,jb + integer :: nn,jb0 + logical :: i_eq_j + double precision :: ct1,ct2 ! Output variables - double precision,intent(out) :: Aph(nS,nS) + double precision,intent(out) :: Aph(nS,nS) ! Direct RPA @@ -60,7 +60,7 @@ subroutine phLR_A(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,e,ERI,Aph) do b = nO+1, nBas-nR jb = b + jb0 - Aph(ia,jb) = ct1 * ERI(i,b,a,j) + ct2 * ERI(i,b,j,a) + Aph(ia,jb) = ct1 * ERI(b,i,j,a) + ct2 * ERI(b,j,a,i) if(i_eq_j) then if(a == b) Aph(ia,jb) = Aph(ia,jb) + e(a) - e(i) endif @@ -109,7 +109,7 @@ subroutine phLR_A(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,e,ERI,Aph) do b = nO+1, nBas-nR jb = b + jb0 - Aph(ia,jb) = ct2 * ERI(i,b,j,a) + Aph(ia,jb) = ct2 * ERI(b,j,a,i) if(i_eq_j) then if(a == b) Aph(ia,jb) = Aph(ia,jb) + e(a) - e(i) endif diff --git a/src/LR/phLR_B.f90 b/src/LR/phLR_B.f90 index 954065a..9645952 100644 --- a/src/LR/phLR_B.f90 +++ b/src/LR/phLR_B.f90 @@ -7,22 +7,22 @@ subroutine phLR_B(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,ERI,Bph) ! Input variables - logical,intent(in) :: dRPA - integer,intent(in) :: ispin,nBas,nC,nO,nV,nR,nS - double precision,intent(in) :: lambda - double precision,intent(in) :: ERI(nBas,nBas,nBas,nBas) + logical,intent(in) :: dRPA + integer,intent(in) :: ispin,nBas,nC,nO,nV,nR,nS + double precision,intent(in) :: lambda + double precision,intent(in) :: ERI(nBas,nBas,nBas,nBas) ! Local variables - double precision :: delta_dRPA + double precision :: delta_dRPA - integer :: i,j,a,b,ia,jb - integer :: nn,jb0 - double precision :: ct1,ct2 + integer :: i,j,a,b,ia,jb + integer :: nn,jb0 + double precision :: ct1,ct2 ! Output variables - double precision,intent(out) :: Bph(nS,nS) + double precision,intent(out) :: Bph(nS,nS) ! Direct RPA @@ -50,7 +50,7 @@ subroutine phLR_B(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,ERI,Bph) do b = nO+1, nBas-nR jb = b + jb0 - Bph(ia,jb) = ct1 * ERI(i,j,a,b) + ct2 * ERI(i,j,b,a) + Bph(ia,jb) = ct1 * ERI(b,i,j,a) + ct2 * ERI(b,j,i,a) enddo enddo enddo @@ -94,7 +94,7 @@ subroutine phLR_B(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,ERI,Bph) do b = nO+1, nBas-nR jb = b + jb0 - Bph(ia,jb) = ct2 * ERI(i,j,b,a) + Bph(ia,jb) = ct2 * ERI(b,j,i,a) enddo enddo enddo From 43ee6924bfb0c14fb572e66e8223a4e7f2f26e9b Mon Sep 17 00:00:00 2001 From: AbdAmmar Date: Sun, 1 Dec 2024 11:47:29 +0100 Subject: [PATCH 24/39] rm timing --- src/RPA/phRRPA.f90 | 63 ++++++++++++++++++++---------------------- src/RPA/phRRPA_GPU.f90 | 5 ++-- 2 files changed, 32 insertions(+), 36 deletions(-) diff --git a/src/RPA/phRRPA.f90 b/src/RPA/phRRPA.f90 index ebfa742..48f3c5c 100644 --- a/src/RPA/phRRPA.f90 +++ b/src/RPA/phRRPA.f90 @@ -8,39 +8,39 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO, ! Input variables - logical,intent(in) :: dotest + logical,intent(in) :: dotest - logical,intent(in) :: TDA - logical,intent(in) :: doACFDT - logical,intent(in) :: exchange_kernel - logical,intent(in) :: singlet - logical,intent(in) :: triplet - integer,intent(in) :: nBas - integer,intent(in) :: nC - integer,intent(in) :: nO - integer,intent(in) :: nV - integer,intent(in) :: nR - integer,intent(in) :: nS - double precision,intent(in) :: ENuc - double precision,intent(in) :: ERHF - double precision,intent(in) :: eHF(nBas) - double precision,intent(in) :: ERI(nBas,nBas,nBas,nBas) - double precision,intent(in) :: dipole_int(nBas,nBas,ncart) + logical,intent(in) :: TDA + logical,intent(in) :: doACFDT + logical,intent(in) :: exchange_kernel + logical,intent(in) :: singlet + logical,intent(in) :: triplet + integer,intent(in) :: nBas + integer,intent(in) :: nC + integer,intent(in) :: nO + integer,intent(in) :: nV + integer,intent(in) :: nR + integer,intent(in) :: nS + double precision,intent(in) :: ENuc + double precision,intent(in) :: ERHF + double precision,intent(in) :: eHF(nBas) + double precision,intent(in) :: ERI(nBas,nBas,nBas,nBas) + double precision,intent(in) :: dipole_int(nBas,nBas,ncart) ! Local variables - integer :: ia - integer :: ispin - logical :: dRPA - double precision :: t1, t2 - double precision :: lambda - double precision,allocatable :: Aph(:,:) - double precision,allocatable :: Bph(:,:) - double precision,allocatable :: Om(:) - double precision,allocatable :: XpY(:,:) - double precision,allocatable :: XmY(:,:) + integer :: ia + integer :: ispin + logical :: dRPA + double precision :: t1, t2 + double precision :: lambda + double precision,allocatable :: Aph(:,:) + double precision,allocatable :: Bph(:,:) + double precision,allocatable :: Om(:) + double precision,allocatable :: XpY(:,:) + double precision,allocatable :: XmY(:,:) - double precision :: EcRPA(nspin) + double precision :: EcRPA(nspin) ! Hello world @@ -75,14 +75,11 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO, !call wall_time(t1) call phLR_A(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,eHF,ERI,Aph) - !call wall_time(t2) - !print *, "wall time for A on CPU (sec) = ", t2 - t1 if(.not.TDA) call phLR_B(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,ERI,Bph) - !call wall_time(t1) call phLR(TDA,nS,Aph,Bph,EcRPA(ispin),Om,XpY,XmY) - call wall_time(t2) - !print *, "wall time diag A on CPU (sec) = ", t2 - t1 + !call wall_time(t2) + !print *, "wall time for dRPA on CPU (sec) = ", t2 - t1 !do ia = 1, nS ! write(112, *) Om(ia) !enddo diff --git a/src/RPA/phRRPA_GPU.f90 b/src/RPA/phRRPA_GPU.f90 index ed85749..19bc7e5 100644 --- a/src/RPA/phRRPA_GPU.f90 +++ b/src/RPA/phRRPA_GPU.f90 @@ -73,11 +73,10 @@ subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC else - !print*, 'start diag on GPU:' !call wall_time(t1) call ph_drpa_sing(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om(1), XpY(1,1), XmY(1,1)) - !call wall_time(t2) - !print*, 'diag time on GPU (sec):', t2 - t1 + call wall_time(t2) + print *, "wall time for dRPA on GPU (sec) = ", t2 - t1 !do ia = 1, nS ! write(111, *) Om(ia) !enddo From 97932b9c338db7af6d8e00dab7b2ee5162f49b20 Mon Sep 17 00:00:00 2001 From: AbdAmmar Date: Sun, 1 Dec 2024 12:28:34 +0100 Subject: [PATCH 25/39] rm timing --- src/RPA/phRRPA_GPU.f90 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/RPA/phRRPA_GPU.f90 b/src/RPA/phRRPA_GPU.f90 index 19bc7e5..dcf65d5 100644 --- a/src/RPA/phRRPA_GPU.f90 +++ b/src/RPA/phRRPA_GPU.f90 @@ -75,8 +75,8 @@ subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC !call wall_time(t1) call ph_drpa_sing(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om(1), XpY(1,1), XmY(1,1)) - call wall_time(t2) - print *, "wall time for dRPA on GPU (sec) = ", t2 - t1 + !call wall_time(t2) + !print *, "wall time for dRPA on GPU (sec) = ", t2 - t1 !do ia = 1, nS ! write(111, *) Om(ia) !enddo From ba5da9536a9d06b3ffc314c5c2d65b4e7a21e117 Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Fri, 6 Dec 2024 15:53:14 +0100 Subject: [PATCH 26/39] add memmap to avoid storing 2e-integ in DRAM --- PyDuck.py | 46 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/PyDuck.py b/PyDuck.py index 6c39b70..0c1655c 100644 --- a/PyDuck.py +++ b/PyDuck.py @@ -6,6 +6,8 @@ import pyscf from pyscf import gto import numpy as np import subprocess +import time + #Find the value of the environnement variable QUACK_ROOT. If not present we use the current repository if "QUACK_ROOT" not in os.environ: @@ -23,6 +25,7 @@ parser.add_argument('--bohr', default='Angstrom', action='store_const', const='B parser.add_argument('-c', '--charge', type=int, default=0, help='Total charge of the molecule. Specify negative charges with "m" instead of the minus sign, for example m1 instead of -1. Default is 0') parser.add_argument('--cartesian', default=False, action='store_true', help='Add this option if you want to use cartesian basis functions.') parser.add_argument('--print_2e', default=False, action='store_true', help='Add this option if you want to print 2e-integrals.') +parser.add_argument('--mmap_2e', default=False, action='store_true', help='If True, avoid using DRAM when generating 2e-integrals.') parser.add_argument('-fc', '--frozen_core', type=bool, default=False, help='Freeze core MOs. Default is false') parser.add_argument('-m', '--multiplicity', type=int, default=1, help='Spin multiplicity. Default is 1 therefore singlet') parser.add_argument('--working_dir', type=str, default=QuAcK_dir, help='Set a working directory to run the calculation.') @@ -38,6 +41,7 @@ multiplicity=args.multiplicity xyz=args.xyz + '.xyz' cartesian=args.cartesian print_2e=args.print_2e +mmap_2e=args.mmap_2e working_dir=args.working_dir #Read molecule @@ -129,33 +133,47 @@ write_matrix_to_file(y,norb,working_dir+'/int/y.dat') subprocess.call(['rm', '-f', working_dir + '/int/z.dat']) write_matrix_to_file(z,norb,working_dir+'/int/z.dat') -eri_ao = mol.intor('int2e') - -def write_tensor_to_file(tensor,size,file,cutoff=1e-15): - f = open(file, 'w') +def write_tensor_to_file(tensor,size,file_name,cutoff=1e-15): + f = open(file_name, 'w') for i in range(size): for j in range(i,size): for k in range(i,size): for l in range(j,size): if abs(tensor[i][k][j][l]) > cutoff: - #f.write(str(i+1)+' '+str(j+1)+' '+str(k+1)+' '+str(l+1)+' '+"{:.16E}".format(tensor[i][k][j][l])) f.write(str(i+1)+' '+str(j+1)+' '+str(k+1)+' '+str(l+1)+' '+"{:.16E}".format(tensor[i][k][j][l])) f.write('\n') f.close() -# Write two-electron integrals +# Write two-electron integrals to HD +ti_2e = time.time() if print_2e: # (formatted) - subprocess.call(['rm', '-f', working_dir + '/int/ERI.dat']) - write_tensor_to_file(eri_ao, norb, working_dir + '/int/ERI.dat') + output_file_path = working_dir + '/int/ERI.dat' + subprocess.call(['rm', '-f', output_file_path]) + eri_ao = mol.intor('int2e') + write_tensor_to_file(eri_ao, norb, output_file_path) else: # (binary) - subprocess.call(['rm', '-f', working_dir + '/int/ERI.bin']) - # chem -> phys notation - eri_ao = eri_ao.transpose(0, 2, 1, 3) - f = open(working_dir + '/int/ERI.bin', 'w') - eri_ao.tofile(working_dir + '/int/ERI.bin') - f.close() + output_file_path = working_dir + '/int/ERI.bin' + subprocess.call(['rm', '-f', output_file_path]) + if(mmap_2e): + # avoid using DRAM + eri_shape = (norb, norb, norb, norb) + eri_mmap = np.memmap(output_file_path, dtype='float64', mode='w+', shape=eri_shape) + mol.intor('int2e', out=eri_mmap) + for i in range(norb): + transposed_chunk = eri_mmap[i, :, :, :].transpose(1, 0, 2) + eri_mmap[i, :, :, :] = transposed_chunk + eri_mmap.flush() + del eri_mmap + else: + eri_ao = mol.intor('int2e').transpose(0, 2, 1, 3) # chem -> phys + f = open(output_file_path, 'w') + eri_ao.tofile(output_file_path) + f.close() +te_2e = time.time() +print("Wall time for writing 2e-integrals (physicist notation) to disk: {:.3f} seconds".format(te_2e - ti_2e)) + #Execute the QuAcK fortran program From ce1882cd4ee088e68d1d7c9512441b7e585adb3c Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Fri, 6 Dec 2024 15:57:35 +0100 Subject: [PATCH 27/39] mini modif --- PyDuck.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/PyDuck.py b/PyDuck.py index 0c1655c..6318204 100644 --- a/PyDuck.py +++ b/PyDuck.py @@ -162,8 +162,7 @@ else: eri_mmap = np.memmap(output_file_path, dtype='float64', mode='w+', shape=eri_shape) mol.intor('int2e', out=eri_mmap) for i in range(norb): - transposed_chunk = eri_mmap[i, :, :, :].transpose(1, 0, 2) - eri_mmap[i, :, :, :] = transposed_chunk + eri_mmap[i, :, :, :] = eri_mmap[i, :, :, :].transpose(1, 0, 2) eri_mmap.flush() del eri_mmap else: From b7af468f114980df42d76d7a15f60d47c2454f00 Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Sat, 7 Dec 2024 02:20:05 +0100 Subject: [PATCH 28/39] Working on s8 rep of 2e-integrals --- PyDuck.py | 67 ++-- input/hardware | 2 - input/hpc_flags | 4 + src/AOtoMO/Hartree_matrix_AO_basis.f90 | 103 +++++ src/HF/RHF_hpc.f90 | 247 ++++++++++++ src/QuAcK/GQuAcK.f90 | 34 +- src/QuAcK/QuAcK.f90 | 60 ++- src/QuAcK/RQuAcK.f90 | 41 +- src/QuAcK/RQuAcK_hpc.f90 | 353 ++++++++++++++++++ src/QuAcK/UQuAcK.f90 | 18 +- src/QuAcK/read_hardware.f90 | 45 --- src/QuAcK/read_hpc_flags.f90 | 39 ++ ...ad_integrals.f90 => read_1e_integrals.f90} | 68 +--- src/utils/read_2e_integrals.f90 | 110 ++++++ 14 files changed, 1019 insertions(+), 172 deletions(-) delete mode 100644 input/hardware create mode 100644 input/hpc_flags create mode 100644 src/HF/RHF_hpc.f90 create mode 100644 src/QuAcK/RQuAcK_hpc.f90 delete mode 100644 src/QuAcK/read_hardware.f90 create mode 100644 src/QuAcK/read_hpc_flags.f90 rename src/utils/{read_integrals.f90 => read_1e_integrals.f90} (59%) create mode 100644 src/utils/read_2e_integrals.f90 diff --git a/PyDuck.py b/PyDuck.py index 6318204..45113de 100644 --- a/PyDuck.py +++ b/PyDuck.py @@ -24,8 +24,10 @@ parser.add_argument('-b', '--basis', type=str, required=True, help='Name of the parser.add_argument('--bohr', default='Angstrom', action='store_const', const='Bohr', help='By default QuAcK assumes that the xyz files are in Angstrom. Add this argument if your xyz file is in Bohr.') parser.add_argument('-c', '--charge', type=int, default=0, help='Total charge of the molecule. Specify negative charges with "m" instead of the minus sign, for example m1 instead of -1. Default is 0') parser.add_argument('--cartesian', default=False, action='store_true', help='Add this option if you want to use cartesian basis functions.') -parser.add_argument('--print_2e', default=False, action='store_true', help='Add this option if you want to print 2e-integrals.') +parser.add_argument('--print_2e', default=True, action='store_true', help='If True, print 2e-integrals to disk.') +parser.add_argument('--formatted_2e', default=False, action='store_true', help='Add this option if you want to print formatted 2e-integrals.') parser.add_argument('--mmap_2e', default=False, action='store_true', help='If True, avoid using DRAM when generating 2e-integrals.') +parser.add_argument('--aosym_2e', default=False, action='store_true', help='If True, use 8-fold symmetry 2e-integrals.') parser.add_argument('-fc', '--frozen_core', type=bool, default=False, help='Freeze core MOs. Default is false') parser.add_argument('-m', '--multiplicity', type=int, default=1, help='Spin multiplicity. Default is 1 therefore singlet') parser.add_argument('--working_dir', type=str, default=QuAcK_dir, help='Set a working directory to run the calculation.') @@ -41,7 +43,9 @@ multiplicity=args.multiplicity xyz=args.xyz + '.xyz' cartesian=args.cartesian print_2e=args.print_2e +formatted_2e=args.formatted_2e mmap_2e=args.mmap_2e +aosym_2e=args.aosym_2e working_dir=args.working_dir #Read molecule @@ -63,6 +67,7 @@ mol = gto.M( basis = input_basis, charge = charge, spin = multiplicity - 1 +# symmetry = True # Enable symmetry ) #Fix the unit for the lengths @@ -144,34 +149,46 @@ def write_tensor_to_file(tensor,size,file_name,cutoff=1e-15): f.write('\n') f.close() -# Write two-electron integrals to HD -ti_2e = time.time() if print_2e: - # (formatted) - output_file_path = working_dir + '/int/ERI.dat' - subprocess.call(['rm', '-f', output_file_path]) - eri_ao = mol.intor('int2e') - write_tensor_to_file(eri_ao, norb, output_file_path) -else: - # (binary) - output_file_path = working_dir + '/int/ERI.bin' - subprocess.call(['rm', '-f', output_file_path]) - if(mmap_2e): - # avoid using DRAM - eri_shape = (norb, norb, norb, norb) - eri_mmap = np.memmap(output_file_path, dtype='float64', mode='w+', shape=eri_shape) - mol.intor('int2e', out=eri_mmap) - for i in range(norb): - eri_mmap[i, :, :, :] = eri_mmap[i, :, :, :].transpose(1, 0, 2) - eri_mmap.flush() - del eri_mmap - else: - eri_ao = mol.intor('int2e').transpose(0, 2, 1, 3) # chem -> phys + # Write two-electron integrals to HD + ti_2e = time.time() + + if formatted_2e: + output_file_path = working_dir + '/int/ERI.dat' + subprocess.call(['rm', '-f', output_file_path]) + eri_ao = mol.intor('int2e') + write_tensor_to_file(eri_ao, norb, output_file_path) + + if aosym_2e: + output_file_path = working_dir + '/int/ERI_chem.bin' + subprocess.call(['rm', '-f', output_file_path]) + eri_ao = mol.intor('int2e', aosym='s8') + print(eri_ao.shape) f = open(output_file_path, 'w') eri_ao.tofile(output_file_path) f.close() -te_2e = time.time() -print("Wall time for writing 2e-integrals (physicist notation) to disk: {:.3f} seconds".format(te_2e - ti_2e)) + else: + output_file_path = working_dir + '/int/ERI.bin' + subprocess.call(['rm', '-f', output_file_path]) + if(mmap_2e): + # avoid using DRAM + eri_shape = (norb, norb, norb, norb) + eri_mmap = np.memmap(output_file_path, dtype='float64', mode='w+', shape=eri_shape) + mol.intor('int2e', out=eri_mmap) + for i in range(norb): + eri_mmap[i, :, :, :] = eri_mmap[i, :, :, :].transpose(1, 0, 2) + eri_mmap.flush() + del eri_mmap + else: + eri_ao = mol.intor('int2e').transpose(0, 2, 1, 3) # chem -> phys + f = open(output_file_path, 'w') + eri_ao.tofile(output_file_path) + f.close() + + te_2e = time.time() + print("Wall time for writing 2e-integrals to disk: {:.3f} seconds".format(te_2e - ti_2e)) + sys.stdout.flush() + diff --git a/input/hardware b/input/hardware deleted file mode 100644 index 99c395a..0000000 --- a/input/hardware +++ /dev/null @@ -1,2 +0,0 @@ -# if True (T), use GPU - F diff --git a/input/hpc_flags b/input/hpc_flags new file mode 100644 index 0000000..120f367 --- /dev/null +++ b/input/hpc_flags @@ -0,0 +1,4 @@ +# if True (T), switch to HPC mode + F +# if True (T), use GPU + F diff --git a/src/AOtoMO/Hartree_matrix_AO_basis.f90 b/src/AOtoMO/Hartree_matrix_AO_basis.f90 index 03000a0..ed819c6 100644 --- a/src/AOtoMO/Hartree_matrix_AO_basis.f90 +++ b/src/AOtoMO/Hartree_matrix_AO_basis.f90 @@ -32,3 +32,106 @@ subroutine Hartree_matrix_AO_basis(nBas,P,G,H) end do end subroutine + +! --- + +subroutine Hartree_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, H) + + implicit none + + integer, intent(in) :: nBas + integer*8, intent(in) :: ERI_size + double precision, intent(in) :: P(nBas,nBas) + double precision, intent(in) :: ERI_chem(ERI_size) + double precision, intent(out) :: H(nBas,nBas) + + integer :: mu, nu, la, si + integer*8 :: munu0, munu + integer*8 :: sila0, sila + integer*8 :: munulasi0, munulasi + + integer*8, external :: Yoshimine_ind + + do nu = 1, nBas + do mu = 1, nBas + H(mu,nu) = 0.d0 + do si = 1, nBas + do la = 1, nBas + munulasi = Yoshimine_ind(mu, nu, la, si) + H(mu,nu) = H(mu,nu) + P(la,si) * ERI_chem(munulasi) + enddo + enddo + enddo + enddo + + +! do nu = 1, nBas +! munu0 = (nu * (nu + 1)) / 2 +! +! do mu = 1, nu +! munu = munu0 + mu +! munulasi0 = (munu * (munu + 1)) / 2 +! +! H(mu,nu) = 0.d0 +! +! do si = 1, nu +! sila0 = (si * (si + 1)) / 2 +! +! do la = 1, si +! sila = sila0 + la +! +! if(nu == si .and. mu < la) cycle +! +! munulasi = munulasi0 + sila +! +! H(mu,nu) = H(mu,nu) + 4.d0 * P(la,si) * ERI_chem(munulasi) +! enddo +! enddo +! enddo +! enddo +! +! +! do nu = 1, nBas +! do mu = nu+1, nBas +! H(mu,nu) = H(nu,mu) +! enddo +! enddo + + return +end subroutine + +! --- + +integer*8 function Yoshimine_ind(a, b, c, d) + + implicit none + + integer, intent(in) :: a, b, c, d + + integer*8 :: ab, cd, abcd + + if(a > b) then + ab = (a * (a - 1)) / 2 + b + else + ab = (b * (b - 1)) / 2 + a + endif + + if(c > d) then + cd = (c * (c - 1)) / 2 + d + else + cd = (d * (d - 1)) / 2 + c + endif + + if(ab > cd) then + abcd = (ab * (ab - 1)) / 2 + cd + else + abcd = (cd * (cd - 1)) / 2 + ab + endif + + Yoshimine_ind = abcd + + return +end + +! --- + diff --git a/src/HF/RHF_hpc.f90 b/src/HF/RHF_hpc.f90 new file mode 100644 index 0000000..ef6ea47 --- /dev/null +++ b/src/HF/RHF_hpc.f90 @@ -0,0 +1,247 @@ +subroutine RHF_hpc(working_dir,dotest,maxSCF,thresh,max_diis,guess_type,level_shift,nNuc,ZNuc,rNuc,ENuc, & + nBas,nOrb,nO,S,T,V,Hc,dipole_int,X,ERHF,eHF,c,P,F) + +! Perform restricted Hartree-Fock calculation + + implicit none + include 'parameters.h' + +! Input variables + + character(len=256),intent(in) :: working_dir + + logical,intent(in) :: dotest + + integer,intent(in) :: maxSCF + integer,intent(in) :: max_diis + integer,intent(in) :: guess_type + double precision,intent(in) :: thresh + double precision,intent(in) :: level_shift + + integer,intent(in) :: nBas + integer,intent(in) :: nOrb + integer,intent(in) :: nO + integer,intent(in) :: nNuc + double precision,intent(in) :: ZNuc(nNuc) + double precision,intent(in) :: rNuc(nNuc,ncart) + double precision,intent(in) :: ENuc + double precision,intent(in) :: S(nBas,nBas) + double precision,intent(in) :: T(nBas,nBas) + double precision,intent(in) :: V(nBas,nBas) + double precision,intent(in) :: Hc(nBas,nBas) + double precision,intent(in) :: X(nBas,nOrb) + double precision,intent(in) :: dipole_int(nBas,nBas,ncart) + +! Local variables + + integer :: ii, jj + integer :: nSCF + integer :: nBas_Sq + integer :: n_diis + integer*8 :: ERI_size + double precision :: diff, diff_loc + double precision :: ET + double precision :: EV + double precision :: EJ + double precision :: EK + double precision :: dipole(ncart) + double precision :: Conv + double precision :: rcond + double precision,external :: trace_matrix + double precision,allocatable :: err(:,:) + double precision,allocatable :: err_diis(:,:) + double precision,allocatable :: F_diis(:,:) + double precision,allocatable :: J(:,:) + double precision,allocatable :: K(:,:) + double precision,allocatable :: cp(:,:) + double precision,allocatable :: Fp(:,:) + double precision,allocatable :: ERI_chem(:) + double precision,allocatable :: ERI_phys(:,:,:,:), J_deb(:,:) + + +! Output variables + + double precision,intent(out) :: ERHF + double precision,intent(out) :: eHF(nOrb) + double precision,intent(inout):: c(nBas,nOrb) + double precision,intent(out) :: P(nBas,nBas) + double precision,intent(out) :: F(nBas,nBas) + +! Hello world + + write(*,*) + write(*,*)'****************************************' + write(*,*)'* Restricted HF Calculation (HPC mode) *' + write(*,*)'****************************************' + write(*,*) + +! Useful quantities + + nBas_Sq = nBas*nBas + +! Memory allocation + + allocate(J(nBas,nBas)) + allocate(K(nBas,nBas)) + + allocate(err(nBas,nBas)) + + allocate(cp(nOrb,nOrb)) + allocate(Fp(nOrb,nOrb)) + + allocate(err_diis(nBas_Sq,max_diis)) + allocate(F_diis(nBas_Sq,max_diis)) + +! Guess coefficients and density matrix + call mo_guess(nBas,nOrb,guess_type,S,Hc,X,c) + + call dgemm('N', 'T', nBas, nBas, nO, 2.d0, & + c(1,1), nBas, c(1,1), nBas, & + 0.d0, P(1,1), nBas) + + + ERI_size = (nBas * (nBas + 1)) / 2 + ERI_size = (ERI_size * (ERI_size + 1)) / 2 + allocate(ERI_chem(ERI_size)) + call read_2e_integrals_hpc(working_dir, ERI_size, ERI_chem) + call Hartree_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, J) + + allocate(J_deb(nBas,nBas)) + allocate(ERI_phys(nBas,nBas,nBas,nBas)) + call read_2e_integrals(working_dir, nBas, ERI_phys) + call Hartree_matrix_AO_basis(nBas, P, ERI_phys, J_deb) + + print*, maxval(dabs(J - J_deb)) + diff = 0.d0 + do ii = 1, nBas + do jj = 1, nBas + diff_loc = dabs(J(jj,ii) - J_deb(jj,ii)) + if(diff_loc .gt. 1d-13) then + print*, 'error on: ', jj, ii + print*, J(jj,ii), J_deb(jj,ii) + stop + endif + diff = diff + diff_loc + enddo + enddo + print*, 'total diff = ', diff + + stop + +! Initialization + + n_diis = 0 + F_diis(:,:) = 0d0 + err_diis(:,:) = 0d0 + rcond = 0d0 + + Conv = 1d0 + nSCF = 0 + +!------------------------------------------------------------------------ +! Main SCF loop +!------------------------------------------------------------------------ + + write(*,*) + write(*,*)'-----------------------------------------------------------------------------' + write(*,'(1X,A1,1X,A3,1X,A1,1X,A16,1X,A1,1X,A16,1X,A1,1X,A16,1X,A1,1X,A10,1X,A1,1X)') & + '|','#','|','E(RHF)','|','EJ(RHF)','|','EK(RHF)','|','Conv','|' + write(*,*)'-----------------------------------------------------------------------------' + + do while(Conv > thresh .and. nSCF < maxSCF) + + ! Increment + + nSCF = nSCF + 1 + + ! Build Fock matrix + call Hartree_matrix_AO_basis(nBas,P,ERI_phys,J) + call exchange_matrix_AO_basis(nBas,P,ERI_phys,K) + F(:,:) = Hc(:,:) + J(:,:) + 0.5d0*K(:,:) + + ! Check convergence + err = matmul(F, matmul(P, S)) - matmul(matmul(S, P), F) + if(nSCF > 1) Conv = maxval(abs(err)) + + ! Kinetic energy + ET = trace_matrix(nBas, matmul(P, T)) + + ! Potential energy + EV = trace_matrix(nBas, matmul(P, V)) + + ! Hartree energy + EJ = 0.5d0*trace_matrix(nBas, matmul(P, J)) + + ! Exchange energy + EK = 0.25d0*trace_matrix(nBas, matmul(P, K)) + + ! Total energy + ERHF = ET + EV + EJ + EK + + ! DIIS extrapolation + if(max_diis > 1) then + n_diis = min(n_diis+1,max_diis) + call DIIS_extrapolation(rcond,nBas_Sq,nBas_Sq,n_diis,err_diis,F_diis,err,F) + endif + + ! Level shift + if(level_shift > 0d0 .and. Conv > thresh) then + call level_shifting(level_shift,nBas,nOrb,nO,S,c,F) + endif + + ! Diagonalize Fock matrix + Fp = matmul(transpose(X), matmul(F, X)) + cp(:,:) = Fp(:,:) + call diagonalize_matrix(nOrb,cp,eHF) + c = matmul(X,cp) + + ! Density matrix + call dgemm('N', 'T', nBas, nBas, nO, 2.d0, & + c(1,1), nBas, c(1,1), nBas, & + 0.d0, P(1,1), nBas) + + ! Dump results + write(*,'(1X,A1,1X,I3,1X,A1,1X,F16.10,1X,A1,1X,F16.10,1X,A1,1X,F16.10,1X,A1,1X,E10.2,1X,A1,1X)') & + '|',nSCF,'|',ERHF + ENuc,'|',EJ,'|',EK,'|',Conv,'|' + + end do + write(*,*)'-----------------------------------------------------------------------------' +!------------------------------------------------------------------------ +! End of SCF loop +!------------------------------------------------------------------------ + +! Did it actually converge? + + if(nSCF == maxSCF) then + + write(*,*) + write(*,*)'!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!' + write(*,*)' Convergence failed ' + write(*,*)'!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!' + write(*,*) + + deallocate(J,K,err,cp,Fp,err_diis,F_diis) + + stop + + end if + +! Compute dipole moments + + call dipole_moment(nBas,P,nNuc,ZNuc,rNuc,dipole_int,dipole) + call print_RHF(nBas,nOrb,nO,eHF,c,ENuc,ET,EV,EJ,EK,ERHF,dipole) + +! Testing zone + + if(dotest) then + + call dump_test_value('R','RHF energy',ERHF) + call dump_test_value('R','RHF HOMO energy',eHF(nO)) + call dump_test_value('R','RHF LUMO energy',eHF(nO+1)) + call dump_test_value('R','RHF dipole moment',norm2(dipole)) + + end if + + deallocate(J,K,err,cp,Fp,err_diis,F_diis) + +end subroutine diff --git a/src/QuAcK/GQuAcK.f90 b/src/QuAcK/GQuAcK.f90 index 22441fd..b82f134 100644 --- a/src/QuAcK/GQuAcK.f90 +++ b/src/QuAcK/GQuAcK.f90 @@ -1,17 +1,19 @@ -subroutine GQuAcK(dotest,doGHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT, & - dodrCCD,dorCCD,docrCCD,dolCCD,dophRPA,dophRPAx,docrRPA,doppRPA, & - doG0W0,doevGW,doqsGW,doG0F2,doevGF2,doqsGF2,doG0T0pp,doevGTpp,doqsGTpp, & - nNuc,nBas,nC,nO,nV,nR,ENuc,ZNuc,rNuc,S,T,V,Hc,X,dipole_int_AO,ERI_AO, & - maxSCF_HF,max_diis_HF,thresh_HF,level_shift,guess_type,mix,reg_MP, & - maxSCF_CC,max_diis_CC,thresh_CC, & - TDA,maxSCF_GF,max_diis_GF,thresh_GF,lin_GF,reg_GF,eta_GF, & - maxSCF_GW,max_diis_GW,thresh_GW,TDA_W,lin_GW,reg_GW,eta_GW, & - maxSCF_GT,max_diis_GT,thresh_GT,TDA_T,lin_GT,reg_GT,eta_GT, & +subroutine GQuAcK(working_dir,dotest,doGHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT, & + dodrCCD,dorCCD,docrCCD,dolCCD,dophRPA,dophRPAx,docrRPA,doppRPA, & + doG0W0,doevGW,doqsGW,doG0F2,doevGF2,doqsGF2,doG0T0pp,doevGTpp,doqsGTpp, & + nNuc,nBas,nC,nO,nV,nR,ENuc,ZNuc,rNuc,S,T,V,Hc,X,dipole_int_AO, & + maxSCF_HF,max_diis_HF,thresh_HF,level_shift,guess_type,mix,reg_MP, & + maxSCF_CC,max_diis_CC,thresh_CC, & + TDA,maxSCF_GF,max_diis_GF,thresh_GF,lin_GF,reg_GF,eta_GF, & + maxSCF_GW,max_diis_GW,thresh_GW,TDA_W,lin_GW,reg_GW,eta_GW, & + maxSCF_GT,max_diis_GT,thresh_GT,TDA_T,lin_GT,reg_GT,eta_GT, & dophBSE,dophBSE2,doppBSE,dBSE,dTDA,doACFDT,exchange_kernel,doXBS) implicit none include 'parameters.h' + character(len=256),intent(in) :: working_dir + logical,intent(in) :: dotest logical,intent(in) :: doGHF @@ -41,7 +43,6 @@ subroutine GQuAcK(dotest,doGHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,do double precision,intent(in) :: Hc(nBas,nBas) double precision,intent(in) :: X(nBas,nBas) double precision,intent(in) :: dipole_int_AO(nBas,nBas,ncart) - double precision,intent(in) :: ERI_AO(nBas,nBas,nBas,nBas) integer,intent(in) :: maxSCF_HF,max_diis_HF double precision,intent(in) :: thresh_HF,level_shift,mix @@ -86,9 +87,11 @@ subroutine GQuAcK(dotest,doGHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,do double precision :: start_GW ,end_GW ,t_GW double precision :: start_GT ,end_GT ,t_GT + double precision :: start_int, end_int, t_int double precision,allocatable :: cHF(:,:),eHF(:),PHF(:,:),FHF(:,:) double precision :: EGHF double precision,allocatable :: dipole_int_MO(:,:,:) + double precision,allocatable :: ERI_AO(:,:,:,:) double precision,allocatable :: ERI_MO(:,:,:,:) double precision,allocatable :: ERI_tmp(:,:,:,:) double precision,allocatable :: Ca(:,:),Cb(:,:) @@ -112,6 +115,17 @@ subroutine GQuAcK(dotest,doGHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,do allocate(cHF(nBas2,nBas2),eHF(nBas2),PHF(nBas2,nBas2),FHF(nBas2,nBas2), & dipole_int_MO(nBas2,nBas2,ncart),ERI_MO(nBas2,nBas2,nBas2,nBas2)) + + allocate(ERI_AO(nBas,nBas,nBas,nBas)) + call wall_time(start_int) + call read_2e_integrals(working_dir,nBas,ERI_AO) + call wall_time(end_int) + t_int = end_int - start_int + write(*,*) + write(*,'(A65,1X,F9.3,A8)') 'Total wall time for reading 2e-integrals =',t_int,' seconds' + write(*,*) + + !---------------------! ! Hartree-Fock module ! !---------------------! diff --git a/src/QuAcK/QuAcK.f90 b/src/QuAcK/QuAcK.f90 index 9072904..8fd0214 100644 --- a/src/QuAcK/QuAcK.f90 +++ b/src/QuAcK/QuAcK.f90 @@ -32,7 +32,6 @@ program QuAcK double precision,allocatable :: Hc(:,:) double precision,allocatable :: X(:,:),X_tmp(:,:) double precision,allocatable :: dipole_int_AO(:,:,:) - double precision,allocatable :: ERI_AO(:,:,:,:) double precision,allocatable :: Uvec(:,:), Uval(:) double precision :: start_QuAcK,end_QuAcK,t_QuAcK @@ -44,6 +43,7 @@ program QuAcK logical :: reg_MP + logical :: switch_hpc logical :: use_gpu integer :: maxSCF_CC,max_diis_CC @@ -140,7 +140,7 @@ program QuAcK ! Hardware ! !------------------! - call read_hardware(working_dir,use_gpu) + call read_hpc_flags(working_dir,switch_hpc,use_gpu) !------------------------------------! ! Read input information ! @@ -176,20 +176,19 @@ program QuAcK allocate(T(nBas,nBas)) allocate(V(nBas,nBas)) allocate(Hc(nBas,nBas)) - allocate(ERI_AO(nBas,nBas,nBas,nBas)) allocate(dipole_int_AO(nBas,nBas,ncart)) ! Read integrals call wall_time(start_int) - call read_integrals(working_dir,nBas,S,T,V,Hc,ERI_AO) + call read_1e_integrals(working_dir,nBas,S,T,V,Hc) call read_dipole_integrals(working_dir,nBas,dipole_int_AO) call wall_time(end_int) t_int = end_int - start_int write(*,*) - write(*,'(A65,1X,F9.3,A8)') 'Total wall time for reading integrals = ',t_int,' seconds' + write(*,'(A65,1X,F9.3,A8)') 'Total wall time for reading 1e-integrals = ',t_int,' seconds' write(*,*) ! Compute orthogonalization matrix @@ -225,29 +224,44 @@ program QuAcK ! Restricted QuAcK branch ! !-------------------------! - if(doRQuAcK) & - call RQuAcK(use_gpu,doRtest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT, & - dodrCCD,dorCCD,docrCCD,dolCCD,doCIS,doCIS_D,doCID,doCISD,doFCI,dophRPA,dophRPAx,docrRPA,doppRPA, & - doG0F2,doevGF2,doqsGF2,doufG0F02,doG0F3,doevGF3,doG0W0,doevGW,doqsGW,doufG0W0,doufGW, & - doG0T0pp,doevGTpp,doqsGTpp,doufG0T0pp,doG0T0eh,doevGTeh,doqsGTeh, & - nNuc,nBas,nOrb,nC,nO,nV,nR,ENuc,ZNuc,rNuc, & - S,T,V,Hc,X,dipole_int_AO,ERI_AO,maxSCF_HF,max_diis_HF,thresh_HF,level_shift, & - guess_type,mix,reg_MP,maxSCF_CC,max_diis_CC,thresh_CC,spin_conserved,spin_flip,TDA, & - maxSCF_GF,max_diis_GF,renorm_GF,thresh_GF,lin_GF,reg_GF,eta_GF,maxSCF_GW,max_diis_GW,thresh_GW, & - TDA_W,lin_GW,reg_GW,eta_GW,maxSCF_GT,max_diis_GT,thresh_GT,TDA_T,lin_GT,reg_GT,eta_GT, & - dophBSE,dophBSE2,doppBSE,dBSE,dTDA,doACFDT,exchange_kernel,doXBS) + if(doRQuAcK) then + + if(switch_hpc) then + call RQuAcK_hpc(working_dir,use_gpu,doRtest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT, & + dodrCCD,dorCCD,docrCCD,dolCCD,doCIS,doCIS_D,doCID,doCISD,doFCI,dophRPA,dophRPAx,docrRPA,doppRPA, & + doG0F2,doevGF2,doqsGF2,doufG0F02,doG0F3,doevGF3,doG0W0,doevGW,doqsGW,doufG0W0,doufGW, & + doG0T0pp,doevGTpp,doqsGTpp,doufG0T0pp,doG0T0eh,doevGTeh,doqsGTeh, & + nNuc,nBas,nOrb,nC,nO,nV,nR,ENuc,ZNuc,rNuc, & + S,T,V,Hc,X,dipole_int_AO,maxSCF_HF,max_diis_HF,thresh_HF,level_shift, & + guess_type,mix,reg_MP,maxSCF_CC,max_diis_CC,thresh_CC,spin_conserved,spin_flip,TDA, & + maxSCF_GF,max_diis_GF,renorm_GF,thresh_GF,lin_GF,reg_GF,eta_GF,maxSCF_GW,max_diis_GW,thresh_GW, & + TDA_W,lin_GW,reg_GW,eta_GW,maxSCF_GT,max_diis_GT,thresh_GT,TDA_T,lin_GT,reg_GT,eta_GT, & + dophBSE,dophBSE2,doppBSE,dBSE,dTDA,doACFDT,exchange_kernel,doXBS) + else + call RQuAcK(working_dir,use_gpu,doRtest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT, & + dodrCCD,dorCCD,docrCCD,dolCCD,doCIS,doCIS_D,doCID,doCISD,doFCI,dophRPA,dophRPAx,docrRPA,doppRPA, & + doG0F2,doevGF2,doqsGF2,doufG0F02,doG0F3,doevGF3,doG0W0,doevGW,doqsGW,doufG0W0,doufGW, & + doG0T0pp,doevGTpp,doqsGTpp,doufG0T0pp,doG0T0eh,doevGTeh,doqsGTeh, & + nNuc,nBas,nOrb,nC,nO,nV,nR,ENuc,ZNuc,rNuc, & + S,T,V,Hc,X,dipole_int_AO,maxSCF_HF,max_diis_HF,thresh_HF,level_shift, & + guess_type,mix,reg_MP,maxSCF_CC,max_diis_CC,thresh_CC,spin_conserved,spin_flip,TDA, & + maxSCF_GF,max_diis_GF,renorm_GF,thresh_GF,lin_GF,reg_GF,eta_GF,maxSCF_GW,max_diis_GW,thresh_GW, & + TDA_W,lin_GW,reg_GW,eta_GW,maxSCF_GT,max_diis_GT,thresh_GT,TDA_T,lin_GT,reg_GT,eta_GT, & + dophBSE,dophBSE2,doppBSE,dBSE,dTDA,doACFDT,exchange_kernel,doXBS) + endif + endif !---------------------------! ! Unrestricted QuAcK branch ! !---------------------------! if(doUQuAcK) & - call UQuAcK(doUtest,doUHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT, & + call UQuAcK(working_dir,doUtest,doUHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT, & dodrCCD,dorCCD,docrCCD,dolCCD,doCIS,doCIS_D,doCID,doCISD,doFCI,dophRPA,dophRPAx,docrRPA,doppRPA, & doG0F2,doevGF2,doqsGF2,doufG0F02,doG0F3,doevGF3,doG0W0,doevGW,doqsGW,doufG0W0,doufGW, & doG0T0pp,doevGTpp,doqsGTpp,doufG0T0pp,doG0T0eh,doevGTeh,doqsGTeh, & nNuc,nBas,nC,nO,nV,nR,ENuc,ZNuc,rNuc, & - S,T,V,Hc,X,dipole_int_AO,ERI_AO,maxSCF_HF,max_diis_HF,thresh_HF,level_shift, & + S,T,V,Hc,X,dipole_int_AO,maxSCF_HF,max_diis_HF,thresh_HF,level_shift, & guess_type,mix,reg_MP,maxSCF_CC,max_diis_CC,thresh_CC,spin_conserved,spin_flip,TDA, & maxSCF_GF,max_diis_GF,renorm_GF,thresh_GF,lin_GF,reg_GF,eta_GF,maxSCF_GW,max_diis_GW,thresh_GW, & TDA_W,lin_GW,reg_GW,eta_GW,maxSCF_GT,max_diis_GT,thresh_GT,TDA_T,lin_GT,reg_GT,eta_GT, & @@ -257,10 +271,10 @@ program QuAcK ! Generalized QuAcK branch ! !--------------------------! if(doGQuAcK) & - call GQuAcK(doGtest,doGHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT, & + call GQuAcK(working_dir,doGtest,doGHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT, & dodrCCD,dorCCD,docrCCD,dolCCD,dophRPA,dophRPAx,docrRPA,doppRPA, & doG0W0,doevGW,doqsGW,doG0F2,doevGF2,doqsGF2,doG0T0pp,doevGTpp,doqsGTpp, & - nNuc,nBas,sum(nC),sum(nO),sum(nV),sum(nR),ENuc,ZNuc,rNuc,S,T,V,Hc,X,dipole_int_AO,ERI_AO, & + nNuc,nBas,sum(nC),sum(nO),sum(nV),sum(nR),ENuc,ZNuc,rNuc,S,T,V,Hc,X,dipole_int_AO, & maxSCF_HF,max_diis_HF,thresh_HF,level_shift,guess_type,mix,reg_MP, & maxSCF_CC,max_diis_CC,thresh_CC,TDA,maxSCF_GF,max_diis_GF,thresh_GF,lin_GF,reg_GF,eta_GF, & maxSCF_GW,max_diis_GW,thresh_GW,TDA_W,lin_GW,reg_GW,eta_GW, & @@ -289,4 +303,10 @@ program QuAcK write(*,'(A65,1X,F9.3,A8)') 'Total wall time for QuAcK = ',t_QuAcK,' seconds' write(*,*) + deallocate(S) + deallocate(T) + deallocate(V) + deallocate(Hc) + deallocate(dipole_int_AO) + end program diff --git a/src/QuAcK/RQuAcK.f90 b/src/QuAcK/RQuAcK.f90 index 4a03be8..16b8159 100644 --- a/src/QuAcK/RQuAcK.f90 +++ b/src/QuAcK/RQuAcK.f90 @@ -1,12 +1,12 @@ -subroutine RQuAcK(use_gpu,dotest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT, & - dodrCCD,dorCCD,docrCCD,dolCCD,doCIS,doCIS_D,doCID,doCISD,doFCI,dophRPA,dophRPAx,docrRPA,doppRPA, & - doG0F2,doevGF2,doqsGF2,doufG0F02,doG0F3,doevGF3,doG0W0,doevGW,doqsGW,doufG0W0,doufGW, & - doG0T0pp,doevGTpp,doqsGTpp,doufG0T0pp,doG0T0eh,doevGTeh,doqsGTeh, & - nNuc,nBas,nOrb,nC,nO,nV,nR,ENuc,ZNuc,rNuc, & - S,T,V,Hc,X,dipole_int_AO,ERI_AO,maxSCF_HF,max_diis_HF,thresh_HF,level_shift, & - guess_type,mix,reg_MP,maxSCF_CC,max_diis_CC,thresh_CC,singlet,triplet,TDA, & - maxSCF_GF,max_diis_GF,renorm_GF,thresh_GF,lin_GF,reg_GF,eta_GF,maxSCF_GW,max_diis_GW,thresh_GW, & - TDA_W,lin_GW,reg_GW,eta_GW,maxSCF_GT,max_diis_GT,thresh_GT,TDA_T,lin_GT,reg_GT,eta_GT, & +subroutine RQuAcK(working_dir,use_gpu,dotest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT, & + dodrCCD,dorCCD,docrCCD,dolCCD,doCIS,doCIS_D,doCID,doCISD,doFCI,dophRPA,dophRPAx,docrRPA,doppRPA, & + doG0F2,doevGF2,doqsGF2,doufG0F02,doG0F3,doevGF3,doG0W0,doevGW,doqsGW,doufG0W0,doufGW, & + doG0T0pp,doevGTpp,doqsGTpp,doufG0T0pp,doG0T0eh,doevGTeh,doqsGTeh, & + nNuc,nBas,nOrb,nC,nO,nV,nR,ENuc,ZNuc,rNuc, & + S,T,V,Hc,X,dipole_int_AO,maxSCF_HF,max_diis_HF,thresh_HF,level_shift, & + guess_type,mix,reg_MP,maxSCF_CC,max_diis_CC,thresh_CC,singlet,triplet,TDA, & + maxSCF_GF,max_diis_GF,renorm_GF,thresh_GF,lin_GF,reg_GF,eta_GF,maxSCF_GW,max_diis_GW,thresh_GW, & + TDA_W,lin_GW,reg_GW,eta_GW,maxSCF_GT,max_diis_GT,thresh_GT,TDA_T,lin_GT,reg_GT,eta_GT, & dophBSE,dophBSE2,doppBSE,dBSE,dTDA,doACFDT,exchange_kernel,doXBS) ! Restricted branch of QuAcK @@ -14,6 +14,8 @@ subroutine RQuAcK(use_gpu,dotest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD, implicit none include 'parameters.h' + character(len=256),intent(in) :: working_dir + logical,intent(in) :: use_gpu logical,intent(in) :: dotest @@ -46,7 +48,6 @@ subroutine RQuAcK(use_gpu,dotest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD, double precision,intent(in) :: Hc(nBas,nBas) double precision,intent(in) :: X(nBas,nOrb) double precision,intent(in) :: dipole_int_AO(nBas,nBas,ncart) - double precision,intent(in) :: ERI_AO(nBas,nBas,nBas,nBas) integer,intent(in) :: maxSCF_HF,max_diis_HF double precision,intent(in) :: thresh_HF,level_shift,mix @@ -94,12 +95,14 @@ subroutine RQuAcK(use_gpu,dotest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD, double precision :: start_GW ,end_GW ,t_GW double precision :: start_GT ,end_GT ,t_GT + double precision :: start_int, end_int, t_int double precision,allocatable :: eHF(:) double precision,allocatable :: cHF(:,:) double precision,allocatable :: PHF(:,:) double precision,allocatable :: FHF(:,:) double precision :: ERHF double precision,allocatable :: dipole_int_MO(:,:,:) + double precision,allocatable :: ERI_AO(:,:,:,:) double precision,allocatable :: ERI_MO(:,:,:,:) integer :: ixyz integer :: nS @@ -121,6 +124,15 @@ subroutine RQuAcK(use_gpu,dotest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD, allocate(dipole_int_MO(nOrb,nOrb,ncart)) allocate(ERI_MO(nOrb,nOrb,nOrb,nOrb)) + allocate(ERI_AO(nBas,nBas,nBas,nBas)) + call wall_time(start_int) + call read_2e_integrals(working_dir,nBas,ERI_AO) + call wall_time(end_int) + t_int = end_int - start_int + write(*,*) + write(*,'(A65,1X,F9.3,A8)') 'Total wall time for reading 2e-integrals =',t_int,' seconds' + write(*,*) + !---------------------! ! Hartree-Fock module ! !---------------------! @@ -350,4 +362,13 @@ subroutine RQuAcK(use_gpu,dotest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD, end if + + deallocate(eHF) + deallocate(cHF) + deallocate(PHF) + deallocate(FHF) + deallocate(dipole_int_MO) + deallocate(ERI_MO) + deallocate(ERI_AO) + end subroutine diff --git a/src/QuAcK/RQuAcK_hpc.f90 b/src/QuAcK/RQuAcK_hpc.f90 new file mode 100644 index 0000000..9d13a57 --- /dev/null +++ b/src/QuAcK/RQuAcK_hpc.f90 @@ -0,0 +1,353 @@ +subroutine RQuAcK_hpc(working_dir,use_gpu,dotest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT, & + dodrCCD,dorCCD,docrCCD,dolCCD,doCIS,doCIS_D,doCID,doCISD,doFCI,dophRPA,dophRPAx,docrRPA,doppRPA, & + doG0F2,doevGF2,doqsGF2,doufG0F02,doG0F3,doevGF3,doG0W0,doevGW,doqsGW,doufG0W0,doufGW, & + doG0T0pp,doevGTpp,doqsGTpp,doufG0T0pp,doG0T0eh,doevGTeh,doqsGTeh, & + nNuc,nBas,nOrb,nC,nO,nV,nR,ENuc,ZNuc,rNuc, & + S,T,V,Hc,X,dipole_int_AO,maxSCF_HF,max_diis_HF,thresh_HF,level_shift, & + guess_type,mix,reg_MP,maxSCF_CC,max_diis_CC,thresh_CC,singlet,triplet,TDA, & + maxSCF_GF,max_diis_GF,renorm_GF,thresh_GF,lin_GF,reg_GF,eta_GF,maxSCF_GW,max_diis_GW,thresh_GW, & + TDA_W,lin_GW,reg_GW,eta_GW,maxSCF_GT,max_diis_GT,thresh_GT,TDA_T,lin_GT,reg_GT,eta_GT, & + dophBSE,dophBSE2,doppBSE,dBSE,dTDA,doACFDT,exchange_kernel,doXBS) + +! Restricted branch of QuAcK + + implicit none + include 'parameters.h' + + character(len=256),intent(in) :: working_dir + + logical,intent(in) :: use_gpu + + logical,intent(in) :: dotest + + logical,intent(in) :: doRHF,doROHF + logical,intent(in) :: dostab + logical,intent(in) :: dosearch + logical,intent(in) :: doMP2,doMP3 + logical,intent(in) :: doCCD,dopCCD,doDCD,doCCSD,doCCSDT + logical,intent(in) :: dodrCCD,dorCCD,docrCCD,dolCCD + logical,intent(in) :: doCIS,doCIS_D,doCID,doCISD,doFCI + logical,intent(in) :: dophRPA,dophRPAx,docrRPA,doppRPA + logical,intent(in) :: doG0F2,doevGF2,doqsGF2,doufG0F02,doG0F3,doevGF3 + logical,intent(in) :: doG0W0,doevGW,doqsGW,doufG0W0,doufGW + logical,intent(in) :: doG0T0pp,doevGTpp,doqsGTpp,doufG0T0pp + logical,intent(in) :: doG0T0eh,doevGTeh,doqsGTeh + + integer,intent(in) :: nNuc,nBas,nOrb + integer,intent(in) :: nC + integer,intent(in) :: nO + integer,intent(in) :: nV + integer,intent(in) :: nR + double precision,intent(in) :: ENuc + + double precision,intent(in) :: ZNuc(nNuc),rNuc(nNuc,ncart) + + double precision,intent(in) :: S(nBas,nBas) + double precision,intent(in) :: T(nBas,nBas) + double precision,intent(in) :: V(nBas,nBas) + double precision,intent(in) :: Hc(nBas,nBas) + double precision,intent(in) :: X(nBas,nOrb) + double precision,intent(in) :: dipole_int_AO(nBas,nBas,ncart) + + integer,intent(in) :: maxSCF_HF,max_diis_HF + double precision,intent(in) :: thresh_HF,level_shift,mix + integer,intent(in) :: guess_type + + logical,intent(in) :: reg_MP + + integer,intent(in) :: maxSCF_CC,max_diis_CC + double precision,intent(in) :: thresh_CC + + logical,intent(in) :: singlet + logical,intent(in) :: triplet + logical,intent(in) :: TDA + + integer,intent(in) :: maxSCF_GF,max_diis_GF,renorm_GF + double precision,intent(in) :: thresh_GF + logical,intent(in) :: lin_GF,reg_GF + double precision,intent(in) :: eta_GF + + integer,intent(in) :: maxSCF_GW,max_diis_GW + double precision,intent(in) :: thresh_GW + logical,intent(in) :: TDA_W,lin_GW,reg_GW + double precision,intent(in) :: eta_GW + + integer,intent(in) :: maxSCF_GT,max_diis_GT + double precision,intent(in) :: thresh_GT + logical,intent(in) :: TDA_T,lin_GT,reg_GT + double precision,intent(in) :: eta_GT + + logical,intent(in) :: dophBSE,dophBSE2,doppBSE,dBSE,dTDA + logical,intent(in) :: doACFDT,exchange_kernel,doXBS + +! Local variables + + logical :: doMP,doCC,doCI,doRPA,doGF,doGW,doGT + + double precision :: start_HF ,end_HF ,t_HF + double precision :: start_stab ,end_stab ,t_stab + double precision :: start_AOtoMO ,end_AOtoMO ,t_AOtoMO + double precision :: start_MP ,end_MP ,t_MP + double precision :: start_CC ,end_CC ,t_CC + double precision :: start_CI ,end_CI ,t_CI + double precision :: start_RPA ,end_RPA ,t_RPA + double precision :: start_GF ,end_GF ,t_GF + double precision :: start_GW ,end_GW ,t_GW + double precision :: start_GT ,end_GT ,t_GT + + double precision :: start_int, end_int, t_int + double precision,allocatable :: eHF(:) + double precision,allocatable :: cHF(:,:) + double precision,allocatable :: PHF(:,:) + double precision,allocatable :: FHF(:,:) + double precision :: ERHF + integer :: ixyz + integer :: nS + + write(*,*) + write(*,*) '*****************************************' + write(*,*) '* Restricted Branch of QuAcK (HPC mode) *' + write(*,*) '*****************************************' + write(*,*) + +!-------------------! +! Memory allocation ! +!-------------------! + + allocate(eHF(nOrb)) + allocate(cHF(nBas,nOrb)) + allocate(PHF(nBas,nBas)) + allocate(FHF(nBas,nBas)) + +!---------------------! +! Hartree-Fock module ! +!---------------------! + + if(doRHF) then + + call wall_time(start_HF) + call RHF_hpc(working_dir,dotest,maxSCF_HF,thresh_HF,max_diis_HF,guess_type,level_shift,nNuc,ZNuc,rNuc,ENuc, & + nBas,nOrb,nO,S,T,V,Hc,dipole_int_AO,X,ERHF,eHF,cHF,PHF,FHF) + call wall_time(end_HF) + + t_HF = end_HF - start_HF + write(*,'(A65,1X,F9.3,A8)') 'Total wall time for RHF = ',t_HF,' seconds' + write(*,*) + + end if + +! if(doROHF) then +! +! call wall_time(start_HF) +! call ROHF(dotest,maxSCF_HF,thresh_HF,max_diis_HF,guess_type,mix,level_shift,nNuc,ZNuc,rNuc,ENuc, & +! nBas,nOrb,nO,S,T,V,Hc,ERI_AO,dipole_int_AO,X,ERHF,eHF,cHF,PHF,FHF) +! call wall_time(end_HF) +! +! t_HF = end_HF - start_HF +! write(*,'(A65,1X,F9.3,A8)') 'Total wall time for ROHF = ',t_HF,' seconds' +! write(*,*) +! +! end if +! +!!----------------------------------! +!! AO to MO integral transformation ! +!!----------------------------------! +! +! call wall_time(start_AOtoMO) +! +! write(*,*) +! write(*,*) 'AO to MO transformation... Please be patient' +! write(*,*) +! +! ! Read and transform dipole-related integrals +! +! do ixyz=1,ncart +! call AOtoMO(nBas,nOrb,cHF,dipole_int_AO(1,1,ixyz),dipole_int_MO(1,1,ixyz)) +! end do +! +! ! 4-index transform +! +! call AOtoMO_ERI_RHF(nBas,nOrb,cHF,ERI_AO,ERI_MO) +! +! call wall_time(end_AOtoMO) +! +! t_AOtoMO = end_AOtoMO - start_AOtoMO +! write(*,'(A65,1X,F9.3,A8)') 'Total wall time for AO to MO transformation = ',t_AOtoMO,' seconds' +! write(*,*) +! +!!-----------------------------------! +!! Stability analysis of HF solution ! +!!-----------------------------------! +! +! nS = (nO - nC)*(nV - nR) +! +! if(dostab) then +! +! call wall_time(start_stab) +! call RHF_stability(nOrb,nC,nO,nV,nR,nS,eHF,ERI_MO) +! call wall_time(end_stab) +! +! t_stab = end_stab - start_stab +! write(*,'(A65,1X,F9.3,A8)') 'Total wall time for stability analysis = ',t_stab,' seconds' +! write(*,*) +! +! end if +! +! if(dosearch) then +! +! call wall_time(start_stab) +! call RHF_search(maxSCF_HF,thresh_HF,max_diis_HF,guess_type,level_shift,nNuc,ZNuc,rNuc,ENuc, & +! nBas,nOrb,nC,nO,nV,nR,S,T,V,Hc,ERI_AO,ERI_MO,dipole_int_AO,dipole_int_MO,X, & +! ERHF,eHF,cHF,PHF,FHF) +! call wall_time(end_stab) +! +! t_stab = end_stab - start_stab +! write(*,'(A65,1X,F9.3,A8)') 'Total wall time for stability analysis = ',t_stab,' seconds' +! write(*,*) +! +! end if +! +!!-----------------------! +!! Moller-Plesset module ! +!!-----------------------! +! +! doMP = doMP2 .or. doMP3 +! +! if(doMP) then +! +! call wall_time(start_MP) +! call RMP(dotest,doMP2,doMP3,reg_MP,nOrb,nC,nO,nV,nR,ERI_MO,ENuc,ERHF,eHF) +! call wall_time(end_MP) +! +! t_MP = end_MP - start_MP +! write(*,'(A65,1X,F9.3,A8)') 'Total wall time for MP = ',t_MP,' seconds' +! write(*,*) +! +! end if +! +!!------------------------! +!! Coupled-cluster module ! +!!------------------------! +! +! doCC = doCCD .or. dopCCD .or. doDCD .or. doCCSD .or. doCCSDT .or. & +! dodrCCD .or. dorCCD .or. docrCCD .or. dolCCD +! +! if(doCC) then +! +! call wall_time(start_CC) +! call RCC(dotest,doCCD,dopCCD,doDCD,doCCSD,doCCSDT,dodrCCD,dorCCD,docrCCD,dolCCD, & +! maxSCF_CC,thresh_CC,max_diis_CC,nBas,nOrb,nC,nO,nV,nR,Hc,ERI_AO,ERI_MO, & +! ENuc,ERHF,eHF,cHF,PHF,FHF) +! call wall_time(end_CC) +! +! t_CC = end_CC - start_CC +! write(*,'(A65,1X,F9.3,A8)') 'Total wall time for CC = ',t_CC,' seconds' +! write(*,*) +! +! end if +! +!!----------------------------------! +!! Configuration interaction module ! +!!----------------------------------! +! +! doCI = doCIS .or. doCID .or. doCISD .or. doFCI +! +! if(doCI) then +! +! call wall_time(start_CI) +! call RCI(dotest,doCIS,doCIS_D,doCID,doCISD,doFCI,singlet,triplet,nOrb, & +! nC,nO,nV,nR,nS,ERI_MO,dipole_int_MO,eHF,ERHF) +! call wall_time(end_CI) +! +! t_CI = end_CI - start_CI +! write(*,'(A65,1X,F9.3,A8)') 'Total wall time for CI = ',t_CI,' seconds' +! write(*,*) +! +! end if +! +!!-----------------------------------! +!! Random-phase approximation module ! +!!-----------------------------------! +! +! doRPA = dophRPA .or. dophRPAx .or. docrRPA .or. doppRPA +! +! if(doRPA) then +! +! call wall_time(start_RPA) +! call RRPA(use_gpu,dotest,dophRPA,dophRPAx,docrRPA,doppRPA,TDA,doACFDT,exchange_kernel,singlet,triplet, & +! nOrb,nC,nO,nV,nR,nS,ENuc,ERHF,ERI_MO,dipole_int_MO,eHF) +! call wall_time(end_RPA) +! +! t_RPA = end_RPA - start_RPA +! write(*,'(A65,1X,F9.3,A8)') 'Total wall time for RPA = ',t_RPA,' seconds' +! write(*,*) +! +! end if +! +!!-------------------------! +!! Green's function module ! +!!-------------------------! +! +! doGF = doG0F2 .or. doevGF2 .or. doqsGF2 .or. doufG0F02 .or. doG0F3 .or. doevGF3 +! +! if(doGF) then +! +! call wall_time(start_GF) +! call RGF(dotest,doG0F2,doevGF2,doqsGF2,doufG0F02,doG0F3,doevGF3,renorm_GF,maxSCF_GF, & +! thresh_GF,max_diis_GF,dophBSE,doppBSE,TDA,dBSE,dTDA,singlet,triplet,lin_GF, & +! eta_GF,reg_GF,nNuc,ZNuc,rNuc,ENuc,nBas,nOrb,nC,nO,nV,nR,nS,ERHF, & +! S,X,T,V,Hc,ERI_AO,ERI_MO,dipole_int_AO,dipole_int_MO,PHF,cHF,eHF) +! call wall_time(end_GF) +! +! t_GF = end_GF - start_GF +! write(*,'(A65,1X,F9.3,A8)') 'Total wall time for GF2 = ',t_GF,' seconds' +! write(*,*) +! +! end if +! +!!-----------! +!! GW module ! +!!-----------! +! +! doGW = doG0W0 .or. doevGW .or. doqsGW .or. doufG0W0 .or. doufGW +! +! if(doGW) then +! +! call wall_time(start_GW) +! call RGW(dotest,doG0W0,doevGW,doqsGW,doufG0W0,doufGW,maxSCF_GW,thresh_GW,max_diis_GW, & +! doACFDT,exchange_kernel,doXBS,dophBSE,dophBSE2,doppBSE,TDA_W,TDA,dBSE,dTDA,singlet,triplet, & +! lin_GW,eta_GW,reg_GW,nNuc,ZNuc,rNuc,ENuc,nBas,nOrb,nC,nO,nV,nR,nS,ERHF,S,X,T, & +! V,Hc,ERI_AO,ERI_MO,dipole_int_AO,dipole_int_MO,PHF,cHF,eHF) +! call wall_time(end_GW) +! +! t_GW = end_GW - start_GW +! write(*,'(A65,1X,F9.3,A8)') 'Total wall time for GW = ',t_GW,' seconds' +! write(*,*) +! +! end if +! +!!-----------------! +!! T-matrix module ! +!!-----------------! +! +! doGT = doG0T0pp .or. doevGTpp .or. doqsGTpp .or. doufG0T0pp .or. doG0T0eh .or. doevGTeh .or. doqsGTeh +! +! if(doGT) then +! +! call wall_time(start_GT) +! call RGT(dotest,doG0T0pp,doevGTpp,doqsGTpp,doufG0T0pp,doG0T0eh,doevGTeh,doqsGTeh, & +! maxSCF_GT,thresh_GT,max_diis_GT,doACFDT,exchange_kernel,doXBS,dophBSE,dophBSE2,doppBSE, & +! TDA_T,TDA,dBSE,dTDA,singlet,triplet,lin_GT,eta_GT,reg_GT,nNuc,ZNuc,rNuc,ENuc, & +! nBas,nOrb,nC,nO,nV,nR,nS,ERHF,S,X,T,V,Hc,ERI_AO,ERI_MO,dipole_int_AO, & +! dipole_int_MO,PHF,cHF,eHF) +! call wall_time(end_GT) +! +! t_GT = end_GT - start_GT +! write(*,'(A65,1X,F9.3,A8)') 'Total wall time for GT = ',t_GT,' seconds' +! write(*,*) +! +! end if + + + return +end subroutine diff --git a/src/QuAcK/UQuAcK.f90 b/src/QuAcK/UQuAcK.f90 index f813a72..0ff1759 100644 --- a/src/QuAcK/UQuAcK.f90 +++ b/src/QuAcK/UQuAcK.f90 @@ -1,9 +1,9 @@ -subroutine UQuAcK(dotest,doUHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT, & +subroutine UQuAcK(working_dir,dotest,doUHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT, & dodrCCD,dorCCD,docrCCD,dolCCD,doCIS,doCIS_D,doCID,doCISD,doFCI,dophRPA,dophRPAx,docrRPA,doppRPA, & doG0F2,doevGF2,doqsGF2,doufG0F02,doG0F3,doevGF3,doG0W0,doevGW,doqsGW,doufG0W0,doufGW, & doG0T0pp,doevGTpp,doqsGTpp,doufG0T0pp,doG0T0eh,doevGTeh,doqsGTeh, & nNuc,nBas,nC,nO,nV,nR,ENuc,ZNuc,rNuc, & - S,T,V,Hc,X,dipole_int_AO,ERI_AO,maxSCF_HF,max_diis_HF,thresh_HF,level_shift, & + S,T,V,Hc,X,dipole_int_AO,maxSCF_HF,max_diis_HF,thresh_HF,level_shift, & guess_type,mix,reg_MP,maxSCF_CC,max_diis_CC,thresh_CC,spin_conserved,spin_flip,TDA, & maxSCF_GF,max_diis_GF,renorm_GF,thresh_GF,lin_GF,reg_GF,eta_GF,maxSCF_GW,max_diis_GW,thresh_GW, & TDA_W,lin_GW,reg_GW,eta_GW,maxSCF_GT,max_diis_GT,thresh_GT,TDA_T,lin_GT,reg_GT,eta_GT, & @@ -12,6 +12,8 @@ subroutine UQuAcK(dotest,doUHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,do implicit none include 'parameters.h' + character(len=256),intent(in) :: working_dir + logical,intent(in) :: dotest logical,intent(in) :: doUHF @@ -42,7 +44,6 @@ subroutine UQuAcK(dotest,doUHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,do double precision,intent(in) :: Hc(nBas,nBas) double precision,intent(in) :: X(nBas,nBas) double precision,intent(in) :: dipole_int_AO(nBas,nBas,ncart) - double precision,intent(in) :: ERI_AO(nBas,nBas,nBas,nBas) integer,intent(in) :: maxSCF_HF,max_diis_HF double precision,intent(in) :: thresh_HF,level_shift,mix @@ -90,10 +91,12 @@ subroutine UQuAcK(dotest,doUHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,do double precision :: start_GW ,end_GW ,t_GW double precision :: start_GT ,end_GT ,t_GT + double precision :: start_int, end_int, t_int double precision,allocatable :: cHF(:,:,:),eHF(:,:),PHF(:,:,:),FHF(:,:,:) double precision :: EUHF double precision,allocatable :: dipole_int_aa(:,:,:),dipole_int_bb(:,:,:) double precision,allocatable :: ERI_aaaa(:,:,:,:),ERI_aabb(:,:,:,:),ERI_bbbb(:,:,:,:) + double precision,allocatable :: ERI_AO(:,:,:,:) integer :: ixyz integer :: nS(nspin) @@ -112,6 +115,15 @@ subroutine UQuAcK(dotest,doUHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,do ERI_aaaa(nBas,nBas,nBas,nBas),ERI_aabb(nBas,nBas,nBas,nBas), & ERI_bbbb(nBas,nBas,nBas,nBas)) + allocate(ERI_AO(nBas,nBas,nBas,nBas)) + call wall_time(start_int) + call read_2e_integrals(working_dir,nBas,ERI_AO) + call wall_time(end_int) + t_int = end_int - start_int + write(*,*) + write(*,'(A65,1X,F9.3,A8)') 'Total wall time for reading 2e-integrals =',t_int,' seconds' + write(*,*) + !---------------------! ! Hartree-Fock module ! !---------------------! diff --git a/src/QuAcK/read_hardware.f90 b/src/QuAcK/read_hardware.f90 deleted file mode 100644 index c014d44..0000000 --- a/src/QuAcK/read_hardware.f90 +++ /dev/null @@ -1,45 +0,0 @@ -subroutine read_hardware(working_dir,use_gpu) - -! Read desired methods - - implicit none - -! Input variables - - character(len=256),intent(in) :: working_dir - -! Output variables - - logical,intent(out) :: use_gpu - -! Local variables - - character(len=1) :: ans - integer :: ios - character(len=256) :: file_path - -! Open file with method specification - - file_path = trim(working_dir) // '/input/hardware' - open(unit=1, file=file_path, status='old', action='read', iostat=ios) - - if(ios /= 0) then - - use_gpu = .False. - - else - - read(1,*) - read(1,*) ans - if(ans == 'T') then - use_gpu = .true. - else - use_gpu = .False. - endif - - endif - - ! Close file with options - close(unit=1) - -end subroutine diff --git a/src/QuAcK/read_hpc_flags.f90 b/src/QuAcK/read_hpc_flags.f90 new file mode 100644 index 0000000..4921754 --- /dev/null +++ b/src/QuAcK/read_hpc_flags.f90 @@ -0,0 +1,39 @@ +subroutine read_hpc_flags(working_dir, switch_hpc, use_gpu) + + implicit none + + character(len=256), intent(in) :: working_dir + + logical, intent(out) :: switch_hpc + logical, intent(out) :: use_gpu + + character(len=1) :: ans + integer :: ios + character(len=256) :: file_path + + file_path = trim(working_dir) // '/input/hpc_flags' + open(unit=1, file=file_path, status='old', action='read', iostat=ios) + + if(ios /= 0) then + + switch_hpc = .False. + use_gpu = .False. + + else + + switch_hpc = .False. + use_gpu = .False. + + read(1,*) + read(1,*) ans + if(ans == 'T') switch_hpc = .true. + + read(1,*) + read(1,*) ans + if(ans == 'T') use_gpu = .true. + + endif + + close(unit=1) + +end subroutine diff --git a/src/utils/read_integrals.f90 b/src/utils/read_1e_integrals.f90 similarity index 59% rename from src/utils/read_integrals.f90 rename to src/utils/read_1e_integrals.f90 index 91be5fe..eecdc05 100644 --- a/src/utils/read_integrals.f90 +++ b/src/utils/read_1e_integrals.f90 @@ -1,6 +1,6 @@ -subroutine read_integrals(working_dir,nBas_AOs,S,T,V,Hc,G) +subroutine read_1e_integrals(working_dir,nBas_AOs,S,T,V,Hc) -! Read one- and two-electron integrals from files +! Read one-electron integrals from files implicit none include 'parameters.h' @@ -13,9 +13,8 @@ subroutine read_integrals(working_dir,nBas_AOs,S,T,V,Hc,G) ! Local variables logical :: debug - integer :: mu,nu,la,si - double precision :: Ov,Kin,Nuc,ERI - double precision :: lambda + integer :: mu,nu + double precision :: Ov,Kin,Nuc ! Output variables @@ -23,26 +22,21 @@ subroutine read_integrals(working_dir,nBas_AOs,S,T,V,Hc,G) double precision,intent(out) :: T(nBas_AOs,nBas_AOs) double precision,intent(out) :: V(nBas_AOs,nBas_AOs) double precision,intent(out) :: Hc(nBas_AOs,nBas_AOs) - double precision,intent(out) :: G(nBas_AOs,nBas_AOs,nBas_AOs,nBas_AOs) - integer :: status, ios + integer :: ios character(len=256) :: file_path ! Open file with integrals debug = .false. - lambda = 1d0 - - print*, 'Scaling integrals by ',lambda - ! --- ! Read overlap integrals file_path = trim(working_dir) // '/int/Ov.dat' - open(unit=8, file=file_path, status='old', action='read', iostat=status) - if(status /= 0) then + open(unit=8, file=file_path, status='old', action='read', iostat=ios) + if(ios /= 0) then print *, "Error opening file: ", file_path stop else @@ -60,8 +54,8 @@ subroutine read_integrals(working_dir,nBas_AOs,S,T,V,Hc,G) ! Read kinetic integrals file_path = trim(working_dir) // '/int/Kin.dat' - open(unit=9, file=file_path, status='old', action='read', iostat=status) - if(status /= 0) then + open(unit=9, file=file_path, status='old', action='read', iostat=ios) + if(ios /= 0) then print *, "Error opening file: ", file_path stop else @@ -79,8 +73,8 @@ subroutine read_integrals(working_dir,nBas_AOs,S,T,V,Hc,G) ! Read nuclear integrals file_path = trim(working_dir) // '/int/Nuc.dat' - open(unit=10, file=file_path, status='old', action='read', iostat=status) - if(status /= 0) then + open(unit=10, file=file_path, status='old', action='read', iostat=ios) + if(ios /= 0) then print *, "Error opening file: ", file_path stop else @@ -99,37 +93,6 @@ subroutine read_integrals(working_dir,nBas_AOs,S,T,V,Hc,G) ! Define core Hamiltonian Hc(:,:) = T(:,:) + V(:,:) -! Read 2e-integrals - -! ! formatted file -! open(unit=11, file='int/ERI.dat') -! G(:,:,:,:) = 0d0 -! do -! read(11,*,end=11) mu, nu, la, si, ERI -! ERI = lambda*ERI -! G(mu,nu,la,si) = ERI ! <12|34> -! G(la,nu,mu,si) = ERI ! <32|14> -! G(mu,si,la,nu) = ERI ! <14|32> -! G(la,si,mu,nu) = ERI ! <34|12> -! G(si,mu,nu,la) = ERI ! <41|23> -! G(nu,la,si,mu) = ERI ! <23|41> -! G(nu,mu,si,la) = ERI ! <21|43> -! G(si,la,nu,mu) = ERI ! <43|21> -! end do -! 11 close(unit=11) - - ! binary file - file_path = trim(working_dir) // '/int/ERI.bin' - open(unit=11, file=file_path, status='old', action='read', form='unformatted', access='stream', iostat=status) - if(status /= 0) then - print *, "Error opening file: ", file_path - stop - else - read(11) G - endif - close(unit=11) - - ! Print results if(debug) then @@ -148,15 +111,6 @@ subroutine read_integrals(working_dir,nBas_AOs,S,T,V,Hc,G) write(*,'(A28)') '----------------------' call matout(nBas_AOs,nBas_AOs,V) write(*,*) - write(*,'(A28)') '----------------------' - write(*,'(A28)') 'Electron repulsion integrals' - write(*,'(A28)') '----------------------' - do la=1,nBas_AOs - do si=1,nBas_AOs - call matout(nBas_AOs, nBas_AOs, G(1,1,la,si)) - end do - end do - write(*,*) end if end subroutine diff --git a/src/utils/read_2e_integrals.f90 b/src/utils/read_2e_integrals.f90 new file mode 100644 index 0000000..94404a4 --- /dev/null +++ b/src/utils/read_2e_integrals.f90 @@ -0,0 +1,110 @@ +subroutine read_2e_integrals(working_dir,nBas_AOs,G) + +! Read two-electron integrals from files + + implicit none + +! Input variables + + integer,intent(in) :: nBas_AOs + character(len=256),intent(in) :: working_dir + +! Local variables + + logical :: debug + integer :: mu,nu,la,si + double precision :: ERI + double precision :: lambda + +! Output variables + + double precision,intent(out) :: G(nBas_AOs,nBas_AOs,nBas_AOs,nBas_AOs) + + integer :: ios + character(len=256) :: file_path + +! Open file with integrals + + debug = .false. + + lambda = 1d0 + + print*, 'Scaling integrals by ',lambda + + +! Read 2e-integrals + +! ! formatted file +! open(unit=11, file='int/ERI.dat') +! G(:,:,:,:) = 0d0 +! do +! read(11,*,end=11) mu, nu, la, si, ERI +! ERI = lambda*ERI +! G(mu,nu,la,si) = ERI ! <12|34> +! G(la,nu,mu,si) = ERI ! <32|14> +! G(mu,si,la,nu) = ERI ! <14|32> +! G(la,si,mu,nu) = ERI ! <34|12> +! G(si,mu,nu,la) = ERI ! <41|23> +! G(nu,la,si,mu) = ERI ! <23|41> +! G(nu,mu,si,la) = ERI ! <21|43> +! G(si,la,nu,mu) = ERI ! <43|21> +! end do +! 11 close(unit=11) + + ! binary file + file_path = trim(working_dir) // '/int/ERI.bin' + open(unit=11, file=file_path, status='old', action='read', form='unformatted', access='stream', iostat=ios) + if(ios /= 0) then + print *, "Error opening file: ", file_path + stop + else + read(11) G + endif + close(unit=11) + G = G * lambda + + + +! Print results + if(debug) then + write(*,'(A28)') '----------------------' + write(*,'(A28)') 'Electron repulsion integrals' + write(*,'(A28)') '----------------------' + do la=1,nBas_AOs + do si=1,nBas_AOs + call matout(nBas_AOs, nBas_AOs, G(1,1,la,si)) + end do + end do + write(*,*) + end if + +end subroutine + +! --- + +subroutine read_2e_integrals_hpc(working_dir, ERI_size, ERI_chem) + + implicit none + + character(len=256), intent(in) :: working_dir + integer*8, intent(in) :: ERI_size + double precision, intent(out) :: ERI_chem(ERI_size) + + integer :: ios + character(len=256) :: file_path + + file_path = trim(working_dir) // '/int/ERI_chem.bin' + open(unit=11, file=file_path, status='old', action='read', form='unformatted', access='stream', iostat=ios) + if(ios /= 0) then + print *, "Error opening file: ", file_path + stop + else + read(11) ERI_chem + endif + close(unit=11) + + return +end subroutine + +! --- + From 9094f169061b443af486a2c1323fb88d611cc25a Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Mon, 9 Dec 2024 18:53:55 +0100 Subject: [PATCH 29/39] diag of Coulomb-Fock matrix implemented with 8-fold symmetry --- src/AOtoMO/Hartree_matrix_AO_basis.f90 | 157 ++++++++++++++----------- src/utils/utils.f90 | 30 +++++ 2 files changed, 118 insertions(+), 69 deletions(-) diff --git a/src/AOtoMO/Hartree_matrix_AO_basis.f90 b/src/AOtoMO/Hartree_matrix_AO_basis.f90 index ed819c6..11e4788 100644 --- a/src/AOtoMO/Hartree_matrix_AO_basis.f90 +++ b/src/AOtoMO/Hartree_matrix_AO_basis.f90 @@ -46,92 +46,111 @@ subroutine Hartree_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, H) double precision, intent(out) :: H(nBas,nBas) integer :: mu, nu, la, si + integer :: nunu, lala, nula, lasi + integer*8 :: nunununu, nunulala, nununula, nunulasi, lalanunu, lasinunu integer*8 :: munu0, munu integer*8 :: sila0, sila integer*8 :: munulasi0, munulasi - integer*8, external :: Yoshimine_ind + integer*8, external :: Yoshimine_4ind + +! do nu = 1, nBas +! do mu = 1, nBas +! H(mu,nu) = 0.d0 +! do si = 1, nBas +! do la = 1, nBas +! munulasi = Yoshimine_4ind(mu, nu, la, si) +! H(mu,nu) = H(mu,nu) + P(la,si) * ERI_chem(munulasi) +! enddo +! enddo +! enddo +! enddo + +! do nu = 1, nBas +! do mu = 1, nu +! H(mu,nu) = 0.d0 +! do si = 1, nBas +! munulasi = Yoshimine_4ind(mu, nu, si, si) +! H(mu,nu) = H(mu,nu) + P(si,si) * ERI_chem(munulasi) +! do la = 1, si-1 +! munulasi = Yoshimine_4ind(mu, nu, la, si) +! H(mu,nu) = H(mu,nu) + 2.d0 * P(la,si) * ERI_chem(munulasi) +! enddo +! enddo +! enddo +! enddo + + do nu = 1, nBas - do mu = 1, nBas + + nunu = (nu * (nu - 1)) / 2 + nu + nunununu = (nunu * (nunu - 1)) / 2 + nunu + !nunununu = Yoshimine_4ind(nu, nu, nu, nu) + H(nu,nu) = P(nu,nu) * ERI_chem(nunununu) + + do la = 1, nu-1 + + ! la < nu + lala = (la * (la - 1)) / 2 + la + nunulala = (nunu * (nunu - 1)) / 2 + lala + !nunulala = Yoshimine_4ind(nu, nu, la, la) + H(nu,nu) = H(nu,nu) + P(la,la) * ERI_chem(nunulala) + + ! la < nu + nula = (nu * (nu - 1)) / 2 + la + nununula = (nunu * (nunu - 1)) / 2 + nula + !nununula = Yoshimine_4ind(nu, nu, la, nu) + H(nu,nu) = H(nu,nu) + 2.d0 * P(la,nu) * ERI_chem(nununula) + + do si = 1, la - 1 + ! lasi < nunu + lasi = (la * (la - 1)) / 2 + si + nunulasi = (nunu * (nunu - 1)) / 2 + lasi + !nunulasi = Yoshimine_4ind(nu, nu, si, la) + H(nu,nu) = H(nu,nu) + 2.d0 * P(si,la) * ERI_chem(nunulasi) + enddo + enddo + + do la = nu+1, nBas + + ! nu < la + lala = (la * (la - 1)) / 2 + la + lalanunu = (lala * (lala - 1)) / 2 + nunu + !lalanunu = Yoshimine_4ind(nu, nu, la, la) + H(nu,nu) = H(nu,nu) + P(la,la) * ERI_chem(lalanunu) + + do si = 1, la - 1 + ! nunu < lasi + lasi = (la * (la - 1)) / 2 + si + lasinunu = (lasi * (lasi - 1)) / 2 + nunu + !lasinunu = Yoshimine_4ind(nu, nu, si, la) + H(nu,nu) = H(nu,nu) + 2.d0 * P(si,la) * ERI_chem(lasinunu) + enddo + enddo + + do mu = 1, nu-1 H(mu,nu) = 0.d0 do si = 1, nBas - do la = 1, nBas - munulasi = Yoshimine_ind(mu, nu, la, si) - H(mu,nu) = H(mu,nu) + P(la,si) * ERI_chem(munulasi) + munulasi = Yoshimine_4ind(mu, nu, si, si) + H(mu,nu) = H(mu,nu) + P(si,si) * ERI_chem(munulasi) + do la = 1, si-1 + munulasi = Yoshimine_4ind(mu, nu, la, si) + H(mu,nu) = H(mu,nu) + 2.d0 * P(la,si) * ERI_chem(munulasi) enddo enddo enddo enddo -! do nu = 1, nBas -! munu0 = (nu * (nu + 1)) / 2 -! -! do mu = 1, nu -! munu = munu0 + mu -! munulasi0 = (munu * (munu + 1)) / 2 -! -! H(mu,nu) = 0.d0 -! -! do si = 1, nu -! sila0 = (si * (si + 1)) / 2 -! -! do la = 1, si -! sila = sila0 + la -! -! if(nu == si .and. mu < la) cycle -! -! munulasi = munulasi0 + sila -! -! H(mu,nu) = H(mu,nu) + 4.d0 * P(la,si) * ERI_chem(munulasi) -! enddo -! enddo -! enddo -! enddo -! -! -! do nu = 1, nBas -! do mu = nu+1, nBas -! H(mu,nu) = H(nu,mu) -! enddo -! enddo + do nu = 1, nBas + do mu = nu+1, nBas + H(mu,nu) = H(nu,mu) + enddo + enddo return end subroutine ! --- -integer*8 function Yoshimine_ind(a, b, c, d) - - implicit none - - integer, intent(in) :: a, b, c, d - - integer*8 :: ab, cd, abcd - - if(a > b) then - ab = (a * (a - 1)) / 2 + b - else - ab = (b * (b - 1)) / 2 + a - endif - - if(c > d) then - cd = (c * (c - 1)) / 2 + d - else - cd = (d * (d - 1)) / 2 + c - endif - - if(ab > cd) then - abcd = (ab * (ab - 1)) / 2 + cd - else - abcd = (cd * (cd - 1)) / 2 + ab - endif - - Yoshimine_ind = abcd - - return -end - -! --- - diff --git a/src/utils/utils.f90 b/src/utils/utils.f90 index 33cd99e..79d6d82 100644 --- a/src/utils/utils.f90 +++ b/src/utils/utils.f90 @@ -909,3 +909,33 @@ end ! --- +integer*8 function Yoshimine_4ind(a, b, c, d) + + implicit none + integer, intent(in) :: a, b, c, d + integer*8, external :: Yoshimine_2ind + + Yoshimine_4ind = Yoshimine_2ind(Yoshimine_2ind(a, b), & + Yoshimine_2ind(c, d)) + + return +end + +! --- + +integer*8 function Yoshimine_2ind(a, b) + + implicit none + integer, intent(in) :: a, b + + if(a > b) then + Yoshimine_2ind = (a * (a - 1)) / 2 + b + else + Yoshimine_2ind = (b * (b - 1)) / 2 + a + endif + + return +end + +! --- + From ec797982deed1929e9b8246e9670b705ba7cef6e Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Tue, 10 Dec 2024 02:25:52 +0100 Subject: [PATCH 30/39] full Coulomb-Fock matrix with 8-fold symmetry --- src/AOtoMO/Hartree_matrix_AO_basis.f90 | 94 ++++++++++++-------------- src/HF/RHF_hpc.f90 | 4 +- 2 files changed, 47 insertions(+), 51 deletions(-) diff --git a/src/AOtoMO/Hartree_matrix_AO_basis.f90 b/src/AOtoMO/Hartree_matrix_AO_basis.f90 index 11e4788..83a3e50 100644 --- a/src/AOtoMO/Hartree_matrix_AO_basis.f90 +++ b/src/AOtoMO/Hartree_matrix_AO_basis.f90 @@ -46,101 +46,97 @@ subroutine Hartree_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, H) double precision, intent(out) :: H(nBas,nBas) integer :: mu, nu, la, si - integer :: nunu, lala, nula, lasi - integer*8 :: nunununu, nunulala, nununula, nunulasi, lalanunu, lasinunu + integer :: nunu, lala, nula, lasi, numu + integer*8 :: nunununu, nunulala, nununula, nunulasi + integer*8 :: lalanunu, lasinunu, numulala, lalanumu + integer*8 :: numunula, numulasi, lasinumu integer*8 :: munu0, munu integer*8 :: sila0, sila integer*8 :: munulasi0, munulasi integer*8, external :: Yoshimine_4ind -! do nu = 1, nBas -! do mu = 1, nBas -! H(mu,nu) = 0.d0 -! do si = 1, nBas -! do la = 1, nBas -! munulasi = Yoshimine_4ind(mu, nu, la, si) -! H(mu,nu) = H(mu,nu) + P(la,si) * ERI_chem(munulasi) -! enddo -! enddo -! enddo -! enddo - -! do nu = 1, nBas -! do mu = 1, nu -! H(mu,nu) = 0.d0 -! do si = 1, nBas -! munulasi = Yoshimine_4ind(mu, nu, si, si) -! H(mu,nu) = H(mu,nu) + P(si,si) * ERI_chem(munulasi) -! do la = 1, si-1 -! munulasi = Yoshimine_4ind(mu, nu, la, si) -! H(mu,nu) = H(mu,nu) + 2.d0 * P(la,si) * ERI_chem(munulasi) -! enddo -! enddo -! enddo -! enddo - - do nu = 1, nBas nunu = (nu * (nu - 1)) / 2 + nu nunununu = (nunu * (nunu - 1)) / 2 + nunu - !nunununu = Yoshimine_4ind(nu, nu, nu, nu) H(nu,nu) = P(nu,nu) * ERI_chem(nunununu) do la = 1, nu-1 - ! la < nu lala = (la * (la - 1)) / 2 + la nunulala = (nunu * (nunu - 1)) / 2 + lala - !nunulala = Yoshimine_4ind(nu, nu, la, la) H(nu,nu) = H(nu,nu) + P(la,la) * ERI_chem(nunulala) - ! la < nu nula = (nu * (nu - 1)) / 2 + la nununula = (nunu * (nunu - 1)) / 2 + nula - !nununula = Yoshimine_4ind(nu, nu, la, nu) H(nu,nu) = H(nu,nu) + 2.d0 * P(la,nu) * ERI_chem(nununula) do si = 1, la - 1 - ! lasi < nunu lasi = (la * (la - 1)) / 2 + si nunulasi = (nunu * (nunu - 1)) / 2 + lasi - !nunulasi = Yoshimine_4ind(nu, nu, si, la) H(nu,nu) = H(nu,nu) + 2.d0 * P(si,la) * ERI_chem(nunulasi) enddo enddo do la = nu+1, nBas - ! nu < la lala = (la * (la - 1)) / 2 + la lalanunu = (lala * (lala - 1)) / 2 + nunu - !lalanunu = Yoshimine_4ind(nu, nu, la, la) H(nu,nu) = H(nu,nu) + P(la,la) * ERI_chem(lalanunu) do si = 1, la - 1 - ! nunu < lasi lasi = (la * (la - 1)) / 2 + si lasinunu = (lasi * (lasi - 1)) / 2 + nunu - !lasinunu = Yoshimine_4ind(nu, nu, si, la) H(nu,nu) = H(nu,nu) + 2.d0 * P(si,la) * ERI_chem(lasinunu) enddo enddo - do mu = 1, nu-1 + do mu = 1, nu - 1 + + numu = (nu * (nu - 1)) / 2 + mu + H(mu,nu) = 0.d0 - do si = 1, nBas - munulasi = Yoshimine_4ind(mu, nu, si, si) - H(mu,nu) = H(mu,nu) + P(si,si) * ERI_chem(munulasi) - do la = 1, si-1 - munulasi = Yoshimine_4ind(mu, nu, la, si) - H(mu,nu) = H(mu,nu) + 2.d0 * P(la,si) * ERI_chem(munulasi) + + do la = 1, nu - 1 + lala = (la * (la - 1)) / 2 + la + numulala = (numu * (numu - 1)) / 2 + lala + H(mu,nu) = H(mu,nu) + p(la,la) * ERI_chem(numulala) + enddo + do la = nu, nBas + lala = (la * (la - 1)) / 2 + la + lalanumu = (lala * (lala - 1)) / 2 + numu + H(mu,nu) = H(mu,nu) + p(la,la) * ERI_chem(lalanumu) + enddo + + do la = 1, mu + nula = (nu * (nu - 1)) / 2 + la + numunula = (numu * (numu - 1)) / 2 + nula + H(mu,nu) = H(mu,nu) + 2.d0 * P(la,nu) * ERI_chem(numunula) + enddo + do la = mu + 1, nu - 1 + nula = (nu * (nu - 1)) / 2 + la + numunula = (nula * (nula - 1)) / 2 + numu + H(mu,nu) = H(mu,nu) + 2.d0 * P(la,nu) * ERI_chem(numunula) + enddo + do la = 2, nu - 1 + do si = 1, la - 1 + lasi = (la * (la - 1)) / 2 + si + numulasi = (numu * (numu - 1)) / 2 + lasi + H(mu,nu) = H(mu,nu) + 2.d0 * P(si,la) * ERI_chem(numulasi) enddo enddo - enddo - enddo + do la = nu + 1, nBas + do si = 1, la - 1 + lasi = (la * (la - 1)) / 2 + si + lasinumu = (lasi * (lasi - 1)) / 2 + numu + H(mu,nu) = H(mu,nu) + 2.d0 * P(si,la) * ERI_chem(lasinumu) + enddo + enddo + + enddo ! mu + enddo ! nu do nu = 1, nBas diff --git a/src/HF/RHF_hpc.f90 b/src/HF/RHF_hpc.f90 index ef6ea47..dc66299 100644 --- a/src/HF/RHF_hpc.f90 +++ b/src/HF/RHF_hpc.f90 @@ -111,12 +111,12 @@ subroutine RHF_hpc(working_dir,dotest,maxSCF,thresh,max_diis,guess_type,level_sh call read_2e_integrals(working_dir, nBas, ERI_phys) call Hartree_matrix_AO_basis(nBas, P, ERI_phys, J_deb) - print*, maxval(dabs(J - J_deb)) + print*, "max error = ", maxval(dabs(J - J_deb)) diff = 0.d0 do ii = 1, nBas do jj = 1, nBas diff_loc = dabs(J(jj,ii) - J_deb(jj,ii)) - if(diff_loc .gt. 1d-13) then + if(diff_loc .gt. 1d-12) then print*, 'error on: ', jj, ii print*, J(jj,ii), J_deb(jj,ii) stop From 995bcfd2362eff3ecff11470d8450d53dbbfd678 Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Tue, 10 Dec 2024 11:29:31 +0100 Subject: [PATCH 31/39] working on X-Fock matrix (with s8 sym) --- src/AOtoMO/Hartree_matrix_AO_basis.f90 | 16 +-- src/AOtoMO/exchange_matrix_AO_basis.f90 | 149 ++++++++++++++++++++++++ src/HF/RHF_hpc.f90 | 30 ++++- src/utils/utils.f90 | 6 +- 4 files changed, 186 insertions(+), 15 deletions(-) diff --git a/src/AOtoMO/Hartree_matrix_AO_basis.f90 b/src/AOtoMO/Hartree_matrix_AO_basis.f90 index 83a3e50..0d0810d 100644 --- a/src/AOtoMO/Hartree_matrix_AO_basis.f90 +++ b/src/AOtoMO/Hartree_matrix_AO_basis.f90 @@ -49,13 +49,11 @@ subroutine Hartree_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, H) integer :: nunu, lala, nula, lasi, numu integer*8 :: nunununu, nunulala, nununula, nunulasi integer*8 :: lalanunu, lasinunu, numulala, lalanumu - integer*8 :: numunula, numulasi, lasinumu + integer*8 :: numunula, numulasi, lasinumu, nununumu integer*8 :: munu0, munu integer*8 :: sila0, sila integer*8 :: munulasi0, munulasi - integer*8, external :: Yoshimine_4ind - do nu = 1, nBas @@ -80,7 +78,7 @@ subroutine Hartree_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, H) enddo enddo - do la = nu+1, nBas + do la = nu + 1, nBas lala = (la * (la - 1)) / 2 + la lalanunu = (lala * (lala - 1)) / 2 + nunu @@ -96,15 +94,16 @@ subroutine Hartree_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, H) do mu = 1, nu - 1 numu = (nu * (nu - 1)) / 2 + mu - - H(mu,nu) = 0.d0 + nununumu = (nunu * (nunu - 1)) / 2 + numu + H(mu,nu) = p(nu,nu) * ERI_chem(nununumu) do la = 1, nu - 1 lala = (la * (la - 1)) / 2 + la numulala = (numu * (numu - 1)) / 2 + lala H(mu,nu) = H(mu,nu) + p(la,la) * ERI_chem(numulala) enddo - do la = nu, nBas + + do la = nu + 1, nBas lala = (la * (la - 1)) / 2 + la lalanumu = (lala * (lala - 1)) / 2 + numu H(mu,nu) = H(mu,nu) + p(la,la) * ERI_chem(lalanumu) @@ -115,11 +114,13 @@ subroutine Hartree_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, H) numunula = (numu * (numu - 1)) / 2 + nula H(mu,nu) = H(mu,nu) + 2.d0 * P(la,nu) * ERI_chem(numunula) enddo + do la = mu + 1, nu - 1 nula = (nu * (nu - 1)) / 2 + la numunula = (nula * (nula - 1)) / 2 + numu H(mu,nu) = H(mu,nu) + 2.d0 * P(la,nu) * ERI_chem(numunula) enddo + do la = 2, nu - 1 do si = 1, la - 1 lasi = (la * (la - 1)) / 2 + si @@ -127,6 +128,7 @@ subroutine Hartree_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, H) H(mu,nu) = H(mu,nu) + 2.d0 * P(si,la) * ERI_chem(numulasi) enddo enddo + do la = nu + 1, nBas do si = 1, la - 1 lasi = (la * (la - 1)) / 2 + si diff --git a/src/AOtoMO/exchange_matrix_AO_basis.f90 b/src/AOtoMO/exchange_matrix_AO_basis.f90 index 1a29038..d1c96b4 100644 --- a/src/AOtoMO/exchange_matrix_AO_basis.f90 +++ b/src/AOtoMO/exchange_matrix_AO_basis.f90 @@ -31,3 +31,152 @@ subroutine exchange_matrix_AO_basis(nBas,P,ERI,K) end do end subroutine + +! --- + +subroutine exchange_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, K) + + implicit none + + integer, intent(in) :: nBas + integer*8, intent(in) :: ERI_size + double precision, intent(in) :: P(nBas,nBas) + double precision, intent(in) :: ERI_chem(ERI_size) + double precision, intent(out) :: K(nBas,nBas) + + integer :: mu, nu, la, si + integer :: nunu, lala, nula, lasi, numu + integer*8 :: nunununu, nunulala, nununula, nunulasi + integer*8 :: lalanunu, lasinunu, numulala, lalanumu + integer*8 :: numunula, numulasi, lasinumu, nununumu + integer*8 :: munu0, munu + integer*8 :: sila0, sila + integer*8 :: munulasi0, munulasi + + integer*8, external :: Yoshimine_4ind + + + do nu = 1, nBas + + munulasi = Yoshimine_4ind(nu, nu, nu, nu) + K(nu,nu) = -P(nu,nu) * ERI_chem(munulasi) + + do la = 1, nu - 1 + munulasi = Yoshimine_4ind(nu, la, nu, la) + K(nu,nu) = K(nu,nu) - P(la,la) * ERI_chem(munulasi) + enddo + + do la = nu + 1, nBas + munulasi = Yoshimine_4ind(nu, la, nu, la) + K(nu,nu) = K(nu,nu) - P(la,la) * ERI_chem(munulasi) + enddo + + do la = 1, nu + do si = 1, la - 1 + munulasi = Yoshimine_4ind(nu, la, nu, si) + K(nu,nu) = K(nu,nu) - 2.d0 * P(si,la) * ERI_chem(munulasi) + enddo + enddo + + do la = nu + 1, nBas + do si = 1, nu + munulasi = Yoshimine_4ind(nu, la, nu, si) + K(nu,nu) = K(nu,nu) - 2.d0 * P(si,la) * ERI_chem(munulasi) + enddo + enddo + + do la = nu + 1, nBas + do si = nu + 1, la - 1 + munulasi = Yoshimine_4ind(nu, la, nu, si) + K(nu,nu) = K(nu,nu) - 2.d0 * P(si,la) * ERI_chem(munulasi) + enddo + enddo + + + do mu = 1, nu - 1 + + K(mu,nu) = 0.d0 + do la = 1, mu + munulasi = Yoshimine_4ind(nu, la, mu, la) + K(mu,nu) = K(mu,nu) - P(la,la) * ERI_chem(munulasi) + enddo + do la = mu + 1, nu + munulasi = Yoshimine_4ind(nu, la, mu, la) + K(mu,nu) = K(mu,nu) - P(la,la) * ERI_chem(munulasi) + enddo + do la = nu + 1, nBas + munulasi = Yoshimine_4ind(nu, la, mu, la) + K(mu,nu) = K(mu,nu) - P(la,la) * ERI_chem(munulasi) + enddo + + do la = 1, mu + do si = 1, la - 1 + munulasi = Yoshimine_4ind(nu, la, si, mu) + K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(munulasi) + enddo + enddo + do la = mu+1, nu + do si = 1, mu + munulasi = Yoshimine_4ind(nu, la, si, mu) + K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(munulasi) + enddo + do si = mu + 1, la - 1 + munulasi = Yoshimine_4ind(nu, la, si, mu) + K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(munulasi) + enddo + enddo + do la = nu + 1, nBas + do si = 1, mu + munulasi = Yoshimine_4ind(nu, la, si, mu) + K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(munulasi) + enddo + do si = mu + 1, la - 1 + munulasi = Yoshimine_4ind(nu, la, si, mu) + K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(munulasi) + enddo + enddo + + do la = 1, mu + do si = la + 1, mu + munulasi = Yoshimine_4ind(nu, la, si, mu) + K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(munulasi) + enddo + do si = mu + 1, nBas + munulasi = Yoshimine_4ind(nu, la, si, mu) + K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(munulasi) + enddo + enddo + do la = mu + 1, nu + do si = la + 1, nBas + munulasi = Yoshimine_4ind(nu, la, si, mu) + K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(munulasi) + enddo + enddo + do la = nu + 1, nBas + do si = la + 1, nBas + munulasi = Yoshimine_4ind(nu, la, si, mu) + K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(munulasi) + enddo + enddo + + !do la = 1, nBas + ! do si = la + 1, nBas + ! munulasi = Yoshimine_4ind(nu, la, mu, si) + ! K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(munulasi) + ! enddo + !enddo + enddo ! mu + enddo ! nu + + + do nu = 1, nBas + do mu = nu+1, nBas + K(mu,nu) = K(nu,mu) + enddo + enddo + + return +end subroutine + +! --- + diff --git a/src/HF/RHF_hpc.f90 b/src/HF/RHF_hpc.f90 index dc66299..d9bf77d 100644 --- a/src/HF/RHF_hpc.f90 +++ b/src/HF/RHF_hpc.f90 @@ -56,7 +56,7 @@ subroutine RHF_hpc(working_dir,dotest,maxSCF,thresh,max_diis,guess_type,level_sh double precision,allocatable :: cp(:,:) double precision,allocatable :: Fp(:,:) double precision,allocatable :: ERI_chem(:) - double precision,allocatable :: ERI_phys(:,:,:,:), J_deb(:,:) + double precision,allocatable :: ERI_phys(:,:,:,:), J_deb(:,:), K_deb(:,:) ! Output variables @@ -106,25 +106,43 @@ subroutine RHF_hpc(working_dir,dotest,maxSCF,thresh,max_diis,guess_type,level_sh call read_2e_integrals_hpc(working_dir, ERI_size, ERI_chem) call Hartree_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, J) - allocate(J_deb(nBas,nBas)) allocate(ERI_phys(nBas,nBas,nBas,nBas)) + allocate(J_deb(nBas,nBas)) + allocate(K_deb(nBas,nBas)) + call read_2e_integrals(working_dir, nBas, ERI_phys) call Hartree_matrix_AO_basis(nBas, P, ERI_phys, J_deb) - - print*, "max error = ", maxval(dabs(J - J_deb)) + print*, "max error on J = ", maxval(dabs(J - J_deb)) diff = 0.d0 do ii = 1, nBas do jj = 1, nBas diff_loc = dabs(J(jj,ii) - J_deb(jj,ii)) if(diff_loc .gt. 1d-12) then - print*, 'error on: ', jj, ii + print*, 'error in J on: ', jj, ii print*, J(jj,ii), J_deb(jj,ii) stop endif diff = diff + diff_loc enddo enddo - print*, 'total diff = ', diff + print*, 'total diff on J = ', diff + + call exchange_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, K) + call exchange_matrix_AO_basis(nBas, P, ERI_phys, K_deb) + print*, "max error on K = ", maxval(dabs(K - K_deb)) + diff = 0.d0 + do ii = 1, nBas + do jj = 1, nBas + diff_loc = dabs(K(jj,ii) - K_deb(jj,ii)) + if(diff_loc .gt. 1d-12) then + print*, 'error in K on: ', jj, ii + print*, K(jj,ii), K_deb(jj,ii) + stop + endif + diff = diff + diff_loc + enddo + enddo + print*, 'total diff on K = ', diff stop diff --git a/src/utils/utils.f90 b/src/utils/utils.f90 index 79d6d82..ad528c9 100644 --- a/src/utils/utils.f90 +++ b/src/utils/utils.f90 @@ -914,9 +914,11 @@ integer*8 function Yoshimine_4ind(a, b, c, d) implicit none integer, intent(in) :: a, b, c, d integer*8, external :: Yoshimine_2ind + integer*8 :: ab, cd - Yoshimine_4ind = Yoshimine_2ind(Yoshimine_2ind(a, b), & - Yoshimine_2ind(c, d)) + ab = Yoshimine_2ind(a, b) + cd = Yoshimine_2ind(c, d) + Yoshimine_4ind = Yoshimine_2ind(ab, cd) return end From 995378de742464f3df146d55ef05eea81ce3e0bd Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Tue, 10 Dec 2024 15:35:13 +0100 Subject: [PATCH 32/39] working on X-Fock with s8-ERI --- src/AOtoMO/exchange_matrix_AO_basis.f90 | 188 ++++++++++++++---------- src/utils/utils.f90 | 8 +- 2 files changed, 116 insertions(+), 80 deletions(-) diff --git a/src/AOtoMO/exchange_matrix_AO_basis.f90 b/src/AOtoMO/exchange_matrix_AO_basis.f90 index d1c96b4..47ffbe6 100644 --- a/src/AOtoMO/exchange_matrix_AO_basis.f90 +++ b/src/AOtoMO/exchange_matrix_AO_basis.f90 @@ -45,126 +45,164 @@ subroutine exchange_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, K) double precision, intent(out) :: K(nBas,nBas) integer :: mu, nu, la, si - integer :: nunu, lala, nula, lasi, numu - integer*8 :: nunununu, nunulala, nununula, nunulasi - integer*8 :: lalanunu, lasinunu, numulala, lalanumu - integer*8 :: numunula, numulasi, lasinumu, nununumu - integer*8 :: munu0, munu - integer*8 :: sila0, sila - integer*8 :: munulasi0, munulasi + integer :: nunu, nula, lanu, lasi, nusi, sinu + integer :: numu, mumu, mula, lamu, musi, simu + integer*8 :: nunununu, nulanula, lanulanu, nulanusi + integer*8 :: munulasi, lanunusi, lanusinu, numumumu + integer*8 :: nulamula, nulalamu, lanulamu, nulamusi + integer*8 :: nulasimu, lanumusi, lanusimu + integer*8, external :: Yoshimine_4ind do nu = 1, nBas - munulasi = Yoshimine_4ind(nu, nu, nu, nu) - K(nu,nu) = -P(nu,nu) * ERI_chem(munulasi) + nunu = (nu * (nu - 1)) / 2 + nu + nunununu = (nunu * (nunu - 1)) / 2 + nunu + K(nu,nu) = -P(nu,nu) * ERI_chem(nunununu) do la = 1, nu - 1 - munulasi = Yoshimine_4ind(nu, la, nu, la) - K(nu,nu) = K(nu,nu) - P(la,la) * ERI_chem(munulasi) + nula = (nu * (nu - 1)) / 2 + la + nulanula = (nula * (nula - 1)) / 2 + nula + K(nu,nu) = K(nu,nu) - P(la,la) * ERI_chem(nulanula) enddo do la = nu + 1, nBas - munulasi = Yoshimine_4ind(nu, la, nu, la) - K(nu,nu) = K(nu,nu) - P(la,la) * ERI_chem(munulasi) + lanu = (la * (la - 1)) / 2 + nu + lanulanu = (lanu * (lanu - 1)) / 2 + lanu + K(nu,nu) = K(nu,nu) - P(la,la) * ERI_chem(lanulanu) enddo do la = 1, nu + nula = (nu * (nu - 1)) / 2 + la do si = 1, la - 1 - munulasi = Yoshimine_4ind(nu, la, nu, si) - K(nu,nu) = K(nu,nu) - 2.d0 * P(si,la) * ERI_chem(munulasi) + nusi = (nu * (nu - 1)) / 2 + si + nulanusi = (nula * (nula - 1)) / 2 + nusi + K(nu,nu) = K(nu,nu) - 2.d0 * P(si,la) * ERI_chem(nulanusi) enddo enddo do la = nu + 1, nBas + lanu = (la * (la - 1)) / 2 + nu do si = 1, nu - munulasi = Yoshimine_4ind(nu, la, nu, si) - K(nu,nu) = K(nu,nu) - 2.d0 * P(si,la) * ERI_chem(munulasi) + nusi = (nu * (nu - 1)) / 2 + si + lanunusi = (lanu * (lanu - 1)) / 2 + nusi + K(nu,nu) = K(nu,nu) - 2.d0 * P(si,la) * ERI_chem(lanunusi) enddo - enddo - - do la = nu + 1, nBas do si = nu + 1, la - 1 - munulasi = Yoshimine_4ind(nu, la, nu, si) - K(nu,nu) = K(nu,nu) - 2.d0 * P(si,la) * ERI_chem(munulasi) + sinu = (si * (si - 1)) / 2 + nu + lanusinu = (lanu * (lanu - 1)) / 2 + sinu + K(nu,nu) = K(nu,nu) - 2.d0 * P(si,la) * ERI_chem(lanusinu) enddo enddo do mu = 1, nu - 1 - K(mu,nu) = 0.d0 - do la = 1, mu - munulasi = Yoshimine_4ind(nu, la, mu, la) - K(mu,nu) = K(mu,nu) - P(la,la) * ERI_chem(munulasi) + numu = (nu * (nu - 1)) / 2 + mu + mumu = (mu * (mu - 1)) / 2 + mu + numumumu = (numu * (numu - 1)) / 2 + mumu + K(mu,nu) = - P(mu,mu) * ERI_chem(numumumu) + + do la = 1, mu - 1 + mula = (mu * (mu - 1)) / 2 + la + nula = (nu * (nu - 1)) / 2 + la + nulamula = (nula * (nula - 1)) / 2 + mula + K(mu,nu) = K(mu,nu) - P(la,la) * ERI_chem(nulamula) enddo do la = mu + 1, nu - munulasi = Yoshimine_4ind(nu, la, mu, la) - K(mu,nu) = K(mu,nu) - P(la,la) * ERI_chem(munulasi) + lamu = (la * (la - 1)) / 2 + mu + nula = (nu * (nu - 1)) / 2 + la + nulalamu = (nula * (nula - 1)) / 2 + lamu + K(mu,nu) = K(mu,nu) - P(la,la) * ERI_chem(nulalamu) enddo do la = nu + 1, nBas - munulasi = Yoshimine_4ind(nu, la, mu, la) - K(mu,nu) = K(mu,nu) - P(la,la) * ERI_chem(munulasi) + lamu = (la * (la - 1)) / 2 + mu + lanu = (la * (la - 1)) / 2 + nu + lanulamu = (lanu * (lanu - 1)) / 2 + lamu + K(mu,nu) = K(mu,nu) - P(la,la) * ERI_chem(lanulamu) enddo do la = 1, mu + nula = (nu * (nu - 1)) / 2 + la do si = 1, la - 1 - munulasi = Yoshimine_4ind(nu, la, si, mu) - K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(munulasi) - enddo - enddo - do la = mu+1, nu - do si = 1, mu - munulasi = Yoshimine_4ind(nu, la, si, mu) - K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(munulasi) - enddo - do si = mu + 1, la - 1 - munulasi = Yoshimine_4ind(nu, la, si, mu) - K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(munulasi) - enddo - enddo - do la = nu + 1, nBas - do si = 1, mu - munulasi = Yoshimine_4ind(nu, la, si, mu) - K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(munulasi) - enddo - do si = mu + 1, la - 1 - munulasi = Yoshimine_4ind(nu, la, si, mu) - K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(munulasi) - enddo - enddo - - do la = 1, mu - do si = la + 1, mu - munulasi = Yoshimine_4ind(nu, la, si, mu) - K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(munulasi) - enddo - do si = mu + 1, nBas - munulasi = Yoshimine_4ind(nu, la, si, mu) - K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(munulasi) + musi = (mu * (mu - 1)) / 2 + si + nulamusi = (nula * (nula - 1)) / 2 + musi + K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(nulamusi) enddo enddo do la = mu + 1, nu - do si = la + 1, nBas - munulasi = Yoshimine_4ind(nu, la, si, mu) - K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(munulasi) + nula = (nu * (nu - 1)) / 2 + la + do si = 1, mu + musi = (mu * (mu - 1)) / 2 + si + nulamusi = (nula * (nula - 1)) / 2 + musi + K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(nulamusi) + enddo + do si = mu + 1, la - 1 + simu = (si * (si - 1)) / 2 + mu + nulasimu = (nula * (nula - 1)) / 2 + simu + K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(nulasimu) enddo enddo do la = nu + 1, nBas - do si = la + 1, nBas - munulasi = Yoshimine_4ind(nu, la, si, mu) - K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(munulasi) + lanu = (la * (la - 1)) / 2 + nu + do si = 1, mu + musi = (mu * (mu - 1)) / 2 + si + lanumusi = (lanu * (lanu - 1)) / 2 + musi + K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(lanumusi) + enddo + do si = mu + 1, la - 1 + simu = (si * (si - 1)) / 2 + mu + lanusimu = (lanu * (lanu - 1)) / 2 + simu + K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(lanusimu) enddo enddo - !do la = 1, nBas - ! do si = la + 1, nBas - ! munulasi = Yoshimine_4ind(nu, la, mu, si) - ! K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(munulasi) - ! enddo - !enddo +!TODO +! do la = 1, mu +! nula = (nu * (nu - 1)) / 2 + la +! do si = la + 1, mu +! musi = (mu * (mu - 1)) / 2 + si +! nulamusi = (nula * (nula - 1)) / 2 + musi +! !nulamusi = Yoshimine_4ind(nu, la, si, mu) +! K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(nulamusi) +! enddo +! do si = mu + 1, nBas +! simu = (si * (si - 1)) / 2 + mu +! nulasimu = (nula * (nula - 1)) / 2 + simu +! !nulasimu = Yoshimine_4ind(nu, la, si, mu) +! K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(nulasimu) +! enddo +! enddo +! do la = mu + 1, nu +! nula = (nu * (nu - 1)) / 2 + la +! do si = la + 1, nu +! simu = (si * (si - 1)) / 2 + mu +! nulasimu = (nula * (nula - 1)) / 2 + simu +! !nulasimu = Yoshimine_4ind(nu, la, si, mu) +! K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(nulasimu) +! enddo +! do si = nu + 1, nBas +! simu = (si * (si - 1)) / 2 + mu +! munulasi = Yoshimine_4ind(nu, la, si, mu) +! K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(munulasi) +! enddo +! enddo +! do la = nu + 1, nBas +! lanu = (la * (la - 1)) / 2 + nu +! do si = la + 1, mu +! simu = (si * (si - 1)) / 2 + mu +! munulasi = Yoshimine_4ind(nu, la, si, mu) +! K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(munulasi) +! enddo +! do si = mu + 1, nBas +! musi = (mu * (mu - 1)) / 2 + si +! munulasi = Yoshimine_4ind(nu, la, si, mu) +! K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(munulasi) +! enddo +! enddo + enddo ! mu enddo ! nu diff --git a/src/utils/utils.f90 b/src/utils/utils.f90 index ad528c9..4e89faf 100644 --- a/src/utils/utils.f90 +++ b/src/utils/utils.f90 @@ -914,11 +914,9 @@ integer*8 function Yoshimine_4ind(a, b, c, d) implicit none integer, intent(in) :: a, b, c, d integer*8, external :: Yoshimine_2ind - integer*8 :: ab, cd - ab = Yoshimine_2ind(a, b) - cd = Yoshimine_2ind(c, d) - Yoshimine_4ind = Yoshimine_2ind(ab, cd) + Yoshimine_4ind = Yoshimine_2ind(Yoshimine_2ind(a, b), & + Yoshimine_2ind(c, d)) return end @@ -928,7 +926,7 @@ end integer*8 function Yoshimine_2ind(a, b) implicit none - integer, intent(in) :: a, b + integer*8, intent(in) :: a, b if(a > b) then Yoshimine_2ind = (a * (a - 1)) / 2 + b From 671c6f4f1232bd65079fb47102fbaa8c0fea2a8c Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Tue, 10 Dec 2024 15:37:15 +0100 Subject: [PATCH 33/39] int4->int8 in Yoshimine fct --- src/utils/utils.f90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils/utils.f90 b/src/utils/utils.f90 index 4e89faf..c6eaac0 100644 --- a/src/utils/utils.f90 +++ b/src/utils/utils.f90 @@ -912,7 +912,7 @@ end integer*8 function Yoshimine_4ind(a, b, c, d) implicit none - integer, intent(in) :: a, b, c, d + integer*8, intent(in) :: a, b, c, d integer*8, external :: Yoshimine_2ind Yoshimine_4ind = Yoshimine_2ind(Yoshimine_2ind(a, b), & From 37420d12078abdf512530d330a1d70b094c6b15f Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Tue, 10 Dec 2024 19:14:33 +0100 Subject: [PATCH 34/39] added Hartree and EX terms with s8-sym --- PyDuck.py | 1 - src/AOtoMO/exchange_matrix_AO_basis.f90 | 165 ++++++++++++------------ src/HF/RHF_hpc.f90 | 35 ++++- src/utils/utils.f90 | 6 +- 4 files changed, 115 insertions(+), 92 deletions(-) diff --git a/PyDuck.py b/PyDuck.py index 45113de..4779566 100644 --- a/PyDuck.py +++ b/PyDuck.py @@ -163,7 +163,6 @@ if print_2e: output_file_path = working_dir + '/int/ERI_chem.bin' subprocess.call(['rm', '-f', output_file_path]) eri_ao = mol.intor('int2e', aosym='s8') - print(eri_ao.shape) f = open(output_file_path, 'w') eri_ao.tofile(output_file_path) f.close() diff --git a/src/AOtoMO/exchange_matrix_AO_basis.f90 b/src/AOtoMO/exchange_matrix_AO_basis.f90 index 47ffbe6..4781436 100644 --- a/src/AOtoMO/exchange_matrix_AO_basis.f90 +++ b/src/AOtoMO/exchange_matrix_AO_basis.f90 @@ -50,49 +50,48 @@ subroutine exchange_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, K) integer*8 :: nunununu, nulanula, lanulanu, nulanusi integer*8 :: munulasi, lanunusi, lanusinu, numumumu integer*8 :: nulamula, nulalamu, lanulamu, nulamusi - integer*8 :: nulasimu, lanumusi, lanusimu + integer*8 :: nulasimu, lanumusi, lanusimu, simunula + integer*8 :: simulanu - integer*8, external :: Yoshimine_4ind - do nu = 1, nBas - nunu = (nu * (nu - 1)) / 2 + nu - nunununu = (nunu * (nunu - 1)) / 2 + nunu + nunu = shiftr(nu * (nu - 1), 1) + nu + nunununu = shiftr(nunu * (nunu - 1), 1) + nunu K(nu,nu) = -P(nu,nu) * ERI_chem(nunununu) do la = 1, nu - 1 - nula = (nu * (nu - 1)) / 2 + la - nulanula = (nula * (nula - 1)) / 2 + nula + nula = shiftr(nu * (nu - 1), 1) + la + nulanula = shiftr(nula * (nula - 1), 1) + nula K(nu,nu) = K(nu,nu) - P(la,la) * ERI_chem(nulanula) enddo do la = nu + 1, nBas - lanu = (la * (la - 1)) / 2 + nu - lanulanu = (lanu * (lanu - 1)) / 2 + lanu + lanu = shiftr(la * (la - 1), 1) + nu + lanulanu = shiftr(lanu * (lanu - 1), 1) + lanu K(nu,nu) = K(nu,nu) - P(la,la) * ERI_chem(lanulanu) enddo do la = 1, nu - nula = (nu * (nu - 1)) / 2 + la + nula = shiftr(nu * (nu - 1), 1) + la do si = 1, la - 1 - nusi = (nu * (nu - 1)) / 2 + si - nulanusi = (nula * (nula - 1)) / 2 + nusi + nusi = shiftr(nu * (nu - 1), 1) + si + nulanusi = shiftr(nula * (nula - 1), 1) + nusi K(nu,nu) = K(nu,nu) - 2.d0 * P(si,la) * ERI_chem(nulanusi) enddo enddo do la = nu + 1, nBas - lanu = (la * (la - 1)) / 2 + nu + lanu = shiftr(la * (la - 1), 1) + nu do si = 1, nu - nusi = (nu * (nu - 1)) / 2 + si - lanunusi = (lanu * (lanu - 1)) / 2 + nusi + nusi = shiftr(nu * (nu - 1), 1) + si + lanunusi = shiftr(lanu * (lanu - 1), 1) + nusi K(nu,nu) = K(nu,nu) - 2.d0 * P(si,la) * ERI_chem(lanunusi) enddo do si = nu + 1, la - 1 - sinu = (si * (si - 1)) / 2 + nu - lanusinu = (lanu * (lanu - 1)) / 2 + sinu + sinu = shiftr(si * (si - 1), 1) + nu + lanusinu = shiftr(lanu * (lanu - 1), 1) + sinu K(nu,nu) = K(nu,nu) - 2.d0 * P(si,la) * ERI_chem(lanusinu) enddo enddo @@ -100,108 +99,104 @@ subroutine exchange_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, K) do mu = 1, nu - 1 - numu = (nu * (nu - 1)) / 2 + mu - mumu = (mu * (mu - 1)) / 2 + mu - numumumu = (numu * (numu - 1)) / 2 + mumu + numu = shiftr(nu * (nu - 1), 1) + mu + mumu = shiftr(mu * (mu - 1), 1) + mu + numumumu = shiftr(numu * (numu - 1), 1) + mumu K(mu,nu) = - P(mu,mu) * ERI_chem(numumumu) do la = 1, mu - 1 - mula = (mu * (mu - 1)) / 2 + la - nula = (nu * (nu - 1)) / 2 + la - nulamula = (nula * (nula - 1)) / 2 + mula + mula = shiftr(mu * (mu - 1), 1) + la + nula = shiftr(nu * (nu - 1), 1) + la + nulamula = shiftr(nula * (nula - 1), 1) + mula K(mu,nu) = K(mu,nu) - P(la,la) * ERI_chem(nulamula) enddo do la = mu + 1, nu - lamu = (la * (la - 1)) / 2 + mu - nula = (nu * (nu - 1)) / 2 + la - nulalamu = (nula * (nula - 1)) / 2 + lamu + lamu = shiftr(la * (la - 1), 1) + mu + nula = shiftr(nu * (nu - 1), 1) + la + nulalamu = shiftr(nula * (nula - 1), 1) + lamu K(mu,nu) = K(mu,nu) - P(la,la) * ERI_chem(nulalamu) enddo do la = nu + 1, nBas - lamu = (la * (la - 1)) / 2 + mu - lanu = (la * (la - 1)) / 2 + nu - lanulamu = (lanu * (lanu - 1)) / 2 + lamu + lamu = shiftr(la * (la - 1), 1) + mu + lanu = shiftr(la * (la - 1), 1) + nu + lanulamu = shiftr(lanu * (lanu - 1), 1) + lamu K(mu,nu) = K(mu,nu) - P(la,la) * ERI_chem(lanulamu) enddo do la = 1, mu - nula = (nu * (nu - 1)) / 2 + la + nula = shiftr(nu * (nu - 1), 1) + la do si = 1, la - 1 - musi = (mu * (mu - 1)) / 2 + si - nulamusi = (nula * (nula - 1)) / 2 + musi + musi = shiftr(mu * (mu - 1), 1) + si + nulamusi = shiftr(nula * (nula - 1), 1) + musi K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(nulamusi) enddo enddo do la = mu + 1, nu - nula = (nu * (nu - 1)) / 2 + la + nula = shiftr(nu * (nu - 1), 1) + la do si = 1, mu - musi = (mu * (mu - 1)) / 2 + si - nulamusi = (nula * (nula - 1)) / 2 + musi + musi = shiftr(mu * (mu - 1), 1) + si + nulamusi = shiftr(nula * (nula - 1), 1) + musi K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(nulamusi) enddo do si = mu + 1, la - 1 - simu = (si * (si - 1)) / 2 + mu - nulasimu = (nula * (nula - 1)) / 2 + simu + simu = shiftr(si * (si - 1), 1) + mu + nulasimu = shiftr(nula * (nula - 1), 1) + simu K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(nulasimu) enddo enddo do la = nu + 1, nBas - lanu = (la * (la - 1)) / 2 + nu + lanu = shiftr(la * (la - 1), 1) + nu do si = 1, mu - musi = (mu * (mu - 1)) / 2 + si - lanumusi = (lanu * (lanu - 1)) / 2 + musi + musi = shiftr(mu * (mu - 1), 1) + si + lanumusi = shiftr(lanu * (lanu - 1), 1) + musi K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(lanumusi) enddo do si = mu + 1, la - 1 - simu = (si * (si - 1)) / 2 + mu - lanusimu = (lanu * (lanu - 1)) / 2 + simu + simu = shiftr(si * (si - 1), 1) + mu + lanusimu = shiftr(lanu * (lanu - 1), 1) + simu K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(lanusimu) enddo enddo -!TODO -! do la = 1, mu -! nula = (nu * (nu - 1)) / 2 + la -! do si = la + 1, mu -! musi = (mu * (mu - 1)) / 2 + si -! nulamusi = (nula * (nula - 1)) / 2 + musi -! !nulamusi = Yoshimine_4ind(nu, la, si, mu) -! K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(nulamusi) -! enddo -! do si = mu + 1, nBas -! simu = (si * (si - 1)) / 2 + mu -! nulasimu = (nula * (nula - 1)) / 2 + simu -! !nulasimu = Yoshimine_4ind(nu, la, si, mu) -! K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(nulasimu) -! enddo -! enddo -! do la = mu + 1, nu -! nula = (nu * (nu - 1)) / 2 + la -! do si = la + 1, nu -! simu = (si * (si - 1)) / 2 + mu -! nulasimu = (nula * (nula - 1)) / 2 + simu -! !nulasimu = Yoshimine_4ind(nu, la, si, mu) -! K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(nulasimu) -! enddo -! do si = nu + 1, nBas -! simu = (si * (si - 1)) / 2 + mu -! munulasi = Yoshimine_4ind(nu, la, si, mu) -! K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(munulasi) -! enddo -! enddo -! do la = nu + 1, nBas -! lanu = (la * (la - 1)) / 2 + nu -! do si = la + 1, mu -! simu = (si * (si - 1)) / 2 + mu -! munulasi = Yoshimine_4ind(nu, la, si, mu) -! K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(munulasi) -! enddo -! do si = mu + 1, nBas -! musi = (mu * (mu - 1)) / 2 + si -! munulasi = Yoshimine_4ind(nu, la, si, mu) -! K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(munulasi) -! enddo -! enddo + do la = 1, mu + nula = shiftr(nu * (nu - 1), 1) + la + do si = la + 1, mu + musi = shiftr(mu * (mu - 1) , 1) + si + nulamusi = shiftr(nula * (nula - 1), 1) + musi + K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(nulamusi) + enddo + do si = mu + 1, nu - 1 + simu = shiftr(si * (si - 1), 1) + mu + nulasimu = shiftr(nula * (nula - 1), 1) + simu + K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(nulasimu) + enddo + do si = nu, nBas + simu = shiftr(si * (si - 1), 1) + mu + simunula = shiftr(simu * (simu - 1), 1) + nula + K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(simunula) + enddo + enddo + do la = mu + 1, nu + nula = shiftr(nu * (nu - 1), 1) + la + do si = la + 1, nu + simu = shiftr(si * (si - 1), 1) + mu + nulasimu = shiftr(nula * (nula - 1), 1) + simu + K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(nulasimu) + enddo + do si = nu + 1, nBas + simu = shiftr(si * (si - 1), 1) + mu + simunula = shiftr(simu * (simu - 1), 1) + nula + K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(simunula) + enddo + enddo + do la = nu + 1, nBas + lanu = shiftr(la * (la - 1), 1) + nu + do si = la + 1, nBas + simu = shiftr(si * (si - 1), 1) + mu + simulanu = shiftr(simu * (simu - 1), 1) + lanu + K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(simulanu) + enddo + enddo enddo ! mu enddo ! nu diff --git a/src/HF/RHF_hpc.f90 b/src/HF/RHF_hpc.f90 index d9bf77d..44095d3 100644 --- a/src/HF/RHF_hpc.f90 +++ b/src/HF/RHF_hpc.f90 @@ -39,6 +39,7 @@ subroutine RHF_hpc(working_dir,dotest,maxSCF,thresh,max_diis,guess_type,level_sh integer :: nBas_Sq integer :: n_diis integer*8 :: ERI_size + double precision :: t1, t2 double precision :: diff, diff_loc double precision :: ET double precision :: EV @@ -104,14 +105,42 @@ subroutine RHF_hpc(working_dir,dotest,maxSCF,thresh,max_diis,guess_type,level_sh ERI_size = (ERI_size * (ERI_size + 1)) / 2 allocate(ERI_chem(ERI_size)) call read_2e_integrals_hpc(working_dir, ERI_size, ERI_chem) - call Hartree_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, J) + + call wall_time(t1) + do ii = 1, 5 + call Hartree_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, J) + enddo + call wall_time(t2) + print*, " J built in (sec):", (t2-t1) / 5.d0 + + call wall_time(t1) + do ii = 1, 5 + call exchange_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, K) + enddo + call wall_time(t2) + print*, " K built in (sec):", (t2-t1) / 5.d0 + allocate(ERI_phys(nBas,nBas,nBas,nBas)) allocate(J_deb(nBas,nBas)) allocate(K_deb(nBas,nBas)) call read_2e_integrals(working_dir, nBas, ERI_phys) - call Hartree_matrix_AO_basis(nBas, P, ERI_phys, J_deb) + + call wall_time(t1) + do ii = 1, 5 + call Hartree_matrix_AO_basis(nBas, P, ERI_phys, J_deb) + enddo + call wall_time(t2) + print*, " J_deb built in (sec):", (t2-t1) / 5.d0 + + call wall_time(t1) + do ii = 1, 5 + call exchange_matrix_AO_basis(nBas, P, ERI_phys, K_deb) + enddo + call wall_time(t2) + print*, " K_deb built in (sec):", (t2-t1) / 5.d0 + print*, "max error on J = ", maxval(dabs(J - J_deb)) diff = 0.d0 do ii = 1, nBas @@ -127,8 +156,6 @@ subroutine RHF_hpc(working_dir,dotest,maxSCF,thresh,max_diis,guess_type,level_sh enddo print*, 'total diff on J = ', diff - call exchange_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, K) - call exchange_matrix_AO_basis(nBas, P, ERI_phys, K_deb) print*, "max error on K = ", maxval(dabs(K - K_deb)) diff = 0.d0 do ii = 1, nBas diff --git a/src/utils/utils.f90 b/src/utils/utils.f90 index c6eaac0..cc7998a 100644 --- a/src/utils/utils.f90 +++ b/src/utils/utils.f90 @@ -929,9 +929,11 @@ integer*8 function Yoshimine_2ind(a, b) integer*8, intent(in) :: a, b if(a > b) then - Yoshimine_2ind = (a * (a - 1)) / 2 + b + !Yoshimine_2ind = (a * (a - 1)) / 2 + b + Yoshimine_2ind = shiftr(a * (a - 1), 1) + b else - Yoshimine_2ind = (b * (b - 1)) / 2 + a + !Yoshimine_2ind = (b * (b - 1)) / 2 + a + Yoshimine_2ind = shiftr(b * (b - 1), 1) + a endif return From 956cebf6647ef8d2648839361048fc8803475eac Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Wed, 11 Dec 2024 11:38:55 +0100 Subject: [PATCH 35/39] working on optim Fock matrix --- src/AOtoMO/Hartree_matrix_AO_basis.f90 | 69 +++++++++++++++----------- 1 file changed, 40 insertions(+), 29 deletions(-) diff --git a/src/AOtoMO/Hartree_matrix_AO_basis.f90 b/src/AOtoMO/Hartree_matrix_AO_basis.f90 index 0d0810d..4d12a5b 100644 --- a/src/AOtoMO/Hartree_matrix_AO_basis.f90 +++ b/src/AOtoMO/Hartree_matrix_AO_basis.f90 @@ -47,92 +47,103 @@ subroutine Hartree_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, H) integer :: mu, nu, la, si integer :: nunu, lala, nula, lasi, numu + integer :: nunu0, lala0 integer*8 :: nunununu, nunulala, nununula, nunulasi integer*8 :: lalanunu, lasinunu, numulala, lalanumu integer*8 :: numunula, numulasi, lasinumu, nununumu - integer*8 :: munu0, munu - integer*8 :: sila0, sila - integer*8 :: munulasi0, munulasi + integer*8 :: nunununu0, numunumu0 do nu = 1, nBas - nunu = (nu * (nu - 1)) / 2 + nu - nunununu = (nunu * (nunu - 1)) / 2 + nunu + nunu0 = shiftr(nu * (nu - 1), 1) + nunu = nunu0 + nu + nunununu0 = shiftr(nunu * (nunu - 1), 1) + + nunununu = nunununu0 + nunu H(nu,nu) = P(nu,nu) * ERI_chem(nunununu) do la = 1, nu-1 - lala = (la * (la - 1)) / 2 + la - nunulala = (nunu * (nunu - 1)) / 2 + lala + lala0 = shiftr(la * (la - 1), 1) + + lala = lala0 + la + nunulala = nunununu0 + lala H(nu,nu) = H(nu,nu) + P(la,la) * ERI_chem(nunulala) - nula = (nu * (nu - 1)) / 2 + la - nununula = (nunu * (nunu - 1)) / 2 + nula + nula = nunu0 + la + nununula = nunununu0 + nula H(nu,nu) = H(nu,nu) + 2.d0 * P(la,nu) * ERI_chem(nununula) do si = 1, la - 1 - lasi = (la * (la - 1)) / 2 + si - nunulasi = (nunu * (nunu - 1)) / 2 + lasi + lasi = lala0 + si + nunulasi = nunununu0 + lasi H(nu,nu) = H(nu,nu) + 2.d0 * P(si,la) * ERI_chem(nunulasi) enddo enddo do la = nu + 1, nBas - lala = (la * (la - 1)) / 2 + la - lalanunu = (lala * (lala - 1)) / 2 + nunu + lala0 = shiftr(la * (la - 1), 1) + + lala = lala0 + la + lalanunu = shiftr(lala * (lala - 1), 1) + nunu H(nu,nu) = H(nu,nu) + P(la,la) * ERI_chem(lalanunu) do si = 1, la - 1 - lasi = (la * (la - 1)) / 2 + si - lasinunu = (lasi * (lasi - 1)) / 2 + nunu + lasi = lala0 + si + lasinunu = shiftr(lasi * (lasi - 1), 1) + nunu H(nu,nu) = H(nu,nu) + 2.d0 * P(si,la) * ERI_chem(lasinunu) enddo enddo do mu = 1, nu - 1 - numu = (nu * (nu - 1)) / 2 + mu - nununumu = (nunu * (nunu - 1)) / 2 + numu + numu = nunu0 + mu + + numunumu0 = shiftr(numu * (numu - 1), 1) + + nununumu = nunununu0 + numu H(mu,nu) = p(nu,nu) * ERI_chem(nununumu) do la = 1, nu - 1 - lala = (la * (la - 1)) / 2 + la - numulala = (numu * (numu - 1)) / 2 + lala + lala = shiftr(la * (la - 1), 1) + la + numulala = numunumu0 + lala H(mu,nu) = H(mu,nu) + p(la,la) * ERI_chem(numulala) enddo do la = nu + 1, nBas - lala = (la * (la - 1)) / 2 + la - lalanumu = (lala * (lala - 1)) / 2 + numu + lala = shiftr(la * (la - 1), 1) + la + lalanumu = shiftr(lala * (lala - 1), 1) + numu H(mu,nu) = H(mu,nu) + p(la,la) * ERI_chem(lalanumu) enddo do la = 1, mu - nula = (nu * (nu - 1)) / 2 + la - numunula = (numu * (numu - 1)) / 2 + nula + nula = nunu0 + la + numunula = numunumu0 + nula H(mu,nu) = H(mu,nu) + 2.d0 * P(la,nu) * ERI_chem(numunula) enddo do la = mu + 1, nu - 1 - nula = (nu * (nu - 1)) / 2 + la - numunula = (nula * (nula - 1)) / 2 + numu + nula = nunu0 + la + numunula = shiftr(nula * (nula - 1), 1) + numu H(mu,nu) = H(mu,nu) + 2.d0 * P(la,nu) * ERI_chem(numunula) enddo do la = 2, nu - 1 + lala0 = shiftr(la * (la - 1), 1) do si = 1, la - 1 - lasi = (la * (la - 1)) / 2 + si - numulasi = (numu * (numu - 1)) / 2 + lasi + lasi = lala0 + si + numulasi = numunumu0 + lasi H(mu,nu) = H(mu,nu) + 2.d0 * P(si,la) * ERI_chem(numulasi) enddo enddo do la = nu + 1, nBas + lala0 = shiftr(la * (la - 1), 1) do si = 1, la - 1 - lasi = (la * (la - 1)) / 2 + si - lasinumu = (lasi * (lasi - 1)) / 2 + numu + lasi = lala0 + si + lasinumu = shiftr(lasi * (lasi - 1), 1) + numu H(mu,nu) = H(mu,nu) + 2.d0 * P(si,la) * ERI_chem(lasinumu) enddo enddo From d28c0339ff9190164631d580825d65ac2bb164e7 Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Thu, 12 Dec 2024 18:13:47 +0100 Subject: [PATCH 36/39] // H calc --- src/AOtoMO/Hartree_matrix_AO_basis.f90 | 31 ++++++- src/AOtoMO/exchange_matrix_AO_basis.f90 | 103 ++++++++++++++---------- src/HF/RHF_hpc.f90 | 28 +++---- 3 files changed, 101 insertions(+), 61 deletions(-) diff --git a/src/AOtoMO/Hartree_matrix_AO_basis.f90 b/src/AOtoMO/Hartree_matrix_AO_basis.f90 index 4d12a5b..010e5bb 100644 --- a/src/AOtoMO/Hartree_matrix_AO_basis.f90 +++ b/src/AOtoMO/Hartree_matrix_AO_basis.f90 @@ -53,7 +53,32 @@ subroutine Hartree_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, H) integer*8 :: numunula, numulasi, lasinumu, nununumu integer*8 :: nunununu0, numunumu0 +! integer*8 :: munusila +! integer*8, external :: Yoshimine_4ind +! +! do nu = 1, nBas +! do mu = 1, nu +! H(mu,nu) = 0.d0 +! do la = 1, nBas +! do si = 1, nBas +! munusila = Yoshimine_4ind(int(mu, kind=8), & +! int(nu, kind=8), & +! int(si, kind=8), & +! int(la, kind=8)) +! H(mu,nu) = H(mu,nu) + P(si,la) * ERI_chem(munusila) +! enddo +! enddo +! enddo +! enddo + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP PRIVATE (nu, la, si, mu, & + !$OMP nunu0, nunu, nula, lala0, lala, lasi, numu, & + !$OMP nunununu0, nunununu, nununula, numulala, numunula, & + !$OMP nunulala, lalanunu, lalanumu, nunulasi, lasinunu, & + !$OMP numunumu0, nununumu, numulasi, lasinumu) & + !$OMP SHARED (nBas, H, P, ERI_chem) + !$OMP DO do nu = 1, nBas nunu0 = shiftr(nu * (nu - 1), 1) @@ -63,7 +88,7 @@ subroutine Hartree_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, H) nunununu = nunununu0 + nunu H(nu,nu) = P(nu,nu) * ERI_chem(nunununu) - do la = 1, nu-1 + do la = 1, nu - 1 lala0 = shiftr(la * (la - 1), 1) @@ -150,7 +175,8 @@ subroutine Hartree_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, H) enddo ! mu enddo ! nu - + !$OMP END DO + !$OMP END PARALLEL do nu = 1, nBas do mu = nu+1, nBas @@ -163,3 +189,4 @@ end subroutine ! --- + diff --git a/src/AOtoMO/exchange_matrix_AO_basis.f90 b/src/AOtoMO/exchange_matrix_AO_basis.f90 index 4781436..581e8db 100644 --- a/src/AOtoMO/exchange_matrix_AO_basis.f90 +++ b/src/AOtoMO/exchange_matrix_AO_basis.f90 @@ -47,22 +47,46 @@ subroutine exchange_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, K) integer :: mu, nu, la, si integer :: nunu, nula, lanu, lasi, nusi, sinu integer :: numu, mumu, mula, lamu, musi, simu + integer :: nunu0, lala0, mumu0 integer*8 :: nunununu, nulanula, lanulanu, nulanusi integer*8 :: munulasi, lanunusi, lanusinu, numumumu integer*8 :: nulamula, nulalamu, lanulamu, nulamusi integer*8 :: nulasimu, lanumusi, lanusimu, simunula - integer*8 :: simulanu + integer*8 :: simulanu, nulanula0, lanulanu0 + +! integer*8 :: munusila +! integer*8, external :: Yoshimine_4ind +! +! do nu = 1, nBas +! do mu = 1, nu +! K(mu,nu) = 0.d0 +! do la = 1, nBas +! do si = 1, nBas +! munusila = Yoshimine_4ind(int(mu, kind=8), & +! int(si, kind=8), & +! int(la, kind=8), & +! int(nu, kind=8)) +! K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(munusila) +! enddo +! enddo +! enddo +! enddo +! !$OMP PARALLEL SHARED (NONE) +! !$OMP PRIVATE () +! !$OMP SHARED () +! !$OMP DO do nu = 1, nBas - nunu = shiftr(nu * (nu - 1), 1) + nu + nunu0 = shiftr(nu * (nu - 1), 1) + nunu = nunu0 + nu nunununu = shiftr(nunu * (nunu - 1), 1) + nunu K(nu,nu) = -P(nu,nu) * ERI_chem(nunununu) do la = 1, nu - 1 - nula = shiftr(nu * (nu - 1), 1) + la + nula = nunu0 + la nulanula = shiftr(nula * (nula - 1), 1) + nula K(nu,nu) = K(nu,nu) - P(la,la) * ERI_chem(nulanula) enddo @@ -74,24 +98,23 @@ subroutine exchange_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, K) enddo do la = 1, nu - nula = shiftr(nu * (nu - 1), 1) + la + nula = nunu0 + la + nulanula0 = shiftr(nula * (nula - 1), 1) do si = 1, la - 1 - nusi = shiftr(nu * (nu - 1), 1) + si - nulanusi = shiftr(nula * (nula - 1), 1) + nusi + nulanusi = nulanula0 + nunu0 + si K(nu,nu) = K(nu,nu) - 2.d0 * P(si,la) * ERI_chem(nulanusi) enddo enddo do la = nu + 1, nBas lanu = shiftr(la * (la - 1), 1) + nu + lanulanu0 = shiftr(lanu * (lanu - 1), 1) do si = 1, nu - nusi = shiftr(nu * (nu - 1), 1) + si - lanunusi = shiftr(lanu * (lanu - 1), 1) + nusi + lanunusi = lanulanu0 + nunu0 + si K(nu,nu) = K(nu,nu) - 2.d0 * P(si,la) * ERI_chem(lanunusi) enddo do si = nu + 1, la - 1 - sinu = shiftr(si * (si - 1), 1) + nu - lanusinu = shiftr(lanu * (lanu - 1), 1) + sinu + lanusinu = lanulanu0 + shiftr(si * (si - 1), 1) + nu K(nu,nu) = K(nu,nu) - 2.d0 * P(si,la) * ERI_chem(lanusinu) enddo enddo @@ -99,75 +122,71 @@ subroutine exchange_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, K) do mu = 1, nu - 1 - numu = shiftr(nu * (nu - 1), 1) + mu - mumu = shiftr(mu * (mu - 1), 1) + mu + numu = nunu0 + mu + mumu0 = shiftr(mu * (mu - 1), 1) + mumu = mumu0 + mu numumumu = shiftr(numu * (numu - 1), 1) + mumu K(mu,nu) = - P(mu,mu) * ERI_chem(numumumu) do la = 1, mu - 1 - mula = shiftr(mu * (mu - 1), 1) + la - nula = shiftr(nu * (nu - 1), 1) + la - nulamula = shiftr(nula * (nula - 1), 1) + mula + nula = nunu0 + la + nulamula = shiftr(nula * (nula - 1), 1) + mumu0 + la K(mu,nu) = K(mu,nu) - P(la,la) * ERI_chem(nulamula) enddo do la = mu + 1, nu - lamu = shiftr(la * (la - 1), 1) + mu - nula = shiftr(nu * (nu - 1), 1) + la - nulalamu = shiftr(nula * (nula - 1), 1) + lamu + nula = nunu0 + la + nulalamu = shiftr(nula * (nula - 1), 1) + shiftr(la * (la - 1), 1) + mu K(mu,nu) = K(mu,nu) - P(la,la) * ERI_chem(nulalamu) enddo do la = nu + 1, nBas - lamu = shiftr(la * (la - 1), 1) + mu - lanu = shiftr(la * (la - 1), 1) + nu - lanulamu = shiftr(lanu * (lanu - 1), 1) + lamu + lala0 = shiftr(la * (la - 1), 1) + lanu = lala0 + nu + lanulamu = shiftr(lanu * (lanu - 1), 1) + lala0 + mu K(mu,nu) = K(mu,nu) - P(la,la) * ERI_chem(lanulamu) enddo do la = 1, mu - nula = shiftr(nu * (nu - 1), 1) + la + nula = nunu0 + la + nulanula0 = shiftr(nula * (nula - 1), 1) do si = 1, la - 1 - musi = shiftr(mu * (mu - 1), 1) + si - nulamusi = shiftr(nula * (nula - 1), 1) + musi + nulamusi = nulanula0 + mumu0 + si K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(nulamusi) enddo enddo do la = mu + 1, nu - nula = shiftr(nu * (nu - 1), 1) + la + nula = nunu0 + la + nulanula0 = shiftr(nula * (nula - 1), 1) do si = 1, mu - musi = shiftr(mu * (mu - 1), 1) + si - nulamusi = shiftr(nula * (nula - 1), 1) + musi + nulamusi = nulanula0 + mumu0 + si K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(nulamusi) enddo do si = mu + 1, la - 1 - simu = shiftr(si * (si - 1), 1) + mu - nulasimu = shiftr(nula * (nula - 1), 1) + simu + nulasimu = nulanula0 + shiftr(si * (si - 1), 1) + mu K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(nulasimu) enddo enddo do la = nu + 1, nBas lanu = shiftr(la * (la - 1), 1) + nu + lanulanu0 = shiftr(lanu * (lanu - 1), 1) do si = 1, mu - musi = shiftr(mu * (mu - 1), 1) + si - lanumusi = shiftr(lanu * (lanu - 1), 1) + musi + lanumusi = lanulanu0 + mumu0 + si K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(lanumusi) enddo do si = mu + 1, la - 1 - simu = shiftr(si * (si - 1), 1) + mu - lanusimu = shiftr(lanu * (lanu - 1), 1) + simu + lanusimu = lanulanu0 + shiftr(si * (si - 1), 1) + mu K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(lanusimu) enddo enddo do la = 1, mu - nula = shiftr(nu * (nu - 1), 1) + la + nula = nunu0 + la + nulanula0 = shiftr(nula * (nula - 1), 1) do si = la + 1, mu - musi = shiftr(mu * (mu - 1) , 1) + si - nulamusi = shiftr(nula * (nula - 1), 1) + musi + nulamusi = nulanula0 + mumu0 + si K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(nulamusi) enddo do si = mu + 1, nu - 1 - simu = shiftr(si * (si - 1), 1) + mu - nulasimu = shiftr(nula * (nula - 1), 1) + simu + nulasimu = nulanula0 + shiftr(si * (si - 1), 1) + mu K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(nulasimu) enddo do si = nu, nBas @@ -177,10 +196,10 @@ subroutine exchange_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, K) enddo enddo do la = mu + 1, nu - nula = shiftr(nu * (nu - 1), 1) + la + nula = nunu0 + la + nulanula0 = shiftr(nula * (nula - 1), 1) do si = la + 1, nu - simu = shiftr(si * (si - 1), 1) + mu - nulasimu = shiftr(nula * (nula - 1), 1) + simu + nulasimu = nulanula0 + shiftr(si * (si - 1), 1) + mu K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(nulasimu) enddo do si = nu + 1, nBas @@ -200,6 +219,8 @@ subroutine exchange_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, K) enddo ! mu enddo ! nu +! !$OMP END DO +! !$OMP END PARALLEL do nu = 1, nBas diff --git a/src/HF/RHF_hpc.f90 b/src/HF/RHF_hpc.f90 index 44095d3..740bc94 100644 --- a/src/HF/RHF_hpc.f90 +++ b/src/HF/RHF_hpc.f90 @@ -107,18 +107,14 @@ subroutine RHF_hpc(working_dir,dotest,maxSCF,thresh,max_diis,guess_type,level_sh call read_2e_integrals_hpc(working_dir, ERI_size, ERI_chem) call wall_time(t1) - do ii = 1, 5 - call Hartree_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, J) - enddo + call Hartree_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, J) call wall_time(t2) - print*, " J built in (sec):", (t2-t1) / 5.d0 + print*, " J built in (sec):", (t2-t1) call wall_time(t1) - do ii = 1, 5 - call exchange_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, K) - enddo + call exchange_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, K) call wall_time(t2) - print*, " K built in (sec):", (t2-t1) / 5.d0 + print*, " K built in (sec):", (t2-t1) allocate(ERI_phys(nBas,nBas,nBas,nBas)) @@ -128,25 +124,21 @@ subroutine RHF_hpc(working_dir,dotest,maxSCF,thresh,max_diis,guess_type,level_sh call read_2e_integrals(working_dir, nBas, ERI_phys) call wall_time(t1) - do ii = 1, 5 - call Hartree_matrix_AO_basis(nBas, P, ERI_phys, J_deb) - enddo + call Hartree_matrix_AO_basis(nBas, P, ERI_phys, J_deb) call wall_time(t2) - print*, " J_deb built in (sec):", (t2-t1) / 5.d0 + print*, " J_deb built in (sec):", (t2-t1) call wall_time(t1) - do ii = 1, 5 - call exchange_matrix_AO_basis(nBas, P, ERI_phys, K_deb) - enddo + call exchange_matrix_AO_basis(nBas, P, ERI_phys, K_deb) call wall_time(t2) - print*, " K_deb built in (sec):", (t2-t1) / 5.d0 + print*, " K_deb built in (sec):", (t2-t1) print*, "max error on J = ", maxval(dabs(J - J_deb)) diff = 0.d0 do ii = 1, nBas do jj = 1, nBas diff_loc = dabs(J(jj,ii) - J_deb(jj,ii)) - if(diff_loc .gt. 1d-12) then + if(diff_loc .gt. 1d-10) then print*, 'error in J on: ', jj, ii print*, J(jj,ii), J_deb(jj,ii) stop @@ -161,7 +153,7 @@ subroutine RHF_hpc(working_dir,dotest,maxSCF,thresh,max_diis,guess_type,level_sh do ii = 1, nBas do jj = 1, nBas diff_loc = dabs(K(jj,ii) - K_deb(jj,ii)) - if(diff_loc .gt. 1d-12) then + if(diff_loc .gt. 1d-10) then print*, 'error in K on: ', jj, ii print*, K(jj,ii), K_deb(jj,ii) stop From 84a989b0a5ac05f27d979088784ee90144561af3 Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Thu, 12 Dec 2024 18:25:52 +0100 Subject: [PATCH 37/39] // K calc --- src/AOtoMO/exchange_matrix_AO_basis.f90 | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/AOtoMO/exchange_matrix_AO_basis.f90 b/src/AOtoMO/exchange_matrix_AO_basis.f90 index 581e8db..5a363a7 100644 --- a/src/AOtoMO/exchange_matrix_AO_basis.f90 +++ b/src/AOtoMO/exchange_matrix_AO_basis.f90 @@ -74,14 +74,20 @@ subroutine exchange_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, K) -! !$OMP PARALLEL SHARED (NONE) -! !$OMP PRIVATE () -! !$OMP SHARED () -! !$OMP DO + !$OMP PARALLEL DEFAULT (NONE) & + !$OMP PRIVATE (nu, si, la, mu, & + !$OMP nunu0, nunu, lanu, numu, mumu0, mumu, simu, lala0, nula, & + !$OMP nunununu, nulanula, lanulanu, lanulanu0, nulanula0, & + !$OMP nulanusi, lanulamu, lanunusi, lanusinu , numumumu, & + !$OMP nulamula, nulalamu, lanumusi, lanusimu, nulamusi, & + !$OMP nulasimu, simunula, simulanu) & + !$OMP SHARED (nBas, P, ERI_chem, K) + !$OMP DO do nu = 1, nBas nunu0 = shiftr(nu * (nu - 1), 1) nunu = nunu0 + nu + nunununu = shiftr(nunu * (nunu - 1), 1) + nunu K(nu,nu) = -P(nu,nu) * ERI_chem(nunununu) @@ -219,8 +225,8 @@ subroutine exchange_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, K) enddo ! mu enddo ! nu -! !$OMP END DO -! !$OMP END PARALLEL + !$OMP END DO + !$OMP END PARALLEL do nu = 1, nBas @@ -234,3 +240,6 @@ end subroutine ! --- + + + From a8b51ab800a1c2663c926f5b1025fd5f8ba6fb3b Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Thu, 12 Dec 2024 19:25:33 +0100 Subject: [PATCH 38/39] HF on HPC branch: OK --- src/HF/RHF_hpc.f90 | 169 ++++++++++++++++++++++++++------------------- 1 file changed, 98 insertions(+), 71 deletions(-) diff --git a/src/HF/RHF_hpc.f90 b/src/HF/RHF_hpc.f90 index 740bc94..a4d3325 100644 --- a/src/HF/RHF_hpc.f90 +++ b/src/HF/RHF_hpc.f90 @@ -58,6 +58,7 @@ subroutine RHF_hpc(working_dir,dotest,maxSCF,thresh,max_diis,guess_type,level_sh double precision,allocatable :: Fp(:,:) double precision,allocatable :: ERI_chem(:) double precision,allocatable :: ERI_phys(:,:,:,:), J_deb(:,:), K_deb(:,:) + double precision,allocatable :: tmp1(:,:), FX(:,:) ! Output variables @@ -93,6 +94,9 @@ subroutine RHF_hpc(working_dir,dotest,maxSCF,thresh,max_diis,guess_type,level_sh allocate(err_diis(nBas_Sq,max_diis)) allocate(F_diis(nBas_Sq,max_diis)) + allocate(tmp1(nBas,nBas)) + allocate(FX(nBas,nOrb)) + ! Guess coefficients and density matrix call mo_guess(nBas,nOrb,guess_type,S,Hc,X,c) @@ -100,70 +104,60 @@ subroutine RHF_hpc(working_dir,dotest,maxSCF,thresh,max_diis,guess_type,level_sh c(1,1), nBas, c(1,1), nBas, & 0.d0, P(1,1), nBas) - - ERI_size = (nBas * (nBas + 1)) / 2 - ERI_size = (ERI_size * (ERI_size + 1)) / 2 + ERI_size = shiftr(nBas * (nBas + 1), 1) + ERI_size = shiftr(ERI_size * (ERI_size + 1), 1) allocate(ERI_chem(ERI_size)) call read_2e_integrals_hpc(working_dir, ERI_size, ERI_chem) - call wall_time(t1) - call Hartree_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, J) - call wall_time(t2) - print*, " J built in (sec):", (t2-t1) - - call wall_time(t1) - call exchange_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, K) - call wall_time(t2) - print*, " K built in (sec):", (t2-t1) - - - allocate(ERI_phys(nBas,nBas,nBas,nBas)) - allocate(J_deb(nBas,nBas)) - allocate(K_deb(nBas,nBas)) - - call read_2e_integrals(working_dir, nBas, ERI_phys) - - call wall_time(t1) - call Hartree_matrix_AO_basis(nBas, P, ERI_phys, J_deb) - call wall_time(t2) - print*, " J_deb built in (sec):", (t2-t1) - - call wall_time(t1) - call exchange_matrix_AO_basis(nBas, P, ERI_phys, K_deb) - call wall_time(t2) - print*, " K_deb built in (sec):", (t2-t1) - - print*, "max error on J = ", maxval(dabs(J - J_deb)) - diff = 0.d0 - do ii = 1, nBas - do jj = 1, nBas - diff_loc = dabs(J(jj,ii) - J_deb(jj,ii)) - if(diff_loc .gt. 1d-10) then - print*, 'error in J on: ', jj, ii - print*, J(jj,ii), J_deb(jj,ii) - stop - endif - diff = diff + diff_loc - enddo - enddo - print*, 'total diff on J = ', diff - - print*, "max error on K = ", maxval(dabs(K - K_deb)) - diff = 0.d0 - do ii = 1, nBas - do jj = 1, nBas - diff_loc = dabs(K(jj,ii) - K_deb(jj,ii)) - if(diff_loc .gt. 1d-10) then - print*, 'error in K on: ', jj, ii - print*, K(jj,ii), K_deb(jj,ii) - stop - endif - diff = diff + diff_loc - enddo - enddo - print*, 'total diff on K = ', diff - - stop + !call wall_time(t1) + !call Hartree_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, J) + !call wall_time(t2) + !print*, " J built in (sec):", (t2-t1) + !call wall_time(t1) + !call exchange_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, K) + !call wall_time(t2) + !print*, " K built in (sec):", (t2-t1) + !allocate(ERI_phys(nBas,nBas,nBas,nBas)) + !allocate(J_deb(nBas,nBas)) + !allocate(K_deb(nBas,nBas)) + !call read_2e_integrals(working_dir, nBas, ERI_phys) + !call wall_time(t1) + !call Hartree_matrix_AO_basis(nBas, P, ERI_phys, J_deb) + !call wall_time(t2) + !print*, " J_deb built in (sec):", (t2-t1) + !call wall_time(t1) + !call exchange_matrix_AO_basis(nBas, P, ERI_phys, K_deb) + !call wall_time(t2) + !print*, " K_deb built in (sec):", (t2-t1) + !print*, "max error on J = ", maxval(dabs(J - J_deb)) + !diff = 0.d0 + !do ii = 1, nBas + ! do jj = 1, nBas + ! diff_loc = dabs(J(jj,ii) - J_deb(jj,ii)) + ! if(diff_loc .gt. 1d-10) then + ! print*, 'error in J on: ', jj, ii + ! print*, J(jj,ii), J_deb(jj,ii) + ! stop + ! endif + ! diff = diff + diff_loc + ! enddo + !enddo + !print*, 'total diff on J = ', diff + !print*, "max error on K = ", maxval(dabs(K - K_deb)) + !diff = 0.d0 + !do ii = 1, nBas + ! do jj = 1, nBas + ! diff_loc = dabs(K(jj,ii) - K_deb(jj,ii)) + ! if(diff_loc .gt. 1d-10) then + ! print*, 'error in K on: ', jj, ii + ! print*, K(jj,ii), K_deb(jj,ii) + ! stop + ! endif + ! diff = diff + diff_loc + ! enddo + !enddo + !print*, 'total diff on K = ', diff + !stop ! Initialization @@ -192,25 +186,46 @@ subroutine RHF_hpc(working_dir,dotest,maxSCF,thresh,max_diis,guess_type,level_sh nSCF = nSCF + 1 ! Build Fock matrix - call Hartree_matrix_AO_basis(nBas,P,ERI_phys,J) - call exchange_matrix_AO_basis(nBas,P,ERI_phys,K) + call Hartree_matrix_AO_basis_hpc (nBas, ERI_size, P(1,1), ERI_chem(1), J(1,1)) + call exchange_matrix_AO_basis_hpc(nBas, ERI_size, P(1,1), ERI_chem(1), K(1,1)) F(:,:) = Hc(:,:) + J(:,:) + 0.5d0*K(:,:) ! Check convergence - err = matmul(F, matmul(P, S)) - matmul(matmul(S, P), F) + call dgemm("N", "N", nBas, nBas, nBas, 1.d0, & + S(1,1), nBas, P(1,1), nBas, & + 0.d0, tmp1(1,1), nBas) + call dgemm("N", "N", nBas, nBas, nBas, 1.d0, & + tmp1(1,1), nBas, F(1,1), nBas, & + 0.d0, err(1,1), nBas) + call dgemm("N", "T", nBas, nBas, nBas, 1.d0, & + F(1,1), nBas, tmp1(1,1), nBas, & + -1.d0, err(1,1), nBas) + !err = matmul(F, matmul(P, S)) - matmul(matmul(S, P), F) if(nSCF > 1) Conv = maxval(abs(err)) ! Kinetic energy - ET = trace_matrix(nBas, matmul(P, T)) + call dgemm("N", "N", nBas, nBas, nBas, 1.d0, & + P(1,1), nBas, T(1,1), nBas, & + 0.d0, tmp1(1,1), nBas) + ET = trace_matrix(nBas, tmp1(1,1)) ! Potential energy - EV = trace_matrix(nBas, matmul(P, V)) + call dgemm("N", "N", nBas, nBas, nBas, 1.d0, & + P(1,1), nBas, V(1,1), nBas, & + 0.d0, tmp1(1,1), nBas) + EV = trace_matrix(nBas, tmp1(1,1)) ! Hartree energy - EJ = 0.5d0*trace_matrix(nBas, matmul(P, J)) + call dgemm("N", "N", nBas, nBas, nBas, 1.d0, & + P(1,1), nBas, J(1,1), nBas, & + 0.d0, tmp1(1,1), nBas) + EJ = 0.5d0*trace_matrix(nBas, tmp1(1,1)) ! Exchange energy - EK = 0.25d0*trace_matrix(nBas, matmul(P, K)) + call dgemm("N", "N", nBas, nBas, nBas, 1.d0, & + P(1,1), nBas, K(1,1), nBas, & + 0.d0, tmp1(1,1), nBas) + EK = 0.25d0*trace_matrix(nBas, tmp1(1,1)) ! Total energy ERHF = ET + EV + EJ + EK @@ -219,7 +234,7 @@ subroutine RHF_hpc(working_dir,dotest,maxSCF,thresh,max_diis,guess_type,level_sh if(max_diis > 1) then n_diis = min(n_diis+1,max_diis) call DIIS_extrapolation(rcond,nBas_Sq,nBas_Sq,n_diis,err_diis,F_diis,err,F) - endif + endif ! Level shift if(level_shift > 0d0 .and. Conv > thresh) then @@ -227,10 +242,20 @@ subroutine RHF_hpc(working_dir,dotest,maxSCF,thresh,max_diis,guess_type,level_sh endif ! Diagonalize Fock matrix - Fp = matmul(transpose(X), matmul(F, X)) + call dgemm("N", "N", nBas, nOrb, nBas, 1.d0, & + F(1,1), nBas, X(1,1), nBas, & + 0.d0, FX(1,1), nBas) + call dgemm("T", "N", nOrb, nOrb, nBas, 1.d0, & + X(1,1), nBas, FX(1,1), nBas, & + 0.d0, Fp(1,1), nOrb) + !Fp = matmul(transpose(X), matmul(F, X)) cp(:,:) = Fp(:,:) call diagonalize_matrix(nOrb,cp,eHF) - c = matmul(X,cp) + !c = matmul(X, cp) + call dgemm("N", "N", nBas, nOrb, nOrb, 1.d0, & + X(1,1), nBas, cp(1,1), nOrb, & + 0.d0, c(1,1), nBas) + ! Density matrix call dgemm('N', 'T', nBas, nBas, nO, 2.d0, & @@ -258,6 +283,7 @@ subroutine RHF_hpc(working_dir,dotest,maxSCF,thresh,max_diis,guess_type,level_sh write(*,*) deallocate(J,K,err,cp,Fp,err_diis,F_diis) + deallocate(tmp1, FX, ERI_chem) stop @@ -280,5 +306,6 @@ subroutine RHF_hpc(working_dir,dotest,maxSCF,thresh,max_diis,guess_type,level_sh end if deallocate(J,K,err,cp,Fp,err_diis,F_diis) + deallocate(tmp1, FX, ERI_chem) end subroutine From 26d09cc1b29ec637e80a42a4639385621ef5d418 Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Thu, 12 Dec 2024 19:39:22 +0100 Subject: [PATCH 39/39] int8 in J and K --- src/AOtoMO/Hartree_matrix_AO_basis.f90 | 40 +++++++------------ src/AOtoMO/exchange_matrix_AO_basis.f90 | 51 +++++++++---------------- 2 files changed, 31 insertions(+), 60 deletions(-) diff --git a/src/AOtoMO/Hartree_matrix_AO_basis.f90 b/src/AOtoMO/Hartree_matrix_AO_basis.f90 index 010e5bb..63141cc 100644 --- a/src/AOtoMO/Hartree_matrix_AO_basis.f90 +++ b/src/AOtoMO/Hartree_matrix_AO_basis.f90 @@ -45,31 +45,17 @@ subroutine Hartree_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, H) double precision, intent(in) :: ERI_chem(ERI_size) double precision, intent(out) :: H(nBas,nBas) - integer :: mu, nu, la, si - integer :: nunu, lala, nula, lasi, numu - integer :: nunu0, lala0 + integer*8 :: mu, nu, la, si, nBas8 + integer*8 :: nunu, lala, nula, lasi, numu + integer*8 :: nunu0, lala0 integer*8 :: nunununu, nunulala, nununula, nunulasi integer*8 :: lalanunu, lasinunu, numulala, lalanumu integer*8 :: numunula, numulasi, lasinumu, nununumu integer*8 :: nunununu0, numunumu0 -! integer*8 :: munusila -! integer*8, external :: Yoshimine_4ind -! -! do nu = 1, nBas -! do mu = 1, nu -! H(mu,nu) = 0.d0 -! do la = 1, nBas -! do si = 1, nBas -! munusila = Yoshimine_4ind(int(mu, kind=8), & -! int(nu, kind=8), & -! int(si, kind=8), & -! int(la, kind=8)) -! H(mu,nu) = H(mu,nu) + P(si,la) * ERI_chem(munusila) -! enddo -! enddo -! enddo -! enddo + + + nBas8 = int(nBas, kind=8) !$OMP PARALLEL DEFAULT(NONE) & !$OMP PRIVATE (nu, la, si, mu, & @@ -77,9 +63,9 @@ subroutine Hartree_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, H) !$OMP nunununu0, nunununu, nununula, numulala, numunula, & !$OMP nunulala, lalanunu, lalanumu, nunulasi, lasinunu, & !$OMP numunumu0, nununumu, numulasi, lasinumu) & - !$OMP SHARED (nBas, H, P, ERI_chem) + !$OMP SHARED (nBas8, H, P, ERI_chem) !$OMP DO - do nu = 1, nBas + do nu = 1, nBas8 nunu0 = shiftr(nu * (nu - 1), 1) nunu = nunu0 + nu @@ -107,7 +93,7 @@ subroutine Hartree_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, H) enddo enddo - do la = nu + 1, nBas + do la = nu + 1, nBas8 lala0 = shiftr(la * (la - 1), 1) @@ -137,7 +123,7 @@ subroutine Hartree_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, H) H(mu,nu) = H(mu,nu) + p(la,la) * ERI_chem(numulala) enddo - do la = nu + 1, nBas + do la = nu + 1, nBas8 lala = shiftr(la * (la - 1), 1) + la lalanumu = shiftr(lala * (lala - 1), 1) + numu H(mu,nu) = H(mu,nu) + p(la,la) * ERI_chem(lalanumu) @@ -164,7 +150,7 @@ subroutine Hartree_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, H) enddo enddo - do la = nu + 1, nBas + do la = nu + 1, nBas8 lala0 = shiftr(la * (la - 1), 1) do si = 1, la - 1 lasi = lala0 + si @@ -178,8 +164,8 @@ subroutine Hartree_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, H) !$OMP END DO !$OMP END PARALLEL - do nu = 1, nBas - do mu = nu+1, nBas + do nu = 1, nBas8 + do mu = nu+1, nBas8 H(mu,nu) = H(nu,mu) enddo enddo diff --git a/src/AOtoMO/exchange_matrix_AO_basis.f90 b/src/AOtoMO/exchange_matrix_AO_basis.f90 index 5a363a7..8042420 100644 --- a/src/AOtoMO/exchange_matrix_AO_basis.f90 +++ b/src/AOtoMO/exchange_matrix_AO_basis.f90 @@ -44,35 +44,20 @@ subroutine exchange_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, K) double precision, intent(in) :: ERI_chem(ERI_size) double precision, intent(out) :: K(nBas,nBas) - integer :: mu, nu, la, si - integer :: nunu, nula, lanu, lasi, nusi, sinu - integer :: numu, mumu, mula, lamu, musi, simu - integer :: nunu0, lala0, mumu0 + integer*8 :: mu, nu, la, si, nBas8 + integer*8 :: nunu, nula, lanu, lasi, nusi, sinu + integer*8 :: numu, mumu, mula, lamu, musi, simu + integer*8 :: nunu0, lala0, mumu0 integer*8 :: nunununu, nulanula, lanulanu, nulanusi integer*8 :: munulasi, lanunusi, lanusinu, numumumu integer*8 :: nulamula, nulalamu, lanulamu, nulamusi integer*8 :: nulasimu, lanumusi, lanusimu, simunula integer*8 :: simulanu, nulanula0, lanulanu0 -! integer*8 :: munusila -! integer*8, external :: Yoshimine_4ind -! -! do nu = 1, nBas -! do mu = 1, nu -! K(mu,nu) = 0.d0 -! do la = 1, nBas -! do si = 1, nBas -! munusila = Yoshimine_4ind(int(mu, kind=8), & -! int(si, kind=8), & -! int(la, kind=8), & -! int(nu, kind=8)) -! K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(munusila) -! enddo -! enddo -! enddo -! enddo + nBas8 = int(nBas, kind=8) + !$OMP PARALLEL DEFAULT (NONE) & !$OMP PRIVATE (nu, si, la, mu, & @@ -81,9 +66,9 @@ subroutine exchange_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, K) !$OMP nulanusi, lanulamu, lanunusi, lanusinu , numumumu, & !$OMP nulamula, nulalamu, lanumusi, lanusimu, nulamusi, & !$OMP nulasimu, simunula, simulanu) & - !$OMP SHARED (nBas, P, ERI_chem, K) + !$OMP SHARED (nBas8, P, ERI_chem, K) !$OMP DO - do nu = 1, nBas + do nu = 1, nBas8 nunu0 = shiftr(nu * (nu - 1), 1) nunu = nunu0 + nu @@ -97,7 +82,7 @@ subroutine exchange_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, K) K(nu,nu) = K(nu,nu) - P(la,la) * ERI_chem(nulanula) enddo - do la = nu + 1, nBas + do la = nu + 1, nBas8 lanu = shiftr(la * (la - 1), 1) + nu lanulanu = shiftr(lanu * (lanu - 1), 1) + lanu K(nu,nu) = K(nu,nu) - P(la,la) * ERI_chem(lanulanu) @@ -112,7 +97,7 @@ subroutine exchange_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, K) enddo enddo - do la = nu + 1, nBas + do la = nu + 1, nBas8 lanu = shiftr(la * (la - 1), 1) + nu lanulanu0 = shiftr(lanu * (lanu - 1), 1) do si = 1, nu @@ -144,7 +129,7 @@ subroutine exchange_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, K) nulalamu = shiftr(nula * (nula - 1), 1) + shiftr(la * (la - 1), 1) + mu K(mu,nu) = K(mu,nu) - P(la,la) * ERI_chem(nulalamu) enddo - do la = nu + 1, nBas + do la = nu + 1, nBas8 lala0 = shiftr(la * (la - 1), 1) lanu = lala0 + nu lanulamu = shiftr(lanu * (lanu - 1), 1) + lala0 + mu @@ -171,7 +156,7 @@ subroutine exchange_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, K) K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(nulasimu) enddo enddo - do la = nu + 1, nBas + do la = nu + 1, nBas8 lanu = shiftr(la * (la - 1), 1) + nu lanulanu0 = shiftr(lanu * (lanu - 1), 1) do si = 1, mu @@ -195,7 +180,7 @@ subroutine exchange_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, K) nulasimu = nulanula0 + shiftr(si * (si - 1), 1) + mu K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(nulasimu) enddo - do si = nu, nBas + do si = nu, nBas8 simu = shiftr(si * (si - 1), 1) + mu simunula = shiftr(simu * (simu - 1), 1) + nula K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(simunula) @@ -208,15 +193,15 @@ subroutine exchange_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, K) nulasimu = nulanula0 + shiftr(si * (si - 1), 1) + mu K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(nulasimu) enddo - do si = nu + 1, nBas + do si = nu + 1, nBas8 simu = shiftr(si * (si - 1), 1) + mu simunula = shiftr(simu * (simu - 1), 1) + nula K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(simunula) enddo enddo - do la = nu + 1, nBas + do la = nu + 1, nBas8 lanu = shiftr(la * (la - 1), 1) + nu - do si = la + 1, nBas + do si = la + 1, nBas8 simu = shiftr(si * (si - 1), 1) + mu simulanu = shiftr(simu * (simu - 1), 1) + lanu K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(simulanu) @@ -229,8 +214,8 @@ subroutine exchange_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, K) !$OMP END PARALLEL - do nu = 1, nBas - do mu = nu+1, nBas + do nu = 1, nBas8 + do mu = nu+1, nBas8 K(mu,nu) = K(nu,mu) enddo enddo