From 10cb7f931de11f2b55eef08d928473d0019ce05d Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Wed, 27 Nov 2024 18:29:30 +0100 Subject: [PATCH] added & tested diagonalyzation with Cusolver! --- src/RPA/phRRPA.f90 | 25 ++++++---- src/cuda/include/ph_drpa.h | 10 ---- src/cuda/include/ph_rpa.h | 10 ++++ src/cuda/include/utils.cuh | 9 ---- src/cuda/include/utils.h | 9 ++++ src/cuda/src/diag_dense_dsy_mat.cu | 41 ++++++++++++++++ src/cuda/src/ph_drpa.c | 59 ----------------------- src/cuda/src/ph_drpa_tda.c | 76 ++++++++++++++++++++++++++++++ src/cuda/src/utils.cu | 70 +++++++++++++++++++++++++++ src/make_ninja.py | 2 +- src/mod/cu_quack_module.f90 | 11 ++--- 11 files changed, 228 insertions(+), 94 deletions(-) delete mode 100644 src/cuda/include/ph_drpa.h create mode 100644 src/cuda/include/ph_rpa.h delete mode 100644 src/cuda/include/utils.cuh create mode 100644 src/cuda/include/utils.h create mode 100644 src/cuda/src/diag_dense_dsy_mat.cu delete mode 100644 src/cuda/src/ph_drpa.c create mode 100644 src/cuda/src/ph_drpa_tda.c diff --git a/src/RPA/phRRPA.f90 b/src/RPA/phRRPA.f90 index 7d00061..577ab6c 100644 --- a/src/RPA/phRRPA.f90 +++ b/src/RPA/phRRPA.f90 @@ -31,6 +31,7 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO, ! Local variables + integer :: i integer :: ispin logical :: dRPA double precision :: lambda @@ -40,7 +41,7 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO, double precision,allocatable :: XpY(:,:) double precision,allocatable :: XmY(:,:) ! DEBUG - double precision, allocatable :: XpY_gpu(:,:), XmY_gpu(:,:), Om_gpu(:) + !double precision, allocatable :: XpY_gpu(:,:), XmY_gpu(:,:), Om_gpu(:) double precision :: EcRPA(nspin) @@ -78,15 +79,21 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO, call phLR_A(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,eHF,ERI,Aph) if(.not.TDA) call phLR_B(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,ERI,Bph) - ! DEBUG - allocate(Om_gpu(nS), XpY_gpu(nS,nS), XmY_gpu(nS,nS)) - call ph_drpa(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om_gpu(1), XpY_gpu(1,1), XmY_gpu(1,1)) - print *, ' CPU:', Aph(1,1) - print *, ' GPU:', XpY_gpu(1,1) - print *, ' GPU:', XmY_gpu(1,1) - stop - call phLR(TDA,nS,Aph,Bph,EcRPA(ispin),Om,XpY,XmY) + + !! DEBUG + !allocate(Om_gpu(nS), XpY_gpu(nS,nS), XmY_gpu(nS,nS)) + !call ph_drpa_tda(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om_gpu(1), XpY_gpu(1,1)) + !do i = 1, nS + ! print *, i, Om(i), Om_gpu(i) + ! if(dabs(Om(i) - Om_gpu(i)) .gt. 1d-13) then + ! print *, 'GPU FAILED!' + ! stop + ! endif + !enddo + !print *, 'GPU DONE!' + !stop + call print_excitation_energies('phRPA@RHF','singlet',nS,Om) call phLR_transition_vectors(.true.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY) diff --git a/src/cuda/include/ph_drpa.h b/src/cuda/include/ph_drpa.h deleted file mode 100644 index 1786a2f..0000000 --- a/src/cuda/include/ph_drpa.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef PH_DRPA - -#define PH_DRPA - -extern void check_Cuda_Errors(cudaError_t err, const char * msg, const char * file, int line); -extern void check_Cublas_Errors(cublasStatus_t status, const char * msg, const char * file, int line); - -extern void ph_dRPA_A_sing(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *A); - -#endif diff --git a/src/cuda/include/ph_rpa.h b/src/cuda/include/ph_rpa.h new file mode 100644 index 0000000..36f111f --- /dev/null +++ b/src/cuda/include/ph_rpa.h @@ -0,0 +1,10 @@ +#ifndef PH_RPA + +#define PH_RPA + +extern void ph_dRPA_A_sing(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *A); +extern void ph_dRPA_B_sing(int nO, int nV, int nBas, int nS, double *ERI, double *B); + +extern void diag_dn_dsyevd(int n, int *info, double *W, double *A); + +#endif diff --git a/src/cuda/include/utils.cuh b/src/cuda/include/utils.cuh deleted file mode 100644 index 1a91732..0000000 --- a/src/cuda/include/utils.cuh +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef UTILS -#define UTILS - -extern "C" void check_Cuda_Errors(cudaError_t err, const char* msg, const char* file, int line); - -extern "C" void check_Cublas_Errors(cublasStatus_t status, const char* msg, const char* file, int line); - - -#endif diff --git a/src/cuda/include/utils.h b/src/cuda/include/utils.h new file mode 100644 index 0000000..f5a6403 --- /dev/null +++ b/src/cuda/include/utils.h @@ -0,0 +1,9 @@ +#ifndef UTILS + +#define UTILS + +extern void check_Cuda_Errors(cudaError_t err, const char *msg, const char *file, int line); +extern void check_Cublas_Errors(cublasStatus_t status, const char *msg, const char *file, int line); +extern void check_Cusolver_Errors(cusolverStatus_t status, const char *msg, const char *file, int line); + +#endif diff --git a/src/cuda/src/diag_dense_dsy_mat.cu b/src/cuda/src/diag_dense_dsy_mat.cu new file mode 100644 index 0000000..b800501 --- /dev/null +++ b/src/cuda/src/diag_dense_dsy_mat.cu @@ -0,0 +1,41 @@ +#include +#include +#include +#include + + + +extern "C" void diag_dn_dsyevd(int n, int *info, double *W, double *A) { + + cusolverDnHandle_t cusolverH = NULL; + cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; // Compute eigenvalues and eigenvectors + cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER; // Upper triangular part of the matrix is stored + + int lwork = 0; + double *work = NULL; + + //check_Cusolver_Errors(cusolverDnCreate(&cusolverH), "cusolverDnCreate", __FILE__, __LINE__); + cusolverDnCreate(&cusolverH); + + // Query workspace size + //check_Cusolver_Errors(cusolverDnDsyevd_bufferSize(cusolverH, jobz, uplo, n, A, n, W, &lwork), + // "cusolverDnDsyevd_bufferSize", __FILE__, __LINE__); + //check_Cuda_Errors(cudaMalloc((void**)&work, sizeof(double) * lwork), + // "cudaMemcpy", __FILE__, __LINE__); + cusolverDnDsyevd_bufferSize(cusolverH, jobz, uplo, n, A, n, W, &lwork); + cudaMalloc((void**)&work, sizeof(double) * lwork); + + // Compute eigenvalues and eigenvectors + //check_Cusolver_Errors(cusolverDnDsyevd(cusolverH, jobz, uplo, n, A, n, W, work, lwork, info), + // "cusolverDnDsyevd", __FILE__, __LINE__); + cusolverDnDsyevd(cusolverH, jobz, uplo, n, A, n, W, work, lwork, info); + + // Clean up + //check_Cuda_Errors(cudaFree(work), "cudaFree", __FILE__, __LINE__); + //check_Cusolver_Errors(cusolverDnDestroy(cusolverH), "cusolverDnDestroy", __FILE__, __LINE__); + + cudaFree(work); + cusolverDnDestroy(cusolverH); + +} + diff --git a/src/cuda/src/ph_drpa.c b/src/cuda/src/ph_drpa.c deleted file mode 100644 index c90bfab..0000000 --- a/src/cuda/src/ph_drpa.c +++ /dev/null @@ -1,59 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "ph_drpa.h" - -int ph_drpa(int nO, int nBas, int nS, double *h_eps, double *h_ERI, - double *h_Omega, double *h_XpY, double *h_XmY) { - - double *d_eps; - double *d_ERI; - - int nV = nBas - nO; - - int nBas2 = nBas * nBas; - int nBas4 = nBas2 * nBas2; - - int ia, jb; - for (ia = 0; ia < nS; ia++) { - h_Omega[ia] = 0.0; - for (jb = 0; jb < nS; jb++) { - h_XmY[jb + ia * nS] = 4.0; - //h_XpY[jb + ia * nS] = 5.0; - } - } - - - check_Cuda_Errors(cudaMalloc((void**)&d_eps, nBas * sizeof(double)), - "cudaMalloc", __FILE__, __LINE__); - check_Cuda_Errors(cudaMalloc((void**)&d_ERI, nBas4 * sizeof(double)), - "cudaMalloc", __FILE__, __LINE__); - - - check_Cuda_Errors(cudaMemcpy(d_eps, h_eps, nBas * sizeof(double), cudaMemcpyHostToDevice), - "cudaMemcpy", __FILE__, __LINE__); - check_Cuda_Errors(cudaMemcpy(d_ERI, h_ERI, nBas4 * sizeof(double), cudaMemcpyHostToDevice), - "cudaMemcpy", __FILE__, __LINE__); - - // construct A matrix - double *d_A; - check_Cuda_Errors(cudaMalloc((void**)&d_A, nS * nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); - ph_dRPA_A_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_A); - check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); - - - check_Cuda_Errors(cudaMemcpy(h_XpY, d_A, nS * nS * sizeof(double), cudaMemcpyDeviceToHost), - "cudaMemcpy", __FILE__, __LINE__); - - check_Cuda_Errors(cudaFree(d_eps), "cudaFree", __FILE__, __LINE__); - check_Cuda_Errors(cudaFree(d_ERI), "cudaFree", __FILE__, __LINE__); - check_Cuda_Errors(cudaFree(d_A), "cudaFree", __FILE__, __LINE__); - - - return 0; -} - diff --git a/src/cuda/src/ph_drpa_tda.c b/src/cuda/src/ph_drpa_tda.c new file mode 100644 index 0000000..2d6bccf --- /dev/null +++ b/src/cuda/src/ph_drpa_tda.c @@ -0,0 +1,76 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "utils.h" +#include "ph_rpa.h" + +void ph_drpa_tda(int nO, int nBas, int nS, double *h_eps, double *h_ERI, + double *h_Omega, double *h_X) { + + double *d_eps = NULL; + double *d_ERI = NULL; + + int nV = nBas - nO; + + int nBas2 = nBas * nBas; + int nBas4 = nBas2 * nBas2; + + + check_Cuda_Errors(cudaMalloc((void**)&d_eps, nBas * sizeof(double)), + "cudaMalloc", __FILE__, __LINE__); + check_Cuda_Errors(cudaMalloc((void**)&d_ERI, nBas4 * sizeof(double)), + "cudaMalloc", __FILE__, __LINE__); + + check_Cuda_Errors(cudaMemcpy(d_eps, h_eps, nBas * sizeof(double), cudaMemcpyHostToDevice), + "cudaMemcpy", __FILE__, __LINE__); + check_Cuda_Errors(cudaMemcpy(d_ERI, h_ERI, nBas4 * sizeof(double), cudaMemcpyHostToDevice), + "cudaMemcpy", __FILE__, __LINE__); + + // construct A + double *d_A = NULL; + check_Cuda_Errors(cudaMalloc((void**)&d_A, nS * nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); + + ph_dRPA_A_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_A); + check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); + + + // diagonalize A + int *d_info = NULL; + double *d_Omega = NULL; + check_Cuda_Errors(cudaMalloc((void**)&d_info, sizeof(int)), + "cudaMalloc", __FILE__, __LINE__); + check_Cuda_Errors(cudaMalloc((void**)&d_Omega, nS * sizeof(double)), + "cudaMalloc", __FILE__, __LINE__); + + diag_dn_dsyevd(nS, d_info, d_Omega, d_A); + check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); + + int info_gpu = 0; + check_Cuda_Errors(cudaMemcpy(&info_gpu, d_info, sizeof(int), cudaMemcpyDeviceToHost), + "cudaMemcpy", __FILE__, __LINE__); + if (info_gpu != 0) { + printf("Error: diag_dn_dsyevd returned error code %d\n", info_gpu); + exit(EXIT_FAILURE); + } + + + check_Cuda_Errors(cudaMemcpy(h_X, d_A, nS * nS * sizeof(double), cudaMemcpyDeviceToHost), + "cudaMemcpy", __FILE__, __LINE__); + + check_Cuda_Errors(cudaMemcpy(h_Omega, d_Omega, nS * sizeof(double), cudaMemcpyDeviceToHost), + "cudaMemcpy", __FILE__, __LINE__); + + check_Cuda_Errors(cudaFree(d_info), "cudaFree", __FILE__, __LINE__); + check_Cuda_Errors(cudaFree(d_eps), "cudaFree", __FILE__, __LINE__); + check_Cuda_Errors(cudaFree(d_ERI), "cudaFree", __FILE__, __LINE__); + check_Cuda_Errors(cudaFree(d_A), "cudaFree", __FILE__, __LINE__); + check_Cuda_Errors(cudaFree(d_Omega), "cudaFree", __FILE__, __LINE__); + + +} + diff --git a/src/cuda/src/utils.cu b/src/cuda/src/utils.cu index ff5d83d..673b21e 100644 --- a/src/cuda/src/utils.cu +++ b/src/cuda/src/utils.cu @@ -3,6 +3,7 @@ #include #include #include +#include @@ -53,3 +54,72 @@ extern "C" void check_Cublas_Errors(cublasStatus_t status, const char* msg, cons } + +const char* cusolver_Get_Error_String(cusolverStatus_t status) { + switch (status) { + case CUSOLVER_STATUS_SUCCESS: + return "CUSOLVER_STATUS_SUCCESS"; + case CUSOLVER_STATUS_NOT_INITIALIZED: + return "CUSOLVER_STATUS_NOT_INITIALIZED"; + case CUSOLVER_STATUS_ALLOC_FAILED: + return "CUSOLVER_STATUS_ALLOC_FAILED"; + case CUSOLVER_STATUS_INVALID_VALUE: + return "CUSOLVER_STATUS_INVALID_VALUE"; + case CUSOLVER_STATUS_ARCH_MISMATCH: + return "CUSOLVER_STATUS_ARCH_MISMATCH"; + case CUSOLVER_STATUS_MAPPING_ERROR: + return "CUSOLVER_STATUS_MAPPING_ERROR"; + case CUSOLVER_STATUS_EXECUTION_FAILED: + return "CUSOLVER_STATUS_EXECUTION_FAILED"; + case CUSOLVER_STATUS_INTERNAL_ERROR: + return "CUSOLVER_STATUS_INTERNAL_ERROR"; + case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + case CUSOLVER_STATUS_NOT_SUPPORTED: + return "CUSOLVER_STATUS_NOT_SUPPORTED"; + case CUSOLVER_STATUS_ZERO_PIVOT: + return "CUSOLVER_STATUS_ZERO_PIVOT"; + case CUSOLVER_STATUS_INVALID_LICENSE: + return "CUSOLVER_STATUS_INVALID_LICENSE"; + case CUSOLVER_STATUS_IRS_PARAMS_NOT_INITIALIZED: + return "CUSOLVER_STATUS_IRS_PARAMS_NOT_INITIALIZED"; + case CUSOLVER_STATUS_IRS_PARAMS_INVALID: + return "CUSOLVER_STATUS_IRS_PARAMS_INVALID"; + case CUSOLVER_STATUS_IRS_PARAMS_INVALID_PREC: + return "CUSOLVER_STATUS_IRS_PARAMS_INVALID_PREC"; + case CUSOLVER_STATUS_IRS_PARAMS_INVALID_REFINE: + return "CUSOLVER_STATUS_IRS_PARAMS_INVALID_REFINE"; + case CUSOLVER_STATUS_IRS_PARAMS_INVALID_MAXITER: + return "CUSOLVER_STATUS_IRS_PARAMS_INVALID_MAXITER"; + case CUSOLVER_STATUS_IRS_INTERNAL_ERROR: + return "CUSOLVER_STATUS_IRS_INTERNAL_ERROR"; + case CUSOLVER_STATUS_IRS_NOT_SUPPORTED: + return "CUSOLVER_STATUS_IRS_NOT_SUPPORTED"; + case CUSOLVER_STATUS_IRS_OUT_OF_RANGE: + return "CUSOLVER_STATUS_IRS_OUT_OF_RANGE"; + case CUSOLVER_STATUS_IRS_NRHS_NOT_SUPPORTED_FOR_REFINE_GMRES: + return "CUSOLVER_STATUS_IRS_NRHS_NOT_SUPPORTED_FOR_REFINE_GMRES"; + case CUSOLVER_STATUS_IRS_INFOS_NOT_INITIALIZED: + return "CUSOLVER_STATUS_IRS_INFOS_NOT_INITIALIZED"; + case CUSOLVER_STATUS_IRS_INFOS_NOT_DESTROYED: + return "CUSOLVER_STATUS_IRS_INFOS_NOT_DESTROYED"; + case CUSOLVER_STATUS_IRS_MATRIX_SINGULAR: + return "CUSOLVER_STATUS_IRS_MATRIX_SINGULAR"; + case CUSOLVER_STATUS_INVALID_WORKSPACE: + return "CUSOLVER_STATUS_INVALID_WORKSPACE"; + default: + return "UNKNOWN CUSOLVER ERROR"; + } +} + +extern "C" void check_Cusolver_Errors(cusolverStatus_t status, const char* msg, const char* file, int line) { + + const char* err = cusolver_Get_Error_String(status); + + if (status != CUSOLVER_STATUS_SUCCESS) { + printf("CUSOLVER Error in %s at line %d\n", file, line); + printf("%s - %s\n", msg, err); + exit(EXIT_FAILURE); + } +} + diff --git a/src/make_ninja.py b/src/make_ninja.py index 3d558d3..770c405 100755 --- a/src/make_ninja.py +++ b/src/make_ninja.py @@ -111,7 +111,7 @@ else: if USE_GPU: compiler_tmp = compiler.strip().split('\n') - compiler_tmp[0] += " -L{}/src/cuda/build -lcuquack -lcudart -lcublas".format(QUACK_ROOT) + compiler_tmp[0] += " -L{}/src/cuda/build -lcuquack -lcudart -lcublas -lcusolver".format(QUACK_ROOT) compiler_exe = '\n'.join(compiler_tmp) else: compiler_exe = compiler diff --git a/src/mod/cu_quack_module.f90 b/src/mod/cu_quack_module.f90 index a786587..cf974bb 100644 --- a/src/mod/cu_quack_module.f90 +++ b/src/mod/cu_quack_module.f90 @@ -8,18 +8,17 @@ module cu_quack_module interface - subroutine ph_drpa(nO, nBas, nS, eps, ERI, & - Omega, XpY, XmY) bind(C, name = "ph_drpa") + subroutine ph_drpa_tda(nO, nBas, nS, eps, ERI, & + Omega, X) bind(C, name = "ph_drpa_tda") import c_int, c_double integer(c_int), intent(in), value :: nO, nBas, nS real(c_double), intent(in) :: eps(nBas) real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas) real(c_double), intent(out) :: Omega(nS) - real(c_double), intent(out) :: XpY(nS,nS) - real(c_double), intent(out) :: XmY(nS,nS) + real(c_double), intent(out) :: X(nS,nS) - end subroutine ph_drpa + end subroutine ph_drpa_tda end interface @@ -29,7 +28,7 @@ module cu_quack_module subroutine cu_quack_module_test() implicit none - print*, ' hello from mod_test' + print*, ' hello from cu_quack_module' end subroutine cu_quack_module_test ! ---