10
1
mirror of https://github.com/pfloos/quack synced 2025-01-05 02:48:48 +01:00

added & tested diagonalyzation with Cusolver!

This commit is contained in:
Abdallah Ammar 2024-11-27 18:29:30 +01:00
parent 0df9203eb3
commit 10cb7f931d
11 changed files with 228 additions and 94 deletions

View File

@ -31,6 +31,7 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,
! Local variables ! Local variables
integer :: i
integer :: ispin integer :: ispin
logical :: dRPA logical :: dRPA
double precision :: lambda double precision :: lambda
@ -40,7 +41,7 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,
double precision,allocatable :: XpY(:,:) double precision,allocatable :: XpY(:,:)
double precision,allocatable :: XmY(:,:) double precision,allocatable :: XmY(:,:)
! DEBUG ! DEBUG
double precision, allocatable :: XpY_gpu(:,:), XmY_gpu(:,:), Om_gpu(:) !double precision, allocatable :: XpY_gpu(:,:), XmY_gpu(:,:), Om_gpu(:)
double precision :: EcRPA(nspin) double precision :: EcRPA(nspin)
@ -78,15 +79,21 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,
call phLR_A(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,eHF,ERI,Aph) call phLR_A(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,eHF,ERI,Aph)
if(.not.TDA) call phLR_B(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,ERI,Bph) if(.not.TDA) call phLR_B(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,ERI,Bph)
! DEBUG
allocate(Om_gpu(nS), XpY_gpu(nS,nS), XmY_gpu(nS,nS))
call ph_drpa(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om_gpu(1), XpY_gpu(1,1), XmY_gpu(1,1))
print *, ' CPU:', Aph(1,1)
print *, ' GPU:', XpY_gpu(1,1)
print *, ' GPU:', XmY_gpu(1,1)
stop
call phLR(TDA,nS,Aph,Bph,EcRPA(ispin),Om,XpY,XmY) call phLR(TDA,nS,Aph,Bph,EcRPA(ispin),Om,XpY,XmY)
!! DEBUG
!allocate(Om_gpu(nS), XpY_gpu(nS,nS), XmY_gpu(nS,nS))
!call ph_drpa_tda(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om_gpu(1), XpY_gpu(1,1))
!do i = 1, nS
! print *, i, Om(i), Om_gpu(i)
! if(dabs(Om(i) - Om_gpu(i)) .gt. 1d-13) then
! print *, 'GPU FAILED!'
! stop
! endif
!enddo
!print *, 'GPU DONE!'
!stop
call print_excitation_energies('phRPA@RHF','singlet',nS,Om) call print_excitation_energies('phRPA@RHF','singlet',nS,Om)
call phLR_transition_vectors(.true.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY) call phLR_transition_vectors(.true.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY)

View File

@ -1,10 +0,0 @@
#ifndef PH_DRPA
#define PH_DRPA
extern void check_Cuda_Errors(cudaError_t err, const char * msg, const char * file, int line);
extern void check_Cublas_Errors(cublasStatus_t status, const char * msg, const char * file, int line);
extern void ph_dRPA_A_sing(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *A);
#endif

10
src/cuda/include/ph_rpa.h Normal file
View File

@ -0,0 +1,10 @@
#ifndef PH_RPA
#define PH_RPA
extern void ph_dRPA_A_sing(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *A);
extern void ph_dRPA_B_sing(int nO, int nV, int nBas, int nS, double *ERI, double *B);
extern void diag_dn_dsyevd(int n, int *info, double *W, double *A);
#endif

View File

@ -1,9 +0,0 @@
#ifndef UTILS
#define UTILS
extern "C" void check_Cuda_Errors(cudaError_t err, const char* msg, const char* file, int line);
extern "C" void check_Cublas_Errors(cublasStatus_t status, const char* msg, const char* file, int line);
#endif

9
src/cuda/include/utils.h Normal file
View File

@ -0,0 +1,9 @@
#ifndef UTILS
#define UTILS
extern void check_Cuda_Errors(cudaError_t err, const char *msg, const char *file, int line);
extern void check_Cublas_Errors(cublasStatus_t status, const char *msg, const char *file, int line);
extern void check_Cusolver_Errors(cusolverStatus_t status, const char *msg, const char *file, int line);
#endif

View File

@ -0,0 +1,41 @@
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
extern "C" void diag_dn_dsyevd(int n, int *info, double *W, double *A) {
cusolverDnHandle_t cusolverH = NULL;
cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; // Compute eigenvalues and eigenvectors
cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER; // Upper triangular part of the matrix is stored
int lwork = 0;
double *work = NULL;
//check_Cusolver_Errors(cusolverDnCreate(&cusolverH), "cusolverDnCreate", __FILE__, __LINE__);
cusolverDnCreate(&cusolverH);
// Query workspace size
//check_Cusolver_Errors(cusolverDnDsyevd_bufferSize(cusolverH, jobz, uplo, n, A, n, W, &lwork),
// "cusolverDnDsyevd_bufferSize", __FILE__, __LINE__);
//check_Cuda_Errors(cudaMalloc((void**)&work, sizeof(double) * lwork),
// "cudaMemcpy", __FILE__, __LINE__);
cusolverDnDsyevd_bufferSize(cusolverH, jobz, uplo, n, A, n, W, &lwork);
cudaMalloc((void**)&work, sizeof(double) * lwork);
// Compute eigenvalues and eigenvectors
//check_Cusolver_Errors(cusolverDnDsyevd(cusolverH, jobz, uplo, n, A, n, W, work, lwork, info),
// "cusolverDnDsyevd", __FILE__, __LINE__);
cusolverDnDsyevd(cusolverH, jobz, uplo, n, A, n, W, work, lwork, info);
// Clean up
//check_Cuda_Errors(cudaFree(work), "cudaFree", __FILE__, __LINE__);
//check_Cusolver_Errors(cusolverDnDestroy(cusolverH), "cusolverDnDestroy", __FILE__, __LINE__);
cudaFree(work);
cusolverDnDestroy(cusolverH);
}

View File

@ -1,59 +0,0 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <stdlib.h>
#include <stdio.h>
#include <cublas_v2.h>
#include "ph_drpa.h"
int ph_drpa(int nO, int nBas, int nS, double *h_eps, double *h_ERI,
double *h_Omega, double *h_XpY, double *h_XmY) {
double *d_eps;
double *d_ERI;
int nV = nBas - nO;
int nBas2 = nBas * nBas;
int nBas4 = nBas2 * nBas2;
int ia, jb;
for (ia = 0; ia < nS; ia++) {
h_Omega[ia] = 0.0;
for (jb = 0; jb < nS; jb++) {
h_XmY[jb + ia * nS] = 4.0;
//h_XpY[jb + ia * nS] = 5.0;
}
}
check_Cuda_Errors(cudaMalloc((void**)&d_eps, nBas * sizeof(double)),
"cudaMalloc", __FILE__, __LINE__);
check_Cuda_Errors(cudaMalloc((void**)&d_ERI, nBas4 * sizeof(double)),
"cudaMalloc", __FILE__, __LINE__);
check_Cuda_Errors(cudaMemcpy(d_eps, h_eps, nBas * sizeof(double), cudaMemcpyHostToDevice),
"cudaMemcpy", __FILE__, __LINE__);
check_Cuda_Errors(cudaMemcpy(d_ERI, h_ERI, nBas4 * sizeof(double), cudaMemcpyHostToDevice),
"cudaMemcpy", __FILE__, __LINE__);
// construct A matrix
double *d_A;
check_Cuda_Errors(cudaMalloc((void**)&d_A, nS * nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__);
ph_dRPA_A_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_A);
check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__);
check_Cuda_Errors(cudaMemcpy(h_XpY, d_A, nS * nS * sizeof(double), cudaMemcpyDeviceToHost),
"cudaMemcpy", __FILE__, __LINE__);
check_Cuda_Errors(cudaFree(d_eps), "cudaFree", __FILE__, __LINE__);
check_Cuda_Errors(cudaFree(d_ERI), "cudaFree", __FILE__, __LINE__);
check_Cuda_Errors(cudaFree(d_A), "cudaFree", __FILE__, __LINE__);
return 0;
}

View File

@ -0,0 +1,76 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <stdlib.h>
#include <stdio.h>
#include <cublas_v2.h>
#include <cusolverDn.h>
#include "utils.h"
#include "ph_rpa.h"
void ph_drpa_tda(int nO, int nBas, int nS, double *h_eps, double *h_ERI,
double *h_Omega, double *h_X) {
double *d_eps = NULL;
double *d_ERI = NULL;
int nV = nBas - nO;
int nBas2 = nBas * nBas;
int nBas4 = nBas2 * nBas2;
check_Cuda_Errors(cudaMalloc((void**)&d_eps, nBas * sizeof(double)),
"cudaMalloc", __FILE__, __LINE__);
check_Cuda_Errors(cudaMalloc((void**)&d_ERI, nBas4 * sizeof(double)),
"cudaMalloc", __FILE__, __LINE__);
check_Cuda_Errors(cudaMemcpy(d_eps, h_eps, nBas * sizeof(double), cudaMemcpyHostToDevice),
"cudaMemcpy", __FILE__, __LINE__);
check_Cuda_Errors(cudaMemcpy(d_ERI, h_ERI, nBas4 * sizeof(double), cudaMemcpyHostToDevice),
"cudaMemcpy", __FILE__, __LINE__);
// construct A
double *d_A = NULL;
check_Cuda_Errors(cudaMalloc((void**)&d_A, nS * nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__);
ph_dRPA_A_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_A);
check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__);
// diagonalize A
int *d_info = NULL;
double *d_Omega = NULL;
check_Cuda_Errors(cudaMalloc((void**)&d_info, sizeof(int)),
"cudaMalloc", __FILE__, __LINE__);
check_Cuda_Errors(cudaMalloc((void**)&d_Omega, nS * sizeof(double)),
"cudaMalloc", __FILE__, __LINE__);
diag_dn_dsyevd(nS, d_info, d_Omega, d_A);
check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__);
int info_gpu = 0;
check_Cuda_Errors(cudaMemcpy(&info_gpu, d_info, sizeof(int), cudaMemcpyDeviceToHost),
"cudaMemcpy", __FILE__, __LINE__);
if (info_gpu != 0) {
printf("Error: diag_dn_dsyevd returned error code %d\n", info_gpu);
exit(EXIT_FAILURE);
}
check_Cuda_Errors(cudaMemcpy(h_X, d_A, nS * nS * sizeof(double), cudaMemcpyDeviceToHost),
"cudaMemcpy", __FILE__, __LINE__);
check_Cuda_Errors(cudaMemcpy(h_Omega, d_Omega, nS * sizeof(double), cudaMemcpyDeviceToHost),
"cudaMemcpy", __FILE__, __LINE__);
check_Cuda_Errors(cudaFree(d_info), "cudaFree", __FILE__, __LINE__);
check_Cuda_Errors(cudaFree(d_eps), "cudaFree", __FILE__, __LINE__);
check_Cuda_Errors(cudaFree(d_ERI), "cudaFree", __FILE__, __LINE__);
check_Cuda_Errors(cudaFree(d_A), "cudaFree", __FILE__, __LINE__);
check_Cuda_Errors(cudaFree(d_Omega), "cudaFree", __FILE__, __LINE__);
}

View File

@ -3,6 +3,7 @@
#include <stdio.h> #include <stdio.h>
#include <cublas_v2.h> #include <cublas_v2.h>
#include <cstring> #include <cstring>
#include <cusolverDn.h>
@ -53,3 +54,72 @@ extern "C" void check_Cublas_Errors(cublasStatus_t status, const char* msg, cons
} }
const char* cusolver_Get_Error_String(cusolverStatus_t status) {
switch (status) {
case CUSOLVER_STATUS_SUCCESS:
return "CUSOLVER_STATUS_SUCCESS";
case CUSOLVER_STATUS_NOT_INITIALIZED:
return "CUSOLVER_STATUS_NOT_INITIALIZED";
case CUSOLVER_STATUS_ALLOC_FAILED:
return "CUSOLVER_STATUS_ALLOC_FAILED";
case CUSOLVER_STATUS_INVALID_VALUE:
return "CUSOLVER_STATUS_INVALID_VALUE";
case CUSOLVER_STATUS_ARCH_MISMATCH:
return "CUSOLVER_STATUS_ARCH_MISMATCH";
case CUSOLVER_STATUS_MAPPING_ERROR:
return "CUSOLVER_STATUS_MAPPING_ERROR";
case CUSOLVER_STATUS_EXECUTION_FAILED:
return "CUSOLVER_STATUS_EXECUTION_FAILED";
case CUSOLVER_STATUS_INTERNAL_ERROR:
return "CUSOLVER_STATUS_INTERNAL_ERROR";
case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
case CUSOLVER_STATUS_NOT_SUPPORTED:
return "CUSOLVER_STATUS_NOT_SUPPORTED";
case CUSOLVER_STATUS_ZERO_PIVOT:
return "CUSOLVER_STATUS_ZERO_PIVOT";
case CUSOLVER_STATUS_INVALID_LICENSE:
return "CUSOLVER_STATUS_INVALID_LICENSE";
case CUSOLVER_STATUS_IRS_PARAMS_NOT_INITIALIZED:
return "CUSOLVER_STATUS_IRS_PARAMS_NOT_INITIALIZED";
case CUSOLVER_STATUS_IRS_PARAMS_INVALID:
return "CUSOLVER_STATUS_IRS_PARAMS_INVALID";
case CUSOLVER_STATUS_IRS_PARAMS_INVALID_PREC:
return "CUSOLVER_STATUS_IRS_PARAMS_INVALID_PREC";
case CUSOLVER_STATUS_IRS_PARAMS_INVALID_REFINE:
return "CUSOLVER_STATUS_IRS_PARAMS_INVALID_REFINE";
case CUSOLVER_STATUS_IRS_PARAMS_INVALID_MAXITER:
return "CUSOLVER_STATUS_IRS_PARAMS_INVALID_MAXITER";
case CUSOLVER_STATUS_IRS_INTERNAL_ERROR:
return "CUSOLVER_STATUS_IRS_INTERNAL_ERROR";
case CUSOLVER_STATUS_IRS_NOT_SUPPORTED:
return "CUSOLVER_STATUS_IRS_NOT_SUPPORTED";
case CUSOLVER_STATUS_IRS_OUT_OF_RANGE:
return "CUSOLVER_STATUS_IRS_OUT_OF_RANGE";
case CUSOLVER_STATUS_IRS_NRHS_NOT_SUPPORTED_FOR_REFINE_GMRES:
return "CUSOLVER_STATUS_IRS_NRHS_NOT_SUPPORTED_FOR_REFINE_GMRES";
case CUSOLVER_STATUS_IRS_INFOS_NOT_INITIALIZED:
return "CUSOLVER_STATUS_IRS_INFOS_NOT_INITIALIZED";
case CUSOLVER_STATUS_IRS_INFOS_NOT_DESTROYED:
return "CUSOLVER_STATUS_IRS_INFOS_NOT_DESTROYED";
case CUSOLVER_STATUS_IRS_MATRIX_SINGULAR:
return "CUSOLVER_STATUS_IRS_MATRIX_SINGULAR";
case CUSOLVER_STATUS_INVALID_WORKSPACE:
return "CUSOLVER_STATUS_INVALID_WORKSPACE";
default:
return "UNKNOWN CUSOLVER ERROR";
}
}
extern "C" void check_Cusolver_Errors(cusolverStatus_t status, const char* msg, const char* file, int line) {
const char* err = cusolver_Get_Error_String(status);
if (status != CUSOLVER_STATUS_SUCCESS) {
printf("CUSOLVER Error in %s at line %d\n", file, line);
printf("%s - %s\n", msg, err);
exit(EXIT_FAILURE);
}
}

View File

@ -111,7 +111,7 @@ else:
if USE_GPU: if USE_GPU:
compiler_tmp = compiler.strip().split('\n') compiler_tmp = compiler.strip().split('\n')
compiler_tmp[0] += " -L{}/src/cuda/build -lcuquack -lcudart -lcublas".format(QUACK_ROOT) compiler_tmp[0] += " -L{}/src/cuda/build -lcuquack -lcudart -lcublas -lcusolver".format(QUACK_ROOT)
compiler_exe = '\n'.join(compiler_tmp) compiler_exe = '\n'.join(compiler_tmp)
else: else:
compiler_exe = compiler compiler_exe = compiler

View File

@ -8,18 +8,17 @@ module cu_quack_module
interface interface
subroutine ph_drpa(nO, nBas, nS, eps, ERI, & subroutine ph_drpa_tda(nO, nBas, nS, eps, ERI, &
Omega, XpY, XmY) bind(C, name = "ph_drpa") Omega, X) bind(C, name = "ph_drpa_tda")
import c_int, c_double import c_int, c_double
integer(c_int), intent(in), value :: nO, nBas, nS integer(c_int), intent(in), value :: nO, nBas, nS
real(c_double), intent(in) :: eps(nBas) real(c_double), intent(in) :: eps(nBas)
real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas) real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas)
real(c_double), intent(out) :: Omega(nS) real(c_double), intent(out) :: Omega(nS)
real(c_double), intent(out) :: XpY(nS,nS) real(c_double), intent(out) :: X(nS,nS)
real(c_double), intent(out) :: XmY(nS,nS)
end subroutine ph_drpa end subroutine ph_drpa_tda
end interface end interface
@ -29,7 +28,7 @@ module cu_quack_module
subroutine cu_quack_module_test() subroutine cu_quack_module_test()
implicit none implicit none
print*, ' hello from mod_test' print*, ' hello from cu_quack_module'
end subroutine cu_quack_module_test end subroutine cu_quack_module_test
! --- ! ---