From b2e82bff2ec054ef1abc8e6e5a6bf72696bff1f3 Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Wed, 27 Nov 2024 08:51:04 +0100 Subject: [PATCH] added cuda kernel for A-trip-dRPA --- src/cuda/include/ph_drpa.h | 2 +- src/cuda/src/ph_drpa.c | 2 +- ...{phlr_drpa_a_sing.cu => ph_drpa_a_sing.cu} | 8 +- src/cuda/src/ph_drpa_a_trip.cu | 89 +++++++++++++++++++ 4 files changed, 95 insertions(+), 6 deletions(-) rename src/cuda/src/{phlr_drpa_a_sing.cu => ph_drpa_a_sing.cu} (79%) create mode 100644 src/cuda/src/ph_drpa_a_trip.cu diff --git a/src/cuda/include/ph_drpa.h b/src/cuda/include/ph_drpa.h index 6f1ed07..1f68857 100644 --- a/src/cuda/include/ph_drpa.h +++ b/src/cuda/include/ph_drpa.h @@ -5,6 +5,6 @@ extern void check_Cuda_Errors(cudaError_t err, const char * msg, const char * file, int line); extern void check_Cublas_Errors(cublasStatus_t status, const char * msg, const char * file, int line); -extern void phLR_dRPA_A_sing(int nO, int nBas, double *eps, double *ERI, double *A); +extern void ph_dRPA_A_sing(int nO, int nBas, double *eps, double *ERI, double *A); #endif diff --git a/src/cuda/src/ph_drpa.c b/src/cuda/src/ph_drpa.c index 0e0f671..304f1c1 100644 --- a/src/cuda/src/ph_drpa.c +++ b/src/cuda/src/ph_drpa.c @@ -40,7 +40,7 @@ int ph_drpa(int nO, int nBas, int nS, double *h_eps, double *h_ERI, // construct A matrix double *d_A; check_Cuda_Errors(cudaMalloc((void**)&d_A, nS * nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); - phLR_dRPA_A_sing(nO, nBas, d_eps, d_ERI, d_A); + ph_dRPA_A_sing(nO, nBas, d_eps, d_ERI, d_A); check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); diff --git a/src/cuda/src/phlr_drpa_a_sing.cu b/src/cuda/src/ph_drpa_a_sing.cu similarity index 79% rename from src/cuda/src/phlr_drpa_a_sing.cu rename to src/cuda/src/ph_drpa_a_sing.cu index 7844d66..5308c1d 100644 --- a/src/cuda/src/phlr_drpa_a_sing.cu +++ b/src/cuda/src/ph_drpa_a_sing.cu @@ -1,6 +1,6 @@ #include -__global__ void phLR_dRPA_A_sing_kernel(int nO, int nBas, double *eps, double *ERI, double *A) { +__global__ void ph_dRPA_A_sing_kernel(int nO, int nBas, double *eps, double *ERI, double *A) { int i, j, a, b; @@ -64,7 +64,7 @@ __global__ void phLR_dRPA_A_sing_kernel(int nO, int nBas, double *eps, double *E -extern "C" void phLR_dRPA_A_sing(int nO, int nBas, double *eps, double *ERI, double *A) { +extern "C" void ph_dRPA_A_sing(int nO, int nBas, double *eps, double *ERI, double *A) { int size = nBas - nO; @@ -76,11 +76,11 @@ extern "C" void phLR_dRPA_A_sing(int nO, int nBas, double *eps, double *ERI, dou dim3 dimBlock(sBlocks, sBlocks, 1); - printf("lunching phLR_dRPA_A_sing_kernel with %dx%d blocks and %dx%d threads/block\n", + printf("lunching ph_dRPA_A_sing_kernel with %dx%d blocks and %dx%d threads/block\n", nBlocks, nBlocks, sBlocks, sBlocks); - phLR_dRPA_A_sing_kernel<<>>(nO, nBas, eps, ERI, A); + ph_dRPA_A_sing_kernel<<>>(nO, nBas, eps, ERI, A); } diff --git a/src/cuda/src/ph_drpa_a_trip.cu b/src/cuda/src/ph_drpa_a_trip.cu new file mode 100644 index 0000000..a58b5a2 --- /dev/null +++ b/src/cuda/src/ph_drpa_a_trip.cu @@ -0,0 +1,89 @@ +#include + +__global__ void ph_dRPA_A_trip_kernel(int nO, int nBas, double *eps, double *A) { + + + int i, j, a, b; + int aa, bb; + int nV, nS, nVS; + int nBas2, nBas3; + int i_A0, i_A1, i_A2; + int i_I0, i_I1, i_I2; + + nV = nBas - nO; + nS = nO * nV; + nVS = nV * nS; + + nBas2 = nBas * nBas; + nBas3 = nBas2 * nBas; + + aa = blockIdx.x * blockDim.x + threadIdx.x; + bb = blockIdx.y * blockDim.y + threadIdx.y; + + while(aa < nV) { + a = aa + nO; + + i_A0 = aa * nS; + i_I0 = a * nBas2; + + while(bb < nV) { + b = bb + nO; + + i_A1 = i_A0 + bb; + i_I1 = i_I0 + b * nBas; + + i = 0; + while(i < nO) { + + i_A2 = i_A1 + i * nVS; + i_I2 = i_I1 + i; + + j = 0; + while(j < nO) { + + A[i_A2 + j * nV] = 0.0; + if((a==b) && (i==j)) { + A[i_A2 + j * nV] += eps[a] - eps[i]; + } + + j ++; + } // j + + i ++; + } // i + + bb += blockDim.y * gridDim.y; + } // bb + + aa += blockDim.x * gridDim.x; + } // aa + +} + + + + + +extern "C" void ph_dRPA_A_trip(int nO, int nBas, double *eps, double *A) { + + + int size = nBas - nO; + + int sBlocks = 32; + int nBlocks = (size + sBlocks - 1) / sBlocks; + + dim3 dimGrid(nBlocks, nBlocks, 1); + dim3 dimBlock(sBlocks, sBlocks, 1); + + + printf("lunching ph_dRPA_A_trip_kernel with %dx%d blocks and %dx%d threads/block\n", + nBlocks, nBlocks, sBlocks, sBlocks); + + + ph_dRPA_A_trip_kernel<<>>(nO, nBas, eps, A); + +} + + + +