diff --git a/src/cuda/Makefile b/src/cuda/Makefile index 61ac1d2..32c2101 100644 --- a/src/cuda/Makefile +++ b/src/cuda/Makefile @@ -39,6 +39,6 @@ $(F_OBJ): $(F_SRC) .PHONY: clean clean: - rm -f $(BLD_DIR)/* + rm $(BLD_DIR)/* diff --git a/src/cuda/include/ph_drpa.h b/src/cuda/include/ph_drpa.h index 1f68857..1786a2f 100644 --- a/src/cuda/include/ph_drpa.h +++ b/src/cuda/include/ph_drpa.h @@ -5,6 +5,6 @@ extern void check_Cuda_Errors(cudaError_t err, const char * msg, const char * file, int line); extern void check_Cublas_Errors(cublasStatus_t status, const char * msg, const char * file, int line); -extern void ph_dRPA_A_sing(int nO, int nBas, double *eps, double *ERI, double *A); +extern void ph_dRPA_A_sing(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *A); #endif diff --git a/src/cuda/src/ph_drpa.c b/src/cuda/src/ph_drpa.c index 304f1c1..c90bfab 100644 --- a/src/cuda/src/ph_drpa.c +++ b/src/cuda/src/ph_drpa.c @@ -13,6 +13,8 @@ int ph_drpa(int nO, int nBas, int nS, double *h_eps, double *h_ERI, double *d_eps; double *d_ERI; + int nV = nBas - nO; + int nBas2 = nBas * nBas; int nBas4 = nBas2 * nBas2; @@ -40,7 +42,7 @@ int ph_drpa(int nO, int nBas, int nS, double *h_eps, double *h_ERI, // construct A matrix double *d_A; check_Cuda_Errors(cudaMalloc((void**)&d_A, nS * nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); - ph_dRPA_A_sing(nO, nBas, d_eps, d_ERI, d_A); + ph_dRPA_A_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_A); check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); diff --git a/src/cuda/src/ph_drpa_a_sing.cu b/src/cuda/src/ph_drpa_a_sing.cu index 5308c1d..917cdfb 100644 --- a/src/cuda/src/ph_drpa_a_sing.cu +++ b/src/cuda/src/ph_drpa_a_sing.cu @@ -1,17 +1,15 @@ #include -__global__ void ph_dRPA_A_sing_kernel(int nO, int nBas, double *eps, double *ERI, double *A) { +__global__ void ph_dRPA_A_sing_kernel(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *A) { int i, j, a, b; int aa, bb; - int nV, nS, nVS; + int nVS; int nBas2, nBas3; int i_A0, i_A1, i_A2; int i_I0, i_I1, i_I2; - nV = nBas - nO; - nS = nO * nV; nVS = nV * nS; nBas2 = nBas * nBas; @@ -64,13 +62,11 @@ __global__ void ph_dRPA_A_sing_kernel(int nO, int nBas, double *eps, double *ERI -extern "C" void ph_dRPA_A_sing(int nO, int nBas, double *eps, double *ERI, double *A) { +extern "C" void ph_dRPA_A_sing(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *A) { - int size = nBas - nO; - int sBlocks = 32; - int nBlocks = (size + sBlocks - 1) / sBlocks; + int nBlocks = (nV + sBlocks - 1) / sBlocks; dim3 dimGrid(nBlocks, nBlocks, 1); dim3 dimBlock(sBlocks, sBlocks, 1); @@ -80,7 +76,7 @@ extern "C" void ph_dRPA_A_sing(int nO, int nBas, double *eps, double *ERI, doubl nBlocks, nBlocks, sBlocks, sBlocks); - ph_dRPA_A_sing_kernel<<>>(nO, nBas, eps, ERI, A); + ph_dRPA_A_sing_kernel<<>>(nO, nV, nBas, nS, eps, ERI, A); } diff --git a/src/cuda/src/ph_drpa_a_trip.cu b/src/cuda/src/ph_drpa_a_trip.cu index a58b5a2..b28e7b8 100644 --- a/src/cuda/src/ph_drpa_a_trip.cu +++ b/src/cuda/src/ph_drpa_a_trip.cu @@ -1,22 +1,15 @@ #include -__global__ void ph_dRPA_A_trip_kernel(int nO, int nBas, double *eps, double *A) { +__global__ void ph_dRPA_A_trip_kernel(int nO, int nV, int nBas, int nS, double *eps, double *A) { int i, j, a, b; int aa, bb; - int nV, nS, nVS; - int nBas2, nBas3; + int nVS; int i_A0, i_A1, i_A2; - int i_I0, i_I1, i_I2; - nV = nBas - nO; - nS = nO * nV; nVS = nV * nS; - nBas2 = nBas * nBas; - nBas3 = nBas2 * nBas; - aa = blockIdx.x * blockDim.x + threadIdx.x; bb = blockIdx.y * blockDim.y + threadIdx.y; @@ -24,19 +17,16 @@ __global__ void ph_dRPA_A_trip_kernel(int nO, int nBas, double *eps, double *A) a = aa + nO; i_A0 = aa * nS; - i_I0 = a * nBas2; while(bb < nV) { b = bb + nO; i_A1 = i_A0 + bb; - i_I1 = i_I0 + b * nBas; i = 0; while(i < nO) { i_A2 = i_A1 + i * nVS; - i_I2 = i_I1 + i; j = 0; while(j < nO) { @@ -64,13 +54,11 @@ __global__ void ph_dRPA_A_trip_kernel(int nO, int nBas, double *eps, double *A) -extern "C" void ph_dRPA_A_trip(int nO, int nBas, double *eps, double *A) { +extern "C" void ph_dRPA_A_trip(int nO, int nV, int nBas, int nS, double *eps, double *A) { - int size = nBas - nO; - int sBlocks = 32; - int nBlocks = (size + sBlocks - 1) / sBlocks; + int nBlocks = (nV + sBlocks - 1) / sBlocks; dim3 dimGrid(nBlocks, nBlocks, 1); dim3 dimBlock(sBlocks, sBlocks, 1); @@ -80,7 +68,7 @@ extern "C" void ph_dRPA_A_trip(int nO, int nBas, double *eps, double *A) { nBlocks, nBlocks, sBlocks, sBlocks); - ph_dRPA_A_trip_kernel<<>>(nO, nBas, eps, A); + ph_dRPA_A_trip_kernel<<>>(nO, nV, nBas, nS, eps, A); } diff --git a/src/cuda/src/ph_drpa_b_trip.cu b/src/cuda/src/ph_drpa_b_trip.cu new file mode 100644 index 0000000..6122164 --- /dev/null +++ b/src/cuda/src/ph_drpa_b_trip.cu @@ -0,0 +1,72 @@ +#include + +__global__ void ph_dRPA_B_trip_kernel(int nO, int nV, int nBas, int nS, double *B) { + + + int i, j; + int aa, bb; + int nVS; + int i_B0, i_B1, i_B2; + + nVS = nV * nS; + + aa = blockIdx.x * blockDim.x + threadIdx.x; + bb = blockIdx.y * blockDim.y + threadIdx.y; + + while(aa < nV) { + + i_B0 = aa * nS; + + while(bb < nV) { + + i_B1 = i_B0 + bb; + + i = 0; + while(i < nO) { + + i_B2 = i_B1 + i * nVS; + + j = 0; + while(j < nO) { + + B[i_B2 + j * nV] = 0.0; + + j ++; + } // j + + i ++; + } // i + + bb += blockDim.y * gridDim.y; + } // bb + + aa += blockDim.x * gridDim.x; + } // aa + +} + + + + + +extern "C" void ph_dRPA_B_trip(int nO, int nV, int nBas, int nS, double *B) { + + + int sBlocks = 32; + int nBlocks = (nV + sBlocks - 1) / sBlocks; + + dim3 dimGrid(nBlocks, nBlocks, 1); + dim3 dimBlock(sBlocks, sBlocks, 1); + + + printf("lunching ph_dRPA_B_trip_kernel with %dx%d blocks and %dx%d threads/block\n", + nBlocks, nBlocks, sBlocks, sBlocks); + + + ph_dRPA_B_trip_kernel<<>>(nO, nV, nBas, nS, B); + +} + + + +