From fd4dc5b77ea63f0f3f8309030aa1a64aeb49a256 Mon Sep 17 00:00:00 2001 From: AbdAmmar Date: Fri, 29 Nov 2024 19:10:24 +0100 Subject: [PATCH] fixed memory leak in phRPA --- src/LR/phLR.f90 | 8 +-- src/cuda/src/a_d_at.cu | 64 +++++++++++++++++++++++ src/cuda/src/a_dinv_at.cu | 64 +++++++++++++++++++++++ src/cuda/src/elementwise_dsqrt_inplace.cu | 52 ++++++++++++++++++ src/cuda/src/ph_drpa_sing.c | 63 +++++++++++++++++++--- 5 files changed, 242 insertions(+), 9 deletions(-) create mode 100644 src/cuda/src/a_d_at.cu create mode 100644 src/cuda/src/a_dinv_at.cu create mode 100644 src/cuda/src/elementwise_dsqrt_inplace.cu diff --git a/src/LR/phLR.f90 b/src/LR/phLR.f90 index 6457e27..6040ad7 100644 --- a/src/LR/phLR.f90 +++ b/src/LR/phLR.f90 @@ -30,14 +30,12 @@ subroutine phLR(TDA,nS,Aph,Bph,EcRPA,Om,XpY,XmY) double precision,intent(out) :: XpY(nS,nS) double precision,intent(out) :: XmY(nS,nS) -! Memory allocation - allocate(ApB(nS,nS),AmB(nS,nS),AmBSq(nS,nS),AmBIv(nS,nS),Z(nS,nS),tmp(nS,nS)) ! Tamm-Dancoff approximation if(TDA) then - + XpY(:,:) = Aph(:,:) call diagonalize_matrix(nS,XpY,Om) XpY(:,:) = transpose(XpY(:,:)) @@ -45,6 +43,8 @@ subroutine phLR(TDA,nS,Aph,Bph,EcRPA,Om,XpY,XmY) else + allocate(ApB(nS,nS),AmB(nS,nS),AmBSq(nS,nS),AmBIv(nS,nS),Z(nS,nS),tmp(nS,nS)) + ApB(:,:) = Aph(:,:) + Bph(:,:) AmB(:,:) = Aph(:,:) - Bph(:,:) @@ -81,6 +81,8 @@ subroutine phLR(TDA,nS,Aph,Bph,EcRPA,Om,XpY,XmY) ! XmY = matmul(transpose(Z),AmBIv) ! call DA(nS,1d0*sqrt(Om),XmY) + + deallocate(ApB,AmB,AmBSq,AmBIv,Z,tmp) end if diff --git a/src/cuda/src/a_d_at.cu b/src/cuda/src/a_d_at.cu new file mode 100644 index 0000000..0f190eb --- /dev/null +++ b/src/cuda/src/a_d_at.cu @@ -0,0 +1,64 @@ +#include + + +__global__ void A_D_At_kernel(int n, double *A, double *D, double *R) { + + + int i, j; + int k; + int in, ij; + int kn; + + i = blockIdx.x * blockDim.x + threadIdx.x; + j = blockIdx.y * blockDim.y + threadIdx.y; + + while(i < n) { + + in = i * n; + + while(j < n) { + + ij = in + j; + + R[ij] = 0.0; + k = 0; + while(k < n) { + + kn = k * n; + R[ij] += D[k] * U[i + kn] * U[j + kn]; + + k ++; + } // k + + j += blockDim.y * gridDim.y; + } // j + + i += blockDim.x * gridDim.x; + } // i + +} + + + + + +extern "C" void A_D_At(int n, double *A, double *D, double *R) { + + + int sBlocks = 32; + int nBlocks = (n + sBlocks - 1) / sBlocks; + + dim3 dimGrid(nBlocks, nBlocks, 1); + dim3 dimBlock(sBlocks, sBlocks, 1); + + printf("lunching A_D_At_kernel with %dx%d blocks and %dx%d threads/block\n", + nBlocks, nBlocks, sBlocks, sBlocks); + + + A_D_At_kernel<<>>(n, A, D, R); + +} + + + + diff --git a/src/cuda/src/a_dinv_at.cu b/src/cuda/src/a_dinv_at.cu new file mode 100644 index 0000000..450b866 --- /dev/null +++ b/src/cuda/src/a_dinv_at.cu @@ -0,0 +1,64 @@ +#include + + +__global__ void A_Dinv_At_kernel(int n, double *A, double *D, double *R) { + + + int i, j; + int k; + int in, ij; + int kn; + + i = blockIdx.x * blockDim.x + threadIdx.x; + j = blockIdx.y * blockDim.y + threadIdx.y; + + while(i < n) { + + in = i * n; + + while(j < n) { + + ij = in + j; + + R[ij] = 0.0; + k = 0; + while(k < n) { + + kn = k * n; + R[ij] += D[k] * U[i + kn] * U[j + kn] / (D[k] + 1e-12); + + k ++; + } // k + + j += blockDim.y * gridDim.y; + } // j + + i += blockDim.x * gridDim.x; + } // i + +} + + + + + +extern "C" void A_Dinv_At(int n, double *A, double *D, double *R) { + + + int sBlocks = 32; + int nBlocks = (n + sBlocks - 1) / sBlocks; + + dim3 dimGrid(nBlocks, nBlocks, 1); + dim3 dimBlock(sBlocks, sBlocks, 1); + + printf("lunching A_Dinv_At_kernel with %dx%d blocks and %dx%d threads/block\n", + nBlocks, nBlocks, sBlocks, sBlocks); + + + A_Dinv_At_kernel<<>>(n, A, D, R); + +} + + + + diff --git a/src/cuda/src/elementwise_dsqrt_inplace.cu b/src/cuda/src/elementwise_dsqrt_inplace.cu new file mode 100644 index 0000000..f053dd9 --- /dev/null +++ b/src/cuda/src/elementwise_dsqrt_inplace.cu @@ -0,0 +1,52 @@ +#include +#include + + +__global__ void elementwise_dsqrt_inplace_kernel(int nS, double *A, int *nb_neg_sqrt) { + + + int i; + + i = blockIdx.x * blockDim.x + threadIdx.x; + nb_neg_sqrt = 0; + + while(i < nS) { + + if(A[i] > 0.0) { + + A[i] = sqrt(A[i]); + + } else { + + A[i] = sqrt(-A[i]); + + } + + i += blockDim.x * gridDim.x; + } // i + +} + + + + + +extern "C" void elementwise_dsqrt_inplace(int nS, double *A, int *nb_neg_sqrt) { + + int sBlocks = 32; + int nBlocks = (nS + sBlocks - 1) / sBlocks; + + dim3 dimGrid(nBlocks, 1, 1); + dim3 dimBlock(sBlocks, 1, 1); + + printf("lunching elementwise_dsqrt_inplace_kernel with %d blocks and %d threads/block\n", + nBlocks, sBlocks); + + + elementwise_dsqrt_inplace_kernel<<>>(nS, A, nb_neg_sqrt); + +} + + + + diff --git a/src/cuda/src/ph_drpa_sing.c b/src/cuda/src/ph_drpa_sing.c index f3f8244..95f8693 100644 --- a/src/cuda/src/ph_drpa_sing.c +++ b/src/cuda/src/ph_drpa_sing.c @@ -49,7 +49,7 @@ void ph_drpa_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, double *d_ApB = NULL; double *d_AmB = NULL; check_Cuda_Errors(cudaMalloc((void**)&d_ApB, nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); - check_Cuda_Errors(cudaMalloc((void**)&d_A-B, nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); + check_Cuda_Errors(cudaMalloc((void**)&d_AmB, nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); cudaEventRecord(start, 0); ph_dRPA_ApB_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_ApB); @@ -58,7 +58,7 @@ void ph_drpa_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&elapsedTime, start, stop); - printf("Time elapsed on A & B kernels = %f msec\n", elapsedTime); + printf("Time elapsed on AmB & ApB = %f msec\n", elapsedTime); // free memory @@ -66,8 +66,7 @@ void ph_drpa_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, check_Cuda_Errors(cudaFree(d_ERI), "cudaFree", __FILE__, __LINE__); - // TODO - // diagonalize A+B and A-B + // diagonalize A-B int *d_info = NULL; double *d_Omega = NULL; check_Cuda_Errors(cudaMalloc((void**)&d_info, sizeof(int)), @@ -76,12 +75,64 @@ void ph_drpa_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, "cudaMalloc", __FILE__, __LINE__); cudaEventRecord(start, 0); - diag_dn_dsyevd(nS, d_info, d_Omega, d_A); + diag_dn_dsyevd(nS, d_info, d_Omega, d_AmB); check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&elapsedTime, start, stop); - printf("Time elapsed on diagonalization = %f msec\n", elapsedTime); + printf("Time elapsed on diag AmB = %f msec\n", elapsedTime); + + + // d_Omega <-- d_Omega^{0.5} + elementwise_dsqrt_inplace(nS, d_Omega); + // TODO + //int *d_nb_neg_sqrt = NULL; + //check_Cuda_Errors(cudaMalloc((void**)&d_nb_neg_sqrt, sizeof(int)), + // "cudaMalloc", __FILE__, __LINE__); + //int nb_neg_sqrt = 0; + //check_Cuda_Errors(cudaMemcpy(&nb_neg_sqrt, d_nb_neg_sqrt, sizeof(int), cudaMemcpyDeviceToHost), + // "cudaMemcpy", __FILE__, __LINE__); + //if (nb_neg_sqrt > 0) { + // printf("You may have instabilities in linear response: A-B is not positive definite!!\n"); + // printf("nb of <= 0 elements = %d\n", nb_neg_sqrt); + //} + + + // TODO + // d_AmB (d_Omega)^{+0.5} (d_AmB)^T + // d_AmB (d_Omega)^{-0.5} (d_AmB)^T + double *d_AmBSq = NULL; + check_Cuda_Errors(cudaMalloc((void**)&d_AmBSq, nS * sizeof(double)), + "cudaMalloc", __FILE__, __LINE__); + double *d_AmBSqInv = NULL; + check_Cuda_Errors(cudaMalloc((void**)&d_AmBSqInv, nS * sizeof(double)), + "cudaMalloc", __FILE__, __LINE__); + + cudaEventRecord(start, 0); + A_D_At(nS, d_AmB, d_Omega, d_AmBSq); + A_Dinv_At(nS, d_AmB, d_Omega, d_AmBSqInv); + check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsedTime, start, stop); + printf("Time elapsed on d_AmBSq & d_AmBSqInv = %f msec\n", elapsedTime); + + + // TODO + //call dgemm('N','N',nS,nS,nS,1d0,ApB,size(ApB,1),AmBSq,size(AmBSq,1),0d0,tmp,size(tmp,1)) + //call dgemm('N','N',nS,nS,nS,1d0,AmBSq,size(AmBSq,1),tmp,size(tmp,1),0d0,Z,size(Z,1)) + //call diagonalize_matrix(nS,Z,Om) + //if(minval(Om) < 0d0) & + // call print_warning('You may have instabilities in linear response: negative excitations!!') + //Om = sqrt(Om) + //call dgemm('T','N',nS,nS,nS,1d0,Z,size(Z,1),AmBSq,size(AmBSq,1),0d0,XpY,size(XpY,1)) + //call DA(nS,1d0/dsqrt(Om),XpY) + //call dgemm('T','N',nS,nS,nS,1d0,Z,size(Z,1),AmBIv,size(AmBIv,1),0d0,XmY,size(XmY,1)) + //call DA(nS,1d0*dsqrt(Om),XmY) + + + + // transfer data to CPU