working on dRPA (with no TDA) on GPU: saving

2025-04-29 19:54:51 +02:00 · 2024-11-29 13:58:52 +01:00 · 2024-11-29 13:58:52 +01:00 · 3c8f8291bd
commit 3c8f8291bd
parent 1235823334
7 changed files with 391 additions and 107 deletions
--- a/src/GPU/cu_quack_module.f90
+++ b/src/GPU/cu_quack_module.f90
@ -4,49 +4,40 @@ module cu_quack_module

  implicit none

-!#ifdef USE_GPU
-!  interface
-!    subroutine ph_drpa_tda_sing(nO, nBas, nS, eps, ERI, &
-!        Omega, X) bind(C, name = "ph_drpa_tda_sing")
-!
-!      import c_int, c_double
-!      integer(c_int), intent(in), value :: nO, nBas, nS
-!      real(c_double), intent(in)        :: eps(nBas)
-!      real(c_double), intent(in)        :: ERI(nBas,nBas,nBas,nBas)
-!      real(c_double), intent(out)       :: Omega(nS)
-!      real(c_double), intent(out)       :: X(nS,nS)
-!
-!    end subroutine ph_drpa_tda_sing
-!  end interface
-!#else
-!  interface
-!    subroutine ph_drpa_tda_sing(nO, nBas, nS, eps, ERI, Omega, X)
-!      integer, intent(in)           :: nO, nBas, nS
-!      double precision, intent(in)  :: eps(nBas)
-!      double precision, intent(in)  :: ERI(nBas,nBas,nBas,nBas)
-!      double precision, intent(out) :: Omega(nS)
-!      double precision, intent(out) :: X(nS,nS)
-!    end subroutine ph_drpa_tda_sing
-!  end interface
-!#endif
-
  interface

+    ! ---
+
    subroutine ph_drpa_tda_sing(nO, nBas, nS, eps, ERI, &
-        Omega, X) bind(C, name = "ph_drpa_tda_sing")
+        Omega, XpY) bind(C, name = "ph_drpa_tda_sing")

      import c_int, c_double
      integer(c_int), intent(in), value :: nO, nBas, nS
      real(c_double), intent(in)        :: eps(nBas)
      real(c_double), intent(in)        :: ERI(nBas,nBas,nBas,nBas)
      real(c_double), intent(out)       :: Omega(nS)
-      real(c_double), intent(out)       :: X(nS,nS)
+      real(c_double), intent(out)       :: XpY(nS,nS)

    end subroutine ph_drpa_tda_sing

-  end interface
+    ! ---

-  ! ---
+    subroutine ph_drpa_sing(nO, nBas, nS, eps, ERI, &
+        Omega, XpY, XmY) bind(C, name = "ph_drpa_sing")
+
+      import c_int, c_double
+      integer(c_int), intent(in), value :: nO, nBas, nS
+      real(c_double), intent(in)        :: eps(nBas)
+      real(c_double), intent(in)        :: ERI(nBas,nBas,nBas,nBas)
+      real(c_double), intent(out)       :: Omega(nS)
+      real(c_double), intent(out)       :: XpY(nS,nS)
+      real(c_double), intent(out)       :: XmY(nS,nS)
+
+    end subroutine ph_drpa_sing
+
+    ! ---
+
+  end interface

 end module cu_quack_module

--- a/src/RPA/phRRPA.f90
+++ b/src/RPA/phRRPA.f90
@ -83,7 +83,6 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,
    call phLR(TDA,nS,Aph,Bph,EcRPA(ispin),Om,XpY,XmY)
    !call wall_time(t2)
    !print *, "wall time diag A on CPU (sec) = ", t2 - t1
-    !stop
    call print_excitation_energies('phRPA@RHF','singlet',nS,Om)
    call phLR_transition_vectors(.true.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY)

--- a/src/RPA/phRRPA_GPU.f90
+++ b/src/RPA/phRRPA_GPU.f90
@ -2,62 +2,51 @@

 subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF)

- use cu_quack_module
+  use cu_quack_module

-! Perform a direct random phase approximation calculation

  implicit none
  include 'parameters.h'
  include 'quadrature.h'

-! Input variables

-  logical,intent(in)            :: dotest
+  logical,intent(in)           :: dotest
+  logical,intent(in)           :: TDA
+  logical,intent(in)           :: doACFDT
+  logical,intent(in)           :: exchange_kernel
+  logical,intent(in)           :: singlet
+  logical,intent(in)           :: triplet
+  integer,intent(in)           :: nBas
+  integer,intent(in)           :: nC
+  integer,intent(in)           :: nO
+  integer,intent(in)           :: nV
+  integer,intent(in)           :: nR
+  integer,intent(in)           :: nS
+  double precision,intent(in)  :: ENuc
+  double precision,intent(in)  :: ERHF
+  double precision,intent(in)  :: eHF(nBas)
+  double precision,intent(in)  :: ERI(nBas,nBas,nBas,nBas)
+  double precision,intent(in)  :: dipole_int(nBas,nBas,ncart)

-  logical,intent(in)            :: TDA
-  logical,intent(in)            :: doACFDT
-  logical,intent(in)            :: exchange_kernel
-  logical,intent(in)            :: singlet
-  logical,intent(in)            :: triplet
-  integer,intent(in)            :: nBas
-  integer,intent(in)            :: nC
-  integer,intent(in)            :: nO
-  integer,intent(in)            :: nV
-  integer,intent(in)            :: nR
-  integer,intent(in)            :: nS
-  double precision,intent(in)   :: ENuc
-  double precision,intent(in)   :: ERHF
-  double precision,intent(in)   :: eHF(nBas)
-  double precision,intent(in)   :: ERI(nBas,nBas,nBas,nBas)
-  double precision,intent(in)   :: dipole_int(nBas,nBas,ncart)

-! Local variables
+  integer                      :: i
+  integer                      :: ispin
+  logical                      :: dRPA
+  double precision             :: t1, t2
+  integer,         allocatable :: iorder(:)
+  double precision,allocatable :: Om(:)
+  double precision,allocatable :: XpY(:,:)
+  double precision,allocatable :: XmY(:,:)

-  integer                       :: i
-  integer                       :: ispin
-  logical                       :: dRPA
-  double precision              :: t1, t2
-  double precision              :: lambda
-  double precision,allocatable  :: Aph(:,:)
-  double precision,allocatable  :: Bph(:,:)
-  double precision,allocatable  :: Om(:)
-  double precision,allocatable  :: XpY(:,:)
-  double precision,allocatable  :: XmY(:,:)
-  ! DEBUG
-  !double precision, allocatable :: XpY_gpu(:,:), XmY_gpu(:,:), Om_gpu(:)
+  double precision             :: EcRPA(nspin)

-  double precision              :: EcRPA(nspin)
-
-! Hello world

  write(*,*)
-  write(*,*)'*********************************'
-  write(*,*)'* Restricted ph-RPA Calculation *'
-  write(*,*)'*********************************'
+  write(*,*)'******************************************'
+  write(*,*)'* Restricted ph-RPA Calculation (on GPU) *'
+  write(*,*)'******************************************'
  write(*,*)

-! TDA 
-
  if(TDA) then
    write(*,*) 'Tamm-Dancoff approximation activated!'
    write(*,*)
@ -67,61 +56,71 @@ subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC

  dRPA = .true.
  EcRPA(:) = 0d0
-  lambda = 1d0

-! Memory allocation

-  allocate(Om(nS),XpY(nS,nS),XmY(nS,nS))
-  !allocate(Aph(nS,nS),Bph(nS,nS))
-
-! Singlet manifold
+  allocate(Om(nS), XpY(nS,nS), XmY(nS,nS))

  if(singlet) then 

    if(TDA) then

-      print*, 'start diag on GPU:'
-      call wall_time(t1)
+      !print*, 'start diag on GPU:'
+      !call wall_time(t1)
      call ph_drpa_tda_sing(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om(1), XpY(1,1))
-      call wall_time(t2)
-      print*, 'diag time on GPU (sec):', t2 - t1
-      stop
+      !call wall_time(t2)
+      !print*, 'diag time on GPU (sec):', t2 - t1
      XmY(:,:) = XpY(:,:)

    else

-      ! TODO
-      !call ph_drpa_sing(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om(1), XpY(1,1))
-      !XmY(:,:) = XpY(:,:)
+      !print*, 'start diag on GPU:'
+      !call wall_time(t1)
+      call ph_drpa_sing(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om(1), XpY(1,1), XmY(1,1))
+      !call wall_time(t2)
+      !print*, 'diag time on GPU (sec):', t2 - t1

    endif

+    ! TODO
+    XpY(:,:) = transpose(XpY(:,:))
+    XmY(:,:) = transpose(XmY(:,:))
+
    call print_excitation_energies('phRPA@RHF','singlet',nS,Om)
    call phLR_transition_vectors(.true.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY)
-
-  end if
-
-! Triplet manifold 
+  endif

  if(triplet) then 

-    ispin = 2
+    XpY(:,:) = 0.d0
+    allocate(iorder(nS))
+    ia = 0
+    do i = nC+1, nO
+      do a = nO+1, nBas-nR
+        ia = ia + 1
+        iorder(ia) = ia
+        Om(ia) = e(a) - e(i)
+        XpY(ia,ia) = 1.d0
+      enddo
+    enddo

-    call phLR_A(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,eHF,ERI,Aph)
-    if(.not.TDA) call phLR_B(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,ERI,Bph)
+    call quick_sort(Om(1), iorder(1), nS)
+    deallocate(iorder)
+
+    XmY(:,:) = XpY(:,:)

-    call phLR(TDA,nS,Aph,Bph,EcRPA(ispin),Om,XpY,XmY)
    call print_excitation_energies('phRPA@RHF','triplet',nS,Om)
    call phLR_transition_vectors(.false.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY)
+  endif

-  end if
+  deallocate(Om, XpY, XmY)

+
+  ! TODO
+  ! init EcRPA
  if(exchange_kernel) then
-
    EcRPA(1) = 0.5d0*EcRPA(1)
    EcRPA(2) = 1.5d0*EcRPA(2)
-
-  end if
+  endif

  write(*,*)
  write(*,*)'-------------------------------------------------------------------------------'
@ -132,12 +131,11 @@ subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC
  write(*,*)'-------------------------------------------------------------------------------'
  write(*,*)

-  deallocate(Om,XpY,XmY,Aph,Bph)
-
 ! Compute the correlation energy via the adiabatic connection 

  if(doACFDT) then

+    ! TODO
    call phACFDT(exchange_kernel,dRPA,TDA,singlet,triplet,nBas,nC,nO,nV,nR,nS,ERI,eHF,EcRPA)

    write(*,*)
@ -149,13 +147,11 @@ subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC
    write(*,*)'-------------------------------------------------------------------------------'
    write(*,*)

-  end if
+  endif

  if(dotest) then
-
-    call dump_test_value('R','phRPA correlation energy',sum(EcRPA))
-
-  end if
+    call dump_test_value('R','phRPA correlation energy (on GPU)',sum(EcRPA))
+  endif

 end subroutine 

--- a/src/cuda/src/ph_drpa_amb_sing.cu
+++ b/src/cuda/src/ph_drpa_amb_sing.cu
@ -0,0 +1,88 @@
+#include <stdio.h>
+
+__global__ void ph_dRPA_AmB_sing_kernel(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *AmB) {
+
+
+    int i, j, a, b;
+    int aa, bb;
+    int nVS;
+    int nBas2, nBas3;
+    int i_A0, i_A1, i_A2;
+    int i_I0, i_I1, i_I2;
+
+    bool a_eq_b;
+
+    nVS = nV * nS;
+
+    nBas2 = nBas * nBas;
+    nBas3 = nBas2 * nBas;
+
+    aa = blockIdx.x * blockDim.x + threadIdx.x;
+    bb = blockIdx.y * blockDim.y + threadIdx.y;
+
+    while(aa < nV) {
+        a = aa + nO;
+
+        i_A0 = aa * nS;
+        i_I0 = a * nBas2;
+
+        while(bb < nV) {
+            b = bb + nO;
+
+            a_eq_b = a == b;
+
+            i_A1 = i_A0 + bb;
+            i_I1 = i_I0 + b * nBas;
+
+            i = 0;
+            while(i < nO) {
+
+                i_A2 = i_A1 + i * nVS;
+                i_I2 = i_I1 + i;
+ 
+                j = 0;
+                while(j < nO) {
+
+                    AmB[i_A2 + j * nV] = 2.0 * (ERI[i_I2 + j * nBas3] - ERI[i_I2 + j * nBas]);
+                    if(a_eq_b && (i==j)) {
+                        AmB[i_A2 + j * nV] += eps[a] - eps[i];
+                    }
+
+                    j ++;
+	        } // j
+
+                i ++;
+            } // i
+
+            bb += blockDim.y * gridDim.y;
+        } // bb
+
+        aa += blockDim.x * gridDim.x;
+    } // aa
+
+}
+
+
+
+
+
+extern "C" void ph_dRPA_AmB_sing(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *AmB) {
+
+
+    int sBlocks = 32;
+    int nBlocks = (nV + sBlocks - 1) / sBlocks;
+
+    dim3 dimGrid(nBlocks, nBlocks, 1);
+    dim3 dimBlock(sBlocks, sBlocks, 1);
+
+    printf("lunching ph_dRPA_AmB_sing_kernel with %dx%d blocks and %dx%d threads/block\n",
+        nBlocks, nBlocks, sBlocks, sBlocks);
+
+
+    ph_dRPA_AmB_sing_kernel<<<dimGrid, dimBlock>>>(nO, nV, nBas, nS, eps, ERI, AmB);
+
+}
+
+
+
+
--- a/src/cuda/src/ph_drpa_apb_sing.cu
+++ b/src/cuda/src/ph_drpa_apb_sing.cu
@ -0,0 +1,88 @@
+#include <stdio.h>
+
+__global__ void ph_dRPA_ApB_sing_kernel(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *ApB) {
+
+
+    int i, j, a, b;
+    int aa, bb;
+    int nVS;
+    int nBas2, nBas3;
+    int i_A0, i_A1, i_A2;
+    int i_I0, i_I1, i_I2;
+
+    bool a_eq_b;
+
+    nVS = nV * nS;
+
+    nBas2 = nBas * nBas;
+    nBas3 = nBas2 * nBas;
+
+    aa = blockIdx.x * blockDim.x + threadIdx.x;
+    bb = blockIdx.y * blockDim.y + threadIdx.y;
+
+    while(aa < nV) {
+        a = aa + nO;
+
+        i_A0 = aa * nS;
+        i_I0 = a * nBas2;
+
+        while(bb < nV) {
+            b = bb + nO;
+
+            a_eq_b = a == b;
+
+            i_A1 = i_A0 + bb;
+            i_I1 = i_I0 + b * nBas;
+
+            i = 0;
+            while(i < nO) {
+
+                i_A2 = i_A1 + i * nVS;
+                i_I2 = i_I1 + i;
+ 
+                j = 0;
+                while(j < nO) {
+
+                    ApB[i_A2 + j * nV] = 2.0 * (ERI[i_I2 + j * nBas3] + ERI[i_I2 + j * nBas]);
+                    if(a_eq_b && (i==j)) {
+                        ApB[i_A2 + j * nV] += eps[a] - eps[i];
+                    }
+
+                    j ++;
+	        } // j
+
+                i ++;
+            } // i
+
+            bb += blockDim.y * gridDim.y;
+        } // bb
+
+        aa += blockDim.x * gridDim.x;
+    } // aa
+
+}
+
+
+
+
+
+extern "C" void ph_dRPA_ApB_sing(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *ApB) {
+
+
+    int sBlocks = 32;
+    int nBlocks = (nV + sBlocks - 1) / sBlocks;
+
+    dim3 dimGrid(nBlocks, nBlocks, 1);
+    dim3 dimBlock(sBlocks, sBlocks, 1);
+
+    printf("lunching ph_dRPA_ApB_sing_kernel with %dx%d blocks and %dx%d threads/block\n",
+        nBlocks, nBlocks, sBlocks, sBlocks);
+
+
+    ph_dRPA_ApB_sing_kernel<<<dimGrid, dimBlock>>>(nO, nV, nBas, nS, eps, ERI, ApB);
+
+}
+
+
+
+
--- a/src/cuda/src/ph_drpa_sing.c
+++ b/src/cuda/src/ph_drpa_sing.c
@ -0,0 +1,114 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cuda_runtime_api.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <cublas_v2.h>
+#include <cusolverDn.h>
+
+#include "utils.h"
+#include "ph_rpa.h"
+
+void ph_drpa_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI,
+                      double *h_Omega, double *h_XpY, double *h_XmY) {
+
+    double *d_eps = NULL;
+    double *d_ERI = NULL;
+
+    int nV = nBas - nO;
+
+    long long nBas_long = (long long) nBas;
+    long long nBas4 = nBas_long * nBas_long * nBas_long * nBas_long;
+
+    long long nS_long = (long long) nS;
+    long long nS2 = nS_long * nS_long;
+
+    float elapsedTime;
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
+
+    check_Cuda_Errors(cudaMalloc((void**)&d_eps, nBas * sizeof(double)),
+        "cudaMalloc", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaMalloc((void**)&d_ERI, nBas4 * sizeof(double)),
+        "cudaMalloc", __FILE__, __LINE__);
+
+    printf("CPU->GPU transfer..\n");
+    cudaEventRecord(start, 0);
+    check_Cuda_Errors(cudaMemcpy(d_eps, h_eps, nBas * sizeof(double), cudaMemcpyHostToDevice), 
+        "cudaMemcpy", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaMemcpy(d_ERI, h_ERI, nBas4 * sizeof(double), cudaMemcpyHostToDevice), 
+        "cudaMemcpy", __FILE__, __LINE__);
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&elapsedTime, start, stop);
+    printf("Time elapsed on CPU->GPU transfer = %f msec\n", elapsedTime);
+
+    // construct A+B & A-B
+    double *d_ApB = NULL;
+    double *d_AmB = NULL;
+    check_Cuda_Errors(cudaMalloc((void**)&d_ApB, nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaMalloc((void**)&d_A-B, nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__);
+
+    cudaEventRecord(start, 0);
+    ph_dRPA_ApB_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_ApB);
+    ph_dRPA_AmB_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_AmB);
+    check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__);
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&elapsedTime, start, stop);
+    printf("Time elapsed on A & B kernels = %f msec\n", elapsedTime);
+
+
+    // free memory
+    check_Cuda_Errors(cudaFree(d_eps), "cudaFree", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaFree(d_ERI), "cudaFree", __FILE__, __LINE__);
+
+
+    // TODO
+    // diagonalize A+B and A-B
+    int *d_info = NULL;
+    double *d_Omega = NULL;
+    check_Cuda_Errors(cudaMalloc((void**)&d_info, sizeof(int)),
+        "cudaMalloc", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaMalloc((void**)&d_Omega, nS * sizeof(double)),
+        "cudaMalloc", __FILE__, __LINE__);
+
+    cudaEventRecord(start, 0);
+    diag_dn_dsyevd(nS, d_info, d_Omega, d_A);
+    check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__);
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&elapsedTime, start, stop);
+    printf("Time elapsed on diagonalization  = %f msec\n", elapsedTime);
+
+
+    // transfer data to CPU
+    cudaEventRecord(start, 0);
+    //int info_gpu = 0;
+    //check_Cuda_Errors(cudaMemcpy(&info_gpu, d_info, sizeof(int), cudaMemcpyDeviceToHost),
+    //    "cudaMemcpy", __FILE__, __LINE__);
+    //if (info_gpu != 0) {
+    //    printf("Error: diag_dn_dsyevd returned error code %d\n", info_gpu);
+    //    exit(EXIT_FAILURE);
+    //}
+    check_Cuda_Errors(cudaMemcpy(h_XpY, d_, nS2 * sizeof(double), cudaMemcpyDeviceToHost), 
+        "cudaMemcpy", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaMemcpy(h_XmY, d_, nS2 * sizeof(double), cudaMemcpyDeviceToHost), 
+        "cudaMemcpy", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaMemcpy(h_Omega, d_Omega, nS * sizeof(double), cudaMemcpyDeviceToHost), 
+        "cudaMemcpy", __FILE__, __LINE__);
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&elapsedTime, start, stop);
+    printf("Time elapsed on GPU -> CPU transfer = %f msec\n", elapsedTime);
+
+    check_Cuda_Errors(cudaFree(d_info), "cudaFree", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaFree(d_A), "cudaFree", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaFree(d_B), "cudaFree", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaFree(d_Omega), "cudaFree", __FILE__, __LINE__);
+
+
+}
+
--- a/src/cuda/src/ph_drpa_tda_sing.c
+++ b/src/cuda/src/ph_drpa_tda_sing.c
@ -9,6 +9,11 @@
 #include "utils.h"
 #include "ph_rpa.h"

+/*
+ *
+ * Y = 0 ==> X+Y = X-Y = X
+ *
+*/
 void ph_drpa_tda_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI,
                      double *h_Omega, double *h_X) {

@ -16,6 +21,9 @@ void ph_drpa_tda_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI,
    double *d_ERI = NULL;

    int nV = nBas - nO;
+ 
+    long long nS_long = (long long) nS;
+    long long nS2 = nS_long * nS_long;

    long long nBas_long = (long long) nBas;
    long long nBas4 = nBas_long * nBas_long * nBas_long * nBas_long;
@ -47,7 +55,7 @@ void ph_drpa_tda_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI,

    // construct A
    double *d_A = NULL;
-    check_Cuda_Errors(cudaMalloc((void**)&d_A, nS * nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaMalloc((void**)&d_A, nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__);

    cudaEventRecord(start, 0);
    ph_dRPA_A_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_A);
@ -86,7 +94,7 @@ void ph_drpa_tda_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI,
    //    printf("Error: diag_dn_dsyevd returned error code %d\n", info_gpu);
    //    exit(EXIT_FAILURE);
    //}
-    check_Cuda_Errors(cudaMemcpy(h_X, d_A, nS * nS * sizeof(double), cudaMemcpyDeviceToHost), 
+    check_Cuda_Errors(cudaMemcpy(h_X, d_A, nS2 * sizeof(double), cudaMemcpyDeviceToHost), 
        "cudaMemcpy", __FILE__, __LINE__);
    check_Cuda_Errors(cudaMemcpy(h_Omega, d_Omega, nS * sizeof(double), cudaMemcpyDeviceToHost), 
        "cudaMemcpy", __FILE__, __LINE__);