diff --git a/.gitignore b/.gitignore
index 899b091..4a6ab19 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
+*.slurm
+*.mod
+*.so
 *.o
 *.
 __pycache__
diff --git a/PyDuck.py b/PyDuck.py
index 6c39b70..4779566 100644
--- a/PyDuck.py
+++ b/PyDuck.py
@@ -6,6 +6,8 @@ import pyscf
 from pyscf import gto
 import numpy as np
 import subprocess
+import time
+
 
 #Find the value of the environnement variable QUACK_ROOT. If not present we use the current repository
 if "QUACK_ROOT" not in os.environ:
@@ -22,7 +24,10 @@ parser.add_argument('-b', '--basis', type=str, required=True, help='Name of the
 parser.add_argument('--bohr', default='Angstrom', action='store_const', const='Bohr', help='By default QuAcK assumes that the xyz files are in Angstrom. Add this argument if your xyz file is in Bohr.')
 parser.add_argument('-c', '--charge', type=int, default=0, help='Total charge of the molecule. Specify negative charges with "m" instead of the minus sign, for example m1 instead of -1. Default is 0')
 parser.add_argument('--cartesian', default=False, action='store_true', help='Add this option if you want to use cartesian basis functions.')
-parser.add_argument('--print_2e', default=False, action='store_true', help='Add this option if you want to print 2e-integrals.')
+parser.add_argument('--print_2e', default=True, action='store_true', help='If True, print 2e-integrals to disk.')
+parser.add_argument('--formatted_2e', default=False, action='store_true', help='Add this option if you want to print formatted 2e-integrals.')
+parser.add_argument('--mmap_2e', default=False, action='store_true', help='If True, avoid using DRAM when generating 2e-integrals.')
+parser.add_argument('--aosym_2e', default=False, action='store_true', help='If True, use 8-fold symmetry 2e-integrals.')
 parser.add_argument('-fc', '--frozen_core', type=bool, default=False, help='Freeze core MOs. Default is false')
 parser.add_argument('-m', '--multiplicity', type=int, default=1, help='Spin multiplicity. Default is 1 therefore singlet')
 parser.add_argument('--working_dir', type=str, default=QuAcK_dir, help='Set a working directory to run the calculation.')
@@ -38,6 +43,9 @@ multiplicity=args.multiplicity
 xyz=args.xyz + '.xyz'
 cartesian=args.cartesian
 print_2e=args.print_2e
+formatted_2e=args.formatted_2e
+mmap_2e=args.mmap_2e
+aosym_2e=args.aosym_2e
 working_dir=args.working_dir
 
 #Read molecule
@@ -59,6 +67,7 @@ mol = gto.M(
     basis = input_basis,
     charge = charge,
     spin = multiplicity - 1
+#    symmetry = True  # Enable symmetry
 )
 
 #Fix the unit for the lengths
@@ -129,33 +138,57 @@ write_matrix_to_file(y,norb,working_dir+'/int/y.dat')
 subprocess.call(['rm', '-f', working_dir + '/int/z.dat'])
 write_matrix_to_file(z,norb,working_dir+'/int/z.dat')
 
-eri_ao = mol.intor('int2e')
-
-def write_tensor_to_file(tensor,size,file,cutoff=1e-15):
-    f = open(file, 'w')
+def write_tensor_to_file(tensor,size,file_name,cutoff=1e-15):
+    f = open(file_name, 'w')
     for i in range(size):
         for j in range(i,size):
             for k in range(i,size):
                 for l in range(j,size):
                     if abs(tensor[i][k][j][l]) > cutoff:
-                        #f.write(str(i+1)+' '+str(j+1)+' '+str(k+1)+' '+str(l+1)+' '+"{:.16E}".format(tensor[i][k][j][l]))
                         f.write(str(i+1)+' '+str(j+1)+' '+str(k+1)+' '+str(l+1)+' '+"{:.16E}".format(tensor[i][k][j][l]))
                         f.write('\n')
     f.close()
 
-# Write two-electron integrals
 if print_2e:
-    # (formatted)
-    subprocess.call(['rm', '-f', working_dir + '/int/ERI.dat'])
-    write_tensor_to_file(eri_ao, norb, working_dir + '/int/ERI.dat')
-else:
-    # (binary)
-    subprocess.call(['rm', '-f', working_dir + '/int/ERI.bin'])
-    # chem -> phys notation
-    eri_ao = eri_ao.transpose(0, 2, 1, 3)
-    f = open(working_dir + '/int/ERI.bin', 'w')
-    eri_ao.tofile(working_dir + '/int/ERI.bin')
-    f.close()
+    # Write two-electron integrals to HD
+    ti_2e = time.time()
+
+    if formatted_2e:
+        output_file_path = working_dir + '/int/ERI.dat'
+        subprocess.call(['rm', '-f', output_file_path])
+        eri_ao = mol.intor('int2e')
+        write_tensor_to_file(eri_ao, norb, output_file_path)
+    
+    if aosym_2e:
+        output_file_path = working_dir + '/int/ERI_chem.bin'
+        subprocess.call(['rm', '-f', output_file_path])
+        eri_ao = mol.intor('int2e', aosym='s8')
+        f = open(output_file_path, 'w')
+        eri_ao.tofile(output_file_path)
+        f.close()
+    else:
+        output_file_path = working_dir + '/int/ERI.bin'
+        subprocess.call(['rm', '-f', output_file_path])
+        if(mmap_2e):
+            # avoid using DRAM
+            eri_shape = (norb, norb, norb, norb)
+            eri_mmap = np.memmap(output_file_path, dtype='float64', mode='w+', shape=eri_shape)
+            mol.intor('int2e', out=eri_mmap)
+            for i in range(norb):
+                eri_mmap[i, :, :, :] = eri_mmap[i, :, :, :].transpose(1, 0, 2)
+            eri_mmap.flush()
+            del eri_mmap
+        else:
+            eri_ao = mol.intor('int2e').transpose(0, 2, 1, 3) # chem -> phys
+            f = open(output_file_path, 'w')
+            eri_ao.tofile(output_file_path)
+            f.close()
+
+    te_2e = time.time()
+    print("Wall time for writing 2e-integrals to disk: {:.3f} seconds".format(te_2e - ti_2e))
+    sys.stdout.flush()
+
+
 
 
 #Execute the QuAcK fortran program
diff --git a/input/hpc_flags b/input/hpc_flags
new file mode 100644
index 0000000..120f367
--- /dev/null
+++ b/input/hpc_flags
@@ -0,0 +1,4 @@
+# if True (T), switch to HPC mode
+  F
+# if True (T), use GPU
+  F
diff --git a/quack.rc b/quack.rc
index 77ca7ff..bede54b 100644
--- a/quack.rc
+++ b/quack.rc
@@ -13,3 +13,5 @@ esac
 export QUACK_ROOT="$( cd $QUACK_ROOT; pwd -P )"
 
 export PATH="${QUACK_ROOT}/bin:$PATH"
+export LD_LIBRARY_PATH="${QUACK_ROOT}/src/cuda/build:$LD_LIBRARY_PATH"
+
diff --git a/src/AOtoMO/Hartree_matrix_AO_basis.f90 b/src/AOtoMO/Hartree_matrix_AO_basis.f90
index 03000a0..63141cc 100644
--- a/src/AOtoMO/Hartree_matrix_AO_basis.f90
+++ b/src/AOtoMO/Hartree_matrix_AO_basis.f90
@@ -32,3 +32,147 @@ subroutine Hartree_matrix_AO_basis(nBas,P,G,H)
   end do
 
 end subroutine 
+
+! ---
+
+subroutine Hartree_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, H)
+
+  implicit none
+
+  integer,          intent(in)  :: nBas
+  integer*8,        intent(in)  :: ERI_size
+  double precision, intent(in)  :: P(nBas,nBas)
+  double precision, intent(in)  :: ERI_chem(ERI_size)
+  double precision, intent(out) :: H(nBas,nBas)
+
+  integer*8                     :: mu, nu, la, si, nBas8
+  integer*8                     :: nunu, lala, nula, lasi, numu
+  integer*8                     :: nunu0, lala0
+  integer*8                     :: nunununu, nunulala, nununula, nunulasi
+  integer*8                     :: lalanunu, lasinunu, numulala, lalanumu
+  integer*8                     :: numunula, numulasi, lasinumu, nununumu
+  integer*8                     :: nunununu0, numunumu0
+
+
+
+  nBas8 = int(nBas, kind=8)
+
+  !$OMP PARALLEL DEFAULT(NONE)                                      &
+  !$OMP PRIVATE (nu, la, si, mu,                                    &
+  !$OMP          nunu0, nunu, nula, lala0, lala, lasi, numu,        &
+  !$OMP          nunununu0, nunununu, nununula, numulala, numunula, &
+  !$OMP          nunulala, lalanunu, lalanumu, nunulasi, lasinunu,  &
+  !$OMP          numunumu0, nununumu, numulasi, lasinumu)           &
+  !$OMP SHARED (nBas8, H, P, ERI_chem)
+  !$OMP DO
+  do nu = 1, nBas8
+
+    nunu0 = shiftr(nu * (nu - 1), 1)
+    nunu = nunu0 + nu
+    nunununu0 = shiftr(nunu * (nunu - 1), 1)
+
+    nunununu = nunununu0 + nunu
+    H(nu,nu) = P(nu,nu) * ERI_chem(nunununu)
+
+    do la = 1, nu - 1
+
+      lala0 = shiftr(la * (la - 1), 1)
+
+      lala = lala0 + la
+      nunulala = nunununu0 + lala
+      H(nu,nu) = H(nu,nu) + P(la,la) * ERI_chem(nunulala)
+
+      nula = nunu0 + la
+      nununula = nunununu0 + nula
+      H(nu,nu) = H(nu,nu) + 2.d0 * P(la,nu) * ERI_chem(nununula)
+
+      do si = 1, la - 1
+        lasi = lala0 + si
+        nunulasi = nunununu0 + lasi
+        H(nu,nu) = H(nu,nu) + 2.d0 * P(si,la) * ERI_chem(nunulasi)
+      enddo
+    enddo
+
+    do la = nu + 1, nBas8
+
+      lala0 = shiftr(la * (la - 1), 1)
+
+      lala = lala0 + la
+      lalanunu = shiftr(lala * (lala - 1), 1) + nunu
+      H(nu,nu) = H(nu,nu) + P(la,la) * ERI_chem(lalanunu)
+
+      do si = 1, la - 1
+        lasi = lala0 + si
+        lasinunu = shiftr(lasi * (lasi - 1), 1) + nunu
+        H(nu,nu) = H(nu,nu) + 2.d0 * P(si,la) * ERI_chem(lasinunu)
+      enddo
+    enddo
+
+    do mu = 1, nu - 1
+
+      numu = nunu0 + mu
+
+      numunumu0 = shiftr(numu * (numu - 1), 1)
+
+      nununumu = nunununu0 + numu
+      H(mu,nu) = p(nu,nu) * ERI_chem(nununumu)
+
+      do la = 1, nu - 1
+        lala = shiftr(la * (la - 1), 1) + la
+        numulala = numunumu0 + lala
+        H(mu,nu) = H(mu,nu) + p(la,la) * ERI_chem(numulala)
+      enddo
+
+      do la = nu + 1, nBas8
+        lala = shiftr(la * (la - 1), 1) + la
+        lalanumu = shiftr(lala * (lala - 1), 1) + numu
+        H(mu,nu) = H(mu,nu) + p(la,la) * ERI_chem(lalanumu)
+      enddo
+
+      do la = 1, mu
+        nula = nunu0 + la
+        numunula = numunumu0 + nula
+        H(mu,nu) = H(mu,nu) + 2.d0 * P(la,nu) * ERI_chem(numunula)
+      enddo
+
+      do la = mu + 1, nu - 1
+        nula = nunu0 + la
+        numunula = shiftr(nula * (nula - 1), 1) + numu
+        H(mu,nu) = H(mu,nu) + 2.d0 * P(la,nu) * ERI_chem(numunula)
+      enddo
+
+      do la = 2, nu - 1
+        lala0 = shiftr(la * (la - 1), 1)
+        do si = 1, la - 1
+          lasi = lala0 + si
+          numulasi = numunumu0 + lasi
+          H(mu,nu) = H(mu,nu) + 2.d0 * P(si,la) * ERI_chem(numulasi)
+        enddo
+      enddo
+
+      do la = nu + 1, nBas8
+        lala0 = shiftr(la * (la - 1), 1)
+        do si = 1, la - 1
+          lasi = lala0 + si
+          lasinumu = shiftr(lasi * (lasi - 1), 1) + numu
+          H(mu,nu) = H(mu,nu) + 2.d0 * P(si,la) * ERI_chem(lasinumu)
+        enddo
+      enddo
+
+    enddo ! mu
+  enddo ! nu
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  do nu = 1, nBas8
+    do mu = nu+1, nBas8
+      H(mu,nu) = H(nu,mu)
+    enddo
+  enddo
+
+  return
+end subroutine 
+
+! ---
+
+
diff --git a/src/AOtoMO/exchange_matrix_AO_basis.f90 b/src/AOtoMO/exchange_matrix_AO_basis.f90
index 1a29038..8042420 100644
--- a/src/AOtoMO/exchange_matrix_AO_basis.f90
+++ b/src/AOtoMO/exchange_matrix_AO_basis.f90
@@ -31,3 +31,200 @@ subroutine exchange_matrix_AO_basis(nBas,P,ERI,K)
   end do
 
 end subroutine 
+
+! ---
+
+subroutine exchange_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, K)
+
+  implicit none
+
+  integer,          intent(in)  :: nBas
+  integer*8,        intent(in)  :: ERI_size
+  double precision, intent(in)  :: P(nBas,nBas)
+  double precision, intent(in)  :: ERI_chem(ERI_size)
+  double precision, intent(out) :: K(nBas,nBas)
+
+  integer*8                     :: mu, nu, la, si, nBas8
+  integer*8                     :: nunu, nula, lanu, lasi, nusi, sinu
+  integer*8                     :: numu, mumu, mula, lamu, musi, simu
+  integer*8                     :: nunu0, lala0, mumu0
+  integer*8                     :: nunununu, nulanula, lanulanu, nulanusi
+  integer*8                     :: munulasi, lanunusi, lanusinu, numumumu 
+  integer*8                     :: nulamula, nulalamu, lanulamu, nulamusi
+  integer*8                     :: nulasimu, lanumusi, lanusimu, simunula
+  integer*8                     :: simulanu, nulanula0, lanulanu0
+
+
+
+  nBas8 = int(nBas, kind=8)
+
+
+  !$OMP PARALLEL DEFAULT (NONE)                                           &
+  !$OMP PRIVATE (nu, si, la, mu,                                          &
+  !$OMP          nunu0, nunu, lanu, numu, mumu0, mumu, simu, lala0, nula, &
+  !$OMP          nunununu, nulanula, lanulanu, lanulanu0, nulanula0,      &
+  !$OMP          nulanusi, lanulamu, lanunusi, lanusinu , numumumu,       &
+  !$OMP          nulamula, nulalamu, lanumusi, lanusimu, nulamusi,        &
+  !$OMP          nulasimu, simunula, simulanu) &
+  !$OMP SHARED (nBas8, P, ERI_chem, K)
+  !$OMP DO
+  do nu = 1, nBas8
+
+    nunu0 = shiftr(nu * (nu - 1), 1)
+    nunu = nunu0 + nu
+
+    nunununu = shiftr(nunu * (nunu - 1), 1) + nunu
+    K(nu,nu) = -P(nu,nu) * ERI_chem(nunununu)
+
+    do la = 1, nu - 1
+      nula = nunu0 + la
+      nulanula = shiftr(nula * (nula - 1), 1) + nula
+      K(nu,nu) = K(nu,nu) - P(la,la) * ERI_chem(nulanula)
+    enddo
+
+    do la = nu + 1, nBas8
+      lanu = shiftr(la * (la - 1), 1) + nu
+      lanulanu = shiftr(lanu * (lanu - 1), 1) + lanu
+      K(nu,nu) = K(nu,nu) - P(la,la) * ERI_chem(lanulanu)
+    enddo
+
+    do la = 1, nu
+      nula = nunu0 + la
+      nulanula0 = shiftr(nula * (nula - 1), 1)
+      do si = 1, la - 1
+        nulanusi = nulanula0 + nunu0 + si
+        K(nu,nu) = K(nu,nu) - 2.d0 * P(si,la) * ERI_chem(nulanusi)
+      enddo
+    enddo
+
+    do la = nu + 1, nBas8
+      lanu = shiftr(la * (la - 1), 1) + nu
+      lanulanu0 = shiftr(lanu * (lanu - 1), 1)
+      do si = 1, nu
+        lanunusi = lanulanu0 + nunu0 + si
+        K(nu,nu) = K(nu,nu) - 2.d0 * P(si,la) * ERI_chem(lanunusi)
+      enddo
+      do si = nu + 1, la - 1
+        lanusinu = lanulanu0 + shiftr(si * (si - 1), 1) + nu
+        K(nu,nu) = K(nu,nu) - 2.d0 * P(si,la) * ERI_chem(lanusinu)
+      enddo
+    enddo
+
+
+    do mu = 1, nu - 1
+
+      numu = nunu0 + mu
+      mumu0 = shiftr(mu * (mu - 1), 1)
+      mumu = mumu0 + mu
+      numumumu = shiftr(numu * (numu - 1), 1) + mumu
+      K(mu,nu) = - P(mu,mu) * ERI_chem(numumumu)
+
+      do la = 1, mu - 1
+        nula = nunu0 + la
+        nulamula = shiftr(nula * (nula - 1), 1) + mumu0 + la
+        K(mu,nu) = K(mu,nu) - P(la,la) * ERI_chem(nulamula)
+      enddo
+      do la = mu + 1, nu
+        nula = nunu0 + la
+        nulalamu = shiftr(nula * (nula - 1), 1) + shiftr(la * (la - 1), 1) + mu
+        K(mu,nu) = K(mu,nu) - P(la,la) * ERI_chem(nulalamu)
+      enddo
+      do la = nu + 1, nBas8
+        lala0 = shiftr(la * (la - 1), 1)
+        lanu = lala0 + nu
+        lanulamu = shiftr(lanu * (lanu - 1), 1) + lala0 + mu
+        K(mu,nu) = K(mu,nu) - P(la,la) * ERI_chem(lanulamu)
+      enddo
+
+      do la = 1, mu
+        nula = nunu0 + la
+        nulanula0 = shiftr(nula * (nula - 1), 1)
+        do si = 1, la - 1
+          nulamusi = nulanula0 + mumu0 + si
+          K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(nulamusi)
+        enddo
+      enddo
+      do la = mu + 1, nu
+        nula = nunu0 + la
+        nulanula0 = shiftr(nula * (nula - 1), 1)
+        do si = 1, mu
+          nulamusi = nulanula0 + mumu0 + si
+          K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(nulamusi)
+        enddo
+        do si = mu + 1, la - 1
+          nulasimu = nulanula0 + shiftr(si * (si - 1), 1) + mu
+          K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(nulasimu)
+        enddo
+      enddo
+      do la = nu + 1, nBas8
+        lanu = shiftr(la * (la - 1), 1) + nu
+        lanulanu0 = shiftr(lanu * (lanu - 1), 1)
+        do si = 1, mu
+          lanumusi = lanulanu0 + mumu0 + si
+          K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(lanumusi)
+        enddo
+        do si = mu + 1, la - 1
+          lanusimu = lanulanu0 + shiftr(si * (si - 1), 1) + mu
+          K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(lanusimu)
+        enddo
+      enddo
+
+      do la = 1, mu
+        nula = nunu0 + la
+        nulanula0 = shiftr(nula * (nula - 1), 1)
+        do si = la + 1, mu
+          nulamusi = nulanula0 + mumu0 + si
+          K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(nulamusi)
+        enddo
+        do si = mu + 1, nu - 1
+          nulasimu = nulanula0 + shiftr(si * (si - 1), 1) + mu
+          K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(nulasimu)
+        enddo
+        do si = nu, nBas8
+          simu = shiftr(si * (si - 1), 1) + mu
+          simunula = shiftr(simu * (simu - 1), 1) + nula
+          K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(simunula)
+        enddo
+      enddo
+      do la = mu + 1, nu
+        nula = nunu0 + la
+        nulanula0 = shiftr(nula * (nula - 1), 1)
+        do si = la + 1, nu
+          nulasimu = nulanula0 + shiftr(si * (si - 1), 1) + mu
+          K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(nulasimu)
+        enddo
+        do si = nu + 1, nBas8
+          simu = shiftr(si * (si - 1), 1) + mu
+          simunula = shiftr(simu * (simu - 1), 1) + nula
+          K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(simunula)
+        enddo
+      enddo
+      do la = nu + 1, nBas8
+        lanu = shiftr(la * (la - 1), 1) + nu
+        do si = la + 1, nBas8
+          simu = shiftr(si * (si - 1), 1) + mu
+          simulanu = shiftr(simu * (simu - 1), 1) + lanu
+          K(mu,nu) = K(mu,nu) - P(si,la) * ERI_chem(simulanu)
+        enddo
+      enddo
+
+    enddo ! mu
+  enddo ! nu
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+
+  do nu = 1, nBas8
+    do mu = nu+1, nBas8
+      K(mu,nu) = K(nu,mu)
+    enddo
+  enddo
+
+  return
+end subroutine 
+
+! ---
+
+
+
+
diff --git a/src/GPU/cu_quack_module.f90 b/src/GPU/cu_quack_module.f90
new file mode 100644
index 0000000..bb022c2
--- /dev/null
+++ b/src/GPU/cu_quack_module.f90
@@ -0,0 +1,44 @@
+module cu_quack_module
+
+  use, intrinsic :: iso_c_binding
+
+  implicit none
+
+  interface
+
+    ! ---
+
+    subroutine ph_drpa_tda_sing(nO, nBas, nS, eps, ERI, &
+        Omega, XpY) bind(C, name = "ph_drpa_tda_sing")
+
+      import c_int, c_double
+      integer(c_int), intent(in), value :: nO, nBas, nS
+      real(c_double), intent(in)        :: eps(nBas)
+      real(c_double), intent(in)        :: ERI(nBas,nBas,nBas,nBas)
+      real(c_double), intent(out)       :: Omega(nS)
+      real(c_double), intent(out)       :: XpY(nS,nS)
+
+    end subroutine ph_drpa_tda_sing
+
+    ! ---
+
+    subroutine ph_drpa_sing(nO, nBas, nS, eps, ERI, &
+        Omega, XpY, XmY) bind(C, name = "ph_drpa_sing")
+
+      import c_int, c_double
+      integer(c_int), intent(in), value :: nO, nBas, nS
+      real(c_double), intent(in)        :: eps(nBas)
+      real(c_double), intent(in)        :: ERI(nBas,nBas,nBas,nBas)
+      real(c_double), intent(out)       :: Omega(nS)
+      real(c_double), intent(out)       :: XpY(nS,nS)
+      real(c_double), intent(out)       :: XmY(nS,nS)
+
+    end subroutine ph_drpa_sing
+
+    ! ---
+
+  end interface
+
+end module cu_quack_module
+
+
diff --git a/src/HF/RHF_hpc.f90 b/src/HF/RHF_hpc.f90
new file mode 100644
index 0000000..a4d3325
--- /dev/null
+++ b/src/HF/RHF_hpc.f90
@@ -0,0 +1,311 @@
+subroutine RHF_hpc(working_dir,dotest,maxSCF,thresh,max_diis,guess_type,level_shift,nNuc,ZNuc,rNuc,ENuc, & 
+                    nBas,nOrb,nO,S,T,V,Hc,dipole_int,X,ERHF,eHF,c,P,F)
+
+! Perform restricted Hartree-Fock calculation
+
+  implicit none
+  include 'parameters.h'
+
+! Input variables
+
+  character(len=256),intent(in) :: working_dir
+
+  logical,intent(in)            :: dotest
+
+  integer,intent(in)            :: maxSCF
+  integer,intent(in)            :: max_diis
+  integer,intent(in)            :: guess_type
+  double precision,intent(in)   :: thresh
+  double precision,intent(in)   :: level_shift
+
+  integer,intent(in)            :: nBas
+  integer,intent(in)            :: nOrb
+  integer,intent(in)            :: nO
+  integer,intent(in)            :: nNuc
+  double precision,intent(in)   :: ZNuc(nNuc)
+  double precision,intent(in)   :: rNuc(nNuc,ncart)
+  double precision,intent(in)   :: ENuc
+  double precision,intent(in)   :: S(nBas,nBas)
+  double precision,intent(in)   :: T(nBas,nBas)
+  double precision,intent(in)   :: V(nBas,nBas)
+  double precision,intent(in)   :: Hc(nBas,nBas) 
+  double precision,intent(in)   :: X(nBas,nOrb)
+  double precision,intent(in)   :: dipole_int(nBas,nBas,ncart)
+
+! Local variables
+
+  integer                       :: ii, jj
+  integer                       :: nSCF
+  integer                       :: nBas_Sq
+  integer                       :: n_diis
+  integer*8                     :: ERI_size
+  double precision              :: t1, t2
+  double precision              :: diff, diff_loc
+  double precision              :: ET
+  double precision              :: EV
+  double precision              :: EJ
+  double precision              :: EK
+  double precision              :: dipole(ncart)
+  double precision              :: Conv
+  double precision              :: rcond
+  double precision,external     :: trace_matrix
+  double precision,allocatable  :: err(:,:)
+  double precision,allocatable  :: err_diis(:,:)
+  double precision,allocatable  :: F_diis(:,:)
+  double precision,allocatable  :: J(:,:)
+  double precision,allocatable  :: K(:,:)
+  double precision,allocatable  :: cp(:,:)
+  double precision,allocatable  :: Fp(:,:)
+  double precision,allocatable  :: ERI_chem(:)
+  double precision,allocatable  :: ERI_phys(:,:,:,:), J_deb(:,:), K_deb(:,:)
+  double precision,allocatable  :: tmp1(:,:), FX(:,:)
+
+
+! Output variables
+
+  double precision,intent(out)  :: ERHF
+  double precision,intent(out)  :: eHF(nOrb)
+  double precision,intent(inout):: c(nBas,nOrb)
+  double precision,intent(out)  :: P(nBas,nBas)
+  double precision,intent(out)  :: F(nBas,nBas)
+
+! Hello world
+
+  write(*,*)
+  write(*,*)'****************************************'
+  write(*,*)'* Restricted HF Calculation (HPC mode) *'
+  write(*,*)'****************************************'
+  write(*,*)
+
+! Useful quantities
+
+  nBas_Sq = nBas*nBas
+
+! Memory allocation
+
+  allocate(J(nBas,nBas))
+  allocate(K(nBas,nBas))
+
+  allocate(err(nBas,nBas))
+
+  allocate(cp(nOrb,nOrb))
+  allocate(Fp(nOrb,nOrb))
+
+  allocate(err_diis(nBas_Sq,max_diis))
+  allocate(F_diis(nBas_Sq,max_diis))
+
+  allocate(tmp1(nBas,nBas))
+  allocate(FX(nBas,nOrb))
+
+! Guess coefficients and density matrix
+  call mo_guess(nBas,nOrb,guess_type,S,Hc,X,c)
+
+  call dgemm('N', 'T', nBas, nBas, nO, 2.d0, &
+             c(1,1), nBas, c(1,1), nBas,     &
+             0.d0, P(1,1), nBas)
+
+  ERI_size = shiftr(nBas * (nBas + 1), 1)
+  ERI_size = shiftr(ERI_size * (ERI_size + 1), 1)
+  allocate(ERI_chem(ERI_size))
+  call read_2e_integrals_hpc(working_dir, ERI_size, ERI_chem)
+
+  !call wall_time(t1)
+  !call Hartree_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, J)
+  !call wall_time(t2)
+  !print*, " J built in (sec):", (t2-t1)
+  !call wall_time(t1)
+  !call exchange_matrix_AO_basis_hpc(nBas, ERI_size, P, ERI_chem, K)
+  !call wall_time(t2)
+  !print*, " K built in (sec):", (t2-t1)
+  !allocate(ERI_phys(nBas,nBas,nBas,nBas))
+  !allocate(J_deb(nBas,nBas))
+  !allocate(K_deb(nBas,nBas))
+  !call read_2e_integrals(working_dir, nBas, ERI_phys)
+  !call wall_time(t1)
+  !call Hartree_matrix_AO_basis(nBas, P, ERI_phys, J_deb)
+  !call wall_time(t2)
+  !print*, " J_deb built in (sec):", (t2-t1)
+  !call wall_time(t1)
+  !call exchange_matrix_AO_basis(nBas, P, ERI_phys, K_deb)
+  !call wall_time(t2)
+  !print*, " K_deb built in (sec):", (t2-t1)
+  !print*, "max error on J = ", maxval(dabs(J - J_deb))
+  !diff = 0.d0
+  !do ii = 1, nBas
+  !  do jj = 1, nBas
+  !    diff_loc = dabs(J(jj,ii) - J_deb(jj,ii))
+  !    if(diff_loc .gt. 1d-10) then
+  !      print*, 'error in J on: ', jj, ii
+  !      print*, J(jj,ii), J_deb(jj,ii)
+  !      stop
+  !    endif
+  !    diff = diff + diff_loc
+  !  enddo
+  !enddo
+  !print*, 'total diff on J = ', diff
+  !print*, "max error on K = ", maxval(dabs(K - K_deb))
+  !diff = 0.d0
+  !do ii = 1, nBas
+  !  do jj = 1, nBas
+  !    diff_loc = dabs(K(jj,ii) - K_deb(jj,ii))
+  !    if(diff_loc .gt. 1d-10) then
+  !      print*, 'error in K on: ', jj, ii
+  !      print*, K(jj,ii), K_deb(jj,ii)
+  !      stop
+  !    endif
+  !    diff = diff + diff_loc
+  !  enddo
+  !enddo
+  !print*, 'total diff on K = ', diff
+  !stop
+
+! Initialization
+
+  n_diis        = 0
+  F_diis(:,:)   = 0d0
+  err_diis(:,:) = 0d0
+  rcond         = 0d0
+
+  Conv = 1d0
+  nSCF = 0
+
+!------------------------------------------------------------------------
+! Main SCF loop
+!------------------------------------------------------------------------
+
+  write(*,*)
+  write(*,*)'-----------------------------------------------------------------------------'
+  write(*,'(1X,A1,1X,A3,1X,A1,1X,A16,1X,A1,1X,A16,1X,A1,1X,A16,1X,A1,1X,A10,1X,A1,1X)') &
+            '|','#','|','E(RHF)','|','EJ(RHF)','|','EK(RHF)','|','Conv','|'
+  write(*,*)'-----------------------------------------------------------------------------'
+
+  do while(Conv > thresh .and. nSCF < maxSCF)
+
+    ! Increment 
+
+    nSCF = nSCF + 1
+
+    ! Build Fock matrix
+    call Hartree_matrix_AO_basis_hpc (nBas, ERI_size, P(1,1), ERI_chem(1), J(1,1))
+    call exchange_matrix_AO_basis_hpc(nBas, ERI_size, P(1,1), ERI_chem(1), K(1,1))
+    F(:,:) = Hc(:,:) + J(:,:) + 0.5d0*K(:,:)
+
+    ! Check convergence 
+    call dgemm("N", "N", nBas, nBas, nBas, 1.d0, &
+               S(1,1), nBas, P(1,1), nBas,       &
+               0.d0, tmp1(1,1), nBas)
+    call dgemm("N", "N", nBas, nBas, nBas, 1.d0, &
+               tmp1(1,1), nBas, F(1,1), nBas,    &
+               0.d0, err(1,1), nBas)
+    call dgemm("N", "T", nBas, nBas, nBas, 1.d0, &
+               F(1,1), nBas, tmp1(1,1), nBas,    &
+               -1.d0, err(1,1), nBas)
+    !err = matmul(F, matmul(P, S)) - matmul(matmul(S, P), F)
+    if(nSCF > 1) Conv = maxval(abs(err))
+
+    ! Kinetic energy
+    call dgemm("N", "N", nBas, nBas, nBas, 1.d0, &
+               P(1,1), nBas, T(1,1), nBas,       &
+               0.d0, tmp1(1,1), nBas)
+    ET = trace_matrix(nBas, tmp1(1,1))
+
+    ! Potential energy
+    call dgemm("N", "N", nBas, nBas, nBas, 1.d0, &
+               P(1,1), nBas, V(1,1), nBas,       &
+               0.d0, tmp1(1,1), nBas)
+    EV = trace_matrix(nBas, tmp1(1,1))
+
+    ! Hartree energy
+    call dgemm("N", "N", nBas, nBas, nBas, 1.d0, &
+               P(1,1), nBas, J(1,1), nBas,       &
+               0.d0, tmp1(1,1), nBas)
+    EJ = 0.5d0*trace_matrix(nBas, tmp1(1,1))
+
+    ! Exchange energy
+    call dgemm("N", "N", nBas, nBas, nBas, 1.d0, &
+               P(1,1), nBas, K(1,1), nBas,       &
+               0.d0, tmp1(1,1), nBas)
+    EK = 0.25d0*trace_matrix(nBas, tmp1(1,1))
+
+    ! Total energy
+    ERHF = ET + EV + EJ + EK
+
+    ! DIIS extrapolation
+    if(max_diis > 1) then
+      n_diis = min(n_diis+1,max_diis)
+      call DIIS_extrapolation(rcond,nBas_Sq,nBas_Sq,n_diis,err_diis,F_diis,err,F)
+    endif
+
+    ! Level shift
+    if(level_shift > 0d0 .and. Conv > thresh) then
+      call level_shifting(level_shift,nBas,nOrb,nO,S,c,F)
+    endif
+
+    ! Diagonalize Fock matrix
+    call dgemm("N", "N", nBas, nOrb, nBas, 1.d0, &
+               F(1,1), nBas, X(1,1), nBas,       &
+               0.d0, FX(1,1), nBas)
+    call dgemm("T", "N", nOrb, nOrb, nBas, 1.d0, &
+               X(1,1), nBas, FX(1,1), nBas,      &
+               0.d0, Fp(1,1), nOrb)
+    !Fp = matmul(transpose(X), matmul(F, X))
+    cp(:,:) = Fp(:,:)
+    call diagonalize_matrix(nOrb,cp,eHF)
+    !c = matmul(X, cp)
+    call dgemm("N", "N", nBas, nOrb, nOrb, 1.d0, &
+               X(1,1), nBas, cp(1,1), nOrb,      &
+               0.d0, c(1,1), nBas)
+    
+
+    ! Density matrix
+    call dgemm('N', 'T', nBas, nBas, nO, 2.d0, &
+               c(1,1), nBas, c(1,1), nBas,     &
+               0.d0, P(1,1), nBas)
+ 
+    ! Dump results
+    write(*,'(1X,A1,1X,I3,1X,A1,1X,F16.10,1X,A1,1X,F16.10,1X,A1,1X,F16.10,1X,A1,1X,E10.2,1X,A1,1X)') &
+      '|',nSCF,'|',ERHF + ENuc,'|',EJ,'|',EK,'|',Conv,'|'
+
+  end do
+  write(*,*)'-----------------------------------------------------------------------------'
+!------------------------------------------------------------------------
+! End of SCF loop
+!------------------------------------------------------------------------
+
+! Did it actually converge?
+
+  if(nSCF == maxSCF) then
+
+    write(*,*)
+    write(*,*)'!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'
+    write(*,*)'                 Convergence failed                 '
+    write(*,*)'!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'
+    write(*,*)
+
+    deallocate(J,K,err,cp,Fp,err_diis,F_diis)
+    deallocate(tmp1, FX, ERI_chem)
+
+    stop
+
+  end if
+
+! Compute dipole moments
+
+  call dipole_moment(nBas,P,nNuc,ZNuc,rNuc,dipole_int,dipole)
+  call print_RHF(nBas,nOrb,nO,eHF,c,ENuc,ET,EV,EJ,EK,ERHF,dipole)
+
+! Testing zone
+
+  if(dotest) then
+ 
+    call dump_test_value('R','RHF energy',ERHF)
+    call dump_test_value('R','RHF HOMO energy',eHF(nO))
+    call dump_test_value('R','RHF LUMO energy',eHF(nO+1))
+    call dump_test_value('R','RHF dipole moment',norm2(dipole))
+
+  end if
+
+  deallocate(J,K,err,cp,Fp,err_diis,F_diis)
+  deallocate(tmp1, FX, ERI_chem)
+
+end subroutine 
diff --git a/src/LR/phLR.f90 b/src/LR/phLR.f90
index ca49344..6040ad7 100644
--- a/src/LR/phLR.f90
+++ b/src/LR/phLR.f90
@@ -15,6 +15,7 @@ subroutine phLR(TDA,nS,Aph,Bph,EcRPA,Om,XpY,XmY)
   ! Local variables
 
   double precision              :: trace_matrix
+  double precision              :: t1, t2
   double precision,allocatable  :: ApB(:,:)
   double precision,allocatable  :: AmB(:,:)
   double precision,allocatable  :: AmBSq(:,:)
@@ -29,14 +30,12 @@ subroutine phLR(TDA,nS,Aph,Bph,EcRPA,Om,XpY,XmY)
   double precision,intent(out)  :: XpY(nS,nS)
   double precision,intent(out)  :: XmY(nS,nS)
 
-! Memory allocation
 
-  allocate(ApB(nS,nS),AmB(nS,nS),AmBSq(nS,nS),AmBIv(nS,nS),Z(nS,nS),tmp(nS,nS))
 
 ! Tamm-Dancoff approximation
 
   if(TDA) then
- 
+
     XpY(:,:) = Aph(:,:)
     call diagonalize_matrix(nS,XpY,Om)
     XpY(:,:) = transpose(XpY(:,:))
@@ -44,6 +43,8 @@ subroutine phLR(TDA,nS,Aph,Bph,EcRPA,Om,XpY,XmY)
 
   else
 
+    allocate(ApB(nS,nS),AmB(nS,nS),AmBSq(nS,nS),AmBIv(nS,nS),Z(nS,nS),tmp(nS,nS))
+
     ApB(:,:) = Aph(:,:) + Bph(:,:)
     AmB(:,:) = Aph(:,:) - Bph(:,:)
 
@@ -80,6 +81,8 @@ subroutine phLR(TDA,nS,Aph,Bph,EcRPA,Om,XpY,XmY)
 
 !   XmY = matmul(transpose(Z),AmBIv)
 !   call DA(nS,1d0*sqrt(Om),XmY)
+
+    deallocate(ApB,AmB,AmBSq,AmBIv,Z,tmp)
  
   end if
 
diff --git a/src/LR/phLR_A.f90 b/src/LR/phLR_A.f90
index 2826fdd..708799a 100644
--- a/src/LR/phLR_A.f90
+++ b/src/LR/phLR_A.f90
@@ -7,28 +7,31 @@ subroutine phLR_A(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,e,ERI,Aph)
 
 ! Input variables
 
-  logical,intent(in)            :: dRPA
-  integer,intent(in)            :: ispin
-  integer,intent(in)            :: nBas
-  integer,intent(in)            :: nC
-  integer,intent(in)            :: nO
-  integer,intent(in)            :: nV
-  integer,intent(in)            :: nR
-  integer,intent(in)            :: nS
-  double precision,intent(in)   :: lambda
-  double precision,intent(in)   :: e(nBas)
-  double precision,intent(in)   :: ERI(nBas,nBas,nBas,nBas) 
+  logical,intent(in)           :: dRPA
+  integer,intent(in)           :: ispin
+  integer,intent(in)           :: nBas
+  integer,intent(in)           :: nC
+  integer,intent(in)           :: nO
+  integer,intent(in)           :: nV
+  integer,intent(in)           :: nR
+  integer,intent(in)           :: nS
+  double precision,intent(in)  :: lambda
+  double precision,intent(in)  :: e(nBas)
+  double precision,intent(in)  :: ERI(nBas,nBas,nBas,nBas) 
   
 ! Local variables
 
-  double precision              :: delta_dRPA
-  double precision,external     :: Kronecker_delta
+  double precision             :: delta_dRPA
+  double precision,external    :: Kronecker_delta
 
-  integer                       :: i,j,a,b,ia,jb
+  integer                      :: i,j,a,b,ia,jb
+  integer                      :: nn,jb0
+  logical                      :: i_eq_j
+  double precision             :: ct1,ct2
 
 ! Output variables
 
-  double precision,intent(out)  :: Aph(nS,nS)
+  double precision,intent(out) :: Aph(nS,nS)
 
 ! Direct RPA
 
@@ -39,22 +42,49 @@ subroutine phLR_A(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,e,ERI,Aph)
 
   if(ispin == 1) then 
 
-    ia = 0
-    do i=nC+1,nO
-      do a=nO+1,nBas-nR
-        ia = ia + 1
-        jb = 0
-        do j=nC+1,nO
-          do b=nO+1,nBas-nR
-            jb = jb + 1
- 
-            Aph(ia,jb) = (e(a) - e(i))*Kronecker_delta(i,j)*Kronecker_delta(a,b) &
-                       + 2d0*lambda*ERI(i,b,a,j) - (1d0 - delta_dRPA)*lambda*ERI(i,b,j,a)
+    nn = nBas - nR - nO
+    ct1 = 2d0 * lambda
+    ct2 = - (1d0 - delta_dRPA) * lambda
+    !$OMP PARALLEL DEFAULT(NONE)                    &
+    !$OMP PRIVATE (i, a, j, b, i_eq_j, ia, jb0, jb) &
+    !$OMP SHARED (nC, nO, nR, nBas, nn, ct1, ct2, e, ERI, Aph)
+    !$OMP DO COLLAPSE(2)
+    do i = nC+1, nO
+      do a = nO+1, nBas-nR
+        ia = a - nO + (i - nC - 1) * nn
 
-          end  do
-        end  do
-      end  do
-    end  do
+        do j = nC+1, nO
+          i_eq_j = i == j
+          jb0 = (j - nC - 1) * nn - nO
+
+          do b = nO+1, nBas-nR
+            jb = b + jb0
+
+            Aph(ia,jb) = ct1 * ERI(b,i,j,a) + ct2 * ERI(b,j,a,i)
+            if(i_eq_j) then
+              if(a == b) Aph(ia,jb) = Aph(ia,jb) + e(a) - e(i)
+            endif
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+    !ia = 0
+    !do i=nC+1,nO
+    !  do a=nO+1,nBas-nR
+    !    ia = ia + 1
+    !    jb = 0
+    !    do j=nC+1,nO
+    !      do b=nO+1,nBas-nR
+    !        jb = jb + 1
+    !        Aph(ia,jb) = (e(a) - e(i))*Kronecker_delta(i,j)*Kronecker_delta(a,b) &
+    !                   + 2d0*lambda*ERI(i,b,a,j) - (1d0 - delta_dRPA)*lambda*ERI(i,b,j,a)
+    !      end do
+    !    end do
+    !  end do
+    !end do
 
   end if
 
@@ -62,22 +92,48 @@ subroutine phLR_A(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,e,ERI,Aph)
 
   if(ispin == 2) then 
 
-    ia = 0
-    do i=nC+1,nO
-      do a=nO+1,nBas-nR
-        ia = ia + 1
-        jb = 0
-        do j=nC+1,nO
-          do b=nO+1,nBas-nR
-            jb = jb + 1
- 
-            Aph(ia,jb) = (e(a) - e(i))*Kronecker_delta(i,j)*Kronecker_delta(a,b) &
-                       - (1d0 - delta_dRPA)*lambda*ERI(i,b,j,a)
+    nn = nBas - nR - nO
+    ct2 = - (1d0 - delta_dRPA) * lambda
+    !$OMP PARALLEL DEFAULT(NONE)                    &
+    !$OMP PRIVATE (i, a, j, b, i_eq_j, ia, jb0, jb) &
+    !$OMP SHARED (nC, nO, nR, nBas, nn, ct2, e, ERI, Aph)
+    !$OMP DO COLLAPSE(2)
+    do i = nC+1, nO
+      do a = nO+1, nBas-nR
+        ia = a - nO + (i - nC - 1) * nn
 
-          end  do
-        end  do
-      end  do
-    end  do
+        do j = nC+1, nO
+          i_eq_j = i == j
+          jb0 = (j - nC - 1) * nn - nO
+
+          do b = nO+1, nBas-nR
+            jb = b + jb0
+
+            Aph(ia,jb) = ct2 * ERI(b,j,a,i)
+            if(i_eq_j) then
+              if(a == b) Aph(ia,jb) = Aph(ia,jb) + e(a) - e(i)
+            endif
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+!    ia = 0
+!    do i=nC+1,nO
+!      do a=nO+1,nBas-nR
+!        ia = ia + 1
+!        jb = 0
+!        do j=nC+1,nO
+!          do b=nO+1,nBas-nR
+!            jb = jb + 1
+!            Aph(ia,jb) = (e(a) - e(i))*Kronecker_delta(i,j)*Kronecker_delta(a,b) &
+!                       - (1d0 - delta_dRPA)*lambda*ERI(i,b,j,a)
+!          end do
+!        end do
+!      end do
+!    end do
 
   end if
 
diff --git a/src/LR/phLR_B.f90 b/src/LR/phLR_B.f90
index 51a4a01..9645952 100644
--- a/src/LR/phLR_B.f90
+++ b/src/LR/phLR_B.f90
@@ -7,20 +7,22 @@ subroutine phLR_B(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,ERI,Bph)
 
 ! Input variables
 
-  logical,intent(in)            :: dRPA
-  integer,intent(in)            :: ispin,nBas,nC,nO,nV,nR,nS
-  double precision,intent(in)   :: lambda
-  double precision,intent(in)   :: ERI(nBas,nBas,nBas,nBas)
+  logical,intent(in)           :: dRPA
+  integer,intent(in)           :: ispin,nBas,nC,nO,nV,nR,nS
+  double precision,intent(in)  :: lambda
+  double precision,intent(in)  :: ERI(nBas,nBas,nBas,nBas)
   
 ! Local variables
 
-  double precision              :: delta_dRPA
+  double precision             :: delta_dRPA
 
-  integer                       :: i,j,a,b,ia,jb
+  integer                      :: i,j,a,b,ia,jb
+  integer                      :: nn,jb0
+  double precision             :: ct1,ct2
 
 ! Output variables
 
-  double precision,intent(out)  :: Bph(nS,nS)
+  double precision,intent(out) :: Bph(nS,nS)
 
 ! Direct RPA
 
@@ -31,21 +33,44 @@ subroutine phLR_B(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,ERI,Bph)
 
   if(ispin == 1) then
 
-    ia = 0
-    do i=nC+1,nO
-      do a=nO+1,nBas-nR
-        ia = ia + 1
-        jb = 0
-        do j=nC+1,nO
-          do b=nO+1,nBas-nR
-            jb = jb + 1
- 
-            Bph(ia,jb) = 2d0*lambda*ERI(i,j,a,b) - (1d0 - delta_dRPA)*lambda*ERI(i,j,b,a)
- 
-          end do
-        end do
-      end do
-    end do
+    nn = nBas - nR - nO
+    ct1 = 2d0 * lambda
+    ct2 = - (1d0 - delta_dRPA) * lambda
+    !$OMP PARALLEL DEFAULT(NONE)            &
+    !$OMP PRIVATE (i, a, j, b, ia, jb0, jb) &
+    !$OMP SHARED (nC, nO, nR, nBas, nn, ct1, ct2, ERI, Bph)
+    !$OMP DO COLLAPSE(2)
+    do i = nC+1, nO
+      do a = nO+1, nBas-nR
+        ia = a - nO + (i - nC - 1) * nn
+
+        do j = nC+1, nO
+          jb0 = (j - nC - 1) * nn - nO
+
+          do b = nO+1, nBas-nR
+            jb = b + jb0
+
+            Bph(ia,jb) = ct1 * ERI(b,i,j,a) + ct2 * ERI(b,j,i,a)
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+    !ia = 0
+    !do i=nC+1,nO
+    !  do a=nO+1,nBas-nR
+    !    ia = ia + 1
+    !    jb = 0
+    !    do j=nC+1,nO
+    !      do b=nO+1,nBas-nR
+    !        jb = jb + 1
+    !        Bph(ia,jb) = 2d0*lambda*ERI(i,j,a,b) - (1d0 - delta_dRPA)*lambda*ERI(i,j,b,a)
+    !      end do
+    !    end do
+    !  end do
+    !end do
 
   end if
 
@@ -53,21 +78,43 @@ subroutine phLR_B(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,ERI,Bph)
 
   if(ispin == 2) then
 
-    ia = 0
-    do i=nC+1,nO
-      do a=nO+1,nBas-nR
-        ia = ia + 1
-        jb = 0
-        do j=nC+1,nO
-          do b=nO+1,nBas-nR
-            jb = jb + 1
- 
-            Bph(ia,jb) = - (1d0 - delta_dRPA)*lambda*ERI(i,j,b,a)
- 
-          end do
-        end do
-      end do
-    end do
+    nn = nBas - nR - nO
+    ct2 = - (1d0 - delta_dRPA) * lambda
+    !$OMP PARALLEL DEFAULT(NONE)            &
+    !$OMP PRIVATE (i, a, j, b, ia, jb0, jb) &
+    !$OMP SHARED (nC, nO, nR, nBas, nn, ct2, ERI, Bph)
+    !$OMP DO COLLAPSE(2)
+    do i = nC+1, nO
+      do a = nO+1, nBas-nR
+        ia = a - nO + (i - nC - 1) * nn
+
+        do j = nC+1, nO
+          jb0 = (j - nC - 1) * nn - nO
+
+          do b = nO+1, nBas-nR
+            jb = b + jb0
+
+            Bph(ia,jb) = ct2 * ERI(b,j,i,a)
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+!    ia = 0
+!    do i=nC+1,nO
+!      do a=nO+1,nBas-nR
+!        ia = ia + 1
+!        jb = 0
+!        do j=nC+1,nO
+!          do b=nO+1,nBas-nR
+!            jb = jb + 1
+!            Bph(ia,jb) = - (1d0 - delta_dRPA)*lambda*ERI(i,j,b,a)
+!          end do
+!        end do
+!      end do
+!    end do
 
   end if
 
diff --git a/src/LR/ppLR_C.f90 b/src/LR/ppLR_C.f90
index fbc3900..e9551d4 100644
--- a/src/LR/ppLR_C.f90
+++ b/src/LR/ppLR_C.f90
@@ -32,8 +32,8 @@ subroutine ppLR_C(ispin,nOrb,nC,nO,nV,nR,nVV,lambda,e,ERI,Cpp)
 
 ! Define the chemical potential
  
-! eF = e(nO) + e(nO+1)
-  eF = 0d0
+  eF = e(nO) + e(nO+1)
+! eF = 0d0
 
 ! Build C matrix for the singlet manifold
 
diff --git a/src/LR/ppLR_D.f90 b/src/LR/ppLR_D.f90
index 3744f5c..3b7ff0d 100644
--- a/src/LR/ppLR_D.f90
+++ b/src/LR/ppLR_D.f90
@@ -30,8 +30,8 @@ subroutine ppLR_D(ispin,nOrb,nC,nO,nV,nR,nOO,lambda,e,ERI,Dpp)
 
 ! Define the chemical potential
 
-! eF = e(nO) + e(nO+1)
-  eF = 0d0
+  eF = e(nO) + e(nO+1)
+! eF = 0d0
  
 ! Build the D matrix for the singlet manifold
 
diff --git a/src/QuAcK/GQuAcK.f90 b/src/QuAcK/GQuAcK.f90
index 22441fd..b82f134 100644
--- a/src/QuAcK/GQuAcK.f90
+++ b/src/QuAcK/GQuAcK.f90
@@ -1,17 +1,19 @@
-subroutine GQuAcK(dotest,doGHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT, &
-                  dodrCCD,dorCCD,docrCCD,dolCCD,dophRPA,dophRPAx,docrRPA,doppRPA,             &
-                  doG0W0,doevGW,doqsGW,doG0F2,doevGF2,doqsGF2,doG0T0pp,doevGTpp,doqsGTpp,     &
-                  nNuc,nBas,nC,nO,nV,nR,ENuc,ZNuc,rNuc,S,T,V,Hc,X,dipole_int_AO,ERI_AO,       &
-                  maxSCF_HF,max_diis_HF,thresh_HF,level_shift,guess_type,mix,reg_MP,          &
-                  maxSCF_CC,max_diis_CC,thresh_CC,                                            &
-                  TDA,maxSCF_GF,max_diis_GF,thresh_GF,lin_GF,reg_GF,eta_GF,                   &
-                  maxSCF_GW,max_diis_GW,thresh_GW,TDA_W,lin_GW,reg_GW,eta_GW,                 &
-                  maxSCF_GT,max_diis_GT,thresh_GT,TDA_T,lin_GT,reg_GT,eta_GT,                 &
+subroutine GQuAcK(working_dir,dotest,doGHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT, &
+                  dodrCCD,dorCCD,docrCCD,dolCCD,dophRPA,dophRPAx,docrRPA,doppRPA,                         &
+                  doG0W0,doevGW,doqsGW,doG0F2,doevGF2,doqsGF2,doG0T0pp,doevGTpp,doqsGTpp,                 &
+                  nNuc,nBas,nC,nO,nV,nR,ENuc,ZNuc,rNuc,S,T,V,Hc,X,dipole_int_AO,                          &
+                  maxSCF_HF,max_diis_HF,thresh_HF,level_shift,guess_type,mix,reg_MP,                      &
+                  maxSCF_CC,max_diis_CC,thresh_CC,                                                        &
+                  TDA,maxSCF_GF,max_diis_GF,thresh_GF,lin_GF,reg_GF,eta_GF,                               &
+                  maxSCF_GW,max_diis_GW,thresh_GW,TDA_W,lin_GW,reg_GW,eta_GW,                             &
+                  maxSCF_GT,max_diis_GT,thresh_GT,TDA_T,lin_GT,reg_GT,eta_GT,                             &
                   dophBSE,dophBSE2,doppBSE,dBSE,dTDA,doACFDT,exchange_kernel,doXBS)
 
   implicit none
   include 'parameters.h'
 
+  character(len=256),intent(in) :: working_dir
+
   logical,intent(in)            :: dotest
 
   logical,intent(in)            :: doGHF
@@ -41,7 +43,6 @@ subroutine GQuAcK(dotest,doGHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,do
   double precision,intent(in)   :: Hc(nBas,nBas)
   double precision,intent(in)   :: X(nBas,nBas)
   double precision,intent(in)   :: dipole_int_AO(nBas,nBas,ncart)
-  double precision,intent(in)   :: ERI_AO(nBas,nBas,nBas,nBas)
 
   integer,intent(in)            :: maxSCF_HF,max_diis_HF
   double precision,intent(in)   :: thresh_HF,level_shift,mix
@@ -86,9 +87,11 @@ subroutine GQuAcK(dotest,doGHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,do
   double precision              :: start_GW     ,end_GW       ,t_GW
   double precision              :: start_GT     ,end_GT       ,t_GT
 
+  double precision              :: start_int, end_int, t_int 
   double precision,allocatable  :: cHF(:,:),eHF(:),PHF(:,:),FHF(:,:)
   double precision              :: EGHF
   double precision,allocatable  :: dipole_int_MO(:,:,:)
+  double precision,allocatable  :: ERI_AO(:,:,:,:)
   double precision,allocatable  :: ERI_MO(:,:,:,:)
   double precision,allocatable  :: ERI_tmp(:,:,:,:)
   double precision,allocatable  :: Ca(:,:),Cb(:,:)
@@ -112,6 +115,17 @@ subroutine GQuAcK(dotest,doGHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,do
   allocate(cHF(nBas2,nBas2),eHF(nBas2),PHF(nBas2,nBas2),FHF(nBas2,nBas2), &
            dipole_int_MO(nBas2,nBas2,ncart),ERI_MO(nBas2,nBas2,nBas2,nBas2))
 
+
+  allocate(ERI_AO(nBas,nBas,nBas,nBas))
+  call wall_time(start_int)
+  call read_2e_integrals(working_dir,nBas,ERI_AO)
+  call wall_time(end_int)
+  t_int = end_int - start_int
+  write(*,*)
+  write(*,'(A65,1X,F9.3,A8)') 'Total wall time for reading 2e-integrals =',t_int,' seconds'
+  write(*,*)
+
+
 !---------------------!
 ! Hartree-Fock module !
 !---------------------!
diff --git a/src/QuAcK/QuAcK.f90 b/src/QuAcK/QuAcK.f90
index 586bfff..8fd0214 100644
--- a/src/QuAcK/QuAcK.f90
+++ b/src/QuAcK/QuAcK.f90
@@ -32,7 +32,6 @@ program QuAcK
   double precision,allocatable  :: Hc(:,:)
   double precision,allocatable  :: X(:,:),X_tmp(:,:)
   double precision,allocatable  :: dipole_int_AO(:,:,:)
-  double precision,allocatable  :: ERI_AO(:,:,:,:)
   double precision,allocatable  :: Uvec(:,:), Uval(:)
 
   double precision              :: start_QuAcK,end_QuAcK,t_QuAcK
@@ -44,6 +43,9 @@ program QuAcK
 
   logical                       :: reg_MP
 
+  logical                       :: switch_hpc
+  logical                       :: use_gpu
+
   integer                       :: maxSCF_CC,max_diis_CC
   double precision              :: thresh_CC
 
@@ -134,6 +136,12 @@ program QuAcK
                     doACFDT,exchange_kernel,doXBS,                                              &
                     dophBSE,dophBSE2,doppBSE,dBSE,dTDA)
 
+!------------------!
+! Hardware         !
+!------------------!
+
+  call read_hpc_flags(working_dir,switch_hpc,use_gpu)
+
 !------------------------------------!
 ! Read input information             !
 !------------------------------------!
@@ -168,21 +176,19 @@ program QuAcK
   allocate(T(nBas,nBas))
   allocate(V(nBas,nBas))
   allocate(Hc(nBas,nBas))
-  allocate(ERI_AO(nBas,nBas,nBas,nBas))
   allocate(dipole_int_AO(nBas,nBas,ncart))
 
 ! Read integrals
 
   call wall_time(start_int)
 
-  call read_integrals(working_dir,nBas,S,T,V,Hc,ERI_AO)
+  call read_1e_integrals(working_dir,nBas,S,T,V,Hc)
   call read_dipole_integrals(working_dir,nBas,dipole_int_AO)
-
   call wall_time(end_int)
 
   t_int = end_int - start_int
   write(*,*)
-  write(*,'(A65,1X,F9.3,A8)') 'Total wall time for reading integrals = ',t_int,' seconds'
+  write(*,'(A65,1X,F9.3,A8)') 'Total wall time for reading 1e-integrals = ',t_int,' seconds'
   write(*,*)
 
 ! Compute orthogonalization matrix
@@ -218,29 +224,44 @@ program QuAcK
 ! Restricted QuAcK branch !
 !-------------------------!
 
-  if(doRQuAcK) &
-    call RQuAcK(doRtest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT,              &
-                dodrCCD,dorCCD,docrCCD,dolCCD,doCIS,doCIS_D,doCID,doCISD,doFCI,dophRPA,dophRPAx,docrRPA,doppRPA, &
-                doG0F2,doevGF2,doqsGF2,doufG0F02,doG0F3,doevGF3,doG0W0,doevGW,doqsGW,doufG0W0,doufGW,            &
-                doG0T0pp,doevGTpp,doqsGTpp,doufG0T0pp,doG0T0eh,doevGTeh,doqsGTeh,                                & 
-                nNuc,nBas,nOrb,nC,nO,nV,nR,ENuc,ZNuc,rNuc,                                                       &
-                S,T,V,Hc,X,dipole_int_AO,ERI_AO,maxSCF_HF,max_diis_HF,thresh_HF,level_shift,                     &
-                guess_type,mix,reg_MP,maxSCF_CC,max_diis_CC,thresh_CC,spin_conserved,spin_flip,TDA,              &
-                maxSCF_GF,max_diis_GF,renorm_GF,thresh_GF,lin_GF,reg_GF,eta_GF,maxSCF_GW,max_diis_GW,thresh_GW,  &
-                TDA_W,lin_GW,reg_GW,eta_GW,maxSCF_GT,max_diis_GT,thresh_GT,TDA_T,lin_GT,reg_GT,eta_GT,           &
-                dophBSE,dophBSE2,doppBSE,dBSE,dTDA,doACFDT,exchange_kernel,doXBS)
+  if(doRQuAcK) then
+
+    if(switch_hpc) then
+      call RQuAcK_hpc(working_dir,use_gpu,doRtest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT, &
+                      dodrCCD,dorCCD,docrCCD,dolCCD,doCIS,doCIS_D,doCID,doCISD,doFCI,dophRPA,dophRPAx,docrRPA,doppRPA,        &
+                      doG0F2,doevGF2,doqsGF2,doufG0F02,doG0F3,doevGF3,doG0W0,doevGW,doqsGW,doufG0W0,doufGW,                   &
+                      doG0T0pp,doevGTpp,doqsGTpp,doufG0T0pp,doG0T0eh,doevGTeh,doqsGTeh,                                       &
+                      nNuc,nBas,nOrb,nC,nO,nV,nR,ENuc,ZNuc,rNuc,                                                              &
+                      S,T,V,Hc,X,dipole_int_AO,maxSCF_HF,max_diis_HF,thresh_HF,level_shift,                                   &
+                      guess_type,mix,reg_MP,maxSCF_CC,max_diis_CC,thresh_CC,spin_conserved,spin_flip,TDA,                     &
+                      maxSCF_GF,max_diis_GF,renorm_GF,thresh_GF,lin_GF,reg_GF,eta_GF,maxSCF_GW,max_diis_GW,thresh_GW,         &
+                      TDA_W,lin_GW,reg_GW,eta_GW,maxSCF_GT,max_diis_GT,thresh_GT,TDA_T,lin_GT,reg_GT,eta_GT,                  &
+                      dophBSE,dophBSE2,doppBSE,dBSE,dTDA,doACFDT,exchange_kernel,doXBS)
+    else
+      call RQuAcK(working_dir,use_gpu,doRtest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT, &
+                  dodrCCD,dorCCD,docrCCD,dolCCD,doCIS,doCIS_D,doCID,doCISD,doFCI,dophRPA,dophRPAx,docrRPA,doppRPA,        &
+                  doG0F2,doevGF2,doqsGF2,doufG0F02,doG0F3,doevGF3,doG0W0,doevGW,doqsGW,doufG0W0,doufGW,                   &
+                  doG0T0pp,doevGTpp,doqsGTpp,doufG0T0pp,doG0T0eh,doevGTeh,doqsGTeh,                                       &
+                  nNuc,nBas,nOrb,nC,nO,nV,nR,ENuc,ZNuc,rNuc,                                                              &
+                  S,T,V,Hc,X,dipole_int_AO,maxSCF_HF,max_diis_HF,thresh_HF,level_shift,                                   &
+                  guess_type,mix,reg_MP,maxSCF_CC,max_diis_CC,thresh_CC,spin_conserved,spin_flip,TDA,                     &
+                  maxSCF_GF,max_diis_GF,renorm_GF,thresh_GF,lin_GF,reg_GF,eta_GF,maxSCF_GW,max_diis_GW,thresh_GW,         &
+                  TDA_W,lin_GW,reg_GW,eta_GW,maxSCF_GT,max_diis_GT,thresh_GT,TDA_T,lin_GT,reg_GT,eta_GT,                  &
+                  dophBSE,dophBSE2,doppBSE,dBSE,dTDA,doACFDT,exchange_kernel,doXBS)
+    endif
+  endif
 
 !---------------------------!
 ! Unrestricted QuAcK branch !
 !---------------------------!
 
   if(doUQuAcK) &
-    call UQuAcK(doUtest,doUHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT,                     &
+    call UQuAcK(working_dir,doUtest,doUHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT,         &
                 dodrCCD,dorCCD,docrCCD,dolCCD,doCIS,doCIS_D,doCID,doCISD,doFCI,dophRPA,dophRPAx,docrRPA,doppRPA, &
                 doG0F2,doevGF2,doqsGF2,doufG0F02,doG0F3,doevGF3,doG0W0,doevGW,doqsGW,doufG0W0,doufGW,            &
                 doG0T0pp,doevGTpp,doqsGTpp,doufG0T0pp,doG0T0eh,doevGTeh,doqsGTeh,                                & 
                 nNuc,nBas,nC,nO,nV,nR,ENuc,ZNuc,rNuc,                                                            &
-                S,T,V,Hc,X,dipole_int_AO,ERI_AO,maxSCF_HF,max_diis_HF,thresh_HF,level_shift,                     &
+                S,T,V,Hc,X,dipole_int_AO,maxSCF_HF,max_diis_HF,thresh_HF,level_shift,                            &
                 guess_type,mix,reg_MP,maxSCF_CC,max_diis_CC,thresh_CC,spin_conserved,spin_flip,TDA,              &
                 maxSCF_GF,max_diis_GF,renorm_GF,thresh_GF,lin_GF,reg_GF,eta_GF,maxSCF_GW,max_diis_GW,thresh_GW,  &
                 TDA_W,lin_GW,reg_GW,eta_GW,maxSCF_GT,max_diis_GT,thresh_GT,TDA_T,lin_GT,reg_GT,eta_GT,           &
@@ -250,10 +271,10 @@ program QuAcK
 ! Generalized QuAcK branch !
 !--------------------------!
   if(doGQuAcK) & 
-    call GQuAcK(doGtest,doGHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT,                  &
+    call GQuAcK(working_dir,doGtest,doGHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT,      &
                 dodrCCD,dorCCD,docrCCD,dolCCD,dophRPA,dophRPAx,docrRPA,doppRPA,                               &
                 doG0W0,doevGW,doqsGW,doG0F2,doevGF2,doqsGF2,doG0T0pp,doevGTpp,doqsGTpp,                       &
-                nNuc,nBas,sum(nC),sum(nO),sum(nV),sum(nR),ENuc,ZNuc,rNuc,S,T,V,Hc,X,dipole_int_AO,ERI_AO,     &
+                nNuc,nBas,sum(nC),sum(nO),sum(nV),sum(nR),ENuc,ZNuc,rNuc,S,T,V,Hc,X,dipole_int_AO,            &
                 maxSCF_HF,max_diis_HF,thresh_HF,level_shift,guess_type,mix,reg_MP,                            &
                 maxSCF_CC,max_diis_CC,thresh_CC,TDA,maxSCF_GF,max_diis_GF,thresh_GF,lin_GF,reg_GF,eta_GF,     &
                 maxSCF_GW,max_diis_GW,thresh_GW,TDA_W,lin_GW,reg_GW,eta_GW,                                   &
@@ -282,4 +303,10 @@ program QuAcK
   write(*,'(A65,1X,F9.3,A8)') 'Total wall time for QuAcK = ',t_QuAcK,' seconds'
   write(*,*)
 
+  deallocate(S)
+  deallocate(T)
+  deallocate(V)
+  deallocate(Hc)
+  deallocate(dipole_int_AO)
+
 end program 
diff --git a/src/QuAcK/RQuAcK.f90 b/src/QuAcK/RQuAcK.f90
index 57f0fe5..16b8159 100644
--- a/src/QuAcK/RQuAcK.f90
+++ b/src/QuAcK/RQuAcK.f90
@@ -1,12 +1,12 @@
-subroutine RQuAcK(dotest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT,               &   
-                  dodrCCD,dorCCD,docrCCD,dolCCD,doCIS,doCIS_D,doCID,doCISD,doFCI,dophRPA,dophRPAx,docrRPA,doppRPA, & 
-                  doG0F2,doevGF2,doqsGF2,doufG0F02,doG0F3,doevGF3,doG0W0,doevGW,doqsGW,doufG0W0,doufGW,            &
-                  doG0T0pp,doevGTpp,doqsGTpp,doufG0T0pp,doG0T0eh,doevGTeh,doqsGTeh,                                & 
-                  nNuc,nBas,nOrb,nC,nO,nV,nR,ENuc,ZNuc,rNuc,                                                       &
-                  S,T,V,Hc,X,dipole_int_AO,ERI_AO,maxSCF_HF,max_diis_HF,thresh_HF,level_shift,                     &
-                  guess_type,mix,reg_MP,maxSCF_CC,max_diis_CC,thresh_CC,singlet,triplet,TDA,                       &
-                  maxSCF_GF,max_diis_GF,renorm_GF,thresh_GF,lin_GF,reg_GF,eta_GF,maxSCF_GW,max_diis_GW,thresh_GW,  & 
-                  TDA_W,lin_GW,reg_GW,eta_GW,maxSCF_GT,max_diis_GT,thresh_GT,TDA_T,lin_GT,reg_GT,eta_GT,           & 
+subroutine RQuAcK(working_dir,use_gpu,dotest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT, &
+                  dodrCCD,dorCCD,docrCCD,dolCCD,doCIS,doCIS_D,doCID,doCISD,doFCI,dophRPA,dophRPAx,docrRPA,doppRPA,       & 
+                  doG0F2,doevGF2,doqsGF2,doufG0F02,doG0F3,doevGF3,doG0W0,doevGW,doqsGW,doufG0W0,doufGW,                  &
+                  doG0T0pp,doevGTpp,doqsGTpp,doufG0T0pp,doG0T0eh,doevGTeh,doqsGTeh,                                      & 
+                  nNuc,nBas,nOrb,nC,nO,nV,nR,ENuc,ZNuc,rNuc,                                                             &
+                  S,T,V,Hc,X,dipole_int_AO,maxSCF_HF,max_diis_HF,thresh_HF,level_shift,                                  &
+                  guess_type,mix,reg_MP,maxSCF_CC,max_diis_CC,thresh_CC,singlet,triplet,TDA,                             &
+                  maxSCF_GF,max_diis_GF,renorm_GF,thresh_GF,lin_GF,reg_GF,eta_GF,maxSCF_GW,max_diis_GW,thresh_GW,        & 
+                  TDA_W,lin_GW,reg_GW,eta_GW,maxSCF_GT,max_diis_GT,thresh_GT,TDA_T,lin_GT,reg_GT,eta_GT,                 & 
                   dophBSE,dophBSE2,doppBSE,dBSE,dTDA,doACFDT,exchange_kernel,doXBS)
 
 ! Restricted branch of QuAcK
@@ -14,6 +14,10 @@ subroutine RQuAcK(dotest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,d
   implicit none
   include 'parameters.h'
 
+  character(len=256),intent(in) :: working_dir
+
+  logical,intent(in)            :: use_gpu
+
   logical,intent(in)            :: dotest
 
   logical,intent(in)            :: doRHF,doROHF
@@ -44,7 +48,6 @@ subroutine RQuAcK(dotest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,d
   double precision,intent(in)   :: Hc(nBas,nBas)
   double precision,intent(in)   :: X(nBas,nOrb)
   double precision,intent(in)   :: dipole_int_AO(nBas,nBas,ncart)
-  double precision,intent(in)   :: ERI_AO(nBas,nBas,nBas,nBas)
 
   integer,intent(in)            :: maxSCF_HF,max_diis_HF
   double precision,intent(in)   :: thresh_HF,level_shift,mix
@@ -92,12 +95,14 @@ subroutine RQuAcK(dotest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,d
   double precision              :: start_GW     ,end_GW       ,t_GW
   double precision              :: start_GT     ,end_GT       ,t_GT
 
+  double precision              :: start_int, end_int, t_int
   double precision,allocatable  :: eHF(:)
   double precision,allocatable  :: cHF(:,:)
   double precision,allocatable  :: PHF(:,:)
   double precision,allocatable  :: FHF(:,:)
   double precision              :: ERHF
   double precision,allocatable  :: dipole_int_MO(:,:,:)
+  double precision,allocatable  :: ERI_AO(:,:,:,:)
   double precision,allocatable  :: ERI_MO(:,:,:,:)
   integer                       :: ixyz
   integer                       :: nS
@@ -119,6 +124,15 @@ subroutine RQuAcK(dotest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,d
   allocate(dipole_int_MO(nOrb,nOrb,ncart))
   allocate(ERI_MO(nOrb,nOrb,nOrb,nOrb))
 
+  allocate(ERI_AO(nBas,nBas,nBas,nBas))
+  call wall_time(start_int)
+  call read_2e_integrals(working_dir,nBas,ERI_AO)
+  call wall_time(end_int)
+  t_int = end_int - start_int
+  write(*,*)
+  write(*,'(A65,1X,F9.3,A8)') 'Total wall time for reading 2e-integrals =',t_int,' seconds'
+  write(*,*)
+
 !---------------------!
 ! Hartree-Fock module !
 !---------------------!
@@ -274,7 +288,7 @@ subroutine RQuAcK(dotest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,d
   if(doRPA) then
 
     call wall_time(start_RPA)
-    call RRPA(dotest,dophRPA,dophRPAx,docrRPA,doppRPA,TDA,doACFDT,exchange_kernel,singlet,triplet, & 
+    call RRPA(use_gpu,dotest,dophRPA,dophRPAx,docrRPA,doppRPA,TDA,doACFDT,exchange_kernel,singlet,triplet, &
               nOrb,nC,nO,nV,nR,nS,ENuc,ERHF,ERI_MO,dipole_int_MO,eHF)
     call wall_time(end_RPA)
 
@@ -348,4 +362,13 @@ subroutine RQuAcK(dotest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,d
 
   end if
 
+
+  deallocate(eHF)
+  deallocate(cHF)
+  deallocate(PHF)
+  deallocate(FHF)
+  deallocate(dipole_int_MO)
+  deallocate(ERI_MO)
+  deallocate(ERI_AO)
+
 end subroutine
diff --git a/src/QuAcK/RQuAcK_hpc.f90 b/src/QuAcK/RQuAcK_hpc.f90
new file mode 100644
index 0000000..9d13a57
--- /dev/null
+++ b/src/QuAcK/RQuAcK_hpc.f90
@@ -0,0 +1,353 @@
+subroutine RQuAcK_hpc(working_dir,use_gpu,dotest,doRHF,doROHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT, &
+                      dodrCCD,dorCCD,docrCCD,dolCCD,doCIS,doCIS_D,doCID,doCISD,doFCI,dophRPA,dophRPAx,docrRPA,doppRPA,       & 
+                      doG0F2,doevGF2,doqsGF2,doufG0F02,doG0F3,doevGF3,doG0W0,doevGW,doqsGW,doufG0W0,doufGW,                  &
+                      doG0T0pp,doevGTpp,doqsGTpp,doufG0T0pp,doG0T0eh,doevGTeh,doqsGTeh,                                      & 
+                      nNuc,nBas,nOrb,nC,nO,nV,nR,ENuc,ZNuc,rNuc,                                                             &
+                      S,T,V,Hc,X,dipole_int_AO,maxSCF_HF,max_diis_HF,thresh_HF,level_shift,                                  &
+                      guess_type,mix,reg_MP,maxSCF_CC,max_diis_CC,thresh_CC,singlet,triplet,TDA,                             &
+                      maxSCF_GF,max_diis_GF,renorm_GF,thresh_GF,lin_GF,reg_GF,eta_GF,maxSCF_GW,max_diis_GW,thresh_GW,        & 
+                      TDA_W,lin_GW,reg_GW,eta_GW,maxSCF_GT,max_diis_GT,thresh_GT,TDA_T,lin_GT,reg_GT,eta_GT,                 & 
+                      dophBSE,dophBSE2,doppBSE,dBSE,dTDA,doACFDT,exchange_kernel,doXBS)
+
+! Restricted branch of QuAcK
+
+  implicit none
+  include 'parameters.h'
+
+  character(len=256),intent(in) :: working_dir
+
+  logical,intent(in)            :: use_gpu
+
+  logical,intent(in)            :: dotest
+
+  logical,intent(in)            :: doRHF,doROHF
+  logical,intent(in)            :: dostab
+  logical,intent(in)            :: dosearch
+  logical,intent(in)            :: doMP2,doMP3
+  logical,intent(in)            :: doCCD,dopCCD,doDCD,doCCSD,doCCSDT
+  logical,intent(in)            :: dodrCCD,dorCCD,docrCCD,dolCCD
+  logical,intent(in)            :: doCIS,doCIS_D,doCID,doCISD,doFCI
+  logical,intent(in)            :: dophRPA,dophRPAx,docrRPA,doppRPA
+  logical,intent(in)            :: doG0F2,doevGF2,doqsGF2,doufG0F02,doG0F3,doevGF3
+  logical,intent(in)            :: doG0W0,doevGW,doqsGW,doufG0W0,doufGW
+  logical,intent(in)            :: doG0T0pp,doevGTpp,doqsGTpp,doufG0T0pp
+  logical,intent(in)            :: doG0T0eh,doevGTeh,doqsGTeh
+
+  integer,intent(in)            :: nNuc,nBas,nOrb
+  integer,intent(in)            :: nC
+  integer,intent(in)            :: nO
+  integer,intent(in)            :: nV
+  integer,intent(in)            :: nR
+  double precision,intent(in)   :: ENuc
+
+  double precision,intent(in)   :: ZNuc(nNuc),rNuc(nNuc,ncart)
+
+  double precision,intent(in)   :: S(nBas,nBas)
+  double precision,intent(in)   :: T(nBas,nBas)
+  double precision,intent(in)   :: V(nBas,nBas)
+  double precision,intent(in)   :: Hc(nBas,nBas)
+  double precision,intent(in)   :: X(nBas,nOrb)
+  double precision,intent(in)   :: dipole_int_AO(nBas,nBas,ncart)
+
+  integer,intent(in)            :: maxSCF_HF,max_diis_HF
+  double precision,intent(in)   :: thresh_HF,level_shift,mix
+  integer,intent(in)            :: guess_type
+
+  logical,intent(in)            :: reg_MP
+
+  integer,intent(in)            :: maxSCF_CC,max_diis_CC
+  double precision,intent(in)   :: thresh_CC
+
+  logical,intent(in)            :: singlet
+  logical,intent(in)            :: triplet
+  logical,intent(in)            :: TDA
+
+  integer,intent(in)            :: maxSCF_GF,max_diis_GF,renorm_GF
+  double precision,intent(in)   :: thresh_GF
+  logical,intent(in)            :: lin_GF,reg_GF
+  double precision,intent(in)   :: eta_GF
+
+  integer,intent(in)            :: maxSCF_GW,max_diis_GW
+  double precision,intent(in)   :: thresh_GW
+  logical,intent(in)            :: TDA_W,lin_GW,reg_GW
+  double precision,intent(in)   :: eta_GW
+
+  integer,intent(in)            :: maxSCF_GT,max_diis_GT
+  double precision,intent(in)   :: thresh_GT
+  logical,intent(in)            :: TDA_T,lin_GT,reg_GT
+  double precision,intent(in)   :: eta_GT
+
+  logical,intent(in)            :: dophBSE,dophBSE2,doppBSE,dBSE,dTDA
+  logical,intent(in)            :: doACFDT,exchange_kernel,doXBS
+
+! Local variables
+
+  logical                       :: doMP,doCC,doCI,doRPA,doGF,doGW,doGT
+
+  double precision              :: start_HF     ,end_HF       ,t_HF
+  double precision              :: start_stab   ,end_stab     ,t_stab
+  double precision              :: start_AOtoMO ,end_AOtoMO   ,t_AOtoMO
+  double precision              :: start_MP     ,end_MP       ,t_MP
+  double precision              :: start_CC     ,end_CC       ,t_CC
+  double precision              :: start_CI     ,end_CI       ,t_CI
+  double precision              :: start_RPA    ,end_RPA      ,t_RPA
+  double precision              :: start_GF     ,end_GF       ,t_GF
+  double precision              :: start_GW     ,end_GW       ,t_GW
+  double precision              :: start_GT     ,end_GT       ,t_GT
+
+  double precision              :: start_int, end_int, t_int
+  double precision,allocatable  :: eHF(:)
+  double precision,allocatable  :: cHF(:,:)
+  double precision,allocatable  :: PHF(:,:)
+  double precision,allocatable  :: FHF(:,:)
+  double precision              :: ERHF
+  integer                       :: ixyz
+  integer                       :: nS
+
+  write(*,*)
+  write(*,*) '*****************************************'
+  write(*,*) '* Restricted Branch of QuAcK (HPC mode) *'
+  write(*,*) '*****************************************'
+  write(*,*)
+
+!-------------------!
+! Memory allocation !
+!-------------------!
+
+  allocate(eHF(nOrb))
+  allocate(cHF(nBas,nOrb))
+  allocate(PHF(nBas,nBas))
+  allocate(FHF(nBas,nBas))
+
+!---------------------!
+! Hartree-Fock module !
+!---------------------!
+
+  if(doRHF) then
+
+    call wall_time(start_HF)
+    call RHF_hpc(working_dir,dotest,maxSCF_HF,thresh_HF,max_diis_HF,guess_type,level_shift,nNuc,ZNuc,rNuc,ENuc, &
+                 nBas,nOrb,nO,S,T,V,Hc,dipole_int_AO,X,ERHF,eHF,cHF,PHF,FHF)
+    call wall_time(end_HF)
+
+    t_HF = end_HF - start_HF
+    write(*,'(A65,1X,F9.3,A8)') 'Total wall time for RHF = ',t_HF,' seconds'
+    write(*,*)
+
+  end if
+
+!  if(doROHF) then
+!
+!    call wall_time(start_HF)
+!    call ROHF(dotest,maxSCF_HF,thresh_HF,max_diis_HF,guess_type,mix,level_shift,nNuc,ZNuc,rNuc,ENuc, &
+!              nBas,nOrb,nO,S,T,V,Hc,ERI_AO,dipole_int_AO,X,ERHF,eHF,cHF,PHF,FHF)
+!    call wall_time(end_HF)
+!
+!    t_HF = end_HF - start_HF
+!    write(*,'(A65,1X,F9.3,A8)') 'Total wall time for ROHF = ',t_HF,' seconds'
+!    write(*,*)
+!
+!  end if
+!
+!!----------------------------------!
+!! AO to MO integral transformation !
+!!----------------------------------!
+!
+!  call wall_time(start_AOtoMO)
+!
+!  write(*,*)
+!  write(*,*) 'AO to MO transformation... Please be patient'
+!  write(*,*)
+!
+!  ! Read and transform dipole-related integrals
+!  
+!  do ixyz=1,ncart
+!    call AOtoMO(nBas,nOrb,cHF,dipole_int_AO(1,1,ixyz),dipole_int_MO(1,1,ixyz))
+!  end do 
+!
+!  ! 4-index transform 
+!  
+!  call AOtoMO_ERI_RHF(nBas,nOrb,cHF,ERI_AO,ERI_MO)
+!
+!  call wall_time(end_AOtoMO)
+!
+!  t_AOtoMO = end_AOtoMO - start_AOtoMO
+!  write(*,'(A65,1X,F9.3,A8)') 'Total wall time for AO to MO transformation = ',t_AOtoMO,' seconds'
+!  write(*,*)
+!
+!!-----------------------------------!
+!! Stability analysis of HF solution !
+!!-----------------------------------!
+!
+!  nS = (nO - nC)*(nV - nR)
+!
+!  if(dostab) then
+!
+!    call wall_time(start_stab)
+!    call RHF_stability(nOrb,nC,nO,nV,nR,nS,eHF,ERI_MO)
+!    call wall_time(end_stab)
+!
+!    t_stab = end_stab - start_stab
+!    write(*,'(A65,1X,F9.3,A8)') 'Total wall time for stability analysis = ',t_stab,' seconds'
+!    write(*,*)
+!
+!  end if
+!
+!  if(dosearch) then
+!
+!    call wall_time(start_stab)
+!    call RHF_search(maxSCF_HF,thresh_HF,max_diis_HF,guess_type,level_shift,nNuc,ZNuc,rNuc,ENuc, &
+!                    nBas,nOrb,nC,nO,nV,nR,S,T,V,Hc,ERI_AO,ERI_MO,dipole_int_AO,dipole_int_MO,X, & 
+!                    ERHF,eHF,cHF,PHF,FHF)
+!    call wall_time(end_stab)
+!
+!    t_stab = end_stab - start_stab
+!    write(*,'(A65,1X,F9.3,A8)') 'Total wall time for stability analysis = ',t_stab,' seconds'
+!    write(*,*)
+!
+!  end if
+!
+!!-----------------------!
+!! Moller-Plesset module !
+!!-----------------------!
+!
+!  doMP = doMP2 .or. doMP3
+!
+!  if(doMP) then
+!
+!    call wall_time(start_MP)
+!    call RMP(dotest,doMP2,doMP3,reg_MP,nOrb,nC,nO,nV,nR,ERI_MO,ENuc,ERHF,eHF)
+!    call wall_time(end_MP)
+!
+!    t_MP = end_MP - start_MP
+!    write(*,'(A65,1X,F9.3,A8)') 'Total wall time for MP = ',t_MP,' seconds'
+!    write(*,*)
+!
+!  end if
+!
+!!------------------------!
+!! Coupled-cluster module !
+!!------------------------!
+!
+!  doCC = doCCD .or. dopCCD .or. doDCD .or. doCCSD .or. doCCSDT .or. &  
+!         dodrCCD .or. dorCCD .or. docrCCD .or. dolCCD
+!
+!  if(doCC) then
+!
+!    call wall_time(start_CC)
+!    call RCC(dotest,doCCD,dopCCD,doDCD,doCCSD,doCCSDT,dodrCCD,dorCCD,docrCCD,dolCCD, & 
+!             maxSCF_CC,thresh_CC,max_diis_CC,nBas,nOrb,nC,nO,nV,nR,Hc,ERI_AO,ERI_MO, & 
+!             ENuc,ERHF,eHF,cHF,PHF,FHF)
+!    call wall_time(end_CC)
+!
+!    t_CC = end_CC - start_CC
+!    write(*,'(A65,1X,F9.3,A8)') 'Total wall time for CC = ',t_CC,' seconds'
+!    write(*,*)
+!
+!  end if
+!
+!!----------------------------------!
+!! Configuration interaction module !
+!!----------------------------------!
+!
+!  doCI = doCIS .or. doCID .or. doCISD .or. doFCI
+!
+!  if(doCI) then
+!
+!    call wall_time(start_CI)
+!    call RCI(dotest,doCIS,doCIS_D,doCID,doCISD,doFCI,singlet,triplet,nOrb, &
+!             nC,nO,nV,nR,nS,ERI_MO,dipole_int_MO,eHF,ERHF)
+!    call wall_time(end_CI)
+!
+!    t_CI = end_CI - start_CI
+!    write(*,'(A65,1X,F9.3,A8)') 'Total wall time for CI = ',t_CI,' seconds'
+!    write(*,*)
+!
+!  end if
+!
+!!-----------------------------------!
+!! Random-phase approximation module !
+!!-----------------------------------!
+!
+!  doRPA = dophRPA .or. dophRPAx .or. docrRPA .or. doppRPA
+!
+!  if(doRPA) then
+!
+!    call wall_time(start_RPA)
+!    call RRPA(use_gpu,dotest,dophRPA,dophRPAx,docrRPA,doppRPA,TDA,doACFDT,exchange_kernel,singlet,triplet, &
+!              nOrb,nC,nO,nV,nR,nS,ENuc,ERHF,ERI_MO,dipole_int_MO,eHF)
+!    call wall_time(end_RPA)
+!
+!    t_RPA = end_RPA - start_RPA
+!    write(*,'(A65,1X,F9.3,A8)') 'Total wall time for RPA = ',t_RPA,' seconds'
+!    write(*,*)
+!
+!  end if
+!
+!!-------------------------!
+!! Green's function module !
+!!-------------------------!
+!
+!  doGF = doG0F2 .or. doevGF2 .or. doqsGF2 .or. doufG0F02 .or. doG0F3 .or. doevGF3
+!
+!  if(doGF) then
+!
+!    call wall_time(start_GF)
+!    call RGF(dotest,doG0F2,doevGF2,doqsGF2,doufG0F02,doG0F3,doevGF3,renorm_GF,maxSCF_GF, &
+!             thresh_GF,max_diis_GF,dophBSE,doppBSE,TDA,dBSE,dTDA,singlet,triplet,lin_GF, &
+!             eta_GF,reg_GF,nNuc,ZNuc,rNuc,ENuc,nBas,nOrb,nC,nO,nV,nR,nS,ERHF,            &
+!             S,X,T,V,Hc,ERI_AO,ERI_MO,dipole_int_AO,dipole_int_MO,PHF,cHF,eHF)
+!    call wall_time(end_GF)
+!
+!    t_GF = end_GF - start_GF
+!    write(*,'(A65,1X,F9.3,A8)') 'Total wall time for GF2 = ',t_GF,' seconds'
+!    write(*,*)
+!
+!  end if
+!
+!!-----------!
+!! GW module !
+!!-----------!
+!
+!  doGW = doG0W0 .or. doevGW .or. doqsGW .or. doufG0W0 .or. doufGW 
+!
+!  if(doGW) then
+!    
+!    call wall_time(start_GW)
+!    call RGW(dotest,doG0W0,doevGW,doqsGW,doufG0W0,doufGW,maxSCF_GW,thresh_GW,max_diis_GW,                &
+!             doACFDT,exchange_kernel,doXBS,dophBSE,dophBSE2,doppBSE,TDA_W,TDA,dBSE,dTDA,singlet,triplet, &
+!             lin_GW,eta_GW,reg_GW,nNuc,ZNuc,rNuc,ENuc,nBas,nOrb,nC,nO,nV,nR,nS,ERHF,S,X,T,               &
+!             V,Hc,ERI_AO,ERI_MO,dipole_int_AO,dipole_int_MO,PHF,cHF,eHF)
+!    call wall_time(end_GW)
+!  
+!    t_GW = end_GW - start_GW
+!    write(*,'(A65,1X,F9.3,A8)') 'Total wall time for GW = ',t_GW,' seconds'
+!    write(*,*)
+!
+!  end if
+!
+!!-----------------!
+!! T-matrix module !
+!!-----------------!
+!
+!  doGT = doG0T0pp .or. doevGTpp .or. doqsGTpp .or. doufG0T0pp .or. doG0T0eh .or. doevGTeh .or. doqsGTeh
+!
+!  if(doGT) then
+!    
+!    call wall_time(start_GT)
+!    call RGT(dotest,doG0T0pp,doevGTpp,doqsGTpp,doufG0T0pp,doG0T0eh,doevGTeh,doqsGTeh,                &
+!             maxSCF_GT,thresh_GT,max_diis_GT,doACFDT,exchange_kernel,doXBS,dophBSE,dophBSE2,doppBSE, &
+!             TDA_T,TDA,dBSE,dTDA,singlet,triplet,lin_GT,eta_GT,reg_GT,nNuc,ZNuc,rNuc,ENuc,           &
+!             nBas,nOrb,nC,nO,nV,nR,nS,ERHF,S,X,T,V,Hc,ERI_AO,ERI_MO,dipole_int_AO,                   &
+!             dipole_int_MO,PHF,cHF,eHF)
+!    call wall_time(end_GT)
+!  
+!    t_GT = end_GT - start_GT
+!    write(*,'(A65,1X,F9.3,A8)') 'Total wall time for GT = ',t_GT,' seconds'
+!    write(*,*)
+!
+!  end if
+
+
+  return
+end subroutine
diff --git a/src/QuAcK/UQuAcK.f90 b/src/QuAcK/UQuAcK.f90
index f813a72..0ff1759 100644
--- a/src/QuAcK/UQuAcK.f90
+++ b/src/QuAcK/UQuAcK.f90
@@ -1,9 +1,9 @@
-subroutine UQuAcK(dotest,doUHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT,                      &
+subroutine UQuAcK(working_dir,dotest,doUHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,doCCSD,doCCSDT,          &
                   dodrCCD,dorCCD,docrCCD,dolCCD,doCIS,doCIS_D,doCID,doCISD,doFCI,dophRPA,dophRPAx,docrRPA,doppRPA, &
                   doG0F2,doevGF2,doqsGF2,doufG0F02,doG0F3,doevGF3,doG0W0,doevGW,doqsGW,doufG0W0,doufGW,            &
                   doG0T0pp,doevGTpp,doqsGTpp,doufG0T0pp,doG0T0eh,doevGTeh,doqsGTeh,                                & 
                   nNuc,nBas,nC,nO,nV,nR,ENuc,ZNuc,rNuc,                                                            &
-                  S,T,V,Hc,X,dipole_int_AO,ERI_AO,maxSCF_HF,max_diis_HF,thresh_HF,level_shift,                     &
+                  S,T,V,Hc,X,dipole_int_AO,maxSCF_HF,max_diis_HF,thresh_HF,level_shift,                            &
                   guess_type,mix,reg_MP,maxSCF_CC,max_diis_CC,thresh_CC,spin_conserved,spin_flip,TDA,              &
                   maxSCF_GF,max_diis_GF,renorm_GF,thresh_GF,lin_GF,reg_GF,eta_GF,maxSCF_GW,max_diis_GW,thresh_GW,  &
                   TDA_W,lin_GW,reg_GW,eta_GW,maxSCF_GT,max_diis_GT,thresh_GT,TDA_T,lin_GT,reg_GT,eta_GT,           &
@@ -12,6 +12,8 @@ subroutine UQuAcK(dotest,doUHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,do
   implicit none
   include 'parameters.h'
 
+  character(len=256),intent(in) :: working_dir
+
   logical,intent(in)            :: dotest
 
   logical,intent(in)            :: doUHF
@@ -42,7 +44,6 @@ subroutine UQuAcK(dotest,doUHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,do
   double precision,intent(in)   :: Hc(nBas,nBas)
   double precision,intent(in)   :: X(nBas,nBas)
   double precision,intent(in)   :: dipole_int_AO(nBas,nBas,ncart)
-  double precision,intent(in)   :: ERI_AO(nBas,nBas,nBas,nBas)
 
   integer,intent(in)            :: maxSCF_HF,max_diis_HF
   double precision,intent(in)   :: thresh_HF,level_shift,mix
@@ -90,10 +91,12 @@ subroutine UQuAcK(dotest,doUHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,do
   double precision              :: start_GW     ,end_GW       ,t_GW
   double precision              :: start_GT     ,end_GT       ,t_GT
 
+  double precision              :: start_int, end_int, t_int
   double precision,allocatable  :: cHF(:,:,:),eHF(:,:),PHF(:,:,:),FHF(:,:,:)
   double precision              :: EUHF
   double precision,allocatable  :: dipole_int_aa(:,:,:),dipole_int_bb(:,:,:)
   double precision,allocatable  :: ERI_aaaa(:,:,:,:),ERI_aabb(:,:,:,:),ERI_bbbb(:,:,:,:)
+  double precision,allocatable  :: ERI_AO(:,:,:,:)
   integer                       :: ixyz
   integer                       :: nS(nspin)
 
@@ -112,6 +115,15 @@ subroutine UQuAcK(dotest,doUHF,dostab,dosearch,doMP2,doMP3,doCCD,dopCCD,doDCD,do
            ERI_aaaa(nBas,nBas,nBas,nBas),ERI_aabb(nBas,nBas,nBas,nBas),                    & 
            ERI_bbbb(nBas,nBas,nBas,nBas))
 
+  allocate(ERI_AO(nBas,nBas,nBas,nBas))
+  call wall_time(start_int)
+  call read_2e_integrals(working_dir,nBas,ERI_AO)
+  call wall_time(end_int)
+  t_int = end_int - start_int
+  write(*,*)
+  write(*,'(A65,1X,F9.3,A8)') 'Total wall time for reading 2e-integrals =',t_int,' seconds'
+  write(*,*)
+
 !---------------------!
 ! Hartree-Fock module !
 !---------------------!
diff --git a/src/QuAcK/read_hpc_flags.f90 b/src/QuAcK/read_hpc_flags.f90
new file mode 100644
index 0000000..4921754
--- /dev/null
+++ b/src/QuAcK/read_hpc_flags.f90
@@ -0,0 +1,39 @@
+subroutine read_hpc_flags(working_dir, switch_hpc, use_gpu)
+
+  implicit none
+
+  character(len=256), intent(in) :: working_dir
+
+  logical, intent(out)           :: switch_hpc
+  logical, intent(out)           :: use_gpu
+
+  character(len=1)              :: ans
+  integer                       :: ios
+  character(len=256)            :: file_path
+
+  file_path = trim(working_dir) // '/input/hpc_flags'
+  open(unit=1, file=file_path, status='old', action='read', iostat=ios)
+
+    if(ios /= 0) then
+
+      switch_hpc = .False.
+      use_gpu = .False.
+
+    else
+
+      switch_hpc = .False.
+      use_gpu = .False.
+
+      read(1,*) 
+      read(1,*) ans
+      if(ans == 'T') switch_hpc = .true.
+
+      read(1,*) 
+      read(1,*) ans
+      if(ans == 'T') use_gpu = .true.
+
+    endif
+
+  close(unit=1)
+
+end subroutine 
diff --git a/src/RPA/RRPA.f90 b/src/RPA/RRPA.f90
index ab38932..5ab652f 100644
--- a/src/RPA/RRPA.f90
+++ b/src/RPA/RRPA.f90
@@ -1,4 +1,4 @@
-subroutine RRPA(dotest,dophRPA,dophRPAx,docrRPA,doppRPA,TDA,doACFDT,exchange_kernel,singlet,triplet, &  
+subroutine RRPA(use_gpu,dotest,dophRPA,dophRPAx,docrRPA,doppRPA,TDA,doACFDT,exchange_kernel,singlet,triplet, &
                 nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF)
 
 ! Random-phase approximation module
@@ -8,6 +8,8 @@ subroutine RRPA(dotest,dophRPA,dophRPAx,docrRPA,doppRPA,TDA,doACFDT,exchange_ker
 
 ! Input variables
 
+  logical,intent(in)            :: use_gpu
+
   logical,intent(in)            :: dotest
 
   logical,intent(in)            :: dophRPA
@@ -43,7 +45,11 @@ subroutine RRPA(dotest,dophRPA,dophRPAx,docrRPA,doppRPA,TDA,doACFDT,exchange_ker
   if(dophRPA) then
 
     call wall_time(start_RPA)
-    call phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF)
+    if (use_gpu) then
+      call phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF)
+    else
+      call phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF)
+    endif
     call wall_time(end_RPA)
 
     t_RPA = end_RPA - start_RPA
diff --git a/src/RPA/phRRPA.f90 b/src/RPA/phRRPA.f90
index 7a13d8f..48f3c5c 100644
--- a/src/RPA/phRRPA.f90
+++ b/src/RPA/phRRPA.f90
@@ -8,37 +8,39 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,
 
 ! Input variables
 
-  logical,intent(in)            :: dotest
+  logical,intent(in)           :: dotest
 
-  logical,intent(in)            :: TDA
-  logical,intent(in)            :: doACFDT
-  logical,intent(in)            :: exchange_kernel
-  logical,intent(in)            :: singlet
-  logical,intent(in)            :: triplet
-  integer,intent(in)            :: nBas
-  integer,intent(in)            :: nC
-  integer,intent(in)            :: nO
-  integer,intent(in)            :: nV
-  integer,intent(in)            :: nR
-  integer,intent(in)            :: nS
-  double precision,intent(in)   :: ENuc
-  double precision,intent(in)   :: ERHF
-  double precision,intent(in)   :: eHF(nBas)
-  double precision,intent(in)   :: ERI(nBas,nBas,nBas,nBas)
-  double precision,intent(in)   :: dipole_int(nBas,nBas,ncart)
+  logical,intent(in)           :: TDA
+  logical,intent(in)           :: doACFDT
+  logical,intent(in)           :: exchange_kernel
+  logical,intent(in)           :: singlet
+  logical,intent(in)           :: triplet
+  integer,intent(in)           :: nBas
+  integer,intent(in)           :: nC
+  integer,intent(in)           :: nO
+  integer,intent(in)           :: nV
+  integer,intent(in)           :: nR
+  integer,intent(in)           :: nS
+  double precision,intent(in)  :: ENuc
+  double precision,intent(in)  :: ERHF
+  double precision,intent(in)  :: eHF(nBas)
+  double precision,intent(in)  :: ERI(nBas,nBas,nBas,nBas)
+  double precision,intent(in)  :: dipole_int(nBas,nBas,ncart)
 
 ! Local variables
 
-  integer                       :: ispin
-  logical                       :: dRPA
-  double precision              :: lambda
-  double precision,allocatable  :: Aph(:,:)
-  double precision,allocatable  :: Bph(:,:)
-  double precision,allocatable  :: Om(:)
-  double precision,allocatable  :: XpY(:,:)
-  double precision,allocatable  :: XmY(:,:)
+  integer                      :: ia
+  integer                      :: ispin
+  logical                      :: dRPA
+  double precision             :: t1, t2
+  double precision             :: lambda
+  double precision,allocatable :: Aph(:,:)
+  double precision,allocatable :: Bph(:,:)
+  double precision,allocatable :: Om(:)
+  double precision,allocatable :: XpY(:,:)
+  double precision,allocatable :: XmY(:,:)
 
-  double precision              :: EcRPA(nspin)
+  double precision             :: EcRPA(nspin)
 
 ! Hello world
 
@@ -71,10 +73,17 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,
 
     ispin = 1
 
+    !call wall_time(t1)
     call phLR_A(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,eHF,ERI,Aph)
     if(.not.TDA) call phLR_B(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,ERI,Bph)
 
     call phLR(TDA,nS,Aph,Bph,EcRPA(ispin),Om,XpY,XmY)
+    !call wall_time(t2)
+    !print *, "wall time for dRPA on CPU (sec) = ", t2 - t1
+    !do ia = 1, nS
+    !  write(112, *) Om(ia) 
+    !enddo
+    !stop
     call print_excitation_energies('phRPA@RHF','singlet',nS,Om)
     call phLR_transition_vectors(.true.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY)
 
diff --git a/src/RPA/phRRPA_GPU.f90 b/src/RPA/phRRPA_GPU.f90
new file mode 100644
index 0000000..dcf65d5
--- /dev/null
+++ b/src/RPA/phRRPA_GPU.f90
@@ -0,0 +1,190 @@
+#ifdef USE_GPU
+
+subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF)
+
+  use cu_quack_module
+
+
+  implicit none
+  include 'parameters.h'
+  include 'quadrature.h'
+
+
+  logical,intent(in)           :: dotest
+  logical,intent(in)           :: TDA
+  logical,intent(in)           :: doACFDT
+  logical,intent(in)           :: exchange_kernel
+  logical,intent(in)           :: singlet
+  logical,intent(in)           :: triplet
+  integer,intent(in)           :: nBas
+  integer,intent(in)           :: nC
+  integer,intent(in)           :: nO
+  integer,intent(in)           :: nV
+  integer,intent(in)           :: nR
+  integer,intent(in)           :: nS
+  double precision,intent(in)  :: ENuc
+  double precision,intent(in)  :: ERHF
+  double precision,intent(in)  :: eHF(nBas)
+  double precision,intent(in)  :: ERI(nBas,nBas,nBas,nBas)
+  double precision,intent(in)  :: dipole_int(nBas,nBas,ncart)
+
+
+  integer                      :: i, a, ia
+  integer                      :: ispin
+  logical                      :: dRPA
+  double precision             :: t1, t2
+  integer,         allocatable :: iorder(:)
+  double precision,allocatable :: Om(:)
+  double precision,allocatable :: XpY(:,:)
+  double precision,allocatable :: XmY(:,:)
+
+  double precision             :: EcRPA(nspin)
+
+
+  write(*,*)
+  write(*,*)'******************************************'
+  write(*,*)'* Restricted ph-RPA Calculation (on GPU) *'
+  write(*,*)'******************************************'
+  write(*,*)
+
+  if(TDA) then
+    write(*,*) 'Tamm-Dancoff approximation activated!'
+    write(*,*)
+  end if
+
+! Initialization
+
+  dRPA = .true.
+  EcRPA(:) = 0d0
+
+
+  allocate(Om(nS), XpY(nS,nS), XmY(nS,nS))
+
+  if(singlet) then 
+
+    if(TDA) then
+
+      !print*, 'start diag on GPU:'
+      !call wall_time(t1)
+      call ph_drpa_tda_sing(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om(1), XpY(1,1))
+      !call wall_time(t2)
+      !print*, 'diag time on GPU (sec):', t2 - t1
+      XmY(:,:) = XpY(:,:)
+
+    else
+
+      !call wall_time(t1)
+      call ph_drpa_sing(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om(1), XpY(1,1), XmY(1,1))
+      !call wall_time(t2)
+      !print *, "wall time for dRPA on GPU (sec) = ", t2 - t1
+      !do ia = 1, nS
+      !  write(111, *) Om(ia)
+      !enddo
+      !stop
+
+    endif
+
+    ! TODO
+    XpY(:,:) = transpose(XpY(:,:))
+    XmY(:,:) = transpose(XmY(:,:))
+
+    call print_excitation_energies('phRPA@RHF','singlet',nS,Om)
+    call phLR_transition_vectors(.true.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY)
+  endif
+
+  if(triplet) then 
+
+    XpY(:,:) = 0.d0
+    allocate(iorder(nS))
+    ia = 0
+    do i = nC+1, nO
+      do a = nO+1, nBas-nR
+        ia = ia + 1
+        iorder(ia) = ia
+        Om(ia) = eHF(a) - eHF(i)
+        XpY(ia,ia) = 1.d0
+      enddo
+    enddo
+
+    call quick_sort(Om(1), iorder(1), nS)
+    deallocate(iorder)
+
+    XmY(:,:) = XpY(:,:)
+
+    call print_excitation_energies('phRPA@RHF','triplet',nS,Om)
+    call phLR_transition_vectors(.false.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY)
+  endif
+
+  deallocate(Om, XpY, XmY)
+
+
+  ! TODO
+  ! init EcRPA
+  if(exchange_kernel) then
+    EcRPA(1) = 0.5d0*EcRPA(1)
+    EcRPA(2) = 1.5d0*EcRPA(2)
+  endif
+
+  write(*,*)
+  write(*,*)'-------------------------------------------------------------------------------'
+  write(*,'(2X,A50,F20.10,A3)') 'Tr@phRPA@RHF correlation energy (singlet) = ',EcRPA(1),' au'
+  write(*,'(2X,A50,F20.10,A3)') 'Tr@phRPA@RHF correlation energy (triplet) = ',EcRPA(2),' au'
+  write(*,'(2X,A50,F20.10,A3)') 'Tr@phRPA@RHF correlation energy           = ',sum(EcRPA),' au'
+  write(*,'(2X,A50,F20.10,A3)') 'Tr@phRPA@RHF total energy                 = ',ENuc + ERHF + sum(EcRPA),' au'
+  write(*,*)'-------------------------------------------------------------------------------'
+  write(*,*)
+
+! Compute the correlation energy via the adiabatic connection 
+
+  if(doACFDT) then
+
+    ! TODO
+    call phACFDT(exchange_kernel,dRPA,TDA,singlet,triplet,nBas,nC,nO,nV,nR,nS,ERI,eHF,EcRPA)
+
+    write(*,*)
+    write(*,*)'-------------------------------------------------------------------------------'
+    write(*,'(2X,A50,F20.10,A3)') 'AC@phRPA@RHF correlation energy (singlet) = ',EcRPA(1),' au'
+    write(*,'(2X,A50,F20.10,A3)') 'AC@phRPA@RHF correlation energy (triplet) = ',EcRPA(2),' au'
+    write(*,'(2X,A50,F20.10,A3)') 'AC@phRPA@RHF correlation energy           = ',sum(EcRPA),' au'
+    write(*,'(2X,A50,F20.10,A3)') 'AC@phRPA@RHF total energy                 = ',ENuc + ERHF + sum(EcRPA),' au'
+    write(*,*)'-------------------------------------------------------------------------------'
+    write(*,*)
+
+  endif
+
+  if(dotest) then
+    call dump_test_value('R','phRPA correlation energy (on GPU)',sum(EcRPA))
+  endif
+
+end subroutine 
+
+#else
+
+subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF)
+
+  implicit none
+  include 'parameters.h'
+  include 'quadrature.h'
+
+  logical,intent(in)            :: dotest
+  logical,intent(in)            :: TDA
+  logical,intent(in)            :: doACFDT
+  logical,intent(in)            :: exchange_kernel
+  logical,intent(in)            :: singlet
+  logical,intent(in)            :: triplet
+  integer,intent(in)            :: nBas
+  integer,intent(in)            :: nC
+  integer,intent(in)            :: nO
+  integer,intent(in)            :: nV
+  integer,intent(in)            :: nR
+  integer,intent(in)            :: nS
+  double precision,intent(in)   :: ENuc
+  double precision,intent(in)   :: ERHF
+  double precision,intent(in)   :: eHF(nBas)
+  double precision,intent(in)   :: ERI(nBas,nBas,nBas,nBas)
+  double precision,intent(in)   :: dipole_int(nBas,nBas,ncart)
+  print*, "compile with USE_GPU FLAG!"
+  stop
+end
+
+#endif
diff --git a/src/cuda/Makefile b/src/cuda/Makefile
new file mode 100644
index 0000000..32c2101
--- /dev/null
+++ b/src/cuda/Makefile
@@ -0,0 +1,44 @@
+NVCC = nvcc
+NVFLAGS = -O2 --compiler-options '-O2 -Wall -fPIC'
+
+CC = gcc
+CFLAGS = -O2 -Wall -g -fPIC
+
+FC = gfortran
+FFLAGS = -O2 -Wall -g -fPIC
+
+SRC_DIR = src
+INC_DIR = include
+BLD_DIR = build
+$(shell mkdir -p $(BLD_DIR))
+
+CU_SRC = $(wildcard $(SRC_DIR)/*.cu)
+CU_OBJ = $(CU_SRC:$(SRC_DIR)/%.cu=$(BLD_DIR)/%.o)
+
+C_SRC = $(wildcard $(SRC_DIR)/*.c)
+C_OBJ = $(C_SRC:$(SRC_DIR)/%.c=$(BLD_DIR)/%.o)
+
+F_SRC = #$(SRC_DIR)/cu_quack_module.f90
+F_OBJ = #$(BLD_DIR)/cu_quack_module.o
+
+OUTPUT_LIB = $(BLD_DIR)/libcuquack.so
+
+all: $(OUTPUT_LIB)
+
+$(OUTPUT_LIB): $(CU_OBJ) $(C_OBJ) $(F_OBJ)
+	$(CC) -shared -o $(OUTPUT_LIB) $(CU_OBJ) $(C_OBJ) $(F_OBJ)
+
+$(BLD_DIR)/%.o: $(SRC_DIR)/%.cu
+	$(NVCC) $(NVFLAGS) -c -o $@ $< -I$(INC_DIR)
+
+$(BLD_DIR)/%.o: $(SRC_DIR)/%.c
+	$(CC) $(CFLAGS) -c -o $@ $< -I$(INC_DIR)
+
+$(F_OBJ): $(F_SRC)
+	$(FC) $(FFLAGS) -c -o $@ $< -J$(BLD_DIR)
+
+.PHONY: clean
+clean:
+	rm $(BLD_DIR)/*
+
+
diff --git a/src/cuda/include/my_linalg.h b/src/cuda/include/my_linalg.h
new file mode 100644
index 0000000..6ad5bf5
--- /dev/null
+++ b/src/cuda/include/my_linalg.h
@@ -0,0 +1,22 @@
+#ifndef MY_LINALG
+
+#define MY_LINALG
+
+extern void A_plus_B_in_A(int n, double *A, double *B);
+extern void A_minus_twoB_in_B(int n, double *A, double *B);
+
+extern void A_D_At(int n, double *A, double *D, double *R);
+extern void A_Dinv_At(int n, double *A, double *D, double *R);
+
+extern void A_D_inplace(int n, double *A, double *D);
+extern void A_Dinv_inplace(int n, double *A, double *D);
+
+extern void A_D_in_B(int n, double *A, double *D, double *B);
+extern void A_Dinv_in_B(int n, double *A, double *D, double *B);
+
+extern void elementwise_dsqrt(int nS, double *A, double *A_Sq);
+extern void elementwise_dsqrt_inplace(int nS, double *A);
+
+extern void diag_dn_dsyevd(int n, int *info, double *W, double *A);
+
+#endif
diff --git a/src/cuda/include/ph_rpa.h b/src/cuda/include/ph_rpa.h
new file mode 100644
index 0000000..abce320
--- /dev/null
+++ b/src/cuda/include/ph_rpa.h
@@ -0,0 +1,11 @@
+#ifndef PH_RPA
+
+#define PH_RPA
+
+extern void ph_dRPA_A_sing(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *A);
+extern void ph_dRPA_B_sing(int nO, int nV, int nBas, int nS, double *ERI, double *B);
+
+extern void ph_dRPA_ApB_sing(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *ApB);
+extern void ph_dRPA_AmB_sing(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *AmB);
+
+#endif
diff --git a/src/cuda/include/utils.h b/src/cuda/include/utils.h
new file mode 100644
index 0000000..f5a6403
--- /dev/null
+++ b/src/cuda/include/utils.h
@@ -0,0 +1,9 @@
+#ifndef UTILS
+
+#define UTILS
+
+extern void check_Cuda_Errors(cudaError_t err, const char *msg, const char *file, int line);
+extern void check_Cublas_Errors(cublasStatus_t status, const char *msg, const char *file, int line);
+extern void check_Cusolver_Errors(cusolverStatus_t status, const char *msg, const char *file, int line);
+
+#endif
diff --git a/src/cuda/src/a_d_at.cu b/src/cuda/src/a_d_at.cu
new file mode 100644
index 0000000..f47b797
--- /dev/null
+++ b/src/cuda/src/a_d_at.cu
@@ -0,0 +1,64 @@
+#include <stdio.h>
+
+
+__global__ void A_D_At_kernel(int n, double *A, double *D, double *R) {
+
+
+    int i, j;
+    int k;
+    int in, ij;
+    int kn;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    j = blockIdx.y * blockDim.y + threadIdx.y;
+
+    while(i < n) {
+
+        in = i * n;
+
+        while(j < n) {
+
+            ij = in + j;
+
+            R[ij] = 0.0;
+            k = 0;
+            while(k < n) {
+
+                kn = k * n;
+                R[ij] += D[k] * A[i + kn] * A[j + kn];
+
+                k ++;
+            } // k
+
+            j += blockDim.y * gridDim.y;
+        } // j
+
+        i += blockDim.x * gridDim.x;
+    } // i
+
+}
+
+
+
+
+
+extern "C" void A_D_At(int n, double *A, double *D, double *R) {
+
+
+    int sBlocks = 32;
+    int nBlocks = (n + sBlocks - 1) / sBlocks;
+
+    dim3 dimGrid(nBlocks, nBlocks, 1);
+    dim3 dimBlock(sBlocks, sBlocks, 1);
+
+    printf("lunching A_D_At_kernel with %dx%d blocks and %dx%d threads/block\n",
+        nBlocks, nBlocks, sBlocks, sBlocks);
+
+
+    A_D_At_kernel<<<dimGrid, dimBlock>>>(n, A, D, R);
+
+}
+
+
+
+
diff --git a/src/cuda/src/a_d_in_b.cu b/src/cuda/src/a_d_in_b.cu
new file mode 100644
index 0000000..10a7e89
--- /dev/null
+++ b/src/cuda/src/a_d_in_b.cu
@@ -0,0 +1,57 @@
+#include <stdio.h>
+
+
+__global__ void A_D_in_B_kernel(int n, double *A, double *D, double *B) {
+
+
+    int i, j;
+    int in, ji;
+
+    double tmp;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    j = blockIdx.y * blockDim.y + threadIdx.y;
+
+    while(i < n) {
+
+        in = i * n;
+
+        tmp = D[i];
+
+        while(j < n) {
+
+            ji = in + j;
+
+            B[ji] = A[ji] * tmp;
+
+            j += blockDim.y * gridDim.y;
+        } // j
+
+        i += blockDim.x * gridDim.x;
+    } // i
+
+}
+
+
+
+
+
+extern "C" void A_D_in_B(int n, double *A, double *D, double *B) {
+
+
+    int sBlocks = 32;
+    int nBlocks = (n + sBlocks - 1) / sBlocks;
+
+    dim3 dimGrid(nBlocks, nBlocks, 1);
+    dim3 dimBlock(sBlocks, sBlocks, 1);
+
+    printf("lunching A_D_in_B_kernel with %dx%d blocks and %dx%d threads/block\n",
+        nBlocks, nBlocks, sBlocks, sBlocks);
+
+
+    A_D_in_B_kernel<<<dimGrid, dimBlock>>>(n, A, D, B);
+
+}
+
+
+
diff --git a/src/cuda/src/a_d_inplace.cu b/src/cuda/src/a_d_inplace.cu
new file mode 100644
index 0000000..3aa233f
--- /dev/null
+++ b/src/cuda/src/a_d_inplace.cu
@@ -0,0 +1,57 @@
+#include <stdio.h>
+
+
+__global__ void A_D_inplace_kernel(int n, double *A, double *D) {
+
+
+    int i, j;
+    int in, ji;
+
+    double tmp;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    j = blockIdx.y * blockDim.y + threadIdx.y;
+
+    while(i < n) {
+
+        in = i * n;
+
+        tmp = D[i];
+
+        while(j < n) {
+
+            ji = in + j;
+
+            A[ji] = A[ji] * tmp;
+
+            j += blockDim.y * gridDim.y;
+        } // j
+
+        i += blockDim.x * gridDim.x;
+    } // i
+
+}
+
+
+
+
+
+extern "C" void A_D_inplace(int n, double *A, double *D) {
+
+
+    int sBlocks = 32;
+    int nBlocks = (n + sBlocks - 1) / sBlocks;
+
+    dim3 dimGrid(nBlocks, nBlocks, 1);
+    dim3 dimBlock(sBlocks, sBlocks, 1);
+
+    printf("lunching A_D_inplace_kernel with %dx%d blocks and %dx%d threads/block\n",
+        nBlocks, nBlocks, sBlocks, sBlocks);
+
+
+    A_D_inplace_kernel<<<dimGrid, dimBlock>>>(n, A, D);
+
+}
+
+
+
diff --git a/src/cuda/src/a_dinv_at.cu b/src/cuda/src/a_dinv_at.cu
new file mode 100644
index 0000000..880816b
--- /dev/null
+++ b/src/cuda/src/a_dinv_at.cu
@@ -0,0 +1,64 @@
+#include <stdio.h>
+
+
+__global__ void A_Dinv_At_kernel(int n, double *A, double *D, double *R) {
+
+
+    int i, j;
+    int k;
+    int in, ij;
+    int kn;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    j = blockIdx.y * blockDim.y + threadIdx.y;
+
+    while(i < n) {
+
+        in = i * n;
+
+        while(j < n) {
+
+            ij = in + j;
+
+            R[ij] = 0.0;
+            k = 0;
+            while(k < n) {
+
+                kn = k * n;
+                R[ij] += D[k] * A[i + kn] * A[j + kn] / (D[k] + 1e-12);
+
+                k ++;
+            } // k
+
+            j += blockDim.y * gridDim.y;
+        } // j
+
+        i += blockDim.x * gridDim.x;
+    } // i
+
+}
+
+
+
+
+
+extern "C" void A_Dinv_At(int n, double *A, double *D, double *R) {
+
+
+    int sBlocks = 32;
+    int nBlocks = (n + sBlocks - 1) / sBlocks;
+
+    dim3 dimGrid(nBlocks, nBlocks, 1);
+    dim3 dimBlock(sBlocks, sBlocks, 1);
+
+    printf("lunching A_Dinv_At_kernel with %dx%d blocks and %dx%d threads/block\n",
+        nBlocks, nBlocks, sBlocks, sBlocks);
+
+
+    A_Dinv_At_kernel<<<dimGrid, dimBlock>>>(n, A, D, R);
+
+}
+
+
+
+
diff --git a/src/cuda/src/a_dinv_in_b.cu b/src/cuda/src/a_dinv_in_b.cu
new file mode 100644
index 0000000..c73ecd2
--- /dev/null
+++ b/src/cuda/src/a_dinv_in_b.cu
@@ -0,0 +1,57 @@
+#include <stdio.h>
+
+
+__global__ void A_Dinv_in_B_kernel(int n, double *A, double *D, double *B) {
+
+
+    int i, j;
+    int in, ji;
+
+    double tmp;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    j = blockIdx.y * blockDim.y + threadIdx.y;
+
+    while(i < n) {
+
+        in = i * n;
+
+        tmp = 1.0 / D[i];
+
+        while(j < n) {
+
+            ji = in + j;
+
+            B[ji] = A[ji] * tmp;
+
+            j += blockDim.y * gridDim.y;
+        } // j
+
+        i += blockDim.x * gridDim.x;
+    } // i
+
+}
+
+
+
+
+
+extern "C" void A_Dinv_in_B(int n, double *A, double *D, double *B) {
+
+
+    int sBlocks = 32;
+    int nBlocks = (n + sBlocks - 1) / sBlocks;
+
+    dim3 dimGrid(nBlocks, nBlocks, 1);
+    dim3 dimBlock(sBlocks, sBlocks, 1);
+
+    printf("lunching A_Dinv_in_B_kernel with %dx%d blocks and %dx%d threads/block\n",
+        nBlocks, nBlocks, sBlocks, sBlocks);
+
+
+    A_Dinv_in_B_kernel<<<dimGrid, dimBlock>>>(n, A, D, B);
+
+}
+
+
+
diff --git a/src/cuda/src/a_dinv_inplace.cu b/src/cuda/src/a_dinv_inplace.cu
new file mode 100644
index 0000000..be2f567
--- /dev/null
+++ b/src/cuda/src/a_dinv_inplace.cu
@@ -0,0 +1,57 @@
+#include <stdio.h>
+
+
+__global__ void A_Dinv_inplace_kernel(int n, double *A, double *D) {
+
+
+    int i, j;
+    int in, ji;
+
+    double tmp;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    j = blockIdx.y * blockDim.y + threadIdx.y;
+
+    while(i < n) {
+
+        in = i * n;
+
+        tmp = 1.0 / (1e-12 + D[i]);
+
+        while(j < n) {
+
+            ji = in + j;
+
+            A[ji] = A[ji] * tmp;
+
+            j += blockDim.y * gridDim.y;
+        } // j
+
+        i += blockDim.x * gridDim.x;
+    } // i
+
+}
+
+
+
+
+
+extern "C" void A_Dinv_inplace(int n, double *A, double *D) {
+
+
+    int sBlocks = 32;
+    int nBlocks = (n + sBlocks - 1) / sBlocks;
+
+    dim3 dimGrid(nBlocks, nBlocks, 1);
+    dim3 dimBlock(sBlocks, sBlocks, 1);
+
+    printf("lunching A_Dinv_inplace_kernel with %dx%d blocks and %dx%d threads/block\n",
+        nBlocks, nBlocks, sBlocks, sBlocks);
+
+
+    A_Dinv_inplace_kernel<<<dimGrid, dimBlock>>>(n, A, D);
+
+}
+
+
+
diff --git a/src/cuda/src/a_minus_twob_in_b.cu b/src/cuda/src/a_minus_twob_in_b.cu
new file mode 100644
index 0000000..0acdb70
--- /dev/null
+++ b/src/cuda/src/a_minus_twob_in_b.cu
@@ -0,0 +1,52 @@
+#include <stdio.h>
+
+
+__global__ void A_minus_twoB_in_B_kernel(int n, double *A, double *B) {
+
+
+    int i, j;
+    int in, ji;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    j = blockIdx.y * blockDim.y + threadIdx.y;
+
+    while(i < n) {
+
+        in = i * n;
+
+        while(j < n) {
+
+            ji = in + j;
+
+            B[ji] = A[ji] - 2.0 * B[ji];
+
+            j += blockDim.y * gridDim.y;
+        } // j
+
+        i += blockDim.x * gridDim.x;
+    } // i
+
+}
+
+
+
+
+extern "C" void A_minus_twoB_in_B(int n, double *A, double *B) {
+
+
+    int sBlocks = 32;
+    int nBlocks = (n + sBlocks - 1) / sBlocks;
+
+    dim3 dimGrid(nBlocks, nBlocks, 1);
+    dim3 dimBlock(sBlocks, sBlocks, 1);
+
+    printf("lunching A_minus_twoB_in_B_kernel with %dx%d blocks and %dx%d threads/block\n",
+        nBlocks, nBlocks, sBlocks, sBlocks);
+
+
+    A_minus_twoB_in_B_kernel<<<dimGrid, dimBlock>>>(n, A, B);
+
+}
+
+
+
diff --git a/src/cuda/src/a_plus_b_in_a.cu b/src/cuda/src/a_plus_b_in_a.cu
new file mode 100644
index 0000000..d1868e3
--- /dev/null
+++ b/src/cuda/src/a_plus_b_in_a.cu
@@ -0,0 +1,52 @@
+#include <stdio.h>
+
+
+__global__ void A_plus_B_in_A_kernel(int n, double *A, double *B) {
+
+
+    int i, j;
+    int in, ji;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    j = blockIdx.y * blockDim.y + threadIdx.y;
+
+    while(i < n) {
+
+        in = i * n;
+
+        while(j < n) {
+
+            ji = in + j;
+
+            A[ji] = A[ji] + B[ji];
+
+            j += blockDim.y * gridDim.y;
+        } // j
+
+        i += blockDim.x * gridDim.x;
+    } // i
+
+}
+
+
+
+
+extern "C" void A_plus_B_in_A(int n, double *A, double *B) {
+
+
+    int sBlocks = 32;
+    int nBlocks = (n + sBlocks - 1) / sBlocks;
+
+    dim3 dimGrid(nBlocks, nBlocks, 1);
+    dim3 dimBlock(sBlocks, sBlocks, 1);
+
+    printf("lunching A_plus_B_in_A_kernel with %dx%d blocks and %dx%d threads/block\n",
+        nBlocks, nBlocks, sBlocks, sBlocks);
+
+
+    A_plus_B_in_A_kernel<<<dimGrid, dimBlock>>>(n, A, B);
+
+}
+
+
+
diff --git a/src/cuda/src/diag_dense_dsy_mat.cu b/src/cuda/src/diag_dense_dsy_mat.cu
new file mode 100644
index 0000000..4e3b0d3
--- /dev/null
+++ b/src/cuda/src/diag_dense_dsy_mat.cu
@@ -0,0 +1,42 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda_runtime.h>
+#include <cusolverDn.h>
+
+
+
+extern "C" void diag_dn_dsyevd(int n, int *info, double *W, double *A) {
+
+    cusolverDnHandle_t cusolverH = NULL;
+    cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; // Compute eigenvalues and eigenvectors
+    cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER; // Upper triangular part of the matrix is stored
+    //cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER; // Upper triangular part of the matrix is stored
+
+    int lwork = 0;
+    double *work = NULL;
+
+    //check_Cusolver_Errors(cusolverDnCreate(&cusolverH), "cusolverDnCreate", __FILE__, __LINE__);
+    cusolverDnCreate(&cusolverH);
+
+    // Query workspace size
+    //check_Cusolver_Errors(cusolverDnDsyevd_bufferSize(cusolverH, jobz, uplo, n, A, n, W, &lwork),
+    //    "cusolverDnDsyevd_bufferSize", __FILE__, __LINE__);
+    //check_Cuda_Errors(cudaMalloc((void**)&work, sizeof(double) * lwork),
+    //    "cudaMemcpy", __FILE__, __LINE__);
+    cusolverDnDsyevd_bufferSize(cusolverH, jobz, uplo, n, A, n, W, &lwork);
+    cudaMalloc((void**)&work, sizeof(double) * lwork);
+
+    // Compute eigenvalues and eigenvectors
+    //check_Cusolver_Errors(cusolverDnDsyevd(cusolverH, jobz, uplo, n, A, n, W, work, lwork, info),
+    //    "cusolverDnDsyevd", __FILE__, __LINE__);
+    cusolverDnDsyevd(cusolverH, jobz, uplo, n, A, n, W, work, lwork, info);
+
+    // Clean up
+    //check_Cuda_Errors(cudaFree(work), "cudaFree", __FILE__, __LINE__);
+    //check_Cusolver_Errors(cusolverDnDestroy(cusolverH), "cusolverDnDestroy", __FILE__, __LINE__);
+
+    cudaFree(work);
+    cusolverDnDestroy(cusolverH);
+
+}
+
diff --git a/src/cuda/src/elementwise_dsqrt.cu b/src/cuda/src/elementwise_dsqrt.cu
new file mode 100644
index 0000000..990be52
--- /dev/null
+++ b/src/cuda/src/elementwise_dsqrt.cu
@@ -0,0 +1,51 @@
+#include <stdio.h>
+#include <math.h>
+
+
+__global__ void elementwise_dsqrt_kernel(int nS, double *A, double *A_Sq) {
+
+
+    int i;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    while(i < nS) {
+
+        if(A[i] > 0.0) {
+
+            A_Sq[i] = sqrt(A[i]);
+
+        } else {
+
+            A_Sq[i] = sqrt(-A[i]);
+
+        }
+
+        i += blockDim.x * gridDim.x;
+    } // i
+
+}
+
+
+
+
+
+extern "C" void elementwise_dsqrt(int nS, double *A, double *A_Sq) {
+
+    int sBlocks = 32;
+    int nBlocks = (nS + sBlocks - 1) / sBlocks;
+
+    dim3 dimGrid(nBlocks, 1, 1);
+    dim3 dimBlock(sBlocks, 1, 1);
+
+    printf("lunching elementwise_dsqrt_kernel with %d blocks and %d threads/block\n",
+        nBlocks, sBlocks);
+
+
+    elementwise_dsqrt_kernel<<<dimGrid, dimBlock>>>(nS, A, A_Sq);
+
+}
+
+
+
+
diff --git a/src/cuda/src/elementwise_dsqrt_inplace.cu b/src/cuda/src/elementwise_dsqrt_inplace.cu
new file mode 100644
index 0000000..533aeaa
--- /dev/null
+++ b/src/cuda/src/elementwise_dsqrt_inplace.cu
@@ -0,0 +1,51 @@
+#include <stdio.h>
+#include <math.h>
+
+
+__global__ void elementwise_dsqrt_inplace_kernel(int n, double *A) {
+
+
+    int i;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    while(i < n) {
+
+        if(A[i] > 0.0) {
+
+            A[i] = sqrt(A[i]);
+
+        } else {
+
+            A[i] = sqrt(-A[i]);
+
+        }
+
+        i += blockDim.x * gridDim.x;
+    } // i
+
+}
+
+
+
+
+
+extern "C" void elementwise_dsqrt_inplace(int n, double *A) {
+
+    int sBlocks = 32;
+    int nBlocks = (n + sBlocks - 1) / sBlocks;
+
+    dim3 dimGrid(nBlocks, 1, 1);
+    dim3 dimBlock(sBlocks, 1, 1);
+
+    printf("lunching elementwise_dsqrt_inplace_kernel with %d blocks and %d threads/block\n",
+        nBlocks, sBlocks);
+
+
+    elementwise_dsqrt_inplace_kernel<<<dimGrid, dimBlock>>>(n, A);
+
+}
+
+
+
+
diff --git a/src/cuda/src/ph_drpa_a_sing.cu b/src/cuda/src/ph_drpa_a_sing.cu
new file mode 100644
index 0000000..5711d56
--- /dev/null
+++ b/src/cuda/src/ph_drpa_a_sing.cu
@@ -0,0 +1,95 @@
+#include <stdio.h>
+
+__global__ void ph_dRPA_A_sing_kernel(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *A) {
+
+
+    int i, j, a, b;
+    int aa, bb;
+
+    long long nVS;
+    long long nBas2, nBas3;
+    long long i_A0, i_A1, i_A2, i_A3;
+    long long i_I0, i_I1, i_I2, i_I3;
+
+    bool a_eq_b;
+
+    nVS = (long long) nV * (long long) nS;
+
+    nBas2 =  (long long) nBas *  (long long) nBas;
+    nBas3 = nBas2 *  (long long) nBas;
+
+    aa = blockIdx.x * blockDim.x + threadIdx.x;
+    bb = blockIdx.y * blockDim.y + threadIdx.y;
+
+    while(aa < nV) {
+        a = aa + nO;
+
+        i_A0 = (long long) aa * (long long) nS;
+        i_I0 = (long long) a * nBas2;
+
+        while(bb < nV) {
+            b = bb + nO;
+
+            a_eq_b = a == b;
+
+            i_A1 = i_A0 + (long long) bb;
+            i_I1 = i_I0 + (long long) b * (long long) nBas;
+
+            i = 0;
+            while(i < nO) {
+
+                i_A2 = i_A1 + (long long) i * nVS;
+                i_I2 = i_I1 + (long long) i;
+ 
+                j = 0;
+                while(j < nO) {
+
+                    i_A3 = i_A2 + (long long) j * (long long) nV;
+                    i_I3 = i_I2 + (long long) j * nBas3;
+
+                    A[i_A3] = 2.0 * ERI[i_I3];
+                    if(a_eq_b && (i==j)) {
+                        A[i_A3] += eps[a] - eps[i];
+                    }
+
+                    j ++;
+	        } // j
+
+                i ++;
+            } // i
+
+            bb += blockDim.y * gridDim.y;
+        } // bb
+
+        aa += blockDim.x * gridDim.x;
+    } // aa
+
+}
+
+
+
+
+
+extern "C" void ph_dRPA_A_sing(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *A) {
+
+
+    int sBlocks = 32;
+    int nBlocks = (nV + sBlocks - 1) / sBlocks;
+
+    dim3 dimGrid(nBlocks, nBlocks, 1);
+    dim3 dimBlock(sBlocks, sBlocks, 1);
+
+    //dim3 dimGrid(nBlocks, 1, 1);
+    //dim3 dimBlock(sBlocks, 1, 1);
+
+    printf("lunching ph_dRPA_A_sing_kernel with %dx%d blocks and %dx%d threads/block\n",
+        nBlocks, nBlocks, sBlocks, sBlocks);
+
+
+    ph_dRPA_A_sing_kernel<<<dimGrid, dimBlock>>>(nO, nV, nBas, nS, eps, ERI, A);
+
+}
+
+
+
+
diff --git a/src/cuda/src/ph_drpa_a_trip.cu b/src/cuda/src/ph_drpa_a_trip.cu
new file mode 100644
index 0000000..b28e7b8
--- /dev/null
+++ b/src/cuda/src/ph_drpa_a_trip.cu
@@ -0,0 +1,77 @@
+#include <stdio.h>
+
+__global__ void ph_dRPA_A_trip_kernel(int nO, int nV, int nBas, int nS, double *eps, double *A) {
+
+
+    int i, j, a, b;
+    int aa, bb;
+    int nVS;
+    int i_A0, i_A1, i_A2;
+
+    nVS = nV * nS;
+
+    aa = blockIdx.x * blockDim.x + threadIdx.x;
+    bb = blockIdx.y * blockDim.y + threadIdx.y;
+
+    while(aa < nV) {
+        a = aa + nO;
+
+        i_A0 = aa * nS;
+
+        while(bb < nV) {
+            b = bb + nO;
+
+            i_A1 = i_A0 + bb;
+
+            i = 0;
+            while(i < nO) {
+
+                i_A2 = i_A1 + i * nVS;
+ 
+                j = 0;
+                while(j < nO) {
+
+                    A[i_A2 + j * nV] = 0.0;
+                    if((a==b) && (i==j)) {
+                        A[i_A2 + j * nV] += eps[a] - eps[i];
+                    }
+
+                    j ++;
+	        } // j
+
+                i ++;
+            } // i
+
+            bb += blockDim.y * gridDim.y;
+        } // bb
+
+        aa += blockDim.x * gridDim.x;
+    } // aa
+
+}
+
+
+
+
+
+extern "C" void ph_dRPA_A_trip(int nO, int nV, int nBas, int nS, double *eps, double *A) {
+
+
+    int sBlocks = 32;
+    int nBlocks = (nV + sBlocks - 1) / sBlocks;
+
+    dim3 dimGrid(nBlocks, nBlocks, 1);
+    dim3 dimBlock(sBlocks, sBlocks, 1);
+
+
+    printf("lunching ph_dRPA_A_trip_kernel with %dx%d blocks and %dx%d threads/block\n",
+        nBlocks, nBlocks, sBlocks, sBlocks);
+
+
+    ph_dRPA_A_trip_kernel<<<dimGrid, dimBlock>>>(nO, nV, nBas, nS, eps, A);
+
+}
+
+
+
+
diff --git a/src/cuda/src/ph_drpa_amb_sing.cu b/src/cuda/src/ph_drpa_amb_sing.cu
new file mode 100644
index 0000000..d09642b
--- /dev/null
+++ b/src/cuda/src/ph_drpa_amb_sing.cu
@@ -0,0 +1,99 @@
+#include <stdio.h>
+
+
+__global__ void ph_dRPA_AmB_sing_kernel(int nO, int nV, int nBas, int nS, 
+                                        double *eps, double *ERI, double *AmB) {
+
+
+    int i, j, a, b;
+    int aa, bb;
+
+    long long i_A0, i_A1, i_A2, i_A3;
+    long long i_I0, i_I1, i_I2, i_I3;
+    long long i_J1, i_J2, i_J3;
+
+    long long nVS;
+    long long nBas2, nBas3;
+
+    bool a_eq_b;
+
+    nVS = (long long) nV * (long long) nS;
+
+    nBas2 = (long long) nBas * (long long) nBas;
+    nBas3 = nBas2 * (long long) nBas;
+
+    aa = blockIdx.x * blockDim.x + threadIdx.x;
+    bb = blockIdx.y * blockDim.y + threadIdx.y;
+
+    while(aa < nV) {
+        a = aa + nO;
+
+        i_A0 = (long long) aa * (long long) nS;
+        i_I0 = (long long) a * nBas2;
+
+        while(bb < nV) {
+            b = bb + nO;
+
+            a_eq_b = a == b;
+
+            i_A1 = i_A0 + (long long) bb;
+            i_I1 = i_I0 + (long long) b * (long long) nBas;
+            i_J1 = i_I0 + (long long) b * nBas3;
+
+            i = 0;
+            while(i < nO) {
+
+                i_A2 = i_A1 + (long long) i * nVS;
+                i_I2 = i_I1 + (long long) i;
+                i_J2 = i_J1 + (long long) i;
+ 
+                j = 0;
+                while(j < nO) {
+
+                    i_A3 = i_A2 + (long long) j * nV;
+                    i_I3 = i_I2 + (long long) j * nBas3;
+                    i_J3 = i_J2 + (long long) j * (long long) nBas;
+
+                    AmB[i_A3] = 2.0 * (ERI[i_I3] - ERI[i_J3]);
+                    if(a_eq_b && (i==j)) {
+                        AmB[i_A3] += eps[a] - eps[i];
+                    }
+
+                    j ++;
+	        } // j
+
+                i ++;
+            } // i
+
+            bb += blockDim.y * gridDim.y;
+        } // bb
+
+        aa += blockDim.x * gridDim.x;
+    } // aa
+
+}
+
+
+
+
+
+extern "C" void ph_dRPA_AmB_sing(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *AmB) {
+
+
+    int sBlocks = 32;
+    int nBlocks = (nV + sBlocks - 1) / sBlocks;
+
+    dim3 dimGrid(nBlocks, nBlocks, 1);
+    dim3 dimBlock(sBlocks, sBlocks, 1);
+
+    printf("lunching ph_dRPA_AmB_sing_kernel with %dx%d blocks and %dx%d threads/block\n",
+        nBlocks, nBlocks, sBlocks, sBlocks);
+
+
+    ph_dRPA_AmB_sing_kernel<<<dimGrid, dimBlock>>>(nO, nV, nBas, nS, eps, ERI, AmB);
+
+}
+
+
+
+
diff --git a/src/cuda/src/ph_drpa_apb_sing.cu b/src/cuda/src/ph_drpa_apb_sing.cu
new file mode 100644
index 0000000..d317fb4
--- /dev/null
+++ b/src/cuda/src/ph_drpa_apb_sing.cu
@@ -0,0 +1,102 @@
+#include <stdio.h>
+
+
+__global__ void ph_dRPA_ApB_sing_kernel(int nO, int nV, int nBas, int nS, 
+                                        double *eps, double *ERI, double *ApB) {
+
+
+    long i, j, a, b;
+    long aa, bb;
+
+    int i_A0, i_A1, i_A2, i_A3;
+    int i_I0, i_I1, i_I2;
+    int i_J1, i_J2;
+
+    int nVS;
+    int nBas2;
+
+    long long i_I3, i_J3;
+    long long nBas3;
+
+    bool a_eq_b;
+
+    nVS = nV * nS;
+
+    nBas2 = nBas * nBas;
+    nBas3 = (long long) nBas2 * (long long) nBas;
+
+    aa = blockIdx.x * blockDim.x + threadIdx.x;
+    bb = blockIdx.y * blockDim.y + threadIdx.y;
+
+    while(aa < nV) {
+        a = aa + nO;
+
+        i_A0 = aa * nS;
+        i_I0 = a * nBas2;
+
+        while(bb < nV) {
+            b = bb + nO;
+
+            a_eq_b = a == b;
+
+            i_A1 = i_A0 + bb;
+            i_I1 = i_I0 + b * nBas;
+            i_J1 = a + b * nBas;
+
+            i = 0;
+            while(i < nO) {
+
+                i_A2 = i_A1 + i * nVS;
+                i_I2 = i_I1 + i;
+                i_J2 = i_J1 + i * nBas2;
+ 
+                j = 0;
+                while(j < nO) {
+
+                    i_A3 = i_A2 + j * nV;
+                    i_I3 = i_I2 + (long long) j * nBas3;
+                    i_J3 = i_J2 + (long long) j * nBas3;
+
+                    ApB[i_A3] = 2.0 * (ERI[i_I3] + ERI[i_J3]);
+                    if(a_eq_b && (i==j)) {
+                        ApB[i_A3] += eps[a] - eps[i];
+                    }
+
+                    j ++;
+	        } // j
+
+                i ++;
+            } // i
+
+            bb += blockDim.y * gridDim.y;
+        } // bb
+
+        aa += blockDim.x * gridDim.x;
+    } // aa
+
+}
+
+
+
+
+
+extern "C" void ph_dRPA_ApB_sing(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *ApB) {
+
+
+    int sBlocks = 32;
+    int nBlocks = (nV + sBlocks - 1) / sBlocks;
+
+    dim3 dimGrid(nBlocks, nBlocks, 1);
+    dim3 dimBlock(sBlocks, sBlocks, 1);
+
+    printf("lunching ph_dRPA_ApB_sing_kernel with %dx%d blocks and %dx%d threads/block\n",
+        nBlocks, nBlocks, sBlocks, sBlocks);
+
+
+    ph_dRPA_ApB_sing_kernel<<<dimGrid, dimBlock>>>(nO, nV, nBas, nS, eps, ERI, ApB);
+
+}
+
+
+
+
diff --git a/src/cuda/src/ph_drpa_b_sing.cu b/src/cuda/src/ph_drpa_b_sing.cu
new file mode 100644
index 0000000..b8ec06d
--- /dev/null
+++ b/src/cuda/src/ph_drpa_b_sing.cu
@@ -0,0 +1,87 @@
+#include <stdio.h>
+
+__global__ void ph_dRPA_B_sing_kernel(int nO, int nV, int nBas, int nS, double *ERI, double *B) {
+
+
+    int i, j, a, b;
+    int aa, bb;
+
+    long long nVS;
+    long long nBas2, nBas3;
+    long long i_B0, i_B1, i_B2, i_B3;
+    long long i_I0, i_I1, i_I2, i_I3;
+
+
+    nVS = (long long) nV * (long long) nS;
+
+    nBas2 = (long long) nBas * (long long) nBas;
+    nBas3 = nBas2 * (long long) nBas;
+
+    aa = blockIdx.x * blockDim.x + threadIdx.x;
+    bb = blockIdx.y * blockDim.y + threadIdx.y;
+
+    while(aa < nV) {
+        a = aa + nO;
+
+        i_B0 = (long long) aa * (long long) nS;
+        i_I0 = (long long) a * nBas2;
+
+        while(bb < nV) {
+            b = bb + nO;
+
+            i_B1 = i_B0 + (long long) bb;
+            i_I1 = i_I0 + (long long) b * nBas3;
+
+            i = 0;
+            while(i < nO) {
+
+                i_B2 = i_B1 + (long long) i * nVS;
+                i_I2 = i_I1 + (long long) i;
+ 
+                j = 0;
+                while(j < nO) {
+
+                    i_B3 = i_B2 + (long long) j * (long long) nV;
+                    i_I3 = i_I2 + (long long) j * (long long) nBas;
+
+                    B[i_B3] = 2.0 * ERI[i_I3];
+
+                    j ++;
+	        } // j
+
+                i ++;
+            } // i
+
+            bb += blockDim.y * gridDim.y;
+        } // bb
+
+        aa += blockDim.x * gridDim.x;
+    } // aa
+
+}
+
+
+
+
+
+extern "C" void ph_dRPA_B_sing(int nO, int nV, int nBas, int nS, double *ERI, double *B) {
+
+
+    int sBlocks = 32;
+    int nBlocks = (nV + sBlocks - 1) / sBlocks;
+
+    dim3 dimGrid(nBlocks, nBlocks, 1);
+    dim3 dimBlock(sBlocks, sBlocks, 1);
+
+
+    printf("lunching ph_dRPA_B_sing_kernel with %dx%d blocks and %dx%d threads/block\n",
+        nBlocks, nBlocks, sBlocks, sBlocks);
+
+
+    ph_dRPA_B_sing_kernel<<<dimGrid, dimBlock>>>(nO, nV, nBas, nS, ERI, B);
+
+}
+
+
+
+
diff --git a/src/cuda/src/ph_drpa_b_trip.cu b/src/cuda/src/ph_drpa_b_trip.cu
new file mode 100644
index 0000000..6122164
--- /dev/null
+++ b/src/cuda/src/ph_drpa_b_trip.cu
@@ -0,0 +1,72 @@
+#include <stdio.h>
+
+__global__ void ph_dRPA_B_trip_kernel(int nO, int nV, int nBas, int nS, double *B) {
+
+
+    int i, j;
+    int aa, bb;
+    int nVS;
+    int i_B0, i_B1, i_B2;
+
+    nVS = nV * nS;
+
+    aa = blockIdx.x * blockDim.x + threadIdx.x;
+    bb = blockIdx.y * blockDim.y + threadIdx.y;
+
+    while(aa < nV) {
+
+        i_B0 = aa * nS;
+
+        while(bb < nV) {
+
+            i_B1 = i_B0 + bb;
+
+            i = 0;
+            while(i < nO) {
+
+                i_B2 = i_B1 + i * nVS;
+ 
+                j = 0;
+                while(j < nO) {
+
+                    B[i_B2 + j * nV] = 0.0;
+
+                    j ++;
+	        } // j
+
+                i ++;
+            } // i
+
+            bb += blockDim.y * gridDim.y;
+        } // bb
+
+        aa += blockDim.x * gridDim.x;
+    } // aa
+
+}
+
+
+
+
+
+extern "C" void ph_dRPA_B_trip(int nO, int nV, int nBas, int nS, double *B) {
+
+
+    int sBlocks = 32;
+    int nBlocks = (nV + sBlocks - 1) / sBlocks;
+
+    dim3 dimGrid(nBlocks, nBlocks, 1);
+    dim3 dimBlock(sBlocks, sBlocks, 1);
+
+
+    printf("lunching ph_dRPA_B_trip_kernel with %dx%d blocks and %dx%d threads/block\n",
+        nBlocks, nBlocks, sBlocks, sBlocks);
+
+
+    ph_dRPA_B_trip_kernel<<<dimGrid, dimBlock>>>(nO, nV, nBas, nS, B);
+
+}
+
+
+
+
diff --git a/src/cuda/src/ph_drpa_sing.c b/src/cuda/src/ph_drpa_sing.c
new file mode 100644
index 0000000..85329e0
--- /dev/null
+++ b/src/cuda/src/ph_drpa_sing.c
@@ -0,0 +1,307 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cuda_runtime_api.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <cublas_v2.h>
+#include <cusolverDn.h>
+
+#include "utils.h"
+#include "ph_rpa.h"
+#include "my_linalg.h"
+
+
+
+
+
+void ph_drpa_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI,
+                  double *h_Omega, double *h_XpY, double *h_XmY) {
+
+
+    double *d_eps = NULL;
+    double *d_ERI = NULL;
+
+    int nV = nBas - nO;
+
+    long long nBas_long = (long long) nBas;
+    long long nBas4 = nBas_long * nBas_long * nBas_long * nBas_long;
+
+    long long nS_long = (long long) nS;
+    long long nS2 = nS_long * nS_long;
+
+
+    cublasHandle_t handle;
+    const double alpha=1.0, beta=0.0;
+
+
+    float elapsedTime;
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
+
+
+    check_Cuda_Errors(cudaMalloc((void**)&d_eps, nBas * sizeof(double)),
+        "cudaMalloc", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaMalloc((void**)&d_ERI, nBas4 * sizeof(double)),
+        "cudaMalloc", __FILE__, __LINE__);
+
+    cudaEventRecord(start, 0);
+    check_Cuda_Errors(cudaMemcpy(d_eps, h_eps, nBas * sizeof(double), cudaMemcpyHostToDevice), 
+        "cudaMemcpy", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaMemcpy(d_ERI, h_ERI, nBas4 * sizeof(double), cudaMemcpyHostToDevice), 
+        "cudaMemcpy", __FILE__, __LINE__);
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&elapsedTime, start, stop);
+    printf("Time elapsed on CPU->GPU transfer = %f msec\n", elapsedTime);
+
+    // construct A+B & A-B
+    double *d_ApB = NULL;
+    double *d_AmB = NULL;
+    check_Cuda_Errors(cudaMalloc((void**)&d_ApB, nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaMalloc((void**)&d_AmB, nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__);
+
+    cudaEventRecord(start, 0);
+    ph_dRPA_ApB_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_ApB);
+    ph_dRPA_AmB_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_AmB);
+    //ph_dRPA_A_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_ApB);
+    //ph_dRPA_B_sing(nO, nV, nBas, nS, d_ERI, d_AmB);
+    //check_Cuda_Errors(cudaDeviceSynchronize(), "cudaDeviceSynchronize", __FILE__, __LINE__);
+    //A_plus_B_in_A(nS, d_ApB, d_AmB);
+    //check_Cuda_Errors(cudaDeviceSynchronize(), "cudaDeviceSynchronize", __FILE__, __LINE__);
+    //A_minus_twoB_in_B(nS, d_ApB, d_AmB);
+    check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__);
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&elapsedTime, start, stop);
+    printf("Time elapsed on AmB & ApB = %f msec\n", elapsedTime);
+
+
+    // free memory
+    check_Cuda_Errors(cudaDeviceSynchronize(), "cudaDeviceSynchronize", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaFree(d_eps), "cudaFree", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaFree(d_ERI), "cudaFree", __FILE__, __LINE__);
+
+
+    // diagonalize A-B
+    int *d_info1 = NULL;
+    check_Cuda_Errors(cudaMalloc((void**)&d_info1, sizeof(int)), "cudaMalloc", __FILE__, __LINE__);
+
+    double *d_Omega = NULL;
+    check_Cuda_Errors(cudaMalloc((void**)&d_Omega, nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__);
+
+    cudaEventRecord(start, 0);
+    diag_dn_dsyevd(nS, d_info1, d_Omega, d_AmB);
+    check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__);
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&elapsedTime, start, stop);
+    printf("Time elapsed on diag AmB = %f msec\n", elapsedTime);
+
+
+
+    // d_Omega <-- d_Omega^{0.5}
+    // TODO: nb of <= 0 elements
+    cudaEventRecord(start, 0);
+    elementwise_dsqrt_inplace(nS, d_Omega);
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&elapsedTime, start, stop);
+    printf("Time elapsed on elementwise_dsqrt_inplace %f msec\n", elapsedTime);
+
+
+    // d_AmBSq = d_AmB (d_Omega)^{+0.5} (d_AmB)^T
+    // d_AmBSqInv = d_AmB (d_Omega)^{-0.5} (d_AmB)^T
+    double *d_AmBSq = NULL;
+    double *d_AmBSqInv = NULL;
+    double *d_tmp1 = NULL;
+    double *d_tmp2 = NULL;
+    check_Cuda_Errors(cudaMalloc((void**)&d_AmBSq,    nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaMalloc((void**)&d_AmBSqInv, nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaMalloc((void**)&d_tmp1,     nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaMalloc((void**)&d_tmp2,     nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__);
+
+    check_Cublas_Errors(cublasCreate(&handle), "cublasCreate", __FILE__, __LINE__);
+
+    cudaEventRecord(start, 0);
+    // naive way
+    //A_D_At(nS, d_AmB, d_Omega, d_AmBSq);
+    //A_Dinv_At(nS, d_AmB, d_Omega, d_AmBSqInv);
+
+    A_D_in_B(nS, d_AmB, d_Omega, d_tmp1);
+    A_Dinv_in_B(nS, d_AmB, d_Omega, d_tmp2);
+
+    check_Cuda_Errors(cudaDeviceSynchronize(), "cudaDeviceSynchronize", __FILE__, __LINE__);
+
+    check_Cublas_Errors(cublasDgemm(handle,
+                                    CUBLAS_OP_N, CUBLAS_OP_T,
+                                    nS, nS, nS,
+                                    &alpha,
+                                    d_tmp1, nS,
+                                    d_AmB, nS,
+                                    &beta,
+                                    d_AmBSq, nS),
+        "cublasDgemm", __FILE__, __LINE__);
+
+    check_Cublas_Errors(cublasDgemm(handle,
+                                    CUBLAS_OP_N, CUBLAS_OP_T,
+                                    nS, nS, nS,
+                                    &alpha,
+                                    d_tmp2, nS,
+                                    d_AmB, nS,
+                                    &beta,
+                                    d_AmBSqInv, nS),
+        "cublasDgemm", __FILE__, __LINE__);
+
+    check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__);
+
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&elapsedTime, start, stop);
+    printf("Time elapsed on d_AmBSq & d_AmBSqInv = %f msec\n", elapsedTime);
+
+    check_Cuda_Errors(cudaDeviceSynchronize(), "cudaDeviceSynchronize", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaFree(d_tmp1), "cudaFree", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaFree(d_tmp2), "cudaFree", __FILE__, __LINE__);
+
+
+    // Dgemm
+    cudaEventRecord(start, 0);
+
+    // X + Y
+    check_Cublas_Errors(cublasDgemm(handle,
+                                    CUBLAS_OP_N, CUBLAS_OP_N,
+                                    nS, nS, nS,
+                                    &alpha,
+                                    d_ApB, nS,
+                                    d_AmBSq, nS,
+                                    &beta,
+                                    d_AmB, nS),
+        "cublasDgemm", __FILE__, __LINE__);
+
+    check_Cuda_Errors(cudaDeviceSynchronize(), "cudaDeviceSynchronize", __FILE__, __LINE__);
+
+    // X - Y
+    check_Cublas_Errors(cublasDgemm(handle,
+                                    CUBLAS_OP_N, CUBLAS_OP_N,
+                                    nS, nS, nS,
+                                    &alpha,
+                                    d_AmBSq, nS,
+                                    d_AmB, nS,
+                                    &beta,
+                                    d_ApB, nS),
+        "cublasDgemm", __FILE__, __LINE__);
+
+    check_Cublas_Errors(cublasDestroy(handle), "cublasDestroy", __FILE__, __LINE__);
+
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&elapsedTime, start, stop);
+    printf("Time elapsed on cublasDgemm = %f msec\n", elapsedTime);
+
+
+
+
+    // diagonalize
+    int *d_info2 = NULL;
+    check_Cuda_Errors(cudaMalloc((void**)&d_info2, sizeof(int)), "cudaMalloc", __FILE__, __LINE__);
+    cudaEventRecord(start, 0);
+    diag_dn_dsyevd(nS, d_info2, d_Omega, d_ApB);
+    check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__);
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&elapsedTime, start, stop);
+    printf("Time elapsed on diag ApB = %f msec\n", elapsedTime);
+
+
+
+
+    // d_Omega <-- d_Omega^{0.5}
+    // TODO: nb of <= 0 elements
+    cudaEventRecord(start, 0);
+    elementwise_dsqrt_inplace(nS, d_Omega);
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&elapsedTime, start, stop);
+    printf("Time elapsed on elementwise_dsqrt_inplace %f msec\n", elapsedTime);
+
+
+
+
+
+    // Dgemm
+    cudaEventRecord(start, 0);
+    check_Cublas_Errors(cublasCreate(&handle), "cublasCreate", __FILE__, __LINE__);
+
+    // X + Y
+    check_Cublas_Errors(cublasDgemm(handle,
+                                    CUBLAS_OP_T, CUBLAS_OP_N,
+                                    nS, nS, nS,
+                                    &alpha,
+                                    d_ApB, nS,
+                                    d_AmBSq, nS,
+                                    &beta,
+                                    d_AmB, nS),
+        "cublasDgemm", __FILE__, __LINE__);
+
+    check_Cuda_Errors(cudaDeviceSynchronize(), "cudaDeviceSynchronize", __FILE__, __LINE__);
+
+    // X - Y
+    check_Cublas_Errors(cublasDgemm(handle,
+                                    CUBLAS_OP_T, CUBLAS_OP_N,
+                                    nS, nS, nS,
+                                    &alpha,
+                                    d_ApB, nS,
+                                    d_AmBSqInv, nS,
+                                    &beta,
+                                    d_AmBSq, nS),
+        "cublasDgemm", __FILE__, __LINE__);
+
+    check_Cublas_Errors(cublasDestroy(handle), "cublasDestroy", __FILE__, __LINE__);
+
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&elapsedTime, start, stop);
+    printf("Time elapsed on cublasDgemm = %f msec\n", elapsedTime);
+
+
+
+    cudaEventRecord(start, 0);
+    elementwise_dsqrt(nS, d_Omega, d_AmBSq); // avoid addition memory allocation
+    A_Dinv_inplace(nS, d_AmB, d_AmBSq); // X + Y
+    A_D_inplace(nS, d_ApB, d_AmBSq); // X - Y
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&elapsedTime, start, stop);
+    printf("Time elapsed on final X+Y and X-Y trans = %f msec\n", elapsedTime);
+
+
+
+
+
+    // transfer data to CPU
+    cudaEventRecord(start, 0);
+    check_Cuda_Errors(cudaMemcpy(h_XpY, d_AmB, nS2 * sizeof(double), cudaMemcpyDeviceToHost), 
+        "cudaMemcpy", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaMemcpy(h_XmY, d_ApB, nS2 * sizeof(double), cudaMemcpyDeviceToHost), 
+        "cudaMemcpy", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaMemcpy(h_Omega, d_Omega, nS * sizeof(double), cudaMemcpyDeviceToHost), 
+        "cudaMemcpy", __FILE__, __LINE__);
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&elapsedTime, start, stop);
+    printf("Time elapsed on GPU -> CPU transfer = %f msec\n", elapsedTime);
+
+
+    check_Cuda_Errors(cudaFree(d_info1), "cudaFree", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaFree(d_info2), "cudaFree", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaFree(d_ApB), "cudaFree", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaFree(d_AmB), "cudaFree", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaFree(d_AmBSq), "cudaFree", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaFree(d_AmBSqInv), "cudaFree", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaFree(d_Omega), "cudaFree", __FILE__, __LINE__);
+
+
+}
+
diff --git a/src/cuda/src/ph_drpa_tda_sing.c b/src/cuda/src/ph_drpa_tda_sing.c
new file mode 100644
index 0000000..b98535c
--- /dev/null
+++ b/src/cuda/src/ph_drpa_tda_sing.c
@@ -0,0 +1,114 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cuda_runtime_api.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <cublas_v2.h>
+#include <cusolverDn.h>
+
+#include "utils.h"
+#include "ph_rpa.h"
+#include "my_linalg.h"
+
+
+
+/*
+ *
+ * Y = 0 ==> X+Y = X-Y = X
+ *
+*/
+void ph_drpa_tda_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI,
+                      double *h_Omega, double *h_X) {
+
+    double *d_eps = NULL;
+    double *d_ERI = NULL;
+
+    int nV = nBas - nO;
+ 
+    long long nS_long = (long long) nS;
+    long long nS2 = nS_long * nS_long;
+
+    long long nBas_long = (long long) nBas;
+    long long nBas4 = nBas_long * nBas_long * nBas_long * nBas_long;
+
+    float elapsedTime;
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
+    //printf("nO = %d, nBas = %d, nS = %d\n", nO, nBas, nS);
+    //printf("nBas4 = %lld\n", nBas4);
+
+
+    check_Cuda_Errors(cudaMalloc((void**)&d_eps, nBas * sizeof(double)),
+        "cudaMalloc", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaMalloc((void**)&d_ERI, nBas4 * sizeof(double)),
+        "cudaMalloc", __FILE__, __LINE__);
+
+    cudaEventRecord(start, 0);
+    check_Cuda_Errors(cudaMemcpy(d_eps, h_eps, nBas * sizeof(double), cudaMemcpyHostToDevice), 
+        "cudaMemcpy", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaMemcpy(d_ERI, h_ERI, nBas4 * sizeof(double), cudaMemcpyHostToDevice), 
+        "cudaMemcpy", __FILE__, __LINE__);
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&elapsedTime, start, stop);
+    printf("Time elapsed on CPU->GPU transfer = %f msec\n", elapsedTime);
+
+    // construct A
+    double *d_A = NULL;
+    check_Cuda_Errors(cudaMalloc((void**)&d_A, nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__);
+
+    cudaEventRecord(start, 0);
+    ph_dRPA_A_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_A);
+    check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__);
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&elapsedTime, start, stop);
+    printf("Time elapsed on A kernel = %f msec\n", elapsedTime);
+
+
+    check_Cuda_Errors(cudaFree(d_eps), "cudaFree", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaFree(d_ERI), "cudaFree", __FILE__, __LINE__);
+
+
+    // diagonalize A
+    int *d_info = NULL;
+    double *d_Omega = NULL;
+    check_Cuda_Errors(cudaMalloc((void**)&d_info, sizeof(int)),
+        "cudaMalloc", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaMalloc((void**)&d_Omega, nS * sizeof(double)),
+        "cudaMalloc", __FILE__, __LINE__);
+
+    cudaEventRecord(start, 0);
+    diag_dn_dsyevd(nS, d_info, d_Omega, d_A);
+    check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__);
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&elapsedTime, start, stop);
+    printf("Time elapsed on diagonalization  = %f msec\n", elapsedTime);
+
+    //int info_gpu = 0;
+    cudaEventRecord(start, 0);
+    //check_Cuda_Errors(cudaMemcpy(&info_gpu, d_info, sizeof(int), cudaMemcpyDeviceToHost),
+    //    "cudaMemcpy", __FILE__, __LINE__);
+    //if (info_gpu != 0) {
+    //    printf("Error: diag_dn_dsyevd returned error code %d\n", info_gpu);
+    //    exit(EXIT_FAILURE);
+    //}
+    check_Cuda_Errors(cudaMemcpy(h_X, d_A, nS2 * sizeof(double), cudaMemcpyDeviceToHost), 
+        "cudaMemcpy", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaMemcpy(h_Omega, d_Omega, nS * sizeof(double), cudaMemcpyDeviceToHost), 
+        "cudaMemcpy", __FILE__, __LINE__);
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&elapsedTime, start, stop);
+    printf("Time elapsed on GPU -> CPU transfer = %f msec\n", elapsedTime);
+
+    check_Cuda_Errors(cudaFree(d_info), "cudaFree", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaFree(d_A), "cudaFree", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaFree(d_Omega), "cudaFree", __FILE__, __LINE__);
+
+
+}
+
diff --git a/src/cuda/src/utils.cu b/src/cuda/src/utils.cu
new file mode 100644
index 0000000..673b21e
--- /dev/null
+++ b/src/cuda/src/utils.cu
@@ -0,0 +1,125 @@
+#include <cuda_runtime.h>
+#include <cuda.h>
+#include <stdio.h>
+#include <cublas_v2.h>
+#include <cstring>
+#include <cusolverDn.h>
+
+
+
+extern "C" void check_Cuda_Errors(cudaError_t err, const char* msg, const char* file, int line) {
+    if (err != cudaSuccess) {
+        printf("CUDA Error in %s at line %d\n", file, line);
+        printf("%s - %s\n", msg, cudaGetErrorString(err));
+        exit(0);
+    }
+}
+
+
+const char* cublas_Get_Error_String(cublasStatus_t status) {
+    switch (status) {
+        case CUBLAS_STATUS_SUCCESS:
+            return "CUBLAS_STATUS_SUCCESS";
+        case CUBLAS_STATUS_NOT_INITIALIZED:
+            return "CUBLAS_STATUS_NOT_INITIALIZED";
+        case CUBLAS_STATUS_ALLOC_FAILED:
+            return "CUBLAS_STATUS_ALLOC_FAILED";
+        case CUBLAS_STATUS_INVALID_VALUE:
+            return "CUBLAS_STATUS_INVALID_VALUE";
+        case CUBLAS_STATUS_ARCH_MISMATCH:
+            return "CUBLAS_STATUS_ARCH_MISMATCH";
+        case CUBLAS_STATUS_MAPPING_ERROR:
+            return "CUBLAS_STATUS_MAPPING_ERROR";
+        case CUBLAS_STATUS_EXECUTION_FAILED:
+            return "CUBLAS_STATUS_EXECUTION_FAILED";
+        case CUBLAS_STATUS_INTERNAL_ERROR:
+            return "CUBLAS_STATUS_INTERNAL_ERROR";
+        case CUBLAS_STATUS_NOT_SUPPORTED:
+            return "CUBLAS_STATUS_NOT_SUPPORTED";
+        case CUBLAS_STATUS_LICENSE_ERROR:
+            return "CUBLAS_STATUS_LICENSE_ERROR";
+    }
+    return "UNKNOWN CUBLAS ERROR";
+}
+
+extern "C" void check_Cublas_Errors(cublasStatus_t status, const char* msg, const char* file, int line) {
+
+    const char* err = cublas_Get_Error_String(status);
+
+    if (strcmp(err, "CUBLAS_STATUS_SUCCESS") != 0) {
+        printf("CUBLAS Error in %s at line %d\n", file, line);
+        printf("%s - %s\n", msg, err);
+        exit(0);
+    }
+}
+
+
+
+const char* cusolver_Get_Error_String(cusolverStatus_t status) {
+    switch (status) {
+        case CUSOLVER_STATUS_SUCCESS:
+            return "CUSOLVER_STATUS_SUCCESS";
+        case CUSOLVER_STATUS_NOT_INITIALIZED:
+            return "CUSOLVER_STATUS_NOT_INITIALIZED";
+        case CUSOLVER_STATUS_ALLOC_FAILED:
+            return "CUSOLVER_STATUS_ALLOC_FAILED";
+        case CUSOLVER_STATUS_INVALID_VALUE:
+            return "CUSOLVER_STATUS_INVALID_VALUE";
+        case CUSOLVER_STATUS_ARCH_MISMATCH:
+            return "CUSOLVER_STATUS_ARCH_MISMATCH";
+        case CUSOLVER_STATUS_MAPPING_ERROR:
+            return "CUSOLVER_STATUS_MAPPING_ERROR";
+        case CUSOLVER_STATUS_EXECUTION_FAILED:
+            return "CUSOLVER_STATUS_EXECUTION_FAILED";
+        case CUSOLVER_STATUS_INTERNAL_ERROR:
+            return "CUSOLVER_STATUS_INTERNAL_ERROR";
+        case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+            return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+        case CUSOLVER_STATUS_NOT_SUPPORTED:
+            return "CUSOLVER_STATUS_NOT_SUPPORTED";
+        case CUSOLVER_STATUS_ZERO_PIVOT:
+            return "CUSOLVER_STATUS_ZERO_PIVOT";
+        case CUSOLVER_STATUS_INVALID_LICENSE:
+            return "CUSOLVER_STATUS_INVALID_LICENSE";
+        case CUSOLVER_STATUS_IRS_PARAMS_NOT_INITIALIZED:
+            return "CUSOLVER_STATUS_IRS_PARAMS_NOT_INITIALIZED";
+        case CUSOLVER_STATUS_IRS_PARAMS_INVALID:
+            return "CUSOLVER_STATUS_IRS_PARAMS_INVALID";
+        case CUSOLVER_STATUS_IRS_PARAMS_INVALID_PREC:
+            return "CUSOLVER_STATUS_IRS_PARAMS_INVALID_PREC";
+        case CUSOLVER_STATUS_IRS_PARAMS_INVALID_REFINE:
+            return "CUSOLVER_STATUS_IRS_PARAMS_INVALID_REFINE";
+        case CUSOLVER_STATUS_IRS_PARAMS_INVALID_MAXITER:
+            return "CUSOLVER_STATUS_IRS_PARAMS_INVALID_MAXITER";
+        case CUSOLVER_STATUS_IRS_INTERNAL_ERROR:
+            return "CUSOLVER_STATUS_IRS_INTERNAL_ERROR";
+        case CUSOLVER_STATUS_IRS_NOT_SUPPORTED:
+            return "CUSOLVER_STATUS_IRS_NOT_SUPPORTED";
+        case CUSOLVER_STATUS_IRS_OUT_OF_RANGE:
+            return "CUSOLVER_STATUS_IRS_OUT_OF_RANGE";
+        case CUSOLVER_STATUS_IRS_NRHS_NOT_SUPPORTED_FOR_REFINE_GMRES:
+            return "CUSOLVER_STATUS_IRS_NRHS_NOT_SUPPORTED_FOR_REFINE_GMRES";
+        case CUSOLVER_STATUS_IRS_INFOS_NOT_INITIALIZED:
+            return "CUSOLVER_STATUS_IRS_INFOS_NOT_INITIALIZED";
+        case CUSOLVER_STATUS_IRS_INFOS_NOT_DESTROYED:
+            return "CUSOLVER_STATUS_IRS_INFOS_NOT_DESTROYED";
+        case CUSOLVER_STATUS_IRS_MATRIX_SINGULAR:
+            return "CUSOLVER_STATUS_IRS_MATRIX_SINGULAR";
+        case CUSOLVER_STATUS_INVALID_WORKSPACE:
+            return "CUSOLVER_STATUS_INVALID_WORKSPACE";
+        default:
+            return "UNKNOWN CUSOLVER ERROR";
+    }
+}
+
+extern "C" void check_Cusolver_Errors(cusolverStatus_t status, const char* msg, const char* file, int line) {
+
+    const char* err = cusolver_Get_Error_String(status);
+
+    if (status != CUSOLVER_STATUS_SUCCESS) {
+        printf("CUSOLVER Error in %s at line %d\n", file, line);
+        printf("%s - %s\n", msg, err);
+        exit(EXIT_FAILURE);
+    }
+}
+
diff --git a/src/make_ninja.py b/src/make_ninja.py
index 09d3a72..349b12f 100755
--- a/src/make_ninja.py
+++ b/src/make_ninja.py
@@ -3,15 +3,15 @@ import os
 import sys
 import subprocess
 
+import argparse
+parser = argparse.ArgumentParser(description='This script generate the compilation files for QuAcK.')
+parser.add_argument('-d', '--debug', action='store_true', help='Debug mode. Default is false.')
+parser.add_argument('-u', '--use-gpu', action='store_true', help='Use GPU. Default is false.')
+args = parser.parse_args()
+DEBUG = args.debug
+USE_GPU = args.use_gpu
 
 
-DEBUG=False
-try:
-  DEBUG = sys.argv[1] == "debug"
-except:
-  pass
- 	
-
 if "QUACK_ROOT" not in os.environ:
    os.chdir("..")
    print("")
@@ -36,7 +36,7 @@ def check_compiler_exists(compiler):
 compile_gfortran_mac = """
 FC = gfortran
 AR = libtool -static -o
-FFLAGS = -I$IDIR -J$IDIR -fbacktrace -g -Wall -Wno-unused-variable -Wno-unused -Wno-unused-dummy-argument -Wuninitialized -Wmaybe-uninitialized -O3 -march=native
+FFLAGS = -I$IDIR -J$IDIR -cpp -fbacktrace -g -Wall -Wno-unused-variable -Wno-unused -Wno-unused-dummy-argument -Wuninitialized -Wmaybe-uninitialized -O3 -march=native
 CC = gcc
 CXX = g++
 LAPACK=-lblas -llapack
@@ -47,7 +47,7 @@ FIX_ORDER_OF_LIBS=
 compile_gfortran_mac_debug = """
 FC = gfortran
 AR = libtool -static -o
-FFLAGS = -I$IDIR -J$IDIR -fbacktrace -Wall -Wno-unused-variable -g -fcheck=all -Waliasing -Wampersand -Wconversion -Wsurprising -Wintrinsics-std -Wno-tabs -Wintrinsic-shadow -Wline-truncation -Wreal-q-constant
+FFLAGS = -I$IDIR -J$IDIR -cpp -fbacktrace -Wall -Wno-unused-variable -g -fcheck=all -Waliasing -Wampersand -Wconversion -Wsurprising -Wintrinsics-std -Wno-tabs -Wintrinsic-shadow -Wline-truncation -Wreal-q-constant
 CC = gcc
 CXX = g++
 LAPACK=-lblas -llapack
@@ -58,7 +58,7 @@ FIX_ORDER_OF_LIBS=
 compile_gfortran_linux_debug = """
 FC = gfortran
 AR = ar crs
-FFLAGS = -I$IDIR -J$IDIR -fbacktrace -Wall -g -fcheck=all -Waliasing -Wampersand -Wconversion -Wsurprising -Wintrinsics-std -Wno-tabs -Wintrinsic-shadow -Wline-truncation -Wreal-q-constant
+FFLAGS = -I$IDIR -J$IDIR -cpp -fbacktrace -Wall -g -fcheck=all -Waliasing -Wampersand -Wconversion -Wsurprising -Wintrinsics-std -Wno-tabs -Wintrinsic-shadow -Wline-truncation -Wreal-q-constant
 CC = gcc
 CXX = g++
 LAPACK=-lblas -llapack
@@ -81,9 +81,9 @@ elif sys.platform.lower() == "linux" or os.path.exists('/proc/version'):
     else:
         if check_compiler_exists('ifort'):
             compiler = """
-FC = ifort -qmkl=parallel -qopenmp
+FC = ifort -mkl=parallel -qopenmp
 AR = ar crs
-FFLAGS = -I$IDIR -module $IDIR -traceback -g -Ofast -xHost
+FFLAGS = -I$IDIR -module $IDIR -fpp -traceback -g -Ofast -xHost
 CC = icc
 CXX = icpc
 LAPACK=
@@ -94,10 +94,12 @@ FIX_ORDER_OF_LIBS=-Wl,--start-group
             compiler = """
 FC = gfortran -fopenmp
 AR = ar crs
-FFLAGS = -I$IDIR -J$IDIR -fbacktrace -g -Wall -Wno-unused-variable -Wno-unused -Wno-unused-dummy-argument -Wuninitialized -Wmaybe-uninitialized -O3 -march=native
+FFLAGS = -I$IDIR -J$IDIR -cpp -fbacktrace -g -Wall -Wno-unused-variable -Wno-unused -Wno-unused-dummy-argument -Wuninitialized -Wmaybe-uninitialized -O3 -march=native
 CC = gcc
 CXX = g++
 LAPACK=-lblas -llapack
+# uncomment for TURPAN
+#LAPACK=-larmpl_lp64_mp
 STDCXX=-lstdc++
 FIX_ORDER_OF_LIBS=-Wl,--start-group
 """
@@ -109,6 +111,23 @@ else:
     print("Unknown platform. Only Linux and Darwin are supported.")
     sys.exit(-1)
 
+if USE_GPU:
+    compiler_tmp = compiler.strip().split('\n')
+    compiler_tmp[0] += " -L{}/src/cuda/build -lcuquack -lcudart -lcublas -lcusolver".format(QUACK_ROOT)
+    compiler_exe = '\n'.join(compiler_tmp)
+
+    compiler_tmp = compiler.strip().split('\n')
+    compiler_tmp[2] += " -DUSE_GPU"
+    compiler_lib = '\n'.join(compiler_tmp)
+
+    compiler_main = compiler_lib
+else:
+    compiler_exe = compiler
+    compiler_lib = compiler
+    compiler_main = compiler
+
+
+
 header = """#
 # This file was automatically generated. Do not modify this file.
 # To change compiling options, make the modifications in 
@@ -163,7 +182,7 @@ rule git_clone
 
 build_in_lib_dir = "\n".join([
 	header,
-	compiler,
+	compiler_lib,
 	rule_fortran,
 	rule_build_lib,
 ])
@@ -171,20 +190,26 @@ build_in_lib_dir = "\n".join([
   
 build_in_exe_dir = "\n".join([
 	header,
-	compiler,
+	compiler_exe,
 	rule_fortran,
 	rule_build_exe,
 ])
 
 build_main = "\n".join([
 	header,
-        compiler,
+        compiler_main,
         rule_git_clone,
 ])
 
-exe_dirs = [ "QuAcK"]
+exe_dirs = ["QuAcK"]
 lib_dirs = list(filter(lambda x: os.path.isdir(x) and \
-                                 x not in exe_dirs, os.listdir(".")))
+                                x not in ["cuda"] and \
+                                x not in exe_dirs, os.listdir(".")))
+if(USE_GPU):
+    i = lib_dirs.index("GPU")
+    lib_dirs[0], lib_dirs[i] = lib_dirs[i], lib_dirs[0]
+else:
+    lib_dirs.remove("GPU")
 
 def create_ninja_in_libdir(directory):
     def write_rule(f, source_file, replace):
diff --git a/src/utils/read_integrals.f90 b/src/utils/read_1e_integrals.f90
similarity index 59%
rename from src/utils/read_integrals.f90
rename to src/utils/read_1e_integrals.f90
index 91be5fe..eecdc05 100644
--- a/src/utils/read_integrals.f90
+++ b/src/utils/read_1e_integrals.f90
@@ -1,6 +1,6 @@
-subroutine read_integrals(working_dir,nBas_AOs,S,T,V,Hc,G)
+subroutine read_1e_integrals(working_dir,nBas_AOs,S,T,V,Hc)
 
-! Read one- and two-electron integrals from files
+! Read one-electron integrals from files
 
   implicit none
   include 'parameters.h'
@@ -13,9 +13,8 @@ subroutine read_integrals(working_dir,nBas_AOs,S,T,V,Hc,G)
 ! Local variables
 
   logical                       :: debug
-  integer                       :: mu,nu,la,si
-  double precision              :: Ov,Kin,Nuc,ERI
-  double precision              :: lambda
+  integer                       :: mu,nu
+  double precision              :: Ov,Kin,Nuc
 
 ! Output variables
 
@@ -23,26 +22,21 @@ subroutine read_integrals(working_dir,nBas_AOs,S,T,V,Hc,G)
   double precision,intent(out)  :: T(nBas_AOs,nBas_AOs)
   double precision,intent(out)  :: V(nBas_AOs,nBas_AOs)
   double precision,intent(out)  :: Hc(nBas_AOs,nBas_AOs)
-  double precision,intent(out)  :: G(nBas_AOs,nBas_AOs,nBas_AOs,nBas_AOs)
 
-  integer                       :: status, ios
+  integer                       :: ios
   character(len=256)            :: file_path
 
 ! Open file with integrals
 
   debug = .false.
 
-  lambda = 1d0
-
-  print*, 'Scaling integrals by ',lambda
-
 
   ! ---
 
   ! Read overlap integrals
   file_path = trim(working_dir) // '/int/Ov.dat'
-  open(unit=8, file=file_path, status='old', action='read', iostat=status)
-    if(status /= 0) then
+  open(unit=8, file=file_path, status='old', action='read', iostat=ios)
+    if(ios /= 0) then
       print *, "Error opening file: ", file_path
       stop
     else
@@ -60,8 +54,8 @@ subroutine read_integrals(working_dir,nBas_AOs,S,T,V,Hc,G)
 
   ! Read kinetic integrals
   file_path = trim(working_dir) // '/int/Kin.dat'
-  open(unit=9, file=file_path, status='old', action='read', iostat=status)
-    if(status /= 0) then
+  open(unit=9, file=file_path, status='old', action='read', iostat=ios)
+    if(ios /= 0) then
       print *, "Error opening file: ", file_path
       stop
     else
@@ -79,8 +73,8 @@ subroutine read_integrals(working_dir,nBas_AOs,S,T,V,Hc,G)
 
   ! Read nuclear integrals
   file_path = trim(working_dir) // '/int/Nuc.dat'
-  open(unit=10, file=file_path, status='old', action='read', iostat=status)
-    if(status /= 0) then
+  open(unit=10, file=file_path, status='old', action='read', iostat=ios)
+    if(ios /= 0) then
       print *, "Error opening file: ", file_path
       stop
     else
@@ -99,37 +93,6 @@ subroutine read_integrals(working_dir,nBas_AOs,S,T,V,Hc,G)
   ! Define core Hamiltonian
   Hc(:,:) = T(:,:) + V(:,:)
 
-! Read 2e-integrals
-
-!  ! formatted file
-!  open(unit=11, file='int/ERI.dat')
-!  G(:,:,:,:) = 0d0
-!  do 
-!    read(11,*,end=11) mu, nu, la, si, ERI
-!    ERI = lambda*ERI
-!    G(mu,nu,la,si) = ERI    !   <12|34>
-!    G(la,nu,mu,si) = ERI    !   <32|14>
-!    G(mu,si,la,nu) = ERI    !   <14|32>
-!    G(la,si,mu,nu) = ERI    !   <34|12>
-!    G(si,mu,nu,la) = ERI    !   <41|23>
-!    G(nu,la,si,mu) = ERI    !   <23|41>
-!    G(nu,mu,si,la) = ERI    !   <21|43>
-!    G(si,la,nu,mu) = ERI    !   <43|21>
-!  end do
-!  11 close(unit=11)
-
-  ! binary file
-  file_path = trim(working_dir) // '/int/ERI.bin'
-  open(unit=11, file=file_path, status='old', action='read', form='unformatted', access='stream', iostat=status)
-    if(status /= 0) then
-      print *, "Error opening file: ", file_path
-      stop
-    else
-      read(11) G
-    endif
-  close(unit=11)
-
-
 
 ! Print results
   if(debug) then
@@ -148,15 +111,6 @@ subroutine read_integrals(working_dir,nBas_AOs,S,T,V,Hc,G)
     write(*,'(A28)') '----------------------'
     call matout(nBas_AOs,nBas_AOs,V)
     write(*,*)
-    write(*,'(A28)') '----------------------'
-    write(*,'(A28)') 'Electron repulsion integrals'
-    write(*,'(A28)') '----------------------'
-    do la=1,nBas_AOs
-      do si=1,nBas_AOs
-        call matout(nBas_AOs, nBas_AOs, G(1,1,la,si))
-      end do
-    end do
-    write(*,*)
   end if
 
 end subroutine 
diff --git a/src/utils/read_2e_integrals.f90 b/src/utils/read_2e_integrals.f90
new file mode 100644
index 0000000..94404a4
--- /dev/null
+++ b/src/utils/read_2e_integrals.f90
@@ -0,0 +1,110 @@
+subroutine read_2e_integrals(working_dir,nBas_AOs,G)
+
+! Read two-electron integrals from files
+
+  implicit none
+
+! Input variables
+
+  integer,intent(in)            :: nBas_AOs
+  character(len=256),intent(in) :: working_dir
+
+! Local variables
+
+  logical                       :: debug
+  integer                       :: mu,nu,la,si
+  double precision              :: ERI
+  double precision              :: lambda
+
+! Output variables
+
+  double precision,intent(out)  :: G(nBas_AOs,nBas_AOs,nBas_AOs,nBas_AOs)
+
+  integer                       :: ios
+  character(len=256)            :: file_path
+
+! Open file with integrals
+
+  debug = .false.
+
+  lambda = 1d0
+
+  print*, 'Scaling integrals by ',lambda
+
+
+! Read 2e-integrals
+
+!  ! formatted file
+!  open(unit=11, file='int/ERI.dat')
+!  G(:,:,:,:) = 0d0
+!  do 
+!    read(11,*,end=11) mu, nu, la, si, ERI
+!    ERI = lambda*ERI
+!    G(mu,nu,la,si) = ERI    !   <12|34>
+!    G(la,nu,mu,si) = ERI    !   <32|14>
+!    G(mu,si,la,nu) = ERI    !   <14|32>
+!    G(la,si,mu,nu) = ERI    !   <34|12>
+!    G(si,mu,nu,la) = ERI    !   <41|23>
+!    G(nu,la,si,mu) = ERI    !   <23|41>
+!    G(nu,mu,si,la) = ERI    !   <21|43>
+!    G(si,la,nu,mu) = ERI    !   <43|21>
+!  end do
+!  11 close(unit=11)
+
+  ! binary file
+  file_path = trim(working_dir) // '/int/ERI.bin'
+  open(unit=11, file=file_path, status='old', action='read', form='unformatted', access='stream', iostat=ios)
+    if(ios /= 0) then
+      print *, "Error opening file: ", file_path
+      stop
+    else
+      read(11) G
+    endif
+  close(unit=11)
+  G = G * lambda
+
+
+
+! Print results
+  if(debug) then
+    write(*,'(A28)') '----------------------'
+    write(*,'(A28)') 'Electron repulsion integrals'
+    write(*,'(A28)') '----------------------'
+    do la=1,nBas_AOs
+      do si=1,nBas_AOs
+        call matout(nBas_AOs, nBas_AOs, G(1,1,la,si))
+      end do
+    end do
+    write(*,*)
+  end if
+
+end subroutine 
+
+! ---
+
+subroutine read_2e_integrals_hpc(working_dir, ERI_size, ERI_chem)
+
+  implicit none
+
+  character(len=256), intent(in)  :: working_dir
+  integer*8,          intent(in)  :: ERI_size
+  double precision,   intent(out) :: ERI_chem(ERI_size)
+
+  integer                         :: ios
+  character(len=256)              :: file_path
+
+  file_path = trim(working_dir) // '/int/ERI_chem.bin'
+  open(unit=11, file=file_path, status='old', action='read', form='unformatted', access='stream', iostat=ios)
+    if(ios /= 0) then
+      print *, "Error opening file: ", file_path
+      stop
+    else
+      read(11) ERI_chem
+    endif
+  close(unit=11)
+
+  return
+end subroutine 
+
+! ---
+
diff --git a/src/utils/read_dipole_integrals.f90 b/src/utils/read_dipole_integrals.f90
index 15af2cb..e3ea558 100644
--- a/src/utils/read_dipole_integrals.f90
+++ b/src/utils/read_dipole_integrals.f90
@@ -20,7 +20,7 @@ subroutine read_dipole_integrals(working_dir,nBas,R)
 
   double precision,intent(out)  :: R(nBas,nBas,ncart)
 
-  integer                       :: status, ios
+  integer                       :: ios
   character(len=256)            :: file_path
 
 
@@ -29,9 +29,9 @@ subroutine read_dipole_integrals(working_dir,nBas,R)
   R(:,:,:) = 0d0
 
   file_path = trim(working_dir) // '/int/x.dat'
-  open(unit=21, file=file_path, status='old', action='read', iostat=status)
+  open(unit=21, file=file_path, status='old', action='read', iostat=ios)
 
-    if(status /= 0) then
+    if(ios /= 0) then
 
       print *, "Error opening file: ", file_path
       stop
@@ -39,7 +39,7 @@ subroutine read_dipole_integrals(working_dir,nBas,R)
     else
 
       do
-        read(21,*,iostat=ios) mu,nu,Dip
+        read(21, '(I7, I7, E25.17)', iostat=ios) mu, nu, Dip
         if(ios /= 0) exit
         R(mu,nu,1) = Dip
         R(nu,mu,1) = Dip
@@ -52,9 +52,9 @@ subroutine read_dipole_integrals(working_dir,nBas,R)
   ! ---
 
   file_path = trim(working_dir) // '/int/y.dat'
-  open(unit=22, file=file_path, status='old', action='read', iostat=status)
+  open(unit=22, file=file_path, status='old', action='read', iostat=ios)
 
-    if(status /= 0) then
+    if(ios /= 0) then
 
       print *, "Error opening file: ", file_path
       stop
@@ -62,7 +62,7 @@ subroutine read_dipole_integrals(working_dir,nBas,R)
     else
 
       do
-        read(22,*,iostat=ios) mu,nu,Dip
+        read(22, '(I7, I7, E25.17)', iostat=ios) mu, nu, Dip
         if(ios /= 0) exit
         R(mu,nu,2) = Dip
         R(nu,mu,2) = Dip
@@ -75,9 +75,9 @@ subroutine read_dipole_integrals(working_dir,nBas,R)
   ! ---
 
   file_path = trim(working_dir) // '/int/z.dat'
-  open(unit=23, file=file_path, status='old', action='read', iostat=status)
+  open(unit=23, file=file_path, status='old', action='read', iostat=ios)
 
-    if(status /= 0) then
+    if(ios /= 0) then
 
       print *, "Error opening file: ", file_path
       stop
@@ -85,7 +85,7 @@ subroutine read_dipole_integrals(working_dir,nBas,R)
     else
 
       do
-        read(23,*,iostat=ios) mu,nu,Dip
+        read(23, '(I7, I7, E25.17)', iostat=ios) mu, nu, Dip
         if(ios /= 0) exit
         R(mu,nu,3) = Dip
         R(nu,mu,3) = Dip
diff --git a/src/utils/utils.f90 b/src/utils/utils.f90
index 33cd99e..cc7998a 100644
--- a/src/utils/utils.f90
+++ b/src/utils/utils.f90
@@ -909,3 +909,35 @@ end
 
 ! ---
 
+integer*8 function Yoshimine_4ind(a, b, c, d)
+
+  implicit none
+  integer*8, intent(in) :: a, b, c, d
+  integer*8, external :: Yoshimine_2ind
+
+  Yoshimine_4ind = Yoshimine_2ind(Yoshimine_2ind(a, b), &
+                                  Yoshimine_2ind(c, d))
+
+  return
+end
+
+! ---
+
+integer*8 function Yoshimine_2ind(a, b)
+
+  implicit none
+  integer*8, intent(in) :: a, b
+
+  if(a > b) then
+    !Yoshimine_2ind = (a * (a - 1)) / 2 + b
+    Yoshimine_2ind = shiftr(a * (a - 1), 1) + b
+  else
+    !Yoshimine_2ind = (b * (b - 1)) / 2 + a
+    Yoshimine_2ind = shiftr(b * (b - 1), 1) + a
+  endif
+
+  return
+end
+
+! ---
+