diff --git a/.gitignore b/.gitignore
index 899b091..4a6ab19 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
+*.slurm
+*.mod
+*.so
 *.o
 *.
 __pycache__
diff --git a/quack.rc b/quack.rc
index 77ca7ff..bede54b 100644
--- a/quack.rc
+++ b/quack.rc
@@ -13,3 +13,5 @@ esac
 export QUACK_ROOT="$( cd $QUACK_ROOT; pwd -P )"
 
 export PATH="${QUACK_ROOT}/bin:$PATH"
+# Append the previous value only when it is non-empty: a trailing ':' is an empty entry, i.e. the current directory.
+export LD_LIBRARY_PATH="${QUACK_ROOT}/src/cuda/build${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
diff --git a/src/RPA/phRRPA.f90 b/src/RPA/phRRPA.f90
index 7a13d8f..94c6576 100644
--- a/src/RPA/phRRPA.f90
+++ b/src/RPA/phRRPA.f90
@@ -1,5 +1,7 @@
 subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF)
 
+  use cu_quack_module
+
 ! Perform a direct random phase approximation calculation
 
   implicit none
@@ -37,6 +39,8 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,
   double precision,allocatable  :: Om(:)
   double precision,allocatable  :: XpY(:,:)
   double precision,allocatable  :: XmY(:,:)
+  ! DEBUG
+  double precision, allocatable :: XpY_gpu(:,:), XmY_gpu(:,:), Om_gpu(:)
 
   double precision              :: EcRPA(nspin)
 
@@ -74,6 +78,13 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,
     call phLR_A(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,eHF,ERI,Aph)
     if(.not.TDA) call phLR_B(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,ERI,Bph)
 
+    ! DEBUG
+    allocate(Om_gpu(nS), XpY_gpu(nS,nS), XmY_gpu(nS,nS))
+    call ph_drpa(nO, nBas, eHF(1), ERI(1,1,1,1), Om_gpu(1), XpY_gpu(1,1), XmY_gpu(1,1))
+    print *, ' CPU:', Aph(1,1)
+    print *, ' GPU:', XpY_gpu(1,1)
+    stop
+
     call phLR(TDA,nS,Aph,Bph,EcRPA(ispin),Om,XpY,XmY)
     call print_excitation_energies('phRPA@RHF','singlet',nS,Om)
     call phLR_transition_vectors(.true.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY)
diff --git a/src/cuda/src/ph_drpa.c b/src/cuda/src/ph_drpa.c
index aa8d357..61afcbe 100644
--- a/src/cuda/src/ph_drpa.c
+++ b/src/cuda/src/ph_drpa.c
@@ -7,7 +7,8 @@
 
 #include "ph_drpa.h"
 
-int ph_drpa(int nO, int nBas, double *h_eps, double *h_ERI) {
+int ph_drpa(int nO, int nBas, double *h_eps, double *h_ERI,
+            double *h_Omega, double *h_XpY, double *h_XmY) {
 
     double *d_eps;
     double *d_ERI;
@@ -16,6 +17,17 @@ int ph_drpa(int nO, int nBas, double *h_eps, double *h_ERI) {
     int nBas4 = nBas2 * nBas2;
 
 
+    int ia, jb;
+    int nS = nO * (nBas - nO);  /* h_Omega has nS entries; h_XpY / h_XmY are nS x nS */
+    for (ia = 0; ia < nS; ia++) {
+        h_Omega[ia] = 0.0;
+        for (jb = 0; jb < nS; jb++) {
+            h_XmY[jb + nS * ia] = 0.0;  /* stride is nS: nO * nBas would run past the buffer */
+            h_XpY[jb + nS * ia] = 0.0;
+        }
+    }
+
+
     check_Cuda_Errors(cudaMalloc((void**)&d_eps, nO * sizeof(double)),
         "cudaMalloc", __FILE__, __LINE__);
     check_Cuda_Errors(cudaMalloc((void**)&d_ERI, nBas4 * sizeof(double)),
@@ -28,13 +40,15 @@ int ph_drpa(int nO, int nBas, double *h_eps, double *h_ERI) {
         "cudaMemcpy", __FILE__, __LINE__);
 
     // construct A matrix
-    int nS = nO * (nBas * nO);
     double *d_A;
     check_Cuda_Errors(cudaMalloc((void**)&d_A, nS * nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__);
-    phLR_dRPA_A_sing(nO, nBas, d_eps, d_ERI, d_A);
-    check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__);
+//    phLR_dRPA_A_sing(nO, nBas, d_eps, d_ERI, d_A);
+//    check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__);
 
 
+    check_Cuda_Errors(cudaMemcpy(h_XpY, d_A, nS * nS * sizeof(double), cudaMemcpyDeviceToHost), 
+        "cudaMemcpy", __FILE__, __LINE__);
+
     check_Cuda_Errors(cudaFree(d_eps), "cudaFree", __FILE__, __LINE__);
     check_Cuda_Errors(cudaFree(d_ERI), "cudaFree", __FILE__, __LINE__);
     check_Cuda_Errors(cudaFree(d_A), "cudaFree", __FILE__, __LINE__);
diff --git a/src/make_ninja.py b/src/make_ninja.py
index c78bc7d..3d558d3 100755
--- a/src/make_ninja.py
+++ b/src/make_ninja.py
@@ -109,6 +109,15 @@ else:
     print("Unknown platform. Only Linux and Darwin are supported.")
     sys.exit(-1)
 
+if USE_GPU:
+    compiler_tmp = compiler.strip().split('\n')
+    compiler_tmp[0] += " -L{}/src/cuda/build -lcuquack -lcudart -lcublas".format(QUACK_ROOT)
+    compiler_exe = '\n'.join(compiler_tmp)
+else:
+    compiler_exe = compiler
+
+
+
 header = """#
 # This file was automatically generated. Do not modify this file.
 # To change compiling options, make the modifications in 
@@ -171,7 +180,7 @@ build_in_lib_dir = "\n".join([
   
 build_in_exe_dir = "\n".join([
 	header,
-	compiler,
+	compiler_exe,
 	rule_fortran,
 	rule_build_exe,
 ])
@@ -191,7 +200,6 @@ if USE_GPU:
     lib_dirs[0], lib_dirs[i] = lib_dirs[i], lib_dirs[0]
 else:
     lib_dirs.remove("mod")
-print(lib_dirs)
 
 def create_ninja_in_libdir(directory):
     def write_rule(f, source_file, replace):
diff --git a/src/mod/cu_quack_module.f90 b/src/mod/cu_quack_module.f90
new file mode 100644
index 0000000..f73cc6c
--- /dev/null
+++ b/src/mod/cu_quack_module.f90
@@ -0,0 +1,39 @@
+module cu_quack_module
+
+  ! Interfaces to C routines that are called from Fortran through bind(C).
+  use, intrinsic :: iso_c_binding
+
+  implicit none
+
+  interface
+
+    ! Binding for the C function "ph_drpa". The output arrays are sized with
+    ! nS = nO*(nBas-nO), the same nS the C routine uses for its host buffers.
+    ! NOTE(review): the C prototype returns int; this binding discards it - confirm.
+    subroutine ph_drpa(nO, nBas, eps, ERI, &
+                       Omega, XpY, XmY) bind(C, name = "ph_drpa")
+      import c_int, c_double
+      integer(c_int), intent(in), value :: nO, nBas
+      real(c_double), intent(in)        :: eps(nBas)
+      real(c_double), intent(in)        :: ERI(nBas,nBas,nBas,nBas)
+      real(c_double), intent(out)       :: Omega(nO*(nBas-nO))
+      real(c_double), intent(out)       :: XpY(nO*(nBas-nO),nO*(nBas-nO))
+      real(c_double), intent(out)       :: XmY(nO*(nBas-nO),nO*(nBas-nO))
+
+    end subroutine ph_drpa
+
+  end interface
+
+  ! ---
+
+  contains
+
+    ! Prints a fixed greeting; takes no arguments.
+    subroutine cu_quack_module_test()
+        implicit none
+        print*, ' hello from mod_test'
+    end subroutine cu_quack_module_test
+
+end module cu_quack_module
+
+