diff --git a/src/RPA/phRRPA.f90 b/src/RPA/phRRPA.f90
index 7d00061..577ab6c 100644
--- a/src/RPA/phRRPA.f90
+++ b/src/RPA/phRRPA.f90
@@ -31,6 +31,7 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,
 
 ! Local variables
 
+  integer                       :: i
   integer                       :: ispin
   logical                       :: dRPA
   double precision              :: lambda
@@ -40,7 +41,7 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,
   double precision,allocatable  :: XpY(:,:)
   double precision,allocatable  :: XmY(:,:)
   ! DEBUG
-  double precision, allocatable :: XpY_gpu(:,:), XmY_gpu(:,:), Om_gpu(:)
+  !double precision, allocatable :: XpY_gpu(:,:), XmY_gpu(:,:), Om_gpu(:)
 
   double precision              :: EcRPA(nspin)
 
@@ -78,15 +79,21 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,
     call phLR_A(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,eHF,ERI,Aph)
     if(.not.TDA) call phLR_B(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,ERI,Bph)
 
-    ! DEBUG
-    allocate(Om_gpu(nS), XpY_gpu(nS,nS), XmY_gpu(nS,nS))
-    call ph_drpa(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om_gpu(1), XpY_gpu(1,1), XmY_gpu(1,1))
-    print *, ' CPU:', Aph(1,1)
-    print *, ' GPU:', XpY_gpu(1,1)
-    print *, ' GPU:', XmY_gpu(1,1)
-    stop
-
     call phLR(TDA,nS,Aph,Bph,EcRPA(ispin),Om,XpY,XmY)
+
+    !! DEBUG
+    !allocate(Om_gpu(nS), XpY_gpu(nS,nS), XmY_gpu(nS,nS))
+    !call ph_drpa_tda(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om_gpu(1), XpY_gpu(1,1))
+    !do i = 1, nS
+    !  print *, i, Om(i), Om_gpu(i)
+    !  if(dabs(Om(i) - Om_gpu(i)) .gt. 1d-13) then
+    !    print *, 'GPU FAILED!'
+    !    stop
+    !  endif
+    !enddo
+    !print *, 'GPU DONE!'
+    !stop
+
     call print_excitation_energies('phRPA@RHF','singlet',nS,Om)
     call phLR_transition_vectors(.true.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY)
 
diff --git a/src/cuda/include/ph_drpa.h b/src/cuda/include/ph_drpa.h
deleted file mode 100644
index 1786a2f..0000000
--- a/src/cuda/include/ph_drpa.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef PH_DRPA
-
-#define PH_DRPA
-
-extern void check_Cuda_Errors(cudaError_t err, const char * msg, const char * file, int line);
-extern void check_Cublas_Errors(cublasStatus_t status, const char * msg, const char * file, int line);
-
-extern void ph_dRPA_A_sing(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *A);
-
-#endif
diff --git a/src/cuda/include/ph_rpa.h b/src/cuda/include/ph_rpa.h
new file mode 100644
index 0000000..36f111f
--- /dev/null
+++ b/src/cuda/include/ph_rpa.h
@@ -0,0 +1,10 @@
+#ifndef PH_RPA
+
+#define PH_RPA
+
+extern void ph_dRPA_A_sing(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *A);
+extern void ph_dRPA_B_sing(int nO, int nV, int nBas, int nS, double *ERI, double *B);
+
+extern void diag_dn_dsyevd(int n, int *info, double *W, double *A);
+
+#endif
diff --git a/src/cuda/include/utils.cuh b/src/cuda/include/utils.cuh
deleted file mode 100644
index 1a91732..0000000
--- a/src/cuda/include/utils.cuh
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef UTILS
-#define UTILS
-
-extern "C" void check_Cuda_Errors(cudaError_t err, const char* msg, const char* file, int line);
-
-extern "C" void check_Cublas_Errors(cublasStatus_t status, const char* msg, const char* file, int line);
-
-
-#endif
diff --git a/src/cuda/include/utils.h b/src/cuda/include/utils.h
new file mode 100644
index 0000000..f5a6403
--- /dev/null
+++ b/src/cuda/include/utils.h
@@ -0,0 +1,9 @@
+#ifndef UTILS
+
+#define UTILS
+
+extern void check_Cuda_Errors(cudaError_t err, const char *msg, const char *file, int line);
+extern void check_Cublas_Errors(cublasStatus_t status, const char *msg, const char *file, int line);
+extern void check_Cusolver_Errors(cusolverStatus_t status, const char *msg, const char *file, int line);
+
+#endif
diff --git a/src/cuda/src/diag_dense_dsy_mat.cu b/src/cuda/src/diag_dense_dsy_mat.cu
new file mode 100644
index 0000000..b800501
--- /dev/null
+++ b/src/cuda/src/diag_dense_dsy_mat.cu
@@ -0,0 +1,41 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda_runtime.h>
+#include <cusolverDn.h>
+
+
+
+extern "C" void diag_dn_dsyevd(int n, int *info, double *W, double *A) {
+
+    cusolverDnHandle_t cusolverH = NULL;
+    cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; // Compute eigenvalues and eigenvectors
+    cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER; // Upper triangular part of the matrix is stored
+
+    int lwork = 0;
+    double *work = NULL;
+
+    //check_Cusolver_Errors(cusolverDnCreate(&cusolverH), "cusolverDnCreate", __FILE__, __LINE__);
+    cusolverDnCreate(&cusolverH);
+
+    // Query workspace size
+    //check_Cusolver_Errors(cusolverDnDsyevd_bufferSize(cusolverH, jobz, uplo, n, A, n, W, &lwork),
+    //    "cusolverDnDsyevd_bufferSize", __FILE__, __LINE__);
+    //check_Cuda_Errors(cudaMalloc((void**)&work, sizeof(double) * lwork),
+    //    "cudaMemcpy", __FILE__, __LINE__);
+    cusolverDnDsyevd_bufferSize(cusolverH, jobz, uplo, n, A, n, W, &lwork);
+    cudaMalloc((void**)&work, sizeof(double) * lwork);
+
+    // Compute eigenvalues and eigenvectors
+    //check_Cusolver_Errors(cusolverDnDsyevd(cusolverH, jobz, uplo, n, A, n, W, work, lwork, info),
+    //    "cusolverDnDsyevd", __FILE__, __LINE__);
+    cusolverDnDsyevd(cusolverH, jobz, uplo, n, A, n, W, work, lwork, info);
+
+    // Clean up
+    //check_Cuda_Errors(cudaFree(work), "cudaFree", __FILE__, __LINE__);
+    //check_Cusolver_Errors(cusolverDnDestroy(cusolverH), "cusolverDnDestroy", __FILE__, __LINE__);
+
+    cudaFree(work);
+    cusolverDnDestroy(cusolverH);
+
+}
+
diff --git a/src/cuda/src/ph_drpa.c b/src/cuda/src/ph_drpa.c
deleted file mode 100644
index c90bfab..0000000
--- a/src/cuda/src/ph_drpa.c
+++ /dev/null
@@ -1,59 +0,0 @@
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <cuda_runtime_api.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <cublas_v2.h>
-
-#include "ph_drpa.h"
-
-int ph_drpa(int nO, int nBas, int nS, double *h_eps, double *h_ERI,
-            double *h_Omega, double *h_XpY, double *h_XmY) {
-
-    double *d_eps;
-    double *d_ERI;
-
-    int nV = nBas - nO;
-
-    int nBas2 = nBas * nBas;
-    int nBas4 = nBas2 * nBas2;
-
-    int ia, jb;
-    for (ia = 0; ia < nS; ia++) {
-        h_Omega[ia] = 0.0;
-        for (jb = 0; jb < nS; jb++) {
-            h_XmY[jb + ia * nS] = 4.0;
-            //h_XpY[jb + ia * nS] = 5.0;
-        }
-    }
-
-
-    check_Cuda_Errors(cudaMalloc((void**)&d_eps, nBas * sizeof(double)),
-        "cudaMalloc", __FILE__, __LINE__);
-    check_Cuda_Errors(cudaMalloc((void**)&d_ERI, nBas4 * sizeof(double)),
-        "cudaMalloc", __FILE__, __LINE__);
-
-
-    check_Cuda_Errors(cudaMemcpy(d_eps, h_eps, nBas * sizeof(double), cudaMemcpyHostToDevice), 
-        "cudaMemcpy", __FILE__, __LINE__);
-    check_Cuda_Errors(cudaMemcpy(d_ERI, h_ERI, nBas4 * sizeof(double), cudaMemcpyHostToDevice), 
-        "cudaMemcpy", __FILE__, __LINE__);
-
-    // construct A matrix
-    double *d_A;
-    check_Cuda_Errors(cudaMalloc((void**)&d_A, nS * nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__);
-    ph_dRPA_A_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_A);
-    check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__);
-
-
-    check_Cuda_Errors(cudaMemcpy(h_XpY, d_A, nS * nS * sizeof(double), cudaMemcpyDeviceToHost), 
-        "cudaMemcpy", __FILE__, __LINE__);
-
-    check_Cuda_Errors(cudaFree(d_eps), "cudaFree", __FILE__, __LINE__);
-    check_Cuda_Errors(cudaFree(d_ERI), "cudaFree", __FILE__, __LINE__);
-    check_Cuda_Errors(cudaFree(d_A), "cudaFree", __FILE__, __LINE__);
-
-
-    return 0;
-}
-
diff --git a/src/cuda/src/ph_drpa_tda.c b/src/cuda/src/ph_drpa_tda.c
new file mode 100644
index 0000000..2d6bccf
--- /dev/null
+++ b/src/cuda/src/ph_drpa_tda.c
@@ -0,0 +1,76 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cuda_runtime_api.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <cublas_v2.h>
+#include <cusolverDn.h>
+
+#include "utils.h"
+#include "ph_rpa.h"
+
+void ph_drpa_tda(int nO, int nBas, int nS, double *h_eps, double *h_ERI,
+                 double *h_Omega, double *h_X) {
+
+    double *d_eps = NULL;
+    double *d_ERI = NULL;
+
+    int nV = nBas - nO;
+
+    int nBas2 = nBas * nBas;
+    int nBas4 = nBas2 * nBas2;
+
+
+    check_Cuda_Errors(cudaMalloc((void**)&d_eps, nBas * sizeof(double)),
+        "cudaMalloc", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaMalloc((void**)&d_ERI, nBas4 * sizeof(double)),
+        "cudaMalloc", __FILE__, __LINE__);
+
+    check_Cuda_Errors(cudaMemcpy(d_eps, h_eps, nBas * sizeof(double), cudaMemcpyHostToDevice), 
+        "cudaMemcpy", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaMemcpy(d_ERI, h_ERI, nBas4 * sizeof(double), cudaMemcpyHostToDevice), 
+        "cudaMemcpy", __FILE__, __LINE__);
+
+    // construct A
+    double *d_A = NULL;
+    check_Cuda_Errors(cudaMalloc((void**)&d_A, nS * nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__);
+
+    ph_dRPA_A_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_A);
+    check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__);
+
+
+    // diagonalize A
+    int *d_info = NULL;
+    double *d_Omega = NULL;
+    check_Cuda_Errors(cudaMalloc((void**)&d_info, sizeof(int)),
+        "cudaMalloc", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaMalloc((void**)&d_Omega, nS * sizeof(double)),
+        "cudaMalloc", __FILE__, __LINE__);
+
+    diag_dn_dsyevd(nS, d_info, d_Omega, d_A);
+    check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__);
+
+    int info_gpu = 0;
+    check_Cuda_Errors(cudaMemcpy(&info_gpu, d_info, sizeof(int), cudaMemcpyDeviceToHost),
+        "cudaMemcpy", __FILE__, __LINE__);
+    if (info_gpu != 0) {
+        printf("Error: diag_dn_dsyevd returned error code %d\n", info_gpu);
+        exit(EXIT_FAILURE);
+    }
+
+
+    check_Cuda_Errors(cudaMemcpy(h_X, d_A, nS * nS * sizeof(double), cudaMemcpyDeviceToHost), 
+        "cudaMemcpy", __FILE__, __LINE__);
+
+    check_Cuda_Errors(cudaMemcpy(h_Omega, d_Omega, nS * sizeof(double), cudaMemcpyDeviceToHost), 
+        "cudaMemcpy", __FILE__, __LINE__);
+
+    check_Cuda_Errors(cudaFree(d_info), "cudaFree", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaFree(d_eps), "cudaFree", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaFree(d_ERI), "cudaFree", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaFree(d_A), "cudaFree", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaFree(d_Omega), "cudaFree", __FILE__, __LINE__);
+
+
+}
+
diff --git a/src/cuda/src/utils.cu b/src/cuda/src/utils.cu
index ff5d83d..673b21e 100644
--- a/src/cuda/src/utils.cu
+++ b/src/cuda/src/utils.cu
@@ -3,6 +3,7 @@
 #include <stdio.h>
 #include <cublas_v2.h>
 #include <cstring>
+#include <cusolverDn.h>
 
 
 
@@ -53,3 +54,72 @@ extern "C" void check_Cublas_Errors(cublasStatus_t status, const char* msg, cons
 }
 
 
+
+const char* cusolver_Get_Error_String(cusolverStatus_t status) {
+    switch (status) {
+        case CUSOLVER_STATUS_SUCCESS:
+            return "CUSOLVER_STATUS_SUCCESS";
+        case CUSOLVER_STATUS_NOT_INITIALIZED:
+            return "CUSOLVER_STATUS_NOT_INITIALIZED";
+        case CUSOLVER_STATUS_ALLOC_FAILED:
+            return "CUSOLVER_STATUS_ALLOC_FAILED";
+        case CUSOLVER_STATUS_INVALID_VALUE:
+            return "CUSOLVER_STATUS_INVALID_VALUE";
+        case CUSOLVER_STATUS_ARCH_MISMATCH:
+            return "CUSOLVER_STATUS_ARCH_MISMATCH";
+        case CUSOLVER_STATUS_MAPPING_ERROR:
+            return "CUSOLVER_STATUS_MAPPING_ERROR";
+        case CUSOLVER_STATUS_EXECUTION_FAILED:
+            return "CUSOLVER_STATUS_EXECUTION_FAILED";
+        case CUSOLVER_STATUS_INTERNAL_ERROR:
+            return "CUSOLVER_STATUS_INTERNAL_ERROR";
+        case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+            return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+        case CUSOLVER_STATUS_NOT_SUPPORTED:
+            return "CUSOLVER_STATUS_NOT_SUPPORTED";
+        case CUSOLVER_STATUS_ZERO_PIVOT:
+            return "CUSOLVER_STATUS_ZERO_PIVOT";
+        case CUSOLVER_STATUS_INVALID_LICENSE:
+            return "CUSOLVER_STATUS_INVALID_LICENSE";
+        case CUSOLVER_STATUS_IRS_PARAMS_NOT_INITIALIZED:
+            return "CUSOLVER_STATUS_IRS_PARAMS_NOT_INITIALIZED";
+        case CUSOLVER_STATUS_IRS_PARAMS_INVALID:
+            return "CUSOLVER_STATUS_IRS_PARAMS_INVALID";
+        case CUSOLVER_STATUS_IRS_PARAMS_INVALID_PREC:
+            return "CUSOLVER_STATUS_IRS_PARAMS_INVALID_PREC";
+        case CUSOLVER_STATUS_IRS_PARAMS_INVALID_REFINE:
+            return "CUSOLVER_STATUS_IRS_PARAMS_INVALID_REFINE";
+        case CUSOLVER_STATUS_IRS_PARAMS_INVALID_MAXITER:
+            return "CUSOLVER_STATUS_IRS_PARAMS_INVALID_MAXITER";
+        case CUSOLVER_STATUS_IRS_INTERNAL_ERROR:
+            return "CUSOLVER_STATUS_IRS_INTERNAL_ERROR";
+        case CUSOLVER_STATUS_IRS_NOT_SUPPORTED:
+            return "CUSOLVER_STATUS_IRS_NOT_SUPPORTED";
+        case CUSOLVER_STATUS_IRS_OUT_OF_RANGE:
+            return "CUSOLVER_STATUS_IRS_OUT_OF_RANGE";
+        case CUSOLVER_STATUS_IRS_NRHS_NOT_SUPPORTED_FOR_REFINE_GMRES:
+            return "CUSOLVER_STATUS_IRS_NRHS_NOT_SUPPORTED_FOR_REFINE_GMRES";
+        case CUSOLVER_STATUS_IRS_INFOS_NOT_INITIALIZED:
+            return "CUSOLVER_STATUS_IRS_INFOS_NOT_INITIALIZED";
+        case CUSOLVER_STATUS_IRS_INFOS_NOT_DESTROYED:
+            return "CUSOLVER_STATUS_IRS_INFOS_NOT_DESTROYED";
+        case CUSOLVER_STATUS_IRS_MATRIX_SINGULAR:
+            return "CUSOLVER_STATUS_IRS_MATRIX_SINGULAR";
+        case CUSOLVER_STATUS_INVALID_WORKSPACE:
+            return "CUSOLVER_STATUS_INVALID_WORKSPACE";
+        default:
+            return "UNKNOWN CUSOLVER ERROR";
+    }
+}
+
+extern "C" void check_Cusolver_Errors(cusolverStatus_t status, const char* msg, const char* file, int line) {
+
+    const char* err = cusolver_Get_Error_String(status);
+
+    if (status != CUSOLVER_STATUS_SUCCESS) {
+        printf("CUSOLVER Error in %s at line %d\n", file, line);
+        printf("%s - %s\n", msg, err);
+        exit(EXIT_FAILURE);
+    }
+}
+
diff --git a/src/make_ninja.py b/src/make_ninja.py
index 3d558d3..770c405 100755
--- a/src/make_ninja.py
+++ b/src/make_ninja.py
@@ -111,7 +111,7 @@ else:
 
 if USE_GPU:
     compiler_tmp = compiler.strip().split('\n')
-    compiler_tmp[0] += " -L{}/src/cuda/build -lcuquack -lcudart -lcublas".format(QUACK_ROOT)
+    compiler_tmp[0] += " -L{}/src/cuda/build -lcuquack -lcudart -lcublas -lcusolver".format(QUACK_ROOT)
     compiler_exe = '\n'.join(compiler_tmp)
 else:
     compiler_exe = compiler
diff --git a/src/mod/cu_quack_module.f90 b/src/mod/cu_quack_module.f90
index a786587..cf974bb 100644
--- a/src/mod/cu_quack_module.f90
+++ b/src/mod/cu_quack_module.f90
@@ -8,18 +8,17 @@ module cu_quack_module
 
   interface
 
-    subroutine ph_drpa(nO, nBas, nS, eps, ERI, &
-                       Omega, XpY, XmY) bind(C, name = "ph_drpa")
+    subroutine ph_drpa_tda(nO, nBas, nS, eps, ERI, &
+                           Omega, X) bind(C, name = "ph_drpa_tda")
 
       import c_int, c_double
       integer(c_int), intent(in), value :: nO, nBas, nS
       real(c_double), intent(in)        :: eps(nBas)
       real(c_double), intent(in)        :: ERI(nBas,nBas,nBas,nBas)
       real(c_double), intent(out)       :: Omega(nS)
-      real(c_double), intent(out)       :: XpY(nS,nS)
-      real(c_double), intent(out)       :: XmY(nS,nS)
+      real(c_double), intent(out)       :: X(nS,nS)
 
-    end subroutine ph_drpa
+    end subroutine ph_drpa_tda
 
   end interface
 
@@ -29,7 +28,7 @@ module cu_quack_module
 
     subroutine cu_quack_module_test()
         implicit none
-        print*, ' hello from mod_test'
+        print*, ' hello from cu_quack_module'
     end subroutine cu_quack_module_test
 
   ! ---