diff --git a/src/cuda/Makefile b/src/cuda/Makefile
new file mode 100644
index 0000000..d76e96d
--- /dev/null
+++ b/src/cuda/Makefile
@@ -0,0 +1,63 @@
+NVCC = nvcc
+NFLAGS = -O2 --compiler-options '-fPIC'
+NDFLAGS = --shared
+
+CC = gcc
+CFLAGS = -fPIC -O2 -Wall -g
+
+FC = gfortran
+FFLAGS = -O2 -Wall -g
+
+SRC_DIR = src
+INC_DIR = include
+
+BIN_DIR = bin
+BLD_DIR = build
+$(shell mkdir -p $(BIN_DIR))
+$(shell mkdir -p $(BLD_DIR))
+
+CU_SRC = $(wildcard $(SRC_DIR)/*.cu)
+CU_OBJ = $(CU_SRC:$(SRC_DIR)/%.cu=$(BLD_DIR)/%.o)
+
+C_SRC = $(SRC_DIR)/ph_drpa.c
+C_OBJ = $(BLD_DIR)/ph_drpa.o
+
+F_SRC = $(SRC_DIR)/cu_quack_module.f90
+F_OBJ = $(BLD_DIR)/cu_quack_module.f90
+
+MAIN_SRC = $(SRC_DIR)/cu_quack.f90
+MAIN_OBJ = $(BLD_DIR)/cu_quack.o
+
+OUTPUT_LIB = $(BLD_DIR)/libcuquack.so
+
+CUDA_LIBS = -lcudart -lcublas
+
+
+all: $(OUTPUT_LIB)
+
+$(OUTPUT_LIB): $(CU_OBJ) $(C_OBJ)
+	$(NVCC) $(NFLAGS) $(NLDFLAGS) $^ -o $@ $(CUDA_LIBS) -I$(INC_DIR)
+
+$(BLD_DIR)/%.o: $(SRC_DIR)/%.cu
+	$(NVCC) $(NFLAGS) -c $< -o $@ -I$(INC_DIR)
+
+$(C_OBJ): $(C_SRC)
+	@for src in $(C_SRC); do \
+		obj=$(BLD_DIR)/$$(basename $${src} .c).o; \
+		echo "$(CC) $(CFLAGS) -c $$src -o $$obj -I$(INC_DIR)"; \
+		$(CC) $(CFLAGS) -c $$src -o $$obj -I$(INC_DIR); \
+	done
+
+$(F_OBJ): $(F_SRC)
+	$(FC) $(FFLAGS) -c $< -o $@ -J$(BLD_DIR)
+
+$(MAIN_OBJ): $(MAIN_SRC)
+	$(FC) $(FFLAGS) -c $< -o $@ -J$(BLD_DIR)
+
+
+
+.PHONY: clean
+clean:
+	rm -f $(BLD_DIR)/*.o $(BLD_DIR)/*.so $(BLD_DIR)/*.mod $(BIN_DIR)/*
+
+
diff --git a/src/cuda/include/ph_drpa.h b/src/cuda/include/ph_drpa.h
new file mode 100644
index 0000000..6f1ed07
--- /dev/null
+++ b/src/cuda/include/ph_drpa.h
@@ -0,0 +1,10 @@
+#ifndef PH_DRPA
+
+#define PH_DRPA
+
+extern void check_Cuda_Errors(cudaError_t err, const char * msg, const char * file, int line);
+extern void check_Cublas_Errors(cublasStatus_t status, const char * msg, const char * file, int line);
+
+extern void phLR_dRPA_A_sing(int nO, int nBas, double *eps, double *ERI, double *A);
+
+#endif
diff --git a/src/cuda/include/utils.cuh b/src/cuda/include/utils.cuh
new file mode 100644
index 0000000..1a91732
--- /dev/null
+++ b/src/cuda/include/utils.cuh
@@ -0,0 +1,9 @@
+#ifndef UTILS
+#define UTILS
+
+extern "C" void check_Cuda_Errors(cudaError_t err, const char* msg, const char* file, int line);
+
+extern "C" void check_Cublas_Errors(cublasStatus_t status, const char* msg, const char* file, int line);
+
+
+#endif
diff --git a/src/cuda/src/ph_drpa.c b/src/cuda/src/ph_drpa.c
new file mode 100644
index 0000000..888abaa
--- /dev/null
+++ b/src/cuda/src/ph_drpa.c
@@ -0,0 +1,47 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cuda_runtime_api.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <cublas_v2.h>
+
+#include "ph_drpa.h"
+
+int ph_drpa(int nO, int nBas, double *h_eps, double *h_ERI) {
+
+
+
+    double *d_eps;
+    double *d_ERI;
+
+    int nBas2 = nBas * nBas;
+    int nBas4 = nBas2 * nBas2;
+
+
+    check_Cuda_Errors(cudaMalloc((void**)&d_eps, nO * sizeof(double)),
+        "cudaMalloc", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaMalloc((void**)&d_ERI, nBas4 * sizeof(double)),
+        "cudaMalloc", __FILE__, __LINE__);
+
+
+    check_Cuda_Errors(cudaMemcpy(d_eps, h_eps, nO * sizeof(double), cudaMemcpyHostToDevice), 
+        "cudaMemcpy", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaMemcpy(d_ERI, h_ERI, nBas4 * sizeof(double), cudaMemcpyHostToDevice), 
+        "cudaMemcpy", __FILE__, __LINE__);
+
+    // construct A matrix
+    int nS = nO * (nBas * nO);
+    double *d_A;
+    check_Cuda_Errors(cudaMalloc((void**)&d_A, nS * nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__);
+    phLR_dRPA_A_sing(nO, nBas, d_eps, d_ERI, d_A);
+    check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__);
+
+
+    check_Cuda_Errors(cudaFree(d_eps), "cudaFree", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaFree(d_ERI), "cudaFree", __FILE__, __LINE__);
+    check_Cuda_Errors(cudaFree(d_A), "cudaFree", __FILE__, __LINE__);
+
+
+    return 0;
+}
+
diff --git a/src/cuda/src/phlr_drpa_a_sing.cu b/src/cuda/src/phlr_drpa_a_sing.cu
new file mode 100644
index 0000000..3cbc556
--- /dev/null
+++ b/src/cuda/src/phlr_drpa_a_sing.cu
@@ -0,0 +1,82 @@
+#include <stdio.h>
+
+__global__ void phLR_dRPA_A_sing_kernel(int nO, int nBas, double *eps, double *ERI, double *A) {
+
+
+    int i, j, a, b;
+    int ia, jb, jb_off;
+
+    int ij_off0, ij_off;
+
+    int aa_max = nBas - nO;
+    int ia_max = aa_max * nO;
+
+    int nBas2 = nBas * nBas;
+    int nBas3 = nBas2 * nBas;
+
+    int aa = blockIdx.x * blockDim.x + threadIdx.x;
+    int bb = blockIdx.y * blockDim.y + threadIdx.y;
+
+    while(aa < aa_max) {
+        a = aa + nO;
+
+        ij_off0 = a * nBas2;
+
+        while(bb < aa_max) {
+            b = bb + nO;
+
+            ij_off = ij_off0 + b * nBas;
+
+            while(i < nO) {
+                ia = i * aa_max + aa;
+                jb_off = ia * ia_max;
+ 
+                while(j < nO) {
+                    jb = j * aa_max + bb;
+
+                    A[jb + jb_off] = 2.0 * ERI[i + j * nBas3 + ij_off];
+                    if(a==b && i==j) {
+                        A[jb + jb_off] += eps[a] - eps[i];
+                    }
+
+                    j ++;
+	        } // j
+
+                i ++;
+            } // i
+
+            bb += blockDim.y * gridDim.y;
+        } // bb
+
+        aa += blockDim.x * gridDim.x;
+    } // aa
+
+}
+
+
+
+
+
+extern "C" void phLR_dRPA_A_sing(int nO, int nBas, double *eps, double *ERI, double *A) {
+
+
+    int size = nBas - nO;
+
+    int sBlocks = 32;
+    int nBlocks = (size + sBlocks - 1) / sBlocks;
+
+    dim3 dimGrid(nBlocks, nBlocks, 1);
+    dim3 dimBlock(sBlocks, sBlocks, 1);
+
+
+    printf("lunching phLR_dRPA_A_sing_kernel with %dx%d blocks and %dx%d threads/block\n",
+        nBlocks, nBlocks, sBlocks, sBlocks);
+
+
+    phLR_dRPA_A_sing_kernel<<<dimGrid, dimBlock>>>(nO, nBas, eps, ERI, A);
+
+}
+
+
+
+
diff --git a/src/cuda/src/utils.cu b/src/cuda/src/utils.cu
new file mode 100644
index 0000000..b20c52d
--- /dev/null
+++ b/src/cuda/src/utils.cu
@@ -0,0 +1,53 @@
+#include <cuda_runtime.h>
+#include <cuda.h>
+#include <stdio.h>
+#include <cublas_v2.h>
+
+
+extern "C" void check_Cuda_Errors(cudaError_t err, const char* msg, const char* file, int line) {
+    if (err != cudaSuccess) {
+        printf("CUDA Error in %s at line %d\n", file, line);
+        printf("%s - %s\n", msg, cudaGetErrorString(err));
+        exit(0);
+    }
+}
+
+
+const char* cublasGetErrorString(cublasStatus_t status) {
+    switch (status) {
+        case CUBLAS_STATUS_SUCCESS:
+            return "CUBLAS_STATUS_SUCCESS";
+        case CUBLAS_STATUS_NOT_INITIALIZED:
+            return "CUBLAS_STATUS_NOT_INITIALIZED";
+        case CUBLAS_STATUS_ALLOC_FAILED:
+            return "CUBLAS_STATUS_ALLOC_FAILED";
+        case CUBLAS_STATUS_INVALID_VALUE:
+            return "CUBLAS_STATUS_INVALID_VALUE";
+        case CUBLAS_STATUS_ARCH_MISMATCH:
+            return "CUBLAS_STATUS_ARCH_MISMATCH";
+        case CUBLAS_STATUS_MAPPING_ERROR:
+            return "CUBLAS_STATUS_MAPPING_ERROR";
+        case CUBLAS_STATUS_EXECUTION_FAILED:
+            return "CUBLAS_STATUS_EXECUTION_FAILED";
+        case CUBLAS_STATUS_INTERNAL_ERROR:
+            return "CUBLAS_STATUS_INTERNAL_ERROR";
+        case CUBLAS_STATUS_NOT_SUPPORTED:
+            return "CUBLAS_STATUS_NOT_SUPPORTED";
+        case CUBLAS_STATUS_LICENSE_ERROR:
+            return "CUBLAS_STATUS_LICENSE_ERROR";
+    }
+    return "UNKNOWN CUBLAS ERROR";
+}
+
+extern "C" void check_Cublas_Errors(cublasStatus_t status, const char* msg, const char* file, int line) {
+
+    const char* err = cublasGetErrorString(status);
+
+    if (err != "CUBLAS_STATUS_SUCCESS") {
+        printf("CUBLAS Error in %s at line %d\n", file, line);
+        printf("%s - %s\n", msg, err);
+        exit(0);
+    }
+}
+
+