diff --git a/src/cuda/include/ph_drpa.h b/src/cuda/include/ph_drpa.h
index 6f1ed07..1f68857 100644
--- a/src/cuda/include/ph_drpa.h
+++ b/src/cuda/include/ph_drpa.h
@@ -5,6 +5,6 @@
 extern void check_Cuda_Errors(cudaError_t err, const char * msg, const char * file, int line);
 extern void check_Cublas_Errors(cublasStatus_t status, const char * msg, const char * file, int line);
 
-extern void phLR_dRPA_A_sing(int nO, int nBas, double *eps, double *ERI, double *A);
+extern void ph_dRPA_A_sing(int nO, int nBas, double *eps, double *ERI, double *A);
 
 #endif
diff --git a/src/cuda/src/ph_drpa.c b/src/cuda/src/ph_drpa.c
index 0e0f671..304f1c1 100644
--- a/src/cuda/src/ph_drpa.c
+++ b/src/cuda/src/ph_drpa.c
@@ -40,7 +40,7 @@ int ph_drpa(int nO, int nBas, int nS, double *h_eps, double *h_ERI,
     // construct A matrix
     double *d_A;
     check_Cuda_Errors(cudaMalloc((void**)&d_A, nS * nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__);
-    phLR_dRPA_A_sing(nO, nBas, d_eps, d_ERI, d_A);
+    ph_dRPA_A_sing(nO, nBas, d_eps, d_ERI, d_A);
     check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__);
 
 
diff --git a/src/cuda/src/phlr_drpa_a_sing.cu b/src/cuda/src/ph_drpa_a_sing.cu
similarity index 79%
rename from src/cuda/src/phlr_drpa_a_sing.cu
rename to src/cuda/src/ph_drpa_a_sing.cu
index 7844d66..5308c1d 100644
--- a/src/cuda/src/phlr_drpa_a_sing.cu
+++ b/src/cuda/src/ph_drpa_a_sing.cu
@@ -1,6 +1,6 @@
 #include <stdio.h>
 
-__global__ void phLR_dRPA_A_sing_kernel(int nO, int nBas, double *eps, double *ERI, double *A) {
+__global__ void ph_dRPA_A_sing_kernel(int nO, int nBas, double *eps, double *ERI, double *A) {
 
 
     int i, j, a, b;
@@ -64,7 +64,7 @@ __global__ void phLR_dRPA_A_sing_kernel(int nO, int nBas, double *eps, double *E
 
 
 
-extern "C" void phLR_dRPA_A_sing(int nO, int nBas, double *eps, double *ERI, double *A) {
+extern "C" void ph_dRPA_A_sing(int nO, int nBas, double *eps, double *ERI, double *A) {
 
 
     int size = nBas - nO;
@@ -76,11 +76,11 @@ extern "C" void phLR_dRPA_A_sing(int nO, int nBas, double *eps, double *ERI, dou
     dim3 dimBlock(sBlocks, sBlocks, 1);
 
 
-    printf("lunching phLR_dRPA_A_sing_kernel with %dx%d blocks and %dx%d threads/block\n",
+    printf("lunching ph_dRPA_A_sing_kernel with %dx%d blocks and %dx%d threads/block\n",
         nBlocks, nBlocks, sBlocks, sBlocks);
 
 
-    phLR_dRPA_A_sing_kernel<<<dimGrid, dimBlock>>>(nO, nBas, eps, ERI, A);
+    ph_dRPA_A_sing_kernel<<<dimGrid, dimBlock>>>(nO, nBas, eps, ERI, A);
 
 }
 
diff --git a/src/cuda/src/ph_drpa_a_trip.cu b/src/cuda/src/ph_drpa_a_trip.cu
new file mode 100644
index 0000000..a58b5a2
--- /dev/null
+++ b/src/cuda/src/ph_drpa_a_trip.cu
@@ -0,0 +1,89 @@
+#include <stdio.h>
+
+/*
+ * Fill the (nS x nS) ph-dRPA A matrix of the "_trip" variant on the device,
+ * with nV = nBas - nO and nS = nO * nV.
+ *
+ * Every element is overwritten:
+ *     A[(i*nV + aa)*nS + (j*nV + bb)] = eps[a] - eps[i]   if i == j and a == b
+ *                                     = 0.0               otherwise
+ * for 0 <= i, j < nO and 0 <= aa, bb < nV, where a = aa + nO and b = bb + nO.
+ * No ERI array is taken or read by this kernel.
+ *
+ * Launch layout: 2D grid of 2D blocks. The x dimension walks aa and the y
+ * dimension walks bb, each advancing by the total number of threads along
+ * that dimension, so any grid/block shape covers the whole matrix.
+ *
+ * nO   : number of i/j values; also the offset turning aa/bb into a/b
+ * nBas : nO + nV; eps is read at indices 0 .. nBas-1
+ * eps  : device array (presumably orbital energies -- confirm with caller)
+ * A    : device array of nS*nS doubles, output only
+ */
+__global__ void ph_dRPA_A_trip_kernel(int nO, int nBas, double *eps, double *A) {
+
+    const int nV = nBas - nO;
+
+    // size_t: the flat index goes up to nS*nS - 1, which can exceed INT_MAX
+    const size_t nS = (size_t) nO * nV;
+
+    // this thread's first (aa, bb) pair and its step along each dimension
+    const int aa0 = blockIdx.x * blockDim.x + threadIdx.x;
+    const int bb0 = blockIdx.y * blockDim.y + threadIdx.y;
+    const int aaStep = blockDim.x * gridDim.x;
+    const int bbStep = blockDim.y * gridDim.y;
+
+    for(int aa = aa0; aa < nV; aa += aaStep) {
+        const int a = aa + nO;
+
+        // bb restarts at bb0 for every aa: carrying the exhausted value over
+        // from the previous aa would leave all later rows unwritten whenever
+        // the grid is narrower than nV along x.
+        for(int bb = bb0; bb < nV; bb += bbStep) {
+
+            for(int i = 0; i < nO; i++) {
+
+                // start of row (i, aa) in the flat matrix
+                const size_t row = ((size_t) i * nV + aa) * nS;
+
+                // the only possibly non-zero entry of this row has j == i, b == a
+                const double diag = (aa == bb) ? (eps[a] - eps[i]) : 0.0;
+
+                for(int j = 0; j < nO; j++) {
+                    A[row + (size_t) j * nV + bb] = (i == j) ? diag : 0.0;
+                } // j
+
+            } // i
+
+        } // bb
+
+    } // aa
+
+}
+
+
+
+
+
+/*
+ * Host entry point (C linkage): launch ph_dRPA_A_trip_kernel on the default
+ * stream with 32x32-thread blocks and enough blocks to cover nV = nBas - nO
+ * in both grid dimensions. eps and A are forwarded to the kernel unchanged,
+ * so they must be device pointers. The launch is not synchronized or
+ * error-checked here; that is left to the caller.
+ */
+extern "C" void ph_dRPA_A_trip(int nO, int nBas, double *eps, double *A) {
+
+    const int sBlocks = 32;
+    const int nV = nBas - nO;
+
+    // nV <= 0 would give a zero-sized grid, which is an invalid launch
+    // configuration; there is nothing to fill in that case anyway.
+    if(nV <= 0) return;
+
+    const int nBlocks = (nV + sBlocks - 1) / sBlocks;
+    const dim3 dimGrid(nBlocks, nBlocks, 1), dimBlock(sBlocks, sBlocks, 1);
+
+    printf("lunching ph_dRPA_A_trip_kernel with %dx%d blocks and %dx%d threads/block\n",
+        nBlocks, nBlocks, sBlocks, sBlocks);
+    ph_dRPA_A_trip_kernel<<<dimGrid, dimBlock>>>(nO, nBas, eps, A);
+}