From bf15b68b0b64a229812dc4eeb2ef498fe56b194c Mon Sep 17 00:00:00 2001
From: AbdAmmar <abd.ammar.phys@gmail.com>
Date: Tue, 13 Aug 2024 10:57:44 +0200
Subject: [PATCH] add normal-ordering with CuTC

---
 plugins/local/tc_int/cutc_module.F90         |  58 +++-
 plugins/local/tc_int/deb_no_0e_gpu.irp.f     |  96 ++++++
 plugins/local/tc_int/deb_no_gpu.irp.f        | 170 ++++++++++
 plugins/local/tc_int/deb_tc_int_cuda.irp.f   |   6 +-
 plugins/local/tc_int/no_0e.irp.f             | 331 +++++++++----------
 plugins/local/tc_int/write_tc_int_cuda.irp.f |  12 +-
 6 files changed, 490 insertions(+), 183 deletions(-)
 create mode 100644 plugins/local/tc_int/deb_no_0e_gpu.irp.f
 create mode 100644 plugins/local/tc_int/deb_no_gpu.irp.f

diff --git a/plugins/local/tc_int/cutc_module.F90 b/plugins/local/tc_int/cutc_module.F90
index d1466697..eaf271e5 100644
--- a/plugins/local/tc_int/cutc_module.F90
+++ b/plugins/local/tc_int/cutc_module.F90
@@ -9,13 +9,13 @@ module cutc_module
 
     ! ---
 
-    subroutine cutc_int_c(nxBlocks, nyBlocks, nzBlocks,           &
-                          blockxSize, blockySize, blockzSize,     &
-                          n_grid1, n_grid2, n_ao, n_nuc, size_bh, &
-                          r1, wr1, r2, wr2, rn,                   &
-                          aos_data1, aos_data2,                   &
-                          c_bh, m_bh, n_bh, o_bh,                 &
-                          int2_grad1_u12_ao, int_2e_ao) bind(C, name = "cutc_int_c")
+    subroutine cutc_int(nxBlocks, nyBlocks, nzBlocks,           &
+                        blockxSize, blockySize, blockzSize,     &
+                        n_grid1, n_grid2, n_ao, n_nuc, size_bh, &
+                        r1, wr1, r2, wr2, rn,                   &
+                        aos_data1, aos_data2,                   &
+                        c_bh, m_bh, n_bh, o_bh,                 &
+                        int2_grad1_u12_ao, int_2e_ao) bind(C, name = "cutc_int")
 
       import c_int, c_double, c_ptr
       integer(c_int), intent(in), value :: nxBlocks, blockxSize
@@ -37,7 +37,7 @@ module cutc_module
       real(c_double), intent(out)       :: int2_grad1_u12_ao(n_ao,n_ao,n_grid1,3)
       real(c_double), intent(out)       :: int_2e_ao(n_ao,n_ao,n_ao,n_ao)
 
-    end subroutine cutc_int_c
+    end subroutine cutc_int
 
     ! ---
 
@@ -170,6 +170,48 @@ module cutc_module
 
     ! ---
 
+    subroutine cutc_no_0e(n_grid1, n_mo, ne_a, ne_b,                   &
+                          wr1, mos_l_in_r, mos_r_in_r, int2_grad1_u12, &
+                          no_0e) bind(C, name = "cutc_no_0e")
+
+      import c_int, c_double, c_ptr
+
+      integer(c_int), intent(in), value :: n_grid1
+      integer(c_int), intent(in), value :: n_mo
+      integer(c_int), intent(in), value :: ne_a
+      integer(c_int), intent(in), value :: ne_b
+      real(c_double), intent(in)        :: wr1(n_grid1)
+      real(c_double), intent(in)        :: mos_l_in_r(n_grid1,n_mo)
+      real(c_double), intent(in)        :: mos_r_in_r(n_grid1,n_mo)
+      real(c_double), intent(in)        :: int2_grad1_u12(n_grid1,3,n_mo,n_mo)
+      real(c_double), intent(out)       :: no_0e(1)
+
+    end subroutine cutc_no_0e
+
+    ! ---
+
+    subroutine cutc_no(n_grid1, n_mo, ne_a, ne_b,                   &
+                       wr1, mos_l_in_r, mos_r_in_r, int2_grad1_u12, &
+                       no_2e, no_1e, no_0e) bind(C, name = "cutc_no")
+
+      import c_int, c_double, c_ptr
+
+      integer(c_int), intent(in), value :: n_grid1
+      integer(c_int), intent(in), value :: n_mo
+      integer(c_int), intent(in), value :: ne_a
+      integer(c_int), intent(in), value :: ne_b
+      real(c_double), intent(in)        :: wr1(n_grid1)
+      real(c_double), intent(in)        :: mos_l_in_r(n_grid1,n_mo)
+      real(c_double), intent(in)        :: mos_r_in_r(n_grid1,n_mo)
+      real(c_double), intent(in)        :: int2_grad1_u12(n_grid1,3,n_mo,n_mo)
+      real(c_double), intent(out)       :: no_2e(n_mo,n_mo,n_mo,n_mo)
+      real(c_double), intent(out)       :: no_1e(n_mo,n_mo)
+      real(c_double), intent(out)       :: no_0e(1)
+
+    end subroutine cutc_no
+
+    ! ---
+
   end interface
 
 end module cutc_module
diff --git a/plugins/local/tc_int/deb_no_0e_gpu.irp.f b/plugins/local/tc_int/deb_no_0e_gpu.irp.f
new file mode 100644
index 00000000..afff060a
--- /dev/null
+++ b/plugins/local/tc_int/deb_no_0e_gpu.irp.f
@@ -0,0 +1,96 @@
+
+! ---
+
+subroutine deb_no_0e_gpu()
+
+  use cutc_module
+
+  implicit none
+
+  integer                       :: i, j, k, l, ipoint
+  double precision              :: acc_thr, err_tot, nrm_tot, err_loc
+  double precision              :: noL_0e
+  double precision              :: noL_0e_gpu(1)
+  double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:)
+  double precision, allocatable :: tmp(:,:,:,:)
+  double precision, allocatable :: int2_grad1_u12_bimo_t(:,:,:,:)
+
+
+  PROVIDE mo_l_coef mo_r_coef
+  PROVIDE mos_l_in_r_array_transp mos_r_in_r_array_transp
+
+
+  allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,3))
+  print*, ' Reading int2_grad1_u12_ao from ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao'
+  open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="read")
+    read(11) int2_grad1_u12_ao
+  close(11)
+
+  allocate(tmp(mo_num,mo_num,n_points_final_grid,3))
+  allocate(int2_grad1_u12_bimo_t(n_points_final_grid,3,mo_num,mo_num))
+
+  !$OMP PARALLEL         &
+  !$OMP DEFAULT (NONE)   &
+  !$OMP PRIVATE (ipoint) & 
+  !$OMP SHARED (ao_num, mo_num, n_points_final_grid, int2_grad1_u12_ao, tmp)
+  !$OMP DO SCHEDULE (dynamic)
+  do ipoint = 1, n_points_final_grid
+    call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,1), ao_num, tmp(1,1,ipoint,1), mo_num)
+    call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,2), ao_num, tmp(1,1,ipoint,2), mo_num)
+    call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,3), ao_num, tmp(1,1,ipoint,3), mo_num)
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  deallocate(int2_grad1_u12_ao)
+
+  !$OMP PARALLEL               &
+  !$OMP DEFAULT (NONE)         &
+  !$OMP PRIVATE (i, j, ipoint) & 
+  !$OMP SHARED (mo_num, n_points_final_grid, tmp, int2_grad1_u12_bimo_t)
+  !$OMP DO COLLAPSE(2) SCHEDULE (dynamic)
+  do ipoint = 1, n_points_final_grid
+    do i = 1, mo_num
+      do j = 1, mo_num
+        int2_grad1_u12_bimo_t(ipoint,1,j,i) = tmp(j,i,ipoint,1)
+        int2_grad1_u12_bimo_t(ipoint,2,j,i) = tmp(j,i,ipoint,2)
+        int2_grad1_u12_bimo_t(ipoint,3,j,i) = tmp(j,i,ipoint,3)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  deallocate(tmp)
+
+  ! ---
+
+  call cutc_no_0e(n_points_final_grid, mo_num, elec_alpha_num, elec_beta_num, &
+                  final_weight_at_r_vector(1),                                &
+                  mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), &
+                  int2_grad1_u12_bimo_t(1,1,1,1), noL_0e_gpu(1))
+
+  ! ---
+
+  call provide_no_0e(n_points_final_grid, mo_num, elec_alpha_num, elec_beta_num, &
+                     final_weight_at_r_vector(1),                                &
+                     mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), &
+                     int2_grad1_u12_bimo_t(1,1,1,1), noL_0e)
+
+  ! ---
+
+  deallocate(int2_grad1_u12_bimo_t)
+
+  print *, 'noL_0e CPU = ', noL_0e
+  print *, 'noL_0e GPU = ', noL_0e_gpu(1)
+
+  err_tot = dabs(noL_0e - noL_0e_gpu(1))
+  nrm_tot = dabs(noL_0e)
+  print *, ' absolute accuracy on noL_0e (%) =', 100.d0 * err_tot / nrm_tot
+  
+  return
+
+end
+
+! ---
+
diff --git a/plugins/local/tc_int/deb_no_gpu.irp.f b/plugins/local/tc_int/deb_no_gpu.irp.f
new file mode 100644
index 00000000..e14404e6
--- /dev/null
+++ b/plugins/local/tc_int/deb_no_gpu.irp.f
@@ -0,0 +1,170 @@
+
+! ---
+
+subroutine deb_no_gpu()
+
+  use cutc_module
+
+  implicit none
+
+  integer                       :: i, j, k, l, ipoint
+  double precision              :: acc_thr, err_tot, nrm_tot, err_loc
+  double precision              :: noL_0e
+  double precision              :: noL_0e_gpu(1)
+  double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:)
+  double precision, allocatable :: tmp(:,:,:,:)
+  double precision, allocatable :: int2_grad1_u12_bimo_t(:,:,:,:)
+  double precision, allocatable :: noL_1e    (:,:)
+  double precision, allocatable :: noL_1e_gpu(:,:)
+  double precision, allocatable :: noL_2e    (:,:,:,:)
+  double precision, allocatable :: noL_2e_gpu(:,:,:,:)
+
+
+  PROVIDE mo_l_coef mo_r_coef
+  PROVIDE mos_l_in_r_array_transp mos_r_in_r_array_transp
+
+
+  allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,3))
+  print*, ' Reading int2_grad1_u12_ao from ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao'
+  open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="read")
+    read(11) int2_grad1_u12_ao
+  close(11)
+
+  allocate(tmp(mo_num,mo_num,n_points_final_grid,3))
+  allocate(int2_grad1_u12_bimo_t(n_points_final_grid,3,mo_num,mo_num))
+
+  !$OMP PARALLEL         &
+  !$OMP DEFAULT (NONE)   &
+  !$OMP PRIVATE (ipoint) & 
+  !$OMP SHARED (ao_num, mo_num, n_points_final_grid, int2_grad1_u12_ao, tmp)
+  !$OMP DO SCHEDULE (dynamic)
+  do ipoint = 1, n_points_final_grid
+    call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,1), ao_num, tmp(1,1,ipoint,1), mo_num)
+    call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,2), ao_num, tmp(1,1,ipoint,2), mo_num)
+    call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,3), ao_num, tmp(1,1,ipoint,3), mo_num)
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  deallocate(int2_grad1_u12_ao)
+
+  !$OMP PARALLEL               &
+  !$OMP DEFAULT (NONE)         &
+  !$OMP PRIVATE (i, j, ipoint) & 
+  !$OMP SHARED (mo_num, n_points_final_grid, tmp, int2_grad1_u12_bimo_t)
+  !$OMP DO COLLAPSE(2) SCHEDULE (dynamic)
+  do ipoint = 1, n_points_final_grid
+    do i = 1, mo_num
+      do j = 1, mo_num
+        int2_grad1_u12_bimo_t(ipoint,1,j,i) = tmp(j,i,ipoint,1)
+        int2_grad1_u12_bimo_t(ipoint,2,j,i) = tmp(j,i,ipoint,2)
+        int2_grad1_u12_bimo_t(ipoint,3,j,i) = tmp(j,i,ipoint,3)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  deallocate(tmp)
+
+  ! ---
+
+  allocate(noL_2e_gpu(mo_num,mo_num,mo_num,mo_num))
+  allocate(noL_1e_gpu(mo_num,mo_num))
+
+  call cutc_no(n_points_final_grid, mo_num, elec_alpha_num, elec_beta_num, &
+               final_weight_at_r_vector(1),                                &
+               mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), &
+               int2_grad1_u12_bimo_t(1,1,1,1), noL_2e_gpu(1,1,1,1), noL_1e_gpu(1,1), noL_0e_gpu(1))
+
+  ! ---
+
+  allocate(noL_2e(mo_num,mo_num,mo_num,mo_num))
+  allocate(noL_1e(mo_num,mo_num))
+
+  call provide_no_2e(n_points_final_grid, mo_num, elec_alpha_num, elec_beta_num, &
+                     final_weight_at_r_vector(1),                                &
+                     mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), &
+                     int2_grad1_u12_bimo_t(1,1,1,1), noL_2e(1,1,1,1))
+
+  call provide_no_1e(n_points_final_grid, mo_num, elec_alpha_num, elec_beta_num, &
+                     final_weight_at_r_vector(1),                                &
+                     mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), &
+                     int2_grad1_u12_bimo_t(1,1,1,1), noL_1e(1,1))
+
+  call provide_no_0e(n_points_final_grid, mo_num, elec_alpha_num, elec_beta_num, &
+                     final_weight_at_r_vector(1),                                &
+                     mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), &
+                     int2_grad1_u12_bimo_t(1,1,1,1), noL_0e)
+
+  ! ---
+
+  deallocate(int2_grad1_u12_bimo_t)
+
+  acc_thr = 1d-12
+
+  ! ---
+
+  err_tot = 0.d0
+  nrm_tot = 0.d0
+  do i = 1, mo_num
+    do j = 1, mo_num
+      do k = 1, mo_num
+        do l = 1, mo_num
+          err_loc = dabs(noL_2e(l,k,j,i) - noL_2e_gpu(l,k,j,i))
+          if(err_loc > acc_thr) then
+            print*, " error on", l, k, j, i
+            print*, " CPU res", noL_2e    (l,k,j,i)
+            print*, " GPU res", noL_2e_gpu(l,k,j,i)
+            stop
+          endif
+          err_tot = err_tot + err_loc
+          nrm_tot = nrm_tot + dabs(noL_2e(l,k,j,i))
+        enddo
+      enddo
+    enddo
+  enddo
+  print *, ' absolute accuracy on noL_2e (%) =', 100.d0 * err_tot / nrm_tot
+
+  deallocate(noL_2e)
+  deallocate(noL_2e_gpu)
+
+  ! ---
+
+  err_tot = 0.d0
+  nrm_tot = 0.d0
+  do k = 1, mo_num
+    do l = 1, mo_num
+      err_loc = dabs(noL_1e(l,k) - noL_1e_gpu(l,k))
+      if(err_loc > acc_thr) then
+        print*, " error on", l, k
+        print*, " CPU res", noL_1e    (l,k)
+        print*, " GPU res", noL_1e_gpu(l,k)
+        stop
+      endif
+      err_tot = err_tot + err_loc
+      nrm_tot = nrm_tot + dabs(noL_1e(l,k))
+    enddo
+  enddo
+  print *, ' absolute accuracy on noL_1e (%) =', 100.d0 * err_tot / nrm_tot
+
+  deallocate(noL_1e)
+  deallocate(noL_1e_gpu)
+
+  ! ---
+
+  print *, 'noL_0e CPU = ', noL_0e
+  print *, 'noL_0e GPU = ', noL_0e_gpu(1)
+
+  err_tot = dabs(noL_0e - noL_0e_gpu(1))
+  nrm_tot = dabs(noL_0e)
+  print *, ' absolute accuracy on noL_0e (%) =', 100.d0 * err_tot / nrm_tot
+
+
+  return
+
+end
+
+! ---
+
+
diff --git a/plugins/local/tc_int/deb_tc_int_cuda.irp.f b/plugins/local/tc_int/deb_tc_int_cuda.irp.f
index 8d0cc4f3..ad20d861 100644
--- a/plugins/local/tc_int/deb_tc_int_cuda.irp.f
+++ b/plugins/local/tc_int/deb_tc_int_cuda.irp.f
@@ -41,9 +41,13 @@ subroutine main()
   !call deb_no_2e_gpu_tmp()
   !call deb_no_2e_gpu()
 
-  call deb_no_1e_gpu_tmp()
+  !call deb_no_1e_gpu_tmp()
   !call deb_no_1e_gpu()
 
+  !call deb_no_0e_gpu()
+
+  call deb_no_gpu()
+
   return
 end
 
diff --git a/plugins/local/tc_int/no_0e.irp.f b/plugins/local/tc_int/no_0e.irp.f
index b945e0dd..830b91a8 100644
--- a/plugins/local/tc_int/no_0e.irp.f
+++ b/plugins/local/tc_int/no_0e.irp.f
@@ -3,12 +3,6 @@
 
 subroutine provide_no_0e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r, int2_grad1_u12, noL_0e)
 
-  BEGIN_DOC
-  ! 
-  ! < Phi_left | L | Phi_right >
-  !
-  END_DOC 
-
   implicit none
 
   integer,          intent(in)  :: n_grid, n_mo
@@ -22,44 +16,47 @@ subroutine provide_no_0e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r,
   integer                       :: i, j, k, ipoint
   double precision              :: t0, t1
   double precision, allocatable :: tmp(:)
-  double precision, allocatable :: tmp_L(:,:), tmp_R(:,:)
-  double precision, allocatable :: tmp_M(:,:), tmp_S(:), tmp_O(:), tmp_J(:,:)
-  double precision, allocatable :: tmp_M_priv(:,:), tmp_S_priv(:), tmp_O_priv(:), tmp_J_priv(:,:)
+  double precision, allocatable :: tmpL(:,:), tmpR(:,:)
+  double precision, allocatable :: tmpM(:,:), tmpS(:), tmpO(:), tmpJ(:,:)
+  double precision, allocatable :: tmpM_priv(:,:), tmpS_priv(:), tmpO_priv(:), tmpJ_priv(:,:)
+
+
+  call wall_time(t0)
 
 
   if(ne_a .eq. ne_b) then
 
     allocate(tmp(ne_b))
-    allocate(tmp_L(n_grid,3), tmp_R(n_grid,3))
+    allocate(tmpL(n_grid,3), tmpR(n_grid,3))
 
     !$OMP PARALLEL                            &
     !$OMP DEFAULT(NONE)                       &
-    !$OMP PRIVATE(j, i, ipoint, tmp_L, tmp_R) &
-    !$OMP SHARED(ne_b, n_grid,       & 
+    !$OMP PRIVATE(j, i, ipoint, tmpL, tmpR)   &
+    !$OMP SHARED(ne_b, n_grid,                & 
     !$OMP        mos_l_in_r, mos_r_in_r, wr1, &
     !$OMP        int2_grad1_u12, tmp)
 
     !$OMP DO
     do j = 1, ne_b
 
-      tmp_L = 0.d0
-      tmp_R = 0.d0
+      tmpL = 0.d0
+      tmpR = 0.d0
       do i = 1, ne_b
         do ipoint = 1, n_grid
 
-          tmp_L(ipoint,1) = tmp_L(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i)
-          tmp_L(ipoint,2) = tmp_L(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i)
-          tmp_L(ipoint,3) = tmp_L(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i)
+          tmpL(ipoint,1) = tmpL(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i)
+          tmpL(ipoint,2) = tmpL(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i)
+          tmpL(ipoint,3) = tmpL(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i)
 
-          tmp_R(ipoint,1) = tmp_R(ipoint,1) + int2_grad1_u12(ipoint,1,i,j) * mos_r_in_r(ipoint,i)
-          tmp_R(ipoint,2) = tmp_R(ipoint,2) + int2_grad1_u12(ipoint,2,i,j) * mos_r_in_r(ipoint,i)
-          tmp_R(ipoint,3) = tmp_R(ipoint,3) + int2_grad1_u12(ipoint,3,i,j) * mos_r_in_r(ipoint,i)
+          tmpR(ipoint,1) = tmpR(ipoint,1) + int2_grad1_u12(ipoint,1,i,j) * mos_r_in_r(ipoint,i)
+          tmpR(ipoint,2) = tmpR(ipoint,2) + int2_grad1_u12(ipoint,2,i,j) * mos_r_in_r(ipoint,i)
+          tmpR(ipoint,3) = tmpR(ipoint,3) + int2_grad1_u12(ipoint,3,i,j) * mos_r_in_r(ipoint,i)
         enddo
       enddo
 
       tmp(j) = 0.d0
       do ipoint = 1, n_grid
-        tmp(j) = tmp(j) + wr1(ipoint) * (tmp_L(ipoint,1)*tmp_R(ipoint,1) + tmp_L(ipoint,2)*tmp_R(ipoint,2) + tmp_L(ipoint,3)*tmp_R(ipoint,3))
+        tmp(j) = tmp(j) + wr1(ipoint) * (tmpL(ipoint,1)*tmpR(ipoint,1) + tmpL(ipoint,2)*tmpR(ipoint,2) + tmpL(ipoint,3)*tmpR(ipoint,3))
       enddo
     enddo ! j
     !$OMP END DO
@@ -68,150 +65,149 @@ subroutine provide_no_0e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r,
     noL_0e = -2.d0 * sum(tmp)
 
     deallocate(tmp)
-    deallocate(tmp_L, tmp_R)
+    deallocate(tmpL, tmpR)
 
     ! ---
 
-    allocate(tmp_O(n_grid), tmp_J(n_grid,3))
-    tmp_O = 0.d0
-    tmp_J = 0.d0
+    allocate(tmpO(n_grid), tmpJ(n_grid,3))
+    tmpO = 0.d0
+    tmpJ = 0.d0
 
-    !$OMP PARALLEL                                   &
-    !$OMP DEFAULT(NONE)                              &
-    !$OMP PRIVATE(i, ipoint, tmp_O_priv, tmp_J_priv) &
-    !$OMP SHARED(ne_b, n_grid,              & 
-    !$OMP        mos_l_in_r, mos_r_in_r,             &
-    !$OMP        int2_grad1_u12, tmp_O, tmp_J)
+    !$OMP PARALLEL                                 &
+    !$OMP DEFAULT(NONE)                            &
+    !$OMP PRIVATE(i, ipoint, tmpO_priv, tmpJ_priv) &
+    !$OMP SHARED(ne_b, n_grid,                     &
+    !$OMP        mos_l_in_r, mos_r_in_r,           &
+    !$OMP        int2_grad1_u12, tmpO, tmpJ)
 
-    allocate(tmp_O_priv(n_grid), tmp_J_priv(n_grid,3))
-    tmp_O_priv = 0.d0
-    tmp_J_priv = 0.d0
+    allocate(tmpO_priv(n_grid), tmpJ_priv(n_grid,3))
+    tmpO_priv = 0.d0
+    tmpJ_priv = 0.d0
   
     !$OMP DO 
     do i = 1, ne_b
       do ipoint = 1, n_grid
-        tmp_O_priv(ipoint)   = tmp_O_priv(ipoint)   + mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i)
-        tmp_J_priv(ipoint,1) = tmp_J_priv(ipoint,1) + int2_grad1_u12(ipoint,1,i,i)
-        tmp_J_priv(ipoint,2) = tmp_J_priv(ipoint,2) + int2_grad1_u12(ipoint,2,i,i)
-        tmp_J_priv(ipoint,3) = tmp_J_priv(ipoint,3) + int2_grad1_u12(ipoint,3,i,i)
+        tmpO_priv(ipoint)   = tmpO_priv(ipoint)   + mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i)
+        tmpJ_priv(ipoint,1) = tmpJ_priv(ipoint,1) + int2_grad1_u12(ipoint,1,i,i)
+        tmpJ_priv(ipoint,2) = tmpJ_priv(ipoint,2) + int2_grad1_u12(ipoint,2,i,i)
+        tmpJ_priv(ipoint,3) = tmpJ_priv(ipoint,3) + int2_grad1_u12(ipoint,3,i,i)
       enddo
     enddo
     !$OMP END DO NOWAIT
 
     !$OMP CRITICAL
-    tmp_O = tmp_O + tmp_O_priv
-    tmp_J = tmp_J + tmp_J_priv
+    tmpO = tmpO + tmpO_priv
+    tmpJ = tmpJ + tmpJ_priv
     !$OMP END CRITICAL
 
-    deallocate(tmp_O_priv, tmp_J_priv)
+    deallocate(tmpO_priv, tmpJ_priv)
     !$OMP END PARALLEL
 
-    allocate(tmp_M(n_grid,3), tmp_S(n_grid))
-    tmp_M = 0.d0
-    tmp_S = 0.d0
+    allocate(tmpM(n_grid,3), tmpS(n_grid))
+    tmpM = 0.d0
+    tmpS = 0.d0
 
-    !$OMP PARALLEL                                      &
-    !$OMP DEFAULT(NONE)                                 &
-    !$OMP PRIVATE(i, j, ipoint, tmp_M_priv, tmp_S_priv) &
-    !$OMP SHARED(ne_b, n_grid,                 & 
-    !$OMP        mos_l_in_r, mos_r_in_r,                &
-    !$OMP        int2_grad1_u12, tmp_M, tmp_S)
+    !$OMP PARALLEL                                    &
+    !$OMP DEFAULT(NONE)                               &
+    !$OMP PRIVATE(i, j, ipoint, tmpM_priv, tmpS_priv) &
+    !$OMP SHARED(ne_b, n_grid,                        &
+    !$OMP        mos_l_in_r, mos_r_in_r,              &
+    !$OMP        int2_grad1_u12, tmpM, tmpS)
 
-    allocate(tmp_M_priv(n_grid,3), tmp_S_priv(n_grid))
-    tmp_M_priv = 0.d0
-    tmp_S_priv = 0.d0
+    allocate(tmpM_priv(n_grid,3), tmpS_priv(n_grid))
+    tmpM_priv = 0.d0
+    tmpS_priv = 0.d0
   
     !$OMP DO COLLAPSE(2)
     do i = 1, ne_b
       do j = 1, ne_b
         do ipoint = 1, n_grid
 
-          tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
-          tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
-          tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+          tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+          tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+          tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
 
-          tmp_S_priv(ipoint)   = tmp_S_priv(ipoint)   + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) &
-                                                      + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) &
-                                                      + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i)
+          tmpS_priv(ipoint) = tmpS_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) &
+                                                + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) &
+                                                + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i)
         enddo
       enddo
     enddo
     !$OMP END DO NOWAIT
 
     !$OMP CRITICAL
-    tmp_M = tmp_M + tmp_M_priv
-    tmp_S = tmp_S + tmp_S_priv
+    tmpM = tmpM + tmpM_priv
+    tmpS = tmpS + tmpS_priv
     !$OMP END CRITICAL
 
-    deallocate(tmp_M_priv, tmp_S_priv)
+    deallocate(tmpM_priv, tmpS_priv)
     !$OMP END PARALLEL
 
     allocate(tmp(n_grid))
 
     do ipoint = 1, n_grid
 
-      tmp_S(ipoint) = 2.d0 * (tmp_J(ipoint,1)*tmp_J(ipoint,1) + tmp_J(ipoint,2)*tmp_J(ipoint,2) + tmp_J(ipoint,3)*tmp_J(ipoint,3)) - tmp_S(ipoint)
+      tmpS(ipoint) = 2.d0 * (tmpJ(ipoint,1)*tmpJ(ipoint,1) + tmpJ(ipoint,2)*tmpJ(ipoint,2) + tmpJ(ipoint,3)*tmpJ(ipoint,3)) - tmpS(ipoint)
 
-      tmp(ipoint) = wr1(ipoint) * ( tmp_O(ipoint) * tmp_S(ipoint)              &
-                                  - 2.d0 * ( tmp_J(ipoint,1) * tmp_M(ipoint,1) &
-                                           + tmp_J(ipoint,2) * tmp_M(ipoint,2) &
-                                           + tmp_J(ipoint,3) * tmp_M(ipoint,3)))
+      tmp(ipoint) = wr1(ipoint) * ( tmpO(ipoint) * tmpS(ipoint) - 2.d0 * ( tmpJ(ipoint,1) * tmpM(ipoint,1) &
+                                                                         + tmpJ(ipoint,2) * tmpM(ipoint,2) &
+                                                                         + tmpJ(ipoint,3) * tmpM(ipoint,3) ) )
     enddo
 
-    noL_0e = noL_0e -2.d0 * (sum(tmp))
+    noL_0e = noL_0e - 2.d0 * (sum(tmp))
 
     deallocate(tmp)
 
   else
 
     allocate(tmp(ne_a))
-    allocate(tmp_L(n_grid,3), tmp_R(n_grid,3))
+    allocate(tmpL(n_grid,3), tmpR(n_grid,3))
 
-    !$OMP PARALLEL                                      &
-    !$OMP DEFAULT(NONE)                                 &
-    !$OMP PRIVATE(j, i, ipoint, tmp_L, tmp_R)           &
-    !$OMP SHARED(ne_b, ne_a, n_grid, & 
-    !$OMP        mos_l_in_r, mos_r_in_r,                &
+    !$OMP PARALLEL                          &
+    !$OMP DEFAULT(NONE)                     &
+    !$OMP PRIVATE(j, i, ipoint, tmpL, tmpR) &
+    !$OMP SHARED(ne_b, ne_a, n_grid,        &
+    !$OMP        mos_l_in_r, mos_r_in_r,    &
     !$OMP        int2_grad1_u12, tmp, wr1)
 
     !$OMP DO
     do j = 1, ne_b
 
-      tmp_L = 0.d0
-      tmp_R = 0.d0
+      tmpL = 0.d0
+      tmpR = 0.d0
       do i = ne_b+1, ne_a
         do ipoint = 1, n_grid
 
-          tmp_L(ipoint,1) = tmp_L(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i)
-          tmp_L(ipoint,2) = tmp_L(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i)
-          tmp_L(ipoint,3) = tmp_L(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i)
+          tmpL(ipoint,1) = tmpL(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i)
+          tmpL(ipoint,2) = tmpL(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i)
+          tmpL(ipoint,3) = tmpL(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i)
 
-          tmp_R(ipoint,1) = tmp_R(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * mos_r_in_r(ipoint,i)
-          tmp_R(ipoint,2) = tmp_R(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * mos_r_in_r(ipoint,i)
-          tmp_R(ipoint,3) = tmp_R(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * mos_r_in_r(ipoint,i)
+          tmpR(ipoint,1) = tmpR(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * mos_r_in_r(ipoint,i)
+          tmpR(ipoint,2) = tmpR(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * mos_r_in_r(ipoint,i)
+          tmpR(ipoint,3) = tmpR(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * mos_r_in_r(ipoint,i)
         enddo
       enddo
 
       tmp(j) = 0.d0
       do ipoint = 1, n_grid
-        tmp(j) = tmp(j) + wr1(ipoint) * (tmp_L(ipoint,1)*tmp_R(ipoint,1) + tmp_L(ipoint,2)*tmp_R(ipoint,2) + tmp_L(ipoint,3)*tmp_R(ipoint,3))
+        tmp(j) = tmp(j) + wr1(ipoint) * (tmpL(ipoint,1)*tmpR(ipoint,1) + tmpL(ipoint,2)*tmpR(ipoint,2) + tmpL(ipoint,3)*tmpR(ipoint,3))
       enddo
 
       do i = 1, ne_b
         do ipoint = 1, n_grid
 
-          tmp_L(ipoint,1) = tmp_L(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i)
-          tmp_L(ipoint,2) = tmp_L(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i)
-          tmp_L(ipoint,3) = tmp_L(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i)
+          tmpL(ipoint,1) = tmpL(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i)
+          tmpL(ipoint,2) = tmpL(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i)
+          tmpL(ipoint,3) = tmpL(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i)
 
-          tmp_R(ipoint,1) = tmp_R(ipoint,1) + int2_grad1_u12(ipoint,1,i,j) * mos_r_in_r(ipoint,i)
-          tmp_R(ipoint,2) = tmp_R(ipoint,2) + int2_grad1_u12(ipoint,2,i,j) * mos_r_in_r(ipoint,i)
-          tmp_R(ipoint,3) = tmp_R(ipoint,3) + int2_grad1_u12(ipoint,3,i,j) * mos_r_in_r(ipoint,i)
+          tmpR(ipoint,1) = tmpR(ipoint,1) + int2_grad1_u12(ipoint,1,i,j) * mos_r_in_r(ipoint,i)
+          tmpR(ipoint,2) = tmpR(ipoint,2) + int2_grad1_u12(ipoint,2,i,j) * mos_r_in_r(ipoint,i)
+          tmpR(ipoint,3) = tmpR(ipoint,3) + int2_grad1_u12(ipoint,3,i,j) * mos_r_in_r(ipoint,i)
         enddo
       enddo
 
       do ipoint = 1, n_grid
-        tmp(j) = tmp(j) + wr1(ipoint) * (tmp_L(ipoint,1)*tmp_R(ipoint,1) + tmp_L(ipoint,2)*tmp_R(ipoint,2) + tmp_L(ipoint,3)*tmp_R(ipoint,3))
+        tmp(j) = tmp(j) + wr1(ipoint) * (tmpL(ipoint,1)*tmpR(ipoint,1) + tmpL(ipoint,2)*tmpR(ipoint,2) + tmpL(ipoint,3)*tmpR(ipoint,3))
       enddo
     enddo ! j
     !$OMP END DO
@@ -219,33 +215,33 @@ subroutine provide_no_0e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r,
 
     ! ---
 
-    !$OMP PARALLEL                                      &
-    !$OMP DEFAULT(NONE)                                 &
-    !$OMP PRIVATE(j, i, ipoint, tmp_L, tmp_R)           &
-    !$OMP SHARED(ne_b, ne_a, n_grid, & 
-    !$OMP        mos_l_in_r, mos_r_in_r,                &
+    !$OMP PARALLEL                          &
+    !$OMP DEFAULT(NONE)                     &
+    !$OMP PRIVATE(j, i, ipoint, tmpL, tmpR) &
+    !$OMP SHARED(ne_b, ne_a, n_grid,        &
+    !$OMP        mos_l_in_r, mos_r_in_r,    &
     !$OMP        int2_grad1_u12, tmp, wr1)
 
     !$OMP DO
     do j = ne_b+1, ne_a
 
-      tmp_L = 0.d0
-      tmp_R = 0.d0
+      tmpL = 0.d0
+      tmpR = 0.d0
       do i = 1, ne_a
         do ipoint = 1, n_grid
-          tmp_L(ipoint,1) = tmp_L(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i)
-          tmp_L(ipoint,2) = tmp_L(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i)
-          tmp_L(ipoint,3) = tmp_L(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i)
+          tmpL(ipoint,1) = tmpL(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i)
+          tmpL(ipoint,2) = tmpL(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i)
+          tmpL(ipoint,3) = tmpL(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i)
 
-          tmp_R(ipoint,1) = tmp_R(ipoint,1) + int2_grad1_u12(ipoint,1,i,j) * mos_r_in_r(ipoint,i)
-          tmp_R(ipoint,2) = tmp_R(ipoint,2) + int2_grad1_u12(ipoint,2,i,j) * mos_r_in_r(ipoint,i)
-          tmp_R(ipoint,3) = tmp_R(ipoint,3) + int2_grad1_u12(ipoint,3,i,j) * mos_r_in_r(ipoint,i)
+          tmpR(ipoint,1) = tmpR(ipoint,1) + int2_grad1_u12(ipoint,1,i,j) * mos_r_in_r(ipoint,i)
+          tmpR(ipoint,2) = tmpR(ipoint,2) + int2_grad1_u12(ipoint,2,i,j) * mos_r_in_r(ipoint,i)
+          tmpR(ipoint,3) = tmpR(ipoint,3) + int2_grad1_u12(ipoint,3,i,j) * mos_r_in_r(ipoint,i)
         enddo
       enddo
 
       tmp(j) = 0.d0
       do ipoint = 1, n_grid
-        tmp(j) = tmp(j) + 0.5d0 * wr1(ipoint) * (tmp_L(ipoint,1)*tmp_R(ipoint,1) + tmp_L(ipoint,2)*tmp_R(ipoint,2) + tmp_L(ipoint,3)*tmp_R(ipoint,3))
+        tmp(j) = tmp(j) + 0.5d0 * wr1(ipoint) * (tmpL(ipoint,1)*tmpR(ipoint,1) + tmpL(ipoint,2)*tmpR(ipoint,2) + tmpL(ipoint,3)*tmpR(ipoint,3))
       enddo
     enddo ! j
     !$OMP END DO
@@ -254,32 +250,32 @@ subroutine provide_no_0e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r,
     noL_0e = -2.d0 * sum(tmp)
 
     deallocate(tmp)
-    deallocate(tmp_L, tmp_R)
+    deallocate(tmpL, tmpR)
 
     ! ---
 
-    allocate(tmp_O(n_grid), tmp_J(n_grid,3))
-    tmp_O = 0.d0
-    tmp_J = 0.d0
+    allocate(tmpO(n_grid), tmpJ(n_grid,3))
+    tmpO = 0.d0
+    tmpJ = 0.d0
 
-    !$OMP PARALLEL                                      &
-    !$OMP DEFAULT(NONE)                                 &
-    !$OMP PRIVATE(i, ipoint, tmp_O_priv, tmp_J_priv)    &
-    !$OMP SHARED(ne_b, ne_a, n_grid, & 
-    !$OMP        mos_l_in_r, mos_r_in_r,                &
-    !$OMP        int2_grad1_u12, tmp_O, tmp_J)
+    !$OMP PARALLEL                                 &
+    !$OMP DEFAULT(NONE)                            &
+    !$OMP PRIVATE(i, ipoint, tmpO_priv, tmpJ_priv) &
+    !$OMP SHARED(ne_b, ne_a, n_grid,               &
+    !$OMP        mos_l_in_r, mos_r_in_r,           &
+    !$OMP        int2_grad1_u12, tmpO, tmpJ)
 
-    allocate(tmp_O_priv(n_grid), tmp_J_priv(n_grid,3))
-    tmp_O_priv = 0.d0
-    tmp_J_priv = 0.d0
+    allocate(tmpO_priv(n_grid), tmpJ_priv(n_grid,3))
+    tmpO_priv = 0.d0
+    tmpJ_priv = 0.d0
   
     !$OMP DO 
     do i = 1, ne_b
       do ipoint = 1, n_grid
-        tmp_O_priv(ipoint)   = tmp_O_priv(ipoint)   + mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i)
-        tmp_J_priv(ipoint,1) = tmp_J_priv(ipoint,1) + int2_grad1_u12(ipoint,1,i,i)
-        tmp_J_priv(ipoint,2) = tmp_J_priv(ipoint,2) + int2_grad1_u12(ipoint,2,i,i)
-        tmp_J_priv(ipoint,3) = tmp_J_priv(ipoint,3) + int2_grad1_u12(ipoint,3,i,i)
+        tmpO_priv(ipoint)   = tmpO_priv(ipoint)   + mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i)
+        tmpJ_priv(ipoint,1) = tmpJ_priv(ipoint,1) + int2_grad1_u12(ipoint,1,i,i)
+        tmpJ_priv(ipoint,2) = tmpJ_priv(ipoint,2) + int2_grad1_u12(ipoint,2,i,i)
+        tmpJ_priv(ipoint,3) = tmpJ_priv(ipoint,3) + int2_grad1_u12(ipoint,3,i,i)
       enddo
     enddo
     !$OMP END DO NOWAIT
@@ -287,49 +283,49 @@ subroutine provide_no_0e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r,
     !$OMP DO 
     do i = ne_b+1, ne_a
       do ipoint = 1, n_grid
-        tmp_O_priv(ipoint)   = tmp_O_priv(ipoint)   + 0.5d0 * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i)
-        tmp_J_priv(ipoint,1) = tmp_J_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,i)
-        tmp_J_priv(ipoint,2) = tmp_J_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,i)
-        tmp_J_priv(ipoint,3) = tmp_J_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,i)
+        tmpO_priv(ipoint)   = tmpO_priv(ipoint)   + 0.5d0 * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i)
+        tmpJ_priv(ipoint,1) = tmpJ_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,i)
+        tmpJ_priv(ipoint,2) = tmpJ_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,i)
+        tmpJ_priv(ipoint,3) = tmpJ_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,i)
       enddo
     enddo
     !$OMP END DO NOWAIT
 
     !$OMP CRITICAL
-    tmp_O = tmp_O + tmp_O_priv
-    tmp_J = tmp_J + tmp_J_priv
+    tmpO = tmpO + tmpO_priv
+    tmpJ = tmpJ + tmpJ_priv
     !$OMP END CRITICAL
 
-    deallocate(tmp_O_priv, tmp_J_priv)
+    deallocate(tmpO_priv, tmpJ_priv)
     !$OMP END PARALLEL
 
     ! ---
 
-    allocate(tmp_M(n_grid,3), tmp_S(n_grid))
-    tmp_M = 0.d0
-    tmp_S = 0.d0
+    allocate(tmpM(n_grid,3), tmpS(n_grid))
+    tmpM = 0.d0
+    tmpS = 0.d0
 
-    !$OMP PARALLEL                                      &
-    !$OMP DEFAULT(NONE)                                 &
-    !$OMP PRIVATE(i, j, ipoint, tmp_M_priv, tmp_S_priv) &
-    !$OMP SHARED(ne_b, ne_a, n_grid, & 
-    !$OMP        mos_l_in_r, mos_r_in_r,                &
-    !$OMP        int2_grad1_u12, tmp_M, tmp_S)
+    !$OMP PARALLEL                                    &
+    !$OMP DEFAULT(NONE)                               &
+    !$OMP PRIVATE(i, j, ipoint, tmpM_priv, tmpS_priv) &
+    !$OMP SHARED(ne_b, ne_a, n_grid,                  &
+    !$OMP        mos_l_in_r, mos_r_in_r,              &
+    !$OMP        int2_grad1_u12, tmpM, tmpS)
 
-    allocate(tmp_M_priv(n_grid,3), tmp_S_priv(n_grid))
-    tmp_M_priv = 0.d0
-    tmp_S_priv = 0.d0
+    allocate(tmpM_priv(n_grid,3), tmpS_priv(n_grid))
+    tmpM_priv = 0.d0
+    tmpS_priv = 0.d0
   
     !$OMP DO COLLAPSE(2)
     do i = 1, ne_b
       do j = 1, ne_b
         do ipoint = 1, n_grid
 
-          tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
-          tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
-          tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+          tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+          tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+          tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
 
-          tmp_S_priv(ipoint)   = tmp_S_priv(ipoint)   + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) &
+          tmpS_priv(ipoint)   = tmpS_priv(ipoint)   + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) &
                                                       + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) &
                                                       + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i)
         enddo
@@ -342,17 +338,17 @@ subroutine provide_no_0e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r,
       do j = 1, ne_b
         do ipoint = 1, n_grid
 
-          tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
-          tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
-          tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+          tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+          tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+          tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
 
-          tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i)
-          tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i)
-          tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i)
+          tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i)
+          tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i)
+          tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i)
 
-          tmp_S_priv(ipoint)   = tmp_S_priv(ipoint)   + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) &
-                                                      + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) &
-                                                      + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i)
+          tmpS_priv(ipoint) = tmpS_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) &
+                                                + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) &
+                                                + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i)
         enddo
       enddo
     enddo
@@ -363,39 +359,38 @@ subroutine provide_no_0e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r,
       do j = ne_b+1, ne_a
         do ipoint = 1, n_grid
 
-          tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
-          tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
-          tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+          tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+          tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+          tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
 
-          tmp_S_priv(ipoint)   = tmp_S_priv(ipoint)   + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) &
-                                                      + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) &
-                                                      + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i)
+          tmpS_priv(ipoint) = tmpS_priv(ipoint) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) &
+                                                + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) &
+                                                + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i)
         enddo
       enddo
     enddo
     !$OMP END DO NOWAIT
 
     !$OMP CRITICAL
-    tmp_M = tmp_M + tmp_M_priv
-    tmp_S = tmp_S + tmp_S_priv
+    tmpM = tmpM + tmpM_priv
+    tmpS = tmpS + tmpS_priv
     !$OMP END CRITICAL
 
-    deallocate(tmp_M_priv, tmp_S_priv)
+    deallocate(tmpM_priv, tmpS_priv)
     !$OMP END PARALLEL
 
     allocate(tmp(n_grid))
 
     do ipoint = 1, n_grid
 
-      tmp_S(ipoint) = 2.d0 * (tmp_J(ipoint,1)*tmp_J(ipoint,1) + tmp_J(ipoint,2)*tmp_J(ipoint,2) + tmp_J(ipoint,3)*tmp_J(ipoint,3)) - tmp_S(ipoint)
+      tmpS(ipoint) = 2.d0 * (tmpJ(ipoint,1)*tmpJ(ipoint,1) + tmpJ(ipoint,2)*tmpJ(ipoint,2) + tmpJ(ipoint,3)*tmpJ(ipoint,3)) - tmpS(ipoint)
 
-      tmp(ipoint) = wr1(ipoint) * ( tmp_O(ipoint) * tmp_S(ipoint)              &
-                                                       - 2.d0 * ( tmp_J(ipoint,1) * tmp_M(ipoint,1) &
-                                                                + tmp_J(ipoint,2) * tmp_M(ipoint,2) &
-                                                                + tmp_J(ipoint,3) * tmp_M(ipoint,3)))
+      tmp(ipoint) = wr1(ipoint) * ( tmpO(ipoint) * tmpS(ipoint) - 2.d0 * ( tmpJ(ipoint,1) * tmpM(ipoint,1) &
+                                                                         + tmpJ(ipoint,2) * tmpM(ipoint,2) &
+                                                                         + tmpJ(ipoint,3) * tmpM(ipoint,3) ) )
     enddo
 
-    noL_0e = noL_0e -2.d0 * (sum(tmp))
+    noL_0e = noL_0e - 2.d0 * (sum(tmp))
 
     deallocate(tmp)
 
diff --git a/plugins/local/tc_int/write_tc_int_cuda.irp.f b/plugins/local/tc_int/write_tc_int_cuda.irp.f
index 2a3dc4d1..756630b8 100644
--- a/plugins/local/tc_int/write_tc_int_cuda.irp.f
+++ b/plugins/local/tc_int/write_tc_int_cuda.irp.f
@@ -120,12 +120,12 @@ subroutine do_work_on_gpu()
   call wall_time(cuda_time0)
   print*, ' start CUDA kernel'
 
-  call cutc_int_c(nxBlocks, nyBlocks, nzBlocks, blockxSize, blockySize, blockzSize,           &
-                  n_points_final_grid, n_points_extra_final_grid, ao_num, nucl_num, jBH_size, &
-                  final_grid_points, final_weight_at_r_vector,                                &
-                  final_grid_points_extra, final_weight_at_r_vector_extra,                    &
-                  rn, aos_data1, aos_data2, jBH_c, jBH_m, jBH_n, jBH_o,                       &
-                  int2_grad1_u12_ao, int_2e_ao)
+  call cutc_int(nxBlocks, nyBlocks, nzBlocks, blockxSize, blockySize, blockzSize,           &
+                n_points_final_grid, n_points_extra_final_grid, ao_num, nucl_num, jBH_size, &
+                final_grid_points, final_weight_at_r_vector,                                &
+                final_grid_points_extra, final_weight_at_r_vector_extra,                    &
+                rn, aos_data1, aos_data2, jBH_c, jBH_m, jBH_n, jBH_o,                       &
+                int2_grad1_u12_ao, int_2e_ao)
 
   call wall_time(cuda_time1)
   print*, ' wall time for CUDA kernel (min) = ', (cuda_time1-cuda_time0) / 60.d0