From 4e9143133ac19cbfbc8dbda1f358331da6ef5277 Mon Sep 17 00:00:00 2001
From: AbdAmmar <abd.ammar.phys@gmail.com>
Date: Sat, 10 Aug 2024 18:13:23 +0200
Subject: [PATCH] added 1e-noL with CUDA

---
 plugins/local/tc_int/cutc_module.F90       |   51 +
 plugins/local/tc_int/deb_no_1e_gpu.irp.f   |  499 +++++++++
 plugins/local/tc_int/deb_no_2e_gpu.irp.f   |    1 -
 plugins/local/tc_int/deb_tc_int_cuda.irp.f |    6 +-
 plugins/local/tc_int/no_1e.irp.f           | 1101 +++++++++++++++-----
 5 files changed, 1394 insertions(+), 264 deletions(-)
 create mode 100644 plugins/local/tc_int/deb_no_1e_gpu.irp.f

diff --git a/plugins/local/tc_int/cutc_module.F90 b/plugins/local/tc_int/cutc_module.F90
index d7b922cd..d1466697 100644
--- a/plugins/local/tc_int/cutc_module.F90
+++ b/plugins/local/tc_int/cutc_module.F90
@@ -119,6 +119,57 @@ module cutc_module
 
     ! ---
 
+    subroutine cutc_no_1e(n_grid1, n_mo, ne_a, ne_b,                   &
+                          wr1, mos_l_in_r, mos_r_in_r, int2_grad1_u12, &
+                          no_1e) bind(C, name = "cutc_no_1e")
+
+      import c_int, c_double, c_ptr
+
+      integer(c_int), intent(in), value :: n_grid1
+      integer(c_int), intent(in), value :: n_mo
+      integer(c_int), intent(in), value :: ne_a
+      integer(c_int), intent(in), value :: ne_b
+      real(c_double), intent(in)        :: wr1(n_grid1)
+      real(c_double), intent(in)        :: mos_l_in_r(n_grid1,n_mo)
+      real(c_double), intent(in)        :: mos_r_in_r(n_grid1,n_mo)
+      real(c_double), intent(in)        :: int2_grad1_u12(n_grid1,3,n_mo,n_mo)
+      real(c_double), intent(out)       :: no_1e(n_mo,n_mo)
+
+    end subroutine cutc_no_1e
+
+    ! ---
+
+    subroutine deb_no_1e(n_grid1, n_mo, ne_a, ne_b,                                  &
+                         wr1, mos_l_in_r, mos_r_in_r, int2_grad1_u12,                &
+                         tmpO, tmpJ, tmpM, tmpS, tmpC, tmpD, tmpL, tmpR, tmpE, tmpF, &
+                         no_1e) bind(C, name = "deb_no_1e")
+
+      import c_int, c_double, c_ptr
+
+      integer(c_int), intent(in), value :: n_grid1
+      integer(c_int), intent(in), value :: n_mo
+      integer(c_int), intent(in), value :: ne_a
+      integer(c_int), intent(in), value :: ne_b
+      real(c_double), intent(in)        :: wr1(n_grid1)
+      real(c_double), intent(in)        :: mos_l_in_r(n_grid1,n_mo)
+      real(c_double), intent(in)        :: mos_r_in_r(n_grid1,n_mo)
+      real(c_double), intent(in)        :: int2_grad1_u12(n_grid1,3,n_mo,n_mo)
+      real(c_double), intent(out)       :: tmpO(n_grid1)
+      real(c_double), intent(out)       :: tmpJ(n_grid1,3)
+      real(c_double), intent(out)       :: tmpM(n_grid1,3)
+      real(c_double), intent(out)       :: tmpS(n_grid1)
+      real(c_double), intent(out)       :: tmpC(n_grid1,4,n_mo,n_mo)
+      real(c_double), intent(out)       :: tmpD(n_grid1,4)
+      real(c_double), intent(out)       :: tmpL(n_grid1,3,n_mo)
+      real(c_double), intent(out)       :: tmpR(n_grid1,3,n_mo)
+      real(c_double), intent(out)       :: tmpE(n_grid1,5,n_mo)
+      real(c_double), intent(out)       :: tmpF(n_grid1,5,n_mo)
+      real(c_double), intent(out)       :: no_1e(n_mo,n_mo)
+
+    end subroutine deb_no_1e
+
+    ! ---
+
   end interface
 
 end module cutc_module
diff --git a/plugins/local/tc_int/deb_no_1e_gpu.irp.f b/plugins/local/tc_int/deb_no_1e_gpu.irp.f
new file mode 100644
index 00000000..1efbb913
--- /dev/null
+++ b/plugins/local/tc_int/deb_no_1e_gpu.irp.f
@@ -0,0 +1,499 @@
+
+! ---
+
+subroutine deb_no_1e_gpu()
+
+  use cutc_module
+
+  implicit none
+
+  integer                       :: i, j, k, l, ipoint
+  double precision              :: acc_thr, err_tot, nrm_tot, err_loc
+  double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:)
+  double precision, allocatable :: tmp(:,:,:,:)
+  double precision, allocatable :: int2_grad1_u12_bimo_t(:,:,:,:)
+  double precision, allocatable :: noL_1e(:,:)
+  double precision, allocatable :: noL_1e_gpu(:,:)
+
+
+  PROVIDE mo_l_coef mo_r_coef
+  PROVIDE mos_l_in_r_array_transp mos_r_in_r_array_transp
+
+
+  allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,3))
+  print*, ' Reading int2_grad1_u12_ao from ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao'
+  open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="read")
+    read(11) int2_grad1_u12_ao
+  close(11)
+
+  allocate(tmp(mo_num,mo_num,n_points_final_grid,3))
+  allocate(int2_grad1_u12_bimo_t(n_points_final_grid,3,mo_num,mo_num))
+
+  !$OMP PARALLEL         &
+  !$OMP DEFAULT (NONE)   &
+  !$OMP PRIVATE (ipoint) & 
+  !$OMP SHARED (ao_num, mo_num, n_points_final_grid, int2_grad1_u12_ao, tmp)
+  !$OMP DO SCHEDULE (dynamic)
+  do ipoint = 1, n_points_final_grid
+    call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,1), ao_num, tmp(1,1,ipoint,1), mo_num)
+    call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,2), ao_num, tmp(1,1,ipoint,2), mo_num)
+    call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,3), ao_num, tmp(1,1,ipoint,3), mo_num)
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  deallocate(int2_grad1_u12_ao)
+
+  !$OMP PARALLEL               &
+  !$OMP DEFAULT (NONE)         &
+  !$OMP PRIVATE (i, j, ipoint) & 
+  !$OMP SHARED (mo_num, n_points_final_grid, tmp, int2_grad1_u12_bimo_t)
+  !$OMP DO COLLAPSE(2) SCHEDULE (dynamic)
+  do ipoint = 1, n_points_final_grid
+    do i = 1, mo_num
+      do j = 1, mo_num
+        int2_grad1_u12_bimo_t(ipoint,1,j,i) = tmp(j,i,ipoint,1)
+        int2_grad1_u12_bimo_t(ipoint,2,j,i) = tmp(j,i,ipoint,2)
+        int2_grad1_u12_bimo_t(ipoint,3,j,i) = tmp(j,i,ipoint,3)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  deallocate(tmp)
+
+  ! ---
+
+  allocate(noL_1e_gpu(mo_num,mo_num))
+
+  call cutc_no_1e(n_points_final_grid, mo_num, elec_alpha_num, elec_beta_num, &
+                  final_weight_at_r_vector(1),                                &
+                  mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), &
+                  int2_grad1_u12_bimo_t(1,1,1,1), noL_1e_gpu(1,1))
+
+  ! ---
+
+  allocate(noL_1e(mo_num,mo_num))
+
+  call provide_no_1e(n_points_final_grid, mo_num, elec_alpha_num, elec_beta_num, &
+                     final_weight_at_r_vector(1),                                &
+                     mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), &
+                     int2_grad1_u12_bimo_t(1,1,1,1), noL_1e(1,1))
+
+  ! ---
+
+  deallocate(int2_grad1_u12_bimo_t)
+
+  acc_thr = 1d-12
+
+  err_tot = 0.d0
+  nrm_tot = 0.d0
+  do k = 1, mo_num
+    do l = 1, mo_num
+      err_loc = dabs(noL_1e(l,k) - noL_1e_gpu(l,k))
+      if(err_loc > acc_thr) then
+        print*, " error on", l, k
+        print*, " CPU res", noL_1e    (l,k)
+        print*, " GPU res", noL_1e_gpu(l,k)
+        stop
+      endif
+      err_tot = err_tot + err_loc
+      nrm_tot = nrm_tot + dabs(noL_1e(l,k))
+    enddo
+  enddo
+  print *, ' absolute accuracy on noL_1e (%) =', 100.d0 * err_tot / nrm_tot
+
+  deallocate(noL_1e)
+  deallocate(noL_1e_gpu)
+
+
+  return
+
+end
+
+! ---
+
+subroutine deb_no_1e_gpu_tmp()
+
+  use cutc_module
+
+  implicit none
+
+  integer                       :: i, j, k, l, m, ipoint
+  double precision              :: acc_thr, err_tot, nrm_tot, err_loc
+  double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:)
+  double precision, allocatable :: tmp(:,:,:,:)
+  double precision, allocatable :: int2_grad1_u12_bimo_t(:,:,:,:)
+  double precision, allocatable :: tmpO(:), tmpO_gpu(:)
+  double precision, allocatable :: tmpJ(:,:), tmpJ_gpu(:,:)
+  double precision, allocatable :: tmpM(:,:), tmpM_gpu(:,:)
+  double precision, allocatable :: tmpS(:), tmpS_gpu(:)
+  double precision, allocatable :: tmpC(:,:,:,:), tmpC_gpu(:,:,:,:)
+  double precision, allocatable :: tmpD(:,:), tmpD_gpu(:,:)
+  double precision, allocatable :: tmpL(:,:,:), tmpL_gpu(:,:,:)
+  double precision, allocatable :: tmpR(:,:,:), tmpR_gpu(:,:,:)
+  double precision, allocatable :: tmpE(:,:,:), tmpE_gpu(:,:,:)
+  double precision, allocatable :: tmpF(:,:,:), tmpF_gpu(:,:,:)
+  double precision, allocatable :: noL_1e(:,:), noL_1e_gpu(:,:)
+
+
+  ! ---
+
+
+  PROVIDE mo_l_coef mo_r_coef
+  PROVIDE mos_l_in_r_array_transp mos_r_in_r_array_transp
+
+
+  allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,3))
+  print*, ' Reading int2_grad1_u12_ao from ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao'
+  open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="read")
+    read(11) int2_grad1_u12_ao
+  close(11)
+
+  allocate(tmp(mo_num,mo_num,n_points_final_grid,3))
+  allocate(int2_grad1_u12_bimo_t(n_points_final_grid,3,mo_num,mo_num))
+
+  !$OMP PARALLEL         &
+  !$OMP DEFAULT (NONE)   &
+  !$OMP PRIVATE (ipoint) & 
+  !$OMP SHARED (ao_num, mo_num, n_points_final_grid, int2_grad1_u12_ao, tmp)
+  !$OMP DO SCHEDULE (dynamic)
+  do ipoint = 1, n_points_final_grid
+    call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,1), ao_num, tmp(1,1,ipoint,1), mo_num)
+    call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,2), ao_num, tmp(1,1,ipoint,2), mo_num)
+    call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,3), ao_num, tmp(1,1,ipoint,3), mo_num)
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  deallocate(int2_grad1_u12_ao)
+
+  !$OMP PARALLEL               &
+  !$OMP DEFAULT (NONE)         &
+  !$OMP PRIVATE (i, j, ipoint) & 
+  !$OMP SHARED (mo_num, n_points_final_grid, tmp, int2_grad1_u12_bimo_t)
+  !$OMP DO COLLAPSE(2) SCHEDULE (dynamic)
+  do ipoint = 1, n_points_final_grid
+    do i = 1, mo_num
+      do j = 1, mo_num
+        int2_grad1_u12_bimo_t(ipoint,1,j,i) = tmp(j,i,ipoint,1)
+        int2_grad1_u12_bimo_t(ipoint,2,j,i) = tmp(j,i,ipoint,2)
+        int2_grad1_u12_bimo_t(ipoint,3,j,i) = tmp(j,i,ipoint,3)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  deallocate(tmp)
+
+  ! ---
+
+  allocate(tmpO_gpu(n_points_final_grid))
+  allocate(tmpJ_gpu(n_points_final_grid,3))
+  allocate(tmpM_gpu(n_points_final_grid,3))
+  allocate(tmpS_gpu(n_points_final_grid))
+  allocate(tmpC_gpu(n_points_final_grid,4,mo_num,mo_num))
+  allocate(tmpD_gpu(n_points_final_grid,4))
+  allocate(tmpL_gpu(n_points_final_grid,3,mo_num))
+  allocate(tmpR_gpu(n_points_final_grid,3,mo_num))
+  allocate(tmpE_gpu(n_points_final_grid,5,mo_num))
+  allocate(tmpF_gpu(n_points_final_grid,5,mo_num))
+  allocate(noL_1e_gpu(mo_num,mo_num))
+
+  call deb_no_1e(n_points_final_grid, mo_num, elec_alpha_num, elec_beta_num,                                 &
+                 final_weight_at_r_vector(1),                                                                &
+                 mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), int2_grad1_u12_bimo_t(1,1,1,1), &
+                 tmpO_gpu(1), tmpJ_gpu(1,1), tmpM_gpu(1,1), tmpS_gpu(1), tmpC_gpu(1,1,1,1), tmpD_gpu(1,1),   &
+                 tmpL_gpu(1,1,1), tmpR_gpu(1,1,1), tmpE_gpu(1,1,1), tmpF_gpu(1,1,1), noL_1e_gpu(1,1))
+
+  ! ---
+
+  allocate(tmpO(n_points_final_grid))
+  allocate(tmpJ(n_points_final_grid,3))
+  allocate(tmpM(n_points_final_grid,3))
+  allocate(tmpS(n_points_final_grid))
+  allocate(tmpC(n_points_final_grid,4,mo_num,mo_num))
+  allocate(tmpD(n_points_final_grid,4))
+  allocate(tmpL(n_points_final_grid,3,mo_num))
+  allocate(tmpR(n_points_final_grid,3,mo_num))
+  allocate(tmpE(n_points_final_grid,5,mo_num))
+  allocate(tmpF(n_points_final_grid,5,mo_num))
+  allocate(noL_1e(mo_num,mo_num))
+
+  call provide_no_1e_tmp(n_points_final_grid, mo_num, elec_alpha_num, elec_beta_num,                                 &
+                         final_weight_at_r_vector(1),                                                                &
+                         mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), int2_grad1_u12_bimo_t(1,1,1,1), &
+                         tmpO(1), tmpJ(1,1), tmpM(1,1), tmpS(1), tmpC(1,1,1,1), tmpD(1,1), tmpL(1,1,1), tmpR(1,1,1), &
+                         tmpE(1,1,1), tmpF(1,1,1), noL_1e(1,1))
+
+  ! ---
+
+  deallocate(int2_grad1_u12_bimo_t)
+
+
+  acc_thr = 1d-12
+
+  ! ---
+
+  ! tmpO(n_points_final_grid))
+  err_tot = 0.d0
+  nrm_tot = 0.d0
+  do ipoint = 1, n_points_final_grid
+    err_loc = dabs(tmpO(ipoint) - tmpO_gpu(ipoint))
+    if(err_loc > acc_thr) then
+      print*, " error on", ipoint
+      print*, " CPU res", tmpO    (ipoint)
+      print*, " GPU res", tmpO_gpu(ipoint)
+      stop
+    endif
+    err_tot = err_tot + err_loc
+    nrm_tot = nrm_tot + dabs(tmpO(ipoint))
+  enddo
+  print *, ' absolute accuracy on tmpO (%) =', 100.d0 * err_tot / nrm_tot
+
+  ! ---
+
+  ! tmpJ(n_points_final_grid,3))
+  err_tot = 0.d0
+  nrm_tot = 0.d0
+  do m = 1, 3
+    do ipoint = 1, n_points_final_grid
+      err_loc = dabs(tmpJ(ipoint,m) - tmpJ_gpu(ipoint,m))
+      if(err_loc > acc_thr) then
+        print*, " error on", ipoint, m
+        print*, " CPU res", tmpJ    (ipoint,m)
+        print*, " GPU res", tmpJ_gpu(ipoint,m)
+        stop
+      endif
+      err_tot = err_tot + err_loc
+      nrm_tot = nrm_tot + dabs(tmpJ(ipoint,m))
+    enddo
+  enddo
+  print *, ' absolute accuracy on tmpJ (%) =', 100.d0 * err_tot / nrm_tot
+
+  ! ---
+
+  ! tmpM(n_points_final_grid,3))
+  err_tot = 0.d0
+  nrm_tot = 0.d0
+  do m = 1, 3
+    do ipoint = 1, n_points_final_grid
+      err_loc = dabs(tmpM(ipoint,m) - tmpM_gpu(ipoint,m))
+      if(err_loc > acc_thr) then
+        print*, " error on", ipoint, m
+        print*, " CPU res", tmpM    (ipoint,m)
+        print*, " GPU res", tmpM_gpu(ipoint,m)
+        stop
+      endif
+      err_tot = err_tot + err_loc
+      nrm_tot = nrm_tot + dabs(tmpM(ipoint,m))
+    enddo
+  enddo
+  print *, ' absolute accuracy on tmpM (%) =', 100.d0 * err_tot / nrm_tot
+
+  ! ---
+
+  ! tmpS(n_points_final_grid))
+  err_tot = 0.d0
+  nrm_tot = 0.d0
+  do ipoint = 1, n_points_final_grid
+    err_loc = dabs(tmpS(ipoint) - tmpS_gpu(ipoint))
+    if(err_loc > acc_thr) then
+      print*, " error on", ipoint
+      print*, " CPU res", tmpS    (ipoint)
+      print*, " GPU res", tmpS_gpu(ipoint)
+      stop
+    endif
+    err_tot = err_tot + err_loc
+    nrm_tot = nrm_tot + dabs(tmpS(ipoint))
+  enddo
+  print *, ' absolute accuracy on tmpS (%) =', 100.d0 * err_tot / nrm_tot
+
+  ! ---
+
+  ! tmpC(n_points_final_grid,4,mo_num,mo_num))
+  err_tot = 0.d0
+  nrm_tot = 0.d0
+  do i = 1, mo_num
+    do j = 1, mo_num
+      do m = 1, 4
+        do ipoint = 1, n_points_final_grid
+          err_loc = dabs(tmpC(ipoint,m,j,i) - tmpC_gpu(ipoint,m,j,i))
+          if(err_loc > acc_thr) then
+            print*, " error on", ipoint, m, j, i
+            print*, " CPU res", tmpC    (ipoint,m,j,i)
+            print*, " GPU res", tmpC_gpu(ipoint,m,j,i)
+            stop
+          endif
+          err_tot = err_tot + err_loc
+          nrm_tot = nrm_tot + dabs(tmpC(ipoint,m,j,i))
+        enddo
+      enddo
+    enddo
+  enddo
+  print *, ' absolute accuracy on tmpC (%) =', 100.d0 * err_tot / nrm_tot
+
+  ! ---
+
+  ! tmpD(n_points_final_grid,4))
+  err_tot = 0.d0
+  nrm_tot = 0.d0
+  do m = 1, 4
+    do ipoint = 1, n_points_final_grid
+      err_loc = dabs(tmpD(ipoint,m) - tmpD_gpu(ipoint,m))
+      if(err_loc > acc_thr) then
+        print*, " error on", ipoint, m
+        print*, " CPU res", tmpD    (ipoint,m)
+        print*, " GPU res", tmpD_gpu(ipoint,m)
+        stop
+      endif
+      err_tot = err_tot + err_loc
+      nrm_tot = nrm_tot + dabs(tmpD(ipoint,m))
+    enddo
+  enddo
+  print *, ' absolute accuracy on tmpD (%) =', 100.d0 * err_tot / nrm_tot
+
+  ! ---
+
+  ! tmpL(n_points_final_grid,3,mo_num))
+  err_tot = 0.d0
+  nrm_tot = 0.d0
+  do i = 1, mo_num
+    do m = 1, 3
+      do ipoint = 1, n_points_final_grid
+        err_loc = dabs(tmpL(ipoint,m,i) - tmpL_gpu(ipoint,m,i))
+        if(err_loc > acc_thr) then
+          print*, " error on", ipoint, m, i
+          print*, " CPU res", tmpL    (ipoint,m,i)
+          print*, " GPU res", tmpL_gpu(ipoint,m,i)
+          stop
+        endif
+        err_tot = err_tot + err_loc
+        nrm_tot = nrm_tot + dabs(tmpL(ipoint,m,i))
+      enddo
+    enddo
+  enddo
+  print *, ' absolute accuracy on tmpL (%) =', 100.d0 * err_tot / nrm_tot
+
+  ! ---
+
+  ! tmpR(n_points_final_grid,3,mo_num))
+  err_tot = 0.d0
+  nrm_tot = 0.d0
+  do i = 1, mo_num
+    do m = 1, 3
+      do ipoint = 1, n_points_final_grid
+        err_loc = dabs(tmpR(ipoint,m,i) - tmpR_gpu(ipoint,m,i))
+        if(err_loc > acc_thr) then
+          print*, " error on", ipoint, m, i
+          print*, " CPU res", tmpR    (ipoint,m,i)
+          print*, " GPU res", tmpR_gpu(ipoint,m,i)
+          stop
+        endif
+        err_tot = err_tot + err_loc
+        nrm_tot = nrm_tot + dabs(tmpR(ipoint,m,i))
+      enddo
+    enddo
+  enddo
+  print *, ' absolute accuracy on tmpR (%) =', 100.d0 * err_tot / nrm_tot
+
+  ! ---
+
+  ! tmpE(n_points_final_grid,5,mo_num))
+  err_tot = 0.d0
+  nrm_tot = 0.d0
+  do i = 1, mo_num
+    do m = 1, 5
+      do ipoint = 1, n_points_final_grid
+        err_loc = dabs(tmpE(ipoint,m,i) - tmpE_gpu(ipoint,m,i))
+        if(err_loc > acc_thr) then
+          print*, " error on", ipoint, m, i
+          print*, " CPU res", tmpE    (ipoint,m,i)
+          print*, " GPU res", tmpE_gpu(ipoint,m,i)
+          stop
+        endif
+        err_tot = err_tot + err_loc
+        nrm_tot = nrm_tot + dabs(tmpE(ipoint,m,i))
+      enddo
+    enddo
+  enddo
+  print *, ' absolute accuracy on tmpE (%) =', 100.d0 * err_tot / nrm_tot
+
+  ! ---
+
+  ! tmpF(n_points_final_grid,5,mo_num))
+  err_tot = 0.d0
+  nrm_tot = 0.d0
+  do i = 1, mo_num
+    do m = 1, 5
+      do ipoint = 1, n_points_final_grid
+        err_loc = dabs(tmpF(ipoint,m,i) - tmpF_gpu(ipoint,m,i))
+        if(err_loc > acc_thr) then
+          print*, " error on", ipoint, m, i
+          print*, " CPU res", tmpF    (ipoint,m,i)
+          print*, " GPU res", tmpF_gpu(ipoint,m,i)
+          stop
+        endif
+        err_tot = err_tot + err_loc
+        nrm_tot = nrm_tot + dabs(tmpF(ipoint,m,i))
+      enddo
+    enddo
+  enddo
+  print *, ' absolute accuracy on tmpF (%) =', 100.d0 * err_tot / nrm_tot
+
+  ! ---
+
+  ! noL_1e(mo_num,mo_num))
+  err_tot = 0.d0
+  nrm_tot = 0.d0
+  do k = 1, mo_num
+    do l = 1, mo_num
+      err_loc = dabs(noL_1e(l,k) - noL_1e_gpu(l,k))
+      if(err_loc > acc_thr) then
+        print*, " error on", l, k
+        print*, " CPU res", noL_1e    (l,k)
+        print*, " GPU res", noL_1e_gpu(l,k)
+        stop
+      endif
+      err_tot = err_tot + err_loc
+      nrm_tot = nrm_tot + dabs(noL_1e(l,k))
+    enddo
+  enddo
+  print *, ' absolute accuracy on noL_1e (%) =', 100.d0 * err_tot / nrm_tot
+
+  ! ---
+
+
+  deallocate(tmpO)
+  deallocate(tmpJ)
+  deallocate(tmpM)
+  deallocate(tmpS)
+  deallocate(tmpC)
+  deallocate(tmpD)
+  deallocate(tmpL)
+  deallocate(tmpR)
+  deallocate(tmpE)
+  deallocate(tmpF)
+  deallocate(noL_1e)
+
+  deallocate(tmpO_gpu)
+  deallocate(tmpJ_gpu)
+  deallocate(tmpM_gpu)
+  deallocate(tmpS_gpu)
+  deallocate(tmpC_gpu)
+  deallocate(tmpD_gpu)
+  deallocate(tmpL_gpu)
+  deallocate(tmpR_gpu)
+  deallocate(tmpE_gpu)
+  deallocate(tmpF_gpu)
+  deallocate(noL_1e_gpu)
+
+
+  return
+
+end
+
+
+
diff --git a/plugins/local/tc_int/deb_no_2e_gpu.irp.f b/plugins/local/tc_int/deb_no_2e_gpu.irp.f
index 2be53ddd..16f58cca 100644
--- a/plugins/local/tc_int/deb_no_2e_gpu.irp.f
+++ b/plugins/local/tc_int/deb_no_2e_gpu.irp.f
@@ -87,7 +87,6 @@ subroutine deb_no_2e_gpu()
 
   acc_thr = 1d-12
 
-  print *, ' precision on noL_2e '
   err_tot = 0.d0
   nrm_tot = 0.d0
   do i = 1, mo_num
diff --git a/plugins/local/tc_int/deb_tc_int_cuda.irp.f b/plugins/local/tc_int/deb_tc_int_cuda.irp.f
index 9da9ac95..8d0cc4f3 100644
--- a/plugins/local/tc_int/deb_tc_int_cuda.irp.f
+++ b/plugins/local/tc_int/deb_tc_int_cuda.irp.f
@@ -37,8 +37,12 @@ subroutine main()
   implicit none
 
   !call deb_int_2e_ao_gpu()
+
   !call deb_no_2e_gpu_tmp()
-  call deb_no_2e_gpu()
+  !call deb_no_2e_gpu()
+
+  call deb_no_1e_gpu_tmp()
+  !call deb_no_1e_gpu()
 
   return
 end
diff --git a/plugins/local/tc_int/no_1e.irp.f b/plugins/local/tc_int/no_1e.irp.f
index 3a990276..5a9798f0 100644
--- a/plugins/local/tc_int/no_1e.irp.f
+++ b/plugins/local/tc_int/no_1e.irp.f
@@ -15,10 +15,10 @@ subroutine provide_no_1e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r,
 
   integer                       :: p, s, i, j, ipoint
   double precision              :: t0, t1
-  double precision, allocatable :: tmp1(:,:,:,:), tmp2(:,:), tmp3(:,:,:), tmp4(:,:,:)
-  double precision, allocatable :: tmp_L(:,:,:), tmp_R(:,:,:), tmp_M(:,:), tmp_S(:), tmp_O(:), tmp_J(:,:)
-  double precision, allocatable :: tmp_L0(:,:,:), tmp_R0(:,:,:)
-  double precision, allocatable :: tmp_M_priv(:,:), tmp_S_priv(:), tmp_O_priv(:), tmp_J_priv(:,:)
+  double precision, allocatable :: tmpC(:,:,:,:), tmpD(:,:), tmpE(:,:,:), tmpF(:,:,:)
+  double precision, allocatable :: tmpL(:,:,:), tmpR(:,:,:), tmpM(:,:), tmpS(:), tmpO(:), tmpJ(:,:)
+  double precision, allocatable :: tmpL0(:,:,:), tmpR0(:,:,:)
+  double precision, allocatable :: tmpM_priv(:,:), tmpS_priv(:), tmpO_priv(:), tmpJ_priv(:,:)
 
 
   call wall_time(t0)
@@ -26,119 +26,119 @@ subroutine provide_no_1e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r,
 
   if(ne_a .eq. ne_b) then
 
-    allocate(tmp_O(n_grid), tmp_J(n_grid,3))
-    tmp_O = 0.d0
-    tmp_J = 0.d0
+    allocate(tmpO(n_grid), tmpJ(n_grid,3))
+    tmpO = 0.d0
+    tmpJ = 0.d0
 
-    !$OMP PARALLEL                                   &
-    !$OMP DEFAULT(NONE)                              &
-    !$OMP PRIVATE(i, ipoint, tmp_O_priv, tmp_J_priv) &
-    !$OMP SHARED(ne_b, n_grid,              & 
-    !$OMP        mos_l_in_r, mos_r_in_r,             &
-    !$OMP        int2_grad1_u12, tmp_O, tmp_J)
+    !$OMP PARALLEL                                 &
+    !$OMP DEFAULT(NONE)                            &
+    !$OMP PRIVATE(i, ipoint, tmpO_priv, tmpJ_priv) &
+    !$OMP SHARED(ne_b, n_grid,                     & 
+    !$OMP        mos_l_in_r, mos_r_in_r,           &
+    !$OMP        int2_grad1_u12, tmpO, tmpJ)
 
-    allocate(tmp_O_priv(n_grid), tmp_J_priv(n_grid,3))
-    tmp_O_priv = 0.d0
-    tmp_J_priv = 0.d0
+    allocate(tmpO_priv(n_grid), tmpJ_priv(n_grid,3))
+    tmpO_priv = 0.d0
+    tmpJ_priv = 0.d0
   
     !$OMP DO 
     do i = 1, ne_b
       do ipoint = 1, n_grid
-        tmp_O_priv(ipoint)   = tmp_O_priv(ipoint)   + mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i)
-        tmp_J_priv(ipoint,1) = tmp_J_priv(ipoint,1) + int2_grad1_u12(ipoint,1,i,i)
-        tmp_J_priv(ipoint,2) = tmp_J_priv(ipoint,2) + int2_grad1_u12(ipoint,2,i,i)
-        tmp_J_priv(ipoint,3) = tmp_J_priv(ipoint,3) + int2_grad1_u12(ipoint,3,i,i)
+        tmpO_priv(ipoint)   = tmpO_priv(ipoint)   + mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i)
+        tmpJ_priv(ipoint,1) = tmpJ_priv(ipoint,1) + int2_grad1_u12(ipoint,1,i,i)
+        tmpJ_priv(ipoint,2) = tmpJ_priv(ipoint,2) + int2_grad1_u12(ipoint,2,i,i)
+        tmpJ_priv(ipoint,3) = tmpJ_priv(ipoint,3) + int2_grad1_u12(ipoint,3,i,i)
       enddo
     enddo
     !$OMP END DO NOWAIT
 
     !$OMP CRITICAL
-    tmp_O = tmp_O + tmp_O_priv
-    tmp_J = tmp_J + tmp_J_priv
+    tmpO = tmpO + tmpO_priv
+    tmpJ = tmpJ + tmpJ_priv
     !$OMP END CRITICAL
 
-    deallocate(tmp_O_priv, tmp_J_priv)
+    deallocate(tmpO_priv, tmpJ_priv)
     !$OMP END PARALLEL
 
     ! ---
 
-    allocate(tmp_M(n_grid,3), tmp_S(n_grid))
-    tmp_M = 0.d0
-    tmp_S = 0.d0
+    allocate(tmpM(n_grid,3), tmpS(n_grid))
+    tmpM = 0.d0
+    tmpS = 0.d0
 
-    !$OMP PARALLEL                                      &
-    !$OMP DEFAULT(NONE)                                 &
-    !$OMP PRIVATE(i, j, ipoint, tmp_M_priv, tmp_S_priv) &
-    !$OMP SHARED(ne_b, n_grid,                 & 
-    !$OMP        mos_l_in_r, mos_r_in_r,                &
-    !$OMP        int2_grad1_u12, tmp_M, tmp_S)
+    !$OMP PARALLEL                                    &
+    !$OMP DEFAULT(NONE)                               &
+    !$OMP PRIVATE(i, j, ipoint, tmpM_priv, tmpS_priv) &
+    !$OMP SHARED(ne_b, n_grid,                        &
+    !$OMP        mos_l_in_r, mos_r_in_r,              &
+    !$OMP        int2_grad1_u12, tmpM, tmpS)
 
-    allocate(tmp_M_priv(n_grid,3), tmp_S_priv(n_grid))
-    tmp_M_priv = 0.d0
-    tmp_S_priv = 0.d0
+    allocate(tmpM_priv(n_grid,3), tmpS_priv(n_grid))
+    tmpM_priv = 0.d0
+    tmpS_priv = 0.d0
   
     !$OMP DO COLLAPSE(2)
     do i = 1, ne_b
       do j = 1, ne_b
         do ipoint = 1, n_grid
 
-          tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
-          tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
-          tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+          tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+          tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+          tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
 
-          tmp_S_priv(ipoint)   = tmp_S_priv(ipoint)   + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) &
-                                                      + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) &
-                                                      + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i)
+          tmpS_priv(ipoint) = tmpS_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) &
+                                                + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) &
+                                                + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i)
         enddo
       enddo
     enddo
     !$OMP END DO NOWAIT
 
     !$OMP CRITICAL
-    tmp_M = tmp_M + tmp_M_priv
-    tmp_S = tmp_S + tmp_S_priv
+    tmpM = tmpM + tmpM_priv
+    tmpS = tmpS + tmpS_priv
     !$OMP END CRITICAL
 
-    deallocate(tmp_M_priv, tmp_S_priv)
+    deallocate(tmpM_priv, tmpS_priv)
     !$OMP END PARALLEL
 
     ! ---
 
-    allocate(tmp2(n_grid,4))
-    allocate(tmp1(n_grid,4,mo_num,mo_num))
+    allocate(tmpC(n_grid,4,n_mo,n_mo))
+    allocate(tmpD(n_grid,4))
 
     do ipoint = 1, n_grid
 
-      tmp2(ipoint,1) = wr1(ipoint) * (2.d0 * tmp_O(ipoint) * tmp_J(ipoint,1) - tmp_M(ipoint,1))
-      tmp2(ipoint,2) = wr1(ipoint) * (2.d0 * tmp_O(ipoint) * tmp_J(ipoint,2) - tmp_M(ipoint,2))
-      tmp2(ipoint,3) = wr1(ipoint) * (2.d0 * tmp_O(ipoint) * tmp_J(ipoint,3) - tmp_M(ipoint,3))
-      tmp2(ipoint,4) = -wr1(ipoint) * tmp_O(ipoint)
+      tmpD(ipoint,1) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,1) - tmpM(ipoint,1))
+      tmpD(ipoint,2) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,2) - tmpM(ipoint,2))
+      tmpD(ipoint,3) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,3) - tmpM(ipoint,3))
+      tmpD(ipoint,4) = -wr1(ipoint) * tmpO(ipoint)
 
-      tmp_S(ipoint) = 2.d0 * (tmp_J(ipoint,1) * tmp_J(ipoint,1) + tmp_J(ipoint,2) * tmp_J(ipoint,2) + tmp_J(ipoint,3) * tmp_J(ipoint,3)) - tmp_S(ipoint)
+      tmpS(ipoint) = 2.d0 * (tmpJ(ipoint,1) * tmpJ(ipoint,1) + tmpJ(ipoint,2) * tmpJ(ipoint,2) + tmpJ(ipoint,3) * tmpJ(ipoint,3)) - tmpS(ipoint)
     enddo
 
-    deallocate(tmp_O, tmp_M)
+    deallocate(tmpO, tmpM)
 
-    !$OMP PARALLEL                              &
-    !$OMP DEFAULT(NONE)                         &
-    !$OMP PRIVATE(p, s, i, ipoint)              &
-    !$OMP SHARED(mo_num, ne_b, n_grid, & 
-    !$OMP        int2_grad1_u12, tmp1)
+    !$OMP PARALLEL                   &
+    !$OMP DEFAULT(NONE)              &
+    !$OMP PRIVATE(p, s, i, ipoint)   &
+    !$OMP SHARED(n_mo, ne_b, n_grid, &
+    !$OMP        int2_grad1_u12, tmpC)
   
     !$OMP DO COLLAPSE(2)
-    do s = 1, mo_num
-      do p = 1, mo_num
+    do s = 1, n_mo
+      do p = 1, n_mo
 
         do ipoint = 1, n_grid
-          tmp1(ipoint,1,p,s) = int2_grad1_u12(ipoint,1,p,s)
-          tmp1(ipoint,2,p,s) = int2_grad1_u12(ipoint,2,p,s)
-          tmp1(ipoint,3,p,s) = int2_grad1_u12(ipoint,3,p,s)
+          tmpC(ipoint,1,p,s) = int2_grad1_u12(ipoint,1,p,s)
+          tmpC(ipoint,2,p,s) = int2_grad1_u12(ipoint,2,p,s)
+          tmpC(ipoint,3,p,s) = int2_grad1_u12(ipoint,3,p,s)
         enddo
 
-        tmp1(:,4,p,s) = 0.d0
+        tmpC(:,4,p,s) = 0.d0
         do i = 1, ne_b
           do ipoint = 1, n_grid
-            tmp1(ipoint,4,p,s) = tmp1(ipoint,4,p,s) + int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,s) &
+            tmpC(ipoint,4,p,s) = tmpC(ipoint,4,p,s) + int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,s) &
                                                     + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,s) &
                                                     + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,s)
           enddo
@@ -149,41 +149,41 @@ subroutine provide_no_1e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r,
     !$OMP END DO
     !$OMP END PARALLEL
 
-    call dgemv( 'T', 4*n_grid, mo_num*mo_num, 2.d0 &
-              , tmp1(1,1,1,1), size(tmp1, 1) * size(tmp1, 2)    &
-              , tmp2(1,1), 1                                    &
+    call dgemv( 'T', 4*n_grid, n_mo*n_mo, 2.d0               &
+              , tmpC(1,1,1,1), size(tmpC, 1) * size(tmpC, 2) &
+              , tmpD(1,1), 1                                 &
               , 0.d0, noL_1e(1,1), 1)
 
-    deallocate(tmp1, tmp2)
+    deallocate(tmpC, tmpD)
 
     ! ---
 
-    allocate(tmp_L(n_grid,3,mo_num))
-    allocate(tmp_R(n_grid,3,mo_num))
+    allocate(tmpL(n_grid,3,n_mo))
+    allocate(tmpR(n_grid,3,n_mo))
 
-    !$OMP PARALLEL                              &
-    !$OMP DEFAULT(NONE)                         &
-    !$OMP PRIVATE(p, i, ipoint)                 &
-    !$OMP SHARED(ne_b, n_grid, mo_num, & 
-    !$OMP        mos_l_in_r, mos_r_in_r,        &
-    !$OMP        int2_grad1_u12, tmp_L, tmp_R)
+    !$OMP PARALLEL                       &
+    !$OMP DEFAULT(NONE)                  &
+    !$OMP PRIVATE(p, i, ipoint)          &
+    !$OMP SHARED(ne_b, n_grid, n_mo,     &
+    !$OMP        mos_l_in_r, mos_r_in_r, &
+    !$OMP        int2_grad1_u12, tmpL, tmpR)
 
     !$OMP DO
-    do p = 1, mo_num
+    do p = 1, n_mo
 
-      tmp_L(:,1:3,p) = 0.d0
-      tmp_R(:,1:3,p) = 0.d0
+      tmpL(:,1:3,p) = 0.d0
+      tmpR(:,1:3,p) = 0.d0
 
       do i = 1, ne_b
         do ipoint = 1, n_grid
 
-          tmp_L(ipoint,1,p) = tmp_L(ipoint,1,p) + int2_grad1_u12(ipoint,1,p,i) * mos_l_in_r(ipoint,i)
-          tmp_L(ipoint,2,p) = tmp_L(ipoint,2,p) + int2_grad1_u12(ipoint,2,p,i) * mos_l_in_r(ipoint,i)
-          tmp_L(ipoint,3,p) = tmp_L(ipoint,3,p) + int2_grad1_u12(ipoint,3,p,i) * mos_l_in_r(ipoint,i)
+          tmpL(ipoint,1,p) = tmpL(ipoint,1,p) + int2_grad1_u12(ipoint,1,p,i) * mos_l_in_r(ipoint,i)
+          tmpL(ipoint,2,p) = tmpL(ipoint,2,p) + int2_grad1_u12(ipoint,2,p,i) * mos_l_in_r(ipoint,i)
+          tmpL(ipoint,3,p) = tmpL(ipoint,3,p) + int2_grad1_u12(ipoint,3,p,i) * mos_l_in_r(ipoint,i)
 
-          tmp_R(ipoint,1,p) = tmp_R(ipoint,1,p) + int2_grad1_u12(ipoint,1,i,p) * mos_r_in_r(ipoint,i)
-          tmp_R(ipoint,2,p) = tmp_R(ipoint,2,p) + int2_grad1_u12(ipoint,2,i,p) * mos_r_in_r(ipoint,i)
-          tmp_R(ipoint,3,p) = tmp_R(ipoint,3,p) + int2_grad1_u12(ipoint,3,i,p) * mos_r_in_r(ipoint,i)
+          tmpR(ipoint,1,p) = tmpR(ipoint,1,p) + int2_grad1_u12(ipoint,1,i,p) * mos_r_in_r(ipoint,i)
+          tmpR(ipoint,2,p) = tmpR(ipoint,2,p) + int2_grad1_u12(ipoint,2,i,p) * mos_r_in_r(ipoint,i)
+          tmpR(ipoint,3,p) = tmpR(ipoint,3,p) + int2_grad1_u12(ipoint,3,i,p) * mos_r_in_r(ipoint,i)
         enddo
       enddo
     enddo ! p
@@ -192,45 +192,45 @@ subroutine provide_no_1e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r,
 
     ! ---
 
-    allocate(tmp3(n_grid,5,mo_num))
-    allocate(tmp4(n_grid,5,mo_num))
+    allocate(tmpE(n_grid,5,n_mo))
+    allocate(tmpF(n_grid,5,n_mo))
 
-    !$OMP PARALLEL                              &
-    !$OMP DEFAULT(NONE)                         &
-    !$OMP PRIVATE(p, i, j, ipoint)              &
-    !$OMP SHARED(ne_b, n_grid, mo_num, & 
-    !$OMP        mos_l_in_r, mos_r_in_r,        &
-    !$OMP        int2_grad1_u12, wr1,           &
-    !$OMP        tmp_L, tmp_R, tmp_J, tmp_S, tmp3, tmp4)
+    !$OMP PARALLEL                       &
+    !$OMP DEFAULT(NONE)                  &
+    !$OMP PRIVATE(p, i, j, ipoint)       &
+    !$OMP SHARED(ne_b, n_grid, n_mo,     &
+    !$OMP        mos_l_in_r, mos_r_in_r, &
+    !$OMP        int2_grad1_u12, wr1,    &
+    !$OMP        tmpL, tmpR, tmpJ, tmpS, tmpE, tmpF)
 
     !$OMP DO
-    do p = 1, mo_num
+    do p = 1, n_mo
 
       do ipoint = 1, n_grid
 
-        tmp3(ipoint,1,p) = wr1(ipoint) * mos_l_in_r(ipoint,p)
-        tmp3(ipoint,2,p) = -2.d0 * (tmp_L(ipoint,1,p) * tmp_J(ipoint,1) + tmp_L(ipoint,2,p) * tmp_J(ipoint,2) + tmp_L(ipoint,3,p) * tmp_J(ipoint,3))
-        tmp3(ipoint,3,p) = wr1(ipoint) * tmp_L(ipoint,1,p)
-        tmp3(ipoint,4,p) = wr1(ipoint) * tmp_L(ipoint,2,p)
-        tmp3(ipoint,5,p) = wr1(ipoint) * tmp_L(ipoint,3,p)
+        tmpE(ipoint,1,p) = wr1(ipoint) * mos_l_in_r(ipoint,p)
+        tmpE(ipoint,2,p) = -2.d0 * (tmpL(ipoint,1,p) * tmpJ(ipoint,1) + tmpL(ipoint,2,p) * tmpJ(ipoint,2) + tmpL(ipoint,3,p) * tmpJ(ipoint,3))
+        tmpE(ipoint,3,p) = wr1(ipoint) * tmpL(ipoint,1,p)
+        tmpE(ipoint,4,p) = wr1(ipoint) * tmpL(ipoint,2,p)
+        tmpE(ipoint,5,p) = wr1(ipoint) * tmpL(ipoint,3,p)
 
-        tmp4(ipoint,1,p) = -2.d0 * (tmp_R(ipoint,1,p) * tmp_J(ipoint,1) + tmp_R(ipoint,2,p) * tmp_J(ipoint,2) + tmp_R(ipoint,3,p) * tmp_J(ipoint,3)) &
-                         + mos_r_in_r(ipoint,p) * tmp_S(ipoint)
-        tmp4(ipoint,2,p) = wr1(ipoint) * mos_r_in_r(ipoint,p)
-        tmp4(ipoint,3,p) = tmp_R(ipoint,1,p)
-        tmp4(ipoint,4,p) = tmp_R(ipoint,2,p)
-        tmp4(ipoint,5,p) = tmp_R(ipoint,3,p)
+        tmpF(ipoint,1,p) = -2.d0 * (tmpR(ipoint,1,p) * tmpJ(ipoint,1) + tmpR(ipoint,2,p) * tmpJ(ipoint,2) + tmpR(ipoint,3,p) * tmpJ(ipoint,3)) &
+                         + mos_r_in_r(ipoint,p) * tmpS(ipoint)
+        tmpF(ipoint,2,p) = wr1(ipoint) * mos_r_in_r(ipoint,p)
+        tmpF(ipoint,3,p) = tmpR(ipoint,1,p)
+        tmpF(ipoint,4,p) = tmpR(ipoint,2,p)
+        tmpF(ipoint,5,p) = tmpR(ipoint,3,p)
       enddo
 
       do i = 1, ne_b
         do j = 1, ne_b
           do ipoint = 1, n_grid
 
-            tmp3(ipoint,2,p) = tmp3(ipoint,2,p) + mos_l_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,j) & 
+            tmpE(ipoint,2,p) = tmpE(ipoint,2,p) + mos_l_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,j) & 
                                                                          + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,j) &                                                                                     
                                                                          + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,j) )
 
-            tmp4(ipoint,1,p) = tmp4(ipoint,1,p) + mos_r_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,p) & 
+            tmpF(ipoint,1,p) = tmpF(ipoint,1,p) + mos_r_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,p) & 
                                                                          + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,p) &                                                                                     
                                                                          + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,p) )
           enddo ! ipoint
@@ -241,40 +241,40 @@ subroutine provide_no_1e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r,
     !$OMP END DO
     !$OMP END PARALLEL
 
-    deallocate(tmp_L, tmp_R, tmp_J, tmp_S)
+    deallocate(tmpL, tmpR, tmpJ, tmpS)
 
-    call dgemm( 'T', 'N', mo_num, mo_num, 5*n_grid, 1.d0     &
-              , tmp3(1,1,1), 5*n_grid, tmp4(1,1,1), 5*n_grid &
-              , 1.d0, noL_1e(1,1), mo_num)
+    call dgemm( 'T', 'N', n_mo, n_mo, 5*n_grid, 1.d0         &
+              , tmpE(1,1,1), 5*n_grid, tmpF(1,1,1), 5*n_grid &
+              , 1.d0, noL_1e(1,1), n_mo)
    
-    deallocate(tmp3, tmp4)
+    deallocate(tmpE, tmpF)
 
     ! ---
 
   else
 
-    allocate(tmp_O(n_grid), tmp_J(n_grid,3))
-    tmp_O = 0.d0
-    tmp_J = 0.d0
+    allocate(tmpO(n_grid), tmpJ(n_grid,3))
+    tmpO = 0.d0
+    tmpJ = 0.d0
 
-    !$OMP PARALLEL                                      &
-    !$OMP DEFAULT(NONE)                                 &
-    !$OMP PRIVATE(i, ipoint, tmp_O_priv, tmp_J_priv)    &
-    !$OMP SHARED(ne_b, ne_a, n_grid, & 
-    !$OMP        mos_l_in_r, mos_r_in_r,                &
-    !$OMP        int2_grad1_u12, tmp_O, tmp_J)
+    !$OMP PARALLEL                                 &
+    !$OMP DEFAULT(NONE)                            &
+    !$OMP PRIVATE(i, ipoint, tmpO_priv, tmpJ_priv) &
+    !$OMP SHARED(ne_b, ne_a, n_grid,               &
+    !$OMP        mos_l_in_r, mos_r_in_r,           &
+    !$OMP        int2_grad1_u12, tmpO, tmpJ)
 
-    allocate(tmp_O_priv(n_grid), tmp_J_priv(n_grid,3))
-    tmp_O_priv = 0.d0
-    tmp_J_priv = 0.d0
+    allocate(tmpO_priv(n_grid), tmpJ_priv(n_grid,3))
+    tmpO_priv = 0.d0
+    tmpJ_priv = 0.d0
   
     !$OMP DO 
     do i = 1, ne_b
       do ipoint = 1, n_grid
-        tmp_O_priv(ipoint)   = tmp_O_priv(ipoint)   + mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i)
-        tmp_J_priv(ipoint,1) = tmp_J_priv(ipoint,1) + int2_grad1_u12(ipoint,1,i,i)
-        tmp_J_priv(ipoint,2) = tmp_J_priv(ipoint,2) + int2_grad1_u12(ipoint,2,i,i)
-        tmp_J_priv(ipoint,3) = tmp_J_priv(ipoint,3) + int2_grad1_u12(ipoint,3,i,i)
+        tmpO_priv(ipoint)   = tmpO_priv(ipoint)   + mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i)
+        tmpJ_priv(ipoint,1) = tmpJ_priv(ipoint,1) + int2_grad1_u12(ipoint,1,i,i)
+        tmpJ_priv(ipoint,2) = tmpJ_priv(ipoint,2) + int2_grad1_u12(ipoint,2,i,i)
+        tmpJ_priv(ipoint,3) = tmpJ_priv(ipoint,3) + int2_grad1_u12(ipoint,3,i,i)
       enddo
     enddo
     !$OMP END DO NOWAIT
@@ -282,51 +282,51 @@ subroutine provide_no_1e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r,
     !$OMP DO 
     do i = ne_b+1, ne_a
       do ipoint = 1, n_grid
-        tmp_O_priv(ipoint)   = tmp_O_priv(ipoint)   + 0.5d0 * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i)
-        tmp_J_priv(ipoint,1) = tmp_J_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,i)
-        tmp_J_priv(ipoint,2) = tmp_J_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,i)
-        tmp_J_priv(ipoint,3) = tmp_J_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,i)
+        tmpO_priv(ipoint)   = tmpO_priv(ipoint)   + 0.5d0 * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i)
+        tmpJ_priv(ipoint,1) = tmpJ_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,i)
+        tmpJ_priv(ipoint,2) = tmpJ_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,i)
+        tmpJ_priv(ipoint,3) = tmpJ_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,i)
       enddo
     enddo
     !$OMP END DO NOWAIT
 
     !$OMP CRITICAL
-    tmp_O = tmp_O + tmp_O_priv
-    tmp_J = tmp_J + tmp_J_priv
+    tmpO = tmpO + tmpO_priv
+    tmpJ = tmpJ + tmpJ_priv
     !$OMP END CRITICAL
 
-    deallocate(tmp_O_priv, tmp_J_priv)
+    deallocate(tmpO_priv, tmpJ_priv)
     !$OMP END PARALLEL
 
     ! ---
 
-    allocate(tmp_M(n_grid,3), tmp_S(n_grid))
-    tmp_M = 0.d0
-    tmp_S = 0.d0
+    allocate(tmpM(n_grid,3), tmpS(n_grid))
+    tmpM = 0.d0
+    tmpS = 0.d0
 
-    !$OMP PARALLEL                                      &
-    !$OMP DEFAULT(NONE)                                 &
-    !$OMP PRIVATE(i, j, ipoint, tmp_M_priv, tmp_S_priv) &
-    !$OMP SHARED(ne_b, ne_a, n_grid, & 
-    !$OMP        mos_l_in_r, mos_r_in_r,                &
-    !$OMP        int2_grad1_u12, tmp_M, tmp_S)
+    !$OMP PARALLEL                                    &
+    !$OMP DEFAULT(NONE)                               &
+    !$OMP PRIVATE(i, j, ipoint, tmpM_priv, tmpS_priv) &
+    !$OMP SHARED(ne_b, ne_a, n_grid,                  &
+    !$OMP        mos_l_in_r, mos_r_in_r,              &
+    !$OMP        int2_grad1_u12, tmpM, tmpS)
 
-    allocate(tmp_M_priv(n_grid,3), tmp_S_priv(n_grid))
-    tmp_M_priv = 0.d0
-    tmp_S_priv = 0.d0
+    allocate(tmpM_priv(n_grid,3), tmpS_priv(n_grid))
+    tmpM_priv = 0.d0
+    tmpS_priv = 0.d0
   
     !$OMP DO COLLAPSE(2)
     do i = 1, ne_b
       do j = 1, ne_b
         do ipoint = 1, n_grid
 
-          tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
-          tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
-          tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+          tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+          tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+          tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
 
-          tmp_S_priv(ipoint)   = tmp_S_priv(ipoint)   + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) &
-                                                      + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) &
-                                                      + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i)
+          tmpS_priv(ipoint) = tmpS_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) &
+                                                + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) &
+                                                + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i)
         enddo
       enddo
     enddo
@@ -337,17 +337,17 @@ subroutine provide_no_1e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r,
       do j = 1, ne_b
         do ipoint = 1, n_grid
 
-          tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
-          tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
-          tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+          tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+          tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+          tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
 
-          tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i)
-          tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i)
-          tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i)
+          tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i)
+          tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i)
+          tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i)
 
-          tmp_S_priv(ipoint)   = tmp_S_priv(ipoint)   + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) &
-                                                      + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) &
-                                                      + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i)
+          tmpS_priv(ipoint) = tmpS_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) &
+                                                + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) &
+                                                + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i)
         enddo
       enddo
     enddo
@@ -358,70 +358,70 @@ subroutine provide_no_1e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r,
       do j = ne_b+1, ne_a
         do ipoint = 1, n_grid
 
-          tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
-          tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
-          tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+          tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+          tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+          tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
 
-          tmp_S_priv(ipoint)   = tmp_S_priv(ipoint)   + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) &
-                                                      + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) &
-                                                      + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i)
+          tmpS_priv(ipoint) = tmpS_priv(ipoint) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) &
+                                                + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) &
+                                                + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i)
         enddo
       enddo
     enddo
     !$OMP END DO NOWAIT
 
     !$OMP CRITICAL
-    tmp_M = tmp_M + tmp_M_priv
-    tmp_S = tmp_S + tmp_S_priv
+    tmpM = tmpM + tmpM_priv
+    tmpS = tmpS + tmpS_priv
     !$OMP END CRITICAL
 
-    deallocate(tmp_M_priv, tmp_S_priv)
+    deallocate(tmpM_priv, tmpS_priv)
     !$OMP END PARALLEL
 
     ! ---
 
-    allocate(tmp2(n_grid,4))
-    allocate(tmp1(n_grid,4,mo_num,mo_num))
+    allocate(tmpC(n_grid,4,n_mo,n_mo))
+    allocate(tmpD(n_grid,4))
 
     do ipoint = 1, n_grid
 
-      tmp2(ipoint,1) = wr1(ipoint) * (2.d0 * tmp_O(ipoint) * tmp_J(ipoint,1) - tmp_M(ipoint,1))
-      tmp2(ipoint,2) = wr1(ipoint) * (2.d0 * tmp_O(ipoint) * tmp_J(ipoint,2) - tmp_M(ipoint,2))
-      tmp2(ipoint,3) = wr1(ipoint) * (2.d0 * tmp_O(ipoint) * tmp_J(ipoint,3) - tmp_M(ipoint,3))
-      tmp2(ipoint,4) = -wr1(ipoint) * tmp_O(ipoint)
+      tmpD(ipoint,1) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,1) - tmpM(ipoint,1))
+      tmpD(ipoint,2) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,2) - tmpM(ipoint,2))
+      tmpD(ipoint,3) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,3) - tmpM(ipoint,3))
+      tmpD(ipoint,4) = -wr1(ipoint) * tmpO(ipoint)
 
-      tmp_S(ipoint) = 2.d0 * (tmp_J(ipoint,1) * tmp_J(ipoint,1) + tmp_J(ipoint,2) * tmp_J(ipoint,2) + tmp_J(ipoint,3) * tmp_J(ipoint,3)) - tmp_S(ipoint)
+      tmpS(ipoint) = 2.d0 * (tmpJ(ipoint,1) * tmpJ(ipoint,1) + tmpJ(ipoint,2) * tmpJ(ipoint,2) + tmpJ(ipoint,3) * tmpJ(ipoint,3)) - tmpS(ipoint)
     enddo
 
-    deallocate(tmp_O, tmp_M)
+    deallocate(tmpO, tmpM)
 
-    !$OMP PARALLEL                              &
-    !$OMP DEFAULT(NONE)                         &
-    !$OMP PRIVATE(p, s, i, ipoint)              &
-    !$OMP SHARED(mo_num, ne_b, n_grid, & 
-    !$OMP        ne_a, int2_grad1_u12, tmp1)
+    !$OMP PARALLEL                   &
+    !$OMP DEFAULT(NONE)              &
+    !$OMP PRIVATE(p, s, i, ipoint)   &
+    !$OMP SHARED(n_mo, ne_b, n_grid, &
+    !$OMP        ne_a, int2_grad1_u12, tmpC)
   
     !$OMP DO COLLAPSE(2)
-    do s = 1, mo_num
-      do p = 1, mo_num
+    do s = 1, n_mo
+      do p = 1, n_mo
 
         do ipoint = 1, n_grid
-          tmp1(ipoint,1,p,s) = int2_grad1_u12(ipoint,1,p,s)
-          tmp1(ipoint,2,p,s) = int2_grad1_u12(ipoint,2,p,s)
-          tmp1(ipoint,3,p,s) = int2_grad1_u12(ipoint,3,p,s)
+          tmpC(ipoint,1,p,s) = int2_grad1_u12(ipoint,1,p,s)
+          tmpC(ipoint,2,p,s) = int2_grad1_u12(ipoint,2,p,s)
+          tmpC(ipoint,3,p,s) = int2_grad1_u12(ipoint,3,p,s)
         enddo
 
-        tmp1(:,4,p,s) = 0.d0
+        tmpC(:,4,p,s) = 0.d0
         do i = 1, ne_b
           do ipoint = 1, n_grid
-            tmp1(ipoint,4,p,s) = tmp1(ipoint,4,p,s) + int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,s) &
+            tmpC(ipoint,4,p,s) = tmpC(ipoint,4,p,s) + int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,s) &
                                                     + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,s) &
                                                     + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,s)
           enddo
         enddo
         do i = ne_b+1, ne_a
           do ipoint = 1, n_grid
-            tmp1(ipoint,4,p,s) = tmp1(ipoint,4,p,s) + 0.5d0 * int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,s) &
+            tmpC(ipoint,4,p,s) = tmpC(ipoint,4,p,s) + 0.5d0 * int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,s) &
                                                     + 0.5d0 * int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,s) &
                                                     + 0.5d0 * int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,s)
           enddo
@@ -432,55 +432,55 @@ subroutine provide_no_1e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r,
     !$OMP END DO
     !$OMP END PARALLEL
 
-    call dgemv( 'T', 4*n_grid, mo_num*mo_num, 2.d0 &
-              , tmp1(1,1,1,1), size(tmp1, 1) * size(tmp1, 2)    &
-              , tmp2(1,1), 1                                    &
+    call dgemv( 'T', 4*n_grid, n_mo*n_mo, 2.d0               &
+              , tmpC(1,1,1,1), size(tmpC, 1) * size(tmpC, 2) &
+              , tmpD(1,1), 1                                 &
               , 0.d0, noL_1e(1,1), 1)
 
-    deallocate(tmp1, tmp2)
+    deallocate(tmpC, tmpD)
 
     ! ---
 
-    allocate(tmp_L(n_grid,3,mo_num), tmp_L0(n_grid,3,mo_num))
-    allocate(tmp_R(n_grid,3,mo_num), tmp_R0(n_grid,3,mo_num))
+    allocate(tmpL(n_grid,3,n_mo), tmpL0(n_grid,3,n_mo))
+    allocate(tmpR(n_grid,3,n_mo), tmpR0(n_grid,3,n_mo))
 
-    !$OMP PARALLEL                                              &
-    !$OMP DEFAULT(NONE)                                         &
-    !$OMP PRIVATE(p, i, ipoint)                                 &
-    !$OMP SHARED(ne_b, ne_a, n_grid, mo_num, & 
-    !$OMP        mos_l_in_r, mos_r_in_r,                        &
-    !$OMP        int2_grad1_u12, tmp_L0, tmp_R0, tmp_L, tmp_R)
+    !$OMP PARALLEL                         &
+    !$OMP DEFAULT(NONE)                    &
+    !$OMP PRIVATE(p, i, ipoint)            &
+    !$OMP SHARED(ne_b, ne_a, n_grid, n_mo, &
+    !$OMP        mos_l_in_r, mos_r_in_r,   &
+    !$OMP        int2_grad1_u12, tmpL0, tmpR0, tmpL, tmpR)
 
     !$OMP DO
-    do p = 1, mo_num
+    do p = 1, n_mo
 
-      tmp_L0(:,1:3,p) = 0.d0
-      tmp_R0(:,1:3,p) = 0.d0
+      tmpL0(:,1:3,p) = 0.d0
+      tmpR0(:,1:3,p) = 0.d0
       do i = ne_b+1, ne_a
         do ipoint = 1, n_grid
 
-          tmp_L0(ipoint,1,p) = tmp_L0(ipoint,1,p) + 0.5d0 * int2_grad1_u12(ipoint,1,p,i) * mos_l_in_r(ipoint,i)
-          tmp_L0(ipoint,2,p) = tmp_L0(ipoint,2,p) + 0.5d0 * int2_grad1_u12(ipoint,2,p,i) * mos_l_in_r(ipoint,i)
-          tmp_L0(ipoint,3,p) = tmp_L0(ipoint,3,p) + 0.5d0 * int2_grad1_u12(ipoint,3,p,i) * mos_l_in_r(ipoint,i)
+          tmpL0(ipoint,1,p) = tmpL0(ipoint,1,p) + 0.5d0 * int2_grad1_u12(ipoint,1,p,i) * mos_l_in_r(ipoint,i)
+          tmpL0(ipoint,2,p) = tmpL0(ipoint,2,p) + 0.5d0 * int2_grad1_u12(ipoint,2,p,i) * mos_l_in_r(ipoint,i)
+          tmpL0(ipoint,3,p) = tmpL0(ipoint,3,p) + 0.5d0 * int2_grad1_u12(ipoint,3,p,i) * mos_l_in_r(ipoint,i)
                                      
-          tmp_R0(ipoint,1,p) = tmp_R0(ipoint,1,p) + 0.5d0 * int2_grad1_u12(ipoint,1,i,p) * mos_r_in_r(ipoint,i)
-          tmp_R0(ipoint,2,p) = tmp_R0(ipoint,2,p) + 0.5d0 * int2_grad1_u12(ipoint,2,i,p) * mos_r_in_r(ipoint,i)
-          tmp_R0(ipoint,3,p) = tmp_R0(ipoint,3,p) + 0.5d0 * int2_grad1_u12(ipoint,3,i,p) * mos_r_in_r(ipoint,i)
+          tmpR0(ipoint,1,p) = tmpR0(ipoint,1,p) + 0.5d0 * int2_grad1_u12(ipoint,1,i,p) * mos_r_in_r(ipoint,i)
+          tmpR0(ipoint,2,p) = tmpR0(ipoint,2,p) + 0.5d0 * int2_grad1_u12(ipoint,2,i,p) * mos_r_in_r(ipoint,i)
+          tmpR0(ipoint,3,p) = tmpR0(ipoint,3,p) + 0.5d0 * int2_grad1_u12(ipoint,3,i,p) * mos_r_in_r(ipoint,i)
         enddo
       enddo
 
-      tmp_L(:,1:3,p) = tmp_L0(:,1:3,p)
-      tmp_R(:,1:3,p) = tmp_R0(:,1:3,p)
+      tmpL(:,1:3,p) = tmpL0(:,1:3,p)
+      tmpR(:,1:3,p) = tmpR0(:,1:3,p)
       do i = 1, ne_b
         do ipoint = 1, n_grid
 
-          tmp_L(ipoint,1,p) = tmp_L(ipoint,1,p) + int2_grad1_u12(ipoint,1,p,i) * mos_l_in_r(ipoint,i)
-          tmp_L(ipoint,2,p) = tmp_L(ipoint,2,p) + int2_grad1_u12(ipoint,2,p,i) * mos_l_in_r(ipoint,i)
-          tmp_L(ipoint,3,p) = tmp_L(ipoint,3,p) + int2_grad1_u12(ipoint,3,p,i) * mos_l_in_r(ipoint,i)
+          tmpL(ipoint,1,p) = tmpL(ipoint,1,p) + int2_grad1_u12(ipoint,1,p,i) * mos_l_in_r(ipoint,i)
+          tmpL(ipoint,2,p) = tmpL(ipoint,2,p) + int2_grad1_u12(ipoint,2,p,i) * mos_l_in_r(ipoint,i)
+          tmpL(ipoint,3,p) = tmpL(ipoint,3,p) + int2_grad1_u12(ipoint,3,p,i) * mos_l_in_r(ipoint,i)
                                    
-          tmp_R(ipoint,1,p) = tmp_R(ipoint,1,p) + int2_grad1_u12(ipoint,1,i,p) * mos_r_in_r(ipoint,i)
-          tmp_R(ipoint,2,p) = tmp_R(ipoint,2,p) + int2_grad1_u12(ipoint,2,i,p) * mos_r_in_r(ipoint,i)
-          tmp_R(ipoint,3,p) = tmp_R(ipoint,3,p) + int2_grad1_u12(ipoint,3,i,p) * mos_r_in_r(ipoint,i)
+          tmpR(ipoint,1,p) = tmpR(ipoint,1,p) + int2_grad1_u12(ipoint,1,i,p) * mos_r_in_r(ipoint,i)
+          tmpR(ipoint,2,p) = tmpR(ipoint,2,p) + int2_grad1_u12(ipoint,2,i,p) * mos_r_in_r(ipoint,i)
+          tmpR(ipoint,3,p) = tmpR(ipoint,3,p) + int2_grad1_u12(ipoint,3,i,p) * mos_r_in_r(ipoint,i)
         enddo
       enddo
 
@@ -490,51 +490,51 @@ subroutine provide_no_1e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r,
 
     ! ---
 
-    allocate(tmp3(n_grid,8,mo_num))
-    allocate(tmp4(n_grid,8,mo_num))
+    allocate(tmpE(n_grid,8,n_mo))
+    allocate(tmpF(n_grid,8,n_mo))
 
-    !$OMP PARALLEL                                              &
-    !$OMP DEFAULT(NONE)                                         &
-    !$OMP PRIVATE(p, i, j, ipoint)                              &
-    !$OMP SHARED(ne_b, ne_a, n_grid, mo_num, & 
-    !$OMP        mos_l_in_r, mos_r_in_r,                        &
-    !$OMP        int2_grad1_u12, wr1,                           &
-    !$OMP        tmp_L, tmp_L0, tmp_R, tmp_R0, tmp_J, tmp_S, tmp3, tmp4)
+    !$OMP PARALLEL                         &
+    !$OMP DEFAULT(NONE)                    &
+    !$OMP PRIVATE(p, i, j, ipoint)         &
+    !$OMP SHARED(ne_b, ne_a, n_grid, n_mo, & 
+    !$OMP        mos_l_in_r, mos_r_in_r,   &
+    !$OMP        int2_grad1_u12, wr1,      &
+    !$OMP        tmpL, tmpL0, tmpR, tmpR0, tmpJ, tmpS, tmpE, tmpF)
 
     !$OMP DO
-    do p = 1, mo_num
+    do p = 1, n_mo
 
       do ipoint = 1, n_grid
 
-        tmp3(ipoint,1,p) = wr1(ipoint) * mos_l_in_r(ipoint,p)
-        tmp3(ipoint,2,p) = -2.d0 * (tmp_L(ipoint,1,p) * tmp_J(ipoint,1) + tmp_L(ipoint,2,p) * tmp_J(ipoint,2) + tmp_L(ipoint,3,p) * tmp_J(ipoint,3))
-        tmp3(ipoint,3,p) = wr1(ipoint) * tmp_L(ipoint,1,p)
-        tmp3(ipoint,4,p) = wr1(ipoint) * tmp_L(ipoint,2,p)
-        tmp3(ipoint,5,p) = wr1(ipoint) * tmp_L(ipoint,3,p)
-        tmp3(ipoint,6,p) = wr1(ipoint) * tmp_L0(ipoint,1,p)
-        tmp3(ipoint,7,p) = wr1(ipoint) * tmp_L0(ipoint,2,p)
-        tmp3(ipoint,8,p) = wr1(ipoint) * tmp_L0(ipoint,3,p)
+        tmpE(ipoint,1,p) = wr1(ipoint) * mos_l_in_r(ipoint,p)
+        tmpE(ipoint,2,p) = -2.d0 * (tmpL(ipoint,1,p) * tmpJ(ipoint,1) + tmpL(ipoint,2,p) * tmpJ(ipoint,2) + tmpL(ipoint,3,p) * tmpJ(ipoint,3))
+        tmpE(ipoint,3,p) = wr1(ipoint) * tmpL(ipoint,1,p)
+        tmpE(ipoint,4,p) = wr1(ipoint) * tmpL(ipoint,2,p)
+        tmpE(ipoint,5,p) = wr1(ipoint) * tmpL(ipoint,3,p)
+        tmpE(ipoint,6,p) = wr1(ipoint) * tmpL0(ipoint,1,p)
+        tmpE(ipoint,7,p) = wr1(ipoint) * tmpL0(ipoint,2,p)
+        tmpE(ipoint,8,p) = wr1(ipoint) * tmpL0(ipoint,3,p)
 
-        tmp4(ipoint,1,p) = -2.d0 * (tmp_R(ipoint,1,p) * tmp_J(ipoint,1) + tmp_R(ipoint,2,p) * tmp_J(ipoint,2) + tmp_R(ipoint,3,p) * tmp_J(ipoint,3)) &
-                         + mos_r_in_r(ipoint,p) * tmp_S(ipoint)
-        tmp4(ipoint,2,p) = wr1(ipoint) * mos_r_in_r(ipoint,p)
-        tmp4(ipoint,3,p) = tmp_R(ipoint,1,p)
-        tmp4(ipoint,4,p) = tmp_R(ipoint,2,p)
-        tmp4(ipoint,5,p) = tmp_R(ipoint,3,p)
-        tmp4(ipoint,6,p) = tmp_R0(ipoint,1,p)
-        tmp4(ipoint,7,p) = tmp_R0(ipoint,2,p)
-        tmp4(ipoint,8,p) = tmp_R0(ipoint,3,p)
+        tmpF(ipoint,1,p) = -2.d0 * (tmpR(ipoint,1,p) * tmpJ(ipoint,1) + tmpR(ipoint,2,p) * tmpJ(ipoint,2) + tmpR(ipoint,3,p) * tmpJ(ipoint,3)) &
+                         + mos_r_in_r(ipoint,p) * tmpS(ipoint)
+        tmpF(ipoint,2,p) = wr1(ipoint) * mos_r_in_r(ipoint,p)
+        tmpF(ipoint,3,p) = tmpR(ipoint,1,p)
+        tmpF(ipoint,4,p) = tmpR(ipoint,2,p)
+        tmpF(ipoint,5,p) = tmpR(ipoint,3,p)
+        tmpF(ipoint,6,p) = tmpR0(ipoint,1,p)
+        tmpF(ipoint,7,p) = tmpR0(ipoint,2,p)
+        tmpF(ipoint,8,p) = tmpR0(ipoint,3,p)
       enddo
 
       do i = 1, ne_b
         do j = 1, ne_b
           do ipoint = 1, n_grid
 
-            tmp3(ipoint,2,p) = tmp3(ipoint,2,p) + mos_l_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,j) & 
+            tmpE(ipoint,2,p) = tmpE(ipoint,2,p) + mos_l_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,j) & 
                                                                          + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,j) &                                                                                     
                                                                          + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,j) )
 
-            tmp4(ipoint,1,p) = tmp4(ipoint,1,p) + mos_r_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,p) & 
+            tmpF(ipoint,1,p) = tmpF(ipoint,1,p) + mos_r_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,p) & 
                                                                          + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,p) &                                                                                     
                                                                          + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,p) )
           enddo ! ipoint
@@ -545,17 +545,17 @@ subroutine provide_no_1e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r,
         do j = 1, ne_b
           do ipoint = 1, n_grid
 
-            tmp3(ipoint,2,p) = tmp3(ipoint,2,p) + 0.5d0 * mos_l_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,j) & 
+            tmpE(ipoint,2,p) = tmpE(ipoint,2,p) + 0.5d0 * mos_l_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,j) & 
                                                                                  + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,j) &                                                                                     
                                                                                  + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,j) )
-            tmp3(ipoint,2,p) = tmp3(ipoint,2,p) + 0.5d0 * mos_l_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,p,j) * int2_grad1_u12(ipoint,1,j,i) & 
+            tmpE(ipoint,2,p) = tmpE(ipoint,2,p) + 0.5d0 * mos_l_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,p,j) * int2_grad1_u12(ipoint,1,j,i) & 
                                                                                  + int2_grad1_u12(ipoint,2,p,j) * int2_grad1_u12(ipoint,2,j,i) &                                                                                     
                                                                                  + int2_grad1_u12(ipoint,3,p,j) * int2_grad1_u12(ipoint,3,j,i) )
 
-            tmp4(ipoint,1,p) = tmp4(ipoint,1,p) + 0.5d0 * mos_r_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,p) & 
+            tmpF(ipoint,1,p) = tmpF(ipoint,1,p) + 0.5d0 * mos_r_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,p) & 
                                                                                  + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,p) &                                                                                     
                                                                                  + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,p) )
-            tmp4(ipoint,1,p) = tmp4(ipoint,1,p) + 0.5d0 * mos_r_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,j,i) * int2_grad1_u12(ipoint,1,i,p) & 
+            tmpF(ipoint,1,p) = tmpF(ipoint,1,p) + 0.5d0 * mos_r_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,j,i) * int2_grad1_u12(ipoint,1,i,p) & 
                                                                                  + int2_grad1_u12(ipoint,2,j,i) * int2_grad1_u12(ipoint,2,i,p) &                                                                                     
                                                                                  + int2_grad1_u12(ipoint,3,j,i) * int2_grad1_u12(ipoint,3,i,p) )
           enddo ! ipoint
@@ -566,11 +566,11 @@ subroutine provide_no_1e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r,
         do j = ne_b+1, ne_a
           do ipoint = 1, n_grid
 
-            tmp3(ipoint,2,p) = tmp3(ipoint,2,p) + 0.5d0 * mos_l_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,j) & 
+            tmpE(ipoint,2,p) = tmpE(ipoint,2,p) + 0.5d0 * mos_l_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,j) & 
                                                                                  + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,j) &                                                                                     
                                                                                  + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,j) )
 
-            tmp4(ipoint,1,p) = tmp4(ipoint,1,p) + 0.5d0 * mos_r_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,p) & 
+            tmpF(ipoint,1,p) = tmpF(ipoint,1,p) + 0.5d0 * mos_r_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,p) & 
                                                                                  + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,p) &                                                                                     
                                                                                  + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,p) )
           enddo ! ipoint
@@ -581,13 +581,590 @@ subroutine provide_no_1e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r,
     !$OMP END DO
     !$OMP END PARALLEL
 
-    deallocate(tmp_L0, tmp_L, tmp_R0, tmp_R, tmp_J, tmp_S)
+    deallocate(tmpL0, tmpL, tmpR0, tmpR, tmpJ, tmpS)
 
-    call dgemm( 'T', 'N', mo_num, mo_num, 8*n_grid, 1.d0     &
-              , tmp3(1,1,1), 8*n_grid, tmp4(1,1,1), 8*n_grid &
-              , 1.d0, noL_1e(1,1), mo_num)
+    call dgemm( 'T', 'N', n_mo, n_mo, 8*n_grid, 1.d0         &
+              , tmpE(1,1,1), 8*n_grid, tmpF(1,1,1), 8*n_grid &
+              , 1.d0, noL_1e(1,1), n_mo)
    
-    deallocate(tmp3, tmp4)
+    deallocate(tmpE, tmpF)
+
+  endif
+
+
+  call wall_time(t1)
+  write(*,"(A,2X,F15.7)") ' wall time for noL_1e (sec) = ', (t1 - t0)
+
+  return
+end
+
+! ---
+
+subroutine provide_no_1e_tmp(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r, int2_grad1_u12, &
+                             tmpO, tmpJ, tmpM, tmpS, tmpC, tmpD, tmpL, tmpR, tmpE, tmpF, noL_1e)
+
+
+  implicit none
+
+  integer,          intent(in)  :: n_grid, n_mo
+  integer,          intent(in)  :: ne_a, ne_b
+  double precision, intent(in)  :: wr1(n_grid)
+  double precision, intent(in)  :: mos_l_in_r(n_grid,n_mo)
+  double precision, intent(in)  :: mos_r_in_r(n_grid,n_mo)
+  double precision, intent(in)  :: int2_grad1_u12(n_grid,3,n_mo,n_mo)
+  double precision, intent(out) :: tmpO(n_grid), tmpJ(n_grid,3)
+  double precision, intent(out) :: tmpM(n_grid,3), tmpS(n_grid)
+  double precision, intent(out) :: tmpC(n_grid,4,n_mo,n_mo), tmpD(n_grid,4)
+  double precision, intent(out) :: tmpL(n_grid,3,n_mo), tmpR(n_grid,3,n_mo)
+  double precision, intent(out) :: tmpE(n_grid,5,n_mo), tmpF(n_grid,5,n_mo)
+  double precision, intent(out) :: noL_1e(n_mo,n_mo)
+
+  integer                       :: p, s, i, j, ipoint
+  double precision              :: t0, t1
+  double precision, allocatable :: tmpM_priv(:,:), tmpS_priv(:), tmpO_priv(:), tmpJ_priv(:,:)
+  double precision, allocatable :: tmpL0(:,:,:), tmpR0(:,:,:)
+  double precision, allocatable :: tmpE_os(:,:,:), tmpF_os(:,:,:)
+
+
+  call wall_time(t0)
+
+
+  if(ne_a .eq. ne_b) then
+
+    tmpO = 0.d0
+    tmpJ = 0.d0
+
+    !$OMP PARALLEL                                 &
+    !$OMP DEFAULT(NONE)                            &
+    !$OMP PRIVATE(i, ipoint, tmpO_priv, tmpJ_priv) &
+    !$OMP SHARED(ne_b, n_grid,                     & 
+    !$OMP        mos_l_in_r, mos_r_in_r,           &
+    !$OMP        int2_grad1_u12, tmpO, tmpJ)
+
+    allocate(tmpO_priv(n_grid), tmpJ_priv(n_grid,3))
+    tmpO_priv = 0.d0
+    tmpJ_priv = 0.d0
+  
+    !$OMP DO 
+    do i = 1, ne_b
+      do ipoint = 1, n_grid
+        tmpO_priv(ipoint)   = tmpO_priv(ipoint)   + mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i)
+        tmpJ_priv(ipoint,1) = tmpJ_priv(ipoint,1) + int2_grad1_u12(ipoint,1,i,i)
+        tmpJ_priv(ipoint,2) = tmpJ_priv(ipoint,2) + int2_grad1_u12(ipoint,2,i,i)
+        tmpJ_priv(ipoint,3) = tmpJ_priv(ipoint,3) + int2_grad1_u12(ipoint,3,i,i)
+      enddo
+    enddo
+    !$OMP END DO NOWAIT
+
+    !$OMP CRITICAL
+    tmpO = tmpO + tmpO_priv
+    tmpJ = tmpJ + tmpJ_priv
+    !$OMP END CRITICAL
+
+    deallocate(tmpO_priv, tmpJ_priv)
+    !$OMP END PARALLEL
+
+    ! ---
+
+    tmpM = 0.d0
+    tmpS = 0.d0
+
+    !$OMP PARALLEL                                    &
+    !$OMP DEFAULT(NONE)                               &
+    !$OMP PRIVATE(i, j, ipoint, tmpM_priv, tmpS_priv) &
+    !$OMP SHARED(ne_b, n_grid,                        &
+    !$OMP        mos_l_in_r, mos_r_in_r,              &
+    !$OMP        int2_grad1_u12, tmpM, tmpS)
+
+    allocate(tmpM_priv(n_grid,3), tmpS_priv(n_grid))
+    tmpM_priv = 0.d0
+    tmpS_priv = 0.d0
+  
+    !$OMP DO COLLAPSE(2)
+    do i = 1, ne_b
+      do j = 1, ne_b
+        do ipoint = 1, n_grid
+
+          tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+          tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+          tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+
+          tmpS_priv(ipoint) = tmpS_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) &
+                                                + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) &
+                                                + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i)
+        enddo
+      enddo
+    enddo
+    !$OMP END DO NOWAIT
+
+    !$OMP CRITICAL
+    tmpM = tmpM + tmpM_priv
+    tmpS = tmpS + tmpS_priv
+    !$OMP END CRITICAL
+
+    deallocate(tmpM_priv, tmpS_priv)
+    !$OMP END PARALLEL
+
+    ! ---
+
+    do ipoint = 1, n_grid
+
+      tmpD(ipoint,1) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,1) - tmpM(ipoint,1))
+      tmpD(ipoint,2) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,2) - tmpM(ipoint,2))
+      tmpD(ipoint,3) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,3) - tmpM(ipoint,3))
+      tmpD(ipoint,4) = -wr1(ipoint) * tmpO(ipoint)
+
+      tmpS(ipoint) = 2.d0 * (tmpJ(ipoint,1) * tmpJ(ipoint,1) + tmpJ(ipoint,2) * tmpJ(ipoint,2) + tmpJ(ipoint,3) * tmpJ(ipoint,3)) - tmpS(ipoint)
+    enddo
+
+    !$OMP PARALLEL                   &
+    !$OMP DEFAULT(NONE)              &
+    !$OMP PRIVATE(p, s, i, ipoint)   &
+    !$OMP SHARED(n_mo, ne_b, n_grid, &
+    !$OMP        int2_grad1_u12, tmpC)
+  
+    !$OMP DO COLLAPSE(2)
+    do s = 1, n_mo
+      do p = 1, n_mo
+
+        do ipoint = 1, n_grid
+          tmpC(ipoint,1,p,s) = int2_grad1_u12(ipoint,1,p,s)
+          tmpC(ipoint,2,p,s) = int2_grad1_u12(ipoint,2,p,s)
+          tmpC(ipoint,3,p,s) = int2_grad1_u12(ipoint,3,p,s)
+        enddo
+
+        tmpC(:,4,p,s) = 0.d0
+        do i = 1, ne_b
+          do ipoint = 1, n_grid
+            tmpC(ipoint,4,p,s) = tmpC(ipoint,4,p,s) + int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,s) &
+                                                    + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,s) &
+                                                    + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,s)
+          enddo
+        enddo
+
+      enddo ! p
+    enddo ! s
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+    call dgemv( 'T', 4*n_grid, n_mo*n_mo, 2.d0               &
+              , tmpC(1,1,1,1), size(tmpC, 1) * size(tmpC, 2) &
+              , tmpD(1,1), 1                                 &
+              , 0.d0, noL_1e(1,1), 1)
+
+    ! ---
+
+    !$OMP PARALLEL                       &
+    !$OMP DEFAULT(NONE)                  &
+    !$OMP PRIVATE(p, i, ipoint)          &
+    !$OMP SHARED(ne_b, n_grid, n_mo,     &
+    !$OMP        mos_l_in_r, mos_r_in_r, &
+    !$OMP        int2_grad1_u12, tmpL, tmpR)
+
+    !$OMP DO
+    do p = 1, n_mo
+
+      tmpL(:,1:3,p) = 0.d0
+      tmpR(:,1:3,p) = 0.d0
+
+      do i = 1, ne_b
+        do ipoint = 1, n_grid
+
+          tmpL(ipoint,1,p) = tmpL(ipoint,1,p) + int2_grad1_u12(ipoint,1,p,i) * mos_l_in_r(ipoint,i)
+          tmpL(ipoint,2,p) = tmpL(ipoint,2,p) + int2_grad1_u12(ipoint,2,p,i) * mos_l_in_r(ipoint,i)
+          tmpL(ipoint,3,p) = tmpL(ipoint,3,p) + int2_grad1_u12(ipoint,3,p,i) * mos_l_in_r(ipoint,i)
+
+          tmpR(ipoint,1,p) = tmpR(ipoint,1,p) + int2_grad1_u12(ipoint,1,i,p) * mos_r_in_r(ipoint,i)
+          tmpR(ipoint,2,p) = tmpR(ipoint,2,p) + int2_grad1_u12(ipoint,2,i,p) * mos_r_in_r(ipoint,i)
+          tmpR(ipoint,3,p) = tmpR(ipoint,3,p) + int2_grad1_u12(ipoint,3,i,p) * mos_r_in_r(ipoint,i)
+        enddo
+      enddo
+    enddo ! p
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+    ! ---
+
+    !$OMP PARALLEL                       &
+    !$OMP DEFAULT(NONE)                  &
+    !$OMP PRIVATE(p, i, j, ipoint)       &
+    !$OMP SHARED(ne_b, n_grid, n_mo,     &
+    !$OMP        mos_l_in_r, mos_r_in_r, &
+    !$OMP        int2_grad1_u12, wr1,    &
+    !$OMP        tmpL, tmpR, tmpJ, tmpS, tmpE, tmpF)
+
+    !$OMP DO
+    do p = 1, n_mo
+
+      do ipoint = 1, n_grid
+
+        tmpE(ipoint,1,p) = wr1(ipoint) * mos_l_in_r(ipoint,p)
+        tmpE(ipoint,2,p) = -2.d0 * (tmpL(ipoint,1,p) * tmpJ(ipoint,1) + tmpL(ipoint,2,p) * tmpJ(ipoint,2) + tmpL(ipoint,3,p) * tmpJ(ipoint,3))
+        tmpE(ipoint,3,p) = wr1(ipoint) * tmpL(ipoint,1,p)
+        tmpE(ipoint,4,p) = wr1(ipoint) * tmpL(ipoint,2,p)
+        tmpE(ipoint,5,p) = wr1(ipoint) * tmpL(ipoint,3,p)
+
+        tmpF(ipoint,1,p) = -2.d0 * (tmpR(ipoint,1,p) * tmpJ(ipoint,1) + tmpR(ipoint,2,p) * tmpJ(ipoint,2) + tmpR(ipoint,3,p) * tmpJ(ipoint,3)) &
+                         + mos_r_in_r(ipoint,p) * tmpS(ipoint)
+        tmpF(ipoint,2,p) = wr1(ipoint) * mos_r_in_r(ipoint,p)
+        tmpF(ipoint,3,p) = tmpR(ipoint,1,p)
+        tmpF(ipoint,4,p) = tmpR(ipoint,2,p)
+        tmpF(ipoint,5,p) = tmpR(ipoint,3,p)
+      enddo
+
+      do i = 1, ne_b
+        do j = 1, ne_b
+          do ipoint = 1, n_grid
+
+            tmpE(ipoint,2,p) = tmpE(ipoint,2,p) + mos_l_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,j) & 
+                                                                         + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,j) &                                                                                     
+                                                                         + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,j) )
+
+            tmpF(ipoint,1,p) = tmpF(ipoint,1,p) + mos_r_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,p) & 
+                                                                         + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,p) &                                                                                     
+                                                                         + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,p) )
+          enddo ! ipoint
+        enddo ! j
+      enddo ! i
+
+    enddo ! p
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+    call dgemm( 'T', 'N', n_mo, n_mo, 5*n_grid, 1.d0         &
+              , tmpE(1,1,1), 5*n_grid, tmpF(1,1,1), 5*n_grid &
+              , 1.d0, noL_1e(1,1), n_mo)
+   
+    ! ---
+
+  else
+
+    tmpO = 0.d0
+    tmpJ = 0.d0
+
+    !$OMP PARALLEL                                 &
+    !$OMP DEFAULT(NONE)                            &
+    !$OMP PRIVATE(i, ipoint, tmpO_priv, tmpJ_priv) &
+    !$OMP SHARED(ne_b, ne_a, n_grid,               &
+    !$OMP        mos_l_in_r, mos_r_in_r,           &
+    !$OMP        int2_grad1_u12, tmpO, tmpJ)
+
+    allocate(tmpO_priv(n_grid), tmpJ_priv(n_grid,3))
+    tmpO_priv = 0.d0
+    tmpJ_priv = 0.d0
+  
+    !$OMP DO 
+    do i = 1, ne_b
+      do ipoint = 1, n_grid
+        tmpO_priv(ipoint)   = tmpO_priv(ipoint)   + mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i)
+        tmpJ_priv(ipoint,1) = tmpJ_priv(ipoint,1) + int2_grad1_u12(ipoint,1,i,i)
+        tmpJ_priv(ipoint,2) = tmpJ_priv(ipoint,2) + int2_grad1_u12(ipoint,2,i,i)
+        tmpJ_priv(ipoint,3) = tmpJ_priv(ipoint,3) + int2_grad1_u12(ipoint,3,i,i)
+      enddo
+    enddo
+    !$OMP END DO NOWAIT
+
+    !$OMP DO 
+    do i = ne_b+1, ne_a
+      do ipoint = 1, n_grid
+        tmpO_priv(ipoint)   = tmpO_priv(ipoint)   + 0.5d0 * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i)
+        tmpJ_priv(ipoint,1) = tmpJ_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,i)
+        tmpJ_priv(ipoint,2) = tmpJ_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,i)
+        tmpJ_priv(ipoint,3) = tmpJ_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,i)
+      enddo
+    enddo
+    !$OMP END DO NOWAIT
+
+    !$OMP CRITICAL
+    tmpO = tmpO + tmpO_priv
+    tmpJ = tmpJ + tmpJ_priv
+    !$OMP END CRITICAL
+
+    deallocate(tmpO_priv, tmpJ_priv)
+    !$OMP END PARALLEL
+
+    ! ---
+
+    tmpM = 0.d0
+    tmpS = 0.d0
+
+    !$OMP PARALLEL                                    &
+    !$OMP DEFAULT(NONE)                               &
+    !$OMP PRIVATE(i, j, ipoint, tmpM_priv, tmpS_priv) &
+    !$OMP SHARED(ne_b, ne_a, n_grid,                  &
+    !$OMP        mos_l_in_r, mos_r_in_r,              &
+    !$OMP        int2_grad1_u12, tmpM, tmpS)
+
+    allocate(tmpM_priv(n_grid,3), tmpS_priv(n_grid))
+    tmpM_priv = 0.d0
+    tmpS_priv = 0.d0
+  
+    !$OMP DO COLLAPSE(2)
+    do i = 1, ne_b
+      do j = 1, ne_b
+        do ipoint = 1, n_grid
+
+          tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+          tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+          tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+
+          tmpS_priv(ipoint) = tmpS_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) &
+                                                + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) &
+                                                + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i)
+        enddo
+      enddo
+    enddo
+    !$OMP END DO NOWAIT
+
+    !$OMP DO COLLAPSE(2)
+    do i = ne_b+1, ne_a
+      do j = 1, ne_b
+        do ipoint = 1, n_grid
+
+          tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+          tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+          tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+
+          tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i)
+          tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i)
+          tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i)
+
+          tmpS_priv(ipoint) = tmpS_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) &
+                                                + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) &
+                                                + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i)
+        enddo
+      enddo
+    enddo
+    !$OMP END DO NOWAIT
+
+    !$OMP DO COLLAPSE(2)
+    do i = ne_b+1, ne_a
+      do j = ne_b+1, ne_a
+        do ipoint = 1, n_grid
+
+          tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+          tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+          tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
+
+          tmpS_priv(ipoint) = tmpS_priv(ipoint) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) &
+                                                + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) &
+                                                + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i)
+        enddo
+      enddo
+    enddo
+    !$OMP END DO NOWAIT
+
+    !$OMP CRITICAL
+    tmpM = tmpM + tmpM_priv
+    tmpS = tmpS + tmpS_priv
+    !$OMP END CRITICAL
+
+    deallocate(tmpM_priv, tmpS_priv)
+    !$OMP END PARALLEL
+
+    ! ---
+
+    do ipoint = 1, n_grid
+
+      tmpD(ipoint,1) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,1) - tmpM(ipoint,1))
+      tmpD(ipoint,2) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,2) - tmpM(ipoint,2))
+      tmpD(ipoint,3) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,3) - tmpM(ipoint,3))
+      tmpD(ipoint,4) = -wr1(ipoint) * tmpO(ipoint)
+
+      tmpS(ipoint) = 2.d0 * (tmpJ(ipoint,1) * tmpJ(ipoint,1) + tmpJ(ipoint,2) * tmpJ(ipoint,2) + tmpJ(ipoint,3) * tmpJ(ipoint,3)) - tmpS(ipoint)
+    enddo
+
+    !$OMP PARALLEL                   &
+    !$OMP DEFAULT(NONE)              &
+    !$OMP PRIVATE(p, s, i, ipoint)   &
+    !$OMP SHARED(n_mo, ne_b, n_grid, &
+    !$OMP        ne_a, int2_grad1_u12, tmpC)
+  
+    !$OMP DO COLLAPSE(2)
+    do s = 1, n_mo
+      do p = 1, n_mo
+
+        do ipoint = 1, n_grid
+          tmpC(ipoint,1,p,s) = int2_grad1_u12(ipoint,1,p,s)
+          tmpC(ipoint,2,p,s) = int2_grad1_u12(ipoint,2,p,s)
+          tmpC(ipoint,3,p,s) = int2_grad1_u12(ipoint,3,p,s)
+        enddo
+
+        tmpC(:,4,p,s) = 0.d0
+        do i = 1, ne_b
+          do ipoint = 1, n_grid
+            tmpC(ipoint,4,p,s) = tmpC(ipoint,4,p,s) + int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,s) &
+                                                    + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,s) &
+                                                    + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,s)
+          enddo
+        enddo
+        do i = ne_b+1, ne_a
+          do ipoint = 1, n_grid
+            tmpC(ipoint,4,p,s) = tmpC(ipoint,4,p,s) + 0.5d0 * int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,s) &
+                                                    + 0.5d0 * int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,s) &
+                                                    + 0.5d0 * int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,s)
+          enddo
+        enddo
+
+      enddo ! p
+    enddo ! s
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+    call dgemv( 'T', 4*n_grid, n_mo*n_mo, 2.d0               &
+              , tmpC(1,1,1,1), size(tmpC, 1) * size(tmpC, 2) &
+              , tmpD(1,1), 1                                 &
+              , 0.d0, noL_1e(1,1), 1)
+
+    ! ---
+
+    allocate(tmpL0(n_grid,3,n_mo))
+    allocate(tmpR0(n_grid,3,n_mo))
+
+    !$OMP PARALLEL                         &
+    !$OMP DEFAULT(NONE)                    &
+    !$OMP PRIVATE(p, i, ipoint)            &
+    !$OMP SHARED(ne_b, ne_a, n_grid, n_mo, &
+    !$OMP        mos_l_in_r, mos_r_in_r,   &
+    !$OMP        int2_grad1_u12, tmpL0, tmpR0, tmpL, tmpR)
+
+    !$OMP DO
+    do p = 1, n_mo
+
+      tmpL0(:,1:3,p) = 0.d0
+      tmpR0(:,1:3,p) = 0.d0
+      do i = ne_b+1, ne_a
+        do ipoint = 1, n_grid
+
+          tmpL0(ipoint,1,p) = tmpL0(ipoint,1,p) + 0.5d0 * int2_grad1_u12(ipoint,1,p,i) * mos_l_in_r(ipoint,i)
+          tmpL0(ipoint,2,p) = tmpL0(ipoint,2,p) + 0.5d0 * int2_grad1_u12(ipoint,2,p,i) * mos_l_in_r(ipoint,i)
+          tmpL0(ipoint,3,p) = tmpL0(ipoint,3,p) + 0.5d0 * int2_grad1_u12(ipoint,3,p,i) * mos_l_in_r(ipoint,i)
+                                     
+          tmpR0(ipoint,1,p) = tmpR0(ipoint,1,p) + 0.5d0 * int2_grad1_u12(ipoint,1,i,p) * mos_r_in_r(ipoint,i)
+          tmpR0(ipoint,2,p) = tmpR0(ipoint,2,p) + 0.5d0 * int2_grad1_u12(ipoint,2,i,p) * mos_r_in_r(ipoint,i)
+          tmpR0(ipoint,3,p) = tmpR0(ipoint,3,p) + 0.5d0 * int2_grad1_u12(ipoint,3,i,p) * mos_r_in_r(ipoint,i)
+        enddo
+      enddo
+
+      tmpL(:,1:3,p) = tmpL0(:,1:3,p)
+      tmpR(:,1:3,p) = tmpR0(:,1:3,p)
+      do i = 1, ne_b
+        do ipoint = 1, n_grid
+
+          tmpL(ipoint,1,p) = tmpL(ipoint,1,p) + int2_grad1_u12(ipoint,1,p,i) * mos_l_in_r(ipoint,i)
+          tmpL(ipoint,2,p) = tmpL(ipoint,2,p) + int2_grad1_u12(ipoint,2,p,i) * mos_l_in_r(ipoint,i)
+          tmpL(ipoint,3,p) = tmpL(ipoint,3,p) + int2_grad1_u12(ipoint,3,p,i) * mos_l_in_r(ipoint,i)
+                                   
+          tmpR(ipoint,1,p) = tmpR(ipoint,1,p) + int2_grad1_u12(ipoint,1,i,p) * mos_r_in_r(ipoint,i)
+          tmpR(ipoint,2,p) = tmpR(ipoint,2,p) + int2_grad1_u12(ipoint,2,i,p) * mos_r_in_r(ipoint,i)
+          tmpR(ipoint,3,p) = tmpR(ipoint,3,p) + int2_grad1_u12(ipoint,3,i,p) * mos_r_in_r(ipoint,i)
+        enddo
+      enddo
+
+    enddo ! p
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+    ! ---
+
+    allocate(tmpE_os(n_grid,8,n_mo))
+    allocate(tmpF_os(n_grid,8,n_mo))
+
+    !$OMP PARALLEL                         &
+    !$OMP DEFAULT(NONE)                    &
+    !$OMP PRIVATE(p, i, j, ipoint)         &
+    !$OMP SHARED(ne_b, ne_a, n_grid, n_mo, & 
+    !$OMP        mos_l_in_r, mos_r_in_r,   &
+    !$OMP        int2_grad1_u12, wr1,      &
+    !$OMP        tmpL, tmpL0, tmpR, tmpR0, tmpJ, tmpS, tmpE_os, tmpF_os)
+
+    !$OMP DO
+    do p = 1, n_mo
+
+      do ipoint = 1, n_grid
+
+        tmpE_os(ipoint,1,p) = wr1(ipoint) * mos_l_in_r(ipoint,p)
+        tmpE_os(ipoint,2,p) = -2.d0 * (tmpL(ipoint,1,p) * tmpJ(ipoint,1) + tmpL(ipoint,2,p) * tmpJ(ipoint,2) + tmpL(ipoint,3,p) * tmpJ(ipoint,3))
+        tmpE_os(ipoint,3,p) = wr1(ipoint) * tmpL(ipoint,1,p)
+        tmpE_os(ipoint,4,p) = wr1(ipoint) * tmpL(ipoint,2,p)
+        tmpE_os(ipoint,5,p) = wr1(ipoint) * tmpL(ipoint,3,p)
+        tmpE_os(ipoint,6,p) = wr1(ipoint) * tmpL0(ipoint,1,p)
+        tmpE_os(ipoint,7,p) = wr1(ipoint) * tmpL0(ipoint,2,p)
+        tmpE_os(ipoint,8,p) = wr1(ipoint) * tmpL0(ipoint,3,p)
+
+        tmpF_os(ipoint,1,p) = -2.d0 * (tmpR(ipoint,1,p) * tmpJ(ipoint,1) + tmpR(ipoint,2,p) * tmpJ(ipoint,2) + tmpR(ipoint,3,p) * tmpJ(ipoint,3)) &
+                            + mos_r_in_r(ipoint,p) * tmpS(ipoint)
+        tmpF_os(ipoint,2,p) = wr1(ipoint) * mos_r_in_r(ipoint,p)
+        tmpF_os(ipoint,3,p) = tmpR(ipoint,1,p)
+        tmpF_os(ipoint,4,p) = tmpR(ipoint,2,p)
+        tmpF_os(ipoint,5,p) = tmpR(ipoint,3,p)
+        tmpF_os(ipoint,6,p) = tmpR0(ipoint,1,p)
+        tmpF_os(ipoint,7,p) = tmpR0(ipoint,2,p)
+        tmpF_os(ipoint,8,p) = tmpR0(ipoint,3,p)
+      enddo
+
+      do i = 1, ne_b
+        do j = 1, ne_b
+          do ipoint = 1, n_grid
+
+            tmpE_os(ipoint,2,p) = tmpE_os(ipoint,2,p) + mos_l_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,j) & 
+                                                                               + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,j) &                                                                                     
+                                                                               + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,j) )
+
+            tmpF_os(ipoint,1,p) = tmpF_os(ipoint,1,p) + mos_r_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,p) & 
+                                                                               + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,p) &                                                                                     
+                                                                               + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,p) )
+          enddo ! ipoint
+        enddo ! j
+      enddo ! i
+
+      do i = ne_b+1, ne_a
+        do j = 1, ne_b
+          do ipoint = 1, n_grid
+
+            tmpE_os(ipoint,2,p) = tmpE_os(ipoint,2,p) + 0.5d0 * mos_l_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,j) & 
+                                                                                       + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,j) &                                                                                     
+                                                                                       + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,j) )
+            tmpE_os(ipoint,2,p) = tmpE_os(ipoint,2,p) + 0.5d0 * mos_l_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,p,j) * int2_grad1_u12(ipoint,1,j,i) & 
+                                                                                       + int2_grad1_u12(ipoint,2,p,j) * int2_grad1_u12(ipoint,2,j,i) &                                                                                     
+                                                                                       + int2_grad1_u12(ipoint,3,p,j) * int2_grad1_u12(ipoint,3,j,i) )
+
+            tmpF_os(ipoint,1,p) = tmpF_os(ipoint,1,p) + 0.5d0 * mos_r_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,p) & 
+                                                                                       + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,p) &                                                                                     
+                                                                                       + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,p) )
+            tmpF_os(ipoint,1,p) = tmpF_os(ipoint,1,p) + 0.5d0 * mos_r_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,j,i) * int2_grad1_u12(ipoint,1,i,p) & 
+                                                                                       + int2_grad1_u12(ipoint,2,j,i) * int2_grad1_u12(ipoint,2,i,p) &                                                                                     
+                                                                                       + int2_grad1_u12(ipoint,3,j,i) * int2_grad1_u12(ipoint,3,i,p) )
+          enddo ! ipoint
+        enddo ! j
+      enddo ! i
+
+      do i = ne_b+1, ne_a
+        do j = ne_b+1, ne_a
+          do ipoint = 1, n_grid
+
+            tmpE_os(ipoint,2,p) = tmpE_os(ipoint,2,p) + 0.5d0 * mos_l_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,j) & 
+                                                                                       + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,j) &                                                                                     
+                                                                                       + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,j) )
+
+            tmpF_os(ipoint,1,p) = tmpF_os(ipoint,1,p) + 0.5d0 * mos_r_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,p) & 
+                                                                                       + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,p) &                                                                                     
+                                                                                       + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,p) )
+          enddo ! ipoint
+        enddo ! j
+      enddo ! i
+
+    enddo ! p
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+    deallocate(tmpL0, tmpR0)
+
+    call dgemm( 'T', 'N', n_mo, n_mo, 8*n_grid, 1.d0               &
+              , tmpE_os(1,1,1), 8*n_grid, tmpF_os(1,1,1), 8*n_grid &
+              , 1.d0, noL_1e(1,1), n_mo)
+   
+    deallocate(tmpE_os, tmpF_os)
 
   endif