From c9caec5f7e8c9faa8b503553bb7895f48b04bcb2 Mon Sep 17 00:00:00 2001
From: AbdAmmar <abd.ammar.phys@gmail.com>
Date: Sun, 4 Feb 2024 13:22:26 +0100
Subject: [PATCH] added Mu_Nu Jastrow

---
 plugins/local/jastrow/EZFIO.cfg               |   7 +
 plugins/local/non_h_ints_mu/jast_1e.irp.f     | 145 +----------
 .../local/non_h_ints_mu/jast_1e_utils.irp.f   |  38 ++-
 .../local/non_h_ints_mu/jast_2e_utils.irp.f   | 195 +++++++++++----
 .../non_h_ints_mu/jast_deriv_utils_vect.irp.f | 225 +++++++++++++++++-
 .../local/non_h_ints_mu/test_non_h_ints.irp.f |  92 ++++++-
 .../local/non_h_ints_mu/total_tc_int.irp.f    |   9 +-
 7 files changed, 494 insertions(+), 217 deletions(-)

diff --git a/plugins/local/jastrow/EZFIO.cfg b/plugins/local/jastrow/EZFIO.cfg
index c3ed29a3..23dde8ea 100644
--- a/plugins/local/jastrow/EZFIO.cfg
+++ b/plugins/local/jastrow/EZFIO.cfg
@@ -144,3 +144,10 @@ interface: ezfio,provider,ocaml
 default: 1.0
 ezfio_name: a_boys
 
+[nu_erf]
+type: double precision
+doc: e-e correlation in the core
+interface: ezfio,provider,ocaml
+default: 1.0
+ezfio_name: nu_erf
+
diff --git a/plugins/local/non_h_ints_mu/jast_1e.irp.f b/plugins/local/non_h_ints_mu/jast_1e.irp.f
index 1fc2fd2b..e994d27a 100644
--- a/plugins/local/non_h_ints_mu/jast_1e.irp.f
+++ b/plugins/local/non_h_ints_mu/jast_1e.irp.f
@@ -78,7 +78,7 @@ END_PROVIDER
   double precision              :: cx, cy, cz
   double precision              :: time0, time1
   double precision, allocatable :: Pa(:,:), Pb(:,:), Pt(:,:)
-  double precision, allocatable :: coef_fit(:), coef_fit2(:,:), coef_fit3(:,:)
+  double precision, allocatable :: coef_fit2(:,:)
 
   PROVIDE j1e_type
 
@@ -163,75 +163,6 @@ END_PROVIDER
 
     deallocate(Pa, Pb, Pt)
 
-!  elseif(j1e_type .eq. "Charge_Harmonizer_AO") then
-!
-!    ! \grad_1 \sum_{\eta} C_{\eta} \chi_{\eta}
-!    ! where 
-!    !       \chi_{\eta} are the AOs
-!    !       C_{\eta} are fitted to mimic (j1e_type .eq. "Charge_Harmonizer")
-!    !
-!    ! The - sign is in the parameters C_{\eta}
-!
-!    PROVIDE aos_grad_in_r_array
-!
-!    allocate(coef_fit(ao_num))
-!
-!    if(mpi_master) then
-!      call ezfio_has_jastrow_j1e_coef_ao(exists)
-!    endif 
-!    IRP_IF MPI_DEBUG
-!      print *,  irp_here, mpi_rank
-!      call MPI_BARRIER(MPI_COMM_WORLD, ierr)
-!    IRP_ENDIF
-!    IRP_IF MPI
-!      include 'mpif.h'
-!      call MPI_BCAST(coef_fit, ao_num, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
-!      if (ierr /= MPI_SUCCESS) then
-!        stop 'Unable to read j1e_coef_ao with MPI'
-!      endif
-!    IRP_ENDIF
-!    if(exists) then
-!      if(mpi_master) then
-!        write(6,'(A)') '.. >>>>> [ IO READ: j1e_coef_ao ] <<<<< ..'
-!        call ezfio_get_jastrow_j1e_coef_ao(coef_fit)
-!        IRP_IF MPI
-!          call MPI_BCAST(coef_fit, ao_num, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
-!          if (ierr /= MPI_SUCCESS) then
-!            stop 'Unable to read j1e_coef_ao with MPI'
-!          endif
-!        IRP_ENDIF
-!      endif
-!    else
-!
-!      call get_j1e_coef_fit_ao(ao_num, coef_fit)
-!      call ezfio_set_jastrow_j1e_coef_ao(coef_fit)
-!
-!    endif
-!
-!    !$OMP PARALLEL                               &
-!    !$OMP DEFAULT (NONE)                         &
-!    !$OMP PRIVATE (i, ipoint, c)                 &
-!    !$OMP SHARED (n_points_final_grid, ao_num,   &
-!    !$OMP         aos_grad_in_r_array, coef_fit, &
-!    !$OMP         j1e_gradx, j1e_grady, j1e_gradz)
-!    !$OMP DO SCHEDULE (static)
-!    do ipoint = 1, n_points_final_grid
-!
-!      j1e_gradx(ipoint) = 0.d0
-!      j1e_grady(ipoint) = 0.d0
-!      j1e_gradz(ipoint) = 0.d0
-!      do i = 1, ao_num
-!        c = coef_fit(i)
-!        j1e_gradx(ipoint) = j1e_gradx(ipoint) + c * aos_grad_in_r_array(i,ipoint,1)
-!        j1e_grady(ipoint) = j1e_grady(ipoint) + c * aos_grad_in_r_array(i,ipoint,2)
-!        j1e_gradz(ipoint) = j1e_gradz(ipoint) + c * aos_grad_in_r_array(i,ipoint,3)
-!      enddo
-!    enddo
-!    !$OMP END DO
-!    !$OMP END PARALLEL
-!
-!    deallocate(coef_fit)
-
   elseif(j1e_type .eq. "Charge_Harmonizer_AO") then
 
     ! \grad_1 \sum_{\eta,\beta} C_{\eta,\beta} \chi_{\eta} \chi_{\beta}
@@ -271,10 +202,8 @@ END_PROVIDER
         IRP_ENDIF
       endif
     else
-
       call get_j1e_coef_fit_ao2(ao_num, coef_fit2)
       call ezfio_set_jastrow_j1e_coef_ao2(coef_fit2)
-
     endif
 
     !$OMP PARALLEL                                &
@@ -305,78 +234,6 @@ END_PROVIDER
 
     deallocate(coef_fit2)
 
-!  elseif(j1e_type .eq. "Charge_Harmonizer_AO3") then
-!
-!    ! \sum_{\eta} \vec{C}_{\eta} \chi_{\eta}
-!    ! where 
-!    !       \chi_{\eta} are the AOs
-!    !       \vec{C}_{\eta} are fitted to mimic (j1e_type .eq. "Charge_Harmonizer")
-!    !
-!    ! The - sign is in the parameters \vec{C}_{\eta}
-!
-!    PROVIDE aos_grad_in_r_array
-!
-!    allocate(coef_fit3(ao_num,3))
-!
-!    if(mpi_master) then
-!      call ezfio_has_jastrow_j1e_coef_ao3(exists)
-!    endif 
-!    IRP_IF MPI_DEBUG
-!      print *,  irp_here, mpi_rank
-!      call MPI_BARRIER(MPI_COMM_WORLD, ierr)
-!    IRP_ENDIF
-!    IRP_IF MPI
-!      !include 'mpif.h'
-!      call MPI_BCAST(coef_fit3, (ao_num*3), MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
-!      if (ierr /= MPI_SUCCESS) then
-!        stop 'Unable to read j1e_coef_ao3 with MPI'
-!      endif
-!    IRP_ENDIF
-!    if(exists) then
-!      if(mpi_master) then
-!        write(6,'(A)') '.. >>>>> [ IO READ: j1e_coef_ao3 ] <<<<< ..'
-!        call ezfio_get_jastrow_j1e_coef_ao3(coef_fit3)
-!        IRP_IF MPI
-!          call MPI_BCAST(coef_fit3, (ao_num*3), MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
-!          if (ierr /= MPI_SUCCESS) then
-!            stop 'Unable to read j1e_coef_ao3 with MPI'
-!          endif
-!        IRP_ENDIF
-!      endif
-!    else
-!
-!      call get_j1e_coef_fit_ao3(ao_num, coef_fit3)
-!      call ezfio_set_jastrow_j1e_coef_ao3(coef_fit3)
-!
-!    endif
-!
-!    !$OMP PARALLEL                                &
-!    !$OMP DEFAULT (NONE)                          &
-!    !$OMP PRIVATE (i, ipoint, cx, cy, cz)         &
-!    !$OMP SHARED (n_points_final_grid, ao_num,    &
-!    !$OMP         aos_grad_in_r_array, coef_fit3, &
-!    !$OMP         aos_in_r_array, j1e_gradx, j1e_grady, j1e_gradz)
-!    !$OMP DO SCHEDULE (static)
-!    do ipoint = 1, n_points_final_grid
-!
-!      j1e_gradx(ipoint) = 0.d0
-!      j1e_grady(ipoint) = 0.d0
-!      j1e_gradz(ipoint) = 0.d0
-!      do i = 1, ao_num
-!        cx = coef_fit3(i,1)
-!        cy = coef_fit3(i,2)
-!        cz = coef_fit3(i,3)
-!
-!        j1e_gradx(ipoint) += cx * aos_in_r_array(i,ipoint)
-!        j1e_grady(ipoint) += cy * aos_in_r_array(i,ipoint)
-!        j1e_gradz(ipoint) += cz * aos_in_r_array(i,ipoint)
-!      enddo
-!    enddo
-!    !$OMP END DO
-!    !$OMP END PARALLEL
-!
-!    deallocate(coef_fit3)
-
   else
 
     print *, ' Error in j1e_grad: Unknown j1e_type = ', j1e_type
diff --git a/plugins/local/non_h_ints_mu/jast_1e_utils.irp.f b/plugins/local/non_h_ints_mu/jast_1e_utils.irp.f
index 79f780b1..7aa85148 100644
--- a/plugins/local/non_h_ints_mu/jast_1e_utils.irp.f
+++ b/plugins/local/non_h_ints_mu/jast_1e_utils.irp.f
@@ -128,7 +128,8 @@ subroutine get_j1e_coef_fit_ao2(dim_fit, coef_fit)
   double precision              :: g
   double precision              :: t0, t1
   double precision              :: cutoff_svd, D1_inv
-  double precision, allocatable :: A(:,:,:,:), b(:)
+  double precision              :: accu, norm, diff
+  double precision, allocatable :: A(:,:,:,:), b(:), A_tmp(:,:,:,:)
   double precision, allocatable :: Pa(:,:), Pb(:,:), Pt(:,:)
   double precision, allocatable :: u1e_tmp(:), tmp(:,:,:)
   double precision, allocatable :: U(:,:), D(:), Vt(:,:), work(:)
@@ -197,6 +198,9 @@ subroutine get_j1e_coef_fit_ao2(dim_fit, coef_fit)
             , tmp(1,1,1), n_points_final_grid, tmp(1,1,1), n_points_final_grid  &
             , 0.d0, A(1,1,1,1), ao_num*ao_num)
 
+  allocate(A_tmp(ao_num,ao_num,ao_num,ao_num))
+  A_tmp = A
+
   ! --- --- ---
   ! get b
 
@@ -217,11 +221,6 @@ subroutine get_j1e_coef_fit_ao2(dim_fit, coef_fit)
   ! --- --- ---
   ! solve Ax = b
 
-!  double precision, allocatable :: A_inv(:,:,:,:)
-!  allocate(A_inv(ao_num,ao_num,ao_num,ao_num))
-!  call get_pseudo_inverse(A(1,1,1,1), ao_num*ao_num, ao_num*ao_num, ao_num*ao_num, A_inv(1,1,1,1), ao_num*ao_num, cutoff_svd)
-!  A = A_inv
-
   allocate(D(ao_num*ao_num), U(ao_num*ao_num,ao_num*ao_num), Vt(ao_num*ao_num,ao_num*ao_num))
 
   allocate(work(1))
@@ -287,9 +286,30 @@ subroutine get_j1e_coef_fit_ao2(dim_fit, coef_fit)
 
   ! coef_fit = A_inv x b
   call dgemv("N", ao_num*ao_num, ao_num*ao_num, 1.d0, A(1,1,1,1), ao_num*ao_num, b(1), 1, 0.d0, coef_fit(1,1), 1)
-  !call dgemm( "N", "N", ao_num*ao_num, 1, ao_num*ao_num, 1.d0 &
-  !          , A(1,1,1,1), ao_num*ao_num, b(1), ao_num*ao_num  &
-  !          , 0.d0, coef_fit(1,1), ao_num*ao_num)
+
+  ! ---
+
+  accu = 0.d0
+  norm = 0.d0
+  do k = 1, ao_num
+    do l = 1, ao_num
+      kl = (k-1)*ao_num + l
+      diff = 0.d0
+      do i = 1, ao_num
+        do j = 1, ao_num
+          diff += A_tmp(k,l,i,j) * coef_fit(j,i)
+        enddo
+      enddo
+
+      !print*, kl, b(kl)
+      accu += dabs(diff - b(kl))
+      norm += dabs(b(kl))
+    enddo
+  enddo
+  print*, ' accu total on Ax = b (%) = ', 100.d0*accu/norm
+  deallocate(A_tmp)
+
+  ! ---
 
   deallocate(A, b)
 
diff --git a/plugins/local/non_h_ints_mu/jast_2e_utils.irp.f b/plugins/local/non_h_ints_mu/jast_2e_utils.irp.f
index 34c45df9..34d01fb2 100644
--- a/plugins/local/non_h_ints_mu/jast_2e_utils.irp.f
+++ b/plugins/local/non_h_ints_mu/jast_2e_utils.irp.f
@@ -12,12 +12,17 @@ BEGIN_PROVIDER [double precision, int2_u2e_ao, (ao_num, ao_num, n_points_final_g
   END_DOC
 
   implicit none
-  integer          :: ipoint, i, j, jpoint
-  double precision :: time0, time1
-  double precision :: x, y, z, r2 
-  double precision :: dx, dy, dz
-  double precision :: tmp_ct
-  double precision :: tmp0, tmp1, tmp2, tmp3
+  integer                       :: ipoint, i, j, jpoint
+  integer                       :: n_blocks, n_rest, n_pass
+  integer                       :: i_blocks, i_rest, i_pass, ii
+  double precision              :: mem, n_double
+  double precision              :: time0, time1
+  double precision              :: x, y, z, r2 
+  double precision              :: dx, dy, dz
+  double precision              :: tmp_ct
+  double precision              :: tmp0, tmp1, tmp2, tmp3
+  double precision, allocatable :: tmp(:,:,:)
+  double precision, allocatable :: tmp_u12(:,:)
 
   PROVIDE j2e_type
   PROVIDE Env_type
@@ -25,59 +30,152 @@ BEGIN_PROVIDER [double precision, int2_u2e_ao, (ao_num, ao_num, n_points_final_g
   call wall_time(time0)
   print*, ' providing int2_u2e_ao ...'
 
-  if( (j2e_type .eq. "Mu") .and. &
-      ( (env_type .eq. "None") .or. (env_type .eq. "Prod_Gauss") .or. (env_type .eq. "Sum_Gauss") ) ) then
+  if(tc_integ_type .eq. "numeric") then
 
-    PROVIDE mu_erf
-    PROVIDE env_type env_val
-    PROVIDE Ir2_Mu_long_Du_0 Ir2_Mu_long_Du_x Ir2_Mu_long_Du_y Ir2_Mu_long_Du_z Ir2_Mu_long_Du_2
-    PROVIDE Ir2_Mu_gauss_Du
+    PROVIDE final_weight_at_r_vector_extra aos_in_r_array_extra
 
-    tmp_ct = 0.5d0 / (dsqrt(dacos(-1.d0)) * mu_erf)
-
-    !$OMP PARALLEL                                                &
-    !$OMP DEFAULT (NONE)                                          &
-    !$OMP PRIVATE (ipoint, i, j, x, y, z, r2, dx, dy, dz,         &
-    !$OMP         tmp0, tmp1, tmp2, tmp3)                         & 
-    !$OMP SHARED (ao_num, n_points_final_grid, final_grid_points, &
-    !$OMP         tmp_ct, env_val, Ir2_Mu_long_Du_0,              &
-    !$OMP         Ir2_Mu_long_Du_x, Ir2_Mu_long_Du_y,             &
-    !$OMP         Ir2_Mu_long_Du_z, Ir2_Mu_gauss_Du,              &
-    !$OMP         Ir2_Mu_long_Du_2, int2_u2e_ao)
+    allocate(tmp(n_points_extra_final_grid,ao_num,ao_num))
+    !$OMP PARALLEL               &
+    !$OMP DEFAULT (NONE)         &
+    !$OMP PRIVATE (j, i, jpoint) &
+    !$OMP SHARED (tmp, ao_num, n_points_extra_final_grid, final_weight_at_r_vector_extra, aos_in_r_array_extra_transp)
     !$OMP DO SCHEDULE (static)
-    do ipoint = 1, n_points_final_grid
-
-      x  = final_grid_points(1,ipoint)
-      y  = final_grid_points(2,ipoint)
-      z  = final_grid_points(3,ipoint)
-      r2 = x*x + y*y + z*z
-
-      dx = x * env_val(ipoint)
-      dy = y * env_val(ipoint)
-      dz = z * env_val(ipoint)
-
-      tmp0 = 0.5d0 * env_val(ipoint) * r2
-      tmp1 = 0.5d0 * env_val(ipoint)
-      tmp3 = tmp_ct * env_val(ipoint)
-
-      do j = 1, ao_num
-        do i = 1, ao_num
-
-          tmp2 = tmp1 * Ir2_Mu_long_Du_2(i,j,ipoint) - dx * Ir2_Mu_long_Du_x(i,j,ipoint) - dy * Ir2_Mu_long_Du_y(i,j,ipoint) - dz * Ir2_Mu_long_Du_z(i,j,ipoint)
-
-          int2_u2e_ao(i,j,ipoint) = tmp0 * Ir2_Mu_long_Du_0(i,j,ipoint) + tmp2 - tmp3 * Ir2_Mu_gauss_Du(i,j,ipoint)
+    do j = 1, ao_num
+      do i = 1, ao_num
+        do jpoint = 1, n_points_extra_final_grid
+          tmp(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j)
         enddo
       enddo
     enddo
     !$OMP END DO
     !$OMP END PARALLEL
 
-  else 
+    call total_memory(mem)
+    mem      = max(1.d0, qp_max_mem - mem)
+    n_double = mem * 1.d8
+    n_blocks = int(min(n_double / (n_points_extra_final_grid * 1.d0), 1.d0*n_points_final_grid))
+    n_rest   = int(mod(n_points_final_grid, n_blocks))
+    n_pass   = int((n_points_final_grid - n_rest) / n_blocks)
 
-    print *, ' Error in int2_u2e_ao: Unknown Jastrow'
+    call write_int(6, n_pass, 'Number of passes')
+    call write_int(6, n_blocks, 'Size of the blocks')
+    call write_int(6, n_rest, 'Size of the last block')
+
+    allocate(tmp_u12(n_points_extra_final_grid,n_blocks))
+
+    do i_pass = 1, n_pass
+      ii = (i_pass-1)*n_blocks + 1
+
+      !$OMP PARALLEL                                         &
+      !$OMP DEFAULT (NONE)                                   &
+      !$OMP PRIVATE (i_blocks, ipoint)                       &
+      !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, &
+      !$OMP         final_grid_points, tmp_u12)
+      !$OMP DO 
+      do i_blocks = 1, n_blocks
+        ipoint = ii - 1 + i_blocks ! r1
+        call get_u12_2e_r1_seq(ipoint, n_points_extra_final_grid, tmp_u12(1,i_blocks))
+      enddo
+      !$OMP END DO
+      !$OMP END PARALLEL
+
+      call dgemm( "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0             &
+                , tmp(1,1,1), n_points_extra_final_grid, tmp_u12(1,1), n_points_extra_final_grid &
+                , 0.d0, int2_u2e_ao(1,1,ii), ao_num*ao_num)
+    enddo
+
+    deallocate(tmp_u12)
+
+    if(n_rest .gt. 0) then
+
+      allocate(tmp_u12(n_points_extra_final_grid,n_rest))
+
+      ii = n_pass*n_blocks + 1
+
+      !$OMP PARALLEL                                       &
+      !$OMP DEFAULT (NONE)                                 &
+      !$OMP PRIVATE (i_rest, ipoint)                       &
+      !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, &
+      !$OMP         final_grid_points, tmp_u12)
+      !$OMP DO 
+      do i_rest = 1, n_rest
+        ipoint = ii - 1 + i_rest ! r1
+        call get_u12_2e_r1_seq(ipoint, n_points_extra_final_grid, tmp_u12(1,i_rest))
+      enddo
+      !$OMP END DO
+      !$OMP END PARALLEL
+
+      call dgemm( "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0               &
+                , tmp(1,1,1), n_points_extra_final_grid, tmp_u12(1,1), n_points_extra_final_grid &
+                , 0.d0, int2_u2e_ao(1,1,ii), ao_num*ao_num)
+
+      deallocate(tmp_u12)
+    endif
+
+    deallocate(tmp)
+
+  elseif(tc_integ_type .eq. "semi-analytic") then
+
+    if( (j2e_type .eq. "Mu") .and. &
+        ( (env_type .eq. "None") .or. (env_type .eq. "Prod_Gauss") .or. (env_type .eq. "Sum_Gauss") ) ) then
+  
+      PROVIDE mu_erf
+      PROVIDE env_type env_val
+      PROVIDE Ir2_Mu_long_Du_0 Ir2_Mu_long_Du_x Ir2_Mu_long_Du_y Ir2_Mu_long_Du_z Ir2_Mu_long_Du_2
+      PROVIDE Ir2_Mu_gauss_Du
+  
+      tmp_ct = 0.5d0 / (dsqrt(dacos(-1.d0)) * mu_erf)
+  
+      !$OMP PARALLEL                                                &
+      !$OMP DEFAULT (NONE)                                          &
+      !$OMP PRIVATE (ipoint, i, j, x, y, z, r2, dx, dy, dz,         &
+      !$OMP         tmp0, tmp1, tmp2, tmp3)                         & 
+      !$OMP SHARED (ao_num, n_points_final_grid, final_grid_points, &
+      !$OMP         tmp_ct, env_val, Ir2_Mu_long_Du_0,              &
+      !$OMP         Ir2_Mu_long_Du_x, Ir2_Mu_long_Du_y,             &
+      !$OMP         Ir2_Mu_long_Du_z, Ir2_Mu_gauss_Du,              &
+      !$OMP         Ir2_Mu_long_Du_2, int2_u2e_ao)
+      !$OMP DO SCHEDULE (static)
+      do ipoint = 1, n_points_final_grid
+  
+        x  = final_grid_points(1,ipoint)
+        y  = final_grid_points(2,ipoint)
+        z  = final_grid_points(3,ipoint)
+        r2 = x*x + y*y + z*z
+  
+        dx = x * env_val(ipoint)
+        dy = y * env_val(ipoint)
+        dz = z * env_val(ipoint)
+  
+        tmp0 = 0.5d0 * env_val(ipoint) * r2
+        tmp1 = 0.5d0 * env_val(ipoint)
+        tmp3 = tmp_ct * env_val(ipoint)
+  
+        do j = 1, ao_num
+          do i = 1, ao_num
+  
+            tmp2 = tmp1 * Ir2_Mu_long_Du_2(i,j,ipoint) - dx * Ir2_Mu_long_Du_x(i,j,ipoint) - dy * Ir2_Mu_long_Du_y(i,j,ipoint) - dz * Ir2_Mu_long_Du_z(i,j,ipoint)
+  
+            int2_u2e_ao(i,j,ipoint) = tmp0 * Ir2_Mu_long_Du_0(i,j,ipoint) + tmp2 - tmp3 * Ir2_Mu_gauss_Du(i,j,ipoint)
+          enddo
+        enddo
+      enddo
+      !$OMP END DO
+      !$OMP END PARALLEL
+  
+    else 
+  
+      print *, ' Error in int2_u2e_ao: Unknown Jastrow'
+      stop
+  
+    endif ! j2e_type
+
+  else 
+  
+    print *, ' Error in int2_u2e_ao: Unknown tc_integ_type'
     stop
 
-  endif ! j2e_type
+  endif ! tc_integ_type
 
   call wall_time(time1)
   print*, ' wall time for int2_u2e_ao (min) =', (time1-time0)/60.d0
@@ -139,11 +237,10 @@ BEGIN_PROVIDER [double precision, int2_grad1_u2e_ao, (ao_num, ao_num, n_points_f
     !$OMP END DO
     !$OMP END PARALLEL
 
-    ! n_points_final_grid = n_blocks * n_pass + n_rest
     call total_memory(mem)
     mem      = max(1.d0, qp_max_mem - mem)
     n_double = mem * 1.d8
-    n_blocks = int(min(n_double / (n_points_extra_final_grid * 4.d0), 1.d0*n_points_final_grid))
+    n_blocks = int(min(n_double / (n_points_extra_final_grid * 3.d0), 1.d0*n_points_final_grid))
     n_rest   = int(mod(n_points_final_grid, n_blocks))
     n_pass   = int((n_points_final_grid - n_rest) / n_blocks)
 
diff --git a/plugins/local/non_h_ints_mu/jast_deriv_utils_vect.irp.f b/plugins/local/non_h_ints_mu/jast_deriv_utils_vect.irp.f
index 9a5e35c6..ffb7b513 100644
--- a/plugins/local/non_h_ints_mu/jast_deriv_utils_vect.irp.f
+++ b/plugins/local/non_h_ints_mu/jast_deriv_utils_vect.irp.f
@@ -19,11 +19,13 @@ subroutine get_grad1_u12_withsq_r1_seq(ipoint, n_grid2, resx, resy, resz, res)
   double precision              :: env_r1, tmp
   double precision              :: grad1_env(3), r1(3)
   double precision, allocatable :: env_r2(:)
-  double precision, allocatable :: u2b_r12(:)
-  double precision, allocatable :: gradx1_u2b(:), grady1_u2b(:), gradz1_u2b(:)
+  double precision, allocatable :: u2b_r12(:), gradx1_u2b(:), grady1_u2b(:), gradz1_u2b(:)
+  double precision, allocatable :: u2b_mu(:), gradx1_mu(:), grady1_mu(:), gradz1_mu(:)
+  double precision, allocatable :: u2b_nu(:), gradx1_nu(:), grady1_nu(:), gradz1_nu(:)
   double precision, external    :: env_nucl
 
   PROVIDE j1e_type j2e_type env_type
+  PROVIDE mu_erf nu_erf a_boys
   PROVIDE final_grid_points
   PROVIDE final_grid_points_extra
 
@@ -41,8 +43,8 @@ subroutine get_grad1_u12_withsq_r1_seq(ipoint, n_grid2, resx, resy, resz, res)
 
     else
 
-      !   u(r1,r2)        = j12_mu(r12) x v(r1) x v(r2)
-      !   grad1 u(r1, r2) = [(grad1 j12_mu) v(r1) + j12_mu grad1 v(r1)] v(r2)
+      ! u(r1,r2)        = j12_mu(r12) x v(r1) x v(r2)
+      ! grad1 u(r1, r2) = [(grad1 j12_mu) v(r1) + j12_mu grad1 v(r1)] v(r2)
 
       allocate(env_r2(n_grid2))
       allocate(u2b_r12(n_grid2))
@@ -67,6 +69,46 @@ subroutine get_grad1_u12_withsq_r1_seq(ipoint, n_grid2, resx, resy, resz, res)
 
     endif ! env_type
 
+  elseif(j2e_type .eq. "Mu_Nu") then
+
+    if(env_type .eq. "None") then
+
+      call grad1_jmu_r1_seq(mu_erf, r1, n_grid2, resx, resy, resz)
+
+    else
+
+      ! u(r1,r2) = jmu(r12) x v(r1) x v(r2) + jnu(r12) x [1 - v(r1) x v(r2)]
+
+      allocate(env_r2(n_grid2))
+      allocate(u2b_mu(n_grid2))
+      allocate(u2b_nu(n_grid2))
+      allocate(gradx1_mu(n_grid2), grady1_mu(n_grid2), gradz1_mu(n_grid2))
+      allocate(gradx1_nu(n_grid2), grady1_nu(n_grid2), gradz1_nu(n_grid2))
+
+      env_r1 = env_nucl(r1)
+      call grad1_env_nucl(r1, grad1_env)
+      call env_nucl_r1_seq(n_grid2, env_r2)
+
+      call jmu_r1_seq(mu_erf, r1, n_grid2, u2b_mu)
+      call jmu_r1_seq(nu_erf, r1, n_grid2, u2b_nu)
+
+      call grad1_jmu_r1_seq(mu_erf, r1, n_grid2, gradx1_mu, grady1_mu, gradz1_mu)
+      call grad1_jmu_r1_seq(nu_erf, r1, n_grid2, gradx1_nu, grady1_nu, gradz1_nu)
+
+      do jpoint = 1, n_points_extra_final_grid
+        resx(jpoint) = gradx1_nu(jpoint) + ((gradx1_mu(jpoint) - gradx1_nu(jpoint)) * env_r1 + (u2b_mu(jpoint) - u2b_nu(jpoint)) * grad1_env(1)) * env_r2(jpoint)
+        resy(jpoint) = grady1_nu(jpoint) + ((grady1_mu(jpoint) - grady1_nu(jpoint)) * env_r1 + (u2b_mu(jpoint) - u2b_nu(jpoint)) * grad1_env(2)) * env_r2(jpoint)
+        resz(jpoint) = gradz1_nu(jpoint) + ((gradz1_mu(jpoint) - gradz1_nu(jpoint)) * env_r1 + (u2b_mu(jpoint) - u2b_nu(jpoint)) * grad1_env(3)) * env_r2(jpoint)
+      enddo
+
+      deallocate(env_r2)
+      deallocate(u2b_mu)
+      deallocate(u2b_nu)
+      deallocate(gradx1_mu, grady1_mu, gradz1_mu)
+      deallocate(gradx1_nu, grady1_nu, gradz1_nu)
+
+    endif ! env_type
+
   else
 
     print *, ' Error in get_grad1_u12_withsq_r1_seq: Unknown Jastrow'
@@ -99,6 +141,9 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
 
   BEGIN_DOC
   !
+  !  d/dx1 j_2e(1,2)
+  !  d/dy1 j_2e(1,2)
+  !  d/dz1 j_2e(1,2)
   !
   END_DOC
 
@@ -116,10 +161,13 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
   double precision              :: dx, dy, dz, r12, tmp
   double precision              :: mu_val, mu_tmp, mu_der(3)
 
+  PROVIDE j2e_type
+
   if(j2e_type .eq. "Mu") then
 
-    !  d/dx1 j(mu,r12) = 0.5 * (1 - erf(mu *r12))/r12 * (x1 - x2)
-    !
+    !  d/dx1 j(mu,r12) = 0.5 * [(1 - erf(mu * r12)) / r12] * (x1 - x2)
+    !  d/dy1 j(mu,r12) = 0.5 * [(1 - erf(mu * r12)) / r12] * (y1 - y2)
+    !  d/dz1 j(mu,r12) = 0.5 * [(1 - erf(mu * r12)) / r12] * (z1 - z2)
 
     do jpoint = 1, n_points_extra_final_grid ! r2 
 
@@ -185,7 +233,12 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
 
   elseif(j2e_type .eq. "Boys") then
 
-   ! j(r12) = 0.5 r12 / (1 + a_boys r_12)
+    !
+    ! j(r12) = 0.5 r12 / (1 + a_boys r_12)
+    !
+    ! d/dx1 j(r12) = 0.5 (x1 - x2) / [r12 * (1 + b r12^2)^2]
+    ! d/dy1 j(r12) = 0.5 (y1 - y2) / [r12 * (1 + b r12^2)^2]
+    ! d/dz1 j(r12) = 0.5 (z1 - z2) / [r12 * (1 + b r12^2)^2]
 
     PROVIDE a_boys
 
@@ -226,6 +279,58 @@ end
 
 ! ---
 
+subroutine grad1_jmu_r1_seq(mu, r1, n_grid2, gradx, grady, gradz)
+
+  BEGIN_DOC
+  !
+  !  d/dx1 jmu(r12) = 0.5 * [(1 - erf(mu * r12)) / r12] * (x1 - x2)
+  !  d/dy1 jmu(r12) = 0.5 * [(1 - erf(mu * r12)) / r12] * (y1 - y2)
+  !  d/dz1 jmu(r12) = 0.5 * [(1 - erf(mu * r12)) / r12] * (z1 - z2)
+  !
+  END_DOC
+
+  implicit none
+  integer         , intent(in)  :: n_grid2
+  double precision, intent(in)  :: mu, r1(3)
+  double precision, intent(out) :: gradx(n_grid2)
+  double precision, intent(out) :: grady(n_grid2)
+  double precision, intent(out) :: gradz(n_grid2)
+
+  integer                       :: jpoint
+  double precision              :: r2(3)
+  double precision              :: dx, dy, dz, r12, tmp
+
+
+  do jpoint = 1, n_points_extra_final_grid ! r2 
+
+    r2(1) = final_grid_points_extra(1,jpoint)
+    r2(2) = final_grid_points_extra(2,jpoint)
+    r2(3) = final_grid_points_extra(3,jpoint)
+
+    dx = r1(1) - r2(1)
+    dy = r1(2) - r2(2)
+    dz = r1(3) - r2(3)
+
+    r12 = dsqrt(dx * dx + dy * dy + dz * dz)
+    if(r12 .lt. 1d-10) then
+      gradx(jpoint) = 0.d0 
+      grady(jpoint) = 0.d0 
+      gradz(jpoint) = 0.d0 
+      cycle
+    endif
+
+    tmp = 0.5d0 * (1.d0 - derf(mu * r12)) / r12
+
+    gradx(jpoint) = tmp * dx
+    grady(jpoint) = tmp * dy
+    gradz(jpoint) = tmp * dz
+  enddo
+
+  return
+end
+
+! ---
+
 subroutine j12_r1_seq(r1, n_grid2, res)
 
   include 'constants.include.F'
@@ -294,6 +399,44 @@ end
 
 ! ---
 
+subroutine jmu_r1_seq(mu, r1, n_grid2, res)
+
+  include 'constants.include.F'
+
+  implicit none
+  integer,          intent(in)  :: n_grid2
+  double precision, intent(in)  :: mu, r1(3)
+  double precision, intent(out) :: res(n_grid2)
+
+  integer                       :: jpoint
+  double precision              :: r2(3)
+  double precision              :: dx, dy, dz
+  double precision              :: r12, tmp1, tmp2
+
+  tmp1 = inv_sq_pi_2 / mu
+
+  do jpoint = 1, n_points_extra_final_grid ! r2 
+  
+    r2(1) = final_grid_points_extra(1,jpoint)
+    r2(2) = final_grid_points_extra(2,jpoint)
+    r2(3) = final_grid_points_extra(3,jpoint)
+  
+    dx  = r1(1) - r2(1)
+    dy  = r1(2) - r2(2)
+    dz  = r1(3) - r2(3)
+    r12 = dsqrt(dx * dx + dy * dy + dz * dz)
+
+    tmp2 = mu * r12
+  
+    res(jpoint) = 0.5d0 * r12 * (1.d0 - derf(tmp2)) - tmp1 * dexp(-tmp2*tmp2)
+  enddo
+
+  return
+end
+
+! ---
+
+
 subroutine env_nucl_r1_seq(n_grid2, res)
 
   ! TODO
@@ -473,3 +616,71 @@ end
 
 ! ---
 
+subroutine get_u12_2e_r1_seq(ipoint, n_grid2, res)
+
+  BEGIN_DOC
+  ! 
+  ! u_2e(r1,r2)
+  !
+  ! we use grid for r1 and extra_grid for r2
+  !
+  END_DOC
+
+  implicit none
+  integer,          intent(in)  :: ipoint, n_grid2
+  double precision, intent(out) :: res(n_grid2)
+
+  integer                       :: jpoint
+  double precision              :: env_r1, tmp
+  double precision              :: grad1_env(3), r1(3)
+  double precision, allocatable :: env_r2(:)
+  double precision, allocatable :: u2b_r12(:)
+  double precision, external    :: env_nucl
+
+  PROVIDE j1e_type j2e_type env_type
+  PROVIDE final_grid_points
+  PROVIDE final_grid_points_extra
+
+  r1(1) = final_grid_points(1,ipoint)
+  r1(2) = final_grid_points(2,ipoint)
+  r1(3) = final_grid_points(3,ipoint)
+
+  if( (j2e_type .eq. "Mu")  .or. &
+      (j2e_type .eq. "Mur") .or. &
+      (j2e_type .eq. "Boys") ) then
+
+    if(env_type .eq. "None") then
+
+      call j12_r1_seq(r1, n_grid2, res)
+
+    else
+
+      ! u(r1,r2) = j12_mu(r12) x v(r1) x v(r2)
+
+      allocate(env_r2(n_grid2))
+      allocate(u2b_r12(n_grid2))
+
+      env_r1 = env_nucl(r1)
+      call j12_r1_seq(r1, n_grid2, u2b_r12)
+      call env_nucl_r1_seq(n_grid2, env_r2)
+
+      do jpoint = 1, n_points_extra_final_grid
+        res(jpoint) = env_r1 * u2b_r12(jpoint) * env_r2(jpoint)
+      enddo
+
+      deallocate(env_r2, u2b_r12)
+
+    endif ! env_type
+
+  else
+
+    print *, ' Error in get_u12_withsq_r1_seq: Unknown Jastrow'
+    stop
+
+  endif ! j2e_type
+
+  return
+end
+
+! ---
+
diff --git a/plugins/local/non_h_ints_mu/test_non_h_ints.irp.f b/plugins/local/non_h_ints_mu/test_non_h_ints.irp.f
index c3fde334..464a1c1f 100644
--- a/plugins/local/non_h_ints_mu/test_non_h_ints.irp.f
+++ b/plugins/local/non_h_ints_mu/test_non_h_ints.irp.f
@@ -43,7 +43,9 @@ program test_non_h
   !call test_tc_grad_square_ao_new()
 
   !call test_fit_coef_A1()
-  call test_fit_coef_inv()
+  !call test_fit_coef_inv()
+
+  call test_fit_coef_testinvA()
 end
 
 ! ---
@@ -1229,7 +1231,7 @@ subroutine test_fit_coef_inv()
 
   implicit none
   integer                       :: i, j, k, l, ij, kl, ipoint
-  integer                       :: n_svd, info, lwork, mn
+  integer                       :: n_svd, info, lwork, mn, m, n
   double precision              :: t1, t2
   double precision              :: accu, norm, diff
   double precision              :: cutoff_svd, D1_inv
@@ -1237,7 +1239,6 @@ subroutine test_fit_coef_inv()
   double precision, allocatable :: A2(:,:,:,:), tmp(:,:,:), A2_inv(:,:,:,:)
   double precision, allocatable :: U(:,:), D(:), Vt(:,:), work(:), A2_tmp(:,:,:,:)
 
-
   cutoff_svd = 5d-8
 
   ! ---
@@ -1435,11 +1436,92 @@ subroutine test_fit_coef_inv()
     enddo
   enddo
 
+  print*, ' accuracy on A_inv (%) = ', 100.d0 * accu / norm
+
   deallocate(A1_inv, A2_inv)
   deallocate(A1, A2)
 
-  print*, ' accuracy on A_inv (%) = ', 100.d0 * accu / norm
-
+  return
+end
+
+! ---
+
+subroutine test_fit_coef_testinvA()
+
+  implicit none
+  integer                       :: i, j, k, l, m, n, ij, kl, mn, ipoint
+  double precision              :: t1, t2
+  double precision              :: accu, norm, diff
+  double precision              :: cutoff_svd
+  double precision, allocatable :: A1(:,:), A1_inv(:,:)
+
+  cutoff_svd = 1d-17
+
+  ! ---
+
+  call wall_time(t1)
+
+  allocate(A1(ao_num*ao_num,ao_num*ao_num))
+
+  !$OMP PARALLEL                             &
+  !$OMP DEFAULT (NONE)                       &
+  !$OMP PRIVATE (i, j, k, l, ij, kl, ipoint) &
+  !$OMP SHARED (n_points_final_grid, ao_num, &
+  !$OMP         final_weight_at_r_vector, aos_in_r_array_transp, A1)
+  !$OMP DO COLLAPSE(2)
+  do k = 1, ao_num
+    do l = 1, ao_num
+      kl = (k-1)*ao_num + l
+
+      do i = 1, ao_num
+        do j = 1, ao_num
+          ij = (i-1)*ao_num + j
+
+          A1(ij,kl) = 0.d0
+          do ipoint = 1, n_points_final_grid
+            A1(ij,kl) += final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j) &
+                                                          * aos_in_r_array_transp(ipoint,k) * aos_in_r_array_transp(ipoint,l)
+          enddo
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  call wall_time(t2)
+  print*, ' WALL TIME FOR A1 (min) =', (t2-t1)/60.d0
+
+  allocate(A1_inv(ao_num*ao_num,ao_num*ao_num))
+  call get_pseudo_inverse(A1, ao_num*ao_num, ao_num*ao_num, ao_num*ao_num, A1_inv, ao_num*ao_num, cutoff_svd)
+
+  call wall_time(t1)
+  print*, ' WALL TIME FOR A1_inv (min) =', (t1-t2)/60.d0
+
+  ! ---
+
+  print*, ' check inv'
+
+  do kl = 1, ao_num*ao_num
+    do ij = 1, ao_num*ao_num
+
+      diff = 0.d0
+      do mn = 1, ao_num*ao_num
+        diff += A1(kl,mn) * A1_inv(mn,ij)
+      enddo
+
+      if(kl .eq. ij) then
+        accu += dabs(diff - 1.d0)
+      else
+        accu += dabs(diff - 0.d0)
+      endif
+    enddo
+  enddo
+
+  print*, ' accuracy (%) = ', accu * 100.d0
+  
+  deallocate(A1, A1_inv)
+
   return
 end
 
diff --git a/plugins/local/non_h_ints_mu/total_tc_int.irp.f b/plugins/local/non_h_ints_mu/total_tc_int.irp.f
index 38da4047..9d3cf565 100644
--- a/plugins/local/non_h_ints_mu/total_tc_int.irp.f
+++ b/plugins/local/non_h_ints_mu/total_tc_int.irp.f
@@ -125,7 +125,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
 
       call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0     &
                 , int2_u2_env2(1,1,1), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
-                , 1.d0, ao_two_e_tc_tot, ao_num*ao_num)
+                , 1.d0, ao_two_e_tc_tot(1,1,1,1), ao_num*ao_num)
 
       FREE int2_u2_env2
     endif ! use_ipp
@@ -166,12 +166,15 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
       do m = 1, 3
         call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, -1.d0             &
                   , int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num, b_mat(1,1,1,m), n_points_final_grid &
-                  , 1.d0, ao_two_e_tc_tot, ao_num*ao_num)
+                  , 1.d0, ao_two_e_tc_tot(1,1,1,1), ao_num*ao_num)
       enddo
       deallocate(b_mat)
 
       FREE int2_grad1_u12_ao
-      FREE int2_grad1_u2e_ao
+
+      if(tc_integ_type .eq. "semi-analytic") then 
+        FREE int2_grad1_u2e_ao
+      endif
 
     endif ! var_tc