merging with dev

2025-01-09 12:44:05 +01:00 · 2023-01-17 17:15:17 +01:00 · 2023-01-17 17:15:17 +01:00 · f0c6c54072
commit f0c6c54072
parent 7dc17fd2d3 92a4e33f8a
64 changed files with 4813 additions and 2233 deletions
--- a/bin/qp_convert_output_to_ezfio
+++ b/bin/qp_convert_output_to_ezfio
@ -162,11 +162,11 @@ def write_ezfio(res, filename):
    # P a r s i n g #
    # ~#~#~#~#~#~#~ #
-    prim_num_max = ezfio.get_ao_basis_ao_prim_num_max()
+    prim_num_max = max(ezfio.get_ao_basis_ao_prim_num())
    ezfio.set_ao_basis_ao_prim_num_max(prim_num_max)
    for i in range(len(res.basis)):
-        coefficient[
+        coefficient[i] += [0. for j in range(len(coefficient[i]), prim_num_max)]
            i] += [0. for j in range(len(coefficient[i]), prim_num_max)]
        exponent[i] += [0. for j in range(len(exponent[i]), prim_num_max)]
    coefficient = reduce(lambda x, y: x + y, coefficient, [])
--- a/config/bull.cfg
+++ b/config/bull.cfg
@ -6,7 +6,7 @@
 # --align=32                 : Align all provided arrays on a 32-byte boundary
 #
 [COMMON]
-FC           : mpiifort -shared-libgcc -shared-intel -fpic
+FC           : mpiifort -fpic -xCORE-AVX2 
 LAPACK_LIB   : -mkl=parallel 
 IRPF90       : irpf90
 IRPF90_FLAGS : --ninja --align=32 -DMPI  
@ -31,7 +31,7 @@ OPENMP  : 1          ; Append OpenMP flags
 # -ftz                       : Flushes denormal results to zero
 #
 [OPT]
-FCFLAGS  : -xCORE-AVX2 -O2 -ip -ftz -g -traceback -qopt-prefetch=5 -qopt-prefetch-issue-excl-hint -unroll-aggressive 
+FCFLAGS  : -O2 -ip -ftz -g -traceback -qopt-prefetch=5 -qopt-prefetch-issue-excl-hint -unroll-aggressive 
 # Profiling flags
 #################
 #
--- a/config/ifort_2019_debug.cfg
+++ b/config/ifort_2019_debug.cfg
@ -0,0 +1,66 @@
 # Common flags
 ##############
 #
 # -mkl=[parallel|sequential] : Use the MKL library
 # --ninja                    : Allow the use of ninja (mandatory)
 # --align=32                 : Align all provided arrays on a 32-byte boundary
 #
 [COMMON]
 FC           : ifort -fpic
 LAPACK_LIB   : -mkl=parallel -lirc -lsvml -limf -lipps
 IRPF90       : irpf90
 IRPF90_FLAGS : --ninja --align=32 --assert -DINTEL -DSET_NESTED
 # Global options
 ################
 #
 # 1 : Activate
 # 0 : Deactivate
 # 
 [OPTION]
 MODE    : DEBUG      ; [ OPT | PROFILE | DEBUG ] : Chooses the section below
 CACHE   : 0          ; Enable cache_compile.py
 OPENMP  : 1          ; Append OpenMP flags
 # Optimization flags
 ####################
 #
 # -xHost                     : Compile a binary optimized for the current architecture
 # -O2                        : O3 not better than O2.
 # -ip                        : Inter-procedural optimizations
 # -ftz                       : Flushes denormal results to zero
 #
 [OPT]
 FC       : -traceback
 FCFLAGS  : -msse4.2 -O2 -ip -ftz -g 
 # Profiling flags
 #################
 #
 [PROFILE]
 FC       : -p -g
 FCFLAGS  : -msse4.2 -O2 -ip -ftz 
 # Debugging flags
 #################
 #
 # -traceback   : Activate backtrace on runtime
 # -fpe0        : All floating-point exceptions
 # -C           : Checks uninitialized variables,  array subscripts, etc...
 # -g           : Extra debugging information
 # -msse4.2     : Valgrind needs a very simple x86 executable
 #
 [DEBUG]
 FC      : -g -traceback
 FCFLAGS : -msse4.2 -check all -debug all -fpe-all=0  -implicitnone
 # OpenMP flags
 #################
 #
 [OPENMP]
 FC           : -qopenmp
 IRPF90_FLAGS : --openmp
--- a/ocaml/qp_run.ml
+++ b/ocaml/qp_run.ml
@ -155,6 +155,7 @@ let run slave ?prefix exe ezfio_file =
  in
  Printf.printf "Wall time: %d:%2.2d:%2.2d" (d*24+h) m s ;
  Printf.printf "\n\n";
  Unix.sleep 1;
  if (exit_code <> 0) then
    exit exit_code
--- a/scripts/compilation/qp_create_ninja
+++ b/scripts/compilation/qp_create_ninja
@ -126,7 +126,7 @@ def ninja_create_env_variable(pwd_config_file):
        try:
            content = ""
            with open(libfile,'r') as f:
-                content = f.read()
+                content = f.read().replace('\n','')
                str_lib += " "+content
        except IOError:
            pass
--- a/src/ao_basis/EZFIO.cfg
+++ b/src/ao_basis/EZFIO.cfg
@ -57,13 +57,13 @@ default: false
 [ao_normalized]
 type: logical
-doc: Use normalized basis functions
+doc: Normalize the atomic orbitals
 interface: ezfio, provider
-default: true
+default: false
 [primitives_normalized]
 type: logical
-doc: Use normalized primitive functions
+doc: Normalize the primitive basis functions
 interface: ezfio, provider
 default: true
--- a/src/ao_basis/aos.irp.f
+++ b/src/ao_basis/aos.irp.f
@ -63,15 +63,14 @@ END_PROVIDER
  ! Coefficients including the |AO| normalization
  END_DOC
  do i=1,ao_num
    l = ao_shell(i)
    ao_coef_normalized(i,:) = shell_coef(l,:) * shell_normalization_factor(l)
  end do
  double precision               :: norm,overlap_x,overlap_y,overlap_z,C_A(3), c
-  integer                        :: l, powA(3), nz
+  integer                        :: l, powA(3)
  integer, parameter             :: nz=100
  integer                        :: i,j,k
-  nz=100
+
   ao_coef_normalized(:,:) = ao_coef(:,:)
  C_A = 0.d0
  do i=1,ao_num
@ -80,7 +79,7 @@ END_PROVIDER
    powA(2) = ao_power(i,2)
    powA(3) = ao_power(i,3)
-    ! Normalization of the primitives
+    ! GAMESS-type normalization of the primitives
    if (primitives_normalized) then
      do j=1,ao_prim_num(i)
        call overlap_gaussian_xyz(C_A,C_A,ao_expo(i,j),ao_expo(i,j), &
@ -91,6 +90,7 @@ END_PROVIDER
    ! Normalization of the contracted basis functions
    if (ao_normalized) then
      norm = 0.d0
      l = ao_shell(i)
      do j=1,ao_prim_num(i)
        do k=1,ao_prim_num(i)
          call overlap_gaussian_xyz(C_A,C_A,ao_expo(i,j),ao_expo(i,k),powA,powA,overlap_x,overlap_y,overlap_z,c,nz)
@ -98,6 +98,7 @@ END_PROVIDER
        enddo
      enddo
      ao_coef_normalization_factor(i) = 1.d0/dsqrt(norm)
      ao_coef_normalized(i,:) *= ao_coef_normalization_factor(i)
    else
      ao_coef_normalization_factor(i) = 1.d0
    endif
--- a/src/ao_many_one_e_ints/grad2_jmu_manu.irp.f
+++ b/src/ao_many_one_e_ints/grad2_jmu_manu.irp.f
@ -1,4 +1,6 @@
 ! ---
 BEGIN_PROVIDER [ double precision, int2_grad1u2_grad2u2_j1b2_test, (ao_num, ao_num, n_points_final_grid)]
  BEGIN_DOC
@ -13,12 +15,14 @@ BEGIN_PROVIDER [ double precision, int2_grad1u2_grad2u2_j1b2_test, (ao_num, ao_n
  double precision              :: coef, beta, B_center(3)
  double precision              :: tmp
  double precision              :: wall0, wall1
  double precision, allocatable :: int_fit_v(:)
  double precision, external    :: overlap_gauss_r12_ao_with1s
  double precision              :: int_gauss, dsqpi_3_2, int_j1b
  double precision              :: factor_ij_1s, beta_ij, center_ij_1s(3), sq_pi_3_2 
-  sq_pi_3_2 = (dacos(-1.d0))**(3/2)
+  double precision, allocatable :: int_fit_v(:)
  double precision, external    :: overlap_gauss_r12_ao_with1s
  print*, ' providing int2_grad1u2_grad2u2_j1b2_test ...'
  sq_pi_3_2 = (dacos(-1.d0))**(1.5d0)
  provide mu_erf final_grid_points_transp j1b_pen List_comb_thr_b3_coef
  call wall_time(wall0)
@ -33,7 +37,7 @@ BEGIN_PROVIDER [ double precision, int2_grad1u2_grad2u2_j1b2_test, (ao_num, ao_n
     !$OMP          expo_gauss_1_erf_x_2, coef_gauss_1_erf_x_2,                                             &
     !$OMP          List_comb_thr_b3_coef, List_comb_thr_b3_expo,                                           &
     !$OMP          List_comb_thr_b3_cent, int2_grad1u2_grad2u2_j1b2_test, ao_abs_comb_b3_j1b,              &
-     !$OMP          ao_overlap_abs_grid,sq_pi_3_2)
+     !$OMP          ao_overlap_abs,sq_pi_3_2)
 !$OMP DO SCHEDULE(dynamic)
 do ipoint = 1, n_points_final_grid
   r(1) = final_grid_points(1,ipoint)
@ -41,7 +45,7 @@ BEGIN_PROVIDER [ double precision, int2_grad1u2_grad2u2_j1b2_test, (ao_num, ao_n
   r(3) = final_grid_points(3,ipoint)
   do i = 1, ao_num
     do j = i, ao_num
-       if(ao_overlap_abs_grid(j,i) .lt. 1.d-12) then
+       if(ao_overlap_abs(j,i) .lt. 1.d-12) then
         cycle
       endif
@ -61,7 +65,7 @@ BEGIN_PROVIDER [ double precision, int2_grad1u2_grad2u2_j1b2_test, (ao_num, ao_n
           call gaussian_product(expo_fit,r,beta,B_center,factor_ij_1s,beta_ij,center_ij_1s)
           coef_fit = -0.25d0 *  coef_gauss_1_erf_x_2(i_fit) * coef
 !           if(dabs(coef_fit*factor_ij_1s*int_j1b).lt.1.d-10)cycle ! old version
-           if(dabs(coef_fit*factor_ij_1s*int_j1b*sq_pi_3_2*(beta_ij)**(-3/2)).lt.1.d-12)cycle
+           if(dabs(coef_fit*factor_ij_1s*int_j1b*sq_pi_3_2*(beta_ij)**(-1.5d0)).lt.1.d-10)cycle
 !           call overlap_gauss_r12_ao_with1s_v(B_center, beta, final_grid_points_transp, &
 !                 expo_fit, i, j, int_fit_v, n_points_final_grid)
@ -91,6 +95,8 @@ BEGIN_PROVIDER [ double precision, int2_grad1u2_grad2u2_j1b2_test, (ao_num, ao_n
 END_PROVIDER
 ! ---
 BEGIN_PROVIDER [ double precision, int2_grad1u2_grad2u2_j1b2_test_v, (ao_num, ao_num, n_points_final_grid)]
 !
 !  BEGIN_DOC
@ -109,6 +115,8 @@ BEGIN_PROVIDER [ double precision, int2_grad1u2_grad2u2_j1b2_test_v, (ao_num, ao
  double precision, allocatable :: int_fit_v(:),big_array(:,:,:)
  double precision, external    :: overlap_gauss_r12_ao_with1s
  print*, ' providing int2_grad1u2_grad2u2_j1b2_test_v ...'
  provide mu_erf final_grid_points_transp j1b_pen
  call wall_time(wall0)
@ -123,14 +131,14 @@ BEGIN_PROVIDER [ double precision, int2_grad1u2_grad2u2_j1b2_test_v, (ao_num, ao
     !$OMP          expo_gauss_1_erf_x_2, coef_gauss_1_erf_x_2,      &
     !$OMP          List_comb_thr_b3_coef, List_comb_thr_b3_expo,    &
     !$OMP          List_comb_thr_b3_cent, big_array,&
-     !$OMP          ao_abs_comb_b3_j1b,ao_overlap_abs_grid)
+     !$OMP          ao_abs_comb_b3_j1b,ao_overlap_abs)
 !
 allocate(int_fit_v(n_points_final_grid))
 !$OMP DO SCHEDULE(dynamic)
 do i = 1, ao_num
   do j = i, ao_num
-     if(ao_overlap_abs_grid(j,i) .lt. 1.d-12) then
+     if(ao_overlap_abs(j,i) .lt. 1.d-12) then
       cycle
     endif
@ -139,7 +147,7 @@ BEGIN_PROVIDER [ double precision, int2_grad1u2_grad2u2_j1b2_test_v, (ao_num, ao
         coef        = List_comb_thr_b3_coef  (i_1s,j,i)
         beta        = List_comb_thr_b3_expo  (i_1s,j,i)
         int_j1b = ao_abs_comb_b3_j1b(i_1s,j,i)
-         if(dabs(coef)*dabs(int_j1b).lt.1.d-15)cycle
+!         if(dabs(coef)*dabs(int_j1b).lt.1.d-15)cycle
         B_center(1) = List_comb_thr_b3_cent(1,i_1s,j,i)
         B_center(2) = List_comb_thr_b3_cent(2,i_1s,j,i)
         B_center(3) = List_comb_thr_b3_cent(3,i_1s,j,i)
@ -185,6 +193,8 @@ BEGIN_PROVIDER [ double precision, int2_grad1u2_grad2u2_j1b2_test_v, (ao_num, ao
 END_PROVIDER
 ! ---
 BEGIN_PROVIDER [ double precision, int2_u2_j1b2_test, (ao_num, ao_num, n_points_final_grid)]
  BEGIN_DOC
@ -202,7 +212,10 @@ BEGIN_PROVIDER [ double precision, int2_u2_j1b2_test, (ao_num, ao_num, n_points_
  double precision, external    :: overlap_gauss_r12_ao
  double precision, external    :: overlap_gauss_r12_ao_with1s
  double precision :: factor_ij_1s,beta_ij,center_ij_1s(3),sq_pi_3_2
-  sq_pi_3_2 = (dacos(-1.d0))**(3/2)
+
  print*, ' providing int2_u2_j1b2_test ...'
  sq_pi_3_2 = (dacos(-1.d0))**(1.5d0)
  provide mu_erf final_grid_points j1b_pen
  call wall_time(wall0)
@ -213,7 +226,7 @@ BEGIN_PROVIDER [ double precision, int2_u2_j1b2_test, (ao_num, ao_num, n_points_
 !$OMP PRIVATE (ipoint, i, j, i_1s, i_fit, r, coef, beta, B_center, &
 !$OMP          coef_fit, expo_fit, int_fit, tmp, int_j1b,factor_ij_1s,beta_ij,center_ij_1s)          & 
 !$OMP SHARED  (n_points_final_grid, ao_num, List_comb_thr_b3_size, & 
- !$OMP          final_grid_points, ng_fit_jast,ao_overlap_abs_grid,      &
+ !$OMP          final_grid_points, ng_fit_jast,                     &
 !$OMP          expo_gauss_j_mu_x_2, coef_gauss_j_mu_x_2,           &
 !$OMP          List_comb_thr_b3_coef, List_comb_thr_b3_expo,sq_pi_3_2,       & 
 !$OMP          List_comb_thr_b3_cent, int2_u2_j1b2_test,ao_abs_comb_b3_j1b)
@ -225,9 +238,6 @@ BEGIN_PROVIDER [ double precision, int2_u2_j1b2_test, (ao_num, ao_num, n_points_
    do i = 1, ao_num
      do j = i, ao_num
       if(ao_overlap_abs_grid(j,i) .lt. 1.d-12) then
         cycle
       endif
        tmp = 0.d0
@ -236,7 +246,7 @@ BEGIN_PROVIDER [ double precision, int2_u2_j1b2_test, (ao_num, ao_num, n_points_
          coef        = List_comb_thr_b3_coef  (i_1s,j,i)
          beta        = List_comb_thr_b3_expo  (i_1s,j,i)
          int_j1b = ao_abs_comb_b3_j1b(i_1s,j,i)
-          if(dabs(coef)*dabs(int_j1b).lt.1.d-14)cycle
+          if(dabs(coef)*dabs(int_j1b).lt.1.d-10)cycle
          B_center(1) = List_comb_thr_b3_cent(1,i_1s,j,i)
          B_center(2) = List_comb_thr_b3_cent(2,i_1s,j,i)
          B_center(3) = List_comb_thr_b3_cent(3,i_1s,j,i)
@ -248,7 +258,7 @@ BEGIN_PROVIDER [ double precision, int2_u2_j1b2_test, (ao_num, ao_num, n_points_
            !DIR$ FORCEINLINE
            call gaussian_product(expo_fit,r,beta,B_center,factor_ij_1s,beta_ij,center_ij_1s)
 !            if(dabs(coef_fit*coef*factor_ij_1s*int_j1b).lt.1.d-10)cycle ! old version
-            if(dabs(coef_fit*coef*factor_ij_1s*int_j1b*sq_pi_3_2*(beta_ij)**(-3/2)).lt.1.d-12)cycle
+            if(dabs(coef_fit*coef*factor_ij_1s*int_j1b*sq_pi_3_2*(beta_ij)**(-1.5d0)).lt.1.d-10)cycle
            ! ---
@ -283,7 +293,7 @@ END_PROVIDER
 ! ---
-BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2_test, (3, ao_num, ao_num, n_points_final_grid)]
+BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2_test, (ao_num, ao_num, n_points_final_grid, 3)]
  BEGIN_DOC
  !
@ -298,7 +308,10 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2_test, (3, ao_num, ao_num
  double precision :: alpha_1s, alpha_1s_inv, centr_1s(3), expo_coef_1s, coef_tmp
  double precision :: tmp_x, tmp_y, tmp_z, int_j1b
  double precision :: wall0, wall1, sq_pi_3_2,sq_alpha
-  sq_pi_3_2 = dacos(-1.D0)**(3/2)
+
  print*, ' providing int2_u_grad1u_x_j1b2_test ...'
  sq_pi_3_2 = dacos(-1.D0)**(1.d0)
  provide mu_erf final_grid_points j1b_pen
  call wall_time(wall0)
@ -310,7 +323,7 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2_test, (3, ao_num, ao_num
 !$OMP          alpha_1s_inv, centr_1s, expo_coef_1s, coef_tmp,     & 
 !$OMP          tmp_x, tmp_y, tmp_z,int_j1b,sq_alpha)                        & 
 !$OMP SHARED  (n_points_final_grid, ao_num, List_comb_thr_b3_size, & 
- !$OMP          final_grid_points, ng_fit_jast, ao_overlap_abs_grid,&
+ !$OMP          final_grid_points, ng_fit_jast,                     &
 !$OMP          expo_gauss_j_mu_1_erf, coef_gauss_j_mu_1_erf,       &
 !$OMP          List_comb_thr_b3_coef, List_comb_thr_b3_expo,       & 
 !$OMP          List_comb_thr_b3_cent, int2_u_grad1u_x_j1b2_test,ao_abs_comb_b3_j1b,sq_pi_3_2)
@ -323,9 +336,6 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2_test, (3, ao_num, ao_num
    do i = 1, ao_num
      do j = i, ao_num
       if(ao_overlap_abs_grid(j,i) .lt. 1.d-12) then
         cycle
       endif
        tmp_x = 0.d0
        tmp_y = 0.d0
@ -335,7 +345,7 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2_test, (3, ao_num, ao_num
          coef        = List_comb_thr_b3_coef  (i_1s,j,i)
          beta        = List_comb_thr_b3_expo  (i_1s,j,i)
          int_j1b = ao_abs_comb_b3_j1b(i_1s,j,i)
-          if(dabs(coef)*dabs(int_j1b).lt.1.d-14)cycle
+          if(dabs(coef)*dabs(int_j1b).lt.1.d-10)cycle
          B_center(1) = List_comb_thr_b3_cent(1,i_1s,j,i)
          B_center(2) = List_comb_thr_b3_cent(2,i_1s,j,i)
          B_center(3) = List_comb_thr_b3_cent(3,i_1s,j,i)
@ -359,7 +369,7 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2_test, (3, ao_num, ao_num
            coef_tmp = coef * coef_fit * dexp(-expo_coef_1s)
            sq_alpha = alpha_1s_inv * dsqrt(alpha_1s_inv)
 !            if(dabs(coef_tmp*int_j1b) .lt. 1d-10) cycle ! old version
-            if(dabs(coef_tmp*int_j1b*sq_pi_3_2*sq_alpha) .lt. 1d-14) cycle
+            if(dabs(coef_tmp*int_j1b*sq_pi_3_2*sq_alpha) .lt. 1d-10) cycle
            call NAI_pol_x_mult_erf_ao_with1s(i, j, alpha_1s, centr_1s, 1.d+9, r, int_fit)
@ -372,9 +382,9 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2_test, (3, ao_num, ao_num
        enddo
-        int2_u_grad1u_x_j1b2_test(1,j,i,ipoint) = tmp_x
+        int2_u_grad1u_x_j1b2_test(j,i,ipoint,1) = tmp_x
-        int2_u_grad1u_x_j1b2_test(2,j,i,ipoint) = tmp_y
+        int2_u_grad1u_x_j1b2_test(j,i,ipoint,2) = tmp_y
-        int2_u_grad1u_x_j1b2_test(3,j,i,ipoint) = tmp_z
+        int2_u_grad1u_x_j1b2_test(j,i,ipoint,3) = tmp_z
      enddo
    enddo
  enddo
@ -384,9 +394,9 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2_test, (3, ao_num, ao_num
  do ipoint = 1, n_points_final_grid
    do i = 2, ao_num
      do j = 1, i-1
-        int2_u_grad1u_x_j1b2_test(1,j,i,ipoint) = int2_u_grad1u_x_j1b2_test(1,i,j,ipoint)
+        int2_u_grad1u_x_j1b2_test(j,i,ipoint,1) = int2_u_grad1u_x_j1b2_test(i,j,ipoint,1)
-        int2_u_grad1u_x_j1b2_test(2,j,i,ipoint) = int2_u_grad1u_x_j1b2_test(2,i,j,ipoint)
+        int2_u_grad1u_x_j1b2_test(j,i,ipoint,2) = int2_u_grad1u_x_j1b2_test(i,j,ipoint,2)
-        int2_u_grad1u_x_j1b2_test(3,j,i,ipoint) = int2_u_grad1u_x_j1b2_test(3,i,j,ipoint)
+        int2_u_grad1u_x_j1b2_test(j,i,ipoint,3) = int2_u_grad1u_x_j1b2_test(i,j,ipoint,3)
      enddo
    enddo
  enddo
@ -415,7 +425,10 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_j1b2_test, (ao_num, ao_num, n_p
  double precision :: j12_mu_r12,int_j1b
  double precision :: sigma_ij,dist_ij_ipoint,dsqpi_3_2
  double precision :: beta_ij,center_ij_1s(3),factor_ij_1s
-  dsqpi_3_2 = (dacos(-1.d0))**(3/2)
+
  print*, ' providing int2_u_grad1u_j1b2_test ...'
  dsqpi_3_2 = (dacos(-1.d0))**(1.5d0)
  provide mu_erf final_grid_points j1b_pen ao_overlap_abs List_comb_thr_b3_cent
  call wall_time(wall0)
@ -438,7 +451,7 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_j1b2_test, (ao_num, ao_num, n_p
  do ipoint = 1, n_points_final_grid
    do i = 1, ao_num
      do j = i, ao_num
-        if(dabs(ao_overlap_abs_grid(j,i)).lt.1.d-12)cycle
+        if(dabs(ao_overlap_abs_grid(j,i)).lt.1.d-10)cycle
        r(1) = final_grid_points(1,ipoint)
        r(2) = final_grid_points(2,ipoint)
        r(3) = final_grid_points(3,ipoint)
@ -449,7 +462,7 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_j1b2_test, (ao_num, ao_num, n_p
          coef        = List_comb_thr_b3_coef  (i_1s,j,i)
          beta        = List_comb_thr_b3_expo  (i_1s,j,i)
          int_j1b = ao_abs_comb_b3_j1b(i_1s,j,i)
-          if(dabs(coef)*dabs(int_j1b).lt.1.d-14)cycle
+          if(dabs(coef)*dabs(int_j1b).lt.1.d-10)cycle
          B_center(1) = List_comb_thr_b3_cent(1,i_1s,j,i)
          B_center(2) = List_comb_thr_b3_cent(2,i_1s,j,i)
          B_center(3) = List_comb_thr_b3_cent(3,i_1s,j,i)
@ -461,7 +474,7 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_j1b2_test, (ao_num, ao_num, n_p
            expo_fit = expo_gauss_j_mu_1_erf(i_fit)
            call gaussian_product(expo_fit,r,beta,B_center,factor_ij_1s,beta_ij,center_ij_1s)
-            if(factor_ij_1s*dabs(coef*int_j1b)*dsqpi_3_2*beta_ij**(-3/2).lt.1.d-15)cycle
+            if(factor_ij_1s*dabs(coef*int_j1b)*dsqpi_3_2*beta_ij**(-1.5d0).lt.1.d-15)cycle
            coef_fit = coef_gauss_j_mu_1_erf(i_fit)
            alpha_1s     = beta + expo_fit
@ -471,9 +484,9 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_j1b2_test, (ao_num, ao_num, n_p
            centr_1s(3)  = alpha_1s_inv * (beta * B_center(3) + expo_fit * r(3))
            expo_coef_1s = beta * expo_fit * alpha_1s_inv * dist
-            if(expo_coef_1s .gt. 80.d0) cycle
+            if(expo_coef_1s .gt. 20.d0) cycle
            coef_tmp = coef * coef_fit * dexp(-expo_coef_1s)
-            if(dabs(coef_tmp) .lt. 1d-10) cycle
+            if(dabs(coef_tmp) .lt. 1d-08) cycle
            int_fit = NAI_pol_mult_erf_ao_with1s(i, j, alpha_1s, centr_1s,  1.d+9, r)
--- a/src/ao_many_one_e_ints/grad2_jmu_modif.irp.f
+++ b/src/ao_many_one_e_ints/grad2_jmu_modif.irp.f
@ -19,9 +19,11 @@ BEGIN_PROVIDER [ double precision, int2_grad1u2_grad2u2_j1b2, (ao_num, ao_num, n
  double precision, external    :: overlap_gauss_r12_ao
  double precision, external    :: overlap_gauss_r12_ao_with1s
-  provide mu_erf final_grid_points j1b_pen
+  print*, ' providing int2_grad1u2_grad2u2_j1b2 ...'
  call wall_time(wall0)
  provide mu_erf final_grid_points j1b_pen
  int2_grad1u2_grad2u2_j1b2 = 0.d0
 !$OMP PARALLEL DEFAULT (NONE)                                      &
@ -88,7 +90,7 @@ BEGIN_PROVIDER [ double precision, int2_grad1u2_grad2u2_j1b2, (ao_num, ao_num, n
  enddo
  call wall_time(wall1)
-  print*, ' wall time for int2_grad1u2_grad2u2_j1b2', wall1 - wall0
+  print*, ' wall time for int2_grad1u2_grad2u2_j1b2 =', wall1 - wall0
 END_PROVIDER 
@ -111,9 +113,11 @@ BEGIN_PROVIDER [ double precision, int2_u2_j1b2, (ao_num, ao_num, n_points_final
  double precision, external    :: overlap_gauss_r12_ao
  double precision, external    :: overlap_gauss_r12_ao_with1s
-  provide mu_erf final_grid_points j1b_pen
+  print*, ' providing int2_u2_j1b2 ...'
  call wall_time(wall0)
  provide mu_erf final_grid_points j1b_pen
  int2_u2_j1b2 = 0.d0
 !$OMP PARALLEL DEFAULT (NONE)                                      &
@ -186,7 +190,7 @@ END_PROVIDER
 ! ---
-BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2, (3, ao_num, ao_num, n_points_final_grid)]
+BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2, (ao_num, ao_num, n_points_final_grid, 3)]
  BEGIN_DOC
  !
@ -202,9 +206,11 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2, (3, ao_num, ao_num, n_p
  double precision :: tmp_x, tmp_y, tmp_z
  double precision :: wall0, wall1
-  provide mu_erf final_grid_points j1b_pen
+  print*, ' providing int2_u_grad1u_x_j1b2 ...'
  call wall_time(wall0)
  provide mu_erf final_grid_points j1b_pen
  int2_u_grad1u_x_j1b2 = 0.d0
 !$OMP PARALLEL DEFAULT (NONE)                                      &
@ -278,9 +284,9 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2, (3, ao_num, ao_num, n_p
        enddo
-        int2_u_grad1u_x_j1b2(1,j,i,ipoint) = tmp_x
+        int2_u_grad1u_x_j1b2(j,i,ipoint,1) = tmp_x
-        int2_u_grad1u_x_j1b2(2,j,i,ipoint) = tmp_y
+        int2_u_grad1u_x_j1b2(j,i,ipoint,2) = tmp_y
-        int2_u_grad1u_x_j1b2(3,j,i,ipoint) = tmp_z
+        int2_u_grad1u_x_j1b2(j,i,ipoint,3) = tmp_z
      enddo
    enddo
  enddo
@ -290,15 +296,15 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2, (3, ao_num, ao_num, n_p
  do ipoint = 1, n_points_final_grid
    do i = 2, ao_num
      do j = 1, i-1
-        int2_u_grad1u_x_j1b2(1,j,i,ipoint) = int2_u_grad1u_x_j1b2(1,i,j,ipoint)
+        int2_u_grad1u_x_j1b2(j,i,ipoint,1) = int2_u_grad1u_x_j1b2(i,j,ipoint,1)
-        int2_u_grad1u_x_j1b2(2,j,i,ipoint) = int2_u_grad1u_x_j1b2(2,i,j,ipoint)
+        int2_u_grad1u_x_j1b2(j,i,ipoint,2) = int2_u_grad1u_x_j1b2(i,j,ipoint,2)
-        int2_u_grad1u_x_j1b2(3,j,i,ipoint) = int2_u_grad1u_x_j1b2(3,i,j,ipoint)
+        int2_u_grad1u_x_j1b2(j,i,ipoint,3) = int2_u_grad1u_x_j1b2(i,j,ipoint,3)
      enddo
    enddo
  enddo
  call wall_time(wall1)
-  print*, ' wall time for int2_u_grad1u_x_j1b2', wall1 - wall0
+  print*, ' wall time for int2_u_grad1u_x_j1b2 = ', wall1 - wall0
 END_PROVIDER 
@ -320,9 +326,11 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_j1b2, (ao_num, ao_num, n_points
  double precision              :: wall0, wall1
  double precision, external    :: NAI_pol_mult_erf_ao_with1s
-  provide mu_erf final_grid_points j1b_pen
+  print*, ' providing int2_u_grad1u_j1b2 ...'
  call wall_time(wall0)
  provide mu_erf final_grid_points j1b_pen
  int2_u_grad1u_j1b2 = 0.d0
 !$OMP PARALLEL DEFAULT (NONE)                                      &
--- a/src/ao_many_one_e_ints/grad2_jmu_modif_vect.irp.f
+++ b/src/ao_many_one_e_ints/grad2_jmu_modif_vect.irp.f
@ -241,7 +241,7 @@
 !
 !! ---
 !
-!BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2, (3, ao_num, ao_num, n_points_final_grid)]
+!BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2, (ao_num, ao_num, n_points_final_grid, 3)]
 !
 !  BEGIN_DOC
 !  !
@ -308,7 +308,7 @@
 !
 !          ! ---
 !
-!          int2_u_grad1u_x_j1b2(1,j,i,ipoint) += coef_fit * int_fit_v(ipoint,1)
+!          int2_u_grad1u_x_j1b2(j,i,ipoint,1) += coef_fit * int_fit_v(ipoint,1)
 !
 !          if(dabs(int_fit_v(ipoint,1)) .gt. 1d-10) then
 !            i_mask_grid1 += 1
@ -320,7 +320,7 @@
 !
 !          ! ---
 !
-!          int2_u_grad1u_x_j1b2(2,j,i,ipoint) += coef_fit * int_fit_v(ipoint,2)
+!          int2_u_grad1u_x_j1b2(j,i,ipoint,2) += coef_fit * int_fit_v(ipoint,2)
 !
 !          if(dabs(int_fit_v(ipoint,2)) .gt. 1d-10) then
 !            i_mask_grid2 += 1
@ -332,7 +332,7 @@
 !
 !          ! ---
 !
-!          int2_u_grad1u_x_j1b2(3,j,i,ipoint) += coef_fit * int_fit_v(ipoint,3)
+!          int2_u_grad1u_x_j1b2(j,i,ipoint,3) += coef_fit * int_fit_v(ipoint,3)
 !
 !          if(dabs(int_fit_v(ipoint,3)) .gt. 1d-10) then
 !            i_mask_grid3 += 1
@ -408,15 +408,15 @@
 !          call NAI_pol_x_mult_erf_ao_with1s_v(i, j, alpha_1s, centr_1s, n_points_final_grid, 1.d+9, r_mask_grid, n_points_final_grid, int_fit_v, n_points_final_grid, i_mask_grid)
 !
 !          do ipoint = 1, i_mask_grid1
-!            int2_u_grad1u_x_j1b2(1,j,i,n_mask_grid(ipoint,1)) += coef * dexp(-expo_coef_1s * dist(ipoint,1)) * int_fit_v(ipoint,1)
+!            int2_u_grad1u_x_j1b2(j,i,n_mask_grid(ipoint,1),1) += coef * dexp(-expo_coef_1s * dist(ipoint,1)) * int_fit_v(ipoint,1)
 !          enddo
 !
 !          do ipoint = 1, i_mask_grid2
-!            int2_u_grad1u_x_j1b2(2,j,i,n_mask_grid(ipoint,2)) += coef * dexp(-expo_coef_1s * dist(ipoint,2)) * int_fit_v(ipoint,2)
+!            int2_u_grad1u_x_j1b2(j,i,n_mask_grid(ipoint,2),2) += coef * dexp(-expo_coef_1s * dist(ipoint,2)) * int_fit_v(ipoint,2)
 !          enddo
 !
 !          do ipoint = 1, i_mask_grid3
-!            int2_u_grad1u_x_j1b2(3,j,i,n_mask_grid(ipoint,3)) += coef * dexp(-expo_coef_1s * dist(ipoint,3)) * int_fit_v(ipoint,3)
+!            int2_u_grad1u_x_j1b2(j,i,n_mask_grid(ipoint,3),3) += coef * dexp(-expo_coef_1s * dist(ipoint,3)) * int_fit_v(ipoint,3)
 !          enddo
 !
 !        enddo
@ -439,15 +439,15 @@
 !  do ipoint = 1, n_points_final_grid
 !    do i = 2, ao_num
 !      do j = 1, i-1
-!        int2_u_grad1u_x_j1b2(1,j,i,ipoint) = int2_u_grad1u_x_j1b2(1,i,j,ipoint)
+!        int2_u_grad1u_x_j1b2(j,i,ipoint,1) = int2_u_grad1u_x_j1b2(i,j,ipoint,1)
-!        int2_u_grad1u_x_j1b2(2,j,i,ipoint) = int2_u_grad1u_x_j1b2(2,i,j,ipoint)
+!        int2_u_grad1u_x_j1b2(j,i,ipoint,2) = int2_u_grad1u_x_j1b2(i,j,ipoint,2)
-!        int2_u_grad1u_x_j1b2(3,j,i,ipoint) = int2_u_grad1u_x_j1b2(3,i,j,ipoint)
+!        int2_u_grad1u_x_j1b2(j,i,ipoint,3) = int2_u_grad1u_x_j1b2(i,j,ipoint,3)
 !      enddo
 !    enddo
 !  enddo
 !
 !  call wall_time(wall1)
-!  print*, ' wall time for int2_u_grad1u_x_j1b2', wall1 - wall0
+!  print*, ' wall time for int2_u_grad1u_x_j1b2 =', wall1 - wall0
 !
 !END_PROVIDER
 !
--- a/src/ao_many_one_e_ints/grad_lapl_jmu_manu.irp.f
+++ b/src/ao_many_one_e_ints/grad_lapl_jmu_manu.irp.f
@ -17,7 +17,10 @@ BEGIN_PROVIDER [ double precision, v_ij_erf_rk_cst_mu_j1b_test, (ao_num, ao_num,
  double precision           :: wall0, wall1
  double precision, external :: NAI_pol_mult_erf_ao_with1s
  double precision :: sigma_ij,dist_ij_ipoint,dsqpi_3_2
-  dsqpi_3_2 = (dacos(-1.d0))**(3/2)
+
  print*, ' providing v_ij_erf_rk_cst_mu_j1b_test ...'
  dsqpi_3_2 = (dacos(-1.d0))**(1.5d0)
  provide mu_erf final_grid_points j1b_pen
  call wall_time(wall0)
@ -38,7 +41,7 @@ BEGIN_PROVIDER [ double precision, v_ij_erf_rk_cst_mu_j1b_test, (ao_num, ao_num,
    do i = 1, ao_num
      do j = i, ao_num
-        if(dabs(ao_overlap_abs_grid(j,i)).lt.1.d-12)cycle
+        if(dabs(ao_overlap_abs_grid(j,i)).lt.1.d-20)cycle
        tmp = 0.d0
        do i_1s = 1, List_comb_thr_b2_size(j,i)
@ -46,7 +49,7 @@ BEGIN_PROVIDER [ double precision, v_ij_erf_rk_cst_mu_j1b_test, (ao_num, ao_num,
          coef        = List_comb_thr_b2_coef  (i_1s,j,i)
          beta        = List_comb_thr_b2_expo  (i_1s,j,i)
          int_j1b = ao_abs_comb_b2_j1b(i_1s,j,i)
-          if(dabs(coef)*dabs(int_j1b).lt.1.d-14)cycle
+          if(dabs(coef)*dabs(int_j1b).lt.1.d-10)cycle
          B_center(1) = List_comb_thr_b2_cent(1,i_1s,j,i)
          B_center(2) = List_comb_thr_b2_cent(2,i_1s,j,i)
          B_center(3) = List_comb_thr_b2_cent(3,i_1s,j,i)
@ -85,54 +88,28 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_j1b_test, (ao_num, ao_nu
  ! int dr x phi_i(r) phi_j(r) 1s_j1b(r) (erf(mu(R) |r - R|) - 1)/|r - R|
  END_DOC
  implicit none
  integer          :: i, j, ipoint
  double precision :: wall0, wall1
  call wall_time(wall0)
  do ipoint = 1, n_points_final_grid
    do i = 1, ao_num
      do j = 1, ao_num
        x_v_ij_erf_rk_cst_mu_j1b_test(j,i,ipoint,1) = x_v_ij_erf_rk_cst_mu_tmp_j1b_test(1,j,i,ipoint)
        x_v_ij_erf_rk_cst_mu_j1b_test(j,i,ipoint,2) = x_v_ij_erf_rk_cst_mu_tmp_j1b_test(2,j,i,ipoint)
        x_v_ij_erf_rk_cst_mu_j1b_test(j,i,ipoint,3) = x_v_ij_erf_rk_cst_mu_tmp_j1b_test(3,j,i,ipoint)
      enddo
    enddo
  enddo
  call wall_time(wall1)
  print*, ' wall time for x_v_ij_erf_rk_cst_mu_j1b_test', wall1 - wall0
 END_PROVIDER 
 ! ---
 BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_tmp_j1b_test, (3, ao_num, ao_num, n_points_final_grid)]
  BEGIN_DOC
  ! int dr x phi_i(r) phi_j(r) 1s_j1b(r) (erf(mu(R) |r - R|) - 1)/|r - R|
  END_DOC
  implicit none
  integer          :: i, j, ipoint, i_1s
  double precision :: coef, beta, B_center(3), r(3), ints(3), ints_coulomb(3)
  double precision :: tmp_x, tmp_y, tmp_z
  double precision :: wall0, wall1
  double precision :: sigma_ij,dist_ij_ipoint,dsqpi_3_2,int_j1b,factor_ij_1s,beta_ij,center_ij_1s
-  dsqpi_3_2 = (dacos(-1.d0))**(3/2)
+
  print*, ' providing x_v_ij_erf_rk_cst_mu_j1b_test ...'
  dsqpi_3_2 = (dacos(-1.d0))**(1.5d0)
  provide expo_erfc_mu_gauss ao_prod_sigma ao_prod_center
  call wall_time(wall0)
-  x_v_ij_erf_rk_cst_mu_tmp_j1b_test = 0.d0
+  x_v_ij_erf_rk_cst_mu_j1b_test = 0.d0
 !$OMP PARALLEL DEFAULT (NONE)                                                        &
 !$OMP PRIVATE (ipoint, i, j, i_1s, r, coef, beta, B_center, ints, ints_coulomb,      & 
 !$OMP          int_j1b, tmp_x, tmp_y, tmp_z,factor_ij_1s,beta_ij,center_ij_1s)       & 
 !$OMP SHARED  (n_points_final_grid, ao_num, List_comb_thr_b2_size, final_grid_points,&
 !$OMP          List_comb_thr_b2_coef, List_comb_thr_b2_expo, List_comb_thr_b2_cent,  &
- !$OMP          x_v_ij_erf_rk_cst_mu_tmp_j1b_test, mu_erf,ao_abs_comb_b2_j1b,         &
+ !$OMP          x_v_ij_erf_rk_cst_mu_j1b_test, mu_erf,ao_abs_comb_b2_j1b,         &
 !$OMP          ao_overlap_abs_grid,ao_prod_center,ao_prod_sigma)
 ! !$OMP          ao_overlap_abs_grid,ao_prod_center,ao_prod_sigma,dsqpi_3_2,expo_erfc_mu_gauss)
 !$OMP DO
@ -143,7 +120,7 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_tmp_j1b_test, (3, ao_num
    do i = 1, ao_num
      do j = i, ao_num
-        if(dabs(ao_overlap_abs_grid(j,i)).lt.1.d-12)cycle
+        if(dabs(ao_overlap_abs_grid(j,i)).lt.1.d-10)cycle
        tmp_x = 0.d0
        tmp_y = 0.d0
@ -153,7 +130,7 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_tmp_j1b_test, (3, ao_num
          coef        = List_comb_thr_b2_coef  (i_1s,j,i)
          beta        = List_comb_thr_b2_expo  (i_1s,j,i)
          int_j1b = ao_abs_comb_b2_j1b(i_1s,j,i)
-          if(dabs(coef)*dabs(int_j1b).lt.1.d-14)cycle
+          if(dabs(coef)*dabs(int_j1b).lt.1.d-10)cycle
          B_center(1) = List_comb_thr_b2_cent(1,i_1s,j,i)
          B_center(2) = List_comb_thr_b2_cent(2,i_1s,j,i)
          B_center(3) = List_comb_thr_b2_cent(3,i_1s,j,i)
@ -164,7 +141,7 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_tmp_j1b_test, (3, ao_num
 !           call gaussian_product(expo_erfc_mu_gauss,r,     &
 !                ao_prod_sigma(j,i),ao_prod_center(1,j,i),  & 
 !                factor_ij_1s,beta_ij,center_ij_1s)
-!           if(dabs(coef * factor_ij_1s*int_j1b*10.d0 * dsqpi_3_2 * beta_ij**(-3/2)).lt.1.d-10)cycle 
+!           if(dabs(coef * factor_ij_1s*int_j1b*10.d0 * dsqpi_3_2 * beta_ij**(-1.5d0)).lt.1.d-10)cycle 
 !          endif
          call NAI_pol_x_mult_erf_ao_with1s(i, j, beta, B_center, mu_erf, r, ints        )
          call NAI_pol_x_mult_erf_ao_with1s(i, j, beta, B_center,  1.d+9, r, ints_coulomb)
@ -174,9 +151,9 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_tmp_j1b_test, (3, ao_num
          tmp_z += coef * (ints(3) - ints_coulomb(3))
        enddo
-        x_v_ij_erf_rk_cst_mu_tmp_j1b_test(1,j,i,ipoint) = tmp_x
+        x_v_ij_erf_rk_cst_mu_j1b_test(j,i,ipoint,1) = tmp_x
-        x_v_ij_erf_rk_cst_mu_tmp_j1b_test(2,j,i,ipoint) = tmp_y
+        x_v_ij_erf_rk_cst_mu_j1b_test(j,i,ipoint,2) = tmp_y
-        x_v_ij_erf_rk_cst_mu_tmp_j1b_test(3,j,i,ipoint) = tmp_z
+        x_v_ij_erf_rk_cst_mu_j1b_test(j,i,ipoint,3) = tmp_z
      enddo
    enddo
  enddo
@ -186,15 +163,15 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_tmp_j1b_test, (3, ao_num
  do ipoint = 1, n_points_final_grid
    do i = 2, ao_num
      do j = 1, i-1
-        x_v_ij_erf_rk_cst_mu_tmp_j1b_test(1,j,i,ipoint) = x_v_ij_erf_rk_cst_mu_tmp_j1b_test(1,i,j,ipoint)
+        x_v_ij_erf_rk_cst_mu_j1b_test(j,i,ipoint,1) = x_v_ij_erf_rk_cst_mu_j1b_test(i,j,ipoint,1)
-        x_v_ij_erf_rk_cst_mu_tmp_j1b_test(2,j,i,ipoint) = x_v_ij_erf_rk_cst_mu_tmp_j1b_test(2,i,j,ipoint)
+        x_v_ij_erf_rk_cst_mu_j1b_test(j,i,ipoint,2) = x_v_ij_erf_rk_cst_mu_j1b_test(i,j,ipoint,2)
-        x_v_ij_erf_rk_cst_mu_tmp_j1b_test(3,j,i,ipoint) = x_v_ij_erf_rk_cst_mu_tmp_j1b_test(3,i,j,ipoint)
+        x_v_ij_erf_rk_cst_mu_j1b_test(j,i,ipoint,3) = x_v_ij_erf_rk_cst_mu_j1b_test(i,j,ipoint,3)
      enddo
    enddo
  enddo
  call wall_time(wall1)
-  print*, ' wall time for x_v_ij_erf_rk_cst_mu_tmp_j1b_test', wall1 - wall0
+  print*, ' wall time for x_v_ij_erf_rk_cst_mu_j1b_test', wall1 - wall0
 END_PROVIDER 
@ -218,7 +195,10 @@ BEGIN_PROVIDER [ double precision, v_ij_u_cst_mu_j1b_test, (ao_num, ao_num, n_po
  double precision, external :: overlap_gauss_r12_ao_with1s
  double precision :: sigma_ij,dist_ij_ipoint,dsqpi_3_2,int_j1b
-  dsqpi_3_2 = (dacos(-1.d0))**(3/2)
+
  print*, ' providing v_ij_u_cst_mu_j1b_test ...'
  dsqpi_3_2 = (dacos(-1.d0))**(1.5d0)
  provide mu_erf final_grid_points j1b_pen
  call wall_time(wall0)
@ -244,7 +224,7 @@ BEGIN_PROVIDER [ double precision, v_ij_u_cst_mu_j1b_test, (ao_num, ao_num, n_po
    do i = 1, ao_num
      do j = i, ao_num
-        if(dabs(ao_overlap_abs_grid(j,i)).lt.1.d-12)cycle
+        if(dabs(ao_overlap_abs_grid(j,i)).lt.1.d-20)cycle
        tmp = 0.d0
        do i_1s = 1, List_comb_thr_b2_size(j,i)
@ -252,7 +232,7 @@ BEGIN_PROVIDER [ double precision, v_ij_u_cst_mu_j1b_test, (ao_num, ao_num, n_po
          coef        = List_comb_thr_b2_coef  (i_1s,j,i)
          beta        = List_comb_thr_b2_expo  (i_1s,j,i)
          int_j1b = ao_abs_comb_b2_j1b(i_1s,j,i)
-          if(dabs(coef)*dabs(int_j1b).lt.1.d-14)cycle
+          if(dabs(coef)*dabs(int_j1b).lt.1.d-10)cycle
          B_center(1) = List_comb_thr_b2_cent(1,i_1s,j,i)
          B_center(2) = List_comb_thr_b2_cent(2,i_1s,j,i)
          B_center(3) = List_comb_thr_b2_cent(3,i_1s,j,i)
@ -311,7 +291,7 @@ BEGIN_PROVIDER [ double precision, v_ij_u_cst_mu_j1b_ng_1_test, (ao_num, ao_num,
  double precision, external :: overlap_gauss_r12_ao_with1s
  double precision :: sigma_ij,dist_ij_ipoint,dsqpi_3_2,int_j1b
-  dsqpi_3_2 = (dacos(-1.d0))**(3/2)
+  dsqpi_3_2 = (dacos(-1.d0))**(1.5d0)
  provide mu_erf final_grid_points j1b_pen
  call wall_time(wall0)
@ -337,7 +317,7 @@ BEGIN_PROVIDER [ double precision, v_ij_u_cst_mu_j1b_ng_1_test, (ao_num, ao_num,
    do i = 1, ao_num
      do j = i, ao_num
-        if(dabs(ao_overlap_abs_grid(j,i)).lt.1.d-12)cycle
+        if(dabs(ao_overlap_abs_grid(j,i)).lt.1.d-20)cycle
        tmp = 0.d0
        do i_1s = 1, List_comb_thr_b2_size(j,i)
@ -345,7 +325,7 @@ BEGIN_PROVIDER [ double precision, v_ij_u_cst_mu_j1b_ng_1_test, (ao_num, ao_num,
          coef        = List_comb_thr_b2_coef  (i_1s,j,i)
          beta        = List_comb_thr_b2_expo  (i_1s,j,i)
          int_j1b = ao_abs_comb_b2_j1b(i_1s,j,i)
-          if(dabs(coef)*dabs(int_j1b).lt.1.d-14)cycle
+          if(dabs(coef)*dabs(int_j1b).lt.1.d-10)cycle
          B_center(1) = List_comb_thr_b2_cent(1,i_1s,j,i)
          B_center(2) = List_comb_thr_b2_cent(2,i_1s,j,i)
          B_center(3) = List_comb_thr_b2_cent(3,i_1s,j,i)
--- a/src/ao_many_one_e_ints/grad_lapl_jmu_modif.irp.f
+++ b/src/ao_many_one_e_ints/grad_lapl_jmu_modif.irp.f
@ -17,9 +17,11 @@ BEGIN_PROVIDER [ double precision, v_ij_erf_rk_cst_mu_j1b, (ao_num, ao_num, n_po
  double precision           :: wall0, wall1
  double precision, external :: NAI_pol_mult_erf_ao_with1s
-  provide mu_erf final_grid_points j1b_pen
+  print *, ' providing v_ij_erf_rk_cst_mu_j1b ...'
  call wall_time(wall0)
  provide mu_erf final_grid_points j1b_pen
  v_ij_erf_rk_cst_mu_j1b = 0.d0
 !$OMP PARALLEL DEFAULT (NONE)                                                         &
@ -99,51 +101,23 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_j1b, (ao_num, ao_num, n_
  ! int dr x phi_i(r) phi_j(r) 1s_j1b(r) (erf(mu(R) |r - R|) - 1)/|r - R|
  END_DOC
  implicit none
  integer          :: i, j, ipoint
  double precision :: wall0, wall1
  call wall_time(wall0)
  do ipoint = 1, n_points_final_grid
    do i = 1, ao_num
      do j = 1, ao_num
        x_v_ij_erf_rk_cst_mu_j1b(j,i,ipoint,1) = x_v_ij_erf_rk_cst_mu_tmp_j1b(1,j,i,ipoint)
        x_v_ij_erf_rk_cst_mu_j1b(j,i,ipoint,2) = x_v_ij_erf_rk_cst_mu_tmp_j1b(2,j,i,ipoint)
        x_v_ij_erf_rk_cst_mu_j1b(j,i,ipoint,3) = x_v_ij_erf_rk_cst_mu_tmp_j1b(3,j,i,ipoint)
      enddo
    enddo
  enddo
  call wall_time(wall1)
  print*, ' wall time for x_v_ij_erf_rk_cst_mu_j1b', wall1 - wall0
 END_PROVIDER 
 ! ---
 BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_tmp_j1b, (3, ao_num, ao_num, n_points_final_grid)]
  BEGIN_DOC
  ! int dr x phi_i(r) phi_j(r) 1s_j1b(r) (erf(mu(R) |r - R|) - 1)/|r - R|
  END_DOC
  implicit none
  integer          :: i, j, ipoint, i_1s
  double precision :: coef, beta, B_center(3), r(3), ints(3), ints_coulomb(3)
  double precision :: tmp_x, tmp_y, tmp_z
  double precision :: wall0, wall1
  print*, ' providing x_v_ij_erf_rk_cst_mu_j1b ...'
  call wall_time(wall0)
-  x_v_ij_erf_rk_cst_mu_tmp_j1b = 0.d0
+  x_v_ij_erf_rk_cst_mu_j1b = 0.d0
 !$OMP PARALLEL DEFAULT (NONE)                                                        &
 !$OMP PRIVATE (ipoint, i, j, i_1s, r, coef, beta, B_center, ints, ints_coulomb,      & 
 !$OMP          tmp_x, tmp_y, tmp_z)                                                  & 
 !$OMP SHARED  (n_points_final_grid, ao_num, List_all_comb_b2_size, final_grid_points,&
 !$OMP          List_all_comb_b2_coef, List_all_comb_b2_expo, List_all_comb_b2_cent,  &
- !$OMP          x_v_ij_erf_rk_cst_mu_tmp_j1b, mu_erf)
+ !$OMP          x_v_ij_erf_rk_cst_mu_j1b, mu_erf)
 !$OMP DO
  !do ipoint = 1, 10
  do ipoint = 1, n_points_final_grid
@ -195,9 +169,9 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_tmp_j1b, (3, ao_num, ao_
        ! ---
-        x_v_ij_erf_rk_cst_mu_tmp_j1b(1,j,i,ipoint) = tmp_x
+        x_v_ij_erf_rk_cst_mu_j1b(j,i,ipoint,1) = tmp_x
-        x_v_ij_erf_rk_cst_mu_tmp_j1b(2,j,i,ipoint) = tmp_y
+        x_v_ij_erf_rk_cst_mu_j1b(j,i,ipoint,2) = tmp_y
-        x_v_ij_erf_rk_cst_mu_tmp_j1b(3,j,i,ipoint) = tmp_z
+        x_v_ij_erf_rk_cst_mu_j1b(j,i,ipoint,3) = tmp_z
      enddo
    enddo
  enddo
@ -207,15 +181,15 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_tmp_j1b, (3, ao_num, ao_
  do ipoint = 1, n_points_final_grid
    do i = 2, ao_num
      do j = 1, i-1
-        x_v_ij_erf_rk_cst_mu_tmp_j1b(1,j,i,ipoint) = x_v_ij_erf_rk_cst_mu_tmp_j1b(1,i,j,ipoint)
+        x_v_ij_erf_rk_cst_mu_j1b(j,i,ipoint,1) = x_v_ij_erf_rk_cst_mu_j1b(i,j,ipoint,1)
-        x_v_ij_erf_rk_cst_mu_tmp_j1b(2,j,i,ipoint) = x_v_ij_erf_rk_cst_mu_tmp_j1b(2,i,j,ipoint)
+        x_v_ij_erf_rk_cst_mu_j1b(j,i,ipoint,2) = x_v_ij_erf_rk_cst_mu_j1b(i,j,ipoint,2)
-        x_v_ij_erf_rk_cst_mu_tmp_j1b(3,j,i,ipoint) = x_v_ij_erf_rk_cst_mu_tmp_j1b(3,i,j,ipoint)
+        x_v_ij_erf_rk_cst_mu_j1b(j,i,ipoint,3) = x_v_ij_erf_rk_cst_mu_j1b(i,j,ipoint,3)
      enddo
    enddo
  enddo
  call wall_time(wall1)
-  print*, ' wall time for x_v_ij_erf_rk_cst_mu_tmp_j1b', wall1 - wall0
+  print*, ' wall time for x_v_ij_erf_rk_cst_mu_j1b =', wall1 - wall0
 END_PROVIDER 
@ -239,9 +213,11 @@ BEGIN_PROVIDER [ double precision, v_ij_u_cst_mu_j1b, (ao_num, ao_num, n_points_
  double precision, external :: overlap_gauss_r12_ao_with1s
-  provide mu_erf final_grid_points j1b_pen
+  print*, ' providing v_ij_u_cst_mu_j1b ...'
  call wall_time(wall0)
  provide mu_erf final_grid_points j1b_pen
  v_ij_u_cst_mu_j1b = 0.d0
 !$OMP PARALLEL DEFAULT (NONE)                                      &
--- a/src/ao_many_one_e_ints/grad_related_ints.irp.f
+++ b/src/ao_many_one_e_ints/grad_related_ints.irp.f
@ -17,6 +17,8 @@ BEGIN_PROVIDER [ double precision, v_ij_erf_rk_cst_mu, (ao_num, ao_num, n_points
  double precision :: NAI_pol_mult_erf_ao
  print*, ' providing v_ij_erf_rk_cst_mu ...'
  provide mu_erf final_grid_points 
  call wall_time(wall0)
@ -54,7 +56,7 @@ BEGIN_PROVIDER [ double precision, v_ij_erf_rk_cst_mu, (ao_num, ao_num, n_points
  enddo
  call wall_time(wall1)
-  print*, ' wall time for v_ij_erf_rk_cst_mu  ', wall1 - wall0
+  print*, ' wall time for v_ij_erf_rk_cst_mu = ', wall1 - wall0
 END_PROVIDER 
@ -73,6 +75,8 @@ BEGIN_PROVIDER [ double precision, v_ij_erf_rk_cst_mu_transp, (n_points_final_gr
  double precision :: wall0, wall1
  double precision :: NAI_pol_mult_erf_ao
  print *, ' providing v_ij_erf_rk_cst_mu_transp ...'
  provide mu_erf final_grid_points 
  call wall_time(wall0)
@ -107,7 +111,7 @@ BEGIN_PROVIDER [ double precision, v_ij_erf_rk_cst_mu_transp, (n_points_final_gr
  enddo
  call wall_time(wall1)
-  print *, ' wall time for v_ij_erf_rk_cst_mu_transp  ', wall1 - wall0
+  print *, ' wall time for v_ij_erf_rk_cst_mu_transp = ', wall1 - wall0
 END_PROVIDER 
@ -124,6 +128,8 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_tmp, (3, ao_num, ao_num,
  double precision :: r(3), ints(3), ints_coulomb(3)
  double precision :: wall0, wall1
  print*, ' providing x_v_ij_erf_rk_cst_mu_tmp ...'
  call wall_time(wall0)
 !$OMP PARALLEL                                 &
@ -162,7 +168,7 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_tmp, (3, ao_num, ao_num,
  enddo
  call wall_time(wall1)
-  print*, ' wall time for x_v_ij_erf_rk_cst_mu_tmp', wall1 - wall0
+  print *, ' wall time for x_v_ij_erf_rk_cst_mu_tmp = ', wall1 - wall0
 END_PROVIDER 
@ -178,6 +184,8 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu, (ao_num, ao_num,n_point
  integer          :: i, j, ipoint
  double precision :: wall0, wall1
  print *, ' providing x_v_ij_erf_rk_cst_mu ...'
  call wall_time(wall0)
  do ipoint = 1, n_points_final_grid
@ -191,7 +199,7 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu, (ao_num, ao_num,n_point
  enddo
  call wall_time(wall1)
-  print *, ' wall time for x_v_ij_erf_rk_cst_mu', wall1 - wall0
+  print *, ' wall time for x_v_ij_erf_rk_cst_mu = ', wall1 - wall0
 END_PROVIDER 
@ -207,6 +215,8 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_transp, (ao_num, ao_num,
  integer          :: i, j, ipoint
  double precision :: wall0, wall1
  print *, ' providing x_v_ij_erf_rk_cst_mu_transp ...'
  call wall_time(wall0)
  do ipoint = 1, n_points_final_grid
@ -220,7 +230,7 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_transp, (ao_num, ao_num,
  enddo
  call wall_time(wall1)
-  print *, ' wall time for x_v_ij_erf_rk_cst_mu_transp', wall1 - wall0
+  print *, ' wall time for x_v_ij_erf_rk_cst_mu_transp = ', wall1 - wall0
 END_PROVIDER 
@ -236,6 +246,8 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_transp_bis, (n_points_fi
  integer          :: i, j, ipoint
  double precision :: wall0, wall1
  print *, ' providing x_v_ij_erf_rk_cst_mu_transp_bis ...'
  call wall_time(wall0)
  do i = 1, ao_num
@ -249,7 +261,7 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_transp_bis, (n_points_fi
  enddo
  call wall_time(wall1)
-  print *, ' wall time for x_v_ij_erf_rk_cst_mu_transp_bis', wall1 - wall0
+  print *, ' wall time for x_v_ij_erf_rk_cst_mu_transp_bis = ', wall1 - wall0
 END_PROVIDER 
@ -268,6 +280,8 @@ BEGIN_PROVIDER [ double precision, d_dx_v_ij_erf_rk_cst_mu_tmp, (3, n_points_fin
 double precision :: r(3), ints(3), ints_coulomb(3)
 double precision :: wall0, wall1
  print *, ' providing d_dx_v_ij_erf_rk_cst_mu_tmp ...'
  call wall_time(wall0)
 !$OMP PARALLEL                                 &
@ -295,7 +309,7 @@ BEGIN_PROVIDER [ double precision, d_dx_v_ij_erf_rk_cst_mu_tmp, (3, n_points_fin
 !$OMP END PARALLEL
  call wall_time(wall1)
-  print *, ' wall time for d_dx_v_ij_erf_rk_cst_mu_tmp', wall1 - wall0
+  print *, ' wall time for d_dx_v_ij_erf_rk_cst_mu_tmp = ', wall1 - wall0
 END_PROVIDER 
@ -315,6 +329,8 @@ BEGIN_PROVIDER [ double precision, d_dx_v_ij_erf_rk_cst_mu, (n_points_final_grid
  integer          :: i, j, ipoint
  double precision :: wall0, wall1
  print *, ' providing d_dx_v_ij_erf_rk_cst_mu ...'
  call wall_time(wall0)
  do i = 1, ao_num
    do j = 1, ao_num
@ -327,7 +343,7 @@ BEGIN_PROVIDER [ double precision, d_dx_v_ij_erf_rk_cst_mu, (n_points_final_grid
  enddo
  call wall_time(wall1)
-  print *, ' wall time for d_dx_v_ij_erf_rk_cst_mu', wall1 - wall0
+  print *, ' wall time for d_dx_v_ij_erf_rk_cst_mu = ', wall1 - wall0
 END_PROVIDER 
@ -348,6 +364,8 @@ BEGIN_PROVIDER [ double precision, x_d_dx_v_ij_erf_rk_cst_mu_tmp, (3, n_points_f
  double precision :: r(3), ints(3), ints_coulomb(3)
  double precision :: wall0, wall1
  print *, ' providing x_d_dx_v_ij_erf_rk_cst_mu_tmp ...'
  call wall_time(wall0)
 !$OMP PARALLEL                                 &
@ -375,7 +393,7 @@ BEGIN_PROVIDER [ double precision, x_d_dx_v_ij_erf_rk_cst_mu_tmp, (3, n_points_f
 !$OMP END PARALLEL
  call wall_time(wall1)
-  print *, ' wall time for x_d_dx_v_ij_erf_rk_cst_mu_tmp', wall1 - wall0
+  print *, ' wall time for x_d_dx_v_ij_erf_rk_cst_mu_tmp = ', wall1 - wall0
 END_PROVIDER 
@ -395,6 +413,8 @@ BEGIN_PROVIDER [ double precision, x_d_dx_v_ij_erf_rk_cst_mu, (n_points_final_gr
  integer          :: i, j, ipoint
  double precision :: wall0, wall1
  print *, ' providing x_d_dx_v_ij_erf_rk_cst_mu ...'
  call wall_time(wall0)
  do i = 1, ao_num
@ -408,7 +428,7 @@ BEGIN_PROVIDER [ double precision, x_d_dx_v_ij_erf_rk_cst_mu, (n_points_final_gr
  enddo
  call wall_time(wall1)
-  print *, ' wall time for x_d_dx_v_ij_erf_rk_cst_mu', wall1 - wall0
+  print *, ' wall time for x_d_dx_v_ij_erf_rk_cst_mu = ', wall1 - wall0
 END_PROVIDER 
--- a/src/ao_many_one_e_ints/listj1b.irp.f
+++ b/src/ao_many_one_e_ints/listj1b.irp.f
@ -102,6 +102,12 @@ END_PROVIDER
    List_all_comb_b2_coef(i) = (-1.d0)**dble(phase) * dexp(-List_all_comb_b2_coef(i))
  enddo
  print *, ' coeff, expo & cent of list b2'
  do i = 1, List_all_comb_b2_size
    print*, i, List_all_comb_b2_coef(i), List_all_comb_b2_expo(i)
    print*, List_all_comb_b2_cent(1,i), List_all_comb_b2_cent(2,i), List_all_comb_b2_cent(3,i)
  enddo
 END_PROVIDER
 ! ---
@ -219,9 +225,11 @@ END_PROVIDER
    List_all_comb_b3_coef(i) = (-1.d0)**dble(phase) * facto * dexp(-List_all_comb_b3_coef(i))
  enddo
-  print *, ' 1st coeff & expo of lists'
+  print *, ' coeff, expo & cent of list b3'
-  print*, List_all_comb_b2_coef(1), List_all_comb_b2_expo(1)
+  do i = 1, List_all_comb_b3_size
-  print*, List_all_comb_b3_coef(1), List_all_comb_b3_expo(1)
+    print*, i, List_all_comb_b3_coef(i), List_all_comb_b3_expo(i)
    print*, List_all_comb_b3_cent(1,i), List_all_comb_b3_cent(2,i), List_all_comb_b3_cent(3,i)
  enddo
 END_PROVIDER
--- a/src/ao_one_e_ints/pot_ao_ints.irp.f
+++ b/src/ao_one_e_ints/pot_ao_ints.irp.f
@ -18,6 +18,8 @@ BEGIN_PROVIDER [ double precision, ao_integrals_n_e, (ao_num,ao_num)]
  double precision :: A_center(3),B_center(3),C_center(3)
  double precision :: overlap_x,overlap_y,overlap_z,overlap,dx,NAI_pol_mult
  ao_integrals_n_e = 0.d0
  if (read_ao_integrals_n_e) then
    call ezfio_get_ao_one_e_ints_ao_integrals_n_e(ao_integrals_n_e)
@ -36,8 +38,6 @@ BEGIN_PROVIDER [ double precision, ao_integrals_n_e, (ao_num,ao_num)]
    else
      ao_integrals_n_e = 0.d0
      !$OMP PARALLEL                                                   &
          !$OMP DEFAULT (NONE)                                         &
          !$OMP PRIVATE (i,j,k,l,m,alpha,beta,A_center,B_center,C_center,power_A,power_B,&
--- a/src/ao_one_e_ints/pseudopot.f90
+++ b/src/ao_one_e_ints/pseudopot.f90
@ -1950,26 +1950,26 @@ xq(17)=-3.34785456738322
 xq(18)=-3.94476404011563
 xq(19)=-4.60368244955074
 xq(20)=-5.38748089001123
-wq(1)=  2.229393645534151E-013
+wq(1)=  2.229393645534151D-013
-wq(2)=  4.399340992273176E-010
+wq(2)=  4.399340992273176D-010
-wq(3)=  1.086069370769280E-007
+wq(3)=  1.086069370769280D-007
-wq(4)=  7.802556478532063E-006
+wq(4)=  7.802556478532063D-006
-wq(5)=  2.283386360163528E-004
+wq(5)=  2.283386360163528D-004
-wq(6)=  3.243773342237853E-003
+wq(6)=  3.243773342237853D-003
-wq(7)=  2.481052088746362E-002
+wq(7)=  2.481052088746362D-002
 wq(8)=  0.109017206020022
 wq(9)=  0.286675505362834
 wq(10)= 0.462243669600610
 wq(11)= 0.462243669600610
 wq(12)= 0.286675505362834
 wq(13)= 0.109017206020022
-wq(14)= 2.481052088746362E-002
+wq(14)= 2.481052088746362D-002
-wq(15)= 3.243773342237853E-003
+wq(15)= 3.243773342237853D-003
-wq(16)= 2.283386360163528E-004
+wq(16)= 2.283386360163528D-004
-wq(17)= 7.802556478532063E-006
+wq(17)= 7.802556478532063D-006
-wq(18)= 1.086069370769280E-007
+wq(18)= 1.086069370769280D-007
-wq(19)= 4.399340992273176E-010
+wq(19)= 4.399340992273176D-010
-wq(20)= 2.229393645534151E-013
+wq(20)= 2.229393645534151D-013
      npts=20
 !      call gauher(xq,wq,npts)
--- a/src/ao_tc_eff_map/fit_j.irp.f
+++ b/src/ao_tc_eff_map/fit_j.irp.f
@ -123,6 +123,36 @@ END_PROVIDER
      expo_gauss_j_mu_x(i) = tmp * expo_gauss_j_mu_x(i)
    enddo
  elseif(ng_fit_jast .eq. 7) then
    coef_gauss_j_mu_x = (/ -0.01756495d0 , -0.01023623d0  , -0.06548959d0  , -0.03539446d0  , -0.17150646d0  , -0.15071096d0  , -0.11326834d0   /)
    expo_gauss_j_mu_x = (/ 9.88572565d+02,  1.21363371d+04,  3.69794870d+01,  1.67364529d+02,  3.03962934d+00,  1.27854005d+00,  9.76383343d+00 /)
    tmp = mu_erf * mu_erf
    do i = 1, ng_fit_jast
      expo_gauss_j_mu_x(i) = tmp * expo_gauss_j_mu_x(i)
    enddo
  elseif(ng_fit_jast .eq. 8) then
    coef_gauss_j_mu_x = (/ -0.11489205d0 , -0.16008968d0 , -0.12892456d0 , -0.04250838d0 , -0.0718451d0  , -0.02394051d0 , -0.00913353d0 , -0.01285182d0  /)
    expo_gauss_j_mu_x = (/ 6.97632442d+00, 2.56010878d+00, 1.22760977d+00, 7.47697124d+01, 2.16104215d+01, 2.96549728d+02, 1.40773328d+04, 1.43335159d+03 /)
    tmp = mu_erf * mu_erf
    do i = 1, ng_fit_jast
      expo_gauss_j_mu_x(i) = tmp * expo_gauss_j_mu_x(i)
    enddo
  !elseif(ng_fit_jast .eq. 9) then
  !  coef_gauss_j_mu_x = (/ /)
  !  expo_gauss_j_mu_x = (/ /)
  !  tmp = mu_erf * mu_erf
  !  do i = 1, ng_fit_jast
  !    expo_gauss_j_mu_x(i) = tmp * expo_gauss_j_mu_x(i)
  !  enddo
  elseif(ng_fit_jast .eq. 20) then
    ASSERT(n_max_fit_slat == 20)
@ -224,6 +254,36 @@ END_PROVIDER
      expo_gauss_j_mu_x_2(i) = tmp * expo_gauss_j_mu_x_2(i)
    enddo
  elseif(ng_fit_jast .eq. 7) then
    coef_gauss_j_mu_x_2 = (/ 0.05202849d0  , 0.01031081d0  , 0.04699157d0  , 0.01451002d0  , 0.07442576d0  , 0.02692033d0  , 0.09311842d0   /)
    expo_gauss_j_mu_x_2 = (/ 3.04469415d+00, 1.40682034d+04, 7.45960945d+01, 1.43067466d+03, 2.16815661d+01, 2.95750306d+02, 7.23471236d+00 /)
    tmp = mu_erf * mu_erf
    do i = 1, ng_fit_jast
      expo_gauss_j_mu_x_2(i) = tmp * expo_gauss_j_mu_x_2(i)
    enddo
  elseif(ng_fit_jast .eq. 8) then
    coef_gauss_j_mu_x_2 = (/ 0.00942115d0  , 0.07332421d0  , 0.0508308d0   , 0.08204949d0  , 0.0404099d0   , 0.03201288d0  , 0.01911313d0  , 0.01114732d0   /)
    expo_gauss_j_mu_x_2 = (/ 1.56957321d+04, 1.52867810d+01, 4.36016903d+01, 5.96818956d+00, 2.85535269d+00, 1.36064008d+02, 4.71968910d+02, 1.92022350d+03 /)
    tmp = mu_erf * mu_erf
    do i = 1, ng_fit_jast
      expo_gauss_j_mu_x_2(i) = tmp * expo_gauss_j_mu_x_2(i)
    enddo
  !elseif(ng_fit_jast .eq. 9) then
  !  coef_gauss_j_mu_x_2 = (/  /)
  !  expo_gauss_j_mu_x_2 = (/  /)
  !  
  !  tmp = mu_erf * mu_erf
  !  do i = 1, ng_fit_jast
  !    expo_gauss_j_mu_x_2(i) = tmp * expo_gauss_j_mu_x_2(i)
  !  enddo
  elseif(ng_fit_jast .eq. 20) then
    ASSERT(n_max_fit_slat == 20)
@ -328,6 +388,36 @@ END_PROVIDER
      expo_gauss_j_mu_1_erf(i) = tmp * expo_gauss_j_mu_1_erf(i)
    enddo
  elseif(ng_fit_jast .eq. 7) then
    coef_gauss_j_mu_1_erf = (/ -0.11853067d0 , -0.01522824d0  , -0.07419098d0  , -0.022202d0    , -0.12242283d0  , -0.04177571d0  , -0.16983107d0  /)
    expo_gauss_j_mu_1_erf = (/ 2.74057056d+00,  1.37626591d+04,  6.65578663d+01,  1.34693031d+03,  1.90547699d+01,  2.69445390d+02,  6.31845879d+00/)
    tmp = mu_erf * mu_erf
    do i = 1, ng_fit_jast
      expo_gauss_j_mu_1_erf(i) = tmp * expo_gauss_j_mu_1_erf(i)
    enddo
  elseif(ng_fit_jast .eq. 8) then
    coef_gauss_j_mu_1_erf = (/ -0.12263328d0 , -0.04965255d0 , -0.15463564d0 , -0.09675781d0 , -0.0807023d0  , -0.02923298d0 , -0.01381381d0 , -0.01675923d0  /)
    expo_gauss_j_mu_1_erf = (/ 1.36101994d+01, 1.24908367d+02, 5.29061388d+00, 2.60692516d+00, 3.93396935d+01, 4.43071610d+02, 1.54902240d+04, 1.85170446d+03 /)
    tmp = mu_erf * mu_erf
    do i = 1, ng_fit_jast
      expo_gauss_j_mu_1_erf(i) = tmp * expo_gauss_j_mu_1_erf(i)
    enddo
  !elseif(ng_fit_jast .eq. 9) then
  !  coef_gauss_j_mu_1_erf = (/  /)
  !  expo_gauss_j_mu_1_erf = (/  /)
  !  tmp = mu_erf * mu_erf
  !  do i = 1, ng_fit_jast
  !    expo_gauss_j_mu_1_erf(i) = tmp * expo_gauss_j_mu_1_erf(i)
  !  enddo
  elseif(ng_fit_jast .eq. 20) then
    ASSERT(n_max_fit_slat == 20)
--- a/src/ao_tc_eff_map/potential.irp.f
+++ b/src/ao_tc_eff_map/potential.irp.f
@ -1,5 +1,7 @@
 ! ---
 BEGIN_PROVIDER [integer, n_gauss_eff_pot]
- implicit none
+
  BEGIN_DOC
  ! number of gaussians to represent the effective potential :
  !
@ -7,20 +9,31 @@ BEGIN_PROVIDER [integer, n_gauss_eff_pot]
  !
  ! Here (1 - erf(mu*r12))^2 is expanded in Gaussians as Eqs A11-A20 in JCP 154, 084119 (2021)
  !
  ! The value is ng_fit_jast + 1: expo_gauss_eff_pot / coef_gauss_eff_pot fill the
  ! first ng_fit_jast elements from the fit and store one additional Gaussian at
  ! index ng_fit_jast+1.
  END_DOC
- n_gauss_eff_pot = n_max_fit_slat + 1
+
  implicit none
  n_gauss_eff_pot = ng_fit_jast + 1
 END_PROVIDER 
 ! ---
 BEGIN_PROVIDER [integer, n_gauss_eff_pot_deriv]
- implicit none
+
  BEGIN_DOC
  ! V(r12) = -(1 - erf(mu*r12))^2 is expanded in Gaussians as Eqs A11-A20 in JCP 154, 084119 (2021)
  !
  ! This provider holds the number of Gaussians used for that expansion; it is set to ng_fit_jast.
  END_DOC
- n_gauss_eff_pot_deriv = n_max_fit_slat 
+
  implicit none
  n_gauss_eff_pot_deriv = ng_fit_jast
 END_PROVIDER 
 ! ---
 BEGIN_PROVIDER [double precision, expo_gauss_eff_pot, (n_gauss_eff_pot)]
 &BEGIN_PROVIDER [double precision, coef_gauss_eff_pot, (n_gauss_eff_pot)]
- implicit none
+
  BEGIN_DOC
  ! Coefficients and exponents of the Fit on Gaussians of V(X) = -(1 - erf(mu*X))^2 + 1/(\sqrt(pi)mu) * exp(-(mu*X)^2)
  !
@ -28,32 +41,39 @@ END_PROVIDER
  !
  ! Relies on the fit proposed in Eqs A11-A20 in JCP 154, 084119 (2021)
  !
  ! NOTE(review): the code below multiplies the fitted coefficients by -0.25 and uses
  ! mu_erf * inv_sq_pi as the coefficient of the extra Gaussian, whereas the formula
  ! above shows no 1/4 factor and a 1/(\sqrt(pi)mu) prefactor -- confirm which is intended.
  END_DOC
  include 'constants.include.F'
  implicit none
  integer :: i
  ! Fitted part: -0.25 * (1 - erf(mu*x))^2, copied from expo_gauss_1_erf_x_2 / coef_gauss_1_erf_x_2.
  ! The loop bound is ng_fit_jast (the removed line used n_max_fit_slat).
- do i = 1, n_max_fit_slat
+  do i = 1, ng_fit_jast
   expo_gauss_eff_pot(i) = expo_gauss_1_erf_x_2(i) 
   coef_gauss_eff_pot(i) = -0.25d0 * coef_gauss_1_erf_x_2(i) ! -1/4 * (1 - erf(mu*x))^2
  enddo
  ! Analytical Gaussian part of the potential, stored as the last element (index ng_fit_jast+1):
  ! exponent mu_erf^2, coefficient mu_erf * inv_sq_pi.
  ! inv_sq_pi comes from constants.include.F; presumably 1/sqrt(pi) -- TODO confirm.
- expo_gauss_eff_pot(n_max_fit_slat+1) = mu_erf * mu_erf
+  expo_gauss_eff_pot(ng_fit_jast+1) = mu_erf * mu_erf
- coef_gauss_eff_pot(n_max_fit_slat+1) =  1.d0 * mu_erf * inv_sq_pi
+  coef_gauss_eff_pot(ng_fit_jast+1) =  1.d0 * mu_erf * inv_sq_pi
 END_PROVIDER 
 ! ---
 double precision function eff_pot_gauss(x, mu)
- implicit none
+
  BEGIN_DOC
  ! Returns, for the given x (r12 in the formula) and mu,
  !
  ! V(mu,r12) = mu/\sqrt(pi) * exp(-(mu*r12)^2) - 0.25 * (1 - erf(mu*r12))^2
  !
  ! NOTE(review): this header previously gave the Gaussian prefactor as 1/(\sqrt(pi)mu);
  ! the assignment below uses mu/sqrt(pi). Confirm against the reference formula.
  END_DOC
  implicit none
  double precision, intent(in) :: x, mu
  ! pi is obtained as dacos(-1.d0)
  eff_pot_gauss =  mu/dsqrt(dacos(-1.d0)) * dexp(-mu*mu*x*x) - 0.25d0 * (1.d0 - derf(mu*x))**2.d0
 end
 ! -------------------------------------------------------------------------------------------------
 ! ---
@ -129,12 +149,15 @@ END_PROVIDER
 ! ---
 double precision function fit_1_erf_x(x)
- implicit none
+
 double precision, intent(in) :: x
  BEGIN_DOC
  ! fit_1_erf_x(x) = \sum_i c_i exp (-alpha_i x^2) \approx (1 - erf(mu*x))
  END_DOC
  implicit none
  integer :: i
  double precision, intent(in) :: x
  fit_1_erf_x = 0.d0
  do i = 1, n_max_fit_slat
    fit_1_erf_x += dexp(-expo_gauss_1_erf_x(i) *x*x) * coef_gauss_1_erf_x(i)
@ -209,6 +232,36 @@ end
      expo_gauss_1_erf_x_2(i) = tmp * expo_gauss_1_erf_x_2(i)
    enddo
  elseif(ng_fit_jast .eq. 7) then
    coef_gauss_1_erf_x_2 = (/ 0.0213619d0   , 0.03221511d0  , 0.29966689d0  , 0.19178934d0  , 0.06154732d0  , 0.28214555d0  , 0.11125985d0   /)
    expo_gauss_1_erf_x_2 = (/ 1.34727067d+04, 1.27166613d+03, 5.52584567d+00, 1.67753218d+01, 2.46145691d+02, 2.47971820d+00, 5.95141293d+01 /)
    tmp = mu_erf * mu_erf
    do i = 1, ng_fit_jast
      expo_gauss_1_erf_x_2(i) = tmp * expo_gauss_1_erf_x_2(i)
    enddo
  elseif(ng_fit_jast .eq. 8) then
    coef_gauss_1_erf_x_2 = (/ 0.28189124d0  , 0.19518669d0  , 0.12161735d0  , 0.24257438d0  , 0.07309656d0  , 0.042435d0    , 0.01926109d0  , 0.02393415d0   /)
    expo_gauss_1_erf_x_2 = (/ 4.69795903d+00, 1.21379451d+01, 3.55527053d+01, 2.39227172d+00, 1.14827721d+02, 4.16320213d+02, 1.52813587d+04, 1.78516557d+03 /)
    tmp = mu_erf * mu_erf
    do i = 1, ng_fit_jast
      expo_gauss_1_erf_x_2(i) = tmp * expo_gauss_1_erf_x_2(i)
    enddo
  !elseif(ng_fit_jast .eq. 9) then
  !  coef_gauss_1_erf_x_2 = (/  /)
  !  expo_gauss_1_erf_x_2 = (/  /)
  !  tmp = mu_erf * mu_erf
  !  do i = 1, ng_fit_jast
  !    expo_gauss_1_erf_x_2(i) = tmp * expo_gauss_1_erf_x_2(i)
  !  enddo
  elseif(ng_fit_jast .eq. 20) then
    ASSERT(n_max_fit_slat == 20)
--- a/src/bi_ort_ints/semi_num_ints_mo.irp.f
+++ b/src/bi_ort_ints/semi_num_ints_mo.irp.f
@ -107,14 +107,16 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_transp, (ao_num, ao_num, 3,
  integer          :: i, j, ipoint
  double precision :: wall0, wall1
  print *, ' providing int2_grad1_u12_ao_transp ...'
  call wall_time(wall0)
  if(test_cycle_tc)then
   do ipoint = 1, n_points_final_grid
     do i = 1, ao_num
       do j = 1, ao_num
-         int2_grad1_u12_ao_transp(j,i,1,ipoint) = int2_grad1_u12_ao_test(1,j,i,ipoint)
+         int2_grad1_u12_ao_transp(j,i,1,ipoint) = int2_grad1_u12_ao_test(j,i,ipoint,1)
-         int2_grad1_u12_ao_transp(j,i,2,ipoint) = int2_grad1_u12_ao_test(2,j,i,ipoint)
+         int2_grad1_u12_ao_transp(j,i,2,ipoint) = int2_grad1_u12_ao_test(j,i,ipoint,2)
-         int2_grad1_u12_ao_transp(j,i,3,ipoint) = int2_grad1_u12_ao_test(3,j,i,ipoint)
+         int2_grad1_u12_ao_transp(j,i,3,ipoint) = int2_grad1_u12_ao_test(j,i,ipoint,3)
       enddo
     enddo
   enddo
@ -122,9 +124,9 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_transp, (ao_num, ao_num, 3,
   do ipoint = 1, n_points_final_grid
     do i = 1, ao_num
       do j = 1, ao_num
-         int2_grad1_u12_ao_transp(j,i,1,ipoint) = int2_grad1_u12_ao(1,j,i,ipoint)
+         int2_grad1_u12_ao_transp(j,i,1,ipoint) = int2_grad1_u12_ao(j,i,ipoint,1)
-         int2_grad1_u12_ao_transp(j,i,2,ipoint) = int2_grad1_u12_ao(2,j,i,ipoint)
+         int2_grad1_u12_ao_transp(j,i,2,ipoint) = int2_grad1_u12_ao(j,i,ipoint,2)
-         int2_grad1_u12_ao_transp(j,i,3,ipoint) = int2_grad1_u12_ao(3,j,i,ipoint)
+         int2_grad1_u12_ao_transp(j,i,3,ipoint) = int2_grad1_u12_ao(j,i,ipoint,3)
       enddo
     enddo
   enddo
@ -192,9 +194,9 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_t, (n_points_final_grid, 3,
  do ipoint = 1, n_points_final_grid
    do i = 1, ao_num
      do j = 1, ao_num
-        int2_grad1_u12_ao_t(ipoint,1,j,i) = int2_grad1_u12_ao(1,j,i,ipoint)
+        int2_grad1_u12_ao_t(ipoint,1,j,i) = int2_grad1_u12_ao(j,i,ipoint,1)
-        int2_grad1_u12_ao_t(ipoint,2,j,i) = int2_grad1_u12_ao(2,j,i,ipoint)
+        int2_grad1_u12_ao_t(ipoint,2,j,i) = int2_grad1_u12_ao(j,i,ipoint,2)
-        int2_grad1_u12_ao_t(ipoint,3,j,i) = int2_grad1_u12_ao(3,j,i,ipoint)
+        int2_grad1_u12_ao_t(ipoint,3,j,i) = int2_grad1_u12_ao(j,i,ipoint,3)
      enddo                                  
    enddo
  enddo
@ -203,40 +205,6 @@ END_PROVIDER
 ! ---
 BEGIN_PROVIDER [ double precision, int2_grad1_u12_bimo, (3, mo_num, mo_num, n_points_final_grid)]
  BEGIN_DOC
  !
  ! int2_grad1_u12_bimo(:,k,i,ipoint) = \int dr2 [-1 * \grad_r1 J(r1,r2)] \chi_k(r2) \phi_i(r2) 
  !
  ! NOTE(review): as written, this provider prints 'Wrong !!' and stops before doing any work.
  !
  END_DOC
  implicit none
  integer :: ipoint
  ! Unconditional abort: everything after the stop statement is unreachable.
  print*,'Wrong !!'
  stop
  ! Unreachable: for each grid point, one ao_to_mo_bi_ortho call per first-index value 1, 2, 3.
  ! NOTE(review): int2_grad1_u12_ao is addressed here as (component,1,1,ipoint), whereas other
  ! lines in this file address it as (j,i,ipoint,component) -- possibly why this provider is
  ! flagged as wrong; confirm against the array's declaration.
 !$OMP PARALLEL         &
 !$OMP DEFAULT (NONE)   &
 !$OMP PRIVATE (ipoint) & 
 !$OMP SHARED (n_points_final_grid,int2_grad1_u12_ao,int2_grad1_u12_bimo)
 !$OMP DO SCHEDULE (dynamic)
  do ipoint = 1, n_points_final_grid
    call ao_to_mo_bi_ortho( int2_grad1_u12_ao  (1,1,1,ipoint), size(int2_grad1_u12_ao  , 2) &
                          , int2_grad1_u12_bimo(1,1,1,ipoint), size(int2_grad1_u12_bimo, 2) )
    call ao_to_mo_bi_ortho( int2_grad1_u12_ao  (2,1,1,ipoint), size(int2_grad1_u12_ao  , 2) &
                          , int2_grad1_u12_bimo(2,1,1,ipoint), size(int2_grad1_u12_bimo, 2) )
    call ao_to_mo_bi_ortho( int2_grad1_u12_ao  (3,1,1,ipoint), size(int2_grad1_u12_ao  , 2) &
                          , int2_grad1_u12_bimo(3,1,1,ipoint), size(int2_grad1_u12_bimo, 2) )
  enddo
 !$OMP END DO
 !$OMP END PARALLEL
 END_PROVIDER 
 ! ---
 BEGIN_PROVIDER [ double precision, mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp, (n_points_final_grid, 3, mo_num, mo_num)]
  implicit none
--- a/src/bi_ortho_mos/bi_density.irp.f
+++ b/src/bi_ortho_mos/bi_density.irp.f
@ -2,49 +2,68 @@
 ! ---
 BEGIN_PROVIDER [double precision, TCSCF_bi_ort_dm_ao_alpha, (ao_num, ao_num) ]
-  implicit none
+
  BEGIN_DOC
  ! TCSCF_bi_ort_dm_ao_alpha(i,j) = <Chi_0| a^dagger_i,alpha a_j,alpha |Phi_0> where i and j label AO basis functions.
  !
  ! This is the equivalent of the alpha density of the HF Slater determinant, but built from a pair of bi-orthonormal Slater determinants |Chi_0> and |Phi_0>.
  !
  ! Computed with a single dgemm call as mo_l_coef * transpose(mo_r_coef), with the summation running over the first elec_alpha_num columns of both matrices.
  END_DOC
  implicit none
  PROVIDE mo_l_coef mo_r_coef
  ! 'N','T' with inner dimension elec_alpha_num; beta = 0.d0, so the result array is overwritten.
  ! The commented-out continuation line inside the call is the same product with the
  ! left and right coefficient matrices swapped.
  call dgemm( 'N', 'T', ao_num, ao_num, elec_alpha_num, 1.d0               &
            , mo_l_coef, size(mo_l_coef, 1), mo_r_coef, size(mo_r_coef, 1) &
            !, mo_r_coef, size(mo_r_coef, 1), mo_l_coef, size(mo_l_coef, 1) &
            , 0.d0, TCSCF_bi_ort_dm_ao_alpha, size(TCSCF_bi_ort_dm_ao_alpha, 1) )
 END_PROVIDER
 ! ---
 BEGIN_PROVIDER [ double precision, TCSCF_bi_ort_dm_ao_beta, (ao_num, ao_num) ]
-  implicit none
+
  BEGIN_DOC
  ! TCSCF_bi_ort_dm_ao_beta(i,j) = <Chi_0| a^dagger_i,beta a_j,beta |Phi_0> where i and j label AO basis functions.
  !
  ! This is the equivalent of the beta density of the HF Slater determinant, but built from a pair of bi-orthonormal Slater determinants |Chi_0> and |Phi_0>.
  !
  ! Computed with a single dgemm call as mo_l_coef * transpose(mo_r_coef), with the summation running over the first elec_beta_num columns of both matrices.
  END_DOC
  implicit none
  PROVIDE mo_l_coef mo_r_coef
  ! 'N','T' with inner dimension elec_beta_num; beta = 0.d0, so the result array is overwritten.
  ! The commented-out continuation line inside the call is the same product with the
  ! left and right coefficient matrices swapped.
  call dgemm( 'N', 'T', ao_num, ao_num, elec_beta_num, 1.d0                &
            , mo_l_coef, size(mo_l_coef, 1), mo_r_coef, size(mo_r_coef, 1) &
            !, mo_r_coef, size(mo_r_coef, 1), mo_l_coef, size(mo_l_coef, 1) &
            , 0.d0, TCSCF_bi_ort_dm_ao_beta, size(TCSCF_bi_ort_dm_ao_beta, 1) )
 END_PROVIDER
 ! ---
 BEGIN_PROVIDER [ double precision, TCSCF_bi_ort_dm_ao, (ao_num, ao_num) ]
-  implicit none
+
  BEGIN_DOC
  ! TCSCF_bi_ort_dm_ao(i,j) = <Chi_0| a^dagger_i,beta+alpha a_j,beta+alpha |Phi_0> where i and j label AO basis functions.
  !
  ! This is the equivalent of the total electronic density of the HF Slater determinant, but built from a pair of bi-orthonormal Slater determinants |Chi_0> and |Phi_0>.
  !
  ! Obtained as the sum of TCSCF_bi_ort_dm_ao_alpha and TCSCF_bi_ort_dm_ao_beta; when the numbers of alpha and beta electrons are equal, the alpha matrix is simply added to itself.
  END_DOC
  implicit none
  PROVIDE mo_l_coef mo_r_coef
  ASSERT(size(TCSCF_bi_ort_dm_ao, 1) == size(TCSCF_bi_ort_dm_ao_alpha, 1))
  if(elec_alpha_num==elec_beta_num) then
    ! Equal electron counts: twice the alpha matrix; TCSCF_bi_ort_dm_ao_beta is not referenced in this branch.
    TCSCF_bi_ort_dm_ao = TCSCF_bi_ort_dm_ao_alpha + TCSCF_bi_ort_dm_ao_alpha
  else
    ! Different electron counts: explicit alpha + beta sum.
    ASSERT(size(TCSCF_bi_ort_dm_ao, 1) == size(TCSCF_bi_ort_dm_ao_beta, 1))
    TCSCF_bi_ort_dm_ao = TCSCF_bi_ort_dm_ao_alpha + TCSCF_bi_ort_dm_ao_beta
  endif
 END_PROVIDER
 ! ---
--- a/src/csf/cfgCI_utils.c
+++ b/src/csf/cfgCI_utils.c
@ -253,9 +253,9 @@ void generateAllBFs(int64_t Isomo, int64_t MS, Tree *bftree, int *NBF, int *NSOM
    buildTreeDriver(bftree, *NSOMO, MS, NBF);
 }
-void ortho_qr_csf(double *overlapMatrix, int lda, double *orthoMatrix, int rows, int cols);
+//void ortho_qr_csf(double *overlapMatrix, int lda, double *orthoMatrix, int rows, int cols);
 // QR to orthogonalize CSFs does not work
 //void gramSchmidt_qp(double *overlapMatrix, int rows, int cols, double *orthoMatrix){
 //  int i,j;
 //  //for(j=0;j<cols;++j){
--- a/src/csf/configuration_CI_sigma_helpers.irp.f
+++ b/src/csf/configuration_CI_sigma_helpers.irp.f
--- a/src/csf/conversion.irp.f
+++ b/src/csf/conversion.irp.f
@ -114,6 +114,7 @@ subroutine convertWFfromCSFtoDET(N_st,psi_coef_cfg_in, psi_coef_det)
  integer                        :: idx
  integer MS
  MS = elec_alpha_num-elec_beta_num
  !print *,"size=",size(tmp_psi_coef_det,1)," ",size(tmp_psi_coef_det,2)
  countcsf = 0
--- a/src/csf/obtain_I_foralpha.irp.f
+++ b/src/csf/obtain_I_foralpha.irp.f
@ -38,6 +38,7 @@ subroutine obtain_connected_J_givenI(idxI, givenI, connectedI, idxs_connectedI,
  integer :: holetype(mo_num)
  integer :: end_index
  integer :: Nsomo_I
  integer :: listall(N_int*bit_kind_size), nelall
  ! 
  ! 2 2 1 1 0 0 : 1 1 0 0 0 0
@ -65,9 +66,12 @@ subroutine obtain_connected_J_givenI(idxI, givenI, connectedI, idxs_connectedI,
  ! Since CFGs are sorted wrt to seniority
  ! we don't have to search the full CFG list
-  Isomo = givenI(1,1)
+  Nsomo_I = 0
-  Idomo = givenI(1,2)
+  do i=1,N_int
-  Nsomo_I = POPCNT(Isomo)
+    Isomo = givenI(i,1)
    Idomo = givenI(i,2)
    Nsomo_I += POPCNT(Isomo)
  end do
  end_index = min(N_configuration,cfg_seniority_index(min(Nsomo_I+6,elec_num))-1)
  if(end_index .LT. 0) end_index= N_configuration
  !end_index = N_configuration
@ -83,17 +87,24 @@ subroutine obtain_connected_J_givenI(idxI, givenI, connectedI, idxs_connectedI,
     !  idxs_connectedI(nconnectedI)=i
     !  cycle
     !endif
-     Isomo = givenI(1,1)
+
-     Idomo = givenI(1,2)
+     ndiffSOMO = 0
-     Jsomo = psi_configuration(1,1,i)
+     ndiffDOMO = 0
-     Jdomo = psi_configuration(1,2,i)
+     nxordiffSOMODOMO = 0
     do ii=1,N_int
       Isomo = givenI(ii,1)
       Idomo = givenI(ii,2)
       Jsomo = psi_configuration(ii,1,i)
       Jdomo = psi_configuration(ii,2,i)
       diffSOMO = IEOR(Isomo,Jsomo)
-     ndiffSOMO = POPCNT(diffSOMO)
+       ndiffSOMO += POPCNT(diffSOMO)
       diffDOMO = IEOR(Idomo,Jdomo)
       xordiffSOMODOMO = IEOR(diffSOMO,diffDOMO)
-     ndiffDOMO = POPCNT(diffDOMO)
+       ndiffDOMO += POPCNT(diffDOMO)
-     nxordiffSOMODOMO = POPCNT(xordiffSOMODOMO)
+       nxordiffSOMODOMO += POPCNT(xordiffSOMODOMO)
-     nxordiffSOMODOMO += ndiffSOMO + ndiffDOMO 
+       nxordiffSOMODOMO += POPCNT(diffSOMO) + POPCNT(diffDOMO)
     end do
     if((nxordiffSOMODOMO .EQ. 4) .AND. ndiffSOMO .EQ. 2) then
       !-------
       ! MONO |
@ -144,25 +155,45 @@ subroutine obtain_connected_J_givenI(idxI, givenI, connectedI, idxs_connectedI,
        ! find out all pq holes possible
        nholes = 0
        ! holes in SOMO
-        Isomo = psi_configuration(1,1,i)
+        !Isomo = psi_configuration(1,1,i)
-        Idomo = psi_configuration(1,2,i)
+        !Idomo = psi_configuration(1,2,i)
-        do iii = 1,n_act_orb
+        !do iii = 1,n_act_orb
-          ii = list_act(iii)
+        !  ii = list_act(iii)
-           if(POPCNT(IAND(Isomo,IBSET(0_8,ii-1))) .EQ. 1) then
+        !   if(POPCNT(IAND(Isomo,IBSET(0_8,ii-1))) .EQ. 1) then
        !      nholes += 1
        !      listholes(nholes) = ii
        !      holetype(nholes) = 1
        !   endif
        !end do
        call bitstring_to_list(psi_configuration(1,1,i),listall,nelall,N_int)
        do iii=1,nelall
          nholes += 1
-              listholes(nholes) = ii
+          listholes(nholes) = listall(iii)
          holetype(nholes) = 1
           endif
        end do
        ! holes in DOMO
-        do iii = 1,n_act_orb
+        !do iii = 1,n_act_orb
-          ii = list_act(iii)
+        !  ii = list_act(iii)
-           if(POPCNT(IAND(Idomo,IBSET(0_8,ii-1))) .EQ. 1) then
+        !   if(POPCNT(IAND(Idomo,IBSET(0_8,ii-1))) .EQ. 1) then
        !      nholes += 1
        !      listholes(nholes) = ii
        !      holetype(nholes) = 2
        !   endif
        !end do
        call bitstring_to_list(psi_configuration(1,2,i),listall,nelall,N_int)
        do iii=1,nelall
          if(listall(iii) .gt. n_core_orb)then
            nholes += 1
-              listholes(nholes) = ii
+            listholes(nholes) = listall(iii)
            holetype(nholes) = 2
          endif
        end do
        ntotalconnectedI += max(1,(psi_config_data(i,2)-psi_config_data(i,1)+1)*nholes)
     endif
  end do
@ -199,6 +230,8 @@ subroutine obtain_connected_I_foralpha(idxI, Ialpha, connectedI, idxs_connectedI
  integer*8                                :: Isomo
  integer*8                                :: Jdomo
  integer*8                                :: Jsomo
  integer(bit_kind)                        :: Jcfg(N_int,2)
  integer(bit_kind)                        :: Icfg(N_int,2)
  integer*8                                :: IJsomo
  integer*8                                :: diffSOMO
  integer*8                                :: diffDOMO
@ -209,132 +242,261 @@ subroutine obtain_connected_I_foralpha(idxI, Ialpha, connectedI, idxs_connectedI
  integer                        :: iii,ii,i,j,k,l,p,q,nsomoJ,nsomoalpha,starti,endi,extyp,nholes
  integer                        :: listholes(mo_num)
  integer                        :: holetype(mo_num)
-  integer                        :: end_index
+  integer                        :: end_index, ishift
-  integer                        :: Nsomo_alpha
+  integer                        :: Nsomo_alpha, pp,qq, nperm, iint, ipos
  integer*8                      :: MS
  integer                        :: exc(0:2,2,2), tz, m, n, high, low
  integer                        :: listall(N_int*bit_kind_size), nelall
  integer                        :: nconnectedExtradiag, nconnectedDiag
  integer(bit_kind)              :: hole, particle, tmp
  MS = elec_alpha_num-elec_beta_num
  nconnectedExtradiag=0
  nconnectedDiag=0
  nconnectedI = 0
  end_index = N_configuration
  ! Since CFGs are sorted wrt to seniority
  ! we don't have to search the full CFG list
-  Isomo = Ialpha(1,1)
+  !Isomo = Ialpha(1,1)
-  Idomo = Ialpha(1,2)
+  !Idomo = Ialpha(1,2)
-  Nsomo_alpha = POPCNT(Isomo)
+  !Nsomo_alpha = POPCNT(Isomo)
  Icfg = Ialpha
  Nsomo_alpha = 0
  !print *," Ialpha="
  do ii=1,N_int
    Isomo = Ialpha(ii,1)
    Idomo = Ialpha(ii,2)
    Nsomo_alpha += POPCNT(Isomo)
    !print *,Isomo, Idomo, "Nsomo=",Nsomo_alpha
  end do
  end_index = min(N_configuration,cfg_seniority_index(min(Nsomo_alpha+4,elec_num))-1)
-  if(end_index .LT. 0) end_index= N_configuration
+  if(end_index .LT. 0 .OR. end_index .lt. idxI) end_index= N_configuration
  end_index = N_configuration
  p = 0
  q = 0
-  if (N_int > 1) stop 'obtain_connected_i_foralpha : N_int > 1'
+  !if (N_int > 1) stop 'obtain_connected_i_foralpha : N_int > 1'
  do i=idxI,end_index
     Isomo = Ialpha(1,1)
     Idomo = Ialpha(1,2)
     Jsomo = psi_configuration(1,1,i)
     Jdomo = psi_configuration(1,2,i)
     ! Check for Minimal alpha electrons (MS)
-     if(POPCNT(Isomo).lt.MS)then
+     if(Nsomo_alpha .lt. MS)then
       cycle
     endif
     ndiffSOMO = 0
     ndiffDOMO = 0
     nxordiffSOMODOMO = 0
     nsomoJ=0
     nsomoalpha=0
     do ii=1,N_int
       Isomo = Ialpha(ii,1)
       Idomo = Ialpha(ii,2)
       Jsomo = psi_configuration(ii,1,i)
       Jdomo = psi_configuration(ii,2,i)
       nsomoJ += POPCNT(Jsomo)
       nsomoalpha += POPCNT(Isomo)
       diffSOMO = IEOR(Isomo,Jsomo)
-     ndiffSOMO = POPCNT(diffSOMO)
+       ndiffSOMO += POPCNT(diffSOMO)
     !if(idxI.eq.1)then
     !  print *," \t idxI=",i," diffS=",ndiffSOMO," popJs=", POPCNT(Jsomo)," popIs=",POPCNT(Isomo)
     !endif
       diffDOMO = IEOR(Idomo,Jdomo)
       xordiffSOMODOMO = IEOR(diffSOMO,diffDOMO)
-     ndiffDOMO = POPCNT(diffDOMO)
+       ndiffDOMO += POPCNT(diffDOMO)
-     nxordiffSOMODOMO = POPCNT(xordiffSOMODOMO)
+       nxordiffSOMODOMO += POPCNT(xordiffSOMODOMO)
-     nxordiffSOMODOMO += ndiffSOMO + ndiffDOMO 
+       nxordiffSOMODOMO += POPCNT(diffSOMO) + POPCNT(diffDOMO)
     end do
     !if(idxI.eq.218)then
     !  print *,"I=",idxI,"Nsomo_alpha=",Nsomo_alpha,"nxordiffSOMODOMO(4)=",nxordiffSOMODOMO, " ndiffSOMO(2)=",ndiffSOMO, " ndiffDOMO=",ndiffDOMO
     !endif
     !Jcfg = psi_configuration(:,:,i)
     !print *,"nxordiffSOMODOMO(4)=",nxordiffSOMODOMO, " ndiffSOMO(2)=",ndiffSOMO
     if((nxordiffSOMODOMO .EQ. 4) .AND. ndiffSOMO .EQ. 2) then
        select case(ndiffDOMO)
        case (0)
           ! SOMO -> VMO
           !print *,"obt SOMO -> VMO"
           extyp = 3
           !if(N_int .eq. 1) then
           !  IJsomo = IEOR(Isomo, Jsomo)
           !  p = TRAILZ(IAND(Isomo,IJsomo)) + 1
           !  IJsomo = IBCLR(IJsomo,p-1)
           !  q = TRAILZ(IJsomo) + 1
           !  !print *," p=",p," q=",q
           !  !call get_single_excitation_cfg(Jcfg, Icfg, p, q, N_int)
           !else
             ! Find p
             do ii=1,N_int
               Isomo = Ialpha(ii,1)
               Jsomo = psi_configuration(ii,1,i)
               IJsomo = IEOR(Isomo, Jsomo)
-!IRP_IF WITHOUT_TRAILZ
+               if(popcnt(IAND(Isomo,IJsomo)) > 0)then
-!           p = (popcnt(ieor( IAND(Isomo,IJsomo) , IAND(Isomo,IJsomo) -1))-1) + 1
+                 p = TRAILZ(IAND(Isomo,IJsomo)) + 1 + (ii-1) * bit_kind_size
-!IRP_ELSE
+                 EXIT
-           p = TRAILZ(IAND(Isomo,IJsomo)) + 1
+               endif
-!IRP_ENDIF
+             end do
-           IJsomo = IBCLR(IJsomo,p-1)
+             ! Find q
-!IRP_IF WITHOUT_TRAILZ
+             do ii=1,N_int
-!           q = (popcnt(ieor(IJsomo,IJsomo-1))-1) + 1
+               Isomo = Ialpha(ii,1)
-!IRP_ELSE
+               Jsomo = psi_configuration(ii,1,i)
-           q = TRAILZ(IJsomo) + 1
+               IJsomo = IEOR(Isomo, Jsomo)
-!IRP_ENDIF
+               iint = shiftr(p-1,bit_kind_shift) + 1
               ipos = p-shiftl((iint-1),bit_kind_shift)
               if(iint .eq. ii)then
                 IJsomo = IBCLR(IJsomo,ipos-1)
               endif
               if(popcnt(IJsomo) > 0)then
                 q = TRAILZ(IJsomo) + 1 + (ii-1) * bit_kind_size
                 EXIT
               endif
             enddo
           !endif
           !assert ( p == pp)
           !assert ( q == qq)
           !print *," 1--- p=",p," q=",q
        case (1)
           ! DOMO -> VMO
           ! or
           ! SOMO -> SOMO
           nsomoJ = POPCNT(Jsomo)
           nsomoalpha = POPCNT(Isomo)
           if(nsomoJ .GT. nsomoalpha) then
              ! DOMO -> VMO
              !print *,"obt DOMO -> VMO"
              extyp = 2
-!IRP_IF WITHOUT_TRAILZ
+              !if(N_int.eq.1)then
-!              p = (popcnt(ieor( IEOR(Idomo,Jdomo),IEOR(Idomo,Jdomo) -1))-1) + 1
+              !  p = TRAILZ(IEOR(Idomo,Jdomo)) + 1
-!IRP_ELSE
+              !  Isomo = IEOR(Isomo, Jsomo)
-              p = TRAILZ(IEOR(Idomo,Jdomo)) + 1
+              !  Isomo = IBCLR(Isomo,p-1)
-!IRP_ENDIF
+              !  q = TRAILZ(Isomo) + 1
-              Isomo = IEOR(Isomo, Jsomo)
+              !else
-              Isomo = IBCLR(Isomo,p-1)
+
-!IRP_IF WITHOUT_TRAILZ
+                ! Find p
-!              q = (popcnt(ieor(Isomo,Isomo-1))-1) + 1
+                do ii=1,N_int
-!IRP_ELSE
+                  Isomo = Ialpha(ii,1)
-              q = TRAILZ(Isomo) + 1
+                  Jsomo = psi_configuration(ii,1,i)
-!IRP_ENDIF
+                  Idomo = Ialpha(ii,2)
                  Jdomo = psi_configuration(ii,2,i)
                  if(popcnt(IEOR(Idomo,Jdomo)) > 0)then
                    p = TRAILZ(IEOR(Idomo,Jdomo)) + 1 + (ii-1) * bit_kind_size
                    EXIT
                  endif
                end do
                ! Find q
                do ii=1,N_int
                  Isomo = Ialpha(ii,1)
                  Jsomo = psi_configuration(ii,1,i)
                  IJsomo = IEOR(Isomo, Jsomo)
                  iint = shiftr(p-1,bit_kind_shift) + 1
                  ipos = p-shiftl((iint-1),bit_kind_shift)
                  if(iint .eq. ii)then
                    IJsomo = IBCLR(IJsomo,ipos-1)
                  endif
                  if(popcnt(IJsomo) > 0)then
                    q = TRAILZ(IJsomo) + 1 + (ii-1) * bit_kind_size
                    EXIT
                  endif
                end do
              !endif
           !assert ( p == pp)
           !assert ( q == qq)
           else
              ! SOMO -> SOMO
              !print *,"obt SOMO -> SOMO"
              extyp = 1
-!IRP_IF WITHOUT_TRAILZ
+              !if(N_int.eq.1)then
-!              q = (popcnt(ieor( IEOR(Idomo,Jdomo), IEOR(Idomo,Jdomo)-1))-1) + 1
+              !  q = TRAILZ(IEOR(Idomo,Jdomo)) + 1
-!IRP_ELSE
+              !  Isomo = IEOR(Isomo, Jsomo)
-              q = TRAILZ(IEOR(Idomo,Jdomo)) + 1
+              !  Isomo = IBCLR(Isomo,q-1)
-!IRP_ENDIF
+              !  p = TRAILZ(Isomo) + 1
-              Isomo = IEOR(Isomo, Jsomo)
+              !  ! Check for Minimal alpha electrons (MS)
-              Isomo = IBCLR(Isomo,q-1)
+              !  !if(POPCNT(Isomo).lt.MS)then
-!IRP_IF WITHOUT_TRAILZ
+              !  !  cycle
-!              p = (popcnt(ieor(Isomo,Isomo-1))-1) + 1
+              !  !endif
-!IRP_ELSE
+              !else
-              p = TRAILZ(Isomo) + 1
+                ! Find q (first orbital whose DOMO occupation differs between Ialpha and J)
-!IRP_ENDIF
+                !print *,"Ialpha somo=",Ialpha(1,1), Ialpha(2,1)," Ialpha domo=",Ialpha(1,2), Ialpha(2,2)
-              ! Check for Minimal alpha electrons (MS)
+                !print *,"J somo=",psi_configuration(1,1,i), psi_configuration(2,1,i)," J domo=",psi_configuration(1,2,i),&
-              !if(POPCNT(Isomo).lt.MS)then
+                !psi_configuration(2,2,i)
-              !  cycle
+                do ii=1,N_int
-              !endif
+                  Isomo = Ialpha(ii,1)
                  Jsomo = psi_configuration(ii,1,i)
                  Idomo = Ialpha(ii,2)
                  Jdomo = psi_configuration(ii,2,i)
                  if(popcnt(IEOR(Idomo,Jdomo)) > 0)then
                    q = TRAILZ(IEOR(Idomo,Jdomo)) + 1 + (ii-1) * bit_kind_size
                    EXIT
                  endif
                enddo
                ! Find p (first differing SOMO bit, with the bit at position q cleared)
                do ii=1,N_int
                  Isomo = Ialpha(ii,1)
                  Jsomo = psi_configuration(ii,1,i)
                  IJsomo = IEOR(Isomo, Jsomo)
                  iint = shiftr(q-1,bit_kind_shift) + 1
                  ipos = q-shiftl((iint-1),bit_kind_shift)
                  if(iint .eq. ii)then
                    IJsomo = IBCLR(IJsomo,ipos-1)
                  endif
                  !print *,"ii=",ii," Isomo=",Isomo
                  if(popcnt(IJsomo) > 0)then
                    p = TRAILZ(IJsomo) + 1 + (ii-1) * bit_kind_size
                    EXIT
                  endif
                enddo
              !endif
           !assert ( p == pp)
           !assert ( q == qq)
           endif
           !print *," 2--- p=",p," q=",q
        case (2)
           ! DOMO -> SOMO
           !print *,"obt DOMO -> SOMO"
           extyp = 4
           !if(N_int.eq.1)then
           !  IJsomo = IEOR(Isomo, Jsomo)
           !  p = TRAILZ(IAND(Jsomo,IJsomo)) + 1
           !  IJsomo = IBCLR(IJsomo,p-1)
           !  q = TRAILZ(IJsomo) + 1
           !else
             ! Find p
             do ii=1,N_int
               Isomo = Ialpha(ii,1)
               Jsomo = psi_configuration(ii,1,i)
               Idomo = Ialpha(ii,2)
               Jdomo = psi_configuration(ii,2,i)
               IJsomo = IEOR(Isomo, Jsomo)
-!IRP_IF WITHOUT_TRAILZ
+               if(popcnt(IAND(Jsomo,IJsomo)) > 0)then
-!           p = (popcnt(ieor( IAND(Jsomo,IJsomo), IAND(Jsomo,IJsomo)-1))-1) + 1
+                 p = TRAILZ(IAND(Jsomo,IJsomo)) + 1 + (ii-1) * bit_kind_size
-!IRP_ELSE
+                 EXIT
-           p = TRAILZ(IAND(Jsomo,IJsomo)) + 1
+               endif
-!IRP_ENDIF
+             enddo
-           IJsomo = IBCLR(IJsomo,p-1)
+             ! Find q
-!IRP_IF WITHOUT_TRAILZ
+             do ii=1,N_int
-!           q = (popcnt(ieor( IJsomo , IJsomo -1))-1) + 1
+               Isomo = Ialpha(ii,1)
-!IRP_ELSE
+               Jsomo = psi_configuration(ii,1,i)
-           q = TRAILZ(IJsomo) + 1
+               IJsomo = IEOR(Isomo, Jsomo)
-!IRP_ENDIF
+               iint = shiftr(p-1,bit_kind_shift) + 1
               ipos = p-shiftl((iint-1),bit_kind_shift)
               if(iint .eq. ii)then
                 IJsomo = IBCLR(IJsomo,ipos-1)
               endif
               if(popcnt(IJsomo) > 0)then
                 q = TRAILZ(IJsomo) + 1 + (ii-1) * bit_kind_size
                 EXIT
               endif
             enddo
           !endif
           !assert ( p == pp)
           !assert ( q == qq)
           !print *," 3--- p=",p," q=",q
        case default
           print *,"something went wront in get connectedI"
        end select
        starti = psi_config_data(i,1)
        endi   = psi_config_data(i,2)
        nconnectedExtradiag+=1
        nconnectedI += 1
-        do k=1,N_int
+        do ii=1,N_int
-          connectedI(k,1,nconnectedI) = psi_configuration(k,1,i)
+          connectedI(ii,1,nconnectedI) = psi_configuration(ii,1,i)
-          connectedI(k,2,nconnectedI) = psi_configuration(k,2,i)
+          connectedI(ii,2,nconnectedI) = psi_configuration(ii,2,i)
        enddo
        idxs_connectedI(nconnectedI)=starti
        excitationIds(1,nconnectedI)=p
@ -343,28 +505,51 @@ subroutine obtain_connected_I_foralpha(idxI, Ialpha, connectedI, idxs_connectedI
        diagfactors(nconnectedI) = 1.0d0
     else if((ndiffSOMO + ndiffDOMO) .EQ. 0) then
        ! find out all pq holes possible
        !print *,"I = ",i
        !print *,"I somo= ",psi_configuration(1,1,i), " domo=", psi_configuration(1,2,i)
        !print *,"alp somo= ",Ialpha(1,1), " domo=", Ialpha(1,2)
        nholes = 0
        ! holes in SOMO
-        Isomo = psi_configuration(1,1,i)
+        !Isomo = psi_configuration(1,1,i)
-        Idomo = psi_configuration(1,2,i)
+        !Idomo = psi_configuration(1,2,i)
-        do iii = 1,n_act_orb
+        !do iii = 1,n_act_orb
-          ii = list_act(iii)
+        !  ii = list_act(iii)
-           if(POPCNT(IAND(Isomo,IBSET(0_8,ii-1))) .EQ. 1) then
+        !   if(POPCNT(IAND(Isomo,IBSET(0_8,ii-1))) .EQ. 1) then
        !      nholes += 1
        !      listholes(nholes) = ii
        !      holetype(nholes) = 1
        !   endif
        !end do
        call bitstring_to_list(psi_configuration(1,1,i),listall,nelall,N_int)
        do iii=1,nelall
          nholes += 1
-              listholes(nholes) = ii
+          listholes(nholes) = listall(iii)
          holetype(nholes) = 1
           endif
        end do
        ! holes in DOMO
-        do iii = 1,n_act_orb
+        !do iii = 1,n_act_orb
-          ii = list_act(iii)
+        !  ii = list_act(iii)
-           if(POPCNT(IAND(Idomo,IBSET(0_8,ii-1))) .EQ. 1) then
+        !   if(POPCNT(IAND(Idomo,IBSET(0_8,ii-1))) .EQ. 1) then
        !      nholes += 1
        !      listholes(nholes) = ii
        !      holetype(nholes) = 2
        !   endif
        !end do
        nelall=0
        listall=0
        call bitstring_to_list(psi_configuration(1,2,i),listall,nelall,N_int)
        do iii=1,nelall
          if(listall(iii) .gt. n_core_orb)then
            nholes += 1
-              listholes(nholes) = ii
+            listholes(nholes) = listall(iii)
            holetype(nholes) = 2
          endif
        end do
        do k=1,nholes
           p = listholes(k)
           q = p
@ -372,6 +557,7 @@ subroutine obtain_connected_I_foralpha(idxI, Ialpha, connectedI, idxs_connectedI
           if(holetype(k) .EQ. 1) then
              starti = psi_config_data(i,1)
              endi   = psi_config_data(i,2)
              nconnectedDiag+=1
              nconnectedI += 1
              connectedI(:,:,nconnectedI) = psi_configuration(:,:,i)
              idxs_connectedI(nconnectedI)=starti
@ -382,6 +568,7 @@ subroutine obtain_connected_I_foralpha(idxI, Ialpha, connectedI, idxs_connectedI
           else
              starti = psi_config_data(i,1)
              endi   = psi_config_data(i,2)
              nconnectedDiag+=1
              nconnectedI += 1
              connectedI(:,:,nconnectedI) = psi_configuration(:,:,i)
              idxs_connectedI(nconnectedI)=starti
@ -390,8 +577,10 @@ subroutine obtain_connected_I_foralpha(idxI, Ialpha, connectedI, idxs_connectedI
              excitationTypes(nconnectedI) = extyp
              diagfactors(nconnectedI) = 2.0d0
           endif
           !print *,excitationIds(1,nconnectedI), excitationIds(2,nconnectedI)
        enddo
     endif
  end do
  !print *,"nconnectedExtradiag=",nconnectedExtradiag," nconnectedDiad=",nconnectedDiag
 end subroutine obtain_connected_I_foralpha
--- a/src/csf/sigma_vector.irp.f
+++ b/src/csf/sigma_vector.irp.f
@ -146,7 +146,6 @@
    ncfgprev = cfg_seniority_index(i+2)
  end do
  !print *," ^^^^^ N_CSF = ",n_CSF," N_CFG=",N_configuration
 END_PROVIDER
@ -832,7 +831,7 @@ subroutine calculate_preconditioner_cfg(diag_energies)
  ! the configurations in psi_configuration
  ! returns : diag_energies :
  END_DOC
-  integer :: i,j,k,kk,l,p,q,noccp,noccq, ii, jj
+  integer :: i,j,k,kk,l,p,q,noccp,noccq, ii, jj, iii
  real*8,intent(out) :: diag_energies(n_CSF)
  integer                            :: nholes
  integer                            :: nvmos
@ -858,8 +857,8 @@ subroutine calculate_preconditioner_cfg(diag_energies)
  real*8, external  :: mo_two_e_integral
  real*8            :: hpp
  real*8            :: meCC
  real*8            :: ecore
  real*8            :: core_act_contrib
  integer                        :: listall(N_int*bit_kind_size), nelall
  !PROVIDE h_core_ri
  PROVIDE core_fock_operator
@ -869,7 +868,6 @@ subroutine calculate_preconditioner_cfg(diag_energies)
  !print *,"Core energy=",core_energy," nucler rep=",nuclear_repulsion, " n_core_orb=",n_core_orb," n_act_orb=",n_act_orb," mo_num=",mo_num
  ! calculate core energy
  !call get_core_energy(ecore)
  diag_energies = core_energy - nuclear_repulsion
  ! calculate the core energy
@ -877,11 +875,11 @@ subroutine calculate_preconditioner_cfg(diag_energies)
  do i=1,N_configuration
-     Isomo = psi_configuration(1,1,i)
+     !Isomo = psi_configuration(1,1,i)
-     Idomo = psi_configuration(1,2,i)
+     !Idomo = psi_configuration(1,2,i)
-     Icfg(1,1) = psi_configuration(1,1,i)
+     !Icfg(1,1) = psi_configuration(1,1,i)
-     Icfg(1,2) = psi_configuration(1,2,i)
+     !Icfg(1,2) = psi_configuration(1,2,i)
-     NSOMOI = getNSOMO(psi_configuration(:,:,i))
+     !NSOMOI = getNSOMO(psi_configuration(:,:,i))
     starti = psi_config_data(i,1)
     endi   = psi_config_data(i,2)
@ -890,48 +888,63 @@ subroutine calculate_preconditioner_cfg(diag_energies)
     ! find out all pq holes possible
     nholes = 0
     listholes = -1
     ! holes in SOMO
-     !do k = 1,mo_num
+     !do kk = 1,n_act_orb
-     do kk = 1,n_act_orb
+     !  k = list_act(kk)
-       k = list_act(kk)
+     !   if(POPCNT(IAND(Isomo,IBSET(0_8,k-1))) .EQ. 1) then
-        if(POPCNT(IAND(Isomo,IBSET(0_8,k-1))) .EQ. 1) then
+     !      nholes += 1
     !      listholes(nholes) = k
     !      holetype(nholes) = 1
     !   endif
     !enddo
     call bitstring_to_list(psi_configuration(1,1,i),listall,nelall,N_int)
     do iii=1,nelall
       nholes += 1
-           listholes(nholes) = k
+       listholes(nholes) = listall(iii)
       holetype(nholes) = 1
        endif
     end do
     ! holes in DOMO
-     !do k = n_core_orb+1,n_core_orb + n_act_orb
+     !do kk = 1,n_act_orb
-     !do k = 1+n_core_inact_orb,n_core_orb+n_core_inact_act_orb
+     !  k = list_act(kk)
-     !do k = 1,mo_num
+     !   if(POPCNT(IAND(Idomo,IBSET(0_8,k-1))) .EQ. 1) then
-     do kk = 1,n_act_orb
+     !      nholes += 1
-       k = list_act(kk)
+     !      listholes(nholes) = k
-        if(POPCNT(IAND(Idomo,IBSET(0_8,k-1))) .EQ. 1) then
+     !      holetype(nholes) = 2
     !   endif
     !enddo
     call bitstring_to_list(psi_configuration(1,2,i),listall,nelall,N_int)
     do iii=1,nelall
       if(listall(iii) .gt. n_core_orb)then
         nholes += 1
-           listholes(nholes) = k
+         listholes(nholes) = listall(iii)
         holetype(nholes) = 2
       endif
     end do
-     ! find vmos
+
-     listvmos = -1
+     !!! find vmos
-     vmotype = -1
+     !!listvmos = -1
-     nvmos = 0
+     !!vmotype = -1
-     !do k = n_core_orb+1,n_core_orb + n_act_orb
+     !!nvmos = 0
-     !do k = 1,mo_num
+     !!!do k = n_core_orb+1,n_core_orb + n_act_orb
-     do kk = 1,n_act_orb
+     !!!do k = 1,mo_num
-       k = list_act(kk)
+     !!do kk = 1,n_act_orb
-        !print *,i,IBSET(0,i-1),POPCNT(IAND(Isomo,(IBSET(0_8,i-1)))), POPCNT(IAND(Idomo,(IBSET(0_8,i-1))))
+     !!  k = list_act(kk)
-        if(POPCNT(IAND(Isomo,(IBSET(0_8,k-1)))) .EQ. 0 .AND. POPCNT(IAND(Idomo,(IBSET(0_8,k-1)))) .EQ. 0) then
+     !!   !print *,i,IBSET(0,i-1),POPCNT(IAND(Isomo,(IBSET(0_8,i-1)))), POPCNT(IAND(Idomo,(IBSET(0_8,i-1))))
-           nvmos += 1
+     !!   if(POPCNT(IAND(Isomo,(IBSET(0_8,k-1)))) .EQ. 0 .AND. POPCNT(IAND(Idomo,(IBSET(0_8,k-1)))) .EQ. 0) then
-           listvmos(nvmos) = k
+     !!      nvmos += 1
-           vmotype(nvmos) = 0
+     !!      listvmos(nvmos) = k
-        else if(POPCNT(IAND(Isomo,(IBSET(0_8,k-1)))) .EQ. 1 .AND. POPCNT(IAND(Idomo,(IBSET(0_8,k-1)))) .EQ. 0 ) then
+     !!      vmotype(nvmos) = 0
-           nvmos += 1
+     !!   else if(POPCNT(IAND(Isomo,(IBSET(0_8,k-1)))) .EQ. 1 .AND. POPCNT(IAND(Idomo,(IBSET(0_8,k-1)))) .EQ. 0 ) then
-           listvmos(nvmos) = k
+     !!      nvmos += 1
-           vmotype(nvmos) = 1
+     !!      listvmos(nvmos) = k
-        end if
+     !!      vmotype(nvmos) = 1
-     enddo
+     !!   end if
     !!enddo
     !print *,"I=",i
     !call debug_spindet(psi_configuration(1,1,i),N_int)
     !call debug_spindet(psi_configuration(1,2,i),N_int)
@ -1221,27 +1234,30 @@ subroutine convertOrbIdsToModelSpaceIds(Ialpha, Jcfg, p, q, extype, pmodel, qmod
  integer,intent(in)             :: p,q
  integer,intent(in)             :: extype
  integer,intent(out)            :: pmodel,qmodel
-  !integer(bit_kind)              :: Isomo(N_int)
+  integer(bit_kind)              :: Isomo(N_int)
-  !integer(bit_kind)              :: Idomo(N_int)
+  integer(bit_kind)              :: Idomo(N_int)
-  !integer(bit_kind)              :: Jsomo(N_int)
+  integer(bit_kind)              :: Jsomo(N_int)
-  !integer(bit_kind)              :: Jdomo(N_int)
+  integer(bit_kind)              :: Jdomo(N_int)
-  integer*8                       :: Isomo
+  !integer*8                       :: Isomo
-  integer*8                       :: Idomo
+  !integer*8                       :: Idomo
-  integer*8                       :: Jsomo
+  !integer*8                       :: Jsomo
-  integer*8                       :: Jdomo
+  !integer*8                       :: Jdomo
  integer*8                      :: mask
-  integer                        :: iint, ipos
+  integer                        :: iint, ipos, ii
  !integer(bit_kind)              :: Isomotmp(N_int)
  !integer(bit_kind)              :: Jsomotmp(N_int)
  integer*8             :: Isomotmp
  integer*8             :: Jsomotmp
  integer                        :: pos0,pos0prev
  integer                        :: tmpp, tmpq
  ! TODO Flag (print) when model space indices is > 64
-  Isomo = Ialpha(1,1)
+  do ii=1,N_int
-  Idomo = Ialpha(1,2)
+    Isomo(ii) = Ialpha(ii,1)
-  Jsomo = Jcfg(1,1)
+    Idomo(ii) = Ialpha(ii,2)
-  Jdomo = Jcfg(1,2)
+    Jsomo(ii) = Jcfg(ii,1)
    Jdomo(ii) = Jcfg(ii,2)
  end do
  pos0prev = 0
  pmodel = p
  qmodel = q
@ -1255,40 +1271,155 @@ subroutine convertOrbIdsToModelSpaceIds(Ialpha, Jcfg, p, q, extype, pmodel, qmod
          ! SOMO -> SOMO
          ! remove all domos
          !print *,"type -> SOMO -> SOMO"
-          mask = ISHFT(1_8,p) - 1
+          !mask = ISHFT(1_8,p) - 1
-          Isomotmp = IAND(Isomo,mask)
+          !Isomotmp = IAND(Isomo,mask)
-          pmodel = POPCNT(mask) - POPCNT(XOR(Isomotmp,mask))
+          !pmodel = POPCNT(mask) - POPCNT(XOR(Isomotmp,mask))
-          mask = ISHFT(1_8,q) - 1
+          !mask = ISHFT(1_8,q) - 1
-          Isomotmp = IAND(Isomo,mask)
+          !Isomotmp = IAND(Isomo,mask)
-          qmodel = POPCNT(mask) - POPCNT(XOR(Isomotmp,mask))
+          !qmodel = POPCNT(mask) - POPCNT(XOR(Isomotmp,mask))
          iint = shiftr(p-1,bit_kind_shift) + 1
          ipos = p-shiftl((iint-1),bit_kind_shift)-1
          tmpp = 0
          !print *,"iint=",iint, " p=",p
          do ii=1,iint-1
            !mask = ISHFT(1_bit_kind,-1)-1_bit_kind
            !Isomotmp = IAND(Isomo(ii),mask)
            !tmpp += POPCNT(mask) - POPCNT(XOR(Isomotmp,mask))
            tmpp += POPCNT(Isomo(ii))
          end do
          mask = ISHFT(1_bit_kind,ipos+1) - 1
          Isomotmp = IAND(Isomo(iint),mask)
          !pmodel = tmpp + POPCNT(mask) - POPCNT(XOR(Isomotmp,mask))
          pmodel = tmpp + POPCNT(Isomotmp)
          !print *,"iint=",iint, " ipos=",ipos,"pmodel=",pmodel, XOR(Isomotmp,mask),Isomo(iint)
          iint = shiftr(q-1,bit_kind_shift) + 1
          ipos = q-shiftl((iint-1),bit_kind_shift)-1
          tmpq = 0
          do ii=1,iint-1
            !mask = ISHFT(1_bit_kind,-1)-1_bit_kind
            !Isomotmp = IAND(Isomo(ii),mask)
            !tmpq += POPCNT(mask) - POPCNT(XOR(Isomotmp,mask))
            tmpq += POPCNT(Isomo(ii))
          end do
          mask = ISHFT(1_bit_kind,ipos+1) - 1
          Isomotmp = IAND(Isomo(iint),mask)
          !qmodel = tmpq + POPCNT(mask) - POPCNT(XOR(Isomotmp,mask))
          qmodel = tmpq + POPCNT(Isomotmp)
          !print *,"iint=",iint, " ipos=",ipos,"qmodel=",qmodel
       case (2)
          ! DOMO -> VMO
          ! remove all domos except one at p
          !print *,"type -> DOMO -> VMO"
-          mask = ISHFT(1_8,p) - 1
+          !mask = ISHFT(1_8,p) - 1
-          Jsomotmp = IAND(Jsomo,mask)
+          !Jsomotmp = IAND(Jsomo,mask)
-          pmodel = POPCNT(mask) - POPCNT(XOR(Jsomotmp,mask))
+          !pmodel = POPCNT(mask) - POPCNT(XOR(Jsomotmp,mask))
-          mask = ISHFT(1_8,q) - 1
+          !mask = ISHFT(1_8,q) - 1
-          Jsomotmp = IAND(Jsomo,mask)
+          !Jsomotmp = IAND(Jsomo,mask)
-          qmodel = POPCNT(mask) - POPCNT(XOR(Jsomotmp,mask))
+          !qmodel = POPCNT(mask) - POPCNT(XOR(Jsomotmp,mask))
          iint = shiftr(p-1,bit_kind_shift) + 1
          ipos = p-shiftl((iint-1),bit_kind_shift)-1
          tmpp = 0
          do ii=1,iint-1
            !mask = ISHFT(1_bit_kind,-1)-1_bit_kind
            !Jsomotmp = IAND(Jsomo(ii),mask)
            !tmpp += POPCNT(mask) - POPCNT(XOR(Jsomotmp,mask))
            tmpp += POPCNT(Jsomo(ii))
          end do
          mask = ISHFT(1_bit_kind,ipos+1) - 1
          Jsomotmp = IAND(Jsomo(iint),mask)
          !pmodel = tmpp + POPCNT(mask) - POPCNT(XOR(Jsomotmp,mask))
          pmodel = tmpp + POPCNT(Jsomotmp)
          iint = shiftr(q-1,bit_kind_shift) + 1
          ipos = q-shiftl((iint-1),bit_kind_shift)-1
          tmpq = 0
          do ii=1,iint-1
            !mask = ISHFT(1_bit_kind,-1)-1_bit_kind
            !Jsomotmp = IAND(Jsomo(ii),mask)
            !tmpq += POPCNT(mask) - POPCNT(XOR(Jsomotmp,mask))
            tmpq += POPCNT(Jsomo(ii))
          end do
          mask = ISHFT(1_bit_kind,ipos+1) - 1
          Jsomotmp = IAND(Jsomo(iint),mask)
          !qmodel = tmpq + POPCNT(mask) - POPCNT(XOR(Jsomotmp,mask))
          qmodel = tmpq + POPCNT(Jsomotmp)
       case (3)
          ! SOMO -> VMO
          !print *,"type -> SOMO -> VMO"
          !Isomo = IEOR(Isomo,Jsomo)
          if(p.LT.q) then
-             mask = ISHFT(1_8,p) - 1
+             !mask = ISHFT(1_8,p) - 1
-             Isomo = IAND(Isomo,mask)
+             !Isomo = IAND(Isomo,mask)
-             pmodel = POPCNT(mask) - POPCNT(XOR(Isomo,mask))
+             !pmodel = POPCNT(mask) - POPCNT(XOR(Isomo,mask))
-             mask = ISHFT(1_8,q) - 1
+             !mask = ISHFT(1_8,q) - 1
-             Jsomo = IAND(Jsomo,mask)
+             !Jsomo = IAND(Jsomo,mask)
-             qmodel = POPCNT(mask) - POPCNT(XOR(Jsomo,mask)) + 1
+             !qmodel = POPCNT(mask) - POPCNT(XOR(Jsomo,mask)) + 1
             iint = shiftr(p-1,bit_kind_shift) + 1
             ipos = p-shiftl((iint-1),bit_kind_shift)-1
             tmpp = 0
             do ii=1,iint-1
               !mask = ISHFT(1_bit_kind,-1)-1_bit_kind
               !Isomotmp = IAND(Isomo(ii),mask)
               !tmpp += POPCNT(mask) - POPCNT(XOR(Isomotmp,mask))
               tmpp += POPCNT(Isomo(ii))
             end do
             mask = ISHFT(1_bit_kind,ipos+1) - 1
             Isomotmp = IAND(Isomo(iint),mask)
             !pmodel = tmpp + POPCNT(mask) - POPCNT(XOR(Isomotmp,mask))
             pmodel = tmpp + POPCNT(Isomotmp)
             iint = shiftr(q-1,bit_kind_shift) + 1
             ipos = q-shiftl((iint-1),bit_kind_shift)-1
             tmpq = 0
             do ii=1,iint-1
               !mask = ISHFT(1_bit_kind,-1)-1_bit_kind
               !Jsomotmp = IAND(Jsomo(ii),mask)
               !tmpq += POPCNT(mask) - POPCNT(XOR(Jsomotmp,mask))
               tmpq += POPCNT(Jsomo(ii))
             end do
             mask = ISHFT(1_bit_kind,ipos+1) - 1
             Jsomotmp = IAND(Jsomo(iint),mask)
             !qmodel = tmpq + POPCNT(mask) - POPCNT(XOR(Jsomotmp,mask)) + 1
             qmodel = tmpq + POPCNT(Jsomotmp) + 1
          else
-             mask = ISHFT(1_8,p) - 1
+             !mask = ISHFT(1_8,p) - 1
-             Isomo = IAND(Isomo,mask)
+             !Isomo = IAND(Isomo,mask)
-             pmodel = POPCNT(mask) - POPCNT(XOR(Isomo,mask)) + 1
+             !pmodel = POPCNT(mask) - POPCNT(XOR(Isomo,mask)) + 1
-             mask = ISHFT(1_8,q) - 1
+             !mask = ISHFT(1_8,q) - 1
-             Jsomo = IAND(Jsomo,mask)
+             !Jsomo = IAND(Jsomo,mask)
-             qmodel = POPCNT(mask) - POPCNT(XOR(Jsomo,mask))
+             !qmodel = POPCNT(mask) - POPCNT(XOR(Jsomo,mask))
             iint = shiftr(p-1,bit_kind_shift) + 1
             ipos = p-shiftl((iint-1),bit_kind_shift)-1
             tmpp = 0
             do ii=1,iint-1
               !mask = ISHFT(1_bit_kind,-1)-1_bit_kind
               !Isomotmp = IAND(Isomo(ii),mask)
               !tmpp += POPCNT(mask) - POPCNT(XOR(Isomotmp,mask))
               tmpp += POPCNT(Isomo(ii))
             end do
             mask = ISHFT(1_bit_kind,ipos+1) - 1
             Isomotmp = IAND(Isomo(iint),mask)
             !pmodel = tmpp + POPCNT(mask) - POPCNT(XOR(Isomotmp,mask)) + 1
             pmodel = tmpp + POPCNT(Isomotmp) + 1
             iint = shiftr(q-1,bit_kind_shift) + 1
             ipos = q-shiftl((iint-1),bit_kind_shift)-1
             tmpq = 0
             do ii=1,iint-1
               !mask = ISHFT(1_bit_kind,-1)-1_bit_kind
               !Jsomotmp = IAND(Jsomo(ii),mask)
               !tmpq += POPCNT(mask) - POPCNT(XOR(Jsomotmp,mask))
               tmpq += POPCNT(Jsomo(ii))
             end do
             mask = ISHFT(1_bit_kind,ipos+1) - 1
             Jsomotmp = IAND(Jsomo(iint),mask)
             !qmodel = tmpq + POPCNT(mask) - POPCNT(XOR(Jsomotmp,mask))
             qmodel = tmpq + POPCNT(Jsomotmp)
          endif
       case (4)
          ! DOMO -> SOMO
@ -1296,19 +1427,75 @@ subroutine convertOrbIdsToModelSpaceIds(Ialpha, Jcfg, p, q, extype, pmodel, qmod
          !print *,"type -> DOMO -> SOMO"
          !Isomo = IEOR(Isomo,Jsomo)
          if(p.LT.q) then
-             mask = ISHFT(1_8,p) - 1
+             !mask = ISHFT(1_8,p) - 1
-             Jsomo = IAND(Jsomo,mask)
+             !Jsomo = IAND(Jsomo,mask)
-             pmodel = POPCNT(mask) - POPCNT(XOR(Jsomo,mask))
+             !pmodel = POPCNT(mask) - POPCNT(XOR(Jsomo,mask))
-             mask = ISHFT(1_8,q) - 1
+             !mask = ISHFT(1_8,q) - 1
-             Isomo = IAND(Isomo,mask)
+             !Isomo = IAND(Isomo,mask)
-             qmodel = POPCNT(mask) - POPCNT(XOR(Isomo,mask)) + 1
+             !qmodel = POPCNT(mask) - POPCNT(XOR(Isomo,mask)) + 1
             iint = shiftr(p-1,bit_kind_shift) + 1
             ipos = p-shiftl((iint-1),bit_kind_shift)-1
             tmpp = 0
             do ii=1,iint-1
               !mask = ISHFT(1_bit_kind,-1)-1_bit_kind
               !Jsomotmp = IAND(Jsomo(ii),mask)
               !tmpp += POPCNT(mask) - POPCNT(XOR(Jsomotmp,mask))
               tmpp += POPCNT(Jsomo(ii))
             end do
             mask = ISHFT(1_bit_kind,ipos+1) - 1
             Jsomotmp = IAND(Jsomo(iint),mask)
             !pmodel = tmpp + POPCNT(mask) - POPCNT(XOR(Jsomotmp,mask))
             pmodel = tmpp + POPCNT(Jsomotmp)
             iint = shiftr(q-1,bit_kind_shift) + 1
             ipos = q-shiftl((iint-1),bit_kind_shift)-1
             tmpq = 0
             do ii=1,iint-1
               !mask = ISHFT(1_bit_kind,-1)-1_bit_kind
               !Isomotmp = IAND(Isomo(ii),mask)
               !tmpq += POPCNT(mask) - POPCNT(XOR(Isomotmp,mask))
               tmpq += POPCNT(Isomo(ii))
             end do
             mask = ISHFT(1_bit_kind,ipos+1) - 1
             Isomotmp = IAND(Isomo(iint),mask)
             !qmodel = tmpq + POPCNT(mask) - POPCNT(XOR(Isomotmp,mask)) + 1
             qmodel = tmpq + POPCNT(Isomotmp) + 1
          else
-             mask = ISHFT(1_8,p) - 1
+             !mask = ISHFT(1_8,p) - 1
-             Jsomo = IAND(Jsomo,mask)
+             !Jsomo = IAND(Jsomo,mask)
-             pmodel = POPCNT(mask) - POPCNT(XOR(Jsomo,mask)) + 1
+             !pmodel = POPCNT(mask) - POPCNT(XOR(Jsomo,mask)) + 1
-             mask = ISHFT(1_8,q) - 1
+             !mask = ISHFT(1_8,q) - 1
-             Isomo = IAND(Isomo,mask)
+             !Isomo = IAND(Isomo,mask)
-             qmodel = POPCNT(mask) - POPCNT(XOR(Isomo,mask))
+             !qmodel = POPCNT(mask) - POPCNT(XOR(Isomo,mask))
             iint = shiftr(p-1,bit_kind_shift) + 1
             ipos = p-shiftl((iint-1),bit_kind_shift)-1
             tmpp = 0
             do ii=1,iint-1
               !mask = ISHFT(1_bit_kind,-1)-1_bit_kind
               !Jsomotmp = IAND(Jsomo(ii),mask)
               !tmpp += POPCNT(mask) - POPCNT(XOR(Jsomotmp,mask))
               tmpp += POPCNT(Jsomo(ii))
             end do
             mask = ISHFT(1_bit_kind,ipos+1) - 1
             Jsomotmp = IAND(Jsomo(iint),mask)
             !pmodel = tmpp + POPCNT(mask) - POPCNT(XOR(Jsomotmp,mask)) + 1
             pmodel = tmpp + POPCNT(Jsomotmp) + 1
             iint = shiftr(q-1,bit_kind_shift) + 1
             ipos = q-shiftl((iint-1),bit_kind_shift)-1
             tmpq = 0
             do ii=1,iint-1
               !mask = ISHFT(1_bit_kind,-1)-1_bit_kind
               !Isomotmp = IAND(Isomo(ii),mask)
               !tmpq += POPCNT(mask) - POPCNT(XOR(Isomotmp,mask))
               tmpq += POPCNT(Isomo(ii))
             end do
             mask = ISHFT(1_bit_kind,ipos+1) - 1
             Isomotmp = IAND(Isomo(iint),mask)
             !qmodel = tmpq + POPCNT(mask) - POPCNT(XOR(Isomotmp,mask))
             qmodel = tmpq + POPCNT(Isomotmp)
          endif
       case default
          print *,"something is wrong in convertOrbIdsToModelSpaceIds"
@ -1366,8 +1553,13 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze
  integer                        :: rowsTKI
  integer                        :: noccpp
  integer                        :: istart_cfg, iend_cfg, num_threads_max
  integer                        :: iint, jint, ipos, jpos, Nsomo_I, iii
  integer                        :: nconnectedJ,nconnectedtotalmax,nconnectedmaxJ,maxnalphas,ntotJ
-  integer*8                      :: MS, Isomo, Idomo, Jsomo, Jdomo, Ialpha, Ibeta
+  integer*8                      :: MS,Ialpha, Ibeta
  integer(bit_kind)              :: Isomo(N_INT)
  integer(bit_kind)              :: Idomo(N_INT)
  integer(bit_kind)              :: Jsomo(N_INT)
  integer(bit_kind)              :: Jdomo(N_INT)
  integer                        :: moi, moj, mok, mol, starti, endi, startj, endj, cnti, cntj, cntk
  real*8                         :: norm_coef_cfg, fac2eints
  real*8                         :: norm_coef_det
@ -1382,6 +1574,8 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze
  real*8,dimension(:),allocatable:: diag_energies
  real*8                         :: tmpvar, tmptot
  real*8                         :: core_act_contrib
  integer :: listall(N_int*bit_kind_size), nelall
  integer :: countelec
  integer(omp_lock_kind), allocatable :: lock(:)
  call omp_set_max_active_levels(1)
@ -1410,8 +1604,8 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze
  !nconnectedtotalmax = 1000
  !nconnectedmaxJ = 1000
  maxnalphas = elec_num*mo_num
-  Icfg(1,1) = psi_configuration(1,1,1)
+  Icfg(:,1) = psi_configuration(:,1,1)
-  Icfg(1,2) = psi_configuration(1,2,1)
+  Icfg(:,2) = psi_configuration(:,2,1)
  allocate(listconnectedJ(N_INT,2,max(sze,10000)))
  allocate(idslistconnectedJ(max(sze,10000)))
  call obtain_connected_J_givenI(1, Icfg, listconnectedJ, idslistconnectedJ, nconnectedmaxJ, nconnectedtotalmax)
@ -1443,6 +1637,7 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze
      !$OMP shared(istart_cfg, iend_cfg, psi_configuration, mo_num, psi_config_data,&
      !$OMP    N_int, N_st, psi_out, psi_in, h_core_ri, core_energy, h_act_ri, AIJpqContainer,&
      !$OMP     pp, sze, NalphaIcfg_list,alphasIcfg_list, bit_tmp,       &
      !$OMP     qq, iint, jint, ipos, jpos, nelall, listall, Nsomo_I, countelec,&
      !$OMP     AIJpqMatrixDimsList, diag_energies, n_CSF, lock, NBFmax,nconnectedtotalmax, nconnectedmaxJ,maxnalphas,&
      !$OMP     n_core_orb, n_act_orb, list_act, n, list_core,  list_core_is_built,core_act_contrib, num_threads_max,&
      !$OMP     n_core_orb_is_built, mo_integrals_map, mo_integrals_map_is_built)
@ -1465,10 +1660,12 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze
    ! else
    ! cycle
-     Icfg(1,1) = psi_configuration(1,1,i)
+    do ii=1,N_INT
-     Icfg(1,2) = psi_configuration(1,2,i)
+     Icfg(ii,1) = psi_configuration(ii,1,i)
-     Isomo = Icfg(1,1)
+     Icfg(ii,2) = psi_configuration(ii,2,i)
-     Idomo = Icfg(1,2)
+     Isomo(ii) = Icfg(ii,1)
     Idomo(ii) = Icfg(ii,2)
    enddo
    NSOMOI = getNSOMO(Icfg)
     ! find out all pq holes possible
@ -1479,44 +1676,88 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze
     ! list_core_inact
     ! bitmasks
     !do k = 1,mo_num
-     do kk = 1,n_act_orb
+    ! do kk = 1,n_act_orb
-       k = list_act(kk)
+    !   k = list_act(kk)
-       if(POPCNT(IAND(Isomo,IBSET(0_8,k-1))) .EQ. 1) then
+    !   if(POPCNT(IAND(Isomo,IBSET(0_8,k-1))) .EQ. 1) then
    !     nholes += 1
    !     listholes(nholes) = k
    !     holetype(nholes) = 1
    !   endif
    ! enddo
    ! ! holes in DOMO
    ! !do k = 1,mo_num
    ! do kk = 1,n_act_orb
    !   k = list_act(kk)
    !   if(POPCNT(IAND(Idomo,IBSET(0_8,k-1))) .EQ. 1) then
    !     nholes += 1
    !     listholes(nholes) = k
    !     holetype(nholes) = 2
    !   endif
    ! enddo
    ! ! find vmos
    ! do kk = 1,n_act_orb
    !   k = list_act(kk)
    !   !print *,i,IBSET(0,i-1),POPCNT(IAND(Isomo,(IBSET(0_8,i-1)))), POPCNT(IAND(Idomo,(IBSET(0_8,i-1))))
    !   if(POPCNT(IAND(Isomo,(IBSET(0_8,k-1)))) .EQ. 0 .AND. POPCNT(IAND(Idomo,(IBSET(0_8,k-1)))) .EQ. 0) then
    !     nvmos += 1
    !     listvmos(nvmos) = k
    !     vmotype(nvmos) = 0
    !   else if(POPCNT(IAND(Isomo,(IBSET(0_8,k-1)))) .EQ. 1 .AND. POPCNT(IAND(Idomo,(IBSET(0_8,k-1)))) .EQ. 0 ) then
    !     nvmos += 1
    !     listvmos(nvmos) = k
    !     vmotype(nvmos) = 1
    !   end if
    ! enddo
  ! find out all pq holes possible
    nholes = 0
        call bitstring_to_list(Isomo,listall,nelall,N_int)
        do iii=1,nelall
          nholes += 1
-         listholes(nholes) = k
+          listholes(nholes) = listall(iii)
          holetype(nholes) = 1
       endif
        end do
-     ! holes in DOMO
+
-     !do k = 1,mo_num
+        Nsomo_I = nelall
-     do kk = 1,n_act_orb
+
-       k = list_act(kk)
+        call bitstring_to_list(Idomo,listall,nelall,N_int)
-       if(POPCNT(IAND(Idomo,IBSET(0_8,k-1))) .EQ. 1) then
+
        do iii=1,nelall
          if(listall(iii) .gt. n_core_orb)then
            nholes += 1
-         listholes(nholes) = k
+            listholes(nholes) = listall(iii)
            holetype(nholes) = 2
          endif
        end do
-     ! find vmos
+
     listvmos = -1
     vmotype = -1
     nvmos = 0
-     do kk = 1,n_act_orb
+  ! find vmos
-       k = list_act(kk)
+    ! Take into account N_int
-       !print *,i,IBSET(0,i-1),POPCNT(IAND(Isomo,(IBSET(0_8,i-1)))), POPCNT(IAND(Idomo,(IBSET(0_8,i-1))))
+    do ii = 1, n_act_orb
-       if(POPCNT(IAND(Isomo,(IBSET(0_8,k-1)))) .EQ. 0 .AND. POPCNT(IAND(Idomo,(IBSET(0_8,k-1)))) .EQ. 0) then
+      iii = list_act(ii)
      iint = shiftr(iii-1,bit_kind_shift) + 1
      ipos = iii-shiftl((iint-1),bit_kind_shift)-1
      if(IAND(Idomo(iint),(IBSET(0_8,ipos))) .EQ. 0) then
        if(IAND(Isomo(iint),(IBSET(0_8,ipos))) .EQ. 0) then
          nvmos += 1
-         listvmos(nvmos) = k
+          listvmos(nvmos) = iii
         vmotype(nvmos) = 0
       else if(POPCNT(IAND(Isomo,(IBSET(0_8,k-1)))) .EQ. 1 .AND. POPCNT(IAND(Idomo,(IBSET(0_8,k-1)))) .EQ. 0 ) then
         nvmos += 1
         listvmos(nvmos) = k
          vmotype(nvmos) = 1
        else if(POPCNT(IAND(Isomo(iint),(IBSET(0_8,ipos)))) .EQ. 1) then
          nvmos += 1
          listvmos(nvmos) = iii
          vmotype(nvmos) = 2
        end if
      end if
    end do
     ! Icsf ids
     starti = psi_config_data(i,1)
     endi   = psi_config_data(i,2)
@ -1533,16 +1774,31 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze
      extype = excitationTypes_single(j)
      ! Off diagonal terms
      call convertOrbIdsToModelSpaceIds(Icfg, singlesI(1,1,j), p, q, extype, pmodel, qmodel)
-      Jsomo = singlesI(1,1,j)
+      do ii=1,N_INT
-      Jdomo = singlesI(1,2,j)
+        Jsomo(ii) = singlesI(1,1,j)
        Jdomo(ii) = singlesI(1,2,j)
      enddo
      ! Get actual p pos
      pp  = p
      iint = shiftr(pp-1,bit_kind_shift) + 1
      ipos = pp-shiftl((iint-1),bit_kind_shift)-1
      ! Get actual q pos
      qq  = q
      jint = shiftr(qq-1,bit_kind_shift) + 1
      jpos = qq-shiftl((jint-1),bit_kind_shift)-1
      ! Add the hole on J
-      if(POPCNT(IAND(Jsomo,IBSET(0_8,q-1))) .EQ. 1  .AND. POPCNT(IAND(Isomo,IBSET(0_8,q-1))) .EQ. 0) then
+      !if(POPCNT(IAND(Jsomo,IBSET(0_8,q-1))) .EQ. 1  .AND. POPCNT(IAND(Isomo,IBSET(0_8,q-1))) .EQ. 0) then
      if(POPCNT(IAND(Jsomo(jint),IBSET(0_8,jpos))) .EQ. 1  .AND. POPCNT(IAND(Isomo(jint),IBSET(0_8,jpos))) .EQ. 0) then
        nholes += 1
        listholes(nholes) = q
        holetype(nholes) = 1
      endif
-      if((POPCNT(IAND(Jdomo,IBSET(0_8,q-1))) .EQ. 1 .AND. POPCNT(IAND(Idomo,IBSET(0_8,q-1))) .EQ. 0) .AND. POPCNT(IAND(Isomo,IBSET(0_8,q-1))) .EQ. 0) then
+      !if((POPCNT(IAND(Jdomo,IBSET(0_8,q-1))) .EQ. 1 .AND. POPCNT(IAND(Idomo,IBSET(0_8,q-1))) .EQ. 0) .AND. POPCNT(IAND(Isomo,IBSET(0_8,q-1))) .EQ. 0) then
      if((POPCNT(IAND(Jdomo(jint),IBSET(0_8,jpos))) .EQ. 1 .AND. POPCNT(IAND(Idomo(jint),IBSET(0_8,jpos))) .EQ. 0) .AND.&
      POPCNT(IAND(Isomo(jint),IBSET(0_8,jpos))) .EQ. 0) then
        nholes += 1
        listholes(nholes) = q
        holetype(nholes) = 2
@ -1578,10 +1834,12 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze
      enddo
      ! Undo setting in listholes
-      if(POPCNT(IAND(Jsomo,IBSET(0_8,q-1))) .EQ. 1  .AND. POPCNT(IAND(Isomo,IBSET(0_8,q-1))) .EQ. 0) then
+      !if(POPCNT(IAND(Jsomo,IBSET(0_8,q-1))) .EQ. 1  .AND. POPCNT(IAND(Isomo,IBSET(0_8,q-1))) .EQ. 0) then
      if(POPCNT(IAND(Jsomo(jint),IBSET(0_8,jpos))) .EQ. 1  .AND. POPCNT(IAND(Isomo(jint),IBSET(0_8,jpos))) .EQ. 0) then
        nholes -= 1
      endif
-      if((POPCNT(IAND(Jdomo,IBSET(0_8,q-1))) .EQ. 1 .AND. POPCNT(IAND(Idomo,IBSET(0_8,q-1))) .EQ. 0) .AND. POPCNT(IAND(Isomo,IBSET(0_8,q-1))) .EQ. 0) then
+      if((POPCNT(IAND(Jdomo(jint),IBSET(0_8,jpos))) .EQ. 1 .AND. POPCNT(IAND(Idomo(jint),IBSET(0_8,jpos))) .EQ. 0) .AND.&
      POPCNT(IAND(Isomo(jint),IBSET(0_8,jpos))) .EQ. 0) then
        nholes -= 1
      endif
    enddo
@ -1593,6 +1851,9 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze
  deallocate(excitationTypes_single)
  !print *," singles part psi(1,1)=",psi_out(1,1)
  !do i=1,n_CSF
  !  print *,"i=",i," psi(i)=",psi_out(1,i)
  !enddo
  allocate(listconnectedJ(N_INT,2,max(sze,10000)))
  allocate(alphas_Icfg(N_INT,2,max(sze,10000)))
@ -1607,7 +1868,6 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze
  !!!====================!!!
  !!! Double Excitations !!!
  !!!====================!!!
  ! Loop over all selected configurations
  !$OMP DO SCHEDULE(static)
  do i = istart_cfg,iend_cfg
@ -1617,8 +1877,10 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze
     ! else
     ! cycle
-     Icfg(1,1) = psi_configuration(1,1,i)
+     do ii=1,N_INT
-     Icfg(1,2) = psi_configuration(1,2,i)
+       Icfg(ii,1) = psi_configuration(ii,1,i)
       Icfg(ii,2) = psi_configuration(ii,2,i)
     enddo
     starti = psi_config_data(i,1)
     endi   = psi_config_data(i,2)
@ -1629,14 +1891,15 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze
     Nalphas_Icfg = NalphaIcfg_list(i)
     alphas_Icfg(1:n_int,1:2,1:Nalphas_Icfg) = alphasIcfg_list(1:n_int,1:2,i,1:Nalphas_Icfg)
-     if(Nalphas_Icfg .GT. maxnalphas) then
+     !if(Nalphas_Icfg .GT. maxnalphas) then
-       print *,"Nalpha > maxnalpha"
+     !  print *,"Nalpha > maxnalpha"
-     endif
+     !endif
-     call obtain_connected_J_givenI(i, Icfg, listconnectedJ, idslistconnectedJ, nconnectedJ, ntotJ)
+     !call obtain_connected_J_givenI(i, Icfg, listconnectedJ, idslistconnectedJ, nconnectedJ, ntotJ)
     ! TODO : remove doubly excited for return
-     !print *,"I=",i," isomo=",psi_configuration(1,1,i)," idomo=",psi_configuration(1,2,i), " psiout=",psi_out(1,5)
+     !print *,"I=",i,"isomo=",psi_configuration(1,1,i),psi_configuration(2,1,i),POPCNT(psi_configuration(1,1,i)),POPCNT(psi_configuration(2,1,i)),&
     !"idomo=",psi_configuration(1,2,i),psi_configuration(2,2,i),POPCNT(psi_configuration(1,2,i)),POPCNT(psi_configuration(2,2,i)), "Nalphas_Icfg=",Nalphas_Icfg
     do k = 1,Nalphas_Icfg
        ! Now generate all singly excited with respect to a given alpha CFG
@ -1647,15 +1910,18 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze
        call obtain_connected_I_foralpha(i, alphas_Icfg(1,1,k), connectedI_alpha, idxs_connectedI_alpha, &
                                         nconnectedI, excitationIds, excitationTypes, diagfactors)
        !if(i .EQ. 218) then
        !   print *,'k=',k,' kcfgSOMO=',alphas_Icfg(1,1,k),alphas_Icfg(2,1,k),' ',POPCNT(alphas_Icfg(1,1,k)),' &
        !   kcfgDOMO=',alphas_Icfg(1,2,k),alphas_Icfg(2,2,k),' ',POPCNT(alphas_Icfg(1,2,k)), " NconnectedI=",nconnectedI
        !   !print *,'k=',k,' kcfgSOMO=',alphas_Icfg(1,1,k),' ',POPCNT(alphas_Icfg(1,1,k)),' &
        !   !kcfgDOMO=',alphas_Icfg(1,2,k),' ',POPCNT(alphas_Icfg(1,2,k)), " NconnectedI=",nconnectedI
        !endif
        if(nconnectedI .EQ. 0) then
           cycle
        endif
        !if(i .EQ. 1) then
        !   print *,'k=',k,' kcfgSOMO=',alphas_Icfg(1,1,k),' ',POPCNT(alphas_Icfg(1,1,k)),' kcfgDOMO=',alphas_Icfg(1,2,k),' ',POPCNT(alphas_Icfg(1,2,k))
        !endif
        ! Here we do 2x the loop. One to count for the size of the matrix, then we compute.
        totcolsTKI = 0
        rowsTKI = -1
@ -1665,15 +1931,30 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze
           p = excitationIds(1,j)
           q = excitationIds(2,j)
           extype = excitationTypes(j)
           !print *,"K=",k,"j=",j, "countelec=",countelec," p=",p," q=",q, " extype=",extype, "NSOMOalpha=",NSOMOalpha," NSOMOI=",NSOMOI, "alphas_Icfg(1,1,k)=",alphas_Icfg(1,1,k), &
           !alphas_Icfg(2,1,k), " domo=",alphas_Icfg(1,2,k), alphas_Icfg(2,2,k), " connected somo=",connectedI_alpha(1,1,j), &
           !connectedI_alpha(2,1,j), " domo=",connectedI_alpha(1,2,j), connectedI_alpha(2,2,j)
           call convertOrbIdsToModelSpaceIds(alphas_Icfg(1,1,k), connectedI_alpha(1,1,j), p, q, extype, pmodel, qmodel)
           ! for E_pp E_rs and E_ppE_rr case
           rowsikpq = AIJpqMatrixDimsList(NSOMOalpha,extype,pmodel,qmodel,1)
           colsikpq = AIJpqMatrixDimsList(NSOMOalpha,extype,pmodel,qmodel,2)
           !if(i.eq.218)then
           !  print *,"j=",j," k=",k,"p=",p,"q=",q,"NSOMOalpha=",NSOMOalpha, "pmodel=",pmodel,"qmodel=",qmodel, "extype=",extype,&
           !  "conn somo=",connectedI_alpha(1,1,j),connectedI_alpha(2,1,j),&
           !  "conn domo=",connectedI_alpha(1,2,j),connectedI_alpha(2,2,j)
           !  do m=1,colsikpq
           !    print *,idxs_connectedI_alpha(j)+m-1
           !  enddo
           !endif
           !print *,"j=",j," Nsomo=",NSOMOalpha," rowsikpq=",rowsikpq," colsikpq=",colsikpq, " p=",pmodel," q=",qmodel, " extyp=",extype
           totcolsTKI += colsikpq
           rowsTKI = rowsikpq
        enddo
        !if(i.eq.1)then
        !  print *,"n_st=",n_st,"rowsTKI=",rowsTKI, " nconnectedI=",nconnectedI, &
        !  "totcolsTKI=",totcolsTKI
        !endif
        allocate(TKI(n_st,rowsTKI,totcolsTKI)) ! coefficients of CSF
        ! Initialize the integral container
        ! dims : (totcolsTKI, nconnectedI)
@ -1703,10 +1984,10 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze
                 TKI(kk,l,totcolsTKI+m) = AIJpqContainer(l,m,pmodel,qmodel,extype,NSOMOalpha) &
                    * psi_in(kk,idxs_connectedI_alpha(j)+m-1)
              enddo
           !if(i.eq.1) then
           !      print *,AIJpqContainer(l,m,pmodel,qmodel,extype,NSOMOalpha)
           !endif
           enddo
           !if(i.eq.1) then
           !      print *,"j=",j,"psi_in=",psi_in(1,idxs_connectedI_alpha(j)+m-1)
           !endif
           enddo
           diagfactors_0 = diagfactors(j)*0.5d0
@ -1745,16 +2026,24 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze
           rowsTKI = rowsikpq
           CCmattmp = 0.d0
        !if(i.eq.1)then
        !  print *,"\t n_st=",n_st," colsikpq=",colsikpq," rowsTKI=",rowsTKI,&
        !    " | ",size(TKIGIJ,1),size(AIJpqContainer,1),size(CCmattmp,1)
        !endif
           call dgemm('N','N', n_st, colsikpq, rowsTKI, 1.d0,        &
               TKIGIJ(1,1,j), size(TKIGIJ,1),                        &
               AIJpqContainer(1,1,pmodel,qmodel,extype,NSOMOalpha),  &
               size(AIJpqContainer,1), 0.d0,                         &
               CCmattmp, size(CCmattmp,1) )
           !print *,"j=",j,"colsikpq=",colsikpq, "sizeTIG=",size(TKIGIJ,1),"sizeaijpq=",size(AIJpqContainer,1)
           do m = 1,colsikpq
              call omp_set_lock(lock(idxs_connectedI_alpha(j)+m-1))
              do kk = 1,n_st
                 psi_out(kk,idxs_connectedI_alpha(j)+m-1) += CCmattmp(kk,m)
                 !if(dabs(CCmattmp(kk,m)).gt.1e-10)then
                 !  print *, CCmattmp(kk,m), " | ",idxs_connectedI_alpha(j)+m-1
                 !end if
              enddo
              call omp_unset_lock(lock(idxs_connectedI_alpha(j)+m-1))
           enddo
@ -1789,6 +2078,10 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze
  !$OMP END DO
  !$OMP END PARALLEL
  !print *," ----- "
  !do i=1,sze
  !  print *,"i=",i," psi_out(i)=",psi_out(1,i)
  !end do
  call omp_set_max_active_levels(4)
  deallocate(diag_energies)
--- a/src/dav_general_mat/test_dav.irp.f.example
+++ b/src/dav_general_mat/test_dav.irp.f.example
--- a/src/davidson/diagonalization_hcfg.irp.f
+++ b/src/davidson/diagonalization_hcfg.irp.f
@ -112,6 +112,8 @@ subroutine davidson_diag_cfg_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,sze_csf,N
  double precision, allocatable  :: U(:,:), U_csf(:,:), overlap(:,:)
  double precision, allocatable  :: tmpU(:,:), tmpW(:,:)
  double precision, pointer      :: W(:,:), W_csf(:,:)
  !double precision, pointer      :: W2(:,:), W_csf2(:,:)
  !double precision, allocatable  :: U2(:,:), U_csf2(:,:)
  logical                        :: disk_based
  double precision               :: energy_shift(N_st_diag_in*davidson_sze_max)
@ -234,12 +236,15 @@ subroutine davidson_diag_cfg_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,sze_csf,N
    call c_f_pointer(ptr_w, W_csf, (/sze_csf,N_st_diag*itermax/))
  else
    allocate(W(sze,N_st_diag),W_csf(sze_csf,N_st_diag*itermax))
    !allocate(W2(sze,N_st_diag),W_csf2(sze_csf,N_st_diag*itermax))
  endif
  allocate(                                                          &
      ! Large
      U(sze,N_st_diag),                                              &
      !U2(sze,N_st_diag),                                              &
      U_csf(sze_csf,N_st_diag*itermax),                              &
      !U_csf2(sze_csf,N_st_diag*itermax),                              &
      ! Small
      h(N_st_diag*itermax,N_st_diag*itermax),                        &
@ -325,7 +330,7 @@ subroutine davidson_diag_cfg_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,sze_csf,N
              enddo
            enddo
            !tmpU     =0.0d0
-            !tmpU(1,2)=1.0d0
+            !tmpU(1,1)=1.0d0
            double precision               :: irp_rdtsc
            double precision               :: ticks_0, ticks_1
            integer*8                      :: irp_imax
@ -348,9 +353,9 @@ subroutine davidson_diag_cfg_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,sze_csf,N
            !call convertWFfromDETtoCSF(N_st_diag,u_in(1,1),W_csf2(1,1))
            !do i=1,sze_csf
            !  print *,"I=",i," qp=",W_csf2(i,1)," my=",W_csf(i,1)," diff=",dabs(W_csf2(i,1))-dabs(W_csf(i,1))
-            !  if(dabs(dabs(W_csf2(i,1))-dabs(W_csf(i,1))) .gt. 1.0e-10)then
+            !  !if(dabs(dabs(W_csf2(i,1))-dabs(W_csf(i,1))) .gt. 1.0e-10)then
-            !    print *,"somo=",psi_configuration(1,1,i)," domo=",psi_configuration(1,2,i)," diff=",dabs(W_csf2(i,1))-dabs(W_csf(i,1))
+            !  !  print *,"somo=",psi_configuration(1,1,i)," domo=",psi_configuration(1,2,i)," diff=",dabs(W_csf2(i,1))-dabs(W_csf(i,1))
-            !  endif
+            !  !endif
            !end do
            !stop
            deallocate(tmpW)
--- a/src/determinants/determinants.irp.f
+++ b/src/determinants/determinants.irp.f
@ -329,6 +329,7 @@ END_PROVIDER
 BEGIN_PROVIDER [ integer(bit_kind), psi_det_sorted_bit, (N_int,2,psi_det_size) ]
 &BEGIN_PROVIDER [ double precision, psi_coef_sorted_bit, (psi_det_size,N_states) ]
 &BEGIN_PROVIDER [ integer, psi_det_sorted_bit_order, (psi_det_size) ]
   implicit none
   BEGIN_DOC
   ! Determinants on which we apply $\langle i|H|psi \rangle$ for perturbation.
@ -337,8 +338,8 @@ END_PROVIDER
   ! function.
   END_DOC
-   call sort_dets_by_det_search_key(N_det, psi_det, psi_coef, size(psi_coef,1),       &
+   call sort_dets_by_det_search_key_ordered(N_det, psi_det, psi_coef, size(psi_coef,1),       &
-       psi_det_sorted_bit, psi_coef_sorted_bit, N_states)
+       psi_det_sorted_bit, psi_coef_sorted_bit, N_states, psi_det_sorted_bit_order)
 END_PROVIDER
@ -1005,3 +1006,48 @@ BEGIN_PROVIDER [ double precision, psi_det_Hii, (N_det) ]
 END_PROVIDER
 subroutine sort_dets_by_det_search_key_ordered(Ndet, det_in, coef_in, sze, det_out, coef_out, N_st, iorder)
   use bitmasks
   implicit none
   integer, intent(in)            :: Ndet, N_st, sze
   integer(bit_kind), intent(in)  :: det_in  (N_int,2,sze)
   double precision , intent(in)  :: coef_in(sze,N_st)
   integer(bit_kind), intent(out) :: det_out (N_int,2,sze)
   double precision , intent(out) :: coef_out(sze,N_st)
   integer, intent(out)           :: iorder(sze)
   BEGIN_DOC
   ! Reorders the determinants with respect to their :c:func:`det_search_key`,
   ! which is useful to accelerate the search of a random determinant in the
   ! wave function.
   !
   ! The permutation that was applied is returned in iorder:
   ! det_out(:,:,i) = det_in(:,:,iorder(i)) and
   ! coef_out(i,:)  = coef_in(iorder(i),:) for i = 1..Ndet.
   ! Only the first Ndet entries of det_out, coef_out and iorder are written.
   !
   ! /!\ The first dimension of coef_out and coef_in need to be psi_det_size
   !
   END_DOC
   integer                        :: idet, isrc
   integer*8, allocatable         :: key_of_det(:)
   integer*8, external            :: det_search_key

   ! One search key per determinant, with the identity permutation as the
   ! starting point for the sort.
   allocate ( key_of_det(Ndet) )
   do idet=1,Ndet
     iorder(idet) = idet
     !$DIR FORCEINLINE
     key_of_det(idet) = det_search_key(det_in(1,1,idet),N_int)
   enddo

   ! i8sort receives the keys together with iorder, which then holds the
   ! permutation used for the gather below.
   call i8sort(key_of_det,iorder,Ndet)

   ! Gather determinants and coefficients through the permutation.
   !DIR$ IVDEP
   do idet=1,Ndet
     isrc = iorder(idet)
     det_out(1:N_int,1:2,idet) = det_in(1:N_int,1:2,isrc)
     coef_out(idet,1:N_st) = coef_in(isrc,1:N_st)
   enddo

   deallocate(key_of_det)
 end
--- a/src/determinants/slater_rules.irp.f
+++ b/src/determinants/slater_rules.irp.f
@ -83,7 +83,7 @@ subroutine get_excitation(det1,det2,exc,degree,phase,Nint)
  !               exc(1,1,1) = q
  !               exc(1,2,1) = p
-  ! T^alpha_pq  : exc(0,1,2) = 1
+  ! T^beta_pq   : exc(0,1,2) = 1
  !               exc(0,2,2) = 1
  !               exc(1,1,2) = q
  !               exc(1,2,2) = p
@ -434,6 +434,98 @@ subroutine get_single_excitation(det1,det2,exc,phase,Nint)
 end
 subroutine get_single_excitation_cfg(cfg1,cfg2,p,q,Nint)
  use bitmasks
  implicit none
  BEGIN_DOC
  ! Returns the excitation operator between two singly excited configurations.
  !
  ! cfg1, cfg2 : the two configurations, stored as (Nint,2) bit strings.
  !              The two components (second index) are scanned in order,
  !              word by word.
  ! q          : 1-based position of the hole, i.e. of the lowest bit that
  !              is set in cfg1 but not in cfg2 in the last differing word
  !              visited (maximum over the two components of the recorded
  !              positions).
  ! p          : same as q for the particle, i.e. a bit set in cfg2 but not
  !              in cfg1.
  !
  ! The routine returns as soon as one component has both a hole and a
  ! particle recorded. If this never happens, p = q = 0 on exit.
  END_DOC
  integer, intent(in)            :: Nint
  integer(bit_kind), intent(in)  :: cfg1(Nint,2)
  integer(bit_kind), intent(in)  :: cfg2(Nint,2)
  integer, intent(out)           :: p, q
  integer                        :: l, ispin, ishift
  integer(bit_kind)              :: hole, particle, tmp
  integer                        :: exc(0:2,2,2)

  ASSERT (Nint > 0)
  p = 0
  q = 0
  ! Zero the whole table and not only the exc(0,:,:) flags: the final max()
  ! reads the position slots of both components, and the slots of a component
  ! that recorded nothing must not contain undefined values.
  exc = 0

  do ispin = 1,2
    ! ishift converts a 0-based bit position inside word l into a 1-based
    ! orbital index.
    ishift = 1-bit_kind_size
    do l=1,Nint
      ishift = ishift + bit_kind_size
      if (cfg1(l,ispin) == cfg2(l,ispin)) then
        cycle
      endif
      tmp = xor( cfg1(l,ispin), cfg2(l,ispin) )
      particle = iand(tmp, cfg2(l,ispin))
      hole     = iand(tmp, cfg1(l,ispin))
      if (particle /= 0_bit_kind) then
        exc(0,2,ispin) = 1
        exc(1,2,ispin) = trailz(particle)+ishift
      endif
      if (hole /= 0_bit_kind) then
        exc(0,1,ispin) = 1
        exc(1,1,ispin) = trailz(hole)+ishift
      endif
      ! Keep scanning until this component has both a hole and a particle.
      if ( iand(exc(0,1,ispin),exc(0,2,ispin)) /= 1) then
        cycle
      endif
      ! Set p and q
      q = max(exc(1,1,1),exc(1,1,2))
      p = max(exc(1,2,1),exc(1,2,2))
      return
    enddo
  enddo
 end
 subroutine bitstring_to_list_ab( string, list, n_elements, Nint)
  use bitmasks
  implicit none
--- a/src/hartree_fock/scf.irp.f
+++ b/src/hartree_fock/scf.irp.f
@ -83,16 +83,12 @@ subroutine run()
  PROVIDE scf_algorithm
-  if(scf_algorithm .eq. "DIIS_MO") then
+  if(scf_algorithm .eq. "DIIS") then
    call Roothaan_Hall_SCF_MO()
  elseif(scf_algorithm .eq. "DIIS_MODIF") then
    call Roothaan_Hall_SCF_MODIF()
  elseif(scf_algorithm .eq. "DIIS") then
    call Roothaan_Hall_SCF()
  elseif(scf_algorithm .eq. "Simple") then
    call Roothaan_Hall_SCF_Simple()
  else
-    print *, ' not implemented yet:', scf_algorithm
+    print *, scf_algorithm, ' not implemented yet'
  endif
  call ezfio_set_hartree_fock_energy(SCF_energy)
--- a/src/mo_two_e_ints/mo_bi_integrals.irp.f
+++ b/src/mo_two_e_ints/mo_bi_integrals.irp.f
@ -245,18 +245,16 @@ subroutine add_integrals_to_map(mask_ijkl)
    return
  endif
  double precision               :: accu_bis
  accu_bis = 0.d0
  call wall_time(wall_1)
-  size_buffer = min( (qp_max_mem/(nproc*5)),mo_num*mo_num*mo_num)
+  size_buffer = min(mo_num*mo_num*mo_num,8000000)
  print*, 'Buffers : ', 8.*(mo_num*(n_j)*(n_k+1) + mo_num+&
      ao_num+ao_num*ao_num+ size_buffer*3)/(1024*1024), 'MB / core'
  !$OMP PARALLEL PRIVATE(l1,k1,j1,i1,i2,i3,i4,i,j,k,l,c, ii1,kmax,   &
      !$OMP  two_e_tmp_0_idx, two_e_tmp_0, two_e_tmp_1,two_e_tmp_2,two_e_tmp_3,&
      !$OMP  buffer_i,buffer_value,n_integrals,wall_2,i0,j0,k0,l0,   &
-      !$OMP  wall_0,thread_num,accu_bis)                             &
+      !$OMP  wall_0,thread_num)                             &
      !$OMP  DEFAULT(NONE)                                           &
      !$OMP  SHARED(size_buffer,ao_num,mo_num,n_i,n_j,n_k,n_l,   &
      !$OMP  mo_coef_transp,                                         &
@ -434,10 +432,10 @@ subroutine add_integrals_to_map(mask_ijkl)
  !$OMP END DO NOWAIT
  deallocate (two_e_tmp_1,two_e_tmp_2,two_e_tmp_3)
-  integer                        :: index_needed
+  if (n_integrals > 0) then
    call insert_into_mo_integrals_map(n_integrals,buffer_i,buffer_value,&
        real(mo_integrals_threshold,integral_kind))
  endif
  deallocate(buffer_i, buffer_value)
  !$OMP END PARALLEL
  call map_merge(mo_integrals_map)
@ -527,12 +525,10 @@ subroutine add_integrals_to_map_three_indices(mask_ijk)
  call wall_time(wall_1)
  call cpu_time(cpu_1)
  double precision               :: accu_bis
  accu_bis = 0.d0
  !$OMP PARALLEL PRIVATE(m,l1,k1,j1,i1,i2,i3,i4,i,j,k,l,c, ii1,kmax, &
      !$OMP  two_e_tmp_0_idx, two_e_tmp_0, two_e_tmp_1,two_e_tmp_2,two_e_tmp_3,&
      !$OMP  buffer_i,buffer_value,n_integrals,wall_2,i0,j0,k0,l0,   &
-      !$OMP  wall_0,thread_num,accu_bis)                             &
+      !$OMP  wall_0,thread_num)                             &
      !$OMP  DEFAULT(NONE)                                           &
      !$OMP  SHARED(size_buffer,ao_num,mo_num,n_i,n_j,n_k,       &
      !$OMP  mo_coef_transp,                                         &
@ -730,8 +726,6 @@ subroutine add_integrals_to_map_three_indices(mask_ijk)
  !$OMP END DO NOWAIT
  deallocate (two_e_tmp_1,two_e_tmp_2,two_e_tmp_3)
  integer                        :: index_needed
  call insert_into_mo_integrals_map(n_integrals,buffer_i,buffer_value,&
      real(mo_integrals_threshold,integral_kind))
  deallocate(buffer_i, buffer_value)
--- a/src/non_h_ints_mu/debug_integ_jmu_modif.irp.f
+++ b/src/non_h_ints_mu/debug_integ_jmu_modif.irp.f
@ -306,7 +306,7 @@ subroutine test_int2_grad1_u12_ao()
        call num_int2_grad1_u12_ao(i, j, ipoint, integ)
-        i_exc  = int2_grad1_u12_ao(1,i,j,ipoint) 
+        i_exc  = int2_grad1_u12_ao(i,j,ipoint,1) 
        i_num  = integ(1)
        acc_ij = dabs(i_exc - i_num)
        if(acc_ij .gt. eps_ij) then
@ -318,7 +318,7 @@ subroutine test_int2_grad1_u12_ao()
        acc_tot += acc_ij
        normalz += dabs(i_num)
-        i_exc  = int2_grad1_u12_ao(2,i,j,ipoint) 
+        i_exc  = int2_grad1_u12_ao(i,j,ipoint,2) 
        i_num  = integ(2)
        acc_ij = dabs(i_exc - i_num)
        if(acc_ij .gt. eps_ij) then
@ -330,7 +330,7 @@ subroutine test_int2_grad1_u12_ao()
        acc_tot += acc_ij
        normalz += dabs(i_num)
-        i_exc  = int2_grad1_u12_ao(3,i,j,ipoint) 
+        i_exc  = int2_grad1_u12_ao(i,j,ipoint,3) 
        i_num  = integ(3)
        acc_ij = dabs(i_exc - i_num)
        if(acc_ij .gt. eps_ij) then
@ -382,7 +382,7 @@ subroutine test_int2_u_grad1u_total_j1b2()
        call num_int2_u_grad1u_total_j1b2(i, j, ipoint, integ)
-        i_exc  = x * int2_u_grad1u_j1b2(i,j,ipoint) - int2_u_grad1u_x_j1b2(1,i,j,ipoint) 
+        i_exc  = x * int2_u_grad1u_j1b2(i,j,ipoint) - int2_u_grad1u_x_j1b2(i,j,ipoint,1)
        i_num  = integ(1)
        acc_ij = dabs(i_exc - i_num)
        if(acc_ij .gt. eps_ij) then
@ -394,7 +394,7 @@ subroutine test_int2_u_grad1u_total_j1b2()
        acc_tot += acc_ij
        normalz += dabs(i_num)
-        i_exc  = y * int2_u_grad1u_j1b2(i,j,ipoint) - int2_u_grad1u_x_j1b2(2,i,j,ipoint) 
+        i_exc  = y * int2_u_grad1u_j1b2(i,j,ipoint) - int2_u_grad1u_x_j1b2(i,j,ipoint,2) 
        i_num  = integ(2)
        acc_ij = dabs(i_exc - i_num)
        if(acc_ij .gt. eps_ij) then
@ -406,7 +406,7 @@ subroutine test_int2_u_grad1u_total_j1b2()
        acc_tot += acc_ij
        normalz += dabs(i_num)
-        i_exc  = z * int2_u_grad1u_j1b2(i,j,ipoint) - int2_u_grad1u_x_j1b2(3,i,j,ipoint) 
+        i_exc  = z * int2_u_grad1u_j1b2(i,j,ipoint) - int2_u_grad1u_x_j1b2(i,j,ipoint,3) 
        i_num  = integ(3)
        acc_ij = dabs(i_exc - i_num)
        if(acc_ij .gt. eps_ij) then
--- a/src/non_h_ints_mu/grad_squared.irp.f
+++ b/src/non_h_ints_mu/grad_squared.irp.f
@ -70,9 +70,9 @@ BEGIN_PROVIDER [ double precision, gradu_squared_u_ij_mu, (ao_num, ao_num, n_poi
          gradu_squared_u_ij_mu(i,j,ipoint) = tmp1 * int2_grad1u2_grad2u2_j1b2(i,j,ipoint)            &
                                            + tmp2 * int2_u2_j1b2             (i,j,ipoint)            &
-                                            + tmp6 * tmp9 + tmp3 * int2_u_grad1u_x_j1b2(1,i,j,ipoint) &
+                                            + tmp6 * tmp9 + tmp3 * int2_u_grad1u_x_j1b2(i,j,ipoint,1) &
-                                            + tmp7 * tmp9 + tmp4 * int2_u_grad1u_x_j1b2(2,i,j,ipoint) &
+                                            + tmp7 * tmp9 + tmp4 * int2_u_grad1u_x_j1b2(i,j,ipoint,2) &
-                                            + tmp8 * tmp9 + tmp5 * int2_u_grad1u_x_j1b2(3,i,j,ipoint)
+                                            + tmp8 * tmp9 + tmp5 * int2_u_grad1u_x_j1b2(i,j,ipoint,3)
        enddo
      enddo
    enddo
@ -104,11 +104,11 @@ END_PROVIDER
 ! ---
-!BEGIN_PROVIDER [double precision, tc_grad_square_ao, (ao_num, ao_num, ao_num, ao_num)]
+!BEGIN_PROVIDER [double precision, tc_grad_square_ao_loop, (ao_num, ao_num, ao_num, ao_num)]
 !
 !  BEGIN_DOC
 !  !
-!  ! tc_grad_square_ao(k,i,l,j) = -1/2 <kl | |\grad_1 u(r1,r2)|^2 + |\grad_1 u(r1,r2)|^2 | ij>
+!  ! tc_grad_square_ao_loop(k,i,l,j) = -1/2 <kl | |\grad_1 u(r1,r2)|^2 + |\grad_1 u(r1,r2)|^2 | ij>
 !  !
 !  END_DOC
 !
@ -142,8 +142,8 @@ END_PROVIDER
 !    do l = 1, ao_num
 !      do i = 1, ao_num
 !        do k = 1, ao_num
-!          tc_grad_square_ao(k,i,l,j) = ac_mat(k,i,l,j) + ac_mat(l,j,k,i)
+!          tc_grad_square_ao_loop(k,i,l,j) = ac_mat(k,i,l,j) + ac_mat(l,j,k,i)
-!          !write(11,*) tc_grad_square_ao(k,i,l,j)
+!          !write(11,*) tc_grad_square_ao_loop(k,i,l,j)
 !        enddo
 !      enddo
 !    enddo
@ -155,19 +155,23 @@ END_PROVIDER
 ! ---
-BEGIN_PROVIDER [double precision, tc_grad_square_ao, (ao_num, ao_num, ao_num, ao_num)]
+BEGIN_PROVIDER [double precision, tc_grad_square_ao_loop, (ao_num, ao_num, ao_num, ao_num)]
  BEGIN_DOC
  !
-  ! tc_grad_square_ao(k,i,l,j) = -1/2 <kl | |\grad_1 u(r1,r2)|^2 + |\grad_1 u(r1,r2)|^2 | ij>
+  ! tc_grad_square_ao_loop(k,i,l,j) = 1/2 <kl | |\grad_1 u(r1,r2)|^2 + |\grad_2 u(r1,r2)|^2 | ij>
  !
  END_DOC
  implicit none
  integer                       :: ipoint, i, j, k, l
  double precision              :: weight1, ao_ik_r, ao_i_r
  double precision              :: time0, time1
  double precision, allocatable :: ac_mat(:,:,:,:), bc_mat(:,:,:,:)
  print*, ' providing tc_grad_square_ao_loop ...'
  call wall_time(time0)
  allocate(ac_mat(ao_num,ao_num,ao_num,ao_num))
  ac_mat = 0.d0
  allocate(bc_mat(ao_num,ao_num,ao_num,ao_num))
@ -177,10 +181,12 @@ BEGIN_PROVIDER [double precision, tc_grad_square_ao, (ao_num, ao_num, ao_num, ao
    weight1 = final_weight_at_r_vector(ipoint)
    do i = 1, ao_num
-      ao_i_r = weight1 * aos_in_r_array_transp(ipoint,i)
+      !ao_i_r = weight1 * aos_in_r_array_transp(ipoint,i)
      ao_i_r = weight1 * aos_in_r_array(i,ipoint)
      do k = 1, ao_num
-        ao_ik_r = ao_i_r * aos_in_r_array_transp(ipoint,k)
+        !ao_ik_r = ao_i_r * aos_in_r_array_transp(ipoint,k)
        ao_ik_r = ao_i_r * aos_in_r_array(k,ipoint)
        do j = 1, ao_num
          do l = 1, ao_num
@ -196,7 +202,7 @@ BEGIN_PROVIDER [double precision, tc_grad_square_ao, (ao_num, ao_num, ao_num, ao
    do l = 1, ao_num
      do i = 1, ao_num
        do k = 1, ao_num
-          tc_grad_square_ao(k,i,l,j) = ac_mat(k,i,l,j) + ac_mat(l,j,k,i) + bc_mat(k,i,l,j)
+          tc_grad_square_ao_loop(k,i,l,j) = ac_mat(k,i,l,j) + ac_mat(l,j,k,i) + bc_mat(k,i,l,j)
        enddo
      enddo
    enddo
@ -205,6 +211,9 @@ BEGIN_PROVIDER [double precision, tc_grad_square_ao, (ao_num, ao_num, ao_num, ao
  deallocate(ac_mat)
  deallocate(bc_mat)
  call wall_time(time1)
  print*, ' Wall time for tc_grad_square_ao_loop = ', time1 - time0
 END_PROVIDER 
 ! ---
@ -329,9 +338,9 @@ BEGIN_PROVIDER [ double precision, u12_grad1_u12_j1b_grad1_j1b, (ao_num, ao_num,
        tmp9 = int2_u_grad1u_j1b2(i,j,ipoint)
-        u12_grad1_u12_j1b_grad1_j1b(i,j,ipoint) = tmp6 * tmp9 + tmp3 * int2_u_grad1u_x_j1b2(1,i,j,ipoint) &
+        u12_grad1_u12_j1b_grad1_j1b(i,j,ipoint) = tmp6 * tmp9 + tmp3 * int2_u_grad1u_x_j1b2(i,j,ipoint,1) &
-                                                + tmp7 * tmp9 + tmp4 * int2_u_grad1u_x_j1b2(2,i,j,ipoint) &
+                                                + tmp7 * tmp9 + tmp4 * int2_u_grad1u_x_j1b2(i,j,ipoint,2) &
-                                                + tmp8 * tmp9 + tmp5 * int2_u_grad1u_x_j1b2(3,i,j,ipoint)
+                                                + tmp8 * tmp9 + tmp5 * int2_u_grad1u_x_j1b2(i,j,ipoint,3)
      enddo
    enddo
  enddo
@ -343,3 +352,86 @@ END_PROVIDER
 ! ---
 BEGIN_PROVIDER [double precision, tc_grad_square_ao, (ao_num, ao_num, ao_num, ao_num)]
  BEGIN_DOC
  !
  ! tc_grad_square_ao(k,i,l,j) = 1/2 <kl | |\grad_1 u(r1,r2)|^2 + |\grad_2 u(r1,r2)|^2 | ij>
  !
  ! Built as one matrix product over the grid points (dgemm), then symmetrized
  ! under the exchange (k,i) <-> (l,j).
  !
  END_DOC
  implicit none
  integer                       :: ipoint, i, j, k, l
  double precision              :: time0, time1
  double precision, allocatable :: ac_mat(:,:,:,:), b_mat(:,:,:), tmp(:,:,:)
  print*, ' providing tc_grad_square_ao ...'
  call wall_time(time0)
  allocate(ac_mat(ao_num,ao_num,ao_num,ao_num), b_mat(n_points_final_grid,ao_num,ao_num), tmp(ao_num,ao_num,n_points_final_grid))
  ! b_mat(ipoint,k,i) = weight(ipoint) * phi_i(ipoint) * phi_k(ipoint).
  ! The loop nest writes every element of b_mat, so no prior zero-fill is needed.
 !$OMP PARALLEL               &
 !$OMP DEFAULT (NONE)         &
 !$OMP PRIVATE (i, k, ipoint) &
 !$OMP SHARED (aos_in_r_array_transp, b_mat, ao_num, n_points_final_grid, final_weight_at_r_vector)
 !$OMP DO SCHEDULE (static)
  do i = 1, ao_num
    do k = 1, ao_num
      do ipoint = 1, n_points_final_grid
        b_mat(ipoint,k,i) = final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,k)
      enddo
    enddo
  enddo
 !$OMP END DO
 !$OMP END PARALLEL
  ! tmp(l,j,ipoint) gathers the three grid-resolved contributions; grad12_j12
  ! enters with a factor 1/2. Every element of tmp is written by the loop nest.
 !$OMP PARALLEL               &
 !$OMP DEFAULT (NONE)         &
 !$OMP PRIVATE (j, l, ipoint) &
 !$OMP SHARED (tmp, ao_num, n_points_final_grid, u12sq_j1bsq, u12_grad1_u12_j1b_grad1_j1b, grad12_j12)
 !$OMP DO SCHEDULE (static)
  do ipoint = 1, n_points_final_grid
    do j = 1, ao_num
      do l = 1, ao_num
        tmp(l,j,ipoint) = u12sq_j1bsq(l,j,ipoint) + u12_grad1_u12_j1b_grad1_j1b(l,j,ipoint) + 0.5d0 * grad12_j12(l,j,ipoint)
      enddo
    enddo
  enddo
 !$OMP END DO
 !$OMP END PARALLEL
  ! ac_mat = tmp * b_mat, contracting over ipoint: the row index runs over the
  ! first two indices of tmp, the column index over the last two of b_mat.
  ! beta = 0.d0 makes dgemm overwrite ac_mat, so it is not zeroed beforehand.
  call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 &
            , tmp(1,1,1), ao_num*ao_num, b_mat(1,1,1), n_points_final_grid      &
            , 0.d0, ac_mat, ao_num*ao_num)
  deallocate(tmp, b_mat)
  ! Symmetrize: the sum of ac_mat and its pair-transposed counterpart does not
  ! depend on which index pair dgemm stored first.
 !$OMP PARALLEL             &
 !$OMP DEFAULT (NONE)       &
 !$OMP PRIVATE (i, j, k, l) &
 !$OMP SHARED (ac_mat, tc_grad_square_ao, ao_num)
 !$OMP DO SCHEDULE (static)
  do j = 1, ao_num
    do l = 1, ao_num
      do i = 1, ao_num
        do k = 1, ao_num
          tc_grad_square_ao(k,i,l,j) = ac_mat(k,i,l,j) + ac_mat(l,j,k,i)
        enddo
      enddo
    enddo
  enddo
 !$OMP END DO
 !$OMP END PARALLEL
  deallocate(ac_mat)
  call wall_time(time1)
  print*, ' Wall time for tc_grad_square_ao = ', time1 - time0
 END_PROVIDER 
 ! ---
--- a/src/non_h_ints_mu/grad_squared_manu.irp.f
+++ b/src/non_h_ints_mu/grad_squared_manu.irp.f
@ -10,51 +10,75 @@ BEGIN_PROVIDER [double precision, tc_grad_square_ao_test, (ao_num, ao_num, ao_nu
  implicit none
  integer                       :: ipoint, i, j, k, l
  double precision              :: weight1, ao_ik_r, ao_i_r,contrib,contrib2
-  double precision, allocatable :: ac_mat(:,:,:,:), bc_mat(:,:,:,:)
+  double precision              :: time0, time1
-  double precision :: wall1, wall0
+  double precision, allocatable :: ac_mat(:,:,:,:), b_mat(:,:,:), tmp(:,:,:)
  print*, ' providing tc_grad_square_ao_test ...'
  call wall_time(time0)
  provide u12sq_j1bsq_test u12_grad1_u12_j1b_grad1_j1b_test grad12_j12_test
  call wall_time(wall0)
-  allocate(ac_mat(ao_num,ao_num,ao_num,ao_num))
+  allocate(ac_mat(ao_num,ao_num,ao_num,ao_num), b_mat(n_points_final_grid,ao_num,ao_num), tmp(ao_num,ao_num,n_points_final_grid))
  ac_mat = 0.d0
  allocate(bc_mat(ao_num,ao_num,ao_num,ao_num))
  bc_mat = 0.d0
  b_mat = 0.d0
 !$OMP PARALLEL               &
 !$OMP DEFAULT (NONE)         &
 !$OMP PRIVATE (i, k, ipoint) & 
 !$OMP SHARED (aos_in_r_array_transp, b_mat, ao_num, n_points_final_grid, final_weight_at_r_vector)
 !$OMP DO SCHEDULE (static)
  do i = 1, ao_num
    do k = 1, ao_num
      do ipoint = 1, n_points_final_grid
-    weight1 = final_weight_at_r_vector(ipoint)
+        b_mat(ipoint,k,i) = final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,k)
      enddo
    enddo
  enddo
 !$OMP END DO
 !$OMP END PARALLEL
  tmp = 0.d0
 !$OMP PARALLEL               &
 !$OMP DEFAULT (NONE)         &
 !$OMP PRIVATE (j, l, ipoint) & 
 !$OMP SHARED (tmp, ao_num, n_points_final_grid, u12sq_j1bsq_test, u12_grad1_u12_j1b_grad1_j1b_test, grad12_j12_test)
 !$OMP DO SCHEDULE (static)
  do ipoint = 1, n_points_final_grid
    do j = 1, ao_num
      do l = 1, ao_num
-        contrib =  u12sq_j1bsq_test(l,j,ipoint) + u12_grad1_u12_j1b_grad1_j1b_test(l,j,ipoint) 
+        tmp(l,j,ipoint) = u12sq_j1bsq_test(l,j,ipoint) + u12_grad1_u12_j1b_grad1_j1b_test(l,j,ipoint) + 0.5d0 * grad12_j12_test(l,j,ipoint)
-        contrib2=grad12_j12_test(l,j,ipoint)
+      enddo
-        do i = 1, ao_num
+    enddo
-          ao_i_r = weight1 * aos_in_r_array(i,ipoint)
+  enddo
 !$OMP END DO
 !$OMP END PARALLEL
-          do k = 1, ao_num
+  ac_mat = 0.d0
-            ao_ik_r = ao_i_r * aos_in_r_array(k,ipoint)
+  call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 &
-
+            , tmp(1,1,1), ao_num*ao_num, b_mat(1,1,1), n_points_final_grid      &
-            ac_mat(k,i,l,j) += ao_ik_r * contrib
+            , 1.d0, ac_mat, ao_num*ao_num)
-            bc_mat(k,i,l,j) += ao_ik_r * contrib2
+  deallocate(tmp, b_mat)
          enddo
        enddo
      enddo
    enddo
  enddo
 !$OMP PARALLEL             &
 !$OMP DEFAULT (NONE)       &
 !$OMP PRIVATE (i, j, k, l) & 
 !$OMP SHARED (ac_mat, tc_grad_square_ao_test, ao_num)
 !$OMP DO SCHEDULE (static)
  do j = 1, ao_num
    do l = 1, ao_num
      do i = 1, ao_num
        do k = 1, ao_num
-          tc_grad_square_ao_test(k,i,l,j) = ac_mat(k,i,l,j) + ac_mat(l,j,k,i) + bc_mat(k,i,l,j)
+          tc_grad_square_ao_test(k,i,l,j) = ac_mat(k,i,l,j) + ac_mat(l,j,k,i)
        enddo
      enddo
    enddo
  enddo
-  call wall_time(wall1)
+ !$OMP END DO
-  print*,'wall time for tc_grad_square_ao_test',wall1 - wall0
+ !$OMP END PARALLEL
  deallocate(ac_mat)
-  deallocate(bc_mat)
+
  call wall_time(time1)
  print*, ' Wall time for tc_grad_square_ao_test = ', time1 - time0
 END_PROVIDER 
@ -88,6 +112,7 @@ BEGIN_PROVIDER [ double precision, u12sq_j1bsq_test, (ao_num, ao_num, n_points_f
 END_PROVIDER 
 ! ---
 BEGIN_PROVIDER [ double precision, u12_grad1_u12_j1b_grad1_j1b_test, (ao_num, ao_num, n_points_final_grid) ]
@ -99,8 +124,9 @@ BEGIN_PROVIDER [ double precision, u12_grad1_u12_j1b_grad1_j1b_test, (ao_num, ao
  double precision           :: time0, time1
  double precision, external :: overlap_gauss_r12_ao
  provide int2_u_grad1u_x_j1b2_test
  print*, ' providing u12_grad1_u12_j1b_grad1_j1b_test ...'
  provide int2_u_grad1u_x_j1b2_test
  call wall_time(time0)
  do ipoint = 1, n_points_final_grid
@ -126,9 +152,9 @@ BEGIN_PROVIDER [ double precision, u12_grad1_u12_j1b_grad1_j1b_test, (ao_num, ao
        tmp9 = int2_u_grad1u_j1b2_test(i,j,ipoint)
-        u12_grad1_u12_j1b_grad1_j1b_test(i,j,ipoint) = tmp6 * tmp9 + tmp3 * int2_u_grad1u_x_j1b2_test(1,i,j,ipoint) &
+        u12_grad1_u12_j1b_grad1_j1b_test(i,j,ipoint) = tmp6 * tmp9 + tmp3 * int2_u_grad1u_x_j1b2_test(i,j,ipoint,1) &
-                                                     + tmp7 * tmp9 + tmp4 * int2_u_grad1u_x_j1b2_test(2,i,j,ipoint) &
+                                                     + tmp7 * tmp9 + tmp4 * int2_u_grad1u_x_j1b2_test(i,j,ipoint,2) &
-                                                     + tmp8 * tmp9 + tmp5 * int2_u_grad1u_x_j1b2_test(3,i,j,ipoint)
+                                                     + tmp8 * tmp9 + tmp5 * int2_u_grad1u_x_j1b2_test(i,j,ipoint,3)
      enddo
    enddo
  enddo
@ -192,3 +218,4 @@ BEGIN_PROVIDER [ double precision, grad12_j12_test, (ao_num, ao_num, n_points_fi
 END_PROVIDER 
 ! ---
--- a/src/non_h_ints_mu/new_grad_tc.irp.f
+++ b/src/non_h_ints_mu/new_grad_tc.irp.f
@ -1,22 +1,21 @@
 ! ---
-BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao, (3, ao_num, ao_num, n_points_final_grid)]
+BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao, (ao_num, ao_num, n_points_final_grid, 3)]
  BEGIN_DOC
  !
-  ! int2_grad1_u12_ao(:,i,j,ipoint) = \int dr2 [-1 * \grad_r1 J(r1,r2)] \phi_i(r2) \phi_j(r2) 
+  ! int2_grad1_u12_ao(i,j,ipoint,:) = \int dr2 [-1 * \grad_r1 J(r1,r2)] \phi_i(r2) \phi_j(r2) 
  !
  ! where r1 = r(ipoint)
  !
  ! if J(r1,r2) = u12:
  !
-  ! int2_grad1_u12_ao(:,i,j,ipoint) = 0.5 x \int dr2 [(r1 - r2) (erf(mu * r12)-1)r_12] \phi_i(r2) \phi_j(r2)
+  ! int2_grad1_u12_ao(i,j,ipoint,:) = 0.5 x \int dr2 [(r1 - r2) (erf(mu * r12)-1)r_12] \phi_i(r2) \phi_j(r2)
  !                                 = 0.5 * [ v_ij_erf_rk_cst_mu(i,j,ipoint) * r(:) - x_v_ij_erf_rk_cst_mu(i,j,ipoint,:) ]
  !
  ! if J(r1,r2) = u12 x v1 x v2
  !
-  ! int2_grad1_u12_ao(:,i,j,ipoint) =      v1    x [ 0.5 x \int dr2 [(r1 - r2) (erf(mu * r12)-1)r_12] v2 \phi_i(r2) \phi_j(r2) ]
+  ! int2_grad1_u12_ao(i,j,ipoint,:) =      v1    x [ 0.5 x \int dr2 [(r1 - r2) (erf(mu * r12)-1)r_12] v2 \phi_i(r2) \phi_j(r2) ]
  !                                 - \grad_1 v1 x [       \int dr2                  u12              v2 \phi_i(r2) \phi_j(r2) ] 
  !                                 =    0.5 v_1b(ipoint) * v_ij_erf_rk_cst_mu_j1b(i,j,ipoint) * r(:) 
  !                                 -    0.5 v_1b(ipoint) * x_v_ij_erf_rk_cst_mu_j1b(i,j,ipoint,:) 
@ -27,8 +26,12 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao, (3, ao_num, ao_num, n_poin
  implicit none
  integer          :: ipoint, i, j
  double precision :: time0, time1
  double precision :: x, y, z, tmp_x, tmp_y, tmp_z, tmp0, tmp1, tmp2
  print*, ' providing int2_grad1_u12_ao ...'
  call wall_time(time0)
  PROVIDE j1b_type
  if(j1b_type .eq. 3) then
@ -49,9 +52,9 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao, (3, ao_num, ao_num, n_poin
          tmp1 = tmp0 * v_ij_erf_rk_cst_mu_j1b(i,j,ipoint)
          tmp2 = v_ij_u_cst_mu_j1b(i,j,ipoint)
-          int2_grad1_u12_ao(1,i,j,ipoint) = tmp1 * x - tmp0 * x_v_ij_erf_rk_cst_mu_tmp_j1b(1,i,j,ipoint) - tmp2 * tmp_x
+          int2_grad1_u12_ao(i,j,ipoint,1) = tmp1 * x - tmp0 * x_v_ij_erf_rk_cst_mu_j1b(i,j,ipoint,1) - tmp2 * tmp_x
-          int2_grad1_u12_ao(2,i,j,ipoint) = tmp1 * y - tmp0 * x_v_ij_erf_rk_cst_mu_tmp_j1b(2,i,j,ipoint) - tmp2 * tmp_y
+          int2_grad1_u12_ao(i,j,ipoint,2) = tmp1 * y - tmp0 * x_v_ij_erf_rk_cst_mu_j1b(i,j,ipoint,2) - tmp2 * tmp_y
-          int2_grad1_u12_ao(3,i,j,ipoint) = tmp1 * z - tmp0 * x_v_ij_erf_rk_cst_mu_tmp_j1b(3,i,j,ipoint) - tmp2 * tmp_z
+          int2_grad1_u12_ao(i,j,ipoint,3) = tmp1 * z - tmp0 * x_v_ij_erf_rk_cst_mu_j1b(i,j,ipoint,3) - tmp2 * tmp_z
        enddo
      enddo
    enddo
@ -67,9 +70,9 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao, (3, ao_num, ao_num, n_poin
        do i = 1, ao_num
          tmp1 = v_ij_erf_rk_cst_mu(i,j,ipoint)
-          int2_grad1_u12_ao(1,i,j,ipoint) = tmp1 * x - x_v_ij_erf_rk_cst_mu_tmp(1,i,j,ipoint)
+          int2_grad1_u12_ao(i,j,ipoint,1) = tmp1 * x - x_v_ij_erf_rk_cst_mu_transp_bis(ipoint,i,j,1)
-          int2_grad1_u12_ao(2,i,j,ipoint) = tmp1 * y - x_v_ij_erf_rk_cst_mu_tmp(2,i,j,ipoint)
+          int2_grad1_u12_ao(i,j,ipoint,2) = tmp1 * y - x_v_ij_erf_rk_cst_mu_transp_bis(ipoint,i,j,2)
-          int2_grad1_u12_ao(3,i,j,ipoint) = tmp1 * z - x_v_ij_erf_rk_cst_mu_tmp(3,i,j,ipoint)
+          int2_grad1_u12_ao(i,j,ipoint,3) = tmp1 * z - x_v_ij_erf_rk_cst_mu_transp_bis(ipoint,i,j,3)
        enddo
      enddo
    enddo
@ -78,6 +81,9 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao, (3, ao_num, ao_num, n_poin
  endif
  call wall_time(time1)
  print*, ' Wall time for int2_grad1_u12_ao = ', time1 - time0
 END_PROVIDER 
 ! ---
@ -94,7 +100,7 @@ BEGIN_PROVIDER [ double precision, int1_grad2_u12_ao, (3, ao_num, ao_num, n_poin
  !
  ! int1_grad2_u12_ao(:,i,j,ipoint) = +0.5 x \int dr1 [-(r1 - r2) (erf(mu * r12)-1)r_12] \phi_i(r1) \phi_j(r1)
  !                                 = -0.5 * [ v_ij_erf_rk_cst_mu(i,j,ipoint) * r(:) - x_v_ij_erf_rk_cst_mu(i,j,ipoint,:) ]
-  !                                 = -int2_grad1_u12_ao(:,i,j,ipoint)
+  !                                 = -int2_grad1_u12_ao(i,j,ipoint,:)
  !
  ! if J(r1,r2) = u12 x v1 x v2
  !
@ -131,9 +137,9 @@ BEGIN_PROVIDER [ double precision, int1_grad2_u12_ao, (3, ao_num, ao_num, n_poin
          tmp1 = tmp0 * v_ij_erf_rk_cst_mu_j1b(i,j,ipoint)
          tmp2 = v_ij_u_cst_mu_j1b(i,j,ipoint)
-          int1_grad2_u12_ao(1,i,j,ipoint) = -tmp1 * x + tmp0 * x_v_ij_erf_rk_cst_mu_tmp_j1b(1,i,j,ipoint) - tmp2 * tmp_x
+          int1_grad2_u12_ao(1,i,j,ipoint) = -tmp1 * x + tmp0 * x_v_ij_erf_rk_cst_mu_j1b(i,j,ipoint,1) - tmp2 * tmp_x
-          int1_grad2_u12_ao(2,i,j,ipoint) = -tmp1 * y + tmp0 * x_v_ij_erf_rk_cst_mu_tmp_j1b(2,i,j,ipoint) - tmp2 * tmp_y
+          int1_grad2_u12_ao(2,i,j,ipoint) = -tmp1 * y + tmp0 * x_v_ij_erf_rk_cst_mu_j1b(i,j,ipoint,2) - tmp2 * tmp_y
-          int1_grad2_u12_ao(3,i,j,ipoint) = -tmp1 * z + tmp0 * x_v_ij_erf_rk_cst_mu_tmp_j1b(3,i,j,ipoint) - tmp2 * tmp_z
+          int1_grad2_u12_ao(3,i,j,ipoint) = -tmp1 * z + tmp0 * x_v_ij_erf_rk_cst_mu_j1b(i,j,ipoint,3) - tmp2 * tmp_z
        enddo
      enddo
    enddo
@ -148,11 +154,11 @@ END_PROVIDER
 ! ---
-BEGIN_PROVIDER [double precision, tc_grad_and_lapl_ao, (ao_num, ao_num, ao_num, ao_num)]
+BEGIN_PROVIDER [double precision, tc_grad_and_lapl_ao_loop, (ao_num, ao_num, ao_num, ao_num)]
  BEGIN_DOC
  !
-  ! tc_grad_and_lapl_ao(k,i,l,j) = < k l | -1/2 \Delta_1 u(r1,r2) - \grad_1 u(r1,r2) . \grad_1 | ij >
+  ! tc_grad_and_lapl_ao_loop(k,i,l,j) = < k l | -1/2 \Delta_1 u(r1,r2) - \grad_1 u(r1,r2) . \grad_1 | ij >
  !
  ! = 1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2) 
  !
@ -165,8 +171,12 @@ BEGIN_PROVIDER [double precision, tc_grad_and_lapl_ao, (ao_num, ao_num, ao_num,
  double precision              :: weight1, contrib_x, contrib_y, contrib_z, tmp_x, tmp_y, tmp_z
  double precision              :: ao_k_r, ao_i_r, ao_i_dx, ao_i_dy, ao_i_dz
  double precision              :: ao_j_r, ao_l_r, ao_l_dx, ao_l_dy, ao_l_dz
  double precision              :: time0, time1
  double precision, allocatable :: ac_mat(:,:,:,:)
  print*, ' providing tc_grad_and_lapl_ao_loop ...'
  call wall_time(time0)
  allocate(ac_mat(ao_num,ao_num,ao_num,ao_num))
  ac_mat = 0.d0
@ -176,24 +186,32 @@ BEGIN_PROVIDER [double precision, tc_grad_and_lapl_ao, (ao_num, ao_num, ao_num,
    weight1 = 0.5d0 * final_weight_at_r_vector(ipoint)
    do i = 1, ao_num
-      ao_i_r  = weight1 * aos_in_r_array_transp         (ipoint,i)
+      !ao_i_r  = weight1 * aos_in_r_array_transp         (ipoint,i)
-      ao_i_dx = weight1 * aos_grad_in_r_array_transp_bis(ipoint,i,1)
+      !ao_i_dx = weight1 * aos_grad_in_r_array_transp_bis(ipoint,i,1)
-      ao_i_dy = weight1 * aos_grad_in_r_array_transp_bis(ipoint,i,2)
+      !ao_i_dy = weight1 * aos_grad_in_r_array_transp_bis(ipoint,i,2)
-      ao_i_dz = weight1 * aos_grad_in_r_array_transp_bis(ipoint,i,3)
+      !ao_i_dz = weight1 * aos_grad_in_r_array_transp_bis(ipoint,i,3)
      ao_i_r  = weight1 * aos_in_r_array     (i,ipoint)
      ao_i_dx = weight1 * aos_grad_in_r_array(i,ipoint,1)
      ao_i_dy = weight1 * aos_grad_in_r_array(i,ipoint,2)
      ao_i_dz = weight1 * aos_grad_in_r_array(i,ipoint,3)
      do k = 1, ao_num
-        ao_k_r = aos_in_r_array_transp(ipoint,k)
+        !ao_k_r = aos_in_r_array_transp(ipoint,k)
        ao_k_r = aos_in_r_array(k,ipoint)
-        tmp_x = ao_k_r * ao_i_dx - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,1) 
+        !tmp_x = ao_k_r * ao_i_dx - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,1) 
-        tmp_y = ao_k_r * ao_i_dy - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,2) 
+        !tmp_y = ao_k_r * ao_i_dy - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,2) 
-        tmp_z = ao_k_r * ao_i_dz - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,3) 
+        !tmp_z = ao_k_r * ao_i_dz - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,3) 
        tmp_x = ao_k_r * ao_i_dx - ao_i_r * aos_grad_in_r_array(k,ipoint,1) 
        tmp_y = ao_k_r * ao_i_dy - ao_i_r * aos_grad_in_r_array(k,ipoint,2) 
        tmp_z = ao_k_r * ao_i_dz - ao_i_r * aos_grad_in_r_array(k,ipoint,3) 
        do j = 1, ao_num
          do l = 1, ao_num
-            contrib_x = int2_grad1_u12_ao(1,l,j,ipoint) * tmp_x 
+            contrib_x = int2_grad1_u12_ao(l,j,ipoint,1) * tmp_x 
-            contrib_y = int2_grad1_u12_ao(2,l,j,ipoint) * tmp_y 
+            contrib_y = int2_grad1_u12_ao(l,j,ipoint,2) * tmp_y 
-            contrib_z = int2_grad1_u12_ao(3,l,j,ipoint) * tmp_z 
+            contrib_z = int2_grad1_u12_ao(l,j,ipoint,3) * tmp_z 
            ac_mat(k,i,l,j) += contrib_x + contrib_y + contrib_z
          enddo
@ -223,9 +241,9 @@ BEGIN_PROVIDER [double precision, tc_grad_and_lapl_ao, (ao_num, ao_num, ao_num,
  !      do i = 1, ao_num
  !        do k = 1, ao_num
-  !          contrib_x = int2_grad1_u12_ao(1,k,i,ipoint) * tmp_x 
+  !          contrib_x = int2_grad1_u12_ao(k,i,ipoint,1) * tmp_x 
-  !          contrib_y = int2_grad1_u12_ao(2,k,i,ipoint) * tmp_y 
+  !          contrib_y = int2_grad1_u12_ao(k,i,ipoint,2) * tmp_y 
-  !          contrib_z = int2_grad1_u12_ao(3,k,i,ipoint) * tmp_z 
+  !          contrib_z = int2_grad1_u12_ao(k,i,ipoint,3) * tmp_z 
  !          ac_mat(k,i,l,j) += contrib_x + contrib_y + contrib_z
  !        enddo
@ -240,8 +258,8 @@ BEGIN_PROVIDER [double precision, tc_grad_and_lapl_ao, (ao_num, ao_num, ao_num,
    do l = 1, ao_num
      do i = 1, ao_num
        do k = 1, ao_num
-          tc_grad_and_lapl_ao(k,i,l,j) = ac_mat(k,i,l,j) + ac_mat(l,j,k,i)
+          tc_grad_and_lapl_ao_loop(k,i,l,j) = ac_mat(k,i,l,j) + ac_mat(l,j,k,i)
-          !tc_grad_and_lapl_ao(k,i,l,j) = ac_mat(k,i,l,j)
+          !tc_grad_and_lapl_ao_loop(k,i,l,j) = ac_mat(k,i,l,j)
        enddo
      enddo
    enddo
@ -249,6 +267,92 @@ BEGIN_PROVIDER [double precision, tc_grad_and_lapl_ao, (ao_num, ao_num, ao_num,
  deallocate(ac_mat)
  call wall_time(time1)
  print*, ' Wall time for tc_grad_and_lapl_ao_loop = ', time1 - time0
 END_PROVIDER 
 ! ---
 BEGIN_PROVIDER [double precision, tc_grad_and_lapl_ao, (ao_num, ao_num, ao_num, ao_num)]
  BEGIN_DOC
  !
  ! tc_grad_and_lapl_ao(k,i,l,j) = < k l | -1/2 \Delta_1 u(r1,r2) - \grad_1 u(r1,r2) . \grad_1 | ij >
  !
  ! = 1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2) 
  !
  ! This is obtained by integration by parts. 
  !
  END_DOC
  implicit none
  integer                       :: ipoint, i, j, k, l, m
  double precision              :: weight1, ao_k_r, ao_i_r
  double precision              :: time0, time1
  double precision, allocatable :: ac_mat(:,:,:,:), b_mat(:,:,:,:)
  print*, ' providing tc_grad_and_lapl_ao ...'
  call wall_time(time0)
  allocate(b_mat(n_points_final_grid,ao_num,ao_num,3), ac_mat(ao_num,ao_num,ao_num,ao_num))
  ! b_mat(ipoint,k,i,m) = 1/2 * weight(ipoint) * (phi_k d_m phi_i - phi_i d_m phi_k)
  ! for the three components m = 1..3. The loop nest writes every element of
  ! b_mat, so no prior zero-fill is needed.
 !$OMP PARALLEL                                                              &
 !$OMP DEFAULT (NONE)                                                        &
 !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r)                       &
 !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, b_mat, &
 !$OMP         ao_num, n_points_final_grid, final_weight_at_r_vector)
 !$OMP DO SCHEDULE (static)
  do i = 1, ao_num
    do k = 1, ao_num
      do ipoint = 1, n_points_final_grid
        weight1 = 0.5d0 * final_weight_at_r_vector(ipoint)
        ao_i_r  = aos_in_r_array_transp(ipoint,i)
        ao_k_r  = aos_in_r_array_transp(ipoint,k)
        b_mat(ipoint,k,i,1) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,1) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,1)) 
        b_mat(ipoint,k,i,2) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,2) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,2)) 
        b_mat(ipoint,k,i,3) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,3) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,3)) 
      enddo
    enddo
  enddo
 !$OMP END DO
 !$OMP END PARALLEL
  ! Accumulate the three components into ac_mat (beta = 1.d0), contracting over
  ! ipoint. The row index runs over the first two indices of int2_grad1_u12_ao,
  ! the column index over the (k,i) pair of b_mat. ac_mat must start from zero
  ! because each dgemm call adds to it.
  ac_mat = 0.d0
  do m = 1, 3
    call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0              &
              , int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num, b_mat(1,1,1,m), n_points_final_grid &
              , 1.d0, ac_mat, ao_num*ao_num) 
  enddo
  deallocate(b_mat)
  ! Symmetrize: the sum of ac_mat and its pair-transposed counterpart does not
  ! depend on which index pair dgemm stored first.
 !$OMP PARALLEL             &
 !$OMP DEFAULT (NONE)       &
 !$OMP PRIVATE (i, j, k, l) &
 !$OMP SHARED (ac_mat, tc_grad_and_lapl_ao, ao_num)
 !$OMP DO SCHEDULE (static)
  do j = 1, ao_num
    do l = 1, ao_num
      do i = 1, ao_num
        do k = 1, ao_num
          tc_grad_and_lapl_ao(k,i,l,j) = ac_mat(k,i,l,j) + ac_mat(l,j,k,i)
        enddo
      enddo
    enddo
  enddo
 !$OMP END DO
 !$OMP END PARALLEL
  deallocate(ac_mat)
  call wall_time(time1)
  print*, ' Wall time for tc_grad_and_lapl_ao = ', time1 - time0
 END_PROVIDER 
 ! ---
--- a/src/non_h_ints_mu/new_grad_tc_manu.irp.f
+++ b/src/non_h_ints_mu/new_grad_tc_manu.irp.f
@ -1,20 +1,20 @@
-BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_test, (3, ao_num, ao_num, n_points_final_grid)]
+BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_test, (ao_num, ao_num, n_points_final_grid, 3)]
  BEGIN_DOC
  !
-  ! int2_grad1_u12_ao_test(:,i,j,ipoint) = \int dr2 [-1 * \grad_r1 J(r1,r2)] \phi_i(r2) \phi_j(r2) 
+  ! int2_grad1_u12_ao_test(i,j,ipoint,:) = \int dr2 [-1 * \grad_r1 J(r1,r2)] \phi_i(r2) \phi_j(r2) 
  !
  ! where r1 = r(ipoint)
  !
  ! if J(r1,r2) = u12:
  !
-  ! int2_grad1_u12_ao_test(:,i,j,ipoint) = 0.5 x \int dr2 [(r1 - r2) (erf(mu * r12)-1)r_12] \phi_i(r2) \phi_j(r2)
+  ! int2_grad1_u12_ao_test(i,j,ipoint,:) = 0.5 x \int dr2 [(r1 - r2) (erf(mu * r12)-1)r_12] \phi_i(r2) \phi_j(r2)
  !                                 = 0.5 * [ v_ij_erf_rk_cst_mu(i,j,ipoint) * r(:) - x_v_ij_erf_rk_cst_mu(i,j,ipoint,:) ]
  !
  ! if J(r1,r2) = u12 x v1 x v2
  !
-  ! int2_grad1_u12_ao_test(:,i,j,ipoint) =      v1    x [ 0.5 x \int dr2 [(r1 - r2) (erf(mu * r12)-1)r_12] v2 \phi_i(r2) \phi_j(r2) ]
+  ! int2_grad1_u12_ao_test(i,j,ipoint,:) =      v1    x [ 0.5 x \int dr2 [(r1 - r2) (erf(mu * r12)-1)r_12] v2 \phi_i(r2) \phi_j(r2) ]
  !                                 - \grad_1 v1 x [       \int dr2                  u12              v2 \phi_i(r2) \phi_j(r2) ] 
  !                                 =    0.5 v_1b(ipoint) * v_ij_erf_rk_cst_mu_j1b(i,j,ipoint) * r(:) 
  !                                 -    0.5 v_1b(ipoint) * x_v_ij_erf_rk_cst_mu_j1b(i,j,ipoint,:) 
@ -25,8 +25,12 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_test, (3, ao_num, ao_num, n
  implicit none
  integer          :: ipoint, i, j
  double precision :: time0, time1
  double precision :: x, y, z, tmp_x, tmp_y, tmp_z, tmp0, tmp1, tmp2
  print*, ' providing int2_grad1_u12_ao_test ...'
  call wall_time(time0)
  PROVIDE j1b_type
  if(j1b_type .eq. 3) then
@ -43,14 +47,13 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_test, (3, ao_num, ao_num, n
      do j = 1, ao_num
        do i = 1, ao_num
 !         if(dabs(ao_overlap_abs_grid(j,i)).lt.1.d-12)cycle
          tmp1 = tmp0 * v_ij_erf_rk_cst_mu_j1b_test(i,j,ipoint)
          tmp2 = v_ij_u_cst_mu_j1b_test(i,j,ipoint)
-          int2_grad1_u12_ao_test(1,i,j,ipoint) = tmp1 * x - tmp0 * x_v_ij_erf_rk_cst_mu_tmp_j1b_test(1,i,j,ipoint) - tmp2 * tmp_x
+          int2_grad1_u12_ao_test(i,j,ipoint,1) = tmp1 * x - tmp0 * x_v_ij_erf_rk_cst_mu_j1b_test(i,j,ipoint,1) - tmp2 * tmp_x
-          int2_grad1_u12_ao_test(2,i,j,ipoint) = tmp1 * y - tmp0 * x_v_ij_erf_rk_cst_mu_tmp_j1b_test(2,i,j,ipoint) - tmp2 * tmp_y
+          int2_grad1_u12_ao_test(i,j,ipoint,2) = tmp1 * y - tmp0 * x_v_ij_erf_rk_cst_mu_j1b_test(i,j,ipoint,2) - tmp2 * tmp_y
-          int2_grad1_u12_ao_test(3,i,j,ipoint) = tmp1 * z - tmp0 * x_v_ij_erf_rk_cst_mu_tmp_j1b_test(3,i,j,ipoint) - tmp2 * tmp_z
+          int2_grad1_u12_ao_test(i,j,ipoint,3) = tmp1 * z - tmp0 * x_v_ij_erf_rk_cst_mu_j1b_test(i,j,ipoint,3) - tmp2 * tmp_z
        enddo
      enddo
    enddo
@ -66,9 +69,9 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_test, (3, ao_num, ao_num, n
        do i = 1, ao_num
          tmp1 = v_ij_erf_rk_cst_mu(i,j,ipoint)
-          int2_grad1_u12_ao_test(1,i,j,ipoint) = tmp1 * x - x_v_ij_erf_rk_cst_mu_tmp(1,i,j,ipoint)
+          int2_grad1_u12_ao_test(i,j,ipoint,1) = tmp1 * x - x_v_ij_erf_rk_cst_mu_tmp(i,j,ipoint,1)
-          int2_grad1_u12_ao_test(2,i,j,ipoint) = tmp1 * y - x_v_ij_erf_rk_cst_mu_tmp(2,i,j,ipoint)
+          int2_grad1_u12_ao_test(i,j,ipoint,2) = tmp1 * y - x_v_ij_erf_rk_cst_mu_tmp(i,j,ipoint,2)
-          int2_grad1_u12_ao_test(3,i,j,ipoint) = tmp1 * z - x_v_ij_erf_rk_cst_mu_tmp(3,i,j,ipoint)
+          int2_grad1_u12_ao_test(i,j,ipoint,3) = tmp1 * z - x_v_ij_erf_rk_cst_mu_tmp(i,j,ipoint,3)
        enddo
      enddo
    enddo
@ -77,8 +80,13 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_test, (3, ao_num, ao_num, n
  endif
  call wall_time(time1)
  print*, ' Wall time for int2_grad1_u12_ao_test = ', time1 - time0
 END_PROVIDER 
 ! ---
 BEGIN_PROVIDER [double precision, tc_grad_and_lapl_ao_test, (ao_num, ao_num, ao_num, ao_num)]
  BEGIN_DOC
@ -92,48 +100,57 @@ BEGIN_PROVIDER [double precision, tc_grad_and_lapl_ao_test, (ao_num, ao_num, ao_
  END_DOC
  implicit none
-  integer                       :: ipoint, i, j, k, l
+  integer                       :: ipoint, i, j, k, l, m
  double precision              :: weight1, contrib_x, contrib_y, contrib_z, tmp_x, tmp_y, tmp_z
  double precision              :: ao_k_r, ao_i_r, ao_i_dx, ao_i_dy, ao_i_dz
-  double precision, allocatable :: ac_mat(:,:,:,:)
+  double precision              :: time0, time1
-  double precision :: wall0, wall1
+  double precision, allocatable :: ac_mat(:,:,:,:), b_mat(:,:,:,:)
  print*, ' providing tc_grad_and_lapl_ao_test ...'
  call wall_time(time0)
  provide int2_grad1_u12_ao_test 
  call wall_time(wall0)
  allocate(ac_mat(ao_num,ao_num,ao_num,ao_num))
  ac_mat = 0.d0
-  do ipoint = 1, n_points_final_grid
+  allocate(b_mat(n_points_final_grid,ao_num,ao_num,3), ac_mat(ao_num,ao_num,ao_num,ao_num))
-    weight1 = 0.5d0 * final_weight_at_r_vector(ipoint)
+
-    do j = 1, ao_num
+  b_mat = 0.d0
-      do l = 1, ao_num
+ !$OMP PARALLEL                                                              &
-       contrib_x = int2_grad1_u12_ao_test(1,l,j,ipoint)
+ !$OMP DEFAULT (NONE)                                                        &
-       contrib_y = int2_grad1_u12_ao_test(2,l,j,ipoint)
+ !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r)                       & 
-       contrib_z = int2_grad1_u12_ao_test(3,l,j,ipoint)
+ !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, b_mat, & 
 !$OMP         ao_num, n_points_final_grid, final_weight_at_r_vector)
 !$OMP DO SCHEDULE (static)
  do i = 1, ao_num
         ao_i_r  = weight1 * aos_in_r_array                (i,ipoint)
         ao_i_dx = weight1 * aos_grad_in_r_array_transp(1,i,ipoint)
         ao_i_dy = weight1 * aos_grad_in_r_array_transp(2,i,ipoint)
         ao_i_dz = weight1 * aos_grad_in_r_array_transp(3,i,ipoint)
    do k = 1, ao_num
-           ao_k_r = aos_in_r_array(k,ipoint)
+      do ipoint = 1, n_points_final_grid
-           tmp_x = ao_k_r * ao_i_dx - ao_i_r * aos_grad_in_r_array_transp(1,k,ipoint) 
+        weight1 = 0.5d0 * final_weight_at_r_vector(ipoint)
-           tmp_y = ao_k_r * ao_i_dy - ao_i_r * aos_grad_in_r_array_transp(2,k,ipoint) 
+        ao_i_r  = aos_in_r_array_transp(ipoint,i)
-           tmp_z = ao_k_r * ao_i_dz - ao_i_r * aos_grad_in_r_array_transp(3,k,ipoint) 
+        ao_k_r  = aos_in_r_array_transp(ipoint,k)
-           tmp_x *= contrib_x 
+        b_mat(ipoint,k,i,1) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,1) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,1))
-           tmp_y *= contrib_y 
+        b_mat(ipoint,k,i,2) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,2) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,2))
-           tmp_z *= contrib_z 
+        b_mat(ipoint,k,i,3) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,3) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,3))
      enddo
    enddo
  enddo
 !$OMP END DO
 !$OMP END PARALLEL
-            ac_mat(k,i,l,j) += tmp_x + tmp_y + tmp_z
+  ac_mat = 0.d0
-          enddo
+  do m = 1, 3
-        enddo
+    call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0                   &
-      enddo
+              , int2_grad1_u12_ao_test(1,1,1,m), ao_num*ao_num, b_mat(1,1,1,m), n_points_final_grid &
-    enddo
+              , 1.d0, ac_mat, ao_num*ao_num)
  enddo
  enddo
  deallocate(b_mat)
 !$OMP PARALLEL             &
 !$OMP DEFAULT (NONE)       &
 !$OMP PRIVATE (i, j, k, l) & 
 !$OMP SHARED (ac_mat, tc_grad_and_lapl_ao_test, ao_num)
 !$OMP DO SCHEDULE (static)
  do j = 1, ao_num
    do l = 1, ao_num
      do i = 1, ao_num
@ -143,11 +160,15 @@ BEGIN_PROVIDER [double precision, tc_grad_and_lapl_ao_test, (ao_num, ao_num, ao_
      enddo
    enddo
  enddo
 !$OMP END DO
 !$OMP END PARALLEL
  call wall_time(wall1)
  print*,'wall time for tc_grad_and_lapl_ao_test',wall1 - wall0
  deallocate(ac_mat)
  call wall_time(time1)
  print*, ' Wall time for tc_grad_and_lapl_ao_test = ', time1 - time0
 END_PROVIDER 
 ! ---
--- a/src/non_h_ints_mu/total_tc_int.irp.f
+++ b/src/non_h_ints_mu/total_tc_int.irp.f
@ -7,6 +7,7 @@ BEGIN_PROVIDER [double precision, ao_tc_int_chemist, (ao_num, ao_num, ao_num, ao
  integer          :: i, j, k, l
  double precision :: wall1, wall0
  print *, ' providing ao_tc_int_chemist ...'
  call wall_time(wall0)
  if(test_cycle_tc)then
@ -36,6 +37,7 @@ BEGIN_PROVIDER [double precision, ao_tc_int_chemist_test, (ao_num, ao_num, ao_nu
  integer          :: i, j, k, l
  double precision :: wall1, wall0
  print *, ' providing ao_tc_int_chemist_test ...'
  call wall_time(wall0)
   do j = 1, ao_num
@ -47,8 +49,10 @@ BEGIN_PROVIDER [double precision, ao_tc_int_chemist_test, (ao_num, ao_num, ao_nu
       enddo
     enddo
   enddo
  call wall_time(wall1)
  print *, ' wall time for ao_tc_int_chemist_test ', wall1 - wall0
 END_PROVIDER 
 ! ---
--- a/src/non_hermit_dav/biorthog.irp.f
+++ b/src/non_hermit_dav/biorthog.irp.f
@ -444,8 +444,8 @@ subroutine non_hrmt_bieig(n, A, thr_d, thr_nd, leigvec, reigvec, n_real_eigv, ei
    endif
    call check_biorthog(n, n_real_eigv, leigvec, reigvec, accu_d, accu_nd, S, thr_d, thr_nd, .true.)
-    !call impose_biorthog_qr(n, n_real_eigv, leigvec, reigvec)
+    !call impose_biorthog_qr(n, n_real_eigv, thr_d, thr_nd, leigvec, reigvec)
-    !call impose_biorthog_lu(n, n_real_eigv, leigvec, reigvec)
+    !call impose_biorthog_lu(n, n_real_eigv, thr_d, thr_nd, leigvec, reigvec)
    ! ---
@ -611,7 +611,7 @@ subroutine non_hrmt_bieig_random_diag(n, A, leigvec, reigvec, n_real_eigv, eigva
  enddo
  accu_nd = dsqrt(accu_nd)
-  if(accu_nd .lt. 1d-8) then
+  if(accu_nd .lt. thresh_biorthog_nondiag) then
    ! L x R is already bi-orthogonal
    print *, ' L & T bi-orthogonality: ok'
@ -623,7 +623,7 @@ subroutine non_hrmt_bieig_random_diag(n, A, leigvec, reigvec, n_real_eigv, eigva
    print *, ' L & T bi-orthogonality: not imposed yet'
    print *, ' accu_nd = ', accu_nd
-    call impose_biorthog_qr(n, n_real_eigv, leigvec, reigvec)
+    call impose_biorthog_qr(n, n_real_eigv, thresh_biorthog_diag, thresh_biorthog_nondiag, leigvec, reigvec)
    deallocate( S )
  endif
@ -633,7 +633,7 @@ subroutine non_hrmt_bieig_random_diag(n, A, leigvec, reigvec, n_real_eigv, eigva
  return
-end 
+end subroutine non_hrmt_bieig_random_diag
 ! ---
@ -961,7 +961,7 @@ subroutine non_hrmt_bieig_fullvect(n, A, leigvec, reigvec, n_real_eigv, eigval)
  enddo
  accu_nd = dsqrt(accu_nd)
-  if( accu_nd .lt. 1d-8 ) then
+  if(accu_nd .lt. thresh_biorthog_nondiag) then
    ! L x R is already bi-orthogonal
    !print *, ' L & T bi-orthogonality: ok'
@ -973,7 +973,7 @@ subroutine non_hrmt_bieig_fullvect(n, A, leigvec, reigvec, n_real_eigv, eigval)
    !print *, ' L & T bi-orthogonality: not imposed yet'
    !print *, ' accu_nd = ', accu_nd
-    call impose_biorthog_qr(n, n, leigvec, reigvec)
+    call impose_biorthog_qr(n, n, thresh_biorthog_diag, thresh_biorthog_nondiag, leigvec, reigvec)
    deallocate( S )
  endif
--- a/src/non_hermit_dav/new_routines.irp.f
+++ b/src/non_hermit_dav/new_routines.irp.f
@ -132,9 +132,9 @@ subroutine non_hrmt_diag_split_degen_bi_orthog(n, A, leigvec, reigvec, n_real_ei
 !!! ONCE ALL EIGENVALUES ARE REAL ::: CHECK BI-ORTHONORMALITY
  !                               check bi-orthogonality
-  call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, .false.)
+  call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
  print *, ' accu_nd bi-orthog = ', accu_nd
-  if( accu_nd .lt. 1d-10 ) then
+  if(accu_nd .lt. thresh_biorthog_nondiag) then
    print *, ' bi-orthogonality: ok'
  else
    print *, ' '
@ -149,14 +149,14 @@ subroutine non_hrmt_diag_split_degen_bi_orthog(n, A, leigvec, reigvec, n_real_ei
    deallocate(S_nh_inv_half)
    call impose_orthog_degen_eigvec(n, eigval, reigvec_tmp)
    call impose_orthog_degen_eigvec(n, eigval, leigvec_tmp)
-    call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, .false.)
+    call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
-    if( accu_nd .lt. 1d-10 ) then
+    if(accu_nd .lt. thresh_biorthog_nondiag) then
      print *, ' bi-orthogonality: ok'
    else 
     print*,'New vectors not bi-orthonormals at ',accu_nd
     call impose_biorthog_qr(n, n, leigvec_tmp, reigvec_tmp, S)
-     call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, .false.)
+     call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
-     if( accu_nd .lt. 1d-10 ) then
+     if(accu_nd .lt. thresh_biorthog_nondiag) then
       print *, ' bi-orthogonality: ok'
     else 
      print*,'New vectors not bi-orthonormals at ',accu_nd
@ -200,10 +200,10 @@ subroutine non_hrmt_diag_split_degen_bi_orthog(n, A, leigvec, reigvec, n_real_ei
  shift_current = max(1.d-10,shift_current)
  print*,'Thr for eigenvectors = ',shift_current
  call check_EIGVEC(n, n, A, eigval, leigvec, reigvec, shift_current, thr_norm, .false.)
-  call check_biorthog(n, n, leigvec, reigvec, accu_d, accu_nd, S, .false.)
+  call check_biorthog(n, n, leigvec, reigvec, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
  print *, ' accu_nd bi-orthog = ', accu_nd
-  if( accu_nd .lt. 1d-10 ) then
+  if(accu_nd .lt. thresh_biorthog_nondiag) then
    print *, ' bi-orthogonality: ok'
  else 
   print*,'Something went wrong in non_hrmt_diag_split_degen_bi_orthog'
@ -354,9 +354,9 @@ subroutine non_hrmt_diag_split_degen_s_inv_half(n, A, leigvec, reigvec, n_real_e
 !!! ONCE ALL EIGENVALUES ARE REAL ::: CHECK BI-ORTHONORMALITY
  !                               check bi-orthogonality
-  call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, .false.)
+  call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
  print *, ' accu_nd bi-orthog = ', accu_nd
-  if( accu_nd .lt. 1d-10 ) then
+  if(accu_nd .lt. thresh_biorthog_nondiag) then
    print *, ' bi-orthogonality: ok'
  else
    print *, ' '
@ -369,9 +369,9 @@ subroutine non_hrmt_diag_split_degen_s_inv_half(n, A, leigvec, reigvec, n_real_e
     ! bi-orthonormalization using orthogonalization of left, right and then QR between left and right
     call impose_orthog_degen_eigvec(n, eigval, reigvec_tmp) ! orthogonalization of reigvec
     call impose_orthog_degen_eigvec(n, eigval, leigvec_tmp) ! orthogonalization of leigvec
-     call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S) 
+     call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S,  thresh_biorthog_diag, thresh_biorthog_nondiag, .false.) 
-     if( accu_nd .lt. 1d-10 ) then
+     if(accu_nd .lt. thresh_biorthog_nondiag) then
       print *, ' bi-orthogonality: ok'
     else 
      print*,'New vectors not bi-orthonormals at ', accu_nd
@ -387,8 +387,8 @@ subroutine non_hrmt_diag_split_degen_s_inv_half(n, A, leigvec, reigvec, n_real_e
     print*,'S^{-1/2} exists !!'
     call bi_ortho_s_inv_half(n,leigvec_tmp,reigvec_tmp,S_nh_inv_half) ! use of S^{-1/2} bi-orthonormalization 
    endif
-    call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, .false.)
+    call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
-    if( accu_nd .lt. 1d-10 ) then
+    if(accu_nd .lt. thresh_biorthog_nondiag) then
      print *, ' bi-orthogonality: ok'
    else 
     print*,'New vectors not bi-orthonormals at ',accu_nd
@ -431,10 +431,10 @@ subroutine non_hrmt_diag_split_degen_s_inv_half(n, A, leigvec, reigvec, n_real_e
  shift_current = max(1.d-10,shift_current)
  print*,'Thr for eigenvectors = ',shift_current
  call check_EIGVEC(n, n, A, eigval, leigvec, reigvec, shift_current, thr_norm, .false.)
-  call check_biorthog(n, n, leigvec, reigvec, accu_d, accu_nd, S, .false.)
+  call check_biorthog(n, n, leigvec, reigvec, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
  print *, ' accu_nd bi-orthog = ', accu_nd
-  if( accu_nd .lt. 1d-10 ) then
+  if(accu_nd .lt. thresh_biorthog_nondiag) then
    print *, ' bi-orthogonality: ok'
  else 
   print*,'Something went wrong in non_hrmt_diag_split_degen_bi_orthog'
@ -472,6 +472,7 @@ subroutine non_hrmt_fock_mat(n, A, leigvec, reigvec, n_real_eigv, eigval)
  double precision :: accu,thr_cut
  double precision, allocatable :: S_nh_inv_half(:,:)
  logical :: complex_root
  double precision :: thr_norm=1d0
  thr_cut = 1.d-15
@ -580,9 +581,9 @@ subroutine non_hrmt_fock_mat(n, A, leigvec, reigvec, n_real_eigv, eigval)
 !!! ONCE ALL EIGENVALUES ARE REAL ::: CHECK BI-ORTHONORMALITY
  !                               check bi-orthogonality
-  call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S)
+  call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
  print *, ' accu_nd bi-orthog = ', accu_nd
-  if( accu_nd .lt. 1d-10 ) then
+  if(accu_nd .lt. thresh_biorthog_nondiag) then
    print *, ' bi-orthogonality: ok'
  else
    print *, ' '
@ -593,9 +594,9 @@ subroutine non_hrmt_fock_mat(n, A, leigvec, reigvec, n_real_eigv, eigval)
    print *, ' '
    ! bi-orthonormalization using orthogonalization of left, right and then QR between left and right
    call impose_unique_biorthog_degen_eigvec(n, eigval, mo_coef, leigvec_tmp, reigvec_tmp)
-    call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S)
+    call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
    print*,'accu_nd = ',accu_nd
-    if( accu_nd .lt. 1d-10 ) then
+    if(accu_nd .lt. thresh_biorthog_nondiag) then
      print *, ' bi-orthogonality: ok'
    else 
     print*,'New vectors not bi-orthonormals at ',accu_nd
@ -608,8 +609,8 @@ subroutine non_hrmt_fock_mat(n, A, leigvec, reigvec, n_real_eigv, eigval)
      call bi_ortho_s_inv_half(n,leigvec_tmp,reigvec_tmp,S_nh_inv_half) ! use of S^{-1/2} bi-orthonormalization 
     endif
    endif
-    call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S)
+    call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
-    if( accu_nd .lt. 1d-10 ) then
+    if(accu_nd .lt. thresh_biorthog_nondiag) then
      print *, ' bi-orthogonality: ok'
    else 
     print*,'New vectors not bi-orthonormals at ',accu_nd
@ -651,11 +652,11 @@ subroutine non_hrmt_fock_mat(n, A, leigvec, reigvec, n_real_eigv, eigval)
  print*,'Checking for final reigvec/leigvec'
  shift_current = max(1.d-10,shift_current)
  print*,'Thr for eigenvectors = ',shift_current
-  call check_EIGVEC(n, n, A, eigval, leigvec, reigvec,shift_current)
+  call check_EIGVEC(n, n, A, eigval, leigvec, reigvec, shift_current, thr_norm, .false.)
-  call check_biorthog(n, n, leigvec, reigvec, accu_d, accu_nd, S)
+  call check_biorthog(n, n, leigvec, reigvec, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
  print *, ' accu_nd bi-orthog = ', accu_nd
-  if( accu_nd .lt. 1d-10 ) then
+  if(accu_nd .lt. thresh_biorthog_nondiag) then
    print *, ' bi-orthogonality: ok'
  else 
   print*,'Something went wrong in non_hrmt_diag_split_degen_bi_orthog'
--- a/src/scf_utils/diagonalize_fock.irp.f
+++ b/src/scf_utils/diagonalize_fock.irp.f
@ -20,6 +20,12 @@ BEGIN_PROVIDER [ double precision, eigenvectors_Fock_matrix_mo, (ao_num,mo_num)
     enddo
   enddo
  !print *, ' Fock_matrix_MO :'
  !do i = 1, mo_num
  !  write(*, '(100(f15.7, 2x))') (Fock_matrix_MO(j,i), j = 1, mo_num)
  !enddo
   if(frozen_orb_scf)then
     integer                        :: iorb,jorb
     do i = 1, n_core_orb
@ -89,6 +95,10 @@ BEGIN_PROVIDER [ double precision, eigenvectors_Fock_matrix_mo, (ao_num,mo_num)
   call dsyevd( 'V', 'U', mo_num, F,                             &
       size(F,1), diag, work, lwork, iwork, liwork, info)
   deallocate(iwork)
  !print*, ' Fock eigval:'
  !do i = 1, mo_num
  !  print *, diag(i)
  !enddo
   if (info /= 0) then
--- a/src/scf_utils/diis.irp.f
+++ b/src/scf_utils/diis.irp.f
@ -248,3 +248,71 @@ END_PROVIDER
 ! ---
 BEGIN_PROVIDER [double precision, FPS_SPF_Matrix_AO_a, (AO_num, AO_num)]
  ! Alpha-spin matrix  F.P.S - S.P.F  in the AO basis, where
  !   F = Fock_Matrix_AO_alpha, P = SCF_density_matrix_ao_alpha, S = AO_Overlap.
  ! The four products are chained through a single work array, so their order matters.
  implicit none
  integer                        :: ld_f, ld_p, ld_s, ld_w, ld_out
  double precision, allocatable  :: work(:,:)
  allocate(work(AO_num, AO_num))
  ! Leading dimensions handed to BLAS
  ld_f   = size(Fock_Matrix_AO_alpha, 1)
  ld_p   = size(SCF_density_matrix_ao_alpha, 1)
  ld_s   = size(AO_Overlap, 1)
  ld_w   = size(work, 1)
  ld_out = size(FPS_SPF_Matrix_AO_a, 1)
  ! work = F.P
  call dgemm('N', 'N', AO_num, AO_num, AO_num, 1.d0, &
             Fock_Matrix_AO_alpha, ld_f,             &
             SCF_density_matrix_ao_alpha, ld_p,      &
             0.d0, work, ld_w)
  ! FPS_SPF_Matrix_AO_a = (F.P).S
  call dgemm('N', 'N', AO_num, AO_num, AO_num, 1.d0, &
             work, ld_w,                             &
             AO_Overlap, ld_s,                       &
             0.d0, FPS_SPF_Matrix_AO_a, ld_out)
  ! work = S.P
  call dgemm('N', 'N', AO_num, AO_num, AO_num, 1.d0, &
             AO_Overlap, ld_s,                       &
             SCF_density_matrix_ao_alpha, ld_p,      &
             0.d0, work, ld_w)
  ! FPS_SPF_Matrix_AO_a = F.P.S - (S.P).F   (alpha = -1, beta = +1 accumulates)
  call dgemm('N', 'N', AO_num, AO_num, AO_num, -1.d0, &
             work, ld_w,                              &
             Fock_Matrix_AO_alpha, ld_f,              &
             1.d0, FPS_SPF_Matrix_AO_a, ld_out)
 END_PROVIDER
 ! ---
 BEGIN_PROVIDER [double precision, FPS_SPF_Matrix_AO_b, (AO_num, AO_num)]
  ! Beta-spin matrix  F.P.S - S.P.F  in the AO basis, where
  !   F = Fock_Matrix_AO_beta, P = SCF_density_matrix_ao_beta, S = AO_Overlap.
  ! The four products are chained through a single work array, so their order matters.
  implicit none
  integer                        :: ld_f, ld_p, ld_s, ld_w, ld_out
  double precision, allocatable  :: work(:,:)
  allocate(work(AO_num, AO_num))
  ! Leading dimensions handed to BLAS
  ld_f   = size(Fock_Matrix_AO_beta, 1)
  ld_p   = size(SCF_density_matrix_ao_beta, 1)
  ld_s   = size(AO_Overlap, 1)
  ld_w   = size(work, 1)
  ld_out = size(FPS_SPF_Matrix_AO_b, 1)
  ! work = F.P
  call dgemm('N', 'N', AO_num, AO_num, AO_num, 1.d0, &
             Fock_Matrix_AO_beta, ld_f,              &
             SCF_density_matrix_ao_beta, ld_p,       &
             0.d0, work, ld_w)
  ! FPS_SPF_Matrix_AO_b = (F.P).S
  call dgemm('N', 'N', AO_num, AO_num, AO_num, 1.d0, &
             work, ld_w,                             &
             AO_Overlap, ld_s,                       &
             0.d0, FPS_SPF_Matrix_AO_b, ld_out)
  ! work = S.P
  call dgemm('N', 'N', AO_num, AO_num, AO_num, 1.d0, &
             AO_Overlap, ld_s,                       &
             SCF_density_matrix_ao_beta, ld_p,       &
             0.d0, work, ld_w)
  ! FPS_SPF_Matrix_AO_b = F.P.S - (S.P).F   (alpha = -1, beta = +1 accumulates)
  call dgemm('N', 'N', AO_num, AO_num, AO_num, -1.d0, &
             work, ld_w,                              &
             Fock_Matrix_AO_beta, ld_f,               &
             1.d0, FPS_SPF_Matrix_AO_b, ld_out)
 END_PROVIDER
 ! ---
 BEGIN_PROVIDER [double precision, FPS_SPF_Matrix_MO_a, (mo_num, mo_num)]
  ! Alpha-spin counterpart of FPS_SPF_Matrix_AO_a with dimensions (mo_num, mo_num).
  ! It is filled by ao_to_mo from the AO matrix and the leading dimensions of both arrays
  ! (presumably the AO -> MO basis transformation; confirm against ao_to_mo).
  implicit none
  call ao_to_mo(FPS_SPF_Matrix_AO_a, size(FPS_SPF_Matrix_AO_a, 1), FPS_SPF_Matrix_MO_a, size(FPS_SPF_Matrix_MO_a, 1))
 END_PROVIDER
 ! ---
 BEGIN_PROVIDER [double precision, FPS_SPF_Matrix_MO_b, (mo_num, mo_num)]
  ! Beta-spin counterpart of FPS_SPF_Matrix_AO_b with dimensions (mo_num, mo_num).
  ! It is filled by ao_to_mo from the AO matrix and the leading dimensions of both arrays
  ! (presumably the AO -> MO basis transformation; confirm against ao_to_mo).
  implicit none
  call ao_to_mo(FPS_SPF_Matrix_AO_b, size(FPS_SPF_Matrix_AO_b, 1), FPS_SPF_Matrix_MO_b, size(FPS_SPF_Matrix_MO_b, 1))
 END_PROVIDER
 ! ---
--- a/src/scf_utils/fock_matrix.irp.f
+++ b/src/scf_utils/fock_matrix.irp.f
@ -15,7 +15,7 @@
   ! 
   ! Rcc = Acc Fcc^a + Bcc Fcc^b
   ! Roo = Aoo Foo^a + Boo Foo^b
-   ! Rcc = Avv Fvv^a + Bvv Fvv^b
+   ! Rvv = Avv Fvv^a + Bvv Fvv^b
   ! Fcv = (F^a + F^b)/2
   ! 
   ! F^a: Fock matrix alpha (MO), F^b: Fock matrix beta (MO)
@ -267,3 +267,5 @@ BEGIN_PROVIDER [ double precision, SCF_energy ]
 END_PROVIDER
 ! ---
--- a/src/scf_utils/rh_scf_simple.irp.f
+++ b/src/scf_utils/rh_scf_simple.irp.f
@ -63,35 +63,34 @@ END_DOC
    energy_SCF = SCF_energy
    Delta_energy_SCF = energy_SCF - energy_SCF_previous
-    double precision :: level_shift_save
+    !double precision :: level_shift_save
-    level_shift_save = level_shift
+    !level_shift_save = level_shift
-    mo_coef_save(1:ao_num,1:mo_num) = mo_coef(1:ao_num,1:mo_num)
+    !mo_coef_save(1:ao_num,1:mo_num) = mo_coef(1:ao_num,1:mo_num)
-    do while (Delta_energy_SCF > 0.d0)
+    !do while (Delta_energy_SCF > 0.d0)
-      mo_coef(1:ao_num,1:mo_num) = mo_coef_save
+    !  mo_coef(1:ao_num,1:mo_num) = mo_coef_save
-      if (level_shift <= .1d0) then
+    !  if (level_shift <= .1d0) then
-        level_shift = 1.d0
+    !    level_shift = 1.d0
-      else
+    !  else
-        level_shift = level_shift * 3.0d0
+    !    level_shift = level_shift * 3.0d0
-      endif
+    !  endif
-      TOUCH mo_coef level_shift
+    !  TOUCH mo_coef level_shift
-      mo_coef(1:ao_num,1:mo_num) = eigenvectors_Fock_matrix_MO(1:ao_num,1:mo_num)
+    !  mo_coef(1:ao_num,1:mo_num) = eigenvectors_Fock_matrix_MO(1:ao_num,1:mo_num)
-      if(frozen_orb_scf)then
+    !  if(frozen_orb_scf)then
-        call reorder_core_orb
+    !    call reorder_core_orb
-        call initialize_mo_coef_begin_iteration
+    !    call initialize_mo_coef_begin_iteration
-      endif
+    !  endif
-      TOUCH mo_coef
+    !  TOUCH mo_coef
-      Delta_energy_SCF = SCF_energy - energy_SCF_previous
+    !  Delta_energy_SCF = SCF_energy - energy_SCF_previous
-      energy_SCF = SCF_energy
+    !  energy_SCF = SCF_energy
-      if (level_shift-level_shift_save > 40.d0) then
+    !  if (level_shift-level_shift_save > 40.d0) then
-        level_shift = level_shift_save * 4.d0
+    !    level_shift = level_shift_save * 4.d0
-        SOFT_TOUCH level_shift
+    !    SOFT_TOUCH level_shift
-        exit
+    !    exit
-      endif
+    !  endif
    !enddo
    !level_shift = level_shift * 0.5d0
    !SOFT_TOUCH level_shift
    enddo
    level_shift = level_shift * 0.5d0
    SOFT_TOUCH level_shift
    energy_SCF_previous = energy_SCF
 !   Print results at the end of each iteration
@ -100,7 +99,7 @@ END_DOC
      iteration_SCF, energy_SCF, Delta_energy_SCF, max_error_DIIS, level_shift, dim_DIIS
    if(Delta_energy_SCF < 0.d0) then
-      call save_mos
+      call save_mos()
    endif
    if(qp_stop()) exit
--- a/src/scf_utils/roothaan_hall_scf.irp.f
+++ b/src/scf_utils/roothaan_hall_scf.irp.f
@ -86,10 +86,9 @@ END_DOC
          iteration_SCF,dim_DIIS                                       &
          )
-      Fock_matrix_AO_alpha = Fock_matrix_AO*0.5d0
+      Fock_matrix_AO_alpha = Fock_matrix_AO!*0.5d0
-      Fock_matrix_AO_beta  = Fock_matrix_AO*0.5d0
+      Fock_matrix_AO_beta  = Fock_matrix_AO!*0.5d0
      TOUCH Fock_matrix_AO_alpha Fock_matrix_AO_beta
    endif
    MO_coef = eigenvectors_Fock_matrix_MO
@ -100,18 +99,14 @@ END_DOC
    TOUCH MO_coef
 !   Calculate error vectors
    max_error_DIIS = maxval(Abs(FPS_SPF_Matrix_MO))
 !   SCF energy
    energy_SCF = SCF_energy
    Delta_energy_SCF = energy_SCF - energy_SCF_previous
    if ( (SCF_algorithm == 'DIIS').and.(Delta_energy_SCF > 0.d0) ) then
      Fock_matrix_AO(1:ao_num,1:ao_num) = Fock_matrix_DIIS (1:ao_num,1:ao_num,index_dim_DIIS)
-      Fock_matrix_AO_alpha = Fock_matrix_AO*0.5d0
+      Fock_matrix_AO_alpha = Fock_matrix_AO!*0.5d0
-      Fock_matrix_AO_beta  = Fock_matrix_AO*0.5d0
+      Fock_matrix_AO_beta  = Fock_matrix_AO!*0.5d0
      TOUCH Fock_matrix_AO_alpha Fock_matrix_AO_beta
    endif
@ -147,6 +142,9 @@ END_DOC
    SOFT_TOUCH level_shift
    energy_SCF_previous = energy_SCF
 !   Calculate error vectors
    max_error_DIIS = maxval(Abs(FPS_SPF_Matrix_MO))
 !   Print results at the end of each iteration
    write(6,'(I4, 1X, F16.10, 1X, F16.10, 1X, F16.10, 1X, F16.10, 1X, I3)')  &
--- a/src/tc_bi_ortho/tc_som.irp.f
+++ b/src/tc_bi_ortho/tc_som.irp.f
@ -0,0 +1,70 @@
 ! ---
 program tc_som
  BEGIN_DOC
  ! Sets the integration-grid and wave-function flags, prints the Jastrow-related
  ! parameters (mu_erf, j1b_type, j1b_pen) and then calls main(), which prints
  !
  !   U_SOM = 0.5 * sum_{I /= HF} htot(HF,I) * htot(I,HF)
  !
  ! where htot is the total matrix element returned by htilde_mu_mat_bi_ortho and
  ! I runs over the N_det determinants in psi_det.
  !
  ! A TC-SCF calculation is expected to have been run beforehand (see the message
  ! printed at start-up).
  END_DOC
  implicit none
  print *, ' starting ...'
  print *, ' do not forget to do tc-scf first'
  ! Grid settings overridden for this program (presumably a Becke grid with
  ! 30 radial and 50 angular points -- confirm against the grid providers).
  my_grid_becke  = .True.
  my_n_pt_r_grid = 30
  my_n_pt_a_grid = 50
 !  my_n_pt_r_grid = 10 ! small grid for quick debug
 !  my_n_pt_a_grid = 26 ! small grid for quick debug
  ! Tell IRPF90 the three grid variables were modified so dependents are rebuilt.
  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
  PROVIDE mu_erf 
  print *, ' mu = ', mu_erf
  PROVIDE j1b_type
  print *, ' j1b_type = ', j1b_type
  print *, j1b_pen
  ! read_wf is forced to .true. before main() uses psi_det
  ! (presumably so the stored wave function is read -- confirm).
  read_wf = .true.
  touch read_wf
  call main()
 end
 ! ---
 subroutine main()
  ! Computes and prints
  !
  !   U_SOM = 0.5 * sum_{I /= HF} htot(HF,I) * htot(I,HF)
  !
  ! where htot is the total matrix element returned by htilde_mu_mat_bi_ortho
  ! and I runs over the N_det determinants stored in psi_det.
  !
  ! The HF determinant is the first determinant whose excitation degree with
  ! respect to HF_bitmask is zero. The program stops with an error message if
  ! no such determinant is present, instead of indexing psi_det with an
  ! undefined i_HF.
  implicit none
  integer          :: i, i_HF, degree
  double precision :: hmono_1, htwoe_1, hthree_1, htot_1
  double precision :: hmono_2, htwoe_2, hthree_2, htot_2
  double precision :: U_SOM
  PROVIDE N_int N_det
  ! 0 is never a valid determinant index: it marks "HF determinant not found"
  i_HF = 0
  do i = 1, N_det
    call get_excitation_degree(HF_bitmask, psi_det(1,1,i), degree, N_int)
    if(degree == 0) then
      i_HF = i
      exit
    endif
  enddo
  if(i_HF == 0) then
    print *, ' HF determinant not found in the wave function: cannot compute U_SOM'
    stop 1
  endif
  print *, ' HF determinants:', i_HF
  print *, '          N_det :', N_det
  U_SOM = 0.d0 
  do i = 1, N_det
    if(i == i_HF) cycle
    ! Both orderings are evaluated separately: <HF|H|I> and <I|H|HF>
    call htilde_mu_mat_bi_ortho(psi_det(1,1,i_HF), psi_det(1,1,i), N_int, hmono_1, htwoe_1, hthree_1, htot_1)
    call htilde_mu_mat_bi_ortho(psi_det(1,1,i), psi_det(1,1,i_HF), N_int, hmono_2, htwoe_2, hthree_2, htot_2)
    U_SOM += htot_1 * htot_2
  enddo
  U_SOM = 0.5d0 * U_SOM
  print *, ' U_SOM = ', U_SOM
  return
 end subroutine main
 ! ---
--- a/src/tc_keywords/EZFIO.cfg
+++ b/src/tc_keywords/EZFIO.cfg
@ -134,7 +134,31 @@ default: False
 type: integer
 doc: nb of Gaussians used to fit Jastrow fcts
 interface: ezfio,provider,ocaml
-default: 6
+default: 20
 [tcscf_algorithm]
 type: character*(32)
 doc: Type of TCSCF algorithm used. Possible choices are [Simple | DIIS]
 interface: ezfio,provider,ocaml
 default: Simple 
 [test_cycle_tc]
 type: logical
 doc: If |true|, the integrals of the three-body jastrow are computed with cycles
 interface: ezfio,provider,ocaml
 default: False
 [thresh_biorthog_diag]
 type: Threshold
 doc: Threshold to determine if diagonal elements of the bi-orthogonal condition L.T x R are close enough to 1
 interface: ezfio,provider,ocaml
 default: 1.e-6
 [thresh_biorthog_nondiag]
 type: Threshold
 doc: Threshold to determine if non-diagonal elements of L.T x R are close enough to 0
 interface: ezfio,provider,ocaml
 default: 1.e-6
 [max_dim_diis_tcscf]
 type: integer
@ -154,21 +178,9 @@ doc: Energy shift on the virtual MOs to improve TCSCF convergence
 interface: ezfio,provider,ocaml
 default: 0.
 [tcscf_algorithm]
 type: character*(32)
 doc: Type of TCSCF algorithm used. Possible choices are [Simple | DIIS]
 interface: ezfio,provider,ocaml
 default: Simple 
 [im_thresh_tcscf]
 type: Threshold
 doc: Thresholds on the Imag part of energy
 interface: ezfio,provider,ocaml
 default: 1.e-7
 [test_cycle_tc]
 type: logical
 doc: If |true|, the integrals of the three-body jastrow are computed with cycles
 interface: ezfio,provider,ocaml
 default: False
--- a/src/tc_scf/diago_bi_ort_tcfock.irp.f
+++ b/src/tc_scf/diago_bi_ort_tcfock.irp.f
@ -13,14 +13,10 @@
  integer                       :: n_real_tc 
  integer                       :: i, j, k, l
  double precision              :: accu_d, accu_nd, accu_tmp
  double precision              :: thr_d, thr_nd
  double precision              :: norm
  double precision, allocatable :: eigval_right_tmp(:)
  double precision, allocatable :: F_tmp(:,:)
  thr_d  = 1d-6
  thr_nd = 1d-6
  allocate( eigval_right_tmp(mo_num), F_tmp(mo_num,mo_num) )
  PROVIDE Fock_matrix_tc_mo_tot
@ -38,12 +34,12 @@
    F_tmp(i,i) += level_shift_tcscf
  enddo
-  call non_hrmt_bieig( mo_num, F_tmp, thr_d, thr_nd           &
+  call non_hrmt_bieig( mo_num, F_tmp, thresh_biorthog_diag, thresh_biorthog_nondiag &
                     , fock_tc_leigvec_mo, fock_tc_reigvec_mo                       & 
                     , n_real_tc, eigval_right_tmp )
  !if(max_ov_tc_scf)then
-  ! call non_hrmt_fock_mat( mo_num, F_tmp, thr_d, thr_nd        &
+  ! call non_hrmt_fock_mat( mo_num, F_tmp, thresh_biorthog_diag, thresh_biorthog_nondiag &
  !                    , fock_tc_leigvec_mo, fock_tc_reigvec_mo                          & 
  !                    , n_real_tc, eigval_right_tmp )
  !else 
@ -88,16 +84,16 @@
      else
        accu_tmp = overlap_fock_tc_eigvec_mo(k,i)
        accu_nd += accu_tmp * accu_tmp
-        if(dabs(overlap_fock_tc_eigvec_mo(k,i)) .gt. thr_nd)then
+        if(dabs(overlap_fock_tc_eigvec_mo(k,i)) .gt. thresh_biorthog_nondiag)then
         print *, 'k,i', k, i, overlap_fock_tc_eigvec_mo(k,i)
        endif
      endif
    enddo 
  enddo
  accu_nd = dsqrt(accu_nd) / accu_d
-  if(accu_nd .gt. thr_nd) then
+  if(accu_nd .gt. thresh_biorthog_nondiag) then
    print *, ' bi-orthog failed'
-    print *, ' accu_nd MO = ', accu_nd, thr_nd
+    print *, ' accu_nd MO = ', accu_nd, thresh_biorthog_nondiag
    print *, ' overlap_fock_tc_eigvec_mo = '
    do i = 1, mo_num
      write(*,'(100(F16.10,X))') overlap_fock_tc_eigvec_mo(i,:)
@ -107,14 +103,14 @@
  ! ---
-  if(dabs(accu_d - dble(mo_num))/dble(mo_num) .gt. thr_d) then
+  if(dabs(accu_d - dble(mo_num))/dble(mo_num) .gt. thresh_biorthog_diag) then
    print *, ' mo_num     = ', mo_num 
-    print *, ' accu_d  MO = ', accu_d, thr_d
+    print *, ' accu_d  MO = ', accu_d, thresh_biorthog_diag
    print *, ' normalizing vectors ...'
    do i = 1, mo_num
      norm = dsqrt(dabs(overlap_fock_tc_eigvec_mo(i,i)))
-      if(norm .gt. thr_d) then
+      if(norm .gt. thresh_biorthog_diag) then
        do k = 1, mo_num
          fock_tc_reigvec_mo(k,i) *= 1.d0/norm
          fock_tc_leigvec_mo(k,i) *= 1.d0/norm
@ -137,16 +133,16 @@
        else
          accu_tmp = overlap_fock_tc_eigvec_mo(k,i)
          accu_nd += accu_tmp * accu_tmp
-          if(dabs(overlap_fock_tc_eigvec_mo(k,i)) .gt. thr_nd)then
+          if(dabs(overlap_fock_tc_eigvec_mo(k,i)) .gt. thresh_biorthog_nondiag)then
           print *, 'k,i', k, i, overlap_fock_tc_eigvec_mo(k,i)
          endif
        endif
      enddo 
    enddo
    accu_nd = dsqrt(accu_nd) / accu_d
-    if(accu_nd .gt. thr_nd) then
+    if(accu_nd .gt. thresh_biorthog_nondiag) then
      print *, ' bi-orthog failed'
-      print *, ' accu_nd MO = ', accu_nd, thr_nd
+      print *, ' accu_nd MO = ', accu_nd, thresh_biorthog_nondiag
      print *, ' overlap_fock_tc_eigvec_mo = '
      do i = 1, mo_num
        write(*,'(100(F16.10,X))') overlap_fock_tc_eigvec_mo(i,:)
@ -177,6 +173,7 @@ END_PROVIDER
  double precision              :: accu, accu_d
  double precision, allocatable :: tmp(:,:)
  PROVIDE mo_l_coef mo_r_coef
 !  ! MO_R x R
   call dgemm( 'N', 'N', ao_num, mo_num, mo_num, 1.d0          &
--- a/src/tc_scf/fock_3e_bi_ortho_uhf.irp.f
+++ b/src/tc_scf/fock_3e_bi_ortho_uhf.irp.f
@ -310,46 +310,6 @@ BEGIN_PROVIDER [double precision, fock_3e_uhf_ao_a, (ao_num, ao_num)]
  deallocate(f_tmp)
 !$OMP END PARALLEL
 ! TODO
 ! !$OMP PARALLEL DEFAULT (NONE)                                                                &
 ! !$OMP PRIVATE (g, e, d, k, mu, nu, dm_ge_a, dm_ge_b, dm_ge, dm_dk_a, dm_dk_b, dm_dk,         &
 ! !$OMP          i_mugd_nuek, i_mugd_eknu, i_mugd_knue, i_mugd_nuke, i_mugd_enuk, i_mugd_kenu) &
 ! !$OMP SHARED  (ao_num, TCSCF_bi_ort_dm_ao_alpha, TCSCF_bi_ort_dm_ao_beta, fock_3e_uhf_ao_a)
 ! !$OMP DO
 !  do g = 1, ao_num
 !    do e = 1, ao_num
 !      dm_ge_a = TCSCF_bi_ort_dm_ao_alpha(g,e)
 !      dm_ge_b = TCSCF_bi_ort_dm_ao_beta (g,e)
 !      dm_ge   = dm_ge_a + dm_ge_b
 !      do d = 1, ao_num
 !        do k = 1, ao_num
 !          dm_dk_a = TCSCF_bi_ort_dm_ao_alpha(d,k)
 !          dm_dk_b = TCSCF_bi_ort_dm_ao_beta (d,k)
 !          dm_dk   = dm_dk_a + dm_dk_b
 !          do mu = 1, ao_num
 !            do nu = 1, ao_num
 !              call give_integrals_3_body_bi_ort_ao(mu, g, d, nu, e, k, i_mugd_nuek)
 !              call give_integrals_3_body_bi_ort_ao(mu, g, d, e, k, nu, i_mugd_eknu)
 !              call give_integrals_3_body_bi_ort_ao(mu, g, d, k, nu, e, i_mugd_knue)
 !              call give_integrals_3_body_bi_ort_ao(mu, g, d, nu, k, e, i_mugd_nuke)
 !              call give_integrals_3_body_bi_ort_ao(mu, g, d, e, nu, k, i_mugd_enuk)
 !              call give_integrals_3_body_bi_ort_ao(mu, g, d, k, e, nu, i_mugd_kenu)
 !              fock_3e_uhf_ao_a(mu,nu) -= 0.5d0 * ( dm_ge   * dm_dk   * i_mugd_nuek &
 !                                                 + dm_ge_a * dm_dk_a * i_mugd_eknu &
 !                                                 + dm_ge_a * dm_dk_a * i_mugd_knue &
 !                                                 - dm_ge_a * dm_dk   * i_mugd_enuk &
 !                                                 - dm_ge   * dm_dk_a * i_mugd_kenu &
 !                                                 - dm_ge_a * dm_dk_a * i_mugd_nuke &
 !                                                 - dm_ge_b * dm_dk_b * i_mugd_nuke )
 !            enddo
 !          enddo
 !        enddo
 !      enddo
 !    enddo
 !  enddo
 ! !$OMP END DO
 ! !$OMP END PARALLEL
  call wall_time(tf)
  print *, ' total Wall time for fock_3e_uhf_ao_a =', tf - ti
@ -436,44 +396,6 @@ BEGIN_PROVIDER [double precision, fock_3e_uhf_ao_b, (ao_num, ao_num)]
  deallocate(f_tmp)
 !$OMP END PARALLEL
 ! TODO
 ! !$OMP PARALLEL DO DEFAULT (NONE)                                                             &
 ! !$OMP PRIVATE (g, e, d, k, mu, nu, dm_ge_a, dm_ge_b, dm_ge, dm_dk_a, dm_dk_b, dm_dk,         &
 ! !$OMP          i_mugd_nuek, i_mugd_eknu, i_mugd_knue, i_mugd_nuke, i_mugd_enuk, i_mugd_kenu) &
 ! !$OMP SHARED  (ao_num, TCSCF_bi_ort_dm_ao_alpha, TCSCF_bi_ort_dm_ao_beta, fock_3e_uhf_ao_b)
 !  do g = 1, ao_num
 !    do e = 1, ao_num
 !      dm_ge_a = TCSCF_bi_ort_dm_ao_alpha(g,e)
 !      dm_ge_b = TCSCF_bi_ort_dm_ao_beta (g,e)
 !      dm_ge   = dm_ge_a + dm_ge_b
 !      do d = 1, ao_num
 !        do k = 1, ao_num
 !          dm_dk_a = TCSCF_bi_ort_dm_ao_alpha(d,k)
 !          dm_dk_b = TCSCF_bi_ort_dm_ao_beta (d,k)
 !          dm_dk   = dm_dk_a + dm_dk_b
 !          do mu = 1, ao_num
 !            do nu = 1, ao_num
 !              call give_integrals_3_body_bi_ort_ao(mu, g, d, nu, e, k, i_mugd_nuek)
 !              call give_integrals_3_body_bi_ort_ao(mu, g, d, e, k, nu, i_mugd_eknu)
 !              call give_integrals_3_body_bi_ort_ao(mu, g, d, k, nu, e, i_mugd_knue)
 !              call give_integrals_3_body_bi_ort_ao(mu, g, d, nu, k, e, i_mugd_nuke)
 !              call give_integrals_3_body_bi_ort_ao(mu, g, d, e, nu, k, i_mugd_enuk)
 !              call give_integrals_3_body_bi_ort_ao(mu, g, d, k, e, nu, i_mugd_kenu)
 !              fock_3e_uhf_ao_b(mu,nu) -= 0.5d0 * ( dm_ge   * dm_dk   * i_mugd_nuek &
 !                                                 + dm_ge_b * dm_dk_b * i_mugd_eknu &
 !                                                 + dm_ge_b * dm_dk_b * i_mugd_knue &
 !                                                 - dm_ge_b * dm_dk   * i_mugd_enuk &
 !                                                 - dm_ge   * dm_dk_b * i_mugd_kenu &
 !                                                 - dm_ge_b * dm_dk_b * i_mugd_nuke &
 !                                                 - dm_ge_a * dm_dk_a * i_mugd_nuke )
 !            enddo
 !          enddo
 !        enddo
 !      enddo
 !    enddo
 !  enddo
 ! !$OMP END PARALLEL DO
  call wall_time(tf)
  print *, ' total Wall time for fock_3e_uhf_ao_b =', tf - ti
--- a/src/tc_scf/fock_tc.irp.f
+++ b/src/tc_scf/fock_tc.irp.f
@ -1,29 +1,30 @@
 ! ---
- BEGIN_PROVIDER [ double precision, two_e_tc_non_hermit_integral_alpha, (ao_num, ao_num)]
+ BEGIN_PROVIDER [ double precision, two_e_tc_non_hermit_integral_seq_alpha, (ao_num, ao_num)]
-&BEGIN_PROVIDER [ double precision, two_e_tc_non_hermit_integral_beta , (ao_num, ao_num)]
+&BEGIN_PROVIDER [ double precision, two_e_tc_non_hermit_integral_seq_beta , (ao_num, ao_num)]
  BEGIN_DOC
-! two_e_tc_non_hermit_integral_alpha(k,i) = <k| F^tc_alpha |i> 
+  !
  ! two_e_tc_non_hermit_integral_seq_alpha(k,i) = <k| F^tc_alpha |i> 
  !
  ! where F^tc is the two-body part of the TC Fock matrix and k,i are AO basis functions
  !
  END_DOC
  implicit none
  integer          :: i, j, k, l
  double precision :: density, density_a, density_b
  double precision :: t0, t1
-  two_e_tc_non_hermit_integral_alpha = 0.d0
+  !print*, ' providing two_e_tc_non_hermit_integral_seq ...'
-  two_e_tc_non_hermit_integral_beta  = 0.d0
+  !call wall_time(t0)
  two_e_tc_non_hermit_integral_seq_alpha = 0.d0
  two_e_tc_non_hermit_integral_seq_beta  = 0.d0
  !! TODO :: parallelization properly done
  do i = 1, ao_num
    do k = 1, ao_num
 !!$OMP PARALLEL                  &
 !!$OMP DEFAULT (NONE)            &
 !!$OMP PRIVATE (j,l,density_a,density_b,density) & 
 !!$OMP SHARED (i,k,ao_num,SCF_density_matrix_ao_alpha,SCF_density_matrix_ao_beta,ao_non_hermit_term_chemist) & 
 !!$OMP SHARED (two_e_tc_non_hermit_integral_alpha,two_e_tc_non_hermit_integral_beta)
 !!$OMP DO SCHEDULE (dynamic)
      do j = 1, ao_num
        do l = 1, ao_num
@ -31,6 +32,15 @@
          density_b = TCSCF_density_matrix_ao_beta (l,j)
          density   = density_a + density_b
          !!                                         rho(l,j)   *      < k l| T | i j>
          !two_e_tc_non_hermit_integral_seq_alpha(k,i) += density   * ao_two_e_tc_tot(l,j,k,i)
          !!                                         rho(l,j)   *      < k l| T | i j>
          !two_e_tc_non_hermit_integral_seq_beta (k,i) += density   * ao_two_e_tc_tot(l,j,k,i)
          !!                                         rho_a(l,j) *      < l k| T | i j>
          !two_e_tc_non_hermit_integral_seq_alpha(k,i) -= density_a * ao_two_e_tc_tot(k,j,l,i)
          !!                                         rho_b(l,j) *      < l k| T | i j>
          !two_e_tc_non_hermit_integral_seq_beta (k,i) -= density_b * ao_two_e_tc_tot(k,j,l,i)
          !!                                         rho(l,j)   *      < k l| T | i j>
          !two_e_tc_non_hermit_integral_alpha(k,i) += density   * ao_two_e_tc_tot(l,j,k,i)
          !!                                         rho(l,j)   *      < k l| T | i j>
@ -41,32 +51,106 @@
          !two_e_tc_non_hermit_integral_beta (k,i) -= density_b * ao_two_e_tc_tot(k,j,l,i)
          !                                         rho(l,j)   *      < k l| T | i j>
-          two_e_tc_non_hermit_integral_alpha(k,i) += density   * ao_two_e_tc_tot(k,i,l,j)
+          two_e_tc_non_hermit_integral_seq_alpha(k,i) += density   * ao_two_e_tc_tot(k,i,l,j)
          !                                         rho(l,j)   *      < k l| T | i j>
-          two_e_tc_non_hermit_integral_beta (k,i) += density   * ao_two_e_tc_tot(k,i,l,j)
+          two_e_tc_non_hermit_integral_seq_beta (k,i) += density   * ao_two_e_tc_tot(k,i,l,j)
          !                                         rho_a(l,j) *      < k l| T | j i>
-          two_e_tc_non_hermit_integral_alpha(k,i) -= density_a * ao_two_e_tc_tot(k,j,l,i)
+          two_e_tc_non_hermit_integral_seq_alpha(k,i) -= density_a * ao_two_e_tc_tot(k,j,l,i)
          !                                         rho_b(l,j) *      < k l| T | j i>
-          two_e_tc_non_hermit_integral_beta (k,i) -= density_b * ao_two_e_tc_tot(k,j,l,i)
+          two_e_tc_non_hermit_integral_seq_beta (k,i) -= density_b * ao_two_e_tc_tot(k,j,l,i)
        enddo
      enddo
 !!$OMP END DO
 !!$OMP END PARALLEL
    enddo
  enddo
  !call wall_time(t1)
  !print*, ' wall time for two_e_tc_non_hermit_integral_seq after = ', t1 - t0
 END_PROVIDER 
 ! ---
 BEGIN_PROVIDER [ double precision, two_e_tc_non_hermit_integral_alpha, (ao_num, ao_num)]
 &BEGIN_PROVIDER [ double precision, two_e_tc_non_hermit_integral_beta , (ao_num, ao_num)]
  BEGIN_DOC
  !
  ! Two-body part of the TC Fock matrix on the AO basis, for each spin:
  !
  ! two_e_tc_non_hermit_integral_alpha(k,i) = <k| F^tc_alpha |i>
  ! two_e_tc_non_hermit_integral_beta (k,i) = <k| F^tc_beta  |i>
  !
  ! k and i are AO basis functions.
  !
  ! Each thread accumulates its share of the (l,j) density elements in private
  ! buffers, which are then summed into the shared providers one thread at a time.
  !
  END_DOC
  implicit none
  integer                       :: i, j, k, l
  double precision              :: dm_tot, dm_a, dm_b, coul, exch
  double precision              :: t0, t1
  double precision, allocatable :: acc_a(:,:), acc_b(:,:)
  !print*, ' providing two_e_tc_non_hermit_integral ...'
  !call wall_time(t0)
  two_e_tc_non_hermit_integral_alpha = 0.d0
  two_e_tc_non_hermit_integral_beta  = 0.d0
 !$OMP PARALLEL DEFAULT (NONE)                                                                        &
 !$OMP PRIVATE (i, j, k, l, dm_a, dm_b, dm_tot, acc_a, acc_b, coul, exch)                             &
 !$OMP SHARED  (ao_num, TCSCF_density_matrix_ao_alpha, TCSCF_density_matrix_ao_beta, ao_two_e_tc_tot, &
 !$OMP         two_e_tc_non_hermit_integral_alpha, two_e_tc_non_hermit_integral_beta)
  ! thread-private accumulators
  allocate(acc_a(ao_num,ao_num), acc_b(ao_num,ao_num))
  acc_a = 0.d0
  acc_b = 0.d0
 !$OMP DO
  do j = 1, ao_num
    do l = 1, ao_num
      dm_a   = TCSCF_density_matrix_ao_alpha(l,j)
      dm_b   = TCSCF_density_matrix_ao_beta (l,j)
      dm_tot = dm_a + dm_b
      do i = 1, ao_num
        do k = 1, ao_num
          ! direct term weighted by the total density, exchange-like term by the same-spin density
          coul = dm_tot * ao_two_e_tc_tot(k,i,l,j)
          exch = ao_two_e_tc_tot(k,j,l,i)
          acc_a(k,i) += coul - dm_a * exch
          acc_b(k,i) += coul - dm_b * exch
        enddo
      enddo
    enddo
  enddo
 !$OMP END DO NOWAIT
  ! reduction of the private buffers into the shared matrices
 !$OMP CRITICAL
  two_e_tc_non_hermit_integral_alpha = two_e_tc_non_hermit_integral_alpha + acc_a
  two_e_tc_non_hermit_integral_beta  = two_e_tc_non_hermit_integral_beta  + acc_b
 !$OMP END CRITICAL
  deallocate(acc_a, acc_b)
 !$OMP END PARALLEL
  !call wall_time(t1)
  !print*, ' wall time for two_e_tc_non_hermit_integral after = ', t1 - t0
 END_PROVIDER 
 ! ---
 BEGIN_PROVIDER [ double precision, Fock_matrix_tc_ao_alpha, (ao_num, ao_num)]
-  implicit none
+
  BEGIN_DOC
  ! Total alpha TC Fock matrix : h_c + Two-e^TC terms on the AO basis
  END_DOC
-  Fock_matrix_tc_ao_alpha =  ao_one_e_integrals_tc_tot &
+
-                          + two_e_tc_non_hermit_integral_alpha 
+  implicit none
  Fock_matrix_tc_ao_alpha =  ao_one_e_integrals_tc_tot + two_e_tc_non_hermit_integral_alpha 
 END_PROVIDER 
@ -77,10 +161,10 @@ BEGIN_PROVIDER [ double precision, Fock_matrix_tc_ao_beta, (ao_num, ao_num)]
  BEGIN_DOC
  ! Total beta TC Fock matrix : h_c + Two-e^TC terms on the AO basis
  END_DOC
  implicit none
-  Fock_matrix_tc_ao_beta = ao_one_e_integrals_tc_tot &
+  Fock_matrix_tc_ao_beta = ao_one_e_integrals_tc_tot + two_e_tc_non_hermit_integral_beta 
                         + two_e_tc_non_hermit_integral_beta 
 END_PROVIDER 
@ -171,25 +255,38 @@ END_PROVIDER
  do i = 1, elec_beta_num ! doc --> SOMO
    do k = elec_beta_num+1, elec_alpha_num
-      grad_non_hermit_left  += dabs(Fock_matrix_tc_mo_tot(k,i))
+      grad_non_hermit_left  = max(grad_non_hermit_left , dabs(Fock_matrix_tc_mo_tot(k,i)))
-      grad_non_hermit_right += dabs(Fock_matrix_tc_mo_tot(i,k))
+      grad_non_hermit_right = max(grad_non_hermit_right, dabs(Fock_matrix_tc_mo_tot(i,k)))
      !grad_non_hermit_left  += dabs(Fock_matrix_tc_mo_tot(k,i))
      !grad_non_hermit_right += dabs(Fock_matrix_tc_mo_tot(i,k))
      !grad_non_hermit_left  += Fock_matrix_tc_mo_tot(k,i) * Fock_matrix_tc_mo_tot(k,i)
      !grad_non_hermit_right += Fock_matrix_tc_mo_tot(i,k) * Fock_matrix_tc_mo_tot(i,k)
    enddo
  enddo
  do i = 1, elec_beta_num ! doc --> virt 
    do k = elec_alpha_num+1, mo_num
-      grad_non_hermit_left  += dabs(Fock_matrix_tc_mo_tot(k,i))
+      grad_non_hermit_left  = max(grad_non_hermit_left , dabs(Fock_matrix_tc_mo_tot(k,i)))
-      grad_non_hermit_right += dabs(Fock_matrix_tc_mo_tot(i,k))
+      grad_non_hermit_right = max(grad_non_hermit_right, dabs(Fock_matrix_tc_mo_tot(i,k)))
      !grad_non_hermit_left  += dabs(Fock_matrix_tc_mo_tot(k,i))
      !grad_non_hermit_right += dabs(Fock_matrix_tc_mo_tot(i,k))
      grad_non_hermit_left  += Fock_matrix_tc_mo_tot(k,i) * Fock_matrix_tc_mo_tot(k,i)
      grad_non_hermit_right += Fock_matrix_tc_mo_tot(i,k) * Fock_matrix_tc_mo_tot(i,k)
    enddo
  enddo
  do i = elec_beta_num+1, elec_alpha_num ! SOMO --> virt 
    do k = elec_alpha_num+1, mo_num
-      grad_non_hermit_left  += dabs(Fock_matrix_tc_mo_tot(k,i))
+      grad_non_hermit_left  = max(grad_non_hermit_left , dabs(Fock_matrix_tc_mo_tot(k,i)))
-      grad_non_hermit_right += dabs(Fock_matrix_tc_mo_tot(i,k))
+      grad_non_hermit_right = max(grad_non_hermit_right, dabs(Fock_matrix_tc_mo_tot(i,k)))
      !grad_non_hermit_left  += dabs(Fock_matrix_tc_mo_tot(k,i))
      !grad_non_hermit_right += dabs(Fock_matrix_tc_mo_tot(i,k))
      grad_non_hermit_left  += Fock_matrix_tc_mo_tot(k,i) * Fock_matrix_tc_mo_tot(k,i)
      grad_non_hermit_right += Fock_matrix_tc_mo_tot(i,k) * Fock_matrix_tc_mo_tot(i,k)
    enddo
  enddo
  !grad_non_hermit = dsqrt(grad_non_hermit_left) + dsqrt(grad_non_hermit_right)
  grad_non_hermit = grad_non_hermit_left + grad_non_hermit_right
 END_PROVIDER 
--- a/src/tc_scf/fock_three.irp.f
+++ b/src/tc_scf/fock_three.irp.f
@ -79,6 +79,8 @@ BEGIN_PROVIDER [double precision, diag_three_elem_hf]
  double precision :: contrib, weight, four_third, one_third, two_third, exchange_int_231
  double precision :: integral_aaa, hthree, integral_aab, integral_abb, integral_bbb
  PROVIDE mo_l_coef mo_r_coef
  !print *, ' providing diag_three_elem_hf'
  if(.not. three_body_h_tc) then
--- a/src/tc_scf/rh_tcscf_diis.irp.f
+++ b/src/tc_scf/rh_tcscf_diis.irp.f
@ -0,0 +1,362 @@
 ! ---
 subroutine rh_tcscf_diis()
  ! TC-SCF driver using DIIS extrapolation of the AO Fock matrix together with an
  ! adaptive level shift.
  !
  ! The loop runs while BOTH the gradient (grad_non_hermit) is above dsqrt(thresh_tcscf)
  ! and the DIIS error (max |FQS_SQF_mo|) is above threshold_DIIS_nonzero_TCSCF.
  ! The program is stopped when more than n_it_TCSCF_max iterations are needed.
  ! The energy and the left/right MO coefficients are written to the EZFIO at the end,
  ! and also during the run after every iteration in which the gradient decreased.
  !
  ! NOTE: the order of the assignments, TOUCH / SOFT_TOUCH statements and provider
  ! reads below is significant and must not be changed.
  implicit none
  integer                       :: i, j, it
  integer                       :: dim_DIIS, index_dim_DIIS
  double precision              :: etc_tot, etc_1e, etc_2e, etc_3e, e_save, e_delta
  double precision              :: tc_grad, g_save, g_delta, g_delta_th
  double precision              :: level_shift_save, rate_th
  double precision              :: t0, t1
  double precision              :: er_DIIS, er_delta, er_save, er_delta_th
  double precision, allocatable :: F_DIIS(:,:,:), E_DIIS(:,:,:)
  double precision, allocatable :: mo_r_coef_save(:,:), mo_l_coef_save(:,:)
  logical, external             :: qp_stop
  it          = 0
  e_save      = 0.d0
  dim_DIIS    = 0
  g_delta_th  = 1d0
  er_delta_th = 1d0
  ! a step is rejected when the gradient grows by more than rate_th * g_delta_th
  rate_th     = 100.d0 !0.01d0 !0.2d0
  allocate(mo_r_coef_save(ao_num,mo_num), mo_l_coef_save(ao_num,mo_num))
  mo_l_coef_save = 0.d0
  mo_r_coef_save = 0.d0
  ! history of Fock matrices (F_DIIS) and of error matrices (E_DIIS), both in the AO basis
  allocate(F_DIIS(ao_num,ao_num,max_dim_DIIS_TCSCF), E_DIIS(ao_num,ao_num,max_dim_DIIS_TCSCF))
  F_DIIS = 0.d0
  E_DIIS = 0.d0
  call write_time(6)
  ! ---
  PROVIDE level_shift_TCSCF
  PROVIDE mo_l_coef mo_r_coef
  ! header of the iteration table
  write(6, '(A4,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A4, 1X, A8)')              &
    '====', '================', '================', '================', '================', '================' &
          , '================', '================', '================', '====', '========'
  write(6, '(A4,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A4, 1X, A8)')              &
    ' it ', '  SCF TC Energy ', '      E(1e)     ', '      E(2e)     ', '      E(3e)     ', '   energy diff  ' &
          , '    gradient    ', '    DIIS error  ', '  level shift   ', 'DIIS', '  WT (m)'
  write(6, '(A4,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A4, 1X, A8)')              &
    '====', '================', '================', '================', '================', '================' &
          , '================', '================', '================', '====', '========'
  ! first iteration (HF orbitals)
  call wall_time(t0)
  etc_tot = TC_HF_energy
  etc_1e  = TC_HF_one_e_energy
  etc_2e  = TC_HF_two_e_energy
  etc_3e  = 0.d0
  if(three_body_h_tc) then
    etc_3e = diag_three_elem_hf
  endif
  tc_grad = grad_non_hermit
  er_DIIS = maxval(abs(FQS_SQF_mo))
  ! e_save is 0 at this point, so the first printed energy difference is |etc_tot|
  e_delta = dabs(etc_tot - e_save)
  e_save  = etc_tot
  g_save  = tc_grad
  er_save = er_DIIS
  call wall_time(t1)
  write(6, '(I4,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, I4,1X, F8.2)')  &
    it, etc_tot, etc_1e, etc_2e, etc_3e, e_delta, tc_grad, er_DIIS, level_shift_tcscf, dim_DIIS, (t1-t0)/60.d0
  ! ---
  PROVIDE FQS_SQF_ao Fock_matrix_tc_ao_tot
  do while((tc_grad .gt. dsqrt(thresh_tcscf)) .and. (er_DIIS .gt. threshold_DIIS_nonzero_TCSCF))
    call wall_time(t0)
    it += 1
    if(it > n_it_TCSCF_max) then
      print *, ' max of TCSCF iterations is reached ', n_it_TCSCF_max
      stop
    endif
    dim_DIIS = min(dim_DIIS+1, max_dim_DIIS_TCSCF)
    ! ---
    ! store the current Fock / error matrices and extrapolate, unless the energy
    ! did not change during the previous iteration
    if(dabs(e_delta) > 1.d-12) then
      ! dim_DIIS <= max_dim_DIIS_TCSCF here, so this evaluates to dim_DIIS
      index_dim_DIIS = mod(dim_DIIS-1, max_dim_DIIS_TCSCF) + 1
      do j = 1, ao_num
        do i = 1, ao_num
          F_DIIS(i,j,index_dim_DIIS) = Fock_matrix_tc_ao_tot(i,j)
          E_DIIS(i,j,index_dim_DIIS) = FQS_SQF_ao           (i,j)
        enddo
      enddo
      ! overwrites Fock_matrix_tc_ao_tot with the extrapolated matrix (dim_DIIS may be reset to 0 by the callee)
      call extrapolate_TC_Fock_matrix(E_DIIS, F_DIIS, Fock_matrix_tc_ao_tot, size(Fock_matrix_tc_ao_tot, 1), it, dim_DIIS)
      call ao_to_mo_bi_ortho( Fock_matrix_tc_ao_tot, size(Fock_matrix_tc_ao_tot, 1) &
                            , Fock_matrix_tc_mo_tot, size(Fock_matrix_tc_mo_tot, 1) )
      TOUCH Fock_matrix_tc_mo_tot fock_matrix_tc_diag_mo_tot
    endif
    ! ---
    ! new orbitals = left/right eigenvectors of the current Fock matrix
    mo_l_coef(1:ao_num,1:mo_num) = fock_tc_leigvec_ao(1:ao_num,1:mo_num)
    mo_r_coef(1:ao_num,1:mo_num) = fock_tc_reigvec_ao(1:ao_num,1:mo_num)
    !call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef)
    !call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef)
    TOUCH mo_l_coef mo_r_coef
    ! ---
    g_delta  = grad_non_hermit         -  g_save
    er_delta = maxval(abs(FQS_SQF_mo)) - er_save
    ! if the gradient increased too much, fall back to the Fock matrix stored in
    ! slot index_dim_DIIS of the history and rebuild the orbitals from it
    ! NOTE(review): index_dim_DIIS is only assigned inside the |e_delta| > 1.d-12 branch above;
    ! confirm it cannot be read here before having been set.
    !if((g_delta > rate_th * g_delta_th) .and. (er_delta > rate_th * er_delta_th) .and. (it > 1)) then
    if((g_delta > rate_th * g_delta_th) .and. (it > 1)) then
    !if((g_delta > 0.d0) .and. (it > 1)) then
      Fock_matrix_tc_ao_tot(1:ao_num,1:ao_num) = F_DIIS(1:ao_num,1:ao_num,index_dim_DIIS)
      call ao_to_mo_bi_ortho( Fock_matrix_tc_ao_tot, size(Fock_matrix_tc_ao_tot, 1) &
                            , Fock_matrix_tc_mo_tot, size(Fock_matrix_tc_mo_tot, 1) )
      TOUCH Fock_matrix_tc_mo_tot fock_matrix_tc_diag_mo_tot
      mo_l_coef(1:ao_num,1:mo_num) = fock_tc_leigvec_ao(1:ao_num,1:mo_num)
      mo_r_coef(1:ao_num,1:mo_num) = fock_tc_reigvec_ao(1:ao_num,1:mo_num)
      !call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef)
      !call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef)
      TOUCH mo_l_coef mo_r_coef
    endif
    ! ---
    g_delta  = grad_non_hermit         -  g_save
    er_delta = maxval(abs(FQS_SQF_mo)) - er_save
    mo_l_coef_save(1:ao_num,1:mo_num) = mo_l_coef(1:ao_num,1:mo_num)
    mo_r_coef_save(1:ao_num,1:mo_num) = mo_r_coef(1:ao_num,1:mo_num)
    ! while the step is still rejected: restore the saved orbitals, raise the level
    ! shift, rebuild the orbitals and restart the DIIS history (dim_DIIS = 0)
    ! level_shift_save is first assigned at the end of iteration 1; the (it > 1)
    ! guard keeps this loop from reading it earlier
    !do while((g_delta > rate_th * g_delta_th) .and. (er_delta > rate_th * er_delta_th) .and. (it > 1))
    do while((g_delta > rate_th * g_delta_th) .and. (it > 1))
      print *, ' big or bad step : ', g_delta, rate_th * g_delta_th
      mo_l_coef(1:ao_num,1:mo_num) = mo_l_coef_save(1:ao_num,1:mo_num) 
      mo_r_coef(1:ao_num,1:mo_num) = mo_r_coef_save(1:ao_num,1:mo_num) 
      if(level_shift_TCSCF <= .1d0) then
        level_shift_TCSCF = 1.d0
      else
        level_shift_TCSCF = level_shift_TCSCF * 3.0d0
      endif
      TOUCH mo_l_coef mo_r_coef level_shift_TCSCF
      mo_l_coef(1:ao_num,1:mo_num) = fock_tc_leigvec_ao(1:ao_num,1:mo_num)
      mo_r_coef(1:ao_num,1:mo_num) = fock_tc_reigvec_ao(1:ao_num,1:mo_num)
      !call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef)
      !call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef)
      TOUCH mo_l_coef mo_r_coef
      g_delta  = grad_non_hermit         -  g_save
      er_delta = maxval(abs(FQS_SQF_mo)) - er_save
      ! give up once the shift has grown by more than 40 with respect to the saved value
      if(level_shift_TCSCF - level_shift_save > 40.d0) then
        level_shift_TCSCF = level_shift_save * 4.d0
        SOFT_TOUCH level_shift_TCSCF
        exit
      endif
      dim_DIIS = 0
    enddo
    ! ---
    ! the level shift is halved at every iteration
    level_shift_TCSCF = level_shift_TCSCF * 0.5d0
    SOFT_TOUCH level_shift_TCSCF
    etc_tot = TC_HF_energy
    etc_1e  = TC_HF_one_e_energy
    etc_2e  = TC_HF_two_e_energy
    etc_3e  = 0.d0
    if(three_body_h_tc) then
      etc_3e = diag_three_elem_hf
    endif
    tc_grad  = grad_non_hermit
    er_DIIS  = maxval(abs(FQS_SQF_mo))
    e_delta  = dabs(etc_tot - e_save)
    g_delta  = tc_grad - g_save
    er_delta = er_DIIS - er_save
    ! reference values for the next iteration
    e_save           = etc_tot
    g_save           = tc_grad
    level_shift_save = level_shift_TCSCF
    er_save          = er_DIIS
    g_delta_th  = dabs(tc_grad) ! g_delta)
    er_delta_th = dabs(er_DIIS) !er_delta)
    call wall_time(t1)
    write(6, '(I4,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, I4,1X, F8.2)')  &
      it, etc_tot, etc_1e, etc_2e, etc_3e, e_delta, tc_grad, er_DIIS, level_shift_tcscf, dim_DIIS, (t1-t0)/60.d0
    ! save the energy and the orbitals only when the gradient decreased
    if(g_delta .lt. 0.d0) then
      call ezfio_set_tc_scf_bitc_energy(etc_tot)
      call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef)
      call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef)
    endif
    if(qp_stop()) exit
  enddo
  ! ---
  ! NOTE(review): this message is also printed when the loop was left through qp_stop()
  print *, ' TCSCF DIIS converged !'
  call print_energy_and_mos()
  call write_time(6)
  deallocate(mo_r_coef_save, mo_l_coef_save, F_DIIS, E_DIIS)
  call ezfio_set_tc_scf_bitc_energy(TC_HF_energy)
  call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef)
  call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef)
 end
 ! ---
 subroutine extrapolate_TC_Fock_matrix(E_DIIS, F_DIIS, F_ao, size_F_ao, it, dim_DIIS)
  BEGIN_DOC
  !
  ! Compute the extrapolated Fock matrix using the DIIS procedure
  !
  ! e = \sum_i c_i e_i and \sum_i c_i = 1 
  ! ==> lagrange multiplier with L = |e|^2 - \lambda (\sum_i c_i - 1)
  !
  ! E_DIIS    : stored error matrices (AO basis), one per DIIS slot
  ! F_DIIS    : stored Fock  matrices (AO basis), one per DIIS slot
  ! F_ao      : overwritten with the extrapolated Fock matrix; left untouched
  !             whenever the extrapolation is abandoned
  ! size_F_ao : leading dimension of F_ao
  ! it        : current SCF iteration, used to map the history index onto a slot
  ! dim_DIIS  : number of stored vectors; reset to 0 when the DIIS linear system
  !             cannot be factorized or is ill-conditioned (rcond < 1.d-14)
  !
  END_DOC
  implicit none
  integer,          intent(in)    :: it, size_F_ao
  integer,          intent(inout) :: dim_DIIS
  double precision, intent(in)    :: F_DIIS(ao_num,ao_num,dim_DIIS)
  double precision, intent(in)    :: E_DIIS(ao_num,ao_num,dim_DIIS)
  double precision, intent(inout) :: F_ao(size_F_ao,ao_num)
  integer                         :: i, j, k, l, i_DIIS, j_DIIS
  integer                         :: n, lwork, info
  double precision                :: rcond, anorm
  integer,          allocatable   :: ipiv(:), iwork(:)
  double precision, allocatable   :: B_matrix_DIIS(:,:), X_vector_DIIS(:)
  double precision, allocatable   :: AF(:,:), work(:)
  double precision, external      :: dlange
  if(dim_DIIS < 1) then
    return
  endif
  ! order of the bordered DIIS system (dim_DIIS error vectors + 1 Lagrange multiplier)
  n = dim_DIIS + 1
  ! NB: the allocatable locals are released automatically on every return
  allocate(B_matrix_DIIS(n,n), X_vector_DIIS(n))
  ! Compute the matrix B : overlaps between the stored error matrices
  B_matrix_DIIS(:,:) = 0.d0
  do j = 1, dim_DIIS
    j_DIIS = min(dim_DIIS, mod(it-j, max_dim_DIIS_TCSCF)+1)
    do i = 1, dim_DIIS
      i_DIIS = min(dim_DIIS, mod(it-i, max_dim_DIIS_TCSCF)+1)
      ! Compute product of two errors vectors
      do l = 1, ao_num
        do k = 1, ao_num
          B_matrix_DIIS(i,j) = B_matrix_DIIS(i,j) + E_DIIS(k,l,i_DIIS) * E_DIIS(k,l,j_DIIS)
        enddo
      enddo
    enddo
  enddo
  ! Pad B matrix with the constraint row/column and build the right-hand side
  do i = 1, dim_DIIS
    B_matrix_DIIS(i,n) = -1.d0
    B_matrix_DIIS(n,i) = -1.d0
  enddo
  X_vector_DIIS(:) = 0.d0
  X_vector_DIIS(n) = -1.d0
  ! Estimate condition number of B from an LU factorization of a copy
  lwork = max(n*n, 5*n)
  allocate(AF(n,n), ipiv(n), iwork(n), work(lwork))
  work(:) = 0.d0
  anorm = dlange('1', n, n, B_matrix_DIIS, size(B_matrix_DIIS, 1), work)
  AF(:,:) = B_matrix_DIIS(:,:)
  call dgetrf(n, n, AF, size(AF, 1), ipiv, info)
  if(info /= 0) then
    dim_DIIS = 0
    return
  endif
  call dgecon('1', n, AF, size(AF, 1), anorm, rcond, work, iwork, info)
  if(info /= 0) then
    dim_DIIS = 0
    return
  endif
  if(rcond < 1.d-14) then
    dim_DIIS = 0
    return
  endif
  ! solve the linear system C = B x X (the right-hand side is overwritten by the solution)
  call dgesv(n, 1, B_matrix_DIIS, size(B_matrix_DIIS, 1), ipiv, X_vector_DIIS, size(X_vector_DIIS, 1), info)
  if(info < 0) then
    stop ' bug in TC-DIIS'
  endif
  if(info > 0) then
    ! singular system : no solution was computed, so keep F_ao as it is and
    ! restart the DIIS history, as for the other failure modes above
    dim_DIIS = 0
    return
  endif
  ! Compute extrapolated Fock matrix
  !$OMP PARALLEL DO PRIVATE(i,j,k) DEFAULT(SHARED) if (ao_num > 200)
  do j = 1, ao_num
    do i = 1, ao_num
      F_ao(i,j) = 0.d0
    enddo
    do k = 1, dim_DIIS
      ! negligible coefficients are skipped
      if(dabs(X_vector_DIIS(k)) < 1.d-10) cycle
      do i = 1,ao_num
        ! FPE here
        F_ao(i,j) = F_ao(i,j) + X_vector_DIIS(k) * F_DIIS(i,j,dim_DIIS-k+1)
      enddo
    enddo
  enddo
  !$OMP END PARALLEL DO
  deallocate(B_matrix_DIIS, X_vector_DIIS, AF, ipiv, iwork, work)
 end
 ! ---
--- a/src/tc_scf/rh_tcscf_simple.irp.f
+++ b/src/tc_scf/rh_tcscf_simple.irp.f
@ -0,0 +1,129 @@
 ! ---
 subroutine rh_tcscf_simple()
  ! Plain TC-SCF driver: no DIIS extrapolation and no level-shift update.
  !
  ! bi_ortho = .true.  : the left/right MO coefficients are replaced at each iteration by
  !                      the left/right eigenvectors of the TC Fock matrix until
  !                      grad_non_hermit <= dsqrt(thresh_tcscf); the program is stopped
  !                      when more than n_it_tcscf_max iterations are needed.
  ! bi_ortho = .false. : the orbitals are updated through save_good_hermit_tc_eigvectors
  !                      until grad_hermit <= dsqrt(thresh_tcscf) or n_it_tcscf_max
  !                      iterations have been performed.
  implicit none
  integer                       :: it, dim_DIIS
  double precision              :: t0, t1
  double precision              :: e_save, e_delta
  double precision              :: etc_tot, etc_1e, etc_2e, etc_3e, tc_grad
  double precision              :: er_DIIS
  it       = 0
  e_save   = 0.d0
  ! no DIIS here : dim_DIIS is only printed, to keep the same table layout as the DIIS driver
  dim_DIIS = 0
  ! ---
  if(.not. bi_ortho) then
   print *, ' grad_hermit = ', grad_hermit
   call save_good_hermit_tc_eigvectors
   TOUCH mo_coef 
   call save_mos
  endif
  ! ---
  if(bi_ortho) then
    PROVIDE level_shift_tcscf
    PROVIDE mo_l_coef mo_r_coef
    ! header of the iteration table
    write(6, '(A4,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A4, 1X, A8)')              &
      '====', '================', '================', '================', '================', '================' &
            , '================', '================', '================', '====', '========'
    write(6, '(A4,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A4, 1X, A8)')              &
      ' it ', '  SCF TC Energy ', '      E(1e)     ', '      E(2e)     ', '      E(3e)     ', '   energy diff  ' &
            , '    gradient    ', '    DIIS error  ', '  level shift   ', 'DIIS', '  WT (m)'
    write(6, '(A4,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A4, 1X, A8)')              &
      '====', '================', '================', '================', '================', '================' &
            , '================', '================', '================', '====', '========'
    ! first iteration (HF orbitals)
    call wall_time(t0)
    etc_tot = TC_HF_energy
    etc_1e  = TC_HF_one_e_energy
    etc_2e  = TC_HF_two_e_energy
    etc_3e  = 0.d0
    if(three_body_h_tc) then
      etc_3e = diag_three_elem_hf
    endif
    tc_grad = grad_non_hermit
    er_DIIS = maxval(abs(FQS_SQF_mo))
    e_delta = dabs(etc_tot - e_save)
    e_save  = etc_tot
    call wall_time(t1)
    write(6, '(I4,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, I4,1X, F8.2)')  &
      it, etc_tot, etc_1e, etc_2e, etc_3e, e_delta, tc_grad, er_DIIS, level_shift_tcscf, dim_DIIS, (t1-t0)/60.d0
    do while(tc_grad .gt. dsqrt(thresh_tcscf))
      call wall_time(t0)
      it += 1
      if(it > n_it_tcscf_max) then
        print *, ' max of TCSCF iterations is reached ', n_it_TCSCF_max
        stop
      endif
      ! new orbitals = left/right eigenvectors of the current TC Fock matrix;
      ! they are saved before the TOUCH that invalidates the dependent quantities
      mo_l_coef = fock_tc_leigvec_ao
      mo_r_coef = fock_tc_reigvec_ao
      call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef)
      call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef)
      TOUCH mo_l_coef mo_r_coef
      etc_tot = TC_HF_energy
      etc_1e  = TC_HF_one_e_energy
      etc_2e  = TC_HF_two_e_energy
      etc_3e  = 0.d0
      if(three_body_h_tc) then
        etc_3e = diag_three_elem_hf
      endif
      tc_grad = grad_non_hermit
      er_DIIS = maxval(abs(FQS_SQF_mo))
      e_delta = dabs(etc_tot - e_save)
      e_save  = etc_tot
      call ezfio_set_tc_scf_bitc_energy(etc_tot)
      call wall_time(t1)
      write(6, '(I4,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, I4,1X, F8.2)')  &
        it, etc_tot, etc_1e, etc_2e, etc_3e, e_delta, tc_grad, er_DIIS, level_shift_tcscf, dim_DIIS, (t1-t0)/60.d0
    enddo
  else
   do while( (grad_hermit.gt.dsqrt(thresh_tcscf)) .and. (it.lt.n_it_tcscf_max) )
      print*,'grad_hermit = ',grad_hermit
      it += 1
      print *, 'iteration = ', it
      print *, '***'
      print *, 'TC HF total energy = ', TC_HF_energy
      print *, 'TC HF 1 e   energy = ', TC_HF_one_e_energy
      print *, 'TC HF 2 e   energy = ', TC_HF_two_e_energy
      print *, 'TC HF 3 body       = ', diag_three_elem_hf
      print *, '***'
      print *, ''
      call save_good_hermit_tc_eigvectors
      TOUCH mo_coef 
      call save_mos
    enddo
  endif
  ! NOTE(review): in the non bi-orthogonal branch this message is also printed when the
  ! loop ended because n_it_tcscf_max was reached
  print *, ' TCSCF Simple converged !'
  call print_energy_and_mos()
 end
 ! ---
--- a/src/tc_scf/rotate_tcscf_orbitals.irp.f
+++ b/src/tc_scf/rotate_tcscf_orbitals.irp.f
@ -260,14 +260,10 @@ subroutine fix_right_to_one()
  integer                       :: i, j, m, n, mm, tot_deg
  double precision              :: accu_d, accu_nd
  double precision              :: de_thr, ei, ej, de
  double precision              :: thr_d, thr_nd
  integer,          allocatable :: deg_num(:)
  double precision, allocatable :: R0(:,:), L0(:,:), W(:,:), e0(:)
  double precision, allocatable :: R(:,:), L(:,:), S(:,:), Stmp(:,:), tmp(:,:)
  thr_d  = 1d-7
  thr_nd = 1d-7
  n = ao_num
  m = mo_num
@ -340,7 +336,7 @@ subroutine fix_right_to_one()
      ! ---
      call impose_weighted_orthog_svd(n, mm, W, R)
-      call impose_weighted_biorthog_qr(n, mm, thr_d, thr_nd, R, W, L)
+      call impose_weighted_biorthog_qr(n, mm, thresh_biorthog_diag, thresh_biorthog_nondiag, R, W, L)
      ! ---
@ -353,7 +349,7 @@ subroutine fix_right_to_one()
    endif
  enddo
-  call check_weighted_biorthog_binormalize(n, m, L0, W, R0, thr_d, thr_nd, .true.)
+  call check_weighted_biorthog_binormalize(n, m, L0, W, R0, thresh_biorthog_diag, thresh_biorthog_nondiag, .true.)
  deallocate(W, deg_num)
--- a/src/tc_scf/routines_rotates.irp.f
+++ b/src/tc_scf/routines_rotates.irp.f
@ -259,7 +259,7 @@ subroutine orthog_functions(m, n, coef, overlap)
  double precision, intent(in)    :: overlap(m,m)
  double precision, intent(inout) :: coef(m,n)
  double precision, allocatable   :: stmp(:,:)
-  integer                         :: j
+  integer                         :: j, k
  allocate(stmp(n,n))
  call build_s_matrix(m, n, coef, coef, overlap, stmp)
@ -270,7 +270,13 @@ subroutine orthog_functions(m, n, coef, overlap)
  call impose_orthog_svd_overlap(m, n, coef, overlap)
  call build_s_matrix(m, n, coef, coef, overlap, stmp)
  do j = 1, n
-    coef(1,:m) *= 1.d0/dsqrt(stmp(j,j))
+    ! ---
    ! TODO: MANU check ici
    !coef(1,:m) *= 1.d0/dsqrt(stmp(j,j))
    do k = 1, m
      coef(k,j) *= 1.d0/dsqrt(stmp(j,j))
    enddo
    ! ---
  enddo
  call build_s_matrix(m, n, coef, coef, overlap, stmp)
--- a/src/tc_scf/tc_scf.irp.f
+++ b/src/tc_scf/tc_scf.irp.f
@ -1,3 +1,5 @@
 ! ---
 program tc_scf
  BEGIN_DOC
@ -15,14 +17,20 @@ program tc_scf
 !  my_n_pt_a_grid = 26 ! small grid for quick debug
  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
-  call create_guess()
+  PROVIDE mu_erf 
-  call orthonormalize_mos()
+  print *, ' mu = ', mu_erf
  PROVIDE j1b_type
  print *, ' j1b_type = ', j1b_type
  print *, j1b_pen
  !call create_guess()
  !call orthonormalize_mos()
  PROVIDE tcscf_algorithm
  if(tcscf_algorithm == 'DIIS') then
-    call rh_tcscf()
+    call rh_tcscf_diis()
  elseif(tcscf_algorithm == 'Simple') then
-    call simple_tcscf()
+    call rh_tcscf_simple()
  else
    print *, ' not implemented yet', tcscf_algorithm
    stop
@ -35,11 +43,7 @@ end
 ! ---
-subroutine create_guess
+subroutine create_guess()
  BEGIN_DOC
  !   Create a MO guess if no MOs are present in the EZFIO directory
  END_DOC
  implicit none
  logical :: exists
@ -54,10 +58,7 @@ subroutine create_guess
      mo_coef = ao_ortho_lowdin_coef
      call restore_symmetry(ao_num, mo_num, mo_coef, size(mo_coef, 1), 1.d-10)
      TOUCH mo_coef
-      call mo_as_eigvectors_of_mo_matrix(mo_one_e_integrals,     &
+      call mo_as_eigvectors_of_mo_matrix(mo_one_e_integrals, size(mo_one_e_integrals, 1), size(mo_one_e_integrals, 2), mo_label, 1, .false.)
          size(mo_one_e_integrals,1),                            &
          size(mo_one_e_integrals,2),                            &
          mo_label,1,.false.)
      call restore_symmetry(ao_num, mo_num, mo_coef, size(mo_coef, 1), 1.d-10)
      SOFT_TOUCH mo_coef
    elseif (mo_guess_type == "Huckel") then
@ -72,198 +73,3 @@ subroutine create_guess
 end subroutine create_guess
 ! ---
 subroutine simple_tcscf()
  BEGIN_DOC
  ! Plain TC-SCF loop (no convergence acceleration).
  !
  ! bi_ortho = .true. : at each iteration mo_l_coef / mo_r_coef are overwritten with
  ! fock_tc_leigvec_ao / fock_tc_reigvec_ao, written to the EZFIO directory and
  ! TOUCHed, while grad_non_hermit > sqrt(thresh_tcscf). The program stops if more
  ! than n_it_tcscf_max iterations are required.
  !
  ! bi_ortho = .false.: save_good_hermit_tc_eigvectors / save_mos are called while
  ! grad_hermit > sqrt(thresh_tcscf) and fewer than n_it_tcscf_max iterations
  ! have been done.
  !
  ! The final message states whether the gradient threshold was actually reached.
  END_DOC
  implicit none
  integer                       :: i, j, it
  logical                       :: converged
  double precision              :: e_save, e_delta, rho_delta
  double precision, allocatable :: rho_old(:,:), rho_new(:,:)
  allocate(rho_old(ao_num,ao_num), rho_new(ao_num,ao_num))
  it = 0
  print*,'iteration = ', it
  !print*,'grad_hermit = ', grad_hermit
  print*,'***'
  print*,'TC HF total energy = ', TC_HF_energy
  print*,'TC HF 1 e   energy = ', TC_HF_one_e_energy
  print*,'TC HF 2 e   energy = ', TC_HF_two_e_energy
  if(three_body_h_tc) then
   print*,'TC HF 3 body       = ', diag_three_elem_hf
  endif
  print*,'***'
  e_delta = 10.d0
  e_save  = 0.d0 !TC_HF_energy
  rho_delta = 10.d0
  ! Initial update of the orbitals before entering the iterations
  if(bi_ortho)then
   mo_l_coef = fock_tc_leigvec_ao
   mo_r_coef = fock_tc_reigvec_ao
   rho_old   = TCSCF_bi_ort_dm_ao
   call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef)
   call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef)
   TOUCH mo_l_coef mo_r_coef
  else
   print *, ' grad_hermit = ', grad_hermit
   call save_good_hermit_tc_eigvectors
   TOUCH mo_coef 
   call save_mos
  endif
  ! ---
  if(bi_ortho) then
    !do while(e_delta .gt. dsqrt(thresh_tcscf)) )
    !do while(e_delta .gt. thresh_tcscf) )
    !do while(rho_delta .gt. thresh_tcscf) )
    !do while(grad_non_hermit_right .gt. dsqrt(thresh_tcscf))
    do while(grad_non_hermit .gt. dsqrt(thresh_tcscf))
      it += 1
      if(it > n_it_tcscf_max) then
        print *, ' max of TCSCF iterations is reached ', n_it_TCSCF_max
        stop
      endif
      print *, ' ***'
      print *, ' iteration = ', it
      print *, ' TC HF total energy = ', TC_HF_energy
      print *, ' TC HF 1 e   energy = ', TC_HF_one_e_energy
      print *, ' TC HF 2 non hermit = ', TC_HF_two_e_energy
      if(three_body_h_tc) then
        print *, ' TC HF 3 body       = ', diag_three_elem_hf
      endif
      e_delta = dabs(TC_HF_energy - e_save)
      print *, ' delta E           = ', e_delta
      print *, ' gradient          = ', grad_non_hermit
      print *, ' max TC DIIS error = ', maxval(abs(FQS_SQF_mo))
      !print *, ' gradient= ', grad_non_hermit_right
      !rho_new   = TCSCF_bi_ort_dm_ao
      !!print*, rho_new
      !rho_delta = 0.d0
      !do i = 1, ao_num 
      !  do j = 1, ao_num 
      !    rho_delta += dabs(rho_new(j,i) - rho_old(j,i))
      !  enddo
      !enddo
      !print *, ' rho_delta =', rho_delta
      !rho_old = rho_new
      e_save    = TC_HF_energy
      mo_l_coef = fock_tc_leigvec_ao
      mo_r_coef = fock_tc_reigvec_ao
      call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef)
      call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef)
      TOUCH mo_l_coef mo_r_coef
      call ezfio_set_tc_scf_bitc_energy(TC_HF_energy)
      call test_fock_3e_uhf_mo()
      print *, ' ***'
      print *, ''
    enddo
    ! The iteration cap stops the program above, so the loop is only left
    ! through its condition; the explicit test also rejects a NaN gradient.
    converged = (grad_non_hermit .le. dsqrt(thresh_tcscf))
  else
   do while( (grad_hermit.gt.dsqrt(thresh_tcscf)) .and. it .lt. n_it_tcscf_max )
      print*,'grad_hermit = ',grad_hermit
      it += 1
      print *, 'iteration = ', it
      print *, '***'
      print *, 'TC HF total energy = ', TC_HF_energy
      print *, 'TC HF 1 e   energy = ', TC_HF_one_e_energy
      print *, 'TC HF 2 e   energy = ', TC_HF_two_e_energy
      print *, 'TC HF 3 body       = ', diag_three_elem_hf
      print *, '***'
      print *, ''
      call save_good_hermit_tc_eigvectors
      TOUCH mo_coef 
      call save_mos
    enddo
    ! This loop may also end because the iteration cap was hit: check the
    ! gradient again instead of assuming convergence.
    converged = (grad_hermit .le. dsqrt(thresh_tcscf))
  endif
  if(converged) then
    print *, ' TCSCF Simple converged !'
  else
    print *, ' TCSCF Simple NOT converged: max of TCSCF iterations is reached ', n_it_tcscf_max
  endif
  call print_energy_and_mos()
  deallocate(rho_old, rho_new)
 end subroutine simple_tcscf
 ! ---
 subroutine test_fock_3e_uhf_mo()
  BEGIN_DOC
  ! Compares fock_3e_uhf_mo_a / fock_3e_uhf_mo_b with fock_a_tot_3e_bi_orth /
  ! fock_b_tot_3e_bi_orth over all (j,i) in 1..mo_num. For each of the two
  ! matrices it prints the summed absolute difference divided by the summed
  ! absolute value of the *_tot_3e_bi_orth matrix, then that sum itself.
  END_DOC
  implicit none
  integer          :: i, j
  double precision :: diff_tot, diff_ij, thr_ih, norm
  thr_ih = 1d-12
  PROVIDE fock_a_tot_3e_bi_orth fock_b_tot_3e_bi_orth
  PROVIDE fock_3e_uhf_mo_a fock_3e_uhf_mo_b
  ! ---
  norm     = 0.d0
  diff_tot = 0.d0
  do i = 1, mo_num
    do j = 1, mo_num
      diff_ij = dabs(fock_3e_uhf_mo_a(j,i) - fock_a_tot_3e_bi_orth(j,i))
      ! Debug hook: uncomment to print (and stop on) elements above thr_ih
      if(diff_ij .gt. thr_ih) then
        !print *, ' difference on ', j, i
        !print *, ' MANU : ', fock_a_tot_3e_bi_orth(j,i)
        !print *, ' UHF  : ', fock_3e_uhf_mo_a     (j,i)
        !stop
      endif
      norm     += dabs(fock_a_tot_3e_bi_orth(j,i))
      diff_tot += diff_ij
    enddo
  enddo
  ! NOTE(review): the ratio below is undefined if norm is zero
  print *, ' diff on F_a = ', diff_tot / norm
  print *, '      norm_a = ', norm
  print *, ' '
  ! ---
  norm     = 0.d0
  diff_tot = 0.d0
  do i = 1, mo_num
    do j = 1, mo_num
      diff_ij = dabs(fock_3e_uhf_mo_b(j,i) - fock_b_tot_3e_bi_orth(j,i))
      ! Debug hook: uncomment to print (and stop on) elements above thr_ih
      if(diff_ij .gt. thr_ih) then
        !print *, ' difference on ', j, i
        !print *, ' MANU : ', fock_b_tot_3e_bi_orth(j,i)
        !print *, ' UHF  : ', fock_3e_uhf_mo_b     (j,i)
        !stop
      endif
      norm     += dabs(fock_b_tot_3e_bi_orth(j,i))
      diff_tot += diff_ij
    enddo
  enddo
  print *, ' diff on F_b = ', diff_tot/norm
  print *, '      norm_b = ', norm
  print *, ' '
  ! ---
 end subroutine test_fock_3e_uhf_mo
--- a/src/tc_scf/tc_scf_dm.irp.f
+++ b/src/tc_scf/tc_scf_dm.irp.f
@ -1,8 +1,11 @@
 ! ---
 BEGIN_PROVIDER [ double precision, TCSCF_density_matrix_ao_beta, (ao_num, ao_num) ]
  implicit none
  if(bi_ortho) then
    PROVIDE mo_l_coef mo_r_coef
    TCSCF_density_matrix_ao_beta = TCSCF_bi_ort_dm_ao_beta
  else
    TCSCF_density_matrix_ao_beta = SCF_density_matrix_ao_beta
@ -12,8 +15,11 @@ END_PROVIDER
 ! ---
 BEGIN_PROVIDER [ double precision, TCSCF_density_matrix_ao_alpha, (ao_num, ao_num) ]
  implicit none
  if(bi_ortho) then
    PROVIDE mo_l_coef mo_r_coef
    TCSCF_density_matrix_ao_alpha = TCSCF_bi_ort_dm_ao_alpha
  else
    TCSCF_density_matrix_ao_alpha = SCF_density_matrix_ao_alpha
--- a/src/tc_scf/tc_scf_energy.irp.f
+++ b/src/tc_scf/tc_scf_energy.irp.f
@ -10,6 +10,8 @@
  implicit none
  integer :: i, j
  PROVIDE mo_l_coef mo_r_coef
  TC_HF_energy = nuclear_repulsion
  TC_HF_one_e_energy = 0.d0
  TC_HF_two_e_energy = 0.d0
--- a/src/tc_scf/test_int.irp.f
+++ b/src/tc_scf/test_int.irp.f
@ -25,7 +25,7 @@ program test_ints
 !! OK
 !call routine_v_ij_erf_rk_cst_mu_j1b
 !! OK 
-! call routine_x_v_ij_erf_rk_cst_mu_tmp_j1b
+! call routine_x_v_ij_erf_rk_cst_mu_j1b
 !! OK
 ! call routine_v_ij_u_cst_mu_j1b
@ -43,10 +43,15 @@ program test_ints
 ! call test_ao_tc_int_chemist
 ! call test_grid_points_ao
 ! call test_tc_scf
- call test_int_gauss
+ !call test_int_gauss
  !call test_fock_3e_uhf_ao()
-  call test_fock_3e_uhf_mo()
+  !call test_fock_3e_uhf_mo()
  !call test_tc_grad_and_lapl_ao()
  !call test_tc_grad_square_ao()
  call test_two_e_tc_non_hermit_integral()
 end
@ -56,14 +61,8 @@ subroutine test_tc_scf
 implicit none
 integer :: i
 ! provide int2_u_grad1u_x_j1b2_test
- provide x_v_ij_erf_rk_cst_mu_tmp_j1b_test
+ provide x_v_ij_erf_rk_cst_mu_j1b_test
-! do i = 1, ng_fit_jast
+! provide x_v_ij_erf_rk_cst_mu_j1b_test
 !  print*,expo_gauss_1_erf_x_2(i),coef_gauss_1_erf_x_2(i)
 ! enddo
 ! provide tc_grad_square_ao_test
 !  provide tc_grad_and_lapl_ao_test
 ! provide int2_u_grad1u_x_j1b2_test
 ! provide x_v_ij_erf_rk_cst_mu_tmp_j1b_test
 ! print*,'TC_HF_energy = ',TC_HF_energy
 ! print*,'grad_non_hermit = ',grad_non_hermit
 end
@ -212,7 +211,7 @@ subroutine routine_v_ij_erf_rk_cst_mu_j1b
 end
-subroutine routine_x_v_ij_erf_rk_cst_mu_tmp_j1b
+subroutine routine_x_v_ij_erf_rk_cst_mu_j1b
 implicit none
 integer :: i,j,ipoint,k,l,m
 double precision :: weight,accu_relat, accu_abs, contrib
@ -242,8 +241,8 @@ subroutine routine_x_v_ij_erf_rk_cst_mu_tmp_j1b
    do i = 1, ao_num
     do j = 1, ao_num
      do m = 1, 3
-       array(j,i,l,k)     += x_v_ij_erf_rk_cst_mu_tmp_j1b_test(m,j,i,ipoint) * aos_grad_in_r_array_transp(m,k,ipoint) * aos_in_r_array(l,ipoint) * weight
+       array(j,i,l,k)     += x_v_ij_erf_rk_cst_mu_j1b_test(j,i,ipoint,m) * aos_grad_in_r_array_transp(m,k,ipoint) * aos_in_r_array(l,ipoint) * weight
-       array_ref(j,i,l,k) += x_v_ij_erf_rk_cst_mu_tmp_j1b(m,j,i,ipoint)      * aos_grad_in_r_array_transp(m,k,ipoint) * aos_in_r_array(l,ipoint) * weight
+       array_ref(j,i,l,k) += x_v_ij_erf_rk_cst_mu_j1b     (j,i,ipoint,m) * aos_grad_in_r_array_transp(m,k,ipoint) * aos_in_r_array(l,ipoint) * weight
      enddo
     enddo
    enddo
@ -500,8 +499,8 @@ subroutine routine_int2_u_grad1u_x_j1b2
    do i = 1, ao_num
     do j = 1, ao_num
      do m = 1, 3
-       array(j,i,l,k)     += int2_u_grad1u_x_j1b2_test(m,j,i,ipoint) * aos_grad_in_r_array_transp(m,k,ipoint) * aos_in_r_array(l,ipoint) * weight
+       array(j,i,l,k)     += int2_u_grad1u_x_j1b2_test(j,i,ipoint,m) * aos_grad_in_r_array_transp(m,k,ipoint) * aos_in_r_array(l,ipoint) * weight
-       array_ref(j,i,l,k) += int2_u_grad1u_x_j1b2(m,j,i,ipoint)      * aos_grad_in_r_array_transp(m,k,ipoint) * aos_in_r_array(l,ipoint) * weight
+       array_ref(j,i,l,k) += int2_u_grad1u_x_j1b2     (j,i,ipoint,m) * aos_grad_in_r_array_transp(m,k,ipoint) * aos_in_r_array(l,ipoint) * weight
      enddo
     enddo
    enddo
@ -708,7 +707,7 @@ subroutine test_fock_3e_uhf_mo()
  ! ---
-end subroutine test_fock_3e_uhf_mo()
+end subroutine test_fock_3e_uhf_mo
 ! ---
@ -776,9 +775,9 @@ subroutine test_grid_points_ao
  icount_bad = 0
  icount_full = 0
  do ipoint = 1, n_points_final_grid
-!   if(dabs(int2_u_grad1u_x_j1b2_test(1,j,i,ipoint)) & 
+!   if(dabs(int2_u_grad1u_x_j1b2_test(j,i,ipoint,1)) & 
-! + dabs(int2_u_grad1u_x_j1b2_test(2,j,i,ipoint)) &
+!    + dabs(int2_u_grad1u_x_j1b2_test(j,i,ipoint,2)) &
-! + dabs(int2_u_grad1u_x_j1b2_test(2,j,i,ipoint)) )
+!    + dabs(int2_u_grad1u_x_j1b2_test(j,i,ipoint,3)) )
 !   if(dabs(int2_u2_j1b2_test(j,i,ipoint)).gt.thr)then
 !    icount += 1
 !   endif
@ -848,3 +847,157 @@ subroutine test_int_gauss
 end
 ! ---
 subroutine test_tc_grad_and_lapl_ao()
  BEGIN_DOC
  ! Element-wise comparison of tc_grad_and_lapl_ao with tc_grad_and_lapl_ao_loop.
  ! Every element differing by more than 1d-10 is printed; the routine ends by
  ! printing the summed absolute difference divided by the summed absolute
  ! value of the loop version, and that sum.
  END_DOC
  implicit none
  integer          :: a, b, c, d
  double precision :: tol, err, err_sum, ref_sum
  double precision :: v_loop, v_other
  tol = 1d-10
  PROVIDE tc_grad_and_lapl_ao tc_grad_and_lapl_ao_loop
  err_sum = 0.d0
  ref_sum = 0.d0
  do d = 1, ao_num
    do c = 1, ao_num
      do b = 1, ao_num
        do a = 1, ao_num
          v_loop  = tc_grad_and_lapl_ao_loop(a,b,c,d)
          v_other = tc_grad_and_lapl_ao     (a,b,c,d)
          err     = dabs(v_loop - v_other)
          if(err .gt. tol) then
            print *, ' difference on ', a, b, c, d
            print *, ' loops : ', v_loop
            print *, ' lapack: ', v_other
          endif
          ref_sum = ref_sum + dabs(v_loop)
          err_sum = err_sum + err
        enddo
      enddo
    enddo
  enddo
  print *, ' diff tot = ', err_sum / ref_sum
  print *, '     norm = ', ref_sum
  print *, ' '
 end subroutine test_tc_grad_and_lapl_ao
 ! ---
 subroutine test_tc_grad_square_ao()
  BEGIN_DOC
  ! Element-wise comparison of tc_grad_square_ao with tc_grad_square_ao_loop.
  ! Every element differing by more than 1d-10 is printed; the routine ends by
  ! printing the summed absolute difference divided by the summed absolute
  ! value of the loop version, and that sum.
  END_DOC
  implicit none
  integer          :: a, b, c, d
  double precision :: tol, err, err_sum, ref_sum
  double precision :: v_loop, v_other
  tol = 1d-10
  PROVIDE tc_grad_square_ao tc_grad_square_ao_loop
  err_sum = 0.d0
  ref_sum = 0.d0
  do d = 1, ao_num
    do c = 1, ao_num
      do b = 1, ao_num
        do a = 1, ao_num
          v_loop  = tc_grad_square_ao_loop(a,b,c,d)
          v_other = tc_grad_square_ao     (a,b,c,d)
          err     = dabs(v_loop - v_other)
          if(err .gt. tol) then
            print *, ' difference on ', a, b, c, d
            print *, ' loops : ', v_loop
            print *, ' lapack: ', v_other
          endif
          ref_sum = ref_sum + dabs(v_loop)
          err_sum = err_sum + err
        enddo
      enddo
    enddo
  enddo
  print *, ' diff tot = ', err_sum / ref_sum
  print *, '     norm = ', ref_sum
  print *, ' '
 end subroutine test_tc_grad_square_ao
 ! ---
 subroutine test_two_e_tc_non_hermit_integral()
  BEGIN_DOC
  ! Element-wise comparison of two_e_tc_non_hermit_integral_{alpha,beta} with
  ! their *_seq_* counterparts. Every element differing by more than 1d-10 is
  ! printed; for each of the two matrices the summed absolute difference divided
  ! by the summed absolute value of the seq version is printed, then that sum.
  END_DOC
  implicit none
  integer          :: row, col
  double precision :: tol, err, err_sum, ref_sum
  double precision :: v_seq, v_par
  tol = 1d-10
  PROVIDE two_e_tc_non_hermit_integral_beta two_e_tc_non_hermit_integral_alpha
  PROVIDE two_e_tc_non_hermit_integral_seq_beta two_e_tc_non_hermit_integral_seq_alpha
  ! --- alpha
  err_sum = 0.d0
  ref_sum = 0.d0
  do col = 1, ao_num
    do row = 1, ao_num
      v_seq = two_e_tc_non_hermit_integral_seq_alpha(row,col)
      v_par = two_e_tc_non_hermit_integral_alpha    (row,col)
      err   = dabs(v_seq - v_par)
      if(err .gt. tol) then
        print *, ' difference on ', row, col
        print *, ' seq         : ', v_seq
        print *, ' //          : ', v_par
      endif
      ref_sum = ref_sum + dabs(v_seq)
      err_sum = err_sum + err
    enddo
  enddo
  print *, ' diff tot a = ', err_sum / ref_sum
  print *, '     norm a = ', ref_sum
  print *, ' '
  ! --- beta
  err_sum = 0.d0
  ref_sum = 0.d0
  do col = 1, ao_num
    do row = 1, ao_num
      v_seq = two_e_tc_non_hermit_integral_seq_beta(row,col)
      v_par = two_e_tc_non_hermit_integral_beta    (row,col)
      err   = dabs(v_seq - v_par)
      if(err .gt. tol) then
        print *, ' difference on ', row, col
        print *, ' seq         : ', v_seq
        print *, ' //          : ', v_par
      endif
      ref_sum = ref_sum + dabs(v_seq)
      err_sum = err_sum + err
    enddo
  enddo
  print *, ' diff tot b = ', err_sum / ref_sum
  print *, '     norm b = ', ref_sum
  print *, ' '
  ! ---
 end subroutine test_two_e_tc_non_hermit_integral
 ! ---
 >>>>>>> 92a4e33f8a21717cab0c0e4f8412ed6903afb04a
--- a/src/tools/print_he_energy.irp.f
+++ b/src/tools/print_he_energy.irp.f
@ -7,8 +7,8 @@ program print_he_energy
  call print_overlap()
-  call print_energy1()
+  !call print_energy1()
-  call print_energy2()
+  !call print_energy2()
 end 
--- a/src/utils/linear_algebra.irp.f
+++ b/src/utils/linear_algebra.irp.f
@ -1136,7 +1136,6 @@ subroutine ortho_svd(A,LDA,m,n)
 end
 ! QR to orthonormalize CSFs does not work :-(
 !subroutine ortho_qr_withB(A,LDA,B,m,n)
 !  implicit none
 !  BEGIN_DOC
@ -1223,7 +1222,7 @@ end
 !
 !  !deallocate(WORK,TAU)
 !end
-
+!
 !subroutine ortho_qr_csf(A, LDA, B, m, n) bind(C, name="ortho_qr_csf")
 !  use iso_c_binding
 !  integer(c_int32_t), value      :: LDA
@ -1234,6 +1233,7 @@ end
 !  call ortho_qr_withB(A,LDA,B,m,n)
 !end subroutine ortho_qr_csf
 subroutine ortho_qr(A,LDA,m,n)
  implicit none
  BEGIN_DOC
@ -1982,6 +1982,8 @@ end subroutine diag_nonsym_right
 ! ---
 ! Taken from GammCor thanks to Michal Hapka :-)
 subroutine pivoted_cholesky( A, rank, tol, ndim, U)
 !
--- a/src/utils/qsort.c
+++ b/src/utils/qsort.c
@ -0,0 +1,373 @@
 /* [[file:~/qp2/src/utils/qsort.org::*Generated%20C%20file][Generated C file:1]] */
 #include <stdlib.h>
 #include <stdint.h>
 /* Sort record: an int16_t key followed by the 32-bit index permuted with it. */
 struct int16_t_comp {
   int16_t    x;
   int32_t i;
 };
 /* Three-way comparison of the int16_t keys found at the start of l and r. */
 int compare_int16_t( const void * l, const void * r )
 {
   const int16_t lhs = *(const int16_t *) l;
   const int16_t rhs = *(const int16_t *) r;
   return (lhs > rhs) - (lhs < rhs);
 }
 /* Ascending sort of A_in[0..isize-1]; iorder receives the same permutation.
  * Nothing is modified when the temporary buffer cannot be allocated. */
 void qsort_int16_t(int16_t* restrict A_in, int32_t* restrict iorder, int32_t isize) {
   struct int16_t_comp* pairs = malloc(isize * sizeof *pairs);
   if (pairs == NULL) {
     return;
   }
   /* Gather keys and indices into records so that they move together. */
   for (int32_t k = 0; k < isize; ++k) {
     pairs[k].x = A_in[k];
     pairs[k].i = iorder[k];
   }
   qsort(pairs, (size_t) isize, sizeof *pairs, compare_int16_t);
   /* Scatter the sorted records back into the caller's arrays. */
   for (int32_t k = 0; k < isize; ++k) {
     A_in[k]   = pairs[k].x;
     iorder[k] = pairs[k].i;
   }
   free(pairs);
 }
 /* Ascending in-place sort of A[0..isize-1] (no index array). */
 void qsort_int16_t_noidx(int16_t* A, int32_t isize) {
   qsort(A, (size_t) isize, sizeof A[0], compare_int16_t);
 }
 /* Sort record: the int16_t key is the first member, so a pointer to the record
  * can be read as a pointer to its key; i is the 64-bit index permuted with it. */
 struct int16_t_comp_big {
   int16_t    x;
   int64_t i;
 };
 /* qsort comparator: orders two records (or bare keys) by their leading int16_t. */
 int compare_int16_t_big( const void * l, const void * r )
 {
   const int16_t * restrict _l= l;
   const int16_t * restrict _r= r;
   if( *_l > *_r ) return 1;
   if( *_l < *_r ) return -1;
   return 0;
 }
 /* Sort A_in[0..isize-1] in ascending order and apply the same permutation to
  * iorder. Returns without modifying the arrays if the scratch buffer cannot
  * be allocated. */
 void qsort_int16_t_big(int16_t* restrict A_in, int64_t* restrict iorder, int64_t isize) {
   struct int16_t_comp_big* A = malloc(isize * sizeof(struct int16_t_comp_big));
   if (A == NULL) return;
   /* 64-bit counter: isize is int64_t, an int counter would overflow for
    * sizes above INT_MAX. */
   for (int64_t i=0 ; i<isize ; ++i) {
     A[i].x = A_in[i];
     A[i].i = iorder[i];
   }
   qsort( (void*) A, (size_t) isize, sizeof(struct int16_t_comp_big), compare_int16_t_big);
   for (int64_t i=0 ; i<isize ; ++i) {
     A_in[i] = A[i].x;
     iorder[i] = A[i].i;
   }
   free(A);
 }
 /* Sort A[0..isize-1] in ascending order, in place. */
 void qsort_int16_t_noidx_big(int16_t* A, int64_t isize) {
   qsort( (void*) A, (size_t) isize, sizeof(int16_t), compare_int16_t_big);
 }
 /* Sort record: an int32_t key followed by the 32-bit index permuted with it. */
 struct int32_t_comp {
   int32_t    x;
   int32_t i;
 };
 /* Three-way comparison of the int32_t keys found at the start of l and r. */
 int compare_int32_t( const void * l, const void * r )
 {
   const int32_t lhs = *(const int32_t *) l;
   const int32_t rhs = *(const int32_t *) r;
   return (lhs > rhs) - (lhs < rhs);
 }
 /* Ascending sort of A_in[0..isize-1]; iorder receives the same permutation.
  * Nothing is modified when the temporary buffer cannot be allocated. */
 void qsort_int32_t(int32_t* restrict A_in, int32_t* restrict iorder, int32_t isize) {
   struct int32_t_comp* pairs = malloc(isize * sizeof *pairs);
   if (pairs == NULL) {
     return;
   }
   /* Gather keys and indices into records so that they move together. */
   for (int32_t k = 0; k < isize; ++k) {
     pairs[k].x = A_in[k];
     pairs[k].i = iorder[k];
   }
   qsort(pairs, (size_t) isize, sizeof *pairs, compare_int32_t);
   /* Scatter the sorted records back into the caller's arrays. */
   for (int32_t k = 0; k < isize; ++k) {
     A_in[k]   = pairs[k].x;
     iorder[k] = pairs[k].i;
   }
   free(pairs);
 }
 /* Ascending in-place sort of A[0..isize-1] (no index array). */
 void qsort_int32_t_noidx(int32_t* A, int32_t isize) {
   qsort(A, (size_t) isize, sizeof A[0], compare_int32_t);
 }
 /* Sort record: the int32_t key is the first member, so a pointer to the record
  * can be read as a pointer to its key; i is the 64-bit index permuted with it. */
 struct int32_t_comp_big {
   int32_t    x;
   int64_t i;
 };
 /* qsort comparator: orders two records (or bare keys) by their leading int32_t. */
 int compare_int32_t_big( const void * l, const void * r )
 {
   const int32_t * restrict _l= l;
   const int32_t * restrict _r= r;
   if( *_l > *_r ) return 1;
   if( *_l < *_r ) return -1;
   return 0;
 }
 /* Sort A_in[0..isize-1] in ascending order and apply the same permutation to
  * iorder. Returns without modifying the arrays if the scratch buffer cannot
  * be allocated. */
 void qsort_int32_t_big(int32_t* restrict A_in, int64_t* restrict iorder, int64_t isize) {
   struct int32_t_comp_big* A = malloc(isize * sizeof(struct int32_t_comp_big));
   if (A == NULL) return;
   /* 64-bit counter: isize is int64_t, an int counter would overflow for
    * sizes above INT_MAX. */
   for (int64_t i=0 ; i<isize ; ++i) {
     A[i].x = A_in[i];
     A[i].i = iorder[i];
   }
   qsort( (void*) A, (size_t) isize, sizeof(struct int32_t_comp_big), compare_int32_t_big);
   for (int64_t i=0 ; i<isize ; ++i) {
     A_in[i] = A[i].x;
     iorder[i] = A[i].i;
   }
   free(A);
 }
 /* Sort A[0..isize-1] in ascending order, in place. */
 void qsort_int32_t_noidx_big(int32_t* A, int64_t isize) {
   qsort( (void*) A, (size_t) isize, sizeof(int32_t), compare_int32_t_big);
 }
 /* Sort record: an int64_t key followed by the 32-bit index permuted with it. */
 struct int64_t_comp {
   int64_t    x;
   int32_t i;
 };
 /* Three-way comparison of the int64_t keys found at the start of l and r. */
 int compare_int64_t( const void * l, const void * r )
 {
   const int64_t lhs = *(const int64_t *) l;
   const int64_t rhs = *(const int64_t *) r;
   return (lhs > rhs) - (lhs < rhs);
 }
 /* Ascending sort of A_in[0..isize-1]; iorder receives the same permutation.
  * Nothing is modified when the temporary buffer cannot be allocated. */
 void qsort_int64_t(int64_t* restrict A_in, int32_t* restrict iorder, int32_t isize) {
   struct int64_t_comp* pairs = malloc(isize * sizeof *pairs);
   if (pairs == NULL) {
     return;
   }
   /* Gather keys and indices into records so that they move together. */
   for (int32_t k = 0; k < isize; ++k) {
     pairs[k].x = A_in[k];
     pairs[k].i = iorder[k];
   }
   qsort(pairs, (size_t) isize, sizeof *pairs, compare_int64_t);
   /* Scatter the sorted records back into the caller's arrays. */
   for (int32_t k = 0; k < isize; ++k) {
     A_in[k]   = pairs[k].x;
     iorder[k] = pairs[k].i;
   }
   free(pairs);
 }
 /* Ascending in-place sort of A[0..isize-1] (no index array). */
 void qsort_int64_t_noidx(int64_t* A, int32_t isize) {
   qsort(A, (size_t) isize, sizeof A[0], compare_int64_t);
 }
 /* Sort record: the int64_t key is the first member, so a pointer to the record
  * can be read as a pointer to its key; i is the 64-bit index permuted with it. */
 struct int64_t_comp_big {
   int64_t    x;
   int64_t i;
 };
 /* qsort comparator: orders two records (or bare keys) by their leading int64_t. */
 int compare_int64_t_big( const void * l, const void * r )
 {
   const int64_t * restrict _l= l;
   const int64_t * restrict _r= r;
   if( *_l > *_r ) return 1;
   if( *_l < *_r ) return -1;
   return 0;
 }
 /* Sort A_in[0..isize-1] in ascending order and apply the same permutation to
  * iorder. Returns without modifying the arrays if the scratch buffer cannot
  * be allocated. */
 void qsort_int64_t_big(int64_t* restrict A_in, int64_t* restrict iorder, int64_t isize) {
   struct int64_t_comp_big* A = malloc(isize * sizeof(struct int64_t_comp_big));
   if (A == NULL) return;
   /* 64-bit counter: isize is int64_t, an int counter would overflow for
    * sizes above INT_MAX. */
   for (int64_t i=0 ; i<isize ; ++i) {
     A[i].x = A_in[i];
     A[i].i = iorder[i];
   }
   qsort( (void*) A, (size_t) isize, sizeof(struct int64_t_comp_big), compare_int64_t_big);
   for (int64_t i=0 ; i<isize ; ++i) {
     A_in[i] = A[i].x;
     iorder[i] = A[i].i;
   }
   free(A);
 }
 /* Sort A[0..isize-1] in ascending order, in place. */
 void qsort_int64_t_noidx_big(int64_t* A, int64_t isize) {
   qsort( (void*) A, (size_t) isize, sizeof(int64_t), compare_int64_t_big);
 }
 /* Sort record: a double key followed by the 32-bit index permuted with it. */
 struct double_comp {
   double    x;
   int32_t i;
 };
 /* Three-way comparison of the double keys found at the start of l and r. */
 int compare_double( const void * l, const void * r )
 {
   const double lhs = *(const double *) l;
   const double rhs = *(const double *) r;
   return (lhs > rhs) - (lhs < rhs);
 }
 /* Ascending sort of A_in[0..isize-1]; iorder receives the same permutation.
  * Nothing is modified when the temporary buffer cannot be allocated. */
 void qsort_double(double* restrict A_in, int32_t* restrict iorder, int32_t isize) {
   struct double_comp* pairs = malloc(isize * sizeof *pairs);
   if (pairs == NULL) {
     return;
   }
   /* Gather keys and indices into records so that they move together. */
   for (int32_t k = 0; k < isize; ++k) {
     pairs[k].x = A_in[k];
     pairs[k].i = iorder[k];
   }
   qsort(pairs, (size_t) isize, sizeof *pairs, compare_double);
   /* Scatter the sorted records back into the caller's arrays. */
   for (int32_t k = 0; k < isize; ++k) {
     A_in[k]   = pairs[k].x;
     iorder[k] = pairs[k].i;
   }
   free(pairs);
 }
 /* Ascending in-place sort of A[0..isize-1] (no index array). */
 void qsort_double_noidx(double* A, int32_t isize) {
   qsort(A, (size_t) isize, sizeof A[0], compare_double);
 }
 /* Sort record: the double key is the first member, so a pointer to the record
  * can be read as a pointer to its key; i is the 64-bit index permuted with it. */
 struct double_comp_big {
   double    x;
   int64_t i;
 };
 /* qsort comparator: orders two records (or bare keys) by their leading double. */
 int compare_double_big( const void * l, const void * r )
 {
   const double * restrict _l= l;
   const double * restrict _r= r;
   if( *_l > *_r ) return 1;
   if( *_l < *_r ) return -1;
   return 0;
 }
 /* Sort A_in[0..isize-1] in ascending order and apply the same permutation to
  * iorder. Returns without modifying the arrays if the scratch buffer cannot
  * be allocated. */
 void qsort_double_big(double* restrict A_in, int64_t* restrict iorder, int64_t isize) {
   struct double_comp_big* A = malloc(isize * sizeof(struct double_comp_big));
   if (A == NULL) return;
   /* 64-bit counter: isize is int64_t, an int counter would overflow for
    * sizes above INT_MAX. */
   for (int64_t i=0 ; i<isize ; ++i) {
     A[i].x = A_in[i];
     A[i].i = iorder[i];
   }
   qsort( (void*) A, (size_t) isize, sizeof(struct double_comp_big), compare_double_big);
   for (int64_t i=0 ; i<isize ; ++i) {
     A_in[i] = A[i].x;
     iorder[i] = A[i].i;
   }
   free(A);
 }
 /* Sort A[0..isize-1] in ascending order, in place. */
 void qsort_double_noidx_big(double* A, int64_t isize) {
   qsort( (void*) A, (size_t) isize, sizeof(double), compare_double_big);
 }
 /* Sort record: a float key followed by the 32-bit index permuted with it. */
 struct float_comp {
   float    x;
   int32_t i;
 };
 /* Three-way comparison of the float keys found at the start of l and r. */
 int compare_float( const void * l, const void * r )
 {
   const float lhs = *(const float *) l;
   const float rhs = *(const float *) r;
   return (lhs > rhs) - (lhs < rhs);
 }
 /* Ascending sort of A_in[0..isize-1]; iorder receives the same permutation.
  * Nothing is modified when the temporary buffer cannot be allocated. */
 void qsort_float(float* restrict A_in, int32_t* restrict iorder, int32_t isize) {
   struct float_comp* pairs = malloc(isize * sizeof *pairs);
   if (pairs == NULL) {
     return;
   }
   /* Gather keys and indices into records so that they move together. */
   for (int32_t k = 0; k < isize; ++k) {
     pairs[k].x = A_in[k];
     pairs[k].i = iorder[k];
   }
   qsort(pairs, (size_t) isize, sizeof *pairs, compare_float);
   /* Scatter the sorted records back into the caller's arrays. */
   for (int32_t k = 0; k < isize; ++k) {
     A_in[k]   = pairs[k].x;
     iorder[k] = pairs[k].i;
   }
   free(pairs);
 }
 /* Ascending in-place sort of A[0..isize-1] (no index array). */
 void qsort_float_noidx(float* A, int32_t isize) {
   qsort(A, (size_t) isize, sizeof A[0], compare_float);
 }
 /* Sort record: the float key is the first member, so a pointer to the record
  * can be read as a pointer to its key; i is the 64-bit index permuted with it. */
 struct float_comp_big {
   float    x;
   int64_t i;
 };
 /* qsort comparator: orders two records (or bare keys) by their leading float. */
 int compare_float_big( const void * l, const void * r )
 {
   const float * restrict _l= l;
   const float * restrict _r= r;
   if( *_l > *_r ) return 1;
   if( *_l < *_r ) return -1;
   return 0;
 }
 /* Sort A_in[0..isize-1] in ascending order and apply the same permutation to
  * iorder. Returns without modifying the arrays if the scratch buffer cannot
  * be allocated. */
 void qsort_float_big(float* restrict A_in, int64_t* restrict iorder, int64_t isize) {
   struct float_comp_big* A = malloc(isize * sizeof(struct float_comp_big));
   if (A == NULL) return;
   /* 64-bit counter: isize is int64_t, an int counter would overflow for
    * sizes above INT_MAX. */
   for (int64_t i=0 ; i<isize ; ++i) {
     A[i].x = A_in[i];
     A[i].i = iorder[i];
   }
   qsort( (void*) A, (size_t) isize, sizeof(struct float_comp_big), compare_float_big);
   for (int64_t i=0 ; i<isize ; ++i) {
     A_in[i] = A[i].x;
     iorder[i] = A[i].i;
   }
   free(A);
 }
 /* Sort A[0..isize-1] in ascending order, in place. */
 void qsort_float_noidx_big(float* A, int64_t isize) {
   qsort( (void*) A, (size_t) isize, sizeof(float), compare_float_big);
 }
 /* Generated C file:1 ends here */
--- a/src/utils/qsort.org
+++ b/src/utils/qsort.org
@ -0,0 +1,169 @@
 #+TITLE: Quick sort binding for Fortran
 * C template
 #+NAME: c_template
 #+BEGIN_SRC c
 /* Record sorted by qsort_TYPE_big: the TYPE key comes first, so a pointer to the
  * record can be read as a pointer to its key; i is the index permuted with it. */
 struct TYPE_comp_big {
   TYPE    x;
   int32_t i;
 };
 /* qsort comparator: orders two records (or bare keys) by their leading TYPE. */
 int compare_TYPE_big( const void * l, const void * r )
 {
   const TYPE * restrict _l= l;
   const TYPE * restrict _r= r;
   if( *_l > *_r ) return 1;
   if( *_l < *_r ) return -1;
   return 0;
 }
 /* Sort A_in[0..isize-1] in ascending order and apply the same permutation to
  * iorder. Returns without modifying the arrays if the scratch buffer cannot
  * be allocated. */
 void qsort_TYPE_big(TYPE* restrict A_in, int32_t* restrict iorder, int32_t isize) {
   struct TYPE_comp_big* A = malloc(isize * sizeof(struct TYPE_comp_big));
   if (A == NULL) return;
   /* The loop counter has the same integer type as isize, so the generator
    * widens both together and the counter cannot overflow before isize. */
   for (int32_t i=0 ; i<isize ; ++i) {
     A[i].x = A_in[i];
     A[i].i = iorder[i];
   }
   qsort( (void*) A, (size_t) isize, sizeof(struct TYPE_comp_big), compare_TYPE_big);
   for (int32_t i=0 ; i<isize ; ++i) {
     A_in[i] = A[i].x;
     iorder[i] = A[i].i;
   }
   free(A);
 }
 /* Sort A[0..isize-1] in ascending order, in place. */
 void qsort_TYPE_noidx_big(TYPE* A, int32_t isize) {
   qsort( (void*) A, (size_t) isize, sizeof(TYPE), compare_TYPE_big);
 }
 #+END_SRC
 * Fortran template
 #+NAME:f_template
 #+BEGIN_SRC f90
 ! Template text: the generator script rewrites the routine-name prefix, the key type and the index kind.
 ! Interface body bound to the C routine qsort_TYPE_big (keys A, companion array iorder, isize elements).
 subroutine Lsort_big_c(A, iorder, isize) bind(C, name="qsort_TYPE_big")
  use iso_c_binding
  integer(c_int32_t), value :: isize
  integer(c_int32_t)        :: iorder(isize)
  real   (c_TYPE)         :: A(isize)
 end subroutine Lsort_big_c
 ! Interface body bound to the C routine qsort_TYPE_noidx_big (keys A only, isize elements).
 subroutine Lsort_noidx_big_c(A, isize) bind(C, name="qsort_TYPE_noidx_big")
  use iso_c_binding
  integer(c_int32_t), value :: isize
  real   (c_TYPE)         :: A(isize)
 end subroutine Lsort_noidx_big_c
 #+END_SRC
 #+NAME:f_template2
 #+BEGIN_SRC f90
 ! Template text: the generator script rewrites the routine-name prefix, the key type and the index kind.
 ! Fortran-callable wrapper: forwards A, iorder and isize to the bind(C) interface taken from qsort_module.
 subroutine Lsort_big(A, iorder, isize) 
  use qsort_module
  use iso_c_binding
  integer(c_int32_t)        :: isize
  integer(c_int32_t)        :: iorder(isize)
  real   (c_TYPE)         :: A(isize)
  call Lsort_big_c(A, iorder, isize)
 end subroutine Lsort_big
 ! Fortran-callable wrapper without index array: forwards A and isize to the bind(C) interface.
 subroutine Lsort_noidx_big(A, isize)
  use iso_c_binding
  use qsort_module
  integer(c_int32_t) :: isize
  real   (c_TYPE)    :: A(isize)
  call Lsort_noidx_big_c(A, isize)
 end subroutine Lsort_noidx_big
 #+END_SRC
 * Python scripts for type replacements
 #+NAME: replaced
 #+begin_src python :results output :noweb yes
 # Expand the C template twice per key type: first with the "_big" suffix
 # stripped (int32_t sizes/indices kept), then with int32_t turned into int64_t.
 data = """
 <<c_template>>
 """
 key_types = ("int16_t", "int32_t", "int64_t", "double", "float")
 for key_type in key_types:
    small_variant = data.replace("TYPE", key_type).replace("_big", "")
    large_variant = data.replace("int32_t", "int64_t").replace("TYPE", key_type)
    print( small_variant )
    print( large_variant )
 #+end_src
 #+NAME: replaced_f
 #+begin_src python :results output :noweb yes
 # Expand the Fortran interface template twice per key type: first with the
 # "_big" suffix stripped, then with int32_t turned into int64_t.
 data = """
 <<f_template>>
 """
 # (C key type, routine-name prefix substituted for "L", Fortran base type substituted for "real")
 variants = (
    ("int16_t", "i2", "integer"),
    ("int32_t", "i",  "integer"),
    ("int64_t", "i8", "integer"),
    ("double",  "d",  "real"),
    ("float",   "",   "real"),
 )
 for key_type, prefix, base_type in variants:
    text = data.replace("real", base_type).replace("L", prefix)
    print( text.replace("TYPE", key_type).replace("_big", "") )
    print( text.replace("int32_t", "int64_t").replace("TYPE", key_type) )
 #+end_src
 #+NAME: replaced_f2
 #+begin_src python :results output :noweb yes
 # Expand the Fortran wrapper template twice per key type: first with the
 # "_big" suffix stripped, then with int32_t turned into int64_t.
 data = """
 <<f_template2>>
 """
 # (C key type, routine-name prefix substituted for "L", Fortran base type substituted for "real")
 variants = (
    ("int16_t", "i2", "integer"),
    ("int32_t", "i",  "integer"),
    ("int64_t", "i8", "integer"),
    ("double",  "d",  "real"),
    ("float",   "",   "real"),
 )
 for key_type, prefix, base_type in variants:
    text = data.replace("real", base_type).replace("L", prefix)
    print( text.replace("TYPE", key_type).replace("_big", "") )
    print( text.replace("int32_t", "int64_t").replace("TYPE", key_type) )
 #+end_src
 * Generated C file
 #+BEGIN_SRC c :comments link :tangle qsort.c :noweb yes
 #include <stdlib.h>
 #include <stdint.h>
 <<replaced()>>
 #+END_SRC
 * Generated Fortran file
 #+BEGIN_SRC f90 :tangle qsort_module.f90 :noweb yes
 module qsort_module
  use iso_c_binding
  interface
     <<replaced_f()>>
  end interface
 end module qsort_module
 <<replaced_f2()>>
 #+END_SRC
--- a/src/utils/qsort_module.f90
+++ b/src/utils/qsort_module.f90
@ -0,0 +1,347 @@
 module qsort_module
  ! Explicit interfaces to C routines bound by name (qsort_<ctype>...).
  ! For every element type there are four entry points:
  !   <p>sort_c            : A + iorder, 32-bit isize/iorder
  !   <p>sort_noidx_c      : A only,     32-bit isize
  !   <p>sort_big_c        : A + iorder, 64-bit isize/iorder
  !   <p>sort_noidx_big_c  : A only,     64-bit isize
  ! isize is always passed by value.
  use iso_c_binding
  interface
     ! ---- A is integer(c_int16_t) ----
     subroutine i2sort_c(A, iorder, isize) bind(C, name="qsort_int16_t")
       import :: c_int16_t, c_int32_t
       integer(c_int32_t), value :: isize
       integer(c_int32_t)        :: iorder(isize)
       integer(c_int16_t)        :: A(isize)
     end subroutine i2sort_c
     subroutine i2sort_noidx_c(A, isize) bind(C, name="qsort_int16_t_noidx")
       import :: c_int16_t, c_int32_t
       integer(c_int32_t), value :: isize
       integer(c_int16_t)        :: A(isize)
     end subroutine i2sort_noidx_c
     subroutine i2sort_big_c(A, iorder, isize) bind(C, name="qsort_int16_t_big")
       import :: c_int16_t, c_int64_t
       integer(c_int64_t), value :: isize
       integer(c_int64_t)        :: iorder(isize)
       integer(c_int16_t)        :: A(isize)
     end subroutine i2sort_big_c
     subroutine i2sort_noidx_big_c(A, isize) bind(C, name="qsort_int16_t_noidx_big")
       import :: c_int16_t, c_int64_t
       integer(c_int64_t), value :: isize
       integer(c_int16_t)        :: A(isize)
     end subroutine i2sort_noidx_big_c
     ! ---- A is integer(c_int32_t) ----
     subroutine isort_c(A, iorder, isize) bind(C, name="qsort_int32_t")
       import :: c_int32_t
       integer(c_int32_t), value :: isize
       integer(c_int32_t)        :: iorder(isize)
       integer(c_int32_t)        :: A(isize)
     end subroutine isort_c
     subroutine isort_noidx_c(A, isize) bind(C, name="qsort_int32_t_noidx")
       import :: c_int32_t
       integer(c_int32_t), value :: isize
       integer(c_int32_t)        :: A(isize)
     end subroutine isort_noidx_c
     subroutine isort_big_c(A, iorder, isize) bind(C, name="qsort_int32_t_big")
       import :: c_int32_t, c_int64_t
       integer(c_int64_t), value :: isize
       integer(c_int64_t)        :: iorder(isize)
       integer(c_int32_t)        :: A(isize)
     end subroutine isort_big_c
     subroutine isort_noidx_big_c(A, isize) bind(C, name="qsort_int32_t_noidx_big")
       import :: c_int32_t, c_int64_t
       integer(c_int64_t), value :: isize
       integer(c_int32_t)        :: A(isize)
     end subroutine isort_noidx_big_c
     ! ---- A is integer(c_int64_t) ----
     subroutine i8sort_c(A, iorder, isize) bind(C, name="qsort_int64_t")
       import :: c_int32_t, c_int64_t
       integer(c_int32_t), value :: isize
       integer(c_int32_t)        :: iorder(isize)
       integer(c_int64_t)        :: A(isize)
     end subroutine i8sort_c
     subroutine i8sort_noidx_c(A, isize) bind(C, name="qsort_int64_t_noidx")
       import :: c_int32_t, c_int64_t
       integer(c_int32_t), value :: isize
       integer(c_int64_t)        :: A(isize)
     end subroutine i8sort_noidx_c
     subroutine i8sort_big_c(A, iorder, isize) bind(C, name="qsort_int64_t_big")
       import :: c_int64_t
       integer(c_int64_t), value :: isize
       integer(c_int64_t)        :: iorder(isize)
       integer(c_int64_t)        :: A(isize)
     end subroutine i8sort_big_c
     subroutine i8sort_noidx_big_c(A, isize) bind(C, name="qsort_int64_t_noidx_big")
       import :: c_int64_t
       integer(c_int64_t), value :: isize
       integer(c_int64_t)        :: A(isize)
     end subroutine i8sort_noidx_big_c
     ! ---- A is real(c_double) ----
     subroutine dsort_c(A, iorder, isize) bind(C, name="qsort_double")
       import :: c_double, c_int32_t
       integer(c_int32_t), value :: isize
       integer(c_int32_t)        :: iorder(isize)
       real(c_double)            :: A(isize)
     end subroutine dsort_c
     subroutine dsort_noidx_c(A, isize) bind(C, name="qsort_double_noidx")
       import :: c_double, c_int32_t
       integer(c_int32_t), value :: isize
       real(c_double)            :: A(isize)
     end subroutine dsort_noidx_c
     subroutine dsort_big_c(A, iorder, isize) bind(C, name="qsort_double_big")
       import :: c_double, c_int64_t
       integer(c_int64_t), value :: isize
       integer(c_int64_t)        :: iorder(isize)
       real(c_double)            :: A(isize)
     end subroutine dsort_big_c
     subroutine dsort_noidx_big_c(A, isize) bind(C, name="qsort_double_noidx_big")
       import :: c_double, c_int64_t
       integer(c_int64_t), value :: isize
       real(c_double)            :: A(isize)
     end subroutine dsort_noidx_big_c
     ! ---- A is real(c_float) ----
     subroutine sort_c(A, iorder, isize) bind(C, name="qsort_float")
       import :: c_float, c_int32_t
       integer(c_int32_t), value :: isize
       integer(c_int32_t)        :: iorder(isize)
       real(c_float)             :: A(isize)
     end subroutine sort_c
     subroutine sort_noidx_c(A, isize) bind(C, name="qsort_float_noidx")
       import :: c_float, c_int32_t
       integer(c_int32_t), value :: isize
       real(c_float)             :: A(isize)
     end subroutine sort_noidx_c
     subroutine sort_big_c(A, iorder, isize) bind(C, name="qsort_float_big")
       import :: c_float, c_int64_t
       integer(c_int64_t), value :: isize
       integer(c_int64_t)        :: iorder(isize)
       real(c_float)             :: A(isize)
     end subroutine sort_big_c
     subroutine sort_noidx_big_c(A, isize) bind(C, name="qsort_float_noidx_big")
       import :: c_float, c_int64_t
       integer(c_int64_t), value :: isize
       real(c_float)             :: A(isize)
     end subroutine sort_noidx_big_c
  end interface
 end module qsort_module
 ! Forwards A, iorder and isize to i2sort_c, which qsort_module binds to the
 ! C symbol qsort_int16_t. isize is the extent of both arrays.
 subroutine i2sort(A, iorder, isize)
  use, intrinsic :: iso_c_binding, only: c_int16_t, c_int32_t
  use qsort_module, only: i2sort_c
  implicit none
  integer(c_int32_t), intent(in)    :: isize
  integer(c_int32_t), intent(inout) :: iorder(isize)
  integer(c_int16_t), intent(inout) :: A(isize)
  call i2sort_c(A, iorder, isize)
 end subroutine i2sort
 ! Forwards A and isize to i2sort_noidx_c, which qsort_module binds to the
 ! C symbol qsort_int16_t_noidx. isize is the extent of A.
 subroutine i2sort_noidx(A, isize)
  use, intrinsic :: iso_c_binding, only: c_int16_t, c_int32_t
  use qsort_module, only: i2sort_noidx_c
  implicit none
  integer(c_int32_t), intent(in)    :: isize
  integer(c_int16_t), intent(inout) :: A(isize)
  call i2sort_noidx_c(A, isize)
 end subroutine i2sort_noidx
 ! Forwards A, iorder and isize to i2sort_big_c, which qsort_module binds to
 ! the C symbol qsort_int16_t_big. isize and iorder are 64-bit integers.
 subroutine i2sort_big(A, iorder, isize)
  use, intrinsic :: iso_c_binding, only: c_int16_t, c_int64_t
  use qsort_module, only: i2sort_big_c
  implicit none
  integer(c_int64_t), intent(in)    :: isize
  integer(c_int64_t), intent(inout) :: iorder(isize)
  integer(c_int16_t), intent(inout) :: A(isize)
  call i2sort_big_c(A, iorder, isize)
 end subroutine i2sort_big
 ! Forwards A and isize to i2sort_noidx_big_c, which qsort_module binds to
 ! the C symbol qsort_int16_t_noidx_big. isize is a 64-bit integer.
 subroutine i2sort_noidx_big(A, isize)
  use, intrinsic :: iso_c_binding, only: c_int16_t, c_int64_t
  use qsort_module, only: i2sort_noidx_big_c
  implicit none
  integer(c_int64_t), intent(in)    :: isize
  integer(c_int16_t), intent(inout) :: A(isize)
  call i2sort_noidx_big_c(A, isize)
 end subroutine i2sort_noidx_big
 ! Forwards A, iorder and isize to isort_c, which qsort_module binds to the
 ! C symbol qsort_int32_t. isize is the extent of both arrays.
 subroutine isort(A, iorder, isize)
  use, intrinsic :: iso_c_binding, only: c_int32_t
  use qsort_module, only: isort_c
  implicit none
  integer(c_int32_t), intent(in)    :: isize
  integer(c_int32_t), intent(inout) :: iorder(isize)
  integer(c_int32_t), intent(inout) :: A(isize)
  call isort_c(A, iorder, isize)
 end subroutine isort
 ! Forwards A and isize to isort_noidx_c, which qsort_module binds to the
 ! C symbol qsort_int32_t_noidx. isize is the extent of A.
 subroutine isort_noidx(A, isize)
  use, intrinsic :: iso_c_binding, only: c_int32_t
  use qsort_module, only: isort_noidx_c
  implicit none
  integer(c_int32_t), intent(in)    :: isize
  integer(c_int32_t), intent(inout) :: A(isize)
  call isort_noidx_c(A, isize)
 end subroutine isort_noidx
 ! Forwards A, iorder and isize to isort_big_c, which qsort_module binds to
 ! the C symbol qsort_int32_t_big. isize and iorder are 64-bit integers.
 subroutine isort_big(A, iorder, isize)
  use, intrinsic :: iso_c_binding, only: c_int32_t, c_int64_t
  use qsort_module, only: isort_big_c
  implicit none
  integer(c_int64_t), intent(in)    :: isize
  integer(c_int64_t), intent(inout) :: iorder(isize)
  integer(c_int32_t), intent(inout) :: A(isize)
  call isort_big_c(A, iorder, isize)
 end subroutine isort_big
 ! Forwards A and isize to isort_noidx_big_c, which qsort_module binds to
 ! the C symbol qsort_int32_t_noidx_big. isize is a 64-bit integer.
 subroutine isort_noidx_big(A, isize)
  use, intrinsic :: iso_c_binding, only: c_int32_t, c_int64_t
  use qsort_module, only: isort_noidx_big_c
  implicit none
  integer(c_int64_t), intent(in)    :: isize
  integer(c_int32_t), intent(inout) :: A(isize)
  call isort_noidx_big_c(A, isize)
 end subroutine isort_noidx_big
 ! Forwards A, iorder and isize to i8sort_c, which qsort_module binds to the
 ! C symbol qsort_int64_t. isize and iorder are 32-bit, A is 64-bit.
 subroutine i8sort(A, iorder, isize)
  use, intrinsic :: iso_c_binding, only: c_int32_t, c_int64_t
  use qsort_module, only: i8sort_c
  implicit none
  integer(c_int32_t), intent(in)    :: isize
  integer(c_int32_t), intent(inout) :: iorder(isize)
  integer(c_int64_t), intent(inout) :: A(isize)
  call i8sort_c(A, iorder, isize)
 end subroutine i8sort
 ! Forwards A and isize to i8sort_noidx_c, which qsort_module binds to the
 ! C symbol qsort_int64_t_noidx. isize is 32-bit, A is 64-bit.
 subroutine i8sort_noidx(A, isize)
  use, intrinsic :: iso_c_binding, only: c_int32_t, c_int64_t
  use qsort_module, only: i8sort_noidx_c
  implicit none
  integer(c_int32_t), intent(in)    :: isize
  integer(c_int64_t), intent(inout) :: A(isize)
  call i8sort_noidx_c(A, isize)
 end subroutine i8sort_noidx
 ! Forwards A, iorder and isize to i8sort_big_c, which qsort_module binds to
 ! the C symbol qsort_int64_t_big. All three arguments are 64-bit integers.
 subroutine i8sort_big(A, iorder, isize)
  use, intrinsic :: iso_c_binding, only: c_int64_t
  use qsort_module, only: i8sort_big_c
  implicit none
  integer(c_int64_t), intent(in)    :: isize
  integer(c_int64_t), intent(inout) :: iorder(isize)
  integer(c_int64_t), intent(inout) :: A(isize)
  call i8sort_big_c(A, iorder, isize)
 end subroutine i8sort_big
 ! Forwards A and isize to i8sort_noidx_big_c, which qsort_module binds to
 ! the C symbol qsort_int64_t_noidx_big. Both arguments are 64-bit integers.
 subroutine i8sort_noidx_big(A, isize)
  use, intrinsic :: iso_c_binding, only: c_int64_t
  use qsort_module, only: i8sort_noidx_big_c
  implicit none
  integer(c_int64_t), intent(in)    :: isize
  integer(c_int64_t), intent(inout) :: A(isize)
  call i8sort_noidx_big_c(A, isize)
 end subroutine i8sort_noidx_big
 ! Forwards A, iorder and isize to dsort_c, which qsort_module binds to the
 ! C symbol qsort_double. isize is the extent of both arrays.
 subroutine dsort(A, iorder, isize)
  use, intrinsic :: iso_c_binding, only: c_double, c_int32_t
  use qsort_module, only: dsort_c
  implicit none
  integer(c_int32_t), intent(in)    :: isize
  integer(c_int32_t), intent(inout) :: iorder(isize)
  real(c_double),     intent(inout) :: A(isize)
  call dsort_c(A, iorder, isize)
 end subroutine dsort
 ! Forwards A and isize to dsort_noidx_c, which qsort_module binds to the
 ! C symbol qsort_double_noidx. isize is the extent of A.
 subroutine dsort_noidx(A, isize)
  use, intrinsic :: iso_c_binding, only: c_double, c_int32_t
  use qsort_module, only: dsort_noidx_c
  implicit none
  integer(c_int32_t), intent(in)    :: isize
  real(c_double),     intent(inout) :: A(isize)
  call dsort_noidx_c(A, isize)
 end subroutine dsort_noidx
 ! Forwards A, iorder and isize to dsort_big_c, which qsort_module binds to
 ! the C symbol qsort_double_big. isize and iorder are 64-bit integers.
 subroutine dsort_big(A, iorder, isize)
  use, intrinsic :: iso_c_binding, only: c_double, c_int64_t
  use qsort_module, only: dsort_big_c
  implicit none
  integer(c_int64_t), intent(in)    :: isize
  integer(c_int64_t), intent(inout) :: iorder(isize)
  real(c_double),     intent(inout) :: A(isize)
  call dsort_big_c(A, iorder, isize)
 end subroutine dsort_big
 ! Forwards A and isize to dsort_noidx_big_c, which qsort_module binds to
 ! the C symbol qsort_double_noidx_big. isize is a 64-bit integer.
 subroutine dsort_noidx_big(A, isize)
  use, intrinsic :: iso_c_binding, only: c_double, c_int64_t
  use qsort_module, only: dsort_noidx_big_c
  implicit none
  integer(c_int64_t), intent(in)    :: isize
  real(c_double),     intent(inout) :: A(isize)
  call dsort_noidx_big_c(A, isize)
 end subroutine dsort_noidx_big
 ! Forwards A, iorder and isize to sort_c, which qsort_module binds to the
 ! C symbol qsort_float. isize is the extent of both arrays.
 subroutine sort(A, iorder, isize)
  use, intrinsic :: iso_c_binding, only: c_float, c_int32_t
  use qsort_module, only: sort_c
  implicit none
  integer(c_int32_t), intent(in)    :: isize
  integer(c_int32_t), intent(inout) :: iorder(isize)
  real(c_float),      intent(inout) :: A(isize)
  call sort_c(A, iorder, isize)
 end subroutine sort
 ! Forwards A and isize to sort_noidx_c, which qsort_module binds to the
 ! C symbol qsort_float_noidx. isize is the extent of A.
 subroutine sort_noidx(A, isize)
  use, intrinsic :: iso_c_binding, only: c_float, c_int32_t
  use qsort_module, only: sort_noidx_c
  implicit none
  integer(c_int32_t), intent(in)    :: isize
  real(c_float),      intent(inout) :: A(isize)
  call sort_noidx_c(A, isize)
 end subroutine sort_noidx
 ! Forwards A, iorder and isize to sort_big_c, which qsort_module binds to
 ! the C symbol qsort_float_big. isize and iorder are 64-bit integers.
 subroutine sort_big(A, iorder, isize)
  use, intrinsic :: iso_c_binding, only: c_float, c_int64_t
  use qsort_module, only: sort_big_c
  implicit none
  integer(c_int64_t), intent(in)    :: isize
  integer(c_int64_t), intent(inout) :: iorder(isize)
  real(c_float),      intent(inout) :: A(isize)
  call sort_big_c(A, iorder, isize)
 end subroutine sort_big
 ! Forwards A and isize to sort_noidx_big_c, which qsort_module binds to
 ! the C symbol qsort_float_noidx_big. isize is a 64-bit integer.
 subroutine sort_noidx_big(A, isize)
  use, intrinsic :: iso_c_binding, only: c_float, c_int64_t
  use qsort_module, only: sort_noidx_big_c
  implicit none
  integer(c_int64_t), intent(in)    :: isize
  real(c_float),      intent(inout) :: A(isize)
  call sort_noidx_big_c(A, isize)
 end subroutine sort_noidx_big
--- a/src/utils/sort.irp.f
+++ b/src/utils/sort.irp.f
@ -1,222 +1,4 @@
 BEGIN_TEMPLATE
 subroutine insertion_$Xsort (x,iorder,isize)
  implicit none
  BEGIN_DOC
  ! Sort array x(isize) in ascending order using the insertion sort algorithm.
  ! iorder in input should be (1,2,3,...,isize), and in output
  ! contains the new order of the elements.
  ! Elements that compare equal keep their relative order, because the
  ! shifting stops as soon as x(j) <= the inserted value.
  END_DOC
  integer,intent(in)             :: isize
  $type,intent(inout)            :: x(isize)
  integer,intent(inout)          :: iorder(isize)
  $type                          :: xtmp
  integer                        :: i, i0, j
  do i=2,isize
    ! Value (and its index) to insert into the sorted prefix x(1:i-1)
    xtmp = x(i)
    i0 = iorder(i)
    j=i-1
    ! The bound test and the comparison are separate statements so that
    ! x(0) is never referenced.
    do while (j>0)
      if ((x(j) <= xtmp)) exit
      ! Shift the larger element one slot to the right
      x(j+1) = x(j)
      iorder(j+1) = iorder(j)
      j=j-1
    enddo
    x(j+1) = xtmp
    iorder(j+1) = i0
  enddo
 end subroutine insertion_$Xsort
 subroutine quick_$Xsort(x, iorder, isize)
  implicit none
  BEGIN_DOC
  ! Sort array x(isize) using the quicksort algorithm.
  ! iorder in input should be (1,2,3,...,isize), and in output
  ! contains the new order of the elements.
  END_DOC
  integer,intent(in)             :: isize
  $type,intent(inout)            :: x(isize)
  integer,intent(inout)          :: iorder(isize)
  ! The whole range 1..isize is sorted. nproc is not declared in this
  ! routine (presumably an IRPF90 provider -- confirm); it only seeds the
  ! "level" argument of the recursive kernel.
  call rec_$X_quicksort(x,iorder,isize,1,isize,nproc)
 end subroutine quick_$Xsort
 recursive subroutine rec_$X_quicksort(x, iorder, isize, first, last, level)
  implicit none
  ! Recursive kernel of quick_$Xsort: sorts x(first:last) in ascending order
  ! and applies every swap to iorder as well.
  !   isize       : declared extent of x and iorder
  !   first, last : bounds of the range to sort (callers pass first < last)
  !   level       : halved at every recursion; it no longer influences the
  !                 work done, the argument is kept for interface stability
  integer, intent(in)            :: isize, first, last, level
  integer,intent(inout)          :: iorder(isize)
  $type, intent(inout)           :: x(isize)
  $type                          :: c, tmp
  integer                        :: itmp
  integer                        :: i, j
  if(isize<2)return
  ! Pivot = middle element of the range. first+(last-first)/2 gives the same
  ! index as (first+last)/2 but cannot overflow for very large ranges.
  c = x( first + (last-first)/2 )
  i = first
  j = last
  do
    ! Skip elements that are already on the correct side of the pivot
    do while (x(i) < c)
      i=i+1
    end do
    do while (c < x(j))
      j=j-1
    end do
    if (i >= j) exit
    ! Exchange the misplaced pair, keeping iorder in step with x
    tmp  = x(i)
    x(i) = x(j)
    x(j) = tmp
    itmp      = iorder(i)
    iorder(i) = iorder(j)
    iorder(j) = itmp
    i=i+1
    j=j-1
  enddo
  ! Recurse on each side that still holds at least two elements.
  if (first < i-1) then
    call rec_$X_quicksort(x, iorder, isize, first, i-1,level/2)
  endif
  if (j+1 < last) then
    call rec_$X_quicksort(x, iorder, isize, j+1, last,level/2)
  endif
 end subroutine rec_$X_quicksort
 subroutine heap_$Xsort(x,iorder,isize)
  implicit none
  BEGIN_DOC
  ! Sort array x(isize) using the heap sort algorithm.
  ! iorder in input should be (1,2,3,...,isize), and in output
  ! contains the new order of the elements.
  ! Arrays with fewer than two elements are returned untouched.
  END_DOC
  integer,intent(in)             :: isize
  $type,intent(inout)            :: x(isize)
  integer,intent(inout)          :: iorder(isize)
  integer                        :: i, k, j, l, i0
  $type                          :: xtemp
  ! Without this guard k starts at isize <= 1, is decremented below 1 and
  ! the exit test (k == 1) never fires: the loop would read x(0) forever.
  if (isize < 2) return
  l = isize/2+1
  k = isize
  do while (.True.)
    if (l>1) then
      ! Heap construction phase: sift down the nodes l = isize/2, ..., 1
      l=l-1
      xtemp = x(l)
      i0 = iorder(l)
    else
      ! Extraction phase: move the current root to slot k, shrink the heap
      ! and sift down the element that used to sit at k
      xtemp = x(k)
      i0 = iorder(k)
      x(k) = x(1)
      iorder(k) = iorder(1)
      k = k-1
      if (k == 1) then
        x(1) = xtemp
        iorder(1) = i0
        exit
      endif
    endif
    ! Sift xtemp down from position l inside the heap x(1:k)
    i=l
    j = shiftl(l,1)
    do while (j<k)
      ! Select the larger of the two children
      if ( x(j) < x(j+1) ) then
        j=j+1
      endif
      if (xtemp < x(j)) then
        x(i) = x(j)
        iorder(i) = iorder(j)
        i = j
        j = shiftl(j,1)
      else
        j = k+1
      endif
    enddo
    ! Node with a single child, located at the last heap slot
    if (j==k) then
      if (xtemp < x(j)) then
        x(i) = x(j)
        iorder(i) = iorder(j)
        i = j
        j = shiftl(j,1)
      else
        j = k+1
      endif
    endif
    x(i) = xtemp
    iorder(i) = i0
  enddo
 end subroutine heap_$Xsort
 subroutine heap_$Xsort_big(x,iorder,isize)
  implicit none
  BEGIN_DOC
  ! Sort array x(isize) using the heap sort algorithm.
  ! iorder in input should be (1,2,3,...,isize), and in output
  ! contains the new order of the elements.
  ! This is a version for very large arrays where the indices need
  ! to be in integer*8 format
  ! Arrays with fewer than two elements are returned untouched.
  END_DOC
  integer*8,intent(in)           :: isize
  $type,intent(inout)            :: x(isize)
  integer*8,intent(inout)        :: iorder(isize)
  integer*8                      :: i, k, j, l, i0
  $type                          :: xtemp
  ! Without this guard k starts at isize <= 1, is decremented below 1 and
  ! the exit test (k == 1) never fires: the loop would read x(0) forever.
  if (isize < 2_8) return
  l = isize/2+1
  k = isize
  do while (.True.)
    if (l>1) then
      ! Heap construction phase: sift down the nodes l = isize/2, ..., 1
      l=l-1
      xtemp = x(l)
      i0 = iorder(l)
    else
      ! Extraction phase: move the current root to slot k, shrink the heap
      ! and sift down the element that used to sit at k
      xtemp = x(k)
      i0 = iorder(k)
      x(k) = x(1)
      iorder(k) = iorder(1)
      k = k-1
      if (k == 1) then
        x(1) = xtemp
        iorder(1) = i0
        exit
      endif
    endif
    ! Sift xtemp down from position l inside the heap x(1:k)
    i=l
    j = shiftl(l,1)
    do while (j<k)
      ! Select the larger of the two children
      if ( x(j) < x(j+1) ) then
        j=j+1
      endif
      if (xtemp < x(j)) then
        x(i) = x(j)
        iorder(i) = iorder(j)
        i = j
        j = shiftl(j,1)
      else
        j = k+1
      endif
    enddo
    ! Node with a single child, located at the last heap slot
    if (j==k) then
      if (xtemp < x(j)) then
        x(i) = x(j)
        iorder(i) = iorder(j)
        i = j
        j = shiftl(j,1)
      else
        j = k+1
      endif
    endif
    x(i) = xtemp
    iorder(i) = i0
  enddo
 end subroutine heap_$Xsort_big
 subroutine sorted_$Xnumber(x,isize,n)
  implicit none
@ -250,222 +32,6 @@ SUBST [ X, type ]
 END_TEMPLATE
 !---------------------- INTEL
 IRP_IF INTEL
 BEGIN_TEMPLATE
 subroutine $Xsort(x,iorder,isize)
  use intel
  implicit none
  BEGIN_DOC
  ! Sort array x(isize).
  ! iorder in input should be (1,2,3,...,isize), and in output
  ! contains the new order of the elements.
  END_DOC
  integer,intent(in)             :: isize
  $type,intent(inout)            :: x(isize)
  integer,intent(inout)          :: iorder(isize)
  integer                        :: n
  character, allocatable         :: tmp(:)
  ! Nothing to do for fewer than two elements
  if (isize < 2) return
  ! n receives the size of the work buffer to allocate; $ippsz is the code
  ! substituted from the SUBST table (13, 11 or 7 depending on the type).
  call ippsSortRadixIndexGetBufferSize(isize, $ippsz, n)
  allocate(tmp(n))
  ! $n is substituted from the SUBST table (4, 4 or 2) -- presumably the
  ! element stride in bytes expected by the library; confirm.
  call ippsSortRadixIndexAscend_$ityp(x, $n, iorder, isize, tmp)
  deallocate(tmp)
  ! NOTE(review): the +1 suggests the library returns 0-based indices that
  ! are shifted here to Fortran 1-based numbering -- confirm.
  iorder(1:isize) = iorder(1:isize)+1
  ! x is then rearranged according to iorder by $Xset_order
  call $Xset_order(x,iorder,isize)
 end
 subroutine $Xsort_noidx(x,isize)
  use intel
  implicit none
  BEGIN_DOC
  ! Sort array x(isize) in place, without returning the permutation
  ! (this routine has no iorder argument).
  END_DOC
  integer,intent(in)             :: isize
  $type,intent(inout)            :: x(isize)
  integer                        :: n
  character, allocatable         :: tmp(:)
  ! Nothing to do for fewer than two elements
  if (isize < 2) return
  ! NOTE(review): the buffer size is queried with the *Index* variant although
  ! the sort called below is the non-index one -- confirm this is intended.
  call ippsSortRadixIndexGetBufferSize(isize, $ippsz, n)
  allocate(tmp(n))
  call ippsSortRadixAscend_$ityp_I(x, isize, tmp)
  deallocate(tmp)
 end
 SUBST [ X, type, ityp, n, ippsz ]
   ; real ; 32f ; 4 ; 13 ;;
 i ; integer ; 32s ; 4 ; 11 ;;
 i2 ; integer*2 ; 16s ; 2 ; 7 ;;
 END_TEMPLATE
 BEGIN_TEMPLATE
 subroutine $Xsort(x,iorder,isize)
  implicit none
  BEGIN_DOC
  ! Sort array x(isize).
  ! On input iorder should hold (1,2,3,...,isize); on output it holds
  ! the new order of the elements.
  ! Fewer than 32 elements: insertion_$Xsort, otherwise quick_$Xsort.
  END_DOC
  integer,intent(in)             :: isize
  integer,intent(inout)          :: iorder(isize)
  $type,intent(inout)            :: x(isize)
  integer                        :: n
  ! Zero or one element: already sorted
  if (isize < 2) return
  ! The sorted_$Xnumber pre-check and the heap_$Xsort alternative are
  ! intentionally not used for this type.
  if (isize >= 32) then
    call quick_$Xsort(x,iorder,isize)
  else
    call insertion_$Xsort(x,iorder,isize)
  endif
 end subroutine $Xsort
 SUBST [ X, type ]
 d ; double precision ;;
 END_TEMPLATE
 BEGIN_TEMPLATE
 subroutine $Xsort(x,iorder,isize)
  implicit none
  BEGIN_DOC
  ! Sort array x(isize).
  ! On input iorder should hold (1,2,3,...,isize); on output it holds
  ! the new order of the elements.
  ! Fewer than 32 elements: insertion_$Xsort, otherwise quick_$Xsort.
  END_DOC
  integer,intent(in)             :: isize
  integer,intent(inout)          :: iorder(isize)
  $type,intent(inout)            :: x(isize)
  integer                        :: n
  ! Zero or one element: already sorted
  if (isize < 2) return
  ! Return without touching x or iorder when sorted_$Xnumber reports
  ! n == isize.
  call sorted_$Xnumber(x,isize,n)
  if (isize == n) return
  ! ($Xradix_sort is a disabled alternative for the large-array case)
  if (isize >= 32) then
    call quick_$Xsort(x,iorder,isize)
  else
    call insertion_$Xsort(x,iorder,isize)
  endif
 end subroutine $Xsort
 SUBST [ X, type ]
 i8 ; integer*8 ;;
 END_TEMPLATE
 !---------------------- END INTEL
 IRP_ELSE
 !---------------------- NON-INTEL
 BEGIN_TEMPLATE
 subroutine $Xsort_noidx(x,isize)
  implicit none
  BEGIN_DOC
  ! Sort array x(isize) when the permutation is not needed: a temporary
  ! index array (1,2,...,isize) is built, handed to $Xsort and discarded.
  END_DOC
  integer,intent(in)             :: isize
  $type,intent(inout)            :: x(isize)
  integer                        :: i
  integer, allocatable           :: iorder(:)
  allocate(iorder(isize))
  ! Identity permutation expected by $Xsort on input
  do i=1,isize
    iorder(i) = i
  enddo
  call $Xsort(x,iorder,isize)
  deallocate(iorder)
 end subroutine $Xsort_noidx
 SUBST [ X, type ]
   ; real ;;
 d ; double precision ;;
 i ; integer ;;
 i8 ; integer*8 ;;
 i2 ; integer*2 ;;
 END_TEMPLATE
 BEGIN_TEMPLATE
 subroutine $Xsort(x,iorder,isize)
  implicit none
  BEGIN_DOC
  ! Sort array x(isize).
  ! On input iorder should hold (1,2,3,...,isize); on output it holds
  ! the new order of the elements.
  ! Fewer than 32 elements: insertion_$Xsort, otherwise quick_$Xsort.
  END_DOC
  integer,intent(in)             :: isize
  integer,intent(inout)          :: iorder(isize)
  $type,intent(inout)            :: x(isize)
  integer                        :: n
  ! Zero or one element: already sorted
  if (isize < 2) return
  ! The sorted_$Xnumber pre-check and the heap_$Xsort alternative are
  ! intentionally not used for these types.
  if (isize >= 32) then
    call quick_$Xsort(x,iorder,isize)
  else
    call insertion_$Xsort(x,iorder,isize)
  endif
 end subroutine $Xsort
 SUBST [ X, type ]
   ; real ;;
 d ; double precision ;;
 END_TEMPLATE
 BEGIN_TEMPLATE
 subroutine $Xsort(x,iorder,isize)
  implicit none
  BEGIN_DOC
  ! Sort array x(isize).
  ! On input iorder should hold (1,2,3,...,isize); on output it holds
  ! the new order of the elements.
  ! Fewer than 32 elements: insertion_$Xsort, otherwise quick_$Xsort.
  END_DOC
  integer,intent(in)             :: isize
  integer,intent(inout)          :: iorder(isize)
  $type,intent(inout)            :: x(isize)
  integer                        :: n
  ! Zero or one element: already sorted
  if (isize < 2) return
  ! Return without touching x or iorder when sorted_$Xnumber reports
  ! n == isize.
  call sorted_$Xnumber(x,isize,n)
  if (isize == n) return
  ! ($Xradix_sort is a disabled alternative for the large-array case)
  if (isize >= 32) then
    call quick_$Xsort(x,iorder,isize)
  else
    call insertion_$Xsort(x,iorder,isize)
  endif
 end subroutine $Xsort
 SUBST [ X, type ]
 i ; integer ;;
 i8 ; integer*8 ;;
 i2 ; integer*2 ;;
 END_TEMPLATE
 IRP_ENDIF
 !---------------------- END NON-INTEL
 BEGIN_TEMPLATE
 subroutine $Xset_order(x,iorder,isize)
@ -491,47 +57,6 @@ BEGIN_TEMPLATE
  deallocate(xtmp)
 end
 SUBST [ X, type ]
   ; real ;;
 d ; double precision ;;
 i ; integer ;;
 i8; integer*8 ;;
 i2; integer*2 ;;
 END_TEMPLATE
 BEGIN_TEMPLATE
 subroutine insertion_$Xsort_big (x,iorder,isize)
  implicit none
  BEGIN_DOC
  ! Sort array x(isize) in ascending order using the insertion sort algorithm.
  ! iorder in input should be (1,2,3,...,isize), and in output
  ! contains the new order of the elements.
  ! This is a version for very large arrays where the indices need
  ! to be in integer*8 format
  ! Elements that compare equal keep their relative order, because the
  ! shifting stops as soon as x(j) <= the inserted value.
  END_DOC
  integer*8,intent(in)           :: isize
  $type,intent(inout)            :: x(isize)
  integer*8,intent(inout)        :: iorder(isize)
  $type                          :: xtmp
  integer*8                      :: i, i0, j
  do i=2_8,isize
    ! Value (and its index) to insert into the sorted prefix x(1:i-1)
    xtmp = x(i)
    i0 = iorder(i)
    j = i-1_8
    ! The bound test and the comparison are separate statements so that
    ! x(0) is never referenced.
    do while (j>0_8)
      if (x(j)<=xtmp) exit
      ! Shift the larger element one slot to the right
      x(j+1_8) = x(j)
      iorder(j+1_8) = iorder(j)
      j = j-1_8
    enddo
    x(j+1_8) = xtmp
    iorder(j+1_8) = i0
  enddo
 end subroutine insertion_$Xsort_big
 subroutine $Xset_order_big(x,iorder,isize)
  implicit none
  BEGIN_DOC
@ -565,223 +90,3 @@ SUBST [ X, type ]
 END_TEMPLATE
 BEGIN_TEMPLATE
 recursive subroutine $Xradix_sort$big(x,iorder,isize,iradix)
  implicit none
  BEGIN_DOC
  ! Sort integer array x(isize) using the radix sort algorithm.
  ! iorder in input should be (1,2,3,...,isize), and in output
  ! contains the new order of the elements.
  ! iradix should be -1 in input.
  !
  ! Values of iradix used internally by the recursion:
  !   -1 : x may hold negative values; split by sign first
  !   -2 : x holds no negative value; find the highest set bit, split on it
  !  >=0 : split on bit number iradix, then recurse with iradix-1
  END_DOC
  integer*$int_type, intent(in)  :: isize
  integer*$int_type, intent(inout) :: iorder(isize)
  integer*$type, intent(inout)   :: x(isize)
  integer, intent(in)            :: iradix
  integer                        :: iradix_new
  integer*$type, allocatable     :: x2(:), x1(:)
  integer*$type                  :: i4               ! data type
  integer*$int_type, allocatable :: iorder1(:),iorder2(:)
  integer*$int_type              :: i0, i1, i2, i3, i ! index type
  integer*$type                  :: mask
  integer                        :: err
  !DIR$ ATTRIBUTES ALIGN : 128   :: iorder1,iorder2, x2, x1
  if (isize < 2) then
    return
  endif
  if (iradix == -1) then ! Sort Positive and negative
    allocate(x1(isize),iorder1(isize), x2(isize),iorder2(isize),stat=err)
    if (err /= 0) then
      print *,  irp_here, ': Unable to allocate arrays'
      stop
    endif
    ! x1/iorder1 collect the negated negative values,
    ! x2/iorder2 collect the non-negative values
    i1=1_$int_type
    i2=1_$int_type
    do i=1_$int_type,isize
      if (x(i) < 0_$type) then
        iorder1(i1) = iorder(i)
        x1(i1) = -x(i)
        i1 = i1+1_$int_type
      else
        iorder2(i2) = iorder(i)
        x2(i2) = x(i)
        i2 = i2+1_$int_type
      endif
    enddo
    ! i1 and i2 now hold the element counts of the two groups
    i1=i1-1_$int_type
    i2=i2-1_$int_type
    ! Non-negative values go after the i1 slots reserved for the negatives
    do i=1_$int_type,i2
      iorder(i1+i) = iorder2(i)
      x(i1+i) = x2(i)
    enddo
    deallocate(x2,iorder2,stat=err)
    if (err /= 0) then
      print *,  irp_here, ': Unable to deallocate arrays x2, iorder2'
      stop
    endif
    ! Only sort the magnitudes when there is more than one of them ...
    if (i1 > 1_$int_type) then
      call $Xradix_sort$big(x1,iorder1,i1,-2)
    endif
    ! ... but always copy them back (reversed and negated), so that a single
    ! negative element is not lost when it was not already in position 1.
    do i=1_$int_type,i1
      x(i) = -x1(1_$int_type+i1-i)
      iorder(i) = iorder1(1_$int_type+i1-i)
    enddo
    if (i2>1_$int_type) then
      call $Xradix_sort$big(x(i1+1_$int_type),iorder(i1+1_$int_type),i2,-2)
    endif
    deallocate(x1,iorder1,stat=err)
    if (err /= 0) then
      print *,  irp_here, ': Unable to deallocate arrays x1, iorder1'
      stop
    endif
    return
  else if (iradix == -2) then ! Positive
    ! Find most significant bit
    i0 = 0_$int_type
    i4 = maxval(x)
    iradix_new = max($integer_size-1-leadz(i4),1)
    mask = ibset(0_$type,iradix_new)
    allocate(x1(isize),iorder1(isize), x2(isize),iorder2(isize),stat=err)
    if (err /= 0) then
      print *,  irp_here, ': Unable to allocate arrays'
      stop
    endif
    ! x1: elements with the mask bit clear, x2: elements with it set
    i1=1_$int_type
    i2=1_$int_type
    do i=1_$int_type,isize
      if (iand(mask,x(i)) == 0_$type) then
        iorder1(i1) = iorder(i)
        x1(i1) = x(i)
        i1 = i1+1_$int_type
      else
        iorder2(i2) = iorder(i)
        x2(i2) = x(i)
        i2 = i2+1_$int_type
      endif
    enddo
    i1=i1-1_$int_type
    i2=i2-1_$int_type
    ! Write back: bit-clear group first, bit-set group after it
    do i=1_$int_type,i1
      iorder(i0+i) = iorder1(i)
      x(i0+i) = x1(i)
    enddo
    i0 = i0+i1
    ! i3 = size of the bit-clear group, i.e. where the second group starts
    i3 = i0
    deallocate(x1,iorder1,stat=err)
    if (err /= 0) then
      print *,  irp_here, ': Unable to deallocate arrays x1, iorder1'
      stop
    endif
    do i=1_$int_type,i2
      iorder(i0+i) = iorder2(i)
      x(i0+i) = x2(i)
    enddo
    i0 = i0+i2
    deallocate(x2,iorder2,stat=err)
    if (err /= 0) then
      print *,  irp_here, ': Unable to deallocate arrays x2, iorder2'
      stop
    endif
    ! Sort each group on the next lower bit
    if (i3>1_$int_type) then
      call $Xradix_sort$big(x,iorder,i3,iradix_new-1)
    endif
    if (isize-i3>1_$int_type) then
      call $Xradix_sort$big(x(i3+1_$int_type),iorder(i3+1_$int_type),isize-i3,iradix_new-1)
    endif
    return
  endif
  ASSERT (iradix >= 0)
  ! Short ranges are finished with insertion sort
  if (isize < 48) then
    call insertion_$Xsort$big(x,iorder,isize)
    return
  endif
  allocate(x2(isize),iorder2(isize),stat=err)
  if (err /= 0) then
    print *,  irp_here, ': Unable to allocate arrays x2, iorder2'
    stop
  endif
  mask = ibset(0_$type,iradix)
  ! Elements with the bit clear are compacted in place at the front of x
  ! (i0 never exceeds i); elements with the bit set are parked in x2.
  i0=1_$int_type
  i1=1_$int_type
  do i=1_$int_type,isize
    if (iand(mask,x(i)) == 0_$type) then
      iorder(i0) = iorder(i)
      x(i0) = x(i)
      i0 = i0+1_$int_type
    else
      iorder2(i1) = iorder(i)
      x2(i1) = x(i)
      i1 = i1+1_$int_type
    endif
  enddo
  i0=i0-1_$int_type
  i1=i1-1_$int_type
  do i=1_$int_type,i1
    iorder(i0+i) = iorder2(i)
    x(i0+i) = x2(i)
  enddo
  deallocate(x2,iorder2,stat=err)
  if (err /= 0) then
    print *,  irp_here, ': Unable to deallocate arrays x2, iorder2'
    stop
  endif
  ! Bit 0 was the last one to process
  if (iradix == 0) then
    return
  endif
  if (i1>1_$int_type) then
    call $Xradix_sort$big(x(i0+1_$int_type),iorder(i0+1_$int_type),i1,iradix-1)
  endif
  if (i0>1_$int_type) then
    call $Xradix_sort$big(x,iorder,i0,iradix-1)
  endif
 end
 SUBST [ X, type, integer_size, is_big, big, int_type ]
 i  ; 4 ; 32 ; .False. ;      ; 4 ;;
 i8 ; 8 ; 64 ; .False. ;      ; 4 ;;
 i2 ; 2 ; 16 ; .False. ;      ; 4 ;;
 i  ; 4 ; 32 ; .True.  ; _big ; 8 ;;
 i8 ; 8 ; 64 ; .True.  ; _big ; 8 ;;
 END_TEMPLATE