Merge branch 'AbdAmmar-dev' into dev

2024-11-09 06:53:38 +01:00 · 2023-01-17 17:15:25 +01:00 · 2023-01-17 17:15:25 +01:00 · 6d7c7ab335
commit 6d7c7ab335
parent 7dc17fd2d3 f0c6c54072
64 changed files with 4813 additions and 2233 deletions
--- a/bin/qp_convert_output_to_ezfio
+++ b/bin/qp_convert_output_to_ezfio
@ -162,11 +162,11 @@ def write_ezfio(res, filename):
    # P a r s i n g #
    # ~#~#~#~#~#~#~ #

-    prim_num_max = ezfio.get_ao_basis_ao_prim_num_max()
+    prim_num_max = max(ezfio.get_ao_basis_ao_prim_num())
+    ezfio.set_ao_basis_ao_prim_num_max(prim_num_max)

    for i in range(len(res.basis)):
-        coefficient[
-            i] += [0. for j in range(len(coefficient[i]), prim_num_max)]
+        coefficient[i] += [0. for j in range(len(coefficient[i]), prim_num_max)]
        exponent[i] += [0. for j in range(len(exponent[i]), prim_num_max)]

    coefficient = reduce(lambda x, y: x + y, coefficient, [])
--- a/config/bull.cfg
+++ b/config/bull.cfg
@ -6,7 +6,7 @@
 # --align=32                 : Align all provided arrays on a 32-byte boundary
 #
 [COMMON]
-FC           : mpiifort -shared-libgcc -shared-intel -fpic
+FC           : mpiifort -fpic -xCORE-AVX2 
 LAPACK_LIB   : -mkl=parallel 
 IRPF90       : irpf90
 IRPF90_FLAGS : --ninja --align=32 -DMPI  
@ -31,7 +31,7 @@ OPENMP  : 1          ; Append OpenMP flags
 # -ftz                       : Flushes denormal results to zero
 #
 [OPT]
-FCFLAGS  : -xCORE-AVX2 -O2 -ip -ftz -g -traceback -qopt-prefetch=5 -qopt-prefetch-issue-excl-hint -unroll-aggressive 
+FCFLAGS  : -O2 -ip -ftz -g -traceback -qopt-prefetch=5 -qopt-prefetch-issue-excl-hint -unroll-aggressive 
 # Profiling flags
 #################
 #
--- a/config/ifort_2019_debug.cfg
+++ b/config/ifort_2019_debug.cfg
@ -0,0 +1,66 @@
+# Common flags
+##############
+#
+# -mkl=[parallel|sequential] : Use the MKL library
+# --ninja                 : Allow the utilisation of ninja. It is mandatory !
+# --align=32                 : Align all provided arrays on a 32-byte boundary
+#
+[COMMON]
+FC           : ifort -fpic
+LAPACK_LIB   : -mkl=parallel -lirc -lsvml -limf -lipps
+IRPF90       : irpf90
+IRPF90_FLAGS : --ninja --align=32 --assert -DINTEL -DSET_NESTED
+
+# Global options
+################
+#
+# 1 : Activate
+# 0 : Deactivate
+# 
+[OPTION]
+MODE    : DEBUG      ; [ OPT | PROFILE | DEBUG ] : Chooses the section below
+CACHE   : 0          ; Enable cache_compile.py
+OPENMP  : 1          ; Append OpenMP flags
+
+# Optimization flags
+####################
+#
+# -xHost                     : Compile a binary optimized for the current architecture
+# -O2                        : O3 not better than O2.
+# -ip                        : Inter-procedural optimizations
+# -ftz                       : Flushes denormal results to zero
+#
+[OPT]
+FC       : -traceback
+FCFLAGS  : -msse4.2 -O2 -ip -ftz -g 
+
+
+# Profiling flags
+#################
+#
+[PROFILE]
+FC       : -p -g
+FCFLAGS  : -msse4.2 -O2 -ip -ftz 
+
+
+# Debugging flags
+#################
+#
+# -traceback   : Activate backtrace on runtime
+# -fpe0        : All floating point exaceptions
+# -C           : Checks uninitialized variables,  array subscripts, etc...
+# -g           : Extra debugging information
+# -msse4.2     : Valgrind needs a very simple x86 executable
+#
+[DEBUG]
+FC      : -g -traceback
+FCFLAGS : -msse4.2 -check all -debug all -fpe-all=0  -implicitnone
+
+
+# OpenMP flags
+#################
+#
+[OPENMP]
+FC           : -qopenmp
+IRPF90_FLAGS : --openmp
+
--- a/ocaml/qp_run.ml
+++ b/ocaml/qp_run.ml
@ -155,6 +155,7 @@ let run slave ?prefix exe ezfio_file =
  in
  Printf.printf "Wall time: %d:%2.2d:%2.2d" (d*24+h) m s ;
  Printf.printf "\n\n";
+  Unix.sleep 1;
  if (exit_code <> 0) then
    exit exit_code

--- a/scripts/compilation/qp_create_ninja
+++ b/scripts/compilation/qp_create_ninja
@ -126,7 +126,7 @@ def ninja_create_env_variable(pwd_config_file):
        try:
            content = ""
            with open(libfile,'r') as f:
-                content = f.read()
+                content = f.read().replace('\n','')
                str_lib += " "+content
        except IOError:
            pass
--- a/src/ao_basis/EZFIO.cfg
+++ b/src/ao_basis/EZFIO.cfg
@ -57,13 +57,13 @@ default: false

 [ao_normalized]
 type: logical
-doc: Use normalized basis functions
+doc: Normalize the atomic orbitals
 interface: ezfio, provider
-default: true
+default: false

 [primitives_normalized]
 type: logical
-doc: Use normalized primitive functions
+doc: Normalize the primitive basis functions
 interface: ezfio, provider
 default: true

--- a/src/ao_basis/aos.irp.f
+++ b/src/ao_basis/aos.irp.f
@ -63,15 +63,14 @@ END_PROVIDER
  ! Coefficients including the |AO| normalization
  END_DOC

-  do i=1,ao_num
-    l = ao_shell(i)
-    ao_coef_normalized(i,:) = shell_coef(l,:) * shell_normalization_factor(l)
-  end do

  double precision               :: norm,overlap_x,overlap_y,overlap_z,C_A(3), c
-  integer                        :: l, powA(3), nz
+  integer                        :: l, powA(3)
+  integer, parameter             :: nz=100
  integer                        :: i,j,k
-  nz=100
+
+   ao_coef_normalized(:,:) = ao_coef(:,:)
+
  C_A = 0.d0

  do i=1,ao_num
@ -80,7 +79,7 @@ END_PROVIDER
    powA(2) = ao_power(i,2)
    powA(3) = ao_power(i,3)

-    ! Normalization of the primitives
+    ! GAMESS-type normalization of the primitives
    if (primitives_normalized) then
      do j=1,ao_prim_num(i)
        call overlap_gaussian_xyz(C_A,C_A,ao_expo(i,j),ao_expo(i,j), &
@ -91,6 +90,7 @@ END_PROVIDER
    ! Normalization of the contracted basis functions
    if (ao_normalized) then
      norm = 0.d0
+      l = ao_shell(i)
      do j=1,ao_prim_num(i)
        do k=1,ao_prim_num(i)
          call overlap_gaussian_xyz(C_A,C_A,ao_expo(i,j),ao_expo(i,k),powA,powA,overlap_x,overlap_y,overlap_z,c,nz)
@ -98,6 +98,7 @@ END_PROVIDER
        enddo
      enddo
      ao_coef_normalization_factor(i) = 1.d0/dsqrt(norm)
+      ao_coef_normalized(i,:) *= ao_coef_normalization_factor(i)
    else
      ao_coef_normalization_factor(i) = 1.d0
    endif
--- a/src/ao_many_one_e_ints/grad2_jmu_manu.irp.f
+++ b/src/ao_many_one_e_ints/grad2_jmu_manu.irp.f
@ -1,4 +1,6 @@

+! ---
+
 BEGIN_PROVIDER [ double precision, int2_grad1u2_grad2u2_j1b2_test, (ao_num, ao_num, n_points_final_grid)]

  BEGIN_DOC
@ -13,12 +15,14 @@ BEGIN_PROVIDER [ double precision, int2_grad1u2_grad2u2_j1b2_test, (ao_num, ao_n
  double precision              :: coef, beta, B_center(3)
  double precision              :: tmp
  double precision              :: wall0, wall1
-
-  double precision, allocatable :: int_fit_v(:)
-  double precision, external    :: overlap_gauss_r12_ao_with1s
  double precision              :: int_gauss, dsqpi_3_2, int_j1b
  double precision              :: factor_ij_1s, beta_ij, center_ij_1s(3), sq_pi_3_2 
-  sq_pi_3_2 = (dacos(-1.d0))**(3/2)
+  double precision, allocatable :: int_fit_v(:)
+  double precision, external    :: overlap_gauss_r12_ao_with1s
+
+  print*, ' providing int2_grad1u2_grad2u2_j1b2_test ...'
+
+  sq_pi_3_2 = (dacos(-1.d0))**(1.5d0)

  provide mu_erf final_grid_points_transp j1b_pen List_comb_thr_b3_coef
  call wall_time(wall0)
@ -33,7 +37,7 @@ BEGIN_PROVIDER [ double precision, int2_grad1u2_grad2u2_j1b2_test, (ao_num, ao_n
     !$OMP          expo_gauss_1_erf_x_2, coef_gauss_1_erf_x_2,                                             &
     !$OMP          List_comb_thr_b3_coef, List_comb_thr_b3_expo,                                           &
     !$OMP          List_comb_thr_b3_cent, int2_grad1u2_grad2u2_j1b2_test, ao_abs_comb_b3_j1b,              &
-     !$OMP          ao_overlap_abs_grid,sq_pi_3_2)
+     !$OMP          ao_overlap_abs,sq_pi_3_2)
 !$OMP DO SCHEDULE(dynamic)
 do ipoint = 1, n_points_final_grid
   r(1) = final_grid_points(1,ipoint)
@ -41,7 +45,7 @@ BEGIN_PROVIDER [ double precision, int2_grad1u2_grad2u2_j1b2_test, (ao_num, ao_n
   r(3) = final_grid_points(3,ipoint)
   do i = 1, ao_num
     do j = i, ao_num
-       if(ao_overlap_abs_grid(j,i) .lt. 1.d-12) then
+       if(ao_overlap_abs(j,i) .lt. 1.d-12) then
         cycle
       endif
  
@ -61,7 +65,7 @@ BEGIN_PROVIDER [ double precision, int2_grad1u2_grad2u2_j1b2_test, (ao_num, ao_n
           call gaussian_product(expo_fit,r,beta,B_center,factor_ij_1s,beta_ij,center_ij_1s)
           coef_fit = -0.25d0 *  coef_gauss_1_erf_x_2(i_fit) * coef
 !           if(dabs(coef_fit*factor_ij_1s*int_j1b).lt.1.d-10)cycle ! old version
-           if(dabs(coef_fit*factor_ij_1s*int_j1b*sq_pi_3_2*(beta_ij)**(-3/2)).lt.1.d-12)cycle
+           if(dabs(coef_fit*factor_ij_1s*int_j1b*sq_pi_3_2*(beta_ij)**(-1.5d0)).lt.1.d-10)cycle
  
 !           call overlap_gauss_r12_ao_with1s_v(B_center, beta, final_grid_points_transp, &
 !                 expo_fit, i, j, int_fit_v, n_points_final_grid)
@ -91,6 +95,8 @@ BEGIN_PROVIDER [ double precision, int2_grad1u2_grad2u2_j1b2_test, (ao_num, ao_n

 END_PROVIDER

+! ---
+
 BEGIN_PROVIDER [ double precision, int2_grad1u2_grad2u2_j1b2_test_v, (ao_num, ao_num, n_points_final_grid)]
 !
 !  BEGIN_DOC
@ -109,6 +115,8 @@ BEGIN_PROVIDER [ double precision, int2_grad1u2_grad2u2_j1b2_test_v, (ao_num, ao
  double precision, allocatable :: int_fit_v(:),big_array(:,:,:)
  double precision, external    :: overlap_gauss_r12_ao_with1s

+  print*, ' providing int2_grad1u2_grad2u2_j1b2_test_v ...'
+
  provide mu_erf final_grid_points_transp j1b_pen
  call wall_time(wall0)

@ -123,14 +131,14 @@ BEGIN_PROVIDER [ double precision, int2_grad1u2_grad2u2_j1b2_test_v, (ao_num, ao
     !$OMP          expo_gauss_1_erf_x_2, coef_gauss_1_erf_x_2,      &
     !$OMP          List_comb_thr_b3_coef, List_comb_thr_b3_expo,    &
     !$OMP          List_comb_thr_b3_cent, big_array,&
-     !$OMP          ao_abs_comb_b3_j1b,ao_overlap_abs_grid)
+     !$OMP          ao_abs_comb_b3_j1b,ao_overlap_abs)
 !
 allocate(int_fit_v(n_points_final_grid))
 !$OMP DO SCHEDULE(dynamic)
 do i = 1, ao_num
   do j = i, ao_num

-     if(ao_overlap_abs_grid(j,i) .lt. 1.d-12) then
+     if(ao_overlap_abs(j,i) .lt. 1.d-12) then
       cycle
     endif

@ -139,7 +147,7 @@ BEGIN_PROVIDER [ double precision, int2_grad1u2_grad2u2_j1b2_test_v, (ao_num, ao
         coef        = List_comb_thr_b3_coef  (i_1s,j,i)
         beta        = List_comb_thr_b3_expo  (i_1s,j,i)
         int_j1b = ao_abs_comb_b3_j1b(i_1s,j,i)
-         if(dabs(coef)*dabs(int_j1b).lt.1.d-15)cycle
+!         if(dabs(coef)*dabs(int_j1b).lt.1.d-15)cycle
         B_center(1) = List_comb_thr_b3_cent(1,i_1s,j,i)
         B_center(2) = List_comb_thr_b3_cent(2,i_1s,j,i)
         B_center(3) = List_comb_thr_b3_cent(3,i_1s,j,i)
@ -185,6 +193,8 @@ BEGIN_PROVIDER [ double precision, int2_grad1u2_grad2u2_j1b2_test_v, (ao_num, ao

 END_PROVIDER

+! ---
+
 BEGIN_PROVIDER [ double precision, int2_u2_j1b2_test, (ao_num, ao_num, n_points_final_grid)]

  BEGIN_DOC
@ -202,7 +212,10 @@ BEGIN_PROVIDER [ double precision, int2_u2_j1b2_test, (ao_num, ao_num, n_points_
  double precision, external    :: overlap_gauss_r12_ao
  double precision, external    :: overlap_gauss_r12_ao_with1s
  double precision :: factor_ij_1s,beta_ij,center_ij_1s(3),sq_pi_3_2
-  sq_pi_3_2 = (dacos(-1.d0))**(3/2)
+
+  print*, ' providing int2_u2_j1b2_test ...'
+
+  sq_pi_3_2 = (dacos(-1.d0))**(1.5d0)

  provide mu_erf final_grid_points j1b_pen
  call wall_time(wall0)
@ -213,7 +226,7 @@ BEGIN_PROVIDER [ double precision, int2_u2_j1b2_test, (ao_num, ao_num, n_points_
 !$OMP PRIVATE (ipoint, i, j, i_1s, i_fit, r, coef, beta, B_center, &
 !$OMP          coef_fit, expo_fit, int_fit, tmp, int_j1b,factor_ij_1s,beta_ij,center_ij_1s)          & 
 !$OMP SHARED  (n_points_final_grid, ao_num, List_comb_thr_b3_size, & 
- !$OMP          final_grid_points, ng_fit_jast,ao_overlap_abs_grid,      &
+ !$OMP          final_grid_points, ng_fit_jast,                     &
 !$OMP          expo_gauss_j_mu_x_2, coef_gauss_j_mu_x_2,           &
 !$OMP          List_comb_thr_b3_coef, List_comb_thr_b3_expo,sq_pi_3_2,       & 
 !$OMP          List_comb_thr_b3_cent, int2_u2_j1b2_test,ao_abs_comb_b3_j1b)
@ -225,9 +238,6 @@ BEGIN_PROVIDER [ double precision, int2_u2_j1b2_test, (ao_num, ao_num, n_points_

    do i = 1, ao_num
      do j = i, ao_num
-       if(ao_overlap_abs_grid(j,i) .lt. 1.d-12) then
-         cycle
-       endif


        tmp = 0.d0
@ -236,7 +246,7 @@ BEGIN_PROVIDER [ double precision, int2_u2_j1b2_test, (ao_num, ao_num, n_points_
          coef        = List_comb_thr_b3_coef  (i_1s,j,i)
          beta        = List_comb_thr_b3_expo  (i_1s,j,i)
          int_j1b = ao_abs_comb_b3_j1b(i_1s,j,i)
-          if(dabs(coef)*dabs(int_j1b).lt.1.d-14)cycle
+          if(dabs(coef)*dabs(int_j1b).lt.1.d-10)cycle
          B_center(1) = List_comb_thr_b3_cent(1,i_1s,j,i)
          B_center(2) = List_comb_thr_b3_cent(2,i_1s,j,i)
          B_center(3) = List_comb_thr_b3_cent(3,i_1s,j,i)
@ -248,7 +258,7 @@ BEGIN_PROVIDER [ double precision, int2_u2_j1b2_test, (ao_num, ao_num, n_points_
            !DIR$ FORCEINLINE
            call gaussian_product(expo_fit,r,beta,B_center,factor_ij_1s,beta_ij,center_ij_1s)
 !            if(dabs(coef_fit*coef*factor_ij_1s*int_j1b).lt.1.d-10)cycle ! old version
-            if(dabs(coef_fit*coef*factor_ij_1s*int_j1b*sq_pi_3_2*(beta_ij)**(-3/2)).lt.1.d-12)cycle
+            if(dabs(coef_fit*coef*factor_ij_1s*int_j1b*sq_pi_3_2*(beta_ij)**(-1.5d0)).lt.1.d-10)cycle
          
            ! ---
          
@ -283,7 +293,7 @@ END_PROVIDER

 ! ---

-BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2_test, (3, ao_num, ao_num, n_points_final_grid)]
+BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2_test, (ao_num, ao_num, n_points_final_grid, 3)]

  BEGIN_DOC
  !
@ -298,7 +308,10 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2_test, (3, ao_num, ao_num
  double precision :: alpha_1s, alpha_1s_inv, centr_1s(3), expo_coef_1s, coef_tmp
  double precision :: tmp_x, tmp_y, tmp_z, int_j1b
  double precision :: wall0, wall1, sq_pi_3_2,sq_alpha
-  sq_pi_3_2 = dacos(-1.D0)**(3/2)
+
+  print*, ' providing int2_u_grad1u_x_j1b2_test ...'
+
+  sq_pi_3_2 = dacos(-1.D0)**(1.d0)
  provide mu_erf final_grid_points j1b_pen
  call wall_time(wall0)

@ -310,7 +323,7 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2_test, (3, ao_num, ao_num
 !$OMP          alpha_1s_inv, centr_1s, expo_coef_1s, coef_tmp,     & 
 !$OMP          tmp_x, tmp_y, tmp_z,int_j1b,sq_alpha)                        & 
 !$OMP SHARED  (n_points_final_grid, ao_num, List_comb_thr_b3_size, & 
- !$OMP          final_grid_points, ng_fit_jast, ao_overlap_abs_grid,&
+ !$OMP          final_grid_points, ng_fit_jast,                     &
 !$OMP          expo_gauss_j_mu_1_erf, coef_gauss_j_mu_1_erf,       &
 !$OMP          List_comb_thr_b3_coef, List_comb_thr_b3_expo,       & 
 !$OMP          List_comb_thr_b3_cent, int2_u_grad1u_x_j1b2_test,ao_abs_comb_b3_j1b,sq_pi_3_2)
@ -323,9 +336,6 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2_test, (3, ao_num, ao_num

    do i = 1, ao_num
      do j = i, ao_num
-       if(ao_overlap_abs_grid(j,i) .lt. 1.d-12) then
-         cycle
-       endif

        tmp_x = 0.d0
        tmp_y = 0.d0
@ -335,7 +345,7 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2_test, (3, ao_num, ao_num
          coef        = List_comb_thr_b3_coef  (i_1s,j,i)
          beta        = List_comb_thr_b3_expo  (i_1s,j,i)
          int_j1b = ao_abs_comb_b3_j1b(i_1s,j,i)
-          if(dabs(coef)*dabs(int_j1b).lt.1.d-14)cycle
+          if(dabs(coef)*dabs(int_j1b).lt.1.d-10)cycle
          B_center(1) = List_comb_thr_b3_cent(1,i_1s,j,i)
          B_center(2) = List_comb_thr_b3_cent(2,i_1s,j,i)
          B_center(3) = List_comb_thr_b3_cent(3,i_1s,j,i)
@ -359,7 +369,7 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2_test, (3, ao_num, ao_num
            coef_tmp = coef * coef_fit * dexp(-expo_coef_1s)
            sq_alpha = alpha_1s_inv * dsqrt(alpha_1s_inv)
 !            if(dabs(coef_tmp*int_j1b) .lt. 1d-10) cycle ! old version
-            if(dabs(coef_tmp*int_j1b*sq_pi_3_2*sq_alpha) .lt. 1d-14) cycle
+            if(dabs(coef_tmp*int_j1b*sq_pi_3_2*sq_alpha) .lt. 1d-10) cycle
            
            call NAI_pol_x_mult_erf_ao_with1s(i, j, alpha_1s, centr_1s, 1.d+9, r, int_fit)

@ -372,9 +382,9 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2_test, (3, ao_num, ao_num

        enddo

-        int2_u_grad1u_x_j1b2_test(1,j,i,ipoint) = tmp_x
-        int2_u_grad1u_x_j1b2_test(2,j,i,ipoint) = tmp_y
-        int2_u_grad1u_x_j1b2_test(3,j,i,ipoint) = tmp_z
+        int2_u_grad1u_x_j1b2_test(j,i,ipoint,1) = tmp_x
+        int2_u_grad1u_x_j1b2_test(j,i,ipoint,2) = tmp_y
+        int2_u_grad1u_x_j1b2_test(j,i,ipoint,3) = tmp_z
      enddo
    enddo
  enddo
@ -384,9 +394,9 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2_test, (3, ao_num, ao_num
  do ipoint = 1, n_points_final_grid
    do i = 2, ao_num
      do j = 1, i-1
-        int2_u_grad1u_x_j1b2_test(1,j,i,ipoint) = int2_u_grad1u_x_j1b2_test(1,i,j,ipoint)
-        int2_u_grad1u_x_j1b2_test(2,j,i,ipoint) = int2_u_grad1u_x_j1b2_test(2,i,j,ipoint)
-        int2_u_grad1u_x_j1b2_test(3,j,i,ipoint) = int2_u_grad1u_x_j1b2_test(3,i,j,ipoint)
+        int2_u_grad1u_x_j1b2_test(j,i,ipoint,1) = int2_u_grad1u_x_j1b2_test(i,j,ipoint,1)
+        int2_u_grad1u_x_j1b2_test(j,i,ipoint,2) = int2_u_grad1u_x_j1b2_test(i,j,ipoint,2)
+        int2_u_grad1u_x_j1b2_test(j,i,ipoint,3) = int2_u_grad1u_x_j1b2_test(i,j,ipoint,3)
      enddo
    enddo
  enddo
@ -415,7 +425,10 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_j1b2_test, (ao_num, ao_num, n_p
  double precision :: j12_mu_r12,int_j1b
  double precision :: sigma_ij,dist_ij_ipoint,dsqpi_3_2
  double precision :: beta_ij,center_ij_1s(3),factor_ij_1s
-  dsqpi_3_2 = (dacos(-1.d0))**(3/2)
+
+  print*, ' providing int2_u_grad1u_j1b2_test ...'
+
+  dsqpi_3_2 = (dacos(-1.d0))**(1.5d0)

  provide mu_erf final_grid_points j1b_pen ao_overlap_abs List_comb_thr_b3_cent
  call wall_time(wall0)
@ -438,7 +451,7 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_j1b2_test, (ao_num, ao_num, n_p
  do ipoint = 1, n_points_final_grid
    do i = 1, ao_num
      do j = i, ao_num
-        if(dabs(ao_overlap_abs_grid(j,i)).lt.1.d-12)cycle
+        if(dabs(ao_overlap_abs_grid(j,i)).lt.1.d-10)cycle
        r(1) = final_grid_points(1,ipoint)
        r(2) = final_grid_points(2,ipoint)
        r(3) = final_grid_points(3,ipoint)
@ -449,7 +462,7 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_j1b2_test, (ao_num, ao_num, n_p
          coef        = List_comb_thr_b3_coef  (i_1s,j,i)
          beta        = List_comb_thr_b3_expo  (i_1s,j,i)
          int_j1b = ao_abs_comb_b3_j1b(i_1s,j,i)
-          if(dabs(coef)*dabs(int_j1b).lt.1.d-14)cycle
+          if(dabs(coef)*dabs(int_j1b).lt.1.d-10)cycle
          B_center(1) = List_comb_thr_b3_cent(1,i_1s,j,i)
          B_center(2) = List_comb_thr_b3_cent(2,i_1s,j,i)
          B_center(3) = List_comb_thr_b3_cent(3,i_1s,j,i)
@ -461,7 +474,7 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_j1b2_test, (ao_num, ao_num, n_p

            expo_fit = expo_gauss_j_mu_1_erf(i_fit)
            call gaussian_product(expo_fit,r,beta,B_center,factor_ij_1s,beta_ij,center_ij_1s)
-            if(factor_ij_1s*dabs(coef*int_j1b)*dsqpi_3_2*beta_ij**(-3/2).lt.1.d-15)cycle
+            if(factor_ij_1s*dabs(coef*int_j1b)*dsqpi_3_2*beta_ij**(-1.5d0).lt.1.d-15)cycle
            coef_fit = coef_gauss_j_mu_1_erf(i_fit)

            alpha_1s     = beta + expo_fit
@ -471,9 +484,9 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_j1b2_test, (ao_num, ao_num, n_p
            centr_1s(3)  = alpha_1s_inv * (beta * B_center(3) + expo_fit * r(3))

            expo_coef_1s = beta * expo_fit * alpha_1s_inv * dist
-            if(expo_coef_1s .gt. 80.d0) cycle
+            if(expo_coef_1s .gt. 20.d0) cycle
            coef_tmp = coef * coef_fit * dexp(-expo_coef_1s)
-            if(dabs(coef_tmp) .lt. 1d-10) cycle
+            if(dabs(coef_tmp) .lt. 1d-08) cycle

            int_fit = NAI_pol_mult_erf_ao_with1s(i, j, alpha_1s, centr_1s,  1.d+9, r)

--- a/src/ao_many_one_e_ints/grad2_jmu_modif.irp.f
+++ b/src/ao_many_one_e_ints/grad2_jmu_modif.irp.f
@ -19,9 +19,11 @@ BEGIN_PROVIDER [ double precision, int2_grad1u2_grad2u2_j1b2, (ao_num, ao_num, n
  double precision, external    :: overlap_gauss_r12_ao
  double precision, external    :: overlap_gauss_r12_ao_with1s

-  provide mu_erf final_grid_points j1b_pen
+  print*, ' providing int2_grad1u2_grad2u2_j1b2 ...'
  call wall_time(wall0)

+  provide mu_erf final_grid_points j1b_pen
+
  int2_grad1u2_grad2u2_j1b2 = 0.d0

 !$OMP PARALLEL DEFAULT (NONE)                                      &
@ -88,7 +90,7 @@ BEGIN_PROVIDER [ double precision, int2_grad1u2_grad2u2_j1b2, (ao_num, ao_num, n
  enddo

  call wall_time(wall1)
-  print*, ' wall time for int2_grad1u2_grad2u2_j1b2', wall1 - wall0
+  print*, ' wall time for int2_grad1u2_grad2u2_j1b2 =', wall1 - wall0

 END_PROVIDER 

@ -111,9 +113,11 @@ BEGIN_PROVIDER [ double precision, int2_u2_j1b2, (ao_num, ao_num, n_points_final
  double precision, external    :: overlap_gauss_r12_ao
  double precision, external    :: overlap_gauss_r12_ao_with1s

-  provide mu_erf final_grid_points j1b_pen
+  print*, ' providing int2_u2_j1b2 ...'
  call wall_time(wall0)

+  provide mu_erf final_grid_points j1b_pen
+
  int2_u2_j1b2 = 0.d0

 !$OMP PARALLEL DEFAULT (NONE)                                      &
@ -186,7 +190,7 @@ END_PROVIDER

 ! ---

-BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2, (3, ao_num, ao_num, n_points_final_grid)]
+BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2, (ao_num, ao_num, n_points_final_grid, 3)]

  BEGIN_DOC
  !
@ -202,9 +206,11 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2, (3, ao_num, ao_num, n_p
  double precision :: tmp_x, tmp_y, tmp_z
  double precision :: wall0, wall1

-  provide mu_erf final_grid_points j1b_pen
+  print*, ' providing int2_u_grad1u_x_j1b2 ...'
  call wall_time(wall0)

+  provide mu_erf final_grid_points j1b_pen
+
  int2_u_grad1u_x_j1b2 = 0.d0

 !$OMP PARALLEL DEFAULT (NONE)                                      &
@ -278,9 +284,9 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2, (3, ao_num, ao_num, n_p

        enddo

-        int2_u_grad1u_x_j1b2(1,j,i,ipoint) = tmp_x
-        int2_u_grad1u_x_j1b2(2,j,i,ipoint) = tmp_y
-        int2_u_grad1u_x_j1b2(3,j,i,ipoint) = tmp_z
+        int2_u_grad1u_x_j1b2(j,i,ipoint,1) = tmp_x
+        int2_u_grad1u_x_j1b2(j,i,ipoint,2) = tmp_y
+        int2_u_grad1u_x_j1b2(j,i,ipoint,3) = tmp_z
      enddo
    enddo
  enddo
@ -290,15 +296,15 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2, (3, ao_num, ao_num, n_p
  do ipoint = 1, n_points_final_grid
    do i = 2, ao_num
      do j = 1, i-1
-        int2_u_grad1u_x_j1b2(1,j,i,ipoint) = int2_u_grad1u_x_j1b2(1,i,j,ipoint)
-        int2_u_grad1u_x_j1b2(2,j,i,ipoint) = int2_u_grad1u_x_j1b2(2,i,j,ipoint)
-        int2_u_grad1u_x_j1b2(3,j,i,ipoint) = int2_u_grad1u_x_j1b2(3,i,j,ipoint)
+        int2_u_grad1u_x_j1b2(j,i,ipoint,1) = int2_u_grad1u_x_j1b2(i,j,ipoint,1)
+        int2_u_grad1u_x_j1b2(j,i,ipoint,2) = int2_u_grad1u_x_j1b2(i,j,ipoint,2)
+        int2_u_grad1u_x_j1b2(j,i,ipoint,3) = int2_u_grad1u_x_j1b2(i,j,ipoint,3)
      enddo
    enddo
  enddo

  call wall_time(wall1)
-  print*, ' wall time for int2_u_grad1u_x_j1b2', wall1 - wall0
+  print*, ' wall time for int2_u_grad1u_x_j1b2 = ', wall1 - wall0

 END_PROVIDER 

@ -320,9 +326,11 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_j1b2, (ao_num, ao_num, n_points
  double precision              :: wall0, wall1
  double precision, external    :: NAI_pol_mult_erf_ao_with1s

-  provide mu_erf final_grid_points j1b_pen
+  print*, ' providing int2_u_grad1u_j1b2 ...'
  call wall_time(wall0)

+  provide mu_erf final_grid_points j1b_pen
+
  int2_u_grad1u_j1b2 = 0.d0

 !$OMP PARALLEL DEFAULT (NONE)                                      &
--- a/src/ao_many_one_e_ints/grad2_jmu_modif_vect.irp.f
+++ b/src/ao_many_one_e_ints/grad2_jmu_modif_vect.irp.f
@ -241,7 +241,7 @@
 !
 !! ---
 !
-!BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2, (3, ao_num, ao_num, n_points_final_grid)]
+!BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2, (ao_num, ao_num, n_points_final_grid, 3)]
 !
 !  BEGIN_DOC
 !  !
@ -308,7 +308,7 @@
 !
 !          ! ---
 !
-!          int2_u_grad1u_x_j1b2(1,j,i,ipoint) += coef_fit * int_fit_v(ipoint,1)
+!          int2_u_grad1u_x_j1b2(j,i,ipoint,1) += coef_fit * int_fit_v(ipoint,1)
 !
 !          if(dabs(int_fit_v(ipoint,1)) .gt. 1d-10) then
 !            i_mask_grid1 += 1
@ -320,7 +320,7 @@
 !
 !          ! ---
 !
-!          int2_u_grad1u_x_j1b2(2,j,i,ipoint) += coef_fit * int_fit_v(ipoint,2)
+!          int2_u_grad1u_x_j1b2(j,i,ipoint,2) += coef_fit * int_fit_v(ipoint,2)
 !
 !          if(dabs(int_fit_v(ipoint,2)) .gt. 1d-10) then
 !            i_mask_grid2 += 1
@ -332,7 +332,7 @@
 !
 !          ! ---
 !
-!          int2_u_grad1u_x_j1b2(3,j,i,ipoint) += coef_fit * int_fit_v(ipoint,3)
+!          int2_u_grad1u_x_j1b2(j,i,ipoint,3) += coef_fit * int_fit_v(ipoint,3)
 !
 !          if(dabs(int_fit_v(ipoint,3)) .gt. 1d-10) then
 !            i_mask_grid3 += 1
@ -408,15 +408,15 @@
 !          call NAI_pol_x_mult_erf_ao_with1s_v(i, j, alpha_1s, centr_1s, n_points_final_grid, 1.d+9, r_mask_grid, n_points_final_grid, int_fit_v, n_points_final_grid, i_mask_grid)
 !
 !          do ipoint = 1, i_mask_grid1
-!            int2_u_grad1u_x_j1b2(1,j,i,n_mask_grid(ipoint,1)) += coef * dexp(-expo_coef_1s * dist(ipoint,1)) * int_fit_v(ipoint,1)
+!            int2_u_grad1u_x_j1b2(j,i,n_mask_grid(ipoint,1),1) += coef * dexp(-expo_coef_1s * dist(ipoint,1)) * int_fit_v(ipoint,1)
 !          enddo
 !
 !          do ipoint = 1, i_mask_grid2
-!            int2_u_grad1u_x_j1b2(2,j,i,n_mask_grid(ipoint,2)) += coef * dexp(-expo_coef_1s * dist(ipoint,2)) * int_fit_v(ipoint,2)
+!            int2_u_grad1u_x_j1b2(j,i,n_mask_grid(ipoint,2),2) += coef * dexp(-expo_coef_1s * dist(ipoint,2)) * int_fit_v(ipoint,2)
 !          enddo
 !
 !          do ipoint = 1, i_mask_grid3
-!            int2_u_grad1u_x_j1b2(3,j,i,n_mask_grid(ipoint,3)) += coef * dexp(-expo_coef_1s * dist(ipoint,3)) * int_fit_v(ipoint,3)
+!            int2_u_grad1u_x_j1b2(j,i,n_mask_grid(ipoint,3),3) += coef * dexp(-expo_coef_1s * dist(ipoint,3)) * int_fit_v(ipoint,3)
 !          enddo
 !
 !        enddo
@ -439,15 +439,15 @@
 !  do ipoint = 1, n_points_final_grid
 !    do i = 2, ao_num
 !      do j = 1, i-1
-!        int2_u_grad1u_x_j1b2(1,j,i,ipoint) = int2_u_grad1u_x_j1b2(1,i,j,ipoint)
-!        int2_u_grad1u_x_j1b2(2,j,i,ipoint) = int2_u_grad1u_x_j1b2(2,i,j,ipoint)
-!        int2_u_grad1u_x_j1b2(3,j,i,ipoint) = int2_u_grad1u_x_j1b2(3,i,j,ipoint)
+!        int2_u_grad1u_x_j1b2(j,i,ipoint,1) = int2_u_grad1u_x_j1b2(i,j,ipoint,1)
+!        int2_u_grad1u_x_j1b2(j,i,ipoint,2) = int2_u_grad1u_x_j1b2(i,j,ipoint,2)
+!        int2_u_grad1u_x_j1b2(j,i,ipoint,3) = int2_u_grad1u_x_j1b2(i,j,ipoint,3)
 !      enddo
 !    enddo
 !  enddo
 !
 !  call wall_time(wall1)
-!  print*, ' wall time for int2_u_grad1u_x_j1b2', wall1 - wall0
+!  print*, ' wall time for int2_u_grad1u_x_j1b2 =', wall1 - wall0
 !
 !END_PROVIDER
 !
--- a/src/ao_many_one_e_ints/grad_lapl_jmu_manu.irp.f
+++ b/src/ao_many_one_e_ints/grad_lapl_jmu_manu.irp.f
@ -17,7 +17,10 @@ BEGIN_PROVIDER [ double precision, v_ij_erf_rk_cst_mu_j1b_test, (ao_num, ao_num,
  double precision           :: wall0, wall1
  double precision, external :: NAI_pol_mult_erf_ao_with1s
  double precision :: sigma_ij,dist_ij_ipoint,dsqpi_3_2
-  dsqpi_3_2 = (dacos(-1.d0))**(3/2)
+
+  print*, ' providing v_ij_erf_rk_cst_mu_j1b_test ...'
+
+  dsqpi_3_2 = (dacos(-1.d0))**(1.5d0)
  provide mu_erf final_grid_points j1b_pen
  call wall_time(wall0)

@ -38,7 +41,7 @@ BEGIN_PROVIDER [ double precision, v_ij_erf_rk_cst_mu_j1b_test, (ao_num, ao_num,

    do i = 1, ao_num
      do j = i, ao_num
-        if(dabs(ao_overlap_abs_grid(j,i)).lt.1.d-12)cycle
+        if(dabs(ao_overlap_abs_grid(j,i)).lt.1.d-20)cycle

        tmp = 0.d0
        do i_1s = 1, List_comb_thr_b2_size(j,i)
@ -46,7 +49,7 @@ BEGIN_PROVIDER [ double precision, v_ij_erf_rk_cst_mu_j1b_test, (ao_num, ao_num,
          coef        = List_comb_thr_b2_coef  (i_1s,j,i)
          beta        = List_comb_thr_b2_expo  (i_1s,j,i)
          int_j1b = ao_abs_comb_b2_j1b(i_1s,j,i)
-          if(dabs(coef)*dabs(int_j1b).lt.1.d-14)cycle
+          if(dabs(coef)*dabs(int_j1b).lt.1.d-10)cycle
          B_center(1) = List_comb_thr_b2_cent(1,i_1s,j,i)
          B_center(2) = List_comb_thr_b2_cent(2,i_1s,j,i)
          B_center(3) = List_comb_thr_b2_cent(3,i_1s,j,i)
@ -85,54 +88,28 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_j1b_test, (ao_num, ao_nu
  ! int dr x phi_i(r) phi_j(r) 1s_j1b(r) (erf(mu(R) |r - R|) - 1)/|r - R|
  END_DOC

-  implicit none
-  integer          :: i, j, ipoint
-  double precision :: wall0, wall1
-
-  call wall_time(wall0)
-
-  do ipoint = 1, n_points_final_grid
-    do i = 1, ao_num
-      do j = 1, ao_num
-        x_v_ij_erf_rk_cst_mu_j1b_test(j,i,ipoint,1) = x_v_ij_erf_rk_cst_mu_tmp_j1b_test(1,j,i,ipoint)
-        x_v_ij_erf_rk_cst_mu_j1b_test(j,i,ipoint,2) = x_v_ij_erf_rk_cst_mu_tmp_j1b_test(2,j,i,ipoint)
-        x_v_ij_erf_rk_cst_mu_j1b_test(j,i,ipoint,3) = x_v_ij_erf_rk_cst_mu_tmp_j1b_test(3,j,i,ipoint)
-      enddo
-    enddo
-  enddo
-
-  call wall_time(wall1)
-  print*, ' wall time for x_v_ij_erf_rk_cst_mu_j1b_test', wall1 - wall0
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_tmp_j1b_test, (3, ao_num, ao_num, n_points_final_grid)]
-
-  BEGIN_DOC
-  ! int dr x phi_i(r) phi_j(r) 1s_j1b(r) (erf(mu(R) |r - R|) - 1)/|r - R|
-  END_DOC
-
  implicit none
  integer          :: i, j, ipoint, i_1s
  double precision :: coef, beta, B_center(3), r(3), ints(3), ints_coulomb(3)
  double precision :: tmp_x, tmp_y, tmp_z
  double precision :: wall0, wall1
  double precision :: sigma_ij,dist_ij_ipoint,dsqpi_3_2,int_j1b,factor_ij_1s,beta_ij,center_ij_1s
-  dsqpi_3_2 = (dacos(-1.d0))**(3/2)
+
+  print*, ' providing x_v_ij_erf_rk_cst_mu_j1b_test ...'
+
+  dsqpi_3_2 = (dacos(-1.d0))**(1.5d0)

  provide expo_erfc_mu_gauss ao_prod_sigma ao_prod_center
  call wall_time(wall0)

-  x_v_ij_erf_rk_cst_mu_tmp_j1b_test = 0.d0
+  x_v_ij_erf_rk_cst_mu_j1b_test = 0.d0

 !$OMP PARALLEL DEFAULT (NONE)                                                        &
 !$OMP PRIVATE (ipoint, i, j, i_1s, r, coef, beta, B_center, ints, ints_coulomb,      & 
 !$OMP          int_j1b, tmp_x, tmp_y, tmp_z,factor_ij_1s,beta_ij,center_ij_1s)       & 
 !$OMP SHARED  (n_points_final_grid, ao_num, List_comb_thr_b2_size, final_grid_points,&
 !$OMP          List_comb_thr_b2_coef, List_comb_thr_b2_expo, List_comb_thr_b2_cent,  &
- !$OMP          x_v_ij_erf_rk_cst_mu_tmp_j1b_test, mu_erf,ao_abs_comb_b2_j1b,         &
+ !$OMP          x_v_ij_erf_rk_cst_mu_j1b_test, mu_erf,ao_abs_comb_b2_j1b,         &
 !$OMP          ao_overlap_abs_grid,ao_prod_center,ao_prod_sigma)
 ! !$OMP          ao_overlap_abs_grid,ao_prod_center,ao_prod_sigma,dsqpi_3_2,expo_erfc_mu_gauss)
 !$OMP DO
@ -143,7 +120,7 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_tmp_j1b_test, (3, ao_num

    do i = 1, ao_num
      do j = i, ao_num
-        if(dabs(ao_overlap_abs_grid(j,i)).lt.1.d-12)cycle
+        if(dabs(ao_overlap_abs_grid(j,i)).lt.1.d-10)cycle

        tmp_x = 0.d0
        tmp_y = 0.d0
@ -153,7 +130,7 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_tmp_j1b_test, (3, ao_num
          coef        = List_comb_thr_b2_coef  (i_1s,j,i)
          beta        = List_comb_thr_b2_expo  (i_1s,j,i)
          int_j1b = ao_abs_comb_b2_j1b(i_1s,j,i)
-          if(dabs(coef)*dabs(int_j1b).lt.1.d-14)cycle
+          if(dabs(coef)*dabs(int_j1b).lt.1.d-10)cycle
          B_center(1) = List_comb_thr_b2_cent(1,i_1s,j,i)
          B_center(2) = List_comb_thr_b2_cent(2,i_1s,j,i)
          B_center(3) = List_comb_thr_b2_cent(3,i_1s,j,i)
@ -164,7 +141,7 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_tmp_j1b_test, (3, ao_num
 !           call gaussian_product(expo_erfc_mu_gauss,r,     &
 !                ao_prod_sigma(j,i),ao_prod_center(1,j,i),  & 
 !                factor_ij_1s,beta_ij,center_ij_1s)
-!           if(dabs(coef * factor_ij_1s*int_j1b*10.d0 * dsqpi_3_2 * beta_ij**(-3/2)).lt.1.d-10)cycle 
+!           if(dabs(coef * factor_ij_1s*int_j1b*10.d0 * dsqpi_3_2 * beta_ij**(-1.5d0)).lt.1.d-10)cycle 
 !          endif
          call NAI_pol_x_mult_erf_ao_with1s(i, j, beta, B_center, mu_erf, r, ints        )
          call NAI_pol_x_mult_erf_ao_with1s(i, j, beta, B_center,  1.d+9, r, ints_coulomb)
@ -174,9 +151,9 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_tmp_j1b_test, (3, ao_num
          tmp_z += coef * (ints(3) - ints_coulomb(3))
        enddo

-        x_v_ij_erf_rk_cst_mu_tmp_j1b_test(1,j,i,ipoint) = tmp_x
-        x_v_ij_erf_rk_cst_mu_tmp_j1b_test(2,j,i,ipoint) = tmp_y
-        x_v_ij_erf_rk_cst_mu_tmp_j1b_test(3,j,i,ipoint) = tmp_z
+        x_v_ij_erf_rk_cst_mu_j1b_test(j,i,ipoint,1) = tmp_x
+        x_v_ij_erf_rk_cst_mu_j1b_test(j,i,ipoint,2) = tmp_y
+        x_v_ij_erf_rk_cst_mu_j1b_test(j,i,ipoint,3) = tmp_z
      enddo
    enddo
  enddo
@ -186,15 +163,15 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_tmp_j1b_test, (3, ao_num
  do ipoint = 1, n_points_final_grid
    do i = 2, ao_num
      do j = 1, i-1
-        x_v_ij_erf_rk_cst_mu_tmp_j1b_test(1,j,i,ipoint) = x_v_ij_erf_rk_cst_mu_tmp_j1b_test(1,i,j,ipoint)
-        x_v_ij_erf_rk_cst_mu_tmp_j1b_test(2,j,i,ipoint) = x_v_ij_erf_rk_cst_mu_tmp_j1b_test(2,i,j,ipoint)
-        x_v_ij_erf_rk_cst_mu_tmp_j1b_test(3,j,i,ipoint) = x_v_ij_erf_rk_cst_mu_tmp_j1b_test(3,i,j,ipoint)
+        x_v_ij_erf_rk_cst_mu_j1b_test(j,i,ipoint,1) = x_v_ij_erf_rk_cst_mu_j1b_test(i,j,ipoint,1)
+        x_v_ij_erf_rk_cst_mu_j1b_test(j,i,ipoint,2) = x_v_ij_erf_rk_cst_mu_j1b_test(i,j,ipoint,2)
+        x_v_ij_erf_rk_cst_mu_j1b_test(j,i,ipoint,3) = x_v_ij_erf_rk_cst_mu_j1b_test(i,j,ipoint,3)
      enddo
    enddo
  enddo

  call wall_time(wall1)
-  print*, ' wall time for x_v_ij_erf_rk_cst_mu_tmp_j1b_test', wall1 - wall0
+  print*, ' wall time for x_v_ij_erf_rk_cst_mu_j1b_test', wall1 - wall0

 END_PROVIDER 

@ -218,7 +195,10 @@ BEGIN_PROVIDER [ double precision, v_ij_u_cst_mu_j1b_test, (ao_num, ao_num, n_po

  double precision, external :: overlap_gauss_r12_ao_with1s
  double precision :: sigma_ij,dist_ij_ipoint,dsqpi_3_2,int_j1b
-  dsqpi_3_2 = (dacos(-1.d0))**(3/2)
+
+  print*, ' providing v_ij_u_cst_mu_j1b_test ...'
+
+  dsqpi_3_2 = (dacos(-1.d0))**(1.5d0)

  provide mu_erf final_grid_points j1b_pen
  call wall_time(wall0)
@ -244,7 +224,7 @@ BEGIN_PROVIDER [ double precision, v_ij_u_cst_mu_j1b_test, (ao_num, ao_num, n_po

    do i = 1, ao_num
      do j = i, ao_num
-        if(dabs(ao_overlap_abs_grid(j,i)).lt.1.d-12)cycle
+        if(dabs(ao_overlap_abs_grid(j,i)).lt.1.d-20)cycle

        tmp = 0.d0
        do i_1s = 1, List_comb_thr_b2_size(j,i)
@ -252,7 +232,7 @@ BEGIN_PROVIDER [ double precision, v_ij_u_cst_mu_j1b_test, (ao_num, ao_num, n_po
          coef        = List_comb_thr_b2_coef  (i_1s,j,i)
          beta        = List_comb_thr_b2_expo  (i_1s,j,i)
          int_j1b = ao_abs_comb_b2_j1b(i_1s,j,i)
-          if(dabs(coef)*dabs(int_j1b).lt.1.d-14)cycle
+          if(dabs(coef)*dabs(int_j1b).lt.1.d-10)cycle
          B_center(1) = List_comb_thr_b2_cent(1,i_1s,j,i)
          B_center(2) = List_comb_thr_b2_cent(2,i_1s,j,i)
          B_center(3) = List_comb_thr_b2_cent(3,i_1s,j,i)
@ -311,7 +291,7 @@ BEGIN_PROVIDER [ double precision, v_ij_u_cst_mu_j1b_ng_1_test, (ao_num, ao_num,

  double precision, external :: overlap_gauss_r12_ao_with1s
  double precision :: sigma_ij,dist_ij_ipoint,dsqpi_3_2,int_j1b
-  dsqpi_3_2 = (dacos(-1.d0))**(3/2)
+  dsqpi_3_2 = (dacos(-1.d0))**(1.5d0)

  provide mu_erf final_grid_points j1b_pen
  call wall_time(wall0)
@ -337,7 +317,7 @@ BEGIN_PROVIDER [ double precision, v_ij_u_cst_mu_j1b_ng_1_test, (ao_num, ao_num,

    do i = 1, ao_num
      do j = i, ao_num
-        if(dabs(ao_overlap_abs_grid(j,i)).lt.1.d-12)cycle
+        if(dabs(ao_overlap_abs_grid(j,i)).lt.1.d-20)cycle

        tmp = 0.d0
        do i_1s = 1, List_comb_thr_b2_size(j,i)
@ -345,7 +325,7 @@ BEGIN_PROVIDER [ double precision, v_ij_u_cst_mu_j1b_ng_1_test, (ao_num, ao_num,
          coef        = List_comb_thr_b2_coef  (i_1s,j,i)
          beta        = List_comb_thr_b2_expo  (i_1s,j,i)
          int_j1b = ao_abs_comb_b2_j1b(i_1s,j,i)
-          if(dabs(coef)*dabs(int_j1b).lt.1.d-14)cycle
+          if(dabs(coef)*dabs(int_j1b).lt.1.d-10)cycle
          B_center(1) = List_comb_thr_b2_cent(1,i_1s,j,i)
          B_center(2) = List_comb_thr_b2_cent(2,i_1s,j,i)
          B_center(3) = List_comb_thr_b2_cent(3,i_1s,j,i)
--- a/src/ao_many_one_e_ints/grad_lapl_jmu_modif.irp.f
+++ b/src/ao_many_one_e_ints/grad_lapl_jmu_modif.irp.f
@ -17,9 +17,11 @@ BEGIN_PROVIDER [ double precision, v_ij_erf_rk_cst_mu_j1b, (ao_num, ao_num, n_po
  double precision           :: wall0, wall1
  double precision, external :: NAI_pol_mult_erf_ao_with1s

-  provide mu_erf final_grid_points j1b_pen
+  print *, ' providing v_ij_erf_rk_cst_mu_j1b ...'
  call wall_time(wall0)

+  provide mu_erf final_grid_points j1b_pen
+
  v_ij_erf_rk_cst_mu_j1b = 0.d0

 !$OMP PARALLEL DEFAULT (NONE)                                                         &
@ -99,51 +101,23 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_j1b, (ao_num, ao_num, n_
  ! int dr x phi_i(r) phi_j(r) 1s_j1b(r) (erf(mu(R) |r - R|) - 1)/|r - R|
  END_DOC

-  implicit none
-  integer          :: i, j, ipoint
-  double precision :: wall0, wall1
-
-  call wall_time(wall0)
-
-  do ipoint = 1, n_points_final_grid
-    do i = 1, ao_num
-      do j = 1, ao_num
-        x_v_ij_erf_rk_cst_mu_j1b(j,i,ipoint,1) = x_v_ij_erf_rk_cst_mu_tmp_j1b(1,j,i,ipoint)
-        x_v_ij_erf_rk_cst_mu_j1b(j,i,ipoint,2) = x_v_ij_erf_rk_cst_mu_tmp_j1b(2,j,i,ipoint)
-        x_v_ij_erf_rk_cst_mu_j1b(j,i,ipoint,3) = x_v_ij_erf_rk_cst_mu_tmp_j1b(3,j,i,ipoint)
-      enddo
-    enddo
-  enddo
-
-  call wall_time(wall1)
-  print*, ' wall time for x_v_ij_erf_rk_cst_mu_j1b', wall1 - wall0
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_tmp_j1b, (3, ao_num, ao_num, n_points_final_grid)]
-
-  BEGIN_DOC
-  ! int dr x phi_i(r) phi_j(r) 1s_j1b(r) (erf(mu(R) |r - R|) - 1)/|r - R|
-  END_DOC
-
  implicit none
  integer          :: i, j, ipoint, i_1s
  double precision :: coef, beta, B_center(3), r(3), ints(3), ints_coulomb(3)
  double precision :: tmp_x, tmp_y, tmp_z
  double precision :: wall0, wall1

+  print*, ' providing x_v_ij_erf_rk_cst_mu_j1b ...'
  call wall_time(wall0)

-  x_v_ij_erf_rk_cst_mu_tmp_j1b = 0.d0
+  x_v_ij_erf_rk_cst_mu_j1b = 0.d0

 !$OMP PARALLEL DEFAULT (NONE)                                                        &
 !$OMP PRIVATE (ipoint, i, j, i_1s, r, coef, beta, B_center, ints, ints_coulomb,      & 
 !$OMP          tmp_x, tmp_y, tmp_z)                                                  & 
 !$OMP SHARED  (n_points_final_grid, ao_num, List_all_comb_b2_size, final_grid_points,&
 !$OMP          List_all_comb_b2_coef, List_all_comb_b2_expo, List_all_comb_b2_cent,  &
- !$OMP          x_v_ij_erf_rk_cst_mu_tmp_j1b, mu_erf)
+ !$OMP          x_v_ij_erf_rk_cst_mu_j1b, mu_erf)
 !$OMP DO
  !do ipoint = 1, 10
  do ipoint = 1, n_points_final_grid
@ -195,9 +169,9 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_tmp_j1b, (3, ao_num, ao_

        ! ---

-        x_v_ij_erf_rk_cst_mu_tmp_j1b(1,j,i,ipoint) = tmp_x
-        x_v_ij_erf_rk_cst_mu_tmp_j1b(2,j,i,ipoint) = tmp_y
-        x_v_ij_erf_rk_cst_mu_tmp_j1b(3,j,i,ipoint) = tmp_z
+        x_v_ij_erf_rk_cst_mu_j1b(j,i,ipoint,1) = tmp_x
+        x_v_ij_erf_rk_cst_mu_j1b(j,i,ipoint,2) = tmp_y
+        x_v_ij_erf_rk_cst_mu_j1b(j,i,ipoint,3) = tmp_z
      enddo
    enddo
  enddo
@ -207,15 +181,15 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_tmp_j1b, (3, ao_num, ao_
  do ipoint = 1, n_points_final_grid
    do i = 2, ao_num
      do j = 1, i-1
-        x_v_ij_erf_rk_cst_mu_tmp_j1b(1,j,i,ipoint) = x_v_ij_erf_rk_cst_mu_tmp_j1b(1,i,j,ipoint)
-        x_v_ij_erf_rk_cst_mu_tmp_j1b(2,j,i,ipoint) = x_v_ij_erf_rk_cst_mu_tmp_j1b(2,i,j,ipoint)
-        x_v_ij_erf_rk_cst_mu_tmp_j1b(3,j,i,ipoint) = x_v_ij_erf_rk_cst_mu_tmp_j1b(3,i,j,ipoint)
+        x_v_ij_erf_rk_cst_mu_j1b(j,i,ipoint,1) = x_v_ij_erf_rk_cst_mu_j1b(i,j,ipoint,1)
+        x_v_ij_erf_rk_cst_mu_j1b(j,i,ipoint,2) = x_v_ij_erf_rk_cst_mu_j1b(i,j,ipoint,2)
+        x_v_ij_erf_rk_cst_mu_j1b(j,i,ipoint,3) = x_v_ij_erf_rk_cst_mu_j1b(i,j,ipoint,3)
      enddo
    enddo
  enddo

  call wall_time(wall1)
-  print*, ' wall time for x_v_ij_erf_rk_cst_mu_tmp_j1b', wall1 - wall0
+  print*, ' wall time for x_v_ij_erf_rk_cst_mu_j1b =', wall1 - wall0

 END_PROVIDER 

@ -239,9 +213,11 @@ BEGIN_PROVIDER [ double precision, v_ij_u_cst_mu_j1b, (ao_num, ao_num, n_points_

  double precision, external :: overlap_gauss_r12_ao_with1s

-  provide mu_erf final_grid_points j1b_pen
+  print*, ' providing v_ij_u_cst_mu_j1b ...'
  call wall_time(wall0)

+  provide mu_erf final_grid_points j1b_pen
+
  v_ij_u_cst_mu_j1b = 0.d0

 !$OMP PARALLEL DEFAULT (NONE)                                      &
--- a/src/ao_many_one_e_ints/grad_related_ints.irp.f
+++ b/src/ao_many_one_e_ints/grad_related_ints.irp.f
@ -17,6 +17,8 @@ BEGIN_PROVIDER [ double precision, v_ij_erf_rk_cst_mu, (ao_num, ao_num, n_points

  double precision :: NAI_pol_mult_erf_ao

+  print*, ' providing v_ij_erf_rk_cst_mu ...'
+
  provide mu_erf final_grid_points 
  call wall_time(wall0)

@ -54,7 +56,7 @@ BEGIN_PROVIDER [ double precision, v_ij_erf_rk_cst_mu, (ao_num, ao_num, n_points
  enddo
 
  call wall_time(wall1)
-  print*, ' wall time for v_ij_erf_rk_cst_mu  ', wall1 - wall0
+  print*, ' wall time for v_ij_erf_rk_cst_mu = ', wall1 - wall0

 END_PROVIDER 

@ -73,6 +75,8 @@ BEGIN_PROVIDER [ double precision, v_ij_erf_rk_cst_mu_transp, (n_points_final_gr
  double precision :: wall0, wall1
  double precision :: NAI_pol_mult_erf_ao

+  print *, ' providing v_ij_erf_rk_cst_mu_transp ...'
+
  provide mu_erf final_grid_points 
  call wall_time(wall0)

@ -107,7 +111,7 @@ BEGIN_PROVIDER [ double precision, v_ij_erf_rk_cst_mu_transp, (n_points_final_gr
  enddo

  call wall_time(wall1)
-  print *, ' wall time for v_ij_erf_rk_cst_mu_transp  ', wall1 - wall0
+  print *, ' wall time for v_ij_erf_rk_cst_mu_transp = ', wall1 - wall0

 END_PROVIDER 

@ -124,6 +128,8 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_tmp, (3, ao_num, ao_num,
  double precision :: r(3), ints(3), ints_coulomb(3)
  double precision :: wall0, wall1

+  print*, ' providing x_v_ij_erf_rk_cst_mu_tmp ...'
+
  call wall_time(wall0)

 !$OMP PARALLEL                                 &
@ -162,7 +168,7 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_tmp, (3, ao_num, ao_num,
  enddo

  call wall_time(wall1)
-  print*, ' wall time for x_v_ij_erf_rk_cst_mu_tmp', wall1 - wall0
+  print *, ' wall time for x_v_ij_erf_rk_cst_mu_tmp = ', wall1 - wall0

 END_PROVIDER 

@ -178,6 +184,8 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu, (ao_num, ao_num,n_point
  integer          :: i, j, ipoint
  double precision :: wall0, wall1

+  print *, ' providing x_v_ij_erf_rk_cst_mu ...'
+
  call wall_time(wall0)

  do ipoint = 1, n_points_final_grid
@ -191,7 +199,7 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu, (ao_num, ao_num,n_point
  enddo

  call wall_time(wall1)
-  print *, ' wall time for x_v_ij_erf_rk_cst_mu', wall1 - wall0
+  print *, ' wall time for x_v_ij_erf_rk_cst_mu = ', wall1 - wall0

 END_PROVIDER 

@ -207,6 +215,8 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_transp, (ao_num, ao_num,
  integer          :: i, j, ipoint
  double precision :: wall0, wall1

+  print *, ' providing x_v_ij_erf_rk_cst_mu_transp ...'
+
  call wall_time(wall0)

  do ipoint = 1, n_points_final_grid
@ -220,7 +230,7 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_transp, (ao_num, ao_num,
  enddo

  call wall_time(wall1)
-  print *, ' wall time for x_v_ij_erf_rk_cst_mu_transp', wall1 - wall0
+  print *, ' wall time for x_v_ij_erf_rk_cst_mu_transp = ', wall1 - wall0

 END_PROVIDER 

@ -236,6 +246,8 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_transp_bis, (n_points_fi
  integer          :: i, j, ipoint
  double precision :: wall0, wall1

+  print *, ' providing x_v_ij_erf_rk_cst_mu_transp_bis ...'
+
  call wall_time(wall0)

  do i = 1, ao_num
@ -249,7 +261,7 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_transp_bis, (n_points_fi
  enddo

  call wall_time(wall1)
-  print *, ' wall time for x_v_ij_erf_rk_cst_mu_transp_bis', wall1 - wall0
+  print *, ' wall time for x_v_ij_erf_rk_cst_mu_transp_bis = ', wall1 - wall0

 END_PROVIDER 

@ -268,6 +280,8 @@ BEGIN_PROVIDER [ double precision, d_dx_v_ij_erf_rk_cst_mu_tmp, (3, n_points_fin
 double precision :: r(3), ints(3), ints_coulomb(3)
 double precision :: wall0, wall1

+  print *, ' providing d_dx_v_ij_erf_rk_cst_mu_tmp ...'
+
  call wall_time(wall0)

 !$OMP PARALLEL                                 &
@ -295,7 +309,7 @@ BEGIN_PROVIDER [ double precision, d_dx_v_ij_erf_rk_cst_mu_tmp, (3, n_points_fin
 !$OMP END PARALLEL

  call wall_time(wall1)
-  print *, ' wall time for d_dx_v_ij_erf_rk_cst_mu_tmp', wall1 - wall0
+  print *, ' wall time for d_dx_v_ij_erf_rk_cst_mu_tmp = ', wall1 - wall0

 END_PROVIDER 

@ -315,6 +329,8 @@ BEGIN_PROVIDER [ double precision, d_dx_v_ij_erf_rk_cst_mu, (n_points_final_grid
  integer          :: i, j, ipoint
  double precision :: wall0, wall1

+  print *, ' providing d_dx_v_ij_erf_rk_cst_mu ...'
+
  call wall_time(wall0)
  do i = 1, ao_num
    do j = 1, ao_num
@ -327,7 +343,7 @@ BEGIN_PROVIDER [ double precision, d_dx_v_ij_erf_rk_cst_mu, (n_points_final_grid
  enddo

  call wall_time(wall1)
-  print *, ' wall time for d_dx_v_ij_erf_rk_cst_mu', wall1 - wall0
+  print *, ' wall time for d_dx_v_ij_erf_rk_cst_mu = ', wall1 - wall0

 END_PROVIDER 

@ -348,6 +364,8 @@ BEGIN_PROVIDER [ double precision, x_d_dx_v_ij_erf_rk_cst_mu_tmp, (3, n_points_f
  double precision :: r(3), ints(3), ints_coulomb(3)
  double precision :: wall0, wall1

+  print *, ' providing x_d_dx_v_ij_erf_rk_cst_mu_tmp ...'
+
  call wall_time(wall0)

 !$OMP PARALLEL                                 &
@ -375,7 +393,7 @@ BEGIN_PROVIDER [ double precision, x_d_dx_v_ij_erf_rk_cst_mu_tmp, (3, n_points_f
 !$OMP END PARALLEL

  call wall_time(wall1)
-  print *, ' wall time for x_d_dx_v_ij_erf_rk_cst_mu_tmp', wall1 - wall0
+  print *, ' wall time for x_d_dx_v_ij_erf_rk_cst_mu_tmp = ', wall1 - wall0

 END_PROVIDER 

@ -395,6 +413,8 @@ BEGIN_PROVIDER [ double precision, x_d_dx_v_ij_erf_rk_cst_mu, (n_points_final_gr
  integer          :: i, j, ipoint
  double precision :: wall0, wall1

+  print *, ' providing x_d_dx_v_ij_erf_rk_cst_mu ...'
+
  call wall_time(wall0)

  do i = 1, ao_num
@ -408,7 +428,7 @@ BEGIN_PROVIDER [ double precision, x_d_dx_v_ij_erf_rk_cst_mu, (n_points_final_gr
  enddo
 
  call wall_time(wall1)
-  print *, ' wall time for x_d_dx_v_ij_erf_rk_cst_mu', wall1 - wall0
+  print *, ' wall time for x_d_dx_v_ij_erf_rk_cst_mu = ', wall1 - wall0

 END_PROVIDER 

--- a/src/ao_many_one_e_ints/listj1b.irp.f
+++ b/src/ao_many_one_e_ints/listj1b.irp.f
@ -102,6 +102,12 @@ END_PROVIDER
    List_all_comb_b2_coef(i) = (-1.d0)**dble(phase) * dexp(-List_all_comb_b2_coef(i))
  enddo

+  print *, ' coeff, expo & cent of list b2'
+  do i = 1, List_all_comb_b2_size
+    print*, i, List_all_comb_b2_coef(i), List_all_comb_b2_expo(i)
+    print*, List_all_comb_b2_cent(1,i), List_all_comb_b2_cent(2,i), List_all_comb_b2_cent(3,i)
+  enddo
+
 END_PROVIDER

 ! ---
@ -219,9 +225,11 @@ END_PROVIDER
    List_all_comb_b3_coef(i) = (-1.d0)**dble(phase) * facto * dexp(-List_all_comb_b3_coef(i))
  enddo

-  print *, ' 1st coeff & expo of lists'
-  print*, List_all_comb_b2_coef(1), List_all_comb_b2_expo(1)
-  print*, List_all_comb_b3_coef(1), List_all_comb_b3_expo(1)
+  print *, ' coeff, expo & cent of list b3'
+  do i = 1, List_all_comb_b3_size
+    print*, i, List_all_comb_b3_coef(i), List_all_comb_b3_expo(i)
+    print*, List_all_comb_b3_cent(1,i), List_all_comb_b3_cent(2,i), List_all_comb_b3_cent(3,i)
+  enddo

 END_PROVIDER

--- a/src/ao_one_e_ints/pot_ao_ints.irp.f
+++ b/src/ao_one_e_ints/pot_ao_ints.irp.f
@ -18,6 +18,8 @@ BEGIN_PROVIDER [ double precision, ao_integrals_n_e, (ao_num,ao_num)]
  double precision :: A_center(3),B_center(3),C_center(3)
  double precision :: overlap_x,overlap_y,overlap_z,overlap,dx,NAI_pol_mult

+  ao_integrals_n_e = 0.d0
+
  if (read_ao_integrals_n_e) then

    call ezfio_get_ao_one_e_ints_ao_integrals_n_e(ao_integrals_n_e)
@ -36,8 +38,6 @@ BEGIN_PROVIDER [ double precision, ao_integrals_n_e, (ao_num,ao_num)]

    else

-      ao_integrals_n_e = 0.d0
-
      !$OMP PARALLEL                                                   &
          !$OMP DEFAULT (NONE)                                         &
          !$OMP PRIVATE (i,j,k,l,m,alpha,beta,A_center,B_center,C_center,power_A,power_B,&
--- a/src/ao_one_e_ints/pseudopot.f90
+++ b/src/ao_one_e_ints/pseudopot.f90
@ -1950,26 +1950,26 @@ xq(17)=-3.34785456738322
 xq(18)=-3.94476404011563
 xq(19)=-4.60368244955074
 xq(20)=-5.38748089001123
-wq(1)=  2.229393645534151E-013
-wq(2)=  4.399340992273176E-010
-wq(3)=  1.086069370769280E-007
-wq(4)=  7.802556478532063E-006
-wq(5)=  2.283386360163528E-004
-wq(6)=  3.243773342237853E-003
-wq(7)=  2.481052088746362E-002
+wq(1)=  2.229393645534151D-013
+wq(2)=  4.399340992273176D-010
+wq(3)=  1.086069370769280D-007
+wq(4)=  7.802556478532063D-006
+wq(5)=  2.283386360163528D-004
+wq(6)=  3.243773342237853D-003
+wq(7)=  2.481052088746362D-002
 wq(8)=  0.109017206020022
 wq(9)=  0.286675505362834
 wq(10)= 0.462243669600610
 wq(11)= 0.462243669600610
 wq(12)= 0.286675505362834
 wq(13)= 0.109017206020022
-wq(14)= 2.481052088746362E-002
-wq(15)= 3.243773342237853E-003
-wq(16)= 2.283386360163528E-004
-wq(17)= 7.802556478532063E-006
-wq(18)= 1.086069370769280E-007
-wq(19)= 4.399340992273176E-010
-wq(20)= 2.229393645534151E-013
+wq(14)= 2.481052088746362D-002
+wq(15)= 3.243773342237853D-003
+wq(16)= 2.283386360163528D-004
+wq(17)= 7.802556478532063D-006
+wq(18)= 1.086069370769280D-007
+wq(19)= 4.399340992273176D-010
+wq(20)= 2.229393645534151D-013

      npts=20
 !      call gauher(xq,wq,npts)
--- a/src/ao_tc_eff_map/fit_j.irp.f
+++ b/src/ao_tc_eff_map/fit_j.irp.f
@ -123,6 +123,36 @@ END_PROVIDER
      expo_gauss_j_mu_x(i) = tmp * expo_gauss_j_mu_x(i)
    enddo

+  elseif(ng_fit_jast .eq. 7) then
+
+    coef_gauss_j_mu_x = (/ -0.01756495d0 , -0.01023623d0  , -0.06548959d0  , -0.03539446d0  , -0.17150646d0  , -0.15071096d0  , -0.11326834d0   /)
+    expo_gauss_j_mu_x = (/ 9.88572565d+02,  1.21363371d+04,  3.69794870d+01,  1.67364529d+02,  3.03962934d+00,  1.27854005d+00,  9.76383343d+00 /)
+
+    tmp = mu_erf * mu_erf
+    do i = 1, ng_fit_jast
+      expo_gauss_j_mu_x(i) = tmp * expo_gauss_j_mu_x(i)
+    enddo
+
+  elseif(ng_fit_jast .eq. 8) then
+
+    coef_gauss_j_mu_x = (/ -0.11489205d0 , -0.16008968d0 , -0.12892456d0 , -0.04250838d0 , -0.0718451d0  , -0.02394051d0 , -0.00913353d0 , -0.01285182d0  /)
+    expo_gauss_j_mu_x = (/ 6.97632442d+00, 2.56010878d+00, 1.22760977d+00, 7.47697124d+01, 2.16104215d+01, 2.96549728d+02, 1.40773328d+04, 1.43335159d+03 /)
+
+    tmp = mu_erf * mu_erf
+    do i = 1, ng_fit_jast
+      expo_gauss_j_mu_x(i) = tmp * expo_gauss_j_mu_x(i)
+    enddo
+
+  !elseif(ng_fit_jast .eq. 9) then
+
+  !  coef_gauss_j_mu_x = (/ /)
+  !  expo_gauss_j_mu_x = (/ /)
+
+  !  tmp = mu_erf * mu_erf
+  !  do i = 1, ng_fit_jast
+  !    expo_gauss_j_mu_x(i) = tmp * expo_gauss_j_mu_x(i)
+  !  enddo
+
  elseif(ng_fit_jast .eq. 20) then

    ASSERT(n_max_fit_slat == 20)
@ -224,6 +254,36 @@ END_PROVIDER
      expo_gauss_j_mu_x_2(i) = tmp * expo_gauss_j_mu_x_2(i)
    enddo

+  elseif(ng_fit_jast .eq. 7) then
+
+    coef_gauss_j_mu_x_2 = (/ 0.05202849d0  , 0.01031081d0  , 0.04699157d0  , 0.01451002d0  , 0.07442576d0  , 0.02692033d0  , 0.09311842d0   /)
+    expo_gauss_j_mu_x_2 = (/ 3.04469415d+00, 1.40682034d+04, 7.45960945d+01, 1.43067466d+03, 2.16815661d+01, 2.95750306d+02, 7.23471236d+00 /)
+    
+    tmp = mu_erf * mu_erf
+    do i = 1, ng_fit_jast
+      expo_gauss_j_mu_x_2(i) = tmp * expo_gauss_j_mu_x_2(i)
+    enddo
+
+  elseif(ng_fit_jast .eq. 8) then
+
+    coef_gauss_j_mu_x_2 = (/ 0.00942115d0  , 0.07332421d0  , 0.0508308d0   , 0.08204949d0  , 0.0404099d0   , 0.03201288d0  , 0.01911313d0  , 0.01114732d0   /)
+    expo_gauss_j_mu_x_2 = (/ 1.56957321d+04, 1.52867810d+01, 4.36016903d+01, 5.96818956d+00, 2.85535269d+00, 1.36064008d+02, 4.71968910d+02, 1.92022350d+03 /)
+    
+    tmp = mu_erf * mu_erf
+    do i = 1, ng_fit_jast
+      expo_gauss_j_mu_x_2(i) = tmp * expo_gauss_j_mu_x_2(i)
+    enddo
+
+  !elseif(ng_fit_jast .eq. 9) then
+
+  !  coef_gauss_j_mu_x_2 = (/  /)
+  !  expo_gauss_j_mu_x_2 = (/  /)
+  !  
+  !  tmp = mu_erf * mu_erf
+  !  do i = 1, ng_fit_jast
+  !    expo_gauss_j_mu_x_2(i) = tmp * expo_gauss_j_mu_x_2(i)
+  !  enddo
+
  elseif(ng_fit_jast .eq. 20) then

    ASSERT(n_max_fit_slat == 20)
@ -328,6 +388,36 @@ END_PROVIDER
      expo_gauss_j_mu_1_erf(i) = tmp * expo_gauss_j_mu_1_erf(i)
    enddo

+  elseif(ng_fit_jast .eq. 7) then
+
+    coef_gauss_j_mu_1_erf = (/ -0.11853067d0 , -0.01522824d0  , -0.07419098d0  , -0.022202d0    , -0.12242283d0  , -0.04177571d0  , -0.16983107d0  /)
+    expo_gauss_j_mu_1_erf = (/ 2.74057056d+00,  1.37626591d+04,  6.65578663d+01,  1.34693031d+03,  1.90547699d+01,  2.69445390d+02,  6.31845879d+00/)
+
+    tmp = mu_erf * mu_erf
+    do i = 1, ng_fit_jast
+      expo_gauss_j_mu_1_erf(i) = tmp * expo_gauss_j_mu_1_erf(i)
+    enddo
+
+  elseif(ng_fit_jast .eq. 8) then
+
+    coef_gauss_j_mu_1_erf = (/ -0.12263328d0 , -0.04965255d0 , -0.15463564d0 , -0.09675781d0 , -0.0807023d0  , -0.02923298d0 , -0.01381381d0 , -0.01675923d0  /)
+    expo_gauss_j_mu_1_erf = (/ 1.36101994d+01, 1.24908367d+02, 5.29061388d+00, 2.60692516d+00, 3.93396935d+01, 4.43071610d+02, 1.54902240d+04, 1.85170446d+03 /)
+
+    tmp = mu_erf * mu_erf
+    do i = 1, ng_fit_jast
+      expo_gauss_j_mu_1_erf(i) = tmp * expo_gauss_j_mu_1_erf(i)
+    enddo
+
+  !elseif(ng_fit_jast .eq. 9) then
+
+  !  coef_gauss_j_mu_1_erf = (/  /)
+  !  expo_gauss_j_mu_1_erf = (/  /)
+
+  !  tmp = mu_erf * mu_erf
+  !  do i = 1, ng_fit_jast
+  !    expo_gauss_j_mu_1_erf(i) = tmp * expo_gauss_j_mu_1_erf(i)
+  !  enddo
+
  elseif(ng_fit_jast .eq. 20) then

    ASSERT(n_max_fit_slat == 20)
--- a/src/ao_tc_eff_map/potential.irp.f
+++ b/src/ao_tc_eff_map/potential.irp.f
@ -1,5 +1,7 @@
+! ---
+
 BEGIN_PROVIDER [integer, n_gauss_eff_pot]
- implicit none
+
  BEGIN_DOC
  ! number of gaussians to represent the effective potential :
  !
@ -7,20 +9,31 @@ BEGIN_PROVIDER [integer, n_gauss_eff_pot]
  !
  ! Here (1 - erf(mu*r12))^2 is expanded in Gaussians as Eqs A11-A20 in JCP 154, 084119 (2021)
  END_DOC
- n_gauss_eff_pot = n_max_fit_slat + 1
+
+  implicit none
+
+  n_gauss_eff_pot = ng_fit_jast + 1
+
 END_PROVIDER 

+! ---
+
 BEGIN_PROVIDER [integer, n_gauss_eff_pot_deriv]
- implicit none
+
  BEGIN_DOC
  ! V(r12) = -(1 - erf(mu*r12))^2 is expanded in Gaussians as Eqs A11-A20 in JCP 154, 084119 (2021)
  END_DOC
- n_gauss_eff_pot_deriv = n_max_fit_slat 
+
+  implicit none
+  n_gauss_eff_pot_deriv = ng_fit_jast
+
 END_PROVIDER 

+! ---
+
 BEGIN_PROVIDER [double precision, expo_gauss_eff_pot, (n_gauss_eff_pot)]
 &BEGIN_PROVIDER [double precision, coef_gauss_eff_pot, (n_gauss_eff_pot)]
- implicit none
+
  BEGIN_DOC
  ! Coefficients and exponents of the Fit on Gaussians of V(X) = -(1 - erf(mu*X))^2 + 1/(\sqrt(pi)mu) * exp(-(mu*X)^2)
  !
@ -28,32 +41,39 @@ END_PROVIDER
  !
  ! Relies on the fit proposed in Eqs A11-A20 in JCP 154, 084119 (2021)
  END_DOC
+
  include 'constants.include.F'

+  implicit none
  integer :: i
+ 
  ! fit of the -0.25 * (1 - erf(mu*x))^2 with n_max_fit_slat gaussians 
- do i = 1, n_max_fit_slat
+  do i = 1, ng_fit_jast
   expo_gauss_eff_pot(i) = expo_gauss_1_erf_x_2(i) 
   coef_gauss_eff_pot(i) = -0.25d0 * coef_gauss_1_erf_x_2(i) ! -1/4 * (1 - erf(mu*x))^2
  enddo
+
  ! Analytical Gaussian part of the potential: + 1/(\sqrt(pi)mu) * exp(-(mu*x)^2) 
- expo_gauss_eff_pot(n_max_fit_slat+1) = mu_erf * mu_erf
- coef_gauss_eff_pot(n_max_fit_slat+1) =  1.d0 * mu_erf * inv_sq_pi
+  expo_gauss_eff_pot(ng_fit_jast+1) = mu_erf * mu_erf
+  coef_gauss_eff_pot(ng_fit_jast+1) =  1.d0 * mu_erf * inv_sq_pi

 END_PROVIDER 

+! ---

 double precision function eff_pot_gauss(x, mu)
- implicit none
+
  BEGIN_DOC
  ! V(mu,r12) = -0.25 * (1 - erf(mu*r12))^2 + 1/(\sqrt(pi)mu) * exp(-(mu*r12)^2)
  END_DOC
+
+  implicit none
  double precision, intent(in) :: x, mu
+
  eff_pot_gauss =  mu/dsqrt(dacos(-1.d0)) * dexp(-mu*mu*x*x) - 0.25d0 * (1.d0 - derf(mu*x))**2.d0
+
 end

-
-
 ! -------------------------------------------------------------------------------------------------
 ! ---

@ -129,12 +149,15 @@ END_PROVIDER
 ! ---

 double precision function fit_1_erf_x(x)
- implicit none
- double precision, intent(in) :: x
+
  BEGIN_DOC
  ! fit_1_erf_x(x) = \sum_i c_i exp (-alpha_i x^2) \approx (1 - erf(mu*x))
  END_DOC
+
+  implicit none
  integer :: i
+  double precision, intent(in) :: x
+
  fit_1_erf_x = 0.d0
  do i = 1, n_max_fit_slat
    fit_1_erf_x += dexp(-expo_gauss_1_erf_x(i) *x*x) * coef_gauss_1_erf_x(i)
@ -209,6 +232,36 @@ end
      expo_gauss_1_erf_x_2(i) = tmp * expo_gauss_1_erf_x_2(i)
    enddo

+  elseif(ng_fit_jast .eq. 7) then
+
+    coef_gauss_1_erf_x_2 = (/ 0.0213619d0   , 0.03221511d0  , 0.29966689d0  , 0.19178934d0  , 0.06154732d0  , 0.28214555d0  , 0.11125985d0   /)
+    expo_gauss_1_erf_x_2 = (/ 1.34727067d+04, 1.27166613d+03, 5.52584567d+00, 1.67753218d+01, 2.46145691d+02, 2.47971820d+00, 5.95141293d+01 /)
+
+    tmp = mu_erf * mu_erf
+    do i = 1, ng_fit_jast
+      expo_gauss_1_erf_x_2(i) = tmp * expo_gauss_1_erf_x_2(i)
+    enddo
+
+  elseif(ng_fit_jast .eq. 8) then
+
+    coef_gauss_1_erf_x_2 = (/ 0.28189124d0  , 0.19518669d0  , 0.12161735d0  , 0.24257438d0  , 0.07309656d0  , 0.042435d0    , 0.01926109d0  , 0.02393415d0   /)
+    expo_gauss_1_erf_x_2 = (/ 4.69795903d+00, 1.21379451d+01, 3.55527053d+01, 2.39227172d+00, 1.14827721d+02, 4.16320213d+02, 1.52813587d+04, 1.78516557d+03 /)
+
+    tmp = mu_erf * mu_erf
+    do i = 1, ng_fit_jast
+      expo_gauss_1_erf_x_2(i) = tmp * expo_gauss_1_erf_x_2(i)
+    enddo
+
+  !elseif(ng_fit_jast .eq. 9) then
+
+  !  coef_gauss_1_erf_x_2 = (/  /)
+  !  expo_gauss_1_erf_x_2 = (/  /)
+
+  !  tmp = mu_erf * mu_erf
+  !  do i = 1, ng_fit_jast
+  !    expo_gauss_1_erf_x_2(i) = tmp * expo_gauss_1_erf_x_2(i)
+  !  enddo
+
  elseif(ng_fit_jast .eq. 20) then

    ASSERT(n_max_fit_slat == 20)
--- a/src/bi_ort_ints/semi_num_ints_mo.irp.f
+++ b/src/bi_ort_ints/semi_num_ints_mo.irp.f
@ -107,14 +107,16 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_transp, (ao_num, ao_num, 3,
  integer          :: i, j, ipoint
  double precision :: wall0, wall1

+  print *, ' providing int2_grad1_u12_ao_transp ...'
  call wall_time(wall0)
+
  if(test_cycle_tc)then
   do ipoint = 1, n_points_final_grid
     do i = 1, ao_num
       do j = 1, ao_num
-         int2_grad1_u12_ao_transp(j,i,1,ipoint) = int2_grad1_u12_ao_test(1,j,i,ipoint)
-         int2_grad1_u12_ao_transp(j,i,2,ipoint) = int2_grad1_u12_ao_test(2,j,i,ipoint)
-         int2_grad1_u12_ao_transp(j,i,3,ipoint) = int2_grad1_u12_ao_test(3,j,i,ipoint)
+         int2_grad1_u12_ao_transp(j,i,1,ipoint) = int2_grad1_u12_ao_test(j,i,ipoint,1)
+         int2_grad1_u12_ao_transp(j,i,2,ipoint) = int2_grad1_u12_ao_test(j,i,ipoint,2)
+         int2_grad1_u12_ao_transp(j,i,3,ipoint) = int2_grad1_u12_ao_test(j,i,ipoint,3)
       enddo
     enddo
   enddo
@ -122,9 +124,9 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_transp, (ao_num, ao_num, 3,
   do ipoint = 1, n_points_final_grid
     do i = 1, ao_num
       do j = 1, ao_num
-         int2_grad1_u12_ao_transp(j,i,1,ipoint) = int2_grad1_u12_ao(1,j,i,ipoint)
-         int2_grad1_u12_ao_transp(j,i,2,ipoint) = int2_grad1_u12_ao(2,j,i,ipoint)
-         int2_grad1_u12_ao_transp(j,i,3,ipoint) = int2_grad1_u12_ao(3,j,i,ipoint)
+         int2_grad1_u12_ao_transp(j,i,1,ipoint) = int2_grad1_u12_ao(j,i,ipoint,1)
+         int2_grad1_u12_ao_transp(j,i,2,ipoint) = int2_grad1_u12_ao(j,i,ipoint,2)
+         int2_grad1_u12_ao_transp(j,i,3,ipoint) = int2_grad1_u12_ao(j,i,ipoint,3)
       enddo
     enddo
   enddo
@ -192,9 +194,9 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_t, (n_points_final_grid, 3,
  do ipoint = 1, n_points_final_grid
    do i = 1, ao_num
      do j = 1, ao_num
-        int2_grad1_u12_ao_t(ipoint,1,j,i) = int2_grad1_u12_ao(1,j,i,ipoint)
-        int2_grad1_u12_ao_t(ipoint,2,j,i) = int2_grad1_u12_ao(2,j,i,ipoint)
-        int2_grad1_u12_ao_t(ipoint,3,j,i) = int2_grad1_u12_ao(3,j,i,ipoint)
+        int2_grad1_u12_ao_t(ipoint,1,j,i) = int2_grad1_u12_ao(j,i,ipoint,1)
+        int2_grad1_u12_ao_t(ipoint,2,j,i) = int2_grad1_u12_ao(j,i,ipoint,2)
+        int2_grad1_u12_ao_t(ipoint,3,j,i) = int2_grad1_u12_ao(j,i,ipoint,3)
      enddo                                  
    enddo
  enddo
@ -203,40 +205,6 @@ END_PROVIDER

 ! ---

-BEGIN_PROVIDER [ double precision, int2_grad1_u12_bimo, (3, mo_num, mo_num, n_points_final_grid)]
-
-  BEGIN_DOC
-  !
-  ! int2_grad1_u12_bimo(:,k,i,ipoint) = \int dr2 [-1 * \grad_r1 J(r1,r2)] \chi_k(r2) \phi_i(r2) 
-  !
-  END_DOC
-
-  implicit none
-  integer :: ipoint
-  print*,'Wrong !!'
-  stop
- !$OMP PARALLEL         &
- !$OMP DEFAULT (NONE)   &
- !$OMP PRIVATE (ipoint) & 
- !$OMP SHARED (n_points_final_grid,int2_grad1_u12_ao,int2_grad1_u12_bimo)
- !$OMP DO SCHEDULE (dynamic)
-  do ipoint = 1, n_points_final_grid
-
-    call ao_to_mo_bi_ortho( int2_grad1_u12_ao  (1,1,1,ipoint), size(int2_grad1_u12_ao  , 2) &
-                          , int2_grad1_u12_bimo(1,1,1,ipoint), size(int2_grad1_u12_bimo, 2) )
-    call ao_to_mo_bi_ortho( int2_grad1_u12_ao  (2,1,1,ipoint), size(int2_grad1_u12_ao  , 2) &
-                          , int2_grad1_u12_bimo(2,1,1,ipoint), size(int2_grad1_u12_bimo, 2) )
-    call ao_to_mo_bi_ortho( int2_grad1_u12_ao  (3,1,1,ipoint), size(int2_grad1_u12_ao  , 2) &
-                          , int2_grad1_u12_bimo(3,1,1,ipoint), size(int2_grad1_u12_bimo, 2) )
-
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-END_PROVIDER 
-
-! ---
-
 BEGIN_PROVIDER [ double precision, mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp, (n_points_final_grid, 3, mo_num, mo_num)]

  implicit none
--- a/src/bi_ortho_mos/bi_density.irp.f
+++ b/src/bi_ortho_mos/bi_density.irp.f
@ -2,49 +2,68 @@
 ! ---

 BEGIN_PROVIDER [double precision, TCSCF_bi_ort_dm_ao_alpha, (ao_num, ao_num) ]
-  implicit none
+
  BEGIN_DOC
  ! TCSCF_bi_ort_dm_ao_alpha(i,j) = <Chi_0| a^dagger_i,alpha a_j,alpha |Phi_0> where i,j are AO basis. 
  !
  ! This is the equivalent of the alpha density of the HF Slater determinant, but with a couple of bi-orthonormal Slater determinant |Chi_0> and |Phi_0>
  END_DOC
+
+  implicit none
+
+  PROVIDE mo_l_coef mo_r_coef
+
  call dgemm( 'N', 'T', ao_num, ao_num, elec_alpha_num, 1.d0               &
            , mo_l_coef, size(mo_l_coef, 1), mo_r_coef, size(mo_r_coef, 1) &
            !, mo_r_coef, size(mo_r_coef, 1), mo_l_coef, size(mo_l_coef, 1) &
            , 0.d0, TCSCF_bi_ort_dm_ao_alpha, size(TCSCF_bi_ort_dm_ao_alpha, 1) )
+
 END_PROVIDER

 ! ---

 BEGIN_PROVIDER [ double precision, TCSCF_bi_ort_dm_ao_beta, (ao_num, ao_num) ]
-  implicit none
+
  BEGIN_DOC
  ! TCSCF_bi_ort_dm_ao_beta(i,j) = <Chi_0| a^dagger_i,beta a_j,beta |Phi_0> where i,j are AO basis. 
  !
  ! This is the equivalent of the beta density of the HF Slater determinant, but with a couple of bi-orthonormal Slater determinant |Chi_0> and |Phi_0>
  END_DOC
+
+  implicit none
+
+  PROVIDE mo_l_coef mo_r_coef
+
  call dgemm( 'N', 'T', ao_num, ao_num, elec_beta_num, 1.d0                &
            , mo_l_coef, size(mo_l_coef, 1), mo_r_coef, size(mo_r_coef, 1) &
            !, mo_r_coef, size(mo_r_coef, 1), mo_l_coef, size(mo_l_coef, 1) &
            , 0.d0, TCSCF_bi_ort_dm_ao_beta, size(TCSCF_bi_ort_dm_ao_beta, 1) )
+
 END_PROVIDER

 ! ---

 BEGIN_PROVIDER [ double precision, TCSCF_bi_ort_dm_ao, (ao_num, ao_num) ]
-  implicit none
+
  BEGIN_DOC
  ! TCSCF_bi_ort_dm_ao(i,j) = <Chi_0| a^dagger_i,beta+alpha a_j,beta+alpha |Phi_0> where i,j are AO basis. 
  !
  ! This is the equivalent of the total electronic density of the HF Slater determinant, but with a couple of bi-orthonormal Slater determinant |Chi_0> and |Phi_0>
  END_DOC
+
+  implicit none
+
+  PROVIDE mo_l_coef mo_r_coef
+
  ASSERT(size(TCSCF_bi_ort_dm_ao, 1) == size(TCSCF_bi_ort_dm_ao_alpha, 1))
+
  if(elec_alpha_num==elec_beta_num) then
    TCSCF_bi_ort_dm_ao = TCSCF_bi_ort_dm_ao_alpha + TCSCF_bi_ort_dm_ao_alpha
  else
    ASSERT(size(TCSCF_bi_ort_dm_ao, 1) == size(TCSCF_bi_ort_dm_ao_beta, 1))
    TCSCF_bi_ort_dm_ao = TCSCF_bi_ort_dm_ao_alpha + TCSCF_bi_ort_dm_ao_beta
  endif
+
 END_PROVIDER

 ! ---
--- a/src/csf/cfgCI_utils.c
+++ b/src/csf/cfgCI_utils.c
@ -253,9 +253,9 @@ void generateAllBFs(int64_t Isomo, int64_t MS, Tree *bftree, int *NBF, int *NSOM
    buildTreeDriver(bftree, *NSOMO, MS, NBF);
 }

-void ortho_qr_csf(double *overlapMatrix, int lda, double *orthoMatrix, int rows, int cols);
+//void ortho_qr_csf(double *overlapMatrix, int lda, double *orthoMatrix, int rows, int cols);
+

-// QR to orthogonalize CSFs does not work
 //void gramSchmidt_qp(double *overlapMatrix, int rows, int cols, double *orthoMatrix){
 //  int i,j;
 //  //for(j=0;j<cols;++j){
--- a/src/csf/configuration_CI_sigma_helpers.irp.f
+++ b/src/csf/configuration_CI_sigma_helpers.irp.f
--- a/src/csf/conversion.irp.f
+++ b/src/csf/conversion.irp.f
@ -114,6 +114,7 @@ subroutine convertWFfromCSFtoDET(N_st,psi_coef_cfg_in, psi_coef_det)
  integer                        :: idx
  integer MS
  MS = elec_alpha_num-elec_beta_num
+  !print *,"size=",size(tmp_psi_coef_det,1)," ",size(tmp_psi_coef_det,2)

  countcsf = 0

--- a/src/csf/obtain_I_foralpha.irp.f
+++ b/src/csf/obtain_I_foralpha.irp.f
@ -38,6 +38,7 @@ subroutine obtain_connected_J_givenI(idxI, givenI, connectedI, idxs_connectedI,
  integer :: holetype(mo_num)
  integer :: end_index
  integer :: Nsomo_I
+  integer :: listall(N_int*bit_kind_size), nelall

  ! 
  ! 2 2 1 1 0 0 : 1 1 0 0 0 0
@ -65,9 +66,12 @@ subroutine obtain_connected_J_givenI(idxI, givenI, connectedI, idxs_connectedI,

  ! Since CFGs are sorted wrt to seniority
  ! we don't have to search the full CFG list
-  Isomo = givenI(1,1)
-  Idomo = givenI(1,2)
-  Nsomo_I = POPCNT(Isomo)
+  Nsomo_I = 0
+  do i=1,N_int
+    Isomo = givenI(i,1)
+    Idomo = givenI(i,2)
+    Nsomo_I += POPCNT(Isomo)
+  end do
  end_index = min(N_configuration,cfg_seniority_index(min(Nsomo_I+6,elec_num))-1)
  if(end_index .LT. 0) end_index= N_configuration
  !end_index = N_configuration
@ -83,17 +87,24 @@ subroutine obtain_connected_J_givenI(idxI, givenI, connectedI, idxs_connectedI,
     !  idxs_connectedI(nconnectedI)=i
     !  cycle
     !endif
-     Isomo = givenI(1,1)
-     Idomo = givenI(1,2)
-     Jsomo = psi_configuration(1,1,i)
-     Jdomo = psi_configuration(1,2,i)
+
+     ndiffSOMO = 0
+     ndiffDOMO = 0
+     nxordiffSOMODOMO = 0
+     do ii=1,N_int
+       Isomo = givenI(ii,1)
+       Idomo = givenI(ii,2)
+       Jsomo = psi_configuration(ii,1,i)
+       Jdomo = psi_configuration(ii,2,i)
       diffSOMO = IEOR(Isomo,Jsomo)
-     ndiffSOMO = POPCNT(diffSOMO)
+       ndiffSOMO += POPCNT(diffSOMO)
       diffDOMO = IEOR(Idomo,Jdomo)
       xordiffSOMODOMO = IEOR(diffSOMO,diffDOMO)
-     ndiffDOMO = POPCNT(diffDOMO)
-     nxordiffSOMODOMO = POPCNT(xordiffSOMODOMO)
-     nxordiffSOMODOMO += ndiffSOMO + ndiffDOMO 
+       ndiffDOMO += POPCNT(diffDOMO)
+       nxordiffSOMODOMO += POPCNT(xordiffSOMODOMO)
+       nxordiffSOMODOMO += POPCNT(diffSOMO) + POPCNT(diffDOMO)
+     end do
+
     if((nxordiffSOMODOMO .EQ. 4) .AND. ndiffSOMO .EQ. 2) then
       !-------
       ! MONO |
@ -144,25 +155,45 @@ subroutine obtain_connected_J_givenI(idxI, givenI, connectedI, idxs_connectedI,
        ! find out all pq holes possible
        nholes = 0
        ! holes in SOMO
-        Isomo = psi_configuration(1,1,i)
-        Idomo = psi_configuration(1,2,i)
-        do iii = 1,n_act_orb
-          ii = list_act(iii)
-           if(POPCNT(IAND(Isomo,IBSET(0_8,ii-1))) .EQ. 1) then
+        !Isomo = psi_configuration(1,1,i)
+        !Idomo = psi_configuration(1,2,i)
+        !do iii = 1,n_act_orb
+        !  ii = list_act(iii)
+        !   if(POPCNT(IAND(Isomo,IBSET(0_8,ii-1))) .EQ. 1) then
+        !      nholes += 1
+        !      listholes(nholes) = ii
+        !      holetype(nholes) = 1
+        !   endif
+        !end do
+       
+        call bitstring_to_list(psi_configuration(1,1,i),listall,nelall,N_int)
+
+        do iii=1,nelall
          nholes += 1
-              listholes(nholes) = ii
+          listholes(nholes) = listall(iii)
          holetype(nholes) = 1
-           endif
        end do
+
        ! holes in DOMO
-        do iii = 1,n_act_orb
-          ii = list_act(iii)
-           if(POPCNT(IAND(Idomo,IBSET(0_8,ii-1))) .EQ. 1) then
+        !do iii = 1,n_act_orb
+        !  ii = list_act(iii)
+        !   if(POPCNT(IAND(Idomo,IBSET(0_8,ii-1))) .EQ. 1) then
+        !      nholes += 1
+        !      listholes(nholes) = ii
+        !      holetype(nholes) = 2
+        !   endif
+        !end do
+        
+        call bitstring_to_list(psi_configuration(1,2,i),listall,nelall,N_int)
+
+        do iii=1,nelall
+          if(listall(iii) .gt. n_core_orb)then
            nholes += 1
-              listholes(nholes) = ii
+            listholes(nholes) = listall(iii)
            holetype(nholes) = 2
          endif
        end do
+
        ntotalconnectedI += max(1,(psi_config_data(i,2)-psi_config_data(i,1)+1)*nholes)
     endif
  end do
@ -199,6 +230,8 @@ subroutine obtain_connected_I_foralpha(idxI, Ialpha, connectedI, idxs_connectedI
  integer*8                                :: Isomo
  integer*8                                :: Jdomo
  integer*8                                :: Jsomo
+  integer(bit_kind)                        :: Jcfg(N_int,2)
+  integer(bit_kind)                        :: Icfg(N_int,2)
  integer*8                                :: IJsomo
  integer*8                                :: diffSOMO
  integer*8                                :: diffDOMO
@ -209,132 +242,261 @@ subroutine obtain_connected_I_foralpha(idxI, Ialpha, connectedI, idxs_connectedI
  integer                        :: iii,ii,i,j,k,l,p,q,nsomoJ,nsomoalpha,starti,endi,extyp,nholes
  integer                        :: listholes(mo_num)
  integer                        :: holetype(mo_num)
-  integer                        :: end_index
-  integer                        :: Nsomo_alpha
+  integer                        :: end_index, ishift
+  integer                        :: Nsomo_alpha, pp,qq, nperm, iint, ipos
  integer*8                      :: MS
+  integer                        :: exc(0:2,2,2), tz, m, n, high, low
+  integer                        :: listall(N_int*bit_kind_size), nelall
+  integer                        :: nconnectedExtradiag, nconnectedDiag
+  integer(bit_kind)              :: hole, particle, tmp
  MS = elec_alpha_num-elec_beta_num

+  nconnectedExtradiag=0
+  nconnectedDiag=0
  nconnectedI = 0
  end_index = N_configuration

  ! Since CFGs are sorted wrt to seniority
  ! we don't have to search the full CFG list
-  Isomo = Ialpha(1,1)
-  Idomo = Ialpha(1,2)
-  Nsomo_alpha = POPCNT(Isomo)
+  !Isomo = Ialpha(1,1)
+  !Idomo = Ialpha(1,2)
+  !Nsomo_alpha = POPCNT(Isomo)
+  Icfg = Ialpha
+  Nsomo_alpha = 0
+  !print *," Ialpha="
+  do ii=1,N_int
+    Isomo = Ialpha(ii,1)
+    Idomo = Ialpha(ii,2)
+    Nsomo_alpha += POPCNT(Isomo)
+    !print *,Isomo, Idomo, "Nsomo=",Nsomo_alpha
+  end do
  end_index = min(N_configuration,cfg_seniority_index(min(Nsomo_alpha+4,elec_num))-1)
-  if(end_index .LT. 0) end_index= N_configuration
+  if(end_index .LT. 0 .OR. end_index .lt. idxI) end_index= N_configuration
  end_index = N_configuration


  p = 0
  q = 0
-  if (N_int > 1) stop 'obtain_connected_i_foralpha : N_int > 1'
+  !if (N_int > 1) stop 'obtain_connected_i_foralpha : N_int > 1'
  do i=idxI,end_index
-     Isomo = Ialpha(1,1)
-     Idomo = Ialpha(1,2)
-     Jsomo = psi_configuration(1,1,i)
-     Jdomo = psi_configuration(1,2,i)
     ! Check for Minimal alpha electrons (MS)
-     if(POPCNT(Isomo).lt.MS)then
+     if(Nsomo_alpha .lt. MS)then
       cycle
     endif
+
+     ndiffSOMO = 0
+     ndiffDOMO = 0
+     nxordiffSOMODOMO = 0
+     nsomoJ=0
+     nsomoalpha=0
+     do ii=1,N_int
+       Isomo = Ialpha(ii,1)
+       Idomo = Ialpha(ii,2)
+       Jsomo = psi_configuration(ii,1,i)
+       Jdomo = psi_configuration(ii,2,i)
+       nsomoJ += POPCNT(Jsomo)
+       nsomoalpha += POPCNT(Isomo)
       diffSOMO = IEOR(Isomo,Jsomo)
-     ndiffSOMO = POPCNT(diffSOMO)
-     !if(idxI.eq.1)then
-     !  print *," \t idxI=",i," diffS=",ndiffSOMO," popJs=", POPCNT(Jsomo)," popIs=",POPCNT(Isomo)
-     !endif
+       ndiffSOMO += POPCNT(diffSOMO)
       diffDOMO = IEOR(Idomo,Jdomo)
       xordiffSOMODOMO = IEOR(diffSOMO,diffDOMO)
-     ndiffDOMO = POPCNT(diffDOMO)
-     nxordiffSOMODOMO = POPCNT(xordiffSOMODOMO)
-     nxordiffSOMODOMO += ndiffSOMO + ndiffDOMO 
+       ndiffDOMO += POPCNT(diffDOMO)
+       nxordiffSOMODOMO += POPCNT(xordiffSOMODOMO)
+       nxordiffSOMODOMO += POPCNT(diffSOMO) + POPCNT(diffDOMO)
+     end do
+     !if(idxI.eq.218)then
+     !  print *,"I=",idxI,"Nsomo_alpha=",Nsomo_alpha,"nxordiffSOMODOMO(4)=",nxordiffSOMODOMO, " ndiffSOMO(2)=",ndiffSOMO, " ndiffDOMO=",ndiffDOMO
+     !endif
+     !Jcfg = psi_configuration(:,:,i)
+     !print *,"nxordiffSOMODOMO(4)=",nxordiffSOMODOMO, " ndiffSOMO(2)=",ndiffSOMO
+
     if((nxordiffSOMODOMO .EQ. 4) .AND. ndiffSOMO .EQ. 2) then
        select case(ndiffDOMO)
        case (0)
           ! SOMO -> VMO
           !print *,"obt SOMO -> VMO"
           extyp = 3
+           !if(N_int .eq. 1) then
+           !  IJsomo = IEOR(Isomo, Jsomo)
+           !  p = TRAILZ(IAND(Isomo,IJsomo)) + 1
+           !  IJsomo = IBCLR(IJsomo,p-1)
+           !  q = TRAILZ(IJsomo) + 1
+           !  !print *," p=",p," q=",q
+           !  !call get_single_excitation_cfg(Jcfg, Icfg, p, q, N_int)
+           !else
+             ! Find p
+             do ii=1,N_int
+               Isomo = Ialpha(ii,1)
+               Jsomo = psi_configuration(ii,1,i)
               IJsomo = IEOR(Isomo, Jsomo)
-!IRP_IF WITHOUT_TRAILZ
-!           p = (popcnt(ieor( IAND(Isomo,IJsomo) , IAND(Isomo,IJsomo) -1))-1) + 1
-!IRP_ELSE
-           p = TRAILZ(IAND(Isomo,IJsomo)) + 1
-!IRP_ENDIF
-           IJsomo = IBCLR(IJsomo,p-1)
-!IRP_IF WITHOUT_TRAILZ
-!           q = (popcnt(ieor(IJsomo,IJsomo-1))-1) + 1
-!IRP_ELSE
-           q = TRAILZ(IJsomo) + 1
-!IRP_ENDIF
+               if(popcnt(IAND(Isomo,IJsomo)) > 0)then
+                 p = TRAILZ(IAND(Isomo,IJsomo)) + 1 + (ii-1) * bit_kind_size
+                 EXIT
+               endif
+             end do
+             ! Find q
+             do ii=1,N_int
+               Isomo = Ialpha(ii,1)
+               Jsomo = psi_configuration(ii,1,i)
+               IJsomo = IEOR(Isomo, Jsomo)
+               iint = shiftr(p-1,bit_kind_shift) + 1
+               ipos = p-shiftl((iint-1),bit_kind_shift)
+               if(iint .eq. ii)then
+                 IJsomo = IBCLR(IJsomo,ipos-1)
+               endif
+               if(popcnt(IJsomo) > 0)then
+                 q = TRAILZ(IJsomo) + 1 + (ii-1) * bit_kind_size
+                 EXIT
+               endif
+             enddo
+           !endif
+           !assert ( p == pp)
+           !assert ( q == qq)
+           !print *," 1--- p=",p," q=",q
        case (1)
           ! DOMO -> VMO
           ! or
           ! SOMO -> SOMO
-           nsomoJ = POPCNT(Jsomo)
-           nsomoalpha = POPCNT(Isomo)
           if(nsomoJ .GT. nsomoalpha) then
              ! DOMO -> VMO
              !print *,"obt DOMO -> VMO"
              extyp = 2
-!IRP_IF WITHOUT_TRAILZ
-!              p = (popcnt(ieor( IEOR(Idomo,Jdomo),IEOR(Idomo,Jdomo) -1))-1) + 1
-!IRP_ELSE
-              p = TRAILZ(IEOR(Idomo,Jdomo)) + 1
-!IRP_ENDIF
-              Isomo = IEOR(Isomo, Jsomo)
-              Isomo = IBCLR(Isomo,p-1)
-!IRP_IF WITHOUT_TRAILZ
-!              q = (popcnt(ieor(Isomo,Isomo-1))-1) + 1
-!IRP_ELSE
-              q = TRAILZ(Isomo) + 1
-!IRP_ENDIF
+              !if(N_int.eq.1)then
+              !  p = TRAILZ(IEOR(Idomo,Jdomo)) + 1
+              !  Isomo = IEOR(Isomo, Jsomo)
+              !  Isomo = IBCLR(Isomo,p-1)
+              !  q = TRAILZ(Isomo) + 1
+              !else
+
+                ! Find p
+                do ii=1,N_int
+                  Isomo = Ialpha(ii,1)
+                  Jsomo = psi_configuration(ii,1,i)
+                  Idomo = Ialpha(ii,2)
+                  Jdomo = psi_configuration(ii,2,i)
+                  if(popcnt(IEOR(Idomo,Jdomo)) > 0)then
+                    p = TRAILZ(IEOR(Idomo,Jdomo)) + 1 + (ii-1) * bit_kind_size
+                    EXIT
+                  endif
+                end do
+                ! Find q
+                do ii=1,N_int
+                  Isomo = Ialpha(ii,1)
+                  Jsomo = psi_configuration(ii,1,i)
+                  IJsomo = IEOR(Isomo, Jsomo)
+                  iint = shiftr(p-1,bit_kind_shift) + 1
+                  ipos = p-shiftl((iint-1),bit_kind_shift)
+                  if(iint .eq. ii)then
+                    IJsomo = IBCLR(IJsomo,ipos-1)
+                  endif
+                  if(popcnt(IJsomo) > 0)then
+                    q = TRAILZ(IJsomo) + 1 + (ii-1) * bit_kind_size
+                    EXIT
+                  endif
+                end do
+              !endif
+           !assert ( p == pp)
+           !assert ( q == qq)
           else
              ! SOMO -> SOMO
              !print *,"obt SOMO -> SOMO"
              extyp = 1
-!IRP_IF WITHOUT_TRAILZ
-!              q = (popcnt(ieor( IEOR(Idomo,Jdomo), IEOR(Idomo,Jdomo)-1))-1) + 1
-!IRP_ELSE
-              q = TRAILZ(IEOR(Idomo,Jdomo)) + 1
-!IRP_ENDIF
-              Isomo = IEOR(Isomo, Jsomo)
-              Isomo = IBCLR(Isomo,q-1)
-!IRP_IF WITHOUT_TRAILZ
-!              p = (popcnt(ieor(Isomo,Isomo-1))-1) + 1
-!IRP_ELSE
-              p = TRAILZ(Isomo) + 1
-!IRP_ENDIF
-              ! Check for Minimal alpha electrons (MS)
-              !if(POPCNT(Isomo).lt.MS)then
-              !  cycle
-              !endif
+              !if(N_int.eq.1)then
+              !  q = TRAILZ(IEOR(Idomo,Jdomo)) + 1
+              !  Isomo = IEOR(Isomo, Jsomo)
+              !  Isomo = IBCLR(Isomo,q-1)
+              !  p = TRAILZ(Isomo) + 1
+              !  ! Check for Minimal alpha electrons (MS)
+              !  !if(POPCNT(Isomo).lt.MS)then
+              !  !  cycle
+              !  !endif
+              !else
+                ! Find p
+                !print *,"Ialpha somo=",Ialpha(1,1), Ialpha(2,1)," Ialpha domo=",Ialpha(1,2), Ialpha(2,2)
+                !print *,"J somo=",psi_configuration(1,1,i), psi_configuration(2,1,i)," J domo=",psi_configuration(1,2,i),&
+                !psi_configuration(2,2,i)
+                do ii=1,N_int
+                  Isomo = Ialpha(ii,1)
+                  Jsomo = psi_configuration(ii,1,i)
+                  Idomo = Ialpha(ii,2)
+                  Jdomo = psi_configuration(ii,2,i)
+                  if(popcnt(IEOR(Idomo,Jdomo)) > 0)then
+                    q = TRAILZ(IEOR(Idomo,Jdomo)) + 1 + (ii-1) * bit_kind_size
+                    EXIT
                  endif
+                enddo
+                ! Find q
+                do ii=1,N_int
+                  Isomo = Ialpha(ii,1)
+                  Jsomo = psi_configuration(ii,1,i)
+                  IJsomo = IEOR(Isomo, Jsomo)
+                  iint = shiftr(q-1,bit_kind_shift) + 1
+                  ipos = q-shiftl((iint-1),bit_kind_shift)
+                  if(iint .eq. ii)then
+                    IJsomo = IBCLR(IJsomo,ipos-1)
+                  endif
+                  !print *,"ii=",ii," Isomo=",Isomo
+                  if(popcnt(IJsomo) > 0)then
+                    p = TRAILZ(IJsomo) + 1 + (ii-1) * bit_kind_size
+                    EXIT
+                  endif
+                enddo
+              !endif
+           !assert ( p == pp)
+           !assert ( q == qq)
+           endif
+           !print *," 2--- p=",p," q=",q
        case (2)
           ! DOMO -> SOMO
           !print *,"obt DOMO -> SOMO"
           extyp = 4
+           !if(N_int.eq.1)then
+           !  IJsomo = IEOR(Isomo, Jsomo)
+           !  p = TRAILZ(IAND(Jsomo,IJsomo)) + 1
+           !  IJsomo = IBCLR(IJsomo,p-1)
+           !  q = TRAILZ(IJsomo) + 1
+           !else
+             ! Find p
+             do ii=1,N_int
+               Isomo = Ialpha(ii,1)
+               Jsomo = psi_configuration(ii,1,i)
+               Idomo = Ialpha(ii,2)
+               Jdomo = psi_configuration(ii,2,i)
               IJsomo = IEOR(Isomo, Jsomo)
-!IRP_IF WITHOUT_TRAILZ
-!           p = (popcnt(ieor( IAND(Jsomo,IJsomo), IAND(Jsomo,IJsomo)-1))-1) + 1
-!IRP_ELSE
-           p = TRAILZ(IAND(Jsomo,IJsomo)) + 1
-!IRP_ENDIF
-           IJsomo = IBCLR(IJsomo,p-1)
-!IRP_IF WITHOUT_TRAILZ
-!           q = (popcnt(ieor( IJsomo , IJsomo -1))-1) + 1
-!IRP_ELSE
-           q = TRAILZ(IJsomo) + 1
-!IRP_ENDIF
+               if(popcnt(IAND(Jsomo,IJsomo)) > 0)then
+                 p = TRAILZ(IAND(Jsomo,IJsomo)) + 1 + (ii-1) * bit_kind_size
+                 EXIT
+               endif
+             enddo
+             ! Find q
+             do ii=1,N_int
+               Isomo = Ialpha(ii,1)
+               Jsomo = psi_configuration(ii,1,i)
+               IJsomo = IEOR(Isomo, Jsomo)
+               iint = shiftr(p-1,bit_kind_shift) + 1
+               ipos = p-shiftl((iint-1),bit_kind_shift)
+               if(iint .eq. ii)then
+                 IJsomo = IBCLR(IJsomo,ipos-1)
+               endif
+               if(popcnt(IJsomo) > 0)then
+                 q = TRAILZ(IJsomo) + 1 + (ii-1) * bit_kind_size
+                 EXIT
+               endif
+             enddo
+           !endif
+           !assert ( p == pp)
+           !assert ( q == qq)
+           !print *," 3--- p=",p," q=",q
        case default
           print *,"something went wront in get connectedI"
        end select
        starti = psi_config_data(i,1)
        endi   = psi_config_data(i,2)
+        nconnectedExtradiag+=1
        nconnectedI += 1
-        do k=1,N_int
-          connectedI(k,1,nconnectedI) = psi_configuration(k,1,i)
-          connectedI(k,2,nconnectedI) = psi_configuration(k,2,i)
+        do ii=1,N_int
+          connectedI(ii,1,nconnectedI) = psi_configuration(ii,1,i)
+          connectedI(ii,2,nconnectedI) = psi_configuration(ii,2,i)
        enddo
        idxs_connectedI(nconnectedI)=starti
        excitationIds(1,nconnectedI)=p
@ -343,28 +505,51 @@ subroutine obtain_connected_I_foralpha(idxI, Ialpha, connectedI, idxs_connectedI
        diagfactors(nconnectedI) = 1.0d0
     else if((ndiffSOMO + ndiffDOMO) .EQ. 0) then
        ! find out all pq holes possible
+        !print *,"I = ",i
+        !print *,"I somo= ",psi_configuration(1,1,i), " domo=", psi_configuration(1,2,i)
+        !print *,"alp somo= ",Ialpha(1,1), " domo=", Ialpha(1,2)
        nholes = 0
        ! holes in SOMO
-        Isomo = psi_configuration(1,1,i)
-        Idomo = psi_configuration(1,2,i)
-        do iii = 1,n_act_orb
-          ii = list_act(iii)
-           if(POPCNT(IAND(Isomo,IBSET(0_8,ii-1))) .EQ. 1) then
+        !Isomo = psi_configuration(1,1,i)
+        !Idomo = psi_configuration(1,2,i)
+        !do iii = 1,n_act_orb
+        !  ii = list_act(iii)
+        !   if(POPCNT(IAND(Isomo,IBSET(0_8,ii-1))) .EQ. 1) then
+        !      nholes += 1
+        !      listholes(nholes) = ii
+        !      holetype(nholes) = 1
+        !   endif
+        !end do
+        call bitstring_to_list(psi_configuration(1,1,i),listall,nelall,N_int)
+
+        do iii=1,nelall
          nholes += 1
-              listholes(nholes) = ii
+          listholes(nholes) = listall(iii)
          holetype(nholes) = 1
-           endif
        end do
+
        ! holes in DOMO
-        do iii = 1,n_act_orb
-          ii = list_act(iii)
-           if(POPCNT(IAND(Idomo,IBSET(0_8,ii-1))) .EQ. 1) then
+        !do iii = 1,n_act_orb
+        !  ii = list_act(iii)
+        !   if(POPCNT(IAND(Idomo,IBSET(0_8,ii-1))) .EQ. 1) then
+        !      nholes += 1
+        !      listholes(nholes) = ii
+        !      holetype(nholes) = 2
+        !   endif
+        !end do
+        nelall=0
+        listall=0
+        call bitstring_to_list(psi_configuration(1,2,i),listall,nelall,N_int)
+
+        do iii=1,nelall
+          if(listall(iii) .gt. n_core_orb)then
            nholes += 1
-              listholes(nholes) = ii
+            listholes(nholes) = listall(iii)
            holetype(nholes) = 2
          endif
        end do

+
        do k=1,nholes
           p = listholes(k)
           q = p
@ -372,6 +557,7 @@ subroutine obtain_connected_I_foralpha(idxI, Ialpha, connectedI, idxs_connectedI
           if(holetype(k) .EQ. 1) then
              starti = psi_config_data(i,1)
              endi   = psi_config_data(i,2)
+              nconnectedDiag+=1
              nconnectedI += 1
              connectedI(:,:,nconnectedI) = psi_configuration(:,:,i)
              idxs_connectedI(nconnectedI)=starti
@ -382,6 +568,7 @@ subroutine obtain_connected_I_foralpha(idxI, Ialpha, connectedI, idxs_connectedI
           else
              starti = psi_config_data(i,1)
              endi   = psi_config_data(i,2)
+              nconnectedDiag+=1
              nconnectedI += 1
              connectedI(:,:,nconnectedI) = psi_configuration(:,:,i)
              idxs_connectedI(nconnectedI)=starti
@ -390,8 +577,10 @@ subroutine obtain_connected_I_foralpha(idxI, Ialpha, connectedI, idxs_connectedI
              excitationTypes(nconnectedI) = extyp
              diagfactors(nconnectedI) = 2.0d0
           endif
+           !print *,excitationIds(1,nconnectedI), excitationIds(2,nconnectedI)
        enddo
     endif
  end do
+  !print *,"nconnectedExtradiag=",nconnectedExtradiag," nconnectedDiad=",nconnectedDiag

 end subroutine obtain_connected_I_foralpha
--- a/src/csf/sigma_vector.irp.f
+++ b/src/csf/sigma_vector.irp.f
@ -146,7 +146,6 @@
    ncfgprev = cfg_seniority_index(i+2)
  end do
  !print *," ^^^^^ N_CSF = ",n_CSF," N_CFG=",N_configuration
-
 END_PROVIDER


@ -832,7 +831,7 @@ subroutine calculate_preconditioner_cfg(diag_energies)
  ! the configurations in psi_configuration
  ! returns : diag_energies :
  END_DOC
-  integer :: i,j,k,kk,l,p,q,noccp,noccq, ii, jj
+  integer :: i,j,k,kk,l,p,q,noccp,noccq, ii, jj, iii
  real*8,intent(out) :: diag_energies(n_CSF)
  integer                            :: nholes
  integer                            :: nvmos
@ -858,8 +857,8 @@ subroutine calculate_preconditioner_cfg(diag_energies)
  real*8, external  :: mo_two_e_integral
  real*8            :: hpp
  real*8            :: meCC
-  real*8            :: ecore
  real*8            :: core_act_contrib
+  integer                        :: listall(N_int*bit_kind_size), nelall

  !PROVIDE h_core_ri
  PROVIDE core_fock_operator
@ -869,7 +868,6 @@ subroutine calculate_preconditioner_cfg(diag_energies)
  !print *,"Core energy=",core_energy," nucler rep=",nuclear_repulsion, " n_core_orb=",n_core_orb," n_act_orb=",n_act_orb," mo_num=",mo_num

  ! calculate core energy
-  !call get_core_energy(ecore)
  diag_energies = core_energy - nuclear_repulsion

  ! calculate the core energy
@ -877,11 +875,11 @@ subroutine calculate_preconditioner_cfg(diag_energies)

  do i=1,N_configuration

-     Isomo = psi_configuration(1,1,i)
-     Idomo = psi_configuration(1,2,i)
-     Icfg(1,1) = psi_configuration(1,1,i)
-     Icfg(1,2) = psi_configuration(1,2,i)
-     NSOMOI = getNSOMO(psi_configuration(:,:,i))
+     !Isomo = psi_configuration(1,1,i)
+     !Idomo = psi_configuration(1,2,i)
+     !Icfg(1,1) = psi_configuration(1,1,i)
+     !Icfg(1,2) = psi_configuration(1,2,i)
+     !NSOMOI = getNSOMO(psi_configuration(:,:,i))

     starti = psi_config_data(i,1)
     endi   = psi_config_data(i,2)
@ -890,48 +888,63 @@ subroutine calculate_preconditioner_cfg(diag_energies)

     ! find out all pq holes possible
     nholes = 0
+     listholes = -1
     ! holes in SOMO
-     !do k = 1,mo_num
-     do kk = 1,n_act_orb
-       k = list_act(kk)
-        if(POPCNT(IAND(Isomo,IBSET(0_8,k-1))) .EQ. 1) then
+     !do kk = 1,n_act_orb
+     !  k = list_act(kk)
+     !   if(POPCNT(IAND(Isomo,IBSET(0_8,k-1))) .EQ. 1) then
+     !      nholes += 1
+     !      listholes(nholes) = k
+     !      holetype(nholes) = 1
+     !   endif
+     !enddo
+     call bitstring_to_list(psi_configuration(1,1,i),listall,nelall,N_int)
+
+     do iii=1,nelall
       nholes += 1
-           listholes(nholes) = k
+       listholes(nholes) = listall(iii)
       holetype(nholes) = 1
-        endif
     end do
+
     ! holes in DOMO
-     !do k = n_core_orb+1,n_core_orb + n_act_orb
-     !do k = 1+n_core_inact_orb,n_core_orb+n_core_inact_act_orb
-     !do k = 1,mo_num
-     do kk = 1,n_act_orb
-       k = list_act(kk)
-        if(POPCNT(IAND(Idomo,IBSET(0_8,k-1))) .EQ. 1) then
+     !do kk = 1,n_act_orb
+     !  k = list_act(kk)
+     !   if(POPCNT(IAND(Idomo,IBSET(0_8,k-1))) .EQ. 1) then
+     !      nholes += 1
+     !      listholes(nholes) = k
+     !      holetype(nholes) = 2
+     !   endif
+     !enddo
+     call bitstring_to_list(psi_configuration(1,2,i),listall,nelall,N_int)
+
+     do iii=1,nelall
+       if(listall(iii) .gt. n_core_orb)then
         nholes += 1
-           listholes(nholes) = k
+         listholes(nholes) = listall(iii)
         holetype(nholes) = 2
       endif
     end do

-     ! find vmos
-     listvmos = -1
-     vmotype = -1
-     nvmos = 0
-     !do k = n_core_orb+1,n_core_orb + n_act_orb
-     !do k = 1,mo_num
-     do kk = 1,n_act_orb
-       k = list_act(kk)
-        !print *,i,IBSET(0,i-1),POPCNT(IAND(Isomo,(IBSET(0_8,i-1)))), POPCNT(IAND(Idomo,(IBSET(0_8,i-1))))
-        if(POPCNT(IAND(Isomo,(IBSET(0_8,k-1)))) .EQ. 0 .AND. POPCNT(IAND(Idomo,(IBSET(0_8,k-1)))) .EQ. 0) then
-           nvmos += 1
-           listvmos(nvmos) = k
-           vmotype(nvmos) = 0
-        else if(POPCNT(IAND(Isomo,(IBSET(0_8,k-1)))) .EQ. 1 .AND. POPCNT(IAND(Idomo,(IBSET(0_8,k-1)))) .EQ. 0 ) then
-           nvmos += 1
-           listvmos(nvmos) = k
-           vmotype(nvmos) = 1
-        end if
-     enddo
+
+     !!! find vmos
+     !!listvmos = -1
+     !!vmotype = -1
+     !!nvmos = 0
+     !!!do k = n_core_orb+1,n_core_orb + n_act_orb
+     !!!do k = 1,mo_num
+     !!do kk = 1,n_act_orb
+     !!  k = list_act(kk)
+     !!   !print *,i,IBSET(0,i-1),POPCNT(IAND(Isomo,(IBSET(0_8,i-1)))), POPCNT(IAND(Idomo,(IBSET(0_8,i-1))))
+     !!   if(POPCNT(IAND(Isomo,(IBSET(0_8,k-1)))) .EQ. 0 .AND. POPCNT(IAND(Idomo,(IBSET(0_8,k-1)))) .EQ. 0) then
+     !!      nvmos += 1
+     !!      listvmos(nvmos) = k
+     !!      vmotype(nvmos) = 0
+     !!   else if(POPCNT(IAND(Isomo,(IBSET(0_8,k-1)))) .EQ. 1 .AND. POPCNT(IAND(Idomo,(IBSET(0_8,k-1)))) .EQ. 0 ) then
+     !!      nvmos += 1
+     !!      listvmos(nvmos) = k
+     !!      vmotype(nvmos) = 1
+     !!   end if
+     !!enddo
     !print *,"I=",i
     !call debug_spindet(psi_configuration(1,1,i),N_int)
     !call debug_spindet(psi_configuration(1,2,i),N_int)
@ -1221,27 +1234,30 @@ subroutine convertOrbIdsToModelSpaceIds(Ialpha, Jcfg, p, q, extype, pmodel, qmod
  integer,intent(in)             :: p,q
  integer,intent(in)             :: extype
  integer,intent(out)            :: pmodel,qmodel
-  !integer(bit_kind)              :: Isomo(N_int)
-  !integer(bit_kind)              :: Idomo(N_int)
-  !integer(bit_kind)              :: Jsomo(N_int)
-  !integer(bit_kind)              :: Jdomo(N_int)
-  integer*8                       :: Isomo
-  integer*8                       :: Idomo
-  integer*8                       :: Jsomo
-  integer*8                       :: Jdomo
+  integer(bit_kind)              :: Isomo(N_int)
+  integer(bit_kind)              :: Idomo(N_int)
+  integer(bit_kind)              :: Jsomo(N_int)
+  integer(bit_kind)              :: Jdomo(N_int)
+  !integer*8                       :: Isomo
+  !integer*8                       :: Idomo
+  !integer*8                       :: Jsomo
+  !integer*8                       :: Jdomo
  integer*8                      :: mask
-  integer                        :: iint, ipos
+  integer                        :: iint, ipos, ii
  !integer(bit_kind)              :: Isomotmp(N_int)
  !integer(bit_kind)              :: Jsomotmp(N_int)
  integer*8             :: Isomotmp
  integer*8             :: Jsomotmp
  integer                        :: pos0,pos0prev
+  integer                        :: tmpp, tmpq

  ! TODO Flag (print) when model space indices is > 64
-  Isomo = Ialpha(1,1)
-  Idomo = Ialpha(1,2)
-  Jsomo = Jcfg(1,1)
-  Jdomo = Jcfg(1,2)
+  do ii=1,N_int
+    Isomo(ii) = Ialpha(ii,1)
+    Idomo(ii) = Ialpha(ii,2)
+    Jsomo(ii) = Jcfg(ii,1)
+    Jdomo(ii) = Jcfg(ii,2)
+  end do
  pos0prev = 0
  pmodel = p
  qmodel = q
@ -1255,40 +1271,155 @@ subroutine convertOrbIdsToModelSpaceIds(Ialpha, Jcfg, p, q, extype, pmodel, qmod
          ! SOMO -> SOMO
          ! remove all domos
          !print *,"type -> SOMO -> SOMO"
-          mask = ISHFT(1_8,p) - 1
-          Isomotmp = IAND(Isomo,mask)
-          pmodel = POPCNT(mask) - POPCNT(XOR(Isomotmp,mask))
-          mask = ISHFT(1_8,q) - 1
-          Isomotmp = IAND(Isomo,mask)
-          qmodel = POPCNT(mask) - POPCNT(XOR(Isomotmp,mask))
+          !mask = ISHFT(1_8,p) - 1
+          !Isomotmp = IAND(Isomo,mask)
+          !pmodel = POPCNT(mask) - POPCNT(XOR(Isomotmp,mask))
+          !mask = ISHFT(1_8,q) - 1
+          !Isomotmp = IAND(Isomo,mask)
+          !qmodel = POPCNT(mask) - POPCNT(XOR(Isomotmp,mask))
+
+          iint = shiftr(p-1,bit_kind_shift) + 1
+          ipos = p-shiftl((iint-1),bit_kind_shift)-1
+          tmpp = 0
+          !print *,"iint=",iint, " p=",p
+          do ii=1,iint-1
+            !mask = ISHFT(1_bit_kind,-1)-1_bit_kind
+            !Isomotmp = IAND(Isomo(ii),mask)
+            !tmpp += POPCNT(mask) - POPCNT(XOR(Isomotmp,mask))
+            tmpp += POPCNT(Isomo(ii))
+          end do
+          mask = ISHFT(1_bit_kind,ipos+1) - 1
+          Isomotmp = IAND(Isomo(iint),mask)
+          !pmodel = tmpp + POPCNT(mask) - POPCNT(XOR(Isomotmp,mask))
+          pmodel = tmpp + POPCNT(Isomotmp)
+          !print *,"iint=",iint, " ipos=",ipos,"pmodel=",pmodel, XOR(Isomotmp,mask),Isomo(iint)
+
+          iint = shiftr(q-1,bit_kind_shift) + 1
+          ipos = q-shiftl((iint-1),bit_kind_shift)-1
+          tmpq = 0
+          do ii=1,iint-1
+            !mask = ISHFT(1_bit_kind,-1)-1_bit_kind
+            !Isomotmp = IAND(Isomo(ii),mask)
+            !tmpq += POPCNT(mask) - POPCNT(XOR(Isomotmp,mask))
+            tmpq += POPCNT(Isomo(ii))
+          end do
+          mask = ISHFT(1_bit_kind,ipos+1) - 1
+          Isomotmp = IAND(Isomo(iint),mask)
+          !qmodel = tmpq + POPCNT(mask) - POPCNT(XOR(Isomotmp,mask))
+          qmodel = tmpq + POPCNT(Isomotmp)
+          !print *,"iint=",iint, " ipos=",ipos,"qmodel=",qmodel
       case (2)
          ! DOMO -> VMO
          ! remove all domos except one at p
          !print *,"type -> DOMO -> VMO"
-          mask = ISHFT(1_8,p) - 1
-          Jsomotmp = IAND(Jsomo,mask)
-          pmodel = POPCNT(mask) - POPCNT(XOR(Jsomotmp,mask))
-          mask = ISHFT(1_8,q) - 1
-          Jsomotmp = IAND(Jsomo,mask)
-          qmodel = POPCNT(mask) - POPCNT(XOR(Jsomotmp,mask))
+          !mask = ISHFT(1_8,p) - 1
+          !Jsomotmp = IAND(Jsomo,mask)
+          !pmodel = POPCNT(mask) - POPCNT(XOR(Jsomotmp,mask))
+          !mask = ISHFT(1_8,q) - 1
+          !Jsomotmp = IAND(Jsomo,mask)
+          !qmodel = POPCNT(mask) - POPCNT(XOR(Jsomotmp,mask))
+
+          iint = shiftr(p-1,bit_kind_shift) + 1
+          ipos = p-shiftl((iint-1),bit_kind_shift)-1
+          tmpp = 0
+          do ii=1,iint-1
+            !mask = ISHFT(1_bit_kind,-1)-1_bit_kind
+            !Jsomotmp = IAND(Jsomo(ii),mask)
+            !tmpp += POPCNT(mask) - POPCNT(XOR(Jsomotmp,mask))
+            tmpp += POPCNT(Jsomo(ii))
+          end do
+          mask = ISHFT(1_bit_kind,ipos+1) - 1
+          Jsomotmp = IAND(Jsomo(iint),mask)
+          !pmodel = tmpp + POPCNT(mask) - POPCNT(XOR(Jsomotmp,mask))
+          pmodel = tmpp + POPCNT(Jsomotmp)
+
+          iint = shiftr(q-1,bit_kind_shift) + 1
+          ipos = q-shiftl((iint-1),bit_kind_shift)-1
+          tmpq = 0
+          do ii=1,iint-1
+            !mask = ISHFT(1_bit_kind,-1)-1_bit_kind
+            !Jsomotmp = IAND(Jsomo(ii),mask)
+            !tmpq += POPCNT(mask) - POPCNT(XOR(Jsomotmp,mask))
+            tmpq += POPCNT(Jsomo(ii))
+          end do
+          mask = ISHFT(1_bit_kind,ipos+1) - 1
+          Jsomotmp = IAND(Jsomo(iint),mask)
+          !qmodel = tmpq + POPCNT(mask) - POPCNT(XOR(Jsomotmp,mask))
+          qmodel = tmpq + POPCNT(Jsomotmp)
       case (3)
          ! SOMO -> VMO
          !print *,"type -> SOMO -> VMO"
          !Isomo = IEOR(Isomo,Jsomo)
          if(p.LT.q) then
-             mask = ISHFT(1_8,p) - 1
-             Isomo = IAND(Isomo,mask)
-             pmodel = POPCNT(mask) - POPCNT(XOR(Isomo,mask))
-             mask = ISHFT(1_8,q) - 1
-             Jsomo = IAND(Jsomo,mask)
-             qmodel = POPCNT(mask) - POPCNT(XOR(Jsomo,mask)) + 1
+             !mask = ISHFT(1_8,p) - 1
+             !Isomo = IAND(Isomo,mask)
+             !pmodel = POPCNT(mask) - POPCNT(XOR(Isomo,mask))
+             !mask = ISHFT(1_8,q) - 1
+             !Jsomo = IAND(Jsomo,mask)
+             !qmodel = POPCNT(mask) - POPCNT(XOR(Jsomo,mask)) + 1
+
+             iint = shiftr(p-1,bit_kind_shift) + 1
+             ipos = p-shiftl((iint-1),bit_kind_shift)-1
+             tmpp = 0
+             do ii=1,iint-1
+               !mask = ISHFT(1_bit_kind,-1)-1_bit_kind
+               !Isomotmp = IAND(Isomo(ii),mask)
+               !tmpp += POPCNT(mask) - POPCNT(XOR(Isomotmp,mask))
+               tmpp += POPCNT(Isomo(ii))
+             end do
+             mask = ISHFT(1_bit_kind,ipos+1) - 1
+             Isomotmp = IAND(Isomo(iint),mask)
+             !pmodel = tmpp + POPCNT(mask) - POPCNT(XOR(Isomotmp,mask))
+             pmodel = tmpp + POPCNT(Isomotmp)
+
+             iint = shiftr(q-1,bit_kind_shift) + 1
+             ipos = q-shiftl((iint-1),bit_kind_shift)-1
+             tmpq = 0
+             do ii=1,iint-1
+               !mask = ISHFT(1_bit_kind,-1)-1_bit_kind
+               !Jsomotmp = IAND(Jsomo(ii),mask)
+               !tmpq += POPCNT(mask) - POPCNT(XOR(Jsomotmp,mask))
+               tmpq += POPCNT(Jsomo(ii))
+             end do
+             mask = ISHFT(1_bit_kind,ipos+1) - 1
+             Jsomotmp = IAND(Jsomo(iint),mask)
+             !qmodel = tmpq + POPCNT(mask) - POPCNT(XOR(Jsomotmp,mask)) + 1
+             qmodel = tmpq + POPCNT(Jsomotmp) + 1
          else
-             mask = ISHFT(1_8,p) - 1
-             Isomo = IAND(Isomo,mask)
-             pmodel = POPCNT(mask) - POPCNT(XOR(Isomo,mask)) + 1
-             mask = ISHFT(1_8,q) - 1
-             Jsomo = IAND(Jsomo,mask)
-             qmodel = POPCNT(mask) - POPCNT(XOR(Jsomo,mask))
+             !mask = ISHFT(1_8,p) - 1
+             !Isomo = IAND(Isomo,mask)
+             !pmodel = POPCNT(mask) - POPCNT(XOR(Isomo,mask)) + 1
+             !mask = ISHFT(1_8,q) - 1
+             !Jsomo = IAND(Jsomo,mask)
+             !qmodel = POPCNT(mask) - POPCNT(XOR(Jsomo,mask))
+
+             iint = shiftr(p-1,bit_kind_shift) + 1
+             ipos = p-shiftl((iint-1),bit_kind_shift)-1
+             tmpp = 0
+             do ii=1,iint-1
+               !mask = ISHFT(1_bit_kind,-1)-1_bit_kind
+               !Isomotmp = IAND(Isomo(ii),mask)
+               !tmpp += POPCNT(mask) - POPCNT(XOR(Isomotmp,mask))
+               tmpp += POPCNT(Isomo(ii))
+             end do
+             mask = ISHFT(1_bit_kind,ipos+1) - 1
+             Isomotmp = IAND(Isomo(iint),mask)
+             !pmodel = tmpp + POPCNT(mask) - POPCNT(XOR(Isomotmp,mask)) + 1
+             pmodel = tmpp + POPCNT(Isomotmp) + 1
+
+             iint = shiftr(q-1,bit_kind_shift) + 1
+             ipos = q-shiftl((iint-1),bit_kind_shift)-1
+             tmpq = 0
+             do ii=1,iint-1
+               !mask = ISHFT(1_bit_kind,-1)-1_bit_kind
+               !Jsomotmp = IAND(Jsomo(ii),mask)
+               !tmpq += POPCNT(mask) - POPCNT(XOR(Jsomotmp,mask))
+               tmpq += POPCNT(Jsomo(ii))
+             end do
+             mask = ISHFT(1_bit_kind,ipos+1) - 1
+             Jsomotmp = IAND(Jsomo(iint),mask)
+             !qmodel = tmpq + POPCNT(mask) - POPCNT(XOR(Jsomotmp,mask))
+             qmodel = tmpq + POPCNT(Jsomotmp)
          endif
       case (4)
          ! DOMO -> SOMO
@ -1296,19 +1427,75 @@ subroutine convertOrbIdsToModelSpaceIds(Ialpha, Jcfg, p, q, extype, pmodel, qmod
          !print *,"type -> DOMO -> SOMO"
          !Isomo = IEOR(Isomo,Jsomo)
          if(p.LT.q) then
-             mask = ISHFT(1_8,p) - 1
-             Jsomo = IAND(Jsomo,mask)
-             pmodel = POPCNT(mask) - POPCNT(XOR(Jsomo,mask))
-             mask = ISHFT(1_8,q) - 1
-             Isomo = IAND(Isomo,mask)
-             qmodel = POPCNT(mask) - POPCNT(XOR(Isomo,mask)) + 1
+             !mask = ISHFT(1_8,p) - 1
+             !Jsomo = IAND(Jsomo,mask)
+             !pmodel = POPCNT(mask) - POPCNT(XOR(Jsomo,mask))
+             !mask = ISHFT(1_8,q) - 1
+             !Isomo = IAND(Isomo,mask)
+             !qmodel = POPCNT(mask) - POPCNT(XOR(Isomo,mask)) + 1
+
+             iint = shiftr(p-1,bit_kind_shift) + 1
+             ipos = p-shiftl((iint-1),bit_kind_shift)-1
+             tmpp = 0
+             do ii=1,iint-1
+               !mask = ISHFT(1_bit_kind,-1)-1_bit_kind
+               !Jsomotmp = IAND(Jsomo(ii),mask)
+               !tmpp += POPCNT(mask) - POPCNT(XOR(Jsomotmp,mask))
+               tmpp += POPCNT(Jsomo(ii))
+             end do
+             mask = ISHFT(1_bit_kind,ipos+1) - 1
+             Jsomotmp = IAND(Jsomo(iint),mask)
+             !pmodel = tmpp + POPCNT(mask) - POPCNT(XOR(Jsomotmp,mask))
+             pmodel = tmpp + POPCNT(Jsomotmp)
+
+             iint = shiftr(q-1,bit_kind_shift) + 1
+             ipos = q-shiftl((iint-1),bit_kind_shift)-1
+             tmpq = 0
+             do ii=1,iint-1
+               !mask = ISHFT(1_bit_kind,-1)-1_bit_kind
+               !Isomotmp = IAND(Isomo(ii),mask)
+               !tmpq += POPCNT(mask) - POPCNT(XOR(Isomotmp,mask))
+               tmpq += POPCNT(Isomo(ii))
+             end do
+             mask = ISHFT(1_bit_kind,ipos+1) - 1
+             Isomotmp = IAND(Isomo(iint),mask)
+             !qmodel = tmpq + POPCNT(mask) - POPCNT(XOR(Isomotmp,mask)) + 1
+             qmodel = tmpq + POPCNT(Isomotmp) + 1
          else
-             mask = ISHFT(1_8,p) - 1
-             Jsomo = IAND(Jsomo,mask)
-             pmodel = POPCNT(mask) - POPCNT(XOR(Jsomo,mask)) + 1
-             mask = ISHFT(1_8,q) - 1
-             Isomo = IAND(Isomo,mask)
-             qmodel = POPCNT(mask) - POPCNT(XOR(Isomo,mask))
+             !mask = ISHFT(1_8,p) - 1
+             !Jsomo = IAND(Jsomo,mask)
+             !pmodel = POPCNT(mask) - POPCNT(XOR(Jsomo,mask)) + 1
+             !mask = ISHFT(1_8,q) - 1
+             !Isomo = IAND(Isomo,mask)
+             !qmodel = POPCNT(mask) - POPCNT(XOR(Isomo,mask))
+
+             iint = shiftr(p-1,bit_kind_shift) + 1
+             ipos = p-shiftl((iint-1),bit_kind_shift)-1
+             tmpp = 0
+             do ii=1,iint-1
+               !mask = ISHFT(1_bit_kind,-1)-1_bit_kind
+               !Jsomotmp = IAND(Jsomo(ii),mask)
+               !tmpp += POPCNT(mask) - POPCNT(XOR(Jsomotmp,mask))
+               tmpp += POPCNT(Jsomo(ii))
+             end do
+             mask = ISHFT(1_bit_kind,ipos+1) - 1
+             Jsomotmp = IAND(Jsomo(iint),mask)
+             !pmodel = tmpp + POPCNT(mask) - POPCNT(XOR(Jsomotmp,mask)) + 1
+             pmodel = tmpp + POPCNT(Jsomotmp) + 1
+
+             iint = shiftr(q-1,bit_kind_shift) + 1
+             ipos = q-shiftl((iint-1),bit_kind_shift)-1
+             tmpq = 0
+             do ii=1,iint-1
+               !mask = ISHFT(1_bit_kind,-1)-1_bit_kind
+               !Isomotmp = IAND(Isomo(ii),mask)
+               !tmpq += POPCNT(mask) - POPCNT(XOR(Isomotmp,mask))
+               tmpq += POPCNT(Isomo(ii))
+             end do
+             mask = ISHFT(1_bit_kind,ipos+1) - 1
+             Isomotmp = IAND(Isomo(iint),mask)
+             !qmodel = tmpq + POPCNT(mask) - POPCNT(XOR(Isomotmp,mask))
+             qmodel = tmpq + POPCNT(Isomotmp)
          endif
       case default
          print *,"something is wrong in convertOrbIdsToModelSpaceIds"
@ -1366,8 +1553,13 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze
  integer                        :: rowsTKI
  integer                        :: noccpp
  integer                        :: istart_cfg, iend_cfg, num_threads_max
+  integer                        :: iint, jint, ipos, jpos, Nsomo_I, iii
  integer                        :: nconnectedJ,nconnectedtotalmax,nconnectedmaxJ,maxnalphas,ntotJ
-  integer*8                      :: MS, Isomo, Idomo, Jsomo, Jdomo, Ialpha, Ibeta
+  integer*8                      :: MS,Ialpha, Ibeta
+  integer(bit_kind)              :: Isomo(N_INT)
+  integer(bit_kind)              :: Idomo(N_INT)
+  integer(bit_kind)              :: Jsomo(N_INT)
+  integer(bit_kind)              :: Jdomo(N_INT)
  integer                        :: moi, moj, mok, mol, starti, endi, startj, endj, cnti, cntj, cntk
  real*8                         :: norm_coef_cfg, fac2eints
  real*8                         :: norm_coef_det
@ -1382,6 +1574,8 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze
  real*8,dimension(:),allocatable:: diag_energies
  real*8                         :: tmpvar, tmptot
  real*8                         :: core_act_contrib
+  integer :: listall(N_int*bit_kind_size), nelall
+  integer :: countelec

  integer(omp_lock_kind), allocatable :: lock(:)
  call omp_set_max_active_levels(1)
@ -1410,8 +1604,8 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze
  !nconnectedtotalmax = 1000
  !nconnectedmaxJ = 1000
  maxnalphas = elec_num*mo_num
-  Icfg(1,1) = psi_configuration(1,1,1)
-  Icfg(1,2) = psi_configuration(1,2,1)
+  Icfg(:,1) = psi_configuration(:,1,1)
+  Icfg(:,2) = psi_configuration(:,2,1)
  allocate(listconnectedJ(N_INT,2,max(sze,10000)))
  allocate(idslistconnectedJ(max(sze,10000)))
  call obtain_connected_J_givenI(1, Icfg, listconnectedJ, idslistconnectedJ, nconnectedmaxJ, nconnectedtotalmax)
@ -1443,6 +1637,7 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze
      !$OMP shared(istart_cfg, iend_cfg, psi_configuration, mo_num, psi_config_data,&
      !$OMP    N_int, N_st, psi_out, psi_in, h_core_ri, core_energy, h_act_ri, AIJpqContainer,&
      !$OMP     pp, sze, NalphaIcfg_list,alphasIcfg_list, bit_tmp,       &
+      !$OMP     qq, iint, jint, ipos, jpos, nelall, listall, Nsomo_I, countelec,&
      !$OMP     AIJpqMatrixDimsList, diag_energies, n_CSF, lock, NBFmax,nconnectedtotalmax, nconnectedmaxJ,maxnalphas,&
      !$OMP     n_core_orb, n_act_orb, list_act, n, list_core,  list_core_is_built,core_act_contrib, num_threads_max,&
      !$OMP     n_core_orb_is_built, mo_integrals_map, mo_integrals_map_is_built)
@ -1465,10 +1660,12 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze
    ! else
    ! cycle

-     Icfg(1,1) = psi_configuration(1,1,i)
-     Icfg(1,2) = psi_configuration(1,2,i)
-     Isomo = Icfg(1,1)
-     Idomo = Icfg(1,2)
+    do ii=1,N_INT
+     Icfg(ii,1) = psi_configuration(ii,1,i)
+     Icfg(ii,2) = psi_configuration(ii,2,i)
+     Isomo(ii) = Icfg(ii,1)
+     Idomo(ii) = Icfg(ii,2)
+    enddo
    NSOMOI = getNSOMO(Icfg)

     ! find out all pq holes possible
@ -1479,44 +1676,88 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze
     ! list_core_inact
     ! bitmasks
     !do k = 1,mo_num
-     do kk = 1,n_act_orb
-       k = list_act(kk)
-       if(POPCNT(IAND(Isomo,IBSET(0_8,k-1))) .EQ. 1) then
+    ! do kk = 1,n_act_orb
+    !   k = list_act(kk)
+    !   if(POPCNT(IAND(Isomo,IBSET(0_8,k-1))) .EQ. 1) then
+    !     nholes += 1
+    !     listholes(nholes) = k
+    !     holetype(nholes) = 1
+    !   endif
+    ! enddo
+    ! ! holes in DOMO
+    ! !do k = 1,mo_num
+    ! do kk = 1,n_act_orb
+    !   k = list_act(kk)
+    !   if(POPCNT(IAND(Idomo,IBSET(0_8,k-1))) .EQ. 1) then
+    !     nholes += 1
+    !     listholes(nholes) = k
+    !     holetype(nholes) = 2
+    !   endif
+    ! enddo
+
+    ! ! find vmos
+    ! do kk = 1,n_act_orb
+    !   k = list_act(kk)
+    !   !print *,i,IBSET(0,i-1),POPCNT(IAND(Isomo,(IBSET(0_8,i-1)))), POPCNT(IAND(Idomo,(IBSET(0_8,i-1))))
+    !   if(POPCNT(IAND(Isomo,(IBSET(0_8,k-1)))) .EQ. 0 .AND. POPCNT(IAND(Idomo,(IBSET(0_8,k-1)))) .EQ. 0) then
+    !     nvmos += 1
+    !     listvmos(nvmos) = k
+    !     vmotype(nvmos) = 0
+    !   else if(POPCNT(IAND(Isomo,(IBSET(0_8,k-1)))) .EQ. 1 .AND. POPCNT(IAND(Idomo,(IBSET(0_8,k-1)))) .EQ. 0 ) then
+    !     nvmos += 1
+    !     listvmos(nvmos) = k
+    !     vmotype(nvmos) = 1
+    !   end if
+    ! enddo
+
+  ! find out all pq holes possible
+    nholes = 0
+        call bitstring_to_list(Isomo,listall,nelall,N_int)
+
+        do iii=1,nelall
          nholes += 1
-         listholes(nholes) = k
+          listholes(nholes) = listall(iii)
          holetype(nholes) = 1
-       endif
        end do
-     ! holes in DOMO
-     !do k = 1,mo_num
-     do kk = 1,n_act_orb
-       k = list_act(kk)
-       if(POPCNT(IAND(Idomo,IBSET(0_8,k-1))) .EQ. 1) then
+
+        Nsomo_I = nelall
+
+        call bitstring_to_list(Idomo,listall,nelall,N_int)
+
+        do iii=1,nelall
+          if(listall(iii) .gt. n_core_orb)then
            nholes += 1
-         listholes(nholes) = k
+            listholes(nholes) = listall(iii)
            holetype(nholes) = 2
          endif
        end do

-     ! find vmos
+
     listvmos = -1
     vmotype = -1
     nvmos = 0
-     do kk = 1,n_act_orb
-       k = list_act(kk)
-       !print *,i,IBSET(0,i-1),POPCNT(IAND(Isomo,(IBSET(0_8,i-1)))), POPCNT(IAND(Idomo,(IBSET(0_8,i-1))))
-       if(POPCNT(IAND(Isomo,(IBSET(0_8,k-1)))) .EQ. 0 .AND. POPCNT(IAND(Idomo,(IBSET(0_8,k-1)))) .EQ. 0) then
+  ! find vmos
+    ! Take into account N_int
+    do ii = 1, n_act_orb
+      iii = list_act(ii)
+      iint = shiftr(iii-1,bit_kind_shift) + 1
+      ipos = iii-shiftl((iint-1),bit_kind_shift)-1
+
+      if(IAND(Idomo(iint),(IBSET(0_8,ipos))) .EQ. 0) then
+        if(IAND(Isomo(iint),(IBSET(0_8,ipos))) .EQ. 0) then
          nvmos += 1
-         listvmos(nvmos) = k
-         vmotype(nvmos) = 0
-       else if(POPCNT(IAND(Isomo,(IBSET(0_8,k-1)))) .EQ. 1 .AND. POPCNT(IAND(Idomo,(IBSET(0_8,k-1)))) .EQ. 0 ) then
-         nvmos += 1
-         listvmos(nvmos) = k
+          listvmos(nvmos) = iii
          vmotype(nvmos) = 1
+        else if(POPCNT(IAND(Isomo(iint),(IBSET(0_8,ipos)))) .EQ. 1) then
+          nvmos += 1
+          listvmos(nvmos) = iii
+          vmotype(nvmos) = 2
+        end if
      end if
    end do


+
     ! Icsf ids
     starti = psi_config_data(i,1)
     endi   = psi_config_data(i,2)
@ -1533,16 +1774,31 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze
      extype = excitationTypes_single(j)
      ! Off diagonal terms
      call convertOrbIdsToModelSpaceIds(Icfg, singlesI(1,1,j), p, q, extype, pmodel, qmodel)
-      Jsomo = singlesI(1,1,j)
-      Jdomo = singlesI(1,2,j)
+      do ii=1,N_INT
+        Jsomo(ii) = singlesI(1,1,j)
+        Jdomo(ii) = singlesI(1,2,j)
+      enddo
+
+      ! Get actual p pos
+      pp  = p
+      iint = shiftr(pp-1,bit_kind_shift) + 1
+      ipos = pp-shiftl((iint-1),bit_kind_shift)-1
+
+      ! Get actual q pos
+      qq  = q
+      jint = shiftr(qq-1,bit_kind_shift) + 1
+      jpos = qq-shiftl((jint-1),bit_kind_shift)-1

      ! Add the hole on J
-      if(POPCNT(IAND(Jsomo,IBSET(0_8,q-1))) .EQ. 1  .AND. POPCNT(IAND(Isomo,IBSET(0_8,q-1))) .EQ. 0) then
+      !if(POPCNT(IAND(Jsomo,IBSET(0_8,q-1))) .EQ. 1  .AND. POPCNT(IAND(Isomo,IBSET(0_8,q-1))) .EQ. 0) then
+      if(POPCNT(IAND(Jsomo(jint),IBSET(0_8,jpos))) .EQ. 1  .AND. POPCNT(IAND(Isomo(jint),IBSET(0_8,jpos))) .EQ. 0) then
        nholes += 1
        listholes(nholes) = q
        holetype(nholes) = 1
      endif
-      if((POPCNT(IAND(Jdomo,IBSET(0_8,q-1))) .EQ. 1 .AND. POPCNT(IAND(Idomo,IBSET(0_8,q-1))) .EQ. 0) .AND. POPCNT(IAND(Isomo,IBSET(0_8,q-1))) .EQ. 0) then
+      !if((POPCNT(IAND(Jdomo,IBSET(0_8,q-1))) .EQ. 1 .AND. POPCNT(IAND(Idomo,IBSET(0_8,q-1))) .EQ. 0) .AND. POPCNT(IAND(Isomo,IBSET(0_8,q-1))) .EQ. 0) then
+      if((POPCNT(IAND(Jdomo(jint),IBSET(0_8,jpos))) .EQ. 1 .AND. POPCNT(IAND(Idomo(jint),IBSET(0_8,jpos))) .EQ. 0) .AND.&
+      POPCNT(IAND(Isomo(jint),IBSET(0_8,jpos))) .EQ. 0) then
        nholes += 1
        listholes(nholes) = q
        holetype(nholes) = 2
@ -1578,10 +1834,12 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze
      enddo

      ! Undo setting in listholes
-      if(POPCNT(IAND(Jsomo,IBSET(0_8,q-1))) .EQ. 1  .AND. POPCNT(IAND(Isomo,IBSET(0_8,q-1))) .EQ. 0) then
+      !if(POPCNT(IAND(Jsomo,IBSET(0_8,q-1))) .EQ. 1  .AND. POPCNT(IAND(Isomo,IBSET(0_8,q-1))) .EQ. 0) then
+      if(POPCNT(IAND(Jsomo(jint),IBSET(0_8,jpos))) .EQ. 1  .AND. POPCNT(IAND(Isomo(jint),IBSET(0_8,jpos))) .EQ. 0) then
        nholes -= 1
      endif
-      if((POPCNT(IAND(Jdomo,IBSET(0_8,q-1))) .EQ. 1 .AND. POPCNT(IAND(Idomo,IBSET(0_8,q-1))) .EQ. 0) .AND. POPCNT(IAND(Isomo,IBSET(0_8,q-1))) .EQ. 0) then
+      if((POPCNT(IAND(Jdomo(jint),IBSET(0_8,jpos))) .EQ. 1 .AND. POPCNT(IAND(Idomo(jint),IBSET(0_8,jpos))) .EQ. 0) .AND.&
+      POPCNT(IAND(Isomo(jint),IBSET(0_8,jpos))) .EQ. 0) then
        nholes -= 1
      endif
    enddo
@ -1593,6 +1851,9 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze
  deallocate(excitationTypes_single)

  !print *," singles part psi(1,1)=",psi_out(1,1)
+  !do i=1,n_CSF
+  !  print *,"i=",i," psi(i)=",psi_out(1,i)
+  !enddo
  
  allocate(listconnectedJ(N_INT,2,max(sze,10000)))
  allocate(alphas_Icfg(N_INT,2,max(sze,10000)))
@ -1607,7 +1868,6 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze
  !!!====================!!!
  !!! Double Excitations !!!
  !!!====================!!!
-
  ! Loop over all selected configurations
  !$OMP DO SCHEDULE(static)
  do i = istart_cfg,iend_cfg
@ -1617,8 +1877,10 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze
     ! else
     ! cycle

-     Icfg(1,1) = psi_configuration(1,1,i)
-     Icfg(1,2) = psi_configuration(1,2,i)
+     do ii=1,N_INT
+       Icfg(ii,1) = psi_configuration(ii,1,i)
+       Icfg(ii,2) = psi_configuration(ii,2,i)
+     enddo
     starti = psi_config_data(i,1)
     endi   = psi_config_data(i,2)

@ -1629,14 +1891,15 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze

     Nalphas_Icfg = NalphaIcfg_list(i)
     alphas_Icfg(1:n_int,1:2,1:Nalphas_Icfg) = alphasIcfg_list(1:n_int,1:2,i,1:Nalphas_Icfg)
-     if(Nalphas_Icfg .GT. maxnalphas) then
-       print *,"Nalpha > maxnalpha"
-     endif
+     !if(Nalphas_Icfg .GT. maxnalphas) then
+     !  print *,"Nalpha > maxnalpha"
+     !endif

-     call obtain_connected_J_givenI(i, Icfg, listconnectedJ, idslistconnectedJ, nconnectedJ, ntotJ)
+     !call obtain_connected_J_givenI(i, Icfg, listconnectedJ, idslistconnectedJ, nconnectedJ, ntotJ)

     ! TODO : remove doubly excited for return
-     !print *,"I=",i," isomo=",psi_configuration(1,1,i)," idomo=",psi_configuration(1,2,i), " psiout=",psi_out(1,5)
+     !print *,"I=",i,"isomo=",psi_configuration(1,1,i),psi_configuration(2,1,i),POPCNT(psi_configuration(1,1,i)),POPCNT(psi_configuration(2,1,i)),&
+     !"idomo=",psi_configuration(1,2,i),psi_configuration(2,2,i),POPCNT(psi_configuration(1,2,i)),POPCNT(psi_configuration(2,2,i)), "Nalphas_Icfg=",Nalphas_Icfg
     do k = 1,Nalphas_Icfg
        ! Now generate all singly excited with respect to a given alpha CFG

@ -1647,15 +1910,18 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze
        call obtain_connected_I_foralpha(i, alphas_Icfg(1,1,k), connectedI_alpha, idxs_connectedI_alpha, &
                                         nconnectedI, excitationIds, excitationTypes, diagfactors)

+        !if(i .EQ. 218) then
+        !   print *,'k=',k,' kcfgSOMO=',alphas_Icfg(1,1,k),alphas_Icfg(2,1,k),' ',POPCNT(alphas_Icfg(1,1,k)),' &
+        !   kcfgDOMO=',alphas_Icfg(1,2,k),alphas_Icfg(2,2,k),' ',POPCNT(alphas_Icfg(1,2,k)), " NconnectedI=",nconnectedI
+        !   !print *,'k=',k,' kcfgSOMO=',alphas_Icfg(1,1,k),' ',POPCNT(alphas_Icfg(1,1,k)),' &
+        !   !kcfgDOMO=',alphas_Icfg(1,2,k),' ',POPCNT(alphas_Icfg(1,2,k)), " NconnectedI=",nconnectedI
+        !endif
+
        
        if(nconnectedI .EQ. 0) then
           cycle
        endif

-        !if(i .EQ. 1) then
-        !   print *,'k=',k,' kcfgSOMO=',alphas_Icfg(1,1,k),' ',POPCNT(alphas_Icfg(1,1,k)),' kcfgDOMO=',alphas_Icfg(1,2,k),' ',POPCNT(alphas_Icfg(1,2,k))
-        !endif
-
        ! Here we do 2x the loop. One to count for the size of the matrix, then we compute.
        totcolsTKI = 0
        rowsTKI = -1
@ -1665,15 +1931,30 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze
           p = excitationIds(1,j)
           q = excitationIds(2,j)
           extype = excitationTypes(j)
+           !print *,"K=",k,"j=",j, "countelec=",countelec," p=",p," q=",q, " extype=",extype, "NSOMOalpha=",NSOMOalpha," NSOMOI=",NSOMOI, "alphas_Icfg(1,1,k)=",alphas_Icfg(1,1,k), &
+           !alphas_Icfg(2,1,k), " domo=",alphas_Icfg(1,2,k), alphas_Icfg(2,2,k), " connected somo=",connectedI_alpha(1,1,j), &
+           !connectedI_alpha(2,1,j), " domo=",connectedI_alpha(1,2,j), connectedI_alpha(2,2,j)
           call convertOrbIdsToModelSpaceIds(alphas_Icfg(1,1,k), connectedI_alpha(1,1,j), p, q, extype, pmodel, qmodel)
           ! for E_pp E_rs and E_ppE_rr case
           rowsikpq = AIJpqMatrixDimsList(NSOMOalpha,extype,pmodel,qmodel,1)
           colsikpq = AIJpqMatrixDimsList(NSOMOalpha,extype,pmodel,qmodel,2)
+           !if(i.eq.218)then
+           !  print *,"j=",j," k=",k,"p=",p,"q=",q,"NSOMOalpha=",NSOMOalpha, "pmodel=",pmodel,"qmodel=",qmodel, "extype=",extype,&
+           !  "conn somo=",connectedI_alpha(1,1,j),connectedI_alpha(2,1,j),&
+           !  "conn domo=",connectedI_alpha(1,2,j),connectedI_alpha(2,2,j)
+           !  do m=1,colsikpq
+           !    print *,idxs_connectedI_alpha(j)+m-1
+           !  enddo
+           !endif
           !print *,"j=",j," Nsomo=",NSOMOalpha," rowsikpq=",rowsikpq," colsikpq=",colsikpq, " p=",pmodel," q=",qmodel, " extyp=",extype
           totcolsTKI += colsikpq
           rowsTKI = rowsikpq
        enddo

+        !if(i.eq.1)then
+        !  print *,"n_st=",n_st,"rowsTKI=",rowsTKI, " nconnectedI=",nconnectedI, &
+        !  "totcolsTKI=",totcolsTKI
+        !endif
        allocate(TKI(n_st,rowsTKI,totcolsTKI)) ! coefficients of CSF
        ! Initialize the integral container
        ! dims : (totcolsTKI, nconnectedI)
@ -1703,10 +1984,10 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze
                 TKI(kk,l,totcolsTKI+m) = AIJpqContainer(l,m,pmodel,qmodel,extype,NSOMOalpha) &
                    * psi_in(kk,idxs_connectedI_alpha(j)+m-1)
              enddo
-           !if(i.eq.1) then
-           !      print *,AIJpqContainer(l,m,pmodel,qmodel,extype,NSOMOalpha)
-           !endif
           enddo
+           !if(i.eq.1) then
+           !      print *,"j=",j,"psi_in=",psi_in(1,idxs_connectedI_alpha(j)+m-1)
+           !endif
           enddo

           diagfactors_0 = diagfactors(j)*0.5d0
@ -1745,16 +2026,24 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze
           rowsTKI = rowsikpq
           CCmattmp = 0.d0

+        !if(i.eq.1)then
+        !  print *,"\t n_st=",n_st," colsikpq=",colsikpq," rowsTKI=",rowsTKI,&
+        !    " | ",size(TKIGIJ,1),size(AIJpqContainer,1),size(CCmattmp,1)
+        !endif
           call dgemm('N','N', n_st, colsikpq, rowsTKI, 1.d0,        &
               TKIGIJ(1,1,j), size(TKIGIJ,1),                        &
               AIJpqContainer(1,1,pmodel,qmodel,extype,NSOMOalpha),  &
               size(AIJpqContainer,1), 0.d0,                         &
               CCmattmp, size(CCmattmp,1) )

+           !print *,"j=",j,"colsikpq=",colsikpq, "sizeTIG=",size(TKIGIJ,1),"sizeaijpq=",size(AIJpqContainer,1)
           do m = 1,colsikpq
              call omp_set_lock(lock(idxs_connectedI_alpha(j)+m-1))
              do kk = 1,n_st
                 psi_out(kk,idxs_connectedI_alpha(j)+m-1) += CCmattmp(kk,m)
+                 !if(dabs(CCmattmp(kk,m)).gt.1e-10)then
+                 !  print *, CCmattmp(kk,m), " | ",idxs_connectedI_alpha(j)+m-1
+                 !end if
              enddo
              call omp_unset_lock(lock(idxs_connectedI_alpha(j)+m-1))
           enddo
@ -1789,6 +2078,10 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze
  !$OMP END DO

  !$OMP END PARALLEL
+  !print *," ----- "
+  !do i=1,sze
+  !  print *,"i=",i," psi_out(i)=",psi_out(1,i)
+  !end do
  call omp_set_max_active_levels(4)

  deallocate(diag_energies)
--- a/src/dav_general_mat/test_dav.irp.f.example
+++ b/src/dav_general_mat/test_dav.irp.f.example
--- a/src/davidson/diagonalization_hcfg.irp.f
+++ b/src/davidson/diagonalization_hcfg.irp.f
@ -112,6 +112,8 @@ subroutine davidson_diag_cfg_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,sze_csf,N
  double precision, allocatable  :: U(:,:), U_csf(:,:), overlap(:,:)
  double precision, allocatable  :: tmpU(:,:), tmpW(:,:)
  double precision, pointer      :: W(:,:), W_csf(:,:)
+  !double precision, pointer      :: W2(:,:), W_csf2(:,:)
+  !double precision, allocatable  :: U2(:,:), U_csf2(:,:)
  logical                        :: disk_based
  double precision               :: energy_shift(N_st_diag_in*davidson_sze_max)

@ -234,12 +236,15 @@ subroutine davidson_diag_cfg_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,sze_csf,N
    call c_f_pointer(ptr_w, W_csf, (/sze_csf,N_st_diag*itermax/))
  else
    allocate(W(sze,N_st_diag),W_csf(sze_csf,N_st_diag*itermax))
+    !allocate(W2(sze,N_st_diag),W_csf2(sze_csf,N_st_diag*itermax))
  endif

  allocate(                                                          &
      ! Large
      U(sze,N_st_diag),                                              &
+      !U2(sze,N_st_diag),                                              &
      U_csf(sze_csf,N_st_diag*itermax),                              &
+      !U_csf2(sze_csf,N_st_diag*itermax),                              &

      ! Small
      h(N_st_diag*itermax,N_st_diag*itermax),                        &
@ -325,7 +330,7 @@ subroutine davidson_diag_cfg_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,sze_csf,N
              enddo
            enddo
            !tmpU     =0.0d0
-            !tmpU(1,2)=1.0d0
+            !tmpU(1,1)=1.0d0
            double precision               :: irp_rdtsc
            double precision               :: ticks_0, ticks_1
            integer*8                      :: irp_imax
@ -348,9 +353,9 @@ subroutine davidson_diag_cfg_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,sze_csf,N
            !call convertWFfromDETtoCSF(N_st_diag,u_in(1,1),W_csf2(1,1))
            !do i=1,sze_csf
            !  print *,"I=",i," qp=",W_csf2(i,1)," my=",W_csf(i,1)," diff=",dabs(W_csf2(i,1))-dabs(W_csf(i,1))
-            !  if(dabs(dabs(W_csf2(i,1))-dabs(W_csf(i,1))) .gt. 1.0e-10)then
-            !    print *,"somo=",psi_configuration(1,1,i)," domo=",psi_configuration(1,2,i)," diff=",dabs(W_csf2(i,1))-dabs(W_csf(i,1))
-            !  endif
+            !  !if(dabs(dabs(W_csf2(i,1))-dabs(W_csf(i,1))) .gt. 1.0e-10)then
+            !  !  print *,"somo=",psi_configuration(1,1,i)," domo=",psi_configuration(1,2,i)," diff=",dabs(W_csf2(i,1))-dabs(W_csf(i,1))
+            !  !endif
            !end do
            !stop
            deallocate(tmpW)
--- a/src/determinants/determinants.irp.f
+++ b/src/determinants/determinants.irp.f
@ -329,6 +329,7 @@ END_PROVIDER

 BEGIN_PROVIDER [ integer(bit_kind), psi_det_sorted_bit, (N_int,2,psi_det_size) ]
 &BEGIN_PROVIDER [ double precision, psi_coef_sorted_bit, (psi_det_size,N_states) ]
+&BEGIN_PROVIDER [ integer, psi_det_sorted_bit_order, (psi_det_size) ]
   implicit none
   BEGIN_DOC
   ! Determinants on which we apply $\langle i|H|psi \rangle$ for perturbation.
@ -337,8 +338,8 @@ END_PROVIDER
   ! function.
   END_DOC

-   call sort_dets_by_det_search_key(N_det, psi_det, psi_coef, size(psi_coef,1),       &
-       psi_det_sorted_bit, psi_coef_sorted_bit, N_states)
+   call sort_dets_by_det_search_key_ordered(N_det, psi_det, psi_coef, size(psi_coef,1),       &
+       psi_det_sorted_bit, psi_coef_sorted_bit, N_states, psi_det_sorted_bit_order)

 END_PROVIDER

@ -1005,3 +1006,48 @@ BEGIN_PROVIDER [ double precision, psi_det_Hii, (N_det) ]
 END_PROVIDER


+subroutine sort_dets_by_det_search_key_ordered(Ndet, det_in, coef_in, sze, det_out, coef_out, N_st, iorder)
+   use bitmasks
+   implicit none
+   integer, intent(in)            :: Ndet, N_st, sze
+   integer(bit_kind), intent(in)  :: det_in  (N_int,2,sze)
+   double precision , intent(in)  :: coef_in(sze,N_st)
+   integer(bit_kind), intent(out) :: det_out (N_int,2,sze)
+   double precision , intent(out) :: coef_out(sze,N_st)
+   integer, intent(out)           :: iorder(sze)
+   BEGIN_DOC
+   ! Determinants are sorted according to their :c:func:`det_search_key`.
+   ! Useful to accelerate the search of a random determinant in the wave
+   ! function.
+   !
+   ! /!\ The first dimension of coef_out and coef_in need to be psi_det_size
+   !
+   END_DOC
+   integer                        :: i,j,k
+   integer*8, allocatable         :: bit_tmp(:)
+   integer*8, external            :: det_search_key
+
+   allocate ( bit_tmp(Ndet) )
+
+   do i=1,Ndet
+     iorder(i) = i
+     !$DIR FORCEINLINE
+     bit_tmp(i) = det_search_key(det_in(1,1,i),N_int)
+   enddo
+   call i8sort(bit_tmp,iorder,Ndet)
+   !DIR$ IVDEP
+   do i=1,Ndet
+     do j=1,N_int
+       det_out(j,1,i) = det_in(j,1,iorder(i))
+       det_out(j,2,i) = det_in(j,2,iorder(i))
+     enddo
+     do k=1,N_st
+       coef_out(i,k) = coef_in(iorder(i),k)
+     enddo
+   enddo
+
+   deallocate(bit_tmp)
+
+end
+
+
--- a/src/determinants/slater_rules.irp.f
+++ b/src/determinants/slater_rules.irp.f
@ -83,7 +83,7 @@ subroutine get_excitation(det1,det2,exc,degree,phase,Nint)
  !               exc(1,1,1) = q
  !               exc(1,2,1) = p

-  ! T^alpha_pq  : exc(0,1,2) = 1
+  ! T^beta_pq   : exc(0,1,2) = 1
  !               exc(0,2,2) = 1
  !               exc(1,1,2) = q
  !               exc(1,2,2) = p
@ -434,6 +434,98 @@ subroutine get_single_excitation(det1,det2,exc,phase,Nint)

 end

+subroutine get_single_excitation_cfg(cfg1,cfg2,p,q,Nint)
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Returns the excitation operator between two singly excited configurations.
+  END_DOC
+  integer, intent(in)            :: Nint
+  integer(bit_kind), intent(in)  :: cfg1(Nint,2)
+  integer(bit_kind), intent(in)  :: cfg2(Nint,2)
+  integer, intent(out)           :: p, q
+  integer                        :: tz
+  integer                        :: l, ispin, idx_hole, idx_particle, ishift
+  integer                        :: nperm
+  integer                        :: i,j,k,m,n
+  integer                        :: high, low
+  integer                        :: a,b,c,d
+  integer(bit_kind)              :: hole, particle, tmp
+  integer                        :: exc(0:2,2,2)
+
+  ASSERT (Nint > 0)
+  nperm = 0
+  p = 0
+  q = 0
+  exc(0,1,1) = 0
+  exc(0,2,1) = 0
+  exc(0,1,2) = 0
+  exc(0,2,2) = 0
+  do ispin = 1,2
+    ishift = 1-bit_kind_size
+    do l=1,Nint
+      ishift = ishift + bit_kind_size
+      if (cfg1(l,ispin) == cfg2(l,ispin)) then
+        cycle
+      endif
+      tmp = xor( cfg1(l,ispin), cfg2(l,ispin) )
+      particle = iand(tmp, cfg2(l,ispin))
+      hole     = iand(tmp, cfg1(l,ispin))
+      if (particle /= 0_bit_kind) then
+        tz = trailz(particle)
+        exc(0,2,ispin) = 1
+        exc(1,2,ispin) = tz+ishift
+        !print *,"part ",tz+ishift, " ispin=",ispin
+      endif
+      if (hole /= 0_bit_kind) then
+        tz = trailz(hole)
+        exc(0,1,ispin) = 1
+        exc(1,1,ispin) = tz+ishift
+        !print *,"hole ",tz+ishift, " ispin=",ispin
+      endif
+
+      if ( iand(exc(0,1,ispin),exc(0,2,ispin)) /= 1) then  ! exc(0,1,ispin)/=1 and exc(0,2,ispin) /= 1
+        cycle
+      endif
+
+      high = max(exc(1,1,ispin), exc(1,2,ispin))-1
+      low  = min(exc(1,1,ispin), exc(1,2,ispin))
+
+      ASSERT (low >= 0)
+      ASSERT (high > 0)
+
+      k = shiftr(high,bit_kind_shift)+1
+      j = shiftr(low,bit_kind_shift)+1
+      m = iand(high,bit_kind_size-1)
+      n = iand(low,bit_kind_size-1)
+
+      if (j==k) then
+        nperm = nperm + popcnt(iand(cfg1(j,ispin),           &
+            iand( shiftl(1_bit_kind,m)-1_bit_kind,            &
+                  not(shiftl(1_bit_kind,n))+1_bit_kind)) )
+      else
+        nperm = nperm + popcnt(                                    &
+             iand(cfg1(j,ispin),                                   &
+                  iand(not(0_bit_kind),                            &
+                       (not(shiftl(1_bit_kind,n)) + 1_bit_kind) ))) &
+             + popcnt(iand(cfg1(k,ispin),                          &
+                           (shiftl(1_bit_kind,m) - 1_bit_kind ) ))
+
+        do i=j+1,k-1
+          nperm = nperm + popcnt(cfg1(i,ispin))
+        end do
+
+      endif
+
+      ! Set p and q
+      q = max(exc(1,1,1),exc(1,1,2))
+      p = max(exc(1,2,1),exc(1,2,2))
+      return
+
+    enddo
+  enddo
+end
+
 subroutine bitstring_to_list_ab( string, list, n_elements, Nint)
  use bitmasks
  implicit none
--- a/src/hartree_fock/scf.irp.f
+++ b/src/hartree_fock/scf.irp.f
@ -83,16 +83,12 @@ subroutine run()

  PROVIDE scf_algorithm

-  if(scf_algorithm .eq. "DIIS_MO") then
-    call Roothaan_Hall_SCF_MO()
-  elseif(scf_algorithm .eq. "DIIS_MODIF") then
-    call Roothaan_Hall_SCF_MODIF()
-  elseif(scf_algorithm .eq. "DIIS") then
+  if(scf_algorithm .eq. "DIIS") then
    call Roothaan_Hall_SCF()
  elseif(scf_algorithm .eq. "Simple") then
    call Roothaan_Hall_SCF_Simple()
  else
-    print *, ' not implemented yet:', scf_algorithm
+    print *, scf_algorithm, ' not implemented yet'
  endif

  call ezfio_set_hartree_fock_energy(SCF_energy)
--- a/src/mo_two_e_ints/mo_bi_integrals.irp.f
+++ b/src/mo_two_e_ints/mo_bi_integrals.irp.f
@ -245,18 +245,16 @@ subroutine add_integrals_to_map(mask_ijkl)
    return
  endif

-  double precision               :: accu_bis
-  accu_bis = 0.d0
  call wall_time(wall_1)

-  size_buffer = min( (qp_max_mem/(nproc*5)),mo_num*mo_num*mo_num)
+  size_buffer = min(mo_num*mo_num*mo_num,8000000)
  print*, 'Buffers : ', 8.*(mo_num*(n_j)*(n_k+1) + mo_num+&
      ao_num+ao_num*ao_num+ size_buffer*3)/(1024*1024), 'MB / core'

  !$OMP PARALLEL PRIVATE(l1,k1,j1,i1,i2,i3,i4,i,j,k,l,c, ii1,kmax,   &
      !$OMP  two_e_tmp_0_idx, two_e_tmp_0, two_e_tmp_1,two_e_tmp_2,two_e_tmp_3,&
      !$OMP  buffer_i,buffer_value,n_integrals,wall_2,i0,j0,k0,l0,   &
-      !$OMP  wall_0,thread_num,accu_bis)                             &
+      !$OMP  wall_0,thread_num)                             &
      !$OMP  DEFAULT(NONE)                                           &
      !$OMP  SHARED(size_buffer,ao_num,mo_num,n_i,n_j,n_k,n_l,   &
      !$OMP  mo_coef_transp,                                         &
@ -434,10 +432,10 @@ subroutine add_integrals_to_map(mask_ijkl)
  !$OMP END DO NOWAIT
  deallocate (two_e_tmp_1,two_e_tmp_2,two_e_tmp_3)

-  integer                        :: index_needed
-
+  if (n_integrals > 0) then
    call insert_into_mo_integrals_map(n_integrals,buffer_i,buffer_value,&
        real(mo_integrals_threshold,integral_kind))
+  endif
  deallocate(buffer_i, buffer_value)
  !$OMP END PARALLEL
  call map_merge(mo_integrals_map)
@ -527,12 +525,10 @@ subroutine add_integrals_to_map_three_indices(mask_ijk)

  call wall_time(wall_1)
  call cpu_time(cpu_1)
-  double precision               :: accu_bis
-  accu_bis = 0.d0
  !$OMP PARALLEL PRIVATE(m,l1,k1,j1,i1,i2,i3,i4,i,j,k,l,c, ii1,kmax, &
      !$OMP  two_e_tmp_0_idx, two_e_tmp_0, two_e_tmp_1,two_e_tmp_2,two_e_tmp_3,&
      !$OMP  buffer_i,buffer_value,n_integrals,wall_2,i0,j0,k0,l0,   &
-      !$OMP  wall_0,thread_num,accu_bis)                             &
+      !$OMP  wall_0,thread_num)                             &
      !$OMP  DEFAULT(NONE)                                           &
      !$OMP  SHARED(size_buffer,ao_num,mo_num,n_i,n_j,n_k,       &
      !$OMP  mo_coef_transp,                                         &
@ -730,8 +726,6 @@ subroutine add_integrals_to_map_three_indices(mask_ijk)
  !$OMP END DO NOWAIT
  deallocate (two_e_tmp_1,two_e_tmp_2,two_e_tmp_3)

-  integer                        :: index_needed
-
  call insert_into_mo_integrals_map(n_integrals,buffer_i,buffer_value,&
      real(mo_integrals_threshold,integral_kind))
  deallocate(buffer_i, buffer_value)
--- a/src/non_h_ints_mu/debug_integ_jmu_modif.irp.f
+++ b/src/non_h_ints_mu/debug_integ_jmu_modif.irp.f
@ -306,7 +306,7 @@ subroutine test_int2_grad1_u12_ao()

        call num_int2_grad1_u12_ao(i, j, ipoint, integ)

-        i_exc  = int2_grad1_u12_ao(1,i,j,ipoint) 
+        i_exc  = int2_grad1_u12_ao(i,j,ipoint,1) 
        i_num  = integ(1)
        acc_ij = dabs(i_exc - i_num)
        if(acc_ij .gt. eps_ij) then
@ -318,7 +318,7 @@ subroutine test_int2_grad1_u12_ao()
        acc_tot += acc_ij
        normalz += dabs(i_num)

-        i_exc  = int2_grad1_u12_ao(2,i,j,ipoint) 
+        i_exc  = int2_grad1_u12_ao(i,j,ipoint,2) 
        i_num  = integ(2)
        acc_ij = dabs(i_exc - i_num)
        if(acc_ij .gt. eps_ij) then
@ -330,7 +330,7 @@ subroutine test_int2_grad1_u12_ao()
        acc_tot += acc_ij
        normalz += dabs(i_num)

-        i_exc  = int2_grad1_u12_ao(3,i,j,ipoint) 
+        i_exc  = int2_grad1_u12_ao(i,j,ipoint,3) 
        i_num  = integ(3)
        acc_ij = dabs(i_exc - i_num)
        if(acc_ij .gt. eps_ij) then
@ -382,7 +382,7 @@ subroutine test_int2_u_grad1u_total_j1b2()

        call num_int2_u_grad1u_total_j1b2(i, j, ipoint, integ)

-        i_exc  = x * int2_u_grad1u_j1b2(i,j,ipoint) - int2_u_grad1u_x_j1b2(1,i,j,ipoint) 
+        i_exc  = x * int2_u_grad1u_j1b2(i,j,ipoint) - int2_u_grad1u_x_j1b2(i,j,ipoint,1)
        i_num  = integ(1)
        acc_ij = dabs(i_exc - i_num)
        if(acc_ij .gt. eps_ij) then
@ -394,7 +394,7 @@ subroutine test_int2_u_grad1u_total_j1b2()
        acc_tot += acc_ij
        normalz += dabs(i_num)

-        i_exc  = y * int2_u_grad1u_j1b2(i,j,ipoint) - int2_u_grad1u_x_j1b2(2,i,j,ipoint) 
+        i_exc  = y * int2_u_grad1u_j1b2(i,j,ipoint) - int2_u_grad1u_x_j1b2(i,j,ipoint,2) 
        i_num  = integ(2)
        acc_ij = dabs(i_exc - i_num)
        if(acc_ij .gt. eps_ij) then
@ -406,7 +406,7 @@ subroutine test_int2_u_grad1u_total_j1b2()
        acc_tot += acc_ij
        normalz += dabs(i_num)

-        i_exc  = z * int2_u_grad1u_j1b2(i,j,ipoint) - int2_u_grad1u_x_j1b2(3,i,j,ipoint) 
+        i_exc  = z * int2_u_grad1u_j1b2(i,j,ipoint) - int2_u_grad1u_x_j1b2(i,j,ipoint,3) 
        i_num  = integ(3)
        acc_ij = dabs(i_exc - i_num)
        if(acc_ij .gt. eps_ij) then
--- a/src/non_h_ints_mu/grad_squared.irp.f
+++ b/src/non_h_ints_mu/grad_squared.irp.f
@ -70,9 +70,9 @@ BEGIN_PROVIDER [ double precision, gradu_squared_u_ij_mu, (ao_num, ao_num, n_poi

          gradu_squared_u_ij_mu(i,j,ipoint) = tmp1 * int2_grad1u2_grad2u2_j1b2(i,j,ipoint)            &
                                            + tmp2 * int2_u2_j1b2             (i,j,ipoint)            &
-                                            + tmp6 * tmp9 + tmp3 * int2_u_grad1u_x_j1b2(1,i,j,ipoint) &
-                                            + tmp7 * tmp9 + tmp4 * int2_u_grad1u_x_j1b2(2,i,j,ipoint) &
-                                            + tmp8 * tmp9 + tmp5 * int2_u_grad1u_x_j1b2(3,i,j,ipoint)
+                                            + tmp6 * tmp9 + tmp3 * int2_u_grad1u_x_j1b2(i,j,ipoint,1) &
+                                            + tmp7 * tmp9 + tmp4 * int2_u_grad1u_x_j1b2(i,j,ipoint,2) &
+                                            + tmp8 * tmp9 + tmp5 * int2_u_grad1u_x_j1b2(i,j,ipoint,3)
        enddo
      enddo
    enddo
@ -104,11 +104,11 @@ END_PROVIDER

 ! ---

-!BEGIN_PROVIDER [double precision, tc_grad_square_ao, (ao_num, ao_num, ao_num, ao_num)]
+!BEGIN_PROVIDER [double precision, tc_grad_square_ao_loop, (ao_num, ao_num, ao_num, ao_num)]
 !
 !  BEGIN_DOC
 !  !
-!  ! tc_grad_square_ao(k,i,l,j) = -1/2 <kl | |\grad_1 u(r1,r2)|^2 + |\grad_1 u(r1,r2)|^2 | ij>
+!  ! tc_grad_square_ao_loop(k,i,l,j) = -1/2 <kl | |\grad_1 u(r1,r2)|^2 + |\grad_1 u(r1,r2)|^2 | ij>
 !  !
 !  END_DOC
 !
@ -142,8 +142,8 @@ END_PROVIDER
 !    do l = 1, ao_num
 !      do i = 1, ao_num
 !        do k = 1, ao_num
-!          tc_grad_square_ao(k,i,l,j) = ac_mat(k,i,l,j) + ac_mat(l,j,k,i)
-!          !write(11,*) tc_grad_square_ao(k,i,l,j)
+!          tc_grad_square_ao_loop(k,i,l,j) = ac_mat(k,i,l,j) + ac_mat(l,j,k,i)
+!          !write(11,*) tc_grad_square_ao_loop(k,i,l,j)
 !        enddo
 !      enddo
 !    enddo
@ -155,19 +155,23 @@ END_PROVIDER

 ! ---

-BEGIN_PROVIDER [double precision, tc_grad_square_ao, (ao_num, ao_num, ao_num, ao_num)]
+BEGIN_PROVIDER [double precision, tc_grad_square_ao_loop, (ao_num, ao_num, ao_num, ao_num)]

  BEGIN_DOC
  !
-  ! tc_grad_square_ao(k,i,l,j) = -1/2 <kl | |\grad_1 u(r1,r2)|^2 + |\grad_1 u(r1,r2)|^2 | ij>
+  ! tc_grad_square_ao_loop(k,i,l,j) = 1/2 <kl | |\grad_1 u(r1,r2)|^2 + |\grad_2 u(r1,r2)|^2 | ij>
  !
  END_DOC

  implicit none
  integer                       :: ipoint, i, j, k, l
  double precision              :: weight1, ao_ik_r, ao_i_r
+  double precision              :: time0, time1
  double precision, allocatable :: ac_mat(:,:,:,:), bc_mat(:,:,:,:)

+  print*, ' providing tc_grad_square_ao_loop ...'
+  call wall_time(time0)
+
  allocate(ac_mat(ao_num,ao_num,ao_num,ao_num))
  ac_mat = 0.d0
  allocate(bc_mat(ao_num,ao_num,ao_num,ao_num))
@ -177,10 +181,12 @@ BEGIN_PROVIDER [double precision, tc_grad_square_ao, (ao_num, ao_num, ao_num, ao
    weight1 = final_weight_at_r_vector(ipoint)

    do i = 1, ao_num
-      ao_i_r = weight1 * aos_in_r_array_transp(ipoint,i)
+      !ao_i_r = weight1 * aos_in_r_array_transp(ipoint,i)
+      ao_i_r = weight1 * aos_in_r_array(i,ipoint)

      do k = 1, ao_num
-        ao_ik_r = ao_i_r * aos_in_r_array_transp(ipoint,k)
+        !ao_ik_r = ao_i_r * aos_in_r_array_transp(ipoint,k)
+        ao_ik_r = ao_i_r * aos_in_r_array(k,ipoint)

        do j = 1, ao_num
          do l = 1, ao_num
@ -196,7 +202,7 @@ BEGIN_PROVIDER [double precision, tc_grad_square_ao, (ao_num, ao_num, ao_num, ao
    do l = 1, ao_num
      do i = 1, ao_num
        do k = 1, ao_num
-          tc_grad_square_ao(k,i,l,j) = ac_mat(k,i,l,j) + ac_mat(l,j,k,i) + bc_mat(k,i,l,j)
+          tc_grad_square_ao_loop(k,i,l,j) = ac_mat(k,i,l,j) + ac_mat(l,j,k,i) + bc_mat(k,i,l,j)
        enddo
      enddo
    enddo
@ -205,6 +211,9 @@ BEGIN_PROVIDER [double precision, tc_grad_square_ao, (ao_num, ao_num, ao_num, ao
  deallocate(ac_mat)
  deallocate(bc_mat)

+  call wall_time(time1)
+  print*, ' Wall time for tc_grad_square_ao_loop = ', time1 - time0
+
 END_PROVIDER 

 ! ---
@ -329,9 +338,9 @@ BEGIN_PROVIDER [ double precision, u12_grad1_u12_j1b_grad1_j1b, (ao_num, ao_num,

        tmp9 = int2_u_grad1u_j1b2(i,j,ipoint)

-        u12_grad1_u12_j1b_grad1_j1b(i,j,ipoint) = tmp6 * tmp9 + tmp3 * int2_u_grad1u_x_j1b2(1,i,j,ipoint) &
-                                                + tmp7 * tmp9 + tmp4 * int2_u_grad1u_x_j1b2(2,i,j,ipoint) &
-                                                + tmp8 * tmp9 + tmp5 * int2_u_grad1u_x_j1b2(3,i,j,ipoint)
+        u12_grad1_u12_j1b_grad1_j1b(i,j,ipoint) = tmp6 * tmp9 + tmp3 * int2_u_grad1u_x_j1b2(i,j,ipoint,1) &
+                                                + tmp7 * tmp9 + tmp4 * int2_u_grad1u_x_j1b2(i,j,ipoint,2) &
+                                                + tmp8 * tmp9 + tmp5 * int2_u_grad1u_x_j1b2(i,j,ipoint,3)
      enddo
    enddo
  enddo
@ -343,3 +352,86 @@ END_PROVIDER

 ! ---

+BEGIN_PROVIDER [double precision, tc_grad_square_ao, (ao_num, ao_num, ao_num, ao_num)]
+
+  BEGIN_DOC
+  !
+  ! tc_grad_square_ao(k,i,l,j) = 1/2 <kl | |\grad_1 u(r1,r2)|^2 + |\grad_2 u(r1,r2)|^2 | ij>
+  !
+  END_DOC
+
+  implicit none
+  integer                       :: ipoint, i, j, k, l
+  double precision              :: weight1, ao_ik_r, ao_i_r
+  double precision              :: time0, time1
+  double precision, allocatable :: ac_mat(:,:,:,:), b_mat(:,:,:), tmp(:,:,:)
+
+  print*, ' providing tc_grad_square_ao ...'
+  call wall_time(time0)
+
+  allocate(ac_mat(ao_num,ao_num,ao_num,ao_num), b_mat(n_points_final_grid,ao_num,ao_num), tmp(ao_num,ao_num,n_points_final_grid))
+
+  b_mat = 0.d0
+ !$OMP PARALLEL               &
+ !$OMP DEFAULT (NONE)         &
+ !$OMP PRIVATE (i, k, ipoint) & 
+ !$OMP SHARED (aos_in_r_array_transp, b_mat, ao_num, n_points_final_grid, final_weight_at_r_vector)
+ !$OMP DO SCHEDULE (static)
+  do i = 1, ao_num
+    do k = 1, ao_num
+      do ipoint = 1, n_points_final_grid
+        b_mat(ipoint,k,i) = final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,k)
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  tmp = 0.d0
+ !$OMP PARALLEL               &
+ !$OMP DEFAULT (NONE)         &
+ !$OMP PRIVATE (j, l, ipoint) & 
+ !$OMP SHARED (tmp, ao_num, n_points_final_grid, u12sq_j1bsq, u12_grad1_u12_j1b_grad1_j1b, grad12_j12)
+ !$OMP DO SCHEDULE (static)
+  do ipoint = 1, n_points_final_grid
+    do j = 1, ao_num
+      do l = 1, ao_num
+        tmp(l,j,ipoint) = u12sq_j1bsq(l,j,ipoint) + u12_grad1_u12_j1b_grad1_j1b(l,j,ipoint) + 0.5d0 * grad12_j12(l,j,ipoint)
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+
+  ac_mat = 0.d0
+  call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 &
+            , tmp(1,1,1), ao_num*ao_num, b_mat(1,1,1), n_points_final_grid      &
+            , 1.d0, ac_mat, ao_num*ao_num)
+  deallocate(tmp, b_mat)
+
+ !$OMP PARALLEL             &
+ !$OMP DEFAULT (NONE)       &
+ !$OMP PRIVATE (i, j, k, l) & 
+ !$OMP SHARED (ac_mat, tc_grad_square_ao, ao_num)
+ !$OMP DO SCHEDULE (static)
+  do j = 1, ao_num
+    do l = 1, ao_num
+      do i = 1, ao_num
+        do k = 1, ao_num
+          tc_grad_square_ao(k,i,l,j) = ac_mat(k,i,l,j) + ac_mat(l,j,k,i) 
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  deallocate(ac_mat)
+
+  call wall_time(time1)
+  print*, ' Wall time for tc_grad_square_ao = ', time1 - time0
+
+END_PROVIDER 
+
+! ---
--- a/src/non_h_ints_mu/grad_squared_manu.irp.f
+++ b/src/non_h_ints_mu/grad_squared_manu.irp.f
@ -10,51 +10,75 @@ BEGIN_PROVIDER [double precision, tc_grad_square_ao_test, (ao_num, ao_num, ao_nu
  implicit none
  integer                       :: ipoint, i, j, k, l
  double precision              :: weight1, ao_ik_r, ao_i_r,contrib,contrib2
-  double precision, allocatable :: ac_mat(:,:,:,:), bc_mat(:,:,:,:)
-  double precision :: wall1, wall0
+  double precision              :: time0, time1
+  double precision, allocatable :: ac_mat(:,:,:,:), b_mat(:,:,:), tmp(:,:,:)
+
+  print*, ' providing tc_grad_square_ao_test ...'
+  call wall_time(time0)
+
  provide u12sq_j1bsq_test u12_grad1_u12_j1b_grad1_j1b_test grad12_j12_test
-  call wall_time(wall0)

-  allocate(ac_mat(ao_num,ao_num,ao_num,ao_num))
-  ac_mat = 0.d0
-  allocate(bc_mat(ao_num,ao_num,ao_num,ao_num))
-  bc_mat = 0.d0
+  allocate(ac_mat(ao_num,ao_num,ao_num,ao_num), b_mat(n_points_final_grid,ao_num,ao_num), tmp(ao_num,ao_num,n_points_final_grid))

+  b_mat = 0.d0
+ !$OMP PARALLEL               &
+ !$OMP DEFAULT (NONE)         &
+ !$OMP PRIVATE (i, k, ipoint) & 
+ !$OMP SHARED (aos_in_r_array_transp, b_mat, ao_num, n_points_final_grid, final_weight_at_r_vector)
+ !$OMP DO SCHEDULE (static)
+  do i = 1, ao_num
+    do k = 1, ao_num
      do ipoint = 1, n_points_final_grid
-    weight1 = final_weight_at_r_vector(ipoint)
+        b_mat(ipoint,k,i) = final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,k)
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL

+  tmp = 0.d0
+ !$OMP PARALLEL               &
+ !$OMP DEFAULT (NONE)         &
+ !$OMP PRIVATE (j, l, ipoint) & 
+ !$OMP SHARED (tmp, ao_num, n_points_final_grid, u12sq_j1bsq_test, u12_grad1_u12_j1b_grad1_j1b_test, grad12_j12_test)
+ !$OMP DO SCHEDULE (static)
+  do ipoint = 1, n_points_final_grid
    do j = 1, ao_num
      do l = 1, ao_num
-        contrib =  u12sq_j1bsq_test(l,j,ipoint) + u12_grad1_u12_j1b_grad1_j1b_test(l,j,ipoint) 
-        contrib2=grad12_j12_test(l,j,ipoint)
-        do i = 1, ao_num
-          ao_i_r = weight1 * aos_in_r_array(i,ipoint)
+        tmp(l,j,ipoint) = u12sq_j1bsq_test(l,j,ipoint) + u12_grad1_u12_j1b_grad1_j1b_test(l,j,ipoint) + 0.5d0 * grad12_j12_test(l,j,ipoint)
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL

-          do k = 1, ao_num
-            ao_ik_r = ao_i_r * aos_in_r_array(k,ipoint)
-
-            ac_mat(k,i,l,j) += ao_ik_r * contrib
-            bc_mat(k,i,l,j) += ao_ik_r * contrib2
-          enddo
-        enddo
-      enddo
-    enddo
-  enddo
+  ac_mat = 0.d0
+  call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 &
+            , tmp(1,1,1), ao_num*ao_num, b_mat(1,1,1), n_points_final_grid      &
+            , 1.d0, ac_mat, ao_num*ao_num)
+  deallocate(tmp, b_mat)

+ !$OMP PARALLEL             &
+ !$OMP DEFAULT (NONE)       &
+ !$OMP PRIVATE (i, j, k, l) & 
+ !$OMP SHARED (ac_mat, tc_grad_square_ao_test, ao_num)
+ !$OMP DO SCHEDULE (static)
  do j = 1, ao_num
    do l = 1, ao_num
      do i = 1, ao_num
        do k = 1, ao_num
-          tc_grad_square_ao_test(k,i,l,j) = ac_mat(k,i,l,j) + ac_mat(l,j,k,i) + bc_mat(k,i,l,j)
+          tc_grad_square_ao_test(k,i,l,j) = ac_mat(k,i,l,j) + ac_mat(l,j,k,i)
        enddo
      enddo
    enddo
  enddo
-  call wall_time(wall1)
-  print*,'wall time for tc_grad_square_ao_test',wall1 - wall0
+ !$OMP END DO
+ !$OMP END PARALLEL

  deallocate(ac_mat)
-  deallocate(bc_mat)
+
+  call wall_time(time1)
+  print*, ' Wall time for tc_grad_square_ao_test = ', time1 - time0

 END_PROVIDER 

@ -88,6 +112,7 @@ BEGIN_PROVIDER [ double precision, u12sq_j1bsq_test, (ao_num, ao_num, n_points_f

 END_PROVIDER 

+! ---

 BEGIN_PROVIDER [ double precision, u12_grad1_u12_j1b_grad1_j1b_test, (ao_num, ao_num, n_points_final_grid) ]
 
@ -99,8 +124,9 @@ BEGIN_PROVIDER [ double precision, u12_grad1_u12_j1b_grad1_j1b_test, (ao_num, ao
  double precision           :: time0, time1
  double precision, external :: overlap_gauss_r12_ao

-  provide int2_u_grad1u_x_j1b2_test
  print*, ' providing u12_grad1_u12_j1b_grad1_j1b_test ...'
+
+  provide int2_u_grad1u_x_j1b2_test
  call wall_time(time0)

  do ipoint = 1, n_points_final_grid
@ -126,9 +152,9 @@ BEGIN_PROVIDER [ double precision, u12_grad1_u12_j1b_grad1_j1b_test, (ao_num, ao

        tmp9 = int2_u_grad1u_j1b2_test(i,j,ipoint)

-        u12_grad1_u12_j1b_grad1_j1b_test(i,j,ipoint) = tmp6 * tmp9 + tmp3 * int2_u_grad1u_x_j1b2_test(1,i,j,ipoint) &
-                                                     + tmp7 * tmp9 + tmp4 * int2_u_grad1u_x_j1b2_test(2,i,j,ipoint) &
-                                                     + tmp8 * tmp9 + tmp5 * int2_u_grad1u_x_j1b2_test(3,i,j,ipoint)
+        u12_grad1_u12_j1b_grad1_j1b_test(i,j,ipoint) = tmp6 * tmp9 + tmp3 * int2_u_grad1u_x_j1b2_test(i,j,ipoint,1) &
+                                                     + tmp7 * tmp9 + tmp4 * int2_u_grad1u_x_j1b2_test(i,j,ipoint,2) &
+                                                     + tmp8 * tmp9 + tmp5 * int2_u_grad1u_x_j1b2_test(i,j,ipoint,3)
      enddo
    enddo
  enddo
@ -192,3 +218,4 @@ BEGIN_PROVIDER [ double precision, grad12_j12_test, (ao_num, ao_num, n_points_fi
 END_PROVIDER 

 ! ---
+
--- a/src/non_h_ints_mu/new_grad_tc.irp.f
+++ b/src/non_h_ints_mu/new_grad_tc.irp.f
@ -1,22 +1,21 @@
-  
 ! ---

-BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao, (3, ao_num, ao_num, n_points_final_grid)]
+BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao, (ao_num, ao_num, n_points_final_grid, 3)]

  BEGIN_DOC
  !
-  ! int2_grad1_u12_ao(:,i,j,ipoint) = \int dr2 [-1 * \grad_r1 J(r1,r2)] \phi_i(r2) \phi_j(r2) 
+  ! int2_grad1_u12_ao(i,j,ipoint,:) = \int dr2 [-1 * \grad_r1 J(r1,r2)] \phi_i(r2) \phi_j(r2) 
  !
  ! where r1 = r(ipoint)
  !
  ! if J(r1,r2) = u12:
  !
-  ! int2_grad1_u12_ao(:,i,j,ipoint) = 0.5 x \int dr2 [(r1 - r2) (erf(mu * r12)-1)r_12] \phi_i(r2) \phi_j(r2)
+  ! int2_grad1_u12_ao(i,j,ipoint,:) = 0.5 x \int dr2 [(r1 - r2) (erf(mu * r12)-1)r_12] \phi_i(r2) \phi_j(r2)
  !                                 = 0.5 * [ v_ij_erf_rk_cst_mu(i,j,ipoint) * r(:) - x_v_ij_erf_rk_cst_mu(i,j,ipoint,:) ]
  !
  ! if J(r1,r2) = u12 x v1 x v2
  !
-  ! int2_grad1_u12_ao(:,i,j,ipoint) =      v1    x [ 0.5 x \int dr2 [(r1 - r2) (erf(mu * r12)-1)r_12] v2 \phi_i(r2) \phi_j(r2) ]
+  ! int2_grad1_u12_ao(i,j,ipoint,:) =      v1    x [ 0.5 x \int dr2 [(r1 - r2) (erf(mu * r12)-1)r_12] v2 \phi_i(r2) \phi_j(r2) ]
  !                                 - \grad_1 v1 x [       \int dr2                  u12              v2 \phi_i(r2) \phi_j(r2) ] 
  !                                 =    0.5 v_1b(ipoint) * v_ij_erf_rk_cst_mu_j1b(i,j,ipoint) * r(:) 
  !                                 -    0.5 v_1b(ipoint) * x_v_ij_erf_rk_cst_mu_j1b(i,j,ipoint,:) 
@ -27,8 +26,12 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao, (3, ao_num, ao_num, n_poin

  implicit none
  integer          :: ipoint, i, j
+  double precision :: time0, time1
  double precision :: x, y, z, tmp_x, tmp_y, tmp_z, tmp0, tmp1, tmp2

+  print*, ' providing int2_grad1_u12_ao ...'
+  call wall_time(time0)
+
  PROVIDE j1b_type
  
  if(j1b_type .eq. 3) then
@ -49,9 +52,9 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao, (3, ao_num, ao_num, n_poin
          tmp1 = tmp0 * v_ij_erf_rk_cst_mu_j1b(i,j,ipoint)
          tmp2 = v_ij_u_cst_mu_j1b(i,j,ipoint)

-          int2_grad1_u12_ao(1,i,j,ipoint) = tmp1 * x - tmp0 * x_v_ij_erf_rk_cst_mu_tmp_j1b(1,i,j,ipoint) - tmp2 * tmp_x
-          int2_grad1_u12_ao(2,i,j,ipoint) = tmp1 * y - tmp0 * x_v_ij_erf_rk_cst_mu_tmp_j1b(2,i,j,ipoint) - tmp2 * tmp_y
-          int2_grad1_u12_ao(3,i,j,ipoint) = tmp1 * z - tmp0 * x_v_ij_erf_rk_cst_mu_tmp_j1b(3,i,j,ipoint) - tmp2 * tmp_z
+          int2_grad1_u12_ao(i,j,ipoint,1) = tmp1 * x - tmp0 * x_v_ij_erf_rk_cst_mu_j1b(i,j,ipoint,1) - tmp2 * tmp_x
+          int2_grad1_u12_ao(i,j,ipoint,2) = tmp1 * y - tmp0 * x_v_ij_erf_rk_cst_mu_j1b(i,j,ipoint,2) - tmp2 * tmp_y
+          int2_grad1_u12_ao(i,j,ipoint,3) = tmp1 * z - tmp0 * x_v_ij_erf_rk_cst_mu_j1b(i,j,ipoint,3) - tmp2 * tmp_z
        enddo
      enddo
    enddo
@ -67,9 +70,9 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao, (3, ao_num, ao_num, n_poin
        do i = 1, ao_num
          tmp1 = v_ij_erf_rk_cst_mu(i,j,ipoint)

-          int2_grad1_u12_ao(1,i,j,ipoint) = tmp1 * x - x_v_ij_erf_rk_cst_mu_tmp(1,i,j,ipoint)
-          int2_grad1_u12_ao(2,i,j,ipoint) = tmp1 * y - x_v_ij_erf_rk_cst_mu_tmp(2,i,j,ipoint)
-          int2_grad1_u12_ao(3,i,j,ipoint) = tmp1 * z - x_v_ij_erf_rk_cst_mu_tmp(3,i,j,ipoint)
+          int2_grad1_u12_ao(i,j,ipoint,1) = tmp1 * x - x_v_ij_erf_rk_cst_mu_transp_bis(ipoint,i,j,1)
+          int2_grad1_u12_ao(i,j,ipoint,2) = tmp1 * y - x_v_ij_erf_rk_cst_mu_transp_bis(ipoint,i,j,2)
+          int2_grad1_u12_ao(i,j,ipoint,3) = tmp1 * z - x_v_ij_erf_rk_cst_mu_transp_bis(ipoint,i,j,3)
        enddo
      enddo
    enddo
@ -78,6 +81,9 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao, (3, ao_num, ao_num, n_poin

  endif

+  call wall_time(time1)
+  print*, ' Wall time for int2_grad1_u12_ao = ', time1 - time0
+
 END_PROVIDER 

 ! ---
@ -94,7 +100,7 @@ BEGIN_PROVIDER [ double precision, int1_grad2_u12_ao, (3, ao_num, ao_num, n_poin
  !
  ! int1_grad2_u12_ao(:,i,j,ipoint) = +0.5 x \int dr1 [-(r1 - r2) (erf(mu * r12)-1)r_12] \phi_i(r1) \phi_j(r1)
  !                                 = -0.5 * [ v_ij_erf_rk_cst_mu(i,j,ipoint) * r(:) - x_v_ij_erf_rk_cst_mu(i,j,ipoint,:) ]
-  !                                 = -int2_grad1_u12_ao(:,i,j,ipoint)
+  !                                 = -int2_grad1_u12_ao(i,j,ipoint,:)
  !
  ! if J(r1,r2) = u12 x v1 x v2
  !
@ -131,9 +137,9 @@ BEGIN_PROVIDER [ double precision, int1_grad2_u12_ao, (3, ao_num, ao_num, n_poin
          tmp1 = tmp0 * v_ij_erf_rk_cst_mu_j1b(i,j,ipoint)
          tmp2 = v_ij_u_cst_mu_j1b(i,j,ipoint)

-          int1_grad2_u12_ao(1,i,j,ipoint) = -tmp1 * x + tmp0 * x_v_ij_erf_rk_cst_mu_tmp_j1b(1,i,j,ipoint) - tmp2 * tmp_x
-          int1_grad2_u12_ao(2,i,j,ipoint) = -tmp1 * y + tmp0 * x_v_ij_erf_rk_cst_mu_tmp_j1b(2,i,j,ipoint) - tmp2 * tmp_y
-          int1_grad2_u12_ao(3,i,j,ipoint) = -tmp1 * z + tmp0 * x_v_ij_erf_rk_cst_mu_tmp_j1b(3,i,j,ipoint) - tmp2 * tmp_z
+          int1_grad2_u12_ao(1,i,j,ipoint) = -tmp1 * x + tmp0 * x_v_ij_erf_rk_cst_mu_j1b(i,j,ipoint,1) - tmp2 * tmp_x
+          int1_grad2_u12_ao(2,i,j,ipoint) = -tmp1 * y + tmp0 * x_v_ij_erf_rk_cst_mu_j1b(i,j,ipoint,2) - tmp2 * tmp_y
+          int1_grad2_u12_ao(3,i,j,ipoint) = -tmp1 * z + tmp0 * x_v_ij_erf_rk_cst_mu_j1b(i,j,ipoint,3) - tmp2 * tmp_z
        enddo
      enddo
    enddo
@ -148,11 +154,11 @@ END_PROVIDER

 ! ---

-BEGIN_PROVIDER [double precision, tc_grad_and_lapl_ao, (ao_num, ao_num, ao_num, ao_num)]
+BEGIN_PROVIDER [double precision, tc_grad_and_lapl_ao_loop, (ao_num, ao_num, ao_num, ao_num)]

  BEGIN_DOC
  !
-  ! tc_grad_and_lapl_ao(k,i,l,j) = < k l | -1/2 \Delta_1 u(r1,r2) - \grad_1 u(r1,r2) . \grad_1 | ij >
+  ! tc_grad_and_lapl_ao_loop(k,i,l,j) = < k l | -1/2 \Delta_1 u(r1,r2) - \grad_1 u(r1,r2) . \grad_1 | ij >
  !
  ! = 1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2) 
  !
@ -165,8 +171,12 @@ BEGIN_PROVIDER [double precision, tc_grad_and_lapl_ao, (ao_num, ao_num, ao_num,
  double precision              :: weight1, contrib_x, contrib_y, contrib_z, tmp_x, tmp_y, tmp_z
  double precision              :: ao_k_r, ao_i_r, ao_i_dx, ao_i_dy, ao_i_dz
  double precision              :: ao_j_r, ao_l_r, ao_l_dx, ao_l_dy, ao_l_dz
+  double precision              :: time0, time1
  double precision, allocatable :: ac_mat(:,:,:,:)

+  print*, ' providing tc_grad_and_lapl_ao_loop ...'
+  call wall_time(time0)
+
  allocate(ac_mat(ao_num,ao_num,ao_num,ao_num))
  ac_mat = 0.d0

@ -176,24 +186,32 @@ BEGIN_PROVIDER [double precision, tc_grad_and_lapl_ao, (ao_num, ao_num, ao_num,
    weight1 = 0.5d0 * final_weight_at_r_vector(ipoint)

    do i = 1, ao_num
-      ao_i_r  = weight1 * aos_in_r_array_transp         (ipoint,i)
-      ao_i_dx = weight1 * aos_grad_in_r_array_transp_bis(ipoint,i,1)
-      ao_i_dy = weight1 * aos_grad_in_r_array_transp_bis(ipoint,i,2)
-      ao_i_dz = weight1 * aos_grad_in_r_array_transp_bis(ipoint,i,3)
+      !ao_i_r  = weight1 * aos_in_r_array_transp         (ipoint,i)
+      !ao_i_dx = weight1 * aos_grad_in_r_array_transp_bis(ipoint,i,1)
+      !ao_i_dy = weight1 * aos_grad_in_r_array_transp_bis(ipoint,i,2)
+      !ao_i_dz = weight1 * aos_grad_in_r_array_transp_bis(ipoint,i,3)
+      ao_i_r  = weight1 * aos_in_r_array     (i,ipoint)
+      ao_i_dx = weight1 * aos_grad_in_r_array(i,ipoint,1)
+      ao_i_dy = weight1 * aos_grad_in_r_array(i,ipoint,2)
+      ao_i_dz = weight1 * aos_grad_in_r_array(i,ipoint,3)

      do k = 1, ao_num
-        ao_k_r = aos_in_r_array_transp(ipoint,k)
+        !ao_k_r = aos_in_r_array_transp(ipoint,k)
+        ao_k_r = aos_in_r_array(k,ipoint)

-        tmp_x = ao_k_r * ao_i_dx - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,1) 
-        tmp_y = ao_k_r * ao_i_dy - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,2) 
-        tmp_z = ao_k_r * ao_i_dz - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,3) 
+        !tmp_x = ao_k_r * ao_i_dx - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,1) 
+        !tmp_y = ao_k_r * ao_i_dy - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,2) 
+        !tmp_z = ao_k_r * ao_i_dz - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,3) 
+        tmp_x = ao_k_r * ao_i_dx - ao_i_r * aos_grad_in_r_array(k,ipoint,1) 
+        tmp_y = ao_k_r * ao_i_dy - ao_i_r * aos_grad_in_r_array(k,ipoint,2) 
+        tmp_z = ao_k_r * ao_i_dz - ao_i_r * aos_grad_in_r_array(k,ipoint,3) 

        do j = 1, ao_num
          do l = 1, ao_num

-            contrib_x = int2_grad1_u12_ao(1,l,j,ipoint) * tmp_x 
-            contrib_y = int2_grad1_u12_ao(2,l,j,ipoint) * tmp_y 
-            contrib_z = int2_grad1_u12_ao(3,l,j,ipoint) * tmp_z 
+            contrib_x = int2_grad1_u12_ao(l,j,ipoint,1) * tmp_x 
+            contrib_y = int2_grad1_u12_ao(l,j,ipoint,2) * tmp_y 
+            contrib_z = int2_grad1_u12_ao(l,j,ipoint,3) * tmp_z 

            ac_mat(k,i,l,j) += contrib_x + contrib_y + contrib_z
          enddo
@ -223,9 +241,9 @@ BEGIN_PROVIDER [double precision, tc_grad_and_lapl_ao, (ao_num, ao_num, ao_num,
  !      do i = 1, ao_num
  !        do k = 1, ao_num

-  !          contrib_x = int2_grad1_u12_ao(1,k,i,ipoint) * tmp_x 
-  !          contrib_y = int2_grad1_u12_ao(2,k,i,ipoint) * tmp_y 
-  !          contrib_z = int2_grad1_u12_ao(3,k,i,ipoint) * tmp_z 
+  !          contrib_x = int2_grad1_u12_ao(k,i,ipoint,1) * tmp_x 
+  !          contrib_y = int2_grad1_u12_ao(k,i,ipoint,2) * tmp_y 
+  !          contrib_z = int2_grad1_u12_ao(k,i,ipoint,3) * tmp_z 

  !          ac_mat(k,i,l,j) += contrib_x + contrib_y + contrib_z
  !        enddo
@ -240,8 +258,8 @@ BEGIN_PROVIDER [double precision, tc_grad_and_lapl_ao, (ao_num, ao_num, ao_num,
    do l = 1, ao_num
      do i = 1, ao_num
        do k = 1, ao_num
-          tc_grad_and_lapl_ao(k,i,l,j) = ac_mat(k,i,l,j) + ac_mat(l,j,k,i)
-          !tc_grad_and_lapl_ao(k,i,l,j) = ac_mat(k,i,l,j)
+          tc_grad_and_lapl_ao_loop(k,i,l,j) = ac_mat(k,i,l,j) + ac_mat(l,j,k,i)
+          !tc_grad_and_lapl_ao_loop(k,i,l,j) = ac_mat(k,i,l,j)
        enddo
      enddo
    enddo
@ -249,6 +267,92 @@ BEGIN_PROVIDER [double precision, tc_grad_and_lapl_ao, (ao_num, ao_num, ao_num,

  deallocate(ac_mat)

+  call wall_time(time1)
+  print*, ' Wall time for tc_grad_and_lapl_ao_loop = ', time1 - time0
+
+END_PROVIDER 
+
+! ---
+
+BEGIN_PROVIDER [double precision, tc_grad_and_lapl_ao, (ao_num, ao_num, ao_num, ao_num)]
+
+  BEGIN_DOC
+  !
+  ! tc_grad_and_lapl_ao(k,i,l,j) = < k l | -1/2 \Delta_1 u(r1,r2) - \grad_1 u(r1,r2) . \grad_1 | ij >
+  !
+  ! = 1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2) 
+  !
+  ! This is obtained by integration by parts. 
+  !
+  END_DOC
+
+  implicit none
+  integer                       :: ipoint, i, j, k, l, m
+  double precision              :: weight1, ao_k_r, ao_i_r
+  double precision              :: time0, time1
+  double precision, allocatable :: ac_mat(:,:,:,:), b_mat(:,:,:,:)
+
+  print*, ' providing tc_grad_and_lapl_ao ...'
+  call wall_time(time0)
+
+  allocate(b_mat(n_points_final_grid,ao_num,ao_num,3), ac_mat(ao_num,ao_num,ao_num,ao_num))
+
+  b_mat = 0.d0
+ !$OMP PARALLEL                                                              &
+ !$OMP DEFAULT (NONE)                                                        &
+ !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r)                       & 
+ !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, b_mat, & 
+ !$OMP         ao_num, n_points_final_grid, final_weight_at_r_vector)
+ !$OMP DO SCHEDULE (static)
+  do i = 1, ao_num
+    do k = 1, ao_num
+      do ipoint = 1, n_points_final_grid
+
+        weight1 = 0.5d0 * final_weight_at_r_vector(ipoint)
+        ao_i_r  = aos_in_r_array_transp(ipoint,i)
+        ao_k_r  = aos_in_r_array_transp(ipoint,k)
+
+        b_mat(ipoint,k,i,1) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,1) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,1)) 
+        b_mat(ipoint,k,i,2) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,2) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,2)) 
+        b_mat(ipoint,k,i,3) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,3) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,3)) 
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  ac_mat = 0.d0
+  do m = 1, 3
+    call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0              &
+              , int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num, b_mat(1,1,1,m), n_points_final_grid &
+              , 1.d0, ac_mat, ao_num*ao_num) 
+
+  enddo
+  deallocate(b_mat)
+
+ !$OMP PARALLEL             &
+ !$OMP DEFAULT (NONE)       &
+ !$OMP PRIVATE (i, j, k, l) & 
+ !$OMP SHARED (ac_mat, tc_grad_and_lapl_ao, ao_num)
+ !$OMP DO SCHEDULE (static)
+  do j = 1, ao_num
+    do l = 1, ao_num
+      do i = 1, ao_num
+        do k = 1, ao_num
+          tc_grad_and_lapl_ao(k,i,l,j) = ac_mat(k,i,l,j) + ac_mat(l,j,k,i)
+          !tc_grad_and_lapl_ao(k,i,l,j) = ac_mat(k,i,l,j)
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  deallocate(ac_mat)
+
+  call wall_time(time1)
+  print*, ' Wall time for tc_grad_and_lapl_ao = ', time1 - time0
+
 END_PROVIDER 

 ! ---
--- a/src/non_h_ints_mu/new_grad_tc_manu.irp.f
+++ b/src/non_h_ints_mu/new_grad_tc_manu.irp.f
@ -1,20 +1,20 @@

-BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_test, (3, ao_num, ao_num, n_points_final_grid)]
+BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_test, (ao_num, ao_num, n_points_final_grid, 3)]

  BEGIN_DOC
  !
-  ! int2_grad1_u12_ao_test(:,i,j,ipoint) = \int dr2 [-1 * \grad_r1 J(r1,r2)] \phi_i(r2) \phi_j(r2) 
+  ! int2_grad1_u12_ao_test(i,j,ipoint,:) = \int dr2 [-1 * \grad_r1 J(r1,r2)] \phi_i(r2) \phi_j(r2) 
  !
  ! where r1 = r(ipoint)
  !
  ! if J(r1,r2) = u12:
  !
-  ! int2_grad1_u12_ao_test(:,i,j,ipoint) = 0.5 x \int dr2 [(r1 - r2) (erf(mu * r12)-1)r_12] \phi_i(r2) \phi_j(r2)
+  ! int2_grad1_u12_ao_test(i,j,ipoint,:) = 0.5 x \int dr2 [(r1 - r2) (erf(mu * r12)-1)r_12] \phi_i(r2) \phi_j(r2)
  !                                 = 0.5 * [ v_ij_erf_rk_cst_mu(i,j,ipoint) * r(:) - x_v_ij_erf_rk_cst_mu(i,j,ipoint,:) ]
  !
  ! if J(r1,r2) = u12 x v1 x v2
  !
-  ! int2_grad1_u12_ao_test(:,i,j,ipoint) =      v1    x [ 0.5 x \int dr2 [(r1 - r2) (erf(mu * r12)-1)r_12] v2 \phi_i(r2) \phi_j(r2) ]
+  ! int2_grad1_u12_ao_test(i,j,ipoint,:) =      v1    x [ 0.5 x \int dr2 [(r1 - r2) (erf(mu * r12)-1)r_12] v2 \phi_i(r2) \phi_j(r2) ]
  !                                 - \grad_1 v1 x [       \int dr2                  u12              v2 \phi_i(r2) \phi_j(r2) ] 
  !                                 =    0.5 v_1b(ipoint) * v_ij_erf_rk_cst_mu_j1b(i,j,ipoint) * r(:) 
  !                                 -    0.5 v_1b(ipoint) * x_v_ij_erf_rk_cst_mu_j1b(i,j,ipoint,:) 
@ -25,8 +25,12 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_test, (3, ao_num, ao_num, n

  implicit none
  integer          :: ipoint, i, j
+  double precision :: time0, time1
  double precision :: x, y, z, tmp_x, tmp_y, tmp_z, tmp0, tmp1, tmp2

+  print*, ' providing int2_grad1_u12_ao_test ...'
+  call wall_time(time0)
+
  PROVIDE j1b_type
  
  if(j1b_type .eq. 3) then
@ -43,14 +47,13 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_test, (3, ao_num, ao_num, n
  
      do j = 1, ao_num
        do i = 1, ao_num
-!         if(dabs(ao_overlap_abs_grid(j,i)).lt.1.d-12)cycle

          tmp1 = tmp0 * v_ij_erf_rk_cst_mu_j1b_test(i,j,ipoint)
          tmp2 = v_ij_u_cst_mu_j1b_test(i,j,ipoint)

-          int2_grad1_u12_ao_test(1,i,j,ipoint) = tmp1 * x - tmp0 * x_v_ij_erf_rk_cst_mu_tmp_j1b_test(1,i,j,ipoint) - tmp2 * tmp_x
-          int2_grad1_u12_ao_test(2,i,j,ipoint) = tmp1 * y - tmp0 * x_v_ij_erf_rk_cst_mu_tmp_j1b_test(2,i,j,ipoint) - tmp2 * tmp_y
-          int2_grad1_u12_ao_test(3,i,j,ipoint) = tmp1 * z - tmp0 * x_v_ij_erf_rk_cst_mu_tmp_j1b_test(3,i,j,ipoint) - tmp2 * tmp_z
+          int2_grad1_u12_ao_test(i,j,ipoint,1) = tmp1 * x - tmp0 * x_v_ij_erf_rk_cst_mu_j1b_test(i,j,ipoint,1) - tmp2 * tmp_x
+          int2_grad1_u12_ao_test(i,j,ipoint,2) = tmp1 * y - tmp0 * x_v_ij_erf_rk_cst_mu_j1b_test(i,j,ipoint,2) - tmp2 * tmp_y
+          int2_grad1_u12_ao_test(i,j,ipoint,3) = tmp1 * z - tmp0 * x_v_ij_erf_rk_cst_mu_j1b_test(i,j,ipoint,3) - tmp2 * tmp_z
        enddo
      enddo
    enddo
@ -66,9 +69,9 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_test, (3, ao_num, ao_num, n
        do i = 1, ao_num
          tmp1 = v_ij_erf_rk_cst_mu(i,j,ipoint)

-          int2_grad1_u12_ao_test(1,i,j,ipoint) = tmp1 * x - x_v_ij_erf_rk_cst_mu_tmp(1,i,j,ipoint)
-          int2_grad1_u12_ao_test(2,i,j,ipoint) = tmp1 * y - x_v_ij_erf_rk_cst_mu_tmp(2,i,j,ipoint)
-          int2_grad1_u12_ao_test(3,i,j,ipoint) = tmp1 * z - x_v_ij_erf_rk_cst_mu_tmp(3,i,j,ipoint)
+          int2_grad1_u12_ao_test(i,j,ipoint,1) = tmp1 * x - x_v_ij_erf_rk_cst_mu_tmp(i,j,ipoint,1)
+          int2_grad1_u12_ao_test(i,j,ipoint,2) = tmp1 * y - x_v_ij_erf_rk_cst_mu_tmp(i,j,ipoint,2)
+          int2_grad1_u12_ao_test(i,j,ipoint,3) = tmp1 * z - x_v_ij_erf_rk_cst_mu_tmp(i,j,ipoint,3)
        enddo
      enddo
    enddo
@ -77,8 +80,13 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_test, (3, ao_num, ao_num, n

  endif

+  call wall_time(time1)
+  print*, ' Wall time for int2_grad1_u12_ao_test = ', time1 - time0
+
 END_PROVIDER 

+! ---
+
 BEGIN_PROVIDER [double precision, tc_grad_and_lapl_ao_test, (ao_num, ao_num, ao_num, ao_num)]

  BEGIN_DOC
@ -92,48 +100,57 @@ BEGIN_PROVIDER [double precision, tc_grad_and_lapl_ao_test, (ao_num, ao_num, ao_
  END_DOC

  implicit none
-  integer                       :: ipoint, i, j, k, l
+  integer                       :: ipoint, i, j, k, l, m
  double precision              :: weight1, contrib_x, contrib_y, contrib_z, tmp_x, tmp_y, tmp_z
  double precision              :: ao_k_r, ao_i_r, ao_i_dx, ao_i_dy, ao_i_dz
-  double precision, allocatable :: ac_mat(:,:,:,:)
-  double precision :: wall0, wall1
+  double precision              :: time0, time1
+  double precision, allocatable :: ac_mat(:,:,:,:), b_mat(:,:,:,:)
+
+  print*, ' providing tc_grad_and_lapl_ao_test ...'
+  call wall_time(time0)

  provide int2_grad1_u12_ao_test 
-  call wall_time(wall0)
-  allocate(ac_mat(ao_num,ao_num,ao_num,ao_num))
-  ac_mat = 0.d0
 
-  do ipoint = 1, n_points_final_grid
-    weight1 = 0.5d0 * final_weight_at_r_vector(ipoint)
-    do j = 1, ao_num
-      do l = 1, ao_num
-       contrib_x = int2_grad1_u12_ao_test(1,l,j,ipoint)
-       contrib_y = int2_grad1_u12_ao_test(2,l,j,ipoint)
-       contrib_z = int2_grad1_u12_ao_test(3,l,j,ipoint)
+  allocate(b_mat(n_points_final_grid,ao_num,ao_num,3), ac_mat(ao_num,ao_num,ao_num,ao_num))
+
+  b_mat = 0.d0
+ !$OMP PARALLEL                                                              &
+ !$OMP DEFAULT (NONE)                                                        &
+ !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r)                       & 
+ !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, b_mat, & 
+ !$OMP         ao_num, n_points_final_grid, final_weight_at_r_vector)
+ !$OMP DO SCHEDULE (static)
  do i = 1, ao_num
-         ao_i_r  = weight1 * aos_in_r_array                (i,ipoint)
-         ao_i_dx = weight1 * aos_grad_in_r_array_transp(1,i,ipoint)
-         ao_i_dy = weight1 * aos_grad_in_r_array_transp(2,i,ipoint)
-         ao_i_dz = weight1 * aos_grad_in_r_array_transp(3,i,ipoint)
-       
    do k = 1, ao_num
-           ao_k_r = aos_in_r_array(k,ipoint)
+      do ipoint = 1, n_points_final_grid

-           tmp_x = ao_k_r * ao_i_dx - ao_i_r * aos_grad_in_r_array_transp(1,k,ipoint) 
-           tmp_y = ao_k_r * ao_i_dy - ao_i_r * aos_grad_in_r_array_transp(2,k,ipoint) 
-           tmp_z = ao_k_r * ao_i_dz - ao_i_r * aos_grad_in_r_array_transp(3,k,ipoint) 
+        weight1 = 0.5d0 * final_weight_at_r_vector(ipoint)
+        ao_i_r  = aos_in_r_array_transp(ipoint,i)
+        ao_k_r  = aos_in_r_array_transp(ipoint,k)

-           tmp_x *= contrib_x 
-           tmp_y *= contrib_y 
-           tmp_z *= contrib_z 
+        b_mat(ipoint,k,i,1) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,1) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,1))
+        b_mat(ipoint,k,i,2) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,2) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,2))
+        b_mat(ipoint,k,i,3) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,3) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,3))
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL

-            ac_mat(k,i,l,j) += tmp_x + tmp_y + tmp_z
-          enddo
-        enddo
-      enddo
-    enddo
-  enddo
+  ac_mat = 0.d0
+  do m = 1, 3
+    call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0                   &
+              , int2_grad1_u12_ao_test(1,1,1,m), ao_num*ao_num, b_mat(1,1,1,m), n_points_final_grid &
+              , 1.d0, ac_mat, ao_num*ao_num)

+  enddo
+  deallocate(b_mat)
+
+ !$OMP PARALLEL             &
+ !$OMP DEFAULT (NONE)       &
+ !$OMP PRIVATE (i, j, k, l) & 
+ !$OMP SHARED (ac_mat, tc_grad_and_lapl_ao_test, ao_num)
+ !$OMP DO SCHEDULE (static)
  do j = 1, ao_num
    do l = 1, ao_num
      do i = 1, ao_num
@ -143,11 +160,15 @@ BEGIN_PROVIDER [double precision, tc_grad_and_lapl_ao_test, (ao_num, ao_num, ao_
      enddo
    enddo
  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL

-  call wall_time(wall1)
-  print*,'wall time for tc_grad_and_lapl_ao_test',wall1 - wall0
  deallocate(ac_mat)

+  call wall_time(time1)
+  print*, ' Wall time for tc_grad_and_lapl_ao_test = ', time1 - time0
+
 END_PROVIDER 

 ! ---
+
--- a/src/non_h_ints_mu/total_tc_int.irp.f
+++ b/src/non_h_ints_mu/total_tc_int.irp.f
@ -7,6 +7,7 @@ BEGIN_PROVIDER [double precision, ao_tc_int_chemist, (ao_num, ao_num, ao_num, ao
  integer          :: i, j, k, l
  double precision :: wall1, wall0

+  print *, ' providing ao_tc_int_chemist ...'
  call wall_time(wall0)
  
  if(test_cycle_tc)then
@ -36,6 +37,7 @@ BEGIN_PROVIDER [double precision, ao_tc_int_chemist_test, (ao_num, ao_num, ao_nu
  integer          :: i, j, k, l
  double precision :: wall1, wall0

+  print *, ' providing ao_tc_int_chemist_test ...'
  call wall_time(wall0)

   do j = 1, ao_num
@ -47,8 +49,10 @@ BEGIN_PROVIDER [double precision, ao_tc_int_chemist_test, (ao_num, ao_num, ao_nu
       enddo
     enddo
   enddo
+
  call wall_time(wall1)
  print *, ' wall time for ao_tc_int_chemist_test ', wall1 - wall0
+
 END_PROVIDER 

 ! ---
--- a/src/non_hermit_dav/biorthog.irp.f
+++ b/src/non_hermit_dav/biorthog.irp.f
@ -444,8 +444,8 @@ subroutine non_hrmt_bieig(n, A, thr_d, thr_nd, leigvec, reigvec, n_real_eigv, ei
    endif
    call check_biorthog(n, n_real_eigv, leigvec, reigvec, accu_d, accu_nd, S, thr_d, thr_nd, .true.)

-    !call impose_biorthog_qr(n, n_real_eigv, leigvec, reigvec)
-    !call impose_biorthog_lu(n, n_real_eigv, leigvec, reigvec)
+    !call impose_biorthog_qr(n, n_real_eigv, thr_d, thr_nd, leigvec, reigvec)
+    !call impose_biorthog_lu(n, n_real_eigv, thr_d, thr_nd, leigvec, reigvec)

    ! ---

@ -611,7 +611,7 @@ subroutine non_hrmt_bieig_random_diag(n, A, leigvec, reigvec, n_real_eigv, eigva
  enddo
  accu_nd = dsqrt(accu_nd)

-  if(accu_nd .lt. 1d-8) then
+  if(accu_nd .lt. thresh_biorthog_nondiag) then
    ! L x R is already bi-orthogonal

    print *, ' L & T bi-orthogonality: ok'
@ -623,7 +623,7 @@ subroutine non_hrmt_bieig_random_diag(n, A, leigvec, reigvec, n_real_eigv, eigva

    print *, ' L & T bi-orthogonality: not imposed yet'
    print *, ' accu_nd = ', accu_nd
-    call impose_biorthog_qr(n, n_real_eigv, leigvec, reigvec)
+    call impose_biorthog_qr(n, n_real_eigv, thresh_biorthog_diag, thresh_biorthog_nondiag, leigvec, reigvec)
    deallocate( S )
  
  endif
@ -633,7 +633,7 @@ subroutine non_hrmt_bieig_random_diag(n, A, leigvec, reigvec, n_real_eigv, eigva

  return

-end 
+end subroutine non_hrmt_bieig_random_diag

 ! ---

@ -961,7 +961,7 @@ subroutine non_hrmt_bieig_fullvect(n, A, leigvec, reigvec, n_real_eigv, eigval)
  enddo
  accu_nd = dsqrt(accu_nd)

-  if( accu_nd .lt. 1d-8 ) then
+  if(accu_nd .lt. thresh_biorthog_nondiag) then
    ! L x R is already bi-orthogonal

    !print *, ' L & T bi-orthogonality: ok'
@ -973,7 +973,7 @@ subroutine non_hrmt_bieig_fullvect(n, A, leigvec, reigvec, n_real_eigv, eigval)

    !print *, ' L & T bi-orthogonality: not imposed yet'
    !print *, ' accu_nd = ', accu_nd
-    call impose_biorthog_qr(n, n, leigvec, reigvec)
+    call impose_biorthog_qr(n, n, thresh_biorthog_diag, thresh_biorthog_nondiag, leigvec, reigvec)
    deallocate( S )
  
  endif
--- a/src/non_hermit_dav/new_routines.irp.f
+++ b/src/non_hermit_dav/new_routines.irp.f
@ -132,9 +132,9 @@ subroutine non_hrmt_diag_split_degen_bi_orthog(n, A, leigvec, reigvec, n_real_ei

 !!! ONCE ALL EIGENVALUES ARE REAL ::: CHECK BI-ORTHONORMALITY
  !                               check bi-orthogonality
-  call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, .false.)
+  call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
  print *, ' accu_nd bi-orthog = ', accu_nd
-  if( accu_nd .lt. 1d-10 ) then
+  if(accu_nd .lt. thresh_biorthog_nondiag) then
    print *, ' bi-orthogonality: ok'
  else
    print *, ' '
@ -149,14 +149,14 @@ subroutine non_hrmt_diag_split_degen_bi_orthog(n, A, leigvec, reigvec, n_real_ei
    deallocate(S_nh_inv_half)
    call impose_orthog_degen_eigvec(n, eigval, reigvec_tmp)
    call impose_orthog_degen_eigvec(n, eigval, leigvec_tmp)
-    call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, .false.)
-    if( accu_nd .lt. 1d-10 ) then
+    call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
+    if(accu_nd .lt. thresh_biorthog_nondiag) then
      print *, ' bi-orthogonality: ok'
    else 
     print*,'New vectors not bi-orthonormals at ',accu_nd
     call impose_biorthog_qr(n, n, leigvec_tmp, reigvec_tmp, S)
-     call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, .false.)
-     if( accu_nd .lt. 1d-10 ) then
+     call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
+     if(accu_nd .lt. thresh_biorthog_nondiag) then
       print *, ' bi-orthogonality: ok'
     else 
      print*,'New vectors not bi-orthonormals at ',accu_nd
@ -200,10 +200,10 @@ subroutine non_hrmt_diag_split_degen_bi_orthog(n, A, leigvec, reigvec, n_real_ei
  shift_current = max(1.d-10,shift_current)
  print*,'Thr for eigenvectors = ',shift_current
  call check_EIGVEC(n, n, A, eigval, leigvec, reigvec, shift_current, thr_norm, .false.)
-  call check_biorthog(n, n, leigvec, reigvec, accu_d, accu_nd, S, .false.)
+  call check_biorthog(n, n, leigvec, reigvec, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
  print *, ' accu_nd bi-orthog = ', accu_nd
  
-  if( accu_nd .lt. 1d-10 ) then
+  if(accu_nd .lt. thresh_biorthog_nondiag) then
    print *, ' bi-orthogonality: ok'
  else 
   print*,'Something went wrong in non_hrmt_diag_split_degen_bi_orthog'
@ -354,9 +354,9 @@ subroutine non_hrmt_diag_split_degen_s_inv_half(n, A, leigvec, reigvec, n_real_e

 !!! ONCE ALL EIGENVALUES ARE REAL ::: CHECK BI-ORTHONORMALITY
  !                               check bi-orthogonality
-  call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, .false.)
+  call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
  print *, ' accu_nd bi-orthog = ', accu_nd
-  if( accu_nd .lt. 1d-10 ) then
+  if(accu_nd .lt. thresh_biorthog_nondiag) then
    print *, ' bi-orthogonality: ok'
  else
    print *, ' '
@ -369,9 +369,9 @@ subroutine non_hrmt_diag_split_degen_s_inv_half(n, A, leigvec, reigvec, n_real_e
     ! bi-orthonormalization using orthogonalization of left, right and then QR between left and right
     call impose_orthog_degen_eigvec(n, eigval, reigvec_tmp) ! orthogonalization of reigvec
     call impose_orthog_degen_eigvec(n, eigval, leigvec_tmp) ! orthogonalization of leigvec
-     call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S) 
+     call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S,  thresh_biorthog_diag, thresh_biorthog_nondiag, .false.) 

-     if( accu_nd .lt. 1d-10 ) then
+     if(accu_nd .lt. thresh_biorthog_nondiag) then
       print *, ' bi-orthogonality: ok'
     else 
      print*,'New vectors not bi-orthonormals at ', accu_nd
@ -387,8 +387,8 @@ subroutine non_hrmt_diag_split_degen_s_inv_half(n, A, leigvec, reigvec, n_real_e
     print*,'S^{-1/2} exists !!'
     call bi_ortho_s_inv_half(n,leigvec_tmp,reigvec_tmp,S_nh_inv_half) ! use of S^{-1/2} bi-orthonormalization 
    endif
-    call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, .false.)
-    if( accu_nd .lt. 1d-10 ) then
+    call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
+    if(accu_nd .lt. thresh_biorthog_nondiag) then
      print *, ' bi-orthogonality: ok'
    else 
     print*,'New vectors not bi-orthonormals at ',accu_nd
@ -431,10 +431,10 @@ subroutine non_hrmt_diag_split_degen_s_inv_half(n, A, leigvec, reigvec, n_real_e
  shift_current = max(1.d-10,shift_current)
  print*,'Thr for eigenvectors = ',shift_current
  call check_EIGVEC(n, n, A, eigval, leigvec, reigvec, shift_current, thr_norm, .false.)
-  call check_biorthog(n, n, leigvec, reigvec, accu_d, accu_nd, S, .false.)
+  call check_biorthog(n, n, leigvec, reigvec, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
  print *, ' accu_nd bi-orthog = ', accu_nd
  
-  if( accu_nd .lt. 1d-10 ) then
+  if(accu_nd .lt. thresh_biorthog_nondiag) then
    print *, ' bi-orthogonality: ok'
  else 
   print*,'Something went wrong in non_hrmt_diag_split_degen_bi_orthog'
@ -472,6 +472,7 @@ subroutine non_hrmt_fock_mat(n, A, leigvec, reigvec, n_real_eigv, eigval)
  double precision :: accu,thr_cut
  double precision, allocatable :: S_nh_inv_half(:,:)
  logical :: complex_root
+  double precision :: thr_norm=1d0


  thr_cut = 1.d-15
@ -580,9 +581,9 @@ subroutine non_hrmt_fock_mat(n, A, leigvec, reigvec, n_real_eigv, eigval)

 !!! ONCE ALL EIGENVALUES ARE REAL ::: CHECK BI-ORTHONORMALITY
  !                               check bi-orthogonality
-  call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S)
+  call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
  print *, ' accu_nd bi-orthog = ', accu_nd
-  if( accu_nd .lt. 1d-10 ) then
+  if(accu_nd .lt. thresh_biorthog_nondiag) then
    print *, ' bi-orthogonality: ok'
  else
    print *, ' '
@ -593,9 +594,9 @@ subroutine non_hrmt_fock_mat(n, A, leigvec, reigvec, n_real_eigv, eigval)
    print *, ' '
    ! bi-orthonormalization using orthogonalization of left, right and then QR between left and right
    call impose_unique_biorthog_degen_eigvec(n, eigval, mo_coef, leigvec_tmp, reigvec_tmp)
-    call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S)
+    call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
    print*,'accu_nd = ',accu_nd
-    if( accu_nd .lt. 1d-10 ) then
+    if(accu_nd .lt. thresh_biorthog_nondiag) then
      print *, ' bi-orthogonality: ok'
    else 
     print*,'New vectors not bi-orthonormals at ',accu_nd
@ -608,8 +609,8 @@ subroutine non_hrmt_fock_mat(n, A, leigvec, reigvec, n_real_eigv, eigval)
      call bi_ortho_s_inv_half(n,leigvec_tmp,reigvec_tmp,S_nh_inv_half) ! use of S^{-1/2} bi-orthonormalization 
     endif
    endif
-    call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S)
-    if( accu_nd .lt. 1d-10 ) then
+    call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
+    if(accu_nd .lt. thresh_biorthog_nondiag) then
      print *, ' bi-orthogonality: ok'
    else 
     print*,'New vectors not bi-orthonormals at ',accu_nd
@ -651,11 +652,11 @@ subroutine non_hrmt_fock_mat(n, A, leigvec, reigvec, n_real_eigv, eigval)
  print*,'Checking for final reigvec/leigvec'
  shift_current = max(1.d-10,shift_current)
  print*,'Thr for eigenvectors = ',shift_current
-  call check_EIGVEC(n, n, A, eigval, leigvec, reigvec,shift_current)
-  call check_biorthog(n, n, leigvec, reigvec, accu_d, accu_nd, S)
+  call check_EIGVEC(n, n, A, eigval, leigvec, reigvec, shift_current, thr_norm, .false.)
+  call check_biorthog(n, n, leigvec, reigvec, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
  print *, ' accu_nd bi-orthog = ', accu_nd
  
-  if( accu_nd .lt. 1d-10 ) then
+  if(accu_nd .lt. thresh_biorthog_nondiag) then
    print *, ' bi-orthogonality: ok'
  else 
   print*,'Something went wrong in non_hrmt_diag_split_degen_bi_orthog'
--- a/src/scf_utils/diagonalize_fock.irp.f
+++ b/src/scf_utils/diagonalize_fock.irp.f
@ -20,6 +20,12 @@ BEGIN_PROVIDER [ double precision, eigenvectors_Fock_matrix_mo, (ao_num,mo_num)
     enddo
   enddo

+  !print *, ' Fock_matrix_MO :'
+  !do i = 1, mo_num
+  !  write(*, '(100(f15.7, 2x))') (Fock_matrix_MO(j,i), j = 1, mo_num)
+  !enddo
+
+
   if(frozen_orb_scf)then
     integer                        :: iorb,jorb
     do i = 1, n_core_orb
@ -89,6 +95,10 @@ BEGIN_PROVIDER [ double precision, eigenvectors_Fock_matrix_mo, (ao_num,mo_num)
   call dsyevd( 'V', 'U', mo_num, F,                             &
       size(F,1), diag, work, lwork, iwork, liwork, info)
   deallocate(iwork)
+  !print*, ' Fock eigval:'
+  !do i = 1, mo_num
+  !  print *, diag(i)
+  !enddo


   if (info /= 0) then
--- a/src/scf_utils/diis.irp.f
+++ b/src/scf_utils/diis.irp.f
@ -248,3 +248,71 @@ END_PROVIDER

 ! ---

+BEGIN_PROVIDER [double precision, FPS_SPF_Matrix_AO_a, (AO_num, AO_num)]
+
+  implicit none
+  double precision, allocatable  :: scratch(:,:)
+
+  allocate(scratch(AO_num, AO_num))
+
+  call dgemm( 'N', 'N', AO_num, AO_num, AO_num, 1.d0                                                                                 &
+            , Fock_Matrix_AO_alpha, size(Fock_Matrix_AO_alpha, 1), SCF_density_matrix_ao_alpha, size(SCF_Density_Matrix_AO_alpha, 1) &
+            , 0.d0, scratch, size(scratch, 1) )
+
+  call dgemm( 'N', 'N', AO_num, AO_num, AO_num, 1.d0                     &
+            , scratch, size(scratch, 1), AO_Overlap, size(AO_Overlap, 1) &
+            , 0.d0, FPS_SPF_Matrix_AO_a, size(FPS_SPF_Matrix_AO_a, 1) )
+
+  call dgemm( 'N', 'N', AO_num, AO_num, AO_num, 1.d0                                                             &
+            , AO_Overlap, size(AO_Overlap, 1), SCF_density_matrix_ao_alpha, size(SCF_density_matrix_ao_alpha, 1) & 
+            , 0.d0, scratch, size(scratch, 1) )
+
+  call dgemm( 'N', 'N', AO_num, AO_num, AO_num, -1.d0                                        &
+            , scratch, size(scratch, 1), Fock_Matrix_AO_alpha, size(Fock_Matrix_AO_alpha, 1) &
+            , 1.d0, FPS_SPF_Matrix_AO_a, size(FPS_SPF_Matrix_AO_a, 1) )
+
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [double precision, FPS_SPF_Matrix_AO_b, (AO_num, AO_num)]
+
+  implicit none
+  double precision, allocatable  :: scratch(:,:)
+
+  allocate(scratch(AO_num, AO_num))
+
+  call dgemm( 'N', 'N', AO_num, AO_num, AO_num, 1.d0                                                                             &
+            , Fock_Matrix_AO_beta, size(Fock_Matrix_AO_beta, 1), SCF_density_matrix_ao_beta, size(SCF_Density_Matrix_AO_beta, 1) &
+            , 0.d0, scratch, size(scratch, 1) )
+
+  call dgemm( 'N', 'N', AO_num, AO_num, AO_num, 1.d0                     &
+            , scratch, size(scratch, 1), AO_Overlap, size(AO_Overlap, 1) &
+            , 0.d0, FPS_SPF_Matrix_AO_b, size(FPS_SPF_Matrix_AO_b, 1) )
+
+  call dgemm( 'N', 'N', AO_num, AO_num, AO_num, 1.d0                                                           &
+            , AO_Overlap, size(AO_Overlap, 1), SCF_density_matrix_ao_beta, size(SCF_density_matrix_ao_beta, 1) & 
+            , 0.d0, scratch, size(scratch, 1) )
+
+  call dgemm( 'N', 'N', AO_num, AO_num, AO_num, -1.d0                                      &
+            , scratch, size(scratch, 1), Fock_Matrix_AO_beta, size(Fock_Matrix_AO_beta, 1) &
+            , 1.d0, FPS_SPF_Matrix_AO_b, size(FPS_SPF_Matrix_AO_b, 1) )
+
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [double precision, FPS_SPF_Matrix_MO_a, (mo_num, mo_num)]
+  implicit none
+  call ao_to_mo(FPS_SPF_Matrix_AO_a, size(FPS_SPF_Matrix_AO_a, 1), FPS_SPF_Matrix_MO_a, size(FPS_SPF_Matrix_MO_a, 1))
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [double precision, FPS_SPF_Matrix_MO_b, (mo_num, mo_num)]
+  implicit none
+  call ao_to_mo(FPS_SPF_Matrix_AO_b, size(FPS_SPF_Matrix_AO_b, 1), FPS_SPF_Matrix_MO_b, size(FPS_SPF_Matrix_MO_b, 1))
+END_PROVIDER
+
+! ---
+
--- a/src/scf_utils/fock_matrix.irp.f
+++ b/src/scf_utils/fock_matrix.irp.f
@ -15,7 +15,7 @@
   ! 
   ! Rcc = Acc Fcc^a + Bcc Fcc^b
   ! Roo = Aoo Foo^a + Boo Foo^b
-   ! Rcc = Avv Fvv^a + Bvv Fvv^b
+   ! Rvv = Avv Fvv^a + Bvv Fvv^b
   ! Fcv = (F^a + F^b)/2
   ! 
   ! F^a: Fock matrix alpha (MO), F^b: Fock matrix beta (MO)
@ -267,3 +267,5 @@ BEGIN_PROVIDER [ double precision, SCF_energy ]

 END_PROVIDER

+! ---
+
--- a/src/scf_utils/rh_scf_simple.irp.f
+++ b/src/scf_utils/rh_scf_simple.irp.f
@ -63,35 +63,34 @@ END_DOC
    energy_SCF = SCF_energy
    Delta_energy_SCF = energy_SCF - energy_SCF_previous

-    double precision :: level_shift_save
-    level_shift_save = level_shift
-    mo_coef_save(1:ao_num,1:mo_num) = mo_coef(1:ao_num,1:mo_num)
-    do while (Delta_energy_SCF > 0.d0)
-      mo_coef(1:ao_num,1:mo_num) = mo_coef_save
-      if (level_shift <= .1d0) then
-        level_shift = 1.d0
-      else
-        level_shift = level_shift * 3.0d0
-      endif
-      TOUCH mo_coef level_shift
-      mo_coef(1:ao_num,1:mo_num) = eigenvectors_Fock_matrix_MO(1:ao_num,1:mo_num)
-      if(frozen_orb_scf)then
-        call reorder_core_orb
-        call initialize_mo_coef_begin_iteration
-      endif
-      TOUCH mo_coef
-      Delta_energy_SCF = SCF_energy - energy_SCF_previous
-      energy_SCF = SCF_energy
-      if (level_shift-level_shift_save > 40.d0) then
-        level_shift = level_shift_save * 4.d0
-        SOFT_TOUCH level_shift
-        exit
-      endif
+    !double precision :: level_shift_save
+    !level_shift_save = level_shift
+    !mo_coef_save(1:ao_num,1:mo_num) = mo_coef(1:ao_num,1:mo_num)
+    !do while (Delta_energy_SCF > 0.d0)
+    !  mo_coef(1:ao_num,1:mo_num) = mo_coef_save
+    !  if (level_shift <= .1d0) then
+    !    level_shift = 1.d0
+    !  else
+    !    level_shift = level_shift * 3.0d0
+    !  endif
+    !  TOUCH mo_coef level_shift
+    !  mo_coef(1:ao_num,1:mo_num) = eigenvectors_Fock_matrix_MO(1:ao_num,1:mo_num)
+    !  if(frozen_orb_scf)then
+    !    call reorder_core_orb
+    !    call initialize_mo_coef_begin_iteration
+    !  endif
+    !  TOUCH mo_coef
+    !  Delta_energy_SCF = SCF_energy - energy_SCF_previous
+    !  energy_SCF = SCF_energy
+    !  if (level_shift-level_shift_save > 40.d0) then
+    !    level_shift = level_shift_save * 4.d0
+    !    SOFT_TOUCH level_shift
+    !    exit
+    !  endif
+    !enddo
+    !level_shift = level_shift * 0.5d0
+    !SOFT_TOUCH level_shift

-    enddo
-
-    level_shift = level_shift * 0.5d0
-    SOFT_TOUCH level_shift
    energy_SCF_previous = energy_SCF

 !   Print results at the end of each iteration
@ -100,7 +99,7 @@ END_DOC
      iteration_SCF, energy_SCF, Delta_energy_SCF, max_error_DIIS, level_shift, dim_DIIS

    if(Delta_energy_SCF < 0.d0) then
-      call save_mos
+      call save_mos()
    endif
    if(qp_stop()) exit

--- a/src/scf_utils/roothaan_hall_scf.irp.f
+++ b/src/scf_utils/roothaan_hall_scf.irp.f
@ -86,10 +86,9 @@ END_DOC
          iteration_SCF,dim_DIIS                                       &
          )

-      Fock_matrix_AO_alpha = Fock_matrix_AO*0.5d0
-      Fock_matrix_AO_beta  = Fock_matrix_AO*0.5d0
+      Fock_matrix_AO_alpha = Fock_matrix_AO!*0.5d0
+      Fock_matrix_AO_beta  = Fock_matrix_AO!*0.5d0
      TOUCH Fock_matrix_AO_alpha Fock_matrix_AO_beta
-
    endif

    MO_coef = eigenvectors_Fock_matrix_MO
@ -100,18 +99,14 @@ END_DOC

    TOUCH MO_coef

-!   Calculate error vectors
-
-    max_error_DIIS = maxval(Abs(FPS_SPF_Matrix_MO))
-
 !   SCF energy

    energy_SCF = SCF_energy
    Delta_energy_SCF = energy_SCF - energy_SCF_previous
    if ( (SCF_algorithm == 'DIIS').and.(Delta_energy_SCF > 0.d0) ) then
      Fock_matrix_AO(1:ao_num,1:ao_num) = Fock_matrix_DIIS (1:ao_num,1:ao_num,index_dim_DIIS)
-      Fock_matrix_AO_alpha = Fock_matrix_AO*0.5d0
-      Fock_matrix_AO_beta  = Fock_matrix_AO*0.5d0
+      Fock_matrix_AO_alpha = Fock_matrix_AO!*0.5d0
+      Fock_matrix_AO_beta  = Fock_matrix_AO!*0.5d0
      TOUCH Fock_matrix_AO_alpha Fock_matrix_AO_beta
    endif

@ -147,6 +142,9 @@ END_DOC
    SOFT_TOUCH level_shift
    energy_SCF_previous = energy_SCF

+!   Calculate error vectors
+    max_error_DIIS = maxval(Abs(FPS_SPF_Matrix_MO))
+
 !   Print results at the end of each iteration

    write(6,'(I4, 1X, F16.10, 1X, F16.10, 1X, F16.10, 1X, F16.10, 1X, I3)')  &
--- a/src/tc_bi_ortho/tc_som.irp.f
+++ b/src/tc_bi_ortho/tc_som.irp.f
@ -0,0 +1,70 @@
+! ---
+
+program tc_som
+
+  BEGIN_DOC
+  ! TODO : Put the documentation of the program here
+  END_DOC
+
+  implicit none
+
+  print *, ' starting ...'
+  print *, ' do not forget to do tc-scf first'
+
+  my_grid_becke  = .True.
+  my_n_pt_r_grid = 30
+  my_n_pt_a_grid = 50
+!  my_n_pt_r_grid = 10 ! small grid for quick debug
+!  my_n_pt_a_grid = 26 ! small grid for quick debug
+  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
+
+  PROVIDE mu_erf 
+  print *, ' mu = ', mu_erf
+  PROVIDE j1b_type
+  print *, ' j1b_type = ', j1b_type
+  print *, j1b_pen
+
+  read_wf = .true.
+  touch read_wf
+
+  call main()
+
+end
+
+! ---
+
+subroutine main()
+
+  implicit none
+  integer          :: i, i_HF, degree
+  double precision :: hmono_1, htwoe_1, hthree_1, htot_1
+  double precision :: hmono_2, htwoe_2, hthree_2, htot_2
+  double precision :: U_SOM
+
+  PROVIDE N_int N_det
+
+  do i = 1, N_det
+    call get_excitation_degree(HF_bitmask, psi_det(1,1,i), degree, N_int)
+    if(degree == 0) then
+      i_HF = i
+      exit
+    endif
+  enddo
+  print *, ' HF determinants:', i_HF
+  print *, '          N_det :', N_det
+
+  U_SOM = 0.d0 
+  do i = 1, N_det
+    if(i == i_HF) cycle
+    call htilde_mu_mat_bi_ortho(psi_det(1,1,i_HF), psi_det(1,1,i), N_int, hmono_1, htwoe_1, hthree_1, htot_1)
+    call htilde_mu_mat_bi_ortho(psi_det(1,1,i), psi_det(1,1,i_HF), N_int, hmono_2, htwoe_2, hthree_2, htot_2)
+    U_SOM += htot_1 * htot_2
+  enddo
+  U_SOM = 0.5d0 * U_SOM
+  print *, ' U_SOM = ', U_SOM
+  
+  return
+end subroutine main
+
+! ---
+
--- a/src/tc_keywords/EZFIO.cfg
+++ b/src/tc_keywords/EZFIO.cfg
@ -134,7 +134,31 @@ default: False
 type: integer
 doc: nb of Gaussians used to fit Jastrow fcts
 interface: ezfio,provider,ocaml
-default: 6
+default: 20
+
+[tcscf_algorithm]
+type: character*(32)
+doc: Type of TCSCF algorithm used. Possible choices are [Simple | DIIS]
+interface: ezfio,provider,ocaml
+default: Simple 
+
+[test_cycle_tc]
+type: logical
+doc: If |true|, the integrals of the three-body jastrow are computed with cycles
+interface: ezfio,provider,ocaml
+default: False
+
+[thresh_biorthog_diag]
+type: Threshold
+doc: Threshold to determine if diagonal elements of the bi-orthogonal condition L.T x R are close enouph to 1
+interface: ezfio,provider,ocaml
+default: 1.e-6
+
+[thresh_biorthog_nondiag]
+type: Threshold
+doc: Threshold to determine if non-diagonal elements of L.T x R are close enouph to 0
+interface: ezfio,provider,ocaml
+default: 1.e-6

 [max_dim_diis_tcscf]
 type: integer
@ -154,21 +178,9 @@ doc: Energy shift on the virtual MOs to improve TCSCF convergence
 interface: ezfio,provider,ocaml
 default: 0.

-[tcscf_algorithm]
-type: character*(32)
-doc: Type of TCSCF algorithm used. Possible choices are [Simple | DIIS]
-interface: ezfio,provider,ocaml
-default: Simple 
-
 [im_thresh_tcscf]
 type: Threshold
 doc: Thresholds on the Imag part of energy
 interface: ezfio,provider,ocaml
 default: 1.e-7

-[test_cycle_tc]
-type: logical
-doc: If |true|, the integrals of the three-body jastrow are computed with cycles
-interface: ezfio,provider,ocaml
-default: False
-
--- a/src/tc_scf/diago_bi_ort_tcfock.irp.f
+++ b/src/tc_scf/diago_bi_ort_tcfock.irp.f
@ -13,14 +13,10 @@
  integer                       :: n_real_tc 
  integer                       :: i, j, k, l
  double precision              :: accu_d, accu_nd, accu_tmp
-  double precision              :: thr_d, thr_nd
  double precision              :: norm
  double precision, allocatable :: eigval_right_tmp(:)
  double precision, allocatable :: F_tmp(:,:)

-  thr_d  = 1d-6
-  thr_nd = 1d-6
-
  allocate( eigval_right_tmp(mo_num), F_tmp(mo_num,mo_num) )

  PROVIDE Fock_matrix_tc_mo_tot
@ -38,12 +34,12 @@
    F_tmp(i,i) += level_shift_tcscf
  enddo

-  call non_hrmt_bieig( mo_num, F_tmp, thr_d, thr_nd           &
+  call non_hrmt_bieig( mo_num, F_tmp, thresh_biorthog_diag, thresh_biorthog_nondiag &
                     , fock_tc_leigvec_mo, fock_tc_reigvec_mo                       & 
                     , n_real_tc, eigval_right_tmp )

  !if(max_ov_tc_scf)then
-  ! call non_hrmt_fock_mat( mo_num, F_tmp, thr_d, thr_nd        &
+  ! call non_hrmt_fock_mat( mo_num, F_tmp, thresh_biorthog_diag, thresh_biorthog_nondiag &
  !                    , fock_tc_leigvec_mo, fock_tc_reigvec_mo                          & 
  !                    , n_real_tc, eigval_right_tmp )
  !else 
@ -88,16 +84,16 @@
      else
        accu_tmp = overlap_fock_tc_eigvec_mo(k,i)
        accu_nd += accu_tmp * accu_tmp
-        if(dabs(overlap_fock_tc_eigvec_mo(k,i)) .gt. thr_nd)then
+        if(dabs(overlap_fock_tc_eigvec_mo(k,i)) .gt. thresh_biorthog_nondiag)then
         print *, 'k,i', k, i, overlap_fock_tc_eigvec_mo(k,i)
        endif
      endif
    enddo 
  enddo
  accu_nd = dsqrt(accu_nd) / accu_d
-  if(accu_nd .gt. thr_nd) then
+  if(accu_nd .gt. thresh_biorthog_nondiag) then
    print *, ' bi-orthog failed'
-    print *, ' accu_nd MO = ', accu_nd, thr_nd
+    print *, ' accu_nd MO = ', accu_nd, thresh_biorthog_nondiag
    print *, ' overlap_fock_tc_eigvec_mo = '
    do i = 1, mo_num
      write(*,'(100(F16.10,X))') overlap_fock_tc_eigvec_mo(i,:)
@ -107,14 +103,14 @@

  ! ---

-  if(dabs(accu_d - dble(mo_num))/dble(mo_num) .gt. thr_d) then
+  if(dabs(accu_d - dble(mo_num))/dble(mo_num) .gt. thresh_biorthog_diag) then

    print *, ' mo_num     = ', mo_num 
-    print *, ' accu_d  MO = ', accu_d, thr_d
+    print *, ' accu_d  MO = ', accu_d, thresh_biorthog_diag
    print *, ' normalizing vectors ...'
    do i = 1, mo_num
      norm = dsqrt(dabs(overlap_fock_tc_eigvec_mo(i,i)))
-      if(norm .gt. thr_d) then
+      if(norm .gt. thresh_biorthog_diag) then
        do k = 1, mo_num
          fock_tc_reigvec_mo(k,i) *= 1.d0/norm
          fock_tc_leigvec_mo(k,i) *= 1.d0/norm
@ -137,16 +133,16 @@
        else
          accu_tmp = overlap_fock_tc_eigvec_mo(k,i)
          accu_nd += accu_tmp * accu_tmp
-          if(dabs(overlap_fock_tc_eigvec_mo(k,i)) .gt. thr_nd)then
+          if(dabs(overlap_fock_tc_eigvec_mo(k,i)) .gt. thresh_biorthog_nondiag)then
           print *, 'k,i', k, i, overlap_fock_tc_eigvec_mo(k,i)
          endif
        endif
      enddo 
    enddo
    accu_nd = dsqrt(accu_nd) / accu_d
-    if(accu_nd .gt. thr_nd) then
+    if(accu_nd .gt. thresh_biorthog_diag) then
      print *, ' bi-orthog failed'
-      print *, ' accu_nd MO = ', accu_nd, thr_nd
+      print *, ' accu_nd MO = ', accu_nd, thresh_biorthog_nondiag
      print *, ' overlap_fock_tc_eigvec_mo = '
      do i = 1, mo_num
        write(*,'(100(F16.10,X))') overlap_fock_tc_eigvec_mo(i,:)
@ -177,6 +173,7 @@ END_PROVIDER
  double precision              :: accu, accu_d
  double precision, allocatable :: tmp(:,:)

+  PROVIDE mo_l_coef mo_r_coef

 !  ! MO_R x R
   call dgemm( 'N', 'N', ao_num, mo_num, mo_num, 1.d0          &
--- a/src/tc_scf/fock_3e_bi_ortho_uhf.irp.f
+++ b/src/tc_scf/fock_3e_bi_ortho_uhf.irp.f
@ -310,46 +310,6 @@ BEGIN_PROVIDER [double precision, fock_3e_uhf_ao_a, (ao_num, ao_num)]
  deallocate(f_tmp)
 !$OMP END PARALLEL

-! TODO
-! !$OMP PARALLEL DEFAULT (NONE)                                                                &
-! !$OMP PRIVATE (g, e, d, k, mu, nu, dm_ge_a, dm_ge_b, dm_ge, dm_dk_a, dm_dk_b, dm_dk,         &
-! !$OMP          i_mugd_nuek, i_mugd_eknu, i_mugd_knue, i_mugd_nuke, i_mugd_enuk, i_mugd_kenu) &
-! !$OMP SHARED  (ao_num, TCSCF_bi_ort_dm_ao_alpha, TCSCF_bi_ort_dm_ao_beta, fock_3e_uhf_ao_a)
-! !$OMP DO
-!  do g = 1, ao_num
-!    do e = 1, ao_num
-!      dm_ge_a = TCSCF_bi_ort_dm_ao_alpha(g,e)
-!      dm_ge_b = TCSCF_bi_ort_dm_ao_beta (g,e)
-!      dm_ge   = dm_ge_a + dm_ge_b
-!      do d = 1, ao_num
-!        do k = 1, ao_num
-!          dm_dk_a = TCSCF_bi_ort_dm_ao_alpha(d,k)
-!          dm_dk_b = TCSCF_bi_ort_dm_ao_beta (d,k)
-!          dm_dk   = dm_dk_a + dm_dk_b
-!          do mu = 1, ao_num
-!            do nu = 1, ao_num
-!              call give_integrals_3_body_bi_ort_ao(mu, g, d, nu, e, k, i_mugd_nuek)
-!              call give_integrals_3_body_bi_ort_ao(mu, g, d, e, k, nu, i_mugd_eknu)
-!              call give_integrals_3_body_bi_ort_ao(mu, g, d, k, nu, e, i_mugd_knue)
-!              call give_integrals_3_body_bi_ort_ao(mu, g, d, nu, k, e, i_mugd_nuke)
-!              call give_integrals_3_body_bi_ort_ao(mu, g, d, e, nu, k, i_mugd_enuk)
-!              call give_integrals_3_body_bi_ort_ao(mu, g, d, k, e, nu, i_mugd_kenu)
-!              fock_3e_uhf_ao_a(mu,nu) -= 0.5d0 * ( dm_ge   * dm_dk   * i_mugd_nuek &
-!                                                 + dm_ge_a * dm_dk_a * i_mugd_eknu &
-!                                                 + dm_ge_a * dm_dk_a * i_mugd_knue &
-!                                                 - dm_ge_a * dm_dk   * i_mugd_enuk &
-!                                                 - dm_ge   * dm_dk_a * i_mugd_kenu &
-!                                                 - dm_ge_a * dm_dk_a * i_mugd_nuke &
-!                                                 - dm_ge_b * dm_dk_b * i_mugd_nuke )
-!            enddo
-!          enddo
-!        enddo
-!      enddo
-!    enddo
-!  enddo
-! !$OMP END DO
-! !$OMP END PARALLEL
-
  call wall_time(tf)
  print *, ' total Wall time for fock_3e_uhf_ao_a =', tf - ti

@ -436,44 +396,6 @@ BEGIN_PROVIDER [double precision, fock_3e_uhf_ao_b, (ao_num, ao_num)]
  deallocate(f_tmp)
 !$OMP END PARALLEL

-! TODO
-! !$OMP PARALLEL DO DEFAULT (NONE)                                                             &
-! !$OMP PRIVATE (g, e, d, k, mu, nu, dm_ge_a, dm_ge_b, dm_ge, dm_dk_a, dm_dk_b, dm_dk,         &
-! !$OMP          i_mugd_nuek, i_mugd_eknu, i_mugd_knue, i_mugd_nuke, i_mugd_enuk, i_mugd_kenu) &
-! !$OMP SHARED  (ao_num, TCSCF_bi_ort_dm_ao_alpha, TCSCF_bi_ort_dm_ao_beta, fock_3e_uhf_ao_b)
-!  do g = 1, ao_num
-!    do e = 1, ao_num
-!      dm_ge_a = TCSCF_bi_ort_dm_ao_alpha(g,e)
-!      dm_ge_b = TCSCF_bi_ort_dm_ao_beta (g,e)
-!      dm_ge   = dm_ge_a + dm_ge_b
-!      do d = 1, ao_num
-!        do k = 1, ao_num
-!          dm_dk_a = TCSCF_bi_ort_dm_ao_alpha(d,k)
-!          dm_dk_b = TCSCF_bi_ort_dm_ao_beta (d,k)
-!          dm_dk   = dm_dk_a + dm_dk_b
-!          do mu = 1, ao_num
-!            do nu = 1, ao_num
-!              call give_integrals_3_body_bi_ort_ao(mu, g, d, nu, e, k, i_mugd_nuek)
-!              call give_integrals_3_body_bi_ort_ao(mu, g, d, e, k, nu, i_mugd_eknu)
-!              call give_integrals_3_body_bi_ort_ao(mu, g, d, k, nu, e, i_mugd_knue)
-!              call give_integrals_3_body_bi_ort_ao(mu, g, d, nu, k, e, i_mugd_nuke)
-!              call give_integrals_3_body_bi_ort_ao(mu, g, d, e, nu, k, i_mugd_enuk)
-!              call give_integrals_3_body_bi_ort_ao(mu, g, d, k, e, nu, i_mugd_kenu)
-!              fock_3e_uhf_ao_b(mu,nu) -= 0.5d0 * ( dm_ge   * dm_dk   * i_mugd_nuek &
-!                                                 + dm_ge_b * dm_dk_b * i_mugd_eknu &
-!                                                 + dm_ge_b * dm_dk_b * i_mugd_knue &
-!                                                 - dm_ge_b * dm_dk   * i_mugd_enuk &
-!                                                 - dm_ge   * dm_dk_b * i_mugd_kenu &
-!                                                 - dm_ge_b * dm_dk_b * i_mugd_nuke &
-!                                                 - dm_ge_a * dm_dk_a * i_mugd_nuke )
-!            enddo
-!          enddo
-!        enddo
-!      enddo
-!    enddo
-!  enddo
-! !$OMP END PARALLEL DO
-
  call wall_time(tf)
  print *, ' total Wall time for fock_3e_uhf_ao_b =', tf - ti

--- a/src/tc_scf/fock_tc.irp.f
+++ b/src/tc_scf/fock_tc.irp.f
@ -1,29 +1,30 @@

 ! ---

- BEGIN_PROVIDER [ double precision, two_e_tc_non_hermit_integral_alpha, (ao_num, ao_num)]
-&BEGIN_PROVIDER [ double precision, two_e_tc_non_hermit_integral_beta , (ao_num, ao_num)]
+ BEGIN_PROVIDER [ double precision, two_e_tc_non_hermit_integral_seq_alpha, (ao_num, ao_num)]
+&BEGIN_PROVIDER [ double precision, two_e_tc_non_hermit_integral_seq_beta , (ao_num, ao_num)]
+
  BEGIN_DOC
-! two_e_tc_non_hermit_integral_alpha(k,i) = <k| F^tc_alpha |i> 
+  !
+  ! two_e_tc_non_hermit_integral_seq_alpha(k,i) = <k| F^tc_alpha |i> 
  !
  ! where F^tc is the two-body part of the TC Fock matrix and k,i are AO basis functions
+  !
  END_DOC
+
  implicit none
  integer          :: i, j, k, l
  double precision :: density, density_a, density_b
+  double precision :: t0, t1

-  two_e_tc_non_hermit_integral_alpha = 0.d0
-  two_e_tc_non_hermit_integral_beta  = 0.d0
+  !print*, ' providing two_e_tc_non_hermit_integral_seq ...'
+  !call wall_time(t0)
+
+  two_e_tc_non_hermit_integral_seq_alpha = 0.d0
+  two_e_tc_non_hermit_integral_seq_beta  = 0.d0

-  !! TODO :: parallelization properly done
  do i = 1, ao_num
    do k = 1, ao_num
-!!$OMP PARALLEL                  &
-!!$OMP DEFAULT (NONE)            &
-!!$OMP PRIVATE (j,l,density_a,density_b,density) & 
-!!$OMP SHARED (i,k,ao_num,SCF_density_matrix_ao_alpha,SCF_density_matrix_ao_beta,ao_non_hermit_term_chemist) & 
-!!$OMP SHARED (two_e_tc_non_hermit_integral_alpha,two_e_tc_non_hermit_integral_beta)
-!!$OMP DO SCHEDULE (dynamic)
      do j = 1, ao_num
        do l = 1, ao_num

@ -31,6 +32,15 @@
          density_b = TCSCF_density_matrix_ao_beta (l,j)
          density   = density_a + density_b

+          !!                                         rho(l,j)   *      < k l| T | i j>
+          !two_e_tc_non_hermit_integral_seq_alpha(k,i) += density   * ao_two_e_tc_tot(l,j,k,i)
+          !!                                         rho(l,j)   *      < k l| T | i j>
+          !two_e_tc_non_hermit_integral_seq_beta (k,i) += density   * ao_two_e_tc_tot(l,j,k,i)
+          !!                                         rho_a(l,j) *      < l k| T | i j>
+          !two_e_tc_non_hermit_integral_seq_alpha(k,i) -= density_a * ao_two_e_tc_tot(k,j,l,i)
+          !!                                         rho_b(l,j) *      < l k| T | i j>
+          !two_e_tc_non_hermit_integral_seq_beta (k,i) -= density_b * ao_two_e_tc_tot(k,j,l,i)
+
          !!                                         rho(l,j)   *      < k l| T | i j>
          !two_e_tc_non_hermit_integral_alpha(k,i) += density   * ao_two_e_tc_tot(l,j,k,i)
          !!                                         rho(l,j)   *      < k l| T | i j>
@ -41,32 +51,106 @@
          !two_e_tc_non_hermit_integral_beta (k,i) -= density_b * ao_two_e_tc_tot(k,j,l,i)

          !                                         rho(l,j)   *      < k l| T | i j>
-          two_e_tc_non_hermit_integral_alpha(k,i) += density   * ao_two_e_tc_tot(k,i,l,j)
+          two_e_tc_non_hermit_integral_seq_alpha(k,i) += density   * ao_two_e_tc_tot(k,i,l,j)
          !                                         rho(l,j)   *      < k l| T | i j>
-          two_e_tc_non_hermit_integral_beta (k,i) += density   * ao_two_e_tc_tot(k,i,l,j)
+          two_e_tc_non_hermit_integral_seq_beta (k,i) += density   * ao_two_e_tc_tot(k,i,l,j)
          !                                         rho_a(l,j) *      < k l| T | j i>
-          two_e_tc_non_hermit_integral_alpha(k,i) -= density_a * ao_two_e_tc_tot(k,j,l,i)
+          two_e_tc_non_hermit_integral_seq_alpha(k,i) -= density_a * ao_two_e_tc_tot(k,j,l,i)
          !                                         rho_b(l,j) *      < k l| T | j i>
-          two_e_tc_non_hermit_integral_beta (k,i) -= density_b * ao_two_e_tc_tot(k,j,l,i)
+          two_e_tc_non_hermit_integral_seq_beta (k,i) -= density_b * ao_two_e_tc_tot(k,j,l,i)

        enddo
      enddo
-!!$OMP END DO
-!!$OMP END PARALLEL
    enddo
  enddo

+  !call wall_time(t1)
+  !print*, ' wall time for two_e_tc_non_hermit_integral_seq after = ', t1 - t0
+
+END_PROVIDER 
+
+! ---
+
+ BEGIN_PROVIDER [ double precision, two_e_tc_non_hermit_integral_alpha, (ao_num, ao_num)]
+&BEGIN_PROVIDER [ double precision, two_e_tc_non_hermit_integral_beta , (ao_num, ao_num)]
+
+  BEGIN_DOC
+  !
+  ! two_e_tc_non_hermit_integral_alpha(k,i) = <k| F^tc_alpha |i> 
+  !
+  ! where F^tc is the two-body part of the TC Fock matrix and k,i are AO basis functions
+  !
+  END_DOC
+
+  implicit none
+  integer                       :: i, j, k, l
+  double precision              :: density, density_a, density_b, I_coul, I_kjli
+  double precision              :: t0, t1
+  double precision, allocatable :: tmp_a(:,:), tmp_b(:,:)
+
+  !print*, ' providing two_e_tc_non_hermit_integral ...'
+  !call wall_time(t0)
+
+  two_e_tc_non_hermit_integral_alpha = 0.d0
+  two_e_tc_non_hermit_integral_beta  = 0.d0
+
+ !$OMP PARALLEL DEFAULT (NONE)                                                                        &
+ !$OMP PRIVATE (i, j, k, l, density_a, density_b, density, tmp_a, tmp_b, I_coul, I_kjli)              &
+ !$OMP SHARED  (ao_num, TCSCF_density_matrix_ao_alpha, TCSCF_density_matrix_ao_beta, ao_two_e_tc_tot, &
+ !$OMP         two_e_tc_non_hermit_integral_alpha, two_e_tc_non_hermit_integral_beta)
+
+  allocate(tmp_a(ao_num,ao_num), tmp_b(ao_num,ao_num))
+  tmp_a = 0.d0
+  tmp_b = 0.d0
+
+ !$OMP DO
+  do j = 1, ao_num
+    do l = 1, ao_num
+      density_a = TCSCF_density_matrix_ao_alpha(l,j)
+      density_b = TCSCF_density_matrix_ao_beta (l,j)
+      density   = density_a + density_b                      
+      do i = 1, ao_num
+        do k = 1, ao_num
+
+          I_coul = density * ao_two_e_tc_tot(k,i,l,j)
+          I_kjli = ao_two_e_tc_tot(k,j,l,i)
+
+          tmp_a(k,i) += I_coul - density_a * I_kjli
+          tmp_b(k,i) += I_coul - density_b * I_kjli
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO NOWAIT
+
+ !$OMP CRITICAL
+  do i = 1, ao_num
+    do j = 1, ao_num
+      two_e_tc_non_hermit_integral_alpha(j,i) += tmp_a(j,i)
+      two_e_tc_non_hermit_integral_beta (j,i) += tmp_b(j,i)
+    enddo
+  enddo
+ !$OMP END CRITICAL
+
+  deallocate(tmp_a, tmp_b)
+ !$OMP END PARALLEL
+
+  !call wall_time(t1)
+  !print*, ' wall time for two_e_tc_non_hermit_integral after = ', t1 - t0
+
 END_PROVIDER 

 ! ---

 BEGIN_PROVIDER [ double precision, Fock_matrix_tc_ao_alpha, (ao_num, ao_num)]
-  implicit none
+
  BEGIN_DOC
  ! Total alpha TC Fock matrix : h_c + Two-e^TC terms on the AO basis
  END_DOC
-  Fock_matrix_tc_ao_alpha =  ao_one_e_integrals_tc_tot &
-                          + two_e_tc_non_hermit_integral_alpha 
+
+  implicit none
+
+  Fock_matrix_tc_ao_alpha =  ao_one_e_integrals_tc_tot + two_e_tc_non_hermit_integral_alpha 

 END_PROVIDER 

@ -77,10 +161,10 @@ BEGIN_PROVIDER [ double precision, Fock_matrix_tc_ao_beta, (ao_num, ao_num)]
  BEGIN_DOC
  ! Total beta TC Fock matrix : h_c + Two-e^TC terms on the AO basis
  END_DOC
+
  implicit none

-  Fock_matrix_tc_ao_beta = ao_one_e_integrals_tc_tot &
-                         + two_e_tc_non_hermit_integral_beta 
+  Fock_matrix_tc_ao_beta = ao_one_e_integrals_tc_tot + two_e_tc_non_hermit_integral_beta 

 END_PROVIDER 

@ -171,25 +255,38 @@ END_PROVIDER

  do i = 1, elec_beta_num ! doc --> SOMO
    do k = elec_beta_num+1, elec_alpha_num
-      grad_non_hermit_left  += dabs(Fock_matrix_tc_mo_tot(k,i))
-      grad_non_hermit_right += dabs(Fock_matrix_tc_mo_tot(i,k))
+      grad_non_hermit_left  = max(grad_non_hermit_left , dabs(Fock_matrix_tc_mo_tot(k,i)))
+      grad_non_hermit_right = max(grad_non_hermit_right, dabs(Fock_matrix_tc_mo_tot(i,k)))
+      !grad_non_hermit_left  += dabs(Fock_matrix_tc_mo_tot(k,i))
+      !grad_non_hermit_right += dabs(Fock_matrix_tc_mo_tot(i,k))
+      !grad_non_hermit_left  += Fock_matrix_tc_mo_tot(k,i) * Fock_matrix_tc_mo_tot(k,i)
+      !grad_non_hermit_right += Fock_matrix_tc_mo_tot(i,k) * Fock_matrix_tc_mo_tot(i,k)
    enddo
  enddo

  do i = 1, elec_beta_num ! doc --> virt 
    do k = elec_alpha_num+1, mo_num
-      grad_non_hermit_left  += dabs(Fock_matrix_tc_mo_tot(k,i))
-      grad_non_hermit_right += dabs(Fock_matrix_tc_mo_tot(i,k))
+      grad_non_hermit_left  = max(grad_non_hermit_left , dabs(Fock_matrix_tc_mo_tot(k,i)))
+      grad_non_hermit_right = max(grad_non_hermit_right, dabs(Fock_matrix_tc_mo_tot(i,k)))
+      !grad_non_hermit_left  += dabs(Fock_matrix_tc_mo_tot(k,i))
+      !grad_non_hermit_right += dabs(Fock_matrix_tc_mo_tot(i,k))
+      grad_non_hermit_left  += Fock_matrix_tc_mo_tot(k,i) * Fock_matrix_tc_mo_tot(k,i)
+      grad_non_hermit_right += Fock_matrix_tc_mo_tot(i,k) * Fock_matrix_tc_mo_tot(i,k)
    enddo
  enddo

  do i = elec_beta_num+1, elec_alpha_num ! SOMO --> virt 
    do k = elec_alpha_num+1, mo_num
-      grad_non_hermit_left  += dabs(Fock_matrix_tc_mo_tot(k,i))
-      grad_non_hermit_right += dabs(Fock_matrix_tc_mo_tot(i,k))
+      grad_non_hermit_left  = max(grad_non_hermit_left , dabs(Fock_matrix_tc_mo_tot(k,i)))
+      grad_non_hermit_right = max(grad_non_hermit_right, dabs(Fock_matrix_tc_mo_tot(i,k)))
+      !grad_non_hermit_left  += dabs(Fock_matrix_tc_mo_tot(k,i))
+      !grad_non_hermit_right += dabs(Fock_matrix_tc_mo_tot(i,k))
+      grad_non_hermit_left  += Fock_matrix_tc_mo_tot(k,i) * Fock_matrix_tc_mo_tot(k,i)
+      grad_non_hermit_right += Fock_matrix_tc_mo_tot(i,k) * Fock_matrix_tc_mo_tot(i,k)
    enddo
  enddo

+  !grad_non_hermit = dsqrt(grad_non_hermit_left) + dsqrt(grad_non_hermit_right)
  grad_non_hermit = grad_non_hermit_left + grad_non_hermit_right

 END_PROVIDER 
--- a/src/tc_scf/fock_three.irp.f
+++ b/src/tc_scf/fock_three.irp.f
@ -79,6 +79,8 @@ BEGIN_PROVIDER [double precision, diag_three_elem_hf]
  double precision :: contrib, weight, four_third, one_third, two_third, exchange_int_231
  double precision :: integral_aaa, hthree, integral_aab, integral_abb, integral_bbb

+  PROVIDE mo_l_coef mo_r_coef
+
  !print *, ' providing diag_three_elem_hf'

  if(.not. three_body_h_tc) then
--- a/src/tc_scf/rh_tcscf_diis.irp.f
+++ b/src/tc_scf/rh_tcscf_diis.irp.f
@ -0,0 +1,362 @@
+! ---
+
+subroutine rh_tcscf_diis()
+
+  implicit none
+
+  integer                       :: i, j, it
+  integer                       :: dim_DIIS, index_dim_DIIS
+  double precision              :: etc_tot, etc_1e, etc_2e, etc_3e, e_save, e_delta
+  double precision              :: tc_grad, g_save, g_delta, g_delta_th
+  double precision              :: level_shift_save, rate_th
+  double precision              :: t0, t1
+  double precision              :: er_DIIS, er_delta, er_save, er_delta_th
+  double precision, allocatable :: F_DIIS(:,:,:), E_DIIS(:,:,:)
+  double precision, allocatable :: mo_r_coef_save(:,:), mo_l_coef_save(:,:)
+
+  logical, external             :: qp_stop
+
+  it          = 0
+  e_save      = 0.d0
+  dim_DIIS    = 0
+  g_delta_th  = 1d0
+  er_delta_th = 1d0
+  rate_th     = 100.d0 !0.01d0 !0.2d0
+
+  allocate(mo_r_coef_save(ao_num,mo_num), mo_l_coef_save(ao_num,mo_num))
+  mo_l_coef_save = 0.d0
+  mo_r_coef_save = 0.d0
+
+  allocate(F_DIIS(ao_num,ao_num,max_dim_DIIS_TCSCF), E_DIIS(ao_num,ao_num,max_dim_DIIS_TCSCF))
+  F_DIIS = 0.d0
+  E_DIIS = 0.d0
+
+  call write_time(6)
+
+  ! ---
+
+  PROVIDE level_shift_TCSCF
+  PROVIDE mo_l_coef mo_r_coef
+
+  write(6, '(A4,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A4, 1X, A8)')              &
+    '====', '================', '================', '================', '================', '================' &
+          , '================', '================', '================', '====', '========'
+
+  write(6, '(A4,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A4, 1X, A8)')              &
+    ' it ', '  SCF TC Energy ', '      E(1e)     ', '      E(2e)     ', '      E(3e)     ', '   energy diff  ' &
+          , '    gradient    ', '    DIIS error  ', '  level shift   ', 'DIIS', '  WT (m)'
+
+  write(6, '(A4,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A4, 1X, A8)')              &
+    '====', '================', '================', '================', '================', '================' &
+          , '================', '================', '================', '====', '========'
+
+
+  ! first iteration (HF orbitals)
+  call wall_time(t0)
+
+  etc_tot = TC_HF_energy
+  etc_1e  = TC_HF_one_e_energy
+  etc_2e  = TC_HF_two_e_energy
+  etc_3e  = 0.d0
+  if(three_body_h_tc) then
+    etc_3e = diag_three_elem_hf
+  endif
+  tc_grad = grad_non_hermit
+  er_DIIS = maxval(abs(FQS_SQF_mo))
+  e_delta = dabs(etc_tot - e_save)
+
+  e_save  = etc_tot
+  g_save  = tc_grad
+  er_save = er_DIIS
+
+  call wall_time(t1)
+  write(6, '(I4,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, I4,1X, F8.2)')  &
+    it, etc_tot, etc_1e, etc_2e, etc_3e, e_delta, tc_grad, er_DIIS, level_shift_tcscf, dim_DIIS, (t1-t0)/60.d0
+
+  ! ---
+
+  PROVIDE FQS_SQF_ao Fock_matrix_tc_ao_tot
+
+  do while((tc_grad .gt. dsqrt(thresh_tcscf)) .and. (er_DIIS .gt. threshold_DIIS_nonzero_TCSCF))
+
+    call wall_time(t0)
+
+    it += 1
+    if(it > n_it_TCSCF_max) then
+      print *, ' max of TCSCF iterations is reached ', n_it_TCSCF_max
+      stop
+    endif
+
+    dim_DIIS = min(dim_DIIS+1, max_dim_DIIS_TCSCF)
+
+    ! ---
+
+    if(dabs(e_delta) > 1.d-12) then
+
+      index_dim_DIIS = mod(dim_DIIS-1, max_dim_DIIS_TCSCF) + 1
+      do j = 1, ao_num
+        do i = 1, ao_num
+          F_DIIS(i,j,index_dim_DIIS) = Fock_matrix_tc_ao_tot(i,j)
+          E_DIIS(i,j,index_dim_DIIS) = FQS_SQF_ao           (i,j)
+        enddo
+      enddo
+
+      call extrapolate_TC_Fock_matrix(E_DIIS, F_DIIS, Fock_matrix_tc_ao_tot, size(Fock_matrix_tc_ao_tot, 1), it, dim_DIIS)
+
+      call ao_to_mo_bi_ortho( Fock_matrix_tc_ao_tot, size(Fock_matrix_tc_ao_tot, 1) &
+                            , Fock_matrix_tc_mo_tot, size(Fock_matrix_tc_mo_tot, 1) )
+      TOUCH Fock_matrix_tc_mo_tot fock_matrix_tc_diag_mo_tot
+    endif
+
+    ! ---
+
+    mo_l_coef(1:ao_num,1:mo_num) = fock_tc_leigvec_ao(1:ao_num,1:mo_num)
+    mo_r_coef(1:ao_num,1:mo_num) = fock_tc_reigvec_ao(1:ao_num,1:mo_num)
+    !call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef)
+    !call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef)
+    TOUCH mo_l_coef mo_r_coef
+
+    ! ---
+
+    g_delta  = grad_non_hermit         -  g_save
+    er_delta = maxval(abs(FQS_SQF_mo)) - er_save
+
+    !if((g_delta > rate_th * g_delta_th) .and. (er_delta > rate_th * er_delta_th) .and. (it > 1)) then
+    if((g_delta > rate_th * g_delta_th) .and. (it > 1)) then
+    !if((g_delta > 0.d0) .and. (it > 1)) then
+
+      Fock_matrix_tc_ao_tot(1:ao_num,1:ao_num) = F_DIIS(1:ao_num,1:ao_num,index_dim_DIIS)
+      call ao_to_mo_bi_ortho( Fock_matrix_tc_ao_tot, size(Fock_matrix_tc_ao_tot, 1) &
+                            , Fock_matrix_tc_mo_tot, size(Fock_matrix_tc_mo_tot, 1) )
+      TOUCH Fock_matrix_tc_mo_tot fock_matrix_tc_diag_mo_tot
+
+      mo_l_coef(1:ao_num,1:mo_num) = fock_tc_leigvec_ao(1:ao_num,1:mo_num)
+      mo_r_coef(1:ao_num,1:mo_num) = fock_tc_reigvec_ao(1:ao_num,1:mo_num)
+      !call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef)
+      !call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef)
+      TOUCH mo_l_coef mo_r_coef
+
+    endif
+
+    ! ---
+
+    g_delta  = grad_non_hermit         -  g_save
+    er_delta = maxval(abs(FQS_SQF_mo)) - er_save
+
+    mo_l_coef_save(1:ao_num,1:mo_num) = mo_l_coef(1:ao_num,1:mo_num)
+    mo_r_coef_save(1:ao_num,1:mo_num) = mo_r_coef(1:ao_num,1:mo_num)
+
+    !do while((g_delta > rate_th * g_delta_th) .and. (er_delta > rate_th * er_delta_th) .and. (it > 1))
+    do while((g_delta > rate_th * g_delta_th) .and. (it > 1))
+      print *, ' big or bad step : ', g_delta, rate_th * g_delta_th
+
+      mo_l_coef(1:ao_num,1:mo_num) = mo_l_coef_save(1:ao_num,1:mo_num) 
+      mo_r_coef(1:ao_num,1:mo_num) = mo_r_coef_save(1:ao_num,1:mo_num) 
+      if(level_shift_TCSCF <= .1d0) then
+        level_shift_TCSCF = 1.d0
+      else
+        level_shift_TCSCF = level_shift_TCSCF * 3.0d0
+      endif
+      TOUCH mo_l_coef mo_r_coef level_shift_TCSCF
+
+      mo_l_coef(1:ao_num,1:mo_num) = fock_tc_leigvec_ao(1:ao_num,1:mo_num)
+      mo_r_coef(1:ao_num,1:mo_num) = fock_tc_reigvec_ao(1:ao_num,1:mo_num)
+      !call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef)
+      !call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef)
+      TOUCH mo_l_coef mo_r_coef
+
+      g_delta  = grad_non_hermit         -  g_save
+      er_delta = maxval(abs(FQS_SQF_mo)) - er_save
+
+      if(level_shift_TCSCF - level_shift_save > 40.d0) then
+        level_shift_TCSCF = level_shift_save * 4.d0
+        SOFT_TOUCH level_shift_TCSCF
+        exit
+      endif
+
+      dim_DIIS = 0
+    enddo
+
+    ! ---
+
+    level_shift_TCSCF = level_shift_TCSCF * 0.5d0
+    SOFT_TOUCH level_shift_TCSCF
+
+    etc_tot = TC_HF_energy
+    etc_1e  = TC_HF_one_e_energy
+    etc_2e  = TC_HF_two_e_energy
+    etc_3e  = 0.d0
+    if(three_body_h_tc) then
+      etc_3e = diag_three_elem_hf
+    endif
+    tc_grad  = grad_non_hermit
+    er_DIIS  = maxval(abs(FQS_SQF_mo))
+    e_delta  = dabs(etc_tot - e_save)
+    g_delta  = tc_grad - g_save
+    er_delta = er_DIIS - er_save
+    
+    e_save           = etc_tot
+    g_save           = tc_grad
+    level_shift_save = level_shift_TCSCF
+    er_save          = er_DIIS
+
+    g_delta_th  = dabs(tc_grad) ! g_delta)
+    er_delta_th = dabs(er_DIIS) !er_delta)
+
+    call wall_time(t1)
+    write(6, '(I4,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, I4,1X, F8.2)')  &
+      it, etc_tot, etc_1e, etc_2e, etc_3e, e_delta, tc_grad, er_DIIS, level_shift_tcscf, dim_DIIS, (t1-t0)/60.d0
+
+    if(g_delta .lt. 0.d0) then
+      call ezfio_set_tc_scf_bitc_energy(etc_tot)
+      call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef)
+      call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef)
+    endif
+
+    if(qp_stop()) exit
+  enddo
+
+  ! ---
+
+  print *, ' TCSCF DIIS converged !'
+  call print_energy_and_mos()
+
+  call write_time(6)
+
+  deallocate(mo_r_coef_save, mo_l_coef_save, F_DIIS, E_DIIS)
+
+  call ezfio_set_tc_scf_bitc_energy(TC_HF_energy)
+  call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef)
+  call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef)
+
+end
+
+! ---
+
+subroutine extrapolate_TC_Fock_matrix(E_DIIS, F_DIIS, F_ao, size_F_ao, it, dim_DIIS)
+
+  BEGIN_DOC
+  !
+  ! Compute the extrapolated Fock matrix using the DIIS procedure
+  !
+  ! e = \sum_i c_i e_i and \sum_i c_i = 1 
+  ! ==> lagrange multiplier with L = |e|^2 - \lambda (\sum_i c_i = 1)
+  !
+  END_DOC
+
+  implicit none
+
+  integer,          intent(in)    :: it, size_F_ao
+  integer,          intent(inout) :: dim_DIIS
+  double precision, intent(in)    :: F_DIIS(ao_num,ao_num,dim_DIIS)
+  double precision, intent(in)    :: E_DIIS(ao_num,ao_num,dim_DIIS)
+  double precision, intent(inout) :: F_ao(size_F_ao,ao_num)
+
+  double precision, allocatable   :: B_matrix_DIIS(:,:), X_vector_DIIS(:), C_vector_DIIS(:)
+
+  integer                         :: i, j, k, l, i_DIIS, j_DIIS
+  integer                         :: lwork
+  double precision                :: rcond, ferr, berr
+  integer,          allocatable   :: iwork(:)
+  double precision, allocatable   :: scratch(:,:)
+
+  if(dim_DIIS < 1) then
+    return
+  endif
+
+  allocate( B_matrix_DIIS(dim_DIIS+1,dim_DIIS+1), X_vector_DIIS(dim_DIIS+1) &
+          , C_vector_DIIS(dim_DIIS+1), scratch(ao_num,ao_num) )
+
+  ! Compute the matrices B and X
+  B_matrix_DIIS(:,:) = 0.d0
+  do j = 1, dim_DIIS
+    j_DIIS = min(dim_DIIS, mod(it-j, max_dim_DIIS_TCSCF)+1)
+
+    do i = 1, dim_DIIS
+      i_DIIS = min(dim_DIIS, mod(it-i, max_dim_DIIS_TCSCF)+1)
+
+      ! Compute product of two errors vectors
+      do l = 1, ao_num
+        do k = 1, ao_num
+          B_matrix_DIIS(i,j) = B_matrix_DIIS(i,j) + E_DIIS(k,l,i_DIIS) * E_DIIS(k,l,j_DIIS)
+        enddo
+      enddo
+
+    enddo
+  enddo
+
+  ! Pad B matrix and build the X matrix
+
+  C_vector_DIIS(:) = 0.d0
+  do i = 1, dim_DIIS
+    B_matrix_DIIS(i,dim_DIIS+1) = -1.d0
+    B_matrix_DIIS(dim_DIIS+1,i) = -1.d0
+  enddo
+  C_vector_DIIS(dim_DIIS+1) = -1.d0
+
+  deallocate(scratch)
+
+  ! Estimate condition number of B
+  integer                       :: info
+  double precision              :: anorm
+  integer,          allocatable :: ipiv(:)
+  double precision, allocatable :: AF(:,:)
+  double precision, external :: dlange
+
+  lwork = max((dim_DIIS+1)**2, (dim_DIIS+1)*5)
+  allocate(AF(dim_DIIS+1,dim_DIIS+1))
+  allocate(ipiv(2*(dim_DIIS+1)), iwork(2*(dim_DIIS+1)) )
+  allocate(scratch(lwork,1))
+  scratch(:,1) = 0.d0
+
+  anorm = dlange('1', dim_DIIS+1, dim_DIIS+1, B_matrix_DIIS, size(B_matrix_DIIS, 1), scratch(1,1))
+
+  AF(:,:) = B_matrix_DIIS(:,:)
+  call dgetrf(dim_DIIS+1, dim_DIIS+1, AF, size(AF, 1), ipiv, info)
+  if(info /= 0) then
+    dim_DIIS = 0
+    return
+  endif
+
+  call dgecon('1', dim_DIIS+1, AF, size(AF, 1), anorm, rcond, scratch, iwork, info)
+  if(info /= 0) then
+    dim_DIIS = 0
+    return
+  endif
+
+  if(rcond < 1.d-14) then
+    dim_DIIS = 0
+    return
+  endif
+
+  ! solve the linear system C = B x X
+
+  X_vector_DIIS = C_vector_DIIS
+  call dgesv(dim_DIIS+1, 1, B_matrix_DIIS, size(B_matrix_DIIS, 1), ipiv , X_vector_DIIS, size(X_vector_DIIS, 1), info)
+
+  deallocate(scratch, AF, iwork)
+  if(info < 0) then
+    stop ' bug in TC-DIIS'
+  endif
+
+  ! Compute extrapolated Fock matrix
+
+  !$OMP PARALLEL DO PRIVATE(i,j,k) DEFAULT(SHARED) if (ao_num > 200)
+  do j = 1, ao_num
+    do i = 1, ao_num
+      F_ao(i,j) = 0.d0
+    enddo
+    do k = 1, dim_DIIS
+      if(dabs(X_vector_DIIS(k)) < 1.d-10) cycle
+      do i = 1,ao_num
+        ! FPE here
+        F_ao(i,j) = F_ao(i,j) + X_vector_DIIS(k) * F_DIIS(i,j,dim_DIIS-k+1)
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+end
+
+! ---
+
--- a/src/tc_scf/rh_tcscf_simple.irp.f
+++ b/src/tc_scf/rh_tcscf_simple.irp.f
@ -0,0 +1,129 @@
+! ---
+
+subroutine rh_tcscf_simple()
+
+  implicit none
+  integer                       :: i, j, it, dim_DIIS
+  double precision              :: t0, t1
+  double precision              :: e_save, e_delta, rho_delta
+  double precision              :: etc_tot, etc_1e, etc_2e, etc_3e, tc_grad
+  double precision              :: er_DIIS
+  double precision, allocatable :: rho_old(:,:), rho_new(:,:)
+
+  allocate(rho_old(ao_num,ao_num), rho_new(ao_num,ao_num))
+
+  it       = 0
+  e_save   = 0.d0
+  dim_DIIS = 0
+
+  ! ---
+
+  if(.not. bi_ortho) then
+   print *, ' grad_hermit = ', grad_hermit
+   call save_good_hermit_tc_eigvectors
+   TOUCH mo_coef 
+   call save_mos
+  endif
+
+  ! ---
+
+  if(bi_ortho) then
+
+    PROVIDE level_shift_tcscf
+    PROVIDE mo_l_coef mo_r_coef
+
+    write(6, '(A4,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A4, 1X, A8)')              &
+      '====', '================', '================', '================', '================', '================' &
+            , '================', '================', '================', '====', '========'
+
+    write(6, '(A4,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A4, 1X, A8)')              &
+      ' it ', '  SCF TC Energy ', '      E(1e)     ', '      E(2e)     ', '      E(3e)     ', '   energy diff  ' &
+            , '    gradient    ', '    DIIS error  ', '  level shift   ', 'DIIS', '  WT (m)'
+
+    write(6, '(A4,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A4, 1X, A8)')              &
+      '====', '================', '================', '================', '================', '================' &
+            , '================', '================', '================', '====', '========'
+
+
+    ! first iteration (HF orbitals)
+    call wall_time(t0)
+
+    etc_tot = TC_HF_energy
+    etc_1e  = TC_HF_one_e_energy
+    etc_2e  = TC_HF_two_e_energy
+    etc_3e  = 0.d0
+    if(three_body_h_tc) then
+      etc_3e = diag_three_elem_hf
+    endif
+    tc_grad = grad_non_hermit
+    er_DIIS = maxval(abs(FQS_SQF_mo))
+    e_delta = dabs(etc_tot - e_save)
+    e_save  = etc_tot
+
+    call wall_time(t1)
+    write(6, '(I4,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, I4,1X, F8.2)')  &
+      it, etc_tot, etc_1e, etc_2e, etc_3e, e_delta, tc_grad, er_DIIS, level_shift_tcscf, dim_DIIS, (t1-t0)/60.d0
+
+    do while(tc_grad .gt. dsqrt(thresh_tcscf))
+      call wall_time(t0)
+
+      it += 1
+      if(it > n_it_tcscf_max) then
+        print *, ' max of TCSCF iterations is reached ', n_it_TCSCF_max
+        stop
+      endif
+
+      mo_l_coef = fock_tc_leigvec_ao
+      mo_r_coef = fock_tc_reigvec_ao
+      call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef)
+      call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef)
+      TOUCH mo_l_coef mo_r_coef
+
+      etc_tot = TC_HF_energy
+      etc_1e  = TC_HF_one_e_energy
+      etc_2e  = TC_HF_two_e_energy
+      etc_3e  = 0.d0
+      if(three_body_h_tc) then
+        etc_3e = diag_three_elem_hf
+      endif
+      tc_grad = grad_non_hermit
+      er_DIIS = maxval(abs(FQS_SQF_mo))
+      e_delta = dabs(etc_tot - e_save)
+      e_save  = etc_tot
+
+      call ezfio_set_tc_scf_bitc_energy(etc_tot)
+
+      call wall_time(t1)
+      write(6, '(I4,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, I4,1X, F8.2)')  &
+        it, etc_tot, etc_1e, etc_2e, etc_3e, e_delta, tc_grad, er_DIIS, level_shift_tcscf, dim_DIIS, (t1-t0)/60.d0
+    enddo
+
+  else
+
+   do while( (grad_hermit.gt.dsqrt(thresh_tcscf)) .and. (it.lt.n_it_tcscf_max) )
+      print*,'grad_hermit = ',grad_hermit
+      it += 1
+      print *, 'iteration = ', it
+      print *, '***'
+      print *, 'TC HF total energy = ', TC_HF_energy
+      print *, 'TC HF 1 e   energy = ', TC_HF_one_e_energy
+      print *, 'TC HF 2 e   energy = ', TC_HF_two_e_energy
+      print *, 'TC HF 3 body       = ', diag_three_elem_hf
+      print *, '***'
+      print *, ''
+      call save_good_hermit_tc_eigvectors
+      TOUCH mo_coef 
+      call save_mos
+    enddo
+
+  endif
+
+  print *, ' TCSCF Simple converged !'
+  call print_energy_and_mos()
+
+  deallocate(rho_old, rho_new)
+
+end
+
+! ---
+
--- a/src/tc_scf/rotate_tcscf_orbitals.irp.f
+++ b/src/tc_scf/rotate_tcscf_orbitals.irp.f
@ -260,14 +260,10 @@ subroutine fix_right_to_one()
  integer                       :: i, j, m, n, mm, tot_deg
  double precision              :: accu_d, accu_nd
  double precision              :: de_thr, ei, ej, de
-  double precision              :: thr_d, thr_nd
  integer,          allocatable :: deg_num(:)
  double precision, allocatable :: R0(:,:), L0(:,:), W(:,:), e0(:)
  double precision, allocatable :: R(:,:), L(:,:), S(:,:), Stmp(:,:), tmp(:,:)

-  thr_d  = 1d-7
-  thr_nd = 1d-7
-
  n = ao_num
  m = mo_num

@ -340,7 +336,7 @@ subroutine fix_right_to_one()
      ! ---

      call impose_weighted_orthog_svd(n, mm, W, R)
-      call impose_weighted_biorthog_qr(n, mm, thr_d, thr_nd, R, W, L)
+      call impose_weighted_biorthog_qr(n, mm, thresh_biorthog_diag, thresh_biorthog_nondiag, R, W, L)

      ! ---

@ -353,7 +349,7 @@ subroutine fix_right_to_one()
    endif
  enddo

-  call check_weighted_biorthog_binormalize(n, m, L0, W, R0, thr_d, thr_nd, .true.)
+  call check_weighted_biorthog_binormalize(n, m, L0, W, R0, thresh_biorthog_diag, thresh_biorthog_nondiag, .true.)

  deallocate(W, deg_num)

--- a/src/tc_scf/routines_rotates.irp.f
+++ b/src/tc_scf/routines_rotates.irp.f
@ -259,7 +259,7 @@ subroutine orthog_functions(m, n, coef, overlap)
  double precision, intent(in)    :: overlap(m,m)
  double precision, intent(inout) :: coef(m,n)
  double precision, allocatable   :: stmp(:,:)
-  integer                         :: j
+  integer                         :: j, k

  allocate(stmp(n,n))
  call build_s_matrix(m, n, coef, coef, overlap, stmp)
@ -270,7 +270,13 @@ subroutine orthog_functions(m, n, coef, overlap)
  call impose_orthog_svd_overlap(m, n, coef, overlap)
  call build_s_matrix(m, n, coef, coef, overlap, stmp)
  do j = 1, n
-    coef(1,:m) *= 1.d0/dsqrt(stmp(j,j))
+    ! ---
+    ! TODO: MANU check ici
+    !coef(1,:m) *= 1.d0/dsqrt(stmp(j,j))
+    do k = 1, m
+      coef(k,j) *= 1.d0/dsqrt(stmp(j,j))
+    enddo
+    ! ---
  enddo
  call build_s_matrix(m, n, coef, coef, overlap, stmp)

--- a/src/tc_scf/tc_scf.irp.f
+++ b/src/tc_scf/tc_scf.irp.f
@ -1,3 +1,5 @@
+! ---
+
 program tc_scf

  BEGIN_DOC
@ -15,14 +17,20 @@ program tc_scf
 !  my_n_pt_a_grid = 26 ! small grid for quick debug
  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid

-  call create_guess()
-  call orthonormalize_mos()
+  PROVIDE mu_erf 
+  print *, ' mu = ', mu_erf
+  PROVIDE j1b_type
+  print *, ' j1b_type = ', j1b_type
+  print *, j1b_pen
+
+  !call create_guess()
+  !call orthonormalize_mos()

  PROVIDE tcscf_algorithm
  if(tcscf_algorithm == 'DIIS') then
-    call rh_tcscf()
+    call rh_tcscf_diis()
  elseif(tcscf_algorithm == 'Simple') then
-    call simple_tcscf()
+    call rh_tcscf_simple()
  else
    print *, ' not implemented yet', tcscf_algorithm
    stop
@ -35,11 +43,7 @@ end

 ! ---

-subroutine create_guess
-
-  BEGIN_DOC
-  !   Create a MO guess if no MOs are present in the EZFIO directory
-  END_DOC
+subroutine create_guess()

  implicit none
  logical :: exists
@ -54,10 +58,7 @@ subroutine create_guess
      mo_coef = ao_ortho_lowdin_coef
      call restore_symmetry(ao_num, mo_num, mo_coef, size(mo_coef, 1), 1.d-10)
      TOUCH mo_coef
-      call mo_as_eigvectors_of_mo_matrix(mo_one_e_integrals,     &
-          size(mo_one_e_integrals,1),                            &
-          size(mo_one_e_integrals,2),                            &
-          mo_label,1,.false.)
+      call mo_as_eigvectors_of_mo_matrix(mo_one_e_integrals, size(mo_one_e_integrals, 1), size(mo_one_e_integrals, 2), mo_label, 1, .false.)
      call restore_symmetry(ao_num, mo_num, mo_coef, size(mo_coef, 1), 1.d-10)
      SOFT_TOUCH mo_coef
    elseif (mo_guess_type == "Huckel") then
@ -72,198 +73,3 @@ subroutine create_guess
 end subroutine create_guess

 ! ---
-
-subroutine simple_tcscf()
-
-  implicit none
-  integer                       :: i, j, it
-  double precision              :: e_save, e_delta, rho_delta
-  double precision, allocatable :: rho_old(:,:), rho_new(:,:)
-
-  allocate(rho_old(ao_num,ao_num), rho_new(ao_num,ao_num))
-
-  it = 0
-  print*,'iteration = ', it
-
-  !print*,'grad_hermit = ', grad_hermit
-  print*,'***'
-  print*,'TC HF total energy = ', TC_HF_energy
-  print*,'TC HF 1 e   energy = ', TC_HF_one_e_energy
-  print*,'TC HF 2 e   energy = ', TC_HF_two_e_energy
-  if(three_body_h_tc) then
-   print*,'TC HF 3 body       = ', diag_three_elem_hf
-  endif
-  print*,'***'
-  e_delta = 10.d0
-  e_save  = 0.d0 !TC_HF_energy
-  rho_delta = 10.d0
-
-
-  if(bi_ortho)then
-
-   mo_l_coef = fock_tc_leigvec_ao
-   mo_r_coef = fock_tc_reigvec_ao
-   rho_old   = TCSCF_bi_ort_dm_ao
-   call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef)
-   call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef)
-   TOUCH mo_l_coef mo_r_coef
-
-  else
-
-   print *, ' grad_hermit = ', grad_hermit
-   call save_good_hermit_tc_eigvectors
-   TOUCH mo_coef 
-   call save_mos
-
-  endif
-
-  ! ---
-
-  if(bi_ortho) then
-
-    !do while(e_delta .gt. dsqrt(thresh_tcscf)) )
-    !do while(e_delta .gt. thresh_tcscf) )
-    !do while(rho_delta .gt. thresh_tcscf) )
-    !do while(grad_non_hermit_right .gt. dsqrt(thresh_tcscf))
-    do while(grad_non_hermit .gt. dsqrt(thresh_tcscf))
-
-      it += 1
-      if(it > n_it_tcscf_max) then
-        print *, ' max of TCSCF iterations is reached ', n_it_TCSCF_max
-        stop
-      endif
-
-
-      print *, ' ***'
-      print *, ' iteration = ', it
-
-      print *, ' TC HF total energy = ', TC_HF_energy
-      print *, ' TC HF 1 e   energy = ', TC_HF_one_e_energy
-      print *, ' TC HF 2 non hermit = ', TC_HF_two_e_energy
-      if(three_body_h_tc) then
-        print *, ' TC HF 3 body       = ', diag_three_elem_hf
-      endif
-      e_delta = dabs(TC_HF_energy - e_save)
-
-      print *, ' delta E           = ', e_delta
-      print *, ' gradient          = ', grad_non_hermit
-      print *, ' max TC DIIS error = ', maxval(abs(FQS_SQF_mo))
-
-      !print *, ' gradient= ', grad_non_hermit_right
-
-      !rho_new   = TCSCF_bi_ort_dm_ao
-      !!print*, rho_new
-      !rho_delta = 0.d0
-      !do i = 1, ao_num 
-      !  do j = 1, ao_num 
-      !    rho_delta += dabs(rho_new(j,i) - rho_old(j,i))
-      !  enddo
-      !enddo
-      !print *, ' rho_delta =', rho_delta
-      !rho_old = rho_new
-
-      e_save    = TC_HF_energy
-      mo_l_coef = fock_tc_leigvec_ao
-      mo_r_coef = fock_tc_reigvec_ao
-      call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef)
-      call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef)
-      TOUCH mo_l_coef mo_r_coef
-      call ezfio_set_tc_scf_bitc_energy(TC_HF_energy)
-
-      call test_fock_3e_uhf_mo()
-
-      print *, ' ***'
-      print *, ''
-
-    enddo
-
-  else
-   do while( (grad_hermit.gt.dsqrt(thresh_tcscf)) .and. it .lt. n_it_tcscf_max )
-      print*,'grad_hermit = ',grad_hermit
-      it += 1
-      print *, 'iteration = ', it
-      print *, '***'
-      print *, 'TC HF total energy = ', TC_HF_energy
-      print *, 'TC HF 1 e   energy = ', TC_HF_one_e_energy
-      print *, 'TC HF 2 e   energy = ', TC_HF_two_e_energy
-      print *, 'TC HF 3 body       = ', diag_three_elem_hf
-      print *, '***'
-      print *, ''
-      call save_good_hermit_tc_eigvectors
-      TOUCH mo_coef 
-      call save_mos
-
-    enddo
-
-  endif
-
-  print *, ' TCSCF Simple converged !'
-  call print_energy_and_mos()
-
-  deallocate(rho_old, rho_new)
-
-end subroutine simple_tcscf
-
-! ---
-
-subroutine test_fock_3e_uhf_mo()
-
-  implicit none
-  integer          :: i, j
-  double precision :: diff_tot, diff_ij, thr_ih, norm
-
-  thr_ih = 1d-12
-
-  PROVIDE fock_a_tot_3e_bi_orth fock_b_tot_3e_bi_orth
-  PROVIDE fock_3e_uhf_mo_a fock_3e_uhf_mo_b
-
-  ! ---
-
-  norm     = 0.d0
-  diff_tot = 0.d0
-  do i = 1, mo_num
-    do j = 1, mo_num
-
-      diff_ij = dabs(fock_3e_uhf_mo_a(j,i) - fock_a_tot_3e_bi_orth(j,i))
-      if(diff_ij .gt. thr_ih) then
-        !print *, ' difference on ', j, i
-        !print *, ' MANU : ', fock_a_tot_3e_bi_orth(j,i)
-        !print *, ' UHF  : ', fock_3e_uhf_mo_a     (j,i)
-        !stop
-      endif
-
-      norm     += dabs(fock_a_tot_3e_bi_orth(j,i))
-      diff_tot += diff_ij
-    enddo
-  enddo
-  print *, ' diff on F_a = ', diff_tot / norm
-  print *, '      norm_a = ', norm
-  print *, ' '
-
-  ! ---
-
-  norm     = 0.d0
-  diff_tot = 0.d0
-  do i = 1, mo_num
-    do j = 1, mo_num
-
-      diff_ij = dabs(fock_3e_uhf_mo_b(j,i) - fock_b_tot_3e_bi_orth(j,i))
-      if(diff_ij .gt. thr_ih) then
-        !print *, ' difference on ', j, i
-        !print *, ' MANU : ', fock_b_tot_3e_bi_orth(j,i)
-        !print *, ' UHF  : ', fock_3e_uhf_mo_b     (j,i)
-        !stop
-      endif
-
-      norm     += dabs(fock_b_tot_3e_bi_orth(j,i))
-      diff_tot += diff_ij
-    enddo
-  enddo
-  print *, ' diff on F_b = ', diff_tot/norm
-  print *, '      norm_b = ', norm
-  print *, ' '
-
-  ! ---
-
-end subroutine test_fock_3e_uhf_mo()
-
--- a/src/tc_scf/tc_scf_dm.irp.f
+++ b/src/tc_scf/tc_scf_dm.irp.f
@ -1,8 +1,11 @@
 ! ---

 BEGIN_PROVIDER [ double precision, TCSCF_density_matrix_ao_beta, (ao_num, ao_num) ]
+
  implicit none
+
  if(bi_ortho) then
+    PROVIDE mo_l_coef mo_r_coef
    TCSCF_density_matrix_ao_beta = TCSCF_bi_ort_dm_ao_beta
  else
    TCSCF_density_matrix_ao_beta = SCF_density_matrix_ao_beta
@ -12,8 +15,11 @@ END_PROVIDER
 ! ---

 BEGIN_PROVIDER [ double precision, TCSCF_density_matrix_ao_alpha, (ao_num, ao_num) ]
+
  implicit none
+
  if(bi_ortho) then
+    PROVIDE mo_l_coef mo_r_coef
    TCSCF_density_matrix_ao_alpha = TCSCF_bi_ort_dm_ao_alpha
  else
    TCSCF_density_matrix_ao_alpha = SCF_density_matrix_ao_alpha
--- a/src/tc_scf/tc_scf_energy.irp.f
+++ b/src/tc_scf/tc_scf_energy.irp.f
@ -10,6 +10,8 @@
  implicit none
  integer :: i, j

+  PROVIDE mo_l_coef mo_r_coef
+
  TC_HF_energy = nuclear_repulsion
  TC_HF_one_e_energy = 0.d0
  TC_HF_two_e_energy = 0.d0
--- a/src/tc_scf/test_int.irp.f
+++ b/src/tc_scf/test_int.irp.f
@ -25,7 +25,7 @@ program test_ints
 !! OK
 !call routine_v_ij_erf_rk_cst_mu_j1b
 !! OK 
-! call routine_x_v_ij_erf_rk_cst_mu_tmp_j1b
+! call routine_x_v_ij_erf_rk_cst_mu_j1b
 !! OK
 ! call routine_v_ij_u_cst_mu_j1b

@ -43,10 +43,15 @@ program test_ints
 ! call test_ao_tc_int_chemist
 ! call test_grid_points_ao
 ! call test_tc_scf
- call test_int_gauss
+ !call test_int_gauss

  !call test_fock_3e_uhf_ao()
-  call test_fock_3e_uhf_mo()
+  !call test_fock_3e_uhf_mo()
+
+  !call test_tc_grad_and_lapl_ao()
+  !call test_tc_grad_square_ao()
+
+  call test_two_e_tc_non_hermit_integral()

 end

@ -56,14 +61,8 @@ subroutine test_tc_scf
 implicit none
 integer :: i
 ! provide int2_u_grad1u_x_j1b2_test
- provide x_v_ij_erf_rk_cst_mu_tmp_j1b_test
-! do i = 1, ng_fit_jast
-!  print*,expo_gauss_1_erf_x_2(i),coef_gauss_1_erf_x_2(i)
-! enddo
-! provide tc_grad_square_ao_test
-!  provide tc_grad_and_lapl_ao_test
-! provide int2_u_grad1u_x_j1b2_test
-! provide x_v_ij_erf_rk_cst_mu_tmp_j1b_test
+ provide x_v_ij_erf_rk_cst_mu_j1b_test
+! provide x_v_ij_erf_rk_cst_mu_j1b_test
 ! print*,'TC_HF_energy = ',TC_HF_energy
 ! print*,'grad_non_hermit = ',grad_non_hermit
 end
@ -212,7 +211,7 @@ subroutine routine_v_ij_erf_rk_cst_mu_j1b
 end


-subroutine routine_x_v_ij_erf_rk_cst_mu_tmp_j1b
+subroutine routine_x_v_ij_erf_rk_cst_mu_j1b
 implicit none
 integer :: i,j,ipoint,k,l,m
 double precision :: weight,accu_relat, accu_abs, contrib
@ -242,8 +241,8 @@ subroutine routine_x_v_ij_erf_rk_cst_mu_tmp_j1b
    do i = 1, ao_num
     do j = 1, ao_num
      do m = 1, 3
-       array(j,i,l,k)     += x_v_ij_erf_rk_cst_mu_tmp_j1b_test(m,j,i,ipoint) * aos_grad_in_r_array_transp(m,k,ipoint) * aos_in_r_array(l,ipoint) * weight
-       array_ref(j,i,l,k) += x_v_ij_erf_rk_cst_mu_tmp_j1b(m,j,i,ipoint)      * aos_grad_in_r_array_transp(m,k,ipoint) * aos_in_r_array(l,ipoint) * weight
+       array(j,i,l,k)     += x_v_ij_erf_rk_cst_mu_j1b_test(j,i,ipoint,m) * aos_grad_in_r_array_transp(m,k,ipoint) * aos_in_r_array(l,ipoint) * weight
+       array_ref(j,i,l,k) += x_v_ij_erf_rk_cst_mu_j1b     (j,i,ipoint,m) * aos_grad_in_r_array_transp(m,k,ipoint) * aos_in_r_array(l,ipoint) * weight
      enddo
     enddo
    enddo
@ -500,8 +499,8 @@ subroutine routine_int2_u_grad1u_x_j1b2
    do i = 1, ao_num
     do j = 1, ao_num
      do m = 1, 3
-       array(j,i,l,k)     += int2_u_grad1u_x_j1b2_test(m,j,i,ipoint) * aos_grad_in_r_array_transp(m,k,ipoint) * aos_in_r_array(l,ipoint) * weight
-       array_ref(j,i,l,k) += int2_u_grad1u_x_j1b2(m,j,i,ipoint)      * aos_grad_in_r_array_transp(m,k,ipoint) * aos_in_r_array(l,ipoint) * weight
+       array(j,i,l,k)     += int2_u_grad1u_x_j1b2_test(j,i,ipoint,m) * aos_grad_in_r_array_transp(m,k,ipoint) * aos_in_r_array(l,ipoint) * weight
+       array_ref(j,i,l,k) += int2_u_grad1u_x_j1b2     (j,i,ipoint,m) * aos_grad_in_r_array_transp(m,k,ipoint) * aos_in_r_array(l,ipoint) * weight
      enddo
     enddo
    enddo
@ -708,7 +707,7 @@ subroutine test_fock_3e_uhf_mo()

  ! ---

-end subroutine test_fock_3e_uhf_mo()
+end subroutine test_fock_3e_uhf_mo

 ! ---

@ -776,9 +775,9 @@ subroutine test_grid_points_ao
  icount_bad = 0
  icount_full = 0
  do ipoint = 1, n_points_final_grid
-!   if(dabs(int2_u_grad1u_x_j1b2_test(1,j,i,ipoint)) & 
-! + dabs(int2_u_grad1u_x_j1b2_test(2,j,i,ipoint)) &
-! + dabs(int2_u_grad1u_x_j1b2_test(2,j,i,ipoint)) )
+!   if(dabs(int2_u_grad1u_x_j1b2_test(j,i,ipoint,1)) & 
+!    + dabs(int2_u_grad1u_x_j1b2_test(j,i,ipoint,2)) &
+!    + dabs(int2_u_grad1u_x_j1b2_test(j,i,ipoint,3)) )
 !   if(dabs(int2_u2_j1b2_test(j,i,ipoint)).gt.thr)then
 !    icount += 1
 !   endif
@ -848,3 +847,157 @@ subroutine test_int_gauss

 end

+! ---
+
+subroutine test_tc_grad_and_lapl_ao()
+
+  implicit none
+  integer          :: i, j, k, l
+  double precision :: diff_tot, diff, thr_ih, norm
+
+  thr_ih = 1d-10
+
+  PROVIDE tc_grad_and_lapl_ao tc_grad_and_lapl_ao_loop
+
+  norm     = 0.d0
+  diff_tot = 0.d0
+  do i = 1, ao_num
+    do j = 1, ao_num
+      do k = 1, ao_num
+        do l = 1, ao_num
+
+          diff = dabs(tc_grad_and_lapl_ao_loop(l,k,j,i) - tc_grad_and_lapl_ao(l,k,j,i))
+          if(diff .gt. thr_ih) then
+            print *, ' difference on ', l, k, j, i
+            print *, ' loops : ', tc_grad_and_lapl_ao_loop(l,k,j,i)
+            print *, ' lapack: ', tc_grad_and_lapl_ao     (l,k,j,i)
+            !stop
+          endif
+
+          norm     += dabs(tc_grad_and_lapl_ao_loop(l,k,j,i))
+          diff_tot += diff
+        enddo
+      enddo
+    enddo
+  enddo
+
+  print *, ' diff tot = ', diff_tot / norm
+  print *, '     norm = ', norm
+  print *, ' '
+
+  return
+
+end
+
+! ---
+
+subroutine test_tc_grad_square_ao()
+
+  implicit none
+  integer          :: i, j, k, l
+  double precision :: diff_tot, diff, thr_ih, norm
+
+  thr_ih = 1d-10
+
+  PROVIDE tc_grad_square_ao tc_grad_square_ao_loop
+
+  norm     = 0.d0
+  diff_tot = 0.d0
+  do i = 1, ao_num
+    do j = 1, ao_num
+      do k = 1, ao_num
+        do l = 1, ao_num
+
+          diff = dabs(tc_grad_square_ao_loop(l,k,j,i) - tc_grad_square_ao(l,k,j,i))
+          if(diff .gt. thr_ih) then
+            print *, ' difference on ', l, k, j, i
+            print *, ' loops : ', tc_grad_square_ao_loop(l,k,j,i)
+            print *, ' lapack: ', tc_grad_square_ao     (l,k,j,i)
+            !stop
+          endif
+
+          norm     += dabs(tc_grad_square_ao_loop(l,k,j,i))
+          diff_tot += diff
+        enddo
+      enddo
+    enddo
+  enddo
+
+  print *, ' diff tot = ', diff_tot / norm
+  print *, '     norm = ', norm
+  print *, ' '
+
+  return
+
+end
+
+! ---
+
+subroutine test_two_e_tc_non_hermit_integral()
+
+  implicit none
+  integer          :: i, j
+  double precision :: diff_tot, diff, thr_ih, norm
+
+  thr_ih = 1d-10
+
+  PROVIDE two_e_tc_non_hermit_integral_beta two_e_tc_non_hermit_integral_alpha
+  PROVIDE two_e_tc_non_hermit_integral_seq_beta two_e_tc_non_hermit_integral_seq_alpha
+
+  ! ---
+
+  norm     = 0.d0
+  diff_tot = 0.d0
+  do i = 1, ao_num
+    do j = 1, ao_num
+
+      diff = dabs(two_e_tc_non_hermit_integral_seq_alpha(j,i) - two_e_tc_non_hermit_integral_alpha(j,i))
+      if(diff .gt. thr_ih) then
+        print *, ' difference on ', j, i
+        print *, ' seq         : ', two_e_tc_non_hermit_integral_seq_alpha(j,i)
+        print *, ' //          : ', two_e_tc_non_hermit_integral_alpha    (j,i)
+        !stop
+      endif
+
+      norm     += dabs(two_e_tc_non_hermit_integral_seq_alpha(j,i))
+      diff_tot += diff
+    enddo
+  enddo
+
+  print *, ' diff tot a = ', diff_tot / norm
+  print *, '     norm a = ', norm
+  print *, ' '
+
+  ! ---
+
+  norm     = 0.d0
+  diff_tot = 0.d0
+  do i = 1, ao_num
+    do j = 1, ao_num
+
+      diff = dabs(two_e_tc_non_hermit_integral_seq_beta(j,i) - two_e_tc_non_hermit_integral_beta(j,i))
+      if(diff .gt. thr_ih) then
+        print *, ' difference on ', j, i
+        print *, ' seq         : ', two_e_tc_non_hermit_integral_seq_beta(j,i)
+        print *, ' //          : ', two_e_tc_non_hermit_integral_beta    (j,i)
+        !stop
+      endif
+
+      norm     += dabs(two_e_tc_non_hermit_integral_seq_beta(j,i))
+      diff_tot += diff
+    enddo
+  enddo
+
+  print *, ' diff tot b = ', diff_tot / norm
+  print *, '     norm b = ', norm
+  print *, ' '
+
+  ! ---
+
+  return
+
+end
+
+! ---
+
+>>>>>>> 92a4e33f8a21717cab0c0e4f8412ed6903afb04a
--- a/src/tools/print_he_energy.irp.f
+++ b/src/tools/print_he_energy.irp.f
@ -7,8 +7,8 @@ program print_he_energy

  call print_overlap()

-  call print_energy1()
-  call print_energy2()
+  !call print_energy1()
+  !call print_energy2()

 end 

--- a/src/utils/linear_algebra.irp.f
+++ b/src/utils/linear_algebra.irp.f
@ -1136,7 +1136,6 @@ subroutine ortho_svd(A,LDA,m,n)

 end

-! QR to orthonormalize CSFs does not work :-(
 !subroutine ortho_qr_withB(A,LDA,B,m,n)
 !  implicit none
 !  BEGIN_DOC
@ -1223,7 +1222,7 @@ end
 !
 !  !deallocate(WORK,TAU)
 !end
-
+!
 !subroutine ortho_qr_csf(A, LDA, B, m, n) bind(C, name="ortho_qr_csf")
 !  use iso_c_binding
 !  integer(c_int32_t), value      :: LDA
@ -1234,6 +1233,7 @@ end
 !  call ortho_qr_withB(A,LDA,B,m,n)
 !end subroutine ortho_qr_csf

+
 subroutine ortho_qr(A,LDA,m,n)
  implicit none
  BEGIN_DOC
@ -1982,6 +1982,8 @@ end subroutine diag_nonsym_right

 ! ---

+! Taken from GammCor thanks to Michal Hapka :-)
+

 subroutine pivoted_cholesky( A, rank, tol, ndim, U)
 !
--- a/src/utils/qsort.c
+++ b/src/utils/qsort.c
@ -0,0 +1,373 @@
+/* [[file:~/qp2/src/utils/qsort.org::*Generated%20C%20file][Generated C file:1]] */
+#include <stdlib.h>
+#include <stdint.h>
+
+struct int16_t_comp {
+  int16_t    x;
+  int32_t i;
+};
+
+int compare_int16_t( const void * l, const void * r )
+{
+  const int16_t * restrict _l= l;
+  const int16_t * restrict _r= r;
+  if( *_l > *_r ) return 1;
+  if( *_l < *_r ) return -1;
+  return 0;
+}
+
+void qsort_int16_t(int16_t* restrict A_in, int32_t* restrict iorder, int32_t isize) {
+  struct int16_t_comp* A = malloc(isize * sizeof(struct int16_t_comp));
+  if (A == NULL) return;
+
+  for (int i=0 ; i<isize ; ++i) {
+    A[i].x = A_in[i];
+    A[i].i = iorder[i];
+  }
+
+  qsort( (void*) A, (size_t) isize, sizeof(struct int16_t_comp), compare_int16_t);
+
+  for (int i=0 ; i<isize ; ++i) {
+    A_in[i] = A[i].x;
+    iorder[i] = A[i].i;
+  }
+  free(A);
+}
+
+void qsort_int16_t_noidx(int16_t* A, int32_t isize) {
+  qsort( (void*) A, (size_t) isize, sizeof(int16_t), compare_int16_t);
+}
+
+
+struct int16_t_comp_big {
+  int16_t    x;
+  int64_t i;
+};
+
+int compare_int16_t_big( const void * l, const void * r )
+{
+  const int16_t * restrict _l= l;
+  const int16_t * restrict _r= r;
+  if( *_l > *_r ) return 1;
+  if( *_l < *_r ) return -1;
+  return 0;
+}
+
+void qsort_int16_t_big(int16_t* restrict A_in, int64_t* restrict iorder, int64_t isize) {
+  struct int16_t_comp_big* A = malloc(isize * sizeof(struct int16_t_comp_big));
+  if (A == NULL) return;
+
+  for (int i=0 ; i<isize ; ++i) {
+    A[i].x = A_in[i];
+    A[i].i = iorder[i];
+  }
+
+  qsort( (void*) A, (size_t) isize, sizeof(struct int16_t_comp_big), compare_int16_t_big);
+
+  for (int i=0 ; i<isize ; ++i) {
+    A_in[i] = A[i].x;
+    iorder[i] = A[i].i;
+  }
+  free(A);
+}
+
+void qsort_int16_t_noidx_big(int16_t* A, int64_t isize) {
+  qsort( (void*) A, (size_t) isize, sizeof(int16_t), compare_int16_t_big);
+}
+
+
+struct int32_t_comp {
+  int32_t    x;
+  int32_t i;
+};
+
+int compare_int32_t( const void * l, const void * r )
+{
+  const int32_t * restrict _l= l;
+  const int32_t * restrict _r= r;
+  if( *_l > *_r ) return 1;
+  if( *_l < *_r ) return -1;
+  return 0;
+}
+
+void qsort_int32_t(int32_t* restrict A_in, int32_t* restrict iorder, int32_t isize) {
+  struct int32_t_comp* A = malloc(isize * sizeof(struct int32_t_comp));
+  if (A == NULL) return;
+
+  for (int i=0 ; i<isize ; ++i) {
+    A[i].x = A_in[i];
+    A[i].i = iorder[i];
+  }
+
+  qsort( (void*) A, (size_t) isize, sizeof(struct int32_t_comp), compare_int32_t);
+
+  for (int i=0 ; i<isize ; ++i) {
+    A_in[i] = A[i].x;
+    iorder[i] = A[i].i;
+  }
+  free(A);
+}
+
+void qsort_int32_t_noidx(int32_t* A, int32_t isize) {
+  qsort( (void*) A, (size_t) isize, sizeof(int32_t), compare_int32_t);
+}
+
+
+struct int32_t_comp_big {
+  int32_t    x;
+  int64_t i;
+};
+
+int compare_int32_t_big( const void * l, const void * r )
+{
+  const int32_t * restrict _l= l;
+  const int32_t * restrict _r= r;
+  if( *_l > *_r ) return 1;
+  if( *_l < *_r ) return -1;
+  return 0;
+}
+
+void qsort_int32_t_big(int32_t* restrict A_in, int64_t* restrict iorder, int64_t isize) {
+  struct int32_t_comp_big* A = malloc(isize * sizeof(struct int32_t_comp_big));
+  if (A == NULL) return;
+
+  for (int i=0 ; i<isize ; ++i) {
+    A[i].x = A_in[i];
+    A[i].i = iorder[i];
+  }
+
+  qsort( (void*) A, (size_t) isize, sizeof(struct int32_t_comp_big), compare_int32_t_big);
+
+  for (int i=0 ; i<isize ; ++i) {
+    A_in[i] = A[i].x;
+    iorder[i] = A[i].i;
+  }
+  free(A);
+}
+
+void qsort_int32_t_noidx_big(int32_t* A, int64_t isize) {
+  qsort( (void*) A, (size_t) isize, sizeof(int32_t), compare_int32_t_big);
+}
+
+
+struct int64_t_comp {
+  int64_t    x;
+  int32_t i;
+};
+
+int compare_int64_t( const void * l, const void * r )
+{
+  const int64_t * restrict _l= l;
+  const int64_t * restrict _r= r;
+  if( *_l > *_r ) return 1;
+  if( *_l < *_r ) return -1;
+  return 0;
+}
+
+void qsort_int64_t(int64_t* restrict A_in, int32_t* restrict iorder, int32_t isize) {
+  struct int64_t_comp* A = malloc(isize * sizeof(struct int64_t_comp));
+  if (A == NULL) return;
+
+  for (int i=0 ; i<isize ; ++i) {
+    A[i].x = A_in[i];
+    A[i].i = iorder[i];
+  }
+
+  qsort( (void*) A, (size_t) isize, sizeof(struct int64_t_comp), compare_int64_t);
+
+  for (int i=0 ; i<isize ; ++i) {
+    A_in[i] = A[i].x;
+    iorder[i] = A[i].i;
+  }
+  free(A);
+}
+
+void qsort_int64_t_noidx(int64_t* A, int32_t isize) {
+  qsort( (void*) A, (size_t) isize, sizeof(int64_t), compare_int64_t);
+}
+
+
+struct int64_t_comp_big {
+  int64_t    x;
+  int64_t i;
+};
+
+int compare_int64_t_big( const void * l, const void * r )
+{
+  const int64_t * restrict _l= l;
+  const int64_t * restrict _r= r;
+  if( *_l > *_r ) return 1;
+  if( *_l < *_r ) return -1;
+  return 0;
+}
+
+void qsort_int64_t_big(int64_t* restrict A_in, int64_t* restrict iorder, int64_t isize) {
+  struct int64_t_comp_big* A = malloc(isize * sizeof(struct int64_t_comp_big));
+  if (A == NULL) return;
+
+  for (int i=0 ; i<isize ; ++i) {
+    A[i].x = A_in[i];
+    A[i].i = iorder[i];
+  }
+
+  qsort( (void*) A, (size_t) isize, sizeof(struct int64_t_comp_big), compare_int64_t_big);
+
+  for (int i=0 ; i<isize ; ++i) {
+    A_in[i] = A[i].x;
+    iorder[i] = A[i].i;
+  }
+  free(A);
+}
+
+void qsort_int64_t_noidx_big(int64_t* A, int64_t isize) {
+  qsort( (void*) A, (size_t) isize, sizeof(int64_t), compare_int64_t_big);
+}
+
+
+struct double_comp {
+  double    x;
+  int32_t i;
+};
+
+int compare_double( const void * l, const void * r )
+{
+  const double * restrict _l= l;
+  const double * restrict _r= r;
+  if( *_l > *_r ) return 1;
+  if( *_l < *_r ) return -1;
+  return 0;
+}
+
+void qsort_double(double* restrict A_in, int32_t* restrict iorder, int32_t isize) {
+  struct double_comp* A = malloc(isize * sizeof(struct double_comp));
+  if (A == NULL) return;
+
+  for (int i=0 ; i<isize ; ++i) {
+    A[i].x = A_in[i];
+    A[i].i = iorder[i];
+  }
+
+  qsort( (void*) A, (size_t) isize, sizeof(struct double_comp), compare_double);
+
+  for (int i=0 ; i<isize ; ++i) {
+    A_in[i] = A[i].x;
+    iorder[i] = A[i].i;
+  }
+  free(A);
+}
+
+void qsort_double_noidx(double* A, int32_t isize) {
+  qsort( (void*) A, (size_t) isize, sizeof(double), compare_double);
+}
+
+
+struct double_comp_big {
+  double    x;
+  int64_t i;
+};
+
+int compare_double_big( const void * l, const void * r )
+{
+  const double * restrict _l= l;
+  const double * restrict _r= r;
+  if( *_l > *_r ) return 1;
+  if( *_l < *_r ) return -1;
+  return 0;
+}
+
+void qsort_double_big(double* restrict A_in, int64_t* restrict iorder, int64_t isize) {
+  struct double_comp_big* A = malloc(isize * sizeof(struct double_comp_big));
+  if (A == NULL) return;
+
+  for (int i=0 ; i<isize ; ++i) {
+    A[i].x = A_in[i];
+    A[i].i = iorder[i];
+  }
+
+  qsort( (void*) A, (size_t) isize, sizeof(struct double_comp_big), compare_double_big);
+
+  for (int i=0 ; i<isize ; ++i) {
+    A_in[i] = A[i].x;
+    iorder[i] = A[i].i;
+  }
+  free(A);
+}
+
+void qsort_double_noidx_big(double* A, int64_t isize) {
+  qsort( (void*) A, (size_t) isize, sizeof(double), compare_double_big);
+}
+
+
+struct float_comp {
+  float    x;
+  int32_t i;
+};
+
+int compare_float( const void * l, const void * r )
+{
+  const float * restrict _l= l;
+  const float * restrict _r= r;
+  if( *_l > *_r ) return 1;
+  if( *_l < *_r ) return -1;
+  return 0;
+}
+
+void qsort_float(float* restrict A_in, int32_t* restrict iorder, int32_t isize) {
+  struct float_comp* A = malloc(isize * sizeof(struct float_comp));
+  if (A == NULL) return;
+
+  for (int i=0 ; i<isize ; ++i) {
+    A[i].x = A_in[i];
+    A[i].i = iorder[i];
+  }
+
+  qsort( (void*) A, (size_t) isize, sizeof(struct float_comp), compare_float);
+
+  for (int i=0 ; i<isize ; ++i) {
+    A_in[i] = A[i].x;
+    iorder[i] = A[i].i;
+  }
+  free(A);
+}
+
+void qsort_float_noidx(float* A, int32_t isize) {
+  qsort( (void*) A, (size_t) isize, sizeof(float), compare_float);
+}
+
+
+struct float_comp_big {
+  float    x;
+  int64_t i;
+};
+
+int compare_float_big( const void * l, const void * r )
+{
+  const float * restrict _l= l;
+  const float * restrict _r= r;
+  if( *_l > *_r ) return 1;
+  if( *_l < *_r ) return -1;
+  return 0;
+}
+
+void qsort_float_big(float* restrict A_in, int64_t* restrict iorder, int64_t isize) {
+  struct float_comp_big* A = malloc(isize * sizeof(struct float_comp_big));
+  if (A == NULL) return;
+
+  for (int i=0 ; i<isize ; ++i) {
+    A[i].x = A_in[i];
+    A[i].i = iorder[i];
+  }
+
+  qsort( (void*) A, (size_t) isize, sizeof(struct float_comp_big), compare_float_big);
+
+  for (int i=0 ; i<isize ; ++i) {
+    A_in[i] = A[i].x;
+    iorder[i] = A[i].i;
+  }
+  free(A);
+}
+
+void qsort_float_noidx_big(float* A, int64_t isize) {
+  qsort( (void*) A, (size_t) isize, sizeof(float), compare_float_big);
+}
+/* Generated C file:1 ends here */
--- a/src/utils/qsort.org
+++ b/src/utils/qsort.org
@ -0,0 +1,169 @@
+#+TITLE: Quick sort binding for Fortran
+
+* C template
+
+#+NAME: c_template
+#+BEGIN_SRC c
+struct TYPE_comp_big {
+  TYPE    x;
+  int32_t i;
+};
+
+int compare_TYPE_big( const void * l, const void * r )
+{
+  const TYPE * restrict _l= l;
+  const TYPE * restrict _r= r;
+  if( *_l > *_r ) return 1;
+  if( *_l < *_r ) return -1;
+  return 0;
+}
+
+void qsort_TYPE_big(TYPE* restrict A_in, int32_t* restrict iorder, int32_t isize) {
+  struct TYPE_comp_big* A = malloc(isize * sizeof(struct TYPE_comp_big));
+  if (A == NULL) return;
+
+  for (int i=0 ; i<isize ; ++i) {
+    A[i].x = A_in[i];
+    A[i].i = iorder[i];
+  }
+
+  qsort( (void*) A, (size_t) isize, sizeof(struct TYPE_comp_big), compare_TYPE_big);
+
+  for (int i=0 ; i<isize ; ++i) {
+    A_in[i] = A[i].x;
+    iorder[i] = A[i].i;
+  }
+  free(A);
+}
+
+void qsort_TYPE_noidx_big(TYPE* A, int32_t isize) {
+  qsort( (void*) A, (size_t) isize, sizeof(TYPE), compare_TYPE_big);
+}
+#+END_SRC
+
+* Fortran template
+
+#+NAME:f_template
+#+BEGIN_SRC f90
+subroutine Lsort_big_c(A, iorder, isize) bind(C, name="qsort_TYPE_big")
+  use iso_c_binding
+  integer(c_int32_t), value :: isize
+  integer(c_int32_t)        :: iorder(isize)
+  real   (c_TYPE)         :: A(isize)
+end subroutine Lsort_big_c
+
+subroutine Lsort_noidx_big_c(A, isize) bind(C, name="qsort_TYPE_noidx_big")
+  use iso_c_binding
+  integer(c_int32_t), value :: isize
+  real   (c_TYPE)         :: A(isize)
+end subroutine Lsort_noidx_big_c
+
+#+END_SRC
+
+#+NAME:f_template2
+#+BEGIN_SRC f90
+subroutine Lsort_big(A, iorder, isize) 
+  use qsort_module
+  use iso_c_binding
+  integer(c_int32_t)        :: isize
+  integer(c_int32_t)        :: iorder(isize)
+  real   (c_TYPE)         :: A(isize)
+  call Lsort_big_c(A, iorder, isize)
+end subroutine Lsort_big
+
+subroutine Lsort_noidx_big(A, isize)
+  use iso_c_binding
+  use qsort_module
+  integer(c_int32_t) :: isize
+  real   (c_TYPE)    :: A(isize)
+  call Lsort_noidx_big_c(A, isize)
+end subroutine Lsort_noidx_big
+
+#+END_SRC
+
+* Python scripts for type replacements
+
+#+NAME: replaced
+#+begin_src python :results output :noweb yes
+data = """
+<<c_template>>
+"""
+for typ in ["int16_t", "int32_t", "int64_t", "double", "float"]:
+    print( data.replace("TYPE", typ).replace("_big", "") )
+    print( data.replace("int32_t", "int64_t").replace("TYPE", typ) )
+#+end_src
+
+#+NAME: replaced_f
+#+begin_src python :results output :noweb yes
+data = """
+<<f_template>>
+"""
+c1 = {
+    "int16_t": "i2",
+    "int32_t": "i",
+    "int64_t": "i8",
+    "double": "d",
+    "float": ""
+}
+c2 = {
+    "int16_t": "integer",
+    "int32_t": "integer",
+    "int64_t": "integer",
+    "double": "real",
+    "float": "real"
+}
+
+for typ in ["int16_t", "int32_t", "int64_t", "double", "float"]:
+    print( data.replace("real",c2[typ]).replace("L",c1[typ]).replace("TYPE", typ).replace("_big", "") )
+    print( data.replace("real",c2[typ]).replace("L",c1[typ]).replace("int32_t", "int64_t").replace("TYPE", typ) )
+#+end_src
+
+#+NAME: replaced_f2
+#+begin_src python :results output :noweb yes
+data = """
+<<f_template2>>
+"""
+c1 = {
+    "int16_t": "i2",
+    "int32_t": "i",
+    "int64_t": "i8",
+    "double": "d",
+    "float": ""
+}
+c2 = {
+    "int16_t": "integer",
+    "int32_t": "integer",
+    "int64_t": "integer",
+    "double": "real",
+    "float": "real"
+}
+
+for typ in ["int16_t", "int32_t", "int64_t", "double", "float"]:
+    print( data.replace("real",c2[typ]).replace("L",c1[typ]).replace("TYPE", typ).replace("_big", "") )
+    print( data.replace("real",c2[typ]).replace("L",c1[typ]).replace("int32_t", "int64_t").replace("TYPE", typ) )
+#+end_src
+
+* Generated C file
+  
+#+BEGIN_SRC c :comments link :tangle qsort.c :noweb yes
+#include <stdlib.h>
+#include <stdint.h>
+<<replaced()>>
+#+END_SRC
+
+* Generated Fortran file
+
+#+BEGIN_SRC f90 :tangle qsort_module.f90 :noweb yes
+module qsort_module
+  use iso_c_binding
+  
+  interface
+     <<replaced_f()>>
+  end interface
+
+end module qsort_module
+
+<<replaced_f2()>>
+
+#+END_SRC
+
--- a/src/utils/qsort_module.f90
+++ b/src/utils/qsort_module.f90
@ -0,0 +1,347 @@
+module qsort_module
+  use iso_c_binding
+  
+  interface
+     
+     subroutine i2sort_c(A, iorder, isize) bind(C, name="qsort_int16_t")
+       use iso_c_binding
+       integer(c_int32_t), value :: isize
+       integer(c_int32_t)        :: iorder(isize)
+       integer   (c_int16_t)         :: A(isize)
+     end subroutine i2sort_c
+     
+     subroutine i2sort_noidx_c(A, isize) bind(C, name="qsort_int16_t_noidx")
+       use iso_c_binding
+       integer(c_int32_t), value :: isize
+       integer   (c_int16_t)         :: A(isize)
+     end subroutine i2sort_noidx_c
+     
+     
+     
+     subroutine i2sort_big_c(A, iorder, isize) bind(C, name="qsort_int16_t_big")
+       use iso_c_binding
+       integer(c_int64_t), value :: isize
+       integer(c_int64_t)        :: iorder(isize)
+       integer   (c_int16_t)         :: A(isize)
+     end subroutine i2sort_big_c
+     
+     subroutine i2sort_noidx_big_c(A, isize) bind(C, name="qsort_int16_t_noidx_big")
+       use iso_c_binding
+       integer(c_int64_t), value :: isize
+       integer   (c_int16_t)         :: A(isize)
+     end subroutine i2sort_noidx_big_c
+     
+     
+     
+     subroutine isort_c(A, iorder, isize) bind(C, name="qsort_int32_t")
+       use iso_c_binding
+       integer(c_int32_t), value :: isize
+       integer(c_int32_t)        :: iorder(isize)
+       integer   (c_int32_t)         :: A(isize)
+     end subroutine isort_c
+     
+     subroutine isort_noidx_c(A, isize) bind(C, name="qsort_int32_t_noidx")
+       use iso_c_binding
+       integer(c_int32_t), value :: isize
+       integer   (c_int32_t)         :: A(isize)
+     end subroutine isort_noidx_c
+     
+     
+     
+     subroutine isort_big_c(A, iorder, isize) bind(C, name="qsort_int32_t_big")
+       use iso_c_binding
+       integer(c_int64_t), value :: isize
+       integer(c_int64_t)        :: iorder(isize)
+       integer   (c_int32_t)         :: A(isize)
+     end subroutine isort_big_c
+     
+     subroutine isort_noidx_big_c(A, isize) bind(C, name="qsort_int32_t_noidx_big")
+       use iso_c_binding
+       integer(c_int64_t), value :: isize
+       integer   (c_int32_t)         :: A(isize)
+     end subroutine isort_noidx_big_c
+     
+     
+     
+     subroutine i8sort_c(A, iorder, isize) bind(C, name="qsort_int64_t")
+       use iso_c_binding
+       integer(c_int32_t), value :: isize
+       integer(c_int32_t)        :: iorder(isize)
+       integer   (c_int64_t)         :: A(isize)
+     end subroutine i8sort_c
+     
+     subroutine i8sort_noidx_c(A, isize) bind(C, name="qsort_int64_t_noidx")
+       use iso_c_binding
+       integer(c_int32_t), value :: isize
+       integer   (c_int64_t)         :: A(isize)
+     end subroutine i8sort_noidx_c
+     
+     
+     
+     subroutine i8sort_big_c(A, iorder, isize) bind(C, name="qsort_int64_t_big")
+       use iso_c_binding
+       integer(c_int64_t), value :: isize
+       integer(c_int64_t)        :: iorder(isize)
+       integer   (c_int64_t)         :: A(isize)
+     end subroutine i8sort_big_c
+     
+     subroutine i8sort_noidx_big_c(A, isize) bind(C, name="qsort_int64_t_noidx_big")
+       use iso_c_binding
+       integer(c_int64_t), value :: isize
+       integer   (c_int64_t)         :: A(isize)
+     end subroutine i8sort_noidx_big_c
+     
+     
+     
+     subroutine dsort_c(A, iorder, isize) bind(C, name="qsort_double")
+       use iso_c_binding
+       integer(c_int32_t), value :: isize
+       integer(c_int32_t)        :: iorder(isize)
+       real   (c_double)         :: A(isize)
+     end subroutine dsort_c
+     
+     subroutine dsort_noidx_c(A, isize) bind(C, name="qsort_double_noidx")
+       use iso_c_binding
+       integer(c_int32_t), value :: isize
+       real   (c_double)         :: A(isize)
+     end subroutine dsort_noidx_c
+     
+     
+     
+     subroutine dsort_big_c(A, iorder, isize) bind(C, name="qsort_double_big")
+       use iso_c_binding
+       integer(c_int64_t), value :: isize
+       integer(c_int64_t)        :: iorder(isize)
+       real   (c_double)         :: A(isize)
+     end subroutine dsort_big_c
+     
+     subroutine dsort_noidx_big_c(A, isize) bind(C, name="qsort_double_noidx_big")
+       use iso_c_binding
+       integer(c_int64_t), value :: isize
+       real   (c_double)         :: A(isize)
+     end subroutine dsort_noidx_big_c
+     
+     
+     
+     subroutine sort_c(A, iorder, isize) bind(C, name="qsort_float")
+       use iso_c_binding
+       integer(c_int32_t), value :: isize
+       integer(c_int32_t)        :: iorder(isize)
+       real   (c_float)         :: A(isize)
+     end subroutine sort_c
+     
+     subroutine sort_noidx_c(A, isize) bind(C, name="qsort_float_noidx")
+       use iso_c_binding
+       integer(c_int32_t), value :: isize
+       real   (c_float)         :: A(isize)
+     end subroutine sort_noidx_c
+     
+     
+     
+     subroutine sort_big_c(A, iorder, isize) bind(C, name="qsort_float_big")
+       use iso_c_binding
+       integer(c_int64_t), value :: isize
+       integer(c_int64_t)        :: iorder(isize)
+       real   (c_float)         :: A(isize)
+     end subroutine sort_big_c
+     
+     subroutine sort_noidx_big_c(A, isize) bind(C, name="qsort_float_noidx_big")
+       use iso_c_binding
+       integer(c_int64_t), value :: isize
+       real   (c_float)         :: A(isize)
+     end subroutine sort_noidx_big_c
+     
+     
+     
+  end interface
+
+end module qsort_module
+
+
+subroutine i2sort(A, iorder, isize) 
+  use qsort_module
+  use iso_c_binding
+  integer(c_int32_t)        :: isize
+  integer(c_int32_t)        :: iorder(isize)
+  integer   (c_int16_t)         :: A(isize)
+  call i2sort_c(A, iorder, isize)
+end subroutine i2sort
+
+subroutine i2sort_noidx(A, isize)
+  use iso_c_binding
+  use qsort_module
+  integer(c_int32_t) :: isize
+  integer   (c_int16_t)    :: A(isize)
+  call i2sort_noidx_c(A, isize)
+end subroutine i2sort_noidx
+
+
+
+subroutine i2sort_big(A, iorder, isize) 
+  use qsort_module
+  use iso_c_binding
+  integer(c_int64_t)        :: isize
+  integer(c_int64_t)        :: iorder(isize)
+  integer   (c_int16_t)         :: A(isize)
+  call i2sort_big_c(A, iorder, isize)
+end subroutine i2sort_big
+
+subroutine i2sort_noidx_big(A, isize)
+  use iso_c_binding
+  use qsort_module
+  integer(c_int64_t) :: isize
+  integer   (c_int16_t)    :: A(isize)
+  call i2sort_noidx_big_c(A, isize)
+end subroutine i2sort_noidx_big
+
+
+
+subroutine isort(A, iorder, isize) 
+  use qsort_module
+  use iso_c_binding
+  integer(c_int32_t)        :: isize
+  integer(c_int32_t)        :: iorder(isize)
+  integer   (c_int32_t)         :: A(isize)
+  call isort_c(A, iorder, isize)
+end subroutine isort
+
+subroutine isort_noidx(A, isize)
+  use iso_c_binding
+  use qsort_module
+  integer(c_int32_t) :: isize
+  integer   (c_int32_t)    :: A(isize)
+  call isort_noidx_c(A, isize)
+end subroutine isort_noidx
+
+
+
+subroutine isort_big(A, iorder, isize) 
+  use qsort_module
+  use iso_c_binding
+  integer(c_int64_t)        :: isize
+  integer(c_int64_t)        :: iorder(isize)
+  integer   (c_int32_t)         :: A(isize)
+  call isort_big_c(A, iorder, isize)
+end subroutine isort_big
+
+subroutine isort_noidx_big(A, isize)
+  use iso_c_binding
+  use qsort_module
+  integer(c_int64_t) :: isize
+  integer   (c_int32_t)    :: A(isize)
+  call isort_noidx_big_c(A, isize)
+end subroutine isort_noidx_big
+
+
+
+subroutine i8sort(A, iorder, isize) 
+  use qsort_module
+  use iso_c_binding
+  integer(c_int32_t)        :: isize
+  integer(c_int32_t)        :: iorder(isize)
+  integer   (c_int64_t)         :: A(isize)
+  call i8sort_c(A, iorder, isize)
+end subroutine i8sort
+
+subroutine i8sort_noidx(A, isize)
+  use iso_c_binding
+  use qsort_module
+  integer(c_int32_t) :: isize
+  integer   (c_int64_t)    :: A(isize)
+  call i8sort_noidx_c(A, isize)
+end subroutine i8sort_noidx
+
+
+
+subroutine i8sort_big(A, iorder, isize) 
+  use qsort_module
+  use iso_c_binding
+  integer(c_int64_t)        :: isize
+  integer(c_int64_t)        :: iorder(isize)
+  integer   (c_int64_t)         :: A(isize)
+  call i8sort_big_c(A, iorder, isize)
+end subroutine i8sort_big
+
+subroutine i8sort_noidx_big(A, isize)
+  use iso_c_binding
+  use qsort_module
+  integer(c_int64_t) :: isize
+  integer   (c_int64_t)    :: A(isize)
+  call i8sort_noidx_big_c(A, isize)
+end subroutine i8sort_noidx_big
+
+
+
+subroutine dsort(A, iorder, isize) 
+  use qsort_module
+  use iso_c_binding
+  integer(c_int32_t)        :: isize
+  integer(c_int32_t)        :: iorder(isize)
+  real   (c_double)         :: A(isize)
+  call dsort_c(A, iorder, isize)
+end subroutine dsort
+
+subroutine dsort_noidx(A, isize)
+  use iso_c_binding
+  use qsort_module
+  integer(c_int32_t) :: isize
+  real   (c_double)    :: A(isize)
+  call dsort_noidx_c(A, isize)
+end subroutine dsort_noidx
+
+
+
+subroutine dsort_big(A, iorder, isize) 
+  use qsort_module
+  use iso_c_binding
+  integer(c_int64_t)        :: isize
+  integer(c_int64_t)        :: iorder(isize)
+  real   (c_double)         :: A(isize)
+  call dsort_big_c(A, iorder, isize)
+end subroutine dsort_big
+
+subroutine dsort_noidx_big(A, isize)
+  use iso_c_binding
+  use qsort_module
+  integer(c_int64_t) :: isize
+  real   (c_double)    :: A(isize)
+  call dsort_noidx_big_c(A, isize)
+end subroutine dsort_noidx_big
+
+
+
+subroutine sort(A, iorder, isize) 
+  use qsort_module
+  use iso_c_binding
+  integer(c_int32_t)        :: isize
+  integer(c_int32_t)        :: iorder(isize)
+  real   (c_float)         :: A(isize)
+  call sort_c(A, iorder, isize)
+end subroutine sort
+
+subroutine sort_noidx(A, isize)
+  use iso_c_binding
+  use qsort_module
+  integer(c_int32_t) :: isize
+  real   (c_float)    :: A(isize)
+  call sort_noidx_c(A, isize)
+end subroutine sort_noidx
+
+
+
+subroutine sort_big(A, iorder, isize) 
+  use qsort_module
+  use iso_c_binding
+  integer(c_int64_t)        :: isize
+  integer(c_int64_t)        :: iorder(isize)
+  real   (c_float)         :: A(isize)
+  call sort_big_c(A, iorder, isize)
+end subroutine sort_big
+
+subroutine sort_noidx_big(A, isize)
+  use iso_c_binding
+  use qsort_module
+  integer(c_int64_t) :: isize
+  real   (c_float)    :: A(isize)
+  call sort_noidx_big_c(A, isize)
+end subroutine sort_noidx_big
--- a/src/utils/sort.irp.f
+++ b/src/utils/sort.irp.f
@ -1,222 +1,4 @@
 BEGIN_TEMPLATE
- subroutine insertion_$Xsort (x,iorder,isize)
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize) using the insertion sort algorithm.
-  ! iorder in input should be (1,2,3,...,isize), and in output
-  ! contains the new order of the elements.
-  END_DOC
-  integer,intent(in)             :: isize
-  $type,intent(inout)            :: x(isize)
-  integer,intent(inout)          :: iorder(isize)
-  $type                          :: xtmp
-  integer                        :: i, i0, j, jmax
-
-  do i=2,isize
-    xtmp = x(i)
-    i0 = iorder(i)
-    j=i-1
-    do while (j>0)
-      if ((x(j) <= xtmp)) exit
-      x(j+1) = x(j)
-      iorder(j+1) = iorder(j)
-      j=j-1
-    enddo
-    x(j+1) = xtmp
-    iorder(j+1) = i0
-  enddo
- end subroutine insertion_$Xsort
-
- subroutine quick_$Xsort(x, iorder, isize)
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize) using the quicksort algorithm.
-  ! iorder in input should be (1,2,3,...,isize), and in output
-  ! contains the new order of the elements.
-  END_DOC
-  integer,intent(in)             :: isize
-  $type,intent(inout)            :: x(isize)
-  integer,intent(inout)          :: iorder(isize)
-  integer, external              :: omp_get_num_threads
-  call rec_$X_quicksort(x,iorder,isize,1,isize,nproc)
- end
-
- recursive subroutine rec_$X_quicksort(x, iorder, isize, first, last, level)
-  implicit none
-  integer, intent(in)            :: isize, first, last, level
-  integer,intent(inout)          :: iorder(isize)
-  $type, intent(inout)           :: x(isize)
-  $type                          :: c, tmp
-  integer                        :: itmp
-  integer                        :: i, j
-
-  if(isize<2)return
-
-  c = x( shiftr(first+last,1) )
-  i = first
-  j = last
-  do
-    do while (x(i) < c)
-      i=i+1
-    end do
-    do while (c < x(j))
-      j=j-1
-    end do
-    if (i >= j) exit
-    tmp  = x(i)
-    x(i) = x(j)
-    x(j) = tmp
-    itmp      = iorder(i)
-    iorder(i) = iorder(j)
-    iorder(j) = itmp
-    i=i+1
-    j=j-1
-  enddo
-  if ( ((i-first <= 10000).and.(last-j <= 10000)).or.(level<=0) ) then
-    if (first < i-1) then
-      call rec_$X_quicksort(x, iorder, isize, first, i-1,level/2)
-    endif
-    if (j+1 < last) then
-      call rec_$X_quicksort(x, iorder, isize, j+1, last,level/2)
-    endif
-  else
-    if (first < i-1) then
-      call rec_$X_quicksort(x, iorder, isize, first, i-1,level/2)
-    endif
-    if (j+1 < last) then
-      call rec_$X_quicksort(x, iorder, isize, j+1, last,level/2)
-    endif
-  endif
- end
-
- subroutine heap_$Xsort(x,iorder,isize)
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize) using the heap sort algorithm.
-  ! iorder in input should be (1,2,3,...,isize), and in output
-  ! contains the new order of the elements.
-  END_DOC
-  integer,intent(in)             :: isize
-  $type,intent(inout)            :: x(isize)
-  integer,intent(inout)          :: iorder(isize)
-
-  integer                        :: i, k, j, l, i0
-  $type                          :: xtemp
-
-  l = isize/2+1
-  k = isize
-  do while (.True.)
-    if (l>1) then
-      l=l-1
-      xtemp = x(l)
-      i0 = iorder(l)
-    else
-      xtemp = x(k)
-      i0 = iorder(k)
-      x(k) = x(1)
-      iorder(k) = iorder(1)
-      k = k-1
-      if (k == 1) then
-        x(1) = xtemp
-        iorder(1) = i0
-        exit
-      endif
-    endif
-    i=l
-    j = shiftl(l,1)
-    do while (j<k)
-      if ( x(j) < x(j+1) ) then
-        j=j+1
-      endif
-      if (xtemp < x(j)) then
-        x(i) = x(j)
-        iorder(i) = iorder(j)
-        i = j
-        j = shiftl(j,1)
-      else
-        j = k+1
-      endif
-    enddo
-    if (j==k) then
-      if (xtemp < x(j)) then
-        x(i) = x(j)
-        iorder(i) = iorder(j)
-        i = j
-        j = shiftl(j,1)
-      else
-        j = k+1
-      endif
-    endif
-    x(i) = xtemp
-    iorder(i) = i0
-  enddo
- end subroutine heap_$Xsort
-
- subroutine heap_$Xsort_big(x,iorder,isize)
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize) using the heap sort algorithm.
-  ! iorder in input should be (1,2,3,...,isize), and in output
-  ! contains the new order of the elements.
-  ! This is a version for very large arrays where the indices need
-  ! to be in integer*8 format
-  END_DOC
-  integer*8,intent(in)           :: isize
-  $type,intent(inout)            :: x(isize)
-  integer*8,intent(inout)        :: iorder(isize)
-
-  integer*8                      :: i, k, j, l, i0
-  $type                          :: xtemp
-
-  l = isize/2+1
-  k = isize
-  do while (.True.)
-    if (l>1) then
-      l=l-1
-      xtemp = x(l)
-      i0 = iorder(l)
-    else
-      xtemp = x(k)
-      i0 = iorder(k)
-      x(k) = x(1)
-      iorder(k) = iorder(1)
-      k = k-1
-      if (k == 1) then
-        x(1) = xtemp
-        iorder(1) = i0
-        exit
-      endif
-    endif
-    i=l
-    j = shiftl(l,1)
-    do while (j<k)
-      if ( x(j) < x(j+1) ) then
-        j=j+1
-      endif
-      if (xtemp < x(j)) then
-        x(i) = x(j)
-        iorder(i) = iorder(j)
-        i = j
-        j = shiftl(j,1)
-      else
-        j = k+1
-      endif
-    enddo
-    if (j==k) then
-      if (xtemp < x(j)) then
-        x(i) = x(j)
-        iorder(i) = iorder(j)
-        i = j
-        j = shiftl(j,1)
-      else
-        j = k+1
-      endif
-    endif
-    x(i) = xtemp
-    iorder(i) = i0
-  enddo
-
- end subroutine heap_$Xsort_big

 subroutine sorted_$Xnumber(x,isize,n)
  implicit none
@ -250,222 +32,6 @@ SUBST [ X, type ]
 END_TEMPLATE


-!---------------------- INTEL
-IRP_IF INTEL
-
-BEGIN_TEMPLATE
- subroutine $Xsort(x,iorder,isize)
-  use intel
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize).
-  ! iorder in input should be (1,2,3,...,isize), and in output
-  ! contains the new order of the elements.
-  END_DOC
-  integer,intent(in)             :: isize
-  $type,intent(inout)            :: x(isize)
-  integer,intent(inout)          :: iorder(isize)
-  integer                        :: n
-  character, allocatable         :: tmp(:)
-  if (isize < 2) return
-  call ippsSortRadixIndexGetBufferSize(isize, $ippsz, n)
-  allocate(tmp(n))
-  call ippsSortRadixIndexAscend_$ityp(x, $n, iorder, isize, tmp)
-  deallocate(tmp)
-  iorder(1:isize) = iorder(1:isize)+1
-  call $Xset_order(x,iorder,isize)
- end
-
- subroutine $Xsort_noidx(x,isize)
-  use intel
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize).
-  ! iorder in input should be (1,2,3,...,isize), and in output
-  ! contains the new order of the elements.
-  END_DOC
-  integer,intent(in)             :: isize
-  $type,intent(inout)            :: x(isize)
-  integer                        :: n
-  character, allocatable         :: tmp(:)
-  if (isize < 2) return
-  call ippsSortRadixIndexGetBufferSize(isize, $ippsz, n)
-  allocate(tmp(n))
-  call ippsSortRadixAscend_$ityp_I(x, isize, tmp)
-  deallocate(tmp)
- end
-
-SUBST [ X, type, ityp, n, ippsz ]
-   ; real ; 32f ; 4 ; 13 ;;
- i ; integer ; 32s ; 4 ; 11 ;;
- i2 ; integer*2 ; 16s ; 2 ; 7 ;;
-END_TEMPLATE
-
-BEGIN_TEMPLATE
-
- subroutine $Xsort(x,iorder,isize)
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize).
-  ! iorder in input should be (1,2,3,...,isize), and in output
-  ! contains the new order of the elements.
-  END_DOC
-  integer,intent(in)             :: isize
-  $type,intent(inout)            :: x(isize)
-  integer,intent(inout)          :: iorder(isize)
-  integer                        :: n
-  if (isize < 2) then
-    return
-  endif
-!  call sorted_$Xnumber(x,isize,n)
-!  if (isize == n) then
-!    return
-!  endif
-  if ( isize < 32) then
-    call insertion_$Xsort(x,iorder,isize)
-  else
-!    call heap_$Xsort(x,iorder,isize)
-    call quick_$Xsort(x,iorder,isize)
-  endif
- end subroutine $Xsort
-
-SUBST [ X, type ]
- d ; double precision ;;
-END_TEMPLATE
-
-BEGIN_TEMPLATE
-
- subroutine $Xsort(x,iorder,isize)
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize).
-  ! iorder in input should be (1,2,3,...,isize), and in output
-  ! contains the new order of the elements.
-  END_DOC
-  integer,intent(in)             :: isize
-  $type,intent(inout)            :: x(isize)
-  integer,intent(inout)          :: iorder(isize)
-  integer                        :: n
-  if (isize < 2) then
-    return
-  endif
-  call sorted_$Xnumber(x,isize,n)
-  if (isize == n) then
-    return
-  endif
-  if ( isize < 32) then
-    call insertion_$Xsort(x,iorder,isize)
-  else
-!    call $Xradix_sort(x,iorder,isize,-1)
-    call quick_$Xsort(x,iorder,isize)
-  endif
- end subroutine $Xsort
-
-SUBST [ X, type ]
- i8 ; integer*8 ;;
-END_TEMPLATE
-
-!---------------------- END INTEL
-IRP_ELSE
-!---------------------- NON-INTEL
-BEGIN_TEMPLATE
-
- subroutine $Xsort_noidx(x,isize)
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize).
-  END_DOC
-  integer,intent(in)             :: isize
-  $type,intent(inout)            :: x(isize)
-  integer, allocatable           :: iorder(:)
-  integer                        :: i
-  allocate(iorder(isize))
-  do i=1,isize
-   iorder(i)=i
-  enddo
-  call $Xsort(x,iorder,isize)
-  deallocate(iorder)
- end subroutine $Xsort_noidx
-
-SUBST [ X, type ]
-   ; real ;;
- d ; double precision ;;
- i ; integer ;;
- i8 ; integer*8 ;;
- i2 ; integer*2 ;;
-END_TEMPLATE
-
-BEGIN_TEMPLATE
-
- subroutine $Xsort(x,iorder,isize)
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize).
-  ! iorder in input should be (1,2,3,...,isize), and in output
-  ! contains the new order of the elements.
-  END_DOC
-  integer,intent(in)             :: isize
-  $type,intent(inout)            :: x(isize)
-  integer,intent(inout)          :: iorder(isize)
-  integer                        :: n
-  if (isize < 2) then
-    return
-  endif
-!  call sorted_$Xnumber(x,isize,n)
-!  if (isize == n) then
-!    return
-!  endif
-  if ( isize < 32) then
-    call insertion_$Xsort(x,iorder,isize)
-  else
-!    call heap_$Xsort(x,iorder,isize)
-    call quick_$Xsort(x,iorder,isize)
-  endif
- end subroutine $Xsort
-
-SUBST [ X, type ]
-   ; real ;;
- d ; double precision ;;
-END_TEMPLATE
-
-BEGIN_TEMPLATE
-
- subroutine $Xsort(x,iorder,isize)
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize).
-  ! iorder in input should be (1,2,3,...,isize), and in output
-  ! contains the new order of the elements.
-  END_DOC
-  integer,intent(in)             :: isize
-  $type,intent(inout)            :: x(isize)
-  integer,intent(inout)          :: iorder(isize)
-  integer                        :: n
-  if (isize < 2) then
-    return
-  endif
-  call sorted_$Xnumber(x,isize,n)
-  if (isize == n) then
-    return
-  endif
-  if ( isize < 32) then
-    call insertion_$Xsort(x,iorder,isize)
-  else
-!    call $Xradix_sort(x,iorder,isize,-1)
-    call quick_$Xsort(x,iorder,isize)
-  endif
- end subroutine $Xsort
-
-SUBST [ X, type ]
- i ; integer ;;
- i8 ; integer*8 ;;
- i2 ; integer*2 ;;
-END_TEMPLATE
-
-IRP_ENDIF
-!---------------------- END NON-INTEL
-
-

 BEGIN_TEMPLATE
 subroutine $Xset_order(x,iorder,isize)
@ -491,47 +57,6 @@ BEGIN_TEMPLATE
  deallocate(xtmp)
 end

-SUBST [ X, type ]
-   ; real ;;
- d ; double precision ;;
- i ; integer ;;
- i8; integer*8 ;;
- i2; integer*2 ;;
-END_TEMPLATE
-
-
-BEGIN_TEMPLATE
- subroutine insertion_$Xsort_big (x,iorder,isize)
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize) using the insertion sort algorithm.
-  ! iorder in input should be (1,2,3,...,isize), and in output
-  ! contains the new order of the elements.
-  ! This is a version for very large arrays where the indices need
-  ! to be in integer*8 format
-  END_DOC
-  integer*8,intent(in)           :: isize
-  $type,intent(inout)            :: x(isize)
-  integer*8,intent(inout)        :: iorder(isize)
-  $type                          :: xtmp
-  integer*8                      :: i, i0, j, jmax
-
-  do i=2_8,isize
-    xtmp = x(i)
-    i0 = iorder(i)
-    j = i-1_8
-    do while (j>0_8)
-      if (x(j)<=xtmp) exit
-      x(j+1_8) = x(j)
-      iorder(j+1_8) = iorder(j)
-      j = j-1_8
-    enddo
-    x(j+1_8) = xtmp
-    iorder(j+1_8) = i0
-  enddo
-
- end subroutine insertion_$Xsort_big
-
 subroutine $Xset_order_big(x,iorder,isize)
  implicit none
  BEGIN_DOC
@ -565,223 +90,3 @@ SUBST [ X, type ]
 END_TEMPLATE


-BEGIN_TEMPLATE
-
-recursive subroutine $Xradix_sort$big(x,iorder,isize,iradix)
-  implicit none
-
-  BEGIN_DOC
-  ! Sort integer array x(isize) using the radix sort algorithm.
-  ! iorder in input should be (1,2,3,...,isize), and in output
-  ! contains the new order of the elements.
-  ! iradix should be -1 in input.
-  END_DOC
-  integer*$int_type, intent(in)  :: isize
-  integer*$int_type, intent(inout) :: iorder(isize)
-  integer*$type, intent(inout)   :: x(isize)
-  integer, intent(in)            :: iradix
-  integer                        :: iradix_new
-  integer*$type, allocatable     :: x2(:), x1(:)
-  integer*$type                  :: i4               ! data type
-  integer*$int_type, allocatable :: iorder1(:),iorder2(:)
-  integer*$int_type              :: i0, i1, i2, i3, i ! index type
-  integer*$type                  :: mask
-  integer                        :: err
-  !DIR$ ATTRIBUTES ALIGN : 128   :: iorder1,iorder2, x2, x1
-
-  if (isize < 2) then
-    return
-  endif
-
-  if (iradix == -1) then ! Sort Positive and negative
-
-    allocate(x1(isize),iorder1(isize), x2(isize),iorder2(isize),stat=err)
-    if (err /= 0) then
-      print *,  irp_here, ': Unable to allocate arrays'
-      stop
-    endif
-
-    i1=1_$int_type
-    i2=1_$int_type
-    do i=1_$int_type,isize
-      if (x(i) < 0_$type) then
-        iorder1(i1) = iorder(i)
-        x1(i1) = -x(i)
-        i1 = i1+1_$int_type
-      else
-        iorder2(i2) = iorder(i)
-        x2(i2) = x(i)
-        i2 = i2+1_$int_type
-      endif
-    enddo
-    i1=i1-1_$int_type
-    i2=i2-1_$int_type
-
-    do i=1_$int_type,i2
-      iorder(i1+i) = iorder2(i)
-      x(i1+i) = x2(i)
-    enddo
-    deallocate(x2,iorder2,stat=err)
-    if (err /= 0) then
-      print *,  irp_here, ': Unable to deallocate arrays x2, iorder2'
-      stop
-    endif
-
-
-    if (i1 > 1_$int_type) then
-      call $Xradix_sort$big(x1,iorder1,i1,-2)
-      do i=1_$int_type,i1
-        x(i) = -x1(1_$int_type+i1-i)
-        iorder(i) = iorder1(1_$int_type+i1-i)
-      enddo
-    endif
-
-    if (i2>1_$int_type) then
-      call $Xradix_sort$big(x(i1+1_$int_type),iorder(i1+1_$int_type),i2,-2)
-    endif
-
-    deallocate(x1,iorder1,stat=err)
-    if (err /= 0) then
-      print *,  irp_here, ': Unable to deallocate arrays x1, iorder1'
-      stop
-    endif
-    return
-
-  else if (iradix == -2) then ! Positive
-
-    ! Find most significant bit
-
-    i0 = 0_$int_type
-    i4 = maxval(x)
-
-    iradix_new = max($integer_size-1-leadz(i4),1)
-    mask = ibset(0_$type,iradix_new)
-
-    allocate(x1(isize),iorder1(isize), x2(isize),iorder2(isize),stat=err)
-    if (err /= 0) then
-      print *,  irp_here, ': Unable to allocate arrays'
-      stop
-    endif
-
-    i1=1_$int_type
-    i2=1_$int_type
-
-    do i=1_$int_type,isize
-      if (iand(mask,x(i)) == 0_$type) then
-        iorder1(i1) = iorder(i)
-        x1(i1) = x(i)
-        i1 = i1+1_$int_type
-      else
-        iorder2(i2) = iorder(i)
-        x2(i2) = x(i)
-        i2 = i2+1_$int_type
-      endif
-    enddo
-    i1=i1-1_$int_type
-    i2=i2-1_$int_type
-
-    do i=1_$int_type,i1
-      iorder(i0+i) = iorder1(i)
-      x(i0+i) = x1(i)
-    enddo
-    i0 = i0+i1
-    i3 = i0
-    deallocate(x1,iorder1,stat=err)
-    if (err /= 0) then
-      print *,  irp_here, ': Unable to deallocate arrays x1, iorder1'
-      stop
-    endif
-
-
-    do i=1_$int_type,i2
-      iorder(i0+i) = iorder2(i)
-      x(i0+i) = x2(i)
-    enddo
-    i0 = i0+i2
-    deallocate(x2,iorder2,stat=err)
-    if (err /= 0) then
-      print *,  irp_here, ': Unable to deallocate arrays x2, iorder2'
-      stop
-    endif
-
-
-    if (i3>1_$int_type) then
-      call $Xradix_sort$big(x,iorder,i3,iradix_new-1)
-    endif
-
-    if (isize-i3>1_$int_type) then
-      call $Xradix_sort$big(x(i3+1_$int_type),iorder(i3+1_$int_type),isize-i3,iradix_new-1)
-    endif
-
-    return
-  endif
-
-  ASSERT (iradix >= 0)
-
-  if (isize < 48) then
-    call insertion_$Xsort$big(x,iorder,isize)
-    return
-  endif
-
-
-  allocate(x2(isize),iorder2(isize),stat=err)
-  if (err /= 0) then
-    print *,  irp_here, ': Unable to allocate arrays x1, iorder1'
-    stop
-  endif
-
-
-  mask = ibset(0_$type,iradix)
-  i0=1_$int_type
-  i1=1_$int_type
-
-  do i=1_$int_type,isize
-    if (iand(mask,x(i)) == 0_$type) then
-      iorder(i0) = iorder(i)
-      x(i0) = x(i)
-      i0 = i0+1_$int_type
-    else
-      iorder2(i1) = iorder(i)
-      x2(i1) = x(i)
-      i1 = i1+1_$int_type
-    endif
-  enddo
-  i0=i0-1_$int_type
-  i1=i1-1_$int_type
-
-  do i=1_$int_type,i1
-    iorder(i0+i) = iorder2(i)
-    x(i0+i) = x2(i)
-  enddo
-
-  deallocate(x2,iorder2,stat=err)
-  if (err /= 0) then
-    print *,  irp_here, ': Unable to allocate arrays x2, iorder2'
-    stop
-  endif
-
-
-  if (iradix == 0) then
-    return
-  endif
-
-
-  if (i1>1_$int_type) then
-    call $Xradix_sort$big(x(i0+1_$int_type),iorder(i0+1_$int_type),i1,iradix-1)
-  endif
-  if (i0>1) then
-    call $Xradix_sort$big(x,iorder,i0,iradix-1)
-  endif
-
- end
-
-SUBST [ X, type, integer_size, is_big, big, int_type ]
- i  ; 4 ; 32 ; .False. ;      ; 4 ;;
- i8 ; 8 ; 64 ; .False. ;      ; 4 ;;
- i2 ; 2 ; 16 ; .False. ;      ; 4 ;;
- i  ; 4 ; 32 ; .True.  ; _big ; 8 ;;
- i8 ; 8 ; 64 ; .True.  ; _big ; 8 ;;
-END_TEMPLATE
-
-
-