Merge pull request #17 from QuantumPackage/dev-stable-tc-scf

Dev stable tc scf
2024-12-31 07:35:38 +01:00 · 2023-06-02 20:28:36 +02:00 · 2023-06-02 20:28:36 +02:00 · 6e00c869c8
commit 6e00c869c8
parent 6d01eb42ca bf1ee8f8fa
65 changed files with 3055 additions and 1202 deletions
--- a/2
+++ b/2
@ -215,7 +215,6 @@ EOF
              cd trexio-${VERSION}
              ./configure --prefix=\${QP_ROOT} --without-hdf5
              make -j 8 && make -j 8 check && make -j 8 install
              cp ${QP_ROOT}/include/trexio_f.f90 ${QP_ROOT}/src/ezfio_files
              tar -zxvf "\${QP_ROOT}"/external/qp2-dependencies/${ARCHITECTURE}/ninja.tar.gz 
              mv ninja "\${QP_ROOT}"/bin/
 EOF
@ -229,7 +228,6 @@ EOF
              cd trexio-${VERSION}
              ./configure --prefix=\${QP_ROOT}
              make -j 8 && make -j 8 check && make -j 8 install
              cp ${QP_ROOT}/include/trexio_f.f90 ${QP_ROOT}/src/ezfio_files
 EOF
--- a/ocaml/Input_ao_basis.ml
+++ b/ocaml/Input_ao_basis.ml
@ -44,8 +44,12 @@ end = struct
  let get_default = Qpackage.get_ezfio_default "ao_basis";;
  let read_ao_basis () =
    let result =
      Ezfio.get_ao_basis_ao_basis ()
-    |> AO_basis_name.of_string
+    in
    if result <> "None" then
      AO_basis_name.of_string result
    else failwith "No basis"
  ;;
  let read_ao_num () =
@ -267,7 +271,10 @@ end = struct
        |> Ezfio.set_ao_basis_ao_md5 ;
        Some result
      with
-      | _ -> (Ezfio.set_ao_basis_ao_md5 "None" ; None)
+      | _ -> ( "None"
               |> Digest.string
               |> Digest.to_hex
               |> Ezfio.set_ao_basis_ao_md5 ; None)
  ;;
--- a/ocaml/Input_mo_basis.ml
+++ b/ocaml/Input_mo_basis.ml
@ -56,7 +56,10 @@ end = struct
  let read_ao_md5 () =
    let ao_md5 =
      match (Input_ao_basis.Ao_basis.read ()) with
-      | None -> failwith "Unable to read AO basis"
+      | None -> ("None"
                 |> Digest.string
                 |> Digest.to_hex
                 |> MD5.of_string)
      | Some result -> Input_ao_basis.Ao_basis.to_md5 result
    in
    let result =
--- a/src/trexio/qp_import_trexio.py
+++ b/src/trexio/qp_import_trexio.py
@ -13,12 +13,17 @@ Options:
 import sys
 import os
 import trexio
 import numpy as np
 from functools import reduce
 from ezfio import ezfio
 from docopt import docopt
 try:
  import trexio
 except ImportError:
    print("Error: trexio python module is not found. Try python3 -m pip install trexio")
    sys.exit(1)
 try:
    QP_ROOT = os.environ["QP_ROOT"]
@ -90,14 +95,15 @@ def write_ezfio(trexio_filename, filename):
        p = re.compile(r'(\d*)$')
        label = [p.sub("", x).capitalize() for x in label]
        ezfio.set_nuclei_nucl_label(label)
        print("OK")
    else:
        ezfio.set_nuclei_nucl_num(1)
        ezfio.set_nuclei_nucl_charge([0.])
        ezfio.set_nuclei_nucl_coord([0.,0.,0.])
        ezfio.set_nuclei_nucl_label(["X"])
        print("None")
    print("OK")
    print("Electrons\t...\t", end=' ')
@ -105,12 +111,12 @@ def write_ezfio(trexio_filename, filename):
    try:
        num_beta = trexio.read_electron_dn_num(trexio_file)
    except:
-        num_beta = sum(charge)//2
+        num_beta = int(sum(charge))//2
    try:
        num_alpha = trexio.read_electron_up_num(trexio_file)
    except:
-        num_alpha = sum(charge) - num_beta
+        num_alpha = int(sum(charge)) - num_beta
    if num_alpha == 0:
        print("\n\nError: There are zero electrons in the TREXIO file.\n\n")
@ -118,7 +124,7 @@ def write_ezfio(trexio_filename, filename):
    ezfio.set_electrons_elec_alpha_num(num_alpha)
    ezfio.set_electrons_elec_beta_num(num_beta)
-    print("OK")
+    print(f"{num_alpha} {num_beta}")
    print("Basis\t\t...\t", end=' ')
@ -126,9 +132,7 @@ def write_ezfio(trexio_filename, filename):
    try:
        basis_type = trexio.read_basis_type(trexio_file)
-        if basis_type.lower() not in ["gaussian", "slater"]:
+        if basis_type.lower() in ["gaussian", "slater"]:
            raise TypeError
            shell_num   = trexio.read_basis_shell_num(trexio_file)
            prim_num    = trexio.read_basis_prim_num(trexio_file)
            ang_mom     = trexio.read_basis_shell_ang_mom(trexio_file)
@ -139,6 +143,7 @@ def write_ezfio(trexio_filename, filename):
            ao_shell    = trexio.read_ao_shell(trexio_file)
            ezfio.set_basis_basis("Read from TREXIO")
            ezfio.set_ao_basis_ao_basis("Read from TREXIO")
            ezfio.set_basis_shell_num(shell_num)
            ezfio.set_basis_prim_num(prim_num)
            ezfio.set_basis_shell_ang_mom(ang_mom)
@ -179,7 +184,61 @@ def write_ezfio(trexio_filename, filename):
            shell_factor = trexio.read_basis_shell_factor(trexio_file)
            prim_factor  = trexio.read_basis_prim_factor(trexio_file)
-        print("OK")
+        elif basis_type.lower() == "numerical":
            shell_num   = trexio.read_basis_shell_num(trexio_file)
            prim_num    = shell_num
            ang_mom     = trexio.read_basis_shell_ang_mom(trexio_file)
            nucl_index  = trexio.read_basis_nucleus_index(trexio_file)
            exponent    = [1.]*prim_num
            coefficient = [1.]*prim_num
            shell_index = [i for i in range(shell_num)]
            ao_shell    = trexio.read_ao_shell(trexio_file)
            ezfio.set_basis_basis("None")
            ezfio.set_ao_basis_ao_basis("None")
            ezfio.set_basis_shell_num(shell_num)
            ezfio.set_basis_prim_num(prim_num)
            ezfio.set_basis_shell_ang_mom(ang_mom)
            ezfio.set_basis_basis_nucleus_index([ x+1 for x in nucl_index ])
            ezfio.set_basis_prim_expo(exponent)
            ezfio.set_basis_prim_coef(coefficient)
            nucl_shell_num = []
            prev = None
            m = 0
            for i in ao_shell:
                if i != prev:
                   m += 1
                   if prev is None or nucl_index[i] != nucl_index[prev]:
                        nucl_shell_num.append(m)
                        m = 0
                prev = i
            assert (len(nucl_shell_num) == nucl_num)
            shell_prim_num = []
            prev = shell_index[0]
            count = 0
            for i in shell_index:
                if i != prev:
                   shell_prim_num.append(count)
                   count = 0
                count += 1
                prev = i
            shell_prim_num.append(count)
            assert (len(shell_prim_num) == shell_num)
            ezfio.set_basis_shell_prim_num(shell_prim_num)
            ezfio.set_basis_shell_index([x+1 for x in shell_index])
            ezfio.set_basis_nucleus_shell_num(nucl_shell_num)
            shell_factor = trexio.read_basis_shell_factor(trexio_file)
            prim_factor  = [1.]*prim_num
        else:
           raise TypeError
        print(basis_type)
    except:
        print("None")
        ezfio.set_ao_basis_ao_cartesian(True)
@ -256,10 +315,12 @@ def write_ezfio(trexio_filename, filename):
 #        ezfio.set_ao_basis_ao_prim_num_max(prim_num_max)
        ezfio.set_ao_basis_ao_coef(coef)
        ezfio.set_ao_basis_ao_expo(expo)
        ezfio.set_ao_basis_ao_basis("Read from TREXIO")
        print("OK")
    else:
        print("None")
    #                _
    # |\/|  _   _   |_)  _.  _ o  _
@ -279,6 +340,7 @@ def write_ezfio(trexio_filename, filename):
    except:
      label = "None"
    ezfio.set_mo_basis_mo_label(label)
    ezfio.set_determinants_mo_label(label)
    try:
      clss = trexio.read_mo_class(trexio_file)
@ -303,10 +365,10 @@ def write_ezfio(trexio_filename, filename):
      for i in range(num_beta):
         mo_occ[i] += 1.
      ezfio.set_mo_basis_mo_occ(mo_occ)
    except:
      pass
      print("OK")
    except:
      print("None")
    print("Pseudos\t\t...\t", end=' ')
@ -386,10 +448,11 @@ def write_ezfio(trexio_filename, filename):
        ezfio.set_pseudo_pseudo_n_kl(pseudo_n_kl)
        ezfio.set_pseudo_pseudo_v_kl(pseudo_v_kl)
        ezfio.set_pseudo_pseudo_dz_kl(pseudo_dz_kl)
        print("OK")
    else:
        print("None")
--- a/src/ao_basis/EZFIO.cfg
+++ b/src/ao_basis/EZFIO.cfg
@ -67,3 +67,15 @@ doc: Use normalized primitive functions
 interface: ezfio, provider
 default: true
 [ao_expoim_cosgtos]
 type: double precision
 doc: Imaginary part of the exponents for each primitive of each cosGTO |AO|
 size: (ao_basis.ao_num,ao_basis.ao_prim_num_max)
 interface: ezfio, provider
 [use_cosgtos]
 type: logical
 doc: If true, use cosgtos for AO integrals
 interface: ezfio
 default: False
--- a/src/ao_basis/cosgtos.irp.f
+++ b/src/ao_basis/cosgtos.irp.f
@ -0,0 +1,33 @@
 BEGIN_PROVIDER [ logical, use_cosgtos  ]
  implicit none
  BEGIN_DOC
 ! If true, use cosgtos for AO integrals
  END_DOC
  logical                        :: has
  PROVIDE ezfio_filename
 ! Only the process for which mpi_master is true queries the EZFIO file.
 ! When the ao_basis/use_cosgtos entry is absent, the value falls back to .False.
  if (mpi_master) then
    call ezfio_has_ao_basis_use_cosgtos(has)
    if (has) then
 !      write(6,'(A)') '.. >>>>> [ IO READ: use_cosgtos ] <<<<< ..'
      call ezfio_get_ao_basis_use_cosgtos(use_cosgtos)
    else
      use_cosgtos = .False.
    endif
  endif
 ! Section kept only when MPI_DEBUG is defined: print the provider location and
 ! the rank, then wait on a barrier.
 ! NOTE(review): ierr is used here but declared only inside the IRP_IF MPI
 ! section below -- presumably MPI_DEBUG is never defined without MPI; confirm.
  IRP_IF MPI_DEBUG
    print *,  irp_here, mpi_rank
    call MPI_BARRIER(MPI_COMM_WORLD, ierr)
  IRP_ENDIF
 ! Section kept only when MPI is defined: broadcast the value held by rank 0
 ! (the broadcast root) to every rank of MPI_COMM_WORLD and stop on failure.
 ! NOTE(review): presumably mpi_master corresponds to rank 0 -- verify.
  IRP_IF MPI
    include 'mpif.h'
    integer :: ierr
    call MPI_BCAST( use_cosgtos, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
    if (ierr /= MPI_SUCCESS) then
      stop 'Unable to read use_cosgtos with MPI'
    endif
  IRP_ENDIF
 !  call write_time(6)
 END_PROVIDER
--- a/src/ao_one_e_ints/NEED
+++ b/src/ao_one_e_ints/NEED
@ -1,3 +1,2 @@
 ao_basis
 pseudo
 cosgtos_ao_int
--- a/src/cosgtos_ao_int/aos_cosgtos.irp.f
+++ b/src/cosgtos_ao_int/aos_cosgtos.irp.f
--- a/src/ao_one_e_ints/one_e_Coul_integrals_cosgtos.irp.f
+++ b/src/ao_one_e_ints/one_e_Coul_integrals_cosgtos.irp.f
--- a/src/ao_one_e_ints/one_e_kin_integrals_cosgtos.irp.f
+++ b/src/ao_one_e_ints/one_e_kin_integrals_cosgtos.irp.f
--- a/src/ao_one_e_ints/pot_ao_ints.irp.f
+++ b/src/ao_one_e_ints/pot_ao_ints.irp.f
@ -455,10 +455,12 @@ recursive subroutine I_x1_pol_mult_one_e(a,c,R1x,R1xp,R2x,d,nd,n_pt_in)
    do ix=0,nx
      X(ix) *= dble(c)
    enddo
-    call multiply_poly(X,nx,R2x,2,d,nd)
+!    call multiply_poly(X,nx,R2x,2,d,nd)
    call multiply_poly_c2(X,nx,R2x,d,nd)
    ny=0
    call I_x2_pol_mult_one_e(c,R1x,R1xp,R2x,Y,ny,n_pt_in)
-    call multiply_poly(Y,ny,R1x,2,d,nd)
+!    call multiply_poly(Y,ny,R1x,2,d,nd)
    call multiply_poly_c2(Y,ny,R1x,d,nd)
  else
    do ix=0,n_pt_in
      X(ix) = 0.d0
@ -469,7 +471,8 @@ recursive subroutine I_x1_pol_mult_one_e(a,c,R1x,R1xp,R2x,d,nd,n_pt_in)
    do ix=0,nx
      X(ix) *= dble(a-1)
    enddo
-    call multiply_poly(X,nx,R2x,2,d,nd)
+!    call multiply_poly(X,nx,R2x,2,d,nd)
    call multiply_poly_c2(X,nx,R2x,d,nd)
    nx = nd
    do ix=0,n_pt_in
@ -479,10 +482,12 @@ recursive subroutine I_x1_pol_mult_one_e(a,c,R1x,R1xp,R2x,d,nd,n_pt_in)
    do ix=0,nx
      X(ix) *= dble(c)
    enddo
-    call multiply_poly(X,nx,R2x,2,d,nd)
+!    call multiply_poly(X,nx,R2x,2,d,nd)
    call multiply_poly_c2(X,nx,R2x,d,nd)
    ny=0
    call I_x1_pol_mult_one_e(a-1,c,R1x,R1xp,R2x,Y,ny,n_pt_in)
-    call multiply_poly(Y,ny,R1x,2,d,nd)
+!    call multiply_poly(Y,ny,R1x,2,d,nd)
    call multiply_poly_c2(Y,ny,R1x,d,nd)
  endif
 end
@ -519,7 +524,8 @@ recursive subroutine I_x2_pol_mult_one_e(c,R1x,R1xp,R2x,d,nd,dim)
    do ix=0,nx
      X(ix) *= dble(c-1)
    enddo
-    call multiply_poly(X,nx,R2x,2,d,nd)
+!    call multiply_poly(X,nx,R2x,2,d,nd)
    call multiply_poly_c2(X,nx,R2x,d,nd)
    ny = 0
    do ix=0,dim
      Y(ix) = 0.d0
@ -527,7 +533,8 @@ recursive subroutine I_x2_pol_mult_one_e(c,R1x,R1xp,R2x,d,nd,dim)
    call I_x1_pol_mult_one_e(0,c-1,R1x,R1xp,R2x,Y,ny,dim)
    if(ny.ge.0)then
-      call multiply_poly(Y,ny,R1xp,2,d,nd)
+!      call multiply_poly(Y,ny,R1xp,2,d,nd)
      call multiply_poly_c2(Y,ny,R1xp,d,nd)
    endif
  endif
 end
--- a/src/ao_two_e_ints/EZFIO.cfg
+++ b/src/ao_two_e_ints/EZFIO.cfg
@ -4,6 +4,19 @@ doc: Read/Write |AO| integrals from/to disk [ Write | Read | None ]
 interface: ezfio,provider,ocaml
 default: None
 [ao_integrals_threshold]
 type: Threshold
 doc: If | (pq|rs) | < `ao_integrals_threshold` then (pq|rs) is zero
 interface: ezfio,provider,ocaml
 default: 1.e-15
 ezfio_name: threshold_ao
 [ao_cholesky_threshold]
 type: Threshold
 doc: If | (ii|jj) | < `ao_cholesky_threshold` then (ii|jj) is zero
 interface: ezfio,provider,ocaml
 default: 1.e-12
 [do_direct_integrals]
 type: logical
 doc: Compute integrals on the fly (very slow, only for debugging)
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@ -4,29 +4,7 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num_guess ]
 ! Number of Cholesky vectors in AO basis
 END_DOC
- integer :: i,j,k,l
+ cholesky_ao_num_guess = ao_num*ao_num / 2
 double precision :: xnorm0, x, integral
 double precision, external :: ao_two_e_integral
 cholesky_ao_num_guess = 0
 xnorm0 = 0.d0
 x = 0.d0
 do j=1,ao_num
   do i=1,ao_num
     integral = ao_two_e_integral(i,i,j,j)
     if (integral > ao_integrals_threshold) then
       cholesky_ao_num_guess += 1
     else
       x += integral
     endif
   enddo
 enddo
 print *, 'Cholesky decomposition of AO integrals'
 print *, '--------------------------------------'
 print *, ''
 print *, 'Estimated Error: ', x
 print *, 'Guess size: ', cholesky_ao_num_guess, '(', 100.d0*dble(cholesky_ao_num_guess)/dble(ao_num*ao_num), ' %)'
 END_PROVIDER
 BEGIN_PROVIDER [ integer, cholesky_ao_num ]
@ -39,7 +17,7 @@ END_PROVIDER
 END_DOC
 type(c_ptr) :: ptr
- integer :: fd, i,j,k,l, rank
+ integer :: fd, i,j,k,l,m,rank
 double precision, pointer :: ao_integrals(:,:,:,:)
 double precision, external :: ao_two_e_integral
@ -49,28 +27,90 @@ END_PROVIDER
   8, fd, .False., ptr)
 call c_f_pointer(ptr, ao_integrals, (/ao_num, ao_num, ao_num, ao_num/))
- double precision :: integral
+ print*, 'Providing the AO integrals (Cholesky)'
 call wall_time(wall_1)
 call cpu_time(cpu_1)
 ao_integrals = 0.d0
 double precision :: integral, cpu_1, cpu_2, wall_1, wall_2
 logical, external :: ao_two_e_integral_zero
- !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i,j,k,l, integral) SCHEDULE(dynamic)
+  double precision, external :: get_ao_two_e_integral
- do l=1,ao_num
+
 if (read_ao_two_e_integrals) then
   PROVIDE ao_two_e_integrals_in_map
   !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,j,k,l, integral, wall_2)
   do m=0,9
     do l=1+m,ao_num,10
       !$OMP DO SCHEDULE(dynamic)
       do j=1,l
         do k=1,ao_num
-    do i=1,k
+           do i=1,min(k,j)
             if (ao_two_e_integral_zero(i,j,k,l)) cycle
             integral = get_ao_two_e_integral(i,j,k,l, ao_integrals_map)
             ao_integrals(i,k,j,l) = integral
             ao_integrals(k,i,j,l) = integral
             ao_integrals(i,k,l,j) = integral
             ao_integrals(k,i,l,j) = integral
             ao_integrals(j,l,i,k) = integral
             ao_integrals(j,l,k,i) = integral
             ao_integrals(l,j,i,k) = integral
             ao_integrals(l,j,k,i) = integral
           enddo
         enddo
       enddo
       !$OMP END DO NOWAIT
     enddo
     !$OMP MASTER
     call wall_time(wall_2)
     print '(I10,'' %  in'', 4X, F10.2, '' s.'')', (m+1) * 10, wall_2-wall_1
     !$OMP END MASTER
   enddo
   !$OMP END PARALLEL
 else
   !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,j,k,l, integral, wall_2)
   do m=0,9
     do l=1+m,ao_num,10
       !$OMP DO SCHEDULE(dynamic)
       do j=1,l
         do k=1,ao_num
           do i=1,min(k,j)
             if (ao_two_e_integral_zero(i,j,k,l)) cycle
             integral = ao_two_e_integral(i,k,j,l)
             ao_integrals(i,k,j,l) = integral
             ao_integrals(k,i,j,l) = integral
             ao_integrals(i,k,l,j) = integral
             ao_integrals(k,i,l,j) = integral
             ao_integrals(j,l,i,k) = integral
             ao_integrals(j,l,k,i) = integral
             ao_integrals(l,j,i,k) = integral
             ao_integrals(l,j,k,i) = integral
           enddo
         enddo
       enddo
       !$OMP END DO NOWAIT
     enddo
- !$OMP END PARALLEL DO
+     !$OMP MASTER
     call wall_time(wall_2)
     print '(I10,'' %  in'', 4X, F10.2, '' s.'')', (m+1) * 10, wall_2-wall_1
     !$OMP END MASTER
   enddo
   !$OMP END PARALLEL
   call wall_time(wall_2)
   call cpu_time(cpu_2)
   print*, 'AO integrals provided:'
   print*, ' cpu  time :',cpu_2 - cpu_1, 's'
   print*, ' wall time :',wall_2 - wall_1, 's  ( x ', (cpu_2-cpu_1)/(wall_2-wall_1+tiny(1.d0)), ' )'
 endif
 ! Call Lapack
 cholesky_ao_num = cholesky_ao_num_guess
- call pivoted_cholesky(ao_integrals, cholesky_ao_num, ao_integrals_threshold, ao_num*ao_num, cholesky_ao)
+ call pivoted_cholesky(ao_integrals, cholesky_ao_num, ao_cholesky_threshold, ao_num*ao_num, cholesky_ao)
 print *, 'Rank: ', cholesky_ao_num, '(', 100.d0*dble(cholesky_ao_num)/dble(ao_num*ao_num), ' %)'
 ! Remove mmap
--- a/src/cosgtos_ao_int/gauss_legendre.irp.f
+++ b/src/cosgtos_ao_int/gauss_legendre.irp.f
--- a/src/ao_two_e_ints/two_e_Coul_integrals_cosgtos.irp.f
+++ b/src/ao_two_e_ints/two_e_Coul_integrals_cosgtos.irp.f
--- a/src/ao_two_e_ints/two_e_integrals.irp.f
+++ b/src/ao_two_e_ints/two_e_integrals.irp.f
@ -590,8 +590,20 @@ double precision function general_primitive_integral(dim,            &
    d_poly(i)=0.d0
  enddo
-  !DIR$ FORCEINLINE
+!  call multiply_poly(Ix_pol,n_Ix,Iy_pol,n_Iy,d_poly,n_pt_tmp)
-  call multiply_poly(Ix_pol,n_Ix,Iy_pol,n_Iy,d_poly,n_pt_tmp)
+  integer :: ib, ic
  if (ior(n_Ix,n_Iy) >= 0) then
    do ib=0,n_Ix
      do ic = 0,n_Iy
        d_poly(ib+ic) = d_poly(ib+ic) + Iy_pol(ic) * Ix_pol(ib)
      enddo
    enddo
    do n_pt_tmp = n_Ix+n_Iy, 0, -1
      if (d_poly(n_pt_tmp) /= 0.d0) exit
    enddo
  endif
  if (n_pt_tmp == -1) then
    return
  endif
@ -600,8 +612,21 @@ double precision function general_primitive_integral(dim,            &
    d1(i)=0.d0
  enddo
-  !DIR$ FORCEINLINE
+!  call multiply_poly(d_poly ,n_pt_tmp ,Iz_pol,n_Iz,d1,n_pt_out)
-  call multiply_poly(d_poly ,n_pt_tmp ,Iz_pol,n_Iz,d1,n_pt_out)
+  if (ior(n_pt_tmp,n_Iz) >= 0) then
    ! Bottleneck here
    do ib=0,n_pt_tmp
      do ic = 0,n_Iz
        d1(ib+ic) = d1(ib+ic) + Iz_pol(ic) * d_poly(ib)
      enddo
    enddo
    do n_pt_out = n_pt_tmp+n_Iz, 0, -1
      if (d1(n_pt_out) /= 0.d0) exit
    enddo
  endif
  double precision               :: rint_sum
  accu = accu + rint_sum(n_pt_out,const,d1)
@ -948,8 +973,9 @@ recursive subroutine I_x1_pol_mult_recurs(a,c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt
    X(ix) *= dble(a-1)
  enddo
-  !DIR$ FORCEINLINE
+!  !DIR$ FORCEINLINE
-  call multiply_poly(X,nx,B_10,2,d,nd)
+!  call multiply_poly(X,nx,B_10,2,d,nd)
  call multiply_poly_c2(X,nx,B_10,d,nd)
  nx = nd
  !DIR$ LOOP COUNT(8)
@ -970,8 +996,9 @@ recursive subroutine I_x1_pol_mult_recurs(a,c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt
        X(ix) *= c
      enddo
    endif
-    !DIR$ FORCEINLINE
+!    !DIR$ FORCEINLINE
-    call multiply_poly(X,nx,B_00,2,d,nd)
+!    call multiply_poly(X,nx,B_00,2,d,nd)
    call multiply_poly_c2(X,nx,B_00,d,nd)
  endif
  ny=0
@ -988,9 +1015,9 @@ recursive subroutine I_x1_pol_mult_recurs(a,c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt
    call I_x1_pol_mult_recurs(a-1,c,B_10,B_01,B_00,C_00,D_00,Y,ny,n_pt_in)
  endif
-  !DIR$ FORCEINLINE
+!  !DIR$ FORCEINLINE
-  call multiply_poly(Y,ny,C_00,2,d,nd)
+!  call multiply_poly(Y,ny,C_00,2,d,nd)
-
+  call multiply_poly_c2(Y,ny,C_00,d,nd)
 end
 recursive subroutine I_x1_pol_mult_a1(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in)
@ -1028,8 +1055,9 @@ recursive subroutine I_x1_pol_mult_a1(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in)
    enddo
  endif
-  !DIR$ FORCEINLINE
+!  !DIR$ FORCEINLINE
-  call multiply_poly(X,nx,B_00,2,d,nd)
+!  call multiply_poly(X,nx,B_00,2,d,nd)
  call multiply_poly_c2(X,nx,B_00,d,nd)
  ny=0
@ -1039,8 +1067,9 @@ recursive subroutine I_x1_pol_mult_a1(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in)
  enddo
  call I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,Y,ny,n_pt_in)
-  !DIR$ FORCEINLINE
+!  !DIR$ FORCEINLINE
-  call multiply_poly(Y,ny,C_00,2,d,nd)
+!  call multiply_poly(Y,ny,C_00,2,d,nd)
  call multiply_poly_c2(Y,ny,C_00,d,nd)
 end
@ -1067,8 +1096,9 @@ recursive subroutine I_x1_pol_mult_a2(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in)
  nx = 0
  call I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,X,nx,n_pt_in)
-  !DIR$ FORCEINLINE
+!  !DIR$ FORCEINLINE
-  call multiply_poly(X,nx,B_10,2,d,nd)
+!  call multiply_poly(X,nx,B_10,2,d,nd)
  call multiply_poly_c2(X,nx,B_10,d,nd)
  nx = nd
  !DIR$ LOOP COUNT(8)
@ -1086,8 +1116,9 @@ recursive subroutine I_x1_pol_mult_a2(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in)
    enddo
  endif
-  !DIR$ FORCEINLINE
+!  !DIR$ FORCEINLINE
-  call multiply_poly(X,nx,B_00,2,d,nd)
+!  call multiply_poly(X,nx,B_00,2,d,nd)
  call multiply_poly_c2(X,nx,B_00,d,nd)
  ny=0
  !DIR$ LOOP COUNT(8)
@ -1097,9 +1128,9 @@ recursive subroutine I_x1_pol_mult_a2(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in)
  !DIR$ FORCEINLINE
  call I_x1_pol_mult_a1(c,B_10,B_01,B_00,C_00,D_00,Y,ny,n_pt_in)
-  !DIR$ FORCEINLINE
+!  !DIR$ FORCEINLINE
-  call multiply_poly(Y,ny,C_00,2,d,nd)
+!  call multiply_poly(Y,ny,C_00,2,d,nd)
-
+  call multiply_poly_c2(Y,ny,C_00,d,nd)
 end
 recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim)
@ -1146,8 +1177,10 @@ recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim)
      Y(1) = D_00(1)
      Y(2) = D_00(2)
-      !DIR$ FORCEINLINE
+!      !DIR$ FORCEINLINE
-      call multiply_poly(Y,ny,D_00,2,d,nd)
+!      call multiply_poly(Y,ny,D_00,2,d,nd)
      call multiply_poly_c2(Y,ny,D_00,d,nd)
      return
      case default
@ -1164,8 +1197,9 @@ recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim)
        X(ix) *= dble(c-1)
      enddo
-      !DIR$ FORCEINLINE
+!      !DIR$ FORCEINLINE
-      call multiply_poly(X,nx,B_01,2,d,nd)
+!      call multiply_poly(X,nx,B_01,2,d,nd)
      call multiply_poly_c2(X,nx,B_01,d,nd)
      ny = 0
      !DIR$ LOOP COUNT(6)
@ -1174,8 +1208,9 @@ recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim)
      enddo
      call I_x2_pol_mult(c-1,B_10,B_01,B_00,C_00,D_00,Y,ny,dim)
-      !DIR$ FORCEINLINE
+!      !DIR$ FORCEINLINE
-      call multiply_poly(Y,ny,D_00,2,d,nd)
+!      call multiply_poly(Y,ny,D_00,2,d,nd)
      call multiply_poly_c2(Y,ny,D_00,d,nd)
  end select
 end
@ -1233,3 +1268,34 @@ subroutine compute_ao_integrals_jl(j,l,n_integrals,buffer_i,buffer_value)
  enddo
 end
 subroutine multiply_poly_local(b,nb,c,nc,d,nd)
  implicit none
  BEGIN_DOC
  ! Multiply two polynomials
  ! D(t) += B(t)*C(t)
  !
  ! b, nb : coefficients b(0:nb) of B and its degree
  ! c, nc : coefficients c(0:nc) of C and its degree
  ! d     : coefficients d(0:nb+nc) of D, accumulated in place
  ! nd    : on exit, highest index of d holding a nonzero coefficient
  END_DOC
  integer, intent(in)            :: nb, nc
  integer, intent(out)           :: nd
  double precision, intent(in)   :: b(0:nb), c(0:nc)
  double precision, intent(inout) :: d(0:nb+nc)
  ! NOTE(review): ndtmp, id and k are declared but never used in this routine.
  integer                        :: ndtmp
  integer                        :: ib, ic, id, k
  ! ior(nc,nb) is negative when nb or nc is negative (sign bit set), i.e. when
  ! one of the operands is empty: there is nothing to accumulate.
  ! NOTE(review): nd is intent(out) but is not assigned on this early return.
  if(ior(nc,nb) < 0) return !False if nc>=0 and nb>=0
  ! Accumulate every product b(ib)*c(ic) into the coefficient of degree ib+ic.
  do ib=0,nb
    do ic = 0,nc
      d(ib+ic) = d(ib+ic) + c(ic) * b(ib)
    enddo
  enddo
  ! Scan downwards for the highest nonzero coefficient; if all of d(0:nb+nc)
  ! is zero the loop runs to completion and leaves nd = -1.
  do nd = nb+nc,0,-1
    if (d(nd) /= 0.d0) exit
  enddo
 end
--- a/src/bi_ort_ints/bi_ort_ints.irp.f
+++ b/src/bi_ort_ints/bi_ort_ints.irp.f
@ -7,7 +7,13 @@ program bi_ort_ints
  my_n_pt_r_grid = 10
  my_n_pt_a_grid = 14
  touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
- call test_3e
+! call test_3e
 call test_5idx
 !  call test_5idx2
 end
 subroutine test_5idx2
  ! Only requests the three_e_5_idx_cycle_2_bi_ort provider; performs no check.
  PROVIDE three_e_5_idx_cycle_2_bi_ort
 end
 subroutine test_3e
@ -16,6 +22,7 @@ subroutine test_3e
 double precision :: accu, contrib,new,ref
 i = 1
 k = 1
 n = 0
 accu = 0.d0
 do i = 1, mo_num
  do k = 1, mo_num
@ -31,6 +38,7 @@ subroutine test_3e
         print*,'pb !!'
         print*,i,k,j,l,m,n
         print*,ref,new,contrib
         stop
        endif
      enddo
     enddo
@ -42,3 +50,93 @@ subroutine test_3e
 end
 subroutine test_5idx
 ! Compares each three_e_5_idx_*_bi_ort tensor (direct, exch12, cycle_1,
 ! cycle_2, exch23, exch13) with its *_old counterpart for every index tuple
 ! (m,l,j,k,i) in 1..mo_num. Stops at the first element whose absolute
 ! difference exceeds 1.d-10, after printing which tensor, the indices and
 ! the two values. Otherwise prints the accumulated difference / mo_num**5.
 implicit none
 ! NOTE(review): n and ipoint are declared (and n is set) but never used here;
 ! the initial values of i and k are overwritten by the loops below.
 integer :: i,k,j,l,m,n,ipoint
 double precision :: accu, contrib,new,ref
 i = 1
 k = 1
 n = 0
 ! accu sums the absolute differences of all six tensors over all tuples.
 accu = 0.d0
 do i = 1, mo_num
  do k = 1, mo_num
   do j = 1, mo_num
    do l = 1, mo_num
     do m = 1, mo_num
 ! direct term
      new = three_e_5_idx_direct_bi_ort(m,l,j,k,i)
      ref = three_e_5_idx_direct_bi_ort_old(m,l,j,k,i)
      contrib = dabs(new - ref)
      accu += contrib
      if(contrib .gt. 1.d-10)then
       print*,'direct'
       print*,i,k,j,l,m
       print*,ref,new,contrib
       stop
      endif
 ! exch12 term
      new = three_e_5_idx_exch12_bi_ort(m,l,j,k,i)
      ref = three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i)
      contrib = dabs(new - ref)
      accu += contrib
      if(contrib .gt. 1.d-10)then
       print*,'exch12'
       print*,i,k,j,l,m
       print*,ref,new,contrib
       stop
      endif
 !
 ! cycle_1 term
      new = three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i)
      ref = three_e_5_idx_cycle_1_bi_ort_old(m,l,j,k,i)
      contrib = dabs(new - ref)
      accu += contrib
      if(contrib .gt. 1.d-10)then
       print*,'cycle1'
       print*,i,k,j,l,m
       print*,ref,new,contrib
       stop
      endif
 ! cycle_2 term
      new = three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i)
      ref = three_e_5_idx_cycle_2_bi_ort_old(m,l,j,k,i)
      contrib = dabs(new - ref)
      accu += contrib
      if(contrib .gt. 1.d-10)then
       print*,'cycle2'
       print*,i,k,j,l,m
       print*,ref,new,contrib
       stop
      endif
 ! exch23 term
      new = three_e_5_idx_exch23_bi_ort(m,l,j,k,i)
      ref = three_e_5_idx_exch23_bi_ort_old(m,l,j,k,i)
      contrib = dabs(new - ref)
      accu += contrib
      if(contrib .gt. 1.d-10)then
       print*,'exch23'
       print*,i,k,j,l,m
       print*,ref,new,contrib
       stop
      endif
 ! exch13 term
      new = three_e_5_idx_exch13_bi_ort(m,l,j,k,i)
      ref = three_e_5_idx_exch13_bi_ort_old(m,l,j,k,i)
      contrib = dabs(new - ref)
      accu += contrib
      if(contrib .gt. 1.d-10)then
       print*,'exch13'
       print*,i,k,j,l,m
       print*,ref,new,contrib
       stop
      endif
     enddo
    enddo
   enddo
  enddo
 enddo
 ! The sum covers six tensors per tuple but is divided by mo_num**5 only.
 print*,'accu = ',accu/dble(mo_num)**5
 end
--- a/src/bi_ort_ints/three_body_ijmkl.irp.f
+++ b/src/bi_ort_ints/three_body_ijmkl.irp.f
@ -1,7 +1,11 @@
 ! ---
 BEGIN_PROVIDER [ double precision, three_e_5_idx_direct_bi_ort , (mo_num, mo_num, mo_num, mo_num, mo_num)]
 &BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort , (mo_num, mo_num, mo_num, mo_num, mo_num)]
 &BEGIN_PROVIDER [ double precision, three_e_5_idx_exch23_bi_ort , (mo_num, mo_num, mo_num, mo_num, mo_num)]
 &BEGIN_PROVIDER [ double precision, three_e_5_idx_exch13_bi_ort , (mo_num, mo_num, mo_num, mo_num, mo_num)]
 &BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_1_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
 &BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_2_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
  BEGIN_DOC
  !
@ -14,289 +18,221 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_direct_bi_ort, (mo_num, mo_num,
  implicit none
  integer          :: i, j, k, m, l
-  double precision :: integral, wall1, wall0
+  double precision :: wall1, wall0
-
+  integer          :: ipoint
-  three_e_5_idx_direct_bi_ort = 0.d0
+  double precision, allocatable :: grad_mli(:,:,:), orb_mat(:,:,:)
-  print *, ' Providing the three_e_5_idx_direct_bi_ort ...'
+  double precision, allocatable :: lk_grad_mi(:,:,:,:), rk_grad_im(:,:,:,:)
-  call wall_time(wall0)
+  double precision, allocatable :: lm_grad_ik(:,:,:,:), rm_grad_ik(:,:,:,:)
  double precision, allocatable :: tmp_mat(:,:,:,:)
  allocate(tmp_mat(mo_num,mo_num,mo_num,mo_num))
  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
  print *, ' Providing the three_e_5_idx_bi_ort ...'
  call wall_time(wall0)
 do m = 1, mo_num
  allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
  allocate(orb_mat(n_points_final_grid,mo_num,mo_num))
 !$OMP PARALLEL                     &
 !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) & 
+ !$OMP PRIVATE (i,l,ipoint) &
- !$OMP SHARED (mo_num,three_e_5_idx_direct_bi_ort)
+ !$OMP SHARED (m,mo_num,n_points_final_grid, &
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+ !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
 !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
 !$OMP         grad_mli, orb_mat)
 !$OMP DO COLLAPSE(2)
  do i=1,mo_num
    do l=1,mo_num
       do ipoint=1, n_points_final_grid
         grad_mli(ipoint,l,i) = final_weight_at_r_vector(ipoint) * ( &
               int2_grad1_u12_bimo_t(ipoint,1,m,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) + &
               int2_grad1_u12_bimo_t(ipoint,2,m,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) + &
               int2_grad1_u12_bimo_t(ipoint,3,m,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i) )
         orb_mat(ipoint,l,i) = mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,i)
       enddo
    enddo
  enddo
  !$OMP END DO
  !$OMP END PARALLEL
  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, n_points_final_grid, 1.d0, &
      orb_mat, n_points_final_grid,  &
      grad_mli, n_points_final_grid,  0.d0, &
      tmp_mat, mo_num*mo_num)
  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
  do i = 1, mo_num
    do k = 1, mo_num
      do j = 1, mo_num
        do l = 1, mo_num
-          do m = 1, mo_num
+            three_e_5_idx_direct_bi_ort(m,l,j,k,i) = - tmp_mat(l,j,k,i) - tmp_mat(k,i,l,j)
-            call give_integrals_3_body_bi_ort(m, l, k, m, j, i, integral)
+            three_e_5_idx_exch12_bi_ort(m,l,j,k,i) = - tmp_mat(l,i,k,j) - tmp_mat(k,j,l,i)
            three_e_5_idx_direct_bi_ort(m,l,j,k,i) = -1.d0 * integral 
        enddo
      enddo
    enddo
  enddo
  !$OMP END PARALLEL DO
  deallocate(orb_mat,grad_mli)
  allocate(lm_grad_ik(n_points_final_grid,3,mo_num,mo_num))
  allocate(rm_grad_ik(n_points_final_grid,3,mo_num,mo_num))
  allocate(rk_grad_im(n_points_final_grid,3,mo_num,mo_num))
 !$OMP PARALLEL                     &
 !$OMP DEFAULT (NONE)               &
 !$OMP PRIVATE (i,l,ipoint) &
 !$OMP SHARED (m,mo_num,n_points_final_grid, &
 !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
 !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
 !$OMP         rm_grad_ik, lm_grad_ik, rk_grad_im, lk_grad_mi)
 !$OMP DO COLLAPSE(2)
  do i=1,mo_num
    do l=1,mo_num
       do ipoint=1, n_points_final_grid
         lm_grad_ik(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) * final_weight_at_r_vector(ipoint)
         lm_grad_ik(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) * final_weight_at_r_vector(ipoint)
         lm_grad_ik(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i) * final_weight_at_r_vector(ipoint)
         rm_grad_ik(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i)
         rm_grad_ik(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i)
         rm_grad_ik(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i)
         rk_grad_im(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,i,m)
         rk_grad_im(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,i,m)
         rk_grad_im(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,i,m)
       enddo
    enddo
  enddo
  !$OMP END DO
  !$OMP END PARALLEL
  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
      lm_grad_ik, 3*n_points_final_grid,  &
      rm_grad_ik, 3*n_points_final_grid,  0.d0, &
      tmp_mat, mo_num*mo_num)
  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
  do i = 1, mo_num
    do k = 1, mo_num
      do j = 1, mo_num
        do l = 1, mo_num
            three_e_5_idx_direct_bi_ort(m,l,j,k,i) = three_e_5_idx_direct_bi_ort(m,l,j,k,i) - tmp_mat(l,j,k,i)
            three_e_5_idx_exch12_bi_ort(m,l,j,k,i) = three_e_5_idx_exch12_bi_ort(m,l,j,k,i) - tmp_mat(l,i,k,j)
        enddo
      enddo
    enddo
  enddo
  !$OMP END PARALLEL DO
  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
      lm_grad_ik, 3*n_points_final_grid,  &
      rk_grad_im, 3*n_points_final_grid,  0.d0, &
      tmp_mat, mo_num*mo_num)
  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
  do i = 1, mo_num
    do k = 1, mo_num
      do j = 1, mo_num
        do l = 1, mo_num
            three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = - tmp_mat(l,i,j,k)
            three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = - tmp_mat(k,j,i,l)
            three_e_5_idx_exch23_bi_ort (m,l,j,k,i) = - tmp_mat(k,i,j,l)
            three_e_5_idx_exch13_bi_ort (m,l,j,k,i) = - tmp_mat(l,j,i,k)
        enddo
      enddo
    enddo
  enddo
  !$OMP END PARALLEL DO
  deallocate(lm_grad_ik)
  allocate(lk_grad_mi(n_points_final_grid,3,mo_num,mo_num))
 !$OMP PARALLEL                     &
 !$OMP DEFAULT (NONE)               &
 !$OMP PRIVATE (i,l,ipoint) &
 !$OMP SHARED (m,mo_num,n_points_final_grid, &
 !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
 !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
 !$OMP         lk_grad_mi)
 !$OMP DO COLLAPSE(2)
  do i=1,mo_num
    do l=1,mo_num
       do ipoint=1, n_points_final_grid
         lk_grad_mi(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,m,i) * final_weight_at_r_vector(ipoint)
         lk_grad_mi(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,m,i) * final_weight_at_r_vector(ipoint)
         lk_grad_mi(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,m,i) * final_weight_at_r_vector(ipoint)
       enddo
    enddo
  enddo
  !$OMP END DO
  !$OMP END PARALLEL
-  call wall_time(wall1)
+  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
-  print *, ' wall time for three_e_5_idx_direct_bi_ort', wall1 - wall0
+      lk_grad_mi, 3*n_points_final_grid,  &
-  call print_memory_usage()
+      rm_grad_ik, 3*n_points_final_grid,  0.d0, &
      tmp_mat, mo_num*mo_num)
-END_PROVIDER 
+  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
 ! ---
 BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_1_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
  BEGIN_DOC
  !
  ! matrix element of the -L  three-body operator FOR THE FIRST CYCLIC PERMUTATION TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
  !
  ! three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = <mlk|-L|jim> ::: notice that i is the RIGHT MO and k is the LEFT MO
  !
  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
  !
  END_DOC
  implicit none
  integer          :: i, j, k, m, l
  double precision :: integral, wall1, wall0
  three_e_5_idx_cycle_1_bi_ort = 0.d0
  print *, ' Providing the three_e_5_idx_cycle_1_bi_ort ...'
  call wall_time(wall0)
  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
 !$OMP PARALLEL                     &
 !$OMP DEFAULT (NONE)               &
 !$OMP PRIVATE (i,j,k,m,l,integral) & 
 !$OMP SHARED (mo_num,three_e_5_idx_cycle_1_bi_ort)
 !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
  do i = 1, mo_num
    do k = 1, mo_num
      do j = 1, mo_num
        do l = 1, mo_num
-          do m = 1, mo_num
+            three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) - tmp_mat(k,j,l,i)
-            call give_integrals_3_body_bi_ort(m, l, k, j, i, m, integral)
+            three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) - tmp_mat(l,i,k,j)
-            three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = -1.d0 * integral 
+            three_e_5_idx_exch23_bi_ort (m,l,j,k,i) = three_e_5_idx_exch23_bi_ort (m,l,j,k,i) - tmp_mat(l,j,k,i)
            three_e_5_idx_exch13_bi_ort (m,l,j,k,i) = three_e_5_idx_exch13_bi_ort (m,l,j,k,i) - tmp_mat(k,i,l,j)
        enddo
      enddo
    enddo
  enddo
-  enddo
+  !$OMP END PARALLEL DO
 !$OMP END DO
 !$OMP END PARALLEL
  call wall_time(wall1)
  print *, ' wall time for three_e_5_idx_cycle_1_bi_ort', wall1 - wall0
  call print_memory_usage()
-END_PROVIDER 
+  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
      lk_grad_mi, 3*n_points_final_grid,  &
      rk_grad_im, 3*n_points_final_grid,  0.d0, &
      tmp_mat, mo_num*mo_num)
-! ---
+  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
 BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_2_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
  BEGIN_DOC
  !
  ! matrix element of the -L  three-body operator FOR THE SECOND CYCLIC PERMUTATION TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
  !
  ! three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = <mlk|-L|imj> ::: notice that i is the RIGHT MO and k is the LEFT MO
  !
  ! notice the -1 sign: in this way three_e_5_idx_cycle_2_bi_ort can be directly used to compute Slater rules with a + sign
  !
  ! Each element is obtained from one call to give_integrals_3_body_bi_ort(m, l, k, i, m, j, integral).
  !
  END_DOC
  implicit none
  integer          :: i, j, k, m, l
  double precision :: integral, wall1, wall0
  three_e_5_idx_cycle_2_bi_ort = 0.d0
  print *, ' Providing the three_e_5_idx_cycle_2_bi_ort ...'
  call wall_time(wall0)
  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
  ! give_integrals_3_body_bi_ort contains a PROVIDE of int2_grad1_u12_bimo_t:
  ! build it here, outside of the OpenMP region, so that the provider is never
  ! entered for the first time by several threads at once.
  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
 !$OMP PARALLEL                     &
 !$OMP DEFAULT (NONE)               &
 !$OMP PRIVATE (i,j,k,m,l,integral) & 
 !$OMP SHARED (mo_num,three_e_5_idx_cycle_2_bi_ort)
 !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
  do i = 1, mo_num
    do k = 1, mo_num
      do j = 1, mo_num
        ! m is the first (fastest-varying) index of the array: iterate it
        ! innermost, as the sibling providers do. Iterations are independent,
        ! so the stored values do not depend on the loop order.
        do l = 1, mo_num
          do m = 1, mo_num
            call give_integrals_3_body_bi_ort(m, l, k, i, m, j, integral)
            three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = -1.d0 * integral 
          enddo
        enddo
      enddo
    enddo
  enddo
 !$OMP END DO
 !$OMP END PARALLEL
  call wall_time(wall1)
  print *, ' wall time for three_e_5_idx_cycle_2_bi_ort', wall1 - wall0
  call print_memory_usage()
 END_PROVIDER 
 ! ---
 BEGIN_PROVIDER [ double precision, three_e_5_idx_exch23_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
  BEGIN_DOC
  !
  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
  !
  ! three_e_5_idx_exch23_bi_ort(m,l,j,k,i) = <mlk|-L|jmi> ::: notice that i is the RIGHT MO and k is the LEFT MO
  !
  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
  !
  END_DOC
  implicit none
  integer          :: i, j, k, m, l
  double precision :: integral, wall1, wall0
  three_e_5_idx_exch23_bi_ort = 0.d0
  print *, ' Providing the three_e_5_idx_exch23_bi_ort ...'
  call wall_time(wall0)
  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
 !$OMP PARALLEL                     &
 !$OMP DEFAULT (NONE)               &
 !$OMP PRIVATE (i,j,k,m,l,integral) & 
 !$OMP SHARED (mo_num,three_e_5_idx_exch23_bi_ort)
 !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
  do i = 1, mo_num
    do k = 1, mo_num
      do j = 1, mo_num
        do l = 1, mo_num
-          do m = 1, mo_num
+            three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) - tmp_mat(l,j,i,k)
-            call give_integrals_3_body_bi_ort(m, l, k, j, m, i, integral)
+            three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) - tmp_mat(k,i,j,l)
-            three_e_5_idx_exch23_bi_ort(m,l,j,k,i) = -1.d0 * integral 
+            three_e_5_idx_exch23_bi_ort (m,l,j,k,i) = three_e_5_idx_exch23_bi_ort (m,l,j,k,i) - tmp_mat(k,j,i,l)
            three_e_5_idx_exch13_bi_ort (m,l,j,k,i) = three_e_5_idx_exch13_bi_ort (m,l,j,k,i) - tmp_mat(l,i,j,k)
        enddo
      enddo
    enddo
  enddo
  !$OMP END PARALLEL DO
  deallocate(lk_grad_mi)
  deallocate(rm_grad_ik)
  deallocate(rk_grad_im)
  enddo
- !$OMP END DO
+
 !$OMP END PARALLEL
  call wall_time(wall1)
-  print *, ' wall time for three_e_5_idx_exch23_bi_ort', wall1 - wall0
+  print *, ' wall time for three_e_5_idx_bi_ort', wall1 - wall0
  call print_memory_usage()
 END_PROVIDER
 ! ---
 BEGIN_PROVIDER [ double precision, three_e_5_idx_exch13_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
  BEGIN_DOC
  !
  ! matrix element of the -L  three-body operator FOR THE EXCHANGE-TYPE (exch13) TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
  !
  ! three_e_5_idx_exch13_bi_ort(m,l,j,k,i) = <mlk|-L|ijm> ::: notice that i is the RIGHT MO and k is the LEFT MO
  !
  ! notice the -1 sign: in this way three_e_5_idx_exch13_bi_ort can be directly used to compute Slater rules with a + sign
  !
  ! Each element is obtained from one call to give_integrals_3_body_bi_ort(m, l, k, i, j, m, integral).
  !
  END_DOC
  implicit none
  integer          :: i, j, k, m, l
  double precision :: integral, wall1, wall0
  three_e_5_idx_exch13_bi_ort = 0.d0
  print *, ' Providing the three_e_5_idx_exch13_bi_ort ...'
  call wall_time(wall0)
  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
  ! give_integrals_3_body_bi_ort contains a PROVIDE of int2_grad1_u12_bimo_t:
  ! build it here, outside of the OpenMP region, so that the provider is never
  ! entered for the first time by several threads at once.
  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
 !$OMP PARALLEL                     &
 !$OMP DEFAULT (NONE)               &
 !$OMP PRIVATE (i,j,k,m,l,integral) & 
 !$OMP SHARED (mo_num,three_e_5_idx_exch13_bi_ort)
 !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
  do i = 1, mo_num
    do k = 1, mo_num
      do j = 1, mo_num
        do l = 1, mo_num
          ! m is the first array index, hence the innermost loop
          do m = 1, mo_num
            call give_integrals_3_body_bi_ort(m, l, k, i, j, m, integral)
            three_e_5_idx_exch13_bi_ort(m,l,j,k,i) = -1.d0 * integral 
          enddo
        enddo
      enddo
    enddo
  enddo
 !$OMP END DO
 !$OMP END PARALLEL
  call wall_time(wall1)
  print *, ' wall time for three_e_5_idx_exch13_bi_ort', wall1 - wall0
  call print_memory_usage()
 END_PROVIDER 
 ! ---
 BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
  BEGIN_DOC
  !
  ! matrix element of the -L  three-body operator FOR THE EXCHANGE-TYPE (exch12) TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
  !
  ! three_e_5_idx_exch12_bi_ort(m,l,j,k,i) = <mlk|-L|mij> ::: notice that i is the RIGHT MO and k is the LEFT MO
  !
  ! notice the -1 sign: in this way three_e_5_idx_exch12_bi_ort can be directly used to compute Slater rules with a + sign
  !
  ! Each element is obtained from one call to give_integrals_3_body_bi_ort(m, l, k, m, i, j, integral).
  !
  END_DOC
  implicit none
  integer          :: i, j, k, m, l
  double precision :: integral, wall1, wall0
  three_e_5_idx_exch12_bi_ort = 0.d0
  print *, ' Providing the three_e_5_idx_exch12_bi_ort ...'
  call wall_time(wall0)
  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
  ! give_integrals_3_body_bi_ort contains a PROVIDE of int2_grad1_u12_bimo_t:
  ! build it here, outside of the OpenMP region, so that the provider is never
  ! entered for the first time by several threads at once.
  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
 !$OMP PARALLEL                     &
 !$OMP DEFAULT (NONE)               &
 !$OMP PRIVATE (i,j,k,m,l,integral) & 
 !$OMP SHARED (mo_num,three_e_5_idx_exch12_bi_ort)
 !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
  do i = 1, mo_num
    do k = 1, mo_num
      do j = 1, mo_num
        do l = 1, mo_num
          ! m is the first array index, hence the innermost loop
          do m = 1, mo_num
            call give_integrals_3_body_bi_ort(m, l, k, m, i, j, integral)
            three_e_5_idx_exch12_bi_ort(m,l,j,k,i) = -1.d0 * integral 
          enddo
        enddo
      enddo
    enddo
  enddo
 !$OMP END DO
 !$OMP END PARALLEL
  call wall_time(wall1)
  print *, ' wall time for three_e_5_idx_exch12_bi_ort', wall1 - wall0
  call print_memory_usage()
 END_PROVIDER 
 ! ---
--- a/src/bi_ort_ints/three_body_ijmkl_old.irp.f
+++ b/src/bi_ort_ints/three_body_ijmkl_old.irp.f
@ -0,0 +1,295 @@
 ! ---
 BEGIN_PROVIDER [ double precision, three_e_5_idx_direct_bi_ort_old, (mo_num, mo_num, mo_num, mo_num, mo_num)]
  BEGIN_DOC
  !
  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
  !
  ! three_e_5_idx_direct_bi_ort_old(m,l,j,k,i) = <mlk|-L|mji> ::: notice that i is the RIGHT MO and k is the LEFT MO
  !
  ! notice the -1 sign: in this way three_e_5_idx_direct_bi_ort_old can be directly used to compute Slater rules with a + sign
  !
  ! Integral-by-integral implementation: each element is obtained from one call to
  ! give_integrals_3_body_bi_ort(m, l, k, m, j, i, integral).
  END_DOC
  implicit none
  integer          :: i, j, k, m, l
  double precision :: integral, wall1, wall0
  three_e_5_idx_direct_bi_ort_old = 0.d0
  print *, ' Providing the three_e_5_idx_direct_bi_ort_old ...'
  call wall_time(wall0)
  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
  ! give_integrals_3_body_bi_ort contains a PROVIDE of int2_grad1_u12_bimo_t:
  ! build it here, outside of the OpenMP region (as three_e_5_idx_exch12_bi_ort_old
  ! does), so that the provider is never entered for the first time by several
  ! threads at once.
  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
 !$OMP PARALLEL                     &
 !$OMP DEFAULT (NONE)               &
 !$OMP PRIVATE (i,j,k,m,l,integral) &
 !$OMP SHARED (mo_num,three_e_5_idx_direct_bi_ort_old)
 !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
  do i = 1, mo_num
    do k = 1, mo_num
      do j = 1, mo_num
        do l = 1, mo_num
          ! m is the first array index, hence the innermost loop
          do m = 1, mo_num
            call give_integrals_3_body_bi_ort(m, l, k, m, j, i, integral)
            three_e_5_idx_direct_bi_ort_old(m,l,j,k,i) = -1.d0 * integral
          enddo
        enddo
      enddo
    enddo
  enddo
 !$OMP END DO
 !$OMP END PARALLEL
  call wall_time(wall1)
  print *, ' wall time for three_e_5_idx_direct_bi_ort_old', wall1 - wall0
 END_PROVIDER
 ! ---
 BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_1_bi_ort_old, (mo_num, mo_num, mo_num, mo_num, mo_num)]
  BEGIN_DOC
  !
  ! matrix element of the -L  three-body operator FOR THE FIRST CYCLIC PERMUTATION TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
  !
  ! three_e_5_idx_cycle_1_bi_ort_old(m,l,j,k,i) = <mlk|-L|jim> ::: notice that i is the RIGHT MO and k is the LEFT MO
  !
  ! notice the -1 sign: in this way three_e_5_idx_cycle_1_bi_ort_old can be directly used to compute Slater rules with a + sign
  !
  ! Integral-by-integral implementation: each element is obtained from one call to
  ! give_integrals_3_body_bi_ort(m, l, k, j, i, m, integral).
  !
  END_DOC
  implicit none
  integer          :: i, j, k, m, l
  double precision :: integral, wall1, wall0
  three_e_5_idx_cycle_1_bi_ort_old = 0.d0
  print *, ' Providing the three_e_5_idx_cycle_1_bi_ort_old ...'
  call wall_time(wall0)
  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
  ! give_integrals_3_body_bi_ort contains a PROVIDE of int2_grad1_u12_bimo_t:
  ! build it here, outside of the OpenMP region (as three_e_5_idx_exch12_bi_ort_old
  ! does), so that the provider is never entered for the first time by several
  ! threads at once.
  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
 !$OMP PARALLEL                     &
 !$OMP DEFAULT (NONE)               &
 !$OMP PRIVATE (i,j,k,m,l,integral) &
 !$OMP SHARED (mo_num,three_e_5_idx_cycle_1_bi_ort_old)
 !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
  do i = 1, mo_num
    do k = 1, mo_num
      do j = 1, mo_num
        do l = 1, mo_num
          ! m is the first array index, hence the innermost loop
          do m = 1, mo_num
            call give_integrals_3_body_bi_ort(m, l, k, j, i, m, integral)
            three_e_5_idx_cycle_1_bi_ort_old(m,l,j,k,i) = -1.d0 * integral
          enddo
        enddo
      enddo
    enddo
  enddo
 !$OMP END DO
 !$OMP END PARALLEL
  call wall_time(wall1)
  print *, ' wall time for three_e_5_idx_cycle_1_bi_ort_old', wall1 - wall0
 END_PROVIDER
 ! ---
 BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_2_bi_ort_old, (mo_num, mo_num, mo_num, mo_num, mo_num)]
  BEGIN_DOC
  !
  ! matrix element of the -L  three-body operator FOR THE SECOND CYCLIC PERMUTATION TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
  !
  ! three_e_5_idx_cycle_2_bi_ort_old(m,l,j,k,i) = <mlk|-L|imj> ::: notice that i is the RIGHT MO and k is the LEFT MO
  !
  ! notice the -1 sign: in this way three_e_5_idx_cycle_2_bi_ort_old can be directly used to compute Slater rules with a + sign
  !
  ! Integral-by-integral implementation: each element is obtained from one call to
  ! give_integrals_3_body_bi_ort(m, l, k, i, m, j, integral).
  !
  END_DOC
  implicit none
  integer          :: i, j, k, m, l
  double precision :: integral, wall1, wall0
  three_e_5_idx_cycle_2_bi_ort_old = 0.d0
  print *, ' Providing the three_e_5_idx_cycle_2_bi_ort_old ...'
  call wall_time(wall0)
  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
  ! give_integrals_3_body_bi_ort contains a PROVIDE of int2_grad1_u12_bimo_t:
  ! build it here, outside of the OpenMP region (as three_e_5_idx_exch12_bi_ort_old
  ! does), so that the provider is never entered for the first time by several
  ! threads at once.
  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
 !$OMP PARALLEL                     &
 !$OMP DEFAULT (NONE)               &
 !$OMP PRIVATE (i,j,k,m,l,integral) &
 !$OMP SHARED (mo_num,three_e_5_idx_cycle_2_bi_ort_old)
 !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
  do i = 1, mo_num
    do k = 1, mo_num
      do j = 1, mo_num
        ! m is the first (fastest-varying) index of the array: iterate it
        ! innermost, as the sibling *_old providers do. Iterations are
        ! independent, so the stored values do not depend on the loop order.
        do l = 1, mo_num
          do m = 1, mo_num
            call give_integrals_3_body_bi_ort(m, l, k, i, m, j, integral)
            three_e_5_idx_cycle_2_bi_ort_old(m,l,j,k,i) = -1.d0 * integral
          enddo
        enddo
      enddo
    enddo
  enddo
 !$OMP END DO
 !$OMP END PARALLEL
  call wall_time(wall1)
  print *, ' wall time for three_e_5_idx_cycle_2_bi_ort_old', wall1 - wall0
 END_PROVIDER
 ! ---
 BEGIN_PROVIDER [ double precision, three_e_5_idx_exch23_bi_ort_old, (mo_num, mo_num, mo_num, mo_num, mo_num)]
  BEGIN_DOC
  !
  ! matrix element of the -L  three-body operator FOR THE EXCHANGE-TYPE (exch23) TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
  !
  ! three_e_5_idx_exch23_bi_ort_old(m,l,j,k,i) = <mlk|-L|jmi> ::: notice that i is the RIGHT MO and k is the LEFT MO
  !
  ! notice the -1 sign: in this way three_e_5_idx_exch23_bi_ort_old can be directly used to compute Slater rules with a + sign
  !
  ! Integral-by-integral implementation: each element is obtained from one call to
  ! give_integrals_3_body_bi_ort(m, l, k, j, m, i, integral).
  !
  END_DOC
  implicit none
  integer          :: i, j, k, m, l
  double precision :: integral, wall1, wall0
  three_e_5_idx_exch23_bi_ort_old = 0.d0
  print *, ' Providing the three_e_5_idx_exch23_bi_ort_old ...'
  call wall_time(wall0)
  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
  ! give_integrals_3_body_bi_ort contains a PROVIDE of int2_grad1_u12_bimo_t:
  ! build it here, outside of the OpenMP region (as three_e_5_idx_exch12_bi_ort_old
  ! does), so that the provider is never entered for the first time by several
  ! threads at once.
  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
 !$OMP PARALLEL                     &
 !$OMP DEFAULT (NONE)               &
 !$OMP PRIVATE (i,j,k,m,l,integral) &
 !$OMP SHARED (mo_num,three_e_5_idx_exch23_bi_ort_old)
 !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
  do i = 1, mo_num
    do k = 1, mo_num
      do j = 1, mo_num
        do l = 1, mo_num
          ! m is the first array index, hence the innermost loop
          do m = 1, mo_num
            call give_integrals_3_body_bi_ort(m, l, k, j, m, i, integral)
            three_e_5_idx_exch23_bi_ort_old(m,l,j,k,i) = -1.d0 * integral
          enddo
        enddo
      enddo
    enddo
  enddo
 !$OMP END DO
 !$OMP END PARALLEL
  call wall_time(wall1)
  print *, ' wall time for three_e_5_idx_exch23_bi_ort_old', wall1 - wall0
 END_PROVIDER
 ! ---
 BEGIN_PROVIDER [ double precision, three_e_5_idx_exch13_bi_ort_old, (mo_num, mo_num, mo_num, mo_num, mo_num)]
  BEGIN_DOC
  !
  ! matrix element of the -L  three-body operator FOR THE EXCHANGE-TYPE (exch13) TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
  !
  ! three_e_5_idx_exch13_bi_ort_old(m,l,j,k,i) = <mlk|-L|ijm> ::: notice that i is the RIGHT MO and k is the LEFT MO
  !
  ! notice the -1 sign: in this way three_e_5_idx_exch13_bi_ort_old can be directly used to compute Slater rules with a + sign
  !
  ! Integral-by-integral implementation: each element is obtained from one call to
  ! give_integrals_3_body_bi_ort(m, l, k, i, j, m, integral).
  !
  END_DOC
  implicit none
  integer          :: i, j, k, m, l
  double precision :: integral, wall1, wall0
  three_e_5_idx_exch13_bi_ort_old = 0.d0
  print *, ' Providing the three_e_5_idx_exch13_bi_ort_old ...'
  call wall_time(wall0)
  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
  ! give_integrals_3_body_bi_ort contains a PROVIDE of int2_grad1_u12_bimo_t:
  ! build it here, outside of the OpenMP region (as three_e_5_idx_exch12_bi_ort_old
  ! does), so that the provider is never entered for the first time by several
  ! threads at once.
  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
 !$OMP PARALLEL                     &
 !$OMP DEFAULT (NONE)               &
 !$OMP PRIVATE (i,j,k,m,l,integral) &
 !$OMP SHARED (mo_num,three_e_5_idx_exch13_bi_ort_old)
 !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
  do i = 1, mo_num
    do k = 1, mo_num
      do j = 1, mo_num
        do l = 1, mo_num
          ! m is the first array index, hence the innermost loop
          do m = 1, mo_num
            call give_integrals_3_body_bi_ort(m, l, k, i, j, m, integral)
            three_e_5_idx_exch13_bi_ort_old(m,l,j,k,i) = -1.d0 * integral
          enddo
        enddo
      enddo
    enddo
  enddo
 !$OMP END DO
 !$OMP END PARALLEL
  call wall_time(wall1)
  print *, ' wall time for three_e_5_idx_exch13_bi_ort_old', wall1 - wall0
 END_PROVIDER
 ! ---
 BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort_old, (mo_num, mo_num, mo_num, mo_num, mo_num)]
  BEGIN_DOC
  !
  ! Five-index table of -L three-body matrix elements over bi-orthogonal MOs,
  ! filled one element at a time through give_integrals_3_body_bi_ort.
  !
  ! three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i) = <mlk|-L|mij> ::: i is the RIGHT MO and k is the LEFT MO
  !
  ! The stored value is minus the integral returned by give_integrals_3_body_bi_ort(m, l, k, m, i, j, ...).
  !
  END_DOC
  implicit none
  integer          :: i, j, k, l, m
  double precision :: val
  double precision :: t_start, t_end
  ! Everything needed by the integral routine is built before the parallel region
  PROVIDE mos_r_in_r_array_transp mos_l_in_r_array_transp
  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
  three_e_5_idx_exch12_bi_ort_old = 0.d0
  print *, ' Providing the three_e_5_idx_exch12_bi_ort_old ...'
  call wall_time(t_start)
 !$OMP PARALLEL                 &
 !$OMP DEFAULT (NONE)           &
 !$OMP PRIVATE (i,j,k,l,m,val)  &
 !$OMP SHARED (mo_num,three_e_5_idx_exch12_bi_ort_old)
 !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
  do i = 1, mo_num
    do k = 1, mo_num
      do j = 1, mo_num
        do l = 1, mo_num
          ! m runs fastest: it is the first index of the array
          do m = 1, mo_num
            call give_integrals_3_body_bi_ort(m, l, k, m, i, j, val)
            three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i) = -val
          enddo
        enddo
      enddo
    enddo
  enddo
 !$OMP END DO
 !$OMP END PARALLEL
  call wall_time(t_end)
  print *, ' wall time for three_e_5_idx_exch12_bi_ort_old', t_end - t_start
 END_PROVIDER
--- a/src/bi_ort_ints/three_body_ints_bi_ort.irp.f
+++ b/src/bi_ort_ints/three_body_ints_bi_ort.irp.f
@ -86,22 +86,25 @@ subroutine give_integrals_3_body_bi_ort(n, l, k, m, j, i, integral)
  PROVIDE int2_grad1_u12_bimo_t
  integral = 0.d0
  ! (n, l, k, m, j, i)
  do ipoint = 1, n_points_final_grid
    weight = final_weight_at_r_vector(ipoint)                                                                          
-    integral += weight * mos_l_in_r_array_transp(ipoint,k) * mos_r_in_r_array_transp(ipoint,i) & 
+    tmp =     mos_l_in_r_array_transp(ipoint,k) * mos_r_in_r_array_transp(ipoint,i) &
              * ( int2_grad1_u12_bimo_t(ipoint,1,n,m) * int2_grad1_u12_bimo_t(ipoint,1,l,j)    &
                + int2_grad1_u12_bimo_t(ipoint,2,n,m) * int2_grad1_u12_bimo_t(ipoint,2,l,j)    &
                + int2_grad1_u12_bimo_t(ipoint,3,n,m) * int2_grad1_u12_bimo_t(ipoint,3,l,j) )
-    integral += weight * mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,j) & 
+
    tmp = tmp + mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,j) &
              * ( int2_grad1_u12_bimo_t(ipoint,1,n,m) * int2_grad1_u12_bimo_t(ipoint,1,k,i)    &
                + int2_grad1_u12_bimo_t(ipoint,2,n,m) * int2_grad1_u12_bimo_t(ipoint,2,k,i)    &
                + int2_grad1_u12_bimo_t(ipoint,3,n,m) * int2_grad1_u12_bimo_t(ipoint,3,k,i) )
-    integral += weight * mos_l_in_r_array_transp(ipoint,n) * mos_r_in_r_array_transp(ipoint,m) &
+
    tmp = tmp + mos_l_in_r_array_transp(ipoint,n) * mos_r_in_r_array_transp(ipoint,m) &
              * ( int2_grad1_u12_bimo_t(ipoint,1,l,j) * int2_grad1_u12_bimo_t(ipoint,1,k,i)    &
                + int2_grad1_u12_bimo_t(ipoint,2,l,j) * int2_grad1_u12_bimo_t(ipoint,2,k,i)    &
                + int2_grad1_u12_bimo_t(ipoint,3,l,j) * int2_grad1_u12_bimo_t(ipoint,3,k,i) )
    integral = integral + tmp * final_weight_at_r_vector(ipoint)
  enddo
 end subroutine give_integrals_3_body_bi_ort
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@ -16,20 +16,16 @@ subroutine run_ccsd_space_orb
  double precision, allocatable :: all_err(:,:), all_t(:,:)
  integer, allocatable          :: list_occ(:), list_vir(:)
  integer(bit_kind)             :: det(N_int,2)
-  integer                       :: nO, nV, nOa, nOb, nVa, nVb, n_spin(4)
+  integer                       :: nO, nV, nOa, nVa
-  PROVIDE mo_two_e_integrals_in_map
+!  PROVIDE mo_two_e_integrals_in_map
  det = psi_det(:,:,cc_ref)
  print*,'Reference determinant:'
  call print_det(det,N_int)
-  ! Extract number of occ/vir alpha/beta spin orbitals
+  nOa = cc_nOa
-  !call extract_n_spin(det,n_spin)
+  nVa = cc_nVa
  nOa = cc_nOa !n_spin(1)
  nOb = cc_nOb !n_spin(2)
  nVa = cc_nVa !n_spin(3)
  nVb = cc_nVb !n_spin(4)
  ! Check that the reference is a closed shell determinant
  if (cc_ref_is_open_shell) then
@ -109,7 +105,7 @@ subroutine run_ccsd_space_orb
      call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1,t1)
      call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2,t2)
    else
-      print*,'Unkonw cc_method_method: '//cc_update_method
+      print*,'Unkown cc_method_method: '//cc_update_method
    endif
    call update_tau_space(nO,nV,t1,t2,tau)
@ -169,8 +165,13 @@ subroutine run_ccsd_space_orb
    ! New
    print*,'Computing (T) correction...'
    call wall_time(ta)
-    call ccsd_par_t_space_v2(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v &
+!    call ccsd_par_t_space_v3(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v &
 !         ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t)
    e_t = uncorr_energy + energy ! For print in next call
    call ccsd_par_t_space_stoch(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v &
         ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t)
    call wall_time(tb)
    print*,'Time: ',tb-ta, ' s'
@ -211,8 +212,8 @@ subroutine ccsd_energy_space(nO,nV,tau,t1,energy)
  !$omp default(none)
  e = 0d0
  !$omp do
  do i = 1, nO
  do a = 1, nV
    do i = 1, nO
      e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a)
    enddo
  enddo
@ -255,7 +256,7 @@ subroutine update_tau_space(nO,nV,t1,t2,tau)
  !$OMP SHARED(nO,nV,tau,t2,t1) &
  !$OMP PRIVATE(i,j,a,b) &
  !$OMP DEFAULT(NONE)
-  !$OMP DO collapse(3)
+  !$OMP DO
  do b = 1, nV
    do a = 1, nV
      do j = 1, nO
@ -373,7 +374,7 @@ subroutine compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
  !$omp shared(nO,nV,X_voov,t2,t1) &
  !$omp private(u,beta,i,a) &
  !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do
  do beta = 1, nV
    do u = 1, nO
      do i = 1, nO
@ -412,7 +413,7 @@ subroutine compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
  !$omp shared(nO,nV,cc_space_v_ovov,cc_space_v_voov,X_ovov) &
  !$omp private(u,beta,i,a) &
  !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do
  do beta = 1, nV
    do u = 1, nO
      do a = 1, nv
@ -452,7 +453,7 @@ subroutine compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
  !$omp shared(nO,nV,cc_space_v_vvov,W_vvov,T_vvoo,tau) &
  !$omp private(b,beta,i,a) &
  !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do
  do beta = 1, nV
    do i = 1, nO
      do b = 1, nV
@ -464,11 +465,11 @@ subroutine compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
  enddo
  !$omp end do nowait
-  !$omp do collapse(3)
+  !$omp do
  do u = 1, nO
    do i = 1, nO
      do b = 1, nV
        do a = 1, nV
        do u = 1, nO
          T_vvoo(a,b,i,u) = tau(i,u,a,b)
        enddo
      enddo
@ -504,8 +505,8 @@ subroutine compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
  !$omp shared(nO,nV,cc_space_v_vooo,W_oovo) &
  !$omp private(u,a,i,j) &
  !$omp default(none)
  !$omp do collapse(3)
  do u = 1, nO
    !$omp do
    do a = 1, nV
      do j = 1, nO
        do i = 1, nO
@ -513,8 +514,8 @@ subroutine compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
        enddo
      enddo
    enddo
    !$omp end do nowait
  enddo
  !$omp end do
  !$omp end parallel
  call dgemm('T','N', nO, nV, nO*nO*nV, &
@ -527,9 +528,7 @@ subroutine compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
  max_r1 = 0d0
  do a = 1, nV
    do i = 1, nO
-      if (dabs(r1(i,a)) > max_r1) then
+      max_r1 = max(dabs(r1(i,a)), max_r1)
        max_r1 = dabs(r1(i,a))
      endif
    enddo
  enddo
@ -657,7 +656,7 @@ subroutine compute_H_vv(nO,nV,t1,t2,tau,H_vv)
  ! H_vv(a,beta) = H_vv(a,beta) - cc_space_w_vvoo(a,b,i,j) * tau(i,j,beta,b)
  ! H_vv(a,beta) = H_vv(a,beta) - cc_space_w_vvoo(a,b,i,j) * tmp_tau(b,i,j,beta)
-  !$omp do collapse(3)
+  !$omp do
  do beta = 1, nV
    do j = 1, nO
      do i = 1, nO
@ -727,7 +726,7 @@ subroutine compute_H_vo(nO,nV,t1,t2,H_vo)
  ! H_vo(a,i) = H_vo(a,i) + cc_space_w_vvoo(a,b,i,j) * t1(j,b)
  ! H_vo(a,i) = H_vo(a,i) + w(a,i,j,b) * t1(j,b)
-  !$omp do collapse(3)
+  !$omp do
  do b = 1, nV
    do j = 1, nO
      do i = 1, nO
@ -765,7 +764,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
  ! internal
  double precision, allocatable :: g_occ(:,:), g_vir(:,:), J1(:,:,:,:), K1(:,:,:,:)
-  double precision, allocatable :: A1(:,:,:,:), B1(:,:,:,:)
+  double precision, allocatable :: A1(:,:,:,:), B1_gam(:,:,:)
  integer                       :: u,v,i,j,beta,gam,a,b
  allocate(g_occ(nO,nO), g_vir(nV,nV))
@ -787,7 +786,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
  !$omp shared(nO,nV,r2,cc_space_v_oovv) &
  !$omp private(u,v,gam,beta) &
  !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do
  do gam = 1, nV
    do beta = 1, nV
      do v = 1, nO
@ -835,13 +834,18 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
  !  enddo
  !enddo
-  allocate(B1(nV,nV,nV,nV))
+!  allocate(B1(nV,nV,nV,nV))
-  call compute_B1(nO,nV,t1,t2,B1)
+!  call compute_B1(nO,nV,t1,t2,B1)
-  call dgemm('N','N',nO*nO,nV*nV,nV*nV, &
+  allocate(B1_gam(nV,nV,nV))
  do gam=1,nV
    call compute_B1_gam(nO,nV,t1,t2,B1_gam,gam)
    call dgemm('N','N',nO*nO,nV,nV*nV, &
                1d0, tau, size(tau,1) * size(tau,2), &
-                  B1 , size(B1,1) * size(B1,2), &
+                     B1_gam        , size(B1_gam,1) * size(B1_gam,2), &
-             1d0, r2, size(r2,1) * size(r2,2))
+                1d0, r2(1,1,1,gam), size(r2,1) * size(r2,2))
-  deallocate(B1)
+  enddo
  deallocate(B1_gam)
  !do gam = 1, nV
  !  do beta = 1, nV
@ -863,7 +867,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
  !$omp shared(nO,nV,t2,X_oovv) &
  !$omp private(u,v,gam,a) &
  !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do
  do a = 1, nV
    do gam = 1, nV
      do v = 1, nO
@ -885,7 +889,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
  !$omp shared(nO,nV,r2,Y_oovv) &
  !$omp private(u,v,gam,beta) &
  !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do
  do gam = 1, nV
    do beta = 1, nV
      do v = 1, nO
@ -921,7 +925,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
  !$omp shared(nO,nV,r2,X_oovv) &
  !$omp private(u,v,gam,beta) &
  !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do
  do gam = 1, nV
    do beta = 1, nV
      do v = 1, nO
@ -957,7 +961,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
  !$omp shared(nO,nV,X_vovv,cc_space_v_ovvv) &
  !$omp private(u,a,gam,beta) &
  !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do
  do gam = 1, nV
    do beta = 1, nV
      do u = 1, nO
@ -979,7 +983,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
  !$omp shared(nO,nV,r2,Y_oovv) &
  !$omp private(u,v,gam,beta) &
  !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do
  do gam = 1, nV
    do beta = 1, nV
      do v = 1, nO
@ -1014,8 +1018,8 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
  !$omp shared(nO,nV,X_vovo,cc_space_v_ovov) &
  !$omp private(u,v,gam,i) &
  !$omp default(none)
  !$omp do collapse(3)
  do i = 1, nO
    !$omp do
    do gam = 1, nV
      do u = 1, nO
        do a = 1, nV
@ -1023,8 +1027,8 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
        enddo
      enddo
    enddo
    !$omp end do nowait
  enddo
  !$omp end do
  !$omp end parallel
  call dgemm('N','N',nV*nO*nV,nV,nO, &
@ -1041,7 +1045,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
  !$omp shared(nO,nV,r2,X_oovv) &
  !$omp private(u,v,gam,beta) &
  !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do
  do gam = 1, nV
    do beta = 1, nV
      do v = 1, nO
@ -1079,7 +1083,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
  !$omp shared(nO,nV,r2,X_oovv) &
  !$omp private(u,v,gam,beta) &
  !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do
  do gam = 1, nV
    do beta = 1, nV
      do v = 1, nO
@ -1116,8 +1120,8 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
  !$omp shared(nO,nV,X_vovo,cc_space_v_ovvo) &
  !$omp private(a,v,gam,i) &
  !$omp default(none)
  !$omp do collapse(3)
  do i = 1, nO
    !$omp do
    do gam = 1, nV
      do v = 1, nO
        do a = 1, nV
@ -1125,8 +1129,8 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
        enddo
      enddo
    enddo
    !$omp end do nowait
  enddo
  !$omp end do
  !$omp end parallel
  call dgemm('N','N',nO,nO*nV*nO,nV, &
@ -1143,7 +1147,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
  !$omp shared(nO,nV,r2,X_oovv) &
  !$omp private(u,v,gam,beta) &
  !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do
  do gam = 1, nV
    do beta = 1, nV
      do v = 1, nO
@ -1182,19 +1186,19 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
  !$omp shared(nO,nV,X_ovvo,Y_voov,K1,J1,t2) &
  !$omp private(u,v,gam,beta,i,a) &
  !$omp default(none)
  !$omp do collapse(3)
  do i = 1, nO
    !$omp do
    do a = 1, nV
      do beta = 1, nV
        do u = 1, nO
-          X_ovvo(u,beta,a,i) = 0.5d0 * (2d0 * J1(u,a,beta,i) - K1(u,a,i,beta))
+          X_ovvo(u,beta,a,i) = (J1(u,a,beta,i) - 0.5d0 * K1(u,a,i,beta))
        enddo
        enddo
      enddo
    enddo
    !$omp end do nowait
  enddo
-  !$omp do collapse(3)
+  !$omp do
  do gam = 1, nV
    do v = 1, nO
      do i = 1, nO
@ -1216,7 +1220,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
  !$omp shared(nO,nV,r2,Z_ovov) &
  !$omp private(u,v,gam,beta) &
  !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do
  do gam = 1, nV
    do beta = 1, nV
      do v = 1, nO
@ -1252,7 +1256,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
  !$omp shared(nO,nV,r2,K1,X_ovov,Y_ovov,t2) &
  !$omp private(u,a,i,beta,gam) &
  !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do
  do beta = 1, nV
    do u = 1, nO
      do a = 1, nV
@ -1264,7 +1268,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
  enddo
  !$omp end do nowait
-  !$omp do collapse(3)
+  !$omp do
  do gam = 1, nV
    do v = 1, nO
      do a = 1, nV
@ -1286,7 +1290,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
  !$omp shared(nO,nV,r2,Z_ovov) &
  !$omp private(u,v,gam,beta) &
  !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do
  do gam = 1, nV
    do beta = 1, nV
      do v = 1, nO
@ -1319,7 +1323,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
  !$omp shared(nO,nV,K1,X_ovov,Z_ovov,t2) &
  !$omp private(u,v,gam,beta,i,a) &
  !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do
  do a = 1, nV
    do i = 1, nO
      do gam = 1, nV
@ -1331,7 +1335,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
  enddo
  !$omp end do nowait
-  !$omp do collapse(3)
+  !$omp do
  do beta = 1, nV
    do v = 1, nO
      do a = 1, nV
@ -1353,7 +1357,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
  !$omp shared(nO,nV,r2,Z_ovov) &
  !$omp private(u,v,gam,beta) &
  !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do
  do gam = 1, nV
    do beta = 1, nV
      do v = 1, nO
@ -1373,7 +1377,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
  !$omp shared(nO,nV,r2) &
  !$omp private(i,j,a,b) &
  !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do
  do b = 1, nV
    do a = 1, nV
      do j = 1, nO
@ -1391,9 +1395,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
    do a = 1, nV
      do j = 1, nO
        do i = 1, nO
-          if (dabs(r2(i,j,a,b)) > max_r2) then
+          max_r2 = max(r2(i,j,a,b), max_r2)
            max_r2 = dabs(r2(i,j,a,b))
          endif
        enddo
      enddo
    enddo
@ -1448,7 +1450,7 @@ subroutine compute_A1(nO,nV,t1,t2,tau,A1)
  !$omp shared(nO,nV,A1,cc_space_v_oooo,cc_space_v_ovoo,X_vooo) &
  !$omp private(u,v,i,j) &
  !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do collapse(2)
  do j = 1, nO
    do i = 1, nO
      do v = 1, nO
@ -1462,7 +1464,7 @@ subroutine compute_A1(nO,nV,t1,t2,tau,A1)
  ! A1(u,v,i,j) += cc_space_v_ovoo(u,a,i,j) * t1(v,a) &
-  !$omp do collapse(3)
+  !$omp do collapse(2)
  do j = 1, nO
    do i = 1, nO
      do u = 1, nO
@ -1484,7 +1486,7 @@ subroutine compute_A1(nO,nV,t1,t2,tau,A1)
  !$omp shared(nO,nV,A1,Y_oooo) &
  !$omp private(u,v,i,j) &
  !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do collapse(2)
  do j = 1, nO
    do i = 1, nO
      do v = 1, nO
@ -1515,6 +1517,90 @@ end
 ! B1
 subroutine compute_B1_gam(nO,nV,t1,t2,B1,gam)
 ! Build the three-index slice B1(a,b,beta) for one fixed value of gam:
 !
 !   B1(a,b,beta) =  cc_space_v_vvvv(a,b,beta,gam)
 !                 - sum_i cc_space_v_vvvo(a,b,beta,i) * t1(i,gam)
 !                 - sum_i cc_space_v_vvov(a,b,i,gam)  * t1(i,beta)
 !
 ! Arguments:
 !   nO, nV : array extents used for t1, t2 and B1 (see declarations below)
 !   t1     : (nO,nV) input amplitudes
 !   t2     : (nO,nO,nV,nV) input; not referenced anywhere in this body
 !   B1     : (nV,nV,nV) output slice, fully overwritten
 !   gam    : fixed last index selecting the slice
 !
 ! cc_space_v_vvvv, cc_space_v_vvvo and cc_space_v_vvov are not declared in
 ! this routine; they come from outside (presumably IRPF90 providers -- TODO
 ! confirm).
  implicit none
  integer, intent(in)           :: nO,nV,gam
  double precision, intent(in)  :: t1(nO, nV)
  double precision, intent(in)  :: t2(nO, nO, nV, nV)
  double precision, intent(out) :: B1(nV, nV, nV)
 ! NOTE(review): only a, b, beta and i are used below; the other integers
 ! declared on the next line are never referenced in this routine.
  integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta
 ! Reference (non-BLAS) formulation of what the code below computes:
 !  do beta = 1, nV
 !    do b = 1, nV
 !      do a = 1, nV
 !        B1(a,b,beta) = cc_space_v_vvvv(a,b,beta,gam)
 !
 !        do i = 1, nO
 !          B1(a,b,beta) = B1(a,b,beta) &
 !          - cc_space_v_vvvo(a,b,beta,i) * t1(i,gam) &
 !          - cc_space_v_vvov(a,b,i,gam) * t1(i,beta)
 !        enddo
 !
 !      enddo
 !    enddo
 !  enddo
 ! Work arrays: X_vvvo receives the gam slice of cc_space_v_vvov,
 ! Y_vvvv receives the result of the second contraction.
  double precision, allocatable :: X_vvvo(:,:,:), Y_vvvv(:,:,:)
  allocate(X_vvvo(nV,nV,nO), Y_vvvv(nV,nV,nV))
 !  ! B1(a,b,beta,gam) = cc_space_v_vvvv(a,b,beta,gam)
 ! Step 1: initialise B1 from cc_space_v_vvvv and copy the gam slice of
 ! cc_space_v_vvov into X_vvvo. The two loop nests write different arrays,
 ! hence the nowait between them.
  !$omp parallel &
  !$omp shared(nO,nV,B1,cc_space_v_vvvv,cc_space_v_vvov,X_vvvo,gam) &
  !$omp private(a,b,beta) &
  !$omp default(none)
  !$omp do
    do beta = 1, nV
      do b = 1, nV
        do a = 1, nV
          B1(a,b,beta) = cc_space_v_vvvv(a,b,beta,gam)
        enddo
      enddo
    enddo
  !$omp end do nowait
 ! The i loop is outside any worksharing construct, so every thread walks
 ! all values of i; only the b loop inside is distributed by "omp do".
  do i = 1, nO
    !$omp do
      do b = 1, nV
        do a = 1, nV
          X_vvvo(a,b,i) = cc_space_v_vvov(a,b,i,gam)
        enddo
      enddo
    !$omp end do nowait
  enddo
  !$omp end parallel
 !  ! B1(a,b,beta) -= cc_space_v_vvvo(a,b,beta,i) * t1(i,gam) &
 ! Step 2: cc_space_v_vvvo is passed as a matrix whose leading dimension is
 ! the product of its first three extents, multiplied by column gam of t1
 ! (a k=nO by 1 operand), and accumulated into B1 (beta = 1d0).
  call dgemm('N','N', nV*nV*nV, 1, nO, &
             -1d0, cc_space_v_vvvo, size(cc_space_v_vvvo,1) * size(cc_space_v_vvvo,2) * size(cc_space_v_vvvo,3), &
                   t1(1,gam), size(t1,1), &
              1d0, B1      , size(B1,1) * size(B1,2) * size(B1,3))
  ! B1(a,b,beta,gam) -= cc_space_v_vvov(a,b,i,gam) * t1(i,beta)
 ! Step 3: Y_vvvv(ab,beta) = - sum_i X_vvvo(ab,i) * t1(i,beta)
 ! (beta = 0d0 in dgemm, so Y_vvvv is overwritten, not accumulated).
  call dgemm('N','N', nV*nV, nV, nO, &
             -1d0, X_vvvo, size(X_vvvo,1) * size(X_vvvo,2), &
                   t1    , size(t1,1), &
              0d0, Y_vvvv, size(Y_vvvv,1) * size(Y_vvvv,2))
 ! Step 4: add the second contraction to B1 element by element.
 ! gam appears in the shared list but is not used inside this region.
  !$omp parallel &
  !$omp shared(nV,B1,Y_vvvv,gam) &
  !$omp private(a,b,beta) &
  !$omp default(none)
  !$omp do
  do beta = 1, nV
    do b = 1, nV
      do a = 1, nV
        B1(a,b,beta) = B1(a,b,beta) + Y_vvvv(a,b,beta)
      enddo
    enddo
  enddo
  !$omp end do
  !$omp end parallel
  deallocate(X_vvvo,Y_vvvv)
 end
 subroutine compute_B1(nO,nV,t1,t2,B1)
  implicit none
@ -1553,7 +1639,7 @@ subroutine compute_B1(nO,nV,t1,t2,B1)
  !$omp shared(nO,nV,B1,cc_space_v_vvvv,cc_space_v_vvov,X_vvvo) &
  !$omp private(a,b,beta,gam) &
  !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do
  do gam = 1, nV
    do beta = 1, nV
      do b = 1, nV
@ -1564,8 +1650,8 @@ subroutine compute_B1(nO,nV,t1,t2,B1)
    enddo
  enddo
  !$omp end do nowait
  !$omp do collapse(3)
  do i = 1, nO
    !$omp do
    do gam = 1, nV
      do b = 1, nV
        do a = 1, nV
@ -1573,8 +1659,8 @@ subroutine compute_B1(nO,nV,t1,t2,B1)
        enddo
      enddo
    enddo
    !$omp end do nowait
  enddo
  !$omp end do
  !$omp end parallel
  ! B1(a,b,beta,gam) -= cc_space_v_vvvo(a,b,beta,i) * t1(i,gam) &
@ -1594,7 +1680,7 @@ subroutine compute_B1(nO,nV,t1,t2,B1)
  !$omp shared(nV,B1,Y_vvvv) &
  !$omp private(a,b,beta,gam) &
  !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do
  do gam = 1, nV
    do beta = 1, nV
      do b = 1, nV
@ -1658,7 +1744,7 @@ subroutine compute_g_occ(nO,nV,t1,t2,H_oo,g_occ)
  enddo
  !$omp end do
-  !$omp do collapse(1)
+  !$omp do
  do i = 1, nO
    do j = 1, nO
      do a = 1, nV
@ -1720,7 +1806,7 @@ subroutine compute_g_vir(nO,nV,t1,t2,H_vv,g_vir)
  enddo
  !$omp end do
-  !$omp do collapse(1)
+  !$omp do
  do beta = 1, nV
    do i = 1, nO
      do b = 1, nV
@ -1788,8 +1874,8 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
  !$omp shared(nO,nV,J1,v_ovvo,v_ovoo,X_ovoo) &
  !$omp private(i,j,a,u,beta) &
  !$omp default(none)
  !$omp do collapse(3)
  do i = 1, nO
    !$omp do
    do beta = 1, nV
      do a = 1, nV
        do u = 1, nO
@ -1797,10 +1883,10 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
        enddo
      enddo
    enddo
  enddo
    !$omp end do nowait
  enddo
-  !$omp do collapse(3)
+  !$omp do collapse(2)
  do j = 1, nO
    do i = 1, nO
      do a = 1, nV
@ -1822,8 +1908,8 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
  !$omp shared(nO,nV,J1,Y_ovov) &
  !$omp private(i,beta,a,u) &
  !$omp default(none)
  !$omp do collapse(3)
  do i = 1, nO
    !$omp do
    do beta = 1, nV
      do a = 1, nV
        do u = 1, nO
@ -1831,8 +1917,8 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
        enddo
      enddo
    enddo
    !$omp end do nowait
  enddo
  !$omp end do
  !$omp end parallel
  deallocate(X_ovoo)
@ -1849,7 +1935,7 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
  !$omp shared(nO,nV,t2,t1,Y_ovov,X_voov,v_vvoo) &
  !$omp private(i,beta,a,u,b,j) &
  !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do
  do b = 1, nV
    do j = 1, nO
      do beta = 1, nV
@ -1861,7 +1947,7 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
  enddo
  !$omp end do nowait
-  !$omp do collapse(3)
+  !$omp do
  do b = 1, nV
    do j = 1, nO
      do i = 1, nO
@ -1886,8 +1972,8 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
  !$omp shared(nO,nV,J1,Z_ovvo,t2,Y_vovo,v_vvoo,X_ovvo) &
  !$omp private(i,beta,a,u,j,b) &
  !$omp default(none)
  !$omp do collapse(3)
  do i = 1, nO
    !$omp do
    do beta = 1, nV
      do a = 1, nV
        do u = 1, nO
@ -1895,12 +1981,12 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
        enddo
      enddo
    enddo
  enddo
    !$omp end do nowait
  enddo
  !+ 0.5d0 * (2d0 * cc_space_v_vvoo(a,b,i,j) - cc_space_v_vvoo(b,a,i,j)) * t2(u,j,beta,b)
  !$omp do collapse(3)
  do j = 1, nO
    !$omp do
    do b = 1, nV
      do i = 1, nO
        do a = 1, nV
@ -1908,11 +1994,11 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
        enddo
      enddo
    enddo
  enddo
    !$omp end do nowait
  enddo
  !$omp do collapse(3)
  do j = 1, nO
    !$omp do
    do b = 1, nV
      do beta = 1, nV
        do u = 1, nO
@ -1920,8 +2006,8 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
        enddo
      enddo
    enddo
    !$omp end do nowait
  enddo
  !$omp end do
  !$omp end parallel
  call dgemm('N','T',nO*nV,nV*nO,nV*nO, &
@ -1933,8 +2019,8 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
  !$omp shared(nO,nV,J1,Z_ovvo) &
  !$omp private(i,beta,a,u) &
  !$omp default(none)
  !$omp do collapse(3)
  do i = 1, nO
    !$omp do
    do beta = 1, nV
      do a = 1, nV
        do u = 1, nO
@ -1942,8 +2028,8 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
        enddo
      enddo
    enddo
    !$omp end do nowait
  enddo
  !$omp end do
  !$omp end parallel
  deallocate(X_ovvo,Z_ovvo,Y_ovov)
@ -2003,7 +2089,7 @@ subroutine compute_K1(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,v_vvov,K1)
  !$omp shared(nO,nV,K1,X,Y,v_vvoo,v_ovov,t1,t2) &
  !$omp private(i,beta,a,u,j,b) &
  !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do
  do beta = 1, nV
    do i = 1, nO
      do a = 1, nV
@ -2015,8 +2101,8 @@ subroutine compute_K1(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,v_vvov,K1)
  enddo
  !$omp end do nowait
  !$omp do collapse(3)
  do i = 1, nO
    !$omp do
    do a = 1, nV
      do j = 1, nO
        do b = 1, nV
@ -2024,11 +2110,11 @@ subroutine compute_K1(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,v_vvov,K1)
        enddo
      enddo
    enddo
  enddo
    !$omp end do nowait
  enddo
  !$omp do collapse(3)
  do j = 1, nO
    !$omp do
    do b = 1, nV
      do beta = 1, nV
        do u = 1, nO
@ -2036,8 +2122,8 @@ subroutine compute_K1(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,v_vvov,K1)
        enddo
      enddo
    enddo
  enddo
    !$omp end do
  enddo
  !$omp end parallel
  call dgemm('N','N',nO*nV*nO,nV,nO, &
@ -2060,7 +2146,7 @@ subroutine compute_K1(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,v_vvov,K1)
  !$omp shared(nO,nV,K1,Z) &
  !$omp private(i,beta,a,u) &
  !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do
   do beta = 1, nV
    do i = 1, nO
      do a = 1, nV
--- a/src/ccsd/ccsd_t_space_orb_abc.irp.f
+++ b/src/ccsd/ccsd_t_space_orb_abc.irp.f
@ -10,51 +10,43 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
  double precision, intent(in)  :: v_vvvo(nV,nV,nV,nO), v_vvoo(nV,nV,nO,nO), v_vooo(nV,nO,nO,nO)
  double precision, intent(out) :: energy
-  double precision, allocatable :: W(:,:,:,:,:,:)
+  double precision, allocatable :: X_vovv(:,:,:,:), X_ooov(:,:,:,:), X_oovv(:,:,:,:)
-  double precision, allocatable :: V(:,:,:,:,:,:)
+  double precision, allocatable :: T_voov(:,:,:,:), T_oovv(:,:,:,:)
  double precision, allocatable :: W_abc(:,:,:), V_abc(:,:,:)
  double precision, allocatable :: W_cab(:,:,:), W_cba(:,:,:)
  double precision, allocatable :: W_bca(:,:,:), V_cba(:,:,:)
  double precision, allocatable :: X_vvvo(:,:,:,:), X_ovoo(:,:,:,:), X_vvoo(:,:,:,:)
  double precision, allocatable :: T_vvoo(:,:,:,:), T_ovvo(:,:,:,:), T_vo(:,:)
  integer                       :: i,j,k,l,a,b,c,d
-  double precision              :: e,ta,tb, delta, delta_abc
+  double precision              :: e,ta,tb
-  !allocate(W(nV,nV,nV,nO,nO,nO))
+  call set_multiple_levels_omp(.False.)
-  !allocate(V(nV,nV,nV,nO,nO,nO))
+
-  allocate(W_abc(nO,nO,nO), V_abc(nO,nO,nO), W_cab(nO,nO,nO))
+  allocate(X_vovv(nV,nO,nV,nV), X_ooov(nO,nO,nO,nV), X_oovv(nO,nO,nV,nV))
-  allocate(W_bca(nO,nO,nO), V_cba(nO,nO,nO), W_cba(nO,nO,nO))
+  allocate(T_voov(nV,nO,nO,nV),T_oovv(nO,nO,nV,nV))
  allocate(X_vvvo(nV,nV,nV,nO), X_ovoo(nO,nV,nO,nO), X_vvoo(nV,nV,nO,nO))
  allocate(T_vvoo(nV,nV,nO,nO), T_ovvo(nO,nV,nV,nO), T_vo(nV,nO))
  ! Temporary arrays
  !$OMP PARALLEL &
-  !$OMP SHARED(nO,nV,T_vvoo,T_ovvo,T_vo,X_vvvo,X_ovoo,X_vvoo, &
+  !$OMP SHARED(nO,nV,T_voov,T_oovv,X_vovv,X_ooov,X_oovv, &
  !$OMP t1,t2,v_vvvo,v_vooo,v_vvoo) &
  !$OMP PRIVATE(a,b,c,d,i,j,k,l) &
  !$OMP DEFAULT(NONE)
  !v_vvvo(b,a,d,i) * t2(k,j,c,d) &
-  !X_vvvo(d,b,a,i) * T_vvoo(d,c,k,j)
+  !X_vovv(d,i,b,a,i) * T_voov(d,j,c,k)
-  !$OMP DO collapse(3)
+  !$OMP DO
  do i = 1, nO
  do a = 1, nV
    do b = 1, nV
      do i = 1, nO
        do d = 1, nV
-          X_vvvo(d,b,a,i) = v_vvvo(b,a,d,i)
+          X_vovv(d,i,b,a) = v_vvvo(b,a,d,i)
        enddo
      enddo
    enddo
  enddo
  !$OMP END DO nowait
-  !$OMP DO collapse(3)
+  !$OMP DO
  do c = 1, nV
    do j = 1, nO
      do k = 1, nO
      do c = 1, nV
        do d = 1, nV
-          T_vvoo(d,c,k,j) = t2(k,j,c,d)
+          T_voov(d,k,j,c) = t2(k,j,c,d)
        enddo
      enddo
    enddo
@ -62,191 +54,399 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
  !$OMP END DO nowait
  !v_vooo(c,j,k,l) * t2(i,l,a,b) &
-  !X_ovoo(l,c,j,k) * T_ovvo(l,a,b,i) &
+  !X_ooov(l,j,k,c) * T_oovv(l,i,a,b) &
-  !$OMP DO collapse(3)
+  !$OMP DO
  do c = 1, nV
    do k = 1, nO
      do j = 1, nO
      do c = 1, nV
        do l = 1, nO
-           X_ovoo(l,c,j,k) = v_vooo(c,j,k,l)
+           X_ooov(l,j,k,c) = v_vooo(c,j,k,l)
        enddo
      enddo
    enddo
  enddo
  !$OMP END DO nowait
-  !$OMP DO collapse(3)
+  !$OMP DO
  do i = 1, nO
  do b = 1, nV
    do a = 1, nV
      do i = 1, nO
        do l = 1, nO
-          T_ovvo(l,a,b,i) = t2(i,l,a,b)
+          T_oovv(l,i,a,b) = t2(i,l,a,b)
        enddo
      enddo
    enddo
  enddo
  !$OMP END DO nowait
-  !v_vvoo(b,c,j,k) * t1(i,a) &
+  !X_oovv(j,k,b,c) * T1_vo(a,i) &
  !X_vvoo(b,c,k,j) * T1_vo(a,i) &
-  !$OMP DO collapse(3)
+  !$OMP DO
-  do j = 1, nO
+  do c = 1, nV
    do b = 1, nV
      do k = 1, nO
-      do c = 1, nV
+        do j = 1, nO
-        do b = 1, nV
+          X_oovv(j,k,b,c) = v_vvoo(b,c,j,k)
          X_vvoo(b,c,k,j) = v_vvoo(b,c,j,k)
        enddo
      enddo
    enddo
  enddo
  !$OMP END DO nowait
  !$OMP DO collapse(1)
  do i = 1, nO
    do a = 1, nV
      T_vo(a,i) = t1(i,a)
    enddo
  enddo
  !$OMP END DO
  !$OMP END PARALLEL
-  call wall_time(ta)
+  double precision, external :: ccsd_t_task_aba
-  energy = 0d0
+  double precision, external :: ccsd_t_task_abc
  do c = 1, nV
    do b = 1, nV
      do a = 1, nV
        delta_abc = f_v(a) + f_v(b) + f_v(c)
        call form_w_abc(nO,nV,a,b,c,T_vvoo,T_ovvo,X_vvvo,X_ovoo,W_abc)
        call form_w_abc(nO,nV,b,c,a,T_vvoo,T_ovvo,X_vvvo,X_ovoo,W_bca)
        call form_w_abc(nO,nV,c,a,b,T_vvoo,T_ovvo,X_vvvo,X_ovoo,W_cab)
        call form_w_abc(nO,nV,c,b,a,T_vvoo,T_ovvo,X_vvvo,X_ovoo,W_cba)
-        call form_v_abc(nO,nV,a,b,c,T_vo,X_vvoo,W_abc,V_abc)
+  !$OMP PARALLEL PRIVATE(a,b,c,e) DEFAULT(SHARED)
        call form_v_abc(nO,nV,c,b,a,T_vo,X_vvoo,W_cba,V_cba)
        !$OMP PARALLEL                                               &
            !$OMP SHARED(energy,nO,a,b,c,W_abc,W_cab,W_bca,V_abc,V_cba,f_o,f_v,delta_abc)&
            !$OMP PRIVATE(i,j,k,e,delta)                             &
            !$OMP DEFAULT(NONE)
  e = 0d0
-        !$OMP DO
+  !$OMP DO SCHEDULE(dynamic)
-        do i = 1, nO
+  do a = 1, nV
-          do j = 1, nO
+    do b = a+1, nV
-            do k = 1, nO
+      do c = b+1, nV
-              delta = 1d0 / (f_o(i) + f_o(j) + f_o(k) - delta_abc)
+        e = e + ccsd_t_task_abc(a,b,c,nO,nV,t1,T_oovv,T_voov, &
-              !energy = energy + (4d0 * W(i,j,k,a,b,c) + W(i,j,k,b,c,a) + W(i,j,k,c,a,b)) * (V(i,j,k,a,b,c) - V(i,j,k,c,b,a)) / (cc_space_f_o(i) + cc_space_f_o(j) + cc_space_f_o(k) - cc_space_f_v(a) - cc_space_f_v(b) - cc_space_f_v(c))  !delta_ooovvv(i,j,k,a,b,c)
+                        X_ooov,X_oovv,X_vovv,f_o,f_v)
              e = e + (4d0 * W_abc(i,j,k) + W_bca(i,j,k) + W_cab(i,j,k))&
                  * (V_abc(i,j,k) - V_cba(i,j,k)) * delta
      enddo
      e = e + ccsd_t_task_aba(a,b,nO,nV,t1,T_oovv,T_voov, &
                      X_ooov,X_oovv,X_vovv,f_o,f_v)
      e = e + ccsd_t_task_aba(b,a,nO,nV,t1,T_oovv,T_voov, &
                      X_ooov,X_oovv,X_vovv,f_o,f_v)
    enddo
  enddo
  !$OMP END DO NOWAIT
  !$OMP CRITICAL
  energy = energy + e
  !$OMP END CRITICAL
  !$OMP END PARALLEL
      enddo
    enddo
    call wall_time(tb)
    write(*,'(F12.2,A5,F12.2,A2)') dble(i)/dble(nO)*100d0, '% in ', tb - ta, ' s'
  enddo
-  energy = energy / 3d0
+  energy = energy / 3.d0
-  deallocate(W_abc,V_abc,W_cab,V_cba,W_bca,X_vvvo,X_ovoo,T_vvoo,T_ovvo,T_vo)
+  deallocate(X_vovv,X_ooov,T_voov,T_oovv)
  !deallocate(V,W)
 end
-subroutine form_w_abc(nO,nV,a,b,c,T_vvoo,T_ovvo,X_vvvo,X_ovoo,W_abc)
+double precision function ccsd_t_task_abc(a,b,c,nO,nV,t1,T_oovv,T_voov,&
      X_ooov,X_oovv,X_vovv,f_o,f_v) result(e)
 ! Return the scalar contribution e of one index triple (a,b,c):
 ! the sum over i,j,k = 1..nO of
 !   [combination of W and V intermediates] /
 !     (f_o(i) + f_o(j) + f_o(k) - f_v(a) - f_v(b) - f_v(c)).
 ! Presumably this is the perturbative-triples (T) energy term for three
 ! distinct virtual orbitals -- TODO confirm against the caller.
 !
 ! Arguments:
 !   a, b, c                : indices into f_v, forwarded to the form_* helpers
 !   nO, nV                 : array extents
 !   t1                     : (nO,nV), forwarded to form_v_abc
 !   T_oovv, T_voov, X_ooov,
 !   X_vovv                 : reordered 4-index arrays, forwarded to form_w_abc
 !   X_oovv                 : 4-index array, forwarded to form_v_abc
 !   f_o(nO), f_v(nV)       : values entering the denominator
 !
 ! form_w_abc and form_v_abc are defined outside this function; what they
 ! store in each W_* / V_* array is not visible here.
  implicit none
  integer, intent(in)              :: nO,nV,a,b,c
  double precision, intent(in)     :: t1(nO,nV), f_o(nO), f_v(nV)
  double precision, intent(in)     :: X_oovv(nO,nO,nV,nV)
  double precision, intent(in)     :: T_voov(nV,nO,nO,nV), T_oovv(nO,nO,nV,nV)
  double precision, intent(in)     :: X_vovv(nV,nO,nV,nV), X_ooov(nO,nO,nO,nV)
  double precision :: delta, delta_abc
  integer  :: i,j,k
 ! Twelve (nO,nO,nO) work arrays: one W and one V per suffix
 ! (abc, cab, bca, bac, cba, acb). Allocated and freed on every call.
  double precision, allocatable :: W_abc(:,:,:), W_cab(:,:,:), W_bca(:,:,:)
  double precision, allocatable :: W_bac(:,:,:), W_cba(:,:,:), W_acb(:,:,:)
  double precision, allocatable :: V_abc(:,:,:), V_cab(:,:,:), V_bca(:,:,:)
  double precision, allocatable :: V_bac(:,:,:), V_cba(:,:,:), V_acb(:,:,:)
  allocate( W_abc(nO,nO,nO), W_cab(nO,nO,nO), W_bca(nO,nO,nO), &
            W_bac(nO,nO,nO), W_cba(nO,nO,nO), W_acb(nO,nO,nO), &
            V_abc(nO,nO,nO), V_cab(nO,nO,nO), V_bca(nO,nO,nO), &
            V_bac(nO,nO,nO), V_cba(nO,nO,nO), V_acb(nO,nO,nO) )
 ! Fill the six W arrays first; form_v_abc then receives each W together
 ! with the V of the same suffix (W_abc,V_abc, W_cba,V_cba, ...).
  call form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc,W_cba,W_bca,W_cab,W_bac,W_acb)
  call form_v_abc(nO,nV,a,b,c,t1,X_oovv,W_abc,V_abc,W_cba,V_cba,W_bca,V_bca,W_cab,V_cab,W_bac,V_bac,W_acb,V_acb)
 ! Part of the denominator that does not depend on i,j,k.
  delta_abc = f_v(a) + f_v(b) + f_v(c)
  e = 0.d0
  do k = 1, nO
    do j = 1, nO
      do i = 1, nO
 ! No guard against a zero denominator is applied here.
        delta = 1.d0 / (f_o(i) + f_o(j) + f_o(k) - delta_abc)
 ! Three terms, each of the form
 !   (4*(W_x - W_x') + (W_y - W_y') + (W_z - W_z')) * (V_x - V_x'),
 ! with leading pairs (abc,cba), (acb,bca) and (bac,cab) respectively.
        e = e + delta * (                                    &
            (4d0 * (W_abc(i,j,k) - W_cba(i,j,k)) +           &
            W_bca(i,j,k) - W_bac(i,j,k)  +                   &
            W_cab(i,j,k) - W_acb(i,j,k)  ) * (V_abc(i,j,k) - V_cba(i,j,k)) +&
            (4d0 * (W_acb(i,j,k) - W_bca(i,j,k)) +           &
            W_cba(i,j,k) - W_cab(i,j,k)  +                   &
            W_bac(i,j,k) - W_abc(i,j,k)  ) * (V_acb(i,j,k) - V_bca(i,j,k)) +&
            (4d0 * (W_bac(i,j,k) - W_cab(i,j,k)) +           &
            W_acb(i,j,k) - W_abc(i,j,k)  +                   &
            W_cba(i,j,k) - W_bca(i,j,k)  ) * (V_bac(i,j,k) - V_cab(i,j,k)) )
      enddo
    enddo
  enddo
  deallocate(W_abc, W_cab, W_bca, W_bac, W_cba, W_acb, &
             V_abc, V_cab, V_bca, V_bac, V_cba, V_acb )
 end
 double precision function ccsd_t_task_aba(a,b,nO,nV,t1,T_oovv,T_voov,&
      X_ooov,X_oovv,X_vovv,f_o,f_v) result(e)
 ! Return the scalar contribution e of the index triple (a,b,a), i.e. the
 ! case where the first and third indices coincide: the sum over
 ! i,j,k = 1..nO of
 !   [combination of W and V intermediates] /
 !     (f_o(i) + f_o(j) + f_o(k) - 2*f_v(a) - f_v(b)).
 ! Presumably the perturbative-triples (T) energy term for a repeated
 ! virtual index -- TODO confirm against the caller.
 !
 ! Arguments:
 !   a, b                   : indices into f_v; a is passed twice to the helpers
 !   nO, nV                 : array extents
 !   t1                     : (nO,nV), forwarded to form_v_abc
 !   T_oovv, T_voov, X_ooov,
 !   X_vovv                 : reordered 4-index arrays, forwarded to form_w_abc
 !   X_oovv                 : 4-index array, forwarded to form_v_abc
 !   f_o(nO), f_v(nV)       : values entering the denominator
 !
 ! form_w_abc and form_v_abc are defined outside this function; what they
 ! store in each W_* / V_* array is not visible here.
  implicit none
  integer, intent(in)              :: nO,nV,a,b
  double precision, intent(in)     :: t1(nO,nV), f_o(nO), f_v(nV)
  double precision, intent(in)     :: X_oovv(nO,nO,nV,nV)
  double precision, intent(in)     :: T_voov(nV,nO,nO,nV), T_oovv(nO,nO,nV,nV)
  double precision, intent(in)     :: X_vovv(nV,nO,nV,nV), X_ooov(nO,nO,nO,nV)
  double precision :: delta, delta_abc
  integer  :: i,j,k
 ! Twelve (nO,nO,nO) work arrays, named as in the three-index variant even
 ! though the "c" slot is filled with a here. Allocated and freed per call.
  double precision, allocatable :: W_abc(:,:,:), W_cab(:,:,:), W_bca(:,:,:)
  double precision, allocatable :: W_bac(:,:,:), W_cba(:,:,:), W_acb(:,:,:)
  double precision, allocatable :: V_abc(:,:,:), V_cab(:,:,:), V_bca(:,:,:)
  double precision, allocatable :: V_bac(:,:,:), V_cba(:,:,:), V_acb(:,:,:)
  allocate( W_abc(nO,nO,nO), W_cab(nO,nO,nO), W_bca(nO,nO,nO), &
            W_bac(nO,nO,nO), W_cba(nO,nO,nO), W_acb(nO,nO,nO), &
            V_abc(nO,nO,nO), V_cab(nO,nO,nO), V_bca(nO,nO,nO), &
            V_bac(nO,nO,nO), V_cba(nO,nO,nO), V_acb(nO,nO,nO) )
 ! Same helper calls as for a general triple, with the third index set to a.
  call form_w_abc(nO,nV,a,b,a,T_voov,T_oovv,X_vovv,X_ooov,W_abc,W_cba,W_bca,W_cab,W_bac,W_acb)
  call form_v_abc(nO,nV,a,b,a,t1,X_oovv,W_abc,V_abc,W_cba,V_cba,W_bca,V_bca,W_cab,V_cab,W_bac,V_bac,W_acb,V_acb)
 ! Part of the denominator that does not depend on i,j,k (f_v(a) counted twice).
  delta_abc = f_v(a) + f_v(b) + f_v(a)
  e = 0.d0
  do k = 1, nO
    do j = 1, nO
      do i = 1, nO
 ! No guard against a zero denominator is applied here.
        delta = 1.d0 / (f_o(i) + f_o(j) + f_o(k) - delta_abc)
 ! Three terms of the form (4*W_x + W_y + W_z) * (V_x - V_x'); unlike
 ! ccsd_t_task_abc, the W factors are plain sums, not differences.
        e = e + delta * (                                    &
               (4d0 * W_abc(i,j,k) + W_bca(i,j,k) + W_cab(i,j,k)) * (V_abc(i,j,k) - V_cba(i,j,k)) + &
               (4d0 * W_acb(i,j,k) + W_cba(i,j,k) + W_bac(i,j,k)) * (V_acb(i,j,k) - V_bca(i,j,k)) + &
               (4d0 * W_bac(i,j,k) + W_acb(i,j,k) + W_cba(i,j,k)) * (V_bac(i,j,k) - V_cab(i,j,k)) )
      enddo
    enddo
  enddo
  deallocate(W_abc, W_cab, W_bca, W_bac, W_cba, W_acb, &
             V_abc, V_cab, V_bca, V_bac, V_cba, V_acb )
 end
 subroutine form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc,W_cba,W_bca,W_cab,W_bac,W_acb)
  implicit none
  integer, intent(in)           :: nO,nV,a,b,c
-  !double precision, intent(in) :: t2(nO,nO,nV,nV)
+  double precision, intent(in)  :: T_voov(nV,nO,nO,nV), T_oovv(nO,nO,nV,nV)
-  double precision, intent(in)  :: T_vvoo(nV,nV,nO,nO), T_ovvo(nO,nV,nV,nO)
+  double precision, intent(in)  :: X_vovv(nV,nO,nV,nV), X_ooov(nO,nO,nO,nV)
  double precision, intent(in)  :: X_vvvo(nV,nV,nV,nO), X_ovoo(nO,nV,nO,nO)
  double precision, intent(out) :: W_abc(nO,nO,nO)
  double precision, intent(out) :: W_cba(nO,nO,nO)
  double precision, intent(out) :: W_bca(nO,nO,nO)
  double precision, intent(out) :: W_cab(nO,nO,nO)
  double precision, intent(out) :: W_bac(nO,nO,nO)
  double precision, intent(out) :: W_acb(nO,nO,nO)
  integer :: l,i,j,k,d
  double precision, allocatable, dimension(:,:,:,:) :: W_ikj
  double precision, allocatable :: X(:,:,:,:)
  allocate(W_ikj(nO,nO,nO,6))
  allocate(X(nV,nO,nO,3))
  do k=1,nO
    do i=1,nO
      do d=1,nV
        X(d,i,k,1) = T_voov(d,k,i,a)
        X(d,i,k,2) = T_voov(d,k,i,b)
        X(d,i,k,3) = T_voov(d,k,i,c)
      enddo
    enddo
  enddo
 !   X_vovv(d,i,c,a) * T_voov(d,j,k,b) : i jk
  call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,c,a), nV, T_voov(1,1,1,b), nV, 0.d0, W_abc, nO)
  call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,c,b), nV, T_voov(1,1,1,a), nV, 0.d0, W_bac, nO)
  call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,a,c), nV, T_voov(1,1,1,b), nV, 0.d0, W_cba, nO)
  call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,a,b), nV, T_voov(1,1,1,c), nV, 0.d0, W_bca, nO)
  call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,b,c), nV, T_voov(1,1,1,a), nV, 0.d0, W_cab, nO)
  call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,b,a), nV, T_voov(1,1,1,c), nV, 0.d0, W_acb, nO)
 !   T_voov(d,i,j,a) * X_vovv(d,k,b,c) : ij k
  call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,a), nV, X_vovv(1,1,b,c), nV, 1.d0, W_abc, nO*nO)
  call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,b), nV, X_vovv(1,1,a,c), nV, 1.d0, W_bac, nO*nO)
  call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,c), nV, X_vovv(1,1,b,a), nV, 1.d0, W_cba, nO*nO)
  call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,b), nV, X_vovv(1,1,c,a), nV, 1.d0, W_bca, nO*nO)
  call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,c), nV, X_vovv(1,1,a,b), nV, 1.d0, W_cab, nO*nO)
  call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,a), nV, X_vovv(1,1,c,b), nV, 1.d0, W_acb, nO*nO)
-  !$OMP PARALLEL &
+!   X_vovv(d,k,a,c) * T_voov(d,j,i,b) : k ji
-  !$OMP SHARED(nO,nV,a,b,c,T_vvoo,T_ovvo,X_vvvo,X_ovoo,W_abc) &
+
-  !$OMP PRIVATE(i,j,k,d,l) &
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,2), nV, X_vovv(1,1,a,c), nV, 1.d0, W_abc, nO*nO)
-  !$OMP DEFAULT(NONE)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,1), nV, X_vovv(1,1,b,c), nV, 1.d0, W_bac, nO*nO)
  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,2), nV, X_vovv(1,1,c,a), nV, 1.d0, W_cba, nO*nO)
  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,3), nV, X_vovv(1,1,b,a), nV, 1.d0, W_bca, nO*nO)
  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,1), nV, X_vovv(1,1,c,b), nV, 1.d0, W_cab, nO*nO)
  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,3), nV, X_vovv(1,1,a,b), nV, 1.d0, W_acb, nO*nO)
 !   X_vovv(d,i,b,a) * T_voov(d,k,j,c) : i kj
  call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,b,a), nV, X(1,1,1,3), nV, 1.d0, W_abc, nO)
  call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,a,b), nV, X(1,1,1,3), nV, 1.d0, W_bac, nO)
  call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,b,c), nV, X(1,1,1,1), nV, 1.d0, W_cba, nO)
  call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,c,b), nV, X(1,1,1,1), nV, 1.d0, W_bca, nO)
  call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,a,c), nV, X(1,1,1,2), nV, 1.d0, W_cab, nO)
  call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,c,a), nV, X(1,1,1,2), nV, 1.d0, W_acb, nO)
 !  T_voov(d,k,i,c) * X_vovv(d,j,a,b) : ki j
  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,3), nV, X_vovv(1,1,a,b), nV, 0.d0, W_ikj(1,1,1,1), nO*nO)
  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,3), nV, X_vovv(1,1,b,a), nV, 0.d0, W_ikj(1,1,1,2), nO*nO)
  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,1), nV, X_vovv(1,1,c,b), nV, 0.d0, W_ikj(1,1,1,3), nO*nO)
  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,1), nV, X_vovv(1,1,b,c), nV, 0.d0, W_ikj(1,1,1,4), nO*nO)
  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,2), nV, X_vovv(1,1,c,a), nV, 0.d0, W_ikj(1,1,1,5), nO*nO)
  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,2), nV, X_vovv(1,1,a,c), nV, 0.d0, W_ikj(1,1,1,6), nO*nO)
 !   T_voov(d,i,k,a) * X_vovv(d,j,c,b) : ik j
  call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,a), nV, X_vovv(1,1,c,b), nV, 1.d0, W_ikj(1,1,1,1), nO*nO)
  call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,b), nV, X_vovv(1,1,c,a), nV, 1.d0, W_ikj(1,1,1,2), nO*nO)
  call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,c), nV, X_vovv(1,1,a,b), nV, 1.d0, W_ikj(1,1,1,3), nO*nO)
  call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,b), nV, X_vovv(1,1,a,c), nV, 1.d0, W_ikj(1,1,1,4), nO*nO)
  call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,c), nV, X_vovv(1,1,b,a), nV, 1.d0, W_ikj(1,1,1,5), nO*nO)
  call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,a), nV, X_vovv(1,1,b,c), nV, 1.d0, W_ikj(1,1,1,6), nO*nO)
  deallocate(X)
  allocate(X(nO,nO,nO,3))
  do k=1,nO
    do j=1,nO
      do l=1,nO
        X(l,j,k,1) = X_ooov(l,k,j,a)
        X(l,j,k,2) = X_ooov(l,k,j,b)
        X(l,j,k,3) = X_ooov(l,k,j,c)
      enddo
    enddo
  enddo
 !   - T_oovv(l,i,a,b) * X_ooov(l,j,k,c) : i jk
  call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,a,b), nO, X_ooov(1,1,1,c), nO, 1.d0, W_abc, nO)
  call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,b,a), nO, X_ooov(1,1,1,c), nO, 1.d0, W_bac, nO)
  call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,c,b), nO, X_ooov(1,1,1,a), nO, 1.d0, W_cba, nO)
  call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,b,c), nO, X_ooov(1,1,1,a), nO, 1.d0, W_bca, nO)
  call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,c,a), nO, X_ooov(1,1,1,b), nO, 1.d0, W_cab, nO)
  call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,a,c), nO, X_ooov(1,1,1,b), nO, 1.d0, W_acb, nO)
 !   - T_oovv(l,i,a,c) * X_ooov(l,k,j,b) : i kj
  call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,a,c), nO, X(1,1,1,2), nO, 1.d0, W_abc, nO)
  call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,b,c), nO, X(1,1,1,1), nO, 1.d0, W_bac, nO)
  call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,c,a), nO, X(1,1,1,2), nO, 1.d0, W_cba, nO)
  call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,b,a), nO, X(1,1,1,3), nO, 1.d0, W_bca, nO)
  call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,c,b), nO, X(1,1,1,1), nO, 1.d0, W_cab, nO)
  call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,a,b), nO, X(1,1,1,3), nO, 1.d0, W_acb, nO)
 !   - X_ooov(l,i,j,b) * T_oovv(l,k,c,a) : ij k
  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,b), nO, T_oovv(1,1,c,a), nO, 1.d0, W_abc, nO*nO)
  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,a), nO, T_oovv(1,1,c,b), nO, 1.d0, W_bac, nO*nO)
  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,b), nO, T_oovv(1,1,a,c), nO, 1.d0, W_cba, nO*nO)
  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,c), nO, T_oovv(1,1,a,b), nO, 1.d0, W_bca, nO*nO)
  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,a), nO, T_oovv(1,1,b,c), nO, 1.d0, W_cab, nO*nO)
  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,c), nO, T_oovv(1,1,b,a), nO, 1.d0, W_acb, nO*nO)
 !   - X_ooov(l,j,i,a) * T_oovv(l,k,c,b) : ji k
  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,1), nO, T_oovv(1,1,c,b), nO, 1.d0, W_abc, nO*nO)
  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,2), nO, T_oovv(1,1,c,a), nO, 1.d0, W_bac, nO*nO)
  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,3), nO, T_oovv(1,1,a,b), nO, 1.d0, W_cba, nO*nO)
  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,2), nO, T_oovv(1,1,a,c), nO, 1.d0, W_bca, nO*nO)
  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,3), nO, T_oovv(1,1,b,a), nO, 1.d0, W_cab, nO*nO)
  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,1), nO, T_oovv(1,1,b,c), nO, 1.d0, W_acb, nO*nO)
 !   - X_ooov(l,k,i,a) * T_oovv(l,j,b,c) : ki j
  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,1), nO, T_oovv(1,1,b,c), nO, 1.d0, W_ikj(1,1,1,1), nO*nO)
  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,2), nO, T_oovv(1,1,a,c), nO, 1.d0, W_ikj(1,1,1,2), nO*nO)
  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,3), nO, T_oovv(1,1,b,a), nO, 1.d0, W_ikj(1,1,1,3), nO*nO)
  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,2), nO, T_oovv(1,1,c,a), nO, 1.d0, W_ikj(1,1,1,4), nO*nO)
  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,3), nO, T_oovv(1,1,a,b), nO, 1.d0, W_ikj(1,1,1,5), nO*nO)
  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,1), nO, T_oovv(1,1,c,b), nO, 1.d0, W_ikj(1,1,1,6), nO*nO)
 !   - X_ooov(l,i,k,c) * T_oovv(l,j,b,a) : ik j
  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,c), nO, T_oovv(1,1,b,a), nO, 1.d0, W_ikj(1,1,1,1), nO*nO)
  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,c), nO, T_oovv(1,1,a,b), nO, 1.d0, W_ikj(1,1,1,2), nO*nO)
  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,a), nO, T_oovv(1,1,b,c), nO, 1.d0, W_ikj(1,1,1,3), nO*nO)
  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,a), nO, T_oovv(1,1,c,b), nO, 1.d0, W_ikj(1,1,1,4), nO*nO)
  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,b), nO, T_oovv(1,1,a,c), nO, 1.d0, W_ikj(1,1,1,5), nO*nO)
  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,b), nO, T_oovv(1,1,c,a), nO, 1.d0, W_ikj(1,1,1,6), nO*nO)
  !$OMP DO collapse(3)
  do k=1,nO
    do j=1,nO
      do i=1,nO
-        W_abc(i,j,k) = 0.d0
+        W_abc(i,j,k) = W_abc(i,j,k) + W_ikj(i,k,j,1)
-
+        W_bac(i,j,k) = W_bac(i,j,k) + W_ikj(i,k,j,2)
-        do d = 1, nV
+        W_cba(i,j,k) = W_cba(i,j,k) + W_ikj(i,k,j,3)
-          W_abc(i,j,k) = W_abc(i,j,k) &
+        W_bca(i,j,k) = W_bca(i,j,k) + W_ikj(i,k,j,4)
-                 + X_vvvo(d,b,a,i) * T_vvoo(d,c,k,j) &
+        W_cab(i,j,k) = W_cab(i,j,k) + W_ikj(i,k,j,5)
-                 + X_vvvo(d,c,a,i) * T_vvoo(d,b,j,k) &
+        W_acb(i,j,k) = W_acb(i,j,k) + W_ikj(i,k,j,6)
                 + X_vvvo(d,a,c,k) * T_vvoo(d,b,j,i) &
                 + X_vvvo(d,b,c,k) * T_vvoo(d,a,i,j) &
                 + X_vvvo(d,c,b,j) * T_vvoo(d,a,i,k) &
                 + X_vvvo(d,a,b,j) * T_vvoo(d,c,k,i)
        enddo
        do l = 1, nO
          W_abc(i,j,k) = W_abc(i,j,k) &
              - T_ovvo(l,a,b,i) * X_ovoo(l,c,j,k) &
              - T_ovvo(l,a,c,i) * X_ovoo(l,b,k,j) & ! bc kj
              - T_ovvo(l,c,a,k) * X_ovoo(l,b,i,j) & ! prev ac ik
              - T_ovvo(l,c,b,k) * X_ovoo(l,a,j,i) & ! prev ab ij
              - T_ovvo(l,b,c,j) * X_ovoo(l,a,k,i) & ! prev bc kj
              - T_ovvo(l,b,a,j) * X_ovoo(l,c,i,k) ! prev ac ik
        enddo
      enddo
    enddo
  enddo
  !$OMP END DO
  !$OMP END PARALLEL
  deallocate(X,W_ikj)
 end
 ! V_abc
-subroutine form_v_abc(nO,nV,a,b,c,T_vo,X_vvoo,W,V)
+subroutine form_v_abc(nO,nV,a,b,c,T_ov,X_oovv,W_abc,V_abc,W_cba,V_cba,W_bca,V_bca,W_cab,V_cab,W_bac,V_bac,W_acb,V_acb)
 implicit none
  integer, intent(in)           :: nO,nV,a,b,c
-  !double precision, intent(in)  :: t1(nO,nV)
+  double precision, intent(in)  :: T_ov(nO,nV)
-  double precision, intent(in)  :: T_vo(nV,nO)
+  double precision, intent(in)  :: X_oovv(nO,nO,nV,nV)
-  double precision, intent(in)  :: X_vvoo(nV,nV,nO,nO)
+  double precision, intent(in)  :: W_abc(nO,nO,nO), W_cab(nO,nO,nO), W_bca(nO,nO,nO)
-  double precision, intent(in)  :: W(nO,nO,nO)
+  double precision, intent(in)  :: W_bac(nO,nO,nO), W_cba(nO,nO,nO), W_acb(nO,nO,nO)
-  double precision, intent(out) :: V(nO,nO,nO)
+  double precision, intent(out) :: V_abc(nO,nO,nO), V_cab(nO,nO,nO), V_bca(nO,nO,nO)
  double precision, intent(out) :: V_bac(nO,nO,nO), V_cba(nO,nO,nO), V_acb(nO,nO,nO)
  integer :: i,j,k
  !$OMP PARALLEL &
  !$OMP SHARED(nO,nV,a,b,c,T_vo,X_vvoo,W,V) &
  !$OMP PRIVATE(i,j,k) &
  !$OMP DEFAULT(NONE)
  !$OMP DO collapse(2)
  do k = 1, nO
    do j = 1, nO
      do i = 1, nO
-        !V(i,j,k,a,b,c) = V(i,j,k,a,b,c) + W(i,j,k,a,b,c) &
+        V_abc(i,j,k) = W_abc(i,j,k) &
-        V(i,j,k) = W(i,j,k) &
+           + X_oovv(j,k,b,c) * T_ov(i,a) &
-           + X_vvoo(b,c,k,j) * T_vo(a,i) &
+           + X_oovv(i,k,a,c) * T_ov(j,b) &
-           + X_vvoo(a,c,k,i) * T_vo(b,j) &
+           + X_oovv(i,j,a,b) * T_ov(k,c)
-           + X_vvoo(a,b,j,i) * T_vo(c,k)
+
        V_cba(i,j,k) = W_cba(i,j,k) &
           + X_oovv(j,k,b,a) * T_ov(i,c) &
           + X_oovv(i,k,c,a) * T_ov(j,b) &
           + X_oovv(i,j,c,b) * T_ov(k,a)
        V_bca(i,j,k) = W_bca(i,j,k) &
           + X_oovv(j,k,c,a) * T_ov(i,b) &
           + X_oovv(i,k,b,a) * T_ov(j,c) &
           + X_oovv(i,j,b,c) * T_ov(k,a)
        V_cab(i,j,k) = W_cab(i,j,k) &
           + X_oovv(j,k,a,b) * T_ov(i,c) &
           + X_oovv(i,k,c,b) * T_ov(j,a) &
           + X_oovv(i,j,c,a) * T_ov(k,b)
        V_bac(i,j,k) = W_bac(i,j,k) &
           + X_oovv(j,k,a,c) * T_ov(i,b) &
           + X_oovv(i,k,b,c) * T_ov(j,a) &
           + X_oovv(i,j,b,a) * T_ov(k,c)
        V_acb(i,j,k) = W_acb(i,j,k) &
           + X_oovv(j,k,c,b) * T_ov(i,a) &
           + X_oovv(i,k,a,b) * T_ov(j,c) &
           + X_oovv(i,j,a,c) * T_ov(k,b)
      enddo
    enddo
  enddo
  !$OMP END DO
  !$OMP END PARALLEL
 end
--- a/src/ccsd/ccsd_t_space_orb_stoch.irp.f
+++ b/src/ccsd/ccsd_t_space_orb_stoch.irp.f
@ -0,0 +1,363 @@
 ! Main
 subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
  implicit none
  BEGIN_DOC
 ! Semi-stochastic accumulation of the contributions of the virtual triplets (a,b,c).
 !
 ! The triplets a<b<c, plus (a,b,a) and (b,a,b) for every a<b, are listed and
 ! given a weight proportional to 1/(sum of the corresponding f_v), sorted with
 ! dsort_big and normalized. The sorted list is cut into nbuckets buckets.
 !
 ! At every iteration the master thread
 !  - launches the first not yet computed triplet of the sorted list (deterministic part),
 !  - draws one triplet in every bucket that is not yet fully computed (stochastic part).
 ! Each contribution is computed once, in an OpenMP task, and stored in memo.
 !
 ! About once per second of wall time, and when all triplets are done, the current
 ! estimate, its statistical error and the percentage of computed triplets are printed.
 !
 ! energy : on entry, value added to the printed estimate (saved in eccsd);
 !          on exit, energy_det + energy_stoch.
  END_DOC
  integer, intent(in)           :: nO,nV
  double precision, intent(in)  :: t1(nO,nV), f_o(nO), f_v(nV)
  double precision, intent(in)  :: t2(nO,nO,nV,nV)
  double precision, intent(in)  :: v_vvvo(nV,nV,nV,nO), v_vvoo(nV,nV,nO,nO), v_vooo(nV,nO,nO,nO)
  double precision, intent(inout) :: energy
  double precision, allocatable :: X_vovv(:,:,:,:), X_ooov(:,:,:,:), X_oovv(:,:,:,:)
  double precision, allocatable :: T_voov(:,:,:,:), T_oovv(:,:,:,:)
  integer                       :: i,j,k,l,a,b,c,d
  double precision              :: e,ta,tb,eccsd
  eccsd = energy
  call set_multiple_levels_omp(.False.)
  allocate(X_vovv(nV,nO,nV,nV), X_ooov(nO,nO,nO,nV), X_oovv(nO,nO,nV,nV))
  allocate(T_voov(nV,nO,nO,nV),T_oovv(nO,nO,nV,nV))
  ! Reordered copies of the integrals and amplitudes used by the triplet tasks
  !$OMP PARALLEL &
  !$OMP SHARED(nO,nV,T_voov,T_oovv,X_vovv,X_ooov,X_oovv, &
  !$OMP t1,t2,v_vvvo,v_vooo,v_vvoo) &
  !$OMP PRIVATE(a,b,c,d,i,j,k,l) &
  !$OMP DEFAULT(NONE)
  !v_vvvo(b,a,d,i) * t2(k,j,c,d) &
  !X_vovv(d,i,b,a,i) * T_voov(d,j,c,k)
  !$OMP DO
  do a = 1, nV
    do b = 1, nV
      do i = 1, nO
        do d = 1, nV
          X_vovv(d,i,b,a) = v_vvvo(b,a,d,i)
        enddo
      enddo
    enddo
  enddo
  !$OMP END DO nowait
  !$OMP DO
  do c = 1, nV
    do j = 1, nO
      do k = 1, nO
        do d = 1, nV
          T_voov(d,k,j,c) = t2(k,j,c,d)
        enddo
      enddo
    enddo
  enddo
  !$OMP END DO nowait
  !v_vooo(c,j,k,l) * t2(i,l,a,b) &
  !X_ooov(l,j,k,c) * T_oovv(l,i,a,b) &
  !$OMP DO
  do c = 1, nV
    do k = 1, nO
      do j = 1, nO
        do l = 1, nO
           X_ooov(l,j,k,c) = v_vooo(c,j,k,l)
        enddo
      enddo
    enddo
  enddo
  !$OMP END DO nowait
  !$OMP DO
  do b = 1, nV
    do a = 1, nV
      do i = 1, nO
        do l = 1, nO
          T_oovv(l,i,a,b) = t2(i,l,a,b)
        enddo
      enddo
    enddo
  enddo
  !$OMP END DO nowait
  !X_oovv(j,k,b,c) * T1_vo(a,i) &
  !$OMP DO
  do c = 1, nV
    do b = 1, nV
      do k = 1, nO
        do j = 1, nO
          X_oovv(j,k,b,c) = v_vvoo(b,c,j,k)
        enddo
      enddo
    enddo
  enddo
  !$OMP END DO nowait
  !$OMP END PARALLEL
  double precision, external :: ccsd_t_task_aba
  double precision, external :: ccsd_t_task_abc
 !  logical, external :: omp_test_lock
  double precision, allocatable :: memo(:), Pabc(:), waccu(:)
  integer*8, allocatable :: sampled(:)
 !  integer(omp_lock_kind), allocatable :: lock(:)
  integer*2       , allocatable :: abc(:,:)
  integer*8                     :: Nabc, i8
  integer*8, allocatable :: iorder(:)
  double precision :: eocc
  double precision :: norm
  integer :: kiter, isample
  ! Prepare table of triplets (a,b,c)
  ! Number of triplets: a<b<c, plus two per pair a<b
  Nabc = (int(nV,8) * int(nV+1,8) * int(nV+2,8))/6_8 - nV
  allocate (memo(Nabc), sampled(Nabc), Pabc(Nabc), waccu(Nabc))
  allocate (abc(4,Nabc), iorder(Nabc)) !, lock(Nabc))
 !  eocc = 3.d0/dble(nO) * sum(f_o(1:nO))
  Nabc = 0_8
  do a = 1, nV
    do b = a+1, nV
      do c = b+1, nV
        Nabc = Nabc + 1_8
        Pabc(Nabc) = -1.d0/(f_v(a) + f_v(b) + f_v(c))
        abc(1,Nabc) = a
        abc(2,Nabc) = b
        abc(3,Nabc) = c
      enddo
      Nabc = Nabc + 1_8
      abc(1,Nabc) = a
      abc(2,Nabc) = b
      abc(3,Nabc) = a
      Pabc(Nabc) = -1.d0/(2.d0*f_v(a) + f_v(b))
      Nabc = Nabc + 1_8
      abc(1,Nabc) = b
      abc(2,Nabc) = a
      abc(3,Nabc) = b
      Pabc(Nabc) = -1.d0/(f_v(a) + 2.d0*f_v(b))
    enddo
  enddo
  do i8=1,Nabc
   iorder(i8) = i8
  enddo
  ! Sort triplets in decreasing Pabc
  call dsort_big(Pabc, iorder, Nabc)
  ! Normalize
  norm = 0.d0
  do i8=Nabc,1,-1
    norm = norm + Pabc(i8)
  enddo
  norm = 1.d0/norm
  do i8=1,Nabc
    Pabc(i8) = Pabc(i8) * norm
  enddo
  ! Apply the same permutation to the table of triplets
  call i8set_order_big(abc, iorder, Nabc)
  ! Cumulative distribution for sampling
  waccu(Nabc) = 0.d0
  do i8=Nabc-1,1,-1
   waccu(i8) = waccu(i8+1) - Pabc(i8+1)
  enddo
  waccu(:) = waccu(:) + 1.d0
  logical :: converged, do_comp
  double precision :: eta, variance, error, sample
  double precision :: t00, t01
  integer*8 :: ieta, Ncomputed
  integer*8, external :: binary_search
  integer :: nbuckets
  nbuckets = 100
  double precision, allocatable :: wsum(:)
  allocate(wsum(nbuckets))
  converged = .False.
  Ncomputed = 0_8
  energy = 0.d0
  variance = 0.d0
  memo(:) = 0.d0
  ! sampled(i) == -1 : triplet i not computed yet; otherwise number of times it was drawn
  sampled(:) = -1_8
  integer*8 :: ileft, iright, imin
  ileft = 1_8
  iright = Nabc
  integer*8, allocatable :: bounds(:,:)
  allocate (bounds(2,nbuckets))
  ! bounds(1:2,n) : first and last triplet of bucket n, wsum(n) : total weight of bucket n
  do isample=1,nbuckets
    eta = 1.d0/dble(nbuckets) * dble(isample)
    ieta = binary_search(waccu,eta,Nabc)
    bounds(1,isample) = ileft
    bounds(2,isample) = ieta
    ileft = ieta+1
    wsum(isample) = sum( Pabc(bounds(1,isample):bounds(2,isample) ) )
  enddo
  ! From here on Pabc holds the inverse weights
  Pabc(:) = 1.d0/Pabc(:)
  print '(A)', ''
  print '(A)', ' +----------------------+--------------+----------+'
  print '(A)', ' |      E(CCSD(T))      |   Error      |     %    |'
  print '(A)', ' +----------------------+--------------+----------+'
  call wall_time(t00)
  imin = 1_8
  !$OMP PARALLEL                                                     &
      !$OMP PRIVATE(ieta,eta,a,b,c,kiter,isample)                    &
      !$OMP DEFAULT(SHARED)
  do kiter=1,Nabc
    !$OMP MASTER
    ! Move imin to the first triplet that is not computed yet. The bound is
    ! tested before sampled(imin) is read because Fortran does not guarantee
    ! short-circuit evaluation of .and., and imin may reach Nabc+1.
    do while (imin <= Nabc)
      if (sampled(imin)>-1_8) then
        imin = imin+1
      else
        exit
      endif
    enddo
    ! Deterministic part
    ! imin == Nabc is included: the last triplet has to be computed before the
    ! run is declared complete, since its memo entry enters the final sum.
    if (imin <= Nabc) then
      ieta=imin
      sampled(ieta) = 0_8
      a = abc(1,ieta)
      b = abc(2,ieta)
      c = abc(3,ieta)
      Ncomputed += 1_8
      !$OMP TASK DEFAULT(SHARED) FIRSTPRIVATE(a,b,c,ieta)
      if (a/=c) then
        memo(ieta) = ccsd_t_task_abc(a,b,c,nO,nV,t1,T_oovv,T_voov, &
            X_ooov,X_oovv,X_vovv,f_o,f_v) / 3.d0
      else
        memo(ieta) =  ccsd_t_task_aba(a,b,nO,nV,t1,T_oovv,T_voov,  &
            X_ooov,X_oovv,X_vovv,f_o,f_v) / 3.d0
      endif
      !$OMP END TASK
    endif
    ! Stochastic part
    call random_number(eta)
    do isample=1,nbuckets
      ! Buckets that are fully computed are not sampled any more
      if (imin >= bounds(2,isample)) then
        cycle
      endif
      ieta = binary_search(waccu,(eta + dble(isample-1))/dble(nbuckets),Nabc)
      if (sampled(ieta) == -1_8) then
        sampled(ieta) = 0_8
        a = abc(1,ieta)
        b = abc(2,ieta)
        c = abc(3,ieta)
        Ncomputed += 1_8
        !$OMP TASK DEFAULT(SHARED) FIRSTPRIVATE(a,b,c,ieta)
        if (a/=c) then
          memo(ieta) = ccsd_t_task_abc(a,b,c,nO,nV,t1,T_oovv,T_voov, &
              X_ooov,X_oovv,X_vovv,f_o,f_v) / 3.d0
        else
          memo(ieta) =  ccsd_t_task_aba(a,b,nO,nV,t1,T_oovv,T_voov,  &
              X_ooov,X_oovv,X_vovv,f_o,f_v) / 3.d0
        endif
        !$OMP END TASK
      endif
      sampled(ieta) = sampled(ieta)+1_8
    enddo
    call wall_time(t01)
    if ((t01-t00 > 1.0d0).or.(imin >= Nabc)) then
      t00 = t01
      ! All launched tasks must be finished before memo is read
      !$OMP TASKWAIT
      double precision :: ET, ET2
      double precision :: energy_stoch, energy_det
      double precision :: scale
      double precision :: w
      double precision :: tmp
      energy_stoch = 0.d0
      energy_det   = 0.d0
      norm = 0.d0
      scale = 1.d0
      ET = 0.d0
      ET2 = 0.d0
      ! Buckets that are fully computed are summed exactly
      do isample=1,nbuckets
        if (imin >= bounds(2,isample)) then
          energy_det = energy_det + sum(memo(bounds(1,isample):bounds(2,isample)))
          scale = scale - wsum(isample)
        else
          exit
        endif
      enddo
      ! When the loop above ran to completion, isample == nbuckets+1 and there
      ! is no stochastic bucket left: bounds(:,isample) must not be read.
      if (isample <= nbuckets) then
        do ieta=bounds(1,isample), Nabc
          w = dble(max(sampled(ieta),0_8))
          tmp = w * memo(ieta) * Pabc(ieta)
          ET = ET + tmp
          ET2 = ET2 + tmp * memo(ieta) * Pabc(ieta)
          norm = norm + w
        enddo
        norm = norm/scale
      endif
      if (norm > 0.d0) then
        energy_stoch = ET / norm
        variance = ET2 / norm - energy_stoch*energy_stoch
      endif
      energy = energy_det + energy_stoch
      ! Statistical error, reported as zero when it is not defined
      error = 0.d0
      if ((norm > 1.d0).and.(variance > 0.d0)) then
        error = dsqrt(variance/(norm-1.d0))
      endif
      print '('' | '',F20.8, '' | '', E12.4,'' | '', F8.2,'' |'')', eccsd+energy, error, 100.*real(Ncomputed)/real(Nabc)
    endif
    !$OMP END MASTER
    if (imin >= Nabc) exit
  enddo
  !$OMP END PARALLEL
  print '(A)', ' +----------------------+--------------+----------+'
  print '(A)', ''
  deallocate(X_vovv,X_ooov,T_voov,T_oovv)
 end
 integer*8 function binary_search(arr, key, size)
    implicit none
    BEGIN_DOC
 ! Lower-bound search of key in arr(1:size), which is expected to be sorted in
 ! non-decreasing order.
 !
 ! Returns the index of the first element that is >= key:
 ! 1 if arr(1) >= key, and size+1 if every element is < key.
    END_DOC
    integer*8, intent(in)        :: size
    double precision, intent(in) :: arr(size)
    double precision, intent(in) :: key
    integer*8 :: i, j, mid
    i = 1_8
    j = size
    do while (j >= i)
        mid = i + (j - i) / 2
        if (arr(mid) >= key) then
            ! arr(mid-1) is read only when mid > 1: Fortran does not guarantee
            ! short-circuit evaluation of .and., so the test is nested.
            if (mid > 1) then
                if (arr(mid - 1) < key) then
                    binary_search = mid
                    return
                end if
            end if
            j = mid - 1
        else
            i = mid + 1
        end if
    end do
    binary_search = i
 end function binary_search
--- a/src/cipsi/selection.irp.f
+++ b/src/cipsi/selection.irp.f
@ -76,6 +76,8 @@ subroutine select_connected(i_generator,E0,pt2_data,b,subset,csubset)
  double precision, allocatable  :: fock_diag_tmp(:,:)
  if (csubset == 0) return
  allocate(fock_diag_tmp(2,mo_num+1))
  call build_fock_tmp(fock_diag_tmp,psi_det_generators(1,1,i_generator),N_int)
@ -177,6 +179,7 @@ subroutine select_singles_and_doubles(i_generator,hole_mask,particle_mask,fock_d
  monoAdo = .true.
  monoBdo = .true.
  if (csubset == 0) return
  do k=1,N_int
    hole    (k,1) = iand(psi_det_generators(k,1,i_generator), hole_mask(k,1))
--- a/src/cipsi_tc_bi_ortho/selection.irp.f
+++ b/src/cipsi_tc_bi_ortho/selection.irp.f
@ -868,7 +868,6 @@ subroutine fill_buffer_double(i_generator, sp, h1, h2, bannedOrb, banned, fock_d
        ! <det|H(j)|psi_0> and transpose
        ! -------------------------------------------
 !        call htilde_mu_mat_bi_ortho_tot(det, det, N_int, Hii)
        double precision               :: hmono, htwoe, hthree
        call diag_htilde_mu_mat_fock_bi_ortho(N_int, det, hmono, htwoe, hthree, hii)
        do istate = 1,N_states
@ -878,8 +877,8 @@ subroutine fill_buffer_double(i_generator, sp, h1, h2, bannedOrb, banned, fock_d
            psi_h_alpha = 0.d0
            alpha_h_psi = 0.d0
            do iii = 1, N_det_selectors
-              call htilde_mu_mat_bi_ortho_tot(psi_selectors(1,1,iii), det, N_int, i_h_alpha)
+              call htilde_mu_mat_bi_ortho_tot_slow(psi_selectors(1,1,iii), det, N_int, i_h_alpha)
-              call htilde_mu_mat_bi_ortho_tot(det, psi_selectors(1,1,iii), N_int, alpha_h_i)
+              call htilde_mu_mat_bi_ortho_tot_slow(det, psi_selectors(1,1,iii), N_int, alpha_h_i)
              call get_excitation_degree(psi_selectors(1,1,iii), det,degree,N_int)
              if(degree == 0)then
               print*,'problem !!!'
--- a/src/cosgtos_ao_int/EZFIO.cfg
+++ b/src/cosgtos_ao_int/EZFIO.cfg
@ -1,19 +0,0 @@
 [ao_expoim_cosgtos]
 type: double precision
 doc: imag part for Exponents for each primitive of each cosGTOs |AO|
 size: (ao_basis.ao_num,ao_basis.ao_prim_num_max)
 interface: ezfio, provider
 [use_cosgtos]
 type: logical
 doc: If true, use cosgtos for AO integrals
 interface: ezfio,provider,ocaml
 default: False
 [ao_integrals_threshold]
 type: Threshold
 doc: If | (pq|rs) | < `ao_integrals_threshold` then (pq|rs) is zero
 interface: ezfio,provider,ocaml
 default: 1.e-15
 ezfio_name: threshold_ao
--- a/src/cosgtos_ao_int/NEED
+++ b/src/cosgtos_ao_int/NEED
@ -1,2 +0,0 @@
 ezfio_files
 ao_basis
--- a/src/cosgtos_ao_int/README.rst
+++ b/src/cosgtos_ao_int/README.rst
@ -1,4 +0,0 @@
 ==============
 cosgtos_ao_int
 ==============
--- a/src/cosgtos_ao_int/cosgtos_ao_int.irp.f
+++ b/src/cosgtos_ao_int/cosgtos_ao_int.irp.f
@ -1,7 +0,0 @@
 program cosgtos_ao_int
  implicit none
  BEGIN_DOC
 ! Placeholder main program of the cosgtos_ao_int module: it only prints 'Hello world'.
  END_DOC
  print *, 'Hello world'
 end
--- a/src/fci_tc_bi/diagonalize_ci.irp.f
+++ b/src/fci_tc_bi/diagonalize_ci.irp.f
@ -39,7 +39,7 @@ subroutine diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
   write(*,'(A28,X,I10,X,100(F16.8,X))')'Ndet,E,E+PT2,E+RPT2,|PT2|=',ndet,E_tc ,E_tc  + pt2_tmp/norm,E_tc  + rpt2_tmp/norm,abs_pt2
   print*,'*****'
  endif
-  psi_energy(1:N_states) = eigval_right_tc_bi_orth(1:N_states)
+  psi_energy(1:N_states) = eigval_right_tc_bi_orth(1:N_states) - nuclear_repulsion
  psi_s2(1:N_states) = s2_eigvec_tc_bi_orth(1:N_states)
  E_tc  = eigval_right_tc_bi_orth(1)
--- a/src/mo_two_e_ints/cholesky.irp.f
+++ b/src/mo_two_e_ints/cholesky.irp.f
@ -6,11 +6,42 @@ BEGIN_PROVIDER [ double precision, cholesky_mo, (mo_num, mo_num, cholesky_ao_num
 integer :: k
 call set_multiple_levels_omp(.False.)
 print *, 'AO->MO Transformation of Cholesky vectors'
 !$OMP PARALLEL DO PRIVATE(k)
 do k=1,cholesky_ao_num
  call ao_to_mo(cholesky_ao(1,1,k),ao_num,cholesky_mo(1,1,k),mo_num)
 enddo
 !$OMP END PARALLEL DO
 print *, ''
 END_PROVIDER
 BEGIN_PROVIDER [ double precision, cholesky_mo_transp, (cholesky_ao_num, mo_num, mo_num) ]
 implicit none
 BEGIN_DOC
 ! Cholesky vectors in the MO basis, stored with the Cholesky index first:
 ! cholesky_mo_transp(k,i,j) is element (i,j) of the k-th vector after ao_to_mo.
 END_DOC
 integer :: k
 double precision, allocatable :: mo_block(:,:)
 print *, 'AO->MO Transformation of Cholesky vectors  .'
 !$OMP PARALLEL PRIVATE(k,mo_block)
 ! one scratch matrix per thread
 allocate(mo_block(mo_num,mo_num))
 !$OMP DO SCHEDULE(static)
 do k=1,cholesky_ao_num
  call ao_to_mo(cholesky_ao(1,1,k),ao_num,mo_block,mo_num)
  ! store the transformed vector with k as the leading index
  cholesky_mo_transp(k,:,:) = mo_block(:,:)
 enddo
 !$OMP END DO
 deallocate(mo_block)
 !$OMP END PARALLEL
 print *, ''
 END_PROVIDER
--- a/src/mo_two_e_ints/integrals_3_index.irp.f
+++ b/src/mo_two_e_ints/integrals_3_index.irp.f
@ -4,12 +4,54 @@
 BEGIN_DOC
 ! big_array_coulomb_integrals(j,i,k)  = <ij|kj> = (ik|jj)
 !
- ! big_array_exchange_integrals(i,j,k) = <ij|jk> = (ij|kj)
+ ! big_array_exchange_integrals(j,i,k) = <ij|jk> = (ij|kj)
 END_DOC
- integer :: i,j,k,l
+ integer :: i,j,k,l,a
 double precision :: get_two_e_integral
 double precision :: integral
 if (do_ao_cholesky) then
    double precision, allocatable :: buffer_jj(:,:), buffer(:,:,:)
    allocate(buffer_jj(cholesky_ao_num,mo_num), buffer(mo_num,mo_num,mo_num))
    do j=1,mo_num
      buffer_jj(:,j) = cholesky_mo_transp(:,j,j)
    enddo
    call dgemm('T','N', mo_num*mo_num,mo_num,cholesky_ao_num, 1.d0, &
        cholesky_mo_transp, cholesky_ao_num, &
        buffer_jj, cholesky_ao_num, 0.d0, &
        buffer, mo_num*mo_num)
    do k = 1, mo_num
      do i = 1, mo_num
        do j = 1, mo_num
          big_array_coulomb_integrals(j,i,k) = buffer(i,k,j)
        enddo
      enddo
    enddo
    deallocate(buffer_jj)
    allocate(buffer_jj(mo_num,mo_num))
    do j = 1, mo_num
      call dgemm('T','N',mo_num,mo_num,cholesky_ao_num, 1.d0, &
        cholesky_mo_transp(1,1,j), cholesky_ao_num, &
        cholesky_mo_transp(1,1,j), cholesky_ao_num, 0.d0, &
        buffer_jj, mo_num)
      do k=1,mo_num
        do i=1,mo_num
          big_array_exchange_integrals(j,i,k) = buffer_jj(i,k)
       enddo
     enddo
    enddo
    deallocate(buffer_jj)
 else
   do k = 1, mo_num
     do i = 1, mo_num
       do j = 1, mo_num
@ -23,5 +65,7 @@
     enddo
   enddo
 endif
 END_PROVIDER
--- a/src/mo_two_e_ints/mo_bi_integrals.irp.f
+++ b/src/mo_two_e_ints/mo_bi_integrals.irp.f
@ -1353,14 +1353,29 @@ END_PROVIDER
  integer                        :: i,j
  double precision               :: get_two_e_integral
-  PROVIDE mo_two_e_integrals_in_map
+
-  mo_two_e_integrals_jj = 0.d0
+  if (do_ao_cholesky) then
-  mo_two_e_integrals_jj_exchange = 0.d0
+    do j=1,mo_num
      do i=1,mo_num
        !TODO: use dgemm
        mo_two_e_integrals_jj(i,j) = sum(cholesky_mo_transp(:,i,i)*cholesky_mo_transp(:,j,j))
        mo_two_e_integrals_jj_exchange(i,j) = sum(cholesky_mo_transp(:,i,j)*cholesky_mo_transp(:,j,i))
      enddo
    enddo
  else
    do j=1,mo_num
      do i=1,mo_num
        mo_two_e_integrals_jj(i,j) = get_two_e_integral(i,j,i,j,mo_integrals_map)
        mo_two_e_integrals_jj_exchange(i,j) = get_two_e_integral(i,j,j,i,mo_integrals_map)
      enddo
    enddo
  endif
  do j=1,mo_num
    do i=1,mo_num
        mo_two_e_integrals_jj_anti(i,j) = mo_two_e_integrals_jj(i,j) - mo_two_e_integrals_jj_exchange(i,j)
    enddo
  enddo
--- a/src/non_h_ints_mu/jast_deriv.irp.f
+++ b/src/non_h_ints_mu/jast_deriv.irp.f
@ -187,6 +187,19 @@ end function j12_mu
 subroutine grad1_j12_mu(r1, r2, grad)
  BEGIN_DOC
 !  gradient of j(mu(r1,r2),r12) form of jastrow. 
 !
 ! if mu(r1,r2) = cst ---> j1b_type < 200 and 
 !
 !  d/dx1 j(mu,r12) = 0.5 * (1 - erf(mu *r12))/r12 * (x1 - x2)
 !
 ! if mu(r1,r2) /= cst ---> 200 < j1b_type < 300 and 
 !
 ! d/dx1 j(mu(r1,r2),r12) = exp(-(mu(r1,r2)*r12)**2) /(2 *sqrt(pi) * mu(r1,r2)**2 )   d/dx1 mu(r1,r2) 
 !
 !                          + 0.5 * (1 - erf(mu(r1,r2) *r12))/r12 * (x1 - x2)
  END_DOC
  include 'constants.include.F'
  implicit none
@ -515,6 +528,9 @@ subroutine mu_r_val_and_grad(r1, r2, mu_val, mu_der)
  double precision              :: r(3)
  double precision              :: dm_a(1), dm_b(1), grad_dm_a(3,1), grad_dm_b(3,1)
  double precision              :: dm_tot, tmp1, tmp2, tmp3
  double precision :: rho1, grad_rho1(3),rho2,rho_tot,inv_rho_tot
  double precision :: f_rho1, f_rho2, d_drho_f_rho1
  double precision :: d_dx1_f_rho1(3),d_dx_rho_f_rho(3),nume
  if(j1b_type .eq. 200) then
@ -578,8 +594,84 @@ subroutine mu_r_val_and_grad(r1, r2, mu_val, mu_der)
    mu_der(2) = tmp3 * (grad_dm_a(2,1) + grad_dm_b(2,1))
    mu_der(3) = tmp3 * (grad_dm_a(3,1) + grad_dm_b(3,1))
-  else
+  elseif(j1b_type .eq. 202) then
    ! mu(r1,r2) = {rho(r1) f[rho(r1)] + rho(r2) f[rho(r2)]} / RHO
    !
    ! RHO = rho(r1) + rho(r2)
    !
    ! f[rho] = alpha rho^beta + mu0 exp(-rho)
    !
    ! d/dx1 mu(r1,r2) = 1/RHO^2 * {RHO * d/dx1 (rho(r1) f[rho(r1)]) 
    !                              - d/dx1 rho(r1) * [rho(r1) f[rho(r1)] + rho(r2) f[rho(r2)]] }
    !
    ! d/dx1 f[rho(r1)] = [0.5 alpha / sqrt(rho(r1)) - mu0 exp(-rho(r1))] (d rho(r1) / dx1)
    !
    ! d/dx1 (rho(r1) f[rho(r1)] = rho(r1) * d/dx1 f[rho(r1)] + f[rho(r1)] * d/dx1 rho(r1)
    !!!!!!!!! rho1,rho2,rho1+rho2
    call get_all_rho_grad_rho(r1,r2,rho1,rho2,grad_rho1)
    rho_tot = rho1 + rho2
    if(rho_tot.lt.1.d-10)rho_tot = 1.d-10
    inv_rho_tot = 1.d0/rho_tot
    ! f(rho) = mu_r_ct * rho**beta_rho_power + mu_erf * exp(-rho)
    call get_all_f_rho(rho1,rho2,mu_r_ct,mu_erf,beta_rho_power,f_rho1,d_drho_f_rho1,f_rho2)
    d_dx1_f_rho1(1:3)   = d_drho_f_rho1 * grad_rho1(1:3)
    d_dx_rho_f_rho(1:3) = rho1 * d_dx1_f_rho1(1:3) + f_rho1 * grad_rho1(1:3)
    nume   = rho1 * f_rho1 + rho2 * f_rho2
    mu_val = nume * inv_rho_tot
    mu_der(1:3) = inv_rho_tot*inv_rho_tot * (rho_tot * d_dx_rho_f_rho(1:3) - grad_rho1(1:3) * nume)
  elseif(j1b_type .eq. 203) then
    ! mu(r1,r2) = {rho(r1) f[rho(r1)] + rho(r2) f[rho(r2)]} / RHO
    !
    ! RHO = rho(r1) + rho(r2)
    !
    ! f[rho] = alpha rho^beta + mu0 
    !
    ! d/dx1 mu(r1,r2) = 1/RHO^2 * {RHO * d/dx1 (rho(r1) f[rho(r1)]) 
    !                              - d/dx1 rho(r1) * [rho(r1) f[rho(r1)] + rho(r2) f[rho(r2)]] }
    !
    ! d/dx1 f[rho(r1)] = [0.5 alpha / sqrt(rho(r1)) ] (d rho(r1) / dx1)
    !
    ! d/dx1 (rho(r1) f[rho(r1)] = rho(r1) * d/dx1 f[rho(r1)] + f[rho(r1)] * d/dx1 rho(r1)
    !!!!!!!!! rho1,rho2,rho1+rho2
    call get_all_rho_grad_rho(r1,r2,rho1,rho2,grad_rho1)
    rho_tot = rho1 + rho2
    if(rho_tot.lt.1.d-10)rho_tot = 1.d-10
    inv_rho_tot = 1.d0/rho_tot
    ! f(rho) = mu_r_ct * rho**beta_rho_power + mu_erf 
    call get_all_f_rho_simple(rho1,rho2,mu_r_ct,mu_erf,beta_rho_power,f_rho1,d_drho_f_rho1,f_rho2)
    d_dx1_f_rho1(1:3)   = d_drho_f_rho1 * grad_rho1(1:3)
    d_dx_rho_f_rho(1:3) = rho1 * d_dx1_f_rho1(1:3) + f_rho1 * grad_rho1(1:3)
    nume   = rho1 * f_rho1 + rho2 * f_rho2
    mu_val = nume * inv_rho_tot
    mu_der(1:3) = inv_rho_tot*inv_rho_tot * (rho_tot * d_dx_rho_f_rho(1:3) - grad_rho1(1:3) * nume)
  elseif(j1b_type .eq. 204) then
    ! mu(r1,r2) = 1/2 * (f[rho(r1)] + f[rho(r2)]} 
    !
    ! f[rho] = alpha rho^beta + mu0 
    !
    ! d/dx1 mu(r1,r2) = 1/2 * d/dx1 (rho(r1) f[rho(r1)])
    !                   
    ! d/dx1 f[rho(r1)] = [0.5 alpha / sqrt(rho(r1)) ] (d rho(r1) / dx1)
    !
    ! d/dx1 (rho(r1) f[rho(r1)] = rho(r1) * d/dx1 f[rho(r1)] + f[rho(r1)] * d/dx1 rho(r1)
    !!!!!!!!! rho1,rho2,rho1+rho2
    call get_all_rho_grad_rho(r1,r2,rho1,rho2,grad_rho1)
    rho_tot = rho1 + rho2
    if(rho_tot.lt.1.d-10)rho_tot = 1.d-10
    inv_rho_tot = 1.d0/rho_tot
    ! f(rho) = mu_r_ct * rho**beta_rho_power + mu_erf 
    call get_all_f_rho_simple(rho1,rho2,mu_r_ct,mu_erf,beta_rho_power,f_rho1,d_drho_f_rho1,f_rho2)
    d_dx1_f_rho1(1:3)   = d_drho_f_rho1 * grad_rho1(1:3)
    d_dx_rho_f_rho(1:3) = rho1 * d_dx1_f_rho1(1:3) + f_rho1 * grad_rho1(1:3)
    mu_val = 0.5d0 * ( f_rho1 + f_rho2)
    mu_der(1:3) = d_dx_rho_f_rho(1:3) 
  else
    print *, ' j1b_type = ', j1b_type, 'not implemented yet'
    stop
@ -684,3 +776,76 @@ end function j12_mu_square
 ! ---
 subroutine f_mu_and_deriv_mu(rho,alpha,mu0,beta,f_mu,d_drho_f_mu)
 implicit none
 BEGIN_DOC
 ! Value and first derivative with respect to rho of
 !
 ! f_mu(rho) = alpha * rho**beta + mu0 * exp(-rho)
 !
 ! d_drho_f_mu = alpha * beta * rho**(beta-1) - mu0 * exp(-rho)
 END_DOC
 double precision, intent(in)  :: rho,alpha,mu0,beta
 double precision, intent(out) :: f_mu,d_drho_f_mu
 double precision :: decay
 ! the exponential term is shared by the value and the derivative
 decay = mu0 * dexp(-rho)
 f_mu        = alpha * rho**beta + decay
 d_drho_f_mu = alpha * beta * rho**(beta-1.d0) - decay
 end
 subroutine get_all_rho_grad_rho(r1,r2,rho1,rho2,grad_rho1)
 implicit none
 BEGIN_DOC
 ! rho1, rho2 : sum of the two densities returned by density_and_grad_alpha_beta
 !              at r1 and at r2
 !
 ! grad_rho1  : sum of the two density gradients returned at r1
 END_DOC
 double precision, intent(in)  :: r1(3), r2(3)
 double precision, intent(out) :: rho1, rho2, grad_rho1(3)
 double precision :: rho_a(1), rho_b(1), grad_rho_a(3,1), grad_rho_b(3,1)
 integer :: m
 ! point r1 : density and gradient
 call density_and_grad_alpha_beta(r1, rho_a, rho_b, grad_rho_a, grad_rho_b)
 rho1 = rho_a(1) + rho_b(1)
 do m = 1, 3
   grad_rho1(m) = grad_rho_a(m,1) + grad_rho_b(m,1)
 enddo
 ! point r2 : only the density is kept
 call density_and_grad_alpha_beta(r2, rho_a, rho_b, grad_rho_a, grad_rho_b)
 rho2 = rho_a(1) + rho_b(1)
 end
 subroutine get_all_f_rho(rho1,rho2,alpha,mu0,beta,f_rho1,d_drho_f_rho1,f_rho2)
 implicit none
 BEGIN_DOC
 ! Evaluates f_mu_and_deriv_mu at rho1 and at rho2.
 !
 ! f_rho1, d_drho_f_rho1 : value and derivative at rho1
 !
 ! f_rho2                : value at rho2 (its derivative is discarded)
 END_DOC
 double precision, intent(in)  :: rho1, rho2, alpha, mu0, beta
 double precision, intent(out) :: f_rho1, d_drho_f_rho1, f_rho2
 double precision :: unused_deriv
 call f_mu_and_deriv_mu(rho1, alpha, mu0, beta, f_rho1, d_drho_f_rho1)
 call f_mu_and_deriv_mu(rho2, alpha, mu0, beta, f_rho2, unused_deriv)
 end
 subroutine get_all_f_rho_simple(rho1,rho2,alpha,mu0,beta,f_rho1,d_drho_f_rho1,f_rho2)
 implicit none
 BEGIN_DOC
 ! Evaluates f_mu_and_deriv_mu_simple at rho1 and at rho2.
 !
 ! f_rho1, d_drho_f_rho1 : value and derivative at rho1
 !
 ! f_rho2                : value at rho2 (its derivative is discarded)
 END_DOC
 double precision, intent(in)  :: rho1, rho2, alpha, mu0, beta
 double precision, intent(out) :: f_rho1, d_drho_f_rho1, f_rho2
 double precision :: unused_deriv
 call f_mu_and_deriv_mu_simple(rho1, alpha, mu0, beta, f_rho1, d_drho_f_rho1)
 call f_mu_and_deriv_mu_simple(rho2, alpha, mu0, beta, f_rho2, unused_deriv)
 end
 subroutine f_mu_and_deriv_mu_simple(rho,alpha,mu0,beta,f_mu,d_drho_f_mu)
 implicit none
 BEGIN_DOC
 ! Value and first derivative with respect to rho of
 !
 ! f_mu(rho) = alpha * rho**beta + mu0
 !
 ! d_drho_f_mu = alpha * beta * rho**(beta-1)
 END_DOC
 double precision, intent(in)  :: rho,alpha,mu0,beta
 double precision, intent(out) :: f_mu,d_drho_f_mu
 double precision :: power_term
 power_term  = alpha * rho**beta
 f_mu        = power_term + mu0
 ! the constant mu0 does not contribute to the derivative
 d_drho_f_mu = alpha * beta * rho**(beta-1.d0)
 end
--- a/src/non_h_ints_mu/plot_mu_of_r.irp.f
+++ b/src/non_h_ints_mu/plot_mu_of_r.irp.f
@ -0,0 +1,33 @@
 program plot_mu_of_r
 ! Driver program: sets read_wf to .False., signals the change with touch,
 ! then delegates all the work to routine_print.
 implicit none
 ! The assignment must precede the touch so that the new value of read_wf is
 ! the one taken into account afterwards.
 read_wf = .False.
 touch read_wf 
 call routine_print
 end
 subroutine routine_print
 implicit none
 BEGIN_DOC
 ! Scans the x axis from xmin = -5 in nx = 10000 steps of (xmax-xmin)/nx
 ! (y = z = 0) and writes one line per point to the file
 ! <ezfio_filename>.mu_of_r with four columns:
 !
 !   x,  mu_val,  dm_a + dm_b,  norm of mu_der
 !
 ! where mu_val and mu_der come from mu_r_val_and_grad and dm_a, dm_b from
 ! dm_dft_alpha_beta_at_r.
 END_DOC
 character*(128) :: output
 integer :: i_unit_output,getUnitAndOpen
 integer :: ipoint,nx
 double precision :: xmax,xmin,r(3),dx
 double precision :: mu_val, mu_der(3),dm_a,dm_b,grad
 output=trim(ezfio_filename)//'.mu_of_r'
 i_unit_output = getUnitAndOpen(output,'w')
 xmax =  5.D0
 xmin = -5.D0
 nx = 10000
 dx = (xmax - xmin)/dble(nx)
 r = 0.d0
 r(1) = xmin
 ! The last sampled abscissa is xmax - dx: the point xmax itself is not written.
 do ipoint = 1, nx
  ! The same point is passed for both position arguments.
  call mu_r_val_and_grad(r, r, mu_val, mu_der)
  call dm_dft_alpha_beta_at_r(r,dm_a,dm_b)
  ! Euclidean norm of the three components returned in mu_der.
  grad = mu_der(1)**2 + mu_der(2)**2 + mu_der(3)**2 
  grad = dsqrt(grad)
  write(i_unit_output,'(100(F16.7,X))')r(1),mu_val,dm_a+dm_b,grad
  r(1) += dx
 enddo
 ! Release the unit obtained from getUnitAndOpen once the scan is complete.
 close(i_unit_output)
 end
--- a/src/tc_bi_ortho/dressing_vectors_lr.irp.f
+++ b/src/tc_bi_ortho/dressing_vectors_lr.irp.f
@ -27,7 +27,7 @@ subroutine get_delta_bitc_right(psidet, psicoef, ndet, Nint, delta)
  i = 1
  j = 1
-  call htilde_mu_mat_bi_ortho(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot)
+  call htilde_mu_mat_bi_ortho_slow(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot)
  call hmat_bi_ortho         (psidet(1,1,i), psidet(1,1,j), Nint, h_mono, h_twoe, h_tot)
  delta = 0.d0
@ -39,7 +39,7 @@ subroutine get_delta_bitc_right(psidet, psicoef, ndet, Nint, delta)
    do j = 1, ndet
      ! < I | Htilde | J >
-      call htilde_mu_mat_bi_ortho(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot)
+      call htilde_mu_mat_bi_ortho_slow(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot)
      ! < I | H | J >
      call hmat_bi_ortho(psidet(1,1,i), psidet(1,1,j), Nint, h_mono, h_twoe, h_tot)
@ -78,7 +78,7 @@ subroutine get_htc_bitc_right(psidet, psicoef, ndet, Nint, delta)
  i = 1
  j = 1
-  call htilde_mu_mat_bi_ortho(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot)
+  call htilde_mu_mat_bi_ortho_slow(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot)
  delta = 0.d0
 !$OMP PARALLEL DO DEFAULT(NONE) SCHEDULE(dynamic,8)   &
@ -88,7 +88,7 @@ subroutine get_htc_bitc_right(psidet, psicoef, ndet, Nint, delta)
    do j = 1, ndet
      ! < I | Htilde | J >
-      call htilde_mu_mat_bi_ortho(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot)
+      call htilde_mu_mat_bi_ortho_slow(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot)
      delta(i) = delta(i) + psicoef(j) * htc_tot
    enddo
--- a/src/tc_bi_ortho/e_corr_bi_ortho.irp.f
+++ b/src/tc_bi_ortho/e_corr_bi_ortho.irp.f
@ -2,7 +2,7 @@
 BEGIN_PROVIDER [ double precision, e_tilde_00]
 implicit none
 double precision :: hmono,htwoe,hthree,htot
- call htilde_mu_mat_bi_ortho(HF_bitmask,HF_bitmask,N_int,hmono,htwoe,hthree,htot)
+ call htilde_mu_mat_bi_ortho_slow(HF_bitmask,HF_bitmask,N_int,hmono,htwoe,hthree,htot)
 e_tilde_00 = htot
 END_PROVIDER 
@ -18,11 +18,11 @@
 do i = 1, N_det
  call get_excitation_degree(HF_bitmask,psi_det(1,1,i),degree,N_int)
  if(degree == 1 .or. degree == 2)then
-   call htilde_mu_mat_bi_ortho(psi_det(1,1,i),HF_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
+   call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i),HF_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
-   call htilde_mu_mat_bi_ortho(psi_det(1,1,i),psi_det(1,1,i),N_int,hmono,htwoe,hthree,e_i0)
+   call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i),psi_det(1,1,i),N_int,hmono,htwoe,hthree,e_i0)
   delta_e = e_tilde_00 - e_i0
   coef_pt1 = htilde_ij / delta_e
-   call htilde_mu_mat_bi_ortho(HF_bitmask,psi_det(1,1,i),N_int,hmono,htwoe,hthree,htilde_ij)
+   call htilde_mu_mat_bi_ortho_slow(HF_bitmask,psi_det(1,1,i),N_int,hmono,htwoe,hthree,htilde_ij)
   e_pt2_tc_bi_orth += coef_pt1 * htilde_ij
   if(degree == 1)then
    e_pt2_tc_bi_orth_single += coef_pt1 * htilde_ij
@ -37,7 +37,7 @@
 BEGIN_PROVIDER [ double precision, e_tilde_bi_orth_00]
 implicit none
 double precision :: hmono,htwoe,hthree,htilde_ij
- call htilde_mu_mat_bi_ortho(HF_bitmask,HF_bitmask,N_int,hmono,htwoe,hthree,e_tilde_bi_orth_00)
+ call htilde_mu_mat_bi_ortho_slow(HF_bitmask,HF_bitmask,N_int,hmono,htwoe,hthree,e_tilde_bi_orth_00)
 e_tilde_bi_orth_00 += nuclear_repulsion
 END_PROVIDER 
@ -57,7 +57,7 @@
 e_corr_double_bi_orth = 0.d0
 do i = 1, N_det
  call get_excitation_degree(HF_bitmask,psi_det(1,1,i),degree,N_int)
-  call htilde_mu_mat_bi_ortho(HF_bitmask,psi_det(1,1,i),N_int,hmono,htwoe,hthree,htilde_ij)
+  call htilde_mu_mat_bi_ortho_slow(HF_bitmask,psi_det(1,1,i),N_int,hmono,htwoe,hthree,htilde_ij)
  if(degree == 1)then
   e_corr_single_bi_orth += reigvec_tc_bi_orth(i,1) * htilde_ij/reigvec_tc_bi_orth(1,1)
   e_corr_single_bi_orth_abs += dabs(reigvec_tc_bi_orth(i,1) * htilde_ij/reigvec_tc_bi_orth(1,1))
@ -80,7 +80,7 @@
 do i = 1, N_det
  accu += reigvec_tc_bi_orth(i,1) * leigvec_tc_bi_orth(i,1)
  do j = 1, N_det
-   call htilde_mu_mat_bi_ortho(psi_det(1,1,j),psi_det(1,1,i),N_int,hmono,htwoe,hthree,htilde_ij)
+   call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,j),psi_det(1,1,i),N_int,hmono,htwoe,hthree,htilde_ij)
   e_tc_left_right += htilde_ij * reigvec_tc_bi_orth(i,1) * leigvec_tc_bi_orth(j,1)
  enddo
 enddo
@ -99,8 +99,8 @@ BEGIN_PROVIDER [ double precision, coef_pt1_bi_ortho, (N_det)]
  if(degree==0)then
   coef_pt1_bi_ortho(i) = 1.d0
  else
-   call htilde_mu_mat_bi_ortho(psi_det(1,1,i),HF_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
+   call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i),HF_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
-   call htilde_mu_mat_bi_ortho(psi_det(1,1,i),psi_det(1,1,i),N_int,hmono,htwoe,hthree,e_i0)
+   call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i),psi_det(1,1,i),N_int,hmono,htwoe,hthree,e_i0)
   delta_e = e_tilde_00 - e_i0
   coef_pt1 = htilde_ij / delta_e
   coef_pt1_bi_ortho(i)= coef_pt1
--- a/src/tc_bi_ortho/h_tc_bi_ortho_psi.irp.f
+++ b/src/tc_bi_ortho/h_tc_bi_ortho_psi.irp.f
@ -1,4 +1,4 @@
-subroutine htc_bi_ortho_calc_tdav(v, u, N_st, sze)
+subroutine htc_bi_ortho_calc_tdav_slow(v, u, N_st, sze)
  use bitmasks
@ -27,7 +27,7 @@ subroutine htc_bi_ortho_calc_tdav(v, u, N_st, sze)
  i = 1
  j = 1
-  call htilde_mu_mat_bi_ortho_tot(psi_det(1,1,i), psi_det(1,1,j), N_int, htot)
+  call htilde_mu_mat_bi_ortho_tot_slow(psi_det(1,1,i), psi_det(1,1,j), N_int, htot)
  v = 0.d0
 !$OMP PARALLEL DO DEFAULT(NONE) SCHEDULE(dynamic,8) &
@ -36,7 +36,7 @@ subroutine htc_bi_ortho_calc_tdav(v, u, N_st, sze)
  do istate = 1, N_st
    do i = 1, sze
      do j = 1, sze
-        call htilde_mu_mat_bi_ortho_tot(psi_det(1,1,i), psi_det(1,1,j), N_int, htot)
+        call htilde_mu_mat_bi_ortho_tot_slow(psi_det(1,1,i), psi_det(1,1,j), N_int, htot)
        v(i,istate) = v(i,istate) + htot * u(j,istate)
      enddo
    enddo 
@ -45,7 +45,7 @@ subroutine htc_bi_ortho_calc_tdav(v, u, N_st, sze)
 end 
-subroutine htcdag_bi_ortho_calc_tdav(v, u, N_st, sze)
+subroutine htcdag_bi_ortho_calc_tdav_slow(v, u, N_st, sze)
  use bitmasks
@ -71,7 +71,7 @@ subroutine htcdag_bi_ortho_calc_tdav(v, u, N_st, sze)
  i = 1
  j = 1
-  call htilde_mu_mat_bi_ortho_tot(psi_det(1,1,i), psi_det(1,1,j), N_int, htot)
+  call htilde_mu_mat_bi_ortho_tot_slow(psi_det(1,1,i), psi_det(1,1,j), N_int, htot)
  v = 0.d0
@ -81,7 +81,7 @@ subroutine htcdag_bi_ortho_calc_tdav(v, u, N_st, sze)
  do istate = 1, N_st
    do i = 1, sze
      do j = 1, sze
-        call htilde_mu_mat_bi_ortho_tot(psi_det(1,1,j), psi_det(1,1,i), N_int, htot)
+        call htilde_mu_mat_bi_ortho_tot_slow(psi_det(1,1,j), psi_det(1,1,i), N_int, htot)
        v(i,istate) = v(i,istate) + htot * u(j,istate)
      enddo
    enddo
--- a/src/tc_bi_ortho/print_tc_wf.irp.f
+++ b/src/tc_bi_ortho/print_tc_wf.irp.f
@ -49,12 +49,12 @@ subroutine routine
 do i = 1, N_det
  call get_excitation_degree(HF_bitmask,psi_det(1,1,i),degree,N_int)
   if(degree == 1 .or. degree == 2)then
-    call htilde_mu_mat_bi_ortho(psi_det(1,1,i),HF_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
+    call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i),HF_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
-    call htilde_mu_mat_bi_ortho(psi_det(1,1,i),psi_det(1,1,i),N_int,hmono,htwoe,hthree,e_i0)
+    call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i),psi_det(1,1,i),N_int,hmono,htwoe,hthree,e_i0)
    delta_e = e_tilde_00 - e_i0
    coef_pt1 = htilde_ij / delta_e
-    call htilde_mu_mat_bi_ortho(HF_bitmask,psi_det(1,1,i),N_int,hmono,htwoe,hthree,htilde_ij)
+    call htilde_mu_mat_bi_ortho_slow(HF_bitmask,psi_det(1,1,i),N_int,hmono,htwoe,hthree,htilde_ij)
    contrib_pt = coef_pt1 * htilde_ij
    e_pt2 += contrib_pt
--- a/src/tc_bi_ortho/pt2_tc_cisd.irp.f
+++ b/src/tc_bi_ortho/pt2_tc_cisd.irp.f
@ -36,11 +36,11 @@ subroutine routine
 e_corr_abs = 0.d0
 e_corr_pos = 0.d0
 e_corr_neg = 0.d0
- call htilde_mu_mat_bi_ortho_tot(psi_det(1,1,1), psi_det(1,1,1), N_int, e00) 
+ call htilde_mu_mat_bi_ortho_tot_slow(psi_det(1,1,1), psi_det(1,1,1), N_int, e00) 
 do i = 2, N_det
-  call htilde_mu_mat_bi_ortho_tot(psi_det(1,1,i), psi_det(1,1,1), N_int, hi0) 
+  call htilde_mu_mat_bi_ortho_tot_slow(psi_det(1,1,i), psi_det(1,1,1), N_int, hi0) 
-  call htilde_mu_mat_bi_ortho_tot(psi_det(1,1,1), psi_det(1,1,i), N_int, h0i) 
+  call htilde_mu_mat_bi_ortho_tot_slow(psi_det(1,1,1), psi_det(1,1,i), N_int, h0i) 
-  call htilde_mu_mat_bi_ortho_tot(psi_det(1,1,i), psi_det(1,1,i), N_int, ei) 
+  call htilde_mu_mat_bi_ortho_tot_slow(psi_det(1,1,i), psi_det(1,1,i), N_int, ei) 
  call get_excitation_degree(psi_det(1,1,1), psi_det(1,1,i),degree,N_int)
  call get_excitation(psi_det(1,1,1), psi_det(1,1,i),exc,degree,phase,N_int)
  call decode_exc(exc,degree,h1,p1,h2,p2,s1,s2)
--- a/src/tc_bi_ortho/slater_tc_3e_slow.irp.f
+++ b/src/tc_bi_ortho/slater_tc_3e_slow.irp.f
@ -1,23 +1,5 @@
 subroutine provide_all_three_ints_bi_ortho
 implicit none
 BEGIN_DOC
 ! routine that provides all necessary three-electron integrals 
 !
 ! The 3- and 4-index arrays are requested only when three_body_h_tc is set.
 ! The last choice (5-index arrays versus normal_two_body_bi_orth) depends on
 ! double_normal_ord alone and is made whatever the value of three_body_h_tc.
 END_DOC
 ! 3-index and 4-index families: direct, two cyclic and three exchange arrays.
 if(three_body_h_tc)then
  PROVIDE three_e_3_idx_direct_bi_ort three_e_3_idx_cycle_1_bi_ort three_e_3_idx_cycle_2_bi_ort
  PROVIDE three_e_3_idx_exch23_bi_ort three_e_3_idx_exch13_bi_ort three_e_3_idx_exch12_bi_ort
  PROVIDE three_e_4_idx_direct_bi_ort three_e_4_idx_cycle_1_bi_ort three_e_4_idx_cycle_2_bi_ort
  PROVIDE three_e_4_idx_exch23_bi_ort three_e_4_idx_exch13_bi_ort three_e_4_idx_exch12_bi_ort
 endif
 ! Either the explicit 5-index arrays or the normal-ordered two-body array.
 if(.not.double_normal_ord)then
  PROVIDE three_e_5_idx_direct_bi_ort three_e_5_idx_cycle_1_bi_ort three_e_5_idx_cycle_2_bi_ort
  PROVIDE three_e_5_idx_exch23_bi_ort three_e_5_idx_exch13_bi_ort three_e_5_idx_exch12_bi_ort
 else
 PROVIDE normal_two_body_bi_orth
 endif
 end
-subroutine diag_htilde_three_body_ints_bi_ort(Nint, key_i, hthree)
+subroutine diag_htilde_three_body_ints_bi_ort_slow(Nint, key_i, hthree)
  BEGIN_DOC
  !  diagonal element of htilde ONLY FOR THREE-BODY TERMS WITH BI ORTHONORMAL ORBITALS
@ -108,7 +90,7 @@ subroutine diag_htilde_three_body_ints_bi_ort(Nint, key_i, hthree)
 end
-subroutine single_htilde_three_body_ints_bi_ort(Nint, key_j, key_i, hthree)
+subroutine single_htilde_three_body_ints_bi_ort_slow(Nint, key_j, key_i, hthree)
  BEGIN_DOC
  ! <key_j | H_tilde | key_i> for single excitation ONLY FOR THREE-BODY TERMS WITH BI ORTHONORMAL ORBITALS
@ -203,7 +185,7 @@ end
 ! ---
-subroutine double_htilde_three_body_ints_bi_ort(Nint, key_j, key_i, hthree)
+subroutine double_htilde_three_body_ints_bi_ort_slow(Nint, key_j, key_i, hthree)
  BEGIN_DOC
  ! <key_j | H_tilde | key_i> for double excitation ONLY FOR THREE-BODY TERMS  WITH BI ORTHONORMAL ORBITALS
--- a/src/tc_bi_ortho/slater_tc_opt.irp.f
+++ b/src/tc_bi_ortho/slater_tc_opt.irp.f
@ -1,3 +1,26 @@
 subroutine provide_all_three_ints_bi_ortho
 implicit none
 BEGIN_DOC
 ! routine that provides all necessary three-electron integrals 
 !
 ! Nothing is requested unless three_body_h_tc is set. Inside that switch:
 !
 ! * three_e_3_idx_term selects the 3-index arrays
 ! * three_e_4_idx_term selects the 4-index arrays
 ! * the 5-index arrays are requested when double_normal_ord is off and
 !   three_e_5_idx_term is on
 ! * normal_two_body_bi_orth is requested whenever double_normal_ord is on
 END_DOC
 if(three_body_h_tc)then
  if(three_e_3_idx_term)then
   PROVIDE three_e_3_idx_direct_bi_ort three_e_3_idx_cycle_1_bi_ort three_e_3_idx_cycle_2_bi_ort
   PROVIDE three_e_3_idx_exch23_bi_ort three_e_3_idx_exch13_bi_ort three_e_3_idx_exch12_bi_ort
  endif
  if(three_e_4_idx_term)then
   PROVIDE three_e_4_idx_direct_bi_ort three_e_4_idx_cycle_1_bi_ort three_e_4_idx_cycle_2_bi_ort
   PROVIDE three_e_4_idx_exch23_bi_ort three_e_4_idx_exch13_bi_ort three_e_4_idx_exch12_bi_ort
  endif
  if(.not.double_normal_ord.and.three_e_5_idx_term)then
   PROVIDE three_e_5_idx_direct_bi_ort three_e_5_idx_cycle_1_bi_ort three_e_5_idx_cycle_2_bi_ort
   PROVIDE three_e_5_idx_exch23_bi_ort three_e_5_idx_exch13_bi_ort three_e_5_idx_exch12_bi_ort
  elseif(double_normal_ord)then
   ! The double-excitation matrix elements read normal_two_body_bi_orth as
   ! soon as three_body_h_tc and double_normal_ord are both set, whatever
   ! the value of three_e_5_idx_term, so the same condition is used here to
   ! build the array ahead of time.
   PROVIDE normal_two_body_bi_orth
  endif
 endif
 end
 subroutine htilde_mu_mat_opt_bi_ortho_tot(key_j, key_i, Nint, htot)
 implicit none
  BEGIN_DOC
--- a/src/tc_bi_ortho/slater_tc_opt_diag.irp.f
+++ b/src/tc_bi_ortho/slater_tc_opt_diag.irp.f
@ -7,11 +7,11 @@
 ! Various component of the TC energy for the reference "HF" Slater determinant
 END_DOC 
 double precision :: hmono, htwoe, htot, hthree
- call diag_htilde_mu_mat_bi_ortho(N_int,HF_bitmask , hmono, htwoe, htot)
+ call diag_htilde_mu_mat_bi_ortho_slow(N_int,HF_bitmask , hmono, htwoe, htot)
 ref_tc_energy_1e = hmono
 ref_tc_energy_2e = htwoe 
 if(three_body_h_tc)then
-  call diag_htilde_three_body_ints_bi_ort(N_int, HF_bitmask, hthree)
+  call diag_htilde_three_body_ints_bi_ort_slow(N_int, HF_bitmask, hthree)
  ref_tc_energy_3e = hthree
 else
  ref_tc_energy_3e = 0.d0
@ -156,7 +156,7 @@ subroutine ac_tc_operator(iorb,ispin,key,hmono,htwoe,hthree,Nint,na,nb)
    htwoe = htwoe + mo_bi_ortho_tc_two_e_jj(occ(i,other_spin),iorb)
  enddo
-  if(three_body_h_tc.and.elec_num.gt.2)then
+  if(three_body_h_tc.and.elec_num.gt.2.and.three_e_3_idx_term)then
   !!!!! 3-e part 
   !! same-spin/same-spin
   do j = 1, na
@ -243,7 +243,7 @@ subroutine a_tc_operator(iorb,ispin,key,hmono,htwoe,hthree,Nint,na,nb)
    htwoe= htwoe- mo_bi_ortho_tc_two_e_jj(occ(i,other_spin),iorb)
  enddo
-  if(three_body_h_tc.and.elec_num.gt.2)then
+  if(three_body_h_tc.and.elec_num.gt.2.and.three_e_3_idx_term)then
   !!!!! 3-e part 
   !! same-spin/same-spin
   do j = 1, na
--- a/src/tc_bi_ortho/slater_tc_opt_double.irp.f
+++ b/src/tc_bi_ortho/slater_tc_opt_double.irp.f
@ -42,13 +42,13 @@ subroutine double_htilde_mu_mat_fock_bi_ortho(Nint, key_j, key_i, hmono, htwoe,
   ! opposite spin two-body 
    htwoe  = mo_bi_ortho_tc_two_e(p2,p1,h2,h1) 
    if(three_body_h_tc.and.elec_num.gt.2)then
-     if(.not.double_normal_ord)then
+     if(.not.double_normal_ord.and.three_e_5_idx_term)then
      if(degree_i>degree_j)then
       call three_comp_two_e_elem(key_j,h1,h2,p1,p2,s1,s2,hthree)
      else
       call three_comp_two_e_elem(key_i,h1,h2,p1,p2,s1,s2,hthree)
      endif
-     elseif(double_normal_ord.and.elec_num.gt.2)then
+     elseif(double_normal_ord)then
      htwoe += normal_two_body_bi_orth(p2,h2,p1,h1)
     endif
    endif
@ -59,13 +59,13 @@ subroutine double_htilde_mu_mat_fock_bi_ortho(Nint, key_j, key_i, hmono, htwoe,
   ! exchange terms 
   htwoe -= mo_bi_ortho_tc_two_e(p1,p2,h2,h1) 
   if(three_body_h_tc.and.elec_num.gt.2)then
-    if(.not.double_normal_ord)then
+    if(.not.double_normal_ord.and.three_e_5_idx_term)then
     if(degree_i>degree_j)then
      call three_comp_two_e_elem(key_j,h1,h2,p1,p2,s1,s2,hthree)
     else
      call three_comp_two_e_elem(key_i,h1,h2,p1,p2,s1,s2,hthree)
     endif
-    elseif(double_normal_ord.and.elec_num.gt.2)then
+    elseif(double_normal_ord)then
      htwoe -= normal_two_body_bi_orth(h2,p1,h1,p2)
      htwoe += normal_two_body_bi_orth(h1,p1,h2,p2)
    endif
--- a/src/tc_bi_ortho/slater_tc_opt_single.irp.f
+++ b/src/tc_bi_ortho/slater_tc_opt_single.irp.f
@ -106,7 +106,7 @@ subroutine get_single_excitation_from_fock_tc(key_i,key_j,h,p,spin,phase,hmono,h
  htwoe -= buffer_x(i)
 enddo
 hthree = 0.d0
- if (three_body_h_tc.and.elec_num.gt.2)then
+ if (three_body_h_tc.and.elec_num.gt.2.and.three_e_4_idx_term)then
  call three_comp_fock_elem(key_i,h,p,spin,hthree)
 endif
--- a/src/tc_bi_ortho/slater_tc_slow.irp.f
+++ b/src/tc_bi_ortho/slater_tc_slow.irp.f
@ -1,7 +1,7 @@
 ! ---
-subroutine htilde_mu_mat_bi_ortho_tot(key_j, key_i, Nint, htot)
+subroutine htilde_mu_mat_bi_ortho_tot_slow(key_j, key_i, Nint, htot)
  BEGIN_DOC
  ! <key_j | H_tilde | key_i> where |key_j> is developed on the LEFT basis and |key_i> is developed on the RIGHT basis
@ -24,14 +24,14 @@ subroutine htilde_mu_mat_bi_ortho_tot(key_j, key_i, Nint, htot)
  if(degree.gt.2)then
    htot = 0.d0
  else
-    call htilde_mu_mat_bi_ortho(key_j, key_i, Nint, hmono, htwoe, hthree, htot)
+    call htilde_mu_mat_bi_ortho_slow(key_j, key_i, Nint, hmono, htwoe, hthree, htot)
  endif
-end subroutine htilde_mu_mat_bi_ortho_tot
+end subroutine htilde_mu_mat_bi_ortho_tot_slow
 ! --
-subroutine htilde_mu_mat_bi_ortho(key_j, key_i, Nint, hmono, htwoe, hthree, htot)
+subroutine htilde_mu_mat_bi_ortho_slow(key_j, key_i, Nint, hmono, htwoe, hthree, htot)
  BEGIN_DOC
  !
@ -61,22 +61,22 @@ subroutine htilde_mu_mat_bi_ortho(key_j, key_i, Nint, hmono, htwoe, hthree, htot
  if(degree.gt.2) return
  if(degree == 0)then
-    call diag_htilde_mu_mat_bi_ortho(Nint, key_i, hmono, htwoe, htot)
+    call diag_htilde_mu_mat_bi_ortho_slow(Nint, key_i, hmono, htwoe, htot)
  else if (degree == 1)then
-    call single_htilde_mu_mat_bi_ortho(Nint, key_j, key_i, hmono, htwoe, htot)
+    call single_htilde_mu_mat_bi_ortho_slow(Nint, key_j, key_i, hmono, htwoe, htot)
  else if(degree == 2)then
-    call double_htilde_mu_mat_bi_ortho(Nint, key_j, key_i, hmono, htwoe, htot)
+    call double_htilde_mu_mat_bi_ortho_slow(Nint, key_j, key_i, hmono, htwoe, htot)
  endif
  if(three_body_h_tc) then
    if(degree == 2) then
-      if(.not.double_normal_ord) then
+      if(.not.double_normal_ord.and.elec_num.gt.2.and.three_e_5_idx_term) then
-        call double_htilde_three_body_ints_bi_ort(Nint, key_j, key_i, hthree)
+        call double_htilde_three_body_ints_bi_ort_slow(Nint, key_j, key_i, hthree)
      endif
-    else if(degree == 1) then
+    else if(degree == 1.and.elec_num.gt.2.and.three_e_4_idx_term) then
-      call single_htilde_three_body_ints_bi_ort(Nint, key_j, key_i, hthree)
+      call single_htilde_three_body_ints_bi_ort_slow(Nint, key_j, key_i, hthree)
-    else if(degree == 0) then
+    else if(degree == 0.and.elec_num.gt.2.and.three_e_3_idx_term) then
-      call diag_htilde_three_body_ints_bi_ort(Nint, key_i, hthree)
+      call diag_htilde_three_body_ints_bi_ort_slow(Nint, key_i, hthree)
    endif
  endif
@ -89,7 +89,7 @@ end
 ! ---
-subroutine diag_htilde_mu_mat_bi_ortho(Nint, key_i, hmono, htwoe, htot)
+subroutine diag_htilde_mu_mat_bi_ortho_slow(Nint, key_i, hmono, htwoe, htot)
  BEGIN_DOC
  !  diagonal element of htilde ONLY FOR ONE- AND TWO-BODY TERMS 
@ -188,7 +188,7 @@ end
-subroutine double_htilde_mu_mat_bi_ortho(Nint, key_j, key_i, hmono, htwoe, htot)
+subroutine double_htilde_mu_mat_bi_ortho_slow(Nint, key_j, key_i, hmono, htwoe, htot)
  BEGIN_DOC
  ! <key_j | H_tilde | key_i> for double excitation  ONLY FOR ONE- AND TWO-BODY TERMS 
@ -227,18 +227,7 @@ subroutine double_htilde_mu_mat_bi_ortho(Nint, key_j, key_i, hmono, htwoe, htot)
   return
  endif
 !  if(core_tc_op)then
 !   print*,'core_tc_op not already taken into account for bi ortho'
 !   print*,'stopping ...'
 !   stop
 !   do i = 1, Nint
 !    key_i_core(i,1) = xor(key_i(i,1),core_bitmask(i,1))
 !    key_i_core(i,2) = xor(key_i(i,2),core_bitmask(i,2))
 !   enddo
 !   call bitstring_to_list_ab(key_i_core, occ, Ne, Nint)
 !  else
   call bitstring_to_list_ab(key_i, occ, Ne, Nint)
 !  endif
  call get_double_excitation(key_i, key_j, exc, phase, Nint)
  call decode_exc(exc, 2, h1, p1, h2, p2, s1, s2)
@ -246,7 +235,7 @@ subroutine double_htilde_mu_mat_bi_ortho(Nint, key_j, key_i, hmono, htwoe, htot)
   ! opposite spin two-body 
 !   key_j, key_i
    htwoe  = mo_bi_ortho_tc_two_e(p2,p1,h2,h1) 
-    if(double_normal_ord.and.+Ne(1).gt.2)then
+    if(three_body_h_tc.and.double_normal_ord.and.+Ne(1).gt.2)then
     htwoe += normal_two_body_bi_orth(p2,h2,p1,h1)!!! WTF ???
    endif
  else
@ -255,7 +244,7 @@ subroutine double_htilde_mu_mat_bi_ortho(Nint, key_j, key_i, hmono, htwoe, htot)
   htwoe  = mo_bi_ortho_tc_two_e(p2,p1,h2,h1)  
   ! exchange terms 
   htwoe -= mo_bi_ortho_tc_two_e(p1,p2,h2,h1) 
-   if(double_normal_ord.and.+Ne(1).gt.2)then
+   if(three_body_h_tc.and.double_normal_ord.and.+Ne(1).gt.2)then
    htwoe -= normal_two_body_bi_orth(h2,p1,h1,p2)!!! WTF ???
    htwoe += normal_two_body_bi_orth(h1,p1,h2,p2)!!! WTF ???
   endif
@ -266,7 +255,7 @@ subroutine double_htilde_mu_mat_bi_ortho(Nint, key_j, key_i, hmono, htwoe, htot)
 end
-subroutine single_htilde_mu_mat_bi_ortho(Nint, key_j, key_i, hmono, htwoe, htot)
+subroutine single_htilde_mu_mat_bi_ortho_slow(Nint, key_j, key_i, hmono, htwoe, htot)
  BEGIN_DOC
  ! <key_j | H_tilde | key_i> for single excitation ONLY FOR ONE- AND TWO-BODY TERMS 
--- a/src/tc_bi_ortho/tc_cisd_sc2_utils.irp.f
+++ b/src/tc_bi_ortho/tc_cisd_sc2_utils.irp.f
@ -11,10 +11,10 @@
 allocate(H_jj(N_det),vec_tmp(N_det,n_states_diag),eigval_tmp(N_states))
 dressing_dets = 0.d0
 do i = 1, N_det
-  call htilde_mu_mat_bi_ortho_tot(psi_det(1,1,i), psi_det(1,1,i), N_int, H_jj(i))
+  call htilde_mu_mat_bi_ortho_tot_slow(psi_det(1,1,i), psi_det(1,1,i), N_int, H_jj(i))
  call get_excitation_degree(HF_bitmask,psi_det(1,1,i),degree,N_int)
  if(degree == 1 .or. degree == 2)then
-   call htilde_mu_mat_bi_ortho(HF_bitmask,psi_det(1,1,i),N_int,hmono,htwoe,hthree,h0j(i))
+   call htilde_mu_mat_bi_ortho_slow(HF_bitmask,psi_det(1,1,i),N_int,hmono,htwoe,hthree,h0j(i))
  endif
 enddo
 reigvec_tc_bi_orth_tmp = 0.d0
@ -29,7 +29,7 @@
  vec_tmp(istate,istate) = 1.d0
 enddo
 print*,'Diagonalizing the TC CISD '
- call davidson_general_diag_dressed_ext_rout_nonsym_b1space(vec_tmp, H_jj, dressing_dets,eigval_tmp, N_det, n_states, n_states_diag, converged, htc_bi_ortho_calc_tdav)
+ call davidson_general_diag_dressed_ext_rout_nonsym_b1space(vec_tmp, H_jj, dressing_dets,eigval_tmp, N_det, n_states, n_states_diag, converged, htc_bi_ortho_calc_tdav_slow)
 do i = 1, N_det 
  e_corr_dets(i) = reigvec_tc_bi_orth_tmp(i,1) * h0j(i)/reigvec_tc_bi_orth_tmp(1,1)
 enddo
@ -41,8 +41,8 @@
 it = 0
 dressing_dets = 0.d0
  double precision, allocatable :: H_jj(:),vec_tmp(:,:),eigval_tmp(:)
-  external                         htc_bi_ortho_calc_tdav
+  external                         htc_bi_ortho_calc_tdav_slow
-  external                         htcdag_bi_ortho_calc_tdav
+  external                         htcdag_bi_ortho_calc_tdav_slow
  logical                       :: converged
 do while (dabs(E_before-E_current).gt.thr)
  it += 1
@ -66,7 +66,7 @@
  do istate = N_states+1, n_states_diag
   vec_tmp(istate,istate) = 1.d0
  enddo
-  call davidson_general_diag_dressed_ext_rout_nonsym_b1space(vec_tmp, H_jj, dressing_dets,eigval_tmp, N_det, n_states, n_states_diag, converged, htc_bi_ortho_calc_tdav)
+  call davidson_general_diag_dressed_ext_rout_nonsym_b1space(vec_tmp, H_jj, dressing_dets,eigval_tmp, N_det, n_states, n_states_diag, converged, htc_bi_ortho_calc_tdav_slow)
  print*,'outside Davidson'
  print*,'eigval_tmp(1) = ',eigval_tmp(1)
  do i = 1, N_det 
--- a/src/tc_bi_ortho/tc_h_eigvectors.irp.f
+++ b/src/tc_bi_ortho/tc_h_eigvectors.irp.f
@ -207,8 +207,6 @@ end
  else ! n_det > N_det_max_full
    double precision, allocatable :: H_jj(:),vec_tmp(:,:)
    external                         htc_bi_ortho_calc_tdav
    external                         htcdag_bi_ortho_calc_tdav
    external                         H_tc_u_0_opt
    external                         H_tc_dagger_u_0_opt
    external                         H_tc_s2_dagger_u_0_opt
@ -217,7 +215,7 @@ end
    allocate(H_jj(N_det),vec_tmp(N_det,n_states_diag))
    do i = 1, N_det
-      call htilde_mu_mat_bi_ortho_tot(psi_det(1,1,i), psi_det(1,1,i), N_int, H_jj(i))
+      call htilde_mu_mat_opt_bi_ortho_tot(psi_det(1,1,i), psi_det(1,1,i), N_int, H_jj(i))
    enddo
    print*,'---------------------------------'
@ -259,7 +257,6 @@ end
    do istate = N_states+1, n_states_diag
      vec_tmp(istate,istate) = 1.d0
    enddo
    !call davidson_general_ext_rout_nonsym_b1space(vec_tmp, H_jj, eigval_right_tc_bi_orth, N_det, n_states, n_states_diag, converged, htc_bi_ortho_calc_tdav)
    !call davidson_general_ext_rout_nonsym_b1space(vec_tmp, H_jj, eigval_right_tc_bi_orth, N_det, n_states, n_states_diag, converged, H_tc_u_0_opt)
    converged = .False.
    i_it = 0
--- a/src/tc_bi_ortho/tc_hmat.irp.f
+++ b/src/tc_bi_ortho/tc_hmat.irp.f
@ -9,33 +9,25 @@
  implicit none
  integer          :: i, j
-  double precision :: hmono,htwoe,hthree,htot
+  double precision :: htot
  PROVIDE N_int
  i = 1
  j = 1
-  call htilde_mu_mat_bi_ortho(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
+  call htilde_mu_mat_opt_bi_ortho_tot(psi_det(1,1,j), psi_det(1,1,i), N_int, htot)
- !$OMP PARALLEL DO SCHEDULE(GUIDED) DEFAULT(NONE) PRIVATE(i,j,hmono, htwoe, hthree, htot) &
+ !$OMP PARALLEL DO SCHEDULE(GUIDED) DEFAULT(NONE) PRIVATE(i,j, htot) &
 !$OMP SHARED (N_det, psi_det, N_int,htilde_matrix_elmt_bi_ortho)
    do i = 1, N_det
      do j = 1, N_det
        ! < J | Htilde | I >
-        call htilde_mu_mat_bi_ortho(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
+        call htilde_mu_mat_opt_bi_ortho_tot(psi_det(1,1,j), psi_det(1,1,i), N_int, htot)
        !print *, ' hmono  = ', hmono
        !print *, ' htwoe  = ', htwoe
        !print *, ' hthree = ', hthree
        htilde_matrix_elmt_bi_ortho(j,i) = htot
      enddo
    enddo
 !$OMP END PARALLEL DO
 ! print*,'htilde_matrix_elmt_bi_ortho = '
 ! do i = 1, min(100,N_det)
 !  write(*,'(100(F16.10,X))')htilde_matrix_elmt_bi_ortho(1:min(100,N_det),i)
 ! enddo
 END_PROVIDER 
--- a/src/tc_bi_ortho/tc_som.irp.f
+++ b/src/tc_bi_ortho/tc_som.irp.f
@ -56,8 +56,8 @@ subroutine main()
  U_SOM = 0.d0 
  do i = 1, N_det
    if(i == i_HF) cycle
-    call htilde_mu_mat_bi_ortho(psi_det(1,1,i_HF), psi_det(1,1,i), N_int, hmono_1, htwoe_1, hthree_1, htot_1)
+    call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i_HF), psi_det(1,1,i), N_int, hmono_1, htwoe_1, hthree_1, htot_1)
-    call htilde_mu_mat_bi_ortho(psi_det(1,1,i), psi_det(1,1,i_HF), N_int, hmono_2, htwoe_2, hthree_2, htot_2)
+    call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i), psi_det(1,1,i_HF), N_int, hmono_2, htwoe_2, hthree_2, htot_2)
    U_SOM += htot_1 * htot_2
  enddo
  U_SOM = 0.5d0 * U_SOM
--- a/src/tc_bi_ortho/tc_utils.irp.f
+++ b/src/tc_bi_ortho/tc_utils.irp.f
@ -12,7 +12,7 @@ subroutine write_tc_energy()
    do i = 1, N_det
      do j = 1, N_det
        !htot = htilde_matrix_elmt_bi_ortho(i,j)
-        call htilde_mu_mat_bi_ortho(psi_det(1,1,i), psi_det(1,1,j), N_int, hmono, htwoe, hthree, htot)
+        call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i), psi_det(1,1,j), N_int, hmono, htwoe, hthree, htot)
        E_TC = E_TC + psi_l_coef_bi_ortho(i,k) * psi_r_coef_bi_ortho(j,k) * htot
        !E_TC = E_TC + leigvec_tc_bi_orth(i,k) * reigvec_tc_bi_orth(j,k) * htot
      enddo
@ -45,7 +45,7 @@ subroutine write_tc_var()
    SIGMA_TC = 0.d0
    do j = 2, N_det
-      call htilde_mu_mat_bi_ortho(psi_det(1,1,1), psi_det(1,1,j), N_int, hmono, htwoe, hthree, htot)
+      call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,1), psi_det(1,1,j), N_int, hmono, htwoe, hthree, htot)
      SIGMA_TC = SIGMA_TC + htot * htot
    enddo
--- a/src/tc_bi_ortho/test_normal_order.irp.f
+++ b/src/tc_bi_ortho/test_normal_order.irp.f
@ -35,7 +35,7 @@ subroutine test
     det_i = ref_bitmask
     call do_single_excitation(det_i,h1,p1,s1,i_ok)
     call do_single_excitation(det_i,h2,p2,s2,i_ok)
-     call htilde_mu_mat_bi_ortho(det_i,HF_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
+     call htilde_mu_mat_bi_ortho_slow(det_i,HF_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
     call get_excitation_degree(ref_bitmask,det_i,degree,N_int)
     call get_excitation(ref_bitmask,det_i,exc,degree,phase,N_int)
     hthree *= phase
@ -67,7 +67,7 @@ do h1 = 1, elec_alpha_num
    if(i_ok.ne.1)cycle
    call do_single_excitation(det_i,h2,p2,s2,i_ok)
    if(i_ok.ne.1)cycle
-    call htilde_mu_mat_bi_ortho(det_i,ref_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
+    call htilde_mu_mat_bi_ortho_slow(det_i,ref_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
    call get_excitation_degree(ref_bitmask,det_i,degree,N_int)
    call get_excitation(ref_bitmask,det_i,exc,degree,phase,N_int)
    integer :: hh1, pp1, hh2, pp2, ss1, ss2
@ -103,7 +103,7 @@ do h1 = 1, elec_beta_num
    if(i_ok.ne.1)cycle
    call do_single_excitation(det_i,h2,p2,s2,i_ok)
    if(i_ok.ne.1)cycle
-    call htilde_mu_mat_bi_ortho(det_i,ref_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
+    call htilde_mu_mat_bi_ortho_slow(det_i,ref_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
    call get_excitation_degree(ref_bitmask,det_i,degree,N_int)
    call get_excitation(ref_bitmask,det_i,exc,degree,phase,N_int)
    call decode_exc(exc, 2, hh1, pp1, hh2, pp2, ss1, ss2)
--- a/src/tc_bi_ortho/test_s2_tc.irp.f
+++ b/src/tc_bi_ortho/test_s2_tc.irp.f
@ -91,7 +91,7 @@ subroutine routine_test_s2_davidson
 external H_tc_s2_u_0_opt
 allocate(H_jj(N_det),vec_tmp(N_det,n_states_diag),energies(n_states_diag), s2(n_states_diag))
 do i = 1, N_det
-   call htilde_mu_mat_bi_ortho_tot(psi_det(1,1,i), psi_det(1,1,i), N_int, H_jj(i))
+   call htilde_mu_mat_bi_ortho_tot_slow(psi_det(1,1,i), psi_det(1,1,i), N_int, H_jj(i))
 enddo
 ! Preparing the left-eigenvector
 print*,'Computing the left-eigenvector '
--- a/src/tc_bi_ortho/test_tc_bi_ortho.irp.f
+++ b/src/tc_bi_ortho/test_tc_bi_ortho.irp.f
@ -31,7 +31,7 @@ subroutine test_h_u0
  u_0(i) = psi_r_coef_bi_ortho(i,1)
 enddo
 call H_tc_u_0_nstates_openmp(v_0_new,u_0,N_states,N_det, do_right)
- call htc_bi_ortho_calc_tdav (v_0_ref,u_0,N_states,N_det)
+ call htc_bi_ortho_calc_tdav_slow (v_0_ref,u_0,N_states,N_det)
 print*,'difference right '
 accu = 0.d0
 do i = 1, N_det
@ -42,7 +42,7 @@ subroutine test_h_u0
 do_right = .False.
 v_0_new = 0.d0
 call H_tc_u_0_nstates_openmp(v_0_new,u_0,N_states,N_det, do_right)
- call htcdag_bi_ortho_calc_tdav(v_0_ref_dagger,u_0,N_states,N_det, do_right)
+ call htcdag_bi_ortho_calc_tdav_slow(v_0_ref_dagger,u_0,N_states,N_det, do_right)
 print*,'difference left'
 accu = 0.d0
 do i = 1, N_det
@ -63,7 +63,7 @@ subroutine test_slater_tc_opt
 i_count = 0.d0
 do i = 1, N_det
  do j = 1,N_det
-   call htilde_mu_mat_bi_ortho(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
+   call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
   call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,j), psi_det(1,1,i), N_int, hnewmono, hnewtwoe, hnewthree, hnewtot)
   if(dabs(htot).gt.1.d-15)then
     i_count += 1.D0
@ -99,7 +99,7 @@ subroutine timing_tot
  do j = 1, N_det
 !   call get_excitation_degree(psi_det(1,1,j), psi_det(1,1,i),degree,N_int)
   i_count += 1.d0
-   call htilde_mu_mat_bi_ortho(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
+   call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
  enddo
 enddo
 call wall_time(wall1)
@ -146,7 +146,7 @@ subroutine timing_diag
 do i = 1, N_det
  do j = i,i 
   i_count += 1.d0
-   call htilde_mu_mat_bi_ortho(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
+   call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
  enddo
 enddo
 call wall_time(wall1)
@ -183,7 +183,7 @@ subroutine timing_single
   if(degree.ne.1)cycle
   i_count += 1.d0
   call wall_time(wall0)
-   call htilde_mu_mat_bi_ortho(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
+   call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
   call wall_time(wall1)
   accu += wall1 - wall0
  enddo
@ -225,7 +225,7 @@ subroutine timing_double
   if(degree.ne.2)cycle
   i_count += 1.d0
   call wall_time(wall0)
-   call htilde_mu_mat_bi_ortho(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
+   call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
   call wall_time(wall1)
   accu += wall1 - wall0
  enddo
--- a/src/tc_bi_ortho/test_tc_fock.irp.f
+++ b/src/tc_bi_ortho/test_tc_fock.irp.f
@ -25,8 +25,7 @@ subroutine test_3e
 implicit none
 double precision :: integral_aaa,integral_aab,integral_abb,integral_bbb,accu
 double precision ::  hmono, htwoe, hthree, htot
- call htilde_mu_mat_bi_ortho(ref_bitmask, ref_bitmask, N_int, hmono, htwoe, hthree, htot)
+ call htilde_mu_mat_bi_ortho_slow(ref_bitmask, ref_bitmask, N_int, hmono, htwoe, hthree, htot)
 ! call diag_htilde_three_body_ints_bi_ort(N_int, ref_bitmask, hthree)
 print*,'hmono = ',hmono
 print*,'htwoe = ',htwoe
 print*,'hthree= ',hthree
@ -88,7 +87,7 @@ subroutine routine_3()
       print*, ' excited det'
       call debug_det(det_i, N_int)
-       call htilde_mu_mat_bi_ortho(det_i, ref_bitmask, N_int, hmono, htwoe, hthree, htilde_ij)
+       call htilde_mu_mat_bi_ortho_slow(det_i, ref_bitmask, N_int, hmono, htwoe, hthree, htilde_ij)
       if(dabs(hthree).lt.1.d-10)cycle
       ref = hthree 
       if(s1 == 1)then
@ -156,7 +155,7 @@ subroutine routine_tot()
        stop
       endif
-       call htilde_mu_mat_bi_ortho(det_i, ref_bitmask, N_int, hmono, htwoe, hthree, htilde_ij)
+       call htilde_mu_mat_bi_ortho_slow(det_i, ref_bitmask, N_int, hmono, htwoe, hthree, htilde_ij)
       print*,htilde_ij
       if(dabs(htilde_ij).lt.1.d-10)cycle
       print*, ' excited det'
--- a/src/tc_keywords/EZFIO.cfg
+++ b/src/tc_keywords/EZFIO.cfg
@ -16,6 +16,24 @@ doc: If |true|, three-body terms are included
 interface: ezfio,provider,ocaml
 default: True
 [three_e_3_idx_term]
 type: logical
 doc: If |true|, the diagonal 3-idx terms of the 3-e interaction are included
 interface: ezfio,provider,ocaml
 default: True
 [three_e_4_idx_term]
 type: logical
 doc: If |true|, the off-diagonal 4-idx terms of the 3-e interaction are included
 interface: ezfio,provider,ocaml
 default: True
 [three_e_5_idx_term]
 type: logical
 doc: If |true|, the off-diagonal 5-idx terms of the 3-e interaction are included
 interface: ezfio,provider,ocaml
 default: True
 [pure_three_body_h_tc]
 type: logical
 doc: If |true|, pure triple excitation three-body terms are included
@ -130,6 +148,12 @@ doc: a parameter used to define mu(r)
 interface: ezfio, provider, ocaml
 default: 6.203504908994001e-1
 [beta_rho_power]
 type: double precision
 doc: a parameter used to define mu(r)
 interface: ezfio, provider, ocaml
 default: 0.5
 [thr_degen_tc]
 type: Threshold
 doc: Threshold to determine if two orbitals are degenerate in TCSCF in order to avoid random quasi orthogonality between the right- and left-eigenvector for the same eigenvalue
--- a/src/trexio/EZFIO.cfg
+++ b/src/trexio/EZFIO.cfg
@ -10,11 +10,17 @@ doc: Name of the exported TREXIO file
 interface: ezfio, ocaml, provider
 default: None
-[export_rdm]
+[export_basis]
 type: logical
-doc: If True, export two-body reduced density matrix
+doc: If True, export basis set and AOs
 interface: ezfio, ocaml, provider
-default: False
+default: True
 [export_mos]
 type: logical
 doc: If True, export molecular orbitals
 interface: ezfio, ocaml, provider
 default: True
 [export_ao_one_e_ints]
 type: logical
@ -22,12 +28,6 @@ doc: If True, export one-electron integrals in AO basis
 interface: ezfio, ocaml, provider
 default: False
 [export_mo_one_e_ints]
 type: logical
 doc: If True, export one-electron integrals in MO basis
 interface: ezfio, ocaml, provider
 default: False
 [export_ao_two_e_ints]
 type: logical
 doc: If True, export two-electron integrals in AO basis
@ -40,6 +40,12 @@ doc: If True, export Cholesky-decomposed two-electron integrals in AO basis
 interface: ezfio, ocaml, provider
 default: False
 [export_mo_one_e_ints]
 type: logical
 doc: If True, export one-electron integrals in MO basis
 interface: ezfio, ocaml, provider
 default: False
 [export_mo_two_e_ints]
 type: logical
 doc: If True, export two-electron integrals in MO basis
@ -52,3 +58,9 @@ doc: If True, export Cholesky-decomposed two-electron integrals in MO basis
 interface: ezfio, ocaml, provider
 default: False
 [export_rdm]
 type: logical
 doc: If True, export two-body reduced density matrix
 interface: ezfio, ocaml, provider
 default: False
--- a/src/trexio/export_trexio.irp.f
+++ b/src/trexio/export_trexio.irp.f
@ -2,6 +2,6 @@ program export_trexio_prog
  implicit none
  read_wf = .True.
  SOFT_TOUCH read_wf
-  call export_trexio
+  call export_trexio(.False.)
 end
--- a/src/trexio/export_trexio_routines.irp.f
+++ b/src/trexio/export_trexio_routines.irp.f
@ -1,15 +1,17 @@
-subroutine export_trexio
+subroutine export_trexio(update)
  use trexio
  implicit none
  BEGIN_DOC
  !     Exports the wave function in TREXIO format
  END_DOC
  logical, intent(in)            :: update
  integer(trexio_t)              :: f(N_states) ! TREXIO file handle
  integer(trexio_exit_code)      :: rc
  integer                        :: k
  double precision, allocatable  :: factor(:)
  character*(256)  :: filenames(N_states)
  character :: rw
  filenames(1) = trexio_filename
  do k=2,N_states
@ -18,15 +20,26 @@ subroutine export_trexio
  do k=1,N_states
    print *, 'TREXIO file : ', trim(filenames(k))
    if (update) then
      call system('test -f '//trim(filenames(k))//' && cp -r '//trim(filenames(k))//' '//trim(filenames(k))//'.bak')
    else
      call system('test -f '//trim(filenames(k))//' && mv '//trim(filenames(k))//' '//trim(filenames(k))//'.bak')
    endif
  enddo
  print *, ''
  if (update) then
     rw = 'u'
  else
     rw = 'w'
  endif
  do k=1,N_states
    if (backend == 0) then
-      f(k) = trexio_open(filenames(k), 'u', TREXIO_HDF5, rc)
+      f(k) = trexio_open(filenames(k), rw, TREXIO_HDF5, rc)
    else if (backend == 1) then
-      f(k) = trexio_open(filenames(k), 'u', TREXIO_TEXT, rc)
+      f(k) = trexio_open(filenames(k), rw, TREXIO_TEXT, rc)
    endif
    if (f(k) == 0_8) then
      print *, 'Unable to open TREXIO file for writing'
@ -171,12 +184,13 @@ subroutine export_trexio
  endif
  if (export_basis) then
 ! Basis
 ! -----
    print *, 'Basis'
    rc = trexio_write_basis_type(f(1), 'Gaussian', len('Gaussian'))
    call trexio_assert(rc, TREXIO_SUCCESS)
@ -193,11 +207,11 @@ subroutine export_trexio
     call trexio_assert(rc, TREXIO_SUCCESS)
     allocate(factor(shell_num))
-   if (ao_normalized) then
+!     if (ao_normalized) then
-     factor(1:shell_num) = shell_normalization_factor(1:shell_num)
+!       factor(1:shell_num) = shell_normalization_factor(1:shell_num)
-   else
+!     else
       factor(1:shell_num) = 1.d0
-   endif
+!     endif
     rc = trexio_write_basis_shell_factor(f(1), factor)
     call trexio_assert(rc, TREXIO_SUCCESS)
@ -258,6 +272,8 @@ subroutine export_trexio
    call trexio_assert(rc, TREXIO_SUCCESS)
    deallocate(factor)
  endif
 ! One-e AO integrals
 ! ------------------
@ -375,6 +391,7 @@ subroutine export_trexio
 ! Molecular orbitals
 ! ------------------
  if (export_mos) then
    print *, 'MOs'
    rc = trexio_write_mo_type(f(1), mo_label, len(trim(mo_label)))
@ -396,6 +413,7 @@ subroutine export_trexio
    rc = trexio_write_mo_class(f(1), mo_class, len(mo_class(1)))
    call trexio_assert(rc, TREXIO_SUCCESS)
  endif
 ! One-e MO integrals
 ! ------------------
--- a/src/trexio/import_trexio_integrals.irp.f
+++ b/src/trexio/import_trexio_integrals.irp.f
@ -3,6 +3,7 @@ program import_integrals_ao
  implicit none
  integer(trexio_t)              :: f ! TREXIO file handle
  integer(trexio_exit_code)      :: rc
  PROVIDE mo_num
  f = trexio_open(trexio_filename, 'r', TREXIO_AUTO, rc)
  if (f == 0_8) then
@ -42,10 +43,10 @@ subroutine run(f)
  if (trexio_has_nucleus_repulsion(f) == TREXIO_SUCCESS) then
    rc = trexio_read_nucleus_repulsion(f, s)
    call trexio_assert(rc, TREXIO_SUCCESS)
    if (rc /= TREXIO_SUCCESS) then
      print *, irp_here, rc
      print *, 'Error reading nuclear repulsion'
      call trexio_assert(rc, TREXIO_SUCCESS)
      stop -1
    endif
    call ezfio_set_nuclei_nuclear_repulsion(s)
@ -63,6 +64,7 @@ subroutine run(f)
    if (rc /= TREXIO_SUCCESS) then
      print *, irp_here
      print *, 'Error reading AO overlap'
      call trexio_assert(rc, TREXIO_SUCCESS)
      stop -1
    endif
    call ezfio_set_ao_one_e_ints_ao_integrals_overlap(A)
@ -74,6 +76,7 @@ subroutine run(f)
    if (rc /= TREXIO_SUCCESS) then
      print *, irp_here
      print *, 'Error reading AO kinetic integrals'
      call trexio_assert(rc, TREXIO_SUCCESS)
      stop -1
    endif
    call ezfio_set_ao_one_e_ints_ao_integrals_kinetic(A)
@ -85,6 +88,7 @@ subroutine run(f)
 !    if (rc /= TREXIO_SUCCESS) then
 !      print *, irp_here
 !      print *, 'Error reading AO ECP local integrals'
 !      call trexio_assert(rc, TREXIO_SUCCESS)
 !      stop -1
 !    endif
 !    call ezfio_set_ao_one_e_ints_ao_integrals_pseudo(A)
@ -96,6 +100,7 @@ subroutine run(f)
    if (rc /= TREXIO_SUCCESS) then
      print *, irp_here
      print *, 'Error reading AO potential N-e integrals'
      call trexio_assert(rc, TREXIO_SUCCESS)
      stop -1
    endif
    call ezfio_set_ao_one_e_ints_ao_integrals_n_e(A)
@ -106,6 +111,10 @@ subroutine run(f)
  ! AO 2e integrals
  ! ---------------
  rc = trexio_has_ao_2e_int(f)
  PROVIDE ao_num
  if (rc /= TREXIO_HAS_NOT) then
      PROVIDE ao_integrals_map
      integer*4 :: BUFSIZE
@ -143,4 +152,71 @@ subroutine run(f)
      call map_save_to_disk(trim(ezfio_filename)//'/work/ao_ints',ao_integrals_map)
      call ezfio_set_ao_two_e_ints_io_ao_two_e_integrals('Read')
      deallocate(buffer_i, buffer_values, Vi, V)
      print *, 'AO integrals read from TREXIO file'
  else
      print *, 'AO integrals not found in TREXIO file'
  endif
  ! MO integrals
  ! ------------
  allocate(A(mo_num, mo_num))
  if (trexio_has_mo_1e_int_core_hamiltonian(f) == TREXIO_SUCCESS) then
    rc = trexio_read_mo_1e_int_core_hamiltonian(f, A)
    if (rc /= TREXIO_SUCCESS) then
      print *, irp_here
      print *, 'Error reading MO 1e integrals'
      call trexio_assert(rc, TREXIO_SUCCESS)
      stop -1
    endif
    call ezfio_set_mo_one_e_ints_mo_one_e_integrals(A)
    call ezfio_set_mo_one_e_ints_io_mo_one_e_integrals('Read')
  endif
  deallocate(A)
  ! MO 2e integrals
  ! ---------------
  rc = trexio_has_mo_2e_int(f)
  if (rc /= TREXIO_HAS_NOT) then
      BUFSIZE=mo_num**2
      allocate(buffer_i(BUFSIZE), buffer_values(BUFSIZE))
      allocate(Vi(4,BUFSIZE), V(BUFSIZE))
      offset = 0_8
      icount = BUFSIZE
      rc = TREXIO_SUCCESS
      do while (icount == size(V))
        rc = trexio_read_mo_2e_int_eri(f, offset, icount, Vi, V)
        do m=1,icount
          i = Vi(1,m)
          j = Vi(2,m)
          k = Vi(3,m)
          l = Vi(4,m)
          integral = V(m)
          call two_e_integrals_index(i, j, k, l, buffer_i(m) )
          buffer_values(m) = integral
        enddo
        call map_append(mo_integrals_map, buffer_i, buffer_values, int(icount,4))
        offset = offset + icount
        if (rc /= TREXIO_SUCCESS) then
            exit
        endif
      end do
      n_integrals = offset
      call map_sort(mo_integrals_map)
      call map_unique(mo_integrals_map)
      call map_save_to_disk(trim(ezfio_filename)//'/work/mo_ints',mo_integrals_map)
      call ezfio_set_mo_two_e_ints_io_mo_two_e_integrals('Read')
      deallocate(buffer_i, buffer_values, Vi, V)
      print *, 'MO integrals read from TREXIO file'
  else
      print *, 'MO integrals not found in TREXIO file'
  endif
 end
--- a/src/utils/integration.irp.f
+++ b/src/utils/integration.irp.f
@ -468,8 +468,6 @@ end subroutine
 subroutine multiply_poly(b,nb,c,nc,d,nd)
  implicit none
  BEGIN_DOC
@ -484,33 +482,292 @@ subroutine multiply_poly(b,nb,c,nc,d,nd)
  integer                        :: ndtmp
  integer                        :: ib, ic, id, k
-  if(ior(nc,nb) >= 0) then ! True if nc>=0 and nb>=0
+  if(ior(nc,nb) < 0) return !False if nc>=0 and nb>=0
-    continue
+
-  else
+  select case (nb)
    case (0)
      call multiply_poly_b0(b,c,nc,d,nd)
      return
-  endif
+    case (1)
-  ndtmp = nb+nc
+      call multiply_poly_b1(b,c,nc,d,nd)
      return
    case (2)
      call multiply_poly_b2(b,c,nc,d,nd)
      return
  end select
  select case (nc)
    case (0)
      call multiply_poly_c0(b,nb,c,d,nd)
      return
    case (1)
      call multiply_poly_c1(b,nb,c,d,nd)
      return
    case (2)
      call multiply_poly_c2(b,nb,c,d,nd)
      return
  end select
  do ib=0,nb
    do ic = 0,nc
      d(ib+ic) = d(ib+ic) + c(ic) * b(ib)
    enddo
  enddo
  do nd = nb+nc,0,-1
    if (d(nd) /= 0.d0) exit
  enddo
 end
 subroutine multiply_poly_b0(b,c,nc,d,nd)
  implicit none
  BEGIN_DOC
  ! Multiply a constant (degree-0 polynomial) by a polynomial and accumulate:
  ! D(t) += b(0)*C(t)
  !
  ! nc : degree of C
  ! d  : accumulator, updated in place for coefficients 0..nc
  ! nd : on exit, largest index in 0..nc such that d(nd) /= 0, or -1 if all
  !      of these coefficients are zero. Not assigned when nc < 0.
  END_DOC
  integer, intent(in)            :: nc
  integer, intent(out)           :: nd
  double precision, intent(in)   :: b(0:0), c(0:nc)
  double precision, intent(inout) :: d(0:nc)
  integer                        :: ic
  ! Empty C: nothing to accumulate
  if(nc < 0) return
  do ic = 0,nc
    d(ic) = d(ic) + c(ic) * b(0)
  enddo
  ! Scan downwards for the highest non-zero coefficient
  do nd = nc,0,-1
    if (d(nd) /= 0.d0) exit
  enddo
 end
 subroutine multiply_poly_b1(b,c,nc,d,nd)
  implicit none
  BEGIN_DOC
  ! Multiply a degree-1 polynomial by a polynomial and accumulate:
  ! D(t) += B(t)*C(t)   with   B(t) = b(0) + b(1) t
  !
  ! nc : degree of C
  ! d  : accumulator, updated in place for coefficients 0..nc+1
  ! nd : on exit, largest index in 0..nc+1 such that d(nd) /= 0, or -1 if all
  !      of these coefficients are zero. Not assigned when nc < 0.
  END_DOC
  integer, intent(in)            :: nc
  integer, intent(out)           :: nd
  double precision, intent(in)   :: b(0:1), c(0:nc)
  double precision, intent(inout) :: d(0:1+nc)
  integer                        :: ic
  ! Empty C: nothing to accumulate
  if(nc < 0) return
  ! The smallest degrees of C are written out explicitly; the default
  ! branch applies the same convolution with a loop.
  select case (nc)
    case (0)
      d(0) = d(0) + c(0) * b(0)
      d(1) = d(1) + c(0) * b(1)
    case (1)
      d(0) = d(0) + c(0) * b(0)
      d(1) = d(1) + c(0) * b(1) + c(1) * b(0)
      d(2) = d(2) + c(1) * b(1)
    case default
      d(0) = d(0) + c(0) * b(0)
      do ic = 1,nc
        d(ic) = d(ic) + c(ic) * b(0) + c(ic-1) * b(1)
      enddo
      d(nc+1) = d(nc+1) + c(nc) * b(1)
  end select
  ! Scan downwards for the highest non-zero coefficient
  do nd = 1+nc,0,-1
    if (d(nd) /= 0.d0) exit
  enddo
 end
 subroutine multiply_poly_b2(b,c,nc,d,nd)
  implicit none
  BEGIN_DOC
  ! Multiply a degree-2 polynomial by a polynomial and accumulate:
  ! D(t) += B(t)*C(t)   with   B(t) = b(0) + b(1) t + b(2) t^2
  !
  ! nc : degree of C
  ! d  : accumulator, updated in place for coefficients 0..nc+2
  ! nd : on exit, largest index in 0..nc+2 such that d(nd) /= 0, or -1 if all
  !      of these coefficients are zero. Not assigned when nc < 0.
  END_DOC
  integer, intent(in)            :: nc
  integer, intent(out)           :: nd
  double precision, intent(in)   :: b(0:2), c(0:nc)
  double precision, intent(inout) :: d(0:2+nc)
  integer                        :: ic
  ! Empty C: nothing to accumulate
  if(nc < 0) return
  ! The smallest degrees of C are written out explicitly; the default
  ! branch applies the same convolution with a loop.
  select case (nc)
    case (0)
      d(0) = d(0) + c(0) * b(0)
      d(1) = d(1) + c(0) * b(1)
      d(2) = d(2) + c(0) * b(2)
    case (1)
      d(0) = d(0) + c(0) * b(0)
      d(1) = d(1) + c(0) * b(1) + c(1) * b(0)
      d(2) = d(2) + c(0) * b(2) + c(1) * b(1)
      d(3) = d(3) + c(1) * b(2)
    case (2)
      d(0) = d(0) + c(0) * b(0)
      d(1) = d(1) + c(0) * b(1) + c(1) * b(0)
      d(2) = d(2) + c(0) * b(2) + c(1) * b(1) + c(2) * b(0)
      d(3) = d(3) + c(2) * b(1) + c(1) * b(2)
      d(4) = d(4) + c(2) * b(2)
    case default
      d(0) = d(0) + c(0) * b(0)
      d(1) = d(1) + c(0) * b(1) + c(1) * b(0)
      do ic = 2,nc
        d(ic) = d(ic) + c(ic) * b(0) + c(ic-1) * b(1) + c(ic-2) * b(2)
      enddo
      d(nc+1) = d(nc+1) + c(nc) * b(1) + c(nc-1) * b(2)
      d(nc+2) = d(nc+2) + c(nc) * b(2)
  end select
  ! Scan downwards for the highest non-zero coefficient
  do nd = 2+nc,0,-1
    if (d(nd) /= 0.d0) exit
  enddo
 end
 subroutine multiply_poly_c0(b,nb,c,d,nd)
  implicit none
  BEGIN_DOC
  ! Multiply a polynomial by a constant (degree-0 polynomial) and accumulate:
  ! D(t) += B(t)*c(0)
  !
  ! nb : degree of B
  ! d  : accumulator, updated in place for coefficients 0..nb
  ! nd : on exit, largest index in 0..nb such that d(nd) /= 0, or -1 if all
  !      of these coefficients are zero. Not assigned when nb < 0.
  END_DOC
  integer, intent(in)            :: nb
  integer, intent(out)           :: nd
  double precision, intent(in)   :: b(0:nb), c(0:0)
  double precision, intent(inout) :: d(0:nb)
  integer                        :: ib
  ! Empty B: nothing to accumulate
  if(nb < 0) return
  do ib=0,nb
      d(ib) = d(ib) + c(0) * b(ib)
  enddo
  ! Scan downwards for the highest non-zero coefficient
  do nd = nb,0,-1
    if (d(nd) /= 0.d0) exit
  enddo
 end
 subroutine multiply_poly_c1(b,nb,c,d,nd)
  implicit none
  BEGIN_DOC
  ! Multiply a polynomial by a degree-1 polynomial and accumulate:
  ! D(t) += B(t)*C(t)   with   C(t) = c(0) + c(1) t
  !
  ! nb : degree of B
  ! d  : accumulator, updated in place for coefficients 0..nb+1
  ! nd : on exit, largest index in 0..nb+1 such that d(nd) /= 0, or -1 if all
  !      of these coefficients are zero. Not assigned when nb < 0.
  END_DOC
  integer, intent(in)            :: nb
  integer, intent(out)           :: nd
  double precision, intent(in)   :: b(0:nb), c(0:1)
  double precision, intent(inout) :: d(0:nb+1)
  integer                        :: ib
  ! Empty B: nothing to accumulate
  if(nb < 0) return
  ! The smallest degrees of B are written out explicitly; the default
  ! branch applies the same convolution with a loop.
  select case (nb)
    case (0)
      d(0) = d(0) + c(0) * b(0)
      d(1) = d(1) + c(1) * b(0)
    case (1)
      d(0) = d(0) + c(0) * b(0)
      d(1) = d(1) + c(0) * b(1) + c(1) * b(0)
      d(2) = d(2) + c(1) * b(1)
    case default
      d(0) = d(0) + c(0) * b(0)
      do ib=1,nb
        d(ib) = d(ib) + c(0) * b(ib) + c(1) * b(ib-1)
      enddo
      d(nb+1) = d(nb+1) + c(1) * b(nb)
  end select
  ! Scan downwards for the highest non-zero coefficient
  do nd = nb+1,0,-1
    if (d(nd) /= 0.d0) exit
  enddo
 end
 subroutine multiply_poly_c2(b,nb,c,d,nd)
  implicit none
  BEGIN_DOC
  ! Multiply a polynomial by a degree-2 polynomial and accumulate:
  ! D(t) += B(t)*C(t)   with   C(t) = c(0) + c(1) t + c(2) t^2
  !
  ! nb : degree of B
  ! d  : accumulator, updated in place for coefficients 0..nb+2
  ! nd : on exit, largest index in 0..nb+2 such that d(nd) /= 0, or -1 if all
  !      of these coefficients are zero. Not assigned when nb < 0.
  END_DOC
  integer, intent(in)            :: nb
  integer, intent(out)           :: nd
  double precision, intent(in)   :: b(0:nb), c(0:2)
  double precision, intent(inout) :: d(0:nb+2)
  integer                        :: ib
  ! Empty B: nothing to accumulate
  if(nb < 0) return
  ! The smallest degrees of B are written out explicitly; the default
  ! branch applies the same convolution with a loop.
  select case (nb)
    case (0)
      d(0) = d(0) + c(0) * b(0)
      d(1) = d(1) + c(1) * b(0)
      d(2) = d(2) + c(2) * b(0)
    case (1)
      d(0) = d(0) + c(0) * b(0)
      d(1) = d(1) + c(0) * b(1) + c(1) * b(0)
      d(2) = d(2) + c(1) * b(1) + c(2) * b(0)
      d(3) = d(3) + c(2) * b(1)
    case (2)
      d(0) = d(0) + c(0) * b(0)
      d(1) = d(1) + c(0) * b(1) + c(1) * b(0)
      d(2) = d(2) + c(0) * b(2) + c(1) * b(1) + c(2) * b(0)
      d(3) = d(3) + c(1) * b(2) + c(2) * b(1)
      d(4) = d(4) + c(2) * b(2)
    case default
      d(0) = d(0) + c(0) * b(0)
      d(1) = d(1) + c(0) * b(1) + c(1) * b(0)
      do ib=2,nb
        d(ib) = d(ib) + c(0) * b(ib) + c(1) * b(ib-1) + c(2) * b(ib-2)
      enddo
      d(nb+1) = d(nb+1) + c(1) * b(nb) + c(2) * b(nb-1)
      d(nb+2) = d(nb+2) + c(2) * b(nb)
  end select
  ! Scan downwards for the highest non-zero coefficient
  do nd = nb+2,0,-1
    if (d(nd) /= 0.d0) exit
  enddo
 end
 subroutine multiply_poly_v(b,nb,c,nc,d,nd,n_points)
  implicit none
  BEGIN_DOC
--- a/src/utils/linear_algebra.irp.f
+++ b/src/utils/linear_algebra.irp.f
@ -1825,39 +1825,37 @@ subroutine pivoted_cholesky( A, rank, tol, ndim, U)
 !
 integer                          :: ndim
 integer, intent(inout)           :: rank
-double precision, dimension(ndim, ndim), intent(inout)        :: A
+double precision, intent(inout)  :: A(ndim, ndim)
-double precision, dimension(ndim, rank), intent(out)          :: U
+double precision, intent(out)    :: U(ndim, rank)
 double precision, intent(in)     :: tol
 integer, dimension(:), allocatable          :: piv
 double precision, dimension(:), allocatable :: work
 character, parameter :: uplo = "U"
-integer :: N, LDA
+integer :: LDA
 integer :: info
 integer :: k, l, rank0
 external :: dpstrf
 rank0 = rank
-N = size(A, dim=1)
+LDA = ndim
-LDA = N
+allocate(piv(ndim))
-allocate(piv(N))
+allocate(work(2*ndim))
-allocate(work(2*N))
+call dpstrf(uplo, ndim, A, LDA, piv, rank, tol, work, info)
-call dpstrf(uplo, N, A, LDA, piv, rank, tol, work, info)
 if (rank > rank0) then
  print *, 'Bug: rank > rank0 in pivoted cholesky. Increase rank before calling'
  stop
 end if
-do k = 1, N
+do k = 1, ndim
-  A(k+1:, k) = 0.00D+0
+  A(k+1:ndim, k) = 0.00D+0
 end do
 ! TODO: It should be possible to use only one vector of size (1:rank) as a buffer
 ! to do the swapping in-place
 U(:,:) = 0.00D+0
-do k = 1, N
+do k = 1, ndim
  l = piv(k)
-  U(l, :) = A(1:rank, k)
+  U(l, 1:rank) = A(1:rank, k)
 end do
 end subroutine pivoted_cholesky
--- a/src/utils_cc/energy.irp.f
+++ b/src/utils_cc/energy.irp.f
@ -5,9 +5,8 @@ subroutine det_energy(det,energy)
  integer(bit_kind), intent(in) :: det
  double precision, intent(out) :: energy
  double precision, external :: diag_H_mat_elem
-  call i_H_j(det,det,N_int,energy)
+  energy = diag_H_mat_elem(det,N_int) + nuclear_repulsion
-  energy = energy + nuclear_repulsion
 end
--- a/src/utils_cc/mo_integrals_cc.irp.f
+++ b/src/utils_cc/mo_integrals_cc.irp.f
@ -45,61 +45,64 @@ subroutine gen_v_space(n1,n2,n3,n4,list1,list2,list3,list4,v)
  integer, intent(in)           :: list1(n1),list2(n2),list3(n3),list4(n4)
  double precision, intent(out) :: v(n1,n2,n3,n4)
-  integer                       :: i1,i2,i3,i4,idx1,idx2,idx3,idx4
+  integer                       :: i1,i2,i3,i4,idx1,idx2,idx3,idx4,k
  double precision              :: get_two_e_integral
  PROVIDE mo_two_e_integrals_in_map
  double precision, allocatable :: buffer(:,:,:)
  !$OMP PARALLEL &
-  !$OMP SHARED(n1,n2,n3,n4,list1,list2,list3,list4,v,mo_integrals_map) &
+  !$OMP SHARED(n1,n2,n3,n4,list1,list2,list3,list4,v,mo_num,cholesky_mo_transp,cholesky_ao_num) &
-  !$OMP PRIVATE(i1,i2,i3,i4,idx1,idx2,idx3,idx4)&
+  !$OMP PRIVATE(i1,i2,i3,i4,idx1,idx2,idx3,idx4,k,buffer)&
  !$OMP DEFAULT(NONE)
-  !$OMP DO collapse(3)
+  allocate(buffer(mo_num,mo_num,mo_num))
  !$OMP DO
  do i4 = 1, n4
    do i3 = 1, n3
      do i2 = 1, n2
        do i1 = 1, n1
    idx4 = list4(i4)
-          idx3 = list3(i3)
+    call dgemm('T','N', mo_num*mo_num, mo_num, cholesky_ao_num, 1.d0, &
       cholesky_mo_transp, cholesky_ao_num, &
       cholesky_mo_transp(1,1,idx4), cholesky_ao_num, 0.d0, buffer, mo_num*mo_num)
    do i2 = 1, n2
      idx2 = list2(i2)
      do i3 = 1, n3
        idx3 = list3(i3)
        do i1 = 1, n1
          idx1 = list1(i1)
-          v(i1,i2,i3,i4) = get_two_e_integral(idx1,idx2,idx3,idx4,mo_integrals_map)
+          v(i1,i2,i3,i4) = buffer(idx1,idx3,idx2)
        enddo
      enddo
    enddo
  enddo
  !$OMP END DO
  deallocate(buffer)
  !$OMP END PARALLEL
 end
 ! full
 BEGIN_PROVIDER [double precision, cc_space_v, (mo_num,mo_num,mo_num,mo_num)]
  implicit none
-
+  integer                       :: i1,i2,i3,i4,k
-  integer          :: i,j,k,l
+  double precision, allocatable :: buffer(:,:,:)
  double precision :: get_two_e_integral
  PROVIDE mo_two_e_integrals_in_map
  !$OMP PARALLEL &
-  !$OMP SHARED(cc_space_v,mo_num,mo_integrals_map) &
+  !$OMP SHARED(cc_space_v,mo_num,cholesky_mo_transp,cholesky_ao_num) &
-  !$OMP PRIVATE(i,j,k,l) &
+  !$OMP PRIVATE(i1,i2,i3,i4,k,buffer)&
  !$OMP DEFAULT(NONE)
-  
+  allocate(buffer(mo_num,mo_num,mo_num))
-  !$OMP DO collapse(3)
+  !$OMP DO
-  do l = 1, mo_num
+  do i4 = 1, mo_num
-    do k = 1, mo_num
+    call dgemm('T','N', mo_num*mo_num, mo_num, cholesky_ao_num, 1.d0, &
-      do j = 1, mo_num
+         cholesky_mo_transp, cholesky_ao_num, &
-        do i = 1, mo_num
+         cholesky_mo_transp(1,1,i4), cholesky_ao_num, 0.d0, buffer, mo_num*mo_num)
-          cc_space_v(i,j,k,l) = get_two_e_integral(i,j,k,l,mo_integrals_map)
+    do i2 = 1, mo_num
      do i3 = 1, mo_num
        do i1 = 1, mo_num
          cc_space_v(i1,i2,i3,i4) = buffer(i1,i3,i2)
        enddo
      enddo
    enddo
  enddo
  !$OMP END DO
  deallocate(buffer)
  !$OMP END PARALLEL
 END_PROVIDER
@ -638,6 +641,7 @@ subroutine gen_f_spin(det, n1,n2, n1_S,n2_S, list1,list2, dim1,dim2, f)
  integer                       :: i,j, idx_i,idx_j,i_shift,j_shift
  integer                       :: tmp_i,tmp_j
  integer                       :: si,sj,s
  PROVIDE big_array_exchange_integrals big_array_coulomb_integrals
  allocate(tmp_F(mo_num,mo_num))
@ -702,8 +706,10 @@ subroutine get_fock_matrix_spin(det,s,f)
    s2 = 1
  endif
  PROVIDE big_array_coulomb_integrals big_array_exchange_integrals
  !$OMP PARALLEL &
-  !$OMP SHARED(f,mo_num,s1,s2,N_int,det,mo_one_e_integrals) &
+  !$OMP SHARED(f,mo_num,s1,s2,N_int,det,mo_one_e_integrals,big_array_coulomb_integrals,big_array_exchange_integrals) &
  !$OMP PRIVATE(p,q,ok,i,res)&
  !$OMP DEFAULT(NONE)
  !$OMP DO collapse(1)
@ -713,13 +719,14 @@ subroutine get_fock_matrix_spin(det,s,f)
      do i = 1, mo_num
        call apply_hole(det, s1, i, res, ok, N_int)
        if (ok) then
-          f(p,q) = f(p,q) + mo_two_e_integral(p,i,q,i) - mo_two_e_integral(p,i,i,q)
+!          f(p,q) = f(p,q) + mo_two_e_integral(p,i,q,i) - mo_two_e_integral(p,i,i,q)
          f(p,q) = f(p,q) + big_array_coulomb_integrals(i,p,q) - big_array_exchange_integrals(i,p,q)
        endif
      enddo
      do i = 1, mo_num
        call apply_hole(det, s2, i, res, ok, N_int)
        if (ok) then
-          f(p,q) = f(p,q) + mo_two_e_integral(p,i,q,i)
+          f(p,q) = f(p,q) + big_array_coulomb_integrals(i,p,q)
        endif
      enddo
    enddo
--- a/src/utils_cc/update_t.irp.f
+++ b/src/utils_cc/update_t.irp.f
@ -22,7 +22,7 @@ subroutine update_t1(nO,nV,f_o,f_v,r1,t1)
  !$OMP SHARED(nO,nV,t1,r1,cc_level_shift,f_o,f_v) &
  !$OMP PRIVATE(i,a) &
  !$OMP DEFAULT(NONE)
-  !$OMP DO collapse(1)
+  !$OMP DO 
  do a = 1, nV
    do i = 1, nO
      t1(i,a) = t1(i,a) - r1(i,a) / (f_o(i) - f_v(a) - cc_level_shift)
@ -57,7 +57,7 @@ subroutine update_t2(nO,nV,f_o,f_v,r2,t2)
  !$OMP SHARED(nO,nV,t2,r2,cc_level_shift,f_o,f_v) &
  !$OMP PRIVATE(i,j,a,b) &
  !$OMP DEFAULT(NONE)
-  !$OMP DO collapse(3)
+  !$OMP DO 
  do b = 1, nV
    do a = 1, nV
      do j = 1, nO