diff --git a/plugins/FOBOCI/dress_simple.irp.f b/plugins/FOBOCI/dress_simple.irp.f
index 99566a8e..dfa57865 100644
--- a/plugins/FOBOCI/dress_simple.irp.f
+++ b/plugins/FOBOCI/dress_simple.irp.f
@@ -207,16 +207,16 @@ subroutine dress_H_matrix_from_psi_det_input(psi_det_generators_input,Ndet_gener
 
  call lapack_diagd(eigvalues,eigvectors,dressed_H_matrix,Ndet_generators,Ndet_generators)  ! Diagonalize the Dressed_H_matrix
  
- double precision :: s2,E_ref(N_states)
+ double precision :: s2(N_det_generators),E_ref(N_states)
  integer :: i_state(N_states)
  integer :: n_state_good
  n_state_good = 0
  if(s2_eig)then
+  call u_0_S2_u_0_nstates(s2,eigvectors,Ndet_generators,psi_det_generators_input,N_int,N_det_generators,size(eigvectors,1))
   do i = 1, Ndet_generators
-    call get_s2_u0(psi_det_generators_input,eigvectors(1,i),Ndet_generators,Ndet_generators,s2)
-    print*,'s2 = ',s2
-    print*,dabs(s2-expected_s2)
-    if(dabs(s2-expected_s2).le.0.3d0)then
+    print*,'s2 = ',s2(i)
+    print*,dabs(s2(i)-expected_s2)
+    if(dabs(s2(i)-expected_s2).le.0.3d0)then
      n_state_good +=1
      i_state(n_state_good) = i
      E_ref(n_state_good) = eigvalues(i)
@@ -274,7 +274,6 @@ subroutine dress_H_matrix_from_psi_det_input(psi_det_generators_input,Ndet_gener
  integer :: i_good_state(0:N_states)
  i_good_state(0) = 0
   do i = 1, Ndet_generators
-    call get_s2_u0(psi_det_generators_input,eigvectors(1,i),Ndet_generators,Ndet_generators,s2)
     ! State following
     do k = 1, N_states 
      accu = 0.d0
diff --git a/plugins/Full_CI/micro_pt2.irp.f b/plugins/Full_CI/micro_pt2.irp.f
deleted file mode 100644
index 4ce7c4be..00000000
--- a/plugins/Full_CI/micro_pt2.irp.f
+++ /dev/null
@@ -1,61 +0,0 @@
-program micro_pt2
-  implicit none
-  BEGIN_DOC
-! Helper program to compute the PT2 in distributed mode.
-  END_DOC
-
-  read_wf = .False.
-  SOFT_TOUCH read_wf
-  call provide_everything
-  call switch_qp_run_to_master
-  call run_wf
-
-end
-
-subroutine provide_everything
-  PROVIDE H_apply_buffer_allocated mo_bielec_integrals_in_map psi_det_generators psi_coef_generators psi_det_sorted_bit psi_selectors n_det_generators n_states generators_bitmask zmq_context
-end
-
-subroutine run_wf
-  use f77_zmq
-  implicit none
-
-  integer(ZMQ_PTR), external :: new_zmq_to_qp_run_socket
-  integer(ZMQ_PTR) :: zmq_to_qp_run_socket
-  double precision :: energy(N_states_diag)
-
-  print *,  'Getting wave function'
-  zmq_context = f77_zmq_ctx_new ()
-
-  zmq_to_qp_run_socket = new_zmq_to_qp_run_socket()
-
-  ! TODO : do loop here
-  ! TODO : wait_state
-  call zmq_get_psi(zmq_to_qp_run_socket,1,energy,size(energy))
-  integer :: j,k
-  do j=1,N_states_diag
-    do k=1,N_det
-     CI_eigenvectors(k,j) = psi_coef(k,j)
-    enddo
-    call get_s2_u0(psi_det,CI_eigenvectors(1,j),N_det,size(CI_eigenvectors,1),CI_eigenvectors_s2(j))
-  enddo
-  if (.True.) then
-    do k=1,size(ci_electronic_energy)
-      ci_electronic_energy(k) = energy(k)
-    enddo
-    SOFT_TOUCH ci_electronic_energy CI_eigenvectors_s2 CI_eigenvectors
-    print *,  energy(:)
-  endif
-  call write_double(6,ci_energy,'Energy')
-  zmq_state = 'h_apply_fci_pt2'
-
-  call provide_everything
-  integer :: rc, i
-
-  print *,  'Contribution to PT2 running'
-
-  !$OMP PARALLEL PRIVATE(i)
-  i = omp_get_thread_num()
-  call H_apply_FCI_PT2_slave_tcp(i)
-  !$OMP END PARALLEL
-end
diff --git a/plugins/Full_CI_ZMQ/selection_slave.irp.f b/plugins/Full_CI_ZMQ/selection_slave.irp.f
index ff87b479..438892d3 100644
--- a/plugins/Full_CI_ZMQ/selection_slave.irp.f
+++ b/plugins/Full_CI_ZMQ/selection_slave.irp.f
@@ -58,12 +58,12 @@ subroutine update_energy(energy)
 ! Update energy when it is received from ZMQ
   END_DOC
   integer :: j,k
-    do j=1,N_states_diag
+  do j=1,N_states_diag
     do k=1,N_det
       CI_eigenvectors(k,j) = psi_coef(k,j)
     enddo
-    call get_s2_u0(psi_det,CI_eigenvectors(1,j),N_det,size(CI_eigenvectors,1),CI_eigenvectors_s2(j))
   enddo
+  call u_0_S2_u_0(CI_eigenvectors_s2,CI_eigenvectors,N_det,psi_det,N_int)
   if (.True.) then
     do k=1,size(ci_electronic_energy)
       ci_electronic_energy(k) = energy(k)
diff --git a/plugins/MRCC_Utils/mrcc_utils.irp.f b/plugins/MRCC_Utils/mrcc_utils.irp.f
index b8ed005d..210ce848 100644
--- a/plugins/MRCC_Utils/mrcc_utils.irp.f
+++ b/plugins/MRCC_Utils/mrcc_utils.irp.f
@@ -148,9 +148,8 @@ END_PROVIDER
      
      call davidson_diag_mrcc(psi_det,CI_eigenvectors_dressed,CI_electronic_energy_dressed,&
           size(CI_eigenvectors_dressed,1),N_det,N_states_diag,N_int,output_determinants,mrcc_state)
-     do j=1,N_states_diag
-       call get_s2_u0(psi_det,CI_eigenvectors_dressed(1,j),N_det,size(CI_eigenvectors_dressed,1),CI_eigenvectors_s2_dressed(j))
-     enddo
+     call u_0_S2_u_0_nstates(CI_eigenvectors_s2_dressed,CI_eigenvectors_dressed,N_det,psi_det,N_int,&
+         N_states_diag,size(CI_eigenvectors_dressed,1))
 
      
    else if (diag_algorithm == "Lapack") then
@@ -160,53 +159,96 @@ END_PROVIDER
      call lapack_diag(eigenvalues,eigenvectors,                      &
          H_matrix_dressed,size(H_matrix_dressed,1),N_det)
      CI_electronic_energy_dressed(:) = 0.d0
-     do i=1,N_det
-       CI_eigenvectors_dressed(i,1) = eigenvectors(i,1)
-     enddo
-     i_state = 0
      if (s2_eig) then
+       i_state = 0
+       allocate (s2_eigvalues(N_det))
+       allocate(index_good_state_array(N_det),good_state_array(N_det))
+       good_state_array = .False.
+       call u_0_S2_u_0_nstates(s2_eigvalues,eigenvectors,N_det,psi_det,N_int,&
+         N_det,size(eigenvectors,1))
        do j=1,N_det
-         call get_s2_u0(psi_det,eigenvectors(1,j),N_det,N_det,s2)
+         ! Select at least n_states states with S^2 values closed to "expected_s2"
          if(dabs(s2-expected_s2).le.0.5d0)then
            i_state += 1
-           do i=1,N_det
-             CI_eigenvectors_dressed(i,i_state) = eigenvectors(i,j)
-           enddo
-           CI_electronic_energy_dressed(i_state) = eigenvalues(j)
-           CI_eigenvectors_s2_dressed(i_state) = s2
+           index_good_state_array(i_state) = j
+           good_state_array(j) = .True.
          endif
-         if (i_state.ge.N_states_diag) then
+         if (i_state==N_states) then
            exit
          endif
        enddo
-     else
-       do j=1,N_states_diag
-         call get_s2_u0(psi_det,eigenvectors(1,j),N_det,N_det,s2)
-         i_state += 1
-         do i=1,N_det
-           CI_eigenvectors_dressed(i,i_state) = eigenvectors(i,j)
+       if (i_state /= 0) then
+         ! Fill the first "i_state" states that have a correct S^2 value
+         do j = 1, i_state
+           do i=1,N_det
+             CI_eigenvectors_dressed(i,j) = eigenvectors(i,index_good_state_array(j))
+           enddo
+           CI_electronic_energy_dressed(j) = eigenvalues(index_good_state_array(j))
+           CI_eigenvectors_s2_dressed(j) = s2_eigvalues(index_good_state_array(j))
          enddo
-         CI_electronic_energy_dressed(i_state) = eigenvalues(j)
-         CI_eigenvectors_s2_dressed(i_state) = s2
+         i_other_state = 0
+         do j = 1, N_det
+           if(good_state_array(j))cycle
+           i_other_state +=1
+           if(i_state+i_other_state.gt.n_states_diag)then
+             exit
+           endif
+           do i=1,N_det
+             CI_eigenvectors_dressed(i,i_state+i_other_state) = eigenvectors(i,j)
+           enddo
+           CI_electronic_energy_dressed(i_state+i_other_state) = eigenvalues(j)
+           CI_eigenvectors_s2_dressed(i_state+i_other_state) = s2_eigvalues(i_state+i_other_state)
+         enddo
+         
+       else
+         print*,''
+         print*,'!!!!!!!!   WARNING  !!!!!!!!!'
+         print*,'  Within the ',N_det,'determinants selected'
+         print*,'  and the ',N_states_diag,'states requested'
+         print*,'  We did not find any state with S^2 values close to ',expected_s2
+         print*,'  We will then set the first N_states eigenvectors of the H matrix'
+         print*,'  as the CI_eigenvectors_dressed'
+         print*,'  You should consider more states and maybe ask for s2_eig to be .True. or just enlarge the CI space'
+         print*,''
+         do j=1,min(N_states_diag,N_det)
+           do i=1,N_det
+             CI_eigenvectors_dressed(i,j) = eigenvectors(i,j)
+           enddo
+           CI_electronic_energy_dressed(j) = eigenvalues(j)
+           CI_eigenvectors_s2_dressed(j) = s2_eigvalues(j)
+         enddo
+       endif
+       deallocate(index_good_state_array,good_state_array)
+       deallocate(s2_eigvalues)
+     else
+       call u_0_S2_u_0_nstates(CI_eigenvectors_s2_dressed,eigenvectors,N_det,psi_det,N_int,&
+          min(N_det,N_states_diag),size(eigenvectors,1))
+       ! Select the "N_states_diag" states of lowest energy
+       do j=1,min(N_det,N_states_diag)
+         do i=1,N_det
+           CI_eigenvectors_dressed(i,j) = eigenvectors(i,j)
+         enddo
+         CI_electronic_energy_dressed(j) = eigenvalues(j)
        enddo
      endif
      deallocate(eigenvectors,eigenvalues)
    endif
-   
-   if(s2_eig.and.n_states_diag > 1.and. n_det >= n_states_diag)then
-      ! Diagonalizing S^2 within the "n_states_diag" states found
-      allocate(s2_eigvalues(N_states_diag))
-      call diagonalize_s2_betweenstates(psi_det,CI_eigenvectors_dressed,n_det,size(psi_det,3),size(CI_eigenvectors_dressed,1),min(n_states_diag,n_det),s2_eigvalues)
 
+   if( s2_eig.and.(n_states_diag > 1).and.(n_det >= n_states_diag) )then
+      ! Diagonalizing S^2 within the "n_states_diag" states found
+      allocate(s2_eigvalues(N_states_diag), e_array(N_states_diag))
+      call diagonalize_s2_betweenstates(psi_det,CI_eigenvectors_dressed,n_det,size(psi_det,3),size(CI_eigenvectors_dressed,1),min(n_states_diag,n_det),s2_eigvalues)
+      
       do j = 1, N_states_diag
         do i = 1, N_det
           psi_coef(i,j) = CI_eigenvectors_dressed(i,j)
         enddo
       enddo
-
+      call u_0_H_u_0_nstates(e_array,psi_coef,n_det,psi_det,N_int,N_states_diag,psi_det_size)
+     
       ! Browsing the "n_states_diag" states and getting the lowest in energy "n_states" ones that have the S^2 value
       ! closer to the "expected_s2" set as input
-
+      
       allocate(index_good_state_array(N_det),good_state_array(N_det))
       good_state_array = .False.
       i_state = 0
@@ -218,15 +260,13 @@ END_PROVIDER
         endif
       enddo
       ! Sorting the i_state good states by energy
-      allocate(e_array(i_state),iorder(i_state))
+      allocate(iorder(i_state))
       do j = 1, i_state
         do i = 1, N_det
           CI_eigenvectors_dressed(i,j) = psi_coef(i,index_good_state_array(j))
         enddo
         CI_eigenvectors_s2_dressed(j) = s2_eigvalues(index_good_state_array(j))
-        call u0_H_u_0_mrcc(e_0,CI_eigenvectors_dressed(1,j),n_det,psi_det,N_int,mrcc_state)
-        CI_electronic_energy_dressed(j) = e_0
-        e_array(j) = e_0
+        CI_electronic_energy_dressed(j) = e_array(j)
         iorder(j) = j
       enddo
       call dsort(e_array,iorder,i_state)
@@ -236,15 +276,8 @@ END_PROVIDER
         do i = 1, N_det
           CI_eigenvectors_dressed(i,j) = psi_coef(i,index_good_state_array(iorder(j)))
         enddo
-        !    call u0_H_u_0_mrcc(e_0,CI_eigenvectors_dressed(1,j),n_det,psi_det,N_int,mrcc_state)
-        !    print*,'e    = ',CI_electronic_energy_dressed(j)
-        !    print*,'<e>  = ',e_0
-        !    call get_s2_u0(psi_det,CI_eigenvectors_dressed(1,j),N_det,size(CI_eigenvectors_dressed,1),s2)
-        !    print*,'s^2  = ',CI_eigenvectors_s2_dressed(j)
-        !    print*,'<s^2>= ',s2
       enddo
-      deallocate(e_array,iorder)
-
+      
       ! Then setting the other states without any specific energy order
       i_other_state = 0
       do j = 1, N_states_diag
@@ -254,15 +287,14 @@ END_PROVIDER
           CI_eigenvectors_dressed(i,i_state + i_other_state) = psi_coef(i,j)
         enddo
         CI_eigenvectors_s2_dressed(i_state + i_other_state) = s2_eigvalues(j)
-        call u0_H_u_0_mrcc(e_0,CI_eigenvectors_dressed(1,i_state + i_other_state),n_det,psi_det,N_int,mrcc_state)
-        CI_electronic_energy_dressed(i_state + i_other_state) = e_0
+        CI_electronic_energy_dressed(i_state + i_other_state) = e_array(i_state + i_other_state)
       enddo
-      deallocate(index_good_state_array,good_state_array)
-
-      deallocate(s2_eigvalues)
-
+      deallocate(iorder,e_array,index_good_state_array,good_state_array)
+       
+     deallocate(s2_eigvalues)
+     
    endif
-
+   
 END_PROVIDER
 
 BEGIN_PROVIDER [ double precision, CI_energy_dressed, (N_states_diag) ]
diff --git a/src/Determinants/diagonalize_CI.irp.f b/src/Determinants/diagonalize_CI.irp.f
index 2a071f83..58118461 100644
--- a/src/Determinants/diagonalize_CI.irp.f
+++ b/src/Determinants/diagonalize_CI.irp.f
@@ -72,7 +72,7 @@ END_PROVIDER
      call davidson_diag(psi_det,CI_eigenvectors,CI_electronic_energy,&
          size(CI_eigenvectors,1),N_det,N_states_diag,N_int,output_determinants)
 
-     call get_s2_u0_nstates(CI_eigenvectors_s2,CI_eigenvectors,N_det,psi_det,N_int,&
+     call u_0_S2_u_0_nstates(CI_eigenvectors_s2,CI_eigenvectors,N_det,psi_det,N_int,&
          N_states_diag,size(CI_eigenvectors,1))
      
      
@@ -88,7 +88,7 @@ END_PROVIDER
        allocate (s2_eigvalues(N_det))
        allocate(index_good_state_array(N_det),good_state_array(N_det))
        good_state_array = .False.
-       call get_s2_u0_nstates(s2_eigvalues,eigenvectors,N_det,psi_det,N_int,&
+       call u_0_S2_u_0_nstates(s2_eigvalues,eigenvectors,N_det,psi_det,N_int,&
          N_det,size(eigenvectors,1))
        do j=1,N_det
          ! Select at least n_states states with S^2 values closed to "expected_s2"
@@ -145,7 +145,7 @@ END_PROVIDER
        deallocate(index_good_state_array,good_state_array)
        deallocate(s2_eigvalues)
      else
-       call get_s2_u0_nstates(CI_eigenvectors_s2,eigenvectors,N_det,psi_det,N_int,&
+       call u_0_S2_u_0_nstates(CI_eigenvectors_s2,eigenvectors,N_det,psi_det,N_int,&
           min(N_det,N_states_diag),size(eigenvectors,1))
        ! Select the "N_states_diag" states of lowest energy
        do j=1,min(N_det,N_states_diag)
@@ -169,7 +169,7 @@ END_PROVIDER
           psi_coef(i,j) = CI_eigenvectors(i,j)
         enddo
       enddo
-      call u0_H_u_0_nstates(e_array,psi_coef,n_det,psi_det,N_int,N_states_diag,psi_det_size)
+      call u_0_H_u_0_nstates(e_array,psi_coef,n_det,psi_det,N_int,N_states_diag,psi_det_size)
      
       ! Browsing the "n_states_diag" states and getting the lowest in energy "n_states" ones that have the S^2 value
       ! closer to the "expected_s2" set as input
diff --git a/src/Determinants/s2.irp.f b/src/Determinants/s2.irp.f
index 086613de..c66bc714 100644
--- a/src/Determinants/s2.irp.f
+++ b/src/Determinants/s2.irp.f
@@ -69,38 +69,86 @@ BEGIN_PROVIDER [ double precision, s2_values, (N_states) ]
 ! array of the averaged values of the S^2 operator on the various states
  END_DOC
  integer :: i
- call get_s2_u0_nstates(s2_values,psi_coef,n_det,psi_det,N_int,N_states,psi_det_size)
+ call u_0_S2_u_0_nstates(s2_values,psi_coef,n_det,psi_det,N_int,N_states,psi_det_size)
 
 END_PROVIDER
 
 
 
-subroutine get_s2_u0(psi_keys_tmp,psi_coefs_tmp,n,nmax,s2)
-  implicit none
-  use bitmasks
-  integer, intent(in)            :: n,nmax
-  integer(bit_kind), intent(in)  :: psi_keys_tmp(N_int,2,nmax)
-  double precision, intent(in)   :: psi_coefs_tmp(nmax)
-  double precision, intent(out)  :: s2
-  call get_s2_u0_nstates(s2,psi_coefs_tmp,n,psi_keys_tmp,N_int,1,nmax)
-end
-
-
-subroutine get_s2_u0_nstates(s2,u_0,n,keys_tmp,Nint,N_st,sze_8)
+subroutine u_0_S2_u_0(e_0,u_0,n,keys_tmp,Nint)
   use bitmasks
   implicit none
   BEGIN_DOC
-  ! Computes s2  = <u_0|S^2|u_0>
+  ! Computes e_0 = <u_0|S2|u_0>/<u_0|u_0>
+  !
+  ! n : number of determinants
+  !
+  END_DOC
+  integer, intent(in)            :: n,Nint
+  double precision, intent(out)  :: e_0
+  double precision, intent(in)   :: u_0(n)
+  integer(bit_kind),intent(in)   :: keys_tmp(Nint,2,n)
+  call u_0_S2_u_0_nstates(e_0,u_0,n,keys_tmp,Nint,1,n)
+end
+
+subroutine u_0_S2_u_0_nstates(e_0,u_0,n,keys_tmp,Nint,N_st,sze_8)
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Computes e_0 = <u_0|S2|u_0>/<u_0|u_0>
+  !
+  ! n : number of determinants
+  !
+  END_DOC
+  integer, intent(in)            :: n,Nint, N_st, sze_8
+  double precision, intent(out)  :: e_0(N_st)
+  double precision, intent(in)   :: u_0(sze_8,N_st)
+  integer(bit_kind),intent(in)   :: keys_tmp(Nint,2,n)
+  
+  double precision, allocatable  :: v_0(:,:)
+  double precision               :: u_dot_u,u_dot_v
+  integer :: i,j
+  allocate (v_0(sze_8,N_st))
+  
+  call S2_u_0_nstates(v_0,u_0,n,keys_tmp,Nint,N_st,sze_8)
+  do i=1,N_st
+    e_0(i) = u_dot_v(v_0(1,i),u_0(1,i),n)/u_dot_u(u_0(1,i),n) + S_z2_Sz
+  enddo
+end
+
+
+
+subroutine S2_u_0(v_0,u_0,n,keys_tmp,Nint)
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Computes v_0 = S^2|u_0>
+  !
+  ! n : number of determinants
+  !
+  END_DOC
+  integer, intent(in)            :: n,Nint
+  double precision, intent(out)  :: v_0(n)
+  double precision, intent(in)   :: u_0(n)
+  integer(bit_kind),intent(in)   :: keys_tmp(Nint,2,n)
+  call S2_u_0_nstates(v_0,u_0,n,keys_tmp,Nint,1,n)
+end
+
+subroutine S2_u_0_nstates(v_0,u_0,n,keys_tmp,Nint,N_st,sze_8)
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Computes v_0  = S^2|u_0>
   !
   ! n : number of determinants
   !
   END_DOC
   integer, intent(in)            :: N_st,n,Nint, sze_8
-  double precision, intent(out)  :: s2(N_st)
+  double precision, intent(out)  :: v_0(sze_8,N_st)
   double precision, intent(in)   :: u_0(sze_8,N_st)
   integer(bit_kind),intent(in)   :: keys_tmp(Nint,2,n)
   double precision               :: s2_tmp
-  double precision               :: s2t(N_st)
+  double precision, allocatable  :: vt(:,:)
   integer                        :: i,j,k,l, jj,ii
   integer                        :: i0, j0
   
@@ -117,15 +165,16 @@ subroutine get_s2_u0_nstates(s2,u_0,n,keys_tmp,Nint,N_st,sze_8)
   PROVIDE ref_bitmask_energy davidson_criterion
 
   allocate (shortcut(0:n+1,2), sort_idx(n,2), sorted(Nint,n,2), version(Nint,n,2))
-  s2 = 0.d0
+  v_0 = 0.d0
 
   call sort_dets_ab_v(keys_tmp, sorted(1,1,1), sort_idx(1,1), shortcut(0,1), version(1,1,1), n, Nint)
   call sort_dets_ba_v(keys_tmp, sorted(1,1,2), sort_idx(1,2), shortcut(0,2), version(1,1,2), n, Nint)
   
   !$OMP PARALLEL DEFAULT(NONE)                                       &
-      !$OMP PRIVATE(i,s2_tmp,j,k,jj,s2t,ii,sh,sh2,ni,exa,ext,org_i,org_j,endi,sorted_i,istate)&
-      !$OMP SHARED(n,u_0,keys_tmp,Nint,s2,sorted,shortcut,sort_idx,version,N_st,sze_8)
-  s2t = 0.d0
+      !$OMP PRIVATE(i,s2_tmp,j,k,jj,vt,ii,sh,sh2,ni,exa,ext,org_i,org_j,endi,sorted_i,istate)&
+      !$OMP SHARED(n,u_0,keys_tmp,Nint,v_0,sorted,shortcut,sort_idx,version,N_st,sze_8)
+  allocate(vt(sze_8,N_st))
+  vt = 0.d0
   
   !$OMP DO SCHEDULE(dynamic)
   do sh=1,shortcut(0,1)
@@ -158,7 +207,8 @@ subroutine get_s2_u0_nstates(s2,u_0,n,keys_tmp,Nint,N_st,sze_8)
           if(ext <= 4) then
             call get_s2(keys_tmp(1,1,org_i),keys_tmp(1,1,org_j),s2_tmp,Nint)
             do istate=1,N_st
-              s2t(istate)  = s2t(istate) + u_0(org_i,istate)*u_0(org_j,istate)*s2_tmp
+              vt (org_i,istate) = vt (org_i,istate) + s2_tmp*u_0(org_j,istate)
+              vt (org_j,istate) = vt (org_j,istate) + s2_tmp*u_0(org_i,istate)
             enddo
           endif
         enddo
@@ -180,7 +230,8 @@ subroutine get_s2_u0_nstates(s2,u_0,n,keys_tmp,Nint,N_st,sze_8)
         if(ext == 4) then
           call get_s2(keys_tmp(1,1,org_i),keys_tmp(1,1,org_j),s2_tmp,Nint)
           do istate=1,N_st
-            s2t(istate)  = s2t(istate) + u_0(org_i,istate)*u_0(org_j,istate)*s2_tmp
+            vt (org_i,istate) = vt (org_i,istate) + s2_tmp*u_0(org_j,istate)
+            vt (org_j,istate) = vt (org_j,istate) + s2_tmp*u_0(org_i,istate)
           enddo
         end if
       end do
@@ -190,21 +241,22 @@ subroutine get_s2_u0_nstates(s2,u_0,n,keys_tmp,Nint,N_st,sze_8)
   
   !$OMP CRITICAL
   do istate=1,N_st
-      s2(istate) = s2(istate) + 2.d0*s2t(istate)
+    do i=n,1,-1
+      v_0(i,istate) = v_0(i,istate) + vt(i,istate)
+    enddo
   enddo
   !$OMP END CRITICAL
 
+  deallocate(vt)
   !$OMP END PARALLEL
   
   do i=1,n
     call get_s2(keys_tmp(1,1,i),keys_tmp(1,1,i),s2_tmp,Nint)
     do istate=1,N_st
-      s2(istate) = s2(istate) + u_0(i,istate)*u_0(i,istate)*s2_tmp
+      v_0(i,istate) += s2_tmp * u_0(i,istate)
     enddo
   enddo
-  do istate=1,N_st
-      s2(istate) += S_z2_Sz
-  enddo
+
   deallocate (shortcut, sort_idx, sorted, version)
 end
 
@@ -214,116 +266,6 @@ end
 
 
 
-
-
-
-subroutine get_s2_u0_nstates_old(psi_keys_tmp,psi_coefs_tmp,n,nmax,s2,N_st)
-  implicit none
-  use bitmasks
-  integer, intent(in)            :: n,nmax, N_st
-  integer(bit_kind), intent(in)  :: psi_keys_tmp(N_int,2,nmax)
-  double precision, intent(in)   :: psi_coefs_tmp(nmax)
-  double precision, intent(out)  :: s2
-  double precision               :: s2_tmp
-  integer                        :: i,j,l,jj,ii
-  integer, allocatable           :: idx(:)
-  
-  integer, allocatable           :: shortcut(:), sort_idx(:)
-  integer(bit_kind), allocatable :: sorted(:,:), version(:,:)
-  integer                        :: sh, sh2, ni, exa, ext, org_i, org_j, endi, pass
-  
-  allocate (shortcut(0:n+1), sort_idx(n), sorted(N_int,n), version(N_int,n))
-  s2 = 0.d0
-  call sort_dets_ab_v(psi_keys_tmp, sorted, sort_idx, shortcut, version, n, N_int)
-  
-  PROVIDE threshold_davidson
-  !$OMP PARALLEL DEFAULT(NONE)                                       &
-      !$OMP PRIVATE(i,j,s2_tmp,sh, sh2, ni, exa, ext, org_i, org_j, endi, pass)&
-      !$OMP SHARED(n,psi_coefs_tmp,psi_keys_tmp,N_int,threshold_davidson,shortcut,sorted,sort_idx,version)&
-      !$OMP REDUCTION(+:s2)
-  
-  !$OMP DO SCHEDULE(dynamic)
-  do sh=1,shortcut(0)
-    
-    do sh2=1,sh
-      exa = 0
-      do ni=1,N_int
-        exa += popcnt(xor(version(ni,sh), version(ni,sh2)))
-      end do
-      if(exa > 2) then
-        cycle
-      end if
-      
-      do i=shortcut(sh),shortcut(sh+1)-1
-        if(sh==sh2) then
-          endi = i-1
-        else
-          endi = shortcut(sh2+1)-1
-        end if
-        
-        do j=shortcut(sh2),endi
-          ext = exa
-          do ni=1,N_int
-            ext += popcnt(xor(sorted(ni,i), sorted(ni,j)))
-          end do
-          if(ext <= 4) then
-            org_i = sort_idx(i)
-            org_j = sort_idx(j)
-            
-            if ( dabs(psi_coefs_tmp(org_j)) + dabs(psi_coefs_tmp(org_i))&
-                  > threshold_davidson ) then
-              call get_s2(psi_keys_tmp(1,1,org_i),psi_keys_tmp(1,1,org_j),s2_tmp,N_int)
-              s2 = s2 + psi_coefs_tmp(org_i)*psi_coefs_tmp(org_j)*s2_tmp
-            endif
-          end if
-        end do
-      end do
-    end do
-  enddo
-  !$OMP END DO
-  
-  !$OMP END PARALLEL
-  
-  call sort_dets_ba_v(psi_keys_tmp, sorted, sort_idx, shortcut, version, n, N_int)
-  
-  !$OMP PARALLEL DEFAULT(NONE)                                       &
-      !$OMP PRIVATE(i,j,s2_tmp,sh, sh2, ni, exa, ext, org_i, org_j, endi, pass)&
-      !$OMP SHARED(n,psi_coefs_tmp,psi_keys_tmp,N_int,threshold_davidson,shortcut,sorted,sort_idx,version)&
-      !$OMP REDUCTION(+:s2)
-  
-  !$OMP DO SCHEDULE(dynamic)
-  do sh=1,shortcut(0)
-    do i=shortcut(sh),shortcut(sh+1)-1
-      do j=shortcut(sh),i-1
-        ext = 0
-        do ni=1,N_int
-          ext += popcnt(xor(sorted(ni,i), sorted(ni,j)))
-        end do
-        if(ext == 4) then
-          org_i = sort_idx(i)
-          org_j = sort_idx(j)
-          
-          if ( dabs(psi_coefs_tmp(org_j)) + dabs(psi_coefs_tmp(org_i))&
-                > threshold_davidson ) then
-            call get_s2(psi_keys_tmp(1,1,org_i),psi_keys_tmp(1,1,org_j),s2_tmp,N_int)
-            s2 = s2 + psi_coefs_tmp(org_i)*psi_coefs_tmp(org_j)*s2_tmp
-          endif
-        end if
-      end do
-    end do
-  enddo
-  !$OMP END DO
-  
-  !$OMP END PARALLEL
-  s2 = s2+s2
-  do i=1,n
-    call get_s2(psi_keys_tmp(1,1,i),psi_keys_tmp(1,1,i),s2_tmp,N_int)
-    s2 = s2 + psi_coefs_tmp(i)*psi_coefs_tmp(i)*s2_tmp
-  enddo
-  s2 = s2 + S_z2_Sz
-  deallocate (shortcut, sort_idx, sorted, version)
-end
-
 subroutine get_uJ_s2_uI(psi_keys_tmp,psi_coefs_tmp,n,nmax_coefs,nmax_keys,s2,nstates)
   implicit none
   use bitmasks
@@ -373,9 +315,9 @@ subroutine get_uJ_s2_uI(psi_keys_tmp,psi_coefs_tmp,n,nmax_coefs,nmax_keys,s2,nst
   enddo
 end
 
-subroutine diagonalize_s2_betweenstates(keys_tmp,psi_coefs_inout,n,nmax_keys,nmax_coefs,nstates,s2_eigvalues)
+subroutine diagonalize_s2_betweenstates(keys_tmp,u_0,n,nmax_keys,nmax_coefs,nstates,s2_eigvalues)
  BEGIN_DOC
-! You enter with nstates vectors in psi_coefs_inout that may be coupled by S^2
+! You enter with nstates vectors in u_0 that may be coupled by S^2
 ! The subroutine diagonalize the S^2 operator in the basis of these states. 
 ! The vectors that you obtain in output are no more coupled by S^2, 
 ! which does not necessary mean that they are eigenfunction of S^2. 
@@ -388,11 +330,7 @@ subroutine diagonalize_s2_betweenstates(keys_tmp,psi_coefs_inout,n,nmax_keys,nma
  use bitmasks
  integer, intent(in) :: n,nmax_keys,nmax_coefs,nstates
  integer(bit_kind), intent(in) :: keys_tmp(N_int,2,nmax_keys)
- double precision, intent(inout) :: psi_coefs_inout(nmax_coefs,nstates)
-
-!integer, intent(in) :: ndets_real,ndets_keys,ndets_coefs,nstates
-!integer(bit_kind), intent(in) :: keys_tmp(N_int,2,ndets_keys)
-!double precision, intent(inout) :: psi_coefs_inout(ndets_coefs,nstates)
+ double precision, intent(inout) :: u_0(nmax_coefs,nstates)
  double precision, intent(out)   :: s2_eigvalues(nstates)
 
 
@@ -410,43 +348,37 @@ subroutine diagonalize_s2_betweenstates(keys_tmp,psi_coefs_inout,n,nmax_keys,nma
  print*,'nstates = ',nstates
  allocate(s2(nstates,nstates),overlap(nstates,nstates))
  !$OMP PARALLEL DO COLLAPSE(2) DEFAULT(NONE) SCHEDULE(dynamic) &
-     !$OMP  PRIVATE(i,j) SHARED(overlap,psi_coefs_inout,nstates,n)
+     !$OMP  PRIVATE(i,j) SHARED(overlap,u_0,nstates,n)
  do i = 1, nstates
    do j = 1, nstates
      if (i < j) then
         cycle
      else if (i == j) then
-       overlap(i,i) = u_dot_u(psi_coefs_inout(1,i),n)
+       overlap(i,i) = u_dot_u(u_0(1,i),n)
      else
-       overlap(i,j) = u_dot_v(psi_coefs_inout(1,j),psi_coefs_inout(1,i),n)
+       overlap(i,j) = u_dot_v(u_0(1,j),u_0(1,i),n)
        overlap(j,i) = overlap(i,j)
      endif
    enddo
  enddo
  !$OMP END PARALLEL DO
- call ortho_lowdin(overlap,size(overlap,1),nstates,psi_coefs_inout,size(psi_coefs_inout,1),n)
+ call ortho_lowdin(overlap,size(overlap,1),nstates,u_0,size(u_0,1),n)
 
- !$OMP PARALLEL DO COLLAPSE(2) DEFAULT(NONE) SCHEDULE(dynamic) &
-     !$OMP  PRIVATE(i,j) SHARED(overlap,psi_coefs_inout,nstates,n)
- do i = 1, nstates
-   do j = 1, nstates
-     if (i < j) then
-        cycle
-     else if (i == j) then
-       overlap(i,i) = u_dot_u(psi_coefs_inout(1,i),n)
-     else
-       overlap(i,j) = u_dot_v(psi_coefs_inout(1,j),psi_coefs_inout(1,i),n)
-       overlap(j,i) = overlap(i,j)
-     endif
-   enddo
+ double precision, allocatable :: v_0(:,:)
+ allocate ( v_0(size(u_0,1),nstates) )
+ call S2_u_0_nstates(v_0,u_0,n,keys_tmp,N_int,nstates,size(u_0,1))
+      
+ do i=1, nstates
+  do j=1,i
+    s2(j,i) = u_dot_v(u_0(1,i), v_0(1,j),n)
+    s2(i,j) = s2(j,i) 
+  enddo
  enddo
- !$OMP END PARALLEL DO
 
- call get_uJ_s2_uI(keys_tmp,psi_coefs_inout,n_det,size(psi_coefs_inout,1),size(keys_tmp,3),s2,nstates)
+! call get_uJ_s2_uI(keys_tmp,u_0,n_det,size(u_0,1),size(keys_tmp,3),s2,nstates)
  print*,'S^2 matrix in the basis of the states considered'
  do i = 1, nstates
   write(*,'(10(F10.6,X))')s2(i,:)
-  s2(i,i) = s2(i,i) 
  enddo
 
  double precision :: accu_precision_diag,accu_precision_of_diag
@@ -476,12 +408,11 @@ subroutine diagonalize_s2_betweenstates(keys_tmp,psi_coefs_inout,n,nmax_keys,nma
   s2(i,i) = s2(i,i) 
  enddo
 
- allocate(eigvalues(nstates),eigvectors(nstates,nstates))
- call lapack_diagd(eigvalues,eigvectors,s2,nstates,nstates)
+ allocate(eigvectors(nstates,nstates))
+ call lapack_diagd(s2_eigvalues,eigvectors,s2,nstates,nstates)
  print*,'Eigenvalues'
  do i = 1, nstates
-  print*,'s2 = ',eigvalues(i)
-  s2_eigvalues(i) = eigvalues(i)
+  print*,'s2 = ',s2_eigvalues(i)
  enddo
 
  allocate(psi_coefs_tmp(nmax_coefs,nstates))
@@ -490,27 +421,18 @@ subroutine diagonalize_s2_betweenstates(keys_tmp,psi_coefs_inout,n,nmax_keys,nma
   do k = 1, nstates
    coef_contract =  eigvectors(k,j)    !  <phi_k|Psi_j>
    do i = 1, n_det
-    psi_coefs_tmp(i,j) += psi_coefs_inout(i,k) * coef_contract
+    psi_coefs_tmp(i,j) += u_0(i,k) * coef_contract
    enddo
   enddo
  enddo
  do j = 1, nstates
-  accu = 0.d0
+   accu = 1.d0/u_dot_u(psi_coefs_tmp(1,j),n_det)
    do i = 1, n_det
-    accu += psi_coefs_tmp(i,j) * psi_coefs_tmp(i,j)
-   enddo
-   accu = 1.d0/dsqrt(accu)
-   do i = 1, n_det
-    psi_coefs_inout(i,j) = psi_coefs_tmp(i,j) * accu
+    u_0(i,j) = psi_coefs_tmp(i,j) * accu
    enddo
  enddo
-!call get_uJ_s2_uI(keys_tmp,psi_coefs_inout,n_det,size(psi_coefs_inout,1),size(keys_tmp,3),s2,nstates)
-!print*,'S^2 matrix in the basis of the NEW states considered'
-!do i = 1, nstates
-! write(*,'(10(F16.10,X))')s2(i,:)
-!enddo
 
- deallocate(s2,eigvalues,eigvectors,psi_coefs_tmp,overlap)
+ deallocate(s2,v_0,eigvectors,psi_coefs_tmp,overlap)
 
 end
 
diff --git a/src/Determinants/slater_rules.irp.f b/src/Determinants/slater_rules.irp.f
index 8ef69ce4..5e27253a 100644
--- a/src/Determinants/slater_rules.irp.f
+++ b/src/Determinants/slater_rules.irp.f
@@ -1634,7 +1634,7 @@ subroutine get_occ_from_key(key,occ,Nint)
   
 end
 
-subroutine u0_H_u_0(e_0,u_0,n,keys_tmp,Nint)
+subroutine u_0_H_u_0(e_0,u_0,n,keys_tmp,Nint)
   use bitmasks
   implicit none
   BEGIN_DOC
@@ -1647,10 +1647,10 @@ subroutine u0_H_u_0(e_0,u_0,n,keys_tmp,Nint)
   double precision, intent(out)  :: e_0
   double precision, intent(in)   :: u_0(n)
   integer(bit_kind),intent(in)   :: keys_tmp(Nint,2,n)
-  call u0_H_u_0_nstates(e_0,u_0,n,keys_tmp,Nint,1,n)
+  call u_0_H_u_0_nstates(e_0,u_0,n,keys_tmp,Nint,1,n)
 end
 
-subroutine u0_H_u_0_nstates(e_0,u_0,n,keys_tmp,Nint,N_st,sze_8)
+subroutine u_0_H_u_0_nstates(e_0,u_0,n,keys_tmp,Nint,N_st,sze_8)
   use bitmasks
   implicit none
   BEGIN_DOC