Rewrote s2

2025-02-18 15:44:18 +01:00 · 2016-09-16 23:45:03 +02:00 · 2016-09-16 23:45:03 +02:00 · 342927be90
commit 342927be90
parent 89be407d7e
7 changed files with 197 additions and 305 deletions
--- a/plugins/FOBOCI/dress_simple.irp.f
+++ b/plugins/FOBOCI/dress_simple.irp.f
@ -207,16 +207,16 @@ subroutine dress_H_matrix_from_psi_det_input(psi_det_generators_input,Ndet_gener

 call lapack_diagd(eigvalues,eigvectors,dressed_H_matrix,Ndet_generators,Ndet_generators)  ! Diagonalize the Dressed_H_matrix
 
- double precision :: s2,E_ref(N_states)
+ double precision :: s2(Ndet_generators),E_ref(N_states)  ! s2(i) = <S^2> of eigenvector i; sized by the dummy argument used by the loop below
 integer :: i_state(N_states)
 integer :: n_state_good
 n_state_good = 0
 if(s2_eig)then
+  call u_0_S2_u_0_nstates(s2,eigvectors,Ndet_generators,psi_det_generators_input,N_int,Ndet_generators,size(eigvectors,1))
  do i = 1, Ndet_generators
-    call get_s2_u0(psi_det_generators_input,eigvectors(1,i),Ndet_generators,Ndet_generators,s2)
-    print*,'s2 = ',s2
-    print*,dabs(s2-expected_s2)
-    if(dabs(s2-expected_s2).le.0.3d0)then
+    print*,'s2 = ',s2(i)
+    print*,dabs(s2(i)-expected_s2)
+    if(dabs(s2(i)-expected_s2).le.0.3d0)then
     n_state_good +=1
     i_state(n_state_good) = i
     E_ref(n_state_good) = eigvalues(i)
@ -274,7 +274,6 @@ subroutine dress_H_matrix_from_psi_det_input(psi_det_generators_input,Ndet_gener
 integer :: i_good_state(0:N_states)
 i_good_state(0) = 0
  do i = 1, Ndet_generators
-    call get_s2_u0(psi_det_generators_input,eigvectors(1,i),Ndet_generators,Ndet_generators,s2)
    ! State following
    do k = 1, N_states 
     accu = 0.d0
--- a/plugins/Full_CI/micro_pt2.irp.f
+++ b/plugins/Full_CI/micro_pt2.irp.f
@ -1,61 +0,0 @@
-program micro_pt2
-  implicit none
-  BEGIN_DOC
-! Helper program to compute the PT2 in distributed mode.
-  END_DOC
-
-  read_wf = .False.
-  SOFT_TOUCH read_wf
-  call provide_everything
-  call switch_qp_run_to_master
-  call run_wf
-
-end
-
-subroutine provide_everything
-  PROVIDE H_apply_buffer_allocated mo_bielec_integrals_in_map psi_det_generators psi_coef_generators psi_det_sorted_bit psi_selectors n_det_generators n_states generators_bitmask zmq_context
-end
-
-subroutine run_wf
-  use f77_zmq
-  implicit none
-
-  integer(ZMQ_PTR), external :: new_zmq_to_qp_run_socket
-  integer(ZMQ_PTR) :: zmq_to_qp_run_socket
-  double precision :: energy(N_states_diag)
-
-  print *,  'Getting wave function'
-  zmq_context = f77_zmq_ctx_new ()
-
-  zmq_to_qp_run_socket = new_zmq_to_qp_run_socket()
-
-  ! TODO : do loop here
-  ! TODO : wait_state
-  call zmq_get_psi(zmq_to_qp_run_socket,1,energy,size(energy))
-  integer :: j,k
-  do j=1,N_states_diag
-    do k=1,N_det
-     CI_eigenvectors(k,j) = psi_coef(k,j)
-    enddo
-    call get_s2_u0(psi_det,CI_eigenvectors(1,j),N_det,size(CI_eigenvectors,1),CI_eigenvectors_s2(j))
-  enddo
-  if (.True.) then
-    do k=1,size(ci_electronic_energy)
-      ci_electronic_energy(k) = energy(k)
-    enddo
-    SOFT_TOUCH ci_electronic_energy CI_eigenvectors_s2 CI_eigenvectors
-    print *,  energy(:)
-  endif
-  call write_double(6,ci_energy,'Energy')
-  zmq_state = 'h_apply_fci_pt2'
-
-  call provide_everything
-  integer :: rc, i
-
-  print *,  'Contribution to PT2 running'
-
-  !$OMP PARALLEL PRIVATE(i)
-  i = omp_get_thread_num()
-  call H_apply_FCI_PT2_slave_tcp(i)
-  !$OMP END PARALLEL
-end
--- a/plugins/Full_CI_ZMQ/selection_slave.irp.f
+++ b/plugins/Full_CI_ZMQ/selection_slave.irp.f
@ -62,8 +62,8 @@ subroutine update_energy(energy)
    do k=1,N_det
      CI_eigenvectors(k,j) = psi_coef(k,j)
    enddo
-    call get_s2_u0(psi_det,CI_eigenvectors(1,j),N_det,size(CI_eigenvectors,1),CI_eigenvectors_s2(j))
  enddo
+  call u_0_S2_u_0_nstates(CI_eigenvectors_s2,CI_eigenvectors,N_det,psi_det,N_int,N_states_diag,size(CI_eigenvectors,1))  ! every state; the u_0_S2_u_0 wrapper only handles column 1
  if (.True.) then
    do k=1,size(ci_electronic_energy)
      ci_electronic_energy(k) = energy(k)
--- a/plugins/MRCC_Utils/mrcc_utils.irp.f
+++ b/plugins/MRCC_Utils/mrcc_utils.irp.f
@ -148,9 +148,8 @@ END_PROVIDER
     
     call davidson_diag_mrcc(psi_det,CI_eigenvectors_dressed,CI_electronic_energy_dressed,&
          size(CI_eigenvectors_dressed,1),N_det,N_states_diag,N_int,output_determinants,mrcc_state)
-     do j=1,N_states_diag
-       call get_s2_u0(psi_det,CI_eigenvectors_dressed(1,j),N_det,size(CI_eigenvectors_dressed,1),CI_eigenvectors_s2_dressed(j))
-     enddo
+     call u_0_S2_u_0_nstates(CI_eigenvectors_s2_dressed,CI_eigenvectors_dressed,N_det,psi_det,N_int,&
+         N_states_diag,size(CI_eigenvectors_dressed,1))

     
   else if (diag_algorithm == "Lapack") then
@ -160,42 +159,84 @@ END_PROVIDER
     call lapack_diag(eigenvalues,eigenvectors,                      &
         H_matrix_dressed,size(H_matrix_dressed,1),N_det)
     CI_electronic_energy_dressed(:) = 0.d0
-     do i=1,N_det
-       CI_eigenvectors_dressed(i,1) = eigenvectors(i,1)
-     enddo
-     i_state = 0
     if (s2_eig) then
+       i_state = 0
+       allocate (s2_eigvalues(N_det))
+       allocate(index_good_state_array(N_det),good_state_array(N_det))
+       good_state_array = .False.
+       call u_0_S2_u_0_nstates(s2_eigvalues,eigenvectors,N_det,psi_det,N_int,&
+         N_det,size(eigenvectors,1))
       do j=1,N_det
-         call get_s2_u0(psi_det,eigenvectors(1,j),N_det,N_det,s2)
-         if(dabs(s2-expected_s2).le.0.5d0)then
+         ! Select at least n_states states with S^2 values close to "expected_s2" (s2_eigvalues(j) was filled by u_0_S2_u_0_nstates above)
+         if(dabs(s2_eigvalues(j)-expected_s2).le.0.5d0)then
           i_state += 1
-           do i=1,N_det
-             CI_eigenvectors_dressed(i,i_state) = eigenvectors(i,j)
-           enddo
-           CI_electronic_energy_dressed(i_state) = eigenvalues(j)
-           CI_eigenvectors_s2_dressed(i_state) = s2
+           index_good_state_array(i_state) = j
+           good_state_array(j) = .True.
         endif
-         if (i_state.ge.N_states_diag) then
+         if (i_state==N_states) then
           exit
         endif
       enddo
-     else
-       do j=1,N_states_diag
-         call get_s2_u0(psi_det,eigenvectors(1,j),N_det,N_det,s2)
-         i_state += 1
+       if (i_state /= 0) then
+         ! Fill the first "i_state" states that have a correct S^2 value
+         do j = 1, i_state
           do i=1,N_det
-           CI_eigenvectors_dressed(i,i_state) = eigenvectors(i,j)
+             CI_eigenvectors_dressed(i,j) = eigenvectors(i,index_good_state_array(j))
           enddo
-         CI_electronic_energy_dressed(i_state) = eigenvalues(j)
-         CI_eigenvectors_s2_dressed(i_state) = s2
+           CI_electronic_energy_dressed(j) = eigenvalues(index_good_state_array(j))
+           CI_eigenvectors_s2_dressed(j) = s2_eigvalues(index_good_state_array(j))
+         enddo
+         i_other_state = 0
+         do j = 1, N_det
+           if(good_state_array(j))cycle
+           i_other_state +=1
+           if(i_state+i_other_state.gt.n_states_diag)then
+             exit
+           endif
+           do i=1,N_det
+             CI_eigenvectors_dressed(i,i_state+i_other_state) = eigenvectors(i,j)
+           enddo
+           CI_electronic_energy_dressed(i_state+i_other_state) = eigenvalues(j)
+           CI_eigenvectors_s2_dressed(i_state+i_other_state) = s2_eigvalues(j)  ! S^2 of the same eigenvector j as the vector and energy stored above
+         enddo
+         
+       else
+         print*,''
+         print*,'!!!!!!!!   WARNING  !!!!!!!!!'
+         print*,'  Within the ',N_det,'determinants selected'
+         print*,'  and the ',N_states_diag,'states requested'
+         print*,'  We did not find any state with S^2 values close to ',expected_s2
+         print*,'  We will then set the first N_states eigenvectors of the H matrix'
+         print*,'  as the CI_eigenvectors_dressed'
+         print*,'  You should consider more states and maybe ask for s2_eig to be .True. or just enlarge the CI space'
+         print*,''
+         do j=1,min(N_states_diag,N_det)
+           do i=1,N_det
+             CI_eigenvectors_dressed(i,j) = eigenvectors(i,j)
+           enddo
+           CI_electronic_energy_dressed(j) = eigenvalues(j)
+           CI_eigenvectors_s2_dressed(j) = s2_eigvalues(j)
+         enddo
+       endif
+       deallocate(index_good_state_array,good_state_array)
+       deallocate(s2_eigvalues)
+     else
+       call u_0_S2_u_0_nstates(CI_eigenvectors_s2_dressed,eigenvectors,N_det,psi_det,N_int,&
+          min(N_det,N_states_diag),size(eigenvectors,1))
+       ! Select the "N_states_diag" states of lowest energy
+       do j=1,min(N_det,N_states_diag)
+         do i=1,N_det
+           CI_eigenvectors_dressed(i,j) = eigenvectors(i,j)
+         enddo
+         CI_electronic_energy_dressed(j) = eigenvalues(j)
       enddo
     endif
     deallocate(eigenvectors,eigenvalues)
   endif

-   if(s2_eig.and.n_states_diag > 1.and. n_det >= n_states_diag)then
+   if( s2_eig.and.(n_states_diag > 1).and.(n_det >= n_states_diag) )then
      ! Diagonalizing S^2 within the "n_states_diag" states found
-      allocate(s2_eigvalues(N_states_diag))
+      allocate(s2_eigvalues(N_states_diag), e_array(N_states_diag))
      call diagonalize_s2_betweenstates(psi_det,CI_eigenvectors_dressed,n_det,size(psi_det,3),size(CI_eigenvectors_dressed,1),min(n_states_diag,n_det),s2_eigvalues)
      
      do j = 1, N_states_diag
@ -203,6 +244,7 @@ END_PROVIDER
          psi_coef(i,j) = CI_eigenvectors_dressed(i,j)
        enddo
      enddo
+      call u_0_H_u_0_nstates(e_array,psi_coef,n_det,psi_det,N_int,N_states_diag,psi_det_size)
     
      ! Browsing the "n_states_diag" states and getting the lowest in energy "n_states" ones that have the S^2 value
      ! closer to the "expected_s2" set as input
@ -218,15 +260,13 @@ END_PROVIDER
        endif
      enddo
      ! Sorting the i_state good states by energy
-      allocate(e_array(i_state),iorder(i_state))
+      allocate(iorder(i_state))
      do j = 1, i_state
        do i = 1, N_det
          CI_eigenvectors_dressed(i,j) = psi_coef(i,index_good_state_array(j))
        enddo
        CI_eigenvectors_s2_dressed(j) = s2_eigvalues(index_good_state_array(j))
-        call u0_H_u_0_mrcc(e_0,CI_eigenvectors_dressed(1,j),n_det,psi_det,N_int,mrcc_state)
-        CI_electronic_energy_dressed(j) = e_0
-        e_array(j) = e_0
+        CI_electronic_energy_dressed(j) = e_array(j)
        iorder(j) = j
      enddo
      call dsort(e_array,iorder,i_state)
@ -236,14 +276,7 @@ END_PROVIDER
        do i = 1, N_det
          CI_eigenvectors_dressed(i,j) = psi_coef(i,index_good_state_array(iorder(j)))
        enddo
-        !    call u0_H_u_0_mrcc(e_0,CI_eigenvectors_dressed(1,j),n_det,psi_det,N_int,mrcc_state)
-        !    print*,'e    = ',CI_electronic_energy_dressed(j)
-        !    print*,'<e>  = ',e_0
-        !    call get_s2_u0(psi_det,CI_eigenvectors_dressed(1,j),N_det,size(CI_eigenvectors_dressed,1),s2)
-        !    print*,'s^2  = ',CI_eigenvectors_s2_dressed(j)
-        !    print*,'<s^2>= ',s2
      enddo
-      deallocate(e_array,iorder)
      
      ! Then setting the other states without any specific energy order
      i_other_state = 0
@ -254,10 +287,9 @@ END_PROVIDER
          CI_eigenvectors_dressed(i,i_state + i_other_state) = psi_coef(i,j)
        enddo
        CI_eigenvectors_s2_dressed(i_state + i_other_state) = s2_eigvalues(j)
-        call u0_H_u_0_mrcc(e_0,CI_eigenvectors_dressed(1,i_state + i_other_state),n_det,psi_det,N_int,mrcc_state)
-        CI_electronic_energy_dressed(i_state + i_other_state) = e_0
+        CI_electronic_energy_dressed(i_state + i_other_state) = e_array(i_state + i_other_state)
      enddo
-      deallocate(index_good_state_array,good_state_array)
+      deallocate(iorder,e_array,index_good_state_array,good_state_array)
       
     deallocate(s2_eigvalues)
     
--- a/src/Determinants/diagonalize_CI.irp.f
+++ b/src/Determinants/diagonalize_CI.irp.f
@ -72,7 +72,7 @@ END_PROVIDER
     call davidson_diag(psi_det,CI_eigenvectors,CI_electronic_energy,&
         size(CI_eigenvectors,1),N_det,N_states_diag,N_int,output_determinants)

-     call get_s2_u0_nstates(CI_eigenvectors_s2,CI_eigenvectors,N_det,psi_det,N_int,&
+     call u_0_S2_u_0_nstates(CI_eigenvectors_s2,CI_eigenvectors,N_det,psi_det,N_int,&
         N_states_diag,size(CI_eigenvectors,1))
     
     
@ -88,7 +88,7 @@ END_PROVIDER
       allocate (s2_eigvalues(N_det))
       allocate(index_good_state_array(N_det),good_state_array(N_det))
       good_state_array = .False.
-       call get_s2_u0_nstates(s2_eigvalues,eigenvectors,N_det,psi_det,N_int,&
+       call u_0_S2_u_0_nstates(s2_eigvalues,eigenvectors,N_det,psi_det,N_int,&
         N_det,size(eigenvectors,1))
       do j=1,N_det
         ! Select at least n_states states with S^2 values closed to "expected_s2"
@ -145,7 +145,7 @@ END_PROVIDER
       deallocate(index_good_state_array,good_state_array)
       deallocate(s2_eigvalues)
     else
-       call get_s2_u0_nstates(CI_eigenvectors_s2,eigenvectors,N_det,psi_det,N_int,&
+       call u_0_S2_u_0_nstates(CI_eigenvectors_s2,eigenvectors,N_det,psi_det,N_int,&
          min(N_det,N_states_diag),size(eigenvectors,1))
       ! Select the "N_states_diag" states of lowest energy
       do j=1,min(N_det,N_states_diag)
@ -169,7 +169,7 @@ END_PROVIDER
          psi_coef(i,j) = CI_eigenvectors(i,j)
        enddo
      enddo
-      call u0_H_u_0_nstates(e_array,psi_coef,n_det,psi_det,N_int,N_states_diag,psi_det_size)
+      call u_0_H_u_0_nstates(e_array,psi_coef,n_det,psi_det,N_int,N_states_diag,psi_det_size)
     
      ! Browsing the "n_states_diag" states and getting the lowest in energy "n_states" ones that have the S^2 value
      ! closer to the "expected_s2" set as input
--- a/src/Determinants/s2.irp.f
+++ b/src/Determinants/s2.irp.f
@ -69,38 +69,86 @@ BEGIN_PROVIDER [ double precision, s2_values, (N_states) ]
 ! array of the averaged values of the S^2 operator on the various states
 END_DOC
 integer :: i
- call get_s2_u0_nstates(s2_values,psi_coef,n_det,psi_det,N_int,N_states,psi_det_size)
+ call u_0_S2_u_0_nstates(s2_values,psi_coef,n_det,psi_det,N_int,N_states,psi_det_size)

 END_PROVIDER



-subroutine get_s2_u0(psi_keys_tmp,psi_coefs_tmp,n,nmax,s2)
-  implicit none
-  use bitmasks
-  integer, intent(in)            :: n,nmax
-  integer(bit_kind), intent(in)  :: psi_keys_tmp(N_int,2,nmax)
-  double precision, intent(in)   :: psi_coefs_tmp(nmax)
-  double precision, intent(out)  :: s2
-  call get_s2_u0_nstates(s2,psi_coefs_tmp,n,psi_keys_tmp,N_int,1,nmax)
-end
-
-
-subroutine get_s2_u0_nstates(s2,u_0,n,keys_tmp,Nint,N_st,sze_8)
+subroutine u_0_S2_u_0(e_0,u_0,n,keys_tmp,Nint)
  use bitmasks
  implicit none
  BEGIN_DOC
-  ! Computes s2  = <u_0|S^2|u_0>
+  ! Computes e_0 = <u_0|S^2|u_0>/<u_0|u_0> for a single state
+  ! by calling u_0_S2_u_0_nstates with N_st=1 and leading dimension n.
+  ! n : number of determinants (length of u_0, last extent of keys_tmp)
+  ! e_0 : scalar result; the N-state routine adds S_z2_Sz to the ratio
+  END_DOC
+  integer, intent(in)            :: n,Nint
+  double precision, intent(out)  :: e_0
+  double precision, intent(in)   :: u_0(n)
+  integer(bit_kind),intent(in)   :: keys_tmp(Nint,2,n)
+  call u_0_S2_u_0_nstates(e_0,u_0,n,keys_tmp,Nint,1,n)
+end
+
+subroutine u_0_S2_u_0_nstates(e_0,u_0,n,keys_tmp,Nint,N_st,sze_8)
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Computes e_0(i) = <u_0(:,i)|S^2|u_0(:,i)>/<u_0(:,i)|u_0(:,i)> for i = 1..N_st
+  !
+  ! n : number of determinants ; N_st : number of states ; sze_8 : leading dimension of u_0
+  !
+  END_DOC
+  integer, intent(in)            :: n,Nint, N_st, sze_8
+  double precision, intent(out)  :: e_0(N_st)
+  double precision, intent(in)   :: u_0(sze_8,N_st)
+  integer(bit_kind),intent(in)   :: keys_tmp(Nint,2,n)
+  double precision, allocatable  :: v_0(:,:)
+  double precision               :: u_dot_u,u_dot_v
+  integer :: i
+  allocate (v_0(sze_8,N_st))
+  ! v_0 receives the output of S2_u_0_nstates; the S_z2_Sz constant is added below, once per state
+  call S2_u_0_nstates(v_0,u_0,n,keys_tmp,Nint,N_st,sze_8)
+  do i=1,N_st
+    e_0(i) = u_dot_v(v_0(1,i),u_0(1,i),n)/u_dot_u(u_0(1,i),n) + S_z2_Sz
+  enddo
+  deallocate (v_0)
+end
+
+
+
+subroutine S2_u_0(v_0,u_0,n,keys_tmp,Nint)
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Computes v_0 = S^2|u_0> for a single vector
+  ! by calling S2_u_0_nstates with N_st=1 and leading dimension n.
+  ! n : number of determinants (length of u_0 and v_0, last extent of keys_tmp)
+  ! NOTE(review): the S_z2_Sz constant appears to be added only in u_0_S2_u_0_nstates, not here - confirm
+  END_DOC
+  integer, intent(in)            :: n,Nint
+  double precision, intent(out)  :: v_0(n)
+  double precision, intent(in)   :: u_0(n)
+  integer(bit_kind),intent(in)   :: keys_tmp(Nint,2,n)
+  call S2_u_0_nstates(v_0,u_0,n,keys_tmp,Nint,1,n)
+end
+
+subroutine S2_u_0_nstates(v_0,u_0,n,keys_tmp,Nint,N_st,sze_8)
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Computes v_0  = S^2|u_0>
  !
  ! n : number of determinants
  !
  END_DOC
  integer, intent(in)            :: N_st,n,Nint, sze_8
-  double precision, intent(out)  :: s2(N_st)
+  double precision, intent(out)  :: v_0(sze_8,N_st)
  double precision, intent(in)   :: u_0(sze_8,N_st)
  integer(bit_kind),intent(in)   :: keys_tmp(Nint,2,n)
  double precision               :: s2_tmp
-  double precision               :: s2t(N_st)
+  double precision, allocatable  :: vt(:,:)
  integer                        :: i,j,k,l, jj,ii
  integer                        :: i0, j0
  
@ -117,15 +165,16 @@ subroutine get_s2_u0_nstates(s2,u_0,n,keys_tmp,Nint,N_st,sze_8)
  PROVIDE ref_bitmask_energy davidson_criterion

  allocate (shortcut(0:n+1,2), sort_idx(n,2), sorted(Nint,n,2), version(Nint,n,2))
-  s2 = 0.d0
+  v_0 = 0.d0

  call sort_dets_ab_v(keys_tmp, sorted(1,1,1), sort_idx(1,1), shortcut(0,1), version(1,1,1), n, Nint)
  call sort_dets_ba_v(keys_tmp, sorted(1,1,2), sort_idx(1,2), shortcut(0,2), version(1,1,2), n, Nint)
  
  !$OMP PARALLEL DEFAULT(NONE)                                       &
-      !$OMP PRIVATE(i,s2_tmp,j,k,jj,s2t,ii,sh,sh2,ni,exa,ext,org_i,org_j,endi,sorted_i,istate)&
-      !$OMP SHARED(n,u_0,keys_tmp,Nint,s2,sorted,shortcut,sort_idx,version,N_st,sze_8)
-  s2t = 0.d0
+      !$OMP PRIVATE(i,s2_tmp,j,k,jj,vt,ii,sh,sh2,ni,exa,ext,org_i,org_j,endi,sorted_i,istate)&
+      !$OMP SHARED(n,u_0,keys_tmp,Nint,v_0,sorted,shortcut,sort_idx,version,N_st,sze_8)
+  allocate(vt(sze_8,N_st))
+  vt = 0.d0
  
  !$OMP DO SCHEDULE(dynamic)
  do sh=1,shortcut(0,1)
@ -158,7 +207,8 @@ subroutine get_s2_u0_nstates(s2,u_0,n,keys_tmp,Nint,N_st,sze_8)
          if(ext <= 4) then
            call get_s2(keys_tmp(1,1,org_i),keys_tmp(1,1,org_j),s2_tmp,Nint)
            do istate=1,N_st
-              s2t(istate)  = s2t(istate) + u_0(org_i,istate)*u_0(org_j,istate)*s2_tmp
+              vt (org_i,istate) = vt (org_i,istate) + s2_tmp*u_0(org_j,istate)
+              vt (org_j,istate) = vt (org_j,istate) + s2_tmp*u_0(org_i,istate)
            enddo
          endif
        enddo
@ -180,7 +230,8 @@ subroutine get_s2_u0_nstates(s2,u_0,n,keys_tmp,Nint,N_st,sze_8)
        if(ext == 4) then
          call get_s2(keys_tmp(1,1,org_i),keys_tmp(1,1,org_j),s2_tmp,Nint)
          do istate=1,N_st
-            s2t(istate)  = s2t(istate) + u_0(org_i,istate)*u_0(org_j,istate)*s2_tmp
+            vt (org_i,istate) = vt (org_i,istate) + s2_tmp*u_0(org_j,istate)
+            vt (org_j,istate) = vt (org_j,istate) + s2_tmp*u_0(org_i,istate)
          enddo
        end if
      end do
@ -190,21 +241,22 @@ subroutine get_s2_u0_nstates(s2,u_0,n,keys_tmp,Nint,N_st,sze_8)
  
  !$OMP CRITICAL
  do istate=1,N_st
-      s2(istate) = s2(istate) + 2.d0*s2t(istate)
+    do i=n,1,-1
+      v_0(i,istate) = v_0(i,istate) + vt(i,istate)
+    enddo
  enddo
  !$OMP END CRITICAL

+  deallocate(vt)
  !$OMP END PARALLEL
  
  do i=1,n
    call get_s2(keys_tmp(1,1,i),keys_tmp(1,1,i),s2_tmp,Nint)
    do istate=1,N_st
-      s2(istate) = s2(istate) + u_0(i,istate)*u_0(i,istate)*s2_tmp
+      v_0(i,istate) += s2_tmp * u_0(i,istate)
    enddo
  enddo
-  do istate=1,N_st
-      s2(istate) += S_z2_Sz
-  enddo
+
  deallocate (shortcut, sort_idx, sorted, version)
 end

@ -214,116 +266,6 @@ end



-
-
-
-subroutine get_s2_u0_nstates_old(psi_keys_tmp,psi_coefs_tmp,n,nmax,s2,N_st)
-  implicit none
-  use bitmasks
-  integer, intent(in)            :: n,nmax, N_st
-  integer(bit_kind), intent(in)  :: psi_keys_tmp(N_int,2,nmax)
-  double precision, intent(in)   :: psi_coefs_tmp(nmax)
-  double precision, intent(out)  :: s2
-  double precision               :: s2_tmp
-  integer                        :: i,j,l,jj,ii
-  integer, allocatable           :: idx(:)
-  
-  integer, allocatable           :: shortcut(:), sort_idx(:)
-  integer(bit_kind), allocatable :: sorted(:,:), version(:,:)
-  integer                        :: sh, sh2, ni, exa, ext, org_i, org_j, endi, pass
-  
-  allocate (shortcut(0:n+1), sort_idx(n), sorted(N_int,n), version(N_int,n))
-  s2 = 0.d0
-  call sort_dets_ab_v(psi_keys_tmp, sorted, sort_idx, shortcut, version, n, N_int)
-  
-  PROVIDE threshold_davidson
-  !$OMP PARALLEL DEFAULT(NONE)                                       &
-      !$OMP PRIVATE(i,j,s2_tmp,sh, sh2, ni, exa, ext, org_i, org_j, endi, pass)&
-      !$OMP SHARED(n,psi_coefs_tmp,psi_keys_tmp,N_int,threshold_davidson,shortcut,sorted,sort_idx,version)&
-      !$OMP REDUCTION(+:s2)
-  
-  !$OMP DO SCHEDULE(dynamic)
-  do sh=1,shortcut(0)
-    
-    do sh2=1,sh
-      exa = 0
-      do ni=1,N_int
-        exa += popcnt(xor(version(ni,sh), version(ni,sh2)))
-      end do
-      if(exa > 2) then
-        cycle
-      end if
-      
-      do i=shortcut(sh),shortcut(sh+1)-1
-        if(sh==sh2) then
-          endi = i-1
-        else
-          endi = shortcut(sh2+1)-1
-        end if
-        
-        do j=shortcut(sh2),endi
-          ext = exa
-          do ni=1,N_int
-            ext += popcnt(xor(sorted(ni,i), sorted(ni,j)))
-          end do
-          if(ext <= 4) then
-            org_i = sort_idx(i)
-            org_j = sort_idx(j)
-            
-            if ( dabs(psi_coefs_tmp(org_j)) + dabs(psi_coefs_tmp(org_i))&
-                  > threshold_davidson ) then
-              call get_s2(psi_keys_tmp(1,1,org_i),psi_keys_tmp(1,1,org_j),s2_tmp,N_int)
-              s2 = s2 + psi_coefs_tmp(org_i)*psi_coefs_tmp(org_j)*s2_tmp
-            endif
-          end if
-        end do
-      end do
-    end do
-  enddo
-  !$OMP END DO
-  
-  !$OMP END PARALLEL
-  
-  call sort_dets_ba_v(psi_keys_tmp, sorted, sort_idx, shortcut, version, n, N_int)
-  
-  !$OMP PARALLEL DEFAULT(NONE)                                       &
-      !$OMP PRIVATE(i,j,s2_tmp,sh, sh2, ni, exa, ext, org_i, org_j, endi, pass)&
-      !$OMP SHARED(n,psi_coefs_tmp,psi_keys_tmp,N_int,threshold_davidson,shortcut,sorted,sort_idx,version)&
-      !$OMP REDUCTION(+:s2)
-  
-  !$OMP DO SCHEDULE(dynamic)
-  do sh=1,shortcut(0)
-    do i=shortcut(sh),shortcut(sh+1)-1
-      do j=shortcut(sh),i-1
-        ext = 0
-        do ni=1,N_int
-          ext += popcnt(xor(sorted(ni,i), sorted(ni,j)))
-        end do
-        if(ext == 4) then
-          org_i = sort_idx(i)
-          org_j = sort_idx(j)
-          
-          if ( dabs(psi_coefs_tmp(org_j)) + dabs(psi_coefs_tmp(org_i))&
-                > threshold_davidson ) then
-            call get_s2(psi_keys_tmp(1,1,org_i),psi_keys_tmp(1,1,org_j),s2_tmp,N_int)
-            s2 = s2 + psi_coefs_tmp(org_i)*psi_coefs_tmp(org_j)*s2_tmp
-          endif
-        end if
-      end do
-    end do
-  enddo
-  !$OMP END DO
-  
-  !$OMP END PARALLEL
-  s2 = s2+s2
-  do i=1,n
-    call get_s2(psi_keys_tmp(1,1,i),psi_keys_tmp(1,1,i),s2_tmp,N_int)
-    s2 = s2 + psi_coefs_tmp(i)*psi_coefs_tmp(i)*s2_tmp
-  enddo
-  s2 = s2 + S_z2_Sz
-  deallocate (shortcut, sort_idx, sorted, version)
-end
-
 subroutine get_uJ_s2_uI(psi_keys_tmp,psi_coefs_tmp,n,nmax_coefs,nmax_keys,s2,nstates)
  implicit none
  use bitmasks
@ -373,9 +315,9 @@ subroutine get_uJ_s2_uI(psi_keys_tmp,psi_coefs_tmp,n,nmax_coefs,nmax_keys,s2,nst
  enddo
 end

-subroutine diagonalize_s2_betweenstates(keys_tmp,psi_coefs_inout,n,nmax_keys,nmax_coefs,nstates,s2_eigvalues)
+subroutine diagonalize_s2_betweenstates(keys_tmp,u_0,n,nmax_keys,nmax_coefs,nstates,s2_eigvalues)
 BEGIN_DOC
-! You enter with nstates vectors in psi_coefs_inout that may be coupled by S^2
+! You enter with nstates vectors in u_0 that may be coupled by S^2
 ! The subroutine diagonalize the S^2 operator in the basis of these states. 
 ! The vectors that you obtain in output are no more coupled by S^2, 
 ! which does not necessary mean that they are eigenfunction of S^2. 
@ -388,11 +330,7 @@ subroutine diagonalize_s2_betweenstates(keys_tmp,psi_coefs_inout,n,nmax_keys,nma
 use bitmasks
 integer, intent(in) :: n,nmax_keys,nmax_coefs,nstates
 integer(bit_kind), intent(in) :: keys_tmp(N_int,2,nmax_keys)
- double precision, intent(inout) :: psi_coefs_inout(nmax_coefs,nstates)
-
-!integer, intent(in) :: ndets_real,ndets_keys,ndets_coefs,nstates
-!integer(bit_kind), intent(in) :: keys_tmp(N_int,2,ndets_keys)
-!double precision, intent(inout) :: psi_coefs_inout(ndets_coefs,nstates)
+ double precision, intent(inout) :: u_0(nmax_coefs,nstates)
 double precision, intent(out)   :: s2_eigvalues(nstates)


@ -410,43 +348,37 @@ subroutine diagonalize_s2_betweenstates(keys_tmp,psi_coefs_inout,n,nmax_keys,nma
 print*,'nstates = ',nstates
 allocate(s2(nstates,nstates),overlap(nstates,nstates))
 !$OMP PARALLEL DO COLLAPSE(2) DEFAULT(NONE) SCHEDULE(dynamic) &
-     !$OMP  PRIVATE(i,j) SHARED(overlap,psi_coefs_inout,nstates,n)
+     !$OMP  PRIVATE(i,j) SHARED(overlap,u_0,nstates,n)
 do i = 1, nstates
   do j = 1, nstates
     if (i < j) then
        cycle
     else if (i == j) then
-       overlap(i,i) = u_dot_u(psi_coefs_inout(1,i),n)
+       overlap(i,i) = u_dot_u(u_0(1,i),n)
     else
-       overlap(i,j) = u_dot_v(psi_coefs_inout(1,j),psi_coefs_inout(1,i),n)
+       overlap(i,j) = u_dot_v(u_0(1,j),u_0(1,i),n)
       overlap(j,i) = overlap(i,j)
     endif
   enddo
 enddo
 !$OMP END PARALLEL DO
- call ortho_lowdin(overlap,size(overlap,1),nstates,psi_coefs_inout,size(psi_coefs_inout,1),n)
+ call ortho_lowdin(overlap,size(overlap,1),nstates,u_0,size(u_0,1),n)
+
+ double precision, allocatable :: v_0(:,:)
+ allocate ( v_0(size(u_0,1),nstates) )
+ call S2_u_0_nstates(v_0,u_0,n,keys_tmp,N_int,nstates,size(u_0,1))
      
- !$OMP PARALLEL DO COLLAPSE(2) DEFAULT(NONE) SCHEDULE(dynamic) &
-     !$OMP  PRIVATE(i,j) SHARED(overlap,psi_coefs_inout,nstates,n)
 do i=1, nstates
-   do j = 1, nstates
-     if (i < j) then
-        cycle
-     else if (i == j) then
-       overlap(i,i) = u_dot_u(psi_coefs_inout(1,i),n)
-     else
-       overlap(i,j) = u_dot_v(psi_coefs_inout(1,j),psi_coefs_inout(1,i),n)
-       overlap(j,i) = overlap(i,j)
-     endif
+  do j=1,i
+    s2(j,i) = u_dot_v(u_0(1,i), v_0(1,j),n)
+    s2(i,j) = s2(j,i) 
  enddo
 enddo
- !$OMP END PARALLEL DO

- call get_uJ_s2_uI(keys_tmp,psi_coefs_inout,n_det,size(psi_coefs_inout,1),size(keys_tmp,3),s2,nstates)
+! call get_uJ_s2_uI(keys_tmp,u_0,n_det,size(u_0,1),size(keys_tmp,3),s2,nstates)
 print*,'S^2 matrix in the basis of the states considered'
 do i = 1, nstates
  write(*,'(10(F10.6,X))')s2(i,:)
-  s2(i,i) = s2(i,i) 
 enddo

 double precision :: accu_precision_diag,accu_precision_of_diag
@ -476,12 +408,11 @@ subroutine diagonalize_s2_betweenstates(keys_tmp,psi_coefs_inout,n,nmax_keys,nma
  s2(i,i) = s2(i,i) 
 enddo

- allocate(eigvalues(nstates),eigvectors(nstates,nstates))
- call lapack_diagd(eigvalues,eigvectors,s2,nstates,nstates)
+ allocate(eigvectors(nstates,nstates))
+ call lapack_diagd(s2_eigvalues,eigvectors,s2,nstates,nstates)
 print*,'Eigenvalues'
 do i = 1, nstates
-  print*,'s2 = ',eigvalues(i)
-  s2_eigvalues(i) = eigvalues(i)
+  print*,'s2 = ',s2_eigvalues(i)
 enddo

 allocate(psi_coefs_tmp(nmax_coefs,nstates))
@ -490,27 +421,18 @@ subroutine diagonalize_s2_betweenstates(keys_tmp,psi_coefs_inout,n,nmax_keys,nma
  do k = 1, nstates
   coef_contract =  eigvectors(k,j)    !  <phi_k|Psi_j>
   do i = 1, n_det
-    psi_coefs_tmp(i,j) += psi_coefs_inout(i,k) * coef_contract
+    psi_coefs_tmp(i,j) += u_0(i,k) * coef_contract
   enddo
  enddo
 enddo
 do j = 1, nstates
-  accu = 0.d0
+   accu = 1.d0/dsqrt(u_dot_u(psi_coefs_tmp(1,j),n_det))  ! 1/norm, as in the removed code; u_dot_u is used as <u|u> elsewhere in this file
   do i = 1, n_det
-    accu += psi_coefs_tmp(i,j) * psi_coefs_tmp(i,j)
-   enddo
-   accu = 1.d0/dsqrt(accu)
-   do i = 1, n_det
-    psi_coefs_inout(i,j) = psi_coefs_tmp(i,j) * accu
+    u_0(i,j) = psi_coefs_tmp(i,j) * accu
   enddo
 enddo
-!call get_uJ_s2_uI(keys_tmp,psi_coefs_inout,n_det,size(psi_coefs_inout,1),size(keys_tmp,3),s2,nstates)
-!print*,'S^2 matrix in the basis of the NEW states considered'
-!do i = 1, nstates
-! write(*,'(10(F16.10,X))')s2(i,:)
-!enddo

- deallocate(s2,eigvalues,eigvectors,psi_coefs_tmp,overlap)
+ deallocate(s2,v_0,eigvectors,psi_coefs_tmp,overlap)

 end

--- a/src/Determinants/slater_rules.irp.f
+++ b/src/Determinants/slater_rules.irp.f
@ -1634,7 +1634,7 @@ subroutine get_occ_from_key(key,occ,Nint)
  
 end

-subroutine u0_H_u_0(e_0,u_0,n,keys_tmp,Nint)
+subroutine u_0_H_u_0(e_0,u_0,n,keys_tmp,Nint)
  use bitmasks
  implicit none
  BEGIN_DOC
@ -1647,10 +1647,10 @@ subroutine u0_H_u_0(e_0,u_0,n,keys_tmp,Nint)
  double precision, intent(out)  :: e_0
  double precision, intent(in)   :: u_0(n)
  integer(bit_kind),intent(in)   :: keys_tmp(Nint,2,n)
-  call u0_H_u_0_nstates(e_0,u_0,n,keys_tmp,Nint,1,n)
+  call u_0_H_u_0_nstates(e_0,u_0,n,keys_tmp,Nint,1,n)
 end

-subroutine u0_H_u_0_nstates(e_0,u_0,n,keys_tmp,Nint,N_st,sze_8)
+subroutine u_0_H_u_0_nstates(e_0,u_0,n,keys_tmp,Nint,N_st,sze_8)
  use bitmasks
  implicit none
  BEGIN_DOC