diff --git a/src/Davidson/diagonalization_hs2.irp.f b/src/Davidson/diagonalization_hs2.irp.f
index bf56855a..71d69e82 100644
--- a/src/Davidson/diagonalization_hs2.irp.f
+++ b/src/Davidson/diagonalization_hs2.irp.f
@@ -25,7 +25,7 @@ subroutine davidson_diag_hs2(dets_in,u_in,s2_out,dim_in,energies,sze,N_st,N_st_d
   double precision, intent(out)  :: energies(N_st_diag), s2_out(N_st_diag)
   double precision, allocatable  :: H_jj(:), S2_jj(:)
   
-  double precision               :: diag_h_mat_elem
+  double precision               :: diag_H_mat_elem, diag_S_mat_elem
   integer                        :: i
   ASSERT (N_st > 0)
   ASSERT (sze > 0)
@@ -37,10 +37,10 @@ subroutine davidson_diag_hs2(dets_in,u_in,s2_out,dim_in,energies,sze,N_st,N_st_d
   !$OMP PARALLEL DEFAULT(NONE)                                       &
       !$OMP  SHARED(sze,H_jj,S2_jj, dets_in,Nint)                    &
       !$OMP  PRIVATE(i)
-  !$OMP DO SCHEDULE(guided)
+  !$OMP DO SCHEDULE(static)
   do i=1,sze
-    H_jj(i) = diag_h_mat_elem(dets_in(1,1,i),Nint)
-    call get_s2(dets_in(1,1,i),dets_in(1,1,i),Nint,S2_jj(i))
+    H_jj(i)  = diag_H_mat_elem(dets_in(1,1,i),Nint)
+    S2_jj(i) = diag_S_mat_elem(dets_in(1,1,i),Nint)
   enddo
   !$OMP END DO 
   !$OMP END PARALLEL
@@ -209,7 +209,7 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
   do k=1,N_st_diag
     call normalize(u_in(1,k),sze)
   enddo
-  
+
   
   update_dets = 1
 
@@ -235,7 +235,7 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
       if (distributed_davidson) then
           call H_S2_u_0_nstates_zmq(W(1,shift+1),S(1,shift+1),U(1,shift+1),H_jj,S2_jj,sze,dets_in,Nint,N_st_diag,sze_8,update_dets)
       else
-          call H_S2_u_0_nstates(W(1,shift+1),S(1,shift+1),U(1,shift+1),H_jj,S2_jj,sze,dets_in,Nint,N_st_diag,sze_8)
+          call H_S2_u_0_nstates_openmp(W(1,shift+1),S(1,shift+1),U(1,shift+1),N_st_diag,sze_8)
       endif
       update_dets = 0
       
diff --git a/src/Davidson/diagonalize_CI.irp.f b/src/Davidson/diagonalize_CI.irp.f
index e1b67438..9b98ea91 100644
--- a/src/Davidson/diagonalize_CI.irp.f
+++ b/src/Davidson/diagonalize_CI.irp.f
@@ -66,7 +66,6 @@ END_PROVIDER
      call davidson_diag_HS2(psi_det,CI_eigenvectors, CI_eigenvectors_s2, &
          size(CI_eigenvectors,1),CI_electronic_energy,               &
          N_det,min(N_det,N_states),min(N_det,N_states_diag),N_int,output_determinants)
-
      
    else if (diag_algorithm == "Lapack") then
      
diff --git a/src/Davidson/u0Hu0.irp.f b/src/Davidson/u0Hu0.irp.f
index 7231611a..e59f80c0 100644
--- a/src/Davidson/u0Hu0.irp.f
+++ b/src/Davidson/u0Hu0.irp.f
@@ -613,20 +613,37 @@ end
 
 
 
-subroutine H_S2_u_0_nstates_new(v_0,s_0,N_st,sze_8)
+subroutine H_S2_u_0_nstates_openmp(v_0,s_0,u_0,N_st,sze_8)
   use bitmasks
   implicit none
   BEGIN_DOC
   ! Computes v_0 = H|u_0> and s_0 = S^2 |u_0>
   !
-  ! n : number of determinants
-  !
-  ! H_jj : array of <j|H|j>
-  !
-  ! S2_jj : array of <j|S^2|j>
+  ! Assumes that the determinants are in psi_det
   END_DOC
   integer, intent(in)            :: N_st,sze_8
-  double precision, intent(out)  :: v_0(sze_8,N_st), s_0(sze_8,N_st)
+  double precision, intent(inout)  :: v_0(sze_8,N_st), s_0(sze_8,N_st), u_0(sze_8,N_st)
+  integer :: k
+  do k=1,N_st
+    call dset_order(u_0(1,k),psi_bilinear_matrix_order,N_det)
+  enddo
+  call H_S2_u_0_nstates_bilinear_order(v_0,s_0,u_0,N_st,sze_8)
+  do k=1,N_st
+    call dset_order(v_0(1,k),psi_bilinear_matrix_order_reverse,N_det)
+    call dset_order(s_0(1,k),psi_bilinear_matrix_order_reverse,N_det)
+    call dset_order(u_0(1,k),psi_bilinear_matrix_order_reverse,N_det)
+  enddo
+
+end
+
+subroutine H_S2_u_0_nstates_bilinear_order(v_0,s_0,u_0,N_st,sze_8)
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Computes v_0 = H|u_0> and s_0 = S^2 |u_0>
+  END_DOC
+  integer, intent(in)            :: N_st,sze_8
+  double precision, intent(out)  :: v_0(sze_8,N_st), s_0(sze_8,N_st), u_0(sze_8,N_st)
 
   
   PROVIDE ref_bitmask_energy
@@ -643,7 +660,6 @@ subroutine H_S2_u_0_nstates_new(v_0,s_0,N_st,sze_8)
   integer(bit_kind)              :: tmp_det2(N_int,2)
   integer(bit_kind)              :: tmp_det3(N_int,2)
   integer(bit_kind), allocatable :: buffer(:,:)
-  double precision               :: ck(N_st), cl(N_st), cm(N_st)
   integer                        :: n_singles, n_doubles
   integer, allocatable           :: singles(:), doubles(:)
   integer, allocatable           :: singles_a(:,:), singles_b(:,:)
@@ -651,18 +667,33 @@ subroutine H_S2_u_0_nstates_new(v_0,s_0,N_st,sze_8)
   logical, allocatable           :: is_single_a(:)
   logical, allocatable           :: is_single_b(:)
   integer                        :: maxab, n_singles_max
+  double precision, allocatable  :: u_t(:,:), v_t(:,:), s_t(:,:)
 
   maxab = max(N_det_alpha_unique, N_det_beta_unique)
   allocate( buffer(N_int,maxab),                                     &
       singles(maxab), doubles(maxab),                                &
       is_single_a(N_det_alpha_unique),                               &
       is_single_b(N_det_beta_unique),                                &
-      idx(maxab), idx0(maxab))
+      idx(maxab), idx0(maxab),                                       &
+      u_t(N_st,N_det), v_t(N_st,N_det), s_t(N_st,N_det) )
   
   do i=1,maxab
     idx0(i) = i
   enddo
 
+!  call dtranspose(                                                   &
+!      u_0,                                                           &
+!      size(u_0, 1),                                                  &
+!      u_t,                                                           &
+!      size(u_t, 1),                                                  &
+!      N_det, N_st)
+!
+  do k=1,N_det
+    do l=1,N_st
+     u_t(l,k) = u_0(k,l)
+    enddo
+  enddo
+
   ! Prepare the array of all alpha single excitations
   ! -------------------------------------------------
 
@@ -683,8 +714,8 @@ subroutine H_S2_u_0_nstates_new(v_0,s_0,N_st,sze_8)
         singles_a(1,i), singles_a(0,i))
   enddo
 
-  v_0 = 0.d0
-  s_0 = 0.d0
+  v_t = 0.d0
+  s_t = 0.d0
   do k_a=1,N_det
     
     ! Initial determinant is at k_a in alpha-major representation
@@ -699,21 +730,8 @@ subroutine H_S2_u_0_nstates_new(v_0,s_0,N_st,sze_8)
     ! Initial determinant is at k_b in beta-major representation
     ! ----------------------------------------------------------------------
     
-    k_b = psi_bilinear_matrix_order_reverse(k_a)
+    k_b = psi_bilinear_matrix_order_transp_reverse(k_a)
 
-    ! Diagonal contribution
-    ! ---------------------
-
-    double precision, external :: diag_H_mat_elem, diag_S_mat_elem
-  
-    hij = diag_H_mat_elem(tmp_det,N_int) 
-    sij = diag_S_mat_elem(tmp_det,N_int)
-    do l=1,N_st
-      v_0(k_a,l) = v_0(k_a,l) + hij * psi_bilinear_matrix_values(k_a,l)
-      s_0(k_a,l) = s_0(k_a,l) + sij * psi_bilinear_matrix_values(k_a,l)
-    enddo
-
-    
     ! Get all single and double alpha excitations
     ! ===========================================
     
@@ -721,7 +739,7 @@ subroutine H_S2_u_0_nstates_new(v_0,s_0,N_st,sze_8)
     
     ! Loop inside the beta column to gather all the connected alphas
     i=1
-    l_a = k_a+1
+    l_a = k_a
     lcol = psi_bilinear_matrix_columns(l_a)
     do while (lcol == kcol)
       lrow = psi_bilinear_matrix_rows(l_a)
@@ -752,8 +770,8 @@ subroutine H_S2_u_0_nstates_new(v_0,s_0,N_st,sze_8)
       tmp_det2(1:N_int,1) = psi_det_alpha_unique(1:N_int, lrow)
       call i_H_j_mono_spin( tmp_det, tmp_det2, N_int, 1, hij)
       do l=1,N_st
-        v_0(l_a, l) += hij * psi_bilinear_matrix_values(k_a,l)
-        v_0(k_a, l) += hij * psi_bilinear_matrix_values(l_a,l)
+        v_t(l,l_a) += hij * u_t(l,k_a)
+        v_t(l,k_a) += hij * u_t(l,l_a)
         ! single => sij = 0 
       enddo
     enddo
@@ -770,8 +788,8 @@ subroutine H_S2_u_0_nstates_new(v_0,s_0,N_st,sze_8)
       enddo
       call i_H_j_double_spin( tmp_det(1,1), psi_det_alpha_unique(1, doubles(i)), N_int, hij)
       do l=1,N_st
-        v_0(l_a, l) += hij * psi_bilinear_matrix_values(k_a,l)
-        v_0(k_a, l) += hij * psi_bilinear_matrix_values(l_a,l)
+        v_t(l,l_a) += hij * u_t(l,k_a)
+        v_t(l,k_a) += hij * u_t(l,l_a)
         ! same spin => sij = 0 
       enddo
     enddo
@@ -785,7 +803,8 @@ subroutine H_S2_u_0_nstates_new(v_0,s_0,N_st,sze_8)
     
     ! Loop inside the alpha row to gather all the connected betas
     i=1
-    l_b = k_b+1
+    l_b = k_b
+
     lrow = psi_bilinear_matrix_transp_rows(l_b)
     do while (lrow == krow)
       lcol = psi_bilinear_matrix_transp_columns(l_b)
@@ -817,8 +836,8 @@ subroutine H_S2_u_0_nstates_new(v_0,s_0,N_st,sze_8)
       l_a = psi_bilinear_matrix_transp_order(l_b)
       call i_H_j_mono_spin( tmp_det, tmp_det2, N_int, 2, hij)
       do l=1,N_st
-        v_0(l_a, l) += hij * psi_bilinear_matrix_values(k_a,l)
-        v_0(k_a, l) += hij * psi_bilinear_matrix_values(l_a,l)
+        v_t(l,l_a) += hij * u_t(l,k_a)
+        v_t(l,k_a) += hij * u_t(l,l_a)
         ! single => sij = 0 
       enddo
     enddo
@@ -836,8 +855,8 @@ subroutine H_S2_u_0_nstates_new(v_0,s_0,N_st,sze_8)
       l_a = psi_bilinear_matrix_transp_order(l_b)
       call i_H_j_double_spin( tmp_det(1,2), psi_det_beta_unique(1, doubles(i)), N_int, hij)
       do l=1,N_st
-        v_0(l_a, l) += hij * psi_bilinear_matrix_values(k_a,l)
-        v_0(k_a, l) += hij * psi_bilinear_matrix_values(l_a,l)
+        v_t(l,l_a) += hij * u_t(l,k_a)
+        v_t(l,k_a) += hij * u_t(l,l_a)
         ! same spin => sij = 0 
       enddo
     enddo
@@ -892,12 +911,13 @@ subroutine H_S2_u_0_nstates_new(v_0,s_0,N_st,sze_8)
         if (is_single_a(lrow)) then
           tmp_det2(1:N_int,1) = psi_det_alpha_unique(1:N_int, lrow)
 
-          call i_H_j_double_alpha_beta(tmp_det,tmp_det2,N_int,hij,sij)
+          call i_H_j_double_alpha_beta(tmp_det,tmp_det2,N_int,hij)
+          call get_s2(tmp_det,tmp_det2,N_int,sij)
           do l=1,N_st
-            v_0(k_a, l) += hij * psi_bilinear_matrix_values(l_a,l)
-            v_0(l_a, l) += hij * psi_bilinear_matrix_values(k_a,l)
-            s_0(k_a, l) -= sij * psi_bilinear_matrix_values(l_a,l)
-            s_0(l_a, l) -= sij * psi_bilinear_matrix_values(k_a,l)
+            v_t(l,k_a) += hij * u_t(l,l_a)
+            v_t(l,l_a) += hij * u_t(l,k_a)
+            s_t(l,k_a) += sij * u_t(l,l_a)
+            s_t(l,l_a) += sij * u_t(l,k_a)
           enddo
         endif
         l_a += 1
@@ -905,6 +925,38 @@ subroutine H_S2_u_0_nstates_new(v_0,s_0,N_st,sze_8)
       enddo
     enddo
 
+    ! Diagonal contribution
+    ! ---------------------
+
+    double precision, external :: diag_H_mat_elem, diag_S_mat_elem
+  
+    hij = diag_H_mat_elem(tmp_det,N_int) 
+    sij = diag_S_mat_elem(tmp_det,N_int)
+    do l=1,N_st
+      v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,k_a)
+      s_t(l,k_a) = s_t(l,k_a) + sij * u_t(l,k_a)
+    enddo
+
+  enddo
+
+!  call dtranspose(                                                   &
+!      v_t,                                                           &
+!      size(v_t, 1),                                                  &
+!      v_0,                                                           &
+!      size(v_0, 1),                                                  &
+!      N_st, N_det)
+!
+!  call dtranspose(                                                   &
+!      s_t,                                                           &
+!      size(s_t, 1),                                                  &
+!      s_0,                                                           &
+!      size(s_0, 1),                                                  &
+!      N_st, N_det)
+  do k=1,N_det
+    do l=1,N_st
+     v_0(k,l) = v_t(l,k) 
+     s_0(k,l) = s_t(l,k) 
+    enddo
   enddo
 
 end
diff --git a/src/Determinants/slater_rules.irp.f b/src/Determinants/slater_rules.irp.f
index ef246e50..496b59de 100644
--- a/src/Determinants/slater_rules.irp.f
+++ b/src/Determinants/slater_rules.irp.f
@@ -2556,7 +2556,7 @@ subroutine i_H_j_double_spin(key_i,key_j,Nint,hij)
       exc(1,2), mo_integrals_map) )
 end
 
-subroutine i_H_j_double_alpha_beta(key_i,key_j,Nint,hij,phase)
+subroutine i_H_j_double_alpha_beta(key_i,key_j,Nint,hij)
   use bitmasks
   implicit none
   BEGIN_DOC
diff --git a/src/Determinants/spindeterminants.irp.f b/src/Determinants/spindeterminants.irp.f
index 783474f9..31fd3915 100644
--- a/src/Determinants/spindeterminants.irp.f
+++ b/src/Determinants/spindeterminants.irp.f
@@ -389,6 +389,7 @@ BEGIN_PROVIDER  [ double precision, psi_bilinear_matrix_values, (N_det,N_states)
 &BEGIN_PROVIDER [ integer, psi_bilinear_matrix_rows   , (N_det) ]
 &BEGIN_PROVIDER [ integer, psi_bilinear_matrix_columns, (N_det) ]
 &BEGIN_PROVIDER [ integer, psi_bilinear_matrix_order  , (N_det) ]
+&BEGIN_PROVIDER [ integer, psi_bilinear_matrix_order_reverse , (N_det) ]
 &BEGIN_PROVIDER [ integer, psi_bilinear_matrix_columns_loc, (N_det_beta_unique+1) ]
   use bitmasks
   implicit none
@@ -429,7 +430,6 @@ BEGIN_PROVIDER  [ double precision, psi_bilinear_matrix_values, (N_det,N_states)
   do l=1,N_states
     call dset_order(psi_bilinear_matrix_values(1,l),psi_bilinear_matrix_order,N_det)
   enddo
-  psi_bilinear_matrix_columns_loc(1:N_det_beta_unique) = -1
   psi_bilinear_matrix_columns_loc(1) = 1
   do k=2,N_det
     if (psi_bilinear_matrix_columns(k) == psi_bilinear_matrix_columns(k-1)) then
@@ -440,6 +440,9 @@ BEGIN_PROVIDER  [ double precision, psi_bilinear_matrix_values, (N_det,N_states)
     endif
   enddo
   psi_bilinear_matrix_columns_loc(N_det_beta_unique+1) = N_det+1
+  do k=1,N_det
+    psi_bilinear_matrix_order_reverse(psi_bilinear_matrix_order(k)) = k
+  enddo
   deallocate(to_sort)
 END_PROVIDER
 
@@ -448,7 +451,7 @@ BEGIN_PROVIDER  [ double precision, psi_bilinear_matrix_transp_values, (N_det,N_
 &BEGIN_PROVIDER [ integer, psi_bilinear_matrix_transp_rows   , (N_det) ]
 &BEGIN_PROVIDER [ integer, psi_bilinear_matrix_transp_columns, (N_det) ]
 &BEGIN_PROVIDER [ integer, psi_bilinear_matrix_transp_order  , (N_det) ]
-&BEGIN_PROVIDER [ integer, psi_bilinear_matrix_order_reverse , (N_det) ]
+&BEGIN_PROVIDER [ integer, psi_bilinear_matrix_order_transp_reverse , (N_det) ]
   use bitmasks
   implicit none
   BEGIN_DOC
@@ -487,7 +490,7 @@ BEGIN_PROVIDER  [ double precision, psi_bilinear_matrix_transp_values, (N_det,N_
     call dset_order(psi_bilinear_matrix_transp_values(1,l),psi_bilinear_matrix_transp_order,N_det)
   enddo
   do k=1,N_det
-    psi_bilinear_matrix_order_reverse(psi_bilinear_matrix_transp_order(k)) = k
+    psi_bilinear_matrix_order_transp_reverse(psi_bilinear_matrix_transp_order(k)) = k
   enddo
   deallocate(to_sort)
 END_PROVIDER
@@ -1387,3 +1390,18 @@ subroutine get_all_spin_doubles_3(buffer, idx, spindet, size_buffer, doubles, n_
   
 end
 
+subroutine copy_psi_bilinear_to_psi(psi, isize)
+  implicit none
+  BEGIN_DOC
+! Overwrites psi_det and psi_coef with the wf in bilinear order
+  END_DOC
+  integer, intent(in)            :: isize
+  integer(bit_kind), intent(out) :: psi(N_int,2,isize)
+  integer                        :: i,j,k,l
+  do k=1,isize
+    i = psi_bilinear_matrix_rows(k)
+    j = psi_bilinear_matrix_columns(k)
+    psi(1:N_int,1,k) = psi_det_alpha_unique(1:N_int,i)
+    psi(1:N_int,2,k) = psi_det_beta_unique(1:N_int,j)
+  enddo
+end
diff --git a/src/Utils/sort.irp.f b/src/Utils/sort.irp.f
index dc91ab3a..04e71208 100644
--- a/src/Utils/sort.irp.f
+++ b/src/Utils/sort.irp.f
@@ -158,6 +158,34 @@ BEGIN_TEMPLATE
   
  end subroutine heap_$Xsort_big
 
+ subroutine sorted_$Xnumber(x,isize,n)
+  implicit none
+  BEGIN_DOC
+! Returns the number of sorted elements
+  END_DOC
+  integer, intent(in)            :: isize
+  $type, intent(inout)           :: x(isize)
+  integer, intent(out)           :: n
+  integer :: i
+  if (isize < 2) then
+    n = 1
+    return
+  endif
+
+  if (x(1) > x(2)) then
+    n=1
+  else
+    n=0
+  endif
+
+  do i=2,isize
+    if (x(i-1) > x(i)) then
+      n=n+1
+    endif
+  enddo
+
+ end
+
  subroutine $Xsort(x,iorder,isize)
   implicit none
   BEGIN_DOC
@@ -168,10 +196,16 @@ BEGIN_TEMPLATE
   integer,intent(in)             :: isize
   $type,intent(inout)            :: x(isize)
   integer,intent(inout)          :: iorder(isize)
+  integer                        :: n
   if (isize < 32) then
     call insertion_$Xsort(x,iorder,isize)
   else
-    call heap_$Xsort(x,iorder,isize)
+!    call sorted_$Xnumber(x,isize,n)
+!    if ( (16*n) / isize > 0) then
+!      call insertion_$Xsort(x,iorder,isize)
+!    else
+      call heap_$Xsort(x,iorder,isize)
+!    endif
   endif
  end subroutine $Xsort