diff --git a/config/sse4_avx2.cfg b/config/sse4_avx2.cfg
new file mode 100644
index 00000000..eabf75a3
--- /dev/null
+++ b/config/sse4_avx2.cfg
@@ -0,0 +1,62 @@
+# Common flags
+##############
+#
+# -mkl=[parallel|sequential] : Use the MKL library
+# --ninja                    : Build with ninja. This flag is mandatory!
+# --align=32                 : Align all provided arrays on a 32-byte boundary
+#
+[COMMON]
+FC           : ifort
+LAPACK_LIB   : -mkl=parallel
+IRPF90       : irpf90
+IRPF90_FLAGS : --ninja --align=32
+
+# Global options
+################
+#
+# 1 : Activate
+# 0 : Deactivate
+# 
+[OPTION]
+MODE    : OPT        ; [ OPT | PROFILE | DEBUG ] : Chooses the section below
+CACHE   : 1          ; Enable cache_compile.py
+OPENMP  : 1          ; Append OpenMP flags
+
+# Optimization flags
+####################
+#
+# -axSSE4.2,AVX,CORE-AVX2    : Generate auto-dispatched code paths for SSE4.2, AVX and AVX2 processors
+# -O2                        : O3 not better than O2.
+# -ip                        : Inter-procedural optimizations
+# -ftz                       : Flushes denormal results to zero
+#
+[OPT]
+FCFLAGS  : -axSSE4.2,AVX,CORE-AVX2 -O2 -ip -ftz -g -traceback
+
+# Profiling flags
+#################
+#
+[PROFILE]
+FC       : -p -g
+FCFLAGS  : -xSSE4.2 -O2 -ip -ftz
+
+# Debugging flags
+#################
+#
+# -traceback   : Activate backtrace at runtime
+# -fpe0        : Trap all floating-point exceptions
+# -C           : Checks uninitialized variables,  array subscripts, etc...
+# -g           : Extra debugging information
+# -xSSE2       : Valgrind needs a very simple x86 executable
+#
+[DEBUG]
+FC      : -g -traceback
+FCFLAGS : -xSSE2 -C  -fpe0 
+
+# OpenMP flags
+#################
+#
+[OPENMP]
+FC           : -openmp
+IRPF90_FLAGS : --openmp 
+
diff --git a/plugins/Full_CI_ZMQ/fci_zmq.irp.f b/plugins/Full_CI_ZMQ/fci_zmq.irp.f
index 0577a408..892e0e4b 100644
--- a/plugins/Full_CI_ZMQ/fci_zmq.irp.f
+++ b/plugins/Full_CI_ZMQ/fci_zmq.irp.f
@@ -38,7 +38,6 @@ program fci_zmq
   
   do while (N_det < N_det_max.and.maxval(abs(pt2(1:N_st))) > pt2_max)
     n_det_before = N_det
-    ! call H_apply_FCI(pt2, norm_pert, H_pert_diag,  N_st)
     call ZMQ_selection(max(1024-N_det, N_det), pt2)
     
     PROVIDE  psi_coef
@@ -90,21 +89,21 @@ program fci_zmq
    N_det = min(N_det_max,N_det)
    touch N_det psi_det psi_coef
    call diagonalize_CI
-!    if(do_pt2_end)then
-!     print*,'Last iteration only to compute the PT2'
-!     threshold_selectors = 1.d0
-!     threshold_generators = 0.999d0
-!     call H_apply_FCI_PT2(pt2, norm_pert, H_pert_diag,  N_st)
-!  
-!     print *,  'Final step'
-!     print *,  'N_det    = ', N_det
-!     print *,  'N_states = ', N_states
-!     print *,  'PT2      = ', pt2
-!     print *,  'E        = ', CI_energy
-!     print *,  'E+PT2    = ', CI_energy+pt2
-!     print *,  '-----'
-!     call ezfio_set_full_ci_energy_pt2(CI_energy+pt2)
-!    endif
+   if(do_pt2_end)then
+     print*,'Last iteration only to compute the PT2'
+     threshold_selectors = 1.d0
+     threshold_generators = 0.9999d0
+     E_CI_before = CI_energy
+     call ZMQ_selection(1, pt2)
+     print *,  'Final step'
+     print *,  'N_det    = ', N_det
+     print *,  'N_states = ', N_states
+     print *,  'PT2      = ', pt2
+     print *,  'E        = ', E_CI_before
+     print *,  'E+PT2    = ', E_CI_before+pt2
+     print *,  '-----'
+     call ezfio_set_full_ci_energy_pt2(E_CI_before+pt2)
+   endif
    call save_wavefunction
 end
 
diff --git a/src/Determinants/davidson.irp.f b/src/Determinants/davidson.irp.f
index e7480ca2..5bc3768e 100644
--- a/src/Determinants/davidson.irp.f
+++ b/src/Determinants/davidson.irp.f
@@ -530,7 +530,7 @@ subroutine davidson_diag_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,N_st,Nint,iun
       !$OMP END PARALLEL
       
       write(iunit,'(X,I3,X,100(X,F16.10,X,E16.6))')  iter, to_print(:,1:N_st)
-      call davidson_converged(lambda,residual_norm,wall,iter,cpu,N_st,converged)
+      call davidson_converged(lambda,residual_norm,wall,iter,cpu,N_states,converged)
       if (converged) then
         exit
       endif
diff --git a/src/Determinants/s2.irp.f b/src/Determinants/s2.irp.f
index 9a60dbd9..96e342cd 100644
--- a/src/Determinants/s2.irp.f
+++ b/src/Determinants/s2.irp.f
@@ -215,54 +215,52 @@ subroutine get_s2_u0(psi_keys_tmp,psi_coefs_tmp,n,nmax,s2)
 end
 
 subroutine get_uJ_s2_uI(psi_keys_tmp,psi_coefs_tmp,n,nmax_coefs,nmax_keys,s2,nstates)
- implicit none
- use bitmasks
- integer(bit_kind), intent(in) :: psi_keys_tmp(N_int,2,nmax_keys)
- integer, intent(in) :: n,nmax_coefs,nmax_keys,nstates
- double precision, intent(in) :: psi_coefs_tmp(nmax_coefs,nstates)
- double precision, intent(out) :: s2(nstates,nstates)
- double precision :: s2_tmp,accu
- integer :: i,j,l,jj,ll,kk
- integer, allocatable           :: idx(:)
- double precision, allocatable :: tmp(:,:)
- BEGIN_DOC
- ! returns the matrix elements of S^2 "s2(i,j)" between the "nstates" states 
- ! psi_coefs_tmp(:,i) and psi_coefs_tmp(:,j)
- END_DOC
- s2 = 0.d0
- do ll = 1, nstates
-  do jj = 1, nstates
- accu = 0.d0
- !$OMP PARALLEL DEFAULT(NONE)                                         &
- !$OMP PRIVATE (i,j,kk,idx,tmp,s2_tmp) & 
- !$OMP SHARED (ll,jj,psi_keys_tmp,psi_coefs_tmp,N_int,n,nstates)      &
- !$OMP REDUCTION(+:accu)
- allocate(idx(0:n))
-   !$OMP DO SCHEDULE(dynamic)
-   do i = 1, n
-    call get_s2(psi_keys_tmp(1,1,i),psi_keys_tmp(1,1,i),s2_tmp,N_int)
-    accu += psi_coefs_tmp(i,ll) * s2_tmp * psi_coefs_tmp(i,jj)
-    call filter_connected(psi_keys_tmp,psi_keys_tmp(1,1,i),N_int,i-1,idx)
-    do kk=1,idx(0)
-     j = idx(kk)
-     call get_s2(psi_keys_tmp(1,1,i),psi_keys_tmp(1,1,j),s2_tmp,N_int)
-     accu += psi_coefs_tmp(i,ll) * s2_tmp * psi_coefs_tmp(j,jj) + psi_coefs_tmp(i,jj) * s2_tmp * psi_coefs_tmp(j,ll)
+  implicit none
+  use bitmasks
+  integer(bit_kind), intent(in)  :: psi_keys_tmp(N_int,2,nmax_keys)
+  integer, intent(in)            :: n,nmax_coefs,nmax_keys,nstates
+  double precision, intent(in)   :: psi_coefs_tmp(nmax_coefs,nstates)
+  double precision, intent(out)  :: s2(nstates,nstates)
+  double precision               :: s2_tmp,accu
+  integer                        :: i,j,l,jj,ll,kk
+  integer, allocatable           :: idx(:)
+  BEGIN_DOC
+  ! returns the matrix elements of S^2 "s2(i,j)" between the "nstates" states
+  ! psi_coefs_tmp(:,i) and psi_coefs_tmp(:,j)
+  END_DOC
+  s2 = 0.d0
+  do ll = 1, nstates
+    do jj = 1, nstates
+      accu = 0.d0
+      !$OMP PARALLEL DEFAULT(NONE)                                   &
+          !$OMP PRIVATE (i,j,kk,idx,s2_tmp)                          &
+          !$OMP SHARED (ll,jj,psi_keys_tmp,psi_coefs_tmp,N_int,n,nstates)&
+          !$OMP REDUCTION(+:accu)
+      allocate(idx(0:n))
+      !$OMP DO SCHEDULE(dynamic)
+      do i = n,1,-1   ! Better OMP scheduling
+        call get_s2(psi_keys_tmp(1,1,i),psi_keys_tmp(1,1,i),s2_tmp,N_int)
+        accu += psi_coefs_tmp(i,ll) * s2_tmp * psi_coefs_tmp(i,jj)
+        call filter_connected(psi_keys_tmp,psi_keys_tmp(1,1,i),N_int,i-1,idx)
+        do kk=1,idx(0)
+          j = idx(kk)
+          call get_s2(psi_keys_tmp(1,1,i),psi_keys_tmp(1,1,j),s2_tmp,N_int)
+          accu += psi_coefs_tmp(i,ll) * s2_tmp * psi_coefs_tmp(j,jj) + psi_coefs_tmp(i,jj) * s2_tmp * psi_coefs_tmp(j,ll)
+        enddo
+      enddo
+      !$OMP END DO
+      deallocate(idx)
+      !$OMP END PARALLEL
+      s2(ll,jj) += accu
     enddo
-   enddo
-   !$OMP END DO NOWAIT
- deallocate(idx)
- !$OMP BARRIER
- !$OMP END PARALLEL
-   s2(ll,jj) += accu
   enddo
- enddo
- do i = 1, nstates
-  do j =i+1,nstates
-   accu = 0.5d0 * (s2(i,j) + s2(j,i))
-   s2(i,j) = accu
-   s2(j,i) = accu
+  do i = 1, nstates
+    do j =i+1,nstates
+      accu = 0.5d0 * (s2(i,j) + s2(j,i))
+      s2(i,j) = accu
+      s2(j,i) = accu
+    enddo
   enddo
- enddo
 end
 
 subroutine diagonalize_s2_betweenstates(keys_tmp,psi_coefs_inout,n,nmax_keys,nmax_coefs,nstates,s2_eigvalues)