merged with scemama/master

2024-12-23 04:43:50 +01:00 · 2016-10-07 13:53:41 +02:00 · 2016-10-07 13:53:41 +02:00 · dbc7c3cb2c
commit dbc7c3cb2c
parent 1b63438d68 f7a2710f5c
35 changed files with 1672 additions and 511 deletions
--- a/ocaml/TaskServer.ml
+++ b/ocaml/TaskServer.ml
@ -547,7 +547,6 @@ let terminate program_state rep_socket =

  
 let error msg program_state rep_socket =
-    Printf.printf "%s\n%!" msg;
    Message.Error (Message.Error_msg.create msg)
    |> Message.to_string
    |> ZMQ.Socket.send rep_socket ;
--- a/ocaml/qp_create_guess.ml
+++ b/ocaml/qp_create_guess.ml
@ -0,0 +1,141 @@
+open Qputils
+open Qptypes
+open Core.Std
+
+let run ~multiplicity ezfio_file =
+  if (not (Sys.file_exists_exn ezfio_file)) then
+    failwith ("EZFIO directory "^ezfio_file^" not found");
+  Ezfio.set_file ezfio_file;
+  let d =
+    Input.Determinants_by_hand.read ()
+  in
+  let m =
+    Multiplicity.of_int multiplicity
+  in
+  let ne = 
+    Ezfio.get_electrons_elec_alpha_num () +
+    Ezfio.get_electrons_elec_beta_num () 
+    |> Elec_number.of_int
+  in
+  let alpha, beta = 
+    let (a,b) = 
+      Multiplicity.to_alpha_beta ne m 
+    in
+    (Elec_alpha_number.to_int a, Elec_beta_number.to_int b)
+  in
+  let n_open_shells =
+    alpha - beta
+  in
+  let mo_tot_num = 
+    Ezfio.get_mo_basis_mo_tot_num ()
+  in
+  let build_list_of_dets ne n_closed n_open =
+    let init = 
+      Array.create ~len:n_closed Bit.One 
+      |> Array.to_list
+    in
+    let rec set_electron accu = function
+    | 1 ->  [ Bit.One :: accu ]
+    | i -> 
+          assert (i>1);
+          let rest =
+            set_electron (Bit.Zero :: accu) (i-1)
+          in
+          (Bit.One::accu) :: rest
+    in
+    let rec extend accu = function 
+    | 0 -> List.rev accu
+    | i -> extend (Bit.Zero::accu) (i-1)
+    in
+    let rec set_n_electrons accu imax = function
+    | 0 -> []
+    | 1 -> set_electron accu imax
+    | i ->
+          assert (i>1);
+          let l =
+            set_electron accu (imax-1)
+          in
+          List.map ~f:(fun x -> set_n_electrons x (imax-1) (i-1)) l
+          |> List.concat
+    in
+    set_n_electrons init n_open ne
+          |> List.filter ~f:(fun x -> List.length x <= n_closed+n_open) 
+          |> List.map ~f:(fun x -> extend x (((mo_tot_num-1)/64+1)*64 - List.length x))
+  in
+
+  let alpha_new = 
+    (Elec_number.to_int ne + 1)/2
+  and beta_new = 
+    Elec_number.to_int ne/2
+  in
+  let l_alpha = 
+    build_list_of_dets ((alpha-beta+1)/2) beta n_open_shells
+  in 
+  let l_beta = 
+    if alpha_new = beta_new then
+      l_alpha
+    else
+      build_list_of_dets ((alpha-beta)/2)beta n_open_shells 
+  in
+
+  let n_int = 
+    Bitlist.n_int_of_mo_tot_num mo_tot_num
+  in
+  let determinants = 
+    List.map l_alpha ~f:(fun x -> List.map l_beta ~f:(fun y -> (x,y) ))
+    |> List.concat
+    |> List.map ~f:(fun pair -> Determinant.of_bitlist_couple ~n_int
+      ~alpha:(Elec_alpha_number.of_int alpha_new)
+      ~beta:(Elec_beta_number.of_int beta_new) pair )
+  in
+  let c = 
+    Array.create ~len:(List.length determinants) (Det_coef.of_float 1.)
+  in
+ 
+  determinants 
+  |> List.map ~f:(fun x -> Determinant.to_string ~mo_tot_num:(MO_number.of_int mo_tot_num) x) 
+  |> List.iter ~f:(fun x -> Printf.printf "%s\n\n%!" x);
+
+  let l =
+    List.length determinants
+  in
+  if l > 0 then
+    begin
+      let d = 
+        let s = (Float.of_int (alpha - beta)) *. 0.5 in
+        let open Input.Determinants_by_hand in
+        { d with n_int ; 
+          n_det = Det_number.of_int ~min:1 ~max:l  l;
+          expected_s2 = Positive_float.of_float (s *. (s +. 1.)) ;
+          psi_coef = c;
+          psi_det = Array.of_list determinants;
+        }
+      in
+      Input.Determinants_by_hand.write d;
+      Ezfio.set_determinants_read_wf true
+    end
+  else
+    Ezfio.set_determinants_read_wf false
+  
+
+
+let spec =
+  let open Command.Spec in
+  empty
+  +> flag "m" (required int)
+     ~doc:"int Spin multiplicity"
+  +> anon ("ezfio_file" %: string)
+
+let () = 
+  Command.basic
+  ~summary: "Quantum Package command"
+  ~readme:( fun () -> "
+Creates an open-shell multiplet initial guess\n\n" )
+  spec
+  (fun multiplicity ezfio_file () ->
+    run ~multiplicity ezfio_file
+  )
+  |> Command.run   ~version: Git.sha1   ~build_info: Git.message
+
+
+
--- a/plugins/CAS_SD/H_apply.irp.f
+++ b/plugins/CAS_SD/H_apply.irp.f
@ -3,6 +3,7 @@ BEGIN_SHELL [ /usr/bin/env python ]
 from generate_h_apply import *

 s = H_apply("CAS_SD")
+s.unset_skip()
 print s

 s = H_apply("CAS_SD_selected_no_skip")
@ -12,6 +13,7 @@ print s

 s = H_apply("CAS_SD_selected")
 s.set_selection_pt2("epstein_nesbet_2x2")
+s.unset_skip()
 print s

 s = H_apply("CAS_SD_PT2")
@ -22,13 +24,9 @@ print s
 s = H_apply("CAS_S",do_double_exc=False)
 print s

-s = H_apply("CAS_S_selected_no_skip",do_double_exc=False)
-s.set_selection_pt2("epstein_nesbet_2x2")
-s.unset_skip()
-print s
-
 s = H_apply("CAS_S_selected",do_double_exc=False)
 s.set_selection_pt2("epstein_nesbet_2x2")
+s.unset_skip()
 print s

 s = H_apply("CAS_S_PT2",do_double_exc=False)
--- a/plugins/CAS_SD/cas_s_selected.irp.f
+++ b/plugins/CAS_SD/cas_s_selected.irp.f
@ -12,6 +12,7 @@ program full_ci
  
  pt2 = 1.d0
  diag_algorithm = "Lapack"
+
  if (N_det > N_det_max) then
    call diagonalize_CI
    call save_wavefunction
@ -28,49 +29,84 @@ program full_ci
    print *,  'E+PT2    = ', CI_energy+pt2
    print *,  '-----'
  endif
+  double precision :: i_H_psi_array(N_states),diag_H_mat_elem,h,i_O1_psi_array(N_states)
+  double precision :: E_CI_before(N_states)
+  if(read_wf)then
+   call i_H_psi(psi_det(1,1,N_det),psi_det,psi_coef,N_int,N_det,psi_det_size,N_states,i_H_psi_array)
+   h = diag_H_mat_elem(psi_det(1,1,N_det),N_int)
+   selection_criterion = dabs(psi_coef(N_det,1) *  (i_H_psi_array(1) - h * psi_coef(N_det,1))) * 0.1d0
+   soft_touch selection_criterion
+  endif

+
+  integer :: n_det_before
+  print*,'Beginning the selection ...'
+  E_CI_before(1:N_states) = CI_energy(1:N_states)
  do while (N_det < N_det_max.and.maxval(abs(pt2(1:N_st))) > pt2_max)
-    call H_apply_CAS_S_selected_no_skip(pt2, norm_pert, H_pert_diag,  N_st)
+    n_det_before = N_det
+    call H_apply_CAS_SD_selected(pt2, norm_pert, H_pert_diag,  N_st)

    PROVIDE  psi_coef
    PROVIDE  psi_det
    PROVIDE  psi_det_sorted

-    if (N_det > N_det_max) then
-       psi_det = psi_det_sorted
-       psi_coef = psi_coef_sorted
-       N_det = N_det_max
-       soft_touch N_det psi_det psi_coef
-    endif
    call diagonalize_CI
+
+    if (N_det > N_det_max) then
+      N_det = N_det_max
+      psi_det = psi_det_sorted
+      psi_coef = psi_coef_sorted
+      touch N_det psi_det psi_coef psi_det_sorted psi_coef_sorted psi_average_norm_contrib_sorted
+    endif
+
+
    call save_wavefunction
-    print *,  'N_det    = ', N_det
-    print *,  'N_states = ', N_states
-    print *,  'PT2      = ', pt2
-    print *,  'E        = ', CI_energy
-    print *,  'E+PT2    = ', CI_energy+pt2
+    if(n_det_before == N_det)then
+     selection_criterion = selection_criterion * 0.5d0
+    endif
+    print *,  'N_det          = ', N_det
+    print *,  'N_states       = ', N_states
+    do  k = 1, N_states
+    print*,'State ',k
+    print *,  'PT2            = ', pt2(k)
+    print *,  'E              = ', CI_energy(k)
+    print *,  'E(before)+PT2  = ', E_CI_before(k)+pt2(k)
+    enddo
    print *,  '-----'
+    if(N_states.gt.1)then
+     print*,'Variational Energy difference'
+     do i = 2, N_states
+      print*,'Delta E = ',CI_energy(i) - CI_energy(1)
+     enddo
+    endif
+    if(N_states.gt.1)then
+     print*,'Variational + perturbative Energy difference'
+     do i = 2, N_states
+      print*,'Delta E = ',E_CI_before(i)+ pt2(i) - (E_CI_before(1) + pt2(1))
+     enddo
+    endif
+    E_CI_before(1:N_states) = CI_energy(1:N_states)
    call ezfio_set_cas_sd_energy(CI_energy(1))
  enddo
-  call diagonalize_CI
-
+   N_det = min(N_det_max,N_det)
+   touch N_det psi_det psi_coef
+   call diagonalize_CI
   if(do_pt2_end)then
    print*,'Last iteration only to compute the PT2'
    threshold_selectors = 1.d0
    threshold_generators = 0.999d0
-    call H_apply_CAS_S_PT2(pt2, norm_pert, H_pert_diag,  N_st)
+    call H_apply_CAS_SD_PT2(pt2, norm_pert, H_pert_diag,  N_st)

    print *,  'Final step'
    print *,  'N_det    = ', N_det
    print *,  'N_states = ', N_states
    print *,  'PT2      = ', pt2
-    print *,  'E        = ', CI_energy
-    print *,  'E+PT2    = ', CI_energy+pt2
+    print *,  'E        = ', CI_energy(1:N_states)
+    print *,  'E+PT2    = ', CI_energy(1:N_states)+pt2(1:N_states)
    print *,  '-----'
    call ezfio_set_cas_sd_energy_pt2(CI_energy(1)+pt2(1))
   endif

-
  integer :: exc_max, degree_min
  exc_max = 0
  print *,  'CAS determinants : ', N_det_cas
@ -79,6 +115,7 @@ program full_ci
      call get_excitation_degree(psi_cas(1,1,k),psi_cas(1,1,i),degree,N_int)
      exc_max = max(exc_max,degree)
    enddo
+    print *,  psi_coef_cas_diagonalized(i,:)
    call debug_det(psi_cas(1,1,i),N_int)
    print *,  ''
  enddo
--- a/plugins/CAS_SD/cas_sd.irp.f
+++ b/plugins/CAS_SD/cas_sd.irp.f
@ -1,7 +1,6 @@
 program full_ci
  implicit none
  integer                        :: i,k
-  integer                        :: N_det_old

  
  double precision, allocatable  :: pt2(:), norm_pert(:), H_pert_diag(:)
@ -11,9 +10,9 @@ program full_ci
  character*(64)                 :: perturbation
  PROVIDE N_det_cas
  
-  N_det_old = 0
  pt2 = 1.d0
  diag_algorithm = "Lapack"
+
  if (N_det > N_det_max) then
    call diagonalize_CI
    call save_wavefunction
@ -30,36 +29,68 @@ program full_ci
    print *,  'E+PT2    = ', CI_energy+pt2
    print *,  '-----'
  endif
+  double precision :: i_H_psi_array(N_states),diag_H_mat_elem,h,i_O1_psi_array(N_states)
+  double precision :: E_CI_before(N_states)
+  if(read_wf)then
+   call i_H_psi(psi_det(1,1,N_det),psi_det,psi_coef,N_int,N_det,psi_det_size,N_states,i_H_psi_array)
+   h = diag_H_mat_elem(psi_det(1,1,N_det),N_int)
+   selection_criterion = dabs(psi_coef(N_det,1) *  (i_H_psi_array(1) - h * psi_coef(N_det,1))) * 0.1d0
+   soft_touch selection_criterion
+  endif

+
+  integer :: n_det_before
+  print*,'Beginning the selection ...'
+  E_CI_before(1:N_states) = CI_energy(1:N_states)
  do while (N_det < N_det_max.and.maxval(abs(pt2(1:N_st))) > pt2_max)
-    N_det_old = N_det
+    n_det_before = N_det
    call H_apply_CAS_SD(pt2, norm_pert, H_pert_diag,  N_st)

    PROVIDE  psi_coef
    PROVIDE  psi_det
    PROVIDE  psi_det_sorted

-    if (N_det > N_det_max) then
-       psi_det = psi_det_sorted
-       psi_coef = psi_coef_sorted
-       N_det = N_det_max
-       soft_touch N_det psi_det psi_coef
-    endif
    call diagonalize_CI
-    call save_wavefunction
-    print *,  'N_det    = ', N_det
-    print *,  'N_states = ', N_states
-    print *,  'PT2      = ', pt2
-    print *,  'E        = ', CI_energy
-    print *,  'E+PT2    = ', CI_energy+pt2
-    print *,  '-----'
-    call ezfio_set_cas_sd_energy(CI_energy(1))
-    if (N_det == N_det_old) then
-      exit
-    endif
-  enddo
-  call diagonalize_CI

+    if (N_det > N_det_max) then
+      N_det = N_det_max
+      psi_det = psi_det_sorted
+      psi_coef = psi_coef_sorted
+      touch N_det psi_det psi_coef psi_det_sorted psi_coef_sorted psi_average_norm_contrib_sorted
+    endif
+
+
+    call save_wavefunction
+    if(n_det_before == N_det)then
+     selection_criterion = selection_criterion * 0.5d0
+    endif
+    print *,  'N_det          = ', N_det
+    print *,  'N_states       = ', N_states
+    do  k = 1, N_states
+    print*,'State ',k
+    print *,  'PT2            = ', pt2(k)
+    print *,  'E              = ', CI_energy(k)
+    print *,  'E(before)+PT2  = ', E_CI_before(k)+pt2(k)
+    enddo
+    print *,  '-----'
+    if(N_states.gt.1)then
+     print*,'Variational Energy difference'
+     do i = 2, N_states
+      print*,'Delta E = ',CI_energy(i) - CI_energy(1)
+     enddo
+    endif
+    if(N_states.gt.1)then
+     print*,'Variational + perturbative Energy difference'
+     do i = 2, N_states
+      print*,'Delta E = ',E_CI_before(i)+ pt2(i) - (E_CI_before(1) + pt2(1))
+     enddo
+    endif
+    E_CI_before(1:N_states) = CI_energy(1:N_states)
+    call ezfio_set_cas_sd_energy(CI_energy(1))
+  enddo
+   N_det = min(N_det_max,N_det)
+   touch N_det psi_det psi_coef
+   call diagonalize_CI
   if(do_pt2_end)then
    print*,'Last iteration only to compute the PT2'
    threshold_selectors = 1.d0
@ -70,13 +101,12 @@ program full_ci
    print *,  'N_det    = ', N_det
    print *,  'N_states = ', N_states
    print *,  'PT2      = ', pt2
-    print *,  'E        = ', CI_energy
-    print *,  'E+PT2    = ', CI_energy+pt2
+    print *,  'E        = ', CI_energy(1:N_states)
+    print *,  'E+PT2    = ', CI_energy(1:N_states)+pt2(1:N_states)
    print *,  '-----'
    call ezfio_set_cas_sd_energy_pt2(CI_energy(1)+pt2(1))
   endif

-
  integer :: exc_max, degree_min
  exc_max = 0
  print *,  'CAS determinants : ', N_det_cas
@ -85,6 +115,7 @@ program full_ci
      call get_excitation_degree(psi_cas(1,1,k),psi_cas(1,1,i),degree,N_int)
      exc_max = max(exc_max,degree)
    enddo
+    print *,  psi_coef_cas_diagonalized(i,:)
    call debug_det(psi_cas(1,1,i),N_int)
    print *,  ''
  enddo
--- a/plugins/CAS_SD/cas_sd_selected.irp.f
+++ b/plugins/CAS_SD/cas_sd_selected.irp.f
@ -12,6 +12,7 @@ program full_ci
  
  pt2 = 1.d0
  diag_algorithm = "Lapack"
+
  if (N_det > N_det_max) then
    call diagonalize_CI
    call save_wavefunction
@ -28,32 +29,68 @@ program full_ci
    print *,  'E+PT2    = ', CI_energy+pt2
    print *,  '-----'
  endif
+  double precision :: i_H_psi_array(N_states),diag_H_mat_elem,h,i_O1_psi_array(N_states)
+  double precision :: E_CI_before(N_states)
+  if(read_wf)then
+   call i_H_psi(psi_det(1,1,N_det),psi_det,psi_coef,N_int,N_det,psi_det_size,N_states,i_H_psi_array)
+   h = diag_H_mat_elem(psi_det(1,1,N_det),N_int)
+   selection_criterion = dabs(psi_coef(N_det,1) *  (i_H_psi_array(1) - h * psi_coef(N_det,1))) * 0.1d0
+   soft_touch selection_criterion
+  endif

+
+  integer :: n_det_before
+  print*,'Beginning the selection ...'
+  E_CI_before(1:N_states) = CI_energy(1:N_states)
  do while (N_det < N_det_max.and.maxval(abs(pt2(1:N_st))) > pt2_max)
+    n_det_before = N_det
    call H_apply_CAS_SD_selected(pt2, norm_pert, H_pert_diag,  N_st)

    PROVIDE  psi_coef
    PROVIDE  psi_det
    PROVIDE  psi_det_sorted

-    if (N_det > N_det_max) then
-       psi_det = psi_det_sorted
-       psi_coef = psi_coef_sorted
-       N_det = N_det_max
-       soft_touch N_det psi_det psi_coef
-    endif
    call diagonalize_CI
+
+    if (N_det > N_det_max) then
+      N_det = N_det_max
+      psi_det = psi_det_sorted
+      psi_coef = psi_coef_sorted
+      touch N_det psi_det psi_coef psi_det_sorted psi_coef_sorted psi_average_norm_contrib_sorted
+    endif
+
+
    call save_wavefunction
-    print *,  'N_det    = ', N_det
-    print *,  'N_states = ', N_states
-    print *,  'PT2      = ', pt2
-    print *,  'E        = ', CI_energy(1:N_states)
-    print *,  'E+PT2    = ', CI_energy(1:N_states)+pt2(1:N_states)
+    if(n_det_before == N_det)then
+     selection_criterion = selection_criterion * 0.5d0
+    endif
+    print *,  'N_det          = ', N_det
+    print *,  'N_states       = ', N_states
+    do  k = 1, N_states
+    print*,'State ',k
+    print *,  'PT2            = ', pt2(k)
+    print *,  'E              = ', CI_energy(k)
+    print *,  'E(before)+PT2  = ', E_CI_before(k)+pt2(k)
+    enddo
    print *,  '-----'
+    if(N_states.gt.1)then
+     print*,'Variational Energy difference'
+     do i = 2, N_states
+      print*,'Delta E = ',CI_energy(i) - CI_energy(1)
+     enddo
+    endif
+    if(N_states.gt.1)then
+     print*,'Variational + perturbative Energy difference'
+     do i = 2, N_states
+      print*,'Delta E = ',E_CI_before(i)+ pt2(i) - (E_CI_before(1) + pt2(1))
+     enddo
+    endif
+    E_CI_before(1:N_states) = CI_energy(1:N_states)
    call ezfio_set_cas_sd_energy(CI_energy(1))
  enddo
-  call diagonalize_CI
-
+   N_det = min(N_det_max,N_det)
+   touch N_det psi_det psi_coef
+   call diagonalize_CI
   if(do_pt2_end)then
    print*,'Last iteration only to compute the PT2'
    threshold_selectors = 1.d0
@ -70,7 +107,6 @@ program full_ci
    call ezfio_set_cas_sd_energy_pt2(CI_energy(1)+pt2(1))
   endif

-
  integer :: exc_max, degree_min
  exc_max = 0
  print *,  'CAS determinants : ', N_det_cas
@ -79,6 +115,7 @@ program full_ci
      call get_excitation_degree(psi_cas(1,1,k),psi_cas(1,1,i),degree,N_int)
      exc_max = max(exc_max,degree)
    enddo
+    print *,  psi_cas_coef(i,:)
    call debug_det(psi_cas(1,1,i),N_int)
    print *,  ''
  enddo
--- a/plugins/Full_CI/H_apply.irp.f
+++ b/plugins/Full_CI/H_apply.irp.f
@ -23,6 +23,11 @@ s.unset_skip()
 #s.unset_openmp()
 print s

+s = H_apply("FCI_no_selection")
+s.set_selection_pt2("dummy")
+s.unset_skip()
+print s
+
 s = H_apply("FCI_mono")
 s.set_selection_pt2("epstein_nesbet_2x2")
 s.unset_double_excitations()
--- a/plugins/Full_CI/full_ci.irp.f
+++ b/plugins/Full_CI/full_ci.irp.f
@ -11,7 +11,7 @@ program full_ci
  
  pt2 = 1.d0
  diag_algorithm = "Lapack"
-  
+
  if (N_det > N_det_max) then
    call diagonalize_CI
    call save_wavefunction
--- a/plugins/Full_CI_ZMQ/selection_slave.irp.f
+++ b/plugins/Full_CI_ZMQ/selection_slave.irp.f
@ -23,42 +23,55 @@ subroutine run_wf
  integer(ZMQ_PTR), external :: new_zmq_to_qp_run_socket
  integer(ZMQ_PTR) :: zmq_to_qp_run_socket
  double precision :: energy(N_states_diag)
-  character*(64) :: state
+  character*(64) :: states(2)
+  integer :: rc, i
  
  call provide_everything
  
  zmq_context = f77_zmq_ctx_new ()
-  state = 'Waiting'
+  states(1) = 'selection'
+  states(2) = 'davidson'

  zmq_to_qp_run_socket = new_zmq_to_qp_run_socket()

  do
-    call wait_for_next_state(state)
-    print *, "STATE ", trim(state)
-    if(trim(state) == "Stopped") exit
-    if(trim(state) == 'selection') then
-  zmq_state = 'selection'
-      print *,  'Getting wave function'
+
+    call wait_for_states(states,zmq_state,2)
+
+    if(trim(zmq_state) == 'Stopped') then
+
+      exit
+
+    else if (trim(zmq_state) == 'selection') then
+
+      ! Selection
+      ! ---------
+
+      print *,  'Selection'
      call zmq_get_psi(zmq_to_qp_run_socket,1,energy,N_states_diag)
- 
  
-      integer :: rc, i
- 
-      print *,  'Selection slave running'
- 
      !$OMP PARALLEL PRIVATE(i)
      i = omp_get_thread_num()
      call selection_dressing_slave_tcp(i, energy)
      !$OMP END PARALLEL
-    else if(trim(state) == "davidson") then
-  zmq_state = 'davidson'
+      print *,  'Selection done'
+
+    else if (trim(zmq_state) == 'davidson') then
+
+      ! Davidson
+      ! --------
+
+      print *,  'Davidson'
      call davidson_miniserver_get()
-      print *, "Davidson slave running"
+
      !$OMP PARALLEL PRIVATE(i)
-        i = omp_get_thread_num()
-        call davidson_slave_tcp(i)
+      i = omp_get_thread_num()
+      call davidson_slave_tcp(i)
      !$OMP END PARALLEL
-    end if
+      print *,  'Davidson done'
+
+    endif
+
  end do
 end

--- a/plugins/MRCC_CASSD/mrcc_cassd.irp.f
+++ b/plugins/MRCC_CASSD/mrcc_cassd.irp.f
@ -35,10 +35,10 @@ subroutine run(N_st,energy)
    print *,  'MRCC Iteration', iteration
    print *,  '===========================' 
    print *,  ''
-    E_old = sum(ci_energy_dressed)
+    E_old = sum(ci_energy_dressed(1:N_st))
    call write_double(6,ci_energy_dressed(1),"MRCC energy")
    call diagonalize_ci_dressed(lambda)
-    E_new = sum(ci_energy_dressed)
+    E_new = sum(ci_energy_dressed(1:N_st))
    delta_E = dabs(E_new - E_old)
    call save_wavefunction
    call ezfio_set_mrcc_cassd_energy(ci_energy_dressed(1))
@ -47,7 +47,7 @@ subroutine run(N_st,energy)
    endif
  enddo
  call write_double(6,ci_energy_dressed(1),"Final MRCC energy")
-  energy(:) = ci_energy_dressed(:)
+  energy(1:N_st) = ci_energy_dressed(1:N_st)

 end

--- a/plugins/MRCC_Utils/davidson.irp.f
+++ b/plugins/MRCC_Utils/davidson.irp.f
@ -548,3 +548,621 @@ subroutine H_u_0_mrcc_nstates(v_0,u_0,H_jj,n,keys_tmp,Nint,istate_in,N_st,sze_8)
  
 end

+
+subroutine davidson_diag_mrcc_hs2(dets_in,u_in,dim_in,energies,sze,N_st,N_st_diag,Nint,iunit,istate)
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Davidson diagonalization.
+  !
+  ! dets_in : bitmasks corresponding to determinants
+  !
+  ! u_in : guess coefficients on the various states. Overwritten
+  !   on exit
+  !
+  ! dim_in : leftmost dimension of u_in
+  !
+  ! sze : Number of determinants
+  !
+  ! N_st : Number of eigenstates
+  !
+  ! iunit : Unit number for the I/O
+  !
+  ! Initial guess vectors are not necessarily orthonormal
+  END_DOC
+  integer, intent(in)            :: dim_in, sze, N_st, N_st_diag, Nint, iunit, istate
+  integer(bit_kind), intent(in)  :: dets_in(Nint,2,sze)
+  double precision, intent(inout) :: u_in(dim_in,N_st_diag)
+  double precision, intent(out)  :: energies(N_st)
+  double precision, allocatable  :: H_jj(:), S2_jj(:)
+  
+  double precision               :: diag_h_mat_elem
+  integer                        :: i
+  ASSERT (N_st > 0)
+  ASSERT (sze > 0)
+  ASSERT (Nint > 0)
+  ASSERT (Nint == N_int)
+  PROVIDE mo_bielec_integrals_in_map
+  allocate(H_jj(sze), S2_jj(sze))
+  
+  !$OMP PARALLEL DEFAULT(NONE)                                       &
+      !$OMP  SHARED(sze,H_jj,S2_jj, dets_in,Nint,N_det_ref,delta_ii, &
+      !$OMP  idx_ref, istate)  &
+      !$OMP  PRIVATE(i)
+  !$OMP DO SCHEDULE(guided)
+  do i=1,sze
+    H_jj(i) = diag_h_mat_elem(dets_in(1,1,i),Nint)
+    call get_s2(dets_in(1,1,i),dets_in(1,1,i),Nint,S2_jj(i))
+  enddo
+  !$OMP END DO 
+  !$OMP DO SCHEDULE(guided)
+  do i=1,N_det_ref
+    H_jj(idx_ref(i)) +=  delta_ii(istate,i)
+  enddo
+  !$OMP END DO 
+  !$OMP END PARALLEL
+
+  call davidson_diag_hjj_sjj_mrcc(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_st,N_st_diag,Nint,iunit,istate)
+  deallocate (H_jj,S2_jj)
+end
+
+
+subroutine davidson_diag_hjj_sjj_mrcc(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_st,N_st_diag,Nint,iunit,istate )
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Davidson diagonalization with specific diagonal elements of the H matrix
+  !
+  ! H_jj : specific diagonal H matrix elements to diagonalize de Davidson
+  !
+  ! S2_jj : specific diagonal S^2 matrix elements 
+  !
+  ! dets_in : bitmasks corresponding to determinants
+  !
+  ! u_in : guess coefficients on the various states. Overwritten
+  !   on exit
+  !
+  ! dim_in : leftmost dimension of u_in
+  !
+  ! sze : Number of determinants
+  !
+  ! N_st : Number of eigenstates
+  ! 
+  ! N_st_diag : Number of states in which H is diagonalized. Assumed > sze
+  !
+  ! iunit : Unit for the I/O
+  !
+  ! Initial guess vectors are not necessarily orthonormal
+  END_DOC
+  integer, intent(in)            :: dim_in, sze, N_st, N_st_diag, Nint, istate
+  integer(bit_kind), intent(in)  :: dets_in(Nint,2,sze)
+  double precision,  intent(in)  :: H_jj(sze), S2_jj(sze)
+  integer,  intent(in)  :: iunit
+  double precision, intent(inout) :: u_in(dim_in,N_st_diag)
+  double precision, intent(out)  :: energies(N_st_diag)
+  
+  integer                        :: sze_8
+  integer                        :: iter
+  integer                        :: i,j,k,l,m
+  logical                        :: converged
+  
+  double precision, allocatable  :: overlap(:,:)
+  double precision               :: u_dot_v, u_dot_u
+  
+  integer, allocatable           :: kl_pairs(:,:)
+  integer                        :: k_pairs, kl
+  
+  integer                        :: iter2
+  double precision, allocatable  :: W(:,:),  U(:,:), R(:,:), S(:,:)
+  double precision, allocatable  :: y(:,:), h(:,:), lambda(:), s2(:)
+  double precision, allocatable  :: c(:), s_(:,:), s_tmp(:,:)
+  double precision               :: diag_h_mat_elem
+  double precision, allocatable  :: residual_norm(:)
+  character*(16384)              :: write_buffer
+  double precision               :: to_print(3,N_st)
+  double precision               :: cpu, wall
+  integer                        :: shift, shift2
+  include 'constants.include.F'
+  
+  !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: U, W, R, S, y, h, lambda
+  if (N_st_diag > sze) then
+     stop 'error in Davidson : N_st_diag > sze'
+  endif
+
+  PROVIDE nuclear_repulsion
+
+  call write_time(iunit)
+  call wall_time(wall)
+  call cpu_time(cpu)
+  write(iunit,'(A)') ''
+  write(iunit,'(A)') 'Davidson Diagonalization'
+  write(iunit,'(A)') '------------------------'
+  write(iunit,'(A)') ''
+  call write_int(iunit,N_st,'Number of states')
+  call write_int(iunit,N_st_diag,'Number of states in diagonalization')
+  call write_int(iunit,sze,'Number of determinants')
+  call write_int(iunit,istate,'Using dressing for state ')
+
+  write(iunit,'(A)') ''
+  write_buffer = '===== '
+  do i=1,N_st
+    write_buffer = trim(write_buffer)//' ================ =========== ==========='
+  enddo
+  write(iunit,'(A)') trim(write_buffer)
+  write_buffer = ' Iter'
+  do i=1,N_st
+    write_buffer = trim(write_buffer)//'      Energy          S^2      Residual'
+  enddo
+  write(iunit,'(A)') trim(write_buffer)
+  write_buffer = '===== '
+  do i=1,N_st
+    write_buffer = trim(write_buffer)//' ================ =========== ==========='
+  enddo
+  write(iunit,'(A)') trim(write_buffer)
+
+  integer, external :: align_double
+  sze_8 = align_double(sze)
+
+  double precision :: delta
+
+  if (s2_eig) then
+    delta = 1.d0
+  else
+    delta = 0.d0
+  endif
+
+  allocate(                                                          &
+      kl_pairs(2,N_st_diag*(N_st_diag+1)/2),                         &
+      W(sze_8,N_st_diag*davidson_sze_max),                           &
+      U(sze_8,N_st_diag*davidson_sze_max),                           &
+      R(sze_8,N_st_diag),                                            &
+      S(sze_8,N_st_diag*davidson_sze_max),                           &
+      h(N_st_diag*davidson_sze_max,N_st_diag*davidson_sze_max),      &
+      y(N_st_diag*davidson_sze_max,N_st_diag*davidson_sze_max),      &
+      s_(N_st_diag*davidson_sze_max,N_st_diag*davidson_sze_max),     &
+      s_tmp(N_st_diag*davidson_sze_max,N_st_diag*davidson_sze_max),  &
+      residual_norm(N_st_diag),                                      &
+      overlap(N_st_diag,N_st_diag),                                  &
+      c(N_st_diag*davidson_sze_max),                                 &
+      s2(N_st_diag*davidson_sze_max),                                &
+      lambda(N_st_diag*davidson_sze_max))
+  
+  ASSERT (N_st > 0)
+  ASSERT (N_st_diag >= N_st)
+  ASSERT (sze > 0)
+  ASSERT (Nint > 0)
+  ASSERT (Nint == N_int)
+  
+  ! Davidson iterations
+  ! ===================
+  
+  converged = .False.
+  
+  do k=1,N_st
+    call normalize(u_in(1,k),sze)
+  enddo
+
+  do k=N_st+1,N_st_diag
+      do i=1,sze
+        double precision               :: r1, r2
+        call random_number(r1)
+        call random_number(r2)
+        u_in(i,k) = dsqrt(-2.d0*dlog(r1))*dcos(dtwo_pi*r2)
+      enddo
+    
+    ! Gram-Schmidt
+    ! ------------
+    call dgemv('T',sze,k-1,1.d0,u_in,size(u_in,1),                   &
+        u_in(1,k),1,0.d0,c,1)
+    call dgemv('N',sze,k-1,-1.d0,u_in,size(u_in,1),                  &
+        c,1,1.d0,u_in(1,k),1)
+    call normalize(u_in(1,k),sze)
+  enddo
+
+  
+  do while (.not.converged)
+    
+    do k=1,N_st_diag
+      do i=1,sze
+        U(i,k) = u_in(i,k)
+      enddo
+    enddo
+
+    do iter=1,davidson_sze_max-1
+      
+      shift  = N_st_diag*(iter-1)
+      shift2 = N_st_diag*iter
+
+
+      ! Compute |W_k> = \sum_i |i><i|H|u_k>
+      ! -----------------------------------------
+      
+       
+      call H_S2_u_0_mrcc_nstates(W(1,shift+1),S(1,shift+1),U(1,shift+1),H_jj,S2_jj,sze,dets_in,Nint,&
+        istate,N_st_diag,sze_8)
+      
+      
+      ! Compute h_kl = <u_k | W_l> = <u_k| H |u_l>
+      ! -------------------------------------------
+
+
+!      do l=1,N_st_diag
+!        do k=1,N_st_diag
+!          do iter2=1,iter-1
+!            h(k,iter2,l,iter) = u_dot_v(U(1,k,iter2),W(1,l,iter),sze)
+!            h(k,iter,l,iter2) = h(k,iter2,l,iter)
+!          enddo
+!        enddo
+!        do k=1,l
+!          h(k,iter,l,iter) = u_dot_v(U(1,k,iter),W(1,l,iter),sze)
+!          h(l,iter,k,iter) = h(k,iter,l,iter)
+!        enddo
+!      enddo
+
+      call dgemm('T','N', shift2, N_st_diag, sze,            &
+          1.d0, U, size(U,1), W(1,shift+1), size(W,1),                &
+          0.d0, h(1,shift+1), size(h,1))
+
+      call dgemm('T','N', shift2, N_st_diag, sze,            &
+          1.d0, U, size(U,1), S(1,shift+1), size(S,1),                &
+          0.d0, s_(1,shift+1), size(s_,1))
+
+      ! Diagonalize h
+      ! -------------
+      call lapack_diag(lambda,y,h,size(h,1),shift2)
+      
+      ! Compute S2 for each eigenvector
+      ! -------------------------------
+
+      call dgemm('N','N',shift2,shift2,shift2,                            &
+          1.d0, s_, size(s_,1), y, size(y,1), &
+          0.d0, s_tmp, size(s_tmp,1))
+
+      call dgemm('T','N',shift2,shift2,shift2,                            &
+          1.d0, y, size(y,1), s_tmp, size(s_tmp,1), &
+          0.d0, s_, size(s_,1))
+
+      do k=1,shift2
+        s2(k) = s_(k,k) + S_z2_Sz
+      enddo
+
+      if (s2_eig) then
+        logical :: state_ok(N_st_diag*davidson_sze_max)
+        do k=1,shift2
+          state_ok(k) = (dabs(s2(k)-expected_s2) < 0.3d0)
+        enddo
+        do k=1,shift2
+          if (.not. state_ok(k)) then
+            do l=k+1,shift2
+              if (state_ok(l)) then
+                call dswap(shift2, y(1,k), 1, y(1,l), 1)
+                call dswap(1, s2(k), 1, s2(l), 1)
+                call dswap(1, lambda(k), 1, lambda(l), 1)
+                state_ok(k) = .True.
+                state_ok(l) = .False.
+                exit
+              endif
+            enddo
+          endif
+        enddo
+      endif
+
+
+      ! Express eigenvectors of h in the determinant basis
+      ! --------------------------------------------------
+      
+!      do k=1,N_st_diag
+!        do i=1,sze
+!          U(i,shift2+k) = 0.d0
+!          W(i,shift2+k) = 0.d0
+!          S(i,shift2+k) = 0.d0
+!        enddo
+!        do l=1,N_st_diag*iter
+!          do i=1,sze
+!            U(i,shift2+k) = U(i,shift2+k) + U(i,l)*y(l,k)
+!            W(i,shift2+k) = W(i,shift2+k) + W(i,l)*y(l,k)
+!            S(i,shift2+k) = S(i,shift2+k) + S(i,l)*y(l,k)
+!          enddo
+!        enddo
+!      enddo
+!
+!
+      call dgemm('N','N', sze, N_st_diag, shift2,                    &
+          1.d0, U, size(U,1), y, size(y,1), 0.d0, U(1,shift2+1), size(U,1))
+      call dgemm('N','N', sze, N_st_diag, shift2,                    &
+          1.d0, W, size(W,1), y, size(y,1), 0.d0, W(1,shift2+1), size(W,1))
+      call dgemm('N','N', sze, N_st_diag, shift2,                    &
+          1.d0, S, size(S,1), y, size(y,1), 0.d0, S(1,shift2+1), size(S,1))
+
+      ! Compute residual vector
+      ! -----------------------
+      
+!      do k=1,N_st_diag
+!        print *,  s2(k)
+!        s2(k) = u_dot_v(U(1,shift2+k),  S(1,shift2+k), sze) + S_z2_Sz
+!        print *,  s2(k)
+!        print *,  ''
+!        pause
+!      enddo
+      do k=1,N_st_diag
+        do i=1,sze
+          R(i,k) = (lambda(k) * U(i,shift2+k) - W(i,shift2+k) ) &
+            * (1.d0 + s2(k) * U(i,shift2+k) - S(i,shift2+k) - S_z2_Sz)
+        enddo
+        if (k <= N_st) then
+          residual_norm(k) = u_dot_u(R(1,k),sze)
+          to_print(1,k) = lambda(k) + nuclear_repulsion
+          to_print(2,k) = s2(k)
+          to_print(3,k) = residual_norm(k)
+          if (residual_norm(k) > 1.e9) then
+            stop 'Davidson failed'
+          endif
+        endif
+      enddo
+      
+      write(iunit,'(X,I3,X,100(X,F16.10,X,F11.6,X,E11.3))')  iter, to_print(:,1:N_st)
+      call davidson_converged(lambda,residual_norm,wall,iter,cpu,N_st,converged)
+      if (converged) then
+        exit
+      endif
+      
+      ! Davidson step
+      ! -------------
+      
+      do k=1,N_st_diag
+        do i=1,sze
+          U(i,shift2+k) =  - R(i,k)/max(H_jj(i) - lambda(k),1.d-2) 
+        enddo
+      enddo
+      
+      ! Gram-Schmidt
+      ! ------------
+      
+      do k=1,N_st_diag
+
+!        do l=1,N_st_diag*iter
+!            c(1) = u_dot_v(U(1,shift2+k),U(1,l),sze)
+!            do i=1,sze
+!              U(i,k,iter+1) = U(i,shift2+k) - c(1) * U(i,l)
+!            enddo
+!        enddo
+!
+        call dgemv('T',sze,N_st_diag*iter,1.d0,U,size(U,1),  &
+              U(1,shift2+k),1,0.d0,c,1)
+        call dgemv('N',sze,N_st_diag*iter,-1.d0,U,size(U,1), &
+              c,1,1.d0,U(1,shift2+k),1)
+!
+!        do l=1,k-1
+!          c(1) = u_dot_v(U(1,shift2+k),U(1,shift2+l),sze)
+!          do i=1,sze
+!            U(i,k,iter+1) = U(i,shift2+k) - c(1) * U(i,shift2+l)
+!          enddo
+!        enddo
+!
+        call dgemv('T',sze,k-1,1.d0,U(1,shift2+1),size(U,1),   &
+            U(1,shift2+k),1,0.d0,c,1)
+        call dgemv('N',sze,k-1,-1.d0,U(1,shift2+1),size(U,1),        &
+            c,1,1.d0,U(1,shift2+k),1)
+
+        call normalize( U(1,shift2+k), sze )
+      enddo
+
+    enddo
+
+    if (.not.converged) then
+      iter = davidson_sze_max-1
+    endif
+    
+    ! Re-contract to u_in
+    ! -----------
+    
+    do k=1,N_st_diag
+      energies(k) = lambda(k)
+    enddo
+
+!    do k=1,N_st_diag
+!      do i=1,sze
+!        do l=1,iter*N_st_diag
+!            u_in(i,k) += U(i,l)*y(l,k)
+!          enddo
+!        enddo
+!      enddo
+!    enddo
+
+    call dgemm('N','N', sze, N_st_diag, N_st_diag*iter, 1.d0,      &
+        U, size(U,1), y, size(y,1), 0.d0, u_in, size(u_in,1))
+
+  enddo
+
+  write_buffer = '===== '
+  do i=1,N_st
+    write_buffer = trim(write_buffer)//' ================ =========== ==========='
+  enddo
+  write(iunit,'(A)') trim(write_buffer)
+  write(iunit,'(A)') ''
+  call write_time(iunit)
+
+  deallocate (                                                       &
+      kl_pairs,                                                      &
+      W, residual_norm,                                              &
+      U, overlap,                                                    &
+      R, c, S,                                                       &
+      h,                                                             &
+      y, s_, s_tmp,                                                  &
+      lambda                                                         &
+      )
+end
+
+
+subroutine H_S2_u_0_mrcc_nstates(v_0,s_0,u_0,H_jj,S2_jj,n,keys_tmp,Nint,istate_in,N_st,sze_8)
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Computes v_0 = H|u_0> and s_0 = S^2 |u_0>
+  !
+  ! n : number of determinants
+  !
+  ! H_jj : array of <j|H|j>
+  !
+  ! S2_jj : array of <j|S^2|j>
+  END_DOC
+  integer, intent(in)            :: N_st,n,Nint, sze_8, istate_in
+  double precision, intent(out)  :: v_0(sze_8,N_st), s_0(sze_8,N_st)
+  double precision, intent(in)   :: u_0(sze_8,N_st)
+  double precision, intent(in)   :: H_jj(n), S2_jj(n)
+  integer(bit_kind),intent(in)   :: keys_tmp(Nint,2,n)
+  double precision               :: hij,s2
+  double precision, allocatable  :: vt(:,:), ut(:,:), st(:,:)
+  integer                        :: i,j,k,l, jj,ii
+  integer                        :: i0, j0
+
+  integer, allocatable           :: shortcut(:,:), sort_idx(:,:)
+  integer(bit_kind), allocatable :: sorted(:,:,:), version(:,:,:)
+  integer(bit_kind)              :: sorted_i(Nint)
+
+  integer                        :: sh, sh2, ni, exa, ext, org_i, org_j, endi, istate
+  integer                        :: N_st_8
+
+  integer, external              :: align_double
+  !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: vt, ut
+
+  N_st_8 = align_double(N_st)
+
+  ASSERT (Nint > 0)
+  ASSERT (Nint == N_int)
+  ASSERT (n>0)
+  PROVIDE ref_bitmask_energy
+
+  allocate (shortcut(0:n+1,2), sort_idx(n,2), sorted(Nint,n,2), version(Nint,n,2))
+  allocate(ut(N_st_8,n))
+
+  v_0 = 0.d0
+  s_0 = 0.d0
+
+  do i=1,n
+    do istate=1,N_st
+      ut(istate,i) = u_0(i,istate)
+    enddo
+  enddo
+
+  call sort_dets_ab_v(keys_tmp, sorted(1,1,1), sort_idx(1,1), shortcut(0,1), version(1,1,1), n, Nint)
+  call sort_dets_ba_v(keys_tmp, sorted(1,1,2), sort_idx(1,2), shortcut(0,2), version(1,1,2), n, Nint)
+
+  !$OMP PARALLEL DEFAULT(NONE)                                       &
+      !$OMP PRIVATE(i,hij,s2,j,k,jj,vt,st,ii,sh,sh2,ni,exa,ext,org_i,org_j,endi,sorted_i,istate)&
+      !$OMP SHARED(n,keys_tmp,ut,Nint,v_0,s_0,sorted,shortcut,sort_idx,version,N_st,N_st_8, &
+      !$OMP  N_det_ref, idx_ref, N_det_non_ref, idx_non_ref, delta_ij,istate_in)
+  allocate(vt(N_st_8,n),st(N_st_8,n))
+  Vt = 0.d0
+  St = 0.d0
+
+  !$OMP DO SCHEDULE(dynamic)
+  do sh=1,shortcut(0,1)
+    do sh2=sh,shortcut(0,1)
+      exa = 0
+      do ni=1,Nint
+        exa = exa + popcnt(xor(version(ni,sh,1), version(ni,sh2,1)))
+      end do
+      if(exa > 2) then
+        cycle
+      end if
+
+      do i=shortcut(sh,1),shortcut(sh+1,1)-1
+        org_i = sort_idx(i,1)
+        if(sh==sh2) then
+          endi = i-1
+        else
+          endi = shortcut(sh2+1,1)-1
+        end if
+        do ni=1,Nint
+          sorted_i(ni) = sorted(ni,i,1)
+        enddo
+
+        do j=shortcut(sh2,1),endi
+          org_j = sort_idx(j,1)
+          ext = exa
+          do ni=1,Nint
+            ext = ext + popcnt(xor(sorted_i(ni), sorted(ni,j,1)))
+          end do
+          if(ext <= 4) then
+            call i_h_j (keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,hij)
+            call get_s2(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,s2)
+            do istate=1,n_st
+              vt (istate,org_i) = vt (istate,org_i) + hij*ut(istate,org_j)
+              vt (istate,org_j) = vt (istate,org_j) + hij*ut(istate,org_i)
+              st (istate,org_i) = st (istate,org_i) + s2*ut(istate,org_j)
+              st (istate,org_j) = st (istate,org_j) + s2*ut(istate,org_i)
+            enddo
+          endif
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO NOWAIT
+  !$OMP DO SCHEDULE(dynamic)
+  do sh=1,shortcut(0,2)
+    do i=shortcut(sh,2),shortcut(sh+1,2)-1
+      org_i = sort_idx(i,2)
+      do j=shortcut(sh,2),i-1
+        org_j = sort_idx(j,2)
+        ext = 0
+        do ni=1,Nint
+          ext = ext + popcnt(xor(sorted(ni,i,2), sorted(ni,j,2)))
+        end do
+        if(ext == 4) then
+            call i_h_j (keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,hij)
+            call get_s2(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,s2)
+            do istate=1,n_st
+              vt (istate,org_i) = vt (istate,org_i) + hij*ut(istate,org_j)
+              vt (istate,org_j) = vt (istate,org_j) + hij*ut(istate,org_i)
+              st (istate,org_i) = st (istate,org_i) + s2*ut(istate,org_j)
+              st (istate,org_j) = st (istate,org_j) + s2*ut(istate,org_i)
+            enddo
+        end if
+      end do
+    end do
+  enddo
+  !$OMP END DO NOWAIT
+
+! --------------------------
+! Begin Specific to dressing
+! --------------------------
+
+  !$OMP DO 
+  do ii=1,n_det_ref
+    i = idx_ref(ii)
+    do jj = 1, n_det_non_ref
+        j = idx_non_ref(jj)
+        do istate=1,N_st
+          vt (istate,i) = vt (istate,i) + delta_ij(istate_in,jj,ii)*ut(istate,j)
+          vt (istate,j) = vt (istate,j) + delta_ij(istate_in,jj,ii)*ut(istate,i)
+        enddo
+    enddo
+  enddo
+  !$OMP END DO
+
+! ------------------------
+! End Specific to dressing
+! ------------------------
+
+  !$OMP CRITICAL
+  do istate=1,N_st
+    do i=n,1,-1
+      v_0(i,istate) = v_0(i,istate) + vt(istate,i)
+      s_0(i,istate) = s_0(i,istate) + st(istate,i)
+    enddo
+  enddo
+  !$OMP END CRITICAL
+
+  deallocate(vt,st)
+  !$OMP END PARALLEL
+
+  do istate=1,N_st
+    do i=1,n
+      v_0(i,istate) = v_0(i,istate) + H_jj(i) * u_0(i,istate)
+      s_0(i,istate) = s_0(i,istate) + s2_jj(i)* u_0(i,istate)
+    enddo
+  enddo
+  deallocate (shortcut, sort_idx, sorted, version, ut)
+end
+
--- a/plugins/MRCC_Utils/mrcc_dress.irp.f
+++ b/plugins/MRCC_Utils/mrcc_dress.irp.f
@ -53,7 +53,6 @@ subroutine mrcc_dress(delta_ij_, delta_ii_, Nstates, Ndet_non_ref, Ndet_ref,i_ge
  integer :: mobiles(2), smallerlist
  logical, external :: is_generable
  
-  print *, i_generator
  leng = max(N_det_generators, N_det_non_ref)
  allocate(miniList(Nint, 2, leng), idx_minilist(leng), hij_cache(N_det_non_ref))
  
--- a/plugins/MRCC_Utils/mrcc_utils.irp.f
+++ b/plugins/MRCC_Utils/mrcc_utils.irp.f
@ -148,8 +148,14 @@ END_PROVIDER
   
   if (diag_algorithm == "Davidson") then
     
-     call davidson_diag_mrcc(psi_det,CI_eigenvectors_dressed,CI_electronic_energy_dressed,&
-          size(CI_eigenvectors_dressed,1),N_det,N_states,N_states_diag,N_int,output_determinants,mrcc_state)
+!     call davidson_diag_mrcc(psi_det,CI_eigenvectors_dressed,CI_electronic_energy_dressed,&
+!          size(CI_eigenvectors_dressed,1),N_det,N_states,N_states_diag,N_int,output_determinants,mrcc_state)
+
+     call davidson_diag_mrcc_HS2(psi_det,CI_eigenvectors_dressed,&
+          size(CI_eigenvectors_dressed,1), &
+          CI_electronic_energy_dressed,N_det,N_states,N_states_diag,N_int, &
+          output_determinants,mrcc_state)
+
     call u_0_S2_u_0(CI_eigenvectors_s2_dressed,CI_eigenvectors_dressed,N_det,psi_det,N_int,&
         N_states_diag,size(CI_eigenvectors_dressed,1))

@ -236,69 +242,6 @@ END_PROVIDER
     deallocate(eigenvectors,eigenvalues)
   endif

-   if( s2_eig.and.(n_states_diag > 1).and.(n_det >= n_states_diag) )then
-      ! Diagonalizing S^2 within the "n_states_diag" states found
-      allocate(s2_eigvalues(N_states_diag), e_array(N_states_diag))
-      call diagonalize_s2_betweenstates(psi_det,CI_eigenvectors_dressed,n_det,size(psi_det,3),size(CI_eigenvectors_dressed,1),min(n_states_diag,n_det),s2_eigvalues)
-      
-      double precision, allocatable :: psi_coef_tmp(:,:)
-      allocate(psi_coef_tmp(psi_det_size,N_states_diag))
-      do j = 1, N_states_diag
-        do i = 1, N_det
-          psi_coef_tmp(i,j) = CI_eigenvectors_dressed(i,j)
-        enddo
-      enddo
-      call u_0_H_u_0_mrcc_nstates(e_array,psi_coef_tmp,n_det,psi_det,N_int,mrcc_state,N_states,psi_det_size)
-     
-      ! Browsing the "n_states_diag" states and getting the lowest in energy "n_states" ones that have the S^2 value
-      ! closer to the "expected_s2" set as input
-      
-      allocate(index_good_state_array(N_det),good_state_array(N_det))
-      good_state_array = .False.
-      i_state = 0
-      do j = 1, N_states_diag
-        if(dabs(s2_eigvalues(j)-expected_s2).le.0.5d0)then
-          good_state_array(j) = .True.
-          i_state +=1
-          index_good_state_array(i_state) = j
-        endif
-      enddo
-      ! Sorting the i_state good states by energy
-      allocate(iorder(i_state))
-      do j = 1, i_state
-        do i = 1, N_det
-          CI_eigenvectors_dressed(i,j) = psi_coef_tmp(i,index_good_state_array(j))
-        enddo
-        CI_eigenvectors_s2_dressed(j) = s2_eigvalues(index_good_state_array(j))
-        CI_electronic_energy_dressed(j) = e_array(j)
-        iorder(j) = j
-      enddo
-      call dsort(e_array,iorder,i_state)
-      do j = 1, i_state
-        CI_electronic_energy_dressed(j) = e_array(j)
-        CI_eigenvectors_s2_dressed(j) = s2_eigvalues(index_good_state_array(iorder(j)))
-        do i = 1, N_det
-          CI_eigenvectors_dressed(i,j) = psi_coef_tmp(i,index_good_state_array(iorder(j)))
-        enddo
-      enddo
-      
-      ! Then setting the other states without any specific energy order
-      i_other_state = 0
-      do j = 1, N_states_diag
-        if(good_state_array(j))cycle
-        i_other_state +=1
-        do i = 1, N_det
-          CI_eigenvectors_dressed(i,i_state + i_other_state) = psi_coef_tmp(i,j)
-        enddo
-        CI_eigenvectors_s2_dressed(i_state + i_other_state) = s2_eigvalues(j)
-        CI_electronic_energy_dressed(i_state + i_other_state) = e_array(i_state + i_other_state)
-      enddo
-      deallocate(iorder,e_array,index_good_state_array,good_state_array,psi_coef_tmp)
-       
-     deallocate(s2_eigvalues)
-     
-   endif
-   
 END_PROVIDER

 BEGIN_PROVIDER [ double precision, CI_energy_dressed, (N_states_diag) ]
@ -644,6 +587,7 @@ end subroutine
  N_ex_exists = 0

  n = 0
+  !TODO Openmp
  do i=1, N_det_ref
    do l=1, N_det_non_ref
      call get_excitation(psi_ref(1,1,i), psi_non_ref(1,1,l), exc, degree, phase, N_int)
--- a/plugins/Perturbation/pt2_equations.irp.f
+++ b/plugins/Perturbation/pt2_equations.irp.f
@ -345,6 +345,37 @@ subroutine pt2_epstein_nesbet_sc2 ($arguments)
  
 end

+subroutine pt2_dummy ($arguments)
+  use bitmasks
+  implicit none
+  $declarations
+  
+  BEGIN_DOC
+  ! Dummy perturbation to add all connected determinants.
+  END_DOC
+  
+  integer                        :: i,j
+  double precision               :: diag_H_mat_elem_fock, h
+  double precision               :: i_H_psi_array(N_st)
+  PROVIDE  selection_criterion
+
+  call i_H_psi_minilist(det_pert,minilist,idx_minilist,N_minilist,psi_selectors_coef,Nint,N_minilist,psi_selectors_size,N_st,i_H_psi_array)
+  
+  h = diag_H_mat_elem_fock(det_ref,det_pert,fock_diag_tmp,Nint)
+  do i =1,N_st
+    if (i_H_psi_array(i) /= 0.d0) then
+     c_pert(i) = i_H_psi_array(i) / (electronic_energy(i) - h)
+     H_pert_diag(i) = h*c_pert(i)*c_pert(i)
+     e_2_pert(i) = 1.d0
+    else
+      c_pert(i) = 0.d0
+      e_2_pert(i) = 0.d0
+      H_pert_diag(i) = 0.d0
+    endif
+  enddo
+  
+end
+


 SUBST [ arguments, declarations ]
--- a/plugins/Selectors_full/selectors.irp.f
+++ b/plugins/Selectors_full/selectors.irp.f
@ -14,7 +14,7 @@ BEGIN_PROVIDER [ integer, N_det_selectors]
  integer                        :: i
  double precision               :: norm, norm_max
  call write_time(output_determinants)
-  N_det_selectors = N_det
+  N_det_selectors = N_det_generators
  if (threshold_generators < 1.d0) then
    norm = 0.d0
    do i=1,N_det
@ -24,7 +24,7 @@ BEGIN_PROVIDER [ integer, N_det_selectors]
        exit
      endif
    enddo
-    N_det_selectors = max(N_det_selectors,1)
+    N_det_selectors = max(N_det_selectors,N_det_generators)
  endif
  call write_int(output_determinants,N_det_selectors,'Number of selectors')
 END_PROVIDER
--- a/scripts/ezfio_interface/ei_handler.py
+++ b/scripts/ezfio_interface/ei_handler.py
@ -45,7 +45,7 @@ Optional:
                            (by default is one)
                            Example : 1, =sum(ao_num); (ao_num,3)
                            ATTENTION : The module and the value are separed by a "." not a "_".
-                            For exemple (determinants.n_det)
+                            For example (determinants.n_det)
    ezfio_name: <str>   The name for the EZFIO lib
                             (by default is <provider_name>)
    ezfio_dir: <str>    Will be the folder of EZFIO.
--- a/scripts/module/qp_module.py
+++ b/scripts/module/qp_module.py
@ -213,7 +213,7 @@ def main(arguments):
            print "[ OK ]"
            print ""
            print "You can now compile as usual"
-            print "`cd {0} ; ninja` for exemple".format(QP_ROOT)
+            print "`cd {0} ; ninja` for example".format(QP_ROOT)
            print " or --in developement mode-- you can cd in a directory and compile here"

    elif arguments["uninstall"]:
--- a/src/AO_Basis/EZFIO.cfg
+++ b/src/AO_Basis/EZFIO.cfg
@ -54,3 +54,25 @@ type: logical
 doc: If true, use AOs in Cartesian coordinates (6d,10f,...)
 interface: ezfio, provider
 default: false
+
+[integral_overlap]
+type: double precision
+doc: Overlap integrals in AO basis set
+size: (ao_basis.ao_num,ao_basis.ao_num)
+interface: ezfio
+default: false
+
+[integral_nuclear]
+type: double precision
+doc: Nucleus-electron integrals in AO basis set
+size: (ao_basis.ao_num,ao_basis.ao_num)
+interface: ezfio
+default: false
+
+[integral_kinetic]
+type: double precision
+doc: Kinetic energy integrals in AO basis set
+size: (ao_basis.ao_num,ao_basis.ao_num)
+interface: ezfio
+default: false
+
--- a/src/AO_Basis/ao_overlap.irp.f
+++ b/src/AO_Basis/ao_overlap.irp.f
@ -14,51 +14,60 @@
  double precision :: alpha, beta, c
  double precision :: A_center(3), B_center(3)
  integer :: power_A(3), power_B(3)
-  dim1=100
-  !$OMP PARALLEL DO SCHEDULE(GUIDED) &
-  !$OMP DEFAULT(NONE) &
-  !$OMP PRIVATE(A_center,B_center,power_A,power_B,&
-  !$OMP  overlap_x,overlap_y, overlap_z, overlap, &
-  !$OMP  alpha, beta,i,j,c) &
-  !$OMP SHARED(nucl_coord,ao_power,ao_prim_num, &
-  !$OMP  ao_overlap_x,ao_overlap_y,ao_overlap_z,ao_overlap,ao_num,ao_coef_normalized_ordered_transp,ao_nucl, &
-  !$OMP  ao_expo_ordered_transp,dim1)
-  do j=1,ao_num
-   A_center(1) = nucl_coord( ao_nucl(j), 1 )
-   A_center(2) = nucl_coord( ao_nucl(j), 2 )
-   A_center(3) = nucl_coord( ao_nucl(j), 3 )
-   power_A(1)  = ao_power( j, 1 )
-   power_A(2)  = ao_power( j, 2 )
-   power_A(3)  = ao_power( j, 3 )
-   !DEC$ VECTOR ALIGNED
-   !DEC$ VECTOR ALWAYS
-   do i= 1,ao_num
-    ao_overlap(i,j)= 0.d0
-    ao_overlap_x(i,j)= 0.d0
-    ao_overlap_y(i,j)= 0.d0
-    ao_overlap_z(i,j)= 0.d0
-    B_center(1) = nucl_coord( ao_nucl(i), 1 )
-    B_center(2) = nucl_coord( ao_nucl(i), 2 )
-    B_center(3) = nucl_coord( ao_nucl(i), 3 )
-    power_B(1)  = ao_power( i, 1 )
-    power_B(2)  = ao_power( i, 2 )
-    power_B(3)  = ao_power( i, 3 )
-    do n = 1,ao_prim_num(j)
-     alpha = ao_expo_ordered_transp(n,j)
-     !DEC$ VECTOR ALIGNED
-     do l = 1, ao_prim_num(i)
-      beta = ao_expo_ordered_transp(l,i)
-      call overlap_gaussian_xyz(A_center,B_center,alpha,beta,power_A,power_B,overlap_x,overlap_y,overlap_z,overlap,dim1)
-      c = ao_coef_normalized_ordered_transp(n,j) * ao_coef_normalized_ordered_transp(l,i)
-      ao_overlap(i,j) += c * overlap
-      ao_overlap_x(i,j) += c * overlap_x
-      ao_overlap_y(i,j) += c * overlap_y
-      ao_overlap_z(i,j) += c * overlap_z
-     enddo
+!   if (read_ao_one_integrals) then
+!     call ezfio_get_ao_basis_integral_overlap(ao_overlap(1:ao_num, 1:ao_num))
+!     print *,  'AO overlap integrals read from disk'
+!  else
+    dim1=100
+    !$OMP PARALLEL DO SCHEDULE(GUIDED) &
+    !$OMP DEFAULT(NONE) &
+    !$OMP PRIVATE(A_center,B_center,power_A,power_B,&
+    !$OMP  overlap_x,overlap_y, overlap_z, overlap, &
+    !$OMP  alpha, beta,i,j,c) &
+    !$OMP SHARED(nucl_coord,ao_power,ao_prim_num, &
+    !$OMP  ao_overlap_x,ao_overlap_y,ao_overlap_z,ao_overlap,ao_num,ao_coef_normalized_ordered_transp,ao_nucl, &
+    !$OMP  ao_expo_ordered_transp,dim1)
+    do j=1,ao_num
+    A_center(1) = nucl_coord( ao_nucl(j), 1 )
+    A_center(2) = nucl_coord( ao_nucl(j), 2 )
+    A_center(3) = nucl_coord( ao_nucl(j), 3 )
+    power_A(1)  = ao_power( j, 1 )
+    power_A(2)  = ao_power( j, 2 )
+    power_A(3)  = ao_power( j, 3 )
+    !DEC$ VECTOR ALIGNED
+    !DEC$ VECTOR ALWAYS
+    do i= 1,ao_num
+      ao_overlap(i,j)= 0.d0
+      ao_overlap_x(i,j)= 0.d0
+      ao_overlap_y(i,j)= 0.d0
+      ao_overlap_z(i,j)= 0.d0
+      B_center(1) = nucl_coord( ao_nucl(i), 1 )
+      B_center(2) = nucl_coord( ao_nucl(i), 2 )
+      B_center(3) = nucl_coord( ao_nucl(i), 3 )
+      power_B(1)  = ao_power( i, 1 )
+      power_B(2)  = ao_power( i, 2 )
+      power_B(3)  = ao_power( i, 3 )
+      do n = 1,ao_prim_num(j)
+      alpha = ao_expo_ordered_transp(n,j)
+      !DEC$ VECTOR ALIGNED
+      do l = 1, ao_prim_num(i)
+        beta = ao_expo_ordered_transp(l,i)
+        call overlap_gaussian_xyz(A_center,B_center,alpha,beta,power_A,power_B,overlap_x,overlap_y,overlap_z,overlap,dim1)
+        c = ao_coef_normalized_ordered_transp(n,j) * ao_coef_normalized_ordered_transp(l,i)
+        ao_overlap(i,j) += c * overlap
+        ao_overlap_x(i,j) += c * overlap_x
+        ao_overlap_y(i,j) += c * overlap_y
+        ao_overlap_z(i,j) += c * overlap_z
+      enddo
+      enddo
    enddo
-   enddo
-  enddo
-  !$OMP END PARALLEL DO
+    enddo
+    !$OMP END PARALLEL DO
+!  endif
+!  if (write_ao_one_integrals) then
+!     call ezfio_set_ao_basis_integral_overlap(ao_overlap(1:ao_num, 1:ao_num))
+!     print *,  'AO overlap integrals written to disk'
+!  endif

 END_PROVIDER

--- a/src/Davidson/EZFIO.cfg
+++ b/src/Davidson/EZFIO.cfg
@ -7,6 +7,6 @@ default: 1.e-12
 [n_states_diag]
 type: States_number
 doc: n_states_diag
-default: 1
+default: 10
 interface: ezfio,provider,ocaml

--- a/src/Davidson/davidson.irp.f
+++ b/src/Davidson/davidson.irp.f
@ -1,3 +0,0 @@
-program davidson
-  stop 1
-end
--- a/src/Davidson/davidson_parallel.irp.f
+++ b/src/Davidson/davidson_parallel.irp.f
@ -12,8 +12,8 @@ subroutine davidson_process(blockb, blocke, N, idx, vt, st)
  integer             , intent(in)        :: blockb, blocke
  integer             , intent(inout)     :: N
  integer             , intent(inout)     :: idx(dav_size)
-  double precision    , intent(inout)     :: vt(N_states, dav_size)
-  double precision    , intent(inout)     :: st(N_states, dav_size)
+  double precision    , intent(inout)     :: vt(N_states_diag, dav_size)
+  double precision    , intent(inout)     :: st(N_states_diag, dav_size)
  
  integer :: i, j, sh, sh2, exa, ext, org_i, org_j, istate, ni, endi
  integer(bit_kind) :: sorted_i(N_int)
@ -63,7 +63,7 @@ subroutine davidson_process(blockb, blocke, N, idx, vt, st)
            vt (:,org_j) = 0d0
            st (:,org_j) = 0d0
          end if
-          do istate=1,N_states
+          do istate=1,N_states_diag
            vt (istate,org_i) += hij*dav_ut(istate,org_j)
            st (istate,org_i) += s2*dav_ut(istate,org_j)
            vt (istate,org_j) += hij*dav_ut(istate,org_i)
@ -79,7 +79,7 @@ subroutine davidson_process(blockb, blocke, N, idx, vt, st)
  do i=1, dav_size
    if(wrotten(i)) then
      N = N+1
-      do istate=1,N_states
+      do istate=1,N_states_diag
        vt (istate,N) = vt (istate,i)
        st (istate,N) = st (istate,i)
        idx(N) = i
@ -91,23 +91,28 @@ end subroutine



-subroutine davidson_collect(blockb, blocke, N, idx, vt, st , v0, s0)
+subroutine davidson_collect(blockb, blocke, N, idx, vt, st , v0t, s0t)
  implicit none


  integer             , intent(in)        :: blockb, blocke
  integer             , intent(in)        :: N
  integer             , intent(in)        :: idx(N)
-  double precision    , intent(in)        :: vt(N_states, N)
-  double precision    , intent(in)        :: st(N_states, N)
-  double precision    , intent(inout)     :: v0(dav_size, N_states)
-  double precision    , intent(inout)     :: s0(dav_size, N_states)
+  double precision    , intent(in)        :: vt(N_states_diag, N)
+  double precision    , intent(in)        :: st(N_states_diag, N)
+  double precision    , intent(inout)     :: v0t(N_states_diag,dav_size)
+  double precision    , intent(inout)     :: s0t(N_states_diag,dav_size)

-  integer :: i
+  integer :: i, j, k

+  !DIR$ IVDEP
  do i=1,N
-    v0(idx(i), :) += vt(:, i)
-    s0(idx(i), :) += st(:, i)
+    k = idx(i)
+    !DIR$ IVDEP
+    do j=1,N_states_diag
+      v0t(j,k) = v0t(j,k) + vt(j,i)
+      s0t(j,k) = s0t(j,k) + st(j,i)
+    enddo
  end do
 end subroutine

@ -210,8 +215,8 @@ subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, worker_id)
  
  
  allocate(idx(dav_size)) 
-  allocate(vt(N_states, dav_size)) 
-  allocate(st(N_states, dav_size)) 
+  allocate(vt(N_states_diag, dav_size)) 
+  allocate(st(N_states_diag, dav_size)) 
  
  
  do
@ -221,7 +226,6 @@ subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, worker_id)
    
    call davidson_process(blockb, blocke, N, idx, vt, st)
    
-    ! reverse ?
    call task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id)
    call davidson_push_results(zmq_socket_push, blockb, blocke, N, idx, vt, st, task_id)
  end do
@ -239,8 +243,8 @@ subroutine davidson_push_results(zmq_socket_push, blockb, blocke, N, idx, vt, st
  integer             ,intent(in)    :: blockb, blocke
  integer             ,intent(in)    :: N
  integer             ,intent(in)    :: idx(N)
-  double precision    ,intent(in)    :: vt(N_states, N)
-  double precision    ,intent(in)    :: st(N_states, N)
+  double precision    ,intent(in)    :: vt(N_states_diag, N)
+  double precision    ,intent(in)    :: st(N_states_diag, N)
  integer                            :: rc

  rc = f77_zmq_send( zmq_socket_push, blockb, 4, ZMQ_SNDMORE)
@ -255,11 +259,11 @@ subroutine davidson_push_results(zmq_socket_push, blockb, blocke, N, idx, vt, st
  rc = f77_zmq_send( zmq_socket_push, idx, 4*N, ZMQ_SNDMORE)
  if(rc /= 4*N) stop "davidson_push_results failed to push idx"

-  rc = f77_zmq_send( zmq_socket_push, vt, 8*N_states* N, ZMQ_SNDMORE)
-  if(rc /= 8*N_states* N) stop "davidson_push_results failed to push vt"
+  rc = f77_zmq_send( zmq_socket_push, vt, 8*N_states_diag* N, ZMQ_SNDMORE)
+  if(rc /= 8*N_states_diag* N) stop "davidson_push_results failed to push vt"

-  rc = f77_zmq_send( zmq_socket_push, st, 8*N_states* N, ZMQ_SNDMORE)
-  if(rc /= 8*N_states* N) stop "davidson_push_results failed to push st"
+  rc = f77_zmq_send( zmq_socket_push, st, 8*N_states_diag* N, ZMQ_SNDMORE)
+  if(rc /= 8*N_states_diag* N) stop "davidson_push_results failed to push st"

  rc = f77_zmq_send( zmq_socket_push, task_id, 4, 0)
  if(rc /= 4) stop "davidson_push_results failed to push task_id"
@ -276,8 +280,8 @@ subroutine davidson_pull_results(zmq_socket_pull, blockb, blocke, N, idx, vt, st
  integer             ,intent(out)    :: blockb, blocke
  integer             ,intent(out)    :: N
  integer             ,intent(out)    :: idx(dav_size)
-  double precision    ,intent(out)    :: vt(N_states, dav_size)
-  double precision    ,intent(out)    :: st(N_states, dav_size)
+  double precision    ,intent(out)    :: vt(N_states_diag, dav_size)
+  double precision    ,intent(out)    :: st(N_states_diag, dav_size)

  integer                            :: rc

@ -293,11 +297,11 @@ subroutine davidson_pull_results(zmq_socket_pull, blockb, blocke, N, idx, vt, st
  rc = f77_zmq_recv( zmq_socket_pull, idx, 4*N, 0)
  if(rc /= 4*N) stop "davidson_push_results failed to pull idx"

-  rc = f77_zmq_recv( zmq_socket_pull, vt, 8*N_states* N, 0)
-  if(rc /= 8*N_states* N) stop "davidson_push_results failed to pull vt"
+  rc = f77_zmq_recv( zmq_socket_pull, vt, 8*N_states_diag* N, 0)
+  if(rc /= 8*N_states_diag* N) stop "davidson_push_results failed to pull vt"

-  rc = f77_zmq_recv( zmq_socket_pull, st, 8*N_states* N, 0)
-  if(rc /= 8*N_states* N) stop "davidson_push_results failed to pull st"
+  rc = f77_zmq_recv( zmq_socket_pull, st, 8*N_states_diag* N, 0)
+  if(rc /= 8*N_states_diag* N) stop "davidson_push_results failed to pull st"

  rc = f77_zmq_recv( zmq_socket_pull, task_id, 4, 0)
  if(rc /= 4) stop "davidson_pull_results failed to pull task_id"
@ -312,8 +316,8 @@ subroutine davidson_collector(zmq_to_qp_run_socket, zmq_socket_pull , v0, s0)
  integer(ZMQ_PTR), intent(in)     :: zmq_to_qp_run_socket
  integer(ZMQ_PTR), intent(in)     :: zmq_socket_pull

-  double precision    ,intent(inout)    :: v0(dav_size, N_states)
-  double precision    ,intent(inout)    :: s0(dav_size, N_states)
+  double precision    ,intent(inout)    :: v0(dav_size, N_states_diag)
+  double precision    ,intent(inout)    :: s0(dav_size, N_states_diag)

  integer                          :: more, task_id
  
@ -321,29 +325,31 @@ subroutine davidson_collector(zmq_to_qp_run_socket, zmq_socket_pull , v0, s0)
  integer                  :: blockb, blocke
  integer                  :: N
  integer             , allocatable      :: idx(:)
-  double precision    , allocatable      :: vt(:,:)
+  double precision    , allocatable      :: vt(:,:), v0t(:,:), s0t(:,:)
  double precision    , allocatable      :: st(:,:)
-  integer :: deleted
-  logical, allocatable :: done(:)
-  allocate(done(shortcut_(0,1)))
-  deleted = 0
-  done = .false.
  
  allocate(idx(dav_size)) 
-  allocate(vt(N_states, dav_size)) 
-  allocate(st(N_states, dav_size)) 
+  allocate(vt(N_states_diag, dav_size)) 
+  allocate(st(N_states_diag, dav_size)) 
+  allocate(v0t(N_states_diag, dav_size)) 
+  allocate(s0t(N_states_diag, dav_size)) 
  
+  v0t = 00.d0
+  s0t = 00.d0
+
  more = 1
  
  do while (more == 1)
    call davidson_pull_results(zmq_socket_pull, blockb, blocke, N, idx, vt, st, task_id)
-    call davidson_collect(blockb, blocke, N, idx, vt, st , v0, s0)
-!     done(block) = .true.
+    !DIR$ FORCEINLINE
+    call davidson_collect(blockb, blocke, N, idx, vt, st , v0t, s0t)
    call zmq_delete_task(zmq_to_qp_run_socket,zmq_socket_pull,task_id,more)
-    deleted += 1
-!     print *, "DELETED", deleted, done
  end do
-!   print *, "done collector"
+  deallocate(idx,vt,st)
+
+  call dtranspose(v0t,size(v0t,1), v0, size(v0,1), N_states_diag, dav_size)
+  call dtranspose(s0t,size(s0t,1), s0, size(s0,1), N_states_diag, dav_size)
+  deallocate(v0t,s0t)
 end subroutine


@ -360,8 +366,8 @@ subroutine davidson_run(zmq_to_qp_run_socket , v0, s0)
  integer :: i
  integer, external              :: omp_get_thread_num

-  double precision    , intent(inout)     :: v0(dav_size, N_states)
-  double precision    , intent(inout)     :: s0(dav_size, N_states)
+  double precision    , intent(inout)     :: v0(dav_size, N_states_diag)
+  double precision    , intent(inout)     :: s0(dav_size, N_states_diag)
  
  call zmq_set_running(zmq_to_qp_run_socket)
  
@ -393,7 +399,6 @@ end subroutine
 subroutine davidson_miniserver_run()
  use f77_zmq
  implicit none
-  integer(ZMQ_PTR)        context
  integer(ZMQ_PTR)        responder
  character*(64)          address
  character(len=:), allocatable :: buffer
@ -402,8 +407,7 @@ subroutine davidson_miniserver_run()
  allocate (character(len=20) :: buffer)
  address = 'tcp://*:11223'
  
-  context   = f77_zmq_ctx_new()
-  responder = f77_zmq_socket(context, ZMQ_REP)
+  responder = f77_zmq_socket(zmq_context, ZMQ_REP)
  rc        = f77_zmq_bind(responder,address)
  
  do
@ -411,7 +415,7 @@ subroutine davidson_miniserver_run()
    if (buffer(1:rc) /= 'end') then
      rc = f77_zmq_send (responder, dav_size, 4, ZMQ_SNDMORE)
      rc = f77_zmq_send (responder, dav_det, 16*N_int*dav_size, ZMQ_SNDMORE)
-      rc = f77_zmq_send (responder, dav_ut, 8*dav_size*N_states, 0)
+      rc = f77_zmq_send (responder, dav_ut, 8*dav_size*N_states_diag, 0)
    else
      rc = f77_zmq_send (responder, "end", 3, 0)
      exit
@ -419,7 +423,6 @@ subroutine davidson_miniserver_run()
  enddo

  rc = f77_zmq_close(responder)
-  rc = f77_zmq_ctx_destroy(context)
 end subroutine


@ -427,21 +430,18 @@ subroutine davidson_miniserver_end()
  implicit none
  use f77_zmq
  
-  integer(ZMQ_PTR)        context
  integer(ZMQ_PTR)        requester
  character*(64)          address
  integer                 rc
  character*(64)          buf
  
  address = trim(qp_run_address)//':11223'
-  context   = f77_zmq_ctx_new()
-  requester = f77_zmq_socket(context, ZMQ_REQ)
+  requester = f77_zmq_socket(zmq_context, ZMQ_REQ)
  rc        = f77_zmq_connect(requester,address)

  rc = f77_zmq_send(requester, "end", 3, 0)
  rc = f77_zmq_recv(requester, buf, 3, 0)
  rc = f77_zmq_close(requester)
-  rc = f77_zmq_ctx_destroy(context)
 end subroutine

  
@ -449,7 +449,6 @@ subroutine davidson_miniserver_get()
  implicit none
  use f77_zmq
  
-  integer(ZMQ_PTR)        context
  integer(ZMQ_PTR)        requester
  character*(64)          address
  character*(20)          buffer
@ -457,21 +456,17 @@ subroutine davidson_miniserver_get()
  
  address = trim(qp_run_address)//':11223'
  
-  context   = f77_zmq_ctx_new()
-  requester = f77_zmq_socket(context, ZMQ_REQ)
+  requester = f77_zmq_socket(zmq_context, ZMQ_REQ)
  rc        = f77_zmq_connect(requester,address)

  rc = f77_zmq_send(requester, "Hello", 5, 0)
  rc = f77_zmq_recv(requester, dav_size, 4, 0)
  TOUCH dav_size
  rc = f77_zmq_recv(requester, dav_det, 16*N_int*dav_size, 0)
-  rc = f77_zmq_recv(requester, dav_ut, 8*dav_size*N_states, 0)
+  rc = f77_zmq_recv(requester, dav_ut, 8*dav_size*N_states_diag, 0)
  TOUCH dav_det dav_ut

-  rc = f77_zmq_close(requester)
-  rc = f77_zmq_ctx_destroy(context)
  
-  touch dav_det dav_ut
 end subroutine


@ -480,7 +475,7 @@ BEGIN_PROVIDER [ integer(bit_kind), dav_det, (N_int, 2, dav_size) ]
 END_PROVIDER


-BEGIN_PROVIDER [ double precision, dav_ut, (N_states, dav_size) ]
+BEGIN_PROVIDER [ double precision, dav_ut, (N_states_diag, dav_size) ]
 END_PROVIDER


--- a/src/Davidson/davidson_slave.irp.f
+++ b/src/Davidson/davidson_slave.irp.f
@ -8,7 +8,7 @@ program davidson_slave
  double precision :: energy(N_states_diag)
  character*(64) :: state
  
-!   call provide_everything
+  call provide_everything
  call switch_qp_run_to_master
  
  zmq_context = f77_zmq_ctx_new ()
@ -35,6 +35,6 @@ program davidson_slave
  end do
 end

-! subroutine provide_everything
-!   PROVIDE mo_bielec_integrals_in_map psi_det_sorted_bit N_states zmq_context
-! end subroutine
+subroutine provide_everything
+  PROVIDE mo_bielec_integrals_in_map psi_det_sorted_bit N_states_diag zmq_context
+end subroutine
--- a/src/Davidson/diagonalization_hs2.irp.f
+++ b/src/Davidson/diagonalization_hs2.irp.f
@ -71,7 +71,7 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
  !
  ! N_st : Number of eigenstates
  ! 
-  ! N_st_diag : Number of states in which H is diagonalized
+  ! N_st_diag : Number of states in which H is diagonalized. Assumed > sze
  !
  ! iunit : Unit for the I/O
  !
@ -89,24 +89,27 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
  integer                        :: i,j,k,l,m
  logical                        :: converged
  
-  double precision, allocatable  :: overlap(:,:)
  double precision               :: u_dot_v, u_dot_u
  
  integer, allocatable           :: kl_pairs(:,:)
  integer                        :: k_pairs, kl
  
  integer                        :: iter2
-  double precision, allocatable  :: W(:,:,:),  U(:,:,:), R(:,:), S(:,:,:)
-  double precision, allocatable  :: y(:,:,:,:), h(:,:,:,:), lambda(:), s2(:)
-  double precision, allocatable  :: c(:), H_small(:,:)
+  double precision, allocatable  :: W(:,:),  U(:,:), R(:,:), S(:,:)
+  double precision, allocatable  :: y(:,:), h(:,:), lambda(:), s2(:)
+  double precision, allocatable  :: c(:), s_(:,:), s_tmp(:,:)
  double precision               :: diag_h_mat_elem
  double precision, allocatable  :: residual_norm(:)
  character*(16384)              :: write_buffer
  double precision               :: to_print(3,N_st)
  double precision               :: cpu, wall
+  integer                        :: shift, shift2
  include 'constants.include.F'
  
  !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: U, W, R, S, y, h, lambda
+  if (N_st_diag > sze) then
+     stop 'error in Davidson : N_st_diag > sze'
+  endif

  PROVIDE nuclear_repulsion

@ -140,29 +143,31 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
  integer, external :: align_double
  sze_8 = align_double(sze)

-  double precision :: delta
-
-  if (s2_eig) then
-    delta = 1.d0
-  else
-    delta = 0.d0
-  endif
-
  allocate(                                                          &
      kl_pairs(2,N_st_diag*(N_st_diag+1)/2),                         &
-      W(sze_8,N_st_diag,davidson_sze_max),                           &
-      U(sze_8,N_st_diag,davidson_sze_max),                           &
+      W(sze_8,N_st_diag*davidson_sze_max),                           &
+      U(sze_8,N_st_diag*davidson_sze_max),                           &
      R(sze_8,N_st_diag),                                            &
-      S(sze_8,N_st_diag,davidson_sze_max),                           &
-      h(N_st_diag,davidson_sze_max,N_st_diag,davidson_sze_max),      &
-      y(N_st_diag,davidson_sze_max,N_st_diag,davidson_sze_max),      &
+      S(sze_8,N_st_diag*davidson_sze_max),                           &
+      h(N_st_diag*davidson_sze_max,N_st_diag*davidson_sze_max),      &
+      y(N_st_diag*davidson_sze_max,N_st_diag*davidson_sze_max),      &
+      s_(N_st_diag*davidson_sze_max,N_st_diag*davidson_sze_max),     &
+      s_tmp(N_st_diag*davidson_sze_max,N_st_diag*davidson_sze_max),  &
      residual_norm(N_st_diag),                                      &
-      overlap(N_st_diag,N_st_diag),                                  &
      c(N_st_diag*davidson_sze_max),                                 &
-      H_small(N_st_diag,N_st_diag),                                  &
-      s2(N_st_diag),                                                 &
+      s2(N_st_diag*davidson_sze_max),                                &
      lambda(N_st_diag*davidson_sze_max))
  
+  h  = 0.d0
+  s_ = 0.d0
+  s_tmp = 0.d0
+  c = 0.d0
+  U = 0.d0
+  S = 0.d0
+  R = 0.d0
+  y = 0.d0
+
+
  ASSERT (N_st > 0)
  ASSERT (N_st_diag >= N_st)
  ASSERT (sze > 0)
@ -174,16 +179,17 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
  
  converged = .False.
  
-  do k=1,N_st_diag
+  do k=1,N_st
+    call normalize(u_in(1,k),sze)
+  enddo

-    if (k > N_st) then
+  do k=N_st+1,N_st_diag
      do i=1,sze
        double precision               :: r1, r2
        call random_number(r1)
        call random_number(r2)
        u_in(i,k) = dsqrt(-2.d0*dlog(r1))*dcos(dtwo_pi*r2)
      enddo
-    endif
    
    ! Gram-Schmidt
    ! ------------
@ -194,22 +200,26 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
    call normalize(u_in(1,k),sze)
  enddo

-
  
  do while (.not.converged)
    
    do k=1,N_st_diag
      do i=1,sze
-        U(i,k,1) = u_in(i,k)
+        U(i,k) = u_in(i,k)
      enddo
    enddo

    do iter=1,davidson_sze_max-1
      
+      shift  = N_st_diag*(iter-1)
+      shift2 = N_st_diag*iter
+
+
      ! Compute |W_k> = \sum_i |i><i|H|u_k>
      ! -----------------------------------------
      
-      call H_S2_u_0_nstates(W(1,1,iter),S(1,1,iter),U(1,1,iter),H_jj,S2_jj,sze,dets_in,Nint,N_st_diag,sze_8)
+       
+      call H_S2_u_0_nstates(W(1,shift+1),S(1,shift+1),U(1,shift+1),H_jj,S2_jj,sze,dets_in,Nint,N_st_diag,sze_8)
      
      
      ! Compute h_kl = <u_k | W_l> = <u_k| H |u_l>
@ -229,60 +239,104 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
 !        enddo
 !      enddo

-      call dgemm('T','N', N_st_diag*iter, N_st_diag, sze,            &
-          1.d0, U, size(U,1), W(1,1,iter), size(W,1),                &
-          0.d0, h(1,1,1,iter), size(h,1)*size(h,2))
+      call dgemm('T','N', shift2, N_st_diag, sze,            &
+          1.d0, U, size(U,1), W(1,shift+1), size(W,1),                &
+          0.d0, h(1,shift+1), size(h,1))
+
+      call dgemm('T','N', shift2, N_st_diag, sze,            &
+          1.d0, U, size(U,1), S(1,shift+1), size(S,1),                &
+          0.d0, s_(1,shift+1), size(s_,1))

      ! Diagonalize h
      ! -------------
-      call lapack_diag(lambda,y,h,N_st_diag*davidson_sze_max,N_st_diag*iter)
+      call lapack_diag(lambda,y,h,size(h,1),shift2)
      
+      ! Compute S2 for each eigenvector
+      ! -------------------------------
+
+      call dgemm('N','N',shift2,shift2,shift2,                            &
+          1.d0, s_, size(s_,1), y, size(y,1), &
+          0.d0, s_tmp, size(s_tmp,1))
+
+      call dgemm('T','N',shift2,shift2,shift2,                            &
+          1.d0, y, size(y,1), s_tmp, size(s_tmp,1), &
+          0.d0, s_, size(s_,1))
+
+      do k=1,shift2
+        s2(k) = s_(k,k) + S_z2_Sz
+      enddo
+
+      if (s2_eig) then
+        logical :: state_ok(N_st_diag*davidson_sze_max)
+        do k=1,shift2
+          state_ok(k) = (dabs(s2(k)-expected_s2) < 0.6d0)
+        enddo
+        do k=1,shift2
+          if (.not. state_ok(k)) then
+            do l=k+1,shift2
+              if (state_ok(l)) then
+                call dswap(shift2, y(1,k), 1, y(1,l), 1)
+                call dswap(1, s2(k), 1, s2(l), 1)
+                call dswap(1, lambda(k), 1, lambda(l), 1)
+                state_ok(k) = .True.
+                state_ok(l) = .False.
+                exit
+              endif
+            enddo
+          endif
+        enddo
+      endif
+
+
      ! Express eigenvectors of h in the determinant basis
      ! --------------------------------------------------
      
-      do k=1,N_st_diag
-        do i=1,sze
-          U(i,k,iter+1) = 0.d0
-          W(i,k,iter+1) = 0.d0
-          S(i,k,iter+1) = 0.d0
-        enddo
-      enddo
 !      do k=1,N_st_diag
-!         do iter2=1,iter
-!          do l=1,N_st_diag
-!            do i=1,sze
-!              U(i,k,iter+1) = U(i,k,iter+1) + U(i,l,iter2)*y(l,iter2,k,1)
-!              W(i,k,iter+1) = W(i,k,iter+1) + W(i,l,iter2)*y(l,iter2,k,1)
-!            enddo
+!        do i=1,sze
+!          U(i,shift2+k) = 0.d0
+!          W(i,shift2+k) = 0.d0
+!          S(i,shift2+k) = 0.d0
+!        enddo
+!        do l=1,N_st_diag*iter
+!          do i=1,sze
+!            U(i,shift2+k) = U(i,shift2+k) + U(i,l)*y(l,k)
+!            W(i,shift2+k) = W(i,shift2+k) + W(i,l)*y(l,k)
+!            S(i,shift2+k) = S(i,shift2+k) + S(i,l)*y(l,k)
 !          enddo
 !        enddo
 !      enddo
 !
 !
-      call dgemm('N','N', sze, N_st_diag, N_st_diag*iter,            &
-          1.d0, U, size(U,1), y, size(y,1)*size(y,2), 0.d0, U(1,1,iter+1), size(U,1))
-      call dgemm('N','N',sze,N_st_diag,N_st_diag*iter,               &
-          1.d0, W, size(W,1), y, size(y,1)*size(y,2), 0.d0, W(1,1,iter+1), size(W,1))
-      call dgemm('N','N',sze,N_st_diag,1,                            &
-          1.d0, S, size(S,1), y, size(y,1)*size(y,2), 0.d0, S(1,1,iter+1), size(S,1))
+      call dgemm('N','N', sze, N_st_diag, shift2,                    &
+          1.d0, U, size(U,1), y, size(y,1), 0.d0, U(1,shift2+1), size(U,1))
+      call dgemm('N','N', sze, N_st_diag, shift2,                    &
+          1.d0, W, size(W,1), y, size(y,1), 0.d0, W(1,shift2+1), size(W,1))
+      call dgemm('N','N', sze, N_st_diag, shift2,                    &
+          1.d0, S, size(S,1), y, size(y,1), 0.d0, S(1,shift2+1), size(S,1))

      ! Compute residual vector
      ! -----------------------
      
-      do k=1,N_st_diag
-        s2(k) = u_dot_v(U(1,k,iter+1), S(1,k,iter+1), sze)
-      enddo
-
+!      do k=1,N_st_diag
+!        print *,  s2(k)
+!        s2(k) = u_dot_v(U(1,shift2+k),  S(1,shift2+k), sze) + S_z2_Sz
+!        print *,  s2(k)
+!        print *,  ''
+!        pause
+!      enddo
      do k=1,N_st_diag
        do i=1,sze
-          R(i,k) = (lambda(k) * U(i,k,iter+1) - W(i,k,iter+1) ) &
-            * (1.d0 + s2(k) * U(i,k,iter+1) - S(i,k,iter+1) )
+          R(i,k) = (lambda(k) * U(i,shift2+k) - W(i,shift2+k) ) &
+            * (1.d0 + s2(k) * U(i,shift2+k) - S(i,shift2+k) - S_z2_Sz)
        enddo
        if (k <= N_st) then
          residual_norm(k) = u_dot_u(R(1,k),sze)
          to_print(1,k) = lambda(k) + nuclear_repulsion
          to_print(2,k) = s2(k)
          to_print(3,k) = residual_norm(k)
+          if (residual_norm(k) > 1.e9) then
+            stop 'Davidson failed'
+          endif
        endif
      enddo
      
@ -297,7 +351,7 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
      
      do k=1,N_st_diag
        do i=1,sze
-          U(i,k,iter+1) =  - R(i,k)/max(H_jj(i) - lambda(k),1.d-2) 
+          U(i,shift2+k) =  - R(i,k)/max(H_jj(i) - lambda(k),1.d-2) 
        enddo
      enddo
      
@ -306,33 +360,31 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
      
      do k=1,N_st_diag

-!        do iter2=1,iter
-!          do l=1,N_st_diag
-!            c(1) = u_dot_v(U(1,k,iter+1),U(1,l,iter2),sze)
+!        do l=1,N_st_diag*iter
+!            c(1) = u_dot_v(U(1,shift2+k),U(1,l),sze)
 !            do i=1,sze
-!              U(i,k,iter+1) = U(i,k,iter+1) - c(1) * U(i,l,iter2)
+!              U(i,k,iter+1) = U(i,shift2+k) - c(1) * U(i,l)
 !            enddo
-!          enddo
 !        enddo
 !
        call dgemv('T',sze,N_st_diag*iter,1.d0,U,size(U,1),  &
-              U(1,k,iter+1),1,0.d0,c,1)
+              U(1,shift2+k),1,0.d0,c,1)
        call dgemv('N',sze,N_st_diag*iter,-1.d0,U,size(U,1), &
-              c,1,1.d0,U(1,k,iter+1),1)
+              c,1,1.d0,U(1,shift2+k),1)
 !
 !        do l=1,k-1
-!          c(1) = u_dot_v(U(1,k,iter+1),U(1,l,iter+1),sze)
+!          c(1) = u_dot_v(U(1,shift2+k),U(1,shift2+l),sze)
 !          do i=1,sze
-!            U(i,k,iter+1) = U(i,k,iter+1) - c(1) * U(i,l,iter+1)
+!            U(i,k,iter+1) = U(i,shift2+k) - c(1) * U(i,shift2+l)
 !          enddo
 !        enddo
 !
-        call dgemv('T',sze,k-1,1.d0,U(1,1,iter+1),size(U,1),   &
-            U(1,k,iter+1),1,0.d0,c,1)
-        call dgemv('N',sze,k-1,-1.d0,U(1,1,iter+1),size(U,1),        &
-            c,1,1.d0,U(1,k,iter+1),1)
+        call dgemv('T',sze,k-1,1.d0,U(1,shift2+1),size(U,1),   &
+            U(1,shift2+k),1,0.d0,c,1)
+        call dgemv('N',sze,k-1,-1.d0,U(1,shift2+1),size(U,1),        &
+            c,1,1.d0,U(1,shift2+k),1)

-        call normalize( U(1,k,iter+1), sze )
+        call normalize( U(1,shift2+k), sze )
      enddo

    enddo
@ -346,23 +398,19 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
    
    do k=1,N_st_diag
      energies(k) = lambda(k)
-      do i=1,sze
-        u_in(i,k) = 0.d0
-      enddo
    enddo
+
 !    do k=1,N_st_diag
 !      do i=1,sze
-!        do iter2=1,iter
-!          do l=1,N_st_diag
-!            u_in(i,k) += U(i,l,iter2)*y(l,iter2,k,1)
+!        do l=1,iter*N_st_diag
+!            u_in(i,k) += U(i,l)*y(l,k)
 !          enddo
 !        enddo
 !      enddo
 !    enddo

    call dgemm('N','N', sze, N_st_diag, N_st_diag*iter, 1.d0,      &
-        U, size(U,1), y, N_st_diag*davidson_sze_max, &
-        0.d0, u_in, size(u_in,1))
+        U, size(U,1), y, size(y,1), 0.d0, u_in, size(u_in,1))

  enddo

@ -377,10 +425,10 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
  deallocate (                                                       &
      kl_pairs,                                                      &
      W, residual_norm,                                              &
-      U, overlap,                                                    &
-      R, c,                                                          &
+      U,                                                             &
+      R, c, S,                                                       &
      h,                                                             &
-      y,                                                             &
+      y, s_, s_tmp,                                                  &
      lambda                                                         &
      )
 end
--- a/src/Davidson/diagonalize_CI.irp.f
+++ b/src/Davidson/diagonalize_CI.irp.f
@ -55,12 +55,16 @@ END_PROVIDER
   
   if (diag_algorithm == "Davidson") then
     
+!     call davidson_diag(psi_det,CI_eigenvectors,CI_electronic_energy,  &
+!         size(CI_eigenvectors,1),               &
+!         N_det,min(N_det,N_states),min(N_det,N_states_diag),N_int,output_determinants)
+!
     call davidson_diag_HS2(psi_det,CI_eigenvectors,                 &
         size(CI_eigenvectors,1),CI_electronic_energy,               &
-         N_det,N_states,N_states_diag,N_int,output_determinants)
+         N_det,min(N_det,N_states),min(N_det,N_states_diag),N_int,output_determinants)

     call u_0_S2_u_0(CI_eigenvectors_s2,CI_eigenvectors,N_det,psi_det,N_int,&
-         N_states_diag,size(CI_eigenvectors,1))
+         min(N_det,N_states_diag),size(CI_eigenvectors,1))

     
   else if (diag_algorithm == "Lapack") then
@ -145,71 +149,6 @@ END_PROVIDER
     deallocate(eigenvectors,eigenvalues)
   endif
   
-
-   if( s2_eig.and.(N_states_diag > 1).and.(N_det >= N_states_diag) )then
-      ! Diagonalizing S^2 within the "n_states_diag" states found
-      allocate(s2_eigvalues(N_states_diag), e_array(N_states_diag))
-      call diagonalize_s2_betweenstates(psi_det,CI_eigenvectors,N_det,size(psi_det,3), &
-        size(CI_eigenvectors,1),min(n_states_diag,n_det),s2_eigvalues)
-      
-      double precision, allocatable :: psi_coef_tmp(:,:)
-      allocate(psi_coef_tmp(psi_det_size,N_states_diag))
-      do j = 1, N_states_diag
-        do i = 1, N_det
-          psi_coef_tmp(i,j) = CI_eigenvectors(i,j)
-        enddo
-      enddo
-      call u_0_H_u_0(e_array,psi_coef_tmp,n_det,psi_det,N_int,N_states_diag,psi_det_size)
-     
-      ! Browsing the "n_states_diag" states and getting the lowest in energy "n_states" ones that have the S^2 value
-      ! closer to the "expected_s2" set as input
-      
-      allocate(index_good_state_array(N_det),good_state_array(N_det))
-      good_state_array = .False.
-      i_state = 0
-      do j = 1, N_states_diag
-        if(dabs(s2_eigvalues(j)-expected_s2).le.0.5d0)then
-          good_state_array(j) = .True.
-          i_state +=1
-          index_good_state_array(i_state) = j
-        endif
-      enddo
-      ! Sorting the i_state good states by energy
-      allocate(iorder(i_state))
-      do j = 1, i_state
-        do i = 1, N_det
-          CI_eigenvectors(i,j) = psi_coef_tmp(i,index_good_state_array(j))
-        enddo
-        CI_eigenvectors_s2(j) = s2_eigvalues(index_good_state_array(j))
-        CI_electronic_energy(j) = e_array(j)
-        iorder(j) = j
-      enddo
-      call dsort(e_array,iorder,i_state)
-      do j = 1, i_state
-        CI_electronic_energy(j) = e_array(j)
-        CI_eigenvectors_s2(j) = s2_eigvalues(index_good_state_array(iorder(j)))
-        do i = 1, N_det
-          CI_eigenvectors(i,j) = psi_coef_tmp(i,index_good_state_array(iorder(j)))
-        enddo
-      enddo
-      
-      ! Then setting the other states without any specific energy order
-      i_other_state = 0
-      do j = 1, N_states_diag
-        if(good_state_array(j))cycle
-        i_other_state +=1
-        do i = 1, N_det
-          CI_eigenvectors(i,i_state + i_other_state) = psi_coef_tmp(i,j)
-        enddo
-        CI_eigenvectors_s2(i_state + i_other_state) = s2_eigvalues(j)
-        CI_electronic_energy(i_state + i_other_state) = e_array(i_state + i_other_state)
-      enddo
-      deallocate(iorder,e_array,index_good_state_array,good_state_array,psi_coef_tmp)
-       
-     deallocate(s2_eigvalues)
-     
-   endif
-   
 END_PROVIDER
 
 subroutine diagonalize_CI
--- a/src/Davidson/u0Hu0.irp.f
+++ b/src/Davidson/u0Hu0.irp.f
@ -58,8 +58,8 @@ subroutine H_u_0_nstates(v_0,u_0,H_jj,n,keys_tmp,Nint,N_st,sze_8)
  integer, external              :: align_double
  !!!DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: vt, ut

-  if(N_st /= N_states) stop "H_u_0_nstates N_st /= N_states"
-  N_st_8 = N_states ! align_double(N_st)
+  if(N_st /= N_states_diag) stop "H_u_0_nstates N_st /= N_states_diag"
+  N_st_8 = N_states_diag ! align_double(N_st)

  ASSERT (Nint > 0)
  ASSERT (Nint == N_int)
@ -214,7 +214,7 @@ subroutine H_S2_u_0_nstates(v_0,s_0,u_0,H_jj,S2_jj,n,keys_tmp,Nint,N_st,sze_8)
  
  integer(ZMQ_PTR) :: handler
  
-  if(N_st /= N_states .or. sze_8 < N_det) stop "assert fail in H_S2_u_0_nstates"
+  if(N_st /= N_states_diag .or. sze_8 < N_det) stop "assert fail in H_S2_u_0_nstates"
  N_st_8 = N_st !! align_double(N_st)

  ASSERT (Nint > 0)
@ -249,7 +249,7 @@ subroutine H_S2_u_0_nstates(v_0,s_0,u_0,H_jj,S2_jj,n,keys_tmp,Nint,N_st,sze_8)
  call davidson_init(handler)
  do sh=shortcut(0,1),1,-1
    workload += (shortcut(sh+1,1) - shortcut(sh,1))**2
-    if(workload > 10000) then
+    if(workload > 100000) then
      blocke = sh
      call davidson_add_task(handler, blocke, blockb)
      blockb = sh-1
@ -257,13 +257,13 @@ subroutine H_S2_u_0_nstates(v_0,s_0,u_0,H_jj,S2_jj,n,keys_tmp,Nint,N_st,sze_8)
    end if
  enddo
  
-  if(blockb) call davidson_add_task(handler, 1, blockb)
-
+  if(blockb > 0) call davidson_add_task(handler, 1, blockb)
  call davidson_run(handler, v_0, s_0)
-  
+
  !$OMP PARALLEL DEFAULT(NONE)                                       &
      !$OMP PRIVATE(i,hij,s2,j,k,jj,vt,st,ii,sh,sh2,ni,exa,ext,org_i,org_j,endi,sorted_i,istate)&
      !$OMP SHARED(n,keys_tmp,ut,Nint,v_0,s_0,sorted,shortcut,sort_idx,version,N_st,N_st_8)
+
  allocate(vt(N_st_8,n),st(N_st_8,n))
  Vt = 0.d0
  St = 0.d0
@ -311,6 +311,7 @@ subroutine H_S2_u_0_nstates(v_0,s_0,u_0,H_jj,S2_jj,n,keys_tmp,Nint,N_st,sze_8)
      s_0(i,istate) = s_0(i,istate) + s2_jj(i)* u_0(i,istate)
    enddo
  enddo
+
  deallocate (shortcut, sort_idx, sorted, version)
 end

--- a/src/Determinants/psi_cas.irp.f
+++ b/src/Determinants/psi_cas.irp.f
@ -21,9 +21,9 @@ use bitmasks
      do k=1,N_int
        good = good .and. (                                          &
            iand(not(cas_bitmask(k,1,l)), psi_det(k,1,i)) ==         &
-            iand(not(cas_bitmask(k,1,l)), psi_det(k,1,1)) ) .and. (  &
+            iand(not(cas_bitmask(k,1,l)), hf_bitmask(k,1)) ) .and. (  &
            iand(not(cas_bitmask(k,2,l)), psi_det(k,2,i)) ==         &
-            iand(not(cas_bitmask(k,2,l)), psi_det(k,2,1)) )
+            iand(not(cas_bitmask(k,2,l)), hf_bitmask(k,2)) )
      enddo
      if (good) then
        exit
--- a/src/Integrals_Bielec/map_integrals.irp.f
+++ b/src/Integrals_Bielec/map_integrals.irp.f
@ -109,6 +109,42 @@ subroutine bielec_integrals_index_reverse(i,j,k,l,i1)

 end

+ BEGIN_PROVIDER [ integer, ao_integrals_cache_min ]
+&BEGIN_PROVIDER [ integer, ao_integrals_cache_max ]
+ implicit none
+ BEGIN_DOC
+ ! Min and max values of the AOs for which the integrals are in the cache
+ END_DOC
+ ao_integrals_cache_min = max(1,ao_num - 63)
+ ao_integrals_cache_max = ao_num
+
+END_PROVIDER
+
+BEGIN_PROVIDER [ double precision, ao_integrals_cache, (ao_integrals_cache_min:ao_integrals_cache_max,ao_integrals_cache_min:ao_integrals_cache_max,ao_integrals_cache_min:ao_integrals_cache_max,ao_integrals_cache_min:ao_integrals_cache_max) ]
+ implicit none
+ BEGIN_DOC
+ ! Cache of AO integrals for fast access
+ END_DOC
+ PROVIDE ao_bielec_integrals_in_map
+ integer                        :: i,j,k,l
+ integer(key_kind)              :: idx
+ !$OMP PARALLEL DO PRIVATE (i,j,k,l,idx)
+ do l=ao_integrals_cache_min,ao_integrals_cache_max
+   do k=ao_integrals_cache_min,ao_integrals_cache_max
+     do j=ao_integrals_cache_min,ao_integrals_cache_max
+       do i=ao_integrals_cache_min,ao_integrals_cache_max
+         !DIR$ FORCEINLINE
+         call bielec_integrals_index(i,j,k,l,idx)
+         !DIR$ FORCEINLINE
+         call map_get(ao_integrals_map,idx,ao_integrals_cache(i,j,k,l))
+       enddo
+     enddo
+   enddo
+ enddo
+ !$OMP END PARALLEL DO
+
+END_PROVIDER
+

 double precision function get_ao_bielec_integral(i,j,k,l,map)
  use map_module
@ -127,8 +163,20 @@ double precision function get_ao_bielec_integral(i,j,k,l,map)
  else if (ao_bielec_integral_schwartz(i,k)*ao_bielec_integral_schwartz(j,l) < ao_integrals_threshold) then
    tmp = 0.d0
  else
-    call bielec_integrals_index(i,j,k,l,idx)
-    call map_get(map,idx,tmp)
+    if ( (i >= ao_integrals_cache_min) .and. &
+        (j >= ao_integrals_cache_min) .and. &
+        (k >= ao_integrals_cache_min) .and. &
+        (l >= ao_integrals_cache_min) .and. &
+        (i <= ao_integrals_cache_max) .and. &
+        (j <= ao_integrals_cache_max) .and. &
+        (k <= ao_integrals_cache_max) .and. &
+        (l <= ao_integrals_cache_max) ) then
+        tmp = ao_integrals_cache(i,j,k,l)
+    else
+      !DIR$ FORCEINLINE
+      call bielec_integrals_index(i,j,k,l,idx)
+      call map_get(map,idx,tmp)
+    endif
  endif
  get_ao_bielec_integral = tmp
 end
@ -155,16 +203,9 @@ subroutine get_ao_bielec_integrals(j,k,l,sze,out_val)
    return
  endif
  
+  double precision :: get_ao_bielec_integral
  do i=1,sze
-    if (ao_overlap_abs(i,k)*ao_overlap_abs(j,l) < thresh ) then
-      out_val(i) = 0.d0
-    else if (ao_bielec_integral_schwartz(i,k)*ao_bielec_integral_schwartz(j,l) < thresh) then
-      out_val(i)=0.d0
-    else
-      !DIR$ FORCEINLINE
-      call bielec_integrals_index(i,j,k,l,hash)
-      call map_get(ao_integrals_map, hash, out_val(i))
-    endif
+    out_val(i) = get_ao_bielec_integral(i,j,k,l,ao_integrals_map)
  enddo
  
 end
@ -276,6 +317,43 @@ subroutine insert_into_mo_integrals_map(n_integrals,                 &
  call map_update(mo_integrals_map, buffer_i, buffer_values, n_integrals, thr)
 end

+ BEGIN_PROVIDER [ integer, mo_integrals_cache_min ]
+&BEGIN_PROVIDER [ integer, mo_integrals_cache_max ]
+ implicit none
+ BEGIN_DOC
+ ! Min and max values of the MOs for which the integrals are in the cache
+ END_DOC
+ mo_integrals_cache_min = max(1,elec_alpha_num - 31)
+ mo_integrals_cache_max = min(mo_tot_num,elec_alpha_num + 32)
+
+END_PROVIDER
+
+BEGIN_PROVIDER [ double precision, mo_integrals_cache, (mo_integrals_cache_min:mo_integrals_cache_max,mo_integrals_cache_min:mo_integrals_cache_max,mo_integrals_cache_min:mo_integrals_cache_max,mo_integrals_cache_min:mo_integrals_cache_max) ]
+ implicit none
+ BEGIN_DOC
+ ! Cache of MO integrals for fast access
+ END_DOC
+ PROVIDE mo_bielec_integrals_in_map
+ integer                        :: i,j,k,l
+ integer(key_kind)              :: idx
+ FREE ao_integrals_cache
+ !$OMP PARALLEL DO PRIVATE (i,j,k,l,idx)
+ do l=mo_integrals_cache_min,mo_integrals_cache_max
+   do k=mo_integrals_cache_min,mo_integrals_cache_max
+     do j=mo_integrals_cache_min,mo_integrals_cache_max
+       do i=mo_integrals_cache_min,mo_integrals_cache_max
+         !DIR$ FORCEINLINE
+         call bielec_integrals_index(i,j,k,l,idx)
+         !DIR$ FORCEINLINE
+         call map_get(mo_integrals_map,idx,mo_integrals_cache(i,j,k,l))
+       enddo
+     enddo
+   enddo
+ enddo
+ !$OMP END PARALLEL DO
+
+END_PROVIDER
+
 double precision function get_mo_bielec_integral(i,j,k,l,map)
  use map_module
  implicit none
@ -287,11 +365,22 @@ double precision function get_mo_bielec_integral(i,j,k,l,map)
  type(map_type), intent(inout)  :: map
  real(integral_kind)            :: tmp
  PROVIDE mo_bielec_integrals_in_map
-  !DIR$ FORCEINLINE
-  call bielec_integrals_index(i,j,k,l,idx)
-  !DIR$ FORCEINLINE
-  call map_get(map,idx,tmp)
-  get_mo_bielec_integral = dble(tmp)
+  if ( (i >= mo_integrals_cache_min) .and. &
+       (j >= mo_integrals_cache_min) .and. &
+       (k >= mo_integrals_cache_min) .and. &
+       (l >= mo_integrals_cache_min) .and. &
+       (i <= mo_integrals_cache_max) .and. &
+       (j <= mo_integrals_cache_max) .and. &
+       (k <= mo_integrals_cache_max) .and. &
+       (l <= mo_integrals_cache_max) ) then
+       get_mo_bielec_integral = mo_integrals_cache(i,j,k,l)
+  else
+    !DIR$ FORCEINLINE
+    call bielec_integrals_index(i,j,k,l,idx)
+    !DIR$ FORCEINLINE
+    call map_get(map,idx,tmp)
+    get_mo_bielec_integral = dble(tmp)
+  endif
 end

 double precision function get_mo_bielec_integral_schwartz(i,j,k,l,map)
@ -306,14 +395,12 @@ double precision function get_mo_bielec_integral_schwartz(i,j,k,l,map)
  real(integral_kind)            :: tmp
  PROVIDE mo_bielec_integrals_in_map
  if (mo_bielec_integral_schwartz(i,k)*mo_bielec_integral_schwartz(j,l) > mo_integrals_threshold) then
+    double precision, external :: get_mo_bielec_integral
    !DIR$ FORCEINLINE
-    call bielec_integrals_index(i,j,k,l,idx)
-    !DIR$ FORCEINLINE
-    call map_get(map,idx,tmp)
+    get_mo_bielec_integral_schwartz = get_mo_bielec_integral(i,j,k,l,map)
  else
    tmp = 0.d0
  endif
-  get_mo_bielec_integral_schwartz = dble(tmp)
 end


@ -325,6 +412,7 @@ double precision function mo_bielec_integral(i,j,k,l)
  integer, intent(in)            :: i,j,k,l
  double precision               :: get_mo_bielec_integral
  PROVIDE mo_bielec_integrals_in_map
+  !DIR$ FORCEINLINE
  mo_bielec_integral = get_mo_bielec_integral(i,j,k,l,mo_integrals_map)
  return
 end
--- a/src/Integrals_Monoelec/kin_ao_ints.irp.f
+++ b/src/Integrals_Monoelec/kin_ao_ints.irp.f
@ -131,8 +131,8 @@ BEGIN_PROVIDER [double precision, ao_kinetic_integral, (ao_num_align,ao_num)]
  integer                        :: i,j,k,l
  
  if (read_ao_one_integrals) then
-    call read_one_e_integrals('ao_kinetic_integral', ao_kinetic_integral,&
-       size(ao_kinetic_integral,1), size(ao_kinetic_integral,2))
+     call ezfio_get_ao_basis_integral_kinetic(ao_kinetic_integral(1:ao_num, 1:ao_num))
+ call ezfio_set_ao_basis_integral_kinetic(ao_kinetic_integral(1:ao_num, 1:ao_num))
    print *,  'AO kinetic integrals read from disk'
  else
    !$OMP PARALLEL DO DEFAULT(NONE) &
@ -151,8 +151,7 @@ BEGIN_PROVIDER [double precision, ao_kinetic_integral, (ao_num_align,ao_num)]
    !$OMP END PARALLEL DO
  endif
  if (write_ao_one_integrals) then
-    call write_one_e_integrals('ao_kinetic_integral', ao_kinetic_integral,&
-       size(ao_kinetic_integral,1), size(ao_kinetic_integral,2))
+    call ezfio_set_ao_basis_integral_kinetic(ao_kinetic_integral(1:ao_num, 1:ao_num))
    print *,  'AO kinetic integrals written to disk'
  endif
 END_PROVIDER
--- a/src/Integrals_Monoelec/pot_ao_ints.irp.f
+++ b/src/Integrals_Monoelec/pot_ao_ints.irp.f
@ -11,8 +11,7 @@ BEGIN_PROVIDER [ double precision, ao_nucl_elec_integral, (ao_num_align,ao_num)]
   double precision               :: overlap_x,overlap_y,overlap_z,overlap,dx,NAI_pol_mult
   
   if (read_ao_one_integrals) then
-     call read_one_e_integrals('ao_ne_integral', ao_nucl_elec_integral, &
-         size(ao_nucl_elec_integral,1), size(ao_nucl_elec_integral,2))
+     call ezfio_get_ao_basis_integral_nuclear(ao_nucl_elec_integral(1:ao_num, 1:ao_num))
     print *,  'AO N-e integrals read from disk'
   else
     
@ -74,8 +73,7 @@ BEGIN_PROVIDER [ double precision, ao_nucl_elec_integral, (ao_num_align,ao_num)]
     !$OMP END PARALLEL
   endif
   if (write_ao_one_integrals) then
-     call write_one_e_integrals('ao_ne_integral', ao_nucl_elec_integral, &
-         size(ao_nucl_elec_integral,1), size(ao_nucl_elec_integral,2))
+     call ezfio_set_ao_basis_integral_nuclear(ao_nucl_elec_integral(1:ao_num, 1:ao_num))
     print *,  'AO N-e integrals written to disk'
   endif
   
--- a/src/Integrals_Monoelec/pot_ao_pseudo_ints.irp.f
+++ b/src/Integrals_Monoelec/pot_ao_pseudo_ints.irp.f
@ -53,6 +53,13 @@ BEGIN_PROVIDER [ double precision, ao_pseudo_integral_local, (ao_num_align,ao_nu
  call wall_time(wall_1)
  call cpu_time(cpu_1)
  
+!write(33,*) 'xxxLOCxxx'
+!write(33,*)   'pseudo_klocmax', pseudo_klocmax
+!write(33,*)  'pseudo_v_k_transp ', pseudo_v_k_transp 
+!write(33,*)  'pseudo_n_k_transp ', pseudo_n_k_transp 
+!write(33,*)  'pseudo_dz_k_transp', pseudo_dz_k_transp
+!write(33,*) 'xxxLOCxxx'
+
  thread_num = 0
  !$OMP PARALLEL                                                     &
      !$OMP DEFAULT (NONE)                                           &
@ -102,7 +109,15 @@ BEGIN_PROVIDER [ double precision, ao_pseudo_integral_local, (ao_num_align,ao_nu
                pseudo_n_k_transp (1,k), & 
                pseudo_dz_k_transp(1,k), & 
                A_center,power_A,alpha,B_center,power_B,beta,C_center)
-            
+!  write(33,*) i,j,k
+!  write(33,*) A_center,power_A,alpha,B_center,power_B,beta,C_center, &
+!      Vloc(pseudo_klocmax, &
+!        pseudo_v_k_transp (1,k), &
+!        pseudo_n_k_transp (1,k), &
+!        pseudo_dz_k_transp(1,k), &
+!        A_center,power_A,alpha,B_center,power_B,beta,C_center)
+!  write(33,*)
+                    
          enddo
          ao_pseudo_integral_local(i,j) = ao_pseudo_integral_local(i,j) +&
              ao_coef_normalized_ordered_transp(l,j)*ao_coef_normalized_ordered_transp(m,i)*c
@ -123,7 +138,6 @@ BEGIN_PROVIDER [ double precision, ao_pseudo_integral_local, (ao_num_align,ao_nu
 !$OMP END DO
 !$OMP END PARALLEL

-
 END_PROVIDER


@ -151,6 +165,13 @@ BEGIN_PROVIDER [ double precision, ao_pseudo_integral_local, (ao_num_align,ao_nu
  call wall_time(wall_1)
  call cpu_time(cpu_1)
  thread_num = 0
+!write(34,*) 'xxxNONLOCxxx'
+!write(34,*)  ' pseudo_lmax,pseudo_kmax', pseudo_lmax,pseudo_kmax
+!write(34,*)  ' pseudo_v_kl_transp(1,0,k)', pseudo_v_kl_transp
+!write(34,*)  ' pseudo_n_kl_transp(1,0,k)', pseudo_n_kl_transp
+!write(34,*)  ' pseudo_dz_kl_transp(1,0,k)', pseudo_dz_kl_transp
+!write(34,*) 'xxxNONLOCxxx'
+
  !$OMP PARALLEL                                                     &
      !$OMP DEFAULT (NONE)                                           &
      !$OMP PRIVATE (i,j,k,l,m,alpha,beta,A_center,B_center,C_center,power_A,power_B,&
@ -201,6 +222,15 @@ BEGIN_PROVIDER [ double precision, ao_pseudo_integral_local, (ao_num_align,ao_nu
                    pseudo_n_kl_transp(1,0,k),  &
                    pseudo_dz_kl_transp(1,0,k), &
                    A_center,power_A,alpha,B_center,power_B,beta,C_center)
+!  write(34,*) i,j,k
+!  write(34,*)  &
+!          A_center,power_A,alpha,B_center,power_B,beta,C_center, &
+!       Vpseudo(pseudo_lmax,pseudo_kmax, &
+!          pseudo_v_kl_transp(1,0,k),  &
+!          pseudo_n_kl_transp(1,0,k),  &
+!          pseudo_dz_kl_transp(1,0,k), &
+!          A_center,power_A,alpha,B_center,power_B,beta,C_center)
+!  write(34,*) ''
          enddo
          ao_pseudo_integral_non_local(i,j) = ao_pseudo_integral_non_local(i,j) +&
              ao_coef_normalized_ordered_transp(l,j)*ao_coef_normalized_ordered_transp(m,i)*c
@ -223,7 +253,6 @@ BEGIN_PROVIDER [ double precision, ao_pseudo_integral_local, (ao_num_align,ao_nu
  !$OMP END PARALLEL
  
  
-  
 END_PROVIDER

 BEGIN_PROVIDER [ double precision, pseudo_v_k_transp, (pseudo_klocmax,nucl_num) ]
--- a/src/Integrals_Monoelec/pseudopot.f90
+++ b/src/Integrals_Monoelec/pseudopot.f90
@ -136,7 +136,7 @@ end

         if(l.eq.2.and.m.eq.2)ylm_real=dsqrt(15.d0/16.d0/pi)*(xchap*xchap-ychap*ychap)
         if(l.eq.2.and.m.eq.1)ylm_real=dsqrt(15.d0/fourpi)*xchap*zchap
-         if(l.eq.2.and.m.eq.0)ylm_real=dsqrt(5.d0/16.d0/pi)*(-xchap*xchap-ychap*ychap+2.d0*zchap*zchap)
+         if(l.eq.2.and.m.eq.0)ylm_real=dsqrt(5.d0/16.d0/pi)*(2.d0*zchap*zchap-xchap*xchap-ychap*ychap)
         if(l.eq.2.and.m.eq.-1)ylm_real=dsqrt(15.d0/fourpi)*ychap*zchap
         if(l.eq.2.and.m.eq.-2)ylm_real=dsqrt(15.d0/fourpi)*xchap*ychap

@ -276,30 +276,16 @@ if(ac.eq.0.d0.and.bc.eq.0.d0)then
 do k=1,kmax
  do l=0,lmax
   ktot=ntot+n_kl(k,l)
+   if (v_kl(k,l) == 0.d0) cycle
   do m=-l,l
    prod=bigI(0,0,l,m,n_a(1),n_a(2),n_a(3))
+    if (prod == 0.d0) cycle
    prodp=bigI(0,0,l,m,n_b(1),n_b(2),n_b(3))
-
-      accu=accu+prod*prodp*v_kl(k,l)*int_prod_bessel(ktot+2,g_a+g_b+dz_kl(k,l),0,0,areal,breal,arg)
-
+    if (prodp == 0.d0) cycle
+     accu=accu+prod*prodp*v_kl(k,l)*int_prod_bessel(ktot+2,g_a+g_b+dz_kl(k,l),0,0,areal,breal,arg)
   enddo
  enddo
 enddo
-! do k=1,kmax
-!   do l=0,lmax
-!     ktot=ntot+n_kl(k,l)
-!     do m=-l,l
-!       prod =bigI(0,0,l,m,n_a(1),n_a(2),n_a(3))*v_kl(k,l)
-!       prodp=bigI(0,0,l,m,n_b(1),n_b(2),n_b(3))*prod
-!       if (dabs (prodp) < 1.d-15) then
-!         cycle
-!       endif
-!       
-!       accu=accu+prodp*int_prod_bessel(ktot+2,g_a+g_b+dz_kl(k,l),0,0,areal,breal,arg)
-!       
-!     enddo
-!   enddo
-! enddo

 !=!=!=!=!
 ! E n d !
@ -386,14 +372,17 @@ else if(ac.ne.0.d0.and.bc.ne.0.d0)then
     enddo
     
     do k3=0,n_a(3)
+       if (array_coefs_A(k3,3) == 0.d0) cycle
       do k2=0,n_a(2)
+         if (array_coefs_A(k2,2) == 0.d0) cycle
         do k1=0,n_a(1)
-           
+           if (array_coefs_A(k1,1) == 0.d0) cycle
+
           do lambda=0,l+ntotA
             do mu=-lambda,lambda
               
               prod=ylm(lambda,mu,theta_AC0,phi_AC0)*array_coefs_A(k1,1)*array_coefs_A(k2,2)*array_coefs_A(k3,3)*array_I_A(mu,lambda,k1,k2,k3)
-               
+               if (prod == 0.d0) cycle
               
               do k3p=0,n_b(3)
                 do k2p=0,n_b(2)
@ -405,6 +394,7 @@ else if(ac.ne.0.d0.and.bc.ne.0.d0)then
                            array_coefs_B(k1p,1)*array_coefs_B(k2p,2)*array_coefs_B(k3p,3)* &
                            array_I_B(mup,lambdap,k1p,k2p,k3p)
                         
+                         if (prodp == 0.d0) cycle
                         do k=1,kmax
                           ktot=k1+k2+k3+k1p+k2p+k3p+n_kl(k,l)
                           accu=accu+prodp*v_kl(k,l)*array_R(k,ktot,l,lambda,lambdap)
@ -490,13 +480,18 @@ else if(ac.eq.0.d0.and.bc.ne.0.d0)then
     prod=bigI(0,0,l,m,n_a(1),n_a(2),n_a(3))
     
     do k3p=0,n_b(3)
+       if (array_coefs_B(k3p,3) == 0.d0) cycle
       do k2p=0,n_b(2)
+         if (array_coefs_B(k2p,2) == 0.d0) cycle
         do k1p=0,n_b(1)
+           if (array_coefs_B(k1p,1) == 0.d0) cycle
           do lambdap=0,l+ntotB
             do mup=-lambdap,lambdap
               
               prodp=prod*array_coefs_B(k1p,1)*array_coefs_B(k2p,2)*array_coefs_B(k3p,3)*ylm(lambdap,mup,theta_BC0,phi_BC0)*array_I_B(mup,lambdap,k1p,k2p,k3p)
               
+               if (prodp == 0.d0) cycle
+
               do k=1,kmax
                 
                 ktot=ntotA+k1p+k2p+k3p+n_kl(k,l)
@ -573,13 +568,19 @@ else if(ac.ne.0.d0.and.bc.eq.0.d0)then
    enddo

    do k3=0,n_a(3)
+      if (array_coefs_A(k3,3) == 0.d0) cycle
      do k2=0,n_a(2)
+        if (array_coefs_A(k2,2) == 0.d0) cycle
        do k1=0,n_a(1)
+          if (array_coefs_A(k1,1) == 0.d0) cycle
          do lambda=0,l+ntotA
            do mu=-lambda,lambda
              
              prod=array_coefs_A(k1,1)*array_coefs_A(k2,2)*array_coefs_A(k3,3)*ylm(lambda,mu,theta_AC0,phi_AC0)*array_I_A(mu,lambda,k1,k2,k3)
+              if (prod == 0.d0) cycle
              prodp=prod*bigI(0,0,l,m,n_b(1),n_b(2),n_b(3))
+
+              if (prodp == 0.d0) cycle
              
              do k=1,kmax
                ktot=k1+k2+k3+ntotB+n_kl(k,l)
@ -812,18 +813,22 @@ double precision int_prod_bessel_loc,binom_func,accu,prod,ylm,bigI,arg
   phi_DC0=datan2(d(2)/d2,d(1)/d2)
   
   do k=1,klocmax
+     if (v_k(k) == 0.d0) cycle
     do k1=0,n_a(1)
       do k2=0,n_a(2)
         do k3=0,n_a(3)
           do k1p=0,n_b(1)
             do k2p=0,n_b(2)
               do k3p=0,n_b(3)
+                 if (array_coefs(k1,k2,k3,k1p,k2p,k3p) == 0.d0) cycle
                 do l=0,ntot
                   do m=-l,l
                     coef=ylm(l,m,theta_DC0,phi_DC0)
+                     if (coef == 0.d0) cycle
+                     ktot=k1+k2+k3+k1p+k2p+k3p+n_k(k)
+                     if (array_R_loc(ktot,k,l) == 0.d0) cycle
                     prod=coef*array_coefs(k1,k2,k3,k1p,k2p,k3p)     &
                         *bigI(l,m,0,0,k1+k1p,k2+k2p,k3+k3p)
-                     ktot=k1+k2+k3+k1p+k2p+k3p+n_k(k)
                     accu=accu+prod*v_k(k)*array_R_loc(ktot,k,l)
                   enddo
                 enddo
@ -864,18 +869,24 @@ double precision pi,sum,factor1,factor2,cylm,cylmp,bigA,binom_func,fact,coef_pm
 double precision sgn, sgnp
 pi=dacos(-1.d0)

+bigI=0.d0
 if(mu.gt.0.and.m.gt.0)then
 sum=0.d0
 factor1=dsqrt((2*lambda+1)*fact(lambda-iabs(mu))/(2.d0*pi*fact(lambda+iabs(mu))))
+if (factor1== 0.d0) return
 factor2=dsqrt((2*l+1)*fact(l-iabs(m))/(2.d0*pi*fact(l+iabs(m))))
+if (factor2== 0.d0) return
 sgn = 1.d0
 do k=0,mu/2
 do i=0,lambda-mu
+  if (coef_pm(lambda,i+mu) == 0.d0) cycle
  sgnp = 1.d0
  do kp=0,m/2
   do ip=0,l-m
    cylm=sgn*factor1*binom_func(mu,2*k)*fact(mu+i)/fact(i)*coef_pm(lambda,i+mu)
+    if (cylm == 0.d0) cycle
    cylmp=sgnp*factor2*binom_func(m,2*kp)*fact(m+ip)/fact(ip)*coef_pm(l,ip+m)
+    if (cylmp == 0.d0) cycle
    sum=sum+cylm*cylmp*bigA(mu-2*k+m-2*kp+k1,2*k+2*kp+k2,i+ip+k3)
   enddo
   sgnp = -sgnp
@ -889,12 +900,16 @@ endif

 if(mu.eq.0.and.m.eq.0)then
 factor1=dsqrt((2*lambda+1)/(4.d0*pi))
+if (factor1== 0.d0) return
 factor2=dsqrt((2*l+1)/(4.d0*pi))
+if (factor2== 0.d0) return
 sum=0.d0
 do i=0,lambda
 do ip=0,l
  cylm=factor1*coef_pm(lambda,i)
+  if (cylm == 0.d0) cycle
  cylmp=factor2*coef_pm(l,ip)
+  if (cylmp == 0.d0) cycle
  sum=sum+cylm*cylmp*bigA(k1,k2,i+ip+k3)
 enddo
 enddo
@ -904,14 +919,18 @@ endif

 if(mu.eq.0.and.m.gt.0)then
 factor1=dsqrt((2*lambda+1)/(4.d0*pi))
+if (factor1== 0.d0) return
 factor2=dsqrt((2*l+1)*fact(l-iabs(m))/(2.d0*pi*fact(l+iabs(m))))
+if (factor2== 0.d0) return
 sum=0.d0
 do i=0,lambda
 sgnp = 1.d0
 do kp=0,m/2
  do ip=0,l-m
   cylm=factor1*coef_pm(lambda,i)
+   if (cylm == 0.d0) cycle
   cylmp=sgnp*factor2*binom_func(m,2*kp)*fact(m+ip)/fact(ip)*coef_pm(l,ip+m)
+   if (cylmp == 0.d0) cycle
   sum=sum+cylm*cylmp*bigA(m-2*kp+k1,2*kp+k2,i+ip+k3)
  enddo
  sgnp = -sgnp
@ -924,13 +943,18 @@ endif
 if(mu.gt.0.and.m.eq.0)then
 sum=0.d0
 factor1=dsqrt((2*lambda+1)*fact(lambda-iabs(mu))/(2.d0*pi*fact(lambda+iabs(mu))))
+if (factor1== 0.d0) return
 factor2=dsqrt((2*l+1)/(4.d0*pi))
+if (factor2== 0.d0) return
 sgn = 1.d0
 do k=0,mu/2
 do i=0,lambda-mu
+  if (coef_pm(lambda,i+mu) == 0.d0) cycle
  do ip=0,l
   cylm=sgn*factor1*binom_func(mu,2*k)*fact(mu+i)/fact(i)*coef_pm(lambda,i+mu)
+   if (cylm == 0.d0) cycle
   cylmp=factor2*coef_pm(l,ip)
+   if (cylmp == 0.d0) cycle
   sum=sum+cylm*cylmp*bigA(mu-2*k +k1,2*k +k2,i+ip +k3)
  enddo
 enddo
@ -944,16 +968,22 @@ if(mu.lt.0.and.m.lt.0)then
 mu=-mu
 m=-m
 factor1=dsqrt((2*lambda+1)*fact(lambda-iabs(mu))/(2.d0*pi*fact(lambda+iabs(mu))))
+if (factor1== 0.d0) return
 factor2=dsqrt((2*l+1)*fact(l-iabs(m))/(2.d0*pi*fact(l+iabs(m))))
+if (factor2== 0.d0) return
 sum=0.d0
 sgn = 1.d0
 do k=0,(mu-1)/2
 do i=0,lambda-mu
+  if (coef_pm(lambda,i+mu) == 0.d0) cycle
  sgnp = 1.d0
  do kp=0,(m-1)/2
   do ip=0,l-m
+    if (coef_pm(l,ip+m) == 0.d0) cycle
    cylm=sgn*factor1*binom_func(mu,2*k+1)*fact(mu+i)/fact(i)*coef_pm(lambda,i+mu)
+    if (cylm == 0.d0) cycle
    cylmp=sgnp*factor2*binom_func(m,2*kp+1)*fact(m+ip)/fact(ip)*coef_pm(l,ip+m)
+    if (cylmp == 0.d0) cycle
    sum=sum+cylm*cylmp*bigA(mu-(2*k+1)+m-(2*kp+1)+k1,(2*k+1)+(2*kp+1)+k2,i+ip+k3)
   enddo
   sgnp = -sgnp
@ -970,14 +1000,18 @@ endif
 if(mu.eq.0.and.m.lt.0)then
 m=-m
 factor1=dsqrt((2*lambda+1)/(4.d0*pi))
+if (factor1 == 0.d0) return
 factor2=dsqrt((2*l+1)*fact(l-iabs(m))/(2.d0*pi*fact(l+iabs(m))))
+if (factor2 == 0.d0) return
 sum=0.d0
 do i=0,lambda
 sgnp = 1.d0
 do kp=0,(m-1)/2
  do ip=0,l-m
   cylm=factor1*coef_pm(lambda,i)
+   if (cylm == 0.d0) cycle
   cylmp=sgnp*factor2*binom_func(m,2*kp+1)*fact(m+ip)/fact(ip)*coef_pm(l,ip+m)
+   if (cylmp == 0.d0) cycle
   sum=sum+cylm*cylmp*bigA(m-(2*kp+1)+k1,2*kp+1+k2,i+ip+k3)
  enddo
  sgnp = -sgnp
@ -992,13 +1026,17 @@ if(mu.lt.0.and.m.eq.0)then
 sum=0.d0
 mu=-mu
 factor1=dsqrt((2*lambda+1)*fact(lambda-iabs(mu))/(2.d0*pi*fact(lambda+iabs(mu))))
+if (factor1== 0.d0) return
 factor2=dsqrt((2*l+1)/(4.d0*pi))
+if (factor2== 0.d0) return
 sgn = 1.d0
 do k=0,(mu-1)/2
 do i=0,lambda-mu
   do ip=0,l
    cylm=sgn*factor1*binom_func(mu,2*k+1)*fact(mu+i)/fact(i)*coef_pm(lambda,i+mu)
+    if (cylm == 0.d0) cycle
    cylmp=factor2*coef_pm(l,ip)
+    if (cylmp == 0.d0) cycle
    sum=sum+cylm*cylmp*bigA(mu-(2*k+1)+k1,2*k+1+k2,i+ip+k3)
   enddo
 enddo
@ -1012,16 +1050,22 @@ endif
 if(mu.gt.0.and.m.lt.0)then
 sum=0.d0
 factor1=dsqrt((2*lambda+1)*fact(lambda-iabs(mu))/(2.d0*pi*fact(lambda+iabs(mu))))
+if (factor1== 0.d0) return
 factor2=dsqrt((2*l+1)*fact(l-iabs(m))/(2.d0*pi*fact(l+iabs(m))))
+if (factor2== 0.d0) return
 m=-m
 sgn=1.d0
 do k=0,mu/2
 do i=0,lambda-mu
+  if (coef_pm(lambda,i+mu) == 0.d0) cycle
  sgnp=1.d0
  do kp=0,(m-1)/2
   do ip=0,l-m
+    if (coef_pm(l,ip+m) == 0.d0) cycle
    cylm =sgn *factor1*binom_func(mu,2*k)*fact(mu+i)/fact(i)*coef_pm(lambda,i+mu)
+    if (cylm == 0.d0) cycle
    cylmp=sgnp*factor2*binom_func(m,2*kp+1)*fact(m+ip)/fact(ip)*coef_pm(l,ip+m)
+    if (cylmp == 0.d0) cycle
    sum=sum+cylm*cylmp*bigA(mu-2*k+m-(2*kp+1)+k1,2*k+2*kp+1+k2,i+ip+k3)
   enddo
   sgnp = -sgnp
@ -1037,16 +1081,22 @@ endif
 if(mu.lt.0.and.m.gt.0)then
 mu=-mu
 factor1=dsqrt((2*lambda+1)*fact(lambda-iabs(mu))/(2.d0*pi*fact(lambda+iabs(mu))))
+if (factor1== 0.d0) return
 factor2=dsqrt((2*l+1)*fact(l-iabs(m))/(2.d0*pi*fact(l+iabs(m))))
+if (factor2== 0.d0) return
 sum=0.d0
 sgn = 1.d0
 do k=0,(mu-1)/2
 do i=0,lambda-mu
+  if (coef_pm(lambda,i+mu) == 0.d0) cycle
  sgnp = 1.d0
  do kp=0,m/2
   do ip=0,l-m
+    if (coef_pm(l,ip+m) == 0.d0) cycle
    cylm=sgn*factor1  *binom_func(mu,2*k+1)*fact(mu+i)/fact(i)*coef_pm(lambda,i+mu)
+    if (cylm == 0.d0) cycle
    cylmp=sgnp*factor2*binom_func(m,2*kp)*fact(m+ip)/fact(ip)*coef_pm(l,ip+m)
+    if (cylmp == 0.d0) cycle
    sum=sum+cylm*cylmp*bigA(mu-(2*k+1)+m-2*kp+k1,2*k+1+2*kp+k2,i+ip+k3)
   enddo
   sgnp = -sgnp
@ -1544,7 +1594,7 @@ end
         r=(i-1)*dr
         x1=delta1*r
         x2=delta2*r
-         sum=sum+dr*r**(n+2)*dexp(-cc*r**2)*bessel_mod(x1,lambda)*bessel_mod(x2,lambdap)
+         sum=sum+dr*r**(n+2)*dexp(-cc*r*r)*bessel_mod(x1,lambda)*bessel_mod(x2,lambdap)
        enddo
        bigR=sum*factor
        end
@ -1569,8 +1619,8 @@ end
        return
       endif
       if(n.eq.0)a=dsinh(x)/x
-       if(n.eq.1)a=(x*dcosh(x)-dsinh(x))/x**2
-       if(n.ge.2)a=bessel_mod_recur(n-2,x)-(2*n-1)/x*bessel_mod_recur(n-1,x)
+       if(n.eq.1)a=(x*dcosh(x)-dsinh(x))/(x*x)
+       if(n.ge.2)a=bessel_mod_recur(n-2,x)-(n+n-1)/x*bessel_mod_recur(n-1,x)
       end

       double precision function bessel_mod_exp(n,x)
@ -1579,8 +1629,8 @@ end
       double precision x,coef,accu,fact,dble_fact
       accu=0.d0
       do k=0,10
-        coef=1.d0/fact(k)/dble_fact(2*(n+k)+1)
-        accu=accu+(x**2/2.d0)**k*coef
+        coef=1.d0/(fact(k)*dble_fact(2*(n+k)+1))
+        accu=accu+(0.5d0*x*x)**k*coef
       enddo
       bessel_mod_exp=x**n*accu
       end
@ -1778,15 +1828,15 @@ end

 double precision function coef_nk(n,k)
  implicit none
-  integer n,k, ISHFT
+  integer n,k

  double precision gam,dble_fact,fact

+  if (k<0) stop 'pseudopot.f90 : coef_nk'
+  if (k>63) stop 'pseudopot.f90 : coef_nk'
  gam=dble_fact(n+n+k+k+1)
-
-!  coef_nk=1.d0/(dble(ISHFT(1,k))*fact(k)*gam)
-  
-  coef_nk=1.d0/(2.d0**k*fact(k)*gam)
+!  coef_nk=1.d0/(2.d0**k*fact(k)*gam)
+  coef_nk=1.d0/(dble(ibset(0_8,k))*fact(k)*gam)
  
  return

@ -1811,11 +1861,11 @@ double precision function int_prod_bessel(l,gam,n,m,a,b,arg)
  double precision :: s_q_0, s_q_k, s_0_0, a_over_b_square
  double precision :: int_prod_bessel_loc
  double precision :: inverses(0:300)
-  double precision :: two_qkmp1, qk
+  double precision :: two_qkmp1, qk, mk, nk

  logical done

-  u=(a+b)/(2.d0*dsqrt(gam))
+  u=(a+b)*0.5d0/dsqrt(gam)
  freal=dexp(-arg)

  if(a.eq.0.d0.and.b.eq.0.d0)then
@ -1859,6 +1909,7 @@ double precision function int_prod_bessel(l,gam,n,m,a,b,arg)
    s_q_0 = s_0_0


+    mk = dble(m)
    ! Loop over q for the convergence of the sequence
    do while (.not.done)  

@ -1870,10 +1921,10 @@ double precision function int_prod_bessel(l,gam,n,m,a,b,arg)
        stop 'pseudopot.f90 : q > 300'
      endif

-      two_qkmp1 = dble(2*(q+m)+1)
      qk = dble(q)
+      two_qkmp1 = 2.d0*(qk+mk)+1.d0
      do k=0,q-1
-        s_q_k = ( two_qkmp1*qk*inverses(k) ) * s_q_k
+        s_q_k = two_qkmp1*qk*inverses(k)*s_q_k
        sum=sum+s_q_k
        two_qkmp1 = two_qkmp1-2.d0
        qk = qk-1.d0 
--- a/src/Utils/transpose.irp.f
+++ b/src/Utils/transpose.irp.f
@ -0,0 +1,78 @@
+!DIR$ attributes forceinline :: transpose
+recursive subroutine transpose(A,LDA,B,LDB,d1,d2)
+ implicit none
+ BEGIN_DOC
+! Transpose input matrix A into output matrix B
+ END_DOC
+ integer, intent(in) :: d1, d2, LDA, LDB
+ real, intent(in) :: A(LDA,d2)
+ real, intent(out) :: B(LDB,d1)
+
+ integer :: i,j,k, mod_align
+ if ( d2 < 32 ) then
+   do j=1,d1
+     !DIR$ LOOP COUNT (16)
+     do i=1,d2
+      B(i,j  ) = A(j  ,i)
+     enddo
+    enddo
+   return
+ else if (d1 > d2) then
+ !DIR$ forceinline 
+   k=d1/2
+ !DIR$ forceinline recursive
+   call transpose(A(1,1),LDA,B(1,1),LDB,k,d2)
+ !DIR$ forceinline recursive
+   call transpose(A(k+1,1),LDA,B(1,k+1),LDB,d1-k,d2)
+   return
+ else
+ !DIR$ forceinline
+   k=d2/2
+ !DIR$ forceinline recursive
+   call transpose(A(1,k+1),LDA,B(k+1,1),LDB,d1,d2-k)
+ !DIR$ forceinline recursive
+   call transpose(A(1,1),LDA,B(1,1),LDB,d1,k)
+   return
+ endif
+
+end
+
+!DIR$ attributes forceinline :: dtranspose
+recursive subroutine dtranspose(A,LDA,B,LDB,d1,d2)
+ implicit none
+ BEGIN_DOC
+! Transpose input matrix A into output matrix B
+ END_DOC
+ integer, intent(in) :: d1, d2, LDA, LDB
+ double precision, intent(in) :: A(LDA,d2)
+ double precision, intent(out) :: B(LDB,d1)
+
+ integer :: i,j,k, mod_align
+ if ( d2 < 32 ) then
+   do j=1,d1
+     !DIR$ LOOP COUNT (16)
+     do i=1,d2
+      B(i,j  ) = A(j  ,i)
+     enddo
+    enddo
+   return
+ else if (d1 > d2) then
+ !DIR$ forceinline 
+   k=d1/2
+ !DIR$ forceinline recursive
+   call dtranspose(A(1,1),LDA,B(1,1),LDB,k,d2)
+ !DIR$ forceinline recursive
+   call dtranspose(A(k+1,1),LDA,B(1,k+1),LDB,d1-k,d2)
+   return
+ else
+ !DIR$ forceinline
+   k=d2/2
+ !DIR$ forceinline recursive
+   call dtranspose(A(1,k+1),LDA,B(k+1,1),LDB,d1,d2-k)
+ !DIR$ forceinline recursive
+   call dtranspose(A(1,1),LDA,B(1,1),LDB,d1,k)
+   return
+ endif
+
+end
+
--- a/src/Utils/util.irp.f
+++ b/src/Utils/util.irp.f
@ -84,10 +84,8 @@ double precision function fact(n)
    memo(i) = memo(i-1)*dble(i)
  enddo
  memomax = min(n,100)
-  fact = memo(memomax)
-  do i=101,n
-    fact = fact*dble(i)
-  enddo
+  double precision :: logfact
+  fact = dexp(logfact(n))
 end function

 double precision function logfact(n)
@ -158,18 +156,41 @@ double precision function dble_fact_even(n) result(fact2)
  ! n!!
  END_DOC
  integer                        :: n,k
-  double precision, save         :: memo(1:100)
-  integer, save                  :: memomax = 2
+  double precision, save         :: memo(0:100)
+  integer, save                  :: memomax = 0
  double precision               :: prod

  ASSERT (iand(n,1) /= 1)

-  prod=1.d0
-  do k=2,n,2
-   prod=prod*dfloat(k)
+!  prod=1.d0
+!  do k=2,n,2
+!   prod=prod*dfloat(k)
+!  enddo
+!  fact2=prod
+!  return
+!
+  if (n <= memomax) then
+    if (n < 2) then
+      fact2 = 1.d0
+    else
+      fact2 = memo(n)
+    endif
+    return
+  endif
+
+  integer                        :: i
+  memo(0)=1.d0
+  memo(1)=1.d0
+  do i=memomax+2,min(n,100),2
+    memo(i) = memo(i-2)* dble(i)
  enddo
-  fact2=prod
-  return
+  memomax = min(n,100)
+  fact2 = memo(memomax)
+  
+  if (n > 100) then
+    double precision :: dble_logfact
+    fact2 = dexp(dble_logfact(n))
+  endif

 end function

--- a/src/ZMQ/utils.irp.f
+++ b/src/ZMQ/utils.irp.f
@ -473,6 +473,11 @@ subroutine end_zmq_push_socket(zmq_socket_push,thread)
  integer                        :: rc
  character*(8), external        :: zmq_port
  
+  rc = f77_zmq_setsockopt(zmq_socket_push,ZMQ_LINGER,300000,4)
+  if (rc /= 0) then
+    stop 'Unable to set ZMQ_LINGER on push socket'
+  endif
+
  rc = f77_zmq_close(zmq_socket_push)
  if (rc /= 0) then
    print *,  'f77_zmq_close(zmq_socket_push)'
@ -794,12 +799,6 @@ subroutine end_zmq_to_qp_run_socket(zmq_to_qp_run_socket)
  character*(8), external        :: zmq_port
  integer                        :: rc

-  rc = f77_zmq_disconnect(zmq_to_qp_run_socket, trim(qp_run_address)//':'//trim(zmq_port(0)))
-!  if (rc /= 0) then
-!    print *,  'f77_zmq_disconnect(zmq_to_qp_run_socket, trim(qp_run_address)//'':''//trim(zmq_port(0)))'
-!    stop 'error'
-!  endif
-
  rc = f77_zmq_setsockopt(zmq_to_qp_run_socket,ZMQ_LINGER,1000,4)
  if (rc /= 0) then
    stop 'Unable to set ZMQ_LINGER on zmq_to_qp_run_socket'
@ -907,3 +906,37 @@ end



+subroutine wait_for_states(state_wait,state,n)
+  use f77_zmq
+  implicit none
+  BEGIN_DOC
+! Wait for the ZMQ state to be ready
+  END_DOC
+  integer, intent(in)            :: n
+  character*(64), intent(in)     :: state_wait(n)
+  character*(64), intent(out)    :: state
+  integer(ZMQ_PTR)               :: zmq_socket_sub
+  integer(ZMQ_PTR), external     :: new_zmq_sub_socket
+  integer                        :: rc, i
+  logical                        :: condition
+
+  zmq_socket_sub       = new_zmq_sub_socket()
+  state = 'Waiting'
+  condition = .True.
+  do while (condition)
+    rc = f77_zmq_recv( zmq_socket_sub, state, 64, 0)
+    if (rc > 0) then
+      state = trim(state(1:rc))
+    else
+      print *,  'Timeout reached. Stopping'
+      state = "Stopped"
+    endif
+    condition = trim(state) /= 'Stopped'
+    do i=1,n
+      condition = condition .and. (trim(state) /= trim(state_wait(i)))
+    enddo
+  end do
+  call end_zmq_sub_socket(zmq_socket_sub)
+end
+
+