From 6492f613a141830cbcf207004d29fbce7fd935ca Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 15 Jul 2016 15:31:16 +0200
Subject: [PATCH 01/15] Removed debug

---
 ocaml/Address.ml                        | 2 +-
 src/Determinants/connected_to_ref.irp.f | 9 +++------
 src/Determinants/filter_connected.irp.f | 2 --
 src/Determinants/slater_rules.irp.f     | 5 -----
 4 files changed, 4 insertions(+), 14 deletions(-)

diff --git a/ocaml/Address.ml b/ocaml/Address.ml
index e107cf0c..47eb3fd6 100644
--- a/ocaml/Address.ml
+++ b/ocaml/Address.ml
@@ -42,7 +42,7 @@ end = struct
     assert (String.is_prefix ~prefix:"inproc://" x);
     x
   let create name =
-    Printf.sprintf "ipc://%s" name
+    Printf.sprintf "inproc://%s" name
   let to_string x = x
 end
 
diff --git a/src/Determinants/connected_to_ref.irp.f b/src/Determinants/connected_to_ref.irp.f
index 7a54bdbc..c0b611be 100644
--- a/src/Determinants/connected_to_ref.irp.f
+++ b/src/Determinants/connected_to_ref.irp.f
@@ -109,8 +109,6 @@ integer function get_index_in_psi_det_sorted_bit(key,Nint)
       continue
     else
       in_wavefunction = .True.
-      !DIR$ IVDEP
-      !DIR$ LOOP COUNT MIN(3)
       do l=2,Nint
         if ( (key(l,1) /= psi_det_sorted_bit(l,1,i)).or.                           &
               (key(l,2) /= psi_det_sorted_bit(l,2,i)) ) then
@@ -175,7 +173,6 @@ logical function is_connected_to(key,keys,Nint,Ndet)
   do i=1,Ndet
     degree_x2 = popcnt(xor( key(1,1), keys(1,1,i))) +              &
         popcnt(xor( key(1,2), keys(1,2,i)))
-    !DEC$ LOOP COUNT MIN(3)
     do l=2,Nint
       degree_x2 = degree_x2 + popcnt(xor( key(l,1), keys(l,1,i))) +&
           popcnt(xor( key(l,2), keys(l,2,i)))
@@ -208,7 +205,6 @@ logical function is_connected_to_by_mono(key,keys,Nint,Ndet)
   do i=1,Ndet
     degree_x2 = popcnt(xor( key(1,1), keys(1,1,i))) +              &
         popcnt(xor( key(1,2), keys(1,2,i)))
-    !DEC$ LOOP COUNT MIN(3)
     do l=2,Nint
       degree_x2 = degree_x2 + popcnt(xor( key(l,1), keys(l,1,i))) +&
           popcnt(xor( key(l,2), keys(l,2,i)))
@@ -302,10 +298,12 @@ integer function connected_to_ref(key,keys,Nint,N_past_in,Ndet)
     do i=N_past-1,1,-1
       degree_x2 = popcnt(xor( key(1,1), keys(1,1,i))) +              &
           popcnt(xor( key(1,2), keys(1,2,i)))
-      !DEC$ LOOP COUNT MIN(3)
       do l=2,Nint
         degree_x2 = degree_x2 + popcnt(xor( key(l,1), keys(l,1,i))) +&
             popcnt(xor( key(l,2), keys(l,2,i)))
+        if (degree_x2 > 4) then
+          exit
+        endif
       enddo
       if (degree_x2 > 4) then
         cycle
@@ -406,7 +404,6 @@ integer function connected_to_ref_by_mono(key,keys,Nint,N_past_in,Ndet)
     do i=N_past-1,1,-1
       degree_x2 = popcnt(xor( key(1,1), keys(1,1,i))) +              &
           popcnt(xor( key(1,2), keys(1,2,i)))
-      !DEC$ LOOP COUNT MIN(3)
       do l=2,Nint
         degree_x2 = degree_x2 + popcnt(xor( key(l,1), keys(l,1,i))) +&
             popcnt(xor( key(l,2), keys(l,2,i)))
diff --git a/src/Determinants/filter_connected.irp.f b/src/Determinants/filter_connected.irp.f
index 46280b31..34d6feb9 100644
--- a/src/Determinants/filter_connected.irp.f
+++ b/src/Determinants/filter_connected.irp.f
@@ -299,7 +299,6 @@ subroutine filter_connected_i_H_psi0(key1,key2,Nint,sze,idx)
   else
     
 
-    integer, save :: icount(4) = (/0,0,0,0/)
     !DIR$ LOOP COUNT (1000)
     outer: do i=1,sze
       degree_x2 = 0
@@ -317,7 +316,6 @@ subroutine filter_connected_i_H_psi0(key1,key2,Nint,sze,idx)
       enddo
       idx(l) = i
       l = l+1
-      icount(3) = icount(3) + 1_8
     enddo outer
     
   endif
diff --git a/src/Determinants/slater_rules.irp.f b/src/Determinants/slater_rules.irp.f
index 967ac9a3..ec7eb76d 100644
--- a/src/Determinants/slater_rules.irp.f
+++ b/src/Determinants/slater_rules.irp.f
@@ -1249,7 +1249,6 @@ subroutine get_excitation_degree_vector(key1,key2,degree,Nint,sze,idx)
   l=1
   if (Nint==1) then
     
-    !DIR$ LOOP COUNT (1000)
     do i=1,sze
       d = popcnt(xor( key1(1,1,i), key2(1,1))) +       &
           popcnt(xor( key1(1,2,i), key2(1,2)))
@@ -1264,7 +1263,6 @@ subroutine get_excitation_degree_vector(key1,key2,degree,Nint,sze,idx)
     
   else if (Nint==2) then
     
-    !DIR$ LOOP COUNT (1000)
     do i=1,sze
       d = popcnt(xor( key1(1,1,i), key2(1,1))) +                     &
           popcnt(xor( key1(1,2,i), key2(1,2))) +                     &
@@ -1281,7 +1279,6 @@ subroutine get_excitation_degree_vector(key1,key2,degree,Nint,sze,idx)
     
   else if (Nint==3) then
     
-    !DIR$ LOOP COUNT (1000)
     do i=1,sze
       d = popcnt(xor( key1(1,1,i), key2(1,1))) +                     &
           popcnt(xor( key1(1,2,i), key2(1,2))) +                     &
@@ -1300,10 +1297,8 @@ subroutine get_excitation_degree_vector(key1,key2,degree,Nint,sze,idx)
     
   else
     
-    !DIR$ LOOP COUNT (1000)
     do i=1,sze
       d = 0
-      !DIR$ LOOP COUNT MIN(4)
       do m=1,Nint
         d = d + popcnt(xor( key1(m,1,i), key2(m,1)))                 &
               + popcnt(xor( key1(m,2,i), key2(m,2)))

From b780a6540abaa08142be8d8912131a50bb41c6ca Mon Sep 17 00:00:00 2001
From: Yann Garniron <yann.garniron@yahoo.fr>
Date: Fri, 14 Oct 2016 12:40:29 +0200
Subject: [PATCH 02/15] bugs in mrcepa0_general and mrcc_utils

---
 plugins/MRCC_Utils/mrcc_utils.irp.f   | 5 +++--
 plugins/mrcepa0/mrcepa0_general.irp.f | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/plugins/MRCC_Utils/mrcc_utils.irp.f b/plugins/MRCC_Utils/mrcc_utils.irp.f
index 79249051..81893781 100644
--- a/plugins/MRCC_Utils/mrcc_utils.irp.f
+++ b/plugins/MRCC_Utils/mrcc_utils.irp.f
@@ -644,7 +644,7 @@ END_PROVIDER
      AtA_ind = 0
      AtA_val = 0d0
      x = 0d0
-     A_val_mwen = 0d0
+     !A_val_mwen = 0d0
      N_col = 0
      col_shortcut = 0
      
@@ -699,7 +699,8 @@ END_PROVIDER
          !$OMP private(at_row, a_col, t, i, r1, r2, wk, A_ind_mwen, A_val_mwen)&
          !$OMP shared(col_shortcut, N_col, AtB, AtA_size, AtA_val, AtA_ind, s)
      allocate(A_val_mwen(nex), A_ind_mwen(nex))
-     A_ind_mwen = 0
+     !A_ind_mwen = 0
+     !A_val_mwen = 0d0
      !$OMP DO schedule(dynamic, 100)
      do at_row = 1, nex
        wk = 0
diff --git a/plugins/mrcepa0/mrcepa0_general.irp.f b/plugins/mrcepa0/mrcepa0_general.irp.f
index 1e7ad68d..63f03360 100644
--- a/plugins/mrcepa0/mrcepa0_general.irp.f
+++ b/plugins/mrcepa0/mrcepa0_general.irp.f
@@ -28,7 +28,7 @@ subroutine run(N_st,energy)
     enddo
     SOFT_TOUCH psi_coef ci_energy_dressed
     call write_double(6,ci_energy_dressed(1),"Final MRCC energy")
-    call ezfio_set_mrcc_cassd_energy(ci_energy_dressed(1))
+    call ezfio_set_mrcepa0_energy(ci_energy_dressed(1))
     call save_wavefunction
     energy(:) = ci_energy_dressed(:)
   else

From 1f4cd4c318847520bf7fd84c7e545c5cadabf870 Mon Sep 17 00:00:00 2001
From: Yann Garniron <yann.garniron@yahoo.fr>
Date: Mon, 17 Oct 2016 14:40:09 +0200
Subject: [PATCH 03/15] optimized calculation of inactive amplitudes

---
 plugins/MRCC_Utils/mrcc_utils.irp.f | 500 +++++++++++++++++-----------
 1 file changed, 301 insertions(+), 199 deletions(-)

diff --git a/plugins/MRCC_Utils/mrcc_utils.irp.f b/plugins/MRCC_Utils/mrcc_utils.irp.f
index 81893781..14885153 100644
--- a/plugins/MRCC_Utils/mrcc_utils.irp.f
+++ b/plugins/MRCC_Utils/mrcc_utils.irp.f
@@ -616,204 +616,301 @@ END_PROVIDER
 
  BEGIN_PROVIDER [ double precision, dIj_unique, (hh_shortcut(hh_shortcut(0)+1)-1, N_states) ]
 &BEGIN_PROVIDER [ double precision, rho_mrcc, (N_det_non_ref, N_states) ]
-   implicit none
-   logical                        :: ok
-   integer                        :: i, j, k, s, II, pp, hh, ind, wk, nex, a_col, at_row
-   integer, external              :: searchDet, unsortedSearchDet
-   integer(bit_kind)              :: myDet(N_int, 2), myMask(N_int, 2)
-   integer                        :: N, INFO, AtA_size, r1, r2
-   double precision , allocatable :: AtB(:), AtA_val(:), A_val(:,:), x(:), x_new(:), A_val_mwen(:)
-   double precision               :: t, norm, cx, res
-   integer, allocatable           :: A_ind(:,:), lref(:), AtA_ind(:), A_ind_mwen(:), col_shortcut(:), N_col(:)
-   double precision               :: phase
-   
-   
-   
-   nex = hh_shortcut(hh_shortcut(0)+1)-1
-   print *, "TI", nex, N_det_non_ref
-   allocate(A_ind(0:N_det_ref+1, nex), A_val(N_det_ref+1, nex))
-   allocate(AtA_ind(N_det_ref * nex), AtA_val(N_det_ref * nex)) !!!!! MAY BE TOO SMALL ? !!!!!!!!
-   allocate(x(nex), AtB(nex))
-   allocate(N_col(nex), col_shortcut(nex))
-   allocate(x_new(nex))
-   
-   do s = 1, N_states
-     
-     A_val = 0d0
-     A_ind = 0
-     AtA_ind = 0
-     AtA_val = 0d0
-     x = 0d0
-     !A_val_mwen = 0d0
-     N_col = 0
-     col_shortcut = 0
-     
-     !$OMP PARALLEL default(none) shared(psi_non_ref, hh_exists, pp_exists, N_int, A_val, A_ind)&
-         !$OMP shared(s, hh_shortcut, psi_ref_coef, N_det_non_ref, psi_non_ref_sorted, psi_non_ref_sorted_idx, psi_ref, N_det_ref)&
-         !$OMP private(lref, pp, II, ok, myMask, myDet, ind, phase, wk)
-      allocate(lref(N_det_non_ref))
-     !$OMP DO schedule(static,10) 
-     do hh = 1, hh_shortcut(0)
-       do pp = hh_shortcut(hh), hh_shortcut(hh+1)-1
-         lref = 0
-         do II = 1, N_det_ref
-           call apply_hole_local(psi_ref(1,1,II), hh_exists(1, hh), myMask, ok, N_int)
-           if(.not. ok) cycle
-           call apply_particle_local(myMask, pp_exists(1, pp), myDet, ok, N_int)
-           if(.not. ok) cycle
-           ind = searchDet(psi_non_ref_sorted(1,1,1), myDet(1,1), N_det_non_ref, N_int)
-           if(ind /= -1) then
-             call get_phase(myDet(1,1), psi_ref(1,1,II), phase, N_int)
-             if (phase  > 0.d0) then
-               lref(psi_non_ref_sorted_idx(ind)) =  II
-             else
-               lref(psi_non_ref_sorted_idx(ind)) = -II
-             endif
-           end if
-         end do
-         wk = 0
-         do i=1, N_det_non_ref
-           if(lref(i) > 0) then
-             wk += 1
-             A_val(wk, pp) = psi_ref_coef(lref(i), s)
-             A_ind(wk, pp) = i
-           else if(lref(i) < 0) then
-             wk += 1
-             A_val(wk, pp) = -psi_ref_coef(-lref(i), s)
-             A_ind(wk, pp) = i
-           end if
-         end do
-         A_ind(0,pp) = wk
-       end do
-     end do
-     !$OMP END DO
-     deallocate(lref)
-     !$OMP END PARALLEL   
- print *,  'Done building A_val, A_ind'
-     
-     AtB = 0d0
-     AtA_size = 0
-     col_shortcut = 0
-     N_col = 0
-     !$OMP PARALLEL default(none) shared(k, psi_non_ref_coef, A_ind, A_val, x, N_det_ref, nex, N_det_non_ref)&
-         !$OMP private(at_row, a_col, t, i, r1, r2, wk, A_ind_mwen, A_val_mwen)&
-         !$OMP shared(col_shortcut, N_col, AtB, AtA_size, AtA_val, AtA_ind, s)
-     allocate(A_val_mwen(nex), A_ind_mwen(nex))
-     !A_ind_mwen = 0
-     !A_val_mwen = 0d0
-     !$OMP DO schedule(dynamic, 100)
-     do at_row = 1, nex
-       wk = 0
-       if(mod(at_row, 10000) == 0) print *, "AtA", at_row, "/", nex
-       do i=1,A_ind(0,at_row)
-         AtB(at_row) = AtB(at_row) + psi_non_ref_coef(A_ind(i, at_row), s) * A_val(i, at_row)
-       end do
+  implicit none
+  logical                        :: ok
+  integer                        :: i, j, k, s, II, pp, ppp, hh, ind, wk, nex, a_col, at_row
+  integer, external              :: searchDet, unsortedSearchDet
+  integer(bit_kind)              :: myDet(N_int, 2), myMask(N_int, 2)
+  integer                        :: N, INFO, AtA_size, r1, r2
+  double precision , allocatable :: AtB(:), AtA_val(:), A_val(:,:), x(:), x_new(:), A_val_mwen(:)
+  double precision               :: t, norm, cx, res
+  integer, allocatable           :: A_ind(:,:), lref(:), AtA_ind(:), A_ind_mwen(:), col_shortcut(:), N_col(:)
+  double precision               :: phase
+  
+  
+  integer, allocatable :: pathTo(:), active_hh_idx(:), active_pp_idx(:)
+  logical, allocatable :: active(:)
+  double precision, allocatable :: rho_mrcc_init(:,:)
+  integer :: nactive
+  
+  nex = hh_shortcut(hh_shortcut(0)+1)-1
+  print *, "TI", nex, N_det_non_ref
+  
+  allocate(pathTo(N_det_non_ref), active(nex))
+  allocate(active_pp_idx(nex), active_hh_idx(nex))
+  allocate(rho_mrcc_init(N_det_non_ref, N_states))
+  
+  pathTo = 0
+  active = .false.
+  nactive = 0
+  
+  
+  do hh = 1, hh_shortcut(0)
+    do pp = hh_shortcut(hh), hh_shortcut(hh+1)-1
+      do II = 1, N_det_ref
+        call apply_hole_local(psi_ref(1,1,II), hh_exists(1, hh), myMask, ok, N_int)
+        if(.not. ok) cycle
+        call apply_particle_local(myMask, pp_exists(1, pp), myDet, ok, N_int)
+        if(.not. ok) cycle
+        ind = searchDet(psi_non_ref_sorted(1,1,1), myDet(1,1), N_det_non_ref, N_int)
+        if(ind == -1) cycle
+        ind = psi_non_ref_sorted_idx(ind)
+        if(pathTo(ind) == 0) then
+          pathTo(ind) = pp
+        else
+          active(pp) = .true.
+          active(pathTo(ind)) = .true.
+        end if
+      end do
+    end do
+  end do
+  
+   do hh = 1, hh_shortcut(0)
+    do pp = hh_shortcut(hh), hh_shortcut(hh+1)-1
+      if(active(pp)) then
+        nactive = nactive + 1
+        active_hh_idx(nactive) = hh
+        active_pp_idx(nactive) = pp
+      end if
+    end do
+   end do
+  
+  print *, nactive, "inact/", size(active)
+  
+  allocate(A_ind(0:N_det_ref+1, nactive), A_val(N_det_ref+1, nactive))
+  allocate(AtA_ind(N_det_ref * nactive), AtA_val(N_det_ref * nactive)) 
+  allocate(x(nex), AtB(nex))
+  allocate(N_col(nactive), col_shortcut(nactive))
+  allocate(x_new(nex))
+        
 
-       do a_col = 1, nex
-         t = 0d0
-         r1 = 1
-         r2 = 1
-         do while ((A_ind(r1, at_row) /= 0).and.(A_ind(r2, a_col) /= 0))
-           if(A_ind(r1, at_row) > A_ind(r2, a_col)) then
-             r2 = r2+1
-           else if(A_ind(r1, at_row) < A_ind(r2, a_col)) then
-             r1 = r1+1
-           else
-             t = t - A_val(r1, at_row) * A_val(r2, a_col)
-             r1 = r1+1
-             r2 = r2+1
-           end if
-         end do
-         
-         if(a_col == at_row) then
-           t = t + 1.d0
-         end if
-         if(t /= 0.d0) then
-           wk += 1
-           A_ind_mwen(wk) = a_col
-           A_val_mwen(wk) = t
-         end if
-       end do
-       
-       if(wk /= 0) then
-         !$OMP CRITICAL
-         col_shortcut(at_row) = AtA_size+1
-         N_col(at_row) = wk
-         if (AtA_size+wk > size(AtA_ind,1)) then
-            print *, AtA_size+wk , size(AtA_ind,1)
-            stop 'too small'
-         endif
-         do i=1,wk
-          AtA_ind(AtA_size+i) = A_ind_mwen(i)
-          AtA_val(AtA_size+i) = A_val_mwen(i)
-         enddo
-         AtA_size += wk
-         !$OMP END CRITICAL
-       end if
-     end do
-     !$OMP END DO NOWAIT
-     deallocate (A_ind_mwen, A_val_mwen)
-     !$OMP END PARALLEL
-     
-     if(AtA_size > size(AtA_val)) stop "SIZA"
-     print *, "ATA SIZE", ata_size
-     do i=1,nex
-       x(i) = AtB(i)
-     enddo
-     
-     double precision :: factor, resold
-     factor = 1.d0
-     resold = huge(1.d0)
-     do k=0,100000
-       !$OMP PARALLEL default(shared) private(cx, i, j, a_col)
-       
-       !$OMP DO
-       do i=1,N_det_non_ref
-         rho_mrcc(i,s) = 0.d0
-       enddo
-       !$OMP END DO
-       
-       !$OMP DO
-       do a_col = 1, nex
-         cx = 0d0
-         do i=col_shortcut(a_col), col_shortcut(a_col) + N_col(a_col) - 1
-           cx = cx + x(AtA_ind(i)) * AtA_val(i)
-         end do
-         x_new(a_col) = AtB(a_col) + cx * factor
-       end do
-       !$OMP END DO
-       
-       !$OMP END PARALLEL
-       
-       res = 0.d0
-       do a_col=1,nex
-         res = res + (X_new(a_col) - X(a_col))*(X_new(a_col) - X(a_col))
-       end do
-       
-       if (res < resold) then
-          do a_col=1,nex
-            do j=1,N_det_non_ref
-              i = A_ind(j,a_col)
-              if (i==0) exit
-              rho_mrcc(i,s) = rho_mrcc(i,s) + A_val(j,a_col) * X_new(a_col)
-            enddo
-            X(a_col) = X_new(a_col)
-          end do
-!          factor = 1.d0
-       else
-          factor = -factor * 0.5d0
-       endif
-       resold = res
+  
+  do s = 1, N_states
+    
+    A_val = 0d0
+    A_ind = 0
+    AtA_ind = 0
+    AtB = 0d0
+    AtA_val = 0d0
+    x = 0d0
+    N_col = 0
+    col_shortcut = 0
+    
+    !$OMP PARALLEL default(none) shared(psi_non_ref, hh_exists, pp_exists, N_int, A_val, A_ind)&
+        !$OMP shared(s, hh_shortcut, psi_ref_coef, N_det_non_ref, psi_non_ref_sorted, psi_non_ref_sorted_idx, psi_ref, N_det_ref)&
+        !$OMP shared(active, active_hh_idx, active_pp_idx, nactive)&
+        !$OMP private(lref, pp, II, ok, myMask, myDet, ind, phase, wk, ppp, hh)
+     allocate(lref(N_det_non_ref))
+    !$OMP DO schedule(static,10) 
+      do ppp=1,nactive
+        pp = active_pp_idx(ppp)
+        hh = active_hh_idx(ppp)
+        lref = 0
+        do II = 1, N_det_ref
+          call apply_hole_local(psi_ref(1,1,II), hh_exists(1, hh), myMask, ok, N_int)
+          if(.not. ok) cycle
+          call apply_particle_local(myMask, pp_exists(1, pp), myDet, ok, N_int)
+          if(.not. ok) cycle
+          ind = searchDet(psi_non_ref_sorted(1,1,1), myDet(1,1), N_det_non_ref, N_int)
+          if(ind /= -1) then
+            call get_phase(myDet(1,1), psi_ref(1,1,II), phase, N_int)
+            if (phase  > 0.d0) then
+              lref(psi_non_ref_sorted_idx(ind)) =  II
+            else
+              lref(psi_non_ref_sorted_idx(ind)) = -II
+            endif
+          end if
+        end do
+        wk = 0
+        do i=1, N_det_non_ref
+          if(lref(i) > 0) then
+            wk += 1
+            A_val(wk, ppp) = psi_ref_coef(lref(i), s)
+            A_ind(wk, ppp) = i
+          else if(lref(i) < 0) then
+            wk += 1
+            A_val(wk, ppp) = -psi_ref_coef(-lref(i), s)
+            A_ind(wk, ppp) = i
+          end if
+        end do
+        A_ind(0,ppp) = wk
+      end do
+    !$OMP END DO
+    deallocate(lref)
+    !$OMP END PARALLEL   
 
-       if(mod(k, 100) == 0) then
-         print *, "res", k, res
-       end if
-       
-       if(res < 1d-8) exit
-     end do
-     ! rho_mrcc now contains A.X
 
-     norm = 0.d0
+    print *,  'Done building A_val, A_ind'
+    
+    AtA_size = 0
+    col_shortcut = 0
+    N_col = 0
+    integer :: a_coll, at_roww
+    
+    
+    !$OMP PARALLEL default(none) shared(k, psi_non_ref_coef, A_ind, A_val, x, N_det_ref, nex, N_det_non_ref)&
+        !$OMP private(at_row, a_col, t, i, j, r1, r2, wk, A_ind_mwen, A_val_mwen, a_coll, at_roww)&
+        !$OMP shared(col_shortcut, N_col, AtB, AtA_size, AtA_val, AtA_ind, s, nactive, active_pp_idx)
+    allocate(A_val_mwen(nex), A_ind_mwen(nex))
+     
+    !$OMP DO schedule(dynamic, 100)
+    do at_roww = 1, nactive ! nex
+      at_row = active_pp_idx(at_roww)
+      wk = 0
+      if(mod(at_roww, 100) == 0) print *, "AtA", at_row, "/", nex
+      do i=1,A_ind(0,at_roww)
+        j = active_pp_idx(i)
+        AtB(at_row) = AtB(at_row) + psi_non_ref_coef(A_ind(i, at_roww), s) * A_val(i, at_roww)
+      end do
+
+      do a_coll = 1, nactive 
+        a_col = active_pp_idx(a_coll)
+        t = 0d0
+        r1 = 1
+        r2 = 1
+        do while ((A_ind(r1, at_roww) /= 0).and.(A_ind(r2, a_coll) /= 0))
+          if(A_ind(r1, at_roww) > A_ind(r2, a_coll)) then
+            r2 = r2+1
+          else if(A_ind(r1, at_roww) < A_ind(r2, a_coll)) then
+            r1 = r1+1
+          else
+            t = t - A_val(r1, at_roww) * A_val(r2, a_coll)
+            r1 = r1+1
+            r2 = r2+1
+          end if
+        end do
+        
+        if(a_col == at_row) then
+          t = t + 1.d0
+        end if
+        if(t /= 0.d0) then
+          wk += 1
+          A_ind_mwen(wk) = a_col
+          A_val_mwen(wk) = t
+        end if
+      end do
+      
+      if(wk /= 0) then
+        !$OMP CRITICAL
+        col_shortcut(at_roww) = AtA_size+1
+        N_col(at_roww) = wk
+        if (AtA_size+wk > size(AtA_ind,1)) then
+           print *, AtA_size+wk , size(AtA_ind,1)
+           stop 'too small'
+        endif
+        do i=1,wk
+         AtA_ind(AtA_size+i) = A_ind_mwen(i)
+         AtA_val(AtA_size+i) = A_val_mwen(i)
+        enddo
+        AtA_size += wk
+        !$OMP END CRITICAL
+      end if
+    end do
+    !$OMP END DO NOWAIT
+    deallocate (A_ind_mwen, A_val_mwen)
+    !$OMP END PARALLEL
+    
+    print *, "ATA SIZE", ata_size
+    x = 0d0
+    
+    
+    do a_coll = 1, nactive
+      a_col = active_pp_idx(a_coll)
+      X(a_col) = AtB(a_col)
+    end do
+    
+    rho_mrcc_init = 0d0
+    
+    allocate(lref(N_det_ref))
+    !$OMP PARALLEL DO default(shared) schedule(static, 1) &
+        !$OMP private(lref, hh, pp, II, myMask, myDet, ok, ind, phase)
+    do hh = 1, hh_shortcut(0)
+     do pp = hh_shortcut(hh), hh_shortcut(hh+1)-1
+      if(active(pp)) cycle
+      lref = 0
+      do II=1,N_det_ref
+        call apply_hole_local(psi_ref(1,1,II), hh_exists(1, hh), myMask, ok, N_int)
+        if(.not. ok) cycle
+        call apply_particle_local(myMask, pp_exists(1, pp), myDet, ok, N_int)
+        if(.not. ok) cycle
+        ind = searchDet(psi_non_ref_sorted(1,1,1), myDet(1,1), N_det_non_ref, N_int)
+        if(ind == -1) cycle
+        ind = psi_non_ref_sorted_idx(ind)
+        call get_phase(myDet(1,1), psi_ref(1,1,II), phase, N_int)
+        X(pp) += psi_ref_coef(II,s)**2
+        AtB(pp) += psi_non_ref_coef(ind, s) * psi_ref_coef(II, s) * phase
+        lref(II) = ind
+        if(phase < 0d0) lref(II) = -ind
+      end do
+      X(pp) =  AtB(pp) / X(pp)
+      do II=1,N_det_ref
+        if(lref(II) > 0) then
+          rho_mrcc_init(lref(II),s) = psi_ref_coef(II,s) * X(pp)
+        else if(lref(II) < 0) then
+          rho_mrcc_init(-lref(II),s) = -psi_ref_coef(II,s) * X(pp)
+        end if
+      end do
+     end do
+    end do
+    !$OMP END PARALLEL DO
+    
+    x_new = x
+     
+    double precision :: factor, resold
+    factor = 1.d0
+    resold = huge(1.d0)
+    do k=0,100000
+      !$OMP PARALLEL default(shared) private(cx, i, j, a_col, a_coll)
+      
+      !$OMP DO
+      do i=1,N_det_non_ref
+        rho_mrcc(i,s) = rho_mrcc_init(i,s) ! 0d0
+      enddo
+      !$OMP END DO
+      
+      !$OMP DO
+      do a_coll = 1, nactive !: nex
+        a_col = active_pp_idx(a_coll)
+        cx = 0d0
+        do i=col_shortcut(a_coll), col_shortcut(a_coll) + N_col(a_coll) - 1
+          cx = cx + x(AtA_ind(i)) * AtA_val(i)
+        end do
+        x_new(a_col) = AtB(a_col) + cx * factor
+      end do
+      !$OMP END DO
+      
+      !$OMP END PARALLEL
+      
+      res = 0.d0
+     
+      
+      if (res < resold) then
+         do a_coll=1,nactive ! nex
+          a_col = active_pp_idx(a_coll)
+          do j=1,N_det_non_ref
+            i = A_ind(j,a_coll)
+            if (i==0) exit
+            rho_mrcc(i,s) = rho_mrcc(i,s) + A_val(j,a_coll) * X_new(a_col)
+          enddo
+          res = res + (X_new(a_col) - X(a_col))*(X_new(a_col) - X(a_col))
+          X(a_col) = X_new(a_col)
+         end do
+         factor = 1.d0
+      else
+        factor = -factor * 0.5d0
+      endif
+      resold = res
+
+      if(mod(k, 5) == 0) then
+        print *, "res ", k, res
+      end if
+      
+      if(res < 1d-12) exit
+    end do
+    
+    
+    
+    norm = 0.d0
      do i=1,N_det_non_ref
        norm = norm + rho_mrcc(i,s)*rho_mrcc(i,s)
      enddo
@@ -826,7 +923,7 @@ END_PROVIDER
      
      print *, k, "res : ", res, "norm : ", sqrt(norm)
      
-     dIj_unique(:size(X), s) = X(:)
+     !dIj_unique(:size(X), s) = X(:)
      
      norm = 0.d0
      double precision :: f
@@ -871,12 +968,14 @@ END_PROVIDER
      enddo
      ! rho_mrcc now contains the product of the scaling factors and the
      ! normalization constant
-
-   end do
-   
+    
+     dIj_unique(:size(X), s) = X(:)
+  end do
 END_PROVIDER
 
 
+
+
 BEGIN_PROVIDER [ double precision, dij, (N_det_ref, N_det_non_ref, N_states) ]
   integer :: s,i,j
   double precision, external :: get_dij_index
@@ -1142,3 +1241,6 @@ subroutine apply_particle_local(det, exc, res, ok, Nint)
   ok = .true.
 end subroutine
 
+
+
+

From 62e8d1a0ac21f1981f08f85ed41cd038c3dcd2df Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Tue, 18 Oct 2016 13:38:45 +0200
Subject: [PATCH 04/15] The qp_run queue now pops from the back

---
 plugins/Full_CI_ZMQ/fci_zmq.irp.f          | 2 +-
 src/Davidson/u0Hu0.irp.f                   | 2 +-
 src/Determinants/H_apply_zmq.template.f    | 2 +-
 src/Integrals_Bielec/ao_bi_integrals.irp.f | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/plugins/Full_CI_ZMQ/fci_zmq.irp.f b/plugins/Full_CI_ZMQ/fci_zmq.irp.f
index c81b1266..a5dd8dcf 100644
--- a/plugins/Full_CI_ZMQ/fci_zmq.irp.f
+++ b/plugins/Full_CI_ZMQ/fci_zmq.irp.f
@@ -134,7 +134,7 @@ subroutine ZMQ_selection(N_in, pt2)
 
   step = int(5000000.d0 / dble(N_int * N_states * elec_num * elec_num * mo_tot_num * mo_tot_num ))
   step = max(1,step)
-  do i= N_det_generators, 1, -step
+  do i= 1,N_det_generators, step
     i_generator_start = max(i-step+1,1)
     i_generator_max = i
     write(task,*) i_generator_start, i_generator_max, 1, N
diff --git a/src/Davidson/u0Hu0.irp.f b/src/Davidson/u0Hu0.irp.f
index a1a72100..3787370a 100644
--- a/src/Davidson/u0Hu0.irp.f
+++ b/src/Davidson/u0Hu0.irp.f
@@ -252,7 +252,7 @@ subroutine H_S2_u_0_nstates(v_0,s_0,u_0,H_jj,S2_jj,n,keys_tmp,Nint,N_st,sze_8)
   ave_workload = ave_workload/dble(shortcut(0,1))
 
 
-  do sh=shortcut(0,1),1,-1
+  do sh=1,shortcut(0,1),1
     workload = shortcut(0,1)+dble(shortcut(sh+1,1) - shortcut(sh,1))**2
     do i=sh, shortcut(0,2), shortcut(0,1)
       do j=i, min(i, shortcut(0,2))
diff --git a/src/Determinants/H_apply_zmq.template.f b/src/Determinants/H_apply_zmq.template.f
index d59f2994..59544b79 100644
--- a/src/Determinants/H_apply_zmq.template.f
+++ b/src/Determinants/H_apply_zmq.template.f
@@ -35,7 +35,7 @@ subroutine $subroutine($params_main)
 
   call zmq_put_psi(zmq_to_qp_run_socket,1,energy,size(energy))
 
-  do i_generator=N_det_generators,1,-1
+  do i_generator=1,N_det_generators
     $skip
     write(task,*) i_generator
     call add_task_to_taskserver(zmq_to_qp_run_socket,task)
diff --git a/src/Integrals_Bielec/ao_bi_integrals.irp.f b/src/Integrals_Bielec/ao_bi_integrals.irp.f
index 2ebb402e..9eadbf35 100644
--- a/src/Integrals_Bielec/ao_bi_integrals.irp.f
+++ b/src/Integrals_Bielec/ao_bi_integrals.irp.f
@@ -368,7 +368,7 @@ BEGIN_PROVIDER [ logical, ao_bielec_integrals_in_map ]
 
   call new_parallel_job(zmq_to_qp_run_socket,'ao_integrals')
 
-  do l=1,ao_num
+  do l=ao_num,1,-1
     write(task,*) "triangle ", l
     call add_task_to_taskserver(zmq_to_qp_run_socket,task)
   enddo

From 973065319ce7bdd46c3fab86f4edfb102341e107 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Tue, 18 Oct 2016 19:29:50 +0200
Subject: [PATCH 05/15] Introduced QR in Davidson

---
 ocaml/Progress_bar.ml                  |   4 +-
 src/Davidson/diagonalization_hs2.irp.f | 143 ++++++-------------------
 src/Utils/LinearAlgebra.irp.f          |  44 +++++++-
 3 files changed, 71 insertions(+), 120 deletions(-)

diff --git a/ocaml/Progress_bar.ml b/ocaml/Progress_bar.ml
index 2ca8bd00..b8e97a59 100644
--- a/ocaml/Progress_bar.ml
+++ b/ocaml/Progress_bar.ml
@@ -14,13 +14,13 @@ type t =
 
 let init ?(bar_length=20) ?(start_value=0.) ?(end_value=1.) ~title =
   { title ; start_value ; end_value ; bar_length ; cur_value=start_value ;
-    init_time= Time.now () ; dirty = true ; next = Time.now () } 
+    init_time= Time.now () ; dirty = false ; next = Time.now () } 
 
 let update ~cur_value bar =
   { bar with cur_value ; dirty=true }
 
 let increment_end bar =
-  { bar with end_value=(bar.end_value +. 1.) ; dirty=true }
+  { bar with end_value=(bar.end_value +. 1.) ; dirty=false }
 
 let increment_cur bar =
   { bar with cur_value=(bar.cur_value +. 1.) ; dirty=true }
diff --git a/src/Davidson/diagonalization_hs2.irp.f b/src/Davidson/diagonalization_hs2.irp.f
index c8ac3733..2db6b4cd 100644
--- a/src/Davidson/diagonalization_hs2.irp.f
+++ b/src/Davidson/diagonalization_hs2.irp.f
@@ -95,7 +95,6 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
   
   double precision               :: u_dot_v, u_dot_u
   
-  integer, allocatable           :: kl_pairs(:,:)
   integer                        :: k_pairs, kl
   
   integer                        :: iter2
@@ -107,12 +106,14 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
   character*(16384)              :: write_buffer
   double precision               :: to_print(3,N_st)
   double precision               :: cpu, wall
-  integer                        :: shift, shift2
+  integer                        :: shift, shift2, itermax
   include 'constants.include.F'
   
   !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: U, W, R, S, y, h, lambda
-  if (N_st_diag > sze) then
-     stop 'error in Davidson : N_st_diag > sze'
+  if (N_st_diag*3 > sze) then
+     print *,  'error in Davidson :'
+     print *,  'Increase n_det_max_jacobi to ', N_st_diag*3
+     stop -1
   endif
 
   PROVIDE nuclear_repulsion
@@ -147,26 +148,26 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
   integer, external :: align_double
   sze_8 = align_double(sze)
 
+  itermax = min(davidson_sze_max, sze/N_st_diag)
   allocate(                                                          &
-      kl_pairs(2,N_st_diag*(N_st_diag+1)/2),                         &
-      W(sze_8,N_st_diag*davidson_sze_max),                           &
-      U(sze_8,N_st_diag*davidson_sze_max),                           &
+      W(sze_8,N_st_diag*itermax),                           &
+      U(sze_8,N_st_diag*itermax),                           &
       R(sze_8,N_st_diag),                                            &
-      S(sze_8,N_st_diag*davidson_sze_max),                           &
-      h(N_st_diag*davidson_sze_max,N_st_diag*davidson_sze_max),      &
-      y(N_st_diag*davidson_sze_max,N_st_diag*davidson_sze_max),      &
-      s_(N_st_diag*davidson_sze_max,N_st_diag*davidson_sze_max),     &
-      s_tmp(N_st_diag*davidson_sze_max,N_st_diag*davidson_sze_max),  &
+      S(sze_8,N_st_diag*itermax),                           &
+      h(N_st_diag*itermax,N_st_diag*itermax),      &
+      y(N_st_diag*itermax,N_st_diag*itermax),      &
+      s_(N_st_diag*itermax,N_st_diag*itermax),     &
+      s_tmp(N_st_diag*itermax,N_st_diag*itermax),  &
       residual_norm(N_st_diag),                                      &
-      c(N_st_diag*davidson_sze_max),                                 &
-      s2(N_st_diag*davidson_sze_max),                                &
-      lambda(N_st_diag*davidson_sze_max))
+      c(N_st_diag*itermax),                                 &
+      s2(N_st_diag*itermax),                                &
+      lambda(N_st_diag*itermax))
   
   h  = 0.d0
   s_ = 0.d0
   s_tmp = 0.d0
-  c = 0.d0
   U = 0.d0
+  W = 0.d0
   S = 0.d0
   R = 0.d0
   y = 0.d0
@@ -183,10 +184,6 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
   
   converged = .False.
   
-  do k=1,N_st
-    call normalize(u_in(1,k),sze)
-  enddo
-
   do k=N_st+1,N_st_diag
       do i=1,sze
         double precision               :: r1, r2
@@ -194,14 +191,6 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
         call random_number(r2)
         u_in(i,k) = dsqrt(-2.d0*dlog(r1))*dcos(dtwo_pi*r2)
       enddo
-    
-    ! Gram-Schmidt
-    ! ------------
-    call dgemv('T',sze,k-1,1.d0,u_in,size(u_in,1),                   &
-        u_in(1,k),1,0.d0,c,1)
-    call dgemv('N',sze,k-1,-1.d0,u_in,size(u_in,1),                  &
-        c,1,1.d0,u_in(1,k),1)
-    call normalize(u_in(1,k),sze)
   enddo
 
 
@@ -213,11 +202,12 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
       enddo
     enddo
 
-    do iter=1,davidson_sze_max-1
+    do iter=1,itermax-1
       
       shift  = N_st_diag*(iter-1)
       shift2 = N_st_diag*iter
 
+      call ortho_qr(U,size(U,1),sze,shift2)
 
       ! Compute |W_k> = \sum_i |i><i|H|u_k>
       ! -----------------------------------------
@@ -229,20 +219,6 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
       ! Compute h_kl = <u_k | W_l> = <u_k| H |u_l>
       ! -------------------------------------------
 
-
-!      do l=1,N_st_diag
-!        do k=1,N_st_diag
-!          do iter2=1,iter-1
-!            h(k,iter2,l,iter) = u_dot_v(U(1,k,iter2),W(1,l,iter),sze)
-!            h(k,iter,l,iter2) = h(k,iter2,l,iter)
-!          enddo
-!        enddo
-!        do k=1,l
-!          h(k,iter,l,iter) = u_dot_v(U(1,k,iter),W(1,l,iter),sze)
-!          h(l,iter,k,iter) = h(k,iter,l,iter)
-!        enddo
-!      enddo
-
       call dgemm('T','N', shift2, N_st_diag, sze,                    &
           1.d0, U, size(U,1), W(1,shift+1), size(W,1),               &
           0.d0, h(1,shift+1), size(h,1))
@@ -295,22 +271,6 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
       ! Express eigenvectors of h in the determinant basis
       ! --------------------------------------------------
       
-!      do k=1,N_st_diag
-!        do i=1,sze
-!          U(i,shift2+k) = 0.d0
-!          W(i,shift2+k) = 0.d0
-!          S(i,shift2+k) = 0.d0
-!        enddo
-!        do l=1,N_st_diag*iter
-!          do i=1,sze
-!            U(i,shift2+k) = U(i,shift2+k) + U(i,l)*y(l,k)
-!            W(i,shift2+k) = W(i,shift2+k) + W(i,l)*y(l,k)
-!            S(i,shift2+k) = S(i,shift2+k) + S(i,l)*y(l,k)
-!          enddo
-!        enddo
-!      enddo
-!
-!
       call dgemm('N','N', sze, N_st_diag, shift2,                    &
           1.d0, U, size(U,1), y, size(y,1), 0.d0, U(1,shift2+1), size(U,1))
       call dgemm('N','N', sze, N_st_diag, shift2,                    &
@@ -321,13 +281,6 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
       ! Compute residual vector
       ! -----------------------
       
-!      do k=1,N_st_diag
-!        print *,  s2(k)
-!        s2(k) = u_dot_v(U(1,shift2+k),  S(1,shift2+k), sze) + S_z2_Sz
-!        print *,  s2(k)
-!        print *,  ''
-!        pause
-!      enddo
       do k=1,N_st_diag
         do i=1,sze
           R(i,k) = (lambda(k) * U(i,shift2+k) - W(i,shift2+k) )      &
@@ -338,14 +291,17 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
           to_print(1,k) = lambda(k) + nuclear_repulsion
           to_print(2,k) = s2(k)
           to_print(3,k) = residual_norm(k)
-          if (residual_norm(k) > 1.e9) then
-            stop 'Davidson failed'
-          endif
         endif
       enddo
       
-      write(iunit,'(X,I3,X,100(X,F16.10,X,F11.6,X,E11.3))')  iter, to_print(:,1:N_st)
+      write(iunit,'(X,I3,X,100(X,F16.10,X,F11.6,X,E11.3,A20))')  iter, to_print(:,1:N_st), ''
       call davidson_converged(lambda,residual_norm,wall,iter,cpu,N_st,converged)
+      do k=1,N_st
+        if (residual_norm(k) > 1.e9) then
+        print *,  ''
+          stop 'Davidson failed'
+        endif
+      enddo
       if (converged) then
         exit
       endif
@@ -359,42 +315,10 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
         enddo
       enddo
       
-      ! Gram-Schmidt
-      ! ------------
-      
-      do k=1,N_st_diag
-
-!        do l=1,N_st_diag*iter
-!            c(1) = u_dot_v(U(1,shift2+k),U(1,l),sze)
-!            do i=1,sze
-!              U(i,k,iter+1) = U(i,shift2+k) - c(1) * U(i,l)
-!            enddo
-!        enddo
-!
-        call dgemv('T',sze,N_st_diag*iter,1.d0,U,size(U,1),          &
-            U(1,shift2+k),1,0.d0,c,1)
-        call dgemv('N',sze,N_st_diag*iter,-1.d0,U,size(U,1),         &
-            c,1,1.d0,U(1,shift2+k),1)
-!
-!        do l=1,k-1
-!          c(1) = u_dot_v(U(1,shift2+k),U(1,shift2+l),sze)
-!          do i=1,sze
-!            U(i,k,iter+1) = U(i,shift2+k) - c(1) * U(i,shift2+l)
-!          enddo
-!        enddo
-!
-        call dgemv('T',sze,k-1,1.d0,U(1,shift2+1),size(U,1),         &
-            U(1,shift2+k),1,0.d0,c,1)
-        call dgemv('N',sze,k-1,-1.d0,U(1,shift2+1),size(U,1),        &
-            c,1,1.d0,U(1,shift2+k),1)
-
-        call normalize( U(1,shift2+k), sze )
-      enddo
-
     enddo
 
     if (.not.converged) then
-      iter = davidson_sze_max-1
+      iter = itermax-1
     endif
     
     ! Re-contract to u_in
@@ -404,20 +328,14 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
       energies(k) = lambda(k)
     enddo
 
-!    do k=1,N_st_diag
-!      do i=1,sze
-!        do l=1,iter*N_st_diag
-!            u_in(i,k) += U(i,l)*y(l,k)
-!          enddo
-!        enddo
-!      enddo
-!    enddo
-
     call dgemm('N','N', sze, N_st_diag, N_st_diag*iter, 1.d0,      &
         U, size(U,1), y, size(y,1), 0.d0, u_in, size(u_in,1))
 
   enddo
 
+  do k=1,N_st_diag
+    S2_jj(k) = s2(k)
+  enddo
   write_buffer = '===== '
   do i=1,N_st
     write_buffer = trim(write_buffer)//' ================ =========== ==========='
@@ -427,7 +345,6 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
   call write_time(iunit)
 
   deallocate (                                                       &
-      kl_pairs,                                                      &
       W, residual_norm,                                              &
       U,                                                             &
       R, c, S,                                                       &
diff --git a/src/Utils/LinearAlgebra.irp.f b/src/Utils/LinearAlgebra.irp.f
index 00f61101..e44e8c2c 100644
--- a/src/Utils/LinearAlgebra.irp.f
+++ b/src/Utils/LinearAlgebra.irp.f
@@ -11,9 +11,9 @@ subroutine svd(A,LDA,U,LDU,D,Vt,LDVt,m,n)
   
   integer, intent(in)             :: LDA, LDU, LDVt, m, n
   double precision, intent(in)    :: A(LDA,n)
-  double precision, intent(out)   :: U(LDU,n)
+  double precision, intent(out)   :: U(LDU,m)
   double precision,intent(out)    :: Vt(LDVt,n)
-  double precision,intent(out)    :: D(n)
+  double precision,intent(out)    :: D(min(m,n))
   double precision,allocatable    :: work(:)
   integer                         :: info, lwork, i, j, k
   
@@ -24,13 +24,13 @@ subroutine svd(A,LDA,U,LDU,D,Vt,LDVt,m,n)
   ! Find optimal size for temp arrays
   allocate(work(1))
   lwork = -1
-  call dgesvd('A','A', n, n, A_tmp, LDA,                             &
+  call dgesvd('A','A', m, n, A_tmp, LDA,                             &
       D, U, LDU, Vt, LDVt, work, lwork, info)
   lwork = work(1)
   deallocate(work)
 
   allocate(work(lwork))
-  call dgesvd('A','A', n, n, A_tmp, LDA,                             &
+  call dgesvd('A','A', m, n, A_tmp, LDA,                             &
       D, U, LDU, Vt, LDVt, work, lwork, info)
   deallocate(work,A_tmp)
 
@@ -125,6 +125,40 @@ subroutine ortho_canonical(overlap,LDA,N,C,LDC,m)
 end
 
 
+subroutine ortho_qr(A,LDA,m,n)
+  implicit none
+  BEGIN_DOC
+  ! Orthogonalization using Q.R factorization
+  !
+  ! A : matrix to orthogonalize
+  !
+  ! LDA : leftmost dimension of A
+  !
+  ! n : Number of rows of A
+  !
+  ! m : Number of columns of A
+  !
+  END_DOC
+  integer, intent(in)            :: m,n, LDA
+  double precision, intent(inout) :: A(LDA,n)
+
+  integer                        :: lwork, info
+  integer, allocatable           :: jpvt(:)
+  double precision, allocatable  :: tau(:), work(:)
+
+  allocate (jpvt(n), tau(n), work(1))
+  LWORK=-1
+!  call dgeqp3(m, n, A, LDA, jpvt, tau, WORK, LWORK, INFO)
+  call  dgeqrf( m, n, A, LDA, TAU, WORK, LWORK, INFO )
+  LWORK=WORK(1)
+  deallocate(WORK)
+  allocate(WORK(LWORK))
+!  call dgeqp3(m, n, A, LDA, jpvt, tau, WORK, LWORK, INFO)
+  call  dgeqrf( m, n, A, LDA, TAU, WORK, LWORK, INFO )
+  call dorgqr(m, n, n, A, LDA, tau, WORK, LWORK, INFO)
+  deallocate(WORK,jpvt,tau)
+end
+
 subroutine ortho_lowdin(overlap,LDA,N,C,LDC,m)
   implicit none
   BEGIN_DOC
@@ -161,7 +195,7 @@ subroutine ortho_lowdin(overlap,LDA,N,C,LDC,m)
 
   allocate(U(ldc,n),Vt(lda,n),S_half(lda,n),D(n))
 
-  call svd(overlap,lda,U,ldc,D,Vt,lda,m,n)
+  call svd(overlap,lda,U,ldc,D,Vt,lda,n,n)
 
   !$OMP PARALLEL DEFAULT(NONE) &
   !$OMP SHARED(S_half,U,D,Vt,n,C,m) &

From 1fe1750f90bdd0176e2cd8f34499f39c650931de Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Tue, 18 Oct 2016 21:36:45 +0200
Subject: [PATCH 06/15] Removed residual in Davdison

---
 src/Davidson/diagonalization_hs2.irp.f | 46 ++++++++++++++------------
 1 file changed, 24 insertions(+), 22 deletions(-)

diff --git a/src/Davidson/diagonalization_hs2.irp.f b/src/Davidson/diagonalization_hs2.irp.f
index 2db6b4cd..abffcf81 100644
--- a/src/Davidson/diagonalization_hs2.irp.f
+++ b/src/Davidson/diagonalization_hs2.irp.f
@@ -98,7 +98,7 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
   integer                        :: k_pairs, kl
   
   integer                        :: iter2
-  double precision, allocatable  :: W(:,:),  U(:,:), R(:,:), S(:,:)
+  double precision, allocatable  :: W(:,:),  U(:,:), S(:,:)
   double precision, allocatable  :: y(:,:), h(:,:), lambda(:), s2(:)
   double precision, allocatable  :: c(:), s_(:,:), s_tmp(:,:)
   double precision               :: diag_h_mat_elem
@@ -109,7 +109,7 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
   integer                        :: shift, shift2, itermax
   include 'constants.include.F'
   
-  !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: U, W, R, S, y, h, lambda
+  !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: U, W, S, y, h, lambda
   if (N_st_diag*3 > sze) then
      print *,  'error in Davidson :'
      print *,  'Increase n_det_max_jacobi to ', N_st_diag*3
@@ -152,7 +152,6 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
   allocate(                                                          &
       W(sze_8,N_st_diag*itermax),                           &
       U(sze_8,N_st_diag*itermax),                           &
-      R(sze_8,N_st_diag),                                            &
       S(sze_8,N_st_diag*itermax),                           &
       h(N_st_diag*itermax,N_st_diag*itermax),      &
       y(N_st_diag*itermax,N_st_diag*itermax),      &
@@ -169,7 +168,6 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
   U = 0.d0
   W = 0.d0
   S = 0.d0
-  R = 0.d0
   y = 0.d0
 
 
@@ -184,12 +182,24 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
   
   converged = .False.
   
-  do k=N_st+1,N_st_diag
+  double precision               :: r1, r2
+  do k=N_st+1,N_st_diag-2,2
       do i=1,sze
-        double precision               :: r1, r2
         call random_number(r1)
         call random_number(r2)
-        u_in(i,k) = dsqrt(-2.d0*dlog(r1))*dcos(dtwo_pi*r2)
+        r1 = dsqrt(-2.d0*dlog(r1))
+        r2 = dtwo_pi*r2
+        u_in(i,k) = r1*dcos(r2)
+        u_in(i,k+1) = r1*dsin(r2)
+      enddo
+  enddo
+  do k=N_st_diag-1,N_st_diag
+      do i=1,sze
+        call random_number(r1)
+        call random_number(r2)
+        r1 = dsqrt(-2.d0*dlog(r1))
+        r2 = dtwo_pi*r2
+        u_in(i,k) = r1*dcos(r2)
       enddo
   enddo
 
@@ -278,16 +288,17 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
       call dgemm('N','N', sze, N_st_diag, shift2,                    &
           1.d0, S, size(S,1), y, size(y,1), 0.d0, S(1,shift2+1), size(S,1))
 
-      ! Compute residual vector
-      ! -----------------------
+      ! Compute residual vector and davidson step
+      ! -----------------------------------------
       
       do k=1,N_st_diag
         do i=1,sze
-          R(i,k) = (lambda(k) * U(i,shift2+k) - W(i,shift2+k) )      &
-              * (1.d0 + s2(k) * U(i,shift2+k) - S(i,shift2+k) - S_z2_Sz)
+          U(i,shift2+k) = (lambda(k) * U(i,shift2+k) - W(i,shift2+k) )      &
+              * (1.d0 + s2(k) * U(i,shift2+k) - S(i,shift2+k) - S_z2_Sz) &
+              /max(H_jj(i) - lambda (k),1.d-2)
         enddo
         if (k <= N_st) then
-          residual_norm(k) = u_dot_u(R(1,k),sze)
+          residual_norm(k) = u_dot_u(U(1,shift2+k),sze)
           to_print(1,k) = lambda(k) + nuclear_repulsion
           to_print(2,k) = s2(k)
           to_print(3,k) = residual_norm(k)
@@ -306,15 +317,6 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
         exit
       endif
       
-      ! Davidson step
-      ! -------------
-      
-      do k=1,N_st_diag
-        do i=1,sze
-          U(i,shift2+k) =  - R(i,k)/max(H_jj(i) - lambda(k),1.d-2) 
-        enddo
-      enddo
-      
     enddo
 
     if (.not.converged) then
@@ -347,7 +349,7 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
   deallocate (                                                       &
       W, residual_norm,                                              &
       U,                                                             &
-      R, c, S,                                                       &
+      c, S,                                                          &
       h,                                                             &
       y, s_, s_tmp,                                                  &
       lambda                                                         &

From 4119577ae8335d49548f875219a1b7a4263e81ba Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Tue, 18 Oct 2016 22:20:46 +0200
Subject: [PATCH 07/15] Minor changes

---
 src/Davidson/diagonalization_hs2.irp.f | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/Davidson/diagonalization_hs2.irp.f b/src/Davidson/diagonalization_hs2.irp.f
index abffcf81..a7bc2b95 100644
--- a/src/Davidson/diagonalization_hs2.irp.f
+++ b/src/Davidson/diagonalization_hs2.irp.f
@@ -294,8 +294,8 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
       do k=1,N_st_diag
         do i=1,sze
           U(i,shift2+k) = (lambda(k) * U(i,shift2+k) - W(i,shift2+k) )      &
-              * (1.d0 + s2(k) * U(i,shift2+k) - S(i,shift2+k) - S_z2_Sz) &
-              /max(H_jj(i) - lambda (k),1.d-2)
+              * (1.d0 + s2(k) * U(i,shift2+k) - S(i,shift2+k) - S_z2_Sz &
+             )/max(H_jj(i) - lambda (k),1.d-2)
         enddo
         if (k <= N_st) then
           residual_norm(k) = u_dot_u(U(1,shift2+k),sze)
@@ -305,10 +305,10 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s
         endif
       enddo
       
-      write(iunit,'(X,I3,X,100(X,F16.10,X,F11.6,X,E11.3,A20))')  iter, to_print(:,1:N_st), ''
+      write(iunit,'(X,I3,X,100(X,F16.10,X,F11.6,X,E11.3,A30))')  iter, to_print(:,1:N_st), ''
       call davidson_converged(lambda,residual_norm,wall,iter,cpu,N_st,converged)
       do k=1,N_st
-        if (residual_norm(k) > 1.e9) then
+        if (residual_norm(k) > 1.e4) then
         print *,  ''
           stop 'Davidson failed'
         endif

From 2f1c7c5ce90f1b107992b5b7c0e461cf8a3d1a63 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Tue, 18 Oct 2016 23:07:03 +0200
Subject: [PATCH 08/15] Small changes in MRCC

---
 plugins/MRCC_Utils/mrcc_dress.irp.f |   2 +-
 plugins/MRCC_Utils/mrcc_utils.irp.f | 323 +++++++++++++++++++---------
 plugins/mrcepa0/EZFIO.cfg           |   2 +-
 plugins/mrcepa0/dressing.irp.f      |  37 ++--
 4 files changed, 241 insertions(+), 123 deletions(-)

diff --git a/plugins/MRCC_Utils/mrcc_dress.irp.f b/plugins/MRCC_Utils/mrcc_dress.irp.f
index e6d0fb81..5c2f5efc 100644
--- a/plugins/MRCC_Utils/mrcc_dress.irp.f
+++ b/plugins/MRCC_Utils/mrcc_dress.irp.f
@@ -271,7 +271,7 @@ subroutine mrcc_dress(delta_ij_, delta_ii_, Nstates, Ndet_non_ref, Ndet_ref,i_ge
           !delta_ii_(i_state,i_I)  = 0.d0
           do l_sd=1,idx_alpha(0)
             k_sd = idx_alpha(l_sd)
-            delta_ij_(i_state,k_sd,i_I) = delta_ij_(i_state,k_sd,i_I) + dIa_hla(i_state,k_sd)
+            delta_ij_(i_state,k_sd,i_I) = delta_ij_(i_state,k_sd,i_I) + 0.5d0 * dIa_hla(i_state,k_sd)
           enddo
         endif
       enddo
diff --git a/plugins/MRCC_Utils/mrcc_utils.irp.f b/plugins/MRCC_Utils/mrcc_utils.irp.f
index 14885153..f1c4b4a3 100644
--- a/plugins/MRCC_Utils/mrcc_utils.irp.f
+++ b/plugins/MRCC_Utils/mrcc_utils.irp.f
@@ -685,7 +685,7 @@ END_PROVIDER
         
 
   
-  do s = 1, N_states
+  do s=1, N_states
     
     A_val = 0d0
     A_ind = 0
@@ -698,61 +698,61 @@ END_PROVIDER
     
     !$OMP PARALLEL default(none) shared(psi_non_ref, hh_exists, pp_exists, N_int, A_val, A_ind)&
         !$OMP shared(s, hh_shortcut, psi_ref_coef, N_det_non_ref, psi_non_ref_sorted, psi_non_ref_sorted_idx, psi_ref, N_det_ref)&
-        !$OMP shared(active, active_hh_idx, active_pp_idx, nactive)&
+        !$OMP shared(active, active_hh_idx, active_pp_idx, nactive)  &
         !$OMP private(lref, pp, II, ok, myMask, myDet, ind, phase, wk, ppp, hh)
-     allocate(lref(N_det_non_ref))
-    !$OMP DO schedule(static,10) 
-      do ppp=1,nactive
-        pp = active_pp_idx(ppp)
-        hh = active_hh_idx(ppp)
-        lref = 0
-        do II = 1, N_det_ref
-          call apply_hole_local(psi_ref(1,1,II), hh_exists(1, hh), myMask, ok, N_int)
-          if(.not. ok) cycle
-          call apply_particle_local(myMask, pp_exists(1, pp), myDet, ok, N_int)
-          if(.not. ok) cycle
-          ind = searchDet(psi_non_ref_sorted(1,1,1), myDet(1,1), N_det_non_ref, N_int)
-          if(ind /= -1) then
-            call get_phase(myDet(1,1), psi_ref(1,1,II), phase, N_int)
-            if (phase  > 0.d0) then
-              lref(psi_non_ref_sorted_idx(ind)) =  II
-            else
-              lref(psi_non_ref_sorted_idx(ind)) = -II
-            endif
-          end if
-        end do
-        wk = 0
-        do i=1, N_det_non_ref
-          if(lref(i) > 0) then
-            wk += 1
-            A_val(wk, ppp) = psi_ref_coef(lref(i), s)
-            A_ind(wk, ppp) = i
-          else if(lref(i) < 0) then
-            wk += 1
-            A_val(wk, ppp) = -psi_ref_coef(-lref(i), s)
-            A_ind(wk, ppp) = i
-          end if
-        end do
-        A_ind(0,ppp) = wk
+    allocate(lref(N_det_non_ref))
+    !$OMP DO schedule(static,10)
+    do ppp=1,nactive
+      pp = active_pp_idx(ppp)
+      hh = active_hh_idx(ppp)
+      lref = 0
+      do II = 1, N_det_ref
+        call apply_hole_local(psi_ref(1,1,II), hh_exists(1, hh), myMask, ok, N_int)
+        if(.not. ok) cycle
+        call apply_particle_local(myMask, pp_exists(1, pp), myDet, ok, N_int)
+        if(.not. ok) cycle
+        ind = searchDet(psi_non_ref_sorted(1,1,1), myDet(1,1), N_det_non_ref, N_int)
+        if(ind /= -1) then
+          call get_phase(myDet(1,1), psi_ref(1,1,II), phase, N_int)
+          if (phase  > 0.d0) then
+            lref(psi_non_ref_sorted_idx(ind)) =  II
+          else
+            lref(psi_non_ref_sorted_idx(ind)) = -II
+          endif
+        end if
       end do
+      wk = 0
+      do i=1, N_det_non_ref
+        if(lref(i) > 0) then
+          wk += 1
+          A_val(wk, ppp) = psi_ref_coef(lref(i), s)
+          A_ind(wk, ppp) = i
+        else if(lref(i) < 0) then
+          wk += 1
+          A_val(wk, ppp) = -psi_ref_coef(-lref(i), s)
+          A_ind(wk, ppp) = i
+        end if
+      end do
+      A_ind(0,ppp) = wk
+    end do
     !$OMP END DO
     deallocate(lref)
-    !$OMP END PARALLEL   
-
-
+    !$OMP END PARALLEL
+    
+    
     print *,  'Done building A_val, A_ind'
     
     AtA_size = 0
     col_shortcut = 0
     N_col = 0
-    integer :: a_coll, at_roww
+    integer                        :: a_coll, at_roww
     
     
     !$OMP PARALLEL default(none) shared(k, psi_non_ref_coef, A_ind, A_val, x, N_det_ref, nex, N_det_non_ref)&
         !$OMP private(at_row, a_col, t, i, j, r1, r2, wk, A_ind_mwen, A_val_mwen, a_coll, at_roww)&
         !$OMP shared(col_shortcut, N_col, AtB, AtA_size, AtA_val, AtA_ind, s, nactive, active_pp_idx)
     allocate(A_val_mwen(nex), A_ind_mwen(nex))
-     
+    
     !$OMP DO schedule(dynamic, 100)
     do at_roww = 1, nactive ! nex
       at_row = active_pp_idx(at_roww)
@@ -762,8 +762,8 @@ END_PROVIDER
         j = active_pp_idx(i)
         AtB(at_row) = AtB(at_row) + psi_non_ref_coef(A_ind(i, at_roww), s) * A_val(i, at_roww)
       end do
-
-      do a_coll = 1, nactive 
+      
+      do a_coll = 1, nactive
         a_col = active_pp_idx(a_coll)
         t = 0d0
         r1 = 1
@@ -795,12 +795,12 @@ END_PROVIDER
         col_shortcut(at_roww) = AtA_size+1
         N_col(at_roww) = wk
         if (AtA_size+wk > size(AtA_ind,1)) then
-           print *, AtA_size+wk , size(AtA_ind,1)
-           stop 'too small'
+          print *, AtA_size+wk , size(AtA_ind,1)
+          stop 'too small'
         endif
         do i=1,wk
-         AtA_ind(AtA_size+i) = A_ind_mwen(i)
-         AtA_val(AtA_size+i) = A_val_mwen(i)
+          AtA_ind(AtA_size+i) = A_ind_mwen(i)
+          AtA_val(AtA_size+i) = A_val_mwen(i)
         enddo
         AtA_size += wk
         !$OMP END CRITICAL
@@ -822,41 +822,41 @@ END_PROVIDER
     rho_mrcc_init = 0d0
     
     allocate(lref(N_det_ref))
-    !$OMP PARALLEL DO default(shared) schedule(static, 1) &
+    !$OMP PARALLEL DO default(shared) schedule(static, 1)            &
         !$OMP private(lref, hh, pp, II, myMask, myDet, ok, ind, phase)
     do hh = 1, hh_shortcut(0)
-     do pp = hh_shortcut(hh), hh_shortcut(hh+1)-1
-      if(active(pp)) cycle
-      lref = 0
-      do II=1,N_det_ref
-        call apply_hole_local(psi_ref(1,1,II), hh_exists(1, hh), myMask, ok, N_int)
-        if(.not. ok) cycle
-        call apply_particle_local(myMask, pp_exists(1, pp), myDet, ok, N_int)
-        if(.not. ok) cycle
-        ind = searchDet(psi_non_ref_sorted(1,1,1), myDet(1,1), N_det_non_ref, N_int)
-        if(ind == -1) cycle
-        ind = psi_non_ref_sorted_idx(ind)
-        call get_phase(myDet(1,1), psi_ref(1,1,II), phase, N_int)
-        X(pp) += psi_ref_coef(II,s)**2
-        AtB(pp) += psi_non_ref_coef(ind, s) * psi_ref_coef(II, s) * phase
-        lref(II) = ind
-        if(phase < 0d0) lref(II) = -ind
+      do pp = hh_shortcut(hh), hh_shortcut(hh+1)-1
+        if(active(pp)) cycle
+        lref = 0
+        do II=1,N_det_ref
+          call apply_hole_local(psi_ref(1,1,II), hh_exists(1, hh), myMask, ok, N_int)
+          if(.not. ok) cycle
+          call apply_particle_local(myMask, pp_exists(1, pp), myDet, ok, N_int)
+          if(.not. ok) cycle
+          ind = searchDet(psi_non_ref_sorted(1,1,1), myDet(1,1), N_det_non_ref, N_int)
+          if(ind == -1) cycle
+          ind = psi_non_ref_sorted_idx(ind)
+          call get_phase(myDet(1,1), psi_ref(1,1,II), phase, N_int)
+          X(pp) += psi_ref_coef(II,s)**2
+          AtB(pp) += psi_non_ref_coef(ind, s) * psi_ref_coef(II, s) * phase
+          lref(II) = ind
+          if(phase < 0d0) lref(II) = -ind
+        end do
+        X(pp) =  AtB(pp) / X(pp)
+        do II=1,N_det_ref
+          if(lref(II) > 0) then
+            rho_mrcc_init(lref(II),s) = psi_ref_coef(II,s) * X(pp)
+          else if(lref(II) < 0) then
+            rho_mrcc_init(-lref(II),s) = -psi_ref_coef(II,s) * X(pp)
+          end if
+        end do
       end do
-      X(pp) =  AtB(pp) / X(pp)
-      do II=1,N_det_ref
-        if(lref(II) > 0) then
-          rho_mrcc_init(lref(II),s) = psi_ref_coef(II,s) * X(pp)
-        else if(lref(II) < 0) then
-          rho_mrcc_init(-lref(II),s) = -psi_ref_coef(II,s) * X(pp)
-        end if
-      end do
-     end do
     end do
     !$OMP END PARALLEL DO
     
     x_new = x
-     
-    double precision :: factor, resold
+    
+    double precision               :: factor, resold
     factor = 1.d0
     resold = huge(1.d0)
     do k=0,100000
@@ -882,10 +882,10 @@ END_PROVIDER
       !$OMP END PARALLEL
       
       res = 0.d0
-     
+      
       
       if (res < resold) then
-         do a_coll=1,nactive ! nex
+        do a_coll=1,nactive ! nex
           a_col = active_pp_idx(a_coll)
           do j=1,N_det_non_ref
             i = A_ind(j,a_coll)
@@ -894,60 +894,172 @@ END_PROVIDER
           enddo
           res = res + (X_new(a_col) - X(a_col))*(X_new(a_col) - X(a_col))
           X(a_col) = X_new(a_col)
-         end do
-         factor = 1.d0
+        end do
+        factor = 1.d0
       else
         factor = -factor * 0.5d0
       endif
       resold = res
-
-      if(mod(k, 5) == 0) then
+      
+      if(mod(k, 100) == 0) then
         print *, "res ", k, res
       end if
       
-      if(res < 1d-12) exit
+      if(res < 1d-9) exit
     end do
     
     
     
     norm = 0.d0
-     do i=1,N_det_non_ref
-       norm = norm + rho_mrcc(i,s)*rho_mrcc(i,s)
-     enddo
-     ! Norm now contains the norm of A.X
-
-     do i=1,N_det_ref
-       norm = norm + psi_ref_coef(i,s)*psi_ref_coef(i,s)
-     enddo
-     ! Norm now contains the norm of Psi + A.X
-     
-     print *, k, "res : ", res, "norm : ", sqrt(norm)
-     
-     !dIj_unique(:size(X), s) = X(:)
+    do i=1,N_det_non_ref
+      norm = norm + rho_mrcc(i,s)*rho_mrcc(i,s)
+    enddo
+    ! Norm now contains the norm of A.X
+    
+    do i=1,N_det_ref
+      norm = norm + psi_ref_coef(i,s)*psi_ref_coef(i,s)
+    enddo
+    ! Norm now contains the norm of Psi + A.X
+    
+    print *, k, "res : ", res, "norm : ", sqrt(norm)
+        
+!---------------
+! double precision               :: e_0, overlap
+! double precision, allocatable  :: u_0(:)
+! integer(bit_kind), allocatable :: keys_tmp(:,:,:)
+! allocate (u_0(N_det), keys_tmp(N_int,2,N_det) )
+! k=0
+! overlap = 0.d0
+! do i=1,N_det_ref
+!   k = k+1
+!   u_0(k) = psi_ref_coef(i,1)
+!   keys_tmp(:,:,k) = psi_ref(:,:,i)
+!   overlap += u_0(k)*psi_ref_coef(i,1)
+! enddo
+! norm = 0.d0
+! do i=1,N_det_non_ref
+!   k = k+1
+!   u_0(k) = psi_non_ref_coef(i,1)
+!   keys_tmp(:,:,k) = psi_non_ref(:,:,i)
+!   overlap += u_0(k)*psi_non_ref_coef(i,1)
+! enddo
+! 
+! call u_0_H_u_0(e_0,u_0,N_det,keys_tmp,N_int,1,N_det)
+! print *,  'Energy of |Psi_CASSD> : ', e_0 + nuclear_repulsion, overlap
+!
+! k=0
+! overlap = 0.d0
+! do i=1,N_det_ref
+!   k = k+1
+!   u_0(k) = psi_ref_coef(i,1)
+!   keys_tmp(:,:,k) = psi_ref(:,:,i)
+!   overlap += u_0(k)*psi_ref_coef(i,1)
+! enddo
+! norm = 0.d0
+! do i=1,N_det_non_ref
+!   k = k+1
+!   ! f is such that f.\tilde{c_i} = c_i
+!   f = psi_non_ref_coef(i,1) / rho_mrcc(i,1)
+!   
+!   ! Avoid numerical instabilities
+!   f = min(f,2.d0)
+!   f = max(f,-2.d0)
+!
+!   f = 1.d0
+!
+!   u_0(k) = rho_mrcc(i,1)*f
+!   keys_tmp(:,:,k) = psi_non_ref(:,:,i)
+!   norm += u_0(k)**2
+!   overlap += u_0(k)*psi_non_ref_coef(i,1)
+! enddo
+! 
+! call u_0_H_u_0(e_0,u_0,N_det,keys_tmp,N_int,1,N_det)
+! print *,  'Energy of |(1+T)Psi_0> : ', e_0 + nuclear_repulsion, overlap
+!
+! f = 1.d0/norm
+! norm = 1.d0
+! do i=1,N_det_ref
+!  norm = norm - psi_ref_coef(i,s)*psi_ref_coef(i,s)
+! enddo
+! f = dsqrt(f*norm)
+! overlap = norm
+! do i=1,N_det_non_ref
+!   u_0(k) = rho_mrcc(i,1)*f
+!   overlap += u_0(k)*psi_non_ref_coef(i,1)
+! enddo
+!
+! call u_0_H_u_0(e_0,u_0,N_det,keys_tmp,N_int,1,N_det)
+! print *,  'Energy of |(1+T)Psi_0> (normalized) : ', e_0 + nuclear_repulsion,  overlap
+!
+! k=0
+! overlap = 0.d0
+! do i=1,N_det_ref
+!   k = k+1
+!   u_0(k) = psi_ref_coef(i,1)
+!   keys_tmp(:,:,k) = psi_ref(:,:,i)
+!   overlap += u_0(k)*psi_ref_coef(i,1)
+! enddo
+! norm = 0.d0
+! do i=1,N_det_non_ref
+!   k = k+1
+!   ! f is such that f.\tilde{c_i} = c_i
+!   f = psi_non_ref_coef(i,1) / rho_mrcc(i,1)
+!   
+!   ! Avoid numerical instabilities
+!   f = min(f,2.d0)
+!   f = max(f,-2.d0)
+!
+!   u_0(k) = rho_mrcc(i,1)*f
+!   keys_tmp(:,:,k) = psi_non_ref(:,:,i)
+!   norm += u_0(k)**2
+!   overlap += u_0(k)*psi_non_ref_coef(i,1)
+! enddo
+! 
+! call u_0_H_u_0(e_0,u_0,N_det,keys_tmp,N_int,1,N_det)
+! print *,  'Energy of |(1+T)Psi_0> (mu_i): ', e_0 + nuclear_repulsion, overlap
+!
+! f = 1.d0/norm
+! norm = 1.d0
+! do i=1,N_det_ref
+!  norm = norm - psi_ref_coef(i,s)*psi_ref_coef(i,s)
+! enddo
+! overlap = norm
+! f = dsqrt(f*norm)
+! do i=1,N_det_non_ref
+!   u_0(k) = rho_mrcc(i,1)*f
+!   overlap += u_0(k)*psi_non_ref_coef(i,1)
+! enddo
+!
+! call u_0_H_u_0(e_0,u_0,N_det,keys_tmp,N_int,1,N_det)
+! print *,  'Energy of |(1+T)Psi_0> (normalized mu_i) : ', e_0 + nuclear_repulsion, overlap
+!
+! deallocate(u_0, keys_tmp)
+!
+!---------------
      
      norm = 0.d0
-     double precision :: f
+     double precision               :: f
      do i=1,N_det_non_ref
        if (rho_mrcc(i,s) == 0.d0) then
          rho_mrcc(i,s) = 1.d-32
        endif
-
+       
        ! f is such that f.\tilde{c_i} = c_i
        f = psi_non_ref_coef(i,s) / rho_mrcc(i,s)
-
+       
        ! Avoid numerical instabilities
        f = min(f,2.d0)
        f = max(f,-2.d0)
-
+       
        norm = norm + f*f *rho_mrcc(i,s)*rho_mrcc(i,s)
        rho_mrcc(i,s) = f
      enddo
      ! norm now contains the norm of |T.Psi_0>
      ! rho_mrcc now contains the f factors
-
+     
      f = 1.d0/norm
      ! f now contains 1/ <T.Psi_0|T.Psi_0>
-
+     
      norm = 1.d0
      do i=1,N_det_ref
        norm = norm - psi_ref_coef(i,s)*psi_ref_coef(i,s)
@@ -955,22 +1067,23 @@ END_PROVIDER
      ! norm now contains <Psi_SD|Psi_SD>
      f = dsqrt(f*norm)
      ! f normalises T.Psi_0 such that (1+T)|Psi> is normalized
-
+     
      norm = norm*f
      print *,  'norm of |T Psi_0> = ', dsqrt(norm)
-
+     
      do i=1,N_det_ref
        norm = norm + psi_ref_coef(i,s)*psi_ref_coef(i,s)
      enddo
-
+     
      do i=1,N_det_non_ref
        rho_mrcc(i,s) = rho_mrcc(i,s) * f
      enddo
      ! rho_mrcc now contains the product of the scaling factors and the
      ! normalization constant
     
-     dIj_unique(:size(X), s) = X(:)
+    dIj_unique(:size(X), s) = X(:)
   end do
+
 END_PROVIDER
 
 
diff --git a/plugins/mrcepa0/EZFIO.cfg b/plugins/mrcepa0/EZFIO.cfg
index d792390d..61f3392f 100644
--- a/plugins/mrcepa0/EZFIO.cfg
+++ b/plugins/mrcepa0/EZFIO.cfg
@@ -23,7 +23,7 @@ interface: ezfio
 type: Threshold
 doc: Threshold on the convergence of the dressed CI energy
 interface: ezfio,provider,ocaml
-default: 1.e-4
+default: 5.e-5
 
 [n_it_max_dressed_ci]
 type: Strictly_positive_int
diff --git a/plugins/mrcepa0/dressing.irp.f b/plugins/mrcepa0/dressing.irp.f
index 8df7e91a..4f355f2b 100644
--- a/plugins/mrcepa0/dressing.irp.f
+++ b/plugins/mrcepa0/dressing.irp.f
@@ -299,7 +299,7 @@ subroutine mrcc_part_dress(delta_ij_, delta_ii_,i_generator,n_selected,det_buffe
           delta_ii_(i_state,i_I)  = 0.d0
           do l_sd=1,idx_alpha(0)
             k_sd = idx_alpha(l_sd)
-            delta_ij_(i_state,k_sd,i_I) = delta_ij_(i_state,k_sd,i_I) + dIa_hla(i_state,k_sd)
+            delta_ij_(i_state,k_sd,i_I) = delta_ij_(i_state,k_sd,i_I) + 0.5d0*dIa_hla(i_state,k_sd)
           enddo
         endif
       enddo
@@ -554,7 +554,7 @@ END_PROVIDER
         do k=1,N_det_non_ref
 
           call i_h_j(psi_ref(1,1,j), psi_non_ref(1,1,k),N_int,Hjk)
-          call i_h_j(psi_non_ref(1,1,k),psi_ref(1,1,i), N_int,Hki)
+!          call i_h_j(psi_non_ref(1,1,k),psi_ref(1,1,i), N_int,Hki)
           
           delta_cas(i,j,i_state) += Hjk * dij(i, k, i_state) ! * Hki * lambda_mrcc(i_state, k)
           !print *, Hjk * get_dij(psi_ref(1,1,i), psi_non_ref(1,1,k), N_int), Hki * get_dij(psi_ref(1,1,j), psi_non_ref(1,1,k), N_int)
@@ -647,7 +647,7 @@ end function
   integer                         :: p1,p2,h1,h2,s1,s2, p1_,p2_,h1_,h2_,s1_,s2_, sortRefIdx(N_det_ref)
   logical                         :: ok
   double precision                :: phase_iI, phase_Ik, phase_Jl, phase_IJ, phase_al, diI, hIi, hJi, delta_JI, dkI(1), HkI, ci_inv(1), dia_hla(1)
-  double precision                :: contrib, HIIi, HJk, wall
+  double precision                :: contrib, contrib2,  HIIi, HJk, wall
   integer, dimension(0:2,2,2)     :: exc_iI, exc_Ik, exc_IJ
   integer(bit_kind)               :: det_tmp(N_int, 2), made_hole(N_int,2), made_particle(N_int,2), myActive(N_int,2)
   integer(bit_kind),allocatable   :: sortRef(:,:,:)
@@ -677,7 +677,7 @@ end function
     delta_mrcepa0_ij(:,:,:) = 0d0
 
     !$OMP PARALLEL DO default(none) schedule(dynamic) shared(delta_mrcepa0_ij, delta_mrcepa0_ii)       &
-    !$OMP private(m,i,II,J,k,degree,myActive,made_hole,made_particle,hjk,contrib)       &
+    !$OMP private(m,i,II,J,k,degree,myActive,made_hole,made_particle,hjk,contrib,contrib2)       &
     !$OMP shared(active_sorb, psi_non_ref, psi_non_ref_coef, psi_ref, psi_ref_coef, cepa0_shortcut, det_cepa0_active)     &
     !$OMP shared(N_det_ref, N_det_non_ref,N_int,det_cepa0_idx,lambda_mrcc,det_ref_active, delta_cas) &
     !$OMP shared(notf,i_state, sortRef, sortRefIdx, dij)
@@ -720,16 +720,18 @@ end function
           !$OMP ATOMIC
           notf = notf+1
 
-          call i_h_j(psi_non_ref(1,1,det_cepa0_idx(k)),psi_ref(1,1,J),N_int,HJk)
-          !contrib = delta_cas(II, J, i_state) * HJk * lambda_mrcc(i_state, det_cepa0_idx(k))
+!          call i_h_j(psi_non_ref(1,1,det_cepa0_idx(k)),psi_ref(1,1,J),N_int,HJk)
           contrib = delta_cas(II, J, i_state) * dij(J, det_cepa0_idx(k), i_state)
-          !$OMP ATOMIC
-          delta_mrcepa0_ij(J, det_cepa0_idx(i), i_state) += contrib
           
           if(dabs(psi_ref_coef(J,i_state)).ge.5.d-5) then
+            contrib2 = contrib / psi_ref_coef(J, i_state) * psi_non_ref_coef(det_cepa0_idx(i),i_state)
             !$OMP ATOMIC
-            delta_mrcepa0_ii(J,i_state) -= contrib / psi_ref_coef(J, i_state) * psi_non_ref_coef(det_cepa0_idx(i),i_state)
+            delta_mrcepa0_ii(J,i_state) -= contrib2 
+          else
+            contrib = contrib * 0.5d0
           end if
+          !$OMP ATOMIC
+          delta_mrcepa0_ij(J, det_cepa0_idx(i), i_state) += contrib
 
         end do kloop
       end do
@@ -753,7 +755,7 @@ END_PROVIDER
   integer                         :: p1,p2,h1,h2,s1,s2, p1_,p2_,h1_,h2_,s1_,s2_
   logical                         :: ok
   double precision                :: phase_Ji, phase_Ik, phase_Ii
-  double precision                :: contrib, delta_IJk, HJk, HIk, HIl
+  double precision                :: contrib, contrib2, delta_IJk, HJk, HIk, HIl
   integer, dimension(0:2,2,2)     :: exc_Ik, exc_Ji, exc_Ii
   integer(bit_kind)               :: det_tmp(N_int, 2), det_tmp2(N_int, 2)
   integer, allocatable            :: idx_sorted_bit(:)
@@ -778,7 +780,7 @@ END_PROVIDER
     !$OMP PARALLEL DO default(none) schedule(dynamic,10) shared(delta_sub_ij, delta_sub_ii)       &
     !$OMP private(i, J, k, degree, degree2, l, deg, ni)       &
     !$OMP private(p1,p2,h1,h2,s1,s2, p1_,p2_,h1_,h2_,s1_,s2_)     &
-    !$OMP private(ok, phase_Ji, phase_Ik, phase_Ii, contrib, delta_IJk, HJk, HIk, HIl, exc_Ik, exc_Ji, exc_Ii) &
+    !$OMP private(ok, phase_Ji, phase_Ik, phase_Ii, contrib2, contrib, delta_IJk, HJk, HIk, HIl, exc_Ik, exc_Ji, exc_Ii) &
     !$OMP private(det_tmp, det_tmp2, II, blok)    &
     !$OMP shared(idx_sorted_bit, N_det_non_ref, N_det_ref, N_int, psi_non_ref, psi_non_ref_coef, psi_ref, psi_ref_coef)   &
     !$OMP shared(i_state,lambda_mrcc, hf_bitmask, active_sorb)
@@ -827,13 +829,16 @@ END_PROVIDER
             delta_IJk = HJk * HIk * lambda_mrcc(i_state, k)
             call apply_excitation(psi_non_ref(1,1,i),exc_Ik,det_tmp,ok,N_int)
             if(ok) cycle
-              contrib = delta_IJk * HIl * lambda_mrcc(i_state,l)   
+            contrib = delta_IJk * HIl * lambda_mrcc(i_state,l)   
+            if(dabs(psi_ref_coef(II,i_state)).ge.5.d-5) then
+              contrib2 = contrib / psi_ref_coef(II, i_state) * psi_non_ref_coef(l,i_state)
+              !$OMP ATOMIC
+              delta_sub_ii(II,i_state) -= contrib2
+            else
+              contrib = contrib * 0.5d0
+            endif
             !$OMP ATOMIC
             delta_sub_ij(II, i, i_state) += contrib
-            if(dabs(psi_ref_coef(II,i_state)).ge.5.d-5) then
-              !$OMP ATOMIC
-              delta_sub_ii(II,i_state) -= contrib / psi_ref_coef(II, i_state) * psi_non_ref_coef(l,i_state)
-            endif
           end do
         end do
       end do

From 360d38a41de3c30cdba6ce8fbadb50d0e18b82f4 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Tue, 18 Oct 2016 23:10:04 +0200
Subject: [PATCH 09/15] Format error in loc_cele

---
 plugins/loc_cele/loc_cele.irp.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/plugins/loc_cele/loc_cele.irp.f b/plugins/loc_cele/loc_cele.irp.f
index c9036aa1..52e0ef28 100644
--- a/plugins/loc_cele/loc_cele.irp.f
+++ b/plugins/loc_cele/loc_cele.irp.f
@@ -451,7 +451,7 @@
 
       enddo !big loop over symmetry
 
-      10 format (4E18.12)
+      10 format (4E19.12)
 
 
 !  Now we copyt the newcmo into the mo_coef

From 1185d70be7404841abb9d4c0c419ba41f91af49d Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Tue, 18 Oct 2016 23:31:20 +0200
Subject: [PATCH 10/15] Removed all ipc between Fortran and OCaml

---
 ocaml/TaskServer.ml                  |  4 ++--
 plugins/Full_CI_ZMQ/fci_zmq.irp.f    |  2 +-
 plugins/MRCC_Utils/mrcc_utils.irp.f  | 16 ++++++++--------
 plugins/mrcepa0/dressing.irp.f       |  1 -
 src/Davidson/davidson_parallel.irp.f |  4 ++--
 src/ZMQ/utils.irp.f                  | 27 ++++++++++-----------------
 6 files changed, 23 insertions(+), 31 deletions(-)

diff --git a/ocaml/TaskServer.ml b/ocaml/TaskServer.ml
index 9a1797f8..6edc8122 100644
--- a/ocaml/TaskServer.ml
+++ b/ocaml/TaskServer.ml
@@ -678,9 +678,9 @@ let run ~port =
 
               (** Debug input *)
               Printf.sprintf "q:%d  r:%d  n:%d  : %s\n%!"
-              (Queuing_system.number_of_queued  program_state.queue)
+              (Queuing_system.number_of_queued program_state.queue)
               (Queuing_system.number_of_running program_state.queue)
-              (Queuing_system.number_of_tasks   program_state.queue)
+              (Queuing_system.number_of_tasks program_state.queue)
               (Message.to_string message)
               |> debug;
 
diff --git a/plugins/Full_CI_ZMQ/fci_zmq.irp.f b/plugins/Full_CI_ZMQ/fci_zmq.irp.f
index a5dd8dcf..c81b1266 100644
--- a/plugins/Full_CI_ZMQ/fci_zmq.irp.f
+++ b/plugins/Full_CI_ZMQ/fci_zmq.irp.f
@@ -134,7 +134,7 @@ subroutine ZMQ_selection(N_in, pt2)
 
   step = int(5000000.d0 / dble(N_int * N_states * elec_num * elec_num * mo_tot_num * mo_tot_num ))
   step = max(1,step)
-  do i= 1,N_det_generators, step
+  do i= N_det_generators, 1, -step
     i_generator_start = max(i-step+1,1)
     i_generator_max = i
     write(task,*) i_generator_start, i_generator_max, 1, N
diff --git a/plugins/MRCC_Utils/mrcc_utils.irp.f b/plugins/MRCC_Utils/mrcc_utils.irp.f
index f1c4b4a3..84bca0b4 100644
--- a/plugins/MRCC_Utils/mrcc_utils.irp.f
+++ b/plugins/MRCC_Utils/mrcc_utils.irp.f
@@ -1043,23 +1043,23 @@ END_PROVIDER
        if (rho_mrcc(i,s) == 0.d0) then
          rho_mrcc(i,s) = 1.d-32
        endif
-       
+
        ! f is such that f.\tilde{c_i} = c_i
        f = psi_non_ref_coef(i,s) / rho_mrcc(i,s)
-       
+
        ! Avoid numerical instabilities
        f = min(f,2.d0)
        f = max(f,-2.d0)
-       
+
        norm = norm + f*f *rho_mrcc(i,s)*rho_mrcc(i,s)
        rho_mrcc(i,s) = f
      enddo
      ! norm now contains the norm of |T.Psi_0>
      ! rho_mrcc now contains the f factors
-     
+
      f = 1.d0/norm
      ! f now contains 1/ <T.Psi_0|T.Psi_0>
-     
+
      norm = 1.d0
      do i=1,N_det_ref
        norm = norm - psi_ref_coef(i,s)*psi_ref_coef(i,s)
@@ -1067,14 +1067,14 @@ END_PROVIDER
      ! norm now contains <Psi_SD|Psi_SD>
      f = dsqrt(f*norm)
      ! f normalises T.Psi_0 such that (1+T)|Psi> is normalized
-     
+
      norm = norm*f
      print *,  'norm of |T Psi_0> = ', dsqrt(norm)
-     
+
      do i=1,N_det_ref
        norm = norm + psi_ref_coef(i,s)*psi_ref_coef(i,s)
      enddo
-     
+
      do i=1,N_det_non_ref
        rho_mrcc(i,s) = rho_mrcc(i,s) * f
      enddo
diff --git a/plugins/mrcepa0/dressing.irp.f b/plugins/mrcepa0/dressing.irp.f
index 4f355f2b..3646b0b2 100644
--- a/plugins/mrcepa0/dressing.irp.f
+++ b/plugins/mrcepa0/dressing.irp.f
@@ -554,7 +554,6 @@ END_PROVIDER
         do k=1,N_det_non_ref
 
           call i_h_j(psi_ref(1,1,j), psi_non_ref(1,1,k),N_int,Hjk)
-!          call i_h_j(psi_non_ref(1,1,k),psi_ref(1,1,i), N_int,Hki)
           
           delta_cas(i,j,i_state) += Hjk * dij(i, k, i_state) ! * Hki * lambda_mrcc(i_state, k)
           !print *, Hjk * get_dij(psi_ref(1,1,i), psi_non_ref(1,1,k), N_int), Hki * get_dij(psi_ref(1,1,j), psi_non_ref(1,1,k), N_int)
diff --git a/src/Davidson/davidson_parallel.irp.f b/src/Davidson/davidson_parallel.irp.f
index 50b58f67..cede52c9 100644
--- a/src/Davidson/davidson_parallel.irp.f
+++ b/src/Davidson/davidson_parallel.irp.f
@@ -501,7 +501,7 @@ subroutine davidson_miniserver_end()
   integer                 rc
   character*(64)          buf
   
-  address = trim(qp_run_address_tcp)//':11223'
+  address = trim(qp_run_address)//':11223'
   requester = f77_zmq_socket(zmq_context, ZMQ_REQ)
   rc        = f77_zmq_connect(requester,address)
 
@@ -520,7 +520,7 @@ subroutine davidson_miniserver_get()
   character*(20)          buffer
   integer                 rc
   
-  address = trim(qp_run_address_tcp)//':11223'
+  address = trim(qp_run_address)//':11223'
   
   requester = f77_zmq_socket(zmq_context, ZMQ_REQ)
   rc        = f77_zmq_connect(requester,address)
diff --git a/src/ZMQ/utils.irp.f b/src/ZMQ/utils.irp.f
index 84665199..f2703ff8 100644
--- a/src/ZMQ/utils.irp.f
+++ b/src/ZMQ/utils.irp.f
@@ -17,8 +17,6 @@ END_PROVIDER
 
 
  BEGIN_PROVIDER [ character*(128), qp_run_address ]
-&BEGIN_PROVIDER [ character*(128), qp_run_address_ipc ]
-&BEGIN_PROVIDER [ character*(128), qp_run_address_tcp ]
 &BEGIN_PROVIDER [ integer, zmq_port_start ]
   use f77_zmq
   implicit none
@@ -36,22 +34,19 @@ END_PROVIDER
   integer                        :: i
   do i=len(buffer),1,-1
     if ( buffer(i:i) == ':') then
-      qp_run_address_tcp = trim(buffer(1:i-1))
+      qp_run_address = trim(buffer(1:i-1))
       read(buffer(i+1:), *) zmq_port_start
       exit
     endif
   enddo
-  qp_run_address_ipc = 'ipc:///tmp/qp_run'
-  qp_run_address = qp_run_address_ipc
 END_PROVIDER
 
-
  BEGIN_PROVIDER [ character*(128), zmq_socket_pull_tcp_address    ]
-&BEGIN_PROVIDER [ character*(128), zmq_socket_pull_inproc_address ]
 &BEGIN_PROVIDER [ character*(128), zmq_socket_pair_inproc_address ]
 &BEGIN_PROVIDER [ character*(128), zmq_socket_push_tcp_address    ]
+&BEGIN_PROVIDER [ character*(128), zmq_socket_pull_inproc_address ]
 &BEGIN_PROVIDER [ character*(128), zmq_socket_push_inproc_address ]
-&BEGIN_PROVIDER [ character*(128), zmq_socket_sub_address     ]
+&BEGIN_PROVIDER [ character*(128), zmq_socket_sub_tcp_address ]
   use f77_zmq
   implicit none
   BEGIN_DOC
@@ -59,12 +54,12 @@ END_PROVIDER
   END_DOC
   character*(8), external        :: zmq_port
 
+  zmq_socket_sub_tcp_address     = trim(qp_run_address)//':'//zmq_port(1)//' '
   zmq_socket_pull_tcp_address    = 'tcp://*:'//zmq_port(2)//' '
+  zmq_socket_push_tcp_address    = trim(qp_run_address)//':'//zmq_port(2)//' '
   zmq_socket_pull_inproc_address = 'inproc://'//zmq_port(2)//' '
-  zmq_socket_pair_inproc_address = 'inproc://'//zmq_port(3)//' '
-  zmq_socket_push_tcp_address    = trim(qp_run_address_tcp)//':'//zmq_port(2)//' '
   zmq_socket_push_inproc_address = zmq_socket_pull_inproc_address
-  zmq_socket_sub_address         = trim(qp_run_address)//':'//zmq_port(1)//' '
+  zmq_socket_pair_inproc_address = 'inproc://'//zmq_port(3)//' '
 
   ! /!\ Don't forget to change subroutine reset_zmq_addresses
 END_PROVIDER
@@ -77,13 +72,12 @@ subroutine reset_zmq_addresses
   END_DOC
   character*(8), external        :: zmq_port
 
+  zmq_socket_sub_tcp_address     = trim(qp_run_address)//':'//zmq_port(1)//' '
   zmq_socket_pull_tcp_address    = 'tcp://*:'//zmq_port(2)//' '
+  zmq_socket_push_tcp_address    = trim(qp_run_address)//':'//zmq_port(2)//' '
   zmq_socket_pull_inproc_address = 'inproc://'//zmq_port(2)//' '
-  zmq_socket_pair_inproc_address = 'inproc://'//zmq_port(3)//' '
-  zmq_socket_push_tcp_address    = trim(qp_run_address_tcp)//':'//zmq_port(2)//' '
   zmq_socket_push_inproc_address = zmq_socket_pull_inproc_address
-  zmq_socket_sub_address         = trim(qp_run_address)//':'//zmq_port(1)//' '
-
+  zmq_socket_pair_inproc_address = 'inproc://'//zmq_port(3)//' '
 end
 
 
@@ -111,7 +105,6 @@ subroutine switch_qp_run_to_master
       exit
     endif
   enddo
-  qp_run_address_tcp = qp_run_address
   call reset_zmq_addresses
 
 end
@@ -374,7 +367,7 @@ function new_zmq_sub_socket()
     stop 'Unable to subscribe new_zmq_sub_socket'
   endif
 
-  rc = f77_zmq_connect(new_zmq_sub_socket, zmq_socket_sub_address)
+  rc = f77_zmq_connect(new_zmq_sub_socket, zmq_socket_sub_tcp_address)
   if (rc /= 0) then
     stop 'Unable to connect new_zmq_sub_socket'
   endif

From 8802d988490bff8ffa656b1484d6526a343c65b1 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Thu, 27 Oct 2016 13:49:29 +0200
Subject: [PATCH 11/15] wrong dimensions in s2_out

---
 src/Davidson/diagonalization_hs2.irp.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Davidson/diagonalization_hs2.irp.f b/src/Davidson/diagonalization_hs2.irp.f
index a7bc2b95..d7ec11b6 100644
--- a/src/Davidson/diagonalization_hs2.irp.f
+++ b/src/Davidson/diagonalization_hs2.irp.f
@@ -22,7 +22,7 @@ subroutine davidson_diag_hs2(dets_in,u_in,s2_out,dim_in,energies,sze,N_st,N_st_d
   integer, intent(in)            :: dim_in, sze, N_st, N_st_diag, Nint, iunit
   integer(bit_kind), intent(in)  :: dets_in(Nint,2,sze)
   double precision, intent(inout) :: u_in(dim_in,N_st_diag)
-  double precision, intent(out)  :: energies(N_st), s2_out(N_st)
+  double precision, intent(out)  :: energies(N_st), s2_out(N_st_diag)
   double precision, allocatable  :: H_jj(:), S2_jj(:)
   
   double precision               :: diag_h_mat_elem

From afc4111e24dc5f0578346b863fa0ebfee4fdf9a6 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 28 Oct 2016 17:10:49 +0200
Subject: [PATCH 12/15] Fixed "Unable to bind socket"

---
 configure           |  5 ++++-
 src/ZMQ/utils.irp.f | 35 ++++++++++++++++++++++++++++-------
 2 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/configure b/configure
index 19016136..8cb02608 100755
--- a/configure
+++ b/configure
@@ -487,7 +487,6 @@ def create_ninja_and_rc(l_installed):
 
     l_rc = [
         'export QP_ROOT={0}'.format(QP_ROOT),
-        '#export QP_NIC=ib0   # Choose the correct network inuterface',
         'export QP_EZFIO={0}'.format(path_ezfio.replace(QP_ROOT,"${QP_ROOT}")),
         'export QP_PYTHON={0}'.format(":".join(l_python)), "",
         'export IRPF90={0}'.format(path_irpf90.replace(QP_ROOT,"${QP_ROOT}")),
@@ -498,6 +497,10 @@ def create_ninja_and_rc(l_installed):
         'export LIBRARY_PATH="${QP_ROOT}"/lib:"${LIBRARY_PATH}"', "",
         'source ${QP_ROOT}/install/EZFIO/Bash/ezfio.sh', "",
         'source ${HOME}/.opam/opam-init/init.sh > /dev/null 2> /dev/null || true',
+        '',
+        '# Choose the correct network interface',
+        '# export QP_NIC=ib0',
+        '# export QP_NIC=eth0',
         ""
     ]
 
diff --git a/src/ZMQ/utils.irp.f b/src/ZMQ/utils.irp.f
index f2703ff8..444c33fe 100644
--- a/src/ZMQ/utils.irp.f
+++ b/src/ZMQ/utils.irp.f
@@ -257,15 +257,36 @@ function new_zmq_pull_socket()
     stop 'Unable to set ZMQ_RCVHWM on pull socket'
   endif
   
-  rc = f77_zmq_bind(new_zmq_pull_socket, zmq_socket_pull_tcp_address)
-  if (rc /= 0) then
-    print *,  'Unable to bind new_zmq_pull_socket (tcp)', zmq_socket_pull_tcp_address
+  integer :: icount
+
+  icount = 10
+  do while (icount > 0)
+    rc = f77_zmq_bind(new_zmq_pull_socket, zmq_socket_pull_inproc_address)
+    if (rc /= 0) then
+      icount = icount-1
+      call sleep(3)
+    endif
+  enddo
+
+  if (icount == 0) then
+    print *,  'Unable to bind new_zmq_pull_socket (inproc)', zmq_socket_pull_inproc_address
     stop 
   endif
-  
-  rc = f77_zmq_bind(new_zmq_pull_socket, zmq_socket_pull_inproc_address)
-  if (rc /= 0) then
-    stop 'Unable to bind new_zmq_pull_socket (inproc)'
+
+
+  icount = 10
+  do while (icount > 0)
+    rc = f77_zmq_bind(new_zmq_pull_socket, zmq_socket_pull_tcp_address)
+    if (rc /= 0) then
+      icount = icount-1
+      call sleep(3)
+    endif
+    
+  enddo
+
+  if (icount == 0) then
+    print *,  'Unable to bind new_zmq_pull_socket (tcp)', zmq_socket_pull_tcp_address
+    stop 
   endif
   
 end

From 08ac74cc2dfe8c7f91b36c6e6da2ecb8911a2808 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 28 Oct 2016 18:18:46 +0200
Subject: [PATCH 13/15] Fixed binding bug

---
 src/ZMQ/utils.irp.f | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/ZMQ/utils.irp.f b/src/ZMQ/utils.irp.f
index 444c33fe..b2b5b7b4 100644
--- a/src/ZMQ/utils.irp.f
+++ b/src/ZMQ/utils.irp.f
@@ -265,12 +265,14 @@ function new_zmq_pull_socket()
     if (rc /= 0) then
       icount = icount-1
       call sleep(3)
+    else
+      exit
     endif
   enddo
 
   if (icount == 0) then
     print *,  'Unable to bind new_zmq_pull_socket (inproc)', zmq_socket_pull_inproc_address
-    stop 
+    stop -1
   endif
 
 
@@ -280,13 +282,14 @@ function new_zmq_pull_socket()
     if (rc /= 0) then
       icount = icount-1
       call sleep(3)
+    else
+      exit
     endif
-    
   enddo
 
   if (icount == 0) then
     print *,  'Unable to bind new_zmq_pull_socket (tcp)', zmq_socket_pull_tcp_address
-    stop 
+    stop -1
   endif
   
 end

From 156a3f551bc859c232bb181e31d83d9cd4ce5ca7 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 28 Oct 2016 21:59:39 +0200
Subject: [PATCH 14/15] Accelerated integral access

---
 src/Integrals_Bielec/map_integrals.irp.f | 85 ++++++++++++++----------
 1 file changed, 51 insertions(+), 34 deletions(-)

diff --git a/src/Integrals_Bielec/map_integrals.irp.f b/src/Integrals_Bielec/map_integrals.irp.f
index afc573aa..22fd48a6 100644
--- a/src/Integrals_Bielec/map_integrals.irp.f
+++ b/src/Integrals_Bielec/map_integrals.irp.f
@@ -120,15 +120,16 @@ end
 
 END_PROVIDER
 
-BEGIN_PROVIDER [ double precision, ao_integrals_cache, (ao_integrals_cache_min:ao_integrals_cache_max,ao_integrals_cache_min:ao_integrals_cache_max,ao_integrals_cache_min:ao_integrals_cache_max,ao_integrals_cache_min:ao_integrals_cache_max) ]
+BEGIN_PROVIDER [ double precision, ao_integrals_cache, (0:64*64*64*64) ]
  implicit none
  BEGIN_DOC
  ! Cache of AO integrals for fast access
  END_DOC
  PROVIDE ao_bielec_integrals_in_map
- integer                        :: i,j,k,l
+ integer                        :: i,j,k,l,ii
  integer(key_kind)              :: idx
- !$OMP PARALLEL DO PRIVATE (i,j,k,l,idx)
+ real(integral_kind)            :: integral
+ !$OMP PARALLEL DO PRIVATE (i,j,k,l,idx,ii,integral)
  do l=ao_integrals_cache_min,ao_integrals_cache_max
    do k=ao_integrals_cache_min,ao_integrals_cache_max
      do j=ao_integrals_cache_min,ao_integrals_cache_max
@@ -136,7 +137,12 @@ BEGIN_PROVIDER [ double precision, ao_integrals_cache, (ao_integrals_cache_min:a
          !DIR$ FORCEINLINE
          call bielec_integrals_index(i,j,k,l,idx)
          !DIR$ FORCEINLINE
-         call map_get(ao_integrals_map,idx,ao_integrals_cache(i,j,k,l))
+         call map_get(ao_integrals_map,idx,integral)
+         ii = l-ao_integrals_cache_min
+         ii = ior( ishft(ii,6), k-ao_integrals_cache_min)
+         ii = ior( ishft(ii,6), j-ao_integrals_cache_min)
+         ii = ior( ishft(ii,6), i-ao_integrals_cache_min)
+         ao_integrals_cache(ii) = integral
        enddo
      enddo
    enddo
@@ -155,30 +161,33 @@ double precision function get_ao_bielec_integral(i,j,k,l,map)
   integer, intent(in)            :: i,j,k,l
   integer(key_kind)              :: idx
   type(map_type), intent(inout)  :: map
+  integer                        :: ii
   real(integral_kind)            :: tmp
-  PROVIDE ao_bielec_integrals_in_map
+  PROVIDE ao_bielec_integrals_in_map ao_integrals_cache ao_integrals_cache_min
   !DIR$ FORCEINLINE
   if (ao_overlap_abs(i,k)*ao_overlap_abs(j,l) < ao_integrals_threshold ) then
     tmp = 0.d0
   else if (ao_bielec_integral_schwartz(i,k)*ao_bielec_integral_schwartz(j,l) < ao_integrals_threshold) then
     tmp = 0.d0
   else
-    if ( (i >= ao_integrals_cache_min) .and. &
-        (j >= ao_integrals_cache_min) .and. &
-        (k >= ao_integrals_cache_min) .and. &
-        (l >= ao_integrals_cache_min) .and. &
-        (i <= ao_integrals_cache_max) .and. &
-        (j <= ao_integrals_cache_max) .and. &
-        (k <= ao_integrals_cache_max) .and. &
-        (l <= ao_integrals_cache_max) ) then
-        tmp = ao_integrals_cache(i,j,k,l)
-    else
+    ii = l-ao_integrals_cache_min
+    ii = ior(ii, k-ao_integrals_cache_min)
+    ii = ior(ii, j-ao_integrals_cache_min)
+    ii = ior(ii, i-ao_integrals_cache_min)
+    if (iand(ii, -64) /= 0) then
       !DIR$ FORCEINLINE
       call bielec_integrals_index(i,j,k,l,idx)
+      !DIR$ FORCEINLINE
       call map_get(map,idx,tmp)
+      get_ao_bielec_integral = dble(tmp)
+    else
+      ii = l-ao_integrals_cache_min
+      ii = ior( ishft(ii,6), k-ao_integrals_cache_min)
+      ii = ior( ishft(ii,6), j-ao_integrals_cache_min)
+      ii = ior( ishft(ii,6), i-ao_integrals_cache_min)
+      get_ao_bielec_integral = ao_integrals_cache(ii)
     endif
   endif
-  get_ao_bielec_integral = tmp
 end
 
 
@@ -324,20 +333,22 @@ end
  ! Min and max values of the MOs for which the integrals are in the cache
  END_DOC
  mo_integrals_cache_min = max(1,elec_alpha_num - 31)
- mo_integrals_cache_max = min(mo_tot_num,elec_alpha_num + 32)
+ mo_integrals_cache_max = min(mo_tot_num,mo_integrals_cache_min+63)
 
 END_PROVIDER
 
-BEGIN_PROVIDER [ double precision, mo_integrals_cache, (mo_integrals_cache_min:mo_integrals_cache_max,mo_integrals_cache_min:mo_integrals_cache_max,mo_integrals_cache_min:mo_integrals_cache_max,mo_integrals_cache_min:mo_integrals_cache_max) ]
+BEGIN_PROVIDER [ double precision, mo_integrals_cache, (0:64*64*64*64) ]
  implicit none
  BEGIN_DOC
  ! Cache of MO integrals for fast access
  END_DOC
  PROVIDE mo_bielec_integrals_in_map
  integer                        :: i,j,k,l
+ integer                        :: ii
  integer(key_kind)              :: idx
+ real(integral_kind)            :: integral
  FREE ao_integrals_cache
- !$OMP PARALLEL DO PRIVATE (i,j,k,l,idx)
+ !$OMP PARALLEL DO PRIVATE (i,j,k,l,idx,ii,integral)
  do l=mo_integrals_cache_min,mo_integrals_cache_max
    do k=mo_integrals_cache_min,mo_integrals_cache_max
      do j=mo_integrals_cache_min,mo_integrals_cache_max
@@ -345,7 +356,12 @@ BEGIN_PROVIDER [ double precision, mo_integrals_cache, (mo_integrals_cache_min:m
          !DIR$ FORCEINLINE
          call bielec_integrals_index(i,j,k,l,idx)
          !DIR$ FORCEINLINE
-         call map_get(mo_integrals_map,idx,mo_integrals_cache(i,j,k,l))
+         call map_get(mo_integrals_map,idx,integral)
+         ii = l-mo_integrals_cache_min
+         ii = ior( ishft(ii,6), k-mo_integrals_cache_min)
+         ii = ior( ishft(ii,6), j-mo_integrals_cache_min)
+         ii = ior( ishft(ii,6), i-mo_integrals_cache_min)
+         mo_integrals_cache(ii) = integral
        enddo
      enddo
    enddo
@@ -362,24 +378,26 @@ double precision function get_mo_bielec_integral(i,j,k,l,map)
   END_DOC
   integer, intent(in)            :: i,j,k,l
   integer(key_kind)              :: idx
+  integer                        :: ii
   type(map_type), intent(inout)  :: map
   real(integral_kind)            :: tmp
   PROVIDE mo_bielec_integrals_in_map mo_integrals_cache
-  if ( (i >= mo_integrals_cache_min) .and. &
-       (j >= mo_integrals_cache_min) .and. &
-       (k >= mo_integrals_cache_min) .and. &
-       (l >= mo_integrals_cache_min) .and. &
-       (i <= mo_integrals_cache_max) .and. &
-       (j <= mo_integrals_cache_max) .and. &
-       (k <= mo_integrals_cache_max) .and. &
-       (l <= mo_integrals_cache_max) ) then
-       get_mo_bielec_integral = mo_integrals_cache(i,j,k,l)
-  else
+  ii = l-mo_integrals_cache_min
+  ii = ior(ii, k-mo_integrals_cache_min)
+  ii = ior(ii, j-mo_integrals_cache_min)
+  ii = ior(ii, i-mo_integrals_cache_min)
+  if (iand(ii, -64) /= 0) then
     !DIR$ FORCEINLINE
     call bielec_integrals_index(i,j,k,l,idx)
     !DIR$ FORCEINLINE
     call map_get(map,idx,tmp)
     get_mo_bielec_integral = dble(tmp)
+  else
+    ii = l-mo_integrals_cache_min
+    ii = ior( ishft(ii,6), k-mo_integrals_cache_min)
+    ii = ior( ishft(ii,6), j-mo_integrals_cache_min)
+    ii = ior( ishft(ii,6), i-mo_integrals_cache_min)
+    get_mo_bielec_integral = mo_integrals_cache(ii)
   endif
 end
 
@@ -390,16 +408,15 @@ double precision function get_mo_bielec_integral_schwartz(i,j,k,l,map)
   ! Returns one integral <ij|kl> in the MO basis
   END_DOC
   integer, intent(in)            :: i,j,k,l
-  integer(key_kind)              :: idx
   type(map_type), intent(inout)  :: map
-  real(integral_kind)            :: tmp
+  double precision, external     :: get_mo_bielec_integral
+
   PROVIDE mo_bielec_integrals_in_map  mo_integrals_cache
   if (mo_bielec_integral_schwartz(i,k)*mo_bielec_integral_schwartz(j,l) > mo_integrals_threshold) then
-    double precision, external :: get_mo_bielec_integral
     !DIR$ FORCEINLINE
     get_mo_bielec_integral_schwartz = get_mo_bielec_integral(i,j,k,l,map)
   else
-    tmp = 0.d0
+    get_mo_bielec_integral_schwartz = 0.d0
   endif
 end
 

From 3946c710feb8454d53f74ba8dc16390c2de01429 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 28 Oct 2016 22:26:20 +0200
Subject: [PATCH 15/15] Accelerated mono-excitations (mipi miip)

---
 src/Determinants/slater_rules.irp.f        | 131 +++------------------
 src/Integrals_Bielec/map_integrals.irp.f   |   1 +
 src/Integrals_Bielec/mo_bi_integrals.irp.f |  25 ++++
 3 files changed, 40 insertions(+), 117 deletions(-)

diff --git a/src/Determinants/slater_rules.irp.f b/src/Determinants/slater_rules.irp.f
index 67463088..7df6e79e 100644
--- a/src/Determinants/slater_rules.irp.f
+++ b/src/Determinants/slater_rules.irp.f
@@ -515,8 +515,6 @@ subroutine i_H_j(key_i,key_j,Nint,hij)
   integer                        :: occ(Nint*bit_kind_size,2)
   double precision               :: diag_H_mat_elem, phase,phase_2
   integer                        :: n_occ_ab(2)
-  logical                        :: has_mipi(Nint*bit_kind_size)
-  double precision               :: mipi(Nint*bit_kind_size), miip(Nint*bit_kind_size)
   PROVIDE mo_bielec_integrals_in_map mo_integrals_map
   
   ASSERT (Nint > 0)
@@ -568,59 +566,27 @@ subroutine i_H_j(key_i,key_j,Nint,hij)
       call get_mono_excitation(key_i,key_j,exc,phase,Nint)
       !DIR$ FORCEINLINE
       call bitstring_to_list_ab(key_i, occ, n_occ_ab, Nint)
-      has_mipi = .False.
       if (exc(0,1,1) == 1) then
         ! Mono alpha
         m = exc(1,1,1)
         p = exc(1,2,1)
         do k = 1, elec_alpha_num
-          i = occ(k,1)
-          if (.not.has_mipi(i)) then
-            mipi(i) = get_mo_bielec_integral(m,i,p,i,mo_integrals_map)
-            miip(i) = get_mo_bielec_integral(m,i,i,p,mo_integrals_map)
-            has_mipi(i) = .True.
-          endif
+          hij = hij + mo_bielec_integral_mipi_anti(occ(k,1),m,p)
         enddo
         do k = 1, elec_beta_num
-          i = occ(k,2)
-          if (.not.has_mipi(i)) then
-            mipi(i) = get_mo_bielec_integral(m,i,p,i,mo_integrals_map)
-            has_mipi(i) = .True.
-          endif
-        enddo
-        
-        do k = 1, elec_alpha_num
-          hij = hij + mipi(occ(k,1)) - miip(occ(k,1))
-        enddo
-        do k = 1, elec_beta_num
-          hij = hij + mipi(occ(k,2))
+          hij = hij + mo_bielec_integral_mipi(occ(k,2),m,p)
         enddo
         
       else
         ! Mono beta
         m = exc(1,1,2)
         p = exc(1,2,2)
-        do k = 1, elec_beta_num
-          i = occ(k,2)
-          if (.not.has_mipi(i)) then
-            mipi(i) = get_mo_bielec_integral(m,i,p,i,mo_integrals_map)
-            miip(i) = get_mo_bielec_integral(m,i,i,p,mo_integrals_map)
-            has_mipi(i) = .True.
-          endif
-        enddo
+
         do k = 1, elec_alpha_num
-          i = occ(k,1)
-          if (.not.has_mipi(i)) then
-            mipi(i) = get_mo_bielec_integral(m,i,p,i,mo_integrals_map)
-            has_mipi(i) = .True.
-          endif
-        enddo
-        
-        do k = 1, elec_alpha_num
-          hij = hij + mipi(occ(k,1))
+          hij = hij + mo_bielec_integral_mipi(occ(k,1),m,p) 
         enddo
         do k = 1, elec_beta_num
-          hij = hij + mipi(occ(k,2)) - miip(occ(k,2))
+          hij = hij + mo_bielec_integral_mipi_anti(occ(k,2),m,p)
         enddo
         
       endif
@@ -651,8 +617,6 @@ subroutine i_H_j_phase_out(key_i,key_j,Nint,hij,phase,exc,degree)
   integer                        :: occ(Nint*bit_kind_size,2)
   double precision               :: diag_H_mat_elem
   integer                        :: n_occ_ab(2)
-  logical                        :: has_mipi(Nint*bit_kind_size)
-  double precision               :: mipi(Nint*bit_kind_size), miip(Nint*bit_kind_size)
   PROVIDE mo_bielec_integrals_in_map mo_integrals_map
 
   ASSERT (Nint > 0)
@@ -704,59 +668,27 @@ subroutine i_H_j_phase_out(key_i,key_j,Nint,hij,phase,exc,degree)
       call get_mono_excitation(key_i,key_j,exc,phase,Nint)
       !DIR$ FORCEINLINE
       call bitstring_to_list_ab(key_i, occ, n_occ_ab, Nint)
-      has_mipi = .False.
       if (exc(0,1,1) == 1) then
         ! Mono alpha
         m = exc(1,1,1)
         p = exc(1,2,1)
         do k = 1, elec_alpha_num
-          i = occ(k,1)
-          if (.not.has_mipi(i)) then
-            mipi(i) = get_mo_bielec_integral(m,i,p,i,mo_integrals_map)
-            miip(i) = get_mo_bielec_integral(m,i,i,p,mo_integrals_map)
-            has_mipi(i) = .True.
-          endif
+          hij = hij + mo_bielec_integral_mipi_anti(occ(k,1),m,p)
         enddo
         do k = 1, elec_beta_num
-          i = occ(k,2)
-          if (.not.has_mipi(i)) then
-            mipi(i) = get_mo_bielec_integral(m,i,p,i,mo_integrals_map)
-            has_mipi(i) = .True.
-          endif
+          hij = hij + mo_bielec_integral_mipi(occ(k,2),m,p)
         enddo
-
-        do k = 1, elec_alpha_num
-          hij = hij + mipi(occ(k,1)) - miip(occ(k,1))
-        enddo
-        do k = 1, elec_beta_num
-          hij = hij + mipi(occ(k,2))
-        enddo
-
+        
       else
         ! Mono beta
         m = exc(1,1,2)
         p = exc(1,2,2)
-        do k = 1, elec_beta_num
-          i = occ(k,2)
-          if (.not.has_mipi(i)) then
-            mipi(i) = get_mo_bielec_integral(m,i,p,i,mo_integrals_map)
-            miip(i) = get_mo_bielec_integral(m,i,i,p,mo_integrals_map)
-            has_mipi(i) = .True.
-          endif
-        enddo
-        do k = 1, elec_alpha_num
-          i = occ(k,1)
-          if (.not.has_mipi(i)) then
-            mipi(i) = get_mo_bielec_integral(m,i,p,i,mo_integrals_map)
-            has_mipi(i) = .True.
-          endif
-        enddo
 
         do k = 1, elec_alpha_num
-          hij = hij + mipi(occ(k,1))
+          hij = hij + mo_bielec_integral_mipi(occ(k,1),m,p) 
         enddo
         do k = 1, elec_beta_num
-          hij = hij + mipi(occ(k,2)) - miip(occ(k,2))
+          hij = hij + mo_bielec_integral_mipi_anti(occ(k,2),m,p)
         enddo
 
       endif
@@ -787,8 +719,6 @@ subroutine i_H_j_verbose(key_i,key_j,Nint,hij,hmono,hdouble)
   integer                        :: occ(Nint*bit_kind_size,2)
   double precision               :: diag_H_mat_elem, phase,phase_2
   integer                        :: n_occ_ab(2)
-  logical                        :: has_mipi(Nint*bit_kind_size)
-  double precision               :: mipi(Nint*bit_kind_size), miip(Nint*bit_kind_size)
   PROVIDE mo_bielec_integrals_in_map mo_integrals_map
   
   ASSERT (Nint > 0)
@@ -842,59 +772,26 @@ subroutine i_H_j_verbose(key_i,key_j,Nint,hij,hmono,hdouble)
       call get_mono_excitation(key_i,key_j,exc,phase,Nint)
       !DIR$ FORCEINLINE
       call bitstring_to_list_ab(key_i, occ, n_occ_ab, Nint)
-      has_mipi = .False.
       if (exc(0,1,1) == 1) then
         ! Mono alpha
         m = exc(1,1,1)
         p = exc(1,2,1)
         do k = 1, elec_alpha_num
-          i = occ(k,1)
-          if (.not.has_mipi(i)) then
-            mipi(i) = get_mo_bielec_integral(m,i,p,i,mo_integrals_map)
-            miip(i) = get_mo_bielec_integral(m,i,i,p,mo_integrals_map)
-            has_mipi(i) = .True.
-          endif
+          hdouble = hdouble +  mo_bielec_integral_mipi_anti(occ(k,1),m,p)
         enddo
         do k = 1, elec_beta_num
-          i = occ(k,2)
-          if (.not.has_mipi(i)) then
-            mipi(i) = get_mo_bielec_integral(m,i,p,i,mo_integrals_map)
-            has_mipi(i) = .True.
-          endif
-        enddo
-        
-        do k = 1, elec_alpha_num
-          hdouble = hdouble + mipi(occ(k,1)) - miip(occ(k,1))
-        enddo
-        do k = 1, elec_beta_num
-          hdouble = hdouble + mipi(occ(k,2))
+          hdouble = hdouble + mo_bielec_integral_mipi(occ(k,2),m,p)
         enddo
         
       else
         ! Mono beta
         m = exc(1,1,2)
         p = exc(1,2,2)
-        do k = 1, elec_beta_num
-          i = occ(k,2)
-          if (.not.has_mipi(i)) then
-            mipi(i) = get_mo_bielec_integral(m,i,p,i,mo_integrals_map)
-            miip(i) = get_mo_bielec_integral(m,i,i,p,mo_integrals_map)
-            has_mipi(i) = .True.
-          endif
-        enddo
         do k = 1, elec_alpha_num
-          i = occ(k,1)
-          if (.not.has_mipi(i)) then
-            mipi(i) = get_mo_bielec_integral(m,i,p,i,mo_integrals_map)
-            has_mipi(i) = .True.
-          endif
-        enddo
-        
-        do k = 1, elec_alpha_num
-          hdouble = hdouble + mipi(occ(k,1))
+          hdouble = hdouble + mo_bielec_integral_mipi(occ(k,1),m,p) 
         enddo
         do k = 1, elec_beta_num
-          hdouble = hdouble + mipi(occ(k,2)) - miip(occ(k,2))
+          hdouble = hdouble + mo_bielec_integral_mipi_anti(occ(k,2),m,p)
         enddo
         
       endif
diff --git a/src/Integrals_Bielec/map_integrals.irp.f b/src/Integrals_Bielec/map_integrals.irp.f
index 22fd48a6..b41a3177 100644
--- a/src/Integrals_Bielec/map_integrals.irp.f
+++ b/src/Integrals_Bielec/map_integrals.irp.f
@@ -370,6 +370,7 @@ BEGIN_PROVIDER [ double precision, mo_integrals_cache, (0:64*64*64*64) ]
 
 END_PROVIDER
 
+
 double precision function get_mo_bielec_integral(i,j,k,l,map)
   use map_module
   implicit none
diff --git a/src/Integrals_Bielec/mo_bi_integrals.irp.f b/src/Integrals_Bielec/mo_bi_integrals.irp.f
index 0a468c24..e581b536 100644
--- a/src/Integrals_Bielec/mo_bi_integrals.irp.f
+++ b/src/Integrals_Bielec/mo_bi_integrals.irp.f
@@ -467,6 +467,31 @@ END_PROVIDER
    enddo
  enddo
   
+END_PROVIDER
+
+ BEGIN_PROVIDER [ double precision, mo_bielec_integral_mipi, (mo_tot_num_align,mo_tot_num,mo_tot_num) ]
+&BEGIN_PROVIDER [ double precision, mo_bielec_integral_mipi_anti, (mo_tot_num_align,mo_tot_num,mo_tot_num) ]
+  implicit none
+  BEGIN_DOC
+  ! <mi|pi> and <mi|pi> - <mi|ip>. Indices are (i,m,p)
+  END_DOC
+  
+  integer                        :: m,i,p
+  double precision               :: get_mo_bielec_integral
+  
+  PROVIDE mo_bielec_integrals_in_map
+
+  mo_bielec_integral_mipi = 0.d0
+  mo_bielec_integral_mipi_anti = 0.d0
+  do p=1,mo_tot_num
+   do m=1,mo_tot_num
+    do i=1,mo_tot_num
+      mo_bielec_integral_mipi(i,m,p) = get_mo_bielec_integral(m,i,p,i,mo_integrals_map)
+      mo_bielec_integral_mipi_anti(i,m,p) = mo_bielec_integral_mipi(i,m,p) - get_mo_bielec_integral(m,i,i,p,mo_integrals_map)
+    enddo
+   enddo
+ enddo
+  
 END_PROVIDER
 
 BEGIN_PROVIDER [ double precision, mo_bielec_integral_schwartz,(mo_tot_num,mo_tot_num)  ]