diff --git a/config/ifort_avx.cfg b/config/ifort_avx.cfg
index 7a13348c..d3fcd1f0 100644
--- a/config/ifort_avx.cfg
+++ b/config/ifort_avx.cfg
@@ -32,7 +32,7 @@ OPENMP  : 1          ; Append OpenMP flags
 #
 [OPT]
 FC       : -traceback
-FCFLAGS  : -march=corei7-avx -O2 -ip -ftz -g 
+FCFLAGS  : -xAVX -O2 -ip -ftz -g 
 
 # Profiling flags
 #################
diff --git a/src/cipsi/selection.irp.f b/src/cipsi/selection.irp.f
index 3585940e..6803cd73 100644
--- a/src/cipsi/selection.irp.f
+++ b/src/cipsi/selection.irp.f
@@ -306,9 +306,10 @@ subroutine select_singles_and_doubles(i_generator,hole_mask,particle_mask,fock_d
       i = psi_bilinear_matrix_rows(l_a)
       if (nt + exc_degree(i) <= 4) then
         idx = psi_det_sorted_order(psi_bilinear_matrix_order(l_a))
-        if (psi_average_norm_contrib_sorted(idx) < 1.d-12) cycle
-        indices(k) = idx
-        k=k+1
+        if (psi_average_norm_contrib_sorted(idx) > 1.d-12) then
+          indices(k) = idx
+          k=k+1
+        endif
       endif
     enddo
   enddo
@@ -329,9 +330,10 @@ subroutine select_singles_and_doubles(i_generator,hole_mask,particle_mask,fock_d
         idx = psi_det_sorted_order(                                  &
             psi_bilinear_matrix_order(                               &
             psi_bilinear_matrix_transp_order(l_a)))
-        if (psi_average_norm_contrib_sorted(idx) < 1.d-12) cycle
-        indices(k) = idx
-        k=k+1
+        if (psi_average_norm_contrib_sorted(idx) > 1.d-12) then
+          indices(k) = idx
+          k=k+1
+        endif
       endif
     enddo
   enddo
@@ -440,19 +442,20 @@ subroutine select_singles_and_doubles(i_generator,hole_mask,particle_mask,fock_d
       fullinteresting(0) = 0
       
       do ii=1,preinteresting(0)
+        i = preinteresting(ii) 
         select case (N_int)
           case (1)
-            mobMask(1,1) = iand(negMask(1,1), psi_det_sorted(1,1,preinteresting(ii)))
-            mobMask(1,2) = iand(negMask(1,2), psi_det_sorted(1,2,preinteresting(ii)))
+            mobMask(1,1) = iand(negMask(1,1), psi_det_sorted(1,1,i))
+            mobMask(1,2) = iand(negMask(1,2), psi_det_sorted(1,2,i))
             nt = popcnt(mobMask(1, 1)) + popcnt(mobMask(1, 2))
           case (2)
-            mobMask(1:2,1) = iand(negMask(1:2,1), psi_det_sorted(1:2,1,preinteresting(ii)))
-            mobMask(1:2,2) = iand(negMask(1:2,2), psi_det_sorted(1:2,2,preinteresting(ii)))
+            mobMask(1:2,1) = iand(negMask(1:2,1), psi_det_sorted(1:2,1,i))
+            mobMask(1:2,2) = iand(negMask(1:2,2), psi_det_sorted(1:2,2,i))
             nt = popcnt(mobMask(1, 1)) + popcnt(mobMask(1, 2)) +     &
                 popcnt(mobMask(2, 1)) + popcnt(mobMask(2, 2))
           case (3)
-            mobMask(1:3,1) = iand(negMask(1:3,1), psi_det_sorted(1:3,1,preinteresting(ii)))
-            mobMask(1:3,2) = iand(negMask(1:3,2), psi_det_sorted(1:3,2,preinteresting(ii)))
+            mobMask(1:3,1) = iand(negMask(1:3,1), psi_det_sorted(1:3,1,i))
+            mobMask(1:3,2) = iand(negMask(1:3,2), psi_det_sorted(1:3,2,i))
             nt = 0
             do j=3,1,-1
               if (mobMask(j,1) /= 0_bit_kind) then
@@ -465,8 +468,8 @@ subroutine select_singles_and_doubles(i_generator,hole_mask,particle_mask,fock_d
               endif
             end do
           case (4)
-            mobMask(1:4,1) = iand(negMask(1:4,1), psi_det_sorted(1:4,1,preinteresting(ii)))
-            mobMask(1:4,2) = iand(negMask(1:4,2), psi_det_sorted(1:4,2,preinteresting(ii)))
+            mobMask(1:4,1) = iand(negMask(1:4,1), psi_det_sorted(1:4,1,i))
+            mobMask(1:4,2) = iand(negMask(1:4,2), psi_det_sorted(1:4,2,i))
             nt = 0
             do j=4,1,-1
               if (mobMask(j,1) /= 0_bit_kind) then
@@ -479,8 +482,8 @@ subroutine select_singles_and_doubles(i_generator,hole_mask,particle_mask,fock_d
               endif
             end do
           case default
-            mobMask(1:N_int,1) = iand(negMask(1:N_int,1), psi_det_sorted(1:N_int,1,preinteresting(ii)))
-            mobMask(1:N_int,2) = iand(negMask(1:N_int,2), psi_det_sorted(1:N_int,2,preinteresting(ii)))
+            mobMask(1:N_int,1) = iand(negMask(1:N_int,1), psi_det_sorted(1:N_int,1,i))
+            mobMask(1:N_int,2) = iand(negMask(1:N_int,2), psi_det_sorted(1:N_int,2,i))
             nt = 0
             do j=N_int,1,-1
               if (mobMask(j,1) /= 0_bit_kind) then
@@ -495,7 +498,6 @@ subroutine select_singles_and_doubles(i_generator,hole_mask,particle_mask,fock_d
         end select
         
         if(nt <= 4) then
-          i = preinteresting(ii)
           sze = interesting(0) 
           if (sze+1 == size(interesting)) then
             allocate (tmp_array(0:sze))
@@ -556,13 +558,14 @@ subroutine select_singles_and_doubles(i_generator,hole_mask,particle_mask,fock_d
       allocate (fullminilist (N_int, 2, fullinteresting(0)), &
                     minilist (N_int, 2,     interesting(0)) )
       if(pert_2rdm)then
-       allocate(coef_fullminilist_rev(N_states,fullinteresting(0))) 
-       do i=1,fullinteresting(0)
-        do j = 1, N_states
-          coef_fullminilist_rev(j,i) = psi_coef_sorted(fullinteresting(i),j)
+        allocate(coef_fullminilist_rev(N_states,fullinteresting(0))) 
+        do i=1,fullinteresting(0)
+          do j = 1, N_states
+            coef_fullminilist_rev(j,i) = psi_coef_sorted(fullinteresting(i),j)
+          enddo
         enddo
-       enddo
       endif
+
       do i=1,fullinteresting(0)
         fullminilist(1:N_int,1:2,i) = psi_det_sorted(1:N_int,1:2,fullinteresting(i))
       enddo
@@ -625,7 +628,7 @@ subroutine select_singles_and_doubles(i_generator,hole_mask,particle_mask,fock_d
       enddo
       deallocate(fullminilist,minilist)
       if(pert_2rdm)then
-       deallocate(coef_fullminilist_rev)
+        deallocate(coef_fullminilist_rev)
       endif
     enddo
   enddo
@@ -707,8 +710,8 @@ subroutine fill_buffer_double(i_generator, sp, h1, h2, bannedOrb, banned, fock_d
       if(bannedOrb(p2, s2)) cycle
       if(banned(p1,p2)) cycle
 
-
-      if( sum(abs(mat(1:N_states, p1, p2))) == 0d0) cycle
+      val = maxval(abs(mat(1:N_states, p1, p2)))
+      if( val == 0d0) cycle
       call apply_particles(mask, s1, p1, s2, p2, det, ok, N_int)
 
       if (do_only_cas) then
@@ -946,6 +949,9 @@ subroutine get_d2(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
     if(tip == 3) then
       puti = p(1, mi)
       if(bannedOrb(puti, mi)) return
+      h1 = h(1, ma)
+      h2 = h(2, ma)
+
       do i = 1, 3
         putj = p(i, ma)
         if(banned(putj,puti,bant)) cycle
@@ -953,17 +959,21 @@ subroutine get_d2(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
         i2 = turn3(2,i)
         p1 = p(i1, ma)
         p2 = p(i2, ma)
-        h1 = h(1, ma)
-        h2 = h(2, ma)
 
-        hij = (mo_two_e_integral(p1, p2, h1, h2) - mo_two_e_integral(p2,p1, h1, h2)) * get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2, N_int)
+        hij = mo_two_e_integral(p1, p2, h1, h2) - mo_two_e_integral(p2, p1, h1, h2)
+        if (hij == 0.d0) cycle
+
+        hij = hij * get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2, N_int)
+
         if(ma == 1) then
+          !DIR$ LOOP COUNT AVG(4)
           do k=1,N_states
-            mat(k, putj, puti) = mat(k, putj, puti) +coefs(k) * hij
+            mat(k, putj, puti) = mat(k, putj, puti) + coefs(k) * hij
           enddo
         else
+          !DIR$ LOOP COUNT AVG(4)
           do k=1,N_states
-            mat(k, puti, putj) = mat(k, puti, putj) +coefs(k) * hij
+            mat(k, puti, putj) = mat(k, puti, putj) + coefs(k) * hij
           enddo
         end if
       end do
@@ -980,10 +990,14 @@ subroutine get_d2(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
           if(banned(puti,putj,bant) .or. bannedOrb(puti,1)) cycle
           p1 = p(turn2(i), 1)
 
-          hij = mo_two_e_integral(p1, p2, h1, h2) * get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2, N_int)
-          do k=1,N_states
-            mat(k, puti, putj) = mat(k, puti, putj) +coefs(k) * hij
-          enddo
+          hij = mo_two_e_integral(p1, p2, h1, h2)
+          if (hij /= 0.d0) then
+            hij = hij * get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2, N_int)
+            !DIR$ LOOP COUNT AVG(4)
+            do k=1,N_states
+              mat(k, puti, putj) = mat(k, puti, putj) + coefs(k) * hij
+            enddo
+          endif
         end do
       end do
     end if
@@ -993,22 +1007,26 @@ subroutine get_d2(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
       h1 = h(1, ma)
       h2 = h(2, ma)
       do i=1,3
-      puti = p(i, ma)
-      if(bannedOrb(puti,ma)) cycle
-      do j=i+1,4
-        putj = p(j, ma)
-        if(bannedOrb(putj,ma)) cycle
-        if(banned(puti,putj,1)) cycle
+        puti = p(i, ma)
+        if(bannedOrb(puti,ma)) cycle
+        do j=i+1,4
+          putj = p(j, ma)
+          if(bannedOrb(putj,ma)) cycle
+          if(banned(puti,putj,1)) cycle
 
-        i1 = turn2d(1, i, j)
-        i2 = turn2d(2, i, j)
-        p1 = p(i1, ma)
-        p2 = p(i2, ma)
-        hij = (mo_two_e_integral(p1, p2, h1, h2) - mo_two_e_integral(p2,p1, h1, h2)) * get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2, N_int)
-        do k=1,N_states
-          mat(k, puti, putj) = mat(k, puti, putj) +coefs(k) * hij
-        enddo
-      end do
+          i1 = turn2d(1, i, j)
+          i2 = turn2d(2, i, j)
+          p1 = p(i1, ma)
+          p2 = p(i2, ma)
+          hij = mo_two_e_integral(p1, p2, h1, h2) - mo_two_e_integral(p2,p1, h1, h2)
+          if (hij == 0.d0) cycle
+
+          hij = hij * get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2, N_int)
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat(k, puti, putj) = mat(k, puti, putj) +coefs(k) * hij
+          enddo
+        end do
       end do
     else if(tip == 3) then
       h1 = h(1, mi)
@@ -1022,10 +1040,21 @@ subroutine get_d2(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
         if(banned(puti,putj,1)) cycle
         p2 = p(i, ma)
 
-        hij = mo_two_e_integral(p1, p2, h1, h2) * get_phase_bi(phasemask, mi, ma, h1, p1, h2, p2, N_int)
-        do k=1,N_states
-          mat(k, min(puti, putj), max(puti, putj)) = mat(k, min(puti, putj), max(puti, putj)) + coefs(k) * hij
-        enddo
+        hij = mo_two_e_integral(p1, p2, h1, h2)
+        if (hij == 0.d0) cycle
+
+        hij = hij * get_phase_bi(phasemask, mi, ma, h1, p1, h2, p2, N_int)
+        if (puti < putj) then
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat(k, puti, putj) = mat(k, puti, putj) + coefs(k) * hij
+          enddo
+        else
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat(k, putj, puti) = mat(k, putj, puti) + coefs(k) * hij
+          enddo
+        endif
       end do
     else ! tip == 4
       puti = p(1, sp)
@@ -1035,10 +1064,14 @@ subroutine get_d2(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
         p2 = p(2, mi)
         h1 = h(1, mi)
         h2 = h(2, mi)
-        hij = (mo_two_e_integral(p1, p2, h1, h2) - mo_two_e_integral(p2,p1, h1, h2)) * get_phase_bi(phasemask, mi, mi, h1, p1, h2, p2, N_int)
-        do k=1,N_states
-          mat(k, puti, putj) = mat(k, puti, putj) +coefs(k) * hij
-        enddo
+        hij = (mo_two_e_integral(p1, p2, h1, h2) - mo_two_e_integral(p2,p1, h1, h2))
+        if (hij /= 0.d0) then
+          hij = hij * get_phase_bi(phasemask, mi, mi, h1, p1, h2, p2, N_int)
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat(k, puti, putj) = mat(k, puti, putj) + coefs(k) * hij
+          enddo
+        end if
       end if
     end if
   end if
@@ -1061,7 +1094,7 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
 
   logical, allocatable           :: lbanned(:,:)
   integer                        :: puti, putj, ma, mi, s1, s2, i, i1, i2, j
-  integer                        :: hfix, pfix, h1, h2, p1, p2, ib, k
+  integer                        :: hfix, pfix, h1, h2, p1, p2, ib, k, l
 
   integer, parameter             :: turn2(2) = (/2,1/)
   integer, parameter             :: turn3(2,3) = reshape((/2,3,  1,3, 1,2/), (/2,3/))
@@ -1105,7 +1138,10 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
         hij = hij_cache(putj,1) - hij_cache(putj,2)
         if (hij /= 0.d0) then
           hij = hij * get_phase_bi(phasemask, ma, ma, putj, p1, hfix, p2, N_int)
-          tmp_row(1:N_states,putj) = tmp_row(1:N_states,putj) + hij * coefs(1:N_states)
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            tmp_row(k,putj) = tmp_row(k,putj) + hij * coefs(k)
+          enddo
         endif
       end do
       do putj=hfix+1, mo_num
@@ -1114,14 +1150,22 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
         hij = hij_cache(putj,2) - hij_cache(putj,1)
         if (hij /= 0.d0) then
           hij = hij * get_phase_bi(phasemask, ma, ma, hfix, p1, putj, p2, N_int)
-          tmp_row(1:N_states,putj) = tmp_row(1:N_states,putj) + hij * coefs(1:N_states)
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            tmp_row(k,putj) = tmp_row(k,putj) + hij * coefs(k)
+          enddo
         endif
       end do
 
       if(ma == 1) then
         mat(1:N_states,1:mo_num,puti) = mat(1:N_states,1:mo_num,puti) + tmp_row(1:N_states,1:mo_num)
       else
-        mat(1:N_states,puti,1:mo_num) = mat(1:N_states,puti,1:mo_num) + tmp_row(1:N_states,1:mo_num)
+        do l=1,mo_num
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat(k,puti,l) = mat(k,puti,l) + tmp_row(k,l)
+          enddo
+        enddo
       end if
     end if
 
@@ -1132,7 +1176,7 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
     call get_mo_two_e_integrals(hfix,pfix,p1,mo_num,hij_cache(1,1),mo_integrals_map)
     call get_mo_two_e_integrals(hfix,pfix,p2,mo_num,hij_cache(1,2),mo_integrals_map)
     putj = p1
-    do puti=1,mo_num
+    do puti=1,mo_num !HOT
       if(lbanned(puti,mi)) cycle
       !p1 fixed
       putj = p1
@@ -1140,13 +1184,16 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
         hij = hij_cache(puti,2)
         if (hij /= 0.d0) then
           hij = hij * get_phase_bi(phasemask, ma, mi, hfix, p2, puti, pfix, N_int)
+          !DIR$ LOOP COUNT AVG(4)
           do k=1,N_states
             tmp_row(k,puti) = tmp_row(k,puti) + hij * coefs(k)
           enddo
         endif
       end if
-      
+!    enddo
+!      
       putj = p2
+!    do puti=1,mo_num !HOT
       if(.not. banned(putj,puti,bant)) then
         hij = hij_cache(puti,1)
         if (hij /= 0.d0) then
@@ -1162,8 +1209,13 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
       mat(:,:,p1) = mat(:,:,p1) + tmp_row(:,:)
       mat(:,:,p2) = mat(:,:,p2) + tmp_row2(:,:)
     else
-      mat(:,p1,:) = mat(:,p1,:) + tmp_row(:,:)
-      mat(:,p2,:) = mat(:,p2,:) + tmp_row2(:,:)
+      do l=1,mo_num
+        !DIR$ LOOP COUNT AVG(4)
+        do k=1,N_states
+          mat(k,p1,l) = mat(k,p1,l) + tmp_row(k,l)
+          mat(k,p2,l) = mat(k,p2,l) + tmp_row2(k,l)
+        enddo
+      enddo
     end if
 
   else  ! sp /= 3
@@ -1197,7 +1249,12 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
         end do
 
         mat(:, :puti-1, puti) = mat(:, :puti-1, puti) + tmp_row(:,:puti-1)
-        mat(:, puti, puti:) = mat(:, puti,puti:) + tmp_row(:,puti:)
+        do l=puti,mo_num
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat(k, puti, l) = mat(k, puti,l) + tmp_row(k,l)
+          enddo
+        enddo
       end do
     else
       hfix = h(1,mi)
@@ -1216,6 +1273,7 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
           hij = hij_cache(puti,1)
           if (hij /= 0.d0) then
             hij = hij * get_phase_bi(phasemask, mi, ma, hfix, pfix, puti, p1, N_int)
+            !DIR$ LOOP COUNT AVG(4)
             do k=1,N_states
               tmp_row(k,puti) = tmp_row(k,puti) + hij * coefs(k)
             enddo
@@ -1234,9 +1292,19 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
         end if
       end do
       mat(:,:p2-1,p2) = mat(:,:p2-1,p2) + tmp_row(:,:p2-1)
-      mat(:,p2,p2:) = mat(:,p2,p2:) + tmp_row(:,p2:)
+      do l=p2,mo_num
+        !DIR$ LOOP COUNT AVG(4)
+        do k=1,N_states
+          mat(k,p2,l) = mat(k,p2,l) + tmp_row(k,l)
+        enddo
+      enddo
       mat(:,:p1-1,p1) = mat(:,:p1-1,p1) + tmp_row2(:,:p1-1)
-      mat(:,p1,p1:) = mat(:,p1,p1:) + tmp_row2(:,p1:)
+      do l=p1,mo_num
+        !DIR$ LOOP COUNT AVG(4)
+        do k=1,N_states
+          mat(k,p1,l) = mat(k,p1,l) + tmp_row2(k,l)
+        enddo
+      enddo
     end if
   end if
   deallocate(lbanned,hij_cache)
@@ -1259,7 +1327,10 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
         if(bannedOrb(p1, s1) .or. bannedOrb(p2, s2) .or. banned(p1, p2, 1)) cycle
         call apply_particles(mask, s1, p1, s2, p2, det, ok, N_int)
         call i_h_j(gen, det, N_int, hij)
-        mat(:, p1, p2) = mat(:, p1, p2) + coefs(:) * hij
+        !DIR$ LOOP COUNT AVG(4)
+        do k=1,N_states
+          mat(k, p1, p2) = mat(k, p1, p2) + coefs(k) * hij
+        enddo
       end do
     end do
 end
@@ -1303,34 +1374,15 @@ subroutine get_d0(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
           call i_h_j(gen, det, N_int, hij)
         else
           phase = get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2, N_int)
-!          hij = mo_two_e_integral(p2, p1, h2, h1) * phase
           hij = hij_cache1(p2) * phase
         end if
         if (hij == 0.d0) cycle
+        !DIR$ LOOP COUNT AVG(4)
         do k=1,N_states
           mat(k, p1, p2) = mat(k, p1, p2) + coefs(k) * hij  ! HOTSPOT
         enddo
       end do
     end do
-!    do p2=1, mo_num
-!      if(bannedOrb(p2,2)) cycle
-!      call get_mo_two_e_integrals(p2,h1,h2,mo_num,hij_cache1,mo_integrals_map)
-!      do p1=1, mo_num
-!        if(bannedOrb(p1, 1) .or. banned(p1, p2, bant)) cycle
-!        if(p1 /= h1 .and. p2 /= h2) then
-!          if (hij_cache1(p1) == 0.d0) cycle
-!          phase = get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2, N_int)
-!          hij = hij_cache1(p1) * phase
-!        else
-!          call apply_particles(mask, 1,p1,2,p2, det, ok, N_int)
-!          call i_h_j(gen, det, N_int, hij)
-!          if (hij == 0.d0) cycle
-!        end if
-!        do k=1,N_states
-!          mat(k, p1, p2) = mat(k, p1, p2) + coefs(k) * hij  ! HOTSPOT
-!        enddo
-!      end do
-!    end do
 
   else ! AA BB
     p1 = p(1,sp)
@@ -1345,31 +1397,16 @@ subroutine get_d0(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
         if(puti == p1 .or. putj == p2 .or. puti == p2 .or. putj == p1) then
           call apply_particles(mask, sp,puti,sp,putj, det, ok, N_int)
           call i_h_j(gen, det, N_int, hij)
+          if (hij == 0.d0) cycle
         else
-          hij = (mo_two_e_integral(p1, p2, puti, putj) -  mo_two_e_integral(p2, p1, puti, putj))* get_phase_bi(phasemask, sp, sp, puti, p1 , putj, p2, N_int)
+          hij = (mo_two_e_integral(p1, p2, puti, putj) -  mo_two_e_integral(p2, p1, puti, putj))
+          if (hij == 0.d0) cycle
+          hij = hij * get_phase_bi(phasemask, sp, sp, puti, p1 , putj, p2, N_int)
         end if
-        if (hij == 0.d0) cycle
+        !DIR$ LOOP COUNT AVG(4)
         do k=1,N_states
           mat(k, puti, putj) = mat(k, puti, putj) + coefs(k) * hij
         enddo
-!        if(bannedOrb(putj, sp) .or. banned(putj, sp, bant)) cycle
-!        if(puti /= p1 .and. putj /= p2 .and. puti /= p2 .and. putj /= p1) then
-!          hij = hij_cache1(putj) -  hij_cache2(putj)
-!          if (hij /= 0.d0) then
-!            hij = hij * get_phase_bi(phasemask, sp, sp, puti, p1 , putj, p2, N_int)
-!            do k=1,N_states
-!              mat(k, puti, putj) = mat(k, puti, putj) + coefs(k) * hij
-!            enddo
-!          endif
-!        else
-!          call apply_particles(mask, sp,puti,sp,putj, det, ok, N_int)
-!          call i_h_j(gen, det, N_int, hij)
-!          if (hij /= 0.d0) then
-!            do k=1,N_states
-!              mat(k, puti, putj) = mat(k, puti, putj) + coefs(k) * hij
-!            enddo
-!          endif
-!        end if
       end do
     end do
   end if
@@ -1403,8 +1440,8 @@ subroutine past_d2(banned, p, sp)
   integer :: i,j
 
   if(sp == 3) then
-    do i=1,p(0,1)
-      do j=1,p(0,2)
+    do j=1,p(0,2)
+      do i=1,p(0,1)
         banned(p(i,1), p(j,2)) = .true.
       end do
     end do
diff --git a/src/davidson/EZFIO.cfg b/src/davidson/EZFIO.cfg
index 67d7786a..393b25f1 100644
--- a/src/davidson/EZFIO.cfg
+++ b/src/davidson/EZFIO.cfg
@@ -6,7 +6,7 @@ default: 1.e-10
 
 [n_states_diag]
 type: States_number
-doc: Number of states to consider during the Davdison diagonalization
+doc: Controls the number of states to consider during the Davdison diagonalization. The number of states is n_states * n_states_diag
 default: 4
 interface: ezfio,ocaml
 
diff --git a/src/davidson/davidson_parallel.irp.f b/src/davidson/davidson_parallel.irp.f
index 9e4402f6..c0d94b35 100644
--- a/src/davidson/davidson_parallel.irp.f
+++ b/src/davidson/davidson_parallel.irp.f
@@ -428,7 +428,7 @@ subroutine H_S2_u_0_nstates_zmq(v_0,s_0,u_0,N_st,sze)
 
   integer :: istep, imin, imax, ishift, ipos
   integer, external :: add_task_to_taskserver
-  integer, parameter :: tasksize=40000
+  integer, parameter :: tasksize=10000
   character*(100000) :: task
   istep=1
   ishift=0
diff --git a/src/davidson/diagonalization_hs2_dressed.irp.f b/src/davidson/diagonalization_hs2_dressed.irp.f
index ec9a194e..67c466a8 100644
--- a/src/davidson/diagonalization_hs2_dressed.irp.f
+++ b/src/davidson/diagonalization_hs2_dressed.irp.f
@@ -161,7 +161,7 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_
     stop -1
   endif
 
-  itermax = max(3,min(davidson_sze_max, sze/N_st_diag))
+  itermax = max(2,min(davidson_sze_max, sze/N_st_diag))+1
   itertot = 0
 
   if (state_following) then
@@ -322,6 +322,12 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_
     call normalize(u_in(1,k),sze)
   enddo
 
+  do k=1,N_st_diag
+    do i=1,sze
+      U(i,k) = u_in(i,k)
+    enddo
+  enddo
+
 
   do while (.not.converged)
     itertot = itertot+1
@@ -329,30 +335,33 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_
       exit
     endif
 
-    do k=1,N_st_diag
-      do i=1,sze
-        U(i,k) = u_in(i,k)
-      enddo
-    enddo
-
     do iter=1,itermax-1
 
       shift  = N_st_diag*(iter-1)
       shift2 = N_st_diag*iter
 
-      call ortho_qr(U,size(U,1),sze,shift2)
-      call ortho_qr(U,size(U,1),sze,shift2)
+      if ((iter > 1).or.(itertot == 1)) then
+        ! Compute |W_k> = \sum_i |i><i|H|u_k>
+        ! -----------------------------------
 
-      ! Compute |W_k> = \sum_i |i><i|H|u_k>
-      ! -----------------------------------------
+        if (disk_based) then
+          call ortho_qr_unblocked(U,size(U,1),sze,shift2)
+          call ortho_qr_unblocked(U,size(U,1),sze,shift2)
+        else
+          call ortho_qr(U,size(U,1),sze,shift2)
+          call ortho_qr(U,size(U,1),sze,shift2)
+        endif
 
-
-      if ((sze > 100000).and.distributed_davidson) then
-          call H_S2_u_0_nstates_zmq   (W(1,shift+1),S_d,U(1,shift+1),N_st_diag,sze)
+        if ((sze > 100000).and.distributed_davidson) then
+            call H_S2_u_0_nstates_zmq   (W(1,shift+1),S_d,U(1,shift+1),N_st_diag,sze)
+        else
+            call H_S2_u_0_nstates_openmp(W(1,shift+1),S_d,U(1,shift+1),N_st_diag,sze)
+        endif
+        S(1:sze,shift+1:shift+N_st_diag) = real(S_d(1:sze,1:N_st_diag))
       else
-          call H_S2_u_0_nstates_openmp(W(1,shift+1),S_d,U(1,shift+1),N_st_diag,sze)
+         ! Already computed in update below
+         continue
       endif
-      S(1:sze,shift+1:shift+N_st_diag) = real(S_d(1:sze,1:N_st_diag))
 
       if (dressing_state > 0) then
 
@@ -408,7 +417,7 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_
       ! Compute s_kl = <u_k | S_l> = <u_k| S2 |u_l>
       ! -------------------------------------------
 
-       !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i,j,k)
+       !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i,j,k) COLLAPSE(2)
        do j=1,shift2
          do i=1,shift2
            s_(i,j) = 0.d0
@@ -563,6 +572,7 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_
       ! Compute residual vector and davidson step
       ! -----------------------------------------
 
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i,k)
       do k=1,N_st_diag
         do i=1,sze
           U(i,shift2+k) =  &
@@ -577,9 +587,15 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_
           to_print(3,k) = residual_norm(k)
         endif
       enddo
+      !$OMP END PARALLEL DO
 
 
-      write(*,'(1X,I3,1X,100(1X,F16.10,1X,F11.6,1X,E11.3))') iter, to_print(1:3,1:N_st)
+      if ((itertot>1).and.(iter == 1)) then
+        !don't print 
+        continue
+      else
+        write(*,'(1X,I3,1X,100(1X,F16.10,1X,F11.6,1X,E11.3))') iter-1, to_print(1:3,1:N_st)
+      endif
       call davidson_converged(lambda,residual_norm,wall,iter,cpu,N_st,converged)
       do k=1,N_st
         if (residual_norm(k) > 1.e8) then
@@ -600,11 +616,56 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_
 
     enddo
 
-    ! Re-contract to u_in
-    ! -----------
+    ! Re-contract U and update S and W
+    ! --------------------------------
+
+    call sgemm('N','N', sze, N_st_diag, shift2, 1.,      &
+        S, size(S,1), y_s, size(y_s,1), 0., S(1,shift2+1), size(S,1))
+    do k=1,N_st_diag
+      do i=1,sze
+        S(i,k) = S(i,shift2+k)
+      enddo
+    enddo
+
+    call dgemm('N','N', sze, N_st_diag, shift2, 1.d0,      &
+        W, size(W,1), y, size(y,1), 0.d0, u_in, size(u_in,1))
+    do k=1,N_st_diag
+      do i=1,sze
+        W(i,k) = u_in(i,k)
+      enddo
+    enddo
 
     call dgemm('N','N', sze, N_st_diag, shift2, 1.d0,      &
         U, size(U,1), y, size(y,1), 0.d0, u_in, size(u_in,1))
+    do k=1,N_st_diag
+      do i=1,sze
+        U(i,k) = u_in(i,k)
+      enddo
+    enddo
+    if (disk_based) then
+      call ortho_qr_unblocked(U,size(U,1),sze,N_st_diag)
+      call ortho_qr_unblocked(U,size(U,1),sze,N_st_diag)
+    else
+      call ortho_qr(U,size(U,1),sze,N_st_diag)
+      call ortho_qr(U,size(U,1),sze,N_st_diag)
+    endif
+    do j=1,N_st_diag
+      k=1
+      do while ((k<sze).and.(U(k,j) == 0.d0))
+        k = k+1
+      enddo
+      if (U(k,j) * u_in(k,j) < 0.d0) then
+        do i=1,sze
+          W(i,j) = -W(i,j)
+          S(i,j) = -S(i,j)
+        enddo
+      endif
+    enddo
+    do j=1,N_st_diag
+      do i=1,sze
+        S_d(i,j) = dble(S(i,j))
+      enddo
+    enddo
 
   enddo
 
@@ -626,7 +687,7 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_
     call munmap( (/int(sze,8),int(N_st_diag*itermax,8)/), 8, fd_w, ptr_w )
     fd_w = getUnitAndOpen(trim(ezfio_work_dir)//'davidson_w','r')
     close(fd_w,status='delete')
-    call munmap( (/int(sze,8),int(N_st_diag*itermax,8)/), 8, fd_s, ptr_s )
+    call munmap( (/int(sze,8),int(N_st_diag*itermax,8)/), 4, fd_s, ptr_s )
     fd_s = getUnitAndOpen(trim(ezfio_work_dir)//'davidson_s','r')
     close(fd_s,status='delete')
   else
diff --git a/src/davidson/input.irp.f b/src/davidson/input.irp.f
index 64beaa83..aba88ae9 100644
--- a/src/davidson/input.irp.f
+++ b/src/davidson/input.irp.f
@@ -15,7 +15,7 @@ BEGIN_PROVIDER [ integer, n_states_diag  ]
       print *, 'davidson/n_states_diag not found in EZFIO file'
       stop 1
     endif
-    n_states_diag = max(N_states, N_states_diag)
+    n_states_diag = max(2,N_states * N_states_diag)
   endif
   IRP_IF MPI_DEBUG
     print *,  irp_here, mpi_rank
diff --git a/src/davidson/u0_h_u0.irp.f b/src/davidson/u0_h_u0.irp.f
index 726673ce..6117a13e 100644
--- a/src/davidson/u0_h_u0.irp.f
+++ b/src/davidson/u0_h_u0.irp.f
@@ -188,7 +188,7 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_t,s_t,u_t,N_st,sze,istart,iend,
   double precision, intent(out)  :: v_t(N_st,sze), s_t(N_st,sze)
 
   double precision               :: hij, sij
-  integer                        :: i,j,k,l
+  integer                        :: i,j,k,l,kk
   integer                        :: k_a, k_b, l_a, l_b, m_a, m_b
   integer                        :: istate
   integer                        :: krow, kcol, krow_b, kcol_b
@@ -209,6 +209,8 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_t,s_t,u_t,N_st,sze,istart,iend,
   logical                        :: compute_singles
   integer*8                      :: last_found, left, right, right_max
   double precision               :: rss, mem, ratio
+  double precision, allocatable  :: utl(:,:)
+  integer, parameter             :: block_size=128
 
 !  call resident_memory(rss)
 !  mem = dble(singles_beta_csc_size) / 1024.d0**3
@@ -247,7 +249,7 @@ compute_singles=.True.
       !$OMP          singles_alpha_csc,singles_alpha_csc_idx,        &
       !$OMP          singles_beta_csc,singles_beta_csc_idx)          &
       !$OMP   PRIVATE(krow, kcol, tmp_det, spindet, k_a, k_b, i,     &
-      !$OMP          lcol, lrow, l_a, l_b,                           &
+      !$OMP          lcol, lrow, l_a, l_b, utl, kk,                  &
       !$OMP          buffer, doubles, n_doubles,                     &
       !$OMP          tmp_det2, hij, sij, idx, l, kcol_prev,          &
       !$OMP          singles_a, n_singles_a, singles_b, ratio,       &
@@ -260,7 +262,7 @@ compute_singles=.True.
       singles_a(maxab),                                              &
       singles_b(maxab),                                              &
       doubles(maxab),                                                &
-      idx(maxab))
+      idx(maxab), utl(N_st,block_size))
 
   kcol_prev=-1
 
@@ -398,20 +400,32 @@ compute_singles=.True.
       ! -----------------------
 
       !DIR$ LOOP COUNT avg(1000)
-      do k = 1,n_singles_a
-        l_a = singles_a(k)
-        ASSERT (l_a <= N_det)
+      do k = 1,n_singles_a,block_size
+        ! Prefetch u_t(:,l_a)
+        do kk=0,block_size-1
+          if (k+kk > n_singles_a) exit
+          l_a = singles_a(k+kk)
+          ASSERT (l_a <= N_det)
 
-        lrow = psi_bilinear_matrix_rows(l_a)
-        ASSERT (lrow <= N_det_alpha_unique)
+          do l=1,N_st
+            utl(l,kk+1) = u_t(l,l_a)
+          enddo
+        enddo
 
-        tmp_det2(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, lrow)
-        call i_H_j_double_alpha_beta(tmp_det,tmp_det2,$N_int,hij)
-        call get_s2(tmp_det,tmp_det2,$N_int,sij)
-        !DIR$ LOOP COUNT AVG(4)
-        do l=1,N_st
-          v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,l_a)
-          s_t(l,k_a) = s_t(l,k_a) + sij * u_t(l,l_a)
+        do kk=0,block_size-1
+          if (k+kk > n_singles_a) exit
+          l_a = singles_a(k+kk)
+          lrow = psi_bilinear_matrix_rows(l_a)
+          ASSERT (lrow <= N_det_alpha_unique)
+
+          tmp_det2(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, lrow)
+          call i_H_j_double_alpha_beta(tmp_det,tmp_det2,$N_int,hij)
+          call get_s2(tmp_det,tmp_det2,$N_int,sij)
+          !DIR$ LOOP COUNT AVG(4)
+          do l=1,N_st
+            v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1)
+            s_t(l,k_a) = s_t(l,k_a) + sij * utl(l,kk+1)
+          enddo
         enddo
       enddo
 
@@ -475,20 +489,32 @@ compute_singles=.True.
 
     tmp_det2(1:$N_int,2) = psi_det_beta_unique (1:$N_int, kcol)
     !DIR$ LOOP COUNT avg(1000)
-    do i=1,n_singles_a
-      l_a = singles_a(i)
-      ASSERT (l_a <= N_det)
+    do i=1,n_singles_a,block_size
+      ! Prefetch u_t(:,l_a)
+      do kk=0,block_size-1
+        if (i+kk > n_singles_a) exit
+        l_a = singles_a(i+kk)
+        ASSERT (l_a <= N_det)
 
-      lrow = psi_bilinear_matrix_rows(l_a)
-      ASSERT (lrow <= N_det_alpha_unique)
+        do l=1,N_st
+          utl(l,kk+1) = u_t(l,l_a)
+        enddo
+      enddo
 
-      tmp_det2(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, lrow)
-      call i_h_j_single_spin( tmp_det, tmp_det2, $N_int, 1, hij)
+      do kk=0,block_size-1
+        if (i+kk > n_singles_a) exit
+        l_a = singles_a(i+kk)
+        lrow = psi_bilinear_matrix_rows(l_a)
+        ASSERT (lrow <= N_det_alpha_unique)
 
-      !DIR$ LOOP COUNT AVG(4)
-      do l=1,N_st
-        v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,l_a)
-        ! single => sij = 0
+        tmp_det2(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, lrow)
+        call i_h_j_single_spin( tmp_det, tmp_det2, $N_int, 1, hij)
+
+        !DIR$ LOOP COUNT AVG(4)
+        do l=1,N_st
+          v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1)
+          ! single => sij = 0
+        enddo
       enddo
     enddo
 
@@ -497,18 +523,30 @@ compute_singles=.True.
     ! ----------------------------------
 
     !DIR$ LOOP COUNT avg(50000)
-    do i=1,n_doubles
-      l_a = doubles(i)
-      ASSERT (l_a <= N_det)
+    do i=1,n_doubles,block_size
+      ! Prefetch u_t(:,l_a)
+      do kk=0,block_size-1
+        if (i+kk > n_doubles) exit
+        l_a = doubles(i+kk)
+        ASSERT (l_a <= N_det)
 
-      lrow = psi_bilinear_matrix_rows(l_a)
-      ASSERT (lrow <= N_det_alpha_unique)
+        do l=1,N_st
+          utl(l,kk+1) = u_t(l,l_a)
+        enddo
+      enddo
 
-      call i_H_j_double_spin( tmp_det(1,1), psi_det_alpha_unique(1, lrow), $N_int, hij)
-      !DIR$ LOOP COUNT AVG(4)
-      do l=1,N_st
-        v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,l_a)
-        ! same spin => sij = 0
+      do kk=0,block_size-1
+        if (i+kk > n_doubles) exit
+        l_a = doubles(i+kk)
+        lrow = psi_bilinear_matrix_rows(l_a)
+        ASSERT (lrow <= N_det_alpha_unique)
+
+        call i_H_j_double_spin( tmp_det(1,1), psi_det_alpha_unique(1, lrow), $N_int, hij)
+        !DIR$ LOOP COUNT AVG(4)
+        do l=1,N_st
+          v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1)
+          ! same spin => sij = 0
+        enddo
       enddo
     enddo
 
@@ -560,21 +598,34 @@ compute_singles=.True.
 
     tmp_det2(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, krow)
     !DIR$ LOOP COUNT avg(1000)
-    do i=1,n_singles_b
-      l_b = singles_b(i)
-      ASSERT (l_b <= N_det)
+    do i=1,n_singles_b,block_size
+      do kk=0,block_size-1
+        if (i+kk > n_singles_b) exit
+        l_b = singles_b(i+kk)
+        ASSERT (l_b <= N_det)
 
-      lcol = psi_bilinear_matrix_transp_columns(l_b)
-      ASSERT (lcol <= N_det_beta_unique)
+        l_a = psi_bilinear_matrix_transp_order(l_b)
+        ASSERT (l_a <= N_det)
 
-      tmp_det2(1:$N_int,2) = psi_det_beta_unique (1:$N_int, lcol)
-      call i_h_j_single_spin( tmp_det, tmp_det2, $N_int, 2, hij)
-      l_a = psi_bilinear_matrix_transp_order(l_b)
-      ASSERT (l_a <= N_det)
-      !DIR$ LOOP COUNT AVG(4)
-      do l=1,N_st
-        v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,l_a)
-        ! single => sij = 0
+        do l=1,N_st
+          utl(l,kk+1) = u_t(l,l_a)
+        enddo
+      enddo
+
+      do kk=0,block_size-1
+        if (i+kk > n_singles_b) exit
+        l_b = singles_b(i+kk)
+        l_a = psi_bilinear_matrix_transp_order(l_b)
+        lcol = psi_bilinear_matrix_transp_columns(l_b)
+        ASSERT (lcol <= N_det_beta_unique)
+
+        tmp_det2(1:$N_int,2) = psi_det_beta_unique (1:$N_int, lcol)
+        call i_h_j_single_spin( tmp_det, tmp_det2, $N_int, 2, hij)
+        !DIR$ LOOP COUNT AVG(4)
+        do l=1,N_st
+          v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1)
+          ! single => sij = 0
+        enddo
       enddo
     enddo
 
@@ -582,21 +633,33 @@ compute_singles=.True.
     ! ----------------------------------
 
     !DIR$ LOOP COUNT avg(50000)
-    do i=1,n_doubles
-      l_b = doubles(i)
-      ASSERT (l_b <= N_det)
+    do i=1,n_doubles,block_size
+      do kk=0,block_size-1
+        if (i+kk > n_doubles) exit
+        l_b = doubles(i+kk)
+        ASSERT (l_b <= N_det)
+        l_a = psi_bilinear_matrix_transp_order(l_b)
+        ASSERT (l_a <= N_det)
 
-      lcol = psi_bilinear_matrix_transp_columns(l_b)
-      ASSERT (lcol <= N_det_beta_unique)
+        do l=1,N_st
+          utl(l,kk+1) = u_t(l,l_a)
+        enddo
+      enddo
 
-      call i_H_j_double_spin( tmp_det(1,2), psi_det_beta_unique(1, lcol), $N_int, hij)
-      l_a = psi_bilinear_matrix_transp_order(l_b)
-      ASSERT (l_a <= N_det)
+      do kk=0,block_size-1
+        if (i+kk > n_doubles) exit
+        l_b = doubles(i+kk)
+        l_a = psi_bilinear_matrix_transp_order(l_b)
+        lcol = psi_bilinear_matrix_transp_columns(l_b)
+        ASSERT (lcol <= N_det_beta_unique)
 
-      !DIR$ LOOP COUNT AVG(4)
-      do l=1,N_st
-        v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,l_a)
-        ! same spin => sij = 0
+        call i_H_j_double_spin( tmp_det(1,2), psi_det_beta_unique(1, lcol), $N_int, hij)
+
+        !DIR$ LOOP COUNT AVG(4)
+        do l=1,N_st
+          v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1)
+          ! same spin => sij = 0
+        enddo
       enddo
     enddo
 
@@ -629,7 +692,7 @@ compute_singles=.True.
 
   end do
   !$OMP END DO
-  deallocate(buffer, singles_a, singles_b, doubles, idx)
+  deallocate(buffer, singles_a, singles_b, doubles, idx, utl)
   !$OMP END PARALLEL
 
 end
diff --git a/src/determinants/single_excitations.irp.f b/src/determinants/single_excitations.irp.f
index 03ff0f7e..ccfeaa2e 100644
--- a/src/determinants/single_excitations.irp.f
+++ b/src/determinants/single_excitations.irp.f
@@ -108,6 +108,11 @@ subroutine get_single_excitation_from_fock(det_1,det_2,h,p,spin,phase,hij)
  integer :: occ_partcl(N_int*bit_kind_size,2)
  integer :: n_occ_ab_hole(2),n_occ_ab_partcl(2)
  integer :: i0,i
+ double precision :: buffer_c(mo_num),buffer_x(mo_num)
+ do i=1, mo_num
+   buffer_c(i) = big_array_coulomb_integrals(i,h,p)
+   buffer_x(i) = big_array_exchange_integrals(i,h,p)
+ enddo
  do i = 1, N_int
   differences(i,1) = xor(det_1(i,1),ref_closed_shell_bitmask(i,1))
   differences(i,2) = xor(det_1(i,2),ref_closed_shell_bitmask(i,2))
@@ -122,33 +127,33 @@ subroutine get_single_excitation_from_fock(det_1,det_2,h,p,spin,phase,hij)
  ! holes :: direct terms
  do i0 = 1, n_occ_ab_hole(1)
   i = occ_hole(i0,1)
-  hij -= big_array_coulomb_integrals(i,h,p)
+  hij -= buffer_c(i)
  enddo
  do i0 = 1, n_occ_ab_hole(2)
   i = occ_hole(i0,2)
-  hij -= big_array_coulomb_integrals(i,h,p)
+  hij -= buffer_c(i)
  enddo
 
  ! holes :: exchange terms
  do i0 = 1, n_occ_ab_hole(spin)
   i = occ_hole(i0,spin)
-  hij += big_array_exchange_integrals(i,h,p)
+  hij += buffer_x(i)
  enddo
 
  ! particles :: direct terms
  do i0 = 1, n_occ_ab_partcl(1)
   i = occ_partcl(i0,1)
-  hij += big_array_coulomb_integrals(i,h,p)
+  hij += buffer_c(i)
  enddo
  do i0 = 1, n_occ_ab_partcl(2)
   i = occ_partcl(i0,2)
-  hij += big_array_coulomb_integrals(i,h,p)
+  hij += buffer_c(i)
  enddo
 
  ! particles :: exchange terms
  do i0 = 1, n_occ_ab_partcl(spin)
   i = occ_partcl(i0,spin)
-  hij -= big_array_exchange_integrals(i,h,p)
+  hij -= buffer_x(i)
  enddo
  hij = hij * phase
 
diff --git a/src/utils/linear_algebra.irp.f b/src/utils/linear_algebra.irp.f
index 357f169a..615f67d0 100644
--- a/src/utils/linear_algebra.irp.f
+++ b/src/utils/linear_algebra.irp.f
@@ -138,18 +138,27 @@ subroutine ortho_qr(A,LDA,m,n)
   double precision, intent(inout) :: A(LDA,n)
 
   integer                        :: lwork, info
-  integer, allocatable           :: jpvt(:)
-  double precision, allocatable  :: tau(:), work(:)
+  double precision, allocatable  :: TAU(:), WORK(:)
+
+  allocate (TAU(n), WORK(1))
 
-  allocate (jpvt(n), tau(n), work(1))
   LWORK=-1
   call dgeqrf( m, n, A, LDA, TAU, WORK, LWORK, INFO )
-  LWORK=2*int(WORK(1))
+  LWORK=int(WORK(1))
+
   deallocate(WORK)
   allocate(WORK(LWORK))
   call dgeqrf(m, n, A, LDA, TAU, WORK, LWORK, INFO )
-  call dorgqr(m, n, n, A, LDA, tau, WORK, LWORK, INFO)
-  deallocate(WORK,jpvt,tau)
+
+  LWORK=-1
+  call dorgqr(m, n, n, A, LDA, TAU, WORK, LWORK, INFO)
+  LWORK=int(WORK(1))
+
+  deallocate(WORK)
+  allocate(WORK(LWORK))
+  call dorgqr(m, n, n, A, LDA, TAU, WORK, LWORK, INFO)
+
+  deallocate(WORK,TAU)
 end
 
 subroutine ortho_qr_unblocked(A,LDA,m,n)
@@ -170,13 +179,12 @@ subroutine ortho_qr_unblocked(A,LDA,m,n)
   double precision, intent(inout) :: A(LDA,n)
 
   integer                        :: info
-  integer, allocatable           :: jpvt(:)
-  double precision, allocatable  :: tau(:), work(:)
+  double precision, allocatable  :: TAU(:), WORK(:)
 
-  allocate (jpvt(n), tau(n), work(n))
+  allocate (TAU(n), WORK(n))
   call dgeqr2( m, n, A, LDA, TAU, WORK, INFO )
-  call dorg2r(m, n, n, A, LDA, tau, WORK, INFO)
-  deallocate(WORK,jpvt,tau)
+  call dorg2r(m, n, n, A, LDA, TAU, WORK, INFO)
+  deallocate(WORK,TAU)
 end
 
 subroutine ortho_lowdin(overlap,LDA,N,C,LDC,m)
diff --git a/src/utils/memory.irp.f b/src/utils/memory.irp.f
index 3e077f97..3ea242b0 100644
--- a/src/utils/memory.irp.f
+++ b/src/utils/memory.irp.f
@@ -99,6 +99,7 @@ subroutine check_mem(rss_in,routine)
   rss += rss_in
   if (int(rss)+1 > qp_max_mem) then
     print *,  'Not enough memory: aborting in ', routine
+    print *,  int(rss)+1, ' GB required'
     stop -1
   endif
   !$OMP END CRITICAL