Merge pull request #214 from scemama/master

Bug in MO integrals cache fixed
2025-04-17 22:10:54 +02:00 · 2017-10-20 14:51:26 -05:00 · 2017-10-20 14:51:26 -05:00 · 5006801b43
commit 5006801b43
parent b4f518f44d 80e5819658
44 changed files with 3394 additions and 264 deletions
--- a/install/scripts/install_gpi2.sh
+++ b/install/scripts/install_gpi2.sh
@ -6,9 +6,9 @@ GPI_OPTIONS=--with-ethernet

 function _install()
 {
-  cd gpi2
+  cd _build/gpi2
  ./install.sh -p $QP_ROOT $GPI_OPTIONS
-  cp src/GASPI.f90 $QP_ROOT/src/plugins/GPI2/
+  cp src/GASPI.f90 $QP_ROOT/plugins/GPI2/
  return 0
 }

--- a/plugins/FourIdx/four_index.irp.f
+++ b/plugins/FourIdx/four_index.irp.f
@ -0,0 +1,180 @@
+subroutine four_index_transform(map_a,map_c,matrix_B,LDB,            &
+      i_start, j_start, k_start, l_start,                            &
+      i_end  , j_end  , k_end  , l_end  ,                            &
+      a_start, b_start, c_start, d_start,                            &
+      a_end  , b_end  , c_end  , d_end  )
+  implicit none
+  use map_module
+  use mmap_module
+  BEGIN_DOC
+! Performs a four-index transformation of map_a(N^4) into map_c(M^4) using b(NxM)
+! C_{abcd} = \sum_{ijkl} A_{ijkl}.B_{ia}.B_{jb}.B_{kc}.B_{ld}
+! Loops run over *_start->*_end
+!
+! map_a    : input map, read element by element with map_get
+! map_c    : output map, filled with map_append and sorted with map_sort
+! matrix_B : transformation matrix B, with leading dimension LDB
+!
+! The non-zero elements of map_a are first cached in a memory-mapped scratch
+! file (<ezfio_filename>/work/four_idx). (l,d) pairs with |B_{ld}| < 1.d-10
+! are skipped and results with |C_{abcd}| < 1.d-15 are not stored.
+  END_DOC
+  type(map_type), intent(in)     :: map_a
+  type(map_type), intent(inout)  :: map_c
+  integer, intent(in)            :: LDB
+  double precision, intent(in)   :: matrix_B(LDB,*)
+  integer, intent(in)            :: i_start, j_start, k_start, l_start
+  integer, intent(in)            :: i_end  , j_end  , k_end  , l_end
+  integer, intent(in)            :: a_start, b_start, c_start, d_start
+  integer, intent(in)            :: a_end  , b_end  , c_end  , d_end
+
+  double precision, allocatable  :: T(:,:,:), U(:,:,:), V(:,:,:)
+  integer                        :: i_max, j_max, k_max, l_max
+  integer                        :: i_min, j_min, k_min, l_min
+  integer                        :: i, j, k, l
+  integer                        :: a, b, c, d
+  double precision, external     :: get_ao_bielec_integral
+  integer(key_kind)              :: idx
+  real(integral_kind)            :: tmp
+  integer(key_kind), allocatable :: key(:)
+  real(integral_kind), allocatable :: value(:)
+
+  ASSERT (k_start == i_start)
+  ASSERT (l_start == j_start)
+  ASSERT (a_start == c_start)
+  ASSERT (b_start == d_start)
+
+  i_min = min(i_start,a_start)
+  i_max = max(i_end  ,a_end  )
+  j_min = min(j_start,b_start)
+  j_max = max(j_end  ,b_end  )
+  k_min = min(k_start,c_start)
+  k_max = max(k_end  ,c_end  )
+  l_min = min(l_start,d_start)
+  l_max = max(l_end  ,d_end  )
+
+  ASSERT (0 < i_max)
+  ASSERT (0 < j_max)
+  ASSERT (0 < k_max)
+  ASSERT (0 < l_max)
+  ASSERT (LDB >= i_max)
+  ASSERT (LDB >= j_max)
+  ASSERT (LDB >= k_max)
+  ASSERT (LDB >= l_max)
+
+  ! Create a temporary memory-mapped file
+  integer                        :: fd
+  type(c_ptr)                    :: c_pointer
+  integer*8, pointer             :: a_array(:,:,:)
+  integer*8                      :: four_idx_slots, four_idx_nl
+
+  ! Slot 1 of every l-slice stores the index of the last used slot, so one
+  ! slot more than the number of (i,j,k) triplets is needed when every
+  ! integral of the slice is non-zero. The product is formed in 8-byte
+  ! integers to avoid overflowing the default integer kind.
+  four_idx_slots = int(i_end-i_start+1,8) * int(j_end-j_start+1,8) *  &
+                   int(k_end-k_start+1,8) + 1_8
+  four_idx_nl    = int(l_end-l_start+1,8)
+
+  call mmap(trim(ezfio_filename)//'/work/four_idx',                  &
+      (/ 4_8, four_idx_slots, four_idx_nl /), 8, fd, .False., c_pointer)
+  call c_f_pointer(c_pointer, a_array, (/ 4_8, four_idx_slots, four_idx_nl /))
+
+
+  !$OMP PARALLEL DEFAULT(NONE) SHARED(a_array,c_pointer,fd,          &
+      !$OMP  a_start,a_end,b_start,b_end,c_start,c_end,d_start,d_end,&
+      !$OMP  i_start,i_end,j_start,j_end,k_start,k_end,l_start,l_end,&
+      !$OMP  i_min,i_max,j_min,j_max,k_min,k_max,l_min,l_max,        &
+      !$OMP  map_a,map_c,matrix_B)                                   &
+      !$OMP  PRIVATE(key,value,T,U,V,i,j,k,l,idx,   &
+      !$OMP  a,b,c,d,tmp)
+  allocate( key(i_max*j_max*k_max), value(i_max*j_max*k_max) )
+  allocate( U(a_start:a_end, c_start:c_end, b_start:b_end) )
+
+
+  ! Step 1: for every l, pack the non-zero A_{ijkl} as (i, j, k, value) in
+  ! slots 2.. of a_array(:,:,l). The value is stored as the bit pattern of
+  ! a double precision number inside an 8-byte integer.
+  !$OMP DO SCHEDULE(dynamic,4)
+  do l=l_start,l_end
+    a = 1
+    do j=j_start,j_end
+      do k=k_start,k_end
+        do i=i_start,i_end
+          call bielec_integrals_index(i,j,k,l,idx)
+          call map_get(map_a,idx,tmp)
+          if (tmp /= 0.d0) then
+            a = a+1
+            a_array(1,a,l-l_start+1) = i
+            a_array(2,a,l-l_start+1) = j
+            a_array(3,a,l-l_start+1) = k
+            a_array(4,a,l-l_start+1) = transfer(dble(tmp), 1_8)
+          endif
+        enddo
+      enddo
+    enddo
+    ! Slot 1 keeps the index of the last used slot of this slice
+    a_array(1,1,l-l_start+1) = a
+    print *,  l
+  enddo
+  !$OMP END DO
+
+  ! Step 2: for every d, accumulate U(a,c,b) = C_{abcd} over l
+  !$OMP DO SCHEDULE(dynamic)
+  do d=d_start,d_end
+    U = 0.d0
+    do l=l_start,l_end
+      if (dabs(matrix_B(l,d)) < 1.d-10) then
+        cycle
+      endif
+      print *,  d, l
+
+      allocate( T(i_start:i_end, k_start:k_end, j_start:j_end), &
+                V(a_start:a_end, k_start:k_end, j_start:j_end) )
+
+      ! Unpack the l-slice into the dense array T(i,k,j)
+      T = 0.d0
+      do a=2,a_array(1,1,l-l_start+1)
+        i = a_array(1,a,l-l_start+1)
+        j = a_array(2,a,l-l_start+1)
+        k = a_array(3,a,l-l_start+1)
+        T(i, k,j) = transfer(a_array(4,a,l-l_start+1), 1.d0)
+      enddo
+
+      ! V(a,k,j) = \sum_i B(i,a) T(i,k,j)
+      call DGEMM('T','N', (a_end-a_start+1),                         &
+          (k_end-k_start+1)*(j_end-j_start+1),                       &
+          (i_end-i_start+1), 1.d0,                                   &
+          matrix_B(i_start,a_start), size(matrix_B,1),               &
+          T(i_start,k_start,j_start), size(T,1),  0.d0,              &
+          V(a_start,k_start,j_start), size(V, 1) )
+
+      deallocate(T)
+      ! The next DGEMM writes (b_end-b_start+1) slabs and the loop over b
+      ! below reads all of them: the last dimension must span b_start:b_end.
+      allocate( T(a_start:a_end, k_start:k_end, b_start:b_end) )
+
+      ! T(a,k,b) = \sum_j V(a,k,j) B(j,b)
+      call DGEMM('N','N', (a_end-a_start+1)*(k_end-k_start+1),       &
+              (b_end-b_start+1),                                     &
+              (j_end-j_start+1), 1.d0,                               &
+              V(a_start,k_start,j_start), size(V,1)*size(V,2),       &
+              matrix_B(j_start,b_start), size(matrix_B,1),0.d0,      &
+              T(a_start,k_start,b_start), size(T,1)*size(T,2) )
+
+      deallocate(V)
+
+      ! U(a,c,b) = U(a,c,b) + B(l,d) \sum_k T(a,k,b) B(k,c)
+      do b=b_start,b_end
+        call DGEMM('N','N', (a_end-a_start+1), (c_end-c_start+1),    &
+            (k_end-k_start+1), matrix_B(l, d),                   &
+            T(a_start,k_start,b), size(T,1),                     &
+            matrix_B(k_start,c_start), size(matrix_B,1), 1.d0,   &
+            U(a_start,c_start,b), size(U,1) )
+      enddo
+
+      deallocate(T)
+
+    enddo
+
+    ! Collect the significant elements of this d-slice
+    idx = 0_8
+    do b=b_start,b_end
+      do c=c_start,c_end
+        do a=a_start,a_end
+          if (dabs(U(a,c,b)) < 1.d-15) then
+            cycle
+          endif
+          idx = idx+1_8
+          call bielec_integrals_index(a,b,c,d,key(idx))
+          value(idx) = U(a,c,b)
+        enddo
+      enddo
+    enddo
+
+    ! map_c is shared by all threads: serialize the updates
+    !$OMP CRITICAL
+    call map_append(map_c, key, value, idx) 
+    call map_sort(map_c)
+    !$OMP END CRITICAL
+
+
+  enddo
+  !$OMP END DO
+
+  deallocate(key,value,U)
+  !$OMP END PARALLEL
+
+  call munmap( &
+      (/ 4_8, four_idx_slots, four_idx_nl /), 8, fd, c_pointer)
+
+end
--- a/plugins/FourIdx/four_index_sym.irp.f
+++ b/plugins/FourIdx/four_index_sym.irp.f
@ -0,0 +1,277 @@
+subroutine four_index_transform_sym(map_a,map_c,matrix_B,LDB,            &
+      i_start, j_start, k_start, l_start,                            &
+      i_end  , j_end  , k_end  , l_end  ,                            &
+      a_start, b_start, c_start, d_start,                            &
+      a_end  , b_end  , c_end  , d_end  )
+  implicit none
+  use map_module
+  use mmap_module
+  BEGIN_DOC
+! Performs a four-index transformation of map_a(N^4) into map_c(M^4) using b(NxM)
+! C_{abcd} = \sum_{ijkl} A_{ijkl}.B_{ia}.B_{jb}.B_{kc}.B_{ld}
+! Loops run over *_start->*_end
+!
+! Variant of the transformation which only reads the i <= k triangle of the
+! input and only stores the elements with b <= d and a <= min(b,c).
+!
+! map_a    : input map, read element by element with map_get
+! map_c    : output map, filled with map_append and sorted once at the end
+! matrix_B : transformation matrix B, with leading dimension LDB
+!
+! The non-zero elements of map_a are first packed in a memory-mapped scratch
+! file (<ezfio_filename>/work/four_idx). (l,d) pairs with |B_{ld}| < 1.d-10
+! are skipped and results with |C_{abcd}| < 1.d-15 are not stored.
+  END_DOC
+  type(map_type), intent(in)     :: map_a
+  type(map_type), intent(inout)  :: map_c
+  integer, intent(in)            :: LDB
+  double precision, intent(in)   :: matrix_B(LDB,*)
+  integer, intent(in)            :: i_start, j_start, k_start, l_start
+  integer, intent(in)            :: i_end  , j_end  , k_end  , l_end
+  integer, intent(in)            :: a_start, b_start, c_start, d_start
+  integer, intent(in)            :: a_end  , b_end  , c_end  , d_end
+
+  double precision, allocatable  :: T(:,:), U(:,:,:), V(:,:)
+  double precision, allocatable  :: T2d(:,:), V2d(:,:)
+  integer                        :: i_max, j_max, k_max, l_max
+  integer                        :: i_min, j_min, k_min, l_min
+  integer                        :: i, j, k, l, ik, ll
+  integer                        :: a, b, c, d
+  double precision, external     :: get_ao_bielec_integral
+  integer*8                      :: ii
+  integer(key_kind)              :: idx
+  real(integral_kind)            :: tmp
+  integer(key_kind), allocatable :: key(:)
+  real(integral_kind), allocatable :: value(:)
+  integer*8, allocatable         :: l_pointer(:)
+
+  ASSERT (k_start == i_start)
+  ASSERT (l_start == j_start)
+  ASSERT (a_start == c_start)
+  ASSERT (b_start == d_start)
+
+  i_min = min(i_start,a_start)
+  i_max = max(i_end  ,a_end  )
+  j_min = min(j_start,b_start)
+  j_max = max(j_end  ,b_end  )
+  k_min = min(k_start,c_start)
+  k_max = max(k_end  ,c_end  )
+  l_min = min(l_start,d_start)
+  l_max = max(l_end  ,d_end  )
+
+  ASSERT (0 < i_max)
+  ASSERT (0 < j_max)
+  ASSERT (0 < k_max)
+  ASSERT (0 < l_max)
+  ASSERT (LDB >= i_max)
+  ASSERT (LDB >= j_max)
+  ASSERT (LDB >= k_max)
+  ASSERT (LDB >= l_max)
+
+  ! Create a temporary memory-mapped file
+  integer                        :: fd
+  type(c_ptr)                    :: c_pointer
+  integer*8, pointer             :: a_array(:)
+  call mmap(trim(ezfio_filename)//'/work/four_idx',                  &
+      (/ 12_8 * map_a % n_elements /), 8, fd, .False., c_pointer)
+  call c_f_pointer(c_pointer, a_array, (/ 12_8 * map_a % n_elements /))
+
+  ! Step 1: pack the non-zero A_{ijkl} (i <= k) as consecutive triplets
+  ! (ik, j, bit pattern of the value) in a_array, where ik is the position
+  ! of (i,k) in the packed triangle. l_pointer(l) is the index of the first
+  ! triplet of slice l and l_pointer(l_end+1) the first unused index.
+  allocate(l_pointer(l_start:l_end+1), value((i_max*k_max)) )
+  ii = 1_8
+  !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,j,k,l,ik,idx) 
+  do l=l_start,l_end
+    !$OMP SINGLE
+    l_pointer(l) = ii
+    !$OMP END SINGLE
+    do j=j_start,j_end
+      ! All threads fetch the (i,k) triangle for this (j,l) in the shared
+      ! buffer value(:)
+      !$OMP DO SCHEDULE(static,1)
+      do k=k_start,k_end
+        do i=i_start,k
+          ik = (i-i_start+1) + ishft( (k-k_start)*(k-k_start+1), -1 )
+          call bielec_integrals_index(i,j,k,l,idx)
+          call map_get(map_a,idx,value(ik))
+        enddo
+      enddo
+      !$OMP END DO
+
+      ! A single thread appends the non-zero entries, which keeps the
+      ! triplets ordered by increasing (j, ik) within the slice
+      !$OMP SINGLE
+      ik=0
+      do k=k_start,k_end
+        do i=i_start,k
+          ik = ik+1
+          tmp=value(ik)
+          if (tmp /= 0.d0) then
+            a_array(ii) = ik
+            ii = ii+1_8
+            a_array(ii) = j
+            ii = ii+1_8
+            a_array(ii) = transfer(dble(tmp), 1_8)
+            ii = ii+1_8
+          endif
+        enddo
+      enddo
+      !$OMP END SINGLE
+    enddo
+  enddo
+  !$OMP SINGLE
+  l_pointer(l_end+1) = ii
+  !$OMP END SINGLE
+  !$OMP END PARALLEL  
+  deallocate(value)
+
+!INPUT DATA
+!open(unit=10,file='INPUT',form='UNFORMATTED')
+!write(10) i_start, j_start, i_end, j_end
+!write(10) a_start, b_start, a_end, b_end
+!write(10) LDB, mo_tot_num
+!write(10) matrix_B(1:LDB,1:mo_tot_num)
+!idx=size(a_array)
+!write(10) idx
+!write(10) a_array
+!write(10) l_pointer
+!close(10)
+!open(unit=10,file='OUTPUT',form='FORMATTED')
+! END INPUT DATA
+
+
+  ! Step 2: for every d, accumulate U(a,c,b) = C_{abcd} over l
+  !$OMP PARALLEL DEFAULT(NONE) SHARED(a_array,c_pointer,fd,          &
+      !$OMP  a_start,a_end,b_start,b_end,c_start,c_end,d_start,d_end,&
+      !$OMP  i_start,i_end,j_start,j_end,k_start,k_end,l_start,l_end,&
+      !$OMP  i_min,i_max,j_min,j_max,k_min,k_max,l_min,l_max,        &
+      !$OMP  map_c,matrix_B,l_pointer)                         &
+      !$OMP  PRIVATE(key,value,T,U,V,i,j,k,l,idx,ik,ll,   &
+      !$OMP  a,b,c,d,tmp,T2d,V2d,ii)
+  allocate( key(i_max*j_max*k_max), value(i_max*j_max*k_max) )
+  allocate( U(a_start:a_end, c_start:c_end, b_start:b_end) )
+
+
+
+  allocate( T2d((i_end-i_start+1)*(k_end-k_start+2)/2, j_start:j_end), &
+            V2d((i_end-i_start+1)*(k_end-k_start+2)/2, b_start:b_end), &
+            V(i_start:i_end, k_start:k_end), &
+            T(k_start:k_end, a_start:a_end))
+
+
+  !$OMP DO SCHEDULE(dynamic)
+  do d=d_start,d_end
+    U = 0.d0
+    do l=l_start,l_end
+      if (dabs(matrix_B(l,d)) < 1.d-10) then
+        cycle
+      endif
+      
+      ! Unpack slice l into T2d(ik,j). The triplets are visited in the order
+      ! in which they were written, so a single cursor ii is enough.
+      ii=l_pointer(l)
+      do j=j_start,j_end
+        ik=0
+        do k=k_start,k_end
+          do i=i_start,k
+            ik = ik+1
+            T2d(ik,j) = 0.d0
+            ! a_array(ii) is only read while ii is inside the slice:
+            ! Fortran does not short-circuit logical expressions, so the
+            ! bound has to be tested in a separate statement.
+            if (ii < l_pointer(l+1)) then
+              if ( (ik == a_array(ii)).and.(j == a_array(ii+1_8)) ) then
+                T2d(ik,j) = transfer(a_array(ii+2_8), 1.d0)
+                ii=ii+3_8
+              endif
+            endif
+          enddo
+        enddo
+      enddo
+      ! V2d(ik,b) = \sum_j T2d(ik,j) B(j,b)   for b_start <= b <= d
+      call DGEMM('N','N', ishft( (i_end-i_start+1)*(i_end-i_start+2), -1),&
+          (d-b_start+1),                                             &
+          (j_end-j_start+1), 1.d0,                                   &
+          T2d(1,j_start), size(T2d,1),                               &
+          matrix_B(j_start,b_start), size(matrix_B,1),0.d0,          &
+          V2d(1,b_start), size(V2d,1) )
+
+      do b=b_start,d
+        ! Expand the packed column into the upper triangle of V(i,k)
+        ik = 0
+        do k=k_start,k_end
+          do i=i_start,k
+            ik = ik+1
+            V(i,k) = V2d(ik,b)
+          enddo
+        enddo
+
+!        T = 0.d0
+!        do a=a_start,b
+!          do k=k_start,k_end
+!            do i=i_start,k
+!              T(k,a) = T(k,a) + V(i,k)*matrix_B(i,a)
+!            enddo
+!            do i=k+1,i_end
+!              T(k,a) = T(k,a) + V(k,i)*matrix_B(i,a)
+!            enddo
+!          enddo
+!        enddo
+        call DSYMM('L','U', (k_end-k_start+1), (b-a_start+1),        &
+            1.d0,                                                    &
+            V(i_start,k_start), size(V,1),                           &
+            matrix_B(i_start,a_start), size(matrix_B,1),0.d0,        &
+            T(k_start,a_start), size(T,1) )
+
+!        do c=c_start,b
+!          do a=a_start,c
+!            do k=k_start,k_end
+!              U(a,c,b) = U(a,c,b) + T(k,a)*matrix_B(k,c)*matrix_B(l,d)
+!            enddo
+!          enddo
+!        enddo
+        call DGEMM('T','N', (b-a_start+1), (b-c_start+1),            &
+            (k_end-k_start+1), matrix_B(l, d),                       &
+            T(k_start,a_start), size(T,1),                           &
+            matrix_B(k_start,c_start), size(matrix_B,1), 1.d0,       &
+            U(a_start,c_start,b), size(U,1) )
+!        do c=b+1,c_end
+!          do a=a_start,b
+!            do k=k_start,k_end
+!              U(a,c,b) = U(a,c,b) + T(k,a)*matrix_B(k,c)*matrix_B(l,d)
+!            enddo
+!          enddo
+!        enddo
+        if (b < b_end) then
+          call DGEMM('T','N', (b-a_start+1), (c_end-b),              &
+              (k_end-k_start+1), matrix_B(l, d),                     &
+              T(k_start,a_start), size(T,1),                         &
+              matrix_B(k_start,b+1), size(matrix_B,1), 1.d0,         &
+              U(a_start,b+1,b), size(U,1) )
+        endif
+      enddo
+
+    enddo
+
+    ! Collect the significant elements of this d-slice
+    idx = 0_8
+    do b=b_start,d
+      do c=c_start,c_end
+        do a=a_start,min(b,c)
+          if (dabs(U(a,c,b)) < 1.d-15) then
+            cycle
+          endif
+          idx = idx+1_8
+          call bielec_integrals_index(a,b,c,d,key(idx))
+          value(idx) = U(a,c,b)
+        enddo
+      enddo
+    enddo
+
+    ! map_c is shared by all threads: serialize the updates
+    !$OMP CRITICAL
+    call map_append(map_c, key, value, idx) 
+    !$OMP END CRITICAL
+
+!WRITE OUTPUT
+! OMP CRITICAL
+!print *,  d
+!do b=b_start,d
+!  do c=c_start,c_end
+!    do a=a_start,min(b,c)
+!      if (dabs(U(a,c,b)) < 1.d-15) then
+!        cycle
+!      endif
+!      write(10,*) d,c,b,a,U(a,c,b)
+!    enddo
+!  enddo
+!enddo
+! OMP END CRITICAL
+!END WRITE OUTPUT
+
+
+  enddo
+  !$OMP END DO
+
+  ! Release every thread-private work array
+  deallocate(key,value,V,T,T2d,V2d,U)
+  !$OMP END PARALLEL
+  call map_sort(map_c)
+
+  call munmap( &
+      (/ 12_8 * map_a % n_elements /), 8, fd, c_pointer)
+  deallocate(l_pointer)
+
+end
--- a/plugins/Full_CI_ZMQ/NEEDED_CHILDREN_MODULES
+++ b/plugins/Full_CI_ZMQ/NEEDED_CHILDREN_MODULES
@ -1 +1 @@
-Perturbation Selectors_full Generators_full ZMQ 
+Perturbation Selectors_full Generators_full ZMQ FourIdx
--- a/plugins/Full_CI_ZMQ/pt2_stoch_routines.irp.f
+++ b/plugins/Full_CI_ZMQ/pt2_stoch_routines.irp.f
@ -350,12 +350,12 @@ subroutine get_first_tooth(computed, first_teeth)
 end subroutine


-BEGIN_PROVIDER [ integer, size_tbc ]
+BEGIN_PROVIDER [ integer*8, size_tbc ]
  implicit none
  BEGIN_DOC
 ! Size of the tbc array
  END_DOC
-  size_tbc = (comb_teeth+1)*N_det_generators + fragment_count*fragment_first
+  size_tbc = int((comb_teeth+1),8)*int(N_det_generators,8) + fragment_count*fragment_first
 END_PROVIDER

 subroutine get_carlo_workbatch(computed, comb, Ncomb, tbc)
@ -408,7 +408,8 @@ end subroutine

 subroutine add_comb(comb, computed, tbc, stbc, ct)
  implicit none
-  integer, intent(in) :: stbc, ct
+  integer*8, intent(in) :: stbc
+  integer, intent(in) :: ct
  double precision, intent(in) :: comb
  logical, intent(inout) :: computed(N_det_generators)
  integer, intent(inout) :: tbc(0:stbc)
--- a/plugins/Full_CI_ZMQ/run_selection_slave.irp.f
+++ b/plugins/Full_CI_ZMQ/run_selection_slave.irp.f
@ -57,7 +57,6 @@ subroutine run_selection_slave(thread,iproc,energy)
    endif

    if(done .or. ctask == size(task_id)) then
-      ASSERT (.not.(buf%N == 0 .and. ctask > 0))
      do i=1, ctask
         call task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id(i))
      end do
--- a/plugins/Full_CI_ZMQ/selection.irp.f
+++ b/plugins/Full_CI_ZMQ/selection.irp.f
@ -419,37 +419,82 @@ subroutine select_singles_and_doubles(i_generator,hole_mask,particle_mask,fock_d
      fullinteresting(0) = 0
      
      do ii=1,preinteresting(0)
-        i = preinteresting(ii)
-        mobMask(1,1) = iand(negMask(1,1), preinteresting_det(1,1,ii))
-        mobMask(1,2) = iand(negMask(1,2), preinteresting_det(1,2,ii))
-        nt = popcnt(mobMask(1, 1)) + popcnt(mobMask(1, 2))
-        do j=2,N_int
-          mobMask(j,1) = iand(negMask(j,1), preinteresting_det(j,1,ii))
-          mobMask(j,2) = iand(negMask(j,2), preinteresting_det(j,2,ii))
-          nt = nt+ popcnt(mobMask(j, 1)) + popcnt(mobMask(j, 2))
-        end do
+        select case (N_int)
+          case (1)
+            mobMask(1,1) = iand(negMask(1,1), preinteresting_det(1,1,ii))
+            mobMask(1,2) = iand(negMask(1,2), preinteresting_det(1,2,ii))
+            nt = popcnt(mobMask(1, 1)) + popcnt(mobMask(1, 2))
+          case (2)
+            mobMask(1:2,1) = iand(negMask(1:2,1), preinteresting_det(1:2,1,ii))
+            mobMask(1:2,2) = iand(negMask(1:2,2), preinteresting_det(1:2,2,ii))
+            nt = popcnt(mobMask(1, 1)) + popcnt(mobMask(1, 2)) + &
+                 popcnt(mobMask(2, 1)) + popcnt(mobMask(2, 2)) 
+          case (3)
+            mobMask(1:3,1) = iand(negMask(1:3,1), preinteresting_det(1:3,1,ii))
+            mobMask(1:3,2) = iand(negMask(1:3,2), preinteresting_det(1:3,2,ii))
+            nt = 0
+            do j=3,1,-1
+              if (mobMask(j,1) /= 0_bit_kind) then
+                nt = nt+ popcnt(mobMask(j, 1))
+                if (nt > 4) exit
+              endif
+              if (mobMask(j,2) /= 0_bit_kind) then
+                nt = nt+ popcnt(mobMask(j, 2))
+                if (nt > 4) exit
+              endif
+            end do
+          case (4)
+            mobMask(1:4,1) = iand(negMask(1:4,1), preinteresting_det(1:4,1,ii))
+            mobMask(1:4,2) = iand(negMask(1:4,2), preinteresting_det(1:4,2,ii))
+            nt = 0
+            do j=4,1,-1
+              if (mobMask(j,1) /= 0_bit_kind) then
+                nt = nt+ popcnt(mobMask(j, 1))
+                if (nt > 4) exit
+              endif
+              if (mobMask(j,2) /= 0_bit_kind) then
+                nt = nt+ popcnt(mobMask(j, 2))
+                if (nt > 4) exit
+              endif
+            end do
+          case default
+            mobMask(1:N_int,1) = iand(negMask(1:N_int,1), preinteresting_det(1:N_int,1,ii))
+            mobMask(1:N_int,2) = iand(negMask(1:N_int,2), preinteresting_det(1:N_int,2,ii))
+            nt = 0 
+            do j=N_int,1,-1
+              if (mobMask(j,1) /= 0_bit_kind) then
+                nt = nt+ popcnt(mobMask(j, 1))
+                if (nt > 4) exit
+              endif
+              if (mobMask(j,2) /= 0_bit_kind) then
+                nt = nt+ popcnt(mobMask(j, 2))
+                if (nt > 4) exit
+              endif
+            end do
+        end select
        
-         if(nt <= 4) then
-           interesting(0) += 1
-           interesting(interesting(0)) = i
+        if(nt <= 4) then
+          i = preinteresting(ii)
+          interesting(0) += 1
+          interesting(interesting(0)) = i
          minilist(1,1,interesting(0)) = preinteresting_det(1,1,ii)
          minilist(1,2,interesting(0)) = preinteresting_det(1,2,ii)
-           do j=2,N_int
+          do j=2,N_int
            minilist(j,1,interesting(0)) = preinteresting_det(j,1,ii)
            minilist(j,2,interesting(0)) = preinteresting_det(j,2,ii)
-           enddo
-           if(nt <= 2) then
-             fullinteresting(0) += 1
-             fullinteresting(fullinteresting(0)) = i
+          enddo
+          if(nt <= 2) then
+            fullinteresting(0) += 1
+            fullinteresting(fullinteresting(0)) = i
            fullminilist(1,1,fullinteresting(0)) = preinteresting_det(1,1,ii)
            fullminilist(1,2,fullinteresting(0)) = preinteresting_det(1,2,ii)
-             do j=2,N_int
+            do j=2,N_int
              fullminilist(j,1,fullinteresting(0)) = preinteresting_det(j,1,ii)
              fullminilist(j,2,fullinteresting(0)) = preinteresting_det(j,2,ii)
-             enddo
-           end if
-         end if
-
+            enddo
+          end if
+        end if
+        
      end do
      
      do ii=1,prefullinteresting(0)
@ -458,12 +503,14 @@ subroutine select_singles_and_doubles(i_generator,hole_mask,particle_mask,fock_d
        mobMask(1,1) = iand(negMask(1,1), psi_det_sorted(1,1,i))
        mobMask(1,2) = iand(negMask(1,2), psi_det_sorted(1,2,i))
        nt = popcnt(mobMask(1, 1)) + popcnt(mobMask(1, 2))
-        do j=2,N_int
+        if (nt > 2) cycle
+        do j=N_int,2,-1
          mobMask(j,1) = iand(negMask(j,1), psi_det_sorted(j,1,i))
          mobMask(j,2) = iand(negMask(j,2), psi_det_sorted(j,2,i))
          nt = nt+ popcnt(mobMask(j, 1)) + popcnt(mobMask(j, 2))
+          if (nt > 2) exit
        end do
-
+        
        if(nt <= 2) then
          fullinteresting(0) += 1
          fullinteresting(fullinteresting(0)) = i
--- a/plugins/GPI2/broadcast.irp.f
+++ b/plugins/GPI2/broadcast.irp.f
@ -0,0 +1,254 @@
+subroutine broadcast_wf(energy)
+  implicit none
+  BEGIN_DOC
+  ! Broadcasts the wave function and the energies over GASPI.
+  ! When is_gaspi_master is true the data is published with broadcast_wf_put,
+  ! otherwise it is fetched with broadcast_wf_get. A barrier on
+  ! GASPI_GROUP_ALL follows, then segments 0 to 3 are deleted.
+  END_DOC
+  use bitmasks
+  use GASPI
+  use ISO_C_BINDING
+
+  double precision, intent(inout) :: energy(N_states)
+  integer(gaspi_return_t)        :: res
+  integer(gaspi_segment_id_t)    :: seg_id
+
+  if (.not. is_gaspi_master) then
+    call broadcast_wf_get(energy)
+  else
+    call broadcast_wf_put(energy)
+  endif
+
+  ! Nobody may delete a segment before every rank is done with the transfer
+  res = gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK)
+  if (res /= GASPI_SUCCESS) then
+    write(*,*) "gaspi_barrier failed"
+    stop -1
+  endif
+
+  do seg_id=0,3
+    res = gaspi_segment_delete(seg_id)
+    if (res /= GASPI_SUCCESS) then
+      write(*,*) "gaspi_segment_delete failed", seg_id
+      stop -1
+    endif
+  enddo
+
+end
+
+
+
+
+
+subroutine broadcast_wf_put(energy)
+  implicit none
+  BEGIN_DOC
+  ! Initiates the broadcast of the wave function
+  !
+  ! Segment 0 : integer parameters (N_states, N_det, psi_det_size)
+  ! Segment 1 : copy of psi_coef
+  ! Segment 2 : copy of psi_det
+  ! Segment 3 : copy of energy
+  !
+  ! A barrier on GASPI_GROUP_ALL is passed once segment 0 is filled, and a
+  ! second one once segments 1 to 3 are filled.
+  END_DOC
+  use bitmasks
+  use GASPI
+  use ISO_C_BINDING
+  
+  double precision, intent(in)    :: energy(N_states)
+  integer(gaspi_segment_id_t)    :: seg_id
+  integer(gaspi_alloc_t)         :: seg_alloc_policy
+  integer(gaspi_size_t)          :: seg_size(0:3)
+  type(c_ptr)                    :: seg_ptr(0:3)
+  integer, pointer               :: params_int(:)       ! Segment 0
+  double precision, pointer      :: psi_coef_tmp(:,:)   ! Segment 1
+  integer(bit_kind), pointer     :: psi_det_tmp(:,:,:)  ! Segment 2
+  double precision, pointer      :: params_double(:)    ! Segment 3
+  
+  integer(gaspi_return_t)        :: res
+  
+  
+  seg_alloc_policy = GASPI_MEM_UNINITIALIZED
+
+  ! Room for 5 integers of 4 bytes; only the first 3 are used here
+  seg_size(0) = 4 * 5 
+  seg_id=0
+  res = gaspi_segment_create(seg_id, seg_size(seg_id), GASPI_GROUP_ALL, &
+      GASPI_BLOCK, seg_alloc_policy)
+  if(res .ne. GASPI_SUCCESS) then
+    write(*,*) "gaspi_create_segment failed", gaspi_rank, seg_id
+    stop -1
+  end if
+
+  res = gaspi_segment_ptr(seg_id, seg_ptr(seg_id))
+  if(res .ne. GASPI_SUCCESS) then
+    write(*,*) "gaspi_segment_ptr failed", gaspi_rank
+    stop -1
+  end if
+
+  call c_f_pointer(seg_ptr(0), params_int, shape=(/ 5 /))
+  params_int(1) = N_states
+  params_int(2) = N_det
+  params_int(3) = psi_det_size
+
+  res = gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK)
+  if(res .ne. GASPI_SUCCESS) then
+     write(*,*) "gaspi_barrier failed", gaspi_rank
+     stop -1
+  end if
+
+  ! Byte sizes are computed with the gaspi_size_t kind: the products can
+  ! exceed the range of a default integer for large wave functions.
+  seg_size(1) = int(8,gaspi_size_t) * int(psi_det_size,gaspi_size_t) * &
+                int(N_states,gaspi_size_t)
+  seg_size(2) = int(bit_kind,gaspi_size_t) * int(psi_det_size,gaspi_size_t) * &
+                int(2,gaspi_size_t) * int(N_int,gaspi_size_t)
+  seg_size(3) = int(8,gaspi_size_t) * int(N_states,gaspi_size_t)
+
+  do seg_id=1, 3
+    res = gaspi_segment_create(seg_id, seg_size(seg_id), GASPI_GROUP_ALL, &
+        GASPI_BLOCK, seg_alloc_policy)
+    if(res .ne. GASPI_SUCCESS) then
+      write(*,*) "gaspi_create_segment failed", gaspi_rank, seg_id
+      stop -1
+    end if
+
+    res = gaspi_segment_ptr(seg_id, seg_ptr(seg_id))
+    if(res .ne. GASPI_SUCCESS) then
+      write(*,*) "gaspi_segment_ptr failed", gaspi_rank
+      stop -1
+    end if
+  end do
+
+  call c_f_pointer(seg_ptr(1), psi_coef_tmp, shape=shape(psi_coef))
+  call c_f_pointer(seg_ptr(2), psi_det_tmp, shape=shape(psi_det))
+  call c_f_pointer(seg_ptr(3), params_double, shape=(/ N_states /))
+
+  ! Copy the data to publish into the segments
+  psi_coef_tmp = psi_coef
+  psi_det_tmp  = psi_det
+  params_double = energy
+
+  res = gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK)
+  if(res .ne. GASPI_SUCCESS) then
+     write(*,*) "gaspi_barrier failed", gaspi_rank
+     stop -1
+  end if
+
+end
+
+
+
+
+
+
+
+subroutine broadcast_wf_get(energy)
+  implicit none
+  BEGIN_DOC
+  ! Gets the broadcasted wave function
+  !
+  ! Segment 0 is created and read from rank 0; N_states, N_det and
+  ! psi_det_size are set from its first three integers and TOUCHed.
+  ! Segments 1 to 3 are then created, read from rank 0 and copied into
+  ! psi_coef, psi_det and energy.
+  END_DOC
+  use bitmasks
+  use GASPI
+  use ISO_C_BINDING
+  
+  double precision, intent(out)  :: energy(N_states)
+  integer(gaspi_segment_id_t)    :: seg_id
+  integer(gaspi_alloc_t)         :: seg_alloc_policy
+  integer(gaspi_size_t)          :: seg_size(0:3)
+  type(c_ptr)                    :: seg_ptr(0:3)
+  integer, pointer               :: params_int(:)       ! Segment 0
+  double precision, pointer      :: psi_coef_tmp(:,:)   ! Segment 1
+  integer(bit_kind), pointer     :: psi_det_tmp(:,:,:)  ! Segment 2
+  double precision, pointer      :: params_double(:)    ! Segment 3
+  
+  integer(gaspi_return_t)        :: res
+  integer(gaspi_offset_t)        :: localOff, remoteOff
+  integer(gaspi_rank_t)          :: remoteRank
+  integer(gaspi_queue_id_t)      :: queue
+  
+  
+  seg_alloc_policy = GASPI_MEM_UNINITIALIZED
+  
+  ! Room for 5 integers of 4 bytes; only the first 3 are used here
+  seg_size(0) = 4 * 5
+  seg_id=0
+  res = gaspi_segment_create(seg_id, seg_size(seg_id), GASPI_GROUP_ALL,&
+      GASPI_BLOCK, seg_alloc_policy)
+  if(res .ne. GASPI_SUCCESS) then
+    write(*,*) "gaspi_create_segment failed"
+    stop -1
+  end if
+  
+  res = gaspi_segment_ptr(seg_id, seg_ptr(seg_id))
+  if(res .ne. GASPI_SUCCESS) then
+    write(*,*) "gaspi_segment_ptr failed"
+    stop -1
+  end if
+  
+  res = gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK)
+  if(res .ne. GASPI_SUCCESS) then
+     write(*,*) "gaspi_barrier failed"
+     stop -1
+  end if
+
+  ! Every segment is read in full, from its beginning, from rank 0.
+  ! remoteOff must be set explicitly: it is passed to gaspi_read below.
+  localOff = 0
+  remoteOff = 0
+  remoteRank = 0
+  queue = 0
+  res = gaspi_read(seg_id, localOff, remoteRank,                     &
+      seg_id, remoteOff, seg_size(seg_id), queue, GASPI_BLOCK)
+  if(res .ne. GASPI_SUCCESS) then
+    write(*,*) "gaspi_read failed"
+    stop -1
+  end if
+
+  res = gaspi_wait(queue, GASPI_BLOCK)
+  if(res .ne. GASPI_SUCCESS) then
+    write(*,*) "gaspi_wait failed"
+    stop -1
+  end if
+
+  ! The pointer must have 5 elements. shape( (/ 5 /) ) would be (/ 1 /),
+  ! i.e. a one-element array.
+  call c_f_pointer(seg_ptr(0), params_int, shape=(/ 5 /))
+
+  N_states         = params_int(1)  
+  N_det            = params_int(2)  
+  psi_det_size     = params_int(3)  
+  TOUCH N_states N_det psi_det_size
+
+  ! Byte sizes are computed with the gaspi_size_t kind: the products can
+  ! exceed the range of a default integer for large wave functions.
+  seg_size(1) = int(8,gaspi_size_t) * int(psi_det_size,gaspi_size_t) * &
+                int(N_states,gaspi_size_t)
+  seg_size(2) = int(bit_kind,gaspi_size_t) * int(psi_det_size,gaspi_size_t) * &
+                int(2,gaspi_size_t) * int(N_int,gaspi_size_t)
+  seg_size(3) = int(8,gaspi_size_t) * int(N_states,gaspi_size_t)
+
+  do seg_id=1, 3
+    res = gaspi_segment_create(seg_id, seg_size(seg_id), GASPI_GROUP_ALL, &
+        GASPI_BLOCK, seg_alloc_policy)
+    if(res .ne. GASPI_SUCCESS) then
+      write(*,*) "gaspi_create_segment failed"
+      stop -1
+    end if
+
+    res = gaspi_segment_ptr(seg_id, seg_ptr(seg_id))
+    if(res .ne. GASPI_SUCCESS) then
+      write(*,*) "gaspi_segment_ptr failed"
+      stop -1
+    end if
+  end do
+
+  res = gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK)
+  if(res .ne. GASPI_SUCCESS) then
+     write(*,*) "gaspi_barrier failed"
+     stop -1
+  end if
+
+  do seg_id=1, 3
+    res = gaspi_read(seg_id, localOff, remoteRank,                     &
+        seg_id, remoteOff, seg_size(seg_id), queue, GASPI_BLOCK)
+    if(res .ne. GASPI_SUCCESS) then
+      write(*,*) "gaspi_read failed"
+      stop -1
+    end if
+    res = gaspi_wait(queue, GASPI_BLOCK)
+    if(res .ne. GASPI_SUCCESS) then
+      write(*,*) "gaspi_wait failed"
+      stop -1
+    end if
+  end do
+
+  call c_f_pointer(seg_ptr(1), psi_coef_tmp, shape=shape(psi_coef))
+  call c_f_pointer(seg_ptr(2), psi_det_tmp, shape=shape(psi_det))
+  call c_f_pointer(seg_ptr(3), params_double, shape=shape(energy))
+
+  ! Copy the received data out of the segments
+  psi_coef = psi_coef_tmp
+  psi_det  = psi_det_tmp
+  energy   = params_double
+
+end
+
+
+
+
--- a/plugins/Hartree_Fock/SCF_old.irp.f
+++ b/plugins/Hartree_Fock/SCF_old.irp.f
@ -0,0 +1,61 @@
+program scf
+  implicit none
+  BEGIN_DOC
+! Produce `Hartree_Fock` MO orbital 
+! output: mo_basis.mo_tot_num mo_basis.mo_label mo_basis.ao_md5 mo_basis.mo_coef mo_basis.mo_occ
+! output: hartree_fock.energy
+! optional: mo_basis.mo_coef
+  END_DOC
+  ! Build a guess if needed, orthonormalize the MOs, then run the SCF
+  call create_guess
+  call orthonormalize_mos
+  call run
+end
+
+subroutine create_guess
+  implicit none
+  BEGIN_DOC
+!   Builds a set of guess MOs when the EZFIO directory does not contain
+!   mo_basis.mo_coef. The kind of guess is selected by mo_guess_type
+!   ("HCore" or "Huckel"); any other value stops the program.
+  END_DOC
+  logical                        :: exists
+  PROVIDE ezfio_filename
+  call ezfio_has_mo_basis_mo_coef(exists)
+  if (.not.exists) then
+    select case (mo_guess_type)
+      case ("HCore")
+        ! Start from ao_ortho_lowdin_coef, then take the eigenvectors of
+        ! mo_mono_elec_integral
+        mo_coef = ao_ortho_lowdin_coef
+        TOUCH mo_coef
+        mo_label = 'Guess'
+        call mo_as_eigvectors_of_mo_matrix(mo_mono_elec_integral,    &
+            size(mo_mono_elec_integral,1),                           &
+            size(mo_mono_elec_integral,2),mo_label)
+        SOFT_TOUCH mo_coef mo_label
+      case ("Huckel")
+        call huckel_guess
+      case default
+        print *,  'Unrecognized MO guess type : '//mo_guess_type
+        stop 1
+    end select
+  endif
+end
+
+subroutine run
+
+  BEGIN_DOC
+!   Run SCF calculation
+!
+!   Reads HF_energy, labels the MOs "Canonical" and calls damping_SCF.
+  END_DOC
+
+  use bitmasks
+  implicit none
+
+  double precision               :: EHF
+   
+  ! NOTE(review): EHF is not used afterwards; the assignment is kept because
+  ! reading HF_energy presumably triggers its evaluation before the SCF
+  ! starts -- confirm before removing.
+  EHF = HF_energy 
+
+  mo_label = "Canonical"
+
+! Choose SCF algorithm
+
+  call damping_SCF   ! Deprecated routine
+!  call Roothaan_Hall_SCF
+  
+end
+
+
--- a/plugins/QMC/densify_coefmatrix.irp.f
+++ b/plugins/QMC/densify_coefmatrix.irp.f
@ -0,0 +1,8 @@
+program densify
+  implicit none
+  BEGIN_DOC
+! Sets read_wf to true, calls generate_all_alpha_beta_det_products, then
+! diagonalize_ci, and finally save_wavefunction.
+  END_DOC
+  read_wf = .true.
+  TOUCH read_wf
+  call generate_all_alpha_beta_det_products
+  call diagonalize_ci
+  call save_wavefunction
+end
--- a/plugins/QMC/truncate_wf_spin.irp.f
+++ b/plugins/QMC/truncate_wf_spin.irp.f
@ -39,7 +39,8 @@ subroutine run
  call dsort(norm_sort(1),iorder(1),nab)


-  PROVIDE psi_bilinear_matrix_values nuclear_repulsion 
+  PROVIDE psi_bilinear_matrix_values psi_bilinear_matrix_rows psi_bilinear_matrix_columns
+  PROVIDE nuclear_repulsion 
  print *,  ''
  do j=0,nab
    i = iorder(j)
@ -47,7 +48,9 @@ subroutine run
      !$OMP PARALLEL DO PRIVATE(k)
      do k=1,n_det
        if (psi_bilinear_matrix_columns(k) == -i) then
-          psi_bilinear_matrix_values(k,1) = 0.d0
+          do l=1,N_states
+            psi_bilinear_matrix_values(k,l) = 0.d0
+          enddo
        endif
      enddo
      !$OMP END PARALLEL DO
@ -55,7 +58,9 @@ subroutine run
      !$OMP PARALLEL DO PRIVATE(k)
      do k=1,n_det
        if (psi_bilinear_matrix_rows(k) ==  i) then
-          psi_bilinear_matrix_values(k,1) = 0.d0
+          do l=1,N_states
+            psi_bilinear_matrix_values(k,l) = 0.d0
+          enddo
        endif
      enddo
      !$OMP END PARALLEL DO
@ -64,9 +69,11 @@ subroutine run
      cycle
    endif

-    u_0 = psi_bilinear_matrix_values(1:N_det,1:N_states)
-    v_t = 0.d0
-    s_t = 0.d0
+    u_0(1:N_det,1:N_states) = psi_bilinear_matrix_values(1:N_det,1:N_states)
+    v_0(1:N_det,1:N_states) = 0.d0
+    u_t(1:N_states,1:N_det) = 0.d0
+    v_t(1:N_states,1:N_det) = 0.d0
+    s_t(1:N_states,1:N_det) = 0.d0
    call dtranspose(                                                   &
        u_0,                                                           &
        size(u_0, 1),                                                  &
@ -85,20 +92,21 @@ subroutine run
    
    double precision, external :: u_dot_u, u_dot_v
    do i=1,N_states
-      e_0(i) = u_dot_v(v_t(1,i),u_0(1,i),N_det)/u_dot_u(u_0(1,i),N_det)
+      e_0(i) = u_dot_v(u_0(1,i),v_0(1,i),N_det)/u_dot_u(u_0(1,i),N_det)
+      print *,  'E = ', e_0(i) + nuclear_repulsion
    enddo

    m = 0
    do k=1,n_det
-     if (psi_bilinear_matrix_values(k,1) /= 0.d0) then
+     if (sum(psi_bilinear_matrix_values(k,1:N_states)) /= 0.d0) then
      m = m+1
     endif
    enddo

-    E = E_0(1) + nuclear_repulsion
-    norm = u_dot_u(u_0(1,1),N_det)
+    do k=1,N_states
+      E = E_0(k) + nuclear_repulsion
+    enddo
    print *,  'Number of determinants:', m
-    print *,  'Energy', E
    exit
  enddo
  call wf_of_psi_bilinear_matrix(.True.)
--- a/plugins/analyze_wf/analyze_wf.irp.f
+++ b/plugins/analyze_wf/analyze_wf.irp.f
@ -14,6 +14,17 @@ subroutine run
  integer                        :: class(0:mo_tot_num,5)
  double precision               :: occupation(mo_tot_num)

+  write(*,'(A)')  'Energy of 1st determinant'
+  write(*,'(A)')  '========================='
+  write(*,'(A)')  ''
+  write(*,*) 'Total', ref_bitmask_energy + nuclear_repulsion
+  write(*,*) 'Mono-electronic', mono_elec_ref_bitmask_energy
+  write(*,*) 'Kinetic', kinetic_ref_bitmask_energy
+  write(*,*) 'Electron-nucleus', nucl_elec_ref_bitmask_energy
+  write(*,*) 'Two-electron', bi_elec_ref_bitmask_energy
+  write(*,'(A)')  ''
+  write(*,'(A)')  ''
+
  write(*,'(A)')  'MO Occupation'
  write(*,'(A)')  '============='
  write(*,'(A)')  ''
--- a/plugins/mrcepa0/dressing_slave.irp.f
+++ b/plugins/mrcepa0/dressing_slave.irp.f
@ -42,18 +42,18 @@ subroutine mrsc2_dressing_slave(thread,iproc)
  integer, allocatable           :: hp(:,:)


-  integer                         :: i_state, i, i_I, J, k, k2, k1, kk, ll, degree, degree2, m, l, deg, ni, m2
+  integer                         :: i_state, i, i_I, J, k, k2, k1, kk, ll, m, l, deg, ni, m2
  integer                         :: n(2)
  integer                         :: p1,p2,h1,h2,s1,s2, blok, I_s, J_s, kn
  logical                         :: ok
-  double precision                :: phase_iI, phase_Ik, phase_Jl, phase_Ji, phase_al
+  double precision                :: phase_ia, phase_Ik, phase_Jl, phase_Ji, phase_la, phase_ka, phase_tmp
+  double precision                :: Hka, Hla, Ska, Sla, tmp
  double precision                :: diI, hIi, hJi, delta_JI, dkI, HkI, ci_inv(N_states), cj_inv(N_states)
  double precision                :: contrib, contrib_s2, wall, iwall
-  double precision, allocatable   :: dleat(:,:,:), dleat_s2(:,:,:)
-  integer, dimension(0:2,2,2)     :: exc_iI, exc_Ik, exc_IJ
+  integer, dimension(0:2,2,2)     :: exc_iI, exc_Ik, exc_IJ, exc
  integer(bit_kind)               :: det_tmp(N_int, 2), det_tmp2(N_int, 2), inac, virt
  integer, external               :: get_index_in_psi_det_sorted_bit, searchDet, detCmp
-  logical, external               :: is_in_wavefunction, isInCassd, detEq
+  logical, external               :: is_in_wavefunction
  integer,allocatable :: komon(:)
  logical :: komoned
  !double precision, external :: get_dij
@ -63,8 +63,8 @@ subroutine mrsc2_dressing_slave(thread,iproc)

  call connect_to_taskserver(zmq_to_qp_run_socket,worker_id,thread)

-  allocate (dleat(N_states, N_det_non_ref, 2), delta(N_states,0:N_det_non_ref, 2))
-  allocate (dleat_s2(N_states, N_det_non_ref, 2), delta_s2(N_states,0:N_det_non_ref, 2))
+  allocate (delta(N_states,0:N_det_non_ref, 2))
+  allocate (delta_s2(N_states,0:N_det_non_ref, 2))
  allocate(komon(0:N_det_non_ref))

  allocate(hp(2,N_det_non_ref))
@ -100,7 +100,7 @@ subroutine mrsc2_dressing_slave(thread,iproc)
      k = det_cepa0_idx(linked(kk, i_I))
      blok = blokMwen(kk, i_I)
      
-      call get_excitation(psi_ref(1,1,i_I),psi_non_ref(1,1,k),exc_Ik,degree,phase_Ik,N_int)
+      call get_excitation(psi_ref(1,1,i_I),psi_non_ref(1,1,k),exc_Ik,deg,phase_Ik,N_int)
    
      if(J /= i_I) then
        call apply_excitation(psi_ref(1,1,J),exc_Ik,det_tmp2,ok,N_int)
@ -135,36 +135,10 @@ subroutine mrsc2_dressing_slave(thread,iproc)
          
          if(h_cache(J,i) == 0.d0) cycle
          if(h_cache(i_I,i) == 0.d0) cycle
-          
-          !ok = .false.
-          !do i_state=1, N_states
-          !  if(lambda_mrcc(i_state, i) /= 0d0) then
-          !    ok = .true.
-          !    exit
-          !  end if
-          !end do
-          !if(.not. ok) cycle
-!         
-          
+         
          komon(0) += 1
          kn = komon(0)
          komon(kn) = i
-          
-          
-!           call get_excitation(psi_ref(1,1,J),psi_non_ref(1,1,i),exc_IJ,degree2,phase_Ji,N_int)
-!           if(I_i /= J) call get_excitation(psi_ref(1,1,I_i),psi_non_ref(1,1,i),exc_IJ,degree2,phase_Ii,N_int)
-!           if(I_i == J) phase_Ii = phase_Ji
-          
-          do i_state = 1,N_states
-            dkI = h_cache(J,i) * dij(i_I, i, i_state)
-            dleat(i_state, kn, 1) = dkI
-            dleat(i_state, kn, 2) = dkI
-
-            dkI = s2_cache(J,i) * dij(i_I, i, i_state)
-            dleat_s2(i_state, kn, 1) = dkI
-            dleat_s2(i_state, kn, 2) = dkI
-          end do
-
        end do
          
        komoned = .true.
@ -178,18 +152,20 @@ subroutine mrsc2_dressing_slave(thread,iproc)
        call apply_excitation(psi_non_ref(1,1,i),exc_Ik,det_tmp,ok,N_int)
        if(.not. ok) cycle
        if(HP(1,i) + HP(1,k) <= 2 .and. HP(2,i) + HP(2,k) <= 2) then
-          cycle
+          if(is_in_wavefunction(det_tmp, N_int)) cycle
        end if
        
-        !if(isInCassd(det_tmp, N_int)) cycle
-          
+        
+        call i_h_j_phase_out(psi_non_ref(1,1,i), det_tmp, N_int, tmp, phase_ia,exc, deg)
+        call i_h_j_phase_out(psi_ref(1,1,i_I), psi_non_ref(1,1,k), N_int, tmp, phase_ik,exc, deg)
+        
+        call i_h_j_phase_out(psi_non_ref(1,1,l), det_tmp, N_int, Hla, phase_la,exc,deg)
+        call get_s2(psi_non_ref(1,1,l), det_tmp, N_int, Sla)
+        
+        
        do i_state = 1, N_states 
-          !if(lambda_mrcc(i_state, i) == 0d0) cycle
-          
-
-          !contrib = h_cache(i_I,k) * lambda_mrcc(i_state, k) * dleat(i_state, m, 2)! * phase_al
-          contrib =  dij(i_I, k, i_state) * dleat(i_state, m, 2)
-          contrib_s2 =  dij(i_I, k, i_state) * dleat_s2(i_state, m, 2)
+          contrib =  dij(i_I, k, i_state) * dij(i_I, i, i_state) * Hla * phase_ia * phase_ik
+          contrib_s2 =  dij(i_I, k, i_state) * dij(i_I, i, i_state) * Sla *phase_ia * phase_ik
          delta(i_state,ll,1) += contrib
          delta_s2(i_state,ll,1) += contrib_s2
          if(dabs(psi_ref_coef(i_I,i_state)).ge.5.d-5) then
@ -198,9 +174,12 @@ subroutine mrsc2_dressing_slave(thread,iproc)
          endif
          
          if(I_i == J) cycle
-          !contrib = h_cache(J,l) * lambda_mrcc(i_state, l) * dleat(i_state, m, 1)! * phase_al
-          contrib =  dij(J, l, i_state) * dleat(i_state, m, 1)
-          contrib_s2 =  dij(J, l, i_state) * dleat_s2(i_state, m, 1)
+          call i_h_j_phase_out(psi_non_ref(1,1,k), det_tmp, N_int, Hka, phase_ka,exc,deg)
+          call get_s2(psi_non_ref(1,1,k), det_tmp, N_int, Ska) 
+          call i_h_j_phase_out(psi_ref(1,1,J), psi_non_ref(1,1,l), N_int, tmp, phase_jl,exc, deg)
+          
+          contrib =  dij(J, l, i_state) * dij(J, i, i_state) * Hka* phase_ia * phase_jl
+          contrib_s2 =  dij(J, l, i_state) * dij(J, i, i_state) * Ska*phase_ia*phase_jl
          delta(i_state,kk,2) += contrib
          delta_s2(i_state,kk,2) += contrib_s2
          if(dabs(psi_ref_coef(J,i_state)).ge.5.d-5) then
@ -211,12 +190,8 @@ subroutine mrsc2_dressing_slave(thread,iproc)
      end do ! while
    end do ! kk

-      
-      call push_mrsc2_results(zmq_socket_push, I_i, J, delta, delta_s2, task_id) 
-      call task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id)
- 
-!     end if
-    
+    call push_mrsc2_results(zmq_socket_push, I_i, J, delta, delta_s2, task_id) 
+    call task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id)
  enddo

  deallocate(delta)
--- a/plugins/read_integral/print_integrals_ao.irp.f
+++ b/plugins/read_integral/print_integrals_ao.irp.f
@ -0,0 +1,108 @@
+program print_integrals
+  implicit none
+  BEGIN_DOC
+! Writes the AO integrals into the following files:
+! - kinetic_ao
+! - overlap_ao
+! - nuclear_ao
+! - bielec_ao
+  END_DOC
+
+  PROVIDE ezfio_filename
+  ! Both disk-access modes are set to 'None' before run is called.
+  ! NOTE(review): presumably so that the integrals are not read back from
+  ! disk - confirm.
+  call ezfio_set_integrals_monoelec_disk_access_ao_one_integrals('None')
+  call ezfio_set_integrals_bielec_disk_access_ao_integrals('None')
+  call run
+end
+
+subroutine run
+  implicit none
+  BEGIN_DOC
+! Writes the AO integrals to text files:
+! - kinetic_ao : i, j, ao_kinetic_integral(i,j)
+! - overlap_ao : i, j, ao_overlap(i,j)
+! - nuclear_ao : i, j, ao_nucl_elec_integral(i,j)
+! - bielec_ao  : i, j, k, l, get_ao_bielec_integral(i,j,k,l,ao_integrals_map)
+! One-electron values are written only when their absolute value exceeds
+! ao_integrals_threshold; two-electron values when it is at least 1.d-15.
+  END_DOC
+  
+  integer :: iunit
+  integer :: getunitandopen
+
+  integer ::i,j,k,l
+  double precision :: integral
+  double precision, external :: get_ao_bielec_integral
+
+  iunit = getunitandopen('kinetic_ao','w')
+  do i=1,ao_num
+    do j=1,ao_num
+      integral = ao_kinetic_integral(i,j)
+      if (dabs(integral) > ao_integrals_threshold) then
+        write(iunit,*) i,j, integral
+      endif
+    enddo
+  enddo
+  close(iunit)
+  
+  iunit = getunitandopen('overlap_ao','w')
+  do i=1,ao_num
+    do j=1,ao_num
+      integral = ao_overlap(i,j)
+      if (dabs(integral) > ao_integrals_threshold) then
+        write(iunit,*) i,j, integral
+      endif
+    enddo
+  enddo
+  close(iunit)
+  
+  iunit = getunitandopen('nuclear_ao','w')
+  do i=1,ao_num
+    do j=1,ao_num
+      integral = ao_nucl_elec_integral(i,j)
+      if (dabs(integral) > ao_integrals_threshold) then
+        write(iunit,*) i,j, integral
+      endif
+    enddo
+  enddo
+  close(iunit)
+
+!  iunit = getunitandopen('pseudo_ao','w')
+!  do i=1,ao_num
+!    do j=1,ao_num
+!      write(iunit,*) i,j, ao_pseudo_integral(i,j)
+!    enddo
+!  enddo
+!  close(iunit)
+
+  PROVIDE ao_bielec_integrals_in_map
+  iunit = getunitandopen('bielec_ao','w')
+
+  ! The declarations and buffers below are only used by the commented-out
+  ! map traversal; they are kept so that it can be re-enabled unchanged.
+  integer*8                      :: i8
+  integer                        :: i_idx, n_elements_max, k1, n_elements
+  integer                        :: ii(8), jj(8), kk(8), ll(8)
+  double precision, external     :: ao_bielec_integral
+  integer(key_kind), allocatable :: keys(:)
+  double precision, allocatable  :: values(:)
+
+
+  call get_cache_map_n_elements_max(ao_integrals_map,n_elements_max)
+  allocate(keys(n_elements_max), values(n_elements_max))
+
+!  do i8=0_8,ao_integrals_map%map_size
+!     n_elements = n_elements_max
+!     call get_cache_map(ao_integrals_map,i8,keys,values,n_elements)
+!     do k1=1,n_elements
+!      call bielec_integrals_index_reverse(kk,ii,ll,jj,keys(k1))
+!      if ( (kk(1)>ao_num).or.                                        &
+!            (ii(1)>ao_num).or.                                       &
+!            (jj(1)>ao_num).or.                                       &
+!            (ll(1)>ao_num) ) then
+!            cycle
+!      endif
+!      k = kk(1)
+!      i = ii(1)
+!      l = ll(1)
+!      j = jj(1)
+!      integral = values(k1)
+!      write (iunit,'(4(I6,X),F20.15)') k,i,l,j, integral 
+!    enddo
+!  enddo
+
+  ! Explicit loop over all index quadruplets.
+  do i=1,ao_num
+    do k=1,ao_num
+      do j=1,ao_num
+        do l=1,ao_num
+          integral = get_ao_bielec_integral(i,j,k,l,ao_integrals_map)
+          ! Double-precision literal: the comparison is done in double
+          ! precision, so the cut-off should not be a rounded single.
+          if (dabs(integral)>=1.d-15) then
+            ! A blank after every index keeps the fields separated whatever
+            ! the index width, so a list-directed read can always split the
+            ! record; same layout as the MO integral printer.
+            write (iunit,'(4(I6,1X),F20.15)') i,j,k,l, integral 
+          endif
+        enddo
+      enddo
+    enddo
+  enddo
+
+  close(iunit)
+end
--- a/plugins/read_integral/print_integrals_mo.irp.f
+++ b/plugins/read_integral/print_integrals_mo.irp.f
@ -49,7 +49,7 @@ program print_integrals
          double precision :: get_mo_bielec_integral
          integral = get_mo_bielec_integral(i,j,k,l,mo_integrals_map)
          if (dabs(integral) > mo_integrals_threshold) then
-            write (iunit,'(4(I5,X),D22.15)') i,j,k,l, integral 
+            write (iunit,'(4(I6,X),F20.15)') i,j,k,l, integral 
          endif
       !end if
     enddo
--- a/plugins/read_integral/read_integrals_ao.irp.f
+++ b/plugins/read_integral/read_integrals_ao.irp.f
@ -0,0 +1,76 @@
+program read_integrals
+  implicit none
+  BEGIN_DOC
+! Reads the integrals from the following files:
+! - kinetic_ao
+! - nuclear_ao
+! - bielec_ao
+  END_DOC
+
+  PROVIDE ezfio_filename
+  ! The one-electron AO disk access is set to "None" while the files are
+  ! imported; run switches it to "Read" once the integrals are written.
+  call ezfio_set_integrals_monoelec_disk_access_ao_one_integrals("None")
+  call run
+end
+
+subroutine run
+  use map_module
+  implicit none
+  BEGIN_DOC
+! Imports AO integrals from text files.
+! - kinetic_ao, nuclear_ao : records "i j value", stored symmetrically
+!   (A(i,j) = A(j,i)) and passed to write_one_e_integrals.
+! - bielec_ao : records "i j k l value", converted to map keys with
+!   bielec_integrals_index and inserted into ao_integrals_map, which is
+!   then sorted, made unique and saved under <ezfio_filename>/work/ao_ints.
+! ao_pseudo_integral is not read from a file: the provided array is
+! written as it is.
+! NOTE(review): no overlap file is read here - confirm this is intended.
+  END_DOC
+  
+  integer :: iunit
+  integer :: getunitandopen
+
+  integer ::i,j,k,l
+  double precision :: integral
+  double precision, allocatable :: A(:,:)
+
+  integer             :: n_integrals, n_records
+  integer(key_kind), allocatable   :: buffer_i(:) 
+  real(integral_kind), allocatable :: buffer_values(:)
+   
+  allocate (A(ao_num,ao_num))
+  A = 0.d0
+  
+  iunit = getunitandopen('kinetic_ao','r')
+  do 
+    read (iunit,*,end=10) i,j, integral
+    A(i,j) = integral
+    A(j,i) = integral
+  enddo
+  10 continue
+  close(iunit)
+  call write_one_e_integrals('ao_kinetic_integral', A, size(A,1), size(A,2))
+
+
+  A = 0.d0
+  iunit = getunitandopen('nuclear_ao','r')
+  do 
+    read (iunit,*,end=12) i,j, integral
+    A(i,j) = integral
+    A(j,i) = integral
+  enddo
+  12 continue
+  close(iunit)
+  call write_one_e_integrals('ao_ne_integral', A, size(A,1), size(A,2))
+
+  call write_one_e_integrals('ao_pseudo_integral', ao_pseudo_integral,&
+        size(ao_pseudo_integral,1), size(ao_pseudo_integral,2))
+
+
+  call ezfio_set_integrals_monoelec_disk_access_ao_one_integrals("Read")
+
+  iunit = getunitandopen('bielec_ao','r')
+
+  ! First pass: count the records. Sizing the buffers as ao_num**4 would
+  ! overflow a default integer for ao_num >= 216 and would request far more
+  ! memory than the file needs.
+  n_records = 0
+  do 
+    read (iunit,*,end=13) i,j,k,l, integral
+    n_records += 1
+  enddo
+  13 continue
+  rewind(iunit)
+
+  ! At least one element, so the buffers are never zero-sized.
+  allocate(buffer_i(max(1,n_records)), buffer_values(max(1,n_records)))
+   
+  ! Second pass: convert every record to a (key, value) pair.
+  n_integrals=0
+  do 
+    read (iunit,*,end=14) i,j,k,l, integral
+    n_integrals += 1
+    call bielec_integrals_index(i, j, k, l, buffer_i(n_integrals) )
+    buffer_values(n_integrals) = integral
+  enddo
+  14 continue
+  close(iunit)
+  
+  call insert_into_ao_integrals_map(n_integrals,buffer_i,buffer_values)
+
+  call map_sort(ao_integrals_map)
+  call map_unique(ao_integrals_map)
+
+  call map_save_to_disk(trim(ezfio_filename)//'/work/ao_ints',ao_integrals_map)
+  call ezfio_set_integrals_bielec_disk_access_ao_integrals('Read')
+
+  deallocate(buffer_i, buffer_values, A)
+
+end
--- a/plugins/read_integral/read_integrals_mo.irp.f
+++ b/plugins/read_integral/read_integrals_mo.irp.f
@ -1,5 +1,10 @@
 program read_integrals
-
+  BEGIN_DOC
+! Reads the integrals from the following files:
+! - kinetic_mo
+! - nuclear_mo
+! - bielec_mo
+  END_DOC
  PROVIDE ezfio_filename
  call ezfio_set_integrals_monoelec_disk_access_mo_one_integrals("None")
  call run
--- a/scripts/compilation/qp_create_ninja.py
+++ b/scripts/compilation/qp_create_ninja.py
@ -36,6 +36,7 @@ except ImportError:
 from qp_path import QP_ROOT, QP_SRC, QP_EZFIO

 LIB = "" # join(QP_ROOT, "lib", "rdtsc.o") 
+GPI_LIB = join(QP_ROOT, "lib64", "libGPI2.a") 
 EZFIO_LIB = join(QP_ROOT, "lib", "libezfio_irp.a") 
 ZMQ_LIB = join(QP_ROOT, "lib", "libf77zmq.a") + " "  + join(QP_ROOT, "lib", "libzmq.a") + " -lstdc++ -lrt"
 ROOT_BUILD_NINJA = join(QP_ROOT, "config", "build.ninja")
@ -96,8 +97,7 @@ def ninja_create_env_variable(pwd_config_file):
        l_string.append(str_)

    lib_lapack = get_compilation_option(pwd_config_file, "LAPACK_LIB")
-    lib_gpi2 = get_compilation_option(pwd_config_file, "GPI2_LIB")
-    str_lib = " ".join([LIB, lib_lapack, lib_gpi2, EZFIO_LIB, ZMQ_LIB])
+    str_lib = " ".join([LIB, lib_lapack, GPI_LIB, EZFIO_LIB, ZMQ_LIB])
    l_string.append("LIB = {0} ".format(str_lib))

    l_string.append("")
@ -266,7 +266,7 @@ def ninja_ezfio_rule():

    install_lib_ezfio = join(QP_ROOT, 'install', 'EZFIO', "lib", "libezfio_irp.a")
    l_cmd = ["cd {0}".format(QP_EZFIO)] + l_flag
-    l_cmd += ["rm -f make.config ; ninja && ln -sf {0} {1}".format(install_lib_ezfio, EZFIO_LIB)]
+    l_cmd += ["rm -f make.config ; ninja && rm -f {1} ; ln -sf {0} {1}".format(install_lib_ezfio, EZFIO_LIB)]

    l_string = ["rule build_ezfio",
                "   command = {0}".format(" ; ".join(l_cmd)),
@ -307,7 +307,7 @@ def ninja_symlink_rule():
    """
    Return the command to create for the symlink
    """
-    return ["rule build_symlink", "   command =  ln -sf $in $out", ""]
+    return ["rule build_symlink", "   command =  rm -f $out ; ln -sf $in $out", ""]


 def ninja_symlink_build(path_module, l_symlink):
--- a/src/Davidson/davidson_parallel.irp.f
+++ b/src/Davidson/davidson_parallel.irp.f
@ -205,10 +205,10 @@ subroutine davidson_pull_results(zmq_socket_pull, v_t, s_t, imin, imax, task_id)
  if(rc /= 4) stop "davidson_pull_results failed to pull task_id"

  rc = f77_zmq_recv( zmq_socket_pull, imin, 4, 0)
-  if(rc /= 4) stop "davidson_pull_results failed to pull task_id"
+  if(rc /= 4) stop "davidson_pull_results failed to pull imin"

  rc = f77_zmq_recv( zmq_socket_pull, imax, 4, 0)
-  if(rc /= 4) stop "davidson_pull_results failed to pull task_id"
+  if(rc /= 4) stop "davidson_pull_results failed to pull imax"

  sz = (imax-imin+1)*N_states_diag

--- a/src/Davidson/diagonalization_hs2.irp.f
+++ b/src/Davidson/diagonalization_hs2.irp.f
@ -139,7 +139,7 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_
  write(iunit,'(A)') trim(write_buffer)
  write_buffer = ' Iter'
  do i=1,N_st
-    write_buffer = trim(write_buffer)//'      Energy          S^2      Residual    '
+    write_buffer = trim(write_buffer)//'      Energy          S^2      Residual      '
  enddo
  write(iunit,'(A)') trim(write_buffer)
  write_buffer = '===== '
--- a/src/Davidson/print_energy.irp.f
+++ b/src/Davidson/print_energy.irp.f
@ -0,0 +1,22 @@
+program print_energy
+  implicit none
+  BEGIN_DOC
+! Sets read_wf to true and calls routine, which prints
+! psi_energy + nuclear_repulsion.
+  END_DOC
+
+  ! Flag the wave function as "to be read" and notify dependent quantities.
+  read_wf = .True.
+  TOUCH read_wf
+
+  call routine
+end
+
+subroutine routine
+  implicit none
+  BEGIN_DOC
+! Prints psi_energy + nuclear_repulsion.
+  END_DOC
+  double precision :: accu, hij
+  integer          :: i, j
+
+  ! accu, hij, i and j are only needed by the disabled cross-check below.
+  accu = 0.d0
+
+  print *, 'psi_energy          = ', psi_energy + nuclear_repulsion
+
+  ! Disabled cross-check: explicit double sum over determinants of
+  ! psi_coef(i,1) * psi_coef(j,1) * <j|H|i>.
+! do i = 1,N_det
+!  do j = 1,N_det
+!   call i_H_j(psi_det(1,1,j),psi_det(1,1,i),N_int,hij)
+!   accu += psi_coef(i,1) * psi_coef(j,1) * hij
+!  enddo
+! enddo
+! print*, 'accu                = ',accu + nuclear_repulsion
+end
--- a/src/Determinants/H_apply.irp.f
+++ b/src/Determinants/H_apply.irp.f
@ -192,8 +192,8 @@ subroutine copy_H_apply_buffer_to_wf
  call normalize(psi_coef,N_det)
  SOFT_TOUCH N_det psi_det psi_coef
  
-  logical :: found_duplicates
-  !call remove_duplicates_in_psi_det(found_duplicates)
+!  logical :: found_duplicates
+!  call remove_duplicates_in_psi_det(found_duplicates)
 end

 subroutine remove_duplicates_in_psi_det(found_duplicates)
--- a/src/Determinants/determinants.irp.f
+++ b/src/Determinants/determinants.irp.f
@ -435,62 +435,32 @@ subroutine save_wavefunction_general(ndet,nstates,psidet,dim_psicoef,psicoef)
 !  Save the wave function into the EZFIO file
  END_DOC
  use bitmasks
+  include 'constants.include.F'
  integer, intent(in) :: ndet,nstates,dim_psicoef
  integer(bit_kind), intent(in) :: psidet(N_int,2,ndet)
  double precision, intent(in)  :: psicoef(dim_psicoef,nstates)
  integer*8, allocatable         :: psi_det_save(:,:,:)
  double precision, allocatable  :: psi_coef_save(:,:)
-  integer*8                      :: det_8(100)
-  integer(bit_kind)              :: det_bk((100*8)/bit_kind)
-  integer                        :: N_int2
-  equivalence (det_8, det_bk)

-  integer :: i,k
+  integer :: i,j,k

-  PROVIDE progress_bar
-  call start_progress(7,'Saving wfunction',0.d0)
-
-  progress_bar(1) = 1
-  progress_value = dble(progress_bar(1))
  call ezfio_set_determinants_N_int(N_int)
-  progress_bar(1) = 2
-  progress_value = dble(progress_bar(1))
  call ezfio_set_determinants_bit_kind(bit_kind)
-  progress_bar(1) = 3
-  progress_value = dble(progress_bar(1))
  call ezfio_set_determinants_N_det(ndet)
-  progress_bar(1) = 4
-  progress_value = dble(progress_bar(1))
  call ezfio_set_determinants_n_states(nstates)
-  progress_bar(1) = 5
-  progress_value = dble(progress_bar(1))
  call ezfio_set_determinants_mo_label(mo_label)

-  progress_bar(1) = 6
-  progress_value = dble(progress_bar(1))
-
-  N_int2 = (N_int*bit_kind)/8
-  allocate (psi_det_save(N_int2,2,ndet))
+  allocate (psi_det_save(N_int,2,ndet))
  do i=1,ndet
+   do j=1,2
    do k=1,N_int
-      det_bk(k) = psidet(k,1,i)
+      psi_det_save(k,j,i) = transfer(psidet(k,j,i),1_8)
    enddo
-    do k=1,N_int2
-      psi_det_save(k,1,i) = det_8(k)
-    enddo
-    do k=1,N_int
-      det_bk(k) = psidet(k,2,i)
-    enddo
-    do k=1,N_int2
-      psi_det_save(k,2,i) = det_8(k)
-    enddo
-!   print*,psi_det_save
+   enddo
  enddo
  call ezfio_set_determinants_psi_det(psi_det_save)
  deallocate (psi_det_save)

-  progress_bar(1) = 7
-  progress_value = dble(progress_bar(1))
  allocate (psi_coef_save(ndet,nstates))
  double precision :: accu_norm(nstates)
  accu_norm = 0.d0
@ -511,7 +481,6 @@ subroutine save_wavefunction_general(ndet,nstates,psidet,dim_psicoef,psicoef)

  call ezfio_set_determinants_psi_coef(psi_coef_save)
  call write_int(output_determinants,ndet,'Saved determinants')
-  call stop_progress
  deallocate (psi_coef_save)
 end

@ -537,28 +506,12 @@ subroutine save_wavefunction_specified(ndet,nstates,psidet,psicoef,ndetsave,inde

  integer :: i,k

-  PROVIDE progress_bar
-  call start_progress(7,'Saving wfunction',0.d0)
-
-  progress_bar(1) = 1
-  progress_value = dble(progress_bar(1))
  call ezfio_set_determinants_N_int(N_int)
-  progress_bar(1) = 2
-  progress_value = dble(progress_bar(1))
  call ezfio_set_determinants_bit_kind(bit_kind)
-  progress_bar(1) = 3
-  progress_value = dble(progress_bar(1))
  call ezfio_set_determinants_N_det(ndetsave)
-  progress_bar(1) = 4
-  progress_value = dble(progress_bar(1))
  call ezfio_set_determinants_n_states(nstates)
-  progress_bar(1) = 5
-  progress_value = dble(progress_bar(1))
  call ezfio_set_determinants_mo_label(mo_label)

-  progress_bar(1) = 6
-  progress_value = dble(progress_bar(1))
-
  N_int2 = (N_int*bit_kind)/8
  allocate (psi_det_save(N_int2,2,ndetsave))
  do i=1,ndetsave
@ -600,7 +553,6 @@ subroutine save_wavefunction_specified(ndet,nstates,psidet,psicoef,ndetsave,inde

  call ezfio_set_determinants_psi_coef(psi_coef_save)
  call write_int(output_determinants,ndet,'Saved determinants')
-  call stop_progress
  deallocate (psi_coef_save)
 end

--- a/src/Determinants/slater_rules.irp.f
+++ b/src/Determinants/slater_rules.irp.f
@ -234,61 +234,66 @@ subroutine get_double_excitation(det1,det2,exc,phase,Nint)
        cycle
        
      case(1)
+
+        high = max(exc(1,1,ispin), exc(1,2,ispin))-1
        low  = min(exc(1,1,ispin), exc(1,2,ispin))
-        high = max(exc(1,1,ispin), exc(1,2,ispin))
-        
-        ASSERT (low > 0)
-        j = ishft(low-1,-bit_kind_shift)+1   ! Find integer in array(Nint)
-        n = iand(low-1,bit_kind_size-1)+1        ! mod(low,bit_kind_size)
+
+        ASSERT (low >= 0)
        ASSERT (high > 0)
-        k = ishft(high-1,-bit_kind_shift)+1
-        m = iand(high-1,bit_kind_size-1)+1
+
+        k = ishft(high,-bit_kind_shift)+1
+        j = ishft(low,-bit_kind_shift)+1
+        m = iand(high,bit_kind_size-1)
+        n = iand(low,bit_kind_size-1)
        
        if (j==k) then
-          nperm = nperm + popcnt(iand(det1(j,ispin),                 &
-              iand( ibset(0_bit_kind,m-1)-1_bit_kind,                &
-                    ibclr(-1_bit_kind,n)+1_bit_kind ) ))
-! TODO        iand( not(ishft(1_bit_kind,n+1))+1_bit_kind, &
-!                   ishft(1_bit_kind,m)-1_bit_kind)))
+          nperm = nperm + popcnt(iand(det1(j,ispin),           &
+              iand( ishft(1_bit_kind,m)-1_bit_kind,            &
+                    not(ishft(1_bit_kind,n))+1_bit_kind)) )
        else
-          nperm = nperm + popcnt(iand(det1(k,ispin),                 &
-              ibset(0_bit_kind,m-1)-1_bit_kind)) 
-! TODO        ishft(1_bit_kind,m)-1_bit_kind)) 
-          if (n < bit_kind_size) then
-              nperm = nperm + popcnt(iand(det1(j,ispin), ibclr(-1_bit_kind,n) +1_bit_kind))
-! TODO        ishft(1_bit_kind,m)-1_bit_kind)) 
-          endif
+          nperm = nperm + popcnt(                                    &
+               iand(det1(j,ispin),                                   &
+                    iand(not(0_bit_kind),                            &
+                         (not(ishft(1_bit_kind,n)) + 1_bit_kind) ))) &
+               + popcnt(iand(det1(k,ispin),                          &
+                             (ishft(1_bit_kind,m) - 1_bit_kind ) ))
+
          do i=j+1,k-1
            nperm = nperm + popcnt(det1(i,ispin))
          end do
+
        endif
        
      case (2)
        
-        do i=1,2
-          low  = min(exc(i,1,ispin), exc(i,2,ispin))
-          high = max(exc(i,1,ispin), exc(i,2,ispin))
-          
+        do l=1,2
+          high = max(exc(l,1,ispin), exc(l,2,ispin))-1
+          low  = min(exc(l,1,ispin), exc(l,2,ispin))
+
          ASSERT (low > 0)
-          j = ishft(low-1,-bit_kind_shift)+1   ! Find integer in array(Nint)
-          n = iand(low-1,bit_kind_size-1)+1        ! mod(low,bit_kind_size)
          ASSERT (high > 0)
-          k = ishft(high-1,-bit_kind_shift)+1
-          m = iand(high-1,bit_kind_size-1)+1
+
+          k = ishft(high,-bit_kind_shift)+1
+          j = ishft(low,-bit_kind_shift)+1
+          m = iand(high,bit_kind_size-1)
+          n = iand(low,bit_kind_size-1)
          
          if (j==k) then
-            nperm = nperm + popcnt(iand(det1(j,ispin),               &
-                iand( ibset(0_bit_kind,m-1)-1_bit_kind,              &
-                ibclr(-1_bit_kind,n)+1_bit_kind ) ))
+            nperm = nperm + popcnt(iand(det1(j,ispin),           &
+                iand( ishft(1_bit_kind,m)-1_bit_kind,            &
+                    not(ishft(1_bit_kind,n))+1_bit_kind)) )
          else
-            nperm = nperm + popcnt(iand(det1(k,ispin),               &
-                ibset(0_bit_kind,m-1)-1_bit_kind)) 
-            if (n < bit_kind_size) then
-               nperm = nperm + popcnt(iand(det1(j,ispin), ibclr(-1_bit_kind,n) +1_bit_kind))
-            endif
-            do l=j+1,k-1
-              nperm = nperm + popcnt(det1(l,ispin))
+            nperm = nperm + popcnt(                                    &
+                 iand(det1(j,ispin),                                   &
+                      iand(not(0_bit_kind),                            &
+                           (not(ishft(1_bit_kind,n)) + 1_bit_kind) ))) &
+                 + popcnt(iand(det1(k,ispin),                          &
+                               (ishft(1_bit_kind,m) - 1_bit_kind ) ))
+    
+            do i=j+1,k-1
+              nperm = nperm + popcnt(det1(i,ispin))
            end do
+    
          endif
          
        enddo
@ -297,7 +302,7 @@ subroutine get_double_excitation(det1,det2,exc,phase,Nint)
        b = max(exc(1,1,ispin), exc(1,2,ispin))
        c = min(exc(2,1,ispin), exc(2,2,ispin))
        d = max(exc(2,1,ispin), exc(2,2,ispin))
-        if (c>a .and. c<b .and. d>b) then
+        if ((a<c) .and. (c<b) .and. (b<d)) then
          nperm = nperm + 1
        endif
        exit
@ -358,37 +363,42 @@ subroutine get_mono_excitation(det1,det2,exc,phase,Nint)
      if ( iand(exc(0,1,ispin),exc(0,2,ispin)) /= 1) then  ! exc(0,1,ispin)/=1 and exc(0,2,ispin) /= 1
        cycle
      endif
-      
-      low = min(exc(1,1,ispin),exc(1,2,ispin))
-      high = max(exc(1,1,ispin),exc(1,2,ispin))
-      
-      ASSERT (low > 0)
-      j = ishft(low-1,-bit_kind_shift)+1   ! Find integer in array(Nint)
-      n = iand(low-1,bit_kind_size-1)+1      ! mod(low,bit_kind_size)
+
+      high = max(exc(1,1,ispin), exc(1,2,ispin))-1
+      low  = min(exc(1,1,ispin), exc(1,2,ispin))
+
+      ASSERT (low >= 0)
      ASSERT (high > 0)
-      k = ishft(high-1,-bit_kind_shift)+1
-      m = iand(high-1,bit_kind_size-1)+1
+
+      k = ishft(high,-bit_kind_shift)+1
+      j = ishft(low,-bit_kind_shift)+1
+      m = iand(high,bit_kind_size-1)
+      n = iand(low,bit_kind_size-1)
+      
      if (j==k) then
-        nperm = popcnt(iand(det1(j,ispin),                           &
-            iand(ibset(0_bit_kind,m-1)-1_bit_kind,ibclr(-1_bit_kind,n)+1_bit_kind)))
-!TODO       iand( not(ishft(1_bit_kind,n+1))+1_bit_kind, &
-!                 ishft(1_bit_kind,m)-1_bit_kind)))
+        nperm = nperm + popcnt(iand(det1(j,ispin),           &
+            iand( ishft(1_bit_kind,m)-1_bit_kind,            &
+                  not(ishft(1_bit_kind,n))+1_bit_kind)) )
      else
-        nperm = nperm + popcnt(iand(det1(k,ispin),ibset(0_bit_kind,m-1)-1_bit_kind))
-!TODO    nperm = popcnt(iand(det1(k,ispin), ishft(1_bit_kind,m)-1_bit_kind)) + &
-!                popcnt(iand(det1(j,ispin), not(ishft(1_bit_kind,n+1))+1_bit_kind))
-        if (n < bit_kind_size) then
-            nperm = nperm + popcnt(iand(det1(j,ispin),ibclr(-1_bit_kind,n)+1_bit_kind))
-        endif
+        nperm = nperm + popcnt(                                    &
+             iand(det1(j,ispin),                                   &
+                  iand(not(0_bit_kind),                            &
+                       (not(ishft(1_bit_kind,n)) + 1_bit_kind) ))) &
+             + popcnt(iand(det1(k,ispin),                          &
+                           (ishft(1_bit_kind,m) - 1_bit_kind ) ))
+
        do i=j+1,k-1
          nperm = nperm + popcnt(det1(i,ispin))
        end do
+
      endif
+      
      phase = phase_dble(iand(nperm,1))
      return
      
    enddo
  enddo
+
 end

 subroutine bitstring_to_list_ab( string, list, n_elements, Nint)
@ -428,7 +438,6 @@ subroutine bitstring_to_list_ab( string, list, n_elements, Nint)
  enddo

 end
-
 subroutine bitstring_to_list_ab_old( string, list, n_elements, Nint)
  use bitmasks
  implicit none
@ -2030,6 +2039,112 @@ subroutine get_occ_from_key(key,occ,Nint)
 end


+subroutine get_double_excitation_phase_new(det1,det2,exc,phase,Nint)
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Phase of a single or double excitation whose operators are already
+  ! stored in exc.
+  !
+  ! det1  : determinant whose set bits are counted
+  ! det2  : not referenced (kept so that the argument list is unchanged)
+  ! exc   : exc(0,1,ispin) selects the case handled for spin ispin (0, 1 or 2);
+  !         exc(l,1,ispin) and exc(l,2,ispin) are the two orbital indices of
+  !         the l-th excitation of that spin
+  ! phase : 1.d0 when the permutation count is even, -1.d0 when it is odd
+  END_DOC
+  
+  integer, intent(in)            :: Nint
+  integer(bit_kind), intent(in)  :: det1(Nint,2)
+  integer(bit_kind), intent(in)  :: det2(Nint,2)
+  integer, intent(in)            :: exc(0:2,2,2)
+  double precision, intent(out)  :: phase
+  integer                        :: l, ispin
+  integer                        :: nperm
+  integer                        :: i,j,k,m,n
+  integer                        :: high, low
+  integer                        :: a,b,c,d
+  double precision, parameter    :: phase_dble(0:1) = (/ 1.d0, -1.d0 /)
+
+  ASSERT (Nint > 0)
+  nperm = 0
+  do ispin = 1,2
+    select case (exc(0,1,ispin))
+      case(0)
+        cycle
+
+      case(1)
+
+        ! NOTE(review): assumes orbital o is stored in bit o-1, so that
+        ! low and high are the 0-based bounds [low,high) of the bits lying
+        ! strictly between the two orbitals -- confirm against the encoder.
+        high = max(exc(1,1,ispin), exc(1,2,ispin))-1
+        low  = min(exc(1,1,ispin), exc(1,2,ispin))
+
+        ASSERT (low >= 0)
+        ASSERT (high > 0)
+
+        ! det1 is indexed from 1: the word index needs the +1 offset
+        k = ishft(high,-bit_kind_shift)+1
+        j = ishft(low,-bit_kind_shift)+1
+        m = iand(high,bit_kind_size-1)
+        n = iand(low,bit_kind_size-1)
+        
+        if (j==k) then
+          ! Both bounds in the same word: bits n..m-1
+          nperm = nperm + popcnt(iand(det1(j,ispin),           &
+              iand( ishft(1_bit_kind,m)-1_bit_kind,            &
+                  not(ishft(1_bit_kind,n))+1_bit_kind)) )
+        else
+          ! Bits >= n of word j, bits < m of word k, and every word between
+          nperm = nperm + popcnt(                                    &
+               iand(det1(j,ispin),                                   &
+                    iand(not(0_bit_kind),                            &
+                         (not(ishft(1_bit_kind,n)) + 1_bit_kind) ))) &
+               + popcnt(iand(det1(k,ispin),                          &
+                             (ishft(1_bit_kind,m) - 1_bit_kind ) ))
+
+          do i=j+1,k-1
+            nperm = nperm + popcnt(det1(i,ispin))
+          end do
+
+        endif
+        
+      case (2)
+        
+        do l=1,2
+          high = max(exc(l,1,ispin), exc(l,2,ispin))-1
+          low  = min(exc(l,1,ispin), exc(l,2,ispin))
+
+          ASSERT (low > 0)
+          ASSERT (high > 0)
+
+          ! det1 is indexed from 1: the word index needs the +1 offset
+          k = ishft(high,-bit_kind_shift)+1
+          j = ishft(low,-bit_kind_shift)+1
+          m = iand(high,bit_kind_size-1)
+          n = iand(low,bit_kind_size-1)
+          
+          if (j==k) then
+            nperm = nperm + popcnt(iand(det1(j,ispin),           &
+                iand( ishft(1_bit_kind,m)-1_bit_kind,            &
+                  not(ishft(1_bit_kind,n))+1_bit_kind)) )
+          else
+            nperm = nperm + popcnt(                                    &
+                 iand(det1(j,ispin),                                   &
+                      iand(not(0_bit_kind),                            &
+                           (not(ishft(1_bit_kind,n)) + 1_bit_kind) ))) &
+                 + popcnt(iand(det1(k,ispin),                          &
+                               (ishft(1_bit_kind,m) - 1_bit_kind ) ))
+    
+            do i=j+1,k-1
+              nperm = nperm + popcnt(det1(i,ispin))
+            end do
+    
+          endif
+          
+        enddo
+        
+        ! One extra permutation when the two index intervals interleave
+        a = min(exc(1,1,ispin), exc(1,2,ispin))
+        b = max(exc(1,1,ispin), exc(1,2,ispin))
+        c = min(exc(2,1,ispin), exc(2,2,ispin))
+        d = max(exc(2,1,ispin), exc(2,2,ispin))
+        if (c>a .and. c<b .and. d>b) then
+          nperm = nperm + 1
+        endif
+        ! Both excitations carry this spin: the other spin is not examined
+        exit
+    end select
+
+  enddo
+  phase = phase_dble(iand(nperm,1))
+end
+
+
+
 subroutine get_double_excitation_phase(det1,det2,exc,phase,Nint)
  use bitmasks
  implicit none
@ -2315,6 +2430,356 @@ subroutine decode_exc_spin(exc,h1,p1,h2,p2)
  end select
 end

+subroutine get_excitation_degree_spin_new(key1,key2,degree,Nint)
+  use bitmasks
+  include 'Utils/constants.include.F'
+  implicit none
+  BEGIN_DOC
+  ! Excitation degree between two determinants: half the number of bits
+  ! that differ between key1 and key2
+  END_DOC
+  integer, intent(in)            :: Nint
+  integer(bit_kind), intent(in)  :: key1(Nint)
+  integer(bit_kind), intent(in)  :: key2(Nint)
+  integer, intent(out)           :: degree
+  
+  integer(bit_kind)              :: diff(N_int_max)
+  integer                        :: iword
+  
+  ASSERT (Nint > 0)
+  
+  if (Nint == 1) then
+    ! Single word: no temporary needed
+    degree = popcnt(xor( key1(1), key2(1)))
+
+  else if (Nint == 2) then
+    degree = popcnt(xor( key1(1), key2(1))) +                        &
+             popcnt(xor( key1(2), key2(2)))
+
+  else
+    ! General case: XOR word by word, then count all the set bits
+    do iword=1,Nint
+      diff(iword) = xor( key1(iword), key2(iword))
+    enddo
+    degree = sum(popcnt(diff(1:Nint)))
+
+  endif
+
+  ! Each excitation flips two bits (one hole, one particle)
+  degree = ishft(degree,-1)
+  
+end
+
+
+subroutine get_excitation_spin_new(det1,det2,exc,degree,phase,Nint)
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Excitation operators connecting two determinants, with the phase.
+  !
+  ! Layout of exc(number,hole/particle) :
+  !   exc(0,1) = number of holes
+  !   exc(0,2) = number of particles
+  !   exc(1,1) = first hole
+  !   exc(1,2) = first particle
+  !
+  ! degree is set to -1 when the excitation degree is 3 or more.
+  ! exc and phase are assigned only for degree 1 and degree 2.
+  END_DOC
+  integer, intent(in)            :: Nint
+  integer(bit_kind), intent(in)  :: det1(Nint)
+  integer(bit_kind), intent(in)  :: det2(Nint)
+  integer, intent(out)           :: exc(0:2,2)
+  integer, intent(out)           :: degree
+  double precision, intent(out)  :: phase
+  
+  ASSERT (Nint > 0)
+  
+  !DIR$ FORCEINLINE
+  call get_excitation_degree_spin(det1,det2,degree,Nint)
+
+  if (degree >= 3) then
+    ! Beyond a double excitation: only flag it
+    degree = -1
+
+  else if (degree == 2) then
+    call get_double_excitation_spin(det1,det2,exc,phase,Nint)
+
+  else if (degree == 1) then
+    call get_mono_excitation_spin(det1,det2,exc,phase,Nint)
+
+  endif
+  ! degree == 0 : nothing else to do
+end
+
+subroutine decode_exc_spin_new(exc,h1,p1,h2,p2)
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Unpacks an exc array into individual hole and particle indices.
+  ! h1,h2 : Holes
+  ! p1,p2 : Particles
+  ! Entries that are not present in exc are returned as 0.
+  END_DOC
+  integer, intent(in)            :: exc(0:2,2)
+  integer, intent(out)           :: h1,h2,p1,p2
+  integer                        :: n_holes
+  
+  n_holes = exc(0,1)
+
+  ! Start from "nothing excited" and fill in what exc provides
+  h1 = 0
+  p1 = 0
+  h2 = 0
+  p2 = 0
+
+  if ( (n_holes == 1).or.(n_holes == 2) ) then
+    h1 = exc(1,1)
+    p1 = exc(1,2)
+  endif
+
+  if (n_holes == 2) then
+    h2 = exc(2,1)
+    p2 = exc(2,2)
+  endif
+end
+
+
+subroutine get_double_excitation_spin_new(det1,det2,exc,phase,Nint)
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Returns the two excitation operators between two doubly excited spin-determinants
+  ! and the phase
+  !
+  ! exc(0,1) / exc(0,2) : number of holes / particles found
+  ! exc(l,1) / exc(l,2) : l-th hole / particle, numbered from 1
+  ! phase               : 1.d0 for an even permutation count, -1.d0 for an odd one
+  END_DOC
+  integer, intent(in)            :: Nint
+  integer(bit_kind), intent(in)  :: det1(Nint)
+  integer(bit_kind), intent(in)  :: det2(Nint)
+  integer, intent(out)           :: exc(0:2,2)
+  double precision, intent(out)  :: phase
+  integer                        :: tz
+  integer                        :: l, idx_hole, idx_particle, ishift
+  integer                        :: nperm
+  integer                        :: i,j,k,m,n
+  integer                        :: high, low
+  integer                        :: a,b,c,d
+  integer(bit_kind)              :: hole, particle, tmp
+  double precision, parameter    :: phase_dble(0:1) = (/ 1.d0, -1.d0 /)
+  
+  ASSERT (Nint > 0)
+  nperm = 0
+  exc(0,1) = 0
+  exc(0,2) = 0
+  
+  idx_particle = 0
+  idx_hole = 0
+  ! ishift converts a 0-based bit position in word l into a 1-based orbital index
+  ishift = 1-bit_kind_size
+  do l=1,Nint
+    ishift = ishift + bit_kind_size
+    if (det1(l) == det2(l)) then
+      cycle
+    endif
+    tmp = xor( det1(l), det2(l) )
+    particle = iand(tmp, det2(l))   ! set in det2 only
+    hole     = iand(tmp, det1(l))   ! set in det1 only
+    do while (particle /= 0_bit_kind)
+      tz = trailz(particle)
+      idx_particle = idx_particle + 1
+      exc(0,2) = exc(0,2) + 1
+      exc(idx_particle,2) = tz+ishift
+      particle = iand(particle,particle-1_bit_kind)   ! clear the lowest set bit
+    enddo
+    if (iand(exc(0,1),exc(0,2))==2) then  ! exc(0,1)==2 and exc(0,2)==2
+      exit
+    endif
+    do while (hole /= 0_bit_kind)
+      tz = trailz(hole)
+      idx_hole = idx_hole + 1
+      exc(0,1) = exc(0,1) + 1
+      exc(idx_hole,1) = tz+ishift
+      hole = iand(hole,hole-1_bit_kind)               ! clear the lowest set bit
+    enddo
+    if (iand(exc(0,1),exc(0,2))==2) then ! exc(0,1)==2 and exc(0,2)==2
+      exit
+    endif
+  enddo
+  
+  select case (exc(0,1))
+      
+    case(1)
+
+      ! Orbital o sits in bit o-1, so [low,high) are the 0-based bit
+      ! positions lying strictly between the hole and the particle
+      high = max(exc(1,1), exc(1,2))-1
+      low  = min(exc(1,1), exc(1,2))
+
+      ASSERT (low >= 0)
+      ASSERT (high > 0)
+
+      ! det1 is indexed from 1: the word index needs the +1 offset
+      k = ishft(high,-bit_kind_shift)+1
+      j = ishft(low,-bit_kind_shift)+1
+      m = iand(high,bit_kind_size-1)
+      n = iand(low,bit_kind_size-1)
+      
+      if (j==k) then
+        ! Both bounds in the same word: bits n..m-1
+        nperm = nperm + popcnt(iand(det1(j),                 &
+            iand( ishft(1_bit_kind,m)-1_bit_kind,            &
+                  not(ishft(1_bit_kind,n))+1_bit_kind)) )
+      else
+        ! Bits >= n of word j, bits < m of word k, and every word between
+        nperm = nperm + popcnt(                                    &
+             iand(det1(j),                                         &
+                  iand(not(0_bit_kind),                            &
+                       (not(ishft(1_bit_kind,n)) + 1_bit_kind) ))) &
+             + popcnt(iand(det1(k),                                &
+                           (ishft(1_bit_kind,m) - 1_bit_kind ) ))
+
+        do i=j+1,k-1
+          nperm = nperm + popcnt(det1(i))
+        end do
+
+      endif
+      
+    case (2)
+      
+      do l=1,2
+        high = max(exc(l,1), exc(l,2))-1
+        low  = min(exc(l,1), exc(l,2))
+
+        ASSERT (low > 0)
+        ASSERT (high > 0)
+
+        ! det1 is indexed from 1: the word index needs the +1 offset
+        k = ishft(high,-bit_kind_shift)+1
+        j = ishft(low,-bit_kind_shift)+1
+        m = iand(high,bit_kind_size-1)
+        n = iand(low,bit_kind_size-1)
+        
+        if (j==k) then
+          nperm = nperm + popcnt(iand(det1(j),                 &
+              iand( ishft(1_bit_kind,m)-1_bit_kind,            &
+                  not(ishft(1_bit_kind,n))+1_bit_kind)) )
+        else
+          nperm = nperm + popcnt(                                    &
+               iand(det1(j),                                         &
+                    iand(not(0_bit_kind),                            &
+                         (not(ishft(1_bit_kind,n)) + 1_bit_kind) ))) &
+               + popcnt(iand(det1(k),                                &
+                             (ishft(1_bit_kind,m) - 1_bit_kind ) ))
+  
+          do i=j+1,k-1
+            nperm = nperm + popcnt(det1(i))
+          end do
+  
+        endif
+        
+      enddo
+        
+      ! One extra permutation when the two index intervals interleave
+      a = min(exc(1,1), exc(1,2))
+      b = max(exc(1,1), exc(1,2))
+      c = min(exc(2,1), exc(2,2))
+      d = max(exc(2,1), exc(2,2))
+      if (c>a .and. c<b .and. d>b) then
+        nperm = nperm + 1
+      endif
+  end select
+  
+  phase = phase_dble(iand(nperm,1))
+  
+end
+
+subroutine get_mono_excitation_spin_new(det1,det2,exc,phase,Nint)
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Returns the excitation operator between two singly excited determinants and the phase
+  !
+  ! exc(0,1) / exc(0,2) : 1 once a hole / particle has been found
+  ! exc(1,1) / exc(1,2) : hole / particle, numbered from 1
+  ! phase               : 1.d0 for an even permutation count, -1.d0 for an odd one.
+  !                       It is not assigned when no hole/particle pair is found.
+  END_DOC
+  integer, intent(in)            :: Nint
+  integer(bit_kind), intent(in)  :: det1(Nint)
+  integer(bit_kind), intent(in)  :: det2(Nint)
+  integer, intent(out)           :: exc(0:2,2)
+  double precision, intent(out)  :: phase
+  integer                        :: tz
+  integer                        :: l, ishift
+  integer                        :: nperm
+  integer                        :: i,j,k,m,n
+  integer                        :: high, low
+  integer(bit_kind)              :: hole, particle, tmp
+  double precision, parameter    :: phase_dble(0:1) = (/ 1.d0, -1.d0 /)
+  
+  ASSERT (Nint > 0)
+  nperm = 0
+  exc(0,1) = 0
+  exc(0,2) = 0
+  
+  ! ishift converts a 0-based bit position in word l into a 1-based orbital index
+  ishift = 1-bit_kind_size
+  do l=1,Nint
+    ishift = ishift + bit_kind_size
+    if (det1(l) == det2(l)) then
+      cycle
+    endif
+    tmp = xor( det1(l), det2(l) )
+    particle = iand(tmp, det2(l))   ! set in det2 only
+    hole     = iand(tmp, det1(l))   ! set in det1 only
+    if (particle /= 0_bit_kind) then
+      tz = trailz(particle)
+      exc(0,2) = 1
+      exc(1,2) = tz+ishift
+    endif
+    if (hole /= 0_bit_kind) then
+      tz = trailz(hole)
+      exc(0,1) = 1
+      exc(1,1) = tz+ishift
+    endif
+    
+    if ( iand(exc(0,1),exc(0,2)) /= 1) then  ! exc(0,1)/=1 or exc(0,2)/=1
+      ! Keep scanning until both the hole and the particle are known
+      cycle
+    endif
+    
+    ! Orbital o sits in bit o-1, so [low,high) are the 0-based bit
+    ! positions lying strictly between the hole and the particle
+    high = max(exc(1,1), exc(1,2))-1
+    low  = min(exc(1,1), exc(1,2))
+
+    ASSERT (low >= 0)
+    ASSERT (high > 0)
+
+    ! det1 is indexed from 1: the word index needs the +1 offset
+    k = ishft(high,-bit_kind_shift)+1
+    j = ishft(low,-bit_kind_shift)+1
+    m = iand(high,bit_kind_size-1)
+    n = iand(low,bit_kind_size-1)
+    
+    if (j==k) then
+      ! Both bounds in the same word: bits n..m-1
+      nperm = nperm + popcnt(iand(det1(j),                 &
+          iand( ishft(1_bit_kind,m)-1_bit_kind,            &
+                  not(ishft(1_bit_kind,n))+1_bit_kind)) )
+    else
+      ! Bits >= n of word j, bits < m of word k, and every word between
+      nperm = nperm + popcnt(                                    &
+           iand(det1(j),                                         &
+                iand(not(0_bit_kind),                            &
+                     (not(ishft(1_bit_kind,n)) + 1_bit_kind) ))) &
+           + popcnt(iand(det1(k),                                &
+                         (ishft(1_bit_kind,m) - 1_bit_kind ) ))
+
+      do i=j+1,k-1
+        nperm = nperm + popcnt(det1(i))
+      end do
+
+    endif
+      
+    phase = phase_dble(iand(nperm,1))
+    return
+    
+  enddo
+end

 subroutine get_double_excitation_spin(det1,det2,exc,phase,Nint)
  use bitmasks
--- a/src/Determinants/spindeterminants.irp.f
+++ b/src/Determinants/spindeterminants.irp.f
@ -365,8 +365,9 @@ end
 do k=1,N_det
   i = psi_bilinear_matrix_rows(k)
   j = psi_bilinear_matrix_columns(k)
+   f = 0.d0
   do l=1,N_states
-    f = psi_bilinear_matrix_values(k,l)*psi_bilinear_matrix_values(k,l)
+    f += psi_bilinear_matrix_values(k,l)*psi_bilinear_matrix_values(k,l)
   enddo
   det_alpha_norm(i) += f
   det_beta_norm(j)  += f
@ -690,7 +691,7 @@ subroutine generate_all_alpha_beta_det_products
  integer, external              :: get_index_in_psi_det_sorted_bit
  integer(bit_kind), allocatable :: tmp_det(:,:,:)
  logical, external              :: is_in_wavefunction
-  integer, external              :: omp_get_thread_num
+  PROVIDE H_apply_buffer_allocated

  !$OMP PARALLEL DEFAULT(NONE) SHARED(psi_coef_sorted_bit,N_det_beta_unique,&
      !$OMP N_det_alpha_unique, N_int, psi_det_alpha_unique, psi_det_beta_unique,&
@ -712,7 +713,7 @@ subroutine generate_all_alpha_beta_det_products
    enddo
    call fill_H_apply_buffer_no_selection(l-1, tmp_det, N_int, iproc)
  enddo
-  !$OMP END DO NOWAIT
+  !$OMP END DO 
  deallocate(tmp_det)
  !$OMP END PARALLEL
  call copy_H_apply_buffer_to_wf
--- a/src/Determinants/two_body_dm_map.irp.f
+++ b/src/Determinants/two_body_dm_map.irp.f
@ -187,7 +187,7 @@ subroutine add_values_to_two_body_dm_map(mask_ijkl)
  print*,'n_elements = ',n_elements
  call insert_into_two_body_dm_ab_map(n_elements,buffer_i,buffer_value,&
      real(mo_integrals_threshold,integral_kind))
-  call map_unique(two_body_dm_ab_map)
+  call map_merge(two_body_dm_ab_map)

  deallocate(buffer_i,buffer_value)

--- a/src/FourIdx/NEEDED_CHILDREN_MODULES
+++ b/src/FourIdx/NEEDED_CHILDREN_MODULES
@ -0,0 +1 @@
+ZMQ
--- a/src/FourIdx/README.rst
+++ b/src/FourIdx/README.rst
@ -0,0 +1,6 @@
+=======
+FourIdx 
+=======
+
+Four-index transformation.
+
--- a/src/FourIdx/four_index.irp.f
+++ b/src/FourIdx/four_index.irp.f
@ -0,0 +1,180 @@
+subroutine four_index_transform(map_a,map_c,matrix_B,LDB,            &
+      i_start, j_start, k_start, l_start,                            &
+      i_end  , j_end  , k_end  , l_end  ,                            &
+      a_start, b_start, c_start, d_start,                            &
+      a_end  , b_end  , c_end  , d_end  )
+  use map_module
+  use mmap_module
+  implicit none
+  BEGIN_DOC
+! Performs a four-index transformation of map_a(N^4) into map_c(M^4) using b(NxM)
+! C_{abcd} = \sum_{ijkl} A_{ijkl}.B_{ia}.B_{jb}.B_{kc}.B_{ld}
+! Loops run over *_start->*_end
+!
+! The non-zero elements of map_a are first copied, one slab per l, into a
+! temporary memory-mapped file. Each d is then built by contracting the
+! slabs with matrix_B through three DGEMM calls, and the result is appended
+! to map_c.
+  END_DOC
+  type(map_type), intent(in)     :: map_a
+  type(map_type), intent(inout)  :: map_c
+  integer, intent(in)            :: LDB
+  double precision, intent(in)   :: matrix_B(LDB,*)
+  integer, intent(in)            :: i_start, j_start, k_start, l_start
+  integer, intent(in)            :: i_end  , j_end  , k_end  , l_end
+  integer, intent(in)            :: a_start, b_start, c_start, d_start
+  integer, intent(in)            :: a_end  , b_end  , c_end  , d_end
+
+  double precision, allocatable  :: T(:,:,:), U(:,:,:), V(:,:,:)
+  integer                        :: i_max, j_max, k_max, l_max
+  integer                        :: i_min, j_min, k_min, l_min
+  integer                        :: i, j, k, l
+  integer                        :: a, b, c, d
+  double precision, external     :: get_ao_bielec_integral
+  integer(key_kind)              :: idx
+  real(integral_kind)            :: tmp
+  integer(key_kind), allocatable :: key(:)
+  real(integral_kind), allocatable :: value(:)
+
+  ASSERT (k_start == i_start)
+  ASSERT (l_start == j_start)
+  ASSERT (a_start == c_start)
+  ASSERT (b_start == d_start)
+
+  i_min = min(i_start,a_start)
+  i_max = max(i_end  ,a_end  )
+  j_min = min(j_start,b_start)
+  j_max = max(j_end  ,b_end  )
+  k_min = min(k_start,c_start)
+  k_max = max(k_end  ,c_end  )
+  l_min = min(l_start,d_start)
+  l_max = max(l_end  ,d_end  )
+
+  ASSERT (0 < i_max)
+  ASSERT (0 < j_max)
+  ASSERT (0 < k_max)
+  ASSERT (0 < l_max)
+  ASSERT (LDB >= i_max)
+  ASSERT (LDB >= j_max)
+  ASSERT (LDB >= k_max)
+  ASSERT (LDB >= l_max)
+
+  ! Create a temporary memory-mapped file
+  ! a_array(1:3,n,l) hold (i,j,k) and a_array(4,n,l) holds the bit pattern of
+  ! the integral value; a_array(1,1,l) holds the index of the last used slot.
+  ! NOTE(review): entries are stored in slots 2..count+1 while the second
+  ! extent is ni*nj*nk, so a slab with no zero integral would need one more
+  ! slot than is mapped -- confirm that this cannot happen.
+  integer                        :: fd
+  type(c_ptr)                    :: c_pointer
+  integer*8, pointer             :: a_array(:,:,:)
+  call mmap(trim(ezfio_filename)//'/work/four_idx',                  &
+      (/ 4_8,int(i_end-i_start+1,8),int(j_end-j_start+1,8),int(k_end-k_start+1,8), int(l_end-l_start+1,8) /), 8, fd, .False., c_pointer)
+  call c_f_pointer(c_pointer, a_array, (/ 4, (i_end-i_start+1)*(j_end-j_start+1)*(k_end-k_start+1), l_end-l_start+1 /))
+
+
+  !$OMP PARALLEL DEFAULT(NONE) SHARED(a_array,c_pointer,fd,          &
+      !$OMP  a_start,a_end,b_start,b_end,c_start,c_end,d_start,d_end,&
+      !$OMP  i_start,i_end,j_start,j_end,k_start,k_end,l_start,l_end,&
+      !$OMP  i_min,i_max,j_min,j_max,k_min,k_max,l_min,l_max,        &
+      !$OMP  map_a,map_c,matrix_B)                                   &
+      !$OMP  PRIVATE(key,value,T,U,V,i,j,k,l,idx,   &
+      !$OMP  a,b,c,d,tmp)
+  allocate( key(i_max*j_max*k_max), value(i_max*j_max*k_max) )
+  allocate( U(a_start:a_end, c_start:c_end, b_start:b_end) )
+
+
+  ! Step 1 : gather the non-zero A_{ijkl} of every l into a_array
+  !$OMP DO SCHEDULE(dynamic,4)
+  do l=l_start,l_end
+    a = 1
+    do j=j_start,j_end
+      do k=k_start,k_end
+        do i=i_start,i_end
+          call bielec_integrals_index(i,j,k,l,idx)
+          call map_get(map_a,idx,tmp)
+          if (tmp /= 0.d0) then
+            a = a+1
+            a_array(1,a,l-l_start+1) = i
+            a_array(2,a,l-l_start+1) = j
+            a_array(3,a,l-l_start+1) = k
+            a_array(4,a,l-l_start+1) = transfer(dble(tmp), 1_8)
+          endif
+        enddo
+      enddo
+    enddo
+    a_array(1,1,l-l_start+1) = a
+    print *,  l
+  enddo
+  !$OMP END DO
+
+  ! Step 2 : for every d, accumulate U(a,c,b) = C_{abcd} over l
+  !$OMP DO SCHEDULE(dynamic)
+  do d=d_start,d_end
+    U = 0.d0
+    do l=l_start,l_end
+      if (dabs(matrix_B(l,d)) < 1.d-10) then
+        cycle
+      endif
+      print *,  d, l
+
+      allocate( T(i_start:i_end, k_start:k_end, j_start:j_end), &
+                V(a_start:a_end, k_start:k_end, j_start:j_end) )
+
+      ! Rebuild the dense slab T(i,k,j) = A_{ijkl}
+      T = 0.d0
+      do a=2,a_array(1,1,l-l_start+1)
+        i = a_array(1,a,l-l_start+1)
+        j = a_array(2,a,l-l_start+1)
+        k = a_array(3,a,l-l_start+1)
+        T(i, k,j) = transfer(a_array(4,a,l-l_start+1), 1.d0)
+      enddo
+
+      ! V(a,k,j) = \sum_i B(i,a) T(i,k,j)
+      call DGEMM('T','N', (a_end-a_start+1),                         &
+          (k_end-k_start+1)*(j_end-j_start+1),                       &
+          (i_end-i_start+1), 1.d0,                                   &
+          matrix_B(i_start,a_start), size(matrix_B,1),               &
+          T(i_start,k_start,j_start), size(T,1),  0.d0,              &
+          V(a_start,k_start,j_start), size(V, 1) )
+
+      deallocate(T)
+      ! The DGEMM below writes, and the loop over b reads, all of
+      ! b_start:b_end : T has to span that whole range.
+      allocate( T(a_start:a_end, k_start:k_end, b_start:b_end) )
+
+      ! T(a,k,b) = \sum_j V(a,k,j) B(j,b)
+      call DGEMM('N','N', (a_end-a_start+1)*(k_end-k_start+1),       &
+              (b_end-b_start+1),                                     &
+              (j_end-j_start+1), 1.d0,                               &
+              V(a_start,k_start,j_start), size(V,1)*size(V,2),       &
+              matrix_B(j_start,b_start), size(matrix_B,1),0.d0,      &
+              T(a_start,k_start,b_start), size(T,1)*size(T,2) )
+
+      deallocate(V)
+
+      ! U(a,c,b) += B(l,d) * \sum_k T(a,k,b) B(k,c)
+      do b=b_start,b_end
+        call DGEMM('N','N', (a_end-a_start+1), (c_end-c_start+1),    &
+            (k_end-k_start+1), matrix_B(l, d),                   &
+            T(a_start,k_start,b), size(T,1),                     &
+            matrix_B(k_start,c_start), size(matrix_B,1), 1.d0,   &
+            U(a_start,c_start,b), size(U,1) )
+      enddo
+
+      deallocate(T)
+
+    enddo
+
+    ! Keep only the elements above the 1.d-15 threshold
+    idx = 0_8
+    do b=b_start,b_end
+      do c=c_start,c_end
+        do a=a_start,a_end
+          if (dabs(U(a,c,b)) < 1.d-15) then
+            cycle
+          endif
+          idx = idx+1_8
+          call bielec_integrals_index(a,b,c,d,key(idx))
+          value(idx) = U(a,c,b)
+        enddo
+      enddo
+    enddo
+
+    ! map_c is shared by all threads
+    !$OMP CRITICAL
+    call map_append(map_c, key, value, idx) 
+    call map_sort(map_c)
+    !$OMP END CRITICAL
+
+
+  enddo
+  !$OMP END DO
+
+  deallocate(key,value,U)
+  !$OMP END PARALLEL
+
+  call munmap( &
+      (/ 4_8,int(i_end-i_start+1,8),int(j_end-j_start+1,8),int(k_end-k_start+1,8), int(l_end-l_start+1,8) /), 8, fd, c_pointer)
+
+end
--- a/src/FourIdx/four_index_block.irp.f
+++ b/src/FourIdx/four_index_block.irp.f
@ -0,0 +1,300 @@
+subroutine four_index_transform_block(map_a,map_c,matrix_B,LDB,            &
+      i_start, j_start, k_start, l_start,                            &
+      i_end  , j_end  , k_end  , l_end  ,                            &
+      a_start, b_start, c_start, d_start,                            &
+      a_end  , b_end  , c_end  , d_end  )
+  use map_module
+  use mmap_module
+  implicit none
+  BEGIN_DOC
+! Performs a four-index transformation of map_a(N^4) into map_c(M^4) using b(NxM)
+! C_{abcd} = \sum_{ijkl} A_{ijkl}.B_{ia}.B_{jb}.B_{kc}.B_{ld}
+! Loops run over *_start->*_end
+!
+! The range of l is processed in npass blocks. For every block the non-zero
+! A_{ijkl} with i<=k are packed into a_array_ik / a_array_j / a_array_value,
+! then each d is contracted with matrix_B and merged into map_c.
+  END_DOC
+  type(map_type), intent(in)     :: map_a
+  type(map_type), intent(inout)  :: map_c
+  integer, intent(in)            :: LDB
+  double precision, intent(in)   :: matrix_B(LDB,*)
+  integer, intent(in)            :: i_start, j_start, k_start, l_start
+  integer, intent(in)            :: i_end  , j_end  , k_end  , l_end
+  integer, intent(in)            :: a_start, b_start, c_start, d_start
+  integer, intent(in)            :: a_end  , b_end  , c_end  , d_end
+
+  double precision, allocatable  :: T(:,:), U(:,:,:), V(:,:)
+  double precision, allocatable  :: T2d(:,:), V2d(:,:)
+  integer                        :: i_max, j_max, k_max, l_max
+  integer                        :: i_min, j_min, k_min, l_min
+  integer                        :: i, j, k, l, ik, ll
+  integer                        :: l_start_block, l_end_block, l_block
+  integer                        :: a, b, c, d
+  integer                        :: p, q
+  double precision, external     :: get_ao_bielec_integral
+  integer*8                      :: ii
+  integer(key_kind)              :: idx
+  real(integral_kind)            :: tmp
+  integer(key_kind), allocatable :: key(:)
+  real(integral_kind), allocatable :: value(:)
+  integer*8, allocatable         :: l_pointer(:)
+
+  ASSERT (k_start == i_start)
+  ASSERT (l_start == j_start)
+  ASSERT (a_start == c_start)
+  ASSERT (b_start == d_start)
+
+  i_min = min(i_start,a_start)
+  i_max = max(i_end  ,a_end  )
+  j_min = min(j_start,b_start)
+  j_max = max(j_end  ,b_end  )
+  k_min = min(k_start,c_start)
+  k_max = max(k_end  ,c_end  )
+  l_min = min(l_start,d_start)
+  l_max = max(l_end  ,d_end  )
+
+  ASSERT (0 < i_max)
+  ASSERT (0 < j_max)
+  ASSERT (0 < k_max)
+  ASSERT (0 < l_max)
+  ASSERT (LDB >= i_max)
+  ASSERT (LDB >= j_max)
+  ASSERT (LDB >= k_max)
+  ASSERT (LDB >= l_max)
+
+  integer*4, allocatable         :: a_array_ik(:)
+  integer*4, allocatable         :: a_array_j(:)
+  double precision, allocatable  :: a_array_value(:)
+
+  integer*8 :: new_size
+  new_size = max(1024_8, 5_8 * map_a % n_elements )
+
+  allocate(a_array_ik(new_size), a_array_j(new_size), a_array_value(new_size))
+
+  integer :: ipass, npass
+  integer*8 :: tempspace
+
+  tempspace = (new_size * 16_8) / (1024_8 * 1024_8)
+  ! 2 GiB of scratch space per pass. At least one pass and a block length
+  ! of at least one, so that a single-l range neither divides by zero nor
+  ! yields a zero loop stride.
+  npass = int(max(1_8, min(int(l_end-l_start,8),1_8 + tempspace / 2048_8)))
+  l_block = max(1, (l_end-l_start+1)/npass)
+
+  ipass = 0
+  do l_start_block = l_start, l_end, l_block
+    ipass = ipass+1
+    print *,  'Pass ', ipass
+    l_end_block = min(l_end, l_start_block+l_block-1)
+
+    ! l_pointer(l) is the position in a_array_* of the first entry of l;
+    ! l_pointer(l_end_block+1) is one past the last entry of the block.
+    allocate(l_pointer(l_start_block:l_end_block+1), value((i_max*k_max)) )
+    ii = 1_8
+    !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,j,k,l,ik,idx) 
+    do l=l_start_block,l_end_block
+      !$OMP SINGLE
+      l_pointer(l) = ii
+      !$OMP END SINGLE
+      do j=j_start,j_end
+        ! Fetch the (i<=k) triangle of A_{ijkl} in parallel ...
+        !$OMP DO SCHEDULE(static,16)
+        do k=k_start,k_end
+          do i=i_start,k
+            ik = (i-i_start+1) + ishft( (k-k_start)*(k-k_start+1), -1 )
+            call bielec_integrals_index(i,j,k,l,idx)
+            call map_get(map_a,idx,value(ik))
+          enddo
+        enddo
+        !$OMP END DO
+
+        ! ... then let one thread append the non-zero values in order
+        !$OMP SINGLE
+        ik=0
+        do k=k_start,k_end
+          do i=i_start,k
+            ik = ik+1
+            tmp=value(ik)
+            if (tmp /= 0.d0) then
+              a_array_ik(ii) = ik
+              a_array_j(ii)  = j
+              a_array_value(ii) = tmp
+              ii=ii+1_8
+            endif
+          enddo
+        enddo
+        !$OMP END SINGLE
+      enddo
+    enddo
+    !$OMP SINGLE
+    a_array_ik(ii) = 0
+    a_array_j(ii)  = 0
+    a_array_value(ii) = 0.d0
+    l_pointer(l_end_block+1) = ii
+    !$OMP END SINGLE
+    !$OMP END PARALLEL  
+    deallocate(value)
+
+
+    !$OMP PARALLEL DEFAULT(NONE) SHARED(a_array_ik,a_array_j,a_array_value,&
+        !$OMP  a_start,a_end,b_start,b_end,c_start,c_end,d_start,d_end,&
+        !$OMP  i_start,i_end,j_start,j_end,k_start,k_end,l_start_block,l_end_block,&
+        !$OMP  i_min,i_max,j_min,j_max,k_min,k_max,l_min,l_max,      &
+        !$OMP  map_c,matrix_B,l_pointer)                             &
+        !$OMP  PRIVATE(key,value,T,U,V,i,j,k,l,idx,ik,ll,            &
+        !$OMP  a,b,c,d,tmp,T2d,V2d,ii,p,q)
+    allocate( key(i_max*j_max*k_max), value(i_max*j_max*k_max) )
+    allocate( U(a_start:a_end, c_start:c_end, b_start:b_end) )
+
+
+
+    allocate( T2d((i_end-i_start+1)*(k_end-k_start+2)/2, j_start:j_end), &
+              V2d((i_end-i_start+1)*(k_end-k_start+2)/2, b_start:b_end), &
+              V(i_start:i_end, k_start:k_end), &
+              T(k_start:k_end, a_start:a_end))
+
+
+    !$OMP DO SCHEDULE(dynamic)
+    do d=d_start,d_end
+      U = 0.d0
+      do l=l_start_block,l_end_block
+        if (dabs(matrix_B(l,d)) < 1.d-10) then
+          cycle
+        endif
+        
+        ! Unpack the entries of l into T2d(ik,j). The scan is bounded by
+        ! l_pointer(l+1) so that it never consumes entries of the next l
+        ! that happen to carry the same j.
+        ii=l_pointer(l)
+        do j=j_start,j_end
+          !DIR$ VECTOR NONTEMPORAL
+          T2d(:,j) = 0.d0
+          !DIR$ IVDEP
+          do while (ii < l_pointer(l+1))
+            if (a_array_j(ii) /= j) exit
+            T2d(a_array_ik(ii),j) = a_array_value(ii)
+            ii = ii + 1_8
+          enddo
+        enddo
+
+        ! V2d(ik,b) = \sum_j T2d(ik,j) B(j,b)   for b <= d
+        call DGEMM('N','N', ishft( (i_end-i_start+1)*(i_end-i_start+2), -1),&
+            (d-b_start+1),                                             &
+            (j_end-j_start+1), 1.d0,                                   &
+            T2d(1,j_start), size(T2d,1),                               &
+            matrix_B(j_start,b_start), size(matrix_B,1),0.d0,          &
+            V2d(1,b_start), size(V2d,1) )
+
+        do b=b_start,d
+          ! Expand the packed column into the upper triangle of V
+          ik = 0
+          do k=k_start,k_end
+            do i=i_start,k
+              ik = ik+1
+              V(i,k) = V2d(ik,b)
+            enddo
+          enddo
+
+  !        T = 0.d0
+  !        do a=a_start,b
+  !          do k=k_start,k_end
+  !            do i=i_start,k
+  !              T(k,a) = T(k,a) + V(i,k)*matrix_B(i,a)
+  !            enddo
+  !            do i=k+1,i_end
+  !              T(k,a) = T(k,a) + V(k,i)*matrix_B(i,a)
+  !            enddo
+  !          enddo
+  !        enddo
+          call DSYMM('L','U', (k_end-k_start+1), (b-a_start+1),        &
+              1.d0,                                                    &
+              V(i_start,k_start), size(V,1),                           &
+              matrix_B(i_start,a_start), size(matrix_B,1),0.d0,        &
+              T(k_start,a_start), size(T,1) )
+
+  !        do c=c_start,b
+  !          do a=a_start,c
+  !            do k=k_start,k_end
+  !              U(a,c,b) = U(a,c,b) + T(k,a)*matrix_B(k,c)*matrix_B(l,d)
+  !            enddo
+  !          enddo
+  !        enddo
+          call DGEMM('T','N', (b-a_start+1), (b-c_start+1),            &
+              (k_end-k_start+1), matrix_B(l, d),                       &
+              T(k_start,a_start), size(T,1),                           &
+              matrix_B(k_start,c_start), size(matrix_B,1), 1.d0,       &
+              U(a_start,c_start,b), size(U,1) )
+  !        do c=b+1,c_end
+  !          do a=a_start,b
+  !            do k=k_start,k_end
+  !              U(a,c,b) = U(a,c,b) + T(k,a)*matrix_B(k,c)*matrix_B(l,d)
+  !            enddo
+  !          enddo
+  !        enddo
+          if (b < b_end) then
+            call DGEMM('T','N', (b-a_start+1), (c_end-b),              &
+                (k_end-k_start+1), matrix_B(l, d),                     &
+                T(k_start,a_start), size(T,1),                         &
+                matrix_B(k_start,b+1), size(matrix_B,1), 1.d0,         &
+                U(a_start,b+1,b), size(U,1) )
+          endif
+        enddo
+
+      enddo
+
+      ! Collect the elements above the 1.d-15 threshold for this d
+      idx = 0_8
+
+      do b=b_start,d
+        q = b+ishft(d*d-d,-1)
+        do c=c_start,c_end
+          p = a_start+ishft(c*c-c,-1)
+          do a=a_start,min(b,c)
+            if (dabs(U(a,c,b)) < 1.d-15) then
+              cycle
+            endif
+            if ((a==b).and.(p>q)) cycle
+            p = p+1
+            idx = idx+1_8
+            call bielec_integrals_index(a,b,c,d,key(idx))
+            value(idx) = U(a,c,b)
+          enddo
+        enddo
+      enddo
+
+      ! map_c is shared by all threads
+      !$OMP CRITICAL
+      call map_update(map_c, key, value, idx,1.d-15) 
+      !$OMP END CRITICAL
+
+    enddo
+    !$OMP END DO
+
+    deallocate(key,value,V,T,U,T2d,V2d)
+    !$OMP END PARALLEL
+    call map_merge(map_c)
+
+    deallocate(l_pointer)
+  enddo
+  deallocate(a_array_ik,a_array_j,a_array_value)
+
+end
--- a/src/FourIdx/four_index_slave.irp.f.todo
+++ b/src/FourIdx/four_index_slave.irp.f.todo
@ -0,0 +1,279 @@
+subroutine four_index_transform_slave(map_a,map_c,matrix_B,LDB,            &
+      i_start, j_start, k_start, l_start,                            &
+      i_end  , j_end  , k_end  , l_end  ,                            &
+      a_start, b_start, c_start, d_start,                            &
+      a_end  , b_end  , c_end  , d_end, task_id, thread  )
+  implicit none
+  use f77_zmq
+  use map_module
+  use mmap_module
+  BEGIN_DOC
+! Performs a four-index transformation of map_a(N^4) into map_c(M^4) using b(NxM)
+! C_{abcd} = \sum_{ijkl} A_{ijkl}.B_{ia}.B_{jb}.B_{kc}.B_{ld}
+! Loops run over *_start->*_end
+!
+! Slave variant: the transformed integrals obtained for each d are not stored
+! in map_c by this routine; they are handed to four_idx_push_results together
+! with a push socket and task_id.
+!
+! map_a    : input integrals, read with map_get
+! map_c    : only passed to map_merge at the end of this routine
+! matrix_B : transformation matrix, leading dimension LDB
+! task_id  : forwarded to four_idx_push_results
+! thread   : forwarded to new_zmq_push_socket and end_zmq_push_socket
+!
+! The ranges must satisfy k_start == i_start, l_start == j_start,
+! a_start == c_start and b_start == d_start (checked with ASSERT).
+  END_DOC
+  type(map_type), intent(in)     :: map_a
+  type(map_type), intent(inout)  :: map_c
+  integer, intent(in)            :: LDB
+  double precision, intent(in)   :: matrix_B(LDB,*)
+  integer, intent(in)            :: i_start, j_start, k_start, l_start
+  integer, intent(in)            :: i_end  , j_end  , k_end  , l_end
+  integer, intent(in)            :: a_start, b_start, c_start, d_start
+  integer, intent(in)            :: a_end  , b_end  , c_end  , d_end
+  integer, intent(in)            :: task_id, thread
+
+  double precision, allocatable  :: T(:,:), U(:,:,:), V(:,:)
+  double precision, allocatable  :: T2d(:,:), V2d(:,:)
+  integer                        :: i_max, j_max, k_max, l_max
+  integer                        :: i_min, j_min, k_min, l_min
+  integer                        :: i, j, k, l, ik, ll
+  integer                        :: a, b, c, d
+  double precision, external     :: get_ao_bielec_integral
+  integer*8                      :: ii
+  integer(key_kind)              :: idx
+  real(integral_kind)            :: tmp
+  integer(key_kind), allocatable :: key(:)
+  real(integral_kind), allocatable :: value(:)
+  integer*8, allocatable         :: l_pointer(:)
+  ! One push socket per thread (created inside the parallel region).
+  integer(ZMQ_PTR)               :: zmq_socket_push
+  ! NOTE(review): return kind assumed to be ZMQ_PTR because the result is
+  ! assigned to zmq_socket_push -- confirm against the function definition.
+  integer(ZMQ_PTR), external     :: new_zmq_push_socket
+
+  ASSERT (k_start == i_start)
+  ASSERT (l_start == j_start)
+  ASSERT (a_start == c_start)
+  ASSERT (b_start == d_start)
+
+  i_min = min(i_start,a_start)
+  i_max = max(i_end  ,a_end  )
+  j_min = min(j_start,b_start)
+  j_max = max(j_end  ,b_end  )
+  k_min = min(k_start,c_start)
+  k_max = max(k_end  ,c_end  )
+  l_min = min(l_start,d_start)
+  l_max = max(l_end  ,d_end  )
+
+  ASSERT (0 < i_max)
+  ASSERT (0 < j_max)
+  ASSERT (0 < k_max)
+  ASSERT (0 < l_max)
+  ASSERT (LDB >= i_max)
+  ASSERT (LDB >= j_max)
+  ASSERT (LDB >= k_max)
+  ASSERT (LDB >= l_max)
+
+  ! Record arrays filled in pass 1: record ii holds the packed (i,k) index,
+  ! the j index and the integral value. Records are grouped by l, then by j.
+  integer*4, allocatable         :: a_array_ik(:)
+  integer*2, allocatable         :: a_array_j(:)
+  double precision, allocatable  :: a_array_value(:)
+
+  integer*8 :: new_size
+  new_size = max(1024_8, 5_8 * map_a % n_elements )
+
+  allocate(a_array_ik(new_size), a_array_j(new_size), a_array_value(new_size))
+
+
+  ! ------------------------------------------------------------------
+  ! Pass 1: for every (j,l) read the i<=k integrals from map_a and keep
+  ! the non-zero ones. l_pointer(l) is the index of the first record of l,
+  ! l_pointer(l_end+1) the index of the terminating record.
+  ! Every thread executes the l and j loops; the SINGLE sections do the
+  ! bookkeeping on the shared counter ii.
+  ! ------------------------------------------------------------------
+  allocate(l_pointer(l_start:l_end+1), value((i_max*k_max)) )
+  ii = 1_8
+  !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,j,k,l,ik,idx)
+  do l=l_start,l_end
+    !$OMP SINGLE
+    l_pointer(l) = ii
+    !$OMP END SINGLE
+    do j=j_start,j_end
+      !$OMP DO SCHEDULE(static,1)
+      do k=k_start,k_end
+        do i=i_start,k
+          ! Packed position of the pair (i,k), i<=k, in column order
+          ik = (i-i_start+1) + ishft( (k-k_start)*(k-k_start+1), -1 )
+          call bielec_integrals_index(i,j,k,l,idx)
+          call map_get(map_a,idx,value(ik))
+        enddo
+      enddo
+      !$OMP END DO
+
+      !$OMP SINGLE
+      ik=0
+      do k=k_start,k_end
+        do i=i_start,k
+          ik = ik+1
+          tmp=value(ik)
+          if (tmp /= 0.d0) then
+            a_array_ik(ii) = ik
+            a_array_j(ii)  = j
+            a_array_value(ii) = tmp
+            ii=ii+1_8
+          endif
+        enddo
+      enddo
+      !$OMP END SINGLE
+    enddo
+  enddo
+  !$OMP SINGLE
+  ! Terminating record
+  a_array_ik(ii) = 0
+  a_array_j(ii)  = 0
+  a_array_value(ii) = 0.d0
+  l_pointer(l_end+1) = ii
+  !$OMP END SINGLE
+  !$OMP END PARALLEL
+  deallocate(value)
+
+!INPUT DATA
+!open(unit=10,file='INPUT',form='UNFORMATTED')
+!write(10) i_start, j_start, i_end, j_end
+!write(10) a_start, b_start, a_end, b_end
+!write(10) LDB, mo_tot_num
+!write(10) matrix_B(1:LDB,1:mo_tot_num)
+!idx=size(a_array)
+!write(10) idx
+!write(10) a_array
+!write(10) l_pointer
+!close(10)
+!open(unit=10,file='OUTPUT',form='FORMATTED')
+! END INPUT DATA
+
+
+  ! ------------------------------------------------------------------
+  ! Pass 2: the d loop is distributed over the threads. For a given d,
+  ! U(a,c,b) accumulates the contributions of all l.
+  ! task_id and thread are only read; zmq_socket_push is per-thread.
+  ! ------------------------------------------------------------------
+  !$OMP PARALLEL DEFAULT(NONE) SHARED(a_array_ik,a_array_j,a_array_value, &
+      !$OMP  a_start,a_end,b_start,b_end,c_start,c_end,d_start,d_end,&
+      !$OMP  i_start,i_end,j_start,j_end,k_start,k_end,l_start,l_end,&
+      !$OMP  i_min,i_max,j_min,j_max,k_min,k_max,l_min,l_max,        &
+      !$OMP  map_c,matrix_B,l_pointer,task_id,thread)                &
+      !$OMP  PRIVATE(key,value,T,U,V,i,j,k,l,idx,ik,ll,   &
+      !$OMP  a,b,c,d,tmp,T2d,V2d,ii,zmq_socket_push)
+  allocate( key(i_max*j_max*k_max), value(i_max*j_max*k_max) )
+  allocate( U(a_start:a_end, c_start:c_end, b_start:b_end) )
+
+  zmq_socket_push = new_zmq_push_socket(thread)
+
+
+
+  allocate( T2d((i_end-i_start+1)*(k_end-k_start+2)/2, j_start:j_end), &
+            V2d((i_end-i_start+1)*(k_end-k_start+2)/2, b_start:b_end), &
+            V(i_start:i_end, k_start:k_end), &
+            T(k_start:k_end, a_start:a_end))
+
+
+  !$OMP DO SCHEDULE(dynamic)
+  do d=d_start,d_end
+    U = 0.d0
+    do l=l_start,l_end
+      ! Skip l when its weight B(l,d) is negligible
+      if (dabs(matrix_B(l,d)) < 1.d-10) then
+        cycle
+      endif
+
+      ! Scatter the records of l into the dense array T2d(ik,j).
+      ! The scan is bounded by l_pointer(l+1) so that it cannot run into
+      ! the records of the next l when those start with the same j.
+      ii=l_pointer(l)
+      do j=j_start,j_end
+        !DIR$ VECTOR NONTEMPORAL
+        T2d(:,j) = 0.d0
+        !DIR$ IVDEP
+        do while ( (ii < l_pointer(l+1)) .and. (j == a_array_j(ii)) )
+          T2d(a_array_ik(ii),j) = transfer(a_array_value(ii), 1.d0)
+          ii = ii + 1_8
+        enddo
+      enddo
+
+      ! V2d(ik,b) = sum_j T2d(ik,j) * B(j,b)   for b = b_start..d
+      call DGEMM('N','N', ishft( (i_end-i_start+1)*(i_end-i_start+2), -1),&
+          (d-b_start+1),                                             &
+          (j_end-j_start+1), 1.d0,                                   &
+          T2d(1,j_start), size(T2d,1),                               &
+          matrix_B(j_start,b_start), size(matrix_B,1),0.d0,          &
+          V2d(1,b_start), size(V2d,1) )
+
+      do b=b_start,d
+        ! Unpack column b of V2d into the upper triangle of V(i,k)
+        ik = 0
+        do k=k_start,k_end
+          do i=i_start,k
+            ik = ik+1
+            V(i,k) = V2d(ik,b)
+          enddo
+        enddo
+
+        ! DSYMM below replaces this reference loop
+        ! (V is symmetric, only its upper triangle is stored):
+!        T = 0.d0
+!        do a=a_start,b
+!          do k=k_start,k_end
+!            do i=i_start,k
+!              T(k,a) = T(k,a) + V(i,k)*matrix_B(i,a)
+!            enddo
+!            do i=k+1,i_end
+!              T(k,a) = T(k,a) + V(k,i)*matrix_B(i,a)
+!            enddo
+!          enddo
+!        enddo
+        call DSYMM('L','U', (k_end-k_start+1), (b-a_start+1),        &
+            1.d0,                                                    &
+            V(i_start,k_start), size(V,1),                           &
+            matrix_B(i_start,a_start), size(matrix_B,1),0.d0,        &
+            T(k_start,a_start), size(T,1) )
+
+        ! U(a,c,b) += B(l,d) * sum_k T(k,a)*B(k,c), c = c_start..b
+        ! (DGEMM fills the whole a_start..b x c_start..b rectangle)
+!        do c=c_start,b
+!          do a=a_start,c
+!            do k=k_start,k_end
+!              U(a,c,b) = U(a,c,b) + T(k,a)*matrix_B(k,c)*matrix_B(l,d)
+!            enddo
+!          enddo
+!        enddo
+        call DGEMM('T','N', (b-a_start+1), (b-c_start+1),            &
+            (k_end-k_start+1), matrix_B(l, d),                       &
+            T(k_start,a_start), size(T,1),                           &
+            matrix_B(k_start,c_start), size(matrix_B,1), 1.d0,       &
+            U(a_start,c_start,b), size(U,1) )
+        ! Same update for c = b+1..c_end
+!        do c=b+1,c_end
+!          do a=a_start,b
+!            do k=k_start,k_end
+!              U(a,c,b) = U(a,c,b) + T(k,a)*matrix_B(k,c)*matrix_B(l,d)
+!            enddo
+!          enddo
+!        enddo
+        if (b < b_end) then
+          call DGEMM('T','N', (b-a_start+1), (c_end-b),              &
+              (k_end-k_start+1), matrix_B(l, d),                     &
+              T(k_start,a_start), size(T,1),                         &
+              matrix_B(k_start,b+1), size(matrix_B,1), 1.d0,         &
+              U(a_start,b+1,b), size(U,1) )
+        endif
+      enddo
+
+    enddo
+
+    ! Collect the elements of U with b<=d and a<=min(b,c) whose magnitude
+    ! is at least 1.d-15, together with their keys.
+    idx = 0_8
+    do b=b_start,d
+      do c=c_start,c_end
+        do a=a_start,min(b,c)
+          if (dabs(U(a,c,b)) < 1.d-15) then
+            cycle
+          endif
+          idx = idx+1_8
+          call bielec_integrals_index(a,b,c,d,key(idx))
+          value(idx) = U(a,c,b)
+        enddo
+      enddo
+    enddo
+
+    !$OMP CRITICAL
+    call four_idx_push_results(zmq_socket_push, key, value, idx, task_id)
+    !$OMP END CRITICAL
+
+!WRITE OUTPUT
+! OMP CRITICAL
+!print *,  d
+!do b=b_start,d
+!  do c=c_start,c_end
+!    do a=a_start,min(b,c)
+!      if (dabs(U(a,c,b)) < 1.d-15) then
+!        cycle
+!      endif
+!      write(10,*) d,c,b,a,U(a,c,b)
+!    enddo
+!  enddo
+!enddo
+! OMP END CRITICAL
+!END WRITE OUTPUT
+
+
+  enddo
+  !$OMP END DO
+  call end_zmq_push_socket(zmq_socket_push,thread)
+  ! Release every thread-private work array allocated in this region
+  deallocate(key,value,U,V,T,T2d,V2d)
+  !$OMP END PARALLEL
+  call map_merge(map_c)
+
+  deallocate(l_pointer)
+  deallocate(a_array_ik,a_array_j,a_array_value)
+
+end
--- a/src/FourIdx/four_index_sym.irp.f
+++ b/src/FourIdx/four_index_sym.irp.f
@ -0,0 +1,293 @@
+subroutine four_index_transform_sym(map_a,map_c,matrix_B,LDB,            &
+      i_start, j_start, k_start, l_start,                            &
+      i_end  , j_end  , k_end  , l_end  ,                            &
+      a_start, b_start, c_start, d_start,                            &
+      a_end  , b_end  , c_end  , d_end  )
+  implicit none
+  use map_module
+  use mmap_module
+  BEGIN_DOC
+! Performs a four-index transformation of map_a(N^4) into map_c(M^4) using b(NxM)
+! C_{abcd} = \sum_{ijkl} A_{ijkl}.B_{ia}.B_{jb}.B_{kc}.B_{ld}
+! Loops run over *_start->*_end
+!
+! map_a    : input integrals, read with map_get
+! map_c    : output; filled with map_update (threshold 1.d-15) for each d,
+!            then map_merge is called once at the end
+! matrix_B : transformation matrix, leading dimension LDB
+!
+! The ranges must satisfy k_start == i_start, l_start == j_start,
+! a_start == c_start and b_start == d_start (checked with ASSERT).
+!
+! The intermediate record arrays live in three files created with mmap under
+! ezfio_filename/work; they are unmapped and deleted before returning.
+  END_DOC
+  type(map_type), intent(in)     :: map_a
+  type(map_type), intent(inout)  :: map_c
+  integer, intent(in)            :: LDB
+  double precision, intent(in)   :: matrix_B(LDB,*)
+  integer, intent(in)            :: i_start, j_start, k_start, l_start
+  integer, intent(in)            :: i_end  , j_end  , k_end  , l_end
+  integer, intent(in)            :: a_start, b_start, c_start, d_start
+  integer, intent(in)            :: a_end  , b_end  , c_end  , d_end
+
+  double precision, allocatable  :: T(:,:), U(:,:,:), V(:,:)
+  double precision, allocatable  :: T2d(:,:), V2d(:,:)
+  integer                        :: i_max, j_max, k_max, l_max
+  integer                        :: i_min, j_min, k_min, l_min
+  integer                        :: i, j, k, l, ik, ll
+  integer                        :: a, b, c, d
+  double precision, external     :: get_ao_bielec_integral
+  integer*8                      :: ii
+  integer(key_kind)              :: idx
+  real(integral_kind)            :: tmp
+  integer(key_kind), allocatable :: key(:)
+  real(integral_kind), allocatable :: value(:)
+  integer*8, allocatable         :: l_pointer(:)
+
+  ASSERT (k_start == i_start)
+  ASSERT (l_start == j_start)
+  ASSERT (a_start == c_start)
+  ASSERT (b_start == d_start)
+
+  i_min = min(i_start,a_start)
+  i_max = max(i_end  ,a_end  )
+  j_min = min(j_start,b_start)
+  j_max = max(j_end  ,b_end  )
+  k_min = min(k_start,c_start)
+  k_max = max(k_end  ,c_end  )
+  l_min = min(l_start,d_start)
+  l_max = max(l_end  ,d_end  )
+
+  ASSERT (0 < i_max)
+  ASSERT (0 < j_max)
+  ASSERT (0 < k_max)
+  ASSERT (0 < l_max)
+  ASSERT (LDB >= i_max)
+  ASSERT (LDB >= j_max)
+  ASSERT (LDB >= k_max)
+  ASSERT (LDB >= l_max)
+
+  ! Create temporary memory-mapped files, one per record field.
+  ! Record ii holds the packed (i,k) index, the j index and the integral
+  ! value. Records are grouped by l, then by j.
+  integer                        :: fd(3)
+  type(c_ptr)                    :: c_pointer(3)
+  integer*4, pointer             :: a_array_ik(:)
+  integer*2, pointer             :: a_array_j(:)
+  double precision, pointer      :: a_array_value(:)
+
+  integer*8 :: new_size
+  new_size = max(1024_8, 5_8 * map_a % n_elements )
+
+  call mmap(trim(ezfio_filename)//'/work/four_idx_ik', (/ new_size /), 4, fd(1), .False., c_pointer(1))
+  call c_f_pointer(c_pointer(1), a_array_ik, (/ new_size /))
+
+  call mmap(trim(ezfio_filename)//'/work/four_idx_j', (/ new_size /), 2, fd(2), .False., c_pointer(2))
+  call c_f_pointer(c_pointer(2), a_array_j, (/ new_size /))
+
+  call mmap(trim(ezfio_filename)//'/work/four_idx_value', (/ new_size /), 8, fd(3), .False., c_pointer(3))
+  call c_f_pointer(c_pointer(3), a_array_value, (/ new_size /))
+
+  print *,  'Transforming MO integrals'
+  ! ------------------------------------------------------------------
+  ! Pass 1: for every (j,l) read the i<=k integrals from map_a and keep
+  ! the non-zero ones. l_pointer(l) is the index of the first record of l,
+  ! l_pointer(l_end+1) the index of the terminating record.
+  ! Every thread executes the l and j loops; the SINGLE sections do the
+  ! bookkeeping on the shared counter ii.
+  ! ------------------------------------------------------------------
+  allocate(l_pointer(l_start:l_end+1), value((i_max*k_max)) )
+  ii = 1_8
+  !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,j,k,l,ik,idx)
+  do l=l_start,l_end
+    !$OMP SINGLE
+    l_pointer(l) = ii
+    !$OMP END SINGLE
+    do j=j_start,j_end
+      !$OMP DO SCHEDULE(static,1)
+      do k=k_start,k_end
+        do i=i_start,k
+          ! Packed position of the pair (i,k), i<=k, in column order
+          ik = (i-i_start+1) + ishft( (k-k_start)*(k-k_start+1), -1 )
+          call bielec_integrals_index(i,j,k,l,idx)
+          call map_get(map_a,idx,value(ik))
+        enddo
+      enddo
+      !$OMP END DO
+
+      !$OMP SINGLE
+      ik=0
+      do k=k_start,k_end
+        do i=i_start,k
+          ik = ik+1
+          tmp=value(ik)
+          if (tmp /= 0.d0) then
+            a_array_ik(ii) = ik
+            a_array_j(ii)  = j
+            a_array_value(ii) = tmp
+            ii=ii+1_8
+          endif
+        enddo
+      enddo
+      !$OMP END SINGLE
+    enddo
+  enddo
+  !$OMP SINGLE
+  ! Terminating record
+  a_array_ik(ii) = 0
+  a_array_j(ii)  = 0
+  a_array_value(ii) = 0.d0
+  l_pointer(l_end+1) = ii
+  !$OMP END SINGLE
+  !$OMP END PARALLEL
+  deallocate(value)
+
+!INPUT DATA
+!open(unit=10,file='INPUT',form='UNFORMATTED')
+!write(10) i_start, j_start, i_end, j_end
+!write(10) a_start, b_start, a_end, b_end
+!write(10) LDB, mo_tot_num
+!write(10) matrix_B(1:LDB,1:mo_tot_num)
+!idx=size(a_array)
+!write(10) idx
+!write(10) a_array
+!write(10) l_pointer
+!close(10)
+!open(unit=10,file='OUTPUT',form='FORMATTED')
+! END INPUT DATA
+
+
+  ! ------------------------------------------------------------------
+  ! Pass 2: the d loop is distributed over the threads. For a given d,
+  ! U(a,c,b) accumulates the contributions of all l.
+  ! ------------------------------------------------------------------
+  !$OMP PARALLEL DEFAULT(NONE) SHARED(a_array_ik,a_array_j,a_array_value,c_pointer,fd, &
+      !$OMP  a_start,a_end,b_start,b_end,c_start,c_end,d_start,d_end,&
+      !$OMP  i_start,i_end,j_start,j_end,k_start,k_end,l_start,l_end,&
+      !$OMP  i_min,i_max,j_min,j_max,k_min,k_max,l_min,l_max,        &
+      !$OMP  map_c,matrix_B,l_pointer)                         &
+      !$OMP  PRIVATE(key,value,T,U,V,i,j,k,l,idx,ik,ll,   &
+      !$OMP  a,b,c,d,tmp,T2d,V2d,ii)
+  allocate( key(i_max*j_max*k_max), value(i_max*j_max*k_max) )
+  allocate( U(a_start:a_end, c_start:c_end, b_start:b_end) )
+
+
+
+  allocate( T2d((i_end-i_start+1)*(k_end-k_start+2)/2, j_start:j_end), &
+            V2d((i_end-i_start+1)*(k_end-k_start+2)/2, b_start:b_end), &
+            V(i_start:i_end, k_start:k_end), &
+            T(k_start:k_end, a_start:a_end))
+
+
+  !$OMP DO SCHEDULE(dynamic)
+  do d=d_start,d_end
+    print *,  d, '/', d_end
+    U = 0.d0
+    do l=l_start,l_end
+      ! Skip l when its weight B(l,d) is negligible
+      if (dabs(matrix_B(l,d)) < 1.d-10) then
+        cycle
+      endif
+
+      ! Scatter the records of l into the dense array T2d(ik,j).
+      ! The scan is bounded by l_pointer(l+1) so that it cannot run into
+      ! the records of the next l when those start with the same j.
+      ii=l_pointer(l)
+      do j=j_start,j_end
+        !DIR$ VECTOR NONTEMPORAL
+        T2d(:,j) = 0.d0
+        !DIR$ IVDEP
+        do while ( (ii < l_pointer(l+1)) .and. (j == a_array_j(ii)) )
+          T2d(a_array_ik(ii),j) = transfer(a_array_value(ii), 1.d0)
+          ii = ii + 1_8
+        enddo
+      enddo
+
+      ! V2d(ik,b) = sum_j T2d(ik,j) * B(j,b)   for b = b_start..d
+      call DGEMM('N','N', ishft( (i_end-i_start+1)*(i_end-i_start+2), -1),&
+          (d-b_start+1),                                             &
+          (j_end-j_start+1), 1.d0,                                   &
+          T2d(1,j_start), size(T2d,1),                               &
+          matrix_B(j_start,b_start), size(matrix_B,1),0.d0,          &
+          V2d(1,b_start), size(V2d,1) )
+
+      do b=b_start,d
+        ! Unpack column b of V2d into the upper triangle of V(i,k)
+        ik = 0
+        do k=k_start,k_end
+          do i=i_start,k
+            ik = ik+1
+            V(i,k) = V2d(ik,b)
+          enddo
+        enddo
+
+        ! DSYMM below replaces this reference loop
+        ! (V is symmetric, only its upper triangle is stored):
+!        T = 0.d0
+!        do a=a_start,b
+!          do k=k_start,k_end
+!            do i=i_start,k
+!              T(k,a) = T(k,a) + V(i,k)*matrix_B(i,a)
+!            enddo
+!            do i=k+1,i_end
+!              T(k,a) = T(k,a) + V(k,i)*matrix_B(i,a)
+!            enddo
+!          enddo
+!        enddo
+        call DSYMM('L','U', (k_end-k_start+1), (b-a_start+1),        &
+            1.d0,                                                    &
+            V(i_start,k_start), size(V,1),                           &
+            matrix_B(i_start,a_start), size(matrix_B,1),0.d0,        &
+            T(k_start,a_start), size(T,1) )
+
+        ! U(a,c,b) += B(l,d) * sum_k T(k,a)*B(k,c), c = c_start..b
+        ! (DGEMM fills the whole a_start..b x c_start..b rectangle)
+!        do c=c_start,b
+!          do a=a_start,c
+!            do k=k_start,k_end
+!              U(a,c,b) = U(a,c,b) + T(k,a)*matrix_B(k,c)*matrix_B(l,d)
+!            enddo
+!          enddo
+!        enddo
+        call DGEMM('T','N', (b-a_start+1), (b-c_start+1),            &
+            (k_end-k_start+1), matrix_B(l, d),                       &
+            T(k_start,a_start), size(T,1),                           &
+            matrix_B(k_start,c_start), size(matrix_B,1), 1.d0,       &
+            U(a_start,c_start,b), size(U,1) )
+        ! Same update for c = b+1..c_end
+!        do c=b+1,c_end
+!          do a=a_start,b
+!            do k=k_start,k_end
+!              U(a,c,b) = U(a,c,b) + T(k,a)*matrix_B(k,c)*matrix_B(l,d)
+!            enddo
+!          enddo
+!        enddo
+        if (b < b_end) then
+          call DGEMM('T','N', (b-a_start+1), (c_end-b),              &
+              (k_end-k_start+1), matrix_B(l, d),                     &
+              T(k_start,a_start), size(T,1),                         &
+              matrix_B(k_start,b+1), size(matrix_B,1), 1.d0,         &
+              U(a_start,b+1,b), size(U,1) )
+        endif
+      enddo
+
+    enddo
+
+    ! Collect the elements of U with b<=d and a<=min(b,c) whose magnitude
+    ! is at least 1.d-15, together with their keys.
+    idx = 0_8
+    do b=b_start,d
+      do c=c_start,c_end
+        do a=a_start,min(b,c)
+          if (dabs(U(a,c,b)) < 1.d-15) then
+            cycle
+          endif
+          idx = idx+1_8
+          call bielec_integrals_index(a,b,c,d,key(idx))
+          value(idx) = U(a,c,b)
+        enddo
+      enddo
+    enddo
+
+    ! map_c is shared: one thread at a time stores its results
+    !$OMP CRITICAL
+    call map_update(map_c, key, value, idx,1.d-15)
+    !$OMP END CRITICAL
+
+!WRITE OUTPUT
+! OMP CRITICAL
+!print *,  d
+!do b=b_start,d
+!  do c=c_start,c_end
+!    do a=a_start,min(b,c)
+!      if (dabs(U(a,c,b)) < 1.d-15) then
+!        cycle
+!      endif
+!      write(10,*) d,c,b,a,U(a,c,b)
+!    enddo
+!  enddo
+!enddo
+! OMP END CRITICAL
+!END WRITE OUTPUT
+
+
+  enddo
+  !$OMP END DO
+
+  ! Release every thread-private work array allocated in this region
+  deallocate(key,value,U,V,T,T2d,V2d)
+  !$OMP END PARALLEL
+  call map_merge(map_c)
+
+  ! Unmap each temporary file, then delete it
+  call munmap( (/ new_size /), 4, fd(1), c_pointer(1))
+  open(unit=10,file=trim(ezfio_filename)//'/work/four_idx_ik')
+  close(10,status='DELETE')
+  call munmap( (/ new_size /), 2, fd(2), c_pointer(2))
+  open(unit=10,file=trim(ezfio_filename)//'/work/four_idx_j')
+  close(10,status='DELETE')
+  call munmap( (/ new_size /), 8, fd(3), c_pointer(3))
+  open(unit=10,file=trim(ezfio_filename)//'/work/four_idx_value')
+  close(10,status='DELETE')
+  deallocate(l_pointer)
+
+end
--- a/src/FourIdx/four_index_sym_mmap.irp.f
+++ b/src/FourIdx/four_index_sym_mmap.irp.f
@ -0,0 +1,292 @@
+subroutine four_index_transform_sym_mmap(map_a,map_c,matrix_B,LDB,            &
+      i_start, j_start, k_start, l_start,                            &
+      i_end  , j_end  , k_end  , l_end  ,                            &
+      a_start, b_start, c_start, d_start,                            &
+      a_end  , b_end  , c_end  , d_end  )
+  implicit none
+  use map_module
+  use mmap_module
+  BEGIN_DOC
+! Performs a four-index transformation of map_a(N^4) into map_c(M^4) using b(NxM)
+! C_{abcd} = \sum_{ijkl} A_{ijkl}.B_{ia}.B_{jb}.B_{kc}.B_{ld}
+! Loops run over *_start->*_end
+!
+! map_a    : input integrals, read with map_get
+! map_c    : output; the results of each d are added with map_append,
+!            then map_sort is called once at the end
+! matrix_B : transformation matrix, leading dimension LDB
+!
+! The ranges must satisfy k_start == i_start, l_start == j_start,
+! a_start == c_start and b_start == d_start (checked with ASSERT).
+!
+! The intermediate record arrays live in three files created with mmap under
+! ezfio_filename/work; they are unmapped and deleted before returning.
+  END_DOC
+  type(map_type), intent(in)     :: map_a
+  type(map_type), intent(inout)  :: map_c
+  integer, intent(in)            :: LDB
+  double precision, intent(in)   :: matrix_B(LDB,*)
+  integer, intent(in)            :: i_start, j_start, k_start, l_start
+  integer, intent(in)            :: i_end  , j_end  , k_end  , l_end
+  integer, intent(in)            :: a_start, b_start, c_start, d_start
+  integer, intent(in)            :: a_end  , b_end  , c_end  , d_end
+
+  double precision, allocatable  :: T(:,:), U(:,:,:), V(:,:)
+  double precision, allocatable  :: T2d(:,:), V2d(:,:)
+  integer                        :: i_max, j_max, k_max, l_max
+  integer                        :: i_min, j_min, k_min, l_min
+  integer                        :: i, j, k, l, ik, ll
+  integer                        :: a, b, c, d
+  double precision, external     :: get_ao_bielec_integral
+  integer*8                      :: ii
+  integer(key_kind)              :: idx
+  real(integral_kind)            :: tmp
+  integer(key_kind), allocatable :: key(:)
+  real(integral_kind), allocatable :: value(:)
+  integer*8, allocatable         :: l_pointer(:)
+
+  ASSERT (k_start == i_start)
+  ASSERT (l_start == j_start)
+  ASSERT (a_start == c_start)
+  ASSERT (b_start == d_start)
+
+  i_min = min(i_start,a_start)
+  i_max = max(i_end  ,a_end  )
+  j_min = min(j_start,b_start)
+  j_max = max(j_end  ,b_end  )
+  k_min = min(k_start,c_start)
+  k_max = max(k_end  ,c_end  )
+  l_min = min(l_start,d_start)
+  l_max = max(l_end  ,d_end  )
+
+  ASSERT (0 < i_max)
+  ASSERT (0 < j_max)
+  ASSERT (0 < k_max)
+  ASSERT (0 < l_max)
+  ASSERT (LDB >= i_max)
+  ASSERT (LDB >= j_max)
+  ASSERT (LDB >= k_max)
+  ASSERT (LDB >= l_max)
+
+  ! Create temporary memory-mapped files, one per record field.
+  ! Record ii holds the packed (i,k) index, the j index and the integral
+  ! value. Records are grouped by l, then by j.
+  integer                        :: fd(3)
+  type(c_ptr)                    :: c_pointer(3)
+  integer*4, pointer             :: a_array_ik(:)
+  integer*2, pointer             :: a_array_j(:)
+  double precision, pointer      :: a_array_value(:)
+
+  integer*8 :: new_size
+  new_size = max(1024_8, 5_8 * map_a % n_elements )
+
+  call mmap(trim(ezfio_filename)//'/work/four_idx_ik', (/ new_size /), 4, fd(1), .False., c_pointer(1))
+  call c_f_pointer(c_pointer(1), a_array_ik, (/ new_size /))
+
+  call mmap(trim(ezfio_filename)//'/work/four_idx_j', (/ new_size /), 2, fd(2), .False., c_pointer(2))
+  call c_f_pointer(c_pointer(2), a_array_j, (/ new_size /))
+
+  call mmap(trim(ezfio_filename)//'/work/four_idx_value', (/ new_size /), 8, fd(3), .False., c_pointer(3))
+  call c_f_pointer(c_pointer(3), a_array_value, (/ new_size /))
+
+  print *,  'Transforming MO integrals'
+  ! ------------------------------------------------------------------
+  ! Pass 1: for every (j,l) read the i<=k integrals from map_a and keep
+  ! the non-zero ones. l_pointer(l) is the index of the first record of l,
+  ! l_pointer(l_end+1) the index of the terminating record.
+  ! Every thread executes the l and j loops; the SINGLE sections do the
+  ! bookkeeping on the shared counter ii.
+  ! ------------------------------------------------------------------
+  allocate(l_pointer(l_start:l_end+1), value((i_max*k_max)) )
+  ii = 1_8
+  !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,j,k,l,ik,idx)
+  do l=l_start,l_end
+    !$OMP SINGLE
+    l_pointer(l) = ii
+    !$OMP END SINGLE
+    do j=j_start,j_end
+      !$OMP DO SCHEDULE(static,1)
+      do k=k_start,k_end
+        do i=i_start,k
+          ! Packed position of the pair (i,k), i<=k, in column order
+          ik = (i-i_start+1) + ishft( (k-k_start)*(k-k_start+1), -1 )
+          call bielec_integrals_index(i,j,k,l,idx)
+          call map_get(map_a,idx,value(ik))
+        enddo
+      enddo
+      !$OMP END DO
+
+      !$OMP SINGLE
+      ik=0
+      do k=k_start,k_end
+        do i=i_start,k
+          ik = ik+1
+          tmp=value(ik)
+          if (tmp /= 0.d0) then
+            a_array_ik(ii) = ik
+            a_array_j(ii)  = j
+            a_array_value(ii) = tmp
+            ii=ii+1_8
+          endif
+        enddo
+      enddo
+      !$OMP END SINGLE
+    enddo
+  enddo
+  !$OMP SINGLE
+  ! Terminating record
+  a_array_ik(ii) = 0
+  a_array_j(ii)  = 0
+  a_array_value(ii) = 0.d0
+  l_pointer(l_end+1) = ii
+  !$OMP END SINGLE
+  !$OMP END PARALLEL
+  deallocate(value)
+
+!INPUT DATA
+!open(unit=10,file='INPUT',form='UNFORMATTED')
+!write(10) i_start, j_start, i_end, j_end
+!write(10) a_start, b_start, a_end, b_end
+!write(10) LDB, mo_tot_num
+!write(10) matrix_B(1:LDB,1:mo_tot_num)
+!idx=size(a_array)
+!write(10) idx
+!write(10) a_array
+!write(10) l_pointer
+!close(10)
+!open(unit=10,file='OUTPUT',form='FORMATTED')
+! END INPUT DATA
+
+
+  ! ------------------------------------------------------------------
+  ! Pass 2: the d loop is distributed over the threads. For a given d,
+  ! U(a,c,b) accumulates the contributions of all l.
+  ! ------------------------------------------------------------------
+  !$OMP PARALLEL DEFAULT(NONE) SHARED(a_array_ik,a_array_j,a_array_value,c_pointer,fd, &
+      !$OMP  a_start,a_end,b_start,b_end,c_start,c_end,d_start,d_end,&
+      !$OMP  i_start,i_end,j_start,j_end,k_start,k_end,l_start,l_end,&
+      !$OMP  i_min,i_max,j_min,j_max,k_min,k_max,l_min,l_max,        &
+      !$OMP  map_c,matrix_B,l_pointer)                         &
+      !$OMP  PRIVATE(key,value,T,U,V,i,j,k,l,idx,ik,ll,   &
+      !$OMP  a,b,c,d,tmp,T2d,V2d,ii)
+  allocate( key(i_max*j_max*k_max), value(i_max*j_max*k_max) )
+  allocate( U(a_start:a_end, c_start:c_end, b_start:b_end) )
+
+
+
+  allocate( T2d((i_end-i_start+1)*(k_end-k_start+2)/2, j_start:j_end), &
+            V2d((i_end-i_start+1)*(k_end-k_start+2)/2, b_start:b_end), &
+            V(i_start:i_end, k_start:k_end), &
+            T(k_start:k_end, a_start:a_end))
+
+
+  !$OMP DO SCHEDULE(dynamic)
+  do d=d_start,d_end
+    print *,  d, '/', d_end
+    U = 0.d0
+    do l=l_start,l_end
+      ! Skip l when its weight B(l,d) is negligible
+      if (dabs(matrix_B(l,d)) < 1.d-10) then
+        cycle
+      endif
+
+      ! Scatter the records of l into the dense array T2d(ik,j).
+      ! The scan is bounded by l_pointer(l+1) so that it cannot run into
+      ! the records of the next l when those start with the same j.
+      ii=l_pointer(l)
+      do j=j_start,j_end
+        !DIR$ VECTOR NONTEMPORAL
+        T2d(:,j) = 0.d0
+        !DIR$ IVDEP
+        do while ( (ii < l_pointer(l+1)) .and. (j == a_array_j(ii)) )
+          T2d(a_array_ik(ii),j) = transfer(a_array_value(ii), 1.d0)
+          ii = ii + 1_8
+        enddo
+      enddo
+      ! V2d(ik,b) = sum_j T2d(ik,j) * B(j,b)   for b = b_start..d
+      call DGEMM('N','N', ishft( (i_end-i_start+1)*(i_end-i_start+2), -1),&
+          (d-b_start+1),                                             &
+          (j_end-j_start+1), 1.d0,                                   &
+          T2d(1,j_start), size(T2d,1),                               &
+          matrix_B(j_start,b_start), size(matrix_B,1),0.d0,          &
+          V2d(1,b_start), size(V2d,1) )
+
+      do b=b_start,d
+        ! Unpack column b of V2d into the upper triangle of V(i,k)
+        ik = 0
+        do k=k_start,k_end
+          do i=i_start,k
+            ik = ik+1
+            V(i,k) = V2d(ik,b)
+          enddo
+        enddo
+
+        ! DSYMM below replaces this reference loop
+        ! (V is symmetric, only its upper triangle is stored):
+!        T = 0.d0
+!        do a=a_start,b
+!          do k=k_start,k_end
+!            do i=i_start,k
+!              T(k,a) = T(k,a) + V(i,k)*matrix_B(i,a)
+!            enddo
+!            do i=k+1,i_end
+!              T(k,a) = T(k,a) + V(k,i)*matrix_B(i,a)
+!            enddo
+!          enddo
+!        enddo
+        call DSYMM('L','U', (k_end-k_start+1), (b-a_start+1),        &
+            1.d0,                                                    &
+            V(i_start,k_start), size(V,1),                           &
+            matrix_B(i_start,a_start), size(matrix_B,1),0.d0,        &
+            T(k_start,a_start), size(T,1) )
+
+        ! U(a,c,b) += B(l,d) * sum_k T(k,a)*B(k,c), c = c_start..b
+        ! (DGEMM fills the whole a_start..b x c_start..b rectangle)
+!        do c=c_start,b
+!          do a=a_start,c
+!            do k=k_start,k_end
+!              U(a,c,b) = U(a,c,b) + T(k,a)*matrix_B(k,c)*matrix_B(l,d)
+!            enddo
+!          enddo
+!        enddo
+        call DGEMM('T','N', (b-a_start+1), (b-c_start+1),            &
+            (k_end-k_start+1), matrix_B(l, d),                       &
+            T(k_start,a_start), size(T,1),                           &
+            matrix_B(k_start,c_start), size(matrix_B,1), 1.d0,       &
+            U(a_start,c_start,b), size(U,1) )
+        ! Same update for c = b+1..c_end
+!        do c=b+1,c_end
+!          do a=a_start,b
+!            do k=k_start,k_end
+!              U(a,c,b) = U(a,c,b) + T(k,a)*matrix_B(k,c)*matrix_B(l,d)
+!            enddo
+!          enddo
+!        enddo
+        if (b < b_end) then
+          call DGEMM('T','N', (b-a_start+1), (c_end-b),              &
+              (k_end-k_start+1), matrix_B(l, d),                     &
+              T(k_start,a_start), size(T,1),                         &
+              matrix_B(k_start,b+1), size(matrix_B,1), 1.d0,         &
+              U(a_start,b+1,b), size(U,1) )
+        endif
+      enddo
+
+    enddo
+
+    ! Collect the elements of U with b<=d and a<=min(b,c) whose magnitude
+    ! is at least 1.d-15, together with their keys.
+    idx = 0_8
+    do b=b_start,d
+      do c=c_start,c_end
+        do a=a_start,min(b,c)
+          if (dabs(U(a,c,b)) < 1.d-15) then
+            cycle
+          endif
+          idx = idx+1_8
+          call bielec_integrals_index(a,b,c,d,key(idx))
+          value(idx) = U(a,c,b)
+        enddo
+      enddo
+    enddo
+
+    ! map_c is shared: one thread at a time appends its results
+    !$OMP CRITICAL
+    call map_append(map_c, key, value, idx)
+    !$OMP END CRITICAL
+
+!WRITE OUTPUT
+! OMP CRITICAL
+!print *,  d
+!do b=b_start,d
+!  do c=c_start,c_end
+!    do a=a_start,min(b,c)
+!      if (dabs(U(a,c,b)) < 1.d-15) then
+!        cycle
+!      endif
+!      write(10,*) d,c,b,a,U(a,c,b)
+!    enddo
+!  enddo
+!enddo
+! OMP END CRITICAL
+!END WRITE OUTPUT
+
+
+  enddo
+  !$OMP END DO
+
+  ! Release every thread-private work array allocated in this region
+  deallocate(key,value,U,V,T,T2d,V2d)
+  !$OMP END PARALLEL
+  call map_sort(map_c)
+
+  ! Unmap each temporary file, then delete it
+  call munmap( (/ new_size /), 4, fd(1), c_pointer(1))
+  open(unit=10,file=trim(ezfio_filename)//'/work/four_idx_ik')
+  close(10,status='DELETE')
+  call munmap( (/ new_size /), 2, fd(2), c_pointer(2))
+  open(unit=10,file=trim(ezfio_filename)//'/work/four_idx_j')
+  close(10,status='DELETE')
+  call munmap( (/ new_size /), 8, fd(3), c_pointer(3))
+  open(unit=10,file=trim(ezfio_filename)//'/work/four_idx_value')
+  close(10,status='DELETE')
+  deallocate(l_pointer)
+
+end
--- a/src/FourIdx/four_index_zmq.irp.f.todo
+++ b/src/FourIdx/four_index_zmq.irp.f.todo
@ -0,0 +1,273 @@
+subroutine four_index_transform_zmq(map_a,map_c,matrix_B,LDB,        &
+      i_start, j_start, k_start, l_start,                            &
+      i_end  , j_end  , k_end  , l_end  ,                            &
+      a_start, b_start, c_start, d_start,                            &
+      a_end  , b_end  , c_end  , d_end  )
+  implicit none
+  use f77_zmq
+  use map_module
+  BEGIN_DOC
+! Performs a four-index transformation of map_a(N^4) into map_c(M^4) using b(NxM)
+! C_{abcd} = \sum_{ijkl} A_{ijkl}.B_{ia}.B_{jb}.B_{kc}.B_{ld}
+! Loops run over *_start->*_end
+!
+! The l range is cut into blocks of l_block indices and one task per block is
+! added to the task server. Two OpenMP threads are then started: thread 0
+! calls four_idx_collector, the other one calls
+! four_index_transform_slave_inproc.
+  END_DOC
+  type(map_type), intent(in)     :: map_a
+  type(map_type), intent(inout)  :: map_c
+  integer, intent(in)            :: LDB
+  double precision, intent(in)   :: matrix_B(LDB,*)
+  integer, intent(in)            :: i_start, j_start, k_start, l_start
+  integer, intent(in)            :: i_end  , j_end  , k_end  , l_end
+  integer, intent(in)            :: a_start, b_start, c_start, d_start
+  integer, intent(in)            :: a_end  , b_end  , c_end  , d_end
+
+  integer                        :: i_max, j_max, k_max, l_max
+  integer                        :: i_min, j_min, k_min, l_min
+  integer                        :: l_start_block, l_end_block, l_block
+  integer, external              :: omp_get_thread_num
+
+  ASSERT (k_start == i_start)
+  ASSERT (l_start == j_start)
+  ASSERT (a_start == c_start)
+  ASSERT (b_start == d_start)
+
+  i_min = min(i_start,a_start)
+  i_max = max(i_end  ,a_end  )
+  j_min = min(j_start,b_start)
+  j_max = max(j_end  ,b_end  )
+  k_min = min(k_start,c_start)
+  k_max = max(k_end  ,c_end  )
+  l_min = min(l_start,d_start)
+  l_max = max(l_end  ,d_end  )
+
+  ASSERT (0 < i_max)
+  ASSERT (0 < j_max)
+  ASSERT (0 < k_max)
+  ASSERT (0 < l_max)
+  ASSERT (LDB >= i_max)
+  ASSERT (LDB >= j_max)
+  ASSERT (LDB >= k_max)
+  ASSERT (LDB >= l_max)
+
+
+  integer(ZMQ_PTR) :: zmq_to_qp_run_socket
+  call new_parallel_job(zmq_to_qp_run_socket,'four_idx')
+
+  integer*8 :: new_size
+  new_size = max(1024_8, 5_8 * map_a % n_elements )
+
+  integer :: npass
+  integer*8 :: tempspace
+
+  tempspace = (new_size * 14_8) / (1024_8 * 1024_8)
+  ! 2 GiB of scratch space per pass.
+  ! min() needs arguments of a single integer kind, so the comparison is done
+  ! in 8 bytes. npass and l_block are clamped to >= 1 so that a one-element
+  ! l range (l_end == l_start) gives neither a division by zero nor a zero
+  ! loop stride.
+  npass = max(1, int(min(int(l_end-l_start,8), 1_8 + tempspace / 2048_8), 4))
+  l_block = max(1, (l_end-l_start)/npass)
+
+  ! Create tasks
+  ! ============
+
+  ! Fixed-length buffer: an unallocated allocatable scalar cannot be the
+  ! target of an internal write.
+  character(len=64) :: task
+
+  do l_start_block = l_start, l_end, l_block
+    l_end_block = min(l_end, l_start_block+l_block-1)
+    write(task,'(I10,1X,I10)') l_start_block, l_end_block
+    call add_task_to_taskserver(zmq_to_qp_run_socket,trim(task))
+  enddo
+
+  call zmq_set_running(zmq_to_qp_run_socket)
+
+  PROVIDE nproc
+
+  call omp_set_nested(.True.)
+  integer :: ithread
+  !$OMP PARALLEL NUM_THREADS(2) PRIVATE(ithread)
+  ithread = omp_get_thread_num()
+  if (ithread==0) then
+    call four_idx_collector(zmq_to_qp_run_socket,map_c)
+  else
+    !TODO : Put strings of map_a and matrix_b on server and broadcast
+    ! NOTE(review): l_start_block / l_end_block still hold the values left by
+    ! the task-creation loop above, not a block fetched from the task server
+    ! -- confirm what the in-process slave is expected to receive.
+    call four_index_transform_slave_inproc(map_a,map_c,matrix_B,LDB,     &
+          i_start, j_start, k_start, l_start_block,                      &
+          i_end  , j_end  , k_end  , l_end_block  ,                      &
+          a_start, b_start, c_start, d_start,                            &
+          a_end  , b_end  , c_end  , d_end, 1 )
+  endif
+  !$OMP END PARALLEL
+
+  call end_parallel_job(zmq_to_qp_run_socket, 'four_idx')
+
+
+end
+
+
+subroutine four_idx_slave_work(zmq_to_qp_run_socket, worker_id)
+  use f77_zmq
+  implicit none
+  BEGIN_DOC
+! Worker loop of the 4-index transformation.
+!
+! Repeatedly fetches a task from the task server, decodes the index ranges
+! from the task message, calls four_index_transform_slave on them and reports
+! the task as done. The loop ends when the server returns task_id == 0.
+  END_DOC
+
+  integer(ZMQ_PTR),intent(in)   :: zmq_to_qp_run_socket
+  integer,intent(in)             :: worker_id
+  integer                        :: task_id
+  character*(512)                :: msg
+
+  ! LDB is decoded from the task message below; it has to be declared because
+  ! of implicit none.
+  integer                        :: LDB
+  integer                        :: i_start, j_start, k_start, l_start_block
+  integer                        :: i_end  , j_end  , k_end  , l_end_block
+  integer                        :: a_start, b_start, c_start, d_start
+  integer                        :: a_end  , b_end  , c_end  , d_end
+
+  !TODO : get map_a and matrix_B from server
+  ! NOTE(review): map_a, map_c and matrix_B are used below but are neither
+  ! arguments nor locals of this routine; they must be obtained (see the TODO
+  ! above) before this can compile.
+  do
+    call get_task_from_taskserver(zmq_to_qp_run_socket,worker_id, task_id, msg)
+    if(task_id == 0) exit
+    ! NOTE(review): 17 integers are expected in the message -- confirm that the
+    ! task producer writes all of them.
+    read (msg,*) LDB,     &
+          i_start, j_start, k_start, l_start_block,                      &
+          i_end  , j_end  , k_end  , l_end_block  ,                      &
+          a_start, b_start, c_start, d_start,                            &
+          a_end  , b_end  , c_end  , d_end
+
+    call four_index_transform_slave(map_a,map_c,matrix_B,LDB,            &
+          i_start, j_start, k_start, l_start_block,                      &
+          i_end  , j_end  , k_end  , l_end_block  ,                      &
+          a_start, b_start, c_start, d_start,                            &
+          a_end  , b_end  , c_end  , d_end, zmq_to_qp_run_socket,        &
+          task_id)
+    call task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id)
+
+  enddo
+end
+
+
+BEGIN_PROVIDER [ integer, nthreads_four_idx ]
+ implicit none
+ BEGIN_DOC
+ ! Number of threads for 4-index transformation.
+ !
+ ! Defaults to nproc; overridden by the NTHREADS_FOUR_IDX environment
+ ! variable when it is set to a non-blank value.
+ END_DOC
+ nthreads_four_idx = nproc
+ character*(32) :: env
+ call getenv('NTHREADS_FOUR_IDX',env)
+ if (trim(env) /= '') then
+   read(env,*) nthreads_four_idx
+ endif
+ ! Report the value provided here (not the Davidson thread count).
+ call write_int(6,nthreads_four_idx,'Number of threads for 4-index transformation')
+END_PROVIDER
+
+
+
+subroutine four_idx_collector(zmq_to_qp_run_socket,map_c)
+  use f77_zmq
+  use map_module
+  implicit none
+  BEGIN_DOC
+! Collector loop of the 4-index transformation.
+!
+! Opens a pull socket, then repeatedly pulls one result into map_c with
+! four_idx_pull_results and deletes the corresponding task on the task
+! server. The loop stops as soon as zmq_delete_task returns more /= 1.
+  END_DOC
+  integer(ZMQ_PTR), intent(in)   :: zmq_to_qp_run_socket
+  type(map_type), intent(inout)  :: map_c
+
+  integer                        :: more
+  ! Id of the task whose result was just pulled; set by four_idx_pull_results
+  integer                        :: task_id
+  integer(ZMQ_PTR), external     :: new_zmq_pull_socket
+  integer(ZMQ_PTR)               :: zmq_socket_pull
+
+
+  more = 1
+  zmq_socket_pull = new_zmq_pull_socket()
+
+  do while (more == 1)
+    call four_idx_pull_results(zmq_socket_pull, map_c, task_id)
+    call zmq_delete_task(zmq_to_qp_run_socket,zmq_socket_pull,task_id,more)
+  enddo
+
+  call end_zmq_pull_socket(zmq_socket_pull)
+
+end
+
+
+subroutine four_idx_pull_results(zmq_socket_pull, map_c, task_id)
+  use f77_zmq
+  use map_module
+  implicit none
+  BEGIN_DOC
+! Receives one result message from zmq_socket_pull and merges it into map_c.
+!
+! The message is read as four successive frames: task_id (4 bytes), the number
+! of entries (4 bytes), the keys and the values. The received key/value pairs
+! are then passed to map_update.
+  END_DOC
+  type(map_type), intent(inout)    :: map_c
+  integer(ZMQ_PTR), intent(inout)  :: zmq_socket_pull
+  integer, intent(out)             :: task_id
+
+  integer                          :: n_bytes, sze
+  integer*8                        :: n_bytes8
+  integer(key_kind), allocatable   :: key(:)
+  real(integral_kind), allocatable :: value(:)
+
+  ! Frame 1 : task identifier
+  n_bytes = f77_zmq_recv( zmq_socket_pull, task_id, 4, 0)
+  if (n_bytes /= 4) then
+    stop "four_idx_pull_results failed to pull task_id"
+  endif
+
+  ! Frame 2 : number of (key,value) pairs that follow
+  n_bytes = f77_zmq_recv( zmq_socket_pull, sze, 4, 0)
+  if (n_bytes /= 4) then
+    stop "four_idx_pull_results failed to pull sze"
+  endif
+
+  allocate(key(sze), value(sze))
+
+  ! Frame 3 : keys
+  n_bytes8 = f77_zmq_recv8( zmq_socket_pull, key, key_kind*sze, 0)
+  if (n_bytes8 /= key_kind*sze) then
+    stop "four_idx_pull_results failed to pull key"
+  endif
+
+  ! Frame 4 : values
+  n_bytes8 = f77_zmq_recv8( zmq_socket_pull, value, integral_kind*sze, 0)
+  if (n_bytes8 /= integral_kind*sze) then
+    stop "four_idx_pull_results failed to pull value"
+  endif
+
+! Activate if zmq_socket_pull is a REP
+IRP_IF ZMQ_PUSH
+IRP_ELSE
+  n_bytes = f77_zmq_send( zmq_socket_pull, 0, 4, 0)
+  if (n_bytes /= 4) then
+    print *,  irp_here, ' : f77_zmq_send (zmq_socket_pull,...'
+    stop 'error'
+  endif
+IRP_ENDIF
+
+  call map_update(map_c, key, value, sze, 1.d-15)  ! TODO : threshold
+
+  deallocate(key, value)
+end
+
+
+
+subroutine four_idx_push_results(zmq_socket_push, key, value, sze, task_id)
+  use f77_zmq
+  use map_module
+  implicit none
+  BEGIN_DOC
+! Sends one result message on zmq_socket_push.
+!
+! The message is a single multipart message made of four frames: task_id
+! (4 bytes), sze (4 bytes), the sze keys and the sze values. When ZMQ_PUSH is
+! not defined, a 4-byte reply is then received from the peer.
+  END_DOC
+  integer, intent(in)             :: sze
+  integer(key_kind), intent(in)   :: key(sze)
+  real(integral_kind), intent(in) :: value(sze)
+  integer(ZMQ_PTR), intent(in)    :: zmq_socket_push
+  integer, intent(in)             :: task_id
+
+  ! sze is a dummy argument and must not be declared a second time here
+  integer                         :: rc
+  integer*8                       :: rc8
+
+
+  rc = f77_zmq_send( zmq_socket_push, task_id, 4, ZMQ_SNDMORE)
+  if(rc /= 4) stop "four_idx_push_results failed to push task_id"
+
+  rc = f77_zmq_send( zmq_socket_push, sze, 4, ZMQ_SNDMORE)
+  if(rc /= 4) stop "four_idx_push_results failed to push sze"
+
+  rc8 = f77_zmq_send8( zmq_socket_push, key, key_kind*sze, ZMQ_SNDMORE)
+  if(rc8 /= key_kind*sze) stop "four_idx_push_results failed to push key"
+
+  rc8 = f77_zmq_send8( zmq_socket_push, value, integral_kind*sze, 0)
+  if(rc8 /= integral_kind*sze) stop "four_idx_push_results failed to push value"
+
+! Activate if zmq_socket_push is a REQ
+! A REQ socket has to read the reply before it may send again: the pull side
+! answers every message with a 4-byte integer, which is consumed here.
+IRP_IF ZMQ_PUSH
+IRP_ELSE
+  integer :: idummy
+  rc = f77_zmq_recv( zmq_socket_push, idummy, 4, 0)
+  if (rc /= 4) then
+    print *,  irp_here, ' : f77_zmq_recv (zmq_socket_push,...'
+    stop 'error'
+  endif
+IRP_ENDIF
+
+end
+
+
--- a/src/Integrals_Bielec/NEEDED_CHILDREN_MODULES
+++ b/src/Integrals_Bielec/NEEDED_CHILDREN_MODULES
@ -1 +1 @@
-Pseudo Bitmask ZMQ
+Pseudo Bitmask ZMQ FourIdx
--- a/src/Integrals_Bielec/map_integrals.irp.f
+++ b/src/Integrals_Bielec/map_integrals.irp.f
@ -179,7 +179,6 @@ double precision function get_ao_bielec_integral(i,j,k,l,map) result(result)
      call bielec_integrals_index(i,j,k,l,idx)
      !DIR$ FORCEINLINE
      call map_get(map,idx,tmp)
-      tmp = tmp
    else
      ii = l-ao_integrals_cache_min
      ii = ior( ishft(ii,6), k-ao_integrals_cache_min)
@ -336,7 +335,7 @@ end
 ! Min and max values of the MOs for which the integrals are in the cache
 END_DOC
 mo_integrals_cache_min_8 = max(1_8,elec_alpha_num - 63_8)
- mo_integrals_cache_max_8 = min(int(mo_tot_num,8),mo_integrals_cache_min+127_8)
+ mo_integrals_cache_max_8 = min(int(mo_tot_num,8),mo_integrals_cache_min_8+127_8)
 mo_integrals_cache_min   = max(1,elec_alpha_num - 63)
 mo_integrals_cache_max   = min(mo_tot_num,mo_integrals_cache_min+127)

--- a/src/Integrals_Bielec/mo_bi_integrals.irp.f
+++ b/src/Integrals_Bielec/mo_bi_integrals.irp.f
@ -117,7 +117,17 @@ BEGIN_PROVIDER [ logical, mo_bielec_integrals_in_map ]
    endif
    
  else
-    call add_integrals_to_map(full_ijkl_bitmask_4)
+!    call add_integrals_to_map(full_ijkl_bitmask_4)
+
+     call four_index_transform_block(ao_integrals_map,mo_integrals_map, &
+         mo_coef, size(mo_coef,1),                                      &
+         1, 1, 1, 1, ao_num, ao_num, ao_num, ao_num,                    &
+         1, 1, 1, 1, mo_tot_num, mo_tot_num, mo_tot_num, mo_tot_num)
+
+    integer*8                      :: get_mo_map_size, mo_map_size
+    mo_map_size = get_mo_map_size()
+    
+    print*,'Molecular integrals provided'
  endif
  if (write_mo_integrals) then
    call ezfio_set_work_empty(.False.)
@ -146,7 +156,7 @@ subroutine set_integrals_jj_into_map
  enddo
  call insert_into_mo_integrals_map(n_integrals,buffer_i,buffer_value,&
      real(mo_integrals_threshold,integral_kind))
-  call map_unique(mo_integrals_map)
+  call map_merge(mo_integrals_map)
 end

 subroutine set_integrals_exchange_jj_into_map
@ -167,7 +177,7 @@ subroutine set_integrals_exchange_jj_into_map
  enddo
  call insert_into_mo_integrals_map(n_integrals,buffer_i,buffer_value,&
      real(mo_integrals_threshold,integral_kind))
-  call map_unique(mo_integrals_map)
+  call map_merge(mo_integrals_map)
  
 end

@ -458,7 +468,7 @@ subroutine add_integrals_to_map(mask_ijkl)
      real(mo_integrals_threshold,integral_kind))
  deallocate(buffer_i, buffer_value)
  !$OMP END PARALLEL
-  call map_unique(mo_integrals_map)
+  call map_merge(mo_integrals_map)
  
  call wall_time(wall_2)
  call cpu_time(cpu_2)
@ -773,7 +783,7 @@ subroutine add_integrals_to_map_three_indices(mask_ijk)
      real(mo_integrals_threshold,integral_kind))
  deallocate(buffer_i, buffer_value)
  !$OMP END PARALLEL
-  call map_unique(mo_integrals_map)
+  call map_merge(mo_integrals_map)
  
  call wall_time(wall_2)
  call cpu_time(cpu_2)
@ -1035,7 +1045,7 @@ subroutine add_integrals_to_map_no_exit_34(mask_ijkl)
  !  print*, 'Communicating the map'
  !  call communicate_mo_integrals()
  !IRP_ENDIF
-  call map_unique(mo_integrals_map)
+  call map_merge(mo_integrals_map)
  
  call wall_time(wall_2)
  call cpu_time(cpu_2)
--- a/src/Integrals_Monoelec/kin_ao_ints.irp.f
+++ b/src/Integrals_Monoelec/kin_ao_ints.irp.f
@ -1,6 +1,6 @@
- BEGIN_PROVIDER [ double precision, ao_deriv2_x,(ao_num_align,ao_num) ]
-&BEGIN_PROVIDER [ double precision, ao_deriv2_y,(ao_num_align,ao_num) ]
-&BEGIN_PROVIDER [ double precision, ao_deriv2_z,(ao_num_align,ao_num) ]
+ BEGIN_PROVIDER [ double precision, ao_deriv2_x,(ao_num,ao_num) ]
+&BEGIN_PROVIDER [ double precision, ao_deriv2_y,(ao_num,ao_num) ]
+&BEGIN_PROVIDER [ double precision, ao_deriv2_z,(ao_num,ao_num) ]
  implicit none
  integer :: i,j,n,l
  double precision :: f
@ -45,8 +45,6 @@
   power_A(1)  = ao_power( j, 1 )
   power_A(2)  = ao_power( j, 2 )
   power_A(3)  = ao_power( j, 3 )
-   !DEC$ VECTOR ALIGNED
-   !DEC$ VECTOR ALWAYS
   do i= 1,ao_num
    ao_deriv2_x(i,j)= 0.d0
    ao_deriv2_y(i,j)= 0.d0
@ -59,7 +57,6 @@
    power_B(3)  = ao_power( i, 3 )
    do n = 1,ao_prim_num(j)
     alpha = ao_expo_ordered_transp(n,j)
-     !DEC$ VECTOR ALIGNED
     do l = 1, ao_prim_num(i)
      beta = ao_expo_ordered_transp(l,i)
      call overlap_gaussian_xyz(A_center,B_center,alpha,beta,power_A,power_B,overlap_x0,overlap_y0,overlap_z0,overlap,dim1)
@ -122,7 +119,7 @@

 END_PROVIDER

-BEGIN_PROVIDER [double precision, ao_kinetic_integral, (ao_num_align,ao_num)]
+BEGIN_PROVIDER [double precision, ao_kinetic_integral, (ao_num,ao_num)]
  implicit none
  BEGIN_DOC
  ! array of the priminitve basis kinetic integrals
@ -131,27 +128,23 @@ BEGIN_PROVIDER [double precision, ao_kinetic_integral, (ao_num_align,ao_num)]
  integer                        :: i,j,k,l
  
  if (read_ao_one_integrals) then
-     call ezfio_get_ao_basis_integral_kinetic(ao_kinetic_integral(1:ao_num, 1:ao_num))
- call ezfio_set_ao_basis_integral_kinetic(ao_kinetic_integral(1:ao_num, 1:ao_num))
+    call read_one_e_integrals('ao_kinetic_integral', ao_kinetic_integral,&
+        size(ao_kinetic_integral,1), size(ao_kinetic_integral,2))
    print *,  'AO kinetic integrals read from disk'
  else
    !$OMP PARALLEL DO DEFAULT(NONE) &
    !$OMP  PRIVATE(i,j) &
-    !$OMP  SHARED(ao_num, ao_num_align, ao_kinetic_integral,ao_deriv2_x,ao_deriv2_y,ao_deriv2_z)
+    !$OMP  SHARED(ao_num, ao_kinetic_integral,ao_deriv2_x,ao_deriv2_y,ao_deriv2_z)
    do j = 1, ao_num
-      !DEC$ VECTOR ALWAYS
-      !DEC$ VECTOR ALIGNED
      do i = 1, ao_num
      ao_kinetic_integral(i,j) = -0.5d0 * (ao_deriv2_x(i,j) + ao_deriv2_y(i,j) + ao_deriv2_z(i,j) )
      enddo
-      do i = ao_num +1,ao_num_align
-        ao_kinetic_integral(i,j) = 0.d0
-      enddo
    enddo
    !$OMP END PARALLEL DO
  endif
  if (write_ao_one_integrals) then
-    call ezfio_set_ao_basis_integral_kinetic(ao_kinetic_integral(1:ao_num, 1:ao_num))
+    call write_one_e_integrals('ao_kinetic_integral', ao_kinetic_integral,&
+        size(ao_kinetic_integral,1), size(ao_kinetic_integral,2))
    print *,  'AO kinetic integrals written to disk'
  endif
 END_PROVIDER
--- a/src/Integrals_Monoelec/pot_ao_ints.irp.f
+++ b/src/Integrals_Monoelec/pot_ao_ints.irp.f
@ -1,4 +1,4 @@
-BEGIN_PROVIDER [ double precision, ao_nucl_elec_integral, (ao_num_align,ao_num)]
+BEGIN_PROVIDER [ double precision, ao_nucl_elec_integral, (ao_num,ao_num)]
   BEGIN_DOC
   ! interaction nuclear electron
   END_DOC
@ -11,7 +11,8 @@ BEGIN_PROVIDER [ double precision, ao_nucl_elec_integral, (ao_num_align,ao_num)]
   double precision               :: overlap_x,overlap_y,overlap_z,overlap,dx,NAI_pol_mult
   
   if (read_ao_one_integrals) then
-     call ezfio_get_ao_basis_integral_nuclear(ao_nucl_elec_integral(1:ao_num, 1:ao_num))
+    call read_one_e_integrals('ao_ne_integral', ao_nucl_elec_integral,      &
+            size(ao_nucl_elec_integral,1), size(ao_nucl_elec_integral,2))
     print *,  'AO N-e integrals read from disk'
   else
     
@ -73,14 +74,15 @@ BEGIN_PROVIDER [ double precision, ao_nucl_elec_integral, (ao_num_align,ao_num)]
     !$OMP END PARALLEL
   endif
   if (write_ao_one_integrals) then
-     call ezfio_set_ao_basis_integral_nuclear(ao_nucl_elec_integral(1:ao_num, 1:ao_num))
+    call write_one_e_integrals('ao_ne_integral', ao_nucl_elec_integral, &
+            size(ao_nucl_elec_integral,1), size(ao_nucl_elec_integral,2))
     print *,  'AO N-e integrals written to disk'
   endif
   
   
 END_PROVIDER

- BEGIN_PROVIDER [ double precision, ao_nucl_elec_integral_per_atom, (ao_num_align,ao_num,nucl_num)]
+ BEGIN_PROVIDER [ double precision, ao_nucl_elec_integral_per_atom, (ao_num,ao_num,nucl_num)]
 BEGIN_DOC
 ! ao_nucl_elec_integral_per_atom(i,j,k) = -<AO(i)|1/|r-Rk|AO(j)> 
 ! where Rk is the geometry of the kth atom
--- a/src/Utils/map_functions.irp.f
+++ b/src/Utils/map_functions.irp.f
@ -46,8 +46,8 @@ subroutine map_save_to_disk(filename,map)
    enddo
    deallocate(map % map(i) % value)
    deallocate(map % map(i) % key)
-    map % map(i) % value => map % consolidated_value ( map % consolidated_idx (i+1) :)
-    map % map(i) % key   => map % consolidated_key   ( map % consolidated_idx (i+1) :)
+    map % map(i) % value => map % consolidated_value ( map % consolidated_idx (i+1_8) :)
+    map % map(i) % key   => map % consolidated_key   ( map % consolidated_idx (i+1_8) :)
  enddo
  map % consolidated_idx (map % map_size + 2_8) = k
  map % consolidated = .True.
@ -82,7 +82,7 @@ subroutine map_load_from_disk(filename,map)
  call mmap(trim(filename)//'_consolidated_idx', (/ map % map_size + 2_8 /), 8, fd(1), .True., c_pointer(1))
  call c_f_pointer(c_pointer(1),map % consolidated_idx, (/ map % map_size + 2_8/))

-  map% n_elements = map % consolidated_idx (map % map_size+2_8)-1
+  map% n_elements = map % consolidated_idx (map % map_size+2_8)-1_8

  call mmap(trim(filename)//'_consolidated_key', (/ map % n_elements /), cache_key_kind, fd(2), .True., c_pointer(2))
  call c_f_pointer(c_pointer(2),map % consolidated_key, (/ map % n_elements /))
@ -96,11 +96,11 @@ subroutine map_load_from_disk(filename,map)
  do i=0_8, map % map_size
    deallocate(map % map(i) % value)
    deallocate(map % map(i) % key)
-    map % map(i) % value => map % consolidated_value ( map % consolidated_idx (i+1) :)
-    map % map(i) % key   => map % consolidated_key   ( map % consolidated_idx (i+1) :)
+    map % map(i) % value => map % consolidated_value ( map % consolidated_idx (i+1_8) :)
+    map % map(i) % key   => map % consolidated_key   ( map % consolidated_idx (i+1_8) :)
    map % map(i) % sorted = .True.
-    n_elements = int( map % consolidated_idx (i+2) - k, 4)
-    k = map % consolidated_idx (i+2)
+    n_elements = int( map % consolidated_idx (i+2_8) - k, 4)
+    k = map % consolidated_idx (i+2_8)
    map % map(i) % map_size = n_elements
    map % map(i) % n_elements = n_elements
    ! Load memory from disk
@ -116,7 +116,7 @@ subroutine map_load_from_disk(filename,map)
    enddo
  enddo
  map % sorted = x>0 .or. l == 0_8
-  map % n_elements = k-1
+  map % n_elements = k-1_8
  map % sorted = map % sorted .or. .True. 
  map % consolidated = .True.

--- a/src/Utils/map_module.f90
+++ b/src/Utils/map_module.f90
@ -13,7 +13,7 @@ module map_module
 ! cache_map using a binary search
 !
 ! When using the map_update subroutine to build the map,
-! the map_unique subroutine
+! the map_merge subroutine
 ! should be called before getting data from the map.

 use omp_lib
@ -274,7 +274,7 @@ subroutine map_sort(map)
  
 end

-subroutine cache_map_unique(map)
+subroutine cache_map_merge(map)
  use map_module
  implicit none
  type (cache_map_type), intent(inout) :: map
@ -298,6 +298,28 @@ subroutine cache_map_unique(map)
  
 end

+subroutine cache_map_unique(map)
+  use map_module
+  implicit none
+  ! Sorts the cache map, then compacts it in place so that each key is kept
+  ! only once: an entry is dropped when its key equals the key of the last
+  ! entry that was kept. n_elements is set to the number of kept entries.
+  type (cache_map_type), intent(inout) :: map
+  integer(cache_key_kind)        :: last_key
+  integer(cache_map_size_kind)   :: src, dst
+  
+  call cache_map_sort(map)
+
+  ! dst is the write cursor, src the read cursor (dst <= src at all times)
+  dst = 0
+  last_key = -1_8
+  do src=1,map%n_elements
+    if (map%key(src) == last_key) then
+      cycle
+    endif
+    dst = dst+1
+    last_key = map%key(src)
+    map%key(dst) = last_key
+    map%value(dst) = map%value(src)
+  enddo
+  map%n_elements = dst
+  
+end
+
 subroutine cache_map_shrink(map,thr)
  use map_module
  implicit none
@ -338,6 +360,27 @@ subroutine map_unique(map)
  
 end

+subroutine map_merge(map)
+  use map_module
+  implicit none
+  ! Calls cache_map_merge on every cache map of the map, each one under its
+  ! own lock, and sets map%n_elements to the sum of the n_elements of all
+  ! cache maps afterwards.
+  type (map_type), intent(inout) :: map
+  integer(map_size_kind)         :: ibucket
+  integer(map_size_kind)         :: n_total
+  
+  n_total = 0_8
+  !$OMP PARALLEL DO SCHEDULE(dynamic,1000) DEFAULT(SHARED) PRIVATE(ibucket)&
+      !$OMP REDUCTION(+:n_total)
+  do ibucket=0_8,map%map_size
+    call omp_set_lock(map%map(ibucket)%lock)
+    call cache_map_merge(map%map(ibucket))
+    call omp_unset_lock(map%map(ibucket)%lock)
+    ! Accumulated per thread, combined by the OpenMP reduction
+    n_total = n_total + map%map(ibucket)%n_elements
+  enddo
+  !$OMP END PARALLEL DO
+  map%n_elements = n_total
+  
+end
+
 subroutine map_shrink(map,thr)
  use map_module
  implicit none
@ -402,7 +445,7 @@ subroutine map_update(map, key, value, sze, thr)
          else
            ! Assert that the map has a proper size
            if (local_map%n_elements == local_map%map_size) then
-              call cache_map_unique(local_map)
+              call cache_map_merge(local_map)
              call cache_map_reallocate(local_map, local_map%n_elements + local_map%n_elements)
              call cache_map_shrink(local_map,thr)
            endif
--- a/src/Utils/transpose.irp.f
+++ b/src/Utils/transpose.irp.f
@ -47,6 +47,14 @@ recursive subroutine dtranspose(A,LDA,B,LDB,d1,d2)
 double precision, intent(in) :: A(LDA,d2)
 double precision, intent(out) :: B(LDB,d1)

+
+! do j=1,d1
+!   do i=1,d2
+!    B(i,j  ) = A(j  ,i)
+!   enddo
+! enddo
+! return
+
 integer :: i,j,k, mod_align
 if ( d2 < 32 ) then
   do j=1,d1
--- a/tests/bats/fci.bats
+++ b/tests/bats/fci.bats
@ -42,11 +42,12 @@ function run_FCI_ZMQ() {
  qp_set_mo_class h2o.ezfio -core "[1]" -act "[2-12]" -del "[13-24]"
 }
@test "FCI H2O cc-pVDZ" {
-  run_FCI h2o.ezfio 2000  -76.1253758241716     -76.1258130146102     
+  run_FCI h2o.ezfio 2000  -76.1253757275131     -76.1258128174355
 }



+
@test "FCI-ZMQ H2O cc-pVDZ" {
  run_FCI_ZMQ h2o.ezfio 2000 -76.1250552686394     -76.1258817228809  
 }
--- a/tests/run_tests.sh
+++ b/tests/run_tests.sh
@ -3,10 +3,10 @@
 LIST="
 convert.bats
 hf.bats
-pseudo.bats
 fci.bats
 cassd.bats
 mrcepa0.bats
+pseudo.bats
 "
 #foboci.bats