Four index transformation with DGEMM

2024-12-22 12:23:48 +01:00 · 2017-10-02 11:40:41 +02:00 · 2017-10-02 11:40:41 +02:00 · 574c4f1222
commit 574c4f1222
parent 08460ab6a4 1c58249f53
11 changed files with 561 additions and 32 deletions
--- a/plugins/FourIdx/four_index.irp.f
+++ b/plugins/FourIdx/four_index.irp.f
@ -0,0 +1,180 @@
+subroutine four_index_transform(map_a,map_c,matrix_B,LDB,            &
+      i_start, j_start, k_start, l_start,                            &
+      i_end  , j_end  , k_end  , l_end  ,                            &
+      a_start, b_start, c_start, d_start,                            &
+      a_end  , b_end  , c_end  , d_end  )
+  implicit none
+  use map_module
+  use mmap_module
+  BEGIN_DOC
+! Performs a four-index transformation of map_a(N^4) into map_c(M^4) using b(NxM)
+! C_{abcd} = \sum_{ijkl} A_{ijkl}.B_{ia}.B_{jb}.B_{kc}.B_{ld}
+! Loops run over *_start->*_end
+  END_DOC
+  type(map_type), intent(in)     :: map_a
+  type(map_type), intent(inout)  :: map_c
+  integer, intent(in)            :: LDB
+  double precision, intent(in)   :: matrix_B(LDB,*)
+  integer, intent(in)            :: i_start, j_start, k_start, l_start
+  integer, intent(in)            :: i_end  , j_end  , k_end  , l_end
+  integer, intent(in)            :: a_start, b_start, c_start, d_start
+  integer, intent(in)            :: a_end  , b_end  , c_end  , d_end
+
+  double precision, allocatable  :: T(:,:,:), U(:,:,:), V(:,:,:)
+  integer                        :: i_max, j_max, k_max, l_max
+  integer                        :: i_min, j_min, k_min, l_min
+  integer                        :: i, j, k, l
+  integer                        :: a, b, c, d
+  double precision, external     :: get_ao_bielec_integral
+  integer(key_kind)              :: idx
+  real(integral_kind)            :: tmp
+  integer(key_kind), allocatable :: key(:)
+  real(integral_kind), allocatable :: value(:)
+
+  ASSERT (k_start == i_start)
+  ASSERT (l_start == j_start)
+  ASSERT (a_start == c_start)
+  ASSERT (b_start == d_start)
+
+  i_min = min(i_start,a_start)
+  i_max = max(i_end  ,a_end  )
+  j_min = min(j_start,b_start)
+  j_max = max(j_end  ,b_end  )
+  k_min = min(k_start,c_start)
+  k_max = max(k_end  ,c_end  )
+  l_min = min(l_start,d_start)
+  l_max = max(l_end  ,d_end  )
+
+  ASSERT (0 < i_max)
+  ASSERT (0 < j_max)
+  ASSERT (0 < k_max)
+  ASSERT (0 < l_max)
+  ASSERT (LDB >= i_max)
+  ASSERT (LDB >= j_max)
+  ASSERT (LDB >= k_max)
+  ASSERT (LDB >= l_max)
+
+  ! Create a temporary memory-mapped file
+  integer                        :: fd
+  type(c_ptr)                    :: c_pointer
+  integer*8, pointer             :: a_array(:,:,:)
+  call mmap(trim(ezfio_filename)//'/work/four_idx',                  &
+      (/ 4_8,int(i_end-i_start+1,8),int(j_end-j_start+1,8),int(k_end-k_start+1,8), int(l_end-l_start+1,8) /), 8, fd, .False., c_pointer)
+  call c_f_pointer(c_pointer, a_array, (/ 4, (i_end-i_start+1)*(j_end-j_start+1)*(k_end-k_start+1), l_end-l_start+1 /))
+
+
+  !$OMP PARALLEL DEFAULT(NONE) SHARED(a_array,c_pointer,fd,          &
+      !$OMP  a_start,a_end,b_start,b_end,c_start,c_end,d_start,d_end,&
+      !$OMP  i_start,i_end,j_start,j_end,k_start,k_end,l_start,l_end,&
+      !$OMP  i_min,i_max,j_min,j_max,k_min,k_max,l_min,l_max,        &
+      !$OMP  map_a,map_c,matrix_B)                                   &
+      !$OMP  PRIVATE(key,value,T,U,V,i,j,k,l,idx,   &
+      !$OMP  a,b,c,d,tmp)
+  allocate( key(i_max*j_max*k_max), value(i_max*j_max*k_max) )
+  allocate( U(a_start:a_end, c_start:c_end, b_start:b_end) )
+
+
+  !$OMP DO SCHEDULE(dynamic,4)
+  do l=l_start,l_end
+    a = 1
+    do j=j_start,j_end
+      do k=k_start,k_end
+        do i=i_start,i_end
+          call bielec_integrals_index(i,j,k,l,idx)
+          call map_get(map_a,idx,tmp)
+          if (tmp /= 0.d0) then
+            a = a+1
+            a_array(1,a,l-l_start+1) = i
+            a_array(2,a,l-l_start+1) = j
+            a_array(3,a,l-l_start+1) = k
+            a_array(4,a,l-l_start+1) = transfer(dble(tmp), 1_8)
+          endif
+        enddo
+      enddo
+    enddo
+    a_array(1,1,l-l_start+1) = a
+    print *,  l
+  enddo
+  !$OMP END DO
+
+  !$OMP DO SCHEDULE(dynamic)
+  do d=d_start,d_end
+    U = 0.d0
+    do l=l_start,l_end
+      if (dabs(matrix_B(l,d)) < 1.d-10) then
+        cycle
+      endif
+      print *,  d, l
+
+      allocate( T(i_start:i_end, k_start:k_end, j_start:j_end), &
+                V(a_start:a_end, k_start:k_end, j_start:j_end) )
+
+      T = 0.d0
+      do a=2,a_array(1,1,l-l_start+1)
+        i = a_array(1,a,l-l_start+1)
+        j = a_array(2,a,l-l_start+1)
+        k = a_array(3,a,l-l_start+1)
+        T(i, k,j) = transfer(a_array(4,a,l-l_start+1), 1.d0)
+      enddo
+
+      call DGEMM('T','N', (a_end-a_start+1),                         &
+          (k_end-k_start+1)*(j_end-j_start+1),                       &
+          (i_end-i_start+1), 1.d0,                                   &
+          matrix_B(i_start,a_start), size(matrix_B,1),               &
+          T(i_start,k_start,j_start), size(T,1),  0.d0,              &
+          V(a_start,k_start,j_start), size(V, 1) )
+
+      deallocate(T)
+      allocate( T(a_start:a_end, k_start:k_end, b_start:d) )
+
+      call DGEMM('N','N', (a_end-a_start+1)*(k_end-k_start+1),       &
+              (b_end-b_start+1),                                     &
+              (j_end-j_start+1), 1.d0,                               &
+              V(a_start,k_start,j_start), size(V,1)*size(V,2),       &
+              matrix_B(j_start,b_start), size(matrix_B,1),0.d0,      &
+              T(a_start,k_start,b_start), size(T,1)*size(T,2) )
+
+      deallocate(V)
+
+      do b=b_start,b_end
+        call DGEMM('N','N', (a_end-a_start+1), (c_end-c_start+1),    &
+            (k_end-k_start+1), matrix_B(l, d),                   &
+            T(a_start,k_start,b), size(T,1),                     &
+            matrix_B(k_start,c_start), size(matrix_B,1), 1.d0,   &
+            U(a_start,c_start,b), size(U,1) )
+      enddo
+
+      deallocate(T)
+
+    enddo
+
+    idx = 0_8
+    do b=b_start,b_end
+      do c=c_start,c_end
+        do a=a_start,a_end
+          if (dabs(U(a,c,b)) < 1.d-15) then
+            cycle
+          endif
+          idx = idx+1_8
+          call bielec_integrals_index(a,b,c,d,key(idx))
+          value(idx) = U(a,c,b)
+        enddo
+      enddo
+    enddo
+
+    !$OMP CRITICAL
+    call map_append(map_c, key, value, idx) 
+    call map_sort(map_c)
+    !$OMP END CRITICAL
+
+
+  enddo
+  !$OMP END DO
+
+  deallocate(key,value)
+  !$OMP END PARALLEL
+
+  call munmap( &
+      (/ 4_8,int(i_end-i_start+1,8),int(j_end-j_start+1,8),int(k_end-k_start+1,8), int(l_end-l_start+1,8) /), 8, fd, c_pointer)
+
+end
--- a/plugins/FourIdx/four_index_sym.irp.f
+++ b/plugins/FourIdx/four_index_sym.irp.f
@ -0,0 +1,277 @@
+subroutine four_index_transform_sym(map_a,map_c,matrix_B,LDB,            &
+      i_start, j_start, k_start, l_start,                            &
+      i_end  , j_end  , k_end  , l_end  ,                            &
+      a_start, b_start, c_start, d_start,                            &
+      a_end  , b_end  , c_end  , d_end  )
+  implicit none
+  use map_module
+  use mmap_module
+  BEGIN_DOC
+! Performs a four-index transformation of map_a(N^4) into map_c(M^4) using b(NxM)
+! C_{abcd} = \sum_{ijkl} A_{ijkl}.B_{ia}.B_{jb}.B_{kc}.B_{ld}
+! Loops run over *_start->*_end
+  END_DOC
+  type(map_type), intent(in)     :: map_a
+  type(map_type), intent(inout)  :: map_c
+  integer, intent(in)            :: LDB
+  double precision, intent(in)   :: matrix_B(LDB,*)
+  integer, intent(in)            :: i_start, j_start, k_start, l_start
+  integer, intent(in)            :: i_end  , j_end  , k_end  , l_end
+  integer, intent(in)            :: a_start, b_start, c_start, d_start
+  integer, intent(in)            :: a_end  , b_end  , c_end  , d_end
+
+  double precision, allocatable  :: T(:,:), U(:,:,:), V(:,:)
+  double precision, allocatable  :: T2d(:,:), V2d(:,:)
+  integer                        :: i_max, j_max, k_max, l_max
+  integer                        :: i_min, j_min, k_min, l_min
+  integer                        :: i, j, k, l, ik, ll
+  integer                        :: a, b, c, d
+  double precision, external     :: get_ao_bielec_integral
+  integer*8                      :: ii
+  integer(key_kind)              :: idx
+  real(integral_kind)            :: tmp
+  integer(key_kind), allocatable :: key(:)
+  real(integral_kind), allocatable :: value(:)
+  integer*8, allocatable         :: l_pointer(:)
+
+  ASSERT (k_start == i_start)
+  ASSERT (l_start == j_start)
+  ASSERT (a_start == c_start)
+  ASSERT (b_start == d_start)
+
+  i_min = min(i_start,a_start)
+  i_max = max(i_end  ,a_end  )
+  j_min = min(j_start,b_start)
+  j_max = max(j_end  ,b_end  )
+  k_min = min(k_start,c_start)
+  k_max = max(k_end  ,c_end  )
+  l_min = min(l_start,d_start)
+  l_max = max(l_end  ,d_end  )
+
+  ASSERT (0 < i_max)
+  ASSERT (0 < j_max)
+  ASSERT (0 < k_max)
+  ASSERT (0 < l_max)
+  ASSERT (LDB >= i_max)
+  ASSERT (LDB >= j_max)
+  ASSERT (LDB >= k_max)
+  ASSERT (LDB >= l_max)
+
+  ! Create a temporary memory-mapped file
+  integer                        :: fd
+  type(c_ptr)                    :: c_pointer
+  integer*8, pointer             :: a_array(:)
+  call mmap(trim(ezfio_filename)//'/work/four_idx',                  &
+      (/ 12_8 * map_a % n_elements /), 8, fd, .False., c_pointer)
+  call c_f_pointer(c_pointer, a_array, (/ 12_8 * map_a % n_elements /))
+
+  allocate(l_pointer(l_start:l_end+1), value((i_max*k_max)) )
+  ii = 1_8
+  !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,j,k,l,ik,idx) 
+  do l=l_start,l_end
+    !$OMP SINGLE
+    l_pointer(l) = ii
+    !$OMP END SINGLE
+    do j=j_start,j_end
+      !$OMP DO SCHEDULE(static,1)
+      do k=k_start,k_end
+        do i=i_start,k
+          ik = (i-i_start+1) + ishft( (k-k_start)*(k-k_start+1), -1 )
+          call bielec_integrals_index(i,j,k,l,idx)
+          call map_get(map_a,idx,value(ik))
+        enddo
+      enddo
+      !$OMP END DO
+
+      !$OMP SINGLE
+      ik=0
+      do k=k_start,k_end
+        do i=i_start,k
+          ik = ik+1
+          tmp=value(ik)
+          if (tmp /= 0.d0) then
+            a_array(ii) = ik
+            ii = ii+1_8
+            a_array(ii) = j
+            ii = ii+1_8
+            a_array(ii) = transfer(dble(tmp), 1_8)
+            ii = ii+1_8
+          endif
+        enddo
+      enddo
+      !$OMP END SINGLE
+    enddo
+  enddo
+  !$OMP SINGLE
+  l_pointer(l_end+1) = ii
+  !$OMP END SINGLE
+  !$OMP END PARALLEL  
+  deallocate(value)
+
+!INPUT DATA
+!open(unit=10,file='INPUT',form='UNFORMATTED')
+!write(10) i_start, j_start, i_end, j_end
+!write(10) a_start, b_start, a_end, b_end
+!write(10) LDB, mo_tot_num
+!write(10) matrix_B(1:LDB,1:mo_tot_num)
+!idx=size(a_array)
+!write(10) idx
+!write(10) a_array
+!write(10) l_pointer
+!close(10)
+!open(unit=10,file='OUTPUT',form='FORMATTED')
+! END INPUT DATA
+
+
+  !$OMP PARALLEL DEFAULT(NONE) SHARED(a_array,c_pointer,fd,          &
+      !$OMP  a_start,a_end,b_start,b_end,c_start,c_end,d_start,d_end,&
+      !$OMP  i_start,i_end,j_start,j_end,k_start,k_end,l_start,l_end,&
+      !$OMP  i_min,i_max,j_min,j_max,k_min,k_max,l_min,l_max,        &
+      !$OMP  map_c,matrix_B,l_pointer)                         &
+      !$OMP  PRIVATE(key,value,T,U,V,i,j,k,l,idx,ik,ll,   &
+      !$OMP  a,b,c,d,tmp,T2d,V2d,ii)
+  allocate( key(i_max*j_max*k_max), value(i_max*j_max*k_max) )
+  allocate( U(a_start:a_end, c_start:c_end, b_start:b_end) )
+
+
+
+  allocate( T2d((i_end-i_start+1)*(k_end-k_start+2)/2, j_start:j_end), &
+            V2d((i_end-i_start+1)*(k_end-k_start+2)/2, b_start:b_end), &
+            V(i_start:i_end, k_start:k_end), &
+            T(k_start:k_end, a_start:a_end))
+
+
+  !$OMP DO SCHEDULE(dynamic)
+  do d=d_start,d_end
+    U = 0.d0
+    do l=l_start,l_end
+      if (dabs(matrix_B(l,d)) < 1.d-10) then
+        cycle
+      endif
+      
+      ii=l_pointer(l)
+      do j=j_start,j_end
+        ik=0
+        do k=k_start,k_end
+          do i=i_start,k
+            ik = ik+1
+            if ( (ik /= a_array(ii)).or.(j /= a_array(ii+1_8))  &
+                 .or.(ii >= l_pointer(l+1)) ) then
+              T2d(ik,j) = 0.d0
+            else
+              T2d(ik,j) = transfer(a_array(ii+2_8), 1.d0)
+              ii=ii+3_8
+            endif
+          enddo
+        enddo
+      enddo
+      call DGEMM('N','N', ishft( (i_end-i_start+1)*(i_end-i_start+2), -1),&
+          (d-b_start+1),                                             &
+          (j_end-j_start+1), 1.d0,                                   &
+          T2d(1,j_start), size(T2d,1),                               &
+          matrix_B(j_start,b_start), size(matrix_B,1),0.d0,          &
+          V2d(1,b_start), size(V2d,1) )
+
+      do b=b_start,d
+        ik = 0
+        do k=k_start,k_end
+          do i=i_start,k
+            ik = ik+1
+            V(i,k) = V2d(ik,b)
+          enddo
+        enddo
+
+!        T = 0.d0
+!        do a=a_start,b
+!          do k=k_start,k_end
+!            do i=i_start,k
+!              T(k,a) = T(k,a) + V(i,k)*matrix_B(i,a)
+!            enddo
+!            do i=k+1,i_end
+!              T(k,a) = T(k,a) + V(k,i)*matrix_B(i,a)
+!            enddo
+!          enddo
+!        enddo
+        call DSYMM('L','U', (k_end-k_start+1), (b-a_start+1),        &
+            1.d0,                                                    &
+            V(i_start,k_start), size(V,1),                           &
+            matrix_B(i_start,a_start), size(matrix_B,1),0.d0,        &
+            T(k_start,a_start), size(T,1) )
+
+!        do c=c_start,b
+!          do a=a_start,c
+!            do k=k_start,k_end
+!              U(a,c,b) = U(a,c,b) + T(k,a)*matrix_B(k,c)*matrix_B(l,d)
+!            enddo
+!          enddo
+!        enddo
+        call DGEMM('T','N', (b-a_start+1), (b-c_start+1),            &
+            (k_end-k_start+1), matrix_B(l, d),                       &
+            T(k_start,a_start), size(T,1),                           &
+            matrix_B(k_start,c_start), size(matrix_B,1), 1.d0,       &
+            U(a_start,c_start,b), size(U,1) )
+!        do c=b+1,c_end
+!          do a=a_start,b
+!            do k=k_start,k_end
+!              U(a,c,b) = U(a,c,b) + T(k,a)*matrix_B(k,c)*matrix_B(l,d)
+!            enddo
+!          enddo
+!        enddo
+        if (b < b_end) then
+          call DGEMM('T','N', (b-a_start+1), (c_end-b),              &
+              (k_end-k_start+1), matrix_B(l, d),                     &
+              T(k_start,a_start), size(T,1),                         &
+              matrix_B(k_start,b+1), size(matrix_B,1), 1.d0,         &
+              U(a_start,b+1,b), size(U,1) )
+        endif
+      enddo
+
+    enddo
+
+    idx = 0_8
+    do b=b_start,d
+      do c=c_start,c_end
+        do a=a_start,min(b,c)
+          if (dabs(U(a,c,b)) < 1.d-15) then
+            cycle
+          endif
+          idx = idx+1_8
+          call bielec_integrals_index(a,b,c,d,key(idx))
+          value(idx) = U(a,c,b)
+        enddo
+      enddo
+    enddo
+
+    !$OMP CRITICAL
+    call map_append(map_c, key, value, idx) 
+    !$OMP END CRITICAL
+
+!!$OMP CRITICAL
+!WRITE OUTPUT
+!print *,  d
+!do b=b_start,d
+!  do c=c_start,c_end
+!    do a=a_start,min(b,c)
+!      if (dabs(U(a,c,b)) < 1.d-15) then
+!        cycle
+!      endif
+!      write(10,*) d,c,b,a,U(a,c,b)
+!    enddo
+!  enddo
+!enddo
+!END WRITE OUTPUT
+!!$OMP END CRITICAL
+
+
+  enddo
+  !$OMP END DO
+
+  deallocate(key,value,V,T)
+  !$OMP END PARALLEL
+  call map_sort(map_c)
+
+  call munmap( &
+      (/ 12_8 * map_a % n_elements /), 8, fd, c_pointer)
+  deallocate(l_pointer)
+
+end
--- a/plugins/Full_CI_ZMQ/NEEDED_CHILDREN_MODULES
+++ b/plugins/Full_CI_ZMQ/NEEDED_CHILDREN_MODULES
@ -1 +1 @@
-Perturbation Selectors_full Generators_full ZMQ 
+Perturbation Selectors_full Generators_full ZMQ FourIdx
--- a/plugins/Full_CI_ZMQ/pt2_stoch_routines.irp.f
+++ b/plugins/Full_CI_ZMQ/pt2_stoch_routines.irp.f
@ -350,12 +350,12 @@ subroutine get_first_tooth(computed, first_teeth)
 end subroutine


-BEGIN_PROVIDER [ integer, size_tbc ]
+BEGIN_PROVIDER [ integer*8, size_tbc ]
  implicit none
  BEGIN_DOC
 ! Size of the tbc array
  END_DOC
-  size_tbc = (comb_teeth+1)*N_det_generators + fragment_count*fragment_first
+  size_tbc = int((comb_teeth+1),8)*int(N_det_generators,8) + fragment_count*fragment_first
 END_PROVIDER

 subroutine get_carlo_workbatch(computed, comb, Ncomb, tbc)
--- a/plugins/read_integral/print_integrals_ao.irp.f
+++ b/plugins/read_integral/print_integrals_ao.irp.f
@ -70,23 +70,37 @@ subroutine run
  call get_cache_map_n_elements_max(ao_integrals_map,n_elements_max)
  allocate(keys(n_elements_max), values(n_elements_max))

-  do i8=0_8,ao_integrals_map%map_size
-     n_elements = n_elements_max
-     call get_cache_map(ao_integrals_map,i8,keys,values,n_elements)
-     do k1=1,n_elements
-      call bielec_integrals_index_reverse(kk,ii,ll,jj,keys(k1))
-      if ( (kk(1)>ao_num).or.                                        &
-            (ii(1)>ao_num).or.                                       &
-            (jj(1)>ao_num).or.                                       &
-            (ll(1)>ao_num) ) then
-            cycle
-      endif
-      k = kk(1)
-      i = ii(1)
-      l = ll(1)
-      j = jj(1)
-      integral = values(k1)
-      write (iunit,'(4(I5,X),D22.15)') k,i,l,j, integral 
+!  do i8=0_8,ao_integrals_map%map_size
+!     n_elements = n_elements_max
+!     call get_cache_map(ao_integrals_map,i8,keys,values,n_elements)
+!     do k1=1,n_elements
+!      call bielec_integrals_index_reverse(kk,ii,ll,jj,keys(k1))
+!      if ( (kk(1)>ao_num).or.                                        &
+!            (ii(1)>ao_num).or.                                       &
+!            (jj(1)>ao_num).or.                                       &
+!            (ll(1)>ao_num) ) then
+!            cycle
+!      endif
+!      k = kk(1)
+!      i = ii(1)
+!      l = ll(1)
+!      j = jj(1)
+!      integral = values(k1)
+!      write (iunit,'(4(I6,X),F20.15)') k,i,l,j, integral 
+!    enddo
+!  enddo
+
+  do i=1,ao_num
+    do k=1,ao_num
+      do j=1,ao_num
+        do l=1,ao_num
+          double precision, external :: get_ao_bielec_integral
+          integral = get_ao_bielec_integral(i,j,k,l,ao_integrals_map)
+          if (dabs(integral)>=1.e-15) then
+            write (iunit,'(4(I6),F20.15)') i,j,k,l, integral 
+          endif
+        enddo
+      enddo
    enddo
  enddo

--- a/plugins/read_integral/print_integrals_mo.irp.f
+++ b/plugins/read_integral/print_integrals_mo.irp.f
@ -49,7 +49,7 @@ program print_integrals
          double precision :: get_mo_bielec_integral
          integral = get_mo_bielec_integral(i,j,k,l,mo_integrals_map)
          if (dabs(integral) > mo_integrals_threshold) then
-            write (iunit,'(4(I5,X),D22.15)') i,j,k,l, integral 
+            write (iunit,'(4(I6,X),F20.15)') i,j,k,l, integral 
          endif
       !end if
     enddo
--- a/plugins/read_integral/read_integrals_ao.irp.f
+++ b/plugins/read_integral/read_integrals_ao.irp.f
@ -68,6 +68,7 @@ subroutine run
  call insert_into_ao_integrals_map(n_integrals,buffer_i,buffer_values)

  call map_sort(ao_integrals_map)
+  call map_unique(ao_integrals_map)

  call map_save_to_disk(trim(ezfio_filename)//'/work/ao_ints',ao_integrals_map)
  call ezfio_set_integrals_bielec_disk_access_ao_integrals('Read')
--- a/plugins/read_integral/read_integrals_mo.irp.f
+++ b/plugins/read_integral/read_integrals_mo.irp.f
@ -1,5 +1,10 @@
 program read_integrals
-
+  BEGIN_DOC
+! Reads the integrals from the following files:
+! - kinetic_mo
+! - nuclear_mo
+! - bielec_mo
+  END_DOC
  PROVIDE ezfio_filename
  call ezfio_set_integrals_monoelec_disk_access_mo_one_integrals("None")
  call run
--- a/src/Determinants/two_body_dm_map.irp.f
+++ b/src/Determinants/two_body_dm_map.irp.f
@ -187,7 +187,7 @@ subroutine add_values_to_two_body_dm_map(mask_ijkl)
  print*,'n_elements = ',n_elements
  call insert_into_two_body_dm_ab_map(n_elements,buffer_i,buffer_value,&
      real(mo_integrals_threshold,integral_kind))
-  call map_unique(two_body_dm_ab_map)
+  call map_merge(two_body_dm_ab_map)

  deallocate(buffer_i,buffer_value)

--- a/src/Integrals_Bielec/mo_bi_integrals.irp.f
+++ b/src/Integrals_Bielec/mo_bi_integrals.irp.f
@ -117,7 +117,16 @@ BEGIN_PROVIDER [ logical, mo_bielec_integrals_in_map ]
    endif
    
  else
-    call add_integrals_to_map(full_ijkl_bitmask_4)
+!    call add_integrals_to_map(full_ijkl_bitmask_4)
+    call four_index_transform_sym(ao_integrals_map,mo_integrals_map,   &
+        mo_coef, size(mo_coef,1),                                      &
+        1, 1, 1, 1, ao_num, ao_num, ao_num, ao_num,                    &
+        1, 1, 1, 1, mo_tot_num, mo_tot_num, mo_tot_num, mo_tot_num)
+  
+    integer*8                      :: get_mo_map_size, mo_map_size
+    mo_map_size = get_mo_map_size()
+    
+    print*,'Molecular integrals provided'
  endif
  if (write_mo_integrals) then
    call ezfio_set_work_empty(.False.)
@ -146,7 +155,7 @@ subroutine set_integrals_jj_into_map
  enddo
  call insert_into_mo_integrals_map(n_integrals,buffer_i,buffer_value,&
      real(mo_integrals_threshold,integral_kind))
-  call map_unique(mo_integrals_map)
+  call map_merge(mo_integrals_map)
 end

 subroutine set_integrals_exchange_jj_into_map
@ -167,7 +176,7 @@ subroutine set_integrals_exchange_jj_into_map
  enddo
  call insert_into_mo_integrals_map(n_integrals,buffer_i,buffer_value,&
      real(mo_integrals_threshold,integral_kind))
-  call map_unique(mo_integrals_map)
+  call map_merge(mo_integrals_map)
  
 end

@ -458,7 +467,7 @@ subroutine add_integrals_to_map(mask_ijkl)
      real(mo_integrals_threshold,integral_kind))
  deallocate(buffer_i, buffer_value)
  !$OMP END PARALLEL
-  call map_unique(mo_integrals_map)
+  call map_merge(mo_integrals_map)
  
  call wall_time(wall_2)
  call cpu_time(cpu_2)
@ -773,7 +782,7 @@ subroutine add_integrals_to_map_three_indices(mask_ijk)
      real(mo_integrals_threshold,integral_kind))
  deallocate(buffer_i, buffer_value)
  !$OMP END PARALLEL
-  call map_unique(mo_integrals_map)
+  call map_merge(mo_integrals_map)
  
  call wall_time(wall_2)
  call cpu_time(cpu_2)
@ -1035,7 +1044,7 @@ subroutine add_integrals_to_map_no_exit_34(mask_ijkl)
  !  print*, 'Communicating the map'
  !  call communicate_mo_integrals()
  !IRP_ENDIF
-  call map_unique(mo_integrals_map)
+  call map_merge(mo_integrals_map)
  
  call wall_time(wall_2)
  call cpu_time(cpu_2)
--- a/src/Utils/map_module.f90
+++ b/src/Utils/map_module.f90
@ -13,7 +13,7 @@ module map_module
 ! cache_map using a binary search
 !
 ! When using the map_update subroutine to build the map,
-! the map_unique subroutine
+! the map_merge subroutine
 ! should be called before getting data from the map.

 use omp_lib
@ -274,7 +274,7 @@ subroutine map_sort(map)
  
 end

-subroutine cache_map_unique(map)
+subroutine cache_map_merge(map)
  use map_module
  implicit none
  type (cache_map_type), intent(inout) :: map
@ -298,6 +298,28 @@ subroutine cache_map_unique(map)
  
 end

+subroutine cache_map_unique(map)
+  use map_module
+  implicit none
+  type (cache_map_type), intent(inout) :: map
+  integer(cache_key_kind)        :: prev_key
+  integer(cache_map_size_kind)   :: i, j
+  
+  call cache_map_sort(map)
+  prev_key = -1_8
+  j=0
+  do i=1,map%n_elements
+    if (map%key(i) /= prev_key) then
+      j = j+1
+      map%value(j) = map%value(i)
+      map%key(j) = map%key(i)
+      prev_key = map%key(i)
+    endif
+  enddo
+  map%n_elements = j
+  
+end
+
 subroutine cache_map_shrink(map,thr)
  use map_module
  implicit none
@ -338,6 +360,27 @@ subroutine map_unique(map)
  
 end

+subroutine map_merge(map)
+  use map_module
+  implicit none
+  type (map_type), intent(inout) :: map
+  integer(map_size_kind)         :: i
+  integer(map_size_kind)         :: icount
+  
+  icount = 0_8
+  !$OMP PARALLEL DO SCHEDULE(dynamic,1000) DEFAULT(SHARED) PRIVATE(i)&
+      !$OMP REDUCTION(+:icount)
+  do i=0_8,map%map_size
+    call omp_set_lock(map%map(i)%lock)
+    call cache_map_merge(map%map(i))
+    call omp_unset_lock(map%map(i)%lock)
+    icount = icount + map%map(i)%n_elements
+  enddo
+  !$OMP END PARALLEL DO
+  map%n_elements = icount
+  
+end
+
 subroutine map_shrink(map,thr)
  use map_module
  implicit none
@ -402,7 +445,7 @@ subroutine map_update(map, key, value, sze, thr)
          else
            ! Assert that the map has a proper size
            if (local_map%n_elements == local_map%map_size) then
-              call cache_map_unique(local_map)
+              call cache_map_merge(local_map)
              call cache_map_reallocate(local_map, local_map%n_elements + local_map%n_elements)
              call cache_map_shrink(local_map,thr)
            endif