subroutine four_index_transform_slave_work(map_a,matrix_B,LDB,            &
      i_start, j_start, k_start, l_start,                            &
      i_end  , j_end  , k_end  , l_end  ,                            &
      a_start, b_start, c_start, d_start,                            &
      a_end  , b_end  , c_end  , d_end, task_id, worker_id, thread  )
  implicit none
  use f77_zmq
  use map_module
  use mmap_module
  BEGIN_DOC
! Performs a four-index transformation of map_a(N^4) into map_c(M^4) using b(NxM)
! C_{abcd} = \sum_{ijkl} A_{ijkl}.B_{ia}.B_{jb}.B_{kc}.B_{ld}
! Loops run over *_start->*_end
  END_DOC
  type(map_type), intent(in)     :: map_a
  integer, intent(in)            :: LDB
  double precision, intent(in)   :: matrix_B(LDB,*)
  integer, intent(in)            :: i_start, j_start, k_start, l_start
  integer, intent(in)            :: i_end  , j_end  , k_end  , l_end
  integer, intent(in)            :: a_start, b_start, c_start, d_start
  integer, intent(in)            :: a_end  , b_end  , c_end  , d_end
  integer, intent(in)            :: task_id, thread, worker_id

  double precision, allocatable  :: T(:,:), U(:,:,:), V(:,:)
  double precision, allocatable  :: T2d(:,:), V2d(:,:)
  integer                        :: i_max, j_max, k_max, l_max
  integer                        :: i_min, j_min, k_min, l_min
  integer                        :: i, j, k, l, ik, ll
  integer                        :: a, b, c, d
  double precision, external     :: get_ao_bielec_integral
  integer*8                      :: ii
  integer(key_kind)              :: idx
  real(integral_kind)            :: tmp
  integer(key_kind), allocatable :: key(:)
  real(integral_kind), allocatable :: value(:)
  integer*8, allocatable         :: l_pointer(:)

  ASSERT (k_start == i_start)
  ASSERT (l_start == j_start)
  ASSERT (a_start == c_start)
  ASSERT (b_start == d_start)

  i_min = min(i_start,a_start)
  i_max = max(i_end  ,a_end  )
  j_min = min(j_start,b_start)
  j_max = max(j_end  ,b_end  )
  k_min = min(k_start,c_start)
  k_max = max(k_end  ,c_end  )
  l_min = min(l_start,d_start)
  l_max = max(l_end  ,d_end  )

  ASSERT (0 < i_max)
  ASSERT (0 < j_max)
  ASSERT (0 < k_max)
  ASSERT (0 < l_max)
  ASSERT (LDB >= i_max)
  ASSERT (LDB >= j_max)
  ASSERT (LDB >= k_max)
  ASSERT (LDB >= l_max)

  integer*4, allocatable         :: a_array_ik(:)
  integer*2, allocatable         :: a_array_j(:)
  double precision, allocatable  :: a_array_value(:)

  integer*8 :: new_size
  new_size = max(2048_8, 5_8 * map_a % n_elements )

  integer*8 :: tempspace
  integer   :: npass, l_block

  tempspace = (new_size * 16_8) / (2048_8 * 2048_8)
  npass = int(min(1_8+int(l_end-l_start,8),1_8 + tempspace / 2048_8),4)   ! 2 GiB of scratch space
  l_block = (l_end-l_start+1)/npass

  allocate(a_array_ik(new_size/npass), a_array_j(new_size/npass), a_array_value(new_size/npass))


  allocate(l_pointer(l_start:l_end+1), value((i_max*k_max)) )
  ii = 1_8
  !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,j,k,l,ik,idx) 
  do l=l_start,l_end
    !$OMP SINGLE
    l_pointer(l) = ii
    !$OMP END SINGLE
    do j=j_start,j_end
      !$OMP DO SCHEDULE(static,1)
      do k=k_start,k_end
        do i=i_start,k
          ik = (i-i_start+1) + ishft( (k-k_start)*(k-k_start+1), -1 )
          call bielec_integrals_index(i,j,k,l,idx)
          call map_get(map_a,idx,value(ik))
        enddo
      enddo
      !$OMP END DO

      !$OMP SINGLE
      ik=0
      do k=k_start,k_end
        do i=i_start,k
          ik = ik+1
          tmp=value(ik)
          if (tmp /= 0.d0) then
            a_array_ik(ii) = ik
            a_array_j(ii)  = int(j,2)  ! Warning: integer*2
            a_array_value(ii) = tmp
            ii=ii+1_8
          endif
        enddo
      enddo
      !$OMP END SINGLE
    enddo
  enddo
  !$OMP SINGLE
  a_array_ik(ii) = 0
  a_array_j(ii)  = 0
  a_array_value(ii) = 0.d0
  l_pointer(l_end+1) = ii
  !$OMP END SINGLE
  !$OMP END PARALLEL  
  deallocate(value)

!INPUT DATA
!open(unit=10,file='INPUT',form='UNFORMATTED')
!write(10) i_start, j_start, i_end, j_end
!write(10) a_start, b_start, a_end, b_end
!write(10) LDB, mo_tot_num
!write(10) matrix_B(1:LDB,1:mo_tot_num)
!idx=size(a_array)
!write(10) idx
!write(10) a_array
!write(10) l_pointer
!close(10)
!open(unit=10,file='OUTPUT',form='FORMATTED')
! END INPUT DATA

  !$OMP PARALLEL DEFAULT(NONE) SHARED(a_array_ik,a_array_j,a_array_value, &
      !$OMP  a_start,a_end,b_start,b_end,c_start,c_end,d_start,d_end,&
      !$OMP  i_start,i_end,j_start,j_end,k_start,k_end,l_start,l_end,&
      !$OMP  i_min,i_max,j_min,j_max,k_min,k_max,l_min,l_max,        &
      !$OMP  matrix_B,l_pointer,thread,task_id,worker_id) &
      !$OMP  PRIVATE(key,value,T,U,V,i,j,k,l,idx,ik,ll,zmq_to_qp_run_socket,   &
      !$OMP  a,b,c,d,p,q,tmp,T2d,V2d,ii,zmq_socket_push) 

  integer(ZMQ_PTR)               :: zmq_socket_push
  integer(ZMQ_PTR), external     :: new_zmq_push_socket
  zmq_socket_push = new_zmq_push_socket(thread)


  allocate( U(a_start:a_end, c_start:c_end, b_start:b_end) )

  !$OMP DO SCHEDULE(dynamic,1)
  do d=d_start,d_end
    U = 0.d0
    do l=l_start,l_end
      if (dabs(matrix_B(l,d)) < 1.d-10) then
        cycle
      endif
      
      allocate( T2d((i_end-i_start+1)*(k_end-k_start+2)/2, j_start:j_end) )
      ii=l_pointer(l)
      do j=j_start,j_end
        !DIR$ VECTOR NONTEMPORAL
        T2d(:,j) = 0.d0
        !DIR$ IVDEP
        do while (j == a_array_j(ii))
          T2d(a_array_ik(ii),j) = transfer(a_array_value(ii), 1.d0)
          ii = ii + 1_8
        enddo
      enddo

      allocate (V2d((i_end-i_start+1)*(k_end-k_start+2)/2, b_start:b_end)) 
      call DGEMM('N','N', ishft( (i_end-i_start+1)*(i_end-i_start+2), -1),&
          (d-b_start+1),                                             &
          (j_end-j_start+1), 1.d0,                                   &
          T2d(1,j_start), size(T2d,1),                               &
          matrix_B(j_start,b_start), size(matrix_B,1),0.d0,          &
          V2d(1,b_start), size(V2d,1) )
      deallocate(T2d)

      allocate( V(i_start:i_end, k_start:k_end), T(k_start:k_end, a_start:a_end))
      do b=b_start,d
        ik = 0
        do k=k_start,k_end
          do i=i_start,k
            ik = ik+1
            V(i,k) = V2d(ik,b)
          enddo
        enddo

        call DSYMM('L','U', (k_end-k_start+1), (b-a_start+1),        &
            1.d0,                                                    &
            V(i_start,k_start), size(V,1),                           &
            matrix_B(i_start,a_start), size(matrix_B,1),0.d0,        &
            T(k_start,a_start), size(T,1) )

        call DGEMM('T','N', (b-a_start+1), (b-c_start+1),            &
            (k_end-k_start+1), matrix_B(l, d),                       &
            T(k_start,a_start), size(T,1),                           &
            matrix_B(k_start,c_start), size(matrix_B,1), 1.d0,       &
            U(a_start,c_start,b), size(U,1) )

        if (b < b_end) then
          call DGEMM('T','N', (b-a_start+1), (c_end-b),              &
              (k_end-k_start+1), matrix_B(l, d),                     &
              T(k_start,a_start), size(T,1),                         &
              matrix_B(k_start,b+1), size(matrix_B,1), 1.d0,         &
              U(a_start,b+1,b), size(U,1) )
        endif
      enddo
      deallocate(T,V,V2d)
    enddo

    idx = 0_8

    allocate( key(i_max*j_max*k_max), value(i_max*j_max*k_max) )
    integer :: p, q
    do b=b_start,d
      q = b+ishft(d*d-d,-1)
      do c=c_start,c_end
        p = a_start+ishft(c*c-c,-1)
        do a=a_start,min(b,c)
          if (dabs(U(a,c,b)) < 1.d-15) then
            cycle
          endif
          if ((a==b).and.(p>q)) cycle
          p = p+1
          idx = idx+1_8
          call bielec_integrals_index(a,b,c,d,key(idx))
!print *,  int(key(idx),4), int(a,2),int(b,2),int(c,2),int(d,2), p, q
          value(idx) = U(a,c,b)
        enddo
      enddo
    enddo

    call four_idx_push_results(zmq_socket_push, key, value, idx, -task_id)
    deallocate(key,value)

  enddo
  deallocate(U)

  !$OMP BARRIER
  !$OMP MASTER
  integer(ZMQ_PTR)               :: zmq_to_qp_run_socket
  integer(ZMQ_PTR), external     :: task_done_to_taskserver, new_zmq_to_qp_run_socket
  zmq_to_qp_run_socket = new_zmq_to_qp_run_socket()
  if (task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id) == -1) then
    stop 'Unable to send task done'
  endif
  call four_idx_push_results(zmq_socket_push, 0_8, 0.d0, 0, task_id)
  call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket)
  !$OMP END MASTER
  call end_zmq_push_socket(zmq_socket_push)
  !$OMP END PARALLEL


  deallocate(l_pointer,a_array_ik,a_array_j,a_array_value)

end