Load balancing in 4idx

2025-01-03 10:05:57 +01:00 · 2017-12-14 14:01:30 +01:00 · 2017-12-14 14:01:30 +01:00 · af229e7028
commit af229e7028
parent 31b3b7dd79
2 changed files with 28 additions and 44 deletions
--- a/src/FourIdx/four_index_slave.irp.f
+++ b/src/FourIdx/four_index_slave.irp.f
@ -2,7 +2,7 @@ subroutine four_index_transform_slave_work(map_a,matrix_B,LDB,            &
      i_start, j_start, k_start, l_start,                            &
      i_end  , j_end  , k_end  , l_end  ,                            &
      a_start, b_start, c_start, d_start,                            &
-      a_end  , b_end  , c_end  , d_end, task_id, thread  )
+      a_end  , b_end  , c_end  , d_end, task_id, worker_id, thread  )
  implicit none
  use f77_zmq
  use map_module
@ -19,7 +19,7 @@ subroutine four_index_transform_slave_work(map_a,matrix_B,LDB,            &
  integer, intent(in)            :: i_end  , j_end  , k_end  , l_end
  integer, intent(in)            :: a_start, b_start, c_start, d_start
  integer, intent(in)            :: a_end  , b_end  , c_end  , d_end
-  integer, intent(in)            :: task_id, thread
+  integer, intent(in)            :: task_id, thread, worker_id

  double precision, allocatable  :: T(:,:), U(:,:,:), V(:,:)
  double precision, allocatable  :: T2d(:,:), V2d(:,:)
@ -63,13 +63,13 @@ subroutine four_index_transform_slave_work(map_a,matrix_B,LDB,            &
  double precision, allocatable  :: a_array_value(:)

  integer*8 :: new_size
-  new_size = max(1024_8, 5_8 * map_a % n_elements )
+  new_size = max(2048_8, 5_8 * map_a % n_elements )

  integer*8 :: tempspace
  integer   :: npass, l_block

-  tempspace = (new_size * 16_8) / (1024_8 * 1024_8)
-  npass = int(min(1_8+int(l_end-l_start,8),1_8 + tempspace / 1024_8),4)   ! 1 GiB of scratch space
+  tempspace = (new_size * 16_8) / (2048_8 * 2048_8)
+  npass = int(min(1_8+int(l_end-l_start,8),1_8 + tempspace / 2048_8),4)   ! 2 GiB of scratch space
  l_block = (l_end-l_start+1)/npass

  allocate(a_array_ik(new_size/npass), a_array_j(new_size/npass), a_array_value(new_size/npass))
@ -133,17 +133,13 @@ subroutine four_index_transform_slave_work(map_a,matrix_B,LDB,            &
 !open(unit=10,file='OUTPUT',form='FORMATTED')
 ! END INPUT DATA

-  PROVIDE nproc
-  integer :: n_running_threads
-  n_running_threads = 0
-  call omp_set_nested(.true.)
  !$OMP PARALLEL DEFAULT(NONE) SHARED(a_array_ik,a_array_j,a_array_value, &
      !$OMP  a_start,a_end,b_start,b_end,c_start,c_end,d_start,d_end,&
      !$OMP  i_start,i_end,j_start,j_end,k_start,k_end,l_start,l_end,&
      !$OMP  i_min,i_max,j_min,j_max,k_min,k_max,l_min,l_max,        &
-      !$OMP  matrix_B,l_pointer,thread,task_id,n_running_threads,nproc) &
-      !$OMP  PRIVATE(key,value,T,U,V,i,j,k,l,idx,ik,ll,   &
-      !$OMP  a,b,c,d,p,q,tmp,T2d,V2d,ii,zmq_socket_push)
+      !$OMP  matrix_B,l_pointer,thread,task_id,worker_id) &
+      !$OMP  PRIVATE(key,value,T,U,V,i,j,k,l,idx,ik,ll,zmq_to_qp_run_socket,   &
+      !$OMP  a,b,c,d,p,q,tmp,T2d,V2d,ii,zmq_socket_push) 

  integer(ZMQ_PTR)               :: zmq_socket_push
  integer(ZMQ_PTR), external     :: new_zmq_push_socket
@ -153,9 +149,6 @@ subroutine four_index_transform_slave_work(map_a,matrix_B,LDB,            &

  allocate( U(a_start:a_end, c_start:c_end, b_start:b_end) )

-  !$OMP ATOMIC
-  n_running_threads = n_running_threads+1
-
  !$OMP DO SCHEDULE(dynamic,1)
  do d=d_start,d_end
    U = 0.d0
@ -185,13 +178,7 @@ subroutine four_index_transform_slave_work(map_a,matrix_B,LDB,            &
          V2d(1,b_start), size(V2d,1) )
      deallocate(T2d)

-      !$OMP FLUSH(n_running_threads)
-      !$OMP PARALLEL DEFAULT(NONE) SHARED(a_array_ik,a_array_j,a_array_value, &
-          !$OMP  a_start,b_start,b_end,c_start,c_end,i_start,k_start,k_end,   &
-          !$OMP  matrix_B,U,l,d,V2d,i_end,a_end,n_running_threads) &
-          !$OMP  PRIVATE(T,V,i,k,ik) NUM_THREADS(nproc-n_running_threads+1)
      allocate( V(i_start:i_end, k_start:k_end), T(k_start:k_end, a_start:a_end))
-      !$OMP DO SCHEDULE(static,1)
      do b=b_start,d
        ik = 0
        do k=k_start,k_end
@ -221,10 +208,7 @@ subroutine four_index_transform_slave_work(map_a,matrix_B,LDB,            &
              U(a_start,b+1,b), size(U,1) )
        endif
      enddo
-      deallocate(T,V)
-      !$OMP END PARALLEL
-
-      deallocate(V2d)
+      deallocate(T,V,V2d)
    enddo

    idx = 0_8
@ -253,19 +237,23 @@ subroutine four_index_transform_slave_work(map_a,matrix_B,LDB,            &
    deallocate(key,value)

  enddo
-  !$OMP END DO NOWAIT
  deallocate(U)

-  !$OMP ATOMIC
-  n_running_threads = n_running_threads-1
-
  !$OMP BARRIER
  !$OMP MASTER
+  integer(ZMQ_PTR)               :: zmq_to_qp_run_socket
+  integer(ZMQ_PTR), external     :: task_done_to_taskserver, new_zmq_to_qp_run_socket
+  zmq_to_qp_run_socket = new_zmq_to_qp_run_socket()
+  if (task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id) == -1) then
+    stop 'Unable to send task done'
+  endif
  call four_idx_push_results(zmq_socket_push, 0_8, 0.d0, 0, task_id)
+  call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket)
  !$OMP END MASTER
  call end_zmq_push_socket(zmq_socket_push)
  !$OMP END PARALLEL

+
  deallocate(l_pointer,a_array_ik,a_array_j,a_array_value)

 end
--- a/src/FourIdx/four_index_zmq.irp.f
+++ b/src/FourIdx/four_index_zmq.irp.f
@ -62,13 +62,13 @@ subroutine four_index_transform_zmq(map_a,map_c,matrix_B,LDB,        &
  call new_parallel_job(zmq_to_qp_run_socket,zmq_socket_pull,'four_idx')

  integer*8 :: new_size
-  new_size = max(1024_8, 5_8 * map_a % n_elements )
+  new_size = max(2048_8, 5_8 * map_a % n_elements )

  integer :: npass
  integer*8 :: tempspace

-  tempspace = (new_size * 16_8) / (1024_8 * 1024_8)
-  npass = int(min(int(l_end-l_start,8),1_8 + tempspace / 1024_8),4)   ! 1 GiB of scratch space
+  tempspace = (new_size * 16_8) / (2048_8 * 2048_8)
+  npass = int(min(int(l_end-l_start,8),1_8 + tempspace / 2048_8),4)   ! 2 GiB of scratch space
  l_block = (l_end-l_start+1)/npass

  ! Create tasks
@ -98,9 +98,11 @@ subroutine four_index_transform_zmq(map_a,map_c,matrix_B,LDB,        &

  PROVIDE nproc

+  integer :: ithread, sqnproc
+  sqnproc = int(sqrt(float(nproc))+0.5)
  call omp_set_nested(.True.)
-  integer :: ithread
-  !$OMP PARALLEL NUM_THREADS(2) PRIVATE(ithread)
+  call omp_set_dynamic(.True.)
+  !$OMP PARALLEL NUM_THREADS(1+max(1,sqnproc)) PRIVATE(ithread)
  ithread = omp_get_thread_num()
  if (ithread==0) then
    call four_idx_collector(zmq_socket_pull,map_c)
@ -178,19 +180,13 @@ subroutine four_index_transform_slave(thread,worker_id)
        i_end  , j_end  , k_end  , l_end_block  ,                    &
        a_start, b_start, c_start, d_start,                          &
        a_end  , b_end  , c_end  , d_end,                            &
-        task_id, thread)
+        task_id, worker_id, thread)

-    integer, external              :: task_done_to_taskserver
-    if (task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id) == -1) then
-      print *,  irp_here, ': Unable  to send task_done'
-      stop
-    endif

  enddo
  integer, external :: disconnect_from_taskserver
  if (disconnect_from_taskserver(zmq_to_qp_run_socket,thread) == -1) then
-    call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket)
-    return
+    continue
  endif
  call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket)

@ -287,8 +283,6 @@ subroutine four_idx_pull_results(zmq_socket_pull, map_c, task_id)
  rc8 = f77_zmq_recv8( zmq_socket_pull, value, int(integral_kind*sze,8), 0)
  if(rc8 /= integral_kind*sze) stop 'four_idx_pull_results failed to pull value'

-  call map_update(map_c, key, value, sze, mo_integrals_threshold)  
-
 ! Activate if zmq_socket_pull is a REP
  IRP_IF ZMQ_PUSH
  IRP_ELSE
@ -299,6 +293,8 @@ subroutine four_idx_pull_results(zmq_socket_pull, map_c, task_id)
    endif
  IRP_ENDIF

+  call map_update(map_c, key, value, sze, mo_integrals_threshold)  
+
  deallocate(key, value)
 end