10
0
mirror of https://github.com/LCPQ/quantum_package synced 2025-01-03 10:05:57 +01:00

Load balancing in 4idx

This commit is contained in:
Anthony Scemama 2017-12-14 14:01:30 +01:00
parent 31b3b7dd79
commit af229e7028
2 changed files with 28 additions and 44 deletions

View File

@ -2,7 +2,7 @@ subroutine four_index_transform_slave_work(map_a,matrix_B,LDB, &
i_start, j_start, k_start, l_start, &
i_end , j_end , k_end , l_end , &
a_start, b_start, c_start, d_start, &
a_end , b_end , c_end , d_end, task_id, thread )
a_end , b_end , c_end , d_end, task_id, worker_id, thread )
implicit none
use f77_zmq
use map_module
@ -19,7 +19,7 @@ subroutine four_index_transform_slave_work(map_a,matrix_B,LDB, &
integer, intent(in) :: i_end , j_end , k_end , l_end
integer, intent(in) :: a_start, b_start, c_start, d_start
integer, intent(in) :: a_end , b_end , c_end , d_end
integer, intent(in) :: task_id, thread
integer, intent(in) :: task_id, thread, worker_id
double precision, allocatable :: T(:,:), U(:,:,:), V(:,:)
double precision, allocatable :: T2d(:,:), V2d(:,:)
@ -63,13 +63,13 @@ subroutine four_index_transform_slave_work(map_a,matrix_B,LDB, &
double precision, allocatable :: a_array_value(:)
integer*8 :: new_size
new_size = max(1024_8, 5_8 * map_a % n_elements )
new_size = max(2048_8, 5_8 * map_a % n_elements )
integer*8 :: tempspace
integer :: npass, l_block
tempspace = (new_size * 16_8) / (1024_8 * 1024_8)
npass = int(min(1_8+int(l_end-l_start,8),1_8 + tempspace / 1024_8),4) ! 1 GiB of scratch space
tempspace = (new_size * 16_8) / (2048_8 * 2048_8)
npass = int(min(1_8+int(l_end-l_start,8),1_8 + tempspace / 2048_8),4) ! 2 GiB of scratch space
l_block = (l_end-l_start+1)/npass
allocate(a_array_ik(new_size/npass), a_array_j(new_size/npass), a_array_value(new_size/npass))
@ -133,16 +133,12 @@ subroutine four_index_transform_slave_work(map_a,matrix_B,LDB, &
!open(unit=10,file='OUTPUT',form='FORMATTED')
! END INPUT DATA
PROVIDE nproc
integer :: n_running_threads
n_running_threads = 0
call omp_set_nested(.true.)
!$OMP PARALLEL DEFAULT(NONE) SHARED(a_array_ik,a_array_j,a_array_value, &
!$OMP a_start,a_end,b_start,b_end,c_start,c_end,d_start,d_end,&
!$OMP i_start,i_end,j_start,j_end,k_start,k_end,l_start,l_end,&
!$OMP i_min,i_max,j_min,j_max,k_min,k_max,l_min,l_max, &
!$OMP matrix_B,l_pointer,thread,task_id,n_running_threads,nproc) &
!$OMP PRIVATE(key,value,T,U,V,i,j,k,l,idx,ik,ll, &
!$OMP matrix_B,l_pointer,thread,task_id,worker_id) &
!$OMP PRIVATE(key,value,T,U,V,i,j,k,l,idx,ik,ll,zmq_to_qp_run_socket, &
!$OMP a,b,c,d,p,q,tmp,T2d,V2d,ii,zmq_socket_push)
integer(ZMQ_PTR) :: zmq_socket_push
@ -153,9 +149,6 @@ subroutine four_index_transform_slave_work(map_a,matrix_B,LDB, &
allocate( U(a_start:a_end, c_start:c_end, b_start:b_end) )
!$OMP ATOMIC
n_running_threads = n_running_threads+1
!$OMP DO SCHEDULE(dynamic,1)
do d=d_start,d_end
U = 0.d0
@ -185,13 +178,7 @@ subroutine four_index_transform_slave_work(map_a,matrix_B,LDB, &
V2d(1,b_start), size(V2d,1) )
deallocate(T2d)
!$OMP FLUSH(n_running_threads)
!$OMP PARALLEL DEFAULT(NONE) SHARED(a_array_ik,a_array_j,a_array_value, &
!$OMP a_start,b_start,b_end,c_start,c_end,i_start,k_start,k_end, &
!$OMP matrix_B,U,l,d,V2d,i_end,a_end,n_running_threads) &
!$OMP PRIVATE(T,V,i,k,ik) NUM_THREADS(nproc-n_running_threads+1)
allocate( V(i_start:i_end, k_start:k_end), T(k_start:k_end, a_start:a_end))
!$OMP DO SCHEDULE(static,1)
do b=b_start,d
ik = 0
do k=k_start,k_end
@ -221,10 +208,7 @@ subroutine four_index_transform_slave_work(map_a,matrix_B,LDB, &
U(a_start,b+1,b), size(U,1) )
endif
enddo
deallocate(T,V)
!$OMP END PARALLEL
deallocate(V2d)
deallocate(T,V,V2d)
enddo
idx = 0_8
@ -253,19 +237,23 @@ subroutine four_index_transform_slave_work(map_a,matrix_B,LDB, &
deallocate(key,value)
enddo
!$OMP END DO NOWAIT
deallocate(U)
!$OMP ATOMIC
n_running_threads = n_running_threads-1
!$OMP BARRIER
!$OMP MASTER
integer(ZMQ_PTR) :: zmq_to_qp_run_socket
integer(ZMQ_PTR), external :: task_done_to_taskserver, new_zmq_to_qp_run_socket
zmq_to_qp_run_socket = new_zmq_to_qp_run_socket()
if (task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id) == -1) then
stop 'Unable to send task done'
endif
call four_idx_push_results(zmq_socket_push, 0_8, 0.d0, 0, task_id)
call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket)
!$OMP END MASTER
call end_zmq_push_socket(zmq_socket_push)
!$OMP END PARALLEL
deallocate(l_pointer,a_array_ik,a_array_j,a_array_value)
end

View File

@ -62,13 +62,13 @@ subroutine four_index_transform_zmq(map_a,map_c,matrix_B,LDB, &
call new_parallel_job(zmq_to_qp_run_socket,zmq_socket_pull,'four_idx')
integer*8 :: new_size
new_size = max(1024_8, 5_8 * map_a % n_elements )
new_size = max(2048_8, 5_8 * map_a % n_elements )
integer :: npass
integer*8 :: tempspace
tempspace = (new_size * 16_8) / (1024_8 * 1024_8)
npass = int(min(int(l_end-l_start,8),1_8 + tempspace / 1024_8),4) ! 1 GiB of scratch space
tempspace = (new_size * 16_8) / (2048_8 * 2048_8)
npass = int(min(int(l_end-l_start,8),1_8 + tempspace / 2048_8),4) ! 2 GiB of scratch space
l_block = (l_end-l_start+1)/npass
! Create tasks
@ -98,9 +98,11 @@ subroutine four_index_transform_zmq(map_a,map_c,matrix_B,LDB, &
PROVIDE nproc
integer :: ithread, sqnproc
sqnproc = int(sqrt(float(nproc))+0.5)
call omp_set_nested(.True.)
integer :: ithread
!$OMP PARALLEL NUM_THREADS(2) PRIVATE(ithread)
call omp_set_dynamic(.True.)
!$OMP PARALLEL NUM_THREADS(1+max(1,sqnproc)) PRIVATE(ithread)
ithread = omp_get_thread_num()
if (ithread==0) then
call four_idx_collector(zmq_socket_pull,map_c)
@ -178,19 +180,13 @@ subroutine four_index_transform_slave(thread,worker_id)
i_end , j_end , k_end , l_end_block , &
a_start, b_start, c_start, d_start, &
a_end , b_end , c_end , d_end, &
task_id, thread)
task_id, worker_id, thread)
integer, external :: task_done_to_taskserver
if (task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id) == -1) then
print *, irp_here, ': Unable to send task_done'
stop
endif
enddo
integer, external :: disconnect_from_taskserver
if (disconnect_from_taskserver(zmq_to_qp_run_socket,thread) == -1) then
call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket)
return
continue
endif
call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket)
@ -287,8 +283,6 @@ subroutine four_idx_pull_results(zmq_socket_pull, map_c, task_id)
rc8 = f77_zmq_recv8( zmq_socket_pull, value, int(integral_kind*sze,8), 0)
if(rc8 /= integral_kind*sze) stop 'four_idx_pull_results failed to pull value'
call map_update(map_c, key, value, sze, mo_integrals_threshold)
! Activate if zmq_socket_pull is a REP
IRP_IF ZMQ_PUSH
IRP_ELSE
@ -299,6 +293,8 @@ subroutine four_idx_pull_results(zmq_socket_pull, map_c, task_id)
endif
IRP_ENDIF
call map_update(map_c, key, value, sze, mo_integrals_threshold)
deallocate(key, value)
end