From c3dd90e1994934ca1b26c10816e06cfb2d0418b1 Mon Sep 17 00:00:00 2001 From: Yann Garniron Date: Sat, 8 Oct 2016 19:21:53 +0200 Subject: [PATCH] davidson locally reduces task results --- src/Davidson/davidson_parallel.irp.f | 120 ++++++++++++++------------- src/Davidson/u0Hu0.irp.f | 4 +- 2 files changed, 62 insertions(+), 62 deletions(-) diff --git a/src/Davidson/davidson_parallel.irp.f b/src/Davidson/davidson_parallel.irp.f index ca128d7c..4da50355 100644 --- a/src/Davidson/davidson_parallel.irp.f +++ b/src/Davidson/davidson_parallel.irp.f @@ -4,25 +4,20 @@ use bitmasks use f77_zmq -subroutine davidson_process(blockb, blocke, N, idx, vt, st) +subroutine davidson_process(blockb, blocke, vt, st) use f77_zmq implicit none integer , intent(in) :: blockb, blocke - integer , intent(inout) :: N - integer , intent(inout) :: idx(dav_size) double precision , intent(inout) :: vt(N_states_diag, dav_size) double precision , intent(inout) :: st(N_states_diag, dav_size) integer :: i, j, sh, sh2, exa, ext, org_i, org_j, istate, ni, endi integer(bit_kind) :: sorted_i(N_int) double precision :: s2, hij - logical, allocatable :: wrotten(:) integer, external :: omp_get_thread_num - allocate(wrotten(dav_size)) - wrotten = .false. provide dav_det dav_ut shortcut_ !useless calls not to provide in the parallel section call i_h_j (dav_det(1,1,1),dav_det(1,1,dav_size),n_int,hij) @@ -31,7 +26,7 @@ subroutine davidson_process(blockb, blocke, N, idx, vt, st) do sh = blockb, blocke !$OMP PARALLEL DO default(none) schedule(dynamic) & - !$OMP shared(vt, st, wrotten, blockb, blocke, sh, shortcut_, version_, sorted_, sort_idx_, dav_det, dav_ut, N_int, N_states_diag) & + !$OMP shared(vt, st, blockb, blocke, sh, shortcut_, version_, sorted_, sort_idx_, dav_det, dav_ut, N_int, N_states_diag) & !$OMP private(exa, ni, ext, org_i, org_j, sorted_i, endi, hij, s2) do sh2=1,sh exa = 0 @@ -63,16 +58,6 @@ subroutine davidson_process(blockb, blocke, N, idx, vt, st) call i_h_j (dav_det(1,1,org_j),dav_det(1,1,org_i),n_int,hij) call get_s2(dav_det(1,1,org_j),dav_det(1,1,org_i),n_int,s2) !$OMP CRITICAL - if(.not. wrotten(org_i)) then - wrotten(org_i) = .true. - vt (:,org_i) = 0d0 - st (:,org_i) = 0d0 - end if - if(.not. wrotten(org_j)) then - wrotten(org_j) = .true. - vt (:,org_j) = 0d0 - st (:,org_j) = 0d0 - end if do istate=1,N_states_diag vt (istate,org_i) += hij*dav_ut(istate,org_j) st (istate,org_i) += s2*dav_ut(istate,org_j) @@ -89,7 +74,7 @@ subroutine davidson_process(blockb, blocke, N, idx, vt, st) do sh=blockb,min(blocke, shortcut_(0,2)) !$OMP PARALLEL DO default(none) schedule(dynamic) & - !$OMP shared(vt, st, wrotten, blockb, blocke, sh, shortcut_, version_, sorted_, sort_idx_, dav_det, dav_ut, N_int, N_states_diag) & + !$OMP shared(vt, st, blockb, blocke, sh, shortcut_, version_, sorted_, sort_idx_, dav_det, dav_ut, N_int, N_states_diag) & !$OMP private(exa, ni, ext, org_i, org_j, sorted_i, endi, hij, s2) do sh2=sh, shortcut_(0,2), shortcut_(0,1) do i=shortcut_(sh2,2),shortcut_(sh2+1,2)-1 @@ -104,16 +89,6 @@ subroutine davidson_process(blockb, blocke, N, idx, vt, st) call i_h_j (dav_det(1,1,org_j),dav_det(1,1,org_i),n_int,hij) call get_s2(dav_det(1,1,org_j),dav_det(1,1,org_i),n_int,s2) !$OMP CRITICAL - if(.not. wrotten(org_i)) then - wrotten(org_i) = .true. - vt (:,org_i) = 0d0 - st (:,org_i) = 0d0 - end if - if(.not. wrotten(org_j)) then - wrotten(org_j) = .true. - vt (:,org_j) = 0d0 - st (:,org_j) = 0d0 - end if do istate=1,N_states_diag vt (istate,org_i) = vt (istate,org_i) + hij*dav_ut(istate,org_j) vt (istate,org_j) = vt (istate,org_j) + hij*dav_ut(istate,org_i) @@ -127,18 +102,6 @@ subroutine davidson_process(blockb, blocke, N, idx, vt, st) enddo !$OMP END PARALLEL DO enddo - - N = 0 - do i=1, dav_size - if(wrotten(i)) then - N = N+1 - do istate=1,N_states_diag - vt (istate,N) = vt (istate,i) - st (istate,N) = st (istate,i) - idx(N) = i - enddo - end if - end do end subroutine @@ -262,7 +225,8 @@ subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, worker_id) integer(ZMQ_PTR),intent(in) :: zmq_to_qp_run_socket integer(ZMQ_PTR),intent(in) :: zmq_socket_push integer,intent(in) :: worker_id - integer :: task_id + integer :: i, taskn, myTask, istate + integer, allocatable :: task_id(:) character*(512) :: task @@ -272,32 +236,59 @@ subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, worker_id) double precision , allocatable :: vt(:,:) double precision , allocatable :: st(:,:) - + allocate(task_id(100)) allocate(idx(dav_size)) allocate(vt(N_states_diag, dav_size)) allocate(st(N_states_diag, dav_size)) + vt = 0d0 + st = 0d0 + taskn = 0 do - call get_task_from_taskserver(zmq_to_qp_run_socket,worker_id, task_id, task) - if(task_id == 0) exit - read (task,*) blockb, blocke + call get_task_from_taskserver(zmq_to_qp_run_socket,worker_id, myTask, task) + if(myTask /= 0) then + read (task,*) blockb, blocke + call davidson_process(blockb, blocke, vt, st) + taskn += 1 + task_id(taskn) = myTask + end if - call davidson_process(blockb, blocke, N, idx, vt, st) - call task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id) - call davidson_push_results(zmq_socket_push, blockb, blocke, N, idx, vt, st, task_id) + if(myTask == 0 .or. taskn == size(task_id)) then + N = 0 + do i=1, dav_size + if(vt(1,i) /= 0d0 .or. st(1,i) /= 0d0) then + N = N+1 + do istate=1,N_states_diag + vt (istate,N) = vt (istate,i) + st (istate,N) = st (istate,i) + idx(N) = i + enddo + end if + end do + + do i = 1, taskn + call task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id(i)) + end do + if(taskn /= 0) call davidson_push_results(zmq_socket_push, blockb, blocke, N, idx, vt, st, taskn, task_id) + + if(myTask == 0) exit + vt = 0d0 + st = 0d0 + taskn = 0 + end if end do end subroutine -subroutine davidson_push_results(zmq_socket_push, blockb, blocke, N, idx, vt, st, task_id) +subroutine davidson_push_results(zmq_socket_push, blockb, blocke, N, idx, vt, st, taskn, task_id) use f77_zmq implicit none integer(ZMQ_PTR) ,intent(in) :: zmq_socket_push - integer ,intent(in) :: task_id + integer ,intent(in) :: task_id(100), taskn integer ,intent(in) :: blockb, blocke integer ,intent(in) :: N @@ -324,18 +315,21 @@ subroutine davidson_push_results(zmq_socket_push, blockb, blocke, N, idx, vt, st rc = f77_zmq_send( zmq_socket_push, st, 8*N_states_diag* N, ZMQ_SNDMORE) if(rc /= 8*N_states_diag* N) stop "davidson_push_results failed to push st" - rc = f77_zmq_send( zmq_socket_push, task_id, 4, 0) - if(rc /= 4) stop "davidson_push_results failed to push task_id" + rc = f77_zmq_send( zmq_socket_push, taskn, 4, ZMQ_SNDMORE) + if(rc /= 4) stop "davidson_push_results failed to push taskn" + + rc = f77_zmq_send( zmq_socket_push, task_id, 4*taskn, 0) + if(rc /= 4*taskn) stop "davidson_push_results failed to push task_id" end subroutine -subroutine davidson_pull_results(zmq_socket_pull, blockb, blocke, N, idx, vt, st, task_id) +subroutine davidson_pull_results(zmq_socket_pull, blockb, blocke, N, idx, vt, st, taskn, task_id) use f77_zmq implicit none integer(ZMQ_PTR) ,intent(in) :: zmq_socket_pull - integer ,intent(out) :: task_id + integer ,intent(out) :: task_id(100), taskn integer ,intent(out) :: blockb, blocke integer ,intent(out) :: N integer ,intent(out) :: idx(dav_size) @@ -362,8 +356,11 @@ subroutine davidson_pull_results(zmq_socket_pull, blockb, blocke, N, idx, vt, st rc = f77_zmq_recv( zmq_socket_pull, st, 8*N_states_diag* N, 0) if(rc /= 8*N_states_diag* N) stop "davidson_push_results failed to pull st" - rc = f77_zmq_recv( zmq_socket_pull, task_id, 4, 0) - if(rc /= 4) stop "davidson_pull_results failed to pull task_id" + rc = f77_zmq_recv( zmq_socket_pull, taskn, 4, 0) + if(rc /= 4) stop "davidson_pull_results failed to pull taskn" + + rc = f77_zmq_recv( zmq_socket_pull, task_id, 4*taskn, 0) + if(rc /= 4*taskn) stop "davidson_pull_results failed to pull task_id" end subroutine @@ -378,7 +375,7 @@ subroutine davidson_collector(zmq_to_qp_run_socket, zmq_socket_pull , v0, s0) double precision ,intent(inout) :: v0(dav_size, N_states_diag) double precision ,intent(inout) :: s0(dav_size, N_states_diag) - integer :: more, task_id + integer :: more, task_id(100), taskn integer :: blockb, blocke @@ -387,6 +384,8 @@ subroutine davidson_collector(zmq_to_qp_run_socket, zmq_socket_pull , v0, s0) double precision , allocatable :: vt(:,:), v0t(:,:), s0t(:,:) double precision , allocatable :: st(:,:) + integer :: i + allocate(idx(dav_size)) allocate(vt(N_states_diag, dav_size)) allocate(st(N_states_diag, dav_size)) @@ -399,10 +398,13 @@ subroutine davidson_collector(zmq_to_qp_run_socket, zmq_socket_pull , v0, s0) more = 1 do while (more == 1) - call davidson_pull_results(zmq_socket_pull, blockb, blocke, N, idx, vt, st, task_id) + call davidson_pull_results(zmq_socket_pull, blockb, blocke, N, idx, vt, st, taskn, task_id) + !DIR$ FORCEINLINE call davidson_collect(blockb, blocke, N, idx, vt, st , v0t, s0t) - call zmq_delete_task(zmq_to_qp_run_socket,zmq_socket_pull,task_id,more) + do i=1,taskn + call zmq_delete_task(zmq_to_qp_run_socket,zmq_socket_pull,task_id(i),more) + end do end do deallocate(idx,vt,st) diff --git a/src/Davidson/u0Hu0.irp.f b/src/Davidson/u0Hu0.irp.f index ce9295d7..926531aa 100644 --- a/src/Davidson/u0Hu0.irp.f +++ b/src/Davidson/u0Hu0.irp.f @@ -240,21 +240,19 @@ subroutine H_S2_u_0_nstates(v_0,s_0,u_0,H_jj,S2_jj,n,keys_tmp,Nint,N_st,sze_8) touch dav_size dav_det = psi_det dav_ut = ut - workload = 0 blockb = shortcut(0,1) blocke = blockb call davidson_init(handler) do sh=shortcut(0,1),1,-1 workload += (shortcut(sh+1,1) - shortcut(sh,1))**2 - if(workload > 100000) then + if(workload > 1000) then blocke = sh call davidson_add_task(handler, blocke, blockb) blockb = sh-1 workload = 0 end if enddo - if(blockb > 0) call davidson_add_task(handler, 1, blockb) call davidson_run(handler, v_0, s_0)