From 30eda7b248a60e4edc9d88038f65e5a69c17bdea Mon Sep 17 00:00:00 2001 From: Yann Garniron Date: Mon, 10 Oct 2016 14:45:58 +0200 Subject: [PATCH] removed OMP - double work --- src/Davidson/davidson_parallel.irp.f | 279 +++++++++++---------------- src/Davidson/u0Hu0.irp.f | 10 +- 2 files changed, 124 insertions(+), 165 deletions(-) diff --git a/src/Davidson/davidson_parallel.irp.f b/src/Davidson/davidson_parallel.irp.f index 9d12a5b4..0e91ac90 100644 --- a/src/Davidson/davidson_parallel.irp.f +++ b/src/Davidson/davidson_parallel.irp.f @@ -4,58 +4,44 @@ use bitmasks use f77_zmq -subroutine davidson_process(blockb, blocke, vt, st) - use f77_zmq +subroutine davidson_process(blockb, blocke, N, idx, vt, st, bs) + implicit none - integer , intent(in) :: blockb, blocke - double precision , intent(inout) :: vt(N_states_diag, dav_size) - double precision , intent(inout) :: st(N_states_diag, dav_size) + integer , intent(in) :: blockb, blocke, bs + integer , intent(inout) :: N + integer , intent(inout) :: idx(bs) + double precision , intent(inout) :: vt(N_states_diag, bs) + double precision , intent(inout) :: st(N_states_diag, bs) - integer :: i, j, sh, sh2, exa, ext, org_i, org_j, istate, ni, endi + integer :: i,ii, j, sh, sh2, exa, ext, org_i, org_j, istate, ni, endi integer(bit_kind) :: sorted_i(N_int) double precision :: s2, hij - integer, external :: omp_get_thread_num - double precision, allocatable :: locals(:,:), localv(:,:) + logical, allocatable :: wrotten(:) + - provide dav_det dav_ut shortcut_ - !useless calls not to provide in the parallel section - call i_h_j (dav_det(1,1,1),dav_det(1,1,dav_size),n_int,hij) - call get_s2(dav_det(1,1,1),dav_det(1,1,dav_size),n_int,s2) - !!!!! + allocate(wrotten(bs)) + wrotten = .false. do sh = blockb, blocke - !$OMP PARALLEL & - !$OMP default(none) & - !$OMP shared(vt, st, blockb, blocke, sh, shortcut_, version_, sorted_, sort_idx_, dav_det, dav_ut, N_int, N_states_diag) & - !$OMP private(i,j,sh2, locals, localv, exa, ni, ext, org_i, org_j, sorted_i, endi, hij, s2) - allocate(locals(N_states_diag, shortcut_(sh+1,1) - shortcut_(sh,1))) - allocate(localv(N_states_diag, shortcut_(sh+1,1) - shortcut_(sh,1))) - locals = 0d0 - localv = 0d0 - !$OMP DO schedule(static, 1) - do sh2=sh, shortcut_(0,1) !1,sh + do sh2=1,shortcut_(0,1) exa = 0 do ni=1,N_int exa = exa + popcnt(xor(version_(ni,sh,1), version_(ni,sh2,1))) end do - if(exa > 2) then - cycle - end if + if(exa > 2) cycle do i=shortcut_(sh,1),shortcut_(sh+1,1)-1 - org_i = sort_idx_(i,1) - if(sh==sh2) then - endi = i-1 - else - endi = shortcut_(sh2+1,1)-1 - end if + ii = i - shortcut_(blockb,1) + 1 + + org_i = sort_idx_(i,1) do ni=1,N_int sorted_i(ni) = sorted_(ni,i,1) enddo - do j=shortcut_(sh2,1),endi + do j=shortcut_(sh2,1), shortcut_(sh2+1,1)-1 + if(i == j) cycle org_j = sort_idx_(j,1) ext = exa do ni=1,N_int @@ -64,49 +50,30 @@ subroutine davidson_process(blockb, blocke, vt, st) if(ext <= 4) then call i_h_j (dav_det(1,1,org_j),dav_det(1,1,org_i),n_int,hij) call get_s2(dav_det(1,1,org_j),dav_det(1,1,org_i),n_int,s2) - - if(sh == sh2) then - !$OMP CRITICAL - do istate=1,N_states_diag - localv (istate,i+1-shortcut_(sh,1)) += hij*dav_ut(istate,org_j) - locals (istate,i+1-shortcut_(sh,1)) += s2*dav_ut(istate,org_j) - vt (istate,org_j) += hij*dav_ut(istate,org_i) - st (istate,org_j) += s2*dav_ut(istate,org_i) - enddo - !$OMP END CRITICAL - else - do istate=1,N_states_diag - localv (istate,i+1-shortcut_(sh,1)) += hij*dav_ut(istate,org_j) - locals (istate,i+1-shortcut_(sh,1)) += s2*dav_ut(istate,org_j) - vt (istate,org_j) += hij*dav_ut(istate,org_i) - st (istate,org_j) += s2*dav_ut(istate,org_i) - enddo + if(.not. wrotten(ii)) then + wrotten(ii) = .true. + idx(ii) = org_i + vt (:,ii) = 0d0 + st (:,ii) = 0d0 end if + do istate=1,N_states_diag + vt (istate,ii) += hij*dav_ut(istate,org_j) + st (istate,ii) += s2*dav_ut(istate,org_j) + enddo endif enddo enddo enddo - !$OMP ENDDO - !$OMP CRITICAL - do i=1,shortcut_(sh+1,1) - shortcut_(sh,1) - do istate=1,N_states_diag - vt(istate, sort_idx_(shortcut_(sh,1) - 1 + i, 1)) += localv(istate,i) - st(istate, sort_idx_(shortcut_(sh,1) - 1 + i, 1)) += locals(istate,i) - end do - end do - !$OMP END CRITICAL - !$OMP END PARALLEL enddo - + do sh=blockb,min(blocke, shortcut_(0,2)) - !$OMP PARALLEL DO default(none) schedule(dynamic) & - !$OMP shared(vt, st, blockb, blocke, sh, shortcut_, version_, sorted_, sort_idx_, dav_det, dav_ut, N_int, N_states_diag) & - !$OMP private(exa, ni, ext, org_i, org_j, sorted_i, endi, hij, s2) - do sh2=sh, shortcut_(0,2), shortcut_(0,1) + do sh2=sh, shortcut_(0,2), shortcut_(0,1)*51 do i=shortcut_(sh2,2),shortcut_(sh2+1,2)-1 + ii += 1 org_i = sort_idx_(i,2) - do j=shortcut_(sh2,2),i-1 + do j=shortcut_(sh2,2),shortcut_(sh2+1,2)-1 + if(i == j) cycle org_j = sort_idx_(j,2) ext = 0 do ni=1,N_int @@ -115,20 +82,31 @@ subroutine davidson_process(blockb, blocke, vt, st) if(ext == 4) then call i_h_j (dav_det(1,1,org_j),dav_det(1,1,org_i),n_int,hij) call get_s2(dav_det(1,1,org_j),dav_det(1,1,org_i),n_int,s2) - !$OMP CRITICAL + if(.not. wrotten(ii)) then + wrotten(ii) = .true. + idx(ii) = org_i + vt (:,ii) = 0d0 + st (:,ii) = 0d0 + end if do istate=1,N_states_diag - vt (istate,org_i) = vt (istate,org_i) + hij*dav_ut(istate,org_j) - vt (istate,org_j) = vt (istate,org_j) + hij*dav_ut(istate,org_i) - st (istate,org_i) = st (istate,org_i) + s2*dav_ut(istate,org_j) - st (istate,org_j) = st (istate,org_j) + s2*dav_ut(istate,org_i) + vt (istate,ii) += hij*dav_ut(istate,org_j) + st (istate,ii) += s2*dav_ut(istate,org_j) enddo - !$OMP END CRITICAL end if end do end do enddo - !$OMP END PARALLEL DO enddo + + N=0 + do i=1,bs + if(wrotten(i)) then + N += 1 + idx(N) = idx(i) + vt(:,N) = vt(:,i) + st(:,N) = st(:,i) + end if + end do end subroutine @@ -194,12 +172,6 @@ subroutine davidson_slave_inproc(i) call davidson_run_slave(1,i) end -integer function davidson_slave_inproc_omp() - implicit none - - call davidson_run_slave(1,2) - davidson_slave_inproc_omp = 0 -end subroutine subroutine davidson_slave_tcp(i) implicit none @@ -252,8 +224,7 @@ subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, worker_id) integer(ZMQ_PTR),intent(in) :: zmq_to_qp_run_socket integer(ZMQ_PTR),intent(in) :: zmq_socket_push integer,intent(in) :: worker_id - integer :: i, taskn, myTask, istate - integer, allocatable :: task_id(:) + integer :: task_id character*(512) :: task @@ -263,59 +234,42 @@ subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, worker_id) double precision , allocatable :: vt(:,:) double precision , allocatable :: st(:,:) - allocate(task_id(100)) - allocate(idx(dav_size)) - allocate(vt(N_states_diag, dav_size)) - allocate(st(N_states_diag, dav_size)) - - vt = 0d0 - st = 0d0 - taskn = 0 + integer :: bs, i, j + allocate(idx(1), vt(1,1), st(1,1)) + do - call get_task_from_taskserver(zmq_to_qp_run_socket,worker_id, myTask, task) - if(myTask /= 0) then - read (task,*) blockb, blocke - call davidson_process(blockb, blocke, vt, st) - taskn += 1 - task_id(taskn) = myTask + call get_task_from_taskserver(zmq_to_qp_run_socket,worker_id, task_id, task) + if(task_id == 0) exit + read (task,*) blockb, blocke + bs = shortcut_(blocke+1,1) - shortcut_(blockb, 1) + do i=blockb, shortcut_(0,2), shortcut_(0,1) + do j=i, min(i+blocke-blockb, shortcut_(0,2)) + bs += shortcut_(j+1,2) - shortcut_(j, 2) + end do + end do + if(bs > size(idx)) then + deallocate(idx, vt, st) + allocate(idx(bs)) + allocate(vt(N_states_diag, bs)) + allocate(st(N_states_diag, bs)) end if + + call davidson_process(blockb, blocke, N, idx, vt, st, bs) - - if(myTask == 0 .or. taskn == size(task_id)) then - N = 0 - do i=1, dav_size - if(vt(1,i) /= 0d0 .or. st(1,i) /= 0d0) then - N = N+1 - do istate=1,N_states_diag - vt (istate,N) = vt (istate,i) - st (istate,N) = st (istate,i) - idx(N) = i - enddo - end if - end do - - do i = 1, taskn - call task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id(i)) - end do - if(taskn /= 0) call davidson_push_results(zmq_socket_push, blockb, blocke, N, idx, vt, st, taskn, task_id) - - if(myTask == 0) exit - vt = 0d0 - st = 0d0 - taskn = 0 - end if + call task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id) + call davidson_push_results(zmq_socket_push, blockb, blocke, N, idx, vt, st, task_id) end do end subroutine -subroutine davidson_push_results(zmq_socket_push, blockb, blocke, N, idx, vt, st, taskn, task_id) +subroutine davidson_push_results(zmq_socket_push, blockb, blocke, N, idx, vt, st, task_id) use f77_zmq implicit none integer(ZMQ_PTR) ,intent(in) :: zmq_socket_push - integer ,intent(in) :: task_id(100), taskn + integer ,intent(in) :: task_id integer ,intent(in) :: blockb, blocke integer ,intent(in) :: N @@ -342,26 +296,23 @@ subroutine davidson_push_results(zmq_socket_push, blockb, blocke, N, idx, vt, st rc = f77_zmq_send( zmq_socket_push, st, 8*N_states_diag* N, ZMQ_SNDMORE) if(rc /= 8*N_states_diag* N) stop "davidson_push_results failed to push st" - rc = f77_zmq_send( zmq_socket_push, taskn, 4, ZMQ_SNDMORE) - if(rc /= 4) stop "davidson_push_results failed to push taskn" - - rc = f77_zmq_send( zmq_socket_push, task_id, 4*taskn, 0) - if(rc /= 4*taskn) stop "davidson_push_results failed to push task_id" + rc = f77_zmq_send( zmq_socket_push, task_id, 4, 0) + if(rc /= 4) stop "davidson_push_results failed to push task_id" end subroutine -subroutine davidson_pull_results(zmq_socket_pull, blockb, blocke, N, idx, vt, st, taskn, task_id) +subroutine davidson_pull_results(zmq_socket_pull, blockb, blocke, N, idx, vt, st, task_id) use f77_zmq implicit none integer(ZMQ_PTR) ,intent(in) :: zmq_socket_pull - integer ,intent(out) :: task_id(100), taskn + integer ,intent(out) :: task_id integer ,intent(out) :: blockb, blocke integer ,intent(out) :: N - integer ,intent(out) :: idx(dav_size) - double precision ,intent(out) :: vt(N_states_diag, dav_size) - double precision ,intent(out) :: st(N_states_diag, dav_size) + integer ,intent(out) :: idx(*) + double precision ,intent(out) :: vt(N_states_diag, *) + double precision ,intent(out) :: st(N_states_diag, *) integer :: rc @@ -383,11 +334,8 @@ subroutine davidson_pull_results(zmq_socket_pull, blockb, blocke, N, idx, vt, st rc = f77_zmq_recv( zmq_socket_pull, st, 8*N_states_diag* N, 0) if(rc /= 8*N_states_diag* N) stop "davidson_push_results failed to pull st" - rc = f77_zmq_recv( zmq_socket_pull, taskn, 4, 0) - if(rc /= 4) stop "davidson_pull_results failed to pull taskn" - - rc = f77_zmq_recv( zmq_socket_pull, task_id, 4*taskn, 0) - if(rc /= 4*taskn) stop "davidson_pull_results failed to pull task_id" + rc = f77_zmq_recv( zmq_socket_pull, task_id, 4, 0) + if(rc /= 4) stop "davidson_pull_results failed to pull task_id" end subroutine @@ -402,7 +350,7 @@ subroutine davidson_collector(zmq_to_qp_run_socket, zmq_socket_pull , v0, s0) double precision ,intent(inout) :: v0(dav_size, N_states_diag) double precision ,intent(inout) :: s0(dav_size, N_states_diag) - integer :: more, task_id(100), taskn + integer :: more, task_id integer :: blockb, blocke @@ -411,11 +359,12 @@ subroutine davidson_collector(zmq_to_qp_run_socket, zmq_socket_pull , v0, s0) double precision , allocatable :: vt(:,:), v0t(:,:), s0t(:,:) double precision , allocatable :: st(:,:) - integer :: i + integer :: msize - allocate(idx(dav_size)) - allocate(vt(N_states_diag, dav_size)) - allocate(st(N_states_diag, dav_size)) + msize = (max_workload + max_blocksize)*2 + allocate(idx(msize)) + allocate(vt(N_states_diag, msize)) + allocate(st(N_states_diag, msize)) allocate(v0t(N_states_diag, dav_size)) allocate(s0t(N_states_diag, dav_size)) @@ -425,13 +374,10 @@ subroutine davidson_collector(zmq_to_qp_run_socket, zmq_socket_pull , v0, s0) more = 1 do while (more == 1) - call davidson_pull_results(zmq_socket_pull, blockb, blocke, N, idx, vt, st, taskn, task_id) - + call davidson_pull_results(zmq_socket_pull, blockb, blocke, N, idx, vt, st, task_id) !DIR$ FORCEINLINE call davidson_collect(blockb, blocke, N, idx, vt, st , v0t, s0t) - do i=1,taskn - call zmq_delete_task(zmq_to_qp_run_socket,zmq_socket_pull,task_id(i),more) - end do + call zmq_delete_task(zmq_to_qp_run_socket,zmq_socket_pull,task_id,more) end do deallocate(idx,vt,st) @@ -450,39 +396,41 @@ subroutine davidson_run(zmq_to_qp_run_socket , v0, s0) integer(ZMQ_PTR) :: zmq_collector integer(ZMQ_PTR), external :: new_zmq_pull_socket integer(ZMQ_PTR) :: zmq_socket_pull - integer(ZMQ_PTR) :: pthread_slave, pthread_miniserver - - + integer :: i integer, external :: omp_get_thread_num double precision , intent(inout) :: v0(dav_size, N_states_diag) double precision , intent(inout) :: s0(dav_size, N_states_diag) - integer, external :: davidson_miniserver_run, davidson_slave_inproc_omp + call zmq_set_running(zmq_to_qp_run_socket) zmq_collector = new_zmq_to_qp_run_socket() zmq_socket_pull = new_zmq_pull_socket() + i = omp_get_thread_num() PROVIDE nproc - - i = pthread_create ( pthread_miniserver, davidson_miniserver_run ) - i = pthread_create ( pthread_slave, davidson_slave_inproc_omp ) - - call davidson_collector(zmq_collector, zmq_socket_pull , v0, s0) - call end_zmq_to_qp_run_socket(zmq_collector) - call end_zmq_pull_socket(zmq_socket_pull) - call davidson_miniserver_end() - i = pthread_join(pthread_miniserver) - i = pthread_join(pthread_slave) - + !$OMP PARALLEL DEFAULT(shared) private(i) num_threads(nproc+2) + i = omp_get_thread_num() + if (i==0) then + call davidson_collector(zmq_collector, zmq_socket_pull , v0, s0) + call end_zmq_to_qp_run_socket(zmq_collector) + call end_zmq_pull_socket(zmq_socket_pull) + call davidson_miniserver_end() + else if(i==1) then + call davidson_miniserver_run() + else + call davidson_slave_inproc(i) + endif + !$OMP END PARALLEL call end_parallel_job(zmq_to_qp_run_socket, 'davidson') end subroutine -integer function davidson_miniserver_run() + +subroutine davidson_miniserver_run() use f77_zmq implicit none integer(ZMQ_PTR) responder @@ -509,7 +457,6 @@ integer function davidson_miniserver_run() enddo rc = f77_zmq_close(responder) - davidson_miniserver_run = 0 end subroutine @@ -574,7 +521,11 @@ END_PROVIDER &BEGIN_PROVIDER [ integer(bit_kind), version_, (N_int, dav_size, 2) ] &BEGIN_PROVIDER [ integer(bit_kind), sorted_, (N_int, dav_size, 2) ] &BEGIN_PROVIDER [ integer, sort_idx_, (dav_size, 2) ] - implicit none +&BEGIN_PROVIDER [ integer, max_blocksize ] +implicit none call sort_dets_ab_v(dav_det, sorted_(1,1,1), sort_idx_(1,1), shortcut_(0,1), version_(1,1,1), dav_size, N_int) call sort_dets_ba_v(dav_det, sorted_(1,1,2), sort_idx_(1,2), shortcut_(0,2), version_(1,1,2), dav_size, N_int) + max_blocksize = max(shortcut_(0,1), shortcut_(0,2)) END_PROVIDER + + diff --git a/src/Davidson/u0Hu0.irp.f b/src/Davidson/u0Hu0.irp.f index 926531aa..475487d0 100644 --- a/src/Davidson/u0Hu0.irp.f +++ b/src/Davidson/u0Hu0.irp.f @@ -240,19 +240,21 @@ subroutine H_S2_u_0_nstates(v_0,s_0,u_0,H_jj,S2_jj,n,keys_tmp,Nint,N_st,sze_8) touch dav_size dav_det = psi_det dav_ut = ut + workload = 0 blockb = shortcut(0,1) blocke = blockb call davidson_init(handler) do sh=shortcut(0,1),1,-1 workload += (shortcut(sh+1,1) - shortcut(sh,1))**2 - if(workload > 1000) then + if(workload > max_workload) then blocke = sh call davidson_add_task(handler, blocke, blockb) blockb = sh-1 workload = 0 end if enddo + if(blockb > 0) call davidson_add_task(handler, 1, blockb) call davidson_run(handler, v_0, s_0) @@ -265,3 +267,9 @@ subroutine H_S2_u_0_nstates(v_0,s_0,u_0,H_jj,S2_jj,n,keys_tmp,Nint,N_st,sze_8) enddo end + +BEGIN_PROVIDER [ integer, max_workload ] + max_workload = 1000 +END_PROVIDER + +