From 693604d33886d9c4dcf39616aed077e17ad1b620 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Fri, 7 Oct 2016 00:42:03 +0200 Subject: [PATCH] Improve parallel davidson --- src/Davidson/davidson_parallel.irp.f | 57 ++++++++++--------- src/Davidson/u0Hu0.irp.f | 85 ++++++++++++++-------------- src/Utils/transpose.irp.f | 78 +++++++++++++++++++++++++ 3 files changed, 150 insertions(+), 70 deletions(-) create mode 100644 src/Utils/transpose.irp.f diff --git a/src/Davidson/davidson_parallel.irp.f b/src/Davidson/davidson_parallel.irp.f index 0e86a9f7..90f2ec8f 100644 --- a/src/Davidson/davidson_parallel.irp.f +++ b/src/Davidson/davidson_parallel.irp.f @@ -91,7 +91,7 @@ end subroutine -subroutine davidson_collect(blockb, blocke, N, idx, vt, st , v0, s0) +subroutine davidson_collect(blockb, blocke, N, idx, vt, st , v0t, s0t) implicit none @@ -100,14 +100,19 @@ subroutine davidson_collect(blockb, blocke, N, idx, vt, st , v0, s0) integer , intent(in) :: idx(N) double precision , intent(in) :: vt(N_states_diag, N) double precision , intent(in) :: st(N_states_diag, N) - double precision , intent(inout) :: v0(dav_size, N_states_diag) - double precision , intent(inout) :: s0(dav_size, N_states_diag) + double precision , intent(inout) :: v0t(N_states_diag,dav_size) + double precision , intent(inout) :: s0t(N_states_diag,dav_size) - integer :: i + integer :: i, j, k + !DIR$ IVDEP do i=1,N - v0(idx(i), :) += vt(:, i) - s0(idx(i), :) += st(:, i) + k = idx(i) + !DIR$ IVDEP + do j=1,N_states_diag + v0t(j,k) = v0t(j,k) + vt(j,i) + s0t(j,k) = s0t(j,k) + st(j,i) + enddo end do end subroutine @@ -221,7 +226,6 @@ subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, worker_id) call davidson_process(blockb, blocke, N, idx, vt, st) - ! reverse ? call task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id) call davidson_push_results(zmq_socket_push, blockb, blocke, N, idx, vt, st, task_id) end do @@ -321,7 +325,7 @@ subroutine davidson_collector(zmq_to_qp_run_socket, zmq_socket_pull , v0, s0) integer :: blockb, blocke integer :: N integer , allocatable :: idx(:) - double precision , allocatable :: vt(:,:) + double precision , allocatable :: vt(:,:), v0t(:,:), s0t(:,:) double precision , allocatable :: st(:,:) integer :: deleted logical, allocatable :: done(:) @@ -332,18 +336,23 @@ subroutine davidson_collector(zmq_to_qp_run_socket, zmq_socket_pull , v0, s0) allocate(idx(dav_size)) allocate(vt(N_states_diag, dav_size)) allocate(st(N_states_diag, dav_size)) + allocate(v0t(N_states_diag, dav_size)) + allocate(s0t(N_states_diag, dav_size)) more = 1 do while (more == 1) call davidson_pull_results(zmq_socket_pull, blockb, blocke, N, idx, vt, st, task_id) - call davidson_collect(blockb, blocke, N, idx, vt, st , v0, s0) -! done(block) = .true. + !DIR$ FORCEINLINE + call davidson_collect(blockb, blocke, N, idx, vt, st , v0t, s0t) call zmq_delete_task(zmq_to_qp_run_socket,zmq_socket_pull,task_id,more) deleted += 1 -! print *, "DELETED", deleted, done end do -! print *, "done collector" + deallocate(idx,vt,st,done) + + call dtranspose(v0t,size(v0t,1), v0, size(v0,1), N_states_diag, dav_size) + call dtranspose(s0t,size(s0t,1), s0, size(s0,1), N_states_diag, dav_size) + deallocate(v0t,s0t) end subroutine @@ -382,7 +391,7 @@ subroutine davidson_run(zmq_to_qp_run_socket , v0, s0) else if(i==1) then call davidson_miniserver_run() else - call davidson_slave_inproc(i) + call davidson_slave_inproc(i-1) endif !$OMP END PARALLEL call end_parallel_job(zmq_to_qp_run_socket, 'davidson') @@ -393,7 +402,6 @@ end subroutine subroutine davidson_miniserver_run() use f77_zmq implicit none - integer(ZMQ_PTR) context integer(ZMQ_PTR) responder character*(64) address character(len=:), allocatable :: buffer @@ -402,8 +410,7 @@ subroutine davidson_miniserver_run() allocate (character(len=20) :: buffer) address = 'tcp://*:11223' - context = f77_zmq_ctx_new() - responder = f77_zmq_socket(context, ZMQ_REP) + responder = f77_zmq_socket(zmq_context, ZMQ_REP) rc = f77_zmq_bind(responder,address) do @@ -419,7 +426,6 @@ subroutine davidson_miniserver_run() enddo rc = f77_zmq_close(responder) - rc = f77_zmq_ctx_destroy(context) end subroutine @@ -427,21 +433,20 @@ subroutine davidson_miniserver_end() implicit none use f77_zmq - integer(ZMQ_PTR) context integer(ZMQ_PTR) requester character*(64) address integer rc character*(64) buf address = trim(qp_run_address)//':11223' - context = f77_zmq_ctx_new() - requester = f77_zmq_socket(context, ZMQ_REQ) + requester = f77_zmq_socket(zmq_context, ZMQ_REQ) rc = f77_zmq_connect(requester,address) - rc = f77_zmq_send(requester, "end", 3, 0) - rc = f77_zmq_recv(requester, buf, 3, 0) + rc = f77_zmq_send(requester, "end", 3, ZMQ_NOBLOCK) + if (rc > 0) then + rc = f77_zmq_recv(requester, buf, 3, 0) + endif rc = f77_zmq_close(requester) - rc = f77_zmq_ctx_destroy(context) end subroutine @@ -449,7 +454,6 @@ subroutine davidson_miniserver_get() implicit none use f77_zmq - integer(ZMQ_PTR) context integer(ZMQ_PTR) requester character*(64) address character*(20) buffer @@ -457,8 +461,7 @@ subroutine davidson_miniserver_get() address = trim(qp_run_address)//':11223' - context = f77_zmq_ctx_new() - requester = f77_zmq_socket(context, ZMQ_REQ) + requester = f77_zmq_socket(zmq_context, ZMQ_REQ) rc = f77_zmq_connect(requester,address) rc = f77_zmq_send(requester, "Hello", 5, 0) @@ -469,9 +472,7 @@ subroutine davidson_miniserver_get() TOUCH dav_det dav_ut rc = f77_zmq_close(requester) - rc = f77_zmq_ctx_destroy(context) - touch dav_det dav_ut end subroutine diff --git a/src/Davidson/u0Hu0.irp.f b/src/Davidson/u0Hu0.irp.f index e22fbbf9..8c2b373c 100644 --- a/src/Davidson/u0Hu0.irp.f +++ b/src/Davidson/u0Hu0.irp.f @@ -249,7 +249,7 @@ subroutine H_S2_u_0_nstates(v_0,s_0,u_0,H_jj,S2_jj,n,keys_tmp,Nint,N_st,sze_8) call davidson_init(handler) do sh=shortcut(0,1),1,-1 workload += (shortcut(sh+1,1) - shortcut(sh,1))**2 - if(workload > 1000000) then + if(workload > 100000) then blocke = sh call davidson_add_task(handler, blocke, blockb) blockb = sh-1 @@ -258,51 +258,51 @@ subroutine H_S2_u_0_nstates(v_0,s_0,u_0,H_jj,S2_jj,n,keys_tmp,Nint,N_st,sze_8) enddo if(blockb > 0) call davidson_add_task(handler, 1, blockb) - call davidson_run(handler, v_0, s_0) - + !$OMP PARALLEL DEFAULT(NONE) & !$OMP PRIVATE(i,hij,s2,j,k,jj,vt,st,ii,sh,sh2,ni,exa,ext,org_i,org_j,endi,sorted_i,istate)& - !$OMP SHARED(n,keys_tmp,ut,Nint,v_0,s_0,sorted,shortcut,sort_idx,version,N_st,N_st_8) - allocate(vt(N_st_8,n),st(N_st_8,n)) - Vt = 0.d0 - St = 0.d0 - - !$OMP DO SCHEDULE(dynamic) - do sh=1,shortcut(0,2) - do i=shortcut(sh,2),shortcut(sh+1,2)-1 - org_i = sort_idx(i,2) - do j=shortcut(sh,2),i-1 - org_j = sort_idx(j,2) - ext = 0 - do ni=1,Nint - ext = ext + popcnt(xor(sorted(ni,i,2), sorted(ni,j,2))) - end do - if(ext == 4) then - call i_h_j (keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,hij) - call get_s2(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,s2) - do istate=1,n_st - vt (istate,org_i) = vt (istate,org_i) + hij*ut(istate,org_j) - vt (istate,org_j) = vt (istate,org_j) + hij*ut(istate,org_i) - st (istate,org_i) = st (istate,org_i) + s2*ut(istate,org_j) - st (istate,org_j) = st (istate,org_j) + s2*ut(istate,org_i) - enddo - end if - end do - end do - enddo - !$OMP END DO NOWAIT - - !$OMP CRITICAL - do istate=1,N_st - do i=n,1,-1 - v_0(i,istate) = v_0(i,istate) + vt(istate,i) - s_0(i,istate) = s_0(i,istate) + st(istate,i) - enddo - enddo - !$OMP END CRITICAL + !$OMP SHARED(n,keys_tmp,ut,Nint,v_0,s_0,sorted,shortcut,sort_idx,version,N_st,N_st_8) - deallocate(vt,st) + allocate(vt(N_st_8,n),st(N_st_8,n)) + Vt = 0.d0 + St = 0.d0 + + !$OMP DO SCHEDULE(dynamic) + do sh=1,shortcut(0,2) + do i=shortcut(sh,2),shortcut(sh+1,2)-1 + org_i = sort_idx(i,2) + do j=shortcut(sh,2),i-1 + org_j = sort_idx(j,2) + ext = 0 + do ni=1,Nint + ext = ext + popcnt(xor(sorted(ni,i,2), sorted(ni,j,2))) + end do + if(ext == 4) then + call i_h_j (keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,hij) + call get_s2(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,s2) + do istate=1,n_st + vt (istate,org_i) = vt (istate,org_i) + hij*ut(istate,org_j) + vt (istate,org_j) = vt (istate,org_j) + hij*ut(istate,org_i) + st (istate,org_i) = st (istate,org_i) + s2*ut(istate,org_j) + st (istate,org_j) = st (istate,org_j) + s2*ut(istate,org_i) + enddo + end if + end do + end do + enddo + !$OMP END DO NOWAIT + + !$OMP CRITICAL + do istate=1,N_st + do i=n,1,-1 + v_0(i,istate) = v_0(i,istate) + vt(istate,i) + s_0(i,istate) = s_0(i,istate) + st(istate,i) + enddo + enddo + !$OMP END CRITICAL + + deallocate(vt,st) !$OMP END PARALLEL do istate=1,N_st @@ -311,6 +311,7 @@ subroutine H_S2_u_0_nstates(v_0,s_0,u_0,H_jj,S2_jj,n,keys_tmp,Nint,N_st,sze_8) s_0(i,istate) = s_0(i,istate) + s2_jj(i)* u_0(i,istate) enddo enddo + deallocate (shortcut, sort_idx, sorted, version) end diff --git a/src/Utils/transpose.irp.f b/src/Utils/transpose.irp.f new file mode 100644 index 00000000..32e502e9 --- /dev/null +++ b/src/Utils/transpose.irp.f @@ -0,0 +1,78 @@ +!DIR$ attributes forceinline :: transpose +recursive subroutine transpose(A,LDA,B,LDB,d1,d2) + implicit none + BEGIN_DOC +! Transpose input matrix A into output matrix B + END_DOC + integer, intent(in) :: d1, d2, LDA, LDB + real, intent(in) :: A(LDA,d2) + real, intent(out) :: B(LDB,d1) + + integer :: i,j,k, mod_align + if ( d2 < 32 ) then + do j=1,d1 + !DIR$ LOOP COUNT (16) + do i=1,d2 + B(i,j ) = A(j ,i) + enddo + enddo + return + else if (d1 > d2) then + !DIR$ forceinline + k=d1/2 + !DIR$ forceinline recursive + call transpose(A(1,1),LDA,B(1,1),LDB,k,d2) + !DIR$ forceinline recursive + call transpose(A(k+1,1),LDA,B(1,k+1),LDB,d1-k,d2) + return + else + !DIR$ forceinline + k=d2/2 + !DIR$ forceinline recursive + call transpose(A(1,k+1),LDA,B(k+1,1),LDB,d1,d2-k) + !DIR$ forceinline recursive + call transpose(A(1,1),LDA,B(1,1),LDB,d1,k) + return + endif + +end + +!DIR$ attributes forceinline :: dtranspose +recursive subroutine dtranspose(A,LDA,B,LDB,d1,d2) + implicit none + BEGIN_DOC +! Transpose input matrix A into output matrix B + END_DOC + integer, intent(in) :: d1, d2, LDA, LDB + double precision, intent(in) :: A(LDA,d2) + double precision, intent(out) :: B(LDB,d1) + + integer :: i,j,k, mod_align + if ( d2 < 32 ) then + do j=1,d1 + !DIR$ LOOP COUNT (16) + do i=1,d2 + B(i,j ) = A(j ,i) + enddo + enddo + return + else if (d1 > d2) then + !DIR$ forceinline + k=d1/2 + !DIR$ forceinline recursive + call dtranspose(A(1,1),LDA,B(1,1),LDB,k,d2) + !DIR$ forceinline recursive + call dtranspose(A(k+1,1),LDA,B(1,k+1),LDB,d1-k,d2) + return + else + !DIR$ forceinline + k=d2/2 + !DIR$ forceinline recursive + call dtranspose(A(1,k+1),LDA,B(k+1,1),LDB,d1,d2-k) + !DIR$ forceinline recursive + call dtranspose(A(1,1),LDA,B(1,1),LDB,d1,k) + return + endif + +end +