10
0
mirror of https://github.com/LCPQ/quantum_package synced 2025-01-10 04:58:25 +01:00

Improve parallel davidson

This commit is contained in:
Anthony Scemama 2016-10-07 00:42:03 +02:00
parent 1de02c46e6
commit 693604d338
3 changed files with 150 additions and 70 deletions

View File

@ -91,7 +91,7 @@ end subroutine
subroutine davidson_collect(blockb, blocke, N, idx, vt, st , v0, s0) subroutine davidson_collect(blockb, blocke, N, idx, vt, st , v0t, s0t)
implicit none implicit none
@ -100,14 +100,19 @@ subroutine davidson_collect(blockb, blocke, N, idx, vt, st , v0, s0)
integer , intent(in) :: idx(N) integer , intent(in) :: idx(N)
double precision , intent(in) :: vt(N_states_diag, N) double precision , intent(in) :: vt(N_states_diag, N)
double precision , intent(in) :: st(N_states_diag, N) double precision , intent(in) :: st(N_states_diag, N)
double precision , intent(inout) :: v0(dav_size, N_states_diag) double precision , intent(inout) :: v0t(N_states_diag,dav_size)
double precision , intent(inout) :: s0(dav_size, N_states_diag) double precision , intent(inout) :: s0t(N_states_diag,dav_size)
integer :: i integer :: i, j, k
!DIR$ IVDEP
do i=1,N do i=1,N
v0(idx(i), :) += vt(:, i) k = idx(i)
s0(idx(i), :) += st(:, i) !DIR$ IVDEP
do j=1,N_states_diag
v0t(j,k) = v0t(j,k) + vt(j,i)
s0t(j,k) = s0t(j,k) + st(j,i)
enddo
end do end do
end subroutine end subroutine
@ -221,7 +226,6 @@ subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, worker_id)
call davidson_process(blockb, blocke, N, idx, vt, st) call davidson_process(blockb, blocke, N, idx, vt, st)
! reverse ?
call task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id) call task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id)
call davidson_push_results(zmq_socket_push, blockb, blocke, N, idx, vt, st, task_id) call davidson_push_results(zmq_socket_push, blockb, blocke, N, idx, vt, st, task_id)
end do end do
@ -321,7 +325,7 @@ subroutine davidson_collector(zmq_to_qp_run_socket, zmq_socket_pull , v0, s0)
integer :: blockb, blocke integer :: blockb, blocke
integer :: N integer :: N
integer , allocatable :: idx(:) integer , allocatable :: idx(:)
double precision , allocatable :: vt(:,:) double precision , allocatable :: vt(:,:), v0t(:,:), s0t(:,:)
double precision , allocatable :: st(:,:) double precision , allocatable :: st(:,:)
integer :: deleted integer :: deleted
logical, allocatable :: done(:) logical, allocatable :: done(:)
@ -332,18 +336,23 @@ subroutine davidson_collector(zmq_to_qp_run_socket, zmq_socket_pull , v0, s0)
allocate(idx(dav_size)) allocate(idx(dav_size))
allocate(vt(N_states_diag, dav_size)) allocate(vt(N_states_diag, dav_size))
allocate(st(N_states_diag, dav_size)) allocate(st(N_states_diag, dav_size))
allocate(v0t(N_states_diag, dav_size))
allocate(s0t(N_states_diag, dav_size))
more = 1 more = 1
do while (more == 1) do while (more == 1)
call davidson_pull_results(zmq_socket_pull, blockb, blocke, N, idx, vt, st, task_id) call davidson_pull_results(zmq_socket_pull, blockb, blocke, N, idx, vt, st, task_id)
call davidson_collect(blockb, blocke, N, idx, vt, st , v0, s0) !DIR$ FORCEINLINE
! done(block) = .true. call davidson_collect(blockb, blocke, N, idx, vt, st , v0t, s0t)
call zmq_delete_task(zmq_to_qp_run_socket,zmq_socket_pull,task_id,more) call zmq_delete_task(zmq_to_qp_run_socket,zmq_socket_pull,task_id,more)
deleted += 1 deleted += 1
! print *, "DELETED", deleted, done
end do end do
! print *, "done collector" deallocate(idx,vt,st,done)
call dtranspose(v0t,size(v0t,1), v0, size(v0,1), N_states_diag, dav_size)
call dtranspose(s0t,size(s0t,1), s0, size(s0,1), N_states_diag, dav_size)
deallocate(v0t,s0t)
end subroutine end subroutine
@ -382,7 +391,7 @@ subroutine davidson_run(zmq_to_qp_run_socket , v0, s0)
else if(i==1) then else if(i==1) then
call davidson_miniserver_run() call davidson_miniserver_run()
else else
call davidson_slave_inproc(i) call davidson_slave_inproc(i-1)
endif endif
!$OMP END PARALLEL !$OMP END PARALLEL
call end_parallel_job(zmq_to_qp_run_socket, 'davidson') call end_parallel_job(zmq_to_qp_run_socket, 'davidson')
@ -393,7 +402,6 @@ end subroutine
subroutine davidson_miniserver_run() subroutine davidson_miniserver_run()
use f77_zmq use f77_zmq
implicit none implicit none
integer(ZMQ_PTR) context
integer(ZMQ_PTR) responder integer(ZMQ_PTR) responder
character*(64) address character*(64) address
character(len=:), allocatable :: buffer character(len=:), allocatable :: buffer
@ -402,8 +410,7 @@ subroutine davidson_miniserver_run()
allocate (character(len=20) :: buffer) allocate (character(len=20) :: buffer)
address = 'tcp://*:11223' address = 'tcp://*:11223'
context = f77_zmq_ctx_new() responder = f77_zmq_socket(zmq_context, ZMQ_REP)
responder = f77_zmq_socket(context, ZMQ_REP)
rc = f77_zmq_bind(responder,address) rc = f77_zmq_bind(responder,address)
do do
@ -419,7 +426,6 @@ subroutine davidson_miniserver_run()
enddo enddo
rc = f77_zmq_close(responder) rc = f77_zmq_close(responder)
rc = f77_zmq_ctx_destroy(context)
end subroutine end subroutine
@ -427,21 +433,20 @@ subroutine davidson_miniserver_end()
implicit none implicit none
use f77_zmq use f77_zmq
integer(ZMQ_PTR) context
integer(ZMQ_PTR) requester integer(ZMQ_PTR) requester
character*(64) address character*(64) address
integer rc integer rc
character*(64) buf character*(64) buf
address = trim(qp_run_address)//':11223' address = trim(qp_run_address)//':11223'
context = f77_zmq_ctx_new() requester = f77_zmq_socket(zmq_context, ZMQ_REQ)
requester = f77_zmq_socket(context, ZMQ_REQ)
rc = f77_zmq_connect(requester,address) rc = f77_zmq_connect(requester,address)
rc = f77_zmq_send(requester, "end", 3, 0) rc = f77_zmq_send(requester, "end", 3, ZMQ_NOBLOCK)
rc = f77_zmq_recv(requester, buf, 3, 0) if (rc > 0) then
rc = f77_zmq_recv(requester, buf, 3, 0)
endif
rc = f77_zmq_close(requester) rc = f77_zmq_close(requester)
rc = f77_zmq_ctx_destroy(context)
end subroutine end subroutine
@ -449,7 +454,6 @@ subroutine davidson_miniserver_get()
implicit none implicit none
use f77_zmq use f77_zmq
integer(ZMQ_PTR) context
integer(ZMQ_PTR) requester integer(ZMQ_PTR) requester
character*(64) address character*(64) address
character*(20) buffer character*(20) buffer
@ -457,8 +461,7 @@ subroutine davidson_miniserver_get()
address = trim(qp_run_address)//':11223' address = trim(qp_run_address)//':11223'
context = f77_zmq_ctx_new() requester = f77_zmq_socket(zmq_context, ZMQ_REQ)
requester = f77_zmq_socket(context, ZMQ_REQ)
rc = f77_zmq_connect(requester,address) rc = f77_zmq_connect(requester,address)
rc = f77_zmq_send(requester, "Hello", 5, 0) rc = f77_zmq_send(requester, "Hello", 5, 0)
@ -469,9 +472,7 @@ subroutine davidson_miniserver_get()
TOUCH dav_det dav_ut TOUCH dav_det dav_ut
rc = f77_zmq_close(requester) rc = f77_zmq_close(requester)
rc = f77_zmq_ctx_destroy(context)
touch dav_det dav_ut
end subroutine end subroutine

View File

@ -249,7 +249,7 @@ subroutine H_S2_u_0_nstates(v_0,s_0,u_0,H_jj,S2_jj,n,keys_tmp,Nint,N_st,sze_8)
call davidson_init(handler) call davidson_init(handler)
do sh=shortcut(0,1),1,-1 do sh=shortcut(0,1),1,-1
workload += (shortcut(sh+1,1) - shortcut(sh,1))**2 workload += (shortcut(sh+1,1) - shortcut(sh,1))**2
if(workload > 1000000) then if(workload > 100000) then
blocke = sh blocke = sh
call davidson_add_task(handler, blocke, blockb) call davidson_add_task(handler, blocke, blockb)
blockb = sh-1 blockb = sh-1
@ -258,51 +258,51 @@ subroutine H_S2_u_0_nstates(v_0,s_0,u_0,H_jj,S2_jj,n,keys_tmp,Nint,N_st,sze_8)
enddo enddo
if(blockb > 0) call davidson_add_task(handler, 1, blockb) if(blockb > 0) call davidson_add_task(handler, 1, blockb)
call davidson_run(handler, v_0, s_0) call davidson_run(handler, v_0, s_0)
!$OMP PARALLEL DEFAULT(NONE) & !$OMP PARALLEL DEFAULT(NONE) &
!$OMP PRIVATE(i,hij,s2,j,k,jj,vt,st,ii,sh,sh2,ni,exa,ext,org_i,org_j,endi,sorted_i,istate)& !$OMP PRIVATE(i,hij,s2,j,k,jj,vt,st,ii,sh,sh2,ni,exa,ext,org_i,org_j,endi,sorted_i,istate)&
!$OMP SHARED(n,keys_tmp,ut,Nint,v_0,s_0,sorted,shortcut,sort_idx,version,N_st,N_st_8) !$OMP SHARED(n,keys_tmp,ut,Nint,v_0,s_0,sorted,shortcut,sort_idx,version,N_st,N_st_8)
allocate(vt(N_st_8,n),st(N_st_8,n))
Vt = 0.d0
St = 0.d0
!$OMP DO SCHEDULE(dynamic) allocate(vt(N_st_8,n),st(N_st_8,n))
do sh=1,shortcut(0,2) Vt = 0.d0
do i=shortcut(sh,2),shortcut(sh+1,2)-1 St = 0.d0
org_i = sort_idx(i,2)
do j=shortcut(sh,2),i-1 !$OMP DO SCHEDULE(dynamic)
org_j = sort_idx(j,2) do sh=1,shortcut(0,2)
ext = 0 do i=shortcut(sh,2),shortcut(sh+1,2)-1
do ni=1,Nint org_i = sort_idx(i,2)
ext = ext + popcnt(xor(sorted(ni,i,2), sorted(ni,j,2))) do j=shortcut(sh,2),i-1
org_j = sort_idx(j,2)
ext = 0
do ni=1,Nint
ext = ext + popcnt(xor(sorted(ni,i,2), sorted(ni,j,2)))
end do
if(ext == 4) then
call i_h_j (keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,hij)
call get_s2(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,s2)
do istate=1,n_st
vt (istate,org_i) = vt (istate,org_i) + hij*ut(istate,org_j)
vt (istate,org_j) = vt (istate,org_j) + hij*ut(istate,org_i)
st (istate,org_i) = st (istate,org_i) + s2*ut(istate,org_j)
st (istate,org_j) = st (istate,org_j) + s2*ut(istate,org_i)
enddo
end if
end do end do
if(ext == 4) then
call i_h_j (keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,hij)
call get_s2(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,s2)
do istate=1,n_st
vt (istate,org_i) = vt (istate,org_i) + hij*ut(istate,org_j)
vt (istate,org_j) = vt (istate,org_j) + hij*ut(istate,org_i)
st (istate,org_i) = st (istate,org_i) + s2*ut(istate,org_j)
st (istate,org_j) = st (istate,org_j) + s2*ut(istate,org_i)
enddo
end if
end do end do
end do
enddo
!$OMP END DO NOWAIT
!$OMP CRITICAL
do istate=1,N_st
do i=n,1,-1
v_0(i,istate) = v_0(i,istate) + vt(istate,i)
s_0(i,istate) = s_0(i,istate) + st(istate,i)
enddo enddo
enddo !$OMP END DO NOWAIT
!$OMP END CRITICAL
deallocate(vt,st) !$OMP CRITICAL
do istate=1,N_st
do i=n,1,-1
v_0(i,istate) = v_0(i,istate) + vt(istate,i)
s_0(i,istate) = s_0(i,istate) + st(istate,i)
enddo
enddo
!$OMP END CRITICAL
deallocate(vt,st)
!$OMP END PARALLEL !$OMP END PARALLEL
do istate=1,N_st do istate=1,N_st
@ -311,6 +311,7 @@ subroutine H_S2_u_0_nstates(v_0,s_0,u_0,H_jj,S2_jj,n,keys_tmp,Nint,N_st,sze_8)
s_0(i,istate) = s_0(i,istate) + s2_jj(i)* u_0(i,istate) s_0(i,istate) = s_0(i,istate) + s2_jj(i)* u_0(i,istate)
enddo enddo
enddo enddo
deallocate (shortcut, sort_idx, sorted, version) deallocate (shortcut, sort_idx, sorted, version)
end end

78
src/Utils/transpose.irp.f Normal file
View File

@ -0,0 +1,78 @@
!DIR$ attributes forceinline :: transpose
recursive subroutine transpose(A,LDA,B,LDB,d1,d2)
implicit none
BEGIN_DOC
! Transpose input matrix A into output matrix B
END_DOC
integer, intent(in) :: d1, d2, LDA, LDB
real, intent(in) :: A(LDA,d2)
real, intent(out) :: B(LDB,d1)
integer :: i,j,k, mod_align
if ( d2 < 32 ) then
do j=1,d1
!DIR$ LOOP COUNT (16)
do i=1,d2
B(i,j ) = A(j ,i)
enddo
enddo
return
else if (d1 > d2) then
!DIR$ forceinline
k=d1/2
!DIR$ forceinline recursive
call transpose(A(1,1),LDA,B(1,1),LDB,k,d2)
!DIR$ forceinline recursive
call transpose(A(k+1,1),LDA,B(1,k+1),LDB,d1-k,d2)
return
else
!DIR$ forceinline
k=d2/2
!DIR$ forceinline recursive
call transpose(A(1,k+1),LDA,B(k+1,1),LDB,d1,d2-k)
!DIR$ forceinline recursive
call transpose(A(1,1),LDA,B(1,1),LDB,d1,k)
return
endif
end
!DIR$ attributes forceinline :: dtranspose
recursive subroutine dtranspose(A,LDA,B,LDB,d1,d2)
implicit none
BEGIN_DOC
! Transpose input matrix A into output matrix B
END_DOC
integer, intent(in) :: d1, d2, LDA, LDB
double precision, intent(in) :: A(LDA,d2)
double precision, intent(out) :: B(LDB,d1)
integer :: i,j,k, mod_align
if ( d2 < 32 ) then
do j=1,d1
!DIR$ LOOP COUNT (16)
do i=1,d2
B(i,j ) = A(j ,i)
enddo
enddo
return
else if (d1 > d2) then
!DIR$ forceinline
k=d1/2
!DIR$ forceinline recursive
call dtranspose(A(1,1),LDA,B(1,1),LDB,k,d2)
!DIR$ forceinline recursive
call dtranspose(A(k+1,1),LDA,B(1,k+1),LDB,d1-k,d2)
return
else
!DIR$ forceinline
k=d2/2
!DIR$ forceinline recursive
call dtranspose(A(1,k+1),LDA,B(k+1,1),LDB,d1,d2-k)
!DIR$ forceinline recursive
call dtranspose(A(1,1),LDA,B(1,1),LDB,d1,k)
return
endif
end