10
0
mirror of https://github.com/LCPQ/quantum_package synced 2024-06-26 07:02:14 +02:00

removed OMP - double work

This commit is contained in:
Yann Garniron 2016-10-10 14:45:58 +02:00
parent 61f1e0814a
commit 30eda7b248
2 changed files with 124 additions and 165 deletions

View File

@ -4,58 +4,44 @@
use bitmasks
use f77_zmq
subroutine davidson_process(blockb, blocke, vt, st)
use f77_zmq
subroutine davidson_process(blockb, blocke, N, idx, vt, st, bs)
implicit none
integer , intent(in) :: blockb, blocke
double precision , intent(inout) :: vt(N_states_diag, dav_size)
double precision , intent(inout) :: st(N_states_diag, dav_size)
integer , intent(in) :: blockb, blocke, bs
integer , intent(inout) :: N
integer , intent(inout) :: idx(bs)
double precision , intent(inout) :: vt(N_states_diag, bs)
double precision , intent(inout) :: st(N_states_diag, bs)
integer :: i, j, sh, sh2, exa, ext, org_i, org_j, istate, ni, endi
integer :: i,ii, j, sh, sh2, exa, ext, org_i, org_j, istate, ni, endi
integer(bit_kind) :: sorted_i(N_int)
double precision :: s2, hij
integer, external :: omp_get_thread_num
double precision, allocatable :: locals(:,:), localv(:,:)
logical, allocatable :: wrotten(:)
provide dav_det dav_ut shortcut_
! Otherwise-useless calls, made here so that nothing has to be provided inside the parallel section
call i_h_j (dav_det(1,1,1),dav_det(1,1,dav_size),n_int,hij)
call get_s2(dav_det(1,1,1),dav_det(1,1,dav_size),n_int,s2)
!!!!!
allocate(wrotten(bs))
wrotten = .false.
do sh = blockb, blocke
!$OMP PARALLEL &
!$OMP default(none) &
!$OMP shared(vt, st, blockb, blocke, sh, shortcut_, version_, sorted_, sort_idx_, dav_det, dav_ut, N_int, N_states_diag) &
!$OMP private(i,j,sh2, locals, localv, exa, ni, ext, org_i, org_j, sorted_i, endi, hij, s2)
allocate(locals(N_states_diag, shortcut_(sh+1,1) - shortcut_(sh,1)))
allocate(localv(N_states_diag, shortcut_(sh+1,1) - shortcut_(sh,1)))
locals = 0d0
localv = 0d0
!$OMP DO schedule(static, 1)
do sh2=sh, shortcut_(0,1) !1,sh
do sh2=1,shortcut_(0,1)
exa = 0
do ni=1,N_int
exa = exa + popcnt(xor(version_(ni,sh,1), version_(ni,sh2,1)))
end do
if(exa > 2) then
cycle
end if
if(exa > 2) cycle
do i=shortcut_(sh,1),shortcut_(sh+1,1)-1
org_i = sort_idx_(i,1)
if(sh==sh2) then
endi = i-1
else
endi = shortcut_(sh2+1,1)-1
end if
ii = i - shortcut_(blockb,1) + 1
org_i = sort_idx_(i,1)
do ni=1,N_int
sorted_i(ni) = sorted_(ni,i,1)
enddo
do j=shortcut_(sh2,1),endi
do j=shortcut_(sh2,1), shortcut_(sh2+1,1)-1
if(i == j) cycle
org_j = sort_idx_(j,1)
ext = exa
do ni=1,N_int
@ -64,49 +50,30 @@ subroutine davidson_process(blockb, blocke, vt, st)
if(ext <= 4) then
call i_h_j (dav_det(1,1,org_j),dav_det(1,1,org_i),n_int,hij)
call get_s2(dav_det(1,1,org_j),dav_det(1,1,org_i),n_int,s2)
if(sh == sh2) then
!$OMP CRITICAL
do istate=1,N_states_diag
localv (istate,i+1-shortcut_(sh,1)) += hij*dav_ut(istate,org_j)
locals (istate,i+1-shortcut_(sh,1)) += s2*dav_ut(istate,org_j)
vt (istate,org_j) += hij*dav_ut(istate,org_i)
st (istate,org_j) += s2*dav_ut(istate,org_i)
enddo
!$OMP END CRITICAL
else
do istate=1,N_states_diag
localv (istate,i+1-shortcut_(sh,1)) += hij*dav_ut(istate,org_j)
locals (istate,i+1-shortcut_(sh,1)) += s2*dav_ut(istate,org_j)
vt (istate,org_j) += hij*dav_ut(istate,org_i)
st (istate,org_j) += s2*dav_ut(istate,org_i)
enddo
if(.not. wrotten(ii)) then
wrotten(ii) = .true.
idx(ii) = org_i
vt (:,ii) = 0d0
st (:,ii) = 0d0
end if
do istate=1,N_states_diag
vt (istate,ii) += hij*dav_ut(istate,org_j)
st (istate,ii) += s2*dav_ut(istate,org_j)
enddo
endif
enddo
enddo
enddo
!$OMP ENDDO
!$OMP CRITICAL
do i=1,shortcut_(sh+1,1) - shortcut_(sh,1)
do istate=1,N_states_diag
vt(istate, sort_idx_(shortcut_(sh,1) - 1 + i, 1)) += localv(istate,i)
st(istate, sort_idx_(shortcut_(sh,1) - 1 + i, 1)) += locals(istate,i)
end do
end do
!$OMP END CRITICAL
!$OMP END PARALLEL
enddo
do sh=blockb,min(blocke, shortcut_(0,2))
!$OMP PARALLEL DO default(none) schedule(dynamic) &
!$OMP shared(vt, st, blockb, blocke, sh, shortcut_, version_, sorted_, sort_idx_, dav_det, dav_ut, N_int, N_states_diag) &
!$OMP private(exa, ni, ext, org_i, org_j, sorted_i, endi, hij, s2)
do sh2=sh, shortcut_(0,2), shortcut_(0,1)
do sh2=sh, shortcut_(0,2), shortcut_(0,1)*51
do i=shortcut_(sh2,2),shortcut_(sh2+1,2)-1
ii += 1
org_i = sort_idx_(i,2)
do j=shortcut_(sh2,2),i-1
do j=shortcut_(sh2,2),shortcut_(sh2+1,2)-1
if(i == j) cycle
org_j = sort_idx_(j,2)
ext = 0
do ni=1,N_int
@ -115,20 +82,31 @@ subroutine davidson_process(blockb, blocke, vt, st)
if(ext == 4) then
call i_h_j (dav_det(1,1,org_j),dav_det(1,1,org_i),n_int,hij)
call get_s2(dav_det(1,1,org_j),dav_det(1,1,org_i),n_int,s2)
!$OMP CRITICAL
if(.not. wrotten(ii)) then
wrotten(ii) = .true.
idx(ii) = org_i
vt (:,ii) = 0d0
st (:,ii) = 0d0
end if
do istate=1,N_states_diag
vt (istate,org_i) = vt (istate,org_i) + hij*dav_ut(istate,org_j)
vt (istate,org_j) = vt (istate,org_j) + hij*dav_ut(istate,org_i)
st (istate,org_i) = st (istate,org_i) + s2*dav_ut(istate,org_j)
st (istate,org_j) = st (istate,org_j) + s2*dav_ut(istate,org_i)
vt (istate,ii) += hij*dav_ut(istate,org_j)
st (istate,ii) += s2*dav_ut(istate,org_j)
enddo
!$OMP END CRITICAL
end if
end do
end do
enddo
!$OMP END PARALLEL DO
enddo
N=0
do i=1,bs
if(wrotten(i)) then
N += 1
idx(N) = idx(i)
vt(:,N) = vt(:,i)
st(:,N) = st(:,i)
end if
end do
end subroutine
@ -194,12 +172,6 @@ subroutine davidson_slave_inproc(i)
call davidson_run_slave(1,i)
end
integer function davidson_slave_inproc_omp()
  ! Argument-less wrapper around davidson_run_slave.
  !
  ! davidson_run declares this function as an integer external and hands it
  ! to pthread_create, so it takes no arguments and returns an integer.
  !
  ! Returns: always 0.
  implicit none
  ! NOTE(review): the meaning of the two literal arguments (1, 2) is defined
  ! by davidson_run_slave, which is not visible here -- confirm before changing.
  call davidson_run_slave(1,2)
  davidson_slave_inproc_omp = 0
  ! Closed with "end function" to match the "integer function" opening
  ! (the block was previously closed with "end subroutine").
end function
subroutine davidson_slave_tcp(i)
implicit none
@ -252,8 +224,7 @@ subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, worker_id)
integer(ZMQ_PTR),intent(in) :: zmq_to_qp_run_socket
integer(ZMQ_PTR),intent(in) :: zmq_socket_push
integer,intent(in) :: worker_id
integer :: i, taskn, myTask, istate
integer, allocatable :: task_id(:)
integer :: task_id
character*(512) :: task
@ -263,59 +234,42 @@ subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, worker_id)
double precision , allocatable :: vt(:,:)
double precision , allocatable :: st(:,:)
allocate(task_id(100))
allocate(idx(dav_size))
allocate(vt(N_states_diag, dav_size))
allocate(st(N_states_diag, dav_size))
vt = 0d0
st = 0d0
taskn = 0
integer :: bs, i, j
allocate(idx(1), vt(1,1), st(1,1))
do
call get_task_from_taskserver(zmq_to_qp_run_socket,worker_id, myTask, task)
if(myTask /= 0) then
read (task,*) blockb, blocke
call davidson_process(blockb, blocke, vt, st)
taskn += 1
task_id(taskn) = myTask
call get_task_from_taskserver(zmq_to_qp_run_socket,worker_id, task_id, task)
if(task_id == 0) exit
read (task,*) blockb, blocke
bs = shortcut_(blocke+1,1) - shortcut_(blockb, 1)
do i=blockb, shortcut_(0,2), shortcut_(0,1)
do j=i, min(i+blocke-blockb, shortcut_(0,2))
bs += shortcut_(j+1,2) - shortcut_(j, 2)
end do
end do
if(bs > size(idx)) then
deallocate(idx, vt, st)
allocate(idx(bs))
allocate(vt(N_states_diag, bs))
allocate(st(N_states_diag, bs))
end if
call davidson_process(blockb, blocke, N, idx, vt, st, bs)
if(myTask == 0 .or. taskn == size(task_id)) then
N = 0
do i=1, dav_size
if(vt(1,i) /= 0d0 .or. st(1,i) /= 0d0) then
N = N+1
do istate=1,N_states_diag
vt (istate,N) = vt (istate,i)
st (istate,N) = st (istate,i)
idx(N) = i
enddo
end if
end do
do i = 1, taskn
call task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id(i))
end do
if(taskn /= 0) call davidson_push_results(zmq_socket_push, blockb, blocke, N, idx, vt, st, taskn, task_id)
if(myTask == 0) exit
vt = 0d0
st = 0d0
taskn = 0
end if
call task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id)
call davidson_push_results(zmq_socket_push, blockb, blocke, N, idx, vt, st, task_id)
end do
end subroutine
subroutine davidson_push_results(zmq_socket_push, blockb, blocke, N, idx, vt, st, taskn, task_id)
subroutine davidson_push_results(zmq_socket_push, blockb, blocke, N, idx, vt, st, task_id)
use f77_zmq
implicit none
integer(ZMQ_PTR) ,intent(in) :: zmq_socket_push
integer ,intent(in) :: task_id(100), taskn
integer ,intent(in) :: task_id
integer ,intent(in) :: blockb, blocke
integer ,intent(in) :: N
@ -342,26 +296,23 @@ subroutine davidson_push_results(zmq_socket_push, blockb, blocke, N, idx, vt, st
rc = f77_zmq_send( zmq_socket_push, st, 8*N_states_diag* N, ZMQ_SNDMORE)
if(rc /= 8*N_states_diag* N) stop "davidson_push_results failed to push st"
rc = f77_zmq_send( zmq_socket_push, taskn, 4, ZMQ_SNDMORE)
if(rc /= 4) stop "davidson_push_results failed to push taskn"
rc = f77_zmq_send( zmq_socket_push, task_id, 4*taskn, 0)
if(rc /= 4*taskn) stop "davidson_push_results failed to push task_id"
rc = f77_zmq_send( zmq_socket_push, task_id, 4, 0)
if(rc /= 4) stop "davidson_push_results failed to push task_id"
end subroutine
subroutine davidson_pull_results(zmq_socket_pull, blockb, blocke, N, idx, vt, st, taskn, task_id)
subroutine davidson_pull_results(zmq_socket_pull, blockb, blocke, N, idx, vt, st, task_id)
use f77_zmq
implicit none
integer(ZMQ_PTR) ,intent(in) :: zmq_socket_pull
integer ,intent(out) :: task_id(100), taskn
integer ,intent(out) :: task_id
integer ,intent(out) :: blockb, blocke
integer ,intent(out) :: N
integer ,intent(out) :: idx(dav_size)
double precision ,intent(out) :: vt(N_states_diag, dav_size)
double precision ,intent(out) :: st(N_states_diag, dav_size)
integer ,intent(out) :: idx(*)
double precision ,intent(out) :: vt(N_states_diag, *)
double precision ,intent(out) :: st(N_states_diag, *)
integer :: rc
@ -383,11 +334,8 @@ subroutine davidson_pull_results(zmq_socket_pull, blockb, blocke, N, idx, vt, st
rc = f77_zmq_recv( zmq_socket_pull, st, 8*N_states_diag* N, 0)
if(rc /= 8*N_states_diag* N) stop "davidson_push_results failed to pull st"
rc = f77_zmq_recv( zmq_socket_pull, taskn, 4, 0)
if(rc /= 4) stop "davidson_pull_results failed to pull taskn"
rc = f77_zmq_recv( zmq_socket_pull, task_id, 4*taskn, 0)
if(rc /= 4*taskn) stop "davidson_pull_results failed to pull task_id"
rc = f77_zmq_recv( zmq_socket_pull, task_id, 4, 0)
if(rc /= 4) stop "davidson_pull_results failed to pull task_id"
end subroutine
@ -402,7 +350,7 @@ subroutine davidson_collector(zmq_to_qp_run_socket, zmq_socket_pull , v0, s0)
double precision ,intent(inout) :: v0(dav_size, N_states_diag)
double precision ,intent(inout) :: s0(dav_size, N_states_diag)
integer :: more, task_id(100), taskn
integer :: more, task_id
integer :: blockb, blocke
@ -411,11 +359,12 @@ subroutine davidson_collector(zmq_to_qp_run_socket, zmq_socket_pull , v0, s0)
double precision , allocatable :: vt(:,:), v0t(:,:), s0t(:,:)
double precision , allocatable :: st(:,:)
integer :: i
integer :: msize
allocate(idx(dav_size))
allocate(vt(N_states_diag, dav_size))
allocate(st(N_states_diag, dav_size))
msize = (max_workload + max_blocksize)*2
allocate(idx(msize))
allocate(vt(N_states_diag, msize))
allocate(st(N_states_diag, msize))
allocate(v0t(N_states_diag, dav_size))
allocate(s0t(N_states_diag, dav_size))
@ -425,13 +374,10 @@ subroutine davidson_collector(zmq_to_qp_run_socket, zmq_socket_pull , v0, s0)
more = 1
do while (more == 1)
call davidson_pull_results(zmq_socket_pull, blockb, blocke, N, idx, vt, st, taskn, task_id)
call davidson_pull_results(zmq_socket_pull, blockb, blocke, N, idx, vt, st, task_id)
!DIR$ FORCEINLINE
call davidson_collect(blockb, blocke, N, idx, vt, st , v0t, s0t)
do i=1,taskn
call zmq_delete_task(zmq_to_qp_run_socket,zmq_socket_pull,task_id(i),more)
end do
call zmq_delete_task(zmq_to_qp_run_socket,zmq_socket_pull,task_id,more)
end do
deallocate(idx,vt,st)
@ -450,39 +396,41 @@ subroutine davidson_run(zmq_to_qp_run_socket , v0, s0)
integer(ZMQ_PTR) :: zmq_collector
integer(ZMQ_PTR), external :: new_zmq_pull_socket
integer(ZMQ_PTR) :: zmq_socket_pull
integer(ZMQ_PTR) :: pthread_slave, pthread_miniserver
integer :: i
integer, external :: omp_get_thread_num
double precision , intent(inout) :: v0(dav_size, N_states_diag)
double precision , intent(inout) :: s0(dav_size, N_states_diag)
integer, external :: davidson_miniserver_run, davidson_slave_inproc_omp
call zmq_set_running(zmq_to_qp_run_socket)
zmq_collector = new_zmq_to_qp_run_socket()
zmq_socket_pull = new_zmq_pull_socket()
i = omp_get_thread_num()
PROVIDE nproc
i = pthread_create ( pthread_miniserver, davidson_miniserver_run )
i = pthread_create ( pthread_slave, davidson_slave_inproc_omp )
call davidson_collector(zmq_collector, zmq_socket_pull , v0, s0)
call end_zmq_to_qp_run_socket(zmq_collector)
call end_zmq_pull_socket(zmq_socket_pull)
call davidson_miniserver_end()
i = pthread_join(pthread_miniserver)
i = pthread_join(pthread_slave)
!$OMP PARALLEL DEFAULT(shared) private(i) num_threads(nproc+2)
i = omp_get_thread_num()
if (i==0) then
call davidson_collector(zmq_collector, zmq_socket_pull , v0, s0)
call end_zmq_to_qp_run_socket(zmq_collector)
call end_zmq_pull_socket(zmq_socket_pull)
call davidson_miniserver_end()
else if(i==1) then
call davidson_miniserver_run()
else
call davidson_slave_inproc(i)
endif
!$OMP END PARALLEL
call end_parallel_job(zmq_to_qp_run_socket, 'davidson')
end subroutine
integer function davidson_miniserver_run()
subroutine davidson_miniserver_run()
use f77_zmq
implicit none
integer(ZMQ_PTR) responder
@ -509,7 +457,6 @@ integer function davidson_miniserver_run()
enddo
rc = f77_zmq_close(responder)
davidson_miniserver_run = 0
end subroutine
@ -574,7 +521,11 @@ END_PROVIDER
&BEGIN_PROVIDER [ integer(bit_kind), version_, (N_int, dav_size, 2) ]
&BEGIN_PROVIDER [ integer(bit_kind), sorted_, (N_int, dav_size, 2) ]
&BEGIN_PROVIDER [ integer, sort_idx_, (dav_size, 2) ]
implicit none
&BEGIN_PROVIDER [ integer, max_blocksize ]
implicit none
call sort_dets_ab_v(dav_det, sorted_(1,1,1), sort_idx_(1,1), shortcut_(0,1), version_(1,1,1), dav_size, N_int)
call sort_dets_ba_v(dav_det, sorted_(1,1,2), sort_idx_(1,2), shortcut_(0,2), version_(1,1,2), dav_size, N_int)
max_blocksize = max(shortcut_(0,1), shortcut_(0,2))
END_PROVIDER

View File

@ -240,19 +240,21 @@ subroutine H_S2_u_0_nstates(v_0,s_0,u_0,H_jj,S2_jj,n,keys_tmp,Nint,N_st,sze_8)
touch dav_size
dav_det = psi_det
dav_ut = ut
workload = 0
blockb = shortcut(0,1)
blocke = blockb
call davidson_init(handler)
do sh=shortcut(0,1),1,-1
workload += (shortcut(sh+1,1) - shortcut(sh,1))**2
if(workload > 1000) then
if(workload > max_workload) then
blocke = sh
call davidson_add_task(handler, blocke, blockb)
blockb = sh-1
workload = 0
end if
enddo
if(blockb > 0) call davidson_add_task(handler, 1, blockb)
call davidson_run(handler, v_0, s_0)
@ -265,3 +267,9 @@ subroutine H_S2_u_0_nstates(v_0,s_0,u_0,H_jj,S2_jj,n,keys_tmp,Nint,N_st,sze_8)
enddo
end
BEGIN_PROVIDER [ integer, max_workload ]
  implicit none
  BEGIN_DOC
  ! Workload threshold used when cutting the Davidson work into tasks.
  !
  ! H_S2_u_0_nstates accumulates the squared size of each shortcut block into
  ! a running workload and adds a new task as soon as that workload exceeds
  ! max_workload. davidson_collector also uses it, together with
  ! max_blocksize, to size its receive buffers:
  ! (max_workload + max_blocksize)*2.
  END_DOC
  max_workload = 1000
END_PROVIDER