diff --git a/config/ifort.cfg b/config/ifort.cfg index b04506d4..4b1429b8 100644 --- a/config/ifort.cfg +++ b/config/ifort.cfg @@ -32,7 +32,7 @@ OPENMP : 1 ; Append OpenMP flags # [OPT] FC : -traceback -FCFLAGS : -xHost -O2 -ip -ftz -g +FCFLAGS : -xSSE4.2 -O2 -ip -ftz -g # Profiling flags ################# diff --git a/plugins/Full_CI_ZMQ/selection_davidson_slave.irp.f b/plugins/Full_CI_ZMQ/selection_davidson_slave.irp.f index 2aba32fe..3cd02620 100644 --- a/plugins/Full_CI_ZMQ/selection_davidson_slave.irp.f +++ b/plugins/Full_CI_ZMQ/selection_davidson_slave.irp.f @@ -52,7 +52,7 @@ subroutine run_wf !$OMP PARALLEL PRIVATE(i) i = omp_get_thread_num() - call selection_dressing_slave_tcp(i, energy) + call selection_slave_tcp(i, energy) !$OMP END PARALLEL print *, 'Selection done' @@ -63,11 +63,7 @@ subroutine run_wf print *, 'Davidson' call davidson_miniserver_get() - - !$OMP PARALLEL PRIVATE(i) - i = omp_get_thread_num() - call davidson_slave_tcp(i) - !$OMP END PARALLEL + call davidson_slave_tcp(0) print *, 'Davidson done' endif @@ -98,7 +94,7 @@ subroutine update_energy(energy) call write_double(6,ci_energy,'Energy') end -subroutine selection_dressing_slave_tcp(i,energy) +subroutine selection_slave_tcp(i,energy) implicit none double precision, intent(in) :: energy(N_states_diag) integer, intent(in) :: i diff --git a/plugins/Full_CI_ZMQ/selection_slave.irp.f b/plugins/Full_CI_ZMQ/selection_slave.irp.f index bc8ba76f..06bcf533 100644 --- a/plugins/Full_CI_ZMQ/selection_slave.irp.f +++ b/plugins/Full_CI_ZMQ/selection_slave.irp.f @@ -51,7 +51,7 @@ subroutine run_wf !$OMP PARALLEL PRIVATE(i) i = omp_get_thread_num() - call selection_dressing_slave_tcp(i, energy) + call selection_slave_tcp(i, energy) !$OMP END PARALLEL print *, 'Selection done' @@ -83,7 +83,7 @@ subroutine update_energy(energy) call write_double(6,ci_energy,'Energy') end -subroutine selection_dressing_slave_tcp(i,energy) +subroutine selection_slave_tcp(i,energy) implicit none double precision, intent(in) :: energy(N_states_diag) integer, intent(in) :: i diff --git a/src/Davidson/davidson_parallel.irp.f b/src/Davidson/davidson_parallel.irp.f index 33b62c6a..40b11730 100644 --- a/src/Davidson/davidson_parallel.irp.f +++ b/src/Davidson/davidson_parallel.irp.f @@ -4,27 +4,31 @@ use bitmasks use f77_zmq -subroutine davidson_process(blockb, blocke, N, idx, vt, st) - +subroutine davidson_process(blockb, blocke, vt, st) + use f77_zmq implicit none integer , intent(in) :: blockb, blocke - integer , intent(inout) :: N - integer , intent(inout) :: idx(dav_size) double precision , intent(inout) :: vt(N_states_diag, dav_size) double precision , intent(inout) :: st(N_states_diag, dav_size) integer :: i, j, sh, sh2, exa, ext, org_i, org_j, istate, ni, endi integer(bit_kind) :: sorted_i(N_int) double precision :: s2, hij - logical, allocatable :: wrotten(:) + integer, external :: omp_get_thread_num - allocate(wrotten(dav_size)) - wrotten = .false. + provide dav_det dav_ut shortcut_ + !useless calls not to provide in the parallel section + call i_h_j (dav_det(1,1,1),dav_det(1,1,dav_size),n_int,hij) + call get_s2(dav_det(1,1,1),dav_det(1,1,dav_size),n_int,s2) + !!!!! do sh = blockb, blocke - do sh2=1,sh + !$OMP PARALLEL DO default(none) schedule(dynamic) & + !$OMP shared(vt, st, blockb, blocke, sh, shortcut_, version_, sorted_, sort_idx_, dav_det, dav_ut, N_int, N_states_diag) & + !$OMP private(exa, ni, ext, org_i, org_j, sorted_i, endi, hij, s2) + do sh2=1,sh exa = 0 do ni=1,N_int exa = exa + popcnt(xor(version_(ni,sh,1), version_(ni,sh2,1))) @@ -53,39 +57,51 @@ subroutine davidson_process(blockb, blocke, N, idx, vt, st) if(ext <= 4) then call i_h_j (dav_det(1,1,org_j),dav_det(1,1,org_i),n_int,hij) call get_s2(dav_det(1,1,org_j),dav_det(1,1,org_i),n_int,s2) - if(.not. wrotten(org_i)) then - wrotten(org_i) = .true. - vt (:,org_i) = 0d0 - st (:,org_i) = 0d0 - end if - if(.not. wrotten(org_j)) then - wrotten(org_j) = .true. - vt (:,org_j) = 0d0 - st (:,org_j) = 0d0 - end if + !$OMP CRITICAL do istate=1,N_states_diag vt(istate,org_i) = vt(istate,org_i) + hij*dav_ut(istate,org_j) st(istate,org_i) = st(istate,org_i) + s2 *dav_ut(istate,org_j) vt(istate,org_j) = vt(istate,org_j) + hij*dav_ut(istate,org_i) st(istate,org_j) = st(istate,org_j) + s2 *dav_ut(istate,org_i) enddo + !$OMP END CRITICAL endif enddo enddo enddo + !$OMP END PARALLEL DO + enddo + + do sh=blockb,min(blocke, shortcut_(0,2)) + !$OMP PARALLEL DO default(none) schedule(dynamic) & + !$OMP shared(vt, st, blockb, blocke, sh, shortcut_, version_, sorted_, sort_idx_, dav_det, dav_ut, N_int, N_states_diag) & + !$OMP private(exa, ni, ext, org_i, org_j, sorted_i, endi, hij, s2) + do sh2=sh, shortcut_(0,2), shortcut_(0,1) + do i=shortcut_(sh2,2),shortcut_(sh2+1,2)-1 + org_i = sort_idx_(i,2) + do j=shortcut_(sh2,2),i-1 + org_j = sort_idx_(j,2) + ext = 0 + do ni=1,N_int + ext = ext + popcnt(xor(sorted_(ni,i,2), sorted_(ni,j,2))) + end do + if(ext == 4) then + call i_h_j (dav_det(1,1,org_j),dav_det(1,1,org_i),n_int,hij) + call get_s2(dav_det(1,1,org_j),dav_det(1,1,org_i),n_int,s2) + !$OMP CRITICAL + do istate=1,N_states_diag + vt (istate,org_i) = vt (istate,org_i) + hij*dav_ut(istate,org_j) + vt (istate,org_j) = vt (istate,org_j) + hij*dav_ut(istate,org_i) + st (istate,org_i) = st (istate,org_i) + s2*dav_ut(istate,org_j) + st (istate,org_j) = st (istate,org_j) + s2*dav_ut(istate,org_i) + enddo + !$OMP END CRITICAL + end if + end do + end do + enddo + !$OMP END PARALLEL DO enddo - - N = 0 - do i=1, dav_size - if(wrotten(i)) then - N = N+1 - do istate=1,N_states_diag - vt (istate,N) = vt (istate,i) - st (istate,N) = st (istate,i) - idx(N) = i - enddo - end if - end do end subroutine @@ -171,6 +187,12 @@ subroutine davidson_slave_inproc(i) call davidson_run_slave(1,i) end +integer function davidson_slave_inproc_omp() + implicit none + + call davidson_run_slave(1,2) + davidson_slave_inproc_omp = 0 +end subroutine subroutine davidson_slave_tcp(i) implicit none @@ -223,7 +245,8 @@ subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, worker_id) integer(ZMQ_PTR),intent(in) :: zmq_to_qp_run_socket integer(ZMQ_PTR),intent(in) :: zmq_socket_push integer,intent(in) :: worker_id - integer :: task_id + integer :: i, taskn, myTask, istate + integer, allocatable :: task_id(:) character*(512) :: task @@ -233,32 +256,59 @@ subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, worker_id) double precision , allocatable :: vt(:,:) double precision , allocatable :: st(:,:) - + allocate(task_id(100)) allocate(idx(dav_size)) allocate(vt(N_states_diag, dav_size)) allocate(st(N_states_diag, dav_size)) + vt = 0d0 + st = 0d0 + taskn = 0 do - call get_task_from_taskserver(zmq_to_qp_run_socket,worker_id, task_id, task) - if(task_id == 0) exit - read (task,*) blockb, blocke + call get_task_from_taskserver(zmq_to_qp_run_socket,worker_id, myTask, task) + if(myTask /= 0) then + read (task,*) blockb, blocke + call davidson_process(blockb, blocke, vt, st) + taskn += 1 + task_id(taskn) = myTask + end if - call davidson_process(blockb, blocke, N, idx, vt, st) - call task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id) - call davidson_push_results(zmq_socket_push, blockb, blocke, N, idx, vt, st, task_id) + if(myTask == 0 .or. taskn == size(task_id)) then + N = 0 + do i=1, dav_size + if(vt(1,i) /= 0d0 .or. st(1,i) /= 0d0) then + N = N+1 + do istate=1,N_states_diag + vt (istate,N) = vt (istate,i) + st (istate,N) = st (istate,i) + idx(N) = i + enddo + end if + end do + + do i = 1, taskn + call task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id(i)) + end do + if(taskn /= 0) call davidson_push_results(zmq_socket_push, blockb, blocke, N, idx, vt, st, taskn, task_id) + + if(myTask == 0) exit + vt = 0d0 + st = 0d0 + taskn = 0 + end if end do end subroutine -subroutine davidson_push_results(zmq_socket_push, blockb, blocke, N, idx, vt, st, task_id) +subroutine davidson_push_results(zmq_socket_push, blockb, blocke, N, idx, vt, st, taskn, task_id) use f77_zmq implicit none integer(ZMQ_PTR) ,intent(in) :: zmq_socket_push - integer ,intent(in) :: task_id + integer ,intent(in) :: task_id(100), taskn integer ,intent(in) :: blockb, blocke integer ,intent(in) :: N @@ -285,18 +335,21 @@ subroutine davidson_push_results(zmq_socket_push, blockb, blocke, N, idx, vt, st rc = f77_zmq_send( zmq_socket_push, st, 8*N_states_diag* N, ZMQ_SNDMORE) if(rc /= 8*N_states_diag* N) stop "davidson_push_results failed to push st" - rc = f77_zmq_send( zmq_socket_push, task_id, 4, 0) - if(rc /= 4) stop "davidson_push_results failed to push task_id" + rc = f77_zmq_send( zmq_socket_push, taskn, 4, ZMQ_SNDMORE) + if(rc /= 4) stop "davidson_push_results failed to push taskn" + + rc = f77_zmq_send( zmq_socket_push, task_id, 4*taskn, 0) + if(rc /= 4*taskn) stop "davidson_push_results failed to push task_id" end subroutine -subroutine davidson_pull_results(zmq_socket_pull, blockb, blocke, N, idx, vt, st, task_id) +subroutine davidson_pull_results(zmq_socket_pull, blockb, blocke, N, idx, vt, st, taskn, task_id) use f77_zmq implicit none integer(ZMQ_PTR) ,intent(in) :: zmq_socket_pull - integer ,intent(out) :: task_id + integer ,intent(out) :: task_id(100), taskn integer ,intent(out) :: blockb, blocke integer ,intent(out) :: N integer ,intent(out) :: idx(dav_size) @@ -323,8 +376,11 @@ subroutine davidson_pull_results(zmq_socket_pull, blockb, blocke, N, idx, vt, st rc = f77_zmq_recv( zmq_socket_pull, st, 8*N_states_diag* N, 0) if(rc /= 8*N_states_diag* N) stop "davidson_push_results failed to pull st" - rc = f77_zmq_recv( zmq_socket_pull, task_id, 4, 0) - if(rc /= 4) stop "davidson_pull_results failed to pull task_id" + rc = f77_zmq_recv( zmq_socket_pull, taskn, 4, 0) + if(rc /= 4) stop "davidson_pull_results failed to pull taskn" + + rc = f77_zmq_recv( zmq_socket_pull, task_id, 4*taskn, 0) + if(rc /= 4*taskn) stop "davidson_pull_results failed to pull task_id" end subroutine @@ -340,8 +396,7 @@ subroutine davidson_collector(zmq_to_qp_run_socket, zmq_socket_pull , v0, s0, LD double precision ,intent(inout) :: v0(LDA, N_states_diag) double precision ,intent(inout) :: s0(LDA, N_states_diag) - integer :: more, task_id - + integer :: more, task_id(100), taskn integer :: blockb, blocke integer :: N @@ -349,6 +404,8 @@ subroutine davidson_collector(zmq_to_qp_run_socket, zmq_socket_pull , v0, s0, LD double precision , allocatable :: vt(:,:), v0t(:,:), s0t(:,:) double precision , allocatable :: st(:,:) + integer :: i + allocate(idx(dav_size)) allocate(vt(N_states_diag, dav_size)) allocate(st(N_states_diag, dav_size)) @@ -361,10 +418,13 @@ subroutine davidson_collector(zmq_to_qp_run_socket, zmq_socket_pull , v0, s0, LD more = 1 do while (more == 1) - call davidson_pull_results(zmq_socket_pull, blockb, blocke, N, idx, vt, st, task_id) + call davidson_pull_results(zmq_socket_pull, blockb, blocke, N, idx, vt, st, taskn, task_id) + !DIR$ FORCEINLINE call davidson_collect(blockb, blocke, N, idx, vt, st , v0t, s0t) - call zmq_delete_task(zmq_to_qp_run_socket,zmq_socket_pull,task_id,more) + do i=1,taskn + call zmq_delete_task(zmq_to_qp_run_socket,zmq_socket_pull,task_id(i),more) + end do end do deallocate(idx,vt,st) @@ -384,41 +444,39 @@ subroutine davidson_run(zmq_to_qp_run_socket , v0, s0, LDA) integer(ZMQ_PTR) :: zmq_collector integer(ZMQ_PTR), external :: new_zmq_pull_socket integer(ZMQ_PTR) :: zmq_socket_pull - + integer(ZMQ_PTR) :: pthread_slave, pthread_miniserver + + integer :: i integer, external :: omp_get_thread_num double precision , intent(inout) :: v0(LDA, N_states_diag) double precision , intent(inout) :: s0(LDA, N_states_diag) + integer, external :: davidson_miniserver_run, davidson_slave_inproc_omp call zmq_set_running(zmq_to_qp_run_socket) zmq_collector = new_zmq_to_qp_run_socket() zmq_socket_pull = new_zmq_pull_socket() - i = omp_get_thread_num() PROVIDE nproc - !$OMP PARALLEL DEFAULT(shared) private(i) num_threads(nproc+2) - i = omp_get_thread_num() - if (i==0) then - call davidson_collector(zmq_collector, zmq_socket_pull , v0, s0, size(v0,1)) - call end_zmq_to_qp_run_socket(zmq_collector) - call end_zmq_pull_socket(zmq_socket_pull) - call davidson_miniserver_end() - else if(i==1) then - call davidson_miniserver_run() - else - call davidson_slave_inproc(i) - endif - !$OMP END PARALLEL + i = pthread_create ( pthread_miniserver, davidson_miniserver_run ) + i = pthread_create ( pthread_slave, davidson_slave_inproc_omp ) + + call davidson_collector(zmq_collector, zmq_socket_pull , v0, s0) + call end_zmq_to_qp_run_socket(zmq_collector) + call end_zmq_pull_socket(zmq_socket_pull) + call davidson_miniserver_end() + i = pthread_join(pthread_miniserver) + i = pthread_join(pthread_slave) + call end_parallel_job(zmq_to_qp_run_socket, 'davidson') end subroutine - -subroutine davidson_miniserver_run() +integer function davidson_miniserver_run() use f77_zmq implicit none integer(ZMQ_PTR) responder @@ -445,6 +503,7 @@ subroutine davidson_miniserver_run() enddo rc = f77_zmq_close(responder) + davidson_miniserver_run = 0 end subroutine diff --git a/src/Davidson/davidson_slave.irp.f b/src/Davidson/davidson_slave.irp.f index b5ec0592..9195e46f 100644 --- a/src/Davidson/davidson_slave.irp.f +++ b/src/Davidson/davidson_slave.irp.f @@ -20,21 +20,20 @@ program davidson_slave do call wait_for_state(zmq_state,state) if(trim(state) /= "davidson") exit - !print *, 'Getting wave function' - !call zmq_get_psi(zmq_to_qp_run_socket,1,energy,N_states_diag) call davidson_miniserver_get() integer :: rc, i print *, 'Davidson slave running' - !$OMP PARALLEL PRIVATE(i) - i = omp_get_thread_num() - call davidson_slave_tcp(i) - !$OMP END PARALLEL + ! !$OMP PARALLEL PRIVATE(i) + !i = omp_get_thread_num() + call davidson_slave_tcp(0) + !!$OMP END PARALLEL end do end subroutine provide_everything PROVIDE mo_bielec_integrals_in_map psi_det_sorted_bit N_states_diag zmq_context end subroutine + diff --git a/src/Davidson/u0Hu0.irp.f b/src/Davidson/u0Hu0.irp.f index 13ba2b46..69b6c354 100644 --- a/src/Davidson/u0Hu0.irp.f +++ b/src/Davidson/u0Hu0.irp.f @@ -195,7 +195,6 @@ subroutine H_S2_u_0_nstates(v_0,s_0,u_0,H_jj,S2_jj,n,keys_tmp,Nint,N_st,sze_8) double precision, intent(in) :: H_jj(n), S2_jj(n) integer(bit_kind),intent(in) :: keys_tmp(Nint,2,n) double precision :: hij,s2 - double precision, allocatable :: vt(:,:), st(:,:) double precision, allocatable :: ut(:,:) integer :: i,j,k,l, jj,ii integer :: i0, j0 @@ -209,7 +208,6 @@ subroutine H_S2_u_0_nstates(v_0,s_0,u_0,H_jj,S2_jj,n,keys_tmp,Nint,N_st,sze_8) integer, external :: align_double integer :: workload, blockb, blocke -! !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: vt, ut integer(ZMQ_PTR) :: handler @@ -232,8 +230,8 @@ subroutine H_S2_u_0_nstates(v_0,s_0,u_0,H_jj,S2_jj,n,keys_tmp,Nint,N_st,sze_8) ut(istate,i) = u_0(i,istate) enddo enddo - call sort_dets_ab_v(keys_tmp, sorted(1,1,1), sort_idx(1,1), shortcut(0,1), version(1,1,1), n, Nint) - call sort_dets_ba_v(keys_tmp, sorted(1,1,2), sort_idx(1,2), shortcut(0,2), version(1,1,2), n, Nint) + call sort_dets_ab_v(keys_tmp, sorted(1,1,1), sort_idx(1,1), shortcut(0,1), version(1,1,1), n, Nint) + call sort_dets_ba_v(keys_tmp, sorted(1,1,2), sort_idx(1,2), shortcut(0,2), version(1,1,2), n, Nint) workload = 0 blockb = shortcut(0,1) @@ -241,69 +239,21 @@ subroutine H_S2_u_0_nstates(v_0,s_0,u_0,H_jj,S2_jj,n,keys_tmp,Nint,N_st,sze_8) call davidson_init(handler,n,N_st_8,ut) do sh=shortcut(0,1),1,-1 workload += (shortcut(sh+1,1) - shortcut(sh,1))**2 - if(workload > 100000) then + if(workload > 1000) then blocke = sh call davidson_add_task(handler, blocke, blockb) blockb = sh-1 workload = 0 end if enddo - if(blockb > 0) call davidson_add_task(handler, 1, blockb) call davidson_run(handler, v_0, s_0, size(v_0,1)) - !$OMP PARALLEL DEFAULT(NONE) & - !$OMP PRIVATE(i,hij,s2,j,k,jj,vt,st,ii,sh,sh2,ni,exa,ext,org_i,org_j,endi,sorted_i,istate)& - !$OMP SHARED(n,keys_tmp,ut,Nint,v_0,s_0,sorted,shortcut,sort_idx,version,N_st,N_st_8) - - allocate(vt(N_st_8,n),st(N_st_8,n)) - Vt = 0.d0 - St = 0.d0 - - !$OMP DO SCHEDULE(dynamic) - do sh=1,shortcut(0,2) - do i=shortcut(sh,2),shortcut(sh+1,2)-1 - org_i = sort_idx(i,2) - do j=shortcut(sh,2),i-1 - org_j = sort_idx(j,2) - ext = 0 - do ni=1,Nint - ext = ext + popcnt(xor(sorted(ni,i,2), sorted(ni,j,2))) - end do - if(ext == 4) then - call i_h_j (keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,hij) - call get_s2(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,s2) - do istate=1,n_st - vt (istate,org_i) = vt (istate,org_i) + hij*ut(istate,org_j) - vt (istate,org_j) = vt (istate,org_j) + hij*ut(istate,org_i) - st (istate,org_i) = st (istate,org_i) + s2 *ut(istate,org_j) - st (istate,org_j) = st (istate,org_j) + s2 *ut(istate,org_i) - enddo - end if - end do - end do - enddo - !$OMP END DO NOWAIT - - !$OMP CRITICAL - do istate=1,N_st - do i=1,n - v_0(i,istate) = v_0(i,istate) + vt(istate,i) - s_0(i,istate) = s_0(i,istate) + st(istate,i) - enddo - enddo - !$OMP END CRITICAL - - deallocate(vt,st) - !$OMP END PARALLEL - do istate=1,N_st do i=1,n v_0(i,istate) = v_0(i,istate) + H_jj(i) * u_0(i,istate) s_0(i,istate) = s_0(i,istate) + s2_jj(i)* u_0(i,istate) enddo enddo - - deallocate (shortcut, sort_idx, sorted, version) end diff --git a/src/ZMQ/utils.irp.f b/src/ZMQ/utils.irp.f index 3b3c912d..c3a55a05 100644 --- a/src/ZMQ/utils.irp.f +++ b/src/ZMQ/utils.irp.f @@ -853,6 +853,31 @@ subroutine zmq_delete_task(zmq_to_qp_run_socket,zmq_socket_pull,task_id,more) endif end + +subroutine wait_for_next_state(state) + use f77_zmq + implicit none + + character*(64), intent(out) :: state + integer(ZMQ_PTR) :: zmq_socket_sub + integer(ZMQ_PTR), external :: new_zmq_sub_socket + integer :: rc + + zmq_socket_sub = new_zmq_sub_socket() + state = 'Waiting' + do while(state == "Waiting") + rc = f77_zmq_recv( zmq_socket_sub, state, 64, 0) + if (rc > 0) then + state = trim(state(1:rc)) + else + print *, 'Timeout reached. Stopping' + state = "Stopped" + end if + end do + call end_zmq_sub_socket(zmq_socket_sub) +end subroutine + + subroutine wait_for_state(state_wait,state) use f77_zmq implicit none