From c14fe5b99f4ae9bec09a9b78ad02040f5ba777f2 Mon Sep 17 00:00:00 2001 From: Yann Garniron Date: Tue, 1 May 2018 13:16:10 +0200 Subject: [PATCH] per checkpoint dressing communication - buggy --- plugins/dress_zmq/dress_slave.irp.f | 10 +- plugins/dress_zmq/dress_stoch_routines.irp.f | 215 +++------- plugins/dress_zmq/run_dress_slave.irp.f | 415 ++++++++++--------- 3 files changed, 277 insertions(+), 363 deletions(-) diff --git a/plugins/dress_zmq/dress_slave.irp.f b/plugins/dress_zmq/dress_slave.irp.f index ff003a21..10453d2a 100644 --- a/plugins/dress_zmq/dress_slave.irp.f +++ b/plugins/dress_zmq/dress_slave.irp.f @@ -54,12 +54,12 @@ subroutine run_wf PROVIDE psi_bilinear_matrix_transp_rows_loc psi_bilinear_matrix_transp_columns PROVIDE psi_bilinear_matrix_transp_order - !$OMP PARALLEL PRIVATE(i) - i = omp_get_thread_num() - call dress_slave_tcp(i+1, energy) - !$OMP END PARALLEL + !!$OMP PARALLEL PRIVATE(i) + !i = omp_get_thread_num() +! call dress_slave_tcp(i+1, energy) + call dress_slave_tcp(0, energy) + !!$OMP END PARALLEL print *, 'dress done' - endif end do diff --git a/plugins/dress_zmq/dress_stoch_routines.irp.f b/plugins/dress_zmq/dress_stoch_routines.irp.f index c1f64e7c..ad58aa5c 100644 --- a/plugins/dress_zmq/dress_stoch_routines.irp.f +++ b/plugins/dress_zmq/dress_stoch_routines.irp.f @@ -66,7 +66,7 @@ subroutine ZMQ_dress(E, dress, delta, delta_s2, relative_error) integer(ZMQ_PTR), external :: new_zmq_to_qp_run_socket integer :: ipos, sz - integer :: block(8), block_i, cur_tooth_reduce, ntas + integer :: block(1), block_i, cur_tooth_reduce, ntas logical :: flushme block = 0 block_i = 0 @@ -176,8 +176,7 @@ subroutine dress_collector(zmq_socket_pull, E, relative_error, delta, delta_s2, double precision, intent(out) :: delta(N_states, N_det) double precision, intent(out) :: delta_s2(N_states, N_det) - double precision, allocatable :: delta_loc(:,:,:), delta_det(:,:,:,:) - real, allocatable :: delta_loc4(:,:,:) + double precision, allocatable :: delta_loc(:,:,:) double precision, allocatable :: dress_detail(:,:) double precision :: dress_mwen(N_states) integer(ZMQ_PTR),external :: new_zmq_to_qp_run_socket @@ -189,164 +188,78 @@ subroutine dress_collector(zmq_socket_pull, E, relative_error, delta, delta_s2, integer :: i, j, k, i_state, N integer :: task_id, ind double precision, save :: time0 = -1.d0 - double precision :: time, timeLast, old_tooth + double precision :: time double precision, external :: omp_get_wtime - integer :: cur_cp, old_cur_cp - integer, allocatable :: parts_to_get(:) - logical, allocatable :: actually_computed(:) - integer :: total_computed + integer :: cur_cp integer :: delta_loc_cur, is, N_buf(3) - double precision :: fac , wei - integer, allocatable :: int_buf(:) + integer, allocatable :: int_buf(:), agreg_for_cp(:) double precision, allocatable :: double_buf(:) integer(bit_kind), allocatable :: det_buf(:,:,:) - + integer, external :: zmq_delete_tasks + + allocate(agreg_for_cp(N_cp)) + agreg_for_cp = 0 allocate(int_buf(N_dress_int_buffer), double_buf(N_dress_double_buffer), det_buf(N_int,2,N_dress_det_buffer)) delta_loc_cur = 1 delta = 0d0 delta_s2 = 0d0 - allocate(delta_det(N_states, N_det, 0:comb_teeth+1, 2)) allocate(cp(N_states, N_det, N_cp, 2), dress_detail(N_states, N_det)) allocate(delta_loc(N_states, N_det, 2)) - allocate(delta_loc4(N_states, N_det, 2)) - dress_detail = 0d0 - delta_det = 0d0 + dress_detail = -1000d0 cp = 0d0 - total_computed = 0 character*(512) :: task - - allocate(actually_computed(N_det_generators), parts_to_get(N_det_generators)) - - - parts_to_get(:) = 1 - if(fragment_first > 0) then - do i=1,fragment_first - parts_to_get(i) = fragment_count - enddo - endif - - actually_computed = .false. zmq_to_qp_run_socket = new_zmq_to_qp_run_socket() more = 1 if (time0 < 0.d0) then call wall_time(time0) endif - timeLast = time0 - cur_cp = 0 - old_cur_cp = 0 - logical :: loop, last, floop - integer, allocatable :: sparse(:) - allocate(sparse(0:N_det)) + logical :: loop, floop + integer :: finalcp + finalcp = N_cp*2 + floop = .true. loop = .true. pullLoop : do while (loop) - call pull_dress_results(zmq_socket_pull, ind, last, delta_loc, delta_loc4, int_buf, double_buf, det_buf, N_buf, task_id, sparse, dress_mwen) - call dress_pulled(ind, int_buf, double_buf, det_buf, N_buf) + call pull_dress_results(zmq_socket_pull, ind, cur_cp, delta_loc, int_buf, double_buf, det_buf, N_buf, task_id, dress_mwen) if(floop) then call wall_time(time) print *, "FIRST PULL", time-time0 floop = .false. end if - - integer, external :: zmq_delete_tasks - if(last) then + if(cur_cp == -1) then + call dress_pulled(ind, int_buf, double_buf, det_buf, N_buf) if (zmq_delete_tasks(zmq_to_qp_run_socket,zmq_socket_pull,task_id,1,more) == -1) then - stop 'Unable to delete tasks' + stop 'Unable to delete tasks' endif - if(more == 0) loop = .false. - end if - - !dress_mwen = 0d0 - - !do i_state=1,N_states - ! do i=1,sparse(0) - ! dress_mwen(i_state) += delta_loc(i_state, i, 1) * psi_coef(sparse(i), i_state) - ! end do - !end do + !if(more == 0) stop 'loop = .false.' !!!!!!!!!!!!!!!! + dress_detail(:, ind) = dress_mwen(:) + else if(cur_cp > 0) then - dress_detail(:, ind) += dress_mwen(:) - wei = dress_weight_inv(ind) - - do j=1,N_cp !! optimizable - fac = 0d0 - !fac(i) = cps(inds(i), j) / cps_N(j) * wei(i) * comb_step - fac = cps(ind, j) * wei * comb_step + if(ind == 0) cycle - if(fac /= 0) then - !$OMP PARALLEL DO SCHEDULE(STATIC) DEFAULT(SHARED) PRIVATE(i,is) - do i=1,sparse(0) - do is=1,N_states - cp(is,sparse(i),j,1) += delta_loc(is,i,1) * fac - end do - end do - !$OMP END PARALLEL DO - - !$OMP PARALLEL DO SCHEDULE(STATIC) DEFAULT(SHARED) PRIVATE(i,is) - do i=1,sparse(0) - do is=1,N_states - cp(is,sparse(i),j,2) += delta_loc(is,i,2) * fac - end do - end do - !$OMP END PARALLEL DO - end if - end do - - ! do i=1,delta_loc_cur - logical :: fracted - integer :: toothMwen - - toothMwen = tooth_of_det(ind) - fracted = (toothMwen /= 0) - if(fracted) fracted = (ind == first_det_of_teeth(toothMwen)) - - if(fracted .and. .false.) then - !$OMP PARALLEL DO SCHEDULE(STATIC) DEFAULT(SHARED) PRIVATE(i) - do i=1,sparse(0) - delta_det(1:N_states,sparse(i),toothMwen-1, 1) += delta_loc(1:N_states,i,1) * (1d0-fractage(toothMwen)) - delta_det(1:N_states,sparse(i),toothMwen-1, 2) += delta_loc(1:N_states,i,2) * (1d0-fractage(toothMwen)) - delta_det(1:N_states,sparse(i),toothMwen , 1) += delta_loc(1:N_states,i,1) * (fractage(toothMwen)) - delta_det(1:N_states,sparse(i),toothMwen , 2) += delta_loc(1:N_states,i,2) * (fractage(toothMwen)) - end do - !$OMP END PARALLEL DO - else if(.false.) then - !$OMP PARALLEL DO SCHEDULE(STATIC) DEFAULT(SHARED) PRIVATE(i) - do i=1,sparse(0) - delta_det(1:N_states,sparse(i),toothMwen , 1) = delta_loc(1:N_states,i,1) - delta_det(1:N_states,sparse(i),toothMwen , 2) = delta_loc(1:N_states,i,2) - end do - !$OMP END PARALLEL DO - end if - - parts_to_get(ind) -= 1 - if(parts_to_get(ind) == 0) then - actually_computed(ind) = .true. - total_computed += 1 - end if - !end do - - time = omp_get_wtime() - - if((time - timeLast > 5d0) .or. (.not. loop)) then - timeLast = time - cur_cp = N_cp - - do i=1,N_det_generators - if(.not. actually_computed(dress_jobs(i))) then - if(i /= 1) then - cur_cp = done_cp_at(i-1) - else - cur_cp = 0 - end if - exit - end if + !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i) + do i=1,N_det + cp(:,i,cur_cp,1) += delta_loc(:,i,1) end do - if(cur_cp == 0 .or. (cur_cp == old_cur_cp .and. total_computed /= N_det_generators)) cycle pullLoop - + + !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i) + do i=1,N_det + cp(:,i,cur_cp,2) += delta_loc(:,i,2) + end do + + agreg_for_cp(cur_cp) += ind + if(agreg_for_cp(cur_cp) > needed_by_cp(cur_cp)) then + stop "too much results..." + end if + if(agreg_for_cp(cur_cp) /= needed_by_cp(cur_cp)) cycle + + print *, "FINISHED CP", cur_cp + double precision :: su, su2, eqt, avg, E0, val integer, external :: zmq_abort @@ -359,6 +272,7 @@ subroutine dress_collector(zmq_socket_pull, E, relative_error, delta, delta_s2, su += val su2 += val*val end do + avg = su / cps_N(cur_cp) eqt = dsqrt( ((su2 / cps_N(cur_cp)) - avg*avg) / cps_N(cur_cp) ) E0 = sum(dress_detail(istate, :first_det_of_teeth(cp_first_tooth(cur_cp))-1)) @@ -366,47 +280,29 @@ subroutine dress_collector(zmq_socket_pull, E, relative_error, delta, delta_s2, E0 = E0 + dress_detail(istate, first_det_of_teeth(cp_first_tooth(cur_cp))) * (1d0-fractage(cp_first_tooth(cur_cp))) end if - call wall_time(time) - if ((dabs(eqt) < relative_error .and. cps_N(cur_cp) >= 30) .or. total_computed == N_det_generators) then + + print '(2X, F16.7, 2X, G16.3, 2X, F16.4, A20)', avg+E(istate)+E0, eqt, time-time0, '' + if ((dabs(eqt) < relative_error .and. cps_N(cur_cp) >= 30) .or. cur_cp == N_cp) then ! Termination - print '(2X, F16.7, 2X, G16.3, 2X, F16.4, A20)', avg+E(istate)+E0, eqt, time-time0, '' + print *, "TERMINATE" if (zmq_abort(zmq_to_qp_run_socket) == -1) then call sleep(1) if (zmq_abort(zmq_to_qp_run_socket) == -1) then print *, irp_here, ': Error in sending abort signal (2)' endif - endif - else - if (cur_cp > old_cur_cp) then - old_cur_cp = cur_cp - print '(2X, F16.7, 2X, G16.3, 2X, F16.4, A20)', avg+E(istate)+E0, eqt, time-time0, '' - endif + endif + !exit pullLoop endif end if end do pullLoop + print *, "exited" + + + delta(:,:) = cp(:,:,cur_cp,1) + delta_s2(:,:) = cp(:,:,cur_cp,2) - delta (1:N_states,1:N_det) = 0d0 - delta_s2(1:N_states,1:N_det) = 0d0 - if(total_computed == N_det_generators) then - do i=comb_teeth+1,0,-1 - delta (1:N_states,1:N_det) = delta (1:N_states,1:N_det) + delta_det(1:N_states,1:N_det,i,1) - delta_s2(1:N_states,1:N_det) = delta_s2(1:N_states,1:N_det) + delta_det(1:N_states,1:N_det,i,2) - end do - else - do i=1,cur_cp - delta (1:N_states,1:N_det) += cp(1:N_states,1:N_det,i,1) - delta_s2(1:N_states,1:N_det) += cp(1:N_states,1:N_det,i,2) - end do - delta (1:N_states,1:N_det) = delta(1:N_states,1:N_det) / cps_N(cur_cp) - delta_s2(1:N_states,1:N_det) = delta_s2(1:N_states,1:N_det) / cps_N(cur_cp) - do i=cp_first_tooth(cur_cp)-1,0,-1 - delta (1:N_states,1:N_det) = delta (1:N_states,1:N_det) + delta_det(1:N_states,1:N_det,i,1) - delta_s2(1:N_states,1:N_det) = delta_s2(1:N_states,1:N_det) + delta_det(1:N_states,1:N_det,i,2) - end do - - end if dress(istate) = E(istate)+E0 call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket) end subroutine @@ -458,6 +354,8 @@ END_PROVIDER &BEGIN_PROVIDER [ double precision, cps_N, (N_cps_max) ] &BEGIN_PROVIDER [ integer, cp_first_tooth, (N_cps_max) ] &BEGIN_PROVIDER [ integer, done_cp_at, (N_det_generators) ] +&BEGIN_PROVIDER [ integer, done_cp_at_det, (N_det_generators) ] +&BEGIN_PROVIDER [ integer, needed_by_cp, (0:N_cps_max) ] &BEGIN_PROVIDER [ double precision, cps, (N_det_generators, N_cps_max) ] &BEGIN_PROVIDER [ integer, N_dress_jobs ] &BEGIN_PROVIDER [ integer, dress_jobs, (N_det_generators) ] @@ -486,6 +384,8 @@ END_PROVIDER cps = 0d0 cur_cp = 1 done_cp_at = 0 + done_cp_at_det = 0 + needed_by_cp = 0 comp_filler = .false. computed = .false. cps_N = 1d0 @@ -506,6 +406,7 @@ END_PROVIDER end do l=first_det_of_comb + call random_seed(put=(/321,654,65,321,65/)) call RANDOM_NUMBER(comb) lfiller = 1 nfiller = 1 @@ -574,6 +475,8 @@ END_PROVIDER do i=1,N_dress_jobs if(done_cp_at(i) /= 0) cur_cp = done_cp_at(i) done_cp_at(i) = cur_cp + done_cp_at_det(dress_jobs(i)) = cur_cp + needed_by_cp(cur_cp) += 1 end do @@ -625,7 +528,7 @@ END_PROVIDER end do do i=1,N_cp-1 - call isort(dress_jobs(first_cp(i)+1:first_cp(i+1)),iorder,first_cp(i+1)-first_cp(i)) + call isort(dress_jobs(first_cp(i)+1),iorder,first_cp(i+1)-first_cp(i)-1) end do do i=1,N_det_generators diff --git a/plugins/dress_zmq/run_dress_slave.irp.f b/plugins/dress_zmq/run_dress_slave.irp.f index 9b4a3863..248b7d34 100644 --- a/plugins/dress_zmq/run_dress_slave.irp.f +++ b/plugins/dress_zmq/run_dress_slave.irp.f @@ -9,13 +9,13 @@ BEGIN_PROVIDER [ integer, fragment_count ] END_PROVIDER -subroutine run_dress_slave(thread,iproc,energy) +subroutine run_dress_slave(thread,iproce,energy) use f77_zmq implicit none double precision, intent(in) :: energy(N_states_diag) - integer, intent(in) :: thread, iproc - integer :: rc, i, subset, i_generator(60) + integer, intent(in) :: thread, iproce + integer :: rc, i, subset, i_generator integer :: worker_id, task_id, ctask, ltask character*(5120) :: task @@ -41,13 +41,24 @@ subroutine run_dress_slave(thread,iproc,energy) integer(bit_kind), allocatable :: det_buf(:,:,:) integer :: N_buf(3) logical :: last + integer, external :: omp_get_thread_num + double precision, allocatable :: delta_det(:,:,:,:), cp(:,:,:,:) + integer :: toothMwen + logical :: fracted + double precision :: fac + + + + if(iproce /= 0) stop "RUN DRESS SLAVE is OMP" + + allocate(delta_det(N_states, N_det, 0:comb_teeth+1, 2)) + allocate(cp(N_states, N_det, N_cp, 2)) + delta_det = 0d9 + cp = 0d0 + task(:) = CHAR(0) - allocate(int_buf(N_dress_int_buffer)) - allocate(double_buf(N_dress_double_buffer)) - allocate(det_buf(N_int, 2, N_dress_det_buffer)) - allocate(delta_ij_loc(N_states,N_det,2)) zmq_to_qp_run_socket = new_zmq_to_qp_run_socket() zmq_socket_push = new_zmq_push_socket(thread) @@ -61,48 +72,139 @@ subroutine run_dress_slave(thread,iproc,energy) do i=1,N_states div(i) = psi_coef(dressed_column_idx(i), i) end do - do - call get_task_from_taskserver(zmq_to_qp_run_socket,worker_id, task_id, task) - if(task_id /= 0) then - task = trim(task)//' 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0' + + integer :: iproc, cur_cp, done_for(0:N_cp) + integer, allocatable :: tasks(:) + logical :: loop, donedone + integer :: res_task(Nproc), res_gen(Nproc), res_sub(Nproc) + res_gen = 0 + + donedone = .false. + allocate(tasks(0:N_det)) + done_for = 0 + + do cur_cp=0, N_cp + if(donedone) exit + print *, "DOING CP", cur_cp + tasks(0) = 0 + !$OMP PARALLEL DEFAULT(SHARED) & + !$OMP PRIVATE(int_buf, double_buf, det_buf, delta_ij_loc, task, task_id) & + !$OMP PRIVATE(toothMwen, fracted, fac) & + !$OMP PRIVATE(loop, i_generator, subset, iproc, N_buf) + iproc = omp_get_thread_num()+1 + loop = .true. + allocate(int_buf(N_dress_int_buffer)) + allocate(double_buf(N_dress_double_buffer)) + allocate(det_buf(N_int, 2, N_dress_det_buffer)) + allocate(delta_ij_loc(N_states,N_det,2)) + do while(loop) + if(res_gen(iproc) == 0) then + !$OMP CRITICAL + call get_task_from_taskserver(zmq_to_qp_run_socket,worker_id, task_id, task) + !$OMP END CRITICAL + task = task//" 0" + if(task_id == 0) then + donedone = .true. + print *, "DONEDONE" + exit !! LAST MESSAGE ??? + end if + read (task,*) subset, i_generator + else + subset = res_sub(iproc) + i_generator = res_gen(iproc) + task_id = res_task(iproc) + res_gen(iproc) = 0 + end if - i_generator = 0 - read (task,*) subset, i_generator - if(i_generator(size(i_generator)) /= 0) stop "i_generator buffer too small" - delta_ij_loc = 0d0 - i=1 - do while(i_generator(i) /= 0) - call generator_start(i_generator(i), iproc) - call alpha_callback(delta_ij_loc, i_generator(i), subset, iproc) - call generator_done(i_generator(i), int_buf, double_buf, det_buf, N_buf, iproc) - last = (i_generator(i+1) == 0) - call push_dress_results(zmq_socket_push, i_generator(i), last, delta_ij_loc, int_buf, double_buf, det_buf, N_buf, task_id) - i += 1 + !if(done_cp_at_det(i_generator) > cur_cp) loop = .false. + if(done_cp_at_det(i_generator) > cur_cp) then + res_gen(iproc) = i_generator + res_task(iproc) = task_id + res_sub(iproc) = subset + exit + end if + + !$OMP ATOMIC + done_for(done_cp_at_det(i_generator)) += 1 + + delta_ij_loc(:,:,:) = 0d0 + call generator_start(i_generator, iproc) + call alpha_callback(delta_ij_loc, i_generator, subset, iproc) + call generator_done(i_generator, int_buf, double_buf, det_buf, N_buf, iproc) + + !if(.false.) then + !$OMP CRITICAL + do i=1,N_cp + fac = cps(i_generator, i) * dress_weight_inv(i_generator) * comb_step + if(fac == 0d0) cycle + cp(:,:,i,1) += (delta_ij_loc(:,:,1) * fac) + cp(:,:,i,2) += (delta_ij_loc(:,:,2) * fac) end do + + + toothMwen = tooth_of_det(i_generator) + fracted = (toothMwen /= 0) + if(fracted) fracted = (i_generator == first_det_of_teeth(toothMwen)) + if(fracted) then + delta_det(:,:,toothMwen-1, 1) += delta_ij_loc(:,:,1) * (1d0-fractage(toothMwen)) + delta_det(:,:,toothMwen-1, 2) += delta_ij_loc(:,:,2) * (1d0-fractage(toothMwen)) + delta_det(:,:,toothMwen , 1) += delta_ij_loc(:,:,1) * (fractage(toothMwen)) + delta_det(:,:,toothMwen , 2) += delta_ij_loc(:,:,2) * (fractage(toothMwen)) + else + delta_det(:,:,toothMwen , 1) += delta_ij_loc(:,:,1) + delta_det(:,:,toothMwen , 2) += delta_ij_loc(:,:,2) + end if + + + !$OMP END CRITICAL + !end if + + !$OMP CRITICAL + call push_dress_results(zmq_socket_push, i_generator, -1, delta_ij_loc, int_buf, double_buf, det_buf, N_buf, task_id) call task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id) - else - exit + !$OMP END CRITICAL + tasks(0) += 1 + tasks(tasks(0)) = task_id + + end do + print *, "SLAVE", iproc, "waits" + deallocate(int_buf,double_buf,det_buf,delta_ij_loc) + !$OMP END PARALLEL + + allocate(delta_ij_loc(N_states,N_det,2)) + allocate(int_buf(1), double_buf(1), det_buf(1,1,1)) + N_buf = (/0,1,0/) + + delta_ij_loc = 0d0 + + if(cur_cp > 0) then + do i=1,cur_cp + delta_ij_loc(:,:,:) += cp(:,:,i,:) + !delta_s2(:,:) += cp(:,:,i,2) + end do + + delta_ij_loc(:,:,:) = delta_ij_loc(:,:,:) / cps_N(cur_cp) + do i=cp_first_tooth(cur_cp)-1,0,-1 + delta_ij_loc(:,:,:) = delta_ij_loc(:,:,:) +delta_det(:,:,i,:) + end do end if + call sleep(1) + call push_dress_results(zmq_socket_push, done_for(cur_cp), cur_cp, delta_ij_loc, int_buf, double_buf, det_buf, N_buf, -1) + !do i=1,tasks(0) + ! call task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,tasks(i)) + !end do + deallocate(delta_ij_loc, int_buf, double_buf, det_buf) end do + + call sleep(10) call disconnect_from_taskserver(zmq_to_qp_run_socket,zmq_socket_push,worker_id) call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket) call end_zmq_push_socket(zmq_socket_push,thread) end subroutine -! BEGIN_PROVIDER [ integer, dress_int_buffer, (N_dress_int_buffer) ] -!&BEGIN_PROVIDER [ double precision, dress_double_buffer, (N_dress_double_buffer) ] -!&BEGIN_PROVIDER [ integer(bit_kind), dress_det_buffer, (N_int, 2, N_dress_det_buffer) ] -! implicit none -! -! dress_int_buffer = 0 -! dress_double_buffer = 0d0 - ! dress_det_buffer = 0_bit_kind -!END_PROVIDER - -!subroutine pull_dress_results(zmq_socket_pull, ind, delta_loc, int_buf, double_buf, det_buf, N_buf, task_id, felem) -subroutine push_dress_results(zmq_socket_push, ind, last, delta_loc, int_buf, double_buf, det_buf, N_bufi, task_id) +subroutine push_dress_results(zmq_socket_push, ind, cur_cp, delta_loc, int_buf, double_buf, det_buf, N_bufi, task_id) use f77_zmq implicit none @@ -110,135 +212,68 @@ subroutine push_dress_results(zmq_socket_push, ind, last, delta_loc, int_buf, do double precision, intent(inout) :: delta_loc(N_states, N_det, 2) real(kind=4), allocatable :: delta_loc4(:,:,:) double precision, intent(in) :: double_buf(*) - logical, intent(in) :: last integer, intent(in) :: int_buf(*) integer(bit_kind), intent(in) :: det_buf(N_int, 2, *) integer, intent(in) :: N_bufi(3) integer :: N_buf(3) - integer, intent(in) :: ind, task_id + integer, intent(in) :: ind, cur_cp, task_id integer :: rc, i, j, k, l - double precision :: tmp(N_states,2) - integer, allocatable :: sparse(:) - integer :: sparsei double precision :: contrib(N_states) - contrib = 0d0 - allocate(sparse(N_det)) - allocate(delta_loc4(N_states, N_det, 2)) + rc = f77_zmq_send( zmq_socket_push, ind, 4, ZMQ_SNDMORE) if(rc /= 4) stop "push" - rc = f77_zmq_send( zmq_socket_push, last, 1, ZMQ_SNDMORE) - if(rc /= 1) stop "push" + rc = f77_zmq_send( zmq_socket_push, cur_cp, 4, ZMQ_SNDMORE) + if(rc /= 4) stop "push" - if(last) then - sparsei = 0 + if(cur_cp /= -1) then + rc = f77_zmq_send( zmq_socket_push, delta_loc(1,1,1), 8*N_states*N_det, ZMQ_SNDMORE) + if(rc /= 8*N_states*N_det) stop "push" + + rc = f77_zmq_send( zmq_socket_push, delta_loc(1,1,2), 8*N_states*N_det, ZMQ_SNDMORE) + if(rc /= 8*N_states*N_det) stop "push" + else + contrib = 0d0 + do i=1,N_det - do j=1,N_states - if(delta_loc(j,i,1) /= 0d0 .or. delta_loc(j,i,2) /= 0d0) then - sparsei += 1 - sparse(sparsei) = i - do k=1,2 - do l=1,N_states - delta_loc4(l,sparsei,k) = real(delta_loc(l,i,k), kind=4) - end do - end do - contrib(:) += delta_loc(:,i, 1) * psi_coef(i, :) - end if - end do + contrib(:) += delta_loc(:,i, 1) * psi_coef(i, :) end do - - if(sparsei /= 0) then - if(sparsei < N_det / 2) then - rc = f77_zmq_send( zmq_socket_push, sparsei, 4, ZMQ_SNDMORE) - if(rc /= 4) stop "push" + rc = f77_zmq_send( zmq_socket_push, contrib, 8*N_states, ZMQ_SNDMORE) + if(rc /= 8*N_states) stop "push" - rc = f77_zmq_send( zmq_socket_push, contrib, 8*N_states, ZMQ_SNDMORE) - if(rc /= 8*N_states) stop "push" - - rc = f77_zmq_send( zmq_socket_push, sparse, 4*sparsei, ZMQ_SNDMORE) - if(rc /= 4*sparsei) stop "push" + N_buf = N_bufi + N_buf = (/0,1,0/) - - rc = f77_zmq_send( zmq_socket_push, delta_loc4(1,1,1), 4*N_states*sparsei, ZMQ_SNDMORE) - if(rc /= 4*N_states*sparsei) stop "push" - - rc = f77_zmq_send( zmq_socket_push, delta_loc4(1,1,2), 4*N_states*sparsei, ZMQ_SNDMORE) - if(rc /= 4*N_states*sparsei) stop "push" - else - rc = f77_zmq_send( zmq_socket_push, -1, 4, ZMQ_SNDMORE) - if(rc /= 4) stop "push" + rc = f77_zmq_send( zmq_socket_push, N_buf, 4*3, ZMQ_SNDMORE) + if(rc /= 4*3) stop "push5" - rc = f77_zmq_send( zmq_socket_push, contrib, 8*N_states, ZMQ_SNDMORE) - if(rc /= 8*N_states) stop "push" - - do i=1,N_det - sparse(i) = i - do k=1,2 - do l=1,N_states - delta_loc4(l,i,k) = real(delta_loc(l,i,k), kind=4) - end do - end do - end do + if(N_buf(1) > N_dress_int_buffer) stop "run_dress_slave N_buf bad size?" + if(N_buf(2) > N_dress_double_buffer) stop "run_dress_slave N_buf bad size?" + if(N_buf(3) > N_dress_det_buffer) stop "run_dress_slave N_buf bad size?" - !rc = f77_zmq_send( zmq_socket_push, sparse, 4*sparsei, ZMQ_SNDMORE) - !if(rc /= 4*sparsei) stop "push" - - - rc = f77_zmq_send( zmq_socket_push, delta_loc4(1,1,1), 4*N_states*N_det, ZMQ_SNDMORE) - if(rc /= 4*N_states*N_det) stop "push" - - rc = f77_zmq_send( zmq_socket_push, delta_loc4(1,1,2), 4*N_states*N_det, ZMQ_SNDMORE) - if(rc /= 4*N_states*N_det) stop "push" - end if - else - rc = f77_zmq_send( zmq_socket_push, 0, 4, ZMQ_SNDMORE) - if(rc /= 4) stop "push" + + if(N_buf(1) > 0) then + rc = f77_zmq_send( zmq_socket_push, int_buf, 4*N_buf(1), ZMQ_SNDMORE) + if(rc /= 4*N_buf(1)) stop "push6" + end if + + if(N_buf(2) > 0) then + rc = f77_zmq_send( zmq_socket_push, double_buf, 8*N_buf(2), ZMQ_SNDMORE) + if(rc /= 8*N_buf(2)) stop "push8" end if - - else - rc = f77_zmq_send( zmq_socket_push, 0, 4, ZMQ_SNDMORE) - if(rc /= 4) stop "push" - - !rc = f77_zmq_send( zmq_socket_push, vast_emptiness, 8*N_states, ZMQ_SNDMORE) - !if(rc /= 8*N_states) stop "push" + if(N_buf(3) > 0) then + rc = f77_zmq_send( zmq_socket_push, det_buf, 2*N_int*bit_kind*N_buf(3), ZMQ_SNDMORE) + if(rc /= 2*N_int*bit_kind*N_buf(3)) stop "push10" + end if - !rc = f77_zmq_send( zmq_socket_push, vast_emptiness, 8*N_states, ZMQ_SNDMORE) - !if(rc /= 8*N_states) stop "push" + rc = f77_zmq_send( zmq_socket_push, task_id, 4, 0) + if(rc /= 4) stop "push11" end if - - N_buf = N_bufi - !N_buf = (/0, 1, 0/) - - rc = f77_zmq_send( zmq_socket_push, N_buf, 4*3, ZMQ_SNDMORE) - if(rc /= 4*3) stop "push5" - - if(N_buf(1) > N_dress_int_buffer) stop "run_dress_slave N_buf bad size?" - if(N_buf(2) > N_dress_double_buffer) stop "run_dress_slave N_buf bad size?" - if(N_buf(3) > N_dress_det_buffer) stop "run_dress_slave N_buf bad size?" - - - if(N_buf(1) > 0) then - rc = f77_zmq_send( zmq_socket_push, int_buf, 4*N_buf(1), ZMQ_SNDMORE) - if(rc /= 4*N_buf(1)) stop "push6" - end if - - if(N_buf(2) > 0) then - rc = f77_zmq_send( zmq_socket_push, double_buf, 8*N_buf(2), ZMQ_SNDMORE) - if(rc /= 8*N_buf(2)) stop "push8" - end if - - if(N_buf(3) > 0) then - rc = f77_zmq_send( zmq_socket_push, det_buf, 2*N_int*bit_kind*N_buf(3), ZMQ_SNDMORE) - if(rc /= 2*N_int*bit_kind*N_buf(3)) stop "push10" - end if - - rc = f77_zmq_send( zmq_socket_push, task_id, 4, 0) - if(rc /= 4) stop "push11" ! Activate is zmq_socket_push is a REQ IRP_IF ZMQ_PUSH @@ -250,90 +285,66 @@ IRP_ENDIF end subroutine -subroutine pull_dress_results(zmq_socket_pull, ind, last, delta_loc, delta_loc4, int_buf, double_buf, det_buf, N_buf, task_id, sparse, contrib) +subroutine pull_dress_results(zmq_socket_pull, ind, cur_cp, delta_loc, int_buf, double_buf, det_buf, N_buf, task_id, contrib) use f77_zmq implicit none integer(ZMQ_PTR), intent(in) :: zmq_socket_pull - logical, intent(out) :: last + integer, intent(out) :: cur_cp double precision, intent(inout) :: delta_loc(N_states, N_det, 2) double precision, intent(out) :: double_buf(*), contrib(N_states) integer, intent(out) :: int_buf(*) integer(bit_kind), intent(out) :: det_buf(N_int, 2, *) - integer, intent(out) :: sparse(0:N_det) integer, intent(out) :: ind integer, intent(out) :: task_id - integer :: rc, i, j, k, sparsen + integer :: rc, i, j, k integer, intent(out) :: N_buf(3) - real(kind=4), intent(out) :: delta_loc4(N_states, N_det, 2) rc = f77_zmq_recv( zmq_socket_pull, ind, 4, 0) if(rc /= 4) stop "pulla" - rc = f77_zmq_recv( zmq_socket_pull, last, 1, 0) - if(rc /= 1) stop "pulla" + rc = f77_zmq_recv( zmq_socket_pull, cur_cp, 4, 0) + if(rc /= 4) stop "pulla" - rc = f77_zmq_recv( zmq_socket_pull, sparse(0), 4, 0) - if(rc /= 4) stop "pullb" - if(sparse(0) /= 0) then + + + if(cur_cp /= -1) then + rc = f77_zmq_recv( zmq_socket_pull, delta_loc(1,1,1), N_states*8*N_det, 0) + if(rc /= 8*N_states*N_det) stop "pullc" + + rc = f77_zmq_recv( zmq_socket_pull, delta_loc(1,1,2), N_states*8*N_det, 0) + if(rc /= 8*N_states*N_det) stop "pulld" + else rc = f77_zmq_recv( zmq_socket_pull, contrib, 8*N_states, 0) if(rc /= 8*N_states) stop "pullc" - if(sparse(0) == -1) then - do i=1,N_det - sparse(i) = i - end do - sparse(0) = N_det - else - rc = f77_zmq_recv( zmq_socket_pull, sparse(1), 4*sparse(0), 0) - if(rc /= 4*sparse(0)) stop "pullc" + rc = f77_zmq_recv( zmq_socket_pull, N_buf, 4*3, 0) + if(rc /= 4*3) stop "pull" + if(N_buf(1) > N_dress_int_buffer) stop "run_dress_slave N_buf bad size?" + if(N_buf(2) > N_dress_double_buffer) stop "run_dress_slave N_buf bad size?" + if(N_buf(3) > N_dress_det_buffer) stop "run_dress_slave N_buf bad size?" + + + if(N_buf(1) > 0) then + rc = f77_zmq_recv( zmq_socket_pull, int_buf, 4*N_buf(1), 0) + if(rc /= 4*N_buf(1)) stop "pull1" + end if + + if(N_buf(2) > 0) then + rc = f77_zmq_recv( zmq_socket_pull, double_buf, 8*N_buf(2), 0) + if(rc /= 8*N_buf(2)) stop "pull2" + end if + + if(N_buf(3) > 0) then + rc = f77_zmq_recv( zmq_socket_pull, det_buf, 2*N_int*bit_kind*N_buf(3), 0) + if(rc /= 2*N_int*bit_kind*N_buf(3)) stop "pull3" end if - - rc = f77_zmq_recv( zmq_socket_pull, delta_loc4(1,1,1), N_states*4*sparse(0), 0) - if(rc /= 4*N_states*sparse(0)) stop "pullc" - - rc = f77_zmq_recv( zmq_socket_pull, delta_loc4(1,1,2), N_states*4*sparse(0), 0) - if(rc /= 4*N_states*sparse(0)) stop "pulld" - do j=1,2 - do i=1,sparse(0) - do k=1,N_states - delta_loc(k,i,j) = real(delta_loc4(k,i,j), kind=8) - end do - end do - end do - else - contrib = 0d0 + rc = f77_zmq_recv( zmq_socket_pull, task_id, 4, 0) + if(rc /= 4) stop "pull4" end if - - - rc = f77_zmq_recv( zmq_socket_pull, N_buf, 4*3, 0) - if(rc /= 4*3) stop "pull" - if(N_buf(1) > N_dress_int_buffer) stop "run_dress_slave N_buf bad size?" - if(N_buf(2) > N_dress_double_buffer) stop "run_dress_slave N_buf bad size?" - if(N_buf(3) > N_dress_det_buffer) stop "run_dress_slave N_buf bad size?" - - - if(N_buf(1) > 0) then - rc = f77_zmq_recv( zmq_socket_pull, int_buf, 4*N_buf(1), 0) - if(rc /= 4*N_buf(1)) stop "pull1" - end if - - if(N_buf(2) > 0) then - rc = f77_zmq_recv( zmq_socket_pull, double_buf, 8*N_buf(2), 0) - if(rc /= 8*N_buf(2)) stop "pull2" - end if - - if(N_buf(3) > 0) then - rc = f77_zmq_recv( zmq_socket_pull, det_buf, 2*N_int*bit_kind*N_buf(3), 0) - if(rc /= 2*N_int*bit_kind*N_buf(3)) stop "pull3" - end if - - rc = f77_zmq_recv( zmq_socket_pull, task_id, 4, 0) - if(rc /= 4) stop "pull4" - ! Activate is zmq_socket_pull is a REP IRP_IF ZMQ_PUSH IRP_ELSE