From f8924a82f4b0bf40596698911bf39099d66d454a Mon Sep 17 00:00:00 2001 From: Yann Garniron Date: Fri, 2 Feb 2018 13:10:14 +0100 Subject: [PATCH] reduced write on checkpoints --- plugins/mrcepa0/mrcc_stoch_routines.irp.f | 221 ++++++++++++++-------- 1 file changed, 142 insertions(+), 79 deletions(-) diff --git a/plugins/mrcepa0/mrcc_stoch_routines.irp.f b/plugins/mrcepa0/mrcc_stoch_routines.irp.f index e83dfe15..fe6d38f1 100644 --- a/plugins/mrcepa0/mrcc_stoch_routines.irp.f +++ b/plugins/mrcepa0/mrcc_stoch_routines.irp.f @@ -110,16 +110,17 @@ subroutine ZMQ_mrcc(E, mrcc, delta, delta_s2, relative_error) print *, irp_here, ': Failed in zmq_set_running' endif - !$OMP PARALLEL DEFAULT(shared) NUM_THREADS(nproc+1) & - !$OMP PRIVATE(i) - i = omp_get_thread_num() - if (i==0) then - call mrcc_collector(zmq_socket_pull,E, relative_error, delta, delta_s2, mrcc) - - else - call mrcc_slave_inproc(i) - endif - !$OMP END PARALLEL + !!$OMP PARALLEL DEFAULT(shared) NUM_THREADS(nproc+1) & + ! !$OMP PRIVATE(i) + !i = omp_get_thread_num() + !if (i==0) then + ! call mrcc_collector(zmq_socket_pull,E, relative_error, delta, delta_s2, mrcc) +! +! else +! call mrcc_slave_inproc(i) +! endif +! !$OMP END PARALLEL + call mrcc_collector(zmq_socket_pull,E, relative_error, delta, delta_s2, mrcc) call end_parallel_job(zmq_to_qp_run_socket, zmq_socket_pull, 'mrcc') print *, '========== ================= ================= =================' @@ -146,15 +147,14 @@ subroutine mrcc_collector(zmq_socket_pull, E, relative_error, delta, delta_s2, m integer(ZMQ_PTR), intent(in) :: zmq_socket_pull - double precision, intent(in) :: relative_error, E + double precision, intent(in) :: relative_error, E(N_states) double precision, intent(out) :: mrcc(N_states) double precision, allocatable :: cp(:,:,:,:) double precision, intent(out) :: delta(N_states, N_det_non_ref) double precision, intent(out) :: delta_s2(N_states, N_det_non_ref) - double precision, allocatable :: delta_loc(:,:,:), delta_det(:,:,:,:) + double precision, allocatable :: delta_loc(:,:,:,:), delta_det(:,:,:,:) double precision, allocatable :: mrcc_detail(:,:) - double precision :: mrcc_mwen(N_states) integer(ZMQ_PTR),external :: new_zmq_to_qp_run_socket integer(ZMQ_PTR) :: zmq_to_qp_run_socket @@ -164,7 +164,7 @@ subroutine mrcc_collector(zmq_socket_pull, E, relative_error, delta, delta_s2, m integer :: i, j, k, i_state, N, ntask integer, allocatable :: task_id(:) integer :: Nindex - integer, allocatable :: ind(:) + integer :: ind !double precision, save :: time0 = -1.d0 double precision :: time, time0, timeInit, old_tooth double precision, external :: omp_get_wtime @@ -172,13 +172,22 @@ subroutine mrcc_collector(zmq_socket_pull, E, relative_error, delta, delta_s2, m integer, allocatable :: parts_to_get(:) logical, allocatable :: actually_computed(:) integer :: total_computed - + integer, parameter :: delta_loc_N = 4 + integer :: delta_loc_slot, delta_loc_i(delta_loc_N) + double precision :: mrcc_mwen(N_states, delta_loc_N), lcoef(delta_loc_N) + logical :: ok + double precision :: usf, num + integer(8), save :: rezo = 0_8 + + usf = 0d0 + num = 0d0 + print *, "TARGET ERROR :", relative_error delta = 0d0 delta_s2 = 0d0 allocate(delta_det(N_states, N_det_non_ref, 0:comb_teeth+1, 2)) allocate(cp(N_states, N_det_non_ref, N_cp, 2), mrcc_detail(N_states, N_det_generators)) - allocate(delta_loc(N_states, N_det_non_ref, 2)) + allocate(delta_loc(N_states, N_det_non_ref, 2, delta_loc_N)) mrcc_detail = 0d0 delta_det = 0d0 !mrcc_detail = mrcc_detail / 0d0 @@ -200,69 +209,122 @@ subroutine mrcc_collector(zmq_socket_pull, E, relative_error, delta, delta_s2, m actually_computed = .false. zmq_to_qp_run_socket = new_zmq_to_qp_run_socket() - allocate(task_id(N_det_generators), ind(1)) + allocate(task_id(N_det_generators)) more = 1 time = omp_get_wtime() time0 = time timeInit = time cur_cp = 0 old_cur_cp = 0 + delta_loc_slot = 1 + delta_loc_i = 0 pullLoop : do while (more == 1) - call pull_mrcc_results(zmq_socket_pull, Nindex, ind, mrcc_mwen, delta_loc, task_id, ntask) + call pull_mrcc_results(zmq_socket_pull, Nindex, ind, mrcc_mwen(1, delta_loc_slot), delta_loc(1,1,1,delta_loc_slot), task_id, ntask) + !rezo += N_det_non_ref*8*2 + !print *, rezo / 1000000_8, "M" if(Nindex /= 1) stop "tried pull multiple Nindex" - - do i=1,Nindex - mrcc_detail(:, ind(i)) += mrcc_mwen(:) - do j=1,N_cp !! optimizable - if(cps(ind(i), j) > 0d0) then - if(tooth_of_det(ind(i)) < cp_first_tooth(j)) stop "coef on supposedely deterministic det" - double precision :: fac - integer :: toothMwen - logical :: fracted - fac = cps(ind(i), j) / cps_N(j) * mrcc_weight_inv(ind(i)) * comb_step - do k=1,N_det_non_ref - do i_state=1,N_states - cp(i_state,k,j,1) += delta_loc(i_state,k,1) * fac - cp(i_state,k,j,2) += delta_loc(i_state,k,2) * fac - end do - end do - end if - end do - toothMwen = tooth_of_det(ind(i)) - fracted = (toothMwen /= 0) - if(fracted) fracted = (ind(i) == first_det_of_teeth(toothMwen)) - - if(fracted) then - delta_det(:,:,toothMwen-1, 1) += delta_loc(:,:,1) * (1d0-fractage(toothMwen)) - delta_det(:,:,toothMwen-1, 2) += delta_loc(:,:,2) * (1d0-fractage(toothMwen)) - delta_det(:,:,toothMwen, 1) += delta_loc(:,:,1) * (fractage(toothMwen)) - delta_det(:,:,toothMwen, 2) += delta_loc(:,:,2) * (fractage(toothMwen)) - else - delta_det(:,:,toothMwen, 1) += delta_loc(:,:,1) - delta_det(:,:,toothMwen, 2) += delta_loc(:,:,2) - end if - - parts_to_get(ind(i)) -= 1 - if(parts_to_get(ind(i)) == 0) then - actually_computed(ind(i)) = .true. - !print *, "CONTRIB", ind(i), psi_non_ref_coef(ind(i),1), mrcc_detail(1, ind(i)) - total_computed += 1 - end if - end do - + delta_loc_i(delta_loc_slot) = ind integer, external :: zmq_delete_tasks if (zmq_delete_tasks(zmq_to_qp_run_socket,zmq_socket_pull,task_id,ntask,more) == -1) then stop 'Unable to delete tasks' endif - - + time = omp_get_wtime() - - - - if(time - time0 > 10d0 .or. more /= 1) then +!time - time0 > 10d0 + if(more /= 1 .or. delta_loc_slot == delta_loc_N) then time0 = time + do i=1,delta_loc_N + if(delta_loc_i(i) /= 0) then + mrcc_detail(:, delta_loc_i(i)) += mrcc_mwen(:,i) + end if + end do + + !$OMP PARALLEL DO SCHEDULE(DYNAMIC) DEFAULT(shared) private(j, ok, i, lcoef, k, i_state) + do j=1,N_cp !! optimizable + ok = .false. + do i=1,delta_loc_N + if(delta_loc_i(i) == 0) then + lcoef(i) = 0d0 + else + lcoef(i) = cps(delta_loc_i(i), j) / cps_N(j) * mrcc_weight_inv(delta_loc_i(i)) * comb_step + if(lcoef(i) /= 0d0) then + !usf = usf + 1d0 + ok = .true. + end if + end if + end do + if(.not. ok) cycle + !num += 1d0 + !print *, "USEFUL", usf, num, usf/num + !do j=1,N_cp !! optimizable + ! if(cps(ind, j) > 0d0) then + !if(tooth_of_det(ind) < cp_first_tooth(j)) stop "coef on supposedely deterministic det" + double precision :: fac + integer :: toothMwen + logical :: fracted, toothMwendid(0:10000) + !fac = cps(ind, j) / cps_N(j) * mrcc_weight_inv(ind) * comb_step + !!$OMP PARALLEL DO COLLAPSE(2) DEFAULT(shared) + do k=1,N_det_non_ref + do i_state=1,N_states + cp(i_state,k,j,1) += delta_loc(i_state,k,1,1) * lcoef(1) + & + delta_loc(i_state,k,1,2) * lcoef(2) + & + delta_loc(i_state,k,1,3) * lcoef(3) + & + delta_loc(i_state,k,1,4) * lcoef(4) + end do + end do + + !!$OMP PARALLEL DO COLLAPSE(2) DEFAULT(shared) + do k=1,N_det_non_ref + do i_state=1,N_states + cp(i_state,k,j,2) += delta_loc(i_state,k,2,1) * lcoef(1) + & + delta_loc(i_state,k,2,2) * lcoef(2) + & + delta_loc(i_state,k,2,3) * lcoef(3) + & + delta_loc(i_state,k,2,4) * lcoef(4) + end do + end do + + ! end if + end do + !$OMP END PARALLEL DO + + !toothmwendid = .false. + do i=1,delta_loc_N + ind = delta_loc_i(i) + if(ind == 0) cycle + toothMwen = tooth_of_det(ind) + !if(.not. toothmwendid(toothMwen)) then + ! usf += 1d0 + ! toothmwendid(toothMwen) = .true. + !end if + + fracted = (toothMwen /= 0) + if(fracted) fracted = (ind == first_det_of_teeth(toothMwen)) + + if(fracted) then + delta_det(:,:,toothMwen-1, 1) += delta_loc(:,:,1,i) * (1d0-fractage(toothMwen)) + delta_det(:,:,toothMwen-1, 2) += delta_loc(:,:,2,i) * (1d0-fractage(toothMwen)) + delta_det(:,:,toothMwen, 1) += delta_loc(:,:,1,i) * (fractage(toothMwen)) + delta_det(:,:,toothMwen, 2) += delta_loc(:,:,2,i) * (fractage(toothMwen)) + else + delta_det(:,:,toothMwen, 1) += delta_loc(:,:,1,i) + delta_det(:,:,toothMwen, 2) += delta_loc(:,:,2,i) + end if + parts_to_get(ind) -= 1 + if(parts_to_get(ind) == 0) then + actually_computed(ind) = .true. + !print *, "CONTRIB", ind, psi_non_ref_coef(ind,1), mrcc_detail(1, ind) + total_computed += 1 + end if + end do + !num += 1d0 + !print *, "USEFUL", usf, num, usf/num + + delta_loc_slot = 1 + delta_loc_i = 0 + + + !if(time - time0 > 10d0 .or. more /= 1) then cur_cp = N_cp !if(.not. actually_computed(mrcc_jobs(1))) cycle pullLoop @@ -300,7 +362,7 @@ subroutine mrcc_collector(zmq_socket_pull, E, relative_error, delta, delta_s2, m E0 = E0 + mrcc_detail(1, first_det_of_teeth(cp_first_tooth(cur_cp))) * (1d0-fractage(cp_first_tooth(cur_cp))) end if - print "(I5,F15.7,F10.2,E12.4)", cur_cp, E+E0+avg, eqt, time-timeInit + print "(I5,F15.7,E12.4,F10.2)", cur_cp, E+E0+avg, eqt, time-timeInit if ((dabs(eqt) < relative_error .and. cps_N(cur_cp) >= 30) .or. total_computed == N_det_generators) then if (zmq_abort(zmq_to_qp_run_socket) == -1) then @@ -310,6 +372,8 @@ subroutine mrcc_collector(zmq_socket_pull, E, relative_error, delta, delta_s2, m endif endif endif + else + delta_loc_slot += 1 end if end do pullLoop @@ -321,16 +385,14 @@ subroutine mrcc_collector(zmq_socket_pull, E, relative_error, delta, delta_s2, m delta_s2 += delta_det(:,:,i,2) end do else - - delta = cp(:,:,cur_cp,1) - delta_s2 = cp(:,:,cur_cp,2) - do i=cp_first_tooth(cur_cp)-1,0,-1 - delta += delta_det(:,:,i,1) - delta_s2 += delta_det(:,:,i,2) - end do - + delta = cp(:,:,cur_cp,1) + delta_s2 = cp(:,:,cur_cp,2) + do i=cp_first_tooth(cur_cp)-1,0,-1 + delta += delta_det(:,:,i,1) + delta_s2 += delta_det(:,:,i,2) + end do end if - mrcc(1) = E + mrcc = E call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket) end subroutine @@ -389,6 +451,8 @@ END_PROVIDER integer :: i, j, last_full, dets(comb_teeth) integer :: k, l, cur_cp, under_det(comb_teeth+1) integer, allocatable :: iorder(:), first_cp(:) + double precision :: tmp + allocate(iorder(N_det_generators), first_cp(N_cps_max+1)) allocate(computed(N_det_generators)) @@ -468,11 +532,10 @@ END_PROVIDER cps(:, N_cp) = 0d0 cp_first_tooth(N_cp) = comb_teeth+1 - iorder = -1132154665 - do i=1,N_cp-1 - call isort(mrcc_jobs(first_cp(i)+1:first_cp(i+1)),iorder,first_cp(i+1)-first_cp(i)) - end do -! end subroutine + !iorder = -1132154665 + !do i=1,N_cp-1 + ! call isort(mrcc_jobs(first_cp(i)+1:first_cp(i+1)),iorder,first_cp(i+1)-first_cp(i)) + !end do END_PROVIDER