From 63fcb619aaef23055ea4ce9fffc2c85802561e74 Mon Sep 17 00:00:00 2001
From: Anthony Scemama
Date: Sat, 26 Jan 2019 11:18:29 +0100
Subject: [PATCH] Fixing MPI Davidson

---
 src/cipsi/slave_cipsi.irp.f          | 22 ++++----
 src/davidson/davidson_parallel.irp.f | 77 +++++++++++++++++++++++++++-
 2 files changed, 88 insertions(+), 11 deletions(-)

diff --git a/src/cipsi/slave_cipsi.irp.f b/src/cipsi/slave_cipsi.irp.f
index 4b5f4252..a655f295 100644
--- a/src/cipsi/slave_cipsi.irp.f
+++ b/src/cipsi/slave_cipsi.irp.f
@@ -173,6 +173,7 @@ subroutine run_slave_main
     call davidson_slave_tcp(0)
     call omp_set_nested(.False.)
     print *, mpi_rank, ': Davidson done'
+
     IRP_IF MPI
       call MPI_BARRIER(MPI_COMM_WORLD, ierr)
       if (ierr /= MPI_SUCCESS) then
@@ -223,14 +224,6 @@ subroutine run_slave_main
     if (zmq_get_dvector(zmq_to_qp_run_socket,1,'state_average_weight',state_average_weight,N_states) == -1) cycle
     pt2_e0_denominator(1:N_states) = energy(1:N_states)
     SOFT_TOUCH pt2_e0_denominator state_average_weight pt2_stoch_istate threshold_generators
-    if (mpi_master) then
-      print *, 'N_det', N_det
-      print *, 'N_det_generators', N_det_generators
-      print *, 'N_det_selectors', N_det_selectors
-      print *, 'pt2_e0_denominator', pt2_e0_denominator
-      print *, 'pt2_stoch_istate', pt2_stoch_istate
-      print *, 'state_average_weight', state_average_weight
-    endif
 
     call wall_time(t1)
     call write_double(6,(t1-t0),'Broadcast time')
@@ -281,7 +274,18 @@ subroutine run_slave_main
       nproc_target = nproc_target - 1
     enddo
 
-    !$OMP PARALLEL PRIVATE(i)
+
+    if (mpi_master) then
+      print *, 'N_det', N_det
+      print *, 'N_det_generators', N_det_generators
+      print *, 'N_det_selectors', N_det_selectors
+      print *, 'pt2_e0_denominator', pt2_e0_denominator
+      print *, 'pt2_stoch_istate', pt2_stoch_istate
+      print *, 'state_average_weight', state_average_weight
+      print *, 'Number of threads', nproc_target
+    endif
+
+    !$OMP PARALLEL PRIVATE(i) NUM_THREADS(nproc_target+1)
     i = omp_get_thread_num()
     call run_pt2_slave(0,i,pt2_e0_denominator)
     !$OMP END PARALLEL
diff --git a/src/davidson/davidson_parallel.irp.f b/src/davidson/davidson_parallel.irp.f
index 9a0844c6..394e724f 100644
--- a/src/davidson/davidson_parallel.irp.f
+++ b/src/davidson/davidson_parallel.irp.f
@@ -35,7 +35,7 @@
   integer(ZMQ_PTR)               :: zmq_socket_push
 
   integer, external :: connect_to_taskserver
-  integer, external :: zmq_get_N_states_diag
+  integer, external :: zmq_get_N_states_diag_notouch
 
   PROVIDE mpi_rank
   zmq_to_qp_run_socket = new_zmq_to_qp_run_socket()
@@ -68,10 +68,23 @@
     if (doexit == 0) then
      exit
    else
-      print *, irp_here, ': retrying connection (', doexit, ')'
+      if (thread == 0) then
+        print *, irp_here, ': Connection failed. Exiting Davidson.'
+        return
+      endif
    endif
  enddo
 
+  do
+    if (zmq_get_N_states_diag_notouch(zmq_to_qp_run_socket,1) == -1) then
+      call sleep(1)
+      print *, irp_here, ': waiting for N_states_diag'
+    else
+      exit
+    endif
+  enddo
+  SOFT_TOUCH N_states_diag
+
   call davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, N_states_diag, N_det, worker_id)
 
   integer, external :: disconnect_from_taskserver
@@ -575,3 +588,63 @@ integer function zmq_get_N_states_diag(zmq_to_qp_run_socket, worker_id)
   endif
   IRP_ENDIF
 end
+
+integer function zmq_get_N_states_diag_notouch(zmq_to_qp_run_socket, worker_id)
+  use f77_zmq
+  implicit none
+  BEGIN_DOC
+! Get N_states_diag from the qp_run scheduler
+  END_DOC
+  integer(ZMQ_PTR), intent(in) :: zmq_to_qp_run_socket
+  integer, intent(in)          :: worker_id
+  integer                      :: rc
+  character*(256)              :: msg
+
+  zmq_get_N_states_diag_notouch = 0
+
+  if (mpi_master) then
+    write(msg,'(A,1X,I8,1X,A200)') 'get_data '//trim(zmq_state), worker_id, 'N_states_diag'
+    rc = f77_zmq_send(zmq_to_qp_run_socket,trim(msg),len(trim(msg)),0)
+    if (rc /= len(trim(msg))) go to 10
+
+    rc = f77_zmq_recv(zmq_to_qp_run_socket,msg,len(msg),0)
+    if (msg(1:14) /= 'get_data_reply') go to 10
+
+    rc = f77_zmq_recv(zmq_to_qp_run_socket,N_states_diag,4,0)
+    if (rc /= 4) go to 10
+  endif
+
+  IRP_IF MPI_DEBUG
+    print *, irp_here, mpi_rank
+    call MPI_BARRIER(MPI_COMM_WORLD, ierr)
+  IRP_ENDIF
+  IRP_IF MPI
+    include 'mpif.h'
+    integer :: ierr
+    call MPI_BCAST (zmq_get_N_states_diag_notouch, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
+    if (ierr /= MPI_SUCCESS) then
+      print *, irp_here//': Unable to broadcast N_states_diag'
+      stop -1
+    endif
+    if (zmq_get_N_states_diag_notouch == 0) then
+      call MPI_BCAST (N_states_diag, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
+      if (ierr /= MPI_SUCCESS) then
+        print *, irp_here//': Unable to broadcast N_states_diag'
+        stop -1
+      endif
+    endif
+  IRP_ENDIF
+
+  return
+
+  ! Exception
+  10 continue
+  zmq_get_N_states_diag_notouch = -1
+  IRP_IF MPI
+    call MPI_BCAST (zmq_get_N_states_diag_notouch, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
+    if (ierr /= MPI_SUCCESS) then
+      print *, irp_here//': Unable to broadcast N_states_diag'
+      stop -1
+    endif
+  IRP_ENDIF
+end
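
Note on the pattern above: a slave can connect before qp_run has been given
the value of N_states_diag, so the patch polls with
zmq_get_N_states_diag_notouch, sleeping between attempts, and runs
SOFT_TOUCH only once the value has arrived, so that IRPF90 invalidates the
quantities derived from it exactly once. Below is a minimal, self-contained
Fortran sketch of that poll-until-ready idiom, not part of the patch:
try_get_value is a hypothetical stand-in for zmq_get_N_states_diag_notouch,
and the value appearing on the third poll is simulated for the example.

! Sketch of the poll-until-ready idiom used in davidson_run_slave.
! try_get_value (hypothetical) returns -1 while the value is
! unavailable and 0 once it has been fetched.
program poll_until_ready
  implicit none
  integer :: n_states_diag

  do
    if (try_get_value(n_states_diag) == -1) then
      call sleep(1)   ! avoid busy-waiting on the scheduler
      print *, 'waiting for N_states_diag'
    else
      exit            ! value received: stop polling
    endif
  enddo
  ! In the real code, SOFT_TOUCH N_states_diag would run here, once,
  ! so IRPF90 rebuilds everything that depends on the new value.
  print *, 'N_states_diag =', n_states_diag

contains

  ! Simulates a scheduler that publishes the value on the third poll.
  integer function try_get_value(val)
    integer, intent(out) :: val
    integer, save        :: calls = 0
    calls = calls + 1
    if (calls < 3) then
      val = -1        ! placeholder; caller must not use it on failure
      try_get_value = -1
    else
      val = 4         ! arbitrary example value
      try_get_value = 0
    endif
  end function try_get_value
end program poll_until_ready

The call sleep(1) matters in practice: without it, idle slaves would hammer
the scheduler socket while waiting for the master to register the value.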