From f8bda54c75b4d23756980a1789b349df3ef42b2d Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Sun, 9 Sep 2018 12:20:33 +0200 Subject: [PATCH] Fixed MPI --- plugins/Full_CI_ZMQ/run_pt2_slave.irp.f | 1 + .../selection_davidson_slave.irp.f | 15 ++++++++--- .../Full_CI_ZMQ/target_pt2_ratio_zmq.irp.f | 2 +- plugins/Full_CI_ZMQ/target_pt2_zmq.irp.f | 2 +- plugins/Selectors_Utils/zmq.irp.f | 4 +++ .../ezfio_generate_provider.py | 4 +++ src/Bitmask/bitmasks.irp.f | 24 ++++++++++++++++++ src/Bitmask/mpi.irp.f | 4 +++ src/Davidson/davidson_parallel.irp.f | 13 ++++++++-- src/Davidson/{ezfio.irp.f => input.irp.f} | 4 +++ src/Determinants/determinants.irp.f | 16 ++++++++++++ src/Determinants/zmq.irp.f | 12 +++++++++ src/MO_Basis/mos.irp.f | 16 ++++++++++++ src/MPI/mpi.irp.f | 5 ++++ src/Nuclei/nuclei.irp.f | 12 +++++++++ src/ZMQ/put_get.irp.f | 25 +++++++++++-------- 16 files changed, 141 insertions(+), 18 deletions(-) rename src/Davidson/{ezfio.irp.f => input.irp.f} (88%) diff --git a/plugins/Full_CI_ZMQ/run_pt2_slave.irp.f b/plugins/Full_CI_ZMQ/run_pt2_slave.irp.f index 732c8ca8..6d8b6a8c 100644 --- a/plugins/Full_CI_ZMQ/run_pt2_slave.irp.f +++ b/plugins/Full_CI_ZMQ/run_pt2_slave.irp.f @@ -77,6 +77,7 @@ subroutine run_pt2_slave(thread,iproc,energy) ! Try to adjust n_tasks around 1 second per job n_tasks = min(n_tasks,int( 1.d0*dble(n_tasks) / (time1 - time0 + 1.d-9)))+1 +! n_tasks = n_tasks+1 end do integer, external :: disconnect_from_taskserver diff --git a/plugins/Full_CI_ZMQ/selection_davidson_slave.irp.f b/plugins/Full_CI_ZMQ/selection_davidson_slave.irp.f index 415270f1..f3534240 100644 --- a/plugins/Full_CI_ZMQ/selection_davidson_slave.irp.f +++ b/plugins/Full_CI_ZMQ/selection_davidson_slave.irp.f @@ -13,9 +13,10 @@ program selection_slave end subroutine provide_everything - PROVIDE H_apply_buffer_allocated mo_bielec_integrals_in_map psi_det_generators psi_coef_generators psi_det_sorted_bit psi_selectors n_det_generators n_states generators_bitmask zmq_context n_states_diag + PROVIDE H_apply_buffer_allocated mo_bielec_integrals_in_map psi_det_generators psi_coef_generators psi_det_sorted_bit psi_selectors n_det_generators n_states generators_bitmask zmq_context N_states_diag PROVIDE pt2_e0_denominator mo_tot_num N_int ci_energy mpi_master zmq_state zmq_context - PROVIDE psi_det psi_coef + PROVIDE psi_det psi_coef threshold_generators threshold_selectors state_average_weight + PROVIDE N_det_selectors pt2_stoch_istate N_det end subroutine run_wf @@ -39,8 +40,6 @@ subroutine run_wf integer, external :: zmq_get_psi, zmq_get_N_det_selectors integer, external :: zmq_get_N_states_diag - call provide_everything - zmq_context = f77_zmq_ctx_new () states(1) = 'selection' states(2) = 'davidson' @@ -49,6 +48,10 @@ subroutine run_wf zmq_to_qp_run_socket = new_zmq_to_qp_run_socket() + PROVIDE psi_det psi_coef threshold_generators threshold_selectors state_average_weight mpi_master + PROVIDE zmq_state N_det_selectors pt2_stoch_istate N_det pt2_e0_denominator + PROVIDE N_det_generators N_states N_states_diag + call MPI_BARRIER(MPI_COMM_WORLD, ierr) do if (mpi_master) then @@ -62,6 +65,10 @@ subroutine run_wf print *, trim(zmq_state) endif + IRP_IF MPI_DEBUG + print *, irp_here, mpi_rank + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + IRP_ENDIF IRP_IF MPI call MPI_BCAST (zmq_state, 128, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) if (ierr /= MPI_SUCCESS) then diff --git a/plugins/Full_CI_ZMQ/target_pt2_ratio_zmq.irp.f b/plugins/Full_CI_ZMQ/target_pt2_ratio_zmq.irp.f index 04a1d9d4..dbe436ff 100644 --- a/plugins/Full_CI_ZMQ/target_pt2_ratio_zmq.irp.f +++ b/plugins/Full_CI_ZMQ/target_pt2_ratio_zmq.irp.f @@ -1,4 +1,4 @@ -program fci_zmq +program target_pt2_ratio implicit none integer :: i,j,k logical, external :: detEq diff --git a/plugins/Full_CI_ZMQ/target_pt2_zmq.irp.f b/plugins/Full_CI_ZMQ/target_pt2_zmq.irp.f index 52f825f1..851190be 100644 --- a/plugins/Full_CI_ZMQ/target_pt2_zmq.irp.f +++ b/plugins/Full_CI_ZMQ/target_pt2_zmq.irp.f @@ -1,4 +1,4 @@ -program fci_zmq +program target_pt2 implicit none integer :: i,j,k logical, external :: detEq diff --git a/plugins/Selectors_Utils/zmq.irp.f b/plugins/Selectors_Utils/zmq.irp.f index b32436aa..375a77d1 100644 --- a/plugins/Selectors_Utils/zmq.irp.f +++ b/plugins/Selectors_Utils/zmq.irp.f @@ -72,6 +72,10 @@ integer function zmq_get_$X(zmq_to_qp_run_socket, worker_id) 10 continue + IRP_IF MPI_DEBUG + print *, irp_here, mpi_rank + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + IRP_ENDIF IRP_IF MPI include 'mpif.h' integer :: ierr diff --git a/scripts/ezfio_interface/ezfio_generate_provider.py b/scripts/ezfio_interface/ezfio_generate_provider.py index 4a8e7ec4..d6dcc716 100755 --- a/scripts/ezfio_interface/ezfio_generate_provider.py +++ b/scripts/ezfio_interface/ezfio_generate_provider.py @@ -32,6 +32,10 @@ BEGIN_PROVIDER [ %(type)s, %(name)s %(size)s ] stop 1 endif endif + IRP_IF MPI_DEBUG + print *, irp_here, mpi_rank + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + IRP_ENDIF IRP_IF MPI include 'mpif.h' integer :: ierr diff --git a/src/Bitmask/bitmasks.irp.f b/src/Bitmask/bitmasks.irp.f index fcfe03c8..d5472a25 100644 --- a/src/Bitmask/bitmasks.irp.f +++ b/src/Bitmask/bitmasks.irp.f @@ -127,6 +127,10 @@ BEGIN_PROVIDER [ integer, N_generators_bitmask ] ASSERT (N_generators_bitmask > 0) call write_int(6,N_generators_bitmask,'N_generators_bitmask') endif + IRP_IF MPI_DEBUG + print *, irp_here, mpi_rank + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + IRP_ENDIF IRP_IF MPI include 'mpif.h' integer :: ierr @@ -170,6 +174,10 @@ BEGIN_PROVIDER [ integer, N_generators_bitmask_restart ] ASSERT (N_generators_bitmask_restart > 0) call write_int(6,N_generators_bitmask_restart,'N_generators_bitmask_restart') endif + IRP_IF MPI_DEBUG + print *, irp_here, mpi_rank + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + IRP_ENDIF IRP_IF MPI include 'mpif.h' integer :: ierr @@ -244,6 +252,10 @@ BEGIN_PROVIDER [ integer(bit_kind), generators_bitmask_restart, (N_int,2,6,N_gen enddo enddo endif + IRP_IF MPI_DEBUG + print *, irp_here, mpi_rank + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + IRP_ENDIF IRP_IF MPI include 'mpif.h' integer :: ierr @@ -313,6 +325,10 @@ if (mpi_master) then enddo enddo endif + IRP_IF MPI_DEBUG + print *, irp_here, mpi_rank + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + IRP_ENDIF IRP_IF MPI include 'mpif.h' integer :: ierr @@ -354,6 +370,10 @@ BEGIN_PROVIDER [ integer, N_cas_bitmask ] call write_int(6,N_cas_bitmask,'N_cas_bitmask') endif ASSERT (N_cas_bitmask > 0) + IRP_IF MPI_DEBUG + print *, irp_here, mpi_rank + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + IRP_ENDIF IRP_IF MPI include 'mpif.h' integer :: ierr @@ -407,6 +427,10 @@ BEGIN_PROVIDER [ integer(bit_kind), cas_bitmask, (N_int,2,N_cas_bitmask) ] enddo write(*,*) 'Read CAS bitmask' endif + IRP_IF MPI_DEBUG + print *, irp_here, mpi_rank + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + IRP_ENDIF IRP_IF MPI include 'mpif.h' integer :: ierr diff --git a/src/Bitmask/mpi.irp.f b/src/Bitmask/mpi.irp.f index 18af1ca3..11d6777a 100644 --- a/src/Bitmask/mpi.irp.f +++ b/src/Bitmask/mpi.irp.f @@ -26,6 +26,10 @@ subroutine broadcast_chunks_bit_kind(A, LDA) BEGIN_DOC ! Broadcast with chunks of ~2GB END_DOC + IRP_IF MPI_DEBUG + print *, irp_here, mpi_rank + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + IRP_ENDIF IRP_IF MPI include 'mpif.h' integer :: i, sze, ierr diff --git a/src/Davidson/davidson_parallel.irp.f b/src/Davidson/davidson_parallel.irp.f index 15eede23..0a0881a6 100644 --- a/src/Davidson/davidson_parallel.irp.f +++ b/src/Davidson/davidson_parallel.irp.f @@ -13,7 +13,6 @@ end subroutine davidson_slave_tcp(i) implicit none integer, intent(in) :: i - call davidson_run_slave(0,i) end @@ -36,6 +35,10 @@ subroutine davidson_run_slave(thread,iproc) integer, external :: connect_to_taskserver + +include 'mpif.h' +integer ierr + if (connect_to_taskserver(zmq_to_qp_run_socket,worker_id,thread) == -1) then call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket) endif @@ -86,11 +89,13 @@ subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, N_st, sze, allocate (energy(N_st)) if (zmq_get_dvector(zmq_to_qp_run_socket, worker_id, 'u_t', u_t, size(u_t)) == -1) then + print *, irp_here, ': Unable to get u_t' deallocate(u_t,energy) return endif if (zmq_get_dvector(zmq_to_qp_run_socket, worker_id, 'energy', energy, size(energy)) == -1) then + print *, irp_here, ': Unable to get energy' deallocate(u_t,energy) return endif @@ -467,10 +472,13 @@ integer function zmq_get_N_states_diag(zmq_to_qp_run_socket, worker_id) if (rc /= 4) go to 10 endif + IRP_IF MPI_DEBUG + print *, irp_here, mpi_rank + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + IRP_ENDIF IRP_IF MPI include 'mpif.h' integer :: ierr - call MPI_BCAST (zmq_get_N_states_diag, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) if (ierr /= MPI_SUCCESS) then print *, irp_here//': Unable to broadcast N_states' @@ -484,6 +492,7 @@ integer function zmq_get_N_states_diag(zmq_to_qp_run_socket, worker_id) endif endif IRP_ENDIF + TOUCH N_states_diag return diff --git a/src/Davidson/ezfio.irp.f b/src/Davidson/input.irp.f similarity index 88% rename from src/Davidson/ezfio.irp.f rename to src/Davidson/input.irp.f index a22bd456..2904176c 100644 --- a/src/Davidson/ezfio.irp.f +++ b/src/Davidson/input.irp.f @@ -17,6 +17,10 @@ BEGIN_PROVIDER [ integer, n_states_diag ] endif n_states_diag = max(N_states, N_states_diag) endif + IRP_IF MPI_DEBUG + print *, irp_here, mpi_rank + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + IRP_ENDIF IRP_IF MPI include 'mpif.h' integer :: ierr diff --git a/src/Determinants/determinants.irp.f b/src/Determinants/determinants.irp.f index 3db76eef..f04a85a5 100644 --- a/src/Determinants/determinants.irp.f +++ b/src/Determinants/determinants.irp.f @@ -45,6 +45,10 @@ BEGIN_PROVIDER [ integer, N_det ] endif call write_int(6,N_det,'Number of determinants') endif + IRP_IF MPI_DEBUG + print *, irp_here, mpi_rank + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + IRP_ENDIF IRP_IF MPI include 'mpif.h' integer :: ierr @@ -89,6 +93,10 @@ BEGIN_PROVIDER [ integer, psi_det_size ] psi_det_size = max(psi_det_size,100000) call write_int(6,psi_det_size,'Dimension of the psi arrays') endif + IRP_IF MPI_DEBUG + print *, irp_here, mpi_rank + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + IRP_ENDIF IRP_IF MPI include 'mpif.h' integer :: ierr @@ -154,6 +162,10 @@ BEGIN_PROVIDER [ integer(bit_kind), psi_det, (N_int,2,psi_det_size) ] enddo endif endif + IRP_IF MPI_DEBUG + print *, irp_here, mpi_rank + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + IRP_ENDIF IRP_IF MPI include 'mpif.h' integer :: ierr @@ -212,6 +224,10 @@ BEGIN_PROVIDER [ double precision, psi_coef, (psi_det_size,N_states) ] endif endif endif + IRP_IF MPI_DEBUG + print *, irp_here, mpi_rank + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + IRP_ENDIF IRP_IF MPI include 'mpif.h' integer :: ierr diff --git a/src/Determinants/zmq.irp.f b/src/Determinants/zmq.irp.f index e8edc1a8..6c25173a 100644 --- a/src/Determinants/zmq.irp.f +++ b/src/Determinants/zmq.irp.f @@ -112,6 +112,10 @@ integer function zmq_get_$X(zmq_to_qp_run_socket, worker_id) endif 10 continue + IRP_IF MPI_DEBUG + print *, irp_here, mpi_rank + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + IRP_ENDIF IRP_IF MPI include 'mpif.h' integer :: ierr @@ -297,6 +301,10 @@ integer function zmq_get_psi_det(zmq_to_qp_run_socket, worker_id) endif 10 continue + IRP_IF MPI_DEBUG + print *, irp_here, mpi_rank + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + IRP_ENDIF IRP_IF MPI include 'mpif.h' integer :: ierr @@ -346,6 +354,10 @@ integer function zmq_get_psi_coef(zmq_to_qp_run_socket, worker_id) 10 continue + IRP_IF MPI_DEBUG + print *, irp_here, mpi_rank + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + IRP_ENDIF IRP_IF MPI include 'mpif.h' integer :: ierr diff --git a/src/MO_Basis/mos.irp.f b/src/MO_Basis/mos.irp.f index 3c920d67..2662c6e6 100644 --- a/src/MO_Basis/mos.irp.f +++ b/src/MO_Basis/mos.irp.f @@ -9,6 +9,10 @@ BEGIN_PROVIDER [ integer, mo_tot_num ] if (mpi_master) then call ezfio_has_mo_basis_mo_tot_num(has) endif + IRP_IF MPI_DEBUG + print *, irp_here, mpi_rank + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + IRP_ENDIF IRP_IF MPI include 'mpif.h' integer :: ierr @@ -71,6 +75,10 @@ BEGIN_PROVIDER [ double precision, mo_coef, (ao_num,mo_tot_num) ] ! Coefs call ezfio_has_mo_basis_mo_coef(exists) endif + IRP_IF MPI_DEBUG + print *, irp_here, mpi_rank + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + IRP_ENDIF IRP_IF MPI include 'mpif.h' integer :: ierr @@ -136,6 +144,10 @@ BEGIN_PROVIDER [ character*(64), mo_label ] endif write(*,*) '* mo_label ', trim(mo_label) endif + IRP_IF MPI_DEBUG + print *, irp_here, mpi_rank + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + IRP_ENDIF IRP_IF MPI include 'mpif.h' integer :: ierr @@ -198,6 +210,10 @@ BEGIN_PROVIDER [ double precision, mo_occ, (mo_tot_num) ] endif write(*,*) 'Read mo_occ' endif + IRP_IF MPI_DEBUG + print *, irp_here, mpi_rank + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + IRP_ENDIF IRP_IF MPI include 'mpif.h' integer :: ierr diff --git a/src/MPI/mpi.irp.f b/src/MPI/mpi.irp.f index f24a2923..41694c8f 100644 --- a/src/MPI/mpi.irp.f +++ b/src/MPI/mpi.irp.f @@ -70,6 +70,10 @@ subroutine broadcast_chunks_$double(A, LDA) BEGIN_DOC ! Broadcast with chunks of ~2GB END_DOC + IRP_IF MPI_DEBUG + print *, irp_here, mpi_rank + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + IRP_ENDIF IRP_IF MPI include 'mpif.h' integer :: i, sze, ierr @@ -80,6 +84,7 @@ subroutine broadcast_chunks_$double(A, LDA) print *, irp_here//': Unable to broadcast chunks $double ', i stop -1 endif + call MPI_BARRIER(MPI_COMM_WORLD,ierr) enddo IRP_ENDIF end diff --git a/src/Nuclei/nuclei.irp.f b/src/Nuclei/nuclei.irp.f index 3528bf50..b2d1f54b 100644 --- a/src/Nuclei/nuclei.irp.f +++ b/src/Nuclei/nuclei.irp.f @@ -54,6 +54,10 @@ BEGIN_PROVIDER [ double precision, nucl_coord, (nucl_num,3) ] endif + IRP_IF MPI_DEBUG + print *, irp_here, mpi_rank + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + IRP_ENDIF IRP_IF MPI include 'mpif.h' integer :: ierr @@ -159,6 +163,10 @@ BEGIN_PROVIDER [ double precision, nuclear_repulsion ] endif print*, 'Read nuclear_repulsion' endif + IRP_IF MPI_DEBUG + print *, irp_here, mpi_rank + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + IRP_ENDIF IRP_IF MPI include 'mpif.h' integer :: ierr @@ -228,6 +236,10 @@ END_PROVIDER close(10) endif + IRP_IF MPI_DEBUG + print *, irp_here, mpi_rank + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + IRP_ENDIF IRP_IF MPI include 'mpif.h' integer :: ierr diff --git a/src/ZMQ/put_get.irp.f b/src/ZMQ/put_get.irp.f index 5269bd5e..207cb0ae 100644 --- a/src/ZMQ/put_get.irp.f +++ b/src/ZMQ/put_get.irp.f @@ -60,17 +60,20 @@ integer function zmq_get_dvector(zmq_to_qp_run_socket, worker_id, name, x, size_ rc = f77_zmq_send(zmq_to_qp_run_socket,trim(msg),len(trim(msg)),0) if (rc /= len(trim(msg))) then zmq_get_dvector = -1 + print *, irp_here, 'rc /= len(trim(msg))', rc, len(trim(msg)) go to 10 endif rc = f77_zmq_recv(zmq_to_qp_run_socket,msg,len(msg),0) if (msg(1:14) /= 'get_data_reply') then + print *, irp_here, 'msg(1:14) /= get_data_reply', msg(1:14) zmq_get_dvector = -1 go to 10 endif rc = f77_zmq_recv(zmq_to_qp_run_socket,x,size_x*8,0) if (rc /= size_x*8) then + print *, irp_here, 'rc /= size_x*8', rc, size_x*8 zmq_get_dvector = -1 go to 10 endif @@ -78,6 +81,10 @@ integer function zmq_get_dvector(zmq_to_qp_run_socket, worker_id, name, x, size_ 10 continue + IRP_IF MPI_DEBUG + print *, irp_here, mpi_rank + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + IRP_ENDIF IRP_IF MPI integer :: ierr include 'mpif.h' @@ -86,11 +93,8 @@ integer function zmq_get_dvector(zmq_to_qp_run_socket, worker_id, name, x, size_ print *, irp_here//': Unable to broadcast zmq_get_dvector' stop -1 endif - call MPI_BCAST (x, size_x, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr) - if (ierr /= MPI_SUCCESS) then - print *, irp_here//': Unable to broadcast dvector' - stop -1 - endif + call MPI_BARRIER(MPI_COMM_WORLD,ierr) + call broadcast_chunks_double(x, size_x) IRP_ENDIF end @@ -177,6 +181,10 @@ integer function zmq_get_ivector(zmq_to_qp_run_socket, worker_id, name, x, size_ 10 continue + IRP_IF MPI_DEBUG + print *, irp_here, mpi_rank + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + IRP_ENDIF IRP_IF MPI integer :: ierr include 'mpif.h' @@ -185,11 +193,8 @@ integer function zmq_get_ivector(zmq_to_qp_run_socket, worker_id, name, x, size_ print *, irp_here//': Unable to broadcast zmq_get_ivector' stop -1 endif - call MPI_BCAST (x, size_x, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) - if (ierr /= MPI_SUCCESS) then - print *, irp_here//': Unable to broadcast ivector' - stop -1 - endif + call MPI_BARRIER(MPI_COMM_WORLD,ierr) + call broadcast_chunks_integer(x, size_x) IRP_ENDIF end