From 8f294a6b804d2bb902e445fad30e941aa713b0fd Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Wed, 28 Nov 2018 11:03:10 +0100 Subject: [PATCH] More verbose ZMQ errors --- data/module_gitignore | 1 + scripts/qp_srun | 2 +- src/Davidson/u0Hu0.irp.f | 10 +++++----- src/ZMQ/put_get.irp.f | 36 ++++++++++++++++++++++++++++++++++-- 4 files changed, 41 insertions(+), 8 deletions(-) diff --git a/data/module_gitignore b/data/module_gitignore index 1c1ac198..ab657a01 100644 --- a/data/module_gitignore +++ b/data/module_gitignore @@ -3,6 +3,7 @@ IRPF90_man/ irpf90.make irpf90_entities tags +Makefile AO_Basis AO_one_e_integrals Bitmask diff --git a/scripts/qp_srun b/scripts/qp_srun index 5c7847de..7caf1a8f 100755 --- a/scripts/qp_srun +++ b/scripts/qp_srun @@ -69,7 +69,7 @@ as a single-node job before re-submitting the current job. fi rm -f $INPUT/work/qp_run_address set -x -srun -n 1 qp_run $PROG $INPUT & +srun -N 1 -n 1 qp_run $PROG $INPUT & if [[ $NPROC -gt 1 ]] then diff --git a/src/Davidson/u0Hu0.irp.f b/src/Davidson/u0Hu0.irp.f index fce85c7c..5158858f 100644 --- a/src/Davidson/u0Hu0.irp.f +++ b/src/Davidson/u0Hu0.irp.f @@ -486,11 +486,11 @@ subroutine u_0_H_u_0(e_0,u_0,n,keys_tmp,Nint,N_st,sze) double precision :: u_dot_u,u_dot_v,diag_H_mat_elem integer :: i,j - if ((sze > 100000).and.distributed_davidson) then - allocate (v_0(sze,N_states_diag),s_0(sze,N_states_diag), u_1(sze,N_states_diag)) - u_1(1:sze,1:N_states) = u_0(1:sze,1:N_states) - u_1(1:sze,N_states+1:N_states_diag) = 0.d0 - call H_S2_u_0_nstates_zmq(v_0,s_0,u_1,N_states_diag,sze) + if ((n > 100000).and.distributed_davidson) then + allocate (v_0(n,N_states_diag),s_0(n,N_states_diag), u_1(n,N_states_diag)) + u_1(1:n,1:N_states) = u_0(1:n,1:N_states) + u_1(1:n,N_states+1:N_states_diag) = 0.d0 + call H_S2_u_0_nstates_zmq(v_0,s_0,u_1,N_states_diag,n) deallocate(u_1) else allocate (v_0(n,N_st),s_0(n,N_st),u_1(n,N_st)) diff --git a/src/ZMQ/put_get.irp.f b/src/ZMQ/put_get.irp.f index 0df5268c..663732df 100644 --- a/src/ZMQ/put_get.irp.f +++ b/src/ZMQ/put_get.irp.f @@ -17,6 +17,7 @@ integer function zmq_put_dvector(zmq_to_qp_run_socket, worker_id, name, x, size_ write(msg,'(A,1X,I8,1X,A200)') 'put_data '//trim(zmq_state), worker_id, name rc = f77_zmq_send(zmq_to_qp_run_socket,trim(msg),len(trim(msg)),ZMQ_SNDMORE) if (rc /= len(trim(msg))) then + print *, trim(msg) zmq_put_dvector = -1 return endif @@ -29,6 +30,7 @@ integer function zmq_put_dvector(zmq_to_qp_run_socket, worker_id, name, x, size_ rc = f77_zmq_recv(zmq_to_qp_run_socket,msg,len(msg),0) if (msg(1:rc) /= 'put_data_reply ok') then + print *, trim(msg) zmq_put_dvector = -1 return endif @@ -58,6 +60,7 @@ integer function zmq_get_dvector(zmq_to_qp_run_socket, worker_id, name, x, size_ write(msg,'(A,1X,I8,1X,A200)') 'get_data '//trim(zmq_state), worker_id, name rc = f77_zmq_send(zmq_to_qp_run_socket,trim(msg),len(trim(msg)),0) if (rc /= len(trim(msg))) then + print *, trim(msg) zmq_get_dvector = -1 print *, irp_here, 'rc /= len(trim(msg))', rc, len(trim(msg)) go to 10 @@ -120,6 +123,7 @@ integer function zmq_put_ivector(zmq_to_qp_run_socket, worker_id, name, x, size_ write(msg,'(A,1X,I8,1X,A200)') 'put_data '//trim(zmq_state), worker_id, name rc = f77_zmq_send(zmq_to_qp_run_socket,trim(msg),len(trim(msg)),ZMQ_SNDMORE) if (rc /= len(trim(msg))) then + print *, trim(msg) zmq_put_ivector = -1 return endif @@ -132,6 +136,7 @@ integer function zmq_put_ivector(zmq_to_qp_run_socket, worker_id, name, x, size_ rc = f77_zmq_recv(zmq_to_qp_run_socket,msg,len(msg),0) if (msg(1:rc) /= 'put_data_reply ok') then + print *, trim(msg) zmq_put_ivector = -1 return endif @@ -161,12 +166,14 @@ integer function zmq_get_ivector(zmq_to_qp_run_socket, worker_id, name, x, size_ write(msg,'(A,1X,I8,1X,A200)') 'get_data '//trim(zmq_state), worker_id, name rc = f77_zmq_send(zmq_to_qp_run_socket,trim(msg),len(trim(msg)),0) if (rc /= len(trim(msg))) then + print *, trim(msg) zmq_get_ivector = -1 go to 10 endif rc = f77_zmq_recv(zmq_to_qp_run_socket,msg,len(msg),0) if (msg(1:14) /= 'get_data_reply') then + print *, trim(msg) zmq_get_ivector = -1 go to 10 endif @@ -218,6 +225,7 @@ integer function zmq_put8_dvector(zmq_to_qp_run_socket, worker_id, name, x, size write(msg,'(A,1X,I8,1X,A200)') 'put_data '//trim(zmq_state), worker_id, name rc = f77_zmq_send(zmq_to_qp_run_socket,trim(msg),len(trim(msg)),ZMQ_SNDMORE) if (rc /= len(trim(msg))) then + print *, trim(msg) zmq_put8_dvector = -1 print *, 'Failed in put_data' return @@ -233,6 +241,7 @@ integer function zmq_put8_dvector(zmq_to_qp_run_socket, worker_id, name, x, size rc = f77_zmq_recv(zmq_to_qp_run_socket,msg,len(msg),0) if (msg(1:rc) /= 'put_data_reply ok') then print *, 'Failed in recv ', rc + print *, trim(msg) zmq_put8_dvector = -1 return endif @@ -262,6 +271,7 @@ integer function zmq_get8_dvector(zmq_to_qp_run_socket, worker_id, name, x, size write(msg,'(A,1X,I8,1X,A200)') 'get_data '//trim(zmq_state), worker_id, name rc = f77_zmq_send(zmq_to_qp_run_socket,trim(msg),len(trim(msg)),0) if (rc /= len(trim(msg))) then + print *, trim(msg) zmq_get8_dvector = -1 print *, irp_here, 'rc /= len(trim(msg))', rc, len(trim(msg)) go to 10 @@ -269,7 +279,8 @@ integer function zmq_get8_dvector(zmq_to_qp_run_socket, worker_id, name, x, size rc = f77_zmq_recv(zmq_to_qp_run_socket,msg,len(msg),0) if (msg(1:14) /= 'get_data_reply') then - print *, irp_here, 'msg(1:14) /= get_data_reply', msg(1:14) + print *, irp_here, 'msg(1:14) /= get_data_reply' + print *, trim(msg) zmq_get8_dvector = -1 go to 10 endif @@ -330,6 +341,7 @@ integer function zmq_put_dmatrix(zmq_to_qp_run_socket, worker_id, name, x, size_ write(msg,'(A,1X,I8,1X,A,I8.8)') 'put_data '//trim(zmq_state), worker_id, trim(name), j rc = f77_zmq_send(zmq_to_qp_run_socket,trim(msg),len(trim(msg)),ZMQ_SNDMORE) if (rc /= len(trim(msg))) then + print *, trim(msg) zmq_put_dmatrix = -1 print *, 'Failed in put_data', rc, j return @@ -344,6 +356,7 @@ integer function zmq_put_dmatrix(zmq_to_qp_run_socket, worker_id, name, x, size_ rc = f77_zmq_recv(zmq_to_qp_run_socket,msg,len(msg),0) if (msg(1:rc) /= 'put_data_reply ok') then + print *, trim(msg) print *, 'Failed in recv ', rc, j zmq_put_dmatrix = -1 return @@ -382,6 +395,7 @@ integer function zmq_get_dmatrix(zmq_to_qp_run_socket, worker_id, name, x, size_ write(msg,'(A,1X,I8,1X,A,I8.8)') 'get_data '//trim(zmq_state), worker_id, trim(name),j rc = f77_zmq_send(zmq_to_qp_run_socket,trim(msg),len(trim(msg)),0) if (rc /= len(trim(msg))) then + print *, trim(msg) zmq_get_dmatrix = -1 print *, irp_here, 'rc /= len(trim(msg))', rc, len(trim(msg)) go to 10 @@ -389,7 +403,8 @@ integer function zmq_get_dmatrix(zmq_to_qp_run_socket, worker_id, name, x, size_ rc = f77_zmq_recv(zmq_to_qp_run_socket,msg,len(msg),0) if (msg(1:14) /= 'get_data_reply') then - print *, irp_here, 'msg(1:14) /= get_data_reply', msg(1:14) + print *, irp_here, 'msg(1:14) /= get_data_reply' + print *, trim(msg) zmq_get_dmatrix = -1 go to 10 endif @@ -444,6 +459,7 @@ integer function zmq_put8_ivector(zmq_to_qp_run_socket, worker_id, name, x, size write(msg,'(A,1X,I8,1X,A200)') 'put_data '//trim(zmq_state), worker_id, name rc = f77_zmq_send(zmq_to_qp_run_socket,trim(msg),len(trim(msg)),ZMQ_SNDMORE) if (rc /= len(trim(msg))) then + print *, trim(msg) zmq_put8_ivector = -1 return endif @@ -456,6 +472,7 @@ integer function zmq_put8_ivector(zmq_to_qp_run_socket, worker_id, name, x, size rc = f77_zmq_recv(zmq_to_qp_run_socket,msg,len(msg),0) if (msg(1:rc) /= 'put_data_reply ok') then + print *, trim(msg) zmq_put8_ivector = -1 return endif @@ -485,12 +502,14 @@ integer function zmq_get8_ivector(zmq_to_qp_run_socket, worker_id, name, x, size write(msg,'(A,1X,I8,1X,A200)') 'get_data '//trim(zmq_state), worker_id, name rc = f77_zmq_send(zmq_to_qp_run_socket,trim(msg),len(trim(msg)),0) if (rc /= len(trim(msg))) then + print *, trim(msg) zmq_get8_ivector = -1 go to 10 endif rc = f77_zmq_recv(zmq_to_qp_run_socket,msg,len(msg),0) if (msg(1:14) /= 'get_data_reply') then + print *, trim(msg) zmq_get8_ivector = -1 go to 10 endif @@ -542,6 +561,7 @@ integer function zmq_put_int(zmq_to_qp_run_socket, worker_id, name, x) write(msg,'(A,1X,I8,1X,A200)') 'put_data '//trim(zmq_state), worker_id, name rc = f77_zmq_send(zmq_to_qp_run_socket,trim(msg),len(trim(msg)),ZMQ_SNDMORE) if (rc /= len(trim(msg))) then + print *, trim(msg) zmq_put_int = -1 return endif @@ -554,6 +574,7 @@ integer function zmq_put_int(zmq_to_qp_run_socket, worker_id, name, x) rc = f77_zmq_recv(zmq_to_qp_run_socket,msg,len(msg),0) if (msg(1:rc) /= 'put_data_reply ok') then + print *, trim(msg) zmq_put_int = -1 return endif @@ -581,12 +602,14 @@ integer function zmq_get_int(zmq_to_qp_run_socket, worker_id, name, x) write(msg,'(A,1X,I8,1X,A200)') 'get_data '//trim(zmq_state), worker_id, name rc = f77_zmq_send(zmq_to_qp_run_socket,trim(msg),len(trim(msg)),0) if (rc /= len(trim(msg))) then + print *, trim(msg) zmq_get_int = -1 go to 10 endif rc = f77_zmq_recv(zmq_to_qp_run_socket,msg,len(msg),0) if (msg(1:14) /= 'get_data_reply') then + print *, trim(msg) zmq_get_int = -1 go to 10 endif @@ -643,12 +666,14 @@ integer function zmq_get_int_nompi(zmq_to_qp_run_socket, worker_id, name, x) write(msg,'(A,1X,I8,1X,A200)') 'get_data '//trim(zmq_state), worker_id, name rc = f77_zmq_send(zmq_to_qp_run_socket,trim(msg),len(trim(msg)),0) if (rc /= len(trim(msg))) then + print *, trim(msg) zmq_get_int_nompi = -1 go to 10 endif rc = f77_zmq_recv(zmq_to_qp_run_socket,msg,len(msg),0) if (msg(1:14) /= 'get_data_reply') then + print *, trim(msg) zmq_get_int_nompi = -1 go to 10 endif @@ -690,6 +715,7 @@ integer function zmq_put_i8matrix(zmq_to_qp_run_socket, worker_id, name, x, size write(msg,'(A,1X,I8,1X,A,I8.8)') 'put_data '//trim(zmq_state), worker_id, trim(name), j rc = f77_zmq_send(zmq_to_qp_run_socket,trim(msg),len(trim(msg)),ZMQ_SNDMORE) if (rc /= len(trim(msg))) then + print *, trim(msg) zmq_put_i8matrix = -1 print *, irp_here, 'Failed in put_data', rc, j return @@ -705,6 +731,7 @@ integer function zmq_put_i8matrix(zmq_to_qp_run_socket, worker_id, name, x, size rc = f77_zmq_recv(zmq_to_qp_run_socket,msg,len(msg),0) if (msg(1:rc) /= 'put_data_reply ok') then print *, irp_here, 'Failed in recv ', rc, j + print *, trim(msg) zmq_put_i8matrix = -1 return endif @@ -750,6 +777,7 @@ integer function zmq_get_i8matrix(zmq_to_qp_run_socket, worker_id, name, x, size rc = f77_zmq_recv(zmq_to_qp_run_socket,msg,len(msg),0) if (msg(1:14) /= 'get_data_reply') then print *, irp_here, 'msg(1:14) /= get_data_reply', msg(1:14) + print *, trim(msg) zmq_get_i8matrix = -1 go to 10 endif @@ -813,6 +841,7 @@ integer function zmq_put_imatrix(zmq_to_qp_run_socket, worker_id, name, x, size_ write(msg,'(A,1X,I8,1X,A,I8.8)') 'put_data '//trim(zmq_state), worker_id, trim(name), j rc = f77_zmq_send(zmq_to_qp_run_socket,trim(msg),len(trim(msg)),ZMQ_SNDMORE) if (rc /= len(trim(msg))) then + print *, trim(msg) zmq_put_imatrix = -1 print *, irp_here, 'Failed in put_data', rc, j return @@ -827,6 +856,7 @@ integer function zmq_put_imatrix(zmq_to_qp_run_socket, worker_id, name, x, size_ rc = f77_zmq_recv(zmq_to_qp_run_socket,msg,len(msg),0) if (msg(1:rc) /= 'put_data_reply ok') then + print *, trim(msg) print *, irp_here, 'Failed in recv ', rc, j zmq_put_imatrix = -1 return @@ -865,6 +895,7 @@ integer function zmq_get_imatrix(zmq_to_qp_run_socket, worker_id, name, x, size_ write(msg,'(A,1X,I8,1X,A,I8.8)') 'get_data '//trim(zmq_state), worker_id, trim(name),j rc = f77_zmq_send(zmq_to_qp_run_socket,trim(msg),len(trim(msg)),0) if (rc /= len(trim(msg))) then + print *, trim(msg) zmq_get_imatrix = -1 print *, irp_here, 'rc /= len(trim(msg))', rc, len(trim(msg)) go to 10 @@ -872,6 +903,7 @@ integer function zmq_get_imatrix(zmq_to_qp_run_socket, worker_id, name, x, size_ rc = f77_zmq_recv(zmq_to_qp_run_socket,msg,len(msg),0) if (msg(1:14) /= 'get_data_reply') then + print *, trim(msg) print *, irp_here, 'msg(1:14) /= get_data_reply', msg(1:14) zmq_get_imatrix = -1 go to 10