diff --git a/plugins/Full_CI_ZMQ/pt2_stoch_routines.irp.f b/plugins/Full_CI_ZMQ/pt2_stoch_routines.irp.f index e6758e8f..36ad63b2 100644 --- a/plugins/Full_CI_ZMQ/pt2_stoch_routines.irp.f +++ b/plugins/Full_CI_ZMQ/pt2_stoch_routines.irp.f @@ -139,7 +139,17 @@ subroutine ZMQ_pt2(E, pt2,relative_error, absolute_error, error) endif - !$OMP PARALLEL DEFAULT(shared) NUM_THREADS(nproc+1) & + integer :: nproc_target + nproc_target = nproc + double precision :: mem + mem = 8.d0 * N_det * (N_int * 2.d0 * 3.d0 + 3.d0 + 5.d0) / (1024.d0**3) + call write_double(6,mem,'Estimated memory/thread (Gb)') + if (qp_max_mem > 0) then + nproc_target = max(1,int(dble(qp_max_mem)/mem)) + nproc_target = min(nproc_target,nproc) + endif + + !$OMP PARALLEL DEFAULT(shared) NUM_THREADS(nproc_target+1) & !$OMP PRIVATE(i) i = omp_get_thread_num() if (i==0) then diff --git a/plugins/Full_CI_ZMQ/selection_slave.irp.f b/plugins/Full_CI_ZMQ/selection_slave.irp.f index c1712b2d..8036985a 100644 --- a/plugins/Full_CI_ZMQ/selection_slave.irp.f +++ b/plugins/Full_CI_ZMQ/selection_slave.irp.f @@ -29,11 +29,13 @@ subroutine run_wf integer(ZMQ_PTR), external :: new_zmq_to_qp_run_socket integer(ZMQ_PTR) :: zmq_to_qp_run_socket double precision :: energy(N_states) - character*(64) :: states(2) + character*(64) :: states(3) + character*(64) :: old_state integer :: rc, i, ierr double precision :: t0, t1 integer, external :: zmq_get_dvector, zmq_get_N_det_generators + integer, external :: zmq_get_ivector integer, external :: zmq_get_psi, zmq_get_N_det_selectors integer, external :: zmq_get_N_states_diag @@ -41,30 +43,61 @@ subroutine run_wf zmq_context = f77_zmq_ctx_new () states(1) = 'selection' - states(2) = 'pt2' + states(2) = 'davidson' + states(3) = 'pt2' + old_state = 'Waiting' zmq_to_qp_run_socket = new_zmq_to_qp_run_socket() do - call wait_for_states(states,zmq_state,size(states)) - print *, trim(zmq_state) + if (mpi_master) then + call wait_for_states(states,zmq_state,size(states)) + if (zmq_state(1:64) == old_state(1:64)) then + call sleep(1) + cycle + else + old_state(1:64) = zmq_state(1:64) + endif + print *, trim(zmq_state) + endif + + IRP_IF MPI + call MPI_BCAST (zmq_state, 128, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + if (ierr /= MPI_SUCCESS) then + print *, irp_here, 'error in broadcast of zmq_state' + endif + IRP_ENDIF if(zmq_state(1:7) == 'Stopped') then - exit + endif - else if (zmq_state(1:9) == 'selection') then + + if (zmq_state(1:9) == 'selection') then ! Selection ! --------- call wall_time(t0) if (zmq_get_psi(zmq_to_qp_run_socket,1) == -1) cycle + if (zmq_get_dvector(zmq_to_qp_run_socket,1,'threshold_generators',threshold_generators,1) == -1) cycle + if (zmq_get_dvector(zmq_to_qp_run_socket,1,'threshold_selectors',threshold_selectors,1) == -1) cycle + if (zmq_get_dvector(zmq_to_qp_run_socket,1,'energy',energy,N_states) == -1) cycle if (zmq_get_N_det_generators (zmq_to_qp_run_socket, 1) == -1) cycle if (zmq_get_N_det_selectors(zmq_to_qp_run_socket, 1) == -1) cycle - if (zmq_get_dvector(zmq_to_qp_run_socket,1,'energy',energy,N_states) == -1) cycle + if (zmq_get_dvector(zmq_to_qp_run_socket,1,'state_average_weight',state_average_weight,N_states) == -1) cycle + psi_energy(1:N_states) = energy(1:N_states) + TOUCH psi_energy state_average_weight threshold_selectors threshold_generators + if (mpi_master) then + print *, 'N_det', N_det + print *, 'N_det_generators', N_det_generators + print *, 'N_det_selectors', N_det_selectors + print *, 'psi_energy', psi_energy + print *, 'pt2_stoch_istate', pt2_stoch_istate + print *, 'state_average_weight', state_average_weight + endif call wall_time(t1) call write_double(6,(t1-t0),'Broadcast time') @@ -73,42 +106,50 @@ subroutine run_wf call run_selection_slave(0,i,energy) !$OMP END PARALLEL print *, 'Selection done' + IRP_IF MPI + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + if (ierr /= MPI_SUCCESS) then + print *, irp_here, 'error in barrier' + endif + IRP_ENDIF + print *, 'All selection done' + if (N_det < 100000) then + exit + endif - else if (zmq_state(1:3) == 'pt2') then + else if (zmq_state(1:8) == 'davidson') then - ! PT2 - ! --- + ! Davidson + ! -------- - print *, 'PT2' call wall_time(t0) if (zmq_get_psi(zmq_to_qp_run_socket,1) == -1) cycle - if (zmq_get_dvector(zmq_to_qp_run_socket,1,'energy',energy,N_states) == -1) cycle - if (zmq_get_N_det_generators (zmq_to_qp_run_socket, 1) == -1) cycle - if (zmq_get_N_det_selectors(zmq_to_qp_run_socket, 1) == -1) cycle + if (zmq_get_N_states_diag(zmq_to_qp_run_socket,1) == -1) cycle + if (zmq_get_dvector(zmq_to_qp_run_socket,1,'energy',energy,N_states_diag) == -1) cycle call wall_time(t1) - call write_double(6,(t1-t0),'Broadcast time') + if (mpi_master) then + call write_double(6,(t1-t0),'Broadcast time') + endif - logical :: lstop - lstop = .False. - !$OMP PARALLEL PRIVATE(i) - i = omp_get_thread_num() - call run_pt2_slave(0,i,energy,lstop) - !$OMP END PARALLEL - print *, 'PT2 done' + call omp_set_nested(.True.) + call davidson_slave_tcp(0) + call omp_set_nested(.False.) + print *, 'Davidson done' + IRP_IF MPI + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + if (ierr /= MPI_SUCCESS) then + print *, irp_here, 'error in barrier' + endif + IRP_ENDIF + print *, 'All Davidson done' + exit endif - IRP_IF MPI - call MPI_BARRIER(MPI_COMM_WORLD, ierr) - if (ierr /= MPI_SUCCESS) then - print *, irp_here, 'error in barrier' - endif - IRP_ENDIF - end do IRP_IF MPI - call MPI_finalize(i) + call MPI_finalize(ierr) IRP_ENDIF end diff --git a/plugins/Full_CI_ZMQ/zmq_selection.irp.f b/plugins/Full_CI_ZMQ/zmq_selection.irp.f index 55cdb718..6bcc548a 100644 --- a/plugins/Full_CI_ZMQ/zmq_selection.irp.f +++ b/plugins/Full_CI_ZMQ/zmq_selection.irp.f @@ -97,7 +97,17 @@ subroutine ZMQ_selection(N_in, pt2) print *, irp_here, ': Failed in zmq_set_running' endif - !$OMP PARALLEL DEFAULT(shared) SHARED(b, pt2) PRIVATE(i) NUM_THREADS(nproc+1) + integer :: nproc_target + nproc_target = nproc + double precision :: mem + mem = 8.d0 * N_det * (N_int * 2.d0 * 3.d0 + 3.d0 + 5.d0) / (1024.d0**3) + call write_double(6,mem,'Estimated memory/thread (Gb)') + if (qp_max_mem > 0) then + nproc_target = max(1,int(dble(qp_max_mem)/mem)) + nproc_target = min(nproc_target,nproc) + endif + + !$OMP PARALLEL DEFAULT(shared) SHARED(b, pt2) PRIVATE(i) NUM_THREADS(nproc_target+1) i = omp_get_thread_num() if (i==0) then call selection_collector(zmq_socket_pull, b, N, pt2) diff --git a/src/Davidson/davidson_parallel.irp.f b/src/Davidson/davidson_parallel.irp.f index ce6d670a..15eede23 100644 --- a/src/Davidson/davidson_parallel.irp.f +++ b/src/Davidson/davidson_parallel.irp.f @@ -403,8 +403,8 @@ BEGIN_PROVIDER [ integer, nthreads_davidson ] call getenv('NTHREADS_DAVIDSON',env) if (trim(env) /= '') then read(env,*) nthreads_davidson + call write_int(6,nthreads_davidson,'Number of threads for ') endif - call write_int(6,nthreads_davidson,'Number of threads for Diagonalization') END_PROVIDER diff --git a/src/Davidson/diagonalization_hs2_dressed.irp.f b/src/Davidson/diagonalization_hs2_dressed.irp.f index ad75f583..491ddea7 100644 --- a/src/Davidson/diagonalization_hs2_dressed.irp.f +++ b/src/Davidson/diagonalization_hs2_dressed.irp.f @@ -138,6 +138,7 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_ integer :: shift, shift2, itermax, istate double precision :: r1, r2 logical :: state_ok(N_st_diag*davidson_sze_max) + integer :: nproc_target include 'constants.include.F' !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: U, W, S, y, h, lambda @@ -162,8 +163,22 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_ call write_int(6,N_st,'Number of states') call write_int(6,N_st_diag,'Number of states in diagonalization') call write_int(6,sze,'Number of determinants') + nproc_target = nproc r1 = 8.d0*(3.d0*dble(sze*N_st_diag*itermax+5.d0*(N_st_diag*itermax)**2 & - + 4.d0*(N_st_diag*itermax)+nproc*(4.d0*N_det_alpha_unique+2.d0*N_st_diag*sze)))/(1024.d0**3) + + 3.d0*(N_st_diag*itermax)+nproc*(4.d0*N_det_alpha_unique+2.d0*N_st_diag*sze)))/(1024.d0**3) + if (qp_max_mem > 0) then + do while (r1 > qp_max_mem) + nproc_target = nproc_target - 1 + r1 = 8.d0*(3.d0*dble(sze*N_st_diag*itermax+5.d0*(N_st_diag*itermax)**2 & + + 3.d0*(N_st_diag*itermax)+nproc_target*(4.d0*N_det_alpha_unique+2.d0*N_st_diag*sze)))/(1024.d0**3) + if (nproc_target == 0) then + nproc_target = 1 + exit + endif + enddo + call omp_set_num_threads(nproc_target) + call write_int(6,nproc_target,'Number of threads for diagonalization') + endif call write_double(6, r1, 'Memory(Gb)') write(6,'(A)') '' write_buffer = '=====' @@ -512,6 +527,7 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_ y, s_, s_tmp, & lambda & ) + call omp_set_num_threads(nproc) end diff --git a/src/Utils/util.irp.f b/src/Utils/util.irp.f index e4b21f2e..9f4f2168 100644 --- a/src/Utils/util.irp.f +++ b/src/Utils/util.irp.f @@ -286,6 +286,23 @@ BEGIN_PROVIDER [ integer, nproc ] !$OMP END PARALLEL END_PROVIDER +BEGIN_PROVIDER [ integer, qp_max_mem ] + implicit none + BEGIN_DOC + ! Maximum memory in Gb + END_DOC + character*(128) :: env + + qp_max_mem = -1 + call getenv('QP_MAXMEM',env) + if (trim(env) /= '') then + read(env,*) qp_max_mem + call write_int(6,qp_max_mem,'Target maximum memory') + endif + + +END_PROVIDER + double precision function u_dot_v(u,v,sze) implicit none diff --git a/src/ZMQ/utils.irp.f b/src/ZMQ/utils.irp.f index a77f974a..97fded04 100644 --- a/src/ZMQ/utils.irp.f +++ b/src/ZMQ/utils.irp.f @@ -246,7 +246,7 @@ IRP_ENDIF ! stop 'Unable to set ZMQ_RCVBUF on pull socket' ! endif - rc = f77_zmq_setsockopt(new_zmq_pull_socket,ZMQ_RCVHWM,1,4) + rc = f77_zmq_setsockopt(new_zmq_pull_socket,ZMQ_RCVHWM,nproc,4) if (rc /= 0) then stop 'Unable to set ZMQ_RCVHWM on pull socket' endif @@ -323,7 +323,7 @@ IRP_ENDIF stop 'Unable to set ZMQ_LINGER on push socket' endif - rc = f77_zmq_setsockopt(new_zmq_push_socket,ZMQ_SNDHWM,1,4) + rc = f77_zmq_setsockopt(new_zmq_push_socket,ZMQ_SNDHWM,3,4) if (rc /= 0) then stop 'Unable to set ZMQ_SNDHWM on push socket' endif