From a38b57ac34c3f5957358ea05ba22230f03df613d Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Wed, 9 Jan 2019 11:31:59 +0100 Subject: [PATCH] Better estimate of the PT2 memory --- etc/ezfio.rc | 2 + etc/irpf90.rc | 2 + etc/libraries.rc | 2 + etc/local.rc | 15 ++++++ etc/network.rc | 5 -- etc/ninja.rc | 2 + etc/ocaml.rc | 2 + etc/paths.rc | 2 + etc/qp.rc | 2 + src/cipsi/cipsi.irp.f | 3 ++ src/cipsi/pt2_stoch_routines.irp.f | 54 +++++++++++++++---- src/cipsi/stochastic_cipsi.irp.f | 3 ++ src/davidson/davidson_parallel.irp.f | 4 +- .../diagonalization_hs2_dressed.irp.f | 51 ++++++++++++++---- src/fci/environment.irp.f | 14 +++++ 15 files changed, 137 insertions(+), 26 deletions(-) create mode 100644 etc/local.rc delete mode 100644 etc/network.rc create mode 100644 src/fci/environment.irp.f diff --git a/etc/ezfio.rc b/etc/ezfio.rc index a75a8c9e..67a8a008 100644 --- a/etc/ezfio.rc +++ b/etc/ezfio.rc @@ -1,3 +1,5 @@ +# Configuration of EZFIO package + export QP_EZFIO=${QP_ROOT}/external/ezfio if [[ -f ${QP_EZFIO}/Bash/ezfio.sh ]]; then diff --git a/etc/irpf90.rc b/etc/irpf90.rc index 82f0e751..f5dbffae 100644 --- a/etc/irpf90.rc +++ b/etc/irpf90.rc @@ -1,3 +1,5 @@ +# Configuration of IRPF90 package + export IRPF90=${QP_ROOT}/bin/irpf90 # Load irpman shell completion diff --git a/etc/libraries.rc b/etc/libraries.rc index c41daac1..223322df 100644 --- a/etc/libraries.rc +++ b/etc/libraries.rc @@ -1,3 +1,5 @@ +# Configuration of additional libraries required for modules + QP_LIB="" # Include here the optional external libraries to link with your binaries, diff --git a/etc/local.rc b/etc/local.rc new file mode 100644 index 00000000..552e1a9b --- /dev/null +++ b/etc/local.rc @@ -0,0 +1,15 @@ +# Configuration specific to the local machine + +# Maximum allowed memory per node +# export QP_MAXMEM=64 + +# Target number of threads for Davidson's algorithm +# export QP_NTHREADS_DAVIDSON=32 + +# Target number of threads for the computation of the PT2 +# export 
QP_NTHREADS_PT2=32 + +# Name of the network interface to be chosen +# export QP_NIC=ib0 + + diff --git a/etc/network.rc b/etc/network.rc deleted file mode 100644 index 1fdb91ef..00000000 --- a/etc/network.rc +++ /dev/null @@ -1,5 +0,0 @@ - # Choose the correct network interface if the default one is incorrect -# export QP_NIC=ib0 -# export QP_NIC=eth0 - - diff --git a/etc/ninja.rc b/etc/ninja.rc index 6f717599..a5d5f4a9 100644 --- a/etc/ninja.rc +++ b/etc/ninja.rc @@ -1 +1,3 @@ +# Configuration for the Ninja package + export NINJA=${QP_ROOT}/bin/ninja diff --git a/etc/ocaml.rc b/etc/ocaml.rc index dd1bc127..1a1e5612 100644 --- a/etc/ocaml.rc +++ b/etc/ocaml.rc @@ -1,3 +1,5 @@ +# Configuration for the OCaml compiler + if [[ -z $OPAMROOT ]] then diff --git a/etc/paths.rc b/etc/paths.rc index 3ab5b547..33104af6 100644 --- a/etc/paths.rc +++ b/etc/paths.rc @@ -1,3 +1,5 @@ +# Configuration of all the paths to executables and libraries + if [[ -z $QP_PYTHON ]] then diff --git a/etc/qp.rc b/etc/qp.rc index 63104dc2..dcf9dc35 100644 --- a/etc/qp.rc +++ b/etc/qp.rc @@ -1,3 +1,5 @@ +# Configuration of the qp shell command + if [[ "$(ps -p $$ -ocomm=)" == "zsh" ]] ; then autoload bashcompinit bashcompinit diff --git a/src/cipsi/cipsi.irp.f b/src/cipsi/cipsi.irp.f index ecbab7c6..8e93297d 100644 --- a/src/cipsi/cipsi.irp.f +++ b/src/cipsi/cipsi.irp.f @@ -46,6 +46,9 @@ subroutine run_cipsi psi_coef = psi_coef_sorted N_det = N_det_max soft_touch N_det psi_det psi_coef + if (s2_eig) then + call make_s2_eigenfunction + endif call diagonalize_CI call save_wavefunction endif diff --git a/src/cipsi/pt2_stoch_routines.irp.f b/src/cipsi/pt2_stoch_routines.irp.f index b9a81fb6..e5774e1f 100644 --- a/src/cipsi/pt2_stoch_routines.irp.f +++ b/src/cipsi/pt2_stoch_routines.irp.f @@ -185,7 +185,7 @@ subroutine ZMQ_pt2(E, pt2,relative_error, error, variance, norm, N_in) ipos=1 do i= 1, N_det_generators do j=1,pt2_F(pt2_J(i)) - write(task(ipos:ipos+30),'(I9,1X,I9,1X,I9,''|'')') j, pt2_J(i), N + 
write(task(ipos:ipos+30),'(I9,1X,I9,1X,I9,''|'')') j, pt2_J(i), N_in ipos += 30 if (ipos > 300000-30) then if (add_task_to_taskserver(zmq_to_qp_run_socket,trim(task(1:ipos))) == -1) then @@ -213,15 +213,49 @@ subroutine ZMQ_pt2(E, pt2,relative_error, error, variance, norm, N_in) endif + double precision :: mem_collector, mem, rss + + call resident_memory(rss) + + mem_collector = 8.d0 * & ! bytes + ( 1.d0*pt2_n_tasks_max & ! task_id, index + + 0.635d0*N_det_generators & ! f,d + + 3.d0*N_det_generators*N_states & ! eI, vI, nI + + 3.d0*pt2_n_tasks_max*N_states & ! eI_task, vI_task, nI_task + + 4.d0*(pt2_N_teeth+1) & ! S, S2, T2, T3 + + 1.d0*(N_int*2.d0*N + N) & ! selection buffer + + 1.d0*(N_int*2.d0*N + N) & ! sort selection buffer + ) / 1024.d0**3 + integer :: nproc_target - nproc_target = nproc - double precision :: mem - mem = 8.d0 * N_det * (N_int * 2.d0 * 3.d0 + 3.d0 + 5.d0) / (1024.d0**3) - call write_double(6,mem,'Estimated memory/thread (Gb)') - if (qp_max_mem > 0) then - nproc_target = max(1,int(dble(qp_max_mem)/mem)) - nproc_target = min(nproc_target,nproc) - endif + nproc_target = nthreads_pt2 + + do + mem = mem_collector + & ! + nproc_target * 8.d0 * & ! bytes + ( 0.5d0*pt2_n_tasks_max & ! task_id + + 64.d0*pt2_n_tasks_max & ! task + + 3.d0*pt2_n_tasks_max*N_states & ! pt2, variance, norm + + 1.d0*pt2_n_tasks_max & ! i_generator, subset + + 2.d0*(N_int*2.d0*N_in + N_in) & ! selection buffers + + 1.d0*(N_int*2.d0*N_in + N_in) & ! sort/merge selection buffers + ) / 1024.d0**3 + + if (nproc_target == 0) then + call check_mem(mem,irp_here) + nproc_target = 1 + exit + endif + + if (mem+rss < qp_max_mem) then + exit + endif + + nproc_target = nproc_target - 1 + + enddo + call write_int(6,nproc_target,'Number of threads for PT2') + call write_double(6,mem,'Memory (Gb)') call omp_set_nested(.false.) 
@@ -329,6 +363,8 @@ subroutine pt2_collector(zmq_socket_pull, E, relative_error, pt2, error, & rss += memory_of_double(pt2_N_teeth+1)*4.d0 call check_mem(rss,irp_here) + ! If an allocation is added here, the estimate of the memory should also be + ! updated in ZMQ_pt2 allocate(task_id(pt2_n_tasks_max), index(pt2_n_tasks_max), f(N_det_generators)) allocate(d(N_det_generators+1)) allocate(eI(N_states, N_det_generators), eI_task(N_states, pt2_n_tasks_max)) diff --git a/src/cipsi/stochastic_cipsi.irp.f b/src/cipsi/stochastic_cipsi.irp.f index 64822dcd..59fb227c 100644 --- a/src/cipsi/stochastic_cipsi.irp.f +++ b/src/cipsi/stochastic_cipsi.irp.f @@ -48,6 +48,9 @@ subroutine run_stochastic_cipsi psi_coef = psi_coef_sorted N_det = N_det_max soft_touch N_det psi_det psi_coef + if (s2_eig) then + call make_s2_eigenfunction + endif call diagonalize_CI call save_wavefunction endif diff --git a/src/davidson/davidson_parallel.irp.f b/src/davidson/davidson_parallel.irp.f index 35ff6240..505393a8 100644 --- a/src/davidson/davidson_parallel.irp.f +++ b/src/davidson/davidson_parallel.irp.f @@ -452,10 +452,10 @@ BEGIN_PROVIDER [ integer, nthreads_davidson ] END_DOC nthreads_davidson = nproc character*(32) :: env - call getenv('NTHREADS_DAVIDSON',env) + call getenv('QP_NTHREADS_DAVIDSON',env) if (trim(env) /= '') then read(env,*) nthreads_davidson - call write_int(6,nthreads_davidson,'Number of threads for ') + call write_int(6,nthreads_davidson,'Target number of threads for ') endif END_PROVIDER diff --git a/src/davidson/diagonalization_hs2_dressed.irp.f b/src/davidson/diagonalization_hs2_dressed.irp.f index 52c31c2f..8faf0c0f 100644 --- a/src/davidson/diagonalization_hs2_dressed.irp.f +++ b/src/davidson/diagonalization_hs2_dressed.irp.f @@ -115,7 +115,7 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_ integer :: iter2, itertot double precision, allocatable :: W(:,:), U(:,:), S(:,:), overlap(:,:) double precision, allocatable :: y(:,:), h(:,:), 
lambda(:), s2(:) - double precision, allocatable :: c(:), s_(:,:), s_tmp(:,:) + double precision, allocatable :: s_(:,:), s_tmp(:,:) double precision :: diag_h_mat_elem double precision, allocatable :: residual_norm(:) character*(16384) :: write_buffer @@ -137,6 +137,13 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_ itermax = max(3,min(davidson_sze_max, sze/N_st_diag)) itertot = 0 + if (state_following) then + allocate(overlap(N_st_diag*itermax, N_st_diag*itermax)) + else + allocate(overlap(1,1)) ! avoid 'if' for deallocate + endif + overlap = 0.d0 + PROVIDE nuclear_repulsion expected_s2 psi_bilinear_matrix_order psi_bilinear_matrix_order_reverse call write_time(6) @@ -149,25 +156,51 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_ call write_int(6,N_st,'Number of states') call write_int(6,N_st_diag,'Number of states in diagonalization') call write_int(6,sze,'Number of determinants') + + ! Find max number of cores to fit in memory + ! ----------------------------------------- + nproc_target = nproc double precision :: rss + integer :: maxab + maxab = max(N_det_alpha_unique, N_det_beta_unique)+1 + call resident_memory(rss) - r1 = 8.d0*(3.d0*dble(sze*N_st_diag*itermax+5.d0*(N_st_diag*itermax)**2 & - + 3.d0*(N_st_diag*itermax)+nproc*(4.d0*N_det_alpha_unique+2.d0*N_st_diag*sze)))/(1024.d0**3) - do while (r1+rss > qp_max_mem) - nproc_target = nproc_target - 1 - r1 = 8.d0*(3.d0*dble(sze*N_st_diag*itermax+5.d0*(N_st_diag*itermax)**2 & - + 3.d0*(N_st_diag*itermax)+nproc_target*(4.d0*N_det_alpha_unique+2.d0*N_st_diag*sze)))/(1024.d0**3) + do + r1 = 8.d0 * &! bytes + ( 3.d0*(dble(sze)*(N_st_diag*itermax)) &! W,U,S + + 4.d0*(N_st_diag*itermax)**2 &! h,y,s_,s_tmp + + 2.d0*(N_st_diag*itermax) &! s2,lambda + + 1.d0*(N_st_diag) &! residual_norm + ! In H_S2_u_0_nstates_zmq + + 3.d0*(N_st_diag*N_det) &! u_t, v_t, s_t on collector + + 3.d0*(N_st_diag*N_det) &! u_t, v_t, s_t on slave + + 0.5d0*maxab &! 
idx0 in H_S2_u_0_nstates_openmp_work_* + + nproc_target * &! In OMP section + ( 1.d0*(N_int*maxab) &! buffer + + 3.5d0*(maxab) ) &! singles_a, singles_b, doubles, idx + ) / 1024.d0**3 + if (nproc_target == 0) then call check_mem(r1,irp_here) nproc_target = 1 exit endif + + if (r1+rss < qp_max_mem) then + exit + endif + + nproc_target = nproc_target - 1 + enddo nthreads_davidson = nproc_target TOUCH nthreads_davidson call write_int(6,nproc_target,'Number of threads for diagonalization') call write_double(6, r1, 'Memory(Gb)') + + !--------------- + write(6,'(A)') '' write_buffer = '=====' do i=1,N_st @@ -198,9 +231,7 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_ s_(N_st_diag*itermax,N_st_diag*itermax), & s_tmp(N_st_diag*itermax,N_st_diag*itermax), & residual_norm(N_st_diag), & - c(N_st_diag*itermax), & s2(N_st_diag*itermax), & - overlap(N_st_diag*itermax, N_st_diag*itermax), & lambda(N_st_diag*itermax)) h = 0.d0 @@ -503,7 +534,7 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_ deallocate ( & W, residual_norm, & U, overlap, & - c, S, & + S, & h, & y, s_, s_tmp, & lambda & diff --git a/src/fci/environment.irp.f b/src/fci/environment.irp.f new file mode 100644 index 00000000..5c0e0820 --- /dev/null +++ b/src/fci/environment.irp.f @@ -0,0 +1,14 @@ +BEGIN_PROVIDER [ integer, nthreads_pt2 ] + implicit none + BEGIN_DOC + ! Number of threads for the computation of the PT2 + END_DOC + nthreads_pt2 = nproc + character*(32) :: env + call getenv('QP_NTHREADS_PT2',env) + if (trim(env) /= '') then + read(env,*) nthreads_pt2 + call write_int(6,nthreads_pt2,'Target number of threads for PT2') + endif +END_PROVIDER +