10
0
mirror of https://github.com/LCPQ/quantum_package synced 2024-06-02 11:25:26 +02:00

Better estimate of the PT2 memory

This commit is contained in:
Anthony Scemama 2019-01-09 11:31:59 +01:00
parent bd72bace4e
commit a38b57ac34
15 changed files with 137 additions and 26 deletions

View File

@ -1,3 +1,5 @@
# Configuration of EZFIO package
export QP_EZFIO=${QP_ROOT}/external/ezfio
if [[ -f ${QP_EZFIO}/Bash/ezfio.sh ]]; then

View File

@ -1,3 +1,5 @@
# Configuration of IRPF90 package
export IRPF90=${QP_ROOT}/bin/irpf90
# Load irpman shell completion

View File

@ -1,3 +1,5 @@
# Configuration of additional libraries required for modules
QP_LIB=""
# Include here the optional external libraries to link with your binaries,

15
etc/local.rc Normal file
View File

@ -0,0 +1,15 @@
# Configuration specific to the local machine
# Maximum allowed memory per node
# export QP_MAXMEM=64
# Target number of threads for Davidson's algorithm
# export QP_NTHREADS_DAVIDSON=32
# Target number of threads for the computation of the PT2
# export QP_NTHREADS_PT2=32
# Name of the network interface to be chosen
# export QP_NIC=ib0

View File

@ -1,5 +0,0 @@
# Choose the correct network interface if the default one is incorrect
# export QP_NIC=ib0
# export QP_NIC=eth0

View File

@ -1 +1,3 @@
# Configuration for the Ninja package
export NINJA=${QP_ROOT}/bin/ninja

View File

@ -1,3 +1,5 @@
# Configuration for the OCaml compiler
if [[ -z $OPAMROOT ]]
then

View File

@ -1,3 +1,5 @@
# Configuration of all the paths to executables and libraries
if [[ -z $QP_PYTHON ]]
then

View File

@ -1,3 +1,5 @@
# Configuration of the qp shell command
if [[ "$(ps -p $$ -ocomm=)" == "zsh" ]] ; then
autoload bashcompinit
bashcompinit

View File

@ -46,6 +46,9 @@ subroutine run_cipsi
psi_coef = psi_coef_sorted
N_det = N_det_max
soft_touch N_det psi_det psi_coef
if (s2_eig) then
call make_s2_eigenfunction
endif
call diagonalize_CI
call save_wavefunction
endif

View File

@ -185,7 +185,7 @@ subroutine ZMQ_pt2(E, pt2,relative_error, error, variance, norm, N_in)
ipos=1
do i= 1, N_det_generators
do j=1,pt2_F(pt2_J(i))
write(task(ipos:ipos+30),'(I9,1X,I9,1X,I9,''|'')') j, pt2_J(i), N
write(task(ipos:ipos+30),'(I9,1X,I9,1X,I9,''|'')') j, pt2_J(i), N_in
ipos += 30
if (ipos > 300000-30) then
if (add_task_to_taskserver(zmq_to_qp_run_socket,trim(task(1:ipos))) == -1) then
@ -213,15 +213,49 @@ subroutine ZMQ_pt2(E, pt2,relative_error, error, variance, norm, N_in)
endif
double precision :: mem_collector, mem, rss
call resident_memory(rss)
mem_collector = 8.d0 * & ! bytes
( 1.d0*pt2_n_tasks_max & ! task_id, index
+ 0.635d0*N_det_generators & ! f,d
+ 3.d0*N_det_generators*N_states & ! eI, vI, nI
+ 3.d0*pt2_n_tasks_max*N_states & ! eI_task, vI_task, nI_task
+ 4.d0*(pt2_N_teeth+1) & ! S, S2, T2, T3
+ 1.d0*(N_int*2.d0*N + N) & ! selection buffer
+ 1.d0*(N_int*2.d0*N + N) & ! sort selection buffer
) / 1024.d0**3
integer :: nproc_target
nproc_target = nproc
double precision :: mem
mem = 8.d0 * N_det * (N_int * 2.d0 * 3.d0 + 3.d0 + 5.d0) / (1024.d0**3)
call write_double(6,mem,'Estimated memory/thread (Gb)')
if (qp_max_mem > 0) then
nproc_target = max(1,int(dble(qp_max_mem)/mem))
nproc_target = min(nproc_target,nproc)
endif
nproc_target = nthreads_pt2
do
mem = mem_collector + & !
nproc_target * 8.d0 * & ! bytes
( 0.5d0*pt2_n_tasks_max & ! task_id
+ 64.d0*pt2_n_tasks_max & ! task
+ 3.d0*pt2_n_tasks_max*N_states & ! pt2, variance, norm
+ 1.d0*pt2_n_tasks_max & ! i_generator, subset
+ 2.d0*(N_int*2.d0*N_in + N_in) & ! selection buffers
+ 1.d0*(N_int*2.d0*N_in + N_in) & ! sort/merge selection buffers
) / 1024.d0**3
if (nproc_target == 0) then
call check_mem(mem,irp_here)
nproc_target = 1
exit
endif
if (mem+rss < qp_max_mem) then
exit
endif
nproc_target = nproc_target - 1
enddo
call write_int(6,nproc_target,'Number of threads for PT2')
call write_double(6,mem,'Memory (Gb)')
call omp_set_nested(.false.)
@ -329,6 +363,8 @@ subroutine pt2_collector(zmq_socket_pull, E, relative_error, pt2, error, &
rss += memory_of_double(pt2_N_teeth+1)*4.d0
call check_mem(rss,irp_here)
! If an allocation is added here, the estimate of the memory should also be
! updated in ZMQ_pt2
allocate(task_id(pt2_n_tasks_max), index(pt2_n_tasks_max), f(N_det_generators))
allocate(d(N_det_generators+1))
allocate(eI(N_states, N_det_generators), eI_task(N_states, pt2_n_tasks_max))

View File

@ -48,6 +48,9 @@ subroutine run_stochastic_cipsi
psi_coef = psi_coef_sorted
N_det = N_det_max
soft_touch N_det psi_det psi_coef
if (s2_eig) then
call make_s2_eigenfunction
endif
call diagonalize_CI
call save_wavefunction
endif

View File

@ -452,10 +452,10 @@ BEGIN_PROVIDER [ integer, nthreads_davidson ]
END_DOC
nthreads_davidson = nproc
character*(32) :: env
call getenv('NTHREADS_DAVIDSON',env)
call getenv('QP_NTHREADS_DAVIDSON',env)
if (trim(env) /= '') then
read(env,*) nthreads_davidson
call write_int(6,nthreads_davidson,'Number of threads for <Psi|H|Psi>')
call write_int(6,nthreads_davidson,'Target number of threads for <Psi|H|Psi>')
endif
END_PROVIDER

View File

@ -115,7 +115,7 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_
integer :: iter2, itertot
double precision, allocatable :: W(:,:), U(:,:), S(:,:), overlap(:,:)
double precision, allocatable :: y(:,:), h(:,:), lambda(:), s2(:)
double precision, allocatable :: c(:), s_(:,:), s_tmp(:,:)
double precision, allocatable :: s_(:,:), s_tmp(:,:)
double precision :: diag_h_mat_elem
double precision, allocatable :: residual_norm(:)
character*(16384) :: write_buffer
@ -137,6 +137,13 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_
itermax = max(3,min(davidson_sze_max, sze/N_st_diag))
itertot = 0
if (state_following) then
allocate(overlap(N_st_diag*itermax, N_st_diag*itermax))
else
allocate(overlap(1,1)) ! avoid 'if' for deallocate
endif
overlap = 0.d0
PROVIDE nuclear_repulsion expected_s2 psi_bilinear_matrix_order psi_bilinear_matrix_order_reverse
call write_time(6)
@ -149,25 +156,51 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_
call write_int(6,N_st,'Number of states')
call write_int(6,N_st_diag,'Number of states in diagonalization')
call write_int(6,sze,'Number of determinants')
! Find max number of cores to fit in memory
! -----------------------------------------
nproc_target = nproc
double precision :: rss
integer :: maxab
maxab = max(N_det_alpha_unique, N_det_beta_unique)+1
call resident_memory(rss)
r1 = 8.d0*(3.d0*dble(sze*N_st_diag*itermax+5.d0*(N_st_diag*itermax)**2 &
+ 3.d0*(N_st_diag*itermax)+nproc*(4.d0*N_det_alpha_unique+2.d0*N_st_diag*sze)))/(1024.d0**3)
do while (r1+rss > qp_max_mem)
nproc_target = nproc_target - 1
r1 = 8.d0*(3.d0*dble(sze*N_st_diag*itermax+5.d0*(N_st_diag*itermax)**2 &
+ 3.d0*(N_st_diag*itermax)+nproc_target*(4.d0*N_det_alpha_unique+2.d0*N_st_diag*sze)))/(1024.d0**3)
do
r1 = 8.d0 * &! bytes
( 3.d0*(dble(sze)*(N_st_diag*itermax)) &! W,U,S
+ 4.d0*(N_st_diag*itermax)**2 &! h,y,s_,s_tmp
+ 2.d0*(N_st_diag*itermax) &! s2,lambda
+ 1.d0*(N_st_diag) &! residual_norm
! In H_S2_u_0_nstates_zmq
+ 3.d0*(N_st_diag*N_det) &! u_t, v_t, s_t on collector
+ 3.d0*(N_st_diag*N_det) &! u_t, v_t, s_t on slave
+ 0.5d0*maxab &! idx0 in H_S2_u_0_nstates_openmp_work_*
+ nproc_target * &! In OMP section
( 1.d0*(N_int*maxab) &! buffer
+ 3.5d0*(maxab) ) &! singles_a, singles_b, doubles, idx
) / 1024.d0**3
if (nproc_target == 0) then
call check_mem(r1,irp_here)
nproc_target = 1
exit
endif
if (r1+rss < qp_max_mem) then
exit
endif
nproc_target = nproc_target - 1
enddo
nthreads_davidson = nproc_target
TOUCH nthreads_davidson
call write_int(6,nproc_target,'Number of threads for diagonalization')
call write_double(6, r1, 'Memory(Gb)')
!---------------
write(6,'(A)') ''
write_buffer = '====='
do i=1,N_st
@ -198,9 +231,7 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_
s_(N_st_diag*itermax,N_st_diag*itermax), &
s_tmp(N_st_diag*itermax,N_st_diag*itermax), &
residual_norm(N_st_diag), &
c(N_st_diag*itermax), &
s2(N_st_diag*itermax), &
overlap(N_st_diag*itermax, N_st_diag*itermax), &
lambda(N_st_diag*itermax))
h = 0.d0
@ -503,7 +534,7 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_
deallocate ( &
W, residual_norm, &
U, overlap, &
c, S, &
S, &
h, &
y, s_, s_tmp, &
lambda &

14
src/fci/environment.irp.f Normal file
View File

@ -0,0 +1,14 @@
BEGIN_PROVIDER [ integer, nthreads_pt2 ]
implicit none
BEGIN_DOC
! Number of threads for Davidson
END_DOC
nthreads_pt2 = nproc
character*(32) :: env
call getenv('QP_NTHREADS_PT2',env)
if (trim(env) /= '') then
read(env,*) nthreads_pt2
call write_int(6,nthreads_pt2,'Target number of threads for PT2')
endif
END_PROVIDER