10
0
mirror of https://github.com/QuantumPackage/qp2.git synced 2025-01-05 10:59:45 +01:00

Reduced memory in cholesky SCF

This commit is contained in:
Anthony Scemama 2023-07-07 17:42:20 +02:00
parent e35f847341
commit 905d88529f
8 changed files with 193 additions and 67 deletions

View File

@ -29,7 +29,7 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ]
double precision, pointer :: L(:,:), L_old(:,:) double precision, pointer :: L(:,:), L_old(:,:)
double precision, parameter :: s = 1.d-1 double precision :: s
double precision, parameter :: dscale = 1.d0 double precision, parameter :: dscale = 1.d0
double precision, allocatable :: D(:), Delta(:,:), Ltmp_p(:,:), Ltmp_q(:,:) double precision, allocatable :: D(:), Delta(:,:), Ltmp_p(:,:), Ltmp_q(:,:)
@ -47,6 +47,11 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ]
integer :: block_size, iblock, ierr integer :: block_size, iblock, ierr
integer(omp_lock_kind), allocatable :: lock(:) integer(omp_lock_kind), allocatable :: lock(:)
double precision :: rss
double precision, external :: memory_of_double, memory_of_int
PROVIDE nucl_coord PROVIDE nucl_coord
if (.not.do_direct_integrals) then if (.not.do_direct_integrals) then
@ -57,6 +62,9 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ]
ndim = ao_num*ao_num ndim = ao_num*ao_num
tau = ao_cholesky_threshold tau = ao_cholesky_threshold
rss = 6.d0 * memory_of_double(ndim) + &
6.d0 * memory_of_int(ndim)
call check_mem(rss, irp_here)
allocate(L(ndim,1)) allocate(L(ndim,1))
@ -97,7 +105,7 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ]
enddo enddo
!$OMP END PARALLEL DO !$OMP END PARALLEL DO
else else
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i) !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i) SCHEDULE(guided)
do i=1,ndim do i=1,ndim
D(i) = get_ao_two_e_integral(addr(1,i), addr(1,i), & D(i) = get_ao_two_e_integral(addr(1,i), addr(1,i), &
addr(2,i), addr(2,i), & addr(2,i), addr(2,i), &
@ -130,6 +138,14 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ]
! a. ! a.
i = i+1 i = i+1
logical :: memory_ok
memory_ok = .False.
s = 1.d-2
! Inrease s until the arrays fit in memory
do
! b. ! b.
Dmin = max(s*Dmax,tau) Dmin = max(s*Dmax,tau)
@ -147,6 +163,26 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ]
endif endif
enddo enddo
call resident_memory(rss)
rss = rss &
+ np*memory_of_double(nq) & ! Delta(np,nq)
+ (rank+nq)* memory_of_double(ndim) & ! L(ndim,rank+nq)
+ (np+nq)*memory_of_double(block_size) ! Ltmp_p(np,block_size)
! Ltmp_q(nq,block_size)
if (rss > qp_max_mem) then
s = s*2.d0
else
exit
endif
if (nq == 0) then
print *, 'Not enough memory. Reduce cholesky threshold'
stop -1
endif
enddo
! d., e. ! d., e.
block_size = max(N,24) block_size = max(N,24)
@ -198,7 +234,7 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ]
enddo enddo
!$OMP END DO !$OMP END DO
!$OMP DO SCHEDULE(dynamic,8) !$OMP DO SCHEDULE(guided)
do m=1,nq do m=1,nq
call omp_set_lock(lock(m)) call omp_set_lock(lock(m))

View File

@ -460,7 +460,7 @@ BEGIN_PROVIDER [ double precision, ao_two_e_integral_schwartz, (ao_num, ao_num)
!$OMP PARALLEL DO PRIVATE(i,k) & !$OMP PARALLEL DO PRIVATE(i,k) &
!$OMP DEFAULT(NONE) & !$OMP DEFAULT(NONE) &
!$OMP SHARED (ao_num,ao_two_e_integral_schwartz) & !$OMP SHARED (ao_num,ao_two_e_integral_schwartz) &
!$OMP SCHEDULE(dynamic) !$OMP SCHEDULE(guided)
do i=1,ao_num do i=1,ao_num
do k=1,i do k=1,i
ao_two_e_integral_schwartz(i,k) = dsqrt(ao_two_e_integral(i,i,k,k)) ao_two_e_integral_schwartz(i,k) = dsqrt(ao_two_e_integral(i,i,k,k))
@ -975,7 +975,8 @@ recursive subroutine I_x1_pol_mult_recurs(a,c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt
! !DIR$ FORCEINLINE ! !DIR$ FORCEINLINE
! call multiply_poly(X,nx,B_10,2,d,nd) ! call multiply_poly(X,nx,B_10,2,d,nd)
call multiply_poly_c2(X,nx,B_10,d,nd) !DIR$ FORCEINLINE
call multiply_poly_c2_inline_2e(X,nx,B_10,d,nd)
nx = nd nx = nd
!DIR$ LOOP COUNT(8) !DIR$ LOOP COUNT(8)
@ -998,7 +999,8 @@ recursive subroutine I_x1_pol_mult_recurs(a,c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt
endif endif
! !DIR$ FORCEINLINE ! !DIR$ FORCEINLINE
! call multiply_poly(X,nx,B_00,2,d,nd) ! call multiply_poly(X,nx,B_00,2,d,nd)
call multiply_poly_c2(X,nx,B_00,d,nd) !DIR$ FORCEINLINE
call multiply_poly_c2_inline_2e(X,nx,B_00,d,nd)
endif endif
ny=0 ny=0
@ -1017,7 +1019,8 @@ recursive subroutine I_x1_pol_mult_recurs(a,c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt
! !DIR$ FORCEINLINE ! !DIR$ FORCEINLINE
! call multiply_poly(Y,ny,C_00,2,d,nd) ! call multiply_poly(Y,ny,C_00,2,d,nd)
call multiply_poly_c2(Y,ny,C_00,d,nd) !DIR$ FORCEINLINE
call multiply_poly_c2_inline_2e(Y,ny,C_00,d,nd)
end end
recursive subroutine I_x1_pol_mult_a1(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in) recursive subroutine I_x1_pol_mult_a1(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in)
@ -1057,7 +1060,8 @@ recursive subroutine I_x1_pol_mult_a1(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in)
! !DIR$ FORCEINLINE ! !DIR$ FORCEINLINE
! call multiply_poly(X,nx,B_00,2,d,nd) ! call multiply_poly(X,nx,B_00,2,d,nd)
call multiply_poly_c2(X,nx,B_00,d,nd) !DIR$ FORCEINLINE
call multiply_poly_c2_inline_2e(X,nx,B_00,d,nd)
ny=0 ny=0
@ -1069,7 +1073,8 @@ recursive subroutine I_x1_pol_mult_a1(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in)
! !DIR$ FORCEINLINE ! !DIR$ FORCEINLINE
! call multiply_poly(Y,ny,C_00,2,d,nd) ! call multiply_poly(Y,ny,C_00,2,d,nd)
call multiply_poly_c2(Y,ny,C_00,d,nd) !DIR$ FORCEINLINE
call multiply_poly_c2_inline_2e(Y,ny,C_00,d,nd)
end end
@ -1098,7 +1103,8 @@ recursive subroutine I_x1_pol_mult_a2(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in)
! !DIR$ FORCEINLINE ! !DIR$ FORCEINLINE
! call multiply_poly(X,nx,B_10,2,d,nd) ! call multiply_poly(X,nx,B_10,2,d,nd)
call multiply_poly_c2(X,nx,B_10,d,nd) !DIR$ FORCEINLINE
call multiply_poly_c2_inline_2e(X,nx,B_10,d,nd)
nx = nd nx = nd
!DIR$ LOOP COUNT(8) !DIR$ LOOP COUNT(8)
@ -1118,7 +1124,8 @@ recursive subroutine I_x1_pol_mult_a2(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in)
! !DIR$ FORCEINLINE ! !DIR$ FORCEINLINE
! call multiply_poly(X,nx,B_00,2,d,nd) ! call multiply_poly(X,nx,B_00,2,d,nd)
call multiply_poly_c2(X,nx,B_00,d,nd) !DIR$ FORCEINLINE
call multiply_poly_c2_inline_2e(X,nx,B_00,d,nd)
ny=0 ny=0
!DIR$ LOOP COUNT(8) !DIR$ LOOP COUNT(8)
@ -1130,7 +1137,8 @@ recursive subroutine I_x1_pol_mult_a2(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in)
! !DIR$ FORCEINLINE ! !DIR$ FORCEINLINE
! call multiply_poly(Y,ny,C_00,2,d,nd) ! call multiply_poly(Y,ny,C_00,2,d,nd)
call multiply_poly_c2(Y,ny,C_00,d,nd) !DIR$ FORCEINLINE
call multiply_poly_c2_inline_2e(Y,ny,C_00,d,nd)
end end
recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim) recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim)
@ -1177,9 +1185,9 @@ recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim)
Y(1) = D_00(1) Y(1) = D_00(1)
Y(2) = D_00(2) Y(2) = D_00(2)
! !DIR$ FORCEINLINE
! call multiply_poly(Y,ny,D_00,2,d,nd) ! call multiply_poly(Y,ny,D_00,2,d,nd)
call multiply_poly_c2(Y,ny,D_00,d,nd) !DIR$ FORCEINLINE
call multiply_poly_c2_inline_2e(Y,ny,D_00,d,nd)
return return
@ -1199,7 +1207,8 @@ recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim)
! !DIR$ FORCEINLINE ! !DIR$ FORCEINLINE
! call multiply_poly(X,nx,B_01,2,d,nd) ! call multiply_poly(X,nx,B_01,2,d,nd)
call multiply_poly_c2(X,nx,B_01,d,nd) !DIR$ FORCEINLINE
call multiply_poly_c2_inline_2e(X,nx,B_01,d,nd)
ny = 0 ny = 0
!DIR$ LOOP COUNT(6) !DIR$ LOOP COUNT(6)
@ -1208,9 +1217,9 @@ recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim)
enddo enddo
call I_x2_pol_mult(c-1,B_10,B_01,B_00,C_00,D_00,Y,ny,dim) call I_x2_pol_mult(c-1,B_10,B_01,B_00,C_00,D_00,Y,ny,dim)
! !DIR$ FORCEINLINE
! call multiply_poly(Y,ny,D_00,2,d,nd) ! call multiply_poly(Y,ny,D_00,2,d,nd)
call multiply_poly_c2(Y,ny,D_00,d,nd) !DIR$ FORCEINLINE
call multiply_poly_c2_inline_2e(Y,ny,D_00,d,nd)
end select end select
end end
@ -1300,3 +1309,56 @@ subroutine multiply_poly_local(b,nb,c,nc,d,nd)
end end
!DIR$ FORCEINLINE
subroutine multiply_poly_c2_inline_2e(b,nb,c,d,nd)
implicit none
BEGIN_DOC
! Multiply two polynomials
! D(t) += B(t)*C(t)
END_DOC
integer, intent(in) :: nb
integer, intent(out) :: nd
double precision, intent(in) :: b(0:nb), c(0:2)
double precision, intent(inout) :: d(0:nb+2)
integer :: ndtmp
integer :: ib, ic, id, k
if(nb < 0) return !False if nb>=0
select case (nb)
case (0)
d(0) = d(0) + c(0) * b(0)
d(1) = d(1) + c(1) * b(0)
d(2) = d(2) + c(2) * b(0)
case (1)
d(0) = d(0) + c(0) * b(0)
d(1) = d(1) + c(0) * b(1) + c(1) * b(0)
d(2) = d(2) + c(1) * b(1) + c(2) * b(0)
d(3) = d(3) + c(2) * b(1)
case (2)
d(0) = d(0) + c(0) * b(0)
d(1) = d(1) + c(0) * b(1) + c(1) * b(0)
d(2) = d(2) + c(0) * b(2) + c(1) * b(1) + c(2) * b(0)
d(3) = d(3) + c(1) * b(2) + c(2) * b(1)
d(4) = d(4) + c(2) * b(2)
case default
d(0) = d(0) + c(0) * b(0)
d(1) = d(1) + c(0) * b(1) + c(1) * b(0)
do ib=2,nb
d(ib) = d(ib) + c(0) * b(ib) + c(1) * b(ib-1) + c(2) * b(ib-2)
enddo
d(nb+1) = d(nb+1) + c(1) * b(nb) + c(2) * b(nb-1)
d(nb+2) = d(nb+2) + c(2) * b(nb)
end select
do nd = nb+2,0,-1
if (d(nd) /= 0.d0) exit
enddo
end

View File

@ -101,7 +101,7 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
!$OMP PARALLEL PRIVATE(a,b,c,e) DEFAULT(SHARED) !$OMP PARALLEL PRIVATE(a,b,c,e) DEFAULT(SHARED)
e = 0d0 e = 0d0
!$OMP DO SCHEDULE(dynamic) !$OMP DO SCHEDULE(guided)
do a = 1, nV do a = 1, nV
do b = a+1, nV do b = a+1, nV
do c = b+1, nV do c = b+1, nV

View File

@ -117,7 +117,7 @@ END_PROVIDER
!$OMP N_det_alpha_unique,N_det_beta_unique,irp_here) !$OMP N_det_alpha_unique,N_det_beta_unique,irp_here)
allocate(tmp_a(mo_num,mo_num,N_states), tmp_b(mo_num,mo_num,N_states) ) allocate(tmp_a(mo_num,mo_num,N_states), tmp_b(mo_num,mo_num,N_states) )
tmp_a = 0.d0 tmp_a = 0.d0
!$OMP DO SCHEDULE(dynamic,64) !$OMP DO SCHEDULE(guided)
do k_a=1,N_det do k_a=1,N_det
krow = psi_bilinear_matrix_rows(k_a) krow = psi_bilinear_matrix_rows(k_a)
ASSERT (krow <= N_det_alpha_unique) ASSERT (krow <= N_det_alpha_unique)
@ -173,7 +173,7 @@ END_PROVIDER
deallocate(tmp_a) deallocate(tmp_a)
tmp_b = 0.d0 tmp_b = 0.d0
!$OMP DO SCHEDULE(dynamic,64) !$OMP DO SCHEDULE(guided)
do k_b=1,N_det do k_b=1,N_det
krow = psi_bilinear_matrix_transp_rows(k_b) krow = psi_bilinear_matrix_transp_rows(k_b)
ASSERT (krow <= N_det_alpha_unique) ASSERT (krow <= N_det_alpha_unique)

View File

@ -250,7 +250,7 @@ subroutine remove_duplicates_in_psi_det(found_duplicates)
enddo enddo
!$OMP END DO !$OMP END DO
!$OMP DO schedule(dynamic,1024) !$OMP DO schedule(guided,64)
do i=1,N_det-1 do i=1,N_det-1
if (duplicate(i)) then if (duplicate(i)) then
cycle cycle

View File

@ -317,7 +317,7 @@ subroutine get_uJ_s2_uI(psi_keys_tmp,psi_coefs_tmp,n,nmax_coefs,nmax_keys,s2,nst
!$OMP SHARED (ll,jj,psi_keys_tmp,psi_coefs_tmp,N_int,n,nstates)& !$OMP SHARED (ll,jj,psi_keys_tmp,psi_coefs_tmp,N_int,n,nstates)&
!$OMP REDUCTION(+:accu) !$OMP REDUCTION(+:accu)
allocate(idx(0:n)) allocate(idx(0:n))
!$OMP DO SCHEDULE(dynamic) !$OMP DO SCHEDULE(guided)
do i = n,1,-1 ! Better OMP scheduling do i = n,1,-1 ! Better OMP scheduling
call get_s2(psi_keys_tmp(1,1,i),psi_keys_tmp(1,1,i),N_int,s2_tmp) call get_s2(psi_keys_tmp(1,1,i),psi_keys_tmp(1,1,i),N_int,s2_tmp)
accu += psi_coefs_tmp(i,ll) * s2_tmp * psi_coefs_tmp(i,jj) accu += psi_coefs_tmp(i,ll) * s2_tmp * psi_coefs_tmp(i,jj)

View File

@ -190,27 +190,39 @@ END_PROVIDER
deallocate(X) deallocate(X)
if (elec_alpha_num > elec_beta_num) then
ao_two_e_integral_beta_chol = ao_two_e_integral_alpha_chol ao_two_e_integral_beta_chol = ao_two_e_integral_alpha_chol
endif
allocate(X2(ao_num,ao_num,cholesky_ao_num,2)) double precision :: rss
double precision :: memory_of_double
integer :: iblock
integer, parameter :: block_size = 32
rss = memory_of_double(ao_num*ao_num)
call check_mem(2.d0*block_size*rss, irp_here)
allocate(X2(ao_num,ao_num,block_size,2))
allocate(X3(ao_num,block_size,ao_num,2))
! ao_two_e_integral_alpha_chol (l,s) -= cholesky_ao(l,m,j) * SCF_density_matrix_ao_beta (m,n) * cholesky_ao(n,s,j) ! ao_two_e_integral_alpha_chol (l,s) -= cholesky_ao(l,m,j) * SCF_density_matrix_ao_beta (m,n) * cholesky_ao(n,s,j)
call dgemm('N','N',ao_num,ao_num*cholesky_ao_num,ao_num, 1.d0, & do iblock=1,cholesky_ao_num,block_size
call dgemm('N','N',ao_num,ao_num*min(cholesky_ao_num-iblock+1,block_size),ao_num, 1.d0, &
SCF_density_matrix_ao_alpha, ao_num, & SCF_density_matrix_ao_alpha, ao_num, &
cholesky_ao, ao_num, 0.d0, & cholesky_ao(1,1,iblock), ao_num, 0.d0, &
X2(1,1,1,1), ao_num) X2(1,1,1,1), ao_num)
call dgemm('N','N',ao_num,ao_num*cholesky_ao_num,ao_num, 1.d0, & if (elec_alpha_num > elec_beta_num) then
call dgemm('N','N',ao_num,ao_num*min(cholesky_ao_num-iblock+1,block_size),ao_num, 1.d0, &
SCF_density_matrix_ao_beta, ao_num, & SCF_density_matrix_ao_beta, ao_num, &
cholesky_ao, ao_num, 0.d0, & cholesky_ao(1,1,iblock), ao_num, 0.d0, &
X2(1,1,1,2), ao_num) X2(1,1,1,2), ao_num)
allocate(X3(ao_num,cholesky_ao_num,ao_num,2))
do s=1,ao_num do s=1,ao_num
do j=1,cholesky_ao_num do j=1,min(cholesky_ao_num-iblock+1,block_size)
do m=1,ao_num do m=1,ao_num
X3(m,j,s,1) = X2(m,s,j,1) X3(m,j,s,1) = X2(m,s,j,1)
X3(m,j,s,2) = X2(m,s,j,2) X3(m,j,s,2) = X2(m,s,j,2)
@ -218,19 +230,35 @@ END_PROVIDER
enddo enddo
enddo enddo
deallocate(X2) else
call dgemm('N','N',ao_num,ao_num,ao_num*cholesky_ao_num, -1.d0, & do s=1,ao_num
cholesky_ao, ao_num, & do j=1,min(cholesky_ao_num-iblock+1,block_size)
X3(1,1,1,1), ao_num*cholesky_ao_num, 1.d0, & do m=1,ao_num
X3(m,j,s,1) = X2(m,s,j,1)
enddo
enddo
enddo
endif
call dgemm('N','N',ao_num,ao_num,ao_num*min(cholesky_ao_num-iblock+1,block_size), -1.d0, &
cholesky_ao(1,1,iblock), ao_num, &
X3(1,1,1,1), ao_num*block_size, 1.d0, &
ao_two_e_integral_alpha_chol, ao_num) ao_two_e_integral_alpha_chol, ao_num)
call dgemm('N','N',ao_num,ao_num,ao_num*cholesky_ao_num, -1.d0, & if (elec_alpha_num > elec_beta_num) then
cholesky_ao, ao_num, & call dgemm('N','N',ao_num,ao_num,ao_num*min(cholesky_ao_num-iblock+1,block_size), -1.d0, &
X3(1,1,1,2), ao_num*cholesky_ao_num, 1.d0, & cholesky_ao(1,1,iblock), ao_num, &
X3(1,1,1,2), ao_num*block_size, 1.d0, &
ao_two_e_integral_beta_chol, ao_num) ao_two_e_integral_beta_chol, ao_num)
endif
deallocate(X3) enddo
if (elec_alpha_num == elec_beta_num) then
ao_two_e_integral_beta_chol = ao_two_e_integral_alpha_chol
endif
deallocate(X2,X3)
END_PROVIDER END_PROVIDER

View File

@ -5,7 +5,7 @@ BEGIN_PROVIDER [ integer, qp_max_mem ]
END_DOC END_DOC
character*(128) :: env character*(128) :: env
qp_max_mem = 2000 qp_max_mem = 500
call getenv('QP_MAXMEM',env) call getenv('QP_MAXMEM',env)
if (trim(env) /= '') then if (trim(env) /= '') then
call lock_io() call lock_io()