From 905d88529f1b3a2711951cd59585dbea2b398fea Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Fri, 7 Jul 2023 17:42:20 +0200 Subject: [PATCH] Reduced memory in cholesky SCF --- src/ao_two_e_ints/cholesky.irp.f | 68 ++++++++++++++----- src/ao_two_e_ints/two_e_integrals.irp.f | 90 +++++++++++++++++++++---- src/ccsd/ccsd_t_space_orb_abc.irp.f | 2 +- src/determinants/density_matrix.irp.f | 4 +- src/determinants/h_apply.irp.f | 2 +- src/determinants/s2.irp.f | 2 +- src/hartree_fock/fock_matrix_hf.irp.f | 90 ++++++++++++++++--------- src/utils/memory.irp.f | 2 +- 8 files changed, 193 insertions(+), 67 deletions(-) diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f index f7eae638..d0fa735d 100644 --- a/src/ao_two_e_ints/cholesky.irp.f +++ b/src/ao_two_e_ints/cholesky.irp.f @@ -29,7 +29,7 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] double precision, pointer :: L(:,:), L_old(:,:) - double precision, parameter :: s = 1.d-1 + double precision :: s double precision, parameter :: dscale = 1.d0 double precision, allocatable :: D(:), Delta(:,:), Ltmp_p(:,:), Ltmp_q(:,:) @@ -47,6 +47,11 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] integer :: block_size, iblock, ierr integer(omp_lock_kind), allocatable :: lock(:) + + double precision :: rss + double precision, external :: memory_of_double, memory_of_int + + PROVIDE nucl_coord if (.not.do_direct_integrals) then @@ -57,6 +62,9 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] ndim = ao_num*ao_num tau = ao_cholesky_threshold + rss = 6.d0 * memory_of_double(ndim) + & + 6.d0 * memory_of_int(ndim) + call check_mem(rss, irp_here) allocate(L(ndim,1)) @@ -97,7 +105,7 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] enddo !$OMP END PARALLEL DO else - !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i) + !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i) SCHEDULE(guided) do i=1,ndim D(i) = get_ao_two_e_integral(addr(1,i), addr(1,i), & addr(2,i), addr(2,i), & @@ -130,21 +138,49 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] ! a. i = i+1 - ! b. - Dmin = max(s*Dmax,tau) + logical :: memory_ok + memory_ok = .False. - ! c. - nq=0 - LDmap = 0 - DLmap = 0 - do p=1,np - if ( D(Lset(p)) > Dmin ) then - nq = nq+1 - Dset(nq) = Lset(p) - Dset_rev(Dset(nq)) = nq - LDmap(p) = nq - DLmap(nq) = p + s = 1.d-2 + + ! Inrease s until the arrays fit in memory + do + + ! b. + Dmin = max(s*Dmax,tau) + + ! c. + nq=0 + LDmap = 0 + DLmap = 0 + do p=1,np + if ( D(Lset(p)) > Dmin ) then + nq = nq+1 + Dset(nq) = Lset(p) + Dset_rev(Dset(nq)) = nq + LDmap(p) = nq + DLmap(nq) = p + endif + enddo + + call resident_memory(rss) + rss = rss & + + np*memory_of_double(nq) & ! Delta(np,nq) + + (rank+nq)* memory_of_double(ndim) & ! L(ndim,rank+nq) + + (np+nq)*memory_of_double(block_size) ! Ltmp_p(np,block_size) + ! Ltmp_q(nq,block_size) + + if (rss > qp_max_mem) then + s = s*2.d0 + else + exit endif + + if (nq == 0) then + print *, 'Not enough memory. Reduce cholesky threshold' + stop -1 + endif + enddo ! d., e. @@ -198,7 +234,7 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] enddo !$OMP END DO - !$OMP DO SCHEDULE(dynamic,8) + !$OMP DO SCHEDULE(guided) do m=1,nq call omp_set_lock(lock(m)) diff --git a/src/ao_two_e_ints/two_e_integrals.irp.f b/src/ao_two_e_ints/two_e_integrals.irp.f index 0c70aae5..f86fb269 100644 --- a/src/ao_two_e_ints/two_e_integrals.irp.f +++ b/src/ao_two_e_ints/two_e_integrals.irp.f @@ -460,7 +460,7 @@ BEGIN_PROVIDER [ double precision, ao_two_e_integral_schwartz, (ao_num, ao_num) !$OMP PARALLEL DO PRIVATE(i,k) & !$OMP DEFAULT(NONE) & !$OMP SHARED (ao_num,ao_two_e_integral_schwartz) & - !$OMP SCHEDULE(dynamic) + !$OMP SCHEDULE(guided) do i=1,ao_num do k=1,i ao_two_e_integral_schwartz(i,k) = dsqrt(ao_two_e_integral(i,i,k,k)) @@ -975,7 +975,8 @@ recursive subroutine I_x1_pol_mult_recurs(a,c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt ! !DIR$ FORCEINLINE ! call multiply_poly(X,nx,B_10,2,d,nd) - call multiply_poly_c2(X,nx,B_10,d,nd) + !DIR$ FORCEINLINE + call multiply_poly_c2_inline_2e(X,nx,B_10,d,nd) nx = nd !DIR$ LOOP COUNT(8) @@ -998,7 +999,8 @@ recursive subroutine I_x1_pol_mult_recurs(a,c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt endif ! !DIR$ FORCEINLINE ! call multiply_poly(X,nx,B_00,2,d,nd) - call multiply_poly_c2(X,nx,B_00,d,nd) + !DIR$ FORCEINLINE + call multiply_poly_c2_inline_2e(X,nx,B_00,d,nd) endif ny=0 @@ -1017,7 +1019,8 @@ recursive subroutine I_x1_pol_mult_recurs(a,c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt ! !DIR$ FORCEINLINE ! call multiply_poly(Y,ny,C_00,2,d,nd) - call multiply_poly_c2(Y,ny,C_00,d,nd) + !DIR$ FORCEINLINE + call multiply_poly_c2_inline_2e(Y,ny,C_00,d,nd) end recursive subroutine I_x1_pol_mult_a1(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in) @@ -1057,7 +1060,8 @@ recursive subroutine I_x1_pol_mult_a1(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in) ! !DIR$ FORCEINLINE ! call multiply_poly(X,nx,B_00,2,d,nd) - call multiply_poly_c2(X,nx,B_00,d,nd) + !DIR$ FORCEINLINE + call multiply_poly_c2_inline_2e(X,nx,B_00,d,nd) ny=0 @@ -1069,7 +1073,8 @@ recursive subroutine I_x1_pol_mult_a1(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in) ! !DIR$ FORCEINLINE ! call multiply_poly(Y,ny,C_00,2,d,nd) - call multiply_poly_c2(Y,ny,C_00,d,nd) + !DIR$ FORCEINLINE + call multiply_poly_c2_inline_2e(Y,ny,C_00,d,nd) end @@ -1098,7 +1103,8 @@ recursive subroutine I_x1_pol_mult_a2(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in) ! !DIR$ FORCEINLINE ! call multiply_poly(X,nx,B_10,2,d,nd) - call multiply_poly_c2(X,nx,B_10,d,nd) + !DIR$ FORCEINLINE + call multiply_poly_c2_inline_2e(X,nx,B_10,d,nd) nx = nd !DIR$ LOOP COUNT(8) @@ -1118,7 +1124,8 @@ recursive subroutine I_x1_pol_mult_a2(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in) ! !DIR$ FORCEINLINE ! call multiply_poly(X,nx,B_00,2,d,nd) - call multiply_poly_c2(X,nx,B_00,d,nd) + !DIR$ FORCEINLINE + call multiply_poly_c2_inline_2e(X,nx,B_00,d,nd) ny=0 !DIR$ LOOP COUNT(8) @@ -1130,7 +1137,8 @@ recursive subroutine I_x1_pol_mult_a2(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in) ! !DIR$ FORCEINLINE ! call multiply_poly(Y,ny,C_00,2,d,nd) - call multiply_poly_c2(Y,ny,C_00,d,nd) + !DIR$ FORCEINLINE + call multiply_poly_c2_inline_2e(Y,ny,C_00,d,nd) end recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim) @@ -1177,9 +1185,9 @@ recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim) Y(1) = D_00(1) Y(2) = D_00(2) -! !DIR$ FORCEINLINE ! call multiply_poly(Y,ny,D_00,2,d,nd) - call multiply_poly_c2(Y,ny,D_00,d,nd) + !DIR$ FORCEINLINE + call multiply_poly_c2_inline_2e(Y,ny,D_00,d,nd) return @@ -1199,7 +1207,8 @@ recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim) ! !DIR$ FORCEINLINE ! call multiply_poly(X,nx,B_01,2,d,nd) - call multiply_poly_c2(X,nx,B_01,d,nd) + !DIR$ FORCEINLINE + call multiply_poly_c2_inline_2e(X,nx,B_01,d,nd) ny = 0 !DIR$ LOOP COUNT(6) @@ -1208,9 +1217,9 @@ recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim) enddo call I_x2_pol_mult(c-1,B_10,B_01,B_00,C_00,D_00,Y,ny,dim) -! !DIR$ FORCEINLINE ! call multiply_poly(Y,ny,D_00,2,d,nd) - call multiply_poly_c2(Y,ny,D_00,d,nd) + !DIR$ FORCEINLINE + call multiply_poly_c2_inline_2e(Y,ny,D_00,d,nd) end select end @@ -1300,3 +1309,56 @@ subroutine multiply_poly_local(b,nb,c,nc,d,nd) end +!DIR$ FORCEINLINE +subroutine multiply_poly_c2_inline_2e(b,nb,c,d,nd) + implicit none + BEGIN_DOC + ! Multiply two polynomials + ! D(t) += B(t)*C(t) + END_DOC + + integer, intent(in) :: nb + integer, intent(out) :: nd + double precision, intent(in) :: b(0:nb), c(0:2) + double precision, intent(inout) :: d(0:nb+2) + + integer :: ndtmp + integer :: ib, ic, id, k + if(nb < 0) return !False if nb>=0 + + select case (nb) + case (0) + d(0) = d(0) + c(0) * b(0) + d(1) = d(1) + c(1) * b(0) + d(2) = d(2) + c(2) * b(0) + + case (1) + d(0) = d(0) + c(0) * b(0) + d(1) = d(1) + c(0) * b(1) + c(1) * b(0) + d(2) = d(2) + c(1) * b(1) + c(2) * b(0) + d(3) = d(3) + c(2) * b(1) + + case (2) + d(0) = d(0) + c(0) * b(0) + d(1) = d(1) + c(0) * b(1) + c(1) * b(0) + d(2) = d(2) + c(0) * b(2) + c(1) * b(1) + c(2) * b(0) + d(3) = d(3) + c(1) * b(2) + c(2) * b(1) + d(4) = d(4) + c(2) * b(2) + + case default + d(0) = d(0) + c(0) * b(0) + d(1) = d(1) + c(0) * b(1) + c(1) * b(0) + do ib=2,nb + d(ib) = d(ib) + c(0) * b(ib) + c(1) * b(ib-1) + c(2) * b(ib-2) + enddo + d(nb+1) = d(nb+1) + c(1) * b(nb) + c(2) * b(nb-1) + d(nb+2) = d(nb+2) + c(2) * b(nb) + + end select + + do nd = nb+2,0,-1 + if (d(nd) /= 0.d0) exit + enddo + +end + diff --git a/src/ccsd/ccsd_t_space_orb_abc.irp.f b/src/ccsd/ccsd_t_space_orb_abc.irp.f index 1aab6bd7..12a71045 100644 --- a/src/ccsd/ccsd_t_space_orb_abc.irp.f +++ b/src/ccsd/ccsd_t_space_orb_abc.irp.f @@ -101,7 +101,7 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy) !$OMP PARALLEL PRIVATE(a,b,c,e) DEFAULT(SHARED) e = 0d0 - !$OMP DO SCHEDULE(dynamic) + !$OMP DO SCHEDULE(guided) do a = 1, nV do b = a+1, nV do c = b+1, nV diff --git a/src/determinants/density_matrix.irp.f b/src/determinants/density_matrix.irp.f index 1a1d92b5..ce4d96c2 100644 --- a/src/determinants/density_matrix.irp.f +++ b/src/determinants/density_matrix.irp.f @@ -117,7 +117,7 @@ END_PROVIDER !$OMP N_det_alpha_unique,N_det_beta_unique,irp_here) allocate(tmp_a(mo_num,mo_num,N_states), tmp_b(mo_num,mo_num,N_states) ) tmp_a = 0.d0 - !$OMP DO SCHEDULE(dynamic,64) + !$OMP DO SCHEDULE(guided) do k_a=1,N_det krow = psi_bilinear_matrix_rows(k_a) ASSERT (krow <= N_det_alpha_unique) @@ -173,7 +173,7 @@ END_PROVIDER deallocate(tmp_a) tmp_b = 0.d0 - !$OMP DO SCHEDULE(dynamic,64) + !$OMP DO SCHEDULE(guided) do k_b=1,N_det krow = psi_bilinear_matrix_transp_rows(k_b) ASSERT (krow <= N_det_alpha_unique) diff --git a/src/determinants/h_apply.irp.f b/src/determinants/h_apply.irp.f index 078c2104..65f1a832 100644 --- a/src/determinants/h_apply.irp.f +++ b/src/determinants/h_apply.irp.f @@ -250,7 +250,7 @@ subroutine remove_duplicates_in_psi_det(found_duplicates) enddo !$OMP END DO - !$OMP DO schedule(dynamic,1024) + !$OMP DO schedule(guided,64) do i=1,N_det-1 if (duplicate(i)) then cycle diff --git a/src/determinants/s2.irp.f b/src/determinants/s2.irp.f index 2c1a8757..6dc49526 100644 --- a/src/determinants/s2.irp.f +++ b/src/determinants/s2.irp.f @@ -317,7 +317,7 @@ subroutine get_uJ_s2_uI(psi_keys_tmp,psi_coefs_tmp,n,nmax_coefs,nmax_keys,s2,nst !$OMP SHARED (ll,jj,psi_keys_tmp,psi_coefs_tmp,N_int,n,nstates)& !$OMP REDUCTION(+:accu) allocate(idx(0:n)) - !$OMP DO SCHEDULE(dynamic) + !$OMP DO SCHEDULE(guided) do i = n,1,-1 ! Better OMP scheduling call get_s2(psi_keys_tmp(1,1,i),psi_keys_tmp(1,1,i),N_int,s2_tmp) accu += psi_coefs_tmp(i,ll) * s2_tmp * psi_coefs_tmp(i,jj) diff --git a/src/hartree_fock/fock_matrix_hf.irp.f b/src/hartree_fock/fock_matrix_hf.irp.f index 8c6658c5..a5ab6a60 100644 --- a/src/hartree_fock/fock_matrix_hf.irp.f +++ b/src/hartree_fock/fock_matrix_hf.irp.f @@ -190,47 +190,75 @@ END_PROVIDER deallocate(X) - ao_two_e_integral_beta_chol = ao_two_e_integral_alpha_chol + if (elec_alpha_num > elec_beta_num) then + ao_two_e_integral_beta_chol = ao_two_e_integral_alpha_chol + endif - allocate(X2(ao_num,ao_num,cholesky_ao_num,2)) + double precision :: rss + double precision :: memory_of_double + integer :: iblock + integer, parameter :: block_size = 32 + + rss = memory_of_double(ao_num*ao_num) + call check_mem(2.d0*block_size*rss, irp_here) + allocate(X2(ao_num,ao_num,block_size,2)) + allocate(X3(ao_num,block_size,ao_num,2)) + ! ao_two_e_integral_alpha_chol (l,s) -= cholesky_ao(l,m,j) * SCF_density_matrix_ao_beta (m,n) * cholesky_ao(n,s,j) - call dgemm('N','N',ao_num,ao_num*cholesky_ao_num,ao_num, 1.d0, & - SCF_density_matrix_ao_alpha, ao_num, & - cholesky_ao, ao_num, 0.d0, & - X2(1,1,1,1), ao_num) + do iblock=1,cholesky_ao_num,block_size - call dgemm('N','N',ao_num,ao_num*cholesky_ao_num,ao_num, 1.d0, & - SCF_density_matrix_ao_beta, ao_num, & - cholesky_ao, ao_num, 0.d0, & - X2(1,1,1,2), ao_num) + call dgemm('N','N',ao_num,ao_num*min(cholesky_ao_num-iblock+1,block_size),ao_num, 1.d0, & + SCF_density_matrix_ao_alpha, ao_num, & + cholesky_ao(1,1,iblock), ao_num, 0.d0, & + X2(1,1,1,1), ao_num) - allocate(X3(ao_num,cholesky_ao_num,ao_num,2)) + if (elec_alpha_num > elec_beta_num) then + call dgemm('N','N',ao_num,ao_num*min(cholesky_ao_num-iblock+1,block_size),ao_num, 1.d0, & + SCF_density_matrix_ao_beta, ao_num, & + cholesky_ao(1,1,iblock), ao_num, 0.d0, & + X2(1,1,1,2), ao_num) + + do s=1,ao_num + do j=1,min(cholesky_ao_num-iblock+1,block_size) + do m=1,ao_num + X3(m,j,s,1) = X2(m,s,j,1) + X3(m,j,s,2) = X2(m,s,j,2) + enddo + enddo + enddo + + else + + do s=1,ao_num + do j=1,min(cholesky_ao_num-iblock+1,block_size) + do m=1,ao_num + X3(m,j,s,1) = X2(m,s,j,1) + enddo + enddo + enddo + endif + + call dgemm('N','N',ao_num,ao_num,ao_num*min(cholesky_ao_num-iblock+1,block_size), -1.d0, & + cholesky_ao(1,1,iblock), ao_num, & + X3(1,1,1,1), ao_num*block_size, 1.d0, & + ao_two_e_integral_alpha_chol, ao_num) + + if (elec_alpha_num > elec_beta_num) then + call dgemm('N','N',ao_num,ao_num,ao_num*min(cholesky_ao_num-iblock+1,block_size), -1.d0, & + cholesky_ao(1,1,iblock), ao_num, & + X3(1,1,1,2), ao_num*block_size, 1.d0, & + ao_two_e_integral_beta_chol, ao_num) + endif - do s=1,ao_num - do j=1,cholesky_ao_num - do m=1,ao_num - X3(m,j,s,1) = X2(m,s,j,1) - X3(m,j,s,2) = X2(m,s,j,2) - enddo - enddo enddo - deallocate(X2) - - call dgemm('N','N',ao_num,ao_num,ao_num*cholesky_ao_num, -1.d0, & - cholesky_ao, ao_num, & - X3(1,1,1,1), ao_num*cholesky_ao_num, 1.d0, & - ao_two_e_integral_alpha_chol, ao_num) - - call dgemm('N','N',ao_num,ao_num,ao_num*cholesky_ao_num, -1.d0, & - cholesky_ao, ao_num, & - X3(1,1,1,2), ao_num*cholesky_ao_num, 1.d0, & - ao_two_e_integral_beta_chol, ao_num) - - deallocate(X3) + if (elec_alpha_num == elec_beta_num) then + ao_two_e_integral_beta_chol = ao_two_e_integral_alpha_chol + endif + deallocate(X2,X3) END_PROVIDER diff --git a/src/utils/memory.irp.f b/src/utils/memory.irp.f index 115b2cbe..0cd2133e 100644 --- a/src/utils/memory.irp.f +++ b/src/utils/memory.irp.f @@ -5,7 +5,7 @@ BEGIN_PROVIDER [ integer, qp_max_mem ] END_DOC character*(128) :: env - qp_max_mem = 2000 + qp_max_mem = 500 call getenv('QP_MAXMEM',env) if (trim(env) /= '') then call lock_io()