From a4834d0acee2b18820a9efea2a090e6c9e804c33 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 13 Jun 2023 16:01:05 +0200 Subject: [PATCH 01/40] Allow merge with master --- src/davidson_keywords/usef.irp.f | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/davidson_keywords/usef.irp.f b/src/davidson_keywords/usef.irp.f index fed2ba9b..7ca2d203 100644 --- a/src/davidson_keywords/usef.irp.f +++ b/src/davidson_keywords/usef.irp.f @@ -13,7 +13,9 @@ BEGIN_PROVIDER [ integer, nthreads_davidson ] character*(32) :: env call getenv('QP_NTHREADS_DAVIDSON',env) if (trim(env) /= '') then + call lock_io read(env,*) nthreads_davidson + call unlock_io call write_int(6,nthreads_davidson,'Target number of threads for ') endif END_PROVIDER From 6881a65994fe04eebcfedde0871e34d7737b9b8c Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Thu, 22 Jun 2023 13:34:36 +0200 Subject: [PATCH 02/40] Fix possible float_of_string: 0.160099927795302-102 error --- src/utils/linear_algebra.irp.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils/linear_algebra.irp.f b/src/utils/linear_algebra.irp.f index 76a539a6..65c57a76 100644 --- a/src/utils/linear_algebra.irp.f +++ b/src/utils/linear_algebra.irp.f @@ -1565,7 +1565,7 @@ subroutine nullify_small_elements(m,n,A,LDA,thresh) ! Remove tiny elements do j=1,n do i=1,m - if ( dabs(A(i,j) * amax) < thresh ) then + if ( (dabs(A(i,j) * amax) < thresh).or.(dabs(A(i,j)) < 1.d-99) ) then A(i,j) = 0.d0 endif enddo From d911f4eee8b1569d6e34dc4ec4081031f400bcc1 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Mon, 3 Jul 2023 17:41:34 +0200 Subject: [PATCH 03/40] Rewrote Cholesky --- src/ao_two_e_ints/cholesky.irp.f | 197 ++++++++++++++++++++++++++++++- 1 file changed, 193 insertions(+), 4 deletions(-) diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f index 77eb6ddc..2d2a40ab 100644 --- a/src/ao_two_e_ints/cholesky.irp.f +++ b/src/ao_two_e_ints/cholesky.irp.f @@ -1,10 +1,28 @@ -BEGIN_PROVIDER [ integer, cholesky_ao_num_guess ] +BEGIN_PROVIDER [ integer, mini_basis_size, (128) ] + implicit none + BEGIN_DOC + ! Size of the minimal basis set per element + END_DOC + + mini_basis_size(1:2) = 1 + mini_basis_size(3:4) = 2 + mini_basis_size(5:10) = 5 + mini_basis_size(11:12) = 6 + mini_basis_size(13:18) = 9 + mini_basis_size(19:20) = 13 + mini_basis_size(21:36) = 18 + mini_basis_size(37:38) = 22 + mini_basis_size(39:54) = 27 + mini_basis_size(55:) = 36 +END_PROVIDER + + BEGIN_PROVIDER [ integer, cholesky_ao_num_guess ] implicit none BEGIN_DOC ! Number of Cholesky vectors in AO basis END_DOC - cholesky_ao_num_guess = ao_num*ao_num + cholesky_ao_num_guess = ao_num*ao_num !sum(mini_basis_size(int(nucl_charge(:)))) END_PROVIDER BEGIN_PROVIDER [ integer, cholesky_ao_num ] @@ -103,8 +121,10 @@ END_PROVIDER ! Call Lapack cholesky_ao_num = cholesky_ao_num_guess - call pivoted_cholesky(ao_integrals, cholesky_ao_num, ao_cholesky_threshold, ao_num*ao_num, cholesky_ao) - print *, 'Rank: ', cholesky_ao_num, '(', 100.d0*dble(cholesky_ao_num)/dble(ao_num*ao_num), ' %)' +! call pivoted_cholesky(ao_integrals, cholesky_ao_num, ao_cholesky_threshold, ao_num*ao_num, cholesky_ao) + + call direct_cholesky(ao_integrals, cholesky_ao_num, ao_cholesky_threshold, ao_num*ao_num, cholesky_ao) + print *, 'Rank : ', cholesky_ao_num, '(', 100.d0*dble(cholesky_ao_num)/dble(ao_num*ao_num), ' %)' ! Remove mmap double precision, external :: getUnitAndOpen @@ -131,3 +151,172 @@ BEGIN_PROVIDER [ double precision, cholesky_ao_transp, (cholesky_ao_num, ao_num, enddo END_PROVIDER + +subroutine direct_cholesky( A, rank, tau, ndim, L) + implicit none + integer :: ndim + integer, intent(inout) :: rank + double precision, intent(inout) :: A(ndim, ndim) + double precision, intent(out) :: L(ndim, rank) + double precision, intent(in) :: tau + + double precision, parameter :: s = 1.d-2 + double precision, parameter :: dscale = 1.d0 + + double precision, allocatable :: D(:), Delta(:,:) + integer, allocatable :: Lset(:), Dset(:) + + integer :: i,j,k,m,p,q, qj, dj + integer :: N, np, nq + + double precision :: Dmax, Dmin, Qmax, f + allocate( D(ndim), Lset(ndim), Dset(ndim) ) + + L = 0.d0 + + ! 1. + do i=1,ndim + D(i) = A(i,i) + enddo + Dmax = maxval(D) +! print *, '# 1. ', D +! print *, '# 1. ', Dmax + + ! 2. + np=0 + do p=1,ndim + if ( dscale*dscale*Dmax*D(p) > tau*tau ) then + np = np+1 + Lset(np) = p + endif + enddo +! print *, '# 2. ', Lset(:np) + + ! 3. + N = 0 +! print *, '# 3. ', N + + ! 4. + i = 0 +! print *, '# 4. ', i + + ! 5. + do while (Dmax > tau) + ! a. + i = i+1 +! print *, '# 5.a ', i + + ! b. + Dmin = max(s*Dmax, tau) +! print *, '# 5.b ', Dmin + + ! c. + nq=0 + do q=1,np + if ( D(Lset(q)) > Dmin ) then + nq = nq+1 + Dset(nq) = Lset(q) + endif + enddo +! print *, '# 5.c ', Dset(:nq) + + ! d. + allocate(Delta(np,nq)) + do m=1,nq + do k=1,np + Delta(k,m) = A(Lset(k), Dset(m)) + enddo + enddo +! print *, '# 5.d ', Delta + + ! e. + do m=1,nq + do k=1,np + do p=1,N + Delta(k,m) = Delta(k,m) - L(Lset(k),p) * L(Dset(m),p) + enddo + enddo + enddo +! print *, '# 5.e ', Delta + + ! f. + Qmax = D(Dset(1)) + do q=1,nq + Qmax = max(Qmax, D(Dset(q))) + enddo +! print *, '# 5.f ', Qmax + + ! g. + j = 0 +! print *, '# 5.g ', j + + do while ( (j <= nq).and.(Qmax > Dmin) ) + ! i. + j = j+1 + rank = N+j +! print *, '# 5.h.i ', j, rank + + ! ii. + do dj=1,nq + qj = Dset(dj) + if (D(qj) == Qmax) then + exit + endif + enddo +! print *, ' # 5.h.ii ', qj, dj + + ! iii. + f = 1.d0/dsqrt(Qmax) + do p=1,np + L(Lset(p), rank) = Delta(p,dj) * f + enddo +! print *, ' # 5.h.iii ' +! do k=1,20 +! print *, L(k,1:rank) +! enddo + + ! iv. + do m=1, nq + do k=1, np + Delta(k,m) = Delta(k,m) - L(Lset(k),rank) * L(Dset(m),rank) + enddo + enddo + + do k=1, np + D(Lset(k)) = D(Lset(k)) - L(Lset(k),rank) * L(Lset(k),rank) + enddo + + Qmax = D(Dset(1)) + do q=1,np + Qmax = max(Qmax, D(Lset(q))) + enddo +! print *, '# 5.h.iv ', Delta +! print *, '# 5.h.iv ', D +! print *, '# 5.h.iv ', Qmax + + enddo + + deallocate(Delta) + + ! i. + N = N+j +! print *, '# 5.i ', N + + ! j. + Dmax = D(Lset(1)) + do p=1,np + Dmax = max(Dmax, D(Lset(p))) + enddo +! print *, '# 5.j ', Dmax + + np=0 + do p=1,ndim + if ( dscale*dscale*Dmax*D(p) > tau*tau ) then + np = np+1 + Lset(np) = p + endif + enddo +! print *, '# k. ', Lset(:np) + enddo + +end From 487e85c6aef05219eb9e716eae429b5a126600c6 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Mon, 3 Jul 2023 18:19:31 +0200 Subject: [PATCH 04/40] Cholesky OK --- src/ao_two_e_ints/cholesky.irp.f | 188 ++++++++----------------------- 1 file changed, 48 insertions(+), 140 deletions(-) diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f index 2d2a40ab..6a78e9ff 100644 --- a/src/ao_two_e_ints/cholesky.irp.f +++ b/src/ao_two_e_ints/cholesky.irp.f @@ -34,106 +34,11 @@ END_PROVIDER ! = (ik|jl) = sum_a (ik|a).(a|jl) END_DOC - type(c_ptr) :: ptr - integer :: fd, i,j,k,l,m,rank - double precision, pointer :: ao_integrals(:,:,:,:) - double precision, external :: ao_two_e_integral - - ! Store AO integrals in a memory mapped file - call mmap(trim(ezfio_work_dir)//'ao_integrals', & - (/ int(ao_num,8), int(ao_num,8), int(ao_num,8), int(ao_num,8) /), & - 8, fd, .False., ptr) - call c_f_pointer(ptr, ao_integrals, (/ao_num, ao_num, ao_num, ao_num/)) - - print*, 'Providing the AO integrals (Cholesky)' - call wall_time(wall_1) - call cpu_time(cpu_1) - - ao_integrals = 0.d0 - - double precision :: integral, cpu_1, cpu_2, wall_1, wall_2 - logical, external :: ao_two_e_integral_zero - double precision, external :: get_ao_two_e_integral - - if (read_ao_two_e_integrals) then - PROVIDE ao_two_e_integrals_in_map - - !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,j,k,l, integral, wall_2) - do m=0,9 - do l=1+m,ao_num,10 - !$OMP DO SCHEDULE(dynamic) - do j=1,ao_num - do k=1,ao_num - do i=1,ao_num - if (ao_two_e_integral_zero(i,j,k,l)) cycle - integral = get_ao_two_e_integral(i,j,k,l, ao_integrals_map) - ao_integrals(i,k,j,l) = integral - enddo - enddo - enddo - !$OMP END DO NOWAIT - enddo - !$OMP MASTER - call wall_time(wall_2) - print '(I10,'' % in'', 4X, F10.2, '' s.'')', (m+1) * 10, wall_2-wall_1 - !$OMP END MASTER - enddo - !$OMP END PARALLEL - - else - - !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,j,k,l, integral, wall_2) - do m=0,9 - do l=1+m,ao_num,10 - !$OMP DO SCHEDULE(dynamic) - do j=1,l - do k=1,ao_num - do i=1,min(k,j) - if (ao_two_e_integral_zero(i,j,k,l)) cycle - integral = ao_two_e_integral(i,k,j,l) - ao_integrals(i,k,j,l) = integral - ao_integrals(k,i,j,l) = integral - ao_integrals(i,k,l,j) = integral - ao_integrals(k,i,l,j) = integral - ao_integrals(j,l,i,k) = integral - ao_integrals(j,l,k,i) = integral - ao_integrals(l,j,i,k) = integral - ao_integrals(l,j,k,i) = integral - enddo - enddo - enddo - !$OMP END DO NOWAIT - enddo - !$OMP MASTER - call wall_time(wall_2) - print '(I10,'' % in'', 4X, F10.2, '' s.'')', (m+1) * 10, wall_2-wall_1 - !$OMP END MASTER - enddo - !$OMP END PARALLEL - - call wall_time(wall_2) - call cpu_time(cpu_2) - print*, 'AO integrals provided:' - print*, ' cpu time :',cpu_2 - cpu_1, 's' - print*, ' wall time :',wall_2 - wall_1, 's ( x ', (cpu_2-cpu_1)/(wall_2-wall_1+tiny(1.d0)), ' )' - - endif - - ! Call Lapack cholesky_ao_num = cholesky_ao_num_guess -! call pivoted_cholesky(ao_integrals, cholesky_ao_num, ao_cholesky_threshold, ao_num*ao_num, cholesky_ao) - call direct_cholesky(ao_integrals, cholesky_ao_num, ao_cholesky_threshold, ao_num*ao_num, cholesky_ao) + call direct_cholesky(cholesky_ao, ao_num*ao_num, cholesky_ao_num, ao_cholesky_threshold) print *, 'Rank : ', cholesky_ao_num, '(', 100.d0*dble(cholesky_ao_num)/dble(ao_num*ao_num), ' %)' - ! Remove mmap - double precision, external :: getUnitAndOpen - call munmap( & - (/ int(ao_num,8), int(ao_num,8), int(ao_num,8), int(ao_num,8) /), & - 8, fd, ptr) - open(unit=99,file=trim(ezfio_work_dir)//'ao_integrals') - close(99, status='delete') - END_PROVIDER BEGIN_PROVIDER [ double precision, cholesky_ao_transp, (cholesky_ao_num, ao_num, ao_num) ] @@ -152,35 +57,53 @@ BEGIN_PROVIDER [ double precision, cholesky_ao_transp, (cholesky_ao_num, ao_num, END_PROVIDER -subroutine direct_cholesky( A, rank, tau, ndim, L) +subroutine direct_cholesky(L, ndim, rank, tau) implicit none + BEGIN_DOC +! Cholesky-decomposed AOs. +! +! https://www.diva-portal.org/smash/get/diva2:396223/FULLTEXT01.pdf : +! Page 32, section 13.5 + END_DOC integer :: ndim - integer, intent(inout) :: rank - double precision, intent(inout) :: A(ndim, ndim) - double precision, intent(out) :: L(ndim, rank) + integer, intent(out) :: rank + double precision, intent(out) :: L(ndim, ndim) double precision, intent(in) :: tau double precision, parameter :: s = 1.d-2 double precision, parameter :: dscale = 1.d0 double precision, allocatable :: D(:), Delta(:,:) - integer, allocatable :: Lset(:), Dset(:) + integer, allocatable :: Lset(:), Dset(:), addr(:,:) integer :: i,j,k,m,p,q, qj, dj integer :: N, np, nq double precision :: Dmax, Dmin, Qmax, f - allocate( D(ndim), Lset(ndim), Dset(ndim) ) + double precision, external :: get_ao_two_e_integral - L = 0.d0 + allocate( D(ndim), Lset(ndim), Dset(ndim) ) + allocate( addr(2,ndim) ) ! 1. - do i=1,ndim - D(i) = A(i,i) + k=0 + do i=1,ao_num + do j=1,ao_num + k = k+1 + addr(1,k) = i + addr(2,k) = j + enddo enddo + + !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i) + do i=1,ndim + D(i) = get_ao_two_e_integral(addr(1,i), addr(1,i), & + addr(2,i), addr(2,i), & + ao_integrals_map) + enddo + !$OMP END PARALLEL DO + Dmax = maxval(D) -! print *, '# 1. ', D -! print *, '# 1. ', Dmax ! 2. np=0 @@ -190,25 +113,20 @@ subroutine direct_cholesky( A, rank, tau, ndim, L) Lset(np) = p endif enddo -! print *, '# 2. ', Lset(:np) ! 3. N = 0 -! print *, '# 3. ', N - ! 4. + ! 4. i = 0 -! print *, '# 4. ', i - ! 5. + ! 5. do while (Dmax > tau) ! a. i = i+1 -! print *, '# 5.a ', i ! b. Dmin = max(s*Dmax, tau) -! print *, '# 5.b ', Dmin ! c. nq=0 @@ -218,43 +136,42 @@ subroutine direct_cholesky( A, rank, tau, ndim, L) Dset(nq) = Lset(q) endif enddo -! print *, '# 5.c ', Dset(:nq) - ! d. + ! d., e. allocate(Delta(np,nq)) + !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(m,k) do m=1,nq do k=1,np - Delta(k,m) = A(Lset(k), Dset(m)) + Delta(k,m) = get_ao_two_e_integral( & + addr(1,Lset(k)), & + addr(1,Dset(m)), & + addr(2,Lset(k)), & + addr(2,Dset(m)), & + ao_integrals_map) enddo - enddo -! print *, '# 5.d ', Delta - ! e. - do m=1,nq - do k=1,np - do p=1,N - Delta(k,m) = Delta(k,m) - L(Lset(k),p) * L(Dset(m),p) + do p=1,N + f = L(Dset(m),p) + do k=1,np + Delta(k,m) = Delta(k,m) - L(Lset(k),p) * f enddo enddo enddo -! print *, '# 5.e ', Delta + !$OMP END PARALLEL DO ! f. Qmax = D(Dset(1)) do q=1,nq Qmax = max(Qmax, D(Dset(q))) enddo -! print *, '# 5.f ', Qmax ! g. j = 0 -! print *, '# 5.g ', j do while ( (j <= nq).and.(Qmax > Dmin) ) ! i. j = j+1 rank = N+j -! print *, '# 5.h.i ', j, rank ! ii. do dj=1,nq @@ -263,22 +180,18 @@ subroutine direct_cholesky( A, rank, tau, ndim, L) exit endif enddo -! print *, ' # 5.h.ii ', qj, dj ! iii. f = 1.d0/dsqrt(Qmax) do p=1,np L(Lset(p), rank) = Delta(p,dj) * f enddo -! print *, ' # 5.h.iii ' -! do k=1,20 -! print *, L(k,1:rank) -! enddo ! iv. do m=1, nq + f = L(Dset(m),rank) do k=1, np - Delta(k,m) = Delta(k,m) - L(Lset(k),rank) * L(Dset(m),rank) + Delta(k,m) = Delta(k,m) - L(Lset(k),rank) * f enddo enddo @@ -290,9 +203,6 @@ subroutine direct_cholesky( A, rank, tau, ndim, L) do q=1,np Qmax = max(Qmax, D(Lset(q))) enddo -! print *, '# 5.h.iv ', Delta -! print *, '# 5.h.iv ', D -! print *, '# 5.h.iv ', Qmax enddo @@ -300,14 +210,12 @@ subroutine direct_cholesky( A, rank, tau, ndim, L) ! i. N = N+j -! print *, '# 5.i ', N ! j. Dmax = D(Lset(1)) do p=1,np Dmax = max(Dmax, D(Lset(p))) enddo -! print *, '# 5.j ', Dmax np=0 do p=1,ndim @@ -316,7 +224,7 @@ subroutine direct_cholesky( A, rank, tau, ndim, L) Lset(np) = p endif enddo -! print *, '# k. ', Lset(:np) + enddo end From 3c7a10934f51aea2b97ea3196fae6442b1c0030a Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Mon, 3 Jul 2023 19:54:00 +0200 Subject: [PATCH 05/40] Accelerated Cholesky --- external/ezfio | 2 +- external/irpf90 | 2 +- external/qp2-dependencies | 2 +- src/ao_two_e_ints/cholesky.irp.f | 42 ++++++++++++++++++++------------ 4 files changed, 30 insertions(+), 18 deletions(-) diff --git a/external/ezfio b/external/ezfio index 0520b5e2..ed1df9f3 160000 --- a/external/ezfio +++ b/external/ezfio @@ -1 +1 @@ -Subproject commit 0520b5e2cf70e2451c37ce5b7f2f64f6d2e5e956 +Subproject commit ed1df9f3c1f51752656ca98da5693a4119add05c diff --git a/external/irpf90 b/external/irpf90 index 0007f72f..33ca5e10 160000 --- a/external/irpf90 +++ b/external/irpf90 @@ -1 +1 @@ -Subproject commit 0007f72f677fe7d61c5e1ed461882cb239517102 +Subproject commit 33ca5e1018f3bbb5e695e6ee558f5dac0753b271 diff --git a/external/qp2-dependencies b/external/qp2-dependencies index e0d0e02e..f40bde09 160000 --- a/external/qp2-dependencies +++ b/external/qp2-dependencies @@ -1 +1 @@ -Subproject commit e0d0e02e9f5ece138d1520106954a881ab0b8db2 +Subproject commit f40bde0925808bbec0424b57bfcef1b26473a1c8 diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f index 6a78e9ff..dc5040be 100644 --- a/src/ao_two_e_ints/cholesky.irp.f +++ b/src/ao_two_e_ints/cholesky.irp.f @@ -73,7 +73,7 @@ subroutine direct_cholesky(L, ndim, rank, tau) double precision, parameter :: s = 1.d-2 double precision, parameter :: dscale = 1.d0 - double precision, allocatable :: D(:), Delta(:,:) + double precision, allocatable :: D(:), Delta(:,:), Ltmp_p(:,:), Ltmp_q(:,:) integer, allocatable :: Lset(:), Dset(:), addr(:,:) integer :: i,j,k,m,p,q, qj, dj @@ -138,7 +138,16 @@ subroutine direct_cholesky(L, ndim, rank, tau) enddo ! d., e. - allocate(Delta(np,nq)) + allocate(Delta(np,nq), Ltmp_p(max(np,1),max(N,1)), Ltmp_q(max(nq,1),max(N,1))) + do k=1,N + do p=1,np + Ltmp_p(p,k) = L(Lset(p),k) + enddo + do q=1,nq + Ltmp_q(q,k) = L(Dset(q),k) + enddo + enddo + !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(m,k) do m=1,nq do k=1,np @@ -149,17 +158,13 @@ subroutine direct_cholesky(L, ndim, rank, tau) addr(2,Dset(m)), & ao_integrals_map) enddo - - do p=1,N - f = L(Dset(m),p) - do k=1,np - Delta(k,m) = Delta(k,m) - L(Lset(k),p) * f - enddo - enddo enddo !$OMP END PARALLEL DO - ! f. + call dgemm('N','T',np,nq,N,-1.d0, & + Ltmp_p, np, Ltmp_q, nq, 1.d0, Delta, np) + + ! f. Qmax = D(Dset(1)) do q=1,nq Qmax = max(Qmax, D(Dset(q))) @@ -184,19 +189,26 @@ subroutine direct_cholesky(L, ndim, rank, tau) ! iii. f = 1.d0/dsqrt(Qmax) do p=1,np - L(Lset(p), rank) = Delta(p,dj) * f + Ltmp_p(p,1) = Delta(p,dj) * f + L(Lset(p), rank) = Ltmp_p(p,1) + enddo + + do q=1,nq + Ltmp_q(q,1) = L(Dset(q), rank) enddo ! iv. +! call dger(np, nq, -1.d0, Ltmp_p, 1, Ltmp_q, 1, Delta, np) + !$OMP PARALLEL DO PRIVATE(f,m,k) do m=1, nq - f = L(Dset(m),rank) do k=1, np - Delta(k,m) = Delta(k,m) - L(Lset(k),rank) * f + Delta(k,m) = Delta(k,m) - Ltmp_p(k,1) * Ltmp_q(m,1) enddo enddo + !$OMP END PARALLEL DO do k=1, np - D(Lset(k)) = D(Lset(k)) - L(Lset(k),rank) * L(Lset(k),rank) + D(Lset(k)) = D(Lset(k)) - Ltmp_p(k,1) * Ltmp_p(k,1) enddo Qmax = D(Dset(1)) @@ -206,7 +218,7 @@ subroutine direct_cholesky(L, ndim, rank, tau) enddo - deallocate(Delta) + deallocate(Delta, Ltmp_p, Ltmp_q) ! i. N = N+j From 837ec89f1baf8cef63a045a63394edce15f2883d Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Mon, 3 Jul 2023 21:04:50 +0200 Subject: [PATCH 06/40] Accelerate Cholesky --- src/ao_two_e_ints/cholesky.irp.f | 40 ++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f index dc5040be..27aa1aa6 100644 --- a/src/ao_two_e_ints/cholesky.irp.f +++ b/src/ao_two_e_ints/cholesky.irp.f @@ -81,6 +81,9 @@ subroutine direct_cholesky(L, ndim, rank, tau) double precision :: Dmax, Dmin, Qmax, f double precision, external :: get_ao_two_e_integral + logical, external :: ao_two_e_integral_zero + + print *, 'Entering Cholesky' allocate( D(ndim), Lset(ndim), Dset(ndim) ) allocate( addr(2,ndim) ) @@ -139,6 +142,9 @@ subroutine direct_cholesky(L, ndim, rank, tau) ! d., e. allocate(Delta(np,nq), Ltmp_p(max(np,1),max(N,1)), Ltmp_q(max(nq,1),max(N,1))) + !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(m,k,p,q) + + !$OMP DO do k=1,N do p=1,np Ltmp_p(p,k) = L(Lset(p),k) @@ -147,10 +153,24 @@ subroutine direct_cholesky(L, ndim, rank, tau) Ltmp_q(q,k) = L(Dset(q),k) enddo enddo + !$OMP END DO NOWAIT - !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(m,k) + !$OMP DO do m=1,nq do k=1,np + Delta(k,m) = 0.d0 + enddo + enddo + !$OMP END DO + + !$OMP DO + do m=1,nq + do k=1,np + if (ao_two_e_integral_zero( & + addr(1,Lset(k)), & + addr(1,Dset(m)), & + addr(2,Lset(k)), & + addr(2,Dset(m)) ) ) cycle Delta(k,m) = get_ao_two_e_integral( & addr(1,Lset(k)), & addr(1,Dset(m)), & @@ -159,7 +179,9 @@ subroutine direct_cholesky(L, ndim, rank, tau) ao_integrals_map) enddo enddo - !$OMP END PARALLEL DO + !$OMP END DO + + !$OMP END PARALLEL call dgemm('N','T',np,nq,N,-1.d0, & Ltmp_p, np, Ltmp_q, nq, 1.d0, Delta, np) @@ -188,28 +210,36 @@ subroutine direct_cholesky(L, ndim, rank, tau) ! iii. f = 1.d0/dsqrt(Qmax) + !$OMP PARALLEL PRIVATE(m,k) + !$OMP DO do p=1,np Ltmp_p(p,1) = Delta(p,dj) * f L(Lset(p), rank) = Ltmp_p(p,1) enddo + !$OMP END DO + !$OMP DO do q=1,nq Ltmp_q(q,1) = L(Dset(q), rank) enddo + !$OMP END DO ! iv. -! call dger(np, nq, -1.d0, Ltmp_p, 1, Ltmp_q, 1, Delta, np) - !$OMP PARALLEL DO PRIVATE(f,m,k) + !$OMP DO do m=1, nq do k=1, np Delta(k,m) = Delta(k,m) - Ltmp_p(k,1) * Ltmp_q(m,1) enddo enddo - !$OMP END PARALLEL DO + !$OMP END DO NOWAIT +! call dger(np, nq, -1.d0, Ltmp_p, 1, Ltmp_q, 1, Delta, np) + !$OMP DO do k=1, np D(Lset(k)) = D(Lset(k)) - Ltmp_p(k,1) * Ltmp_p(k,1) enddo + !$OMP END DO + !$OMP END PARALLEL Qmax = D(Dset(1)) do q=1,np From 06720f3f210bc548346f9194a8d89761aa228f35 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 4 Jul 2023 01:22:12 +0200 Subject: [PATCH 07/40] integer8 in cholesky --- external/qp2-dependencies | 2 +- src/ao_two_e_ints/cholesky.irp.f | 111 +++++++++++++++++++------------ src/utils/fast_mkl.c | 5 ++ 3 files changed, 73 insertions(+), 45 deletions(-) create mode 100644 src/utils/fast_mkl.c diff --git a/external/qp2-dependencies b/external/qp2-dependencies index f40bde09..e0d0e02e 160000 --- a/external/qp2-dependencies +++ b/external/qp2-dependencies @@ -1 +1 @@ -Subproject commit f40bde0925808bbec0424b57bfcef1b26473a1c8 +Subproject commit e0d0e02e9f5ece138d1520106954a881ab0b8db2 diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f index 27aa1aa6..01c79d12 100644 --- a/src/ao_two_e_ints/cholesky.irp.f +++ b/src/ao_two_e_ints/cholesky.irp.f @@ -74,27 +74,31 @@ subroutine direct_cholesky(L, ndim, rank, tau) double precision, parameter :: dscale = 1.d0 double precision, allocatable :: D(:), Delta(:,:), Ltmp_p(:,:), Ltmp_q(:,:) - integer, allocatable :: Lset(:), Dset(:), addr(:,:) + integer*8, allocatable :: Lset(:), Dset(:), addr(:,:), LDmap(:), DLmap(:) + integer*8, allocatable :: Lset_rev(:), Dset_rev(:) - integer :: i,j,k,m,p,q, qj, dj - integer :: N, np, nq + integer*8 :: i,j,k,m,p,q, qj, dj, p2, q2 + integer*8 :: N, np, nq double precision :: Dmax, Dmin, Qmax, f double precision, external :: get_ao_two_e_integral logical, external :: ao_two_e_integral_zero print *, 'Entering Cholesky' + rank = 0 - allocate( D(ndim), Lset(ndim), Dset(ndim) ) - allocate( addr(2,ndim) ) + allocate( D(ndim), Lset(ndim), LDmap(ndim), DLmap(ndim), Dset(ndim) ) + allocate( Lset_rev(ndim), Dset_rev(ndim) ) + allocate( addr(3,ndim) ) ! 1. k=0 - do i=1,ao_num - do j=1,ao_num + do j=1,ao_num + do i=1,ao_num k = k+1 addr(1,k) = i addr(2,k) = j + addr(3,k) = (i-1)*ao_num + j enddo enddo @@ -110,10 +114,12 @@ subroutine direct_cholesky(L, ndim, rank, tau) ! 2. np=0 + Lset_rev = 0 do p=1,ndim if ( dscale*dscale*Dmax*D(p) > tau*tau ) then np = np+1 Lset(np) = p + Lset_rev(p) = np endif enddo @@ -133,15 +139,21 @@ subroutine direct_cholesky(L, ndim, rank, tau) ! c. nq=0 - do q=1,np - if ( D(Lset(q)) > Dmin ) then + LDmap = 0 + DLmap = 0 + do p=1,np + if ( D(Lset(p)) > Dmin ) then nq = nq+1 - Dset(nq) = Lset(q) + Dset(nq) = Lset(p) + Dset_rev(Dset(nq)) = nq + LDmap(p) = nq + DLmap(nq) = p endif enddo ! d., e. allocate(Delta(np,nq), Ltmp_p(max(np,1),max(N,1)), Ltmp_q(max(nq,1),max(N,1))) + !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(m,k,p,q) !$OMP DO @@ -153,38 +165,47 @@ subroutine direct_cholesky(L, ndim, rank, tau) Ltmp_q(q,k) = L(Dset(q),k) enddo enddo - !$OMP END DO NOWAIT + !$OMP END DO - !$OMP DO + !$OMP DO SCHEDULE(dynamic,8) do m=1,nq - do k=1,np - Delta(k,m) = 0.d0 + + do k=1, nq + ! Apply only to (k,m) pairs both in Dset + p = DLmap(k) + q = Lset_rev(addr(3,Dset(k))) + if ((0 < q).and.(q < p)) cycle + if (ao_two_e_integral_zero( addr(1,Dset(k)), addr(1,Dset(m)), & + addr(2,Dset(k)), addr(2,Dset(m)) ) ) then + Delta(p,m) = 0.d0 + else + Delta(p,m) = get_ao_two_e_integral( addr(1,Dset(k)), addr(1,Dset(m)), & + addr(2,Dset(k)), addr(2,Dset(m)), ao_integrals_map) + endif + Delta(q,m) = Delta(p,m) enddo - enddo - !$OMP END DO - !$OMP DO - do m=1,nq do k=1,np - if (ao_two_e_integral_zero( & - addr(1,Lset(k)), & - addr(1,Dset(m)), & - addr(2,Lset(k)), & - addr(2,Dset(m)) ) ) cycle - Delta(k,m) = get_ao_two_e_integral( & - addr(1,Lset(k)), & - addr(1,Dset(m)), & - addr(2,Lset(k)), & - addr(2,Dset(m)), & - ao_integrals_map) + ! Apply only to (k,m) pairs where k is not in Dset + if (LDmap(k) /= 0) cycle + q = Lset_rev(addr(3,Lset(k))) + if ((0 < q).and.(q < k)) cycle + if (ao_two_e_integral_zero( addr(1,Lset(k)), addr(1,Dset(m)), & + addr(2,Lset(k)), addr(2,Dset(m)) ) ) then + Delta(k,m) = 0.d0 + else + Delta(k,m) = get_ao_two_e_integral( addr(1,Lset(k)), addr(1,Dset(m)), & + addr(2,Lset(k)), addr(2,Dset(m)), ao_integrals_map) + endif + Delta(q,m) = Delta(k,m) enddo enddo !$OMP END DO !$OMP END PARALLEL - call dgemm('N','T',np,nq,N,-1.d0, & - Ltmp_p, np, Ltmp_q, nq, 1.d0, Delta, np) + call dgemm('N','T', int(np,4), int(nq,4), int(N,4), -1.d0, & + Ltmp_p, int(np,4), Ltmp_q, int(nq,4), 1.d0, Delta, int(np,4)) ! f. Qmax = D(Dset(1)) @@ -193,11 +214,11 @@ subroutine direct_cholesky(L, ndim, rank, tau) enddo ! g. - j = 0 - do while ( (j <= nq).and.(Qmax > Dmin) ) + do j=1,nq + + if ( (Qmax <= Dmin).or.(N+j > ndim) ) exit ! i. - j = j+1 rank = N+j ! ii. @@ -208,13 +229,17 @@ subroutine direct_cholesky(L, ndim, rank, tau) endif enddo + L(:, rank) = 0.d0 + ! iii. f = 1.d0/dsqrt(Qmax) - !$OMP PARALLEL PRIVATE(m,k) + + !$OMP PARALLEL PRIVATE(m,p,q,k) DEFAULT(shared) !$OMP DO do p=1,np Ltmp_p(p,1) = Delta(p,dj) * f L(Lset(p), rank) = Ltmp_p(p,1) + D(Lset(p)) = D(Lset(p)) - Ltmp_p(p,1) * Ltmp_p(p,1) enddo !$OMP END DO @@ -223,22 +248,17 @@ subroutine direct_cholesky(L, ndim, rank, tau) Ltmp_q(q,1) = L(Dset(q), rank) enddo !$OMP END DO - + ! iv. - !$OMP DO + + !$OMP DO SCHEDULE(static) do m=1, nq do k=1, np Delta(k,m) = Delta(k,m) - Ltmp_p(k,1) * Ltmp_q(m,1) enddo enddo - !$OMP END DO NOWAIT -! call dger(np, nq, -1.d0, Ltmp_p, 1, Ltmp_q, 1, Delta, np) - - !$OMP DO - do k=1, np - D(Lset(k)) = D(Lset(k)) - Ltmp_p(k,1) * Ltmp_p(k,1) - enddo !$OMP END DO + !$OMP END PARALLEL Qmax = D(Dset(1)) @@ -247,6 +267,7 @@ subroutine direct_cholesky(L, ndim, rank, tau) enddo enddo + print *, Qmax deallocate(Delta, Ltmp_p, Ltmp_q) @@ -260,10 +281,12 @@ subroutine direct_cholesky(L, ndim, rank, tau) enddo np=0 + Lset_rev = 0 do p=1,ndim if ( dscale*dscale*Dmax*D(p) > tau*tau ) then np = np+1 Lset(np) = p + Lset_rev(p) = np endif enddo diff --git a/src/utils/fast_mkl.c b/src/utils/fast_mkl.c new file mode 100644 index 00000000..aa1f82f1 --- /dev/null +++ b/src/utils/fast_mkl.c @@ -0,0 +1,5 @@ +int mkl_serv_intel_cpu_true() { + return 1; +} + + From 6a53e44e9bed0bf6aa40f24c1fc13a25889ef727 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 4 Jul 2023 01:43:44 +0200 Subject: [PATCH 08/40] Fast MKL on AMD --- src/ezfio_files/NEED | 1 + src/ezfio_files/ezfio.irp.f | 7 ++++++- src/utils/c_functions.f90 | 7 ++++++- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/ezfio_files/NEED b/src/ezfio_files/NEED index d06d604c..1766924f 100644 --- a/src/ezfio_files/NEED +++ b/src/ezfio_files/NEED @@ -1,2 +1,3 @@ mpi zmq +utils diff --git a/src/ezfio_files/ezfio.irp.f b/src/ezfio_files/ezfio.irp.f index 4f53b173..e18b2378 100644 --- a/src/ezfio_files/ezfio.irp.f +++ b/src/ezfio_files/ezfio.irp.f @@ -7,6 +7,8 @@ BEGIN_PROVIDER [ character*(1024), ezfio_filename ] PROVIDE mpi_initialized + integer :: i + ! Get the QPACKAGE_INPUT environment variable call getenv('QPACKAGE_INPUT',ezfio_filename) if (ezfio_filename == '') then @@ -44,11 +46,14 @@ BEGIN_PROVIDER [ character*(1024), ezfio_filename ] END_PROVIDER BEGIN_PROVIDER [ character*(1024), ezfio_work_dir ] + use c_functions implicit none BEGIN_DOC ! EZFIO/work/ END_DOC - call ezfio_set_work_empty(.False.) + logical :: b + b = mkl_serv_intel_cpu_true() /= 1 + call ezfio_set_work_empty(b) ezfio_work_dir = trim(ezfio_filename)//'/work/' END_PROVIDER diff --git a/src/utils/c_functions.f90 b/src/utils/c_functions.f90 index 65d4ad62..a9c8900b 100644 --- a/src/utils/c_functions.f90 +++ b/src/utils/c_functions.f90 @@ -57,6 +57,12 @@ module c_functions end subroutine sscanf_sd_c end interface + interface + integer(kind=c_int) function mkl_serv_intel_cpu_true() bind(C) + use iso_c_binding + end function + end interface + contains integer function atoi(a) @@ -131,4 +137,3 @@ subroutine usleep(us) call usleep_c(u) end subroutine usleep - From faf43331edb20391a10ec6cb85a354d471f1612c Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 4 Jul 2023 01:46:49 +0200 Subject: [PATCH 09/40] Fix segfault in CC --- src/ccsd/ccsd_t_space_orb_stoch.irp.f | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/ccsd/ccsd_t_space_orb_stoch.irp.f b/src/ccsd/ccsd_t_space_orb_stoch.irp.f index 31fe67ce..770d629a 100644 --- a/src/ccsd/ccsd_t_space_orb_stoch.irp.f +++ b/src/ccsd/ccsd_t_space_orb_stoch.irp.f @@ -332,7 +332,10 @@ subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energ print '(A)', ' +----------------------+--------------+----------+' print '(A)', '' - deallocate(X_vovv,X_ooov,T_voov,T_oovv) + deallocate(X_vovv) + deallocate(X_ooov) + deallocate(T_voov) + deallocate(T_oovv) end From 9b0c270662c35f856ae98f4832a13d39dca59c8c Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 4 Jul 2023 10:46:05 +0200 Subject: [PATCH 10/40] Block cholesky --- src/ao_two_e_ints/cholesky.irp.f | 46 +++++++++++++++++++-------- src/ccsd/ccsd_t_space_orb_stoch.irp.f | 4 ++- 2 files changed, 36 insertions(+), 14 deletions(-) diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f index 01c79d12..f26a2729 100644 --- a/src/ao_two_e_ints/cholesky.irp.f +++ b/src/ao_two_e_ints/cholesky.irp.f @@ -22,7 +22,8 @@ END_PROVIDER ! Number of Cholesky vectors in AO basis END_DOC - cholesky_ao_num_guess = ao_num*ao_num !sum(mini_basis_size(int(nucl_charge(:)))) + cholesky_ao_num_guess = ao_num*ao_num + cholesky_ao_num_guess = 2* ao_num * sum(mini_basis_size(int(nucl_charge(:)))) END_PROVIDER BEGIN_PROVIDER [ integer, cholesky_ao_num ] @@ -84,6 +85,8 @@ subroutine direct_cholesky(L, ndim, rank, tau) double precision, external :: get_ao_two_e_integral logical, external :: ao_two_e_integral_zero + integer :: block_size, iblock + print *, 'Entering Cholesky' rank = 0 @@ -152,7 +155,10 @@ subroutine direct_cholesky(L, ndim, rank, tau) enddo ! d., e. - allocate(Delta(np,nq), Ltmp_p(max(np,1),max(N,1)), Ltmp_q(max(nq,1),max(N,1))) + block_size = max(N,32) + allocate(Delta(np,nq), & + Ltmp_p(max(np,1),block_size), & + Ltmp_q(max(nq,1),block_size) ) !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(m,k,p,q) @@ -215,12 +221,19 @@ subroutine direct_cholesky(L, ndim, rank, tau) ! g. + iblock = 0 do j=1,nq if ( (Qmax <= Dmin).or.(N+j > ndim) ) exit ! i. rank = N+j + if (iblock == block_size) then + call dgemm('N','T',np,nq,block_size,-1.d0, & + Ltmp_p, np, Ltmp_q, nq, 1.d0, Delta, np) + iblock = 0 + endif + ! ii. do dj=1,nq qj = Dset(dj) @@ -231,33 +244,40 @@ subroutine direct_cholesky(L, ndim, rank, tau) L(:, rank) = 0.d0 + iblock = iblock+1 + do p=1,np + Ltmp_p(p,iblock) = Delta(p,dj) + enddo + call dgemv('N', np, iblock-1, -1.d0, Ltmp_p, np, Ltmp_q(dj,1), nq, 1.d0, & + Ltmp_p(1,iblock), 1) + ! iii. f = 1.d0/dsqrt(Qmax) !$OMP PARALLEL PRIVATE(m,p,q,k) DEFAULT(shared) !$OMP DO do p=1,np - Ltmp_p(p,1) = Delta(p,dj) * f - L(Lset(p), rank) = Ltmp_p(p,1) - D(Lset(p)) = D(Lset(p)) - Ltmp_p(p,1) * Ltmp_p(p,1) + Ltmp_p(p,iblock) = Ltmp_p(p,iblock) * f + L(Lset(p), rank) = Ltmp_p(p,iblock) + D(Lset(p)) = D(Lset(p)) - Ltmp_p(p,iblock) * Ltmp_p(p,iblock) enddo !$OMP END DO !$OMP DO do q=1,nq - Ltmp_q(q,1) = L(Dset(q), rank) + Ltmp_q(q,iblock) = L(Dset(q), rank) enddo !$OMP END DO ! iv. - !$OMP DO SCHEDULE(static) - do m=1, nq - do k=1, np - Delta(k,m) = Delta(k,m) - Ltmp_p(k,1) * Ltmp_q(m,1) - enddo - enddo - !$OMP END DO +! !$OMP DO SCHEDULE(static) +! do m=1, nq +! do k=1, np +! Delta(k,m) = Delta(k,m) - Ltmp_p(k,iblock) * Ltmp_q(m,iblock) +! enddo +! enddo +! !$OMP END DO !$OMP END PARALLEL diff --git a/src/ccsd/ccsd_t_space_orb_stoch.irp.f b/src/ccsd/ccsd_t_space_orb_stoch.irp.f index 770d629a..dbbed19e 100644 --- a/src/ccsd/ccsd_t_space_orb_stoch.irp.f +++ b/src/ccsd/ccsd_t_space_orb_stoch.irp.f @@ -94,6 +94,7 @@ subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energ enddo !$OMP END DO nowait + !$OMP BARRIER !$OMP END PARALLEL double precision, external :: ccsd_t_task_aba @@ -280,9 +281,10 @@ subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energ call wall_time(t01) if ((t01-t00 > 1.0d0).or.(imin >= Nabc)) then - t00 = t01 !$OMP TASKWAIT + call wall_time(t01) + t00 = t01 double precision :: ET, ET2 double precision :: energy_stoch, energy_det From 0242e9c37634ec593c3dcd3dd37d5c4a18ec3b69 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 4 Jul 2023 22:17:31 +0200 Subject: [PATCH 11/40] Changed formats E to ES --- config/ifort_2021_debug.cfg | 66 ++++++ src/ao_two_e_ints/cholesky.irp.f | 192 ++++++++++-------- src/ccsd/ccsd_space_orb_sub.irp.f | 4 +- src/ccsd/ccsd_spin_orb_sub.irp.f | 4 +- src/ccsd/ccsd_t_space_orb_stoch.irp.f | 12 +- src/cipsi/pt2_stoch_routines.irp.f | 2 +- .../dav_diag_dressed_ext_rout.irp.f | 2 +- .../dav_double_dress_ext_rout.irp.f | 2 +- .../dav_dressed_ext_rout.irp.f | 2 +- src/dav_general_mat/dav_ext_rout.irp.f | 2 +- src/dav_general_mat/dav_general.irp.f | 2 +- src/davidson/diagonalization_h_dressed.irp.f | 2 +- .../diagonalization_hcsf_dressed.irp.f | 2 +- .../diagonalization_hs2_dressed.irp.f | 2 +- .../diagonalization_nonsym_h_dressed.irp.f | 2 +- src/determinants/dipole_moments.irp.f | 6 +- src/ezfio_files/ezfio.irp.f | 2 +- src/mo_optimization/first_gradient_opt.irp.f | 2 +- src/tc_bi_ortho/print_tc_dump.irp.f | 10 +- src/tc_scf/molden_lr_mos.irp.f | 14 +- src/tools/molden.irp.f | 4 +- src/tools/print_ci_vectors.irp.f | 2 +- src/utils/format_w_error.irp.f | 2 +- .../rotation_matrix_iterative.irp.f | 4 +- .../trust_region_optimal_lambda.irp.f | 6 +- 25 files changed, 214 insertions(+), 136 deletions(-) create mode 100644 config/ifort_2021_debug.cfg diff --git a/config/ifort_2021_debug.cfg b/config/ifort_2021_debug.cfg new file mode 100644 index 00000000..d70b1465 --- /dev/null +++ b/config/ifort_2021_debug.cfg @@ -0,0 +1,66 @@ +# Common flags +############## +# +# -mkl=[parallel|sequential] : Use the MKL library +# --ninja : Allow the utilisation of ninja. It is mandatory ! +# --align=32 : Align all provided arrays on a 32-byte boundary +# +[COMMON] +FC : ifort -fpic +LAPACK_LIB : -mkl=parallel -lirc -lsvml -limf -lipps +IRPF90 : irpf90 +IRPF90_FLAGS : --ninja --align=32 --assert -DINTEL + +# Global options +################ +# +# 1 : Activate +# 0 : Deactivate +# +[OPTION] +MODE : DEBUG ; [ OPT | PROFILE | DEBUG ] : Chooses the section below +CACHE : 0 ; Enable cache_compile.py +OPENMP : 1 ; Append OpenMP flags + +# Optimization flags +#################### +# +# -xHost : Compile a binary optimized for the current architecture +# -O2 : O3 not better than O2. +# -ip : Inter-procedural optimizations +# -ftz : Flushes denormal results to zero +# +[OPT] +FC : -traceback +FCFLAGS : -msse4.2 -O2 -ip -ftz -g + + +# Profiling flags +################# +# +[PROFILE] +FC : -p -g +FCFLAGS : -msse4.2 -O2 -ip -ftz + + +# Debugging flags +################# +# +# -traceback : Activate backtrace on runtime +# -fpe0 : All floating point exaceptions +# -C : Checks uninitialized variables, array subscripts, etc... +# -g : Extra debugging information +# -msse4.2 : Valgrind needs a very simple x86 executable +# +[DEBUG] +FC : -g -traceback +FCFLAGS : -msse4.2 -check all -debug all -fpe-all=0 -implicitnone + + +# OpenMP flags +################# +# +[OPENMP] +FC : -qopenmp +IRPF90_FLAGS : --openmp + diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f index f26a2729..18180efb 100644 --- a/src/ao_two_e_ints/cholesky.irp.f +++ b/src/ao_two_e_ints/cholesky.irp.f @@ -1,47 +1,3 @@ -BEGIN_PROVIDER [ integer, mini_basis_size, (128) ] - implicit none - BEGIN_DOC - ! Size of the minimal basis set per element - END_DOC - - mini_basis_size(1:2) = 1 - mini_basis_size(3:4) = 2 - mini_basis_size(5:10) = 5 - mini_basis_size(11:12) = 6 - mini_basis_size(13:18) = 9 - mini_basis_size(19:20) = 13 - mini_basis_size(21:36) = 18 - mini_basis_size(37:38) = 22 - mini_basis_size(39:54) = 27 - mini_basis_size(55:) = 36 -END_PROVIDER - - BEGIN_PROVIDER [ integer, cholesky_ao_num_guess ] - implicit none - BEGIN_DOC - ! Number of Cholesky vectors in AO basis - END_DOC - - cholesky_ao_num_guess = ao_num*ao_num - cholesky_ao_num_guess = 2* ao_num * sum(mini_basis_size(int(nucl_charge(:)))) -END_PROVIDER - - BEGIN_PROVIDER [ integer, cholesky_ao_num ] -&BEGIN_PROVIDER [ double precision, cholesky_ao, (ao_num, ao_num, cholesky_ao_num_guess) ] - use mmap_module - implicit none - BEGIN_DOC - ! Cholesky vectors in AO basis: (ik|a): - ! = (ik|jl) = sum_a (ik|a).(a|jl) - END_DOC - - cholesky_ao_num = cholesky_ao_num_guess - - call direct_cholesky(cholesky_ao, ao_num*ao_num, cholesky_ao_num, ao_cholesky_threshold) - print *, 'Rank : ', cholesky_ao_num, '(', 100.d0*dble(cholesky_ao_num)/dble(ao_num*ao_num), ' %)' - -END_PROVIDER - BEGIN_PROVIDER [ double precision, cholesky_ao_transp, (cholesky_ao_num, ao_num, ao_num) ] implicit none BEGIN_DOC @@ -58,36 +14,55 @@ BEGIN_PROVIDER [ double precision, cholesky_ao_transp, (cholesky_ao_num, ao_num, END_PROVIDER -subroutine direct_cholesky(L, ndim, rank, tau) +BEGIN_PROVIDER [ integer, cholesky_ao_num ] +&BEGIN_PROVIDER [ double precision, cholesky_ao, (ao_num, ao_num, 1) ] implicit none BEGIN_DOC -! Cholesky-decomposed AOs. -! -! https://www.diva-portal.org/smash/get/diva2:396223/FULLTEXT01.pdf : -! Page 32, section 13.5 + ! Cholesky vectors in AO basis: (ik|a): + ! = (ik|jl) = sum_a (ik|a).(a|jl) + ! + ! Last dimension of cholesky_ao is cholesky_ao_num END_DOC - integer :: ndim - integer, intent(out) :: rank - double precision, intent(out) :: L(ndim, ndim) - double precision, intent(in) :: tau + + integer :: rank, ndim + double precision :: tau + double precision, pointer :: L(:,:), L_old(:,:) + double precision, parameter :: s = 1.d-2 double precision, parameter :: dscale = 1.d0 double precision, allocatable :: D(:), Delta(:,:), Ltmp_p(:,:), Ltmp_q(:,:) - integer*8, allocatable :: Lset(:), Dset(:), addr(:,:), LDmap(:), DLmap(:) - integer*8, allocatable :: Lset_rev(:), Dset_rev(:) + integer, allocatable :: Lset(:), Dset(:), addr(:,:), LDmap(:), DLmap(:) + integer, allocatable :: Lset_rev(:), Dset_rev(:) - integer*8 :: i,j,k,m,p,q, qj, dj, p2, q2 - integer*8 :: N, np, nq + integer :: i,j,k,m,p,q, qj, dj, p2, q2 + integer :: N, np, nq double precision :: Dmax, Dmin, Qmax, f double precision, external :: get_ao_two_e_integral logical, external :: ao_two_e_integral_zero - integer :: block_size, iblock + integer :: block_size, iblock, ierr + + PROVIDE ao_two_e_integrals_in_map + deallocate(cholesky_ao) + + ndim = ao_num*ao_num + tau = ao_cholesky_threshold + + + allocate(L(ndim,1)) + + print *, '' + print *, 'Cholesky decomposition of AO integrals' + print *, '======================================' + print *, '' + print *, '============ =============' + print *, ' Rank Threshold' + print *, '============ =============' + - print *, 'Entering Cholesky' rank = 0 allocate( D(ndim), Lset(ndim), LDmap(ndim), DLmap(ndim), Dset(ndim) ) @@ -155,10 +130,40 @@ subroutine direct_cholesky(L, ndim, rank, tau) enddo ! d., e. - block_size = max(N,32) - allocate(Delta(np,nq), & - Ltmp_p(max(np,1),block_size), & - Ltmp_q(max(nq,1),block_size) ) + block_size = max(N,24) + + L_old => L + allocate(L(ndim,rank+nq), stat=ierr) + if (ierr /= 0) then + print *, irp_here, ': allocation failed : (Delta(np,nq))' + stop -1 + endif + + !$OMP PARALLEL DO PRIVATE(k) + do k=1,rank + L(:,k) = L_old(:,k) + enddo + !$OMP END PARALLEL DO + + deallocate(L_old) + + allocate(Delta(np,nq), stat=ierr) + if (ierr /= 0) then + print *, irp_here, ': allocation failed : (Delta(np,nq))' + stop -1 + endif + + allocate(Ltmp_p(np,block_size), stat=ierr) + if (ierr /= 0) then + print *, irp_here, ': allocation failed : (Ltmp_p(np,block_size))' + stop -1 + endif + + allocate(Ltmp_q(nq,block_size), stat=ierr) + if (ierr /= 0) then + print *, irp_here, ': allocation failed : (Ltmp_q(nq,block_size))' + stop -1 + endif !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(m,k,p,q) @@ -176,19 +181,18 @@ subroutine direct_cholesky(L, ndim, rank, tau) !$OMP DO SCHEDULE(dynamic,8) do m=1,nq + Delta(:,m) = 0.d0 do k=1, nq ! Apply only to (k,m) pairs both in Dset p = DLmap(k) q = Lset_rev(addr(3,Dset(k))) if ((0 < q).and.(q < p)) cycle - if (ao_two_e_integral_zero( addr(1,Dset(k)), addr(1,Dset(m)), & + if (.not.ao_two_e_integral_zero( addr(1,Dset(k)), addr(1,Dset(m)), & addr(2,Dset(k)), addr(2,Dset(m)) ) ) then - Delta(p,m) = 0.d0 - else Delta(p,m) = get_ao_two_e_integral( addr(1,Dset(k)), addr(1,Dset(m)), & addr(2,Dset(k)), addr(2,Dset(m)), ao_integrals_map) + Delta(q,m) = Delta(p,m) endif - Delta(q,m) = Delta(p,m) enddo do k=1,np @@ -196,22 +200,22 @@ subroutine direct_cholesky(L, ndim, rank, tau) if (LDmap(k) /= 0) cycle q = Lset_rev(addr(3,Lset(k))) if ((0 < q).and.(q < k)) cycle - if (ao_two_e_integral_zero( addr(1,Lset(k)), addr(1,Dset(m)), & + if (.not.ao_two_e_integral_zero( addr(1,Lset(k)), addr(1,Dset(m)), & addr(2,Lset(k)), addr(2,Dset(m)) ) ) then - Delta(k,m) = 0.d0 - else Delta(k,m) = get_ao_two_e_integral( addr(1,Lset(k)), addr(1,Dset(m)), & addr(2,Lset(k)), addr(2,Dset(m)), ao_integrals_map) + Delta(q,m) = Delta(k,m) endif - Delta(q,m) = Delta(k,m) enddo enddo !$OMP END DO !$OMP END PARALLEL - call dgemm('N','T', int(np,4), int(nq,4), int(N,4), -1.d0, & - Ltmp_p, int(np,4), Ltmp_q, int(nq,4), 1.d0, Delta, int(np,4)) + if (N>0) then + call dgemm('N','T', np, nq, N, -1.d0, & + Ltmp_p, np, Ltmp_q, nq, 1.d0, Delta, np) + endif ! f. Qmax = D(Dset(1)) @@ -242,14 +246,18 @@ subroutine direct_cholesky(L, ndim, rank, tau) endif enddo - L(:, rank) = 0.d0 + L(1:ndim, rank) = 0.d0 iblock = iblock+1 do p=1,np Ltmp_p(p,iblock) = Delta(p,dj) enddo - call dgemv('N', np, iblock-1, -1.d0, Ltmp_p, np, Ltmp_q(dj,1), nq, 1.d0, & + + ! iv. + if (iblock > 1) then + call dgemv('N', np, iblock-1, -1.d0, Ltmp_p, np, Ltmp_q(dj,1), nq, 1.d0, & Ltmp_p(1,iblock), 1) + endif ! iii. f = 1.d0/dsqrt(Qmax) @@ -269,27 +277,20 @@ subroutine direct_cholesky(L, ndim, rank, tau) enddo !$OMP END DO - ! iv. - -! !$OMP DO SCHEDULE(static) -! do m=1, nq -! do k=1, np -! Delta(k,m) = Delta(k,m) - Ltmp_p(k,iblock) * Ltmp_q(m,iblock) -! enddo -! enddo -! !$OMP END DO - !$OMP END PARALLEL Qmax = D(Dset(1)) - do q=1,np - Qmax = max(Qmax, D(Lset(q))) + do q=1,nq + Qmax = max(Qmax, D(Dset(q))) enddo enddo - print *, Qmax - deallocate(Delta, Ltmp_p, Ltmp_q) + print '(I10, 4X, ES12.3)', rank, Qmax + + deallocate(Delta, stat=ierr) + deallocate(Ltmp_p, stat=ierr) + deallocate(Ltmp_q, stat=ierr) ! i. N = N+j @@ -312,4 +313,15 @@ subroutine direct_cholesky(L, ndim, rank, tau) enddo -end + allocate(cholesky_ao(ao_num,ao_num,rank)) + call dcopy(ndim*rank, L, 1, cholesky_ao, 1) + deallocate(L) + cholesky_ao_num = rank + + print *, '============ =============' + print *, '' + print *, 'Rank : ', cholesky_ao_num, '(', 100.d0*dble(cholesky_ao_num)/dble(ao_num*ao_num), ' %)' + print *, '' + +END_PROVIDER + diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 40c57188..d23073b8 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -112,7 +112,7 @@ subroutine run_ccsd_space_orb ! Energy call ccsd_energy_space(nO,nV,tau,t1,energy) - write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,1pE10.2,A3,1pE10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |' + write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |' nb_iter = nb_iter + 1 if (max_r < cc_thresh_conv .or. nb_iter > cc_max_iter) then @@ -132,7 +132,7 @@ subroutine run_ccsd_space_orb print*,'' write(*,'(A15,F18.12,A3)') ' E(CCSD) = ', uncorr_energy+energy, ' Ha' write(*,'(A15,F18.12,A3)') ' Correlation = ', energy, ' Ha' - write(*,'(A15,1pE10.2,A3)')' Conv = ', max_r + write(*,'(A15,ES10.2,A3)')' Conv = ', max_r print*,'' if (write_amplitudes) then diff --git a/src/ccsd/ccsd_spin_orb_sub.irp.f b/src/ccsd/ccsd_spin_orb_sub.irp.f index a267cc45..09d6a0fe 100644 --- a/src/ccsd/ccsd_spin_orb_sub.irp.f +++ b/src/ccsd/ccsd_spin_orb_sub.irp.f @@ -241,7 +241,7 @@ subroutine run_ccsd_spin_orb call ccsd_energy_spin(nO,nV,t1,t2,F_ov,v_oovv,energy) call wall_time(tfi) - write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,1pE10.2,A3,1pE10.2,A2)') ' | ',nb_iter,' | ', & + write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', & uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |' if (cc_dev) then print*,'Total:',tfi-tbi,'s' @@ -266,7 +266,7 @@ subroutine run_ccsd_spin_orb print*,'' write(*,'(A15,F18.12,A3)') ' E(CCSD) = ', uncorr_energy+energy, ' Ha' write(*,'(A15,F18.12,A3)') ' Correlation = ', energy, ' Ha' - write(*,'(A15,1pE10.2,A3)')' Conv = ', max_r + write(*,'(A15,ES10.2,A3)')' Conv = ', max_r print*,'' if (write_amplitudes) then diff --git a/src/ccsd/ccsd_t_space_orb_stoch.irp.f b/src/ccsd/ccsd_t_space_orb_stoch.irp.f index dbbed19e..13fa4f1a 100644 --- a/src/ccsd/ccsd_t_space_orb_stoch.irp.f +++ b/src/ccsd/ccsd_t_space_orb_stoch.irp.f @@ -210,9 +210,9 @@ subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energ Pabc(:) = 1.d0/Pabc(:) print '(A)', '' - print '(A)', ' +----------------------+--------------+----------+' - print '(A)', ' | E(CCSD(T)) | Error | % |' - print '(A)', ' +----------------------+--------------+----------+' + print '(A)', ' ======================= ============== ==========' + print '(A)', ' E(CCSD(T)) Error % ' + print '(A)', ' ======================= ============== ==========' call wall_time(t00) @@ -257,7 +257,7 @@ subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energ if (imin >= bounds(2,isample)) then cycle endif - ieta = binary_search(waccu,(eta + dble(isample-1))/dble(nbuckets),Nabc) + ieta = binary_search(waccu,(eta + dble(isample-1))/dble(nbuckets),Nabc)+1 if (sampled(ieta) == -1_8) then sampled(ieta) = 0_8 @@ -324,14 +324,14 @@ subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energ energy = energy_det + energy_stoch - print '('' | '',F20.8, '' | '', E12.4,'' | '', F8.2,'' |'')', eccsd+energy, dsqrt(variance/(norm-1.d0)), 100.*real(Ncomputed)/real(Nabc) + print '('' '',F20.8, '' '', ES12.4,'' '', F8.2,'' '')', eccsd+energy, dsqrt(variance/(norm-1.d0)), 100.*real(Ncomputed)/real(Nabc) endif !$OMP END MASTER if (imin >= Nabc) exit enddo !$OMP END PARALLEL - print '(A)', ' +----------------------+--------------+----------+' + print '(A)', ' ======================= ============== ========== ' print '(A)', '' deallocate(X_vovv) diff --git a/src/cipsi/pt2_stoch_routines.irp.f b/src/cipsi/pt2_stoch_routines.irp.f index 7909007a..3b048c14 100644 --- a/src/cipsi/pt2_stoch_routines.irp.f +++ b/src/cipsi/pt2_stoch_routines.irp.f @@ -591,7 +591,7 @@ subroutine pt2_collector(zmq_socket_pull, E, relative_error, pt2_data, pt2_data_ time-time0 ! Old print - !print '(I10, X, F12.6, X, G10.3, X, F10.6, X, G10.3, X, F10.6, X, G10.3, X, F10.1,1pE16.6,1pE16.6)', c, & + !print '(I10, X, F12.6, X, G10.3, X, F10.6, X, G10.3, X, F10.6, X, G10.3, X, F10.1,ES16.6,ES16.6)', c, & ! pt2_data % pt2(pt2_stoch_istate) +E, & ! pt2_data_err % pt2(pt2_stoch_istate), & ! pt2_data % variance(pt2_stoch_istate), & diff --git a/src/dav_general_mat/dav_diag_dressed_ext_rout.irp.f b/src/dav_general_mat/dav_diag_dressed_ext_rout.irp.f index 73608720..0dc939cb 100644 --- a/src/dav_general_mat/dav_diag_dressed_ext_rout.irp.f +++ b/src/dav_general_mat/dav_diag_dressed_ext_rout.irp.f @@ -331,7 +331,7 @@ subroutine davidson_general_ext_rout_diag_dressed(u_in,H_jj,Dress_jj,energies,sz !don't print continue else - write(*,'(1X,I3,1X,100(1X,F16.10,1X,F11.6,1X,E11.3))') iter-1, to_print(1:2,1:N_st) + write(*,'(1X,I3,1X,100(1X,F16.10,1X,F11.6,1X,ES11.3))') iter-1, to_print(1:2,1:N_st) endif ! Check convergence diff --git a/src/dav_general_mat/dav_double_dress_ext_rout.irp.f b/src/dav_general_mat/dav_double_dress_ext_rout.irp.f index e59d21d1..24f4fa10 100644 --- a/src/dav_general_mat/dav_double_dress_ext_rout.irp.f +++ b/src/dav_general_mat/dav_double_dress_ext_rout.irp.f @@ -405,7 +405,7 @@ subroutine dav_double_dressed(u_in,H_jj,Dress_jj,Dressing_vec,idx_dress,energies !don't print continue else - write(*,'(1X,I3,1X,100(1X,F16.10,1X,E11.3))') iter-1, to_print(1:2,1:N_st) + write(*,'(1X,I3,1X,100(1X,F16.10,1X,ES11.3))') iter-1, to_print(1:2,1:N_st) endif ! Check convergence diff --git a/src/dav_general_mat/dav_dressed_ext_rout.irp.f b/src/dav_general_mat/dav_dressed_ext_rout.irp.f index c045aa1a..cedaaf0a 100644 --- a/src/dav_general_mat/dav_dressed_ext_rout.irp.f +++ b/src/dav_general_mat/dav_dressed_ext_rout.irp.f @@ -398,7 +398,7 @@ subroutine davidson_general_ext_rout_dressed(u_in,H_jj,energies,sze,N_st,N_st_di !don't print continue else - write(*,'(1X,I3,1X,100(1X,F16.10,1X,E11.3))') iter-1, to_print(1:2,1:N_st) + write(*,'(1X,I3,1X,100(1X,F16.10,1X,ES11.3))') iter-1, to_print(1:2,1:N_st) endif ! Check convergence diff --git a/src/dav_general_mat/dav_ext_rout.irp.f b/src/dav_general_mat/dav_ext_rout.irp.f index 2621e3a9..deb7e3a9 100644 --- a/src/dav_general_mat/dav_ext_rout.irp.f +++ b/src/dav_general_mat/dav_ext_rout.irp.f @@ -316,7 +316,7 @@ subroutine davidson_general_ext_rout(u_in,H_jj,energies,sze,N_st,N_st_diag_in,co !don't print continue else - write(*,'(1X,I3,1X,100(1X,F16.10,1X,F11.6,1X,E11.3))') iter-1, to_print(1:2,1:N_st) + write(*,'(1X,I3,1X,100(1X,F16.10,1X,F11.6,1X,ES11.3))') iter-1, to_print(1:2,1:N_st) endif ! Check convergence diff --git a/src/dav_general_mat/dav_general.irp.f b/src/dav_general_mat/dav_general.irp.f index cd9124e6..9940bf1e 100644 --- a/src/dav_general_mat/dav_general.irp.f +++ b/src/dav_general_mat/dav_general.irp.f @@ -327,7 +327,7 @@ subroutine davidson_general(u_in,H_jj,energies,dim_in,sze,N_st,N_st_diag_in,conv !don't print continue else - write(*,'(1X,I3,1X,100(1X,F16.10,1X,F11.6,1X,E11.3))') iter-1, to_print(1:2,1:N_st) + write(*,'(1X,I3,1X,100(1X,F16.10,1X,F11.6,1X,ES11.3))') iter-1, to_print(1:2,1:N_st) endif ! Check convergence diff --git a/src/davidson/diagonalization_h_dressed.irp.f b/src/davidson/diagonalization_h_dressed.irp.f index 26853df9..b7179c18 100644 --- a/src/davidson/diagonalization_h_dressed.irp.f +++ b/src/davidson/diagonalization_h_dressed.irp.f @@ -457,7 +457,7 @@ subroutine davidson_diag_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,N_st,N_st_dia !don't print continue else - write(*,'(1X,I3,1X,100(1X,F16.10,1X,E11.3))') iter-1, to_print(1:2,1:N_st) + write(*,'(1X,I3,1X,100(1X,F16.10,1X,ES11.3))') iter-1, to_print(1:2,1:N_st) endif ! Check convergence diff --git a/src/davidson/diagonalization_hcsf_dressed.irp.f b/src/davidson/diagonalization_hcsf_dressed.irp.f index 0c3c6f92..fa8aff80 100644 --- a/src/davidson/diagonalization_hcsf_dressed.irp.f +++ b/src/davidson/diagonalization_hcsf_dressed.irp.f @@ -477,7 +477,7 @@ subroutine davidson_diag_csf_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,sze_csf,N !don't print continue else - write(*,'(1X,I3,1X,100(1X,F16.10,1X,E11.3))') iter-1, to_print(1:2,1:N_st) + write(*,'(1X,I3,1X,100(1X,F16.10,1X,ES11.3))') iter-1, to_print(1:2,1:N_st) endif ! Check convergence diff --git a/src/davidson/diagonalization_hs2_dressed.irp.f b/src/davidson/diagonalization_hs2_dressed.irp.f index 45258c1c..7b559925 100644 --- a/src/davidson/diagonalization_hs2_dressed.irp.f +++ b/src/davidson/diagonalization_hs2_dressed.irp.f @@ -611,7 +611,7 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_ !don't print continue else - write(*,'(1X,I3,1X,100(1X,F16.10,1X,F11.6,1X,E11.3))') iter-1, to_print(1:3,1:N_st) + write(*,'(1X,I3,1X,100(1X,F16.10,1X,F11.6,1X,ES11.3))') iter-1, to_print(1:3,1:N_st) endif ! Check convergence diff --git a/src/davidson/diagonalization_nonsym_h_dressed.irp.f b/src/davidson/diagonalization_nonsym_h_dressed.irp.f index 3ff060a6..96ca84ab 100644 --- a/src/davidson/diagonalization_nonsym_h_dressed.irp.f +++ b/src/davidson/diagonalization_nonsym_h_dressed.irp.f @@ -436,7 +436,7 @@ subroutine davidson_diag_nonsym_hjj(dets_in, u_in, H_jj, energies, dim_in, sze, !don't print continue else - write(*, '(1X, I3, 1X, 100(1X, F16.10, 1X, E11.3))') iter-1, to_print(1:2,1:N_st) + write(*, '(1X, I3, 1X, 100(1X, F16.10, 1X, ES11.3))') iter-1, to_print(1:2,1:N_st) endif ! Check convergence diff --git a/src/determinants/dipole_moments.irp.f b/src/determinants/dipole_moments.irp.f index 06fca0cd..e445c56b 100644 --- a/src/determinants/dipole_moments.irp.f +++ b/src/determinants/dipole_moments.irp.f @@ -66,9 +66,9 @@ END_PROVIDER write(*,'(i16)',advance='no') i end do write(*,*) '' - write(*,'(A17,100(1pE16.8))') 'x_dipole_moment = ',x_dipole_moment - write(*,'(A17,100(1pE16.8))') 'y_dipole_moment = ',y_dipole_moment - write(*,'(A17,100(1pE16.8))') 'z_dipole_moment = ',z_dipole_moment + write(*,'(A17,100(ES16.8))') 'x_dipole_moment = ',x_dipole_moment + write(*,'(A17,100(ES16.8))') 'y_dipole_moment = ',y_dipole_moment + write(*,'(A17,100(ES16.8))') 'z_dipole_moment = ',z_dipole_moment !print*, 'x_dipole_moment = ',x_dipole_moment !print*, 'y_dipole_moment = ',y_dipole_moment !print*, 'z_dipole_moment = ',z_dipole_moment diff --git a/src/ezfio_files/ezfio.irp.f b/src/ezfio_files/ezfio.irp.f index e18b2378..7e414a04 100644 --- a/src/ezfio_files/ezfio.irp.f +++ b/src/ezfio_files/ezfio.irp.f @@ -5,7 +5,7 @@ BEGIN_PROVIDER [ character*(1024), ezfio_filename ] ! variable if it is set, or as the 1st argument of the command line. END_DOC - PROVIDE mpi_initialized + PROVIDE mpi_initialized output_wall_time_0 integer :: i diff --git a/src/mo_optimization/first_gradient_opt.irp.f b/src/mo_optimization/first_gradient_opt.irp.f index d6918a00..f08b9d1f 100644 --- a/src/mo_optimization/first_gradient_opt.irp.f +++ b/src/mo_optimization/first_gradient_opt.irp.f @@ -111,7 +111,7 @@ subroutine first_gradient_opt(n,v_grad) if (debug) then print*,'Matrix containing the gradient :' do i = 1, mo_num - write(*,'(100(E12.5))') A(i,1:mo_num) + write(*,'(100(ES12.5))') A(i,1:mo_num) enddo endif diff --git a/src/tc_bi_ortho/print_tc_dump.irp.f b/src/tc_bi_ortho/print_tc_dump.irp.f index 868de444..37dfe051 100644 --- a/src/tc_bi_ortho/print_tc_dump.irp.f +++ b/src/tc_bi_ortho/print_tc_dump.irp.f @@ -62,7 +62,7 @@ subroutine KMat_tilde_dump() do j = 1, mo_num do i = 1, mo_num ! TCHint convention - write(33, '(E15.7, 4X, 4(I4, 2X))') mo_bi_ortho_tc_two_e_chemist(j,i,l,k), i, j, k, l + write(33, '(ES15.7, 4X, 4(I4, 2X))') mo_bi_ortho_tc_two_e_chemist(j,i,l,k), i, j, k, l enddo enddo enddo @@ -71,7 +71,7 @@ subroutine KMat_tilde_dump() do j = 1, mo_num do i = 1, mo_num ! TCHint convention - write(33, '(E15.7, 4X, 4(I4, 2X))') mo_bi_ortho_tc_one_e(i,j), i, j, 0, 0 + write(33, '(ES15.7, 4X, 4(I4, 2X))') mo_bi_ortho_tc_one_e(i,j), i, j, 0, 0 enddo enddo @@ -128,7 +128,7 @@ subroutine ERI_dump() do k = 1, mo_num do j = 1, mo_num do i = 1, mo_num - write(33, '(4(I4, 2X), 4X, E15.7)') i, j, k, l, a1(i,j,k,l) + write(33, '(4(I4, 2X), 4X, ES15.7)') i, j, k, l, a1(i,j,k,l) enddo enddo enddo @@ -167,8 +167,8 @@ subroutine LMat_tilde_dump() !write(33, '(6(I4, 2X), 4X, E15.7)') i, j, k, l, m, n, integral ! TCHint convention if(dabs(integral).gt.1d-10) then - write(33, '(E15.7, 4X, 6(I4, 2X))') -integral/3.d0, i, j, k, l, m, n - !write(33, '(E15.7, 4X, 6(I4, 2X))') -integral/3.d0, l, m, n, i, j, k + write(33, '(ES15.7, 4X, 6(I4, 2X))') -integral/3.d0, i, j, k, l, m, n + !write(33, '(ES15.7, 4X, 6(I4, 2X))') -integral/3.d0, l, m, n, i, j, k endif enddo enddo diff --git a/src/tc_scf/molden_lr_mos.irp.f b/src/tc_scf/molden_lr_mos.irp.f index b86009ee..98c7b230 100644 --- a/src/tc_scf/molden_lr_mos.irp.f +++ b/src/tc_scf/molden_lr_mos.irp.f @@ -72,7 +72,7 @@ subroutine molden_lr write(i_unit_output,*) character_shell, ao_prim_num(i_ao), '1.00' do k = 1, ao_prim_num(i_ao) i_prim +=1 - write(i_unit_output,'(E20.10,2X,E20.10)') ao_expo(i_ao,k), ao_coef(i_ao,k) + write(i_unit_output,'(ES20.10,2X,ES20.10)') ao_expo(i_ao,k), ao_coef(i_ao,k) enddo l = i_ao do while ( ao_l(l) == ao_l(i_ao) ) @@ -170,7 +170,7 @@ subroutine molden_lr write (i_unit_output,*) 'Spin= Alpha' write (i_unit_output,*) 'Occup=', mo_occ(i) do j=1,ao_num - write(i_unit_output, '(I6,2X,E20.10)') j, mo_r_coef(iorder(j),i) + write(i_unit_output, '(I6,2X,ES20.10)') j, mo_r_coef(iorder(j),i) enddo write (i_unit_output,*) 'Sym= 1' @@ -178,7 +178,7 @@ subroutine molden_lr write (i_unit_output,*) 'Spin= Alpha' write (i_unit_output,*) 'Occup=', mo_occ(i) do j=1,ao_num - write(i_unit_output, '(I6,2X,E20.10)') j, mo_l_coef(iorder(j),i) + write(i_unit_output, '(I6,2X,ES20.10)') j, mo_l_coef(iorder(j),i) enddo enddo close(i_unit_output) @@ -235,7 +235,7 @@ subroutine molden_l() write(i_unit_output,*) character_shell, ao_prim_num(i_ao), '1.00' do k = 1, ao_prim_num(i_ao) i_prim +=1 - write(i_unit_output,'(E20.10,2X,E20.10)') ao_expo(i_ao,k), ao_coef(i_ao,k) + write(i_unit_output,'(ES20.10,2X,ES20.10)') ao_expo(i_ao,k), ao_coef(i_ao,k) enddo l = i_ao do while ( ao_l(l) == ao_l(i_ao) ) @@ -333,7 +333,7 @@ subroutine molden_l() write (i_unit_output,*) 'Spin= Alpha' write (i_unit_output,*) 'Occup=', mo_occ(i) do j=1,ao_num - write(i_unit_output, '(I6,2X,E20.10)') j, mo_l_coef(iorder(j),i) + write(i_unit_output, '(I6,2X,ES20.10)') j, mo_l_coef(iorder(j),i) enddo enddo close(i_unit_output) @@ -390,7 +390,7 @@ subroutine molden_r() write(i_unit_output,*) character_shell, ao_prim_num(i_ao), '1.00' do k = 1, ao_prim_num(i_ao) i_prim +=1 - write(i_unit_output,'(E20.10,2X,E20.10)') ao_expo(i_ao,k), ao_coef(i_ao,k) + write(i_unit_output,'(ES20.10,2X,ES20.10)') ao_expo(i_ao,k), ao_coef(i_ao,k) enddo l = i_ao do while ( ao_l(l) == ao_l(i_ao) ) @@ -488,7 +488,7 @@ subroutine molden_r() write (i_unit_output,*) 'Spin= Alpha' write (i_unit_output,*) 'Occup=', mo_occ(i) do j=1,ao_num - write(i_unit_output, '(I6,2X,E20.10)') j, mo_r_coef(iorder(j),i) + write(i_unit_output, '(I6,2X,ES20.10)') j, mo_r_coef(iorder(j),i) enddo enddo close(i_unit_output) diff --git a/src/tools/molden.irp.f b/src/tools/molden.irp.f index 830a141e..e5902a6f 100644 --- a/src/tools/molden.irp.f +++ b/src/tools/molden.irp.f @@ -44,7 +44,7 @@ program molden write(i_unit_output,*) character_shell, ao_prim_num(i_ao), '1.00' do k = 1, ao_prim_num(i_ao) i_prim +=1 - write(i_unit_output,'(E20.10,2X,E20.10)') ao_expo(i_ao,k), ao_coef(i_ao,k) + write(i_unit_output,'(ES20.10,2X,ES20.10)') ao_expo(i_ao,k), ao_coef(i_ao,k) enddo l = i_ao do while ( ao_l(l) == ao_l(i_ao) ) @@ -142,7 +142,7 @@ program molden write (i_unit_output,*) 'Spin= Alpha' write (i_unit_output,*) 'Occup=', mo_occ(i) do j=1,ao_num - write(i_unit_output, '(I6,2X,E20.10)') j, mo_coef(iorder(j),i) + write(i_unit_output, '(I6,2X,ES20.10)') j, mo_coef(iorder(j),i) enddo enddo close(i_unit_output) diff --git a/src/tools/print_ci_vectors.irp.f b/src/tools/print_ci_vectors.irp.f index 97dfdc0b..d5f86213 100644 --- a/src/tools/print_ci_vectors.irp.f +++ b/src/tools/print_ci_vectors.irp.f @@ -28,7 +28,7 @@ subroutine routine do i = 1, N_det print *, 'Determinant ', i call debug_det(psi_det(1,1,i),N_int) - print '(4E20.12,X)', (psi_coef(i,k), k=1,N_states) + print '(4ES20.12,X)', (psi_coef(i,k), k=1,N_states) print *, '' print *, '' enddo diff --git a/src/utils/format_w_error.irp.f b/src/utils/format_w_error.irp.f index 7f7458b6..c253456e 100644 --- a/src/utils/format_w_error.irp.f +++ b/src/utils/format_w_error.irp.f @@ -39,7 +39,7 @@ subroutine format_w_error(value,error,size_nb,max_nb_digits,format_value,str_err write(str_size,'(I3)') size_nb ! Error - write(str_exp,'(1pE20.0)') error + write(str_exp,'(ES20.0)') error str_error = trim(adjustl(str_exp)) ! Number of digit: Y (FX.Y) from the exponent diff --git a/src/utils_trust_region/rotation_matrix_iterative.irp.f b/src/utils_trust_region/rotation_matrix_iterative.irp.f index f268df04..db3d5c99 100644 --- a/src/utils_trust_region/rotation_matrix_iterative.irp.f +++ b/src/utils_trust_region/rotation_matrix_iterative.irp.f @@ -73,7 +73,7 @@ subroutine rotation_matrix_iterative(m,X,R) !print*,'R' !do i = 1, m - ! write(*,'(10(E12.5))') R(i,:) + ! write(*,'(10(ES12.5))') R(i,:) !enddo do i = 1, m @@ -82,7 +82,7 @@ subroutine rotation_matrix_iterative(m,X,R) !print*,'RRT' !do i = 1, m - ! write(*,'(10(E12.5))') RRT(i,:) + ! write(*,'(10(ES12.5))') RRT(i,:) !enddo max_elem = 0d0 diff --git a/src/utils_trust_region/trust_region_optimal_lambda.irp.f b/src/utils_trust_region/trust_region_optimal_lambda.irp.f index b7dcf875..e98bbfb7 100644 --- a/src/utils_trust_region/trust_region_optimal_lambda.irp.f +++ b/src/utils_trust_region/trust_region_optimal_lambda.irp.f @@ -336,7 +336,7 @@ subroutine trust_region_optimal_lambda(n,e_val,tmp_wtg,delta,lambda) d_1 = d1_norm_inverse_trust_region_omp(n,e_val,tmp_wtg,lambda,delta) ! first derivative of (1/||x(lambda)||^2 - 1/delta^2)^2 d_2 = d2_norm_inverse_trust_region_omp(n,e_val,tmp_wtg,lambda,delta) ! second derivative of (1/||x(lambda)||^2 - 1/delta^2)^2 endif - !write(*,'(a,E12.5,a,E12.5)') ' 1st and 2nd derivative: ', d_1,', ', d_2 + !write(*,'(a,ES12.5,a,ES12.5)') ' 1st and 2nd derivative: ', d_1,', ', d_2 ! Newton's step y = -(1d0/DABS(d_2))*d_1 @@ -345,7 +345,7 @@ subroutine trust_region_optimal_lambda(n,e_val,tmp_wtg,delta,lambda) if (DABS(y) > alpha) then y = alpha * (y/DABS(y)) ! preservation of the sign of y endif - !write(*,'(a,E12.5)') ' Step length: ', y + !write(*,'(a,ES12.5)') ' Step length: ', y ! Predicted value of (||x(lambda)||^2 - delta^2)^2, Taylor series model = prev_f_R + d_1 * y + 0.5d0 * d_2 * y**2 @@ -414,7 +414,7 @@ subroutine trust_region_optimal_lambda(n,e_val,tmp_wtg,delta,lambda) else alpha = 0.25d0 * alpha endif - !write(*,'(a,E12.5)') ' New trust length alpha: ', alpha + !write(*,'(a,ES12.5)') ' New trust length alpha: ', alpha ! cancellaion of the step if rho < 0.1 if (rho_2 < thresh_rho_2) then !0.1d0) then From 119779595aba655ce1effe2f7cb93ea26701c226 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 4 Jul 2023 23:43:26 +0200 Subject: [PATCH 12/40] Accelerate Cholesky CCSD --- src/ccsd/ccsd_space_orb_sub.irp.f | 13 +---- src/utils_cc/mo_integrals_cc.irp.f | 91 ++++++++++++++++++++++++------ 2 files changed, 76 insertions(+), 28 deletions(-) diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index d23073b8..3c9a2cfc 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -1549,19 +1549,12 @@ subroutine compute_B1_gam(nO,nV,t1,t2,B1,gam) double precision, allocatable :: X_vvvo(:,:,:), Y_vvvv(:,:,:) allocate(X_vvvo(nV,nV,nO), Y_vvvv(nV,nV,nV)) ! ! B1(a,b,beta,gam) = cc_space_v_vvvv(a,b,beta,gam) + call gen_v_space(cc_nVa,cc_nVa,cc_nVa,1, cc_list_vir,cc_list_vir,cc_list_vir,(/ gam /), B1) + !$omp parallel & !$omp shared(nO,nV,B1,cc_space_v_vvvv,cc_space_v_vvov,X_vvvo,gam) & !$omp private(a,b,beta) & !$omp default(none) - !$omp do - do beta = 1, nV - do b = 1, nV - do a = 1, nV - B1(a,b,beta) = cc_space_v_vvvv(a,b,beta,gam) - enddo - enddo - enddo - !$omp end do nowait do i = 1, nO !$omp do do b = 1, nV @@ -1569,7 +1562,7 @@ subroutine compute_B1_gam(nO,nV,t1,t2,B1,gam) X_vvvo(a,b,i) = cc_space_v_vvov(a,b,i,gam) enddo enddo - !$omp end do nowait + !$omp end do enddo !$omp end parallel diff --git a/src/utils_cc/mo_integrals_cc.irp.f b/src/utils_cc/mo_integrals_cc.irp.f index dafcf7af..2e7ecdd4 100644 --- a/src/utils_cc/mo_integrals_cc.irp.f +++ b/src/utils_cc/mo_integrals_cc.irp.f @@ -48,32 +48,86 @@ subroutine gen_v_space(n1,n2,n3,n4,list1,list2,list3,list4,v) integer :: i1,i2,i3,i4,idx1,idx2,idx3,idx4,k if (do_ao_cholesky) then - double precision, allocatable :: buffer(:,:,:) - !$OMP PARALLEL & - !$OMP SHARED(n1,n2,n3,n4,list1,list2,list3,list4,v,mo_num,cholesky_mo_transp,cholesky_ao_num) & - !$OMP PRIVATE(i1,i2,i3,i4,idx1,idx2,idx3,idx4,k,buffer)& - !$OMP DEFAULT(NONE) - allocate(buffer(mo_num,mo_num,mo_num)) + double precision, allocatable :: buffer(:,:,:,:) + double precision, allocatable :: v1(:,:,:), v2(:,:,:) + allocate(v1(cholesky_ao_num,n1,n3), v2(cholesky_ao_num,n2,n4)) + allocate(buffer(n1,n3,n2,n4)) + + !$OMP PARALLEL PRIVATE(i1,i2,i3,i4,idx1,idx2,idx3,idx4,k) !$OMP DO - do i4 = 1, n4 + do i3=1,n3 + idx3 = list3(i3) + do i1=1,n1 + idx1 = list1(i1) + do k=1,cholesky_ao_num + v1(k,i1,i3) = cholesky_mo_transp(k,idx1,idx3) + enddo + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP DO + do i4=1,n4 idx4 = list4(i4) - call dgemm('T','N', mo_num*mo_num, mo_num, cholesky_ao_num, 1.d0, & - cholesky_mo_transp, cholesky_ao_num, & - cholesky_mo_transp(1,1,idx4), cholesky_ao_num, 0.d0, buffer, mo_num*mo_num) - do i2 = 1, n2 + do i2=1,n2 idx2 = list2(i2) - do i3 = 1, n3 - idx3 = list3(i3) + do k=1,cholesky_ao_num + v2(k,i2,i4) = cholesky_mo_transp(k,idx2,idx4) + enddo + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP BARRIER + !$OMP END PARALLEL + + call dgemm('T','N', n1*n3, n2*n4, cholesky_ao_num, 1.d0, & + v1, cholesky_ao_num, & + v2, cholesky_ao_num, 0.d0, buffer, n1*n3) + + deallocate(v1,v2) + + !$OMP PARALLEL DO PRIVATE(i1,i2,i3,i4) + do i4 = 1, n4 + do i3 = 1, n3 + do i2 = 1, n2 do i1 = 1, n1 - idx1 = list1(i1) - v(i1,i2,i3,i4) = buffer(idx1,idx3,idx2) + v(i1,i2,i3,i4) = buffer(i1,i3,i2,i4) enddo enddo enddo enddo - !$OMP END DO - deallocate(buffer) - !$OMP END PARALLEL + !$OMP END PARALLEL DO + +! !$OMP PARALLEL & +! !$OMP SHARED(n1,n2,n3,n4,list1,list2,list3,list4,v,cholesky_mo_transp,cholesky_ao_num,v1) & +! !$OMP PRIVATE(i1,i2,i3,i4,idx1,idx2,idx3,idx4,k,buffer,v2)& +! !$OMP DEFAULT(NONE) +! allocate(buffer(n1,n3,n2), v2(cholesky_ao_num,n2)) +! !$OMP DO +! do i4 = 1, n4 +! idx4 = list4(i4) +! do i2=1,n2 +! idx2 = list2(i2) +! do k=1,cholesky_ao_num +! v2(k,i2) = cholesky_mo_transp(k,idx2,idx4) +! enddo +! enddo +! call dgemm('T','N', n1*n3, n2, cholesky_ao_num, 1.d0, & +! v1, cholesky_ao_num, & +! v2, cholesky_ao_num, 0.d0, buffer, n1*n3) +! do i3 = 1, n3 +! do i2 = 1, n2 +! do i1 = 1, n1 +! v(i1,i2,i3,i4) = buffer(i1,i3,i2) +! enddo +! enddo +! enddo +! enddo +! !$OMP END DO +! deallocate(buffer, v2) +! !$OMP END PARALLEL +! deallocate(v1) else double precision :: get_two_e_integral @@ -112,6 +166,7 @@ BEGIN_PROVIDER [double precision, cc_space_v, (mo_num,mo_num,mo_num,mo_num)] if (do_ao_cholesky) then integer :: i1,i2,i3,i4 double precision, allocatable :: buffer(:,:,:) + call set_multiple_levels_omp(.False.) !$OMP PARALLEL & !$OMP SHARED(cc_space_v,mo_num,cholesky_mo_transp,cholesky_ao_num) & !$OMP PRIVATE(i1,i2,i3,i4,k,buffer)& From 94b1ae138b999517a62aa5ae0bcb9ab7fb00db77 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Wed, 5 Jul 2023 00:07:46 +0200 Subject: [PATCH 13/40] Cleaning --- src/ao_two_e_ints/cholesky.irp.f | 4 ++-- src/ccsd/ccsd_space_orb_sub.irp.f | 16 +++++++++++++++- src/utils_cc/mo_integrals_cc.irp.f | 30 ------------------------------ 3 files changed, 17 insertions(+), 33 deletions(-) diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f index 18180efb..98652d8f 100644 --- a/src/ao_two_e_ints/cholesky.irp.f +++ b/src/ao_two_e_ints/cholesky.irp.f @@ -29,7 +29,7 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] double precision, pointer :: L(:,:), L_old(:,:) - double precision, parameter :: s = 1.d-2 + double precision, parameter :: s = 3.d-2 double precision, parameter :: dscale = 1.d0 double precision, allocatable :: D(:), Delta(:,:), Ltmp_p(:,:), Ltmp_q(:,:) @@ -135,7 +135,7 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] L_old => L allocate(L(ndim,rank+nq), stat=ierr) if (ierr /= 0) then - print *, irp_here, ': allocation failed : (Delta(np,nq))' + print *, irp_here, ': allocation failed : (L(ndim,rank+nq))' stop -1 endif diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 3c9a2cfc..1d77180e 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -1549,12 +1549,26 @@ subroutine compute_B1_gam(nO,nV,t1,t2,B1,gam) double precision, allocatable :: X_vvvo(:,:,:), Y_vvvv(:,:,:) allocate(X_vvvo(nV,nV,nO), Y_vvvv(nV,nV,nV)) ! ! B1(a,b,beta,gam) = cc_space_v_vvvv(a,b,beta,gam) - call gen_v_space(cc_nVa,cc_nVa,cc_nVa,1, cc_list_vir,cc_list_vir,cc_list_vir,(/ gam /), B1) + + call gen_v_space(cc_nVa,cc_nVa,cc_nVa,1, & + cc_list_vir,cc_list_vir,cc_list_vir,(/ cc_list_vir(gam) /), B1) + !$omp parallel & !$omp shared(nO,nV,B1,cc_space_v_vvvv,cc_space_v_vvov,X_vvvo,gam) & !$omp private(a,b,beta) & !$omp default(none) + +! !$omp do +! do beta = 1, nV +! do b = 1, nV +! do a = 1, nV +! B1(a,b,beta) = cc_space_v_vvvv(a,b,beta,gam) +! enddo +! enddo +! enddo +! !$omp end do nowait + do i = 1, nO !$omp do do b = 1, nV diff --git a/src/utils_cc/mo_integrals_cc.irp.f b/src/utils_cc/mo_integrals_cc.irp.f index 2e7ecdd4..2db614b4 100644 --- a/src/utils_cc/mo_integrals_cc.irp.f +++ b/src/utils_cc/mo_integrals_cc.irp.f @@ -99,36 +99,6 @@ subroutine gen_v_space(n1,n2,n3,n4,list1,list2,list3,list4,v) enddo !$OMP END PARALLEL DO -! !$OMP PARALLEL & -! !$OMP SHARED(n1,n2,n3,n4,list1,list2,list3,list4,v,cholesky_mo_transp,cholesky_ao_num,v1) & -! !$OMP PRIVATE(i1,i2,i3,i4,idx1,idx2,idx3,idx4,k,buffer,v2)& -! !$OMP DEFAULT(NONE) -! allocate(buffer(n1,n3,n2), v2(cholesky_ao_num,n2)) -! !$OMP DO -! do i4 = 1, n4 -! idx4 = list4(i4) -! do i2=1,n2 -! idx2 = list2(i2) -! do k=1,cholesky_ao_num -! v2(k,i2) = cholesky_mo_transp(k,idx2,idx4) -! enddo -! enddo -! call dgemm('T','N', n1*n3, n2, cholesky_ao_num, 1.d0, & -! v1, cholesky_ao_num, & -! v2, cholesky_ao_num, 0.d0, buffer, n1*n3) -! do i3 = 1, n3 -! do i2 = 1, n2 -! do i1 = 1, n1 -! v(i1,i2,i3,i4) = buffer(i1,i3,i2) -! enddo -! enddo -! enddo -! enddo -! !$OMP END DO -! deallocate(buffer, v2) -! !$OMP END PARALLEL -! deallocate(v1) - else double precision :: get_two_e_integral From 0132eb87fe786f39ee4e9326844829229716c19d Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Wed, 5 Jul 2023 02:40:59 +0200 Subject: [PATCH 14/40] Symmetry in cholesky --- src/ao_two_e_ints/cholesky.irp.f | 64 ++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 19 deletions(-) diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f index 98652d8f..f4746144 100644 --- a/src/ao_two_e_ints/cholesky.irp.f +++ b/src/ao_two_e_ints/cholesky.irp.f @@ -29,7 +29,7 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] double precision, pointer :: L(:,:), L_old(:,:) - double precision, parameter :: s = 3.d-2 + double precision, parameter :: s = 1.d-1 double precision, parameter :: dscale = 1.d0 double precision, allocatable :: D(:), Delta(:,:), Ltmp_p(:,:), Ltmp_q(:,:) @@ -45,6 +45,8 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] integer :: block_size, iblock, ierr + integer(omp_lock_kind), allocatable :: lock(:) + PROVIDE ao_two_e_integrals_in_map deallocate(cholesky_ao) @@ -66,8 +68,11 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] rank = 0 allocate( D(ndim), Lset(ndim), LDmap(ndim), DLmap(ndim), Dset(ndim) ) - allocate( Lset_rev(ndim), Dset_rev(ndim) ) + allocate( Lset_rev(ndim), Dset_rev(ndim), lock(ndim) ) allocate( addr(3,ndim) ) + do k=1,ndim + call omp_init_lock(lock(k)) + enddo ! 1. k=0 @@ -113,7 +118,7 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] i = i+1 ! b. - Dmin = max(s*Dmax, tau) + Dmin = max(s*Dmax,tau) ! c. nq=0 @@ -165,7 +170,9 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] stop -1 endif - !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(m,k,p,q) + Delta(:,:) = 0.d0 + + !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(m,k,p,q,j) !$OMP DO do k=1,N @@ -181,20 +188,7 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] !$OMP DO SCHEDULE(dynamic,8) do m=1,nq - Delta(:,m) = 0.d0 - do k=1, nq - ! Apply only to (k,m) pairs both in Dset - p = DLmap(k) - q = Lset_rev(addr(3,Dset(k))) - if ((0 < q).and.(q < p)) cycle - if (.not.ao_two_e_integral_zero( addr(1,Dset(k)), addr(1,Dset(m)), & - addr(2,Dset(k)), addr(2,Dset(m)) ) ) then - Delta(p,m) = get_ao_two_e_integral( addr(1,Dset(k)), addr(1,Dset(m)), & - addr(2,Dset(k)), addr(2,Dset(m)), ao_integrals_map) - Delta(q,m) = Delta(p,m) - endif - enddo - + call omp_set_lock(lock(m)) do k=1,np ! Apply only to (k,m) pairs where k is not in Dset if (LDmap(k) /= 0) cycle @@ -204,9 +198,37 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] addr(2,Lset(k)), addr(2,Dset(m)) ) ) then Delta(k,m) = get_ao_two_e_integral( addr(1,Lset(k)), addr(1,Dset(m)), & addr(2,Lset(k)), addr(2,Dset(m)), ao_integrals_map) - Delta(q,m) = Delta(k,m) + if (q /= 0) Delta(q,m) = Delta(k,m) endif enddo + + j = Dset_rev(addr(3,Dset(m))) + if ((0 < j).and.(j < m)) then + call omp_unset_lock(lock(m)) + cycle + endif + + if ((j /= m).and.(j /= 0)) then + call omp_set_lock(lock(j)) + endif + do k=1,nq + ! Apply only to (k,m) pairs both in Dset + p = DLmap(k) + q = Lset_rev(addr(3,Dset(k))) + if ((0 < q).and.(q < p)) cycle + if (.not.ao_two_e_integral_zero( addr(1,Dset(k)), addr(1,Dset(m)), & + addr(2,Dset(k)), addr(2,Dset(m)) ) ) then + Delta(p,m) = get_ao_two_e_integral( addr(1,Dset(k)), addr(1,Dset(m)), & + addr(2,Dset(k)), addr(2,Dset(m)), ao_integrals_map) + if (q /= 0) Delta(q,m) = Delta(p,m) + if (j /= 0) Delta(p,j) = Delta(p,m) + if (q*j /= 0) Delta(q,j) = Delta(p,m) + endif + enddo + call omp_unset_lock(lock(m)) + if ((j /= m).and.(j /= 0)) then + call omp_unset_lock(lock(j)) + endif enddo !$OMP END DO @@ -313,6 +335,10 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] enddo + do k=1,ndim + call omp_destroy_lock(lock(k)) + enddo + allocate(cholesky_ao(ao_num,ao_num,rank)) call dcopy(ndim*rank, L, 1, cholesky_ao, 1) deallocate(L) From 9293f360d51d31248d2edcd9cffeed16d90924f1 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Wed, 5 Jul 2023 09:09:12 +0200 Subject: [PATCH 15/40] RELEASE_NOTES.org --- RELEASE_NOTES.org | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/RELEASE_NOTES.org b/RELEASE_NOTES.org index 3bd02898..a0e6d104 100644 --- a/RELEASE_NOTES.org +++ b/RELEASE_NOTES.org @@ -10,7 +10,8 @@ - Added many types of integrals - Accelerated four-index transformation - Added transcorrelated SCF - - Added transcorrelated CIPSI + - Added bi-orthonormal transcorrelated CIPSI + - Added Cholesky decomposition of AO integrals - Added CCSD and CCSD(T) - Added MO localization - Changed coupling parameters for ROHF @@ -20,7 +21,7 @@ - Removed cryptokit dependency in OCaml - Using now standard convention in RDM - Added molecular properties - - [ ] Added GTOs with complex exponent + - Added GTOs with complex exponent *** TODO: take from dev - Updated version of f77-zmq From 41a369dd687fd498917c675930f169e736d766a0 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Wed, 5 Jul 2023 17:43:31 +0200 Subject: [PATCH 16/40] Optimized 4idx with Cholesky --- src/mo_two_e_ints/mo_bi_integrals.irp.f | 47 ++++++++++++++++--------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/src/mo_two_e_ints/mo_bi_integrals.irp.f b/src/mo_two_e_ints/mo_bi_integrals.irp.f index a461504e..b15d9745 100644 --- a/src/mo_two_e_ints/mo_bi_integrals.irp.f +++ b/src/mo_two_e_ints/mo_bi_integrals.irp.f @@ -465,31 +465,34 @@ subroutine add_integrals_to_map_cholesky integer :: size_buffer, n_integrals size_buffer = min(mo_num*mo_num*mo_num,16000000) - double precision, allocatable :: Vtmp(:,:,:,:) + double precision, allocatable :: Vtmp(:,:,:) integer(key_kind) , allocatable :: buffer_i(:) real(integral_kind), allocatable :: buffer_value(:) if (.True.) then ! In-memory transformation - allocate (Vtmp(mo_num,mo_num,mo_num,mo_num)) + call set_multiple_levels_omp(.False.) - call dgemm('N','T',mo_num*mo_num,mo_num*mo_num,cholesky_ao_num,1.d0, & - cholesky_mo, mo_num*mo_num, & - cholesky_mo, mo_num*mo_num, 0.d0, & - Vtmp, mo_num*mo_num) - - !$OMP PARALLEL PRIVATE(i,j,k,l,n_integrals,buffer_value, buffer_i) + !$OMP PARALLEL PRIVATE(i,j,k,l,n_integrals,buffer_value, buffer_i, Vtmp) allocate (buffer_i(size_buffer), buffer_value(size_buffer)) n_integrals = 0 + + allocate (Vtmp(mo_num,mo_num,mo_num)) + !$OMP DO do l=1,mo_num + + call dgemm('T','N',mo_num*mo_num,mo_num,cholesky_ao_num,1.d0, & + cholesky_mo_transp, cholesky_ao_num, & + cholesky_mo_transp(1,1,l), cholesky_ao_num, 0.d0, & + Vtmp, mo_num*mo_num) do k=1,l do j=1,mo_num do i=1,j - if (abs(Vtmp(i,j,k,l)) > mo_integrals_threshold) then + if (abs(Vtmp(i,j,k)) > mo_integrals_threshold) then n_integrals += 1 - buffer_value(n_integrals) = Vtmp(i,j,k,l) + buffer_value(n_integrals) = Vtmp(i,j,k) !DIR$ FORCEINLINE call mo_two_e_integrals_index(i,k,j,l,buffer_i(n_integrals)) if (n_integrals == size_buffer) then @@ -503,10 +506,9 @@ subroutine add_integrals_to_map_cholesky enddo !$OMP END DO call map_append(mo_integrals_map, buffer_i, buffer_value, n_integrals) - deallocate(buffer_i, buffer_value) + deallocate(buffer_i, buffer_value, Vtmp) !$OMP END PARALLEL - deallocate(Vtmp) call map_unique(mo_integrals_map) endif @@ -1350,16 +1352,29 @@ END_PROVIDER ! mo_two_e_integrals_jj_anti(i,j) = J_ij - K_ij END_DOC - integer :: i,j + integer :: i,j,k double precision :: get_two_e_integral if (do_ao_cholesky) then + double precision, allocatable :: buffer(:,:) + allocate (buffer(cholesky_ao_num,mo_num)) + do k=1,cholesky_ao_num + do i=1,mo_num + buffer(k,i) = cholesky_mo_transp(k,i,i) + enddo + enddo + call dgemm('T','N',mo_num,mo_num,cholesky_ao_num,1.d0, & + buffer, cholesky_ao_num, buffer, cholesky_ao_num, 0.d0, mo_two_e_integrals_jj, mo_num) + deallocate(buffer) + do j=1,mo_num do i=1,mo_num - !TODO: use dgemm - mo_two_e_integrals_jj(i,j) = sum(cholesky_mo_transp(:,i,i)*cholesky_mo_transp(:,j,j)) - mo_two_e_integrals_jj_exchange(i,j) = sum(cholesky_mo_transp(:,i,j)*cholesky_mo_transp(:,j,i)) + mo_two_e_integrals_jj_exchange(i,j) = 0.d0 + do k=1,cholesky_ao_num + mo_two_e_integrals_jj_exchange(i,j) = mo_two_e_integrals_jj_exchange(i,j) + & + cholesky_mo_transp(k,i,j)*cholesky_mo_transp(k,j,i) + enddo enddo enddo From 5a0c8de5a39390a63b29d2748fa6a92cb00107ea Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Wed, 5 Jul 2023 19:12:03 +0200 Subject: [PATCH 17/40] Accelerated cholesky AO-MO transformation --- src/mo_two_e_ints/cholesky.irp.f | 38 ++++++++++++-------------------- 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/src/mo_two_e_ints/cholesky.irp.f b/src/mo_two_e_ints/cholesky.irp.f index 32c0dccd..7cfbaa58 100644 --- a/src/mo_two_e_ints/cholesky.irp.f +++ b/src/mo_two_e_ints/cholesky.irp.f @@ -4,16 +4,18 @@ BEGIN_PROVIDER [ double precision, cholesky_mo, (mo_num, mo_num, cholesky_ao_num ! Cholesky vectors in MO basis END_DOC - integer :: k + integer :: k, i, j call set_multiple_levels_omp(.False.) - print *, 'AO->MO Transformation of Cholesky vectors' !$OMP PARALLEL DO PRIVATE(k) do k=1,cholesky_ao_num - call ao_to_mo(cholesky_ao(1,1,k),ao_num,cholesky_mo(1,1,k),mo_num) + do j=1,mo_num + do i=1,mo_num + cholesky_mo(i,j,k) = cholesky_mo_transp(k,i,j) + enddo + enddo enddo !$OMP END PARALLEL DO - print *, '' END_PROVIDER @@ -23,27 +25,15 @@ BEGIN_PROVIDER [ double precision, cholesky_mo_transp, (cholesky_ao_num, mo_num, ! Cholesky vectors in MO basis END_DOC - integer :: i,j,k - double precision, allocatable :: buffer(:,:) + double precision, allocatable :: X(:,:,:) + print *, 'AO->MO Transformation of Cholesky vectors' - print *, 'AO->MO Transformation of Cholesky vectors .' - - call set_multiple_levels_omp(.False.) - !$OMP PARALLEL PRIVATE(i,j,k,buffer) - allocate(buffer(mo_num,mo_num)) - !$OMP DO SCHEDULE(static) - do k=1,cholesky_ao_num - call ao_to_mo(cholesky_ao(1,1,k),ao_num,buffer,mo_num) - do j=1,mo_num - do i=1,mo_num - cholesky_mo_transp(k,i,j) = buffer(i,j) - enddo - enddo - enddo - !$OMP END DO - deallocate(buffer) - !$OMP END PARALLEL - print *, '' + allocate(X(mo_num,cholesky_ao_num,ao_num)) + call dgemm('T','N', ao_num*cholesky_ao_num, mo_num, ao_num, 1.d0, & + cholesky_ao, ao_num, mo_coef, ao_num, 0.d0, X, ao_num*cholesky_ao_num) + call dgemm('T','N', cholesky_ao_num*mo_num, mo_num, ao_num, 1.d0, & + X, ao_num, mo_coef, ao_num, 0.d0, cholesky_mo_transp, cholesky_ao_num*mo_num) + deallocate(X) END_PROVIDER From e82220a6a414bd20eac08d5bca584ad0fb315495 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Thu, 6 Jul 2023 02:12:42 +0200 Subject: [PATCH 18/40] Working on Cholesky CCSD --- src/ccsd/ccsd_space_orb_sub.irp.f | 26 +- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 1395 +++++++++++++++++++++++ src/mo_two_e_ints/mo_bi_integrals.irp.f | 81 +- src/tools/four_idx_transform.irp.f | 7 + src/utils/fortran_mmap.c | 14 +- src/utils/mmap.f90 | 25 +- src/utils_cc/mo_integrals_cc.irp.f | 85 +- 7 files changed, 1542 insertions(+), 91 deletions(-) create mode 100644 src/ccsd/ccsd_space_orb_sub_chol.irp.f diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 1d77180e..76c9351e 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -85,13 +85,23 @@ subroutine run_ccsd_space_orb do while (not_converged) - call compute_H_oo(nO,nV,t1,t2,tau,H_oo) - call compute_H_vv(nO,nV,t1,t2,tau,H_vv) - call compute_H_vo(nO,nV,t1,t2,H_vo) - ! Residue - call compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) - call compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) +! if (do_ao_cholesky) then + if (.False.) then + call compute_H_oo_chol(nO,nV,t1,t2,tau,H_oo) + call compute_H_vv_chol(nO,nV,t1,t2,tau,H_vv) + call compute_H_vo_chol(nO,nV,t1,t2,H_vo) + + call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) + call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) + else + call compute_H_oo(nO,nV,t1,t2,tau,H_oo) + call compute_H_vv(nO,nV,t1,t2,tau,H_vv) + call compute_H_vo(nO,nV,t1,t2,H_vo) + + call compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) + call compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) + endif max_r = max(max_r1,max_r2) ! Update @@ -839,6 +849,10 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) ! allocate(B1(nV,nV,nV,nV)) ! call compute_B1(nO,nV,t1,t2,B1) +! call dgemm('N','N',nO*nO,nV*nV,nV*nV, & +! 1d0, tau, size(tau,1) * size(tau,2), & +! B1 , size(B1_gam,1) * size(B1_gam,2), & +! 1d0, r2, size(r2,1) * size(r2,2)) allocate(B1_gam(nV,nV,nV)) do gam=1,nV call compute_B1_gam(nO,nV,t1,t2,B1_gam,gam) diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f new file mode 100644 index 00000000..190c163b --- /dev/null +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -0,0 +1,1395 @@ +subroutine ccsd_energy_space_chol(nO,nV,tau,t1,energy) + + implicit none + + integer, intent(in) :: nO, nV + double precision, intent(in) :: tau(nO,nO,nV,nV) + double precision, intent(in) :: t1(nO,nV) + double precision, intent(out) :: energy + + ! internal + integer :: i,j,a,b + double precision :: e + + energy = 0d0 + !$omp parallel & + !$omp shared(nO,nV,energy,tau,t1,& + !$omp cc_space_f_vo,cc_space_w_oovv) & + !$omp private(i,j,a,b,e) & + !$omp default(none) + e = 0d0 + !$omp do + do a = 1, nV + do i = 1, nO + e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a) + enddo + enddo + !$omp end do nowait + !$omp do + do b = 1, nV + do a = 1, nV + do j = 1, nO + do i = 1, nO + e = e + tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b) + enddo + enddo + enddo + enddo + !$omp end do nowait + !$omp critical + energy = energy + e + !$omp end critical + !$omp end parallel + +end + +! Tau + +subroutine update_tau_space_chol(nO,nV,t1,t2,tau) + + implicit none + + ! in + integer, intent(in) :: nO, nV + double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV) + + ! out + double precision, intent(out) :: tau(nO,nO,nV,nV) + + ! internal + integer :: i,j,a,b + + !$OMP PARALLEL & + !$OMP SHARED(nO,nV,tau,t2,t1) & + !$OMP PRIVATE(i,j,a,b) & + !$OMP DEFAULT(NONE) + !$OMP DO + do b = 1, nV + do a = 1, nV + do j = 1, nO + do i = 1, nO + tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b) + enddo + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + +end + +! R1 + +subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) + + implicit none + + ! in + integer, intent(in) :: nO, nV + double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV), tau(nO,nO,nV,nV) + double precision, intent(in) :: H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO) + + ! out + double precision, intent(out) :: r1(nO,nV), max_r1 + + ! internal + integer :: u,i,j,beta,a,b + + !$omp parallel & + !$omp shared(nO,nV,r1,cc_space_f_ov) & + !$omp private(u,beta) & + !$omp default(none) + !$omp do + do beta = 1, nV + do u = 1, nO + r1(u,beta) = cc_space_f_ov(u,beta) + enddo + enddo + !$omp end do + !$omp end parallel + + double precision, allocatable :: X_oo(:,:) + allocate(X_oo(nO,nO)) + call dgemm('N','N', nO, nO, nV, & + -2d0, t1 , size(t1,1), & + cc_space_f_vo, size(cc_space_f_vo,1), & + 0d0, X_oo , size(X_oo,1)) + + call dgemm('T','N', nO, nV, nO, & + 1d0, X_oo, size(X_oo,2), & + t1 , size(t1,1), & + 1d0, r1 , size(r1,1)) + deallocate(X_oo) + + call dgemm('N','N', nO, nV, nV, & + 1d0, t1 , size(t1,1), & + H_vv, size(H_vv,1), & + 1d0, r1 , size(r1,1)) + + call dgemm('N','N', nO, nV, nO, & + -1d0, H_oo, size(H_oo,1), & + t1 , size(t1,1), & + 1d0, r1, size(r1,1)) + + double precision, allocatable :: X_voov(:,:,:,:) + allocate(X_voov(nV, nO, nO, nV)) + + !$omp parallel & + !$omp shared(nO,nV,X_voov,t2,t1) & + !$omp private(u,beta,i,a) & + !$omp default(none) + !$omp do + do beta = 1, nV + do u = 1, nO + do i = 1, nO + do a = 1, nV + X_voov(a,i,u,beta) = 2d0 * t2(i,u,a,beta) - t2(u,i,a,beta) + t1(u,a) * t1(i,beta) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + call dgemv('T', nV*nO, nO*nV, & + 1d0, X_voov, size(X_voov,1) * size(X_voov,2), & + H_vo , 1, & + 1d0, r1 , 1) + + deallocate(X_voov) + + double precision, allocatable :: X_ovov(:,:,:,:) + allocate(X_ovov(nO, nV, nO, nV)) + + !$omp parallel & + !$omp shared(nO,nV,cc_space_v_ovov,cc_space_v_voov,X_ovov) & + !$omp private(u,beta,i,a) & + !$omp default(none) + !$omp do + do beta = 1, nV + do u = 1, nO + do a = 1, nv + do i = 1, nO + X_ovov(i,a,u,beta) = 2d0 * cc_space_v_voov(a,u,i,beta) - cc_space_v_ovov(u,a,i,beta) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + call dgemv('T', nO*nV, nO*nV, & + 1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), & + t1 , 1, & + 1d0, r1 , 1) + + deallocate(X_ovov) + + double precision, allocatable :: W_vvov(:,:,:,:), T_vvoo(:,:,:,:) + allocate(W_vvov(nV,nV,nO,nV), T_vvoo(nV,nV,nO,nO)) + + !$omp parallel & + !$omp shared(nO,nV,cc_space_v_vvov,W_vvov,T_vvoo,tau) & + !$omp private(b,beta,i,a) & + !$omp default(none) + !$omp do + do beta = 1, nV + do i = 1, nO + do b = 1, nV + do a = 1, nV + W_vvov(a,b,i,beta) = 2d0 * cc_space_v_vvov(a,b,i,beta) - cc_space_v_vvov(b,a,i,beta) + enddo + enddo + enddo + enddo + !$omp end do nowait + + !$omp do + do u = 1, nO + do i = 1, nO + do b = 1, nV + do a = 1, nV + T_vvoo(a,b,i,u) = tau(i,u,a,b) + enddo + enddo + enddo + enddo + !$omp end do nowait + !$omp end parallel + + call dgemm('T','N',nO,nV,nO*nV*nV, & + 1d0, T_vvoo, size(T_vvoo,1) * size(T_vvoo,2) * size(T_vvoo,3), & + W_vvov, size(W_vvov,1) * size(W_vvov,2) * size(W_vvov,3), & + 1d0, r1 , size(r1,1)) + + deallocate(W_vvov,T_vvoo) + + double precision, allocatable :: W_oovo(:,:,:,:) + allocate(W_oovo(nO,nO,nV,nO)) + + !$omp parallel & + !$omp shared(nO,nV,cc_space_v_vooo,W_oovo) & + !$omp private(u,a,i,j) & + !$omp default(none) + do u = 1, nO + !$omp do + do a = 1, nV + do j = 1, nO + do i = 1, nO + W_oovo(i,j,a,u) = 2d0 * cc_space_v_vooo(a,u,i,j) - cc_space_v_vooo(a,u,j,i) + enddo + enddo + enddo + !$omp end do nowait + enddo + !$omp end parallel + + call dgemm('T','N', nO, nV, nO*nO*nV, & + -1d0, W_oovo, size(W_oovo,1) * size(W_oovo,2) * size(W_oovo,3), & + tau , size(tau,1) * size(tau,2) * size(tau,3), & + 1d0, r1 , size(r1,1)) + + deallocate(W_oovo) + + max_r1 = 0d0 + do a = 1, nV + do i = 1, nO + max_r1 = max(dabs(r1(i,a)), max_r1) + enddo + enddo + + ! Change the sign for consistency with the code in spin orbitals + !$omp parallel & + !$omp shared(nO,nV,r1) & + !$omp private(a,i) & + !$omp default(none) + !$omp do + do a = 1, nV + do i = 1, nO + r1(i,a) = -r1(i,a) + enddo + enddo + !$omp end do + !$omp end parallel + +end + +! H_oo + +subroutine compute_H_oo_chol(nO,nV,t1,t2,tau,H_oo) + + implicit none + + integer, intent(in) :: nO,nV + double precision, intent(in) :: t1(nO, nV) + double precision, intent(in) :: t2(nO, nO, nV, nV) + double precision, intent(in) :: tau(nO, nO, nV, nV) + double precision, intent(out) :: H_oo(nO, nO) + + integer :: a,tmp_a,k,b,l,c,d,tmp_c,tmp_d,i,j,u + + ! H_oo(u,i) = cc_space_f_oo(u,i) + !$omp parallel & + !$omp shared(nO,H_oo,cc_space_f_oo) & + !$omp private(i,u) & + !$omp default(none) + !$omp do + do i = 1, nO + do u = 1, nO + H_oo(u,i) = cc_space_f_oo(u,i) + enddo + enddo + !$omp end do + !$omp end parallel + + ! H_oo(u,i) += cc_space_w_oovv(i,j,a,b) * tau(u,j,a,b) + ! H_oo(u,i) += tau(u,j,a,b) * cc_space_w_oovv(i,j,a,b) + call dgemm('N','T', nO, nO, nO*nV*nV, & + 1d0, tau , size(tau,1), & + cc_space_w_oovv, size(cc_space_w_oovv,1), & + 1d0, H_oo , size(H_oo,1)) + +end + +! H_vv + +subroutine compute_H_vv_chol(nO,nV,t1,t2,tau,H_vv) + + implicit none + + integer, intent(in) :: nO,nV + double precision, intent(in) :: t1(nO, nV) + double precision, intent(in) :: t2(nO, nO, nV, nV) + double precision, intent(in) :: tau(nO, nO, nV, nV) + double precision, intent(out) :: H_vv(nV, nV) + + integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u, beta + + double precision, allocatable :: tmp_tau(:,:,:,:) + + allocate(tmp_tau(nV,nO,nO,nV)) + + ! H_vv(a,beta) = cc_space_f_vv(a,beta) + !$omp parallel & + !$omp shared(nV,nO,H_vv,cc_space_f_vv,tmp_tau,tau) & + !$omp private(a,beta,i,j,b) & + !$omp default(none) + !$omp do + do beta = 1, nV + do a = 1, nV + H_vv(a,beta) = cc_space_f_vv(a,beta) + enddo + enddo + !$omp end do nowait + + !$omp do + do beta = 1, nV + do j = 1, nO + do i = 1, nO + do b = 1, nV + tmp_tau(b,i,j,beta) = tau(i,j,beta,b) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + call dgemm('N','N',nV,nV,nO*nO*nV, & + -1d0, cc_space_w_vvoo, size(cc_space_w_vvoo,1), & + tmp_tau , size(tmp_tau,1) * size(tmp_tau,2) * size(tmp_tau,3), & + 1d0, H_vv , size(H_vv,1)) + + deallocate(tmp_tau) + +end + +! H_vo + +subroutine compute_H_vo_chol(nO,nV,t1,t2,H_vo) + + implicit none + + integer, intent(in) :: nO,nV + double precision, intent(in) :: t1(nO, nV) + double precision, intent(in) :: t2(nO, nO, nV, nV) + double precision, intent(out) :: H_vo(nV, nO) + + integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u, beta + + double precision, allocatable :: w(:,:,:,:) + + allocate(w(nV,nO,nO,nV)) + + !$omp parallel & + !$omp shared(nV,nO,H_vo,cc_space_f_vo,w,cc_space_w_vvoo,t1) & + !$omp private(a,beta,i,j,b) & + !$omp default(none) + !$omp do + do i = 1, nO + do a = 1, nV + H_vo(a,i) = cc_space_f_vo(a,i) + enddo + enddo + !$omp end do nowait + + ! H_vo(a,i) = H_vo(a,i) + cc_space_w_vvoo(a,b,i,j) * t1(j,b) + ! H_vo(a,i) = H_vo(a,i) + w(a,i,j,b) * t1(j,b) + + !$omp do + do b = 1, nV + do j = 1, nO + do i = 1, nO + do a = 1, nV + w(a,i,j,b) = cc_space_w_vvoo(a,b,i,j) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + call dgemv('N',nV*nO, nO*nV, & + 1d0, w , size(w,1) * size(w,2), & + t1 , 1, & + 1d0, H_vo, 1) + + deallocate(w) + +end + +! R2 + +subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) + + implicit none + + ! in + integer, intent(in) :: nO, nV + double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV), tau(nO,nO,nV,nV) + double precision, intent(in) :: H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO) + + ! out + double precision, intent(out) :: r2(nO,nO,nV,nV), max_r2 + + ! internal + double precision, allocatable :: g_occ(:,:), g_vir(:,:), J1(:,:,:,:), K1(:,:,:,:) + double precision, allocatable :: A1(:,:,:,:) + integer :: u,v,i,j,beta,gam,a,b + + allocate(g_occ(nO,nO), g_vir(nV,nV)) + allocate(J1(nO,nV,nV,nO), K1(nO,nV,nO,nV)) + allocate(A1(nO,nO,nO,nO)) + + call compute_g_occ_chol(nO,nV,t1,t2,H_oo,g_occ) + call compute_g_vir_chol(nO,nV,t1,t2,H_vv,g_vir) + call compute_A1_chol(nO,nV,t1,t2,tau,A1) + call compute_J1_chol(nO,nV,t1,t2,cc_space_v_ovvo,cc_space_v_ovoo, & + cc_space_v_vvvo,cc_space_v_vvoo,J1) + call compute_K1_chol(nO,nV,t1,t2,cc_space_v_ovoo,cc_space_v_vvoo, & + cc_space_v_ovov,cc_space_v_vvov,K1) + + ! Residual + !r2 = 0d0 + + !$omp parallel & + !$omp shared(nO,nV,r2,cc_space_v_oovv) & + !$omp private(u,v,gam,beta) & + !$omp default(none) + !$omp do + do gam = 1, nV + do beta = 1, nV + do v = 1, nO + do u = 1, nO + r2(u,v,beta,gam) = cc_space_v_oovv(u,v,beta,gam) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + call dgemm('N','N',nO*nO,nV*nV,nO*nO, & + 1d0, A1, size(A1,1) * size(A1,2), & + tau, size(tau,1) * size(tau,2), & + 1d0, r2, size(r2,1) * size(r2,2)) + + double precision, dimension(:,:,:,:), allocatable :: r2_chem, tmp, tau_chem + double precision, dimension(:,:,:,:), allocatable :: B1 + + allocate(B1(nV,nV,nV,nV)) + call compute_B1_chol(nO,nV,t1,B1,cholesky_ao_num) + call dgemm('N','N',nO*nO,nV*nV,nV*nV, & + 1d0, tau, size(tau,1) * size(tau,2), & + B1 , size(B1 ,1) * size(B1 ,2), & + 1d0, r2, size(r2 ,1) * size(r2 ,2)) + + double precision, allocatable :: X_oovv(:,:,:,:),Y_oovv(:,:,:,:) + allocate(X_oovv(nO,nO,nV,nV),Y_oovv(nO,nO,nV,nV)) + + !$omp parallel & + !$omp shared(nO,nV,t2,X_oovv) & + !$omp private(u,v,gam,a) & + !$omp default(none) + !$omp do + do a = 1, nV + do gam = 1, nV + do v = 1, nO + do u = 1, nO + X_oovv(u,v,gam,a) = t2(u,v,gam,a) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + call dgemm('N','N',nO*nO*nV,nV,nV, & + 1d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3), & + g_vir, size(g_vir,1), & + 0d0, Y_oovv, size(Y_oovv,1) * size(Y_oovv,2) * size(Y_oovv,3)) + + !$omp parallel & + !$omp shared(nO,nV,r2,Y_oovv) & + !$omp private(u,v,gam,beta) & + !$omp default(none) + !$omp do + do gam = 1, nV + do beta = 1, nV + do v = 1, nO + do u = 1, nO + r2(u,v,beta,gam) = r2(u,v,beta,gam) + Y_oovv(u,v,beta,gam) + Y_oovv(v,u,gam,beta) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + call dgemm('N','N',nO,nO*nV*nV,nO, & + 1d0, g_occ , size(g_occ,1), & + t2 , size(t2,1), & + 0d0, X_oovv, size(X_oovv,1)) + + !$omp parallel & + !$omp shared(nO,nV,r2,X_oovv) & + !$omp private(u,v,gam,beta) & + !$omp default(none) + !$omp do + do gam = 1, nV + do beta = 1, nV + do v = 1, nO + do u = 1, nO + r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(u,v,beta,gam) - X_oovv(v,u,gam,beta) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + deallocate(X_oovv) + + double precision, allocatable :: X_vovv(:,:,:,:) + allocate(X_vovv(nV,nO,nV,nV)) + + !$omp parallel & + !$omp shared(nO,nV,X_vovv,cc_space_v_ovvv) & + !$omp private(u,a,gam,beta) & + !$omp default(none) + !$omp do + do gam = 1, nV + do beta = 1, nV + do u = 1, nO + do a = 1, nV + X_vovv(a,u,beta,gam) = cc_space_v_ovvv(u,a,beta,gam) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + call dgemm('N','N',nO,nO*nV*nV,nV, & + 1d0, t1 , size(t1,1), & + X_vovv, size(X_vovv,1), & + 0d0, Y_oovv, size(Y_oovv,1)) + + !$omp parallel & + !$omp shared(nO,nV,r2,Y_oovv) & + !$omp private(u,v,gam,beta) & + !$omp default(none) + !$omp do + do gam = 1, nV + do beta = 1, nV + do v = 1, nO + do u = 1, nO + r2(u,v,beta,gam) = r2(u,v,beta,gam) + Y_oovv(v,u,beta,gam) + Y_oovv(u,v,gam,beta) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + double precision, allocatable :: X_vovo(:,:,:,:), Y_vovv(:,:,:,:) + allocate(X_vovo(nV,nO,nV,nO), Y_vovv(nV,nO,nV,nV),X_oovv(nO,nO,nV,nV)) + + !$omp parallel & + !$omp shared(nO,nV,X_vovo,cc_space_v_ovov) & + !$omp private(u,v,gam,i) & + !$omp default(none) + do i = 1, nO + !$omp do + do gam = 1, nV + do u = 1, nO + do a = 1, nV + X_vovo(a,u,gam,i) = cc_space_v_ovov(u,a,i,gam) + enddo + enddo + enddo + !$omp end do nowait + enddo + !$omp end parallel + + call dgemm('N','N',nV*nO*nV,nV,nO, & + 1d0, X_vovo, size(X_vovo,1) * size(X_vovo,2) * size(X_vovo,3), & + t1 , size(t1,1), & + 0d0, Y_vovv, size(Y_vovv,1) * size(Y_vovv,2) * size(Y_vovv,3)) + + call dgemm('N','N',nO,nO*nV*nV,nV, & + 1d0, t1, size(t1,1), & + Y_vovv, size(Y_vovv,1), & + 0d0, X_oovv, size(X_oovv,1)) + + !$omp parallel & + !$omp shared(nO,nV,r2,X_oovv) & + !$omp private(u,v,gam,beta) & + !$omp default(none) + !$omp do + do gam = 1, nV + do beta = 1, nV + do v = 1, nO + do u = 1, nO + r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(v,u,gam,beta) - X_oovv(u,v,beta,gam) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + deallocate(X_vovo,Y_vovv) + + call dgemm('N','N',nO*nO*nV,nV,nO, & + 1d0, cc_space_v_oovo, size(cc_space_v_oovo,1) * size(cc_space_v_oovo,2) * size(cc_space_v_oovo,3), & + t1 , size(t1,1), & + 0d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3)) + + !$omp parallel & + !$omp shared(nO,nV,r2,X_oovv) & + !$omp private(u,v,gam,beta) & + !$omp default(none) + !$omp do + do gam = 1, nV + do beta = 1, nV + do v = 1, nO + do u = 1, nO + r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(u,v,beta,gam) - X_oovv(v,u,gam,beta) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + double precision, allocatable :: Y_oovo(:,:,:,:) + allocate(X_vovo(nV,nO,nV,nO), Y_oovo(nO,nO,nV,nO)) + + !$omp parallel & + !$omp shared(nO,nV,X_vovo,cc_space_v_ovvo) & + !$omp private(a,v,gam,i) & + !$omp default(none) + do i = 1, nO + !$omp do + do gam = 1, nV + do v = 1, nO + do a = 1, nV + X_vovo(a,v,gam,i) = cc_space_v_ovvo(v,a,gam,i) + enddo + enddo + enddo + !$omp end do nowait + enddo + !$omp end parallel + + call dgemm('N','N',nO,nO*nV*nO,nV, & + 1d0, t1, size(t1,1), & + X_vovo, size(X_vovo,1), & + 0d0, Y_oovo, size(Y_oovo,1)) + + call dgemm('N','N',nO*nO*nV, nV, nO, & + 1d0, Y_oovo, size(Y_oovo,1) * size(Y_oovo,2) * size(Y_oovo,3), & + t1 , size(t1,1), & + 0d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3)) + + !$omp parallel & + !$omp shared(nO,nV,r2,X_oovv) & + !$omp private(u,v,gam,beta) & + !$omp default(none) + !$omp do + do gam = 1, nV + do beta = 1, nV + do v = 1, nO + do u = 1, nO + r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(u,v,gam,beta) - X_oovv(v,u,beta,gam) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + deallocate(X_vovo,Y_oovo) + + double precision, allocatable :: X_ovvo(:,:,:,:), Y_voov(:,:,:,:), Z_ovov(:,:,:,:) + allocate(X_ovvo(nO,nV,nV,nO), Y_voov(nV,nO,nO,nV),Z_ovov(nO,nV,nO,nV)) + !$omp parallel & + !$omp shared(nO,nV,X_ovvo,Y_voov,K1,J1,t2) & + !$omp private(u,v,gam,beta,i,a) & + !$omp default(none) + do i = 1, nO + !$omp do + do a = 1, nV + do beta = 1, nV + do u = 1, nO + X_ovvo(u,beta,a,i) = (J1(u,a,beta,i) - 0.5d0 * K1(u,a,i,beta)) + enddo + enddo + enddo + !$omp end do nowait + enddo + + !$omp do + do gam = 1, nV + do v = 1, nO + do i = 1, nO + do a = 1, nV + Y_voov(a,i,v,gam) = 2d0 * t2(i,v,a,gam) - t2(i,v,gam,a) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + call dgemm('N','N', nO*nV,nO*nV,nV*nO, & + 1d0, X_ovvo, size(X_ovvo,1) * size(X_ovvo,2), & + Y_voov, size(Y_voov,1) * size(Y_voov,2), & + 0d0, Z_ovov, size(Z_ovov,1) * size(Z_ovov,2)) + + !$omp parallel & + !$omp shared(nO,nV,r2,Z_ovov) & + !$omp private(u,v,gam,beta) & + !$omp default(none) + !$omp do + do gam = 1, nV + do beta = 1, nV + do v = 1, nO + do u = 1, nO + r2(u,v,beta,gam) = r2(u,v,beta,gam) + Z_ovov(u,beta,v,gam) + Z_ovov(v,gam,u,beta) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + deallocate(X_ovvo,Y_voov) + + double precision, allocatable :: X_ovov(:,:,:,:),Y_ovov(:,:,:,:) + allocate(X_ovov(nO,nV,nO,nV),Y_ovov(nO,nV,nO,nV)) + !$omp parallel & + !$omp shared(nO,nV,r2,K1,X_ovov,Y_ovov,t2) & + !$omp private(u,a,i,beta,gam) & + !$omp default(none) + !$omp do + do beta = 1, nV + do u = 1, nO + do a = 1, nV + do i = 1, nO + X_ovov(i,a,u,beta) = 0.5d0 * K1(u,a,i,beta) + enddo + enddo + enddo + enddo + !$omp end do nowait + + !$omp do + do gam = 1, nV + do v = 1, nO + do a = 1, nV + do i = 1, nO + Y_ovov(i,a,v,gam) = t2(i,v,gam,a) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + call dgemm('T','N',nO*nV,nO*nV,nO*nV, & + 1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), & + Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), & + 0d0, Z_ovov, size(Y_ovov,1) * size(Y_ovov,2)) + + !$omp parallel & + !$omp shared(nO,nV,r2,Z_ovov) & + !$omp private(u,v,gam,beta) & + !$omp default(none) + !$omp do + do gam = 1, nV + do beta = 1, nV + do v = 1, nO + do u = 1, nO + r2(u,v,beta,gam) = r2(u,v,beta,gam) - Z_ovov(u,beta,v,gam) - Z_ovov(v,gam,u,beta) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + !$omp parallel & + !$omp shared(nO,nV,K1,X_ovov,Z_ovov,t2) & + !$omp private(u,v,gam,beta,i,a) & + !$omp default(none) + !$omp do + do a = 1, nV + do i = 1, nO + do gam = 1, nV + do u = 1, nO + X_ovov(u,gam,i,a) = K1(u,a,i,gam) + enddo + enddo + enddo + enddo + !$omp end do nowait + + !$omp do + do beta = 1, nV + do v = 1, nO + do a = 1, nV + do i = 1, nO + Z_ovov(i,a,v,beta) = t2(i,v,beta,a) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + call dgemm('N','N',nO*nV,nO*nV,nO*nV, & + 1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), & + Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), & + 0d0, Z_ovov, size(Y_ovov,1) * size(Y_ovov,2)) + + !$omp parallel & + !$omp shared(nO,nV,r2,Z_ovov) & + !$omp private(u,v,gam,beta) & + !$omp default(none) + !$omp do + do gam = 1, nV + do beta = 1, nV + do v = 1, nO + do u = 1, nO + r2(u,v,beta,gam) = r2(u,v,beta,gam) - Z_ovov(u,gam,v,beta) - Z_ovov(v,beta,u,gam) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + deallocate(X_ovov,Y_ovov,Z_ovov) + + ! Change the sign for consistency with the code in spin orbitals + !$omp parallel & + !$omp shared(nO,nV,r2) & + !$omp private(i,j,a,b) & + !$omp default(none) + !$omp do + do b = 1, nV + do a = 1, nV + do j = 1, nO + do i = 1, nO + r2(i,j,a,b) = -r2(i,j,a,b) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + max_r2 = 0d0 + do b = 1, nV + do a = 1, nV + do j = 1, nO + do i = 1, nO + max_r2 = max(r2(i,j,a,b), max_r2) + enddo + enddo + enddo + enddo + + deallocate(g_occ,g_vir,J1,K1,A1) + +end + +! A1 + +subroutine compute_A1_chol(nO,nV,t1,t2,tau,A1) + + implicit none + + integer, intent(in) :: nO,nV + double precision, intent(in) :: t1(nO, nV) + double precision, intent(in) :: t2(nO, nO, nV, nV) + double precision, intent(in) :: tau(nO, nO, nV, nV) + double precision, intent(out) :: A1(nO, nO, nO, nO) + + integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta + + double precision, allocatable :: X_vooo(:,:,:,:), Y_oooo(:,:,:,:) + allocate(X_vooo(nV,nO,nO,nO), Y_oooo(nO,nO,nO,nO)) + + ! A1(u,v,i,j) = cc_space_v_oooo(u,v,i,j) + !$omp parallel & + !$omp shared(nO,nV,A1,cc_space_v_oooo,cc_space_v_ovoo,X_vooo) & + !$omp private(u,v,i,j) & + !$omp default(none) + !$omp do collapse(2) + do j = 1, nO + do i = 1, nO + do v = 1, nO + do u = 1, nO + A1(u,v,i,j) = cc_space_v_oooo(u,v,i,j) + enddo + enddo + enddo + enddo + !$omp end do nowait + + ! A1(u,v,i,j) += cc_space_v_ovoo(u,a,i,j) * t1(v,a) & + + !$omp do collapse(2) + do j = 1, nO + do i = 1, nO + do u = 1, nO + do a = 1, nV + X_vooo(a,u,i,j) = cc_space_v_ovoo(u,a,i,j) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + call dgemm('N','N', nO, nO*nO*nO, nV, & + 1d0, t1 , size(t1,1), & + X_vooo, size(X_vooo,1), & + 0d0, Y_oooo, size(Y_oooo,1)) + + !$omp parallel & + !$omp shared(nO,nV,A1,Y_oooo) & + !$omp private(u,v,i,j) & + !$omp default(none) + !$omp do collapse(2) + do j = 1, nO + do i = 1, nO + do v = 1, nO + do u = 1, nO + A1(u,v,i,j) = A1(u,v,i,j) + Y_oooo(v,u,i,j) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + deallocate(X_vooo,Y_oooo) + + ! A1(u,v,i,j) += cc_space_v_vooo(a,v,i,j) * t1(u,a) + call dgemm('N','N', nO, nO*nO*nO, nV, & + 1d0, t1 , size(t1,1), & + cc_space_v_vooo, size(cc_space_v_vooo,1), & + 1d0, A1 , size(A1,1)) + + ! A1(u,v,i,j) += cc_space_v_vvoo(a,b,i,j) * tau(u,v,a,b) + call dgemm('N','N', nO*nO, nO*nO, nV*nV, & + 1d0, tau , size(tau,1) * size(tau,2), & + cc_space_v_vvoo, size(cc_space_v_vvoo,1) * size(cc_space_v_vvoo,2), & + 1d0, A1 , size(A1,1) * size(A1,2)) + +end + +! B1 +subroutine compute_B1_chol(nO,nV,t1,B1,ldb) + + implicit none + + integer, intent(in) :: nO,nV,ldb + double precision, intent(in) :: t1(nO, nV) + double precision, intent(out) :: B1(nV, nV, nV, nV) + + integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam + + do gam = 1, nV + do beta = 1, nV + do b = 1, nV + do a = 1, nV + B1(a,b,beta,gam) = cc_space_v_vvvv(a,b,beta,gam) + + do i = 1, nO + B1(a,b,beta,gam) = B1(a,b,beta,gam) & + - cc_space_v_vvvo(a,b,beta,i) * t1(i,gam) & + - cc_space_v_vvov(a,b,i,gam) * t1(i,beta) + enddo + + enddo + enddo + enddo + enddo + +end + +! g_occ + +subroutine compute_g_occ_chol(nO,nV,t1,t2,H_oo,g_occ) + + implicit none + + integer, intent(in) :: nO,nV + double precision, intent(in) :: t1(nO, nV), H_oo(nO, nO) + double precision, intent(in) :: t2(nO, nO, nV, nV) + double precision, intent(out) :: g_occ(nO, nO) + + integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam + + call dgemm('N','N',nO,nO,nV, & + 1d0, t1, size(t1,1), & + cc_space_f_vo, size(cc_space_f_vo,1), & + 0d0, g_occ, size(g_occ,1)) + + !$omp parallel & + !$omp shared(nO,nV,g_occ,H_oo, cc_space_v_ovoo,t1) & + !$omp private(i,j,a,u) & + !$omp default(none) + !$omp do + do i = 1, nO + do u = 1, nO + g_occ(u,i) = g_occ(u,i) + H_oo(u,i) + enddo + enddo + !$omp end do + + !$omp do + do i = 1, nO + do j = 1, nO + do a = 1, nV + do u = 1, nO + g_occ(u,i) = g_occ(u,i) + (2d0 * cc_space_v_ovoo(u,a,i,j) - cc_space_v_ovoo(u,a,j,i)) * t1(j,a) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + +end + +! g_vir + +subroutine compute_g_vir_chol(nO,nV,t1,t2,H_vv,g_vir) + + implicit none + + integer, intent(in) :: nO,nV + double precision, intent(in) :: t1(nO, nV), H_vv(nV, nV) + double precision, intent(in) :: t2(nO, nO, nV, nV) + double precision, intent(out) :: g_vir(nV, nV) + + integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam + + call dgemm('N','N',nV,nV,nO, & + -1d0, cc_space_f_vo , size(cc_space_f_vo,1), & + t1 , size(t1,1), & + 0d0, g_vir, size(g_vir,1)) + + !$omp parallel & + !$omp shared(nO,nV,g_vir,H_vv, cc_space_v_vvvo,t1) & + !$omp private(i,b,a,beta) & + !$omp default(none) + !$omp do + do beta = 1, nV + do a = 1, nV + g_vir(a,beta) = g_vir(a,beta) + H_vv(a,beta) + enddo + enddo + !$omp end do + + !$omp do + do beta = 1, nV + do i = 1, nO + do b = 1, nV + do a = 1, nV + g_vir(a,beta) = g_vir(a,beta) + (2d0 * cc_space_v_vvvo(a,b,beta,i) - cc_space_v_vvvo(b,a,beta,i)) * t1(i,b) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + +end + +! J1 + +subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1) + + implicit none + + integer, intent(in) :: nO,nV + double precision, intent(in) :: t1(nO, nV) + double precision, intent(in) :: t2(nO, nO, nV, nV) + double precision, intent(in) :: v_ovvo(nO,nV,nV,nO), v_ovoo(nO,nV,nO,nO) + double precision, intent(in) :: v_vvvo(nV,nV,nV,nO), v_vvoo(nV,nV,nO,nO) + double precision, intent(out) :: J1(nO, nV, nV, nO) + + integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam + + double precision, allocatable :: X_ovoo(:,:,:,:), Y_ovov(:,:,:,:) + allocate(X_ovoo(nO,nV,nO,nO),Y_ovov(nO,nV,nO,nV)) + + !$omp parallel & + !$omp shared(nO,nV,J1,v_ovvo,v_ovoo,X_ovoo) & + !$omp private(i,j,a,u,beta) & + !$omp default(none) + do i = 1, nO + !$omp do + do beta = 1, nV + do a = 1, nV + do u = 1, nO + J1(u,a,beta,i) = v_ovvo(u,a,beta,i) + enddo + enddo + enddo + !$omp end do nowait + enddo + + !$omp do collapse(2) + do j = 1, nO + do i = 1, nO + do a = 1, nV + do u = 1, nO + X_ovoo(u,a,i,j) = v_ovoo(u,a,j,i) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + call dgemm('N','N',nO*nV*nO,nV,nO, & + -1d0, X_ovoo, size(X_ovoo,1) * size(X_ovoo,2) * size(X_ovoo,3), & + t1 , size(t1,1), & + 0d0, Y_ovov, size(Y_ovov,1) * size(Y_ovov,2) * size(Y_ovov,3)) + + !$omp parallel & + !$omp shared(nO,nV,J1,Y_ovov) & + !$omp private(i,beta,a,u) & + !$omp default(none) + do i = 1, nO + !$omp do + do beta = 1, nV + do a = 1, nV + do u = 1, nO + J1(u,a,beta,i) = J1(u,a,beta,i) + Y_ovov(u,a,i,beta) + enddo + enddo + enddo + !$omp end do nowait + enddo + !$omp end parallel + deallocate(X_ovoo) + + ! v_vvvo(b,a,beta,i) * t1(u,b) + call dgemm('N','N',nO,nV*nV*nO,nV, & + 1d0, t1 , size(t1,1), & + v_vvvo, size(v_vvvo,1), & + 1d0, J1 , size(J1,1)) + + !- cc_space_v_vvoo(a,b,i,j) * (0.5d0 * t2(u,j,b,beta) + t1(u,b) * t1(j,beta)) & + double precision, allocatable :: X_voov(:,:,:,:), Z_ovvo(:,:,:,:) + allocate(X_voov(nV,nO,nO,nV), Z_ovvo(nO,nV,nV,nO)) + !$omp parallel & + !$omp shared(nO,nV,t2,t1,Y_ovov,X_voov,v_vvoo) & + !$omp private(i,beta,a,u,b,j) & + !$omp default(none) + !$omp do + do b = 1, nV + do j = 1, nO + do beta = 1, nV + do u = 1, nO + Y_ovov(u,beta,j,b) = 0.5d0 * t2(u,j,b,beta) + t1(u,b) * t1(j,beta) + enddo + enddo + enddo + enddo + !$omp end do nowait + + !$omp do + do b = 1, nV + do j = 1, nO + do i = 1, nO + do a = 1, nV + X_voov(a,i,j,b) = v_vvoo(a,b,i,j) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + call dgemm('N','T',nO*nV,nV*nO,nO*nV, & + -1d0, Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), & + X_voov, size(X_voov,1) * size(X_voov,2), & + 0d0, Z_ovvo, size(Z_ovvo,1) * size(Z_ovvo,2)) + deallocate(X_voov) + + double precision, allocatable :: X_ovvo(:,:,:,:), Y_vovo(:,:,:,:) + allocate(X_ovvo(nO,nV,nV,nO),Y_vovo(nV,nO,nV,nO)) + !$omp parallel & + !$omp shared(nO,nV,J1,Z_ovvo,t2,Y_vovo,v_vvoo,X_ovvo) & + !$omp private(i,beta,a,u,j,b) & + !$omp default(none) + do i = 1, nO + !$omp do + do beta = 1, nV + do a = 1, nV + do u = 1, nO + J1(u,a,beta,i) = J1(u,a,beta,i) + Z_ovvo(u,beta,a,i) + enddo + enddo + enddo + !$omp end do nowait + enddo + + !+ 0.5d0 * (2d0 * cc_space_v_vvoo(a,b,i,j) - cc_space_v_vvoo(b,a,i,j)) * t2(u,j,beta,b) + do j = 1, nO + !$omp do + do b = 1, nV + do i = 1, nO + do a = 1, nV + Y_vovo(a,i,b,j) = 0.5d0 * (2d0 * v_vvoo(a,b,i,j) - v_vvoo(b,a,i,j)) + enddo + enddo + enddo + !$omp end do nowait + enddo + + do j = 1, nO + !$omp do + do b = 1, nV + do beta = 1, nV + do u = 1, nO + X_ovvo(u,beta,b,j) = t2(u,j,beta,b) + enddo + enddo + enddo + !$omp end do nowait + enddo + !$omp end parallel + + call dgemm('N','T',nO*nV,nV*nO,nV*nO, & + 1d0, X_ovvo, size(X_ovvo,1) * size(X_ovvo,2), & + Y_vovo, size(Y_vovo,1) * size(Y_vovo,2), & + 0d0, Z_ovvo, size(Z_ovvo,1) * size(Z_ovvo,2)) + + !$omp parallel & + !$omp shared(nO,nV,J1,Z_ovvo) & + !$omp private(i,beta,a,u) & + !$omp default(none) + do i = 1, nO + !$omp do + do beta = 1, nV + do a = 1, nV + do u = 1, nO + J1(u,a,beta,i) = J1(u,a,beta,i) + Z_ovvo(u,beta,a,i) + enddo + enddo + enddo + !$omp end do nowait + enddo + !$omp end parallel + + deallocate(X_ovvo,Z_ovvo,Y_ovov) + +end + +! K1 + +subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,v_vvov,K1) + + implicit none + + integer, intent(in) :: nO,nV + double precision, intent(in) :: t1(nO, nV) + double precision, intent(in) :: t2(nO, nO, nV, nV) + double precision, intent(in) :: v_vvoo(nV,nV,nO,nO), v_ovov(nO,nV,nO,nV) + double precision, intent(in) :: v_vvov(nV,nV,nO,nV), v_ovoo(nO,nV,nO,nO) + double precision, intent(out) :: K1(nO, nV, nO, nV) + + double precision, allocatable :: X(:,:,:,:), Y(:,:,:,:), Z(:,:,:,:) + + integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam + + allocate(X(nV,nO,nV,nO),Y(nO,nV,nV,nO),Z(nO,nV,nV,nO)) + + !$omp parallel & + !$omp shared(nO,nV,K1,X,Y,v_vvoo,v_ovov,t1,t2) & + !$omp private(i,beta,a,u,j,b) & + !$omp default(none) + !$omp do + do beta = 1, nV + do i = 1, nO + do a = 1, nV + do u = 1, nO + K1(u,a,i,beta) = v_ovov(u,a,i,beta) + enddo + enddo + enddo + enddo + !$omp end do nowait + + do i = 1, nO + !$omp do + do a = 1, nV + do j = 1, nO + do b = 1, nV + X(b,j,a,i) = - v_vvoo(b,a,i,j) + enddo + enddo + enddo + !$omp end do nowait + enddo + + do j = 1, nO + !$omp do + do b = 1, nV + do beta = 1, nV + do u = 1, nO + Y(u,beta,b,j) = 0.5d0 * t2(u,j,b,beta) + t1(u,b) * t1(j,beta) + enddo + enddo + enddo + !$omp end do + enddo + !$omp end parallel + + call dgemm('N','N',nO*nV*nO,nV,nO, & + -1d0, v_ovoo, size(v_ovoo,1) * size(v_ovoo,2) * size(v_ovoo,3), & + t1 , size(t1,1), & + 1d0, K1 , size(K1,1) * size(K1,2) * size(K1,3)) + + call dgemm('N','N',nO,nV*nO*nV,nV, & + 1d0, t1 , size(t1,1), & + v_vvov, size(v_vvov,1), & + 1d0, K1 , size(K1,1)) + + ! Y(u,beta,b,j) * X(b,j,a,i) = Z(u,beta,a,i) + call dgemm('N','N',nV*nO,nO*nV,nV*nO, & + 1d0, Y, size(Y,1) * size(Y,2), & + X, size(X,1) * size(X,2), & + 0d0, Z, size(Z,1) * size(Z,2)) + + !$omp parallel & + !$omp shared(nO,nV,K1,Z) & + !$omp private(i,beta,a,u) & + !$omp default(none) + !$omp do + do beta = 1, nV + do i = 1, nO + do a = 1, nV + do u = 1, nO + K1(u,a,i,beta) = K1(u,a,i,beta) + Z(u,beta,a,i) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + deallocate(X,Y,Z) + +end diff --git a/src/mo_two_e_ints/mo_bi_integrals.irp.f b/src/mo_two_e_ints/mo_bi_integrals.irp.f index b15d9745..af40e571 100644 --- a/src/mo_two_e_ints/mo_bi_integrals.irp.f +++ b/src/mo_two_e_ints/mo_bi_integrals.irp.f @@ -166,11 +166,9 @@ subroutine four_idx_dgemm deallocate (a1) + call map_sort(mo_integrals_map) call map_unique(mo_integrals_map) - integer*8 :: get_mo_map_size, mo_map_size - mo_map_size = get_mo_map_size() - end subroutine subroutine add_integrals_to_map(mask_ijkl) @@ -250,7 +248,7 @@ subroutine add_integrals_to_map(mask_ijkl) call wall_time(wall_1) - size_buffer = min(ao_num*ao_num*ao_num,8000000) + size_buffer = min(ao_num*ao_num,8000000) print*, 'Buffers : ', 8.*(mo_num*(n_j)*(n_k+1) + mo_num+& ao_num+ao_num*ao_num+ size_buffer*3)/(1024*1024), 'MB / core' @@ -443,11 +441,6 @@ subroutine add_integrals_to_map(mask_ijkl) !$OMP END PARALLEL call map_merge(mo_integrals_map) - call wall_time(wall_2) - call cpu_time(cpu_2) - integer*8 :: get_mo_map_size, mo_map_size - mo_map_size = get_mo_map_size() - deallocate(list_ijkl) @@ -463,55 +456,55 @@ subroutine add_integrals_to_map_cholesky integer :: i,j,k,l,m integer :: size_buffer, n_integrals - size_buffer = min(mo_num*mo_num*mo_num,16000000) + size_buffer = min(mo_num*mo_num,16000000) double precision, allocatable :: Vtmp(:,:,:) integer(key_kind) , allocatable :: buffer_i(:) real(integral_kind), allocatable :: buffer_value(:) - if (.True.) then - ! In-memory transformation + call set_multiple_levels_omp(.False.) - call set_multiple_levels_omp(.False.) + !$OMP PARALLEL DEFAULT(SHARED) & + !$OMP PRIVATE(i,j,k,l,n_integrals,buffer_value, buffer_i, Vtmp) + allocate (buffer_i(size_buffer), buffer_value(size_buffer)) + allocate (Vtmp(mo_num,mo_num,mo_num)) + n_integrals = 0 - !$OMP PARALLEL PRIVATE(i,j,k,l,n_integrals,buffer_value, buffer_i, Vtmp) - allocate (buffer_i(size_buffer), buffer_value(size_buffer)) - n_integrals = 0 + !$OMP DO SCHEDULE(dynamic) + do l=1,mo_num + call dgemm('T','N',mo_num*mo_num,mo_num,cholesky_ao_num,1.d0, & + cholesky_mo_transp, cholesky_ao_num, & + cholesky_mo_transp(1,1,l), cholesky_ao_num, 0.d0, & + Vtmp, mo_num*mo_num) - allocate (Vtmp(mo_num,mo_num,mo_num)) - - !$OMP DO - do l=1,mo_num - - call dgemm('T','N',mo_num*mo_num,mo_num,cholesky_ao_num,1.d0, & - cholesky_mo_transp, cholesky_ao_num, & - cholesky_mo_transp(1,1,l), cholesky_ao_num, 0.d0, & - Vtmp, mo_num*mo_num) - do k=1,l - do j=1,mo_num - do i=1,j - if (abs(Vtmp(i,j,k)) > mo_integrals_threshold) then - n_integrals += 1 - buffer_value(n_integrals) = Vtmp(i,j,k) - !DIR$ FORCEINLINE - call mo_two_e_integrals_index(i,k,j,l,buffer_i(n_integrals)) - if (n_integrals == size_buffer) then - call map_append(mo_integrals_map, buffer_i, buffer_value, n_integrals) - n_integrals = 0 - endif + do k=1,l + do j=1,mo_num + do i=1,j + if (dabs(Vtmp(i,j,k)) > mo_integrals_threshold) then + n_integrals = n_integrals + 1 + buffer_value(n_integrals) = Vtmp(i,j,k) + !DIR$ FORCEINLINE + call mo_two_e_integrals_index(i,k,j,l,buffer_i(n_integrals)) + if (n_integrals == size_buffer) then + call map_append(mo_integrals_map, buffer_i, buffer_value, n_integrals) + n_integrals = 0 endif - enddo + endif enddo enddo enddo - !$OMP END DO + enddo + !$OMP END DO NOWAIT + + if (n_integrals > 0) then call map_append(mo_integrals_map, buffer_i, buffer_value, n_integrals) - deallocate(buffer_i, buffer_value, Vtmp) - !$OMP END PARALLEL - - call map_unique(mo_integrals_map) - endif + deallocate(buffer_i, buffer_value, Vtmp) + !$OMP BARRIER + !$OMP END PARALLEL + + call map_sort(mo_integrals_map) + call map_unique(mo_integrals_map) end diff --git a/src/tools/four_idx_transform.irp.f b/src/tools/four_idx_transform.irp.f index 92e87cad..f7520e71 100644 --- a/src/tools/four_idx_transform.irp.f +++ b/src/tools/four_idx_transform.irp.f @@ -14,6 +14,13 @@ program four_idx_transform io_mo_two_e_integrals = 'Write' SOFT_TOUCH io_mo_two_e_integrals + if (.true.) then + PROVIDE ao_two_e_integrals_in_map + endif + if (do_ao_cholesky) then + PROVIDE cholesky_mo_transp + FREE cholesky_ao + endif if (.true.) then PROVIDE mo_two_e_integrals_in_map endif diff --git a/src/utils/fortran_mmap.c b/src/utils/fortran_mmap.c index 52df2476..71426002 100644 --- a/src/utils/fortran_mmap.c +++ b/src/utils/fortran_mmap.c @@ -22,11 +22,7 @@ void* mmap_fortran(char* filename, size_t bytes, int* file_descr, int read_only) perror("Error opening mmap file for reading"); exit(EXIT_FAILURE); } - map = mmap(NULL, bytes, PROT_READ, MAP_SHARED | MAP_HUGETLB, fd, 0); - if (map == MAP_FAILED) { - /* try again without huge pages */ - map = mmap(NULL, bytes, PROT_READ, MAP_SHARED, fd, 0); - } + map = mmap(NULL, bytes, PROT_READ, MAP_SHARED, fd, 0); } else { @@ -53,16 +49,12 @@ void* mmap_fortran(char* filename, size_t bytes, int* file_descr, int read_only) exit(EXIT_FAILURE); } - map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_HUGETLB, fd, 0); - if (map == MAP_FAILED) { - /* try again without huge pages */ - map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - } + map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); } if (map == MAP_FAILED) { close(fd); - printf("%s:\n", filename); + printf("%s: %lu\n", filename, bytes); perror("Error mmapping the file"); exit(EXIT_FAILURE); } diff --git a/src/utils/mmap.f90 b/src/utils/mmap.f90 index 49147283..caabc6f1 100644 --- a/src/utils/mmap.f90 +++ b/src/utils/mmap.f90 @@ -46,7 +46,14 @@ module mmap_module integer(c_size_t) :: length integer(c_int) :: fd_ - length = PRODUCT( shape(:) ) * bytes + integer :: i + + length = int(bytes,8) + do i=1,size(shape) + length = length * shape(i) + enddo +print *, 'map_length: ', length + if (read_only) then map = c_mmap_fortran( trim(filename)//char(0), length, fd_, 1) else @@ -66,7 +73,13 @@ module mmap_module integer(c_size_t) :: length integer(c_int) :: fd_ - length = PRODUCT( shape(:) ) * bytes + integer :: i + + length = int(bytes,8) + do i=1,size(shape) + length = length * shape(i) + enddo +print *, 'map_length: ', length fd_ = fd call c_munmap_fortran( length, fd_, map) end subroutine @@ -82,7 +95,13 @@ module mmap_module integer(c_size_t) :: length integer(c_int) :: fd_ - length = PRODUCT( shape(:) ) * bytes + integer :: i + + length = int(bytes,8) + do i=1,size(shape) + length = length * shape(i) + enddo +print *, 'map_length: ', length fd_ = fd call c_msync_fortran( length, fd_, map) end subroutine diff --git a/src/utils_cc/mo_integrals_cc.irp.f b/src/utils_cc/mo_integrals_cc.irp.f index 2db614b4..62237229 100644 --- a/src/utils_cc/mo_integrals_cc.irp.f +++ b/src/utils_cc/mo_integrals_cc.irp.f @@ -53,33 +53,8 @@ subroutine gen_v_space(n1,n2,n3,n4,list1,list2,list3,list4,v) allocate(v1(cholesky_ao_num,n1,n3), v2(cholesky_ao_num,n2,n4)) allocate(buffer(n1,n3,n2,n4)) - !$OMP PARALLEL PRIVATE(i1,i2,i3,i4,idx1,idx2,idx3,idx4,k) - !$OMP DO - do i3=1,n3 - idx3 = list3(i3) - do i1=1,n1 - idx1 = list1(i1) - do k=1,cholesky_ao_num - v1(k,i1,i3) = cholesky_mo_transp(k,idx1,idx3) - enddo - enddo - enddo - !$OMP END DO NOWAIT - - !$OMP DO - do i4=1,n4 - idx4 = list4(i4) - do i2=1,n2 - idx2 = list2(i2) - do k=1,cholesky_ao_num - v2(k,i2,i4) = cholesky_mo_transp(k,idx2,idx4) - enddo - enddo - enddo - !$OMP END DO NOWAIT - - !$OMP BARRIER - !$OMP END PARALLEL + call gen_v_space_chol(n1,n3,list1,list3,v1,cholesky_ao_num) + call gen_v_space_chol(n2,n4,list2,list4,v2,cholesky_ao_num) call dgemm('T','N', n1*n3, n2*n4, cholesky_ao_num, 1.d0, & v1, cholesky_ao_num, & @@ -129,6 +104,30 @@ subroutine gen_v_space(n1,n2,n3,n4,list1,list2,list3,list4,v) end +subroutine gen_v_space_chol(n1,n3,list1,list3,v,ldv) + + implicit none + + integer, intent(in) :: n1,n3,ldv + integer, intent(in) :: list1(n1),list3(n3) + double precision, intent(out) :: v(ldv,n1,n3) + + integer :: i1,i3,idx1,idx3,k + + !$OMP PARALLEL DO PRIVATE(i1,i3,idx1,idx3,k) + do i3=1,n3 + idx3 = list3(i3) + do i1=1,n1 + idx1 = list1(i1) + do k=1,cholesky_ao_num + v(k,i1,i3) = cholesky_mo_transp(k,idx1,idx3) + enddo + enddo + enddo + !$OMP END PARALLEL DO + +end + ! full BEGIN_PROVIDER [double precision, cc_space_v, (mo_num,mo_num,mo_num,mo_num)] @@ -345,6 +344,38 @@ BEGIN_PROVIDER [double precision, cc_space_v_vvvv, (cc_nVa, cc_nVa, cc_nVa, cc_n END_PROVIDER +BEGIN_PROVIDER [double precision, cc_space_v_vv_chol, (cholesky_ao_num, cc_nVa, cc_nVa)] + + implicit none + + call gen_v_space_chol(cc_nVa, cc_nVa, cc_list_vir, cc_list_vir, cc_space_v_vv_chol, cholesky_ao_num) + +END_PROVIDER + +BEGIN_PROVIDER [double precision, cc_space_v_vo_chol, (cholesky_ao_num, cc_nVa, cc_nOa)] + + implicit none + + call gen_v_space_chol(cc_nVa, cc_nOa, cc_list_vir, cc_list_occ, cc_space_v_vo_chol, cholesky_ao_num) + +END_PROVIDER + +BEGIN_PROVIDER [double precision, cc_space_v_ov_chol, (cholesky_ao_num, cc_nOa, cc_nVa)] + + implicit none + + call gen_v_space_chol(cc_nOa, cc_nVa, cc_list_occ, cc_list_vir, cc_space_v_ov_chol, cholesky_ao_num) + +END_PROVIDER + +BEGIN_PROVIDER [double precision, cc_space_v_oo_chol, (cholesky_ao_num, cc_nOa, cc_nOa)] + + implicit none + + call gen_v_space_chol(cc_nOa, cc_nOa, cc_list_occ, cc_list_occ, cc_space_v_oo_chol, cholesky_ao_num) + +END_PROVIDER + ! ppqq BEGIN_PROVIDER [double precision, cc_space_v_ppqq, (cc_n_mo, cc_n_mo)] From a2c4a74d926e0017b701d1f6f510b2bf9a751f74 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Thu, 6 Jul 2023 16:21:37 +0200 Subject: [PATCH 19/40] Fixed writing MOs for large sizes' --- src/ao_two_e_ints/cholesky.irp.f | 12 ++++++++++-- src/mo_two_e_ints/cholesky.irp.f | 6 +++++- src/mo_two_e_ints/mo_bi_integrals.irp.f | 13 ++++++++++++- src/utils/map_functions.irp.f | 4 ++++ src/utils/mmap.f90 | 3 --- 5 files changed, 31 insertions(+), 7 deletions(-) diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f index f4746144..ce05de5b 100644 --- a/src/ao_two_e_ints/cholesky.irp.f +++ b/src/ao_two_e_ints/cholesky.irp.f @@ -339,8 +339,16 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] call omp_destroy_lock(lock(k)) enddo - allocate(cholesky_ao(ao_num,ao_num,rank)) - call dcopy(ndim*rank, L, 1, cholesky_ao, 1) + allocate(cholesky_ao(ao_num,ao_num,rank), stat=ierr) + if (ierr /= 0) then + print *, irp_here, ': Allocation failed' + stop -1 + endif + !$OMP PARALLEL DO PRIVATE(k) + do k=1,rank + call dcopy(ndim, L(1,k), 1, cholesky_ao(1,1,k), 1) + enddo + !$OMP END PARALLEL DO deallocate(L) cholesky_ao_num = rank diff --git a/src/mo_two_e_ints/cholesky.irp.f b/src/mo_two_e_ints/cholesky.irp.f index 7cfbaa58..3a868cbe 100644 --- a/src/mo_two_e_ints/cholesky.irp.f +++ b/src/mo_two_e_ints/cholesky.irp.f @@ -26,9 +26,13 @@ BEGIN_PROVIDER [ double precision, cholesky_mo_transp, (cholesky_ao_num, mo_num, END_DOC double precision, allocatable :: X(:,:,:) + integer :: ierr print *, 'AO->MO Transformation of Cholesky vectors' - allocate(X(mo_num,cholesky_ao_num,ao_num)) + allocate(X(mo_num,cholesky_ao_num,ao_num), stat=ierr) + if (ierr /= 0) then + print *, irp_here, ': Allocation failed' + endif call dgemm('T','N', ao_num*cholesky_ao_num, mo_num, ao_num, 1.d0, & cholesky_ao, ao_num, mo_coef, ao_num, 0.d0, X, ao_num*cholesky_ao_num) call dgemm('T','N', cholesky_ao_num*mo_num, mo_num, ao_num, 1.d0, & diff --git a/src/mo_two_e_ints/mo_bi_integrals.irp.f b/src/mo_two_e_ints/mo_bi_integrals.irp.f index af40e571..0ed6f373 100644 --- a/src/mo_two_e_ints/mo_bi_integrals.irp.f +++ b/src/mo_two_e_ints/mo_bi_integrals.irp.f @@ -90,6 +90,10 @@ subroutine four_idx_dgemm double precision, allocatable :: a1(:,:,:,:) double precision, allocatable :: a2(:,:,:,:) + if (ao_num > 1289) then + print *, irp_here, ': Integer overflow in ao_num**3' + endif + allocate (a1(ao_num,ao_num,ao_num,ao_num)) print *, 'Getting AOs' @@ -103,6 +107,7 @@ subroutine four_idx_dgemm enddo !$OMP END PARALLEL DO + print *, '1st transformation' ! 1st transformation allocate (a2(ao_num,ao_num,ao_num,mo_num)) @@ -456,7 +461,7 @@ subroutine add_integrals_to_map_cholesky integer :: i,j,k,l,m integer :: size_buffer, n_integrals - size_buffer = min(mo_num*mo_num,16000000) + size_buffer = min(mo_num*mo_num*mo_num,16000000) double precision, allocatable :: Vtmp(:,:,:) integer(key_kind) , allocatable :: buffer_i(:) @@ -575,6 +580,9 @@ subroutine add_integrals_to_map_three_indices(mask_ijk) return endif + if (ao_num > 1289) then + print *, irp_here, ': Integer overflow in ao_num**3' + endif size_buffer = min(ao_num*ao_num*ao_num,16000000) print*, 'Providing the molecular integrals ' print*, 'Buffers : ', 8.*(mo_num*(n_j)*(n_k+1) + mo_num+& @@ -850,6 +858,9 @@ subroutine add_integrals_to_map_no_exit_34(mask_ijkl) call bitstring_to_list( mask_ijkl(1,3), list_ijkl(1,3), n_k, N_int ) call bitstring_to_list( mask_ijkl(1,4), list_ijkl(1,4), n_l, N_int ) + if (ao_num > 1289) then + print *, irp_here, ': Integer overflow in ao_num**3' + endif size_buffer = min(ao_num*ao_num*ao_num,16000000) print*, 'Providing the molecular integrals ' print*, 'Buffers : ', 8.*(mo_num*(n_j)*(n_k+1) + mo_num+& diff --git a/src/utils/map_functions.irp.f b/src/utils/map_functions.irp.f index cd3b28a8..97d0e8bf 100644 --- a/src/utils/map_functions.irp.f +++ b/src/utils/map_functions.irp.f @@ -11,6 +11,10 @@ subroutine map_save_to_disk(filename,map) integer*8 :: n_elements n_elements = int(map % n_elements,8) + if (n_elements <= 0) then + print *, 'Unable to write map to disk: n_elements = ', n_elements + stop -1 + endif if (map % consolidated) then diff --git a/src/utils/mmap.f90 b/src/utils/mmap.f90 index caabc6f1..41e60224 100644 --- a/src/utils/mmap.f90 +++ b/src/utils/mmap.f90 @@ -52,7 +52,6 @@ module mmap_module do i=1,size(shape) length = length * shape(i) enddo -print *, 'map_length: ', length if (read_only) then map = c_mmap_fortran( trim(filename)//char(0), length, fd_, 1) @@ -79,7 +78,6 @@ print *, 'map_length: ', length do i=1,size(shape) length = length * shape(i) enddo -print *, 'map_length: ', length fd_ = fd call c_munmap_fortran( length, fd_, map) end subroutine @@ -101,7 +99,6 @@ print *, 'map_length: ', length do i=1,size(shape) length = length * shape(i) enddo -print *, 'map_length: ', length fd_ = fd call c_msync_fortran( length, fd_, map) end subroutine From e35f847341cf6094886cd675d5bf37b8e752c652 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Thu, 6 Jul 2023 17:51:59 +0200 Subject: [PATCH 20/40] Enabled direct integrals in Cholesky --- src/ao_two_e_ints/cholesky.irp.f | 43 +++++++++++++++++++------ src/ao_two_e_ints/two_e_integrals.irp.f | 3 +- src/mo_two_e_ints/mo_bi_integrals.irp.f | 4 ++- src/tools/four_idx_transform.irp.f | 7 ---- 4 files changed, 38 insertions(+), 19 deletions(-) diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f index ce05de5b..f7eae638 100644 --- a/src/ao_two_e_ints/cholesky.irp.f +++ b/src/ao_two_e_ints/cholesky.irp.f @@ -43,11 +43,15 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] double precision, external :: get_ao_two_e_integral logical, external :: ao_two_e_integral_zero + double precision, external :: ao_two_e_integral integer :: block_size, iblock, ierr integer(omp_lock_kind), allocatable :: lock(:) + PROVIDE nucl_coord - PROVIDE ao_two_e_integrals_in_map + if (.not.do_direct_integrals) then + PROVIDE ao_two_e_integrals_in_map + endif deallocate(cholesky_ao) ndim = ao_num*ao_num @@ -85,13 +89,22 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] enddo enddo - !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i) - do i=1,ndim - D(i) = get_ao_two_e_integral(addr(1,i), addr(1,i), & - addr(2,i), addr(2,i), & - ao_integrals_map) - enddo - !$OMP END PARALLEL DO + if (do_direct_integrals) then + !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i) + do i=1,ndim + D(i) = ao_two_e_integral(addr(1,i), addr(2,i), & + addr(1,i), addr(2,i)) + enddo + !$OMP END PARALLEL DO + else + !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i) + do i=1,ndim + D(i) = get_ao_two_e_integral(addr(1,i), addr(1,i), & + addr(2,i), addr(2,i), & + ao_integrals_map) + enddo + !$OMP END PARALLEL DO + endif Dmax = maxval(D) @@ -196,8 +209,13 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] if ((0 < q).and.(q < k)) cycle if (.not.ao_two_e_integral_zero( addr(1,Lset(k)), addr(1,Dset(m)), & addr(2,Lset(k)), addr(2,Dset(m)) ) ) then - Delta(k,m) = get_ao_two_e_integral( addr(1,Lset(k)), addr(1,Dset(m)), & + if (do_direct_integrals) then + Delta(k,m) = ao_two_e_integral(addr(1,Lset(k)), addr(2,Lset(k)), & + addr(1,Dset(m)), addr(2,Dset(m))) + else + Delta(k,m) = get_ao_two_e_integral( addr(1,Lset(k)), addr(1,Dset(m)), & addr(2,Lset(k)), addr(2,Dset(m)), ao_integrals_map) + endif if (q /= 0) Delta(q,m) = Delta(k,m) endif enddo @@ -218,8 +236,13 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] if ((0 < q).and.(q < p)) cycle if (.not.ao_two_e_integral_zero( addr(1,Dset(k)), addr(1,Dset(m)), & addr(2,Dset(k)), addr(2,Dset(m)) ) ) then - Delta(p,m) = get_ao_two_e_integral( addr(1,Dset(k)), addr(1,Dset(m)), & + if (do_direct_integrals) then + Delta(p,m) = ao_two_e_integral(addr(1,Dset(k)), addr(2,Dset(k)), & + addr(1,Dset(m)), addr(2,Dset(m))) + else + Delta(p,m) = get_ao_two_e_integral( addr(1,Dset(k)), addr(1,Dset(m)), & addr(2,Dset(k)), addr(2,Dset(m)), ao_integrals_map) + endif if (q /= 0) Delta(q,m) = Delta(p,m) if (j /= 0) Delta(p,j) = Delta(p,m) if (q*j /= 0) Delta(q,j) = Delta(p,m) diff --git a/src/ao_two_e_ints/two_e_integrals.irp.f b/src/ao_two_e_ints/two_e_integrals.irp.f index 85ff5bcf..0c70aae5 100644 --- a/src/ao_two_e_ints/two_e_integrals.irp.f +++ b/src/ao_two_e_ints/two_e_integrals.irp.f @@ -1232,7 +1232,8 @@ subroutine compute_ao_integrals_jl(j,l,n_integrals,buffer_i,buffer_value) logical, external :: ao_two_e_integral_zero integer :: i,k - double precision :: ao_two_e_integral,cpu_1,cpu_2, wall_1, wall_2 + double precision, external :: ao_two_e_integral + double precision :: cpu_1,cpu_2, wall_1, wall_2 double precision :: integral, wall_0 double precision :: thr integer :: kk, m, j1, i1 diff --git a/src/mo_two_e_ints/mo_bi_integrals.irp.f b/src/mo_two_e_ints/mo_bi_integrals.irp.f index 0ed6f373..0d3fe176 100644 --- a/src/mo_two_e_ints/mo_bi_integrals.irp.f +++ b/src/mo_two_e_ints/mo_bi_integrals.irp.f @@ -37,7 +37,9 @@ BEGIN_PROVIDER [ logical, mo_two_e_integrals_in_map ] call map_load_from_disk(trim(ezfio_filename)//'/work/mo_ints',mo_integrals_map) print*, 'MO integrals provided' return - else + endif + + if (.not. do_direct_integrals) then PROVIDE ao_two_e_integrals_in_map endif diff --git a/src/tools/four_idx_transform.irp.f b/src/tools/four_idx_transform.irp.f index f7520e71..92e87cad 100644 --- a/src/tools/four_idx_transform.irp.f +++ b/src/tools/four_idx_transform.irp.f @@ -14,13 +14,6 @@ program four_idx_transform io_mo_two_e_integrals = 'Write' SOFT_TOUCH io_mo_two_e_integrals - if (.true.) then - PROVIDE ao_two_e_integrals_in_map - endif - if (do_ao_cholesky) then - PROVIDE cholesky_mo_transp - FREE cholesky_ao - endif if (.true.) then PROVIDE mo_two_e_integrals_in_map endif From 905d88529f1b3a2711951cd59585dbea2b398fea Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Fri, 7 Jul 2023 17:42:20 +0200 Subject: [PATCH 21/40] Reduced memory in cholesky SCF --- src/ao_two_e_ints/cholesky.irp.f | 68 ++++++++++++++----- src/ao_two_e_ints/two_e_integrals.irp.f | 90 +++++++++++++++++++++---- src/ccsd/ccsd_t_space_orb_abc.irp.f | 2 +- src/determinants/density_matrix.irp.f | 4 +- src/determinants/h_apply.irp.f | 2 +- src/determinants/s2.irp.f | 2 +- src/hartree_fock/fock_matrix_hf.irp.f | 90 ++++++++++++++++--------- src/utils/memory.irp.f | 2 +- 8 files changed, 193 insertions(+), 67 deletions(-) diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f index f7eae638..d0fa735d 100644 --- a/src/ao_two_e_ints/cholesky.irp.f +++ b/src/ao_two_e_ints/cholesky.irp.f @@ -29,7 +29,7 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] double precision, pointer :: L(:,:), L_old(:,:) - double precision, parameter :: s = 1.d-1 + double precision :: s double precision, parameter :: dscale = 1.d0 double precision, allocatable :: D(:), Delta(:,:), Ltmp_p(:,:), Ltmp_q(:,:) @@ -47,6 +47,11 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] integer :: block_size, iblock, ierr integer(omp_lock_kind), allocatable :: lock(:) + + double precision :: rss + double precision, external :: memory_of_double, memory_of_int + + PROVIDE nucl_coord if (.not.do_direct_integrals) then @@ -57,6 +62,9 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] ndim = ao_num*ao_num tau = ao_cholesky_threshold + rss = 6.d0 * memory_of_double(ndim) + & + 6.d0 * memory_of_int(ndim) + call check_mem(rss, irp_here) allocate(L(ndim,1)) @@ -97,7 +105,7 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] enddo !$OMP END PARALLEL DO else - !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i) + !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i) SCHEDULE(guided) do i=1,ndim D(i) = get_ao_two_e_integral(addr(1,i), addr(1,i), & addr(2,i), addr(2,i), & @@ -130,21 +138,49 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] ! a. i = i+1 - ! b. - Dmin = max(s*Dmax,tau) + logical :: memory_ok + memory_ok = .False. - ! c. - nq=0 - LDmap = 0 - DLmap = 0 - do p=1,np - if ( D(Lset(p)) > Dmin ) then - nq = nq+1 - Dset(nq) = Lset(p) - Dset_rev(Dset(nq)) = nq - LDmap(p) = nq - DLmap(nq) = p + s = 1.d-2 + + ! Inrease s until the arrays fit in memory + do + + ! b. + Dmin = max(s*Dmax,tau) + + ! c. + nq=0 + LDmap = 0 + DLmap = 0 + do p=1,np + if ( D(Lset(p)) > Dmin ) then + nq = nq+1 + Dset(nq) = Lset(p) + Dset_rev(Dset(nq)) = nq + LDmap(p) = nq + DLmap(nq) = p + endif + enddo + + call resident_memory(rss) + rss = rss & + + np*memory_of_double(nq) & ! Delta(np,nq) + + (rank+nq)* memory_of_double(ndim) & ! L(ndim,rank+nq) + + (np+nq)*memory_of_double(block_size) ! Ltmp_p(np,block_size) + ! Ltmp_q(nq,block_size) + + if (rss > qp_max_mem) then + s = s*2.d0 + else + exit endif + + if (nq == 0) then + print *, 'Not enough memory. Reduce cholesky threshold' + stop -1 + endif + enddo ! d., e. @@ -198,7 +234,7 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] enddo !$OMP END DO - !$OMP DO SCHEDULE(dynamic,8) + !$OMP DO SCHEDULE(guided) do m=1,nq call omp_set_lock(lock(m)) diff --git a/src/ao_two_e_ints/two_e_integrals.irp.f b/src/ao_two_e_ints/two_e_integrals.irp.f index 0c70aae5..f86fb269 100644 --- a/src/ao_two_e_ints/two_e_integrals.irp.f +++ b/src/ao_two_e_ints/two_e_integrals.irp.f @@ -460,7 +460,7 @@ BEGIN_PROVIDER [ double precision, ao_two_e_integral_schwartz, (ao_num, ao_num) !$OMP PARALLEL DO PRIVATE(i,k) & !$OMP DEFAULT(NONE) & !$OMP SHARED (ao_num,ao_two_e_integral_schwartz) & - !$OMP SCHEDULE(dynamic) + !$OMP SCHEDULE(guided) do i=1,ao_num do k=1,i ao_two_e_integral_schwartz(i,k) = dsqrt(ao_two_e_integral(i,i,k,k)) @@ -975,7 +975,8 @@ recursive subroutine I_x1_pol_mult_recurs(a,c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt ! !DIR$ FORCEINLINE ! call multiply_poly(X,nx,B_10,2,d,nd) - call multiply_poly_c2(X,nx,B_10,d,nd) + !DIR$ FORCEINLINE + call multiply_poly_c2_inline_2e(X,nx,B_10,d,nd) nx = nd !DIR$ LOOP COUNT(8) @@ -998,7 +999,8 @@ recursive subroutine I_x1_pol_mult_recurs(a,c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt endif ! !DIR$ FORCEINLINE ! call multiply_poly(X,nx,B_00,2,d,nd) - call multiply_poly_c2(X,nx,B_00,d,nd) + !DIR$ FORCEINLINE + call multiply_poly_c2_inline_2e(X,nx,B_00,d,nd) endif ny=0 @@ -1017,7 +1019,8 @@ recursive subroutine I_x1_pol_mult_recurs(a,c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt ! !DIR$ FORCEINLINE ! call multiply_poly(Y,ny,C_00,2,d,nd) - call multiply_poly_c2(Y,ny,C_00,d,nd) + !DIR$ FORCEINLINE + call multiply_poly_c2_inline_2e(Y,ny,C_00,d,nd) end recursive subroutine I_x1_pol_mult_a1(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in) @@ -1057,7 +1060,8 @@ recursive subroutine I_x1_pol_mult_a1(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in) ! !DIR$ FORCEINLINE ! call multiply_poly(X,nx,B_00,2,d,nd) - call multiply_poly_c2(X,nx,B_00,d,nd) + !DIR$ FORCEINLINE + call multiply_poly_c2_inline_2e(X,nx,B_00,d,nd) ny=0 @@ -1069,7 +1073,8 @@ recursive subroutine I_x1_pol_mult_a1(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in) ! !DIR$ FORCEINLINE ! call multiply_poly(Y,ny,C_00,2,d,nd) - call multiply_poly_c2(Y,ny,C_00,d,nd) + !DIR$ FORCEINLINE + call multiply_poly_c2_inline_2e(Y,ny,C_00,d,nd) end @@ -1098,7 +1103,8 @@ recursive subroutine I_x1_pol_mult_a2(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in) ! !DIR$ FORCEINLINE ! call multiply_poly(X,nx,B_10,2,d,nd) - call multiply_poly_c2(X,nx,B_10,d,nd) + !DIR$ FORCEINLINE + call multiply_poly_c2_inline_2e(X,nx,B_10,d,nd) nx = nd !DIR$ LOOP COUNT(8) @@ -1118,7 +1124,8 @@ recursive subroutine I_x1_pol_mult_a2(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in) ! !DIR$ FORCEINLINE ! call multiply_poly(X,nx,B_00,2,d,nd) - call multiply_poly_c2(X,nx,B_00,d,nd) + !DIR$ FORCEINLINE + call multiply_poly_c2_inline_2e(X,nx,B_00,d,nd) ny=0 !DIR$ LOOP COUNT(8) @@ -1130,7 +1137,8 @@ recursive subroutine I_x1_pol_mult_a2(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in) ! !DIR$ FORCEINLINE ! call multiply_poly(Y,ny,C_00,2,d,nd) - call multiply_poly_c2(Y,ny,C_00,d,nd) + !DIR$ FORCEINLINE + call multiply_poly_c2_inline_2e(Y,ny,C_00,d,nd) end recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim) @@ -1177,9 +1185,9 @@ recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim) Y(1) = D_00(1) Y(2) = D_00(2) -! !DIR$ FORCEINLINE ! call multiply_poly(Y,ny,D_00,2,d,nd) - call multiply_poly_c2(Y,ny,D_00,d,nd) + !DIR$ FORCEINLINE + call multiply_poly_c2_inline_2e(Y,ny,D_00,d,nd) return @@ -1199,7 +1207,8 @@ recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim) ! !DIR$ FORCEINLINE ! call multiply_poly(X,nx,B_01,2,d,nd) - call multiply_poly_c2(X,nx,B_01,d,nd) + !DIR$ FORCEINLINE + call multiply_poly_c2_inline_2e(X,nx,B_01,d,nd) ny = 0 !DIR$ LOOP COUNT(6) @@ -1208,9 +1217,9 @@ recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim) enddo call I_x2_pol_mult(c-1,B_10,B_01,B_00,C_00,D_00,Y,ny,dim) -! !DIR$ FORCEINLINE ! call multiply_poly(Y,ny,D_00,2,d,nd) - call multiply_poly_c2(Y,ny,D_00,d,nd) + !DIR$ FORCEINLINE + call multiply_poly_c2_inline_2e(Y,ny,D_00,d,nd) end select end @@ -1300,3 +1309,56 @@ subroutine multiply_poly_local(b,nb,c,nc,d,nd) end +!DIR$ FORCEINLINE +subroutine multiply_poly_c2_inline_2e(b,nb,c,d,nd) + implicit none + BEGIN_DOC + ! Multiply two polynomials + ! D(t) += B(t)*C(t) + END_DOC + + integer, intent(in) :: nb + integer, intent(out) :: nd + double precision, intent(in) :: b(0:nb), c(0:2) + double precision, intent(inout) :: d(0:nb+2) + + integer :: ndtmp + integer :: ib, ic, id, k + if(nb < 0) return !False if nb>=0 + + select case (nb) + case (0) + d(0) = d(0) + c(0) * b(0) + d(1) = d(1) + c(1) * b(0) + d(2) = d(2) + c(2) * b(0) + + case (1) + d(0) = d(0) + c(0) * b(0) + d(1) = d(1) + c(0) * b(1) + c(1) * b(0) + d(2) = d(2) + c(1) * b(1) + c(2) * b(0) + d(3) = d(3) + c(2) * b(1) + + case (2) + d(0) = d(0) + c(0) * b(0) + d(1) = d(1) + c(0) * b(1) + c(1) * b(0) + d(2) = d(2) + c(0) * b(2) + c(1) * b(1) + c(2) * b(0) + d(3) = d(3) + c(1) * b(2) + c(2) * b(1) + d(4) = d(4) + c(2) * b(2) + + case default + d(0) = d(0) + c(0) * b(0) + d(1) = d(1) + c(0) * b(1) + c(1) * b(0) + do ib=2,nb + d(ib) = d(ib) + c(0) * b(ib) + c(1) * b(ib-1) + c(2) * b(ib-2) + enddo + d(nb+1) = d(nb+1) + c(1) * b(nb) + c(2) * b(nb-1) + d(nb+2) = d(nb+2) + c(2) * b(nb) + + end select + + do nd = nb+2,0,-1 + if (d(nd) /= 0.d0) exit + enddo + +end + diff --git a/src/ccsd/ccsd_t_space_orb_abc.irp.f b/src/ccsd/ccsd_t_space_orb_abc.irp.f index 1aab6bd7..12a71045 100644 --- a/src/ccsd/ccsd_t_space_orb_abc.irp.f +++ b/src/ccsd/ccsd_t_space_orb_abc.irp.f @@ -101,7 +101,7 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy) !$OMP PARALLEL PRIVATE(a,b,c,e) DEFAULT(SHARED) e = 0d0 - !$OMP DO SCHEDULE(dynamic) + !$OMP DO SCHEDULE(guided) do a = 1, nV do b = a+1, nV do c = b+1, nV diff --git a/src/determinants/density_matrix.irp.f b/src/determinants/density_matrix.irp.f index 1a1d92b5..ce4d96c2 100644 --- a/src/determinants/density_matrix.irp.f +++ b/src/determinants/density_matrix.irp.f @@ -117,7 +117,7 @@ END_PROVIDER !$OMP N_det_alpha_unique,N_det_beta_unique,irp_here) allocate(tmp_a(mo_num,mo_num,N_states), tmp_b(mo_num,mo_num,N_states) ) tmp_a = 0.d0 - !$OMP DO SCHEDULE(dynamic,64) + !$OMP DO SCHEDULE(guided) do k_a=1,N_det krow = psi_bilinear_matrix_rows(k_a) ASSERT (krow <= N_det_alpha_unique) @@ -173,7 +173,7 @@ END_PROVIDER deallocate(tmp_a) tmp_b = 0.d0 - !$OMP DO SCHEDULE(dynamic,64) + !$OMP DO SCHEDULE(guided) do k_b=1,N_det krow = psi_bilinear_matrix_transp_rows(k_b) ASSERT (krow <= N_det_alpha_unique) diff --git a/src/determinants/h_apply.irp.f b/src/determinants/h_apply.irp.f index 078c2104..65f1a832 100644 --- a/src/determinants/h_apply.irp.f +++ b/src/determinants/h_apply.irp.f @@ -250,7 +250,7 @@ subroutine remove_duplicates_in_psi_det(found_duplicates) enddo !$OMP END DO - !$OMP DO schedule(dynamic,1024) + !$OMP DO schedule(guided,64) do i=1,N_det-1 if (duplicate(i)) then cycle diff --git a/src/determinants/s2.irp.f b/src/determinants/s2.irp.f index 2c1a8757..6dc49526 100644 --- a/src/determinants/s2.irp.f +++ b/src/determinants/s2.irp.f @@ -317,7 +317,7 @@ subroutine get_uJ_s2_uI(psi_keys_tmp,psi_coefs_tmp,n,nmax_coefs,nmax_keys,s2,nst !$OMP SHARED (ll,jj,psi_keys_tmp,psi_coefs_tmp,N_int,n,nstates)& !$OMP REDUCTION(+:accu) allocate(idx(0:n)) - !$OMP DO SCHEDULE(dynamic) + !$OMP DO SCHEDULE(guided) do i = n,1,-1 ! Better OMP scheduling call get_s2(psi_keys_tmp(1,1,i),psi_keys_tmp(1,1,i),N_int,s2_tmp) accu += psi_coefs_tmp(i,ll) * s2_tmp * psi_coefs_tmp(i,jj) diff --git a/src/hartree_fock/fock_matrix_hf.irp.f b/src/hartree_fock/fock_matrix_hf.irp.f index 8c6658c5..a5ab6a60 100644 --- a/src/hartree_fock/fock_matrix_hf.irp.f +++ b/src/hartree_fock/fock_matrix_hf.irp.f @@ -190,47 +190,75 @@ END_PROVIDER deallocate(X) - ao_two_e_integral_beta_chol = ao_two_e_integral_alpha_chol + if (elec_alpha_num > elec_beta_num) then + ao_two_e_integral_beta_chol = ao_two_e_integral_alpha_chol + endif - allocate(X2(ao_num,ao_num,cholesky_ao_num,2)) + double precision :: rss + double precision :: memory_of_double + integer :: iblock + integer, parameter :: block_size = 32 + + rss = memory_of_double(ao_num*ao_num) + call check_mem(2.d0*block_size*rss, irp_here) + allocate(X2(ao_num,ao_num,block_size,2)) + allocate(X3(ao_num,block_size,ao_num,2)) + ! ao_two_e_integral_alpha_chol (l,s) -= cholesky_ao(l,m,j) * SCF_density_matrix_ao_beta (m,n) * cholesky_ao(n,s,j) - call dgemm('N','N',ao_num,ao_num*cholesky_ao_num,ao_num, 1.d0, & - SCF_density_matrix_ao_alpha, ao_num, & - cholesky_ao, ao_num, 0.d0, & - X2(1,1,1,1), ao_num) + do iblock=1,cholesky_ao_num,block_size - call dgemm('N','N',ao_num,ao_num*cholesky_ao_num,ao_num, 1.d0, & - SCF_density_matrix_ao_beta, ao_num, & - cholesky_ao, ao_num, 0.d0, & - X2(1,1,1,2), ao_num) + call dgemm('N','N',ao_num,ao_num*min(cholesky_ao_num-iblock+1,block_size),ao_num, 1.d0, & + SCF_density_matrix_ao_alpha, ao_num, & + cholesky_ao(1,1,iblock), ao_num, 0.d0, & + X2(1,1,1,1), ao_num) - allocate(X3(ao_num,cholesky_ao_num,ao_num,2)) + if (elec_alpha_num > elec_beta_num) then + call dgemm('N','N',ao_num,ao_num*min(cholesky_ao_num-iblock+1,block_size),ao_num, 1.d0, & + SCF_density_matrix_ao_beta, ao_num, & + cholesky_ao(1,1,iblock), ao_num, 0.d0, & + X2(1,1,1,2), ao_num) + + do s=1,ao_num + do j=1,min(cholesky_ao_num-iblock+1,block_size) + do m=1,ao_num + X3(m,j,s,1) = X2(m,s,j,1) + X3(m,j,s,2) = X2(m,s,j,2) + enddo + enddo + enddo + + else + + do s=1,ao_num + do j=1,min(cholesky_ao_num-iblock+1,block_size) + do m=1,ao_num + X3(m,j,s,1) = X2(m,s,j,1) + enddo + enddo + enddo + endif + + call dgemm('N','N',ao_num,ao_num,ao_num*min(cholesky_ao_num-iblock+1,block_size), -1.d0, & + cholesky_ao(1,1,iblock), ao_num, & + X3(1,1,1,1), ao_num*block_size, 1.d0, & + ao_two_e_integral_alpha_chol, ao_num) + + if (elec_alpha_num > elec_beta_num) then + call dgemm('N','N',ao_num,ao_num,ao_num*min(cholesky_ao_num-iblock+1,block_size), -1.d0, & + cholesky_ao(1,1,iblock), ao_num, & + X3(1,1,1,2), ao_num*block_size, 1.d0, & + ao_two_e_integral_beta_chol, ao_num) + endif - do s=1,ao_num - do j=1,cholesky_ao_num - do m=1,ao_num - X3(m,j,s,1) = X2(m,s,j,1) - X3(m,j,s,2) = X2(m,s,j,2) - enddo - enddo enddo - deallocate(X2) - - call dgemm('N','N',ao_num,ao_num,ao_num*cholesky_ao_num, -1.d0, & - cholesky_ao, ao_num, & - X3(1,1,1,1), ao_num*cholesky_ao_num, 1.d0, & - ao_two_e_integral_alpha_chol, ao_num) - - call dgemm('N','N',ao_num,ao_num,ao_num*cholesky_ao_num, -1.d0, & - cholesky_ao, ao_num, & - X3(1,1,1,2), ao_num*cholesky_ao_num, 1.d0, & - ao_two_e_integral_beta_chol, ao_num) - - deallocate(X3) + if (elec_alpha_num == elec_beta_num) then + ao_two_e_integral_beta_chol = ao_two_e_integral_alpha_chol + endif + deallocate(X2,X3) END_PROVIDER diff --git a/src/utils/memory.irp.f b/src/utils/memory.irp.f index 115b2cbe..0cd2133e 100644 --- a/src/utils/memory.irp.f +++ b/src/utils/memory.irp.f @@ -5,7 +5,7 @@ BEGIN_PROVIDER [ integer, qp_max_mem ] END_DOC character*(128) :: env - qp_max_mem = 2000 + qp_max_mem = 500 call getenv('QP_MAXMEM',env) if (trim(env) /= '') then call lock_io() From 4237fa888f0f537f113825557a6f0c38c2efeaff Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Fri, 7 Jul 2023 19:05:46 +0200 Subject: [PATCH 22/40] Get total memory --- src/ao_two_e_ints/cholesky.irp.f | 15 +++++++++---- src/utils/memory.irp.f | 36 +++++++++++++++++++++++++++++++- 2 files changed, 46 insertions(+), 5 deletions(-) diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f index d0fa735d..4702c850 100644 --- a/src/ao_two_e_ints/cholesky.irp.f +++ b/src/ao_two_e_ints/cholesky.irp.f @@ -141,7 +141,7 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] logical :: memory_ok memory_ok = .False. - s = 1.d-2 + s = 0.1d0 ! Inrease s until the arrays fit in memory do @@ -176,7 +176,7 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] exit endif - if (nq == 0) then + if ((s > 1.d0).or.(nq == 0)) then print *, 'Not enough memory. Reduce cholesky threshold' stop -1 endif @@ -219,10 +219,15 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] stop -1 endif - Delta(:,:) = 0.d0 !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(m,k,p,q,j) + !$OMP DO + do q=1,nq + Delta(:,q) = 0.d0 + enddo + !$OMP ENDDO NOWAIT + !$OMP DO do k=1,N do p=1,np @@ -232,7 +237,9 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] Ltmp_q(q,k) = L(Dset(q),k) enddo enddo - !$OMP END DO + !$OMP END DO NOWAIT + + !$OMP BARRIER !$OMP DO SCHEDULE(guided) do m=1,nq diff --git a/src/utils/memory.irp.f b/src/utils/memory.irp.f index 0cd2133e..7da283ec 100644 --- a/src/utils/memory.irp.f +++ b/src/utils/memory.irp.f @@ -4,8 +4,10 @@ BEGIN_PROVIDER [ integer, qp_max_mem ] ! Maximum memory in Gb END_DOC character*(128) :: env + integer, external :: get_total_available_memory - qp_max_mem = 500 + qp_max_mem = get_total_available_memory() + call write_int(6,qp_max_mem,'Total available memory (GB)') call getenv('QP_MAXMEM',env) if (trim(env) /= '') then call lock_io() @@ -122,3 +124,35 @@ subroutine print_memory_usage() '.. >>>>> [ RES MEM : ', rss , & ' GB ] [ VIRT MEM : ', mem, ' GB ] <<<<< ..' end + +integer function get_total_available_memory() result(res) + implicit none + BEGIN_DOC +! Returns the total available memory on the current machine + END_DOC + + character(len=128) :: line + integer :: status + integer :: iunit + integer*8, parameter :: KB = 1024 + integer*8, parameter :: GiB = 1024**3 + integer, external :: getUnitAndOpen + + iunit = getUnitAndOpen('/proc/meminfo','r') + + res = 512 + do + read(iunit, '(A)', END=10) line + if (line(1:10) == "MemTotal: ") then + read(line(11:), *, ERR=20) res + res = int((res*KB) / GiB,4) + exit + 20 continue + end if + end do + 10 continue + close(iunit) + +end function get_total_available_memory + + From 073aef70b8891d1027f14c8a3ca21d9261a81abe Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Fri, 7 Jul 2023 21:54:06 +0200 Subject: [PATCH 23/40] Inlined function in integrals --- src/ao_two_e_ints/two_e_integrals.irp.f | 466 ++++++++++++++++++++++-- 1 file changed, 429 insertions(+), 37 deletions(-) diff --git a/src/ao_two_e_ints/two_e_integrals.irp.f b/src/ao_two_e_ints/two_e_integrals.irp.f index f86fb269..148ebb62 100644 --- a/src/ao_two_e_ints/two_e_integrals.irp.f +++ b/src/ao_two_e_ints/two_e_integrals.irp.f @@ -951,7 +951,7 @@ recursive subroutine I_x1_pol_mult_recurs(a,c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt double precision :: X(0:max_dim) double precision :: Y(0:max_dim) !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: X,Y - integer :: nx, ix,iy,ny + integer :: nx, ix,iy,ny,ib ASSERT (a>2) !DIR$ LOOP COUNT(8) @@ -974,9 +974,43 @@ recursive subroutine I_x1_pol_mult_recurs(a,c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt enddo ! !DIR$ FORCEINLINE -! call multiply_poly(X,nx,B_10,2,d,nd) - !DIR$ FORCEINLINE - call multiply_poly_c2_inline_2e(X,nx,B_10,d,nd) +! call multiply_poly_c2_inline_2e(X,nx,B_10,d,nd) + if (nx >= 0) then + select case (nx) + case (0) + d(0) = d(0) + B_10(0) * X(0) + d(1) = d(1) + B_10(1) * X(0) + d(2) = d(2) + B_10(2) * X(0) + + case (1) + d(0) = d(0) + B_10(0) * X(0) + d(1) = d(1) + B_10(0) * X(1) + B_10(1) * X(0) + d(2) = d(2) + B_10(1) * X(1) + B_10(2) * X(0) + d(3) = d(3) + B_10(2) * X(1) + + case (2) + d(0) = d(0) + B_10(0) * X(0) + d(1) = d(1) + B_10(0) * X(1) + B_10(1) * X(0) + d(2) = d(2) + B_10(0) * X(2) + B_10(1) * X(1) + B_10(2) * X(0) + d(3) = d(3) + B_10(1) * X(2) + B_10(2) * X(1) + d(4) = d(4) + B_10(2) * X(2) + + case default + d(0) = d(0) + B_10(0) * X(0) + d(1) = d(1) + B_10(0) * X(1) + B_10(1) * X(0) + do ib=2,nx + d(ib) = d(ib) + B_10(0) * X(ib) + B_10(1) * X(ib-1) + B_10(2) * X(ib-2) + enddo + d(nx+1) = d(nx+1) + B_10(1) * X(nx) + B_10(2) * X(nx-1) + d(nx+2) = d(nx+2) + B_10(2) * X(nx) + + end select + + do nd = nx+2,0,-1 + if (d(nd) /= 0.d0) exit + enddo + + endif nx = nd !DIR$ LOOP COUNT(8) @@ -997,10 +1031,47 @@ recursive subroutine I_x1_pol_mult_recurs(a,c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt X(ix) *= c enddo endif + ! !DIR$ FORCEINLINE -! call multiply_poly(X,nx,B_00,2,d,nd) - !DIR$ FORCEINLINE - call multiply_poly_c2_inline_2e(X,nx,B_00,d,nd) +! call multiply_poly_c2_inline_2e(X,nx,B_00,d,nd) + if(nx >= 0) then + + select case (nx) + case (0) + d(0) = d(0) + B_00(0) * X(0) + d(1) = d(1) + B_00(1) * X(0) + d(2) = d(2) + B_00(2) * X(0) + + case (1) + d(0) = d(0) + B_00(0) * X(0) + d(1) = d(1) + B_00(0) * X(1) + B_00(1) * X(0) + d(2) = d(2) + B_00(1) * X(1) + B_00(2) * X(0) + d(3) = d(3) + B_00(2) * X(1) + + case (2) + d(0) = d(0) + B_00(0) * X(0) + d(1) = d(1) + B_00(0) * X(1) + B_00(1) * X(0) + d(2) = d(2) + B_00(0) * X(2) + B_00(1) * X(1) + B_00(2) * X(0) + d(3) = d(3) + B_00(1) * X(2) + B_00(2) * X(1) + d(4) = d(4) + B_00(2) * X(2) + + case default + d(0) = d(0) + B_00(0) * X(0) + d(1) = d(1) + B_00(0) * X(1) + B_00(1) * X(0) + do ib=2,nx + d(ib) = d(ib) + B_00(0) * X(ib) + B_00(1) * X(ib-1) + B_00(2) * X(ib-2) + enddo + d(nx+1) = d(nx+1) + B_00(1) * X(nx) + B_00(2) * X(nx-1) + d(nx+2) = d(nx+2) + B_00(2) * X(nx) + + end select + + do nd = nx+2,0,-1 + if (d(nd) /= 0.d0) exit + enddo + + endif + endif ny=0 @@ -1018,9 +1089,45 @@ recursive subroutine I_x1_pol_mult_recurs(a,c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt endif ! !DIR$ FORCEINLINE -! call multiply_poly(Y,ny,C_00,2,d,nd) - !DIR$ FORCEINLINE - call multiply_poly_c2_inline_2e(Y,ny,C_00,d,nd) +! call multiply_poly_c2_inline_2e(Y,ny,C_00,d,nd) + if(ny >= 0) then + + select case (ny) + case (0) + d(0) = d(0) + C_00(0) * Y(0) + d(1) = d(1) + C_00(1) * Y(0) + d(2) = d(2) + C_00(2) * Y(0) + + case (1) + d(0) = d(0) + C_00(0) * Y(0) + d(1) = d(1) + C_00(0) * Y(1) + C_00(1) * Y(0) + d(2) = d(2) + C_00(1) * Y(1) + C_00(2) * Y(0) + d(3) = d(3) + C_00(2) * Y(1) + + case (2) + d(0) = d(0) + C_00(0) * Y(0) + d(1) = d(1) + C_00(0) * Y(1) + C_00(1) * Y(0) + d(2) = d(2) + C_00(0) * Y(2) + C_00(1) * Y(1) + C_00(2) * Y(0) + d(3) = d(3) + C_00(1) * Y(2) + C_00(2) * Y(1) + d(4) = d(4) + C_00(2) * Y(2) + + case default + d(0) = d(0) + C_00(0) * Y(0) + d(1) = d(1) + C_00(0) * Y(1) + C_00(1) * Y(0) + do ib=2,ny + d(ib) = d(ib) + C_00(0) * Y(ib) + C_00(1) * Y(ib-1) + C_00(2) * Y(ib-2) + enddo + d(ny+1) = d(ny+1) + C_00(1) * Y(ny) + C_00(2) * Y(ny-1) + d(ny+2) = d(ny+2) + C_00(2) * Y(ny) + + end select + + do nd = ny+2,0,-1 + if (d(nd) /= 0.d0) exit + enddo + + endif + end recursive subroutine I_x1_pol_mult_a1(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in) @@ -1037,7 +1144,7 @@ recursive subroutine I_x1_pol_mult_a1(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in) double precision :: X(0:max_dim) double precision :: Y(0:max_dim) !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: X,Y - integer :: nx, ix,iy,ny + integer :: nx, ix,iy,ny,ib if( (c<0).or.(nd<0) )then nd = -1 @@ -1059,9 +1166,44 @@ recursive subroutine I_x1_pol_mult_a1(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in) endif ! !DIR$ FORCEINLINE -! call multiply_poly(X,nx,B_00,2,d,nd) - !DIR$ FORCEINLINE - call multiply_poly_c2_inline_2e(X,nx,B_00,d,nd) +! call multiply_poly_c2_inline_2e(X,nx,B_00,d,nd) + if(nx >= 0) then + + select case (nx) + case (0) + d(0) = d(0) + B_00(0) * X(0) + d(1) = d(1) + B_00(1) * X(0) + d(2) = d(2) + B_00(2) * X(0) + + case (1) + d(0) = d(0) + B_00(0) * X(0) + d(1) = d(1) + B_00(0) * X(1) + B_00(1) * X(0) + d(2) = d(2) + B_00(1) * X(1) + B_00(2) * X(0) + d(3) = d(3) + B_00(2) * X(1) + + case (2) + d(0) = d(0) + B_00(0) * X(0) + d(1) = d(1) + B_00(0) * X(1) + B_00(1) * X(0) + d(2) = d(2) + B_00(0) * X(2) + B_00(1) * X(1) + B_00(2) * X(0) + d(3) = d(3) + B_00(1) * X(2) + B_00(2) * X(1) + d(4) = d(4) + B_00(2) * X(2) + + case default + d(0) = d(0) + B_00(0) * X(0) + d(1) = d(1) + B_00(0) * X(1) + B_00(1) * X(0) + do ib=2,nx + d(ib) = d(ib) + B_00(0) * X(ib) + B_00(1) * X(ib-1) + B_00(2) * X(ib-2) + enddo + d(nx+1) = d(nx+1) + B_00(1) * X(nx) + B_00(2) * X(nx-1) + d(nx+2) = d(nx+2) + B_00(2) * X(nx) + + end select + + do nd = nx+2,0,-1 + if (d(nd) /= 0.d0) exit + enddo + + endif ny=0 @@ -1072,9 +1214,44 @@ recursive subroutine I_x1_pol_mult_a1(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in) call I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,Y,ny,n_pt_in) ! !DIR$ FORCEINLINE -! call multiply_poly(Y,ny,C_00,2,d,nd) - !DIR$ FORCEINLINE - call multiply_poly_c2_inline_2e(Y,ny,C_00,d,nd) +! call multiply_poly_c2_inline_2e(Y,ny,C_00,d,nd) + if(ny >= 0) then + + select case (ny) + case (0) + d(0) = d(0) + C_00(0) * Y(0) + d(1) = d(1) + C_00(1) * Y(0) + d(2) = d(2) + C_00(2) * Y(0) + + case (1) + d(0) = d(0) + C_00(0) * Y(0) + d(1) = d(1) + C_00(0) * Y(1) + C_00(1) * Y(0) + d(2) = d(2) + C_00(1) * Y(1) + C_00(2) * Y(0) + d(3) = d(3) + C_00(2) * Y(1) + + case (2) + d(0) = d(0) + C_00(0) * Y(0) + d(1) = d(1) + C_00(0) * Y(1) + C_00(1) * Y(0) + d(2) = d(2) + C_00(0) * Y(2) + C_00(1) * Y(1) + C_00(2) * Y(0) + d(3) = d(3) + C_00(1) * Y(2) + C_00(2) * Y(1) + d(4) = d(4) + C_00(2) * Y(2) + + case default + d(0) = d(0) + C_00(0) * Y(0) + d(1) = d(1) + C_00(0) * Y(1) + C_00(1) * Y(0) + do ib=2,ny + d(ib) = d(ib) + C_00(0) * Y(ib) + C_00(1) * Y(ib-1) + C_00(2) * Y(ib-2) + enddo + d(ny+1) = d(ny+1) + C_00(1) * Y(ny) + C_00(2) * Y(ny-1) + d(ny+2) = d(ny+2) + C_00(2) * Y(ny) + + end select + + do nd = ny+2,0,-1 + if (d(nd) /= 0.d0) exit + enddo + + endif end @@ -1092,7 +1269,7 @@ recursive subroutine I_x1_pol_mult_a2(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in) double precision :: X(0:max_dim) double precision :: Y(0:max_dim) !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: X,Y - integer :: nx, ix,iy,ny + integer :: nx, ix,iy,ny,ib !DIR$ LOOP COUNT(8) do ix=0,n_pt_in @@ -1102,9 +1279,44 @@ recursive subroutine I_x1_pol_mult_a2(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in) call I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,X,nx,n_pt_in) ! !DIR$ FORCEINLINE -! call multiply_poly(X,nx,B_10,2,d,nd) - !DIR$ FORCEINLINE - call multiply_poly_c2_inline_2e(X,nx,B_10,d,nd) +! call multiply_poly_c2_inline_2e(X,nx,B_10,d,nd) + if(nx >= 0) then + + select case (nx) + case (0) + d(0) = d(0) + B_10(0) * X(0) + d(1) = d(1) + B_10(1) * X(0) + d(2) = d(2) + B_10(2) * X(0) + + case (1) + d(0) = d(0) + B_10(0) * X(0) + d(1) = d(1) + B_10(0) * X(1) + B_10(1) * X(0) + d(2) = d(2) + B_10(1) * X(1) + B_10(2) * X(0) + d(3) = d(3) + B_10(2) * X(1) + + case (2) + d(0) = d(0) + B_10(0) * X(0) + d(1) = d(1) + B_10(0) * X(1) + B_10(1) * X(0) + d(2) = d(2) + B_10(0) * X(2) + B_10(1) * X(1) + B_10(2) * X(0) + d(3) = d(3) + B_10(1) * X(2) + B_10(2) * X(1) + d(4) = d(4) + B_10(2) * X(2) + + case default + d(0) = d(0) + B_10(0) * X(0) + d(1) = d(1) + B_10(0) * X(1) + B_10(1) * X(0) + do ib=2,nx + d(ib) = d(ib) + B_10(0) * X(ib) + B_10(1) * X(ib-1) + B_10(2) * X(ib-2) + enddo + d(nx+1) = d(nx+1) + B_10(1) * X(nx) + B_10(2) * X(nx-1) + d(nx+2) = d(nx+2) + B_10(2) * X(nx) + + end select + + do nd = nx+2,0,-1 + if (d(nd) /= 0.d0) exit + enddo + + endif nx = nd !DIR$ LOOP COUNT(8) @@ -1123,9 +1335,44 @@ recursive subroutine I_x1_pol_mult_a2(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in) endif ! !DIR$ FORCEINLINE -! call multiply_poly(X,nx,B_00,2,d,nd) - !DIR$ FORCEINLINE - call multiply_poly_c2_inline_2e(X,nx,B_00,d,nd) +! call multiply_poly_c2_inline_2e(X,nx,B_00,d,nd) + if(nx >= 0) then + + select case (nx) + case (0) + d(0) = d(0) + B_00(0) * X(0) + d(1) = d(1) + B_00(1) * X(0) + d(2) = d(2) + B_00(2) * X(0) + + case (1) + d(0) = d(0) + B_00(0) * X(0) + d(1) = d(1) + B_00(0) * X(1) + B_00(1) * X(0) + d(2) = d(2) + B_00(1) * X(1) + B_00(2) * X(0) + d(3) = d(3) + B_00(2) * X(1) + + case (2) + d(0) = d(0) + B_00(0) * X(0) + d(1) = d(1) + B_00(0) * X(1) + B_00(1) * X(0) + d(2) = d(2) + B_00(0) * X(2) + B_00(1) * X(1) + B_00(2) * X(0) + d(3) = d(3) + B_00(1) * X(2) + B_00(2) * X(1) + d(4) = d(4) + B_00(2) * X(2) + + case default + d(0) = d(0) + B_00(0) * X(0) + d(1) = d(1) + B_00(0) * X(1) + B_00(1) * X(0) + do ib=2,nx + d(ib) = d(ib) + B_00(0) * X(ib) + B_00(1) * X(ib-1) + B_00(2) * X(ib-2) + enddo + d(nx+1) = d(nx+1) + B_00(1) * X(nx) + B_00(2) * X(nx-1) + d(nx+2) = d(nx+2) + B_00(2) * X(nx) + + end select + + do nd = nx+2,0,-1 + if (d(nd) /= 0.d0) exit + enddo + + endif ny=0 !DIR$ LOOP COUNT(8) @@ -1136,9 +1383,45 @@ recursive subroutine I_x1_pol_mult_a2(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in) call I_x1_pol_mult_a1(c,B_10,B_01,B_00,C_00,D_00,Y,ny,n_pt_in) ! !DIR$ FORCEINLINE -! call multiply_poly(Y,ny,C_00,2,d,nd) - !DIR$ FORCEINLINE - call multiply_poly_c2_inline_2e(Y,ny,C_00,d,nd) +! call multiply_poly_c2_inline_2e(Y,ny,C_00,d,nd) + if(ny >= 0) then + + select case (ny) + case (0) + d(0) = d(0) + C_00(0) * Y(0) + d(1) = d(1) + C_00(1) * Y(0) + d(2) = d(2) + C_00(2) * Y(0) + + case (1) + d(0) = d(0) + C_00(0) * Y(0) + d(1) = d(1) + C_00(0) * Y(1) + C_00(1) * Y(0) + d(2) = d(2) + C_00(1) * Y(1) + C_00(2) * Y(0) + d(3) = d(3) + C_00(2) * Y(1) + + case (2) + d(0) = d(0) + C_00(0) * Y(0) + d(1) = d(1) + C_00(0) * Y(1) + C_00(1) * Y(0) + d(2) = d(2) + C_00(0) * Y(2) + C_00(1) * Y(1) + C_00(2) * Y(0) + d(3) = d(3) + C_00(1) * Y(2) + C_00(2) * Y(1) + d(4) = d(4) + C_00(2) * Y(2) + + case default + d(0) = d(0) + C_00(0) * Y(0) + d(1) = d(1) + C_00(0) * Y(1) + C_00(1) * Y(0) + do ib=2,ny + d(ib) = d(ib) + C_00(0) * Y(ib) + C_00(1) * Y(ib-1) + C_00(2) * Y(ib-2) + enddo + d(ny+1) = d(ny+1) + C_00(1) * Y(ny) + C_00(2) * Y(ny-1) + d(ny+2) = d(ny+2) + C_00(2) * Y(ny) + + end select + + do nd = ny+2,0,-1 + if (d(nd) /= 0.d0) exit + enddo + + endif + end recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim) @@ -1155,7 +1438,7 @@ recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim) integer :: nx, ix,ny double precision :: X(0:max_dim),Y(0:max_dim) !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: X, Y - integer :: i + integer :: i, ib select case (c) case (0) @@ -1185,9 +1468,46 @@ recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim) Y(1) = D_00(1) Y(2) = D_00(2) -! call multiply_poly(Y,ny,D_00,2,d,nd) - !DIR$ FORCEINLINE - call multiply_poly_c2_inline_2e(Y,ny,D_00,d,nd) +! !DIR$ FORCEINLINE +! call multiply_poly_c2_inline_2e(Y,ny,D_00,d,nd) + if(ny >= 0) then + + select case (ny) + case (0) + d(0) = d(0) + D_00(0) * Y(0) + d(1) = d(1) + D_00(1) * Y(0) + d(2) = d(2) + D_00(2) * Y(0) + + case (1) + d(0) = d(0) + D_00(0) * Y(0) + d(1) = d(1) + D_00(0) * Y(1) + D_00(1) * Y(0) + d(2) = d(2) + D_00(1) * Y(1) + D_00(2) * Y(0) + d(3) = d(3) + D_00(2) * Y(1) + + case (2) + d(0) = d(0) + D_00(0) * Y(0) + d(1) = d(1) + D_00(0) * Y(1) + D_00(1) * Y(0) + d(2) = d(2) + D_00(0) * Y(2) + D_00(1) * Y(1) + D_00(2) * Y(0) + d(3) = d(3) + D_00(1) * Y(2) + D_00(2) * Y(1) + d(4) = d(4) + D_00(2) * Y(2) + + case default + d(0) = d(0) + D_00(0) * Y(0) + d(1) = d(1) + D_00(0) * Y(1) + D_00(1) * Y(0) + do ib=2,ny + d(ib) = d(ib) + D_00(0) * Y(ib) + D_00(1) * Y(ib-1) + D_00(2) * Y(ib-2) + enddo + d(ny+1) = d(ny+1) + D_00(1) * Y(ny) + D_00(2) * Y(ny-1) + d(ny+2) = d(ny+2) + D_00(2) * Y(ny) + + end select + + do nd = ny+2,0,-1 + if (d(nd) /= 0.d0) exit + enddo + + endif + return @@ -1206,9 +1526,44 @@ recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim) enddo ! !DIR$ FORCEINLINE -! call multiply_poly(X,nx,B_01,2,d,nd) - !DIR$ FORCEINLINE - call multiply_poly_c2_inline_2e(X,nx,B_01,d,nd) +! call multiply_poly_c2_inline_2e(X,nx,B_01,d,nd) + if(nx >= 0) then + + select case (nx) + case (0) + d(0) = d(0) + B_01(0) * X(0) + d(1) = d(1) + B_01(1) * X(0) + d(2) = d(2) + B_01(2) * X(0) + + case (1) + d(0) = d(0) + B_01(0) * X(0) + d(1) = d(1) + B_01(0) * X(1) + B_01(1) * X(0) + d(2) = d(2) + B_01(1) * X(1) + B_01(2) * X(0) + d(3) = d(3) + B_01(2) * X(1) + + case (2) + d(0) = d(0) + B_01(0) * X(0) + d(1) = d(1) + B_01(0) * X(1) + B_01(1) * X(0) + d(2) = d(2) + B_01(0) * X(2) + B_01(1) * X(1) + B_01(2) * X(0) + d(3) = d(3) + B_01(1) * X(2) + B_01(2) * X(1) + d(4) = d(4) + B_01(2) * X(2) + + case default + d(0) = d(0) + B_01(0) * X(0) + d(1) = d(1) + B_01(0) * X(1) + B_01(1) * X(0) + do ib=2,nx + d(ib) = d(ib) + B_01(0) * X(ib) + B_01(1) * X(ib-1) + B_01(2) * X(ib-2) + enddo + d(nx+1) = d(nx+1) + B_01(1) * X(nx) + B_01(2) * X(nx-1) + d(nx+2) = d(nx+2) + B_01(2) * X(nx) + + end select + + do nd = nx+2,0,-1 + if (d(nd) /= 0.d0) exit + enddo + + endif ny = 0 !DIR$ LOOP COUNT(6) @@ -1217,9 +1572,46 @@ recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim) enddo call I_x2_pol_mult(c-1,B_10,B_01,B_00,C_00,D_00,Y,ny,dim) -! call multiply_poly(Y,ny,D_00,2,d,nd) - !DIR$ FORCEINLINE - call multiply_poly_c2_inline_2e(Y,ny,D_00,d,nd) +! !DIR$ FORCEINLINE +! call multiply_poly_c2_inline_2e(Y,ny,D_00,d,nd) + + if(ny >= 0) then + + select case (ny) + case (0) + d(0) = d(0) + D_00(0) * Y(0) + d(1) = d(1) + D_00(1) * Y(0) + d(2) = d(2) + D_00(2) * Y(0) + + case (1) + d(0) = d(0) + D_00(0) * Y(0) + d(1) = d(1) + D_00(0) * Y(1) + D_00(1) * Y(0) + d(2) = d(2) + D_00(1) * Y(1) + D_00(2) * Y(0) + d(3) = d(3) + D_00(2) * Y(1) + + case (2) + d(0) = d(0) + D_00(0) * Y(0) + d(1) = d(1) + D_00(0) * Y(1) + D_00(1) * Y(0) + d(2) = d(2) + D_00(0) * Y(2) + D_00(1) * Y(1) + D_00(2) * Y(0) + d(3) = d(3) + D_00(1) * Y(2) + D_00(2) * Y(1) + d(4) = d(4) + D_00(2) * Y(2) + + case default + d(0) = d(0) + D_00(0) * Y(0) + d(1) = d(1) + D_00(0) * Y(1) + D_00(1) * Y(0) + do ib=2,ny + d(ib) = d(ib) + D_00(0) * Y(ib) + D_00(1) * Y(ib-1) + D_00(2) * Y(ib-2) + enddo + d(ny+1) = d(ny+1) + D_00(1) * Y(ny) + D_00(2) * Y(ny-1) + d(ny+2) = d(ny+2) + D_00(2) * Y(ny) + + end select + + do nd = ny+2,0,-1 + if (d(nd) /= 0.d0) exit + enddo + + endif end select end From 9ce6eb78c84802739286092b1f3860d981bac361 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Mon, 10 Jul 2023 12:40:56 +0200 Subject: [PATCH 24/40] Cholesky in CCSD --- src/ccsd/ccsd_space_orb_sub.irp.f | 99 ++++++++++++-- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 173 +++++++++++++------------ 2 files changed, 178 insertions(+), 94 deletions(-) diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 76c9351e..04b7e955 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -9,7 +9,7 @@ subroutine run_ccsd_space_orb double precision :: uncorr_energy,energy, max_elem, max_r, max_r1, max_r2,ta,tb logical :: not_converged - double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:), tau(:,:,:,:) + double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:), tau(:,:,:,:), tau_x(:,:,:,:) double precision, allocatable :: t1(:,:), r1(:,:) double precision, allocatable :: H_oo(:,:), H_vv(:,:), H_vo(:,:) @@ -18,8 +18,6 @@ subroutine run_ccsd_space_orb integer(bit_kind) :: det(N_int,2) integer :: nO, nV, nOa, nVa -! PROVIDE mo_two_e_integrals_in_map - det = psi_det(:,:,cc_ref) print*,'Reference determinant:' call print_det(det,N_int) @@ -46,6 +44,7 @@ subroutine run_ccsd_space_orb allocate(t2(nO,nO,nV,nV), r2(nO,nO,nV,nV)) allocate(tau(nO,nO,nV,nV)) + allocate(tau_x(nO,nO,nV,nV)) allocate(t1(nO,nV), r1(nO,nV)) allocate(H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO)) @@ -67,10 +66,11 @@ subroutine run_ccsd_space_orb call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,t1) call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,t2) call update_tau_space(nO,nV,t1,t2,tau) + call update_tau_x_space(nO,nV,tau,tau_x) !print*,'hf_energy', hf_energy call det_energy(det,uncorr_energy) print*,'Det energy', uncorr_energy - call ccsd_energy_space(nO,nV,tau,t1,energy) + call ccsd_energy_space_x(nO,nV,tau_x,t1,energy) print*,'Guess energy', uncorr_energy+energy, energy nb_iter = 0 @@ -86,11 +86,11 @@ subroutine run_ccsd_space_orb do while (not_converged) ! Residue -! if (do_ao_cholesky) then - if (.False.) then - call compute_H_oo_chol(nO,nV,t1,t2,tau,H_oo) - call compute_H_vv_chol(nO,nV,t1,t2,tau,H_vv) - call compute_H_vo_chol(nO,nV,t1,t2,H_vo) + if (do_ao_cholesky) then +! if (.False.) then + call compute_H_oo_chol(nO,nV,tau_x,H_oo) + call compute_H_vv_chol(nO,nV,tau_x,H_vv) + call compute_H_vo_chol(nO,nV,t1,H_vo) call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) @@ -119,9 +119,10 @@ subroutine run_ccsd_space_orb endif call update_tau_space(nO,nV,t1,t2,tau) + call update_tau_x_space(nO,nV,tau,tau_x) ! Energy - call ccsd_energy_space(nO,nV,tau,t1,energy) + call ccsd_energy_space_x(nO,nV,tau_x,t1,energy) write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |' nb_iter = nb_iter + 1 @@ -249,6 +250,51 @@ subroutine ccsd_energy_space(nO,nV,tau,t1,energy) end +subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy) + + implicit none + + integer, intent(in) :: nO, nV + double precision, intent(in) :: tau_x(nO,nO,nV,nV) + double precision, intent(in) :: t1(nO,nV) + double precision, intent(out) :: energy + + ! internal + integer :: i,j,a,b + double precision :: e + + energy = 0d0 + !$omp parallel & + !$omp shared(nO,nV,energy,tau_x,t1,& + !$omp cc_space_f_vo,cc_space_v_oovv) & + !$omp private(i,j,a,b,e) & + !$omp default(none) + e = 0d0 + !$omp do + do a = 1, nV + do i = 1, nO + e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a) + enddo + enddo + !$omp end do nowait + !$omp do + do b = 1, nV + do a = 1, nV + do j = 1, nO + do i = 1, nO + e = e + tau_x(i,j,a,b) * cc_space_v_oovv(i,j,a,b) + enddo + enddo + enddo + enddo + !$omp end do nowait + !$omp critical + energy = energy + e + !$omp end critical + !$omp end parallel + +end + ! Tau subroutine update_tau_space(nO,nV,t1,t2,tau) @@ -284,6 +330,39 @@ subroutine update_tau_space(nO,nV,t1,t2,tau) end +subroutine update_tau_x_space(nO,nV,tau,tau_x) + + implicit none + + ! in + integer, intent(in) :: nO, nV + double precision, intent(in) :: tau(nO,nO,nV,nV) + + ! out + double precision, intent(out) :: tau_x(nO,nO,nV,nV) + + ! internal + integer :: i,j,a,b + + !$OMP PARALLEL & + !$OMP SHARED(nO,nV,tau,tau_x) & + !$OMP PRIVATE(i,j,a,b) & + !$OMP DEFAULT(NONE) + !$OMP DO + do b = 1, nV + do a = 1, nV + do j = 1, nO + do i = 1, nO + tau_x(i,j,a,b) = 2.d0*tau(i,j,a,b) - tau(i,j,b,a) + enddo + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + +end + ! R1 subroutine compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index 190c163b..0b9e123e 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -276,64 +276,88 @@ end ! H_oo -subroutine compute_H_oo_chol(nO,nV,t1,t2,tau,H_oo) +subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo) implicit none integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(in) :: tau(nO, nO, nV, nV) + double precision, intent(in) :: tau_x(nO, nO, nV, nV) double precision, intent(out) :: H_oo(nO, nO) - integer :: a,tmp_a,k,b,l,c,d,tmp_c,tmp_d,i,j,u + integer :: a,b,i,j,u,k - ! H_oo(u,i) = cc_space_f_oo(u,i) + double precision, allocatable :: tau_kau(:,:,:), tmp_vov(:,:,:) + + allocate(tau_kau(cholesky_ao_num,nV,nO)) !$omp parallel & - !$omp shared(nO,H_oo,cc_space_f_oo) & - !$omp private(i,u) & - !$omp default(none) + !$omp default(shared) & + !$omp private(i,u,j,k,a,b,tmp_vov) + allocate(tmp_vov(nV,nO,nV) ) + !$omp do + do u = 1, nO + do b=1,nV + do j=1,nO + do a=1,nV + tmp_vov(a,j,b) = tau_x(u,j,a,b) + enddo + enddo + enddo + call dgemm('N','T',cholesky_ao_num,nV,nO*nV,1.d0, & + cc_space_v_ov_chol, cholesky_ao_num, tmp_vov, nV, & + 0.d0, tau_kau(1,1,u), cholesky_ao_num) + enddo + !$omp end do nowait + deallocate(tmp_vov) !$omp do do i = 1, nO do u = 1, nO H_oo(u,i) = cc_space_f_oo(u,i) enddo enddo - !$omp end do - !$omp end parallel - - ! H_oo(u,i) += cc_space_w_oovv(i,j,a,b) * tau(u,j,a,b) - ! H_oo(u,i) += tau(u,j,a,b) * cc_space_w_oovv(i,j,a,b) - call dgemm('N','T', nO, nO, nO*nV*nV, & - 1d0, tau , size(tau,1), & - cc_space_w_oovv, size(cc_space_w_oovv,1), & - 1d0, H_oo , size(H_oo,1)) + !$omp end do nowait + !$omp barrier + !$omp end parallel + call dgemm('T', 'N', nO, nO, cholesky_ao_num*nV, 1.d0, & + tau_kau, cholesky_ao_num*nV, cc_space_v_vo_chol, cholesky_ao_num*nV, & + 1.d0, H_oo, nO) end ! H_vv -subroutine compute_H_vv_chol(nO,nV,t1,t2,tau,H_vv) +subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv) implicit none integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(in) :: tau(nO, nO, nV, nV) + double precision, intent(in) :: tau_x(nO, nO, nV, nV) double precision, intent(out) :: H_vv(nV, nV) - integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u, beta + integer :: a,b,i,j,u,k, beta - double precision, allocatable :: tmp_tau(:,:,:,:) + double precision, allocatable :: tau_kia(:,:,:), tmp_oov(:,:,:) - allocate(tmp_tau(nV,nO,nO,nV)) - - ! H_vv(a,beta) = cc_space_f_vv(a,beta) + allocate(tau_kia(cholesky_ao_num,nO,nV)) !$omp parallel & - !$omp shared(nV,nO,H_vv,cc_space_f_vv,tmp_tau,tau) & - !$omp private(a,beta,i,j,b) & - !$omp default(none) + !$omp default(shared) & + !$omp private(i,beta,j,k,a,b,tmp_oov) + allocate(tmp_oov(nO,nO,nV) ) + !$omp do + do a = 1, nV + do b=1,nV + do j=1,nO + do i=1,nO + tmp_oov(i,j,b) = tau_x(i,j,a,b) + enddo + enddo + enddo + call dgemm('N','T',cholesky_ao_num,nO,nO*nV,1.d0, & + cc_space_v_ov_chol, cholesky_ao_num, tmp_oov, nO, & + 0.d0, tau_kia(1,1,a), cholesky_ao_num) + enddo + !$omp end do nowait + deallocate(tmp_oov) + !$omp do do beta = 1, nV do a = 1, nV @@ -341,83 +365,64 @@ subroutine compute_H_vv_chol(nO,nV,t1,t2,tau,H_vv) enddo enddo !$omp end do nowait - - !$omp do - do beta = 1, nV - do j = 1, nO - do i = 1, nO - do b = 1, nV - tmp_tau(b,i,j,beta) = tau(i,j,beta,b) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - call dgemm('N','N',nV,nV,nO*nO*nV, & - -1d0, cc_space_w_vvoo, size(cc_space_w_vvoo,1), & - tmp_tau , size(tmp_tau,1) * size(tmp_tau,2) * size(tmp_tau,3), & - 1d0, H_vv , size(H_vv,1)) - - deallocate(tmp_tau) + !$omp barrier + !$omp end parallel + call dgemm('T', 'N', nV, nV, cholesky_ao_num*nO, -1.d0, & + tau_kia, cholesky_ao_num*nO, cc_space_v_ov_chol, cholesky_ao_num*nO, & + 1.d0, H_vv, nV) end ! H_vo - -subroutine compute_H_vo_chol(nO,nV,t1,t2,H_vo) +subroutine compute_H_vo_chol(nO,nV,t1,H_vo) implicit none integer, intent(in) :: nO,nV double precision, intent(in) :: t1(nO, nV) - double precision, intent(in) :: t2(nO, nO, nV, nV) double precision, intent(out) :: H_vo(nV, nO) - integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u, beta + integer :: a,b,i,j,u,k - double precision, allocatable :: w(:,:,:,:) - - allocate(w(nV,nO,nO,nV)) - - !$omp parallel & - !$omp shared(nV,nO,H_vo,cc_space_f_vo,w,cc_space_w_vvoo,t1) & - !$omp private(a,beta,i,j,b) & - !$omp default(none) - !$omp do - do i = 1, nO - do a = 1, nV + double precision, allocatable :: tmp_k(:), tmp(:,:,:), tmp2(:,:,:) + do i=1,nO + do a=1,nV H_vo(a,i) = cc_space_f_vo(a,i) enddo enddo - !$omp end do nowait - ! H_vo(a,i) = H_vo(a,i) + cc_space_w_vvoo(a,b,i,j) * t1(j,b) - ! H_vo(a,i) = H_vo(a,i) + w(a,i,j,b) * t1(j,b) + allocate(tmp_k(cholesky_ao_num)) + call dgemm('N', 'N', cholesky_ao_num, 1, nO*nV, 2.d0, & + cc_space_v_ov_chol, cholesky_ao_num, & + t1, nO*nV, 0.d0, tmp_k, cholesky_ao_num) - !$omp do - do b = 1, nV - do j = 1, nO - do i = 1, nO - do a = 1, nV - w(a,i,j,b) = cc_space_w_vvoo(a,b,i,j) - enddo + call dgemm('T','N',nV*nO,1,cholesky_ao_num,1.d0, & + cc_space_v_vo_chol, cholesky_ao_num, tmp_k, cholesky_ao_num, 1.d0, & + H_vo, nV*nO) + deallocate(tmp_k) + + allocate(tmp(cholesky_ao_num,nO,nO)) + allocate(tmp2(cholesky_ao_num,nO,nO)) + + call dgemm('N','T', cholesky_ao_num*nO, nO, nV, 1.d0, & + cc_space_v_ov_chol, cholesky_ao_num*nO, t1, nO, 0.d0, tmp, cholesky_ao_num*nO) + + do i=1,nO + do j=1,nO + do k=1,cholesky_ao_num + tmp2(k,j,i) = tmp(k,i,j) enddo enddo enddo - !$omp end do - !$omp end parallel + deallocate(tmp) - call dgemv('N',nV*nO, nO*nV, & - 1d0, w , size(w,1) * size(w,2), & - t1 , 1, & - 1d0, H_vo, 1) - - deallocate(w) + call dgemm('T','N', nV, nO, cholesky_ao_num*nO, -1.d0, & + cc_space_v_ov_chol, cholesky_ao_num*nO, tmp2, cholesky_ao_num*nO, & + 1.d0, H_vo, nV) end + ! R2 subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) @@ -1015,7 +1020,7 @@ subroutine compute_B1_chol(nO,nV,t1,B1,ldb) - cc_space_v_vvvo(a,b,beta,i) * t1(i,gam) & - cc_space_v_vvov(a,b,i,gam) * t1(i,beta) enddo - + enddo enddo enddo From 44956060e7321b8bb76a0b13e83ae254039a8fa1 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Mon, 10 Jul 2023 17:06:34 +0200 Subject: [PATCH 25/40] Removed vvv arrays --- src/ccsd/ccsd_space_orb_sub.irp.f | 52 +++-- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 310 ++++++++++++++----------- 2 files changed, 209 insertions(+), 153 deletions(-) diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 04b7e955..35e14313 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -92,7 +92,7 @@ subroutine run_ccsd_space_orb call compute_H_vv_chol(nO,nV,tau_x,H_vv) call compute_H_vo_chol(nO,nV,t1,H_vo) - call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) + call compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) else call compute_H_oo(nO,nV,t1,t2,tau,H_oo) @@ -538,25 +538,16 @@ subroutine compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) ! enddo ! enddo !enddo + + integer :: iblock, block_size, nVmax double precision, allocatable :: W_vvov(:,:,:,:), T_vvoo(:,:,:,:) - allocate(W_vvov(nV,nV,nO,nV), T_vvoo(nV,nV,nO,nO)) + block_size = 8 + allocate(W_vvov(nV,nV,nO,block_size), T_vvoo(nV,nV,nO,nO)) !$omp parallel & !$omp shared(nO,nV,cc_space_v_vvov,W_vvov,T_vvoo,tau) & !$omp private(b,beta,i,a) & !$omp default(none) - !$omp do - do beta = 1, nV - do i = 1, nO - do b = 1, nV - do a = 1, nV - W_vvov(a,b,i,beta) = 2d0 * cc_space_v_vvov(a,b,i,beta) - cc_space_v_vvov(b,a,i,beta) - enddo - enddo - enddo - enddo - !$omp end do nowait - !$omp do do u = 1, nO do i = 1, nO @@ -570,13 +561,35 @@ subroutine compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) !$omp end do nowait !$omp end parallel - call dgemm('T','N',nO,nV,nO*nV*nV, & - 1d0, T_vvoo, size(T_vvoo,1) * size(T_vvoo,2) * size(T_vvoo,3), & - W_vvov, size(W_vvov,1) * size(W_vvov,2) * size(W_vvov,3), & - 1d0, r1 , size(r1,1)) + do iblock = 1, nV, block_size + nVmax = min(block_size,nV-iblock+1) + !$omp parallel & + !$omp shared(nO,nV,cc_space_v_vvov,W_vvov,T_vvoo,tau,nVmax,iblock) & + !$omp private(b,i,a,beta) & + !$omp default(none) + !$omp do collapse(2) + do beta = iblock, iblock + nVmax - 1 + do i = 1, nO + do b = 1, nV + do a = 1, nV + W_vvov(a,b,i,beta-iblock+1) = 2d0 * cc_space_v_vvov(a,b,i,beta) - cc_space_v_vvov(b,a,i,beta) + enddo + enddo + enddo + enddo + !$omp end do nowait + !$omp end parallel + + call dgemm('T','N',nO,nVmax,nO*nV*nV, & + 1d0, T_vvoo, nV*nV*nO, & + W_vvov, nO*nV*nV, & + 1d0, r1(1,iblock), nO) + enddo deallocate(W_vvov,T_vvoo) + + ! r1(u,beta) = r1(u,beta) - (2d0 * cc_space_v_vooo(a,u,i,j) - cc_space_v_vooo(a,u,j,i)) * tau(i,j,a,beta) ! r1(u,beta) = r1(u,beta) - W(i,j,a,u) * tau(i,j,a,beta) !do beta = 1, nV @@ -1640,11 +1653,12 @@ subroutine compute_B1_gam(nO,nV,t1,t2,B1,gam) ! enddo double precision, allocatable :: X_vvvo(:,:,:), Y_vvvv(:,:,:) + allocate(X_vvvo(nV,nV,nO), Y_vvvv(nV,nV,nV)) ! ! B1(a,b,beta,gam) = cc_space_v_vvvv(a,b,beta,gam) call gen_v_space(cc_nVa,cc_nVa,cc_nVa,1, & - cc_list_vir,cc_list_vir,cc_list_vir,(/ cc_list_vir(gam) /), B1) + cc_list_vir,cc_list_vir,cc_list_vir,cc_list_vir(gam), B1) !$omp parallel & diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index 0b9e123e..50f5f603 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -185,25 +185,15 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) deallocate(X_ovov) + integer :: iblock, block_size, nVmax double precision, allocatable :: W_vvov(:,:,:,:), T_vvoo(:,:,:,:) - allocate(W_vvov(nV,nV,nO,nV), T_vvoo(nV,nV,nO,nO)) + block_size = 8 + allocate(W_vvov(nV,nV,nO,block_size), T_vvoo(nV,nV,nO,nO)) !$omp parallel & !$omp shared(nO,nV,cc_space_v_vvov,W_vvov,T_vvoo,tau) & !$omp private(b,beta,i,a) & !$omp default(none) - !$omp do - do beta = 1, nV - do i = 1, nO - do b = 1, nV - do a = 1, nV - W_vvov(a,b,i,beta) = 2d0 * cc_space_v_vvov(a,b,i,beta) - cc_space_v_vvov(b,a,i,beta) - enddo - enddo - enddo - enddo - !$omp end do nowait - !$omp do do u = 1, nO do i = 1, nO @@ -217,10 +207,30 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) !$omp end do nowait !$omp end parallel - call dgemm('T','N',nO,nV,nO*nV*nV, & - 1d0, T_vvoo, size(T_vvoo,1) * size(T_vvoo,2) * size(T_vvoo,3), & - W_vvov, size(W_vvov,1) * size(W_vvov,2) * size(W_vvov,3), & - 1d0, r1 , size(r1,1)) + do iblock = 1, nV, block_size + nVmax = min(block_size,nV-iblock+1) + !$omp parallel & + !$omp shared(nO,nV,cc_space_v_vvov,W_vvov,T_vvoo,tau,nVmax,iblock) & + !$omp private(b,i,a,beta) & + !$omp default(none) + !$omp do collapse(2) + do beta = iblock, iblock + nVmax - 1 + do i = 1, nO + do b = 1, nV + do a = 1, nV + W_vvov(a,b,i,beta-iblock+1) = 2d0 * cc_space_v_vvov(a,b,i,beta) - cc_space_v_vvov(b,a,i,beta) + enddo + enddo + enddo + enddo + !$omp end do nowait + !$omp end parallel + + call dgemm('T','N',nO,nVmax,nO*nV*nV, & + 1d0, T_vvoo, nV*nV*nO, & + W_vvov, nO*nV*nV, & + 1d0, r1(1,iblock), nO) + enddo deallocate(W_vvov,T_vvoo) @@ -450,7 +460,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) call compute_g_vir_chol(nO,nV,t1,t2,H_vv,g_vir) call compute_A1_chol(nO,nV,t1,t2,tau,A1) call compute_J1_chol(nO,nV,t1,t2,cc_space_v_ovvo,cc_space_v_ovoo, & - cc_space_v_vvvo,cc_space_v_vvoo,J1) + cc_space_v_vvoo,J1) call compute_K1_chol(nO,nV,t1,t2,cc_space_v_ovoo,cc_space_v_vvoo, & cc_space_v_ovov,cc_space_v_vvov,K1) @@ -479,15 +489,54 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) tau, size(tau,1) * size(tau,2), & 1d0, r2, size(r2,1) * size(r2,2)) - double precision, dimension(:,:,:,:), allocatable :: r2_chem, tmp, tau_chem - double precision, dimension(:,:,:,:), allocatable :: B1 + integer :: block_size, iblock, k + block_size = 16 + double precision, dimension(:,:,:), allocatable :: B1, tmp_cc, tmpB1 - allocate(B1(nV,nV,nV,nV)) - call compute_B1_chol(nO,nV,t1,B1,cholesky_ao_num) - call dgemm('N','N',nO*nO,nV*nV,nV*nV, & + allocate(tmp_cc(cholesky_ao_num,nV,nV)) + call dgemm('N','N', cholesky_ao_num*nV, nV, nO, 1.d0, & + cc_space_v_vo_chol, cholesky_ao_num*nV, t1, nO, 0.d0, tmp_cc, cholesky_ao_num*nV) + + !$OMP PARALLEL PRIVATE(gam, iblock, B1, tmpB1, beta, b, a) + allocate(B1(nV,nV,block_size), tmpB1(nV,block_size,nV)) + !$OMP DO + do gam = 1, nV + do iblock = 1, nV, block_size + call dgemm('T', 'N', nV*min(block_size, nV-iblock+1), nV, cholesky_ao_num, & + -1.d0, cc_space_v_vv_chol(1,1,iblock), cholesky_ao_num, & + tmp_cc(1,1,gam), cholesky_ao_num, 0.d0, tmpB1, nV*block_size) + + call dgemm('T', 'N', nV*min(block_size, nV-iblock+1), nV, cholesky_ao_num, & + -1.d0, tmp_cc(1,1,iblock), cholesky_ao_num, & + cc_space_v_vv_chol(1,1,gam), cholesky_ao_num, 1.d0, tmpB1, nV*block_size) + + call dgemm('T','N', nV*min(block_size, nV-iblock+1), nV, cholesky_ao_num, 1.d0, & + cc_space_v_vv_chol(1,1,iblock), cholesky_ao_num, & + cc_space_v_vv_chol(1,1,gam), cholesky_ao_num, 1.d0, & + tmpB1, nV*block_size) + + do beta = iblock, min(nV, iblock+block_size-1) + do b = 1, nV + do a = 1, nV + B1(a,b,beta-iblock+1) = tmpB1(a,beta-iblock+1,b) + enddo + enddo + enddo + + call dgemm('N','N',nO*nO,min(block_size, nV-iblock+1),nV*nV, & 1d0, tau, size(tau,1) * size(tau,2), & B1 , size(B1 ,1) * size(B1 ,2), & - 1d0, r2, size(r2 ,1) * size(r2 ,2)) + 1d0, r2(1,1,iblock,gam), size(r2 ,1) * size(r2 ,2)) + enddo + + enddo + !$OMP ENDDO + + deallocate(B1, tmpB1) + !$OMP END PARALLEL + + deallocate(tmp_cc) + double precision, allocatable :: X_oovv(:,:,:,:),Y_oovv(:,:,:,:) allocate(X_oovv(nO,nO,nV,nV),Y_oovv(nO,nO,nV,nV)) @@ -556,29 +605,21 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) deallocate(X_oovv) double precision, allocatable :: X_vovv(:,:,:,:) - allocate(X_vovv(nV,nO,nV,nV)) - !$omp parallel & - !$omp shared(nO,nV,X_vovv,cc_space_v_ovvv) & - !$omp private(u,a,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do u = 1, nO - do a = 1, nV - X_vovv(a,u,beta,gam) = cc_space_v_ovvv(u,a,beta,gam) - enddo - enddo + allocate(X_vovv(nV,nO,nV,block_size)) + do iblock = 1, nV, block_size + do gam = iblock, min(nV, iblock+block_size-1) + call dgemm('T','N',nV, nO*nV, cholesky_ao_num, 1.d0, & + cc_space_v_vv_chol(1,1,gam), cholesky_ao_num, cc_space_v_ov_chol, & + cholesky_ao_num, 0.d0, X_vovv(1,1,1,gam-iblock+1), nV) + enddo - enddo - !$omp end do - !$omp end parallel - - call dgemm('N','N',nO,nO*nV*nV,nV, & + call dgemm('N','N',nO,nO*nV*min(block_size, nV-iblock+1),nV, & 1d0, t1 , size(t1,1), & X_vovv, size(X_vovv,1), & - 0d0, Y_oovv, size(Y_oovv,1)) + 0d0, Y_oovv(1,1,1,iblock), size(Y_oovv,1)) + + enddo !$omp parallel & !$omp shared(nO,nV,r2,Y_oovv) & @@ -597,38 +638,27 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) !$omp end do !$omp end parallel - double precision, allocatable :: X_vovo(:,:,:,:), Y_vovv(:,:,:,:) - allocate(X_vovo(nV,nO,nV,nO), Y_vovv(nV,nO,nV,nV),X_oovv(nO,nO,nV,nV)) + double precision, allocatable :: X_ovvo(:,:,:,:) + double precision, allocatable :: tcc(:,:,:), tcc2(:,:,:) + allocate(tcc2(cholesky_ao_num,nV,nO), X_ovvo(nO,nV,nV,nO)) + allocate(tcc(cholesky_ao_num,nO,nV)) + + call dgemm('N','T', cholesky_ao_num*nV, nO, nV, 1.d0, & + cc_space_v_vv_chol, cholesky_ao_num*nV, t1, nO, & + 0.d0, tcc2, cholesky_ao_num*nV) + + call dgemm('N','N', cholesky_ao_num*nO, nV, nO, 1.d0, & + cc_space_v_oo_chol, cholesky_ao_num*nO, t1, nO, & + 0.d0, tcc, cholesky_ao_num*nO) + + call dgemm('T','N', nO*nV, nV*nO, cholesky_ao_num, 1.d0, & + tcc, cholesky_ao_num, tcc2, cholesky_ao_num, 0.d0, & + X_ovvo, nO*nV) + + deallocate(tcc, tcc2) !$omp parallel & - !$omp shared(nO,nV,X_vovo,cc_space_v_ovov) & - !$omp private(u,v,gam,i) & - !$omp default(none) - do i = 1, nO - !$omp do - do gam = 1, nV - do u = 1, nO - do a = 1, nV - X_vovo(a,u,gam,i) = cc_space_v_ovov(u,a,i,gam) - enddo - enddo - enddo - !$omp end do nowait - enddo - !$omp end parallel - - call dgemm('N','N',nV*nO*nV,nV,nO, & - 1d0, X_vovo, size(X_vovo,1) * size(X_vovo,2) * size(X_vovo,3), & - t1 , size(t1,1), & - 0d0, Y_vovv, size(Y_vovv,1) * size(Y_vovv,2) * size(Y_vovv,3)) - - call dgemm('N','N',nO,nO*nV*nV,nV, & - 1d0, t1, size(t1,1), & - Y_vovv, size(Y_vovv,1), & - 0d0, X_oovv, size(X_oovv,1)) - - !$omp parallel & - !$omp shared(nO,nV,r2,X_oovv) & + !$omp shared(nO,nV,r2,X_ovvo) & !$omp private(u,v,gam,beta) & !$omp default(none) !$omp do @@ -636,7 +666,18 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do beta = 1, nV do v = 1, nO do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(v,u,gam,beta) - X_oovv(u,v,beta,gam) + r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_ovvo(u,beta,gam,v) + enddo + enddo + enddo + enddo + !$omp end do + !$omp do + do beta = 1, nV + do gam = 1, nV + do v = 1, nO + do u = 1, nO + r2(v,u,gam,beta) = r2(v,u,gam,beta) - X_ovvo(u,beta,gam,v) enddo enddo enddo @@ -644,7 +685,9 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) !$omp end do !$omp end parallel - deallocate(X_vovo,Y_vovv) + deallocate(X_ovvo) + !----- + allocate(X_oovv(nO,nO,nV,nV)) call dgemm('N','N',nO*nO*nV,nV,nO, & 1d0, cc_space_v_oovo, size(cc_space_v_oovo,1) * size(cc_space_v_oovo,2) * size(cc_space_v_oovo,3), & @@ -668,7 +711,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) !$omp end do !$omp end parallel - double precision, allocatable :: Y_oovo(:,:,:,:) + double precision, allocatable :: X_vovo(:,:,:,:), Y_oovo(:,:,:,:) allocate(X_vovo(nV,nO,nV,nO), Y_oovo(nO,nO,nV,nO)) !$omp parallel & @@ -717,7 +760,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) deallocate(X_vovo,Y_oovo) - double precision, allocatable :: X_ovvo(:,:,:,:), Y_voov(:,:,:,:), Z_ovov(:,:,:,:) + double precision, allocatable :: Y_voov(:,:,:,:), Z_ovov(:,:,:,:) allocate(X_ovvo(nO,nV,nV,nO), Y_voov(nV,nO,nO,nV),Z_ovov(nO,nV,nO,nV)) !$omp parallel & !$omp shared(nO,nV,X_ovvo,Y_voov,K1,J1,t2) & @@ -772,8 +815,9 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) deallocate(X_ovvo,Y_voov) - double precision, allocatable :: X_ovov(:,:,:,:),Y_ovov(:,:,:,:) + double precision, allocatable :: Y_ovov(:,:,:,:), X_ovov(:,:,:,:) allocate(X_ovov(nO,nV,nO,nV),Y_ovov(nO,nV,nO,nV)) + !$omp parallel & !$omp shared(nO,nV,r2,K1,X_ovov,Y_ovov,t2) & !$omp private(u,a,i,beta,gam) & @@ -998,36 +1042,6 @@ subroutine compute_A1_chol(nO,nV,t1,t2,tau,A1) end -! B1 -subroutine compute_B1_chol(nO,nV,t1,B1,ldb) - - implicit none - - integer, intent(in) :: nO,nV,ldb - double precision, intent(in) :: t1(nO, nV) - double precision, intent(out) :: B1(nV, nV, nV, nV) - - integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam - - do gam = 1, nV - do beta = 1, nV - do b = 1, nV - do a = 1, nV - B1(a,b,beta,gam) = cc_space_v_vvvv(a,b,beta,gam) - - do i = 1, nO - B1(a,b,beta,gam) = B1(a,b,beta,gam) & - - cc_space_v_vvvo(a,b,beta,i) * t1(i,gam) & - - cc_space_v_vvov(a,b,i,gam) * t1(i,beta) - enddo - - enddo - enddo - enddo - enddo - -end - ! g_occ subroutine compute_g_occ_chol(nO,nV,t1,t2,H_oo,g_occ) @@ -1091,44 +1105,52 @@ subroutine compute_g_vir_chol(nO,nV,t1,t2,H_vv,g_vir) t1 , size(t1,1), & 0d0, g_vir, size(g_vir,1)) - !$omp parallel & - !$omp shared(nO,nV,g_vir,H_vv, cc_space_v_vvvo,t1) & - !$omp private(i,b,a,beta) & - !$omp default(none) - !$omp do + double precision, allocatable :: tmp_k(:), tmp_vo(:,:,:), tmp_vo2(:,:,:) + allocate(tmp_k(cholesky_ao_num)) + call dgemm('N','N', cholesky_ao_num, 1, nO*nV, 1.d0, & + cc_space_v_ov_chol, cholesky_ao_num, t1, nO*nV, 0.d0, tmp_k, cholesky_ao_num) + + call dgemm('T','N', nV*nV, 1, cholesky_ao_num, 2.d0, & + cc_space_v_vv_chol, cholesky_ao_num, tmp_k, cholesky_ao_num, 1.d0, & + g_vir, nV*nV) + deallocate(tmp_k) + + allocate(tmp_vo(cholesky_ao_num,nV,nO)) + call dgemm('N','T',cholesky_ao_num*nV, nO, nV, 1.d0, & + cc_space_v_vv_chol, cholesky_ao_num*nV, t1, nO, 0.d0, tmp_vo, cholesky_ao_num*nV) + + allocate(tmp_vo2(cholesky_ao_num,nO,nV)) + do beta=1,nV + do i=1,nO + do k=1,cholesky_ao_num + tmp_vo2(k,i,beta) = -tmp_vo(k,beta,i) + enddo + enddo + enddo + deallocate(tmp_vo) + do beta = 1, nV do a = 1, nV g_vir(a,beta) = g_vir(a,beta) + H_vv(a,beta) enddo enddo - !$omp end do - !$omp do - do beta = 1, nV - do i = 1, nO - do b = 1, nV - do a = 1, nV - g_vir(a,beta) = g_vir(a,beta) + (2d0 * cc_space_v_vvvo(a,b,beta,i) - cc_space_v_vvvo(b,a,beta,i)) * t1(i,b) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel + call dgemm('T','N', nV, nV, nO*cholesky_ao_num, 1.d0, & + cc_space_v_ov_chol, cholesky_ao_num*nO, & + tmp_vo2, cholesky_ao_num*nO, 1.d0, g_vir, nV) end ! J1 -subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1) - +subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvoo,J1) implicit none integer, intent(in) :: nO,nV double precision, intent(in) :: t1(nO, nV) double precision, intent(in) :: t2(nO, nO, nV, nV) double precision, intent(in) :: v_ovvo(nO,nV,nV,nO), v_ovoo(nO,nV,nO,nO) - double precision, intent(in) :: v_vvvo(nV,nV,nV,nO), v_vvoo(nV,nV,nO,nO) + double precision, intent(in) :: v_vvoo(nV,nV,nO,nO) double precision, intent(out) :: J1(nO, nV, nV, nO) integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam @@ -1188,11 +1210,31 @@ subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1) !$omp end parallel deallocate(X_ovoo) - ! v_vvvo(b,a,beta,i) * t1(u,b) - call dgemm('N','N',nO,nV*nV*nO,nV, & - 1d0, t1 , size(t1,1), & - v_vvvo, size(v_vvvo,1), & - 1d0, J1 , size(J1,1)) + double precision, allocatable :: tmp_cc(:,:,:), J1_tmp(:,:,:,:) + allocate(tmp_cc(cholesky_ao_num,nV,nO), J1_tmp(nV,nO,nV,nO)) + + call dgemm('N','T', cholesky_ao_num*nV, nO, nV, 1.d0, & + cc_space_v_vv_chol, cholesky_ao_num*nV, & + t1, nO, & + 0.d0, tmp_cc, cholesky_ao_num*nV) + + call dgemm('T','N', nV*nO, nV*nO, cholesky_ao_num, 1.d0, & + tmp_cc, cholesky_ao_num, cc_space_v_vo_chol, cholesky_ao_num, & + 0.d0, J1_tmp, nV*nO) + + deallocate(tmp_cc) + + do i=1,nO + do b=1,nV + do a=1,nV + do u=1,nO + J1(u,a,b,i) = J1(u,a,b,i) + J1_tmp(b,u,a,i) + enddo + enddo + enddo + enddo + + deallocate(J1_tmp) !- cc_space_v_vvoo(a,b,i,j) * (0.5d0 * t2(u,j,b,beta) + t1(u,b) * t1(j,beta)) & double precision, allocatable :: X_voov(:,:,:,:), Z_ovvo(:,:,:,:) From b4a2e9bd7648bb11f3dc379cd33673fe74a65102 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Mon, 10 Jul 2023 23:32:05 +0200 Subject: [PATCH 26/40] Fixed cholesky for tiny thresholds --- src/ao_two_e_ints/cholesky.irp.f | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f index 4702c850..128aa483 100644 --- a/src/ao_two_e_ints/cholesky.irp.f +++ b/src/ao_two_e_ints/cholesky.irp.f @@ -134,17 +134,14 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] i = 0 ! 5. - do while (Dmax > tau) + do while ( (Dmax > tau).and.(rank < ndim) ) ! a. i = i+1 - logical :: memory_ok - memory_ok = .False. - s = 0.1d0 ! Inrease s until the arrays fit in memory - do + do ! b. Dmin = max(s*Dmax,tau) @@ -153,6 +150,7 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] nq=0 LDmap = 0 DLmap = 0 + Dset_rev = 0 do p=1,np if ( D(Lset(p)) > Dmin ) then nq = nq+1 @@ -180,7 +178,7 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] print *, 'Not enough memory. Reduce cholesky threshold' stop -1 endif - + enddo ! d., e. @@ -197,11 +195,11 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] do k=1,rank L(:,k) = L_old(:,k) enddo - !$OMP END PARALLEL DO + !$OMP END PARALLEL DO deallocate(L_old) - allocate(Delta(np,nq), stat=ierr) + allocate(Delta(np,nq), stat=ierr) if (ierr /= 0) then print *, irp_here, ': allocation failed : (Delta(np,nq))' stop -1 @@ -228,7 +226,7 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] enddo !$OMP ENDDO NOWAIT - !$OMP DO + !$OMP DO do k=1,N do p=1,np Ltmp_p(p,k) = L(Lset(p),k) @@ -364,7 +362,7 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] Ltmp_q(q,iblock) = L(Dset(q), rank) enddo !$OMP END DO - + !$OMP END PARALLEL Qmax = D(Dset(1)) @@ -381,7 +379,7 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num ] deallocate(Ltmp_q, stat=ierr) ! i. - N = N+j + N = rank ! j. Dmax = D(Lset(1)) From 326dbe77408eecd626e57608aeb43d9f5d597114 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Mon, 10 Jul 2023 23:32:43 +0200 Subject: [PATCH 27/40] Removed vvov --- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index 50f5f603..b804792f 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -1409,11 +1409,23 @@ subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,v_vvov,K1) t1 , size(t1,1), & 1d0, K1 , size(K1,1) * size(K1,2) * size(K1,3)) - call dgemm('N','N',nO,nV*nO*nV,nV, & - 1d0, t1 , size(t1,1), & - v_vvov, size(v_vvov,1), & - 1d0, K1 , size(K1,1)) + double precision, allocatable :: K1tmp(:,:,:,:), t1v(:,:,:) + allocate(K1tmp(nO,nO,nV,nV), t1v(cholesky_ao_num,nO,nO)) +! call dgemm('N','N',nO,nV*nO*nV,nV, & +! 1d0, t1 , size(t1,1), & +! v_vvov, size(v_vvov,1), & +! 1d0, K1 , size(K1,1)) + + call dgemm('N','T', cholesky_ao_num*nO, nO, nV, 1.d0, & + cc_space_v_ov_chol, cholesky_ao_num*nO, t1, nO, 0.d0, & + t1v, cholesky_ao_num*nO) + + call dgemm('T','N', nO*nO, nV*nV, cholesky_ao_num, 1.d0, & + t1v, cholesky_ao_num, cc_space_v_vv_chol, cholesky_ao_num, 0.d0, & + K1tmp, nO*nO) + + deallocate(t1v) ! Y(u,beta,b,j) * X(b,j,a,i) = Z(u,beta,a,i) call dgemm('N','N',nV*nO,nO*nV,nV*nO, & 1d0, Y, size(Y,1) * size(Y,2), & @@ -1421,7 +1433,7 @@ subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,v_vvov,K1) 0d0, Z, size(Z,1) * size(Z,2)) !$omp parallel & - !$omp shared(nO,nV,K1,Z) & + !$omp shared(nO,nV,K1,Z,K1tmp) & !$omp private(i,beta,a,u) & !$omp default(none) !$omp do @@ -1429,7 +1441,7 @@ subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,v_vvov,K1) do i = 1, nO do a = 1, nV do u = 1, nO - K1(u,a,i,beta) = K1(u,a,i,beta) + Z(u,beta,a,i) + K1(u,a,i,beta) = K1(u,a,i,beta) + K1tmp(u,i,a,beta) + Z(u,beta,a,i) enddo enddo enddo @@ -1437,6 +1449,6 @@ subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,v_vvov,K1) !$omp end do !$omp end parallel - deallocate(X,Y,Z) + deallocate(K1tmp,X,Y,Z) end From 64ee4eab75165e4fa283cdf03393c7e93d29f66c Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 11 Jul 2023 15:13:01 +0200 Subject: [PATCH 28/40] Removed all vvv in CCSD --- src/ccsd/ccsd_space_orb_sub.irp.f | 4 +- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 43 ++-- src/utils_cc/mo_integrals_cc.irp.f | 323 ++++++++++++++++++++++++- 3 files changed, 335 insertions(+), 35 deletions(-) diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 35e14313..e7b115bb 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -92,7 +92,7 @@ subroutine run_ccsd_space_orb call compute_H_vv_chol(nO,nV,tau_x,H_vv) call compute_H_vo_chol(nO,nV,t1,H_vo) - call compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) + call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) else call compute_H_oo(nO,nV,t1,t2,tau,H_oo) @@ -588,8 +588,6 @@ subroutine compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) deallocate(W_vvov,T_vvoo) - - ! r1(u,beta) = r1(u,beta) - (2d0 * cc_space_v_vooo(a,u,i,j) - cc_space_v_vooo(a,u,j,i)) * tau(i,j,a,beta) ! r1(u,beta) = r1(u,beta) - W(i,j,a,u) * tau(i,j,a,beta) !do beta = 1, nV diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index b804792f..99a4e426 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -186,14 +186,13 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) deallocate(X_ovov) integer :: iblock, block_size, nVmax - double precision, allocatable :: W_vvov(:,:,:,:), T_vvoo(:,:,:,:) - block_size = 8 - allocate(W_vvov(nV,nV,nO,block_size), T_vvoo(nV,nV,nO,nO)) + double precision, allocatable :: W_vvov(:,:,:,:), W_vvov_tmp(:,:,:,:), T_vvoo(:,:,:,:) + block_size = 16 + allocate(W_vvov(nV,nV,nO,block_size), W_vvov_tmp(nV,nO,nV,block_size), T_vvoo(nV,nV,nO,nO)) !$omp parallel & - !$omp shared(nO,nV,cc_space_v_vvov,W_vvov,T_vvoo,tau) & - !$omp private(b,beta,i,a) & - !$omp default(none) + !$omp private(u,i,b,a) & + !$omp default(shared) !$omp do do u = 1, nO do i = 1, nO @@ -204,26 +203,32 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) enddo enddo enddo - !$omp end do nowait + !$omp end do !$omp end parallel do iblock = 1, nV, block_size nVmax = min(block_size,nV-iblock+1) + + call dgemm('T','N', nV*nO, nV*nVmax, cholesky_ao_num, 1.d0, & + cc_space_v_vo_chol , cholesky_ao_num, & + cc_space_v_vv_chol(1,1,iblock), cholesky_ao_num, & + 0.d0, W_vvov_tmp, nV*nO) + !$omp parallel & - !$omp shared(nO,nV,cc_space_v_vvov,W_vvov,T_vvoo,tau,nVmax,iblock) & !$omp private(b,i,a,beta) & - !$omp default(none) - !$omp do collapse(2) - do beta = iblock, iblock + nVmax - 1 + !$omp default(shared) + do beta = 1, nVmax do i = 1, nO + !$omp do do b = 1, nV do a = 1, nV - W_vvov(a,b,i,beta-iblock+1) = 2d0 * cc_space_v_vvov(a,b,i,beta) - cc_space_v_vvov(b,a,i,beta) + W_vvov(a,b,i,beta) = 2d0 * W_vvov_tmp(a,i,b,beta) - W_vvov_tmp(b,i,a,beta) enddo enddo + !$omp end do nowait enddo enddo - !$omp end do nowait + !$omp barrier !$omp end parallel call dgemm('T','N',nO,nVmax,nO*nV*nV, & @@ -234,6 +239,7 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) deallocate(W_vvov,T_vvoo) + double precision, allocatable :: W_oovo(:,:,:,:) allocate(W_oovo(nO,nO,nV,nO)) @@ -462,7 +468,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) call compute_J1_chol(nO,nV,t1,t2,cc_space_v_ovvo,cc_space_v_ovoo, & cc_space_v_vvoo,J1) call compute_K1_chol(nO,nV,t1,t2,cc_space_v_ovoo,cc_space_v_vvoo, & - cc_space_v_ovov,cc_space_v_vvov,K1) + cc_space_v_ovov,K1) ! Residual !r2 = 0d0 @@ -1346,7 +1352,7 @@ end ! K1 -subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,v_vvov,K1) +subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,K1) implicit none @@ -1354,7 +1360,7 @@ subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,v_vvov,K1) double precision, intent(in) :: t1(nO, nV) double precision, intent(in) :: t2(nO, nO, nV, nV) double precision, intent(in) :: v_vvoo(nV,nV,nO,nO), v_ovov(nO,nV,nO,nV) - double precision, intent(in) :: v_vvov(nV,nV,nO,nV), v_ovoo(nO,nV,nO,nO) + double precision, intent(in) :: v_ovoo(nO,nV,nO,nO) double precision, intent(out) :: K1(nO, nV, nO, nV) double precision, allocatable :: X(:,:,:,:), Y(:,:,:,:), Z(:,:,:,:) @@ -1412,11 +1418,6 @@ subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,v_vvov,K1) double precision, allocatable :: K1tmp(:,:,:,:), t1v(:,:,:) allocate(K1tmp(nO,nO,nV,nV), t1v(cholesky_ao_num,nO,nO)) -! call dgemm('N','N',nO,nV*nO*nV,nV, & -! 1d0, t1 , size(t1,1), & -! v_vvov, size(v_vvov,1), & -! 1d0, K1 , size(K1,1)) - call dgemm('N','T', cholesky_ao_num*nO, nO, nV, 1.d0, & cc_space_v_ov_chol, cholesky_ao_num*nO, t1, nO, 0.d0, & t1v, cholesky_ao_num*nO) diff --git a/src/utils_cc/mo_integrals_cc.irp.f b/src/utils_cc/mo_integrals_cc.irp.f index 62237229..a68ab8de 100644 --- a/src/utils_cc/mo_integrals_cc.irp.f +++ b/src/utils_cc/mo_integrals_cc.irp.f @@ -190,7 +190,40 @@ BEGIN_PROVIDER [double precision, cc_space_v_oooo, (cc_nOa, cc_nOa, cc_nOa, cc_n implicit none - call gen_v_space(cc_nOa,cc_nOa,cc_nOa,cc_nOa, cc_list_occ,cc_list_occ,cc_list_occ,cc_list_occ, cc_space_v_oooo) + if (do_ao_cholesky) then + + integer :: i1, i2, i3, i4 + integer :: n1, n2, n3, n4 + + n1 = size(cc_space_v_oooo,1) + n2 = size(cc_space_v_oooo,2) + n3 = size(cc_space_v_oooo,3) + n4 = size(cc_space_v_oooo,4) + + double precision, allocatable :: buffer(:,:,:,:) + allocate(buffer(n1,n3,n2,n4)) + + call dgemm('T','N', n1*n3, n2*n4, cholesky_ao_num, 1.d0, & + cc_space_v_oo_chol, cholesky_ao_num, & + cc_space_v_oo_chol, cholesky_ao_num, 0.d0, buffer, n1*n3) + + !$OMP PARALLEL DO PRIVATE(i1,i2,i3,i4) COLLAPSE(2) + do i4 = 1, n4 + do i3 = 1, n3 + do i2 = 1, n2 + do i1 = 1, n1 + cc_space_v_oooo(i1,i2,i3,i4) = buffer(i1,i3,i2,i4) + enddo + enddo + enddo + enddo + !$OMP END PARALLEL DO + + deallocate(buffer) + + else + call gen_v_space(cc_nOa,cc_nOa,cc_nOa,cc_nOa, cc_list_occ,cc_list_occ,cc_list_occ,cc_list_occ, cc_space_v_oooo) + endif END_PROVIDER @@ -200,7 +233,40 @@ BEGIN_PROVIDER [double precision, cc_space_v_vooo, (cc_nVa, cc_nOa, cc_nOa, cc_n implicit none - call gen_v_space(cc_nVa,cc_nOa,cc_nOa,cc_nOa, cc_list_vir,cc_list_occ,cc_list_occ,cc_list_occ, cc_space_v_vooo) + if (do_ao_cholesky) then + + integer :: i1, i2, i3, i4 + integer :: n1, n2, n3, n4 + + n1 = size(cc_space_v_vooo,1) + n2 = size(cc_space_v_vooo,2) + n3 = size(cc_space_v_vooo,3) + n4 = size(cc_space_v_vooo,4) + + double precision, allocatable :: buffer(:,:,:,:) + allocate(buffer(n1,n3,n2,n4)) + + call dgemm('T','N', n1*n3, n2*n4, cholesky_ao_num, 1.d0, & + cc_space_v_vo_chol, cholesky_ao_num, & + cc_space_v_oo_chol, cholesky_ao_num, 0.d0, buffer, n1*n3) + + !$OMP PARALLEL DO PRIVATE(i1,i2,i3,i4) COLLAPSE(2) + do i4 = 1, n4 + do i3 = 1, n3 + do i2 = 1, n2 + do i1 = 1, n1 + cc_space_v_vooo(i1,i2,i3,i4) = buffer(i1,i3,i2,i4) + enddo + enddo + enddo + enddo + !$OMP END PARALLEL DO + + deallocate(buffer) + + else + call gen_v_space(cc_nVa,cc_nOa,cc_nOa,cc_nOa, cc_list_vir,cc_list_occ,cc_list_occ,cc_list_occ, cc_space_v_vooo) + endif END_PROVIDER @@ -210,7 +276,32 @@ BEGIN_PROVIDER [double precision, cc_space_v_ovoo, (cc_nOa, cc_nVa, cc_nOa, cc_n implicit none - call gen_v_space(cc_nOa,cc_nVa,cc_nOa,cc_nOa, cc_list_occ,cc_list_vir,cc_list_occ,cc_list_occ, cc_space_v_ovoo) + + if (do_ao_cholesky) then + + integer :: i1, i2, i3, i4 + integer :: n1, n2, n3, n4 + + n1 = size(cc_space_v_ovoo,1) + n2 = size(cc_space_v_ovoo,2) + n3 = size(cc_space_v_ovoo,3) + n4 = size(cc_space_v_ovoo,4) + + !$OMP PARALLEL DO PRIVATE(i1,i2,i3,i4) COLLAPSE(2) + do i4 = 1, n4 + do i3 = 1, n3 + do i2 = 1, n2 + do i1 = 1, n1 + cc_space_v_ovoo(i1,i2,i3,i4) = cc_space_v_vooo(i2,i1,i4,i3) + enddo + enddo + enddo + enddo + !$OMP END PARALLEL DO + + else + call gen_v_space(cc_nOa,cc_nVa,cc_nOa,cc_nOa, cc_list_occ,cc_list_vir,cc_list_occ,cc_list_occ, cc_space_v_ovoo) + endif END_PROVIDER @@ -220,7 +311,31 @@ BEGIN_PROVIDER [double precision, cc_space_v_oovo, (cc_nOa, cc_nOa, cc_nVa, cc_n implicit none - call gen_v_space(cc_nOa,cc_nOa,cc_nVa,cc_nOa, cc_list_occ,cc_list_occ,cc_list_vir,cc_list_occ, cc_space_v_oovo) + if (do_ao_cholesky) then + + integer :: i1, i2, i3, i4 + integer :: n1, n2, n3, n4 + + n1 = size(cc_space_v_oovo,1) + n2 = size(cc_space_v_oovo,2) + n3 = size(cc_space_v_oovo,3) + n4 = size(cc_space_v_oovo,4) + + !$OMP PARALLEL DO PRIVATE(i1,i2,i3,i4) COLLAPSE(2) + do i4 = 1, n4 + do i3 = 1, n3 + do i2 = 1, n2 + do i1 = 1, n1 + cc_space_v_oovo(i1,i2,i3,i4) = cc_space_v_vooo(i3,i2,i1,i4) + enddo + enddo + enddo + enddo + !$OMP END PARALLEL DO + + else + call gen_v_space(cc_nOa,cc_nOa,cc_nVa,cc_nOa, cc_list_occ,cc_list_occ,cc_list_vir,cc_list_occ, cc_space_v_oovo) + endif END_PROVIDER @@ -230,7 +345,31 @@ BEGIN_PROVIDER [double precision, cc_space_v_ooov, (cc_nOa, cc_nOa, cc_nOa, cc_n implicit none - call gen_v_space(cc_nOa,cc_nOa,cc_nOa,cc_nVa, cc_list_occ,cc_list_occ,cc_list_occ,cc_list_vir, cc_space_v_ooov) + if (do_ao_cholesky) then + + integer :: i1, i2, i3, i4 + integer :: n1, n2, n3, n4 + + n1 = size(cc_space_v_oovo,1) + n2 = size(cc_space_v_oovo,2) + n3 = size(cc_space_v_oovo,3) + n4 = size(cc_space_v_oovo,4) + + !$OMP PARALLEL DO PRIVATE(i1,i2,i3,i4) COLLAPSE(2) + do i4 = 1, n4 + do i3 = 1, n3 + do i2 = 1, n2 + do i1 = 1, n1 + cc_space_v_ooov(i1,i2,i3,i4) = cc_space_v_ovoo(i1,i4,i3,i2) + enddo + enddo + enddo + enddo + !$OMP END PARALLEL DO + + else + call gen_v_space(cc_nOa,cc_nOa,cc_nOa,cc_nVa, cc_list_occ,cc_list_occ,cc_list_occ,cc_list_vir, cc_space_v_ooov) + endif END_PROVIDER @@ -240,7 +379,40 @@ BEGIN_PROVIDER [double precision, cc_space_v_vvoo, (cc_nVa, cc_nVa, cc_nOa, cc_n implicit none - call gen_v_space(cc_nVa,cc_nVa,cc_nOa,cc_nOa, cc_list_vir,cc_list_vir,cc_list_occ,cc_list_occ, cc_space_v_vvoo) + if (do_ao_cholesky) then + + integer :: i1, i2, i3, i4 + integer :: n1, n2, n3, n4 + + n1 = size(cc_space_v_vvoo,1) + n2 = size(cc_space_v_vvoo,2) + n3 = size(cc_space_v_vvoo,3) + n4 = size(cc_space_v_vvoo,4) + + double precision, allocatable :: buffer(:,:,:,:) + allocate(buffer(n1,n3,n2,n4)) + + call dgemm('T','N', n1*n3, n2*n4, cholesky_ao_num, 1.d0, & + cc_space_v_vo_chol, cholesky_ao_num, & + cc_space_v_vo_chol, cholesky_ao_num, 0.d0, buffer, n1*n3) + + !$OMP PARALLEL DO PRIVATE(i1,i2,i3,i4) COLLAPSE(2) + do i4 = 1, n4 + do i3 = 1, n3 + do i2 = 1, n2 + do i1 = 1, n1 + cc_space_v_vvoo(i1,i2,i3,i4) = buffer(i1,i3,i2,i4) + enddo + enddo + enddo + enddo + !$OMP END PARALLEL DO + + deallocate(buffer) + + else + call gen_v_space(cc_nVa,cc_nVa,cc_nOa,cc_nOa, cc_list_vir,cc_list_vir,cc_list_occ,cc_list_occ, cc_space_v_vvoo) + endif END_PROVIDER @@ -250,7 +422,40 @@ BEGIN_PROVIDER [double precision, cc_space_v_vovo, (cc_nVa, cc_nOa, cc_nVa, cc_n implicit none - call gen_v_space(cc_nVa,cc_nOa,cc_nVa,cc_nOa, cc_list_vir,cc_list_occ,cc_list_vir,cc_list_occ, cc_space_v_vovo) + if (do_ao_cholesky) then + + integer :: i1, i2, i3, i4 + integer :: n1, n2, n3, n4 + + n1 = size(cc_space_v_vovo,1) + n2 = size(cc_space_v_vovo,2) + n3 = size(cc_space_v_vovo,3) + n4 = size(cc_space_v_vovo,4) + + double precision, allocatable :: buffer(:,:,:,:) + allocate(buffer(n1,n3,n2,n4)) + + call dgemm('T','N', n1*n3, n2*n4, cholesky_ao_num, 1.d0, & + cc_space_v_vv_chol, cholesky_ao_num, & + cc_space_v_oo_chol, cholesky_ao_num, 0.d0, buffer, n1*n3) + + !$OMP PARALLEL DO PRIVATE(i1,i2,i3,i4) COLLAPSE(2) + do i4 = 1, n4 + do i3 = 1, n3 + do i2 = 1, n2 + do i1 = 1, n1 + cc_space_v_vovo(i1,i2,i3,i4) = buffer(i1,i3,i2,i4) + enddo + enddo + enddo + enddo + !$OMP END PARALLEL DO + + deallocate(buffer) + + else + call gen_v_space(cc_nVa,cc_nOa,cc_nVa,cc_nOa, cc_list_vir,cc_list_occ,cc_list_vir,cc_list_occ, cc_space_v_vovo) + endif END_PROVIDER @@ -260,7 +465,31 @@ BEGIN_PROVIDER [double precision, cc_space_v_voov, (cc_nVa, cc_nOa, cc_nOa, cc_n implicit none - call gen_v_space(cc_nVa,cc_nOa,cc_nOa,cc_nVa, cc_list_vir,cc_list_occ,cc_list_occ,cc_list_vir, cc_space_v_voov) + if (do_ao_cholesky) then + + integer :: i1, i2, i3, i4 + integer :: n1, n2, n3, n4 + + n1 = size(cc_space_v_voov,1) + n2 = size(cc_space_v_voov,2) + n3 = size(cc_space_v_voov,3) + n4 = size(cc_space_v_voov,4) + + !$OMP PARALLEL DO PRIVATE(i1,i2,i3,i4) COLLAPSE(2) + do i4 = 1, n4 + do i3 = 1, n3 + do i2 = 1, n2 + do i1 = 1, n1 + cc_space_v_voov(i1,i2,i3,i4) = cc_space_v_vvoo(i1,i4,i3,i2) + enddo + enddo + enddo + enddo + !$OMP END PARALLEL DO + + else + call gen_v_space(cc_nVa,cc_nOa,cc_nOa,cc_nVa, cc_list_vir,cc_list_occ,cc_list_occ,cc_list_vir, cc_space_v_voov) + endif END_PROVIDER @@ -270,7 +499,31 @@ BEGIN_PROVIDER [double precision, cc_space_v_ovvo, (cc_nOa, cc_nVa, cc_nVa, cc_n implicit none - call gen_v_space(cc_nOa,cc_nVa,cc_nVa,cc_nOa, cc_list_occ,cc_list_vir,cc_list_vir,cc_list_occ, cc_space_v_ovvo) + if (do_ao_cholesky) then + + integer :: i1, i2, i3, i4 + integer :: n1, n2, n3, n4 + + n1 = size(cc_space_v_ovvo,1) + n2 = size(cc_space_v_ovvo,2) + n3 = size(cc_space_v_ovvo,3) + n4 = size(cc_space_v_ovvo,4) + + !$OMP PARALLEL DO PRIVATE(i1,i2,i3,i4) COLLAPSE(2) + do i4 = 1, n4 + do i3 = 1, n3 + do i2 = 1, n2 + do i1 = 1, n1 + cc_space_v_ovvo(i1,i2,i3,i4) = cc_space_v_vvoo(i3,i2,i1,i4) + enddo + enddo + enddo + enddo + !$OMP END PARALLEL DO + + else + call gen_v_space(cc_nOa,cc_nVa,cc_nVa,cc_nOa, cc_list_occ,cc_list_vir,cc_list_vir,cc_list_occ, cc_space_v_ovvo) + endif END_PROVIDER @@ -280,7 +533,31 @@ BEGIN_PROVIDER [double precision, cc_space_v_ovov, (cc_nOa, cc_nVa, cc_nOa, cc_n implicit none - call gen_v_space(cc_nOa,cc_nVa,cc_nOa,cc_nVa, cc_list_occ,cc_list_vir,cc_list_occ,cc_list_vir, cc_space_v_ovov) + if (do_ao_cholesky) then + + integer :: i1, i2, i3, i4 + integer :: n1, n2, n3, n4 + + n1 = size(cc_space_v_ovov,1) + n2 = size(cc_space_v_ovov,2) + n3 = size(cc_space_v_ovov,3) + n4 = size(cc_space_v_ovov,4) + + !$OMP PARALLEL DO PRIVATE(i1,i2,i3,i4) COLLAPSE(2) + do i4 = 1, n4 + do i3 = 1, n3 + do i2 = 1, n2 + do i1 = 1, n1 + cc_space_v_ovov(i1,i2,i3,i4) = cc_space_v_vovo(i2,i1,i4,i3) + enddo + enddo + enddo + enddo + !$OMP END PARALLEL DO + + else + call gen_v_space(cc_nOa,cc_nVa,cc_nOa,cc_nVa, cc_list_occ,cc_list_vir,cc_list_occ,cc_list_vir, cc_space_v_ovov) + endif END_PROVIDER @@ -290,7 +567,31 @@ BEGIN_PROVIDER [double precision, cc_space_v_oovv, (cc_nOa, cc_nOa, cc_nVa, cc_n implicit none - call gen_v_space(cc_nOa,cc_nOa,cc_nVa,cc_nVa, cc_list_occ,cc_list_occ,cc_list_vir,cc_list_vir, cc_space_v_oovv) + if (do_ao_cholesky) then + + integer :: i1, i2, i3, i4 + integer :: n1, n2, n3, n4 + + n1 = size(cc_space_v_oovv,1) + n2 = size(cc_space_v_oovv,2) + n3 = size(cc_space_v_oovv,3) + n4 = size(cc_space_v_oovv,4) + + !$OMP PARALLEL DO PRIVATE(i1,i2,i3,i4) COLLAPSE(2) + do i4 = 1, n4 + do i3 = 1, n3 + do i2 = 1, n2 + do i1 = 1, n1 + cc_space_v_oovv(i1,i2,i3,i4) = cc_space_v_vvoo(i3,i4,i1,i2) + enddo + enddo + enddo + enddo + !$OMP END PARALLEL DO + + else + call gen_v_space(cc_nOa,cc_nOa,cc_nVa,cc_nVa, cc_list_occ,cc_list_occ,cc_list_vir,cc_list_vir, cc_space_v_oovv) + endif END_PROVIDER From 8c65e01eedebcf164e16ea78097d35ee42ca0b7e Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 11 Jul 2023 17:31:58 +0200 Subject: [PATCH 29/40] I/O in Cholesky --- src/ao_two_e_ints/EZFIO.cfg | 6 + src/ao_two_e_ints/cholesky.irp.f | 836 +++++++++++++------------ src/ccsd/ccsd_space_orb_sub.irp.f | 35 +- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 154 +++-- 4 files changed, 568 insertions(+), 463 deletions(-) diff --git a/src/ao_two_e_ints/EZFIO.cfg b/src/ao_two_e_ints/EZFIO.cfg index 9f523fca..9c017813 100644 --- a/src/ao_two_e_ints/EZFIO.cfg +++ b/src/ao_two_e_ints/EZFIO.cfg @@ -4,6 +4,12 @@ doc: Read/Write |AO| integrals from/to disk [ Write | Read | None ] interface: ezfio,provider,ocaml default: None +[io_ao_cholesky] +type: Disk_access +doc: Read/Write |AO| integrals from/to disk [ Write | Read | None ] +interface: ezfio,provider,ocaml +default: None + [ao_integrals_threshold] type: Threshold doc: If | (pq|rs) | < `ao_integrals_threshold` then (pq|rs) is zero diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f index 128aa483..8b969174 100644 --- a/src/ao_two_e_ints/cholesky.irp.f +++ b/src/ao_two_e_ints/cholesky.irp.f @@ -14,412 +14,438 @@ BEGIN_PROVIDER [ double precision, cholesky_ao_transp, (cholesky_ao_num, ao_num, END_PROVIDER -BEGIN_PROVIDER [ integer, cholesky_ao_num ] + BEGIN_PROVIDER [ integer, cholesky_ao_num ] &BEGIN_PROVIDER [ double precision, cholesky_ao, (ao_num, ao_num, 1) ] - implicit none - BEGIN_DOC - ! Cholesky vectors in AO basis: (ik|a): - ! = (ik|jl) = sum_a (ik|a).(a|jl) - ! - ! Last dimension of cholesky_ao is cholesky_ao_num - END_DOC - - integer :: rank, ndim - double precision :: tau - double precision, pointer :: L(:,:), L_old(:,:) - - - double precision :: s - double precision, parameter :: dscale = 1.d0 - - double precision, allocatable :: D(:), Delta(:,:), Ltmp_p(:,:), Ltmp_q(:,:) - integer, allocatable :: Lset(:), Dset(:), addr(:,:), LDmap(:), DLmap(:) - integer, allocatable :: Lset_rev(:), Dset_rev(:) - - integer :: i,j,k,m,p,q, qj, dj, p2, q2 - integer :: N, np, nq - - double precision :: Dmax, Dmin, Qmax, f - double precision, external :: get_ao_two_e_integral - logical, external :: ao_two_e_integral_zero - - double precision, external :: ao_two_e_integral - integer :: block_size, iblock, ierr - - integer(omp_lock_kind), allocatable :: lock(:) - - double precision :: rss - double precision, external :: memory_of_double, memory_of_int - - - PROVIDE nucl_coord - - if (.not.do_direct_integrals) then - PROVIDE ao_two_e_integrals_in_map - endif - deallocate(cholesky_ao) - - ndim = ao_num*ao_num - tau = ao_cholesky_threshold - - rss = 6.d0 * memory_of_double(ndim) + & - 6.d0 * memory_of_int(ndim) - call check_mem(rss, irp_here) - - allocate(L(ndim,1)) - - print *, '' - print *, 'Cholesky decomposition of AO integrals' - print *, '======================================' - print *, '' - print *, '============ =============' - print *, ' Rank Threshold' - print *, '============ =============' - - - rank = 0 - - allocate( D(ndim), Lset(ndim), LDmap(ndim), DLmap(ndim), Dset(ndim) ) - allocate( Lset_rev(ndim), Dset_rev(ndim), lock(ndim) ) - allocate( addr(3,ndim) ) - do k=1,ndim - call omp_init_lock(lock(k)) - enddo - - ! 1. - k=0 - do j=1,ao_num - do i=1,ao_num - k = k+1 - addr(1,k) = i - addr(2,k) = j - addr(3,k) = (i-1)*ao_num + j - enddo - enddo - - if (do_direct_integrals) then - !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i) - do i=1,ndim - D(i) = ao_two_e_integral(addr(1,i), addr(2,i), & - addr(1,i), addr(2,i)) - enddo - !$OMP END PARALLEL DO - else - !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i) SCHEDULE(guided) - do i=1,ndim - D(i) = get_ao_two_e_integral(addr(1,i), addr(1,i), & - addr(2,i), addr(2,i), & - ao_integrals_map) - enddo - !$OMP END PARALLEL DO - endif - - Dmax = maxval(D) - - ! 2. - np=0 - Lset_rev = 0 - do p=1,ndim - if ( dscale*dscale*Dmax*D(p) > tau*tau ) then - np = np+1 - Lset(np) = p - Lset_rev(p) = np - endif - enddo - - ! 3. - N = 0 - - ! 4. - i = 0 - - ! 5. - do while ( (Dmax > tau).and.(rank < ndim) ) - ! a. - i = i+1 - - s = 0.1d0 - - ! Inrease s until the arrays fit in memory - do - - ! b. - Dmin = max(s*Dmax,tau) - - ! c. - nq=0 - LDmap = 0 - DLmap = 0 - Dset_rev = 0 - do p=1,np - if ( D(Lset(p)) > Dmin ) then - nq = nq+1 - Dset(nq) = Lset(p) - Dset_rev(Dset(nq)) = nq - LDmap(p) = nq - DLmap(nq) = p - endif - enddo - - call resident_memory(rss) - rss = rss & - + np*memory_of_double(nq) & ! Delta(np,nq) - + (rank+nq)* memory_of_double(ndim) & ! L(ndim,rank+nq) - + (np+nq)*memory_of_double(block_size) ! Ltmp_p(np,block_size) - ! Ltmp_q(nq,block_size) - - if (rss > qp_max_mem) then - s = s*2.d0 - else - exit - endif - - if ((s > 1.d0).or.(nq == 0)) then - print *, 'Not enough memory. Reduce cholesky threshold' - stop -1 - endif - - enddo - - ! d., e. - block_size = max(N,24) - - L_old => L - allocate(L(ndim,rank+nq), stat=ierr) - if (ierr /= 0) then - print *, irp_here, ': allocation failed : (L(ndim,rank+nq))' - stop -1 - endif - - !$OMP PARALLEL DO PRIVATE(k) - do k=1,rank - L(:,k) = L_old(:,k) - enddo - !$OMP END PARALLEL DO - - deallocate(L_old) - - allocate(Delta(np,nq), stat=ierr) - if (ierr /= 0) then - print *, irp_here, ': allocation failed : (Delta(np,nq))' - stop -1 - endif - - allocate(Ltmp_p(np,block_size), stat=ierr) - if (ierr /= 0) then - print *, irp_here, ': allocation failed : (Ltmp_p(np,block_size))' - stop -1 - endif - - allocate(Ltmp_q(nq,block_size), stat=ierr) - if (ierr /= 0) then - print *, irp_here, ': allocation failed : (Ltmp_q(nq,block_size))' - stop -1 - endif - - - !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(m,k,p,q,j) - - !$OMP DO - do q=1,nq - Delta(:,q) = 0.d0 - enddo - !$OMP ENDDO NOWAIT - - !$OMP DO - do k=1,N - do p=1,np - Ltmp_p(p,k) = L(Lset(p),k) - enddo - do q=1,nq - Ltmp_q(q,k) = L(Dset(q),k) - enddo - enddo - !$OMP END DO NOWAIT - - !$OMP BARRIER - - !$OMP DO SCHEDULE(guided) - do m=1,nq - - call omp_set_lock(lock(m)) - do k=1,np - ! Apply only to (k,m) pairs where k is not in Dset - if (LDmap(k) /= 0) cycle - q = Lset_rev(addr(3,Lset(k))) - if ((0 < q).and.(q < k)) cycle - if (.not.ao_two_e_integral_zero( addr(1,Lset(k)), addr(1,Dset(m)), & - addr(2,Lset(k)), addr(2,Dset(m)) ) ) then - if (do_direct_integrals) then - Delta(k,m) = ao_two_e_integral(addr(1,Lset(k)), addr(2,Lset(k)), & - addr(1,Dset(m)), addr(2,Dset(m))) - else - Delta(k,m) = get_ao_two_e_integral( addr(1,Lset(k)), addr(1,Dset(m)), & - addr(2,Lset(k)), addr(2,Dset(m)), ao_integrals_map) + implicit none + BEGIN_DOC + ! Cholesky vectors in AO basis: (ik|a): + ! = (ik|jl) = sum_a (ik|a).(a|jl) + ! + ! Last dimension of cholesky_ao is cholesky_ao_num + END_DOC + + integer :: rank, ndim + double precision :: tau + double precision, pointer :: L(:,:), L_old(:,:) + + + double precision :: s + double precision, parameter :: dscale = 1.d0 + + double precision, allocatable :: D(:), Delta(:,:), Ltmp_p(:,:), Ltmp_q(:,:) + integer, allocatable :: Lset(:), Dset(:), addr(:,:), LDmap(:), DLmap(:) + integer, allocatable :: Lset_rev(:), Dset_rev(:) + + integer :: i,j,k,m,p,q, qj, dj, p2, q2 + integer :: N, np, nq + + double precision :: Dmax, Dmin, Qmax, f + double precision, external :: get_ao_two_e_integral + logical, external :: ao_two_e_integral_zero + + double precision, external :: ao_two_e_integral + integer :: block_size, iblock, ierr + + integer(omp_lock_kind), allocatable :: lock(:) + + double precision :: rss + double precision, external :: memory_of_double, memory_of_int + + integer, external :: getUnitAndOpen + integer :: iunit + + ndim = ao_num*ao_num + deallocate(cholesky_ao) + + if (read_ao_cholesky) then + print *, 'Reading Cholesky vectors from disk...' + iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_ao', 'R') + read(iunit) rank + allocate(cholesky_ao(ao_num,ao_num,rank), stat=ierr) + read(iunit) cholesky_ao + close(iunit) + cholesky_ao_num = rank + + else + + PROVIDE nucl_coord + + if (.not.do_direct_integrals) then + PROVIDE ao_two_e_integrals_in_map + endif + + tau = ao_cholesky_threshold + + rss = 6.d0 * memory_of_double(ndim) + & + 6.d0 * memory_of_int(ndim) + call check_mem(rss, irp_here) + + allocate(L(ndim,1)) + + print *, '' + print *, 'Cholesky decomposition of AO integrals' + print *, '======================================' + print *, '' + print *, '============ =============' + print *, ' Rank Threshold' + print *, '============ =============' + + + rank = 0 + + allocate( D(ndim), Lset(ndim), LDmap(ndim), DLmap(ndim), Dset(ndim) ) + allocate( Lset_rev(ndim), Dset_rev(ndim), lock(ndim) ) + allocate( addr(3,ndim) ) + do k=1,ndim + call omp_init_lock(lock(k)) + enddo + + ! 1. + k=0 + do j=1,ao_num + do i=1,ao_num + k = k+1 + addr(1,k) = i + addr(2,k) = j + addr(3,k) = (i-1)*ao_num + j + enddo + enddo + + if (do_direct_integrals) then + !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i) + do i=1,ndim + D(i) = ao_two_e_integral(addr(1,i), addr(2,i), & + addr(1,i), addr(2,i)) + enddo + !$OMP END PARALLEL DO + else + !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i) SCHEDULE(guided) + do i=1,ndim + D(i) = get_ao_two_e_integral(addr(1,i), addr(1,i), & + addr(2,i), addr(2,i), & + ao_integrals_map) + enddo + !$OMP END PARALLEL DO + endif + + Dmax = maxval(D) + + ! 2. + np=0 + Lset_rev = 0 + do p=1,ndim + if ( dscale*dscale*Dmax*D(p) > tau*tau ) then + np = np+1 + Lset(np) = p + Lset_rev(p) = np + endif + enddo + + ! 3. + N = 0 + + ! 4. + i = 0 + + ! 5. + do while ( (Dmax > tau).and.(rank < ndim) ) + ! a. + i = i+1 + + s = 0.1d0 + + ! Inrease s until the arrays fit in memory + do while (.True.) + + ! b. + Dmin = max(s*Dmax,tau) + + ! c. + nq=0 + LDmap = 0 + DLmap = 0 + Dset_rev = 0 + do p=1,np + if ( D(Lset(p)) > Dmin ) then + nq = nq+1 + Dset(nq) = Lset(p) + Dset_rev(Dset(nq)) = nq + LDmap(p) = nq + DLmap(nq) = p endif - if (q /= 0) Delta(q,m) = Delta(k,m) - endif - enddo - - j = Dset_rev(addr(3,Dset(m))) - if ((0 < j).and.(j < m)) then - call omp_unset_lock(lock(m)) - cycle - endif - - if ((j /= m).and.(j /= 0)) then - call omp_set_lock(lock(j)) - endif - do k=1,nq - ! Apply only to (k,m) pairs both in Dset - p = DLmap(k) - q = Lset_rev(addr(3,Dset(k))) - if ((0 < q).and.(q < p)) cycle - if (.not.ao_two_e_integral_zero( addr(1,Dset(k)), addr(1,Dset(m)), & - addr(2,Dset(k)), addr(2,Dset(m)) ) ) then - if (do_direct_integrals) then - Delta(p,m) = ao_two_e_integral(addr(1,Dset(k)), addr(2,Dset(k)), & - addr(1,Dset(m)), addr(2,Dset(m))) - else - Delta(p,m) = get_ao_two_e_integral( addr(1,Dset(k)), addr(1,Dset(m)), & - addr(2,Dset(k)), addr(2,Dset(m)), ao_integrals_map) - endif - if (q /= 0) Delta(q,m) = Delta(p,m) - if (j /= 0) Delta(p,j) = Delta(p,m) - if (q*j /= 0) Delta(q,j) = Delta(p,m) - endif - enddo - call omp_unset_lock(lock(m)) - if ((j /= m).and.(j /= 0)) then - call omp_unset_lock(lock(j)) - endif - enddo - !$OMP END DO - - !$OMP END PARALLEL - - if (N>0) then - call dgemm('N','T', np, nq, N, -1.d0, & - Ltmp_p, np, Ltmp_q, nq, 1.d0, Delta, np) - endif - - ! f. - Qmax = D(Dset(1)) - do q=1,nq - Qmax = max(Qmax, D(Dset(q))) - enddo - - ! g. - - iblock = 0 - do j=1,nq - - if ( (Qmax <= Dmin).or.(N+j > ndim) ) exit - ! i. - rank = N+j - - if (iblock == block_size) then - call dgemm('N','T',np,nq,block_size,-1.d0, & - Ltmp_p, np, Ltmp_q, nq, 1.d0, Delta, np) - iblock = 0 - endif - - ! ii. - do dj=1,nq - qj = Dset(dj) - if (D(qj) == Qmax) then - exit - endif - enddo - - L(1:ndim, rank) = 0.d0 - - iblock = iblock+1 - do p=1,np - Ltmp_p(p,iblock) = Delta(p,dj) - enddo - - ! iv. - if (iblock > 1) then - call dgemv('N', np, iblock-1, -1.d0, Ltmp_p, np, Ltmp_q(dj,1), nq, 1.d0, & - Ltmp_p(1,iblock), 1) - endif - - ! iii. - f = 1.d0/dsqrt(Qmax) - - !$OMP PARALLEL PRIVATE(m,p,q,k) DEFAULT(shared) - !$OMP DO - do p=1,np - Ltmp_p(p,iblock) = Ltmp_p(p,iblock) * f - L(Lset(p), rank) = Ltmp_p(p,iblock) - D(Lset(p)) = D(Lset(p)) - Ltmp_p(p,iblock) * Ltmp_p(p,iblock) - enddo - !$OMP END DO - - !$OMP DO - do q=1,nq - Ltmp_q(q,iblock) = L(Dset(q), rank) - enddo - !$OMP END DO - - !$OMP END PARALLEL - - Qmax = D(Dset(1)) - do q=1,nq - Qmax = max(Qmax, D(Dset(q))) - enddo - - enddo - - print '(I10, 4X, ES12.3)', rank, Qmax - - deallocate(Delta, stat=ierr) - deallocate(Ltmp_p, stat=ierr) - deallocate(Ltmp_q, stat=ierr) - - ! i. - N = rank - - ! j. - Dmax = D(Lset(1)) - do p=1,np - Dmax = max(Dmax, D(Lset(p))) - enddo - - np=0 - Lset_rev = 0 - do p=1,ndim - if ( dscale*dscale*Dmax*D(p) > tau*tau ) then - np = np+1 - Lset(np) = p - Lset_rev(p) = np - endif - enddo - - enddo - - do k=1,ndim - call omp_destroy_lock(lock(k)) - enddo - - allocate(cholesky_ao(ao_num,ao_num,rank), stat=ierr) - if (ierr /= 0) then - print *, irp_here, ': Allocation failed' - stop -1 - endif - !$OMP PARALLEL DO PRIVATE(k) - do k=1,rank - call dcopy(ndim, L(1,k), 1, cholesky_ao(1,1,k), 1) - enddo - !$OMP END PARALLEL DO - deallocate(L) - cholesky_ao_num = rank - - print *, '============ =============' - print *, '' - print *, 'Rank : ', cholesky_ao_num, '(', 100.d0*dble(cholesky_ao_num)/dble(ao_num*ao_num), ' %)' - print *, '' + enddo + + call resident_memory(rss) + rss = rss & + + np*memory_of_double(nq) &! Delta(np,nq) + + (rank+nq)* memory_of_double(ndim) &! L(ndim,rank+nq) + + (np+nq)*memory_of_double(block_size) ! Ltmp_p(np,block_size) + ! Ltmp_q(nq,block_size) + + if (rss > qp_max_mem) then + s = s*2.d0 + else + exit + endif + + if ((s > 1.d0).or.(nq == 0)) then + print *, 'Not enough memory. Reduce cholesky threshold' + stop -1 + endif + + enddo + + ! d., e. + block_size = max(N,24) + + L_old => L + allocate(L(ndim,rank+nq), stat=ierr) + if (ierr /= 0) then + print *, irp_here, ': allocation failed : (L(ndim,rank+nq))' + stop -1 + endif + + !$OMP PARALLEL DO PRIVATE(k) + do k=1,rank + L(:,k) = L_old(:,k) + enddo + !$OMP END PARALLEL DO + + deallocate(L_old) + + allocate(Delta(np,nq), stat=ierr) + if (ierr /= 0) then + print *, irp_here, ': allocation failed : (Delta(np,nq))' + stop -1 + endif + + allocate(Ltmp_p(np,block_size), stat=ierr) + if (ierr /= 0) then + print *, irp_here, ': allocation failed : (Ltmp_p(np,block_size))' + stop -1 + endif + + allocate(Ltmp_q(nq,block_size), stat=ierr) + if (ierr /= 0) then + print *, irp_here, ': allocation failed : (Ltmp_q(nq,block_size))' + stop -1 + endif + + + !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(m,k,p,q,j) + + !$OMP DO + do q=1,nq + Delta(:,q) = 0.d0 + enddo + !$OMP ENDDO NOWAIT + + !$OMP DO + do k=1,N + do p=1,np + Ltmp_p(p,k) = L(Lset(p),k) + enddo + do q=1,nq + Ltmp_q(q,k) = L(Dset(q),k) + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP BARRIER + + !$OMP DO SCHEDULE(guided) + do m=1,nq + + call omp_set_lock(lock(m)) + do k=1,np + ! Apply only to (k,m) pairs where k is not in Dset + if (LDmap(k) /= 0) cycle + q = Lset_rev(addr(3,Lset(k))) + if ((0 < q).and.(q < k)) cycle + if (.not.ao_two_e_integral_zero( addr(1,Lset(k)), addr(1,Dset(m)),& + addr(2,Lset(k)), addr(2,Dset(m)) ) ) then + if (do_direct_integrals) then + Delta(k,m) = ao_two_e_integral(addr(1,Lset(k)), addr(2,Lset(k)),& + addr(1,Dset(m)), addr(2,Dset(m))) + else + Delta(k,m) = get_ao_two_e_integral( addr(1,Lset(k)), addr(1,Dset(m)),& + addr(2,Lset(k)), addr(2,Dset(m)), ao_integrals_map) + endif + if (q /= 0) Delta(q,m) = Delta(k,m) + endif + enddo + + j = Dset_rev(addr(3,Dset(m))) + if ((0 < j).and.(j < m)) then + call omp_unset_lock(lock(m)) + cycle + endif + + if ((j /= m).and.(j /= 0)) then + call omp_set_lock(lock(j)) + endif + do k=1,nq + ! Apply only to (k,m) pairs both in Dset + p = DLmap(k) + q = Lset_rev(addr(3,Dset(k))) + if ((0 < q).and.(q < p)) cycle + if (.not.ao_two_e_integral_zero( addr(1,Dset(k)), addr(1,Dset(m)),& + addr(2,Dset(k)), addr(2,Dset(m)) ) ) then + if (do_direct_integrals) then + Delta(p,m) = ao_two_e_integral(addr(1,Dset(k)), addr(2,Dset(k)),& + addr(1,Dset(m)), addr(2,Dset(m))) + else + Delta(p,m) = get_ao_two_e_integral( addr(1,Dset(k)), addr(1,Dset(m)),& + addr(2,Dset(k)), addr(2,Dset(m)), ao_integrals_map) + endif + if (q /= 0) Delta(q,m) = Delta(p,m) + if (j /= 0) Delta(p,j) = Delta(p,m) + if (q*j /= 0) Delta(q,j) = Delta(p,m) + endif + enddo + call omp_unset_lock(lock(m)) + if ((j /= m).and.(j /= 0)) then + call omp_unset_lock(lock(j)) + endif + enddo + !$OMP END DO + + !$OMP END PARALLEL + + if (N>0) then + call dgemm('N','T', np, nq, N, -1.d0, & + Ltmp_p, np, Ltmp_q, nq, 1.d0, Delta, np) + endif + + ! f. + Qmax = D(Dset(1)) + do q=1,nq + Qmax = max(Qmax, D(Dset(q))) + enddo + + ! g. + + iblock = 0 + do j=1,nq + + if ( (Qmax <= Dmin).or.(N+j > ndim) ) exit + ! i. + rank = N+j + + if (iblock == block_size) then + call dgemm('N','T',np,nq,block_size,-1.d0, & + Ltmp_p, np, Ltmp_q, nq, 1.d0, Delta, np) + iblock = 0 + endif + + ! ii. + do dj=1,nq + qj = Dset(dj) + if (D(qj) == Qmax) then + exit + endif + enddo + + L(1:ndim, rank) = 0.d0 + + iblock = iblock+1 + do p=1,np + Ltmp_p(p,iblock) = Delta(p,dj) + enddo + + ! iv. + if (iblock > 1) then + call dgemv('N', np, iblock-1, -1.d0, Ltmp_p, np, Ltmp_q(dj,1), nq, 1.d0,& + Ltmp_p(1,iblock), 1) + endif + + ! iii. + f = 1.d0/dsqrt(Qmax) + + !$OMP PARALLEL PRIVATE(m,p,q,k) DEFAULT(shared) + !$OMP DO + do p=1,np + Ltmp_p(p,iblock) = Ltmp_p(p,iblock) * f + L(Lset(p), rank) = Ltmp_p(p,iblock) + D(Lset(p)) = D(Lset(p)) - Ltmp_p(p,iblock) * Ltmp_p(p,iblock) + enddo + !$OMP END DO + + !$OMP DO + do q=1,nq + Ltmp_q(q,iblock) = L(Dset(q), rank) + enddo + !$OMP END DO + + !$OMP END PARALLEL + + Qmax = D(Dset(1)) + do q=1,nq + Qmax = max(Qmax, D(Dset(q))) + enddo + + enddo + + print '(I10, 4X, ES12.3)', rank, Qmax + + deallocate(Delta, stat=ierr) + deallocate(Ltmp_p, stat=ierr) + deallocate(Ltmp_q, stat=ierr) + + ! i. + N = rank + + ! j. + Dmax = D(Lset(1)) + do p=1,np + Dmax = max(Dmax, D(Lset(p))) + enddo + + np=0 + Lset_rev = 0 + do p=1,ndim + if ( dscale*dscale*Dmax*D(p) > tau*tau ) then + np = np+1 + Lset(np) = p + Lset_rev(p) = np + endif + enddo + + enddo + + do k=1,ndim + call omp_destroy_lock(lock(k)) + enddo + + allocate(cholesky_ao(ao_num,ao_num,rank), stat=ierr) + if (ierr /= 0) then + print *, irp_here, ': Allocation failed' + stop -1 + endif + !$OMP PARALLEL DO PRIVATE(k) + do k=1,rank + call dcopy(ndim, L(1,k), 1, cholesky_ao(1,1,k), 1) + enddo + !$OMP END PARALLEL DO + deallocate(L) + cholesky_ao_num = rank + + print *, '============ =============' + print *, '' + + if (write_ao_cholesky) then + print *, 'Writing Cholesky vectors to disk...' + iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_ao', 'W') + write(iunit) rank + write(iunit) cholesky_ao + close(iunit) + call ezfio_set_ao_two_e_ints_io_ao_cholesky('Read') + endif + endif + + print *, 'Rank : ', cholesky_ao_num, '(', 100.d0*dble(cholesky_ao_num)/dble(ao_num*ao_num), ' %)' + print *, '' + END_PROVIDER - + diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index e7b115bb..f97514cd 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -49,9 +49,34 @@ subroutine run_ccsd_space_orb allocate(H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO)) if (cc_update_method == 'diis') then - allocate(all_err(nO*nV+nO*nO*nV*nV,cc_diis_depth), all_t(nO*nV+nO*nO*nV*nV,cc_diis_depth)) - all_err = 0d0 - all_t = 0d0 + double precision :: rss, diis_mem, extra_mem + double precision, external :: memory_of_double + diis_mem = 2.d0*memory_of_double(nO*nV)*(1.d0+nO*nV) + call resident_memory(rss) + do while (cc_diis_depth > 1) + if (rss + diis_mem * cc_diis_depth > qp_max_mem) then + cc_diis_depth = cc_diis_depth - 1 + else + exit + endif + end do + if (cc_diis_depth <= 1) then + print *, 'Not enough memory for DIIS' + stop -1 + endif + print *, 'DIIS size ', cc_diis_depth + + allocate(all_err(nO*nV+nO*nO*nV*(nV*1_8),cc_diis_depth), all_t(nO*nV+nO*nO*nV*(nV*1_8),cc_diis_depth)) + !$OMP PARALLEL PRIVATE(i,j) DEFAULT(SHARED) + do j=1,cc_diis_depth + !$OMP DO + do i=1, size(all_err,1) + all_err(i,j) = 0d0 + all_t(i,j) = 0d0 + enddo + !$OMP END DO NOWAIT + enddo + !$OMP END PARALLEL endif if (elec_alpha_num /= elec_beta_num) then @@ -1427,7 +1452,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) !enddo !$omp parallel & - !$omp shared(nO,nV,K1,X_ovov,Z_ovov,t2) & + !$omp shared(nO,nV,K1,X_ovov,Y_ovov,t2) & !$omp private(u,v,gam,beta,i,a) & !$omp default(none) !$omp do @@ -1447,7 +1472,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do v = 1, nO do a = 1, nV do i = 1, nO - Z_ovov(i,a,v,beta) = t2(i,v,beta,a) + Y_ovov(i,a,v,beta) = t2(i,v,beta,a) enddo enddo enddo diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index 99a4e426..1c56996e 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -454,21 +454,8 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) double precision, intent(out) :: r2(nO,nO,nV,nV), max_r2 ! internal - double precision, allocatable :: g_occ(:,:), g_vir(:,:), J1(:,:,:,:), K1(:,:,:,:) - double precision, allocatable :: A1(:,:,:,:) integer :: u,v,i,j,beta,gam,a,b - - allocate(g_occ(nO,nO), g_vir(nV,nV)) - allocate(J1(nO,nV,nV,nO), K1(nO,nV,nO,nV)) - allocate(A1(nO,nO,nO,nO)) - - call compute_g_occ_chol(nO,nV,t1,t2,H_oo,g_occ) - call compute_g_vir_chol(nO,nV,t1,t2,H_vv,g_vir) - call compute_A1_chol(nO,nV,t1,t2,tau,A1) - call compute_J1_chol(nO,nV,t1,t2,cc_space_v_ovvo,cc_space_v_ovoo, & - cc_space_v_vvoo,J1) - call compute_K1_chol(nO,nV,t1,t2,cc_space_v_ovoo,cc_space_v_vvoo, & - cc_space_v_ovov,K1) + double precision :: max_r2_local ! Residual !r2 = 0d0 @@ -490,36 +477,47 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) !$omp end do !$omp end parallel + double precision, allocatable :: A1(:,:,:,:) + allocate(A1(nO,nO,nO,nO)) + call compute_A1_chol(nO,nV,t1,t2,tau,A1) call dgemm('N','N',nO*nO,nV*nV,nO*nO, & 1d0, A1, size(A1,1) * size(A1,2), & tau, size(tau,1) * size(tau,2), & 1d0, r2, size(r2,1) * size(r2,2)) + deallocate(A1) integer :: block_size, iblock, k block_size = 16 double precision, dimension(:,:,:), allocatable :: B1, tmp_cc, tmpB1 + double precision, dimension(:,:), allocatable :: tmp_cc2 allocate(tmp_cc(cholesky_ao_num,nV,nV)) call dgemm('N','N', cholesky_ao_num*nV, nV, nO, 1.d0, & cc_space_v_vo_chol, cholesky_ao_num*nV, t1, nO, 0.d0, tmp_cc, cholesky_ao_num*nV) - !$OMP PARALLEL PRIVATE(gam, iblock, B1, tmpB1, beta, b, a) - allocate(B1(nV,nV,block_size), tmpB1(nV,block_size,nV)) + call set_multiple_levels_omp(.False.) + + !$OMP PARALLEL PRIVATE(gam, iblock, B1, tmpB1, tmp_cc2, beta, b, a) + allocate(B1(nV,nV,block_size), tmpB1(nV,block_size,nV), tmp_cc2(cholesky_ao_num,nV)) !$OMP DO do gam = 1, nV do iblock = 1, nV, block_size - call dgemm('T', 'N', nV*min(block_size, nV-iblock+1), nV, cholesky_ao_num, & - -1.d0, cc_space_v_vv_chol(1,1,iblock), cholesky_ao_num, & - tmp_cc(1,1,gam), cholesky_ao_num, 0.d0, tmpB1, nV*block_size) call dgemm('T', 'N', nV*min(block_size, nV-iblock+1), nV, cholesky_ao_num, & -1.d0, tmp_cc(1,1,iblock), cholesky_ao_num, & - cc_space_v_vv_chol(1,1,gam), cholesky_ao_num, 1.d0, tmpB1, nV*block_size) + cc_space_v_vv_chol(1,1,gam), cholesky_ao_num, & + 0.d0, tmpB1, nV*block_size) + + do a=1,nV + do k=1,cholesky_ao_num + tmp_cc2(k,a) = cc_space_v_vv_chol(k,a,gam) - tmp_cc(k,a,gam) + enddo + enddo call dgemm('T','N', nV*min(block_size, nV-iblock+1), nV, cholesky_ao_num, 1.d0, & cc_space_v_vv_chol(1,1,iblock), cholesky_ao_num, & - cc_space_v_vv_chol(1,1,gam), cholesky_ao_num, 1.d0, & - tmpB1, nV*block_size) + tmp_cc2, cholesky_ao_num, & + 1.d0, tmpB1, nV*block_size) do beta = iblock, min(nV, iblock+block_size-1) do b = 1, nV @@ -538,15 +536,14 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) enddo !$OMP ENDDO - deallocate(B1, tmpB1) + deallocate(B1, tmpB1, tmp_cc2) !$OMP END PARALLEL deallocate(tmp_cc) - double precision, allocatable :: X_oovv(:,:,:,:),Y_oovv(:,:,:,:) - allocate(X_oovv(nO,nO,nV,nV),Y_oovv(nO,nO,nV,nV)) - + double precision, allocatable :: X_oovv(:,:,:,:) + allocate(X_oovv(nO,nO,nV,nV)) !$omp parallel & !$omp shared(nO,nV,t2,X_oovv) & !$omp private(u,v,gam,a) & @@ -564,10 +561,19 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) !$omp end do !$omp end parallel + double precision, allocatable :: g_vir(:,:) + allocate(g_vir(nV,nV)) + call compute_g_vir_chol(nO,nV,t1,t2,H_vv,g_vir) + + double precision, allocatable :: Y_oovv(:,:,:,:) + allocate(Y_oovv(nO,nO,nV,nV)) + call dgemm('N','N',nO*nO*nV,nV,nV, & 1d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3), & g_vir, size(g_vir,1), & 0d0, Y_oovv, size(Y_oovv,1) * size(Y_oovv,2) * size(Y_oovv,3)) + deallocate(g_vir) + deallocate(X_oovv) !$omp parallel & !$omp shared(nO,nV,r2,Y_oovv) & @@ -585,11 +591,18 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) enddo !$omp end do !$omp end parallel + deallocate(Y_oovv) + double precision, allocatable :: g_occ(:,:) + allocate(g_occ(nO,nO)) + call compute_g_occ_chol(nO,nV,t1,t2,H_oo,g_occ) + + allocate(X_oovv(nO,nO,nV,nV)) call dgemm('N','N',nO,nO*nV*nV,nO, & 1d0, g_occ , size(g_occ,1), & t2 , size(t2,1), & 0d0, X_oovv, size(X_oovv,1)) + deallocate(g_occ) !$omp parallel & !$omp shared(nO,nV,r2,X_oovv) & @@ -613,6 +626,8 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) double precision, allocatable :: X_vovv(:,:,:,:) allocate(X_vovv(nV,nO,nV,block_size)) + allocate(Y_oovv(nO,nO,nV,nV)) + do iblock = 1, nV, block_size do gam = iblock, min(nV, iblock+block_size-1) call dgemm('T','N',nV, nO*nV, cholesky_ao_num, 1.d0, & @@ -626,6 +641,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) 0d0, Y_oovv(1,1,1,iblock), size(Y_oovv,1)) enddo + deallocate(X_vovv) !$omp parallel & !$omp shared(nO,nV,r2,Y_oovv) & @@ -643,6 +659,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) enddo !$omp end do !$omp end parallel + deallocate(Y_oovv) double precision, allocatable :: X_ovvo(:,:,:,:) double precision, allocatable :: tcc(:,:,:), tcc2(:,:,:) @@ -693,6 +710,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) deallocate(X_ovvo) !----- + allocate(X_oovv(nO,nO,nV,nV)) call dgemm('N','N',nO*nO*nV,nV,nO, & @@ -716,9 +734,10 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) enddo !$omp end do !$omp end parallel + deallocate(X_oovv) double precision, allocatable :: X_vovo(:,:,:,:), Y_oovo(:,:,:,:) - allocate(X_vovo(nV,nO,nV,nO), Y_oovo(nO,nO,nV,nO)) + allocate(X_vovo(nV,nO,nV,nO)) !$omp parallel & !$omp shared(nO,nV,X_vovo,cc_space_v_ovvo) & @@ -737,15 +756,19 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) enddo !$omp end parallel + allocate(Y_oovo(nO,nO,nV,nO)) call dgemm('N','N',nO,nO*nV*nO,nV, & 1d0, t1, size(t1,1), & X_vovo, size(X_vovo,1), & 0d0, Y_oovo, size(Y_oovo,1)) + deallocate(X_vovo) + allocate(X_oovv(nO,nO,nV,nV)) call dgemm('N','N',nO*nO*nV, nV, nO, & 1d0, Y_oovo, size(Y_oovo,1) * size(Y_oovo,2) * size(Y_oovo,3), & t1 , size(t1,1), & 0d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3)) + deallocate(Y_oovo) !$omp parallel & !$omp shared(nO,nV,r2,X_oovv) & @@ -763,15 +786,23 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) enddo !$omp end do !$omp end parallel + deallocate(X_oovv) - deallocate(X_vovo,Y_oovo) - double precision, allocatable :: Y_voov(:,:,:,:), Z_ovov(:,:,:,:) - allocate(X_ovvo(nO,nV,nV,nO), Y_voov(nV,nO,nO,nV),Z_ovov(nO,nV,nO,nV)) + double precision, allocatable :: J1(:,:,:,:) + allocate(J1(nO,nV,nV,nO)) + call compute_J1_chol(nO,nV,t1,t2,cc_space_v_ovvo,cc_space_v_ovoo, & + cc_space_v_vvoo,J1) + + double precision, allocatable :: K1(:,:,:,:) + allocate(K1(nO,nV,nO,nV)) + call compute_K1_chol(nO,nV,t1,t2,cc_space_v_ovoo,cc_space_v_vvoo, & + cc_space_v_ovov,K1) + + allocate(X_ovvo(nO,nV,nV,nO)) !$omp parallel & - !$omp shared(nO,nV,X_ovvo,Y_voov,K1,J1,t2) & !$omp private(u,v,gam,beta,i,a) & - !$omp default(none) + !$omp default(shared) do i = 1, nO !$omp do do a = 1, nV @@ -783,7 +814,15 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) enddo !$omp end do nowait enddo + !$omp end parallel + deallocate(J1) + double precision, allocatable :: Y_voov(:,:,:,:) + allocate(Y_voov(nV,nO,nO,nV)) + + !$omp parallel & + !$omp private(u,v,gam,beta,i,a) & + !$omp default(shared) !$omp do do gam = 1, nV do v = 1, nO @@ -797,11 +836,16 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) !$omp end do !$omp end parallel + double precision, allocatable :: Z_ovov(:,:,:,:) + allocate(Z_ovov(nO,nV,nO,nV)) + call dgemm('N','N', nO*nV,nO*nV,nV*nO, & 1d0, X_ovvo, size(X_ovvo,1) * size(X_ovvo,2), & Y_voov, size(Y_voov,1) * size(Y_voov,2), & 0d0, Z_ovov, size(Z_ovov,1) * size(Z_ovov,2)) + deallocate(X_ovvo,Y_voov) + !$omp parallel & !$omp shared(nO,nV,r2,Z_ovov) & !$omp private(u,v,gam,beta) & @@ -819,10 +863,11 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) !$omp end do !$omp end parallel - deallocate(X_ovvo,Y_voov) + deallocate(Z_ovov) double precision, allocatable :: Y_ovov(:,:,:,:), X_ovov(:,:,:,:) - allocate(X_ovov(nO,nV,nO,nV),Y_ovov(nO,nV,nO,nV)) + allocate(X_ovov(nO,nV,nO,nV)) + allocate(Y_ovov(nO,nV,nO,nV)) !$omp parallel & !$omp shared(nO,nV,r2,K1,X_ovov,Y_ovov,t2) & @@ -853,10 +898,12 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) !$omp end do !$omp end parallel + allocate(Z_ovov(nO,nV,nO,nV)) call dgemm('T','N',nO*nV,nO*nV,nO*nV, & 1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), & Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), & 0d0, Z_ovov, size(Y_ovov,1) * size(Y_ovov,2)) + deallocate(X_ovov, Y_ovov) !$omp parallel & !$omp shared(nO,nV,r2,Z_ovov) & @@ -874,9 +921,11 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) enddo !$omp end do !$omp end parallel + deallocate(Z_ovov) + allocate(X_ovov(nO,nV,nO,nV),Y_ovov(nO,nV,nO,nV)) !$omp parallel & - !$omp shared(nO,nV,K1,X_ovov,Z_ovov,t2) & + !$omp shared(nO,nV,K1,X_ovov,Y_ovov,t2) & !$omp private(u,v,gam,beta,i,a) & !$omp default(none) !$omp do @@ -896,7 +945,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do v = 1, nO do a = 1, nV do i = 1, nO - Z_ovov(i,a,v,beta) = t2(i,v,beta,a) + Y_ovov(i,a,v,beta) = t2(i,v,beta,a) enddo enddo enddo @@ -904,11 +953,16 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) !$omp end do !$omp end parallel + deallocate(K1) + + allocate(Z_ovov(nO,nV,nO,nV)) call dgemm('N','N',nO*nV,nO*nV,nO*nV, & 1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), & Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), & 0d0, Z_ovov, size(Y_ovov,1) * size(Y_ovov,2)) + deallocate(X_ovov,Y_ovov) + !$omp parallel & !$omp shared(nO,nV,r2,Z_ovov) & !$omp private(u,v,gam,beta) & @@ -926,39 +980,33 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) !$omp end do !$omp end parallel - deallocate(X_ovov,Y_ovov,Z_ovov) + deallocate(Z_ovov) ! Change the sign for consistency with the code in spin orbitals + + max_r2 = 0d0 !$omp parallel & - !$omp shared(nO,nV,r2) & - !$omp private(i,j,a,b) & + !$omp shared(nO,nV,r2,max_r2) & + !$omp private(i,j,a,b,max_r2_local) & !$omp default(none) + max_r2_local = 0.d0 !$omp do do b = 1, nV do a = 1, nV do j = 1, nO do i = 1, nO r2(i,j,a,b) = -r2(i,j,a,b) + max_r2_local = max(r2(i,j,a,b), max_r2_local) enddo enddo enddo enddo - !$omp end do + !$omp end do nowait + !$omp critical + max_r2 = max(max_r2, max_r2_local) + !$omp end critical !$omp end parallel - max_r2 = 0d0 - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - max_r2 = max(r2(i,j,a,b), max_r2) - enddo - enddo - enddo - enddo - - deallocate(g_occ,g_vir,J1,K1,A1) - end ! A1 From 9e833cc47627e819e60e05faace5fabb3540f760 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 11 Jul 2023 22:17:40 +0200 Subject: [PATCH 30/40] Improve memory control --- src/ao_two_e_ints/cholesky.irp.f | 35 +++++++++++++++++++++----------- src/utils/memory.irp.f | 13 ++++++------ 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f index 8b969174..4bf60847 100644 --- a/src/ao_two_e_ints/cholesky.irp.f +++ b/src/ao_two_e_ints/cholesky.irp.f @@ -48,7 +48,7 @@ END_PROVIDER integer(omp_lock_kind), allocatable :: lock(:) - double precision :: rss + double precision :: mem double precision, external :: memory_of_double, memory_of_int integer, external :: getUnitAndOpen @@ -70,16 +70,22 @@ END_PROVIDER PROVIDE nucl_coord - if (.not.do_direct_integrals) then + if (do_direct_integrals) then + if (ao_two_e_integral(1,1,1,1) < huge(1.d0)) then + ! Trigger providers inside ao_two_e_integral + continue + endif + else PROVIDE ao_two_e_integrals_in_map endif tau = ao_cholesky_threshold - rss = 6.d0 * memory_of_double(ndim) + & - 6.d0 * memory_of_int(ndim) - call check_mem(rss, irp_here) + mem = 6.d0 * memory_of_double(ndim) + 6.d0 * memory_of_int(ndim) + call check_mem(mem, irp_here) + call print_memory_usage() + allocate(L(ndim,1)) print *, '' @@ -112,7 +118,7 @@ END_PROVIDER enddo if (do_direct_integrals) then - !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i) + !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i) SCHEDULE(guided) do i=1,ndim D(i) = ao_two_e_integral(addr(1,i), addr(2,i), & addr(1,i), addr(2,i)) @@ -175,20 +181,20 @@ END_PROVIDER endif enddo - call resident_memory(rss) - rss = rss & + call total_memory(mem) + mem = mem & + np*memory_of_double(nq) &! Delta(np,nq) + (rank+nq)* memory_of_double(ndim) &! L(ndim,rank+nq) - + (np+nq)*memory_of_double(block_size) ! Ltmp_p(np,block_size) - ! Ltmp_q(nq,block_size) + + (np+nq)*memory_of_double(block_size) ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size) - if (rss > qp_max_mem) then + if (mem > qp_max_mem) then s = s*2.d0 else exit endif if ((s > 1.d0).or.(nq == 0)) then + call print_memory_usage() print *, 'Not enough memory. Reduce cholesky threshold' stop -1 endif @@ -201,6 +207,7 @@ END_PROVIDER L_old => L allocate(L(ndim,rank+nq), stat=ierr) if (ierr /= 0) then + call print_memory_usage() print *, irp_here, ': allocation failed : (L(ndim,rank+nq))' stop -1 endif @@ -215,18 +222,21 @@ END_PROVIDER allocate(Delta(np,nq), stat=ierr) if (ierr /= 0) then + call print_memory_usage() print *, irp_here, ': allocation failed : (Delta(np,nq))' stop -1 endif allocate(Ltmp_p(np,block_size), stat=ierr) if (ierr /= 0) then + call print_memory_usage() print *, irp_here, ': allocation failed : (Ltmp_p(np,block_size))' stop -1 endif allocate(Ltmp_q(nq,block_size), stat=ierr) if (ierr /= 0) then + call print_memory_usage() print *, irp_here, ': allocation failed : (Ltmp_q(nq,block_size))' stop -1 endif @@ -253,7 +263,7 @@ END_PROVIDER !$OMP BARRIER - !$OMP DO SCHEDULE(guided) + !$OMP DO SCHEDULE(dynamic) do m=1,nq call omp_set_lock(lock(m)) @@ -419,6 +429,7 @@ END_PROVIDER allocate(cholesky_ao(ao_num,ao_num,rank), stat=ierr) if (ierr /= 0) then + call print_memory_usage() print *, irp_here, ': Allocation failed' stop -1 endif diff --git a/src/utils/memory.irp.f b/src/utils/memory.irp.f index 7da283ec..41ec0428 100644 --- a/src/utils/memory.irp.f +++ b/src/utils/memory.irp.f @@ -99,16 +99,15 @@ subroutine check_mem(rss_in,routine) END_DOC double precision, intent(in) :: rss_in character*(*) :: routine - double precision :: rss - !$OMP CRITICAL - call resident_memory(rss) - rss += rss_in - if (int(rss)+1 > qp_max_mem) then + double precision :: mem + call total_memory(mem) + mem += rss_in + if (mem > qp_max_mem) then + call print_memory_usage() print *, 'Not enough memory: aborting in ', routine - print *, int(rss)+1, ' GB required' + print *, mem, ' GB required' stop -1 endif - !$OMP END CRITICAL end subroutine print_memory_usage() From 349f956e1cd8c6af519718b3043d4f8fd26b7f4f Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 11 Jul 2023 22:31:51 +0200 Subject: [PATCH 31/40] Super fast cholesky --- src/ao_two_e_ints/cholesky.irp.f | 92 +++++++++++--------------------- 1 file changed, 30 insertions(+), 62 deletions(-) diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f index 4bf60847..7d02d27f 100644 --- a/src/ao_two_e_ints/cholesky.irp.f +++ b/src/ao_two_e_ints/cholesky.irp.f @@ -35,6 +35,7 @@ END_PROVIDER double precision, allocatable :: D(:), Delta(:,:), Ltmp_p(:,:), Ltmp_q(:,:) integer, allocatable :: Lset(:), Dset(:), addr(:,:), LDmap(:), DLmap(:) integer, allocatable :: Lset_rev(:), Dset_rev(:) + logical, allocatable :: computed(:) integer :: i,j,k,m,p,q, qj, dj, p2, q2 integer :: N, np, nq @@ -158,7 +159,7 @@ END_PROVIDER ! a. i = i+1 - s = 0.1d0 + s = 0.01d0 ! Inrease s until the arrays fit in memory do while (.True.) @@ -242,11 +243,14 @@ END_PROVIDER endif + allocate(computed(nq)) + !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(m,k,p,q,j) !$OMP DO do q=1,nq Delta(:,q) = 0.d0 + computed(q) = .False. enddo !$OMP ENDDO NOWAIT @@ -262,64 +266,6 @@ END_PROVIDER !$OMP END DO NOWAIT !$OMP BARRIER - - !$OMP DO SCHEDULE(dynamic) - do m=1,nq - - call omp_set_lock(lock(m)) - do k=1,np - ! Apply only to (k,m) pairs where k is not in Dset - if (LDmap(k) /= 0) cycle - q = Lset_rev(addr(3,Lset(k))) - if ((0 < q).and.(q < k)) cycle - if (.not.ao_two_e_integral_zero( addr(1,Lset(k)), addr(1,Dset(m)),& - addr(2,Lset(k)), addr(2,Dset(m)) ) ) then - if (do_direct_integrals) then - Delta(k,m) = ao_two_e_integral(addr(1,Lset(k)), addr(2,Lset(k)),& - addr(1,Dset(m)), addr(2,Dset(m))) - else - Delta(k,m) = get_ao_two_e_integral( addr(1,Lset(k)), addr(1,Dset(m)),& - addr(2,Lset(k)), addr(2,Dset(m)), ao_integrals_map) - endif - if (q /= 0) Delta(q,m) = Delta(k,m) - endif - enddo - - j = Dset_rev(addr(3,Dset(m))) - if ((0 < j).and.(j < m)) then - call omp_unset_lock(lock(m)) - cycle - endif - - if ((j /= m).and.(j /= 0)) then - call omp_set_lock(lock(j)) - endif - do k=1,nq - ! Apply only to (k,m) pairs both in Dset - p = DLmap(k) - q = Lset_rev(addr(3,Dset(k))) - if ((0 < q).and.(q < p)) cycle - if (.not.ao_two_e_integral_zero( addr(1,Dset(k)), addr(1,Dset(m)),& - addr(2,Dset(k)), addr(2,Dset(m)) ) ) then - if (do_direct_integrals) then - Delta(p,m) = ao_two_e_integral(addr(1,Dset(k)), addr(2,Dset(k)),& - addr(1,Dset(m)), addr(2,Dset(m))) - else - Delta(p,m) = get_ao_two_e_integral( addr(1,Dset(k)), addr(1,Dset(m)),& - addr(2,Dset(k)), addr(2,Dset(m)), ao_integrals_map) - endif - if (q /= 0) Delta(q,m) = Delta(p,m) - if (j /= 0) Delta(p,j) = Delta(p,m) - if (q*j /= 0) Delta(q,j) = Delta(p,m) - endif - enddo - call omp_unset_lock(lock(m)) - if ((j /= m).and.(j /= 0)) then - call omp_unset_lock(lock(j)) - endif - enddo - !$OMP END DO - !$OMP END PARALLEL if (N>0) then @@ -358,6 +304,27 @@ END_PROVIDER L(1:ndim, rank) = 0.d0 + if (.not.computed(dj)) then + m = dj + !$OMP PARALLEL DO PRIVATE(k) SCHEDULE(guided) + do k=np,1,-1 + if (.not.ao_two_e_integral_zero( addr(1,Lset(k)), addr(1,Dset(m)),& + addr(2,Lset(k)), addr(2,Dset(m)) ) ) then + if (do_direct_integrals) then + Delta(k,m) = Delta(k,m) + & + ao_two_e_integral(addr(1,Lset(k)), addr(2,Lset(k)),& + addr(1,Dset(m)), addr(2,Dset(m))) + else + Delta(k,m) = Delta(k,m) + & + get_ao_two_e_integral( addr(1,Lset(k)), addr(1,Dset(m)),& + addr(2,Lset(k)), addr(2,Dset(m)), ao_integrals_map) + endif + endif + enddo + !$OMP END PARALLEL DO + computed(dj) = .True. + endif + iblock = iblock+1 do p=1,np Ltmp_p(p,iblock) = Delta(p,dj) @@ -398,9 +365,10 @@ END_PROVIDER print '(I10, 4X, ES12.3)', rank, Qmax - deallocate(Delta, stat=ierr) - deallocate(Ltmp_p, stat=ierr) - deallocate(Ltmp_q, stat=ierr) + deallocate(computed) + deallocate(Delta) + deallocate(Ltmp_p) + deallocate(Ltmp_q) ! i. N = rank From 1e390d83574392887c6e3890b9f860c98cd66904 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 11 Jul 2023 23:50:31 +0200 Subject: [PATCH 32/40] Reduce memory --- src/ao_two_e_ints/cholesky.irp.f | 25 ++----------------------- src/ccsd/ccsd_space_orb_sub.irp.f | 7 +++++++ 2 files changed, 9 insertions(+), 23 deletions(-) diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f index 7d02d27f..175ccf6e 100644 --- a/src/ao_two_e_ints/cholesky.irp.f +++ b/src/ao_two_e_ints/cholesky.irp.f @@ -33,8 +33,7 @@ END_PROVIDER double precision, parameter :: dscale = 1.d0 double precision, allocatable :: D(:), Delta(:,:), Ltmp_p(:,:), Ltmp_q(:,:) - integer, allocatable :: Lset(:), Dset(:), addr(:,:), LDmap(:), DLmap(:) - integer, allocatable :: Lset_rev(:), Dset_rev(:) + integer, allocatable :: Lset(:), Dset(:), addr(:,:) logical, allocatable :: computed(:) integer :: i,j,k,m,p,q, qj, dj, p2, q2 @@ -47,8 +46,6 @@ END_PROVIDER double precision, external :: ao_two_e_integral integer :: block_size, iblock, ierr - integer(omp_lock_kind), allocatable :: lock(:) - double precision :: mem double precision, external :: memory_of_double, memory_of_int @@ -100,12 +97,8 @@ END_PROVIDER rank = 0 - allocate( D(ndim), Lset(ndim), LDmap(ndim), DLmap(ndim), Dset(ndim) ) - allocate( Lset_rev(ndim), Dset_rev(ndim), lock(ndim) ) + allocate( D(ndim), Lset(ndim), Dset(ndim) ) allocate( addr(3,ndim) ) - do k=1,ndim - call omp_init_lock(lock(k)) - enddo ! 1. k=0 @@ -139,12 +132,10 @@ END_PROVIDER ! 2. np=0 - Lset_rev = 0 do p=1,ndim if ( dscale*dscale*Dmax*D(p) > tau*tau ) then np = np+1 Lset(np) = p - Lset_rev(p) = np endif enddo @@ -169,16 +160,10 @@ END_PROVIDER ! c. nq=0 - LDmap = 0 - DLmap = 0 - Dset_rev = 0 do p=1,np if ( D(Lset(p)) > Dmin ) then nq = nq+1 Dset(nq) = Lset(p) - Dset_rev(Dset(nq)) = nq - LDmap(p) = nq - DLmap(nq) = p endif enddo @@ -380,21 +365,15 @@ END_PROVIDER enddo np=0 - Lset_rev = 0 do p=1,ndim if ( dscale*dscale*Dmax*D(p) > tau*tau ) then np = np+1 Lset(np) = p - Lset_rev(p) = np endif enddo enddo - do k=1,ndim - call omp_destroy_lock(lock(k)) - enddo - allocate(cholesky_ao(ao_num,ao_num,rank), stat=ierr) if (ierr /= 0) then call print_memory_usage() diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index f97514cd..b48ca7da 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -18,6 +18,13 @@ subroutine run_ccsd_space_orb integer(bit_kind) :: det(N_int,2) integer :: nO, nV, nOa, nVa + if (do_ao_cholesky) then + PROVIDE cholesky_mo_transp + FREE cholesky_ao + else + PROVIDE mo_two_e_integrals_in_map + endif + det = psi_det(:,:,cc_ref) print*,'Reference determinant:' call print_det(det,N_int) From 3c89e9d88d21d6ea171889989008172b53262e67 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Wed, 12 Jul 2023 11:50:34 +0200 Subject: [PATCH 33/40] Fixed qp set_file --- etc/qp.rc | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/etc/qp.rc b/etc/qp.rc index 9eec4570..c485abea 100644 --- a/etc/qp.rc +++ b/etc/qp.rc @@ -188,7 +188,19 @@ _qp_Complete() ;; esac;; set_file) - COMPREPLY=( $(compgen -W "$(for i in */ $(find . -name ezfio | sed 's/ezfio$/.version/') ; do [[ -f $i ]] && echo ${i%/.version} ; done)" -- ${cur} ) ) + # Array to store directory names + dirs=() + + # Find directories containing "ezfio/.version" file recursively + for i in $(find . -name ezfio | sed 's/ezfio$/.version/') + do + dir_name=${i%/.version} # Remove the ".version" suffix + dir_name=${dir_name#./} # Remove the leading "./" if present + dirs+=("$dir_name") + done + + # Output the directory names for completion + COMPREPLY=("${dirs[@]/#/.\/}") # Prefix each directory name with "./" return 0 ;; plugins) From 0aed20f53a68c0225d05b7917da351926c2234e0 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Wed, 12 Jul 2023 12:04:42 +0200 Subject: [PATCH 34/40] Fixed previous commit --- etc/qp.rc | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/etc/qp.rc b/etc/qp.rc index c485abea..d316faf5 100644 --- a/etc/qp.rc +++ b/etc/qp.rc @@ -189,18 +189,17 @@ _qp_Complete() esac;; set_file) # Array to store directory names - dirs=() + dirs="" # Find directories containing "ezfio/.version" file recursively for i in $(find . -name ezfio | sed 's/ezfio$/.version/') do dir_name=${i%/.version} # Remove the ".version" suffix - dir_name=${dir_name#./} # Remove the leading "./" if present - dirs+=("$dir_name") + dir_name=${dir_name#./} # Remove the leading "./" + dirs+="./$dir_name " done - # Output the directory names for completion - COMPREPLY=("${dirs[@]/#/.\/}") # Prefix each directory name with "./" + COMPREPLY=( $(compgen -W "$dirs" -- ${cur} ) ) return 0 ;; plugins) From d4574f24d981d793a6038039a4b19af6733fe7a3 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Wed, 12 Jul 2023 12:34:48 +0200 Subject: [PATCH 35/40] Reduced memory in CCSD --- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 154 +++++++++++----------- src/mo_two_e_ints/cholesky.irp.f | 24 ++-- src/mo_two_e_ints/integrals_3_index.irp.f | 14 +- src/mo_two_e_ints/mo_bi_integrals.irp.f | 16 +-- src/utils_cc/mo_integrals_cc.irp.f | 62 ++++----- 5 files changed, 139 insertions(+), 131 deletions(-) diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index 1c56996e..5969928a 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -209,9 +209,9 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) do iblock = 1, nV, block_size nVmax = min(block_size,nV-iblock+1) - call dgemm('T','N', nV*nO, nV*nVmax, cholesky_ao_num, 1.d0, & - cc_space_v_vo_chol , cholesky_ao_num, & - cc_space_v_vv_chol(1,1,iblock), cholesky_ao_num, & + call dgemm('T','N', nV*nO, nV*nVmax, cholesky_mo_num, 1.d0, & + cc_space_v_vo_chol , cholesky_mo_num, & + cc_space_v_vv_chol(1,1,iblock), cholesky_mo_num, & 0.d0, W_vvov_tmp, nV*nO) !$omp parallel & @@ -304,7 +304,7 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo) double precision, allocatable :: tau_kau(:,:,:), tmp_vov(:,:,:) - allocate(tau_kau(cholesky_ao_num,nV,nO)) + allocate(tau_kau(cholesky_mo_num,nV,nO)) !$omp parallel & !$omp default(shared) & !$omp private(i,u,j,k,a,b,tmp_vov) @@ -318,9 +318,9 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo) enddo enddo enddo - call dgemm('N','T',cholesky_ao_num,nV,nO*nV,1.d0, & - cc_space_v_ov_chol, cholesky_ao_num, tmp_vov, nV, & - 0.d0, tau_kau(1,1,u), cholesky_ao_num) + call dgemm('N','T',cholesky_mo_num,nV,nO*nV,1.d0, & + cc_space_v_ov_chol, cholesky_mo_num, tmp_vov, nV, & + 0.d0, tau_kau(1,1,u), cholesky_mo_num) enddo !$omp end do nowait deallocate(tmp_vov) @@ -333,8 +333,8 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo) !$omp end do nowait !$omp barrier !$omp end parallel - call dgemm('T', 'N', nO, nO, cholesky_ao_num*nV, 1.d0, & - tau_kau, cholesky_ao_num*nV, cc_space_v_vo_chol, cholesky_ao_num*nV, & + call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & + tau_kau, cholesky_mo_num*nV, cc_space_v_vo_chol, cholesky_mo_num*nV, & 1.d0, H_oo, nO) end @@ -353,7 +353,7 @@ subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv) double precision, allocatable :: tau_kia(:,:,:), tmp_oov(:,:,:) - allocate(tau_kia(cholesky_ao_num,nO,nV)) + allocate(tau_kia(cholesky_mo_num,nO,nV)) !$omp parallel & !$omp default(shared) & !$omp private(i,beta,j,k,a,b,tmp_oov) @@ -367,9 +367,9 @@ subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv) enddo enddo enddo - call dgemm('N','T',cholesky_ao_num,nO,nO*nV,1.d0, & - cc_space_v_ov_chol, cholesky_ao_num, tmp_oov, nO, & - 0.d0, tau_kia(1,1,a), cholesky_ao_num) + call dgemm('N','T',cholesky_mo_num,nO,nO*nV,1.d0, & + cc_space_v_ov_chol, cholesky_mo_num, tmp_oov, nO, & + 0.d0, tau_kia(1,1,a), cholesky_mo_num) enddo !$omp end do nowait deallocate(tmp_oov) @@ -383,8 +383,8 @@ subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv) !$omp end do nowait !$omp barrier !$omp end parallel - call dgemm('T', 'N', nV, nV, cholesky_ao_num*nO, -1.d0, & - tau_kia, cholesky_ao_num*nO, cc_space_v_ov_chol, cholesky_ao_num*nO, & + call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & + tau_kia, cholesky_mo_num*nO, cc_space_v_ov_chol, cholesky_mo_num*nO, & 1.d0, H_vv, nV) end @@ -407,33 +407,33 @@ subroutine compute_H_vo_chol(nO,nV,t1,H_vo) enddo enddo - allocate(tmp_k(cholesky_ao_num)) - call dgemm('N', 'N', cholesky_ao_num, 1, nO*nV, 2.d0, & - cc_space_v_ov_chol, cholesky_ao_num, & - t1, nO*nV, 0.d0, tmp_k, cholesky_ao_num) + allocate(tmp_k(cholesky_mo_num)) + call dgemm('N', 'N', cholesky_mo_num, 1, nO*nV, 2.d0, & + cc_space_v_ov_chol, cholesky_mo_num, & + t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num) - call dgemm('T','N',nV*nO,1,cholesky_ao_num,1.d0, & - cc_space_v_vo_chol, cholesky_ao_num, tmp_k, cholesky_ao_num, 1.d0, & + call dgemm('T','N',nV*nO,1,cholesky_mo_num,1.d0, & + cc_space_v_vo_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, & H_vo, nV*nO) deallocate(tmp_k) - allocate(tmp(cholesky_ao_num,nO,nO)) - allocate(tmp2(cholesky_ao_num,nO,nO)) + allocate(tmp(cholesky_mo_num,nO,nO)) + allocate(tmp2(cholesky_mo_num,nO,nO)) - call dgemm('N','T', cholesky_ao_num*nO, nO, nV, 1.d0, & - cc_space_v_ov_chol, cholesky_ao_num*nO, t1, nO, 0.d0, tmp, cholesky_ao_num*nO) + call dgemm('N','T', cholesky_mo_num*nO, nO, nV, 1.d0, & + cc_space_v_ov_chol, cholesky_mo_num*nO, t1, nO, 0.d0, tmp, cholesky_mo_num*nO) do i=1,nO do j=1,nO - do k=1,cholesky_ao_num + do k=1,cholesky_mo_num tmp2(k,j,i) = tmp(k,i,j) enddo enddo enddo deallocate(tmp) - call dgemm('T','N', nV, nO, cholesky_ao_num*nO, -1.d0, & - cc_space_v_ov_chol, cholesky_ao_num*nO, tmp2, cholesky_ao_num*nO, & + call dgemm('T','N', nV, nO, cholesky_mo_num*nO, -1.d0, & + cc_space_v_ov_chol, cholesky_mo_num*nO, tmp2, cholesky_mo_num*nO, & 1.d0, H_vo, nV) end @@ -491,32 +491,32 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) double precision, dimension(:,:,:), allocatable :: B1, tmp_cc, tmpB1 double precision, dimension(:,:), allocatable :: tmp_cc2 - allocate(tmp_cc(cholesky_ao_num,nV,nV)) - call dgemm('N','N', cholesky_ao_num*nV, nV, nO, 1.d0, & - cc_space_v_vo_chol, cholesky_ao_num*nV, t1, nO, 0.d0, tmp_cc, cholesky_ao_num*nV) + allocate(tmp_cc(cholesky_mo_num,nV,nV)) + call dgemm('N','N', cholesky_mo_num*nV, nV, nO, 1.d0, & + cc_space_v_vo_chol, cholesky_mo_num*nV, t1, nO, 0.d0, tmp_cc, cholesky_mo_num*nV) call set_multiple_levels_omp(.False.) !$OMP PARALLEL PRIVATE(gam, iblock, B1, tmpB1, tmp_cc2, beta, b, a) - allocate(B1(nV,nV,block_size), tmpB1(nV,block_size,nV), tmp_cc2(cholesky_ao_num,nV)) + allocate(B1(nV,nV,block_size), tmpB1(nV,block_size,nV), tmp_cc2(cholesky_mo_num,nV)) !$OMP DO do gam = 1, nV do iblock = 1, nV, block_size - call dgemm('T', 'N', nV*min(block_size, nV-iblock+1), nV, cholesky_ao_num, & - -1.d0, tmp_cc(1,1,iblock), cholesky_ao_num, & - cc_space_v_vv_chol(1,1,gam), cholesky_ao_num, & + call dgemm('T', 'N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, & + -1.d0, tmp_cc(1,1,iblock), cholesky_mo_num, & + cc_space_v_vv_chol(1,1,gam), cholesky_mo_num, & 0.d0, tmpB1, nV*block_size) do a=1,nV - do k=1,cholesky_ao_num + do k=1,cholesky_mo_num tmp_cc2(k,a) = cc_space_v_vv_chol(k,a,gam) - tmp_cc(k,a,gam) enddo enddo - call dgemm('T','N', nV*min(block_size, nV-iblock+1), nV, cholesky_ao_num, 1.d0, & - cc_space_v_vv_chol(1,1,iblock), cholesky_ao_num, & - tmp_cc2, cholesky_ao_num, & + call dgemm('T','N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, 1.d0, & + cc_space_v_vv_chol(1,1,iblock), cholesky_mo_num, & + tmp_cc2, cholesky_mo_num, & 1.d0, tmpB1, nV*block_size) do beta = iblock, min(nV, iblock+block_size-1) @@ -630,9 +630,9 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do iblock = 1, nV, block_size do gam = iblock, min(nV, iblock+block_size-1) - call dgemm('T','N',nV, nO*nV, cholesky_ao_num, 1.d0, & - cc_space_v_vv_chol(1,1,gam), cholesky_ao_num, cc_space_v_ov_chol, & - cholesky_ao_num, 0.d0, X_vovv(1,1,1,gam-iblock+1), nV) + call dgemm('T','N',nV, nO*nV, cholesky_mo_num, 1.d0, & + cc_space_v_vv_chol(1,1,gam), cholesky_mo_num, cc_space_v_ov_chol, & + cholesky_mo_num, 0.d0, X_vovv(1,1,1,gam-iblock+1), nV) enddo call dgemm('N','N',nO,nO*nV*min(block_size, nV-iblock+1),nV, & @@ -663,19 +663,19 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) double precision, allocatable :: X_ovvo(:,:,:,:) double precision, allocatable :: tcc(:,:,:), tcc2(:,:,:) - allocate(tcc2(cholesky_ao_num,nV,nO), X_ovvo(nO,nV,nV,nO)) - allocate(tcc(cholesky_ao_num,nO,nV)) + allocate(tcc2(cholesky_mo_num,nV,nO), X_ovvo(nO,nV,nV,nO)) + allocate(tcc(cholesky_mo_num,nO,nV)) - call dgemm('N','T', cholesky_ao_num*nV, nO, nV, 1.d0, & - cc_space_v_vv_chol, cholesky_ao_num*nV, t1, nO, & - 0.d0, tcc2, cholesky_ao_num*nV) + call dgemm('N','T', cholesky_mo_num*nV, nO, nV, 1.d0, & + cc_space_v_vv_chol, cholesky_mo_num*nV, t1, nO, & + 0.d0, tcc2, cholesky_mo_num*nV) - call dgemm('N','N', cholesky_ao_num*nO, nV, nO, 1.d0, & - cc_space_v_oo_chol, cholesky_ao_num*nO, t1, nO, & - 0.d0, tcc, cholesky_ao_num*nO) + call dgemm('N','N', cholesky_mo_num*nO, nV, nO, 1.d0, & + cc_space_v_oo_chol, cholesky_mo_num*nO, t1, nO, & + 0.d0, tcc, cholesky_mo_num*nO) - call dgemm('T','N', nO*nV, nV*nO, cholesky_ao_num, 1.d0, & - tcc, cholesky_ao_num, tcc2, cholesky_ao_num, 0.d0, & + call dgemm('T','N', nO*nV, nV*nO, cholesky_mo_num, 1.d0, & + tcc, cholesky_mo_num, tcc2, cholesky_mo_num, 0.d0, & X_ovvo, nO*nV) deallocate(tcc, tcc2) @@ -1160,23 +1160,23 @@ subroutine compute_g_vir_chol(nO,nV,t1,t2,H_vv,g_vir) 0d0, g_vir, size(g_vir,1)) double precision, allocatable :: tmp_k(:), tmp_vo(:,:,:), tmp_vo2(:,:,:) - allocate(tmp_k(cholesky_ao_num)) - call dgemm('N','N', cholesky_ao_num, 1, nO*nV, 1.d0, & - cc_space_v_ov_chol, cholesky_ao_num, t1, nO*nV, 0.d0, tmp_k, cholesky_ao_num) + allocate(tmp_k(cholesky_mo_num)) + call dgemm('N','N', cholesky_mo_num, 1, nO*nV, 1.d0, & + cc_space_v_ov_chol, cholesky_mo_num, t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num) - call dgemm('T','N', nV*nV, 1, cholesky_ao_num, 2.d0, & - cc_space_v_vv_chol, cholesky_ao_num, tmp_k, cholesky_ao_num, 1.d0, & + call dgemm('T','N', nV*nV, 1, cholesky_mo_num, 2.d0, & + cc_space_v_vv_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, & g_vir, nV*nV) deallocate(tmp_k) - allocate(tmp_vo(cholesky_ao_num,nV,nO)) - call dgemm('N','T',cholesky_ao_num*nV, nO, nV, 1.d0, & - cc_space_v_vv_chol, cholesky_ao_num*nV, t1, nO, 0.d0, tmp_vo, cholesky_ao_num*nV) + allocate(tmp_vo(cholesky_mo_num,nV,nO)) + call dgemm('N','T',cholesky_mo_num*nV, nO, nV, 1.d0, & + cc_space_v_vv_chol, cholesky_mo_num*nV, t1, nO, 0.d0, tmp_vo, cholesky_mo_num*nV) - allocate(tmp_vo2(cholesky_ao_num,nO,nV)) + allocate(tmp_vo2(cholesky_mo_num,nO,nV)) do beta=1,nV do i=1,nO - do k=1,cholesky_ao_num + do k=1,cholesky_mo_num tmp_vo2(k,i,beta) = -tmp_vo(k,beta,i) enddo enddo @@ -1189,9 +1189,9 @@ subroutine compute_g_vir_chol(nO,nV,t1,t2,H_vv,g_vir) enddo enddo - call dgemm('T','N', nV, nV, nO*cholesky_ao_num, 1.d0, & - cc_space_v_ov_chol, cholesky_ao_num*nO, & - tmp_vo2, cholesky_ao_num*nO, 1.d0, g_vir, nV) + call dgemm('T','N', nV, nV, nO*cholesky_mo_num, 1.d0, & + cc_space_v_ov_chol, cholesky_mo_num*nO, & + tmp_vo2, cholesky_mo_num*nO, 1.d0, g_vir, nV) end @@ -1265,15 +1265,15 @@ subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvoo,J1) deallocate(X_ovoo) double precision, allocatable :: tmp_cc(:,:,:), J1_tmp(:,:,:,:) - allocate(tmp_cc(cholesky_ao_num,nV,nO), J1_tmp(nV,nO,nV,nO)) + allocate(tmp_cc(cholesky_mo_num,nV,nO), J1_tmp(nV,nO,nV,nO)) - call dgemm('N','T', cholesky_ao_num*nV, nO, nV, 1.d0, & - cc_space_v_vv_chol, cholesky_ao_num*nV, & + call dgemm('N','T', cholesky_mo_num*nV, nO, nV, 1.d0, & + cc_space_v_vv_chol, cholesky_mo_num*nV, & t1, nO, & - 0.d0, tmp_cc, cholesky_ao_num*nV) + 0.d0, tmp_cc, cholesky_mo_num*nV) - call dgemm('T','N', nV*nO, nV*nO, cholesky_ao_num, 1.d0, & - tmp_cc, cholesky_ao_num, cc_space_v_vo_chol, cholesky_ao_num, & + call dgemm('T','N', nV*nO, nV*nO, cholesky_mo_num, 1.d0, & + tmp_cc, cholesky_mo_num, cc_space_v_vo_chol, cholesky_mo_num, & 0.d0, J1_tmp, nV*nO) deallocate(tmp_cc) @@ -1464,14 +1464,14 @@ subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,K1) 1d0, K1 , size(K1,1) * size(K1,2) * size(K1,3)) double precision, allocatable :: K1tmp(:,:,:,:), t1v(:,:,:) - allocate(K1tmp(nO,nO,nV,nV), t1v(cholesky_ao_num,nO,nO)) + allocate(K1tmp(nO,nO,nV,nV), t1v(cholesky_mo_num,nO,nO)) - call dgemm('N','T', cholesky_ao_num*nO, nO, nV, 1.d0, & - cc_space_v_ov_chol, cholesky_ao_num*nO, t1, nO, 0.d0, & - t1v, cholesky_ao_num*nO) + call dgemm('N','T', cholesky_mo_num*nO, nO, nV, 1.d0, & + cc_space_v_ov_chol, cholesky_mo_num*nO, t1, nO, 0.d0, & + t1v, cholesky_mo_num*nO) - call dgemm('T','N', nO*nO, nV*nV, cholesky_ao_num, 1.d0, & - t1v, cholesky_ao_num, cc_space_v_vv_chol, cholesky_ao_num, 0.d0, & + call dgemm('T','N', nO*nO, nV*nV, cholesky_mo_num, 1.d0, & + t1v, cholesky_mo_num, cc_space_v_vv_chol, cholesky_mo_num, 0.d0, & K1tmp, nO*nO) deallocate(t1v) diff --git a/src/mo_two_e_ints/cholesky.irp.f b/src/mo_two_e_ints/cholesky.irp.f index 3a868cbe..349f13b9 100644 --- a/src/mo_two_e_ints/cholesky.irp.f +++ b/src/mo_two_e_ints/cholesky.irp.f @@ -1,4 +1,12 @@ -BEGIN_PROVIDER [ double precision, cholesky_mo, (mo_num, mo_num, cholesky_ao_num) ] +BEGIN_PROVIDER [ integer, cholesky_mo_num ] + implicit none + BEGIN_DOC + ! Number of Cholesky vectors in MO basis + END_DOC + cholesky_mo_num = cholesky_ao_num +END_PROVIDER + +BEGIN_PROVIDER [ double precision, cholesky_mo, (mo_num, mo_num, cholesky_mo_num) ] implicit none BEGIN_DOC ! Cholesky vectors in MO basis @@ -8,7 +16,7 @@ BEGIN_PROVIDER [ double precision, cholesky_mo, (mo_num, mo_num, cholesky_ao_num call set_multiple_levels_omp(.False.) !$OMP PARALLEL DO PRIVATE(k) - do k=1,cholesky_ao_num + do k=1,cholesky_mo_num do j=1,mo_num do i=1,mo_num cholesky_mo(i,j,k) = cholesky_mo_transp(k,i,j) @@ -19,7 +27,7 @@ BEGIN_PROVIDER [ double precision, cholesky_mo, (mo_num, mo_num, cholesky_ao_num END_PROVIDER -BEGIN_PROVIDER [ double precision, cholesky_mo_transp, (cholesky_ao_num, mo_num, mo_num) ] +BEGIN_PROVIDER [ double precision, cholesky_mo_transp, (cholesky_mo_num, mo_num, mo_num) ] implicit none BEGIN_DOC ! Cholesky vectors in MO basis @@ -29,14 +37,14 @@ BEGIN_PROVIDER [ double precision, cholesky_mo_transp, (cholesky_ao_num, mo_num, integer :: ierr print *, 'AO->MO Transformation of Cholesky vectors' - allocate(X(mo_num,cholesky_ao_num,ao_num), stat=ierr) + allocate(X(mo_num,cholesky_mo_num,ao_num), stat=ierr) if (ierr /= 0) then print *, irp_here, ': Allocation failed' endif - call dgemm('T','N', ao_num*cholesky_ao_num, mo_num, ao_num, 1.d0, & - cholesky_ao, ao_num, mo_coef, ao_num, 0.d0, X, ao_num*cholesky_ao_num) - call dgemm('T','N', cholesky_ao_num*mo_num, mo_num, ao_num, 1.d0, & - X, ao_num, mo_coef, ao_num, 0.d0, cholesky_mo_transp, cholesky_ao_num*mo_num) + call dgemm('T','N', ao_num*cholesky_mo_num, mo_num, ao_num, 1.d0, & + cholesky_ao, ao_num, mo_coef, ao_num, 0.d0, X, ao_num*cholesky_mo_num) + call dgemm('T','N', cholesky_mo_num*mo_num, mo_num, ao_num, 1.d0, & + X, ao_num, mo_coef, ao_num, 0.d0, cholesky_mo_transp, cholesky_mo_num*mo_num) deallocate(X) END_PROVIDER diff --git a/src/mo_two_e_ints/integrals_3_index.irp.f b/src/mo_two_e_ints/integrals_3_index.irp.f index d807f619..eb05da84 100644 --- a/src/mo_two_e_ints/integrals_3_index.irp.f +++ b/src/mo_two_e_ints/integrals_3_index.irp.f @@ -13,14 +13,14 @@ if (do_ao_cholesky) then double precision, allocatable :: buffer_jj(:,:), buffer(:,:,:) - allocate(buffer_jj(cholesky_ao_num,mo_num), buffer(mo_num,mo_num,mo_num)) + allocate(buffer_jj(cholesky_mo_num,mo_num), buffer(mo_num,mo_num,mo_num)) do j=1,mo_num buffer_jj(:,j) = cholesky_mo_transp(:,j,j) enddo - call dgemm('T','N', mo_num*mo_num,mo_num,cholesky_ao_num, 1.d0, & - cholesky_mo_transp, cholesky_ao_num, & - buffer_jj, cholesky_ao_num, 0.d0, & + call dgemm('T','N', mo_num*mo_num,mo_num,cholesky_mo_num, 1.d0, & + cholesky_mo_transp, cholesky_mo_num, & + buffer_jj, cholesky_mo_num, 0.d0, & buffer, mo_num*mo_num) do k = 1, mo_num @@ -36,9 +36,9 @@ do j = 1, mo_num - call dgemm('T','N',mo_num,mo_num,cholesky_ao_num, 1.d0, & - cholesky_mo_transp(1,1,j), cholesky_ao_num, & - cholesky_mo_transp(1,1,j), cholesky_ao_num, 0.d0, & + call dgemm('T','N',mo_num,mo_num,cholesky_mo_num, 1.d0, & + cholesky_mo_transp(1,1,j), cholesky_mo_num, & + cholesky_mo_transp(1,1,j), cholesky_mo_num, 0.d0, & buffer_jj, mo_num) do k=1,mo_num diff --git a/src/mo_two_e_ints/mo_bi_integrals.irp.f b/src/mo_two_e_ints/mo_bi_integrals.irp.f index 0d3fe176..0e77b6a2 100644 --- a/src/mo_two_e_ints/mo_bi_integrals.irp.f +++ b/src/mo_two_e_ints/mo_bi_integrals.irp.f @@ -479,9 +479,9 @@ subroutine add_integrals_to_map_cholesky !$OMP DO SCHEDULE(dynamic) do l=1,mo_num - call dgemm('T','N',mo_num*mo_num,mo_num,cholesky_ao_num,1.d0, & - cholesky_mo_transp, cholesky_ao_num, & - cholesky_mo_transp(1,1,l), cholesky_ao_num, 0.d0, & + call dgemm('T','N',mo_num*mo_num,mo_num,cholesky_mo_num,1.d0, & + cholesky_mo_transp, cholesky_mo_num, & + cholesky_mo_transp(1,1,l), cholesky_mo_num, 0.d0, & Vtmp, mo_num*mo_num) do k=1,l @@ -1364,20 +1364,20 @@ END_PROVIDER if (do_ao_cholesky) then double precision, allocatable :: buffer(:,:) - allocate (buffer(cholesky_ao_num,mo_num)) - do k=1,cholesky_ao_num + allocate (buffer(cholesky_mo_num,mo_num)) + do k=1,cholesky_mo_num do i=1,mo_num buffer(k,i) = cholesky_mo_transp(k,i,i) enddo enddo - call dgemm('T','N',mo_num,mo_num,cholesky_ao_num,1.d0, & - buffer, cholesky_ao_num, buffer, cholesky_ao_num, 0.d0, mo_two_e_integrals_jj, mo_num) + call dgemm('T','N',mo_num,mo_num,cholesky_mo_num,1.d0, & + buffer, cholesky_mo_num, buffer, cholesky_mo_num, 0.d0, mo_two_e_integrals_jj, mo_num) deallocate(buffer) do j=1,mo_num do i=1,mo_num mo_two_e_integrals_jj_exchange(i,j) = 0.d0 - do k=1,cholesky_ao_num + do k=1,cholesky_mo_num mo_two_e_integrals_jj_exchange(i,j) = mo_two_e_integrals_jj_exchange(i,j) + & cholesky_mo_transp(k,i,j)*cholesky_mo_transp(k,j,i) enddo diff --git a/src/utils_cc/mo_integrals_cc.irp.f b/src/utils_cc/mo_integrals_cc.irp.f index a68ab8de..b2b68d05 100644 --- a/src/utils_cc/mo_integrals_cc.irp.f +++ b/src/utils_cc/mo_integrals_cc.irp.f @@ -50,15 +50,15 @@ subroutine gen_v_space(n1,n2,n3,n4,list1,list2,list3,list4,v) if (do_ao_cholesky) then double precision, allocatable :: buffer(:,:,:,:) double precision, allocatable :: v1(:,:,:), v2(:,:,:) - allocate(v1(cholesky_ao_num,n1,n3), v2(cholesky_ao_num,n2,n4)) + allocate(v1(cholesky_mo_num,n1,n3), v2(cholesky_mo_num,n2,n4)) allocate(buffer(n1,n3,n2,n4)) - call gen_v_space_chol(n1,n3,list1,list3,v1,cholesky_ao_num) - call gen_v_space_chol(n2,n4,list2,list4,v2,cholesky_ao_num) + call gen_v_space_chol(n1,n3,list1,list3,v1,cholesky_mo_num) + call gen_v_space_chol(n2,n4,list2,list4,v2,cholesky_mo_num) - call dgemm('T','N', n1*n3, n2*n4, cholesky_ao_num, 1.d0, & - v1, cholesky_ao_num, & - v2, cholesky_ao_num, 0.d0, buffer, n1*n3) + call dgemm('T','N', n1*n3, n2*n4, cholesky_mo_num, 1.d0, & + v1, cholesky_mo_num, & + v2, cholesky_mo_num, 0.d0, buffer, n1*n3) deallocate(v1,v2) @@ -119,7 +119,7 @@ subroutine gen_v_space_chol(n1,n3,list1,list3,v,ldv) idx3 = list3(i3) do i1=1,n1 idx1 = list1(i1) - do k=1,cholesky_ao_num + do k=1,cholesky_mo_num v(k,i1,i3) = cholesky_mo_transp(k,idx1,idx3) enddo enddo @@ -137,15 +137,15 @@ BEGIN_PROVIDER [double precision, cc_space_v, (mo_num,mo_num,mo_num,mo_num)] double precision, allocatable :: buffer(:,:,:) call set_multiple_levels_omp(.False.) !$OMP PARALLEL & - !$OMP SHARED(cc_space_v,mo_num,cholesky_mo_transp,cholesky_ao_num) & + !$OMP SHARED(cc_space_v,mo_num,cholesky_mo_transp,cholesky_mo_num) & !$OMP PRIVATE(i1,i2,i3,i4,k,buffer)& !$OMP DEFAULT(NONE) allocate(buffer(mo_num,mo_num,mo_num)) !$OMP DO do i4 = 1, mo_num - call dgemm('T','N', mo_num*mo_num, mo_num, cholesky_ao_num, 1.d0, & - cholesky_mo_transp, cholesky_ao_num, & - cholesky_mo_transp(1,1,i4), cholesky_ao_num, 0.d0, buffer, mo_num*mo_num) + call dgemm('T','N', mo_num*mo_num, mo_num, cholesky_mo_num, 1.d0, & + cholesky_mo_transp, cholesky_mo_num, & + cholesky_mo_transp(1,1,i4), cholesky_mo_num, 0.d0, buffer, mo_num*mo_num) do i2 = 1, mo_num do i3 = 1, mo_num do i1 = 1, mo_num @@ -203,9 +203,9 @@ BEGIN_PROVIDER [double precision, cc_space_v_oooo, (cc_nOa, cc_nOa, cc_nOa, cc_n double precision, allocatable :: buffer(:,:,:,:) allocate(buffer(n1,n3,n2,n4)) - call dgemm('T','N', n1*n3, n2*n4, cholesky_ao_num, 1.d0, & - cc_space_v_oo_chol, cholesky_ao_num, & - cc_space_v_oo_chol, cholesky_ao_num, 0.d0, buffer, n1*n3) + call dgemm('T','N', n1*n3, n2*n4, cholesky_mo_num, 1.d0, & + cc_space_v_oo_chol, cholesky_mo_num, & + cc_space_v_oo_chol, cholesky_mo_num, 0.d0, buffer, n1*n3) !$OMP PARALLEL DO PRIVATE(i1,i2,i3,i4) COLLAPSE(2) do i4 = 1, n4 @@ -246,9 +246,9 @@ BEGIN_PROVIDER [double precision, cc_space_v_vooo, (cc_nVa, cc_nOa, cc_nOa, cc_n double precision, allocatable :: buffer(:,:,:,:) allocate(buffer(n1,n3,n2,n4)) - call dgemm('T','N', n1*n3, n2*n4, cholesky_ao_num, 1.d0, & - cc_space_v_vo_chol, cholesky_ao_num, & - cc_space_v_oo_chol, cholesky_ao_num, 0.d0, buffer, n1*n3) + call dgemm('T','N', n1*n3, n2*n4, cholesky_mo_num, 1.d0, & + cc_space_v_vo_chol, cholesky_mo_num, & + cc_space_v_oo_chol, cholesky_mo_num, 0.d0, buffer, n1*n3) !$OMP PARALLEL DO PRIVATE(i1,i2,i3,i4) COLLAPSE(2) do i4 = 1, n4 @@ -392,9 +392,9 @@ BEGIN_PROVIDER [double precision, cc_space_v_vvoo, (cc_nVa, cc_nVa, cc_nOa, cc_n double precision, allocatable :: buffer(:,:,:,:) allocate(buffer(n1,n3,n2,n4)) - call dgemm('T','N', n1*n3, n2*n4, cholesky_ao_num, 1.d0, & - cc_space_v_vo_chol, cholesky_ao_num, & - cc_space_v_vo_chol, cholesky_ao_num, 0.d0, buffer, n1*n3) + call dgemm('T','N', n1*n3, n2*n4, cholesky_mo_num, 1.d0, & + cc_space_v_vo_chol, cholesky_mo_num, & + cc_space_v_vo_chol, cholesky_mo_num, 0.d0, buffer, n1*n3) !$OMP PARALLEL DO PRIVATE(i1,i2,i3,i4) COLLAPSE(2) do i4 = 1, n4 @@ -435,9 +435,9 @@ BEGIN_PROVIDER [double precision, cc_space_v_vovo, (cc_nVa, cc_nOa, cc_nVa, cc_n double precision, allocatable :: buffer(:,:,:,:) allocate(buffer(n1,n3,n2,n4)) - call dgemm('T','N', n1*n3, n2*n4, cholesky_ao_num, 1.d0, & - cc_space_v_vv_chol, cholesky_ao_num, & - cc_space_v_oo_chol, cholesky_ao_num, 0.d0, buffer, n1*n3) + call dgemm('T','N', n1*n3, n2*n4, cholesky_mo_num, 1.d0, & + cc_space_v_vv_chol, cholesky_mo_num, & + cc_space_v_oo_chol, cholesky_mo_num, 0.d0, buffer, n1*n3) !$OMP PARALLEL DO PRIVATE(i1,i2,i3,i4) COLLAPSE(2) do i4 = 1, n4 @@ -645,35 +645,35 @@ BEGIN_PROVIDER [double precision, cc_space_v_vvvv, (cc_nVa, cc_nVa, cc_nVa, cc_n END_PROVIDER -BEGIN_PROVIDER [double precision, cc_space_v_vv_chol, (cholesky_ao_num, cc_nVa, cc_nVa)] +BEGIN_PROVIDER [double precision, cc_space_v_vv_chol, (cholesky_mo_num, cc_nVa, cc_nVa)] implicit none - call gen_v_space_chol(cc_nVa, cc_nVa, cc_list_vir, cc_list_vir, cc_space_v_vv_chol, cholesky_ao_num) + call gen_v_space_chol(cc_nVa, cc_nVa, cc_list_vir, cc_list_vir, cc_space_v_vv_chol, cholesky_mo_num) END_PROVIDER -BEGIN_PROVIDER [double precision, cc_space_v_vo_chol, (cholesky_ao_num, cc_nVa, cc_nOa)] +BEGIN_PROVIDER [double precision, cc_space_v_vo_chol, (cholesky_mo_num, cc_nVa, cc_nOa)] implicit none - call gen_v_space_chol(cc_nVa, cc_nOa, cc_list_vir, cc_list_occ, cc_space_v_vo_chol, cholesky_ao_num) + call gen_v_space_chol(cc_nVa, cc_nOa, cc_list_vir, cc_list_occ, cc_space_v_vo_chol, cholesky_mo_num) END_PROVIDER -BEGIN_PROVIDER [double precision, cc_space_v_ov_chol, (cholesky_ao_num, cc_nOa, cc_nVa)] +BEGIN_PROVIDER [double precision, cc_space_v_ov_chol, (cholesky_mo_num, cc_nOa, cc_nVa)] implicit none - call gen_v_space_chol(cc_nOa, cc_nVa, cc_list_occ, cc_list_vir, cc_space_v_ov_chol, cholesky_ao_num) + call gen_v_space_chol(cc_nOa, cc_nVa, cc_list_occ, cc_list_vir, cc_space_v_ov_chol, cholesky_mo_num) END_PROVIDER -BEGIN_PROVIDER [double precision, cc_space_v_oo_chol, (cholesky_ao_num, cc_nOa, cc_nOa)] +BEGIN_PROVIDER [double precision, cc_space_v_oo_chol, (cholesky_mo_num, cc_nOa, cc_nOa)] implicit none - call gen_v_space_chol(cc_nOa, cc_nOa, cc_list_occ, cc_list_occ, cc_space_v_oo_chol, cholesky_ao_num) + call gen_v_space_chol(cc_nOa, cc_nOa, cc_list_occ, cc_list_occ, cc_space_v_oo_chol, cholesky_mo_num) END_PROVIDER From fba2fefb1943cff26ba7c18d1ee92448b8482b3a Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Wed, 12 Jul 2023 18:33:18 +0200 Subject: [PATCH 36/40] Moved loop --- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index 5969928a..fc5da8c0 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -501,6 +501,13 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) allocate(B1(nV,nV,block_size), tmpB1(nV,block_size,nV), tmp_cc2(cholesky_mo_num,nV)) !$OMP DO do gam = 1, nV + + do a=1,nV + do k=1,cholesky_mo_num + tmp_cc2(k,a) = cc_space_v_vv_chol(k,a,gam) - tmp_cc(k,a,gam) + enddo + enddo + do iblock = 1, nV, block_size call dgemm('T', 'N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, & @@ -508,12 +515,6 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) cc_space_v_vv_chol(1,1,gam), cholesky_mo_num, & 0.d0, tmpB1, nV*block_size) - do a=1,nV - do k=1,cholesky_mo_num - tmp_cc2(k,a) = cc_space_v_vv_chol(k,a,gam) - tmp_cc(k,a,gam) - enddo - enddo - call dgemm('T','N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, 1.d0, & cc_space_v_vv_chol(1,1,iblock), cholesky_mo_num, & tmp_cc2, cholesky_mo_num, & From 4ad77651276305fdfbed5648a57bb9965dab636b Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Wed, 12 Jul 2023 18:56:13 +0200 Subject: [PATCH 37/40] Minor changes --- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index fc5da8c0..0ba46e56 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -457,8 +457,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) integer :: u,v,i,j,beta,gam,a,b double precision :: max_r2_local - ! Residual - !r2 = 0d0 + call set_multiple_levels_omp(.False.) !$omp parallel & !$omp shared(nO,nV,r2,cc_space_v_oovv) & From 467f7563797c4c32fc9597542b9d761309e3565c Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Sun, 16 Jul 2023 20:04:17 +0200 Subject: [PATCH 38/40] Optimized A1 in CCSD --- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 48 ++++---------------------- 1 file changed, 6 insertions(+), 42 deletions(-) diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index 0ba46e56..ec6c2afb 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -1023,56 +1023,26 @@ subroutine compute_A1_chol(nO,nV,t1,t2,tau,A1) integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta - double precision, allocatable :: X_vooo(:,:,:,:), Y_oooo(:,:,:,:) - allocate(X_vooo(nV,nO,nO,nO), Y_oooo(nO,nO,nO,nO)) + double precision, allocatable :: Y_oooo(:,:,:,:) + allocate(Y_oooo(nO,nO,nO,nO)) ! A1(u,v,i,j) = cc_space_v_oooo(u,v,i,j) - !$omp parallel & - !$omp shared(nO,nV,A1,cc_space_v_oooo,cc_space_v_ovoo,X_vooo) & - !$omp private(u,v,i,j) & - !$omp default(none) - !$omp do collapse(2) - do j = 1, nO - do i = 1, nO - do v = 1, nO - do u = 1, nO - A1(u,v,i,j) = cc_space_v_oooo(u,v,i,j) - enddo - enddo - enddo - enddo - !$omp end do nowait - ! A1(u,v,i,j) += cc_space_v_ovoo(u,a,i,j) * t1(v,a) & - !$omp do collapse(2) - do j = 1, nO - do i = 1, nO - do u = 1, nO - do a = 1, nV - X_vooo(a,u,i,j) = cc_space_v_ovoo(u,a,i,j) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - call dgemm('N','N', nO, nO*nO*nO, nV, & 1d0, t1 , size(t1,1), & - X_vooo, size(X_vooo,1), & + cc_space_v_vooo, size(cc_space_v_vooo,1), & 0d0, Y_oooo, size(Y_oooo,1)) !$omp parallel & - !$omp shared(nO,nV,A1,Y_oooo) & !$omp private(u,v,i,j) & - !$omp default(none) + !$omp default(shared) !$omp do collapse(2) do j = 1, nO do i = 1, nO do v = 1, nO do u = 1, nO - A1(u,v,i,j) = A1(u,v,i,j) + Y_oooo(v,u,i,j) + A1(u,v,i,j) = cc_space_v_oooo(u,v,i,j) + Y_oooo(v,u,j,i) + Y_oooo(u,v,i,j) enddo enddo enddo @@ -1080,13 +1050,7 @@ subroutine compute_A1_chol(nO,nV,t1,t2,tau,A1) !$omp end do !$omp end parallel - deallocate(X_vooo,Y_oooo) - - ! A1(u,v,i,j) += cc_space_v_vooo(a,v,i,j) * t1(u,a) - call dgemm('N','N', nO, nO*nO*nO, nV, & - 1d0, t1 , size(t1,1), & - cc_space_v_vooo, size(cc_space_v_vooo,1), & - 1d0, A1 , size(A1,1)) + deallocate(Y_oooo) ! A1(u,v,i,j) += cc_space_v_vvoo(a,b,i,j) * tau(u,v,a,b) call dgemm('N','N', nO*nO, nO*nO, nV*nV, & From bd570b19c1d4ae0479b8ff4c4611ae1127605441 Mon Sep 17 00:00:00 2001 From: ydamour Date: Mon, 17 Jul 2023 17:05:48 +0200 Subject: [PATCH 39/40] fix bug restore_symmetry --- src/utils/linear_algebra.irp.f | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/utils/linear_algebra.irp.f b/src/utils/linear_algebra.irp.f index 65c57a76..314ad4f6 100644 --- a/src/utils/linear_algebra.irp.f +++ b/src/utils/linear_algebra.irp.f @@ -1661,7 +1661,15 @@ subroutine restore_symmetry(m,n,A,LDA,thresh) ! Update i i = i + 1 enddo - copy(i:) = 0.d0 + + ! To nullify the remaining elements that are below the threshold + if (i == sze) then + if (-copy(i) <= thresh) then + copy(i) = 0d0 + endif + else + copy(i:) = 0.d0 + endif !$OMP PARALLEL if (sze>10000) & !$OMP SHARED(m,sze,copy_sign,copy,key,A,ii,jj) & From cc7b97c09b5f8a970319a3e247551c34401e731c Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Sat, 5 Aug 2023 01:47:48 +0200 Subject: [PATCH 40/40] Cleaning in C --- external/ezfio | 2 +- external/irpf90 | 2 +- src/utils/fortran_mmap.c | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/external/ezfio b/external/ezfio index ed1df9f3..d5805497 160000 --- a/external/ezfio +++ b/external/ezfio @@ -1 +1 @@ -Subproject commit ed1df9f3c1f51752656ca98da5693a4119add05c +Subproject commit d5805497fa0ef30e70e055cde1ecec2963303e93 diff --git a/external/irpf90 b/external/irpf90 index 33ca5e10..0007f72f 160000 --- a/external/irpf90 +++ b/external/irpf90 @@ -1 +1 @@ -Subproject commit 33ca5e1018f3bbb5e695e6ee558f5dac0753b271 +Subproject commit 0007f72f677fe7d61c5e1ed461882cb239517102 diff --git a/src/utils/fortran_mmap.c b/src/utils/fortran_mmap.c index 71426002..e8d85a2f 100644 --- a/src/utils/fortran_mmap.c +++ b/src/utils/fortran_mmap.c @@ -9,7 +9,6 @@ void* mmap_fortran(char* filename, size_t bytes, int* file_descr, int read_only) { - int i; int fd; int result; void* map;