diff --git a/src/ao_two_e_ints/EZFIO.cfg b/src/ao_two_e_ints/EZFIO.cfg index 9f523fca..9c017813 100644 --- a/src/ao_two_e_ints/EZFIO.cfg +++ b/src/ao_two_e_ints/EZFIO.cfg @@ -4,6 +4,12 @@ doc: Read/Write |AO| integrals from/to disk [ Write | Read | None ] interface: ezfio,provider,ocaml default: None +[io_ao_cholesky] +type: Disk_access +doc: Read/Write |AO| integrals from/to disk [ Write | Read | None ] +interface: ezfio,provider,ocaml +default: None + [ao_integrals_threshold] type: Threshold doc: If | (pq|rs) | < `ao_integrals_threshold` then (pq|rs) is zero diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f index 128aa483..8b969174 100644 --- a/src/ao_two_e_ints/cholesky.irp.f +++ b/src/ao_two_e_ints/cholesky.irp.f @@ -14,412 +14,438 @@ BEGIN_PROVIDER [ double precision, cholesky_ao_transp, (cholesky_ao_num, ao_num, END_PROVIDER -BEGIN_PROVIDER [ integer, cholesky_ao_num ] + BEGIN_PROVIDER [ integer, cholesky_ao_num ] &BEGIN_PROVIDER [ double precision, cholesky_ao, (ao_num, ao_num, 1) ] - implicit none - BEGIN_DOC - ! Cholesky vectors in AO basis: (ik|a): - ! = (ik|jl) = sum_a (ik|a).(a|jl) - ! - ! Last dimension of cholesky_ao is cholesky_ao_num - END_DOC - - integer :: rank, ndim - double precision :: tau - double precision, pointer :: L(:,:), L_old(:,:) - - - double precision :: s - double precision, parameter :: dscale = 1.d0 - - double precision, allocatable :: D(:), Delta(:,:), Ltmp_p(:,:), Ltmp_q(:,:) - integer, allocatable :: Lset(:), Dset(:), addr(:,:), LDmap(:), DLmap(:) - integer, allocatable :: Lset_rev(:), Dset_rev(:) - - integer :: i,j,k,m,p,q, qj, dj, p2, q2 - integer :: N, np, nq - - double precision :: Dmax, Dmin, Qmax, f - double precision, external :: get_ao_two_e_integral - logical, external :: ao_two_e_integral_zero - - double precision, external :: ao_two_e_integral - integer :: block_size, iblock, ierr - - integer(omp_lock_kind), allocatable :: lock(:) - - double precision :: rss - double precision, external :: memory_of_double, memory_of_int - - - PROVIDE nucl_coord - - if (.not.do_direct_integrals) then - PROVIDE ao_two_e_integrals_in_map - endif - deallocate(cholesky_ao) - - ndim = ao_num*ao_num - tau = ao_cholesky_threshold - - rss = 6.d0 * memory_of_double(ndim) + & - 6.d0 * memory_of_int(ndim) - call check_mem(rss, irp_here) - - allocate(L(ndim,1)) - - print *, '' - print *, 'Cholesky decomposition of AO integrals' - print *, '======================================' - print *, '' - print *, '============ =============' - print *, ' Rank Threshold' - print *, '============ =============' - - - rank = 0 - - allocate( D(ndim), Lset(ndim), LDmap(ndim), DLmap(ndim), Dset(ndim) ) - allocate( Lset_rev(ndim), Dset_rev(ndim), lock(ndim) ) - allocate( addr(3,ndim) ) - do k=1,ndim - call omp_init_lock(lock(k)) - enddo - - ! 1. - k=0 - do j=1,ao_num - do i=1,ao_num - k = k+1 - addr(1,k) = i - addr(2,k) = j - addr(3,k) = (i-1)*ao_num + j - enddo - enddo - - if (do_direct_integrals) then - !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i) - do i=1,ndim - D(i) = ao_two_e_integral(addr(1,i), addr(2,i), & - addr(1,i), addr(2,i)) - enddo - !$OMP END PARALLEL DO - else - !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i) SCHEDULE(guided) - do i=1,ndim - D(i) = get_ao_two_e_integral(addr(1,i), addr(1,i), & - addr(2,i), addr(2,i), & - ao_integrals_map) - enddo - !$OMP END PARALLEL DO - endif - - Dmax = maxval(D) - - ! 2. - np=0 - Lset_rev = 0 - do p=1,ndim - if ( dscale*dscale*Dmax*D(p) > tau*tau ) then - np = np+1 - Lset(np) = p - Lset_rev(p) = np - endif - enddo - - ! 3. - N = 0 - - ! 4. - i = 0 - - ! 5. - do while ( (Dmax > tau).and.(rank < ndim) ) - ! a. - i = i+1 - - s = 0.1d0 - - ! Inrease s until the arrays fit in memory - do - - ! b. - Dmin = max(s*Dmax,tau) - - ! c. - nq=0 - LDmap = 0 - DLmap = 0 - Dset_rev = 0 - do p=1,np - if ( D(Lset(p)) > Dmin ) then - nq = nq+1 - Dset(nq) = Lset(p) - Dset_rev(Dset(nq)) = nq - LDmap(p) = nq - DLmap(nq) = p - endif - enddo - - call resident_memory(rss) - rss = rss & - + np*memory_of_double(nq) & ! Delta(np,nq) - + (rank+nq)* memory_of_double(ndim) & ! L(ndim,rank+nq) - + (np+nq)*memory_of_double(block_size) ! Ltmp_p(np,block_size) - ! Ltmp_q(nq,block_size) - - if (rss > qp_max_mem) then - s = s*2.d0 - else - exit - endif - - if ((s > 1.d0).or.(nq == 0)) then - print *, 'Not enough memory. Reduce cholesky threshold' - stop -1 - endif - - enddo - - ! d., e. - block_size = max(N,24) - - L_old => L - allocate(L(ndim,rank+nq), stat=ierr) - if (ierr /= 0) then - print *, irp_here, ': allocation failed : (L(ndim,rank+nq))' - stop -1 - endif - - !$OMP PARALLEL DO PRIVATE(k) - do k=1,rank - L(:,k) = L_old(:,k) - enddo - !$OMP END PARALLEL DO - - deallocate(L_old) - - allocate(Delta(np,nq), stat=ierr) - if (ierr /= 0) then - print *, irp_here, ': allocation failed : (Delta(np,nq))' - stop -1 - endif - - allocate(Ltmp_p(np,block_size), stat=ierr) - if (ierr /= 0) then - print *, irp_here, ': allocation failed : (Ltmp_p(np,block_size))' - stop -1 - endif - - allocate(Ltmp_q(nq,block_size), stat=ierr) - if (ierr /= 0) then - print *, irp_here, ': allocation failed : (Ltmp_q(nq,block_size))' - stop -1 - endif - - - !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(m,k,p,q,j) - - !$OMP DO - do q=1,nq - Delta(:,q) = 0.d0 - enddo - !$OMP ENDDO NOWAIT - - !$OMP DO - do k=1,N - do p=1,np - Ltmp_p(p,k) = L(Lset(p),k) - enddo - do q=1,nq - Ltmp_q(q,k) = L(Dset(q),k) - enddo - enddo - !$OMP END DO NOWAIT - - !$OMP BARRIER - - !$OMP DO SCHEDULE(guided) - do m=1,nq - - call omp_set_lock(lock(m)) - do k=1,np - ! Apply only to (k,m) pairs where k is not in Dset - if (LDmap(k) /= 0) cycle - q = Lset_rev(addr(3,Lset(k))) - if ((0 < q).and.(q < k)) cycle - if (.not.ao_two_e_integral_zero( addr(1,Lset(k)), addr(1,Dset(m)), & - addr(2,Lset(k)), addr(2,Dset(m)) ) ) then - if (do_direct_integrals) then - Delta(k,m) = ao_two_e_integral(addr(1,Lset(k)), addr(2,Lset(k)), & - addr(1,Dset(m)), addr(2,Dset(m))) - else - Delta(k,m) = get_ao_two_e_integral( addr(1,Lset(k)), addr(1,Dset(m)), & - addr(2,Lset(k)), addr(2,Dset(m)), ao_integrals_map) + implicit none + BEGIN_DOC + ! Cholesky vectors in AO basis: (ik|a): + ! = (ik|jl) = sum_a (ik|a).(a|jl) + ! + ! Last dimension of cholesky_ao is cholesky_ao_num + END_DOC + + integer :: rank, ndim + double precision :: tau + double precision, pointer :: L(:,:), L_old(:,:) + + + double precision :: s + double precision, parameter :: dscale = 1.d0 + + double precision, allocatable :: D(:), Delta(:,:), Ltmp_p(:,:), Ltmp_q(:,:) + integer, allocatable :: Lset(:), Dset(:), addr(:,:), LDmap(:), DLmap(:) + integer, allocatable :: Lset_rev(:), Dset_rev(:) + + integer :: i,j,k,m,p,q, qj, dj, p2, q2 + integer :: N, np, nq + + double precision :: Dmax, Dmin, Qmax, f + double precision, external :: get_ao_two_e_integral + logical, external :: ao_two_e_integral_zero + + double precision, external :: ao_two_e_integral + integer :: block_size, iblock, ierr + + integer(omp_lock_kind), allocatable :: lock(:) + + double precision :: rss + double precision, external :: memory_of_double, memory_of_int + + integer, external :: getUnitAndOpen + integer :: iunit + + ndim = ao_num*ao_num + deallocate(cholesky_ao) + + if (read_ao_cholesky) then + print *, 'Reading Cholesky vectors from disk...' + iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_ao', 'R') + read(iunit) rank + allocate(cholesky_ao(ao_num,ao_num,rank), stat=ierr) + read(iunit) cholesky_ao + close(iunit) + cholesky_ao_num = rank + + else + + PROVIDE nucl_coord + + if (.not.do_direct_integrals) then + PROVIDE ao_two_e_integrals_in_map + endif + + tau = ao_cholesky_threshold + + rss = 6.d0 * memory_of_double(ndim) + & + 6.d0 * memory_of_int(ndim) + call check_mem(rss, irp_here) + + allocate(L(ndim,1)) + + print *, '' + print *, 'Cholesky decomposition of AO integrals' + print *, '======================================' + print *, '' + print *, '============ =============' + print *, ' Rank Threshold' + print *, '============ =============' + + + rank = 0 + + allocate( D(ndim), Lset(ndim), LDmap(ndim), DLmap(ndim), Dset(ndim) ) + allocate( Lset_rev(ndim), Dset_rev(ndim), lock(ndim) ) + allocate( addr(3,ndim) ) + do k=1,ndim + call omp_init_lock(lock(k)) + enddo + + ! 1. + k=0 + do j=1,ao_num + do i=1,ao_num + k = k+1 + addr(1,k) = i + addr(2,k) = j + addr(3,k) = (i-1)*ao_num + j + enddo + enddo + + if (do_direct_integrals) then + !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i) + do i=1,ndim + D(i) = ao_two_e_integral(addr(1,i), addr(2,i), & + addr(1,i), addr(2,i)) + enddo + !$OMP END PARALLEL DO + else + !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i) SCHEDULE(guided) + do i=1,ndim + D(i) = get_ao_two_e_integral(addr(1,i), addr(1,i), & + addr(2,i), addr(2,i), & + ao_integrals_map) + enddo + !$OMP END PARALLEL DO + endif + + Dmax = maxval(D) + + ! 2. + np=0 + Lset_rev = 0 + do p=1,ndim + if ( dscale*dscale*Dmax*D(p) > tau*tau ) then + np = np+1 + Lset(np) = p + Lset_rev(p) = np + endif + enddo + + ! 3. + N = 0 + + ! 4. + i = 0 + + ! 5. + do while ( (Dmax > tau).and.(rank < ndim) ) + ! a. + i = i+1 + + s = 0.1d0 + + ! Inrease s until the arrays fit in memory + do while (.True.) + + ! b. + Dmin = max(s*Dmax,tau) + + ! c. + nq=0 + LDmap = 0 + DLmap = 0 + Dset_rev = 0 + do p=1,np + if ( D(Lset(p)) > Dmin ) then + nq = nq+1 + Dset(nq) = Lset(p) + Dset_rev(Dset(nq)) = nq + LDmap(p) = nq + DLmap(nq) = p endif - if (q /= 0) Delta(q,m) = Delta(k,m) - endif - enddo - - j = Dset_rev(addr(3,Dset(m))) - if ((0 < j).and.(j < m)) then - call omp_unset_lock(lock(m)) - cycle - endif - - if ((j /= m).and.(j /= 0)) then - call omp_set_lock(lock(j)) - endif - do k=1,nq - ! Apply only to (k,m) pairs both in Dset - p = DLmap(k) - q = Lset_rev(addr(3,Dset(k))) - if ((0 < q).and.(q < p)) cycle - if (.not.ao_two_e_integral_zero( addr(1,Dset(k)), addr(1,Dset(m)), & - addr(2,Dset(k)), addr(2,Dset(m)) ) ) then - if (do_direct_integrals) then - Delta(p,m) = ao_two_e_integral(addr(1,Dset(k)), addr(2,Dset(k)), & - addr(1,Dset(m)), addr(2,Dset(m))) - else - Delta(p,m) = get_ao_two_e_integral( addr(1,Dset(k)), addr(1,Dset(m)), & - addr(2,Dset(k)), addr(2,Dset(m)), ao_integrals_map) - endif - if (q /= 0) Delta(q,m) = Delta(p,m) - if (j /= 0) Delta(p,j) = Delta(p,m) - if (q*j /= 0) Delta(q,j) = Delta(p,m) - endif - enddo - call omp_unset_lock(lock(m)) - if ((j /= m).and.(j /= 0)) then - call omp_unset_lock(lock(j)) - endif - enddo - !$OMP END DO - - !$OMP END PARALLEL - - if (N>0) then - call dgemm('N','T', np, nq, N, -1.d0, & - Ltmp_p, np, Ltmp_q, nq, 1.d0, Delta, np) - endif - - ! f. - Qmax = D(Dset(1)) - do q=1,nq - Qmax = max(Qmax, D(Dset(q))) - enddo - - ! g. - - iblock = 0 - do j=1,nq - - if ( (Qmax <= Dmin).or.(N+j > ndim) ) exit - ! i. - rank = N+j - - if (iblock == block_size) then - call dgemm('N','T',np,nq,block_size,-1.d0, & - Ltmp_p, np, Ltmp_q, nq, 1.d0, Delta, np) - iblock = 0 - endif - - ! ii. - do dj=1,nq - qj = Dset(dj) - if (D(qj) == Qmax) then - exit - endif - enddo - - L(1:ndim, rank) = 0.d0 - - iblock = iblock+1 - do p=1,np - Ltmp_p(p,iblock) = Delta(p,dj) - enddo - - ! iv. - if (iblock > 1) then - call dgemv('N', np, iblock-1, -1.d0, Ltmp_p, np, Ltmp_q(dj,1), nq, 1.d0, & - Ltmp_p(1,iblock), 1) - endif - - ! iii. - f = 1.d0/dsqrt(Qmax) - - !$OMP PARALLEL PRIVATE(m,p,q,k) DEFAULT(shared) - !$OMP DO - do p=1,np - Ltmp_p(p,iblock) = Ltmp_p(p,iblock) * f - L(Lset(p), rank) = Ltmp_p(p,iblock) - D(Lset(p)) = D(Lset(p)) - Ltmp_p(p,iblock) * Ltmp_p(p,iblock) - enddo - !$OMP END DO - - !$OMP DO - do q=1,nq - Ltmp_q(q,iblock) = L(Dset(q), rank) - enddo - !$OMP END DO - - !$OMP END PARALLEL - - Qmax = D(Dset(1)) - do q=1,nq - Qmax = max(Qmax, D(Dset(q))) - enddo - - enddo - - print '(I10, 4X, ES12.3)', rank, Qmax - - deallocate(Delta, stat=ierr) - deallocate(Ltmp_p, stat=ierr) - deallocate(Ltmp_q, stat=ierr) - - ! i. - N = rank - - ! j. - Dmax = D(Lset(1)) - do p=1,np - Dmax = max(Dmax, D(Lset(p))) - enddo - - np=0 - Lset_rev = 0 - do p=1,ndim - if ( dscale*dscale*Dmax*D(p) > tau*tau ) then - np = np+1 - Lset(np) = p - Lset_rev(p) = np - endif - enddo - - enddo - - do k=1,ndim - call omp_destroy_lock(lock(k)) - enddo - - allocate(cholesky_ao(ao_num,ao_num,rank), stat=ierr) - if (ierr /= 0) then - print *, irp_here, ': Allocation failed' - stop -1 - endif - !$OMP PARALLEL DO PRIVATE(k) - do k=1,rank - call dcopy(ndim, L(1,k), 1, cholesky_ao(1,1,k), 1) - enddo - !$OMP END PARALLEL DO - deallocate(L) - cholesky_ao_num = rank - - print *, '============ =============' - print *, '' - print *, 'Rank : ', cholesky_ao_num, '(', 100.d0*dble(cholesky_ao_num)/dble(ao_num*ao_num), ' %)' - print *, '' + enddo + + call resident_memory(rss) + rss = rss & + + np*memory_of_double(nq) &! Delta(np,nq) + + (rank+nq)* memory_of_double(ndim) &! L(ndim,rank+nq) + + (np+nq)*memory_of_double(block_size) ! Ltmp_p(np,block_size) + ! Ltmp_q(nq,block_size) + + if (rss > qp_max_mem) then + s = s*2.d0 + else + exit + endif + + if ((s > 1.d0).or.(nq == 0)) then + print *, 'Not enough memory. Reduce cholesky threshold' + stop -1 + endif + + enddo + + ! d., e. + block_size = max(N,24) + + L_old => L + allocate(L(ndim,rank+nq), stat=ierr) + if (ierr /= 0) then + print *, irp_here, ': allocation failed : (L(ndim,rank+nq))' + stop -1 + endif + + !$OMP PARALLEL DO PRIVATE(k) + do k=1,rank + L(:,k) = L_old(:,k) + enddo + !$OMP END PARALLEL DO + + deallocate(L_old) + + allocate(Delta(np,nq), stat=ierr) + if (ierr /= 0) then + print *, irp_here, ': allocation failed : (Delta(np,nq))' + stop -1 + endif + + allocate(Ltmp_p(np,block_size), stat=ierr) + if (ierr /= 0) then + print *, irp_here, ': allocation failed : (Ltmp_p(np,block_size))' + stop -1 + endif + + allocate(Ltmp_q(nq,block_size), stat=ierr) + if (ierr /= 0) then + print *, irp_here, ': allocation failed : (Ltmp_q(nq,block_size))' + stop -1 + endif + + + !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(m,k,p,q,j) + + !$OMP DO + do q=1,nq + Delta(:,q) = 0.d0 + enddo + !$OMP ENDDO NOWAIT + + !$OMP DO + do k=1,N + do p=1,np + Ltmp_p(p,k) = L(Lset(p),k) + enddo + do q=1,nq + Ltmp_q(q,k) = L(Dset(q),k) + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP BARRIER + + !$OMP DO SCHEDULE(guided) + do m=1,nq + + call omp_set_lock(lock(m)) + do k=1,np + ! Apply only to (k,m) pairs where k is not in Dset + if (LDmap(k) /= 0) cycle + q = Lset_rev(addr(3,Lset(k))) + if ((0 < q).and.(q < k)) cycle + if (.not.ao_two_e_integral_zero( addr(1,Lset(k)), addr(1,Dset(m)),& + addr(2,Lset(k)), addr(2,Dset(m)) ) ) then + if (do_direct_integrals) then + Delta(k,m) = ao_two_e_integral(addr(1,Lset(k)), addr(2,Lset(k)),& + addr(1,Dset(m)), addr(2,Dset(m))) + else + Delta(k,m) = get_ao_two_e_integral( addr(1,Lset(k)), addr(1,Dset(m)),& + addr(2,Lset(k)), addr(2,Dset(m)), ao_integrals_map) + endif + if (q /= 0) Delta(q,m) = Delta(k,m) + endif + enddo + + j = Dset_rev(addr(3,Dset(m))) + if ((0 < j).and.(j < m)) then + call omp_unset_lock(lock(m)) + cycle + endif + + if ((j /= m).and.(j /= 0)) then + call omp_set_lock(lock(j)) + endif + do k=1,nq + ! Apply only to (k,m) pairs both in Dset + p = DLmap(k) + q = Lset_rev(addr(3,Dset(k))) + if ((0 < q).and.(q < p)) cycle + if (.not.ao_two_e_integral_zero( addr(1,Dset(k)), addr(1,Dset(m)),& + addr(2,Dset(k)), addr(2,Dset(m)) ) ) then + if (do_direct_integrals) then + Delta(p,m) = ao_two_e_integral(addr(1,Dset(k)), addr(2,Dset(k)),& + addr(1,Dset(m)), addr(2,Dset(m))) + else + Delta(p,m) = get_ao_two_e_integral( addr(1,Dset(k)), addr(1,Dset(m)),& + addr(2,Dset(k)), addr(2,Dset(m)), ao_integrals_map) + endif + if (q /= 0) Delta(q,m) = Delta(p,m) + if (j /= 0) Delta(p,j) = Delta(p,m) + if (q*j /= 0) Delta(q,j) = Delta(p,m) + endif + enddo + call omp_unset_lock(lock(m)) + if ((j /= m).and.(j /= 0)) then + call omp_unset_lock(lock(j)) + endif + enddo + !$OMP END DO + + !$OMP END PARALLEL + + if (N>0) then + call dgemm('N','T', np, nq, N, -1.d0, & + Ltmp_p, np, Ltmp_q, nq, 1.d0, Delta, np) + endif + + ! f. + Qmax = D(Dset(1)) + do q=1,nq + Qmax = max(Qmax, D(Dset(q))) + enddo + + ! g. + + iblock = 0 + do j=1,nq + + if ( (Qmax <= Dmin).or.(N+j > ndim) ) exit + ! i. + rank = N+j + + if (iblock == block_size) then + call dgemm('N','T',np,nq,block_size,-1.d0, & + Ltmp_p, np, Ltmp_q, nq, 1.d0, Delta, np) + iblock = 0 + endif + + ! ii. + do dj=1,nq + qj = Dset(dj) + if (D(qj) == Qmax) then + exit + endif + enddo + + L(1:ndim, rank) = 0.d0 + + iblock = iblock+1 + do p=1,np + Ltmp_p(p,iblock) = Delta(p,dj) + enddo + + ! iv. + if (iblock > 1) then + call dgemv('N', np, iblock-1, -1.d0, Ltmp_p, np, Ltmp_q(dj,1), nq, 1.d0,& + Ltmp_p(1,iblock), 1) + endif + + ! iii. + f = 1.d0/dsqrt(Qmax) + + !$OMP PARALLEL PRIVATE(m,p,q,k) DEFAULT(shared) + !$OMP DO + do p=1,np + Ltmp_p(p,iblock) = Ltmp_p(p,iblock) * f + L(Lset(p), rank) = Ltmp_p(p,iblock) + D(Lset(p)) = D(Lset(p)) - Ltmp_p(p,iblock) * Ltmp_p(p,iblock) + enddo + !$OMP END DO + + !$OMP DO + do q=1,nq + Ltmp_q(q,iblock) = L(Dset(q), rank) + enddo + !$OMP END DO + + !$OMP END PARALLEL + + Qmax = D(Dset(1)) + do q=1,nq + Qmax = max(Qmax, D(Dset(q))) + enddo + + enddo + + print '(I10, 4X, ES12.3)', rank, Qmax + + deallocate(Delta, stat=ierr) + deallocate(Ltmp_p, stat=ierr) + deallocate(Ltmp_q, stat=ierr) + + ! i. + N = rank + + ! j. + Dmax = D(Lset(1)) + do p=1,np + Dmax = max(Dmax, D(Lset(p))) + enddo + + np=0 + Lset_rev = 0 + do p=1,ndim + if ( dscale*dscale*Dmax*D(p) > tau*tau ) then + np = np+1 + Lset(np) = p + Lset_rev(p) = np + endif + enddo + + enddo + + do k=1,ndim + call omp_destroy_lock(lock(k)) + enddo + + allocate(cholesky_ao(ao_num,ao_num,rank), stat=ierr) + if (ierr /= 0) then + print *, irp_here, ': Allocation failed' + stop -1 + endif + !$OMP PARALLEL DO PRIVATE(k) + do k=1,rank + call dcopy(ndim, L(1,k), 1, cholesky_ao(1,1,k), 1) + enddo + !$OMP END PARALLEL DO + deallocate(L) + cholesky_ao_num = rank + + print *, '============ =============' + print *, '' + + if (write_ao_cholesky) then + print *, 'Writing Cholesky vectors to disk...' + iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_ao', 'W') + write(iunit) rank + write(iunit) cholesky_ao + close(iunit) + call ezfio_set_ao_two_e_ints_io_ao_cholesky('Read') + endif + endif + + print *, 'Rank : ', cholesky_ao_num, '(', 100.d0*dble(cholesky_ao_num)/dble(ao_num*ao_num), ' %)' + print *, '' + END_PROVIDER - + diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index e7b115bb..f97514cd 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -49,9 +49,34 @@ subroutine run_ccsd_space_orb allocate(H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO)) if (cc_update_method == 'diis') then - allocate(all_err(nO*nV+nO*nO*nV*nV,cc_diis_depth), all_t(nO*nV+nO*nO*nV*nV,cc_diis_depth)) - all_err = 0d0 - all_t = 0d0 + double precision :: rss, diis_mem, extra_mem + double precision, external :: memory_of_double + diis_mem = 2.d0*memory_of_double(nO*nV)*(1.d0+nO*nV) + call resident_memory(rss) + do while (cc_diis_depth > 1) + if (rss + diis_mem * cc_diis_depth > qp_max_mem) then + cc_diis_depth = cc_diis_depth - 1 + else + exit + endif + end do + if (cc_diis_depth <= 1) then + print *, 'Not enough memory for DIIS' + stop -1 + endif + print *, 'DIIS size ', cc_diis_depth + + allocate(all_err(nO*nV+nO*nO*nV*(nV*1_8),cc_diis_depth), all_t(nO*nV+nO*nO*nV*(nV*1_8),cc_diis_depth)) + !$OMP PARALLEL PRIVATE(i,j) DEFAULT(SHARED) + do j=1,cc_diis_depth + !$OMP DO + do i=1, size(all_err,1) + all_err(i,j) = 0d0 + all_t(i,j) = 0d0 + enddo + !$OMP END DO NOWAIT + enddo + !$OMP END PARALLEL endif if (elec_alpha_num /= elec_beta_num) then @@ -1427,7 +1452,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) !enddo !$omp parallel & - !$omp shared(nO,nV,K1,X_ovov,Z_ovov,t2) & + !$omp shared(nO,nV,K1,X_ovov,Y_ovov,t2) & !$omp private(u,v,gam,beta,i,a) & !$omp default(none) !$omp do @@ -1447,7 +1472,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do v = 1, nO do a = 1, nV do i = 1, nO - Z_ovov(i,a,v,beta) = t2(i,v,beta,a) + Y_ovov(i,a,v,beta) = t2(i,v,beta,a) enddo enddo enddo diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index 99a4e426..1c56996e 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -454,21 +454,8 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) double precision, intent(out) :: r2(nO,nO,nV,nV), max_r2 ! internal - double precision, allocatable :: g_occ(:,:), g_vir(:,:), J1(:,:,:,:), K1(:,:,:,:) - double precision, allocatable :: A1(:,:,:,:) integer :: u,v,i,j,beta,gam,a,b - - allocate(g_occ(nO,nO), g_vir(nV,nV)) - allocate(J1(nO,nV,nV,nO), K1(nO,nV,nO,nV)) - allocate(A1(nO,nO,nO,nO)) - - call compute_g_occ_chol(nO,nV,t1,t2,H_oo,g_occ) - call compute_g_vir_chol(nO,nV,t1,t2,H_vv,g_vir) - call compute_A1_chol(nO,nV,t1,t2,tau,A1) - call compute_J1_chol(nO,nV,t1,t2,cc_space_v_ovvo,cc_space_v_ovoo, & - cc_space_v_vvoo,J1) - call compute_K1_chol(nO,nV,t1,t2,cc_space_v_ovoo,cc_space_v_vvoo, & - cc_space_v_ovov,K1) + double precision :: max_r2_local ! Residual !r2 = 0d0 @@ -490,36 +477,47 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) !$omp end do !$omp end parallel + double precision, allocatable :: A1(:,:,:,:) + allocate(A1(nO,nO,nO,nO)) + call compute_A1_chol(nO,nV,t1,t2,tau,A1) call dgemm('N','N',nO*nO,nV*nV,nO*nO, & 1d0, A1, size(A1,1) * size(A1,2), & tau, size(tau,1) * size(tau,2), & 1d0, r2, size(r2,1) * size(r2,2)) + deallocate(A1) integer :: block_size, iblock, k block_size = 16 double precision, dimension(:,:,:), allocatable :: B1, tmp_cc, tmpB1 + double precision, dimension(:,:), allocatable :: tmp_cc2 allocate(tmp_cc(cholesky_ao_num,nV,nV)) call dgemm('N','N', cholesky_ao_num*nV, nV, nO, 1.d0, & cc_space_v_vo_chol, cholesky_ao_num*nV, t1, nO, 0.d0, tmp_cc, cholesky_ao_num*nV) - !$OMP PARALLEL PRIVATE(gam, iblock, B1, tmpB1, beta, b, a) - allocate(B1(nV,nV,block_size), tmpB1(nV,block_size,nV)) + call set_multiple_levels_omp(.False.) + + !$OMP PARALLEL PRIVATE(gam, iblock, B1, tmpB1, tmp_cc2, beta, b, a) + allocate(B1(nV,nV,block_size), tmpB1(nV,block_size,nV), tmp_cc2(cholesky_ao_num,nV)) !$OMP DO do gam = 1, nV do iblock = 1, nV, block_size - call dgemm('T', 'N', nV*min(block_size, nV-iblock+1), nV, cholesky_ao_num, & - -1.d0, cc_space_v_vv_chol(1,1,iblock), cholesky_ao_num, & - tmp_cc(1,1,gam), cholesky_ao_num, 0.d0, tmpB1, nV*block_size) call dgemm('T', 'N', nV*min(block_size, nV-iblock+1), nV, cholesky_ao_num, & -1.d0, tmp_cc(1,1,iblock), cholesky_ao_num, & - cc_space_v_vv_chol(1,1,gam), cholesky_ao_num, 1.d0, tmpB1, nV*block_size) + cc_space_v_vv_chol(1,1,gam), cholesky_ao_num, & + 0.d0, tmpB1, nV*block_size) + + do a=1,nV + do k=1,cholesky_ao_num + tmp_cc2(k,a) = cc_space_v_vv_chol(k,a,gam) - tmp_cc(k,a,gam) + enddo + enddo call dgemm('T','N', nV*min(block_size, nV-iblock+1), nV, cholesky_ao_num, 1.d0, & cc_space_v_vv_chol(1,1,iblock), cholesky_ao_num, & - cc_space_v_vv_chol(1,1,gam), cholesky_ao_num, 1.d0, & - tmpB1, nV*block_size) + tmp_cc2, cholesky_ao_num, & + 1.d0, tmpB1, nV*block_size) do beta = iblock, min(nV, iblock+block_size-1) do b = 1, nV @@ -538,15 +536,14 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) enddo !$OMP ENDDO - deallocate(B1, tmpB1) + deallocate(B1, tmpB1, tmp_cc2) !$OMP END PARALLEL deallocate(tmp_cc) - double precision, allocatable :: X_oovv(:,:,:,:),Y_oovv(:,:,:,:) - allocate(X_oovv(nO,nO,nV,nV),Y_oovv(nO,nO,nV,nV)) - + double precision, allocatable :: X_oovv(:,:,:,:) + allocate(X_oovv(nO,nO,nV,nV)) !$omp parallel & !$omp shared(nO,nV,t2,X_oovv) & !$omp private(u,v,gam,a) & @@ -564,10 +561,19 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) !$omp end do !$omp end parallel + double precision, allocatable :: g_vir(:,:) + allocate(g_vir(nV,nV)) + call compute_g_vir_chol(nO,nV,t1,t2,H_vv,g_vir) + + double precision, allocatable :: Y_oovv(:,:,:,:) + allocate(Y_oovv(nO,nO,nV,nV)) + call dgemm('N','N',nO*nO*nV,nV,nV, & 1d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3), & g_vir, size(g_vir,1), & 0d0, Y_oovv, size(Y_oovv,1) * size(Y_oovv,2) * size(Y_oovv,3)) + deallocate(g_vir) + deallocate(X_oovv) !$omp parallel & !$omp shared(nO,nV,r2,Y_oovv) & @@ -585,11 +591,18 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) enddo !$omp end do !$omp end parallel + deallocate(Y_oovv) + double precision, allocatable :: g_occ(:,:) + allocate(g_occ(nO,nO)) + call compute_g_occ_chol(nO,nV,t1,t2,H_oo,g_occ) + + allocate(X_oovv(nO,nO,nV,nV)) call dgemm('N','N',nO,nO*nV*nV,nO, & 1d0, g_occ , size(g_occ,1), & t2 , size(t2,1), & 0d0, X_oovv, size(X_oovv,1)) + deallocate(g_occ) !$omp parallel & !$omp shared(nO,nV,r2,X_oovv) & @@ -613,6 +626,8 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) double precision, allocatable :: X_vovv(:,:,:,:) allocate(X_vovv(nV,nO,nV,block_size)) + allocate(Y_oovv(nO,nO,nV,nV)) + do iblock = 1, nV, block_size do gam = iblock, min(nV, iblock+block_size-1) call dgemm('T','N',nV, nO*nV, cholesky_ao_num, 1.d0, & @@ -626,6 +641,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) 0d0, Y_oovv(1,1,1,iblock), size(Y_oovv,1)) enddo + deallocate(X_vovv) !$omp parallel & !$omp shared(nO,nV,r2,Y_oovv) & @@ -643,6 +659,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) enddo !$omp end do !$omp end parallel + deallocate(Y_oovv) double precision, allocatable :: X_ovvo(:,:,:,:) double precision, allocatable :: tcc(:,:,:), tcc2(:,:,:) @@ -693,6 +710,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) deallocate(X_ovvo) !----- + allocate(X_oovv(nO,nO,nV,nV)) call dgemm('N','N',nO*nO*nV,nV,nO, & @@ -716,9 +734,10 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) enddo !$omp end do !$omp end parallel + deallocate(X_oovv) double precision, allocatable :: X_vovo(:,:,:,:), Y_oovo(:,:,:,:) - allocate(X_vovo(nV,nO,nV,nO), Y_oovo(nO,nO,nV,nO)) + allocate(X_vovo(nV,nO,nV,nO)) !$omp parallel & !$omp shared(nO,nV,X_vovo,cc_space_v_ovvo) & @@ -737,15 +756,19 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) enddo !$omp end parallel + allocate(Y_oovo(nO,nO,nV,nO)) call dgemm('N','N',nO,nO*nV*nO,nV, & 1d0, t1, size(t1,1), & X_vovo, size(X_vovo,1), & 0d0, Y_oovo, size(Y_oovo,1)) + deallocate(X_vovo) + allocate(X_oovv(nO,nO,nV,nV)) call dgemm('N','N',nO*nO*nV, nV, nO, & 1d0, Y_oovo, size(Y_oovo,1) * size(Y_oovo,2) * size(Y_oovo,3), & t1 , size(t1,1), & 0d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3)) + deallocate(Y_oovo) !$omp parallel & !$omp shared(nO,nV,r2,X_oovv) & @@ -763,15 +786,23 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) enddo !$omp end do !$omp end parallel + deallocate(X_oovv) - deallocate(X_vovo,Y_oovo) - double precision, allocatable :: Y_voov(:,:,:,:), Z_ovov(:,:,:,:) - allocate(X_ovvo(nO,nV,nV,nO), Y_voov(nV,nO,nO,nV),Z_ovov(nO,nV,nO,nV)) + double precision, allocatable :: J1(:,:,:,:) + allocate(J1(nO,nV,nV,nO)) + call compute_J1_chol(nO,nV,t1,t2,cc_space_v_ovvo,cc_space_v_ovoo, & + cc_space_v_vvoo,J1) + + double precision, allocatable :: K1(:,:,:,:) + allocate(K1(nO,nV,nO,nV)) + call compute_K1_chol(nO,nV,t1,t2,cc_space_v_ovoo,cc_space_v_vvoo, & + cc_space_v_ovov,K1) + + allocate(X_ovvo(nO,nV,nV,nO)) !$omp parallel & - !$omp shared(nO,nV,X_ovvo,Y_voov,K1,J1,t2) & !$omp private(u,v,gam,beta,i,a) & - !$omp default(none) + !$omp default(shared) do i = 1, nO !$omp do do a = 1, nV @@ -783,7 +814,15 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) enddo !$omp end do nowait enddo + !$omp end parallel + deallocate(J1) + double precision, allocatable :: Y_voov(:,:,:,:) + allocate(Y_voov(nV,nO,nO,nV)) + + !$omp parallel & + !$omp private(u,v,gam,beta,i,a) & + !$omp default(shared) !$omp do do gam = 1, nV do v = 1, nO @@ -797,11 +836,16 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) !$omp end do !$omp end parallel + double precision, allocatable :: Z_ovov(:,:,:,:) + allocate(Z_ovov(nO,nV,nO,nV)) + call dgemm('N','N', nO*nV,nO*nV,nV*nO, & 1d0, X_ovvo, size(X_ovvo,1) * size(X_ovvo,2), & Y_voov, size(Y_voov,1) * size(Y_voov,2), & 0d0, Z_ovov, size(Z_ovov,1) * size(Z_ovov,2)) + deallocate(X_ovvo,Y_voov) + !$omp parallel & !$omp shared(nO,nV,r2,Z_ovov) & !$omp private(u,v,gam,beta) & @@ -819,10 +863,11 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) !$omp end do !$omp end parallel - deallocate(X_ovvo,Y_voov) + deallocate(Z_ovov) double precision, allocatable :: Y_ovov(:,:,:,:), X_ovov(:,:,:,:) - allocate(X_ovov(nO,nV,nO,nV),Y_ovov(nO,nV,nO,nV)) + allocate(X_ovov(nO,nV,nO,nV)) + allocate(Y_ovov(nO,nV,nO,nV)) !$omp parallel & !$omp shared(nO,nV,r2,K1,X_ovov,Y_ovov,t2) & @@ -853,10 +898,12 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) !$omp end do !$omp end parallel + allocate(Z_ovov(nO,nV,nO,nV)) call dgemm('T','N',nO*nV,nO*nV,nO*nV, & 1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), & Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), & 0d0, Z_ovov, size(Y_ovov,1) * size(Y_ovov,2)) + deallocate(X_ovov, Y_ovov) !$omp parallel & !$omp shared(nO,nV,r2,Z_ovov) & @@ -874,9 +921,11 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) enddo !$omp end do !$omp end parallel + deallocate(Z_ovov) + allocate(X_ovov(nO,nV,nO,nV),Y_ovov(nO,nV,nO,nV)) !$omp parallel & - !$omp shared(nO,nV,K1,X_ovov,Z_ovov,t2) & + !$omp shared(nO,nV,K1,X_ovov,Y_ovov,t2) & !$omp private(u,v,gam,beta,i,a) & !$omp default(none) !$omp do @@ -896,7 +945,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do v = 1, nO do a = 1, nV do i = 1, nO - Z_ovov(i,a,v,beta) = t2(i,v,beta,a) + Y_ovov(i,a,v,beta) = t2(i,v,beta,a) enddo enddo enddo @@ -904,11 +953,16 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) !$omp end do !$omp end parallel + deallocate(K1) + + allocate(Z_ovov(nO,nV,nO,nV)) call dgemm('N','N',nO*nV,nO*nV,nO*nV, & 1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), & Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), & 0d0, Z_ovov, size(Y_ovov,1) * size(Y_ovov,2)) + deallocate(X_ovov,Y_ovov) + !$omp parallel & !$omp shared(nO,nV,r2,Z_ovov) & !$omp private(u,v,gam,beta) & @@ -926,39 +980,33 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) !$omp end do !$omp end parallel - deallocate(X_ovov,Y_ovov,Z_ovov) + deallocate(Z_ovov) ! Change the sign for consistency with the code in spin orbitals + + max_r2 = 0d0 !$omp parallel & - !$omp shared(nO,nV,r2) & - !$omp private(i,j,a,b) & + !$omp shared(nO,nV,r2,max_r2) & + !$omp private(i,j,a,b,max_r2_local) & !$omp default(none) + max_r2_local = 0.d0 !$omp do do b = 1, nV do a = 1, nV do j = 1, nO do i = 1, nO r2(i,j,a,b) = -r2(i,j,a,b) + max_r2_local = max(r2(i,j,a,b), max_r2_local) enddo enddo enddo enddo - !$omp end do + !$omp end do nowait + !$omp critical + max_r2 = max(max_r2, max_r2_local) + !$omp end critical !$omp end parallel - max_r2 = 0d0 - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - max_r2 = max(r2(i,j,a,b), max_r2) - enddo - enddo - enddo - enddo - - deallocate(g_occ,g_vir,J1,K1,A1) - end ! A1