diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 1d77180e..76c9351e 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -85,13 +85,23 @@ subroutine run_ccsd_space_orb do while (not_converged) - call compute_H_oo(nO,nV,t1,t2,tau,H_oo) - call compute_H_vv(nO,nV,t1,t2,tau,H_vv) - call compute_H_vo(nO,nV,t1,t2,H_vo) - ! Residue - call compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) - call compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) +! if (do_ao_cholesky) then + if (.False.) then + call compute_H_oo_chol(nO,nV,t1,t2,tau,H_oo) + call compute_H_vv_chol(nO,nV,t1,t2,tau,H_vv) + call compute_H_vo_chol(nO,nV,t1,t2,H_vo) + + call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) + call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) + else + call compute_H_oo(nO,nV,t1,t2,tau,H_oo) + call compute_H_vv(nO,nV,t1,t2,tau,H_vv) + call compute_H_vo(nO,nV,t1,t2,H_vo) + + call compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) + call compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) + endif max_r = max(max_r1,max_r2) ! Update @@ -839,6 +849,10 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) ! allocate(B1(nV,nV,nV,nV)) ! call compute_B1(nO,nV,t1,t2,B1) +! call dgemm('N','N',nO*nO,nV*nV,nV*nV, & +! 1d0, tau, size(tau,1) * size(tau,2), & +! B1 , size(B1_gam,1) * size(B1_gam,2), & +! 1d0, r2, size(r2,1) * size(r2,2)) allocate(B1_gam(nV,nV,nV)) do gam=1,nV call compute_B1_gam(nO,nV,t1,t2,B1_gam,gam) diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f new file mode 100644 index 00000000..190c163b --- /dev/null +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -0,0 +1,1395 @@ +subroutine ccsd_energy_space_chol(nO,nV,tau,t1,energy) + + implicit none + + integer, intent(in) :: nO, nV + double precision, intent(in) :: tau(nO,nO,nV,nV) + double precision, intent(in) :: t1(nO,nV) + double precision, intent(out) :: energy + + ! internal + integer :: i,j,a,b + double precision :: e + + energy = 0d0 + !$omp parallel & + !$omp shared(nO,nV,energy,tau,t1,& + !$omp cc_space_f_vo,cc_space_w_oovv) & + !$omp private(i,j,a,b,e) & + !$omp default(none) + e = 0d0 + !$omp do + do a = 1, nV + do i = 1, nO + e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a) + enddo + enddo + !$omp end do nowait + !$omp do + do b = 1, nV + do a = 1, nV + do j = 1, nO + do i = 1, nO + e = e + tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b) + enddo + enddo + enddo + enddo + !$omp end do nowait + !$omp critical + energy = energy + e + !$omp end critical + !$omp end parallel + +end + +! Tau + +subroutine update_tau_space_chol(nO,nV,t1,t2,tau) + + implicit none + + ! in + integer, intent(in) :: nO, nV + double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV) + + ! out + double precision, intent(out) :: tau(nO,nO,nV,nV) + + ! internal + integer :: i,j,a,b + + !$OMP PARALLEL & + !$OMP SHARED(nO,nV,tau,t2,t1) & + !$OMP PRIVATE(i,j,a,b) & + !$OMP DEFAULT(NONE) + !$OMP DO + do b = 1, nV + do a = 1, nV + do j = 1, nO + do i = 1, nO + tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b) + enddo + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + +end + +! R1 + +subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) + + implicit none + + ! in + integer, intent(in) :: nO, nV + double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV), tau(nO,nO,nV,nV) + double precision, intent(in) :: H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO) + + ! out + double precision, intent(out) :: r1(nO,nV), max_r1 + + ! internal + integer :: u,i,j,beta,a,b + + !$omp parallel & + !$omp shared(nO,nV,r1,cc_space_f_ov) & + !$omp private(u,beta) & + !$omp default(none) + !$omp do + do beta = 1, nV + do u = 1, nO + r1(u,beta) = cc_space_f_ov(u,beta) + enddo + enddo + !$omp end do + !$omp end parallel + + double precision, allocatable :: X_oo(:,:) + allocate(X_oo(nO,nO)) + call dgemm('N','N', nO, nO, nV, & + -2d0, t1 , size(t1,1), & + cc_space_f_vo, size(cc_space_f_vo,1), & + 0d0, X_oo , size(X_oo,1)) + + call dgemm('T','N', nO, nV, nO, & + 1d0, X_oo, size(X_oo,2), & + t1 , size(t1,1), & + 1d0, r1 , size(r1,1)) + deallocate(X_oo) + + call dgemm('N','N', nO, nV, nV, & + 1d0, t1 , size(t1,1), & + H_vv, size(H_vv,1), & + 1d0, r1 , size(r1,1)) + + call dgemm('N','N', nO, nV, nO, & + -1d0, H_oo, size(H_oo,1), & + t1 , size(t1,1), & + 1d0, r1, size(r1,1)) + + double precision, allocatable :: X_voov(:,:,:,:) + allocate(X_voov(nV, nO, nO, nV)) + + !$omp parallel & + !$omp shared(nO,nV,X_voov,t2,t1) & + !$omp private(u,beta,i,a) & + !$omp default(none) + !$omp do + do beta = 1, nV + do u = 1, nO + do i = 1, nO + do a = 1, nV + X_voov(a,i,u,beta) = 2d0 * t2(i,u,a,beta) - t2(u,i,a,beta) + t1(u,a) * t1(i,beta) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + call dgemv('T', nV*nO, nO*nV, & + 1d0, X_voov, size(X_voov,1) * size(X_voov,2), & + H_vo , 1, & + 1d0, r1 , 1) + + deallocate(X_voov) + + double precision, allocatable :: X_ovov(:,:,:,:) + allocate(X_ovov(nO, nV, nO, nV)) + + !$omp parallel & + !$omp shared(nO,nV,cc_space_v_ovov,cc_space_v_voov,X_ovov) & + !$omp private(u,beta,i,a) & + !$omp default(none) + !$omp do + do beta = 1, nV + do u = 1, nO + do a = 1, nv + do i = 1, nO + X_ovov(i,a,u,beta) = 2d0 * cc_space_v_voov(a,u,i,beta) - cc_space_v_ovov(u,a,i,beta) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + call dgemv('T', nO*nV, nO*nV, & + 1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), & + t1 , 1, & + 1d0, r1 , 1) + + deallocate(X_ovov) + + double precision, allocatable :: W_vvov(:,:,:,:), T_vvoo(:,:,:,:) + allocate(W_vvov(nV,nV,nO,nV), T_vvoo(nV,nV,nO,nO)) + + !$omp parallel & + !$omp shared(nO,nV,cc_space_v_vvov,W_vvov,T_vvoo,tau) & + !$omp private(b,beta,i,a) & + !$omp default(none) + !$omp do + do beta = 1, nV + do i = 1, nO + do b = 1, nV + do a = 1, nV + W_vvov(a,b,i,beta) = 2d0 * cc_space_v_vvov(a,b,i,beta) - cc_space_v_vvov(b,a,i,beta) + enddo + enddo + enddo + enddo + !$omp end do nowait + + !$omp do + do u = 1, nO + do i = 1, nO + do b = 1, nV + do a = 1, nV + T_vvoo(a,b,i,u) = tau(i,u,a,b) + enddo + enddo + enddo + enddo + !$omp end do nowait + !$omp end parallel + + call dgemm('T','N',nO,nV,nO*nV*nV, & + 1d0, T_vvoo, size(T_vvoo,1) * size(T_vvoo,2) * size(T_vvoo,3), & + W_vvov, size(W_vvov,1) * size(W_vvov,2) * size(W_vvov,3), & + 1d0, r1 , size(r1,1)) + + deallocate(W_vvov,T_vvoo) + + double precision, allocatable :: W_oovo(:,:,:,:) + allocate(W_oovo(nO,nO,nV,nO)) + + !$omp parallel & + !$omp shared(nO,nV,cc_space_v_vooo,W_oovo) & + !$omp private(u,a,i,j) & + !$omp default(none) + do u = 1, nO + !$omp do + do a = 1, nV + do j = 1, nO + do i = 1, nO + W_oovo(i,j,a,u) = 2d0 * cc_space_v_vooo(a,u,i,j) - cc_space_v_vooo(a,u,j,i) + enddo + enddo + enddo + !$omp end do nowait + enddo + !$omp end parallel + + call dgemm('T','N', nO, nV, nO*nO*nV, & + -1d0, W_oovo, size(W_oovo,1) * size(W_oovo,2) * size(W_oovo,3), & + tau , size(tau,1) * size(tau,2) * size(tau,3), & + 1d0, r1 , size(r1,1)) + + deallocate(W_oovo) + + max_r1 = 0d0 + do a = 1, nV + do i = 1, nO + max_r1 = max(dabs(r1(i,a)), max_r1) + enddo + enddo + + ! Change the sign for consistency with the code in spin orbitals + !$omp parallel & + !$omp shared(nO,nV,r1) & + !$omp private(a,i) & + !$omp default(none) + !$omp do + do a = 1, nV + do i = 1, nO + r1(i,a) = -r1(i,a) + enddo + enddo + !$omp end do + !$omp end parallel + +end + +! H_oo + +subroutine compute_H_oo_chol(nO,nV,t1,t2,tau,H_oo) + + implicit none + + integer, intent(in) :: nO,nV + double precision, intent(in) :: t1(nO, nV) + double precision, intent(in) :: t2(nO, nO, nV, nV) + double precision, intent(in) :: tau(nO, nO, nV, nV) + double precision, intent(out) :: H_oo(nO, nO) + + integer :: a,tmp_a,k,b,l,c,d,tmp_c,tmp_d,i,j,u + + ! H_oo(u,i) = cc_space_f_oo(u,i) + !$omp parallel & + !$omp shared(nO,H_oo,cc_space_f_oo) & + !$omp private(i,u) & + !$omp default(none) + !$omp do + do i = 1, nO + do u = 1, nO + H_oo(u,i) = cc_space_f_oo(u,i) + enddo + enddo + !$omp end do + !$omp end parallel + + ! H_oo(u,i) += cc_space_w_oovv(i,j,a,b) * tau(u,j,a,b) + ! H_oo(u,i) += tau(u,j,a,b) * cc_space_w_oovv(i,j,a,b) + call dgemm('N','T', nO, nO, nO*nV*nV, & + 1d0, tau , size(tau,1), & + cc_space_w_oovv, size(cc_space_w_oovv,1), & + 1d0, H_oo , size(H_oo,1)) + +end + +! H_vv + +subroutine compute_H_vv_chol(nO,nV,t1,t2,tau,H_vv) + + implicit none + + integer, intent(in) :: nO,nV + double precision, intent(in) :: t1(nO, nV) + double precision, intent(in) :: t2(nO, nO, nV, nV) + double precision, intent(in) :: tau(nO, nO, nV, nV) + double precision, intent(out) :: H_vv(nV, nV) + + integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u, beta + + double precision, allocatable :: tmp_tau(:,:,:,:) + + allocate(tmp_tau(nV,nO,nO,nV)) + + ! H_vv(a,beta) = cc_space_f_vv(a,beta) + !$omp parallel & + !$omp shared(nV,nO,H_vv,cc_space_f_vv,tmp_tau,tau) & + !$omp private(a,beta,i,j,b) & + !$omp default(none) + !$omp do + do beta = 1, nV + do a = 1, nV + H_vv(a,beta) = cc_space_f_vv(a,beta) + enddo + enddo + !$omp end do nowait + + !$omp do + do beta = 1, nV + do j = 1, nO + do i = 1, nO + do b = 1, nV + tmp_tau(b,i,j,beta) = tau(i,j,beta,b) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + call dgemm('N','N',nV,nV,nO*nO*nV, & + -1d0, cc_space_w_vvoo, size(cc_space_w_vvoo,1), & + tmp_tau , size(tmp_tau,1) * size(tmp_tau,2) * size(tmp_tau,3), & + 1d0, H_vv , size(H_vv,1)) + + deallocate(tmp_tau) + +end + +! H_vo + +subroutine compute_H_vo_chol(nO,nV,t1,t2,H_vo) + + implicit none + + integer, intent(in) :: nO,nV + double precision, intent(in) :: t1(nO, nV) + double precision, intent(in) :: t2(nO, nO, nV, nV) + double precision, intent(out) :: H_vo(nV, nO) + + integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u, beta + + double precision, allocatable :: w(:,:,:,:) + + allocate(w(nV,nO,nO,nV)) + + !$omp parallel & + !$omp shared(nV,nO,H_vo,cc_space_f_vo,w,cc_space_w_vvoo,t1) & + !$omp private(a,beta,i,j,b) & + !$omp default(none) + !$omp do + do i = 1, nO + do a = 1, nV + H_vo(a,i) = cc_space_f_vo(a,i) + enddo + enddo + !$omp end do nowait + + ! H_vo(a,i) = H_vo(a,i) + cc_space_w_vvoo(a,b,i,j) * t1(j,b) + ! H_vo(a,i) = H_vo(a,i) + w(a,i,j,b) * t1(j,b) + + !$omp do + do b = 1, nV + do j = 1, nO + do i = 1, nO + do a = 1, nV + w(a,i,j,b) = cc_space_w_vvoo(a,b,i,j) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + call dgemv('N',nV*nO, nO*nV, & + 1d0, w , size(w,1) * size(w,2), & + t1 , 1, & + 1d0, H_vo, 1) + + deallocate(w) + +end + +! R2 + +subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) + + implicit none + + ! in + integer, intent(in) :: nO, nV + double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV), tau(nO,nO,nV,nV) + double precision, intent(in) :: H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO) + + ! out + double precision, intent(out) :: r2(nO,nO,nV,nV), max_r2 + + ! internal + double precision, allocatable :: g_occ(:,:), g_vir(:,:), J1(:,:,:,:), K1(:,:,:,:) + double precision, allocatable :: A1(:,:,:,:) + integer :: u,v,i,j,beta,gam,a,b + + allocate(g_occ(nO,nO), g_vir(nV,nV)) + allocate(J1(nO,nV,nV,nO), K1(nO,nV,nO,nV)) + allocate(A1(nO,nO,nO,nO)) + + call compute_g_occ_chol(nO,nV,t1,t2,H_oo,g_occ) + call compute_g_vir_chol(nO,nV,t1,t2,H_vv,g_vir) + call compute_A1_chol(nO,nV,t1,t2,tau,A1) + call compute_J1_chol(nO,nV,t1,t2,cc_space_v_ovvo,cc_space_v_ovoo, & + cc_space_v_vvvo,cc_space_v_vvoo,J1) + call compute_K1_chol(nO,nV,t1,t2,cc_space_v_ovoo,cc_space_v_vvoo, & + cc_space_v_ovov,cc_space_v_vvov,K1) + + ! Residual + !r2 = 0d0 + + !$omp parallel & + !$omp shared(nO,nV,r2,cc_space_v_oovv) & + !$omp private(u,v,gam,beta) & + !$omp default(none) + !$omp do + do gam = 1, nV + do beta = 1, nV + do v = 1, nO + do u = 1, nO + r2(u,v,beta,gam) = cc_space_v_oovv(u,v,beta,gam) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + call dgemm('N','N',nO*nO,nV*nV,nO*nO, & + 1d0, A1, size(A1,1) * size(A1,2), & + tau, size(tau,1) * size(tau,2), & + 1d0, r2, size(r2,1) * size(r2,2)) + + double precision, dimension(:,:,:,:), allocatable :: r2_chem, tmp, tau_chem + double precision, dimension(:,:,:,:), allocatable :: B1 + + allocate(B1(nV,nV,nV,nV)) + call compute_B1_chol(nO,nV,t1,B1,cholesky_ao_num) + call dgemm('N','N',nO*nO,nV*nV,nV*nV, & + 1d0, tau, size(tau,1) * size(tau,2), & + B1 , size(B1 ,1) * size(B1 ,2), & + 1d0, r2, size(r2 ,1) * size(r2 ,2)) + + double precision, allocatable :: X_oovv(:,:,:,:),Y_oovv(:,:,:,:) + allocate(X_oovv(nO,nO,nV,nV),Y_oovv(nO,nO,nV,nV)) + + !$omp parallel & + !$omp shared(nO,nV,t2,X_oovv) & + !$omp private(u,v,gam,a) & + !$omp default(none) + !$omp do + do a = 1, nV + do gam = 1, nV + do v = 1, nO + do u = 1, nO + X_oovv(u,v,gam,a) = t2(u,v,gam,a) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + call dgemm('N','N',nO*nO*nV,nV,nV, & + 1d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3), & + g_vir, size(g_vir,1), & + 0d0, Y_oovv, size(Y_oovv,1) * size(Y_oovv,2) * size(Y_oovv,3)) + + !$omp parallel & + !$omp shared(nO,nV,r2,Y_oovv) & + !$omp private(u,v,gam,beta) & + !$omp default(none) + !$omp do + do gam = 1, nV + do beta = 1, nV + do v = 1, nO + do u = 1, nO + r2(u,v,beta,gam) = r2(u,v,beta,gam) + Y_oovv(u,v,beta,gam) + Y_oovv(v,u,gam,beta) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + call dgemm('N','N',nO,nO*nV*nV,nO, & + 1d0, g_occ , size(g_occ,1), & + t2 , size(t2,1), & + 0d0, X_oovv, size(X_oovv,1)) + + !$omp parallel & + !$omp shared(nO,nV,r2,X_oovv) & + !$omp private(u,v,gam,beta) & + !$omp default(none) + !$omp do + do gam = 1, nV + do beta = 1, nV + do v = 1, nO + do u = 1, nO + r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(u,v,beta,gam) - X_oovv(v,u,gam,beta) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + deallocate(X_oovv) + + double precision, allocatable :: X_vovv(:,:,:,:) + allocate(X_vovv(nV,nO,nV,nV)) + + !$omp parallel & + !$omp shared(nO,nV,X_vovv,cc_space_v_ovvv) & + !$omp private(u,a,gam,beta) & + !$omp default(none) + !$omp do + do gam = 1, nV + do beta = 1, nV + do u = 1, nO + do a = 1, nV + X_vovv(a,u,beta,gam) = cc_space_v_ovvv(u,a,beta,gam) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + call dgemm('N','N',nO,nO*nV*nV,nV, & + 1d0, t1 , size(t1,1), & + X_vovv, size(X_vovv,1), & + 0d0, Y_oovv, size(Y_oovv,1)) + + !$omp parallel & + !$omp shared(nO,nV,r2,Y_oovv) & + !$omp private(u,v,gam,beta) & + !$omp default(none) + !$omp do + do gam = 1, nV + do beta = 1, nV + do v = 1, nO + do u = 1, nO + r2(u,v,beta,gam) = r2(u,v,beta,gam) + Y_oovv(v,u,beta,gam) + Y_oovv(u,v,gam,beta) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + double precision, allocatable :: X_vovo(:,:,:,:), Y_vovv(:,:,:,:) + allocate(X_vovo(nV,nO,nV,nO), Y_vovv(nV,nO,nV,nV),X_oovv(nO,nO,nV,nV)) + + !$omp parallel & + !$omp shared(nO,nV,X_vovo,cc_space_v_ovov) & + !$omp private(u,v,gam,i) & + !$omp default(none) + do i = 1, nO + !$omp do + do gam = 1, nV + do u = 1, nO + do a = 1, nV + X_vovo(a,u,gam,i) = cc_space_v_ovov(u,a,i,gam) + enddo + enddo + enddo + !$omp end do nowait + enddo + !$omp end parallel + + call dgemm('N','N',nV*nO*nV,nV,nO, & + 1d0, X_vovo, size(X_vovo,1) * size(X_vovo,2) * size(X_vovo,3), & + t1 , size(t1,1), & + 0d0, Y_vovv, size(Y_vovv,1) * size(Y_vovv,2) * size(Y_vovv,3)) + + call dgemm('N','N',nO,nO*nV*nV,nV, & + 1d0, t1, size(t1,1), & + Y_vovv, size(Y_vovv,1), & + 0d0, X_oovv, size(X_oovv,1)) + + !$omp parallel & + !$omp shared(nO,nV,r2,X_oovv) & + !$omp private(u,v,gam,beta) & + !$omp default(none) + !$omp do + do gam = 1, nV + do beta = 1, nV + do v = 1, nO + do u = 1, nO + r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(v,u,gam,beta) - X_oovv(u,v,beta,gam) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + deallocate(X_vovo,Y_vovv) + + call dgemm('N','N',nO*nO*nV,nV,nO, & + 1d0, cc_space_v_oovo, size(cc_space_v_oovo,1) * size(cc_space_v_oovo,2) * size(cc_space_v_oovo,3), & + t1 , size(t1,1), & + 0d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3)) + + !$omp parallel & + !$omp shared(nO,nV,r2,X_oovv) & + !$omp private(u,v,gam,beta) & + !$omp default(none) + !$omp do + do gam = 1, nV + do beta = 1, nV + do v = 1, nO + do u = 1, nO + r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(u,v,beta,gam) - X_oovv(v,u,gam,beta) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + double precision, allocatable :: Y_oovo(:,:,:,:) + allocate(X_vovo(nV,nO,nV,nO), Y_oovo(nO,nO,nV,nO)) + + !$omp parallel & + !$omp shared(nO,nV,X_vovo,cc_space_v_ovvo) & + !$omp private(a,v,gam,i) & + !$omp default(none) + do i = 1, nO + !$omp do + do gam = 1, nV + do v = 1, nO + do a = 1, nV + X_vovo(a,v,gam,i) = cc_space_v_ovvo(v,a,gam,i) + enddo + enddo + enddo + !$omp end do nowait + enddo + !$omp end parallel + + call dgemm('N','N',nO,nO*nV*nO,nV, & + 1d0, t1, size(t1,1), & + X_vovo, size(X_vovo,1), & + 0d0, Y_oovo, size(Y_oovo,1)) + + call dgemm('N','N',nO*nO*nV, nV, nO, & + 1d0, Y_oovo, size(Y_oovo,1) * size(Y_oovo,2) * size(Y_oovo,3), & + t1 , size(t1,1), & + 0d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3)) + + !$omp parallel & + !$omp shared(nO,nV,r2,X_oovv) & + !$omp private(u,v,gam,beta) & + !$omp default(none) + !$omp do + do gam = 1, nV + do beta = 1, nV + do v = 1, nO + do u = 1, nO + r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(u,v,gam,beta) - X_oovv(v,u,beta,gam) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + deallocate(X_vovo,Y_oovo) + + double precision, allocatable :: X_ovvo(:,:,:,:), Y_voov(:,:,:,:), Z_ovov(:,:,:,:) + allocate(X_ovvo(nO,nV,nV,nO), Y_voov(nV,nO,nO,nV),Z_ovov(nO,nV,nO,nV)) + !$omp parallel & + !$omp shared(nO,nV,X_ovvo,Y_voov,K1,J1,t2) & + !$omp private(u,v,gam,beta,i,a) & + !$omp default(none) + do i = 1, nO + !$omp do + do a = 1, nV + do beta = 1, nV + do u = 1, nO + X_ovvo(u,beta,a,i) = (J1(u,a,beta,i) - 0.5d0 * K1(u,a,i,beta)) + enddo + enddo + enddo + !$omp end do nowait + enddo + + !$omp do + do gam = 1, nV + do v = 1, nO + do i = 1, nO + do a = 1, nV + Y_voov(a,i,v,gam) = 2d0 * t2(i,v,a,gam) - t2(i,v,gam,a) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + call dgemm('N','N', nO*nV,nO*nV,nV*nO, & + 1d0, X_ovvo, size(X_ovvo,1) * size(X_ovvo,2), & + Y_voov, size(Y_voov,1) * size(Y_voov,2), & + 0d0, Z_ovov, size(Z_ovov,1) * size(Z_ovov,2)) + + !$omp parallel & + !$omp shared(nO,nV,r2,Z_ovov) & + !$omp private(u,v,gam,beta) & + !$omp default(none) + !$omp do + do gam = 1, nV + do beta = 1, nV + do v = 1, nO + do u = 1, nO + r2(u,v,beta,gam) = r2(u,v,beta,gam) + Z_ovov(u,beta,v,gam) + Z_ovov(v,gam,u,beta) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + deallocate(X_ovvo,Y_voov) + + double precision, allocatable :: X_ovov(:,:,:,:),Y_ovov(:,:,:,:) + allocate(X_ovov(nO,nV,nO,nV),Y_ovov(nO,nV,nO,nV)) + !$omp parallel & + !$omp shared(nO,nV,r2,K1,X_ovov,Y_ovov,t2) & + !$omp private(u,a,i,beta,gam) & + !$omp default(none) + !$omp do + do beta = 1, nV + do u = 1, nO + do a = 1, nV + do i = 1, nO + X_ovov(i,a,u,beta) = 0.5d0 * K1(u,a,i,beta) + enddo + enddo + enddo + enddo + !$omp end do nowait + + !$omp do + do gam = 1, nV + do v = 1, nO + do a = 1, nV + do i = 1, nO + Y_ovov(i,a,v,gam) = t2(i,v,gam,a) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + call dgemm('T','N',nO*nV,nO*nV,nO*nV, & + 1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), & + Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), & + 0d0, Z_ovov, size(Y_ovov,1) * size(Y_ovov,2)) + + !$omp parallel & + !$omp shared(nO,nV,r2,Z_ovov) & + !$omp private(u,v,gam,beta) & + !$omp default(none) + !$omp do + do gam = 1, nV + do beta = 1, nV + do v = 1, nO + do u = 1, nO + r2(u,v,beta,gam) = r2(u,v,beta,gam) - Z_ovov(u,beta,v,gam) - Z_ovov(v,gam,u,beta) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + !$omp parallel & + !$omp shared(nO,nV,K1,X_ovov,Z_ovov,t2) & + !$omp private(u,v,gam,beta,i,a) & + !$omp default(none) + !$omp do + do a = 1, nV + do i = 1, nO + do gam = 1, nV + do u = 1, nO + X_ovov(u,gam,i,a) = K1(u,a,i,gam) + enddo + enddo + enddo + enddo + !$omp end do nowait + + !$omp do + do beta = 1, nV + do v = 1, nO + do a = 1, nV + do i = 1, nO + Z_ovov(i,a,v,beta) = t2(i,v,beta,a) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + call dgemm('N','N',nO*nV,nO*nV,nO*nV, & + 1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), & + Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), & + 0d0, Z_ovov, size(Y_ovov,1) * size(Y_ovov,2)) + + !$omp parallel & + !$omp shared(nO,nV,r2,Z_ovov) & + !$omp private(u,v,gam,beta) & + !$omp default(none) + !$omp do + do gam = 1, nV + do beta = 1, nV + do v = 1, nO + do u = 1, nO + r2(u,v,beta,gam) = r2(u,v,beta,gam) - Z_ovov(u,gam,v,beta) - Z_ovov(v,beta,u,gam) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + deallocate(X_ovov,Y_ovov,Z_ovov) + + ! Change the sign for consistency with the code in spin orbitals + !$omp parallel & + !$omp shared(nO,nV,r2) & + !$omp private(i,j,a,b) & + !$omp default(none) + !$omp do + do b = 1, nV + do a = 1, nV + do j = 1, nO + do i = 1, nO + r2(i,j,a,b) = -r2(i,j,a,b) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + max_r2 = 0d0 + do b = 1, nV + do a = 1, nV + do j = 1, nO + do i = 1, nO + max_r2 = max(r2(i,j,a,b), max_r2) + enddo + enddo + enddo + enddo + + deallocate(g_occ,g_vir,J1,K1,A1) + +end + +! A1 + +subroutine compute_A1_chol(nO,nV,t1,t2,tau,A1) + + implicit none + + integer, intent(in) :: nO,nV + double precision, intent(in) :: t1(nO, nV) + double precision, intent(in) :: t2(nO, nO, nV, nV) + double precision, intent(in) :: tau(nO, nO, nV, nV) + double precision, intent(out) :: A1(nO, nO, nO, nO) + + integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta + + double precision, allocatable :: X_vooo(:,:,:,:), Y_oooo(:,:,:,:) + allocate(X_vooo(nV,nO,nO,nO), Y_oooo(nO,nO,nO,nO)) + + ! A1(u,v,i,j) = cc_space_v_oooo(u,v,i,j) + !$omp parallel & + !$omp shared(nO,nV,A1,cc_space_v_oooo,cc_space_v_ovoo,X_vooo) & + !$omp private(u,v,i,j) & + !$omp default(none) + !$omp do collapse(2) + do j = 1, nO + do i = 1, nO + do v = 1, nO + do u = 1, nO + A1(u,v,i,j) = cc_space_v_oooo(u,v,i,j) + enddo + enddo + enddo + enddo + !$omp end do nowait + + ! A1(u,v,i,j) += cc_space_v_ovoo(u,a,i,j) * t1(v,a) & + + !$omp do collapse(2) + do j = 1, nO + do i = 1, nO + do u = 1, nO + do a = 1, nV + X_vooo(a,u,i,j) = cc_space_v_ovoo(u,a,i,j) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + call dgemm('N','N', nO, nO*nO*nO, nV, & + 1d0, t1 , size(t1,1), & + X_vooo, size(X_vooo,1), & + 0d0, Y_oooo, size(Y_oooo,1)) + + !$omp parallel & + !$omp shared(nO,nV,A1,Y_oooo) & + !$omp private(u,v,i,j) & + !$omp default(none) + !$omp do collapse(2) + do j = 1, nO + do i = 1, nO + do v = 1, nO + do u = 1, nO + A1(u,v,i,j) = A1(u,v,i,j) + Y_oooo(v,u,i,j) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + deallocate(X_vooo,Y_oooo) + + ! A1(u,v,i,j) += cc_space_v_vooo(a,v,i,j) * t1(u,a) + call dgemm('N','N', nO, nO*nO*nO, nV, & + 1d0, t1 , size(t1,1), & + cc_space_v_vooo, size(cc_space_v_vooo,1), & + 1d0, A1 , size(A1,1)) + + ! A1(u,v,i,j) += cc_space_v_vvoo(a,b,i,j) * tau(u,v,a,b) + call dgemm('N','N', nO*nO, nO*nO, nV*nV, & + 1d0, tau , size(tau,1) * size(tau,2), & + cc_space_v_vvoo, size(cc_space_v_vvoo,1) * size(cc_space_v_vvoo,2), & + 1d0, A1 , size(A1,1) * size(A1,2)) + +end + +! B1 +subroutine compute_B1_chol(nO,nV,t1,B1,ldb) + + implicit none + + integer, intent(in) :: nO,nV,ldb + double precision, intent(in) :: t1(nO, nV) + double precision, intent(out) :: B1(nV, nV, nV, nV) + + integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam + + do gam = 1, nV + do beta = 1, nV + do b = 1, nV + do a = 1, nV + B1(a,b,beta,gam) = cc_space_v_vvvv(a,b,beta,gam) + + do i = 1, nO + B1(a,b,beta,gam) = B1(a,b,beta,gam) & + - cc_space_v_vvvo(a,b,beta,i) * t1(i,gam) & + - cc_space_v_vvov(a,b,i,gam) * t1(i,beta) + enddo + + enddo + enddo + enddo + enddo + +end + +! g_occ + +subroutine compute_g_occ_chol(nO,nV,t1,t2,H_oo,g_occ) + + implicit none + + integer, intent(in) :: nO,nV + double precision, intent(in) :: t1(nO, nV), H_oo(nO, nO) + double precision, intent(in) :: t2(nO, nO, nV, nV) + double precision, intent(out) :: g_occ(nO, nO) + + integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam + + call dgemm('N','N',nO,nO,nV, & + 1d0, t1, size(t1,1), & + cc_space_f_vo, size(cc_space_f_vo,1), & + 0d0, g_occ, size(g_occ,1)) + + !$omp parallel & + !$omp shared(nO,nV,g_occ,H_oo, cc_space_v_ovoo,t1) & + !$omp private(i,j,a,u) & + !$omp default(none) + !$omp do + do i = 1, nO + do u = 1, nO + g_occ(u,i) = g_occ(u,i) + H_oo(u,i) + enddo + enddo + !$omp end do + + !$omp do + do i = 1, nO + do j = 1, nO + do a = 1, nV + do u = 1, nO + g_occ(u,i) = g_occ(u,i) + (2d0 * cc_space_v_ovoo(u,a,i,j) - cc_space_v_ovoo(u,a,j,i)) * t1(j,a) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + +end + +! g_vir + +subroutine compute_g_vir_chol(nO,nV,t1,t2,H_vv,g_vir) + + implicit none + + integer, intent(in) :: nO,nV + double precision, intent(in) :: t1(nO, nV), H_vv(nV, nV) + double precision, intent(in) :: t2(nO, nO, nV, nV) + double precision, intent(out) :: g_vir(nV, nV) + + integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam + + call dgemm('N','N',nV,nV,nO, & + -1d0, cc_space_f_vo , size(cc_space_f_vo,1), & + t1 , size(t1,1), & + 0d0, g_vir, size(g_vir,1)) + + !$omp parallel & + !$omp shared(nO,nV,g_vir,H_vv, cc_space_v_vvvo,t1) & + !$omp private(i,b,a,beta) & + !$omp default(none) + !$omp do + do beta = 1, nV + do a = 1, nV + g_vir(a,beta) = g_vir(a,beta) + H_vv(a,beta) + enddo + enddo + !$omp end do + + !$omp do + do beta = 1, nV + do i = 1, nO + do b = 1, nV + do a = 1, nV + g_vir(a,beta) = g_vir(a,beta) + (2d0 * cc_space_v_vvvo(a,b,beta,i) - cc_space_v_vvvo(b,a,beta,i)) * t1(i,b) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + +end + +! J1 + +subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1) + + implicit none + + integer, intent(in) :: nO,nV + double precision, intent(in) :: t1(nO, nV) + double precision, intent(in) :: t2(nO, nO, nV, nV) + double precision, intent(in) :: v_ovvo(nO,nV,nV,nO), v_ovoo(nO,nV,nO,nO) + double precision, intent(in) :: v_vvvo(nV,nV,nV,nO), v_vvoo(nV,nV,nO,nO) + double precision, intent(out) :: J1(nO, nV, nV, nO) + + integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam + + double precision, allocatable :: X_ovoo(:,:,:,:), Y_ovov(:,:,:,:) + allocate(X_ovoo(nO,nV,nO,nO),Y_ovov(nO,nV,nO,nV)) + + !$omp parallel & + !$omp shared(nO,nV,J1,v_ovvo,v_ovoo,X_ovoo) & + !$omp private(i,j,a,u,beta) & + !$omp default(none) + do i = 1, nO + !$omp do + do beta = 1, nV + do a = 1, nV + do u = 1, nO + J1(u,a,beta,i) = v_ovvo(u,a,beta,i) + enddo + enddo + enddo + !$omp end do nowait + enddo + + !$omp do collapse(2) + do j = 1, nO + do i = 1, nO + do a = 1, nV + do u = 1, nO + X_ovoo(u,a,i,j) = v_ovoo(u,a,j,i) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + call dgemm('N','N',nO*nV*nO,nV,nO, & + -1d0, X_ovoo, size(X_ovoo,1) * size(X_ovoo,2) * size(X_ovoo,3), & + t1 , size(t1,1), & + 0d0, Y_ovov, size(Y_ovov,1) * size(Y_ovov,2) * size(Y_ovov,3)) + + !$omp parallel & + !$omp shared(nO,nV,J1,Y_ovov) & + !$omp private(i,beta,a,u) & + !$omp default(none) + do i = 1, nO + !$omp do + do beta = 1, nV + do a = 1, nV + do u = 1, nO + J1(u,a,beta,i) = J1(u,a,beta,i) + Y_ovov(u,a,i,beta) + enddo + enddo + enddo + !$omp end do nowait + enddo + !$omp end parallel + deallocate(X_ovoo) + + ! v_vvvo(b,a,beta,i) * t1(u,b) + call dgemm('N','N',nO,nV*nV*nO,nV, & + 1d0, t1 , size(t1,1), & + v_vvvo, size(v_vvvo,1), & + 1d0, J1 , size(J1,1)) + + !- cc_space_v_vvoo(a,b,i,j) * (0.5d0 * t2(u,j,b,beta) + t1(u,b) * t1(j,beta)) & + double precision, allocatable :: X_voov(:,:,:,:), Z_ovvo(:,:,:,:) + allocate(X_voov(nV,nO,nO,nV), Z_ovvo(nO,nV,nV,nO)) + !$omp parallel & + !$omp shared(nO,nV,t2,t1,Y_ovov,X_voov,v_vvoo) & + !$omp private(i,beta,a,u,b,j) & + !$omp default(none) + !$omp do + do b = 1, nV + do j = 1, nO + do beta = 1, nV + do u = 1, nO + Y_ovov(u,beta,j,b) = 0.5d0 * t2(u,j,b,beta) + t1(u,b) * t1(j,beta) + enddo + enddo + enddo + enddo + !$omp end do nowait + + !$omp do + do b = 1, nV + do j = 1, nO + do i = 1, nO + do a = 1, nV + X_voov(a,i,j,b) = v_vvoo(a,b,i,j) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + call dgemm('N','T',nO*nV,nV*nO,nO*nV, & + -1d0, Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), & + X_voov, size(X_voov,1) * size(X_voov,2), & + 0d0, Z_ovvo, size(Z_ovvo,1) * size(Z_ovvo,2)) + deallocate(X_voov) + + double precision, allocatable :: X_ovvo(:,:,:,:), Y_vovo(:,:,:,:) + allocate(X_ovvo(nO,nV,nV,nO),Y_vovo(nV,nO,nV,nO)) + !$omp parallel & + !$omp shared(nO,nV,J1,Z_ovvo,t2,Y_vovo,v_vvoo,X_ovvo) & + !$omp private(i,beta,a,u,j,b) & + !$omp default(none) + do i = 1, nO + !$omp do + do beta = 1, nV + do a = 1, nV + do u = 1, nO + J1(u,a,beta,i) = J1(u,a,beta,i) + Z_ovvo(u,beta,a,i) + enddo + enddo + enddo + !$omp end do nowait + enddo + + !+ 0.5d0 * (2d0 * cc_space_v_vvoo(a,b,i,j) - cc_space_v_vvoo(b,a,i,j)) * t2(u,j,beta,b) + do j = 1, nO + !$omp do + do b = 1, nV + do i = 1, nO + do a = 1, nV + Y_vovo(a,i,b,j) = 0.5d0 * (2d0 * v_vvoo(a,b,i,j) - v_vvoo(b,a,i,j)) + enddo + enddo + enddo + !$omp end do nowait + enddo + + do j = 1, nO + !$omp do + do b = 1, nV + do beta = 1, nV + do u = 1, nO + X_ovvo(u,beta,b,j) = t2(u,j,beta,b) + enddo + enddo + enddo + !$omp end do nowait + enddo + !$omp end parallel + + call dgemm('N','T',nO*nV,nV*nO,nV*nO, & + 1d0, X_ovvo, size(X_ovvo,1) * size(X_ovvo,2), & + Y_vovo, size(Y_vovo,1) * size(Y_vovo,2), & + 0d0, Z_ovvo, size(Z_ovvo,1) * size(Z_ovvo,2)) + + !$omp parallel & + !$omp shared(nO,nV,J1,Z_ovvo) & + !$omp private(i,beta,a,u) & + !$omp default(none) + do i = 1, nO + !$omp do + do beta = 1, nV + do a = 1, nV + do u = 1, nO + J1(u,a,beta,i) = J1(u,a,beta,i) + Z_ovvo(u,beta,a,i) + enddo + enddo + enddo + !$omp end do nowait + enddo + !$omp end parallel + + deallocate(X_ovvo,Z_ovvo,Y_ovov) + +end + +! K1 + +subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,v_vvov,K1) + + implicit none + + integer, intent(in) :: nO,nV + double precision, intent(in) :: t1(nO, nV) + double precision, intent(in) :: t2(nO, nO, nV, nV) + double precision, intent(in) :: v_vvoo(nV,nV,nO,nO), v_ovov(nO,nV,nO,nV) + double precision, intent(in) :: v_vvov(nV,nV,nO,nV), v_ovoo(nO,nV,nO,nO) + double precision, intent(out) :: K1(nO, nV, nO, nV) + + double precision, allocatable :: X(:,:,:,:), Y(:,:,:,:), Z(:,:,:,:) + + integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam + + allocate(X(nV,nO,nV,nO),Y(nO,nV,nV,nO),Z(nO,nV,nV,nO)) + + !$omp parallel & + !$omp shared(nO,nV,K1,X,Y,v_vvoo,v_ovov,t1,t2) & + !$omp private(i,beta,a,u,j,b) & + !$omp default(none) + !$omp do + do beta = 1, nV + do i = 1, nO + do a = 1, nV + do u = 1, nO + K1(u,a,i,beta) = v_ovov(u,a,i,beta) + enddo + enddo + enddo + enddo + !$omp end do nowait + + do i = 1, nO + !$omp do + do a = 1, nV + do j = 1, nO + do b = 1, nV + X(b,j,a,i) = - v_vvoo(b,a,i,j) + enddo + enddo + enddo + !$omp end do nowait + enddo + + do j = 1, nO + !$omp do + do b = 1, nV + do beta = 1, nV + do u = 1, nO + Y(u,beta,b,j) = 0.5d0 * t2(u,j,b,beta) + t1(u,b) * t1(j,beta) + enddo + enddo + enddo + !$omp end do + enddo + !$omp end parallel + + call dgemm('N','N',nO*nV*nO,nV,nO, & + -1d0, v_ovoo, size(v_ovoo,1) * size(v_ovoo,2) * size(v_ovoo,3), & + t1 , size(t1,1), & + 1d0, K1 , size(K1,1) * size(K1,2) * size(K1,3)) + + call dgemm('N','N',nO,nV*nO*nV,nV, & + 1d0, t1 , size(t1,1), & + v_vvov, size(v_vvov,1), & + 1d0, K1 , size(K1,1)) + + ! Y(u,beta,b,j) * X(b,j,a,i) = Z(u,beta,a,i) + call dgemm('N','N',nV*nO,nO*nV,nV*nO, & + 1d0, Y, size(Y,1) * size(Y,2), & + X, size(X,1) * size(X,2), & + 0d0, Z, size(Z,1) * size(Z,2)) + + !$omp parallel & + !$omp shared(nO,nV,K1,Z) & + !$omp private(i,beta,a,u) & + !$omp default(none) + !$omp do + do beta = 1, nV + do i = 1, nO + do a = 1, nV + do u = 1, nO + K1(u,a,i,beta) = K1(u,a,i,beta) + Z(u,beta,a,i) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + deallocate(X,Y,Z) + +end diff --git a/src/mo_two_e_ints/mo_bi_integrals.irp.f b/src/mo_two_e_ints/mo_bi_integrals.irp.f index b15d9745..af40e571 100644 --- a/src/mo_two_e_ints/mo_bi_integrals.irp.f +++ b/src/mo_two_e_ints/mo_bi_integrals.irp.f @@ -166,11 +166,9 @@ subroutine four_idx_dgemm deallocate (a1) + call map_sort(mo_integrals_map) call map_unique(mo_integrals_map) - integer*8 :: get_mo_map_size, mo_map_size - mo_map_size = get_mo_map_size() - end subroutine subroutine add_integrals_to_map(mask_ijkl) @@ -250,7 +248,7 @@ subroutine add_integrals_to_map(mask_ijkl) call wall_time(wall_1) - size_buffer = min(ao_num*ao_num*ao_num,8000000) + size_buffer = min(ao_num*ao_num,8000000) print*, 'Buffers : ', 8.*(mo_num*(n_j)*(n_k+1) + mo_num+& ao_num+ao_num*ao_num+ size_buffer*3)/(1024*1024), 'MB / core' @@ -443,11 +441,6 @@ subroutine add_integrals_to_map(mask_ijkl) !$OMP END PARALLEL call map_merge(mo_integrals_map) - call wall_time(wall_2) - call cpu_time(cpu_2) - integer*8 :: get_mo_map_size, mo_map_size - mo_map_size = get_mo_map_size() - deallocate(list_ijkl) @@ -463,55 +456,55 @@ subroutine add_integrals_to_map_cholesky integer :: i,j,k,l,m integer :: size_buffer, n_integrals - size_buffer = min(mo_num*mo_num*mo_num,16000000) + size_buffer = min(mo_num*mo_num,16000000) double precision, allocatable :: Vtmp(:,:,:) integer(key_kind) , allocatable :: buffer_i(:) real(integral_kind), allocatable :: buffer_value(:) - if (.True.) then - ! In-memory transformation + call set_multiple_levels_omp(.False.) - call set_multiple_levels_omp(.False.) + !$OMP PARALLEL DEFAULT(SHARED) & + !$OMP PRIVATE(i,j,k,l,n_integrals,buffer_value, buffer_i, Vtmp) + allocate (buffer_i(size_buffer), buffer_value(size_buffer)) + allocate (Vtmp(mo_num,mo_num,mo_num)) + n_integrals = 0 - !$OMP PARALLEL PRIVATE(i,j,k,l,n_integrals,buffer_value, buffer_i, Vtmp) - allocate (buffer_i(size_buffer), buffer_value(size_buffer)) - n_integrals = 0 + !$OMP DO SCHEDULE(dynamic) + do l=1,mo_num + call dgemm('T','N',mo_num*mo_num,mo_num,cholesky_ao_num,1.d0, & + cholesky_mo_transp, cholesky_ao_num, & + cholesky_mo_transp(1,1,l), cholesky_ao_num, 0.d0, & + Vtmp, mo_num*mo_num) - allocate (Vtmp(mo_num,mo_num,mo_num)) - - !$OMP DO - do l=1,mo_num - - call dgemm('T','N',mo_num*mo_num,mo_num,cholesky_ao_num,1.d0, & - cholesky_mo_transp, cholesky_ao_num, & - cholesky_mo_transp(1,1,l), cholesky_ao_num, 0.d0, & - Vtmp, mo_num*mo_num) - do k=1,l - do j=1,mo_num - do i=1,j - if (abs(Vtmp(i,j,k)) > mo_integrals_threshold) then - n_integrals += 1 - buffer_value(n_integrals) = Vtmp(i,j,k) - !DIR$ FORCEINLINE - call mo_two_e_integrals_index(i,k,j,l,buffer_i(n_integrals)) - if (n_integrals == size_buffer) then - call map_append(mo_integrals_map, buffer_i, buffer_value, n_integrals) - n_integrals = 0 - endif + do k=1,l + do j=1,mo_num + do i=1,j + if (dabs(Vtmp(i,j,k)) > mo_integrals_threshold) then + n_integrals = n_integrals + 1 + buffer_value(n_integrals) = Vtmp(i,j,k) + !DIR$ FORCEINLINE + call mo_two_e_integrals_index(i,k,j,l,buffer_i(n_integrals)) + if (n_integrals == size_buffer) then + call map_append(mo_integrals_map, buffer_i, buffer_value, n_integrals) + n_integrals = 0 endif - enddo + endif enddo enddo enddo - !$OMP END DO + enddo + !$OMP END DO NOWAIT + + if (n_integrals > 0) then call map_append(mo_integrals_map, buffer_i, buffer_value, n_integrals) - deallocate(buffer_i, buffer_value, Vtmp) - !$OMP END PARALLEL - - call map_unique(mo_integrals_map) - endif + deallocate(buffer_i, buffer_value, Vtmp) + !$OMP BARRIER + !$OMP END PARALLEL + + call map_sort(mo_integrals_map) + call map_unique(mo_integrals_map) end diff --git a/src/tools/four_idx_transform.irp.f b/src/tools/four_idx_transform.irp.f index 92e87cad..f7520e71 100644 --- a/src/tools/four_idx_transform.irp.f +++ b/src/tools/four_idx_transform.irp.f @@ -14,6 +14,13 @@ program four_idx_transform io_mo_two_e_integrals = 'Write' SOFT_TOUCH io_mo_two_e_integrals + if (.true.) then + PROVIDE ao_two_e_integrals_in_map + endif + if (do_ao_cholesky) then + PROVIDE cholesky_mo_transp + FREE cholesky_ao + endif if (.true.) then PROVIDE mo_two_e_integrals_in_map endif diff --git a/src/utils/fortran_mmap.c b/src/utils/fortran_mmap.c index 52df2476..71426002 100644 --- a/src/utils/fortran_mmap.c +++ b/src/utils/fortran_mmap.c @@ -22,11 +22,7 @@ void* mmap_fortran(char* filename, size_t bytes, int* file_descr, int read_only) perror("Error opening mmap file for reading"); exit(EXIT_FAILURE); } - map = mmap(NULL, bytes, PROT_READ, MAP_SHARED | MAP_HUGETLB, fd, 0); - if (map == MAP_FAILED) { - /* try again without huge pages */ - map = mmap(NULL, bytes, PROT_READ, MAP_SHARED, fd, 0); - } + map = mmap(NULL, bytes, PROT_READ, MAP_SHARED, fd, 0); } else { @@ -53,16 +49,12 @@ void* mmap_fortran(char* filename, size_t bytes, int* file_descr, int read_only) exit(EXIT_FAILURE); } - map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_HUGETLB, fd, 0); - if (map == MAP_FAILED) { - /* try again without huge pages */ - map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - } + map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); } if (map == MAP_FAILED) { close(fd); - printf("%s:\n", filename); + printf("%s: %lu\n", filename, bytes); perror("Error mmapping the file"); exit(EXIT_FAILURE); } diff --git a/src/utils/mmap.f90 b/src/utils/mmap.f90 index 49147283..caabc6f1 100644 --- a/src/utils/mmap.f90 +++ b/src/utils/mmap.f90 @@ -46,7 +46,14 @@ module mmap_module integer(c_size_t) :: length integer(c_int) :: fd_ - length = PRODUCT( shape(:) ) * bytes + integer :: i + + length = int(bytes,8) + do i=1,size(shape) + length = length * shape(i) + enddo +print *, 'map_length: ', length + if (read_only) then map = c_mmap_fortran( trim(filename)//char(0), length, fd_, 1) else @@ -66,7 +73,13 @@ module mmap_module integer(c_size_t) :: length integer(c_int) :: fd_ - length = PRODUCT( shape(:) ) * bytes + integer :: i + + length = int(bytes,8) + do i=1,size(shape) + length = length * shape(i) + enddo +print *, 'map_length: ', length fd_ = fd call c_munmap_fortran( length, fd_, map) end subroutine @@ -82,7 +95,13 @@ module mmap_module integer(c_size_t) :: length integer(c_int) :: fd_ - length = PRODUCT( shape(:) ) * bytes + integer :: i + + length = int(bytes,8) + do i=1,size(shape) + length = length * shape(i) + enddo +print *, 'map_length: ', length fd_ = fd call c_msync_fortran( length, fd_, map) end subroutine diff --git a/src/utils_cc/mo_integrals_cc.irp.f b/src/utils_cc/mo_integrals_cc.irp.f index 2db614b4..62237229 100644 --- a/src/utils_cc/mo_integrals_cc.irp.f +++ b/src/utils_cc/mo_integrals_cc.irp.f @@ -53,33 +53,8 @@ subroutine gen_v_space(n1,n2,n3,n4,list1,list2,list3,list4,v) allocate(v1(cholesky_ao_num,n1,n3), v2(cholesky_ao_num,n2,n4)) allocate(buffer(n1,n3,n2,n4)) - !$OMP PARALLEL PRIVATE(i1,i2,i3,i4,idx1,idx2,idx3,idx4,k) - !$OMP DO - do i3=1,n3 - idx3 = list3(i3) - do i1=1,n1 - idx1 = list1(i1) - do k=1,cholesky_ao_num - v1(k,i1,i3) = cholesky_mo_transp(k,idx1,idx3) - enddo - enddo - enddo - !$OMP END DO NOWAIT - - !$OMP DO - do i4=1,n4 - idx4 = list4(i4) - do i2=1,n2 - idx2 = list2(i2) - do k=1,cholesky_ao_num - v2(k,i2,i4) = cholesky_mo_transp(k,idx2,idx4) - enddo - enddo - enddo - !$OMP END DO NOWAIT - - !$OMP BARRIER - !$OMP END PARALLEL + call gen_v_space_chol(n1,n3,list1,list3,v1,cholesky_ao_num) + call gen_v_space_chol(n2,n4,list2,list4,v2,cholesky_ao_num) call dgemm('T','N', n1*n3, n2*n4, cholesky_ao_num, 1.d0, & v1, cholesky_ao_num, & @@ -129,6 +104,30 @@ subroutine gen_v_space(n1,n2,n3,n4,list1,list2,list3,list4,v) end +subroutine gen_v_space_chol(n1,n3,list1,list3,v,ldv) + + implicit none + + integer, intent(in) :: n1,n3,ldv + integer, intent(in) :: list1(n1),list3(n3) + double precision, intent(out) :: v(ldv,n1,n3) + + integer :: i1,i3,idx1,idx3,k + + !$OMP PARALLEL DO PRIVATE(i1,i3,idx1,idx3,k) + do i3=1,n3 + idx3 = list3(i3) + do i1=1,n1 + idx1 = list1(i1) + do k=1,cholesky_ao_num + v(k,i1,i3) = cholesky_mo_transp(k,idx1,idx3) + enddo + enddo + enddo + !$OMP END PARALLEL DO + +end + ! full BEGIN_PROVIDER [double precision, cc_space_v, (mo_num,mo_num,mo_num,mo_num)] @@ -345,6 +344,38 @@ BEGIN_PROVIDER [double precision, cc_space_v_vvvv, (cc_nVa, cc_nVa, cc_nVa, cc_n END_PROVIDER +BEGIN_PROVIDER [double precision, cc_space_v_vv_chol, (cholesky_ao_num, cc_nVa, cc_nVa)] + + implicit none + + call gen_v_space_chol(cc_nVa, cc_nVa, cc_list_vir, cc_list_vir, cc_space_v_vv_chol, cholesky_ao_num) + +END_PROVIDER + +BEGIN_PROVIDER [double precision, cc_space_v_vo_chol, (cholesky_ao_num, cc_nVa, cc_nOa)] + + implicit none + + call gen_v_space_chol(cc_nVa, cc_nOa, cc_list_vir, cc_list_occ, cc_space_v_vo_chol, cholesky_ao_num) + +END_PROVIDER + +BEGIN_PROVIDER [double precision, cc_space_v_ov_chol, (cholesky_ao_num, cc_nOa, cc_nVa)] + + implicit none + + call gen_v_space_chol(cc_nOa, cc_nVa, cc_list_occ, cc_list_vir, cc_space_v_ov_chol, cholesky_ao_num) + +END_PROVIDER + +BEGIN_PROVIDER [double precision, cc_space_v_oo_chol, (cholesky_ao_num, cc_nOa, cc_nOa)] + + implicit none + + call gen_v_space_chol(cc_nOa, cc_nOa, cc_list_occ, cc_list_occ, cc_space_v_oo_chol, cholesky_ao_num) + +END_PROVIDER + ! ppqq BEGIN_PROVIDER [double precision, cc_space_v_ppqq, (cc_n_mo, cc_n_mo)]