From 4e9143133ac19cbfbc8dbda1f358331da6ef5277 Mon Sep 17 00:00:00 2001 From: AbdAmmar Date: Sat, 10 Aug 2024 18:13:23 +0200 Subject: [PATCH] added 1e-noL with CUDA --- plugins/local/tc_int/cutc_module.F90 | 51 + plugins/local/tc_int/deb_no_1e_gpu.irp.f | 499 +++++++++ plugins/local/tc_int/deb_no_2e_gpu.irp.f | 1 - plugins/local/tc_int/deb_tc_int_cuda.irp.f | 6 +- plugins/local/tc_int/no_1e.irp.f | 1101 +++++++++++++++----- 5 files changed, 1394 insertions(+), 264 deletions(-) create mode 100644 plugins/local/tc_int/deb_no_1e_gpu.irp.f diff --git a/plugins/local/tc_int/cutc_module.F90 b/plugins/local/tc_int/cutc_module.F90 index d7b922cd..d1466697 100644 --- a/plugins/local/tc_int/cutc_module.F90 +++ b/plugins/local/tc_int/cutc_module.F90 @@ -119,6 +119,57 @@ module cutc_module ! --- + subroutine cutc_no_1e(n_grid1, n_mo, ne_a, ne_b, & + wr1, mos_l_in_r, mos_r_in_r, int2_grad1_u12, & + no_1e) bind(C, name = "cutc_no_1e") + + import c_int, c_double, c_ptr + + integer(c_int), intent(in), value :: n_grid1 + integer(c_int), intent(in), value :: n_mo + integer(c_int), intent(in), value :: ne_a + integer(c_int), intent(in), value :: ne_b + real(c_double), intent(in) :: wr1(n_grid1) + real(c_double), intent(in) :: mos_l_in_r(n_grid1,n_mo) + real(c_double), intent(in) :: mos_r_in_r(n_grid1,n_mo) + real(c_double), intent(in) :: int2_grad1_u12(n_grid1,3,n_mo,n_mo) + real(c_double), intent(out) :: no_1e(n_mo,n_mo) + + end subroutine cutc_no_1e + + ! --- + + subroutine deb_no_1e(n_grid1, n_mo, ne_a, ne_b, & + wr1, mos_l_in_r, mos_r_in_r, int2_grad1_u12, & + tmpO, tmpJ, tmpM, tmpS, tmpC, tmpD, tmpL, tmpR, tmpE, tmpF, & + no_1e) bind(C, name = "deb_no_1e") + + import c_int, c_double, c_ptr + + integer(c_int), intent(in), value :: n_grid1 + integer(c_int), intent(in), value :: n_mo + integer(c_int), intent(in), value :: ne_a + integer(c_int), intent(in), value :: ne_b + real(c_double), intent(in) :: wr1(n_grid1) + real(c_double), intent(in) :: mos_l_in_r(n_grid1,n_mo) + real(c_double), intent(in) :: mos_r_in_r(n_grid1,n_mo) + real(c_double), intent(in) :: int2_grad1_u12(n_grid1,3,n_mo,n_mo) + real(c_double), intent(out) :: tmpO(n_grid1) + real(c_double), intent(out) :: tmpJ(n_grid1,3) + real(c_double), intent(out) :: tmpM(n_grid1,3) + real(c_double), intent(out) :: tmpS(n_grid1) + real(c_double), intent(out) :: tmpC(n_grid1,4,n_mo,n_mo) + real(c_double), intent(out) :: tmpD(n_grid1,4) + real(c_double), intent(out) :: tmpL(n_grid1,3,n_mo) + real(c_double), intent(out) :: tmpR(n_grid1,3,n_mo) + real(c_double), intent(out) :: tmpE(n_grid1,5,n_mo) + real(c_double), intent(out) :: tmpF(n_grid1,5,n_mo) + real(c_double), intent(out) :: no_1e(n_mo,n_mo) + + end subroutine deb_no_1e + + ! --- + end interface end module cutc_module diff --git a/plugins/local/tc_int/deb_no_1e_gpu.irp.f b/plugins/local/tc_int/deb_no_1e_gpu.irp.f new file mode 100644 index 00000000..1efbb913 --- /dev/null +++ b/plugins/local/tc_int/deb_no_1e_gpu.irp.f @@ -0,0 +1,499 @@ + +! --- + +subroutine deb_no_1e_gpu() + + use cutc_module + + implicit none + + integer :: i, j, k, l, ipoint + double precision :: acc_thr, err_tot, nrm_tot, err_loc + double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:) + double precision, allocatable :: tmp(:,:,:,:) + double precision, allocatable :: int2_grad1_u12_bimo_t(:,:,:,:) + double precision, allocatable :: noL_1e(:,:) + double precision, allocatable :: noL_1e_gpu(:,:) + + + PROVIDE mo_l_coef mo_r_coef + PROVIDE mos_l_in_r_array_transp mos_r_in_r_array_transp + + + allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,3)) + print*, ' Reading int2_grad1_u12_ao from ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao' + open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="read") + read(11) int2_grad1_u12_ao + close(11) + + allocate(tmp(mo_num,mo_num,n_points_final_grid,3)) + allocate(int2_grad1_u12_bimo_t(n_points_final_grid,3,mo_num,mo_num)) + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint) & + !$OMP SHARED (ao_num, mo_num, n_points_final_grid, int2_grad1_u12_ao, tmp) + !$OMP DO SCHEDULE (dynamic) + do ipoint = 1, n_points_final_grid + call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,1), ao_num, tmp(1,1,ipoint,1), mo_num) + call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,2), ao_num, tmp(1,1,ipoint,2), mo_num) + call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,3), ao_num, tmp(1,1,ipoint,3), mo_num) + enddo + !$OMP END DO + !$OMP END PARALLEL + + deallocate(int2_grad1_u12_ao) + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, j, ipoint) & + !$OMP SHARED (mo_num, n_points_final_grid, tmp, int2_grad1_u12_bimo_t) + !$OMP DO COLLAPSE(2) SCHEDULE (dynamic) + do ipoint = 1, n_points_final_grid + do i = 1, mo_num + do j = 1, mo_num + int2_grad1_u12_bimo_t(ipoint,1,j,i) = tmp(j,i,ipoint,1) + int2_grad1_u12_bimo_t(ipoint,2,j,i) = tmp(j,i,ipoint,2) + int2_grad1_u12_bimo_t(ipoint,3,j,i) = tmp(j,i,ipoint,3) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + deallocate(tmp) + + ! --- + + allocate(noL_1e_gpu(mo_num,mo_num)) + + call cutc_no_1e(n_points_final_grid, mo_num, elec_alpha_num, elec_beta_num, & + final_weight_at_r_vector(1), & + mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), & + int2_grad1_u12_bimo_t(1,1,1,1), noL_1e_gpu(1,1)) + + ! --- + + allocate(noL_1e(mo_num,mo_num)) + + call provide_no_1e(n_points_final_grid, mo_num, elec_alpha_num, elec_beta_num, & + final_weight_at_r_vector(1), & + mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), & + int2_grad1_u12_bimo_t(1,1,1,1), noL_1e(1,1)) + + ! --- + + deallocate(int2_grad1_u12_bimo_t) + + acc_thr = 1d-12 + + err_tot = 0.d0 + nrm_tot = 0.d0 + do k = 1, mo_num + do l = 1, mo_num + err_loc = dabs(noL_1e(l,k) - noL_1e_gpu(l,k)) + if(err_loc > acc_thr) then + print*, " error on", l, k + print*, " CPU res", noL_1e (l,k) + print*, " GPU res", noL_1e_gpu(l,k) + stop + endif + err_tot = err_tot + err_loc + nrm_tot = nrm_tot + dabs(noL_1e(l,k)) + enddo + enddo + print *, ' absolute accuracy on noL_1e (%) =', 100.d0 * err_tot / nrm_tot + + deallocate(noL_1e) + deallocate(noL_1e_gpu) + + + return + +end + +! --- + +subroutine deb_no_1e_gpu_tmp() + + use cutc_module + + implicit none + + integer :: i, j, k, l, m, ipoint + double precision :: acc_thr, err_tot, nrm_tot, err_loc + double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:) + double precision, allocatable :: tmp(:,:,:,:) + double precision, allocatable :: int2_grad1_u12_bimo_t(:,:,:,:) + double precision, allocatable :: tmpO(:), tmpO_gpu(:) + double precision, allocatable :: tmpJ(:,:), tmpJ_gpu(:,:) + double precision, allocatable :: tmpM(:,:), tmpM_gpu(:,:) + double precision, allocatable :: tmpS(:), tmpS_gpu(:) + double precision, allocatable :: tmpC(:,:,:,:), tmpC_gpu(:,:,:,:) + double precision, allocatable :: tmpD(:,:), tmpD_gpu(:,:) + double precision, allocatable :: tmpL(:,:,:), tmpL_gpu(:,:,:) + double precision, allocatable :: tmpR(:,:,:), tmpR_gpu(:,:,:) + double precision, allocatable :: tmpE(:,:,:), tmpE_gpu(:,:,:) + double precision, allocatable :: tmpF(:,:,:), tmpF_gpu(:,:,:) + double precision, allocatable :: noL_1e(:,:), noL_1e_gpu(:,:) + + + ! --- + + + PROVIDE mo_l_coef mo_r_coef + PROVIDE mos_l_in_r_array_transp mos_r_in_r_array_transp + + + allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,3)) + print*, ' Reading int2_grad1_u12_ao from ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao' + open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="read") + read(11) int2_grad1_u12_ao + close(11) + + allocate(tmp(mo_num,mo_num,n_points_final_grid,3)) + allocate(int2_grad1_u12_bimo_t(n_points_final_grid,3,mo_num,mo_num)) + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint) & + !$OMP SHARED (ao_num, mo_num, n_points_final_grid, int2_grad1_u12_ao, tmp) + !$OMP DO SCHEDULE (dynamic) + do ipoint = 1, n_points_final_grid + call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,1), ao_num, tmp(1,1,ipoint,1), mo_num) + call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,2), ao_num, tmp(1,1,ipoint,2), mo_num) + call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,3), ao_num, tmp(1,1,ipoint,3), mo_num) + enddo + !$OMP END DO + !$OMP END PARALLEL + + deallocate(int2_grad1_u12_ao) + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, j, ipoint) & + !$OMP SHARED (mo_num, n_points_final_grid, tmp, int2_grad1_u12_bimo_t) + !$OMP DO COLLAPSE(2) SCHEDULE (dynamic) + do ipoint = 1, n_points_final_grid + do i = 1, mo_num + do j = 1, mo_num + int2_grad1_u12_bimo_t(ipoint,1,j,i) = tmp(j,i,ipoint,1) + int2_grad1_u12_bimo_t(ipoint,2,j,i) = tmp(j,i,ipoint,2) + int2_grad1_u12_bimo_t(ipoint,3,j,i) = tmp(j,i,ipoint,3) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + deallocate(tmp) + + ! --- + + allocate(tmpO_gpu(n_points_final_grid)) + allocate(tmpJ_gpu(n_points_final_grid,3)) + allocate(tmpM_gpu(n_points_final_grid,3)) + allocate(tmpS_gpu(n_points_final_grid)) + allocate(tmpC_gpu(n_points_final_grid,4,mo_num,mo_num)) + allocate(tmpD_gpu(n_points_final_grid,4)) + allocate(tmpL_gpu(n_points_final_grid,3,mo_num)) + allocate(tmpR_gpu(n_points_final_grid,3,mo_num)) + allocate(tmpE_gpu(n_points_final_grid,5,mo_num)) + allocate(tmpF_gpu(n_points_final_grid,5,mo_num)) + allocate(noL_1e_gpu(mo_num,mo_num)) + + call deb_no_1e(n_points_final_grid, mo_num, elec_alpha_num, elec_beta_num, & + final_weight_at_r_vector(1), & + mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), int2_grad1_u12_bimo_t(1,1,1,1), & + tmpO_gpu(1), tmpJ_gpu(1,1), tmpM_gpu(1,1), tmpS_gpu(1), tmpC_gpu(1,1,1,1), tmpD_gpu(1,1), & + tmpL_gpu(1,1,1), tmpR_gpu(1,1,1), tmpE_gpu(1,1,1), tmpF_gpu(1,1,1), noL_1e_gpu(1,1)) + + ! --- + + allocate(tmpO(n_points_final_grid)) + allocate(tmpJ(n_points_final_grid,3)) + allocate(tmpM(n_points_final_grid,3)) + allocate(tmpS(n_points_final_grid)) + allocate(tmpC(n_points_final_grid,4,mo_num,mo_num)) + allocate(tmpD(n_points_final_grid,4)) + allocate(tmpL(n_points_final_grid,3,mo_num)) + allocate(tmpR(n_points_final_grid,3,mo_num)) + allocate(tmpE(n_points_final_grid,5,mo_num)) + allocate(tmpF(n_points_final_grid,5,mo_num)) + allocate(noL_1e(mo_num,mo_num)) + + call provide_no_1e_tmp(n_points_final_grid, mo_num, elec_alpha_num, elec_beta_num, & + final_weight_at_r_vector(1), & + mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), int2_grad1_u12_bimo_t(1,1,1,1), & + tmpO(1), tmpJ(1,1), tmpM(1,1), tmpS(1), tmpC(1,1,1,1), tmpD(1,1), tmpL(1,1,1), tmpR(1,1,1), & + tmpE(1,1,1), tmpF(1,1,1), noL_1e(1,1)) + + ! --- + + deallocate(int2_grad1_u12_bimo_t) + + + acc_thr = 1d-12 + + ! --- + + ! tmpO(n_points_final_grid)) + err_tot = 0.d0 + nrm_tot = 0.d0 + do ipoint = 1, n_points_final_grid + err_loc = dabs(tmpO(ipoint) - tmpO_gpu(ipoint)) + if(err_loc > acc_thr) then + print*, " error on", ipoint + print*, " CPU res", tmpO (ipoint) + print*, " GPU res", tmpO_gpu(ipoint) + stop + endif + err_tot = err_tot + err_loc + nrm_tot = nrm_tot + dabs(tmpO(ipoint)) + enddo + print *, ' absolute accuracy on tmpO (%) =', 100.d0 * err_tot / nrm_tot + + ! --- + + ! tmpJ(n_points_final_grid,3)) + err_tot = 0.d0 + nrm_tot = 0.d0 + do m = 1, 3 + do ipoint = 1, n_points_final_grid + err_loc = dabs(tmpJ(ipoint,m) - tmpJ_gpu(ipoint,m)) + if(err_loc > acc_thr) then + print*, " error on", ipoint, m + print*, " CPU res", tmpJ (ipoint,m) + print*, " GPU res", tmpJ_gpu(ipoint,m) + stop + endif + err_tot = err_tot + err_loc + nrm_tot = nrm_tot + dabs(tmpJ(ipoint,m)) + enddo + enddo + print *, ' absolute accuracy on tmpJ (%) =', 100.d0 * err_tot / nrm_tot + + ! --- + + ! tmpM(n_points_final_grid,3)) + err_tot = 0.d0 + nrm_tot = 0.d0 + do m = 1, 3 + do ipoint = 1, n_points_final_grid + err_loc = dabs(tmpM(ipoint,m) - tmpM_gpu(ipoint,m)) + if(err_loc > acc_thr) then + print*, " error on", ipoint, m + print*, " CPU res", tmpM (ipoint,m) + print*, " GPU res", tmpM_gpu(ipoint,m) + stop + endif + err_tot = err_tot + err_loc + nrm_tot = nrm_tot + dabs(tmpM(ipoint,m)) + enddo + enddo + print *, ' absolute accuracy on tmpM (%) =', 100.d0 * err_tot / nrm_tot + + ! --- + + ! tmpS(n_points_final_grid)) + err_tot = 0.d0 + nrm_tot = 0.d0 + do ipoint = 1, n_points_final_grid + err_loc = dabs(tmpS(ipoint) - tmpS_gpu(ipoint)) + if(err_loc > acc_thr) then + print*, " error on", ipoint + print*, " CPU res", tmpS (ipoint) + print*, " GPU res", tmpS_gpu(ipoint) + stop + endif + err_tot = err_tot + err_loc + nrm_tot = nrm_tot + dabs(tmpS(ipoint)) + enddo + print *, ' absolute accuracy on tmpS (%) =', 100.d0 * err_tot / nrm_tot + + ! --- + + ! tmpC(n_points_final_grid,4,mo_num,mo_num)) + err_tot = 0.d0 + nrm_tot = 0.d0 + do i = 1, mo_num + do j = 1, mo_num + do m = 1, 4 + do ipoint = 1, n_points_final_grid + err_loc = dabs(tmpC(ipoint,m,j,i) - tmpC_gpu(ipoint,m,j,i)) + if(err_loc > acc_thr) then + print*, " error on", ipoint, m, j, i + print*, " CPU res", tmpC (ipoint,m,j,i) + print*, " GPU res", tmpC_gpu(ipoint,m,j,i) + stop + endif + err_tot = err_tot + err_loc + nrm_tot = nrm_tot + dabs(tmpC(ipoint,m,j,i)) + enddo + enddo + enddo + enddo + print *, ' absolute accuracy on tmpC (%) =', 100.d0 * err_tot / nrm_tot + + ! --- + + ! tmpD(n_points_final_grid,4)) + err_tot = 0.d0 + nrm_tot = 0.d0 + do m = 1, 4 + do ipoint = 1, n_points_final_grid + err_loc = dabs(tmpD(ipoint,m) - tmpD_gpu(ipoint,m)) + if(err_loc > acc_thr) then + print*, " error on", ipoint, m + print*, " CPU res", tmpD (ipoint,m) + print*, " GPU res", tmpD_gpu(ipoint,m) + stop + endif + err_tot = err_tot + err_loc + nrm_tot = nrm_tot + dabs(tmpD(ipoint,m)) + enddo + enddo + print *, ' absolute accuracy on tmpD (%) =', 100.d0 * err_tot / nrm_tot + + ! --- + + ! tmpL(n_points_final_grid,3,mo_num)) + err_tot = 0.d0 + nrm_tot = 0.d0 + do i = 1, mo_num + do m = 1, 3 + do ipoint = 1, n_points_final_grid + err_loc = dabs(tmpL(ipoint,m,i) - tmpL_gpu(ipoint,m,i)) + if(err_loc > acc_thr) then + print*, " error on", ipoint, m, i + print*, " CPU res", tmpL (ipoint,m,i) + print*, " GPU res", tmpL_gpu(ipoint,m,i) + stop + endif + err_tot = err_tot + err_loc + nrm_tot = nrm_tot + dabs(tmpL(ipoint,m,i)) + enddo + enddo + enddo + print *, ' absolute accuracy on tmpL (%) =', 100.d0 * err_tot / nrm_tot + + ! --- + + ! tmpR(n_points_final_grid,3,mo_num)) + err_tot = 0.d0 + nrm_tot = 0.d0 + do i = 1, mo_num + do m = 1, 3 + do ipoint = 1, n_points_final_grid + err_loc = dabs(tmpR(ipoint,m,i) - tmpR_gpu(ipoint,m,i)) + if(err_loc > acc_thr) then + print*, " error on", ipoint, m, i + print*, " CPU res", tmpR (ipoint,m,i) + print*, " GPU res", tmpR_gpu(ipoint,m,i) + stop + endif + err_tot = err_tot + err_loc + nrm_tot = nrm_tot + dabs(tmpR(ipoint,m,i)) + enddo + enddo + enddo + print *, ' absolute accuracy on tmpR (%) =', 100.d0 * err_tot / nrm_tot + + ! --- + + ! tmpE(n_points_final_grid,5,mo_num)) + err_tot = 0.d0 + nrm_tot = 0.d0 + do i = 1, mo_num + do m = 1, 5 + do ipoint = 1, n_points_final_grid + err_loc = dabs(tmpE(ipoint,m,i) - tmpE_gpu(ipoint,m,i)) + if(err_loc > acc_thr) then + print*, " error on", ipoint, m, i + print*, " CPU res", tmpE (ipoint,m,i) + print*, " GPU res", tmpE_gpu(ipoint,m,i) + stop + endif + err_tot = err_tot + err_loc + nrm_tot = nrm_tot + dabs(tmpE(ipoint,m,i)) + enddo + enddo + enddo + print *, ' absolute accuracy on tmpE (%) =', 100.d0 * err_tot / nrm_tot + + ! --- + + ! tmpF(n_points_final_grid,5,mo_num)) + err_tot = 0.d0 + nrm_tot = 0.d0 + do i = 1, mo_num + do m = 1, 5 + do ipoint = 1, n_points_final_grid + err_loc = dabs(tmpF(ipoint,m,i) - tmpF_gpu(ipoint,m,i)) + if(err_loc > acc_thr) then + print*, " error on", ipoint, m, i + print*, " CPU res", tmpF (ipoint,m,i) + print*, " GPU res", tmpF_gpu(ipoint,m,i) + stop + endif + err_tot = err_tot + err_loc + nrm_tot = nrm_tot + dabs(tmpF(ipoint,m,i)) + enddo + enddo + enddo + print *, ' absolute accuracy on tmpF (%) =', 100.d0 * err_tot / nrm_tot + + ! --- + + ! noL_1e(mo_num,mo_num)) + err_tot = 0.d0 + nrm_tot = 0.d0 + do k = 1, mo_num + do l = 1, mo_num + err_loc = dabs(noL_1e(l,k) - noL_1e_gpu(l,k)) + if(err_loc > acc_thr) then + print*, " error on", l, k + print*, " CPU res", noL_1e (l,k) + print*, " GPU res", noL_1e_gpu(l,k) + stop + endif + err_tot = err_tot + err_loc + nrm_tot = nrm_tot + dabs(noL_1e(l,k)) + enddo + enddo + print *, ' absolute accuracy on noL_1e (%) =', 100.d0 * err_tot / nrm_tot + + ! --- + + + deallocate(tmpO) + deallocate(tmpJ) + deallocate(tmpM) + deallocate(tmpS) + deallocate(tmpC) + deallocate(tmpD) + deallocate(tmpL) + deallocate(tmpR) + deallocate(tmpE) + deallocate(tmpF) + deallocate(noL_1e) + + deallocate(tmpO_gpu) + deallocate(tmpJ_gpu) + deallocate(tmpM_gpu) + deallocate(tmpS_gpu) + deallocate(tmpC_gpu) + deallocate(tmpD_gpu) + deallocate(tmpL_gpu) + deallocate(tmpR_gpu) + deallocate(tmpE_gpu) + deallocate(tmpF_gpu) + deallocate(noL_1e_gpu) + + + return + +end + + + diff --git a/plugins/local/tc_int/deb_no_2e_gpu.irp.f b/plugins/local/tc_int/deb_no_2e_gpu.irp.f index 2be53ddd..16f58cca 100644 --- a/plugins/local/tc_int/deb_no_2e_gpu.irp.f +++ b/plugins/local/tc_int/deb_no_2e_gpu.irp.f @@ -87,7 +87,6 @@ subroutine deb_no_2e_gpu() acc_thr = 1d-12 - print *, ' precision on noL_2e ' err_tot = 0.d0 nrm_tot = 0.d0 do i = 1, mo_num diff --git a/plugins/local/tc_int/deb_tc_int_cuda.irp.f b/plugins/local/tc_int/deb_tc_int_cuda.irp.f index 9da9ac95..8d0cc4f3 100644 --- a/plugins/local/tc_int/deb_tc_int_cuda.irp.f +++ b/plugins/local/tc_int/deb_tc_int_cuda.irp.f @@ -37,8 +37,12 @@ subroutine main() implicit none !call deb_int_2e_ao_gpu() + !call deb_no_2e_gpu_tmp() - call deb_no_2e_gpu() + !call deb_no_2e_gpu() + + call deb_no_1e_gpu_tmp() + !call deb_no_1e_gpu() return end diff --git a/plugins/local/tc_int/no_1e.irp.f b/plugins/local/tc_int/no_1e.irp.f index 3a990276..5a9798f0 100644 --- a/plugins/local/tc_int/no_1e.irp.f +++ b/plugins/local/tc_int/no_1e.irp.f @@ -15,10 +15,10 @@ subroutine provide_no_1e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r, integer :: p, s, i, j, ipoint double precision :: t0, t1 - double precision, allocatable :: tmp1(:,:,:,:), tmp2(:,:), tmp3(:,:,:), tmp4(:,:,:) - double precision, allocatable :: tmp_L(:,:,:), tmp_R(:,:,:), tmp_M(:,:), tmp_S(:), tmp_O(:), tmp_J(:,:) - double precision, allocatable :: tmp_L0(:,:,:), tmp_R0(:,:,:) - double precision, allocatable :: tmp_M_priv(:,:), tmp_S_priv(:), tmp_O_priv(:), tmp_J_priv(:,:) + double precision, allocatable :: tmpC(:,:,:,:), tmpD(:,:), tmpE(:,:,:), tmpF(:,:,:) + double precision, allocatable :: tmpL(:,:,:), tmpR(:,:,:), tmpM(:,:), tmpS(:), tmpO(:), tmpJ(:,:) + double precision, allocatable :: tmpL0(:,:,:), tmpR0(:,:,:) + double precision, allocatable :: tmpM_priv(:,:), tmpS_priv(:), tmpO_priv(:), tmpJ_priv(:,:) call wall_time(t0) @@ -26,119 +26,119 @@ subroutine provide_no_1e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r, if(ne_a .eq. ne_b) then - allocate(tmp_O(n_grid), tmp_J(n_grid,3)) - tmp_O = 0.d0 - tmp_J = 0.d0 + allocate(tmpO(n_grid), tmpJ(n_grid,3)) + tmpO = 0.d0 + tmpJ = 0.d0 - !$OMP PARALLEL & - !$OMP DEFAULT(NONE) & - !$OMP PRIVATE(i, ipoint, tmp_O_priv, tmp_J_priv) & - !$OMP SHARED(ne_b, n_grid, & - !$OMP mos_l_in_r, mos_r_in_r, & - !$OMP int2_grad1_u12, tmp_O, tmp_J) + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(i, ipoint, tmpO_priv, tmpJ_priv) & + !$OMP SHARED(ne_b, n_grid, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmpO, tmpJ) - allocate(tmp_O_priv(n_grid), tmp_J_priv(n_grid,3)) - tmp_O_priv = 0.d0 - tmp_J_priv = 0.d0 + allocate(tmpO_priv(n_grid), tmpJ_priv(n_grid,3)) + tmpO_priv = 0.d0 + tmpJ_priv = 0.d0 !$OMP DO do i = 1, ne_b do ipoint = 1, n_grid - tmp_O_priv(ipoint) = tmp_O_priv(ipoint) + mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i) - tmp_J_priv(ipoint,1) = tmp_J_priv(ipoint,1) + int2_grad1_u12(ipoint,1,i,i) - tmp_J_priv(ipoint,2) = tmp_J_priv(ipoint,2) + int2_grad1_u12(ipoint,2,i,i) - tmp_J_priv(ipoint,3) = tmp_J_priv(ipoint,3) + int2_grad1_u12(ipoint,3,i,i) + tmpO_priv(ipoint) = tmpO_priv(ipoint) + mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i) + tmpJ_priv(ipoint,1) = tmpJ_priv(ipoint,1) + int2_grad1_u12(ipoint,1,i,i) + tmpJ_priv(ipoint,2) = tmpJ_priv(ipoint,2) + int2_grad1_u12(ipoint,2,i,i) + tmpJ_priv(ipoint,3) = tmpJ_priv(ipoint,3) + int2_grad1_u12(ipoint,3,i,i) enddo enddo !$OMP END DO NOWAIT !$OMP CRITICAL - tmp_O = tmp_O + tmp_O_priv - tmp_J = tmp_J + tmp_J_priv + tmpO = tmpO + tmpO_priv + tmpJ = tmpJ + tmpJ_priv !$OMP END CRITICAL - deallocate(tmp_O_priv, tmp_J_priv) + deallocate(tmpO_priv, tmpJ_priv) !$OMP END PARALLEL ! --- - allocate(tmp_M(n_grid,3), tmp_S(n_grid)) - tmp_M = 0.d0 - tmp_S = 0.d0 + allocate(tmpM(n_grid,3), tmpS(n_grid)) + tmpM = 0.d0 + tmpS = 0.d0 - !$OMP PARALLEL & - !$OMP DEFAULT(NONE) & - !$OMP PRIVATE(i, j, ipoint, tmp_M_priv, tmp_S_priv) & - !$OMP SHARED(ne_b, n_grid, & - !$OMP mos_l_in_r, mos_r_in_r, & - !$OMP int2_grad1_u12, tmp_M, tmp_S) + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(i, j, ipoint, tmpM_priv, tmpS_priv) & + !$OMP SHARED(ne_b, n_grid, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmpM, tmpS) - allocate(tmp_M_priv(n_grid,3), tmp_S_priv(n_grid)) - tmp_M_priv = 0.d0 - tmp_S_priv = 0.d0 + allocate(tmpM_priv(n_grid,3), tmpS_priv(n_grid)) + tmpM_priv = 0.d0 + tmpS_priv = 0.d0 !$OMP DO COLLAPSE(2) do i = 1, ne_b do j = 1, ne_b do ipoint = 1, n_grid - tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) - tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) - tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) - tmp_S_priv(ipoint) = tmp_S_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) & - + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) & - + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i) + tmpS_priv(ipoint) = tmpS_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) & + + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) & + + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i) enddo enddo enddo !$OMP END DO NOWAIT !$OMP CRITICAL - tmp_M = tmp_M + tmp_M_priv - tmp_S = tmp_S + tmp_S_priv + tmpM = tmpM + tmpM_priv + tmpS = tmpS + tmpS_priv !$OMP END CRITICAL - deallocate(tmp_M_priv, tmp_S_priv) + deallocate(tmpM_priv, tmpS_priv) !$OMP END PARALLEL ! --- - allocate(tmp2(n_grid,4)) - allocate(tmp1(n_grid,4,mo_num,mo_num)) + allocate(tmpC(n_grid,4,n_mo,n_mo)) + allocate(tmpD(n_grid,4)) do ipoint = 1, n_grid - tmp2(ipoint,1) = wr1(ipoint) * (2.d0 * tmp_O(ipoint) * tmp_J(ipoint,1) - tmp_M(ipoint,1)) - tmp2(ipoint,2) = wr1(ipoint) * (2.d0 * tmp_O(ipoint) * tmp_J(ipoint,2) - tmp_M(ipoint,2)) - tmp2(ipoint,3) = wr1(ipoint) * (2.d0 * tmp_O(ipoint) * tmp_J(ipoint,3) - tmp_M(ipoint,3)) - tmp2(ipoint,4) = -wr1(ipoint) * tmp_O(ipoint) + tmpD(ipoint,1) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,1) - tmpM(ipoint,1)) + tmpD(ipoint,2) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,2) - tmpM(ipoint,2)) + tmpD(ipoint,3) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,3) - tmpM(ipoint,3)) + tmpD(ipoint,4) = -wr1(ipoint) * tmpO(ipoint) - tmp_S(ipoint) = 2.d0 * (tmp_J(ipoint,1) * tmp_J(ipoint,1) + tmp_J(ipoint,2) * tmp_J(ipoint,2) + tmp_J(ipoint,3) * tmp_J(ipoint,3)) - tmp_S(ipoint) + tmpS(ipoint) = 2.d0 * (tmpJ(ipoint,1) * tmpJ(ipoint,1) + tmpJ(ipoint,2) * tmpJ(ipoint,2) + tmpJ(ipoint,3) * tmpJ(ipoint,3)) - tmpS(ipoint) enddo - deallocate(tmp_O, tmp_M) + deallocate(tmpO, tmpM) - !$OMP PARALLEL & - !$OMP DEFAULT(NONE) & - !$OMP PRIVATE(p, s, i, ipoint) & - !$OMP SHARED(mo_num, ne_b, n_grid, & - !$OMP int2_grad1_u12, tmp1) + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(p, s, i, ipoint) & + !$OMP SHARED(n_mo, ne_b, n_grid, & + !$OMP int2_grad1_u12, tmpC) !$OMP DO COLLAPSE(2) - do s = 1, mo_num - do p = 1, mo_num + do s = 1, n_mo + do p = 1, n_mo do ipoint = 1, n_grid - tmp1(ipoint,1,p,s) = int2_grad1_u12(ipoint,1,p,s) - tmp1(ipoint,2,p,s) = int2_grad1_u12(ipoint,2,p,s) - tmp1(ipoint,3,p,s) = int2_grad1_u12(ipoint,3,p,s) + tmpC(ipoint,1,p,s) = int2_grad1_u12(ipoint,1,p,s) + tmpC(ipoint,2,p,s) = int2_grad1_u12(ipoint,2,p,s) + tmpC(ipoint,3,p,s) = int2_grad1_u12(ipoint,3,p,s) enddo - tmp1(:,4,p,s) = 0.d0 + tmpC(:,4,p,s) = 0.d0 do i = 1, ne_b do ipoint = 1, n_grid - tmp1(ipoint,4,p,s) = tmp1(ipoint,4,p,s) + int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,s) & + tmpC(ipoint,4,p,s) = tmpC(ipoint,4,p,s) + int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,s) & + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,s) & + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,s) enddo @@ -149,41 +149,41 @@ subroutine provide_no_1e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r, !$OMP END DO !$OMP END PARALLEL - call dgemv( 'T', 4*n_grid, mo_num*mo_num, 2.d0 & - , tmp1(1,1,1,1), size(tmp1, 1) * size(tmp1, 2) & - , tmp2(1,1), 1 & + call dgemv( 'T', 4*n_grid, n_mo*n_mo, 2.d0 & + , tmpC(1,1,1,1), size(tmpC, 1) * size(tmpC, 2) & + , tmpD(1,1), 1 & , 0.d0, noL_1e(1,1), 1) - deallocate(tmp1, tmp2) + deallocate(tmpC, tmpD) ! --- - allocate(tmp_L(n_grid,3,mo_num)) - allocate(tmp_R(n_grid,3,mo_num)) + allocate(tmpL(n_grid,3,n_mo)) + allocate(tmpR(n_grid,3,n_mo)) - !$OMP PARALLEL & - !$OMP DEFAULT(NONE) & - !$OMP PRIVATE(p, i, ipoint) & - !$OMP SHARED(ne_b, n_grid, mo_num, & - !$OMP mos_l_in_r, mos_r_in_r, & - !$OMP int2_grad1_u12, tmp_L, tmp_R) + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(p, i, ipoint) & + !$OMP SHARED(ne_b, n_grid, n_mo, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmpL, tmpR) !$OMP DO - do p = 1, mo_num + do p = 1, n_mo - tmp_L(:,1:3,p) = 0.d0 - tmp_R(:,1:3,p) = 0.d0 + tmpL(:,1:3,p) = 0.d0 + tmpR(:,1:3,p) = 0.d0 do i = 1, ne_b do ipoint = 1, n_grid - tmp_L(ipoint,1,p) = tmp_L(ipoint,1,p) + int2_grad1_u12(ipoint,1,p,i) * mos_l_in_r(ipoint,i) - tmp_L(ipoint,2,p) = tmp_L(ipoint,2,p) + int2_grad1_u12(ipoint,2,p,i) * mos_l_in_r(ipoint,i) - tmp_L(ipoint,3,p) = tmp_L(ipoint,3,p) + int2_grad1_u12(ipoint,3,p,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,1,p) = tmpL(ipoint,1,p) + int2_grad1_u12(ipoint,1,p,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,2,p) = tmpL(ipoint,2,p) + int2_grad1_u12(ipoint,2,p,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,3,p) = tmpL(ipoint,3,p) + int2_grad1_u12(ipoint,3,p,i) * mos_l_in_r(ipoint,i) - tmp_R(ipoint,1,p) = tmp_R(ipoint,1,p) + int2_grad1_u12(ipoint,1,i,p) * mos_r_in_r(ipoint,i) - tmp_R(ipoint,2,p) = tmp_R(ipoint,2,p) + int2_grad1_u12(ipoint,2,i,p) * mos_r_in_r(ipoint,i) - tmp_R(ipoint,3,p) = tmp_R(ipoint,3,p) + int2_grad1_u12(ipoint,3,i,p) * mos_r_in_r(ipoint,i) + tmpR(ipoint,1,p) = tmpR(ipoint,1,p) + int2_grad1_u12(ipoint,1,i,p) * mos_r_in_r(ipoint,i) + tmpR(ipoint,2,p) = tmpR(ipoint,2,p) + int2_grad1_u12(ipoint,2,i,p) * mos_r_in_r(ipoint,i) + tmpR(ipoint,3,p) = tmpR(ipoint,3,p) + int2_grad1_u12(ipoint,3,i,p) * mos_r_in_r(ipoint,i) enddo enddo enddo ! p @@ -192,45 +192,45 @@ subroutine provide_no_1e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r, ! --- - allocate(tmp3(n_grid,5,mo_num)) - allocate(tmp4(n_grid,5,mo_num)) + allocate(tmpE(n_grid,5,n_mo)) + allocate(tmpF(n_grid,5,n_mo)) - !$OMP PARALLEL & - !$OMP DEFAULT(NONE) & - !$OMP PRIVATE(p, i, j, ipoint) & - !$OMP SHARED(ne_b, n_grid, mo_num, & - !$OMP mos_l_in_r, mos_r_in_r, & - !$OMP int2_grad1_u12, wr1, & - !$OMP tmp_L, tmp_R, tmp_J, tmp_S, tmp3, tmp4) + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(p, i, j, ipoint) & + !$OMP SHARED(ne_b, n_grid, n_mo, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, wr1, & + !$OMP tmpL, tmpR, tmpJ, tmpS, tmpE, tmpF) !$OMP DO - do p = 1, mo_num + do p = 1, n_mo do ipoint = 1, n_grid - tmp3(ipoint,1,p) = wr1(ipoint) * mos_l_in_r(ipoint,p) - tmp3(ipoint,2,p) = -2.d0 * (tmp_L(ipoint,1,p) * tmp_J(ipoint,1) + tmp_L(ipoint,2,p) * tmp_J(ipoint,2) + tmp_L(ipoint,3,p) * tmp_J(ipoint,3)) - tmp3(ipoint,3,p) = wr1(ipoint) * tmp_L(ipoint,1,p) - tmp3(ipoint,4,p) = wr1(ipoint) * tmp_L(ipoint,2,p) - tmp3(ipoint,5,p) = wr1(ipoint) * tmp_L(ipoint,3,p) + tmpE(ipoint,1,p) = wr1(ipoint) * mos_l_in_r(ipoint,p) + tmpE(ipoint,2,p) = -2.d0 * (tmpL(ipoint,1,p) * tmpJ(ipoint,1) + tmpL(ipoint,2,p) * tmpJ(ipoint,2) + tmpL(ipoint,3,p) * tmpJ(ipoint,3)) + tmpE(ipoint,3,p) = wr1(ipoint) * tmpL(ipoint,1,p) + tmpE(ipoint,4,p) = wr1(ipoint) * tmpL(ipoint,2,p) + tmpE(ipoint,5,p) = wr1(ipoint) * tmpL(ipoint,3,p) - tmp4(ipoint,1,p) = -2.d0 * (tmp_R(ipoint,1,p) * tmp_J(ipoint,1) + tmp_R(ipoint,2,p) * tmp_J(ipoint,2) + tmp_R(ipoint,3,p) * tmp_J(ipoint,3)) & - + mos_r_in_r(ipoint,p) * tmp_S(ipoint) - tmp4(ipoint,2,p) = wr1(ipoint) * mos_r_in_r(ipoint,p) - tmp4(ipoint,3,p) = tmp_R(ipoint,1,p) - tmp4(ipoint,4,p) = tmp_R(ipoint,2,p) - tmp4(ipoint,5,p) = tmp_R(ipoint,3,p) + tmpF(ipoint,1,p) = -2.d0 * (tmpR(ipoint,1,p) * tmpJ(ipoint,1) + tmpR(ipoint,2,p) * tmpJ(ipoint,2) + tmpR(ipoint,3,p) * tmpJ(ipoint,3)) & + + mos_r_in_r(ipoint,p) * tmpS(ipoint) + tmpF(ipoint,2,p) = wr1(ipoint) * mos_r_in_r(ipoint,p) + tmpF(ipoint,3,p) = tmpR(ipoint,1,p) + tmpF(ipoint,4,p) = tmpR(ipoint,2,p) + tmpF(ipoint,5,p) = tmpR(ipoint,3,p) enddo do i = 1, ne_b do j = 1, ne_b do ipoint = 1, n_grid - tmp3(ipoint,2,p) = tmp3(ipoint,2,p) + mos_l_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,j) & + tmpE(ipoint,2,p) = tmpE(ipoint,2,p) + mos_l_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,j) & + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,j) & + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,j) ) - tmp4(ipoint,1,p) = tmp4(ipoint,1,p) + mos_r_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,p) & + tmpF(ipoint,1,p) = tmpF(ipoint,1,p) + mos_r_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,p) & + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,p) & + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,p) ) enddo ! ipoint @@ -241,40 +241,40 @@ subroutine provide_no_1e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r, !$OMP END DO !$OMP END PARALLEL - deallocate(tmp_L, tmp_R, tmp_J, tmp_S) + deallocate(tmpL, tmpR, tmpJ, tmpS) - call dgemm( 'T', 'N', mo_num, mo_num, 5*n_grid, 1.d0 & - , tmp3(1,1,1), 5*n_grid, tmp4(1,1,1), 5*n_grid & - , 1.d0, noL_1e(1,1), mo_num) + call dgemm( 'T', 'N', n_mo, n_mo, 5*n_grid, 1.d0 & + , tmpE(1,1,1), 5*n_grid, tmpF(1,1,1), 5*n_grid & + , 1.d0, noL_1e(1,1), n_mo) - deallocate(tmp3, tmp4) + deallocate(tmpE, tmpF) ! --- else - allocate(tmp_O(n_grid), tmp_J(n_grid,3)) - tmp_O = 0.d0 - tmp_J = 0.d0 + allocate(tmpO(n_grid), tmpJ(n_grid,3)) + tmpO = 0.d0 + tmpJ = 0.d0 - !$OMP PARALLEL & - !$OMP DEFAULT(NONE) & - !$OMP PRIVATE(i, ipoint, tmp_O_priv, tmp_J_priv) & - !$OMP SHARED(ne_b, ne_a, n_grid, & - !$OMP mos_l_in_r, mos_r_in_r, & - !$OMP int2_grad1_u12, tmp_O, tmp_J) + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(i, ipoint, tmpO_priv, tmpJ_priv) & + !$OMP SHARED(ne_b, ne_a, n_grid, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmpO, tmpJ) - allocate(tmp_O_priv(n_grid), tmp_J_priv(n_grid,3)) - tmp_O_priv = 0.d0 - tmp_J_priv = 0.d0 + allocate(tmpO_priv(n_grid), tmpJ_priv(n_grid,3)) + tmpO_priv = 0.d0 + tmpJ_priv = 0.d0 !$OMP DO do i = 1, ne_b do ipoint = 1, n_grid - tmp_O_priv(ipoint) = tmp_O_priv(ipoint) + mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i) - tmp_J_priv(ipoint,1) = tmp_J_priv(ipoint,1) + int2_grad1_u12(ipoint,1,i,i) - tmp_J_priv(ipoint,2) = tmp_J_priv(ipoint,2) + int2_grad1_u12(ipoint,2,i,i) - tmp_J_priv(ipoint,3) = tmp_J_priv(ipoint,3) + int2_grad1_u12(ipoint,3,i,i) + tmpO_priv(ipoint) = tmpO_priv(ipoint) + mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i) + tmpJ_priv(ipoint,1) = tmpJ_priv(ipoint,1) + int2_grad1_u12(ipoint,1,i,i) + tmpJ_priv(ipoint,2) = tmpJ_priv(ipoint,2) + int2_grad1_u12(ipoint,2,i,i) + tmpJ_priv(ipoint,3) = tmpJ_priv(ipoint,3) + int2_grad1_u12(ipoint,3,i,i) enddo enddo !$OMP END DO NOWAIT @@ -282,51 +282,51 @@ subroutine provide_no_1e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r, !$OMP DO do i = ne_b+1, ne_a do ipoint = 1, n_grid - tmp_O_priv(ipoint) = tmp_O_priv(ipoint) + 0.5d0 * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i) - tmp_J_priv(ipoint,1) = tmp_J_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,i) - tmp_J_priv(ipoint,2) = tmp_J_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,i) - tmp_J_priv(ipoint,3) = tmp_J_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,i) + tmpO_priv(ipoint) = tmpO_priv(ipoint) + 0.5d0 * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i) + tmpJ_priv(ipoint,1) = tmpJ_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,i) + tmpJ_priv(ipoint,2) = tmpJ_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,i) + tmpJ_priv(ipoint,3) = tmpJ_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,i) enddo enddo !$OMP END DO NOWAIT !$OMP CRITICAL - tmp_O = tmp_O + tmp_O_priv - tmp_J = tmp_J + tmp_J_priv + tmpO = tmpO + tmpO_priv + tmpJ = tmpJ + tmpJ_priv !$OMP END CRITICAL - deallocate(tmp_O_priv, tmp_J_priv) + deallocate(tmpO_priv, tmpJ_priv) !$OMP END PARALLEL ! --- - allocate(tmp_M(n_grid,3), tmp_S(n_grid)) - tmp_M = 0.d0 - tmp_S = 0.d0 + allocate(tmpM(n_grid,3), tmpS(n_grid)) + tmpM = 0.d0 + tmpS = 0.d0 - !$OMP PARALLEL & - !$OMP DEFAULT(NONE) & - !$OMP PRIVATE(i, j, ipoint, tmp_M_priv, tmp_S_priv) & - !$OMP SHARED(ne_b, ne_a, n_grid, & - !$OMP mos_l_in_r, mos_r_in_r, & - !$OMP int2_grad1_u12, tmp_M, tmp_S) + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(i, j, ipoint, tmpM_priv, tmpS_priv) & + !$OMP SHARED(ne_b, ne_a, n_grid, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmpM, tmpS) - allocate(tmp_M_priv(n_grid,3), tmp_S_priv(n_grid)) - tmp_M_priv = 0.d0 - tmp_S_priv = 0.d0 + allocate(tmpM_priv(n_grid,3), tmpS_priv(n_grid)) + tmpM_priv = 0.d0 + tmpS_priv = 0.d0 !$OMP DO COLLAPSE(2) do i = 1, ne_b do j = 1, ne_b do ipoint = 1, n_grid - tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) - tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) - tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) - tmp_S_priv(ipoint) = tmp_S_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) & - + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) & - + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i) + tmpS_priv(ipoint) = tmpS_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) & + + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) & + + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i) enddo enddo enddo @@ -337,17 +337,17 @@ subroutine provide_no_1e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r, do j = 1, ne_b do ipoint = 1, n_grid - tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) - tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) - tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) - tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i) - tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i) - tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i) + tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i) + tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i) + tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i) - tmp_S_priv(ipoint) = tmp_S_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) & - + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) & - + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i) + tmpS_priv(ipoint) = tmpS_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) & + + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) & + + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i) enddo enddo enddo @@ -358,70 +358,70 @@ subroutine provide_no_1e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r, do j = ne_b+1, ne_a do ipoint = 1, n_grid - tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) - tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) - tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) - tmp_S_priv(ipoint) = tmp_S_priv(ipoint) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) & - + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) & - + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i) + tmpS_priv(ipoint) = tmpS_priv(ipoint) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) & + + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) & + + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i) enddo enddo enddo !$OMP END DO NOWAIT !$OMP CRITICAL - tmp_M = tmp_M + tmp_M_priv - tmp_S = tmp_S + tmp_S_priv + tmpM = tmpM + tmpM_priv + tmpS = tmpS + tmpS_priv !$OMP END CRITICAL - deallocate(tmp_M_priv, tmp_S_priv) + deallocate(tmpM_priv, tmpS_priv) !$OMP END PARALLEL ! --- - allocate(tmp2(n_grid,4)) - allocate(tmp1(n_grid,4,mo_num,mo_num)) + allocate(tmpC(n_grid,4,n_mo,n_mo)) + allocate(tmpD(n_grid,4)) do ipoint = 1, n_grid - tmp2(ipoint,1) = wr1(ipoint) * (2.d0 * tmp_O(ipoint) * tmp_J(ipoint,1) - tmp_M(ipoint,1)) - tmp2(ipoint,2) = wr1(ipoint) * (2.d0 * tmp_O(ipoint) * tmp_J(ipoint,2) - tmp_M(ipoint,2)) - tmp2(ipoint,3) = wr1(ipoint) * (2.d0 * tmp_O(ipoint) * tmp_J(ipoint,3) - tmp_M(ipoint,3)) - tmp2(ipoint,4) = -wr1(ipoint) * tmp_O(ipoint) + tmpD(ipoint,1) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,1) - tmpM(ipoint,1)) + tmpD(ipoint,2) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,2) - tmpM(ipoint,2)) + tmpD(ipoint,3) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,3) - tmpM(ipoint,3)) + tmpD(ipoint,4) = -wr1(ipoint) * tmpO(ipoint) - tmp_S(ipoint) = 2.d0 * (tmp_J(ipoint,1) * tmp_J(ipoint,1) + tmp_J(ipoint,2) * tmp_J(ipoint,2) + tmp_J(ipoint,3) * tmp_J(ipoint,3)) - tmp_S(ipoint) + tmpS(ipoint) = 2.d0 * (tmpJ(ipoint,1) * tmpJ(ipoint,1) + tmpJ(ipoint,2) * tmpJ(ipoint,2) + tmpJ(ipoint,3) * tmpJ(ipoint,3)) - tmpS(ipoint) enddo - deallocate(tmp_O, tmp_M) + deallocate(tmpO, tmpM) - !$OMP PARALLEL & - !$OMP DEFAULT(NONE) & - !$OMP PRIVATE(p, s, i, ipoint) & - !$OMP SHARED(mo_num, ne_b, n_grid, & - !$OMP ne_a, int2_grad1_u12, tmp1) + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(p, s, i, ipoint) & + !$OMP SHARED(n_mo, ne_b, n_grid, & + !$OMP ne_a, int2_grad1_u12, tmpC) !$OMP DO COLLAPSE(2) - do s = 1, mo_num - do p = 1, mo_num + do s = 1, n_mo + do p = 1, n_mo do ipoint = 1, n_grid - tmp1(ipoint,1,p,s) = int2_grad1_u12(ipoint,1,p,s) - tmp1(ipoint,2,p,s) = int2_grad1_u12(ipoint,2,p,s) - tmp1(ipoint,3,p,s) = int2_grad1_u12(ipoint,3,p,s) + tmpC(ipoint,1,p,s) = int2_grad1_u12(ipoint,1,p,s) + tmpC(ipoint,2,p,s) = int2_grad1_u12(ipoint,2,p,s) + tmpC(ipoint,3,p,s) = int2_grad1_u12(ipoint,3,p,s) enddo - tmp1(:,4,p,s) = 0.d0 + tmpC(:,4,p,s) = 0.d0 do i = 1, ne_b do ipoint = 1, n_grid - tmp1(ipoint,4,p,s) = tmp1(ipoint,4,p,s) + int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,s) & + tmpC(ipoint,4,p,s) = tmpC(ipoint,4,p,s) + int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,s) & + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,s) & + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,s) enddo enddo do i = ne_b+1, ne_a do ipoint = 1, n_grid - tmp1(ipoint,4,p,s) = tmp1(ipoint,4,p,s) + 0.5d0 * int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,s) & + tmpC(ipoint,4,p,s) = tmpC(ipoint,4,p,s) + 0.5d0 * int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,s) & + 0.5d0 * int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,s) & + 0.5d0 * int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,s) enddo @@ -432,55 +432,55 @@ subroutine provide_no_1e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r, !$OMP END DO !$OMP END PARALLEL - call dgemv( 'T', 4*n_grid, mo_num*mo_num, 2.d0 & - , tmp1(1,1,1,1), size(tmp1, 1) * size(tmp1, 2) & - , tmp2(1,1), 1 & + call dgemv( 'T', 4*n_grid, n_mo*n_mo, 2.d0 & + , tmpC(1,1,1,1), size(tmpC, 1) * size(tmpC, 2) & + , tmpD(1,1), 1 & , 0.d0, noL_1e(1,1), 1) - deallocate(tmp1, tmp2) + deallocate(tmpC, tmpD) ! --- - allocate(tmp_L(n_grid,3,mo_num), tmp_L0(n_grid,3,mo_num)) - allocate(tmp_R(n_grid,3,mo_num), tmp_R0(n_grid,3,mo_num)) + allocate(tmpL(n_grid,3,n_mo), tmpL0(n_grid,3,n_mo)) + allocate(tmpR(n_grid,3,n_mo), tmpR0(n_grid,3,n_mo)) - !$OMP PARALLEL & - !$OMP DEFAULT(NONE) & - !$OMP PRIVATE(p, i, ipoint) & - !$OMP SHARED(ne_b, ne_a, n_grid, mo_num, & - !$OMP mos_l_in_r, mos_r_in_r, & - !$OMP int2_grad1_u12, tmp_L0, tmp_R0, tmp_L, tmp_R) + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(p, i, ipoint) & + !$OMP SHARED(ne_b, ne_a, n_grid, n_mo, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmpL0, tmpR0, tmpL, tmpR) !$OMP DO - do p = 1, mo_num + do p = 1, n_mo - tmp_L0(:,1:3,p) = 0.d0 - tmp_R0(:,1:3,p) = 0.d0 + tmpL0(:,1:3,p) = 0.d0 + tmpR0(:,1:3,p) = 0.d0 do i = ne_b+1, ne_a do ipoint = 1, n_grid - tmp_L0(ipoint,1,p) = tmp_L0(ipoint,1,p) + 0.5d0 * int2_grad1_u12(ipoint,1,p,i) * mos_l_in_r(ipoint,i) - tmp_L0(ipoint,2,p) = tmp_L0(ipoint,2,p) + 0.5d0 * int2_grad1_u12(ipoint,2,p,i) * mos_l_in_r(ipoint,i) - tmp_L0(ipoint,3,p) = tmp_L0(ipoint,3,p) + 0.5d0 * int2_grad1_u12(ipoint,3,p,i) * mos_l_in_r(ipoint,i) + tmpL0(ipoint,1,p) = tmpL0(ipoint,1,p) + 0.5d0 * int2_grad1_u12(ipoint,1,p,i) * mos_l_in_r(ipoint,i) + tmpL0(ipoint,2,p) = tmpL0(ipoint,2,p) + 0.5d0 * int2_grad1_u12(ipoint,2,p,i) * mos_l_in_r(ipoint,i) + tmpL0(ipoint,3,p) = tmpL0(ipoint,3,p) + 0.5d0 * int2_grad1_u12(ipoint,3,p,i) * mos_l_in_r(ipoint,i) - tmp_R0(ipoint,1,p) = tmp_R0(ipoint,1,p) + 0.5d0 * int2_grad1_u12(ipoint,1,i,p) * mos_r_in_r(ipoint,i) - tmp_R0(ipoint,2,p) = tmp_R0(ipoint,2,p) + 0.5d0 * int2_grad1_u12(ipoint,2,i,p) * mos_r_in_r(ipoint,i) - tmp_R0(ipoint,3,p) = tmp_R0(ipoint,3,p) + 0.5d0 * int2_grad1_u12(ipoint,3,i,p) * mos_r_in_r(ipoint,i) + tmpR0(ipoint,1,p) = tmpR0(ipoint,1,p) + 0.5d0 * int2_grad1_u12(ipoint,1,i,p) * mos_r_in_r(ipoint,i) + tmpR0(ipoint,2,p) = tmpR0(ipoint,2,p) + 0.5d0 * int2_grad1_u12(ipoint,2,i,p) * mos_r_in_r(ipoint,i) + tmpR0(ipoint,3,p) = tmpR0(ipoint,3,p) + 0.5d0 * int2_grad1_u12(ipoint,3,i,p) * mos_r_in_r(ipoint,i) enddo enddo - tmp_L(:,1:3,p) = tmp_L0(:,1:3,p) - tmp_R(:,1:3,p) = tmp_R0(:,1:3,p) + tmpL(:,1:3,p) = tmpL0(:,1:3,p) + tmpR(:,1:3,p) = tmpR0(:,1:3,p) do i = 1, ne_b do ipoint = 1, n_grid - tmp_L(ipoint,1,p) = tmp_L(ipoint,1,p) + int2_grad1_u12(ipoint,1,p,i) * mos_l_in_r(ipoint,i) - tmp_L(ipoint,2,p) = tmp_L(ipoint,2,p) + int2_grad1_u12(ipoint,2,p,i) * mos_l_in_r(ipoint,i) - tmp_L(ipoint,3,p) = tmp_L(ipoint,3,p) + int2_grad1_u12(ipoint,3,p,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,1,p) = tmpL(ipoint,1,p) + int2_grad1_u12(ipoint,1,p,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,2,p) = tmpL(ipoint,2,p) + int2_grad1_u12(ipoint,2,p,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,3,p) = tmpL(ipoint,3,p) + int2_grad1_u12(ipoint,3,p,i) * mos_l_in_r(ipoint,i) - tmp_R(ipoint,1,p) = tmp_R(ipoint,1,p) + int2_grad1_u12(ipoint,1,i,p) * mos_r_in_r(ipoint,i) - tmp_R(ipoint,2,p) = tmp_R(ipoint,2,p) + int2_grad1_u12(ipoint,2,i,p) * mos_r_in_r(ipoint,i) - tmp_R(ipoint,3,p) = tmp_R(ipoint,3,p) + int2_grad1_u12(ipoint,3,i,p) * mos_r_in_r(ipoint,i) + tmpR(ipoint,1,p) = tmpR(ipoint,1,p) + int2_grad1_u12(ipoint,1,i,p) * mos_r_in_r(ipoint,i) + tmpR(ipoint,2,p) = tmpR(ipoint,2,p) + int2_grad1_u12(ipoint,2,i,p) * mos_r_in_r(ipoint,i) + tmpR(ipoint,3,p) = tmpR(ipoint,3,p) + int2_grad1_u12(ipoint,3,i,p) * mos_r_in_r(ipoint,i) enddo enddo @@ -490,51 +490,51 @@ subroutine provide_no_1e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r, ! --- - allocate(tmp3(n_grid,8,mo_num)) - allocate(tmp4(n_grid,8,mo_num)) + allocate(tmpE(n_grid,8,n_mo)) + allocate(tmpF(n_grid,8,n_mo)) - !$OMP PARALLEL & - !$OMP DEFAULT(NONE) & - !$OMP PRIVATE(p, i, j, ipoint) & - !$OMP SHARED(ne_b, ne_a, n_grid, mo_num, & - !$OMP mos_l_in_r, mos_r_in_r, & - !$OMP int2_grad1_u12, wr1, & - !$OMP tmp_L, tmp_L0, tmp_R, tmp_R0, tmp_J, tmp_S, tmp3, tmp4) + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(p, i, j, ipoint) & + !$OMP SHARED(ne_b, ne_a, n_grid, n_mo, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, wr1, & + !$OMP tmpL, tmpL0, tmpR, tmpR0, tmpJ, tmpS, tmpE, tmpF) !$OMP DO - do p = 1, mo_num + do p = 1, n_mo do ipoint = 1, n_grid - tmp3(ipoint,1,p) = wr1(ipoint) * mos_l_in_r(ipoint,p) - tmp3(ipoint,2,p) = -2.d0 * (tmp_L(ipoint,1,p) * tmp_J(ipoint,1) + tmp_L(ipoint,2,p) * tmp_J(ipoint,2) + tmp_L(ipoint,3,p) * tmp_J(ipoint,3)) - tmp3(ipoint,3,p) = wr1(ipoint) * tmp_L(ipoint,1,p) - tmp3(ipoint,4,p) = wr1(ipoint) * tmp_L(ipoint,2,p) - tmp3(ipoint,5,p) = wr1(ipoint) * tmp_L(ipoint,3,p) - tmp3(ipoint,6,p) = wr1(ipoint) * tmp_L0(ipoint,1,p) - tmp3(ipoint,7,p) = wr1(ipoint) * tmp_L0(ipoint,2,p) - tmp3(ipoint,8,p) = wr1(ipoint) * tmp_L0(ipoint,3,p) + tmpE(ipoint,1,p) = wr1(ipoint) * mos_l_in_r(ipoint,p) + tmpE(ipoint,2,p) = -2.d0 * (tmpL(ipoint,1,p) * tmpJ(ipoint,1) + tmpL(ipoint,2,p) * tmpJ(ipoint,2) + tmpL(ipoint,3,p) * tmpJ(ipoint,3)) + tmpE(ipoint,3,p) = wr1(ipoint) * tmpL(ipoint,1,p) + tmpE(ipoint,4,p) = wr1(ipoint) * tmpL(ipoint,2,p) + tmpE(ipoint,5,p) = wr1(ipoint) * tmpL(ipoint,3,p) + tmpE(ipoint,6,p) = wr1(ipoint) * tmpL0(ipoint,1,p) + tmpE(ipoint,7,p) = wr1(ipoint) * tmpL0(ipoint,2,p) + tmpE(ipoint,8,p) = wr1(ipoint) * tmpL0(ipoint,3,p) - tmp4(ipoint,1,p) = -2.d0 * (tmp_R(ipoint,1,p) * tmp_J(ipoint,1) + tmp_R(ipoint,2,p) * tmp_J(ipoint,2) + tmp_R(ipoint,3,p) * tmp_J(ipoint,3)) & - + mos_r_in_r(ipoint,p) * tmp_S(ipoint) - tmp4(ipoint,2,p) = wr1(ipoint) * mos_r_in_r(ipoint,p) - tmp4(ipoint,3,p) = tmp_R(ipoint,1,p) - tmp4(ipoint,4,p) = tmp_R(ipoint,2,p) - tmp4(ipoint,5,p) = tmp_R(ipoint,3,p) - tmp4(ipoint,6,p) = tmp_R0(ipoint,1,p) - tmp4(ipoint,7,p) = tmp_R0(ipoint,2,p) - tmp4(ipoint,8,p) = tmp_R0(ipoint,3,p) + tmpF(ipoint,1,p) = -2.d0 * (tmpR(ipoint,1,p) * tmpJ(ipoint,1) + tmpR(ipoint,2,p) * tmpJ(ipoint,2) + tmpR(ipoint,3,p) * tmpJ(ipoint,3)) & + + mos_r_in_r(ipoint,p) * tmpS(ipoint) + tmpF(ipoint,2,p) = wr1(ipoint) * mos_r_in_r(ipoint,p) + tmpF(ipoint,3,p) = tmpR(ipoint,1,p) + tmpF(ipoint,4,p) = tmpR(ipoint,2,p) + tmpF(ipoint,5,p) = tmpR(ipoint,3,p) + tmpF(ipoint,6,p) = tmpR0(ipoint,1,p) + tmpF(ipoint,7,p) = tmpR0(ipoint,2,p) + tmpF(ipoint,8,p) = tmpR0(ipoint,3,p) enddo do i = 1, ne_b do j = 1, ne_b do ipoint = 1, n_grid - tmp3(ipoint,2,p) = tmp3(ipoint,2,p) + mos_l_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,j) & + tmpE(ipoint,2,p) = tmpE(ipoint,2,p) + mos_l_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,j) & + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,j) & + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,j) ) - tmp4(ipoint,1,p) = tmp4(ipoint,1,p) + mos_r_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,p) & + tmpF(ipoint,1,p) = tmpF(ipoint,1,p) + mos_r_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,p) & + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,p) & + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,p) ) enddo ! ipoint @@ -545,17 +545,17 @@ subroutine provide_no_1e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r, do j = 1, ne_b do ipoint = 1, n_grid - tmp3(ipoint,2,p) = tmp3(ipoint,2,p) + 0.5d0 * mos_l_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,j) & + tmpE(ipoint,2,p) = tmpE(ipoint,2,p) + 0.5d0 * mos_l_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,j) & + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,j) & + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,j) ) - tmp3(ipoint,2,p) = tmp3(ipoint,2,p) + 0.5d0 * mos_l_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,p,j) * int2_grad1_u12(ipoint,1,j,i) & + tmpE(ipoint,2,p) = tmpE(ipoint,2,p) + 0.5d0 * mos_l_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,p,j) * int2_grad1_u12(ipoint,1,j,i) & + int2_grad1_u12(ipoint,2,p,j) * int2_grad1_u12(ipoint,2,j,i) & + int2_grad1_u12(ipoint,3,p,j) * int2_grad1_u12(ipoint,3,j,i) ) - tmp4(ipoint,1,p) = tmp4(ipoint,1,p) + 0.5d0 * mos_r_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,p) & + tmpF(ipoint,1,p) = tmpF(ipoint,1,p) + 0.5d0 * mos_r_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,p) & + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,p) & + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,p) ) - tmp4(ipoint,1,p) = tmp4(ipoint,1,p) + 0.5d0 * mos_r_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,j,i) * int2_grad1_u12(ipoint,1,i,p) & + tmpF(ipoint,1,p) = tmpF(ipoint,1,p) + 0.5d0 * mos_r_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,j,i) * int2_grad1_u12(ipoint,1,i,p) & + int2_grad1_u12(ipoint,2,j,i) * int2_grad1_u12(ipoint,2,i,p) & + int2_grad1_u12(ipoint,3,j,i) * int2_grad1_u12(ipoint,3,i,p) ) enddo ! ipoint @@ -566,11 +566,11 @@ subroutine provide_no_1e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r, do j = ne_b+1, ne_a do ipoint = 1, n_grid - tmp3(ipoint,2,p) = tmp3(ipoint,2,p) + 0.5d0 * mos_l_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,j) & + tmpE(ipoint,2,p) = tmpE(ipoint,2,p) + 0.5d0 * mos_l_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,j) & + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,j) & + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,j) ) - tmp4(ipoint,1,p) = tmp4(ipoint,1,p) + 0.5d0 * mos_r_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,p) & + tmpF(ipoint,1,p) = tmpF(ipoint,1,p) + 0.5d0 * mos_r_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,p) & + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,p) & + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,p) ) enddo ! ipoint @@ -581,13 +581,590 @@ subroutine provide_no_1e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r, !$OMP END DO !$OMP END PARALLEL - deallocate(tmp_L0, tmp_L, tmp_R0, tmp_R, tmp_J, tmp_S) + deallocate(tmpL0, tmpL, tmpR0, tmpR, tmpJ, tmpS) - call dgemm( 'T', 'N', mo_num, mo_num, 8*n_grid, 1.d0 & - , tmp3(1,1,1), 8*n_grid, tmp4(1,1,1), 8*n_grid & - , 1.d0, noL_1e(1,1), mo_num) + call dgemm( 'T', 'N', n_mo, n_mo, 8*n_grid, 1.d0 & + , tmpE(1,1,1), 8*n_grid, tmpF(1,1,1), 8*n_grid & + , 1.d0, noL_1e(1,1), n_mo) - deallocate(tmp3, tmp4) + deallocate(tmpE, tmpF) + + endif + + + call wall_time(t1) + write(*,"(A,2X,F15.7)") ' wall time for noL_1e (sec) = ', (t1 - t0) + + return +end + +! --- + +subroutine provide_no_1e_tmp(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r, int2_grad1_u12, & + tmpO, tmpJ, tmpM, tmpS, tmpC, tmpD, tmpL, tmpR, tmpE, tmpF, noL_1e) + + + implicit none + + integer, intent(in) :: n_grid, n_mo + integer, intent(in) :: ne_a, ne_b + double precision, intent(in) :: wr1(n_grid) + double precision, intent(in) :: mos_l_in_r(n_grid,n_mo) + double precision, intent(in) :: mos_r_in_r(n_grid,n_mo) + double precision, intent(in) :: int2_grad1_u12(n_grid,3,n_mo,n_mo) + double precision, intent(out) :: tmpO(n_grid), tmpJ(n_grid,3) + double precision, intent(out) :: tmpM(n_grid,3), tmpS(n_grid) + double precision, intent(out) :: tmpC(n_grid,4,n_mo,n_mo), tmpD(n_grid,4) + double precision, intent(out) :: tmpL(n_grid,3,n_mo), tmpR(n_grid,3,n_mo) + double precision, intent(out) :: tmpE(n_grid,5,n_mo), tmpF(n_grid,5,n_mo) + double precision, intent(out) :: noL_1e(n_mo,n_mo) + + integer :: p, s, i, j, ipoint + double precision :: t0, t1 + double precision, allocatable :: tmpM_priv(:,:), tmpS_priv(:), tmpO_priv(:), tmpJ_priv(:,:) + double precision, allocatable :: tmpL0(:,:,:), tmpR0(:,:,:) + double precision, allocatable :: tmpE_os(:,:,:), tmpF_os(:,:,:) + + + call wall_time(t0) + + + if(ne_a .eq. ne_b) then + + tmpO = 0.d0 + tmpJ = 0.d0 + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(i, ipoint, tmpO_priv, tmpJ_priv) & + !$OMP SHARED(ne_b, n_grid, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmpO, tmpJ) + + allocate(tmpO_priv(n_grid), tmpJ_priv(n_grid,3)) + tmpO_priv = 0.d0 + tmpJ_priv = 0.d0 + + !$OMP DO + do i = 1, ne_b + do ipoint = 1, n_grid + tmpO_priv(ipoint) = tmpO_priv(ipoint) + mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i) + tmpJ_priv(ipoint,1) = tmpJ_priv(ipoint,1) + int2_grad1_u12(ipoint,1,i,i) + tmpJ_priv(ipoint,2) = tmpJ_priv(ipoint,2) + int2_grad1_u12(ipoint,2,i,i) + tmpJ_priv(ipoint,3) = tmpJ_priv(ipoint,3) + int2_grad1_u12(ipoint,3,i,i) + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP CRITICAL + tmpO = tmpO + tmpO_priv + tmpJ = tmpJ + tmpJ_priv + !$OMP END CRITICAL + + deallocate(tmpO_priv, tmpJ_priv) + !$OMP END PARALLEL + + ! --- + + tmpM = 0.d0 + tmpS = 0.d0 + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(i, j, ipoint, tmpM_priv, tmpS_priv) & + !$OMP SHARED(ne_b, n_grid, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmpM, tmpS) + + allocate(tmpM_priv(n_grid,3), tmpS_priv(n_grid)) + tmpM_priv = 0.d0 + tmpS_priv = 0.d0 + + !$OMP DO COLLAPSE(2) + do i = 1, ne_b + do j = 1, ne_b + do ipoint = 1, n_grid + + tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + + tmpS_priv(ipoint) = tmpS_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) & + + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) & + + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i) + enddo + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP CRITICAL + tmpM = tmpM + tmpM_priv + tmpS = tmpS + tmpS_priv + !$OMP END CRITICAL + + deallocate(tmpM_priv, tmpS_priv) + !$OMP END PARALLEL + + ! --- + + do ipoint = 1, n_grid + + tmpD(ipoint,1) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,1) - tmpM(ipoint,1)) + tmpD(ipoint,2) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,2) - tmpM(ipoint,2)) + tmpD(ipoint,3) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,3) - tmpM(ipoint,3)) + tmpD(ipoint,4) = -wr1(ipoint) * tmpO(ipoint) + + tmpS(ipoint) = 2.d0 * (tmpJ(ipoint,1) * tmpJ(ipoint,1) + tmpJ(ipoint,2) * tmpJ(ipoint,2) + tmpJ(ipoint,3) * tmpJ(ipoint,3)) - tmpS(ipoint) + enddo + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(p, s, i, ipoint) & + !$OMP SHARED(n_mo, ne_b, n_grid, & + !$OMP int2_grad1_u12, tmpC) + + !$OMP DO COLLAPSE(2) + do s = 1, n_mo + do p = 1, n_mo + + do ipoint = 1, n_grid + tmpC(ipoint,1,p,s) = int2_grad1_u12(ipoint,1,p,s) + tmpC(ipoint,2,p,s) = int2_grad1_u12(ipoint,2,p,s) + tmpC(ipoint,3,p,s) = int2_grad1_u12(ipoint,3,p,s) + enddo + + tmpC(:,4,p,s) = 0.d0 + do i = 1, ne_b + do ipoint = 1, n_grid + tmpC(ipoint,4,p,s) = tmpC(ipoint,4,p,s) + int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,s) & + + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,s) & + + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,s) + enddo + enddo + + enddo ! p + enddo ! s + !$OMP END DO + !$OMP END PARALLEL + + call dgemv( 'T', 4*n_grid, n_mo*n_mo, 2.d0 & + , tmpC(1,1,1,1), size(tmpC, 1) * size(tmpC, 2) & + , tmpD(1,1), 1 & + , 0.d0, noL_1e(1,1), 1) + + ! --- + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(p, i, ipoint) & + !$OMP SHARED(ne_b, n_grid, n_mo, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmpL, tmpR) + + !$OMP DO + do p = 1, n_mo + + tmpL(:,1:3,p) = 0.d0 + tmpR(:,1:3,p) = 0.d0 + + do i = 1, ne_b + do ipoint = 1, n_grid + + tmpL(ipoint,1,p) = tmpL(ipoint,1,p) + int2_grad1_u12(ipoint,1,p,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,2,p) = tmpL(ipoint,2,p) + int2_grad1_u12(ipoint,2,p,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,3,p) = tmpL(ipoint,3,p) + int2_grad1_u12(ipoint,3,p,i) * mos_l_in_r(ipoint,i) + + tmpR(ipoint,1,p) = tmpR(ipoint,1,p) + int2_grad1_u12(ipoint,1,i,p) * mos_r_in_r(ipoint,i) + tmpR(ipoint,2,p) = tmpR(ipoint,2,p) + int2_grad1_u12(ipoint,2,i,p) * mos_r_in_r(ipoint,i) + tmpR(ipoint,3,p) = tmpR(ipoint,3,p) + int2_grad1_u12(ipoint,3,i,p) * mos_r_in_r(ipoint,i) + enddo + enddo + enddo ! p + !$OMP END DO + !$OMP END PARALLEL + + ! --- + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(p, i, j, ipoint) & + !$OMP SHARED(ne_b, n_grid, n_mo, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, wr1, & + !$OMP tmpL, tmpR, tmpJ, tmpS, tmpE, tmpF) + + !$OMP DO + do p = 1, n_mo + + do ipoint = 1, n_grid + + tmpE(ipoint,1,p) = wr1(ipoint) * mos_l_in_r(ipoint,p) + tmpE(ipoint,2,p) = -2.d0 * (tmpL(ipoint,1,p) * tmpJ(ipoint,1) + tmpL(ipoint,2,p) * tmpJ(ipoint,2) + tmpL(ipoint,3,p) * tmpJ(ipoint,3)) + tmpE(ipoint,3,p) = wr1(ipoint) * tmpL(ipoint,1,p) + tmpE(ipoint,4,p) = wr1(ipoint) * tmpL(ipoint,2,p) + tmpE(ipoint,5,p) = wr1(ipoint) * tmpL(ipoint,3,p) + + tmpF(ipoint,1,p) = -2.d0 * (tmpR(ipoint,1,p) * tmpJ(ipoint,1) + tmpR(ipoint,2,p) * tmpJ(ipoint,2) + tmpR(ipoint,3,p) * tmpJ(ipoint,3)) & + + mos_r_in_r(ipoint,p) * tmpS(ipoint) + tmpF(ipoint,2,p) = wr1(ipoint) * mos_r_in_r(ipoint,p) + tmpF(ipoint,3,p) = tmpR(ipoint,1,p) + tmpF(ipoint,4,p) = tmpR(ipoint,2,p) + tmpF(ipoint,5,p) = tmpR(ipoint,3,p) + enddo + + do i = 1, ne_b + do j = 1, ne_b + do ipoint = 1, n_grid + + tmpE(ipoint,2,p) = tmpE(ipoint,2,p) + mos_l_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,j) & + + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,j) & + + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,j) ) + + tmpF(ipoint,1,p) = tmpF(ipoint,1,p) + mos_r_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,p) & + + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,p) & + + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,p) ) + enddo ! ipoint + enddo ! j + enddo ! i + + enddo ! p + !$OMP END DO + !$OMP END PARALLEL + + call dgemm( 'T', 'N', n_mo, n_mo, 5*n_grid, 1.d0 & + , tmpE(1,1,1), 5*n_grid, tmpF(1,1,1), 5*n_grid & + , 1.d0, noL_1e(1,1), n_mo) + + ! --- + + else + + tmpO = 0.d0 + tmpJ = 0.d0 + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(i, ipoint, tmpO_priv, tmpJ_priv) & + !$OMP SHARED(ne_b, ne_a, n_grid, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmpO, tmpJ) + + allocate(tmpO_priv(n_grid), tmpJ_priv(n_grid,3)) + tmpO_priv = 0.d0 + tmpJ_priv = 0.d0 + + !$OMP DO + do i = 1, ne_b + do ipoint = 1, n_grid + tmpO_priv(ipoint) = tmpO_priv(ipoint) + mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i) + tmpJ_priv(ipoint,1) = tmpJ_priv(ipoint,1) + int2_grad1_u12(ipoint,1,i,i) + tmpJ_priv(ipoint,2) = tmpJ_priv(ipoint,2) + int2_grad1_u12(ipoint,2,i,i) + tmpJ_priv(ipoint,3) = tmpJ_priv(ipoint,3) + int2_grad1_u12(ipoint,3,i,i) + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP DO + do i = ne_b+1, ne_a + do ipoint = 1, n_grid + tmpO_priv(ipoint) = tmpO_priv(ipoint) + 0.5d0 * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i) + tmpJ_priv(ipoint,1) = tmpJ_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,i) + tmpJ_priv(ipoint,2) = tmpJ_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,i) + tmpJ_priv(ipoint,3) = tmpJ_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,i) + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP CRITICAL + tmpO = tmpO + tmpO_priv + tmpJ = tmpJ + tmpJ_priv + !$OMP END CRITICAL + + deallocate(tmpO_priv, tmpJ_priv) + !$OMP END PARALLEL + + ! --- + + tmpM = 0.d0 + tmpS = 0.d0 + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(i, j, ipoint, tmpM_priv, tmpS_priv) & + !$OMP SHARED(ne_b, ne_a, n_grid, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmpM, tmpS) + + allocate(tmpM_priv(n_grid,3), tmpS_priv(n_grid)) + tmpM_priv = 0.d0 + tmpS_priv = 0.d0 + + !$OMP DO COLLAPSE(2) + do i = 1, ne_b + do j = 1, ne_b + do ipoint = 1, n_grid + + tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + + tmpS_priv(ipoint) = tmpS_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) & + + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) & + + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i) + enddo + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP DO COLLAPSE(2) + do i = ne_b+1, ne_a + do j = 1, ne_b + do ipoint = 1, n_grid + + tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + + tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i) + tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i) + tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i) + + tmpS_priv(ipoint) = tmpS_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) & + + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) & + + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i) + enddo + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP DO COLLAPSE(2) + do i = ne_b+1, ne_a + do j = ne_b+1, ne_a + do ipoint = 1, n_grid + + tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + + tmpS_priv(ipoint) = tmpS_priv(ipoint) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) & + + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) & + + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i) + enddo + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP CRITICAL + tmpM = tmpM + tmpM_priv + tmpS = tmpS + tmpS_priv + !$OMP END CRITICAL + + deallocate(tmpM_priv, tmpS_priv) + !$OMP END PARALLEL + + ! --- + + do ipoint = 1, n_grid + + tmpD(ipoint,1) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,1) - tmpM(ipoint,1)) + tmpD(ipoint,2) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,2) - tmpM(ipoint,2)) + tmpD(ipoint,3) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,3) - tmpM(ipoint,3)) + tmpD(ipoint,4) = -wr1(ipoint) * tmpO(ipoint) + + tmpS(ipoint) = 2.d0 * (tmpJ(ipoint,1) * tmpJ(ipoint,1) + tmpJ(ipoint,2) * tmpJ(ipoint,2) + tmpJ(ipoint,3) * tmpJ(ipoint,3)) - tmpS(ipoint) + enddo + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(p, s, i, ipoint) & + !$OMP SHARED(n_mo, ne_b, n_grid, & + !$OMP ne_a, int2_grad1_u12, tmpC) + + !$OMP DO COLLAPSE(2) + do s = 1, n_mo + do p = 1, n_mo + + do ipoint = 1, n_grid + tmpC(ipoint,1,p,s) = int2_grad1_u12(ipoint,1,p,s) + tmpC(ipoint,2,p,s) = int2_grad1_u12(ipoint,2,p,s) + tmpC(ipoint,3,p,s) = int2_grad1_u12(ipoint,3,p,s) + enddo + + tmpC(:,4,p,s) = 0.d0 + do i = 1, ne_b + do ipoint = 1, n_grid + tmpC(ipoint,4,p,s) = tmpC(ipoint,4,p,s) + int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,s) & + + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,s) & + + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,s) + enddo + enddo + do i = ne_b+1, ne_a + do ipoint = 1, n_grid + tmpC(ipoint,4,p,s) = tmpC(ipoint,4,p,s) + 0.5d0 * int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,s) & + + 0.5d0 * int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,s) & + + 0.5d0 * int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,s) + enddo + enddo + + enddo ! p + enddo ! s + !$OMP END DO + !$OMP END PARALLEL + + call dgemv( 'T', 4*n_grid, n_mo*n_mo, 2.d0 & + , tmpC(1,1,1,1), size(tmpC, 1) * size(tmpC, 2) & + , tmpD(1,1), 1 & + , 0.d0, noL_1e(1,1), 1) + + ! --- + + allocate(tmpL0(n_grid,3,n_mo)) + allocate(tmpR0(n_grid,3,n_mo)) + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(p, i, ipoint) & + !$OMP SHARED(ne_b, ne_a, n_grid, n_mo, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmpL0, tmpR0, tmpL, tmpR) + + !$OMP DO + do p = 1, n_mo + + tmpL0(:,1:3,p) = 0.d0 + tmpR0(:,1:3,p) = 0.d0 + do i = ne_b+1, ne_a + do ipoint = 1, n_grid + + tmpL0(ipoint,1,p) = tmpL0(ipoint,1,p) + 0.5d0 * int2_grad1_u12(ipoint,1,p,i) * mos_l_in_r(ipoint,i) + tmpL0(ipoint,2,p) = tmpL0(ipoint,2,p) + 0.5d0 * int2_grad1_u12(ipoint,2,p,i) * mos_l_in_r(ipoint,i) + tmpL0(ipoint,3,p) = tmpL0(ipoint,3,p) + 0.5d0 * int2_grad1_u12(ipoint,3,p,i) * mos_l_in_r(ipoint,i) + + tmpR0(ipoint,1,p) = tmpR0(ipoint,1,p) + 0.5d0 * int2_grad1_u12(ipoint,1,i,p) * mos_r_in_r(ipoint,i) + tmpR0(ipoint,2,p) = tmpR0(ipoint,2,p) + 0.5d0 * int2_grad1_u12(ipoint,2,i,p) * mos_r_in_r(ipoint,i) + tmpR0(ipoint,3,p) = tmpR0(ipoint,3,p) + 0.5d0 * int2_grad1_u12(ipoint,3,i,p) * mos_r_in_r(ipoint,i) + enddo + enddo + + tmpL(:,1:3,p) = tmpL0(:,1:3,p) + tmpR(:,1:3,p) = tmpR0(:,1:3,p) + do i = 1, ne_b + do ipoint = 1, n_grid + + tmpL(ipoint,1,p) = tmpL(ipoint,1,p) + int2_grad1_u12(ipoint,1,p,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,2,p) = tmpL(ipoint,2,p) + int2_grad1_u12(ipoint,2,p,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,3,p) = tmpL(ipoint,3,p) + int2_grad1_u12(ipoint,3,p,i) * mos_l_in_r(ipoint,i) + + tmpR(ipoint,1,p) = tmpR(ipoint,1,p) + int2_grad1_u12(ipoint,1,i,p) * mos_r_in_r(ipoint,i) + tmpR(ipoint,2,p) = tmpR(ipoint,2,p) + int2_grad1_u12(ipoint,2,i,p) * mos_r_in_r(ipoint,i) + tmpR(ipoint,3,p) = tmpR(ipoint,3,p) + int2_grad1_u12(ipoint,3,i,p) * mos_r_in_r(ipoint,i) + enddo + enddo + + enddo ! p + !$OMP END DO + !$OMP END PARALLEL + + ! --- + + allocate(tmpE_os(n_grid,8,n_mo)) + allocate(tmpF_os(n_grid,8,n_mo)) + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(p, i, j, ipoint) & + !$OMP SHARED(ne_b, ne_a, n_grid, n_mo, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, wr1, & + !$OMP tmpL, tmpL0, tmpR, tmpR0, tmpJ, tmpS, tmpE_os, tmpF_os) + + !$OMP DO + do p = 1, n_mo + + do ipoint = 1, n_grid + + tmpE_os(ipoint,1,p) = wr1(ipoint) * mos_l_in_r(ipoint,p) + tmpE_os(ipoint,2,p) = -2.d0 * (tmpL(ipoint,1,p) * tmpJ(ipoint,1) + tmpL(ipoint,2,p) * tmpJ(ipoint,2) + tmpL(ipoint,3,p) * tmpJ(ipoint,3)) + tmpE_os(ipoint,3,p) = wr1(ipoint) * tmpL(ipoint,1,p) + tmpE_os(ipoint,4,p) = wr1(ipoint) * tmpL(ipoint,2,p) + tmpE_os(ipoint,5,p) = wr1(ipoint) * tmpL(ipoint,3,p) + tmpE_os(ipoint,6,p) = wr1(ipoint) * tmpL0(ipoint,1,p) + tmpE_os(ipoint,7,p) = wr1(ipoint) * tmpL0(ipoint,2,p) + tmpE_os(ipoint,8,p) = wr1(ipoint) * tmpL0(ipoint,3,p) + + tmpF_os(ipoint,1,p) = -2.d0 * (tmpR(ipoint,1,p) * tmpJ(ipoint,1) + tmpR(ipoint,2,p) * tmpJ(ipoint,2) + tmpR(ipoint,3,p) * tmpJ(ipoint,3)) & + + mos_r_in_r(ipoint,p) * tmpS(ipoint) + tmpF_os(ipoint,2,p) = wr1(ipoint) * mos_r_in_r(ipoint,p) + tmpF_os(ipoint,3,p) = tmpR(ipoint,1,p) + tmpF_os(ipoint,4,p) = tmpR(ipoint,2,p) + tmpF_os(ipoint,5,p) = tmpR(ipoint,3,p) + tmpF_os(ipoint,6,p) = tmpR0(ipoint,1,p) + tmpF_os(ipoint,7,p) = tmpR0(ipoint,2,p) + tmpF_os(ipoint,8,p) = tmpR0(ipoint,3,p) + enddo + + do i = 1, ne_b + do j = 1, ne_b + do ipoint = 1, n_grid + + tmpE_os(ipoint,2,p) = tmpE_os(ipoint,2,p) + mos_l_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,j) & + + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,j) & + + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,j) ) + + tmpF_os(ipoint,1,p) = tmpF_os(ipoint,1,p) + mos_r_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,p) & + + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,p) & + + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,p) ) + enddo ! ipoint + enddo ! j + enddo ! i + + do i = ne_b+1, ne_a + do j = 1, ne_b + do ipoint = 1, n_grid + + tmpE_os(ipoint,2,p) = tmpE_os(ipoint,2,p) + 0.5d0 * mos_l_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,j) & + + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,j) & + + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,j) ) + tmpE_os(ipoint,2,p) = tmpE_os(ipoint,2,p) + 0.5d0 * mos_l_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,p,j) * int2_grad1_u12(ipoint,1,j,i) & + + int2_grad1_u12(ipoint,2,p,j) * int2_grad1_u12(ipoint,2,j,i) & + + int2_grad1_u12(ipoint,3,p,j) * int2_grad1_u12(ipoint,3,j,i) ) + + tmpF_os(ipoint,1,p) = tmpF_os(ipoint,1,p) + 0.5d0 * mos_r_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,p) & + + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,p) & + + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,p) ) + tmpF_os(ipoint,1,p) = tmpF_os(ipoint,1,p) + 0.5d0 * mos_r_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,j,i) * int2_grad1_u12(ipoint,1,i,p) & + + int2_grad1_u12(ipoint,2,j,i) * int2_grad1_u12(ipoint,2,i,p) & + + int2_grad1_u12(ipoint,3,j,i) * int2_grad1_u12(ipoint,3,i,p) ) + enddo ! ipoint + enddo ! j + enddo ! i + + do i = ne_b+1, ne_a + do j = ne_b+1, ne_a + do ipoint = 1, n_grid + + tmpE_os(ipoint,2,p) = tmpE_os(ipoint,2,p) + 0.5d0 * mos_l_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,j) & + + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,j) & + + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,j) ) + + tmpF_os(ipoint,1,p) = tmpF_os(ipoint,1,p) + 0.5d0 * mos_r_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,p) & + + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,p) & + + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,p) ) + enddo ! ipoint + enddo ! j + enddo ! i + + enddo ! p + !$OMP END DO + !$OMP END PARALLEL + + deallocate(tmpL0, tmpR0) + + call dgemm( 'T', 'N', n_mo, n_mo, 8*n_grid, 1.d0 & + , tmpE_os(1,1,1), 8*n_grid, tmpF_os(1,1,1), 8*n_grid & + , 1.d0, noL_1e(1,1), n_mo) + + deallocate(tmpE_os, tmpF_os) endif