From bf15b68b0b64a229812dc4eeb2ef498fe56b194c Mon Sep 17 00:00:00 2001 From: AbdAmmar Date: Tue, 13 Aug 2024 10:57:44 +0200 Subject: [PATCH] add normal-ordering with CuTC --- plugins/local/tc_int/cutc_module.F90 | 58 +++- plugins/local/tc_int/deb_no_0e_gpu.irp.f | 96 ++++++ plugins/local/tc_int/deb_no_gpu.irp.f | 170 ++++++++++ plugins/local/tc_int/deb_tc_int_cuda.irp.f | 6 +- plugins/local/tc_int/no_0e.irp.f | 331 +++++++++---------- plugins/local/tc_int/write_tc_int_cuda.irp.f | 12 +- 6 files changed, 490 insertions(+), 183 deletions(-) create mode 100644 plugins/local/tc_int/deb_no_0e_gpu.irp.f create mode 100644 plugins/local/tc_int/deb_no_gpu.irp.f diff --git a/plugins/local/tc_int/cutc_module.F90 b/plugins/local/tc_int/cutc_module.F90 index d1466697..eaf271e5 100644 --- a/plugins/local/tc_int/cutc_module.F90 +++ b/plugins/local/tc_int/cutc_module.F90 @@ -9,13 +9,13 @@ module cutc_module ! --- - subroutine cutc_int_c(nxBlocks, nyBlocks, nzBlocks, & - blockxSize, blockySize, blockzSize, & - n_grid1, n_grid2, n_ao, n_nuc, size_bh, & - r1, wr1, r2, wr2, rn, & - aos_data1, aos_data2, & - c_bh, m_bh, n_bh, o_bh, & - int2_grad1_u12_ao, int_2e_ao) bind(C, name = "cutc_int_c") + subroutine cutc_int(nxBlocks, nyBlocks, nzBlocks, & + blockxSize, blockySize, blockzSize, & + n_grid1, n_grid2, n_ao, n_nuc, size_bh, & + r1, wr1, r2, wr2, rn, & + aos_data1, aos_data2, & + c_bh, m_bh, n_bh, o_bh, & + int2_grad1_u12_ao, int_2e_ao) bind(C, name = "cutc_int") import c_int, c_double, c_ptr integer(c_int), intent(in), value :: nxBlocks, blockxSize @@ -37,7 +37,7 @@ module cutc_module real(c_double), intent(out) :: int2_grad1_u12_ao(n_ao,n_ao,n_grid1,3) real(c_double), intent(out) :: int_2e_ao(n_ao,n_ao,n_ao,n_ao) - end subroutine cutc_int_c + end subroutine cutc_int ! --- @@ -170,6 +170,48 @@ module cutc_module ! --- + subroutine cutc_no_0e(n_grid1, n_mo, ne_a, ne_b, & + wr1, mos_l_in_r, mos_r_in_r, int2_grad1_u12, & + no_0e) bind(C, name = "cutc_no_0e") + + import c_int, c_double, c_ptr + + integer(c_int), intent(in), value :: n_grid1 + integer(c_int), intent(in), value :: n_mo + integer(c_int), intent(in), value :: ne_a + integer(c_int), intent(in), value :: ne_b + real(c_double), intent(in) :: wr1(n_grid1) + real(c_double), intent(in) :: mos_l_in_r(n_grid1,n_mo) + real(c_double), intent(in) :: mos_r_in_r(n_grid1,n_mo) + real(c_double), intent(in) :: int2_grad1_u12(n_grid1,3,n_mo,n_mo) + real(c_double), intent(out) :: no_0e(1) + + end subroutine cutc_no_0e + + ! --- + + subroutine cutc_no(n_grid1, n_mo, ne_a, ne_b, & + wr1, mos_l_in_r, mos_r_in_r, int2_grad1_u12, & + no_2e, no_1e, no_0e) bind(C, name = "cutc_no") + + import c_int, c_double, c_ptr + + integer(c_int), intent(in), value :: n_grid1 + integer(c_int), intent(in), value :: n_mo + integer(c_int), intent(in), value :: ne_a + integer(c_int), intent(in), value :: ne_b + real(c_double), intent(in) :: wr1(n_grid1) + real(c_double), intent(in) :: mos_l_in_r(n_grid1,n_mo) + real(c_double), intent(in) :: mos_r_in_r(n_grid1,n_mo) + real(c_double), intent(in) :: int2_grad1_u12(n_grid1,3,n_mo,n_mo) + real(c_double), intent(out) :: no_2e(n_mo,n_mo,n_mo,n_mo) + real(c_double), intent(out) :: no_1e(n_mo,n_mo) + real(c_double), intent(out) :: no_0e(1) + + end subroutine cutc_no + + ! --- + end interface end module cutc_module diff --git a/plugins/local/tc_int/deb_no_0e_gpu.irp.f b/plugins/local/tc_int/deb_no_0e_gpu.irp.f new file mode 100644 index 00000000..afff060a --- /dev/null +++ b/plugins/local/tc_int/deb_no_0e_gpu.irp.f @@ -0,0 +1,96 @@ + +! --- + +subroutine deb_no_0e_gpu() + + use cutc_module + + implicit none + + integer :: i, j, k, l, ipoint + double precision :: acc_thr, err_tot, nrm_tot, err_loc + double precision :: noL_0e + double precision :: noL_0e_gpu(1) + double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:) + double precision, allocatable :: tmp(:,:,:,:) + double precision, allocatable :: int2_grad1_u12_bimo_t(:,:,:,:) + + + PROVIDE mo_l_coef mo_r_coef + PROVIDE mos_l_in_r_array_transp mos_r_in_r_array_transp + + + allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,3)) + print*, ' Reading int2_grad1_u12_ao from ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao' + open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="read") + read(11) int2_grad1_u12_ao + close(11) + + allocate(tmp(mo_num,mo_num,n_points_final_grid,3)) + allocate(int2_grad1_u12_bimo_t(n_points_final_grid,3,mo_num,mo_num)) + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint) & + !$OMP SHARED (ao_num, mo_num, n_points_final_grid, int2_grad1_u12_ao, tmp) + !$OMP DO SCHEDULE (dynamic) + do ipoint = 1, n_points_final_grid + call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,1), ao_num, tmp(1,1,ipoint,1), mo_num) + call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,2), ao_num, tmp(1,1,ipoint,2), mo_num) + call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,3), ao_num, tmp(1,1,ipoint,3), mo_num) + enddo + !$OMP END DO + !$OMP END PARALLEL + + deallocate(int2_grad1_u12_ao) + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, j, ipoint) & + !$OMP SHARED (mo_num, n_points_final_grid, tmp, int2_grad1_u12_bimo_t) + !$OMP DO COLLAPSE(2) SCHEDULE (dynamic) + do ipoint = 1, n_points_final_grid + do i = 1, mo_num + do j = 1, mo_num + int2_grad1_u12_bimo_t(ipoint,1,j,i) = tmp(j,i,ipoint,1) + int2_grad1_u12_bimo_t(ipoint,2,j,i) = tmp(j,i,ipoint,2) + int2_grad1_u12_bimo_t(ipoint,3,j,i) = tmp(j,i,ipoint,3) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + deallocate(tmp) + + ! --- + + call cutc_no_0e(n_points_final_grid, mo_num, elec_alpha_num, elec_beta_num, & + final_weight_at_r_vector(1), & + mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), & + int2_grad1_u12_bimo_t(1,1,1,1), noL_0e_gpu(1)) + + ! --- + + call provide_no_0e(n_points_final_grid, mo_num, elec_alpha_num, elec_beta_num, & + final_weight_at_r_vector(1), & + mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), & + int2_grad1_u12_bimo_t(1,1,1,1), noL_0e) + + ! --- + + deallocate(int2_grad1_u12_bimo_t) + + print *, 'noL_0e CPU = ', noL_0e + print *, 'noL_0e GPU = ', noL_0e_gpu(1) + + err_tot = dabs(noL_0e - noL_0e_gpu(1)) + nrm_tot = dabs(noL_0e) + print *, ' absolute accuracy on noL_0e (%) =', 100.d0 * err_tot / nrm_tot + + return + +end + +! --- + diff --git a/plugins/local/tc_int/deb_no_gpu.irp.f b/plugins/local/tc_int/deb_no_gpu.irp.f new file mode 100644 index 00000000..e14404e6 --- /dev/null +++ b/plugins/local/tc_int/deb_no_gpu.irp.f @@ -0,0 +1,170 @@ + +! --- + +subroutine deb_no_gpu() + + use cutc_module + + implicit none + + integer :: i, j, k, l, ipoint + double precision :: acc_thr, err_tot, nrm_tot, err_loc + double precision :: noL_0e + double precision :: noL_0e_gpu(1) + double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:) + double precision, allocatable :: tmp(:,:,:,:) + double precision, allocatable :: int2_grad1_u12_bimo_t(:,:,:,:) + double precision, allocatable :: noL_1e (:,:) + double precision, allocatable :: noL_1e_gpu(:,:) + double precision, allocatable :: noL_2e (:,:,:,:) + double precision, allocatable :: noL_2e_gpu(:,:,:,:) + + + PROVIDE mo_l_coef mo_r_coef + PROVIDE mos_l_in_r_array_transp mos_r_in_r_array_transp + + + allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,3)) + print*, ' Reading int2_grad1_u12_ao from ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao' + open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="read") + read(11) int2_grad1_u12_ao + close(11) + + allocate(tmp(mo_num,mo_num,n_points_final_grid,3)) + allocate(int2_grad1_u12_bimo_t(n_points_final_grid,3,mo_num,mo_num)) + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint) & + !$OMP SHARED (ao_num, mo_num, n_points_final_grid, int2_grad1_u12_ao, tmp) + !$OMP DO SCHEDULE (dynamic) + do ipoint = 1, n_points_final_grid + call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,1), ao_num, tmp(1,1,ipoint,1), mo_num) + call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,2), ao_num, tmp(1,1,ipoint,2), mo_num) + call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,3), ao_num, tmp(1,1,ipoint,3), mo_num) + enddo + !$OMP END DO + !$OMP END PARALLEL + + deallocate(int2_grad1_u12_ao) + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, j, ipoint) & + !$OMP SHARED (mo_num, n_points_final_grid, tmp, int2_grad1_u12_bimo_t) + !$OMP DO COLLAPSE(2) SCHEDULE (dynamic) + do ipoint = 1, n_points_final_grid + do i = 1, mo_num + do j = 1, mo_num + int2_grad1_u12_bimo_t(ipoint,1,j,i) = tmp(j,i,ipoint,1) + int2_grad1_u12_bimo_t(ipoint,2,j,i) = tmp(j,i,ipoint,2) + int2_grad1_u12_bimo_t(ipoint,3,j,i) = tmp(j,i,ipoint,3) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + deallocate(tmp) + + ! --- + + allocate(noL_2e_gpu(mo_num,mo_num,mo_num,mo_num)) + allocate(noL_1e_gpu(mo_num,mo_num)) + + call cutc_no(n_points_final_grid, mo_num, elec_alpha_num, elec_beta_num, & + final_weight_at_r_vector(1), & + mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), & + int2_grad1_u12_bimo_t(1,1,1,1), noL_2e_gpu(1,1,1,1), noL_1e_gpu(1,1), noL_0e_gpu(1)) + + ! --- + + allocate(noL_2e(mo_num,mo_num,mo_num,mo_num)) + allocate(noL_1e(mo_num,mo_num)) + + call provide_no_2e(n_points_final_grid, mo_num, elec_alpha_num, elec_beta_num, & + final_weight_at_r_vector(1), & + mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), & + int2_grad1_u12_bimo_t(1,1,1,1), noL_2e(1,1,1,1)) + + call provide_no_1e(n_points_final_grid, mo_num, elec_alpha_num, elec_beta_num, & + final_weight_at_r_vector(1), & + mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), & + int2_grad1_u12_bimo_t(1,1,1,1), noL_1e(1,1)) + + call provide_no_0e(n_points_final_grid, mo_num, elec_alpha_num, elec_beta_num, & + final_weight_at_r_vector(1), & + mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), & + int2_grad1_u12_bimo_t(1,1,1,1), noL_0e) + + ! --- + + deallocate(int2_grad1_u12_bimo_t) + + acc_thr = 1d-12 + + ! --- + + err_tot = 0.d0 + nrm_tot = 0.d0 + do i = 1, mo_num + do j = 1, mo_num + do k = 1, mo_num + do l = 1, mo_num + err_loc = dabs(noL_2e(l,k,j,i) - noL_2e_gpu(l,k,j,i)) + if(err_loc > acc_thr) then + print*, " error on", l, k, j, i + print*, " CPU res", noL_2e (l,k,j,i) + print*, " GPU res", noL_2e_gpu(l,k,j,i) + stop + endif + err_tot = err_tot + err_loc + nrm_tot = nrm_tot + dabs(noL_2e(l,k,j,i)) + enddo + enddo + enddo + enddo + print *, ' absolute accuracy on noL_2e (%) =', 100.d0 * err_tot / nrm_tot + + deallocate(noL_2e) + deallocate(noL_2e_gpu) + + ! --- + + err_tot = 0.d0 + nrm_tot = 0.d0 + do k = 1, mo_num + do l = 1, mo_num + err_loc = dabs(noL_1e(l,k) - noL_1e_gpu(l,k)) + if(err_loc > acc_thr) then + print*, " error on", l, k + print*, " CPU res", noL_1e (l,k) + print*, " GPU res", noL_1e_gpu(l,k) + stop + endif + err_tot = err_tot + err_loc + nrm_tot = nrm_tot + dabs(noL_1e(l,k)) + enddo + enddo + print *, ' absolute accuracy on noL_1e (%) =', 100.d0 * err_tot / nrm_tot + + deallocate(noL_1e) + deallocate(noL_1e_gpu) + + ! --- + + print *, 'noL_0e CPU = ', noL_0e + print *, 'noL_0e GPU = ', noL_0e_gpu(1) + + err_tot = dabs(noL_0e - noL_0e_gpu(1)) + nrm_tot = dabs(noL_0e) + print *, ' absolute accuracy on noL_0e (%) =', 100.d0 * err_tot / nrm_tot + + + return + +end + +! --- + + diff --git a/plugins/local/tc_int/deb_tc_int_cuda.irp.f b/plugins/local/tc_int/deb_tc_int_cuda.irp.f index 8d0cc4f3..ad20d861 100644 --- a/plugins/local/tc_int/deb_tc_int_cuda.irp.f +++ b/plugins/local/tc_int/deb_tc_int_cuda.irp.f @@ -41,9 +41,13 @@ subroutine main() !call deb_no_2e_gpu_tmp() !call deb_no_2e_gpu() - call deb_no_1e_gpu_tmp() + !call deb_no_1e_gpu_tmp() !call deb_no_1e_gpu() + !call deb_no_0e_gpu() + + call deb_no_gpu() + return end diff --git a/plugins/local/tc_int/no_0e.irp.f b/plugins/local/tc_int/no_0e.irp.f index b945e0dd..830b91a8 100644 --- a/plugins/local/tc_int/no_0e.irp.f +++ b/plugins/local/tc_int/no_0e.irp.f @@ -3,12 +3,6 @@ subroutine provide_no_0e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r, int2_grad1_u12, noL_0e) - BEGIN_DOC - ! - ! < Phi_left | L | Phi_right > - ! - END_DOC - implicit none integer, intent(in) :: n_grid, n_mo @@ -22,44 +16,47 @@ subroutine provide_no_0e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r, integer :: i, j, k, ipoint double precision :: t0, t1 double precision, allocatable :: tmp(:) - double precision, allocatable :: tmp_L(:,:), tmp_R(:,:) - double precision, allocatable :: tmp_M(:,:), tmp_S(:), tmp_O(:), tmp_J(:,:) - double precision, allocatable :: tmp_M_priv(:,:), tmp_S_priv(:), tmp_O_priv(:), tmp_J_priv(:,:) + double precision, allocatable :: tmpL(:,:), tmpR(:,:) + double precision, allocatable :: tmpM(:,:), tmpS(:), tmpO(:), tmpJ(:,:) + double precision, allocatable :: tmpM_priv(:,:), tmpS_priv(:), tmpO_priv(:), tmpJ_priv(:,:) + + + call wall_time(t0) if(ne_a .eq. ne_b) then allocate(tmp(ne_b)) - allocate(tmp_L(n_grid,3), tmp_R(n_grid,3)) + allocate(tmpL(n_grid,3), tmpR(n_grid,3)) !$OMP PARALLEL & !$OMP DEFAULT(NONE) & - !$OMP PRIVATE(j, i, ipoint, tmp_L, tmp_R) & - !$OMP SHARED(ne_b, n_grid, & + !$OMP PRIVATE(j, i, ipoint, tmpL, tmpR) & + !$OMP SHARED(ne_b, n_grid, & !$OMP mos_l_in_r, mos_r_in_r, wr1, & !$OMP int2_grad1_u12, tmp) !$OMP DO do j = 1, ne_b - tmp_L = 0.d0 - tmp_R = 0.d0 + tmpL = 0.d0 + tmpR = 0.d0 do i = 1, ne_b do ipoint = 1, n_grid - tmp_L(ipoint,1) = tmp_L(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) - tmp_L(ipoint,2) = tmp_L(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) - tmp_L(ipoint,3) = tmp_L(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,1) = tmpL(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,2) = tmpL(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,3) = tmpL(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) - tmp_R(ipoint,1) = tmp_R(ipoint,1) + int2_grad1_u12(ipoint,1,i,j) * mos_r_in_r(ipoint,i) - tmp_R(ipoint,2) = tmp_R(ipoint,2) + int2_grad1_u12(ipoint,2,i,j) * mos_r_in_r(ipoint,i) - tmp_R(ipoint,3) = tmp_R(ipoint,3) + int2_grad1_u12(ipoint,3,i,j) * mos_r_in_r(ipoint,i) + tmpR(ipoint,1) = tmpR(ipoint,1) + int2_grad1_u12(ipoint,1,i,j) * mos_r_in_r(ipoint,i) + tmpR(ipoint,2) = tmpR(ipoint,2) + int2_grad1_u12(ipoint,2,i,j) * mos_r_in_r(ipoint,i) + tmpR(ipoint,3) = tmpR(ipoint,3) + int2_grad1_u12(ipoint,3,i,j) * mos_r_in_r(ipoint,i) enddo enddo tmp(j) = 0.d0 do ipoint = 1, n_grid - tmp(j) = tmp(j) + wr1(ipoint) * (tmp_L(ipoint,1)*tmp_R(ipoint,1) + tmp_L(ipoint,2)*tmp_R(ipoint,2) + tmp_L(ipoint,3)*tmp_R(ipoint,3)) + tmp(j) = tmp(j) + wr1(ipoint) * (tmpL(ipoint,1)*tmpR(ipoint,1) + tmpL(ipoint,2)*tmpR(ipoint,2) + tmpL(ipoint,3)*tmpR(ipoint,3)) enddo enddo ! j !$OMP END DO @@ -68,150 +65,149 @@ subroutine provide_no_0e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r, noL_0e = -2.d0 * sum(tmp) deallocate(tmp) - deallocate(tmp_L, tmp_R) + deallocate(tmpL, tmpR) ! --- - allocate(tmp_O(n_grid), tmp_J(n_grid,3)) - tmp_O = 0.d0 - tmp_J = 0.d0 + allocate(tmpO(n_grid), tmpJ(n_grid,3)) + tmpO = 0.d0 + tmpJ = 0.d0 - !$OMP PARALLEL & - !$OMP DEFAULT(NONE) & - !$OMP PRIVATE(i, ipoint, tmp_O_priv, tmp_J_priv) & - !$OMP SHARED(ne_b, n_grid, & - !$OMP mos_l_in_r, mos_r_in_r, & - !$OMP int2_grad1_u12, tmp_O, tmp_J) + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(i, ipoint, tmpO_priv, tmpJ_priv) & + !$OMP SHARED(ne_b, n_grid, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmpO, tmpJ) - allocate(tmp_O_priv(n_grid), tmp_J_priv(n_grid,3)) - tmp_O_priv = 0.d0 - tmp_J_priv = 0.d0 + allocate(tmpO_priv(n_grid), tmpJ_priv(n_grid,3)) + tmpO_priv = 0.d0 + tmpJ_priv = 0.d0 !$OMP DO do i = 1, ne_b do ipoint = 1, n_grid - tmp_O_priv(ipoint) = tmp_O_priv(ipoint) + mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i) - tmp_J_priv(ipoint,1) = tmp_J_priv(ipoint,1) + int2_grad1_u12(ipoint,1,i,i) - tmp_J_priv(ipoint,2) = tmp_J_priv(ipoint,2) + int2_grad1_u12(ipoint,2,i,i) - tmp_J_priv(ipoint,3) = tmp_J_priv(ipoint,3) + int2_grad1_u12(ipoint,3,i,i) + tmpO_priv(ipoint) = tmpO_priv(ipoint) + mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i) + tmpJ_priv(ipoint,1) = tmpJ_priv(ipoint,1) + int2_grad1_u12(ipoint,1,i,i) + tmpJ_priv(ipoint,2) = tmpJ_priv(ipoint,2) + int2_grad1_u12(ipoint,2,i,i) + tmpJ_priv(ipoint,3) = tmpJ_priv(ipoint,3) + int2_grad1_u12(ipoint,3,i,i) enddo enddo !$OMP END DO NOWAIT !$OMP CRITICAL - tmp_O = tmp_O + tmp_O_priv - tmp_J = tmp_J + tmp_J_priv + tmpO = tmpO + tmpO_priv + tmpJ = tmpJ + tmpJ_priv !$OMP END CRITICAL - deallocate(tmp_O_priv, tmp_J_priv) + deallocate(tmpO_priv, tmpJ_priv) !$OMP END PARALLEL - allocate(tmp_M(n_grid,3), tmp_S(n_grid)) - tmp_M = 0.d0 - tmp_S = 0.d0 + allocate(tmpM(n_grid,3), tmpS(n_grid)) + tmpM = 0.d0 + tmpS = 0.d0 - !$OMP PARALLEL & - !$OMP DEFAULT(NONE) & - !$OMP PRIVATE(i, j, ipoint, tmp_M_priv, tmp_S_priv) & - !$OMP SHARED(ne_b, n_grid, & - !$OMP mos_l_in_r, mos_r_in_r, & - !$OMP int2_grad1_u12, tmp_M, tmp_S) + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(i, j, ipoint, tmpM_priv, tmpS_priv) & + !$OMP SHARED(ne_b, n_grid, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmpM, tmpS) - allocate(tmp_M_priv(n_grid,3), tmp_S_priv(n_grid)) - tmp_M_priv = 0.d0 - tmp_S_priv = 0.d0 + allocate(tmpM_priv(n_grid,3), tmpS_priv(n_grid)) + tmpM_priv = 0.d0 + tmpS_priv = 0.d0 !$OMP DO COLLAPSE(2) do i = 1, ne_b do j = 1, ne_b do ipoint = 1, n_grid - tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) - tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) - tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) - tmp_S_priv(ipoint) = tmp_S_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) & - + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) & - + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i) + tmpS_priv(ipoint) = tmpS_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) & + + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) & + + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i) enddo enddo enddo !$OMP END DO NOWAIT !$OMP CRITICAL - tmp_M = tmp_M + tmp_M_priv - tmp_S = tmp_S + tmp_S_priv + tmpM = tmpM + tmpM_priv + tmpS = tmpS + tmpS_priv !$OMP END CRITICAL - deallocate(tmp_M_priv, tmp_S_priv) + deallocate(tmpM_priv, tmpS_priv) !$OMP END PARALLEL allocate(tmp(n_grid)) do ipoint = 1, n_grid - tmp_S(ipoint) = 2.d0 * (tmp_J(ipoint,1)*tmp_J(ipoint,1) + tmp_J(ipoint,2)*tmp_J(ipoint,2) + tmp_J(ipoint,3)*tmp_J(ipoint,3)) - tmp_S(ipoint) + tmpS(ipoint) = 2.d0 * (tmpJ(ipoint,1)*tmpJ(ipoint,1) + tmpJ(ipoint,2)*tmpJ(ipoint,2) + tmpJ(ipoint,3)*tmpJ(ipoint,3)) - tmpS(ipoint) - tmp(ipoint) = wr1(ipoint) * ( tmp_O(ipoint) * tmp_S(ipoint) & - - 2.d0 * ( tmp_J(ipoint,1) * tmp_M(ipoint,1) & - + tmp_J(ipoint,2) * tmp_M(ipoint,2) & - + tmp_J(ipoint,3) * tmp_M(ipoint,3))) + tmp(ipoint) = wr1(ipoint) * ( tmpO(ipoint) * tmpS(ipoint) - 2.d0 * ( tmpJ(ipoint,1) * tmpM(ipoint,1) & + + tmpJ(ipoint,2) * tmpM(ipoint,2) & + + tmpJ(ipoint,3) * tmpM(ipoint,3) ) ) enddo - noL_0e = noL_0e -2.d0 * (sum(tmp)) + noL_0e = noL_0e - 2.d0 * (sum(tmp)) deallocate(tmp) else allocate(tmp(ne_a)) - allocate(tmp_L(n_grid,3), tmp_R(n_grid,3)) + allocate(tmpL(n_grid,3), tmpR(n_grid,3)) - !$OMP PARALLEL & - !$OMP DEFAULT(NONE) & - !$OMP PRIVATE(j, i, ipoint, tmp_L, tmp_R) & - !$OMP SHARED(ne_b, ne_a, n_grid, & - !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(j, i, ipoint, tmpL, tmpR) & + !$OMP SHARED(ne_b, ne_a, n_grid, & + !$OMP mos_l_in_r, mos_r_in_r, & !$OMP int2_grad1_u12, tmp, wr1) !$OMP DO do j = 1, ne_b - tmp_L = 0.d0 - tmp_R = 0.d0 + tmpL = 0.d0 + tmpR = 0.d0 do i = ne_b+1, ne_a do ipoint = 1, n_grid - tmp_L(ipoint,1) = tmp_L(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) - tmp_L(ipoint,2) = tmp_L(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) - tmp_L(ipoint,3) = tmp_L(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,1) = tmpL(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,2) = tmpL(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,3) = tmpL(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) - tmp_R(ipoint,1) = tmp_R(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * mos_r_in_r(ipoint,i) - tmp_R(ipoint,2) = tmp_R(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * mos_r_in_r(ipoint,i) - tmp_R(ipoint,3) = tmp_R(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * mos_r_in_r(ipoint,i) + tmpR(ipoint,1) = tmpR(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * mos_r_in_r(ipoint,i) + tmpR(ipoint,2) = tmpR(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * mos_r_in_r(ipoint,i) + tmpR(ipoint,3) = tmpR(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * mos_r_in_r(ipoint,i) enddo enddo tmp(j) = 0.d0 do ipoint = 1, n_grid - tmp(j) = tmp(j) + wr1(ipoint) * (tmp_L(ipoint,1)*tmp_R(ipoint,1) + tmp_L(ipoint,2)*tmp_R(ipoint,2) + tmp_L(ipoint,3)*tmp_R(ipoint,3)) + tmp(j) = tmp(j) + wr1(ipoint) * (tmpL(ipoint,1)*tmpR(ipoint,1) + tmpL(ipoint,2)*tmpR(ipoint,2) + tmpL(ipoint,3)*tmpR(ipoint,3)) enddo do i = 1, ne_b do ipoint = 1, n_grid - tmp_L(ipoint,1) = tmp_L(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) - tmp_L(ipoint,2) = tmp_L(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) - tmp_L(ipoint,3) = tmp_L(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,1) = tmpL(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,2) = tmpL(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,3) = tmpL(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) - tmp_R(ipoint,1) = tmp_R(ipoint,1) + int2_grad1_u12(ipoint,1,i,j) * mos_r_in_r(ipoint,i) - tmp_R(ipoint,2) = tmp_R(ipoint,2) + int2_grad1_u12(ipoint,2,i,j) * mos_r_in_r(ipoint,i) - tmp_R(ipoint,3) = tmp_R(ipoint,3) + int2_grad1_u12(ipoint,3,i,j) * mos_r_in_r(ipoint,i) + tmpR(ipoint,1) = tmpR(ipoint,1) + int2_grad1_u12(ipoint,1,i,j) * mos_r_in_r(ipoint,i) + tmpR(ipoint,2) = tmpR(ipoint,2) + int2_grad1_u12(ipoint,2,i,j) * mos_r_in_r(ipoint,i) + tmpR(ipoint,3) = tmpR(ipoint,3) + int2_grad1_u12(ipoint,3,i,j) * mos_r_in_r(ipoint,i) enddo enddo do ipoint = 1, n_grid - tmp(j) = tmp(j) + wr1(ipoint) * (tmp_L(ipoint,1)*tmp_R(ipoint,1) + tmp_L(ipoint,2)*tmp_R(ipoint,2) + tmp_L(ipoint,3)*tmp_R(ipoint,3)) + tmp(j) = tmp(j) + wr1(ipoint) * (tmpL(ipoint,1)*tmpR(ipoint,1) + tmpL(ipoint,2)*tmpR(ipoint,2) + tmpL(ipoint,3)*tmpR(ipoint,3)) enddo enddo ! j !$OMP END DO @@ -219,33 +215,33 @@ subroutine provide_no_0e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r, ! --- - !$OMP PARALLEL & - !$OMP DEFAULT(NONE) & - !$OMP PRIVATE(j, i, ipoint, tmp_L, tmp_R) & - !$OMP SHARED(ne_b, ne_a, n_grid, & - !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(j, i, ipoint, tmpL, tmpR) & + !$OMP SHARED(ne_b, ne_a, n_grid, & + !$OMP mos_l_in_r, mos_r_in_r, & !$OMP int2_grad1_u12, tmp, wr1) !$OMP DO do j = ne_b+1, ne_a - tmp_L = 0.d0 - tmp_R = 0.d0 + tmpL = 0.d0 + tmpR = 0.d0 do i = 1, ne_a do ipoint = 1, n_grid - tmp_L(ipoint,1) = tmp_L(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) - tmp_L(ipoint,2) = tmp_L(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) - tmp_L(ipoint,3) = tmp_L(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,1) = tmpL(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,2) = tmpL(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,3) = tmpL(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) - tmp_R(ipoint,1) = tmp_R(ipoint,1) + int2_grad1_u12(ipoint,1,i,j) * mos_r_in_r(ipoint,i) - tmp_R(ipoint,2) = tmp_R(ipoint,2) + int2_grad1_u12(ipoint,2,i,j) * mos_r_in_r(ipoint,i) - tmp_R(ipoint,3) = tmp_R(ipoint,3) + int2_grad1_u12(ipoint,3,i,j) * mos_r_in_r(ipoint,i) + tmpR(ipoint,1) = tmpR(ipoint,1) + int2_grad1_u12(ipoint,1,i,j) * mos_r_in_r(ipoint,i) + tmpR(ipoint,2) = tmpR(ipoint,2) + int2_grad1_u12(ipoint,2,i,j) * mos_r_in_r(ipoint,i) + tmpR(ipoint,3) = tmpR(ipoint,3) + int2_grad1_u12(ipoint,3,i,j) * mos_r_in_r(ipoint,i) enddo enddo tmp(j) = 0.d0 do ipoint = 1, n_grid - tmp(j) = tmp(j) + 0.5d0 * wr1(ipoint) * (tmp_L(ipoint,1)*tmp_R(ipoint,1) + tmp_L(ipoint,2)*tmp_R(ipoint,2) + tmp_L(ipoint,3)*tmp_R(ipoint,3)) + tmp(j) = tmp(j) + 0.5d0 * wr1(ipoint) * (tmpL(ipoint,1)*tmpR(ipoint,1) + tmpL(ipoint,2)*tmpR(ipoint,2) + tmpL(ipoint,3)*tmpR(ipoint,3)) enddo enddo ! j !$OMP END DO @@ -254,32 +250,32 @@ subroutine provide_no_0e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r, noL_0e = -2.d0 * sum(tmp) deallocate(tmp) - deallocate(tmp_L, tmp_R) + deallocate(tmpL, tmpR) ! --- - allocate(tmp_O(n_grid), tmp_J(n_grid,3)) - tmp_O = 0.d0 - tmp_J = 0.d0 + allocate(tmpO(n_grid), tmpJ(n_grid,3)) + tmpO = 0.d0 + tmpJ = 0.d0 - !$OMP PARALLEL & - !$OMP DEFAULT(NONE) & - !$OMP PRIVATE(i, ipoint, tmp_O_priv, tmp_J_priv) & - !$OMP SHARED(ne_b, ne_a, n_grid, & - !$OMP mos_l_in_r, mos_r_in_r, & - !$OMP int2_grad1_u12, tmp_O, tmp_J) + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(i, ipoint, tmpO_priv, tmpJ_priv) & + !$OMP SHARED(ne_b, ne_a, n_grid, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmpO, tmpJ) - allocate(tmp_O_priv(n_grid), tmp_J_priv(n_grid,3)) - tmp_O_priv = 0.d0 - tmp_J_priv = 0.d0 + allocate(tmpO_priv(n_grid), tmpJ_priv(n_grid,3)) + tmpO_priv = 0.d0 + tmpJ_priv = 0.d0 !$OMP DO do i = 1, ne_b do ipoint = 1, n_grid - tmp_O_priv(ipoint) = tmp_O_priv(ipoint) + mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i) - tmp_J_priv(ipoint,1) = tmp_J_priv(ipoint,1) + int2_grad1_u12(ipoint,1,i,i) - tmp_J_priv(ipoint,2) = tmp_J_priv(ipoint,2) + int2_grad1_u12(ipoint,2,i,i) - tmp_J_priv(ipoint,3) = tmp_J_priv(ipoint,3) + int2_grad1_u12(ipoint,3,i,i) + tmpO_priv(ipoint) = tmpO_priv(ipoint) + mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i) + tmpJ_priv(ipoint,1) = tmpJ_priv(ipoint,1) + int2_grad1_u12(ipoint,1,i,i) + tmpJ_priv(ipoint,2) = tmpJ_priv(ipoint,2) + int2_grad1_u12(ipoint,2,i,i) + tmpJ_priv(ipoint,3) = tmpJ_priv(ipoint,3) + int2_grad1_u12(ipoint,3,i,i) enddo enddo !$OMP END DO NOWAIT @@ -287,49 +283,49 @@ subroutine provide_no_0e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r, !$OMP DO do i = ne_b+1, ne_a do ipoint = 1, n_grid - tmp_O_priv(ipoint) = tmp_O_priv(ipoint) + 0.5d0 * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i) - tmp_J_priv(ipoint,1) = tmp_J_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,i) - tmp_J_priv(ipoint,2) = tmp_J_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,i) - tmp_J_priv(ipoint,3) = tmp_J_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,i) + tmpO_priv(ipoint) = tmpO_priv(ipoint) + 0.5d0 * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i) + tmpJ_priv(ipoint,1) = tmpJ_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,i) + tmpJ_priv(ipoint,2) = tmpJ_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,i) + tmpJ_priv(ipoint,3) = tmpJ_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,i) enddo enddo !$OMP END DO NOWAIT !$OMP CRITICAL - tmp_O = tmp_O + tmp_O_priv - tmp_J = tmp_J + tmp_J_priv + tmpO = tmpO + tmpO_priv + tmpJ = tmpJ + tmpJ_priv !$OMP END CRITICAL - deallocate(tmp_O_priv, tmp_J_priv) + deallocate(tmpO_priv, tmpJ_priv) !$OMP END PARALLEL ! --- - allocate(tmp_M(n_grid,3), tmp_S(n_grid)) - tmp_M = 0.d0 - tmp_S = 0.d0 + allocate(tmpM(n_grid,3), tmpS(n_grid)) + tmpM = 0.d0 + tmpS = 0.d0 - !$OMP PARALLEL & - !$OMP DEFAULT(NONE) & - !$OMP PRIVATE(i, j, ipoint, tmp_M_priv, tmp_S_priv) & - !$OMP SHARED(ne_b, ne_a, n_grid, & - !$OMP mos_l_in_r, mos_r_in_r, & - !$OMP int2_grad1_u12, tmp_M, tmp_S) + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(i, j, ipoint, tmpM_priv, tmpS_priv) & + !$OMP SHARED(ne_b, ne_a, n_grid, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmpM, tmpS) - allocate(tmp_M_priv(n_grid,3), tmp_S_priv(n_grid)) - tmp_M_priv = 0.d0 - tmp_S_priv = 0.d0 + allocate(tmpM_priv(n_grid,3), tmpS_priv(n_grid)) + tmpM_priv = 0.d0 + tmpS_priv = 0.d0 !$OMP DO COLLAPSE(2) do i = 1, ne_b do j = 1, ne_b do ipoint = 1, n_grid - tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) - tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) - tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) - tmp_S_priv(ipoint) = tmp_S_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) & + tmpS_priv(ipoint) = tmpS_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) & + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) & + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i) enddo @@ -342,17 +338,17 @@ subroutine provide_no_0e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r, do j = 1, ne_b do ipoint = 1, n_grid - tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) - tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) - tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) - tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i) - tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i) - tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i) + tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i) + tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i) + tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i) - tmp_S_priv(ipoint) = tmp_S_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) & - + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) & - + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i) + tmpS_priv(ipoint) = tmpS_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) & + + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) & + + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i) enddo enddo enddo @@ -363,39 +359,38 @@ subroutine provide_no_0e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r, do j = ne_b+1, ne_a do ipoint = 1, n_grid - tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) - tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) - tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) - tmp_S_priv(ipoint) = tmp_S_priv(ipoint) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) & - + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) & - + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i) + tmpS_priv(ipoint) = tmpS_priv(ipoint) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) & + + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) & + + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i) enddo enddo enddo !$OMP END DO NOWAIT !$OMP CRITICAL - tmp_M = tmp_M + tmp_M_priv - tmp_S = tmp_S + tmp_S_priv + tmpM = tmpM + tmpM_priv + tmpS = tmpS + tmpS_priv !$OMP END CRITICAL - deallocate(tmp_M_priv, tmp_S_priv) + deallocate(tmpM_priv, tmpS_priv) !$OMP END PARALLEL allocate(tmp(n_grid)) do ipoint = 1, n_grid - tmp_S(ipoint) = 2.d0 * (tmp_J(ipoint,1)*tmp_J(ipoint,1) + tmp_J(ipoint,2)*tmp_J(ipoint,2) + tmp_J(ipoint,3)*tmp_J(ipoint,3)) - tmp_S(ipoint) + tmpS(ipoint) = 2.d0 * (tmpJ(ipoint,1)*tmpJ(ipoint,1) + tmpJ(ipoint,2)*tmpJ(ipoint,2) + tmpJ(ipoint,3)*tmpJ(ipoint,3)) - tmpS(ipoint) - tmp(ipoint) = wr1(ipoint) * ( tmp_O(ipoint) * tmp_S(ipoint) & - - 2.d0 * ( tmp_J(ipoint,1) * tmp_M(ipoint,1) & - + tmp_J(ipoint,2) * tmp_M(ipoint,2) & - + tmp_J(ipoint,3) * tmp_M(ipoint,3))) + tmp(ipoint) = wr1(ipoint) * ( tmpO(ipoint) * tmpS(ipoint) - 2.d0 * ( tmpJ(ipoint,1) * tmpM(ipoint,1) & + + tmpJ(ipoint,2) * tmpM(ipoint,2) & + + tmpJ(ipoint,3) * tmpM(ipoint,3) ) ) enddo - noL_0e = noL_0e -2.d0 * (sum(tmp)) + noL_0e = noL_0e - 2.d0 * (sum(tmp)) deallocate(tmp) diff --git a/plugins/local/tc_int/write_tc_int_cuda.irp.f b/plugins/local/tc_int/write_tc_int_cuda.irp.f index 2a3dc4d1..756630b8 100644 --- a/plugins/local/tc_int/write_tc_int_cuda.irp.f +++ b/plugins/local/tc_int/write_tc_int_cuda.irp.f @@ -120,12 +120,12 @@ subroutine do_work_on_gpu() call wall_time(cuda_time0) print*, ' start CUDA kernel' - call cutc_int_c(nxBlocks, nyBlocks, nzBlocks, blockxSize, blockySize, blockzSize, & - n_points_final_grid, n_points_extra_final_grid, ao_num, nucl_num, jBH_size, & - final_grid_points, final_weight_at_r_vector, & - final_grid_points_extra, final_weight_at_r_vector_extra, & - rn, aos_data1, aos_data2, jBH_c, jBH_m, jBH_n, jBH_o, & - int2_grad1_u12_ao, int_2e_ao) + call cutc_int(nxBlocks, nyBlocks, nzBlocks, blockxSize, blockySize, blockzSize, & + n_points_final_grid, n_points_extra_final_grid, ao_num, nucl_num, jBH_size, & + final_grid_points, final_weight_at_r_vector, & + final_grid_points_extra, final_weight_at_r_vector_extra, & + rn, aos_data1, aos_data2, jBH_c, jBH_m, jBH_n, jBH_o, & + int2_grad1_u12_ao, int_2e_ao) call wall_time(cuda_time1) print*, ' wall time for CUDA kernel (min) = ', (cuda_time1-cuda_time0) / 60.d0