9
1
mirror of https://github.com/QuantumPackage/qp2.git synced 2024-12-22 03:23:29 +01:00

add normal-ordering with CuTC

This commit is contained in:
AbdAmmar 2024-08-13 10:57:44 +02:00
parent 4e9143133a
commit bf15b68b0b
6 changed files with 490 additions and 183 deletions

View File

@ -9,13 +9,13 @@ module cutc_module
! ---
subroutine cutc_int_c(nxBlocks, nyBlocks, nzBlocks, &
blockxSize, blockySize, blockzSize, &
n_grid1, n_grid2, n_ao, n_nuc, size_bh, &
r1, wr1, r2, wr2, rn, &
aos_data1, aos_data2, &
c_bh, m_bh, n_bh, o_bh, &
int2_grad1_u12_ao, int_2e_ao) bind(C, name = "cutc_int_c")
subroutine cutc_int(nxBlocks, nyBlocks, nzBlocks, &
blockxSize, blockySize, blockzSize, &
n_grid1, n_grid2, n_ao, n_nuc, size_bh, &
r1, wr1, r2, wr2, rn, &
aos_data1, aos_data2, &
c_bh, m_bh, n_bh, o_bh, &
int2_grad1_u12_ao, int_2e_ao) bind(C, name = "cutc_int")
import c_int, c_double, c_ptr
integer(c_int), intent(in), value :: nxBlocks, blockxSize
@ -37,7 +37,7 @@ module cutc_module
real(c_double), intent(out) :: int2_grad1_u12_ao(n_ao,n_ao,n_grid1,3)
real(c_double), intent(out) :: int_2e_ao(n_ao,n_ao,n_ao,n_ao)
end subroutine cutc_int_c
end subroutine cutc_int
! ---
@ -170,6 +170,48 @@ module cutc_module
! ---
subroutine cutc_no_0e(n_grid1, n_mo, ne_a, ne_b, &
wr1, mos_l_in_r, mos_r_in_r, int2_grad1_u12, &
no_0e) bind(C, name = "cutc_no_0e")
import c_int, c_double, c_ptr
integer(c_int), intent(in), value :: n_grid1
integer(c_int), intent(in), value :: n_mo
integer(c_int), intent(in), value :: ne_a
integer(c_int), intent(in), value :: ne_b
real(c_double), intent(in) :: wr1(n_grid1)
real(c_double), intent(in) :: mos_l_in_r(n_grid1,n_mo)
real(c_double), intent(in) :: mos_r_in_r(n_grid1,n_mo)
real(c_double), intent(in) :: int2_grad1_u12(n_grid1,3,n_mo,n_mo)
real(c_double), intent(out) :: no_0e(1)
end subroutine cutc_no_0e
! ---
subroutine cutc_no(n_grid1, n_mo, ne_a, ne_b, &
wr1, mos_l_in_r, mos_r_in_r, int2_grad1_u12, &
no_2e, no_1e, no_0e) bind(C, name = "cutc_no")
import c_int, c_double, c_ptr
integer(c_int), intent(in), value :: n_grid1
integer(c_int), intent(in), value :: n_mo
integer(c_int), intent(in), value :: ne_a
integer(c_int), intent(in), value :: ne_b
real(c_double), intent(in) :: wr1(n_grid1)
real(c_double), intent(in) :: mos_l_in_r(n_grid1,n_mo)
real(c_double), intent(in) :: mos_r_in_r(n_grid1,n_mo)
real(c_double), intent(in) :: int2_grad1_u12(n_grid1,3,n_mo,n_mo)
real(c_double), intent(out) :: no_2e(n_mo,n_mo,n_mo,n_mo)
real(c_double), intent(out) :: no_1e(n_mo,n_mo)
real(c_double), intent(out) :: no_0e(1)
end subroutine cutc_no
! ---
end interface
end module cutc_module

View File

@ -0,0 +1,96 @@
! ---
subroutine deb_no_0e_gpu()
use cutc_module
implicit none
integer :: i, j, k, l, ipoint
double precision :: acc_thr, err_tot, nrm_tot, err_loc
double precision :: noL_0e
double precision :: noL_0e_gpu(1)
double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:)
double precision, allocatable :: tmp(:,:,:,:)
double precision, allocatable :: int2_grad1_u12_bimo_t(:,:,:,:)
PROVIDE mo_l_coef mo_r_coef
PROVIDE mos_l_in_r_array_transp mos_r_in_r_array_transp
allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,3))
print*, ' Reading int2_grad1_u12_ao from ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao'
open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="read")
read(11) int2_grad1_u12_ao
close(11)
allocate(tmp(mo_num,mo_num,n_points_final_grid,3))
allocate(int2_grad1_u12_bimo_t(n_points_final_grid,3,mo_num,mo_num))
!$OMP PARALLEL &
!$OMP DEFAULT (NONE) &
!$OMP PRIVATE (ipoint) &
!$OMP SHARED (ao_num, mo_num, n_points_final_grid, int2_grad1_u12_ao, tmp)
!$OMP DO SCHEDULE (dynamic)
do ipoint = 1, n_points_final_grid
call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,1), ao_num, tmp(1,1,ipoint,1), mo_num)
call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,2), ao_num, tmp(1,1,ipoint,2), mo_num)
call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,3), ao_num, tmp(1,1,ipoint,3), mo_num)
enddo
!$OMP END DO
!$OMP END PARALLEL
deallocate(int2_grad1_u12_ao)
!$OMP PARALLEL &
!$OMP DEFAULT (NONE) &
!$OMP PRIVATE (i, j, ipoint) &
!$OMP SHARED (mo_num, n_points_final_grid, tmp, int2_grad1_u12_bimo_t)
!$OMP DO COLLAPSE(2) SCHEDULE (dynamic)
do ipoint = 1, n_points_final_grid
do i = 1, mo_num
do j = 1, mo_num
int2_grad1_u12_bimo_t(ipoint,1,j,i) = tmp(j,i,ipoint,1)
int2_grad1_u12_bimo_t(ipoint,2,j,i) = tmp(j,i,ipoint,2)
int2_grad1_u12_bimo_t(ipoint,3,j,i) = tmp(j,i,ipoint,3)
enddo
enddo
enddo
!$OMP END DO
!$OMP END PARALLEL
deallocate(tmp)
! ---
call cutc_no_0e(n_points_final_grid, mo_num, elec_alpha_num, elec_beta_num, &
final_weight_at_r_vector(1), &
mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), &
int2_grad1_u12_bimo_t(1,1,1,1), noL_0e_gpu(1))
! ---
call provide_no_0e(n_points_final_grid, mo_num, elec_alpha_num, elec_beta_num, &
final_weight_at_r_vector(1), &
mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), &
int2_grad1_u12_bimo_t(1,1,1,1), noL_0e)
! ---
deallocate(int2_grad1_u12_bimo_t)
print *, 'noL_0e CPU = ', noL_0e
print *, 'noL_0e GPU = ', noL_0e_gpu(1)
err_tot = dabs(noL_0e - noL_0e_gpu(1))
nrm_tot = dabs(noL_0e)
print *, ' absolute accuracy on noL_0e (%) =', 100.d0 * err_tot / nrm_tot
return
end
! ---

View File

@ -0,0 +1,170 @@
! ---
subroutine deb_no_gpu()
use cutc_module
implicit none
integer :: i, j, k, l, ipoint
double precision :: acc_thr, err_tot, nrm_tot, err_loc
double precision :: noL_0e
double precision :: noL_0e_gpu(1)
double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:)
double precision, allocatable :: tmp(:,:,:,:)
double precision, allocatable :: int2_grad1_u12_bimo_t(:,:,:,:)
double precision, allocatable :: noL_1e (:,:)
double precision, allocatable :: noL_1e_gpu(:,:)
double precision, allocatable :: noL_2e (:,:,:,:)
double precision, allocatable :: noL_2e_gpu(:,:,:,:)
PROVIDE mo_l_coef mo_r_coef
PROVIDE mos_l_in_r_array_transp mos_r_in_r_array_transp
allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,3))
print*, ' Reading int2_grad1_u12_ao from ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao'
open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="read")
read(11) int2_grad1_u12_ao
close(11)
allocate(tmp(mo_num,mo_num,n_points_final_grid,3))
allocate(int2_grad1_u12_bimo_t(n_points_final_grid,3,mo_num,mo_num))
!$OMP PARALLEL &
!$OMP DEFAULT (NONE) &
!$OMP PRIVATE (ipoint) &
!$OMP SHARED (ao_num, mo_num, n_points_final_grid, int2_grad1_u12_ao, tmp)
!$OMP DO SCHEDULE (dynamic)
do ipoint = 1, n_points_final_grid
call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,1), ao_num, tmp(1,1,ipoint,1), mo_num)
call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,2), ao_num, tmp(1,1,ipoint,2), mo_num)
call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,3), ao_num, tmp(1,1,ipoint,3), mo_num)
enddo
!$OMP END DO
!$OMP END PARALLEL
deallocate(int2_grad1_u12_ao)
!$OMP PARALLEL &
!$OMP DEFAULT (NONE) &
!$OMP PRIVATE (i, j, ipoint) &
!$OMP SHARED (mo_num, n_points_final_grid, tmp, int2_grad1_u12_bimo_t)
!$OMP DO COLLAPSE(2) SCHEDULE (dynamic)
do ipoint = 1, n_points_final_grid
do i = 1, mo_num
do j = 1, mo_num
int2_grad1_u12_bimo_t(ipoint,1,j,i) = tmp(j,i,ipoint,1)
int2_grad1_u12_bimo_t(ipoint,2,j,i) = tmp(j,i,ipoint,2)
int2_grad1_u12_bimo_t(ipoint,3,j,i) = tmp(j,i,ipoint,3)
enddo
enddo
enddo
!$OMP END DO
!$OMP END PARALLEL
deallocate(tmp)
! ---
allocate(noL_2e_gpu(mo_num,mo_num,mo_num,mo_num))
allocate(noL_1e_gpu(mo_num,mo_num))
call cutc_no(n_points_final_grid, mo_num, elec_alpha_num, elec_beta_num, &
final_weight_at_r_vector(1), &
mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), &
int2_grad1_u12_bimo_t(1,1,1,1), noL_2e_gpu(1,1,1,1), noL_1e_gpu(1,1), noL_0e_gpu(1))
! ---
allocate(noL_2e(mo_num,mo_num,mo_num,mo_num))
allocate(noL_1e(mo_num,mo_num))
call provide_no_2e(n_points_final_grid, mo_num, elec_alpha_num, elec_beta_num, &
final_weight_at_r_vector(1), &
mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), &
int2_grad1_u12_bimo_t(1,1,1,1), noL_2e(1,1,1,1))
call provide_no_1e(n_points_final_grid, mo_num, elec_alpha_num, elec_beta_num, &
final_weight_at_r_vector(1), &
mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), &
int2_grad1_u12_bimo_t(1,1,1,1), noL_1e(1,1))
call provide_no_0e(n_points_final_grid, mo_num, elec_alpha_num, elec_beta_num, &
final_weight_at_r_vector(1), &
mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), &
int2_grad1_u12_bimo_t(1,1,1,1), noL_0e)
! ---
deallocate(int2_grad1_u12_bimo_t)
acc_thr = 1d-12
! ---
err_tot = 0.d0
nrm_tot = 0.d0
do i = 1, mo_num
do j = 1, mo_num
do k = 1, mo_num
do l = 1, mo_num
err_loc = dabs(noL_2e(l,k,j,i) - noL_2e_gpu(l,k,j,i))
if(err_loc > acc_thr) then
print*, " error on", l, k, j, i
print*, " CPU res", noL_2e (l,k,j,i)
print*, " GPU res", noL_2e_gpu(l,k,j,i)
stop
endif
err_tot = err_tot + err_loc
nrm_tot = nrm_tot + dabs(noL_2e(l,k,j,i))
enddo
enddo
enddo
enddo
print *, ' absolute accuracy on noL_2e (%) =', 100.d0 * err_tot / nrm_tot
deallocate(noL_2e)
deallocate(noL_2e_gpu)
! ---
err_tot = 0.d0
nrm_tot = 0.d0
do k = 1, mo_num
do l = 1, mo_num
err_loc = dabs(noL_1e(l,k) - noL_1e_gpu(l,k))
if(err_loc > acc_thr) then
print*, " error on", l, k
print*, " CPU res", noL_1e (l,k)
print*, " GPU res", noL_1e_gpu(l,k)
stop
endif
err_tot = err_tot + err_loc
nrm_tot = nrm_tot + dabs(noL_1e(l,k))
enddo
enddo
print *, ' absolute accuracy on noL_1e (%) =', 100.d0 * err_tot / nrm_tot
deallocate(noL_1e)
deallocate(noL_1e_gpu)
! ---
print *, 'noL_0e CPU = ', noL_0e
print *, 'noL_0e GPU = ', noL_0e_gpu(1)
err_tot = dabs(noL_0e - noL_0e_gpu(1))
nrm_tot = dabs(noL_0e)
print *, ' absolute accuracy on noL_0e (%) =', 100.d0 * err_tot / nrm_tot
return
end
! ---

View File

@ -41,9 +41,13 @@ subroutine main()
!call deb_no_2e_gpu_tmp()
!call deb_no_2e_gpu()
call deb_no_1e_gpu_tmp()
!call deb_no_1e_gpu_tmp()
!call deb_no_1e_gpu()
!call deb_no_0e_gpu()
call deb_no_gpu()
return
end

View File

@ -3,12 +3,6 @@
subroutine provide_no_0e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r, int2_grad1_u12, noL_0e)
BEGIN_DOC
!
! < Phi_left | L | Phi_right >
!
END_DOC
implicit none
integer, intent(in) :: n_grid, n_mo
@ -22,44 +16,47 @@ subroutine provide_no_0e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r,
integer :: i, j, k, ipoint
double precision :: t0, t1
double precision, allocatable :: tmp(:)
double precision, allocatable :: tmp_L(:,:), tmp_R(:,:)
double precision, allocatable :: tmp_M(:,:), tmp_S(:), tmp_O(:), tmp_J(:,:)
double precision, allocatable :: tmp_M_priv(:,:), tmp_S_priv(:), tmp_O_priv(:), tmp_J_priv(:,:)
double precision, allocatable :: tmpL(:,:), tmpR(:,:)
double precision, allocatable :: tmpM(:,:), tmpS(:), tmpO(:), tmpJ(:,:)
double precision, allocatable :: tmpM_priv(:,:), tmpS_priv(:), tmpO_priv(:), tmpJ_priv(:,:)
call wall_time(t0)
if(ne_a .eq. ne_b) then
allocate(tmp(ne_b))
allocate(tmp_L(n_grid,3), tmp_R(n_grid,3))
allocate(tmpL(n_grid,3), tmpR(n_grid,3))
!$OMP PARALLEL &
!$OMP DEFAULT(NONE) &
!$OMP PRIVATE(j, i, ipoint, tmp_L, tmp_R) &
!$OMP SHARED(ne_b, n_grid, &
!$OMP PRIVATE(j, i, ipoint, tmpL, tmpR) &
!$OMP SHARED(ne_b, n_grid, &
!$OMP mos_l_in_r, mos_r_in_r, wr1, &
!$OMP int2_grad1_u12, tmp)
!$OMP DO
do j = 1, ne_b
tmp_L = 0.d0
tmp_R = 0.d0
tmpL = 0.d0
tmpR = 0.d0
do i = 1, ne_b
do ipoint = 1, n_grid
tmp_L(ipoint,1) = tmp_L(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i)
tmp_L(ipoint,2) = tmp_L(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i)
tmp_L(ipoint,3) = tmp_L(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i)
tmpL(ipoint,1) = tmpL(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i)
tmpL(ipoint,2) = tmpL(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i)
tmpL(ipoint,3) = tmpL(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i)
tmp_R(ipoint,1) = tmp_R(ipoint,1) + int2_grad1_u12(ipoint,1,i,j) * mos_r_in_r(ipoint,i)
tmp_R(ipoint,2) = tmp_R(ipoint,2) + int2_grad1_u12(ipoint,2,i,j) * mos_r_in_r(ipoint,i)
tmp_R(ipoint,3) = tmp_R(ipoint,3) + int2_grad1_u12(ipoint,3,i,j) * mos_r_in_r(ipoint,i)
tmpR(ipoint,1) = tmpR(ipoint,1) + int2_grad1_u12(ipoint,1,i,j) * mos_r_in_r(ipoint,i)
tmpR(ipoint,2) = tmpR(ipoint,2) + int2_grad1_u12(ipoint,2,i,j) * mos_r_in_r(ipoint,i)
tmpR(ipoint,3) = tmpR(ipoint,3) + int2_grad1_u12(ipoint,3,i,j) * mos_r_in_r(ipoint,i)
enddo
enddo
tmp(j) = 0.d0
do ipoint = 1, n_grid
tmp(j) = tmp(j) + wr1(ipoint) * (tmp_L(ipoint,1)*tmp_R(ipoint,1) + tmp_L(ipoint,2)*tmp_R(ipoint,2) + tmp_L(ipoint,3)*tmp_R(ipoint,3))
tmp(j) = tmp(j) + wr1(ipoint) * (tmpL(ipoint,1)*tmpR(ipoint,1) + tmpL(ipoint,2)*tmpR(ipoint,2) + tmpL(ipoint,3)*tmpR(ipoint,3))
enddo
enddo ! j
!$OMP END DO
@ -68,150 +65,149 @@ subroutine provide_no_0e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r,
noL_0e = -2.d0 * sum(tmp)
deallocate(tmp)
deallocate(tmp_L, tmp_R)
deallocate(tmpL, tmpR)
! ---
allocate(tmp_O(n_grid), tmp_J(n_grid,3))
tmp_O = 0.d0
tmp_J = 0.d0
allocate(tmpO(n_grid), tmpJ(n_grid,3))
tmpO = 0.d0
tmpJ = 0.d0
!$OMP PARALLEL &
!$OMP DEFAULT(NONE) &
!$OMP PRIVATE(i, ipoint, tmp_O_priv, tmp_J_priv) &
!$OMP SHARED(ne_b, n_grid, &
!$OMP mos_l_in_r, mos_r_in_r, &
!$OMP int2_grad1_u12, tmp_O, tmp_J)
!$OMP PARALLEL &
!$OMP DEFAULT(NONE) &
!$OMP PRIVATE(i, ipoint, tmpO_priv, tmpJ_priv) &
!$OMP SHARED(ne_b, n_grid, &
!$OMP mos_l_in_r, mos_r_in_r, &
!$OMP int2_grad1_u12, tmpO, tmpJ)
allocate(tmp_O_priv(n_grid), tmp_J_priv(n_grid,3))
tmp_O_priv = 0.d0
tmp_J_priv = 0.d0
allocate(tmpO_priv(n_grid), tmpJ_priv(n_grid,3))
tmpO_priv = 0.d0
tmpJ_priv = 0.d0
!$OMP DO
do i = 1, ne_b
do ipoint = 1, n_grid
tmp_O_priv(ipoint) = tmp_O_priv(ipoint) + mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i)
tmp_J_priv(ipoint,1) = tmp_J_priv(ipoint,1) + int2_grad1_u12(ipoint,1,i,i)
tmp_J_priv(ipoint,2) = tmp_J_priv(ipoint,2) + int2_grad1_u12(ipoint,2,i,i)
tmp_J_priv(ipoint,3) = tmp_J_priv(ipoint,3) + int2_grad1_u12(ipoint,3,i,i)
tmpO_priv(ipoint) = tmpO_priv(ipoint) + mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i)
tmpJ_priv(ipoint,1) = tmpJ_priv(ipoint,1) + int2_grad1_u12(ipoint,1,i,i)
tmpJ_priv(ipoint,2) = tmpJ_priv(ipoint,2) + int2_grad1_u12(ipoint,2,i,i)
tmpJ_priv(ipoint,3) = tmpJ_priv(ipoint,3) + int2_grad1_u12(ipoint,3,i,i)
enddo
enddo
!$OMP END DO NOWAIT
!$OMP CRITICAL
tmp_O = tmp_O + tmp_O_priv
tmp_J = tmp_J + tmp_J_priv
tmpO = tmpO + tmpO_priv
tmpJ = tmpJ + tmpJ_priv
!$OMP END CRITICAL
deallocate(tmp_O_priv, tmp_J_priv)
deallocate(tmpO_priv, tmpJ_priv)
!$OMP END PARALLEL
allocate(tmp_M(n_grid,3), tmp_S(n_grid))
tmp_M = 0.d0
tmp_S = 0.d0
allocate(tmpM(n_grid,3), tmpS(n_grid))
tmpM = 0.d0
tmpS = 0.d0
!$OMP PARALLEL &
!$OMP DEFAULT(NONE) &
!$OMP PRIVATE(i, j, ipoint, tmp_M_priv, tmp_S_priv) &
!$OMP SHARED(ne_b, n_grid, &
!$OMP mos_l_in_r, mos_r_in_r, &
!$OMP int2_grad1_u12, tmp_M, tmp_S)
!$OMP PARALLEL &
!$OMP DEFAULT(NONE) &
!$OMP PRIVATE(i, j, ipoint, tmpM_priv, tmpS_priv) &
!$OMP SHARED(ne_b, n_grid, &
!$OMP mos_l_in_r, mos_r_in_r, &
!$OMP int2_grad1_u12, tmpM, tmpS)
allocate(tmp_M_priv(n_grid,3), tmp_S_priv(n_grid))
tmp_M_priv = 0.d0
tmp_S_priv = 0.d0
allocate(tmpM_priv(n_grid,3), tmpS_priv(n_grid))
tmpM_priv = 0.d0
tmpS_priv = 0.d0
!$OMP DO COLLAPSE(2)
do i = 1, ne_b
do j = 1, ne_b
do ipoint = 1, n_grid
tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
tmp_S_priv(ipoint) = tmp_S_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) &
+ int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) &
+ int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i)
tmpS_priv(ipoint) = tmpS_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) &
+ int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) &
+ int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i)
enddo
enddo
enddo
!$OMP END DO NOWAIT
!$OMP CRITICAL
tmp_M = tmp_M + tmp_M_priv
tmp_S = tmp_S + tmp_S_priv
tmpM = tmpM + tmpM_priv
tmpS = tmpS + tmpS_priv
!$OMP END CRITICAL
deallocate(tmp_M_priv, tmp_S_priv)
deallocate(tmpM_priv, tmpS_priv)
!$OMP END PARALLEL
allocate(tmp(n_grid))
do ipoint = 1, n_grid
tmp_S(ipoint) = 2.d0 * (tmp_J(ipoint,1)*tmp_J(ipoint,1) + tmp_J(ipoint,2)*tmp_J(ipoint,2) + tmp_J(ipoint,3)*tmp_J(ipoint,3)) - tmp_S(ipoint)
tmpS(ipoint) = 2.d0 * (tmpJ(ipoint,1)*tmpJ(ipoint,1) + tmpJ(ipoint,2)*tmpJ(ipoint,2) + tmpJ(ipoint,3)*tmpJ(ipoint,3)) - tmpS(ipoint)
tmp(ipoint) = wr1(ipoint) * ( tmp_O(ipoint) * tmp_S(ipoint) &
- 2.d0 * ( tmp_J(ipoint,1) * tmp_M(ipoint,1) &
+ tmp_J(ipoint,2) * tmp_M(ipoint,2) &
+ tmp_J(ipoint,3) * tmp_M(ipoint,3)))
tmp(ipoint) = wr1(ipoint) * ( tmpO(ipoint) * tmpS(ipoint) - 2.d0 * ( tmpJ(ipoint,1) * tmpM(ipoint,1) &
+ tmpJ(ipoint,2) * tmpM(ipoint,2) &
+ tmpJ(ipoint,3) * tmpM(ipoint,3) ) )
enddo
noL_0e = noL_0e -2.d0 * (sum(tmp))
noL_0e = noL_0e - 2.d0 * (sum(tmp))
deallocate(tmp)
else
allocate(tmp(ne_a))
allocate(tmp_L(n_grid,3), tmp_R(n_grid,3))
allocate(tmpL(n_grid,3), tmpR(n_grid,3))
!$OMP PARALLEL &
!$OMP DEFAULT(NONE) &
!$OMP PRIVATE(j, i, ipoint, tmp_L, tmp_R) &
!$OMP SHARED(ne_b, ne_a, n_grid, &
!$OMP mos_l_in_r, mos_r_in_r, &
!$OMP PARALLEL &
!$OMP DEFAULT(NONE) &
!$OMP PRIVATE(j, i, ipoint, tmpL, tmpR) &
!$OMP SHARED(ne_b, ne_a, n_grid, &
!$OMP mos_l_in_r, mos_r_in_r, &
!$OMP int2_grad1_u12, tmp, wr1)
!$OMP DO
do j = 1, ne_b
tmp_L = 0.d0
tmp_R = 0.d0
tmpL = 0.d0
tmpR = 0.d0
do i = ne_b+1, ne_a
do ipoint = 1, n_grid
tmp_L(ipoint,1) = tmp_L(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i)
tmp_L(ipoint,2) = tmp_L(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i)
tmp_L(ipoint,3) = tmp_L(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i)
tmpL(ipoint,1) = tmpL(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i)
tmpL(ipoint,2) = tmpL(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i)
tmpL(ipoint,3) = tmpL(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i)
tmp_R(ipoint,1) = tmp_R(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * mos_r_in_r(ipoint,i)
tmp_R(ipoint,2) = tmp_R(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * mos_r_in_r(ipoint,i)
tmp_R(ipoint,3) = tmp_R(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * mos_r_in_r(ipoint,i)
tmpR(ipoint,1) = tmpR(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * mos_r_in_r(ipoint,i)
tmpR(ipoint,2) = tmpR(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * mos_r_in_r(ipoint,i)
tmpR(ipoint,3) = tmpR(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * mos_r_in_r(ipoint,i)
enddo
enddo
tmp(j) = 0.d0
do ipoint = 1, n_grid
tmp(j) = tmp(j) + wr1(ipoint) * (tmp_L(ipoint,1)*tmp_R(ipoint,1) + tmp_L(ipoint,2)*tmp_R(ipoint,2) + tmp_L(ipoint,3)*tmp_R(ipoint,3))
tmp(j) = tmp(j) + wr1(ipoint) * (tmpL(ipoint,1)*tmpR(ipoint,1) + tmpL(ipoint,2)*tmpR(ipoint,2) + tmpL(ipoint,3)*tmpR(ipoint,3))
enddo
do i = 1, ne_b
do ipoint = 1, n_grid
tmp_L(ipoint,1) = tmp_L(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i)
tmp_L(ipoint,2) = tmp_L(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i)
tmp_L(ipoint,3) = tmp_L(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i)
tmpL(ipoint,1) = tmpL(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i)
tmpL(ipoint,2) = tmpL(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i)
tmpL(ipoint,3) = tmpL(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i)
tmp_R(ipoint,1) = tmp_R(ipoint,1) + int2_grad1_u12(ipoint,1,i,j) * mos_r_in_r(ipoint,i)
tmp_R(ipoint,2) = tmp_R(ipoint,2) + int2_grad1_u12(ipoint,2,i,j) * mos_r_in_r(ipoint,i)
tmp_R(ipoint,3) = tmp_R(ipoint,3) + int2_grad1_u12(ipoint,3,i,j) * mos_r_in_r(ipoint,i)
tmpR(ipoint,1) = tmpR(ipoint,1) + int2_grad1_u12(ipoint,1,i,j) * mos_r_in_r(ipoint,i)
tmpR(ipoint,2) = tmpR(ipoint,2) + int2_grad1_u12(ipoint,2,i,j) * mos_r_in_r(ipoint,i)
tmpR(ipoint,3) = tmpR(ipoint,3) + int2_grad1_u12(ipoint,3,i,j) * mos_r_in_r(ipoint,i)
enddo
enddo
do ipoint = 1, n_grid
tmp(j) = tmp(j) + wr1(ipoint) * (tmp_L(ipoint,1)*tmp_R(ipoint,1) + tmp_L(ipoint,2)*tmp_R(ipoint,2) + tmp_L(ipoint,3)*tmp_R(ipoint,3))
tmp(j) = tmp(j) + wr1(ipoint) * (tmpL(ipoint,1)*tmpR(ipoint,1) + tmpL(ipoint,2)*tmpR(ipoint,2) + tmpL(ipoint,3)*tmpR(ipoint,3))
enddo
enddo ! j
!$OMP END DO
@ -219,33 +215,33 @@ subroutine provide_no_0e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r,
! ---
!$OMP PARALLEL &
!$OMP DEFAULT(NONE) &
!$OMP PRIVATE(j, i, ipoint, tmp_L, tmp_R) &
!$OMP SHARED(ne_b, ne_a, n_grid, &
!$OMP mos_l_in_r, mos_r_in_r, &
!$OMP PARALLEL &
!$OMP DEFAULT(NONE) &
!$OMP PRIVATE(j, i, ipoint, tmpL, tmpR) &
!$OMP SHARED(ne_b, ne_a, n_grid, &
!$OMP mos_l_in_r, mos_r_in_r, &
!$OMP int2_grad1_u12, tmp, wr1)
!$OMP DO
do j = ne_b+1, ne_a
tmp_L = 0.d0
tmp_R = 0.d0
tmpL = 0.d0
tmpR = 0.d0
do i = 1, ne_a
do ipoint = 1, n_grid
tmp_L(ipoint,1) = tmp_L(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i)
tmp_L(ipoint,2) = tmp_L(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i)
tmp_L(ipoint,3) = tmp_L(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i)
tmpL(ipoint,1) = tmpL(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i)
tmpL(ipoint,2) = tmpL(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i)
tmpL(ipoint,3) = tmpL(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i)
tmp_R(ipoint,1) = tmp_R(ipoint,1) + int2_grad1_u12(ipoint,1,i,j) * mos_r_in_r(ipoint,i)
tmp_R(ipoint,2) = tmp_R(ipoint,2) + int2_grad1_u12(ipoint,2,i,j) * mos_r_in_r(ipoint,i)
tmp_R(ipoint,3) = tmp_R(ipoint,3) + int2_grad1_u12(ipoint,3,i,j) * mos_r_in_r(ipoint,i)
tmpR(ipoint,1) = tmpR(ipoint,1) + int2_grad1_u12(ipoint,1,i,j) * mos_r_in_r(ipoint,i)
tmpR(ipoint,2) = tmpR(ipoint,2) + int2_grad1_u12(ipoint,2,i,j) * mos_r_in_r(ipoint,i)
tmpR(ipoint,3) = tmpR(ipoint,3) + int2_grad1_u12(ipoint,3,i,j) * mos_r_in_r(ipoint,i)
enddo
enddo
tmp(j) = 0.d0
do ipoint = 1, n_grid
tmp(j) = tmp(j) + 0.5d0 * wr1(ipoint) * (tmp_L(ipoint,1)*tmp_R(ipoint,1) + tmp_L(ipoint,2)*tmp_R(ipoint,2) + tmp_L(ipoint,3)*tmp_R(ipoint,3))
tmp(j) = tmp(j) + 0.5d0 * wr1(ipoint) * (tmpL(ipoint,1)*tmpR(ipoint,1) + tmpL(ipoint,2)*tmpR(ipoint,2) + tmpL(ipoint,3)*tmpR(ipoint,3))
enddo
enddo ! j
!$OMP END DO
@ -254,32 +250,32 @@ subroutine provide_no_0e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r,
noL_0e = -2.d0 * sum(tmp)
deallocate(tmp)
deallocate(tmp_L, tmp_R)
deallocate(tmpL, tmpR)
! ---
allocate(tmp_O(n_grid), tmp_J(n_grid,3))
tmp_O = 0.d0
tmp_J = 0.d0
allocate(tmpO(n_grid), tmpJ(n_grid,3))
tmpO = 0.d0
tmpJ = 0.d0
!$OMP PARALLEL &
!$OMP DEFAULT(NONE) &
!$OMP PRIVATE(i, ipoint, tmp_O_priv, tmp_J_priv) &
!$OMP SHARED(ne_b, ne_a, n_grid, &
!$OMP mos_l_in_r, mos_r_in_r, &
!$OMP int2_grad1_u12, tmp_O, tmp_J)
!$OMP PARALLEL &
!$OMP DEFAULT(NONE) &
!$OMP PRIVATE(i, ipoint, tmpO_priv, tmpJ_priv) &
!$OMP SHARED(ne_b, ne_a, n_grid, &
!$OMP mos_l_in_r, mos_r_in_r, &
!$OMP int2_grad1_u12, tmpO, tmpJ)
allocate(tmp_O_priv(n_grid), tmp_J_priv(n_grid,3))
tmp_O_priv = 0.d0
tmp_J_priv = 0.d0
allocate(tmpO_priv(n_grid), tmpJ_priv(n_grid,3))
tmpO_priv = 0.d0
tmpJ_priv = 0.d0
!$OMP DO
do i = 1, ne_b
do ipoint = 1, n_grid
tmp_O_priv(ipoint) = tmp_O_priv(ipoint) + mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i)
tmp_J_priv(ipoint,1) = tmp_J_priv(ipoint,1) + int2_grad1_u12(ipoint,1,i,i)
tmp_J_priv(ipoint,2) = tmp_J_priv(ipoint,2) + int2_grad1_u12(ipoint,2,i,i)
tmp_J_priv(ipoint,3) = tmp_J_priv(ipoint,3) + int2_grad1_u12(ipoint,3,i,i)
tmpO_priv(ipoint) = tmpO_priv(ipoint) + mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i)
tmpJ_priv(ipoint,1) = tmpJ_priv(ipoint,1) + int2_grad1_u12(ipoint,1,i,i)
tmpJ_priv(ipoint,2) = tmpJ_priv(ipoint,2) + int2_grad1_u12(ipoint,2,i,i)
tmpJ_priv(ipoint,3) = tmpJ_priv(ipoint,3) + int2_grad1_u12(ipoint,3,i,i)
enddo
enddo
!$OMP END DO NOWAIT
@ -287,49 +283,49 @@ subroutine provide_no_0e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r,
!$OMP DO
do i = ne_b+1, ne_a
do ipoint = 1, n_grid
tmp_O_priv(ipoint) = tmp_O_priv(ipoint) + 0.5d0 * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i)
tmp_J_priv(ipoint,1) = tmp_J_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,i)
tmp_J_priv(ipoint,2) = tmp_J_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,i)
tmp_J_priv(ipoint,3) = tmp_J_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,i)
tmpO_priv(ipoint) = tmpO_priv(ipoint) + 0.5d0 * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i)
tmpJ_priv(ipoint,1) = tmpJ_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,i)
tmpJ_priv(ipoint,2) = tmpJ_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,i)
tmpJ_priv(ipoint,3) = tmpJ_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,i)
enddo
enddo
!$OMP END DO NOWAIT
!$OMP CRITICAL
tmp_O = tmp_O + tmp_O_priv
tmp_J = tmp_J + tmp_J_priv
tmpO = tmpO + tmpO_priv
tmpJ = tmpJ + tmpJ_priv
!$OMP END CRITICAL
deallocate(tmp_O_priv, tmp_J_priv)
deallocate(tmpO_priv, tmpJ_priv)
!$OMP END PARALLEL
! ---
allocate(tmp_M(n_grid,3), tmp_S(n_grid))
tmp_M = 0.d0
tmp_S = 0.d0
allocate(tmpM(n_grid,3), tmpS(n_grid))
tmpM = 0.d0
tmpS = 0.d0
!$OMP PARALLEL &
!$OMP DEFAULT(NONE) &
!$OMP PRIVATE(i, j, ipoint, tmp_M_priv, tmp_S_priv) &
!$OMP SHARED(ne_b, ne_a, n_grid, &
!$OMP mos_l_in_r, mos_r_in_r, &
!$OMP int2_grad1_u12, tmp_M, tmp_S)
!$OMP PARALLEL &
!$OMP DEFAULT(NONE) &
!$OMP PRIVATE(i, j, ipoint, tmpM_priv, tmpS_priv) &
!$OMP SHARED(ne_b, ne_a, n_grid, &
!$OMP mos_l_in_r, mos_r_in_r, &
!$OMP int2_grad1_u12, tmpM, tmpS)
allocate(tmp_M_priv(n_grid,3), tmp_S_priv(n_grid))
tmp_M_priv = 0.d0
tmp_S_priv = 0.d0
allocate(tmpM_priv(n_grid,3), tmpS_priv(n_grid))
tmpM_priv = 0.d0
tmpS_priv = 0.d0
!$OMP DO COLLAPSE(2)
do i = 1, ne_b
do j = 1, ne_b
do ipoint = 1, n_grid
tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
tmp_S_priv(ipoint) = tmp_S_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) &
tmpS_priv(ipoint) = tmpS_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) &
+ int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) &
+ int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i)
enddo
@ -342,17 +338,17 @@ subroutine provide_no_0e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r,
do j = 1, ne_b
do ipoint = 1, n_grid
tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i)
tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i)
tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i)
tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i)
tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i)
tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i)
tmp_S_priv(ipoint) = tmp_S_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) &
+ int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) &
+ int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i)
tmpS_priv(ipoint) = tmpS_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) &
+ int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) &
+ int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i)
enddo
enddo
enddo
@ -363,39 +359,38 @@ subroutine provide_no_0e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r,
do j = ne_b+1, ne_a
do ipoint = 1, n_grid
tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j)
tmp_S_priv(ipoint) = tmp_S_priv(ipoint) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) &
+ 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) &
+ 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i)
tmpS_priv(ipoint) = tmpS_priv(ipoint) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) &
+ 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) &
+ 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i)
enddo
enddo
enddo
!$OMP END DO NOWAIT
!$OMP CRITICAL
tmp_M = tmp_M + tmp_M_priv
tmp_S = tmp_S + tmp_S_priv
tmpM = tmpM + tmpM_priv
tmpS = tmpS + tmpS_priv
!$OMP END CRITICAL
deallocate(tmp_M_priv, tmp_S_priv)
deallocate(tmpM_priv, tmpS_priv)
!$OMP END PARALLEL
allocate(tmp(n_grid))
do ipoint = 1, n_grid
tmp_S(ipoint) = 2.d0 * (tmp_J(ipoint,1)*tmp_J(ipoint,1) + tmp_J(ipoint,2)*tmp_J(ipoint,2) + tmp_J(ipoint,3)*tmp_J(ipoint,3)) - tmp_S(ipoint)
tmpS(ipoint) = 2.d0 * (tmpJ(ipoint,1)*tmpJ(ipoint,1) + tmpJ(ipoint,2)*tmpJ(ipoint,2) + tmpJ(ipoint,3)*tmpJ(ipoint,3)) - tmpS(ipoint)
tmp(ipoint) = wr1(ipoint) * ( tmp_O(ipoint) * tmp_S(ipoint) &
- 2.d0 * ( tmp_J(ipoint,1) * tmp_M(ipoint,1) &
+ tmp_J(ipoint,2) * tmp_M(ipoint,2) &
+ tmp_J(ipoint,3) * tmp_M(ipoint,3)))
tmp(ipoint) = wr1(ipoint) * ( tmpO(ipoint) * tmpS(ipoint) - 2.d0 * ( tmpJ(ipoint,1) * tmpM(ipoint,1) &
+ tmpJ(ipoint,2) * tmpM(ipoint,2) &
+ tmpJ(ipoint,3) * tmpM(ipoint,3) ) )
enddo
noL_0e = noL_0e -2.d0 * (sum(tmp))
noL_0e = noL_0e - 2.d0 * (sum(tmp))
deallocate(tmp)

View File

@ -120,12 +120,12 @@ subroutine do_work_on_gpu()
call wall_time(cuda_time0)
print*, ' start CUDA kernel'
call cutc_int_c(nxBlocks, nyBlocks, nzBlocks, blockxSize, blockySize, blockzSize, &
n_points_final_grid, n_points_extra_final_grid, ao_num, nucl_num, jBH_size, &
final_grid_points, final_weight_at_r_vector, &
final_grid_points_extra, final_weight_at_r_vector_extra, &
rn, aos_data1, aos_data2, jBH_c, jBH_m, jBH_n, jBH_o, &
int2_grad1_u12_ao, int_2e_ao)
call cutc_int(nxBlocks, nyBlocks, nzBlocks, blockxSize, blockySize, blockzSize, &
n_points_final_grid, n_points_extra_final_grid, ao_num, nucl_num, jBH_size, &
final_grid_points, final_weight_at_r_vector, &
final_grid_points_extra, final_weight_at_r_vector_extra, &
rn, aos_data1, aos_data2, jBH_c, jBH_m, jBH_n, jBH_o, &
int2_grad1_u12_ao, int_2e_ao)
call wall_time(cuda_time1)
print*, ' wall time for CUDA kernel (min) = ', (cuda_time1-cuda_time0) / 60.d0