From 4bd8b710a548db5efddaf79fcdb555dace58a1c6 Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Fri, 2 Aug 2024 21:16:27 +0200 Subject: [PATCH] CuTC integrals: OK --- plugins/local/tc_int/compute_tc_int.irp.f | 11 +- plugins/local/tc_int/compute_tc_int_gpu.irp.f | 7 +- .../{gpu_module.F90 => cutc_module.F90} | 49 +- plugins/local/tc_int/deb_tc_int_cuda.irp.f | 487 ++++++++++++++---- plugins/local/tc_int/install | 17 + plugins/local/tc_int/uninstall | 13 + plugins/local/tc_int/write_tc_int_cuda.irp.f | 88 +--- 7 files changed, 482 insertions(+), 190 deletions(-) rename plugins/local/tc_int/{gpu_module.F90 => cutc_module.F90} (54%) create mode 100755 plugins/local/tc_int/install create mode 100755 plugins/local/tc_int/uninstall diff --git a/plugins/local/tc_int/compute_tc_int.irp.f b/plugins/local/tc_int/compute_tc_int.irp.f index 35034454..97815904 100644 --- a/plugins/local/tc_int/compute_tc_int.irp.f +++ b/plugins/local/tc_int/compute_tc_int.irp.f @@ -149,6 +149,7 @@ subroutine provide_int2_grad1_u12_ao() call wall_time(time1) print*, ' wall time for int2_grad1_u12_ao (min) = ', (time1-time0) / 60.d0 print*, ' wall time Jastrow derivatives (min) = ', tc / 60.d0 + call print_memory_usage() ! --- @@ -156,11 +157,11 @@ subroutine provide_int2_grad1_u12_ao() ! --- + allocate(c_mat(n_points_final_grid,ao_num,ao_num)) allocate(tc_int_2e_ao(ao_num,ao_num,ao_num,ao_num)) call wall_time(time1) - allocate(c_mat(n_points_final_grid,ao_num,ao_num)) !$OMP PARALLEL & !$OMP DEFAULT (NONE) & !$OMP PRIVATE (i, k, ipoint) & @@ -178,17 +179,16 @@ subroutine provide_int2_grad1_u12_ao() call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 & , int2_grad1_u12_ao(1,1,1,4), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid & , 0.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num) - deallocate(c_mat) call wall_time(time2) print*, ' wall time of Hermitian part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0 + call print_memory_usage() ! --- call wall_time(time1) - allocate(c_mat(n_points_final_grid,ao_num,ao_num)) do m = 1, 3 !$OMP PARALLEL & !$OMP DEFAULT (NONE) & @@ -215,10 +215,12 @@ subroutine provide_int2_grad1_u12_ao() , int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid & , 1.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num) enddo - deallocate(c_mat) call wall_time(time2) print*, ' wall time of non-Hermitian part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0 + + deallocate(c_mat) + call print_memory_usage() ! --- @@ -229,6 +231,7 @@ subroutine provide_int2_grad1_u12_ao() call wall_time(time2) print*, ' lower- and upper-triangle of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0 + call print_memory_usage() ! --- diff --git a/plugins/local/tc_int/compute_tc_int_gpu.irp.f b/plugins/local/tc_int/compute_tc_int_gpu.irp.f index 5db07dd6..c2653ac6 100644 --- a/plugins/local/tc_int/compute_tc_int_gpu.irp.f +++ b/plugins/local/tc_int/compute_tc_int_gpu.irp.f @@ -178,11 +178,11 @@ stop ! --- + allocate(c_mat(n_points_final_grid,ao_num,ao_num)) allocate(tc_int_2e_ao(ao_num,ao_num,ao_num,ao_num)) call wall_time(time1) - allocate(c_mat(n_points_final_grid,ao_num,ao_num)) !$OMP PARALLEL & !$OMP DEFAULT (NONE) & !$OMP PRIVATE (i, k, ipoint) & @@ -200,7 +200,6 @@ stop call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 & , int2_grad1_u12_ao%f(1,1,1,4), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid & , 0.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num) - deallocate(c_mat) call wall_time(time2) print*, ' wall time of Hermitian part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0 @@ -210,7 +209,6 @@ stop call wall_time(time1) - allocate(c_mat(n_points_final_grid,ao_num,ao_num)) do m = 1, 3 !$OMP PARALLEL & !$OMP DEFAULT (NONE) & @@ -237,12 +235,13 @@ stop , int2_grad1_u12_ao%f(1,1,1,m), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid & , 1.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num) enddo - deallocate(c_mat) call wall_time(time2) print*, ' wall time of non-Hermitian part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0 call print_memory_usage() + deallocate(c_mat) + ! --- call wall_time(time1) diff --git a/plugins/local/tc_int/gpu_module.F90 b/plugins/local/tc_int/cutc_module.F90 similarity index 54% rename from plugins/local/tc_int/gpu_module.F90 rename to plugins/local/tc_int/cutc_module.F90 index 1f07a2ea..69c2a131 100644 --- a/plugins/local/tc_int/gpu_module.F90 +++ b/plugins/local/tc_int/cutc_module.F90 @@ -1,5 +1,5 @@ -module gpu_module +module cutc_module use, intrinsic :: iso_c_binding @@ -9,7 +9,8 @@ module gpu_module ! --- - subroutine tc_int_c(nBlocks, blockSize, & + subroutine tc_int_c(nxBlocks, nyBlocks, nzBlocks, & + blockxSize, blockySize, blockzSize, & n_grid1, n_grid2, n_ao, n_nuc, size_bh, & r1, wr1, r2, wr2, rn, & aos_data1, aos_data2, & @@ -17,14 +18,16 @@ module gpu_module int2_grad1_u12_ao, int_2e_ao) bind(C, name = "tc_int_c") import c_int, c_double, c_ptr - integer(c_int), intent(in), value :: nBlocks, blockSize + integer(c_int), intent(in), value :: nxBlocks, blockxSize + integer(c_int), intent(in), value :: nyBlocks, blockySize + integer(c_int), intent(in), value :: nzBlocks, blockzSize integer(c_int), intent(in), value :: n_grid1, n_grid2 integer(c_int), intent(in), value :: n_ao integer(c_int), intent(in), value :: n_nuc integer(c_int), intent(in), value :: size_bh - real(c_double), intent(in) :: r1(n_grid1,3), wr1(n_grid1) - real(c_double), intent(in) :: r2(n_grid2,3), wr2(n_grid2) - real(c_double), intent(in) :: rn(n_nuc,3) + real(c_double), intent(in) :: r1(3,n_grid1), wr1(n_grid1) + real(c_double), intent(in) :: r2(3,n_grid2), wr2(n_grid2) + real(c_double), intent(in) :: rn(3,n_nuc) real(c_double), intent(in) :: aos_data1(n_grid1,n_ao,4) real(c_double), intent(in) :: aos_data2(n_grid2,n_ao,4) real(c_double), intent(in) :: c_bh(size_bh,n_nuc) @@ -66,8 +69,40 @@ module gpu_module ! --- + subroutine deb_int_2e_ao(nxBlocks, nyBlocks, nzBlocks, & + blockxSize, blockySize, blockzSize, & + n_grid1, n_grid2, n_ao, n_nuc, size_bh, & + r1, wr1, r2, wr2, rn, & + aos_data1, aos_data2, & + c_bh, m_bh, n_bh, o_bh, & + int2_grad1_u12_ao, int_2e_ao) bind(C, name = "deb_int_2e_ao") + + import c_int, c_double, c_ptr + integer(c_int), intent(in), value :: nxBlocks, blockxSize + integer(c_int), intent(in), value :: nyBlocks, blockySize + integer(c_int), intent(in), value :: nzBlocks, blockzSize + integer(c_int), intent(in), value :: n_grid1, n_grid2 + integer(c_int), intent(in), value :: n_ao + integer(c_int), intent(in), value :: n_nuc + integer(c_int), intent(in), value :: size_bh + real(c_double), intent(in) :: r1(3,n_grid1), wr1(n_grid1) + real(c_double), intent(in) :: r2(3,n_grid2), wr2(n_grid2) + real(c_double), intent(in) :: rn(3,n_nuc) + real(c_double), intent(in) :: aos_data1(n_grid1,n_ao,4) + real(c_double), intent(in) :: aos_data2(n_grid2,n_ao,4) + real(c_double), intent(in) :: c_bh(size_bh,n_nuc) + integer(c_int), intent(in) :: m_bh(size_bh,n_nuc) + integer(c_int), intent(in) :: n_bh(size_bh,n_nuc) + integer(c_int), intent(in) :: o_bh(size_bh,n_nuc) + real(c_double), intent(out) :: int2_grad1_u12_ao(n_ao,n_ao,n_grid1,4) + real(c_double), intent(out) :: int_2e_ao(n_ao,n_ao,n_ao,n_ao) + + end subroutine deb_int_2e_ao + + ! --- + end interface -end module gpu_module +end module cutc_module diff --git a/plugins/local/tc_int/deb_tc_int_cuda.irp.f b/plugins/local/tc_int/deb_tc_int_cuda.irp.f index f888d792..2c4e975b 100644 --- a/plugins/local/tc_int/deb_tc_int_cuda.irp.f +++ b/plugins/local/tc_int/deb_tc_int_cuda.irp.f @@ -36,7 +36,8 @@ subroutine main() implicit none - call deb_int2_grad1_u12_ao_gpu() + !call deb_int2_grad1_u12_ao_gpu() + call deb_int_2e_ao_gpu() return end @@ -45,7 +46,7 @@ end subroutine deb_int2_grad1_u12_ao_gpu() - use gpu_module + use cutc_module implicit none @@ -56,11 +57,10 @@ subroutine deb_int2_grad1_u12_ao_gpu() double precision :: acc_thr, err_tot, nrm_tot, err_loc double precision :: time0, time1 - double precision :: cuda_time0, cuda_time1 double precision :: cpu_time0, cpu_time1 double precision :: cpu_ttime0, cpu_ttime1 - double precision, allocatable :: r1(:,:), r2(:,:), rn(:,:), aos_data2(:,:,:) + double precision, allocatable :: rn(:,:), aos_data2(:,:,:) double precision, allocatable :: grad1_u12(:,:,:), int_fct_long_range(:,:,:) double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:) double precision, allocatable :: int2_grad1_u12_ao_gpu(:,:,:,:) @@ -73,23 +73,9 @@ subroutine deb_int2_grad1_u12_ao_gpu() ! --- - allocate(r1(n_points_final_grid,3)) - allocate(r2(n_points_extra_final_grid,3)) allocate(rn(3,nucl_num)) allocate(aos_data2(n_points_extra_final_grid,ao_num,4)) - do ipoint = 1, n_points_final_grid - r1(ipoint,1) = final_grid_points(1,ipoint) - r1(ipoint,2) = final_grid_points(2,ipoint) - r1(ipoint,3) = final_grid_points(3,ipoint) - enddo - - do ipoint = 1, n_points_extra_final_grid - r2(ipoint,1) = final_grid_points_extra(1,ipoint) - r2(ipoint,2) = final_grid_points_extra(2,ipoint) - r2(ipoint,3) = final_grid_points_extra(3,ipoint) - enddo - do k = 1, nucl_num rn(1,k) = nucl_coord(k,1) rn(2,k) = nucl_coord(k,2) @@ -121,110 +107,387 @@ subroutine deb_int2_grad1_u12_ao_gpu() allocate(int2_grad1_u12_ao_gpu(ao_num,ao_num,n_points_final_grid,4)) - call wall_time(cuda_time0) - call deb_int2_grad1_u12_ao(nxBlocks, nyBlocks, nzBlocks, blockxSize, blockySize, blockzSize, & n_points_final_grid, n_points_extra_final_grid, ao_num, nucl_num, jBH_size, & final_grid_points, final_grid_points_extra, final_weight_at_r_vector_extra, rn, & aos_data2, jBH_c, jBH_m, jBH_n, jBH_o, & int2_grad1_u12_ao_gpu) - call wall_time(cuda_time1) - write(*,"(A,2X,F15.7)") ' wall time for CUDA kernel (sec) = ', (cuda_time1 - cuda_time0) + ! --- + + allocate(int_fct_long_range(n_points_extra_final_grid,ao_num,ao_num)) + allocate(grad1_u12(n_points_extra_final_grid,n_points_final_grid,4)) + allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,4)) + + call wall_time(cpu_time0) + + call wall_time(cpu_ttime0) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (j, i, jpoint) & + !$OMP SHARED (int_fct_long_range, ao_num, n_points_extra_final_grid, final_weight_at_r_vector_extra, aos_in_r_array_extra_transp) + !$OMP DO SCHEDULE (static) + do j = 1, ao_num + do i = 1, ao_num + do jpoint = 1, n_points_extra_final_grid + int_fct_long_range(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + call wall_time(cpu_ttime1) + write(*,"(A,2X,F15.7)") ' wall time for int_long_range (sec) = ', (cpu_ttime1 - cpu_ttime0) + + + call wall_time(cpu_ttime0) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint) & + !$OMP SHARED (n_points_final_grid, n_points_extra_final_grid, grad1_u12) + !$OMP DO + do ipoint = 1, n_points_final_grid + call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, grad1_u12(1,ipoint,1) & + , grad1_u12(1,ipoint,2) & + , grad1_u12(1,ipoint,3) & + , grad1_u12(1,ipoint,4) ) + enddo + !$OMP END DO + !$OMP END PARALLEL + call wall_time(cpu_ttime1) + write(*,"(A,2X,F15.7)") ' wall time for tc_int_bh (sec) = ', (cpu_ttime1 - cpu_ttime0) + + + call wall_time(cpu_ttime0) + do m = 1, 4 + call dgemm("T", "N", ao_num*ao_num, n_points_final_grid, n_points_extra_final_grid, 1.d0 & + , int_fct_long_range(1,1,1), n_points_extra_final_grid, grad1_u12(1,1,m), n_points_extra_final_grid & + , 0.d0, int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num) + enddo + call wall_time(cpu_ttime1) + write(*,"(A,2X,F15.7)") ' wall time for DGEMM (sec) = ', (cpu_ttime1 - cpu_ttime0) + + call wall_time(cpu_time1) + write(*,"(A,2X,F15.7)") ' wall time on cpu (sec) = ', (cpu_time1 - cpu_time0) ! --- + acc_thr = 1d-12 + err_tot = 0.d0 + nrm_tot = 0.d0 -! allocate(int_fct_long_range(n_points_extra_final_grid,ao_num,ao_num)) -! allocate(grad1_u12(n_points_extra_final_grid,n_points_final_grid,4)) -! allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,4)) -! -! call wall_time(cpu_time0) -! -! call wall_time(cpu_ttime0) -! !$OMP PARALLEL & -! !$OMP DEFAULT (NONE) & -! !$OMP PRIVATE (j, i, jpoint) & -! !$OMP SHARED (int_fct_long_range, ao_num, n_points_extra_final_grid, final_weight_at_r_vector_extra, aos_in_r_array_extra_transp) -! !$OMP DO SCHEDULE (static) -! do j = 1, ao_num -! do i = 1, ao_num -! do jpoint = 1, n_points_extra_final_grid -! int_fct_long_range(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j) -! enddo -! enddo -! enddo -! !$OMP END DO -! !$OMP END PARALLEL -! call wall_time(cpu_ttime1) -! write(*,"(A,2X,F15.7)") ' wall time for int_long_range (sec) = ', (cpu_ttime1 - cpu_ttime0) -! -! -! call wall_time(cpu_ttime0) -! !$OMP PARALLEL & -! !$OMP DEFAULT (NONE) & -! !$OMP PRIVATE (ipoint) & -! !$OMP SHARED (n_points_final_grid, n_points_extra_final_grid, grad1_u12) -! !$OMP DO -! do ipoint = 1, n_points_final_grid -! call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, grad1_u12(1,ipoint,1) & -! , grad1_u12(1,ipoint,2) & -! , grad1_u12(1,ipoint,3) & -! , grad1_u12(1,ipoint,4) ) -! enddo -! !$OMP END DO -! !$OMP END PARALLEL -! call wall_time(cpu_ttime1) -! write(*,"(A,2X,F15.7)") ' wall time for tc_int_bh (sec) = ', (cpu_ttime1 - cpu_ttime0) -! -! -! call wall_time(cpu_ttime0) -! do m = 1, 4 -! call dgemm("T", "N", ao_num*ao_num, n_points_final_grid, n_points_extra_final_grid, 1.d0 & -! , int_fct_long_range(1,1,1), n_points_extra_final_grid, grad1_u12(1,1,m), n_points_extra_final_grid & -! , 0.d0, int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num) -! enddo -! call wall_time(cpu_ttime1) -! write(*,"(A,2X,F15.7)") ' wall time for DGEMM (sec) = ', (cpu_ttime1 - cpu_ttime0) -! -! call wall_time(cpu_time1) -! write(*,"(A,2X,F15.7)") ' wall time on cpu (sec) = ', (cpu_time1 - cpu_time0) -! -! ! --- -! -! acc_thr = 1d-12 -! err_tot = 0.d0 -! nrm_tot = 0.d0 -! -! do m = 1, 4 -! do ipoint = 1, n_points_final_grid -! do j = 1, ao_num -! do i = 1, ao_num -! err_loc = dabs(int2_grad1_u12_ao(i,j,ipoint,m) - int2_grad1_u12_ao_gpu(i,j,ipoint,m)) -! if(err_loc > acc_thr) then -! print*, " error on", i, j, ipoint, m -! print*, " CPU res", int2_grad1_u12_ao (i,j,ipoint,m) -! print*, " GPU res", int2_grad1_u12_ao_gpu(i,j,ipoint,m) -! stop -! endif -! err_tot = err_tot + err_loc -! nrm_tot = nrm_tot + dabs(int2_grad1_u12_ao(i,j,ipoint,m)) -! enddo -! enddo -! enddo -! enddo -! -! print *, ' absolute accuracy (%) =', 100.d0 * err_tot / nrm_tot -! -! ! --- -! -! deallocate(r1, r2, rn, aos_data2) -! deallocate(int_fct_long_range, grad1_u12) -! deallocate(int2_grad1_u12_ao) -! deallocate(int2_grad1_u12_ao_gpu) -! -! call wall_time(time1) -! write(*,"(A,2X,F15.7)") ' wall time for deb_int2_grad1_u12_ao_gpu (sec) = ', (time1 - time0) + do m = 1, 4 + do ipoint = 1, n_points_final_grid + do j = 1, ao_num + do i = 1, ao_num + err_loc = dabs(int2_grad1_u12_ao(i,j,ipoint,m) - int2_grad1_u12_ao_gpu(i,j,ipoint,m)) + if(err_loc > acc_thr) then + print*, " error on", i, j, ipoint, m + print*, " CPU res", int2_grad1_u12_ao (i,j,ipoint,m) + print*, " GPU res", int2_grad1_u12_ao_gpu(i,j,ipoint,m) + stop + endif + err_tot = err_tot + err_loc + nrm_tot = nrm_tot + dabs(int2_grad1_u12_ao(i,j,ipoint,m)) + enddo + enddo + enddo + enddo + + print *, ' absolute accuracy (%) =', 100.d0 * err_tot / nrm_tot + + ! --- + + deallocate(int_fct_long_range, grad1_u12) + deallocate(int2_grad1_u12_ao) + deallocate(int2_grad1_u12_ao_gpu) + deallocate(rn, aos_data2) + + call wall_time(time1) + write(*,"(A,2X,F15.7)") ' wall time for deb_int2_grad1_u12_ao_gpu (sec) = ', (time1 - time0) + + return +end + +! --- + +subroutine deb_int_2e_ao_gpu() + + use cutc_module + + implicit none + + integer :: m + integer :: i, j, k, l + integer :: ipoint, jpoint + + double precision :: weight1, ao_i_r, ao_k_r + + double precision :: acc_thr, err_tot, nrm_tot, err_loc + + double precision :: time0, time1 + double precision :: cpu_time0, cpu_time1 + double precision :: cpu_ttime0, cpu_ttime1 + double precision :: tt1, tt2 + + double precision, allocatable :: rn(:,:), aos_data1(:,:,:), aos_data2(:,:,:) + double precision, allocatable :: grad1_u12(:,:,:), int_fct_long_range(:,:,:), c_mat(:,:,:) + double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:) + double precision, allocatable :: int2_grad1_u12_ao_gpu(:,:,:,:) + double precision, allocatable :: int_2e_ao(:,:,:,:) + double precision, allocatable :: int_2e_ao_gpu(:,:,:,:) + + + + call wall_time(time0) + print*, ' start deb_int_2e_ao_gpu' + + + ! --- + + allocate(rn(3,nucl_num)) + allocate(aos_data1(n_points_final_grid,ao_num,4)) + allocate(aos_data2(n_points_extra_final_grid,ao_num,4)) + + do k = 1, nucl_num + rn(1,k) = nucl_coord(k,1) + rn(2,k) = nucl_coord(k,2) + rn(3,k) = nucl_coord(k,3) + enddo + + do k = 1, ao_num + do ipoint = 1, n_points_final_grid + aos_data1(ipoint,k,1) = aos_in_r_array(k,ipoint) + aos_data1(ipoint,k,2) = aos_grad_in_r_array(k,ipoint,1) + aos_data1(ipoint,k,3) = aos_grad_in_r_array(k,ipoint,2) + aos_data1(ipoint,k,4) = aos_grad_in_r_array(k,ipoint,3) + enddo + enddo + + do k = 1, ao_num + do ipoint = 1, n_points_extra_final_grid + aos_data2(ipoint,k,1) = aos_in_r_array_extra(k,ipoint) + aos_data2(ipoint,k,2) = aos_grad_in_r_array_extra(k,ipoint,1) + aos_data2(ipoint,k,3) = aos_grad_in_r_array_extra(k,ipoint,2) + aos_data2(ipoint,k,4) = aos_grad_in_r_array_extra(k,ipoint,3) + enddo + enddo + + ! --- + + integer :: nB + integer :: sB + + PROVIDE nxBlocks nyBlocks nzBlocks + PROVIDE blockxSize blockySize blockzSize + + sB = 32 + nB = (n_points_final_grid + sB - 1) / sB + + call ezfio_set_tc_int_blockxSize(sB) + call ezfio_set_tc_int_nxBlocks(nB) + + allocate(int2_grad1_u12_ao_gpu(ao_num,ao_num,n_points_final_grid,4)) + allocate(int_2e_ao_gpu(ao_num,ao_num,ao_num,ao_num)) + + call deb_int_2e_ao(nxBlocks, nyBlocks, nzBlocks, blockxSize, blockySize, blockzSize, & + n_points_final_grid, n_points_extra_final_grid, ao_num, nucl_num, jBH_size, & + final_grid_points, final_weight_at_r_vector, & + final_grid_points_extra, final_weight_at_r_vector_extra, & + rn, aos_data1, aos_data2, jBH_c, jBH_m, jBH_n, jBH_o, & + int2_grad1_u12_ao_gpu, int_2e_ao_gpu) + + ! --- + + allocate(int_fct_long_range(n_points_extra_final_grid,ao_num,ao_num)) + allocate(grad1_u12(n_points_extra_final_grid,n_points_final_grid,4)) + allocate(c_mat(n_points_final_grid,ao_num,ao_num)) + allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,4)) + allocate(int_2e_ao(ao_num,ao_num,ao_num,ao_num)) + + call wall_time(cpu_time0) + + call wall_time(cpu_ttime0) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (j, i, jpoint) & + !$OMP SHARED (int_fct_long_range, ao_num, n_points_extra_final_grid, final_weight_at_r_vector_extra, aos_in_r_array_extra_transp) + !$OMP DO SCHEDULE (static) + do j = 1, ao_num + do i = 1, ao_num + do jpoint = 1, n_points_extra_final_grid + int_fct_long_range(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + call wall_time(cpu_ttime1) + write(*,"(A,2X,F15.7)") ' wall time for int_long_range (sec) = ', (cpu_ttime1 - cpu_ttime0) + + + call wall_time(cpu_ttime0) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint) & + !$OMP SHARED (n_points_final_grid, n_points_extra_final_grid, grad1_u12) + !$OMP DO + do ipoint = 1, n_points_final_grid + call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, grad1_u12(1,ipoint,1) & + , grad1_u12(1,ipoint,2) & + , grad1_u12(1,ipoint,3) & + , grad1_u12(1,ipoint,4) ) + enddo + !$OMP END DO + !$OMP END PARALLEL + call wall_time(cpu_ttime1) + write(*,"(A,2X,F15.7)") ' wall time for tc_int_bh (sec) = ', (cpu_ttime1 - cpu_ttime0) + + + call wall_time(cpu_ttime0) + do m = 1, 4 + call dgemm("T", "N", ao_num*ao_num, n_points_final_grid, n_points_extra_final_grid, 1.d0 & + , int_fct_long_range(1,1,1), n_points_extra_final_grid, grad1_u12(1,1,m), n_points_extra_final_grid & + , 0.d0, int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num) + enddo + call wall_time(cpu_ttime1) + write(*,"(A,2X,F15.7)") ' wall time for DGEMM of integ over r2 (sec) = ', (cpu_ttime1 - cpu_ttime0) + + + call wall_time(cpu_ttime0) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, k, ipoint) & + !$OMP SHARED (aos_in_r_array_transp, c_mat, ao_num, n_points_final_grid, final_weight_at_r_vector) + !$OMP DO SCHEDULE (static) + do i = 1, ao_num + do k = 1, ao_num + do ipoint = 1, n_points_final_grid + c_mat(ipoint,k,i) = final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,k) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + call wall_time(cpu_ttime1) + write(*,"(A,2X,F15.7)") ' wall time of Hermitian part (sec) = ', (cpu_ttime1 - cpu_ttime0) + + + call wall_time(cpu_ttime0) + call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 & + , int2_grad1_u12_ao(1,1,1,4), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid & + , 0.d0, int_2e_ao(1,1,1,1), ao_num*ao_num) + call wall_time(cpu_ttime1) + write(*,"(A,2X,F15.7)") ' wall time for DGEMM of Hermitian part (sec) = ', (cpu_ttime1 - cpu_ttime0) + + + tt1 = 0.d0 + tt2 = 0.d0 + do m = 1, 3 + + call wall_time(cpu_ttime0) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r) & + !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, c_mat, & + !$OMP ao_num, n_points_final_grid, final_weight_at_r_vector, m) + !$OMP DO SCHEDULE (static) + do i = 1, ao_num + do k = 1, ao_num + do ipoint = 1, n_points_final_grid + + weight1 = final_weight_at_r_vector(ipoint) + ao_i_r = aos_in_r_array_transp(ipoint,i) + ao_k_r = aos_in_r_array_transp(ipoint,k) + + c_mat(ipoint,k,i) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,m) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,m)) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + call wall_time(cpu_ttime1) + tt1 += cpu_ttime1 - cpu_ttime0 + + call wall_time(cpu_ttime0) + call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, -0.5d0 & + , int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid & + , 1.d0, int_2e_ao(1,1,1,1), ao_num*ao_num) + call wall_time(cpu_ttime1) + tt2 += cpu_ttime1 - cpu_ttime0 + enddo + write(*,"(A,2X,F15.7)") ' wall time of non-Hermitian part (sec) = ', tt1 + write(*,"(A,2X,F15.7)") ' wall time for DGEMM of non Hermitian part (sec) = ', tt2 + + + call wall_time(cpu_ttime0) + call sum_A_At(int_2e_ao(1,1,1,1), ao_num*ao_num) + call wall_time(cpu_ttime1) + write(*,"(A,2X,F15.7)") ' wall time of A + A.T (sec) = ', cpu_ttime1 - cpu_ttime0 + + + call wall_time(cpu_time1) + write(*,"(A,2X,F15.7)") ' wall time on cpu (sec) = ', (cpu_time1 - cpu_time0) + + ! --- + + acc_thr = 1d-12 + + print *, ' precision on int2_grad1_u12_ao ' + err_tot = 0.d0 + nrm_tot = 0.d0 + do m = 1, 4 + do ipoint = 1, n_points_final_grid + do j = 1, ao_num + do i = 1, ao_num + err_loc = dabs(int2_grad1_u12_ao(i,j,ipoint,m) - int2_grad1_u12_ao_gpu(i,j,ipoint,m)) + if(err_loc > acc_thr) then + print*, " error on", i, j, ipoint, m + print*, " CPU res", int2_grad1_u12_ao (i,j,ipoint,m) + print*, " GPU res", int2_grad1_u12_ao_gpu(i,j,ipoint,m) + stop + endif + err_tot = err_tot + err_loc + nrm_tot = nrm_tot + dabs(int2_grad1_u12_ao(i,j,ipoint,m)) + enddo + enddo + enddo + enddo + print *, ' absolute accuracy on int2_grad1_u12_ao (%) =', 100.d0 * err_tot / nrm_tot + + + print *, ' precision on int_2e_ao ' + err_tot = 0.d0 + nrm_tot = 0.d0 + do i = 1, ao_num + do j = 1, ao_num + do k = 1, ao_num + do l = 1, ao_num + err_loc = dabs(int_2e_ao(l,k,j,i) - int_2e_ao_gpu(l,k,j,i)) + if(err_loc > acc_thr) then + print*, " error on", l, k, j, i + print*, " CPU res", int_2e_ao (l,k,j,i) + print*, " GPU res", int_2e_ao_gpu(l,k,j,i) + stop + endif + err_tot = err_tot + err_loc + nrm_tot = nrm_tot + dabs(int_2e_ao(l,k,j,i)) + enddo + enddo + enddo + enddo + print *, ' absolute accuracy on int_2e_ao (%) =', 100.d0 * err_tot / nrm_tot + + + ! --- + + deallocate(int_fct_long_range, grad1_u12, c_mat) + deallocate(int_2e_ao, int2_grad1_u12_ao) + deallocate(int_2e_ao_gpu, int2_grad1_u12_ao_gpu) + deallocate(rn, aos_data1, aos_data2) + + call wall_time(time1) + write(*,"(A,2X,F15.7)") ' wall time for deb_int_2e_ao_gpu (sec) = ', (time1 - time0) return end diff --git a/plugins/local/tc_int/install b/plugins/local/tc_int/install new file mode 100755 index 00000000..9d1886f0 --- /dev/null +++ b/plugins/local/tc_int/install @@ -0,0 +1,17 @@ +#!/bin/bash + +# Check if the QP_ROOT environment variable is set. +if [[ -z ${QP_ROOT} ]] +then + print "The QP_ROOT environment variable is not set." + print "Please reload the quantum_package.rc file." + exit -1 +fi + +git clone https://github.com/AbdAmmar/CuTC +cd CuTC +source config/env.rc +make + +ln -s ${PWD}/CuTC/build/libtc_int_cu.so ${QP_ROOT}/lib + diff --git a/plugins/local/tc_int/uninstall b/plugins/local/tc_int/uninstall new file mode 100755 index 00000000..3dd3612c --- /dev/null +++ b/plugins/local/tc_int/uninstall @@ -0,0 +1,13 @@ +#!/bin/bash + +# Check if the QP_ROOT environment variable is set. +if [[ -z ${QP_ROOT} ]] +then + print "The QP_ROOT environment variable is not set." + print "Please reload the quantum_package.rc file." + exit -1 +fi + +rm -rf ${PWD}/CuTC +rm ${QP_ROOT}/lib/libtc_int_cu.so + diff --git a/plugins/local/tc_int/write_tc_int_cuda.irp.f b/plugins/local/tc_int/write_tc_int_cuda.irp.f index de3be412..bc1a118d 100644 --- a/plugins/local/tc_int/write_tc_int_cuda.irp.f +++ b/plugins/local/tc_int/write_tc_int_cuda.irp.f @@ -56,21 +56,13 @@ end subroutine do_work_on_gpu() - use gpu_module + use cutc_module implicit none integer :: k, ipoint - integer :: nBlocks, blockSize - integer :: n_grid1, n_grid2 - integer :: n_ao - integer :: n_nuc - integer :: size_bh - double precision, allocatable :: r1(:,:), wr1(:), r2(:,:), wr2(:), rn(:,:) double precision, allocatable :: aos_data1(:,:,:), aos_data2(:,:,:) - double precision, allocatable :: c_bh(:,:) - integer, allocatable :: m_bh(:,:), n_bh(:,:), o_bh(:,:) double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:) double precision, allocatable :: int_2e_ao(:,:,:,:) @@ -80,47 +72,11 @@ subroutine do_work_on_gpu() call wall_time(time0) print*, ' start calculation of TC-integrals' - nBlocks = 100 - blockSize = 32 + allocate(aos_data1(n_points_final_grid,ao_num,4)) + allocate(aos_data2(n_points_extra_final_grid,ao_num,4)) + allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,4)) + allocate(int_2e_ao(ao_num,ao_num,ao_num,ao_num)) - n_grid1 = n_points_final_grid - n_grid2 = n_points_extra_final_grid - - n_ao = ao_num - n_nuc = nucl_num - - size_bh = jBH_size - - print*, " nBlocks =", nBlocks - print*, " blockSize =", blockSize - print*, " n_grid1 =", n_grid1 - print*, " n_grid2 =", n_grid2 - print*, " n_ao =", n_ao - print*, " n_nuc =", n_nuc - print *, " size_bh =", size_bh - - allocate(r1(n_grid1,3), wr1(n_grid1)) - allocate(r2(n_grid2,3), wr2(n_grid2)) - allocate(rn(n_nuc,3)) - allocate(aos_data1(n_grid1,n_ao,4)) - allocate(aos_data2(n_grid2,n_ao,4)) - allocate(c_bh(size_bh,n_nuc), m_bh(size_bh,n_nuc), n_bh(size_bh,n_nuc), o_bh(size_bh,n_nuc)) - allocate(int2_grad1_u12_ao(n_ao,n_ao,n_grid1,4)) - allocate(int_2e_ao(n_ao,n_ao,n_ao,n_ao)) - - do ipoint = 1, n_points_final_grid - r1(ipoint,1) = final_grid_points(1,ipoint) - r1(ipoint,2) = final_grid_points(2,ipoint) - r1(ipoint,3) = final_grid_points(3,ipoint) - wr1(ipoint) = final_weight_at_r_vector(ipoint) - enddo - - do ipoint = 1, n_points_extra_final_grid - r2(ipoint,1) = final_grid_points_extra(1,ipoint) - r2(ipoint,2) = final_grid_points_extra(2,ipoint) - r2(ipoint,3) = final_grid_points_extra(3,ipoint) - wr2(ipoint) = final_weight_at_r_vector_extra(ipoint) - enddo do k = 1, ao_num do ipoint = 1, n_points_final_grid @@ -138,31 +94,37 @@ subroutine do_work_on_gpu() enddo enddo - rn(:,:) = nucl_coord(:,:) + ! --- + + integer :: nB + integer :: sB + + PROVIDE nxBlocks nyBlocks nzBlocks + PROVIDE blockxSize blockySize blockzSize + + sB = 32 + nB = (n_points_final_grid + sB - 1) / sB + + call ezfio_set_tc_int_blockxSize(sB) + call ezfio_set_tc_int_nxBlocks(nB) + - c_bh(:,:) = jBH_c(:,:) - m_bh(:,:) = jBH_m(:,:) - n_bh(:,:) = jBH_n(:,:) - o_bh(:,:) = jBH_o(:,:) call wall_time(cuda_time0) print*, ' start CUDA kernel' - int2_grad1_u12_ao = 0.d0 - int_2e_ao = 0.d0 - - call tc_int_c(nBlocks, blockSize, & - n_grid1, n_grid2, n_ao, n_nuc, size_bh, & - r1, wr1, r2, wr2, rn, aos_data1, aos_data2, & - c_bh, m_bh, n_bh, o_bh, & + call tc_int_c(nxBlocks, nyBlocks, nzBlocks, blockxSize, blockySize, blockzSize, & + n_points_final_grid, n_points_extra_final_grid, ao_num, nucl_num, jBH_size, & + final_grid_points, final_weight_at_r_vector, & + final_grid_points_extra, final_weight_at_r_vector_extra, & + nucl_coord, aos_data1, aos_data2, & + jBH_c, jBH_m, jBH_n, jBH_o, & int2_grad1_u12_ao, int_2e_ao) call wall_time(cuda_time1) print*, ' wall time for CUDA kernel (min) = ', (cuda_time1-cuda_time0) / 60.d0 - deallocate(r1, wr1, r2, wr2, rn) deallocate(aos_data1, aos_data2) - deallocate(c_bh, m_bh, n_bh, o_bh) ! ---