diff --git a/plugins/local/tc_int/EZFIO.cfg b/plugins/local/tc_int/EZFIO.cfg index 12366f01..5615ce4b 100644 --- a/plugins/local/tc_int/EZFIO.cfg +++ b/plugins/local/tc_int/EZFIO.cfg @@ -8,7 +8,7 @@ default: 10 type: integer doc: nb of y blocks in the Grid interface: ezfio,provider,ocaml -default: 10 +default: 1 [nzBlocks] type: integer @@ -26,7 +26,7 @@ default: 32 type: integer doc: size of y blocks interface: ezfio,provider,ocaml -default: 32 +default: 1 [blockzSize] type: integer diff --git a/plugins/local/tc_int/cutc_module.F90 b/plugins/local/tc_int/cutc_module.F90 index 69c2a131..b96c1bef 100644 --- a/plugins/local/tc_int/cutc_module.F90 +++ b/plugins/local/tc_int/cutc_module.F90 @@ -41,34 +41,6 @@ module cutc_module ! --- - subroutine deb_int2_grad1_u12_ao(nxBlocks, nyBlocks, nzBlocks, blockxSize, blockySize, blockzSize, & - n_grid1, n_grid2, n_ao, n_nuc, size_bh, & - r1, r2, wr2, rn, aos_data2, c_bh, m_bh, n_bh, o_bh, & - int2_grad1_u12_ao) bind(C, name ="deb_int2_grad1_u12_ao") - - import c_int, c_double, c_ptr - integer(c_int), intent(in), value :: nxBlocks, blockxSize - integer(c_int), intent(in), value :: nyBlocks, blockySize - integer(c_int), intent(in), value :: nzBlocks, blockzSize - integer(c_int), intent(in), value :: n_grid1, n_grid2 - integer(c_int), intent(in), value :: n_ao - integer(c_int), intent(in), value :: n_nuc - integer(c_int), intent(in), value :: size_bh - real(c_double), intent(in) :: r1(3,n_grid1) - real(c_double), intent(in) :: r2(3,n_grid2) - real(c_double), intent(in) :: wr2(n_grid2) - real(c_double), intent(in) :: rn(3,n_nuc) - real(c_double), intent(in) :: aos_data2(n_grid2,n_ao,4) - real(c_double), intent(in) :: c_bh(size_bh,n_nuc) - integer(c_int), intent(in) :: m_bh(size_bh,n_nuc) - integer(c_int), intent(in) :: n_bh(size_bh,n_nuc) - integer(c_int), intent(in) :: o_bh(size_bh,n_nuc) - real(c_double), intent(out) :: int2_grad1_u12_ao(n_ao,n_ao,n_grid1,4) - - end subroutine deb_int2_grad1_u12_ao - - ! --- - subroutine deb_int_2e_ao(nxBlocks, nyBlocks, nzBlocks, & blockxSize, blockySize, blockzSize, & n_grid1, n_grid2, n_ao, n_nuc, size_bh, & diff --git a/plugins/local/tc_int/deb_tc_int_cuda.irp.f b/plugins/local/tc_int/deb_tc_int_cuda.irp.f index 2c4e975b..75e3b4fe 100644 --- a/plugins/local/tc_int/deb_tc_int_cuda.irp.f +++ b/plugins/local/tc_int/deb_tc_int_cuda.irp.f @@ -36,7 +36,6 @@ subroutine main() implicit none - !call deb_int2_grad1_u12_ao_gpu() call deb_int_2e_ao_gpu() return @@ -44,173 +43,6 @@ end ! --- -subroutine deb_int2_grad1_u12_ao_gpu() - - use cutc_module - - implicit none - - integer :: m - integer :: i, j, k - integer :: ipoint, jpoint - - double precision :: acc_thr, err_tot, nrm_tot, err_loc - - double precision :: time0, time1 - double precision :: cpu_time0, cpu_time1 - double precision :: cpu_ttime0, cpu_ttime1 - - double precision, allocatable :: rn(:,:), aos_data2(:,:,:) - double precision, allocatable :: grad1_u12(:,:,:), int_fct_long_range(:,:,:) - double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:) - double precision, allocatable :: int2_grad1_u12_ao_gpu(:,:,:,:) - - - - call wall_time(time0) - print*, ' start deb_int2_grad1_u12_ao_gpu' - - - ! --- - - allocate(rn(3,nucl_num)) - allocate(aos_data2(n_points_extra_final_grid,ao_num,4)) - - do k = 1, nucl_num - rn(1,k) = nucl_coord(k,1) - rn(2,k) = nucl_coord(k,2) - rn(3,k) = nucl_coord(k,3) - enddo - - do k = 1, ao_num - do ipoint = 1, n_points_extra_final_grid - aos_data2(ipoint,k,1) = aos_in_r_array_extra(k,ipoint) - aos_data2(ipoint,k,2) = aos_grad_in_r_array_extra(k,ipoint,1) - aos_data2(ipoint,k,3) = aos_grad_in_r_array_extra(k,ipoint,2) - aos_data2(ipoint,k,4) = aos_grad_in_r_array_extra(k,ipoint,3) - enddo - enddo - - ! --- - - integer :: nB - integer :: sB - - PROVIDE nxBlocks nyBlocks nzBlocks - PROVIDE blockxSize blockySize blockzSize - - sB = 32 - nB = (n_points_final_grid + sB - 1) / sB - - call ezfio_set_tc_int_blockxSize(sB) - call ezfio_set_tc_int_nxBlocks(nB) - - allocate(int2_grad1_u12_ao_gpu(ao_num,ao_num,n_points_final_grid,4)) - - call deb_int2_grad1_u12_ao(nxBlocks, nyBlocks, nzBlocks, blockxSize, blockySize, blockzSize, & - n_points_final_grid, n_points_extra_final_grid, ao_num, nucl_num, jBH_size, & - final_grid_points, final_grid_points_extra, final_weight_at_r_vector_extra, rn, & - aos_data2, jBH_c, jBH_m, jBH_n, jBH_o, & - int2_grad1_u12_ao_gpu) - - ! --- - - allocate(int_fct_long_range(n_points_extra_final_grid,ao_num,ao_num)) - allocate(grad1_u12(n_points_extra_final_grid,n_points_final_grid,4)) - allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,4)) - - call wall_time(cpu_time0) - - call wall_time(cpu_ttime0) - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (j, i, jpoint) & - !$OMP SHARED (int_fct_long_range, ao_num, n_points_extra_final_grid, final_weight_at_r_vector_extra, aos_in_r_array_extra_transp) - !$OMP DO SCHEDULE (static) - do j = 1, ao_num - do i = 1, ao_num - do jpoint = 1, n_points_extra_final_grid - int_fct_long_range(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j) - enddo - enddo - enddo - !$OMP END DO - !$OMP END PARALLEL - call wall_time(cpu_ttime1) - write(*,"(A,2X,F15.7)") ' wall time for int_long_range (sec) = ', (cpu_ttime1 - cpu_ttime0) - - - call wall_time(cpu_ttime0) - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (ipoint) & - !$OMP SHARED (n_points_final_grid, n_points_extra_final_grid, grad1_u12) - !$OMP DO - do ipoint = 1, n_points_final_grid - call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, grad1_u12(1,ipoint,1) & - , grad1_u12(1,ipoint,2) & - , grad1_u12(1,ipoint,3) & - , grad1_u12(1,ipoint,4) ) - enddo - !$OMP END DO - !$OMP END PARALLEL - call wall_time(cpu_ttime1) - write(*,"(A,2X,F15.7)") ' wall time for tc_int_bh (sec) = ', (cpu_ttime1 - cpu_ttime0) - - - call wall_time(cpu_ttime0) - do m = 1, 4 - call dgemm("T", "N", ao_num*ao_num, n_points_final_grid, n_points_extra_final_grid, 1.d0 & - , int_fct_long_range(1,1,1), n_points_extra_final_grid, grad1_u12(1,1,m), n_points_extra_final_grid & - , 0.d0, int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num) - enddo - call wall_time(cpu_ttime1) - write(*,"(A,2X,F15.7)") ' wall time for DGEMM (sec) = ', (cpu_ttime1 - cpu_ttime0) - - call wall_time(cpu_time1) - write(*,"(A,2X,F15.7)") ' wall time on cpu (sec) = ', (cpu_time1 - cpu_time0) - - ! --- - - acc_thr = 1d-12 - err_tot = 0.d0 - nrm_tot = 0.d0 - - do m = 1, 4 - do ipoint = 1, n_points_final_grid - do j = 1, ao_num - do i = 1, ao_num - err_loc = dabs(int2_grad1_u12_ao(i,j,ipoint,m) - int2_grad1_u12_ao_gpu(i,j,ipoint,m)) - if(err_loc > acc_thr) then - print*, " error on", i, j, ipoint, m - print*, " CPU res", int2_grad1_u12_ao (i,j,ipoint,m) - print*, " GPU res", int2_grad1_u12_ao_gpu(i,j,ipoint,m) - stop - endif - err_tot = err_tot + err_loc - nrm_tot = nrm_tot + dabs(int2_grad1_u12_ao(i,j,ipoint,m)) - enddo - enddo - enddo - enddo - - print *, ' absolute accuracy (%) =', 100.d0 * err_tot / nrm_tot - - ! --- - - deallocate(int_fct_long_range, grad1_u12) - deallocate(int2_grad1_u12_ao) - deallocate(int2_grad1_u12_ao_gpu) - deallocate(rn, aos_data2) - - call wall_time(time1) - write(*,"(A,2X,F15.7)") ' wall time for deb_int2_grad1_u12_ao_gpu (sec) = ', (time1 - time0) - - return -end - -! --- - subroutine deb_int_2e_ao_gpu() use cutc_module @@ -479,6 +311,22 @@ subroutine deb_int_2e_ao_gpu() print *, ' absolute accuracy on int_2e_ao (%) =', 100.d0 * err_tot / nrm_tot + ! --- + + print*, ' Writing int2_grad1_u12_ao in ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao' + open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="write") + call ezfio_set_work_empty(.False.) + write(11) int2_grad1_u12_ao_gpu(:,:,:,1:3) + close(11) + + print*, ' Saving tc_int_2e_ao in ', trim(ezfio_filename) // '/work/ao_two_e_tc_tot' + open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/ao_two_e_tc_tot', action="write") + call ezfio_set_work_empty(.False.) + do k = 1, ao_num + write(11) int_2e_ao_gpu(:,:,:,k) + enddo + close(11) + ! --- deallocate(int_fct_long_range, grad1_u12, c_mat) diff --git a/plugins/local/tc_int/write_tc_int_cuda.irp.f b/plugins/local/tc_int/write_tc_int_cuda.irp.f index bc1a118d..b74cd0cd 100644 --- a/plugins/local/tc_int/write_tc_int_cuda.irp.f +++ b/plugins/local/tc_int/write_tc_int_cuda.irp.f @@ -62,7 +62,7 @@ subroutine do_work_on_gpu() integer :: k, ipoint - double precision, allocatable :: aos_data1(:,:,:), aos_data2(:,:,:) + double precision, allocatable :: rn(:,:), aos_data1(:,:,:), aos_data2(:,:,:) double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:) double precision, allocatable :: int_2e_ao(:,:,:,:) @@ -72,12 +72,19 @@ subroutine do_work_on_gpu() call wall_time(time0) print*, ' start calculation of TC-integrals' + allocate(rn(3,nucl_num)) allocate(aos_data1(n_points_final_grid,ao_num,4)) allocate(aos_data2(n_points_extra_final_grid,ao_num,4)) allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,4)) allocate(int_2e_ao(ao_num,ao_num,ao_num,ao_num)) + do k = 1, nucl_num + rn(1,k) = nucl_coord(k,1) + rn(2,k) = nucl_coord(k,2) + rn(3,k) = nucl_coord(k,3) + enddo + do k = 1, ao_num do ipoint = 1, n_points_final_grid aos_data1(ipoint,k,1) = aos_in_r_array(k,ipoint) @@ -117,8 +124,7 @@ subroutine do_work_on_gpu() n_points_final_grid, n_points_extra_final_grid, ao_num, nucl_num, jBH_size, & final_grid_points, final_weight_at_r_vector, & final_grid_points_extra, final_weight_at_r_vector_extra, & - nucl_coord, aos_data1, aos_data2, & - jBH_c, jBH_m, jBH_n, jBH_o, & + rn, aos_data1, aos_data2, jBH_c, jBH_m, jBH_n, jBH_o, & int2_grad1_u12_ao, int_2e_ao) call wall_time(cuda_time1) @@ -128,6 +134,35 @@ subroutine do_work_on_gpu() ! --- + integer :: i, j, l + double precision :: t1, t2 + double precision, external :: get_ao_two_e_integral + + call wall_time(t1) + + PROVIDE ao_integrals_map + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP SHARED(ao_num, int_2e_ao, ao_integrals_map) & + !$OMP PRIVATE(i, j, k, l) + !$OMP DO COLLAPSE(3) + do j = 1, ao_num + do l = 1, ao_num + do i = 1, ao_num + do k = 1, ao_num + ! < 1:i, 2:j | 1:k, 2:l > + int_2e_ao(k,i,l,j) = int_2e_ao(k,i,l,j) + get_ao_two_e_integral(i, j, k, l, ao_integrals_map) + enddo + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + call wall_time(t2) + print*, ' wall time of Coulomb part of tc_int_2e_ao (min) ', (t2 - t1) / 60.d0 + + ! --- + print*, ' Writing int2_grad1_u12_ao in ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao' open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="write") call ezfio_set_work_empty(.False.)