From 8739c26509e07efa0586dc543f61f3920bd2e28a Mon Sep 17 00:00:00 2001 From: AbdAmmar Date: Wed, 6 Sep 2023 21:03:22 +0200 Subject: [PATCH] OPTIMZATIONS IN 4-ind integ --- src/bi_ort_ints/bi_ort_ints.irp.f | 35 +++-- src/bi_ort_ints/three_body_ijmk.irp.f | 183 +++++++++++++------------- 2 files changed, 116 insertions(+), 102 deletions(-) diff --git a/src/bi_ort_ints/bi_ort_ints.irp.f b/src/bi_ort_ints/bi_ort_ints.irp.f index cac46b18..0349c731 100644 --- a/src/bi_ort_ints/bi_ort_ints.irp.f +++ b/src/bi_ort_ints/bi_ort_ints.irp.f @@ -18,10 +18,11 @@ program bi_ort_ints ! call test_5idx ! call test_5idx2 call test_4idx() - call test_4idx_n4() + !call test_4idx_n4() !call test_4idx2() !call test_5idx2 !call test_5idx + end subroutine test_5idx2 @@ -340,7 +341,7 @@ subroutine test_4idx() implicit none integer :: i, j, k, l - double precision :: accu, contrib, new, ref, thr + double precision :: accu, contrib, new, ref, thr, norm thr = 1d-10 @@ -348,6 +349,7 @@ subroutine test_4idx() PROVIDE three_e_4_idx_direct_bi_ort accu = 0.d0 + norm = 0.d0 do i = 1, mo_num do j = 1, mo_num do k = 1, mo_num @@ -356,7 +358,6 @@ subroutine test_4idx() new = three_e_4_idx_direct_bi_ort (l,k,j,i) ref = three_e_4_idx_direct_bi_ort_old(l,k,j,i) contrib = dabs(new - ref) - accu += contrib if(contrib .gt. thr) then print*, ' problem in three_e_4_idx_direct_bi_ort' print*, l, k, j, i @@ -364,11 +365,14 @@ subroutine test_4idx() stop endif + accu += contrib + norm += dabs(ref) enddo enddo enddo enddo - print*, ' accu on three_e_4_idx_direct_bi_ort = ', accu / dble(mo_num)**4 + + print*, ' accu on three_e_4_idx_direct_bi_ort (%) = ', 100.d0 * accu / norm ! --- @@ -376,6 +380,7 @@ subroutine test_4idx() PROVIDE three_e_4_idx_exch13_bi_ort accu = 0.d0 + norm = 0.d0 do i = 1, mo_num do j = 1, mo_num do k = 1, mo_num @@ -384,7 +389,6 @@ subroutine test_4idx() new = three_e_4_idx_exch13_bi_ort (l,k,j,i) ref = three_e_4_idx_exch13_bi_ort_old(l,k,j,i) contrib = dabs(new - ref) - accu += contrib if(contrib .gt. thr) then print*, ' problem in three_e_4_idx_exch13_bi_ort' print*, l, k, j, i @@ -392,11 +396,14 @@ subroutine test_4idx() stop endif + accu += contrib + norm += dabs(ref) enddo enddo enddo enddo - print*, ' accu on three_e_4_idx_exch13_bi_ort = ', accu / dble(mo_num)**4 + + print*, ' accu on three_e_4_idx_exch13_bi_ort (%) = ', 100.d0 * accu / norm ! --- @@ -404,6 +411,7 @@ subroutine test_4idx() PROVIDE three_e_4_idx_cycle_1_bi_ort accu = 0.d0 + norm = 0.d0 do i = 1, mo_num do j = 1, mo_num do k = 1, mo_num @@ -412,7 +420,6 @@ subroutine test_4idx() new = three_e_4_idx_cycle_1_bi_ort (l,k,j,i) ref = three_e_4_idx_cycle_1_bi_ort_old(l,k,j,i) contrib = dabs(new - ref) - accu += contrib if(contrib .gt. thr) then print*, ' problem in three_e_4_idx_cycle_1_bi_ort' print*, l, k, j, i @@ -420,11 +427,14 @@ subroutine test_4idx() stop endif + accu += contrib + norm += dabs(ref) enddo enddo enddo enddo - print*, ' accu on three_e_4_idx_cycle_1_bi_ort = ', accu / dble(mo_num)**4 + + print*, ' accu on three_e_4_idx_cycle_1_bi_ort (%) = ', 100.d0 * accu / norm ! --- @@ -432,6 +442,7 @@ subroutine test_4idx() PROVIDE three_e_4_idx_exch23_bi_ort accu = 0.d0 + norm = 0.d0 do i = 1, mo_num do j = 1, mo_num do k = 1, mo_num @@ -440,7 +451,6 @@ subroutine test_4idx() new = three_e_4_idx_exch23_bi_ort (l,k,j,i) ref = three_e_4_idx_exch23_bi_ort_old(l,k,j,i) contrib = dabs(new - ref) - accu += contrib if(contrib .gt. thr) then print*, ' problem in three_e_4_idx_exch23_bi_ort' print*, l, k, j, i @@ -448,13 +458,18 @@ subroutine test_4idx() stop endif + accu += contrib + norm += dabs(ref) enddo enddo enddo enddo - print*, ' accu on three_e_4_idx_exch23_bi_ort = ', accu / dble(mo_num)**4 + + print*, ' accu on three_e_4_idx_exch23_bi_ort (%) = ', 100.d0 * accu / norm ! --- return end + + diff --git a/src/bi_ort_ints/three_body_ijmk.irp.f b/src/bi_ort_ints/three_body_ijmk.irp.f index 669861b7..742d5a80 100644 --- a/src/bi_ort_ints/three_body_ijmk.irp.f +++ b/src/bi_ort_ints/three_body_ijmk.irp.f @@ -64,120 +64,117 @@ !$OMP END DO !$OMP END PARALLEL + + + + + ! loops approach to break the O(N^4) scaling in memory + + call set_multiple_levels_omp(.false.) + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (n, ipoint, tmp_loc_1, tmp_loc_2, tmp_2d, tmp1, tmp2) & + !$OMP SHARED (mo_num, n_points_final_grid, i, k, & + !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & + !$OMP int2_grad1_u12_bimo_t, final_weight_at_r_vector, & + !$OMP tmp_aux_1, tmp_aux_2, & + !$OMP three_e_4_idx_direct_bi_ort, three_e_4_idx_exch13_bi_ort, & + !$OMP three_e_4_idx_exch23_bi_ort, three_e_4_idx_cycle_1_bi_ort) + allocate(tmp_2d(mo_num,mo_num)) allocate(tmp1(n_points_final_grid,4,mo_num)) allocate(tmp2(n_points_final_grid,4,mo_num)) - ! loops approach to break the O(N^4) scaling in memory + !$OMP DO do k = 1, mo_num + + ! --- + do i = 1, mo_num - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (n, ipoint, tmp_loc_1, tmp_loc_2) & - !$OMP SHARED (mo_num, n_points_final_grid, i, k, & - !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & - !$OMP int2_grad1_u12_bimo_t, final_weight_at_r_vector, & - !$OMP tmp_aux_2, tmp1) - !$OMP DO - do n = 1, mo_num - do ipoint = 1, n_points_final_grid + ! --- - tmp_loc_1 = mos_l_in_r_array_transp(ipoint,k) * mos_r_in_r_array_transp(ipoint,i) - tmp_loc_2 = tmp_aux_2(ipoint,n) + do n = 1, mo_num + do ipoint = 1, n_points_final_grid - tmp1(ipoint,1,n) = int2_grad1_u12_bimo_t(ipoint,1,n,n) * tmp_loc_1 + int2_grad1_u12_bimo_t(ipoint,1,k,i) * tmp_loc_2 - tmp1(ipoint,2,n) = int2_grad1_u12_bimo_t(ipoint,2,n,n) * tmp_loc_1 + int2_grad1_u12_bimo_t(ipoint,2,k,i) * tmp_loc_2 - tmp1(ipoint,3,n) = int2_grad1_u12_bimo_t(ipoint,3,n,n) * tmp_loc_1 + int2_grad1_u12_bimo_t(ipoint,3,k,i) * tmp_loc_2 - tmp1(ipoint,4,n) = int2_grad1_u12_bimo_t(ipoint,1,n,n) * int2_grad1_u12_bimo_t(ipoint,1,k,i) & - + int2_grad1_u12_bimo_t(ipoint,2,n,n) * int2_grad1_u12_bimo_t(ipoint,2,k,i) & - + int2_grad1_u12_bimo_t(ipoint,3,n,n) * int2_grad1_u12_bimo_t(ipoint,3,k,i) + tmp_loc_1 = mos_l_in_r_array_transp(ipoint,k) * mos_r_in_r_array_transp(ipoint,i) + tmp_loc_2 = tmp_aux_2(ipoint,n) + tmp1(ipoint,1,n) = int2_grad1_u12_bimo_t(ipoint,1,n,n) * tmp_loc_1 + int2_grad1_u12_bimo_t(ipoint,1,k,i) * tmp_loc_2 + tmp1(ipoint,2,n) = int2_grad1_u12_bimo_t(ipoint,2,n,n) * tmp_loc_1 + int2_grad1_u12_bimo_t(ipoint,2,k,i) * tmp_loc_2 + tmp1(ipoint,3,n) = int2_grad1_u12_bimo_t(ipoint,3,n,n) * tmp_loc_1 + int2_grad1_u12_bimo_t(ipoint,3,k,i) * tmp_loc_2 + tmp1(ipoint,4,n) = int2_grad1_u12_bimo_t(ipoint,1,n,n) * int2_grad1_u12_bimo_t(ipoint,1,k,i) & + + int2_grad1_u12_bimo_t(ipoint,2,n,n) * int2_grad1_u12_bimo_t(ipoint,2,k,i) & + + int2_grad1_u12_bimo_t(ipoint,3,n,n) * int2_grad1_u12_bimo_t(ipoint,3,k,i) + + enddo enddo - enddo - !$OMP END DO - !$OMP END PARALLEL - call dgemm( 'T', 'N', mo_num, mo_num, 4*n_points_final_grid, 1.d0 & - , tmp_aux_1(1,1,1), 4*n_points_final_grid, tmp1(1,1,1), 4*n_points_final_grid & - , 0.d0, tmp_2d(1,1), mo_num) + call dgemm( 'T', 'N', mo_num, mo_num, 4*n_points_final_grid, 1.d0 & + , tmp_aux_1(1,1,1), 4*n_points_final_grid, tmp1(1,1,1), 4*n_points_final_grid & + , 0.d0, tmp_2d(1,1), mo_num) - !$OMP PARALLEL DO PRIVATE(j,m) - do j = 1, mo_num - do m = 1, mo_num - three_e_4_idx_direct_bi_ort(m,j,k,i) = -tmp_2d(m,j) + do j = 1, mo_num + do m = 1, mo_num + three_e_4_idx_direct_bi_ort(m,j,k,i) = -tmp_2d(m,j) + enddo enddo - enddo - !$OMP END PARALLEL DO + ! --- - - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (n, ipoint, tmp_loc_1, tmp_loc_2) & - !$OMP SHARED (mo_num, n_points_final_grid, i, k, & - !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & - !$OMP int2_grad1_u12_bimo_t, final_weight_at_r_vector, & - !$OMP tmp1, tmp2) - !$OMP DO - do n = 1, mo_num - do ipoint = 1, n_points_final_grid - - tmp_loc_1 = mos_l_in_r_array_transp(ipoint,k) * mos_r_in_r_array_transp(ipoint,n) - tmp_loc_2 = mos_l_in_r_array_transp(ipoint,n) * mos_r_in_r_array_transp(ipoint,i) - - tmp1(ipoint,1,n) = int2_grad1_u12_bimo_t(ipoint,1,n,i) * tmp_loc_1 + int2_grad1_u12_bimo_t(ipoint,1,k,n) * tmp_loc_2 - tmp1(ipoint,2,n) = int2_grad1_u12_bimo_t(ipoint,2,n,i) * tmp_loc_1 + int2_grad1_u12_bimo_t(ipoint,2,k,n) * tmp_loc_2 - tmp1(ipoint,3,n) = int2_grad1_u12_bimo_t(ipoint,3,n,i) * tmp_loc_1 + int2_grad1_u12_bimo_t(ipoint,3,k,n) * tmp_loc_2 - tmp1(ipoint,4,n) = int2_grad1_u12_bimo_t(ipoint,1,n,i) * int2_grad1_u12_bimo_t(ipoint,1,k,n) & - + int2_grad1_u12_bimo_t(ipoint,2,n,i) * int2_grad1_u12_bimo_t(ipoint,2,k,n) & - + int2_grad1_u12_bimo_t(ipoint,3,n,i) * int2_grad1_u12_bimo_t(ipoint,3,k,n) - - tmp2(ipoint,1,n) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,n) - tmp2(ipoint,2,n) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,n) - tmp2(ipoint,3,n) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,n) - tmp2(ipoint,4,n) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,n) + do n = 1, mo_num + do ipoint = 1, n_points_final_grid + + tmp_loc_1 = mos_l_in_r_array_transp(ipoint,k) * mos_r_in_r_array_transp(ipoint,n) + tmp_loc_2 = mos_l_in_r_array_transp(ipoint,n) * mos_r_in_r_array_transp(ipoint,i) + + tmp1(ipoint,1,n) = int2_grad1_u12_bimo_t(ipoint,1,n,i) * tmp_loc_1 + int2_grad1_u12_bimo_t(ipoint,1,k,n) * tmp_loc_2 + tmp1(ipoint,2,n) = int2_grad1_u12_bimo_t(ipoint,2,n,i) * tmp_loc_1 + int2_grad1_u12_bimo_t(ipoint,2,k,n) * tmp_loc_2 + tmp1(ipoint,3,n) = int2_grad1_u12_bimo_t(ipoint,3,n,i) * tmp_loc_1 + int2_grad1_u12_bimo_t(ipoint,3,k,n) * tmp_loc_2 + tmp1(ipoint,4,n) = int2_grad1_u12_bimo_t(ipoint,1,n,i) * int2_grad1_u12_bimo_t(ipoint,1,k,n) & + + int2_grad1_u12_bimo_t(ipoint,2,n,i) * int2_grad1_u12_bimo_t(ipoint,2,k,n) & + + int2_grad1_u12_bimo_t(ipoint,3,n,i) * int2_grad1_u12_bimo_t(ipoint,3,k,n) + + tmp2(ipoint,1,n) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,n) + tmp2(ipoint,2,n) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,n) + tmp2(ipoint,3,n) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,n) + tmp2(ipoint,4,n) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,n) + enddo enddo - enddo - !$OMP END DO - !$OMP END PARALLEL - call dgemm( 'T', 'N', mo_num, mo_num, 4*n_points_final_grid, 1.d0 & - , tmp1(1,1,1), 4*n_points_final_grid, tmp_aux_1(1,1,1), 4*n_points_final_grid & - , 0.d0, tmp_2d(1,1), mo_num) + ! --- - !$OMP PARALLEL DO PRIVATE(j,m) - do j = 1, mo_num - do m = 1, mo_num - three_e_4_idx_exch13_bi_ort(m,j,k,i) = -tmp_2d(m,j) + call dgemm( 'T', 'N', mo_num, mo_num, 4*n_points_final_grid, 1.d0 & + , tmp1(1,1,1), 4*n_points_final_grid, tmp_aux_1(1,1,1), 4*n_points_final_grid & + , 0.d0, tmp_2d(1,1), mo_num) + + do j = 1, mo_num + do m = 1, mo_num + three_e_4_idx_exch13_bi_ort(m,j,k,i) = -tmp_2d(m,j) + enddo enddo - enddo - !$OMP END PARALLEL DO - call dgemm( 'T', 'N', mo_num, mo_num, 4*n_points_final_grid, 1.d0 & - , tmp1(1,1,1), 4*n_points_final_grid, tmp2(1,1,1), 4*n_points_final_grid & - , 0.d0, tmp_2d(1,1), mo_num) + ! --- - !$OMP PARALLEL DO PRIVATE(j,m) - do j = 1, mo_num - do m = 1, mo_num - three_e_4_idx_cycle_1_bi_ort(m,i,k,j) = -tmp_2d(m,j) + call dgemm( 'T', 'N', mo_num, mo_num, 4*n_points_final_grid, 1.d0 & + , tmp1(1,1,1), 4*n_points_final_grid, tmp2(1,1,1), 4*n_points_final_grid & + , 0.d0, tmp_2d(1,1), mo_num) + + do j = 1, mo_num + do m = 1, mo_num + three_e_4_idx_cycle_1_bi_ort(m,i,k,j) = -tmp_2d(m,j) + enddo enddo - enddo - !$OMP END PARALLEL DO - enddo ! i + ! --- + + enddo ! i + + ! --- do j = 1, mo_num - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (n, ipoint, tmp_loc_1, tmp_loc_2) & - !$OMP SHARED (mo_num, n_points_final_grid, j, k, & - !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & - !$OMP int2_grad1_u12_bimo_t, final_weight_at_r_vector, & - !$OMP tmp1, tmp2) - !$OMP DO do n = 1, mo_num do ipoint = 1, n_points_final_grid @@ -197,31 +194,33 @@ tmp2(ipoint,4,n) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,k) * mos_r_in_r_array_transp(ipoint,n) enddo enddo - !$OMP END DO - !$OMP END PARALLEL call dgemm( 'T', 'N', mo_num, mo_num, 4*n_points_final_grid, 1.d0 & , tmp1(1,1,1), 4*n_points_final_grid, tmp2(1,1,1), 4*n_points_final_grid & , 0.d0, tmp_2d(1,1), mo_num) - !$OMP PARALLEL DO PRIVATE(i,m) do i = 1, mo_num do m = 1, mo_num three_e_4_idx_exch23_bi_ort(m,j,k,i) = -tmp_2d(m,i) enddo enddo - !$OMP END PARALLEL DO enddo ! j + + ! --- + enddo !k + !$OMP END DO deallocate(tmp_2d) deallocate(tmp1) deallocate(tmp2) + + !$OMP END PARALLEL + deallocate(tmp_aux_1) deallocate(tmp_aux_2) - call wall_time(wall1) print *, ' wall time for three_e_4_idx_bi_ort', wall1 - wall0 call print_memory_usage()