From f8bff471222ac9cd2e6f23342f7d7a7aff5d62cd Mon Sep 17 00:00:00 2001 From: AbdAmmar Date: Thu, 28 Mar 2024 15:27:11 +0100 Subject: [PATCH] added loops --- .../local/non_h_ints_mu/total_tc_int.irp.f | 165 +++++++++++++----- 1 file changed, 121 insertions(+), 44 deletions(-) diff --git a/plugins/local/non_h_ints_mu/total_tc_int.irp.f b/plugins/local/non_h_ints_mu/total_tc_int.irp.f index c7230dc3..72fd0f53 100644 --- a/plugins/local/non_h_ints_mu/total_tc_int.irp.f +++ b/plugins/local/non_h_ints_mu/total_tc_int.irp.f @@ -65,27 +65,59 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n PROVIDE int2_grad1_u12_square_ao - allocate(c_mat(n_points_final_grid,ao_num,ao_num)) + if(tc_save_mem) then - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (i, k, ipoint) & - !$OMP SHARED (aos_in_r_array_transp, c_mat, ao_num, n_points_final_grid, final_weight_at_r_vector) - !$OMP DO SCHEDULE (static) - do i = 1, ao_num - do k = 1, ao_num - do ipoint = 1, n_points_final_grid - c_mat(ipoint,k,i) = final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,k) + print*, ' LOOPS are used to evaluate Hermitian part of ao_two_e_tc_tot ...' + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, j, k, l, ipoint, ao_i_r, ao_k_r, weight1) & + !$OMP SHARED (ao_num, n_points_final_grid, ao_two_e_tc_tot, & + !$OMP aos_in_r_array_transp, final_weight_at_r_vector, int2_grad1_u12_square_ao) + !$OMP DO COLLAPSE(4) + do i = 1, ao_num + do k = 1, ao_num + do l = 1, ao_num + do j = 1, ao_num + ao_two_e_tc_tot(j,l,k,i) = 0.d0 + do ipoint = 1, n_points_final_grid + weight1 = final_weight_at_r_vector(ipoint) + ao_i_r = aos_in_r_array_transp(ipoint,i) + ao_k_r = aos_in_r_array_transp(ipoint,k) + ao_two_e_tc_tot(j,l,k,i) = ao_two_e_tc_tot(j,l,k,i) + int2_grad1_u12_square_ao(j,l,ipoint) * weight1 * ao_i_r * ao_k_r + enddo + enddo + enddo enddo enddo - enddo - !$OMP END DO - !$OMP END PARALLEL + !$OMP END DO + !$OMP END PARALLEL - call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 & - , int2_grad1_u12_square_ao(1,1,1), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid & - , 0.d0, ao_two_e_tc_tot, ao_num*ao_num) + else + print*, ' DGEMM are used to evaluate Hermitian part of ao_two_e_tc_tot ...' + + allocate(c_mat(n_points_final_grid,ao_num,ao_num)) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, k, ipoint) & + !$OMP SHARED (aos_in_r_array_transp, c_mat, ao_num, n_points_final_grid, final_weight_at_r_vector) + !$OMP DO SCHEDULE (static) + do i = 1, ao_num + do k = 1, ao_num + do ipoint = 1, n_points_final_grid + c_mat(ipoint,k,i) = final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,k) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 & + , int2_grad1_u12_square_ao(1,1,1), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid & + , 0.d0, ao_two_e_tc_tot, ao_num*ao_num) + deallocate(c_mat) + endif + FREE int2_grad1_u12_square_ao if( (tc_integ_type .eq. "semi-analytic") .and. & @@ -96,6 +128,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n ! an additional term is added here directly instead of ! being added in int2_grad1_u12_square_ao for performance + allocate(c_mat(n_points_final_grid,ao_num,ao_num)) PROVIDE int2_u2_env2 !$OMP PARALLEL & @@ -127,10 +160,13 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n , int2_u2_env2(1,1,1), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid & , 1.d0, ao_two_e_tc_tot(1,1,1,1), ao_num*ao_num) + deallocate(c_mat) FREE int2_u2_env2 endif ! use_ipp - deallocate(c_mat) + call wall_time(time1) + print*, ' done with Hermitian part after (min) ', (time1 - time0) / 60.d0 + call print_memory_usage() ! --- @@ -138,38 +174,73 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n PROVIDE int2_grad1_u12_ao - allocate(b_mat(n_points_final_grid,ao_num,ao_num,3)) + if(tc_save_mem) then - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r) & - !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, b_mat, & - !$OMP ao_num, n_points_final_grid, final_weight_at_r_vector) - !$OMP DO SCHEDULE (static) - do i = 1, ao_num - do k = 1, ao_num - do ipoint = 1, n_points_final_grid + print*, ' LOOPS are used to evaluate non-Hermitian part of ao_two_e_tc_tot ...' - weight1 = 0.5d0 * final_weight_at_r_vector(ipoint) - ao_i_r = aos_in_r_array_transp(ipoint,i) - ao_k_r = aos_in_r_array_transp(ipoint,k) - - b_mat(ipoint,k,i,1) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,1) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,1)) - b_mat(ipoint,k,i,2) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,2) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,2)) - b_mat(ipoint,k,i,3) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,3) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,3)) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, j, k, l, ipoint, ao_i_r, ao_k_r, weight1) & + !$OMP SHARED (ao_num, n_points_final_grid, ao_two_e_tc_tot, & + !$OMP aos_in_r_array_transp, final_weight_at_r_vector, & + !$OMP int2_grad1_u12_ao, aos_grad_in_r_array_transp_bis) + !$OMP DO COLLAPSE(4) + do i = 1, ao_num + do k = 1, ao_num + do l = 1, ao_num + do j = 1, ao_num + do ipoint = 1, n_points_final_grid + weight1 = 0.5d0 * final_weight_at_r_vector(ipoint) + ao_i_r = aos_in_r_array_transp(ipoint,i) + ao_k_r = aos_in_r_array_transp(ipoint,k) + ao_two_e_tc_tot(j,l,k,i) = ao_two_e_tc_tot(j,l,k,i) & + + weight1 * int2_grad1_u12_ao(j,l,ipoint,1) * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,1) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,1)) & + + weight1 * int2_grad1_u12_ao(j,l,ipoint,2) * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,2) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,2)) & + + weight1 * int2_grad1_u12_ao(j,l,ipoint,3) * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,3) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,3)) + enddo + enddo + enddo enddo enddo - enddo - !$OMP END DO - !$OMP END PARALLEL + !$OMP END DO + !$OMP END PARALLEL - do m = 1, 3 - call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, -1.d0 & - , int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num, b_mat(1,1,1,m), n_points_final_grid & - , 1.d0, ao_two_e_tc_tot(1,1,1,1), ao_num*ao_num) - enddo - deallocate(b_mat) + else + print*, ' DGEMM are used to evaluate non-Hermitian part of ao_two_e_tc_tot ...' + + allocate(b_mat(n_points_final_grid,ao_num,ao_num,3)) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r) & + !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, b_mat, & + !$OMP ao_num, n_points_final_grid, final_weight_at_r_vector) + !$OMP DO SCHEDULE (static) + do i = 1, ao_num + do k = 1, ao_num + do ipoint = 1, n_points_final_grid + + weight1 = 0.5d0 * final_weight_at_r_vector(ipoint) + ao_i_r = aos_in_r_array_transp(ipoint,i) + ao_k_r = aos_in_r_array_transp(ipoint,k) + + b_mat(ipoint,k,i,1) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,1) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,1)) + b_mat(ipoint,k,i,2) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,2) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,2)) + b_mat(ipoint,k,i,3) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,3) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,3)) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + do m = 1, 3 + call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, -1.d0 & + , int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num, b_mat(1,1,1,m), n_points_final_grid & + , 1.d0, ao_two_e_tc_tot(1,1,1,1), ao_num*ao_num) + enddo + deallocate(b_mat) + + end if FREE int2_grad1_u12_ao if(tc_integ_type .eq. "semi-analytic") then @@ -178,16 +249,22 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n endif ! var_tc + call wall_time(time1) + print*, ' done with non-Hermitian part after (min) ', (time1 - time0) / 60.d0 + call print_memory_usage() + ! --- call sum_A_At(ao_two_e_tc_tot(1,1,1,1), ao_num*ao_num) + ! --- + PROVIDE ao_integrals_map !$OMP PARALLEL DEFAULT(NONE) & !$OMP SHARED(ao_num, ao_two_e_tc_tot, ao_integrals_map) & !$OMP PRIVATE(i, j, k, l) - !$OMP DO + !$OMP DO COLLAPSE(4) do j = 1, ao_num do l = 1, ao_num do i = 1, ao_num