From 710198f32f478faf91db81689c5891542b7ba7ef Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Sat, 12 Jan 2019 22:12:59 +0100 Subject: [PATCH] Optimmization --- src/davidson/u0_h_u0.irp.f | 96 +++++++++++++++++++++++++++++--------- src/utils/map_module.f90 | 17 ++++--- 2 files changed, 82 insertions(+), 31 deletions(-) diff --git a/src/davidson/u0_h_u0.irp.f b/src/davidson/u0_h_u0.irp.f index 2e822e18..ae808d12 100644 --- a/src/davidson/u0_h_u0.irp.f +++ b/src/davidson/u0_h_u0.irp.f @@ -138,22 +138,18 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_t,s_t,u_t,N_st,sze,istart,iend, integer :: maxab, n_singles_a, n_singles_b, kcol_prev integer*8 :: k8 logical :: compute_singles - integer*8 :: last_found, left, right - - double precision :: rss, mem - call resident_memory(rss) - mem = dble(singles_beta_csc_size) / 1024.d0**3 - - compute_singles = (mem+rss > qp_max_mem) -! compute_singles = (iend-istart < 100000).and.(mem+rss < qp_max_mem) - -! compute_singles = .True. - - if (.not.compute_singles) then -! provide singles_alpha_csc - provide singles_beta_csc - endif + integer*8 :: last_found, left, right, right_max + double precision :: rss, mem, ratio +! call resident_memory(rss) +! mem = dble(singles_beta_csc_size) / 1024.d0**3 +! +! compute_singles = (mem+rss > qp_max_mem) +! +! if (.not.compute_singles) then +! provide singles_beta_csc +! endif +compute_singles=.True. maxab = max(N_det_alpha_unique, N_det_beta_unique)+1 allocate(idx0(maxab)) @@ -185,8 +181,8 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_t,s_t,u_t,N_st,sze,istart,iend, !$OMP lcol, lrow, l_a, l_b, & !$OMP buffer, doubles, n_doubles, & !$OMP tmp_det2, hij, sij, idx, l, kcol_prev, & - !$OMP singles_a, n_singles_a, singles_b, & - !$OMP n_singles_b, k8, last_found,left,right) + !$OMP singles_a, n_singles_a, singles_b, ratio, & + !$OMP n_singles_b, k8, last_found,left,right,right_max) ! Alpha/Beta double excitations ! ============================= @@ -223,6 +219,7 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_t,s_t,u_t,N_st,sze,istart,iend, singles_b, n_singles_b) else n_singles_b = 0 + !DIR$ LOOP COUNT avg(1000) do k8=singles_beta_csc_idx(kcol),singles_beta_csc_idx(kcol+1)-1 n_singles_b = n_singles_b+1 singles_b(n_singles_b) = singles_beta_csc(k8) @@ -234,15 +231,20 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_t,s_t,u_t,N_st,sze,istart,iend, ! Loop over singly excited beta columns ! ------------------------------------- + !DIR$ LOOP COUNT avg(1000) do i=1,n_singles_b lcol = singles_b(i) tmp_det2(1:$N_int,2) = psi_det_beta_unique(1:$N_int, lcol) - l_a = psi_bilinear_matrix_columns_loc(lcol) - ASSERT (l_a <= N_det) - +!--- ! if (compute_singles) then + + l_a = psi_bilinear_matrix_columns_loc(lcol) + ASSERT (l_a <= N_det) + + !DIR$ UNROLL(8) + !DIR$ LOOP COUNT avg(50000) do j=1,psi_bilinear_matrix_columns_loc(lcol+1) - psi_bilinear_matrix_columns_loc(lcol) lrow = psi_bilinear_matrix_rows(l_a) ASSERT (lrow <= N_det_alpha_unique) @@ -258,16 +260,51 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_t,s_t,u_t,N_st,sze,istart,iend, call get_all_spin_singles_$N_int( & buffer, idx, tmp_det(1,1), j, & singles_a, n_singles_a ) + +!----- ! else ! +! ! Search for singles +! +!call cpu_time(time0) +! ! Right boundary +! l_a = psi_bilinear_matrix_columns_loc(lcol+1)-1 +! ASSERT (l_a <= N_det) +! do j=1,psi_bilinear_matrix_columns_loc(lcol+1) - psi_bilinear_matrix_columns_loc(lcol) +! lrow = psi_bilinear_matrix_rows(l_a) +! ASSERT (lrow <= N_det_alpha_unique) +! +! left = singles_alpha_csc_idx(krow) +! right_max = -1_8 +! right = singles_alpha_csc_idx(krow+1) +! do while (right-left>0_8) +! k8 = shiftr(right+left,1) +! if (singles_alpha_csc(k8) > lrow) then +! right = k8 +! else if (singles_alpha_csc(k8) < lrow) then +! left = k8 + 1_8 +! else +! right_max = k8+1_8 +! exit +! endif +! enddo +! if (right_max > 0_8) exit +! l_a = l_a-1 +! enddo +! if (right_max < 0_8) right_max = singles_alpha_csc_idx(krow) +! +! ! Search ! n_singles_a = 0 +! l_a = psi_bilinear_matrix_columns_loc(lcol) +! ASSERT (l_a <= N_det) +! ! last_found = singles_alpha_csc_idx(krow) ! do j=1,psi_bilinear_matrix_columns_loc(lcol+1) - psi_bilinear_matrix_columns_loc(lcol) ! lrow = psi_bilinear_matrix_rows(l_a) ! ASSERT (lrow <= N_det_alpha_unique) ! ! left = last_found -! right = singles_alpha_csc_idx(krow+1) +! right = right_max ! do while (right-left>0_8) ! k8 = shiftr(right+left,1) ! if (singles_alpha_csc(k8) > lrow) then @@ -286,10 +323,12 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_t,s_t,u_t,N_st,sze,istart,iend, ! j = j-1 ! ! endif +!----- ! Loop over alpha singles ! ----------------------- + !DIR$ LOOP COUNT avg(1000) do k = 1,n_singles_a l_a = singles_a(k) ASSERT (l_a <= N_det) @@ -300,9 +339,10 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_t,s_t,u_t,N_st,sze,istart,iend, tmp_det2(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, lrow) call i_H_j_double_alpha_beta(tmp_det,tmp_det2,$N_int,hij) call get_s2(tmp_det,tmp_det2,$N_int,sij) + !DIR$ LOOP COUNT AVG(4) do l=1,N_st v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,l_a) - s_t(l,k_a) = s_t(l,k_a) + sij * u_t(l,l_a) + s_t(l,k_a) = s_t(l,k_a) + sij * u_t(l,l_a) enddo enddo @@ -342,6 +382,8 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_t,s_t,u_t,N_st,sze,istart,iend, ! Loop inside the beta column to gather all the connected alphas lcol = psi_bilinear_matrix_columns(k_a) l_a = psi_bilinear_matrix_columns_loc(lcol) + + !DIR$ LOOP COUNT avg(200000) do i=1,N_det_alpha_unique if (l_a > N_det) exit lcol = psi_bilinear_matrix_columns(l_a) @@ -363,6 +405,7 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_t,s_t,u_t,N_st,sze,istart,iend, ! ---------------------------------- tmp_det2(1:$N_int,2) = psi_det_beta_unique (1:$N_int, kcol) + !DIR$ LOOP COUNT avg(1000) do i=1,n_singles_a l_a = singles_a(i) ASSERT (l_a <= N_det) @@ -373,6 +416,7 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_t,s_t,u_t,N_st,sze,istart,iend, tmp_det2(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, lrow) call i_H_j_mono_spin( tmp_det, tmp_det2, $N_int, 1, hij) + !DIR$ LOOP COUNT AVG(4) do l=1,N_st v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,l_a) ! single => sij = 0 @@ -383,6 +427,7 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_t,s_t,u_t,N_st,sze,istart,iend, ! Compute Hij for all alpha doubles ! ---------------------------------- + !DIR$ LOOP COUNT avg(50000) do i=1,n_doubles l_a = doubles(i) ASSERT (l_a <= N_det) @@ -391,6 +436,7 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_t,s_t,u_t,N_st,sze,istart,iend, ASSERT (lrow <= N_det_alpha_unique) call i_H_j_double_spin( tmp_det(1,1), psi_det_alpha_unique(1, lrow), $N_int, hij) + !DIR$ LOOP COUNT AVG(4) do l=1,N_st v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,l_a) ! same spin => sij = 0 @@ -422,6 +468,7 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_t,s_t,u_t,N_st,sze,istart,iend, ! Loop inside the alpha row to gather all the connected betas lrow = psi_bilinear_matrix_transp_rows(k_b) l_b = psi_bilinear_matrix_transp_rows_loc(lrow) + !DIR$ LOOP COUNT avg(200000) do i=1,N_det_beta_unique if (l_b > N_det) exit lrow = psi_bilinear_matrix_transp_rows(l_b) @@ -443,6 +490,7 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_t,s_t,u_t,N_st,sze,istart,iend, ! ---------------------------------- tmp_det2(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, krow) + !DIR$ LOOP COUNT avg(1000) do i=1,n_singles_b l_b = singles_b(i) ASSERT (l_b <= N_det) @@ -454,6 +502,7 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_t,s_t,u_t,N_st,sze,istart,iend, call i_H_j_mono_spin( tmp_det, tmp_det2, $N_int, 2, hij) l_a = psi_bilinear_matrix_transp_order(l_b) ASSERT (l_a <= N_det) + !DIR$ LOOP COUNT AVG(4) do l=1,N_st v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,l_a) ! single => sij = 0 @@ -463,6 +512,7 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_t,s_t,u_t,N_st,sze,istart,iend, ! Compute Hij for all beta doubles ! ---------------------------------- + !DIR$ LOOP COUNT avg(50000) do i=1,n_doubles l_b = doubles(i) ASSERT (l_b <= N_det) @@ -474,6 +524,7 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_t,s_t,u_t,N_st,sze,istart,iend, l_a = psi_bilinear_matrix_transp_order(l_b) ASSERT (l_a <= N_det) + !DIR$ LOOP COUNT AVG(4) do l=1,N_st v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,l_a) ! same spin => sij = 0 @@ -501,6 +552,7 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_t,s_t,u_t,N_st,sze,istart,iend, hij = diag_H_mat_elem(tmp_det,$N_int) sij = diag_S_mat_elem(tmp_det,$N_int) + !DIR$ LOOP COUNT AVG(4) do l=1,N_st v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,k_a) s_t(l,k_a) = s_t(l,k_a) + sij * u_t(l,k_a) diff --git a/src/utils/map_module.f90 b/src/utils/map_module.f90 index 9600cddf..05693ac1 100644 --- a/src/utils/map_module.f90 +++ b/src/utils/map_module.f90 @@ -773,7 +773,7 @@ subroutine search_key_value_big_interval(key,value,X,Y,sze,idx,ibegin_in,iend_in istep = shiftr(iend-ibegin,1) idx = ibegin + istep - do while (istep > 64) + do while (istep > 4) idx = ibegin + istep if (cache_key < X(idx)) then iend = idx @@ -813,20 +813,17 @@ subroutine search_key_value_big_interval(key,value,X,Y,sze,idx,ibegin_in,iend_in endif enddo idx = ibegin - value = Y(idx) - if (min(iend_in,sze) > ibegin+64) then - iend = ibegin+64 + if (min(iend_in,sze) > ibegin+4) then + iend = ibegin+4 + !DIR$ LOOP COUNT MAX(4) do while (cache_key > X(idx)) idx = idx+1 - value = Y(idx) end do else + !DIR$ LOOP COUNT MAX(4) do while (cache_key > X(idx)) idx = idx+1 - value = Y(idx) - if (idx /= iend) then - cycle - else + if (idx == iend) then exit endif end do @@ -834,6 +831,8 @@ subroutine search_key_value_big_interval(key,value,X,Y,sze,idx,ibegin_in,iend_in if (cache_key /= X(idx)) then idx = 1-idx value = 0.d0 + else + value = Y(idx) endif return