From bd9d32206857952ad043d08eb8215c0d8cb02118 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Sat, 17 Sep 2016 01:54:44 +0200 Subject: [PATCH] Accelerated Davidson in MRCC --- plugins/MRCC_Utils/davidson.irp.f | 311 ++++++++++++++++------------ plugins/MRCC_Utils/mrcc_utils.irp.f | 4 +- src/Determinants/davidson.irp.f | 12 +- 3 files changed, 189 insertions(+), 138 deletions(-) diff --git a/plugins/MRCC_Utils/davidson.irp.f b/plugins/MRCC_Utils/davidson.irp.f index be8fbad5..746fef7a 100644 --- a/plugins/MRCC_Utils/davidson.irp.f +++ b/plugins/MRCC_Utils/davidson.irp.f @@ -96,7 +96,7 @@ subroutine davidson_diag_hjj_mrcc(dets_in,u_in,H_jj,energies,dim_in,sze,N_st,Nin integer, allocatable :: kl_pairs(:,:) integer :: k_pairs, kl - integer :: iter2 + integer :: iter2, sze_8 double precision, allocatable :: W(:,:,:), U(:,:,:), R(:,:) double precision, allocatable :: y(:,:,:,:), h(:,:,:,:), lambda(:) double precision :: diag_h_mat_elem @@ -134,11 +134,14 @@ subroutine davidson_diag_hjj_mrcc(dets_in,u_in,H_jj,energies,dim_in,sze,N_st,Nin enddo write(iunit,'(A)') trim(write_buffer) + integer, external :: align_double + sze_8 = align_double(sze) + allocate( & kl_pairs(2,N_st*(N_st+1)/2), & - W(sze,N_st,davidson_sze_max), & - U(sze,N_st,davidson_sze_max), & - R(sze,N_st), & + W(sze_8,N_st,davidson_sze_max), & + U(sze_8,N_st,davidson_sze_max), & + R(sze_8,N_st), & h(N_st,davidson_sze_max,N_st,davidson_sze_max), & y(N_st,davidson_sze_max,N_st,davidson_sze_max), & lambda(N_st*davidson_sze_max)) @@ -152,40 +155,52 @@ subroutine davidson_diag_hjj_mrcc(dets_in,u_in,H_jj,energies,dim_in,sze,N_st,Nin ! ============== - - k_pairs=0 - do l=1,N_st - do k=1,l - k_pairs+=1 - kl_pairs(1,k_pairs) = k - kl_pairs(2,k_pairs) = l + if (N_st > 1) then + + k_pairs=0 + do l=1,N_st + do k=1,l + k_pairs+=1 + kl_pairs(1,k_pairs) = k + kl_pairs(2,k_pairs) = l + enddo enddo - enddo - - !$OMP PARALLEL DEFAULT(NONE) & - !$OMP SHARED(U,sze,N_st,overlap,kl_pairs,k_pairs, & - !$OMP Nint,dets_in,u_in) & - !$OMP PRIVATE(k,l,kl,i) - - - ! Orthonormalize initial guess - ! ============================ - - !$OMP DO - do kl=1,k_pairs - k = kl_pairs(1,kl) - l = kl_pairs(2,kl) - if (k/=l) then - overlap(k,l) = u_dot_v(U_in(1,k),U_in(1,l),sze) - overlap(l,k) = overlap(k,l) - else - overlap(k,k) = u_dot_u(U_in(1,k),sze) - endif - enddo - !$OMP END DO - !$OMP END PARALLEL + + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP SHARED(U,sze,N_st,overlap,kl_pairs,k_pairs, & + !$OMP Nint,dets_in,u_in) & + !$OMP PRIVATE(k,l,kl,i) + + + ! Orthonormalize initial guess + ! ============================ + + !$OMP DO + do kl=1,k_pairs + k = kl_pairs(1,kl) + l = kl_pairs(2,kl) + if (k/=l) then + overlap(k,l) = u_dot_v(U_in(1,k),U_in(1,l),sze) + overlap(l,k) = overlap(k,l) + else + overlap(k,k) = u_dot_u(U_in(1,k),sze) + endif + enddo + !$OMP END DO + !$OMP END PARALLEL - call ortho_lowdin(overlap,size(overlap,1),N_st,U_in,size(U_in,1),sze) + call ortho_lowdin(overlap,size(overlap,1),N_st,U_in,size(U_in,1),sze) + + else + + overlap(1,1) = u_dot_u(U_in(1,1),sze) + double precision :: f + f = 1.d0 / dsqrt(overlap(1,1)) + do i=1,sze + U_in(i,1) = U_in(i,1) * f + enddo + + endif ! Davidson iterations ! =================== @@ -211,9 +226,7 @@ subroutine davidson_diag_hjj_mrcc(dets_in,u_in,H_jj,energies,dim_in,sze,N_st,Nin ! Compute W_k = H |u_k> ! ---------------------- - do k=1,N_st - call H_u_0_mrcc(W(1,k,iter),U(1,k,iter),H_jj,sze,dets_in,Nint,istate) - enddo + call H_u_0_mrcc_nstates(W(1,1,iter),U(1,1,iter),H_jj,sze,dets_in,Nint,istate,N_st,sze_8) ! Compute h_kl = = ! ------------------------------------------- @@ -364,8 +377,7 @@ subroutine davidson_diag_hjj_mrcc(dets_in,u_in,H_jj,energies,dim_in,sze,N_st,Nin end - -subroutine u0_H_u_0_mrcc(e_0,u_0,n,keys_tmp,Nint,istate) +subroutine u_0_H_u_0_mrcc(e_0,u_0,n,keys_tmp,Nint,istate) use bitmasks implicit none BEGIN_DOC @@ -374,14 +386,30 @@ subroutine u0_H_u_0_mrcc(e_0,u_0,n,keys_tmp,Nint,istate) ! n : number of determinants ! END_DOC - integer, intent(in) :: n,Nint + integer, intent(in) :: n,Nint,istate double precision, intent(out) :: e_0 double precision, intent(in) :: u_0(n) integer(bit_kind),intent(in) :: keys_tmp(Nint,2,n) + call u_0_H_u_0_mrcc_nstates(e_0,u_0,n,keys_tmp,Nint,1,n,istate) +end + +subroutine u_0_H_u_0_mrcc_nstates(e_0,u_0,n,keys_tmp,Nint,istate,N_st,sze_8) + use bitmasks + implicit none + BEGIN_DOC + ! Computes e_0 = / + ! + ! n : number of determinants + ! + END_DOC + integer, intent(in) :: n,Nint,N_st,sze_8 + double precision, intent(out) :: e_0(N_st) + double precision, intent(in) :: u_0(n,N_st) + integer(bit_kind),intent(in) :: keys_tmp(Nint,2,n) integer,intent(in) :: istate double precision :: H_jj(n) - double precision :: v_0(n) + double precision :: v_0(n,N_st) double precision :: u_dot_u,u_dot_v,diag_H_mat_elem integer :: i,j do i = 1, n @@ -392,12 +420,32 @@ subroutine u0_H_u_0_mrcc(e_0,u_0,n,keys_tmp,Nint,istate) H_jj(idx_ref(i)) += delta_ii(istate,i) enddo - call H_u_0_mrcc(v_0,u_0,H_jj,n,keys_tmp,Nint,istate) - e_0 = u_dot_v(v_0,u_0,n)/u_dot_u(u_0,n) + call H_u_0_mrcc_nstates(v_0,u_0,H_jj,n,keys_tmp,Nint,istate,N_st,sze_8) + do i=1,N_st + e_0(i) = u_dot_v(v_0(1,i),u_0(1,i),n)/u_dot_u(u_0(1,i),n) + enddo end -subroutine H_u_0_mrcc(v_0,u_0,H_jj,n,keys_tmp,Nint,istate) +subroutine H_u_0_mrcc(v_0,u_0,H_jj,n,keys_tmp,Nint,istate_in) + use bitmasks + implicit none + BEGIN_DOC + ! Computes v_0 = H|u_0> + ! + ! n : number of determinants + ! + ! H_jj : array of + END_DOC + integer, intent(in) :: n,Nint,istate_in + double precision, intent(out) :: v_0(n) + double precision, intent(in) :: u_0(n) + double precision, intent(in) :: H_jj(n) + integer(bit_kind),intent(in) :: keys_tmp(Nint,2,n) + call H_u_0_mrcc_nstates(v_0,u_0,H_jj,n,keys_tmp,Nint,1,n) +end + +subroutine H_u_0_mrcc_nstates(v_0,u_0,H_jj,n,keys_tmp,Nint,istate_in,N_st,sze_8) use bitmasks implicit none BEGIN_DOC @@ -407,130 +455,135 @@ subroutine H_u_0_mrcc(v_0,u_0,H_jj,n,keys_tmp,Nint,istate) ! ! H_jj : array of END_DOC - integer, intent(in) :: n,Nint,istate - double precision, intent(out) :: v_0(n) - double precision, intent(in) :: u_0(n) + integer, intent(in) :: n,Nint,istate_in,N_st,sze_8 + double precision, intent(out) :: v_0(sze_8,N_st) + double precision, intent(in) :: u_0(sze_8,N_st) double precision, intent(in) :: H_jj(n) integer(bit_kind),intent(in) :: keys_tmp(Nint,2,n) - integer, allocatable :: idx(:) double precision :: hij - double precision, allocatable :: vt(:) + double precision, allocatable :: vt(:,:) integer :: i,j,k,l, jj,ii integer :: i0, j0 + integer(bit_kind) :: sorted_i(Nint) + - integer :: shortcut(0:n+1), sort_idx(n) - integer(bit_kind) :: sorted(Nint,n), version(Nint,n) + integer,allocatable :: shortcut(:,:), sort_idx(:,:) + integer(bit_kind), allocatable :: sorted(:,:,:), version(:,:,:) - integer :: sh, sh2, ni, exa, ext, org_i, org_j, endi, pass -! - + integer :: sh, sh2, ni, exa, ext, org_i, org_j, endi, pass, istate + + ASSERT (Nint > 0) ASSERT (Nint == N_int) ASSERT (n>0) - PROVIDE ref_bitmask_energy delta_ij - integer, parameter :: block_size = 157 - - - !$OMP PARALLEL DEFAULT(NONE) & - !$OMP PRIVATE(i,hij,j,k,idx,jj,vt,ii,sh, sh2, ni, exa, ext, org_i, org_j, endi, pass) & - !$OMP SHARED(n_det_ref,n_det_non_ref,idx_ref,idx_non_ref,n,H_jj,u_0,keys_tmp,Nint,v_0,istate,delta_ij,sorted,shortcut,sort_idx,version) - - - - !$OMP DO SCHEDULE(static) - do i=1,n - v_0(i) = H_jj(i) * u_0(i) - enddo - !$OMP END DO + PROVIDE ref_bitmask_energy + allocate (shortcut(0:n+1,2), sort_idx(n,2), sorted(Nint,n,2), version(Nint,n,2)) + v_0 = 0.d0 - allocate(idx(0:n), vt(n)) + call sort_dets_ab_v(keys_tmp, sorted(1,1,1), sort_idx(1,1), shortcut(0,1), version(1,1,1), n, Nint) + call sort_dets_ba_v(keys_tmp, sorted(1,1,2), sort_idx(1,2), shortcut(0,2), version(1,1,2), n, Nint) + + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP PRIVATE(i,hij,j,k,jj,vt,ii,sh,sh2,ni,exa,ext,org_i,org_j,endi,sorted_i,istate)& + !$OMP SHARED(n,H_jj,u_0,keys_tmp,Nint,v_0,sorted,shortcut,sort_idx,version,N_st,sze_8,& + !$OMP istate_in,delta_ij,N_det_ref,N_det_non_ref,idx_ref,idx_non_ref) + allocate(vt(sze_8,N_st)) Vt = 0.d0 - - !$OMP SINGLE - call sort_dets_ab_v(keys_tmp, sorted, sort_idx, shortcut, version, n, Nint) - !$OMP END SINGLE - - !$OMP DO SCHEDULE(dynamic) - do sh=1,shortcut(0) - do sh2=1,sh - exa = 0 - do ni=1,Nint - exa += popcnt(xor(version(ni,sh), version(ni,sh2))) - end do - if(exa > 2) then - cycle - end if - - do i=shortcut(sh),shortcut(sh+1)-1 - if(sh==sh2) then - endi = i-1 - else - endi = shortcut(sh2+1)-1 + do sh=1,shortcut(0,1) + do sh2=sh,shortcut(0,1) + exa = 0 + do ni=1,Nint + exa = exa + popcnt(xor(version(ni,sh,1), version(ni,sh2,1))) + end do + if(exa > 2) then + cycle end if - do j=shortcut(sh2),endi - ext = exa - do ni=1,Nint - ext += popcnt(xor(sorted(ni,i), sorted(ni,j))) - end do - if(ext <= 4) then - org_i = sort_idx(i) - org_j = sort_idx(j) - - call i_H_j(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),Nint,hij) - vt (org_i) = vt (org_i) + hij*u_0(org_j) - vt (org_j) = vt (org_j) + hij*u_0(org_i) + do i=shortcut(sh,1),shortcut(sh+1,1)-1 + org_i = sort_idx(i,1) + if(sh==sh2) then + endi = i-1 + else + endi = shortcut(sh2+1,1)-1 end if - end do - end do - end do + do ni=1,Nint + sorted_i(ni) = sorted(ni,i,1) + enddo + + do j=shortcut(sh2,1),endi + org_j = sort_idx(j,1) + ext = exa + do ni=1,Nint + ext = ext + popcnt(xor(sorted_i(ni), sorted(ni,j,1))) + end do + if(ext <= 4) then + call i_H_j(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),Nint,hij) + do istate=1,N_st + vt (org_i,istate) = vt (org_i,istate) + hij*u_0(org_j,istate) + vt (org_j,istate) = vt (org_j,istate) + hij*u_0(org_i,istate) + enddo + endif + enddo + enddo + enddo enddo - !$OMP END DO + !$OMP END DO NOWAIT - !$OMP SINGLE - call sort_dets_ba_v(keys_tmp, sorted, sort_idx, shortcut, version, n, Nint) - !$OMP END SINGLE - !$OMP DO SCHEDULE(dynamic) - do sh=1,shortcut(0) - do i=shortcut(sh),shortcut(sh+1)-1 - do j=shortcut(sh),i-1 + do sh=1,shortcut(0,2) + do i=shortcut(sh,2),shortcut(sh+1,2)-1 + org_i = sort_idx(i,2) + do j=shortcut(sh,2),i-1 + org_j = sort_idx(j,2) ext = 0 do ni=1,Nint - ext += popcnt(xor(sorted(ni,i), sorted(ni,j))) + ext = ext + popcnt(xor(sorted(ni,i,2), sorted(ni,j,2))) end do if(ext == 4) then - org_i = sort_idx(i) - org_j = sort_idx(j) call i_H_j(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),Nint,hij) - vt (org_i) = vt (org_i) + hij*u_0(org_j) - vt (org_j) = vt (org_j) + hij*u_0(org_i) + do istate=1,N_st + vt (org_i,istate) = vt (org_i,istate) + hij*u_0(org_j,istate) + vt (org_j,istate) = vt (org_j,istate) + hij*u_0(org_i,istate) + enddo end if end do end do enddo - !$OMP END DO + !$OMP END DO NOWAIT - - !$OMP DO SCHEDULE(guided) + !$OMP DO do ii=1,n_det_ref i = idx_ref(ii) do jj = 1, n_det_non_ref j = idx_non_ref(jj) - vt (i) = vt (i) + delta_ij(istate,jj,ii)*u_0(j) - vt (j) = vt (j) + delta_ij(istate,jj,ii)*u_0(i) + do istate=1,N_st + vt (i,istate) = vt (i,istate) + delta_ij(istate_in,jj,ii)*u_0(j,istate) + vt (j,istate) = vt (j,istate) + delta_ij(istate_in,jj,ii)*u_0(i,istate) + enddo enddo enddo !$OMP END DO + !$OMP CRITICAL - do i=1,n - v_0(i) = v_0(i) + vt(i) + do istate=1,N_st + do i=n,1,-1 + v_0(i,istate) = v_0(i,istate) + vt(i,istate) + enddo enddo !$OMP END CRITICAL - deallocate(idx,vt) + + deallocate(vt) !$OMP END PARALLEL + + do istate=1,N_st + do i=1,n + v_0(i,istate) += H_jj(i) * u_0(i,istate) + enddo + enddo + deallocate (shortcut, sort_idx, sorted, version) + end diff --git a/plugins/MRCC_Utils/mrcc_utils.irp.f b/plugins/MRCC_Utils/mrcc_utils.irp.f index 210ce848..2304e39b 100644 --- a/plugins/MRCC_Utils/mrcc_utils.irp.f +++ b/plugins/MRCC_Utils/mrcc_utils.irp.f @@ -129,7 +129,7 @@ END_PROVIDER integer :: i_other_state double precision, allocatable :: eigenvectors(:,:), eigenvalues(:) integer :: i_state - double precision :: s2,e_0 + double precision :: e_0 integer :: i,j,k double precision, allocatable :: s2_eigvalues(:) double precision, allocatable :: e_array(:) @@ -168,7 +168,7 @@ END_PROVIDER N_det,size(eigenvectors,1)) do j=1,N_det ! Select at least n_states states with S^2 values closed to "expected_s2" - if(dabs(s2-expected_s2).le.0.5d0)then + if(dabs(s2_eigvalues(j)-expected_s2).le.0.5d0)then i_state += 1 index_good_state_array(i_state) = j good_state_array(j) = .True. diff --git a/src/Determinants/davidson.irp.f b/src/Determinants/davidson.irp.f index b751bcbc..2d7b32bf 100644 --- a/src/Determinants/davidson.irp.f +++ b/src/Determinants/davidson.irp.f @@ -327,7 +327,7 @@ subroutine davidson_diag_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,N_st,Nint,iun integer :: k_pairs, kl integer :: iter2 - double precision, allocatable :: W(:,:,:), U(:,:,:), R(:,:), Wt(:) + double precision, allocatable :: W(:,:,:), U(:,:,:), R(:,:) double precision, allocatable :: y(:,:,:,:), h(:,:,:,:), lambda(:) double precision :: diag_h_mat_elem double precision :: residual_norm(N_st) @@ -335,7 +335,7 @@ subroutine davidson_diag_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,N_st,Nint,iun double precision :: to_print(2,N_st) double precision :: cpu, wall - !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: U, W, R, Wt, y, h, lambda + !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: U, W, R, y, h, lambda call write_time(iunit) @@ -369,10 +369,9 @@ subroutine davidson_diag_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,N_st,Nint,iun allocate( & kl_pairs(2,N_st*(N_st+1)/2), & - W(sze_8,N_st,davidson_sze_max), & - Wt(sze), & - U(sze_8,N_st,davidson_sze_max), & - R(sze_8,N_st), & + W(sze_8,N_st,davidson_sze_max), & + U(sze_8,N_st,davidson_sze_max), & + R(sze_8,N_st), & h(N_st,davidson_sze_max,N_st,davidson_sze_max), & y(N_st,davidson_sze_max,N_st,davidson_sze_max), & lambda(N_st*davidson_sze_max)) @@ -612,7 +611,6 @@ subroutine davidson_diag_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,N_st,Nint,iun deallocate ( & kl_pairs, & W, & - Wt, & U, & R, & h, &