From 67836776e1646f008bbab9da48f3f121e011e525 Mon Sep 17 00:00:00 2001 From: Yann Garniron Date: Sun, 1 Nov 2015 08:57:01 +0100 Subject: [PATCH] optimized H_u_0_mrcc and get_s2_u0 - some unused functions commented out --- plugins/MRCC_Utils/davidson.irp.f | 132 +++++++++++++++++- src/Determinants/davidson.irp.f | 24 ++-- src/Determinants/s2.irp.f | 224 ++++++++++++++++++++++-------- 3 files changed, 307 insertions(+), 73 deletions(-) diff --git a/plugins/MRCC_Utils/davidson.irp.f b/plugins/MRCC_Utils/davidson.irp.f index 33aa8ee5..1757305c 100644 --- a/plugins/MRCC_Utils/davidson.irp.f +++ b/plugins/MRCC_Utils/davidson.irp.f @@ -215,8 +215,9 @@ subroutine davidson_diag_hjj_mrcc(dets_in,u_in,H_jj,energies,dim_in,sze,N_st,Nin ! ---------------------- do k=1,N_st - call H_u_0_mrcc(W(1,k,iter),U(1,k,iter),H_jj,sze,dets_in_sorted,shortcut,idx,Nint,istate) + !call H_u_0_mrcc(W(1,k,iter),U(1,k,iter),H_jj,sze,dets_in_sorted,shortcut,idx,Nint,istate) !call H_u_0_mrcc_org(W(1,k,iter),U(1,k,iter),H_jj,sze,dets_in,Nint,istate) + call H_u_0_mrcc(W(1,k,iter),U(1,k,iter),H_jj,sze,dets_in,Nint,istate) enddo ! Compute h_kl = = @@ -368,7 +369,9 @@ subroutine davidson_diag_hjj_mrcc(dets_in,u_in,H_jj,energies,dim_in,sze,N_st,Nin abort_here = abort_all end -subroutine H_u_0_mrcc(v_0,u_0,H_jj,n,keys_tmp,shortcut,sort_idx,Nint,istate) + + +subroutine H_u_0_mrcc_myold(v_0,u_0,H_jj,n,keys_tmp,shortcut,sort_idx,Nint,istate) use bitmasks implicit none BEGIN_DOC @@ -449,6 +452,131 @@ subroutine H_u_0_mrcc(v_0,u_0,H_jj,n,keys_tmp,shortcut,sort_idx,Nint,istate) + !$OMP DO SCHEDULE(guided) + do ii=1,n_det_ref + i = idx_ref(ii) + do jj = 1, n_det_non_ref + j = idx_non_ref(jj) + vt (i) = vt (i) + delta_ij(ii,jj,istate)*u_0(j) + vt (j) = vt (j) + delta_ij(ii,jj,istate)*u_0(i) + enddo + enddo + !$OMP END DO + !$OMP CRITICAL + do i=1,n + v_0(i) = v_0(i) + vt(i) + enddo + !$OMP END CRITICAL + deallocate(idx,vt) + !$OMP END PARALLEL +end + + +subroutine H_u_0_mrcc(v_0,u_0,H_jj,n,keys_tmp,Nint,istate) + use bitmasks + implicit none + BEGIN_DOC + ! Computes v_0 = H|u_0> + ! + ! n : number of determinants + ! + ! H_jj : array of + END_DOC + integer, intent(in) :: n,Nint,istate + double precision, intent(out) :: v_0(n) + double precision, intent(in) :: u_0(n) + double precision, intent(in) :: H_jj(n) + integer(bit_kind),intent(in) :: keys_tmp(Nint,2,n) + integer, allocatable :: idx(:) + double precision :: hij + double precision, allocatable :: vt(:) + integer :: i,j,k,l, jj,ii + integer :: i0, j0 + + integer :: shortcut(0:n+1), sort_idx(n) + integer(bit_kind) :: sorted(Nint,n), version(Nint,n) + integer :: sh, sh2, ni, exa, ext, org_i, org_j, endi, pass +! + + ASSERT (Nint > 0) + ASSERT (Nint == N_int) + ASSERT (n>0) + PROVIDE ref_bitmask_energy delta_ij + integer, parameter :: block_size = 157 + + + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP PRIVATE(i,hij,j,k,idx,jj,vt,ii,sh, sh2, ni, exa, ext, org_i, org_j, endi, pass) & + !$OMP SHARED(n_det_ref,n_det_non_ref,idx_ref,idx_non_ref,n,H_jj,u_0,keys_tmp,Nint,v_0,istate,delta_ij,sorted,shortcut,sort_idx,version) + + + + !$OMP DO SCHEDULE(static) + do i=1,n + v_0(i) = H_jj(i) * u_0(i) + enddo + !$OMP END DO + + allocate(idx(0:n), vt(n)) + Vt = 0.d0 + + do pass=1,2 + if(pass == 1) then + call sort_dets_ab_v(keys_tmp, sorted, sort_idx, shortcut, version, n, Nint) + else + call sort_dets_ba_v(keys_tmp, sorted, sort_idx, shortcut, version, n, Nint) + end if + + + !$OMP DO SCHEDULE(dynamic) + do sh=1,shortcut(0) + + if(pass == 2) then + endi = sh + else + endi = 1 + end if + + do sh2=endi,sh + exa = 0 + do ni=1,Nint + exa += popcnt(xor(version(ni,sh), version(ni,sh2))) + end do + if(exa > 2) then + cycle + end if + + do i=shortcut(sh),shortcut(sh+1)-1 + if(sh==sh2) then + endi = i-1 + else + endi = shortcut(sh2+1)-1 + end if + + do j=shortcut(sh2),endi + ext = exa + do ni=1,Nint + ext += popcnt(xor(sorted(ni,i), sorted(ni,j))) + end do + if(ext <= 4) then + org_i = sort_idx(i) + org_j = sort_idx(j) + if ( (dabs(u_0(org_j)) > 1.d-7).or.((dabs(u_0(org_i)) > 1.d-7)) ) then + call i_H_j(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),Nint,hij) + vt (org_i) = vt (org_i) + hij*u_0(org_j) + vt (org_j) = vt (org_j) + hij*u_0(org_i) + endif + end if + end do + end do + end do + enddo + !$OMP END DO + end do + + + + !$OMP DO SCHEDULE(guided) do ii=1,n_det_ref i = idx_ref(ii) diff --git a/src/Determinants/davidson.irp.f b/src/Determinants/davidson.irp.f index b017570a..ebc30e99 100644 --- a/src/Determinants/davidson.irp.f +++ b/src/Determinants/davidson.irp.f @@ -121,35 +121,35 @@ subroutine tamiser(key, idx, no, n, Nint, N_key) end subroutine -subroutine sort_dets_ba_v(key, idx, shortcut, version, N_key, Nint) +subroutine sort_dets_ba_v(key_in, key_out, idx, shortcut, version, N_key, Nint) use bitmasks implicit none - integer(bit_kind),intent(inout) :: key(Nint,2,N_key) + integer(bit_kind),intent(in) :: key_in(Nint,2,N_key) + integer(bit_kind) :: key(Nint,2,N_key) + integer(bit_kind),intent(out) :: key_out(Nint,N_key) integer,intent(out) :: idx(N_key) integer,intent(out) :: shortcut(0:N_key+1) integer(bit_kind),intent(out) :: version(Nint,N_key+1) integer, intent(in) :: Nint, N_key integer(bit_kind) :: tmp(Nint, 2,N_key) - tmp(:,1,:N_key) = key(:,2,:N_key) - tmp(:,2,:N_key) = key(:,1,:N_key) + key(:,1,:N_key) = key_in(:,2,:N_key) + key(:,2,:N_key) = key_in(:,1,:N_key) - call sort_dets_ab_v(tmp, idx, shortcut, version, N_key, Nint) - - - key(:,1,:N_key) = tmp(:,2,:N_key) - key(:,2,:N_key) = tmp(:,1,:N_key) + call sort_dets_ab_v(key, key_out, idx, shortcut, version, N_key, Nint) end subroutine -subroutine sort_dets_ab_v(key, idx, shortcut, version, N_key, Nint) +subroutine sort_dets_ab_v(key_in, key_out, idx, shortcut, version, N_key, Nint) use bitmasks implicit none - integer(bit_kind),intent(inout) :: key(Nint,2,N_key) + integer(bit_kind),intent(in) :: key_in(Nint,2,N_key) + integer(bit_kind) :: key(Nint,2,N_key) + integer(bit_kind),intent(out) :: key_out(Nint,N_key) integer,intent(out) :: idx(N_key) integer,intent(out) :: shortcut(0:N_key+1) integer(bit_kind),intent(out) :: version(Nint,N_key+1) @@ -157,6 +157,7 @@ subroutine sort_dets_ab_v(key, idx, shortcut, version, N_key, Nint) integer(bit_kind) :: tmp(Nint, 2) integer :: tmpidx,i,ni + key(:,:,:) = key_in(:,:,:) do i=1,N_key idx(i) = i end do @@ -188,6 +189,7 @@ subroutine sort_dets_ab_v(key, idx, shortcut, version, N_key, Nint) end do end do shortcut(shortcut(0)+1) = N_key+1 + key_out(:,:) = key(:,2,:) end subroutine c diff --git a/src/Determinants/s2.irp.f b/src/Determinants/s2.irp.f index 33b3c45d..89d0a72b 100644 --- a/src/Determinants/s2.irp.f +++ b/src/Determinants/s2.irp.f @@ -106,78 +106,182 @@ subroutine get_s2_u0_old(psi_keys_tmp,psi_coefs_tmp,n,nmax,s2) s2 += S_z2_Sz end - subroutine get_s2_u0(psi_keys_tmp,psi_coefs_tmp,n,nmax,s2) - implicit none - use bitmasks - integer(bit_kind), intent(in) :: psi_keys_tmp(N_int,2,nmax) - integer, intent(in) :: n,nmax - double precision, intent(in) :: psi_coefs_tmp(nmax) - double precision, intent(out) :: s2 - double precision :: s2_tmp - integer :: i,j,l,jj,ii - integer, allocatable :: idx(:) + implicit none + use bitmasks + integer(bit_kind), intent(in) :: psi_keys_tmp(N_int,2,nmax) + integer, intent(in) :: n,nmax + double precision, intent(in) :: psi_coefs_tmp(nmax) + double precision, intent(out) :: s2 + double precision :: s2_tmp + integer :: i,j,l,jj,ii + integer, allocatable :: idx(:) - integer(bit_kind) :: psi_keys_srt(N_int,2,n) - integer :: shortcut(0:n+1), sort_idx(n), warp(2,0:n+1), ni, sh, tmp - - - print *, "totolacitrouille" - call write_time(6) - psi_keys_srt(:,:,:) = psi_keys_tmp(:,:,:) - call sort_dets_ab(psi_keys_srt, sort_idx, shortcut, n, N_int) - print *, "totolacitrouille 2" - s2 = 0.d0 - !$OMP PARALLEL DEFAULT(NONE) & - !$OMP PRIVATE(i,j,s2_tmp,idx,warp,tmp) & - !$OMP SHARED(n,psi_coefs_tmp,psi_keys_tmp,N_int,davidson_threshold,shortcut,psi_keys_srt,sort_idx)& - !$OMP REDUCTION(+:s2) - allocate(idx(0:n)) - !$OMP DO SCHEDULE(dynamic) - - do sh=1,shortcut(0) - warp(1,0) = 0 - do ii=1,sh!shortcut(0) - tmp = 0 - do ni=1,N_int - tmp = popcnt(xor(psi_keys_tmp(ni,1, shortcut(ii)), psi_keys_tmp(ni,1,shortcut(sh)))) - end do - if(tmp <= 4) then - tmp = warp(1,0) + 1 - warp(1,0) = tmp - warp(1,tmp) = shortcut(ii) - warp(2,tmp) = shortcut(ii+1)-1 - end if - end do + integer :: shortcut(0:n+1), sort_idx(n) + integer(bit_kind) :: sorted(N_int,n), version(N_int,n) + integer :: sh, sh2, ni, exa, ext, org_i, org_j, endi, pass + double precision :: davidson_threshold_bis + + !PROVIDE davidson_threshold + + s2 = 0.d0 + davidson_threshold_bis = davidson_threshold + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP PRIVATE(i,j,s2_tmp,idx,sh, sh2, ni, exa, ext, org_i, org_j, endi, pass) & + !$OMP SHARED(n,psi_coefs_tmp,psi_keys_tmp,N_int,davidson_threshold,shortcut,sorted,sort_idx,version)& + !$OMP REDUCTION(+:s2) + allocate(idx(0:n)) + + do pass=1,2 + if(pass == 1) then + call sort_dets_ab_v(psi_keys_tmp, sorted, sort_idx, shortcut, version, n, N_int) + else + call sort_dets_ba_v(psi_keys_tmp, sorted, sort_idx, shortcut, version, n, N_int) + end if - do ii=shortcut(sh),shortcut(sh+1)-1 - !do ii=1,n - idx(0) = ii - call filter_connected_davidson_warp(psi_keys_srt,warp,psi_keys_srt(1,1,ii),N_int,ii-1,idx) - i = sort_idx(ii) - do jj=1,idx(0) - j = sort_idx(idx(jj)) - if ( dabs(psi_coefs_tmp(j)) + dabs(psi_coefs_tmp(i)) & - > davidson_threshold ) then - call get_s2(psi_keys_srt(1,1,ii),psi_keys_srt(1,1,idx(jj)),s2_tmp,N_int) - s2 = s2 + psi_coefs_tmp(i)*psi_coefs_tmp(j)*s2_tmp - endif - enddo - end do - enddo - !$OMP END DO + + !$OMP DO SCHEDULE(dynamic) + do sh=1,shortcut(0) + + if(pass == 2) then + endi = sh + else + endi = 1 + end if + + do sh2=endi,sh + exa = 0 + do ni=1,N_int + exa += popcnt(xor(version(ni,sh), version(ni,sh2))) + end do + if(exa > 2) then + cycle + end if + + do i=shortcut(sh),shortcut(sh+1)-1 + if(sh==sh2) then + endi = i-1 + else + endi = shortcut(sh2+1)-1 + end if + + do j=shortcut(sh2),endi + ext = exa + do ni=1,N_int + ext += popcnt(xor(sorted(ni,i), sorted(ni,j))) + end do + if(ext <= 4) then + org_i = sort_idx(i) + org_j = sort_idx(j) + + + if ( dabs(psi_coefs_tmp(org_j)) + dabs(psi_coefs_tmp(org_i)) & + > davidson_threshold ) then + call get_s2(psi_keys_tmp(1,1,org_i),psi_keys_tmp(1,1,org_j),s2_tmp,N_int) + s2 = s2 + psi_coefs_tmp(org_i)*psi_coefs_tmp(org_j)*s2_tmp + endif + + end if + end do + end do + end do + enddo + !$OMP END DO + end do deallocate(idx) !$OMP END PARALLEL s2 = s2+s2 do i=1,n - call get_s2(psi_keys_srt(1,1,sort_idx(i)),psi_keys_srt(1,1,sort_idx(i)),s2_tmp,N_int) + call get_s2(psi_keys_tmp(1,1,i),psi_keys_tmp(1,1,i),s2_tmp,N_int) s2 = s2 + psi_coefs_tmp(i)*psi_coefs_tmp(i)*s2_tmp enddo s2 = s2 + S_z2_Sz - print *, "totolacitrouille 3" - call write_time(6) end + + + +! subroutine get_s2_u0(psi_keys_tmp,psi_coefs_tmp,n,nmax,s2) +! implicit none +! use bitmasks +! integer(bit_kind), intent(in) :: psi_keys_tmp(N_int,2,nmax) +! integer, intent(in) :: n,nmax +! double precision, intent(in) :: psi_coefs_tmp(nmax) +! double precision, intent(out) :: s2 +! double precision :: s2_tmp +! integer :: i,j,l,jj,ii +! integer, allocatable :: idx(:) +! +! integer(bit_kind) :: psi_keys_srt(N_int,2,n) +! integer :: shortcut(0:n+1), sort_idx(n), warp(2,0:n+1), ni, sh, tmp +! integer :: mon, bie, egz +! +! +! psi_keys_srt(:,:,:) = psi_keys_tmp(:,:,:) +! call sort_dets_ab(psi_keys_srt, sort_idx, shortcut, n, N_int) +! +! s2 = 0.d0 +! !$OMP PARALLEL DEFAULT(NONE) & +! !$OMP PRIVATE(i,j,s2_tmp,idx,warp,tmp,mon,bie,egz) & +! !$OMP SHARED(n,psi_coefs_tmp,psi_keys_tmp,N_int,davidson_threshold,shortcut,psi_keys_srt,sort_idx)& +! !$OMP REDUCTION(+:s2) +! allocate(idx(0:n)) +! !$OMP DO SCHEDULE(dynamic) +! +! do sh=1,shortcut(0) +! mon = 0 +! bie = 0 +! +! warp(1,0) = 0 +! do ii=1,sh!shortcut(0) +! tmp = 0 +! do ni=1,N_int +! tmp += popcnt(xor(psi_keys_tmp(ni,1, shortcut(ii)), psi_keys_tmp(ni,1,shortcut(sh)))) +! end do +! egz = tmp +! if(tmp <= 4) then +! tmp = warp(1,0) + 1 +! warp(1,0) = tmp +! warp(1,tmp) = shortcut(ii) +! warp(2,tmp) = shortcut(ii+1)-1 +! if(egz == 4) then +! bie = bie + shortcut(ii+1) - shortcut(ii) +! else +! mon = mon + shortcut(ii+1) - shortcut(ii) +! end if +! end if +! end do +! +! if(shortcut(sh+1) - shortcut(sh) /= 1) then +! print *, shortcut(sh+1) - shortcut(sh), shortcut(sh+1), mon, bie +! end if +! +! do ii=shortcut(sh),shortcut(sh+1)-1 +! !do ii=1,n +! idx(0) = ii +! call filter_connected_davidson_warp(psi_keys_srt,warp,psi_keys_srt(1,1,ii),N_int,ii-1,idx) +! i = sort_idx(ii) +! do jj=1,idx(0) +! j = sort_idx(idx(jj)) +! if ( dabs(psi_coefs_tmp(j)) + dabs(psi_coefs_tmp(i)) & +! > davidson_threshold ) then +! call get_s2(psi_keys_srt(1,1,ii),psi_keys_srt(1,1,idx(jj)),s2_tmp,N_int) +! s2 = s2 + psi_coefs_tmp(i)*psi_coefs_tmp(j)*s2_tmp +! endif +! enddo +! end do +! enddo +! !$OMP END DO +! deallocate(idx) +! !$OMP END PARALLEL +! s2 = s2+s2 +! do i=1,n +! call get_s2(psi_keys_srt(1,1,sort_idx(i)),psi_keys_srt(1,1,sort_idx(i)),s2_tmp,N_int) +! s2 = s2 + psi_coefs_tmp(i)*psi_coefs_tmp(i)*s2_tmp +! enddo +! s2 = s2 + S_z2_Sz +! end + ! ! subroutine get_s2_u0_org(psi_keys_tmp,psi_coefs_tmp,n,nmax,s2) ! implicit none