From 6a640567e89ca28b522b7774aa4054fa7aae30f4 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Fri, 13 Nov 2015 17:36:29 +0100 Subject: [PATCH] Davidson optimization --- src/Determinants/davidson.irp.f | 99 +++++++++++++++++------- src/Determinants/s2.irp.f | 116 ++++++++++++++-------------- src/Determinants/slater_rules.irp.f | 104 +++++++++++++------------ 3 files changed, 184 insertions(+), 135 deletions(-) diff --git a/src/Determinants/davidson.irp.f b/src/Determinants/davidson.irp.f index 6ba39424..fd2efcf9 100644 --- a/src/Determinants/davidson.irp.f +++ b/src/Determinants/davidson.irp.f @@ -90,27 +90,36 @@ end function subroutine tamiser(key, idx, no, n, Nint, N_key) use bitmasks - implicit none - integer(bit_kind),intent(inout) :: key(Nint, 2, N_key) + + BEGIN_DOC +! Uncodumented : TODO + END_DOC integer,intent(in) :: no, n, Nint, N_key + integer(bit_kind),intent(inout) :: key(Nint, 2, N_key) integer,intent(inout) :: idx(N_key) integer :: k,j,tmpidx integer(bit_kind) :: tmp(Nint, 2) logical :: det_inf + integer :: ni k = no j = 2*k do while(j <= n) if(j < n) then - if (det_inf(key(:,:,j), key(:,:,j+1), Nint)) then + if (det_inf(key(1,1,j), key(1,1,j+1), Nint)) then j = j+1 endif endif - if(det_inf(key(:,:,k), key(:,:,j), Nint)) then - tmp(:,:) = key(:,:,k) - key(:,:,k) = key(:,:,j) - key(:,:,j) = tmp(:,:) + if(det_inf(key(1,1,k), key(1,1,j), Nint)) then + do ni=1,Nint + tmp(ni,1) = key(ni,1,k) + tmp(ni,2) = key(ni,2,k) + key(ni,1,k) = key(ni,1,j) + key(ni,2,k) = key(ni,2,j) + key(ni,1,j) = tmp(ni,1) + key(ni,2,j) = tmp(ni,2) + enddo tmpidx = idx(k) idx(k) = idx(j) idx(j) = tmpidx @@ -126,17 +135,25 @@ end subroutine subroutine sort_dets_ba_v(key_in, key_out, idx, shortcut, version, N_key, Nint) use bitmasks implicit none - integer(bit_kind),intent(in) :: key_in(Nint,2,N_key) - integer(bit_kind) :: key(Nint,2,N_key) - integer(bit_kind),intent(out) :: key_out(Nint,N_key) - integer,intent(out) :: idx(N_key) - integer,intent(out) :: shortcut(0:N_key+1) - integer(bit_kind),intent(out) :: version(Nint,N_key+1) - integer, intent(in) :: Nint, N_key - integer(bit_kind) :: tmp(Nint, 2,N_key) + integer, intent(in) :: Nint, N_key + integer(bit_kind),intent(in) :: key_in(Nint,2,N_key) + integer(bit_kind) :: key(Nint,2,N_key) + integer(bit_kind),intent(out) :: key_out(Nint,N_key) + integer,intent(out) :: idx(N_key) + integer,intent(out) :: shortcut(0:N_key+1) + integer(bit_kind),intent(out) :: version(Nint,N_key+1) + integer(bit_kind) :: tmp(Nint, 2,N_key) + integer :: i,ni - key(:,1,:N_key) = key_in(:,2,:N_key) - key(:,2,:N_key) = key_in(:,1,:N_key) + BEGIN_DOC +! Uncodumented : TODO + END_DOC + do i=1,N_key + do ni=1,Nint + key(ni,1,i) = key_in(ni,2,i) + key(ni,2,i) = key_in(ni,1,i) + enddo + enddo call sort_dets_ab_v(key, key_out, idx, shortcut, version, N_key, Nint) @@ -148,18 +165,24 @@ subroutine sort_dets_ab_v(key_in, key_out, idx, shortcut, version, N_key, Nint) use bitmasks implicit none + BEGIN_DOC +! Uncodumented : TODO + END_DOC + integer, intent(in) :: Nint, N_key integer(bit_kind),intent(in) :: key_in(Nint,2,N_key) integer(bit_kind) :: key(Nint,2,N_key) integer(bit_kind),intent(out) :: key_out(Nint,N_key) integer,intent(out) :: idx(N_key) integer,intent(out) :: shortcut(0:N_key+1) integer(bit_kind),intent(out) :: version(Nint,N_key+1) - integer, intent(in) :: Nint, N_key integer(bit_kind) :: tmp(Nint, 2) integer :: tmpidx,i,ni - key(:,:,:) = key_in(:,:,:) do i=1,N_key + do ni=1,Nint + key(ni,1,i) = key_in(ni,1,i) + key(ni,2,i) = key_in(ni,2,i) + enddo idx(i) = i end do @@ -168,9 +191,14 @@ subroutine sort_dets_ab_v(key_in, key_out, idx, shortcut, version, N_key, Nint) end do do i=N_key,2,-1 - tmp(:,:) = key(:,:,i) - key(:,:,i) = key(:,:,1) - key(:,:,1) = tmp(:,:) + do ni=1,Nint + tmp(ni,1) = key(ni,1,i) + tmp(ni,2) = key(ni,2,i) + key(ni,1,i) = key(ni,1,1) + key(ni,2,i) = key(ni,2,1) + key(ni,1,1) = tmp(ni,1) + key(ni,2,1) = tmp(ni,2) + enddo tmpidx = idx(i) idx(i) = idx(1) idx(1) = tmpidx @@ -179,7 +207,9 @@ subroutine sort_dets_ab_v(key_in, key_out, idx, shortcut, version, N_key, Nint) shortcut(0) = 1 shortcut(1) = 1 - version(:,1) = key(:,1,1) + do ni=1,Nint + version(ni,1) = key(ni,1,1) + enddo do i=2,N_key do ni=1,nint if(key(ni,1,i) /= key(ni,1,i-1)) then @@ -191,15 +221,22 @@ subroutine sort_dets_ab_v(key_in, key_out, idx, shortcut, version, N_key, Nint) end do end do shortcut(shortcut(0)+1) = N_key+1 - key_out(:,:) = key(:,2,:) + do i=1,N_key + do ni=1,Nint + key_out(ni,i) = key(ni,2,i) + enddo + enddo end subroutine -c subroutine sort_dets_ab(key, idx, shortcut, N_key, Nint) use bitmasks implicit none + + BEGIN_DOC +! Uncodumented : TODO + END_DOC integer(bit_kind),intent(inout) :: key(Nint,2,N_key) integer,intent(out) :: idx(N_key) integer,intent(out) :: shortcut(0:N_key+1) @@ -216,9 +253,15 @@ subroutine sort_dets_ab(key, idx, shortcut, N_key, Nint) end do do i=N_key,2,-1 - tmp(:,:) = key(:,:,i) - key(:,:,i) = key(:,:,1) - key(:,:,1) = tmp(:,:) + do ni=1,Nint + tmp(ni,1) = key(ni,1,i) + tmp(ni,2) = key(ni,2,i) + key(ni,1,i) = key(ni,1,1) + key(ni,2,i) = key(ni,2,1) + key(ni,1,1) = tmp(ni,1) + key(ni,2,1) = tmp(ni,2) + enddo + tmpidx = idx(i) idx(i) = idx(1) idx(1) = tmpidx diff --git a/src/Determinants/s2.irp.f b/src/Determinants/s2.irp.f index e836d25d..573f094f 100644 --- a/src/Determinants/s2.irp.f +++ b/src/Determinants/s2.irp.f @@ -109,81 +109,82 @@ end subroutine get_s2_u0(psi_keys_tmp,psi_coefs_tmp,n,nmax,s2) implicit none use bitmasks - integer(bit_kind), intent(in) :: psi_keys_tmp(N_int,2,nmax) - integer, intent(in) :: n,nmax - double precision, intent(in) :: psi_coefs_tmp(nmax) - double precision, intent(out) :: s2 - double precision :: s2_tmp - integer :: i,j,l,jj,ii + integer(bit_kind), intent(in) :: psi_keys_tmp(N_int,2,nmax) + integer, intent(in) :: n,nmax + double precision, intent(in) :: psi_coefs_tmp(nmax) + double precision, intent(out) :: s2 + double precision :: s2_tmp + integer :: i,j,l,jj,ii integer, allocatable :: idx(:) - + integer :: shortcut(0:n+1), sort_idx(n) integer(bit_kind) :: sorted(N_int,n), version(N_int,n) integer :: sh, sh2, ni, exa, ext, org_i, org_j, endi, pass double precision :: davidson_threshold_bis - + !PROVIDE davidson_threshold s2 = 0.d0 davidson_threshold_bis = davidson_threshold - !$OMP PARALLEL DEFAULT(NONE) & - !$OMP PRIVATE(i,j,s2_tmp,idx,sh, sh2, ni, exa, ext, org_i, org_j, endi, pass) & + call sort_dets_ab_v(psi_keys_tmp, sorted, sort_idx, shortcut, version, n, N_int) + + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP PRIVATE(i,j,s2_tmp,sh, sh2, ni, exa, ext, org_i, org_j, endi, pass)& !$OMP SHARED(n,psi_coefs_tmp,psi_keys_tmp,N_int,davidson_threshold,shortcut,sorted,sort_idx,version)& !$OMP REDUCTION(+:s2) - allocate(idx(0:n)) - - - !$OMP SINGLE - call sort_dets_ab_v(psi_keys_tmp, sorted, sort_idx, shortcut, version, n, N_int) - !$OMP END SINGLE !$OMP DO SCHEDULE(dynamic) do sh=1,shortcut(0) - - do sh2=1,sh - exa = 0 - do ni=1,N_int - exa += popcnt(xor(version(ni,sh), version(ni,sh2))) - end do - if(exa > 2) then - cycle - end if - do i=shortcut(sh),shortcut(sh+1)-1 - if(sh==sh2) then - endi = i-1 - else - endi = shortcut(sh2+1)-1 + do sh2=1,sh + exa = 0 + do ni=1,N_int + exa += popcnt(xor(version(ni,sh), version(ni,sh2))) + end do + if(exa > 2) then + cycle end if - do j=shortcut(sh2),endi - ext = exa - do ni=1,N_int - ext += popcnt(xor(sorted(ni,i), sorted(ni,j))) - end do - if(ext <= 4) then - org_i = sort_idx(i) - org_j = sort_idx(j) - - if ( dabs(psi_coefs_tmp(org_j)) + dabs(psi_coefs_tmp(org_i)) & - > davidson_threshold ) then - call get_s2(psi_keys_tmp(1,1,org_i),psi_keys_tmp(1,1,org_j),s2_tmp,N_int) - s2 = s2 + psi_coefs_tmp(org_i)*psi_coefs_tmp(org_j)*s2_tmp - endif + do i=shortcut(sh),shortcut(sh+1)-1 + if(sh==sh2) then + endi = i-1 + else + endi = shortcut(sh2+1)-1 end if + + do j=shortcut(sh2),endi + ext = exa + do ni=1,N_int + ext += popcnt(xor(sorted(ni,i), sorted(ni,j))) + end do + if(ext <= 4) then + org_i = sort_idx(i) + org_j = sort_idx(j) + + if ( dabs(psi_coefs_tmp(org_j)) + dabs(psi_coefs_tmp(org_i))& + > davidson_threshold ) then + call get_s2(psi_keys_tmp(1,1,org_i),psi_keys_tmp(1,1,org_j),s2_tmp,N_int) + s2 = s2 + psi_coefs_tmp(org_i)*psi_coefs_tmp(org_j)*s2_tmp + endif + end if + end do end do end do - end do enddo !$OMP END DO - !$OMP SINGLE + !$OMP END PARALLEL + call sort_dets_ba_v(psi_keys_tmp, sorted, sort_idx, shortcut, version, n, N_int) - !$OMP END SINGLE + + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP PRIVATE(i,j,s2_tmp,sh, sh2, ni, exa, ext, org_i, org_j, endi, pass)& + !$OMP SHARED(n,psi_coefs_tmp,psi_keys_tmp,N_int,davidson_threshold,shortcut,sorted,sort_idx,version)& + !$OMP REDUCTION(+:s2) !$OMP DO SCHEDULE(dynamic) do sh=1,shortcut(0) - do i=shortcut(sh),shortcut(sh+1)-1 + do i=shortcut(sh),shortcut(sh+1)-1 do j=shortcut(sh),i-1 ext = 0 do ni=1,N_int @@ -193,25 +194,24 @@ subroutine get_s2_u0(psi_keys_tmp,psi_coefs_tmp,n,nmax,s2) org_i = sort_idx(i) org_j = sort_idx(j) - if ( dabs(psi_coefs_tmp(org_j)) + dabs(psi_coefs_tmp(org_i)) & + if ( dabs(psi_coefs_tmp(org_j)) + dabs(psi_coefs_tmp(org_i))& > davidson_threshold ) then call get_s2(psi_keys_tmp(1,1,org_i),psi_keys_tmp(1,1,org_j),s2_tmp,N_int) s2 = s2 + psi_coefs_tmp(org_i)*psi_coefs_tmp(org_j)*s2_tmp endif end if end do - end do + end do enddo !$OMP END DO - deallocate(idx) - !$OMP END PARALLEL - s2 = s2+s2 - do i=1,n - call get_s2(psi_keys_tmp(1,1,i),psi_keys_tmp(1,1,i),s2_tmp,N_int) - s2 = s2 + psi_coefs_tmp(i)*psi_coefs_tmp(i)*s2_tmp - enddo - s2 = s2 + S_z2_Sz + !$OMP END PARALLEL + s2 = s2+s2 + do i=1,n + call get_s2(psi_keys_tmp(1,1,i),psi_keys_tmp(1,1,i),s2_tmp,N_int) + s2 = s2 + psi_coefs_tmp(i)*psi_coefs_tmp(i)*s2_tmp + enddo + s2 = s2 + S_z2_Sz end diff --git a/src/Determinants/slater_rules.irp.f b/src/Determinants/slater_rules.irp.f index c566b51f..4cb1afff 100644 --- a/src/Determinants/slater_rules.irp.f +++ b/src/Determinants/slater_rules.irp.f @@ -1068,14 +1068,14 @@ double precision function diag_H_mat_elem(det_in,Nint) nexc(1) = 0 nexc(2) = 0 do i=1,Nint - hole(i,1) = xor(det_in(i,1),ref_bitmask(i,1)) - hole(i,2) = xor(det_in(i,2),ref_bitmask(i,2)) + hole(i,1) = xor(det_in(i,1),ref_bitmask(i,1)) + hole(i,2) = xor(det_in(i,2),ref_bitmask(i,2)) particle(i,1) = iand(hole(i,1),det_in(i,1)) particle(i,2) = iand(hole(i,2),det_in(i,2)) hole(i,1) = iand(hole(i,1),ref_bitmask(i,1)) hole(i,2) = iand(hole(i,2),ref_bitmask(i,2)) - nexc(1) += popcnt(hole(i,1)) - nexc(2) += popcnt(hole(i,2)) + nexc(1) = nexc(1) + popcnt(hole(i,1)) + nexc(2) = nexc(2) + popcnt(hole(i,2)) enddo diag_H_mat_elem = ref_bitmask_energy @@ -1239,10 +1239,11 @@ subroutine H_u_0(v_0,u_0,H_jj,n,keys_tmp,Nint) integer :: shortcut(0:n+1), sort_idx(n) integer(bit_kind) :: sorted(Nint,n), version(Nint,n) + integer(bit_kind) :: sorted_i(Nint) integer :: sh, sh2, ni, exa, ext, org_i, org_j, endi -! + double precision :: local_threshold ASSERT (Nint > 0) @@ -1254,47 +1255,51 @@ subroutine H_u_0(v_0,u_0,H_jj,n,keys_tmp,Nint) call sort_dets_ab_v(keys_tmp, sorted, sort_idx, shortcut, version, n, Nint) !$OMP PARALLEL DEFAULT(NONE) & - !$OMP PRIVATE(i,hij,j,k,idx,jj,vt,ii,sh,sh2,ni,exa,ext,org_i,org_j,endi) & + !$OMP PRIVATE(i,hij,j,k,jj,vt,ii,sh,sh2,ni,exa,ext,org_i,org_j,endi,local_threshold,sorted_i)& !$OMP SHARED(n,H_jj,u_0,keys_tmp,Nint,v_0,davidson_threshold,sorted,shortcut,sort_idx,version) - allocate(idx(0:n), vt(n)) + allocate(vt(n)) Vt = 0.d0 !$OMP DO SCHEDULE(dynamic) do sh=1,shortcut(0) - do sh2=1,sh - exa = 0 - do ni=1,Nint - exa += popcnt(xor(version(ni,sh), version(ni,sh2))) - end do - if(exa > 2) then - cycle - end if - - do i=shortcut(sh),shortcut(sh+1)-1 - if(sh==sh2) then - endi = i-1 - else - endi = shortcut(sh2+1)-1 + do sh2=1,sh + exa = 0 + do ni=1,Nint + exa = exa + popcnt(xor(version(ni,sh), version(ni,sh2))) + end do + if(exa > 2) then + cycle end if - do j=shortcut(sh2),endi - ext = exa + do i=shortcut(sh),shortcut(sh+1)-1 + org_i = sort_idx(i) + local_threshold = davidson_threshold - dabs(u_0(org_i)) + if(sh==sh2) then + endi = i-1 + else + endi = shortcut(sh2+1)-1 + end if do ni=1,Nint - ext += popcnt(xor(sorted(ni,i), sorted(ni,j))) - end do - if(ext <= 4) then - org_i = sort_idx(i) + sorted_i(ni) = sorted(ni,i) + enddo + + do j=shortcut(sh2),endi org_j = sort_idx(j) - if ( dabs(u_0(org_j)) + dabs(u_0(org_i)) > davidson_threshold ) then - call i_H_j(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),Nint,hij) - vt (org_i) = vt (org_i) + hij*u_0(org_j) - vt (org_j) = vt (org_j) + hij*u_0(org_i) + if ( dabs(u_0(org_j)) > local_threshold ) then + ext = exa + do ni=1,Nint + ext = ext + popcnt(xor(sorted_i(ni), sorted(ni,j))) + end do + if(ext <= 4) then + call i_H_j(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),Nint,hij) + vt (org_i) = vt (org_i) + hij*u_0(org_j) + vt (org_j) = vt (org_j) + hij*u_0(org_i) + endif endif - endif + enddo enddo enddo enddo - enddo !$OMP END DO !$OMP CRITICAL @@ -1302,30 +1307,31 @@ subroutine H_u_0(v_0,u_0,H_jj,n,keys_tmp,Nint) v_0(i) = v_0(i) + vt(i) enddo !$OMP END CRITICAL - - deallocate(idx,vt) + + deallocate(vt) !$OMP END PARALLEL - + call sort_dets_ba_v(keys_tmp, sorted, sort_idx, shortcut, version, n, Nint) - + !$OMP PARALLEL DEFAULT(NONE) & - !$OMP PRIVATE(i,hij,j,k,idx,jj,vt,ii,sh,sh2,ni,exa,ext,org_i,org_j,endi) & + !$OMP PRIVATE(i,hij,j,k,jj,vt,ii,sh,sh2,ni,exa,ext,org_i,org_j,endi,local_threshold)& !$OMP SHARED(n,H_jj,u_0,keys_tmp,Nint,v_0,davidson_threshold,sorted,shortcut,sort_idx,version) - allocate(idx(0:n), vt(n)) + allocate(vt(n)) Vt = 0.d0 !$OMP DO SCHEDULE(dynamic) do sh=1,shortcut(0) do i=shortcut(sh),shortcut(sh+1)-1 + local_threshold = davidson_threshold - dabs(u_0(org_i)) + org_i = sort_idx(i) do j=shortcut(sh),i-1 - ext = 0 - do ni=1,Nint - ext += popcnt(xor(sorted(ni,i), sorted(ni,j))) - end do - if(ext == 4) then - org_i = sort_idx(i) - org_j = sort_idx(j) - if ( dabs(u_0(org_j)) + dabs(u_0(org_i)) > davidson_threshold ) then + org_j = sort_idx(j) + if ( dabs(u_0(org_j)) > local_threshold ) then + ext = 0 + do ni=1,Nint + ext = ext + popcnt(xor(sorted(ni,i), sorted(ni,j))) + end do + if(ext == 4) then call i_H_j(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),Nint,hij) vt (org_i) = vt (org_i) + hij*u_0(org_j) vt (org_j) = vt (org_j) + hij*u_0(org_i) @@ -1341,9 +1347,9 @@ subroutine H_u_0(v_0,u_0,H_jj,n,keys_tmp,Nint) v_0(i) = v_0(i) + vt(i) enddo !$OMP END CRITICAL - deallocate(idx,vt) + deallocate(vt) !$OMP END PARALLEL - + do i=1,n v_0(i) += H_jj(i) * u_0(i) enddo