diff --git a/plugins/Full_CI/full_ci.irp.f b/plugins/Full_CI/full_ci.irp.f index 7070bd13..70a6ec43 100644 --- a/plugins/Full_CI/full_ci.irp.f +++ b/plugins/Full_CI/full_ci.irp.f @@ -100,8 +100,8 @@ program full_ci print *, 'N_det = ', N_det print *, 'N_states = ', N_states print *, 'PT2 = ', pt2 - print *, 'E = ', CI_energy - print *, 'E+PT2 = ', CI_energy+pt2 + print *, 'E = ', CI_energy(1:N_states) + print *, 'E+PT2 = ', CI_energy(1:N_states)+pt2(1:N_states) print *, '-----' call ezfio_set_full_ci_energy_pt2(CI_energy(1)+pt2(1)) endif diff --git a/plugins/Full_CI_ZMQ/selection_double.irp.f b/plugins/Full_CI_ZMQ/selection_double.irp.f index 83418307..3e602c21 100644 --- a/plugins/Full_CI_ZMQ/selection_double.irp.f +++ b/plugins/Full_CI_ZMQ/selection_double.irp.f @@ -425,19 +425,19 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) if(lbanned(putj, ma) .or. banned(putj, puti,bant)) cycle hij = (integral8(p1, p2, putj, hfix)-integral8(p2,p1,putj,hfix)) * get_phase_bi(phasemask, ma, ma, putj, p1, hfix, p2) !call debug_hij(hij, gen, mask, mi, ma, puti, putj) - tmp_row(:,putj) += hij * coefs + tmp_row(1:N_states,putj) += hij * coefs(1:N_states) end do do putj=hfix+1, mo_tot_num if(lbanned(putj, ma) .or. banned(putj, puti,bant)) cycle hij = (integral8(p1, p2, hfix, putj)-integral8(p2,p1,hfix,putj)) * get_phase_bi(phasemask, ma, ma, hfix, p1, putj, p2) !call debug_hij(hij, gen, mask, mi, ma, puti, putj) - tmp_row(:,putj) += hij * coefs + tmp_row(1:N_states,putj) += hij * coefs(1:N_states) end do if(ma == 1) then - mat(:,:,puti) += tmp_row(:,:) + mat(1:N_states,1:mo_tot_num,puti) += tmp_row(1:N_states,1:mo_tot_num) else - mat(:,puti,:) += tmp_row(:,:) + mat(1:N_states,puti,1:mo_tot_num) += tmp_row(1:N_states,1:mo_tot_num) end if end if @@ -585,12 +585,12 @@ subroutine get_d0(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) call apply_particles(mask, 1,p1,2,p2, det, ok, N_int) !call assert(ok, "zsdq") call i_h_j(gen, det, N_int, hij) - mat(:, p1, p2) += coefs * hij + mat(:, p1, p2) += coefs(:) * hij else hij = integral8(p1, p2, h1, h2) * get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2) phase = get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2) !call debug_hij(hij, gen, mask, 1, 2, p1, p2) - mat(:, p1, p2) += coefs * hij + mat(:, p1, p2) += coefs(:) * hij end if end do end do @@ -605,10 +605,10 @@ subroutine get_d0(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) if(puti == p1 .or. putj == p2 .or. puti == p2 .or. putj == p1) then call apply_particles(mask, sp,puti,sp,putj, det, ok, N_int) call i_h_j(gen, det, N_int, hij) - mat(:, puti, putj) += coefs * hij + mat(:, puti, putj) += coefs(:) * hij else hij = (integral8(p1, p2, puti, putj) - integral8(p2, p1, puti, putj))* get_phase_bi(phasemask, sp, sp, puti, p1 , putj, p2) - mat(:, puti, putj) += coefs * hij + mat(:, puti, putj) += coefs(:) * hij !call debug_hij(hij, gen, mask, sp, sp, puti, putj) end if end do diff --git a/scripts/generate_h_apply.py b/scripts/generate_h_apply.py index d90a0e92..bd66611b 100755 --- a/scripts/generate_h_apply.py +++ b/scripts/generate_h_apply.py @@ -439,13 +439,13 @@ class H_apply_zmq(H_apply): enddo """ % (self.energy) self.data["copy_buffer"] = """ - do i=1,N_det_generators - do k=1,N_st - pt2(k) = pt2(k) + pt2_generators(k,i) - norm_pert(k) = norm_pert(k) + norm_pert_generators(k,i) - H_pert_diag(k) = H_pert_diag(k) + H_pert_diag_generators(k,i) + do i=1,N_det_generators + do k=1,N_st + pt2(k) = pt2(k) + pt2_generators(k,i) + norm_pert(k) = norm_pert(k) + norm_pert_generators(k,i) + H_pert_diag(k) = H_pert_diag(k) + H_pert_diag_generators(k,i) + enddo enddo - enddo """ def set_selection_pt2(self,pert): diff --git a/src/Davidson/diagonalize_CI.irp.f b/src/Davidson/diagonalize_CI.irp.f index e6b230b5..8e9a0543 100644 --- a/src/Davidson/diagonalize_CI.irp.f +++ b/src/Davidson/diagonalize_CI.irp.f @@ -55,12 +55,13 @@ END_PROVIDER if (diag_algorithm == "Davidson") then - call davidson_diag(psi_det,CI_eigenvectors,CI_electronic_energy,& - size(CI_eigenvectors,1),N_det,N_states,N_states_diag,N_int,output_determinants) + call davidson_diag_HS2(psi_det,CI_eigenvectors, & + size(CI_eigenvectors,1),CI_electronic_energy, & + N_det,N_states,N_states_diag,N_int,output_determinants) call u_0_S2_u_0(CI_eigenvectors_s2,CI_eigenvectors,N_det,psi_det,N_int,& N_states_diag,size(CI_eigenvectors,1)) - + else if (diag_algorithm == "Lapack") then diff --git a/src/Davidson/diagonalize_CI_mono.irp.f b/src/Davidson/diagonalize_CI_mono.irp.f index b45d03d6..1de9a211 100644 --- a/src/Davidson/diagonalize_CI_mono.irp.f +++ b/src/Davidson/diagonalize_CI_mono.irp.f @@ -34,7 +34,7 @@ i_state = 0 if (s2_eig) then do j=1,N_det - call get_s2_u0(psi_det,eigenvectors(1,j),N_det,N_det,s2) + call get_s2_u0(psi_det,eigenvectors(1,j),N_det,s2,N_det) if(dabs(s2-expected_s2).le.0.3d0)then print*,'j = ',j print*,'e = ',eigenvalues(j) @@ -54,7 +54,7 @@ enddo else do j=1,N_states_diag - call get_s2_u0(psi_det,eigenvectors(1,j),N_det,N_det,s2) + call get_s2_u0(psi_det,eigenvectors(1,j),N_det,s2,N_det) if(dabs(eigenvectors(1,j)).gt.0.9d0)then i_state += 1 do i=1,N_det diff --git a/src/Davidson/u0Hu0.irp.f b/src/Davidson/u0Hu0.irp.f index 0339c31f..daef3ee5 100644 --- a/src/Davidson/u0Hu0.irp.f +++ b/src/Davidson/u0Hu0.irp.f @@ -174,3 +174,158 @@ BEGIN_PROVIDER [ double precision, psi_energy, (N_states) ] call u_0_H_u_0(psi_energy,psi_coef,N_det,psi_det,N_int,N_states,psi_det_size) END_PROVIDER + + + + +subroutine H_S2_u_0_nstates(v_0,s_0,u_0,H_jj,S2_jj,n,keys_tmp,Nint,N_st,sze_8) + use bitmasks + implicit none + BEGIN_DOC + ! Computes v_0 = H|u_0> and s_0 = S^2 |u_0> + ! + ! n : number of determinants + ! + ! H_jj : array of + ! + ! S2_jj : array of + END_DOC + integer, intent(in) :: N_st,n,Nint, sze_8 + double precision, intent(out) :: v_0(sze_8,N_st), s_0(sze_8,N_st) + double precision, intent(in) :: u_0(sze_8,N_st) + double precision, intent(in) :: H_jj(n), S2_jj(n) + integer(bit_kind),intent(in) :: keys_tmp(Nint,2,n) + double precision :: hij,s2 + double precision, allocatable :: vt(:,:), ut(:,:), st(:,:) + integer :: i,j,k,l, jj,ii + integer :: i0, j0 + + integer, allocatable :: shortcut(:,:), sort_idx(:,:) + integer(bit_kind), allocatable :: sorted(:,:,:), version(:,:,:) + integer(bit_kind) :: sorted_i(Nint) + + integer :: sh, sh2, ni, exa, ext, org_i, org_j, endi, istate + integer :: N_st_8 + + integer, external :: align_double + !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: vt, ut + + N_st_8 = align_double(N_st) + + ASSERT (Nint > 0) + ASSERT (Nint == N_int) + ASSERT (n>0) + PROVIDE ref_bitmask_energy + + allocate (shortcut(0:n+1,2), sort_idx(n,2), sorted(Nint,n,2), version(Nint,n,2)) + allocate(ut(N_st_8,n)) + + v_0 = 0.d0 + s_0 = 0.d0 + + do i=1,n + do istate=1,N_st + ut(istate,i) = u_0(i,istate) + enddo + enddo + + call sort_dets_ab_v(keys_tmp, sorted(1,1,1), sort_idx(1,1), shortcut(0,1), version(1,1,1), n, Nint) + call sort_dets_ba_v(keys_tmp, sorted(1,1,2), sort_idx(1,2), shortcut(0,2), version(1,1,2), n, Nint) + + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP PRIVATE(i,hij,s2,j,k,jj,vt,st,ii,sh,sh2,ni,exa,ext,org_i,org_j,endi,sorted_i,istate)& + !$OMP SHARED(n,keys_tmp,ut,Nint,v_0,s_0,sorted,shortcut,sort_idx,version,N_st,N_st_8) + allocate(vt(N_st_8,n),st(N_st_8,n)) + Vt = 0.d0 + St = 0.d0 + + !$OMP DO SCHEDULE(dynamic) + do sh=1,shortcut(0,1) + do sh2=sh,shortcut(0,1) + exa = 0 + do ni=1,Nint + exa = exa + popcnt(xor(version(ni,sh,1), version(ni,sh2,1))) + end do + if(exa > 2) then + cycle + end if + + do i=shortcut(sh,1),shortcut(sh+1,1)-1 + org_i = sort_idx(i,1) + if(sh==sh2) then + endi = i-1 + else + endi = shortcut(sh2+1,1)-1 + end if + do ni=1,Nint + sorted_i(ni) = sorted(ni,i,1) + enddo + + do j=shortcut(sh2,1),endi + org_j = sort_idx(j,1) + ext = exa + do ni=1,Nint + ext = ext + popcnt(xor(sorted_i(ni), sorted(ni,j,1))) + end do + if(ext <= 4) then + call i_h_j (keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,hij) + call get_s2(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,s2) + do istate=1,n_st + vt (istate,org_i) = vt (istate,org_i) + hij*ut(istate,org_j) + vt (istate,org_j) = vt (istate,org_j) + hij*ut(istate,org_i) + st (istate,org_i) = st (istate,org_i) + s2*ut(istate,org_j) + st (istate,org_j) = st (istate,org_j) + s2*ut(istate,org_i) + enddo + endif + enddo + enddo + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP DO SCHEDULE(dynamic) + do sh=1,shortcut(0,2) + do i=shortcut(sh,2),shortcut(sh+1,2)-1 + org_i = sort_idx(i,2) + do j=shortcut(sh,2),i-1 + org_j = sort_idx(j,2) + ext = 0 + do ni=1,Nint + ext = ext + popcnt(xor(sorted(ni,i,2), sorted(ni,j,2))) + end do + if(ext == 4) then + call i_h_j (keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,hij) + call get_s2(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,s2) + do istate=1,n_st + vt (istate,org_i) = vt (istate,org_i) + hij*ut(istate,org_j) + vt (istate,org_j) = vt (istate,org_j) + hij*ut(istate,org_i) + st (istate,org_i) = st (istate,org_i) + s2*ut(istate,org_j) + st (istate,org_j) = st (istate,org_j) + s2*ut(istate,org_i) + enddo + end if + end do + end do + enddo + !$OMP END DO NOWAIT + + !$OMP CRITICAL + do istate=1,N_st + do i=n,1,-1 + v_0(i,istate) = v_0(i,istate) + vt(istate,i) + s_0(i,istate) = s_0(i,istate) + st(istate,i) + enddo + enddo + !$OMP END CRITICAL + + deallocate(vt,st) + !$OMP END PARALLEL + + do istate=1,N_st + do i=1,n + v_0(i,istate) = v_0(i,istate) + H_jj(i) * u_0(i,istate) + s_0(i,istate) = s_0(i,istate) + s2_jj(i)* u_0(i,istate) + enddo + enddo + deallocate (shortcut, sort_idx, sorted, version, ut) +end + diff --git a/src/Determinants/s2.irp.f b/src/Determinants/s2.irp.f index a9722df7..c6bb8390 100644 --- a/src/Determinants/s2.irp.f +++ b/src/Determinants/s2.irp.f @@ -1,4 +1,4 @@ -subroutine get_s2(key_i,key_j,s2,Nint) +subroutine get_s2(key_i,key_j,Nint,s2) implicit none use bitmasks BEGIN_DOC @@ -189,7 +189,7 @@ subroutine S2_u_0_nstates(v_0,u_0,n,keys_tmp,Nint,N_st,sze_8) ext = ext + popcnt(xor(sorted_i(ni), sorted(ni,j,1))) end do if(ext <= 4) then - call get_s2(keys_tmp(1,1,org_i),keys_tmp(1,1,org_j),s2_tmp,Nint) + call get_s2(keys_tmp(1,1,org_i),keys_tmp(1,1,org_j),Nint,s2_tmp) do istate=1,N_st vt (org_i,istate) = vt (org_i,istate) + s2_tmp*u_0(org_j,istate) vt (org_j,istate) = vt (org_j,istate) + s2_tmp*u_0(org_i,istate) @@ -212,7 +212,7 @@ subroutine S2_u_0_nstates(v_0,u_0,n,keys_tmp,Nint,N_st,sze_8) ext = ext + popcnt(xor(sorted(ni,i,2), sorted(ni,j,2))) end do if(ext == 4) then - call get_s2(keys_tmp(1,1,org_i),keys_tmp(1,1,org_j),s2_tmp,Nint) + call get_s2(keys_tmp(1,1,org_i),keys_tmp(1,1,org_j),Nint,s2_tmp) do istate=1,N_st vt (org_i,istate) = vt (org_i,istate) + s2_tmp*u_0(org_j,istate) vt (org_j,istate) = vt (org_j,istate) + s2_tmp*u_0(org_i,istate) @@ -235,7 +235,7 @@ subroutine S2_u_0_nstates(v_0,u_0,n,keys_tmp,Nint,N_st,sze_8) !$OMP END PARALLEL do i=1,n - call get_s2(keys_tmp(1,1,i),keys_tmp(1,1,i),s2_tmp,Nint) + call get_s2(keys_tmp(1,1,i),keys_tmp(1,1,i),Nint,s2_tmp) do istate=1,N_st v_0(i,istate) += s2_tmp * u_0(i,istate) enddo @@ -275,12 +275,12 @@ subroutine get_uJ_s2_uI(psi_keys_tmp,psi_coefs_tmp,n,nmax_coefs,nmax_keys,s2,nst allocate(idx(0:n)) !$OMP DO SCHEDULE(dynamic) do i = n,1,-1 ! Better OMP scheduling - call get_s2(psi_keys_tmp(1,1,i),psi_keys_tmp(1,1,i),s2_tmp,N_int) + call get_s2(psi_keys_tmp(1,1,i),psi_keys_tmp(1,1,i),N_int,s2_tmp) accu += psi_coefs_tmp(i,ll) * s2_tmp * psi_coefs_tmp(i,jj) call filter_connected(psi_keys_tmp,psi_keys_tmp(1,1,i),N_int,i-1,idx) do kk=1,idx(0) j = idx(kk) - call get_s2(psi_keys_tmp(1,1,i),psi_keys_tmp(1,1,j),s2_tmp,N_int) + call get_s2(psi_keys_tmp(1,1,i),psi_keys_tmp(1,1,j),N_int,s2_tmp) accu += psi_coefs_tmp(i,ll) * s2_tmp * psi_coefs_tmp(j,jj) + psi_coefs_tmp(i,jj) * s2_tmp * psi_coefs_tmp(j,ll) enddo enddo @@ -418,3 +418,58 @@ subroutine diagonalize_s2_betweenstates(keys_tmp,u_0,n,nmax_keys,nmax_coefs,nsta end +subroutine i_S2_psi_minilist(key,keys,idx_key,N_minilist,coef,Nint,Ndet,Ndet_max,Nstate,i_S2_psi_array) + use bitmasks + implicit none + integer, intent(in) :: Nint, Ndet,Ndet_max,Nstate,idx_key(Ndet), N_minilist + integer(bit_kind), intent(in) :: keys(Nint,2,Ndet) + integer(bit_kind), intent(in) :: key(Nint,2) + double precision, intent(in) :: coef(Ndet_max,Nstate) + double precision, intent(out) :: i_S2_psi_array(Nstate) + + integer :: i, ii,j, i_in_key, i_in_coef + double precision :: phase + integer :: exc(0:2,2,2) + double precision :: s2ij + integer :: idx(0:Ndet) + BEGIN_DOC +! Computes = \sum_J c_J . +! +! Uses filter_connected_i_H_psi0 to get all the |J> to which |i> +! is connected. The |J> are searched in short pre-computed lists. + END_DOC + + ASSERT (Nint > 0) + ASSERT (N_int == Nint) + ASSERT (Nstate > 0) + ASSERT (Ndet > 0) + ASSERT (Ndet_max >= Ndet) + i_S2_psi_array = 0.d0 + + call filter_connected_i_H_psi0(keys,key,Nint,N_minilist,idx) + if (Nstate == 1) then + + do ii=1,idx(0) + i_in_key = idx(ii) + i_in_coef = idx_key(idx(ii)) + !DIR$ FORCEINLINE + call get_s2(keys(1,1,i_in_key),key,Nint,s2ij) + ! TODO : Cache misses + i_S2_psi_array(1) = i_S2_psi_array(1) + coef(i_in_coef,1)*s2ij + enddo + + else + + do ii=1,idx(0) + i_in_key = idx(ii) + i_in_coef = idx_key(idx(ii)) + !DIR$ FORCEINLINE + call get_s2(keys(1,1,i_in_key),key,Nint,s2ij) + do j = 1, Nstate + i_S2_psi_array(j) = i_S2_psi_array(j) + coef(i_in_coef,j)*s2ij + enddo + enddo + + endif + +end