From a67225a1a33985e3d1abd8f74e12080c3ca39550 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Mon, 2 Jun 2014 18:21:17 +0200 Subject: [PATCH 1/2] Accelerated det_connections --- scripts/generate_h_apply.py | 6 +- src/Bitmask/README.rst | 12 -- src/Dets/H_apply_template.f | 2 + src/Dets/README.rst | 21 ++- src/Dets/slater_rules.irp.f | 229 ++++++++++++++++++++++--------- src/Full_CI/full_ci.irp.f | 2 +- src/MonoInts/README.rst | 18 +-- src/README.rst | 31 ++--- src/SC2/SC2.irp.f | 32 ++--- src/SC2/diagonalize_CI_SC2.irp.f | 6 +- 10 files changed, 221 insertions(+), 138 deletions(-) diff --git a/scripts/generate_h_apply.py b/scripts/generate_h_apply.py index 4e5eda1d..6d3a207d 100755 --- a/scripts/generate_h_apply.py +++ b/scripts/generate_h_apply.py @@ -118,6 +118,7 @@ class H_apply(object): double precision :: sum_H_pert_diag(N_st) double precision, allocatable :: e_2_pert_buffer(:,:) double precision, allocatable :: coef_pert_buffer(:,:) + !$ call omp_init_lock(lck) ASSERT (Nint == N_int) """ self.data["init_thread"] = """ @@ -130,13 +131,13 @@ class H_apply(object): """ self.data["deinit_thread"] = """ - !$OMP CRITICAL + !$ call omp_set_lock(lck) do k=1,N_st sum_e_2_pert_in(k) = sum_e_2_pert_in(k) + sum_e_2_pert(k) sum_norm_pert_in(k) = sum_norm_pert_in(k) + sum_norm_pert(k) sum_H_pert_diag_in(k) = sum_H_pert_diag_in(k) + sum_H_pert_diag(k) enddo - !$OMP END CRITICAL + !$ call omp_unset_lock(lck) deallocate (e_2_pert_buffer, coef_pert_buffer) """ self.data["size_max"] = "256" @@ -148,6 +149,7 @@ class H_apply(object): sum_norm_pert,sum_H_pert_diag,N_st,N_int) """%(pert,) self.data["finalization"] = """ + !$ call omp_destroy_lock(lck) """ self.data["copy_buffer"] = "" self.data["generate_psi_guess"] = "" diff --git a/src/Bitmask/README.rst b/src/Bitmask/README.rst index 772c5b1c..4169d164 100644 --- a/src/Bitmask/README.rst +++ b/src/Bitmask/README.rst @@ -57,27 +57,15 @@ Documentation `full_ijkl_bitmask `_ Bitmask to include all possible MOs -`generators_bitmask `_ - Bitmasks for generator determinants. (N_int, alpha/beta, hole/particle, generator) - `hf_bitmask `_ Hartree Fock bit mask -`n_generators_bitmask `_ - Number of bitmasks for generators - `n_int `_ Number of 64-bit integers needed to represent determinants as binary strings -`n_reference_bitmask `_ - Number of bitmasks for reference - `ref_bitmask `_ Reference bit mask, used in Slater rules, chosen as Hartree-Fock bitmask -`reference_bitmask `_ - Bitmasks for reference determinants. (N_int, alpha/beta, hole/particle, reference) - `bitstring_to_hexa `_ Transform a bit string to a string in hexadecimal format for printing diff --git a/src/Dets/H_apply_template.f b/src/Dets/H_apply_template.f index 5c6ce54a..78c95f5f 100644 --- a/src/Dets/H_apply_template.f +++ b/src/Dets/H_apply_template.f @@ -28,6 +28,7 @@ subroutine $subroutine_diexc(key_in, hole_1,particl_1, hole_2, particl_2, i_gene integer, allocatable :: ia_ja_pairs(:,:,:) double precision :: diag_H_mat_elem integer :: iproc + integer(omp_lock_kind) :: lck PROVIDE H_apply_threshold $initialization @@ -245,6 +246,7 @@ subroutine $subroutine_monoexc(key_in, hole_1,particl_1,i_generator $parameters integer, allocatable :: ia_ja_pairs(:,:,:) double precision :: diag_H_mat_elem integer :: iproc + integer(omp_lock_kind) :: lck PROVIDE H_apply_threshold $initialization diff --git a/src/Dets/README.rst b/src/Dets/README.rst index 349a545b..208042cf 100644 --- a/src/Dets/README.rst +++ b/src/Dets/README.rst @@ -133,36 +133,33 @@ Documentation `n_det `_ Number of determinants in the wave function -`n_det_max_jacobi `_ - Maximum number of determinants diagonalized my jacobi - -`n_det_reference `_ +`n_det_reference `_ Number of determinants in the reference wave function `n_states `_ Number of states to consider -`psi_average_norm_contrib `_ +`psi_average_norm_contrib `_ Contribution of determinants to the state-averaged density -`psi_average_norm_contrib_sorted `_ +`psi_average_norm_contrib_sorted `_ Wave function sorted by determinants (state-averaged) -`psi_coef `_ +`psi_coef `_ The wave function. Initialized with Hartree-Fock if the EZFIO file is empty -`psi_coef_sorted `_ +`psi_coef_sorted `_ Wave function sorted by determinants (state-averaged) -`psi_det `_ +`psi_det `_ The wave function. Initialized with Hartree-Fock if the EZFIO file is empty -`psi_det_size `_ +`psi_det_size `_ Size of the psi_det/psi_coef arrays -`psi_det_sorted `_ +`psi_det_sorted `_ Wave function sorted by determinants (state-averaged) `double_exc_bitmask `_ @@ -301,7 +298,7 @@ Documentation Returns where i and j are determinants `i_h_psi `_ - for the various Nstates + for the various Nstate `i_h_psi_sc2 `_ for the various Nstate diff --git a/src/Dets/slater_rules.irp.f b/src/Dets/slater_rules.irp.f index 62206b59..741a1e01 100644 --- a/src/Dets/slater_rules.irp.f +++ b/src/Dets/slater_rules.irp.f @@ -2,7 +2,7 @@ subroutine get_excitation_degree(key1,key2,degree,Nint) use bitmasks implicit none BEGIN_DOC -! Returns the excitation degree between two determinants + ! Returns the excitation degree between two determinants END_DOC integer, intent(in) :: Nint integer(bit_kind), intent(in) :: key1(Nint,2) @@ -10,7 +10,7 @@ subroutine get_excitation_degree(key1,key2,degree,Nint) integer, intent(out) :: degree integer :: l - + ASSERT (Nint > 0) degree = popcnt(xor( key1(1,1), key2(1,1))) + & @@ -31,7 +31,7 @@ subroutine get_excitation(det1,det2,exc,degree,phase,Nint) use bitmasks implicit none BEGIN_DOC -! Returns the excitation operators between two determinants and the phase + ! Returns the excitation operators between two determinants and the phase END_DOC integer, intent(in) :: Nint integer(bit_kind), intent(in) :: det1(Nint,2) @@ -87,7 +87,7 @@ subroutine decode_exc(exc,degree,h1,p1,h2,p2,s1,s2) integer, intent(out) :: h1,h2,p1,p2,s1,s2 ASSERT (degree > 0) ASSERT (degree < 3) - + select case(degree) case(2) if (exc(0,1,1) == 2) then @@ -142,7 +142,7 @@ subroutine get_double_excitation(det1,det2,exc,phase,Nint) use bitmasks implicit none BEGIN_DOC -! Returns the two excitation operators between two doubly excited determinants and the phase + ! Returns the two excitation operators between two doubly excited determinants and the phase END_DOC integer, intent(in) :: Nint integer(bit_kind), intent(in) :: det1(Nint,2) @@ -275,7 +275,7 @@ subroutine get_mono_excitation(det1,det2,exc,phase,Nint) use bitmasks implicit none BEGIN_DOC -! Returns the excitation operator between two singly excited determinants and the phase + ! Returns the excitation operator between two singly excited determinants and the phase END_DOC integer, intent(in) :: Nint integer(bit_kind), intent(in) :: det1(Nint,2) @@ -356,7 +356,7 @@ subroutine i_H_j(key_i,key_j,Nint,hij) use bitmasks implicit none BEGIN_DOC -! Returns where i and j are determinants + ! Returns where i and j are determinants END_DOC integer, intent(in) :: Nint integer(bit_kind), intent(in) :: key_i(Nint,2), key_j(Nint,2) @@ -528,16 +528,16 @@ subroutine i_H_psi_SC2(key,keys,coef,Nint,Ndet,Ndet_max,Nstate,i_H_psi_array,idx use bitmasks BEGIN_DOC ! for the various Nstate - ! - ! returns in addition + ! + ! returns in addition ! ! the array of the index of the non connected determinants to key1 ! ! in order to know what double excitation can be repeated on key1 ! - ! idx_repeat(0) is the number of determinants that can be used - ! - ! to repeat the excitations + ! idx_repeat(0) is the number of determinants that can be used + ! + ! to repeat the excitations END_DOC implicit none integer, intent(in) :: Nint, Ndet,Ndet_max,Nstate @@ -576,7 +576,7 @@ subroutine get_excitation_degree_vector(key1,key2,degree,Nint,sze,idx) use bitmasks implicit none BEGIN_DOC -! Applies get_excitation_degree to an array of determinants + ! Applies get_excitation_degree to an array of determinants END_DOC integer, intent(in) :: Nint, sze integer(bit_kind), intent(in) :: key1(Nint,2,sze) @@ -588,7 +588,7 @@ subroutine get_excitation_degree_vector(key1,key2,degree,Nint,sze,idx) ASSERT (Nint > 0) ASSERT (sze > 0) - + l=1 if (Nint==1) then @@ -659,7 +659,7 @@ end double precision function diag_H_mat_elem(det_in,Nint) implicit none BEGIN_DOC -! Computes + ! Computes END_DOC integer,intent(in) :: Nint integer(bit_kind),intent(in) :: det_in(Nint,2) @@ -671,7 +671,7 @@ double precision function diag_H_mat_elem(det_in,Nint) integer :: occ_hole(Nint*bit_kind_size,2) integer(bit_kind) :: det_tmp(Nint,2) integer :: na, nb - + ASSERT (Nint > 0) ASSERT (sum(popcnt(det_in(:,1))) == elec_alpha_num) ASSERT (sum(popcnt(det_in(:,2))) == elec_beta_num) @@ -688,13 +688,13 @@ double precision function diag_H_mat_elem(det_in,Nint) nexc(1) += popcnt(hole(i,1)) nexc(2) += popcnt(hole(i,2)) enddo - + diag_H_mat_elem = ref_bitmask_energy if (nexc(1)+nexc(2) == 0) then return endif -!call debug_det(det_in,Nint) + !call debug_det(det_in,Nint) integer :: tmp call bitstring_to_list(particle(1,1), occ_particle(1,1), tmp, Nint) ASSERT (tmp == nexc(1)) @@ -722,7 +722,7 @@ subroutine a_operator(iorb,ispin,key,hjj,Nint,na,nb) use bitmasks implicit none BEGIN_DOC -! Needed for diag_H_mat_elem + ! Needed for diag_H_mat_elem END_DOC integer, intent(in) :: iorb, ispin, Nint integer, intent(inout) :: na, nb @@ -737,7 +737,7 @@ subroutine a_operator(iorb,ispin,key,hjj,Nint,na,nb) ASSERT (ispin > 0) ASSERT (ispin < 3) ASSERT (Nint > 0) - + k = ishft(iorb-1,-bit_kind_shift)+1 ASSERT (k > 0) l = iorb - ishft(k-1,bit_kind_shift)-1 @@ -767,7 +767,7 @@ subroutine ac_operator(iorb,ispin,key,hjj,Nint,na,nb) use bitmasks implicit none BEGIN_DOC -! Needed for diag_H_mat_elem + ! Needed for diag_H_mat_elem END_DOC integer, intent(in) :: iorb, ispin, Nint integer, intent(inout) :: na, nb @@ -779,10 +779,10 @@ subroutine ac_operator(iorb,ispin,key,hjj,Nint,na,nb) integer :: k,l,i ASSERT (iorb > 0) - ASSERT (ispin > 0) - ASSERT (ispin < 3) + ASSERT (ispin > 0) + ASSERT (ispin < 3) ASSERT (Nint > 0) - + integer :: tmp !DIR$ FORCEINLINE call bitstring_to_list(key(1,1), occ(1,1), tmp, Nint) @@ -815,7 +815,7 @@ subroutine get_occ_from_key(key,occ,Nint) use bitmasks implicit none BEGIN_DOC -! Returns a list of occupation numbers from a bitstring + ! Returns a list of occupation numbers from a bitstring END_DOC integer(bit_kind), intent(in) :: key(Nint,2) integer , intent(in) :: Nint @@ -831,11 +831,11 @@ subroutine H_u_0(v_0,u_0,H_jj,n,keys_tmp,Nint) use bitmasks implicit none BEGIN_DOC -! Computes v_0 = H|u_0> -! -! n : number of determinants -! -! H_jj : array of + ! Computes v_0 = H|u_0> + ! + ! n : number of determinants + ! + ! H_jj : array of END_DOC integer, intent(in) :: n,Nint double precision, intent(out) :: v_0(n) @@ -849,11 +849,11 @@ subroutine H_u_0(v_0,u_0,H_jj,n,keys_tmp,Nint) integer :: i0, j0 ASSERT (Nint > 0) ASSERT (Nint == N_int) - ASSERT (n>0) + ASSERT (n>0) PROVIDE ref_bitmask_energy - integer, parameter :: block_size = 157 + integer, parameter :: block_size = 157 !$OMP PARALLEL DEFAULT(NONE) & - !$OMP PRIVATE(i,hij,j,k,idx,jj,vt) & + !$OMP PRIVATE(i,hij,j,k,idx,jj,vt) & !$OMP SHARED(n,H_jj,u_0,keys_tmp,Nint,v_0) !$OMP DO SCHEDULE(static) do i=1,n @@ -864,23 +864,23 @@ subroutine H_u_0(v_0,u_0,H_jj,n,keys_tmp,Nint) Vt = 0.d0 !$OMP DO SCHEDULE(guided) do i=1,n - idx(0) = i - call filter_connected_davidson(keys_tmp,keys_tmp(1,1,i),Nint,i-1,idx) - do jj=1,idx(0) - j = idx(jj) - if ( (dabs(u_0(j)) > 1.d-7).or.((dabs(u_0(i)) > 1.d-7)) ) then - call i_H_j(keys_tmp(1,1,j),keys_tmp(1,1,i),Nint,hij) - vt (i) = vt (i) + hij*u_0(j) - vt (j) = vt (j) + hij*u_0(i) - endif - enddo + idx(0) = i + call filter_connected_davidson(keys_tmp,keys_tmp(1,1,i),Nint,i-1,idx) + do jj=1,idx(0) + j = idx(jj) + if ( (dabs(u_0(j)) > 1.d-7).or.((dabs(u_0(i)) > 1.d-7)) ) then + call i_H_j(keys_tmp(1,1,j),keys_tmp(1,1,i),Nint,hij) + vt (i) = vt (i) + hij*u_0(j) + vt (j) = vt (j) + hij*u_0(i) + endif + enddo enddo !$OMP END DO !$OMP CRITICAL do i=1,n - v_0(i) = v_0(i) + vt(i) + v_0(i) = v_0(i) + vt(i) enddo - !$OMP END CRITICAL + !$OMP END CRITICAL deallocate(idx,vt) !$OMP END PARALLEL end @@ -904,30 +904,125 @@ BEGIN_PROVIDER [ integer*8, det_connections, (N_con_int,N_det) ] integer :: degree integer :: j_int, j_k, j_l integer, allocatable :: idx(:) - !$OMP PARALLEL DEFAULT (NONE) & - !$OMP SHARED(N_det, N_con_int, psi_det,N_int, det_connections) & - !$OMP PRIVATE(i,j_int,j_k,j_l,j,degree,idx) - allocate (idx(0:N_det)) - !$OMP DO SCHEDULE(guided) - do i=1,N_det - do j_int=1,N_con_int - det_connections(j_int,i) = 0_8 - j_k = ishft(j_int-1,11) - do j_l = j_k,min(j_k+2047,N_det), 32 - do j = j_l+1,min(j_l+32,i) - !DIR$ FORCEINLINE - call get_excitation_degree(psi_det(1,1,i),psi_det(1,1,j),degree,N_int) - if (degree < 3) then - det_connections(j_int,i) = ibset( det_connections(j_int,i), iand(63,ishft(j_l,-5)) ) - exit - endif + + select case(N_int) + + case(1) + + !$OMP PARALLEL DEFAULT (NONE) & + !$OMP SHARED(N_det, N_con_int, psi_det,N_int, det_connections)& + !$OMP PRIVATE(i,j_int,j_k,j_l,j,degree,idx) + allocate (idx(0:N_det)) + !$OMP DO SCHEDULE(guided) + do i=1,N_det + do j_int=1,N_con_int + det_connections(j_int,i) = 0_8 + j_k = ishft(j_int-1,11) + do j_l = j_k,min(j_k+2047,N_det), 32 + do j = j_l+1,min(j_l+32,i) + degree = popcnt(xor( psi_det(1,1,i),psi_det(1,1,j))) + & + popcnt(xor( psi_det(1,2,i),psi_det(1,2,j))) + if (degree < 5) then + det_connections(j_int,i) = ibset( det_connections(j_int,i), iand(63,ishft(j_l,-5)) ) + exit + endif + enddo + enddo enddo enddo - enddo - enddo - !$OMP ENDDO - deallocate(idx) - !$OMP ENDPARALLEL + !$OMP ENDDO + deallocate(idx) + !$OMP END PARALLEL + + case(2) + + !$OMP PARALLEL DEFAULT (NONE) & + !$OMP SHARED(N_det, N_con_int, psi_det,N_int, det_connections)& + !$OMP PRIVATE(i,j_int,j_k,j_l,j,degree,idx) + allocate (idx(0:N_det)) + !$OMP DO SCHEDULE(guided) + do i=1,N_det + do j_int=1,N_con_int + det_connections(j_int,i) = 0_8 + j_k = ishft(j_int-1,11) + do j_l = j_k,min(j_k+2047,N_det), 32 + do j = j_l+1,min(j_l+32,i) + degree = popcnt(xor( psi_det(1,1,i),psi_det(1,1,j))) + & + popcnt(xor( psi_det(1,2,i),psi_det(1,2,j))) + & + popcnt(xor( psi_det(2,1,i),psi_det(2,1,j))) + & + popcnt(xor( psi_det(2,2,i),psi_det(2,2,j))) + if (degree < 5) then + det_connections(j_int,i) = ibset( det_connections(j_int,i), iand(63,ishft(j_l,-5)) ) + exit + endif + enddo + enddo + enddo + enddo + !$OMP ENDDO + deallocate(idx) + !$OMP END PARALLEL + + case(3) + + !$OMP PARALLEL DEFAULT (NONE) & + !$OMP SHARED(N_det, N_con_int, psi_det,N_int, det_connections)& + !$OMP PRIVATE(i,j_int,j_k,j_l,j,degree,idx) + allocate (idx(0:N_det)) + !$OMP DO SCHEDULE(guided) + do i=1,N_det + do j_int=1,N_con_int + det_connections(j_int,i) = 0_8 + j_k = ishft(j_int-1,11) + do j_l = j_k,min(j_k+2047,N_det), 32 + do j = j_l+1,min(j_l+32,i) + degree = popcnt(xor( psi_det(1,1,i),psi_det(1,1,j))) + & + popcnt(xor( psi_det(1,2,i),psi_det(1,2,j))) + & + popcnt(xor( psi_det(2,1,i),psi_det(2,1,j))) + & + popcnt(xor( psi_det(2,2,i),psi_det(2,2,j))) + & + popcnt(xor( psi_det(3,1,i),psi_det(3,1,j))) + & + popcnt(xor( psi_det(3,2,i),psi_det(3,2,j))) + if (degree < 5) then + det_connections(j_int,i) = ibset( det_connections(j_int,i), iand(63,ishft(j_l,-5)) ) + exit + endif + enddo + enddo + enddo + enddo + !$OMP ENDDO + deallocate(idx) + !$OMP END PARALLEL + + case default + + + !$OMP PARALLEL DEFAULT (NONE) & + !$OMP SHARED(N_det, N_con_int, psi_det,N_int, det_connections)& + !$OMP PRIVATE(i,j_int,j_k,j_l,j,degree,idx) + allocate (idx(0:N_det)) + !$OMP DO SCHEDULE(guided) + do i=1,N_det + do j_int=1,N_con_int + det_connections(j_int,i) = 0_8 + j_k = ishft(j_int-1,11) + do j_l = j_k,min(j_k+2047,N_det), 32 + do j = j_l+1,min(j_l+32,i) + !DIR$ FORCEINLINE + call get_excitation_degree(psi_det(1,1,i),psi_det(1,1,j),degree,N_int) + if (degree < 3) then + det_connections(j_int,i) = ibset( det_connections(j_int,i), iand(63,ishft(j_l,-5)) ) + exit + endif + enddo + enddo + enddo + enddo + !$OMP ENDDO + deallocate(idx) + !$OMP END PARALLEL + + end select END_PROVIDER diff --git a/src/Full_CI/full_ci.irp.f b/src/Full_CI/full_ci.irp.f index 3651ecea..12979d1a 100644 --- a/src/Full_CI/full_ci.irp.f +++ b/src/Full_CI/full_ci.irp.f @@ -11,7 +11,7 @@ program cisd pt2 = 1.d0 diag_algorithm = "Lapack" - do while (maxval(abs(pt2(1:N_st))) > 1.d-4) + do while (maxval(abs(pt2(1:N_st))) > 1.d-2) call H_apply_FCI(pt2, norm_pert, H_pert_diag, N_st) call diagonalize_CI print *, 'N_det = ', N_det diff --git a/src/MonoInts/README.rst b/src/MonoInts/README.rst index a9bd07bc..052e2053 100644 --- a/src/MonoInts/README.rst +++ b/src/MonoInts/README.rst @@ -91,34 +91,34 @@ Documentation `ao_nucl_elec_integral `_ interaction nuclear electron -`give_polynom_mult_center_mono_elec `_ +`give_polynom_mult_center_mono_elec `_ Undocumented -`i_x1_pol_mult_mono_elec `_ +`i_x1_pol_mult_mono_elec `_ Undocumented -`i_x2_pol_mult_mono_elec `_ +`i_x2_pol_mult_mono_elec `_ Undocumented -`int_gaus_pol `_ +`int_gaus_pol `_ Undocumented `nai_pol_mult `_ Undocumented -`v_e_n `_ +`v_e_n `_ Undocumented -`v_phi `_ +`v_phi `_ Undocumented -`v_r `_ +`v_r `_ Undocumented -`v_theta `_ +`v_theta `_ Undocumented -`wallis `_ +`wallis `_ Undocumented `mo_nucl_elec_integral `_ diff --git a/src/README.rst b/src/README.rst index 4b6fab44..d7d9b4f9 100644 --- a/src/README.rst +++ b/src/README.rst @@ -115,30 +115,29 @@ Needed Modules .. NEEDED_MODULES file. * `AOs `_ +* `BiInts `_ * `Bitmask `_ +* `CISD `_ +* `CISD_SC2_selected `_ +* `CISD_selected `_ +* `DensityMatrix `_ +* `Dets `_ * `Electrons `_ * `Ezfio_files `_ +* `Full_CI `_ +* `Generators_full `_ +* `Hartree_Fock `_ +* `MOGuess `_ +* `MonoInts `_ * `MOs `_ +* `MP2 `_ * `Nuclei `_ * `Output `_ -* `Utils `_ -* `Hartree_Fock `_ -* `BiInts `_ -* `MonoInts `_ -* `MOGuess `_ -* `Dets `_ -* `DensityMatrix `_ -* `CISD `_ * `Perturbation `_ -* `SingleRefMethod `_ -* `CISD_selected `_ +* `SC2 `_ * `Selectors_full `_ -* `MP2 `_ -* `Generators_full `_ -* `Full_CI `_ -* `CISD_SC2_selected `_ -* `CISD_SC2_selected `_ -* `CISD_SC2 `_ +* `SingleRefMethod `_ +* `Utils `_ Documentation ============= diff --git a/src/SC2/SC2.irp.f b/src/SC2/SC2.irp.f index 16937cc5..d0eeaedc 100644 --- a/src/SC2/SC2.irp.f +++ b/src/SC2/SC2.irp.f @@ -50,10 +50,10 @@ subroutine CISD_SC2(dets_in,u_in,energies,dim_in,sze,N_st,Nint,iunit) enddo endif - call write_time(output_CISD_SC2) - write(output_CISD_SC2,'(A)') '' - write(output_CISD_SC2,'(A)') 'CISD SC2' - write(output_CISD_SC2,'(A)') '========' + call write_time(output_SC2) + write(output_SC2,'(A)') '' + write(output_SC2,'(A)') 'CISD SC2' + write(output_SC2,'(A)') '========' !$OMP PARALLEL DEFAULT(NONE) & !$OMP SHARED(sze,N_st, & !$OMP H_jj_ref,Nint,dets_in,u_in) & @@ -120,7 +120,7 @@ subroutine CISD_SC2(dets_in,u_in,energies,dim_in,sze,N_st,Nint,iunit) enddo if(sze>1000)then - call davidson_diag_hjj(dets_in,u_in,H_jj_dressed,energies,dim_in,sze,N_st,Nint,output_CISD_SC2) + call davidson_diag_hjj(dets_in,u_in,H_jj_dressed,energies,dim_in,sze,N_st,Nint,output_SC2) else do i = 1,sze H_matrix_tmp(i,i) = H_jj_dressed(i) @@ -141,18 +141,18 @@ subroutine CISD_SC2(dets_in,u_in,energies,dim_in,sze,N_st,Nint,iunit) e_corr_array(i) = u_in(index_double(i),1)*inv_c0 * hij_double(i) e_corr_double += e_corr_array(i) enddo - write(output_CISD_SC2,'(A,I3)') 'SC2 Iteration ', iter - write(output_CISD_SC2,'(A)') '------------------' - write(output_CISD_SC2,'(A)') '' - write(output_CISD_SC2,'(A)') '===== ================' - write(output_CISD_SC2,'(A)') 'State Energy ' - write(output_CISD_SC2,'(A)') '===== ================' + write(output_SC2,'(A,I3)') 'SC2 Iteration ', iter + write(output_SC2,'(A)') '------------------' + write(output_SC2,'(A)') '' + write(output_SC2,'(A)') '===== ================' + write(output_SC2,'(A)') 'State Energy ' + write(output_SC2,'(A)') '===== ================' do i=1,N_st - write(output_CISD_SC2,'(I5,X,F16.10)') i, energies(i)+nuclear_repulsion + write(output_SC2,'(I5,X,F16.10)') i, energies(i)+nuclear_repulsion enddo - write(output_CISD_SC2,'(A)') '===== ================' - write(output_CISD_SC2,'(A)') '' - call write_double(output_CISD_SC2,(e_corr_double - e_corr_double_before),& + write(output_SC2,'(A)') '===== ================' + write(output_SC2,'(A)') '' + call write_double(output_SC2,(e_corr_double - e_corr_double_before),& 'Delta(E_corr)') converged = dabs(e_corr_double - e_corr_double_before) < 1.d-10 if (converged) then @@ -162,7 +162,7 @@ subroutine CISD_SC2(dets_in,u_in,energies,dim_in,sze,N_st,Nint,iunit) enddo - call write_time(output_CISD_SC2) + call write_time(output_SC2) end diff --git a/src/SC2/diagonalize_CI_SC2.irp.f b/src/SC2/diagonalize_CI_SC2.irp.f index c4bfa376..2f83d1ed 100644 --- a/src/SC2/diagonalize_CI_SC2.irp.f +++ b/src/SC2/diagonalize_CI_SC2.irp.f @@ -6,11 +6,11 @@ BEGIN_PROVIDER [ double precision, CI_SC2_energy, (N_states) ] integer :: j character*(8) :: st - call write_time(output_CISD_SC2) + call write_time(output_SC2) do j=1,N_states CI_SC2_energy(j) = CI_SC2_electronic_energy(j) + nuclear_repulsion write(st,'(I4)') j - call write_double(output_CISD_SC2,CI_SC2_energy(j),'Energy of state '//trim(st)) + call write_double(output_SC2,CI_SC2_energy(j),'Energy of state '//trim(st)) enddo END_PROVIDER @@ -32,7 +32,7 @@ END_PROVIDER call CISD_SC2(psi_det,CI_SC2_eigenvectors,CI_SC2_electronic_energy, & - size(CI_SC2_eigenvectors,1),N_det,N_states,N_int,output_CISD_SC2) + size(CI_SC2_eigenvectors,1),N_det,N_states,N_int,output_SC2) END_PROVIDER subroutine diagonalize_CI_SC2 From 79ad24bee43125b658ac1775d477178705ad1a69 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Mon, 2 Jun 2014 18:49:34 +0200 Subject: [PATCH 2/2] Accelerated openmp --- scripts/generate_h_apply.py | 2 -- src/Dets/H_apply_template.f | 54 +++++++++++++++++++++++++++++++++---- 2 files changed, 49 insertions(+), 7 deletions(-) diff --git a/scripts/generate_h_apply.py b/scripts/generate_h_apply.py index 6d3a207d..d39273b2 100755 --- a/scripts/generate_h_apply.py +++ b/scripts/generate_h_apply.py @@ -118,7 +118,6 @@ class H_apply(object): double precision :: sum_H_pert_diag(N_st) double precision, allocatable :: e_2_pert_buffer(:,:) double precision, allocatable :: coef_pert_buffer(:,:) - !$ call omp_init_lock(lck) ASSERT (Nint == N_int) """ self.data["init_thread"] = """ @@ -149,7 +148,6 @@ class H_apply(object): sum_norm_pert,sum_H_pert_diag,N_st,N_int) """%(pert,) self.data["finalization"] = """ - !$ call omp_destroy_lock(lck) """ self.data["copy_buffer"] = "" self.data["generate_psi_guess"] = "" diff --git a/src/Dets/H_apply_template.f b/src/Dets/H_apply_template.f index 78c95f5f..48b117cc 100644 --- a/src/Dets/H_apply_template.f +++ b/src/Dets/H_apply_template.f @@ -28,7 +28,11 @@ subroutine $subroutine_diexc(key_in, hole_1,particl_1, hole_2, particl_2, i_gene integer, allocatable :: ia_ja_pairs(:,:,:) double precision :: diag_H_mat_elem integer :: iproc - integer(omp_lock_kind) :: lck + integer(omp_lock_kind), save :: lck, ifirst=0 + if (ifirst == 0) then + ifirst=1 +!$ call omp_init_lock(lck) + endif PROVIDE H_apply_threshold $initialization @@ -246,7 +250,11 @@ subroutine $subroutine_monoexc(key_in, hole_1,particl_1,i_generator $parameters integer, allocatable :: ia_ja_pairs(:,:,:) double precision :: diag_H_mat_elem integer :: iproc - integer(omp_lock_kind) :: lck + integer(omp_lock_kind), save :: lck, ifirst=0 + if (ifirst == 0) then + ifirst=1 +!$ call omp_init_lock(lck) + endif PROVIDE H_apply_threshold $initialization @@ -335,6 +343,7 @@ end subroutine $subroutine($params_main) implicit none + use omp_lib use bitmasks BEGIN_DOC ! Calls H_apply on the HF determinant and selects all connected single and double @@ -343,12 +352,23 @@ subroutine $subroutine($params_main) $decls_main + integer :: i_generator, k, nmax + double precision :: wall_0, wall_1, wall_2, d + integer(omp_lock_kind) :: lck + PROVIDE H_apply_buffer_allocated mo_bielec_integrals_in_map N_det_selectors psi_generators - integer :: i_generator, k - double precision :: wall_0, wall_1, wall_2 + PROVIDE psi_det_sorted_bit coef_hf_selector + nmax = ( N_det_generators/nproc ) *nproc call wall_time(wall_1) - do i_generator=1,N_det_generators + !$ call omp_init_lock(lck) + !$OMP PARALLEL DEFAULT(SHARED) & + !$OMP PRIVATE(i_generator,wall_2) + !$OMP DO SCHEDULE(guided) + do i_generator=1,nmax + if (abort_here) then + cycle + endif call $subroutine_diexc(psi_generators(1,1,i_generator), & generators_bitmask(1,1,d_hole1,i_bitmask_gen), & generators_bitmask(1,1,d_part1,i_bitmask_gen), & @@ -359,9 +379,33 @@ subroutine $subroutine($params_main) generators_bitmask(1,1,s_hole ,i_bitmask_gen), & generators_bitmask(1,1,s_part ,i_bitmask_gen), & i_generator $params_post) + !$ call omp_set_lock(lck) + call wall_time(wall_2) + $printout_always + if (wall_2 - wall_0 > 2.d0) then + wall_0 = wall_2 + $printout_now + endif + !$ call omp_unset_lock(lck) + enddo + !$OMP END DO NOWAIT + !$OMP END PARALLEL + !$ call omp_destroy_lock(lck) + + do i_generator=nmax+1,N_det_generators if (abort_here) then exit endif + call $subroutine_diexc(psi_generators(1,1,i_generator), & + generators_bitmask(1,1,d_hole1,i_bitmask_gen), & + generators_bitmask(1,1,d_part1,i_bitmask_gen), & + generators_bitmask(1,1,d_hole2,i_bitmask_gen), & + generators_bitmask(1,1,d_part2,i_bitmask_gen), & + i_generator $params_post) + call $subroutine_monoexc(psi_generators(1,1,i_generator), & + generators_bitmask(1,1,s_hole ,i_bitmask_gen), & + generators_bitmask(1,1,s_part ,i_bitmask_gen), & + i_generator $params_post) call wall_time(wall_2) $printout_always if (wall_2 - wall_0 > 2.d0) then