diff --git a/config/sse4_avx2.cfg b/config/sse4_avx2.cfg new file mode 100644 index 00000000..eabf75a3 --- /dev/null +++ b/config/sse4_avx2.cfg @@ -0,0 +1,62 @@ +# Common flags +############## +# +# -mkl=[parallel|sequential] : Use the MKL library +# --ninja : Allow the use of ninja. It is mandatory! +# --align=32 : Align all provided arrays on a 32-byte boundary +# +[COMMON] +FC : ifort +LAPACK_LIB : -mkl=parallel +IRPF90 : irpf90 +IRPF90_FLAGS : --ninja --align=32 + +# Global options +################ +# +# 1 : Activate +# 0 : Deactivate +# +[OPTION] +MODE : OPT ; [ OPT | PROFILE | DEBUG ] : Chooses the section below +CACHE : 1 ; Enable cache_compile.py +OPENMP : 1 ; Append OpenMP flags + +# Optimization flags +#################### +# +# -axSSE4.2,AVX,CORE-AVX2 : Generate auto-dispatched code paths for SSE4.2, AVX and AVX2 processors +# -O2 : O3 not better than O2. +# -ip : Inter-procedural optimizations +# -ftz : Flushes denormal results to zero +# +[OPT] +FCFLAGS : -axSSE4.2,AVX,CORE-AVX2 -O2 -ip -ftz -g -traceback + +# Profiling flags +################# +# +[PROFILE] +FC : -p -g +FCFLAGS : -xSSE4.2 -O2 -ip -ftz + +# Debugging flags +################# +# +# -traceback : Activate backtrace at runtime +# -fpe0 : All floating point exceptions +# -C : Checks uninitialized variables, array subscripts, etc... +# -g : Extra debugging information +# -xSSE2 : Valgrind needs a very simple x86 executable +# +[DEBUG] +FC : -g -traceback +FCFLAGS : -xSSE2 -C -fpe0 + +# OpenMP flags +################# +# +[OPENMP] +FC : -openmp +IRPF90_FLAGS : --openmp + diff --git a/plugins/Full_CI_ZMQ/fci_zmq.irp.f b/plugins/Full_CI_ZMQ/fci_zmq.irp.f index 0577a408..892e0e4b 100644 --- a/plugins/Full_CI_ZMQ/fci_zmq.irp.f +++ b/plugins/Full_CI_ZMQ/fci_zmq.irp.f @@ -38,7 +38,6 @@ program fci_zmq do while (N_det < N_det_max.and.maxval(abs(pt2(1:N_st))) > pt2_max) n_det_before = N_det - ! 
call H_apply_FCI(pt2, norm_pert, H_pert_diag, N_st) call ZMQ_selection(max(1024-N_det, N_det), pt2) PROVIDE psi_coef @@ -90,21 +89,21 @@ program fci_zmq N_det = min(N_det_max,N_det) touch N_det psi_det psi_coef call diagonalize_CI -! if(do_pt2_end)then -! print*,'Last iteration only to compute the PT2' -! threshold_selectors = 1.d0 -! threshold_generators = 0.999d0 -! call H_apply_FCI_PT2(pt2, norm_pert, H_pert_diag, N_st) -! -! print *, 'Final step' -! print *, 'N_det = ', N_det -! print *, 'N_states = ', N_states -! print *, 'PT2 = ', pt2 -! print *, 'E = ', CI_energy -! print *, 'E+PT2 = ', CI_energy+pt2 -! print *, '-----' -! call ezfio_set_full_ci_energy_pt2(CI_energy+pt2) -! endif + if(do_pt2_end)then + print*,'Last iteration only to compute the PT2' + threshold_selectors = 1.d0 + threshold_generators = 0.9999d0 + E_CI_before = CI_energy + call ZMQ_selection(1, pt2) + print *, 'Final step' + print *, 'N_det = ', N_det + print *, 'N_states = ', N_states + print *, 'PT2 = ', pt2 + print *, 'E = ', E_CI_before + print *, 'E+PT2 = ', E_CI_before+pt2 + print *, '-----' + call ezfio_set_full_ci_energy_pt2(E_CI_before+pt2) + endif call save_wavefunction end diff --git a/src/Determinants/davidson.irp.f b/src/Determinants/davidson.irp.f index e7480ca2..5bc3768e 100644 --- a/src/Determinants/davidson.irp.f +++ b/src/Determinants/davidson.irp.f @@ -530,7 +530,7 @@ subroutine davidson_diag_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,N_st,Nint,iun !$OMP END PARALLEL write(iunit,'(X,I3,X,100(X,F16.10,X,E16.6))') iter, to_print(:,1:N_st) - call davidson_converged(lambda,residual_norm,wall,iter,cpu,N_st,converged) + call davidson_converged(lambda,residual_norm,wall,iter,cpu,N_states,converged) if (converged) then exit endif diff --git a/src/Determinants/s2.irp.f b/src/Determinants/s2.irp.f index 9a60dbd9..96e342cd 100644 --- a/src/Determinants/s2.irp.f +++ b/src/Determinants/s2.irp.f @@ -215,54 +215,52 @@ subroutine get_s2_u0(psi_keys_tmp,psi_coefs_tmp,n,nmax,s2) end 
subroutine get_uJ_s2_uI(psi_keys_tmp,psi_coefs_tmp,n,nmax_coefs,nmax_keys,s2,nstates) - implicit none - use bitmasks - integer(bit_kind), intent(in) :: psi_keys_tmp(N_int,2,nmax_keys) - integer, intent(in) :: n,nmax_coefs,nmax_keys,nstates - double precision, intent(in) :: psi_coefs_tmp(nmax_coefs,nstates) - double precision, intent(out) :: s2(nstates,nstates) - double precision :: s2_tmp,accu - integer :: i,j,l,jj,ll,kk - integer, allocatable :: idx(:) - double precision, allocatable :: tmp(:,:) - BEGIN_DOC - ! returns the matrix elements of S^2 "s2(i,j)" between the "nstates" states - ! psi_coefs_tmp(:,i) and psi_coefs_tmp(:,j) - END_DOC - s2 = 0.d0 - do ll = 1, nstates - do jj = 1, nstates - accu = 0.d0 - !$OMP PARALLEL DEFAULT(NONE) & - !$OMP PRIVATE (i,j,kk,idx,tmp,s2_tmp) & - !$OMP SHARED (ll,jj,psi_keys_tmp,psi_coefs_tmp,N_int,n,nstates) & - !$OMP REDUCTION(+:accu) - allocate(idx(0:n)) - !$OMP DO SCHEDULE(dynamic) - do i = 1, n - call get_s2(psi_keys_tmp(1,1,i),psi_keys_tmp(1,1,i),s2_tmp,N_int) - accu += psi_coefs_tmp(i,ll) * s2_tmp * psi_coefs_tmp(i,jj) - call filter_connected(psi_keys_tmp,psi_keys_tmp(1,1,i),N_int,i-1,idx) - do kk=1,idx(0) - j = idx(kk) - call get_s2(psi_keys_tmp(1,1,i),psi_keys_tmp(1,1,j),s2_tmp,N_int) - accu += psi_coefs_tmp(i,ll) * s2_tmp * psi_coefs_tmp(j,jj) + psi_coefs_tmp(i,jj) * s2_tmp * psi_coefs_tmp(j,ll) + implicit none + use bitmasks + integer(bit_kind), intent(in) :: psi_keys_tmp(N_int,2,nmax_keys) + integer, intent(in) :: n,nmax_coefs,nmax_keys,nstates + double precision, intent(in) :: psi_coefs_tmp(nmax_coefs,nstates) + double precision, intent(out) :: s2(nstates,nstates) + double precision :: s2_tmp,accu + integer :: i,j,l,jj,ll,kk + integer, allocatable :: idx(:) + BEGIN_DOC + ! returns the matrix elements of S^2 "s2(i,j)" between the "nstates" states + ! 
psi_coefs_tmp(:,i) and psi_coefs_tmp(:,j) + END_DOC + s2 = 0.d0 + do ll = 1, nstates + do jj = 1, nstates + accu = 0.d0 + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP PRIVATE (i,j,kk,idx,s2_tmp) & + !$OMP SHARED (ll,jj,psi_keys_tmp,psi_coefs_tmp,N_int,n,nstates)& + !$OMP REDUCTION(+:accu) + allocate(idx(0:n)) + !$OMP DO SCHEDULE(dynamic) + do i = n,1,-1 ! Better OMP scheduling + call get_s2(psi_keys_tmp(1,1,i),psi_keys_tmp(1,1,i),s2_tmp,N_int) + accu += psi_coefs_tmp(i,ll) * s2_tmp * psi_coefs_tmp(i,jj) + call filter_connected(psi_keys_tmp,psi_keys_tmp(1,1,i),N_int,i-1,idx) + do kk=1,idx(0) + j = idx(kk) + call get_s2(psi_keys_tmp(1,1,i),psi_keys_tmp(1,1,j),s2_tmp,N_int) + accu += psi_coefs_tmp(i,ll) * s2_tmp * psi_coefs_tmp(j,jj) + psi_coefs_tmp(i,jj) * s2_tmp * psi_coefs_tmp(j,ll) + enddo + enddo + !$OMP END DO + deallocate(idx) + !$OMP END PARALLEL + s2(ll,jj) += accu enddo - enddo - !$OMP END DO NOWAIT - deallocate(idx) - !$OMP BARRIER - !$OMP END PARALLEL - s2(ll,jj) += accu enddo - enddo - do i = 1, nstates - do j =i+1,nstates - accu = 0.5d0 * (s2(i,j) + s2(j,i)) - s2(i,j) = accu - s2(j,i) = accu + do i = 1, nstates + do j =i+1,nstates + accu = 0.5d0 * (s2(i,j) + s2(j,i)) + s2(i,j) = accu + s2(j,i) = accu + enddo enddo - enddo end subroutine diagonalize_s2_betweenstates(keys_tmp,psi_coefs_inout,n,nmax_keys,nmax_coefs,nstates,s2_eigvalues)