Added PT2 to FCI ZMQ

2016-08-04 15:15:14 +02:00 · 2016-08-04 15:15:14 +02:00 · 8b530a6db6
parent c30ceed47d
commit 8b530a6db6
4 changed files with 121 additions and 62 deletions
--- a/config/sse4_avx2.cfg
+++ b/config/sse4_avx2.cfg
@ -0,0 +1,62 @@
 # Common flags
 ##############
 #
 # -mkl=[parallel|sequential] : Use the MKL library
 # --ninja                    : Use the ninja build system (mandatory)
 # --align=32                 : Align all provided arrays on a 32-byte boundary
 #
 [COMMON]
 FC           : ifort
 LAPACK_LIB   : -mkl=parallel
 IRPF90       : irpf90
 IRPF90_FLAGS : --ninja --align=32
 # Global options
 ################
 #
 # 1 : Activate
 # 0 : Deactivate
 # 
 [OPTION]
 MODE    : OPT        ; [ OPT | PROFILE | DEBUG ] : Chooses the section below
 CACHE   : 1          ; Enable cache_compile.py
 OPENMP  : 1          ; Append OpenMP flags
 # Optimization flags
 ####################
 #
 # -axSSE4.2,AVX,CORE-AVX2    : Generate auto-dispatched code paths for SSE4.2, AVX and AVX2 processors
 # -O2                        : O3 not better than O2.
 # -ip                        : Inter-procedural optimizations
 # -ftz                       : Flushes denormal results to zero
 #
 [OPT]
 FCFLAGS  : -axSSE4.2,AVX,CORE-AVX2 -O2 -ip -ftz -g -traceback
 # Profiling flags
 #################
 #
 [PROFILE]
 FC       : -p -g
 FCFLAGS  : -xSSE4.2 -O2 -ip -ftz
 # Debugging flags
 #################
 #
 # -traceback   : Print a traceback when a runtime error occurs
 # -fpe0        : All floating-point exceptions
 # -C           : Checks uninitialized variables, array subscripts, etc.
 # -g           : Extra debugging information
 # -xSSE2       : Valgrind needs a very simple x86 executable
 #
 [DEBUG]
 FC      : -g -traceback
 FCFLAGS : -xSSE2 -C  -fpe0 
 # OpenMP flags
 #################
 #
 [OPENMP]
 FC           : -openmp
 IRPF90_FLAGS : --openmp 
--- a/plugins/Full_CI_ZMQ/fci_zmq.irp.f
+++ b/plugins/Full_CI_ZMQ/fci_zmq.irp.f
@ -38,7 +38,6 @@ program fci_zmq
  do while (N_det < N_det_max.and.maxval(abs(pt2(1:N_st))) > pt2_max)
    n_det_before = N_det
    ! call H_apply_FCI(pt2, norm_pert, H_pert_diag,  N_st)
    call ZMQ_selection(max(1024-N_det, N_det), pt2)
    PROVIDE  psi_coef
@ -90,21 +89,21 @@ program fci_zmq
   N_det = min(N_det_max,N_det)
   touch N_det psi_det psi_coef
   call diagonalize_CI
-!    if(do_pt2_end)then
+   if(do_pt2_end)then
-!     print*,'Last iteration only to compute the PT2'
+     print*,'Last iteration only to compute the PT2'
-!     threshold_selectors = 1.d0
+     threshold_selectors = 1.d0
-!     threshold_generators = 0.999d0
+     threshold_generators = 0.9999d0
-!     call H_apply_FCI_PT2(pt2, norm_pert, H_pert_diag,  N_st)
+     E_CI_before = CI_energy
-!  
+     call ZMQ_selection(1, pt2)
-!     print *,  'Final step'
+     print *,  'Final step'
-!     print *,  'N_det    = ', N_det
+     print *,  'N_det    = ', N_det
-!     print *,  'N_states = ', N_states
+     print *,  'N_states = ', N_states
-!     print *,  'PT2      = ', pt2
+     print *,  'PT2      = ', pt2
-!     print *,  'E        = ', CI_energy
+     print *,  'E        = ', E_CI_before
-!     print *,  'E+PT2    = ', CI_energy+pt2
+     print *,  'E+PT2    = ', E_CI_before+pt2
-!     print *,  '-----'
+     print *,  '-----'
-!     call ezfio_set_full_ci_energy_pt2(CI_energy+pt2)
+     call ezfio_set_full_ci_energy_pt2(E_CI_before+pt2)
-!    endif
+   endif
   call save_wavefunction
 end
--- a/src/Determinants/davidson.irp.f
+++ b/src/Determinants/davidson.irp.f
@ -530,7 +530,7 @@ subroutine davidson_diag_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,N_st,Nint,iun
      !$OMP END PARALLEL
      write(iunit,'(X,I3,X,100(X,F16.10,X,E16.6))')  iter, to_print(:,1:N_st)
-      call davidson_converged(lambda,residual_norm,wall,iter,cpu,N_st,converged)
+      call davidson_converged(lambda,residual_norm,wall,iter,cpu,N_states,converged)
      if (converged) then
        exit
      endif
--- a/src/Determinants/s2.irp.f
+++ b/src/Determinants/s2.irp.f
@ -215,54 +215,52 @@ subroutine get_s2_u0(psi_keys_tmp,psi_coefs_tmp,n,nmax,s2)
 end
 subroutine get_uJ_s2_uI(psi_keys_tmp,psi_coefs_tmp,n,nmax_coefs,nmax_keys,s2,nstates)
- implicit none
+  implicit none
- use bitmasks
+  use bitmasks
- integer(bit_kind), intent(in) :: psi_keys_tmp(N_int,2,nmax_keys)
+  integer(bit_kind), intent(in)  :: psi_keys_tmp(N_int,2,nmax_keys)
- integer, intent(in) :: n,nmax_coefs,nmax_keys,nstates
+  integer, intent(in)            :: n,nmax_coefs,nmax_keys,nstates
- double precision, intent(in) :: psi_coefs_tmp(nmax_coefs,nstates)
+  double precision, intent(in)   :: psi_coefs_tmp(nmax_coefs,nstates)
- double precision, intent(out) :: s2(nstates,nstates)
+  double precision, intent(out)  :: s2(nstates,nstates)
- double precision :: s2_tmp,accu
+  double precision               :: s2_tmp,accu
- integer :: i,j,l,jj,ll,kk
+  integer                        :: i,j,l,jj,ll,kk
- integer, allocatable           :: idx(:)
+  integer, allocatable           :: idx(:)
- double precision, allocatable :: tmp(:,:)
+  BEGIN_DOC
- BEGIN_DOC
+  ! returns the matrix elements of S^2 "s2(i,j)" between the "nstates" states
- ! returns the matrix elements of S^2 "s2(i,j)" between the "nstates" states 
+  ! psi_coefs_tmp(:,i) and psi_coefs_tmp(:,j)
- ! psi_coefs_tmp(:,i) and psi_coefs_tmp(:,j)
+  END_DOC
- END_DOC
+  s2 = 0.d0
- s2 = 0.d0
+  do ll = 1, nstates
- do ll = 1, nstates
+    do jj = 1, nstates
-  do jj = 1, nstates
+      accu = 0.d0
- accu = 0.d0
+      !$OMP PARALLEL DEFAULT(NONE)                                   &
- !$OMP PARALLEL DEFAULT(NONE)                                         &
+          !$OMP PRIVATE (i,j,kk,idx,s2_tmp)                          &
- !$OMP PRIVATE (i,j,kk,idx,tmp,s2_tmp) & 
+          !$OMP SHARED (ll,jj,psi_keys_tmp,psi_coefs_tmp,N_int,n,nstates)&
- !$OMP SHARED (ll,jj,psi_keys_tmp,psi_coefs_tmp,N_int,n,nstates)      &
+          !$OMP REDUCTION(+:accu)
- !$OMP REDUCTION(+:accu)
+      allocate(idx(0:n))
- allocate(idx(0:n))
+      !$OMP DO SCHEDULE(dynamic)
-   !$OMP DO SCHEDULE(dynamic)
+      do i = n,1,-1   ! Better OMP scheduling
-   do i = 1, n
+        call get_s2(psi_keys_tmp(1,1,i),psi_keys_tmp(1,1,i),s2_tmp,N_int)
-    call get_s2(psi_keys_tmp(1,1,i),psi_keys_tmp(1,1,i),s2_tmp,N_int)
+        accu += psi_coefs_tmp(i,ll) * s2_tmp * psi_coefs_tmp(i,jj)
-    accu += psi_coefs_tmp(i,ll) * s2_tmp * psi_coefs_tmp(i,jj)
+        call filter_connected(psi_keys_tmp,psi_keys_tmp(1,1,i),N_int,i-1,idx)
-    call filter_connected(psi_keys_tmp,psi_keys_tmp(1,1,i),N_int,i-1,idx)
+        do kk=1,idx(0)
-    do kk=1,idx(0)
+          j = idx(kk)
-     j = idx(kk)
+          call get_s2(psi_keys_tmp(1,1,i),psi_keys_tmp(1,1,j),s2_tmp,N_int)
-     call get_s2(psi_keys_tmp(1,1,i),psi_keys_tmp(1,1,j),s2_tmp,N_int)
+          accu += psi_coefs_tmp(i,ll) * s2_tmp * psi_coefs_tmp(j,jj) + psi_coefs_tmp(i,jj) * s2_tmp * psi_coefs_tmp(j,ll)
-     accu += psi_coefs_tmp(i,ll) * s2_tmp * psi_coefs_tmp(j,jj) + psi_coefs_tmp(i,jj) * s2_tmp * psi_coefs_tmp(j,ll)
+        enddo
      enddo
      !$OMP END DO
      deallocate(idx)
      !$OMP END PARALLEL
      s2(ll,jj) += accu
    enddo
   enddo
   !$OMP END DO NOWAIT
 deallocate(idx)
 !$OMP BARRIER
 !$OMP END PARALLEL
   s2(ll,jj) += accu
  enddo
- enddo
+  do i = 1, nstates
- do i = 1, nstates
+    do j =i+1,nstates
-  do j =i+1,nstates
+      accu = 0.5d0 * (s2(i,j) + s2(j,i))
-   accu = 0.5d0 * (s2(i,j) + s2(j,i))
+      s2(i,j) = accu
-   s2(i,j) = accu
+      s2(j,i) = accu
-   s2(j,i) = accu
+    enddo
  enddo
 enddo
 end
 subroutine diagonalize_s2_betweenstates(keys_tmp,psi_coefs_inout,n,nmax_keys,nmax_coefs,nstates,s2_eigvalues)