diff --git a/plugins/Full_CI/H_apply.irp.f b/plugins/Full_CI/H_apply.irp.f index 3dc1e0f0..d6888dc3 100644 --- a/plugins/Full_CI/H_apply.irp.f +++ b/plugins/Full_CI/H_apply.irp.f @@ -9,7 +9,7 @@ print s s = H_apply_zmq("FCI_PT2") s.set_perturbation("epstein_nesbet_2x2") -#s.unset_openmp() +s.unset_openmp() print s s = H_apply("FCI_no_skip") diff --git a/src/Determinants/H_apply.irp.f b/src/Determinants/H_apply.irp.f index b8604e8b..28513597 100644 --- a/src/Determinants/H_apply.irp.f +++ b/src/Determinants/H_apply.irp.f @@ -350,12 +350,12 @@ subroutine push_pt2(zmq_socket_push,pt2,norm_pert,H_pert_diag,N_st,task_id) endif ! Activate if zmq_socket_push is a REQ - integer :: idummy - rc = f77_zmq_recv( zmq_socket_push, idummy, 4, 0) - if (rc /= 4) then - print *, irp_here, 'f77_zmq_send( zmq_socket_push, idummy, 4, 0)' - stop 'error' - endif +! integer :: idummy +! rc = f77_zmq_recv( zmq_socket_push, idummy, 4, 0) +! if (rc /= 4) then +! print *, irp_here, 'f77_zmq_send( zmq_socket_push, idummy, 4, 0)' +! stop 'error' +! endif end subroutine pull_pt2(zmq_socket_pull,pt2,norm_pert,H_pert_diag,N_st,n,task_id) @@ -411,11 +411,11 @@ subroutine pull_pt2(zmq_socket_pull,pt2,norm_pert,H_pert_diag,N_st,n,task_id) endif ! Activate if zmq_socket_pull is a REP - rc = f77_zmq_send( zmq_socket_pull, 0, 4, 0) - if (rc /= 4) then - print *, irp_here, 'f77_zmq_send( zmq_socket_pull, 0, 4, 0)' - stop 'error' - endif +! rc = f77_zmq_send( zmq_socket_pull, 0, 4, 0) +! if (rc /= 4) then +! print *, irp_here, 'f77_zmq_send( zmq_socket_pull, 0, 4, 0)' +! stop 'error' +! endif end diff --git a/src/Determinants/davidson.irp.f b/src/Determinants/davidson.irp.f index deba43c5..c78a3826 100644 --- a/src/Determinants/davidson.irp.f +++ b/src/Determinants/davidson.irp.f @@ -315,6 +315,7 @@ subroutine davidson_diag_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,N_st,Nint,iun double precision, intent(inout) :: u_in(dim_in,N_st) double precision, intent(out) :: energies(N_st) + integer :: sze_8 integer :: iter integer :: i,j,k,l,m logical :: converged @@ -334,6 +335,7 @@ subroutine davidson_diag_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,N_st,Nint,iun double precision :: to_print(2,N_st) double precision :: cpu, wall + !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: U, W, R, Wt, y, h, lambda call write_time(iunit) @@ -362,12 +364,15 @@ subroutine davidson_diag_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,N_st,Nint,iun enddo write(iunit,'(A)') trim(write_buffer) + integer, external :: align_double + sze_8 = align_double(sze) + allocate( & kl_pairs(2,N_st*(N_st+1)/2), & - W(sze,N_st,davidson_sze_max), & + W(sze_8,N_st,davidson_sze_max), & Wt(sze), & - U(sze,N_st,davidson_sze_max), & - R(sze,N_st), & + U(sze_8,N_st,davidson_sze_max), & + R(sze_8,N_st), & h(N_st,davidson_sze_max,N_st,davidson_sze_max), & y(N_st,davidson_sze_max,N_st,davidson_sze_max), & lambda(N_st*davidson_sze_max)) @@ -473,7 +478,10 @@ subroutine davidson_diag_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,N_st,Nint,iun ! Express eigenvectors of h in the determinant basis ! -------------------------------------------------- + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP PRIVATE(k,i,l,iter2) SHARED(U,W,R,y,iter,lambda,N_st,sze) do k=1,N_st + !$OMP DO do i=1,sze U(i,k,iter+1) = 0.d0 W(i,k,iter+1) = 0.d0 @@ -484,7 +492,9 @@ subroutine davidson_diag_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,N_st,Nint,iun enddo enddo enddo + !$OMP END DO enddo + !$OMP END PARALLEL ! Compute residual vector ! ----------------------- diff --git a/src/Determinants/slater_rules.irp.f b/src/Determinants/slater_rules.irp.f index 4ab9deda..9bcc95f9 100644 --- a/src/Determinants/slater_rules.irp.f +++ b/src/Determinants/slater_rules.irp.f @@ -1608,12 +1608,11 @@ subroutine H_u_0(v_0,u_0,H_jj,n,keys_tmp,Nint) integer :: i,j,k,l, jj,ii integer :: i0, j0 - integer, allocatable :: shortcut(:), sort_idx(:) - integer(bit_kind), allocatable :: sorted(:,:), version(:,:) + integer, allocatable :: shortcut(:,:), sort_idx(:,:) + integer(bit_kind), allocatable :: sorted(:,:,:), version(:,:,:) integer(bit_kind) :: sorted_i(Nint) integer :: sh, sh2, ni, exa, ext, org_i, org_j, endi - double precision :: local_threshold ASSERT (Nint > 0) @@ -1621,104 +1620,83 @@ subroutine H_u_0(v_0,u_0,H_jj,n,keys_tmp,Nint) ASSERT (n>0) PROVIDE ref_bitmask_energy davidson_criterion - allocate (shortcut(0:n+1), sort_idx(n), sorted(Nint,n), version(Nint,n)) + allocate (shortcut(0:n+1,2), sort_idx(n,2), sorted(Nint,n,2), version(Nint,n,2)) v_0 = 0.d0 - call sort_dets_ab_v(keys_tmp, sorted, sort_idx, shortcut, version, n, Nint) + call sort_dets_ab_v(keys_tmp, sorted(1,1,1), sort_idx(1,1), shortcut(0,1), version(1,1,1), n, Nint) + call sort_dets_ba_v(keys_tmp, sorted(1,1,2), sort_idx(1,2), shortcut(0,2), version(1,1,2), n, Nint) !$OMP PARALLEL DEFAULT(NONE) & - !$OMP PRIVATE(i,hij,j,k,jj,vt,ii,sh,sh2,ni,exa,ext,org_i,org_j,endi,local_threshold,sorted_i)& - !$OMP SHARED(n,H_jj,u_0,keys_tmp,Nint,v_0,threshold_davidson,sorted,shortcut,sort_idx,version) + !$OMP PRIVATE(i,hij,j,k,jj,vt,ii,sh,sh2,ni,exa,ext,org_i,org_j,endi,sorted_i)& + !$OMP SHARED(n,H_jj,u_0,keys_tmp,Nint,v_0,sorted,shortcut,sort_idx,version) allocate(vt(n)) Vt = 0.d0 !$OMP DO SCHEDULE(dynamic) - do sh=1,shortcut(0) - do sh2=1,sh + do sh=1,shortcut(0,1) + do sh2=sh,shortcut(0,1) exa = 0 do ni=1,Nint - exa = exa + popcnt(xor(version(ni,sh), version(ni,sh2))) + exa = exa + popcnt(xor(version(ni,sh,1), version(ni,sh2,1))) end do if(exa > 2) then cycle end if - do i=shortcut(sh),shortcut(sh+1)-1 - org_i = sort_idx(i) - local_threshold = threshold_davidson - dabs(u_0(org_i)) + do i=shortcut(sh,1),shortcut(sh+1,1)-1 + org_i = sort_idx(i,1) if(sh==sh2) then endi = i-1 else - endi = shortcut(sh2+1)-1 + endi = shortcut(sh2+1,1)-1 end if do ni=1,Nint - sorted_i(ni) = sorted(ni,i) + sorted_i(ni) = sorted(ni,i,1) enddo - do j=shortcut(sh2),endi - org_j = sort_idx(j) - if ( dabs(u_0(org_j)) > local_threshold ) then - ext = exa - do ni=1,Nint - ext = ext + popcnt(xor(sorted_i(ni), sorted(ni,j))) - end do - if(ext <= 4) then - call i_H_j(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),Nint,hij) - vt (org_i) = vt (org_i) + hij*u_0(org_j) - vt (org_j) = vt (org_j) + hij*u_0(org_i) - endif + do j=shortcut(sh2,1),endi + org_j = sort_idx(j,1) + ext = exa + do ni=1,Nint + ext = ext + popcnt(xor(sorted_i(ni), sorted(ni,j,1))) + end do + if(ext <= 4) then + call i_H_j(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),Nint,hij) + vt (org_i) = vt (org_i) + hij*u_0(org_j) + vt (org_j) = vt (org_j) + hij*u_0(org_i) endif enddo enddo enddo enddo - !$OMP END DO - - !$OMP CRITICAL - do i=1,n - v_0(i) = v_0(i) + vt(i) - enddo - !$OMP END CRITICAL - - deallocate(vt) - !$OMP END PARALLEL - - call sort_dets_ba_v(keys_tmp, sorted, sort_idx, shortcut, version, n, Nint) - - !$OMP PARALLEL DEFAULT(NONE) & - !$OMP PRIVATE(i,hij,j,k,jj,vt,ii,sh,sh2,ni,exa,ext,org_i,org_j,endi,local_threshold)& - !$OMP SHARED(n,H_jj,u_0,keys_tmp,Nint,v_0,threshold_davidson,sorted,shortcut,sort_idx,version) - allocate(vt(n)) - Vt = 0.d0 + !$OMP END DO NOWAIT !$OMP DO SCHEDULE(dynamic) - do sh=1,shortcut(0) - do i=shortcut(sh),shortcut(sh+1)-1 - org_i = sort_idx(i) - local_threshold = threshold_davidson - dabs(u_0(org_i)) - do j=shortcut(sh),i-1 - org_j = sort_idx(j) - if ( dabs(u_0(org_j)) > local_threshold ) then - ext = 0 - do ni=1,Nint - ext = ext + popcnt(xor(sorted(ni,i), sorted(ni,j))) - end do - if(ext == 4) then - call i_H_j(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),Nint,hij) - vt (org_i) = vt (org_i) + hij*u_0(org_j) - vt (org_j) = vt (org_j) + hij*u_0(org_i) - end if + do sh=1,shortcut(0,2) + do i=shortcut(sh,2),shortcut(sh+1,2)-1 + org_i = sort_idx(i,2) + do j=shortcut(sh,2),i-1 + org_j = sort_idx(j,2) + ext = 0 + do ni=1,Nint + ext = ext + popcnt(xor(sorted(ni,i,2), sorted(ni,j,2))) + end do + if(ext == 4) then + call i_H_j(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),Nint,hij) + vt (org_i) = vt (org_i) + hij*u_0(org_j) + vt (org_j) = vt (org_j) + hij*u_0(org_i) end if end do end do enddo - !$OMP END DO + !$OMP END DO NOWAIT !$OMP CRITICAL - do i=1,n + do i=n,1,-1 v_0(i) = v_0(i) + vt(i) enddo !$OMP END CRITICAL + deallocate(vt) !$OMP END PARALLEL diff --git a/src/Integrals_Bielec/ao_bielec_integrals_in_map_slave.irp.f b/src/Integrals_Bielec/ao_bielec_integrals_in_map_slave.irp.f index 4e2f7152..f15376b5 100644 --- a/src/Integrals_Bielec/ao_bielec_integrals_in_map_slave.irp.f +++ b/src/Integrals_Bielec/ao_bielec_integrals_in_map_slave.irp.f @@ -57,16 +57,18 @@ subroutine push_integrals(zmq_socket_push, n_integrals, buffer_i, buffer_value, endif ! Activate is zmq_socket_push is a REQ - integer :: idummy - rc = f77_zmq_recv( zmq_socket_push, idummy, 4, 0) - if (rc /= 4) then - print *, irp_here, 'f77_zmq_send( zmq_socket_push, idummy, 4, 0)' - stop 'error' - endif +! integer :: idummy +! rc = f77_zmq_recv( zmq_socket_push, idummy, 4, 0) +! if (rc /= 4) then +! print *, irp_here, 'f77_zmq_send( zmq_socket_push, idummy, 4, 0)' +! stop 'error' +! endif end + + subroutine ao_bielec_integrals_in_map_slave(thread,iproc) use map_module use f77_zmq @@ -107,7 +109,7 @@ subroutine ao_bielec_integrals_in_map_slave(thread,iproc) call push_integrals(zmq_socket_push, n_integrals, buffer_i, buffer_value, 0) enddo call compute_ao_integrals_jl(l,l,n_integrals,buffer_i,buffer_value) - call task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id,n_integrals) + call task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id) call push_integrals(zmq_socket_push, n_integrals, buffer_i, buffer_value, task_id) enddo @@ -127,7 +129,7 @@ subroutine pull_integrals(zmq_socket_pull, n_integrals, buffer_i, buffer_value, BEGIN_DOC ! How the collector pulls the computed integrals END_DOC - integer(ZMQ_PTR), intent(out) :: zmq_socket_pull + integer(ZMQ_PTR), intent(in) :: zmq_socket_pull integer, intent(out) :: n_integrals integer(key_kind), intent(out) :: buffer_i(*) real(integral_kind), intent(out) :: buffer_value(*) @@ -167,11 +169,11 @@ subroutine pull_integrals(zmq_socket_pull, n_integrals, buffer_i, buffer_value, endif ! Activate if zmq_socket_pull is a REP - rc = f77_zmq_send( zmq_socket_pull, 0, 4, 0) - if (rc /= 4) then - print *, irp_here, ' f77_zmq_send (zmq_socket_pull,...' - stop 'error' - endif +! rc = f77_zmq_send( zmq_socket_pull, 0, 4, 0) +! if (rc /= 4) then +! print *, irp_here, ' f77_zmq_send (zmq_socket_pull,...' +! stop 'error' +! endif end diff --git a/src/Utils/util.irp.f b/src/Utils/util.irp.f index a5904183..a0ea668d 100644 --- a/src/Utils/util.irp.f +++ b/src/Utils/util.irp.f @@ -324,6 +324,7 @@ double precision function u_dot_v(u,v,sze) t3 = t2+t2 t4 = t3+t2 u_dot_v = 0.d0 + !DIR$ VECTOR ALWAYS do i=1,t2 u_dot_v = u_dot_v + u(t1+i)*v(t1+i) + u(t2+i)*v(t2+i) + & u(t3+i)*v(t3+i) + u(t4+i)*v(t4+i) @@ -359,6 +360,7 @@ double precision function u_dot_u(u,sze) ! u_dot_u = u_dot_u+u(i)*u(i) ! enddo + !DIR$ VECTOR ALWAYS do i=1,sze u_dot_u = u_dot_u + u(i)*u(i) enddo diff --git a/src/ZMQ/utils.irp.f b/src/ZMQ/utils.irp.f index b161f26d..af97161b 100644 --- a/src/ZMQ/utils.irp.f +++ b/src/ZMQ/utils.irp.f @@ -217,8 +217,8 @@ function new_zmq_pull_socket() integer(ZMQ_PTR) :: new_zmq_pull_socket call omp_set_lock(zmq_lock) -! new_zmq_pull_socket = f77_zmq_socket(zmq_context, ZMQ_PULL) - new_zmq_pull_socket = f77_zmq_socket(zmq_context, ZMQ_REP) + new_zmq_pull_socket = f77_zmq_socket(zmq_context, ZMQ_PULL) +! new_zmq_pull_socket = f77_zmq_socket(zmq_context, ZMQ_REP) call omp_unset_lock(zmq_lock) if (new_zmq_pull_socket == 0_ZMQ_PTR) then stop 'Unable to create zmq pull socket' @@ -267,8 +267,8 @@ function new_zmq_push_socket(thread) integer(ZMQ_PTR) :: new_zmq_push_socket call omp_set_lock(zmq_lock) -! new_zmq_push_socket = f77_zmq_socket(zmq_context, ZMQ_PUSH) - new_zmq_push_socket = f77_zmq_socket(zmq_context, ZMQ_REQ) + new_zmq_push_socket = f77_zmq_socket(zmq_context, ZMQ_PUSH) +! new_zmq_push_socket = f77_zmq_socket(zmq_context, ZMQ_REQ) call omp_unset_lock(zmq_lock) if (new_zmq_push_socket == 0_ZMQ_PTR) then stop 'Unable to create zmq push socket' @@ -361,6 +361,8 @@ subroutine end_zmq_pull_socket(zmq_socket_pull) stop 'error' endif + call sleep(1) ! see https://github.com/zeromq/libzmq/issues/1922 + rc = f77_zmq_setsockopt(zmq_socket_pull,ZMQ_LINGER,0,4) if (rc /= 0) then stop 'Unable to set ZMQ_LINGER on zmq_socket_pull'