diff --git a/src/Davidson/davidson_parallel.irp.f b/src/Davidson/davidson_parallel.irp.f index 9bca4b14..76386c7b 100644 --- a/src/Davidson/davidson_parallel.irp.f +++ b/src/Davidson/davidson_parallel.irp.f @@ -64,8 +64,8 @@ subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, N_st, sze, integer :: imin, imax, ishift, istep integer, allocatable :: psi_det_read(:,:,:) - double precision, allocatable :: v_0(:,:), s_0(:,:), u_t(:,:) - !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: u_t, v_0, s_0 + double precision, allocatable :: v_t(:,:), s_t(:,:), u_t(:,:) + !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: u_t, v_t, s_t ! Get wave function (u_t) ! ----------------------- @@ -108,7 +108,7 @@ subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, N_st, sze, TOUCH N_det endif - allocate(v_0(sze,N_st), s_0(sze,N_st),u_t(N_st,N_det_read)) + allocate(u_t(N_st,N_det_read)) rc = f77_zmq_recv(zmq_to_qp_run_socket,psi_det,N_int*2*N_det_read*bit_kind,0) if (rc /= N_int*2*N_det_read*bit_kind) then @@ -133,41 +133,50 @@ subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, N_st, sze, ! --------- + allocate(v_t(N_st,N_det), s_t(N_st,N_det)) do - v_0 = 0.d0 - s_0 = 0.d0 call get_task_from_taskserver(zmq_to_qp_run_socket,worker_id, task_id, msg) if(task_id == 0) exit read (msg,*) imin, imax, ishift, istep - call H_S2_u_0_nstates_openmp_work(v_0,s_0,u_t,N_st,N_det,imin,imax,ishift,istep) + v_t = 0.d0 + s_t = 0.d0 + call H_S2_u_0_nstates_openmp_work(v_t,s_t,u_t,N_st,N_det,imin,imax,ishift,istep) call task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id) - call davidson_push_results(zmq_socket_push, v_0, s_0, task_id) + call davidson_push_results(zmq_socket_push, v_t, s_t, imin, imax, task_id) end do - deallocate(v_0, s_0, u_t) + deallocate(u_t,v_t, s_t) end subroutine -subroutine davidson_push_results(zmq_socket_push, v_0, s_0, task_id) +subroutine davidson_push_results(zmq_socket_push, v_t, s_t, imin, imax, task_id) use f77_zmq implicit none integer(ZMQ_PTR) ,intent(in) :: zmq_socket_push - integer ,intent(in) :: task_id - double precision ,intent(in) :: v_0(N_det,N_states_diag) - double precision ,intent(in) :: s_0(N_det,N_states_diag) - integer :: rc + integer ,intent(in) :: task_id, imin, imax + double precision ,intent(in) :: v_t(N_states_diag,N_det) + double precision ,intent(in) :: s_t(N_states_diag,N_det) + integer :: rc, sz - rc = f77_zmq_send( zmq_socket_push, v_0, 8*N_states_diag*N_det, ZMQ_SNDMORE) - if(rc /= 8*N_states_diag* N_det) stop "davidson_push_results failed to push vt" + sz = (imax-imin+1)*N_states_diag - rc = f77_zmq_send( zmq_socket_push, s_0, 8*N_states_diag*N_det, ZMQ_SNDMORE) - if(rc /= 8*N_states_diag* N_det) stop "davidson_push_results failed to push st" - - rc = f77_zmq_send( zmq_socket_push, task_id, 4, 0) + rc = f77_zmq_send( zmq_socket_push, task_id, 4, ZMQ_SNDMORE) if(rc /= 4) stop "davidson_push_results failed to push task_id" + rc = f77_zmq_send( zmq_socket_push, imin, 4, ZMQ_SNDMORE) + if(rc /= 4) stop "davidson_push_results failed to push imin" + + rc = f77_zmq_send( zmq_socket_push, imax, 4, ZMQ_SNDMORE) + if(rc /= 4) stop "davidson_push_results failed to push imax" + + rc = f77_zmq_send( zmq_socket_push, v_t(1,imin), 8*sz, ZMQ_SNDMORE) + if(rc /= 8*sz) stop "davidson_push_results failed to push vt" + + rc = f77_zmq_send( zmq_socket_push, s_t(1,imin), 8*sz, 0) + if(rc /= 8*sz) stop "davidson_push_results failed to push st" + ! Activate is zmq_socket_push is a REQ IRP_IF ZMQ_PUSH IRP_ELSE @@ -183,26 +192,34 @@ end subroutine -subroutine davidson_pull_results(zmq_socket_pull, v_0, s_0, task_id) +subroutine davidson_pull_results(zmq_socket_pull, v_t, s_t, imin, imax, task_id) use f77_zmq implicit none integer(ZMQ_PTR) ,intent(in) :: zmq_socket_pull - integer ,intent(out) :: task_id - double precision ,intent(out) :: v_0(N_det,N_states_diag) - double precision ,intent(out) :: s_0(N_det,N_states_diag) + integer ,intent(out) :: task_id, imin, imax + double precision ,intent(out) :: v_t(N_states_diag,N_det) + double precision ,intent(out) :: s_t(N_states_diag,N_det) - integer :: rc - - rc = f77_zmq_recv( zmq_socket_pull, v_0, 8*N_det*N_states_diag, 0) - if(rc /= 8*N_det*N_states_diag) stop "davidson_push_results failed to pull v_0" - - rc = f77_zmq_recv( zmq_socket_pull, s_0, 8*N_det*N_states_diag, 0) - if(rc /= 8*N_det*N_states_diag) stop "davidson_push_results failed to pull s_0" + integer :: rc, sz rc = f77_zmq_recv( zmq_socket_pull, task_id, 4, 0) if(rc /= 4) stop "davidson_pull_results failed to pull task_id" + rc = f77_zmq_recv( zmq_socket_pull, imin, 4, 0) + if(rc /= 4) stop "davidson_pull_results failed to pull task_id" + + rc = f77_zmq_recv( zmq_socket_pull, imax, 4, 0) + if(rc /= 4) stop "davidson_pull_results failed to pull task_id" + + sz = (imax-imin+1)*N_states_diag + + rc = f77_zmq_recv( zmq_socket_pull, v_t(1,imin), 8*sz, 0) + if(rc /= 8*sz) stop "davidson_pull_results failed to pull v_t" + + rc = f77_zmq_recv( zmq_socket_pull, s_t(1,imin), 8*sz, 0) + if(rc /= 8*sz) stop "davidson_pull_results failed to pull s_t" + ! Activate if zmq_socket_pull is a REP IRP_IF ZMQ_PUSH IRP_ELSE @@ -227,29 +244,29 @@ subroutine davidson_collector(zmq_to_qp_run_socket, v0, s0, sze, N_st) double precision ,intent(inout) :: v0(sze, N_st) double precision ,intent(inout) :: s0(sze, N_st) - integer :: more, task_id + integer :: more, task_id, imin, imax - double precision, allocatable :: v_0(:,:), s_0(:,:) + double precision, allocatable :: v_t(:,:), s_t(:,:) integer :: i,j integer(ZMQ_PTR), external :: new_zmq_pull_socket integer(ZMQ_PTR) :: zmq_socket_pull - allocate(v_0(N_det,N_st), s_0(N_det,N_st)) + allocate(v_t(N_st,N_det), s_t(N_st,N_det)) v0 = 0.d0 s0 = 0.d0 more = 1 zmq_socket_pull = new_zmq_pull_socket() do while (more == 1) - call davidson_pull_results(zmq_socket_pull, v_0, s_0, task_id) + call davidson_pull_results(zmq_socket_pull, v_t, s_t, imin, imax, task_id) do j=1,N_st - do i=1,N_det - v0(i,j) = v0(i,j) + v_0(i,j) - s0(i,j) = s0(i,j) + s_0(i,j) + do i=imin,imax + v0(i,j) = v0(i,j) + v_t(j,i) + s0(i,j) = s0(i,j) + s_t(j,i) enddo enddo call zmq_delete_task(zmq_to_qp_run_socket,zmq_socket_pull,task_id,more) end do - deallocate(v_0,s_0) + deallocate(v_t,s_t) call end_zmq_pull_socket(zmq_socket_pull) end subroutine @@ -349,16 +366,15 @@ subroutine H_S2_u_0_nstates_zmq(v_0,s_0,u_0,N_st,sze) integer :: istep, imin, imax, ishift double precision :: w, max_workload, N_det_inv, di - max_workload = 1000000.d0 w = 0.d0 - istep=8 + istep=1 ishift=0 imin=1 N_det_inv = 1.d0/dble(N_det) di = dble(N_det) + max_workload = 50000.d0 do imax=1,N_det - di = di-1.d0 - w = w + di*N_det_inv + w = w + 1.d0 if (w > max_workload) then do ishift=0,istep-1 write(task,'(4(I9,1X),1A)') imin, imax, ishift, istep, '|' diff --git a/src/Davidson/u0Hu0.irp.f b/src/Davidson/u0Hu0.irp.f index 38576cd1..1fbf00e0 100644 --- a/src/Davidson/u0Hu0.irp.f +++ b/src/Davidson/u0Hu0.irp.f @@ -21,14 +21,14 @@ subroutine H_S2_u_0_nstates_openmp(v_0,s_0,u_0,N_st,sze) integer, intent(in) :: N_st,sze double precision, intent(inout) :: v_0(sze,N_st), s_0(sze,N_st), u_0(sze,N_st) integer :: k - double precision, allocatable :: u_t(:,:) + double precision, allocatable :: u_t(:,:), v_t(:,:), s_t(:,:) !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: u_t - allocate(u_t(N_st,N_det)) + allocate(u_t(N_st,N_det),v_t(N_st,N_det),s_t(N_st,N_det)) do k=1,N_st call dset_order(u_0(1,k),psi_bilinear_matrix_order,N_det) enddo - v_0 = 0.d0 - s_0 = 0.d0 + v_t = 0.d0 + s_t = 0.d0 call dtranspose( & u_0, & size(u_0, 1), & @@ -36,9 +36,23 @@ subroutine H_S2_u_0_nstates_openmp(v_0,s_0,u_0,N_st,sze) size(u_t, 1), & N_det, N_st) - call H_S2_u_0_nstates_openmp_work(v_0,s_0,u_t,N_st,sze,1,N_det,0,1) + call H_S2_u_0_nstates_openmp_work(v_t,s_t,u_t,N_st,sze,1,N_det,0,1) deallocate(u_t) + call dtranspose( & + v_t, & + size(v_t, 1), & + v_0, & + size(v_0, 1), & + N_st, N_det) + call dtranspose( & + s_t, & + size(s_t, 1), & + s_0, & + size(s_0, 1), & + N_st, N_det) + deallocate(v_t,s_t) + do k=1,N_st call dset_order(v_0(1,k),psi_bilinear_matrix_order_reverse,N_det) call dset_order(s_0(1,k),psi_bilinear_matrix_order_reverse,N_det) @@ -48,47 +62,47 @@ subroutine H_S2_u_0_nstates_openmp(v_0,s_0,u_0,N_st,sze) end -subroutine H_S2_u_0_nstates_openmp_work(v_0,s_0,u_t,N_st,sze,istart,iend,ishift,istep) +subroutine H_S2_u_0_nstates_openmp_work(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep) use bitmasks implicit none BEGIN_DOC - ! Computes v_0 = H|u_0> and s_0 = S^2 |u_0> + ! Computes v_t = H|u_t> and s_t = S^2 |u_t> ! ! Default should be 1,N_det,0,1 END_DOC integer, intent(in) :: N_st,sze,istart,iend,ishift,istep double precision, intent(in) :: u_t(N_st,N_det) - double precision, intent(out) :: v_0(sze,N_st), s_0(sze,N_st) + double precision, intent(out) :: v_t(N_st,sze), s_t(N_st,sze) PROVIDE ref_bitmask_energy N_int select case (N_int) case (1) - call H_S2_u_0_nstates_openmp_work_1(v_0,s_0,u_t,N_st,sze,istart,iend,ishift,istep) + call H_S2_u_0_nstates_openmp_work_1(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep) case (2) - call H_S2_u_0_nstates_openmp_work_2(v_0,s_0,u_t,N_st,sze,istart,iend,ishift,istep) + call H_S2_u_0_nstates_openmp_work_2(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep) case (3) - call H_S2_u_0_nstates_openmp_work_3(v_0,s_0,u_t,N_st,sze,istart,iend,ishift,istep) + call H_S2_u_0_nstates_openmp_work_3(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep) case (4) - call H_S2_u_0_nstates_openmp_work_4(v_0,s_0,u_t,N_st,sze,istart,iend,ishift,istep) + call H_S2_u_0_nstates_openmp_work_4(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep) case default - call H_S2_u_0_nstates_openmp_work_N_int(v_0,s_0,u_t,N_st,sze,istart,iend,ishift,istep) + call H_S2_u_0_nstates_openmp_work_N_int(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep) end select end BEGIN_TEMPLATE -subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_0,s_0,u_t,N_st,sze,istart,iend,ishift,istep) +subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep) use bitmasks implicit none BEGIN_DOC - ! Computes v_0 = H|u_0> and s_0 = S^2 |u_0> + ! Computes v_t = H|u_t> and s_t = S^2 |u_t> ! ! Default should be 1,N_det,0,1 END_DOC integer, intent(in) :: N_st,sze,istart,iend,ishift,istep double precision, intent(in) :: u_t(N_st,N_det) - double precision, intent(out) :: v_0(sze,N_st), s_0(sze,N_st) + double precision, intent(out) :: v_t(N_st,sze), s_t(N_st,sze) double precision :: hij, sij integer :: i,j,k,l @@ -109,8 +123,6 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_0,s_0,u_t,N_st,sze,istart,iend, integer, allocatable :: idx(:), idx0(:) integer :: maxab, n_singles_a, n_singles_b, kcol_prev integer*8 :: k8 - double precision, allocatable :: v_t(:,:), s_t(:,:) - !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: v_t, s_t maxab = max(N_det_alpha_unique, N_det_beta_unique)+1 allocate(idx0(maxab)) @@ -133,14 +145,15 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_0,s_0,u_t,N_st,sze,istart,iend, !$OMP psi_bilinear_matrix_transp_order, N_st, & !$OMP psi_bilinear_matrix_order_transp_reverse, & !$OMP psi_bilinear_matrix_columns_loc, & - !$OMP istart, iend, istep, irp_here, & - !$OMP ishift, idx0, u_t, maxab, v_0, s_0) & + !$OMP psi_bilinear_matrix_transp_rows_loc, & + !$OMP istart, iend, istep, irp_here, v_t, s_t, & + !$OMP ishift, idx0, u_t, maxab) & !$OMP PRIVATE(krow, kcol, tmp_det, spindet, k_a, k_b, i, & !$OMP lcol, lrow, l_a, l_b, & !$OMP buffer, doubles, n_doubles, & - !$OMP tmp_det2, hij, sij, idx, l, kcol_prev, v_t, & + !$OMP tmp_det2, hij, sij, idx, l, kcol_prev, & !$OMP singles_a, n_singles_a, singles_b, & - !$OMP n_singles_b, s_t, k8) + !$OMP n_singles_b, k8) ! Alpha/Beta double excitations ! ============================= @@ -149,12 +162,9 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_0,s_0,u_t,N_st,sze,istart,iend, singles_a(maxab), & singles_b(maxab), & doubles(maxab), & - idx(maxab), & - v_t(N_st,N_det), s_t(N_st,N_det)) - kcol_prev=-1 + idx(maxab)) - v_t = 0.d0 - s_t = 0.d0 + kcol_prev=-1 ASSERT (iend <= N_det) ASSERT (istart > 0) @@ -168,20 +178,20 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_0,s_0,u_t,N_st,sze,istart,iend, kcol = psi_bilinear_matrix_columns(k_a) ASSERT (kcol <= N_det_beta_unique) - + tmp_det(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, krow) tmp_det(1:$N_int,2) = psi_det_beta_unique (1:$N_int, kcol) - + if (kcol /= kcol_prev) then call get_all_spin_singles_$N_int( & - psi_det_beta_unique(1,kcol+1), idx0(kcol+1), & - tmp_det(1,2), N_det_beta_unique-kcol, & + psi_det_beta_unique, idx0, & + tmp_det(1,2), N_det_beta_unique, & singles_b, n_singles_b) endif kcol_prev = kcol - ! Loop over singly excited beta columns > current column - ! ------------------------------------------------------ + ! Loop over singly excited beta columns + ! ------------------------------------- do i=1,n_singles_b lcol = singles_b(i) @@ -202,7 +212,7 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_0,s_0,u_t,N_st,sze,istart,iend, l_a = l_a+1 enddo j = j-1 - + call get_all_spin_singles_$N_int( & buffer, idx, tmp_det(1,1), j, & singles_a, n_singles_a ) @@ -223,8 +233,6 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_0,s_0,u_t,N_st,sze,istart,iend, do l=1,N_st v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,l_a) s_t(l,k_a) = s_t(l,k_a) + sij * u_t(l,l_a) - v_t(l,l_a) = v_t(l,l_a) + hij * u_t(l,k_a) - s_t(l,l_a) = s_t(l,l_a) + sij * u_t(l,k_a) enddo enddo @@ -262,7 +270,8 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_0,s_0,u_t,N_st,sze,istart,iend, spindet(1:$N_int) = tmp_det(1:$N_int,1) ! Loop inside the beta column to gather all the connected alphas - l_a = k_a+1 + lcol = psi_bilinear_matrix_columns(k_a) + l_a = psi_bilinear_matrix_columns_loc(lcol) do i=1,N_det_alpha_unique if (l_a > N_det) exit lcol = psi_bilinear_matrix_columns(l_a) @@ -295,7 +304,6 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_0,s_0,u_t,N_st,sze,istart,iend, call i_H_j_mono_spin( tmp_det, tmp_det2, $N_int, 1, hij) do l=1,N_st - v_t(l,l_a) = v_t(l,l_a) + hij * u_t(l,k_a) v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,l_a) ! single => sij = 0 enddo @@ -314,7 +322,6 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_0,s_0,u_t,N_st,sze,istart,iend, call i_H_j_double_spin( tmp_det(1,1), psi_det_alpha_unique(1, lrow), $N_int, hij) do l=1,N_st - v_t(l,l_a) = v_t(l,l_a) + hij * u_t(l,k_a) v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,l_a) ! same spin => sij = 0 enddo @@ -343,7 +350,8 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_0,s_0,u_t,N_st,sze,istart,iend, ASSERT (k_b <= N_det) ! Loop inside the alpha row to gather all the connected betas - l_b = k_b+1 + lrow = psi_bilinear_matrix_transp_rows(k_b) + l_b = psi_bilinear_matrix_transp_rows_loc(lrow) do i=1,N_det_beta_unique if (l_b > N_det) exit lrow = psi_bilinear_matrix_transp_rows(l_b) @@ -377,7 +385,6 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_0,s_0,u_t,N_st,sze,istart,iend, l_a = psi_bilinear_matrix_transp_order(l_b) ASSERT (l_a <= N_det) do l=1,N_st - v_t(l,l_a) = v_t(l,l_a) + hij * u_t(l,k_a) v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,l_a) ! single => sij = 0 enddo @@ -398,7 +405,6 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_0,s_0,u_t,N_st,sze,istart,iend, ASSERT (l_a <= N_det) do l=1,N_st - v_t(l,l_a) = v_t(l,l_a) + hij * u_t(l,k_a) v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,l_a) ! same spin => sij = 0 enddo @@ -431,20 +437,8 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_0,s_0,u_t,N_st,sze,istart,iend, enddo end do - !$OMP END DO NOWAIT + !$OMP END DO deallocate(buffer, singles_a, singles_b, doubles, idx) - - !$OMP CRITICAL - do l=1,N_st - do i=1, N_det - v_0(i,l) = v_0(i,l) + v_t(l,i) - s_0(i,l) = s_0(i,l) + s_t(l,i) - enddo - enddo - !$OMP END CRITICAL - deallocate(v_t, s_t) - - !$OMP BARRIER !$OMP END PARALLEL end diff --git a/src/Davidson/u0Hu0_old.irp.f b/src/Davidson/u0Hu0_old.irp.f index 142197d6..5fc68f04 100644 --- a/src/Davidson/u0Hu0_old.irp.f +++ b/src/Davidson/u0Hu0_old.irp.f @@ -500,7 +500,7 @@ subroutine H_S2_u_0_nstates_test(v_0,s_0,u_0,H_jj,S2_jj,n,keys_tmp,Nint,N_st,sze ! if (exc(0,1,2) /= 0) cycle ! if (exc(0,1,1) == 2) cycle ! if (exc(0,1,2) == 2) cycle - if ((degree==1).and.(exc(0,1,1) == 1)) cycle +! if ((degree==1).and.(exc(0,1,1) == 1)) cycle call i_H_j(keys_tmp(1,1,j),keys_tmp(1,1,i),Nint,hij) do l=1,N_st !$OMP ATOMIC