diff --git a/plugins/Full_CI_ZMQ/selection_davidson_slave.irp.f b/plugins/Full_CI_ZMQ/selection_davidson_slave.irp.f
index a25ff56d..6fc59cc9 100644
--- a/plugins/Full_CI_ZMQ/selection_davidson_slave.irp.f
+++ b/plugins/Full_CI_ZMQ/selection_davidson_slave.irp.f
@@ -140,6 +140,16 @@ subroutine run_wf
       call write_double(6,(t1-t0),'Broadcast time')
     endif
 
+    call wall_time(t0)
+    if (.True.) then
+      PROVIDE psi_bilinear_matrix_columns_loc psi_det_alpha_unique psi_det_beta_unique
+      PROVIDE psi_bilinear_matrix_rows psi_det_sorted_order psi_bilinear_matrix_order
+      PROVIDE psi_bilinear_matrix_transp_rows_loc psi_bilinear_matrix_transp_columns
+      PROVIDE psi_bilinear_matrix_transp_order
+    endif
+    call wall_time(t1)
+    call write_double(6,(t1-t0),'Sort time')
+
     call omp_set_nested(.True.)
     call davidson_slave_tcp(0)
     call omp_set_nested(.False.)
diff --git a/src/Davidson/davidson_parallel.irp.f b/src/Davidson/davidson_parallel.irp.f
index 8d580083..c03b6e5e 100644
--- a/src/Davidson/davidson_parallel.irp.f
+++ b/src/Davidson/davidson_parallel.irp.f
@@ -84,6 +84,11 @@ subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, N_st, sze,
   integer, external :: zmq_get_dvector
   integer, external :: zmq_get_dmatrix
 
+  PROVIDE psi_det_beta_unique psi_bilinear_matrix_order_transp_reverse psi_det_alpha_unique
+  PROVIDE psi_bilinear_matrix_transp_values psi_bilinear_matrix_values psi_bilinear_matrix_columns_loc
+  PROVIDE ref_bitmask_energy nproc
+  PROVIDE mpi_initialized
+
   allocate(u_t(N_st,N_det))
   allocate (energy(N_st))
 
@@ -96,11 +101,11 @@ subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, N_st, sze,
       ni = 8388608
       nj = int(size(u_t,kind=8)/8388608_8,4) + 1
     endif
-    if (zmq_get_dmatrix(zmq_to_qp_run_socket, worker_id, 'u_t', u_t, ni, nj, size(u_t,kind=8)) == -1) then
-      print *, irp_here, ': Unable to get u_t'
-      deallocate(u_t,energy)
-      return
-    endif
+
+    do while (zmq_get_dmatrix(zmq_to_qp_run_socket, worker_id, 'u_t', u_t, ni, nj, size(u_t,kind=8)) == -1)
+      call sleep(1)
+      print *, irp_here, ': waiting for u_t...'
+    enddo
 
     if (zmq_get_dvector(zmq_to_qp_run_socket, worker_id, 'energy', energy, size(energy)) == -1) then
       print *, irp_here, ': Unable to get energy'
@@ -293,66 +298,19 @@ subroutine H_S2_u_0_nstates_zmq(v_0,s_0,u_0,N_st,sze)
   integer                        :: ithread
   double precision, allocatable  :: u_t(:,:)
   !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: u_t
-
-  PROVIDE psi_det_beta_unique psi_bilinear_matrix_order_transp_reverse psi_det_alpha_unique
-  PROVIDE psi_bilinear_matrix_transp_values psi_bilinear_matrix_values psi_bilinear_matrix_columns_loc
-  PROVIDE ref_bitmask_energy nproc
-  PROVIDE mpi_initialized
-
-
-  allocate(u_t(N_st,N_det))
-  do k=1,N_st
-    call dset_order(u_0(1,k),psi_bilinear_matrix_order,N_det)
-  enddo
-  call dtranspose( &
-      u_0, &
-      size(u_0, 1), &
-      u_t, &
-      size(u_t, 1), &
-      N_det, N_st)
-
-
   integer(ZMQ_PTR) :: zmq_to_qp_run_socket, zmq_socket_pull
-
-  ASSERT (N_st == N_states_diag)
-  ASSERT (sze >= N_det)
-
   call new_parallel_job(zmq_to_qp_run_socket,zmq_socket_pull,'davidson')
-  character*(512) :: task
-  integer :: rc, ni, nj
-  integer*8 :: rc8
-  double precision :: energy(N_st)
-
-  integer, external :: zmq_put_dvector, zmq_put_psi, zmq_put_N_states_diag
-  integer, external :: zmq_put_dmatrix
-
-  energy = 0.d0
-
-  if (zmq_put_N_states_diag(zmq_to_qp_run_socket, 1) == -1) then
-    stop 'Unable to put N_states_diag on ZMQ server'
-  endif
 
   if (zmq_put_psi(zmq_to_qp_run_socket,1) == -1) then
     stop 'Unable to put psi on ZMQ server'
   endif
+  if (zmq_put_N_states_diag(zmq_to_qp_run_socket, 1) == -1) then
+    stop 'Unable to put N_states_diag on ZMQ server'
+  endif
+  energy = 0.d0
 
   if (zmq_put_dvector(zmq_to_qp_run_socket,1,'energy',energy,size(energy)) == -1) then
     stop 'Unable to put energy on ZMQ server'
   endif
-  if (size(u_t) < 8388608) then
-    ni = size(u_t)
-    nj = 1
-  else
-    ni = 8388608
-    nj = size(u_t)/8388608 + 1
-  endif
-  ! Warning : dimensions are modified for efficiency, It is OK since we get the
-  ! full matrix
-  if (zmq_put_dmatrix(zmq_to_qp_run_socket, 1, 'u_t', u_t, ni, nj, size(u_t,kind=8)) == -1) then
-    stop 'Unable to put u_t on ZMQ server'
-  endif
-
-  deallocate(u_t)
-
   ! Create tasks
   ! ============
@@ -390,14 +348,61 @@ subroutine H_S2_u_0_nstates_zmq(v_0,s_0,u_0,N_st,sze)
   endif
 
-  v_0 = 0.d0
-  s_0 = 0.d0
-
   integer, external :: zmq_set_running
   if (zmq_set_running(zmq_to_qp_run_socket) == -1) then
     print *, irp_here, ': Failed in zmq_set_running'
   endif
 
+  if (.True.) then
+    PROVIDE psi_det_beta_unique psi_bilinear_matrix_order_transp_reverse psi_det_alpha_unique
+    PROVIDE psi_bilinear_matrix_transp_values psi_bilinear_matrix_values psi_bilinear_matrix_columns_loc
+    PROVIDE ref_bitmask_energy nproc
+    PROVIDE mpi_initialized
+  endif
+
+  allocate(u_t(N_st,N_det))
+  do k=1,N_st
+    call dset_order(u_0(1,k),psi_bilinear_matrix_order,N_det)
+  enddo
+
+  call dtranspose( &
+      u_0, &
+      size(u_0, 1), &
+      u_t, &
+      size(u_t, 1), &
+      N_det, N_st)
+
+
+  ASSERT (N_st == N_states_diag)
+  ASSERT (sze >= N_det)
+
+  character*(512) :: task
+  integer :: rc, ni, nj
+  integer*8 :: rc8
+  double precision :: energy(N_st)
+
+  integer, external :: zmq_put_dvector, zmq_put_psi, zmq_put_N_states_diag
+  integer, external :: zmq_put_dmatrix
+
+  if (size(u_t) < 8388608) then
+    ni = size(u_t)
+    nj = 1
+  else
+    ni = 8388608
+    nj = size(u_t)/8388608 + 1
+  endif
+  ! Warning : dimensions are modified for efficiency, It is OK since we get the
+  ! full matrix
+  if (zmq_put_dmatrix(zmq_to_qp_run_socket, 1, 'u_t', u_t, ni, nj, size(u_t,kind=8)) == -1) then
+    stop 'Unable to put u_t on ZMQ server'
+  endif
+
+  deallocate(u_t)
+
+
+  v_0 = 0.d0
+  s_0 = 0.d0
+
   call omp_set_nested(.True.)
 
   !$OMP PARALLEL NUM_THREADS(2) PRIVATE(ithread)
   ithread = omp_get_thread_num()
diff --git a/src/DavidsonUndressed/print_energy.irp.f b/src/DavidsonUndressed/print_energy.irp.f
index d694cb6c..94165560 100644
--- a/src/DavidsonUndressed/print_energy.irp.f
+++ b/src/DavidsonUndressed/print_energy.irp.f
@@ -2,7 +2,7 @@ program print_energy
   implicit none
   read_wf = .true.
   touch read_wf
-  provide mo_bielec_integrals_in_map
+  provide mo_bielec_integrals_in_map psi_coef psi_det psi_bilinear_matrix_transp_values
   double precision :: time1, time0
   call wall_time(time0)
   call routine
diff --git a/src/Determinants/spindeterminants.irp.f b/src/Determinants/spindeterminants.irp.f
index 9a39212c..29d0eb30 100644
--- a/src/Determinants/spindeterminants.irp.f
+++ b/src/Determinants/spindeterminants.irp.f
@@ -519,7 +519,7 @@ BEGIN_PROVIDER [ double precision, psi_bilinear_matrix_transp_values, (N_det,N_
     do k=1,N_det
       psi_bilinear_matrix_transp_values (k,l) = psi_bilinear_matrix_values (k,l)
     enddo
-    !$OMP ENDDO
+    !$OMP ENDDO NOWAIT
   enddo
   !$OMP DO
   do k=1,N_det
diff --git a/src/Utils/sort.irp.f b/src/Utils/sort.irp.f
index a9594d6c..085d3d35 100644
--- a/src/Utils/sort.irp.f
+++ b/src/Utils/sort.irp.f
@@ -456,16 +456,16 @@ BEGIN_TEMPLATE
         iorder(i) = iorder1(1_$int_type+i1-i)
       enddo
     endif
-    deallocate(x1,iorder1,stat=err)
-    if (err /= 0) then
-      print *, irp_here, ': Unable to deallocate arrays x1, iorder1'
-      stop
-    endif
 
     if (i2>1_$int_type) then
       call $Xradix_sort$big(x(i1+1_$int_type),iorder(i1+1_$int_type),i2,-2)
     endif
 
+    deallocate(x1,iorder1,stat=err)
+    if (err /= 0) then
+      print *, irp_here, ': Unable to deallocate arrays x1, iorder1'
+      stop
+    endif
     return
 
   else if (iradix == -2) then ! Positive
@@ -526,13 +526,23 @@ BEGIN_TEMPLATE
     endif
 
 
+    !$OMP PARALLEL DEFAULT(SHARED) if (isize > 1000000)
+    !$OMP SINGLE
     if (i3>1_$int_type) then
+      !$OMP TASK FIRSTPRIVATE(iradix_new,i3) SHARED(x,iorder) if(i3 > 1000000)
       call $Xradix_sort$big(x,iorder,i3,iradix_new-1)
+      !$OMP END TASK
     endif
 
    if (isize-i3>1_$int_type) then
+      !$OMP TASK FIRSTPRIVATE(iradix_new,i3) SHARED(x,iorder) if(isize-i3 > 1000000)
       call $Xradix_sort$big(x(i3+1_$int_type),iorder(i3+1_$int_type),isize-i3,iradix_new-1)
+      !$OMP END TASK
     endif
+
+    !$OMP TASKWAIT
+    !$OMP END SINGLE
+    !$OMP END PARALLEL
 
     return
   endif
@@ -588,11 +598,16 @@ BEGIN_TEMPLATE
 
     if (i1>1_$int_type) then
+      !$OMP TASK FIRSTPRIVATE(i0,iradix,i1) SHARED(x,iorder) if(i1 >1000000)
       call $Xradix_sort$big(x(i0+1_$int_type),iorder(i0+1_$int_type),i1,iradix-1)
+      !$OMP END TASK
     endif
 
     if (i0>1) then
+      !$OMP TASK FIRSTPRIVATE(i0,iradix) SHARED(x,iorder) if(i0 >1000000)
      call $Xradix_sort$big(x,iorder,i0,iradix-1)
+      !$OMP END TASK
     endif
+    !$OMP TASKWAIT
 
 
  end
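
Note on the sort.irp.f hunks above: the recursive radix sort is parallelized with OpenMP tasks. A single thread enters the SINGLE region, each recursive half is spawned as a TASK whose if() clause keeps partitions of about 10^6 elements or fewer serial (avoiding task overhead on small subproblems), and TASKWAIT joins both halves before the routine returns. Below is a minimal standalone sketch of that pattern; the program, the routine name rec_scale, and the array-doubling leaf work are illustrative stand-ins, not code from this patch.

    ! Sketch of the SINGLE + recursive TASK + TASKWAIT pattern used in
    ! $Xradix_sort$big above. Compile with: gfortran -fopenmp demo.f90
    program task_pattern_demo
      implicit none
      integer, parameter :: n = 4000000
      double precision, allocatable :: x(:)
      allocate(x(n))
      call random_number(x)
      !$OMP PARALLEL DEFAULT(SHARED)
      !$OMP SINGLE
      call rec_scale(x, 1, n)        ! one thread seeds the task tree
      !$OMP END SINGLE
      !$OMP END PARALLEL
      print *, 'x(1) =', x(1)
      deallocate(x)
    contains
      recursive subroutine rec_scale(x, lo, hi)
        double precision, intent(inout) :: x(:)
        integer, intent(in)             :: lo, hi
        integer                         :: mid
        if (hi-lo+1 <= 1000000) then   ! below the threshold: stay serial
          x(lo:hi) = 2.d0*x(lo:hi)     ! placeholder leaf work
          return
        endif
        mid = (lo+hi)/2
        !$OMP TASK FIRSTPRIVATE(lo,mid) SHARED(x) if(mid-lo+1 > 1000000)
        call rec_scale(x, lo, mid)
        !$OMP END TASK
        !$OMP TASK FIRSTPRIVATE(mid,hi) SHARED(x) if(hi-mid > 1000000)
        call rec_scale(x, mid+1, hi)
        !$OMP END TASK
        !$OMP TASKWAIT                 ! join both halves before returning
      end subroutine rec_scale
    end program task_pattern_demo

The sort's task-spawning PARALLEL region can be entered from inside another parallel section, which is presumably why run_wf and H_S2_u_0_nstates_zmq call omp_set_nested(.True.) around their two-thread PARALLEL blocks: with nesting disabled, the inner region would run on a single thread.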