diff --git a/src/cipsi/cipsi.irp.f b/src/cipsi/cipsi.irp.f index ba922c49..66881b28 100644 --- a/src/cipsi/cipsi.irp.f +++ b/src/cipsi/cipsi.irp.f @@ -45,13 +45,19 @@ subroutine run_cipsi if (N_det > N_det_max) then psi_det = psi_det_sorted - psi_coef = psi_coef_sorted - N_det = N_det_max - soft_touch N_det psi_det psi_coef + if (is_complex) then + psi_coef_complex = psi_coef_sorted_complex + N_det = N_det_max + soft_touch N_det psi_det psi_coef_complex + else + psi_coef = psi_coef_sorted + N_det = N_det_max + soft_touch N_det psi_det psi_coef + endif if (s2_eig) then call make_s2_eigenfunction endif - call diagonalize_CI + call diagonalize_ci call save_wavefunction endif @@ -109,8 +115,11 @@ subroutine run_cipsi to_select = int(sqrt(dble(N_states))*dble(N_det)*selection_factor) to_select = max(N_states_diag, to_select) call ZMQ_selection(to_select, pt2, variance, norm) - - PROVIDE psi_coef + if (is_complex) then + PROVIDE psi_coef_complex + else + PROVIDE psi_coef + endif PROVIDE psi_det PROVIDE psi_det_sorted diff --git a/src/davidson/davidson_parallel.irp.f b/src/davidson/davidson_parallel.irp.f index c0d94b35..b98ec377 100644 --- a/src/davidson/davidson_parallel.irp.f +++ b/src/davidson/davidson_parallel.irp.f @@ -89,21 +89,93 @@ subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, N_st, sze, character*(512) :: msg integer :: imin, imax, ishift, istep - integer, allocatable :: psi_det_read(:,:,:) - double precision, allocatable :: v_t(:,:), s_t(:,:), u_t(:,:) - - !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: u_t, v_t, s_t - - ! Get wave function (u_t) - ! ----------------------- - integer :: rc, ni, nj integer*8 :: rc8 integer :: N_states_read, N_det_read, psi_det_size_read integer :: N_det_selectors_read, N_det_generators_read - integer, external :: zmq_get_dvector + integer, allocatable :: psi_det_read(:,:,:) + logical :: sending + integer, external :: get_task_from_taskserver + integer, external :: task_done_to_taskserver + integer :: k + integer :: ierr + + +! integer, external :: zmq_get_dvector integer, external :: zmq_get_dmatrix + + if (is_complex) then + complex*16, allocatable :: v_tc(:,:), s_tc(:,:), u_tc(:,:) + + !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: u_tc, v_tc, s_tc + + + ! Get wave function (u_tc) + ! ----------------------- + + PROVIDE psi_det_beta_unique psi_bilinear_matrix_order_transp_reverse psi_det_alpha_unique + PROVIDE psi_bilinear_matrix_transp_values_complex psi_bilinear_matrix_values_complex psi_bilinear_matrix_columns_loc + PROVIDE ref_bitmask_energy nproc + PROVIDE mpi_initialized + + allocate(u_tc(N_st,N_det)) + + !todo: resize for complex? + ! Warning : dimensions are modified for efficiency, It is OK since we get the + ! full matrix + if (size(u_tc,kind=8) < 8388608_8) then + ni = size(u_tc) + nj = 1 + else + ni = 8388608 + nj = int(size(u_tc,kind=8)/8388608_8,4) + 1 + endif + + do while (zmq_get_cdmatrix(zmq_to_qp_run_socket, worker_id, 'u_tc', u_tc, ni, nj, size(u_tc,kind=8)) == -1) + print *, 'mpi_rank, N_states_diag, N_det' + print *, mpi_rank, N_states_diag, N_det + stop 'u_t' + enddo + + IRP_IF MPI + include 'mpif.h' + call broadcast_chunks_complex_double(u_tc,size(u_tc,kind=8)) + IRP_ENDIF + + ! Run tasks + ! --------- + + sending=.False. + + allocate(v_tc(N_st,N_det), s_tc(N_st,N_det)) + do + if (get_task_from_taskserver(zmq_to_qp_run_socket,worker_id, task_id, msg) == -1) then + exit + endif + if(task_id == 0) exit + read (msg,*) imin, imax, ishift, istep + do k=imin,imax + v_tc(:,k) = (0.d0,0.d0) + s_tc(:,k) = (0.d0,0.d0) + enddo + call h_s2_u_0_nstates_openmp_work_complex(v_tc,s_tc,u_tc,N_st,N_det,imin,imax,ishift,istep) + if (task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id) == -1) then + print *, irp_here, 'Unable to send task_done' + endif + call davidson_push_results_async_recv(zmq_socket_push, sending) + call davidson_push_results_async_send_complex(zmq_socket_push, v_tc, s_tc, imin, imax, task_id, sending) + end do + deallocate(u_tc,v_tc, s_tc) + call davidson_push_results_async_recv(zmq_socket_push, sending) + else + double precision, allocatable :: v_t(:,:), s_t(:,:), u_t(:,:) + + !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: u_t, v_t, s_t + + + ! Get wave function (u_t) + ! ----------------------- PROVIDE psi_det_beta_unique psi_bilinear_matrix_order_transp_reverse psi_det_alpha_unique PROVIDE psi_bilinear_matrix_transp_values psi_bilinear_matrix_values psi_bilinear_matrix_columns_loc @@ -130,28 +202,21 @@ subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, N_st, sze, IRP_IF MPI include 'mpif.h' - integer :: ierr - call broadcast_chunks_double(u_t,size(u_t,kind=8)) - IRP_ENDIF ! Run tasks ! --------- - logical :: sending sending=.False. allocate(v_t(N_st,N_det), s_t(N_st,N_det)) do - integer, external :: get_task_from_taskserver - integer, external :: task_done_to_taskserver if (get_task_from_taskserver(zmq_to_qp_run_socket,worker_id, task_id, msg) == -1) then exit endif if(task_id == 0) exit read (msg,*) imin, imax, ishift, istep - integer :: k do k=imin,imax v_t(:,k) = 0.d0 s_t(:,k) = 0.d0 @@ -165,7 +230,7 @@ subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, N_st, sze, end do deallocate(u_t,v_t, s_t) call davidson_push_results_async_recv(zmq_socket_push, sending) - + endif end subroutine @@ -533,6 +598,7 @@ subroutine H_S2_u_0_nstates_zmq(v_0,s_0,u_0,N_st,sze) end + BEGIN_PROVIDER [ integer, nthreads_davidson ] implicit none BEGIN_DOC @@ -643,3 +709,365 @@ integer function zmq_get_N_states_diag(zmq_to_qp_run_socket, worker_id) IRP_ENDIF end + +!==============================================================================! +! ! +! Complex ! +! ! +!==============================================================================! + +subroutine davidson_push_results_complex(zmq_socket_push, v_t, s_t, imin, imax, task_id) + !todo: implement for complex; check double sz + print*,irp_here,' not implemented for complex' + stop -1 + use f77_zmq + implicit none + BEGIN_DOC +! Push the results of $H | U \rangle$ from a worker to the master. + END_DOC + + integer(ZMQ_PTR) ,intent(in) :: zmq_socket_push + integer ,intent(in) :: task_id, imin, imax + complex*16 ,intent(in) :: v_t(N_states_diag,N_det) + complex*16 ,intent(in) :: s_t(N_states_diag,N_det) + integer :: rc, sz + integer*8 :: rc8 + + sz = (imax-imin+1)*N_states_diag + + rc = f77_zmq_send( zmq_socket_push, task_id, 4, ZMQ_SNDMORE) + if(rc /= 4) stop 'davidson_push_results failed to push task_id' + + rc = f77_zmq_send( zmq_socket_push, imin, 4, ZMQ_SNDMORE) + if(rc /= 4) stop 'davidson_push_results failed to push imin' + + rc = f77_zmq_send( zmq_socket_push, imax, 4, ZMQ_SNDMORE) + if(rc /= 4) stop 'davidson_push_results failed to push imax' + + !todo: double sz for complex? + rc8 = f77_zmq_send8( zmq_socket_push, v_t(1,imin), 8_8*sz, ZMQ_SNDMORE) + if(rc8 /= 8_8*sz) stop 'davidson_push_results failed to push vt' + + !todo: double sz for complex? + rc8 = f77_zmq_send8( zmq_socket_push, s_t(1,imin), 8_8*sz, 0) + if(rc8 /= 8_8*sz) stop 'davidson_push_results failed to push st' + +! Activate is zmq_socket_push is a REQ +IRP_IF ZMQ_PUSH +IRP_ELSE + character*(2) :: ok + rc = f77_zmq_recv( zmq_socket_push, ok, 2, 0) + if ((rc /= 2).and.(ok(1:2)/='ok')) then + print *, irp_here, ': f77_zmq_recv( zmq_socket_push, ok, 2, 0)' + stop -1 + endif +IRP_ENDIF + +end subroutine + +subroutine davidson_push_results_async_send_complex(zmq_socket_push, v_t, s_t, imin, imax, task_id,sending) + !todo: implement for complex; check double sz + print*,irp_here,' not implemented for complex' + stop -1 + use f77_zmq + implicit none + BEGIN_DOC +! Push the results of $H | U \rangle$ from a worker to the master. + END_DOC + + integer(ZMQ_PTR) ,intent(in) :: zmq_socket_push + integer ,intent(in) :: task_id, imin, imax + complex*16 ,intent(in) :: v_t(N_states_diag,N_det) + complex*16 ,intent(in) :: s_t(N_states_diag,N_det) + logical ,intent(inout) :: sending + integer :: rc, sz + integer*8 :: rc8 + + if (sending) then + print *, irp_here, ': sending=true' + stop -1 + endif + sending = .True. + + sz = (imax-imin+1)*N_states_diag + + rc = f77_zmq_send( zmq_socket_push, task_id, 4, ZMQ_SNDMORE) + if(rc /= 4) stop 'davidson_push_results failed to push task_id' + + rc = f77_zmq_send( zmq_socket_push, imin, 4, ZMQ_SNDMORE) + if(rc /= 4) stop 'davidson_push_results failed to push imin' + + rc = f77_zmq_send( zmq_socket_push, imax, 4, ZMQ_SNDMORE) + if(rc /= 4) stop 'davidson_push_results failed to push imax' + + !todo: double sz for complex? + rc8 = f77_zmq_send8( zmq_socket_push, v_t(1,imin), 8_8*sz, ZMQ_SNDMORE) + if(rc8 /= 8_8*sz) stop 'davidson_push_results failed to push vt' + + !todo: double sz for complex? + rc8 = f77_zmq_send8( zmq_socket_push, s_t(1,imin), 8_8*sz, 0) + if(rc8 /= 8_8*sz) stop 'davidson_push_results failed to push st' + +end subroutine + + +subroutine davidson_pull_results_complex(zmq_socket_pull, v_t, s_t, imin, imax, task_id) + !todo: implement for complex; check double sz + print*,irp_here,' not implemented for complex' + stop -1 + use f77_zmq + implicit none + BEGIN_DOC +! Pull the results of $H | U \rangle$ on the master. + END_DOC + + integer(ZMQ_PTR) ,intent(in) :: zmq_socket_pull + integer ,intent(out) :: task_id, imin, imax + complex*16 ,intent(out) :: v_t(N_states_diag,N_det) + complex*16 ,intent(out) :: s_t(N_states_diag,N_det) + + integer :: rc, sz + integer*8 :: rc8 + + rc = f77_zmq_recv( zmq_socket_pull, task_id, 4, 0) + if(rc /= 4) stop 'davidson_pull_results failed to pull task_id' + + rc = f77_zmq_recv( zmq_socket_pull, imin, 4, 0) + if(rc /= 4) stop 'davidson_pull_results failed to pull imin' + + rc = f77_zmq_recv( zmq_socket_pull, imax, 4, 0) + if(rc /= 4) stop 'davidson_pull_results failed to pull imax' + + sz = (imax-imin+1)*N_states_diag + + !todo: double sz for complex? + rc8 = f77_zmq_recv8( zmq_socket_pull, v_t(1,imin), 8_8*sz, 0) + if(rc8 /= 8*sz) stop 'davidson_pull_results failed to pull v_t' + + !todo: double sz for complex? + rc8 = f77_zmq_recv8( zmq_socket_pull, s_t(1,imin), 8_8*sz, 0) + if(rc8 /= 8*sz) stop 'davidson_pull_results failed to pull s_t' + +! Activate if zmq_socket_pull is a REP +IRP_IF ZMQ_PUSH +IRP_ELSE + rc = f77_zmq_send( zmq_socket_pull, 'ok', 2, 0) + if (rc /= 2) then + print *, irp_here, ' : f77_zmq_send (zmq_socket_pull,...' + stop -1 + endif +IRP_ENDIF + +end subroutine + + +subroutine davidson_collector_complex(zmq_to_qp_run_socket, zmq_socket_pull, v0, s0, sze, N_st) + !todo: implement for complex; check conjg v_t s_t + print*,irp_here,' not implemented for complex' + stop -1 + use f77_zmq + implicit none + BEGIN_DOC +! Routine collecting the results of the workers in Davidson's algorithm. + END_DOC + + integer(ZMQ_PTR), intent(in) :: zmq_socket_pull + integer, intent(in) :: sze, N_st + integer(ZMQ_PTR), intent(in) :: zmq_to_qp_run_socket + + complex*16 ,intent(inout) :: v0(sze, N_st) + complex*16 ,intent(inout) :: s0(sze, N_st) + + integer :: more, task_id, imin, imax + + complex*16, allocatable :: v_t(:,:), s_t(:,:) + logical :: sending + integer :: i,j + integer, external :: zmq_delete_task_async_send + integer, external :: zmq_delete_task_async_recv + + allocate(v_t(N_st,N_det), s_t(N_st,N_det)) + v0 = (0.d0,0.d0) + s0 = (0.d0,0.d0) + more = 1 + sending = .False. + do while (more == 1) + call davidson_pull_results_complex(zmq_socket_pull, v_t, s_t, imin, imax, task_id) + if (zmq_delete_task_async_send(zmq_to_qp_run_socket,task_id,sending) == -1) then + stop 'davidson: Unable to delete task (send)' + endif + do j=1,N_st + do i=imin,imax + !todo: conjg or no? + print*,irp_here,' not implemented for complex (conjg?)' + v0(i,j) = v0(i,j) + v_t(j,i) + s0(i,j) = s0(i,j) + s_t(j,i) + enddo + enddo + if (zmq_delete_task_async_recv(zmq_to_qp_run_socket,more,sending) == -1) then + stop 'davidson: Unable to delete task (recv)' + endif + end do + deallocate(v_t,s_t) + +end subroutine + + + +subroutine h_s2_u_0_nstates_zmq_complex(v_0,s_0,u_0,N_st,sze) + !todo: implement for complex + print*,irp_here,' not implemented for complex' + stop -1 + use omp_lib + use bitmasks + use f77_zmq + implicit none + BEGIN_DOC + ! Computes $v_0 = H | u_0\rangle$ and $s_0 = S^2 | u_0\rangle$ + ! + ! n : number of determinants + ! + ! H_jj : array of $\langle j | H | j \rangle$ + ! + ! S2_jj : array of $\langle j | S^2 | j \rangle$ + END_DOC + integer, intent(in) :: N_st, sze + complex*16, intent(out) :: v_0(sze,N_st), s_0(sze,N_st) + complex*16, intent(inout) :: u_0(sze,N_st) + integer :: i,j,k + integer :: ithread + complex*16, allocatable :: u_t(:,:) + !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: u_t + integer(ZMQ_PTR) :: zmq_to_qp_run_socket, zmq_socket_pull + PROVIDE psi_det_beta_unique psi_bilinear_matrix_order_transp_reverse psi_det_alpha_unique + PROVIDE psi_bilinear_matrix_transp_values_complex psi_bilinear_matrix_values_complex psi_bilinear_matrix_columns_loc + PROVIDE ref_bitmask_energy nproc + PROVIDE mpi_initialized + + call new_parallel_job(zmq_to_qp_run_socket,zmq_socket_pull,'davidson') + +! integer :: N_states_diag_save +! N_states_diag_save = N_states_diag +! N_states_diag = N_st + if (zmq_put_N_states_diag(zmq_to_qp_run_socket, 1) == -1) then + stop 'Unable to put N_states_diag on ZMQ server' + endif + + if (zmq_put_psi(zmq_to_qp_run_socket,1) == -1) then + stop 'Unable to put psi on ZMQ server' + endif + energy = 0.d0 + if (zmq_put_dvector(zmq_to_qp_run_socket,1,'energy',energy,size(energy)) == -1) then + stop 'Unable to put energy on ZMQ server' + endif + + + ! Create tasks + ! ============ + + integer :: istep, imin, imax, ishift, ipos + integer, external :: add_task_to_taskserver + integer, parameter :: tasksize=10000 + character*(100000) :: task + istep=1 + ishift=0 + imin=1 + + + ipos=1 + do imin=1,N_det,tasksize + imax = min(N_det,imin-1+tasksize) + do ishift=0,istep-1 + write(task(ipos:ipos+50),'(4(I11,1X),1X,1A)') imin, imax, ishift, istep, '|' + ipos = ipos+50 + if (ipos > 100000-50) then + if (add_task_to_taskserver(zmq_to_qp_run_socket,trim(task(1:ipos))) == -1) then + stop 'Unable to add task' + endif + ipos=1 + endif + enddo + enddo + + if (ipos > 1) then + if (add_task_to_taskserver(zmq_to_qp_run_socket,trim(task(1:ipos))) == -1) then + stop 'Unable to add task' + endif + ipos=1 + endif + + allocate(u_t(N_st,N_det)) + do k=1,N_st + call cdset_order(u_0(1,k),psi_bilinear_matrix_order,N_det) + enddo + + call cdtranspose( & + u_0, & + size(u_0, 1), & + u_t, & + size(u_t, 1), & + N_det, N_st) + + + ASSERT (N_st == N_states_diag) + ASSERT (sze >= N_det) + + integer :: rc, ni, nj + integer*8 :: rc8 + double precision :: energy(N_st) + + integer, external :: zmq_put_dvector, zmq_put_psi, zmq_put_N_states_diag + integer, external :: zmq_put_cdmatrix + !todo: size/2 for complex? + if (size(u_t) < 8388608) then + ni = size(u_t) + nj = 1 + else + ni = 8388608 + nj = size(u_t)/8388608 + 1 + endif + ! Warning : dimensions are modified for efficiency, It is OK since we get the + ! full matrix + if (zmq_put_cdmatrix(zmq_to_qp_run_socket, 1, 'u_t', u_t, ni, nj, size(u_t,kind=8)) == -1) then + stop 'Unable to put u_t on ZMQ server' + endif + + deallocate(u_t) + + integer, external :: zmq_set_running + if (zmq_set_running(zmq_to_qp_run_socket) == -1) then + print *, irp_here, ': Failed in zmq_set_running' + endif + + call omp_set_nested(.True.) + !$OMP PARALLEL DEFAULT(shared) NUM_THREADS(2) PRIVATE(ithread) + ithread = omp_get_thread_num() + if (ithread == 0 ) then + call davidson_collector_complex(zmq_to_qp_run_socket, zmq_socket_pull, v_0, s_0, N_det, N_st) + else + call davidson_slave_inproc(1) + endif + !$OMP END PARALLEL + call end_parallel_job(zmq_to_qp_run_socket, zmq_socket_pull, 'davidson') + + !$OMP PARALLEL + !$OMP SINGLE + do k=1,N_st + !$OMP TASK DEFAULT(SHARED) FIRSTPRIVATE(k,N_det) + call cdset_order(v_0(1,k),psi_bilinear_matrix_order_reverse,N_det) + !$OMP END TASK + !$OMP TASK DEFAULT(SHARED) FIRSTPRIVATE(k,N_det) + call cdset_order(s_0(1,k),psi_bilinear_matrix_order_reverse,N_det) + !$OMP END TASK + !$OMP TASK DEFAULT(SHARED) FIRSTPRIVATE(k,N_det) + call cdset_order(u_0(1,k),psi_bilinear_matrix_order_reverse,N_det) + !$OMP END TASK + enddo + !$OMP END SINGLE + !$OMP TASKWAIT + !$OMP END PARALLEL + +! N_states_diag = N_states_diag_save +! SOFT_TOUCH N_states_diag +end + diff --git a/src/davidson/u0_h_u0.irp.f b/src/davidson/u0_h_u0.irp.f index 6117a13e..e3a2e1ed 100644 --- a/src/davidson/u0_h_u0.irp.f +++ b/src/davidson/u0_h_u0.irp.f @@ -6,7 +6,11 @@ ! ! psi_s2(i) = $\langle \Psi_i | S^2 | \Psi_i \rangle$ END_DOC - call u_0_H_u_0(psi_energy,psi_s2,psi_coef,N_det,psi_det,N_int,N_states,psi_det_size) + if (is_complex) then + call u_0_h_u_0_complex(psi_energy,psi_s2,psi_coef_complex,N_det,psi_det,N_int,N_states,psi_det_size) + else + call u_0_H_u_0(psi_energy,psi_s2,psi_coef,N_det,psi_det,N_int,N_states,psi_det_size) + endif integer :: i do i=N_det+1,N_states psi_energy(i) = 0.d0 @@ -708,3 +712,703 @@ N_int;; END_TEMPLATE +!==============================================================================! +! ! +! Complex ! +! ! +!==============================================================================! + +subroutine u_0_H_u_0_complex(e_0,s_0,u_0,n,keys_tmp,Nint,N_st,sze) + !todo: implement for complex + print*,irp_here,' not implemented for complex' + stop -1 + use bitmasks + implicit none + BEGIN_DOC + ! Computes $E_0 = \frac{\langle u_0 | H | u_0 \rangle}{\langle u_0 | u_0 \rangle}$ + ! + ! and $S_0 = \frac{\langle u_0 | S^2 | u_0 \rangle}{\langle u_0 | u_0 \rangle}$ + ! + ! n : number of determinants + ! + END_DOC + integer, intent(in) :: n,Nint, N_st, sze + double precision, intent(out) :: e_0(N_st),s_0(N_st) + complex*16, intent(inout) :: u_0(sze,N_st) + integer(bit_kind),intent(in) :: keys_tmp(Nint,2,n) + + complex*16, allocatable :: v_0(:,:), s_vec(:,:), u_1(:,:) + double precision :: u_dot_u_complex,diag_H_mat_elem + complex*16 :: u_dot_v_complex + integer :: i,j, istate + + if ((n > 100000).and.distributed_davidson) then + allocate (v_0(n,N_states_diag),s_vec(n,N_states_diag), u_1(n,N_states_diag)) + u_1(:,:) = (0.d0,0.d0) + u_1(1:n,1:N_st) = u_0(1:n,1:N_st) + call h_s2_u_0_nstates_zmq(v_0,s_vec,u_1,N_states_diag,n) + else if (n < n_det_max_full) then + allocate (v_0(n,N_st),s_vec(n,N_st), u_1(n,N_st)) + v_0(:,:) = 0.d0 + u_1(:,:) = 0.d0 + s_vec(:,:) = 0.d0 + u_1(1:n,1:N_st) = u_0(1:n,1:N_st) + do istate = 1,N_st + do j=1,n + do i=1,n + v_0(i,istate) = v_0(i,istate) + h_matrix_all_dets(i,j) * u_0(j,istate) + s_vec(i,istate) = s_vec(i,istate) + S2_matrix_all_dets(i,j) * u_0(j,istate) + enddo + enddo + enddo + else + allocate (v_0(n,N_st),s_vec(n,N_st),u_1(n,N_st)) + u_1(:,:) = 0.d0 + u_1(1:n,1:N_st) = u_0(1:n,1:N_st) + call H_S2_u_0_nstates_openmp(v_0,s_vec,u_1,N_st,n) + endif + u_0(1:n,1:N_st) = u_1(1:n,1:N_st) + deallocate(u_1) + double precision :: norm + !$OMP PARALLEL DO PRIVATE(i,norm) DEFAULT(SHARED) + do i=1,N_st + norm = u_dot_u(u_0(1,i),n) + if (norm /= 0.d0) then + e_0(i) = u_dot_v(v_0(1,i),u_0(1,i),n) + s_0(i) = u_dot_v(s_vec(1,i),u_0(1,i),n) + else + e_0(i) = 0.d0 + s_0(i) = 0.d0 + endif + enddo + !$OMP END PARALLEL DO + deallocate (s_vec, v_0) +end + + +subroutine H_S2_u_0_nstates_openmp_complex(v_0,s_0,u_0,N_st,sze) + !todo: implement for complex + print*,irp_here,' not implemented for complex' + stop -1 + use bitmasks + implicit none + BEGIN_DOC + ! Computes $v_0 = H | u_0\rangle$ and $s_0 = S^2 | u_0\rangle$. + ! + ! Assumes that the determinants are in psi_det + ! + ! istart, iend, ishift, istep are used in ZMQ parallelization. + END_DOC + integer, intent(in) :: N_st,sze + double precision, intent(inout) :: v_0(sze,N_st), s_0(sze,N_st), u_0(sze,N_st) + integer :: k + double precision, allocatable :: u_t(:,:), v_t(:,:), s_t(:,:) + !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: u_t + allocate(u_t(N_st,N_det),v_t(N_st,N_det),s_t(N_st,N_det)) + + do k=1,N_st + call dset_order(u_0(1,k),psi_bilinear_matrix_order,N_det) + enddo + v_t = 0.d0 + s_t = 0.d0 + call dtranspose( & + u_0, & + size(u_0, 1), & + u_t, & + size(u_t, 1), & + N_det, N_st) + + call H_S2_u_0_nstates_openmp_work(v_t,s_t,u_t,N_st,sze,1,N_det,0,1) + deallocate(u_t) + + call dtranspose( & + v_t, & + size(v_t, 1), & + v_0, & + size(v_0, 1), & + N_st, N_det) + call dtranspose( & + s_t, & + size(s_t, 1), & + s_0, & + size(s_0, 1), & + N_st, N_det) + deallocate(v_t,s_t) + + do k=1,N_st + call dset_order(v_0(1,k),psi_bilinear_matrix_order_reverse,N_det) + call dset_order(s_0(1,k),psi_bilinear_matrix_order_reverse,N_det) + call dset_order(u_0(1,k),psi_bilinear_matrix_order_reverse,N_det) + enddo + +end +subroutine H_S2_u_0_nstates_openmp_work_complex(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep) + !todo: implement for complex + print*,irp_here,' not implemented for complex' + stop -1 + use bitmasks + implicit none + BEGIN_DOC + ! Computes $v_t = H | u_t\rangle$ and $s_t = S^2 | u_t\rangle$ + ! + ! Default should be 1,N_det,0,1 + END_DOC + integer, intent(in) :: N_st,sze,istart,iend,ishift,istep + complex*16, intent(in) :: u_t(N_st,N_det) + complex*16, intent(out) :: v_t(N_st,sze), s_t(N_st,sze) + + + PROVIDE ref_bitmask_energy N_int + + select case (N_int) + case (1) + call H_S2_u_0_nstates_openmp_work_complex_1(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep) + case (2) + call H_S2_u_0_nstates_openmp_work_complex_2(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep) + case (3) + call H_S2_u_0_nstates_openmp_work_complex_3(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep) + case (4) + call H_S2_u_0_nstates_openmp_work_complex_4(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep) + case default + call H_S2_u_0_nstates_openmp_work_complex_N_int(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep) + end select +end + +BEGIN_TEMPLATE + +subroutine H_S2_u_0_nstates_openmp_work_complex_$N_int(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep) + !todo: implement for complex + print*,irp_here,' not implemented for complex' + stop -1 + use bitmasks + implicit none + BEGIN_DOC + ! Computes $v_t = H | u_t \\rangle$ and $s_t = S^2 | u_t\\rangle$ + ! + ! Default should be 1,N_det,0,1 + END_DOC + integer, intent(in) :: N_st,sze,istart,iend,ishift,istep + complex*16, intent(in) :: u_t(N_st,N_det) + complex*16, intent(out) :: v_t(N_st,sze), s_t(N_st,sze) + + complex*16 :: hij, sij + integer :: i,j,k,l,kk + integer :: k_a, k_b, l_a, l_b, m_a, m_b + integer :: istate + integer :: krow, kcol, krow_b, kcol_b + integer :: lrow, lcol + integer :: mrow, mcol + integer(bit_kind) :: spindet($N_int) + integer(bit_kind) :: tmp_det($N_int,2) + integer(bit_kind) :: tmp_det2($N_int,2) + integer(bit_kind) :: tmp_det3($N_int,2) + integer(bit_kind), allocatable :: buffer(:,:) + integer :: n_doubles + integer, allocatable :: doubles(:) + integer, allocatable :: singles_a(:) + integer, allocatable :: singles_b(:) + integer, allocatable :: idx(:), idx0(:) + integer :: maxab, n_singles_a, n_singles_b, kcol_prev + integer*8 :: k8 + logical :: compute_singles + integer*8 :: last_found, left, right, right_max + double precision :: rss, mem, ratio + complex*16, allocatable :: utl(:,:) + integer, parameter :: block_size=128 + +! call resident_memory(rss) +! mem = dble(singles_beta_csc_size) / 1024.d0**3 +! +! compute_singles = (mem+rss > qp_max_mem) +! +! if (.not.compute_singles) then +! provide singles_beta_csc +! endif +compute_singles=.True. + + maxab = max(N_det_alpha_unique, N_det_beta_unique)+1 + allocate(idx0(maxab)) + + do i=1,maxab + idx0(i) = i + enddo + + ! Prepare the array of all alpha single excitations + ! ------------------------------------------------- + + PROVIDE N_int nthreads_davidson + !$OMP PARALLEL DEFAULT(SHARED) NUM_THREADS(nthreads_davidson) & + !$OMP SHARED(psi_bilinear_matrix_rows, N_det, & + !$OMP psi_bilinear_matrix_columns, & + !$OMP psi_det_alpha_unique, psi_det_beta_unique, & + !$OMP n_det_alpha_unique, n_det_beta_unique, N_int, & + !$OMP psi_bilinear_matrix_transp_rows, & + !$OMP psi_bilinear_matrix_transp_columns, & + !$OMP psi_bilinear_matrix_transp_order, N_st, & + !$OMP psi_bilinear_matrix_order_transp_reverse, & + !$OMP psi_bilinear_matrix_columns_loc, & + !$OMP psi_bilinear_matrix_transp_rows_loc, & + !$OMP istart, iend, istep, irp_here, v_t, s_t, & + !$OMP ishift, idx0, u_t, maxab, compute_singles, & + !$OMP singles_alpha_csc,singles_alpha_csc_idx, & + !$OMP singles_beta_csc,singles_beta_csc_idx) & + !$OMP PRIVATE(krow, kcol, tmp_det, spindet, k_a, k_b, i, & + !$OMP lcol, lrow, l_a, l_b, utl, kk, & + !$OMP buffer, doubles, n_doubles, & + !$OMP tmp_det2, hij, sij, idx, l, kcol_prev, & + !$OMP singles_a, n_singles_a, singles_b, ratio, & + !$OMP n_singles_b, k8, last_found,left,right,right_max) + + ! Alpha/Beta double excitations + ! ============================= + + allocate( buffer($N_int,maxab), & + singles_a(maxab), & + singles_b(maxab), & + doubles(maxab), & + idx(maxab), utl(N_st,block_size)) + + kcol_prev=-1 + + ASSERT (iend <= N_det) + ASSERT (istart > 0) + ASSERT (istep > 0) + + !$OMP DO SCHEDULE(guided,64) + do k_a=istart+ishift,iend,istep + + krow = psi_bilinear_matrix_rows(k_a) + ASSERT (krow <= N_det_alpha_unique) + + kcol = psi_bilinear_matrix_columns(k_a) + ASSERT (kcol <= N_det_beta_unique) + + tmp_det(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, krow) + + if (kcol /= kcol_prev) then + tmp_det(1:$N_int,2) = psi_det_beta_unique (1:$N_int, kcol) + if (compute_singles) then + call get_all_spin_singles_$N_int( & + psi_det_beta_unique, idx0, & + tmp_det(1,2), N_det_beta_unique, & + singles_b, n_singles_b) + else + n_singles_b = 0 + !DIR$ LOOP COUNT avg(1000) + do k8=singles_beta_csc_idx(kcol),singles_beta_csc_idx(kcol+1)-1 + n_singles_b = n_singles_b+1 + singles_b(n_singles_b) = singles_beta_csc(k8) + enddo + endif + endif + kcol_prev = kcol + + ! Loop over singly excited beta columns + ! ------------------------------------- + + !DIR$ LOOP COUNT avg(1000) + do i=1,n_singles_b + lcol = singles_b(i) + + tmp_det2(1:$N_int,2) = psi_det_beta_unique(1:$N_int, lcol) + +!--- +! if (compute_singles) then + + l_a = psi_bilinear_matrix_columns_loc(lcol) + ASSERT (l_a <= N_det) + + !DIR$ UNROLL(8) + !DIR$ LOOP COUNT avg(50000) + do j=1,psi_bilinear_matrix_columns_loc(lcol+1) - psi_bilinear_matrix_columns_loc(lcol) + lrow = psi_bilinear_matrix_rows(l_a) + ASSERT (lrow <= N_det_alpha_unique) + + buffer(1:$N_int,j) = psi_det_alpha_unique(1:$N_int, lrow) ! hot spot + + ASSERT (l_a <= N_det) + idx(j) = l_a + l_a = l_a+1 + enddo + j = j-1 + + call get_all_spin_singles_$N_int( & + buffer, idx, tmp_det(1,1), j, & + singles_a, n_singles_a ) + +!----- +! else +! +! ! Search for singles +! +!call cpu_time(time0) +! ! Right boundary +! l_a = psi_bilinear_matrix_columns_loc(lcol+1)-1 +! ASSERT (l_a <= N_det) +! do j=1,psi_bilinear_matrix_columns_loc(lcol+1) - psi_bilinear_matrix_columns_loc(lcol) +! lrow = psi_bilinear_matrix_rows(l_a) +! ASSERT (lrow <= N_det_alpha_unique) +! +! left = singles_alpha_csc_idx(krow) +! right_max = -1_8 +! right = singles_alpha_csc_idx(krow+1) +! do while (right-left>0_8) +! k8 = shiftr(right+left,1) +! if (singles_alpha_csc(k8) > lrow) then +! right = k8 +! else if (singles_alpha_csc(k8) < lrow) then +! left = k8 + 1_8 +! else +! right_max = k8+1_8 +! exit +! endif +! enddo +! if (right_max > 0_8) exit +! l_a = l_a-1 +! enddo +! if (right_max < 0_8) right_max = singles_alpha_csc_idx(krow) +! +! ! Search +! n_singles_a = 0 +! l_a = psi_bilinear_matrix_columns_loc(lcol) +! ASSERT (l_a <= N_det) +! +! last_found = singles_alpha_csc_idx(krow) +! do j=1,psi_bilinear_matrix_columns_loc(lcol+1) - psi_bilinear_matrix_columns_loc(lcol) +! lrow = psi_bilinear_matrix_rows(l_a) +! ASSERT (lrow <= N_det_alpha_unique) +! +! left = last_found +! right = right_max +! do while (right-left>0_8) +! k8 = shiftr(right+left,1) +! if (singles_alpha_csc(k8) > lrow) then +! right = k8 +! else if (singles_alpha_csc(k8) < lrow) then +! left = k8 + 1_8 +! else +! n_singles_a += 1 +! singles_a(n_singles_a) = l_a +! last_found = k8+1_8 +! exit +! endif +! enddo +! l_a = l_a+1 +! enddo +! j = j-1 +! +! endif +!----- + + ! Loop over alpha singles + ! ----------------------- + + !DIR$ LOOP COUNT avg(1000) + do k = 1,n_singles_a,block_size + ! Prefetch u_t(:,l_a) + do kk=0,block_size-1 + if (k+kk > n_singles_a) exit + l_a = singles_a(k+kk) + ASSERT (l_a <= N_det) + + do l=1,N_st + utl(l,kk+1) = u_t(l,l_a) + enddo + enddo + + do kk=0,block_size-1 + if (k+kk > n_singles_a) exit + l_a = singles_a(k+kk) + lrow = psi_bilinear_matrix_rows(l_a) + ASSERT (lrow <= N_det_alpha_unique) + + tmp_det2(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, lrow) + call i_H_j_double_alpha_beta(tmp_det,tmp_det2,$N_int,hij) + call get_s2(tmp_det,tmp_det2,$N_int,sij) + !DIR$ LOOP COUNT AVG(4) + do l=1,N_st + v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1) + s_t(l,k_a) = s_t(l,k_a) + sij * utl(l,kk+1) + enddo + enddo + enddo + + enddo + + enddo + !$OMP END DO + + !$OMP DO SCHEDULE(guided,64) + do k_a=istart+ishift,iend,istep + + + ! Single and double alpha excitations + ! =================================== + + + ! Initial determinant is at k_a in alpha-major representation + ! ----------------------------------------------------------------------- + + krow = psi_bilinear_matrix_rows(k_a) + ASSERT (krow <= N_det_alpha_unique) + + kcol = psi_bilinear_matrix_columns(k_a) + ASSERT (kcol <= N_det_beta_unique) + + tmp_det(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, krow) + tmp_det(1:$N_int,2) = psi_det_beta_unique (1:$N_int, kcol) + + ! Initial determinant is at k_b in beta-major representation + ! ---------------------------------------------------------------------- + + k_b = psi_bilinear_matrix_order_transp_reverse(k_a) + ASSERT (k_b <= N_det) + + spindet(1:$N_int) = tmp_det(1:$N_int,1) + + ! Loop inside the beta column to gather all the connected alphas + lcol = psi_bilinear_matrix_columns(k_a) + l_a = psi_bilinear_matrix_columns_loc(lcol) + + !DIR$ LOOP COUNT avg(200000) + do i=1,N_det_alpha_unique + if (l_a > N_det) exit + lcol = psi_bilinear_matrix_columns(l_a) + if (lcol /= kcol) exit + lrow = psi_bilinear_matrix_rows(l_a) + ASSERT (lrow <= N_det_alpha_unique) + + buffer(1:$N_int,i) = psi_det_alpha_unique(1:$N_int, lrow) ! Hot spot + idx(i) = l_a + l_a = l_a+1 + enddo + i = i-1 + + call get_all_spin_singles_and_doubles_$N_int( & + buffer, idx, spindet, i, & + singles_a, doubles, n_singles_a, n_doubles ) + + ! Compute Hij for all alpha singles + ! ---------------------------------- + + tmp_det2(1:$N_int,2) = psi_det_beta_unique (1:$N_int, kcol) + !DIR$ LOOP COUNT avg(1000) + do i=1,n_singles_a,block_size + ! Prefetch u_t(:,l_a) + do kk=0,block_size-1 + if (i+kk > n_singles_a) exit + l_a = singles_a(i+kk) + ASSERT (l_a <= N_det) + + do l=1,N_st + utl(l,kk+1) = u_t(l,l_a) + enddo + enddo + + do kk=0,block_size-1 + if (i+kk > n_singles_a) exit + l_a = singles_a(i+kk) + lrow = psi_bilinear_matrix_rows(l_a) + ASSERT (lrow <= N_det_alpha_unique) + + tmp_det2(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, lrow) + call i_h_j_single_spin( tmp_det, tmp_det2, $N_int, 1, hij) + + !DIR$ LOOP COUNT AVG(4) + do l=1,N_st + v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1) + ! single => sij = 0 + enddo + enddo + enddo + + + ! Compute Hij for all alpha doubles + ! ---------------------------------- + + !DIR$ LOOP COUNT avg(50000) + do i=1,n_doubles,block_size + ! Prefetch u_t(:,l_a) + do kk=0,block_size-1 + if (i+kk > n_doubles) exit + l_a = doubles(i+kk) + ASSERT (l_a <= N_det) + + do l=1,N_st + utl(l,kk+1) = u_t(l,l_a) + enddo + enddo + + do kk=0,block_size-1 + if (i+kk > n_doubles) exit + l_a = doubles(i+kk) + lrow = psi_bilinear_matrix_rows(l_a) + ASSERT (lrow <= N_det_alpha_unique) + + call i_H_j_double_spin( tmp_det(1,1), psi_det_alpha_unique(1, lrow), $N_int, hij) + !DIR$ LOOP COUNT AVG(4) + do l=1,N_st + v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1) + ! same spin => sij = 0 + enddo + enddo + enddo + + + ! Single and double beta excitations + ! ================================== + + + ! Initial determinant is at k_a in alpha-major representation + ! ----------------------------------------------------------------------- + + krow = psi_bilinear_matrix_rows(k_a) + kcol = psi_bilinear_matrix_columns(k_a) + + tmp_det(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, krow) + tmp_det(1:$N_int,2) = psi_det_beta_unique (1:$N_int, kcol) + + spindet(1:$N_int) = tmp_det(1:$N_int,2) + + ! Initial determinant is at k_b in beta-major representation + ! ----------------------------------------------------------------------- + + k_b = psi_bilinear_matrix_order_transp_reverse(k_a) + ASSERT (k_b <= N_det) + + ! Loop inside the alpha row to gather all the connected betas + lrow = psi_bilinear_matrix_transp_rows(k_b) + l_b = psi_bilinear_matrix_transp_rows_loc(lrow) + !DIR$ LOOP COUNT avg(200000) + do i=1,N_det_beta_unique + if (l_b > N_det) exit + lrow = psi_bilinear_matrix_transp_rows(l_b) + if (lrow /= krow) exit + lcol = psi_bilinear_matrix_transp_columns(l_b) + ASSERT (lcol <= N_det_beta_unique) + + buffer(1:$N_int,i) = psi_det_beta_unique(1:$N_int, lcol) + idx(i) = l_b + l_b = l_b+1 + enddo + i = i-1 + + call get_all_spin_singles_and_doubles_$N_int( & + buffer, idx, spindet, i, & + singles_b, doubles, n_singles_b, n_doubles ) + + ! Compute Hij for all beta singles + ! ---------------------------------- + + tmp_det2(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, krow) + !DIR$ LOOP COUNT avg(1000) + do i=1,n_singles_b,block_size + do kk=0,block_size-1 + if (i+kk > n_singles_b) exit + l_b = singles_b(i+kk) + ASSERT (l_b <= N_det) + + l_a = psi_bilinear_matrix_transp_order(l_b) + ASSERT (l_a <= N_det) + + do l=1,N_st + utl(l,kk+1) = u_t(l,l_a) + enddo + enddo + + do kk=0,block_size-1 + if (i+kk > n_singles_b) exit + l_b = singles_b(i+kk) + l_a = psi_bilinear_matrix_transp_order(l_b) + lcol = psi_bilinear_matrix_transp_columns(l_b) + ASSERT (lcol <= N_det_beta_unique) + + tmp_det2(1:$N_int,2) = psi_det_beta_unique (1:$N_int, lcol) + call i_h_j_single_spin( tmp_det, tmp_det2, $N_int, 2, hij) + !DIR$ LOOP COUNT AVG(4) + do l=1,N_st + v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1) + ! single => sij = 0 + enddo + enddo + enddo + + ! Compute Hij for all beta doubles + ! ---------------------------------- + + !DIR$ LOOP COUNT avg(50000) + do i=1,n_doubles,block_size + do kk=0,block_size-1 + if (i+kk > n_doubles) exit + l_b = doubles(i+kk) + ASSERT (l_b <= N_det) + l_a = psi_bilinear_matrix_transp_order(l_b) + ASSERT (l_a <= N_det) + + do l=1,N_st + utl(l,kk+1) = u_t(l,l_a) + enddo + enddo + + do kk=0,block_size-1 + if (i+kk > n_doubles) exit + l_b = doubles(i+kk) + l_a = psi_bilinear_matrix_transp_order(l_b) + lcol = psi_bilinear_matrix_transp_columns(l_b) + ASSERT (lcol <= N_det_beta_unique) + + call i_H_j_double_spin( tmp_det(1,2), psi_det_beta_unique(1, lcol), $N_int, hij) + + !DIR$ LOOP COUNT AVG(4) + do l=1,N_st + v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1) + ! same spin => sij = 0 + enddo + enddo + enddo + + + ! Diagonal contribution + ! ===================== + + + ! Initial determinant is at k_a in alpha-major representation + ! ----------------------------------------------------------------------- + + krow = psi_bilinear_matrix_rows(k_a) + ASSERT (krow <= N_det_alpha_unique) + + kcol = psi_bilinear_matrix_columns(k_a) + ASSERT (kcol <= N_det_beta_unique) + + tmp_det(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, krow) + tmp_det(1:$N_int,2) = psi_det_beta_unique (1:$N_int, kcol) + + double precision, external :: diag_H_mat_elem, diag_S_mat_elem + + hij = diag_H_mat_elem(tmp_det,$N_int) + sij = diag_S_mat_elem(tmp_det,$N_int) + !DIR$ LOOP COUNT AVG(4) + do l=1,N_st + v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,k_a) + s_t(l,k_a) = s_t(l,k_a) + sij * u_t(l,k_a) + enddo + + end do + !$OMP END DO + deallocate(buffer, singles_a, singles_b, doubles, idx, utl) + !$OMP END PARALLEL + +end + +SUBST [ N_int ] + +1;; +2;; +3;; +4;; +N_int;; + +END_TEMPLATE + + diff --git a/src/utils/transpose.irp.f b/src/utils/transpose.irp.f index 7c86f458..ddffb172 100644 --- a/src/utils/transpose.irp.f +++ b/src/utils/transpose.irp.f @@ -84,3 +84,100 @@ recursive subroutine dtranspose(A,LDA,B,LDB,d1,d2) end + +!DIR$ attributes forceinline :: cdtranspose +recursive subroutine cdtranspose(A,LDA,B,LDB,d1,d2) + implicit none + BEGIN_DOC +! Transpose input matrix A into output matrix B +! don't take complex conjugate + END_DOC + integer, intent(in) :: d1, d2, LDA, LDB + complex*16, intent(in) :: A(LDA,d2) + complex*16, intent(out) :: B(LDB,d1) + + +! do j=1,d1 +! do i=1,d2 +! B(i,j ) = A(j ,i) +! enddo +! enddo +! return + + integer :: i,j,k + if ( d2 < 32 ) then + do j=1,d1 + !DIR$ LOOP COUNT (16) + do i=1,d2 + B(i,j ) = A(j ,i) + enddo + enddo + return + else if (d1 > d2) then + !DIR$ forceinline + k=d1/2 + !DIR$ forceinline recursive + call cdtranspose(A(1,1),LDA,B(1,1),LDB,k,d2) + !DIR$ forceinline recursive + call cdtranspose(A(k+1,1),LDA,B(1,k+1),LDB,d1-k,d2) + return + else + !DIR$ forceinline + k=d2/2 + !DIR$ forceinline recursive + call cdtranspose(A(1,k+1),LDA,B(k+1,1),LDB,d1,d2-k) + !DIR$ forceinline recursive + call cdtranspose(A(1,1),LDA,B(1,1),LDB,d1,k) + return + endif + +end + +!DIR$ attributes forceinline :: cdadjoint +recursive subroutine cdadjoint(A,LDA,B,LDB,d1,d2) + implicit none + BEGIN_DOC +! Transpose input matrix A into output matrix B +! and take complex conjugate + END_DOC + integer, intent(in) :: d1, d2, LDA, LDB + complex*16, intent(in) :: A(LDA,d2) + complex*16, intent(out) :: B(LDB,d1) + + +! do j=1,d1 +! do i=1,d2 +! B(i,j ) = A(j ,i) +! enddo +! enddo +! return + + integer :: i,j,k + if ( d2 < 32 ) then + do j=1,d1 + !DIR$ LOOP COUNT (16) + do i=1,d2 + B(i,j ) = conjg(A(j ,i)) + enddo + enddo + return + else if (d1 > d2) then + !DIR$ forceinline + k=d1/2 + !DIR$ forceinline recursive + call cdadjoint(A(1,1),LDA,B(1,1),LDB,k,d2) + !DIR$ forceinline recursive + call cdadjoint(A(k+1,1),LDA,B(1,k+1),LDB,d1-k,d2) + return + else + !DIR$ forceinline + k=d2/2 + !DIR$ forceinline recursive + call cdadjoint(A(1,k+1),LDA,B(k+1,1),LDB,d1,d2-k) + !DIR$ forceinline recursive + call cdadjoint(A(1,1),LDA,B(1,1),LDB,d1,k) + return + endif + +end + diff --git a/src/utils_complex/qp2-pbc-diff.txt b/src/utils_complex/qp2-pbc-diff.txt index 9f3598e3..730a3970 100644 --- a/src/utils_complex/qp2-pbc-diff.txt +++ b/src/utils_complex/qp2-pbc-diff.txt @@ -1,6 +1,11 @@ ------------------------------------------------------------------------------------- current: + + irp_align for complex? + zmq_put_psi_complex instead of branch inside zmq_put_psi? + are there cases where we call this without already being on a complex branch of code? + h_apply.irp.f push/pull_pt2 pt2,norm_pert,h_pert_diag @@ -83,6 +88,27 @@ determinants: and broadcast_chunks_complex_double in mpi/mpi.irp.f +davidson + (****) davidson_parallel.irp.f + davidson_slave_work + branch inside or outside? + (currently inside) + same broadcast size issue as in h_apply (2^23 elements) + needs h_s2_u_0_nstates_openmp_work_complex (be careful with transp/conjg) + needs davidson_push_results_async_send_complex + davidson_pu{sh,ll}_results{,_async_send}_complex + double sz? + does f77_zmq_send8 know about types, or just send raw data? + davidson_collector_complex + is {v,s}_t conjugate transpose or just transpose? + (****) diagonalization_hs2_dressed.irp.f + (****) diagonalize_ci.irp.f + (****) EZFIO.cfg + (****) ezfio_interface.irp.f + (****) input.irp.f + (****) print_e_components.irp.f + (****) u0_h_u0.irp.f + (****) u0_wee_u0.irp.f ------------------------------------------------------------------------------------- for complex data, add extra dim (size 2) and treat as real in EZFIO.cfg @@ -496,6 +522,3 @@ src/scf_utils/scf_density_matrix_ao_complex.irp.f [ complex*16, SCF_density_matrix_ao_complex, (ao_num,ao_num) ] - - -