10
0
mirror of https://github.com/QuantumPackage/qp2.git synced 2024-06-23 21:52:25 +02:00

working on complex davidson

This commit is contained in:
Kevin Gasperich 2020-02-25 13:09:15 -06:00
parent 5418ed0f1c
commit f869d347b8
5 changed files with 1288 additions and 27 deletions

View File

@ -45,13 +45,19 @@ subroutine run_cipsi
if (N_det > N_det_max) then
psi_det = psi_det_sorted
psi_coef = psi_coef_sorted
N_det = N_det_max
soft_touch N_det psi_det psi_coef
if (is_complex) then
psi_coef_complex = psi_coef_sorted_complex
N_det = N_det_max
soft_touch N_det psi_det psi_coef_complex
else
psi_coef = psi_coef_sorted
N_det = N_det_max
soft_touch N_det psi_det psi_coef
endif
if (s2_eig) then
call make_s2_eigenfunction
endif
call diagonalize_CI
call diagonalize_ci
call save_wavefunction
endif
@ -109,8 +115,11 @@ subroutine run_cipsi
to_select = int(sqrt(dble(N_states))*dble(N_det)*selection_factor)
to_select = max(N_states_diag, to_select)
call ZMQ_selection(to_select, pt2, variance, norm)
PROVIDE psi_coef
if (is_complex) then
PROVIDE psi_coef_complex
else
PROVIDE psi_coef
endif
PROVIDE psi_det
PROVIDE psi_det_sorted

View File

@ -89,21 +89,93 @@ subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, N_st, sze,
character*(512) :: msg
integer :: imin, imax, ishift, istep
integer, allocatable :: psi_det_read(:,:,:)
double precision, allocatable :: v_t(:,:), s_t(:,:), u_t(:,:)
!DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: u_t, v_t, s_t
! Get wave function (u_t)
! -----------------------
integer :: rc, ni, nj
integer*8 :: rc8
integer :: N_states_read, N_det_read, psi_det_size_read
integer :: N_det_selectors_read, N_det_generators_read
integer, external :: zmq_get_dvector
integer, allocatable :: psi_det_read(:,:,:)
logical :: sending
integer, external :: get_task_from_taskserver
integer, external :: task_done_to_taskserver
integer :: k
integer :: ierr
! integer, external :: zmq_get_dvector
integer, external :: zmq_get_dmatrix
if (is_complex) then
complex*16, allocatable :: v_tc(:,:), s_tc(:,:), u_tc(:,:)
!DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: u_tc, v_tc, s_tc
! Get wave function (u_tc)
! -----------------------
PROVIDE psi_det_beta_unique psi_bilinear_matrix_order_transp_reverse psi_det_alpha_unique
PROVIDE psi_bilinear_matrix_transp_values_complex psi_bilinear_matrix_values_complex psi_bilinear_matrix_columns_loc
PROVIDE ref_bitmask_energy nproc
PROVIDE mpi_initialized
allocate(u_tc(N_st,N_det))
!todo: resize for complex?
! Warning : dimensions are modified for efficiency, It is OK since we get the
! full matrix
if (size(u_tc,kind=8) < 8388608_8) then
ni = size(u_tc)
nj = 1
else
ni = 8388608
nj = int(size(u_tc,kind=8)/8388608_8,4) + 1
endif
do while (zmq_get_cdmatrix(zmq_to_qp_run_socket, worker_id, 'u_tc', u_tc, ni, nj, size(u_tc,kind=8)) == -1)
print *, 'mpi_rank, N_states_diag, N_det'
print *, mpi_rank, N_states_diag, N_det
stop 'u_t'
enddo
IRP_IF MPI
include 'mpif.h'
call broadcast_chunks_complex_double(u_tc,size(u_tc,kind=8))
IRP_ENDIF
! Run tasks
! ---------
sending=.False.
allocate(v_tc(N_st,N_det), s_tc(N_st,N_det))
do
if (get_task_from_taskserver(zmq_to_qp_run_socket,worker_id, task_id, msg) == -1) then
exit
endif
if(task_id == 0) exit
read (msg,*) imin, imax, ishift, istep
do k=imin,imax
v_tc(:,k) = (0.d0,0.d0)
s_tc(:,k) = (0.d0,0.d0)
enddo
call h_s2_u_0_nstates_openmp_work_complex(v_tc,s_tc,u_tc,N_st,N_det,imin,imax,ishift,istep)
if (task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id) == -1) then
print *, irp_here, 'Unable to send task_done'
endif
call davidson_push_results_async_recv(zmq_socket_push, sending)
call davidson_push_results_async_send_complex(zmq_socket_push, v_tc, s_tc, imin, imax, task_id, sending)
end do
deallocate(u_tc,v_tc, s_tc)
call davidson_push_results_async_recv(zmq_socket_push, sending)
else
double precision, allocatable :: v_t(:,:), s_t(:,:), u_t(:,:)
!DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: u_t, v_t, s_t
! Get wave function (u_t)
! -----------------------
PROVIDE psi_det_beta_unique psi_bilinear_matrix_order_transp_reverse psi_det_alpha_unique
PROVIDE psi_bilinear_matrix_transp_values psi_bilinear_matrix_values psi_bilinear_matrix_columns_loc
@ -130,28 +202,21 @@ subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, N_st, sze,
IRP_IF MPI
include 'mpif.h'
integer :: ierr
call broadcast_chunks_double(u_t,size(u_t,kind=8))
IRP_ENDIF
! Run tasks
! ---------
logical :: sending
sending=.False.
allocate(v_t(N_st,N_det), s_t(N_st,N_det))
do
integer, external :: get_task_from_taskserver
integer, external :: task_done_to_taskserver
if (get_task_from_taskserver(zmq_to_qp_run_socket,worker_id, task_id, msg) == -1) then
exit
endif
if(task_id == 0) exit
read (msg,*) imin, imax, ishift, istep
integer :: k
do k=imin,imax
v_t(:,k) = 0.d0
s_t(:,k) = 0.d0
@ -165,7 +230,7 @@ subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, N_st, sze,
end do
deallocate(u_t,v_t, s_t)
call davidson_push_results_async_recv(zmq_socket_push, sending)
endif
end subroutine
@ -533,6 +598,7 @@ subroutine H_S2_u_0_nstates_zmq(v_0,s_0,u_0,N_st,sze)
end
BEGIN_PROVIDER [ integer, nthreads_davidson ]
implicit none
BEGIN_DOC
@ -643,3 +709,365 @@ integer function zmq_get_N_states_diag(zmq_to_qp_run_socket, worker_id)
IRP_ENDIF
end
!==============================================================================!
! !
! Complex !
! !
!==============================================================================!
subroutine davidson_push_results_complex(zmq_socket_push, v_t, s_t, imin, imax, task_id)
!todo: implement for complex; check double sz
print*,irp_here,' not implemented for complex'
stop -1
use f77_zmq
implicit none
BEGIN_DOC
! Push the results of $H | U \rangle$ from a worker to the master.
END_DOC
integer(ZMQ_PTR) ,intent(in) :: zmq_socket_push
integer ,intent(in) :: task_id, imin, imax
complex*16 ,intent(in) :: v_t(N_states_diag,N_det)
complex*16 ,intent(in) :: s_t(N_states_diag,N_det)
integer :: rc, sz
integer*8 :: rc8
sz = (imax-imin+1)*N_states_diag
rc = f77_zmq_send( zmq_socket_push, task_id, 4, ZMQ_SNDMORE)
if(rc /= 4) stop 'davidson_push_results failed to push task_id'
rc = f77_zmq_send( zmq_socket_push, imin, 4, ZMQ_SNDMORE)
if(rc /= 4) stop 'davidson_push_results failed to push imin'
rc = f77_zmq_send( zmq_socket_push, imax, 4, ZMQ_SNDMORE)
if(rc /= 4) stop 'davidson_push_results failed to push imax'
!todo: double sz for complex?
rc8 = f77_zmq_send8( zmq_socket_push, v_t(1,imin), 8_8*sz, ZMQ_SNDMORE)
if(rc8 /= 8_8*sz) stop 'davidson_push_results failed to push vt'
!todo: double sz for complex?
rc8 = f77_zmq_send8( zmq_socket_push, s_t(1,imin), 8_8*sz, 0)
if(rc8 /= 8_8*sz) stop 'davidson_push_results failed to push st'
! Activate is zmq_socket_push is a REQ
IRP_IF ZMQ_PUSH
IRP_ELSE
character*(2) :: ok
rc = f77_zmq_recv( zmq_socket_push, ok, 2, 0)
if ((rc /= 2).and.(ok(1:2)/='ok')) then
print *, irp_here, ': f77_zmq_recv( zmq_socket_push, ok, 2, 0)'
stop -1
endif
IRP_ENDIF
end subroutine
subroutine davidson_push_results_async_send_complex(zmq_socket_push, v_t, s_t, imin, imax, task_id,sending)
!todo: implement for complex; check double sz
print*,irp_here,' not implemented for complex'
stop -1
use f77_zmq
implicit none
BEGIN_DOC
! Push the results of $H | U \rangle$ from a worker to the master.
END_DOC
integer(ZMQ_PTR) ,intent(in) :: zmq_socket_push
integer ,intent(in) :: task_id, imin, imax
complex*16 ,intent(in) :: v_t(N_states_diag,N_det)
complex*16 ,intent(in) :: s_t(N_states_diag,N_det)
logical ,intent(inout) :: sending
integer :: rc, sz
integer*8 :: rc8
if (sending) then
print *, irp_here, ': sending=true'
stop -1
endif
sending = .True.
sz = (imax-imin+1)*N_states_diag
rc = f77_zmq_send( zmq_socket_push, task_id, 4, ZMQ_SNDMORE)
if(rc /= 4) stop 'davidson_push_results failed to push task_id'
rc = f77_zmq_send( zmq_socket_push, imin, 4, ZMQ_SNDMORE)
if(rc /= 4) stop 'davidson_push_results failed to push imin'
rc = f77_zmq_send( zmq_socket_push, imax, 4, ZMQ_SNDMORE)
if(rc /= 4) stop 'davidson_push_results failed to push imax'
!todo: double sz for complex?
rc8 = f77_zmq_send8( zmq_socket_push, v_t(1,imin), 8_8*sz, ZMQ_SNDMORE)
if(rc8 /= 8_8*sz) stop 'davidson_push_results failed to push vt'
!todo: double sz for complex?
rc8 = f77_zmq_send8( zmq_socket_push, s_t(1,imin), 8_8*sz, 0)
if(rc8 /= 8_8*sz) stop 'davidson_push_results failed to push st'
end subroutine
subroutine davidson_pull_results_complex(zmq_socket_pull, v_t, s_t, imin, imax, task_id)
!todo: implement for complex; check double sz
print*,irp_here,' not implemented for complex'
stop -1
use f77_zmq
implicit none
BEGIN_DOC
! Pull the results of $H | U \rangle$ on the master.
END_DOC
integer(ZMQ_PTR) ,intent(in) :: zmq_socket_pull
integer ,intent(out) :: task_id, imin, imax
complex*16 ,intent(out) :: v_t(N_states_diag,N_det)
complex*16 ,intent(out) :: s_t(N_states_diag,N_det)
integer :: rc, sz
integer*8 :: rc8
rc = f77_zmq_recv( zmq_socket_pull, task_id, 4, 0)
if(rc /= 4) stop 'davidson_pull_results failed to pull task_id'
rc = f77_zmq_recv( zmq_socket_pull, imin, 4, 0)
if(rc /= 4) stop 'davidson_pull_results failed to pull imin'
rc = f77_zmq_recv( zmq_socket_pull, imax, 4, 0)
if(rc /= 4) stop 'davidson_pull_results failed to pull imax'
sz = (imax-imin+1)*N_states_diag
!todo: double sz for complex?
rc8 = f77_zmq_recv8( zmq_socket_pull, v_t(1,imin), 8_8*sz, 0)
if(rc8 /= 8*sz) stop 'davidson_pull_results failed to pull v_t'
!todo: double sz for complex?
rc8 = f77_zmq_recv8( zmq_socket_pull, s_t(1,imin), 8_8*sz, 0)
if(rc8 /= 8*sz) stop 'davidson_pull_results failed to pull s_t'
! Activate if zmq_socket_pull is a REP
IRP_IF ZMQ_PUSH
IRP_ELSE
rc = f77_zmq_send( zmq_socket_pull, 'ok', 2, 0)
if (rc /= 2) then
print *, irp_here, ' : f77_zmq_send (zmq_socket_pull,...'
stop -1
endif
IRP_ENDIF
end subroutine
subroutine davidson_collector_complex(zmq_to_qp_run_socket, zmq_socket_pull, v0, s0, sze, N_st)
!todo: implement for complex; check conjg v_t s_t
print*,irp_here,' not implemented for complex'
stop -1
use f77_zmq
implicit none
BEGIN_DOC
! Routine collecting the results of the workers in Davidson's algorithm.
END_DOC
integer(ZMQ_PTR), intent(in) :: zmq_socket_pull
integer, intent(in) :: sze, N_st
integer(ZMQ_PTR), intent(in) :: zmq_to_qp_run_socket
complex*16 ,intent(inout) :: v0(sze, N_st)
complex*16 ,intent(inout) :: s0(sze, N_st)
integer :: more, task_id, imin, imax
complex*16, allocatable :: v_t(:,:), s_t(:,:)
logical :: sending
integer :: i,j
integer, external :: zmq_delete_task_async_send
integer, external :: zmq_delete_task_async_recv
allocate(v_t(N_st,N_det), s_t(N_st,N_det))
v0 = (0.d0,0.d0)
s0 = (0.d0,0.d0)
more = 1
sending = .False.
do while (more == 1)
call davidson_pull_results_complex(zmq_socket_pull, v_t, s_t, imin, imax, task_id)
if (zmq_delete_task_async_send(zmq_to_qp_run_socket,task_id,sending) == -1) then
stop 'davidson: Unable to delete task (send)'
endif
do j=1,N_st
do i=imin,imax
!todo: conjg or no?
print*,irp_here,' not implemented for complex (conjg?)'
v0(i,j) = v0(i,j) + v_t(j,i)
s0(i,j) = s0(i,j) + s_t(j,i)
enddo
enddo
if (zmq_delete_task_async_recv(zmq_to_qp_run_socket,more,sending) == -1) then
stop 'davidson: Unable to delete task (recv)'
endif
end do
deallocate(v_t,s_t)
end subroutine
subroutine h_s2_u_0_nstates_zmq_complex(v_0,s_0,u_0,N_st,sze)
!todo: implement for complex
print*,irp_here,' not implemented for complex'
stop -1
use omp_lib
use bitmasks
use f77_zmq
implicit none
BEGIN_DOC
! Computes $v_0 = H | u_0\rangle$ and $s_0 = S^2 | u_0\rangle$
!
! n : number of determinants
!
! H_jj : array of $\langle j | H | j \rangle$
!
! S2_jj : array of $\langle j | S^2 | j \rangle$
END_DOC
integer, intent(in) :: N_st, sze
complex*16, intent(out) :: v_0(sze,N_st), s_0(sze,N_st)
complex*16, intent(inout) :: u_0(sze,N_st)
integer :: i,j,k
integer :: ithread
complex*16, allocatable :: u_t(:,:)
!DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: u_t
integer(ZMQ_PTR) :: zmq_to_qp_run_socket, zmq_socket_pull
PROVIDE psi_det_beta_unique psi_bilinear_matrix_order_transp_reverse psi_det_alpha_unique
PROVIDE psi_bilinear_matrix_transp_values_complex psi_bilinear_matrix_values_complex psi_bilinear_matrix_columns_loc
PROVIDE ref_bitmask_energy nproc
PROVIDE mpi_initialized
call new_parallel_job(zmq_to_qp_run_socket,zmq_socket_pull,'davidson')
! integer :: N_states_diag_save
! N_states_diag_save = N_states_diag
! N_states_diag = N_st
if (zmq_put_N_states_diag(zmq_to_qp_run_socket, 1) == -1) then
stop 'Unable to put N_states_diag on ZMQ server'
endif
if (zmq_put_psi(zmq_to_qp_run_socket,1) == -1) then
stop 'Unable to put psi on ZMQ server'
endif
energy = 0.d0
if (zmq_put_dvector(zmq_to_qp_run_socket,1,'energy',energy,size(energy)) == -1) then
stop 'Unable to put energy on ZMQ server'
endif
! Create tasks
! ============
integer :: istep, imin, imax, ishift, ipos
integer, external :: add_task_to_taskserver
integer, parameter :: tasksize=10000
character*(100000) :: task
istep=1
ishift=0
imin=1
ipos=1
do imin=1,N_det,tasksize
imax = min(N_det,imin-1+tasksize)
do ishift=0,istep-1
write(task(ipos:ipos+50),'(4(I11,1X),1X,1A)') imin, imax, ishift, istep, '|'
ipos = ipos+50
if (ipos > 100000-50) then
if (add_task_to_taskserver(zmq_to_qp_run_socket,trim(task(1:ipos))) == -1) then
stop 'Unable to add task'
endif
ipos=1
endif
enddo
enddo
if (ipos > 1) then
if (add_task_to_taskserver(zmq_to_qp_run_socket,trim(task(1:ipos))) == -1) then
stop 'Unable to add task'
endif
ipos=1
endif
allocate(u_t(N_st,N_det))
do k=1,N_st
call cdset_order(u_0(1,k),psi_bilinear_matrix_order,N_det)
enddo
call cdtranspose( &
u_0, &
size(u_0, 1), &
u_t, &
size(u_t, 1), &
N_det, N_st)
ASSERT (N_st == N_states_diag)
ASSERT (sze >= N_det)
integer :: rc, ni, nj
integer*8 :: rc8
double precision :: energy(N_st)
integer, external :: zmq_put_dvector, zmq_put_psi, zmq_put_N_states_diag
integer, external :: zmq_put_cdmatrix
!todo: size/2 for complex?
if (size(u_t) < 8388608) then
ni = size(u_t)
nj = 1
else
ni = 8388608
nj = size(u_t)/8388608 + 1
endif
! Warning : dimensions are modified for efficiency, It is OK since we get the
! full matrix
if (zmq_put_cdmatrix(zmq_to_qp_run_socket, 1, 'u_t', u_t, ni, nj, size(u_t,kind=8)) == -1) then
stop 'Unable to put u_t on ZMQ server'
endif
deallocate(u_t)
integer, external :: zmq_set_running
if (zmq_set_running(zmq_to_qp_run_socket) == -1) then
print *, irp_here, ': Failed in zmq_set_running'
endif
call omp_set_nested(.True.)
!$OMP PARALLEL DEFAULT(shared) NUM_THREADS(2) PRIVATE(ithread)
ithread = omp_get_thread_num()
if (ithread == 0 ) then
call davidson_collector_complex(zmq_to_qp_run_socket, zmq_socket_pull, v_0, s_0, N_det, N_st)
else
call davidson_slave_inproc(1)
endif
!$OMP END PARALLEL
call end_parallel_job(zmq_to_qp_run_socket, zmq_socket_pull, 'davidson')
!$OMP PARALLEL
!$OMP SINGLE
do k=1,N_st
!$OMP TASK DEFAULT(SHARED) FIRSTPRIVATE(k,N_det)
call cdset_order(v_0(1,k),psi_bilinear_matrix_order_reverse,N_det)
!$OMP END TASK
!$OMP TASK DEFAULT(SHARED) FIRSTPRIVATE(k,N_det)
call cdset_order(s_0(1,k),psi_bilinear_matrix_order_reverse,N_det)
!$OMP END TASK
!$OMP TASK DEFAULT(SHARED) FIRSTPRIVATE(k,N_det)
call cdset_order(u_0(1,k),psi_bilinear_matrix_order_reverse,N_det)
!$OMP END TASK
enddo
!$OMP END SINGLE
!$OMP TASKWAIT
!$OMP END PARALLEL
! N_states_diag = N_states_diag_save
! SOFT_TOUCH N_states_diag
end

View File

@ -6,7 +6,11 @@
!
! psi_s2(i) = $\langle \Psi_i | S^2 | \Psi_i \rangle$
END_DOC
call u_0_H_u_0(psi_energy,psi_s2,psi_coef,N_det,psi_det,N_int,N_states,psi_det_size)
if (is_complex) then
call u_0_h_u_0_complex(psi_energy,psi_s2,psi_coef_complex,N_det,psi_det,N_int,N_states,psi_det_size)
else
call u_0_H_u_0(psi_energy,psi_s2,psi_coef,N_det,psi_det,N_int,N_states,psi_det_size)
endif
integer :: i
do i=N_det+1,N_states
psi_energy(i) = 0.d0
@ -708,3 +712,703 @@ N_int;;
END_TEMPLATE
!==============================================================================!
! !
! Complex !
! !
!==============================================================================!
subroutine u_0_H_u_0_complex(e_0,s_0,u_0,n,keys_tmp,Nint,N_st,sze)
!todo: implement for complex
print*,irp_here,' not implemented for complex'
stop -1
use bitmasks
implicit none
BEGIN_DOC
! Computes $E_0 = \frac{\langle u_0 | H | u_0 \rangle}{\langle u_0 | u_0 \rangle}$
!
! and $S_0 = \frac{\langle u_0 | S^2 | u_0 \rangle}{\langle u_0 | u_0 \rangle}$
!
! n : number of determinants
!
END_DOC
integer, intent(in) :: n,Nint, N_st, sze
double precision, intent(out) :: e_0(N_st),s_0(N_st)
complex*16, intent(inout) :: u_0(sze,N_st)
integer(bit_kind),intent(in) :: keys_tmp(Nint,2,n)
complex*16, allocatable :: v_0(:,:), s_vec(:,:), u_1(:,:)
double precision :: u_dot_u_complex,diag_H_mat_elem
complex*16 :: u_dot_v_complex
integer :: i,j, istate
if ((n > 100000).and.distributed_davidson) then
allocate (v_0(n,N_states_diag),s_vec(n,N_states_diag), u_1(n,N_states_diag))
u_1(:,:) = (0.d0,0.d0)
u_1(1:n,1:N_st) = u_0(1:n,1:N_st)
call h_s2_u_0_nstates_zmq(v_0,s_vec,u_1,N_states_diag,n)
else if (n < n_det_max_full) then
allocate (v_0(n,N_st),s_vec(n,N_st), u_1(n,N_st))
v_0(:,:) = 0.d0
u_1(:,:) = 0.d0
s_vec(:,:) = 0.d0
u_1(1:n,1:N_st) = u_0(1:n,1:N_st)
do istate = 1,N_st
do j=1,n
do i=1,n
v_0(i,istate) = v_0(i,istate) + h_matrix_all_dets(i,j) * u_0(j,istate)
s_vec(i,istate) = s_vec(i,istate) + S2_matrix_all_dets(i,j) * u_0(j,istate)
enddo
enddo
enddo
else
allocate (v_0(n,N_st),s_vec(n,N_st),u_1(n,N_st))
u_1(:,:) = 0.d0
u_1(1:n,1:N_st) = u_0(1:n,1:N_st)
call H_S2_u_0_nstates_openmp(v_0,s_vec,u_1,N_st,n)
endif
u_0(1:n,1:N_st) = u_1(1:n,1:N_st)
deallocate(u_1)
double precision :: norm
!$OMP PARALLEL DO PRIVATE(i,norm) DEFAULT(SHARED)
do i=1,N_st
norm = u_dot_u(u_0(1,i),n)
if (norm /= 0.d0) then
e_0(i) = u_dot_v(v_0(1,i),u_0(1,i),n)
s_0(i) = u_dot_v(s_vec(1,i),u_0(1,i),n)
else
e_0(i) = 0.d0
s_0(i) = 0.d0
endif
enddo
!$OMP END PARALLEL DO
deallocate (s_vec, v_0)
end
subroutine H_S2_u_0_nstates_openmp_complex(v_0,s_0,u_0,N_st,sze)
!todo: implement for complex
print*,irp_here,' not implemented for complex'
stop -1
use bitmasks
implicit none
BEGIN_DOC
! Computes $v_0 = H | u_0\rangle$ and $s_0 = S^2 | u_0\rangle$.
!
! Assumes that the determinants are in psi_det
!
! istart, iend, ishift, istep are used in ZMQ parallelization.
END_DOC
integer, intent(in) :: N_st,sze
double precision, intent(inout) :: v_0(sze,N_st), s_0(sze,N_st), u_0(sze,N_st)
integer :: k
double precision, allocatable :: u_t(:,:), v_t(:,:), s_t(:,:)
!DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: u_t
allocate(u_t(N_st,N_det),v_t(N_st,N_det),s_t(N_st,N_det))
do k=1,N_st
call dset_order(u_0(1,k),psi_bilinear_matrix_order,N_det)
enddo
v_t = 0.d0
s_t = 0.d0
call dtranspose( &
u_0, &
size(u_0, 1), &
u_t, &
size(u_t, 1), &
N_det, N_st)
call H_S2_u_0_nstates_openmp_work(v_t,s_t,u_t,N_st,sze,1,N_det,0,1)
deallocate(u_t)
call dtranspose( &
v_t, &
size(v_t, 1), &
v_0, &
size(v_0, 1), &
N_st, N_det)
call dtranspose( &
s_t, &
size(s_t, 1), &
s_0, &
size(s_0, 1), &
N_st, N_det)
deallocate(v_t,s_t)
do k=1,N_st
call dset_order(v_0(1,k),psi_bilinear_matrix_order_reverse,N_det)
call dset_order(s_0(1,k),psi_bilinear_matrix_order_reverse,N_det)
call dset_order(u_0(1,k),psi_bilinear_matrix_order_reverse,N_det)
enddo
end
subroutine H_S2_u_0_nstates_openmp_work_complex(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
!todo: implement for complex
print*,irp_here,' not implemented for complex'
stop -1
use bitmasks
implicit none
BEGIN_DOC
! Computes $v_t = H | u_t\rangle$ and $s_t = S^2 | u_t\rangle$
!
! Default should be 1,N_det,0,1
END_DOC
integer, intent(in) :: N_st,sze,istart,iend,ishift,istep
complex*16, intent(in) :: u_t(N_st,N_det)
complex*16, intent(out) :: v_t(N_st,sze), s_t(N_st,sze)
PROVIDE ref_bitmask_energy N_int
select case (N_int)
case (1)
call H_S2_u_0_nstates_openmp_work_complex_1(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
case (2)
call H_S2_u_0_nstates_openmp_work_complex_2(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
case (3)
call H_S2_u_0_nstates_openmp_work_complex_3(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
case (4)
call H_S2_u_0_nstates_openmp_work_complex_4(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
case default
call H_S2_u_0_nstates_openmp_work_complex_N_int(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
end select
end
BEGIN_TEMPLATE
subroutine H_S2_u_0_nstates_openmp_work_complex_$N_int(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
!todo: implement for complex
print*,irp_here,' not implemented for complex'
stop -1
use bitmasks
implicit none
BEGIN_DOC
! Computes $v_t = H | u_t \\rangle$ and $s_t = S^2 | u_t\\rangle$
!
! Default should be 1,N_det,0,1
END_DOC
integer, intent(in) :: N_st,sze,istart,iend,ishift,istep
complex*16, intent(in) :: u_t(N_st,N_det)
complex*16, intent(out) :: v_t(N_st,sze), s_t(N_st,sze)
complex*16 :: hij, sij
integer :: i,j,k,l,kk
integer :: k_a, k_b, l_a, l_b, m_a, m_b
integer :: istate
integer :: krow, kcol, krow_b, kcol_b
integer :: lrow, lcol
integer :: mrow, mcol
integer(bit_kind) :: spindet($N_int)
integer(bit_kind) :: tmp_det($N_int,2)
integer(bit_kind) :: tmp_det2($N_int,2)
integer(bit_kind) :: tmp_det3($N_int,2)
integer(bit_kind), allocatable :: buffer(:,:)
integer :: n_doubles
integer, allocatable :: doubles(:)
integer, allocatable :: singles_a(:)
integer, allocatable :: singles_b(:)
integer, allocatable :: idx(:), idx0(:)
integer :: maxab, n_singles_a, n_singles_b, kcol_prev
integer*8 :: k8
logical :: compute_singles
integer*8 :: last_found, left, right, right_max
double precision :: rss, mem, ratio
complex*16, allocatable :: utl(:,:)
integer, parameter :: block_size=128
! call resident_memory(rss)
! mem = dble(singles_beta_csc_size) / 1024.d0**3
!
! compute_singles = (mem+rss > qp_max_mem)
!
! if (.not.compute_singles) then
! provide singles_beta_csc
! endif
compute_singles=.True.
maxab = max(N_det_alpha_unique, N_det_beta_unique)+1
allocate(idx0(maxab))
do i=1,maxab
idx0(i) = i
enddo
! Prepare the array of all alpha single excitations
! -------------------------------------------------
PROVIDE N_int nthreads_davidson
!$OMP PARALLEL DEFAULT(SHARED) NUM_THREADS(nthreads_davidson) &
!$OMP SHARED(psi_bilinear_matrix_rows, N_det, &
!$OMP psi_bilinear_matrix_columns, &
!$OMP psi_det_alpha_unique, psi_det_beta_unique, &
!$OMP n_det_alpha_unique, n_det_beta_unique, N_int, &
!$OMP psi_bilinear_matrix_transp_rows, &
!$OMP psi_bilinear_matrix_transp_columns, &
!$OMP psi_bilinear_matrix_transp_order, N_st, &
!$OMP psi_bilinear_matrix_order_transp_reverse, &
!$OMP psi_bilinear_matrix_columns_loc, &
!$OMP psi_bilinear_matrix_transp_rows_loc, &
!$OMP istart, iend, istep, irp_here, v_t, s_t, &
!$OMP ishift, idx0, u_t, maxab, compute_singles, &
!$OMP singles_alpha_csc,singles_alpha_csc_idx, &
!$OMP singles_beta_csc,singles_beta_csc_idx) &
!$OMP PRIVATE(krow, kcol, tmp_det, spindet, k_a, k_b, i, &
!$OMP lcol, lrow, l_a, l_b, utl, kk, &
!$OMP buffer, doubles, n_doubles, &
!$OMP tmp_det2, hij, sij, idx, l, kcol_prev, &
!$OMP singles_a, n_singles_a, singles_b, ratio, &
!$OMP n_singles_b, k8, last_found,left,right,right_max)
! Alpha/Beta double excitations
! =============================
allocate( buffer($N_int,maxab), &
singles_a(maxab), &
singles_b(maxab), &
doubles(maxab), &
idx(maxab), utl(N_st,block_size))
kcol_prev=-1
ASSERT (iend <= N_det)
ASSERT (istart > 0)
ASSERT (istep > 0)
!$OMP DO SCHEDULE(guided,64)
do k_a=istart+ishift,iend,istep
krow = psi_bilinear_matrix_rows(k_a)
ASSERT (krow <= N_det_alpha_unique)
kcol = psi_bilinear_matrix_columns(k_a)
ASSERT (kcol <= N_det_beta_unique)
tmp_det(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, krow)
if (kcol /= kcol_prev) then
tmp_det(1:$N_int,2) = psi_det_beta_unique (1:$N_int, kcol)
if (compute_singles) then
call get_all_spin_singles_$N_int( &
psi_det_beta_unique, idx0, &
tmp_det(1,2), N_det_beta_unique, &
singles_b, n_singles_b)
else
n_singles_b = 0
!DIR$ LOOP COUNT avg(1000)
do k8=singles_beta_csc_idx(kcol),singles_beta_csc_idx(kcol+1)-1
n_singles_b = n_singles_b+1
singles_b(n_singles_b) = singles_beta_csc(k8)
enddo
endif
endif
kcol_prev = kcol
! Loop over singly excited beta columns
! -------------------------------------
!DIR$ LOOP COUNT avg(1000)
do i=1,n_singles_b
lcol = singles_b(i)
tmp_det2(1:$N_int,2) = psi_det_beta_unique(1:$N_int, lcol)
!---
! if (compute_singles) then
l_a = psi_bilinear_matrix_columns_loc(lcol)
ASSERT (l_a <= N_det)
!DIR$ UNROLL(8)
!DIR$ LOOP COUNT avg(50000)
do j=1,psi_bilinear_matrix_columns_loc(lcol+1) - psi_bilinear_matrix_columns_loc(lcol)
lrow = psi_bilinear_matrix_rows(l_a)
ASSERT (lrow <= N_det_alpha_unique)
buffer(1:$N_int,j) = psi_det_alpha_unique(1:$N_int, lrow) ! hot spot
ASSERT (l_a <= N_det)
idx(j) = l_a
l_a = l_a+1
enddo
j = j-1
call get_all_spin_singles_$N_int( &
buffer, idx, tmp_det(1,1), j, &
singles_a, n_singles_a )
!-----
! else
!
! ! Search for singles
!
!call cpu_time(time0)
! ! Right boundary
! l_a = psi_bilinear_matrix_columns_loc(lcol+1)-1
! ASSERT (l_a <= N_det)
! do j=1,psi_bilinear_matrix_columns_loc(lcol+1) - psi_bilinear_matrix_columns_loc(lcol)
! lrow = psi_bilinear_matrix_rows(l_a)
! ASSERT (lrow <= N_det_alpha_unique)
!
! left = singles_alpha_csc_idx(krow)
! right_max = -1_8
! right = singles_alpha_csc_idx(krow+1)
! do while (right-left>0_8)
! k8 = shiftr(right+left,1)
! if (singles_alpha_csc(k8) > lrow) then
! right = k8
! else if (singles_alpha_csc(k8) < lrow) then
! left = k8 + 1_8
! else
! right_max = k8+1_8
! exit
! endif
! enddo
! if (right_max > 0_8) exit
! l_a = l_a-1
! enddo
! if (right_max < 0_8) right_max = singles_alpha_csc_idx(krow)
!
! ! Search
! n_singles_a = 0
! l_a = psi_bilinear_matrix_columns_loc(lcol)
! ASSERT (l_a <= N_det)
!
! last_found = singles_alpha_csc_idx(krow)
! do j=1,psi_bilinear_matrix_columns_loc(lcol+1) - psi_bilinear_matrix_columns_loc(lcol)
! lrow = psi_bilinear_matrix_rows(l_a)
! ASSERT (lrow <= N_det_alpha_unique)
!
! left = last_found
! right = right_max
! do while (right-left>0_8)
! k8 = shiftr(right+left,1)
! if (singles_alpha_csc(k8) > lrow) then
! right = k8
! else if (singles_alpha_csc(k8) < lrow) then
! left = k8 + 1_8
! else
! n_singles_a += 1
! singles_a(n_singles_a) = l_a
! last_found = k8+1_8
! exit
! endif
! enddo
! l_a = l_a+1
! enddo
! j = j-1
!
! endif
!-----
! Loop over alpha singles
! -----------------------
!DIR$ LOOP COUNT avg(1000)
do k = 1,n_singles_a,block_size
! Prefetch u_t(:,l_a)
do kk=0,block_size-1
if (k+kk > n_singles_a) exit
l_a = singles_a(k+kk)
ASSERT (l_a <= N_det)
do l=1,N_st
utl(l,kk+1) = u_t(l,l_a)
enddo
enddo
do kk=0,block_size-1
if (k+kk > n_singles_a) exit
l_a = singles_a(k+kk)
lrow = psi_bilinear_matrix_rows(l_a)
ASSERT (lrow <= N_det_alpha_unique)
tmp_det2(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, lrow)
call i_H_j_double_alpha_beta(tmp_det,tmp_det2,$N_int,hij)
call get_s2(tmp_det,tmp_det2,$N_int,sij)
!DIR$ LOOP COUNT AVG(4)
do l=1,N_st
v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1)
s_t(l,k_a) = s_t(l,k_a) + sij * utl(l,kk+1)
enddo
enddo
enddo
enddo
enddo
!$OMP END DO
!$OMP DO SCHEDULE(guided,64)
do k_a=istart+ishift,iend,istep
! Single and double alpha excitations
! ===================================
! Initial determinant is at k_a in alpha-major representation
! -----------------------------------------------------------------------
krow = psi_bilinear_matrix_rows(k_a)
ASSERT (krow <= N_det_alpha_unique)
kcol = psi_bilinear_matrix_columns(k_a)
ASSERT (kcol <= N_det_beta_unique)
tmp_det(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, krow)
tmp_det(1:$N_int,2) = psi_det_beta_unique (1:$N_int, kcol)
! Initial determinant is at k_b in beta-major representation
! ----------------------------------------------------------------------
k_b = psi_bilinear_matrix_order_transp_reverse(k_a)
ASSERT (k_b <= N_det)
spindet(1:$N_int) = tmp_det(1:$N_int,1)
! Loop inside the beta column to gather all the connected alphas
lcol = psi_bilinear_matrix_columns(k_a)
l_a = psi_bilinear_matrix_columns_loc(lcol)
!DIR$ LOOP COUNT avg(200000)
do i=1,N_det_alpha_unique
if (l_a > N_det) exit
lcol = psi_bilinear_matrix_columns(l_a)
if (lcol /= kcol) exit
lrow = psi_bilinear_matrix_rows(l_a)
ASSERT (lrow <= N_det_alpha_unique)
buffer(1:$N_int,i) = psi_det_alpha_unique(1:$N_int, lrow) ! Hot spot
idx(i) = l_a
l_a = l_a+1
enddo
i = i-1
call get_all_spin_singles_and_doubles_$N_int( &
buffer, idx, spindet, i, &
singles_a, doubles, n_singles_a, n_doubles )
! Compute Hij for all alpha singles
! ----------------------------------
tmp_det2(1:$N_int,2) = psi_det_beta_unique (1:$N_int, kcol)
!DIR$ LOOP COUNT avg(1000)
do i=1,n_singles_a,block_size
! Prefetch u_t(:,l_a)
do kk=0,block_size-1
if (i+kk > n_singles_a) exit
l_a = singles_a(i+kk)
ASSERT (l_a <= N_det)
do l=1,N_st
utl(l,kk+1) = u_t(l,l_a)
enddo
enddo
do kk=0,block_size-1
if (i+kk > n_singles_a) exit
l_a = singles_a(i+kk)
lrow = psi_bilinear_matrix_rows(l_a)
ASSERT (lrow <= N_det_alpha_unique)
tmp_det2(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, lrow)
call i_h_j_single_spin( tmp_det, tmp_det2, $N_int, 1, hij)
!DIR$ LOOP COUNT AVG(4)
do l=1,N_st
v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1)
! single => sij = 0
enddo
enddo
enddo
! Compute Hij for all alpha doubles
! ----------------------------------
!DIR$ LOOP COUNT avg(50000)
do i=1,n_doubles,block_size
! Prefetch u_t(:,l_a)
do kk=0,block_size-1
if (i+kk > n_doubles) exit
l_a = doubles(i+kk)
ASSERT (l_a <= N_det)
do l=1,N_st
utl(l,kk+1) = u_t(l,l_a)
enddo
enddo
do kk=0,block_size-1
if (i+kk > n_doubles) exit
l_a = doubles(i+kk)
lrow = psi_bilinear_matrix_rows(l_a)
ASSERT (lrow <= N_det_alpha_unique)
call i_H_j_double_spin( tmp_det(1,1), psi_det_alpha_unique(1, lrow), $N_int, hij)
!DIR$ LOOP COUNT AVG(4)
do l=1,N_st
v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1)
! same spin => sij = 0
enddo
enddo
enddo
! Single and double beta excitations
! ==================================
! Initial determinant is at k_a in alpha-major representation
! -----------------------------------------------------------------------
krow = psi_bilinear_matrix_rows(k_a)
kcol = psi_bilinear_matrix_columns(k_a)
tmp_det(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, krow)
tmp_det(1:$N_int,2) = psi_det_beta_unique (1:$N_int, kcol)
spindet(1:$N_int) = tmp_det(1:$N_int,2)
! Initial determinant is at k_b in beta-major representation
! -----------------------------------------------------------------------
k_b = psi_bilinear_matrix_order_transp_reverse(k_a)
ASSERT (k_b <= N_det)
! Loop inside the alpha row to gather all the connected betas
lrow = psi_bilinear_matrix_transp_rows(k_b)
l_b = psi_bilinear_matrix_transp_rows_loc(lrow)
!DIR$ LOOP COUNT avg(200000)
do i=1,N_det_beta_unique
if (l_b > N_det) exit
lrow = psi_bilinear_matrix_transp_rows(l_b)
if (lrow /= krow) exit
lcol = psi_bilinear_matrix_transp_columns(l_b)
ASSERT (lcol <= N_det_beta_unique)
buffer(1:$N_int,i) = psi_det_beta_unique(1:$N_int, lcol)
idx(i) = l_b
l_b = l_b+1
enddo
i = i-1
call get_all_spin_singles_and_doubles_$N_int( &
buffer, idx, spindet, i, &
singles_b, doubles, n_singles_b, n_doubles )
! Compute Hij for all beta singles
! ----------------------------------
tmp_det2(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, krow)
!DIR$ LOOP COUNT avg(1000)
do i=1,n_singles_b,block_size
do kk=0,block_size-1
if (i+kk > n_singles_b) exit
l_b = singles_b(i+kk)
ASSERT (l_b <= N_det)
l_a = psi_bilinear_matrix_transp_order(l_b)
ASSERT (l_a <= N_det)
do l=1,N_st
utl(l,kk+1) = u_t(l,l_a)
enddo
enddo
do kk=0,block_size-1
if (i+kk > n_singles_b) exit
l_b = singles_b(i+kk)
l_a = psi_bilinear_matrix_transp_order(l_b)
lcol = psi_bilinear_matrix_transp_columns(l_b)
ASSERT (lcol <= N_det_beta_unique)
tmp_det2(1:$N_int,2) = psi_det_beta_unique (1:$N_int, lcol)
call i_h_j_single_spin( tmp_det, tmp_det2, $N_int, 2, hij)
!DIR$ LOOP COUNT AVG(4)
do l=1,N_st
v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1)
! single => sij = 0
enddo
enddo
enddo
! Compute Hij for all beta doubles
! ----------------------------------
!DIR$ LOOP COUNT avg(50000)
do i=1,n_doubles,block_size
do kk=0,block_size-1
if (i+kk > n_doubles) exit
l_b = doubles(i+kk)
ASSERT (l_b <= N_det)
l_a = psi_bilinear_matrix_transp_order(l_b)
ASSERT (l_a <= N_det)
do l=1,N_st
utl(l,kk+1) = u_t(l,l_a)
enddo
enddo
do kk=0,block_size-1
if (i+kk > n_doubles) exit
l_b = doubles(i+kk)
l_a = psi_bilinear_matrix_transp_order(l_b)
lcol = psi_bilinear_matrix_transp_columns(l_b)
ASSERT (lcol <= N_det_beta_unique)
call i_H_j_double_spin( tmp_det(1,2), psi_det_beta_unique(1, lcol), $N_int, hij)
!DIR$ LOOP COUNT AVG(4)
do l=1,N_st
v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1)
! same spin => sij = 0
enddo
enddo
enddo
! Diagonal contribution
! =====================
! Initial determinant is at k_a in alpha-major representation
! -----------------------------------------------------------------------
krow = psi_bilinear_matrix_rows(k_a)
ASSERT (krow <= N_det_alpha_unique)
kcol = psi_bilinear_matrix_columns(k_a)
ASSERT (kcol <= N_det_beta_unique)
tmp_det(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, krow)
tmp_det(1:$N_int,2) = psi_det_beta_unique (1:$N_int, kcol)
double precision, external :: diag_H_mat_elem, diag_S_mat_elem
hij = diag_H_mat_elem(tmp_det,$N_int)
sij = diag_S_mat_elem(tmp_det,$N_int)
!DIR$ LOOP COUNT AVG(4)
do l=1,N_st
v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,k_a)
s_t(l,k_a) = s_t(l,k_a) + sij * u_t(l,k_a)
enddo
end do
!$OMP END DO
deallocate(buffer, singles_a, singles_b, doubles, idx, utl)
!$OMP END PARALLEL
end
SUBST [ N_int ]
1;;
2;;
3;;
4;;
N_int;;
END_TEMPLATE

View File

@ -84,3 +84,100 @@ recursive subroutine dtranspose(A,LDA,B,LDB,d1,d2)
end
!DIR$ attributes forceinline :: cdtranspose
recursive subroutine cdtranspose(A,LDA,B,LDB,d1,d2)
implicit none
BEGIN_DOC
! Transpose input matrix A into output matrix B
! don't take complex conjugate
END_DOC
integer, intent(in) :: d1, d2, LDA, LDB
complex*16, intent(in) :: A(LDA,d2)
complex*16, intent(out) :: B(LDB,d1)
! do j=1,d1
! do i=1,d2
! B(i,j ) = A(j ,i)
! enddo
! enddo
! return
integer :: i,j,k
if ( d2 < 32 ) then
do j=1,d1
!DIR$ LOOP COUNT (16)
do i=1,d2
B(i,j ) = A(j ,i)
enddo
enddo
return
else if (d1 > d2) then
!DIR$ forceinline
k=d1/2
!DIR$ forceinline recursive
call cdtranspose(A(1,1),LDA,B(1,1),LDB,k,d2)
!DIR$ forceinline recursive
call cdtranspose(A(k+1,1),LDA,B(1,k+1),LDB,d1-k,d2)
return
else
!DIR$ forceinline
k=d2/2
!DIR$ forceinline recursive
call cdtranspose(A(1,k+1),LDA,B(k+1,1),LDB,d1,d2-k)
!DIR$ forceinline recursive
call cdtranspose(A(1,1),LDA,B(1,1),LDB,d1,k)
return
endif
end
!DIR$ attributes forceinline :: cdadjoint
recursive subroutine cdadjoint(A,LDA,B,LDB,d1,d2)
implicit none
BEGIN_DOC
! Transpose input matrix A into output matrix B
! and take complex conjugate
END_DOC
integer, intent(in) :: d1, d2, LDA, LDB
complex*16, intent(in) :: A(LDA,d2)
complex*16, intent(out) :: B(LDB,d1)
! do j=1,d1
! do i=1,d2
! B(i,j ) = A(j ,i)
! enddo
! enddo
! return
integer :: i,j,k
if ( d2 < 32 ) then
do j=1,d1
!DIR$ LOOP COUNT (16)
do i=1,d2
B(i,j ) = conjg(A(j ,i))
enddo
enddo
return
else if (d1 > d2) then
!DIR$ forceinline
k=d1/2
!DIR$ forceinline recursive
call cdadjoint(A(1,1),LDA,B(1,1),LDB,k,d2)
!DIR$ forceinline recursive
call cdadjoint(A(k+1,1),LDA,B(1,k+1),LDB,d1-k,d2)
return
else
!DIR$ forceinline
k=d2/2
!DIR$ forceinline recursive
call cdadjoint(A(1,k+1),LDA,B(k+1,1),LDB,d1,d2-k)
!DIR$ forceinline recursive
call cdadjoint(A(1,1),LDA,B(1,1),LDB,d1,k)
return
endif
end

View File

@ -1,6 +1,11 @@
-------------------------------------------------------------------------------------
current:
irp_align for complex?
zmq_put_psi_complex instead of branch inside zmq_put_psi?
are there cases where we call this without already being on a complex branch of code?
h_apply.irp.f
push/pull_pt2
pt2,norm_pert,h_pert_diag
@ -83,6 +88,27 @@ determinants:
and broadcast_chunks_complex_double in mpi/mpi.irp.f
davidson
(****) davidson_parallel.irp.f
davidson_slave_work
branch inside or outside?
(currently inside)
same broadcast size issue as in h_apply (2^23 elements)
needs h_s2_u_0_nstates_openmp_work_complex (be careful with transp/conjg)
needs davidson_push_results_async_send_complex
davidson_pu{sh,ll}_results{,_async_send}_complex
double sz?
does f77_zmq_send8 know about types, or just send raw data?
davidson_collector_complex
is {v,s}_t conjugate transpose or just transpose?
(****) diagonalization_hs2_dressed.irp.f
(****) diagonalize_ci.irp.f
(****) EZFIO.cfg
(****) ezfio_interface.irp.f
(****) input.irp.f
(****) print_e_components.irp.f
(****) u0_h_u0.irp.f
(****) u0_wee_u0.irp.f
-------------------------------------------------------------------------------------
for complex data, add extra dim (size 2) and treat as real in EZFIO.cfg
@ -496,6 +522,3 @@ src/scf_utils/scf_density_matrix_ao_complex.irp.f
[ complex*16, SCF_density_matrix_ao_complex, (ao_num,ao_num) ]