10
0
mirror of https://github.com/LCPQ/quantum_package synced 2024-07-03 09:55:59 +02:00

Dissymmetrized u0Hu0: less memory and communication

This commit is contained in:
Anthony Scemama 2017-05-18 11:14:55 +02:00
parent ef07f9bc8e
commit ef5f905afc
3 changed files with 108 additions and 98 deletions

View File

@ -64,8 +64,8 @@ subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, N_st, sze,
integer :: imin, imax, ishift, istep
integer, allocatable :: psi_det_read(:,:,:)
double precision, allocatable :: v_0(:,:), s_0(:,:), u_t(:,:)
!DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: u_t, v_0, s_0
double precision, allocatable :: v_t(:,:), s_t(:,:), u_t(:,:)
!DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: u_t, v_t, s_t
! Get wave function (u_t)
! -----------------------
@ -108,7 +108,7 @@ subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, N_st, sze,
TOUCH N_det
endif
allocate(v_0(sze,N_st), s_0(sze,N_st),u_t(N_st,N_det_read))
allocate(u_t(N_st,N_det_read))
rc = f77_zmq_recv(zmq_to_qp_run_socket,psi_det,N_int*2*N_det_read*bit_kind,0)
if (rc /= N_int*2*N_det_read*bit_kind) then
@ -133,41 +133,50 @@ subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, N_st, sze,
! ---------
allocate(v_t(N_st,N_det), s_t(N_st,N_det))
do
v_0 = 0.d0
s_0 = 0.d0
call get_task_from_taskserver(zmq_to_qp_run_socket,worker_id, task_id, msg)
if(task_id == 0) exit
read (msg,*) imin, imax, ishift, istep
call H_S2_u_0_nstates_openmp_work(v_0,s_0,u_t,N_st,N_det,imin,imax,ishift,istep)
v_t = 0.d0
s_t = 0.d0
call H_S2_u_0_nstates_openmp_work(v_t,s_t,u_t,N_st,N_det,imin,imax,ishift,istep)
call task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id)
call davidson_push_results(zmq_socket_push, v_0, s_0, task_id)
call davidson_push_results(zmq_socket_push, v_t, s_t, imin, imax, task_id)
end do
deallocate(v_0, s_0, u_t)
deallocate(u_t,v_t, s_t)
end subroutine
subroutine davidson_push_results(zmq_socket_push, v_0, s_0, task_id)
subroutine davidson_push_results(zmq_socket_push, v_t, s_t, imin, imax, task_id)
use f77_zmq
implicit none
integer(ZMQ_PTR) ,intent(in) :: zmq_socket_push
integer ,intent(in) :: task_id
double precision ,intent(in) :: v_0(N_det,N_states_diag)
double precision ,intent(in) :: s_0(N_det,N_states_diag)
integer :: rc
integer ,intent(in) :: task_id, imin, imax
double precision ,intent(in) :: v_t(N_states_diag,N_det)
double precision ,intent(in) :: s_t(N_states_diag,N_det)
integer :: rc, sz
rc = f77_zmq_send( zmq_socket_push, v_0, 8*N_states_diag*N_det, ZMQ_SNDMORE)
if(rc /= 8*N_states_diag* N_det) stop "davidson_push_results failed to push vt"
sz = (imax-imin+1)*N_states_diag
rc = f77_zmq_send( zmq_socket_push, s_0, 8*N_states_diag*N_det, ZMQ_SNDMORE)
if(rc /= 8*N_states_diag* N_det) stop "davidson_push_results failed to push st"
rc = f77_zmq_send( zmq_socket_push, task_id, 4, 0)
rc = f77_zmq_send( zmq_socket_push, task_id, 4, ZMQ_SNDMORE)
if(rc /= 4) stop "davidson_push_results failed to push task_id"
rc = f77_zmq_send( zmq_socket_push, imin, 4, ZMQ_SNDMORE)
if(rc /= 4) stop "davidson_push_results failed to push imin"
rc = f77_zmq_send( zmq_socket_push, imax, 4, ZMQ_SNDMORE)
if(rc /= 4) stop "davidson_push_results failed to push imax"
rc = f77_zmq_send( zmq_socket_push, v_t(1,imin), 8*sz, ZMQ_SNDMORE)
if(rc /= 8*sz) stop "davidson_push_results failed to push vt"
rc = f77_zmq_send( zmq_socket_push, s_t(1,imin), 8*sz, 0)
if(rc /= 8*sz) stop "davidson_push_results failed to push st"
! Activate is zmq_socket_push is a REQ
IRP_IF ZMQ_PUSH
IRP_ELSE
@ -183,26 +192,34 @@ end subroutine
subroutine davidson_pull_results(zmq_socket_pull, v_0, s_0, task_id)
subroutine davidson_pull_results(zmq_socket_pull, v_t, s_t, imin, imax, task_id)
use f77_zmq
implicit none
integer(ZMQ_PTR) ,intent(in) :: zmq_socket_pull
integer ,intent(out) :: task_id
double precision ,intent(out) :: v_0(N_det,N_states_diag)
double precision ,intent(out) :: s_0(N_det,N_states_diag)
integer ,intent(out) :: task_id, imin, imax
double precision ,intent(out) :: v_t(N_states_diag,N_det)
double precision ,intent(out) :: s_t(N_states_diag,N_det)
integer :: rc
rc = f77_zmq_recv( zmq_socket_pull, v_0, 8*N_det*N_states_diag, 0)
if(rc /= 8*N_det*N_states_diag) stop "davidson_push_results failed to pull v_0"
rc = f77_zmq_recv( zmq_socket_pull, s_0, 8*N_det*N_states_diag, 0)
if(rc /= 8*N_det*N_states_diag) stop "davidson_push_results failed to pull s_0"
integer :: rc, sz
rc = f77_zmq_recv( zmq_socket_pull, task_id, 4, 0)
if(rc /= 4) stop "davidson_pull_results failed to pull task_id"
rc = f77_zmq_recv( zmq_socket_pull, imin, 4, 0)
if(rc /= 4) stop "davidson_pull_results failed to pull task_id"
rc = f77_zmq_recv( zmq_socket_pull, imax, 4, 0)
if(rc /= 4) stop "davidson_pull_results failed to pull task_id"
sz = (imax-imin+1)*N_states_diag
rc = f77_zmq_recv( zmq_socket_pull, v_t(1,imin), 8*sz, 0)
if(rc /= 8*sz) stop "davidson_pull_results failed to pull v_t"
rc = f77_zmq_recv( zmq_socket_pull, s_t(1,imin), 8*sz, 0)
if(rc /= 8*sz) stop "davidson_pull_results failed to pull s_t"
! Activate if zmq_socket_pull is a REP
IRP_IF ZMQ_PUSH
IRP_ELSE
@ -227,29 +244,29 @@ subroutine davidson_collector(zmq_to_qp_run_socket, v0, s0, sze, N_st)
double precision ,intent(inout) :: v0(sze, N_st)
double precision ,intent(inout) :: s0(sze, N_st)
integer :: more, task_id
integer :: more, task_id, imin, imax
double precision, allocatable :: v_0(:,:), s_0(:,:)
double precision, allocatable :: v_t(:,:), s_t(:,:)
integer :: i,j
integer(ZMQ_PTR), external :: new_zmq_pull_socket
integer(ZMQ_PTR) :: zmq_socket_pull
allocate(v_0(N_det,N_st), s_0(N_det,N_st))
allocate(v_t(N_st,N_det), s_t(N_st,N_det))
v0 = 0.d0
s0 = 0.d0
more = 1
zmq_socket_pull = new_zmq_pull_socket()
do while (more == 1)
call davidson_pull_results(zmq_socket_pull, v_0, s_0, task_id)
call davidson_pull_results(zmq_socket_pull, v_t, s_t, imin, imax, task_id)
do j=1,N_st
do i=1,N_det
v0(i,j) = v0(i,j) + v_0(i,j)
s0(i,j) = s0(i,j) + s_0(i,j)
do i=imin,imax
v0(i,j) = v0(i,j) + v_t(j,i)
s0(i,j) = s0(i,j) + s_t(j,i)
enddo
enddo
call zmq_delete_task(zmq_to_qp_run_socket,zmq_socket_pull,task_id,more)
end do
deallocate(v_0,s_0)
deallocate(v_t,s_t)
call end_zmq_pull_socket(zmq_socket_pull)
end subroutine
@ -349,16 +366,15 @@ subroutine H_S2_u_0_nstates_zmq(v_0,s_0,u_0,N_st,sze)
integer :: istep, imin, imax, ishift
double precision :: w, max_workload, N_det_inv, di
max_workload = 1000000.d0
w = 0.d0
istep=8
istep=1
ishift=0
imin=1
N_det_inv = 1.d0/dble(N_det)
di = dble(N_det)
max_workload = 50000.d0
do imax=1,N_det
di = di-1.d0
w = w + di*N_det_inv
w = w + 1.d0
if (w > max_workload) then
do ishift=0,istep-1
write(task,'(4(I9,1X),1A)') imin, imax, ishift, istep, '|'

View File

@ -21,14 +21,14 @@ subroutine H_S2_u_0_nstates_openmp(v_0,s_0,u_0,N_st,sze)
integer, intent(in) :: N_st,sze
double precision, intent(inout) :: v_0(sze,N_st), s_0(sze,N_st), u_0(sze,N_st)
integer :: k
double precision, allocatable :: u_t(:,:)
double precision, allocatable :: u_t(:,:), v_t(:,:), s_t(:,:)
!DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: u_t
allocate(u_t(N_st,N_det))
allocate(u_t(N_st,N_det),v_t(N_st,N_det),s_t(N_st,N_det))
do k=1,N_st
call dset_order(u_0(1,k),psi_bilinear_matrix_order,N_det)
enddo
v_0 = 0.d0
s_0 = 0.d0
v_t = 0.d0
s_t = 0.d0
call dtranspose( &
u_0, &
size(u_0, 1), &
@ -36,9 +36,23 @@ subroutine H_S2_u_0_nstates_openmp(v_0,s_0,u_0,N_st,sze)
size(u_t, 1), &
N_det, N_st)
call H_S2_u_0_nstates_openmp_work(v_0,s_0,u_t,N_st,sze,1,N_det,0,1)
call H_S2_u_0_nstates_openmp_work(v_t,s_t,u_t,N_st,sze,1,N_det,0,1)
deallocate(u_t)
call dtranspose( &
v_t, &
size(v_t, 1), &
v_0, &
size(v_0, 1), &
N_st, N_det)
call dtranspose( &
s_t, &
size(s_t, 1), &
s_0, &
size(s_0, 1), &
N_st, N_det)
deallocate(v_t,s_t)
do k=1,N_st
call dset_order(v_0(1,k),psi_bilinear_matrix_order_reverse,N_det)
call dset_order(s_0(1,k),psi_bilinear_matrix_order_reverse,N_det)
@ -48,47 +62,47 @@ subroutine H_S2_u_0_nstates_openmp(v_0,s_0,u_0,N_st,sze)
end
subroutine H_S2_u_0_nstates_openmp_work(v_0,s_0,u_t,N_st,sze,istart,iend,ishift,istep)
subroutine H_S2_u_0_nstates_openmp_work(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
use bitmasks
implicit none
BEGIN_DOC
! Computes v_0 = H|u_0> and s_0 = S^2 |u_0>
! Computes v_t = H|u_t> and s_t = S^2 |u_t>
!
! Default should be 1,N_det,0,1
END_DOC
integer, intent(in) :: N_st,sze,istart,iend,ishift,istep
double precision, intent(in) :: u_t(N_st,N_det)
double precision, intent(out) :: v_0(sze,N_st), s_0(sze,N_st)
double precision, intent(out) :: v_t(N_st,sze), s_t(N_st,sze)
PROVIDE ref_bitmask_energy N_int
select case (N_int)
case (1)
call H_S2_u_0_nstates_openmp_work_1(v_0,s_0,u_t,N_st,sze,istart,iend,ishift,istep)
call H_S2_u_0_nstates_openmp_work_1(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
case (2)
call H_S2_u_0_nstates_openmp_work_2(v_0,s_0,u_t,N_st,sze,istart,iend,ishift,istep)
call H_S2_u_0_nstates_openmp_work_2(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
case (3)
call H_S2_u_0_nstates_openmp_work_3(v_0,s_0,u_t,N_st,sze,istart,iend,ishift,istep)
call H_S2_u_0_nstates_openmp_work_3(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
case (4)
call H_S2_u_0_nstates_openmp_work_4(v_0,s_0,u_t,N_st,sze,istart,iend,ishift,istep)
call H_S2_u_0_nstates_openmp_work_4(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
case default
call H_S2_u_0_nstates_openmp_work_N_int(v_0,s_0,u_t,N_st,sze,istart,iend,ishift,istep)
call H_S2_u_0_nstates_openmp_work_N_int(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
end select
end
BEGIN_TEMPLATE
subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_0,s_0,u_t,N_st,sze,istart,iend,ishift,istep)
subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
use bitmasks
implicit none
BEGIN_DOC
! Computes v_0 = H|u_0> and s_0 = S^2 |u_0>
! Computes v_t = H|u_t> and s_t = S^2 |u_t>
!
! Default should be 1,N_det,0,1
END_DOC
integer, intent(in) :: N_st,sze,istart,iend,ishift,istep
double precision, intent(in) :: u_t(N_st,N_det)
double precision, intent(out) :: v_0(sze,N_st), s_0(sze,N_st)
double precision, intent(out) :: v_t(N_st,sze), s_t(N_st,sze)
double precision :: hij, sij
integer :: i,j,k,l
@ -109,8 +123,6 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_0,s_0,u_t,N_st,sze,istart,iend,
integer, allocatable :: idx(:), idx0(:)
integer :: maxab, n_singles_a, n_singles_b, kcol_prev
integer*8 :: k8
double precision, allocatable :: v_t(:,:), s_t(:,:)
!DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: v_t, s_t
maxab = max(N_det_alpha_unique, N_det_beta_unique)+1
allocate(idx0(maxab))
@ -133,14 +145,15 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_0,s_0,u_t,N_st,sze,istart,iend,
!$OMP psi_bilinear_matrix_transp_order, N_st, &
!$OMP psi_bilinear_matrix_order_transp_reverse, &
!$OMP psi_bilinear_matrix_columns_loc, &
!$OMP istart, iend, istep, irp_here, &
!$OMP ishift, idx0, u_t, maxab, v_0, s_0) &
!$OMP psi_bilinear_matrix_transp_rows_loc, &
!$OMP istart, iend, istep, irp_here, v_t, s_t, &
!$OMP ishift, idx0, u_t, maxab) &
!$OMP PRIVATE(krow, kcol, tmp_det, spindet, k_a, k_b, i, &
!$OMP lcol, lrow, l_a, l_b, &
!$OMP buffer, doubles, n_doubles, &
!$OMP tmp_det2, hij, sij, idx, l, kcol_prev, v_t, &
!$OMP tmp_det2, hij, sij, idx, l, kcol_prev, &
!$OMP singles_a, n_singles_a, singles_b, &
!$OMP n_singles_b, s_t, k8)
!$OMP n_singles_b, k8)
! Alpha/Beta double excitations
! =============================
@ -149,12 +162,9 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_0,s_0,u_t,N_st,sze,istart,iend,
singles_a(maxab), &
singles_b(maxab), &
doubles(maxab), &
idx(maxab), &
v_t(N_st,N_det), s_t(N_st,N_det))
kcol_prev=-1
idx(maxab))
v_t = 0.d0
s_t = 0.d0
kcol_prev=-1
ASSERT (iend <= N_det)
ASSERT (istart > 0)
@ -168,20 +178,20 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_0,s_0,u_t,N_st,sze,istart,iend,
kcol = psi_bilinear_matrix_columns(k_a)
ASSERT (kcol <= N_det_beta_unique)
tmp_det(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, krow)
tmp_det(1:$N_int,2) = psi_det_beta_unique (1:$N_int, kcol)
if (kcol /= kcol_prev) then
call get_all_spin_singles_$N_int( &
psi_det_beta_unique(1,kcol+1), idx0(kcol+1), &
tmp_det(1,2), N_det_beta_unique-kcol, &
psi_det_beta_unique, idx0, &
tmp_det(1,2), N_det_beta_unique, &
singles_b, n_singles_b)
endif
kcol_prev = kcol
! Loop over singly excited beta columns > current column
! ------------------------------------------------------
! Loop over singly excited beta columns
! -------------------------------------
do i=1,n_singles_b
lcol = singles_b(i)
@ -202,7 +212,7 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_0,s_0,u_t,N_st,sze,istart,iend,
l_a = l_a+1
enddo
j = j-1
call get_all_spin_singles_$N_int( &
buffer, idx, tmp_det(1,1), j, &
singles_a, n_singles_a )
@ -223,8 +233,6 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_0,s_0,u_t,N_st,sze,istart,iend,
do l=1,N_st
v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,l_a)
s_t(l,k_a) = s_t(l,k_a) + sij * u_t(l,l_a)
v_t(l,l_a) = v_t(l,l_a) + hij * u_t(l,k_a)
s_t(l,l_a) = s_t(l,l_a) + sij * u_t(l,k_a)
enddo
enddo
@ -262,7 +270,8 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_0,s_0,u_t,N_st,sze,istart,iend,
spindet(1:$N_int) = tmp_det(1:$N_int,1)
! Loop inside the beta column to gather all the connected alphas
l_a = k_a+1
lcol = psi_bilinear_matrix_columns(k_a)
l_a = psi_bilinear_matrix_columns_loc(lcol)
do i=1,N_det_alpha_unique
if (l_a > N_det) exit
lcol = psi_bilinear_matrix_columns(l_a)
@ -295,7 +304,6 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_0,s_0,u_t,N_st,sze,istart,iend,
call i_H_j_mono_spin( tmp_det, tmp_det2, $N_int, 1, hij)
do l=1,N_st
v_t(l,l_a) = v_t(l,l_a) + hij * u_t(l,k_a)
v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,l_a)
! single => sij = 0
enddo
@ -314,7 +322,6 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_0,s_0,u_t,N_st,sze,istart,iend,
call i_H_j_double_spin( tmp_det(1,1), psi_det_alpha_unique(1, lrow), $N_int, hij)
do l=1,N_st
v_t(l,l_a) = v_t(l,l_a) + hij * u_t(l,k_a)
v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,l_a)
! same spin => sij = 0
enddo
@ -343,7 +350,8 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_0,s_0,u_t,N_st,sze,istart,iend,
ASSERT (k_b <= N_det)
! Loop inside the alpha row to gather all the connected betas
l_b = k_b+1
lrow = psi_bilinear_matrix_transp_rows(k_b)
l_b = psi_bilinear_matrix_transp_rows_loc(lrow)
do i=1,N_det_beta_unique
if (l_b > N_det) exit
lrow = psi_bilinear_matrix_transp_rows(l_b)
@ -377,7 +385,6 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_0,s_0,u_t,N_st,sze,istart,iend,
l_a = psi_bilinear_matrix_transp_order(l_b)
ASSERT (l_a <= N_det)
do l=1,N_st
v_t(l,l_a) = v_t(l,l_a) + hij * u_t(l,k_a)
v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,l_a)
! single => sij = 0
enddo
@ -398,7 +405,6 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_0,s_0,u_t,N_st,sze,istart,iend,
ASSERT (l_a <= N_det)
do l=1,N_st
v_t(l,l_a) = v_t(l,l_a) + hij * u_t(l,k_a)
v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,l_a)
! same spin => sij = 0
enddo
@ -431,20 +437,8 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_0,s_0,u_t,N_st,sze,istart,iend,
enddo
end do
!$OMP END DO NOWAIT
!$OMP END DO
deallocate(buffer, singles_a, singles_b, doubles, idx)
!$OMP CRITICAL
do l=1,N_st
do i=1, N_det
v_0(i,l) = v_0(i,l) + v_t(l,i)
s_0(i,l) = s_0(i,l) + s_t(l,i)
enddo
enddo
!$OMP END CRITICAL
deallocate(v_t, s_t)
!$OMP BARRIER
!$OMP END PARALLEL
end

View File

@ -500,7 +500,7 @@ subroutine H_S2_u_0_nstates_test(v_0,s_0,u_0,H_jj,S2_jj,n,keys_tmp,Nint,N_st,sze
! if (exc(0,1,2) /= 0) cycle
! if (exc(0,1,1) == 2) cycle
! if (exc(0,1,2) == 2) cycle
if ((degree==1).and.(exc(0,1,1) == 1)) cycle
! if ((degree==1).and.(exc(0,1,1) == 1)) cycle
call i_H_j(keys_tmp(1,1,j),keys_tmp(1,1,i),Nint,hij)
do l=1,N_st
!$OMP ATOMIC