From f3a46c55c1f67177ce43968ae7e6a58061700425 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 11 Oct 2016 22:44:51 +0200 Subject: [PATCH 1/3] Fixed selection bug --- plugins/Full_CI_ZMQ/selection_davidson_slave.irp.f | 2 +- plugins/Selectors_full/zmq.irp.f | 2 +- src/Davidson/davidson_parallel.irp.f | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/plugins/Full_CI_ZMQ/selection_davidson_slave.irp.f b/plugins/Full_CI_ZMQ/selection_davidson_slave.irp.f index 554c62d5..6e4cf44f 100644 --- a/plugins/Full_CI_ZMQ/selection_davidson_slave.irp.f +++ b/plugins/Full_CI_ZMQ/selection_davidson_slave.irp.f @@ -12,7 +12,7 @@ program selection_slave end subroutine provide_everything - PROVIDE H_apply_buffer_allocated mo_bielec_integrals_in_map psi_det_generators psi_coef_generators psi_det_sorted_bit psi_selectors n_det_generators n_states generators_bitmask zmq_context + PROVIDE H_apply_buffer_allocated mo_bielec_integrals_in_map psi_det_generators psi_coef_generators psi_det_sorted_bit psi_selectors n_det_generators n_states generators_bitmask zmq_context mo_mono_elec_integral ! PROVIDE ci_electronic_energy mo_tot_num N_int end diff --git a/plugins/Selectors_full/zmq.irp.f b/plugins/Selectors_full/zmq.irp.f index f3dea8f5..dfa94884 100644 --- a/plugins/Selectors_full/zmq.irp.f +++ b/plugins/Selectors_full/zmq.irp.f @@ -114,7 +114,7 @@ subroutine zmq_get_psi(zmq_to_qp_run_socket, worker_id, energy, size_energy) if (N_det_selectors_read > 0) then N_det_selectors = N_det_selectors_read endif - SOFT_TOUCH psi_det psi_coef N_det_selectors N_det_generators psi_coef_generators psi_det_generators + SOFT_TOUCH psi_det psi_coef N_det_selectors N_det_generators end diff --git a/src/Davidson/davidson_parallel.irp.f b/src/Davidson/davidson_parallel.irp.f index d610bb3d..5243d3e4 100644 --- a/src/Davidson/davidson_parallel.irp.f +++ b/src/Davidson/davidson_parallel.irp.f @@ -4,12 +4,12 @@ use bitmasks use f77_zmq -subroutine davidson_process(blockb, blocke, N, idx, vt, st, bs) +subroutine davidson_process(blockb, blockb2, N, idx, vt, st, bs, istep) implicit none - integer , intent(in) :: blockb, blocke, bs + integer , intent(in) :: blockb, bs, blockb2, istep integer , intent(inout) :: N integer , intent(inout) :: idx(bs) double precision , intent(inout) :: vt(N_states_diag, bs) From 7fcb4008de319292a8511c6d7f51099aa518880a Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 11 Oct 2016 22:45:40 +0200 Subject: [PATCH 2/3] Improved parallel davidson --- src/Davidson/davidson_parallel.irp.f | 98 ++++++++++++------------ src/Davidson/u0Hu0.irp.f | 44 +++++++---- src/Integrals_Bielec/map_integrals.irp.f | 6 +- 3 files changed, 79 insertions(+), 69 deletions(-) diff --git a/src/Davidson/davidson_parallel.irp.f b/src/Davidson/davidson_parallel.irp.f index 5243d3e4..cede52c9 100644 --- a/src/Davidson/davidson_parallel.irp.f +++ b/src/Davidson/davidson_parallel.irp.f @@ -20,11 +20,12 @@ subroutine davidson_process(blockb, blockb2, N, idx, vt, st, bs, istep) double precision :: s2, hij logical, allocatable :: wrotten(:) - allocate(wrotten(bs)) wrotten = .false. + PROVIDE dav_det - do sh = blockb, blocke + ii=0 + sh = blockb do sh2=1,shortcut_(0,1) exa = 0 do ni=1,N_int @@ -32,7 +33,7 @@ subroutine davidson_process(blockb, blockb2, N, idx, vt, st, bs, istep) end do if(exa > 2) cycle - do i=shortcut_(sh,1),shortcut_(sh+1,1)-1 + do i=blockb2+shortcut_(sh,1),shortcut_(sh+1,1)-1, istep ii = i - shortcut_(blockb,1) + 1 org_i = sort_idx_(i,1) @@ -60,47 +61,43 @@ subroutine davidson_process(blockb, blockb2, N, idx, vt, st, bs, istep) vt (istate,ii) += hij*dav_ut(istate,org_j) st (istate,ii) += s2*dav_ut(istate,org_j) enddo -! call daxpy(N_states_diag,hij,dav_ut(1,org_j),1,vt(1,org_i),1) -! call daxpy(N_states_diag,s2, dav_ut(1,org_j),1,st(1,org_i),1) endif enddo enddo enddo - enddo - do sh=blockb,min(blocke, shortcut_(0,2)) - do sh2=sh, shortcut_(0,2), shortcut_(0,1) - do i=shortcut_(sh2,2),shortcut_(sh2+1,2)-1 - ii += 1 - org_i = sort_idx_(i,2) - do j=shortcut_(sh2,2),shortcut_(sh2+1,2)-1 - if(i == j) cycle - org_j = sort_idx_(j,2) - ext = 0 - do ni=1,N_int - ext = ext + popcnt(xor(sorted_(ni,i,2), sorted_(ni,j,2))) + if (blockb <= shortcut_(0,2)) then + sh=blockb + do sh2=sh, shortcut_(0,2), shortcut_(0,1) + do i=blockb2+shortcut_(sh2,2),shortcut_(sh2+1,2)-1, istep + ii += 1 + org_i = sort_idx_(i,2) + do j=shortcut_(sh2,2),shortcut_(sh2+1,2)-1 + if(i == j) cycle + org_j = sort_idx_(j,2) + ext = 0 + do ni=1,N_int + ext = ext + popcnt(xor(sorted_(ni,i,2), sorted_(ni,j,2))) + end do + if(ext == 4) then + call i_h_j (dav_det(1,1,org_j),dav_det(1,1,org_i),n_int,hij) + call get_s2(dav_det(1,1,org_j),dav_det(1,1,org_i),n_int,s2) + if(.not. wrotten(ii)) then + wrotten(ii) = .true. + idx(ii) = org_i + vt (:,ii) = 0d0 + st (:,ii) = 0d0 + end if + do istate=1,N_states_diag + vt (istate,ii) += hij*dav_ut(istate,org_j) + st (istate,ii) += s2*dav_ut(istate,org_j) + enddo + end if end do - if(ext == 4) then - call i_h_j (dav_det(1,1,org_j),dav_det(1,1,org_i),n_int,hij) - call get_s2(dav_det(1,1,org_j),dav_det(1,1,org_i),n_int,s2) - if(.not. wrotten(ii)) then - wrotten(ii) = .true. - idx(ii) = org_i - vt (:,ii) = 0d0 - st (:,ii) = 0d0 - end if -! call daxpy(N_states_diag,hij,dav_ut(1,org_j),1,vt(1,ii),1) -! call daxpy(N_states_diag,s2, dav_ut(1,org_j),1,st(1,ii),1) - do istate=1,N_states_diag - vt (istate,ii) += hij*dav_ut(istate,org_j) - st (istate,ii) += s2*dav_ut(istate,org_j) - enddo - end if end do - end do - enddo - enddo + enddo + endif N=0 do i=1,bs @@ -112,16 +109,16 @@ subroutine davidson_process(blockb, blockb2, N, idx, vt, st, bs, istep) end if end do + end subroutine -subroutine davidson_collect(blockb, blocke, N, idx, vt, st , v0t, s0t) +subroutine davidson_collect(N, idx, vt, st , v0t, s0t) implicit none - integer , intent(in) :: blockb, blocke integer , intent(in) :: N integer , intent(in) :: idx(N) double precision , intent(in) :: vt(N_states_diag, N) @@ -175,16 +172,16 @@ end subroutine -subroutine davidson_add_task(zmq_to_qp_run_socket, blockb, blocke) +subroutine davidson_add_task(zmq_to_qp_run_socket, blockb, blockb2, istep) use f77_zmq implicit none integer(ZMQ_PTR) ,intent(in) :: zmq_to_qp_run_socket - integer ,intent(in) :: blockb, blocke + integer ,intent(in) :: blockb, blockb2, istep character*(512) :: task - write(task,*) blockb, blocke + write(task,*) blockb, blockb2, istep call add_task_to_taskserver(zmq_to_qp_run_socket, task) end subroutine @@ -213,7 +210,7 @@ subroutine davidson_run_slave(thread,iproc) integer, intent(in) :: thread, iproc - integer :: worker_id, task_id, blockb, blocke + integer :: worker_id, task_id, blockb character*(512) :: task integer(ZMQ_PTR),external :: new_zmq_to_qp_run_socket @@ -253,7 +250,7 @@ subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, worker_id) character*(512) :: task - integer :: blockb, blocke + integer :: blockb, blockb2, istep integer :: N integer , allocatable :: idx(:) double precision , allocatable :: vt(:,:) @@ -266,10 +263,10 @@ subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, worker_id) do call get_task_from_taskserver(zmq_to_qp_run_socket,worker_id, task_id, task) if(task_id == 0) exit - read (task,*) blockb, blocke - bs = shortcut_(blocke+1,1) - shortcut_(blockb, 1) + read (task,*) blockb, blockb2, istep + bs = shortcut_(blockb+1,1) - shortcut_(blockb, 1) do i=blockb, shortcut_(0,2), shortcut_(0,1) - do j=i, min(i+blocke-blockb, shortcut_(0,2)) + do j=i, min(i, shortcut_(0,2)) bs += shortcut_(j+1,2) - shortcut_(j, 2) end do end do @@ -280,10 +277,9 @@ subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, worker_id) allocate(st(N_states_diag, bs)) end if - call davidson_process(blockb, blocke, N, idx, vt, st, bs) - + call davidson_process(blockb, blockb2, N, idx, vt, st, bs, istep) call task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id) - call davidson_push_results(zmq_socket_push, blockb, blocke, N, idx, vt, st, task_id) + call davidson_push_results(zmq_socket_push, blockb, blockb2, N, idx, vt, st, task_id) end do end subroutine @@ -387,7 +383,7 @@ subroutine davidson_collector(zmq_to_qp_run_socket, zmq_socket_pull , v0, s0, LD integer :: msize - msize = (max_workload + max_blocksize)*2 + msize = (1 + max_blocksize)*2 allocate(idx(msize)) allocate(vt(N_states_diag, msize)) allocate(st(N_states_diag, msize)) @@ -402,7 +398,7 @@ subroutine davidson_collector(zmq_to_qp_run_socket, zmq_socket_pull , v0, s0, LD do while (more == 1) call davidson_pull_results(zmq_socket_pull, blockb, blocke, N, idx, vt, st, task_id) !DIR$ FORCEINLINE - call davidson_collect(blockb, blocke, N, idx, vt, st , v0t, s0t) + call davidson_collect(N, idx, vt, st , v0t, s0t) call zmq_delete_task(zmq_to_qp_run_socket,zmq_socket_pull,task_id,more) end do deallocate(idx,vt,st) diff --git a/src/Davidson/u0Hu0.irp.f b/src/Davidson/u0Hu0.irp.f index d1a7b8e2..09843aee 100644 --- a/src/Davidson/u0Hu0.irp.f +++ b/src/Davidson/u0Hu0.irp.f @@ -208,7 +208,8 @@ subroutine H_S2_u_0_nstates(v_0,s_0,u_0,H_jj,S2_jj,n,keys_tmp,Nint,N_st,sze_8) integer :: N_st_8 integer, external :: align_double - integer :: workload, blockb, blocke + integer :: blockb, blockb2, istep + double precision :: ave_workload, workload integer(ZMQ_PTR) :: handler @@ -234,21 +235,38 @@ subroutine H_S2_u_0_nstates(v_0,s_0,u_0,H_jj,S2_jj,n,keys_tmp,Nint,N_st,sze_8) call sort_dets_ab_v(keys_tmp, sorted, sort_idx, shortcut(0,1), version, n, Nint) call sort_dets_ba_v(keys_tmp, sorted, sort_idx, shortcut(0,2), version, n, Nint) - workload = 0 blockb = shortcut(0,1) - blocke = blockb call davidson_init(handler,n,N_st_8,ut) + + + ave_workload = 0.d0 + do sh=1,shortcut(0,1) + ave_workload += shortcut(0,1) + ave_workload += (shortcut(sh+1,1) - shortcut(sh,1))**2 + do i=sh, shortcut(0,2), shortcut(0,1) + do j=i, min(i, shortcut(0,2)) + ave_workload += (shortcut(j+1,2) - shortcut(j, 2))**2 + end do + end do + enddo + ave_workload = ave_workload/dble(shortcut(0,1)) + + + print *, 'Ave workload :', ave_workload + do sh=shortcut(0,1),1,-1 - workload += (shortcut(sh+1,1) - shortcut(sh,1))**2 - if(workload > max_workload) then - blocke = sh - call davidson_add_task(handler, blocke, blockb) - blockb = sh-1 - workload = 0 - end if + workload = shortcut(0,1)+dble(shortcut(sh+1,1) - shortcut(sh,1))**2 + do i=sh, shortcut(0,2), shortcut(0,1) + do j=i, min(i, shortcut(0,2)) + workload += (shortcut(j+1,2) - shortcut(j, 2))**2 + end do + end do + istep = 1+ int(0.5d0*workload/ave_workload) + do blockb2=0, istep-1 + call davidson_add_task(handler, sh, blockb2, istep) + enddo enddo - if(blockb > 0) call davidson_add_task(handler, 1, blockb) call davidson_run(handler, v_0, s_0, size(v_0,1)) do istate=1,N_st @@ -262,8 +280,4 @@ subroutine H_S2_u_0_nstates(v_0,s_0,u_0,H_jj,S2_jj,n,keys_tmp,Nint,N_st,sze_8) end -BEGIN_PROVIDER [ integer, max_workload ] - max_workload = 1000 -END_PROVIDER - diff --git a/src/Integrals_Bielec/map_integrals.irp.f b/src/Integrals_Bielec/map_integrals.irp.f index 28b7d2e2..afc573aa 100644 --- a/src/Integrals_Bielec/map_integrals.irp.f +++ b/src/Integrals_Bielec/map_integrals.irp.f @@ -364,7 +364,7 @@ double precision function get_mo_bielec_integral(i,j,k,l,map) integer(key_kind) :: idx type(map_type), intent(inout) :: map real(integral_kind) :: tmp - PROVIDE mo_bielec_integrals_in_map + PROVIDE mo_bielec_integrals_in_map mo_integrals_cache if ( (i >= mo_integrals_cache_min) .and. & (j >= mo_integrals_cache_min) .and. & (k >= mo_integrals_cache_min) .and. & @@ -393,7 +393,7 @@ double precision function get_mo_bielec_integral_schwartz(i,j,k,l,map) integer(key_kind) :: idx type(map_type), intent(inout) :: map real(integral_kind) :: tmp - PROVIDE mo_bielec_integrals_in_map + PROVIDE mo_bielec_integrals_in_map mo_integrals_cache if (mo_bielec_integral_schwartz(i,k)*mo_bielec_integral_schwartz(j,l) > mo_integrals_threshold) then double precision, external :: get_mo_bielec_integral !DIR$ FORCEINLINE @@ -411,7 +411,7 @@ double precision function mo_bielec_integral(i,j,k,l) END_DOC integer, intent(in) :: i,j,k,l double precision :: get_mo_bielec_integral - PROVIDE mo_bielec_integrals_in_map + PROVIDE mo_bielec_integrals_in_map mo_integrals_cache !DIR$ FORCEINLINE mo_bielec_integral = get_mo_bielec_integral(i,j,k,l,mo_integrals_map) return From fa3ffdd6966a12f95803b716c40bf37b311814c9 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 11 Oct 2016 22:46:12 +0200 Subject: [PATCH 3/3] Comments in SVD of MRCC --- plugins/MRCC_Utils/mrcc_utils.irp.f | 45 ++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/plugins/MRCC_Utils/mrcc_utils.irp.f b/plugins/MRCC_Utils/mrcc_utils.irp.f index 4139ac30..79249051 100644 --- a/plugins/MRCC_Utils/mrcc_utils.irp.f +++ b/plugins/MRCC_Utils/mrcc_utils.irp.f @@ -810,14 +810,18 @@ END_PROVIDER if(res < 1d-8) exit end do - + ! rho_mrcc now contains A.X + norm = 0.d0 do i=1,N_det_non_ref norm = norm + rho_mrcc(i,s)*rho_mrcc(i,s) enddo + ! Norm now contains the norm of A.X + do i=1,N_det_ref norm = norm + psi_ref_coef(i,s)*psi_ref_coef(i,s) enddo + ! Norm now contains the norm of Psi + A.X print *, k, "res : ", res, "norm : ", sqrt(norm) @@ -829,30 +833,46 @@ END_PROVIDER if (rho_mrcc(i,s) == 0.d0) then rho_mrcc(i,s) = 1.d-32 endif + + ! f is such that f.\tilde{c_i} = c_i f = psi_non_ref_coef(i,s) / rho_mrcc(i,s) + ! Avoid numerical instabilities f = min(f,2.d0) f = max(f,-2.d0) + norm = norm + f*f *rho_mrcc(i,s)*rho_mrcc(i,s) rho_mrcc(i,s) = f enddo + ! norm now contains the norm of |T.Psi_0> + ! rho_mrcc now contains the f factors f = 1.d0/norm - norm = 0.d0 - do i=1,N_det_non_ref - norm = norm + psi_non_ref_coef(i,s)*psi_non_ref_coef(i,s) - enddo - f = dsqrt(f*norm) + ! f now contains 1/ - print *, 'norm of |T Psi_0> = ', norm*f*f + norm = 1.d0 + do i=1,N_det_ref + norm = norm - psi_ref_coef(i,s)*psi_ref_coef(i,s) + enddo + ! norm now contains + f = dsqrt(f*norm) + ! f normalises T.Psi_0 such that (1+T)|Psi> is normalized + + norm = norm*f + print *, 'norm of |T Psi_0> = ', dsqrt(norm) + + do i=1,N_det_ref + norm = norm + psi_ref_coef(i,s)*psi_ref_coef(i,s) + enddo do i=1,N_det_non_ref rho_mrcc(i,s) = rho_mrcc(i,s) * f enddo + ! rho_mrcc now contains the product of the scaling factors and the + ! normalization constant end do - print *, "done" END_PROVIDER @@ -860,13 +880,17 @@ BEGIN_PROVIDER [ double precision, dij, (N_det_ref, N_det_non_ref, N_states) ] integer :: s,i,j double precision, external :: get_dij_index print *, "computing amplitudes..." + !$OMP PARALLEL DEFAULT(shared) PRIVATE(s,i,j) do s=1, N_states + !$OMP DO do i=1, N_det_non_ref do j=1, N_det_ref dij(j, i, s) = get_dij_index(j, i, s, N_int) end do end do + !$OMP END DO end do + !$OMP END PARALLEL print *, "done computing amplitudes" END_PROVIDER @@ -879,10 +903,9 @@ double precision function get_dij_index(II, i, s, Nint) double precision :: HIi, phase if(lambda_type == 0) then - get_dij_index = get_dij(psi_ref(1,1,II), psi_non_ref(1,1,i), s, Nint) -! get_dij_index = get_dij_index * rho_mrcc(i,s) call get_phase(psi_ref(1,1,II), psi_non_ref(1,1,i), phase, N_int) - get_dij_index = get_dij_index * rho_mrcc(i,s) * phase + get_dij_index = get_dij(psi_ref(1,1,II), psi_non_ref(1,1,i), s, Nint) * phase + get_dij_index = get_dij_index * rho_mrcc(i,s) else call i_h_j(psi_ref(1,1,II), psi_non_ref(1,1,i), Nint, HIi) get_dij_index = HIi * lambda_mrcc(s, i)