diff --git a/plugins/CAS_SD_ZMQ/selection.irp.f b/plugins/CAS_SD_ZMQ/selection.irp.f index a0eb5efd..cca17d71 100644 --- a/plugins/CAS_SD_ZMQ/selection.irp.f +++ b/plugins/CAS_SD_ZMQ/selection.irp.f @@ -635,20 +635,20 @@ subroutine fill_buffer_double(i_generator, sp, h1, h2, bannedOrb, banned, fock_d use selection_types implicit none - integer, intent(in) :: i_generator, sp, h1, h2 - double precision, intent(in) :: mat(N_states, mo_tot_num, mo_tot_num) - logical, intent(in) :: bannedOrb(mo_tot_num, 2), banned(mo_tot_num, mo_tot_num) - double precision, intent(in) :: fock_diag_tmp(mo_tot_num) - double precision, intent(in) :: E0(N_states) - double precision, intent(inout) :: pt2(N_states) + integer, intent(in) :: i_generator, sp, h1, h2 + double precision, intent(in) :: mat(N_states, mo_tot_num, mo_tot_num) + logical, intent(in) :: bannedOrb(mo_tot_num, 2), banned(mo_tot_num, mo_tot_num) + double precision, intent(in) :: fock_diag_tmp(mo_tot_num) + double precision, intent(in) :: E0(N_states) + double precision, intent(inout) :: pt2(N_states) type(selection_buffer), intent(inout) :: buf - logical :: ok - integer :: s1, s2, p1, p2, ib, j, istate - integer(bit_kind) :: mask(N_int, 2), det(N_int, 2) - double precision :: e_pert, delta_E, val, Hii, max_e_pert,tmp - double precision, external :: diag_H_mat_elem_fock + logical :: ok + integer :: s1, s2, p1, p2, ib, j, istate + integer(bit_kind) :: mask(N_int, 2), det(N_int, 2) + double precision :: e_pert, delta_E, val, Hii, max_e_pert,tmp + double precision, external :: diag_H_mat_elem_fock - logical, external :: detEq + logical, external :: detEq if(sp == 3) then @@ -670,18 +670,20 @@ subroutine fill_buffer_double(i_generator, sp, h1, h2, bannedOrb, banned, fock_d if(banned(p1,p2)) cycle if(mat(1, p1, p2) == 0d0) cycle call apply_particles(mask, s1, p1, s2, p2, det, ok, N_int) -logical, external :: is_in_wavefunction -if (is_in_wavefunction(det,N_int)) then - cycle -endif + if (do_ddci) then + logical, external :: is_a_two_holes_two_particles + if (is_a_two_holes_two_particles(det)) then + cycle + endif + endif Hii = diag_H_mat_elem_fock(psi_det_generators(1,1,i_generator),det,fock_diag_tmp,N_int) max_e_pert = 0d0 do istate=1,N_states delta_E = E0(istate) - Hii - val = mat(istate, p1, p2) + mat(istate, p1, p2) + val = mat(istate, p1, p2) + mat(istate, p1, p2) tmp = dsqrt(delta_E * delta_E + val * val) if (delta_E < 0.d0) then tmp = -tmp @@ -1205,3 +1207,127 @@ subroutine spot_isinwf(mask, det, i_gen, N, banned, fullMatch, interesting) end do genl end subroutine + +subroutine ZMQ_selection(N_in, pt2) + use f77_zmq + use selection_types + + implicit none + + integer(ZMQ_PTR) :: zmq_to_qp_run_socket + integer, intent(in) :: N_in + type(selection_buffer) :: b + integer :: i, N + integer, external :: omp_get_thread_num + double precision, intent(out) :: pt2(N_states) + integer, parameter :: maxtasks=10000 + + + N = max(N_in,1) + if (.True.) then + PROVIDE pt2_e0_denominator + provide nproc + call new_parallel_job(zmq_to_qp_run_socket,"selection") + call zmq_put_psi(zmq_to_qp_run_socket,1,pt2_e0_denominator,size(pt2_e0_denominator)) + call create_selection_buffer(N, N*2, b) + endif + + character*(20*maxtasks) :: task + task = ' ' + + integer :: k + k=0 + do i= 1, N_det_generators + k = k+1 + write(task(20*(k-1)+1:20*k),'(I9,1X,I9,''|'')') i, N + if (k>=maxtasks) then + k=0 + call add_task_to_taskserver(zmq_to_qp_run_socket,task) + endif + enddo + if (k > 0) then + call add_task_to_taskserver(zmq_to_qp_run_socket,task) + endif + call zmq_set_running(zmq_to_qp_run_socket) + + !$OMP PARALLEL DEFAULT(shared) SHARED(b, pt2) PRIVATE(i) NUM_THREADS(nproc+1) + i = omp_get_thread_num() + if (i==0) then + call selection_collector(b, pt2) + else + call selection_slave_inproc(i) + endif + !$OMP END PARALLEL + call end_parallel_job(zmq_to_qp_run_socket, 'selection') + if (N_in > 0) then + call fill_H_apply_buffer_no_selection(b%cur,b%det,N_int,0) !!! PAS DE ROBIN + call copy_H_apply_buffer_to_wf() + if (s2_eig) then + call make_s2_eigenfunction + endif + call save_wavefunction + endif +end subroutine + + +subroutine selection_slave_inproc(i) + implicit none + integer, intent(in) :: i + + call run_selection_slave(1,i,pt2_e0_denominator) +end + +subroutine selection_collector(b, pt2) + use f77_zmq + use selection_types + use bitmasks + implicit none + + + type(selection_buffer), intent(inout) :: b + double precision, intent(out) :: pt2(N_states) + double precision :: pt2_mwen(N_states) + integer(ZMQ_PTR),external :: new_zmq_to_qp_run_socket + integer(ZMQ_PTR) :: zmq_to_qp_run_socket + + integer(ZMQ_PTR), external :: new_zmq_pull_socket + integer(ZMQ_PTR) :: zmq_socket_pull + + integer :: msg_size, rc, more + integer :: acc, i, j, robin, N, ntask + double precision, allocatable :: val(:) + integer(bit_kind), allocatable :: det(:,:,:) + integer, allocatable :: task_id(:) + integer :: done + real :: time, time0 + zmq_to_qp_run_socket = new_zmq_to_qp_run_socket() + zmq_socket_pull = new_zmq_pull_socket() + allocate(val(b%N), det(N_int, 2, b%N), task_id(N_det)) + done = 0 + more = 1 + pt2(:) = 0d0 + call CPU_TIME(time0) + do while (more == 1) + call pull_selection_results(zmq_socket_pull, pt2_mwen, val(1), det(1,1,1), N, task_id, ntask) + pt2 += pt2_mwen + do i=1, N + call add_to_selection_buffer(b, det(1,1,i), val(i)) + end do + + do i=1, ntask + if(task_id(i) == 0) then + print *, "Error in collector" + endif + call zmq_delete_task(zmq_to_qp_run_socket,zmq_socket_pull,task_id(i),more) + end do + done += ntask + call CPU_TIME(time) +! print *, "DONE" , done, time - time0 + end do + + + call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket) + call end_zmq_pull_socket(zmq_socket_pull) + call sort_selection_buffer(b) +end subroutine + diff --git a/plugins/Full_CI_ZMQ/pt2_stoch_routines.irp.f b/plugins/Full_CI_ZMQ/pt2_stoch_routines.irp.f new file mode 100644 index 00000000..d78a9705 --- /dev/null +++ b/plugins/Full_CI_ZMQ/pt2_stoch_routines.irp.f @@ -0,0 +1,580 @@ +BEGIN_PROVIDER [ integer, fragment_first ] + implicit none + fragment_first = first_det_of_teeth(1) +END_PROVIDER + +subroutine ZMQ_pt2(pt2,relative_error) + use f77_zmq + use selection_types + + implicit none + + character(len=64000) :: task + integer(ZMQ_PTR) :: zmq_to_qp_run_socket, zmq_to_qp_run_socket2 + type(selection_buffer) :: b + integer, external :: omp_get_thread_num + double precision, intent(in) :: relative_error + double precision, intent(out) :: pt2(N_states) + + + double precision, allocatable :: pt2_detail(:,:), comb(:) + logical, allocatable :: computed(:) + integer, allocatable :: tbc(:) + integer :: i, j, k, Ncomb, generator_per_task, i_generator_end + integer, external :: pt2_find + + double precision :: sumabove(comb_teeth), sum2above(comb_teeth), Nabove(comb_teeth) + double precision, external :: omp_get_wtime + double precision :: time0, time + + allocate(pt2_detail(N_states, N_det_generators), comb(N_det_generators/2), computed(N_det_generators), tbc(0:size_tbc)) + sumabove = 0d0 + sum2above = 0d0 + Nabove = 0d0 + + provide nproc fragment_first fragment_count mo_bielec_integrals_in_map mo_mono_elec_integral pt2_weight + + !call random_seed() + + computed = .false. + + tbc(0) = first_det_of_comb - 1 + do i=1, tbc(0) + tbc(i) = i + computed(i) = .true. + end do + + pt2_detail = 0d0 + time0 = omp_get_wtime() + print *, "time - avg - err - n_combs" + generator_per_task = 1 + do while(.true.) + + call write_time(6) + call new_parallel_job(zmq_to_qp_run_socket,"pt2") + call zmq_put_psi(zmq_to_qp_run_socket,1,pt2_e0_denominator,size(pt2_e0_denominator)) + call create_selection_buffer(1, 1*2, b) + + Ncomb=size(comb) + call get_carlo_workbatch(computed, comb, Ncomb, tbc) + + call write_time(6) + + + integer(ZMQ_PTR), external :: new_zmq_to_qp_run_socket + integer :: ipos + logical :: tasks + tasks = .False. + ipos=1 + + do i=1,tbc(0) + if(tbc(i) > fragment_first) then + write(task(ipos:ipos+20),'(I9,1X,I9,''|'')') 0, tbc(i) + ipos += 20 + if (ipos > 63980) then + call add_task_to_taskserver(zmq_to_qp_run_socket,trim(task(1:ipos-20))) + ipos=1 + tasks = .True. + endif + else + do j=1,fragment_count + write(task(ipos:ipos+20),'(I9,1X,I9,''|'')') j, tbc(i) + ipos += 20 + if (ipos > 63980) then + call add_task_to_taskserver(zmq_to_qp_run_socket,trim(task(1:ipos-20))) + ipos=1 + tasks = .True. + endif + end do + end if + end do + if (ipos > 1) then + call add_task_to_taskserver(zmq_to_qp_run_socket,trim(task(1:ipos-20))) + tasks = .True. + endif + + if (tasks) then + call zmq_set_running(zmq_to_qp_run_socket) + + !$OMP PARALLEL DEFAULT(shared) NUM_THREADS(nproc+1) & + !$OMP PRIVATE(i) + i = omp_get_thread_num() + if (i==0) then + call pt2_collector(b, tbc, comb, Ncomb, computed, pt2_detail, sumabove, sum2above, Nabove, relative_error, pt2) + else + call pt2_slave_inproc(i) + endif + !$OMP END PARALLEL + call end_parallel_job(zmq_to_qp_run_socket, 'pt2') + + else + pt2 = 0.d0 + do i=1,N_det_generators + do k=1,N_states + pt2(k) = pt2(k) + pt2_detail(k,i) + enddo + enddo + endif + + tbc(0) = 0 + if (pt2(1) /= 0.d0) then + exit + endif + end do + + deallocate(pt2_detail, comb, computed, tbc) + +end subroutine + + +subroutine do_carlo(tbc, Ncomb, comb, pt2_detail, computed, sumabove, sum2above, Nabove) + integer, intent(in) :: tbc(0:size_tbc), Ncomb + logical, intent(in) :: computed(N_det_generators) + double precision, intent(in) :: comb(Ncomb), pt2_detail(N_states, N_det_generators) + double precision, intent(inout) :: sumabove(comb_teeth), sum2above(comb_teeth), Nabove(comb_teeth) + integer :: i, dets(comb_teeth) + double precision :: myVal, myVal2 + + mainLoop : do i=1,Ncomb + call get_comb(comb(i), dets, comb_teeth) + do j=1,comb_teeth + if(.not.(computed(dets(j)))) then + exit mainLoop + end if + end do + + myVal = 0d0 + myVal2 = 0d0 + do j=comb_teeth,1,-1 + myVal += pt2_detail(1, dets(j)) * pt2_weight_inv(dets(j)) * comb_step + sumabove(j) += myVal + sum2above(j) += myVal*myVal + Nabove(j) += 1 + end do + end do mainLoop +end subroutine + + +subroutine pt2_slave_inproc(i) + implicit none + integer, intent(in) :: i + + call run_pt2_slave(1,i,pt2_e0_denominator) +end + +subroutine pt2_collector(b, tbc, comb, Ncomb, computed, pt2_detail, sumabove, sum2above, Nabove, relative_error, pt2) + use f77_zmq + use selection_types + use bitmasks + implicit none + + + integer, intent(in) :: Ncomb + double precision, intent(inout) :: pt2_detail(N_states, N_det_generators) + double precision, intent(in) :: comb(Ncomb), relative_error + logical, intent(inout) :: computed(N_det_generators) + integer, intent(in) :: tbc(0:size_tbc) + double precision, intent(inout) :: sumabove(comb_teeth), sum2above(comb_teeth), Nabove(comb_teeth) + double precision, intent(out) :: pt2(N_states) + + + type(selection_buffer), intent(inout) :: b + double precision, allocatable :: pt2_mwen(:,:) + integer(ZMQ_PTR),external :: new_zmq_to_qp_run_socket + integer(ZMQ_PTR) :: zmq_to_qp_run_socket + + integer(ZMQ_PTR), external :: new_zmq_pull_socket + integer(ZMQ_PTR) :: zmq_socket_pull + + integer :: msg_size, rc, more + integer :: acc, i, j, robin, N, ntask + double precision, allocatable :: val(:) + integer(bit_kind), allocatable :: det(:,:,:) + integer, allocatable :: task_id(:) + integer :: done, Nindex + integer, allocatable :: index(:) + double precision, save :: time0 = -1.d0 + double precision :: time, timeLast + double precision, external :: omp_get_wtime + integer :: tooth, firstTBDcomb, orgTBDcomb + integer, allocatable :: parts_to_get(:) + logical, allocatable :: actually_computed(:) + + allocate(actually_computed(N_det_generators), parts_to_get(N_det_generators), & + pt2_mwen(N_states, N_det_generators) ) + do i=1,N_det_generators + actually_computed(i) = computed(i) + enddo + + parts_to_get(:) = 1 + if(fragment_first > 0) then + do i=1,fragment_first + parts_to_get(i) = fragment_count + enddo + endif + + do i=1,tbc(0) + actually_computed(tbc(i)) = .false. + end do + + orgTBDcomb = Nabove(1) + firstTBDcomb = 1 + + zmq_to_qp_run_socket = new_zmq_to_qp_run_socket() + zmq_socket_pull = new_zmq_pull_socket() + allocate(val(b%N), det(N_int, 2, b%N), task_id(N_det_generators), index(1)) + more = 1 + if (time0 < 0.d0) then + time0 = omp_get_wtime() + endif + timeLast = time0 + + print *, 'N_deterministic = ', first_det_of_teeth(1)-1 + pullLoop : do while (more == 1) + call pull_pt2_results(zmq_socket_pull, Nindex, index, pt2_mwen, task_id, ntask) + do i=1,Nindex + pt2_detail(1:N_states, index(i)) += pt2_mwen(1:N_states,i) + parts_to_get(index(i)) -= 1 + if(parts_to_get(index(i)) < 0) then + print *, i, index(i), parts_to_get(index(i)), Nindex + print *, "PARTS ??" + print *, parts_to_get + stop "PARTS ??" + end if + if(parts_to_get(index(i)) == 0) actually_computed(index(i)) = .true. + end do + + do i=1, ntask + if(task_id(i) == 0) then + print *, "Error in collector" + endif + call zmq_delete_task(zmq_to_qp_run_socket,zmq_socket_pull,task_id(i),more) + end do + + time = omp_get_wtime() + + if(time - timeLast > 1d1 .or. more /= 1) then + timeLast = time + do i=1, first_det_of_teeth(1)-1 + if(.not.(actually_computed(i))) then + print *, "PT2 : deterministic part not finished" + cycle pullLoop + end if + end do + + double precision :: E0, avg, eqt, prop + call do_carlo(tbc, Ncomb+1-firstTBDcomb, comb(firstTBDcomb), pt2_detail, actually_computed, sumabove, sum2above, Nabove) + firstTBDcomb = Nabove(1) - orgTBDcomb + 1 + if(Nabove(1) < 2d0) cycle + call get_first_tooth(actually_computed, tooth) + + done = 0 + do i=first_det_of_teeth(tooth), first_det_of_teeth(tooth+1)-1 + if(actually_computed(i)) done = done + 1 + end do + + E0 = sum(pt2_detail(1,:first_det_of_teeth(tooth)-1)) + prop = ((1d0 - dfloat(comb_teeth - tooth + 1) * comb_step) - pt2_cweight(first_det_of_teeth(tooth)-1)) + prop = prop * pt2_weight_inv(first_det_of_teeth(tooth)) + E0 += pt2_detail(1,first_det_of_teeth(tooth)) * prop + avg = E0 + (sumabove(tooth) / Nabove(tooth)) + eqt = sqrt(1d0 / (Nabove(tooth)-1) * abs(sum2above(tooth) / Nabove(tooth) - (sumabove(tooth)/Nabove(tooth))**2)) + time = omp_get_wtime() + if (dabs(eqt/avg) < relative_error) then + pt2(1) = avg +! exit pullLoop + else + print "(4(G22.13), 4(I9))", time - time0, avg, eqt, Nabove(tooth), tooth, first_det_of_teeth(tooth)-1, done, first_det_of_teeth(tooth+1)-first_det_of_teeth(tooth) + endif + end if + end do pullLoop + + call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket) + call end_zmq_pull_socket(zmq_socket_pull) + call sort_selection_buffer(b) +end subroutine + +integer function pt2_find(v, w, sze, imin, imax) + implicit none + integer, intent(in) :: sze, imin, imax + double precision, intent(in) :: v, w(sze) + integer :: i,l,h + integer, parameter :: block=64 + + l = imin + h = imax-1 + + do while(h-l >= block) + i = ishft(h+l,-1) + if(w(i+1) > v) then + h = i-1 + else + l = i+1 + end if + end do + !DIR$ LOOP COUNT (64) + do pt2_find=l,h + if(w(pt2_find) >= v) then + exit + end if + end do +end function + + +BEGIN_PROVIDER [ integer, comb_teeth ] + implicit none + comb_teeth = 100 +END_PROVIDER + + + +subroutine get_first_tooth(computed, first_teeth) + implicit none + logical, intent(in) :: computed(N_det_generators) + integer, intent(out) :: first_teeth + integer :: i, first_det + + first_det = 1 + first_teeth = 1 + do i=first_det_of_comb, N_det_generators + if(.not.(computed(i))) then + first_det = i + exit + end if + end do + + do i=comb_teeth, 1, -1 + if(first_det_of_teeth(i) < first_det) then + first_teeth = i + exit + end if + end do + +end subroutine + + +subroutine get_last_full_tooth(computed, last_tooth) + implicit none + logical, intent(in) :: computed(N_det_generators) + integer, intent(out) :: last_tooth + integer :: i, j, missing + + last_tooth = 0 + combLoop : do i=comb_teeth, 1, -1 + missing = 1+ ishft(first_det_of_teeth(i+1)-first_det_of_teeth(i),-12) ! /4096 + do j=first_det_of_teeth(i), first_det_of_teeth(i+1)-1 + if(.not.computed(j)) then + missing -= 1 + if(missing < 0) cycle combLoop + end if + end do + last_tooth = i + exit + end do combLoop +end subroutine + + +BEGIN_PROVIDER [ integer, size_tbc ] + implicit none + BEGIN_DOC +! Size of the tbc array + END_DOC + size_tbc = N_det_generators + fragment_count*fragment_first +END_PROVIDER + +subroutine get_carlo_workbatch(computed, comb, Ncomb, tbc) + implicit none + integer, intent(inout) :: Ncomb + double precision, intent(out) :: comb(Ncomb) + integer, intent(inout) :: tbc(0:size_tbc) + logical, intent(inout) :: computed(N_det_generators) + integer :: i, j, last_full, dets(comb_teeth), tbc_save + integer :: icount, n + n = tbc(0) + icount = 0 + call RANDOM_NUMBER(comb) + do i=1,size(comb) + comb(i) = comb(i) * comb_step + tbc_save = tbc(0) + !DIR$ FORCEINLINE + call add_comb(comb(i), computed, tbc, size_tbc, comb_teeth) + if (tbc(0) < size(tbc)) then + Ncomb = i + else + tbc(0) = tbc_save + return + endif + icount = icount + tbc(0) - tbc_save + if ((i>1000).and.(icount > n)) then + call get_filling_teeth(computed, tbc) + icount = 0 + n = ishft(tbc_save,-4) + endif + enddo + call get_filling_teeth(computed, tbc) + +end subroutine + + +subroutine get_filling_teeth(computed, tbc) + implicit none + integer, intent(inout) :: tbc(0:size_tbc) + logical, intent(inout) :: computed(N_det_generators) + integer :: i, j, k, last_full, dets(comb_teeth) + + call get_last_full_tooth(computed, last_full) + if(last_full /= 0) then + if (tbc(0) > size(tbc) - first_det_of_teeth(last_full+1) -2) then + return + endif + k = tbc(0)+1 + do j=1,first_det_of_teeth(last_full+1)-1 + if(.not.(computed(j))) then + tbc(k) = j + k=k+1 + computed(j) = .true. + end if + end do + tbc(0) = k-1 + end if + +end subroutine + + +subroutine reorder_tbc(tbc) + implicit none + integer, intent(inout) :: tbc(0:size_tbc) + logical, allocatable :: ltbc(:) + integer :: i, ci + + allocate(ltbc(size_tbc)) + ltbc(:) = .false. + do i=1,tbc(0) + ltbc(tbc(i)) = .true. + end do + + ci = 0 + do i=1,size_tbc + if(ltbc(i)) then + ci = ci+1 + tbc(ci) = i + end if + end do +end subroutine + + +subroutine get_comb(stato, dets, ct) + implicit none + integer, intent(in) :: ct + double precision, intent(in) :: stato + integer, intent(out) :: dets(ct) + double precision :: curs + integer :: j + integer, external :: pt2_find + + curs = 1d0 - stato + do j = comb_teeth, 1, -1 + !DIR$ FORCEINLINE + dets(j) = pt2_find(curs, pt2_cweight,size(pt2_cweight), first_det_of_teeth(j), first_det_of_teeth(j+1)) + curs -= comb_step + end do +end subroutine + + +subroutine add_comb(comb, computed, tbc, stbc, ct) + implicit none + integer, intent(in) :: stbc, ct + double precision, intent(in) :: comb + logical, intent(inout) :: computed(N_det_generators) + integer, intent(inout) :: tbc(0:stbc) + integer :: i, k, l, dets(ct) + + !DIR$ FORCEINLINE + call get_comb(comb, dets, ct) + + k=tbc(0)+1 + do i = 1, ct + l = dets(i) + if(.not.(computed(l))) then + tbc(k) = l + k = k+1 + computed(l) = .true. + end if + end do + tbc(0) = k-1 +end subroutine + + + + BEGIN_PROVIDER [ double precision, pt2_weight, (N_det_generators) ] +&BEGIN_PROVIDER [ double precision, pt2_cweight, (N_det_generators) ] +&BEGIN_PROVIDER [ double precision, pt2_cweight_cache, (N_det_generators) ] +&BEGIN_PROVIDER [ double precision, comb_step ] +&BEGIN_PROVIDER [ integer, first_det_of_teeth, (comb_teeth+1) ] +&BEGIN_PROVIDER [ integer, first_det_of_comb ] + implicit none + integer :: i + double precision :: norm_left, stato + integer, external :: pt2_find + + pt2_weight(1) = psi_coef_generators(1,1)**2 + pt2_cweight(1) = psi_coef_generators(1,1)**2 + + do i=2,N_det_generators + pt2_weight(i) = psi_coef_generators(i,1)**2 + pt2_cweight(i) = pt2_cweight(i-1) + psi_coef_generators(i,1)**2 + end do + + do i=1,N_det_generators + pt2_weight(i) = pt2_weight(i) / pt2_cweight(N_det_generators) + pt2_cweight(i) = pt2_cweight(i) / pt2_cweight(N_det_generators) + enddo + + norm_left = 1d0 + + comb_step = 1d0/dfloat(comb_teeth) + first_det_of_comb = 1 + do i=1,N_det_generators + if(pt2_weight(i)/norm_left < comb_step*.5d0) then + first_det_of_comb = i + exit + end if + norm_left -= pt2_weight(i) + end do + + comb_step = (1d0 - pt2_cweight(first_det_of_comb-1)) * comb_step + + stato = 1d0 - comb_step + iloc = N_det_generators + do i=comb_teeth, 1, -1 + integer :: iloc + iloc = pt2_find(stato, pt2_cweight, N_det_generators, 1, iloc) + first_det_of_teeth(i) = iloc + stato -= comb_step + end do + first_det_of_teeth(comb_teeth+1) = N_det_generators + 1 + first_det_of_teeth(1) = first_det_of_comb + if(first_det_of_teeth(1) /= first_det_of_comb) then + print *, 'Error in ', irp_here + stop "comb provider" + endif + +END_PROVIDER + +BEGIN_PROVIDER [ double precision, pt2_weight_inv, (N_det_generators) ] + implicit none + BEGIN_DOC +! Inverse of pt2_weight array + END_DOC + integer :: i + do i=1,N_det_generators + pt2_weight_inv(i) = 1.d0/pt2_weight(i) + enddo + +END_PROVIDER + + + + + + diff --git a/plugins/Full_CI_ZMQ/run_selection_slave.irp.f b/plugins/Full_CI_ZMQ/run_selection_slave.irp.f index dfaee629..50e44901 100644 --- a/plugins/Full_CI_ZMQ/run_selection_slave.irp.f +++ b/plugins/Full_CI_ZMQ/run_selection_slave.irp.f @@ -46,7 +46,7 @@ subroutine run_selection_slave(thread,iproc,energy) if(buf%N == 0) then ! Only first time call create_selection_buffer(N, N*2, buf) - call create_selection_buffer(N, N*3, buf2) + call create_selection_buffer(N, N*2, buf2) else if(N /= buf%N) stop "N changed... wtf man??" end if @@ -66,8 +66,10 @@ subroutine run_selection_slave(thread,iproc,energy) call push_selection_results(zmq_socket_push, pt2, buf, task_id(1), ctask) do i=1,buf%cur call add_to_selection_buffer(buf2, buf%det(1,1,i), buf%val(i)) + if (buf2%cur == buf2%N) then + call sort_selection_buffer(buf2) + endif enddo - call sort_selection_buffer(buf2) buf%mini = buf2%mini pt2 = 0d0 buf%cur = 0 diff --git a/plugins/Full_CI_ZMQ/selection_buffer.irp.f b/plugins/Full_CI_ZMQ/selection_buffer.irp.f index 2bcb11d3..8a067357 100644 --- a/plugins/Full_CI_ZMQ/selection_buffer.irp.f +++ b/plugins/Full_CI_ZMQ/selection_buffer.irp.f @@ -27,7 +27,7 @@ subroutine add_to_selection_buffer(b, det, val) if(dabs(val) >= b%mini) then b%cur += 1 - b%det(:,:,b%cur) = det(:,:) + b%det(1:N_int,1:2,b%cur) = det(1:N_int,1:2) b%val(b%cur) = val if(b%cur == size(b%val)) then call sort_selection_buffer(b) @@ -41,29 +41,32 @@ subroutine sort_selection_buffer(b) implicit none type(selection_buffer), intent(inout) :: b - double precision, allocatable :: vals(:), absval(:) + double precision, allocatable:: absval(:) integer, allocatable :: iorder(:) - integer(bit_kind), allocatable :: detmp(:,:,:) + double precision, pointer :: vals(:) + integer(bit_kind), pointer :: detmp(:,:,:) integer :: i, nmwen logical, external :: detEq nmwen = min(b%N, b%cur) - allocate(iorder(b%cur), detmp(N_int, 2, nmwen), absval(b%cur), vals(nmwen)) + allocate(iorder(b%cur), detmp(N_int, 2, size(b%det,3)), absval(b%cur), vals(size(b%val))) absval = -dabs(b%val(:b%cur)) do i=1,b%cur iorder(i) = i end do call dsort(absval, iorder, b%cur) - do i=1, nmwen - detmp(:,:,i) = b%det(:,:,iorder(i)) + detmp(1:N_int,1,i) = b%det(1:N_int,1,iorder(i)) + detmp(1:N_int,2,i) = b%det(1:N_int,2,iorder(i)) vals(i) = b%val(iorder(i)) end do - b%det(:,:,:nmwen) = detmp(:,:,:) - b%det(:,:,nmwen+1:) = 0_bit_kind - b%val(:nmwen) = vals(:) - b%val(nmwen+1:) = 0d0 + do i=nmwen+1, size(vals) + vals(i) = 0.d0 + enddo + deallocate(b%det, b%val) + b%det => detmp + b%val => vals b%mini = max(b%mini,dabs(b%val(b%N))) b%cur = nmwen end subroutine diff --git a/plugins/Full_CI_ZMQ/selection_davidson_slave.irp.f b/plugins/Full_CI_ZMQ/selection_davidson_slave.irp.f index d6204cc3..306320f7 100644 --- a/plugins/Full_CI_ZMQ/selection_davidson_slave.irp.f +++ b/plugins/Full_CI_ZMQ/selection_davidson_slave.irp.f @@ -12,8 +12,8 @@ program selection_slave end subroutine provide_everything - PROVIDE H_apply_buffer_allocated mo_bielec_integrals_in_map psi_det_generators psi_coef_generators psi_det_sorted_bit psi_selectors n_det_generators n_states generators_bitmask zmq_context mo_mono_elec_integral -! PROVIDE pt2_e0_denominator mo_tot_num N_int + PROVIDE H_apply_buffer_allocated mo_bielec_integrals_in_map psi_det_generators psi_coef_generators psi_det_sorted_bit psi_selectors n_det_generators n_states generators_bitmask zmq_context + PROVIDE pt2_e0_denominator mo_tot_num N_int fragment_count end subroutine run_wf @@ -23,7 +23,7 @@ subroutine run_wf integer(ZMQ_PTR), external :: new_zmq_to_qp_run_socket integer(ZMQ_PTR) :: zmq_to_qp_run_socket double precision :: energy(N_states) - character*(64) :: states(2) + character*(64) :: states(4) integer :: rc, i call provide_everything @@ -31,6 +31,7 @@ subroutine run_wf zmq_context = f77_zmq_ctx_new () states(1) = 'selection' states(2) = 'davidson' + states(3) = 'pt2' zmq_to_qp_run_socket = new_zmq_to_qp_run_socket() @@ -52,7 +53,7 @@ subroutine run_wf !$OMP PARALLEL PRIVATE(i) i = omp_get_thread_num() - call selection_slave_tcp(i, energy) + call run_selection_slave(0,i,energy) !$OMP END PARALLEL print *, 'Selection done' @@ -62,46 +63,29 @@ subroutine run_wf ! -------- print *, 'Davidson' - call davidson_miniserver_get() + call davidson_slave_tcp(i) + print *, 'Davidson done' + + else if (trim(zmq_state) == 'pt2') then + + ! PT2 + ! --- + + print *, 'PT2' + call zmq_get_psi(zmq_to_qp_run_socket,1,energy,N_states) + + logical :: lstop + lstop = .False. !$OMP PARALLEL PRIVATE(i) i = omp_get_thread_num() - call davidson_slave_tcp(i) + call run_pt2_slave(0,i,energy,lstop) !$OMP END PARALLEL - print *, 'Davidson done' + print *, 'PT2 done' endif end do end -subroutine update_energy(energy) - implicit none - double precision, intent(in) :: energy(N_states) - BEGIN_DOC -! Update energy when it is received from ZMQ - END_DOC - integer :: j,k - do j=1,N_states - do k=1,N_det - CI_eigenvectors(k,j) = psi_coef(k,j) - enddo - enddo - call u_0_S2_u_0(CI_eigenvectors_s2,CI_eigenvectors,N_det,psi_det,N_int) - if (.True.) then - do k=1,N_states - ci_electronic_energy(k) = energy(k) - enddo - TOUCH ci_electronic_energy CI_eigenvectors_s2 CI_eigenvectors - endif - call write_double(6,ci_energy,'Energy') -end - -subroutine selection_slave_tcp(i,energy) - implicit none - double precision, intent(in) :: energy(N_states) - integer, intent(in) :: i - - call run_selection_slave(0,i,energy) -end diff --git a/plugins/Full_CI_ZMQ/zmq_selection.irp.f b/plugins/Full_CI_ZMQ/zmq_selection.irp.f new file mode 100644 index 00000000..7ffb4a44 --- /dev/null +++ b/plugins/Full_CI_ZMQ/zmq_selection.irp.f @@ -0,0 +1,126 @@ +subroutine ZMQ_selection(N_in, pt2) + use f77_zmq + use selection_types + + implicit none + + integer(ZMQ_PTR) :: zmq_to_qp_run_socket + integer, intent(in) :: N_in + type(selection_buffer) :: b + integer :: i, N + integer, external :: omp_get_thread_num + double precision, intent(out) :: pt2(N_states) + integer, parameter :: maxtasks=10000 + + + PROVIDE fragment_count + + N = max(N_in,1) + if (.True.) then + PROVIDE pt2_e0_denominator + provide nproc + call new_parallel_job(zmq_to_qp_run_socket,"selection") + call zmq_put_psi(zmq_to_qp_run_socket,1,pt2_e0_denominator,size(pt2_e0_denominator)) + call create_selection_buffer(N, N*2, b) + endif + + character*(20*maxtasks) :: task + task = ' ' + + integer :: k + k=0 + do i= 1, N_det_generators + k = k+1 + write(task(20*(k-1)+1:20*k),'(I9,1X,I9,''|'')') i, N + if (k>=maxtasks) then + k=0 + call add_task_to_taskserver(zmq_to_qp_run_socket,task) + endif + end do + if (k > 0) then + call add_task_to_taskserver(zmq_to_qp_run_socket,task) + endif + call zmq_set_running(zmq_to_qp_run_socket) + + !$OMP PARALLEL DEFAULT(shared) SHARED(b, pt2) PRIVATE(i) NUM_THREADS(nproc+1) + i = omp_get_thread_num() + if (i==0) then + call selection_collector(b, pt2) + else + call selection_slave_inproc(i) + endif + !$OMP END PARALLEL + call end_parallel_job(zmq_to_qp_run_socket, 'selection') + if (N_in > 0) then + call fill_H_apply_buffer_no_selection(b%cur,b%det,N_int,0) + call copy_H_apply_buffer_to_wf() + if (s2_eig) then + call make_s2_eigenfunction + endif + call save_wavefunction + endif + +end subroutine + + +subroutine selection_slave_inproc(i) + implicit none + integer, intent(in) :: i + + call run_selection_slave(1,i,pt2_e0_denominator) +end + +subroutine selection_collector(b, pt2) + use f77_zmq + use selection_types + use bitmasks + implicit none + + + type(selection_buffer), intent(inout) :: b + double precision, intent(out) :: pt2(N_states) + double precision :: pt2_mwen(N_states) + integer(ZMQ_PTR),external :: new_zmq_to_qp_run_socket + integer(ZMQ_PTR) :: zmq_to_qp_run_socket + + integer(ZMQ_PTR), external :: new_zmq_pull_socket + integer(ZMQ_PTR) :: zmq_socket_pull + + integer :: msg_size, rc, more + integer :: acc, i, j, robin, N, ntask + double precision, allocatable :: val(:) + integer(bit_kind), allocatable :: det(:,:,:) + integer, allocatable :: task_id(:) + integer :: done + real :: time, time0 + zmq_to_qp_run_socket = new_zmq_to_qp_run_socket() + zmq_socket_pull = new_zmq_pull_socket() + allocate(val(b%N), det(N_int, 2, b%N), task_id(N_det_generators)) + done = 0 + more = 1 + pt2(:) = 0d0 + call CPU_TIME(time0) + do while (more == 1) + call pull_selection_results(zmq_socket_pull, pt2_mwen, val(1), det(1,1,1), N, task_id, ntask) + pt2 += pt2_mwen + do i=1, N + call add_to_selection_buffer(b, det(1,1,i), val(i)) + end do + + do i=1, ntask + if(task_id(i) == 0) then + print *, "Error in collector" + endif + call zmq_delete_task(zmq_to_qp_run_socket,zmq_socket_pull,task_id(i),more) + end do + done += ntask + call CPU_TIME(time) +! print *, "DONE" , done, time - time0 + end do + + + call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket) + call end_zmq_pull_socket(zmq_socket_pull) + call sort_selection_buffer(b) +end subroutine + diff --git a/plugins/Selectors_full/zmq.irp.f b/plugins/Selectors_full/zmq.irp.f index 8046212b..59f40daf 100644 --- a/plugins/Selectors_full/zmq.irp.f +++ b/plugins/Selectors_full/zmq.irp.f @@ -90,13 +90,13 @@ subroutine zmq_get_psi(zmq_to_qp_run_socket, worker_id, energy, size_energy) psi_det_size = psi_det_size_read TOUCH psi_det_size N_det N_states - rc = f77_zmq_recv(zmq_to_qp_run_socket,psi_det,N_int*2*N_det*bit_kind,ZMQ_SNDMORE) + rc = f77_zmq_recv(zmq_to_qp_run_socket,psi_det,N_int*2*N_det*bit_kind,0) if (rc /= N_int*2*N_det*bit_kind) then print *, 'f77_zmq_recv(zmq_to_qp_run_socket,psi_det,N_int*2*N_det*bit_kind,ZMQ_SNDMORE)' stop 'error' endif - rc = f77_zmq_recv(zmq_to_qp_run_socket,psi_coef,psi_det_size*N_states*8,ZMQ_SNDMORE) + rc = f77_zmq_recv(zmq_to_qp_run_socket,psi_coef,psi_det_size*N_states*8,0) if (rc /= psi_det_size*N_states*8) then print *, '77_zmq_recv(zmq_to_qp_run_socket,psi_coef,psi_det_size*N_states*8,ZMQ_SNDMORE)' stop 'error' diff --git a/src/Davidson/EZFIO.cfg b/src/Davidson/EZFIO.cfg index 7724400f..84c292dd 100644 --- a/src/Davidson/EZFIO.cfg +++ b/src/Davidson/EZFIO.cfg @@ -7,13 +7,13 @@ default: 1.e-12 [n_states_diag] type: States_number doc: Number of states to consider during the Davdison diagonalization -default: 10 +default: 4 interface: ezfio,provider,ocaml [davidson_sze_max] type: Strictly_positive_int doc: Number of micro-iterations before re-contracting -default: 10 +default: 8 interface: ezfio,provider,ocaml [state_following] diff --git a/src/Davidson/davidson_parallel.irp.f b/src/Davidson/davidson_parallel.irp.f index cede52c9..37c87878 100644 --- a/src/Davidson/davidson_parallel.irp.f +++ b/src/Davidson/davidson_parallel.irp.f @@ -1,191 +1,6 @@ - -!brought to you by garniroy inc. - use bitmasks use f77_zmq -subroutine davidson_process(blockb, blockb2, N, idx, vt, st, bs, istep) - - implicit none - - - integer , intent(in) :: blockb, bs, blockb2, istep - integer , intent(inout) :: N - integer , intent(inout) :: idx(bs) - double precision , intent(inout) :: vt(N_states_diag, bs) - double precision , intent(inout) :: st(N_states_diag, bs) - - integer :: i,ii, j, sh, sh2, exa, ext, org_i, org_j, istate, ni, endi - integer(bit_kind) :: sorted_i(N_int) - double precision :: s2, hij - logical, allocatable :: wrotten(:) - - allocate(wrotten(bs)) - wrotten = .false. - PROVIDE dav_det - - ii=0 - sh = blockb - do sh2=1,shortcut_(0,1) - exa = 0 - do ni=1,N_int - exa = exa + popcnt(xor(version_(ni,sh,1), version_(ni,sh2,1))) - end do - if(exa > 2) cycle - - do i=blockb2+shortcut_(sh,1),shortcut_(sh+1,1)-1, istep - ii = i - shortcut_(blockb,1) + 1 - - org_i = sort_idx_(i,1) - do ni=1,N_int - sorted_i(ni) = sorted_(ni,i,1) - enddo - - do j=shortcut_(sh2,1), shortcut_(sh2+1,1)-1 - if(i == j) cycle - org_j = sort_idx_(j,1) - ext = exa - do ni=1,N_int - ext = ext + popcnt(xor(sorted_i(ni), sorted_(ni,j,1))) - end do - if(ext <= 4) then - call get_s2(dav_det(1,1,org_j),dav_det(1,1,org_i),n_int,s2) - call i_h_j (dav_det(1,1,org_j),dav_det(1,1,org_i),n_int,hij) - if(.not. wrotten(ii)) then - wrotten(ii) = .true. - idx(ii) = org_i - vt (:,ii) = 0d0 - st (:,ii) = 0d0 - end if - do istate=1,N_states_diag - vt (istate,ii) += hij*dav_ut(istate,org_j) - st (istate,ii) += s2*dav_ut(istate,org_j) - enddo - endif - enddo - enddo - enddo - - - if (blockb <= shortcut_(0,2)) then - sh=blockb - do sh2=sh, shortcut_(0,2), shortcut_(0,1) - do i=blockb2+shortcut_(sh2,2),shortcut_(sh2+1,2)-1, istep - ii += 1 - org_i = sort_idx_(i,2) - do j=shortcut_(sh2,2),shortcut_(sh2+1,2)-1 - if(i == j) cycle - org_j = sort_idx_(j,2) - ext = 0 - do ni=1,N_int - ext = ext + popcnt(xor(sorted_(ni,i,2), sorted_(ni,j,2))) - end do - if(ext == 4) then - call i_h_j (dav_det(1,1,org_j),dav_det(1,1,org_i),n_int,hij) - call get_s2(dav_det(1,1,org_j),dav_det(1,1,org_i),n_int,s2) - if(.not. wrotten(ii)) then - wrotten(ii) = .true. - idx(ii) = org_i - vt (:,ii) = 0d0 - st (:,ii) = 0d0 - end if - do istate=1,N_states_diag - vt (istate,ii) += hij*dav_ut(istate,org_j) - st (istate,ii) += s2*dav_ut(istate,org_j) - enddo - end if - end do - end do - enddo - endif - - N=0 - do i=1,bs - if(wrotten(i)) then - N += 1 - idx(N) = idx(i) - vt(:,N) = vt(:,i) - st(:,N) = st(:,i) - end if - end do - - -end subroutine - - - - -subroutine davidson_collect(N, idx, vt, st , v0t, s0t) - implicit none - - - integer , intent(in) :: N - integer , intent(in) :: idx(N) - double precision , intent(in) :: vt(N_states_diag, N) - double precision , intent(in) :: st(N_states_diag, N) - double precision , intent(inout) :: v0t(N_states_diag,dav_size) - double precision , intent(inout) :: s0t(N_states_diag,dav_size) - - integer :: i, j, k - - !DIR$ IVDEP - do i=1,N - k = idx(i) - !DIR$ IVDEP - do j=1,N_states_diag - v0t(j,k) = v0t(j,k) + vt(j,i) - s0t(j,k) = s0t(j,k) + st(j,i) - enddo - end do -end subroutine - - -subroutine davidson_init(zmq_to_qp_run_socket,n,n_st_8,ut) - use f77_zmq - implicit none - - integer(ZMQ_PTR), intent(out) :: zmq_to_qp_run_socket - integer, intent(in) :: n, n_st_8 - double precision, intent(in) :: ut(n_st_8,n) - integer :: i,k - - - dav_size = n - touch dav_size - - do i=1,n - do k=1,N_int - dav_det(k,1,i) = psi_det(k,1,i) - dav_det(k,2,i) = psi_det(k,2,i) - enddo - enddo - do i=1,n - do k=1,N_states_diag - dav_ut(k,i) = ut(k,i) - enddo - enddo - - touch dav_det dav_ut - - call new_parallel_job(zmq_to_qp_run_socket,"davidson") -end subroutine - - - -subroutine davidson_add_task(zmq_to_qp_run_socket, blockb, blockb2, istep) - use f77_zmq - implicit none - - integer(ZMQ_PTR) ,intent(in) :: zmq_to_qp_run_socket - integer ,intent(in) :: blockb, blockb2, istep - character*(512) :: task - - - write(task,*) blockb, blockb2, istep - call add_task_to_taskserver(zmq_to_qp_run_socket, task) -end subroutine - - subroutine davidson_slave_inproc(i) implicit none @@ -211,8 +26,6 @@ subroutine davidson_run_slave(thread,iproc) integer, intent(in) :: thread, iproc integer :: worker_id, task_id, blockb - character*(512) :: task - integer(ZMQ_PTR),external :: new_zmq_to_qp_run_socket integer(ZMQ_PTR) :: zmq_to_qp_run_socket @@ -231,7 +44,7 @@ subroutine davidson_run_slave(thread,iproc) return end if - call davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, worker_id) + call davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, N_states_diag, N_det, worker_id) call disconnect_from_taskserver(zmq_to_qp_run_socket,zmq_socket_push,worker_id) call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket) call end_zmq_push_socket(zmq_socket_push,thread) @@ -239,338 +52,346 @@ end subroutine -subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, worker_id) +subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, N_st, sze, worker_id) use f77_zmq implicit none integer(ZMQ_PTR),intent(in) :: zmq_to_qp_run_socket integer(ZMQ_PTR),intent(in) :: zmq_socket_push - integer,intent(in) :: worker_id - integer :: task_id - character*(512) :: task + integer,intent(in) :: worker_id, N_st, sze + integer :: task_id + character*(512) :: msg + integer :: imin, imax, ishift, istep + double precision, allocatable :: v_0(:,:), s_0(:,:), u_t(:,:) + !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: u_t, v_0, s_0 - integer :: blockb, blockb2, istep - integer :: N - integer , allocatable :: idx(:) - double precision , allocatable :: vt(:,:) - double precision , allocatable :: st(:,:) + ! Get wave function (u_t) + ! ----------------------- + + integer :: rc + write(msg, *) 'get_psi ', worker_id + rc = f77_zmq_send(zmq_to_qp_run_socket,trim(msg),len(trim(msg)),0) + if (rc /= len(trim(msg))) then + print *, 'f77_zmq_send(zmq_to_qp_run_socket,trim(msg),len(trim(msg)),0)' + stop 'error' + endif + + rc = f77_zmq_recv(zmq_to_qp_run_socket,msg,len(msg),0) + if (msg(1:13) /= 'get_psi_reply') then + print *, rc, trim(msg) + print *, 'Error in get_psi_reply' + stop 'error' + endif + + integer :: N_states_read, N_det_read, psi_det_size_read + integer :: N_det_selectors_read, N_det_generators_read + double precision :: energy(N_st) + + read(msg(14:rc),*) rc, N_states_read, N_det_read, psi_det_size_read, & + N_det_generators_read, N_det_selectors_read + + if (rc /= worker_id) then + print *, 'Wrong worker ID' + stop 'error' + endif - integer :: bs, i, j - - allocate(idx(1), vt(1,1), st(1,1)) + if (N_states_read /= N_st) then + print *, N_st + stop 'error : N_st' + endif + + if (N_det_read /= N_det) then + N_det = N_det_read + TOUCH N_det + endif + + + allocate(v_0(sze,N_st), s_0(sze,N_st),u_t(N_st,N_det)) + + rc = f77_zmq_recv(zmq_to_qp_run_socket,psi_det,N_int*2*N_det*bit_kind,0) + if (rc /= N_int*2*N_det*bit_kind) then + print *, 'f77_zmq_recv(zmq_to_qp_run_socket,psi_det,N_int*2*N_det*bit_kind,0)' + stop 'error' + endif + + rc = f77_zmq_recv(zmq_to_qp_run_socket,u_t,size(u_t)*8,0) + if (rc /= size(u_t)*8) then + print *, rc, size(u_t)*8 + print *, 'f77_zmq_recv(zmq_to_qp_run_socket,u_t,size(u_t)×8,0)' + stop 'error' + endif + + rc = f77_zmq_recv(zmq_to_qp_run_socket,energy,N_st*8,0) + if (rc /= N_st*8) then + print *, '77_zmq_recv(zmq_to_qp_run_socket,energy,N_st*8,0)' + stop 'error' + endif + + ! Run tasks + ! --------- do - call get_task_from_taskserver(zmq_to_qp_run_socket,worker_id, task_id, task) + v_0 = 0.d0 + s_0 = 0.d0 + call get_task_from_taskserver(zmq_to_qp_run_socket,worker_id, task_id, msg) if(task_id == 0) exit - read (task,*) blockb, blockb2, istep - bs = shortcut_(blockb+1,1) - shortcut_(blockb, 1) - do i=blockb, shortcut_(0,2), shortcut_(0,1) - do j=i, min(i, shortcut_(0,2)) - bs += shortcut_(j+1,2) - shortcut_(j, 2) - end do - end do - if(bs > size(idx)) then - deallocate(idx, vt, st) - allocate(idx(bs)) - allocate(vt(N_states_diag, bs)) - allocate(st(N_states_diag, bs)) - end if - - call davidson_process(blockb, blockb2, N, idx, vt, st, bs, istep) + read (msg,*) imin, imax, ishift, istep + call H_S2_u_0_nstates_openmp_work(v_0,s_0,u_t,N_st,N_det,imin,imax,ishift,istep) call task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id) - call davidson_push_results(zmq_socket_push, blockb, blockb2, N, idx, vt, st, task_id) + call davidson_push_results(zmq_socket_push, v_0, s_0, task_id) end do + deallocate(v_0, s_0, u_t) end subroutine -subroutine davidson_push_results(zmq_socket_push, blockb, blocke, N, idx, vt, st, task_id) +subroutine davidson_push_results(zmq_socket_push, v_0, s_0, task_id) use f77_zmq implicit none integer(ZMQ_PTR) ,intent(in) :: zmq_socket_push integer ,intent(in) :: task_id - - integer ,intent(in) :: blockb, blocke - integer ,intent(in) :: N - integer ,intent(in) :: idx(N) - double precision ,intent(in) :: vt(N_states_diag, N) - double precision ,intent(in) :: st(N_states_diag, N) + double precision ,intent(in) :: v_0(N_det,N_states_diag) + double precision ,intent(in) :: s_0(N_det,N_states_diag) integer :: rc - rc = f77_zmq_send( zmq_socket_push, blockb, 4, ZMQ_SNDMORE) - if(rc /= 4) stop "davidson_push_results failed to push blockb" + rc = f77_zmq_send( zmq_socket_push, v_0, 8*N_states_diag*N_det, ZMQ_SNDMORE) + if(rc /= 8*N_states_diag* N_det) stop "davidson_push_results failed to push vt" - rc = f77_zmq_send( zmq_socket_push, blocke, 4, ZMQ_SNDMORE) - if(rc /= 4) stop "davidson_push_results failed to push blocke" - - rc = f77_zmq_send( zmq_socket_push, N, 4, ZMQ_SNDMORE) - if(rc /= 4) stop "davidson_push_results failed to push N" - - rc = f77_zmq_send( zmq_socket_push, idx, 4*N, ZMQ_SNDMORE) - if(rc /= 4*N) stop "davidson_push_results failed to push idx" - - rc = f77_zmq_send( zmq_socket_push, vt, 8*N_states_diag* N, ZMQ_SNDMORE) - if(rc /= 8*N_states_diag* N) stop "davidson_push_results failed to push vt" - - rc = f77_zmq_send( zmq_socket_push, st, 8*N_states_diag* N, ZMQ_SNDMORE) - if(rc /= 8*N_states_diag* N) stop "davidson_push_results failed to push st" + rc = f77_zmq_send( zmq_socket_push, s_0, 8*N_states_diag*N_det, ZMQ_SNDMORE) + if(rc /= 8*N_states_diag* N_det) stop "davidson_push_results failed to push st" rc = f77_zmq_send( zmq_socket_push, task_id, 4, 0) if(rc /= 4) stop "davidson_push_results failed to push task_id" + +! Activate is zmq_socket_push is a REQ + integer :: idummy + rc = f77_zmq_recv( zmq_socket_push, idummy, 4, 0) + if (rc /= 4) then + print *, irp_here, ': f77_zmq_send( zmq_socket_push, idummy, 4, 0)' + stop 'error' + endif + end subroutine -subroutine davidson_pull_results(zmq_socket_pull, blockb, blocke, N, idx, vt, st, task_id) +subroutine davidson_pull_results(zmq_socket_pull, v_0, s_0, task_id) use f77_zmq implicit none integer(ZMQ_PTR) ,intent(in) :: zmq_socket_pull integer ,intent(out) :: task_id - integer ,intent(out) :: blockb, blocke - integer ,intent(out) :: N - integer ,intent(out) :: idx(*) - double precision ,intent(out) :: vt(N_states_diag, *) - double precision ,intent(out) :: st(N_states_diag, *) + double precision ,intent(out) :: v_0(N_det,N_states_diag) + double precision ,intent(out) :: s_0(N_det,N_states_diag) integer :: rc - rc = f77_zmq_recv( zmq_socket_pull, blockb, 4, 0) - if(rc /= 4) stop "davidson_push_results failed to pull blockb" - - rc = f77_zmq_recv( zmq_socket_pull, blocke, 4, 0) - if(rc /= 4) stop "davidson_push_results failed to pull blocke" - - rc = f77_zmq_recv( zmq_socket_pull, N, 4, 0) - if(rc /= 4) stop "davidson_push_results failed to pull N" - - rc = f77_zmq_recv( zmq_socket_pull, idx, 4*N, 0) - if(rc /= 4*N) stop "davidson_push_results failed to pull idx" + rc = f77_zmq_recv( zmq_socket_pull, v_0, 8*N_det*N_states_diag, 0) + if(rc /= 8*N_det*N_states_diag) stop "davidson_push_results failed to pull v_0" - rc = f77_zmq_recv( zmq_socket_pull, vt, 8*N_states_diag* N, 0) - if(rc /= 8*N_states_diag* N) stop "davidson_push_results failed to pull vt" - - rc = f77_zmq_recv( zmq_socket_pull, st, 8*N_states_diag* N, 0) - if(rc /= 8*N_states_diag* N) stop "davidson_push_results failed to pull st" + rc = f77_zmq_recv( zmq_socket_pull, s_0, 8*N_det*N_states_diag, 0) + if(rc /= 8*N_det*N_states_diag) stop "davidson_push_results failed to pull s_0" rc = f77_zmq_recv( zmq_socket_pull, task_id, 4, 0) if(rc /= 4) stop "davidson_pull_results failed to pull task_id" + +! Activate if zmq_socket_pull is a REP + rc = f77_zmq_send( zmq_socket_pull, 0, 4, 0) + if (rc /= 4) then + print *, irp_here, ' : f77_zmq_send (zmq_socket_pull,...' + stop 'error' + endif + end subroutine -subroutine davidson_collector(zmq_to_qp_run_socket, zmq_socket_pull , v0, s0, LDA) +subroutine davidson_collector(zmq_to_qp_run_socket, v0, s0, sze, N_st) use f77_zmq implicit none - integer :: LDA + integer, intent(in) :: sze, N_st integer(ZMQ_PTR), intent(in) :: zmq_to_qp_run_socket - integer(ZMQ_PTR), intent(in) :: zmq_socket_pull - double precision ,intent(inout) :: v0(LDA, N_states_diag) - double precision ,intent(inout) :: s0(LDA, N_states_diag) + double precision ,intent(inout) :: v0(sze, N_st) + double precision ,intent(inout) :: s0(sze, N_st) - integer :: more, task_id, taskn + integer :: more, task_id - integer :: blockb, blocke - integer :: N - integer , allocatable :: idx(:) - double precision , allocatable :: vt(:,:), v0t(:,:), s0t(:,:) - double precision , allocatable :: st(:,:) - - integer :: msize - - msize = (1 + max_blocksize)*2 - allocate(idx(msize)) - allocate(vt(N_states_diag, msize)) - allocate(st(N_states_diag, msize)) - allocate(v0t(N_states_diag, dav_size)) - allocate(s0t(N_states_diag, dav_size)) - - v0t = 00.d0 - s0t = 00.d0 - - more = 1 - - do while (more == 1) - call davidson_pull_results(zmq_socket_pull, blockb, blocke, N, idx, vt, st, task_id) - !DIR$ FORCEINLINE - call davidson_collect(N, idx, vt, st , v0t, s0t) - call zmq_delete_task(zmq_to_qp_run_socket,zmq_socket_pull,task_id,more) - end do - deallocate(idx,vt,st) - + double precision, allocatable :: v_0(:,:), s_0(:,:) integer :: i,j - !DIR$ IVDEP - do j=1,N_states_diag - !DIR$ IVDEP - do i=1,dav_size - v0(i,j) = v0t(j,i) - s0(i,j) = s0t(j,i) - enddo - enddo - - deallocate(v0t,s0t) -end subroutine - - -subroutine davidson_run(zmq_to_qp_run_socket , v0, s0, LDA) - use f77_zmq - implicit none - - integer :: LDA - integer(ZMQ_PTR), intent(in) :: zmq_to_qp_run_socket - integer(ZMQ_PTR),external :: new_zmq_to_qp_run_socket - integer(ZMQ_PTR) :: zmq_collector integer(ZMQ_PTR), external :: new_zmq_pull_socket integer(ZMQ_PTR) :: zmq_socket_pull - - integer :: i - integer, external :: omp_get_thread_num - double precision , intent(inout) :: v0(LDA, N_states_diag) - double precision , intent(inout) :: s0(LDA, N_states_diag) - - call zmq_set_running(zmq_to_qp_run_socket) - - zmq_collector = new_zmq_to_qp_run_socket() + allocate(v_0(N_det,N_st), s_0(N_det,N_st)) + v0 = 0.d0 + s0 = 0.d0 + more = 1 zmq_socket_pull = new_zmq_pull_socket() - i = omp_get_thread_num() - - - PROVIDE nproc - - !$OMP PARALLEL NUM_THREADS(nproc+2) PRIVATE(i) - i = omp_get_thread_num() - if (i == 0 ) then - call davidson_collector(zmq_collector, zmq_socket_pull , v0, s0, LDA) - call end_zmq_to_qp_run_socket(zmq_collector) - call end_zmq_pull_socket(zmq_socket_pull) - call davidson_miniserver_end() - else if (i == 1 ) then - call davidson_miniserver_run () - else - call davidson_slave_inproc(i) - endif - !$OMP END PARALLEL + do while (more == 1) + call davidson_pull_results(zmq_socket_pull, v_0, s_0, task_id) + do j=1,N_st + do i=1,N_det + v0(i,j) = v0(i,j) + v_0(i,j) + s0(i,j) = s0(i,j) + s_0(i,j) + enddo + enddo + call zmq_delete_task(zmq_to_qp_run_socket,zmq_socket_pull,task_id,more) + end do + deallocate(v_0,s_0) + call end_zmq_pull_socket(zmq_socket_pull) - call end_parallel_job(zmq_to_qp_run_socket, 'davidson') end subroutine -subroutine davidson_miniserver_run() + +subroutine H_S2_u_0_nstates_zmq(v_0,s_0,u_0,N_st,sze) + use omp_lib + use bitmasks use f77_zmq implicit none - integer(ZMQ_PTR) responder - character*(64) address - character(len=:), allocatable :: buffer - integer rc + BEGIN_DOC + ! Computes v_0 = H|u_0> and s_0 = S^2 |u_0> + ! + ! n : number of determinants + ! + ! H_jj : array of + ! + ! S2_jj : array of + END_DOC + integer, intent(in) :: N_st, sze + double precision, intent(out) :: v_0(sze,N_st), s_0(sze,N_st) + double precision, intent(inout):: u_0(sze,N_st) + integer :: i,j,k + integer :: ithread + double precision, allocatable :: u_t(:,:) + !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: u_t - allocate (character(len=20) :: buffer) - address = 'tcp://*:11223' + PROVIDE psi_det_beta_unique psi_bilinear_matrix_order_transp_reverse psi_det_alpha_unique + PROVIDE psi_bilinear_matrix_transp_values psi_bilinear_matrix_values psi_bilinear_matrix_columns_loc + PROVIDE ref_bitmask_energy nproc + + + allocate(u_t(N_st,N_det)) + do k=1,N_st + call dset_order(u_0(1,k),psi_bilinear_matrix_order,N_det) + enddo + call dtranspose( & + u_0, & + size(u_0, 1), & + u_t, & + size(u_t, 1), & + N_det, N_st) + + + integer(ZMQ_PTR) :: zmq_to_qp_run_socket - responder = f77_zmq_socket(zmq_context, ZMQ_REP) - rc = f77_zmq_bind(responder,address) + if(N_st /= N_states_diag .or. sze < N_det) stop "assert fail in H_S2_u_0_nstates" + + ASSERT (Nint > 0) + ASSERT (Nint == N_int) + ASSERT (n>0) + + call new_parallel_job(zmq_to_qp_run_socket,'davidson') - do - rc = f77_zmq_recv(responder, buffer, 5, 0) - if (buffer(1:rc) /= 'end') then - rc = f77_zmq_send (responder, dav_size, 4, ZMQ_SNDMORE) - rc = f77_zmq_send (responder, dav_det, 16*N_int*dav_size, ZMQ_SNDMORE) - rc = f77_zmq_send (responder, dav_ut, 8*dav_size*N_states_diag, 0) - else - rc = f77_zmq_send (responder, "end", 3, 0) - exit + character*(512) :: task + integer :: rc + double precision :: energy(N_st) + energy = 0.d0 + + task = ' ' + write(task,*) 'put_psi ', 1, N_st, N_det, N_det + rc = f77_zmq_send(zmq_to_qp_run_socket,trim(task),len(trim(task)),ZMQ_SNDMORE) + if (rc /= len(trim(task))) then + print *, 'f77_zmq_send(zmq_to_qp_run_socket,trim(task),len(trim(task)),ZMQ_SNDMORE)' + stop 'error' + endif + + rc = f77_zmq_send(zmq_to_qp_run_socket,psi_det,N_int*2*N_det*bit_kind,ZMQ_SNDMORE) + if (rc /= N_int*2*N_det*bit_kind) then + print *, 'f77_zmq_send(zmq_to_qp_run_socket,psi_det,N_int*2*N_det*bit_kind,ZMQ_SNDMORE)' + stop 'error' + endif + + rc = f77_zmq_send(zmq_to_qp_run_socket,u_t,size(u_t)*8,ZMQ_SNDMORE) + if (rc /= size(u_t)*8) then + print *, 'f77_zmq_send(zmq_to_qp_run_socket,u_t,size(u_t)*8,ZMQ_SNDMORE)' + stop 'error' + endif + + rc = f77_zmq_send(zmq_to_qp_run_socket,energy,N_st*8,0) + if (rc /= N_st*8) then + print *, 'f77_zmq_send(zmq_to_qp_run_socket,energy,size_energy*8,0)' + stop 'error' + endif + + rc = f77_zmq_recv(zmq_to_qp_run_socket,task,len(task),0) + if (task(1:rc) /= 'put_psi_reply 1') then + print *, rc, trim(task) + print *, 'Error in put_psi_reply' + stop 'error' + endif + + deallocate(u_t) + + + ! Create tasks + ! ============ + + integer :: istep, imin, imax, ishift + double precision :: w, max_workload, N_det_inv, di + max_workload = 1000000.d0 + w = 0.d0 + istep=8 + ishift=0 + imin=1 + N_det_inv = 1.d0/dble(N_det) + di = dble(N_det) + do imax=1,N_det + di = di-1.d0 + w = w + di*N_det_inv + if (w > max_workload) then + do ishift=0,istep-1 + write(task,'(4(I9,1X),1A)') imin, imax, ishift, istep, '|' + call add_task_to_taskserver(zmq_to_qp_run_socket,trim(task)) + enddo + imin = imax+1 + w = 0.d0 endif enddo + if (w > 0.d0) then + imax = N_det + do ishift=0,istep-1 + write(task,'(4(I9,1X),1A)') imin, imax, ishift, istep, '|' + call add_task_to_taskserver(zmq_to_qp_run_socket,trim(task)) + enddo + endif + - rc = f77_zmq_close(responder) -end subroutine - - -subroutine davidson_miniserver_end() - implicit none - use f77_zmq - - integer(ZMQ_PTR) requester - character*(64) address - integer rc - character*(64) buf - - address = trim(qp_run_address)//':11223' - requester = f77_zmq_socket(zmq_context, ZMQ_REQ) - rc = f77_zmq_connect(requester,address) - - rc = f77_zmq_send(requester, "end", 3, 0) - rc = f77_zmq_recv(requester, buf, 3, 0) - rc = f77_zmq_close(requester) -end subroutine - - -subroutine davidson_miniserver_get() - implicit none - use f77_zmq - - integer(ZMQ_PTR) requester - character*(64) address - character*(20) buffer - integer rc - - address = trim(qp_run_address)//':11223' - - requester = f77_zmq_socket(zmq_context, ZMQ_REQ) - rc = f77_zmq_connect(requester,address) - - rc = f77_zmq_send(requester, "Hello", 5, 0) - rc = f77_zmq_recv(requester, dav_size, 4, 0) - TOUCH dav_size - rc = f77_zmq_recv(requester, dav_det, 16*N_int*dav_size, 0) - rc = f77_zmq_recv(requester, dav_ut, 8*dav_size*N_states_diag, 0) - TOUCH dav_det dav_ut - - -end subroutine - - - - BEGIN_PROVIDER [ integer(bit_kind), dav_det, (N_int, 2, dav_size) ] -&BEGIN_PROVIDER [ double precision, dav_ut, (N_states_diag, dav_size) ] - use bitmasks - implicit none - BEGIN_DOC -! Temporary arrays for parallel davidson -! -! Touched in davidson_miniserver_get - END_DOC - dav_det = 0_bit_kind - dav_ut = -huge(1.d0) -END_PROVIDER - - -BEGIN_PROVIDER [ integer, dav_size ] - implicit none - BEGIN_DOC -! Size of the arrays for Davidson -! -! Touched in davidson_miniserver_get - END_DOC - dav_size = 1 -END_PROVIDER - - - BEGIN_PROVIDER [ integer, shortcut_, (0:dav_size+1, 2) ] -&BEGIN_PROVIDER [ integer(bit_kind), version_, (N_int, dav_size, 2) ] -&BEGIN_PROVIDER [ integer(bit_kind), sorted_, (N_int, dav_size, 2) ] -&BEGIN_PROVIDER [ integer, sort_idx_, (dav_size, 2) ] -&BEGIN_PROVIDER [ integer, max_blocksize ] -implicit none - call sort_dets_ab_v(dav_det, sorted_(1,1,1), sort_idx_(1,1), shortcut_(0,1), version_(1,1,1), dav_size, N_int) - call sort_dets_ba_v(dav_det, sorted_(1,1,2), sort_idx_(1,2), shortcut_(0,2), version_(1,1,2), dav_size, N_int) - max_blocksize = max(shortcut_(0,1), shortcut_(0,2)) -END_PROVIDER + v_0 = 0.d0 + s_0 = 0.d0 + + call omp_set_nested(.True.) + call zmq_set_running(zmq_to_qp_run_socket) + !$OMP PARALLEL NUM_THREADS(2) PRIVATE(ithread) + ithread = omp_get_thread_num() + if (ithread == 0 ) then + call davidson_collector(zmq_to_qp_run_socket, v_0, s_0, N_det, N_st) + else + call davidson_slave_inproc(1) + endif + !$OMP END PARALLEL + call end_parallel_job(zmq_to_qp_run_socket, 'davidson') + + do k=1,N_st + call dset_order(v_0(1,k),psi_bilinear_matrix_order_reverse,N_det) + call dset_order(s_0(1,k),psi_bilinear_matrix_order_reverse,N_det) + call dset_order(u_0(1,k),psi_bilinear_matrix_order_reverse,N_det) + enddo +end diff --git a/src/Davidson/davidson_slave.irp.f b/src/Davidson/davidson_slave.irp.f index e28712e2..8aa0d6e7 100644 --- a/src/Davidson/davidson_slave.irp.f +++ b/src/Davidson/davidson_slave.irp.f @@ -16,24 +16,17 @@ program davidson_slave state = 'Waiting' zmq_to_qp_run_socket = new_zmq_to_qp_run_socket() - do call wait_for_state(zmq_state,state) if(trim(state) /= "davidson") exit - call davidson_miniserver_get() - integer :: rc, i - print *, 'Davidson slave running' - - !$OMP PARALLEL PRIVATE(i) - i = omp_get_thread_num() call davidson_slave_tcp(i) - !$OMP END PARALLEL end do end subroutine provide_everything - PROVIDE mo_bielec_integrals_in_map psi_det_sorted_bit N_states_diag zmq_context + PROVIDE mo_bielec_integrals_in_map psi_det_sorted_bit N_states_diag zmq_context ref_bitmask_energy end subroutine + diff --git a/src/Davidson/diagonalization.irp.f b/src/Davidson/diagonalization.irp.f index 9bbd00f5..4d9a67d5 100644 --- a/src/Davidson/diagonalization.irp.f +++ b/src/Davidson/diagonalization.irp.f @@ -302,7 +302,6 @@ subroutine davidson_diag_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,N_st,N_st_dia double precision, intent(inout) :: u_in(dim_in,N_st_diag) double precision, intent(out) :: energies(N_st_diag) - integer :: sze_8 integer :: iter integer :: i,j,k,l,m logical :: converged @@ -365,13 +364,12 @@ subroutine davidson_diag_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,N_st,N_st_dia write(iunit,'(A)') trim(write_buffer) integer, external :: align_double - sze_8 = align_double(sze) allocate( & kl_pairs(2,N_st_diag*(N_st_diag+1)/2), & - W(sze_8,N_st_diag,davidson_sze_max), & - U(sze_8,N_st_diag,davidson_sze_max), & - R(sze_8,N_st_diag), & + W(sze,N_st_diag,davidson_sze_max), & + U(sze,N_st_diag,davidson_sze_max), & + R(sze,N_st_diag), & h(N_st_diag,davidson_sze_max,N_st_diag,davidson_sze_max), & y(N_st_diag,davidson_sze_max,N_st_diag,davidson_sze_max), & residual_norm(N_st_diag), & @@ -426,7 +424,7 @@ subroutine davidson_diag_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,N_st,N_st_dia ! Compute |W_k> = \sum_i |i> ! ----------------------------------------- - call H_u_0_nstates(W(1,1,iter),U(1,1,iter),H_jj,sze,dets_in,Nint,N_st_diag,sze_8) + call H_u_0_nstates(W(1,1,iter),U(1,1,iter),H_jj,sze,dets_in,Nint,N_st_diag,sze) ! do k=1,N_st ! if(store_full_H_mat.and.sze.le.n_det_max_stored)then ! call H_u_0_stored(W(1,k,iter),U(1,k,iter),H_matrix_all_dets,sze) diff --git a/src/Davidson/diagonalization_hs2.irp.f b/src/Davidson/diagonalization_hs2.irp.f index dccc8ee5..54672609 100644 --- a/src/Davidson/diagonalization_hs2.irp.f +++ b/src/Davidson/diagonalization_hs2.irp.f @@ -23,41 +23,33 @@ subroutine davidson_diag_hs2(dets_in,u_in,s2_out,dim_in,energies,sze,N_st,N_st_d integer(bit_kind), intent(in) :: dets_in(Nint,2,sze) double precision, intent(inout) :: u_in(dim_in,N_st_diag) double precision, intent(out) :: energies(N_st_diag), s2_out(N_st_diag) - double precision, allocatable :: H_jj(:), S2_jj(:) + double precision, allocatable :: H_jj(:) - double precision :: diag_h_mat_elem + double precision :: diag_H_mat_elem, diag_S_mat_elem integer :: i ASSERT (N_st > 0) ASSERT (sze > 0) ASSERT (Nint > 0) ASSERT (Nint == N_int) PROVIDE mo_bielec_integrals_in_map - allocate(H_jj(sze), S2_jj(sze)) + allocate(H_jj(sze) ) !$OMP PARALLEL DEFAULT(NONE) & - !$OMP SHARED(sze,H_jj,S2_jj, dets_in,Nint) & + !$OMP SHARED(sze,H_jj, dets_in,Nint) & !$OMP PRIVATE(i) - !$OMP DO SCHEDULE(guided) + !$OMP DO SCHEDULE(static) do i=1,sze - H_jj(i) = diag_h_mat_elem(dets_in(1,1,i),Nint) - call get_s2(dets_in(1,1,i),dets_in(1,1,i),Nint,S2_jj(i)) + H_jj(i) = diag_H_mat_elem(dets_in(1,1,i),Nint) enddo !$OMP END DO !$OMP END PARALLEL - if (disk_based_davidson) then - call davidson_diag_hjj_sjj_mmap(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_st,N_st_diag,Nint,iunit) - else - call davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_st,N_st_diag,Nint,iunit) - endif - do i=1,N_st_diag - s2_out(i) = S2_jj(i) - enddo - deallocate (H_jj,S2_jj) + call davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_out,energies,dim_in,sze,N_st,N_st_diag,Nint,iunit) + deallocate (H_jj) end -subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_st,N_st_diag,Nint,iunit) +subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_st,N_st_diag,Nint,iunit) use bitmasks implicit none BEGIN_DOC @@ -65,7 +57,7 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s ! ! H_jj : specific diagonal H matrix elements to diagonalize de Davidson ! - ! S2_jj : specific diagonal S^2 matrix elements + ! S2_out : Output : s^2 ! ! dets_in : bitmasks corresponding to determinants ! @@ -87,12 +79,11 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s integer, intent(in) :: dim_in, sze, N_st, N_st_diag, Nint integer(bit_kind), intent(in) :: dets_in(Nint,2,sze) double precision, intent(in) :: H_jj(sze) - double precision, intent(inout) :: S2_jj(sze) + double precision, intent(inout) :: s2_out(N_st_diag) integer, intent(in) :: iunit double precision, intent(inout) :: u_in(dim_in,N_st_diag) double precision, intent(out) :: energies(N_st_diag) - integer :: sze_8 integer :: iter integer :: i,j,k,l,m logical :: converged @@ -122,7 +113,10 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s stop -1 endif - PROVIDE nuclear_repulsion expected_s2 + integer, external :: align_double + itermax = max(3,min(davidson_sze_max, sze/N_st_diag)) + + PROVIDE nuclear_repulsion expected_s2 psi_bilinear_matrix_order psi_bilinear_matrix_order_reverse call write_time(iunit) call wall_time(wall) @@ -134,6 +128,9 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s call write_int(iunit,N_st,'Number of states') call write_int(iunit,N_st_diag,'Number of states in diagonalization') call write_int(iunit,sze,'Number of determinants') + r1 = 8.d0*(3.d0*dble(sze*N_st_diag*itermax+5.d0*(N_st_diag*itermax)**2 & + + 4.d0*(N_st_diag*itermax)+nproc*(4.d0*N_det_alpha_unique+2.d0*N_st_diag*sze)))/(1024.d0**3) + call write_double(iunit, r1, 'Memory(Gb)') write(iunit,'(A)') '' write_buffer = '===== ' do i=1,N_st @@ -151,14 +148,14 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s enddo write(iunit,'(A)') trim(write_buffer) - integer, external :: align_double - sze_8 = align_double(sze) - - itermax = max(3,min(davidson_sze_max, sze/N_st_diag)) + allocate( & - W(sze_8,N_st_diag*itermax), & - U(sze_8,N_st_diag*itermax), & - S(sze_8,N_st_diag*itermax), & + ! Large + W(sze,N_st_diag*itermax), & + U(sze,N_st_diag*itermax), & + S(sze,N_st_diag*itermax), & + + ! Small h(N_st_diag*itermax,N_st_diag*itermax), & y(N_st_diag*itermax,N_st_diag*itermax), & s_(N_st_diag*itermax,N_st_diag*itermax), & @@ -202,7 +199,7 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s do k=1,N_st_diag call normalize(u_in(1,k),sze) enddo - + do while (.not.converged) @@ -223,8 +220,11 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s ! ----------------------------------------- -! call H_S2_u_0_nstates_zmq(W(1,shift+1),S(1,shift+1),U(1,shift+1),H_jj,S2_jj,sze,dets_in,Nint,N_st_diag,sze_8) - call H_S2_u_0_nstates(W(1,shift+1),S(1,shift+1),U(1,shift+1),H_jj,S2_jj,sze,dets_in,Nint,N_st_diag,sze_8) + if (distributed_davidson) then + call H_S2_u_0_nstates_zmq (W(1,shift+1),S(1,shift+1),U(1,shift+1),N_st_diag,sze) + else + call H_S2_u_0_nstates_openmp(W(1,shift+1),S(1,shift+1),U(1,shift+1),N_st_diag,sze) + endif ! Compute h_kl = = @@ -400,7 +400,7 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s endif enddo - write(iunit,'(X,I3,X,100(X,F16.10,X,F11.6,X,E11.3))') iter, to_print(1:3,1:N_st) + write(iunit,'(1X,I3,1X,100(1X,F16.10,1X,F11.6,1X,E11.3))') iter, to_print(1:3,1:N_st) call davidson_converged(lambda,residual_norm,wall,iter,cpu,N_st,converged) do k=1,N_st if (residual_norm(k) > 1.e8) then @@ -424,7 +424,7 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s do k=1,N_st_diag energies(k) = lambda(k) - S2_jj(k) = s2(k) + s2_out(k) = s2(k) enddo write_buffer = '===== ' do i=1,N_st @@ -444,439 +444,3 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_s ) end -subroutine davidson_diag_hjj_sjj_mmap(dets_in,u_in,H_jj,S2_jj,energies,dim_in,sze,N_st,N_st_diag,Nint,iunit) - use bitmasks - use mmap_module - implicit none - BEGIN_DOC - ! Davidson diagonalization with specific diagonal elements of the H matrix - ! - ! H_jj : specific diagonal H matrix elements to diagonalize de Davidson - ! - ! S2_jj : specific diagonal S^2 matrix elements - ! - ! dets_in : bitmasks corresponding to determinants - ! - ! u_in : guess coefficients on the various states. Overwritten - ! on exit - ! - ! dim_in : leftmost dimension of u_in - ! - ! sze : Number of determinants - ! - ! N_st : Number of eigenstates - ! - ! N_st_diag : Number of states in which H is diagonalized. Assumed > sze - ! - ! iunit : Unit for the I/O - ! - ! Initial guess vectors are not necessarily orthonormal - END_DOC - integer, intent(in) :: dim_in, sze, N_st, N_st_diag, Nint - integer(bit_kind), intent(in) :: dets_in(Nint,2,sze) - double precision, intent(in) :: H_jj(sze) - double precision, intent(inout) :: S2_jj(sze) - integer, intent(in) :: iunit - double precision, intent(inout) :: u_in(dim_in,N_st_diag) - double precision, intent(out) :: energies(N_st_diag) - - integer :: sze_8 - integer :: iter - integer :: i,j,k,l,m - logical :: converged - - double precision :: u_dot_v, u_dot_u - - integer :: k_pairs, kl - - integer :: iter2 - double precision, pointer :: W(:,:), U(:,:), S(:,:), overlap(:,:) - double precision, allocatable :: y(:,:), h(:,:), lambda(:), s2(:) - double precision, allocatable :: c(:), s_(:,:), s_tmp(:,:) - double precision :: diag_h_mat_elem - double precision, allocatable :: residual_norm(:) - character*(16384) :: write_buffer - double precision :: to_print(3,N_st) - double precision :: cpu, wall - logical :: state_ok(N_st_diag*davidson_sze_max) - integer :: shift, shift2, itermax - include 'constants.include.F' - - !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: U, W, S, y, h, lambda - if (N_st_diag*3 > sze) then - print *, 'error in Davidson :' - print *, 'Increase n_det_max_jacobi to ', N_st_diag*3 - stop -1 - endif - - PROVIDE nuclear_repulsion expected_s2 - - call write_time(iunit) - call wall_time(wall) - call cpu_time(cpu) - write(iunit,'(A)') '' - write(iunit,'(A)') 'Davidson Diagonalization' - write(iunit,'(A)') '------------------------' - write(iunit,'(A)') '' - call write_int(iunit,N_st,'Number of states') - call write_int(iunit,N_st_diag,'Number of states in diagonalization') - call write_int(iunit,sze,'Number of determinants') - write(iunit,'(A)') '' - write_buffer = '===== ' - do i=1,N_st - write_buffer = trim(write_buffer)//' ================ =========== ===========' - enddo - write(iunit,'(A)') trim(write_buffer) - write_buffer = ' Iter' - do i=1,N_st - write_buffer = trim(write_buffer)//' Energy S^2 Residual ' - enddo - write(iunit,'(A)') trim(write_buffer) - write_buffer = '===== ' - do i=1,N_st - write_buffer = trim(write_buffer)//' ================ =========== ===========' - enddo - write(iunit,'(A)') trim(write_buffer) - - integer, external :: align_double - integer :: fd(3) - type(c_ptr) :: c_pointer(3) - sze_8 = align_double(sze) - - itermax = min(davidson_sze_max, sze/N_st_diag) - - call mmap( & - trim(ezfio_work_dir)//'U', & - (/ int(sze_8,8),int(N_st_diag*itermax,8) /), & - 8, fd(1), .False., c_pointer(1)) - call c_f_pointer(c_pointer(1), W, (/ sze_8,N_st_diag*itermax /) ) - - call mmap( & - trim(ezfio_work_dir)//'W', & - (/ int(sze_8,8),int(N_st_diag*itermax,8) /), & - 8, fd(2), .False., c_pointer(2)) - call c_f_pointer(c_pointer(2), U, (/ sze_8,N_st_diag*itermax /) ) - - call mmap( & - trim(ezfio_work_dir)//'S', & - (/ int(sze_8,8),int(N_st_diag*itermax,8) /), & - 8, fd(3), .False., c_pointer(3)) - call c_f_pointer(c_pointer(3), S, (/ sze_8,N_st_diag*itermax /) ) - - allocate( & - h(N_st_diag*itermax,N_st_diag*itermax), & - y(N_st_diag*itermax,N_st_diag*itermax), & - s_(N_st_diag*itermax,N_st_diag*itermax), & - s_tmp(N_st_diag*itermax,N_st_diag*itermax), & - overlap(N_st_diag*itermax, N_st_diag*itermax), & - residual_norm(N_st_diag), & - c(N_st_diag*itermax), & - s2(N_st_diag*itermax), & - lambda(N_st_diag*itermax)) - - h = 0.d0 - U = 0.d0 - W = 0.d0 - S = 0.d0 - y = 0.d0 - s_ = 0.d0 - s_tmp = 0.d0 - - - ASSERT (N_st > 0) - ASSERT (N_st_diag >= N_st) - ASSERT (sze > 0) - ASSERT (Nint > 0) - ASSERT (Nint == N_int) - - ! Davidson iterations - ! =================== - - converged = .False. - - double precision :: r1, r2 - do k=N_st+1,N_st_diag - u_in(k,k) = 10.d0 - do i=1,sze - call random_number(r1) - r1 = dsqrt(-2.d0*dlog(r1)) - r2 = dtwo_pi*r2 - u_in(i,k) = r1*dcos(r2) - enddo - enddo - do k=1,N_st_diag - call normalize(u_in(1,k),sze) - enddo - - - do while (.not.converged) - - do k=1,N_st_diag - do i=1,sze - U(i,k) = u_in(i,k) - enddo - enddo - - do iter=1,itermax-1 - - shift = N_st_diag*(iter-1) - shift2 = N_st_diag*iter - - call ortho_qr(U,size(U,1),sze,shift2) - - ! Compute |W_k> = \sum_i |i> - ! ----------------------------------------- - - -! call H_S2_u_0_nstates_zmq(W(1,shift+1),S(1,shift+1),U(1,shift+1),H_jj,S2_jj,sze,dets_in,Nint,N_st_diag,sze_8) - call H_S2_u_0_nstates(W(1,shift+1),S(1,shift+1),U(1,shift+1),H_jj,S2_jj,sze,dets_in,Nint,N_st_diag,sze_8) - - - ! Compute h_kl = = - ! ------------------------------------------- - - do k=1,iter - shift = N_st_diag*(k-1) - call dgemm('T','N', N_st_diag, shift2, sze, & - 1.d0, U(1,shift+1), size(U,1), W, size(W,1), & - 0.d0, h(shift+1,1), size(h,1)) - - call dgemm('T','N', N_st_diag, shift2, sze, & - 1.d0, U(1,shift+1), size(U,1), S, size(S,1), & - 0.d0, s_(shift+1,1), size(s_,1)) - enddo - -! ! Diagonalize S^2 -! ! --------------- -! -! call lapack_diag(s2,y,s_,size(s_,1),shift2) -! -! -! ! Rotate H in the basis of eigenfunctions of s2 -! ! --------------------------------------------- -! -! call dgemm('N','N',shift2,shift2,shift2, & -! 1.d0, h, size(h,1), y, size(y,1), & -! 0.d0, s_tmp, size(s_tmp,1)) -! -! call dgemm('T','N',shift2,shift2,shift2, & -! 1.d0, y, size(y,1), s_tmp, size(s_tmp,1), & -! 0.d0, h, size(h,1)) -! -! ! Damp interaction between different spin states -! ! ------------------------------------------------ -! -! do k=1,shift2 -! do l=1,shift2 -! if (dabs(s2(k) - s2(l)) > 1.d0) then -! h(k,l) = h(k,l)*(max(0.d0,1.d0 - dabs(s2(k) - s2(l)))) -! endif -! enddo -! enddo -! -! ! Rotate back H -! ! ------------- -! -! call dgemm('N','T',shift2,shift2,shift2, & -! 1.d0, h, size(h,1), y, size(y,1), & -! 0.d0, s_tmp, size(s_tmp,1)) -! -! call dgemm('N','N',shift2,shift2,shift2, & -! 1.d0, y, size(y,1), s_tmp, size(s_tmp,1), & -! 0.d0, h, size(h,1)) - - - ! Diagonalize h - ! ------------- - call lapack_diag(lambda,y,h,size(h,1),shift2) - - ! Compute S2 for each eigenvector - ! ------------------------------- - - call dgemm('N','N',shift2,shift2,shift2, & - 1.d0, s_, size(s_,1), y, size(y,1), & - 0.d0, s_tmp, size(s_tmp,1)) - - call dgemm('T','N',shift2,shift2,shift2, & - 1.d0, y, size(y,1), s_tmp, size(s_tmp,1), & - 0.d0, s_, size(s_,1)) - - - - do k=1,shift2 - s2(k) = s_(k,k) + S_z2_Sz - enddo - - - if (s2_eig) then - do k=1,shift2 - state_ok(k) = (dabs(s2(k)-expected_s2) < 0.6d0) - enddo - else - state_ok(k) = .True. - endif - - do k=1,shift2 - if (.not. state_ok(k)) then - do l=k+1,shift2 - if (state_ok(l)) then - call dswap(shift2, y(1,k), 1, y(1,l), 1) - call dswap(1, s2(k), 1, s2(l), 1) - call dswap(1, lambda(k), 1, lambda(l), 1) - state_ok(k) = .True. - state_ok(l) = .False. - exit - endif - enddo - endif - enddo - - if (state_following) then - - ! Compute overlap with U_in - ! ------------------------- - - integer :: order(N_st_diag) - double precision :: cmax - overlap = -1.d0 - do k=1,shift2 - do i=1,shift2 - overlap(k,i) = dabs(y(k,i)) - enddo - enddo - do k=1,N_st - cmax = -1.d0 - do i=1,shift2 - if (overlap(i,k) > cmax) then - cmax = overlap(i,k) - order(k) = i - endif - enddo - do i=1,shift2 - overlap(order(k),i) = -1.d0 - enddo - enddo - overlap = y - do k=1,N_st - l = order(k) - if (k /= l) then - y(1:shift2,k) = overlap(1:shift2,l) - endif - enddo - do k=1,N_st - overlap(k,1) = lambda(k) - overlap(k,2) = s2(k) - enddo - do k=1,N_st - l = order(k) - if (k /= l) then - lambda(k) = overlap(l,1) - s2(k) = overlap(l,2) - endif - enddo - - endif - - - ! Express eigenvectors of h in the determinant basis - ! -------------------------------------------------- - - call dgemm('N','N', sze, N_st_diag, shift2, & - 1.d0, U, size(U,1), y, size(y,1), 0.d0, U(1,shift2+1), size(U,1)) - call dgemm('N','N', sze, N_st_diag, shift2, & - 1.d0, W, size(W,1), y, size(y,1), 0.d0, W(1,shift2+1), size(W,1)) - call dgemm('N','N', sze, N_st_diag, shift2, & - 1.d0, S, size(S,1), y, size(y,1), 0.d0, S(1,shift2+1), size(S,1)) - - ! Compute residual vector and davidson step - ! ----------------------------------------- - - do k=1,N_st_diag - if (state_ok(k)) then - do i=1,sze - U(i,shift2+k) = (lambda(k) * U(i,shift2+k) - W(i,shift2+k) ) & - * (1.d0 + s2(k) * U(i,shift2+k) - S(i,shift2+k) - S_z2_Sz & - )/max(H_jj(i) - lambda (k),1.d-2) - enddo - else - ! Randomize components with bad - do i=1,sze-2,2 - call random_number(r1) - call random_number(r2) - r1 = dsqrt(-2.d0*dlog(r1)) - r2 = dtwo_pi*r2 - U(i,shift2+k) = r1*dcos(r2) - U(i+1,shift2+k) = r1*dsin(r2) - enddo - do i=sze-2+1,sze - call random_number(r1) - call random_number(r2) - r1 = dsqrt(-2.d0*dlog(r1)) - r2 = dtwo_pi*r2 - U(i,shift2+k) = r1*dcos(r2) - enddo - endif - - if (k <= N_st) then - residual_norm(k) = u_dot_u(U(1,shift2+k),sze) - to_print(1,k) = lambda(k) + nuclear_repulsion - to_print(2,k) = s2(k) - to_print(3,k) = residual_norm(k) - endif - enddo - - write(iunit,'(X,I3,X,100(X,F16.10,X,F11.6,X,E11.3))') iter, to_print(1:3,1:N_st) - call davidson_converged(lambda,residual_norm,wall,iter,cpu,N_st,converged) - do k=1,N_st - if (residual_norm(k) > 1.e8) then - print *, '' - stop 'Davidson failed' - endif - enddo - if (converged) then - exit - endif - - enddo - - ! Re-contract to u_in - ! ----------- - - call dgemm('N','N', sze, N_st_diag, shift2, 1.d0, & - U, size(U,1), y, size(y,1), 0.d0, u_in, size(u_in,1)) - - enddo - - do k=1,N_st_diag - energies(k) = lambda(k) - S2_jj(k) = s2(k) - enddo - write_buffer = '===== ' - do i=1,N_st - write_buffer = trim(write_buffer)//' ================ =========== ===========' - enddo - write(iunit,'(A)') trim(write_buffer) - write(iunit,'(A)') '' - call write_time(iunit) - - call munmap( & - (/ int(sze_8,8),int(N_st_diag*itermax,8) /), & - 8, fd(1), c_pointer(1)) - - call munmap( & - (/ int(sze_8,8),int(N_st_diag*itermax,8) /), & - 8, fd(2), c_pointer(2)) - - call munmap( & - (/ int(sze_8,8),int(N_st_diag*itermax,8) /), & - 8, fd(3), c_pointer(3)) - - deallocate ( & - residual_norm, & - c, overlap, & - h, & - y, s_, s_tmp, & - lambda & - ) -end - diff --git a/src/Davidson/diagonalize_CI.irp.f b/src/Davidson/diagonalize_CI.irp.f index e1b67438..9b98ea91 100644 --- a/src/Davidson/diagonalize_CI.irp.f +++ b/src/Davidson/diagonalize_CI.irp.f @@ -66,7 +66,6 @@ END_PROVIDER call davidson_diag_HS2(psi_det,CI_eigenvectors, CI_eigenvectors_s2, & size(CI_eigenvectors,1),CI_electronic_energy, & N_det,min(N_det,N_states),min(N_det,N_states_diag),N_int,output_determinants) - else if (diag_algorithm == "Lapack") then diff --git a/src/Davidson/u0Hu0.irp.f b/src/Davidson/u0Hu0.irp.f index 117e704e..4f68f85a 100644 --- a/src/Davidson/u0Hu0.irp.f +++ b/src/Davidson/u0Hu0.irp.f @@ -1,4 +1,4 @@ -subroutine u_0_H_u_0(e_0,u_0,n,keys_tmp,Nint,N_st,sze_8) +subroutine u_0_H_u_0(e_0,u_0,n,keys_tmp,Nint,N_st,sze) use bitmasks implicit none BEGIN_DOC @@ -7,166 +7,20 @@ subroutine u_0_H_u_0(e_0,u_0,n,keys_tmp,Nint,N_st,sze_8) ! n : number of determinants ! END_DOC - integer, intent(in) :: n,Nint, N_st, sze_8 + integer, intent(in) :: n,Nint, N_st, sze double precision, intent(out) :: e_0(N_st) - double precision, intent(in) :: u_0(sze_8,N_st) + double precision, intent(inout):: u_0(sze,N_st) integer(bit_kind),intent(in) :: keys_tmp(Nint,2,n) - double precision, allocatable :: H_jj(:), v_0(:,:) + double precision, allocatable :: v_0(:,:), s_0(:,:) double precision :: u_dot_u,u_dot_v,diag_H_mat_elem integer :: i,j - allocate (H_jj(n), v_0(sze_8,N_st)) - do i = 1, n - H_jj(i) = diag_H_mat_elem(keys_tmp(1,1,i),Nint) - enddo - - call H_u_0_nstates(v_0,u_0,H_jj,n,keys_tmp,Nint,N_st,sze_8) + allocate (v_0(sze,N_st),s_0(sze,N_st)) + call H_S2_u_0_nstates_openmp(v_0,s_0,u_0,N_st,sze) do i=1,N_st e_0(i) = u_dot_v(v_0(1,i),u_0(1,i),n)/u_dot_u(u_0(1,i),n) enddo - deallocate (H_jj, v_0) -end - - -subroutine H_u_0_nstates(v_0,u_0,H_jj,n,keys_tmp,Nint,N_st,sze_8) - use bitmasks - implicit none - BEGIN_DOC - ! Computes v_0 = H|u_0> - ! - ! n : number of determinants - ! - ! H_jj : array of - END_DOC - integer, intent(in) :: N_st,n,Nint, sze_8 - double precision, intent(out) :: v_0(sze_8,N_st) - double precision, intent(in) :: u_0(sze_8,N_st) - double precision, intent(in) :: H_jj(n) - integer(bit_kind),intent(in) :: keys_tmp(Nint,2,n) - double precision :: hij - double precision, allocatable :: vt(:,:) - double precision, allocatable :: ut(:,:) - integer :: i,j,k,l, jj,ii - integer :: i0, j0 - - integer, allocatable :: shortcut(:,:), sort_idx(:,:) - integer(bit_kind), allocatable :: sorted(:,:,:), version(:,:,:) - integer(bit_kind) :: sorted_i(Nint) - - integer :: sh, sh2, ni, exa, ext, org_i, org_j, endi, istate - integer :: N_st_8 - - integer, external :: align_double - !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: vt, ut - - N_st_8 = align_double(N_st) - - ASSERT (Nint > 0) - ASSERT (Nint == N_int) - ASSERT (n>0) - PROVIDE ref_bitmask_energy - - allocate (shortcut(0:n+1,2), sort_idx(n,2), sorted(Nint,n,2), version(Nint,n,2)) - allocate(ut(N_st_8,n)) - - v_0 = 0.d0 - - do i=1,n - do istate=1,N_st - ut(istate,i) = u_0(i,istate) - enddo - enddo - - call sort_dets_ab_v(keys_tmp, sorted(1,1,1), sort_idx(1,1), shortcut(0,1), version(1,1,1), n, Nint) - call sort_dets_ba_v(keys_tmp, sorted(1,1,2), sort_idx(1,2), shortcut(0,2), version(1,1,2), n, Nint) - - !$OMP PARALLEL DEFAULT(NONE) & - !$OMP PRIVATE(i,hij,j,k,jj,vt,ii,sh,sh2,ni,exa,ext,org_i,org_j,endi,sorted_i,istate)& - !$OMP SHARED(n,H_jj,keys_tmp,ut,Nint,v_0,sorted,shortcut,sort_idx,version,N_st,N_st_8) - allocate(vt(N_st_8,n)) - Vt = 0.d0 - - !$OMP DO SCHEDULE(dynamic) - do sh=1,shortcut(0,1) - do sh2=1,shortcut(0,1) - exa = popcnt(xor(version(1,sh,1), version(1,sh2,1))) - if(exa > 2) then - cycle - end if - do ni=2,Nint - exa = exa + popcnt(xor(version(ni,sh,1), version(ni,sh2,1))) - end do - if(exa > 2) then - cycle - end if - - do i=shortcut(sh,1),shortcut(sh+1,1)-1 - org_i = sort_idx(i,1) - do ni=1,Nint - sorted_i(ni) = sorted(ni,i,1) - enddo - - jloop: do j=shortcut(sh2,1),shortcut(sh2+1,1)-1 - org_j = sort_idx(j,1) - ext = exa + popcnt(xor(sorted_i(1), sorted(1,j,1))) - if(ext > 4) then - cycle jloop - endif - do ni=2,Nint - ext = ext + popcnt(xor(sorted_i(ni), sorted(ni,j,1))) - if(ext > 4) then - cycle jloop - endif - end do - call i_H_j(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),Nint,hij) - do istate=1,N_st - vt (istate,org_i) = vt (istate,org_i) + hij*ut(istate,org_j) - enddo - enddo jloop - enddo - enddo - enddo - !$OMP END DO NOWAIT - - !$OMP DO SCHEDULE(dynamic) - do sh=1,shortcut(0,2) - do i=shortcut(sh,2),shortcut(sh+1,2)-1 - org_i = sort_idx(i,2) - do j=shortcut(sh,2),shortcut(sh+1,2)-1 - org_j = sort_idx(j,2) - ext = popcnt(xor(sorted(1,i,2), sorted(1,j,2))) - do ni=2,Nint - ext = ext + popcnt(xor(sorted(ni,i,2), sorted(ni,j,2))) - end do - if(ext /= 4) then - cycle - endif - call i_H_j(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),Nint,hij) - do istate=1,N_st - vt (istate,org_i) = vt (istate,org_i) + hij*ut(istate,org_j) - enddo - end do - end do - enddo - !$OMP END DO NOWAIT - - !$OMP CRITICAL - do istate=1,N_st - do i=n,1,-1 - v_0(i,istate) = v_0(i,istate) + vt(istate,i) - enddo - enddo - !$OMP END CRITICAL - - deallocate(vt) - !$OMP END PARALLEL - - do istate=1,N_st - do i=1,n - v_0(i,istate) += H_jj(i) * u_0(i,istate) - enddo - enddo - deallocate (shortcut, sort_idx, sorted, version, ut) + deallocate (s_0, v_0) end BEGIN_PROVIDER [ double precision, psi_energy, (N_states) ] @@ -178,338 +32,411 @@ BEGIN_PROVIDER [ double precision, psi_energy, (N_states) ] END_PROVIDER -subroutine H_S2_u_0_nstates_zmq(v_0,s_0,u_0,H_jj,S2_jj,n,keys_tmp,Nint,N_st,sze_8) + +subroutine H_S2_u_0_nstates_openmp(v_0,s_0,u_0,N_st,sze) use bitmasks - use f77_zmq implicit none BEGIN_DOC ! Computes v_0 = H|u_0> and s_0 = S^2 |u_0> ! - ! n : number of determinants + ! Assumes that the determinants are in psi_det ! - ! H_jj : array of - ! - ! S2_jj : array of + ! istart, iend, ishift, istep are used in ZMQ parallelization. END_DOC - integer, intent(in) :: N_st,n,Nint, sze_8 - double precision, intent(out) :: v_0(sze_8,N_st), s_0(sze_8,N_st) - double precision, intent(in) :: u_0(sze_8,N_st) - double precision, intent(in) :: H_jj(n), S2_jj(n) - integer(bit_kind),intent(in) :: keys_tmp(Nint,2,n) - double precision :: hij,s2 - double precision, allocatable :: ut(:,:) - integer :: i,j,k,l, jj,ii - integer :: i0, j0 - - integer, allocatable :: shortcut(:,:), sort_idx(:) - integer(bit_kind), allocatable :: sorted(:,:), version(:,:) - integer(bit_kind) :: sorted_i(Nint) - - integer :: sh, sh2, ni, exa, ext, org_i, org_j, endi, istate - integer :: N_st_8 - - integer, external :: align_double - integer :: blockb, blockb2, istep - double precision :: ave_workload, workload, target_workload_inv - - integer(ZMQ_PTR) :: handler - - if(N_st /= N_states_diag .or. sze_8 < N_det) stop "assert fail in H_S2_u_0_nstates" - N_st_8 = N_st ! align_double(N_st) - - ASSERT (Nint > 0) - ASSERT (Nint == N_int) - ASSERT (n>0) - PROVIDE ref_bitmask_energy - - allocate (shortcut(0:n+1,2), sort_idx(n), sorted(Nint,n), version(Nint,n)) - allocate(ut(N_st_8,n)) - + integer, intent(in) :: N_st,sze + double precision, intent(inout) :: v_0(sze,N_st), s_0(sze,N_st), u_0(sze,N_st) + integer :: k + double precision, allocatable :: u_t(:,:) + !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: u_t + allocate(u_t(N_st,N_det)) + do k=1,N_st + call dset_order(u_0(1,k),psi_bilinear_matrix_order,N_det) + enddo v_0 = 0.d0 s_0 = 0.d0 - - do i=1,n - do istate=1,N_st - ut(istate,i) = u_0(i,istate) - enddo + call dtranspose( & + u_0, & + size(u_0, 1), & + u_t, & + size(u_t, 1), & + N_det, N_st) + + call H_S2_u_0_nstates_openmp_work(v_0,s_0,u_t,N_st,sze,1,N_det,0,1) + deallocate(u_t) + + do k=1,N_st + call dset_order(v_0(1,k),psi_bilinear_matrix_order_reverse,N_det) + call dset_order(s_0(1,k),psi_bilinear_matrix_order_reverse,N_det) + call dset_order(u_0(1,k),psi_bilinear_matrix_order_reverse,N_det) enddo - call sort_dets_ab_v(keys_tmp, sorted, sort_idx, shortcut(0,1), version, n, Nint) - call sort_dets_ba_v(keys_tmp, sorted, sort_idx, shortcut(0,2), version, n, Nint) - - blockb = shortcut(0,1) - call davidson_init(handler,n,N_st_8,ut) - - ave_workload = 0.d0 - do sh=1,shortcut(0,1) - ave_workload += shortcut(0,1) - ave_workload += (shortcut(sh+1,1) - shortcut(sh,1))**2 - do i=sh, shortcut(0,2), shortcut(0,1) - do j=i, min(i, shortcut(0,2)) - ave_workload += (shortcut(j+1,2) - shortcut(j, 2))**2 - end do - end do - enddo - ave_workload = ave_workload/dble(shortcut(0,1)) - target_workload_inv = 0.001d0/ave_workload - - - do sh=1,shortcut(0,1),1 - workload = shortcut(0,1)+dble(shortcut(sh+1,1) - shortcut(sh,1))**2 - do i=sh, shortcut(0,2), shortcut(0,1) - do j=i, min(i, shortcut(0,2)) - workload += (shortcut(j+1,2) - shortcut(j, 2))**2 - end do - end do - istep = 1+ int(workload*target_workload_inv) - do blockb2=0, istep-1 - call davidson_add_task(handler, sh, blockb2, istep) - enddo - enddo - - call davidson_run(handler, v_0, s_0, size(v_0,1)) - - do istate=1,N_st - do i=1,n - v_0(i,istate) = v_0(i,istate) + H_jj(i) * u_0(i,istate) - s_0(i,istate) = s_0(i,istate) + s2_jj(i)* u_0(i,istate) - enddo - enddo - deallocate(shortcut, sort_idx, sorted, version) - deallocate(ut) end - -subroutine H_S2_u_0_nstates(v_0,s_0,u_0,H_jj,S2_jj,n,keys_tmp,Nint,N_st,sze_8) +subroutine H_S2_u_0_nstates_openmp_work(v_0,s_0,u_t,N_st,sze,istart,iend,ishift,istep) use bitmasks implicit none BEGIN_DOC ! Computes v_0 = H|u_0> and s_0 = S^2 |u_0> ! - ! n : number of determinants - ! - ! H_jj : array of - ! - ! S2_jj : array of + ! Default should be 1,N_det,0,1 END_DOC - integer, intent(in) :: N_st,n,Nint, sze_8 - double precision, intent(out) :: v_0(sze_8,N_st), s_0(sze_8,N_st) - double precision, intent(in) :: u_0(sze_8,N_st) - double precision, intent(in) :: H_jj(n), S2_jj(n) - integer(bit_kind),intent(in) :: keys_tmp(Nint,2,n) - double precision :: hij,s2 - double precision, allocatable :: vt(:,:), ut(:,:), st(:,:) - integer :: i,j,k,l, jj,ii - integer :: i0, j0 + integer, intent(in) :: N_st,sze,istart,iend,ishift,istep + double precision, intent(in) :: u_t(N_st,N_det) + double precision, intent(out) :: v_0(sze,N_st), s_0(sze,N_st) + - integer, allocatable :: shortcut(:,:), sort_idx(:,:) - integer(bit_kind), allocatable :: sorted(:,:,:), version(:,:,:) - integer(bit_kind) :: sorted_i(Nint) + PROVIDE ref_bitmask_energy N_int + + select case (N_int) + case (1) + call H_S2_u_0_nstates_openmp_work_1(v_0,s_0,u_t,N_st,sze,istart,iend,ishift,istep) + case (2) + call H_S2_u_0_nstates_openmp_work_2(v_0,s_0,u_t,N_st,sze,istart,iend,ishift,istep) + case (3) + call H_S2_u_0_nstates_openmp_work_3(v_0,s_0,u_t,N_st,sze,istart,iend,ishift,istep) + case (4) + call H_S2_u_0_nstates_openmp_work_4(v_0,s_0,u_t,N_st,sze,istart,iend,ishift,istep) + case default + call H_S2_u_0_nstates_openmp_work_N_int(v_0,s_0,u_t,N_st,sze,istart,iend,ishift,istep) + end select +end +BEGIN_TEMPLATE + +subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_0,s_0,u_t,N_st,sze,istart,iend,ishift,istep) + use bitmasks + implicit none + BEGIN_DOC + ! Computes v_0 = H|u_0> and s_0 = S^2 |u_0> + ! + ! Default should be 1,N_det,0,1 + END_DOC + integer, intent(in) :: N_st,sze,istart,iend,ishift,istep + double precision, intent(in) :: u_t(N_st,N_det) + double precision, intent(out) :: v_0(sze,N_st), s_0(sze,N_st) + + double precision :: hij, sij + integer :: i,j,k,l + integer :: k_a, k_b, l_a, l_b, m_a, m_b + integer :: istate + integer :: krow, kcol, krow_b, kcol_b + integer :: lrow, lcol + integer :: mrow, mcol + integer(bit_kind) :: spindet($N_int) + integer(bit_kind) :: tmp_det($N_int,2) + integer(bit_kind) :: tmp_det2($N_int,2) + integer(bit_kind) :: tmp_det3($N_int,2) + integer(bit_kind), allocatable :: buffer(:,:) + integer :: n_doubles + integer, allocatable :: doubles(:) + integer, allocatable :: singles_a(:) + integer, allocatable :: singles_b(:) + integer, allocatable :: idx(:), idx0(:) + integer :: maxab, n_singles_a, n_singles_b, kcol_prev, nmax + integer*8 :: k8 + double precision, allocatable :: v_t(:,:), s_t(:,:) + !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: v_t, s_t + + maxab = max(N_det_alpha_unique, N_det_beta_unique)+1 + allocate(idx0(maxab)) - integer :: sh, sh2, ni, exa, ext, org_i, org_j, endi, istate - integer :: N_st_8 - - integer, external :: align_double - integer :: blockb, blockb2, istep - double precision :: ave_workload, workload, target_workload_inv - - !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: vt, ut, st + do i=1,maxab + idx0(i) = i + enddo - N_st_8 = align_double(N_st) - - ASSERT (Nint > 0) - ASSERT (Nint == N_int) - ASSERT (n>0) - PROVIDE ref_bitmask_energy - - allocate (shortcut(0:n+1,2), sort_idx(n,2), sorted(Nint,n,2), version(Nint,n,2)) - allocate(ut(N_st_8,n)) - - v_0 = 0.d0 - s_0 = 0.d0 - - call sort_dets_ab_v(keys_tmp, sorted(1,1,1), sort_idx(1,1), shortcut(0,1), version(1,1,1), n, Nint) - call sort_dets_ba_v(keys_tmp, sorted(1,1,2), sort_idx(1,2), shortcut(0,2), version(1,1,2), n, Nint) + ! Prepare the array of all alpha single excitations + ! ------------------------------------------------- + PROVIDE N_int !$OMP PARALLEL DEFAULT(NONE) & - !$OMP PRIVATE(i,hij,s2,j,k,jj,vt,st,ii,sh,sh2,ni,exa,ext,org_i,org_j,endi,sorted_i,istate)& - !$OMP SHARED(n,keys_tmp,ut,Nint,u_0,v_0,s_0,sorted,shortcut,sort_idx,version,N_st,N_st_8) - allocate(vt(N_st_8,n),st(N_st_8,n)) - Vt = 0.d0 - St = 0.d0 + !$OMP SHARED(psi_bilinear_matrix_rows, N_det, & + !$OMP psi_bilinear_matrix_columns, & + !$OMP psi_det_alpha_unique, psi_det_beta_unique, & + !$OMP n_det_alpha_unique, n_det_beta_unique, N_int, & + !$OMP psi_bilinear_matrix_transp_rows, & + !$OMP psi_bilinear_matrix_transp_columns, & + !$OMP psi_bilinear_matrix_transp_order, N_st, & + !$OMP psi_bilinear_matrix_order_transp_reverse, & + !$OMP psi_bilinear_matrix_columns_loc, & + !$OMP istart, iend, istep, & + !$OMP ishift, idx0, u_t, maxab, v_0, s_0) & + !$OMP PRIVATE(krow, kcol, tmp_det, spindet, k_a, k_b, i, & + !$OMP lcol, lrow, l_a, l_b, nmax, & + !$OMP buffer, doubles, n_doubles, & + !$OMP tmp_det2, hij, sij, idx, l, kcol_prev, v_t, & + !$OMP singles_a, n_singles_a, singles_b, & + !$OMP n_singles_b, s_t, k8) + + ! Alpha/Beta double excitations + ! ============================= + + allocate( buffer($N_int,maxab), & + singles_a(maxab), & + singles_b(maxab), & + doubles(maxab), & + idx(maxab), & + v_t(N_st,N_det), s_t(N_st,N_det)) + kcol_prev=-1 - !$OMP DO - do i=1,n - do istate=1,N_st - ut(istate,i) = u_0(sort_idx(i,2),istate) - enddo - enddo - !$OMP END DO + v_t = 0.d0 + s_t = 0.d0 - !$OMP DO SCHEDULE(dynamic) - do sh=1,shortcut(0,2) - do i=shortcut(sh,2),shortcut(sh+1,2)-1 - org_i = sort_idx(i,2) - do j=shortcut(sh,2),shortcut(sh+1,2)-1 - org_j = sort_idx(j,2) - ext = popcnt(xor(sorted(1,i,2), sorted(1,j,2))) - if (ext > 4) cycle - do ni=2,Nint - ext = ext + popcnt(xor(sorted(ni,i,2), sorted(ni,j,2))) - if (ext > 4) exit - end do - if(ext == 4) then - call i_h_j (keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,hij) - call get_s2(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,s2) - do istate=1,n_st - vt (istate,org_i) = vt (istate,org_i) + hij*ut(istate,j) - st (istate,org_i) = st (istate,org_i) + s2*ut(istate,j) - enddo - end if - end do - end do - enddo - !$OMP END DO - !$OMP DO - do i=1,n - do istate=1,N_st - ut(istate,i) = u_0(sort_idx(i,1),istate) - enddo - enddo - !$OMP END DO + !$OMP DO SCHEDULE(dynamic,64) + do k_a=istart+ishift,iend,istep - !$OMP DO SCHEDULE(dynamic) - do sh=1,shortcut(0,1) - do sh2=1,shortcut(0,1) - if (sh==sh2) cycle + krow = psi_bilinear_matrix_rows(k_a) + kcol = psi_bilinear_matrix_columns(k_a) + + tmp_det(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, krow) + tmp_det(1:$N_int,2) = psi_det_beta_unique (1:$N_int, kcol) + + if (kcol /= kcol_prev) then + call get_all_spin_singles_$N_int( & + psi_det_beta_unique(1,kcol+1), idx0(kcol+1), & + tmp_det(1,2), N_det_beta_unique-kcol, & + singles_b, n_singles_b) + endif + kcol_prev = kcol - exa = 0 - do ni=1,Nint - exa = exa + popcnt(xor(version(ni,sh,1), version(ni,sh2,1))) - end do - if(exa > 2) then - cycle - end if + ! Loop over singly excited beta columns > current column + ! ------------------------------------------------------ - do i=shortcut(sh,1),shortcut(sh+1,1)-1 - org_i = sort_idx(i,1) - do ni=1,Nint - sorted_i(ni) = sorted(ni,i,1) - enddo + do i=1,n_singles_b + lcol = singles_b(i) - do j=shortcut(sh2,1),shortcut(sh2+1,1)-1 - ext = exa + popcnt(xor(sorted_i(1), sorted(1,j,1))) - if (ext > 4) cycle - do ni=2,Nint - ext = ext + popcnt(xor(sorted_i(ni), sorted(ni,j,1))) - if (ext > 4) exit - end do - if(ext <= 4) then - org_j = sort_idx(j,1) - call i_h_j (keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,hij) - if (hij /= 0.d0) then - do istate=1,n_st - vt (istate,org_i) = vt (istate,org_i) + hij*ut(istate,j) - enddo - endif - if (ext /= 2) then - call get_s2(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,s2) - if (s2 /= 0.d0) then - do istate=1,n_st - st (istate,org_i) = st (istate,org_i) + s2*ut(istate,j) - enddo - endif - endif - endif - enddo - - enddo - enddo - - exa = 0 - - do i=shortcut(sh,1),shortcut(sh+1,1)-1 - org_i = sort_idx(i,1) - do ni=1,Nint - sorted_i(ni) = sorted(ni,i,1) - enddo - - do j=shortcut(sh,1),i-1 - ext = exa + popcnt(xor(sorted_i(1), sorted(1,j,1))) - if (ext > 4) cycle - do ni=2,Nint - ext = ext + popcnt(xor(sorted_i(ni), sorted(ni,j,1))) - if (ext > 4) exit - end do - if(ext <= 4) then - org_j = sort_idx(j,1) - call i_h_j (keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,hij) - if (hij /= 0.d0) then - do istate=1,n_st - vt (istate,org_i) = vt (istate,org_i) + hij*ut(istate,j) - enddo - endif - if (ext /= 2) then - call get_s2(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,s2) - if (s2 /= 0.d0) then - do istate=1,n_st - st (istate,org_i) = st (istate,org_i) + s2*ut(istate,j) - enddo - endif - endif - endif + tmp_det2(1:$N_int,2) = psi_det_beta_unique(1:$N_int, lcol) + + l_a = psi_bilinear_matrix_columns_loc(lcol) + + nmax = psi_bilinear_matrix_columns_loc(lcol+1) - l_a + do j=1,nmax + lrow = psi_bilinear_matrix_rows(l_a) + buffer(1:$N_int,j) = psi_det_alpha_unique(1:$N_int, lrow) + idx(j) = l_a + l_a = l_a+1 enddo + j = j-1 - do j=i+1,shortcut(sh+1,1)-1 - if (i==j) cycle - ext = exa + popcnt(xor(sorted_i(1), sorted(1,j,1))) - if (ext > 4) cycle - do ni=2,Nint - ext = ext + popcnt(xor(sorted_i(ni), sorted(ni,j,1))) - if (ext > 4) exit - end do - if(ext <= 4) then - org_j = sort_idx(j,1) - call i_h_j (keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,hij) - if (hij /= 0.d0) then - do istate=1,n_st - vt (istate,org_i) = vt (istate,org_i) + hij*ut(istate,j) - enddo - endif - if (ext /= 2) then - call get_s2(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,s2) - if (s2 /= 0.d0) then - do istate=1,n_st - st (istate,org_i) = st (istate,org_i) + s2*ut(istate,j) - enddo - endif - endif - endif + call get_all_spin_singles_$N_int( & + buffer, idx, tmp_det(1,1), j, & + singles_a, n_singles_a ) + + ! Loop over alpha singles + ! ----------------------- + + do k = 1,n_singles_a + l_a = singles_a(k) + lrow = psi_bilinear_matrix_rows(l_a) + tmp_det2(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, lrow) + call i_H_j_double_alpha_beta(tmp_det,tmp_det2,$N_int,hij) + call get_s2(tmp_det,tmp_det2,$N_int,sij) + do l=1,N_st + v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,l_a) + s_t(l,k_a) = s_t(l,k_a) + sij * u_t(l,l_a) + v_t(l,l_a) = v_t(l,l_a) + hij * u_t(l,k_a) + s_t(l,l_a) = s_t(l,l_a) + sij * u_t(l,k_a) + enddo enddo + enddo + enddo !$OMP END DO - !$OMP CRITICAL (u0Hu0) - do istate=1,N_st - do i=1,n - v_0(i,istate) = v_0(i,istate) + vt(istate,i) - s_0(i,istate) = s_0(i,istate) + st(istate,i) + !$OMP DO SCHEDULE(dynamic,64) + do k_a=istart+ishift,iend,istep + + + ! Single and double alpha excitations + ! =================================== + + + ! Initial determinant is at k_a in alpha-major representation + ! ----------------------------------------------------------------------- + + krow = psi_bilinear_matrix_rows(k_a) + kcol = psi_bilinear_matrix_columns(k_a) + + tmp_det(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, krow) + tmp_det(1:$N_int,2) = psi_det_beta_unique (1:$N_int, kcol) + + ! Initial determinant is at k_b in beta-major representation + ! ---------------------------------------------------------------------- + + k_b = psi_bilinear_matrix_order_transp_reverse(k_a) + + spindet(1:$N_int) = tmp_det(1:$N_int,1) + + ! Loop inside the beta column to gather all the connected alphas + l_a = k_a+1 + nmax = min(N_det_alpha_unique, N_det - l_a) + do i=1,nmax + lcol = psi_bilinear_matrix_columns(l_a) + if (lcol /= kcol) exit + lrow = psi_bilinear_matrix_rows(l_a) + buffer(1:$N_int,i) = psi_det_alpha_unique(1:$N_int, lrow) + idx(i) = l_a + l_a = l_a+1 + enddo + i = i-1 + + call get_all_spin_singles_and_doubles_$N_int( & + buffer, idx, spindet, i, & + singles_a, doubles, n_singles_a, n_doubles ) + + ! Compute Hij for all alpha singles + ! ---------------------------------- + + tmp_det2(1:$N_int,2) = psi_det_beta_unique (1:$N_int, kcol) + do i=1,n_singles_a + l_a = singles_a(i) + lrow = psi_bilinear_matrix_rows(l_a) + tmp_det2(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, lrow) + call i_H_j_mono_spin( tmp_det, tmp_det2, $N_int, 1, hij) + do l=1,N_st + v_t(l,l_a) = v_t(l,l_a) + hij * u_t(l,k_a) + v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,l_a) + ! single => sij = 0 + enddo + enddo + + + ! Compute Hij for all alpha doubles + ! ---------------------------------- + + do i=1,n_doubles + l_a = doubles(i) + lrow = psi_bilinear_matrix_rows(l_a) + call i_H_j_double_spin( tmp_det(1,1), psi_det_alpha_unique(1, lrow), $N_int, hij) + do l=1,N_st + v_t(l,l_a) = v_t(l,l_a) + hij * u_t(l,k_a) + v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,l_a) + ! same spin => sij = 0 + enddo + enddo + + + + ! Single and double beta excitations + ! ================================== + + + ! Initial determinant is at k_a in alpha-major representation + ! ----------------------------------------------------------------------- + + krow = psi_bilinear_matrix_rows(k_a) + kcol = psi_bilinear_matrix_columns(k_a) + + tmp_det(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, krow) + tmp_det(1:$N_int,2) = psi_det_beta_unique (1:$N_int, kcol) + + spindet(1:$N_int) = tmp_det(1:$N_int,2) + + ! Initial determinant is at k_b in beta-major representation + ! ----------------------------------------------------------------------- + + k_b = psi_bilinear_matrix_order_transp_reverse(k_a) + + ! Loop inside the alpha row to gather all the connected betas + l_b = k_b+1 + nmax = min(N_det_beta_unique, N_det - l_b) + do i=1,nmax + lrow = psi_bilinear_matrix_transp_rows(l_b) + if (lrow /= krow) exit + lcol = psi_bilinear_matrix_transp_columns(l_b) + buffer(1:$N_int,i) = psi_det_beta_unique(1:$N_int, lcol) + idx(i) = l_b + l_b = l_b+1 + enddo + i = i-1 + + call get_all_spin_singles_and_doubles_$N_int( & + buffer, idx, spindet, i, & + singles_b, doubles, n_singles_b, n_doubles ) + + ! Compute Hij for all beta singles + ! ---------------------------------- + + tmp_det2(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, krow) + do i=1,n_singles_b + l_b = singles_b(i) + lcol = psi_bilinear_matrix_transp_columns(l_b) + tmp_det2(1:$N_int,2) = psi_det_beta_unique (1:$N_int, lcol) + call i_H_j_mono_spin( tmp_det, tmp_det2, $N_int, 2, hij) + l_a = psi_bilinear_matrix_transp_order(l_b) + do l=1,N_st + v_t(l,l_a) = v_t(l,l_a) + hij * u_t(l,k_a) + v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,l_a) + ! single => sij = 0 + enddo + enddo + + ! Compute Hij for all beta doubles + ! ---------------------------------- + + do i=1,n_doubles + l_b = doubles(i) + lcol = psi_bilinear_matrix_transp_columns(l_b) + call i_H_j_double_spin( tmp_det(1,2), psi_det_beta_unique(1, lcol), $N_int, hij) + l_a = psi_bilinear_matrix_transp_order(l_b) + do l=1,N_st + v_t(l,l_a) = v_t(l,l_a) + hij * u_t(l,k_a) + v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,l_a) + ! same spin => sij = 0 + enddo + enddo + + + ! Diagonal contribution + ! ===================== + + + ! Initial determinant is at k_a in alpha-major representation + ! ----------------------------------------------------------------------- + + krow = psi_bilinear_matrix_rows(k_a) + kcol = psi_bilinear_matrix_columns(k_a) + + tmp_det(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, krow) + tmp_det(1:$N_int,2) = psi_det_beta_unique (1:$N_int, kcol) + + double precision, external :: diag_H_mat_elem, diag_S_mat_elem + + hij = diag_H_mat_elem(tmp_det,$N_int) + sij = diag_S_mat_elem(tmp_det,$N_int) + do l=1,N_st + v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,k_a) + s_t(l,k_a) = s_t(l,k_a) + sij * u_t(l,k_a) + enddo + + end do + !$OMP END DO NOWAIT + deallocate(buffer, singles_a, singles_b, doubles, idx) + + !$OMP CRITICAL + do l=1,N_st + do i=1, N_det + v_0(i,l) = v_0(i,l) + v_t(l,i) + s_0(i,l) = s_0(i,l) + s_t(l,i) enddo enddo - !$OMP END CRITICAL (u0Hu0) + !$OMP END CRITICAL + deallocate(v_t, s_t) - deallocate(vt,st) + !$OMP BARRIER !$OMP END PARALLEL - do istate=1,N_st - do i=1,n - v_0(i,istate) = v_0(i,istate) + H_jj(i) * u_0(i,istate) - s_0(i,istate) = s_0(i,istate) + s2_jj(i)* u_0(i,istate) - enddo - enddo - deallocate (shortcut, sort_idx, sorted, version, ut) end +SUBST [ N_int ] + +1;; +2;; +3;; +4;; +N_int;; + +END_TEMPLATE + + diff --git a/src/Davidson/u0Hu0_old.irp.f b/src/Davidson/u0Hu0_old.irp.f new file mode 100644 index 00000000..70aea449 --- /dev/null +++ b/src/Davidson/u0Hu0_old.irp.f @@ -0,0 +1,517 @@ + +subroutine H_u_0_nstates(v_0,u_0,H_jj,n,keys_tmp,Nint,N_st,sze) + use bitmasks + implicit none + BEGIN_DOC + ! Computes v_0 = H|u_0> + ! + ! n : number of determinants + ! + ! H_jj : array of + ! + END_DOC + integer, intent(in) :: N_st,n,Nint, sze + double precision, intent(out) :: v_0(sze,N_st) + double precision, intent(in) :: u_0(sze,N_st) + double precision, intent(in) :: H_jj(n) + integer(bit_kind),intent(in) :: keys_tmp(Nint,2,n) + double precision :: hij,s2 + double precision, allocatable :: vt(:,:), ut(:,:), st(:,:) + integer :: i,j,k,l, jj,ii + integer :: i0, j0 + + integer, allocatable :: shortcut(:,:), sort_idx(:,:) + integer(bit_kind), allocatable :: sorted(:,:,:), version(:,:,:) + integer(bit_kind) :: sorted_i(Nint) + + integer :: sh, sh2, ni, exa, ext, org_i, org_j, endi, istate + integer :: N_st_8 + + integer, external :: align_double + + !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: vt, ut, st + + N_st_8 = align_double(N_st) + + ASSERT (Nint > 0) + ASSERT (Nint == N_int) + ASSERT (n>0) + PROVIDE ref_bitmask_energy + + allocate (shortcut(0:n+1,2), sort_idx(n,2), sorted(Nint,n,2), version(Nint,n,2)) + allocate( ut(N_st_8,n)) + + v_0 = 0.d0 + + call sort_dets_ab_v(keys_tmp, sorted(1,1,1), sort_idx(1,1), shortcut(0,1), version(1,1,1), n, Nint) + call sort_dets_ba_v(keys_tmp, sorted(1,1,2), sort_idx(1,2), shortcut(0,2), version(1,1,2), n, Nint) + + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP PRIVATE(i,hij,s2,j,k,jj,vt,st,ii,sh,sh2,ni,exa,ext,org_i,org_j,endi,sorted_i,istate)& + !$OMP SHARED(n,keys_tmp,ut,Nint,u_0,v_0,sorted,shortcut,sort_idx,version,N_st,N_st_8) + allocate(vt(N_st_8,n),st(N_st_8,n)) + Vt = 0.d0 + St = 0.d0 + + !$OMP DO + do i=1,n + do istate=1,N_st + ut(istate,i) = u_0(sort_idx(i,2),istate) + enddo + enddo + !$OMP END DO + + !$OMP DO SCHEDULE(static,1) + do sh=1,shortcut(0,2) + do i=shortcut(sh,2),shortcut(sh+1,2)-1 + org_i = sort_idx(i,2) + do j=shortcut(sh,2),shortcut(sh+1,2)-1 + org_j = sort_idx(j,2) + ext = popcnt(xor(sorted(1,i,2), sorted(1,j,2))) + if (ext > 4) cycle + do ni=2,Nint + ext = ext + popcnt(xor(sorted(ni,i,2), sorted(ni,j,2))) + if (ext > 4) exit + end do + if(ext == 4) then + call i_h_j (keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,hij) + call get_s2(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,s2) + do istate=1,n_st + vt (istate,org_i) = vt (istate,org_i) + hij*ut(istate,j) + st (istate,org_i) = st (istate,org_i) + s2*ut(istate,j) + enddo + end if + end do + end do + enddo + !$OMP END DO + + !$OMP DO + do i=1,n + do istate=1,N_st + ut(istate,i) = u_0(sort_idx(i,1),istate) + enddo + enddo + !$OMP END DO + + !$OMP DO SCHEDULE(static,1) + do sh=1,shortcut(0,1) + do sh2=1,shortcut(0,1) + if (sh==sh2) cycle + + exa = 0 + do ni=1,Nint + exa = exa + popcnt(xor(version(ni,sh,1), version(ni,sh2,1))) + end do + if(exa > 2) then + cycle + end if + + do i=shortcut(sh,1),shortcut(sh+1,1)-1 + org_i = sort_idx(i,1) + do ni=1,Nint + sorted_i(ni) = sorted(ni,i,1) + enddo + + do j=shortcut(sh2,1),shortcut(sh2+1,1)-1 + ext = exa + popcnt(xor(sorted_i(1), sorted(1,j,1))) + if (ext > 4) cycle + do ni=2,Nint + ext = ext + popcnt(xor(sorted_i(ni), sorted(ni,j,1))) + if (ext > 4) exit + end do + if(ext <= 4) then + org_j = sort_idx(j,1) + call i_h_j (keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,hij) + if (hij /= 0.d0) then + do istate=1,n_st + vt (istate,org_i) = vt (istate,org_i) + hij*ut(istate,j) + enddo + endif + if (ext /= 2) then + call get_s2(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,s2) + if (s2 /= 0.d0) then + do istate=1,n_st + st (istate,org_i) = st (istate,org_i) + s2*ut(istate,j) + enddo + endif + endif + endif + enddo + + enddo + enddo + + exa = 0 + + do i=shortcut(sh,1),shortcut(sh+1,1)-1 + org_i = sort_idx(i,1) + do ni=1,Nint + sorted_i(ni) = sorted(ni,i,1) + enddo + + do j=shortcut(sh,1),i-1 + ext = exa + popcnt(xor(sorted_i(1), sorted(1,j,1))) + if (ext > 4) cycle + do ni=2,Nint + ext = ext + popcnt(xor(sorted_i(ni), sorted(ni,j,1))) + if (ext > 4) exit + end do + if(ext <= 4) then + org_j = sort_idx(j,1) + call i_h_j (keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,hij) + if (hij /= 0.d0) then + do istate=1,n_st + vt (istate,org_i) = vt (istate,org_i) + hij*ut(istate,j) + enddo + endif + if (ext /= 2) then + call get_s2(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,s2) + if (s2 /= 0.d0) then + do istate=1,n_st + st (istate,org_i) = st (istate,org_i) + s2*ut(istate,j) + enddo + endif + endif + endif + enddo + + do j=i+1,shortcut(sh+1,1)-1 + if (i==j) cycle + ext = exa + popcnt(xor(sorted_i(1), sorted(1,j,1))) + if (ext > 4) cycle + do ni=2,Nint + ext = ext + popcnt(xor(sorted_i(ni), sorted(ni,j,1))) + if (ext > 4) exit + end do + if(ext <= 4) then + org_j = sort_idx(j,1) + call i_h_j (keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,hij) + if (hij /= 0.d0) then + do istate=1,n_st + vt (istate,org_i) = vt (istate,org_i) + hij*ut(istate,j) + enddo + endif + if (ext /= 2) then + call get_s2(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,s2) + if (s2 /= 0.d0) then + do istate=1,n_st + st (istate,org_i) = st (istate,org_i) + s2*ut(istate,j) + enddo + endif + endif + endif + enddo + enddo + enddo + !$OMP END DO + + do istate=1,N_st + do i=1,n + !$OMP ATOMIC + v_0(i,istate) = v_0(i,istate) + vt(istate,i) + enddo + enddo + + deallocate(vt,st) + !$OMP END PARALLEL + + do istate=1,N_st + do i=1,n + v_0(i,istate) = v_0(i,istate) + H_jj(i) * u_0(i,istate) + enddo + enddo + deallocate (shortcut, sort_idx, sorted, version, ut) +end + + + + + +subroutine H_S2_u_0_nstates(v_0,s_0,u_0,H_jj,S2_jj,n,keys_tmp,Nint,N_st,sze) + use bitmasks + implicit none + BEGIN_DOC + ! Computes v_0 = H|u_0> and s_0 = S^2 |u_0> + ! + ! n : number of determinants + ! + ! H_jj : array of + ! + ! S2_jj : array of + END_DOC + integer, intent(in) :: N_st,n,Nint, sze + double precision, intent(out) :: v_0(sze,N_st), s_0(sze,N_st) + double precision, intent(in) :: u_0(sze,N_st) + double precision, intent(in) :: H_jj(n), S2_jj(n) + integer(bit_kind),intent(in) :: keys_tmp(Nint,2,n) + double precision :: hij,s2 + double precision, allocatable :: vt(:,:), ut(:,:), st(:,:) + integer :: i,j,k,l, jj,ii + integer :: i0, j0 + + integer, allocatable :: shortcut(:,:), sort_idx(:,:) + integer(bit_kind), allocatable :: sorted(:,:,:), version(:,:,:) + integer(bit_kind) :: sorted_i(Nint) + + integer :: sh, sh2, ni, exa, ext, org_i, org_j, endi, istate + integer :: N_st_8 + + integer, external :: align_double + + !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: vt, ut, st + + N_st_8 = align_double(N_st) + + ASSERT (Nint > 0) + ASSERT (Nint == N_int) + ASSERT (n>0) + PROVIDE ref_bitmask_energy + + allocate (shortcut(0:n+1,2), sort_idx(n,2), sorted(Nint,n,2), version(Nint,n,2)) + allocate( ut(N_st_8,n)) + + v_0 = 0.d0 + s_0 = 0.d0 + + call sort_dets_ab_v(keys_tmp, sorted(1,1,1), sort_idx(1,1), shortcut(0,1), version(1,1,1), n, Nint) + call sort_dets_ba_v(keys_tmp, sorted(1,1,2), sort_idx(1,2), shortcut(0,2), version(1,1,2), n, Nint) + + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP PRIVATE(i,hij,s2,j,k,jj,vt,st,ii,sh,sh2,ni,exa,ext,org_i,org_j,endi,sorted_i,istate)& + !$OMP SHARED(n,keys_tmp,ut,Nint,u_0,v_0,s_0,sorted,shortcut,sort_idx,version,N_st,N_st_8) + allocate(vt(N_st_8,n),st(N_st_8,n)) + Vt = 0.d0 + St = 0.d0 + + !$OMP DO + do i=1,n + do istate=1,N_st + ut(istate,i) = u_0(sort_idx(i,2),istate) + enddo + enddo + !$OMP END DO + + !$OMP DO SCHEDULE(static,4) + do sh=1,shortcut(0,2) + do i=shortcut(sh,2),shortcut(sh+1,2)-1 + org_i = sort_idx(i,2) + do j=shortcut(sh,2),shortcut(sh+1,2)-1 + org_j = sort_idx(j,2) + ext = popcnt(xor(sorted(1,i,2), sorted(1,j,2))) + if (ext > 4) cycle + do ni=2,Nint + ext = ext + popcnt(xor(sorted(ni,i,2), sorted(ni,j,2))) + if (ext > 4) exit + end do + if(ext == 4) then + call i_h_j (keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,hij) + call get_s2(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,s2) + do istate=1,n_st + vt (istate,org_i) = vt (istate,org_i) + hij*ut(istate,j) + st (istate,org_i) = st (istate,org_i) + s2*ut(istate,j) + enddo + end if + end do + end do + enddo + !$OMP END DO + + !$OMP DO + do i=1,n + do istate=1,N_st + ut(istate,i) = u_0(sort_idx(i,1),istate) + enddo + enddo + !$OMP END DO + + !$OMP DO SCHEDULE(static,4) + do sh=1,shortcut(0,1) + do sh2=1,shortcut(0,1) + if (sh==sh2) cycle + + exa = 0 + do ni=1,Nint + exa = exa + popcnt(xor(version(ni,sh,1), version(ni,sh2,1))) + end do + if(exa > 2) then + cycle + end if + + do i=shortcut(sh,1),shortcut(sh+1,1)-1 + org_i = sort_idx(i,1) + do ni=1,Nint + sorted_i(ni) = sorted(ni,i,1) + enddo + + do j=shortcut(sh2,1),shortcut(sh2+1,1)-1 + ext = exa + popcnt(xor(sorted_i(1), sorted(1,j,1))) + if (ext > 4) cycle + do ni=2,Nint + ext = ext + popcnt(xor(sorted_i(ni), sorted(ni,j,1))) + if (ext > 4) exit + end do + if(ext <= 4) then + org_j = sort_idx(j,1) + call i_h_j (keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,hij) + if (hij /= 0.d0) then + do istate=1,n_st + vt (istate,org_i) = vt (istate,org_i) + hij*ut(istate,j) + enddo + endif + if (ext /= 2) then + call get_s2(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,s2) + if (s2 /= 0.d0) then + do istate=1,n_st + st (istate,org_i) = st (istate,org_i) + s2*ut(istate,j) + enddo + endif + endif + endif + enddo + + enddo + enddo + + exa = 0 + + do i=shortcut(sh,1),shortcut(sh+1,1)-1 + org_i = sort_idx(i,1) + do ni=1,Nint + sorted_i(ni) = sorted(ni,i,1) + enddo + + do j=shortcut(sh,1),i-1 + ext = exa + popcnt(xor(sorted_i(1), sorted(1,j,1))) + if (ext > 4) cycle + do ni=2,Nint + ext = ext + popcnt(xor(sorted_i(ni), sorted(ni,j,1))) + if (ext > 4) exit + end do + if(ext <= 4) then + org_j = sort_idx(j,1) + call i_h_j (keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,hij) + if (hij /= 0.d0) then + do istate=1,n_st + vt (istate,org_i) = vt (istate,org_i) + hij*ut(istate,j) + enddo + endif + if (ext /= 2) then + call get_s2(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,s2) + if (s2 /= 0.d0) then + do istate=1,n_st + st (istate,org_i) = st (istate,org_i) + s2*ut(istate,j) + enddo + endif + endif + endif + enddo + + do j=i+1,shortcut(sh+1,1)-1 + ext = exa + popcnt(xor(sorted_i(1), sorted(1,j,1))) + if (ext > 4) cycle + do ni=2,Nint + ext = ext + popcnt(xor(sorted_i(ni), sorted(ni,j,1))) + if (ext > 4) exit + end do + if(ext <= 4) then + org_j = sort_idx(j,1) + call i_h_j (keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,hij) + if (hij /= 0.d0) then + do istate=1,n_st + vt (istate,org_i) = vt (istate,org_i) + hij*ut(istate,j) + enddo + endif + if (ext /= 2) then + call get_s2(keys_tmp(1,1,org_j),keys_tmp(1,1,org_i),nint,s2) + if (s2 /= 0.d0) then + do istate=1,n_st + st (istate,org_i) = st (istate,org_i) + s2*ut(istate,j) + enddo + endif + endif + endif + enddo + enddo + enddo + !$OMP END DO + + do istate=1,N_st + do i=1,n + !$OMP ATOMIC + v_0(i,istate) = v_0(i,istate) + vt(istate,i) + !$OMP ATOMIC + s_0(i,istate) = s_0(i,istate) + st(istate,i) + enddo + enddo + + deallocate(vt,st) + !$OMP END PARALLEL + + do istate=1,N_st + do i=1,n + v_0(i,istate) = v_0(i,istate) + H_jj(i) * u_0(i,istate) + s_0(i,istate) = s_0(i,istate) + s2_jj(i)* u_0(i,istate) + enddo + enddo + deallocate (shortcut, sort_idx, sorted, version, ut) +end + +subroutine H_S2_u_0_nstates_test(v_0,s_0,u_0,H_jj,S2_jj,n,keys_tmp,Nint,N_st,sze) + use bitmasks + implicit none + integer, intent(in) :: N_st,n,Nint, sze + integer(bit_kind), intent(in) :: keys_tmp(Nint,2,n) + double precision, intent(inout) :: v_0(sze,N_st), s_0(sze,N_st) + double precision, intent(in) :: u_0(sze,N_st) + double precision, intent(in) :: H_jj(n), S2_jj(n) + + PROVIDE ref_bitmask_energy + + double precision, allocatable :: vt(:,:) + integer, allocatable :: idx(:) + integer :: i,j, jj, l + double precision :: hij + + do i=1,n + v_0(i,:) = H_jj(i) * u_0(i,:) + enddo + + allocate(idx(0:n), vt(N_st,n)) + Vt = 0.d0 + !$OMP PARALLEL DO DEFAULT(shared) PRIVATE(i,idx,jj,j,degree,exc,phase,hij,l) SCHEDULE(static,1) + do i=2,n + idx(0) = i + call filter_connected(keys_tmp,keys_tmp(1,1,i),Nint,i-1,idx) + do jj=1,idx(0) + j = idx(jj) + double precision :: phase + integer :: degree + integer :: exc(0:2,2,2) + call get_excitation(keys_tmp(1,1,j),keys_tmp(1,1,i),exc,degree,phase,Nint) +! if ((degree == 2).and.(exc(0,1,1)==1)) then +! continue +! else +! cycle +! endif +! if ((degree == 2).and.(exc(0,1,1)==1)) cycle +! if ((degree > 1)) cycle +! if (exc(0,1,2) /= 0) cycle +! if (exc(0,1,1) == 2) cycle +! if (exc(0,1,2) == 2) cycle +! if ((degree==1).and.(exc(0,1,2) == 1)) cycle + call i_H_j(keys_tmp(1,1,j),keys_tmp(1,1,i),Nint,hij) + do l=1,N_st + !$OMP ATOMIC + vt (l,i) = vt (l,i) + hij*u_0(j,l) + !$OMP ATOMIC + vt (l,j) = vt (l,j) + hij*u_0(i,l) + enddo + enddo + enddo + !$OMP END PARALLEL DO + do i=1,n + v_0(i,:) = v_0(i,:) + vt(:,i) + enddo +end + diff --git a/src/Determinants/s2.irp.f b/src/Determinants/s2.irp.f index 7e62befb..4409502b 100644 --- a/src/Determinants/s2.irp.f +++ b/src/Determinants/s2.irp.f @@ -1,3 +1,35 @@ +double precision function diag_S_mat_elem(key_i,Nint) + implicit none + use bitmasks + include 'Utils/constants.include.F' + + integer :: Nint + integer(bit_kind), intent(in) :: key_i(Nint,2) + BEGIN_DOC +! Returns + END_DOC + integer :: nup, i + integer(bit_kind) :: xorvec(N_int_max) + !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: xorvec + + do i=1,Nint + xorvec(i) = xor(key_i(i,1),key_i(i,2)) + enddo + + do i=1,Nint + xorvec(i) = iand(xorvec(i),key_i(i,1)) + enddo + + nup = 0 + do i=1,Nint + if (xorvec(i) /= 0_bit_kind) then + nup += popcnt(xorvec(i)) + endif + enddo + diag_S_mat_elem = dble(nup) + +end + subroutine get_s2(key_i,key_j,Nint,s2) implicit none use bitmasks @@ -25,11 +57,9 @@ subroutine get_s2(key_i,key_j,Nint,s2) endif endif case(0) - nup = 0 - do i=1,Nint - nup += popcnt(iand(xor(key_i(i,1),key_i(i,2)),key_i(i,1))) - enddo - s2 = dble(nup) + double precision, external :: diag_S_mat_elem + !DIR$ FORCEINLINE + s2 = diag_S_mat_elem(key_i,Nint) end select end diff --git a/src/Determinants/slater_rules.irp.f b/src/Determinants/slater_rules.irp.f index 789dc93c..4d5b1bd3 100644 --- a/src/Determinants/slater_rules.irp.f +++ b/src/Determinants/slater_rules.irp.f @@ -1,32 +1,60 @@ subroutine get_excitation_degree(key1,key2,degree,Nint) use bitmasks + include 'Utils/constants.include.F' implicit none BEGIN_DOC ! Returns the excitation degree between two determinants END_DOC integer, intent(in) :: Nint - integer(bit_kind), intent(in) :: key1(Nint,2) - integer(bit_kind), intent(in) :: key2(Nint,2) + integer(bit_kind), intent(in) :: key1(Nint*2) + integer(bit_kind), intent(in) :: key2(Nint*2) integer, intent(out) :: degree + integer(bit_kind) :: xorvec(2*N_int_max) + !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: xorvec integer :: l ASSERT (Nint > 0) - degree = popcnt(xor( key1(1,1), key2(1,1))) + & - popcnt(xor( key1(1,2), key2(1,2))) - !DIR$ NOUNROLL - do l=2,Nint - degree = degree+ popcnt(xor( key1(l,1), key2(l,1))) + & - popcnt(xor( key1(l,2), key2(l,2))) - enddo - ASSERT (degree >= 0) + select case (Nint) + + case (1) + xorvec(1) = xor( key1(1), key2(1)) + xorvec(2) = xor( key1(2), key2(2)) + degree = sum(popcnt(xorvec(1:2))) + + case (2) + xorvec(1) = xor( key1(1), key2(1)) + xorvec(2) = xor( key1(2), key2(2)) + xorvec(3) = xor( key1(3), key2(3)) + xorvec(4) = xor( key1(4), key2(4)) + degree = sum(popcnt(xorvec(1:4))) + + case (3) + do l=1,6 + xorvec(l) = xor( key1(l), key2(l)) + enddo + degree = sum(popcnt(xorvec(1:6))) + + case (4) + do l=1,8 + xorvec(l) = xor( key1(l), key2(l)) + enddo + degree = sum(popcnt(xorvec(1:8))) + + case default + do l=1,ishft(Nint,1) + xorvec(l) = xor( key1(l), key2(l)) + enddo + degree = sum(popcnt(xorvec(1:l))) + + end select + degree = ishft(degree,-1) end - subroutine get_excitation(det1,det2,exc,degree,phase,Nint) use bitmasks implicit none @@ -139,72 +167,6 @@ subroutine decode_exc(exc,degree,h1,p1,h2,p2,s1,s2) end -subroutine decode_exc_int2(exc,degree,h1,p1,h2,p2,s1,s2) - use bitmasks - implicit none - BEGIN_DOC - ! Decodes the exc arrays returned by get_excitation. - ! h1,h2 : Holes - ! p1,p2 : Particles - ! s1,s2 : Spins (1:alpha, 2:beta) - ! degree : Degree of excitation - END_DOC - integer, intent(in) :: exc(0:2,2,2),degree - integer*2, intent(out) :: h1,h2,p1,p2,s1,s2 - ASSERT (degree > 0) - ASSERT (degree < 3) - - select case(degree) - case(2) - if (exc(0,1,1) == 2) then - h1 = exc(1,1,1) - h2 = exc(2,1,1) - p1 = exc(1,2,1) - p2 = exc(2,2,1) - s1 = 1 - s2 = 1 - else if (exc(0,1,2) == 2) then - h1 = exc(1,1,2) - h2 = exc(2,1,2) - p1 = exc(1,2,2) - p2 = exc(2,2,2) - s1 = 2 - s2 = 2 - else - h1 = exc(1,1,1) - h2 = exc(1,1,2) - p1 = exc(1,2,1) - p2 = exc(1,2,2) - s1 = 1 - s2 = 2 - endif - case(1) - if (exc(0,1,1) == 1) then - h1 = exc(1,1,1) - h2 = 0 - p1 = exc(1,2,1) - p2 = 0 - s1 = 1 - s2 = 0 - else - h1 = exc(1,1,2) - h2 = 0 - p1 = exc(1,2,2) - p2 = 0 - s1 = 2 - s2 = 0 - endif - case(0) - h1 = 0 - p1 = 0 - h2 = 0 - p2 = 0 - s1 = 0 - s2 = 0 - end select -end - - subroutine get_double_excitation(det1,det2,exc,phase,Nint) use bitmasks implicit none @@ -925,22 +887,29 @@ subroutine create_minilist(key_mask, fullList, miniList, idx_miniList, N_fullLis N_miniList = 0 + integer :: e_ab + e_ab = n_a+n_b do i=1,N_fullList - e_a = n_a - popcnt(iand(fullList(1, 1, i), key_mask(1, 1))) - e_b = n_b - popcnt(iand(fullList(1, 2, i), key_mask(1, 2))) + e_a = e_ab - popcnt(iand(fullList(1, 1, i), key_mask(1, 1))) & + - popcnt(iand(fullList(1, 2, i), key_mask(1, 2))) do ni=2,nint - e_a -= popcnt(iand(fullList(ni, 1, i), key_mask(ni, 1))) - e_b -= popcnt(iand(fullList(ni, 2, i), key_mask(ni, 2))) + e_a = e_a - popcnt(iand(fullList(ni, 1, i), key_mask(ni, 1))) & + - popcnt(iand(fullList(ni, 2, i), key_mask(ni, 2))) end do - if(e_a + e_b <= 2) then - N_miniList = N_miniList + 1 - do ni=1,Nint - miniList(ni,1,N_miniList) = fullList(ni,1,i) - miniList(ni,2,N_miniList) = fullList(ni,2,i) - enddo - idx_miniList(N_miniList) = i - end if + if(e_a > 2) then + cycle + endif + + N_miniList = N_miniList + 1 + miniList(1,1,N_miniList) = fullList(1,1,i) + miniList(1,2,N_miniList) = fullList(1,2,i) + do ni=2,Nint + miniList(ni,1,N_miniList) = fullList(ni,1,i) + miniList(ni,2,N_miniList) = fullList(ni,2,i) + enddo + idx_miniList(N_miniList) = i + end do end subroutine @@ -1041,13 +1010,15 @@ subroutine i_H_psi(key,keys,coef,Nint,Ndet,Ndet_max,Nstate,i_H_psi_array) double precision :: phase integer :: exc(0:2,2,2) double precision :: hij - integer :: idx(0:Ndet) + integer, allocatable :: idx(:) ASSERT (Nint > 0) ASSERT (N_int == Nint) ASSERT (Nstate > 0) ASSERT (Ndet > 0) ASSERT (Ndet_max >= Ndet) + allocate(idx(0:Ndet)) + i_H_psi_array = 0.d0 call filter_connected_i_H_psi0(keys,key,Nint,Ndet,idx) @@ -1089,7 +1060,7 @@ subroutine i_H_psi_minilist(key,keys,idx_key,N_minilist,coef,Nint,Ndet,Ndet_max, double precision :: phase integer :: exc(0:2,2,2) double precision :: hij - integer :: idx(0:Ndet) + integer, allocatable :: idx(:) BEGIN_DOC ! Computes = \sum_J c_J . ! @@ -1102,6 +1073,7 @@ subroutine i_H_psi_minilist(key,keys,idx_key,N_minilist,coef,Nint,Ndet,Ndet_max, ASSERT (Nstate > 0) ASSERT (Ndet > 0) ASSERT (Ndet_max >= Ndet) + allocate(idx(0:Ndet)) i_H_psi_array = 0.d0 call filter_connected_i_H_psi0(keys,key,Nint,N_minilist,idx) @@ -1148,7 +1120,8 @@ subroutine i_H_psi_sec_ord(key,keys,coef,Nint,Ndet,Ndet_max,Nstate,i_H_psi_array double precision :: phase integer :: exc(0:2,2,2) double precision :: hij - integer :: idx(0:Ndet),n_interact + integer,allocatable :: idx(:) + integer :: n_interact BEGIN_DOC ! for the various Nstates END_DOC @@ -1158,6 +1131,7 @@ subroutine i_H_psi_sec_ord(key,keys,coef,Nint,Ndet,Ndet_max,Nstate,i_H_psi_array ASSERT (Nstate > 0) ASSERT (Ndet > 0) ASSERT (Ndet_max >= Ndet) + allocate(idx(0:Ndet)) i_H_psi_array = 0.d0 call filter_connected_i_H_psi0(keys,key,Nint,Ndet,idx) n_interact = 0 @@ -1207,7 +1181,7 @@ subroutine i_H_psi_SC2(key,keys,coef,Nint,Ndet,Ndet_max,Nstate,i_H_psi_array,idx double precision :: phase integer :: exc(0:2,2,2) double precision :: hij - integer :: idx(0:Ndet) + integer,allocatable :: idx(:) ASSERT (Nint > 0) ASSERT (N_int == Nint) @@ -1215,6 +1189,7 @@ subroutine i_H_psi_SC2(key,keys,coef,Nint,Ndet,Ndet_max,Nstate,i_H_psi_array,idx ASSERT (Ndet > 0) ASSERT (Ndet_max >= Ndet) i_H_psi_array = 0.d0 + allocate(idx(0:Ndet)) call filter_connected_i_H_psi0_SC2(keys,key,Nint,Ndet,idx,idx_repeat) do ii=1,idx(0) i = idx(ii) @@ -1254,7 +1229,7 @@ subroutine i_H_psi_SC2_verbose(key,keys,coef,Nint,Ndet,Ndet_max,Nstate,i_H_psi_a double precision :: phase integer :: exc(0:2,2,2) double precision :: hij - integer :: idx(0:Ndet) + integer,allocatable :: idx(:) ASSERT (Nint > 0) ASSERT (N_int == Nint) @@ -1262,6 +1237,7 @@ subroutine i_H_psi_SC2_verbose(key,keys,coef,Nint,Ndet,Ndet_max,Nstate,i_H_psi_a ASSERT (Ndet > 0) ASSERT (Ndet_max >= Ndet) i_H_psi_array = 0.d0 + allocate(idx(0:Ndet)) call filter_connected_i_H_psi0_SC2(keys,key,Nint,Ndet,idx,idx_repeat) print*,'--------' do ii=1,idx(0) @@ -2140,8 +2116,8 @@ end subroutine get_phase(key1,key2,phase,Nint) use bitmasks implicit none - integer(bit_kind), intent(in) :: key1(Nint,2), key2(Nint,2) integer, intent(in) :: Nint + integer(bit_kind), intent(in) :: key1(Nint,2), key2(Nint,2) double precision, intent(out) :: phase BEGIN_DOC ! Returns the phase between key1 and key2 @@ -2192,3 +2168,424 @@ subroutine u_0_H_u_0_stored(e_0,u_0,hmatrix,sze) call matrix_vector_product(u_0,v_0,hmatrix,sze,sze) e_0 = u_dot_v(v_0,u_0,sze) end + + + +! Spin-determinant routines +! ------------------------- + +subroutine get_excitation_degree_spin(key1,key2,degree,Nint) + use bitmasks + include 'Utils/constants.include.F' + implicit none + BEGIN_DOC + ! Returns the excitation degree between two determinants + END_DOC + integer, intent(in) :: Nint + integer(bit_kind), intent(in) :: key1(Nint) + integer(bit_kind), intent(in) :: key2(Nint) + integer, intent(out) :: degree + + integer(bit_kind) :: xorvec(N_int_max) + integer :: l + + ASSERT (Nint > 0) + + select case (Nint) + + case (1) + xorvec(1) = xor( key1(1), key2(1)) + degree = popcnt(xorvec(1)) + + case (2) + xorvec(1) = xor( key1(1), key2(1)) + xorvec(2) = xor( key1(2), key2(2)) + degree = popcnt(xorvec(1))+popcnt(xorvec(2)) + + case (3) + xorvec(1) = xor( key1(1), key2(1)) + xorvec(2) = xor( key1(2), key2(2)) + xorvec(3) = xor( key1(3), key2(3)) + degree = sum(popcnt(xorvec(1:3))) + + case (4) + xorvec(1) = xor( key1(1), key2(1)) + xorvec(2) = xor( key1(2), key2(2)) + xorvec(3) = xor( key1(3), key2(3)) + xorvec(4) = xor( key1(4), key2(4)) + degree = sum(popcnt(xorvec(1:4))) + + case default + do l=1,N_int + xorvec(l) = xor( key1(l), key2(l)) + enddo + degree = sum(popcnt(xorvec(1:Nint))) + + end select + + degree = ishft(degree,-1) + +end + + +subroutine get_excitation_spin(det1,det2,exc,degree,phase,Nint) + use bitmasks + implicit none + BEGIN_DOC + ! Returns the excitation operators between two determinants and the phase + END_DOC + integer, intent(in) :: Nint + integer(bit_kind), intent(in) :: det1(Nint) + integer(bit_kind), intent(in) :: det2(Nint) + integer, intent(out) :: exc(0:2,2) + integer, intent(out) :: degree + double precision, intent(out) :: phase + ! exc(number,hole/particle) + ! ex : + ! exc(0,1) = number of holes + ! exc(0,2) = number of particles + ! exc(1,2) = first particle + ! exc(1,1) = first hole + + ASSERT (Nint > 0) + + !DIR$ FORCEINLINE + call get_excitation_degree_spin(det1,det2,degree,Nint) + select case (degree) + + case (3:) + degree = -1 + return + + case (2) + call get_double_excitation_spin(det1,det2,exc,phase,Nint) + return + + case (1) + call get_mono_excitation_spin(det1,det2,exc,phase,Nint) + return + + case(0) + return + + end select +end + +subroutine decode_exc_spin(exc,h1,p1,h2,p2) + use bitmasks + implicit none + BEGIN_DOC + ! Decodes the exc arrays returned by get_excitation. + ! h1,h2 : Holes + ! p1,p2 : Particles + END_DOC + integer, intent(in) :: exc(0:2,2) + integer, intent(out) :: h1,h2,p1,p2 + + select case (exc(0,1)) + case(2) + h1 = exc(1,1) + h2 = exc(2,1) + p1 = exc(1,2) + p2 = exc(2,2) + case(1) + h1 = exc(1,1) + h2 = 0 + p1 = exc(1,2) + p2 = 0 + case default + h1 = 0 + p1 = 0 + h2 = 0 + p2 = 0 + end select +end + + +subroutine get_double_excitation_spin(det1,det2,exc,phase,Nint) + use bitmasks + implicit none + BEGIN_DOC + ! Returns the two excitation operators between two doubly excited spin-determinants + ! and the phase + END_DOC + integer, intent(in) :: Nint + integer(bit_kind), intent(in) :: det1(Nint) + integer(bit_kind), intent(in) :: det2(Nint) + integer, intent(out) :: exc(0:2,2) + double precision, intent(out) :: phase + integer :: tz + integer :: l, idx_hole, idx_particle, ishift + integer :: nperm + integer :: i,j,k,m,n + integer :: high, low + integer :: a,b,c,d + integer(bit_kind) :: hole, particle, tmp + double precision, parameter :: phase_dble(0:1) = (/ 1.d0, -1.d0 /) + + ASSERT (Nint > 0) + nperm = 0 + exc(0,1) = 0 + exc(0,2) = 0 + + idx_particle = 0 + idx_hole = 0 + ishift = 1-bit_kind_size + do l=1,Nint + ishift = ishift + bit_kind_size + if (det1(l) == det2(l)) then + cycle + endif + tmp = xor( det1(l), det2(l) ) + particle = iand(tmp, det2(l)) + hole = iand(tmp, det1(l)) + do while (particle /= 0_bit_kind) + tz = trailz(particle) + idx_particle = idx_particle + 1 + exc(0,2) = exc(0,2) + 1 + exc(idx_particle,2) = tz+ishift + particle = iand(particle,particle-1_bit_kind) + enddo + if (iand(exc(0,1),exc(0,2))==2) then ! exc(0,1)==2 or exc(0,2)==2 + exit + endif + do while (hole /= 0_bit_kind) + tz = trailz(hole) + idx_hole = idx_hole + 1 + exc(0,1) = exc(0,1) + 1 + exc(idx_hole,1) = tz+ishift + hole = iand(hole,hole-1_bit_kind) + enddo + if (iand(exc(0,1),exc(0,2))==2) then ! exc(0,1)==2 or exc(0,2)==2 + exit + endif + enddo + + select case (exc(0,1)) + + case(1) + low = min(exc(1,1), exc(1,2)) + high = max(exc(1,1), exc(1,2)) + + ASSERT (low > 0) + j = ishft(low-1,-bit_kind_shift)+1 ! Find integer in array(Nint) + n = iand(low-1,bit_kind_size-1)+1 ! mod(low,bit_kind_size) + ASSERT (high > 0) + k = ishft(high-1,-bit_kind_shift)+1 + m = iand(high-1,bit_kind_size-1)+1 + + if (j==k) then + nperm = nperm + popcnt(iand(det1(j), & + iand( ibset(0_bit_kind,m-1)-1_bit_kind, & + ibclr(-1_bit_kind,n)+1_bit_kind ) )) + else + nperm = nperm + popcnt(iand(det1(k), & + ibset(0_bit_kind,m-1)-1_bit_kind)) + if (n < bit_kind_size) then + nperm = nperm + popcnt(iand(det1(j), ibclr(-1_bit_kind,n) +1_bit_kind)) + endif + do i=j+1,k-1 + nperm = nperm + popcnt(det1(i)) + end do + endif + + case (2) + + do i=1,2 + low = min(exc(i,1), exc(i,2)) + high = max(exc(i,1), exc(i,2)) + + ASSERT (low > 0) + j = ishft(low-1,-bit_kind_shift)+1 ! Find integer in array(Nint) + n = iand(low-1,bit_kind_size-1)+1 ! mod(low,bit_kind_size) + ASSERT (high > 0) + k = ishft(high-1,-bit_kind_shift)+1 + m = iand(high-1,bit_kind_size-1)+1 + + if (j==k) then + nperm = nperm + popcnt(iand(det1(j), & + iand( ibset(0_bit_kind,m-1)-1_bit_kind, & + ibclr(-1_bit_kind,n)+1_bit_kind ) )) + else + nperm = nperm + popcnt(iand(det1(k), & + ibset(0_bit_kind,m-1)-1_bit_kind)) + if (n < bit_kind_size) then + nperm = nperm + popcnt(iand(det1(j), ibclr(-1_bit_kind,n) +1_bit_kind)) + endif + do l=j+1,k-1 + nperm = nperm + popcnt(det1(l)) + end do + endif + + enddo + + a = min(exc(1,1), exc(1,2)) + b = max(exc(1,1), exc(1,2)) + c = min(exc(2,1), exc(2,2)) + d = max(exc(2,1), exc(2,2)) + if (c>a .and. cb) then + nperm = nperm + 1 + endif + end select + + phase = phase_dble(iand(nperm,1)) + +end + +subroutine get_mono_excitation_spin(det1,det2,exc,phase,Nint) + use bitmasks + implicit none + BEGIN_DOC + ! Returns the excitation operator between two singly excited determinants and the phase + END_DOC + integer, intent(in) :: Nint + integer(bit_kind), intent(in) :: det1(Nint) + integer(bit_kind), intent(in) :: det2(Nint) + integer, intent(out) :: exc(0:2,2) + double precision, intent(out) :: phase + integer :: tz + integer :: l, idx_hole, idx_particle, ishift + integer :: nperm + integer :: i,j,k,m,n + integer :: high, low + integer :: a,b,c,d + integer(bit_kind) :: hole, particle, tmp + double precision, parameter :: phase_dble(0:1) = (/ 1.d0, -1.d0 /) + + ASSERT (Nint > 0) + nperm = 0 + exc(0,1) = 0 + exc(0,2) = 0 + + ishift = 1-bit_kind_size + do l=1,Nint + ishift = ishift + bit_kind_size + if (det1(l) == det2(l)) then + cycle + endif + tmp = xor( det1(l), det2(l) ) + particle = iand(tmp, det2(l)) + hole = iand(tmp, det1(l)) + if (particle /= 0_bit_kind) then + tz = trailz(particle) + exc(0,2) = 1 + exc(1,2) = tz+ishift + endif + if (hole /= 0_bit_kind) then + tz = trailz(hole) + exc(0,1) = 1 + exc(1,1) = tz+ishift + endif + + if ( iand(exc(0,1),exc(0,2)) /= 1) then ! exc(0,1)/=1 and exc(0,2) /= 1 + cycle + endif + + low = min(exc(1,1),exc(1,2)) + high = max(exc(1,1),exc(1,2)) + + ASSERT (low > 0) + j = ishft(low-1,-bit_kind_shift)+1 ! Find integer in array(Nint) + n = iand(low-1,bit_kind_size-1)+1 ! mod(low,bit_kind_size) + ASSERT (high > 0) + k = ishft(high-1,-bit_kind_shift)+1 + m = iand(high-1,bit_kind_size-1)+1 + if (j==k) then + nperm = popcnt(iand(det1(j), & + iand(ibset(0_bit_kind,m-1)-1_bit_kind,ibclr(-1_bit_kind,n)+1_bit_kind))) + else + nperm = nperm + popcnt(iand(det1(k),ibset(0_bit_kind,m-1)-1_bit_kind)) + if (n < bit_kind_size) then + nperm = nperm + popcnt(iand(det1(j),ibclr(-1_bit_kind,n)+1_bit_kind)) + endif + do i=j+1,k-1 + nperm = nperm + popcnt(det1(i)) + end do + endif + phase = phase_dble(iand(nperm,1)) + return + + enddo +end + +subroutine i_H_j_mono_spin(key_i,key_j,Nint,spin,hij) + use bitmasks + implicit none + BEGIN_DOC + ! Returns where i and j are determinants differing by a single excitation + END_DOC + integer, intent(in) :: Nint, spin + integer(bit_kind), intent(in) :: key_i(Nint,2), key_j(Nint,2) + double precision, intent(out) :: hij + + integer :: exc(0:2,2) + double precision :: phase + + PROVIDE big_array_exchange_integrals mo_bielec_integrals_in_map + + call get_mono_excitation_spin(key_i(1,spin),key_j(1,spin),exc,phase,Nint) + call get_mono_excitation_from_fock(key_i,key_j,exc(1,1),exc(1,2),spin,phase,hij) +end + +subroutine i_H_j_double_spin(key_i,key_j,Nint,hij) + use bitmasks + implicit none + BEGIN_DOC + ! Returns where i and j are determinants differing by a same-spin double excitation + END_DOC + integer, intent(in) :: Nint + integer(bit_kind), intent(in) :: key_i(Nint), key_j(Nint) + double precision, intent(out) :: hij + + integer :: exc(0:2,2) + double precision :: phase + double precision, external :: get_mo_bielec_integral + + PROVIDE big_array_exchange_integrals mo_bielec_integrals_in_map + + call get_double_excitation_spin(key_i,key_j,exc,phase,Nint) + hij = phase*(get_mo_bielec_integral( & + exc(1,1), & + exc(2,1), & + exc(1,2), & + exc(2,2), mo_integrals_map) - & + get_mo_bielec_integral( & + exc(1,1), & + exc(2,1), & + exc(2,2), & + exc(1,2), mo_integrals_map) ) +end + +subroutine i_H_j_double_alpha_beta(key_i,key_j,Nint,hij) + use bitmasks + implicit none + BEGIN_DOC + ! Returns where i and j are determinants differing by an opposite-spin double excitation + END_DOC + integer, intent(in) :: Nint + integer(bit_kind), intent(in) :: key_i(Nint,2), key_j(Nint,2) + double precision, intent(out) :: hij + + integer :: exc(0:2,2,2) + double precision :: phase, phase2 + double precision, external :: get_mo_bielec_integral + + PROVIDE big_array_exchange_integrals mo_bielec_integrals_in_map + + call get_mono_excitation_spin(key_i(1,1),key_j(1,1),exc(0,1,1),phase,Nint) + call get_mono_excitation_spin(key_i(1,2),key_j(1,2),exc(0,1,2),phase2,Nint) + phase = phase*phase2 + if (exc(1,1,1) == exc(1,2,2)) then + hij = phase * big_array_exchange_integrals(exc(1,1,1),exc(1,1,2),exc(1,2,1)) + else if (exc(1,2,1) == exc(1,1,2)) then + hij = phase * big_array_exchange_integrals(exc(1,2,1),exc(1,1,1),exc(1,2,2)) + else + hij = phase*get_mo_bielec_integral( & + exc(1,1,1), & + exc(1,1,2), & + exc(1,2,1), & + exc(1,2,2) ,mo_integrals_map) + endif +end + + diff --git a/src/Determinants/spindeterminants.irp.f b/src/Determinants/spindeterminants.irp.f index 2eec0dea..5a5723ae 100644 --- a/src/Determinants/spindeterminants.irp.f +++ b/src/Determinants/spindeterminants.irp.f @@ -64,9 +64,9 @@ BEGIN_TEMPLATE integer :: i,j,k integer, allocatable :: iorder(:) - integer(8), allocatable :: bit_tmp(:) - integer(8) :: last_key - integer(8), external :: spin_det_search_key + integer*8, allocatable :: bit_tmp(:) + integer*8 :: last_key + integer*8, external :: spin_det_search_key logical,allocatable :: duplicate(:) allocate ( iorder(N_det), bit_tmp(N_det), duplicate(N_det) ) @@ -386,26 +386,31 @@ END_PROVIDER !==============================================================================! BEGIN_PROVIDER [ double precision, psi_bilinear_matrix_values, (N_det,N_states) ] -&BEGIN_PROVIDER [ integer, psi_bilinear_matrix_rows, (N_det) ] +&BEGIN_PROVIDER [ integer, psi_bilinear_matrix_rows , (N_det) ] &BEGIN_PROVIDER [ integer, psi_bilinear_matrix_columns, (N_det) ] +&BEGIN_PROVIDER [ integer, psi_bilinear_matrix_order , (N_det) ] use bitmasks implicit none BEGIN_DOC ! Sparse coefficient matrix if the wave function is expressed in a bilinear form : ! D_a^t C D_b +! +! Rows are alpha determinants and columns are beta. +! +! Order refers to psi_det END_DOC integer :: i,j,k, l integer(bit_kind) :: tmp_det(N_int,2) - integer :: idx integer, external :: get_index_in_psi_det_sorted_bit PROVIDE psi_coef_sorted_bit - integer, allocatable :: iorder(:), to_sort(:) + integer*8, allocatable :: to_sort(:) integer, external :: get_index_in_psi_det_alpha_unique integer, external :: get_index_in_psi_det_beta_unique - allocate(iorder(N_det), to_sort(N_det)) + allocate(to_sort(N_det)) + !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i,j,k,l) do k=1,N_det i = get_index_in_psi_det_alpha_unique(psi_det(1,1,k),N_int) j = get_index_in_psi_det_beta_unique (psi_det(1,2,k),N_int) @@ -415,16 +420,146 @@ BEGIN_PROVIDER [ double precision, psi_bilinear_matrix_values, (N_det,N_states) enddo psi_bilinear_matrix_rows(k) = i psi_bilinear_matrix_columns(k) = j - to_sort(k) = N_det_alpha_unique * (j-1) + i - iorder(k) = k + to_sort(k) = int(N_det_alpha_unique,8) * int(j-1,8) + int(i,8) + psi_bilinear_matrix_order(k) = k enddo - call isort(to_sort, iorder, N_det) - call iset_order(psi_bilinear_matrix_rows,iorder,N_det) - call iset_order(psi_bilinear_matrix_columns,iorder,N_det) - call dset_order(psi_bilinear_matrix_values,iorder,N_det) - deallocate(iorder,to_sort) + !$OMP END PARALLEL DO + call i8sort(to_sort, psi_bilinear_matrix_order, N_det) + call iset_order(psi_bilinear_matrix_rows,psi_bilinear_matrix_order,N_det) + call iset_order(psi_bilinear_matrix_columns,psi_bilinear_matrix_order,N_det) + do l=1,N_states + call dset_order(psi_bilinear_matrix_values(1,l),psi_bilinear_matrix_order,N_det) + enddo + deallocate(to_sort) END_PROVIDER + +BEGIN_PROVIDER [ integer, psi_bilinear_matrix_order_reverse , (N_det) ] + use bitmasks + implicit none + BEGIN_DOC +! Order which allors to go from psi_bilinear_matrix to psi_det + END_DOC + integer :: k + !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(k) + do k=1,N_det + psi_bilinear_matrix_order_reverse(psi_bilinear_matrix_order(k)) = k + enddo + !$OMP END PARALLEL DO +END_PROVIDER + + +BEGIN_PROVIDER [ integer, psi_bilinear_matrix_columns_loc, (N_det_beta_unique+1) ] + use bitmasks + implicit none + BEGIN_DOC +! Sparse coefficient matrix if the wave function is expressed in a bilinear form : +! D_a^t C D_b +! +! Rows are alpha determinants and columns are beta. +! +! Order refers to psi_det + END_DOC + integer :: i,j,k, l + + l = psi_bilinear_matrix_columns(1) + psi_bilinear_matrix_columns_loc(l) = 1 + do k=2,N_det + if (psi_bilinear_matrix_columns(k) == psi_bilinear_matrix_columns(k-1)) then + cycle + else + l = psi_bilinear_matrix_columns(k) + psi_bilinear_matrix_columns_loc(l) = k + endif + enddo + psi_bilinear_matrix_columns_loc(N_det_beta_unique+1) = N_det+1 +END_PROVIDER + +BEGIN_PROVIDER [ double precision, psi_bilinear_matrix_transp_values, (N_det,N_states) ] +&BEGIN_PROVIDER [ integer, psi_bilinear_matrix_transp_rows , (N_det) ] +&BEGIN_PROVIDER [ integer, psi_bilinear_matrix_transp_columns, (N_det) ] +&BEGIN_PROVIDER [ integer, psi_bilinear_matrix_transp_order , (N_det) ] + use bitmasks + implicit none + BEGIN_DOC +! Transpose of psi_bilinear_matrix +! D_b^t C^t D_a +! +! Rows are Alpha determinants and columns are beta, but the matrix is stored in row major +! format + END_DOC + integer :: i,j,k,l + + + PROVIDE psi_coef_sorted_bit + + integer*8, allocatable :: to_sort(:) + allocate(to_sort(N_det)) + !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,j,k,l) + !$OMP DO COLLAPSE(2) + do l=1,N_states + do k=1,N_det + psi_bilinear_matrix_transp_values (k,l) = psi_bilinear_matrix_values (k,l) + enddo + enddo + !$OMP ENDDO + !$OMP DO + do k=1,N_det + psi_bilinear_matrix_transp_columns(k) = psi_bilinear_matrix_columns(k) + psi_bilinear_matrix_transp_rows (k) = psi_bilinear_matrix_rows (k) + i = psi_bilinear_matrix_transp_columns(k) + j = psi_bilinear_matrix_transp_rows (k) + to_sort(k) = int(N_det_beta_unique,8) * int(j-1,8) + int(i,8) + psi_bilinear_matrix_transp_order(k) = k + enddo + !$OMP ENDDO + !$OMP END PARALLEL + call i8radix_sort(to_sort, psi_bilinear_matrix_transp_order, N_det,-1) + call iset_order(psi_bilinear_matrix_transp_rows,psi_bilinear_matrix_transp_order,N_det) + call iset_order(psi_bilinear_matrix_transp_columns,psi_bilinear_matrix_transp_order,N_det) + do l=1,N_states + call dset_order(psi_bilinear_matrix_transp_values(1,l),psi_bilinear_matrix_transp_order,N_det) + enddo + deallocate(to_sort) +END_PROVIDER + +BEGIN_PROVIDER [ integer, psi_bilinear_matrix_transp_rows_loc, (N_det_alpha_unique+1) ] + use bitmasks + implicit none + BEGIN_DOC +! Location of the columns in the psi_bilinear_matrix + END_DOC + integer :: i,j,k, l + + l = psi_bilinear_matrix_transp_rows(1) + psi_bilinear_matrix_transp_rows_loc(l) = 1 + do k=2,N_det + if (psi_bilinear_matrix_transp_rows(k) == psi_bilinear_matrix_transp_rows(k-1)) then + cycle + else + l = psi_bilinear_matrix_transp_rows(k) + psi_bilinear_matrix_transp_rows_loc(l) = k + endif + enddo + psi_bilinear_matrix_transp_rows_loc(N_det_alpha_unique+1) = N_det+1 +END_PROVIDER + +BEGIN_PROVIDER [ integer, psi_bilinear_matrix_order_transp_reverse , (N_det) ] + use bitmasks + implicit none + BEGIN_DOC +! Order which allows to go from psi_bilinear_matrix_order_transp to psi_bilinear_matrix + END_DOC + integer :: k + + !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(k) + do k=1,N_det + psi_bilinear_matrix_order_transp_reverse(psi_bilinear_matrix_transp_order(k)) = k + enddo + !$OMP END PARALLEL DO +END_PROVIDER + + BEGIN_PROVIDER [ double precision, psi_bilinear_matrix, (N_det_alpha_unique,N_det_beta_unique,N_states) ] implicit none BEGIN_DOC @@ -506,7 +641,7 @@ subroutine generate_all_alpha_beta_det_products ! Create a wave function from all possible alpha x beta determinants END_DOC integer :: i,j,k,l - integer :: idx, iproc + integer :: iproc integer, external :: get_index_in_psi_det_sorted_bit integer(bit_kind), allocatable :: tmp_det(:,:,:) logical, external :: is_in_wavefunction @@ -540,3 +675,489 @@ subroutine generate_all_alpha_beta_det_products end +<<<<<<< HEAD +======= + + +subroutine get_all_spin_singles_and_doubles(buffer, idx, spindet, Nint, size_buffer, singles, doubles, n_singles, n_doubles) + use bitmasks + implicit none + BEGIN_DOC +! +! Returns the indices of all the single and double excitations in the list of +! unique alpha determinants. +! +! /!\ : The buffer is transposed ! +! + END_DOC + integer, intent(in) :: Nint, size_buffer, idx(size_buffer) + integer(bit_kind), intent(in) :: buffer(Nint,size_buffer) + integer(bit_kind), intent(in) :: spindet(Nint) + integer, intent(out) :: singles(size_buffer) + integer, intent(out) :: doubles(size_buffer) + integer, intent(out) :: n_singles + integer, intent(out) :: n_doubles + + select case (Nint) + case (1) + call get_all_spin_singles_and_doubles_1(buffer, idx, spindet(1), size_buffer, singles, doubles, n_singles, n_doubles) + case (2) + call get_all_spin_singles_and_doubles_2(buffer, idx, spindet, size_buffer, singles, doubles, n_singles, n_doubles) + case (3) + call get_all_spin_singles_and_doubles_3(buffer, idx, spindet, size_buffer, singles, doubles, n_singles, n_doubles) + case (4) + call get_all_spin_singles_and_doubles_4(buffer, idx, spindet, size_buffer, singles, doubles, n_singles, n_doubles) + case default + call get_all_spin_singles_and_doubles_N_int(buffer, idx, spindet, size_buffer, singles, doubles, n_singles, n_doubles) + end select + +end + + +subroutine get_all_spin_singles(buffer, idx, spindet, Nint, size_buffer, singles, n_singles) + use bitmasks + implicit none + BEGIN_DOC +! +! Returns the indices of all the single excitations in the list of +! unique alpha determinants. +! + END_DOC + integer, intent(in) :: Nint, size_buffer, idx(size_buffer) + integer(bit_kind), intent(in) :: buffer(Nint,size_buffer) + integer(bit_kind), intent(in) :: spindet(Nint) + integer, intent(out) :: singles(size_buffer) + integer, intent(out) :: n_singles + + select case (N_int) + case (1) + call get_all_spin_singles_1(buffer, idx, spindet(1), size_buffer, singles, n_singles) + return + case (2) + call get_all_spin_singles_2(buffer, idx, spindet, size_buffer, singles, n_singles) + case (3) + call get_all_spin_singles_3(buffer, idx, spindet, size_buffer, singles, n_singles) + case (4) + call get_all_spin_singles_4(buffer, idx, spindet, size_buffer, singles, n_singles) + case default + call get_all_spin_singles_N_int(buffer, idx, spindet, size_buffer, singles, n_singles) + end select + +end + + +subroutine get_all_spin_doubles(buffer, idx, spindet, Nint, size_buffer, doubles, n_doubles) + use bitmasks + implicit none + BEGIN_DOC +! +! Returns the indices of all the double excitations in the list of +! unique alpha determinants. +! + END_DOC + integer, intent(in) :: Nint, size_buffer, idx(size_buffer) + integer(bit_kind), intent(in) :: buffer(Nint,size_buffer) + integer(bit_kind), intent(in) :: spindet(Nint) + integer, intent(out) :: doubles(size_buffer) + integer, intent(out) :: n_doubles + + select case (N_int) + case (1) + call get_all_spin_doubles_1(buffer, idx, spindet(1), size_buffer, doubles, n_doubles) + case (2) + call get_all_spin_doubles_2(buffer, idx, spindet, size_buffer, doubles, n_doubles) + case (3) + call get_all_spin_doubles_3(buffer, idx, spindet, size_buffer, doubles, n_doubles) + case (4) + call get_all_spin_doubles_4(buffer, idx, spindet, size_buffer, doubles, n_doubles) + case default + call get_all_spin_doubles_N_int(buffer, idx, spindet, size_buffer, doubles, n_doubles) + end select + +end + + + + + +subroutine copy_psi_bilinear_to_psi(psi, isize) + implicit none + BEGIN_DOC +! Overwrites psi_det and psi_coef with the wf in bilinear order + END_DOC + integer, intent(in) :: isize + integer(bit_kind), intent(out) :: psi(N_int,2,isize) + integer :: i,j,k,l + do k=1,isize + i = psi_bilinear_matrix_rows(k) + j = psi_bilinear_matrix_columns(k) + psi(1:N_int,1,k) = psi_det_alpha_unique(1:N_int,i) + psi(1:N_int,2,k) = psi_det_beta_unique(1:N_int,j) + enddo +end + +BEGIN_PROVIDER [ integer, singles_alpha_size ] + implicit none + BEGIN_DOC + ! Dimension of the singles_alpha array + END_DOC + singles_alpha_size = elec_alpha_num * (mo_tot_num - elec_alpha_num) +END_PROVIDER + + BEGIN_PROVIDER [ integer*8, singles_alpha_csc_idx, (N_det_alpha_unique+1) ] +&BEGIN_PROVIDER [ integer*8, singles_alpha_csc_size ] + implicit none + BEGIN_DOC + ! Dimension of the singles_alpha array + END_DOC + integer :: i,j + integer, allocatable :: idx0(:), s(:) + allocate (idx0(N_det_alpha_unique)) + do i=1, N_det_alpha_unique + idx0(i) = i + enddo + + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP SHARED(N_det_alpha_unique, psi_det_alpha_unique, & + !$OMP idx0, N_int, singles_alpha_csc, & + !$OMP singles_alpha_size, singles_alpha_csc_idx) & + !$OMP PRIVATE(i,s,j) + allocate (s(singles_alpha_size)) + !$OMP DO SCHEDULE(static,1) + do i=1, N_det_alpha_unique + call get_all_spin_singles( & + psi_det_alpha_unique, idx0, psi_det_alpha_unique(1,i), N_int, & + N_det_alpha_unique, s, j) + singles_alpha_csc_idx(i+1) = int(j,8) + enddo + !$OMP END DO + deallocate(s) + !$OMP END PARALLEL + deallocate(idx0) + + singles_alpha_csc_idx(1) = 1_8 + do i=2, N_det_alpha_unique+1 + singles_alpha_csc_idx(i) = singles_alpha_csc_idx(i) + singles_alpha_csc_idx(i-1) + enddo + singles_alpha_csc_size = singles_alpha_csc_idx(N_det_alpha_unique+1) +END_PROVIDER + + +BEGIN_PROVIDER [ integer, singles_alpha_csc, (singles_alpha_csc_size) ] + implicit none + BEGIN_DOC + ! Dimension of the singles_alpha array + END_DOC + integer :: i, k + integer, allocatable :: idx0(:) + allocate (idx0(N_det_alpha_unique)) + do i=1, N_det_alpha_unique + idx0(i) = i + enddo + + !$OMP PARALLEL DO DEFAULT(NONE) & + !$OMP SHARED(N_det_alpha_unique, psi_det_alpha_unique, & + !$OMP idx0, N_int, singles_alpha_csc, singles_alpha_csc_idx) & + !$OMP PRIVATE(i,k) SCHEDULE(static,1) + do i=1, N_det_alpha_unique + call get_all_spin_singles( & + psi_det_alpha_unique, idx0, psi_det_alpha_unique(1,i), N_int, & + N_det_alpha_unique, singles_alpha_csc(singles_alpha_csc_idx(i)), & + k) + enddo + !$OMP END PARALLEL DO + deallocate(idx0) + +END_PROVIDER + + + + +subroutine get_all_spin_singles_and_doubles_1(buffer, idx, spindet, size_buffer, singles, doubles, n_singles, n_doubles) + use bitmasks + implicit none + BEGIN_DOC +! +! Returns the indices of all the single and double excitations in the list of +! unique alpha determinants. +! +! /!\ : The buffer is transposed ! +! + END_DOC + integer, intent(in) :: size_buffer, idx(size_buffer) + integer(bit_kind), intent(in) :: buffer(size_buffer) + integer(bit_kind), intent(in) :: spindet + integer, intent(out) :: singles(size_buffer) + integer, intent(out) :: doubles(size_buffer) + integer, intent(out) :: n_singles + integer, intent(out) :: n_doubles + + integer :: i + include 'Utils/constants.include.F' + integer :: degree + + + n_singles = 1 + n_doubles = 1 + !DIR$ VECTOR ALIGNED + do i=1,size_buffer + degree = popcnt( xor( spindet, buffer(i) ) ) + if ( degree == 4 ) then + doubles(n_doubles) = idx(i) + n_doubles = n_doubles+1 + else if ( degree == 2 ) then + singles(n_singles) = idx(i) + n_singles = n_singles+1 + endif + enddo + n_singles = n_singles-1 + n_doubles = n_doubles-1 + +end + + + +subroutine get_all_spin_singles_1(buffer, idx, spindet, size_buffer, singles, n_singles) + use bitmasks + implicit none + BEGIN_DOC +! +! Returns the indices of all the single excitations in the list of +! unique alpha determinants. +! + END_DOC + integer, intent(in) :: size_buffer, idx(size_buffer) + integer(bit_kind), intent(in) :: buffer(size_buffer) + integer(bit_kind), intent(in) :: spindet + integer, intent(out) :: singles(size_buffer) + integer, intent(out) :: n_singles + integer :: i + integer :: degree + include 'Utils/constants.include.F' + + n_singles = 1 + do i=1,size_buffer + degree = popcnt(xor( spindet, buffer(i) )) + singles(n_singles) = idx(i) + if (degree == 2) then + n_singles = n_singles+1 + endif + enddo + n_singles = n_singles-1 + +end + + +subroutine get_all_spin_doubles_1(buffer, idx, spindet, size_buffer, doubles, n_doubles) + use bitmasks + implicit none + BEGIN_DOC +! +! Returns the indices of all the double excitations in the list of +! unique alpha determinants. +! + END_DOC + integer, intent(in) :: size_buffer, idx(size_buffer) + integer(bit_kind), intent(in) :: buffer(size_buffer) + integer(bit_kind), intent(in) :: spindet + integer, intent(out) :: doubles(size_buffer) + integer, intent(out) :: n_doubles + integer :: i + include 'Utils/constants.include.F' + integer :: degree + + n_doubles = 1 + !DIR$ VECTOR ALIGNED + do i=1,size_buffer + degree = popcnt(xor( spindet, buffer(i) )) + if ( degree == 4 ) then + doubles(n_doubles) = idx(i) + n_doubles = n_doubles+1 + endif + enddo + n_doubles = n_doubles-1 + +end + + + +BEGIN_TEMPLATE + +subroutine get_all_spin_singles_and_doubles_$N_int(buffer, idx, spindet, size_buffer, singles, doubles, n_singles, n_doubles) + use bitmasks + implicit none + BEGIN_DOC +! +! Returns the indices of all the single and double excitations in the list of +! unique alpha determinants. +! +! /!\ : The buffer is transposed ! +! + END_DOC + integer, intent(in) :: size_buffer, idx(size_buffer) + integer(bit_kind), intent(in) :: buffer($N_int,size_buffer) + integer(bit_kind), intent(in) :: spindet($N_int) + integer, intent(out) :: singles(size_buffer) + integer, intent(out) :: doubles(size_buffer) + integer, intent(out) :: n_singles + integer, intent(out) :: n_doubles + + integer :: i,k + integer(bit_kind) :: xorvec($N_int) + integer :: degree + + integer, external :: align_double + + !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: xorvec, degree + n_singles = 1 + n_doubles = 1 + !DIR$ VECTOR ALIGNED + do i=1,size_buffer + + do k=1,$N_int + xorvec(k) = xor( spindet(k), buffer(k,i) ) + enddo + + if (xorvec(1) /= 0_8) then + degree = popcnt(xorvec(1)) + else + degree = 0 + endif + + do k=2,$N_int + !DIR$ VECTOR ALIGNED + if ( (degree <= 4).and.(xorvec(k) /= 0_8) ) then + degree = degree + popcnt(xorvec(k)) + endif + enddo + + if ( degree == 4 ) then + doubles(n_doubles) = idx(i) + n_doubles = n_doubles+1 + else if ( degree == 2 ) then + singles(n_singles) = idx(i) + n_singles = n_singles+1 + endif + + enddo + n_singles = n_singles-1 + n_doubles = n_doubles-1 + +end + + +subroutine get_all_spin_singles_$N_int(buffer, idx, spindet, size_buffer, singles, n_singles) + use bitmasks + implicit none + BEGIN_DOC +! +! Returns the indices of all the single excitations in the list of +! unique alpha determinants. +! + END_DOC + integer, intent(in) :: size_buffer, idx(size_buffer) + integer(bit_kind), intent(in) :: buffer($N_int,size_buffer) + integer(bit_kind), intent(in) :: spindet($N_int) + integer, intent(out) :: singles(size_buffer) + integer, intent(out) :: n_singles + + integer :: i,k + include 'Utils/constants.include.F' + integer(bit_kind) :: xorvec($N_int) + integer :: degree + + integer, external :: align_double + + !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: xorvec + + n_singles = 1 + !DIR$ VECTOR ALIGNED + do i=1,size_buffer + + do k=1,$N_int + xorvec(k) = xor( spindet(k), buffer(k,i) ) + enddo + + if (xorvec(1) /= 0_8) then + degree = popcnt(xorvec(1)) + else + degree = 0 + endif + + do k=2,$N_int + if ( (degree <= 2).and.(xorvec(k) /= 0_8) ) then + degree = degree + popcnt(xorvec(k)) + endif + enddo + + if ( degree == 2 ) then + singles(n_singles) = idx(i) + n_singles = n_singles+1 + endif + + enddo + n_singles = n_singles-1 + +end + + +subroutine get_all_spin_doubles_$N_int(buffer, idx, spindet, size_buffer, doubles, n_doubles) + use bitmasks + implicit none + BEGIN_DOC +! +! Returns the indices of all the double excitations in the list of +! unique alpha determinants. +! + END_DOC + integer, intent(in) :: size_buffer, idx(size_buffer) + integer(bit_kind), intent(in) :: buffer($N_int,size_buffer) + integer(bit_kind), intent(in) :: spindet($N_int) + integer, intent(out) :: doubles(size_buffer) + integer, intent(out) :: n_doubles + + integer :: i,k, degree + include 'Utils/constants.include.F' + integer(bit_kind) :: xorvec($N_int) + + n_doubles = 1 + !DIR$ VECTOR ALIGNED + do i=1,size_buffer + + do k=1,$N_int + xorvec(k) = xor( spindet(k), buffer(k,i) ) + enddo + + if (xorvec(1) /= 0_8) then + degree = popcnt(xorvec(1)) + else + degree = 0 + endif + + do k=2,$N_int + !DIR$ VECTOR ALIGNED + if ( (degree <= 4).and.(xorvec(k) /= 0_8) ) then + degree = degree + popcnt(xorvec(k)) + endif + enddo + + if ( degree == 4 ) then + doubles(n_doubles) = idx(i) + n_doubles = n_doubles+1 + endif + + enddo + + n_doubles = n_doubles-1 + +end + +SUBST [ N_int ] +2;; +3;; +4;; +N_int;; + +END_TEMPLATE + + diff --git a/src/Integrals_Bielec/ao_bi_integrals.irp.f b/src/Integrals_Bielec/ao_bi_integrals.irp.f index 68a7a050..6ce159af 100644 --- a/src/Integrals_Bielec/ao_bi_integrals.irp.f +++ b/src/Integrals_Bielec/ao_bi_integrals.irp.f @@ -349,7 +349,7 @@ BEGIN_PROVIDER [ logical, ao_bielec_integrals_in_map ] integral = ao_bielec_integral(1,1,1,1) - real :: map_mb + double precision :: map_mb PROVIDE read_ao_integrals disk_access_ao_integrals if (read_ao_integrals) then print*,'Reading the AO integrals' diff --git a/src/Integrals_Bielec/mo_bi_integrals.irp.f b/src/Integrals_Bielec/mo_bi_integrals.irp.f index b56f3518..442c38b5 100644 --- a/src/Integrals_Bielec/mo_bi_integrals.irp.f +++ b/src/Integrals_Bielec/mo_bi_integrals.irp.f @@ -196,7 +196,7 @@ subroutine add_integrals_to_map(mask_ijkl) integer :: size_buffer integer(key_kind),allocatable :: buffer_i(:) real(integral_kind),allocatable :: buffer_value(:) - real :: map_mb + double precision :: map_mb integer :: i1,j1,k1,l1, ii1, kmax, thread_num integer :: i2,i3,i4 @@ -503,7 +503,7 @@ subroutine add_integrals_to_map_three_indices(mask_ijk) integer :: size_buffer integer(key_kind),allocatable :: buffer_i(:) real(integral_kind),allocatable :: buffer_value(:) - real :: map_mb + double precision :: map_mb integer :: i1,j1,k1,l1, ii1, kmax, thread_num integer :: i2,i3,i4 @@ -817,7 +817,7 @@ subroutine add_integrals_to_map_no_exit_34(mask_ijkl) integer :: size_buffer integer(key_kind),allocatable :: buffer_i(:) real(integral_kind),allocatable :: buffer_value(:) - real :: map_mb + double precision :: map_mb integer :: i1,j1,k1,l1, ii1, kmax, thread_num integer :: i2,i3,i4 diff --git a/src/Utils/sort.irp.f b/src/Utils/sort.irp.f index dd7fbc33..bb93d44f 100644 --- a/src/Utils/sort.irp.f +++ b/src/Utils/sort.irp.f @@ -12,17 +12,15 @@ BEGIN_TEMPLATE $type :: xtmp integer :: i, i0, j, jmax - do i=1,isize + do i=2,isize xtmp = x(i) i0 = iorder(i) - j = i-1 - do j=i-1,1,-1 - if ( x(j) > xtmp ) then - x(j+1) = x(j) - iorder(j+1) = iorder(j) - else - exit - endif + j=i-1 + do while (j>0) + if ((x(j) <= xtmp)) exit + x(j+1) = x(j) + iorder(j+1) = iorder(j) + j=j-1 enddo x(j+1) = xtmp iorder(j+1) = i0 @@ -158,6 +156,38 @@ BEGIN_TEMPLATE end subroutine heap_$Xsort_big + subroutine sorted_$Xnumber(x,isize,n) + implicit none + BEGIN_DOC +! Returns the number of sorted elements + END_DOC + integer, intent(in) :: isize + $type, intent(in) :: x(isize) + integer, intent(out) :: n + integer :: i + n=1 + + if (isize < 2) then + return + endif + + do i=2,isize + if (x(i-1) <= x(i)) then + n=n+1 + endif + enddo + + end + +SUBST [ X, type ] + ; real ;; + d ; double precision ;; + i ; integer ;; + i8 ; integer*8 ;; + i2 ; integer*2 ;; +END_TEMPLATE + +BEGIN_TEMPLATE subroutine $Xsort(x,iorder,isize) implicit none BEGIN_DOC @@ -168,16 +198,42 @@ BEGIN_TEMPLATE integer,intent(in) :: isize $type,intent(inout) :: x(isize) integer,intent(inout) :: iorder(isize) - if (isize < 32) then + integer :: n + if (isize < 2) then + return + endif + call sorted_$Xnumber(x,isize,n) + if (isize == n) then + return + endif + if ( isize < 32+n) then call insertion_$Xsort(x,iorder,isize) else call heap_$Xsort(x,iorder,isize) endif end subroutine $Xsort +SUBST [ X, type, Y ] + ; real ; i ;; + d ; double precision ; i8 ;; +END_TEMPLATE + +BEGIN_TEMPLATE + subroutine $Xsort(x,iorder,isize) + implicit none + BEGIN_DOC + ! Sort array x(isize). + ! iorder in input should be (1,2,3,...,isize), and in output + ! contains the new order of the elements. + END_DOC + integer,intent(in) :: isize + $type,intent(inout) :: x(isize) + integer,intent(inout) :: iorder(isize) + integer :: n + call $Xradix_sort(x,iorder,isize,-1) + end subroutine $Xsort + SUBST [ X, type ] - ; real ;; - d ; double precision ;; i ; integer ;; i8 ; integer*8 ;; i2 ; integer*2 ;; @@ -232,17 +288,15 @@ BEGIN_TEMPLATE $type :: xtmp integer*8 :: i, i0, j, jmax - do i=1_8,isize + do i=2_8,isize xtmp = x(i) i0 = iorder(i) j = i-1_8 - do j=i-1_8,1_8,-1_8 - if ( x(j) > xtmp ) then - x(j+1_8) = x(j) - iorder(j+1_8) = iorder(j) - else - exit - endif + do while (j>0_8) + if (x(j)<=xtmp) exit + x(j+1_8) = x(j) + iorder(j+1_8) = iorder(j) + j = j-1_8 enddo x(j+1_8) = xtmp iorder(j+1_8) = i0 @@ -292,63 +346,107 @@ BEGIN_TEMPLATE ! contains the new order of the elements. ! iradix should be -1 in input. END_DOC - $int_type, intent(in) :: isize - $int_type, intent(inout) :: iorder(isize) - $type, intent(inout) :: x(isize) + integer*$int_type, intent(in) :: isize + integer*$int_type, intent(inout) :: iorder(isize) + integer*$type, intent(inout) :: x(isize) integer, intent(in) :: iradix integer :: iradix_new - $type, allocatable :: x2(:), x1(:) - $type :: i4 - $int_type, allocatable :: iorder1(:),iorder2(:) - $int_type :: i0, i1, i2, i3, i - integer, parameter :: integer_size=$octets - $type, parameter :: zero=$zero - $type :: mask - integer :: nthreads, omp_get_num_threads + integer*$type, allocatable :: x2(:), x1(:) + integer*$type :: i4 ! data type + integer*$int_type, allocatable :: iorder1(:),iorder2(:) + integer*$int_type :: i0, i1, i2, i3, i ! index type + integer*$type :: mask + integer :: err !DIR$ ATTRIBUTES ALIGN : 128 :: iorder1,iorder2, x2, x1 - if (iradix == -1) then - - ! Find most significant bit - - i0 = 0_8 - i4 = -1_8 - - do i=1,isize - i4 = max(i4,x(i)) - enddo - i3 = i4 ! Type conversion - - iradix_new = integer_size-1-leadz(i3) - mask = ibset(zero,iradix_new) - nthreads = 1 - ! nthreads = 1+ishft(omp_get_num_threads(),-1) - - integer :: err - allocate(x1(isize/nthreads+1),iorder1(isize/nthreads+1),x2(isize/nthreads+1),iorder2(isize/nthreads+1),stat=err) + if (iradix == -1) then ! Sort Positive and negative + + allocate(x1(isize),iorder1(isize), x2(isize),iorder2(isize),stat=err) if (err /= 0) then print *, irp_here, ': Unable to allocate arrays' stop endif - i1=1_8 - i2=1_8 - - do i=1,isize - if (iand(mask,x(i)) == zero) then + i1=1_$int_type + i2=1_$int_type + do i=1_$int_type,isize + if (x(i) < 0_$type) then iorder1(i1) = iorder(i) - x1(i1) = x(i) - i1 = i1+1_8 + x1(i1) = -x(i) + i1 = i1+1_$int_type else iorder2(i2) = iorder(i) x2(i2) = x(i) - i2 = i2+1_8 + i2 = i2+1_$int_type endif enddo - i1=i1-1_8 - i2=i2-1_8 + i1=i1-1_$int_type + i2=i2-1_$int_type + + do i=1_$int_type,i2 + iorder(i1+i) = iorder2(i) + x(i1+i) = x2(i) + enddo + deallocate(x2,iorder2,stat=err) + if (err /= 0) then + print *, irp_here, ': Unable to deallocate arrays x2, iorder2' + stop + endif - do i=1,i1 + + if (i1 > 1_$int_type) then + call $Xradix_sort$big(x1,iorder1,i1,-2) + do i=1_$int_type,i1 + x(i) = -x1(1_$int_type+i1-i) + iorder(i) = iorder1(1_$int_type+i1-i) + enddo + endif + deallocate(x1,iorder1,stat=err) + if (err /= 0) then + print *, irp_here, ': Unable to deallocate arrays x1, iorder1' + stop + endif + + if (i2>1_$int_type) then + call $Xradix_sort$big(x(i1+1_$int_type),iorder(i1+1_$int_type),i2,-2) + endif + + return + + else if (iradix == -2) then ! Positive + + ! Find most significant bit + + i0 = 0_$int_type + i4 = maxval(x) + + iradix_new = max($integer_size-1-leadz(i4),1) + mask = ibset(0_$type,iradix_new) + + allocate(x1(isize),iorder1(isize), x2(isize),iorder2(isize),stat=err) + if (err /= 0) then + print *, irp_here, ': Unable to allocate arrays' + stop + endif + + i1=1_$int_type + i2=1_$int_type + + do i=1_$int_type,isize + if (iand(mask,x(i)) == 0_$type) then + iorder1(i1) = iorder(i) + x1(i1) = x(i) + i1 = i1+1_$int_type + else + iorder2(i2) = iorder(i) + x2(i2) = x(i) + i2 = i2+1_$int_type + endif + enddo + i1=i1-1_$int_type + i2=i2-1_$int_type + + do i=1_$int_type,i1 iorder(i0+i) = iorder1(i) x(i0+i) = x1(i) enddo @@ -361,7 +459,7 @@ BEGIN_TEMPLATE endif - do i=1,i2 + do i=1_$int_type,i2 iorder(i0+i) = iorder2(i) x(i0+i) = x2(i) enddo @@ -373,12 +471,12 @@ BEGIN_TEMPLATE endif - if (i3>1) then + if (i3>1_$int_type) then call $Xradix_sort$big(x,iorder,i3,iradix_new-1) endif - if (isize-i3>1) then - call $Xradix_sort$big(x(i3+1),iorder(i3+1),isize-i3,iradix_new-1) + if (isize-i3>1_$int_type) then + call $Xradix_sort$big(x(i3+1_$int_type),iorder(i3+1_$int_type),isize-i3,iradix_new-1) endif return @@ -399,25 +497,25 @@ BEGIN_TEMPLATE endif - mask = ibset(zero,iradix) - i0=1 - i1=1 + mask = ibset(0_$type,iradix) + i0=1_$int_type + i1=1_$int_type - do i=1,isize - if (iand(mask,x(i)) == zero) then + do i=1_$int_type,isize + if (iand(mask,x(i)) == 0_$type) then iorder(i0) = iorder(i) x(i0) = x(i) - i0 = i0+1 + i0 = i0+1_$int_type else iorder2(i1) = iorder(i) x2(i1) = x(i) - i1 = i1+1 + i1 = i1+1_$int_type endif enddo - i0=i0-1 - i1=i1-1 + i0=i0-1_$int_type + i1=i1-1_$int_type - do i=1,i1 + do i=1_$int_type,i1 iorder(i0+i) = iorder2(i) x(i0+i) = x2(i) enddo @@ -434,8 +532,8 @@ BEGIN_TEMPLATE endif - if (i1>1) then - call $Xradix_sort$big(x(i0+1),iorder(i0+1),i1,iradix-1) + if (i1>1_$int_type) then + call $Xradix_sort$big(x(i0+1_$int_type),iorder(i0+1_$int_type),i1,iradix-1) endif if (i0>1) then call $Xradix_sort$big(x,iorder,i0,iradix-1) @@ -443,12 +541,12 @@ BEGIN_TEMPLATE end -SUBST [ X, type, octets, is_big, big, int_type, zero ] - i ; integer ; 32 ; .False. ; ; integer ; 0;; - i8 ; integer*8 ; 32 ; .False. ; ; integer ; 0_8;; - i2 ; integer*2 ; 32 ; .False. ; ; integer ; 0;; - i ; integer ; 64 ; .True. ; _big ; integer*8 ; 0 ;; - i8 ; integer*8 ; 64 ; .True. ; _big ; integer*8 ; 0_8 ;; +SUBST [ X, type, integer_size, is_big, big, int_type ] + i ; 4 ; 32 ; .False. ; ; 4 ;; + i8 ; 8 ; 64 ; .False. ; ; 4 ;; + i2 ; 2 ; 16 ; .False. ; ; 4 ;; + i ; 4 ; 32 ; .True. ; _big ; 8 ;; + i8 ; 8 ; 64 ; .True. ; _big ; 8 ;; END_TEMPLATE diff --git a/src/ZMQ/utils.irp.f b/src/ZMQ/utils.irp.f index 3177d3e3..bf83cae4 100644 --- a/src/ZMQ/utils.irp.f +++ b/src/ZMQ/utils.irp.f @@ -1,11 +1,8 @@ use f77_zmq use omp_lib -integer, pointer :: thread_id -integer(omp_lock_kind) :: zmq_lock - - -BEGIN_PROVIDER [ integer(ZMQ_PTR), zmq_context ] + BEGIN_PROVIDER [ integer(ZMQ_PTR), zmq_context ] +&BEGIN_PROVIDER [ integer(omp_lock_kind), zmq_lock ] use f77_zmq implicit none BEGIN_DOC @@ -94,7 +91,7 @@ subroutine switch_qp_run_to_master print *, 'This run should be started with the qp_run command' stop -1 endif - qp_run_address = trim(buffer) + qp_run_address = adjustl(buffer) print *, 'Switched to qp_run master : ', trim(qp_run_address) integer :: i @@ -235,8 +232,8 @@ function new_zmq_pull_socket() if (zmq_context == 0_ZMQ_PTR) then stop 'zmq_context is uninitialized' endif - new_zmq_pull_socket = f77_zmq_socket(zmq_context, ZMQ_PULL) -! new_zmq_pull_socket = f77_zmq_socket(zmq_context, ZMQ_REP) +! new_zmq_pull_socket = f77_zmq_socket(zmq_context, ZMQ_PULL) + new_zmq_pull_socket = f77_zmq_socket(zmq_context, ZMQ_REP) call omp_unset_lock(zmq_lock) if (new_zmq_pull_socket == 0_ZMQ_PTR) then stop 'Unable to create zmq pull socket' @@ -312,8 +309,8 @@ function new_zmq_push_socket(thread) if (zmq_context == 0_ZMQ_PTR) then stop 'zmq_context is uninitialized' endif - new_zmq_push_socket = f77_zmq_socket(zmq_context, ZMQ_PUSH) -! new_zmq_push_socket = f77_zmq_socket(zmq_context, ZMQ_REQ) +! new_zmq_push_socket = f77_zmq_socket(zmq_context, ZMQ_PUSH) + new_zmq_push_socket = f77_zmq_socket(zmq_context, ZMQ_REQ) call omp_unset_lock(zmq_lock) if (new_zmq_push_socket == 0_ZMQ_PTR) then stop 'Unable to create zmq push socket' @@ -407,7 +404,9 @@ subroutine end_zmq_sub_socket(zmq_socket_sub) integer(ZMQ_PTR), intent(in) :: zmq_socket_sub integer :: rc + call omp_set_lock(zmq_lock) rc = f77_zmq_close(zmq_socket_sub) + call omp_unset_lock(zmq_lock) if (rc /= 0) then print *, 'f77_zmq_close(zmq_socket_sub)' stop 'error' @@ -426,7 +425,9 @@ subroutine end_zmq_pair_socket(zmq_socket_pair) integer :: rc character*(8), external :: zmq_port + call omp_set_lock(zmq_lock) rc = f77_zmq_close(zmq_socket_pair) + call omp_unset_lock(zmq_lock) if (rc /= 0) then print *, 'f77_zmq_close(zmq_socket_pair)' stop 'error' @@ -444,7 +445,14 @@ subroutine end_zmq_pull_socket(zmq_socket_pull) integer :: rc character*(8), external :: zmq_port + rc = f77_zmq_setsockopt(zmq_socket_pull,ZMQ_LINGER,0,4) + if (rc /= 0) then + stop 'Unable to set ZMQ_LINGER on pull socket' + endif + + call omp_set_lock(zmq_lock) rc = f77_zmq_close(zmq_socket_pull) + call omp_unset_lock(zmq_lock) if (rc /= 0) then print *, 'f77_zmq_close(zmq_socket_pull)' stop 'error' @@ -469,7 +477,9 @@ subroutine end_zmq_push_socket(zmq_socket_push,thread) stop 'Unable to set ZMQ_LINGER on push socket' endif + call omp_set_lock(zmq_lock) rc = f77_zmq_close(zmq_socket_push) + call omp_unset_lock(zmq_lock) if (rc /= 0) then print *, 'f77_zmq_close(zmq_socket_push)' stop 'error' @@ -500,10 +510,17 @@ subroutine new_parallel_job(zmq_to_qp_run_socket,name_in) integer(ZMQ_PTR),external :: new_zmq_to_qp_run_socket integer(ZMQ_PTR), intent(out) :: zmq_to_qp_run_socket + call omp_set_lock(zmq_lock) zmq_context = f77_zmq_ctx_new () + call omp_unset_lock(zmq_lock) if (zmq_context == 0_ZMQ_PTR) then stop 'ZMQ_PTR is null' endif +! rc = f77_zmq_ctx_set(zmq_context, ZMQ_IO_THREADS, nproc) +! if (rc /= 0) then +! print *, 'Unable to set the number of ZMQ IO threads to', nproc +! endif + zmq_to_qp_run_socket = new_zmq_to_qp_run_socket() name = name_in sze = len(trim(name)) @@ -584,7 +601,10 @@ subroutine end_parallel_job(zmq_to_qp_run_socket,name_in) zmq_state = 'No_state' call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket) + call omp_set_lock(zmq_lock) rc = f77_zmq_ctx_term(zmq_context) + zmq_context = 0_ZMQ_PTR + call omp_unset_lock(zmq_lock) if (rc /= 0) then print *, 'Unable to terminate ZMQ context' stop 'error' @@ -684,10 +704,43 @@ subroutine add_task_to_taskserver(zmq_to_qp_run_socket,task) character*(*), intent(in) :: task integer :: rc, sze - character*(512) :: message + character(len=:), allocatable :: message + + message='add_task '//trim(zmq_state)//' '//trim(task) + sze = len(message) + rc = f77_zmq_send(zmq_to_qp_run_socket, message, sze, 0) + + if (rc /= sze) then + print *, rc, sze + print *, irp_here,': f77_zmq_send(zmq_to_qp_run_socket, trim(message), sze, 0)' + stop 'error' + endif + + rc = f77_zmq_recv(zmq_to_qp_run_socket, message, sze-1, 0) + if (message(1:rc) /= 'ok') then + print *, trim(task) + print *, 'Unable to add the next task' + stop -1 + endif + +end + +subroutine add_task_to_taskserver_send(zmq_to_qp_run_socket,task) + use f77_zmq + implicit none + BEGIN_DOC + ! Get a task from the task server + END_DOC + integer(ZMQ_PTR), intent(in) :: zmq_to_qp_run_socket + character*(*), intent(in) :: task + + integer :: rc, sze + character(len=:), allocatable :: message + + sze = len(trim(task))+12+len(trim(zmq_state)) + message = repeat(' ',sze) write(message,*) 'add_task '//trim(zmq_state)//' '//trim(task) - sze = len(trim(message)) rc = f77_zmq_send(zmq_to_qp_run_socket, trim(message), sze, 0) if (rc /= sze) then print *, rc, sze @@ -695,10 +748,20 @@ subroutine add_task_to_taskserver(zmq_to_qp_run_socket,task) stop 'error' endif +end + +subroutine add_task_to_taskserver_recv(zmq_to_qp_run_socket) + use f77_zmq + implicit none + BEGIN_DOC + ! Get a task from the task server + END_DOC + integer(ZMQ_PTR), intent(in) :: zmq_to_qp_run_socket + + integer :: rc, sze + character*(512) :: message rc = f77_zmq_recv(zmq_to_qp_run_socket, message, 510, 0) - message = trim(message(1:rc)) - if (trim(message) /= 'ok') then - print *, trim(task) + if (message(1:rc) /= 'ok') then print *, 'Unable to add the next task' stop -1 endif @@ -726,8 +789,7 @@ subroutine task_done_to_taskserver(zmq_to_qp_run_socket, worker_id, task_id) endif rc = f77_zmq_recv(zmq_to_qp_run_socket, message, 510, 0) - message = trim(message(1:rc)) - if (trim(message) /= 'ok') then + if (trim(message(1:rc)) /= 'ok') then print *, 'Unable to send task_done message' stop -1 endif @@ -752,17 +814,17 @@ subroutine get_task_from_taskserver(zmq_to_qp_run_socket,worker_id,task_id,task) write(message,*) 'get_task '//trim(zmq_state), worker_id sze = len(trim(message)) - rc = f77_zmq_send(zmq_to_qp_run_socket, trim(message), sze, 0) + rc = f77_zmq_send(zmq_to_qp_run_socket, message, sze, 0) if (rc /= sze) then print *, irp_here, ':f77_zmq_send(zmq_to_qp_run_socket, trim(message), sze, 0)' stop 'error' endif + message = repeat(' ',512) rc = f77_zmq_recv(zmq_to_qp_run_socket, message, 510, 0) - message = trim(message(1:rc)) - read(message,*) reply + read(message(1:rc),*) reply if (trim(reply) == 'get_task_reply') then - read(message,*) reply, task_id + read(message(1:rc),*) reply, task_id rc = 15 do while (message(rc:rc) == ' ') rc += 1 @@ -938,3 +1000,4 @@ subroutine wait_for_states(state_wait,state,n) end +