diff --git a/config/ifort_avx.cfg b/config/ifort_avx.cfg index 7a13348c..d3fcd1f0 100644 --- a/config/ifort_avx.cfg +++ b/config/ifort_avx.cfg @@ -32,7 +32,7 @@ OPENMP : 1 ; Append OpenMP flags # [OPT] FC : -traceback -FCFLAGS : -march=corei7-avx -O2 -ip -ftz -g +FCFLAGS : -xAVX -O2 -ip -ftz -g # Profiling flags ################# diff --git a/src/cipsi/selection.irp.f b/src/cipsi/selection.irp.f index 3585940e..6803cd73 100644 --- a/src/cipsi/selection.irp.f +++ b/src/cipsi/selection.irp.f @@ -306,9 +306,10 @@ subroutine select_singles_and_doubles(i_generator,hole_mask,particle_mask,fock_d i = psi_bilinear_matrix_rows(l_a) if (nt + exc_degree(i) <= 4) then idx = psi_det_sorted_order(psi_bilinear_matrix_order(l_a)) - if (psi_average_norm_contrib_sorted(idx) < 1.d-12) cycle - indices(k) = idx - k=k+1 + if (psi_average_norm_contrib_sorted(idx) > 1.d-12) then + indices(k) = idx + k=k+1 + endif endif enddo enddo @@ -329,9 +330,10 @@ subroutine select_singles_and_doubles(i_generator,hole_mask,particle_mask,fock_d idx = psi_det_sorted_order( & psi_bilinear_matrix_order( & psi_bilinear_matrix_transp_order(l_a))) - if (psi_average_norm_contrib_sorted(idx) < 1.d-12) cycle - indices(k) = idx - k=k+1 + if (psi_average_norm_contrib_sorted(idx) > 1.d-12) then + indices(k) = idx + k=k+1 + endif endif enddo enddo @@ -440,19 +442,20 @@ subroutine select_singles_and_doubles(i_generator,hole_mask,particle_mask,fock_d fullinteresting(0) = 0 do ii=1,preinteresting(0) + i = preinteresting(ii) select case (N_int) case (1) - mobMask(1,1) = iand(negMask(1,1), psi_det_sorted(1,1,preinteresting(ii))) - mobMask(1,2) = iand(negMask(1,2), psi_det_sorted(1,2,preinteresting(ii))) + mobMask(1,1) = iand(negMask(1,1), psi_det_sorted(1,1,i)) + mobMask(1,2) = iand(negMask(1,2), psi_det_sorted(1,2,i)) nt = popcnt(mobMask(1, 1)) + popcnt(mobMask(1, 2)) case (2) - mobMask(1:2,1) = iand(negMask(1:2,1), psi_det_sorted(1:2,1,preinteresting(ii))) - mobMask(1:2,2) = iand(negMask(1:2,2), psi_det_sorted(1:2,2,preinteresting(ii))) + mobMask(1:2,1) = iand(negMask(1:2,1), psi_det_sorted(1:2,1,i)) + mobMask(1:2,2) = iand(negMask(1:2,2), psi_det_sorted(1:2,2,i)) nt = popcnt(mobMask(1, 1)) + popcnt(mobMask(1, 2)) + & popcnt(mobMask(2, 1)) + popcnt(mobMask(2, 2)) case (3) - mobMask(1:3,1) = iand(negMask(1:3,1), psi_det_sorted(1:3,1,preinteresting(ii))) - mobMask(1:3,2) = iand(negMask(1:3,2), psi_det_sorted(1:3,2,preinteresting(ii))) + mobMask(1:3,1) = iand(negMask(1:3,1), psi_det_sorted(1:3,1,i)) + mobMask(1:3,2) = iand(negMask(1:3,2), psi_det_sorted(1:3,2,i)) nt = 0 do j=3,1,-1 if (mobMask(j,1) /= 0_bit_kind) then @@ -465,8 +468,8 @@ subroutine select_singles_and_doubles(i_generator,hole_mask,particle_mask,fock_d endif end do case (4) - mobMask(1:4,1) = iand(negMask(1:4,1), psi_det_sorted(1:4,1,preinteresting(ii))) - mobMask(1:4,2) = iand(negMask(1:4,2), psi_det_sorted(1:4,2,preinteresting(ii))) + mobMask(1:4,1) = iand(negMask(1:4,1), psi_det_sorted(1:4,1,i)) + mobMask(1:4,2) = iand(negMask(1:4,2), psi_det_sorted(1:4,2,i)) nt = 0 do j=4,1,-1 if (mobMask(j,1) /= 0_bit_kind) then @@ -479,8 +482,8 @@ subroutine select_singles_and_doubles(i_generator,hole_mask,particle_mask,fock_d endif end do case default - mobMask(1:N_int,1) = iand(negMask(1:N_int,1), psi_det_sorted(1:N_int,1,preinteresting(ii))) - mobMask(1:N_int,2) = iand(negMask(1:N_int,2), psi_det_sorted(1:N_int,2,preinteresting(ii))) + mobMask(1:N_int,1) = iand(negMask(1:N_int,1), psi_det_sorted(1:N_int,1,i)) + mobMask(1:N_int,2) = iand(negMask(1:N_int,2), psi_det_sorted(1:N_int,2,i)) nt = 0 do j=N_int,1,-1 if (mobMask(j,1) /= 0_bit_kind) then @@ -495,7 +498,6 @@ subroutine select_singles_and_doubles(i_generator,hole_mask,particle_mask,fock_d end select if(nt <= 4) then - i = preinteresting(ii) sze = interesting(0) if (sze+1 == size(interesting)) then allocate (tmp_array(0:sze)) @@ -556,13 +558,14 @@ subroutine select_singles_and_doubles(i_generator,hole_mask,particle_mask,fock_d allocate (fullminilist (N_int, 2, fullinteresting(0)), & minilist (N_int, 2, interesting(0)) ) if(pert_2rdm)then - allocate(coef_fullminilist_rev(N_states,fullinteresting(0))) - do i=1,fullinteresting(0) - do j = 1, N_states - coef_fullminilist_rev(j,i) = psi_coef_sorted(fullinteresting(i),j) + allocate(coef_fullminilist_rev(N_states,fullinteresting(0))) + do i=1,fullinteresting(0) + do j = 1, N_states + coef_fullminilist_rev(j,i) = psi_coef_sorted(fullinteresting(i),j) + enddo enddo - enddo endif + do i=1,fullinteresting(0) fullminilist(1:N_int,1:2,i) = psi_det_sorted(1:N_int,1:2,fullinteresting(i)) enddo @@ -625,7 +628,7 @@ subroutine select_singles_and_doubles(i_generator,hole_mask,particle_mask,fock_d enddo deallocate(fullminilist,minilist) if(pert_2rdm)then - deallocate(coef_fullminilist_rev) + deallocate(coef_fullminilist_rev) endif enddo enddo @@ -707,8 +710,8 @@ subroutine fill_buffer_double(i_generator, sp, h1, h2, bannedOrb, banned, fock_d if(bannedOrb(p2, s2)) cycle if(banned(p1,p2)) cycle - - if( sum(abs(mat(1:N_states, p1, p2))) == 0d0) cycle + val = maxval(abs(mat(1:N_states, p1, p2))) + if( val == 0d0) cycle call apply_particles(mask, s1, p1, s2, p2, det, ok, N_int) if (do_only_cas) then @@ -946,6 +949,9 @@ subroutine get_d2(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) if(tip == 3) then puti = p(1, mi) if(bannedOrb(puti, mi)) return + h1 = h(1, ma) + h2 = h(2, ma) + do i = 1, 3 putj = p(i, ma) if(banned(putj,puti,bant)) cycle @@ -953,17 +959,21 @@ subroutine get_d2(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) i2 = turn3(2,i) p1 = p(i1, ma) p2 = p(i2, ma) - h1 = h(1, ma) - h2 = h(2, ma) - hij = (mo_two_e_integral(p1, p2, h1, h2) - mo_two_e_integral(p2,p1, h1, h2)) * get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2, N_int) + hij = mo_two_e_integral(p1, p2, h1, h2) - mo_two_e_integral(p2, p1, h1, h2) + if (hij == 0.d0) cycle + + hij = hij * get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2, N_int) + if(ma == 1) then + !DIR$ LOOP COUNT AVG(4) do k=1,N_states - mat(k, putj, puti) = mat(k, putj, puti) +coefs(k) * hij + mat(k, putj, puti) = mat(k, putj, puti) + coefs(k) * hij enddo else + !DIR$ LOOP COUNT AVG(4) do k=1,N_states - mat(k, puti, putj) = mat(k, puti, putj) +coefs(k) * hij + mat(k, puti, putj) = mat(k, puti, putj) + coefs(k) * hij enddo end if end do @@ -980,10 +990,14 @@ subroutine get_d2(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) if(banned(puti,putj,bant) .or. bannedOrb(puti,1)) cycle p1 = p(turn2(i), 1) - hij = mo_two_e_integral(p1, p2, h1, h2) * get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2, N_int) - do k=1,N_states - mat(k, puti, putj) = mat(k, puti, putj) +coefs(k) * hij - enddo + hij = mo_two_e_integral(p1, p2, h1, h2) + if (hij /= 0.d0) then + hij = hij * get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2, N_int) + !DIR$ LOOP COUNT AVG(4) + do k=1,N_states + mat(k, puti, putj) = mat(k, puti, putj) + coefs(k) * hij + enddo + endif end do end do end if @@ -993,22 +1007,26 @@ subroutine get_d2(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) h1 = h(1, ma) h2 = h(2, ma) do i=1,3 - puti = p(i, ma) - if(bannedOrb(puti,ma)) cycle - do j=i+1,4 - putj = p(j, ma) - if(bannedOrb(putj,ma)) cycle - if(banned(puti,putj,1)) cycle + puti = p(i, ma) + if(bannedOrb(puti,ma)) cycle + do j=i+1,4 + putj = p(j, ma) + if(bannedOrb(putj,ma)) cycle + if(banned(puti,putj,1)) cycle - i1 = turn2d(1, i, j) - i2 = turn2d(2, i, j) - p1 = p(i1, ma) - p2 = p(i2, ma) - hij = (mo_two_e_integral(p1, p2, h1, h2) - mo_two_e_integral(p2,p1, h1, h2)) * get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2, N_int) - do k=1,N_states - mat(k, puti, putj) = mat(k, puti, putj) +coefs(k) * hij - enddo - end do + i1 = turn2d(1, i, j) + i2 = turn2d(2, i, j) + p1 = p(i1, ma) + p2 = p(i2, ma) + hij = mo_two_e_integral(p1, p2, h1, h2) - mo_two_e_integral(p2,p1, h1, h2) + if (hij == 0.d0) cycle + + hij = hij * get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2, N_int) + !DIR$ LOOP COUNT AVG(4) + do k=1,N_states + mat(k, puti, putj) = mat(k, puti, putj) +coefs(k) * hij + enddo + end do end do else if(tip == 3) then h1 = h(1, mi) @@ -1022,10 +1040,21 @@ subroutine get_d2(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) if(banned(puti,putj,1)) cycle p2 = p(i, ma) - hij = mo_two_e_integral(p1, p2, h1, h2) * get_phase_bi(phasemask, mi, ma, h1, p1, h2, p2, N_int) - do k=1,N_states - mat(k, min(puti, putj), max(puti, putj)) = mat(k, min(puti, putj), max(puti, putj)) + coefs(k) * hij - enddo + hij = mo_two_e_integral(p1, p2, h1, h2) + if (hij == 0.d0) cycle + + hij = hij * get_phase_bi(phasemask, mi, ma, h1, p1, h2, p2, N_int) + if (puti < putj) then + !DIR$ LOOP COUNT AVG(4) + do k=1,N_states + mat(k, puti, putj) = mat(k, puti, putj) + coefs(k) * hij + enddo + else + !DIR$ LOOP COUNT AVG(4) + do k=1,N_states + mat(k, putj, puti) = mat(k, putj, puti) + coefs(k) * hij + enddo + endif end do else ! tip == 4 puti = p(1, sp) @@ -1035,10 +1064,14 @@ subroutine get_d2(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) p2 = p(2, mi) h1 = h(1, mi) h2 = h(2, mi) - hij = (mo_two_e_integral(p1, p2, h1, h2) - mo_two_e_integral(p2,p1, h1, h2)) * get_phase_bi(phasemask, mi, mi, h1, p1, h2, p2, N_int) - do k=1,N_states - mat(k, puti, putj) = mat(k, puti, putj) +coefs(k) * hij - enddo + hij = (mo_two_e_integral(p1, p2, h1, h2) - mo_two_e_integral(p2,p1, h1, h2)) + if (hij /= 0.d0) then + hij = hij * get_phase_bi(phasemask, mi, mi, h1, p1, h2, p2, N_int) + !DIR$ LOOP COUNT AVG(4) + do k=1,N_states + mat(k, puti, putj) = mat(k, puti, putj) + coefs(k) * hij + enddo + end if end if end if end if @@ -1061,7 +1094,7 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) logical, allocatable :: lbanned(:,:) integer :: puti, putj, ma, mi, s1, s2, i, i1, i2, j - integer :: hfix, pfix, h1, h2, p1, p2, ib, k + integer :: hfix, pfix, h1, h2, p1, p2, ib, k, l integer, parameter :: turn2(2) = (/2,1/) integer, parameter :: turn3(2,3) = reshape((/2,3, 1,3, 1,2/), (/2,3/)) @@ -1105,7 +1138,10 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) hij = hij_cache(putj,1) - hij_cache(putj,2) if (hij /= 0.d0) then hij = hij * get_phase_bi(phasemask, ma, ma, putj, p1, hfix, p2, N_int) - tmp_row(1:N_states,putj) = tmp_row(1:N_states,putj) + hij * coefs(1:N_states) + !DIR$ LOOP COUNT AVG(4) + do k=1,N_states + tmp_row(k,putj) = tmp_row(k,putj) + hij * coefs(k) + enddo endif end do do putj=hfix+1, mo_num @@ -1114,14 +1150,22 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) hij = hij_cache(putj,2) - hij_cache(putj,1) if (hij /= 0.d0) then hij = hij * get_phase_bi(phasemask, ma, ma, hfix, p1, putj, p2, N_int) - tmp_row(1:N_states,putj) = tmp_row(1:N_states,putj) + hij * coefs(1:N_states) + !DIR$ LOOP COUNT AVG(4) + do k=1,N_states + tmp_row(k,putj) = tmp_row(k,putj) + hij * coefs(k) + enddo endif end do if(ma == 1) then mat(1:N_states,1:mo_num,puti) = mat(1:N_states,1:mo_num,puti) + tmp_row(1:N_states,1:mo_num) else - mat(1:N_states,puti,1:mo_num) = mat(1:N_states,puti,1:mo_num) + tmp_row(1:N_states,1:mo_num) + do l=1,mo_num + !DIR$ LOOP COUNT AVG(4) + do k=1,N_states + mat(k,puti,l) = mat(k,puti,l) + tmp_row(k,l) + enddo + enddo end if end if @@ -1132,7 +1176,7 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) call get_mo_two_e_integrals(hfix,pfix,p1,mo_num,hij_cache(1,1),mo_integrals_map) call get_mo_two_e_integrals(hfix,pfix,p2,mo_num,hij_cache(1,2),mo_integrals_map) putj = p1 - do puti=1,mo_num + do puti=1,mo_num !HOT if(lbanned(puti,mi)) cycle !p1 fixed putj = p1 @@ -1140,13 +1184,16 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) hij = hij_cache(puti,2) if (hij /= 0.d0) then hij = hij * get_phase_bi(phasemask, ma, mi, hfix, p2, puti, pfix, N_int) + !DIR$ LOOP COUNT AVG(4) do k=1,N_states tmp_row(k,puti) = tmp_row(k,puti) + hij * coefs(k) enddo endif end if - +! enddo +! putj = p2 +! do puti=1,mo_num !HOT if(.not. banned(putj,puti,bant)) then hij = hij_cache(puti,1) if (hij /= 0.d0) then @@ -1162,8 +1209,13 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) mat(:,:,p1) = mat(:,:,p1) + tmp_row(:,:) mat(:,:,p2) = mat(:,:,p2) + tmp_row2(:,:) else - mat(:,p1,:) = mat(:,p1,:) + tmp_row(:,:) - mat(:,p2,:) = mat(:,p2,:) + tmp_row2(:,:) + do l=1,mo_num + !DIR$ LOOP COUNT AVG(4) + do k=1,N_states + mat(k,p1,l) = mat(k,p1,l) + tmp_row(k,l) + mat(k,p2,l) = mat(k,p2,l) + tmp_row2(k,l) + enddo + enddo end if else ! sp /= 3 @@ -1197,7 +1249,12 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) end do mat(:, :puti-1, puti) = mat(:, :puti-1, puti) + tmp_row(:,:puti-1) - mat(:, puti, puti:) = mat(:, puti,puti:) + tmp_row(:,puti:) + do l=puti,mo_num + !DIR$ LOOP COUNT AVG(4) + do k=1,N_states + mat(k, puti, l) = mat(k, puti,l) + tmp_row(k,l) + enddo + enddo end do else hfix = h(1,mi) @@ -1216,6 +1273,7 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) hij = hij_cache(puti,1) if (hij /= 0.d0) then hij = hij * get_phase_bi(phasemask, mi, ma, hfix, pfix, puti, p1, N_int) + !DIR$ LOOP COUNT AVG(4) do k=1,N_states tmp_row(k,puti) = tmp_row(k,puti) + hij * coefs(k) enddo @@ -1234,9 +1292,19 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) end if end do mat(:,:p2-1,p2) = mat(:,:p2-1,p2) + tmp_row(:,:p2-1) - mat(:,p2,p2:) = mat(:,p2,p2:) + tmp_row(:,p2:) + do l=p2,mo_num + !DIR$ LOOP COUNT AVG(4) + do k=1,N_states + mat(k,p2,l) = mat(k,p2,l) + tmp_row(k,l) + enddo + enddo mat(:,:p1-1,p1) = mat(:,:p1-1,p1) + tmp_row2(:,:p1-1) - mat(:,p1,p1:) = mat(:,p1,p1:) + tmp_row2(:,p1:) + do l=p1,mo_num + !DIR$ LOOP COUNT AVG(4) + do k=1,N_states + mat(k,p1,l) = mat(k,p1,l) + tmp_row2(k,l) + enddo + enddo end if end if deallocate(lbanned,hij_cache) @@ -1259,7 +1327,10 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) if(bannedOrb(p1, s1) .or. bannedOrb(p2, s2) .or. banned(p1, p2, 1)) cycle call apply_particles(mask, s1, p1, s2, p2, det, ok, N_int) call i_h_j(gen, det, N_int, hij) - mat(:, p1, p2) = mat(:, p1, p2) + coefs(:) * hij + !DIR$ LOOP COUNT AVG(4) + do k=1,N_states + mat(k, p1, p2) = mat(k, p1, p2) + coefs(k) * hij + enddo end do end do end @@ -1303,34 +1374,15 @@ subroutine get_d0(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) call i_h_j(gen, det, N_int, hij) else phase = get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2, N_int) -! hij = mo_two_e_integral(p2, p1, h2, h1) * phase hij = hij_cache1(p2) * phase end if if (hij == 0.d0) cycle + !DIR$ LOOP COUNT AVG(4) do k=1,N_states mat(k, p1, p2) = mat(k, p1, p2) + coefs(k) * hij ! HOTSPOT enddo end do end do -! do p2=1, mo_num -! if(bannedOrb(p2,2)) cycle -! call get_mo_two_e_integrals(p2,h1,h2,mo_num,hij_cache1,mo_integrals_map) -! do p1=1, mo_num -! if(bannedOrb(p1, 1) .or. banned(p1, p2, bant)) cycle -! if(p1 /= h1 .and. p2 /= h2) then -! if (hij_cache1(p1) == 0.d0) cycle -! phase = get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2, N_int) -! hij = hij_cache1(p1) * phase -! else -! call apply_particles(mask, 1,p1,2,p2, det, ok, N_int) -! call i_h_j(gen, det, N_int, hij) -! if (hij == 0.d0) cycle -! end if -! do k=1,N_states -! mat(k, p1, p2) = mat(k, p1, p2) + coefs(k) * hij ! HOTSPOT -! enddo -! end do -! end do else ! AA BB p1 = p(1,sp) @@ -1345,31 +1397,16 @@ subroutine get_d0(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) if(puti == p1 .or. putj == p2 .or. puti == p2 .or. putj == p1) then call apply_particles(mask, sp,puti,sp,putj, det, ok, N_int) call i_h_j(gen, det, N_int, hij) + if (hij == 0.d0) cycle else - hij = (mo_two_e_integral(p1, p2, puti, putj) - mo_two_e_integral(p2, p1, puti, putj))* get_phase_bi(phasemask, sp, sp, puti, p1 , putj, p2, N_int) + hij = (mo_two_e_integral(p1, p2, puti, putj) - mo_two_e_integral(p2, p1, puti, putj)) + if (hij == 0.d0) cycle + hij = hij * get_phase_bi(phasemask, sp, sp, puti, p1 , putj, p2, N_int) end if - if (hij == 0.d0) cycle + !DIR$ LOOP COUNT AVG(4) do k=1,N_states mat(k, puti, putj) = mat(k, puti, putj) + coefs(k) * hij enddo -! if(bannedOrb(putj, sp) .or. banned(putj, sp, bant)) cycle -! if(puti /= p1 .and. putj /= p2 .and. puti /= p2 .and. putj /= p1) then -! hij = hij_cache1(putj) - hij_cache2(putj) -! if (hij /= 0.d0) then -! hij = hij * get_phase_bi(phasemask, sp, sp, puti, p1 , putj, p2, N_int) -! do k=1,N_states -! mat(k, puti, putj) = mat(k, puti, putj) + coefs(k) * hij -! enddo -! endif -! else -! call apply_particles(mask, sp,puti,sp,putj, det, ok, N_int) -! call i_h_j(gen, det, N_int, hij) -! if (hij /= 0.d0) then -! do k=1,N_states -! mat(k, puti, putj) = mat(k, puti, putj) + coefs(k) * hij -! enddo -! endif -! end if end do end do end if @@ -1403,8 +1440,8 @@ subroutine past_d2(banned, p, sp) integer :: i,j if(sp == 3) then - do i=1,p(0,1) - do j=1,p(0,2) + do j=1,p(0,2) + do i=1,p(0,1) banned(p(i,1), p(j,2)) = .true. end do end do diff --git a/src/davidson/EZFIO.cfg b/src/davidson/EZFIO.cfg index 67d7786a..393b25f1 100644 --- a/src/davidson/EZFIO.cfg +++ b/src/davidson/EZFIO.cfg @@ -6,7 +6,7 @@ default: 1.e-10 [n_states_diag] type: States_number -doc: Number of states to consider during the Davdison diagonalization +doc: Controls the number of states to consider during the Davdison diagonalization. The number of states is n_states * n_states_diag default: 4 interface: ezfio,ocaml diff --git a/src/davidson/davidson_parallel.irp.f b/src/davidson/davidson_parallel.irp.f index 9e4402f6..c0d94b35 100644 --- a/src/davidson/davidson_parallel.irp.f +++ b/src/davidson/davidson_parallel.irp.f @@ -428,7 +428,7 @@ subroutine H_S2_u_0_nstates_zmq(v_0,s_0,u_0,N_st,sze) integer :: istep, imin, imax, ishift, ipos integer, external :: add_task_to_taskserver - integer, parameter :: tasksize=40000 + integer, parameter :: tasksize=10000 character*(100000) :: task istep=1 ishift=0 diff --git a/src/davidson/diagonalization_hs2_dressed.irp.f b/src/davidson/diagonalization_hs2_dressed.irp.f index ec9a194e..67c466a8 100644 --- a/src/davidson/diagonalization_hs2_dressed.irp.f +++ b/src/davidson/diagonalization_hs2_dressed.irp.f @@ -161,7 +161,7 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_ stop -1 endif - itermax = max(3,min(davidson_sze_max, sze/N_st_diag)) + itermax = max(2,min(davidson_sze_max, sze/N_st_diag))+1 itertot = 0 if (state_following) then @@ -322,6 +322,12 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_ call normalize(u_in(1,k),sze) enddo + do k=1,N_st_diag + do i=1,sze + U(i,k) = u_in(i,k) + enddo + enddo + do while (.not.converged) itertot = itertot+1 @@ -329,30 +335,33 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_ exit endif - do k=1,N_st_diag - do i=1,sze - U(i,k) = u_in(i,k) - enddo - enddo - do iter=1,itermax-1 shift = N_st_diag*(iter-1) shift2 = N_st_diag*iter - call ortho_qr(U,size(U,1),sze,shift2) - call ortho_qr(U,size(U,1),sze,shift2) + if ((iter > 1).or.(itertot == 1)) then + ! Compute |W_k> = \sum_i |i> + ! ----------------------------------- - ! Compute |W_k> = \sum_i |i> - ! ----------------------------------------- + if (disk_based) then + call ortho_qr_unblocked(U,size(U,1),sze,shift2) + call ortho_qr_unblocked(U,size(U,1),sze,shift2) + else + call ortho_qr(U,size(U,1),sze,shift2) + call ortho_qr(U,size(U,1),sze,shift2) + endif - - if ((sze > 100000).and.distributed_davidson) then - call H_S2_u_0_nstates_zmq (W(1,shift+1),S_d,U(1,shift+1),N_st_diag,sze) + if ((sze > 100000).and.distributed_davidson) then + call H_S2_u_0_nstates_zmq (W(1,shift+1),S_d,U(1,shift+1),N_st_diag,sze) + else + call H_S2_u_0_nstates_openmp(W(1,shift+1),S_d,U(1,shift+1),N_st_diag,sze) + endif + S(1:sze,shift+1:shift+N_st_diag) = real(S_d(1:sze,1:N_st_diag)) else - call H_S2_u_0_nstates_openmp(W(1,shift+1),S_d,U(1,shift+1),N_st_diag,sze) + ! Already computed in update below + continue endif - S(1:sze,shift+1:shift+N_st_diag) = real(S_d(1:sze,1:N_st_diag)) if (dressing_state > 0) then @@ -408,7 +417,7 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_ ! Compute s_kl = = ! ------------------------------------------- - !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i,j,k) + !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i,j,k) COLLAPSE(2) do j=1,shift2 do i=1,shift2 s_(i,j) = 0.d0 @@ -563,6 +572,7 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_ ! Compute residual vector and davidson step ! ----------------------------------------- + !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i,k) do k=1,N_st_diag do i=1,sze U(i,shift2+k) = & @@ -577,9 +587,15 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_ to_print(3,k) = residual_norm(k) endif enddo + !$OMP END PARALLEL DO - write(*,'(1X,I3,1X,100(1X,F16.10,1X,F11.6,1X,E11.3))') iter, to_print(1:3,1:N_st) + if ((itertot>1).and.(iter == 1)) then + !don't print + continue + else + write(*,'(1X,I3,1X,100(1X,F16.10,1X,F11.6,1X,E11.3))') iter-1, to_print(1:3,1:N_st) + endif call davidson_converged(lambda,residual_norm,wall,iter,cpu,N_st,converged) do k=1,N_st if (residual_norm(k) > 1.e8) then @@ -600,11 +616,56 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_ enddo - ! Re-contract to u_in - ! ----------- + ! Re-contract U and update S and W + ! -------------------------------- + + call sgemm('N','N', sze, N_st_diag, shift2, 1., & + S, size(S,1), y_s, size(y_s,1), 0., S(1,shift2+1), size(S,1)) + do k=1,N_st_diag + do i=1,sze + S(i,k) = S(i,shift2+k) + enddo + enddo + + call dgemm('N','N', sze, N_st_diag, shift2, 1.d0, & + W, size(W,1), y, size(y,1), 0.d0, u_in, size(u_in,1)) + do k=1,N_st_diag + do i=1,sze + W(i,k) = u_in(i,k) + enddo + enddo call dgemm('N','N', sze, N_st_diag, shift2, 1.d0, & U, size(U,1), y, size(y,1), 0.d0, u_in, size(u_in,1)) + do k=1,N_st_diag + do i=1,sze + U(i,k) = u_in(i,k) + enddo + enddo + if (disk_based) then + call ortho_qr_unblocked(U,size(U,1),sze,N_st_diag) + call ortho_qr_unblocked(U,size(U,1),sze,N_st_diag) + else + call ortho_qr(U,size(U,1),sze,N_st_diag) + call ortho_qr(U,size(U,1),sze,N_st_diag) + endif + do j=1,N_st_diag + k=1 + do while ((k n_singles_a) exit + l_a = singles_a(k+kk) + ASSERT (l_a <= N_det) - lrow = psi_bilinear_matrix_rows(l_a) - ASSERT (lrow <= N_det_alpha_unique) + do l=1,N_st + utl(l,kk+1) = u_t(l,l_a) + enddo + enddo - tmp_det2(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, lrow) - call i_H_j_double_alpha_beta(tmp_det,tmp_det2,$N_int,hij) - call get_s2(tmp_det,tmp_det2,$N_int,sij) - !DIR$ LOOP COUNT AVG(4) - do l=1,N_st - v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,l_a) - s_t(l,k_a) = s_t(l,k_a) + sij * u_t(l,l_a) + do kk=0,block_size-1 + if (k+kk > n_singles_a) exit + l_a = singles_a(k+kk) + lrow = psi_bilinear_matrix_rows(l_a) + ASSERT (lrow <= N_det_alpha_unique) + + tmp_det2(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, lrow) + call i_H_j_double_alpha_beta(tmp_det,tmp_det2,$N_int,hij) + call get_s2(tmp_det,tmp_det2,$N_int,sij) + !DIR$ LOOP COUNT AVG(4) + do l=1,N_st + v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1) + s_t(l,k_a) = s_t(l,k_a) + sij * utl(l,kk+1) + enddo enddo enddo @@ -475,20 +489,32 @@ compute_singles=.True. tmp_det2(1:$N_int,2) = psi_det_beta_unique (1:$N_int, kcol) !DIR$ LOOP COUNT avg(1000) - do i=1,n_singles_a - l_a = singles_a(i) - ASSERT (l_a <= N_det) + do i=1,n_singles_a,block_size + ! Prefetch u_t(:,l_a) + do kk=0,block_size-1 + if (i+kk > n_singles_a) exit + l_a = singles_a(i+kk) + ASSERT (l_a <= N_det) - lrow = psi_bilinear_matrix_rows(l_a) - ASSERT (lrow <= N_det_alpha_unique) + do l=1,N_st + utl(l,kk+1) = u_t(l,l_a) + enddo + enddo - tmp_det2(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, lrow) - call i_h_j_single_spin( tmp_det, tmp_det2, $N_int, 1, hij) + do kk=0,block_size-1 + if (i+kk > n_singles_a) exit + l_a = singles_a(i+kk) + lrow = psi_bilinear_matrix_rows(l_a) + ASSERT (lrow <= N_det_alpha_unique) - !DIR$ LOOP COUNT AVG(4) - do l=1,N_st - v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,l_a) - ! single => sij = 0 + tmp_det2(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, lrow) + call i_h_j_single_spin( tmp_det, tmp_det2, $N_int, 1, hij) + + !DIR$ LOOP COUNT AVG(4) + do l=1,N_st + v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1) + ! single => sij = 0 + enddo enddo enddo @@ -497,18 +523,30 @@ compute_singles=.True. ! ---------------------------------- !DIR$ LOOP COUNT avg(50000) - do i=1,n_doubles - l_a = doubles(i) - ASSERT (l_a <= N_det) + do i=1,n_doubles,block_size + ! Prefetch u_t(:,l_a) + do kk=0,block_size-1 + if (i+kk > n_doubles) exit + l_a = doubles(i+kk) + ASSERT (l_a <= N_det) - lrow = psi_bilinear_matrix_rows(l_a) - ASSERT (lrow <= N_det_alpha_unique) + do l=1,N_st + utl(l,kk+1) = u_t(l,l_a) + enddo + enddo - call i_H_j_double_spin( tmp_det(1,1), psi_det_alpha_unique(1, lrow), $N_int, hij) - !DIR$ LOOP COUNT AVG(4) - do l=1,N_st - v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,l_a) - ! same spin => sij = 0 + do kk=0,block_size-1 + if (i+kk > n_doubles) exit + l_a = doubles(i+kk) + lrow = psi_bilinear_matrix_rows(l_a) + ASSERT (lrow <= N_det_alpha_unique) + + call i_H_j_double_spin( tmp_det(1,1), psi_det_alpha_unique(1, lrow), $N_int, hij) + !DIR$ LOOP COUNT AVG(4) + do l=1,N_st + v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1) + ! same spin => sij = 0 + enddo enddo enddo @@ -560,21 +598,34 @@ compute_singles=.True. tmp_det2(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, krow) !DIR$ LOOP COUNT avg(1000) - do i=1,n_singles_b - l_b = singles_b(i) - ASSERT (l_b <= N_det) + do i=1,n_singles_b,block_size + do kk=0,block_size-1 + if (i+kk > n_singles_b) exit + l_b = singles_b(i+kk) + ASSERT (l_b <= N_det) - lcol = psi_bilinear_matrix_transp_columns(l_b) - ASSERT (lcol <= N_det_beta_unique) + l_a = psi_bilinear_matrix_transp_order(l_b) + ASSERT (l_a <= N_det) - tmp_det2(1:$N_int,2) = psi_det_beta_unique (1:$N_int, lcol) - call i_h_j_single_spin( tmp_det, tmp_det2, $N_int, 2, hij) - l_a = psi_bilinear_matrix_transp_order(l_b) - ASSERT (l_a <= N_det) - !DIR$ LOOP COUNT AVG(4) - do l=1,N_st - v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,l_a) - ! single => sij = 0 + do l=1,N_st + utl(l,kk+1) = u_t(l,l_a) + enddo + enddo + + do kk=0,block_size-1 + if (i+kk > n_singles_b) exit + l_b = singles_b(i+kk) + l_a = psi_bilinear_matrix_transp_order(l_b) + lcol = psi_bilinear_matrix_transp_columns(l_b) + ASSERT (lcol <= N_det_beta_unique) + + tmp_det2(1:$N_int,2) = psi_det_beta_unique (1:$N_int, lcol) + call i_h_j_single_spin( tmp_det, tmp_det2, $N_int, 2, hij) + !DIR$ LOOP COUNT AVG(4) + do l=1,N_st + v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1) + ! single => sij = 0 + enddo enddo enddo @@ -582,21 +633,33 @@ compute_singles=.True. ! ---------------------------------- !DIR$ LOOP COUNT avg(50000) - do i=1,n_doubles - l_b = doubles(i) - ASSERT (l_b <= N_det) + do i=1,n_doubles,block_size + do kk=0,block_size-1 + if (i+kk > n_doubles) exit + l_b = doubles(i+kk) + ASSERT (l_b <= N_det) + l_a = psi_bilinear_matrix_transp_order(l_b) + ASSERT (l_a <= N_det) - lcol = psi_bilinear_matrix_transp_columns(l_b) - ASSERT (lcol <= N_det_beta_unique) + do l=1,N_st + utl(l,kk+1) = u_t(l,l_a) + enddo + enddo - call i_H_j_double_spin( tmp_det(1,2), psi_det_beta_unique(1, lcol), $N_int, hij) - l_a = psi_bilinear_matrix_transp_order(l_b) - ASSERT (l_a <= N_det) + do kk=0,block_size-1 + if (i+kk > n_doubles) exit + l_b = doubles(i+kk) + l_a = psi_bilinear_matrix_transp_order(l_b) + lcol = psi_bilinear_matrix_transp_columns(l_b) + ASSERT (lcol <= N_det_beta_unique) - !DIR$ LOOP COUNT AVG(4) - do l=1,N_st - v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,l_a) - ! same spin => sij = 0 + call i_H_j_double_spin( tmp_det(1,2), psi_det_beta_unique(1, lcol), $N_int, hij) + + !DIR$ LOOP COUNT AVG(4) + do l=1,N_st + v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1) + ! same spin => sij = 0 + enddo enddo enddo @@ -629,7 +692,7 @@ compute_singles=.True. end do !$OMP END DO - deallocate(buffer, singles_a, singles_b, doubles, idx) + deallocate(buffer, singles_a, singles_b, doubles, idx, utl) !$OMP END PARALLEL end diff --git a/src/determinants/single_excitations.irp.f b/src/determinants/single_excitations.irp.f index 03ff0f7e..ccfeaa2e 100644 --- a/src/determinants/single_excitations.irp.f +++ b/src/determinants/single_excitations.irp.f @@ -108,6 +108,11 @@ subroutine get_single_excitation_from_fock(det_1,det_2,h,p,spin,phase,hij) integer :: occ_partcl(N_int*bit_kind_size,2) integer :: n_occ_ab_hole(2),n_occ_ab_partcl(2) integer :: i0,i + double precision :: buffer_c(mo_num),buffer_x(mo_num) + do i=1, mo_num + buffer_c(i) = big_array_coulomb_integrals(i,h,p) + buffer_x(i) = big_array_exchange_integrals(i,h,p) + enddo do i = 1, N_int differences(i,1) = xor(det_1(i,1),ref_closed_shell_bitmask(i,1)) differences(i,2) = xor(det_1(i,2),ref_closed_shell_bitmask(i,2)) @@ -122,33 +127,33 @@ subroutine get_single_excitation_from_fock(det_1,det_2,h,p,spin,phase,hij) ! holes :: direct terms do i0 = 1, n_occ_ab_hole(1) i = occ_hole(i0,1) - hij -= big_array_coulomb_integrals(i,h,p) + hij -= buffer_c(i) enddo do i0 = 1, n_occ_ab_hole(2) i = occ_hole(i0,2) - hij -= big_array_coulomb_integrals(i,h,p) + hij -= buffer_c(i) enddo ! holes :: exchange terms do i0 = 1, n_occ_ab_hole(spin) i = occ_hole(i0,spin) - hij += big_array_exchange_integrals(i,h,p) + hij += buffer_x(i) enddo ! particles :: direct terms do i0 = 1, n_occ_ab_partcl(1) i = occ_partcl(i0,1) - hij += big_array_coulomb_integrals(i,h,p) + hij += buffer_c(i) enddo do i0 = 1, n_occ_ab_partcl(2) i = occ_partcl(i0,2) - hij += big_array_coulomb_integrals(i,h,p) + hij += buffer_c(i) enddo ! particles :: exchange terms do i0 = 1, n_occ_ab_partcl(spin) i = occ_partcl(i0,spin) - hij -= big_array_exchange_integrals(i,h,p) + hij -= buffer_x(i) enddo hij = hij * phase diff --git a/src/utils/linear_algebra.irp.f b/src/utils/linear_algebra.irp.f index 357f169a..615f67d0 100644 --- a/src/utils/linear_algebra.irp.f +++ b/src/utils/linear_algebra.irp.f @@ -138,18 +138,27 @@ subroutine ortho_qr(A,LDA,m,n) double precision, intent(inout) :: A(LDA,n) integer :: lwork, info - integer, allocatable :: jpvt(:) - double precision, allocatable :: tau(:), work(:) + double precision, allocatable :: TAU(:), WORK(:) + + allocate (TAU(n), WORK(1)) - allocate (jpvt(n), tau(n), work(1)) LWORK=-1 call dgeqrf( m, n, A, LDA, TAU, WORK, LWORK, INFO ) - LWORK=2*int(WORK(1)) + LWORK=int(WORK(1)) + deallocate(WORK) allocate(WORK(LWORK)) call dgeqrf(m, n, A, LDA, TAU, WORK, LWORK, INFO ) - call dorgqr(m, n, n, A, LDA, tau, WORK, LWORK, INFO) - deallocate(WORK,jpvt,tau) + + LWORK=-1 + call dorgqr(m, n, n, A, LDA, TAU, WORK, LWORK, INFO) + LWORK=int(WORK(1)) + + deallocate(WORK) + allocate(WORK(LWORK)) + call dorgqr(m, n, n, A, LDA, TAU, WORK, LWORK, INFO) + + deallocate(WORK,TAU) end subroutine ortho_qr_unblocked(A,LDA,m,n) @@ -170,13 +179,12 @@ subroutine ortho_qr_unblocked(A,LDA,m,n) double precision, intent(inout) :: A(LDA,n) integer :: info - integer, allocatable :: jpvt(:) - double precision, allocatable :: tau(:), work(:) + double precision, allocatable :: TAU(:), WORK(:) - allocate (jpvt(n), tau(n), work(n)) + allocate (TAU(n), WORK(n)) call dgeqr2( m, n, A, LDA, TAU, WORK, INFO ) - call dorg2r(m, n, n, A, LDA, tau, WORK, INFO) - deallocate(WORK,jpvt,tau) + call dorg2r(m, n, n, A, LDA, TAU, WORK, INFO) + deallocate(WORK,TAU) end subroutine ortho_lowdin(overlap,LDA,N,C,LDC,m) diff --git a/src/utils/memory.irp.f b/src/utils/memory.irp.f index 3e077f97..3ea242b0 100644 --- a/src/utils/memory.irp.f +++ b/src/utils/memory.irp.f @@ -99,6 +99,7 @@ subroutine check_mem(rss_in,routine) rss += rss_in if (int(rss)+1 > qp_max_mem) then print *, 'Not enough memory: aborting in ', routine + print *, int(rss)+1, ' GB required' stop -1 endif !$OMP END CRITICAL