diff --git a/ezfio_config/qmc.config b/ezfio_config/qmc.config index bdddf2f..d19bd07 100644 --- a/ezfio_config/qmc.config +++ b/ezfio_config/qmc.config @@ -11,9 +11,6 @@ mo_basis mo_tot_num integer mo_coef real (ao_basis_ao_num,mo_basis_mo_tot_num) mo_classif character (mo_basis_mo_tot_num) - mo_closed_num integer =n_count_ch(mo_basis_mo_classif,size(mo_basis_mo_classif),'c') - mo_active_num integer =n_count_ch(mo_basis_mo_classif,size(mo_basis_mo_classif),'a') - mo_virtual_num integer =n_count_ch(mo_basis_mo_classif,size(mo_basis_mo_classif),'v') mo_energy real (mo_basis_mo_tot_num) mo_occ real (mo_basis_mo_tot_num) mo_symmetry character*(8) (mo_basis_mo_tot_num) diff --git a/src/AO/ao_axis.irp.f b/src/AO/ao_axis.irp.f index 5216ded..9cfde0f 100644 --- a/src/AO/ao_axis.irp.f +++ b/src/AO/ao_axis.irp.f @@ -48,11 +48,11 @@ subroutine pow_l(r,a,x1,x2,x3) end function - BEGIN_PROVIDER [ real, ao_axis_block, ((-2*simd_sp+1):ao_block_num_8) ] -&BEGIN_PROVIDER [ real, ao_axis_grad_block_x, ((-2*simd_sp+1):ao_block_num_8) ] -&BEGIN_PROVIDER [ real, ao_axis_grad_block_y, ((-2*simd_sp+1):ao_block_num_8) ] -&BEGIN_PROVIDER [ real, ao_axis_grad_block_z, ((-2*simd_sp+1):ao_block_num_8) ] -&BEGIN_PROVIDER [ real, ao_axis_lapl_block, ((-2*simd_sp+1):ao_block_num_8) ] + BEGIN_PROVIDER [ real, ao_axis_block, (ao_block_num_8) ] +&BEGIN_PROVIDER [ real, ao_axis_grad_block_x, (ao_block_num_8) ] +&BEGIN_PROVIDER [ real, ao_axis_grad_block_y, (ao_block_num_8) ] +&BEGIN_PROVIDER [ real, ao_axis_grad_block_z, (ao_block_num_8) ] +&BEGIN_PROVIDER [ real, ao_axis_lapl_block, (ao_block_num_8) ] implicit none include '../types.F' @@ -111,13 +111,13 @@ end function ao_axis_block(idx) = p023 * p10 p023 = real_of_int(pow1) * p023 + ao_axis_grad_block_x(idx) = p023 * p11 + ao_axis_grad_block_y(idx) = p013 * p21 + ao_axis_grad_block_z(idx) = p012 * p31 ao_axis_lapl_block(idx) = real_of_int(pow1-1) * p023 * p12 & + real_of_int(pow2-1) * p013 * p22 & + real_of_int(pow3-1) * p012 * p32 - ao_axis_grad_block_x(idx) = p023 * p11 - ao_axis_grad_block_y(idx) = p013 * p21 - ao_axis_grad_block_z(idx) = p012 * p31 enddo diff --git a/src/SAMPLING/reconfigure.irp.f b/src/SAMPLING/reconfigure.irp.f index c9434fa..6c92502 100644 --- a/src/SAMPLING/reconfigure.irp.f +++ b/src/SAMPLING/reconfigure.irp.f @@ -15,7 +15,7 @@ subroutine reconfigure(ipos,w) tmp = tmp + w(k) enddo dwalk_num = dble(walk_num)/tmp - + integer :: kp, km kp=0 km=0 diff --git a/src/SAMPLING/srmc_step.irp.f b/src/SAMPLING/srmc_step.irp.f index fc98ab5..5e06331 100644 --- a/src/SAMPLING/srmc_step.irp.f +++ b/src/SAMPLING/srmc_step.irp.f @@ -44,8 +44,8 @@ END_SHELL real, allocatable :: elec_coord_tmp(:,:,:) integer :: mod_align - double precision :: E_loc_save(walk_num_dmc_max) - double precision :: E_loc_save_tmp(walk_num_dmc_max) + double precision :: E_loc_save(4,walk_num_dmc_max) + double precision :: E_loc_save_tmp(4,walk_num_dmc_max) double precision :: psi_value_save(walk_num) double precision :: psi_value_save_tmp(walk_num) double precision :: srmc_weight(walk_num) @@ -124,7 +124,7 @@ END_SHELL psi_grad_psi_inv_z(i) = psi_grad_psi_inv_save(i,3,i_walk) enddo psi_value = psi_value_save(i_walk) - E_loc = E_loc_save(i_walk) + E_loc = E_loc_save(1,i_walk) enddo SOFT_TOUCH elec_coord psi_grad_psi_inv_x psi_grad_psi_inv_y psi_grad_psi_inv_z psi_value E_loc else @@ -135,7 +135,7 @@ END_SHELL enddo TOUCH elec_coord psi_value_save(i_walk) = psi_value - E_loc_save(i_walk) = E_loc + E_loc_save(:,i_walk) = E_loc endif double precision :: p,q @@ -144,19 +144,54 @@ END_SHELL 
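E_loc_save and E_loc_save_tmp now carry the last four local energies of each walker (the history is shifted only on accepted moves), and the hunk below builds the branching weight from a weighted average of that history instead of the old single-step estimate ((E_loc+E_loc_save)*0.5d0 - E_ref)*p. The 9/24, 19/24, -5/24, 1/24 coefficients are those of the three-step Adams-Moulton formula (the commented alternatives are lower- and higher-order members of the same family), and the hard ±thr clipping of the old code is replaced by a smooth cap: exp(-dt*delta) for delta >= 0, 2-exp(dt*delta) otherwise. A standalone sketch of that construction, with illustrative names and values (e_hist, e_ref, p, dt are not from the patch):

program srmc_weight_sketch
  implicit none
  double precision :: e_hist(4)   ! e_hist(1) = current E_loc, e_hist(2:4) = E_loc_save(1:3,i_walk)
  double precision :: e_ref, p, dt, delta, w
  e_hist = (/ -14.60d0, -14.72d0, -14.65d0, -14.68d0 /)  ! illustrative local energies (a.u.)
  e_ref  = -14.66d0        ! reference (trial) energy
  p      = 0.95d0          ! acceptance probability returned by brownian_step
  dt     = 1.d-3           ! time step
  ! Three-step Adams-Moulton style average of the local energy along the move,
  ! same coefficients as the patch: (9*E(t) + 19*E(t-1) - 5*E(t-2) + E(t-3))/24
  delta = (9.d0*e_hist(1) + 19.d0*e_hist(2) - 5.d0*e_hist(3) + e_hist(4))/24.d0
  delta = (delta - e_ref)*p
  ! Bounded branching weight: plain exponential when delta >= 0,
  ! 2 - exp(+dt*delta) otherwise, so the weight can never exceed 2.
  if (delta >= 0.d0) then
     w = dexp(-dt*delta)
  else
     w = 2.d0 - dexp(dt*delta)
  endif
  print *, 'srmc_weight =', w
end program srmc_weight_sketch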
call brownian_step(p,q,accepted,delta_x) if ( psi_value * psi_value_save(i_walk) >= 0.d0 ) then - delta = ((E_loc+E_loc_save(i_walk))*0.5d0 - E_ref) * p - if ( delta > thr ) then - srmc_weight(i_walk) = dexp(-dtime_step*thr) - else if ( delta < -thr ) then - srmc_weight(i_walk) = dexp(dtime_step*thr) - else +! delta = (E_loc+E_loc_save(1,i_walk))*0.5d0 +! delta = (5.d0 * E_loc + 8.d0 * E_loc_save(1,i_walk) - E_loc_save(2,i_walk))/12.d0 + + delta = (9.d0*E_loc+19.d0*E_loc_save(1,i_walk)- & + 5.d0*E_loc_save(2,i_walk)+E_loc_save(3,i_walk))/24.d0 + +! delta = -((-251.d0*E_loc)-646.d0*E_loc_save(1,i_walk)+264.d0*E_loc_save(2,i_walk)-& +! 106.d0*E_loc_save(3,i_walk)+19.d0*E_loc_save(4,i_walk))/720.d0 + + delta = (delta - E_ref)*p + + if (delta >= 0.d0) then srmc_weight(i_walk) = dexp(-dtime_step*delta) + else + srmc_weight(i_walk) = 2.d0-dexp(dtime_step*delta) endif + + +! if (accepted) then +! ! Compute correction to past weights +! double precision :: delta_old, delta_new +! delta_old = (9.d0*E_loc_save(1,i_walk)+19.d0*E_loc_save(2,i_walk)-& +! 5.d0*E_loc_save(3,i_walk)+E_loc_save(4,i_walk))/24.d0 - E_ref +! +! +! if (delta_old >= 0.d0) then +! srmc_weight(i_walk) = srmc_weight(i_walk) * dexp(dtime_step*delta_old) +! else +! srmc_weight(i_walk) = srmc_weight(i_walk) * (2.d0-dexp(-dtime_step*delta_old)) +! endif +! +! delta_new = (-(E_loc_save_tmp(3,i_walk)-13.d0*E_loc_save_tmp(2,i_walk)& +! -13.d0*E_loc_save_tmp(1,i_walk)+E_loc))/24.d0 - E_ref +! +! if (delta_new >= 0.d0) then +! srmc_weight(i_walk) = srmc_weight(i_walk) * dexp(-dtime_step*delta_new) +! else +! srmc_weight(i_walk) = srmc_weight(i_walk) * (2.d0-dexp(dtime_step*delta_new) ) +! endif +! +! endif + elec_coord(elec_num+1,1) += p*time_step elec_coord(elec_num+1,2) = E_loc elec_coord(elec_num+1,3) = srmc_weight(i_walk) * pop_weight_mult do l=1,3 do i=1,elec_num+1 + elec_coord_full(i,l,i_walk) = elec_coord(i,l) enddo enddo @@ -167,25 +202,30 @@ END_SHELL enddo psi_value_save(i_walk) = psi_value - E_loc_save(i_walk) = E_loc + if (accepted) then + E_loc_save(4,i_walk) = E_loc_save(3,i_walk) + E_loc_save(3,i_walk) = E_loc_save(2,i_walk) + E_loc_save(2,i_walk) = E_loc_save(1,i_walk) + E_loc_save(1,i_walk) = E_loc + endif BEGIN_SHELL [ /usr/bin/python ] from properties import * t = """ if (calc_$X) then ! Kahan's summation algorithm to compute these sums reducing the rounding error: - ! $X_srmc_block_walk += $X * pop_weight_mult * srmc_weight(i_walk) - ! $X_2_srmc_block_walk += $X_2 * pop_weight_mult * srmc_weight(i_walk) + ! $X_srmc_block_walk += $X * srmc_pop_weight_mult * srmc_weight(i_walk) + ! $X_2_srmc_block_walk += $X_2 * srmc_pop_weight_mult * srmc_weight(i_walk) ! 
see http://en.wikipedia.org/wiki/Kahan_summation_algorithm - $X_srmc_block_walk_kahan($D2 3) = $X * pop_weight_mult * srmc_weight(i_walk) - $X_srmc_block_walk_kahan($D2 1) + $X_srmc_block_walk_kahan($D2 3) = $X * srmc_pop_weight_mult * srmc_weight(i_walk) - $X_srmc_block_walk_kahan($D2 1) $X_srmc_block_walk_kahan($D2 2) = $X_srmc_block_walk $D1 + $X_srmc_block_walk_kahan($D2 3) $X_srmc_block_walk_kahan($D2 1) = ($X_srmc_block_walk_kahan($D2 2) - $X_srmc_block_walk $D1 ) & - $X_srmc_block_walk_kahan($D2 3) $X_srmc_block_walk $D1 = $X_srmc_block_walk_kahan($D2 2) - $X_2_srmc_block_walk_kahan($D2 3) = $X_2 * pop_weight_mult * srmc_weight(i_walk) - $X_2_srmc_block_walk_kahan($D2 1) + $X_2_srmc_block_walk_kahan($D2 3) = $X_2 * srmc_pop_weight_mult * srmc_weight(i_walk) - $X_2_srmc_block_walk_kahan($D2 1) $X_2_srmc_block_walk_kahan($D2 2) = $X_2_srmc_block_walk $D1 + $X_2_srmc_block_walk_kahan($D2 3) $X_2_srmc_block_walk_kahan($D2 1) = ($X_2_srmc_block_walk_kahan($D2 2) - $X_2_srmc_block_walk $D1 ) & - $X_2_srmc_block_walk_kahan($D2 3) @@ -203,7 +243,7 @@ for p in properties: END_SHELL - block_weight += pop_weight_mult * srmc_weight(i_walk) + block_weight += srmc_pop_weight_mult * srmc_weight(i_walk) else srmc_weight(i_walk) = 0.d0 @@ -221,15 +261,15 @@ END_SHELL ! Eventually, recompute the weight of the population if (srmc_projection_step == 1) then - pop_weight_mult = 1.d0 + srmc_pop_weight_mult = 1.d0 do k=1,srmc_projection - pop_weight_mult *= pop_weight(k) + srmc_pop_weight_mult *= srmc_pop_weight(k) enddo endif ! Remove contribution of the old value of the weight at the new ! projection step - pop_weight_mult *= 1.d0/pop_weight(srmc_projection_step) + srmc_pop_weight_mult *= 1.d0/srmc_pop_weight(srmc_projection_step) ! Compute the new weight of the population double precision :: sum_weight @@ -237,10 +277,10 @@ END_SHELL do k=1,walk_num sum_weight += srmc_weight(k) enddo - pop_weight(srmc_projection_step) = sum_weight/dble(walk_num) + srmc_pop_weight(srmc_projection_step) = sum_weight/dble(walk_num) ! Update the running population weight - pop_weight_mult *= pop_weight(srmc_projection_step) + srmc_pop_weight_mult *= srmc_pop_weight(srmc_projection_step) ! Reconfiguration integer :: ipos(walk_num) @@ -257,7 +297,7 @@ END_SHELL enddo enddo psi_value_save_tmp(k) = psi_value_save(k) - E_loc_save_tmp(k) = E_loc_save(k) + E_loc_save_tmp(:,k) = E_loc_save(:,k) enddo integer :: ipm @@ -272,7 +312,7 @@ END_SHELL enddo enddo psi_value_save(k) = psi_value_save_tmp(ipm) - E_loc_save(k) = E_loc_save_tmp(ipm) + E_loc_save(:,k) = E_loc_save_tmp(:,ipm) enddo call system_clock(cpu1, count_rate, count_max) @@ -287,7 +327,7 @@ END_SHELL cpu2 = cpu1 endif - SOFT_TOUCH elec_coord_full pop_weight_mult + SOFT_TOUCH elec_coord_full srmc_pop_weight_mult first_loop = .False. @@ -314,7 +354,7 @@ END_SHELL END_PROVIDER -BEGIN_PROVIDER [ double precision, pop_weight_mult ] +BEGIN_PROVIDER [ double precision, srmc_pop_weight_mult ] implicit none BEGIN_DOC ! Population weight of SRMC @@ -335,12 +375,12 @@ END_PROVIDER srmc_projection_step = 0 END_PROVIDER -BEGIN_PROVIDER [ double precision, pop_weight, (0:srmc_projection+1) ] +BEGIN_PROVIDER [ double precision, srmc_pop_weight, (0:srmc_projection+1) ] implicit none BEGIN_DOC ! 
Population weight of SRMC END_DOC - pop_weight = 1.d0 + srmc_pop_weight = 1.d0 END_PROVIDER diff --git a/src/det.irp.f b/src/det.irp.f index ef2db3d..0982d07 100644 --- a/src/det.irp.f +++ b/src/det.irp.f @@ -87,6 +87,106 @@ subroutine det_update(n,LDS,m,l,S,S_inv,d) 48;; 49;; 50;; + 51;; + 52;; + 53;; + 54;; + 55;; + 56;; + 57;; + 58;; + 59;; + 60;; + 61;; + 62;; + 63;; + 64;; + 65;; + 66;; + 67;; + 68;; + 69;; + 70;; + 71;; + 72;; + 73;; + 74;; + 75;; + 76;; + 77;; + 78;; + 79;; + 80;; + 81;; + 82;; + 83;; + 84;; + 85;; + 86;; + 87;; + 88;; + 89;; + 90;; + 91;; + 92;; + 93;; + 94;; + 95;; + 96;; + 97;; + 98;; + 99;; + 100;; + 101;; + 102;; + 103;; + 104;; + 105;; + 106;; + 107;; + 108;; + 109;; + 110;; + 111;; + 112;; + 113;; + 114;; + 115;; + 116;; + 117;; + 118;; + 119;; + 120;; + 121;; + 122;; + 123;; + 124;; + 125;; + 126;; + 127;; + 128;; + 129;; + 130;; + 131;; + 132;; + 133;; + 134;; + 135;; + 136;; + 137;; + 138;; + 139;; + 140;; + 141;; + 142;; + 143;; + 144;; + 145;; + 146;; + 147;; + 148;; + 149;; + 150;; END_TEMPLATE end select end @@ -141,7 +241,6 @@ subroutine det_update3(n,LDS,m,l,S,S_inv,d) double precision,intent(inout) :: d ! Det(S) integer :: i - !DIR$ VECTOR ALIGNED do i=1,3 S(i,l) = m(i) enddo @@ -186,6 +285,7 @@ subroutine det_update4(n,LDS,m,l,S,S_inv,d) return endif + !DIR$ VECTOR ALIGNED do i=1,4 w(i) = S_inv(i,l)*d_inv S(i,l) = m(i) @@ -220,58 +320,73 @@ subroutine det_update$n(n,LDS,m,l,S,S_inv,d) !DIR$ ASSUME (mod(LDS,$IRP_ALIGN/8) == 0) !DIR$ ASSUME (LDS >= $n) integer :: i,j + double precision :: zj, zj1, zj2, zj3 - !$OMP SIMD + !DIR$ NOPREFETCH + !DIR$ SIMD NOVECREMAINDER do i=1,$n u(i) = m(i) - S(i,l) enddo - !$OMP END SIMD - z(l) = S_inv($n,l)*u($n) + zj = 0.d0 !DIR$ VECTOR ALIGNED - do i=1,$n-1 - z(l) = z(l) + S_inv(i,l)*u(i) + !DIR$ NOPREFETCH + !DIR$ SIMD REDUCTION(+:zj) NOVECREMAINDER + do i=1,$n-1,4 + zj = zj + S_inv(i,l)*u(i) + S_inv(i+1,l)*u(i+1) & + + S_inv(i+2,l)*u(i+2) + S_inv(i+3,l)*u(i+3) enddo d_inv = 1.d0/d - d = d+z(l) + d = d+zj lambda = d*d_inv if (dabs(lambda) < 1.d-3) then d = 0.d0 return endif + !DIR$ VECTOR ALIGNED do j=1,$n,4 - z(j ) = S_inv($n,j )*u($n) - z(j+1) = S_inv($n,j+1)*u($n) - z(j+2) = S_inv($n,j+2)*u($n) - z(j+3) = S_inv($n,j+3)*u($n) + zj = 0.d0 + zj1 = 0.d0 + zj2 = 0.d0 + zj3 = 0.d0 !DIR$ VECTOR ALIGNED - do i=1,$n-1 - z(j ) = z(j ) + S_inv(i,j )*u(i) - z(j+1) = z(j+1) + S_inv(i,j+1)*u(i) - z(j+2) = z(j+2) + S_inv(i,j+2)*u(i) - z(j+3) = z(j+3) + S_inv(i,j+3)*u(i) + !DIR$ NOPREFETCH + !DIR$ SIMD REDUCTION(+:zj,zj1,zj2,zj3) NOVECREMAINDER + do i=1,$n + zj = zj + S_inv(i,j )*u(i) + zj1 = zj1 + S_inv(i,j+1)*u(i) + zj2 = zj2 + S_inv(i,j+2)*u(i) + zj3 = zj3 + S_inv(i,j+3)*u(i) enddo + z(j ) = zj + z(j+1) = zj1 + z(j+2) = zj2 + z(j+3) = zj3 enddo - !$OMP SIMD + !DIR$ NOPREFETCH + !DIR$ SIMD FIRSTPRIVATE(d_inv) NOVECREMAINDER do i=1,$n w(i) = S_inv(i,l)*d_inv S(i,l) = m(i) enddo - !$OMP END SIMD do i=1,$n,4 + zj = z(i ) + zj1 = z(i+1) + zj2 = z(i+2) + zj3 = z(i+3) !DIR$ VECTOR ALIGNED - !$OMP SIMD + !DIR$ NOPREFETCH + !DIR$ SIMD FIRSTPRIVATE(lambda,zj,zj1,zj2,zj3) NOVECREMAINDER do j=1,$n - S_inv(j,i ) = S_inv(j,i )*lambda -z(i )*w(j) - S_inv(j,i+1) = S_inv(j,i+1)*lambda -z(i+1)*w(j) - S_inv(j,i+2) = S_inv(j,i+2)*lambda -z(i+2)*w(j) - S_inv(j,i+3) = S_inv(j,i+3)*lambda -z(i+3)*w(j) + S_inv(j,i ) = S_inv(j,i )*lambda - w(j)*zj + S_inv(j,i+1) = S_inv(j,i+1)*lambda - w(j)*zj1 + S_inv(j,i+2) = S_inv(j,i+2)*lambda - w(j)*zj2 + S_inv(j,i+3) = S_inv(j,i+3)*lambda - w(j)*zj3 enddo - !$OMP END SIMD enddo end @@ -288,6 +403,31 @@ SUBST [ 
n ] 40 ;; 44 ;; 48 ;; +52 ;; +56 ;; +60 ;; +64 ;; +68 ;; +72 ;; +76 ;; +80 ;; +84 ;; +88 ;; +92 ;; +96 ;; +100 ;; +104 ;; +108 ;; +112 ;; +116 ;; +120 ;; +124 ;; +128 ;; +132 ;; +136 ;; +140 ;; +144 ;; +148 ;; END_TEMPLATE @@ -311,19 +451,23 @@ subroutine det_update$n(n,LDS,m,l,S,S_inv,d) !DIR$ ASSUME (mod(LDS,$IRP_ALIGN/8) == 0) !DIR$ ASSUME (LDS >= $n) integer :: i,j + double precision :: zj, zj1, zj2, zj3 do i=1,$n u(i) = m(i) - S(i,l) enddo - - z(l) = S_inv($n,l)*u($n) - !DIR$ VECTOR ALIGNED - do i=1,$n-1 - z(l) = z(l) + S_inv(i,l)*u(i) - enddo + zj = 0.d0 + !DIR$ NOPREFETCH + !DIR$ SIMD REDUCTION(+:zj) + do i=1,$n-1,4 + zj = zj + S_inv(i,l)*u(i) + S_inv(i+1,l)*u(i+1) & + + S_inv(i+2,l)*u(i+2) + S_inv(i+3,l)*u(i+3) + enddo + zj = zj + S_inv($n,l)*u($n) + d_inv = 1.d0/d - d = d+z(l) + d = d+zj lambda = d*d_inv if (dabs(lambda) < 1.d-3) then d = 0.d0 @@ -332,47 +476,67 @@ subroutine det_update$n(n,LDS,m,l,S,S_inv,d) !DIR$ VECTOR ALIGNED do j=1,$n-1,4 - z(j ) = S_inv($n,j )*u($n) - z(j+1) = S_inv($n,j+1)*u($n) - z(j+2) = S_inv($n,j+2)*u($n) - z(j+3) = S_inv($n,j+3)*u($n) + zj = 0.d0 + zj1 = 0.d0 + zj2 = 0.d0 + zj3 = 0.d0 + !DIR$ VECTOR ALIGNED + !DIR$ NOPREFETCH + !DIR$ SIMD REDUCTION(+:zj,zj1,zj2,zj3) NOVECREMAINDER do i=1,$n-1 - z(j ) = z(j ) + S_inv(i,j )*u(i) - z(j+1) = z(j+1) + S_inv(i,j+1)*u(i) - z(j+2) = z(j+2) + S_inv(i,j+2)*u(i) - z(j+3) = z(j+3) + S_inv(i,j+3)*u(i) + zj = zj + S_inv(i,j )*u(i) + zj1 = zj1 + S_inv(i,j+1)*u(i) + zj2 = zj2 + S_inv(i,j+2)*u(i) + zj3 = zj3 + S_inv(i,j+3)*u(i) enddo + z(j ) = zj + S_inv($n,j )*u($n) + z(j+1) = zj1 + S_inv($n,j+1)*u($n) + z(j+2) = zj2 + S_inv($n,j+2)*u($n) + z(j+3) = zj3 + S_inv($n,j+3)*u($n) enddo - z($n) = S_inv($n,$n)*u($n) + zj = 0.d0 + !DIR$ VECTOR ALIGNED + !DIR$ NOPREFETCH + !DIR$ SIMD REDUCTION(+:zj) NOVECREMAINDER do i=1,$n-1 - z($n) = z($n) + S_inv(i,$n)*u(i) + zj = zj + S_inv(i,$n)*u(i) enddo + z($n) = zj + S_inv($n,$n)*u($n) + + !DIR$ NOPREFETCH + !DIR$ SIMD FIRSTPRIVATE(d_inv) NOVECREMAINDER do i=1,$n w(i) = S_inv(i,l)*d_inv S(i,l) = m(i) enddo do i=1,$n-1,4 - !$OMP SIMD + zj = z(i ) + zj1 = z(i+1) + zj2 = z(i+2) + zj3 = z(i+3) + !DIR$ VECTOR ALIGNED + !DIR$ NOPREFETCH + !DIR$ SIMD FIRSTPRIVATE(lambda,zj,zj1,zj2,zj3) NOVECREMAINDER do j=1,$n-1 - S_inv(j,i ) = S_inv(j,i )*lambda - w(j)*z(i ) - S_inv(j,i+1) = S_inv(j,i+1)*lambda - w(j)*z(i+1) - S_inv(j,i+2) = S_inv(j,i+2)*lambda - w(j)*z(i+2) - S_inv(j,i+3) = S_inv(j,i+3)*lambda - w(j)*z(i+3) + S_inv(j,i ) = S_inv(j,i )*lambda - w(j)*zj + S_inv(j,i+1) = S_inv(j,i+1)*lambda - w(j)*zj1 + S_inv(j,i+2) = S_inv(j,i+2)*lambda - w(j)*zj2 + S_inv(j,i+3) = S_inv(j,i+3)*lambda - w(j)*zj3 enddo - !$OMP END SIMD - enddo - - do i=1,$n-1,4 - S_inv($n,i ) = S_inv($n,i )*lambda - w($n)*z(i ) - S_inv($n,i+1) = S_inv($n,i+1)*lambda - w($n)*z(i+1) - S_inv($n,i+2) = S_inv($n,i+2)*lambda - w($n)*z(i+2) - S_inv($n,i+3) = S_inv($n,i+3)*lambda - w($n)*z(i+3) + S_inv($n,i ) = S_inv($n,i )*lambda - w($n)*zj + S_inv($n,i+1) = S_inv($n,i+1)*lambda - w($n)*zj1 + S_inv($n,i+2) = S_inv($n,i+2)*lambda - w($n)*zj2 + S_inv($n,i+3) = S_inv($n,i+3)*lambda - w($n)*zj3 enddo + zj = z($n) + !DIR$ VECTOR ALIGNED + !DIR$ NOPREFETCH + !DIR$ SIMD FIRSTPRIVATE(lambda,zj) NOVECREMAINDER do i=1,$n - S_inv(i,$n) = S_inv(i,$n)*lambda -w(i)*z($n) + S_inv(i,$n) = S_inv(i,$n)*lambda -w(i)*zj enddo @@ -391,6 +555,31 @@ SUBST [ n ] 41 ;; 45 ;; 49 ;; +53 ;; +57 ;; +61 ;; +65 ;; +69 ;; +73 ;; +77 ;; +81 ;; +85 ;; +89 ;; +93 ;; +97 ;; +101 ;; +105 ;; +109 ;; +113 ;; +117 ;; +121 ;; +125 ;; +129 ;; +133 ;; +137 ;; +141 ;; +145 ;; 
+149 ;; END_TEMPLATE @@ -416,76 +605,114 @@ subroutine det_update$n(n,LDS,m,l,S,S_inv,d) !DIR$ ASSUME (LDS >= $n) integer :: i,j - !$OMP SIMD + double precision :: zj, zj1, zj2, zj3 + !DIR$ NOPREFETCH + !DIR$ SIMD NOVECREMAINDER do i=1,$n u(i) = m(i) - S(i,l) enddo - !$OMP END SIMD - z(l) = S_inv($n,l)*u($n) + zj = 0.d0 !DIR$ VECTOR ALIGNED - do i=1,$n-1 - z(l) = z(l) + S_inv(i,l)*u(i) + !DIR$ NOPREFETCH + !DIR$ SIMD REDUCTION(+:zj) NOVECREMAINDER + do i=1,$n-2,4 + zj = zj + S_inv(i,l)*u(i) + S_inv(i+1,l)*u(i+1) & + + S_inv(i+2,l)*u(i+2) + S_inv(i+3,l)*u(i+3) enddo + i=$n-1 + zj = zj + S_inv(i,l)*u(i) + S_inv(i+1,l)*u(i+1) d_inv = 1.d0/d - d = d+z(l) + d = d+zj lambda = d*d_inv if (dabs(lambda) < 1.d-3) then d = 0.d0 return endif + !DIR$ VECTOR ALIGNED do j=1,$n-2,4 - z(j ) = S_inv($n,j )*u($n) - z(j+1) = S_inv($n,j+1)*u($n) - z(j+2) = S_inv($n,j+2)*u($n) - z(j+3) = S_inv($n,j+3)*u($n) + zj = 0.d0 + zj1 = 0.d0 + zj2 = 0.d0 + zj3 = 0.d0 !DIR$ VECTOR ALIGNED - do i=1,$n-1 - z(j ) = z(j ) + S_inv(i,j )*u(i) - z(j+1) = z(j+1) + S_inv(i,j+1)*u(i) - z(j+2) = z(j+2) + S_inv(i,j+2)*u(i) - z(j+3) = z(j+3) + S_inv(i,j+3)*u(i) + !DIR$ SIMD REDUCTION(+:zj,zj1,zj2,zj3) NOVECREMAINDER + do i=1,$n-2 + zj = zj + S_inv(i,j )*u(i) + zj1 = zj1 + S_inv(i,j+1)*u(i) + zj2 = zj2 + S_inv(i,j+2)*u(i) + zj3 = zj3 + S_inv(i,j+3)*u(i) enddo + z(j ) = zj + S_inv($n-1,j )*u($n-1) + z(j ) = z(j ) + S_inv($n,j )*u($n) + z(j+1) = zj1 + S_inv($n-1,j+1)*u($n-1) + z(j+1) = z(j+1) + S_inv($n,j+1)*u($n) + z(j+2) = zj2 + S_inv($n-1,j+2)*u($n-1) + z(j+2) = z(j+2) + S_inv($n,j+2)*u($n) + z(j+3) = zj3 + S_inv($n-1,j+3)*u($n-1) + z(j+3) = z(j+3) + S_inv($n,j+3)*u($n) enddo j=$n-1 - z(j ) = S_inv($n,j )*u($n) - z(j+1) = S_inv($n,j+1)*u($n) + zj = 0.d0 + zj1 = 0.d0 !DIR$ VECTOR ALIGNED - do i=1,$n-1 - z(j ) = z(j ) + S_inv(i,j )*u(i) - z(j+1) = z(j+1) + S_inv(i,j+1)*u(i) + !DIR$ NOPREFETCH + !DIR$ SIMD REDUCTION(+:zj,zj1) NOVECREMAINDER + do i=1,$n-2 + zj = zj + S_inv(i,j )*u(i) + zj1 = zj1 + S_inv(i,j+1)*u(i) enddo + z(j ) = zj + S_inv($n-1,j )*u($n-1) + z(j ) = z(j ) + S_inv($n,j )*u($n) + z(j+1) = zj1 + S_inv($n-1,j+1)*u($n-1) + z(j+1) = z(j+1) + S_inv($n,j+1)*u($n) - !$OMP SIMD + !DIR$ NOPREFETCH + !DIR$ SIMD FIRSTPRIVATE(d_inv) NOVECREMAINDER do i=1,$n w(i) = S_inv(i,l)*d_inv S(i,l) = m(i) enddo - !$OMP END SIMD do i=1,$n-2,4 + zj = z(i) + zj1 = z(i+1) + zj2 = z(i+2) + zj3 = z(i+3) !DIR$ VECTOR ALIGNED - !$OMP SIMD - do j=1,$n - S_inv(j,i ) = S_inv(j,i )*lambda -z(i )*w(j) - S_inv(j,i+1) = S_inv(j,i+1)*lambda -z(i+1)*w(j) - S_inv(j,i+2) = S_inv(j,i+2)*lambda -z(i+2)*w(j) - S_inv(j,i+3) = S_inv(j,i+3)*lambda -z(i+3)*w(j) + !DIR$ SIMD FIRSTPRIVATE(lambda,zj,zj1,zj2,zj3) NOVECREMAINDER + do j=1,$n-2 + S_inv(j,i ) = S_inv(j,i )*lambda -zj *w(j) + S_inv(j,i+1) = S_inv(j,i+1)*lambda -zj1*w(j) + S_inv(j,i+2) = S_inv(j,i+2)*lambda -zj2*w(j) + S_inv(j,i+3) = S_inv(j,i+3)*lambda -zj3*w(j) enddo - !$OMP END SIMD + S_inv($n-1,i ) = S_inv($n-1,i )*lambda -zj *w($n-1) + S_inv($n ,i ) = S_inv($n ,i )*lambda -zj *w($n ) + S_inv($n-1,i+1) = S_inv($n-1,i+1)*lambda -zj1*w($n-1) + S_inv($n ,i+1) = S_inv($n ,i+1)*lambda -zj1*w($n ) + S_inv($n-1,i+2) = S_inv($n-1,i+2)*lambda -zj2*w($n-1) + S_inv($n ,i+2) = S_inv($n ,i+2)*lambda -zj2*w($n ) + S_inv($n-1,i+3) = S_inv($n-1,i+3)*lambda -zj3*w($n-1) + S_inv($n ,i+3) = S_inv($n ,i+3)*lambda -zj3*w($n ) enddo i=$n-1 + zj = z(i) + zj1= z(i+1) !DIR$ VECTOR ALIGNED - !$OMP SIMD - do j=1,$n - S_inv(j,i ) = S_inv(j,i )*lambda -z(i )*w(j) - S_inv(j,i+1) = S_inv(j,i+1)*lambda -z(i+1)*w(j) + !DIR$ SIMD 
FIRSTPRIVATE(lambda,zj,zj1) + do j=1,$n-2 + S_inv(j,i ) = S_inv(j,i )*lambda -zj*w(j) + S_inv(j,i+1) = S_inv(j,i+1)*lambda -zj1*w(j) enddo - !$OMP END SIMD + S_inv($n-1,i ) = S_inv($n-1,i )*lambda -zj*w($n-1) + S_inv($n-1,i+1) = S_inv($n-1,i+1)*lambda -zj1*w($n-1) + S_inv($n ,i ) = S_inv($n ,i )*lambda -zj*w($n ) + S_inv($n ,i+1) = S_inv($n ,i+1)*lambda -zj1*w($n ) end @@ -502,6 +729,31 @@ SUBST [ n ] 42 ;; 46 ;; 50 ;; +54 ;; +58 ;; +62 ;; +66 ;; +70 ;; +74 ;; +78 ;; +82 ;; +86 ;; +90 ;; +94 ;; +98 ;; +102 ;; +106 ;; +110 ;; +114 ;; +118 ;; +122 ;; +126 ;; +130 ;; +134 ;; +138 ;; +142 ;; +146 ;; +150 ;; END_TEMPLATE @@ -525,20 +777,28 @@ subroutine det_update$n(n,LDS,m,l,S,S_inv,d) !DIR$ ASSUME (mod(LDS,$IRP_ALIGN/8) == 0) !DIR$ ASSUME (LDS >= $n) integer :: i,j - + + double precision :: zj, zj1, zj2, zj3 + + !DIR$ SIMD do i=1,$n u(i) = m(i) - S(i,l) enddo - z(l) = S_inv($n,l)*u($n) + zj = 0.d0 !DIR$ VECTOR ALIGNED - do i=1,$n-1 - z(l) = z(l) + S_inv(i,l)*u(i) + !DIR$ NOPREFETCH + !DIR$ SIMD REDUCTION(+:zj) NOVECREMAINDER + do i=1,$n-3,4 + zj = zj + S_inv(i,l)*u(i) + S_inv(i+1,l)*u(i+1) & + + S_inv(i+2,l)*u(i+2) + S_inv(i+3,l)*u(i+3) enddo + i=$n-2 + zj = zj + S_inv(i,l)*u(i) + S_inv(i+1,l)*u(i+1) + S_inv(i+2,l)*u(i+2) d_inv = 1.d0/d - d = d+z(l) + d = d+zj lambda = d*d_inv if (dabs(lambda) < 1.d-3) then d = 0.d0 @@ -547,52 +807,102 @@ subroutine det_update$n(n,LDS,m,l,S,S_inv,d) !DIR$ VECTOR ALIGNED do j=1,$n-3,4 - z(j ) = S_inv($n,j )*u($n) - z(j+1) = S_inv($n,j+1)*u($n) - z(j+2) = S_inv($n,j+2)*u($n) - z(j+3) = S_inv($n,j+3)*u($n) - do i=1,$n-1 - z(j ) = z(j ) + S_inv(i,j )*u(i) - z(j+1) = z(j+1) + S_inv(i,j+1)*u(i) - z(j+2) = z(j+2) + S_inv(i,j+2)*u(i) - z(j+3) = z(j+3) + S_inv(i,j+3)*u(i) + zj = 0.d0 + zj1 = 0.d0 + zj2 = 0.d0 + zj3 = 0.d0 + !DIR$ VECTOR ALIGNED + !DIR$ SIMD REDUCTION(+:zj,zj1,zj2,zj3) + do i=1,$n-3 + zj = zj + S_inv(i,j )*u(i) + zj1 = zj1 + S_inv(i,j+1)*u(i) + zj2 = zj2 + S_inv(i,j+2)*u(i) + zj3 = zj3 + S_inv(i,j+3)*u(i) enddo + z(j ) = zj + S_inv($n-2,j )*u($n-2) + z(j ) = z(j ) + S_inv($n-1,j )*u($n-1) + z(j ) = z(j ) + S_inv($n,j )*u($n) + z(j+1) = zj1 + S_inv($n-2,j+1)*u($n-2) + z(j+1) = z(j+1) + S_inv($n-1,j+1)*u($n-1) + z(j+1) = z(j+1) + S_inv($n,j+1)*u($n) + z(j+2) = zj2 + S_inv($n-2,j+2)*u($n-2) + z(j+2) = z(j+2) + S_inv($n-1,j+2)*u($n-1) + z(j+2) = z(j+2) + S_inv($n,j+2)*u($n) + z(j+3) = zj3 + S_inv($n-2,j+3)*u($n-2) + z(j+3) = z(j+3) + S_inv($n-1,j+3)*u($n-1) + z(j+3) = z(j+3) + S_inv($n,j+3)*u($n) enddo j=$n-2 - z(j ) = S_inv($n,j )*u($n) - z(j+1) = S_inv($n,j+1)*u($n) - z(j+2) = S_inv($n,j+2)*u($n) - do i=1,$n-1 - z(j ) = z(j ) + S_inv(i,j )*u(i) - z(j+1) = z(j+1) + S_inv(i,j+1)*u(i) - z(j+2) = z(j+2) + S_inv(i,j+2)*u(i) + zj = 0.d0 + zj1 = 0.d0 + zj2 = 0.d0 + !DIR$ VECTOR ALIGNED + !DIR$ NOPREFETCH + !DIR$ SIMD REDUCTION(+:zj,zj1,zj2) + do i=1,$n-3 + zj = zj + S_inv(i,j )*u(i) + zj1 = zj1 + S_inv(i,j+1)*u(i) + zj2 = zj2 + S_inv(i,j+2)*u(i) enddo + z(j ) = zj + S_inv($n-2,j )*u($n-2) + z(j ) = z(j ) + S_inv($n-1,j )*u($n-1) + z(j ) = z(j ) + S_inv($n,j )*u($n) + z(j+1) = zj1 + S_inv($n-2,j+1)*u($n-2) + z(j+1) = z(j+1) + S_inv($n-1,j+1)*u($n-1) + z(j+1) = z(j+1) + S_inv($n,j+1)*u($n) + z(j+2) = zj2 + S_inv($n-2,j+2)*u($n-2) + z(j+2) = z(j+2) + S_inv($n-1,j+2)*u($n-1) + z(j+2) = z(j+2) + S_inv($n,j+2)*u($n) + !DIR$ NOPREFETCH + !DIR$ SIMD FIRSTPRIVATE(d_inv) do i=1,$n w(i) = S_inv(i,l)*d_inv S(i,l) = m(i) enddo do i=1,$n-3,4 - !$OMP SIMD - do j=1,$n - S_inv(j,i ) = S_inv(j,i )*lambda - w(j)*z(i ) - S_inv(j,i+1) = S_inv(j,i+1)*lambda - w(j)*z(i+1) - 
S_inv(j,i+2) = S_inv(j,i+2)*lambda - w(j)*z(i+2) - S_inv(j,i+3) = S_inv(j,i+3)*lambda - w(j)*z(i+3) + zj = z(i) + zj1 = z(i+1) + zj2 = z(i+2) + zj3 = z(i+3) + !DIR$ VECTOR ALIGNED + !DIR$ NOPREFETCH + !DIR$ SIMD FIRSTPRIVATE(lambda,zj,zj1,zj2,zj3) + do j=1,$n-3 + S_inv(j,i ) = S_inv(j,i )*lambda - w(j)*zj + S_inv(j,i+1) = S_inv(j,i+1)*lambda - w(j)*zj1 + S_inv(j,i+2) = S_inv(j,i+2)*lambda - w(j)*zj2 + S_inv(j,i+3) = S_inv(j,i+3)*lambda - w(j)*zj3 enddo - !$OMP END SIMD + S_inv($n-2,i ) = S_inv($n-2,i )*lambda -zj *w($n-2) + S_inv($n-1,i ) = S_inv($n-1,i )*lambda -zj *w($n-1) + S_inv($n ,i ) = S_inv($n ,i )*lambda -zj *w($n ) + S_inv($n-2,i+1) = S_inv($n-2,i+1)*lambda -zj1*w($n-2) + S_inv($n-1,i+1) = S_inv($n-1,i+1)*lambda -zj1*w($n-1) + S_inv($n ,i+1) = S_inv($n ,i+1)*lambda -zj1*w($n ) + S_inv($n-2,i+2) = S_inv($n-2,i+2)*lambda -zj2*w($n-2) + S_inv($n-1,i+2) = S_inv($n-1,i+2)*lambda -zj2*w($n-1) + S_inv($n ,i+2) = S_inv($n ,i+2)*lambda -zj2*w($n ) + S_inv($n-2,i+3) = S_inv($n-2,i+3)*lambda -zj3*w($n-2) + S_inv($n-1,i+3) = S_inv($n-1,i+3)*lambda -zj3*w($n-1) + S_inv($n ,i+3) = S_inv($n ,i+3)*lambda -zj3*w($n ) enddo i=$n-2 - !$OMP SIMD + zj = z(i) + zj1 = z(i+1) + zj2 = z(i+2) + !DIR$ VECTOR ALIGNED + !DIR$ NOPREFETCH + !DIR$ SIMD FIRSTPRIVATE(lambda,zj,zj1,zj2) do j=1,$n - S_inv(j,i ) = S_inv(j,i )*lambda - w(j)*z(i ) - S_inv(j,i+1) = S_inv(j,i+1)*lambda - w(j)*z(i+1) - S_inv(j,i+2) = S_inv(j,i+2)*lambda - w(j)*z(i+2) + S_inv(j,i ) = S_inv(j,i )*lambda - w(j)*zj + S_inv(j,i+1) = S_inv(j,i+1)*lambda - w(j)*zj1 + S_inv(j,i+2) = S_inv(j,i+2)*lambda - w(j)*zj2 enddo - !$OMP END SIMD + end @@ -608,6 +918,31 @@ SUBST [ n ] 39 ;; 43 ;; 47 ;; +51 ;; +55 ;; +59 ;; +63 ;; +67 ;; +71 ;; +75 ;; +79 ;; +83 ;; +87 ;; +91 ;; +95 ;; +99 ;; +103 ;; +107 ;; +111 ;; +115 ;; +119 ;; +123 ;; +127 ;; +131 ;; +135 ;; +139 ;; +143 ;; +147 ;; END_TEMPLATE @@ -632,22 +967,26 @@ subroutine det_update_general(n,LDS,m,l,S,S_inv,d) !DIR$ ASSUME (LDS >= n) !DIR$ ASSUME (LDS <= 3840) !DIR$ ASSUME (MOD(LDS,$IRP_ALIGN/8) == 0) - integer :: i,j,n4 + !DIR$ ASSUME (n>150) - !$OMP SIMD + integer :: i,j,n4 + double precision :: zl + + !DIR$ NOPREFETCH do i=1,n u(i) = m(i) - S(i,l) enddo - !$OMP END SIMD - z(l) = 0.d0 + zl = 0.d0 !DIR$ VECTOR ALIGNED + !DIR$ NOPREFETCH + !DIR$ SIMD REDUCTION(+:zl) do i=1,n - z(l) = z(l) + S_inv(i,l)*u(i) + zl = zl + S_inv(i,l)*u(i) enddo d_inv = 1.d0/d - d = d+z(l) + d = d+zl lambda = d*d_inv if ( dabs(lambda) < 1.d-3 ) then @@ -655,51 +994,80 @@ subroutine det_update_general(n,LDS,m,l,S,S_inv,d) return endif - n4 = iand(n,not(4)) - do j=1,n4,8 - z(j ) = 0.d0 - z(j+1) = 0.d0 - z(j+2) = 0.d0 - z(j+3) = 0.d0 - z(j+4) = 0.d0 - z(j+5) = 0.d0 - z(j+6) = 0.d0 - z(j+7) = 0.d0 + double precision :: zj, zj1, zj2, zj3 + + n4 = iand(n,not(3)) + !DIR$ VECTOR ALIGNED + !DIR$ NOPREFETCH + do j=1,n4,4 + zj = 0.d0 + zj1 = 0.d0 + zj2 = 0.d0 + zj3 = 0.d0 !DIR$ VECTOR ALIGNED + !DIR$ NOPREFETCH + !DIR$ SIMD REDUCTION(+:zj,zj1,zj2,zj3) do i=1,n - z(j ) = z(j ) + S_inv(i,j )*u(i) - z(j+1) = z(j+1) + S_inv(i,j+1)*u(i) - z(j+2) = z(j+2) + S_inv(i,j+2)*u(i) - z(j+3) = z(j+3) + S_inv(i,j+3)*u(i) - z(j+4) = z(j+4) + S_inv(i,j+4)*u(i) - z(j+5) = z(j+5) + S_inv(i,j+5)*u(i) - z(j+6) = z(j+6) + S_inv(i,j+6)*u(i) - z(j+7) = z(j+7) + S_inv(i,j+7)*u(i) + zj = zj + S_inv(i,j )*u(i) + zj1 = zj1 + S_inv(i,j+1)*u(i) + zj2 = zj2 + S_inv(i,j+2)*u(i) + zj3 = zj3 + S_inv(i,j+3)*u(i) enddo + z(j ) = zj + z(j+1) = zj1 + z(j+2) = zj2 + z(j+3) = zj3 enddo do j=n4+1,n - z(j) = 0.d0 + zj = 0.d0 !DIR$ VECTOR ALIGNED + !DIR$ NOPREFETCH + 
!DIR$ SIMD REDUCTION(+:zj) do i=1,n - z(j) = z(j) + S_inv(i,j)*u(i) + zj = zj + S_inv(i,j)*u(i) enddo + z(j ) = zj enddo - !$OMP SIMD + !DIR$ NOPREFETCH + !DIR$ SIMD FIRSTPRIVATE(d_inv) do i=1,n w(i) = S_inv(i,l)*d_inv S(i,l) = m(i) enddo - !$OMP END SIMD + !DIR$ NOPREFETCH + !DIR$ SIMD FIRSTPRIVATE(d_inv) do i=1,n - !DIR$ VECTOR ALIGNED - !$OMP SIMD aligned(S_inv,z) - do j=1,n - S_inv(j,i) = S_inv(j,i)*lambda -z(i)*w(j) - enddo - !$OMP END SIMD + w(i) = S_inv(i,l)*d_inv + S(i,l) = m(i) + enddo + + do i=1,n4,4 + zj = z(i) + zj1 = z(i+1) + zj2 = z(i+2) + zj3 = z(i+3) + !DIR$ VECTOR ALIGNED + !DIR$ NOPREFETCH + !DIR$ SIMD FIRSTPRIVATE(lambda,zj,zj1,zj2,zj3) + do j=1,n + S_inv(j,i ) = S_inv(j,i )*lambda -zj *w(j) + S_inv(j,i+1) = S_inv(j,i+1)*lambda -zj1*w(j) + S_inv(j,i+2) = S_inv(j,i+2)*lambda -zj2*w(j) + S_inv(j,i+3) = S_inv(j,i+3)*lambda -zj3*w(j) + enddo + enddo + + do i=n4+1,n + zj = z(i) + !DIR$ VECTOR ALIGNED + !DIR$ NOPREFETCH + !DIR$ SIMD FIRSTPRIVATE(lambda,zj) + do j=1,n + S_inv(j,i) = S_inv(j,i)*lambda -zj*w(j) + enddo enddo end @@ -1034,11 +1402,10 @@ END_PROVIDER det_alpha_value(det_i) = det_alpha_value_curr do i=1,elec_alpha_num - !$OMP SIMD + !DIR$ VECTOR ALIGNED do k=1,4 det_alpha_grad_lapl(k,i,det_i) = det_alpha_grad_lapl_curr(k,i) enddo - !$OMP END SIMD if (do_pseudo) then det_alpha_pseudo(i,det_i) = det_alpha_pseudo_curr(i) endif @@ -1088,11 +1455,9 @@ END_PROVIDER det_beta_value(det_j) = det_beta_value_curr !DIR$ LOOP COUNT (200) do i=elec_alpha_num+1,elec_num - !$OMP SIMD do k=1,4 det_beta_grad_lapl(k,i,det_j) = det_beta_grad_lapl_curr(k,i) enddo - !$OMP END SIMD if (do_pseudo) then det_beta_pseudo(i,det_j) = det_beta_pseudo_curr(i) endif @@ -1223,11 +1588,9 @@ END_PROVIDER ! ----- psidet_value = 0.d0 - !$OMP SIMD reduction(+:psidet_value) do j=1,det_beta_num psidet_value = psidet_value + det_beta_value(j) * DaC(j) enddo - !$OMP END SIMD if (psidet_value == 0.d0) then @@ -1246,53 +1609,51 @@ END_PROVIDER do i=1,det_alpha_num do j=1,elec_alpha_num - !$OMP SIMD + !DIR$ VECTOR ALIGNED do k=1,4 psidet_grad_lapl(k,j) = psidet_grad_lapl(k,j) + det_alpha_grad_lapl(k,j,i)*CDb(i) enddo - !$OMP END SIMD pseudo_non_local(j) = pseudo_non_local(j) + det_alpha_pseudo(j,i)*CDb(i) enddo enddo do i=1,det_beta_num do j=elec_alpha_num+1,elec_num - !$OMP SIMD + !DIR$ VECTOR ALIGNED do k=1,4 psidet_grad_lapl(k,j) = psidet_grad_lapl(k,j) + det_beta_grad_lapl(k,j,i)*DaC(i) enddo - !$OMP END SIMD pseudo_non_local(j) = pseudo_non_local(j) + det_beta_pseudo(j,i)*DaC(i) enddo enddo + !DIR$ VECTOR ALIGNED do j=1,elec_num pseudo_non_local(j) = pseudo_non_local(j) * psidet_inv enddo else + !DIR$ VECTOR ALIGNED do j=1,elec_num psidet_grad_lapl(1:4,j) = 0.d0 enddo do i=1,det_alpha_num do j=1,elec_alpha_num - !$OMP SIMD + !DIR$ VECTOR ALIGNED do k=1,4 psidet_grad_lapl(k,j) = psidet_grad_lapl(k,j) + det_alpha_grad_lapl(k,j,i)*CDb(i) enddo - !$OMP END SIMD enddo enddo do i=1,det_beta_num do j=elec_alpha_num+1,elec_num - !$OMP SIMD + !DIR$ VECTOR ALIGNED do k=1,4 psidet_grad_lapl(k,j) = psidet_grad_lapl(k,j) + det_beta_grad_lapl(k,j,i)*DaC(i) enddo - !$OMP END SIMD enddo enddo @@ -1392,14 +1753,13 @@ BEGIN_PROVIDER [ double precision, det_alpha_grad_lapl_curr, (4,elec_alpha_num) imo = mo_list_alpha_curr(j ) imo2 = mo_list_alpha_curr(j+1) do i=1,elec_alpha_num,2 - !$OMP SIMD + !DIR$ VECTOR ALIGNED do k=1,4 - det_alpha_grad_lapl_curr(k,i ) = det_alpha_grad_lapl_curr(k,i ) + mo_grad_lapl(k,i ,imo )*slater_matrix_alpha_inv_det(i ,j ) & - + mo_grad_lapl(k,i ,imo2)*slater_matrix_alpha_inv_det(i ,j+1) - 
det_alpha_grad_lapl_curr(k,i+1) = det_alpha_grad_lapl_curr(k,i+1) + mo_grad_lapl(k,i+1,imo )*slater_matrix_alpha_inv_det(i+1,j ) & - + mo_grad_lapl(k,i+1,imo2)*slater_matrix_alpha_inv_det(i+1,j+1) + det_alpha_grad_lapl_curr(k,i ) = det_alpha_grad_lapl_curr(k,i ) + mo_grad_lapl_alpha(k,i ,imo )*slater_matrix_alpha_inv_det(i ,j ) & + + mo_grad_lapl_alpha(k,i ,imo2)*slater_matrix_alpha_inv_det(i ,j+1) + det_alpha_grad_lapl_curr(k,i+1) = det_alpha_grad_lapl_curr(k,i+1) + mo_grad_lapl_alpha(k,i+1,imo )*slater_matrix_alpha_inv_det(i+1,j ) & + + mo_grad_lapl_alpha(k,i+1,imo2)*slater_matrix_alpha_inv_det(i+1,j+1) enddo - !$OMP END SIMD enddo enddo @@ -1409,32 +1769,29 @@ BEGIN_PROVIDER [ double precision, det_alpha_grad_lapl_curr, (4,elec_alpha_num) imo = mo_list_alpha_curr(j ) imo2 = mo_list_alpha_curr(j+1) do i=1,elec_alpha_num-1,2 - !$OMP SIMD + !DIR$ VECTOR ALIGNED do k=1,4 - det_alpha_grad_lapl_curr(k,i ) = det_alpha_grad_lapl_curr(k,i ) + mo_grad_lapl(k,i ,imo )*slater_matrix_alpha_inv_det(i ,j ) & - + mo_grad_lapl(k,i ,imo2)*slater_matrix_alpha_inv_det(i ,j+1) - det_alpha_grad_lapl_curr(k,i+1) = det_alpha_grad_lapl_curr(k,i+1) + mo_grad_lapl(k,i+1,imo )*slater_matrix_alpha_inv_det(i+1,j ) & - + mo_grad_lapl(k,i+1,imo2)*slater_matrix_alpha_inv_det(i+1,j+1) + det_alpha_grad_lapl_curr(k,i ) = det_alpha_grad_lapl_curr(k,i ) + mo_grad_lapl_alpha(k,i ,imo )*slater_matrix_alpha_inv_det(i ,j ) & + + mo_grad_lapl_alpha(k,i ,imo2)*slater_matrix_alpha_inv_det(i ,j+1) + det_alpha_grad_lapl_curr(k,i+1) = det_alpha_grad_lapl_curr(k,i+1) + mo_grad_lapl_alpha(k,i+1,imo )*slater_matrix_alpha_inv_det(i+1,j ) & + + mo_grad_lapl_alpha(k,i+1,imo2)*slater_matrix_alpha_inv_det(i+1,j+1) enddo - !$OMP END SIMD enddo i=elec_alpha_num - !$OMP SIMD + !DIR$ VECTOR ALIGNED do k=1,4 - det_alpha_grad_lapl_curr(k,i) = det_alpha_grad_lapl_curr(k,i) + mo_grad_lapl(k,i,imo )*slater_matrix_alpha_inv_det(i,j ) & - + mo_grad_lapl(k,i,imo2)*slater_matrix_alpha_inv_det(i,j+1) + det_alpha_grad_lapl_curr(k,i) = det_alpha_grad_lapl_curr(k,i) + mo_grad_lapl_alpha(k,i,imo )*slater_matrix_alpha_inv_det(i,j ) & + + mo_grad_lapl_alpha(k,i,imo2)*slater_matrix_alpha_inv_det(i,j+1) enddo - !$OMP END SIMD enddo j=elec_alpha_num imo = mo_list_alpha_curr(j) do i=1,elec_alpha_num - !$OMP SIMD + !DIR$ VECTOR ALIGNED do k=1,4 - det_alpha_grad_lapl_curr(k,i ) = det_alpha_grad_lapl_curr(k,i ) + mo_grad_lapl(k,i ,imo)*slater_matrix_alpha_inv_det(i ,j) + det_alpha_grad_lapl_curr(k,i ) = det_alpha_grad_lapl_curr(k,i ) + mo_grad_lapl_alpha(k,i ,imo)*slater_matrix_alpha_inv_det(i ,j) enddo - !$OMP END SIMD enddo endif @@ -1469,7 +1826,7 @@ BEGIN_PROVIDER [ double precision, det_beta_grad_lapl_curr, (4,elec_alpha_num+1 ! do i=elec_alpha_num+1,elec_num ! do k=1,4 ! det_beta_grad_lapl_curr(k,i) = det_beta_grad_lapl_curr(k,i) +& -! mo_grad_lapl(k,i,imo)*slater_matrix_beta_inv_det(i-elec_alpha_num,j) +! mo_grad_lapl_alpha(k,i,imo)*slater_matrix_beta_inv_det(i-elec_alpha_num,j) ! enddo ! enddo ! 
enddo @@ -1484,16 +1841,15 @@ BEGIN_PROVIDER [ double precision, det_beta_grad_lapl_curr, (4,elec_alpha_num+1 !DIR$ LOOP COUNT (16) do i=elec_alpha_num+1,elec_num,2 l = i-elec_alpha_num - !$OMP SIMD + !DIR$ VECTOR ALIGNED do k=1,4 det_beta_grad_lapl_curr(k,i) = det_beta_grad_lapl_curr(k,i) +& - mo_grad_lapl(k,i,imo )*slater_matrix_beta_inv_det(l,j ) + & - mo_grad_lapl(k,i,imo2)*slater_matrix_beta_inv_det(l,j+1) + mo_grad_lapl_beta(k,i,imo )*slater_matrix_beta_inv_det(l,j ) + & + mo_grad_lapl_beta(k,i,imo2)*slater_matrix_beta_inv_det(l,j+1) det_beta_grad_lapl_curr(k,i+1) = det_beta_grad_lapl_curr(k,i+1) +& - mo_grad_lapl(k,i+1,imo )*slater_matrix_beta_inv_det(l+1,j ) + & - mo_grad_lapl(k,i+1,imo2)*slater_matrix_beta_inv_det(l+1,j+1) + mo_grad_lapl_beta(k,i+1,imo )*slater_matrix_beta_inv_det(l+1,j ) + & + mo_grad_lapl_beta(k,i+1,imo2)*slater_matrix_beta_inv_det(l+1,j+1) enddo - !$OMP END SIMD enddo enddo @@ -1505,39 +1861,35 @@ BEGIN_PROVIDER [ double precision, det_beta_grad_lapl_curr, (4,elec_alpha_num+1 !DIR$ LOOP COUNT (16) do i=elec_alpha_num+1,elec_num-1,2 l = i-elec_alpha_num - !$OMP SIMD + !DIR$ VECTOR ALIGNED do k=1,4 det_beta_grad_lapl_curr(k,i) = det_beta_grad_lapl_curr(k,i) +& - mo_grad_lapl(k,i,imo )*slater_matrix_beta_inv_det(l,j ) + & - mo_grad_lapl(k,i,imo2)*slater_matrix_beta_inv_det(l,j+1) + mo_grad_lapl_beta(k,i,imo )*slater_matrix_beta_inv_det(l,j ) + & + mo_grad_lapl_beta(k,i,imo2)*slater_matrix_beta_inv_det(l,j+1) det_beta_grad_lapl_curr(k,i+1) = det_beta_grad_lapl_curr(k,i+1) +& - mo_grad_lapl(k,i+1,imo )*slater_matrix_beta_inv_det(l+1,j ) + & - mo_grad_lapl(k,i+1,imo2)*slater_matrix_beta_inv_det(l+1,j+1) + mo_grad_lapl_beta(k,i+1,imo )*slater_matrix_beta_inv_det(l+1,j ) + & + mo_grad_lapl_beta(k,i+1,imo2)*slater_matrix_beta_inv_det(l+1,j+1) enddo - !$OMP END SIMD enddo i = elec_num l = elec_num-elec_alpha_num - !$OMP SIMD + !DIR$ VECTOR ALIGNED do k=1,4 det_beta_grad_lapl_curr(k,i) = det_beta_grad_lapl_curr(k,i) +& - mo_grad_lapl(k,i,imo )*slater_matrix_beta_inv_det(l,j ) + & - mo_grad_lapl(k,i,imo2)*slater_matrix_beta_inv_det(l,j+1) + mo_grad_lapl_beta(k,i,imo )*slater_matrix_beta_inv_det(l,j ) + & + mo_grad_lapl_beta(k,i,imo2)*slater_matrix_beta_inv_det(l,j+1) enddo - !$OMP END SIMD enddo j = elec_beta_num imo = mo_list_beta_curr(j) - !DIR$ LOOP COUNT (16) do i=elec_alpha_num+1,elec_num l = i-elec_alpha_num - !$OMP SIMD + !DIR$ VECTOR ALIGNED do k=1,4 det_beta_grad_lapl_curr(k,i) = det_beta_grad_lapl_curr(k,i) +& - mo_grad_lapl(k,i,imo)*slater_matrix_beta_inv_det(l,j) + mo_grad_lapl_beta(k,i,imo)*slater_matrix_beta_inv_det(l,j) enddo - !$OMP END SIMD enddo endif diff --git a/src/electrons.irp.f b/src/electrons.irp.f index f120b39..1170603 100644 --- a/src/electrons.irp.f +++ b/src/electrons.irp.f @@ -148,8 +148,8 @@ END_PROVIDER BEGIN_PROVIDER [ real, elec_dist, (elec_num_8,elec_num) ] &BEGIN_PROVIDER [ real, elec_dist_vec_x, (elec_num_8,elec_num) ] -&BEGIN_PROVIDER [ real, elec_dist_vec_y, ((-simd_sp+1):elec_num_8,elec_num) ] -&BEGIN_PROVIDER [ real, elec_dist_vec_z, ((-2*simd_sp+1):elec_num_8,elec_num) ] +&BEGIN_PROVIDER [ real, elec_dist_vec_y, (elec_num_8,elec_num) ] +&BEGIN_PROVIDER [ real, elec_dist_vec_z, (elec_num_8,elec_num) ] implicit none BEGIN_DOC ! 
Electron-electron distances @@ -171,19 +171,15 @@ END_PROVIDER do ie2 = 1,elec_num real :: x, y, z + real :: x2, y2, z2 x = elec_coord(ie2,1) y = elec_coord(ie2,2) z = elec_coord(ie2,3) !DIR$ VECTOR ALIGNED - !DIR$ LOOP COUNT (200) do ie1 = 1,elec_num elec_dist_vec_x(ie1,ie2) = elec_coord(ie1,1) - x elec_dist_vec_y(ie1,ie2) = elec_coord(ie1,2) - y elec_dist_vec_z(ie1,ie2) = elec_coord(ie1,3) - z - enddo - !DIR$ VECTOR ALIGNED - !DIR$ LOOP COUNT (200) - do ie1 = 1,elec_num elec_dist(ie1,ie2) = sqrt( & elec_dist_vec_x(ie1,ie2)*elec_dist_vec_x(ie1,ie2) + & elec_dist_vec_y(ie1,ie2)*elec_dist_vec_y(ie1,ie2) + & diff --git a/src/ezfio_interface.irp.f b/src/ezfio_interface.irp.f index ec4a3b1..2c4682b 100644 --- a/src/ezfio_interface.irp.f +++ b/src/ezfio_interface.irp.f @@ -51,8 +51,6 @@ data = [ \ data_no_set = [\ ("mo_basis_mo_tot_num" , "integer" , ""), -("mo_basis_mo_active_num" , "integer" , ""), -("mo_basis_mo_closed_num" , "integer" , ""), ("pseudo_ao_pseudo_grid" , "double precision" , "(ao_num,pseudo_lmax+pseudo_lmax+1,pseudo_lmax-0+1,nucl_num,pseudo_grid_size)"), ("pseudo_mo_pseudo_grid" , "double precision" , "(ao_num,pseudo_lmax+pseudo_lmax+1,pseudo_lmax-0+1,nucl_num,pseudo_grid_size)"), ("pseudo_pseudo_dz_k" , "double precision" , "(nucl_num,pseudo_klocmax)"), diff --git a/src/mo.irp.f b/src/mo.irp.f index db2473e..5c7d0f9 100644 --- a/src/mo.irp.f +++ b/src/mo.irp.f @@ -187,11 +187,11 @@ BEGIN_PROVIDER [ real, mo_coef_transp_present, (num_present_mos_8,ao_num_8) ] END_PROVIDER - BEGIN_PROVIDER [ real, mo_value_transp, ((-simd_sp+1):mo_num_8,elec_num) ] -&BEGIN_PROVIDER [ real, mo_grad_transp_x, ((-2*simd_sp+1):mo_num_8,elec_num) ] -&BEGIN_PROVIDER [ real, mo_grad_transp_y, ((-3*simd_sp+1):mo_num_8,elec_num) ] -&BEGIN_PROVIDER [ real, mo_grad_transp_z, ((-4*simd_sp+1):mo_num_8,elec_num) ] -&BEGIN_PROVIDER [ real, mo_lapl_transp, ((-5*simd_sp+1):mo_num_8,elec_num) ] + BEGIN_PROVIDER [ real, mo_value_transp, (mo_num_8,elec_num) ] +&BEGIN_PROVIDER [ real, mo_grad_transp_x, (mo_num_8,elec_num) ] +&BEGIN_PROVIDER [ real, mo_grad_transp_y, (mo_num_8,elec_num) ] +&BEGIN_PROVIDER [ real, mo_grad_transp_z, (mo_num_8,elec_num) ] +&BEGIN_PROVIDER [ real, mo_lapl_transp, (mo_num_8,elec_num) ] implicit none BEGIN_DOC @@ -267,7 +267,6 @@ END_PROVIDER cycle endif r_inv = nucl_elec_dist_inv(k,i) - !DIR$ LOOP COUNT (500) do j=1,mo_num mo_value_transp(j,i) = mo_value_transp(j,i) + nucl_fitcusp_param(1,j,k) +& r * (nucl_fitcusp_param(2,j,k) + & @@ -299,7 +298,6 @@ END_PROVIDER ! Scale off-diagonal elements t = prepare_walkers_t do i=1,mo_num - !DIR$ LOOP COUNT (100) do j=1,elec_alpha_num if (i /= j) then mo_value_transp(i,j) *= t @@ -309,7 +307,6 @@ END_PROVIDER mo_lapl_transp(i,j) *= t endif enddo - !DIR$ LOOP COUNT (100) do j=1,elec_beta_num if (i /= j) then mo_value_transp(i,j+elec_alpha_num) *= t @@ -333,7 +330,6 @@ END_PROVIDER enddo END_PROVIDER - BEGIN_PROVIDER [ real, mo_value, (elec_num_8,mo_num) ] implicit none BEGIN_DOC @@ -349,7 +345,7 @@ BEGIN_PROVIDER [ real, mo_value, (elec_num_8,mo_num) ] !DIR$ VECTOR ALIGNED mo_value = 0. endif - call transpose(mo_value_transp(1,1),mo_num_8+simd_sp,mo_value,elec_num_8,mo_num,elec_num) + call transpose(mo_value_transp(1,1),mo_num_8,mo_value,elec_num_8,mo_num,elec_num) END_PROVIDER @@ -377,27 +373,11 @@ END_PROVIDER PROVIDE primitives_reduced endif ! 
Transpose x last for cache efficiency - call transpose_to_dp(mo_grad_transp_y(1,1),mo_num_8+3*simd_sp,mo_grad_y(1,1),elec_num_8,mo_num,elec_num) - call transpose_to_dp(mo_grad_transp_z(1,1),mo_num_8+4*simd_sp,mo_grad_z(1,1),elec_num_8,mo_num,elec_num) - call transpose_to_dp(mo_grad_transp_x(1,1),mo_num_8+2*simd_sp,mo_grad_x(1,1),elec_num_8,mo_num,elec_num) - -END_PROVIDER + call transpose_to_dp(mo_grad_transp_y(1,1),mo_num_8,mo_grad_y(1,1),elec_num_8,mo_num,elec_num) + call transpose_to_dp(mo_grad_transp_z(1,1),mo_num_8,mo_grad_z(1,1),elec_num_8,mo_num,elec_num) + call transpose_to_dp(mo_grad_transp_x(1,1),mo_num_8,mo_grad_x(1,1),elec_num_8,mo_num,elec_num) -BEGIN_PROVIDER [ double precision, mo_grad_lapl, (4,elec_num,mo_num) ] - implicit none - BEGIN_DOC -! Gradients and laplacian - END_DOC - integer :: i,j - do j=1,mo_num - do i=1,elec_num - mo_grad_lapl(1,i,j) = mo_grad_x(i,j) - mo_grad_lapl(2,i,j) = mo_grad_y(i,j) - mo_grad_lapl(3,i,j) = mo_grad_z(i,j) - mo_grad_lapl(4,i,j) = mo_lapl (i,j) - enddo - enddo - + END_PROVIDER BEGIN_PROVIDER [ double precision, mo_lapl, (elec_num_8,mo_num) ] @@ -414,11 +394,55 @@ BEGIN_PROVIDER [ double precision, mo_lapl, (elec_num_8,mo_num) ] !DIR$ VECTOR ALIGNED mo_lapl = 0.d0 endif - call transpose_to_dp(mo_lapl_transp(1,1),mo_num_8+5*simd_sp,mo_lapl,elec_num_8,mo_num,elec_num) + call transpose_to_dp(mo_lapl_transp(1,1),mo_num_8,mo_lapl,elec_num_8,mo_num,elec_num) END_PROVIDER + BEGIN_PROVIDER [ double precision, mo_grad_lapl_alpha, (4,elec_alpha_num,mo_num) ] +&BEGIN_PROVIDER [ double precision, mo_grad_lapl_beta , (4,elec_alpha_num+1:elec_num,mo_num) ] + implicit none + BEGIN_DOC +! Gradients and laplacian + END_DOC + integer :: i,j + do j=1,mo_num + do i=1,elec_alpha_num + mo_grad_lapl_alpha(1,i,j) = mo_grad_transp_x(j,i) + mo_grad_lapl_alpha(2,i,j) = mo_grad_transp_y(j,i) + mo_grad_lapl_alpha(3,i,j) = mo_grad_transp_z(j,i) + mo_grad_lapl_alpha(4,i,j) = mo_lapl_transp (j,i) + enddo + enddo + do j=1,mo_num + do i=elec_alpha_num+1,elec_num + mo_grad_lapl_beta(1,i,j) = mo_grad_transp_x(j,i) + mo_grad_lapl_beta(2,i,j) = mo_grad_transp_y(j,i) + mo_grad_lapl_beta(3,i,j) = mo_grad_transp_z(j,i) + mo_grad_lapl_beta(4,i,j) = mo_lapl_transp (j,i) + enddo + enddo + +END_PROVIDER + +BEGIN_PROVIDER [ double precision, mo_grad_lapl_transp, (4,mo_num,elec_num) ] + implicit none + BEGIN_DOC +! Gradients and laplacian + END_DOC + integer :: i,j + do i=1,elec_num + do j=1,mo_num + mo_grad_lapl_transp(1,j,i) = mo_grad_transp_x(j,i) + mo_grad_lapl_transp(2,j,i) = mo_grad_transp_y(j,i) + mo_grad_lapl_transp(3,j,i) = mo_grad_transp_z(j,i) + mo_grad_lapl_transp(4,j,i) = mo_lapl_transp (j,i) + enddo + enddo + +END_PROVIDER + + BEGIN_PROVIDER [ real, prepare_walkers_t ] implicit none BEGIN_DOC @@ -468,7 +492,6 @@ BEGIN_PROVIDER [ double precision , mo_value_at_nucl, (mo_num_8,nucl_num) ] PROVIDE ao_value_p - !DIR$ LOOP COUNT (2000) do i=1,ao_num if (ao_nucl(i) /= k) then ao_value_at_nucl_no_S(i) = ao_value_p(i) @@ -510,7 +533,6 @@ END_PROVIDER point(3) = nucl_coord(k,3)+ nucl_fitcusp_radius(k) TOUCH point - !DIR$ LOOP COUNT (2000) do j=1,ao_num ao_value_at_fitcusp_radius(j,k) = ao_value_p(j) ao_grad_at_fitcusp_radius(j,k) = ao_grad_p(j,3) @@ -664,7 +686,6 @@ BEGIN_PROVIDER [ real, nucl_fitcusp_param, (4,mo_num,nucl_num) ] cycle endif R = nucl_fitcusp_radius(k) - !DIR$ LOOP COUNT (500) do i=1,mo_num double precision :: lap_phi, grad_phi, phi, eta lap_phi = mo_lapl_at_fitcusp_radius(i,k) @@ -730,9 +751,15 @@ subroutine sparse_full_mv(A,LDA, & ! 
LDC and LDA have to be factors of simd_sp - !DIR$ VECTOR ALIGNED - !DIR$ LOOP COUNT (256) - !$OMP SIMD + IRP_IF NO_PREFETCH + IRP_ELSE + call MM_PREFETCH (A(j,indices(1)),3) + call MM_PREFETCH (A(j,indices(2)),3) + call MM_PREFETCH (A(j,indices(3)),3) + call MM_PREFETCH (A(j,indices(4)),3) + IRP_ENDIF + + !DIR$ SIMD do j=1,LDC C1(j) = 0. C2(j) = 0. @@ -740,11 +767,11 @@ subroutine sparse_full_mv(A,LDA, & C4(j) = 0. C5(j) = 0. enddo - !$OMP END SIMD + kmax2 = ishft(indices(0),-2) kmax2 = ishft(kmax2,2) kmax3 = indices(0) - !DIR$ LOOP COUNT (200) + do kao=1,kmax2,4 k_vec(1) = indices(kao ) k_vec(2) = indices(kao+1) @@ -761,19 +788,6 @@ subroutine sparse_full_mv(A,LDA, & d32 = B2(kao+2) d42 = B2(kao+3) - !DIR$ LOOP COUNT (256) - do k=0,LDA-1,$IRP_ALIGN/4 - !DIR$ VECTOR ALIGNED - !$OMP SIMD - do j=1,$IRP_ALIGN/4 - C1(j+k) = C1(j+k) + A(j+k,k_vec(1))*d11 + A(j+k,k_vec(2))*d21& - + A(j+k,k_vec(3))*d31 + A(j+k,k_vec(4))*d41 - C2(j+k) = C2(j+k) + A(j+k,k_vec(1))*d12 + A(j+k,k_vec(2))*d22& - + A(j+k,k_vec(3))*d32 + A(j+k,k_vec(4))*d42 - enddo - !$OMP END SIMD - enddo - d13 = B3(kao ) d23 = B3(kao+1) d33 = B3(kao+2) @@ -784,38 +798,47 @@ subroutine sparse_full_mv(A,LDA, & d34 = B4(kao+2) d44 = B4(kao+3) - !DIR$ LOOP COUNT (256) - do k=0,LDA-1,$IRP_ALIGN/4 - !DIR$ VECTOR ALIGNED - !$OMP SIMD - do j=1,$IRP_ALIGN/4 - C3(j+k) = C3(j+k) + A(j+k,k_vec(1))*d13 + A(j+k,k_vec(2))*d23& - + A(j+k,k_vec(3))*d33 + A(j+k,k_vec(4))*d43 - C4(j+k) = C4(j+k) + A(j+k,k_vec(1))*d14 + A(j+k,k_vec(2))*d24& - + A(j+k,k_vec(3))*d34 + A(j+k,k_vec(4))*d44 - enddo - !$OMP END SIMD - enddo - d15 = B5(kao ) d25 = B5(kao+1) d35 = B5(kao+2) d45 = B5(kao+3) - !DIR$ LOOP COUNT (256) do k=0,LDA-1,$IRP_ALIGN/4 !DIR$ VECTOR ALIGNED - !$OMP SIMD + !DIR$ SIMD FIRSTPRIVATE(d11,d21,d31,d41) do j=1,$IRP_ALIGN/4 + IRP_IF NO_PREFETCH + IRP_ELSE + call MM_PREFETCH (A(j+k,indices(kao+4)),3) + call MM_PREFETCH (A(j+k,indices(kao+5)),3) + call MM_PREFETCH (A(j+k,indices(kao+6)),3) + call MM_PREFETCH (A(j+k,indices(kao+7)),3) + IRP_ENDIF + C1(j+k) = C1(j+k) + A(j+k,k_vec(1))*d11 + A(j+k,k_vec(2))*d21& + + A(j+k,k_vec(3))*d31 + A(j+k,k_vec(4))*d41 + enddo + + !DIR$ VECTOR ALIGNED + !DIR$ SIMD FIRSTPRIVATE(d12,d22,d32,d42,d13,d23,d33,d43) + do j=1,$IRP_ALIGN/4 + C2(j+k) = C2(j+k) + A(j+k,k_vec(1))*d12 + A(j+k,k_vec(2))*d22& + + A(j+k,k_vec(3))*d32 + A(j+k,k_vec(4))*d42 + C3(j+k) = C3(j+k) + A(j+k,k_vec(1))*d13 + A(j+k,k_vec(2))*d23& + + A(j+k,k_vec(3))*d33 + A(j+k,k_vec(4))*d43 + enddo + + !DIR$ VECTOR ALIGNED + !DIR$ SIMD FIRSTPRIVATE(d14,d24,d34,d44,d15,d25,d35,d45) + do j=1,$IRP_ALIGN/4 + C4(j+k) = C4(j+k) + A(j+k,k_vec(1))*d14 + A(j+k,k_vec(2))*d24& + + A(j+k,k_vec(3))*d34 + A(j+k,k_vec(4))*d44 C5(j+k) = C5(j+k) + A(j+k,k_vec(1))*d15 + A(j+k,k_vec(2))*d25& + A(j+k,k_vec(3))*d35 + A(j+k,k_vec(4))*d45 enddo - !$OMP END SIMD enddo enddo - !DIR$ LOOP COUNT (200) do kao = kmax2+1, kmax3 k_vec(1) = indices(kao) d11 = B1(kao) @@ -824,10 +847,9 @@ subroutine sparse_full_mv(A,LDA, & d14 = B4(kao) d15 = B5(kao) !DIR$ VECTOR ALIGNED - !DIR$ LOOP COUNT (256) - do k=0,LDA-1,simd_sp + do k=0,LDA-1,$IRP_ALIGN/4 !DIR$ VECTOR ALIGNED - !$OMP SIMD + !DIR$ SIMD FIRSTPRIVATE(d11,d12,d13,d14,d15) do j=1,$IRP_ALIGN/4 C1(j+k) = C1(j+k) + A(j+k,k_vec(1))*d11 C2(j+k) = C2(j+k) + A(j+k,k_vec(1))*d12 @@ -835,7 +857,6 @@ subroutine sparse_full_mv(A,LDA, & C4(j+k) = C4(j+k) + A(j+k,k_vec(1))*d14 C5(j+k) = C5(j+k) + A(j+k,k_vec(1))*d15 enddo - !$OMP END SIMD enddo enddo
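The Kahan template in srmc_step.irp.f is only renamed in this patch (pop_weight_mult becomes srmc_pop_weight_mult); for reference, the compensated-summation pattern it expands to behaves like the following standalone sketch (all names and the 1.d-10 term are illustrative, not part of the patch):

program kahan_sketch
  implicit none
  double precision :: acc, comp, corrected, tmp, term
  integer :: k
  acc  = 0.d0      ! plays the role of $X_srmc_block_walk
  comp = 0.d0      ! plays the role of $X_srmc_block_walk_kahan(1)
  do k=1,10000000
     term      = 1.d-10           ! stands for $X * srmc_pop_weight_mult * srmc_weight(i_walk)
     corrected = term - comp      ! kahan(3): input corrected by the running compensation
     tmp       = acc + corrected  ! kahan(2): provisional sum
     comp      = (tmp - acc) - corrected
     acc       = tmp
  enddo
  print *, 'compensated sum =', acc, ' (exact value: 1.d-3)'
end program kahan_sketch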
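The det_update$n specializations and det_update_general in det.irp.f all implement the same rank-1 Sherman-Morrison update of the Slater-matrix inverse and determinant after a single column substitution; the patch restructures the loops around it (scalar accumulators zj..zj3 with !DIR$ SIMD REDUCTION/FIRSTPRIVATE in place of !$OMP SIMD, NOPREFETCH/MM_PREFETCH hints, templates extended from n=50 up to n=150, and det_update_general now assumed to run only for n>150). A plain, unoptimized reference of the same update is sketched below for comparison; it assumes the storage convention implied by the code (S_inv holding det(S) times the transposed inverse, which is why the determinant is updated additively), and det_update_ref/check_det_update are illustrative names only.

subroutine det_update_ref(n, m_new, l, S, S_inv, d)
  implicit none
  integer, intent(in)             :: n, l
  double precision, intent(in)    :: m_new(n)     ! new column l of S
  double precision, intent(inout) :: S(n,n)       ! Slater matrix
  double precision, intent(inout) :: S_inv(n,n)   ! det(S) * transpose(S^-1)
  double precision, intent(inout) :: d            ! det(S)
  double precision :: u(n), z(n), w(n), lambda, d_inv
  integer :: i, j

  u(:) = m_new(:) - S(:,l)             ! change of column l
  z(:) = matmul(u, S_inv)              ! z(j) = sum_i S_inv(i,j)*u(i)
  d_inv  = 1.d0/d
  d      = d + z(l)                    ! new determinant
  lambda = d*d_inv                     ! ratio det_new/det_old
  if (dabs(lambda) < 1.d-3) then       ! nearly singular update: signal the caller
     d = 0.d0                          ! (the production code then recomputes from scratch)
     return
  endif
  w(:)   = S_inv(:,l)*d_inv
  S(:,l) = m_new(:)
  do i=1,n                             ! Sherman-Morrison: S_inv <- lambda*S_inv - w * z^T
     do j=1,n
        S_inv(j,i) = S_inv(j,i)*lambda - z(i)*w(j)
     enddo
  enddo
end subroutine det_update_ref

program check_det_update
  implicit none
  integer, parameter :: n = 4
  double precision :: S(n,n), S_inv(n,n), eye(n,n), col(n), d
  integer :: i
  S = 0.d0 ; S_inv = 0.d0 ; eye = 0.d0
  do i=1,n
     S(i,i) = 1.d0 ; S_inv(i,i) = 1.d0 ; eye(i,i) = 1.d0
  enddo
  d   = 1.d0                                 ! det(I) = 1 and S_inv = d*(S^-1)^T = I
  col = (/ 0.5d0, 2.d0, -1.d0, 0.25d0 /)     ! new column 2
  call det_update_ref(n, col, 2, S, S_inv, d)
  print *, 'det      =', d                   ! expected: col(2) = 2
  print *, 'residual =', maxval(abs(matmul(S, transpose(S_inv))/d - eye))
end program check_det_update

Starting from the identity, the updated determinant must equal col(2) and S times transpose(S_inv)/d must stay the identity, which is what the residual line checks.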