diff --git a/src/det.irp.f b/src/det.irp.f index 9e2e11d..c488e12 100644 --- a/src/det.irp.f +++ b/src/det.irp.f @@ -434,49 +434,60 @@ subroutine det_update$n(n,LDS,m,l,S,S_inv,d) !DIR$ ASSUME (mod(LDS,$IRP_ALIGN/8) == 0) !DIR$ ASSUME (LDS >= $n) integer :: i,j - double precision :: zl, ui + double precision :: zj, zj1, zj2, zj3 - zl = 0.d0 !DIR$ VECTOR ALIGNED - do i=1,$n-1 - ui = m(i) - S(i,l) - u(i) = ui - zl = zl + S_inv(i,l)*ui + !DIR$ SIMD + do i=1,$n + u(i) = m(i) - S(i,l) enddo - u($n) = m($n) - S($n,l) - zl = zl + S_inv($n,l)*u($n) + + zj = 0.d0 + !DIR$ SIMD REDUCTION(+:zj) + do i=1,$n-1,4 + zj = zj + S_inv(i,l)*u(i) + S_inv(i+1,l)*u(i+1) & + + S_inv(i+2,l)*u(i+2) + S_inv(i+3,l)*u(i+3) + enddo + zj = zj + S_inv($n,l)*u($n) d_inv = 1.d0/d - d = d+zl + d = d+zj lambda = d*d_inv if (dabs(lambda) < 1.d-3) then d = 0.d0 return endif - !DIR$ VECTOR ALIGNED - z = 0.d0 !DIR$ VECTOR ALIGNED do j=1,$n-1,4 + zj = 0.d0 + zj1 = 0.d0 + zj2 = 0.d0 + zj3 = 0.d0 !DIR$ VECTOR ALIGNED + !DIR$ SIMD REDUCTION(+:zj,zj1,zj2,zj3) do i=1,$n-1 - z(j ) = z(j ) + S_inv(i,j )*u(i) - z(j+1) = z(j+1) + S_inv(i,j+1)*u(i) - z(j+2) = z(j+2) + S_inv(i,j+2)*u(i) - z(j+3) = z(j+3) + S_inv(i,j+3)*u(i) + zj = zj + S_inv(i,j )*u(i) + zj1 = zj1 + S_inv(i,j+1)*u(i) + zj2 = zj2 + S_inv(i,j+2)*u(i) + zj3 = zj3 + S_inv(i,j+3)*u(i) enddo - z(j ) = z(j ) + S_inv($n,j )*u($n) - z(j+1) = z(j+1) + S_inv($n,j+1)*u($n) - z(j+2) = z(j+2) + S_inv($n,j+2)*u($n) - z(j+3) = z(j+3) + S_inv($n,j+3)*u($n) + z(j ) = zj + S_inv($n,j )*u($n) + z(j+1) = zj1 + S_inv($n,j+1)*u($n) + z(j+2) = zj2 + S_inv($n,j+2)*u($n) + z(j+3) = zj3 + S_inv($n,j+3)*u($n) enddo + zj = 0.d0 !DIR$ VECTOR ALIGNED + !DIR$ SIMD REDUCTION(+:zj) do i=1,$n - z($n) = z($n) + S_inv(i,$n)*u(i) + zj = zj + S_inv(i,$n)*u(i) enddo + z($n) = zj !DIR$ VECTOR ALIGNED + !DIR$ SIMD FIRSTPRIVATE(d_inv) do i=1,$n w(i) = S_inv(i,l)*d_inv S(i,l) = m(i) @@ -484,6 +495,7 @@ subroutine det_update$n(n,LDS,m,l,S,S_inv,d) do i=1,$n-1,4 !DIR$ VECTOR ALIGNED + !DIR$ SIMD FIRSTPRIVATE(lambda,z) do j=1,$n-1 S_inv(j,i ) = S_inv(j,i )*lambda - w(j)*z(i ) S_inv(j,i+1) = S_inv(j,i+1)*lambda - w(j)*z(i+1) @@ -497,6 +509,7 @@ subroutine det_update$n(n,LDS,m,l,S,S_inv,d) enddo !DIR$ VECTOR ALIGNED + !DIR$ SIMD FIRSTPRIVATE(lambda,z) do i=1,$n S_inv(i,$n) = S_inv(i,$n)*lambda -w(i)*z($n) enddo @@ -567,61 +580,71 @@ subroutine det_update$n(n,LDS,m,l,S,S_inv,d) !DIR$ ASSUME (LDS >= $n) integer :: i,j + double precision :: zj, zj1, zj2, zj3 !DIR$ VECTOR ALIGNED + !DIR$ SIMD do i=1,$n u(i) = m(i) - S(i,l) enddo - z(l) = 0.d0 + zj = 0.d0 !DIR$ VECTOR ALIGNED + !DIR$ SIMD REDUCTION(+:zj) do i=1,$n-2,4 - z(l) = z(l) + S_inv(i,l)*u(i) + S_inv(i+1,l)*u(i+1) & - + S_inv(i+2,l)*u(i+2) + S_inv(i+3,l)*u(i+3) + zj = zj + S_inv(i,l)*u(i) + S_inv(i+1,l)*u(i+1) & + + S_inv(i+2,l)*u(i+2) + S_inv(i+3,l)*u(i+3) enddo i=$n-1 - z(l) = z(l) + S_inv(i,l)*u(i) + S_inv(i+1,l)*u(i+1) + zj = zj + S_inv(i,l)*u(i) + S_inv(i+1,l)*u(i+1) d_inv = 1.d0/d - d = d+z(l) + d = d+zj lambda = d*d_inv if (dabs(lambda) < 1.d-3) then d = 0.d0 return endif - !DIR$ VECTOR ALIGNED - z = 0.d0 !DIR$ VECTOR ALIGNED do j=1,$n-2,4 + zj = 0.d0 + zj1 = 0.d0 + zj2 = 0.d0 + zj3 = 0.d0 !DIR$ VECTOR ALIGNED + !DIR$ SIMD REDUCTION(+:zj,zj1,zj2,zj3) do i=1,$n-2 - z(j ) = z(j ) + S_inv(i,j )*u(i) - z(j+1) = z(j+1) + S_inv(i,j+1)*u(i) - z(j+2) = z(j+2) + S_inv(i,j+2)*u(i) - z(j+3) = z(j+3) + S_inv(i,j+3)*u(i) + zj = zj + S_inv(i,j )*u(i) + zj1 = zj1 + S_inv(i,j+1)*u(i) + zj2 = zj2 + S_inv(i,j+2)*u(i) + zj3 = zj3 + S_inv(i,j+3)*u(i) enddo - z(j ) = z(j ) + S_inv($n-1,j )*u($n-1) + z(j ) = zj + S_inv($n-1,j )*u($n-1) z(j ) = z(j ) + S_inv($n,j )*u($n) - z(j+1) = z(j+1) + S_inv($n-1,j+1)*u($n-1) + z(j+1) = zj1 + S_inv($n-1,j+1)*u($n-1) z(j+1) = z(j+1) + S_inv($n,j+1)*u($n) - z(j+2) = z(j+2) + S_inv($n-1,j+2)*u($n-1) + z(j+2) = zj2 + S_inv($n-1,j+2)*u($n-1) z(j+2) = z(j+2) + S_inv($n,j+2)*u($n) - z(j+3) = z(j+3) + S_inv($n-1,j+3)*u($n-1) + z(j+3) = zj3 + S_inv($n-1,j+3)*u($n-1) z(j+3) = z(j+3) + S_inv($n,j+3)*u($n) enddo j=$n-1 + zj = 0.d0 + zj1 = 0.d0 !DIR$ VECTOR ALIGNED + !DIR$ SIMD REDUCTION(+:zj,zj1) do i=1,$n-2 - z(j ) = z(j ) + S_inv(i,j )*u(i) - z(j+1) = z(j+1) + S_inv(i,j+1)*u(i) + zj = zj + S_inv(i,j )*u(i) + zj1 = zj1 + S_inv(i,j+1)*u(i) enddo - z(j ) = z(j ) + S_inv($n-1,j )*u($n-1) + z(j ) = zj + S_inv($n-1,j )*u($n-1) z(j ) = z(j ) + S_inv($n,j )*u($n) - z(j+1) = z(j+1) + S_inv($n-1,j+1)*u($n-1) + z(j+1) = zj1 + S_inv($n-1,j+1)*u($n-1) z(j+1) = z(j+1) + S_inv($n,j+1)*u($n) !DIR$ VECTOR ALIGNED + !DIR$ SIMD FIRSTPRIVATE(d_inv) do i=1,$n w(i) = S_inv(i,l)*d_inv S(i,l) = m(i) @@ -629,6 +652,7 @@ subroutine det_update$n(n,LDS,m,l,S,S_inv,d) do i=1,$n-2,4 !DIR$ VECTOR ALIGNED + !DIR$ SIMD FIRSTPRIVATE(lambda,z) do j=1,$n-2 S_inv(j,i ) = S_inv(j,i )*lambda -z(i )*w(j) S_inv(j,i+1) = S_inv(j,i+1)*lambda -z(i+1)*w(j) @@ -647,6 +671,7 @@ subroutine det_update$n(n,LDS,m,l,S,S_inv,d) i=$n-1 !DIR$ VECTOR ALIGNED + !DIR$ SIMD FIRSTPRIVATE(lambda,z) do j=1,$n-2 S_inv(j,i ) = S_inv(j,i )*lambda -z(i )*w(j) S_inv(j,i+1) = S_inv(j,i+1)*lambda -z(i+1)*w(j) @@ -719,67 +744,75 @@ subroutine det_update$n(n,LDS,m,l,S,S_inv,d) !DIR$ ASSUME (mod(LDS,$IRP_ALIGN/8) == 0) !DIR$ ASSUME (LDS >= $n) integer :: i,j - + + double precision :: zj, zj1, zj2, zj3 !DIR$ VECTOR ALIGNED + !DIR$ SIMD do i=1,$n u(i) = m(i) - S(i,l) enddo - z(l) = 0.d0 + zj = 0.d0 !DIR$ VECTOR ALIGNED + !DIR$ SIMD REDUCTION(+:zj) do i=1,$n-3,4 - z(l) = z(l) + S_inv(i,l)*u(i) + S_inv(i+1,l)*u(i+1) & - + S_inv(i+2,l)*u(i+2) + S_inv(i+3,l)*u(i+3) + zj = zj + S_inv(i,l)*u(i) + S_inv(i+1,l)*u(i+1) & + + S_inv(i+2,l)*u(i+2) + S_inv(i+3,l)*u(i+3) enddo i=$n-2 - z(l) = z(l) + S_inv(i,l)*u(i) + S_inv(i+1,l)*u(i+1) + S_inv(i+2,l)*u(i+2) + zj = zj + S_inv(i,l)*u(i) + S_inv(i+1,l)*u(i+1) + S_inv(i+2,l)*u(i+2) d_inv = 1.d0/d - d = d+z(l) + d = d+zj lambda = d*d_inv if (dabs(lambda) < 1.d-3) then d = 0.d0 return endif - !DIR$ VECTOR ALIGNED - z = 0.d0 !DIR$ VECTOR ALIGNED do j=1,$n-3,4 + zj = 0.d0 + zj1 = 0.d0 + zj2 = 0.d0 + zj3 = 0.d0 !DIR$ VECTOR ALIGNED + !DIR$ SIMD REDUCTION(+:zj,zj1,zj2,zj3) do i=1,$n-3 - z(j ) = z(j ) + S_inv(i,j )*u(i) - z(j+1) = z(j+1) + S_inv(i,j+1)*u(i) - z(j+2) = z(j+2) + S_inv(i,j+2)*u(i) - z(j+3) = z(j+3) + S_inv(i,j+3)*u(i) + zj = zj + S_inv(i,j )*u(i) + zj1 = zj1 + S_inv(i,j+1)*u(i) + zj2 = zj2 + S_inv(i,j+2)*u(i) + zj3 = zj3 + S_inv(i,j+3)*u(i) enddo - z(j ) = z(j ) + S_inv($n-2,j )*u($n-2) + z(j ) = zj + S_inv($n-2,j )*u($n-2) z(j ) = z(j ) + S_inv($n-1,j )*u($n-1) z(j ) = z(j ) + S_inv($n,j )*u($n) - z(j+1) = z(j+1) + S_inv($n-2,j+1)*u($n-2) + z(j+1) = zj1 + S_inv($n-2,j+1)*u($n-2) z(j+1) = z(j+1) + S_inv($n-1,j+1)*u($n-1) z(j+1) = z(j+1) + S_inv($n,j+1)*u($n) - z(j+2) = z(j+2) + S_inv($n-2,j+2)*u($n-2) + z(j+2) = zj2 + S_inv($n-2,j+2)*u($n-2) z(j+2) = z(j+2) + S_inv($n-1,j+2)*u($n-1) z(j+2) = z(j+2) + S_inv($n,j+2)*u($n) - z(j+3) = z(j+3) + S_inv($n-2,j+3)*u($n-2) + z(j+3) = zj3 + S_inv($n-2,j+3)*u($n-2) z(j+3) = z(j+3) + S_inv($n-1,j+3)*u($n-1) z(j+3) = z(j+3) + S_inv($n,j+3)*u($n) enddo j=$n-2 - z(j ) = S_inv($n,j )*u($n) - z(j+1) = S_inv($n,j+1)*u($n) - z(j+2) = S_inv($n,j+2)*u($n) + zj = 0.d0 + zj1 = 0.d0 + zj2 = 0.d0 !DIR$ VECTOR ALIGNED - do i=1,$n-1 - z(j ) = z(j ) + S_inv(i,j )*u(i) - z(j+1) = z(j+1) + S_inv(i,j+1)*u(i) - z(j+2) = z(j+2) + S_inv(i,j+2)*u(i) + !DIR$ SIMD REDUCTION(+:zj,zj1,zj2) + do i=1,$n + zj = zj + S_inv(i,j )*u(i) + zj1 = zj1 + S_inv(i,j+1)*u(i) + zj2 = zj2 + S_inv(i,j+2)*u(i) enddo !DIR$ VECTOR ALIGNED + !DIR$ SIMD FIRSTPRIVATE(d_inv) do i=1,$n w(i) = S_inv(i,l)*d_inv S(i,l) = m(i) @@ -787,6 +820,7 @@ subroutine det_update$n(n,LDS,m,l,S,S_inv,d) do i=1,$n-3,4 !DIR$ VECTOR ALIGNED + !DIR$ SIMD FIRSTPRIVATE(lambda,z) do j=1,$n-3 S_inv(j,i ) = S_inv(j,i )*lambda - w(j)*z(i ) S_inv(j,i+1) = S_inv(j,i+1)*lambda - w(j)*z(i+1) @@ -809,6 +843,7 @@ subroutine det_update$n(n,LDS,m,l,S,S_inv,d) i=$n-2 !DIR$ VECTOR ALIGNED + !DIR$ SIMD FIRSTPRIVATE(lambda,z) do j=1,$n S_inv(j,i ) = S_inv(j,i )*lambda - w(j)*z(i ) S_inv(j,i+1) = S_inv(j,i+1)*lambda - w(j)*z(i+1) @@ -881,20 +916,23 @@ subroutine det_update_general(n,LDS,m,l,S,S_inv,d) !DIR$ ASSUME (n>150) integer :: i,j,n4 + double precision :: zl !DIR$ VECTOR ALIGNED + !DIR$ SIMD do i=1,n u(i) = m(i) - S(i,l) enddo - z(l) = 0.d0 + zl = 0.d0 !DIR$ VECTOR ALIGNED + !DIR$ SIMD REDUCTION(+:zl) do i=1,n - z(l) = z(l) + S_inv(i,l)*u(i) + zl = zl + S_inv(i,l)*u(i) enddo d_inv = 1.d0/d - d = d+z(l) + d = d+zl lambda = d*d_inv if ( dabs(lambda) < 1.d-3 ) then @@ -902,31 +940,41 @@ subroutine det_update_general(n,LDS,m,l,S,S_inv,d) return endif + double precision :: zj, zj1, zj2, zj3 + n4 = iand(n,not(3)) !DIR$ VECTOR ALIGNED do j=1,n4,4 - z(j ) = 0.d0 - z(j+1) = 0.d0 - z(j+2) = 0.d0 - z(j+3) = 0.d0 + zj = 0.d0 + zj1 = 0.d0 + zj2 = 0.d0 + zj3 = 0.d0 !DIR$ VECTOR ALIGNED + !DIR$ SIMD REDUCTION(+:zj,zj1,zj2,zj3) do i=1,n - z(j ) = z(j ) + S_inv(i,j )*u(i) - z(j+1) = z(j+1) + S_inv(i,j+1)*u(i) - z(j+2) = z(j+2) + S_inv(i,j+2)*u(i) - z(j+3) = z(j+3) + S_inv(i,j+3)*u(i) + zj = zj + S_inv(i,j )*u(i) + zj1 = zj1 + S_inv(i,j+1)*u(i) + zj2 = zj2 + S_inv(i,j+2)*u(i) + zj3 = zj3 + S_inv(i,j+3)*u(i) enddo + z(j ) = zj + z(j+1) = zj1 + z(j+2) = zj2 + z(j+3) = zj3 enddo do j=n4+1,n - z(j) = 0.d0 + zj = 0.d0 !DIR$ VECTOR ALIGNED + !DIR$ SIMD REDUCTION(+:zj) do i=1,n - z(j) = z(j) + S_inv(i,j)*u(i) + zj = zj + S_inv(i,j)*u(i) enddo + z(j ) = zj enddo !DIR$ VECTOR ALIGNED + !DIR$ SIMD FIRSTPRIVATE(d_inv) do i=1,n w(i) = S_inv(i,l)*d_inv S(i,l) = m(i) @@ -934,7 +982,7 @@ subroutine det_update_general(n,LDS,m,l,S,S_inv,d) do i=1,n4,4 !DIR$ VECTOR ALIGNED - !DIR$ VECTOR ALWAYS + !DIR$ SIMD FIRSTPRIVATE(lambda,z) do j=1,n S_inv(j,i ) = S_inv(j,i )*lambda -z(i )*w(j) S_inv(j,i+1) = S_inv(j,i+1)*lambda -z(i+1)*w(j) @@ -945,7 +993,7 @@ subroutine det_update_general(n,LDS,m,l,S,S_inv,d) do i=n4+1,n !DIR$ VECTOR ALIGNED - !DIR$ VECTOR ALWAYS + !DIR$ SIMD FIRSTPRIVATE(lambda,z) do j=1,n S_inv(j,i) = S_inv(j,i)*lambda -z(i)*w(j) enddo diff --git a/src/mo.irp.f b/src/mo.irp.f index 12d407a..eb300af 100644 --- a/src/mo.irp.f +++ b/src/mo.irp.f @@ -709,7 +709,7 @@ subroutine sparse_full_mv(A,LDA, & call MM_PREFETCH (A(j,indices(4)),3) IRP_ENDIF - !DIR$ VECTOR ALIGNED + !DIR$ SIMD do j=1,LDC C1(j) = 0. C2(j) = 0. @@ -755,6 +755,7 @@ subroutine sparse_full_mv(A,LDA, & do k=0,LDA-1,$IRP_ALIGN/4 !DIR$ VECTOR ALIGNED + !DIR$ SIMD FIRSTPRIVATE(d11,d21,d31,d41) do j=1,$IRP_ALIGN/4 IRP_IF NO_PREFETCH IRP_ELSE @@ -768,6 +769,7 @@ subroutine sparse_full_mv(A,LDA, & enddo !DIR$ VECTOR ALIGNED + !DIR$ SIMD FIRSTPRIVATE(d12,d22,d32,d42,d13,d23,d33,d43) do j=1,$IRP_ALIGN/4 C2(j+k) = C2(j+k) + A(j+k,k_vec(1))*d12 + A(j+k,k_vec(2))*d22& + A(j+k,k_vec(3))*d32 + A(j+k,k_vec(4))*d42 @@ -776,6 +778,7 @@ subroutine sparse_full_mv(A,LDA, & enddo !DIR$ VECTOR ALIGNED + !DIR$ SIMD FIRSTPRIVATE(d14,d24,d34,d44,d15,d25,d35,d45) do j=1,$IRP_ALIGN/4 C4(j+k) = C4(j+k) + A(j+k,k_vec(1))*d14 + A(j+k,k_vec(2))*d24& + A(j+k,k_vec(3))*d34 + A(j+k,k_vec(4))*d44 @@ -796,6 +799,7 @@ subroutine sparse_full_mv(A,LDA, & !DIR$ VECTOR ALIGNED do k=0,LDA-1,$IRP_ALIGN/4 !DIR$ VECTOR ALIGNED + !DIR$ SIMD FIRSTPRIVATE(d11,d12,d13,d14,d15) do j=1,$IRP_ALIGN/4 C1(j+k) = C1(j+k) + A(j+k,k_vec(1))*d11 C2(j+k) = C2(j+k) + A(j+k,k_vec(1))*d12