Scaling

2024-04-16 14:45:18 +02:00 · 2024-04-16 14:45:18 +02:00 · 792ab5ceb2
commit 792ab5ceb2
parent a83cdf77d1
5 changed files with 44 additions and 185 deletions
--- a/Data/scaling.plt
+++ b/Data/scaling.plt
@ -1,180 +1,13 @@
-#!/usr/bin/gnuplot -persist
-#
-#    
-#    	G N U P L O T
-#    	Version 5.4 patchlevel 2    last modified 2021-06-01 
-#    
-#    	Copyright (C) 1986-1993, 1998, 2004, 2007-2021
-#    	Thomas Williams, Colin Kelley and many others
-#    
-#    	gnuplot home:     http://www.gnuplot.info
-#    	faq, bugs, etc:   type "help FAQ"
-#    	immediate help:   type "help"  (plot window: hit 'h')
-# set terminal qt 0 font "Sans,9"
-# set output
-unset clip points
-set clip one
-unset clip two
-unset clip radial
-set errorbars front 1.000000 
-set border 31 front lt black linewidth 1.000 dashtype solid
-set zdata 
-set ydata 
-set xdata 
-set y2data 
-set x2data 
-set boxwidth
-set boxdepth 0
-set style fill  empty border
-set style rectangle back fc  bgnd fillstyle   solid 1.00 border lt -1
-set style circle radius graph 0.02 
-set style ellipse size graph 0.05, 0.03 angle 0 units xy
-set dummy x, y
-set format x "% h" 
-set format y "% h" 
-set format x2 "% h" 
-set format y2 "% h" 
-set format z "% h" 
-set format cb "% h" 
-set format r "% h" 
-set ttics format "% h"
-set timefmt "%d/%m/%y,%H:%M"
-set angles radians
-set tics back
-set grid nopolar
-set grid xtics nomxtics ytics nomytics noztics nomztics nortics nomrtics \
- nox2tics nomx2tics noy2tics nomy2tics nocbtics nomcbtics
-set grid layerdefault   lt 0 linecolor 0 linewidth 0.500,  lt 0 linecolor 0 linewidth 0.500
-unset raxis
-set theta counterclockwise right
-set style parallel front  lt black linewidth 2.000 dashtype solid
-set key notitle
-set key fixed left top vertical Right noreverse enhanced autotitle nobox
-set key noinvert samplen 4 spacing 1 width 0 height 0 
-set key maxcolumns 0 maxrows 0
-set key noopaque
-unset label
-unset arrow
-unset style line
-unset style arrow
-set style histogram clustered gap 2 title textcolor lt -1
-unset object
-unset walls
-set style textbox  transparent margins  1.0,  1.0 border  lt -1 linewidth  1.0
-set offsets 0, 0, 0, 0
-set pointsize 1
-set pointintervalbox 1
-set encoding default
-unset polar
-unset parametric
-unset spiderplot
-unset decimalsign
-unset micro
-unset minussign
-set view 60, 30, 1, 1
-set view azimuth 0
-set rgbmax 255
-set samples 100, 100
-set isosamples 10, 10
-set surface 
-unset contour
-set cntrlabel  format '%8.3g' font '' start 5 interval 20
-set mapping cartesian
-set datafile separator whitespace
-set datafile nocolumnheaders
-unset hidden3d
-set cntrparam order 4
-set cntrparam linear
-set cntrparam levels 5
-set cntrparam levels auto
-set cntrparam firstlinetype 0 unsorted
-set cntrparam points 5
-set size ratio 0 1,1
-set origin 0,0
-set style data points
-set style function lines
-unset xzeroaxis
-unset yzeroaxis
-unset zzeroaxis
-unset x2zeroaxis
-unset y2zeroaxis
-set xyplane relative 0.5
-set tics scale  1, 0.5, 1, 1, 1
-set mxtics default
-set mytics default
-set mztics default
-set mx2tics default
-set my2tics default
-set mcbtics default
-set mrtics default
-set nomttics
-set xtics border in scale 1,0.5 mirror norotate  autojustify
-set xtics  norangelimit autofreq 
-set ytics border in scale 1,0.5 mirror norotate  autojustify
-set ytics  norangelimit autofreq 
-set ztics border in scale 1,0.5 nomirror norotate  autojustify
-set ztics  norangelimit autofreq 
-unset x2tics
-unset y2tics
-set cbtics border in scale 1,0.5 mirror norotate  autojustify
-set cbtics  norangelimit autofreq 
-set rtics axis in scale 1,0.5 nomirror norotate  autojustify
-set rtics  norangelimit autofreq 
-unset ttics
-set title "" 
-set title  font "" textcolor lt -1 norotate
-set timestamp bottom 
-set timestamp "" 
-set timestamp  font "" textcolor lt -1 norotate
-set trange [ * : * ] noreverse nowriteback
-set urange [ * : * ] noreverse nowriteback
-set vrange [ * : * ] noreverse nowriteback
+#!/usr/bin/env gnuplot
+
+set grid
+set key bottom
+set format y "%.1f"
 set xlabel "Number of cores" 
-set xlabel  font "" textcolor lt -1 norotate
-set x2label "" 
-set x2label  font "" textcolor lt -1 norotate
-set xrange [ * : * ] noreverse writeback
-set x2range [ * : * ] noreverse writeback
 set ylabel "Speedup" 
-set ylabel  font "" textcolor lt -1 rotate
-set y2label "" 
-set y2label  font "" textcolor lt -1 rotate
-set yrange [ * : * ] noreverse writeback
-set y2range [ * : * ] noreverse writeback
-set zlabel "" 
-set zlabel  font "" textcolor lt -1 norotate
-set zrange [ * : * ] noreverse writeback
-set cblabel "" 
-set cblabel  font "" textcolor lt -1 rotate
-set cbrange [ * : * ] noreverse writeback
-set rlabel "" 
-set rlabel  font "" textcolor lt -1 norotate
-set rrange [ * : * ] noreverse writeback
-unset logscale
-unset jitter
-set zero 1e-08
-set lmargin  -1
-set bmargin  -1
-set rmargin  -1
-set tmargin  -1
-set locale "en_AU.UTF-8"
-set pm3d explicit at s
-set pm3d scansautomatic
-set pm3d interpolate 1,1 flush begin noftriangles noborder corners2color mean
-set pm3d clip z 
-set pm3d nolighting
-set palette positive nops_allcF maxcolors 0 gamma 1.5 color model RGB 
-set palette rgbformulae 7, 5, 15
-set colorbox default
-set colorbox vertical origin screen 0.9, 0.2 size screen 0.05, 0.6 front  noinvert bdefault
-set style boxplot candles range  1.50 outliers pt 7 separation 1 labels auto unsorted
-set loadpath 
-set fontpath
-set psdir
-set fit brief errorvariables nocovariancevariables errorscaling prescale nowrap v5
-GNUTERM = "qt"
-I = {0.0, 1.0}
-VoxelDistance = 0.0
-## Last datafile plotted: "scaling.dat"
-plot 'scaling.dat' u 1:(740.99828964984044/$2) w lp notitle, x title "Ideal"
-#    EOF
+set term pdfcairo enhanced font "Times,14" linewidth 2 rounded size 5.0in, 3.0in
+set output 'scaling.pdf'
+set pointsize 0.5
+plot 'scaling.dat' i 1 u 1:(740.99828964984044/$2) w lp title "ARM Q80", \
+     'scaling.dat' i 0 u 1:(266./$2) w lp title "AMD EPYC", \
+      x title "Ideal"
--- a/Manuscript/benzene_qz.pdf
+++ b/Manuscript/benzene_qz.pdf
--- a/Manuscript/benzene_tz.pdf
+++ b/Manuscript/benzene_tz.pdf
--- a/Manuscript/scaling.pdf
+++ b/Manuscript/scaling.pdf
--- a/Manuscript/stochastic_triples.tex
+++ b/Manuscript/stochastic_triples.tex
@ -243,6 +243,20 @@ As each bucket is equiprobable, samples are defined as combinations of triplets,
 Should the values of $E_{abc}$ be skewed, this advanced refinement significantly diminishes the variance.


+The total perturbative contribution is computed as the aggregate of contributions from various buckets:
+\begin{equation}
+E_{(T)} = \sum_B E_B = \sum_B\sum_{(a,b,c) \in B} E_{abc}.
+\end{equation}
+Once every triplet within a bucket $B$ has been drawn at least once, the contribution $E_B$ can be determined.
+At this point, $E_B$ no longer needs to be evaluated stochastically, and the buckets can be partitioned into stochastic ($\mathcal{S}$) and deterministic ($\mathcal{D}$) groups:
+\begin{equation}
+E_{(T)} = \sum_{B \in \mathcal{D}} E_B + \frac{1}{|\mathcal{S}|} \sum_{B \in \mathcal{S}}
+\left \langle E^B_{abc} \times \frac{- \epsilon_a - \epsilon_b - \epsilon_c}{\mathcal{N}} \right \rangle_{P(a,b,c), (a,b,c) \in B}.
+\end{equation}
+Not all buckets are of equal size; the number of triplets per bucket decreases with the bucket's index. Consequently, the initial buckets transition into the deterministic set first, gradually reducing the stochastic contribution. When every triplet has been drawn, the exact value of $E_{(T)}$ is obtained, devoid of statistical error.
+To accelerate the completion of the buckets, each Monte Carlo iteration triggers the computation of the first non-computed triplet. This ensures that after $N$ drawings, the
+exact contribution from each bucket can be obtained.
+
 %=================================================================%
 \subsection{Implementation Details}
 \label{sec:implementation}
@ -410,15 +424,12 @@ The vibrational frequency and equilibrium distance estimated using this data, $\
 Figure \ref{fig:cucl} illustrates the potential energy surface of \ce{CuCl}, displaying both the exact CCSD(T) energies and those estimated via the semi-stochastic method.


-\subsection{Parallel efficiency}
-
-
 \subsection{Performance analysis}

 The primary bottleneck of our proposed algorithm lies in the generation of the sub-tensor $W^{abc}$ for each $(a,b,c)$ triplet, as discussed in Section~\ref{sec:theory}.
 However, we have outlined a strategy to reframe this operation into BLAS matrix multiplications,\cite{form_w_abc} offering the potential for significantly enhanced efficiency.

-We evaluated the efficiency of our implementation using the Likwid\cite{treibig_2010} performance analysis tool on two distinct x86 platforms: an AMD EPYC 7513 dual-socket server equipped with 64 cores at \SI{2.6}{\giga\hertz}, and an Intel Xeon Gold 6130 dual-socket server with 32 cores at \SI{2.1}{\giga\hertz}.
+We evaluated the efficiency of our implementation using the Likwid\cite{treibig_2010} performance analysis tool on two distinct x86 platforms: an AMD \textsc{Epyc} 7513 dual-socket server equipped with 64 cores at \SI{2.6}{\giga\hertz}, and an Intel Xeon Gold 6130 dual-socket server with 32 cores at \SI{2.1}{\giga\hertz}.
 We linked our code with the Intel MKL library for BLAS operations.
 Additionally, we executed the code on an ARM Q80 server featuring 80 cores at \SI{2.8}{\giga\hertz}, and although performance counters were unavailable, we approximated the Flop/s rate by comparing the total execution time with that measured on the AMD CPU.
 For this, we utilized the ArmPL library for BLAS operations.
@ -429,7 +440,7 @@ For this, we utilized the ArmPL library for BLAS operations.
 CPU & $N_{\text{cores}}$ & $V$ & $F$   & Memory Bandwidth & Peak DP   & Measured performance \\
               &         &     & (GHz) &      (GB/s)      & (GFlop/s) & (GFlop/s) \\
 \hline
-EPYC 7513      &      64 &  4  &  2.6  &    409.6         &     2~662 & 1~576 \\
+\textsc{Epyc} 7513      &      64 &  4  &  2.6  &    409.6         &     2~662 & 1~576 \\
 Xeon Gold 6130 &      32 &  8  &  2.1  &    256.0         &     2~150 &   667 \\  % 239.891
 ARM Q80        &      80 &  2  &  2.8  &    204.8         &     1~792 &   547 \\  % 292.492
 \end{tabular}
@ -442,7 +453,7 @@ Peak performance is determined by calculating the maximum achievable Flops/s on
 \begin{equation}
 P = N_{\text{cores}} \times N_{\text{FMA}} \times 2 \times V \times F
 \end{equation}
-where $F$ represents the frequency, $V$ the number of double precision elements in a vector register, $N_{\text{FMA}}$ denotes the number of vector FMA units per core (all considered CPUs possess two), and $N_{\text{cores}}$ reflects the number of cores. Notably, the Xeon and ARM CPUs both operate at approximately 30\% of peak performance, while the AMD EPYC CPU demonstrates twice the efficiency, achieving 60\% of the peak.
+where $F$ represents the frequency, $V$ the number of double precision elements in a vector register, $N_{\text{FMA}}$ denotes the number of vector FMA units per core (all considered CPUs possess two), and $N_{\text{cores}}$ reflects the number of cores. Notably, the Xeon and ARM CPUs both operate at approximately 30\% of peak performance, while the AMD \textsc{Epyc} CPU demonstrates twice the efficiency, achieving 60\% of the peak.


 The relatively modest performance, at around 30\% efficiency, is attributed to the small dimensions of the matrices involved.
@ -453,7 +464,22 @@ I = \frac{2\, {N_\text{o}}^3\, N_\text{v}}{8\, \qty({N_\text{o}}^3 + {N_\text{o}
 \end{equation}
 which can be approximated by $N_\text{o} / 4$ flops/byte as an upper bound, which is usually relatively low.
 For instance, in the case of benzene with a triple-zeta basis set, the arithmetic intensity is calculated to be 3.33 flops/byte, falling short of the threshold required to attain peak performance on any of the CPUs.
-By leveraging memory bandwidth and double precision throughput peak, we determined the critical arithmetic intensity necessary to achieve peak performance. On the Xeon and ARM CPUs, this critical value stands at approximately 8.4 and 8.8 flops/byte, respectively. Meanwhile, the EPYC CPU exhibits a value of 6.5 flops/byte, thanks to its superior memory bandwidth.
+By leveraging memory bandwidth and double precision throughput peak, we determined the critical arithmetic intensity necessary to achieve peak performance. On the Xeon and ARM CPUs, this critical value stands at approximately 8.4 and 8.8 flops/byte, respectively. Meanwhile, the \textsc{Epyc} CPU exhibits a value of 6.5 flops/byte, thanks to its superior memory bandwidth.
+
+\subsection{Parallel efficiency}
+
+\begin{figure}
+\includegraphics[width=\columnwidth]{scaling.pdf}
+\caption{\label{fig:speedup} Parallel speedup obtained with the ARM Q80 and AMD \textsc{Epyc} servers.}
+\end{figure}
+Figure~\ref{fig:speedup} shows the parallel speedups obtained with the ARM and AMD servers for the benzene molecule in the triple-zeta basis set.
+Three distinct regimes appear.
+The first one, up to 24 cores, is close to the ideal regime.
+The second one, between 24 and 64 cores, is decent and enables an acceleration of $40 \times$ with 64 cores. Then, beyond 64 cores, the parallel efficiency drops quickly.
+
+These behaviors can be explained by the arithmetic intensity and the bandwidth of these machines.
+On the ARM server, we have seen that the critical arithmetic intensity to leverage peak performance was 8.8 flops/byte. However, if the number of cores decreases, the bandwidth per core increases and so does the efficiency.
+