Speedup curve

2024-04-03 18:14:14 +02:00 · 2024-04-03 18:14:14 +02:00 · 3986bf4062
commit 3986bf4062
parent 1505062cbb
4 changed files with 322 additions and 28 deletions
--- a/Data/scaling.dat
+++ b/Data/scaling.dat
@ -0,0 +1,35 @@
+# 1
+# 2
+# 4
+# 8
+#16
+#32
+#64  
+
+# TZ : AMD EPYC 7402 24-Core Processor
+#1
+#2
+#4
+#6
+#12
+#24
+#32 123.718727827072
+#48 121.613038063049
+
+
+# DZ: ARM Q80
+ 1  740.99828964984044  0.109375
+ 2  368.10031103389338  0.21875
+ 4  183.47006468195468  0.4375
+ 8  92.296218489762396  0.875
+16  47.492009534966201  1.75
+24  32.694960118737072  2.625
+32  25.408835886977613  3.5
+40  21.404467605985701  4.375
+48  19.549200831912458  5.25
+56  18.556575596332550  6.125   # Max
+64  18.981703117024153  7.0
+68  21.941064534708858  7.4375
+72  27.031634503975511  7.87
+76  32.015603828709573  8.3125
+80  39.939096361864358  8.75
--- a/Data/scaling.plt
+++ b/Data/scaling.plt
@ -0,0 +1,180 @@
+#!/usr/bin/gnuplot -persist
+#
+#    
+#    	G N U P L O T
+#    	Version 5.4 patchlevel 2    last modified 2021-06-01 
+#    
+#    	Copyright (C) 1986-1993, 1998, 2004, 2007-2021
+#    	Thomas Williams, Colin Kelley and many others
+#    
+#    	gnuplot home:     http://www.gnuplot.info
+#    	faq, bugs, etc:   type "help FAQ"
+#    	immediate help:   type "help"  (plot window: hit 'h')
+# set terminal qt 0 font "Sans,9"
+# set output
+unset clip points
+set clip one
+unset clip two
+unset clip radial
+set errorbars front 1.000000 
+set border 31 front lt black linewidth 1.000 dashtype solid
+set zdata 
+set ydata 
+set xdata 
+set y2data 
+set x2data 
+set boxwidth
+set boxdepth 0
+set style fill  empty border
+set style rectangle back fc  bgnd fillstyle   solid 1.00 border lt -1
+set style circle radius graph 0.02 
+set style ellipse size graph 0.05, 0.03 angle 0 units xy
+set dummy x, y
+set format x "% h" 
+set format y "% h" 
+set format x2 "% h" 
+set format y2 "% h" 
+set format z "% h" 
+set format cb "% h" 
+set format r "% h" 
+set ttics format "% h"
+set timefmt "%d/%m/%y,%H:%M"
+set angles radians
+set tics back
+set grid nopolar
+set grid xtics nomxtics ytics nomytics noztics nomztics nortics nomrtics \
+ nox2tics nomx2tics noy2tics nomy2tics nocbtics nomcbtics
+set grid layerdefault   lt 0 linecolor 0 linewidth 0.500,  lt 0 linecolor 0 linewidth 0.500
+unset raxis
+set theta counterclockwise right
+set style parallel front  lt black linewidth 2.000 dashtype solid
+set key notitle
+set key fixed left top vertical Right noreverse enhanced autotitle nobox
+set key noinvert samplen 4 spacing 1 width 0 height 0 
+set key maxcolumns 0 maxrows 0
+set key noopaque
+unset label
+unset arrow
+unset style line
+unset style arrow
+set style histogram clustered gap 2 title textcolor lt -1
+unset object
+unset walls
+set style textbox  transparent margins  1.0,  1.0 border  lt -1 linewidth  1.0
+set offsets 0, 0, 0, 0
+set pointsize 1
+set pointintervalbox 1
+set encoding default
+unset polar
+unset parametric
+unset spiderplot
+unset decimalsign
+unset micro
+unset minussign
+set view 60, 30, 1, 1
+set view azimuth 0
+set rgbmax 255
+set samples 100, 100
+set isosamples 10, 10
+set surface 
+unset contour
+set cntrlabel  format '%8.3g' font '' start 5 interval 20
+set mapping cartesian
+set datafile separator whitespace
+set datafile nocolumnheaders
+unset hidden3d
+set cntrparam order 4
+set cntrparam linear
+set cntrparam levels 5
+set cntrparam levels auto
+set cntrparam firstlinetype 0 unsorted
+set cntrparam points 5
+set size ratio 0 1,1
+set origin 0,0
+set style data points
+set style function lines
+unset xzeroaxis
+unset yzeroaxis
+unset zzeroaxis
+unset x2zeroaxis
+unset y2zeroaxis
+set xyplane relative 0.5
+set tics scale  1, 0.5, 1, 1, 1
+set mxtics default
+set mytics default
+set mztics default
+set mx2tics default
+set my2tics default
+set mcbtics default
+set mrtics default
+set nomttics
+set xtics border in scale 1,0.5 mirror norotate  autojustify
+set xtics  norangelimit autofreq 
+set ytics border in scale 1,0.5 mirror norotate  autojustify
+set ytics  norangelimit autofreq 
+set ztics border in scale 1,0.5 nomirror norotate  autojustify
+set ztics  norangelimit autofreq 
+unset x2tics
+unset y2tics
+set cbtics border in scale 1,0.5 mirror norotate  autojustify
+set cbtics  norangelimit autofreq 
+set rtics axis in scale 1,0.5 nomirror norotate  autojustify
+set rtics  norangelimit autofreq 
+unset ttics
+set title "" 
+set title  font "" textcolor lt -1 norotate
+set timestamp bottom 
+set timestamp "" 
+set timestamp  font "" textcolor lt -1 norotate
+set trange [ * : * ] noreverse nowriteback
+set urange [ * : * ] noreverse nowriteback
+set vrange [ * : * ] noreverse nowriteback
+set xlabel "Number of cores" 
+set xlabel  font "" textcolor lt -1 norotate
+set x2label "" 
+set x2label  font "" textcolor lt -1 norotate
+set xrange [ * : * ] noreverse writeback
+set x2range [ * : * ] noreverse writeback
+set ylabel "Speedup" 
+set ylabel  font "" textcolor lt -1 rotate
+set y2label "" 
+set y2label  font "" textcolor lt -1 rotate
+set yrange [ * : * ] noreverse writeback
+set y2range [ * : * ] noreverse writeback
+set zlabel "" 
+set zlabel  font "" textcolor lt -1 norotate
+set zrange [ * : * ] noreverse writeback
+set cblabel "" 
+set cblabel  font "" textcolor lt -1 rotate
+set cbrange [ * : * ] noreverse writeback
+set rlabel "" 
+set rlabel  font "" textcolor lt -1 norotate
+set rrange [ * : * ] noreverse writeback
+unset logscale
+unset jitter
+set zero 1e-08
+set lmargin  -1
+set bmargin  -1
+set rmargin  -1
+set tmargin  -1
+set locale "en_AU.UTF-8"
+set pm3d explicit at s
+set pm3d scansautomatic
+set pm3d interpolate 1,1 flush begin noftriangles noborder corners2color mean
+set pm3d clip z 
+set pm3d nolighting
+set palette positive nops_allcF maxcolors 0 gamma 1.5 color model RGB 
+set palette rgbformulae 7, 5, 15
+set colorbox default
+set colorbox vertical origin screen 0.9, 0.2 size screen 0.05, 0.6 front  noinvert bdefault
+set style boxplot candles range  1.50 outliers pt 7 separation 1 labels auto unsorted
+set loadpath 
+set fontpath
+set psdir
+set fit brief errorvariables nocovariancevariables errorscaling prescale nowrap v5
+GNUTERM = "qt"
+I = {0.0, 1.0}
+VoxelDistance = 0.0
+## Last datafile plotted: "scaling.dat"
+plot 'scaling.dat' u 1:(740.99828964984044/$2) w lp notitle, x title "Ideal"
+#    EOF
--- a/Manuscript/stochastic_triples.tex
+++ b/Manuscript/stochastic_triples.tex
@ -2,10 +2,11 @@
 %\usepackage{graphicx,dcolumn,bm,xcolor,microtype,multirow,amscd,amsmath,amssymb,amsfonts,physics,longtable,wrapfig,bbold,siunitx,xspace}
 \usepackage{graphicx,xcolor,physics,siunitx,xspace}
 \usepackage[version=4]{mhchem}
-
 \usepackage[utf8]{inputenc}
 \usepackage[T1]{fontenc}

+\usepackage{algorithm2e}
+
 \usepackage{hyperref}
 \hypersetup{
    colorlinks,
@ -71,23 +72,23 @@
 \newcommand{\LCPQ}{Laboratoire de Chimie et Physique Quantiques (UMR 5626), Universit\'e de Toulouse, CNRS, UPS, France}
 \newcommand{\Vienna}{Vienna}

-\begin{document}	
+\begin{document}

 \title{Stochastically accelerated perturbative triples correction in coupled cluster calculations}

 % Alphabetic order
 \author{Yann \surname{Damour}}
-	\affiliation{\LCPQ}
+    \affiliation{\LCPQ}
 \author{Alejandro \surname{Gallo}}
-	\affiliation{\Vienna}
+    \affiliation{\Vienna}
 \author{Andreas \surname{Irmler}}
-	\affiliation{\Vienna}
+    \affiliation{\Vienna}
 \author{Andreas \surname{Gr\"uneis}}
-	\affiliation{\Vienna}
+    \affiliation{\Vienna}
 \author{Anthony \surname{Scemama}}
-	\email{scemama@irsamc.ups-tlse.fr}
-	\affiliation{\LCPQ}
-	
+    \email{scemama@irsamc.ups-tlse.fr}
+    \affiliation{\LCPQ}
+
 \begin{abstract}
 We introduce a novel algorithm that leverages stochastic sampling
 techniques to approximate perturbative triples correction in the
@ -109,7 +110,7 @@ computations, enabling investigations of complex molecular systems
 that were previously computationally prohibitive.
 \bigskip
 %\begin{center}
-%	\boxed{\includegraphics[width=0.5\linewidth]{TOC}}
+%    \boxed{\includegraphics[width=0.5\linewidth]{TOC}}
 %\end{center}
 %\bigskip
 \end{abstract}
@ -161,7 +162,7 @@ Each individual term is expressed as
 \end{equation}
 and depends on the canonical orbital energies $\epsilon$, and on the tensors $W$ and $V$:
 \begin{align}
-W_{ijk}^{abc} & = P_{ijk}^{abc} \qty( \sum_d^{\text{virt}} \qty(bd|ai) t_{kj}^{cd} - 
+W_{ijk}^{abc} & = P_{ijk}^{abc} \qty( \sum_d^{\text{virt}} \qty(bd|ai) t_{kj}^{cd} -
 \sum_l^{\text{occ}} \qty(ck|jl) t_{ab}^{il}) \\
 V_{ijk}^{abc} & = W_{ijk}^{abc} + \qty(bj|ck) t_i^a + \qty(ai|ck) t_j^b + \qty(ai|bj) t_k^c
 \end{align}
@ -204,7 +205,7 @@ In the algorithm proposed by Rendell\cite{rendell_1991}, for each given triplet
 \section{Implementation Details}
 \label{sec:implementation}

-The algorithm was implemented in the \textsc{Quantum Package} software.
+The algorithm presented in Algorithm~\cite{alg:stoch} was implemented in the \textsc{Quantum Package} software.
 \cite{garniron_2019}
 The stochastic algorithm is implemented using OpenMP tasks, where each task
 consists in the computation of a single component $E^{abc}$.
@ -214,15 +215,79 @@ for printing or for exiting when the statistical error gets below a given thresh
 The number of samples $N^{abc}$ of each triplet $(abc)$ is initialized to $-1$, to identify
 the contributions that have not been already computed.
 An outer \emph{for} loop runs over the maximum number of iteration, equal to
-$N_{abc}$, the number of different triplets $(abc)$.
+the number of different triplets $N_{\text{triplets}}$.

 Within a loop iteration, the index of the first non-computed triplet $(abc)$ is identified, and the task associated with its computation is sent to the task queue.
 As this triplet has never been drawn, $N^{abc}$ is set to zero.
 Then, a triplet $(abc)$ is drawn randomly.
-If the $E^{abc}$ has not been computed (identified by $N^{abc}=-1$), the number of samples is set to zero and the task for the computation of this contribution is enqueued. 
+If the $E^{abc}$ has not been computed (identified by $N^{abc}=-1$), the number of samples is set to zero and the task for the computation of this contribution is enqueued.
 In any case, $N^{abc}$ is then incremented.


+\begin{algorithm}[H]
+ \caption{\label{alg:stoch} Pseudo-code for the computation of the perturbative triples correction implemented in Quantum Package. $i_\text{min}$ denotes the first non-computed triplet, $w_{\text{accu}}$ contains the cumulative probability density, $\text{Search}(A, x)$ searches for $x$ in array $A$, $\text{First}(i)$ and $\text{Last}(i)$ return the first last indices belonging to bucket $i$.}
+$i_{\text{min}} \leftarrow 1$ ;
+$N^{abc}[1,\dots,N_{\text{triplets}}] \leftarrow [-1, -1, \dots]$ \;
+$t_0 \leftarrow \text{WallClockTime}()$ \;
+\For {$i_{\text{iter}}=1,\ldots,N_{\text{triplets}}$}
+{
+ \tcc{Deterministic computation}
+ \While {$N^{abc}[i_{\text{min}}] > -1$ and $i_{\text{min}} \le N_{\text{triplets}}$}
+ {
+   $i_{\text{min}} \leftarrow i_{\text{min}}+1$ \;
+ }
+
+ \If{$i_{\text{min}} \le N_{\text{triplets}}$}
+ {
+   Send OpenMP task \{ {
+     $E[i_{\text{min}}] \gets \text{Compute}(i_{\text{min}})$\;
+     \} }\;
+ }
+ \tcc{Stochastic computation}
+
+ $\eta \gets \text{RandomNumber}()$ \;
+ \For {$i_\text{bucket} = 1,\ldots, N_{\text{buckets}}$}
+ {
+   \If{$i_{\text{min}} \le \text{Last}(i_\text{bucket})$}
+   {
+     $i_\eta \gets \text{Search}(w_{\text{accu}}, \frac{\eta + i_\text{bucket}-1}{N_{\text{buckets}}})+1$ \;
+     \If{$N^{abc}[i_{\eta}] = -1$}
+     {
+       $N^{abc}[i_{\eta}] \gets 0$ \;
+       Send OpenMP task \{ {
+         $E[i_{\eta}] \gets \text{Compute}(i_{\eta})$\;
+         \} }\;
+     }
+     $N^{abc}[i_{\eta}] \gets N^{abc}[i_\eta]+1$ \;
+   }
+ }
+ \tcc{Compute the mean and error every second}
+ $t_1 \gets \text{WallClockTime}()$ \;
+ \If {$t_1 - t_0 > 1$ or $i_{\text{min}} \ge N_{\text{triplets}}$}
+ {
+   $i_\text{bucket} = 0$ \;
+   \While {$i_{\text{bucket}} < N_\text{buckets}$ and
+     $i_{\text{min}} > \text{Last}(i_\text{bucket}+1)$}
+   {
+     $i_\text{bucket} \gets i_\text{bucket} + 1$\;
+   }
+   $\mathcal{N} \gets \frac{ \sum_{i=\text{First}(i_\text{bucket}+1)}^{N_\text{triplets}} \max(N^{abc}[i],0)}
+   { 1 - \sum_{i=1}^{\text{Last}(i_\text{bucket})} P[i] }$ \;
+   $E_{d} \gets \sum_{i=1}^{\text{Last}(i_\text{bucket})} E^{abc}[i]$ \;
+   $E_{s} \gets 1/\mathcal{N}\, \sum_{i=\text{First}(i_\text{bucket}+1)}^{N_\text{triplets}}
+   \max(N^{abc}[i],0)\, E^{abc}[i]/P[i]$ \;
+   $E_{s^2} \gets 1/\mathcal{N}\, \sum_{i=\text{First}(i_\text{bucket}+1)}^{N_\text{triplets}}
+   \max(N^{abc}[i],0)\, \qty(E^{abc}[i]/P[i])^2$ \;
+   $E \gets E_{d} + E_{s}$ \;
+   $\Delta E \gets \sqrt{ \qty(E_{s^2} - {E_{s}}^2) / \qty(\mathcal{N}-1) }$ \;
+   \If{$\Delta E < \epsilon$}
+   {
+     Exit outermost loop \;
+   }
+ }
+}
+\end{algorithm}
+

 %a. Description of the computational framework and software used
 %b. Discussion of any specific optimizations or parallelization techniques employed
@ -252,19 +317,19 @@ The calculations were performed on an Intel Xeon Gold 6130 dual socket server (3
 \begin{figure}
 \includegraphics[width=\columnwidth]{benzene_tz.pdf}
 \includegraphics[width=\columnwidth]{benzene_qz.pdf}
-	\caption{\label{fig:benzene} Energy convergence of benzene plotted against the program execution time, showing comparisons between the cc-pVTZ (upper curve) and cc-pVQZ (lower curve) basis sets. The blue lines indicate the exact CCSD(T) energies.}
+    \caption{\label{fig:benzene} Energy convergence of benzene plotted against the program execution time, showing comparisons between the cc-pVTZ (upper curve) and cc-pVQZ (lower curve) basis sets. The blue lines indicate the exact CCSD(T) energies.}
 \end{figure}

 \begin{figure}
 \includegraphics[width=\columnwidth]{benzene_err.pdf}
-	\caption{\label{fig:benzene_err} Convergence of the statistical error of the perturbative triples contribution in benzene as a function of the percentage of computed contributions, for both cc-pVTZ and cc-pVQZ basis sets.}
+    \caption{\label{fig:benzene_err} Convergence of the statistical error of the perturbative triples contribution in benzene as a function of the percentage of computed contributions, for both cc-pVTZ and cc-pVQZ basis sets.}
 \end{figure}

 Figure~\ref{fig:benzene} shows the convergence of the CCSD(T) energy as a function of the program execution time using the two basis sets.
 Notably, the exact CCSD(T) energy always falls within $2\sigma$, affirming the reliability of the statistical error.
 Figure~\ref{fig:benzene_err} displays the statistical error as a function of the percentage of computed contributions.
 Noteworthy in both figures are the curve discontinuities, attributable to readjustments in the separation between the deterministic and stochastic components of the calculation.
-These updates lead to revised estimates and a diminution in statistical error. 
+These updates lead to revised estimates and a diminution in statistical error.

 Achieving chemical accuracy, defined as \SI{1.6}{\milli\hartree}, necessitates less than 1\% of the total contributions in both basis sets.
 Attaining a \SI{0.1}{\milli\hartree} precision level requires computation of 32\% and 15\% of the contributions for cc-pVTZ and cc-pVQZ, respectively.
@ -303,6 +368,9 @@ The vibrational frequency and equilibrium distance estimated using this data, $\
 Figure \ref{fig:cucl} illustrates the potential energy surface of \ce{CuCl}, displaying both the exact CCSD(T) energies and those estimated via the semi-stochastic method.


+\subsection{Parallel efficiency}
+
+
 \subsection{Performance analysis}

 The primary bottleneck of our proposed algorithm lies in the generation of the sub-tensor $W^{abc}$ for each $(a,b,c)$ triplet, as discussed in Section~\ref{sec:theory}.
@ -318,10 +386,10 @@ For this, we utilized the ArmPL library for BLAS operations.
 \begin{tabular}{lcccccc}
 CPU & $N_{\text{cores}}$ & $V$ & $F$   & Memory Bandwidth & Peak DP   & Measured performance \\
               &         &     & (GHz) &      (GB/s)      & (GFlop/s) & (GFlop/s) \\
-\hline                     
+\hline
 EPYC 7513      &      64 &  4  &  2.6  &    409.6         &     2~662 & 1~576 \\
 Xeon Gold 6130 &      32 &  8  &  2.1  &    256.0         &     2~150 &   667 \\  % 239.891
-ARM Q80        &      80 &  2  &  2.8  &    204.8         &     1~792 &   547 \\  % 292.492 
+ARM Q80        &      80 &  2  &  2.8  &    204.8         &     1~792 &   547 \\  % 292.492
 \end{tabular}
 \end{ruledtabular}
 \caption{\label{tab:flops} Average performance of the code measured as the number of double precision (DP) floating-point operations per second (Flop/s) on different machines.}
@ -336,8 +404,12 @@ where $F$ represents the frequency, $V$ the number of double precision elements


 The relatively modest performance, at around 30\% efficiency, is attributed to the small dimensions of the matrices involved.
-The largest matrix multiplications in the computational task entail a matrix of size $N_\text{o}^2 \times N_\text{v}$ and another of size $N_\text{v} \times N_\text{o}$ to yield an $N_\text{o}^2 \times N_\text{o}$ matrix.
-These multiplications exhibit an arithmetic intensity below $N_\text{o} / 4$ flops/byte, which is usually relatively low.
+The largest matrix multiplications in the computational task entail a matrix of size ${N_\text{o}}^2 \times N_\text{v}$ and another of size $N_\text{v} \times N_\text{o}$ to yield an ${N_\text{o}}^2 \times N_\text{o}$ matrix.
+These multiplications exhibit an arithmetic intensity of
+\begin{equation}
+I = \frac{2\, {N_\text{o}}^3\, N_\text{v}}{8\, \qty({N_\text{o}}^3 + {N_\text{o}}^2 N_\text{v} + {N_\text{o}} N_\text{v})}
+\end{equation}
+which can be approximated by $N_\text{o} / 4$ flops/byte as an upper bound, which is usually relatively low.
 For instance, in the case of benzene with a triple-zeta basis set, the arithmetic intensity is calculated to be 3.52 flops/byte, falling short of the threshold required to attain peak performance on any of the CPUs.
 By leveraging memory bandwidth and double precision throughput peak, we determined the critical arithmetic intensity necessary to achieve peak performance. On the Xeon and ARM CPUs, this critical value stands at approximately 8.4 and 8.8 flops/byte, respectively. Meanwhile, the EPYC CPU exhibits a value of 6.5 flops/byte, thanks to its superior memory bandwidth.

@ -362,7 +434,7 @@ By leveraging memory bandwidth and double precision throughput peak, we determin

 %%%%%%%%%%%%%%%%%%%%%%%
 \acknowledgements{
-This work was supported by the European Centre of Excellence in Exascale Computing TREX --- Targeting Real Chemical Accuracy at the Exascale. 
+This work was supported by the European Centre of Excellence in Exascale Computing TREX --- Targeting Real Chemical Accuracy at the Exascale.
 This project has received funding from the European Union's Horizon 2020 — Research and Innovation program --- under grant agreement No.~952165.
 This work was performed using HPC resourced from CALMIP (Toulouse) under allocations p18005 and p22001.}
 %%%%%%%%%%%%%%%%%%%%%%%
--- a/triples.org
+++ b/triples.org
@ -7706,16 +7706,16 @@ plot E(x), data using ($1*a0):2:3 w err


   #+begin_example
-a               = 0.84615          +/- 0.03216      (3.8%)
-re              = 3.92539          +/- 0.01058      (0.2696%)
-De              = 0.101589         +/- 0.00932      (9.174%)
-E0              = -2099.77         +/- 0.0008014    (3.817e-05%)
+a               = 0.849036         +/- 0.02921      (3.44%)
+re              = 3.92202          +/- 0.009502     (0.2423%)
+De              = 0.101614         +/- 0.008801     (8.661%)
+E0              = -2099.77         +/- 0.0007748    (3.69e-05%)
   #+end_example

-   #+CALL:freq(0.84615,0.101589)
+   #+CALL:freq(0.849036,0.101614)

   #+RESULTS:
-   : 413.5302408975902
+   : 414.99173933985907

 ** CCSD(T) exact

@ -7805,6 +7805,13 @@ plot data2 using ($1*a0-0.002):2 pointtype 2 lt 3 title "Full", data using ($1*a
   #+RESULTS:
   [[file:cucl_ccsdt2.png]]

+   
+* Arithmetic intensity
+
+   No^3 x Nv / ( No^2 x Nv + No x Nv + No^3)
+   
+   No^2 / ( No + 1 + No^2/Nv)
+   
 * Export                                                           :noexport:
  #+BEGIN_SRC elisp :output none
 (setq org-latex-listings 'minted