Added perf measurements

2024-03-28 18:16:14 +01:00 · 2024-03-28 18:16:14 +01:00 · 98a897d9c8
commit 98a897d9c8
parent 8a24b96dcd
2 changed files with 77 additions and 28 deletions
--- a/Manuscript/stochastic_triples.bib
+++ b/Manuscript/stochastic_triples.bib
@ -163,15 +163,34 @@ i@article{watson_2016,
 }

@article{vilarrubias_2020,
-author = {Pere Vilarrubias},
-title = {Electronic spectroscopy of some small anions containing S, N and O using CR-EOM-CCSD(T) method},
-journal = {Molecular Physics},
-volume = {118},
-number = {24},
-pages = {e1797915},
-year = {2020},
-publisher = {Taylor and Francis},
-doi = {10.1080/00268976.2020.1797915},
-URL = {https://doi.org/10.1080/00268976.2020.1797915},
-eprint = {https://doi.org/10.1080/00268976.2020.1797915}
+  author = {Pere Vilarrubias},
+  title = {Electronic spectroscopy of some small anions containing S, N and O using CR-EOM-CCSD(T) method},
+  journal = {Molecular Physics},
+  volume = {118},
+  number = {24},
+  pages = {e1797915},
+  year = {2020},
+  publisher = {Taylor and Francis},
+  doi = {10.1080/00268976.2020.1797915},
+  URL = {https://doi.org/10.1080/00268976.2020.1797915},
+  eprint = {https://doi.org/10.1080/00268976.2020.1797915}
+}
+
+@inproceedings{treibig_2010,
+	author = {Treibig, Jan and Hager, Georg and Wellein, Gerhard},
+	title = {{LIKWID}: A Lightweight Performance-Oriented Tool Suite for x86 Multicore Environments},
+	booktitle = {{2010 39th International Conference on Parallel Processing Workshops}},
+	year = {2010},
+	pages = {13--16},
+	publisher = {IEEE},
+	doi = {10.1109/ICPPW.2010.38}
+}
+
+@misc{nist_2022,
+	title = {{Diatomic Spectral Database {$\vert$} NIST}},
+	journal = {NIST},
+	year = {2022},
+	month = jan,
+	note = {[Online; accessed 28. Mar. 2024]},
+	url = {https://www.nist.gov/pml/diatomic-spectral-database}
 }
--- a/Manuscript/stochastic_triples.tex
+++ b/Manuscript/stochastic_triples.tex
@ -131,7 +131,7 @@ However, the computational cost associated with their calculation can be prohibi
 The CCSD(T) method, which includes the perturbative treatment of triples, is known to have a computational scaling of $\order{N^7}$, where $N$ represents the system size.
 This scaling can rapidly become impractical, posing significant challenges in terms of computational resources and time requirements.

-To address this computational bottleneck, our goal is to develop a novel semi-stochastic algorithm that brings back the computational time to a level comparable to that of the CCSD method, which has a scaling of $\order{N^6}$, while ensuring well-controlled approximations.
+To address this computational bottleneck, our goal is to develop a novel semi-stochastic algorithm that brings the computational time down to a level lower than or comparable to that of the CCSD method, which has a scaling of $\order{N^6}$, while ensuring well-controlled approximations.
 Our algorithm strikes a balance between computational efficiency and
 accuracy, making calculations for larger basis sets more feasible without compromising precision.
 By incorporating stochastic sampling techniques, our approach provides an alternative avenue for approximating perturbative triples, relieving the computational burden inherent in traditional deterministic methods. This not only reduces the computational time to a more favorable level but also preserves the parallelism capabilities of CC calculations, ensuring efficient utilization of computational resources.
@ -149,22 +149,24 @@ In the following sections of this paper, we will provide a brief introduction to

 The perturbative triples correction,
 \begin{equation}
-E_{(T)}  =  \sum_{ijkabc} E_{ijk}^{abc} 
+E_{(T)}  =  \sum_{ijk\,abc} E_{ijk}^{abc},
 \end{equation}
-is a sum of $N=N_o^3 \times N_v^3$ terms,
+is a sum of $N_{\text{o}}^3 \times N_{\text{v}}^3$ terms, where $N_{\text{o}}$ and $N_{\text{v}}$ denote the number of occupied and virtual molecular orbitals, respectively.
+Each individual term is expressed as
 \begin{equation}
 E_{ijk}^{abc}  =  \frac{(4 W_{ijk}^{abc} +
              W_{ijk}^{bca} + W_{ijk}^{cab})
              (V_{ijk}^{abc} - V_{ijk}^{cba})}{\epsilon_i + \epsilon_j + \epsilon_k -
-\epsilon_a - \epsilon_b - \epsilon_c} 
+\epsilon_a - \epsilon_b - \epsilon_c},
 \end{equation}
-which depend on the canonical orbital energies $\epsilon$, and the tensors $W$ and $V$.
-The indices $i,j,k$ and $a,b,c$ denote respectively occupied and virtual orbitals.
+and depends on the canonical orbital energies $\epsilon$, and on the tensors $W$ and $V$.
+The indices $i,j,k$ and $a,b,c$ refer to occupied and virtual orbitals, respectively.

-The bottleneck is the computation of $W$, which requires $\order{N_o^3 \times
-N_v^4}$ operations. However, most of the operations involved in the computation of $W$
-can be recast into matrix multiplications, which are among the most efficient
-operations than can be executed on modern CPUs and
+The bottleneck of the perturbative triples correction is the computation of the $W$ tensor,
+which requires $\order{N_{\text{o}}^3 \times N_{\text{v}}^4}$ operations. Fortunately, most of
+the operations involved in the computation of $W$ can be recast into matrix
+multiplications, which are among the most efficient operations that can be
+executed on modern CPUs and
 accelerators.\cite{ma_2011,haidar_2015,dinapoli_2014,springer_2018}

 %=================================================================%
@ -201,15 +203,13 @@ accelerators.\cite{ma_2011,haidar_2015,dinapoli_2014,springer_2018}
 % - Limitations: memory because in-core algorithm.

 %=================================================================%
-\section{Examples of applications}
+\section{Numerical experiments}

-%a. Presentation of benchmark systems and datasets used for evaluation
-% - Benzene TZ
+% + Benzene TZ/QZ
 % - Streptocyanine QZ: Small molecule in a large basis set
 % - Caffeine def2-svp: Large molecule in a small basis set
-% - Vibrational frequency of CuCl/cc-pvqz
-%b. Discussion of the obtained results, comparing against other methods
-% - Measure flops and compare to the peak
+% + Vibrational frequency of CuCl/cc-pvqz
+% + Measure flops and compare to the peak
 %c. Analysis of the algorithm's accuracy, efficiency, and scalability
 %d. Discussion of any observed limitations or challenges

@ -218,6 +218,7 @@ accelerators.\cite{ma_2011,haidar_2015,dinapoli_2014,springer_2018}
 In this section we illustrate the convergence of the statistical error of the perturbative triples correction as a function of the computational cost.
 The benzene molecule serves as our reference system for conducting frozen-core CCSD(T) calculations with the cc-pVTZ and cc-pVQZ basis sets.
 Essentially, this involves the correlation of 30 electrons using either 258 or 503 molecular orbitals.
+The calculations were performed on an Intel Xeon Gold 6130 dual socket server (32 cores in total).

 \begin{figure}
 \includegraphics[width=\columnwidth]{benzene_tz.pdf}
@ -242,6 +243,35 @@ The more rapid convergence observed with the larger basis set aligns with expect
 This trend underscores the algorithm's enhanced suitability for systems with fewer electrons and extensive basis sets, as opposed to larger electron counts in smaller basis sets.


+\subsection{Performance analysis}
+
+The bottleneck of the proposed algorithm is the creation of the sub-tensor $W^{abc}$ for each given $(a,b,c)$ triplet.
+We have mentioned in section~\ref{sec:theory} that this operation could be recast into matrix multiplications, leading to a high efficiency of our implementation.
+
+We have measured the efficiency of our implementation using the Likwid\cite{treibig_2010} performance analysis tool on an AMD EPYC 7513 dual-socket server (64 cores at \SI{2.6}{\giga \hertz}) and on an
+Intel Xeon Gold 6130 dual-socket server (32 cores at \SI{2.1}{\giga \hertz}).
+The code was linked with the Intel MKL library for BLAS operations.
+Measurements of the number of floating-point operations per second (Flop/s) were activated in the section of the code dedicated to the computation of the perturbative triples correction.
+We have also run the code on an ARM Q80 server (80 cores at \SI{2.8}{\giga \hertz}), and as the performance counters were not available to Likwid, we have compared the total execution time of the computation of the perturbative triples correction with the time measured on the AMD CPU to estimate the Flop/s rate.
+The code was linked with the ArmPL library for BLAS operations.
+
+\begin{table}
+\begin{ruledtabular}
+\begin{tabular}{lcccc}
+CPU & \# cores & Vector length & Performance  & \% Peak \\
+        &          & (bits)        & (GFlop/s)    & \\
+\hline
+EPYC 7513      & 64 & 256    &1~576 & 59.2 \% \\  % 101.53
+Xeon Gold 6130 & 32 & 512    &  667 & 31.0 \% \\  % 239.891
+ARM Q80        & 80 & 128    &  547 & 30.5 \% \\  % 292.492 
+\end{tabular}
+\end{ruledtabular}
+\caption{\label{tab:flops} Performance of the code measured on different architectures.}
+\end{table}
+
+Table~\ref{tab:flops} shows the results of these tests on the three architectures.
+
+
 \subsection{Vibrational frequency of copper chloride}

 Our methodology proves especially advantageous for scenarios requiring the aggregation of numerous CCSD(T) energies, such as neural network training or the exploration of potential energy surfaces.
@ -264,7 +294,7 @@ with $\mu$ denoting the reduced mass of the \ce{CuCl} molecule, and $c$ the spee

 The initial step involved the precise calculation of the CCSD(T) energy across various points along the potential curve.
 We froze the six lowest molecular orbitals, specifically the $1s$ orbital of \ce{Cl} and the $1s$, $2s$, and $2p$ orbitals of \ce{Cu}, and correlated 34 electrons within 157 molecular orbitals.
-The fitted Morse potential revealed a vibrational frequency of $\nu = \SI{414.7}{\per\centi\meter}$ and an equilibrium bond length of $r_e = \SI{3.92}{\bohr}$, aligning remarkably well with experimental values $\nu = \SI{414}{\per\centi\meter}$ and $r_e = \SI{3.88}{\bohr}$.
+The fitted Morse potential revealed a vibrational frequency of $\nu = \SI{414.7}{\per\centi\meter}$ and an equilibrium bond length of $r_e = \SI{3.92}{\bohr}$, aligning remarkably well with the experimental values from the NIST database,\cite{nist_2022} $\nu = \SI{417.6}{\per\centi\meter}$ and $r_e = \SI{3.88}{\bohr}$.

 Subsequently, we applied our semi-stochastic algorithm to estimate the perturbative triples correction, utilizing merely 1\% of the total contributions.
 This approach yielded a hundredfold acceleration in computational efficiency, achieving statistical uncertainty within the range of \SI{1.3}{\milli\hartree} to \SI{2.5}{\milli\hartree}.