Minor changes

This commit is contained in:
Anthony Scemama 2024-05-25 17:36:00 +02:00
parent 0ecca24d2e
commit 2e18256b80
7 changed files with 211 additions and 22 deletions

102
Data/data_buckets.txt Normal file

@@ -0,0 +1,102 @@
# Indices of the first and last triplet in each bucket
1 3115
3116 7569
7570 12858
12859 18844
18845 25457
25458 32641
32642 40359
40360 48602
48603 57368
57369 66666
66667 76503
76504 86871
86872 97755
97756 109142
109143 121012
121013 133351
133352 146139
146140 159350
159351 172962
172963 186962
186963 201323
201324 216044
216045 231111
231112 246517
246518 262251
262252 278306
278307 294677
294678 311356
311357 328334
328335 345611
345612 363187
363188 381057
381058 399222
399223 417686
417687 436440
436441 455486
455487 474825
474826 494452
494453 514365
514366 534562
534563 555043
555044 575811
575812 596863
596864 618204
618205 639836
639837 661752
661753 683957
683958 706450
706451 729231
729232 752298
752299 775647
775648 799283
799284 823209
823210 847424
847425 871929
871930 896723
896724 921818
921819 947209
947210 972890
972891 998869
998870 1025144
1025145 1051719
1051720 1078598
1078599 1105778
1105779 1133271
1133272 1161078
1161079 1189201
1189202 1217647
1217648 1246417
1246418 1275517
1275518 1304950
1304951 1334724
1334725 1364840
1364841 1395318
1395319 1426160
1426161 1457375
1457376 1488973
1488974 1520966
1520967 1553357
1553358 1586169
1586170 1619407
1619408 1653094
1653095 1687252
1687253 1721895
1721896 1757050
1757051 1792738
1792739 1828990
1828991 1865850
1865851 1903360
1903361 1941580
1941581 1980567
1980568 2020412
2020413 2061234
2061235 2103178
2103179 2146484
2146485 2191517
2191518 2238934
2238935 2290534
2290535 2349496
2349497 2420847
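
The format is one comment line followed by one pair of 1-based, inclusive triplet indices per bucket; a minimal, hypothetical Python reader (the path and names are illustrative, not part of the repository) could be:

    def read_buckets(path="Data/data_buckets.txt"):
        # Return a list of (first, last) triplet indices, one pair per bucket.
        buckets = []
        with open(path) as f:
            for line in f:
                if line.startswith("#") or not line.strip():
                    continue
                first, last = map(int, line.split())
                buckets.append((first, last))
        return buckets

    # With the data above, read_buckets()[0] == (1, 3115), and the number of
    # triplets per bucket grows with the bucket index.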


@@ -283,8 +283,6 @@ unset grid
## Last datafile plotted: "max_data.dat"
plot 'max_data.dat' every 1 index 0 u 1:(-$3 * 2420853) w impulses lw 3 title 'Uniform sampling', \
     'max_data.dat' every 1 index 0 u 1:(-$3 * $2) w impulses lw 3 title 'Importance sampling' , 0.051806920848 w l title ''
-#plot 'data.dat' every 1 index 0 u (-$3 * 2420853) w impulses lw 3 title 'Uniform sampling', \
-#     'data.dat' every 1 index 0 u (-$3 * $2) w impulses lw 3 title 'Importance sampling'
# Zoomed plot
set grid

2
Manuscript/.gitignore vendored Normal file

@@ -0,0 +1,2 @@
/build/
/svg-inkscape/

BIN
Manuscript/buckets.pdf Normal file

Binary file not shown.

70
Manuscript/orcidlink.sty Normal file

@@ -0,0 +1,70 @@
%%
%% This is file `orcidlink.sty',
%% generated with the docstrip utility.
%%
%% The original source files were:
%%
%% orcidlink.dtx (with options: `package')
%%
%% This is a generated file.
%%
%% Copyright (C) 2019-2023 by Leo C. Stein <leo.stein@gmail.com>
%% --------------------------------------------------------------------------
%% This work may be distributed and/or modified under the
%% conditions of the LaTeX Project Public License, either version 1.3
%% of this license or (at your option) any later version.
%% The latest version of this license is in
%% http://www.latex-project.org/lppl.txt
%% and version 1.3 or later is part of all distributions of LaTeX
%% version 2005/12/01 or later.
%%
\NeedsTeXFormat{LaTeX2e}[1994/06/01]
\ProvidesPackage{orcidlink}
[2023/12/30 v1.0.5 Linked ORCiD logo macro package]
%% All I did was package up Milo's code on TeX.SE,
%% see https://tex.stackexchange.com/a/445583/34063
\RequirePackage{hyperref}
\RequirePackage{tikz}
\ProcessOptions\relax
\usetikzlibrary{svg.path}
\definecolor{orcidlogocol}{HTML}{A6CE39}
\tikzset{
orcidlogo/.pic={
\fill[orcidlogocol] svg{M256,128c0,70.7-57.3,128-128,128C57.3,256,0,198.7,0,128C0,57.3,57.3,0,128,0C198.7,0,256,57.3,256,128z};
\fill[white] svg{M86.3,186.2H70.9V79.1h15.4v48.4V186.2z}
svg{M108.9,79.1h41.6c39.6,0,57,28.3,57,53.6c0,27.5-21.5,53.6-56.8,53.6h-41.8V79.1z M124.3,172.4h24.5c34.9,0,42.9-26.5,42.9-39.7c0-21.5-13.7-39.7-43.7-39.7h-23.7V172.4z}
svg{M88.7,56.8c0,5.5-4.5,10.1-10.1,10.1c-5.6,0-10.1-4.6-10.1-10.1c0-5.6,4.5-10.1,10.1-10.1C84.2,46.7,88.7,51.3,88.7,56.8z};
}
}
%% Reciprocal of the height of the svg whose source is above. The
%% original generates a 256pt high graphic; this macro holds 1/256.
\newcommand{\@OrigHeightRecip}{0.00390625}
%% We will compute the current X height to make the logo the right height
\newlength{\@curXheight}
%% Prevent externalization of the ORCiD logo.
\newcommand{\@preventExternalization}{%
\ifcsname tikz@library@external@loaded\endcsname%
\tikzset{external/export next=false}\else\fi%
}
\DeclareRobustCommand\orcidlink[1]{%
\texorpdfstring{%
\setlength{\@curXheight}{\fontcharht\font`X}%
\href{https://orcid.org/#1}{\XeTeXLinkBox{\mbox{%
\@preventExternalization%
\begin{tikzpicture}[yscale=-\@OrigHeightRecip*\@curXheight,
xscale=\@OrigHeightRecip*\@curXheight,transform shape]
\pic{orcidlogo};
\end{tikzpicture}%
}}}}{}}
\endinput
%%
%% End of file `orcidlink.sty'.


@@ -563,4 +563,18 @@ swh:1:dir:6d82ae7ac757c78d7720dd89dfa52d7a453d2f68;origin=https://github.com/Qua
url = {https://pubs.acs.org/doi/abs/10.1021/acs.jctc.7b00049},
volume = {13},
year = {2017}
+}
+@article{pople_1999,
+author = {Pople, John A.},
+title = {{Nobel Lecture: Quantum chemical models}},
+journal = {Rev. Mod. Phys.},
+volume = {71},
+number = {5},
+pages = {1267--1274},
+year = {1999},
+month = oct,
+issn = {1539-0756},
+publisher = {American Physical Society},
+doi = {10.1103/RevModPhys.71.1267}
}


@@ -165,11 +165,11 @@ For a closed-shell reference with canonical orbitals, each individual term is ex
\end{equation}
and depends on the canonical orbital energies $\epsilon$, and on the tensors $W$ and $V$:
\begin{align}
-W_{ijk}^{abc} & = P_{ijk}^{abc} \qty( \sum_d^{\text{virt}} \qty(bd|ai) t_{kj}^{cd} -
+W_{ijk}^{abc} & = \Pi_{ijk}^{abc} \qty( \sum_d^{\text{virt}} \qty(bd|ai) t_{kj}^{cd} -
\sum_l^{\text{occ}} \qty(ck|jl) t_{ab}^{il}) \\
V_{ijk}^{abc} & = W_{ijk}^{abc} + \qty(bj|ck) t_i^a + \qty(ai|ck) t_j^b + \qty(ai|bj) t_k^c
\end{align}
-where $P_{ijk}^{abc}$ is a sign-less permutation operator, $(t^a_i, t^{ab}_{ij})$ are the CCSD amplitudes,
+where $\Pi_{ijk}^{abc}$ is a sign-less permutation operator, $(t^a_i, t^{ab}_{ij})$ are the CCSD amplitudes,
\((pq|rs)\) are the two-electron coulomb integrals,
and the indices $i,j,k,l$ and $a,b,c,d$ refer to occupied and virtual orbitals, respectively.
@@ -189,7 +189,7 @@ In the algorithm proposed by Rendell\cite{rendell_1991}, for each given triplet
\subsection{Stochastic formulation}
-We propose an algorithm influenced by the semi-stochastic approach originally developed for computing the Epstein-Nesbet second-order perturbation correction to the energy. \cite{garniron_2017}
+We propose an algorithm influenced by the semi-stochastic approach introduced in Ref.~\citenum{garniron_2017}, originally developed for computing the Epstein-Nesbet second-order perturbation correction to the energy.
The perturbative triples correction is expressed as a sum of corrections, each indexed solely by virtual orbitals:
\begin{equation}
@@ -200,11 +200,11 @@ Monte Carlo sampling is employed by selecting samples $E^{abc}$.
The principal advantage of this formulation is that the number of triplet combinations $(a,b,c)$, given by $N_v^3$, is sufficiently small to allow for all contributions $E^{abc}$ to be stored in memory.
The first time a triplet $(a,b,c)$ is drawn, its corresponding value $E^{abc}$ is computed and then stored.
Subsequent drawings of the same triplet retrieve the value from memory. We refer to this technique as \emph{memoization}.
-Thus, the computational expense of calculating the sample, which scales as $N_\text{o}^3 \times N_\text{v}$, is incurred only once, with all subsequent accesses being computationally trivial.
+Thus, the computational expense of calculating the sample, which scales as $N_\text{o}^3 \times N_\text{v}$, is incurred only once, with all subsequent accesses being made at no cost.
Consequently, employing a sufficient number of Monte Carlo samples to ensure that each contribution is selected at least once results in a total computational cost that is only negligibly higher than that of an exact computation.
-To reduce the variance, we apply importance sampling: the samples are drawn using the probability
+To reduce the fluctuations of the statistical estimator, we apply importance sampling: the samples are drawn using the probability
\begin{equation}
P^{abc} = \frac{1}{\mathcal{N}} \frac{1}{\max \left(\epsilon_{\min}, \epsilon_a + \epsilon_b + \epsilon_c \right)}
\end{equation}
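
For illustration, a minimal Python sketch of this memoized importance sampling (all names are hypothetical; compute_E_abc stands for the expensive No^3 x Nv evaluation of one contribution, and this is not the Quantum Package implementation):

    import numpy as np

    def build_probabilities(eps_virt, eps_min):
        # P^{abc} proportional to 1 / max(eps_min, eps_a + eps_b + eps_c),
        # over all N_v^3 triplets of virtual orbitals.
        s = (eps_virt[:, None, None] + eps_virt[None, :, None]
             + eps_virt[None, None, :])
        w = 1.0 / np.maximum(eps_min, s)
        return (w / w.sum()).ravel()              # the 1/N normalization

    def sample_once(rng, prob, cache, compute_E_abc, nv):
        idx = rng.choice(prob.size, p=prob)       # draw a triplet with probability P^{abc}
        if idx not in cache:                      # first drawing: expensive work, done once
            a, b, c = np.unravel_index(idx, (nv, nv, nv))
            cache[idx] = compute_E_abc(a, b, c)   # memoization
        return cache[idx] / prob[idx]             # unbiased estimate of sum_{abc} E^{abc}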
@@ -222,7 +222,8 @@ where $n^{abc}$ is the number of times the triplet $(a,b,c)$ was drawn with prob
\caption{%
Ratios $\frac{E^{abc}}{P^{abc}}$ obtained with the data of benzene/cc-pVTZ, using uniform or importance sampling.
Every bucket, delimited by vertical bars, contains a number of triplets such that the sum
-\(\sum_{(a,b,c)}P^{abc}\) remains as uniform as possible. The zoomed window corresponds to the first bucket. The high fluctuations occurring in the first buckets are reduced by importance sampling.
+\(\sum_{(a,b,c)}P^{abc}\) remains as uniform as possible. The zoomed window corresponds to the first bucket.
+The fluctuations originating from the discrepancy of the values in the first buckets are considerably reduced by importance sampling.
\label{fig:buckets}
}
\end{figure}
@@ -230,8 +231,8 @@ Every bucket, delimited by vertical bars, contains a number of triplets such tha
This approach effectively reduces the statistical error bars by approximately a factor of two for the same computational expense due to two primary reasons: i) the estimator exhibits reduced fluctuations, ii) triplet combinations with low-energy orbitals are significantly more likely to be selected than others, enhancing the efficiency of memoization (see Fig.~\ref{fig:buckets}).
We employ the inverse transform sampling technique to select samples, where an array of pairs $\qty(P^{abc}, (a,b,c))$ is stored.
-To further reduce the variance of the samples, this array is sorted in descending order based on $P^{abc}$ and subsequently partitioned into buckets, $B$, as can be seen diagrammatically in Figure~\ref{fig:buckets}.
-Each bucket is designed such that the sum $\sum_{(a,b,c) \in B} P^{abc}$ within it is as uniform
+To further reduce the variance of the samples, this array is sorted in descending order based on $P^{abc}$ and subsequently partitioned into buckets as can be seen diagrammatically in Figure~\ref{fig:buckets}.
+The partitioning into buckets is designed such that the sum $\sum_{(a,b,c) \in B} P^{abc}$ within each bucket $B$ is as uniform
as possible across all buckets.
As each bucket is equally probable, samples are defined as combinations of triplets, with one triplet drawn from each bucket.
Should the values of $E^{abc}$ be skewed, this advanced refinement significantly diminishes the variance.
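
A simplified sketch of this bucket construction and of the one-triplet-per-bucket drawing, reusing the hypothetical names introduced above (the inclusive index pairs stored in Data/data_buckets.txt play the role of `boundaries` here):

    import numpy as np

    def partition_into_buckets(prob, n_buckets):
        # Sort triplets by decreasing P^{abc}, then cut where the cumulative
        # probability crosses multiples of 1/n_buckets, so that every bucket
        # carries (approximately) the same total probability.
        order = np.argsort(prob)[::-1]
        cum = np.cumsum(prob[order])
        boundaries, start = [], 0
        for b in range(1, n_buckets + 1):
            stop = int(np.searchsorted(cum, b / n_buckets, side="right"))
            stop = max(stop, start + 1)           # keep every bucket non-empty
            boundaries.append((start, stop - 1))  # inclusive indices into `order`
            start = stop
        return order, boundaries

    def draw_sample(rng, order, boundaries, prob):
        # One Monte Carlo sample = one triplet drawn from each (equally probable) bucket.
        sample = []
        for lo, hi in boundaries:
            idx = order[lo:hi + 1]
            p = prob[idx] / prob[idx].sum()       # renormalized within the bucket
            sample.append(int(rng.choice(idx, p=p)))
        return sample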
@@ -241,13 +242,15 @@ The total perturbative contribution is computed as the aggregate of contribution
E_{(T)} = \sum_B E_B = \sum_B\sum_{(a,b,c) \in B} E^{abc}.
\end{equation}
Once every triplet within a bucket $B$ has been drawn at least once, the contribution $E_B$ can be determined.
-At this juncture, there is no longer a necessity to evaluate \(E_B\) stochastically, and the buckets can be categorized into stochastic ($\mathcal{S}$) and deterministic ($\mathcal{D}$) groups:
+At this juncture, there is no longer a necessity to evaluate \(E_B\) stochastically, and the buckets can be categorized into
+deterministic ($\mathcal{D}$) and stochastic ($\mathcal{S}$) groups:
\begin{equation}
+\label{eq:separation}
E_{(T)} = \sum_{B \in \mathcal{D}} E_B + \frac{1}{|\mathcal{S}|} \sum_{B \in \mathcal{S}}
\left \langle \frac{E^B_{abc}}{P^{abc}} \right \rangle_{P^{abc}}.
\end{equation}
-Not all buckets are of equal size; the number of triplets per bucket increases with the bucket's index. Consequently, the initial buckets transition into the deterministic set first, gradually reducing the stochastic contribution. When every triplet has been drawn, the exact value of $E_{(T)}$ is obtained, devoid of statistical error.
+Not all buckets are of equal size (see Figure~\ref{fig:buckets}); the number of triplets per bucket increases with the bucket's index. Consequently, the initial buckets transition into the deterministic set first, gradually reducing the stochastic contribution. When every triplet has been drawn, the exact value of $E_{(T)}$ is obtained, devoid of statistical error.
-To accelerate the completion of the buckets, each Monte Carlo iteration triggers the computation of the first non-computed triplet. This ensures that after $N$ drawings, the
+To accelerate the completion of the buckets, each Monte Carlo iteration concurrently triggers the computation of the first non-computed triplet. This ensures that after $N$ drawings, the
exact contribution from each bucket can be obtained.
The computational time required to generate a random number is negligible compared to the time needed to compute a contribution, $E^{abc}$.
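
The running estimate combining the deterministic and stochastic groups can then be sketched as follows (hypothetical, simplified bookkeeping that mirrors the separation equation above and assumes every stochastic bucket has already been drawn at least once):

    import numpy as np

    def current_estimate(bucket_exact, bucket_done, ratio_sum, n_drawn):
        # bucket_exact[B]: sum of E^{abc} over bucket B (valid once bucket_done[B] is True)
        # ratio_sum[B]   : running sum of E^{abc}/P^{abc} for the drawings made in bucket B
        # n_drawn[B]     : number of drawings made in bucket B
        D = bucket_done                                  # deterministic group
        S = ~bucket_done                                 # stochastic group
        e_det = bucket_exact[D].sum()
        if S.any():
            e_sto = np.mean(ratio_sum[S] / n_drawn[S])   # (1/|S|) * sum over S of <E/P>
        else:
            e_sto = 0.0                                  # every bucket complete: E_(T) is exact
        return e_det + e_sto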
@@ -331,7 +334,7 @@ for printing or for exiting when the statistical error gets below a given thresh
The number of samples $N^{abc}$ of each triplet $(a,b,c)$ is initialized to $-1$, to identify
the contributions that have not been already computed.
-An outer \emph{for loop} runs over the maximum number of iteration, equal to
+An outer \emph{for loop} runs over the maximum number of iterations, equal by construction to
the number of different triplets $N_{\text{triplets}}$.
Within a loop iteration, the index of the first non-computed triplet $(a,b,c)$ is identified, and the task associated with its computation is sent to the task queue.
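
In outline, the driver loop described here behaves like the following sketch (hypothetical names; in the real code the queued tasks are consumed asynchronously by worker processes):

    from queue import Queue

    def driver_loop(n_triplets, draw_triplet, task_queue: Queue):
        n_samples = [-1] * n_triplets             # -1 marks a not-yet-computed contribution
        for _ in range(n_triplets):               # at most N_triplets iterations
            first = next((t for t, n in enumerate(n_samples) if n == -1), None)
            if first is None:
                break                             # everything computed: E_(T) is exact
            task_queue.put(first)                 # deterministic completion of the buckets
            n_samples[first] = 0
            drawn = draw_triplet()                # stochastic drawing
            if n_samples[drawn] == -1:
                task_queue.put(drawn)             # first drawing triggers the computation
                n_samples[drawn] = 0
            n_samples[drawn] += 1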
@@ -363,10 +366,10 @@ The calculations were performed on an AMD \textsc{Epyc} 7513 dual socket server
Figure~\ref{fig:benzene} shows the convergence of the CCSD(T) energy as a function of the program execution time using the two basis sets.
Notably, the exact CCSD(T) energy always falls within $2\sigma$, affirming the reliability of the statistical error.
Figure~\ref{fig:benzene_err} displays the statistical error as a function of the percentage of computed contributions.
-Noteworthy in the figure are the curve discontinuities, attributable to readjustments in the separation between the deterministic and stochastic components of the calculation.
+Noteworthy in the figure are the curve discontinuities, attributable to readjustments in the separation between the deterministic and stochastic components of the calculation (Eq.~\eqref{eq:separation}).
These updates lead to revised estimates and a diminution in statistical error.
-Achieving chemical accuracy, defined as \SI{1.6}{\milli\hartree}, necessitates less than 1\% of the total contributions in both basis sets.
+Achieving chemical accuracy, defined as \SI{1.6}{\milli\hartree},\cite{pople_1999} necessitates less than 1\% of the total contributions in both basis sets.
Attaining a \SI{0.1}{\milli\hartree} precision level requires computation of 32\% and 15\% of the contributions for cc-pVTZ and cc-pVQZ, respectively.
The more rapid convergence observed with the larger basis set aligns with expectations, as expanding the basis set tends to increase the proportion of minor contributions while maintaining a relatively steady count of significant contributions.
This trend underscores the algorithm's enhanced suitability for systems with fewer electrons and extensive basis sets, as opposed to larger electron counts in smaller basis sets.
@@ -378,8 +381,8 @@ This trend underscores the algorithm's enhanced suitability for systems with few
Our methodology proves especially advantageous for scenarios requiring the
aggregation of numerous CCSD(T) energies, such as neural network training or
the exploration of potential energy surfaces.
-In a recent article, the authors highlight the pivotal role of Quantum Monte
-Carlo (QMC) in generating data for constructing potential energy surfaces.
+In a recent article, Ceperley \textit{et al.} highlight the pivotal role of Quantum Monte
+Carlo (QMC) in generating data for constructing potential energy surfaces.\cite{ceperley_2024}
The study suggests that stochastic noise inherent in QMC can facilitate machine
learning model training, demonstrating that models can benefit from numerous,
less precise data points. These findings are supported by an analysis of
@@ -411,7 +414,7 @@ We froze the six lowest molecular orbitals, specifically the $1s$ orbital of \ce
The fitted Morse potential revealed a vibrational frequency of $\nu = \SI{414.7}{\per\centi\meter}$ and an equilibrium bond length of $r_e = \SI{3.92}{\bohr}$, aligning remarkably well with experimental values from the NIST database\cite{nist_2022} $\nu = \SI{417.6}{\per\centi\meter}$ and $r_e = \SI{3.88}{\bohr}$.
Subsequently, we applied our semi-stochastic algorithm to estimate the perturbative triples correction, utilizing merely 1\% of the total contributions.
-This approach yielded a hundredfold acceleration in computational efficiency, achieving statistical uncertainty within the range of \SI{1.2} to \SI{2.0}{\milli\hartree}.
+This approach yielded a hundredfold acceleration in computational efficiency, achieving statistical uncertainty within the range of \num{1.2} to \SI{2.0}{\milli\hartree} for each data point.
The vibrational frequency and equilibrium distance estimated using this data, $\nu = \SI{415.1}{\per\centi\meter}$ and $r_e = \SI{3.91}{\bohr}$, demonstrated comparable precision to the full computational results.
Figure \ref{fig:cucl} illustrates the potential energy surface of \ce{CuCl}, displaying both the exact CCSD(T) energies and those estimated via the semi-stochastic method.
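
As an illustration of this fitting step, a self-contained sketch on synthetic data (the grid, noise level, and Morse parameters are placeholders chosen only to resemble the CuCl numbers, not the paper's data):

    import numpy as np
    from scipy.optimize import curve_fit

    HARTREE_TO_CM = 219474.63                     # hartree -> cm^-1
    AMU_TO_ME = 1822.888                          # atomic mass units -> electron masses

    def morse(r, De, a, re, E0):
        return De * (1.0 - np.exp(-a * (r - re)))**2 + E0

    rng = np.random.default_rng(0)
    r = np.linspace(3.2, 5.2, 21)                 # bond lengths in bohr (placeholder grid)
    E = morse(r, 0.14, 0.72, 3.92, -2099.0)       # placeholder "exact" curve in hartree
    E += rng.normal(scale=1.0e-3, size=r.size)    # ~1 mHartree statistical noise

    mu = 62.9296 * 34.9689 / (62.9296 + 34.9689) * AMU_TO_ME   # reduced mass of 63Cu35Cl
    (De, a, re, E0), _ = curve_fit(morse, r, E, p0=[0.1, 1.0, 3.9, E.min()])
    nu = a * np.sqrt(2.0 * De / mu) * HARTREE_TO_CM            # harmonic wavenumber
    print(f"r_e = {re:.2f} bohr, nu = {nu:.0f} cm^-1")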
@@ -424,7 +427,7 @@ However, we have outlined a strategy to reframe this operation into BLAS matrix
We evaluated the efficiency of our implementation using the Likwid\cite{treibig_2010} performance analysis tool on two distinct x86 platforms: an AMD \textsc{Epyc} 7513 dual-socket server equipped with 64 cores at \SI{2.6}{\giga\hertz}, and an Intel Xeon Gold 6130 dual-socket server with 32 cores at \SI{2.1}{\giga\hertz}.
We linked our code with the Intel MKL library for BLAS operations.
Additionally, we executed the code on an ARM Q80 server featuring 80 cores at \SI{2.8}{\giga\hertz}, and although performance counters were unavailable, we approximated the Flop/s rate by comparing the total execution time with that measured on the AMD CPU.
-For this, we utilized the \textsc{ArmPL} library for BLAS operations.
+On the ARM architecture, we utilized the \textsc{ArmPL} library for BLAS operations.
\begin{table*}[htb]
\begin{ruledtabular}
@@ -445,7 +448,7 @@ Peak performance is determined by calculating the maximum achievable Flops/s on
\begin{equation}
P = N_{\text{cores}} \times N_{\text{FMA}} \times 2 \times V \times F
\end{equation}
-where $F$ represents the frequency, $V$ the number of double precision elements in a vector register, $N_{\text{FMA}}$ denotes the number of vector FMA units per core (all considered CPUs possess two), and $N_{\text{cores}}$ reflects the number of cores. Notably, the Xeon and ARM CPUs both operate at approximately 30\% of peak performance, while the AMD \textsc{Epyc} CPU demonstrates twice the efficiency, achieving 60\% of the peak.
+where $F$ represents the processor frequency, $V$ the number of double precision elements in a vector register, $N_{\text{FMA}}$ denotes the number of vector fused multiply-accumulate (FMA) units per core (all considered CPUs possess two), and $N_{\text{cores}}$ reflects the number of cores. Notably, the Xeon and ARM CPUs both operate at approximately 30\% of peak performance, while the AMD \textsc{Epyc} CPU demonstrates twice the efficiency, achieving 60\% of the peak.
The relatively modest performance, at around 30\% efficiency, is attributed to the small dimensions of the matrices involved.
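
For example, with the values quoted above for the AMD server (64 cores, two FMA units per core, 2.6 GHz) and assuming 256-bit AVX2 vectors, i.e. $V = 4$ double-precision elements (an assumption, not stated in the text), the formula gives

\[ P = 64 \times 2 \times 2 \times 4 \times 2.6\,\text{GHz} \approx 2.7\ \text{TFlop/s}, \]

so the reported 60\% efficiency would correspond to roughly 1.6 TFlop/s sustained.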
@@ -491,7 +494,7 @@ This novel approach combines deterministic and stochastic methods to optimize bo
The core of our algorithm is based on selectively calculating contributions labeled by triplets of virtual orbitals leveraging Monte Carlo sampling, and employing memoization to suppress redundant calculations.
Our results demonstrate that the semi-stochastic algorithm substantially reduces the computational effort compared to traditional deterministic methods, achieving near-exact accuracy with significantly reduced computational resources. Specifically, we have shown that the algorithm can achieve chemical accuracy with a small fraction of the computational effort required by fully deterministic approaches. This efficiency opens up new possibilities for studying larger systems or employing more extensive basis sets that were previously beyond reach due to computational constraints.
-Additionally, the implementation of this algorithm has proven to be highly parallelizable, demonstrating excellent scalability across different high-performance computing platforms.
+Additionally, the implementation of this algorithm has proven to be highly parallelizable, demonstrating excellent scalability across different platforms.
An important aspect of our investigation focused on the application of our algorithm to potential energy surface scanning.
Our method aligns well with recent findings suggesting the utility of numerous, less precise data points in constructing machine learning models.\cite{ceperley_2024}