Minor changes

This commit is contained in:
Anthony Scemama 2024-05-25 17:36:00 +02:00
parent 0ecca24d2e
commit 2e18256b80
7 changed files with 211 additions and 22 deletions

102
Data/data_buckets.txt Normal file

@@ -0,0 +1,102 @@
# Indices of the first and last triplet in each bucket
1 3115
3116 7569
7570 12858
12859 18844
18845 25457
25458 32641
32642 40359
40360 48602
48603 57368
57369 66666
66667 76503
76504 86871
86872 97755
97756 109142
109143 121012
121013 133351
133352 146139
146140 159350
159351 172962
172963 186962
186963 201323
201324 216044
216045 231111
231112 246517
246518 262251
262252 278306
278307 294677
294678 311356
311357 328334
328335 345611
345612 363187
363188 381057
381058 399222
399223 417686
417687 436440
436441 455486
455487 474825
474826 494452
494453 514365
514366 534562
534563 555043
555044 575811
575812 596863
596864 618204
618205 639836
639837 661752
661753 683957
683958 706450
706451 729231
729232 752298
752299 775647
775648 799283
799284 823209
823210 847424
847425 871929
871930 896723
896724 921818
921819 947209
947210 972890
972891 998869
998870 1025144
1025145 1051719
1051720 1078598
1078599 1105778
1105779 1133271
1133272 1161078
1161079 1189201
1189202 1217647
1217648 1246417
1246418 1275517
1275518 1304950
1304951 1334724
1334725 1364840
1364841 1395318
1395319 1426160
1426161 1457375
1457376 1488973
1488974 1520966
1520967 1553357
1553358 1586169
1586170 1619407
1619408 1653094
1653095 1687252
1687253 1721895
1721896 1757050
1757051 1792738
1792739 1828990
1828991 1865850
1865851 1903360
1903361 1941580
1941581 1980567
1980568 2020412
2020413 2061234
2061235 2103178
2103179 2146484
2146485 2191517
2191518 2238934
2238935 2290534
2290535 2349496
2349497 2420847
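
The format is one comment line followed by one pair of 1-based, inclusive triplet indices per bucket; a minimal, hypothetical Python reader (the path and names are illustrative, not part of the repository) could be:

    def read_buckets(path="Data/data_buckets.txt"):
        # Return a list of (first, last) triplet indices, one pair per bucket.
        buckets = []
        with open(path) as f:
            for line in f:
                if line.startswith("#") or not line.strip():
                    continue
                first, last = map(int, line.split())
                buckets.append((first, last))
        return buckets

    # With the data above, read_buckets()[0] == (1, 3115), and the number of
    # triplets per bucket grows with the bucket index.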


@@ -283,8 +283,6 @@ unset grid
## Last datafile plotted: "max_data.dat"
plot 'max_data.dat' every 1 index 0 u 1:(-$3 * 2420853) w impulses lw 3 title 'Uniform sampling', \
     'max_data.dat' every 1 index 0 u 1:(-$3 * $2) w impulses lw 3 title 'Importance sampling' , 0.051806920848 w l title ''
-#plot 'data.dat' every 1 index 0 u (-$3 * 2420853) w impulses lw 3 title 'Uniform sampling', \
-#     'data.dat' every 1 index 0 u (-$3 * $2) w impulses lw 3 title 'Importance sampling'
# Zoomed plot
set grid

2
Manuscript/.gitignore vendored Normal file

@@ -0,0 +1,2 @@
/build/
/svg-inkscape/

BIN
Manuscript/buckets.pdf Normal file

Binary file not shown.

70
Manuscript/orcidlink.sty Normal file

@@ -0,0 +1,70 @@
%%
%% This is file `orcidlink.sty',
%% generated with the docstrip utility.
%%
%% The original source files were:
%%
%% orcidlink.dtx (with options: `package')
%%
%% This is a generated file.
%%
%% Copyright (C) 2019-2023 by Leo C. Stein <leo.stein@gmail.com>
%% --------------------------------------------------------------------------
%% This work may be distributed and/or modified under the
%% conditions of the LaTeX Project Public License, either version 1.3
%% of this license or (at your option) any later version.
%% The latest version of this license is in
%% http://www.latex-project.org/lppl.txt
%% and version 1.3 or later is part of all distributions of LaTeX
%% version 2005/12/01 or later.
%%
\NeedsTeXFormat{LaTeX2e}[1994/06/01]
\ProvidesPackage{orcidlink}
[2023/12/30 v1.0.5 Linked ORCiD logo macro package]
%% All I did was package up Milo's code on TeX.SE,
%% see https://tex.stackexchange.com/a/445583/34063
\RequirePackage{hyperref}
\RequirePackage{tikz}
\ProcessOptions\relax
\usetikzlibrary{svg.path}
\definecolor{orcidlogocol}{HTML}{A6CE39}
\tikzset{
orcidlogo/.pic={
\fill[orcidlogocol] svg{M256,128c0,70.7-57.3,128-128,128C57.3,256,0,198.7,0,128C0,57.3,57.3,0,128,0C198.7,0,256,57.3,256,128z};
\fill[white] svg{M86.3,186.2H70.9V79.1h15.4v48.4V186.2z}
svg{M108.9,79.1h41.6c39.6,0,57,28.3,57,53.6c0,27.5-21.5,53.6-56.8,53.6h-41.8V79.1z M124.3,172.4h24.5c34.9,0,42.9-26.5,42.9-39.7c0-21.5-13.7-39.7-43.7-39.7h-23.7V172.4z}
svg{M88.7,56.8c0,5.5-4.5,10.1-10.1,10.1c-5.6,0-10.1-4.6-10.1-10.1c0-5.6,4.5-10.1,10.1-10.1C84.2,46.7,88.7,51.3,88.7,56.8z};
}
}
%% Reciprocal of the height of the svg whose source is above. The
%% original generates a 256pt high graphic; this macro holds 1/256.
\newcommand{\@OrigHeightRecip}{0.00390625}
%% We will compute the current X height to make the logo the right height
\newlength{\@curXheight}
%% Prevent externalization of the ORCiD logo.
\newcommand{\@preventExternalization}{%
\ifcsname tikz@library@external@loaded\endcsname%
\tikzset{external/export next=false}\else\fi%
}
\DeclareRobustCommand\orcidlink[1]{%
\texorpdfstring{%
\setlength{\@curXheight}{\fontcharht\font`X}%
\href{https://orcid.org/#1}{\XeTeXLinkBox{\mbox{%
\@preventExternalization%
\begin{tikzpicture}[yscale=-\@OrigHeightRecip*\@curXheight,
xscale=\@OrigHeightRecip*\@curXheight,transform shape]
\pic{orcidlogo};
\end{tikzpicture}%
}}}}{}}
\endinput
%%
%% End of file `orcidlink.sty'.


@@ -563,4 +563,18 @@ swh:1:dir:6d82ae7ac757c78d7720dd89dfa52d7a453d2f68;origin=https://github.com/Qua
url = {https://pubs.acs.org/doi/abs/10.1021/acs.jctc.7b00049},
volume = {13},
year = {2017}
+}
+@article{pople_1999,
+author = {Pople, John A.},
+title = {{Nobel Lecture: Quantum chemical models}},
+journal = {Rev. Mod. Phys.},
+volume = {71},
+number = {5},
+pages = {1267--1274},
+year = {1999},
+month = oct,
+issn = {1539-0756},
+publisher = {American Physical Society},
+doi = {10.1103/RevModPhys.71.1267}
}


@@ -165,11 +165,11 @@ For a closed-shell reference with canonical orbitals, each individual term is ex
\end{equation}
and depends on the canonical orbital energies $\epsilon$, and on the tensors $W$ and $V$:
\begin{align}
-W_{ijk}^{abc} & = P_{ijk}^{abc} \qty( \sum_d^{\text{virt}} \qty(bd|ai) t_{kj}^{cd} -
+W_{ijk}^{abc} & = \Pi_{ijk}^{abc} \qty( \sum_d^{\text{virt}} \qty(bd|ai) t_{kj}^{cd} -
\sum_l^{\text{occ}} \qty(ck|jl) t_{ab}^{il}) \\
V_{ijk}^{abc} & = W_{ijk}^{abc} + \qty(bj|ck) t_i^a + \qty(ai|ck) t_j^b + \qty(ai|bj) t_k^c
\end{align}
-where $P_{ijk}^{abc}$ is a sign-less permutation operator, $(t^a_i, t^{ab}_{ij})$ are the CCSD amplitudes,
+where $\Pi_{ijk}^{abc}$ is a sign-less permutation operator, $(t^a_i, t^{ab}_{ij})$ are the CCSD amplitudes,
\((pq|rs)\) are the two-electron coulomb integrals,
and the indices $i,j,k,l$ and $a,b,c,d$ refer to occupied and virtual orbitals, respectively.
@@ -189,7 +189,7 @@ In the algorithm proposed by Rendell\cite{rendell_1991}, for each given triplet
\subsection{Stochastic formulation}
-We propose an algorithm influenced by the semi-stochastic approach originally developed for computing the Epstein-Nesbet second-order perturbation correction to the energy. \cite{garniron_2017}
+We propose an algorithm influenced by the semi-stochastic approach introduced in Ref.~\citenum{garniron_2017}, originally developed for computing the Epstein-Nesbet second-order perturbation correction to the energy.
The perturbative triples correction is expressed as a sum of corrections, each indexed solely by virtual orbitals:
\begin{equation}
@@ -200,11 +200,11 @@ Monte Carlo sampling is employed by selecting samples $E^{abc}$.
The principal advantage of this formulation is that the number of triplet combinations $(a,b,c)$, given by $N_v^3$, is sufficiently small to allow for all contributions $E^{abc}$ to be stored in memory.
The first time a triplet $(a,b,c)$ is drawn, its corresponding value $E^{abc}$ is computed and then stored.
Subsequent drawings of the same triplet retrieve the value from memory. We refer to this technique as \emph{memoization}.
-Thus, the computational expense of calculating the sample, which scales as $N_\text{o}^3 \times N_\text{v}$, is incurred only once, with all subsequent accesses being computationally trivial.
+Thus, the computational expense of calculating the sample, which scales as $N_\text{o}^3 \times N_\text{v}$, is incurred only once, with all subsequent accesses being made at no cost.
Consequently, employing a sufficient number of Monte Carlo samples to ensure that each contribution is selected at least once results in a total computational cost that is only negligibly higher than that of an exact computation.
-To reduce the variance, we apply importance sampling: the samples are drawn using the probability
+To reduce the fluctuations of the statistical estimator, we apply importance sampling: the samples are drawn using the probability
\begin{equation}
P^{abc} = \frac{1}{\mathcal{N}} \frac{1}{\max \left(\epsilon_{\min}, \epsilon_a + \epsilon_b + \epsilon_c \right)}
\end{equation}
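
For illustration, a minimal Python sketch of this memoized importance sampling (all names are hypothetical; compute_E_abc stands for the expensive No^3 x Nv evaluation of one contribution, and this is not the Quantum Package implementation):

    import numpy as np

    def build_probabilities(eps_virt, eps_min):
        # P^{abc} proportional to 1 / max(eps_min, eps_a + eps_b + eps_c),
        # over all N_v^3 triplets of virtual orbitals.
        s = (eps_virt[:, None, None] + eps_virt[None, :, None]
             + eps_virt[None, None, :])
        w = 1.0 / np.maximum(eps_min, s)
        return (w / w.sum()).ravel()              # the 1/N normalization

    def sample_once(rng, prob, cache, compute_E_abc, nv):
        idx = rng.choice(prob.size, p=prob)       # draw a triplet with probability P^{abc}
        if idx not in cache:                      # first drawing: expensive work, done once
            a, b, c = np.unravel_index(idx, (nv, nv, nv))
            cache[idx] = compute_E_abc(a, b, c)   # memoization
        return cache[idx] / prob[idx]             # unbiased estimate of sum_{abc} E^{abc}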
@@ -222,7 +222,8 @@ where $n^{abc}$ is the number of times the triplet $(a,b,c)$ was drawn with prob
\caption{%
Ratios $\frac{E^{abc}}{P^{abc}}$ obtained with the data of benzene/cc-pVTZ, using uniform or importance sampling.
Every bucket, delimited by vertical bars, contains a number of triplets such that the sum
-\(\sum_{(a,b,c)}P^{abc}\) remains as uniform as possible. The zoomed window corresponds to the first bucket. The high fluctuations occurring in the first buckets are reduced by importance sampling.
+\(\sum_{(a,b,c)}P^{abc}\) remains as uniform as possible. The zoomed window corresponds to the first bucket.
+The fluctuations originating from the discrepancy of the values in the first buckets are considerably reduced by importance sampling.
\label{fig:buckets}
}
\end{figure}
@@ -230,8 +231,8 @@ Every bucket, delimited by vertical bars, contains a number of triplets such tha
This approach effectively reduces the statistical error bars by approximately a factor of two for the same computational expense due to two primary reasons: i) the estimator exhibits reduced fluctuations, ii) triplet combinations with low-energy orbitals are significantly more likely to be selected than others, enhancing the efficiency of memoization (see Fig.~\ref{fig:buckets}).
We employ the inverse transform sampling technique to select samples, where an array of pairs $\qty(P^{abc}, (a,b,c))$ is stored.
-To further reduce the variance of the samples, this array is sorted in descending order based on $P^{abc}$ and subsequently partitioned into buckets, $B$, as can be seen diagrammatically in Figure~\ref{fig:buckets}.
-Each bucket is designed such that the sum $\sum_{(a,b,c) \in B} P^{abc}$ within it is as uniform
+To further reduce the variance of the samples, this array is sorted in descending order based on $P^{abc}$ and subsequently partitioned into buckets as can be seen diagrammatically in Figure~\ref{fig:buckets}.
+The partitioning into buckets is designed such that the sum $\sum_{(a,b,c) \in B} P^{abc}$ within each bucket $B$ is as uniform
as possible across all buckets.
As each bucket is equally probable, samples are defined as combinations of triplets, with one triplet drawn from each bucket.
Should the values of $E^{abc}$ be skewed, this advanced refinement significantly diminishes the variance.
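
A simplified sketch of this bucket construction and of the one-triplet-per-bucket drawing, reusing the hypothetical names introduced above (the inclusive index pairs stored in Data/data_buckets.txt play the role of `boundaries` here):

    import numpy as np

    def partition_into_buckets(prob, n_buckets):
        # Sort triplets by decreasing P^{abc}, then cut where the cumulative
        # probability crosses multiples of 1/n_buckets, so that every bucket
        # carries (approximately) the same total probability.
        order = np.argsort(prob)[::-1]
        cum = np.cumsum(prob[order])
        boundaries, start = [], 0
        for b in range(1, n_buckets + 1):
            stop = int(np.searchsorted(cum, b / n_buckets, side="right"))
            stop = max(stop, start + 1)           # keep every bucket non-empty
            boundaries.append((start, stop - 1))  # inclusive indices into `order`
            start = stop
        return order, boundaries

    def draw_sample(rng, order, boundaries, prob):
        # One Monte Carlo sample = one triplet drawn from each (equally probable) bucket.
        sample = []
        for lo, hi in boundaries:
            idx = order[lo:hi + 1]
            p = prob[idx] / prob[idx].sum()       # renormalized within the bucket
            sample.append(int(rng.choice(idx, p=p)))
        return sample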
@@ -241,13 +242,15 @@ The total perturbative contribution is computed as the aggregate of contribution
E_{(T)} = \sum_B E_B = \sum_B\sum_{(a,b,c) \in B} E^{abc}.
\end{equation}
Once every triplet within a bucket $B$ has been drawn at least once, the contribution $E_B$ can be determined.
-At this juncture, there is no longer a necessity to evaluate \(E_B\) stochastically, and the buckets can be categorized into stochastic ($\mathcal{S}$) and deterministic ($\mathcal{D}$) groups:
+At this juncture, there is no longer a necessity to evaluate \(E_B\) stochastically, and the buckets can be categorized into
+deterministic ($\mathcal{D}$) and stochastic ($\mathcal{S}$) groups:
\begin{equation}
+\label{eq:separation}
E_{(T)} = \sum_{B \in \mathcal{D}} E_B + \frac{1}{|\mathcal{S}|} \sum_{B \in \mathcal{S}}
\left \langle \frac{E^B_{abc}}{P^{abc}} \right \rangle_{P^{abc}}.
\end{equation}
-Not all buckets are of equal size; the number of triplets per bucket increases with the bucket's index. Consequently, the initial buckets transition into the deterministic set first, gradually reducing the stochastic contribution. When every triplet has been drawn, the exact value of $E_{(T)}$ is obtained, devoid of statistical error.
+Not all buckets are of equal size (see Figure~\ref{fig:buckets}); the number of triplets per bucket increases with the bucket's index. Consequently, the initial buckets transition into the deterministic set first, gradually reducing the stochastic contribution. When every triplet has been drawn, the exact value of $E_{(T)}$ is obtained, devoid of statistical error.
-To accelerate the completion of the buckets, each Monte Carlo iteration triggers the computation of the first non-computed triplet. This ensures that after $N$ drawings, the
+To accelerate the completion of the buckets, each Monte Carlo iteration concurrently triggers the computation of the first non-computed triplet. This ensures that after $N$ drawings, the
exact contribution from each bucket can be obtained.
The computational time required to generate a random number is negligible compared to the time needed to compute a contribution, $E^{abc}$.
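
The running estimate combining the deterministic and stochastic groups can then be sketched as follows (hypothetical, simplified bookkeeping that mirrors the separation equation above and assumes every stochastic bucket has already been drawn at least once):

    import numpy as np

    def current_estimate(bucket_exact, bucket_done, ratio_sum, n_drawn):
        # bucket_exact[B]: sum of E^{abc} over bucket B (valid once bucket_done[B] is True)
        # ratio_sum[B]   : running sum of E^{abc}/P^{abc} for the drawings made in bucket B
        # n_drawn[B]     : number of drawings made in bucket B
        D = bucket_done                                  # deterministic group
        S = ~bucket_done                                 # stochastic group
        e_det = bucket_exact[D].sum()
        if S.any():
            e_sto = np.mean(ratio_sum[S] / n_drawn[S])   # (1/|S|) * sum over S of <E/P>
        else:
            e_sto = 0.0                                  # every bucket complete: E_(T) is exact
        return e_det + e_sto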
@@ -331,7 +334,7 @@ for printing or for exiting when the statistical error gets below a given thresh
The number of samples $N^{abc}$ of each triplet $(a,b,c)$ is initialized to $-1$, to identify
the contributions that have not been already computed.
-An outer \emph{for loop} runs over the maximum number of iteration, equal to
+An outer \emph{for loop} runs over the maximum number of iterations, equal by construction to
the number of different triplets $N_{\text{triplets}}$.
Within a loop iteration, the index of the first non-computed triplet $(a,b,c)$ is identified, and the task associated with its computation is sent to the task queue.
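
In outline, the driver loop described here behaves like the following sketch (hypothetical names; in the real code the queued tasks are consumed asynchronously by worker processes):

    from queue import Queue

    def driver_loop(n_triplets, draw_triplet, task_queue: Queue):
        n_samples = [-1] * n_triplets             # -1 marks a not-yet-computed contribution
        for _ in range(n_triplets):               # at most N_triplets iterations
            first = next((t for t, n in enumerate(n_samples) if n == -1), None)
            if first is None:
                break                             # everything computed: E_(T) is exact
            task_queue.put(first)                 # deterministic completion of the buckets
            n_samples[first] = 0
            drawn = draw_triplet()                # stochastic drawing
            if n_samples[drawn] == -1:
                task_queue.put(drawn)             # first drawing triggers the computation
                n_samples[drawn] = 0
            n_samples[drawn] += 1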
@@ -363,10 +366,10 @@ The calculations were performed on an AMD \textsc{Epyc} 7513 dual socket server
Figure~\ref{fig:benzene} shows the convergence of the CCSD(T) energy as a function of the program execution time using the two basis sets.
Notably, the exact CCSD(T) energy always falls within $2\sigma$, affirming the reliability of the statistical error.
Figure~\ref{fig:benzene_err} displays the statistical error as a function of the percentage of computed contributions.
-Noteworthy in the figure are the curve discontinuities, attributable to readjustments in the separation between the deterministic and stochastic components of the calculation.
+Noteworthy in the figure are the curve discontinuities, attributable to readjustments in the separation between the deterministic and stochastic components of the calculation (Eq.~\eqref{eq:separation}).
These updates lead to revised estimates and a diminution in statistical error.
-Achieving chemical accuracy, defined as \SI{1.6}{\milli\hartree}, necessitates less than 1\% of the total contributions in both basis sets.
+Achieving chemical accuracy, defined as \SI{1.6}{\milli\hartree},\cite{pople_1999} necessitates less than 1\% of the total contributions in both basis sets.
Attaining a \SI{0.1}{\milli\hartree} precision level requires computation of 32\% and 15\% of the contributions for cc-pVTZ and cc-pVQZ, respectively.
The more rapid convergence observed with the larger basis set aligns with expectations, as expanding the basis set tends to increase the proportion of minor contributions while maintaining a relatively steady count of significant contributions.
This trend underscores the algorithm's enhanced suitability for systems with fewer electrons and extensive basis sets, as opposed to larger electron counts in smaller basis sets.
@@ -378,8 +381,8 @@ This trend underscores the algorithm's enhanced suitability for systems with few
Our methodology proves especially advantageous for scenarios requiring the
aggregation of numerous CCSD(T) energies, such as neural network training or
the exploration of potential energy surfaces.
-In a recent article, the authors highlight the pivotal role of Quantum Monte
-Carlo (QMC) in generating data for constructing potential energy surfaces.
+In a recent article, Ceperley \textit{et al.} highlight the pivotal role of Quantum Monte
+Carlo (QMC) in generating data for constructing potential energy surfaces.\cite{ceperley_2024}
The study suggests that stochastic noise inherent in QMC can facilitate machine
learning model training, demonstrating that models can benefit from numerous,
less precise data points. These findings are supported by an analysis of
@@ -411,7 +414,7 @@ We froze the six lowest molecular orbitals, specifically the $1s$ orbital of \ce
The fitted Morse potential revealed a vibrational frequency of $\nu = \SI{414.7}{\per\centi\meter}$ and an equilibrium bond length of $r_e = \SI{3.92}{\bohr}$, aligning remarkably well with experimental values from the NIST database\cite{nist_2022} $\nu = \SI{417.6}{\per\centi\meter}$ and $r_e = \SI{3.88}{\bohr}$.
Subsequently, we applied our semi-stochastic algorithm to estimate the perturbative triples correction, utilizing merely 1\% of the total contributions.
-This approach yielded a hundredfold acceleration in computational efficiency, achieving statistical uncertainty within the range of \SI{1.2} to \SI{2.0}{\milli\hartree}.
+This approach yielded a hundredfold acceleration in computational efficiency, achieving statistical uncertainty within the range of \num{1.2} to \SI{2.0}{\milli\hartree} for each data point.
The vibrational frequency and equilibrium distance estimated using this data, $\nu = \SI{415.1}{\per\centi\meter}$ and $r_e = \SI{3.91}{\bohr}$, demonstrated comparable precision to the full computational results.
Figure \ref{fig:cucl} illustrates the potential energy surface of \ce{CuCl}, displaying both the exact CCSD(T) energies and those estimated via the semi-stochastic method.
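
As an illustration of this fitting step, a self-contained sketch on synthetic data (the grid, noise level, and Morse parameters are placeholders chosen only to resemble the CuCl numbers, not the paper's data):

    import numpy as np
    from scipy.optimize import curve_fit

    HARTREE_TO_CM = 219474.63                     # hartree -> cm^-1
    AMU_TO_ME = 1822.888                          # atomic mass units -> electron masses

    def morse(r, De, a, re, E0):
        return De * (1.0 - np.exp(-a * (r - re)))**2 + E0

    rng = np.random.default_rng(0)
    r = np.linspace(3.2, 5.2, 21)                 # bond lengths in bohr (placeholder grid)
    E = morse(r, 0.14, 0.72, 3.92, -2099.0)       # placeholder "exact" curve in hartree
    E += rng.normal(scale=1.0e-3, size=r.size)    # ~1 mHartree statistical noise

    mu = 62.9296 * 34.9689 / (62.9296 + 34.9689) * AMU_TO_ME   # reduced mass of 63Cu35Cl
    (De, a, re, E0), _ = curve_fit(morse, r, E, p0=[0.1, 1.0, 3.9, E.min()])
    nu = a * np.sqrt(2.0 * De / mu) * HARTREE_TO_CM            # harmonic wavenumber
    print(f"r_e = {re:.2f} bohr, nu = {nu:.0f} cm^-1")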
@@ -424,7 +427,7 @@ However, we have outlined a strategy to reframe this operation into BLAS matrix
We evaluated the efficiency of our implementation using the Likwid\cite{treibig_2010} performance analysis tool on two distinct x86 platforms: an AMD \textsc{Epyc} 7513 dual-socket server equipped with 64 cores at \SI{2.6}{\giga\hertz}, and an Intel Xeon Gold 6130 dual-socket server with 32 cores at \SI{2.1}{\giga\hertz}.
We linked our code with the Intel MKL library for BLAS operations.
Additionally, we executed the code on an ARM Q80 server featuring 80 cores at \SI{2.8}{\giga\hertz}, and although performance counters were unavailable, we approximated the Flop/s rate by comparing the total execution time with that measured on the AMD CPU.
-For this, we utilized the \textsc{ArmPL} library for BLAS operations.
+On the ARM architecture, we utilized the \textsc{ArmPL} library for BLAS operations.
\begin{table*}[htb]
\begin{ruledtabular}
@@ -445,7 +448,7 @@ Peak performance is determined by calculating the maximum achievable Flops/s on
\begin{equation}
P = N_{\text{cores}} \times N_{\text{FMA}} \times 2 \times V \times F
\end{equation}
-where $F$ represents the frequency, $V$ the number of double precision elements in a vector register, $N_{\text{FMA}}$ denotes the number of vector FMA units per core (all considered CPUs possess two), and $N_{\text{cores}}$ reflects the number of cores. Notably, the Xeon and ARM CPUs both operate at approximately 30\% of peak performance, while the AMD \textsc{Epyc} CPU demonstrates twice the efficiency, achieving 60\% of the peak.
+where $F$ represents the processor frequency, $V$ the number of double precision elements in a vector register, $N_{\text{FMA}}$ denotes the number of vector fused multiply-accumulate (FMA) units per core (all considered CPUs possess two), and $N_{\text{cores}}$ reflects the number of cores. Notably, the Xeon and ARM CPUs both operate at approximately 30\% of peak performance, while the AMD \textsc{Epyc} CPU demonstrates twice the efficiency, achieving 60\% of the peak.
The relatively modest performance, at around 30\% efficiency, is attributed to the small dimensions of the matrices involved.
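
For example, with the values quoted above for the AMD server (64 cores, two FMA units per core, 2.6 GHz) and assuming 256-bit AVX2 vectors, i.e. $V = 4$ double-precision elements (an assumption, not stated in the text), the formula gives

\[ P = 64 \times 2 \times 2 \times 4 \times 2.6\,\text{GHz} \approx 2.7\ \text{TFlop/s}, \]

so the reported 60\% efficiency would correspond to roughly 1.6 TFlop/s sustained.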
@@ -491,7 +494,7 @@ This novel approach combines deterministic and stochastic methods to optimize bo
The core of our algorithm is based on selectively calculating contributions labeled by triplets of virtual orbitals leveraging Monte Carlo sampling, and employing memoization to suppress redundant calculations.
Our results demonstrate that the semi-stochastic algorithm substantially reduces the computational effort compared to traditional deterministic methods, achieving near-exact accuracy with significantly reduced computational resources. Specifically, we have shown that the algorithm can achieve chemical accuracy with a small fraction of the computational effort required by fully deterministic approaches. This efficiency opens up new possibilities for studying larger systems or employing more extensive basis sets that were previously beyond reach due to computational constraints.
-Additionally, the implementation of this algorithm has proven to be highly parallelizable, demonstrating excellent scalability across different high-performance computing platforms.
+Additionally, the implementation of this algorithm has proven to be highly parallelizable, demonstrating excellent scalability across different platforms.
An important aspect of our investigation focused on the application of our algorithm to potential energy surface scanning.
Our method aligns well with recent findings suggesting the utility of numerous, less precise data points in constructing machine learning models.\cite{ceperley_2024}