% Created 2021-10-04 Mon 16:33
% Intended LaTeX compiler: pdflatex
\documentclass[aspectratio=169]{beamer}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{graphicx}
\usepackage{grffile}
\usepackage{longtable}
\usepackage{wrapfig}
\usepackage{rotating}
\usepackage[normalem]{ulem}
\usepackage{amsmath}
\usepackage{textcomp}
\usepackage{amssymb}
\usepackage{capt-of}
\usepackage{hyperref}
\institute{$^1$University of Toulouse/CNRS, LCPQ (France) \\
$^2$University of Versailles, Li-PaRAD (France)}
\usepackage{minted}
\usemintedstyle{emacs}
\newminted{f90}{fontsize=\footnotesize}
\usepackage{mathtools}
\usepackage{physics}
\definecolor{darkgreen}{rgb}{0.,0.6,0.}
\definecolor{darkblue}{rgb}{0.,0.2,0.7}
\definecolor{darkred}{rgb}{0.6,0.1,0.1}
\definecolor{darkpink}{rgb}{0.7,0.0,0.7}
\newcommand{\coord }{{\bf r}_1, \dots, {\bf r}_N }
\newcommand{\dcoord }{\dd {\bf r}_1 \dots \dd{\bf r}_N }
\usepackage[backend=biber,style=alphabetic,autocite=plain,sorting=none]{biblatex}
\addbibresource{verificarlo.bib}
\usepackage[many]{tcolorbox}
\usepackage{tikz}
\usetikzlibrary{tikzmark,positioning}
\definecolor{grey}{RGB}{170,170,170}
\usetheme{trex}
\author{A. Scemama\(^1\), V.G. Chilkuri\(^1\), E. Posenitskiy\(^1\), P. de Oliveira Castro\(^2\), C. Valensi\(^2\), W. Jalby\(^2\)}
\date{08/10/2021}
\title{Libraries developed in the TREX CoE}
\hypersetup{
pdfauthor={A. Scemama\(^1\), V.G. Chilkuri\(^1\), E. Posenitskiy\(^1\), P. de Oliveira Castro\(^2\), C. Valensi\(^2\), W. Jalby\(^2\)},
pdftitle={Libraries developed in the TREX CoE},
pdfkeywords={},
pdfsubject={},
pdfcreator={Emacs 26.3 (Org mode 9.4)},
pdflang={English}}
\begin{document}
\maketitle
\section{TREX: Targeting REal chemical accuracy at the EXascale}
\label{sec:org5b14751}
\begin{frame}[label={sec:orge8da598}]{TREX: Targeting REal chemical accuracy at the EXascale}
\begin{exampleblock}{QMC: Quantum Monte Carlo methods}
\begin{itemize}
\item Highly accurate methods
\item Massively parallelisable (multiple QMC trajectories)
\item Very CPU intensive: One of the most ``compute-hungry'' methods
\item Still under development: scientists need to run \emph{and} develop code
\item Input data is complex (wave function)
\end{itemize}
\end{exampleblock}
\begin{exampleblock}{Objective: Make codes ready for exascale}
How: Instead of re-writing codes, provide libraries (free software)
\begin{enumerate}
\item \alert{TREXIO}: A library for exchanging information between codes
\(\Longrightarrow\) Enables HTC
\item \alert{QMCkl}: A library for high-performance \(\Longrightarrow\) Enables HPC
\end{enumerate}
\end{exampleblock}
\end{frame}
\begin{frame}[label={sec:orgce5753d}]{Quantum Monte Carlo (QMC)}
\alert{Problem}: Stochastic resolution of the Schr\"odinger equation for $N$ electrons
\begin{eqnarray}
E &= &\frac{\int \dcoord \Phi(\coord) {\cal H} \Phi(\coord)}
{\int \dcoord \Phi(\coord) \Phi(\coord)} \nonumber \\
&\sim & \sum \frac{ {\cal H}\Psi(\coord )}{\Psi(\coord )}
\text{, sampled with } (\Psi \times \Phi)
\nonumber
\end{eqnarray}
\begin{columns}
\begin{column}{.5\textwidth}
\begin{itemize}
\item[$\cal H $: ] Hamiltonian operator
\item[$E$: ] Energy
\end{itemize}
\end{column}
\begin{column}{.4\textwidth}
\begin{itemize}
\item[$\coord $: ] Electron coordinates
\item[$\Phi $: ] Almost exact wave function
\item[$\Psi $: ] Trial wave function
\end{itemize}
\end{column}
\end{columns}
\end{frame}
\begin{frame}[label={sec:org6ed0682}]{Quantum Monte Carlo (QMC)}
\begin{columns}
\begin{column}{0.4\textwidth}
\begin{itemize}
\item Very low memory requirements (no integrals)
\item Distribute walkers on different cores or compute nodes
\item No blocking communication: near-ideal scaling
\item Difficulty: parallelize within a QMC trajectory
\end{itemize}
\end{column}
\begin{column}{0.6\textwidth}
\begin{center}
\includegraphics[width=\textwidth]{./Qmc.png}
\end{center}
\end{column}
\end{columns}
\end{frame}
\begin{frame}[label={sec:orgc59d3f5}]{Both libraries}
\begin{block}{Three objectives}
\begin{enumerate}
\item \alert{Productivity} \\
Used and developed by scientists in different languages
\item \alert{Portability} \\
Target: all HPC systems (CPU, GPU, ARM, x86, etc.)
\item \alert{Performance} \\
Must be efficient on all architectures
\end{enumerate}
\end{block}
\begin{block}{Free (libre) software}
\begin{itemize}
\item Requirement for open science
\item BSD license for adoption by any software (academic, commercial, \ldots{})
\end{itemize}
\end{block}
\end{frame}
\section{TREXIO: I/O library}
\label{sec:org88424a7}
\begin{frame}[label={sec:org9c9c2f0}]{TREXIO: I/O library}
\begin{columns}
\begin{column}{0.4\textwidth}
\begin{exampleblock}{Before}
\begin{center}
\includegraphics[width=.9\linewidth]{interfaces.png}
\end{center}
\end{exampleblock}
\end{column}
\begin{column}{0.6\textwidth}
\begin{exampleblock}{After}
\begin{center}
\includegraphics[width=.9\linewidth]{interfaces2.png}
\end{center}
\end{exampleblock}
\end{column}
\end{columns}
(BSD license) \\
\url{https://github.com/trex-coe/trexio}
\end{frame}
\begin{frame}[label={sec:org8aca922}]{TREXIO: I/O library}
\begin{exampleblock}{Front end}
\begin{itemize}
\item Definition of an API to read/write wave functions
\item C-compatible API: Easy usage in all common languages
\end{itemize}
\end{exampleblock}
\begin{columns}
\begin{column}{0.5\textwidth}
\begin{center}
\includegraphics[width=\textwidth]{./api.png}
\end{center}
\end{column}
\begin{column}{0.5\textwidth}
\begin{exampleblock}{Back end}
\begin{itemize}
\item HDF5: Efficient I/O
\item Text:
\begin{itemize}
\item Fallback when HDF5 can't be installed
\item Debugging
\item Version control systems
\end{itemize}
\end{itemize}
\end{exampleblock}
\end{column}
\end{columns}
\end{frame}
\begin{frame}[label={sec:orgbb567ff}]{Content of the files}
\begin{itemize}
\item File is \alert{self-contained}: no external knowledge is needed to compute
\(\Psi(\mathbf{r}_1,\dots,\mathbf{r}_N)\) (normalization factors, basis set
parameters, \emph{etc.})
\item \alert{Strong conventions} (atomic units, ordering of atomic orbitals, etc.)
\item The data stored in the files is organized in different \alert{groups}:
\begin{center}
\begin{tabular}{lll}
Metadata & Electron & Slater Determinants\\
Nucleus & Basis & CI coefficients\\
AO & MO & Two-electron integrals\\
One-electron integrals & Density matrices & ECP\\
\end{tabular}
\end{center}
\item Each group contains multiple \alert{attributes}
\end{itemize}
\end{frame}
\begin{frame}[label={sec:orgaacdd0f},fragile]{Source code}
\begin{itemize}
\item For each attribute (usage sketch on the next slide):
\begin{minted}[frame=lines,fontsize=\scriptsize,linenos]{c}
trexio_exit_code trexio_[has/read/write]_<group>_<attribute>
(trexio_t* file, <type> attribute)
\end{minted}
\item The library can be auto-generated by a script, since the function names
can be derived from the group and attribute names
\item Productivity: Literate programming with Org-mode \\
Table \(\rightarrow\) JSON \(\rightarrow\) C \\
\phantom{Table} \(\rightarrow\) Documentation
\item Fortran and Python/Numpy interfaces are also generated
\item Performance: HDF5 back end
\item Portability: HDF5 is the only (optional) dependency
\end{itemize}
\end{frame}
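\begin{frame}[fragile]{TREXIO: usage sketch}
A minimal, illustrative sketch of the generated API in use (function names
follow the \texttt{has/read/write} pattern of the previous slide; the integer
width and error handling are simplified here):
\begin{minted}[frame=lines,fontsize=\scriptsize,linenos]{c}
#include <trexio.h>
#include <assert.h>
// ...
trexio_exit_code rc;
int32_t nucl_num;

// Open a TREXIO file with the HDF5 back end, read-only
trexio_t* file = trexio_open("h2o.h5", 'r', TREXIO_HDF5, &rc);
assert (rc == TREXIO_SUCCESS);

// Check that the attribute is present, then read it
if (trexio_has_nucleus_num(file) == TREXIO_SUCCESS) {
  rc = trexio_read_nucleus_num(file, &nucl_num);
  assert (rc == TREXIO_SUCCESS);
}
rc = trexio_close(file);
// ...
\end{minted}
\end{frame}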
\begin{frame}[label={sec:org7802c31}]{Source code}
Productivity:
\begin{center}
\includegraphics[width=\textwidth]{./trexio-doc1.png}
\end{center}
\end{frame}
\begin{frame}[label={sec:orga268d6e}]{Documentation}
\begin{center}
\includegraphics[height=\textheight]{./trexio-doc2.png}
\end{center}
\end{frame}
\section{QMCkl: QMC kernel library}
\label{sec:org9bb0da1}
\begin{frame}[label={sec:org549026f}]{QMC kernel library}
\begin{block}{Computational kernels}
\begin{itemize}
\item QMCkl will contain the main kernels of QMC methods (domain-specific
library, end-user driven)
\item Written together by QMC experts and HPC experts
\item Multiple high performance implementations of the kernels, tuned
for different
\begin{itemize}
\item architectures (portability is critical for users)
\item problem sizes (from small to large systems)
\item requested accuracy (reduced precision)
\end{itemize}
\end{itemize}
\end{block}
\end{frame}
\begin{frame}[label={sec:org8cd96b3}]{Objectives}
\begin{itemize}
\item The code must stay easy to understand by the physicists/chemists.
Performance-related aspects are delegated to the library
\item Scientists should be able to use their preferred language
\item Scientists should not lose control on their codes
\item Codes should not die when the architecture changes
\item Scientific code development should not kill the performance
\item Reuse of the optimization effort among the community
\end{itemize}
\end{frame}
\begin{frame}[label={sec:org6609bc5}]{Functionality and performance}
\begin{itemize}
\item Keeping high \emph{productivity}, \emph{portability} and \emph{performance} is very
hard in a single piece of software.
We propose (at least) two implementations:
\begin{enumerate}
\item \alert{Documentation library} \\
Easy to read, understand, modify for scientists, not necessarily efficient.
\item \alert{High performance libraries} \\
Efficient on a given architecture, but not necessarily
readable by physicists/chemists. \\
Performance within 10\% to maximize portability and simplicity.
\end{enumerate}
\item Both \emph{Documentation} and \emph{High performance} have the same API
(similar to BLAS on netlib \emph{vs} MKL).
\item Scientific progress is made in the documentation library, and
implemented in the HPC versions when the API is stabilized.
\item Performance: enable a data-driven task-based parallelism
\end{itemize}
\end{frame}
\begin{frame}[label={sec:org33f426a},fragile]{Library design}
\begin{itemize}
\item Creation of a \emph{Context} that keeps a consistent state of the library
\item Memory allocation is abstract:
\begin{minted}[frame=lines,fontsize=\scriptsize,linenos]{c}
void* qmckl_malloc(qmckl_context context, const qmckl_memory_info_struct info);
\end{minted}
allows allocation on CPU/GPU by the HPC variants
\item Low-level functions: simple functions that leave the context
untouched (no allocation, no in-place modification)
\item High-level functions: let the library call multiple kernels in an
optimal way, possibly updating the context
\item Use of the IRP programming paradigm to keep track of dependencies
between kernels: re-compute only what is necessary (toy sketch on the next slide)
\end{itemize}
\end{frame}
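\begin{frame}[fragile]{Library design: the IRP idea (toy sketch)}
A toy illustration of the IRP dependency mechanism, not actual QMCkl code:
each quantity carries a validity flag, setters invalidate their dependents,
and getters recompute only what is stale.
\begin{minted}[frame=lines,fontsize=\scriptsize,linenos]{c}
#include <math.h>
#define N 8                                /* toy number of particles */

typedef struct {
  double coord[3*N];  int coord_valid;     /* input quantity          */
  double dist[N*N];   int dist_valid;      /* depends on coord        */
} toy_context;

/* Setter: store new coordinates, mark dependent quantities as stale */
void set_coord(toy_context* ctx, const double* xyz) {
  for (int i = 0; i < 3*N; ++i) ctx->coord[i] = xyz[i];
  ctx->coord_valid = 1;
  ctx->dist_valid  = 0;
}

/* Getter: recompute only when stale (stand-in kernel: x-distances) */
const double* get_dist(toy_context* ctx) {
  if (!ctx->dist_valid) {
    for (int i = 0; i < N; ++i)
      for (int j = 0; j < N; ++j)
        ctx->dist[N*i + j] = fabs(ctx->coord[3*i] - ctx->coord[3*j]);
    ctx->dist_valid = 1;
  }
  return ctx->dist;
}
\end{minted}
\end{frame}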
\begin{frame}[label={sec:org7f2e24e},fragile]{Use case: low-level}
\begin{minted}[frame=lines,fontsize=\scriptsize,linenos]{c}
#include <qmckl.h>
// ...
qmckl_exit_code rc;
qmckl_context context;
int64_t m, n, LDA, LDB, LDC;
// ...
double A[LDA*3];
double B[LDB*3];
double C[LDC*n];
// ...
context = qmckl_context_create();
// Compute inter-particle distances between xyz coordinates in A[m][3] and B[3][n]
// and store the result in C[m][n]
rc = qmckl_distance(context, 'N', 'T', m, n, A, LDA, B, LDB, C, LDC);
assert (rc == QMCKL_SUCCESS);
// ...
\end{minted}
\end{frame}
\begin{frame}[label={sec:orgf45dd18},fragile]{Use case: high-level}
\begin{minted}[frame=lines,fontsize=\scriptsize,linenos]{c}
#include <qmckl.h>
// ...
qmckl_exit_code rc;
double e_loc;
qmckl_context context;
context = qmckl_context_create();
// Store WF parameters in the context
rc = qmckl_read_trexio(context, trexio_filename);
assert (rc == QMCKL_SUCCESS);
// Set the electron coordinates in the context
rc = qmckl_set_electron_coord (context, 'N', elec_coord);
assert(rc == QMCKL_SUCCESS);
// Return the local energy at the current electron positions
rc = qmckl_get_local_energy(context, &e_loc);
// ...
\end{minted}
\end{frame}
\begin{frame}[label={sec:org8378951}]{Dependencies between kernels}
\begin{columns}
\begin{column}{0.5\textwidth}
\begin{center}
\includegraphics[width=.9\linewidth]{irp.png}
\end{center}
\end{column}
\begin{column}{0.5\textwidth}
\begin{itemize}
\item Only the needed sub-graph is computed
\item HPC: Each kernel is one/many parallel Task(s)
\item HPC: Use OpenMP tasks or StarPU\footnote{C. Augonnet et al, doi:10.1002/cpe.1631} for hybrid architectures
(StarPU handles asynchronous CPU-GPU transfers very well); see the task sketch on the next slide.
\end{itemize}
\end{column}
\end{columns}
\end{frame}
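\begin{frame}[fragile]{Dependencies between kernels: OpenMP task sketch}
A minimal sketch, not QMCkl code, of how the kernel dependency graph could be
expressed with OpenMP task dependencies; the kernels below are placeholders.
\begin{minted}[frame=lines,fontsize=\scriptsize,linenos]{c}
#include <omp.h>

/* Placeholder kernels standing in for real QMCkl kernels */
void compute_distances(const double* c, double* d) { d[0] = c[0]; }
void compute_jastrow  (const double* d, double* j) { j[0] = d[0]; }
void compute_slater   (const double* c, double* s) { s[0] = c[0]; }

void evaluate(double* coord, double* dist, double* jastrow, double* slater) {
#pragma omp parallel
#pragma omp single
  {
    /* One task per kernel; 'depend' clauses encode the graph edges */
#pragma omp task depend(in: coord[0]) depend(out: dist[0])
    compute_distances(coord, dist);

#pragma omp task depend(in: dist[0]) depend(out: jastrow[0])
    compute_jastrow(dist, jastrow);     /* waits for the distances */

#pragma omp task depend(in: coord[0]) depend(out: slater[0])
    compute_slater(coord, slater);      /* independent: may run concurrently */

#pragma omp taskwait
  }
}
\end{minted}
\end{frame}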
\begin{frame}[label={sec:orgd876ae6}]{Development strategy}
\begin{enumerate}
\item Kernel extraction: QMC specialists agree on the
mathematical expression of the problem
\item A mini-application is written to find the optimal data layout
with HPC experts from real-size examples
\item The kernel is written in the documentation library
\item The documentation library is linked in a QMC code to check correctness
\item HPC experts provide an HPC version of the kernel
\item The HPC library is linked in the QMC codes of the CoE
\end{enumerate}
\end{frame}
\begin{frame}[label={sec:org2b9e52e}]{Documentation library}
Literate programming with Org-mode:
\begin{itemize}
\item Comments are more important than code
\item Can add graphics, \LaTeX{} formulas, tables, etc.
\item Documentation always synchronized with the code
\item Some routines can be generated by embedded scripts
\item Kernels are implemented in Fortran for readability
\item The API is C-compatible: QMCkl appears like a C library
\(\Longrightarrow\) can be used in all other languages
\item Example: Prototyping in Julia
\end{itemize}
\end{frame}
\begin{frame}[label={sec:orgda06b11}]{High-Performance strategies}
\begin{block}{Linear algebra hot spots}
\begin{center}
\begin{tabular}{lll}
GEMM & Rank-1 update & Matrix Inversion\\
GEMV & Diagonal of GEMM & Sherman-Morrison-Woodbury\\
\end{tabular}
\end{center}
\end{block}
\begin{block}{Matrices are relatively small (\(\le 1000\times 1000\))}
\begin{itemize}
\item Matrices are stored in tiled format \(\Longrightarrow\) task-based
linear algebra, interleaving the computation of multiple kernels
\item Increase parallelism by aggregating multiple independent walkers
in matrices
\item Needs fast linear algebra kernels for small matrices
(Sherman-Morrison sketch on the next slide)
\end{itemize}
\end{block}
\end{frame}
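\begin{frame}[fragile]{High-Performance strategies: Sherman-Morrison sketch}
The Sherman-Morrison rank-1 update in its textbook form, as a plain C sketch
(not the tuned QMCkl kernel): given \(A^{-1}\), the inverse of \(A + u v^{T}\)
is obtained in \(\mathcal{O}(n^2)\) operations.
\begin{minted}[frame=lines,fontsize=\scriptsize,linenos]{c}
#include <stdlib.h>
#include <math.h>

/* Given Ainv = A^{-1} (n x n, row-major), overwrite it in place with
   (A + u v^T)^{-1}.  Returns 0 on success, -1 if the update is singular. */
int sherman_morrison(double* Ainv, const double* u, const double* v, int n) {
  double* Au = malloc(n * sizeof(double));   /* A^{-1} u         */
  double* vA = malloc(n * sizeof(double));   /* v^T A^{-1}       */
  double den = 1.0;                          /* 1 + v^T A^{-1} u */

  for (int i = 0; i < n; ++i) {
    Au[i] = 0.0;
    for (int j = 0; j < n; ++j) Au[i] += Ainv[i*n + j] * u[j];
    den += v[i] * Au[i];
  }
  if (fabs(den) < 1.0e-15) { free(Au); free(vA); return -1; }

  for (int j = 0; j < n; ++j) {
    vA[j] = 0.0;
    for (int i = 0; i < n; ++i) vA[j] += v[i] * Ainv[i*n + j];
  }
  for (int i = 0; i < n; ++i)                /* rank-1 correction */
    for (int j = 0; j < n; ++j)
      Ainv[i*n + j] -= Au[i] * vA[j] / den;

  free(Au); free(vA);
  return 0;
}
\end{minted}
\end{frame}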
\begin{frame}[label={sec:orga9a5f05}]{High-Performance strategies}
\begin{block}{Tuning}
\begin{itemize}
\item Optimization is guided by analysis with \alert{MAQAO}\footnote{https://maqao.org}.
\item Specialized versions of critical hot-spots
\item MIPP for portable intrinsics / specialized code generation
\item Monitoring of the use of the library to choose the most efficient versions
\item Optimizations guided by monitoring numerical accuracy (\alert{Verificarlo}\footnote{https://github.com/verificarlo/verificarlo})
\end{itemize}
\end{block}
\end{frame}
\begin{frame}[label={sec:org1afd5ba}]{Example: Specialized DGEMM kernel}
VIJAY
\end{frame}
\begin{frame}[label={sec:orgd391c54}]{Efficiently guiding the developer}
\begin{center}
\includegraphics[width=\textwidth]{./maqao1.png}
\end{center}
\end{frame}
\begin{frame}[label={sec:org543da85}]{Extensive/automatic testing of different configurations}
\begin{center}
\includegraphics[width=\textwidth]{./maqao2.png}
\end{center}
\end{frame}
\begin{frame}[label={sec:org95699bb}]{First application : 3-body Jastrow factor}
\newcommand{\Jeen}{J_{\text{een}}}
\newcommand{\Nel}{N_{\text{elec}}}
\newcommand{\Nat}{N_{\text{nucl}}}
\newcommand{\Nord}{N_{\text{nord}}}
\newcommand{\lmax}{p-k-2\delta_{k,0}}
\newcommand{\br}{\mathbf{r}}
\newcommand{\bR}{\mathbf{R}}
\newcommand{\ttr}{\, \bar{\mathtt{r}}}
\newcommand{\tR}{\, \bar{\mathtt{R}}}
\newcommand{\tP}{\, \bar{\mathtt{P}}}
\[
\Jeen (\br,\bR) = \sum_{\alpha=1}^{\Nat} \sum_{i=1}^{\Nel} \sum_{j=1}^{i-1}
\sum_{p=2}^{\Nord} \sum_{k=0}^{p-1}
\sum_{l=0}^{\lmax} c_{lkp\alpha}
\left( {r}_{ij} \right)^k
\left[ \left( {R}_{i\alpha} \right)^l + \left( {R}_{j\alpha} \right)^l \right]
\left( {R}_{i\,\alpha} \, {R}_{j\alpha} \right)^{(p-k-l)/2}
\]
\begin{columns}
\begin{column}{0.5\textwidth}
\begin{center}
\includegraphics[width=\textwidth]{./speedup.pdf}
\end{center}
\end{column}
\begin{column}{0.5\textwidth}
\begin{itemize}
\item Gradient and Laplacian are also required
\item Up to \(20\times\) faster than in the original code
\item \(\sim 80\%\) of the AVX-512 peak is reached
\item Expressed with a DGEMM kernel \(\Longrightarrow\) also efficient on GPU
\end{itemize}
\end{column}
\end{columns}
\end{frame}
\begin{frame}[fragile]{Numerical analysis with Verificarlo}
\textbf{Verificarlo} is a tool for assessing the precision of floating-point operations.
It can be used to:
\begin{columns}
\column{0.3\textwidth}
{\centering
\includegraphics[width=80px, keepaspectratio]{img/verificarlo.png}
}\\%
{\footnotesize
\url{https://github.com/verificarlo/verificarlo} GPL v3 \\
}
\column{0.7\textwidth}
\begin{itemize}
\item \textbf{Find numerical bugs} in codes \footnotemark[1]
\begin{itemize}
\item Stochastic arithmetic to simulate round-off and cancellations
\item Localization techniques to pinpoint source of errors
\end{itemize}
\item \textbf{Optimize precision} \footnotemark[2]
\begin{itemize}
\item Simulate custom formats for mixed precision \\(float, bf16)
\item Tune precision in math library calls
\end{itemize}
\end{itemize}
\end{columns}
\footnotetext[1]{
C. Denis \textit{et al.} \href{https://dx.doi.org/10.1109/ARITH.2016.31}{doi:10.1109/ARITH.2016.31}
}
\footnotetext[2]{
Y Chatelain \textit{et al.} \href{https://dx.doi.org/10.1007/978-3-030-29400-7\_34}{doi:10.1007/978-3-030-29400-7\_34}
}
\end{frame}
\begin{frame}[fragile]{The Verificarlo pipeline}
\begin{itemize}
\item Each Floating-Point (FP) operation may introduce a $\delta$ error
$$ z = \mathrm{fl}(x+y) = (x+y)(1+\delta) $$
\item When chaining multiple operations, errors can accumulate and snowball
\item \structure{Monte Carlo Arithmetic key principle}
\begin{itemize}
\item Make $\delta$ a \structure{random variable}
\item Use a Monte Carlo simulation to empirically estimate the FP error distribution (toy sketch on the next slide)
\end{itemize}
\end{itemize}
\begin{center}
\includegraphics[width=.8\textwidth]{img/verificarlo_pipeline.png}
\end{center}
\end{frame}
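\begin{frame}[fragile]{Monte Carlo Arithmetic: toy example}
A toy illustration of the MCA principle, not Verificarlo itself (which
instruments the compiled code): every addition is perturbed by a random
relative error \(\delta\), and repeating the run exposes the error distribution.
\begin{minted}[frame=lines,fontsize=\scriptsize,linenos]{c}
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

/* Perturb each addition with a random relative error of magnitude ~2^-23,
   simulating single-precision round-off                                  */
static double mca_add(double x, double y) {
  double delta = ((double)rand() / RAND_MAX - 0.5) * pow(2.0, -23);
  return (x + y) * (1.0 + delta);
}

int main(void) {
  for (int sample = 0; sample < 5; ++sample) {
    /* Catastrophic cancellation: the exact result is 1.0 */
    double s = mca_add(1.0e8, 1.0);
    s = mca_add(s, -1.0e8);
    printf("sample %d: %f\n", sample, s);  /* the spread reveals lost digits */
  }
  return 0;
}
\end{minted}
\end{frame}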
\begin{frame}{Continuous-Integration precision tracking}
\begin{itemize}
\item Each push to \structure{QMCkl} triggers a Verificarlo analysis.
\item QMCkl kernel unit tests are augmented with probes:
\begin{itemize}
\item track the precision of a scalar value
\item ensure that a target precision is reached
\end{itemize}
\end{itemize}
\vspace{2cm}\vfill
vfc\_probe(\tikzmark{kernel}"Sherman-Morrison", \tikzmark{var}"residual", res) \\
vfc\_probe\_assert("Sherman-Morrison", "res", res, \tikzmark{target}1e-7)
\begin{tikzpicture}[
remember picture,
overlay,
expl/.style={draw=orange,fill=orange!30,rounded corners,text width=3cm},
arrow/.style={red!80!black,ultra thick,->,>=latex}
]
\node[expl]
(kernelex)
at (2,3cm)
{Kernel name};
\node[expl]
(varex)
at (7,3cm)
{Variable name};
\node[expl]
(targetex)
at (12,3cm)
{Target precision};
\draw[arrow]
(kernelex.south) to[out=-90,in=90] ([yshift=1.2ex, xshift=1.7cm]{pic cs:kernel});
\draw[arrow]
(varex.south) to[out=-90,in=90] ([yshift=1.2ex, xshift=1cm]{pic cs:var});
\draw[arrow]
(targetex.south) to[out=-90,in=90] ([yshift=1.2ex, xshift=.5cm]{pic cs:target});
\end{tikzpicture}
\end{frame}
\begin{frame}[label={sec:orge29c5eb}]{Verificarlo CI}
\begin{columns}
\begin{column}{0.5\textwidth}
\begin{exampleblock}{Compare runs}
\begin{center}
\includegraphics[width=0.85\textwidth]{./img/cmp-runs.png}
\end{center}
\begin{itemize}
\item Track precision of kernels over commits
\item Shows significant digits \(s\), standard deviation \(\sigma\),
variable distribution
\end{itemize}
\end{exampleblock}
\end{column}
\begin{column}{0.5\textwidth}
\begin{exampleblock}{Inspect runs}
\begin{center}
\includegraphics[width=0.85\textwidth]{./img/inspect-runs.png}
\end{center}
\begin{itemize}
\item Focus in depth on one particular run
\item Compare multiple implementations of the same kernel
\end{itemize}
\end{exampleblock}
\end{column}
\end{columns}
\end{frame}
\end{document}