666 lines
22 KiB
Org Mode
666 lines
22 KiB
Org Mode
#+TITLE: Libraries developed in the TREX CoE
|
|
#+DATE: 08/10/2021
|
|
#+AUTHOR: A. Scemama$^1$, V.G. Chilkuri$^1$, E. Posenitskiy$^1$, P. de Oliveira Castro$^2$, C. Valensi$^2$, W. Jalby$^2$
|
|
|
|
#+LaTeX_HEADER: \institute{$^1$University of Toulouse/CNRS, LCPQ (France) \\
|
|
#+LaTeX_HEADER: $^2$University of Versailles, Li-PaRAD (France)}
|
|
#+LATEX_CLASS: beamer
|
|
#+LaTeX_CLASS_OPTIONS:[aspectratio=169]
|
|
#+BEAMER_THEME: trex
|
|
#+LaTeX_HEADER: \usepackage{minted}
|
|
#+LaTeX_HEADER: \usemintedstyle{emacs}
|
|
#+LaTeX_HEADER: \newminted{f90}{fontsize=\footnotesize}
|
|
#+LaTeX_HEADER: \usepackage[utf8]{inputenc}
|
|
#+LaTeX_HEADER: \usepackage[T1]{fontenc}
|
|
#+LaTeX_HEADER: \usepackage{hyperref}
|
|
#+LaTeX_HEADER: \usepackage{mathtools}
|
|
#+LaTeX_HEADER: \usepackage{physics}
|
|
#+LaTeX_HEADER: \definecolor{darkgreen}{rgb}{0.,0.6,0.}
|
|
#+LaTeX_HEADER: \definecolor{darkblue}{rgb}{0.,0.2,0.7}
|
|
#+LaTeX_HEADER: \definecolor{darkred}{rgb}{0.6,0.1,0.1}
|
|
#+LaTeX_HEADER: \definecolor{darkpink}{rgb}{0.7,0.0,0.7}
|
|
#+LaTeX_HEADER: \newcommand{\coord }{{\bf r}_1, \dots, {\bf r}_N }
|
|
#+LaTeX_HEADER: \newcommand{\dcoord }{\dd {\bf r}_1 \dots \dd{\bf r}_N }
|
|
|
|
#+LaTeX_HEADER: \usepackage[backend=biber,style=alphabetic,autocite=plain,sorting=none]{biblatex}
|
|
#+LaTeX_HEADER: \addbibresource{verificarlo.bib}
|
|
#+LaTeX_HEADER: \usepackage{graphicx}
|
|
#+LaTeX_HEADER: \usepackage[many]{tcolorbox}
|
|
#+LaTeX_HEADER: \usepackage{tikz}
|
|
#+LaTeX_HEADER: \usetikzlibrary{tikzmark,positioning}
|
|
#+LaTeX_HEADER: \definecolor{grey}{RGB}{170,170,170}
|
|
|
|
#+EXPORT_EXCLUDE_TAGS: noexport
|
|
|
|
#+startup: beamer
|
|
#+options: H:2 toc:nil
|
|
|
|
* QMC in TREX
|
|
** QMC in TREX
|
|
|
|
#+LATEX: \begin{exampleblock}{QMC: Quantum Monte Carlo methods}
|
|
- Highly accurate methods
|
|
- Massively parallelisable (multiple QMC trajectories)
|
|
- Very CPU intensive: One of the most "compute-hungry" methods
|
|
- Still under development: scientists need to run /and/ develop code
|
|
- Input data is complex (wave function)
|
|
#+LATEX: \end{exampleblock}
|
|
|
|
#+LATEX: \begin{exampleblock}{Objective: Make codes ready for exascale}
|
|
How: Instead of re-writing codes, provide libraries (free software)
|
|
1. *TREXIO*: A library for exchanging information between codes
|
|
$\Longrightarrow$ Enables HTC
|
|
2. *QMCkl*: A library for high-performance $\Longrightarrow$ Enables HPC
|
|
#+LATEX: \end{exampleblock}
|
|
|
|
** Quantum Monte Carlo (QMC)
|
|
|
|
#+BEGIN_SRC latex
|
|
\alert{Problem}: Stochastic resolution of the Schr\"odinger equation for $N$ electrons
|
|
\begin{eqnarray}
|
|
E &= &\frac{\int \dcoord \Phi(\coord) {\cal H} \Phi(\coord)}
|
|
{\int \dcoord \Phi(\coord) \Phi(\coord)} \nonumber \\
|
|
&\sim & \sum \frac{ {\cal H}\Psi(\coord )}{\Psi(\coord )}
|
|
\text{, sampled with } (\Psi \times \Phi)
|
|
\nonumber
|
|
\end{eqnarray}
|
|
\begin{columns}
|
|
\begin{column}{.5\textwidth}
|
|
\begin{itemize}
|
|
\item[$\cal H $: ] Hamiltonian operator
|
|
\item[$E$: ] Energy
|
|
\end{itemize}
|
|
\end{column}
|
|
\begin{column}{.4\textwidth}
|
|
\begin{itemize}
|
|
\item[$\coord $: ] Electron coordinates
|
|
\item[$\Phi $: ] Almost exact wave function
|
|
\item[$\Psi $: ] Trial wave function
|
|
\end{itemize}
|
|
\end{column}
|
|
\end{columns}
|
|
#+END_SRC
|
|
|
|
** Quantum Monte Carlo (QMC)
|
|
|
|
#+LATEX: \begin{columns}
|
|
#+LATEX: \begin{column}{0.4\textwidth}
|
|
- Very low memory requirements (no integrals)
|
|
- Distribute walkers on different cores or compute nodes
|
|
- No blocking communication: near-ideal scaling
|
|
- Difficulty to parallelize within a QMC trajectory: depends on the
|
|
number of electrons
|
|
#+LATEX: \end{column}
|
|
#+LATEX: \begin{column}{0.6\textwidth}
|
|
#+ATTR_LATEX: :width \textwidth
|
|
[[./Qmc.png]]
|
|
#+LATEX: \end{column}
|
|
#+LATEX: \end{columns}
|
|
|
|
** Both libraries
|
|
*** Three objectives
|
|
1. *Productivity* \\
|
|
Usable and useful by scientists in different programming languages
|
|
2. *Portability* \\
|
|
Target: all HPC systems (CPU, GPU, ARM, x86, etc.)
|
|
3. *Performance* \\
|
|
Must be efficient on all architectures: possible tradeoffs
|
|
between portability and performance
|
|
|
|
*** Free (libre) software
|
|
- Requirement for open science
|
|
- BSD license for adoption by any software (academic, commercial, ...)
|
|
|
|
* TREXIO: I/O library
|
|
** TREXIO: I/O library
|
|
|
|
#+LATEX: \begin{columns}
|
|
#+LATEX: \begin{column}{0.4\textwidth}
|
|
#+LATEX: \begin{exampleblock}{Before}
|
|
#+BEGIN_SRC dot :output file :file interfaces.png
|
|
digraph G {
|
|
QP [label="Quantum Package"];
|
|
QMCCHEM [label="QMC=Chem"];
|
|
Turbo [label="TurboRVB"];
|
|
QP -> NECI;
|
|
NECI -> GammCor [style="dotted"];
|
|
NECI -> QMCCHEM [style="dotted"] ;
|
|
QP -> QMCCHEM;
|
|
QP -> CHAMP;
|
|
QP -> GammCor [style="dotted"];
|
|
QP -> Turbo [style="dotted"];
|
|
NECI -> Turbo [style="dotted"];
|
|
NECI -> CHAMP [style="dotted"];
|
|
QMCCHEM -> GammCor [style="dotted"];
|
|
CHAMP -> GammCor [style="dotted"];
|
|
Turbo -> GammCor [style="dotted"];
|
|
}
|
|
#+END_SRC
|
|
#+RESULTS:
|
|
[[file:interfaces.png]]
|
|
#+LATEX: \end{exampleblock}
|
|
#+LATEX: \end{column}
|
|
#+LATEX: \begin{column}{0.6\textwidth}
|
|
#+LATEX: \begin{exampleblock}{After}
|
|
#+BEGIN_SRC dot :output file :file interfaces2.png
|
|
digraph G {
|
|
layout=circo;
|
|
External [label="External\ncodes"];
|
|
QP [label="Quantum\nPackage"];
|
|
QMCCHEM [label="QMC=Chem"];
|
|
Turbo [label="TurboRVB"];
|
|
TREX [label="TREXIO File", shape="box"];
|
|
CHAMP -> TREX;
|
|
GammCor -> TREX;
|
|
NECI -> TREX;
|
|
QMCCHEM -> TREX;
|
|
QP -> TREX;
|
|
Turbo -> TREX;
|
|
External -> TREX;
|
|
|
|
TREX -> CHAMP;
|
|
TREX -> GammCor;
|
|
TREX -> NECI;
|
|
TREX -> QMCCHEM;
|
|
TREX -> QP;
|
|
TREX -> Turbo;
|
|
TREX -> External;
|
|
}
|
|
#+END_SRC
|
|
#+RESULTS:
|
|
[[file:interfaces2.png]]
|
|
#+LATEX: \end{exampleblock}
|
|
#+LATEX: \end{column}
|
|
#+LATEX: \end{columns}
|
|
|
|
(BSD license) \\
|
|
[[https://github.com/trex-coe/trexio]]
|
|
|
|
** TREXIO: I/O library
|
|
|
|
#+LATEX: \begin{exampleblock}{Front end}
|
|
- Definition of an API for to read/write wave functions
|
|
- C-compatible API: Easy usage in all common languages
|
|
#+LATEX: \end{exampleblock}
|
|
#+LATEX: \begin{columns}
|
|
#+LATEX: \begin{column}{0.5\textwidth}
|
|
#+ATTR_LATEX: :width \textwidth
|
|
[[./api.png]]
|
|
#+LATEX: \end{column}
|
|
#+LATEX: \begin{column}{0.5\textwidth}
|
|
#+LATEX: \begin{exampleblock}{Back end}
|
|
- HDF5: Efficient I/O
|
|
- Text:
|
|
- Fallback when HDF5 can't be installed
|
|
- Debugging
|
|
- Version control systems
|
|
#+LATEX: \end{exampleblock}
|
|
#+LATEX: \end{column}
|
|
#+LATEX: \end{columns}
|
|
|
|
** Content of the files
|
|
|
|
- File is *self-contained*: no external knowledge needed to compute
|
|
$\Psi(r_1,\dots,r_n)$ (normalization factors, basis et
|
|
parameters, /etc/)
|
|
- *Strong conventions* (atomic units, ordering of atomic orbitals, etc.)
|
|
- The data stored in the files is organized in different *groups*:
|
|
|
|
| Metadata | Electron | Slater Determinants |
|
|
| Nucleus | Basis | CI coefficients |
|
|
| AO | MO | Two-electron integrals |
|
|
| One-electron integrals | Density matrices | ECP |
|
|
- Each group contains multiple *attributes*: information related to the
|
|
group
|
|
|
|
** Source code :noexport:
|
|
|
|
- For each attribute :
|
|
#+begin_src c
|
|
trexio_exit_code trexio_[has/read/write]_<group>_<attribute>
|
|
(trexio_t* file, <type> attribute)
|
|
#+end_src
|
|
- The library can be auto-generated by a script as the function names can
|
|
be computed
|
|
- Productivity : Literate programming with Org-mode \\
|
|
Table $\rightarrow$ JSON $\rightarrow$ C \\
|
|
\phantom{Table} $\rightarrow$ Documentation
|
|
- Fortran and Python/Numpy interfaces are also generated
|
|
- Performance : HDF5 back end
|
|
- Portability : Only optional dependency is HDF5
|
|
|
|
** Source code :noexport:
|
|
|
|
Productivity:
|
|
|
|
#+ATTR_LATEX: :width \textwidth
|
|
[[./trexio-doc1.png]]
|
|
|
|
** Documentation :noexport:
|
|
|
|
#+ATTR_LATEX: :height \textheight
|
|
[[./trexio-doc2.png]]
|
|
|
|
* QMCkl: QMC kernel library
|
|
|
|
** QMC kernel library
|
|
|
|
*** Computational kernels
|
|
- QMCkl will contain the main kernels of QMC methods: Domain
|
|
specific library, end-user driven
|
|
- Written together by QMC experts and HPC experts
|
|
- Multiple high performance implementations of the kernels, tuned
|
|
for different
|
|
- architectures: portability is critical for users
|
|
- problem sizes: from small to large systems
|
|
- requested accuracy: reduced precision
|
|
|
|
** Objectives
|
|
|
|
- The code must stay easy to understand by the physicists/chemists.
|
|
Performance-related aspects should be delegated to the library
|
|
- Scientists should be able to use their preferred language
|
|
- Scientists should not lose control of their codes
|
|
- Codes should not die when the architecture changes
|
|
- Scientific code development should not kill the performance
|
|
- Reuse of the optimization effort among the community
|
|
|
|
** Functionality and performance
|
|
|
|
- Keeping high /productivity/, /portability/ and /performance/ is very
|
|
hard in a single piece of software.
|
|
|
|
We propose (at least) two implementations:
|
|
|
|
1. *Documentation library* \\
|
|
Easy to read, understand, modify for scientists, not necessarily efficient.
|
|
2. *High performance libraries* \\
|
|
Efficient on a given architecture, but not necessarily
|
|
readable by physicists/chemists. \\
|
|
Performance within 10% to maximize portability and simplicity.
|
|
3. *Ultra-High performance libraries* \\
|
|
Generated with auto-tuning tools for well identified datasets.
|
|
|
|
- Both /Documentation/ and /High performance/ have the same API
|
|
(similar to BLAS on netlib /vs/ MKL).
|
|
|
|
- Scientific progress is made in the documentation library, and
|
|
implemented in the HPC versions when the API is stabilized.
|
|
|
|
- Performance: enable a data-driven task-based parallelism
|
|
|
|
** Documentation library :noexport:
|
|
Literate programming with Org-mode:
|
|
- Comments are more important than code
|
|
- Can add graphics, \LaTeX formulas, tables, etc
|
|
- Documentation always synchronized with the code
|
|
- Some routines can be generated by embedded scripts
|
|
- Kernels are implemented in Fortran for readability
|
|
- The API is C-compatible: QMCkl appears like a C library
|
|
$\Longrightarrow$ can be used in all other languages
|
|
- Example: Prototyping in Julia
|
|
|
|
** Library design
|
|
|
|
- Creation of a /Context/ that keeps a consistent state of the
|
|
library (pointers to computed data, configuration parameters, etc.)
|
|
- Memory allocation is abstract:
|
|
#+begin_src c
|
|
void* qmckl_malloc(qmckl_context context, const qmckl_memory_info_struct info);
|
|
#+end_src
|
|
allows allocation on CPU/GPU by the HPC variants
|
|
- Low level functions: access to simple low-level functions leaving the
|
|
context untouched (no allocation, no modification in-place)
|
|
- High-level functions: let the library call multiple kernels in an
|
|
optimal way, possibly updating the context
|
|
- Use of IRP programming paradigm\footnote{http://arxiv.org/abs/0909.5012} to keep track of dependencies
|
|
between kernels: re-compute only what is necessary and store
|
|
computed data in the context
|
|
|
|
** Dependencies between kernels
|
|
|
|
#+LATEX: \begin{columns}
|
|
#+LATEX: \begin{column}{0.5\textwidth}
|
|
#+BEGIN_SRC dot :output file :file irp.png :export no
|
|
digraph G {
|
|
rankdir = BT;
|
|
E_pot -> E_loc;
|
|
E_kin -> E_loc;
|
|
V_NN -> E_pot;
|
|
V_eN -> E_pot;
|
|
V_ee -> E_pot;
|
|
Distance_ee -> V_ee;
|
|
Distance_eN -> V_eN;
|
|
Distance_NN -> V_NN;
|
|
ECP -> E_pot;
|
|
ECP_Local -> ECP;
|
|
ECP_Non_Local -> ECP;
|
|
Determinants -> E_kin;
|
|
Jastrow -> E_kin;
|
|
J_eN -> Jastrow;
|
|
J_ee -> Jastrow;
|
|
J_eeN -> Jastrow;
|
|
Distance_eN -> J_eN;
|
|
Distance_ee -> J_ee;
|
|
Distance_eN -> J_eeN;
|
|
Distance_ee -> J_eeN;
|
|
Det_up -> Determinants;
|
|
Det_down -> Determinants;
|
|
MOs -> Det_up;
|
|
MOs -> Det_down;
|
|
AOs -> MOs;
|
|
AO_radial -> AOs;
|
|
AO_angular -> AOs;
|
|
elec_coord -> Distance_ee;
|
|
elec_coord -> Distance_eN;
|
|
Distance_eN -> AO_radial;
|
|
Distance_eN -> AO_angular;
|
|
Determinants -> ECP_Non_Local;
|
|
}
|
|
#+END_SRC
|
|
#+LATEX: \end{column}
|
|
#+LATEX: \begin{column}{0.5\textwidth}
|
|
- Only the needed sub-graph is computed
|
|
- HPC: Each kernel is one/many parallel Task(s)
|
|
- HPC: Use OpenMP tasks or StarPU\footnote{C. Augonnet et al, doi:10.1002/cpe.1631} for hybrid architectures:
|
|
(StarPU handles very well asynchronous CPU-GPU transfers).
|
|
|
|
#+LATEX: \end{column}
|
|
#+LATEX: \end{columns}
|
|
|
|
** Use case: low-level
|
|
|
|
#+begin_src c
|
|
#include <qmckl.h>
|
|
|
|
// ...
|
|
qmckl_exit_code rc;
|
|
int64_t m, n, LDA, LDB, LDC;
|
|
// ...
|
|
double A[LDA*3];
|
|
double B[LDB*3];
|
|
double C[LDC*n];
|
|
// ...
|
|
|
|
context = qmckl_context_create();
|
|
|
|
// Compute inter-particle distances between xyz coordinates in A[m][3] and B[3][n]
|
|
// and store the result in C[m][n]
|
|
rc = qmckl_distance(context, 'N', 'T', m, n, A, LDA, B, LDB, C, LDC);
|
|
assert (rc == QMCKL_SUCCESS);
|
|
// ...
|
|
#+end_src
|
|
|
|
** Use case: high-level
|
|
|
|
#+begin_src c
|
|
#include <qmckl.h>
|
|
// ...
|
|
qmckl_exit_code rc;
|
|
double e_loc;
|
|
qmckl_context context;
|
|
|
|
context = qmckl_context_create();
|
|
|
|
// Store WF parameters in the context
|
|
rc = qmckl_read_trexio(context, trexio_filename);
|
|
assert (rc == QMCKL_SUCCESS);
|
|
|
|
// Set the electron coordinates in the context
|
|
rc = qmckl_set_electron_coord (context, 'N', elec_coord);
|
|
assert(rc == QMCKL_SUCCESS);
|
|
|
|
// Return the local energy at the current electron positions
|
|
rc = qmckl_get_local_energy(context, &e_loc);
|
|
// ...
|
|
#+end_src
|
|
|
|
#+RESULTS:
|
|
: /home/scemama/MEGA/TEX/Presentations/2021/Intel/scemama.pdf
|
|
|
|
** Development strategy
|
|
|
|
1. Kernel extraction: QMC specialists agree on the
|
|
mathematical expression of the problem
|
|
2. A mini-application is written to find the optimal data layout
|
|
with HPC experts from real-size examples
|
|
3. The kernel is written in the documentation library
|
|
4. The documentation library is linked in a QMC code to check
|
|
correctness and numerical accuracy
|
|
5. HPC experts provide an HPC version of the kernel
|
|
6. The HPC library is linked in the QMC codes of the CoE
|
|
|
|
** First application : 3-body Jastrow factor
|
|
|
|
#+LATEX: \newcommand{\Jeen}{J_{\text{een}}}
|
|
#+LATEX: \newcommand{\Nel}{N_{\text{elec}}}
|
|
#+LATEX: \newcommand{\Nat}{N_{\text{nucl}}}
|
|
#+LATEX: \newcommand{\Nord}{N_{\text{nord}}}
|
|
#+LATEX: \newcommand{\lmax}{p-k-2\delta_{k,0}}
|
|
#+LATEX: \newcommand{\br}{\mathbf{r}}
|
|
#+LATEX: \newcommand{\bR}{\mathbf{R}}
|
|
#+LATEX: \newcommand{\ttr}{\, \bar{\mathtt{r}}}
|
|
#+LATEX: \newcommand{\tR}{\, \bar{\mathtt{R}}}
|
|
#+LATEX: \newcommand{\tP}{\, \bar{\mathtt{P}}}
|
|
|
|
\[
|
|
\Jeen (\br,\bR) = \sum_{\alpha=1}^{\Nat} \sum_{i=1}^{\Nel} \sum_{j=1}^{i-1}
|
|
\sum_{p=2}^{\Nord} \sum_{k=0}^{p-1}
|
|
\sum_{l=0}^{\lmax} c_{lkp\alpha}
|
|
\left( {r}_{ij} \right)^k
|
|
\left[ \left( {R}_{i\alpha} \right)^l + \left( {R}_{j\alpha} \right)^l \right]
|
|
\left( {R}_{i\,\alpha} \, {R}_{j\alpha} \right)^{(p-k-l)/2}
|
|
\]
|
|
|
|
#+LATEX: \begin{columns}
|
|
#+LATEX: \begin{column}{0.5\textwidth}
|
|
#+ATTR_LATEX: :width \textwidth
|
|
[[./speedup.pdf]]
|
|
#+LATEX: \end{column}
|
|
#+LATEX: \begin{column}{0.5\textwidth}
|
|
- Gradient and Laplacian are also required
|
|
- Up to $20\times$ faster than in the original code
|
|
- $\sim 80\%$ of the AVX-512 peak is reached using standard MKL on
|
|
Intel Skylake
|
|
- Expressed with a DGEMM kernel $\Longrightarrow$ also efficient on GPU
|
|
#+LATEX: \end{column}
|
|
#+LATEX: \end{columns}
|
|
|
|
** High-Performance strategies
|
|
|
|
*** Linear algebra hot spots
|
|
|
|
| GEMM | Rank-1 update | Matrix Inversion |
|
|
| GEMV | Diagonal of GEMM | Shermann-Morrison-Woodburry |
|
|
|
|
*** Matrices are relatively small ($\le 1000\times 1000$)
|
|
|
|
- Matrices are stored in tiled format fitting a block formulation
|
|
of the algorithms $\Longrightarrow$ task-based
|
|
linear algebra, interleaved computation of multiple kernels
|
|
- Tile sizes will be adjusted by auto-tuning
|
|
- Increase parallelism by aggregating multiple independent walkers
|
|
in matrices
|
|
- Needs fast linear algebra kernels for small matrices (tile size)
|
|
- For tiny matrices ($<5\times5$) specialized versions are implemented
|
|
|
|
** Example: Specialized DGEMM kernel I
|
|
|
|
*** Simple algorithm :B_block:BMCOL:
|
|
:PROPERTIES:
|
|
:BEAMER_env: block
|
|
:BEAMER_col: 0.45
|
|
:END:
|
|
- Simple micro kernel (*GotoDGEMM*\footnote{doi:10.1145/1356052.1356053})
|
|
- Code written using ~asm_volatile~ to force good code generation by
|
|
compilers
|
|
- *Tiling* scheme\footnote{doi:10.1109/ICPP.2015.29}
|
|
|
|
*** Tiling scheme :B_block:BMCOL:
|
|
:PROPERTIES:
|
|
:BEAMER_col: 0.45
|
|
:BEAMER_env: block
|
|
:END:
|
|
#+ATTR_LATEX: :width 5cm :height 5cm :keepaspectratio :right
|
|
[[./tiling_icpp2015.pdf]]
|
|
|
|
** Example: Specialized DGEMM kernel II
|
|
|
|
*** Benchmarks
|
|
|
|
- Comparison of MKL vs Specialized DGEMM
|
|
|
|
#+ATTR_LATEX: :height 4cm :keepaspectratio
|
|
[[./plot_percentage_vs_mkl_tiled_good.pdf]]
|
|
|
|
- Strong impact on MKL performance due to the number of consecutive executions
|
|
- Favorable comparison for MKL: Many consecutive executions to
|
|
amortize setup cost, JIT, Skylake CPU
|
|
|
|
** Why do we like our DGEMM?
|
|
|
|
- Open source code : can be modified easily
|
|
- Simple code (280 LOC)
|
|
- Decent performance with 10% of MKL
|
|
- Can be rewritten in different languages to increase
|
|
portability (MIPP\footnote{https://github.com/aff3ct/MIPP})
|
|
- Can be coupled with simple pack/unpack routines to handle different
|
|
data storage (tiled matrices)
|
|
- Allows to keep control on parallelism
|
|
- A good starting point for autotuning
|
|
|
|
** High-Performance strategies
|
|
|
|
*** Tuning
|
|
- Optimization is guided by analysis with *MAQAO*\footnote{https://maqao.org}.
|
|
- Specialized versions of critical hot-spots
|
|
- *MIPP* for portable intrinsics / specialized code generation
|
|
- Monitoring of the use of the library to choose most efficient versions
|
|
- Optimizations guided by monitoring numerical accuracy (*Verificarlo*\footnote{https://github.com/verificarlo/verificarlo})
|
|
|
|
** Efficiently guiding the developer
|
|
|
|
#+ATTR_LATEX: :width \textwidth
|
|
[[./maqao1.png]]
|
|
** Extensive/automatic testing of different configurations
|
|
|
|
#+ATTR_LATEX: :width \textwidth
|
|
[[./maqao2.png]]
|
|
|
|
* Summary
|
|
|
|
** Summary
|
|
- QMC codes integrated in an ecosystem of multiple codes for
|
|
high-accuracy quantum chemistry
|
|
- Development of open-source libraries to be used in the
|
|
TREX codes and beyond
|
|
- Libraries focus on /performance/, /portability/ and /productivity/
|
|
- Strategies to make the collaboration between physicists/chemists
|
|
and HPC experts optimal
|
|
|
|
|
|
* Bonus slides
|
|
|
|
#+INCLUDE: "verificarlo.tex" export latex
|
|
|
|
** Verificarlo CI
|
|
|
|
#+LATEX: \begin{columns}
|
|
#+LATEX: \begin{column}{0.5\textwidth}
|
|
#+LATEX: \begin{exampleblock}{Compare runs}
|
|
#+ATTR_LATEX: :width 0.85\textwidth
|
|
[[./img/cmp-runs.png]]
|
|
- Track precision of kernels over commits
|
|
- Shows significant digits $s$, standard deviation $\sigma$,
|
|
variable distribution
|
|
#+LATEX: \end{exampleblock}
|
|
#+LATEX: \end{column}
|
|
#+LATEX: \begin{column}{0.5\textwidth}
|
|
#+LATEX: \begin{exampleblock}{Inspect runs}
|
|
#+ATTR_LATEX: :width 0.85\textwidth
|
|
[[./img/inspect-runs.png]]
|
|
- Focus in depth on one particular run
|
|
- Compare multiple implementations of the same kernel
|
|
#+LATEX: \end{exampleblock}
|
|
#+LATEX: \end{column}
|
|
#+LATEX: \end{columns}
|
|
* Useful links :noexport:
|
|
|
|
| TREX web site | https://trex-coe.eu |
|
|
| TREXIO | https://github.com/trex-coe/trexio |
|
|
| QMCkl | https://github.com/trex-coe/qmckl |
|
|
| QMCkl documentation | https://trex-coe.github.io/qmckl |
|
|
| MAQAO | http://www.maqao.org |
|
|
| Verificarlo | https://github.com/verificarlo/verificarlo |
|
|
|
|
* TODO [19/21] Improvements :noexport:
|
|
|
|
- [X] Fully parallelisable
|
|
- [X] Un peu plus de details sur les I/O
|
|
- [X] portability: very important for users
|
|
- [X] portable using standards.
|
|
- [X] Decscription du contexte
|
|
- Performance
|
|
- Productivity
|
|
- Portability
|
|
- [X] Advantages => Objectives
|
|
- can stay : must stay
|
|
- Application specific library: end-user driven
|
|
- [X] How to make functionality and performance objectves go well along
|
|
together: perf et productivity
|
|
- [X] Two implementations:
|
|
1. Perf
|
|
2. Comment maintient le lien entre les deux
|
|
3. science in documentation
|
|
4. easy to read/maintain /and modify/
|
|
- [X] Problems with small matrices (at best matrices are ~400x400). A real problem for GPU.
|
|
- [X] Routines specialisees dans la perf
|
|
- dgemm_diag
|
|
- Rank-1 update
|
|
- dgemm
|
|
- Donnees d'entree standard, reformattees chez nous (tiles)
|
|
- [X] Arbre de dependence entre les variables
|
|
- [X] On pense aux outils de performance qui vont nous permettre de
|
|
passer a l'echelle
|
|
- [X] On introduit un monitor de l'utilisation:
|
|
- [X] Qu'est ce qui est utilise? ou pas? => MKL_VERBOSE. Garder un log des
|
|
appels
|
|
- [X] Reporting on the use and the performance
|
|
|
|
- [X] Doc -> HPC: On s'appuie sur des outils d'analyze de perf, MIPP,
|
|
intrinsics, limited autotuning
|
|
|
|
- [ ] Recompilation du code on the fly: PGO.
|
|
|
|
- [X] Outils qui checkent la perf et la numerical accuracy.
|
|
|
|
- [ ] Combien de routines? 10/20/30??? Versions specialisees.
|
|
|
|
- [X] On cherche de la perf within 10%. On en laisse sur la table
|
|
|
|
- [X] Multiple verions: CPU, GPU
|
|
|
|
* Export :noexport:
|
|
#+BEGIN_SRC elisp :output none
|
|
(setq org-latex-listings 'minted)
|
|
(setq org-latex-custom-lang-environments
|
|
'(
|
|
(f90 "fortran")
|
|
))
|
|
(setq org-latex-minted-options
|
|
'(("frame" "lines")
|
|
("fontsize" "\\scriptsize")
|
|
("linenos" "")))
|
|
(setq org-latex-to-pdf-process
|
|
'("pdflatex -shell-escape -interaction nonstopmode -output-directory %o %f"
|
|
"pdflatex -shell-escape -interaction nonstopmode -output-directory %o %f"
|
|
"pdflatex -shell-escape -interaction nonstopmode -output-directory %o %f"))
|
|
(org-beamer-export-to-pdf)
|
|
#+END_SRC
|
|
|
|
#+RESULTS:
|
|
: /home/scemama/MEGA/TEX/Presentations/2021/Intel/scemama.pdf
|
|
|
|
|
|
|