Added two slides on DGEMM.
This commit is contained in:
parent
9046dc8702
commit
877feadb7b
BIN
plot_percentage_vs_mkl_tiled_good.pdf
Normal file
BIN
plot_percentage_vs_mkl_tiled_good.pdf
Normal file
Binary file not shown.
32
scemama.org
32
scemama.org
@ -447,9 +447,33 @@ digraph G {
|
||||
- Monitoring of the use of the library to choose most efficient versions
|
||||
- Optimizations guided by monitoring numerical accuracy (*Verificarlo*\footnote{https://github.com/verificarlo/verificarlo})
|
||||
|
||||
** Example: Specialized DGEMM kernel
|
||||
** Example: Specialized DGEMM kernel I
|
||||
|
||||
VIJAY
|
||||
*** Simple algorithm :B_block:BMCOL:
|
||||
:PROPERTIES:
|
||||
:BEAMER_env: block
|
||||
:BEAMER_col: 0.45
|
||||
:END:
|
||||
- Simple micro kernel (*GotoDGEMM*\footnote{doi:10.1145/1356052.1356053})
|
||||
- Code generation
|
||||
- *Tiling* scheme\footnote{doi:10.1109/ICPP.2015.29}
|
||||
|
||||
*** Tiling scheme :B_block:BMCOL:
|
||||
:PROPERTIES:
|
||||
:BEAMER_col: 0.45
|
||||
:BEAMER_env: block
|
||||
:END:
|
||||
#+ATTR_LATEX: :width 5cm :height 5cm :options keepaspectratio :right
|
||||
[[./tiling_icpp2015.pdf]]
|
||||
|
||||
** Example: Specialized DGEMM kernel II
|
||||
|
||||
*** Benchmarks
|
||||
|
||||
- Comparison of MKL vs Specialized DGEMM
|
||||
|
||||
#+ATTR_LATEX: :width 10cm :height 6cm :options keepaspectratio
|
||||
[[./plot_percentage_vs_mkl_tiled_good.pdf]]
|
||||
|
||||
** Efficiently guiding the developer
|
||||
|
||||
@ -461,7 +485,7 @@ digraph G {
|
||||
[[./maqao2.png]]
|
||||
|
||||
** First application : 3-body Jastrow factor
|
||||
|
||||
|
||||
#+LATEX: \newcommand{\Jeen}{J_{\text{een}}}
|
||||
#+LATEX: \newcommand{\Nel}{N_{\text{elec}}}
|
||||
#+LATEX: \newcommand{\Nat}{N_{\text{nucl}}}
|
||||
@ -548,7 +572,7 @@ digraph G {
|
||||
#+END_SRC
|
||||
|
||||
#+RESULTS:
|
||||
: /home/scemama/MEGA/TEX/Presentations/2021/Intel/scemama.pdf
|
||||
: /home/vijay/Documents/presentations/pres_intel/scemama.pdf
|
||||
|
||||
|
||||
* TODO [19/21] Improvements :noexport:
|
||||
|
104
scemama.tex
104
scemama.tex
@ -1,4 +1,4 @@
|
||||
% Created 2021-10-04 Mon 16:33
|
||||
% Created 2021-10-06 mer. 11:46
|
||||
% Intended LaTeX compiler: pdflatex
|
||||
\documentclass[aspectratio=169]{beamer}
|
||||
\usepackage[utf8]{inputenc}
|
||||
@ -46,20 +46,20 @@ $^2$University of Versailles, Li-PaRAD (France)}
|
||||
pdftitle={Libraries developed in the TREX CoE},
|
||||
pdfkeywords={},
|
||||
pdfsubject={},
|
||||
pdfcreator={Emacs 26.3 (Org mode 9.4)},
|
||||
pdfcreator={Emacs 27.1 (Org mode 9.5)},
|
||||
pdflang={English}}
|
||||
\begin{document}
|
||||
|
||||
\maketitle
|
||||
|
||||
\section{TREX: Targeting REal chemical accuracy at the EXascale}
|
||||
\label{sec:org5b14751}
|
||||
\begin{frame}[label={sec:orge8da598}]{TREX: Targeting REal chemical accuracy at the EXascale}
|
||||
\label{sec:org4b839d0}
|
||||
\begin{frame}[label={sec:orgbdf1b22}]{TREX: Targeting REal chemical accuracy at the EXascale}
|
||||
\begin{exampleblock}{QMC: Quantum Monte Carlo methods}
|
||||
\begin{itemize}
|
||||
\item Highly accurate methods
|
||||
\item Massively parallelisable (multiple QMC trajectories)
|
||||
\item Very CPU intensive: One of the most "compute-hungry" methods
|
||||
\item Very CPU intensive: One of the most ``compute-hungry'' methods
|
||||
\item Still under development: scientists need to run \emph{and} develop code
|
||||
\item Input data is complex (wave function)
|
||||
\end{itemize}
|
||||
@ -75,7 +75,7 @@ How: Instead of re-writing codes, provide libraries (free software)
|
||||
\end{exampleblock}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:orgce5753d}]{Quantum Monte Carlo (QMC)}
|
||||
\begin{frame}[label={sec:orga6495e2}]{Quantum Monte Carlo (QMC)}
|
||||
\alert{Problem}: Stochastic resolution of the Schr\"odinger equation for $N$ electrons
|
||||
\begin{eqnarray}
|
||||
E &= &\frac{\int \dcoord \Phi(\coord) {\cal H} \Phi(\coord)}
|
||||
@ -101,7 +101,7 @@ E &= &\frac{\int \dcoord \Phi(\coord) {\cal H} \Phi(\coord)}
|
||||
\end{columns}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:org6ed0682}]{Quantum Monte Carlo (QMC)}
|
||||
\begin{frame}[label={sec:org8ad33cb}]{Quantum Monte Carlo (QMC)}
|
||||
\begin{columns}
|
||||
\begin{column}{0.4\textwidth}
|
||||
\begin{itemize}
|
||||
@ -119,7 +119,7 @@ E &= &\frac{\int \dcoord \Phi(\coord) {\cal H} \Phi(\coord)}
|
||||
\end{columns}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:orgc59d3f5}]{Both libraries}
|
||||
\begin{frame}[label={sec:orgb43d37e}]{Both libraries}
|
||||
\begin{block}{Three objectives}
|
||||
\begin{enumerate}
|
||||
\item \alert{Productivity} \\
|
||||
@ -140,8 +140,8 @@ Must be efficient on all architectures
|
||||
\end{frame}
|
||||
|
||||
\section{TREXIO: I/O library}
|
||||
\label{sec:org88424a7}
|
||||
\begin{frame}[label={sec:org9c9c2f0}]{TREXIO: I/O library}
|
||||
\label{sec:orgbe916a5}
|
||||
\begin{frame}[label={sec:orgdf330cc}]{TREXIO: I/O library}
|
||||
\begin{columns}
|
||||
\begin{column}{0.4\textwidth}
|
||||
\begin{exampleblock}{Before}
|
||||
@ -163,7 +163,7 @@ Must be efficient on all architectures
|
||||
\url{https://github.com/trex-coe/trexio}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:org8aca922}]{TREXIO: I/O library}
|
||||
\begin{frame}[label={sec:orgaf241aa}]{TREXIO: I/O library}
|
||||
\begin{exampleblock}{Front end}
|
||||
\begin{itemize}
|
||||
\item Definition of an API to read/write wave functions
|
||||
@ -192,7 +192,7 @@ Must be efficient on all architectures
|
||||
\end{columns}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:orgbb567ff}]{Content of the files}
|
||||
\begin{frame}[label={sec:orgd025d67}]{Content of the files}
|
||||
\begin{itemize}
|
||||
\item File is \alert{self-contained}: no external knowledge needed to compute
|
||||
\(\Psi(r_1,\dots,r_n)\) (normalization factors, basis et
|
||||
@ -212,7 +212,7 @@ One-electron integrals & Density matrices & ECP\\
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:orgaacdd0f},fragile]{Source code}
|
||||
\begin{frame}[label={sec:org17b60da},fragile]{Source code}
|
||||
\begin{itemize}
|
||||
\item For each attribute :
|
||||
\begin{minted}[frame=lines,fontsize=\scriptsize,linenos]{c}
|
||||
@ -230,7 +230,7 @@ Table \(\rightarrow\) JSON \(\rightarrow\) C \\
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:org7802c31}]{Source code}
|
||||
\begin{frame}[label={sec:orgbaa9dcc}]{Source code}
|
||||
Productivity:
|
||||
|
||||
\begin{center}
|
||||
@ -238,16 +238,16 @@ Productivity:
|
||||
\end{center}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:orga268d6e}]{Documentation}
|
||||
\begin{frame}[label={sec:orgf8a4de9}]{Documentation}
|
||||
\begin{center}
|
||||
\includegraphics[height=\textheight]{./trexio-doc2.png}
|
||||
\end{center}
|
||||
\end{frame}
|
||||
|
||||
\section{QMCkl: QMC kernel library}
|
||||
\label{sec:org9bb0da1}
|
||||
\label{sec:org8f7a589}
|
||||
|
||||
\begin{frame}[label={sec:org549026f}]{QMC kernel library}
|
||||
\begin{frame}[label={sec:orga61a195}]{QMC kernel library}
|
||||
\begin{block}{Computational kernels}
|
||||
\begin{itemize}
|
||||
\item QMCkl will contain the main kernels of QMC methods (Domain
|
||||
@ -264,19 +264,19 @@ for different
|
||||
\end{block}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:org8cd96b3}]{Objectives}
|
||||
\begin{frame}[label={sec:orgde1dc38}]{Objectives}
|
||||
\begin{itemize}
|
||||
\item The code must stay easy to understand by the physicists/chemists.
|
||||
Performance-related aspects are delegated to the library
|
||||
\item Scientists should be able to use their preferred language
|
||||
\item Scientists should not lose control on their codes
|
||||
\item Scientists should not lose control of their codes
|
||||
\item Codes should not die when the architecture changes
|
||||
\item Scientific code development should not kill the performance
|
||||
\item Reuse of the optimization effort among the community
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:org6609bc5}]{Functionality and performance}
|
||||
\begin{frame}[label={sec:org511ef07}]{Functionality and performance}
|
||||
\begin{itemize}
|
||||
\item Keeping high \emph{productivity}, \emph{portability} and \emph{performance} is very
|
||||
hard in a single piece of software.
|
||||
@ -287,7 +287,7 @@ We propose (at least) two implementations:
|
||||
\item \alert{Documentation library} \\
|
||||
Easy to read, understand, modify for scientists, not necessarily efficient.
|
||||
\item \alert{High performance libraries} \\
|
||||
Efficient on a given architecture, but not necessarily
|
||||
Efficient on a given architecture, but not necessarily
|
||||
readable by physicists/chemists. \\
|
||||
Performance within 10\% to maximize portability and simplicity.
|
||||
\end{enumerate}
|
||||
@ -302,7 +302,7 @@ implemented in the HPC versions when the API is stabilized.
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:org33f426a},fragile]{Library design}
|
||||
\begin{frame}[label={sec:org858b4e8},fragile]{Library design}
|
||||
\begin{itemize}
|
||||
\item Creation of a \emph{Context} that keeps a consistent state of the library
|
||||
\item Memory allocation is abstract:
|
||||
@ -319,7 +319,7 @@ between kernels: re-compute only what is necessary
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:org7f2e24e},fragile]{Use case: low-level}
|
||||
\begin{frame}[label={sec:orgaf195d6},fragile]{Use case: low-level}
|
||||
\begin{minted}[frame=lines,fontsize=\scriptsize,linenos]{c}
|
||||
#include <qmckl.h>
|
||||
|
||||
@ -342,7 +342,7 @@ assert (rc == QMCKL_SUCCESS);
|
||||
\end{minted}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:orgf45dd18},fragile]{Use case: high-level}
|
||||
\begin{frame}[label={sec:orgb573ec0},fragile]{Use case: high-level}
|
||||
\begin{minted}[frame=lines,fontsize=\scriptsize,linenos]{c}
|
||||
#include <qmckl.h>
|
||||
// ...
|
||||
@ -366,7 +366,7 @@ rc = qmckl_get_local_energy(context, &e_loc);
|
||||
\end{minted}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:org8378951}]{Dependencies between kernels}
|
||||
\begin{frame}[label={sec:orgf244d95}]{Dependencies between kernels}
|
||||
\begin{columns}
|
||||
\begin{column}{0.5\textwidth}
|
||||
\begin{center}
|
||||
@ -387,7 +387,7 @@ rc = qmckl_get_local_energy(context, &e_loc);
|
||||
\end{frame}
|
||||
|
||||
|
||||
\begin{frame}[label={sec:orgd876ae6}]{Development strategy}
|
||||
\begin{frame}[label={sec:org2d8f7db}]{Development strategy}
|
||||
\begin{enumerate}
|
||||
\item Kernel extraction: QMC specialists agree on the
|
||||
mathematical expression of the problem
|
||||
@ -400,7 +400,7 @@ with HPC experts from real-size examples
|
||||
\end{enumerate}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:org2b9e52e}]{Documentation library}
|
||||
\begin{frame}[label={sec:org186f986}]{Documentation library}
|
||||
Literate programming with Org-mode:
|
||||
\begin{itemize}
|
||||
\item Comments are more important than code
|
||||
@ -414,7 +414,7 @@ Literate programming with Org-mode:
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:orgda06b11}]{High-Performance strategies}
|
||||
\begin{frame}[label={sec:org79ccfae}]{High-Performance strategies}
|
||||
\begin{block}{Linear algebra hot spots}
|
||||
\begin{center}
|
||||
\begin{tabular}{lll}
|
||||
@ -428,14 +428,14 @@ GEMV & Diagonal of GEMM & Shermann-Morrison-Woodburry\\
|
||||
\begin{itemize}
|
||||
\item Matrices are stored in tiled format \(\Longrightarrow\) task-based
|
||||
linear algebra interleaved computation of multiple kernels
|
||||
\item Increase parallelism by agregating multiple independent walkers
|
||||
\item Increase parallelism by aggregating multiple independent walkers
|
||||
in matrices
|
||||
\item Needs fast linear algebra kernels for small matrices
|
||||
\end{itemize}
|
||||
\end{block}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:orga9a5f05}]{High-Performance strategies}
|
||||
\begin{frame}[label={sec:orgf0771f2}]{High-Performance strategies}
|
||||
\begin{block}{Tuning}
|
||||
\begin{itemize}
|
||||
\item Optimization is guided by analysis with \alert{MAQAO}\footnote{https://maqao.org}.
|
||||
@ -447,22 +447,52 @@ in matrices
|
||||
\end{block}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:org1afd5ba}]{Example: Specialized DGEMM kernel}
|
||||
VIJAY
|
||||
\begin{frame}[label={sec:org750d177}]{Example: Specialized DGEMM kernel I}
|
||||
\begin{columns}
|
||||
\begin{column}{0.45\columnwidth}
|
||||
\begin{block}{Simple algorithm}
|
||||
\begin{itemize}
|
||||
\item Simple micro kernel (\alert{GotoDGEMM}\footnote{doi:10.1145/1356052.1356053})
|
||||
\item Code generation
|
||||
\item \alert{Tiling} scheme\footnote{doi:10.1109/ICPP.2015.29}
|
||||
\end{itemize}
|
||||
\end{block}
|
||||
\end{column}
|
||||
|
||||
\begin{column}{0.45\columnwidth}
|
||||
\begin{block}{Tiling scheme}
|
||||
\begin{center}
|
||||
\includegraphics[width=5cm,height=5cm,keepaspectratio]{./tiling_icpp2015.pdf}
|
||||
\end{center}
|
||||
\end{block}
|
||||
\end{column}
|
||||
\end{columns}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:orgd391c54}]{Efficiently guiding the developer}
|
||||
\begin{frame}[label={sec:orged49595}]{Example: Specialized DGEMM kernel II}
|
||||
\begin{block}{Benchmarks}
|
||||
\begin{itemize}
|
||||
\item Comparison of MKL vs Specialized DGEMM
|
||||
|
||||
\begin{center}
|
||||
\includegraphics[width=10cm,height=6cm,keepaspectratio]{./plot_percentage_vs_mkl_tiled_good.pdf}
|
||||
\end{center}
|
||||
\end{itemize}
|
||||
\end{block}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:orgd61c781}]{Efficiently guiding the developer}
|
||||
\begin{center}
|
||||
\includegraphics[width=\textwidth]{./maqao1.png}
|
||||
\end{center}
|
||||
\end{frame}
|
||||
\begin{frame}[label={sec:org543da85}]{Extensive/automatic testing of different configurations}
|
||||
\begin{frame}[label={sec:orgacd8d69}]{Extensive/automatic testing of different configurations}
|
||||
\begin{center}
|
||||
\includegraphics[width=\textwidth]{./maqao2.png}
|
||||
\end{center}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:org95699bb}]{First application : 3-body Jastrow factor}
|
||||
\begin{frame}[label={sec:org3ef76e8}]{First application : 3-body Jastrow factor}
|
||||
\newcommand{\Jeen}{J_{\text{een}}}
|
||||
\newcommand{\Nel}{N_{\text{elec}}}
|
||||
\newcommand{\Nat}{N_{\text{nucl}}}
|
||||
@ -600,7 +630,7 @@ vfc\_probe\_assert("Sherman-Morisson", "res", res, \tikzmark{target}1e-7)
|
||||
(targetex.south) to[out=-90,in=90] ([yshift=1.2ex, xshift=.5cm]{pic cs:target});
|
||||
\end{tikzpicture}
|
||||
\end{frame}
|
||||
\begin{frame}[label={sec:orge29c5eb}]{Verificarlo CI}
|
||||
\begin{frame}[label={sec:org8e7bd27}]{Verificarlo CI}
|
||||
\begin{columns}
|
||||
\begin{column}{0.5\textwidth}
|
||||
\begin{exampleblock}{Compare runs}
|
||||
@ -627,4 +657,4 @@ variable distribution
|
||||
\end{column}
|
||||
\end{columns}
|
||||
\end{frame}
|
||||
\end{document}
|
||||
\end{document}
|
||||
|
BIN
tiling_icpp2015.pdf
Normal file
BIN
tiling_icpp2015.pdf
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user