Added two slides on DGEMM.

This commit is contained in:
v1j4y 2021-10-06 11:47:14 +02:00
parent 9046dc8702
commit 877feadb7b
4 changed files with 95 additions and 41 deletions

Binary file not shown.

View File

@ -447,9 +447,33 @@ digraph G {
- Monitoring of the use of the library to choose most efficient versions
- Optimizations guided by monitoring numerical accuracy (*Verificarlo*\footnote{https://github.com/verificarlo/verificarlo})
** Example: Specialized DGEMM kernel
** Example: Specialized DGEMM kernel I
VIJAY
*** Simple algorithm :B_block:BMCOL:
:PROPERTIES:
:BEAMER_env: block
:BEAMER_col: 0.45
:END:
- Simple micro kernel (*GotoDGEMM*\footnote{doi:10.1145/1356052.1356053})
- Code generation
- *Tiling* scheme\footnote{doi:10.1109/ICPP.2015.29}
*** Tiling scheme :B_block:BMCOL:
:PROPERTIES:
:BEAMER_col: 0.45
:BEAMER_env: block
:END:
#+ATTR_LATEX: :width 5cm :height 5cm :options keepaspectratio :right
[[./tiling_icpp2015.pdf]]
** Example: Specialized DGEMM kernel II
*** Benchmarks
- Comparison of MKL vs Specialized DGEMM
#+ATTR_LATEX: :width 10cm :height 6cm :options keepaspectratio
[[./plot_percentage_vs_mkl_tiled_good.pdf]]
** Efficiently guiding the developer
@ -548,7 +572,7 @@ digraph G {
#+END_SRC
#+RESULTS:
: /home/scemama/MEGA/TEX/Presentations/2021/Intel/scemama.pdf
: /home/vijay/Documents/presentations/pres_intel/scemama.pdf
* TODO [19/21] Improvements :noexport:

View File

@ -1,4 +1,4 @@
% Created 2021-10-04 Mon 16:33
% Created 2021-10-06 mer. 11:46
% Intended LaTeX compiler: pdflatex
\documentclass[aspectratio=169]{beamer}
\usepackage[utf8]{inputenc}
@ -46,20 +46,20 @@ $^2$University of Versailles, Li-PaRAD (France)}
pdftitle={Libraries developed in the TREX CoE},
pdfkeywords={},
pdfsubject={},
pdfcreator={Emacs 26.3 (Org mode 9.4)},
pdfcreator={Emacs 27.1 (Org mode 9.5)},
pdflang={English}}
\begin{document}
\maketitle
\section{TREX: Targeting REal chemical accuracy at the EXascale}
\label{sec:org5b14751}
\begin{frame}[label={sec:orge8da598}]{TREX: Targeting REal chemical accuracy at the EXascale}
\label{sec:org4b839d0}
\begin{frame}[label={sec:orgbdf1b22}]{TREX: Targeting REal chemical accuracy at the EXascale}
\begin{exampleblock}{QMC: Quantum Monte Carlo methods}
\begin{itemize}
\item Highly accurate methods
\item Massively parallelisable (multiple QMC trajectories)
\item Very CPU intensive: One of the most "compute-hungry" methods
\item Very CPU intensive: One of the most ``compute-hungry'' methods
\item Still under development: scientists need to run \emph{and} develop code
\item Input data is complex (wave function)
\end{itemize}
@ -75,7 +75,7 @@ How: Instead of re-writing codes, provide libraries (free software)
\end{exampleblock}
\end{frame}
\begin{frame}[label={sec:orgce5753d}]{Quantum Monte Carlo (QMC)}
\begin{frame}[label={sec:orga6495e2}]{Quantum Monte Carlo (QMC)}
\alert{Problem}: Stochastic resolution of the Schr\"odinger equation for $N$ electrons
\begin{eqnarray}
E &= &\frac{\int \dcoord \Phi(\coord) {\cal H} \Phi(\coord)}
@ -101,7 +101,7 @@ E &= &\frac{\int \dcoord \Phi(\coord) {\cal H} \Phi(\coord)}
\end{columns}
\end{frame}
\begin{frame}[label={sec:org6ed0682}]{Quantum Monte Carlo (QMC)}
\begin{frame}[label={sec:org8ad33cb}]{Quantum Monte Carlo (QMC)}
\begin{columns}
\begin{column}{0.4\textwidth}
\begin{itemize}
@ -119,7 +119,7 @@ E &= &\frac{\int \dcoord \Phi(\coord) {\cal H} \Phi(\coord)}
\end{columns}
\end{frame}
\begin{frame}[label={sec:orgc59d3f5}]{Both libraries}
\begin{frame}[label={sec:orgb43d37e}]{Both libraries}
\begin{block}{Three objectives}
\begin{enumerate}
\item \alert{Productivity} \\
@ -140,8 +140,8 @@ Must be efficient on all architectures
\end{frame}
\section{TREXIO: I/O library}
\label{sec:org88424a7}
\begin{frame}[label={sec:org9c9c2f0}]{TREXIO: I/O library}
\label{sec:orgbe916a5}
\begin{frame}[label={sec:orgdf330cc}]{TREXIO: I/O library}
\begin{columns}
\begin{column}{0.4\textwidth}
\begin{exampleblock}{Before}
@ -163,7 +163,7 @@ Must be efficient on all architectures
\url{https://github.com/trex-coe/trexio}
\end{frame}
\begin{frame}[label={sec:org8aca922}]{TREXIO: I/O library}
\begin{frame}[label={sec:orgaf241aa}]{TREXIO: I/O library}
\begin{exampleblock}{Front end}
\begin{itemize}
\item Definition of an API to read/write wave functions
@ -192,7 +192,7 @@ Must be efficient on all architectures
\end{columns}
\end{frame}
\begin{frame}[label={sec:orgbb567ff}]{Content of the files}
\begin{frame}[label={sec:orgd025d67}]{Content of the files}
\begin{itemize}
\item File is \alert{self-contained}: no external knowledge needed to compute
\(\Psi(r_1,\dots,r_n)\) (normalization factors, basis et
@ -212,7 +212,7 @@ One-electron integrals & Density matrices & ECP\\
\end{itemize}
\end{frame}
\begin{frame}[label={sec:orgaacdd0f},fragile]{Source code}
\begin{frame}[label={sec:org17b60da},fragile]{Source code}
\begin{itemize}
\item For each attribute :
\begin{minted}[frame=lines,fontsize=\scriptsize,linenos]{c}
@ -230,7 +230,7 @@ Table \(\rightarrow\) JSON \(\rightarrow\) C \\
\end{itemize}
\end{frame}
\begin{frame}[label={sec:org7802c31}]{Source code}
\begin{frame}[label={sec:orgbaa9dcc}]{Source code}
Productivity:
\begin{center}
@ -238,16 +238,16 @@ Productivity:
\end{center}
\end{frame}
\begin{frame}[label={sec:orga268d6e}]{Documentation}
\begin{frame}[label={sec:orgf8a4de9}]{Documentation}
\begin{center}
\includegraphics[height=\textheight]{./trexio-doc2.png}
\end{center}
\end{frame}
\section{QMCkl: QMC kernel library}
\label{sec:org9bb0da1}
\label{sec:org8f7a589}
\begin{frame}[label={sec:org549026f}]{QMC kernel library}
\begin{frame}[label={sec:orga61a195}]{QMC kernel library}
\begin{block}{Computational kernels}
\begin{itemize}
\item QMCkl will contain the main kernels of QMC methods (Domain
@ -264,19 +264,19 @@ for different
\end{block}
\end{frame}
\begin{frame}[label={sec:org8cd96b3}]{Objectives}
\begin{frame}[label={sec:orgde1dc38}]{Objectives}
\begin{itemize}
\item The code must stay easy to understand by the physicists/chemists.
Performance-related aspects are delegated to the library
\item Scientists should be able to use their preferred language
\item Scientists should not lose control on their codes
\item Scientists should not lose control of their codes
\item Codes should not die when the architecture changes
\item Scientific code development should not kill the performance
\item Reuse of the optimization effort among the community
\end{itemize}
\end{frame}
\begin{frame}[label={sec:org6609bc5}]{Functionality and performance}
\begin{frame}[label={sec:org511ef07}]{Functionality and performance}
\begin{itemize}
\item Keeping high \emph{productivity}, \emph{portability} and \emph{performance} is very
hard in a single piece of software.
@ -302,7 +302,7 @@ implemented in the HPC versions when the API is stabilized.
\end{itemize}
\end{frame}
\begin{frame}[label={sec:org33f426a},fragile]{Library design}
\begin{frame}[label={sec:org858b4e8},fragile]{Library design}
\begin{itemize}
\item Creation of a \emph{Context} that keeps a consistent state of the library
\item Memory allocation is abstract:
@ -319,7 +319,7 @@ between kernels: re-compute only what is necessary
\end{itemize}
\end{frame}
\begin{frame}[label={sec:org7f2e24e},fragile]{Use case: low-level}
\begin{frame}[label={sec:orgaf195d6},fragile]{Use case: low-level}
\begin{minted}[frame=lines,fontsize=\scriptsize,linenos]{c}
#include <qmckl.h>
@ -342,7 +342,7 @@ assert (rc == QMCKL_SUCCESS);
\end{minted}
\end{frame}
\begin{frame}[label={sec:orgf45dd18},fragile]{Use case: high-level}
\begin{frame}[label={sec:orgb573ec0},fragile]{Use case: high-level}
\begin{minted}[frame=lines,fontsize=\scriptsize,linenos]{c}
#include <qmckl.h>
// ...
@ -366,7 +366,7 @@ rc = qmckl_get_local_energy(context, &e_loc);
\end{minted}
\end{frame}
\begin{frame}[label={sec:org8378951}]{Dependencies between kernels}
\begin{frame}[label={sec:orgf244d95}]{Dependencies between kernels}
\begin{columns}
\begin{column}{0.5\textwidth}
\begin{center}
@ -387,7 +387,7 @@ rc = qmckl_get_local_energy(context, &e_loc);
\end{frame}
\begin{frame}[label={sec:orgd876ae6}]{Development strategy}
\begin{frame}[label={sec:org2d8f7db}]{Development strategy}
\begin{enumerate}
\item Kernel extraction: QMC specialists agree on the
mathematical expression of the problem
@ -400,7 +400,7 @@ with HPC experts from real-size examples
\end{enumerate}
\end{frame}
\begin{frame}[label={sec:org2b9e52e}]{Documentation library}
\begin{frame}[label={sec:org186f986}]{Documentation library}
Literate programming with Org-mode:
\begin{itemize}
\item Comments are more important than code
@ -414,7 +414,7 @@ Literate programming with Org-mode:
\end{itemize}
\end{frame}
\begin{frame}[label={sec:orgda06b11}]{High-Performance strategies}
\begin{frame}[label={sec:org79ccfae}]{High-Performance strategies}
\begin{block}{Linear algebra hot spots}
\begin{center}
\begin{tabular}{lll}
@ -428,14 +428,14 @@ GEMV & Diagonal of GEMM & Shermann-Morrison-Woodburry\\
\begin{itemize}
\item Matrices are stored in tiled format \(\Longrightarrow\) task-based
linear algebra interleaved computation of multiple kernels
\item Increase parallelism by agregating multiple independent walkers
\item Increase parallelism by aggregating multiple independent walkers
in matrices
\item Needs fast linear algebra kernels for small matrices
\end{itemize}
\end{block}
\end{frame}
\begin{frame}[label={sec:orga9a5f05}]{High-Performance strategies}
\begin{frame}[label={sec:orgf0771f2}]{High-Performance strategies}
\begin{block}{Tuning}
\begin{itemize}
\item Optimization is guided by analysis with \alert{MAQAO}\footnote{https://maqao.org}.
@ -447,22 +447,52 @@ in matrices
\end{block}
\end{frame}
\begin{frame}[label={sec:org1afd5ba}]{Example: Specialized DGEMM kernel}
VIJAY
\begin{frame}[label={sec:org750d177}]{Example: Specialized DGEMM kernel I}
\begin{columns}
\begin{column}{0.45\columnwidth}
\begin{block}{Simple algorithm}
\begin{itemize}
\item Simple micro kernel (\alert{GotoDGEMM}\footnote{doi:10.1145/1356052.1356053})
\item Code generation
\item \alert{Tiling} scheme\footnote{doi:10.1109/ICPP.2015.29}
\end{itemize}
\end{block}
\end{column}
\begin{column}{0.45\columnwidth}
\begin{block}{Tiling scheme}
\begin{center}
\includegraphics[keepaspectratio,width=5cm,height=5cm]{./tiling_icpp2015.pdf}
\end{center}
\end{block}
\end{column}
\end{columns}
\end{frame}
\begin{frame}[label={sec:orgd391c54}]{Efficiently guiding the developer}
\begin{frame}[label={sec:orged49595}]{Example: Specialized DGEMM kernel II}
\begin{block}{Benchmarks}
\begin{itemize}
\item Comparison of MKL vs Specialized DGEMM
\begin{center}
\includegraphics[keepaspectratio,width=10cm,height=6cm]{./plot_percentage_vs_mkl_tiled_good.pdf}
\end{center}
\end{itemize}
\end{block}
\end{frame}
\begin{frame}[label={sec:orgd61c781}]{Efficiently guiding the developer}
\begin{center}
\includegraphics[width=\textwidth]{./maqao1.png}
\end{center}
\end{frame}
\begin{frame}[label={sec:org543da85}]{Extensive/automatic testing of different configurations}
\begin{frame}[label={sec:orgacd8d69}]{Extensive/automatic testing of different configurations}
\begin{center}
\includegraphics[width=\textwidth]{./maqao2.png}
\end{center}
\end{frame}
\begin{frame}[label={sec:org95699bb}]{First application : 3-body Jastrow factor}
\begin{frame}[label={sec:org3ef76e8}]{First application : 3-body Jastrow factor}
\newcommand{\Jeen}{J_{\text{een}}}
\newcommand{\Nel}{N_{\text{elec}}}
\newcommand{\Nat}{N_{\text{nucl}}}
@ -600,7 +630,7 @@ vfc\_probe\_assert("Sherman-Morisson", "res", res, \tikzmark{target}1e-7)
(targetex.south) to[out=-90,in=90] ([yshift=1.2ex, xshift=.5cm]{pic cs:target});
\end{tikzpicture}
\end{frame}
\begin{frame}[label={sec:orge29c5eb}]{Verificarlo CI}
\begin{frame}[label={sec:org8e7bd27}]{Verificarlo CI}
\begin{columns}
\begin{column}{0.5\textwidth}
\begin{exampleblock}{Compare runs}

BIN
tiling_icpp2015.pdf Normal file

Binary file not shown.