Update
This commit is contained in:
parent
47c0a9ea37
commit
ef1802427e
168
scemama.org
168
scemama.org
@ -35,8 +35,8 @@
|
||||
#+startup: beamer
|
||||
#+options: H:2 toc:nil
|
||||
|
||||
* TREX: Targeting REal chemical accuracy at the EXascale
|
||||
** TREX: Targeting REal chemical accuracy at the EXascale
|
||||
* QMC in TREX
|
||||
** QMC in TREX
|
||||
|
||||
#+LATEX: \begin{exampleblock}{QMC: Quantum Monte Carlo methods}
|
||||
- Highly accurate methods
|
||||
@ -172,7 +172,7 @@ digraph G {
|
||||
#+LATEX: \end{columns}
|
||||
|
||||
(BSD license) \\
|
||||
https://github.com/trex-coe/trexio
|
||||
[[https://github.com/trex-coe/trexio]]
|
||||
|
||||
** TREXIO: I/O library
|
||||
|
||||
@ -210,7 +210,7 @@ digraph G {
|
||||
| One-electron integrals | Density matrices | ECP |
|
||||
- Each group contains multiple *attributes*
|
||||
|
||||
** Source code
|
||||
** Source code :noexport:
|
||||
|
||||
- For each attribute :
|
||||
#+begin_src c
|
||||
@ -226,14 +226,14 @@ trexio_exit_code trexio_[has/read/write]_<group>_<attribute>
|
||||
- Performance : HDF5 back end
|
||||
- Portability : Only optional dependency is HDF5
|
||||
|
||||
** Source code
|
||||
** Source code :noexport:
|
||||
|
||||
Productivity:
|
||||
|
||||
#+ATTR_LATEX: :width \textwidth
|
||||
[[./trexio-doc1.png]]
|
||||
|
||||
** Documentation
|
||||
** Documentation :noexport:
|
||||
|
||||
#+ATTR_LATEX: :height \textheight
|
||||
[[./trexio-doc2.png]]
|
||||
@ -248,14 +248,14 @@ trexio_exit_code trexio_[has/read/write]_<group>_<attribute>
|
||||
- Written together by QMC experts and HPC experts
|
||||
- Multiple high performance implementations of the kernels, tuned
|
||||
for different
|
||||
- architectures (portability is critical for users)
|
||||
- architectures: portability is critical for users
|
||||
- problem sizes (from small to large systems)
|
||||
- requested accuracy (reduced precision)
|
||||
|
||||
** Objectives
|
||||
|
||||
- The code must stay easy to understand by the physicists/chemists.
|
||||
Performance-related aspects are delegated to the library
|
||||
Performance-related aspects should be delegated to the library
|
||||
- Scientists should be able to use their preferred language
|
||||
- Scientists should not lose control over their codes
|
||||
- Codes should not die when the architecture changes
|
||||
@ -274,7 +274,7 @@ trexio_exit_code trexio_[has/read/write]_<group>_<attribute>
|
||||
2. *High performance libraries* \\
|
||||
Efficient on a given architecture, but not necessarily
|
||||
readable by physicists/chemists. \\
|
||||
Performance within 10\% to maximize portability and simplicity.
|
||||
Performance within 10% to maximize portability and simplicity.
|
||||
|
||||
- Both /Documentation/ and /High performance/ have the same API
|
||||
(similar to BLAS on netlib /vs/ MKL).
|
||||
@ -296,59 +296,9 @@ void* qmckl_malloc(qmckl_context context, const qmckl_memory_info_struct info);
|
||||
context untouched (no allocation, no modification in-place)
|
||||
- High-level functions: let the library call multiple kernels in an
|
||||
optimal way, possibly updating the context
|
||||
- Use of IRP programming paradigm to keep track of dependencies
|
||||
- Use of IRP programming paradigm\footnote{http://arxiv.org/abs/0909.5012} to keep track of dependencies
|
||||
between kernels: re-compute only what is necessary
|
||||
|
||||
** Use case: low-level
|
||||
|
||||
#+begin_src c
|
||||
#include <qmckl.h>
|
||||
|
||||
// ...
|
||||
qmckl_exit_code rc;
|
||||
int64_t m, n, LDA, LDB, LDC;
|
||||
// ...
|
||||
double A[LDA*3];
|
||||
double B[LDB*3];
|
||||
double C[LDC*n];
|
||||
// ...
|
||||
|
||||
context = qmckl_context_create();
|
||||
|
||||
// Compute inter-particle distances between xyz coordinates in A[m][3] and B[3][n]
|
||||
// and store the result in C[m][n]
|
||||
rc = qmckl_distance(context, 'N', 'T', m, n, A, LDA, B, LDB, C, LDC);
|
||||
assert (rc == QMCKL_SUCCESS);
|
||||
// ...
|
||||
#+end_src
|
||||
|
||||
** Use case: high-level
|
||||
|
||||
#+begin_src c
|
||||
#include <qmckl.h>
|
||||
// ...
|
||||
qmckl_exit_code rc;
|
||||
double e_loc;
|
||||
qmckl_context context;
|
||||
|
||||
context = qmckl_context_create();
|
||||
|
||||
// Store WF parameters in the context
|
||||
rc = qmckl_read_trexio(context, trexio_filename);
|
||||
assert (rc == QMCKL_SUCCESS);
|
||||
|
||||
// Set the electron coordinates in the context
|
||||
rc = qmckl_set_electron_coord (context, 'N', elec_coord);
|
||||
assert(rc == QMCKL_SUCCESS);
|
||||
|
||||
// Return the local energy at the current electron positions
|
||||
rc = qmckl_get_local_energy(context, &e_loc);
|
||||
// ...
|
||||
#+end_src
|
||||
|
||||
#+RESULTS:
|
||||
: /home/scemama/MEGA/TEX/Presentations/2021/Intel/scemama.pdf
|
||||
|
||||
** Dependencies between kernels
|
||||
|
||||
#+LATEX: \begin{columns}
|
||||
@ -400,7 +350,56 @@ digraph G {
|
||||
#+LATEX: \end{column}
|
||||
#+LATEX: \end{columns}
|
||||
|
||||
|
||||
** Use case: low-level
|
||||
|
||||
#+begin_src c
|
||||
#include <qmckl.h>
|
||||
|
||||
// ...
|
||||
qmckl_exit_code rc;
|
||||
int64_t m, n, LDA, LDB, LDC;
|
||||
// ...
|
||||
double A[LDA*3];
|
||||
double B[LDB*3];
|
||||
double C[LDC*n];
|
||||
// ...
|
||||
|
||||
context = qmckl_context_create();
|
||||
|
||||
// Compute inter-particle distances between xyz coordinates in A[m][3] and B[3][n]
|
||||
// and store the result in C[m][n]
|
||||
rc = qmckl_distance(context, 'N', 'T', m, n, A, LDA, B, LDB, C, LDC);
|
||||
assert (rc == QMCKL_SUCCESS);
|
||||
// ...
|
||||
#+end_src
|
||||
|
||||
** Use case: high-level
|
||||
|
||||
#+begin_src c
|
||||
#include <qmckl.h>
|
||||
// ...
|
||||
qmckl_exit_code rc;
|
||||
double e_loc;
|
||||
qmckl_context context;
|
||||
|
||||
context = qmckl_context_create();
|
||||
|
||||
// Store WF parameters in the context
|
||||
rc = qmckl_read_trexio(context, trexio_filename);
|
||||
assert (rc == QMCKL_SUCCESS);
|
||||
|
||||
// Set the electron coordinates in the context
|
||||
rc = qmckl_set_electron_coord (context, 'N', elec_coord);
|
||||
assert(rc == QMCKL_SUCCESS);
|
||||
|
||||
// Return the local energy at the current electron positions
|
||||
rc = qmckl_get_local_energy(context, &e_loc);
|
||||
// ...
|
||||
#+end_src
|
||||
|
||||
#+RESULTS:
|
||||
: /home/scemama/MEGA/TEX/Presentations/2021/Intel/scemama.pdf
|
||||
|
||||
** Development strategy
|
||||
|
||||
1. Kernel extraction: QMC specialists agree on the
|
||||
@ -443,7 +442,7 @@ digraph G {
|
||||
*** Tuning
|
||||
- Optimization is guided by analysis with *MAQAO*\footnote{https://maqao.org}.
|
||||
- Specialized versions of critical hot-spots
|
||||
- MIPP for portable intrinsics / specialized code generation
|
||||
- MIPP\footnote{https://github.com/aff3ct/MIPP} for portable intrinsics / specialized code generation
|
||||
- Monitoring of the use of the library to choose most efficient versions
|
||||
- Optimizations guided by monitoring numerical accuracy (*Verificarlo*\footnote{https://github.com/verificarlo/verificarlo})
|
||||
|
||||
@ -529,28 +528,6 @@ digraph G {
|
||||
| MAQAO | http://www.maqao.org |
|
||||
| Verificarlo | https://github.com/verificarlo/verificarlo |
|
||||
|
||||
* Export :noexport:
|
||||
#+BEGIN_SRC elisp :output none
|
||||
(setq org-latex-listings 'minted)
|
||||
(setq org-latex-custom-lang-environments
|
||||
'(
|
||||
(f90 "fortran")
|
||||
))
|
||||
(setq org-latex-minted-options
|
||||
'(("frame" "lines")
|
||||
("fontsize" "\\scriptsize")
|
||||
("linenos" "")))
|
||||
(setq org-latex-to-pdf-process
|
||||
'("pdflatex -shell-escape -interaction nonstopmode -output-directory %o %f"
|
||||
"pdflatex -shell-escape -interaction nonstopmode -output-directory %o %f"
|
||||
"pdflatex -shell-escape -interaction nonstopmode -output-directory %o %f"))
|
||||
(org-beamer-export-to-pdf)
|
||||
#+END_SRC
|
||||
|
||||
#+RESULTS:
|
||||
: /home/scemama/MEGA/TEX/Presentations/2021/Intel/scemama.pdf
|
||||
|
||||
|
||||
* TODO [19/21] Improvements :noexport:
|
||||
|
||||
- [X] Fully parallelisable
|
||||
@ -597,3 +574,26 @@ together: perf et productivity
|
||||
- [X] On cherche de la perf within 10%. On en laisse sur la table
|
||||
|
||||
- [X] Multiple versions: CPU, GPU
|
||||
|
||||
* Export :noexport:
|
||||
#+BEGIN_SRC elisp :output none
|
||||
(setq org-latex-listings 'minted)
|
||||
(setq org-latex-custom-lang-environments
|
||||
'(
|
||||
(f90 "fortran")
|
||||
))
|
||||
(setq org-latex-minted-options
|
||||
'(("frame" "lines")
|
||||
("fontsize" "\\scriptsize")
|
||||
("linenos" "")))
|
||||
(setq org-latex-to-pdf-process
|
||||
'("pdflatex -shell-escape -interaction nonstopmode -output-directory %o %f"
|
||||
"pdflatex -shell-escape -interaction nonstopmode -output-directory %o %f"
|
||||
"pdflatex -shell-escape -interaction nonstopmode -output-directory %o %f"))
|
||||
(org-beamer-export-to-pdf)
|
||||
#+END_SRC
|
||||
|
||||
#+RESULTS:
|
||||
: /home/scemama/MEGA/TEX/Presentations/2021/Intel/scemama.pdf
|
||||
|
||||
|
||||
|
139
scemama.tex
139
scemama.tex
@ -1,4 +1,4 @@
|
||||
% Created 2021-10-04 Mon 16:33
|
||||
% Created 2021-10-07 Thu 12:17
|
||||
% Intended LaTeX compiler: pdflatex
|
||||
\documentclass[aspectratio=169]{beamer}
|
||||
\usepackage[utf8]{inputenc}
|
||||
@ -52,9 +52,9 @@ $^2$University of Versailles, Li-PaRAD (France)}
|
||||
|
||||
\maketitle
|
||||
|
||||
\section{TREX: Targeting REal chemical accuracy at the EXascale}
|
||||
\label{sec:org5b14751}
|
||||
\begin{frame}[label={sec:orge8da598}]{TREX: Targeting REal chemical accuracy at the EXascale}
|
||||
\section{QMC in TREX}
|
||||
\label{sec:org527cfcf}
|
||||
\begin{frame}[label={sec:org3bfadea}]{QMC in TREX}
|
||||
\begin{exampleblock}{QMC: Quantum Monte Carlo methods}
|
||||
\begin{itemize}
|
||||
\item Highly accurate methods
|
||||
@ -75,7 +75,7 @@ How: Instead of re-writing codes, provide libraries (free software)
|
||||
\end{exampleblock}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:orgce5753d}]{Quantum Monte Carlo (QMC)}
|
||||
\begin{frame}[label={sec:orge26ef23}]{Quantum Monte Carlo (QMC)}
|
||||
\alert{Problem}: Stochastic resolution of the Schr\"odinger equation for $N$ electrons
|
||||
\begin{eqnarray}
|
||||
E &= &\frac{\int \dcoord \Phi(\coord) {\cal H} \Phi(\coord)}
|
||||
@ -101,7 +101,7 @@ E &= &\frac{\int \dcoord \Phi(\coord) {\cal H} \Phi(\coord)}
|
||||
\end{columns}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:org6ed0682}]{Quantum Monte Carlo (QMC)}
|
||||
\begin{frame}[label={sec:orgd65402e}]{Quantum Monte Carlo (QMC)}
|
||||
\begin{columns}
|
||||
\begin{column}{0.4\textwidth}
|
||||
\begin{itemize}
|
||||
@ -119,7 +119,7 @@ E &= &\frac{\int \dcoord \Phi(\coord) {\cal H} \Phi(\coord)}
|
||||
\end{columns}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:orgc59d3f5}]{Both libraries}
|
||||
\begin{frame}[label={sec:org3e8242f}]{Both libraries}
|
||||
\begin{block}{Three objectives}
|
||||
\begin{enumerate}
|
||||
\item \alert{Productivity} \\
|
||||
@ -140,8 +140,8 @@ Must be efficient on all architectures
|
||||
\end{frame}
|
||||
|
||||
\section{TREXIO: I/O library}
|
||||
\label{sec:org88424a7}
|
||||
\begin{frame}[label={sec:org9c9c2f0}]{TREXIO: I/O library}
|
||||
\label{sec:orgf8ad1e7}
|
||||
\begin{frame}[label={sec:org02f0485}]{TREXIO: I/O library}
|
||||
\begin{columns}
|
||||
\begin{column}{0.4\textwidth}
|
||||
\begin{exampleblock}{Before}
|
||||
@ -160,10 +160,10 @@ Must be efficient on all architectures
|
||||
\end{columns}
|
||||
|
||||
(BSD license) \\
|
||||
\url{https://github.com/trex-coe/trexio}
|
||||
\url{https://github.com/trex-coe/trexio}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:org8aca922}]{TREXIO: I/O library}
|
||||
\begin{frame}[label={sec:org2341c39}]{TREXIO: I/O library}
|
||||
\begin{exampleblock}{Front end}
|
||||
\begin{itemize}
|
||||
\item Definition of an API to read/write wave functions
|
||||
@ -192,7 +192,7 @@ Must be efficient on all architectures
|
||||
\end{columns}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:orgbb567ff}]{Content of the files}
|
||||
\begin{frame}[label={sec:org51a55c1}]{Content of the files}
|
||||
\begin{itemize}
|
||||
\item File is \alert{self-contained}: no external knowledge needed to compute
|
||||
\(\Psi(r_1,\dots,r_n)\) (normalization factors, basis et
|
||||
@ -212,42 +212,10 @@ One-electron integrals & Density matrices & ECP\\
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:orgaacdd0f},fragile]{Source code}
|
||||
\begin{itemize}
|
||||
\item For each attribute :
|
||||
\begin{minted}[frame=lines,fontsize=\scriptsize,linenos]{c}
|
||||
trexio_exit_code trexio_[has/read/write]_<group>_<attribute>
|
||||
(trexio_t* file, <type> attribute)
|
||||
\end{minted}
|
||||
\item The library can be auto-generated by a script as the function names can
|
||||
be computed
|
||||
\item Productivity : Literate programming with Org-mode \\
|
||||
Table \(\rightarrow\) JSON \(\rightarrow\) C \\
|
||||
\phantom{Table} \(\rightarrow\) Documentation
|
||||
\item Fortran and Python/Numpy interfaces are also generated
|
||||
\item Performance : HDF5 back end
|
||||
\item Portability : Only optional dependency is HDF5
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:org7802c31}]{Source code}
|
||||
Productivity:
|
||||
|
||||
\begin{center}
|
||||
\includegraphics[width=\textwidth]{./trexio-doc1.png}
|
||||
\end{center}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:orga268d6e}]{Documentation}
|
||||
\begin{center}
|
||||
\includegraphics[height=\textheight]{./trexio-doc2.png}
|
||||
\end{center}
|
||||
\end{frame}
|
||||
|
||||
\section{QMCkl: QMC kernel library}
|
||||
\label{sec:org9bb0da1}
|
||||
\label{sec:org53e6105}
|
||||
|
||||
\begin{frame}[label={sec:org549026f}]{QMC kernel library}
|
||||
\begin{frame}[label={sec:org4dc9060}]{QMC kernel library}
|
||||
\begin{block}{Computational kernels}
|
||||
\begin{itemize}
|
||||
\item QMCkl will contain the main kernels of QMC methods (Domain
|
||||
@ -256,7 +224,7 @@ specific library, end-user driven)
|
||||
\item Multiple high performance implementations of the kernels, tuned
|
||||
for different
|
||||
\begin{itemize}
|
||||
\item architectures (portability is critical for users)
|
||||
\item architectures: portability is critical for users
|
||||
\item problem sizes (from small to large systems)
|
||||
\item requested accuracy (reduced precision)
|
||||
\end{itemize}
|
||||
@ -264,10 +232,10 @@ for different
|
||||
\end{block}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:org8cd96b3}]{Objectives}
|
||||
\begin{frame}[label={sec:orgcf8c268}]{Objectives}
|
||||
\begin{itemize}
|
||||
\item The code must stay easy to understand by the physicists/chemists.
|
||||
Performance-related aspects are delegated to the library
|
||||
Performance-related aspects should be delegated to the library
|
||||
\item Scientists should be able to use their preferred language
|
||||
\item Scientists should not lose control over their codes
|
||||
\item Codes should not die when the architecture changes
|
||||
@ -276,7 +244,7 @@ Performance-related aspects are delegated to the library
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:org6609bc5}]{Functionality and performance}
|
||||
\begin{frame}[label={sec:org523cd8a}]{Functionality and performance}
|
||||
\begin{itemize}
|
||||
\item Keeping high \emph{productivity}, \emph{portability} and \emph{performance} is very
|
||||
hard in a single piece of software.
|
||||
@ -289,7 +257,7 @@ Easy to read, understand, modify for scientists, not necessarily efficient.
|
||||
\item \alert{High performance libraries} \\
|
||||
Efficient on a given architecture, but not necessarily
|
||||
readable by physicists/chemists. \\
|
||||
Performance within 10$\backslash$% to maximize portability and simplicity.
|
||||
Performance within 10\% to maximize portability and simplicity.
|
||||
\end{enumerate}
|
||||
|
||||
\item Both \emph{Documentation} and \emph{High performance} have the same API
|
||||
@ -302,7 +270,7 @@ implemented in the HPC versions when the API is stabilized.
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:org33f426a},fragile]{Library design}
|
||||
\begin{frame}[label={sec:org1030a63},fragile]{Library design}
|
||||
\begin{itemize}
|
||||
\item Creation of a \emph{Context} that keeps a consistent state of the library
|
||||
\item Memory allocation is abstract:
|
||||
@ -314,12 +282,32 @@ allows allocation on CPU/GPU by the HPC variants
|
||||
context untouched (no allocation, no modification in-place)
|
||||
\item High-level functions: let the library call multiple kernels in an
|
||||
optimal way, possibly updating the context
|
||||
\item Use of IRP programming paradigm to keep track of dependencies
|
||||
\item Use of IRP programming paradigm\footnote{http://arxiv.org/abs/0909.5012} to keep track of dependencies
|
||||
between kernels: re-compute only what is necessary
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:org7f2e24e},fragile]{Use case: low-level}
|
||||
\begin{frame}[label={sec:orgd8c37c2}]{Dependencies between kernels}
|
||||
\begin{columns}
|
||||
\begin{column}{0.5\textwidth}
|
||||
\begin{center}
|
||||
\includegraphics[width=.9\linewidth]{irp.png}
|
||||
\end{center}
|
||||
|
||||
\end{column}
|
||||
\begin{column}{0.5\textwidth}
|
||||
\begin{itemize}
|
||||
\item Only the needed sub-graph is computed
|
||||
\item HPC: Each kernel is one/many parallel Task(s)
|
||||
\item HPC: Use OpenMP tasks or StarPU\footnote{C. Augonnet et al, doi:10.1002/cpe.1631} for hybrid architectures:
|
||||
(StarPU handles very well asynchronous CPU-GPU transfers).
|
||||
\end{itemize}
|
||||
|
||||
\end{column}
|
||||
\end{columns}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:org465f70f},fragile]{Use case: low-level}
|
||||
\begin{minted}[frame=lines,fontsize=\scriptsize,linenos]{c}
|
||||
#include <qmckl.h>
|
||||
|
||||
@ -342,7 +330,7 @@ assert (rc == QMCKL_SUCCESS);
|
||||
\end{minted}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:orgf45dd18},fragile]{Use case: high-level}
|
||||
\begin{frame}[label={sec:orgb80c323},fragile]{Use case: high-level}
|
||||
\begin{minted}[frame=lines,fontsize=\scriptsize,linenos]{c}
|
||||
#include <qmckl.h>
|
||||
// ...
|
||||
@ -366,28 +354,7 @@ rc = qmckl_get_local_energy(context, &e_loc);
|
||||
\end{minted}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:org8378951}]{Dependencies between kernels}
|
||||
\begin{columns}
|
||||
\begin{column}{0.5\textwidth}
|
||||
\begin{center}
|
||||
\includegraphics[width=.9\linewidth]{irp.png}
|
||||
\end{center}
|
||||
|
||||
\end{column}
|
||||
\begin{column}{0.5\textwidth}
|
||||
\begin{itemize}
|
||||
\item Only the needed sub-graph is computed
|
||||
\item HPC: Each kernel is one/many parallel Task(s)
|
||||
\item HPC: Use OpenMP tasks or StarPU\footnote{C. Augonnet et al, doi:10.1002/cpe.1631} for hybrid architectures:
|
||||
(StarPU handles very well asynchronous CPU-GPU transfers).
|
||||
\end{itemize}
|
||||
|
||||
\end{column}
|
||||
\end{columns}
|
||||
\end{frame}
|
||||
|
||||
|
||||
\begin{frame}[label={sec:orgd876ae6}]{Development strategy}
|
||||
\begin{frame}[label={sec:org518f369}]{Development strategy}
|
||||
\begin{enumerate}
|
||||
\item Kernel extraction: QMC specialists agree on the
|
||||
mathematical expression of the problem
|
||||
@ -400,7 +367,7 @@ with HPC experts from real-size examples
|
||||
\end{enumerate}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:org2b9e52e}]{Documentation library}
|
||||
\begin{frame}[label={sec:org7c60b7a}]{Documentation library}
|
||||
Literate programming with Org-mode:
|
||||
\begin{itemize}
|
||||
\item Comments are more important than code
|
||||
@ -414,7 +381,7 @@ Literate programming with Org-mode:
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:orgda06b11}]{High-Performance strategies}
|
||||
\begin{frame}[label={sec:orgf424cd4}]{High-Performance strategies}
|
||||
\begin{block}{Linear algebra hot spots}
|
||||
\begin{center}
|
||||
\begin{tabular}{lll}
|
||||
@ -435,34 +402,34 @@ in matrices
|
||||
\end{block}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:orga9a5f05}]{High-Performance strategies}
|
||||
\begin{frame}[label={sec:orgea7372b}]{High-Performance strategies}
|
||||
\begin{block}{Tuning}
|
||||
\begin{itemize}
|
||||
\item Optimization is guided by analysis with \alert{MAQAO}\footnote{https://maqao.org}.
|
||||
\item Specialized versions of critical hot-spots
|
||||
\item MIPP for portable intrinsics / specialized code generation
|
||||
\item MIPP\footnote{https://github.com/aff3ct/MIPP} for portable intrinsics / specialized code generation
|
||||
\item Monitoring of the use of the library to choose most efficient versions
|
||||
\item Optimizations guided by monitoring numerical accuracy (\alert{Verificarlo}\footnote{https://github.com/verificarlo/verificarlo})
|
||||
\end{itemize}
|
||||
\end{block}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:org1afd5ba}]{Example: Specialized DGEMM kernel}
|
||||
\begin{frame}[label={sec:orgba656d9}]{Example: Specialized DGEMM kernel}
|
||||
VIJAY
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:orgd391c54}]{Efficiently guiding the developer}
|
||||
\begin{frame}[label={sec:orgd3ca712}]{Efficiently guiding the developer}
|
||||
\begin{center}
|
||||
\includegraphics[width=\textwidth]{./maqao1.png}
|
||||
\end{center}
|
||||
\end{frame}
|
||||
\begin{frame}[label={sec:org543da85}]{Extensive/automatic testing of different configurations}
|
||||
\begin{frame}[label={sec:orgcc14268}]{Extensive/automatic testing of different configurations}
|
||||
\begin{center}
|
||||
\includegraphics[width=\textwidth]{./maqao2.png}
|
||||
\end{center}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[label={sec:org95699bb}]{First application : 3-body Jastrow factor}
|
||||
\begin{frame}[label={sec:org7ee3c30}]{First application : 3-body Jastrow factor}
|
||||
\newcommand{\Jeen}{J_{\text{een}}}
|
||||
\newcommand{\Nel}{N_{\text{elec}}}
|
||||
\newcommand{\Nat}{N_{\text{nucl}}}
|
||||
@ -600,7 +567,7 @@ vfc\_probe\_assert("Sherman-Morisson", "res", res, \tikzmark{target}1e-7)
|
||||
(targetex.south) to[out=-90,in=90] ([yshift=1.2ex, xshift=.5cm]{pic cs:target});
|
||||
\end{tikzpicture}
|
||||
\end{frame}
|
||||
\begin{frame}[label={sec:orge29c5eb}]{Verificarlo CI}
|
||||
\begin{frame}[label={sec:org8493521}]{Verificarlo CI}
|
||||
\begin{columns}
|
||||
\begin{column}{0.5\textwidth}
|
||||
\begin{exampleblock}{Compare runs}
|
||||
|
Loading…
Reference in New Issue
Block a user