diff --git a/Amdahl2.pdf b/Amdahl2.pdf
new file mode 100644
index 0000000..f8424ac
Binary files /dev/null and b/Amdahl2.pdf differ
diff --git a/h2o.png b/h2o.png
new file mode 100644
index 0000000..9e75d9a
Binary files /dev/null and b/h2o.png differ
diff --git a/img_20160510_152246_resize.jpg b/img_20160510_152246_resize.jpg
index e905c33..5db1738 100644
Binary files a/img_20160510_152246_resize.jpg and b/img_20160510_152246_resize.jpg differ
diff --git a/merge.png b/merge.png
new file mode 100644
index 0000000..7559daa
Binary files /dev/null and b/merge.png differ
diff --git a/merge_parallel.png b/merge_parallel.png
new file mode 100644
index 0000000..f56a8f4
Binary files /dev/null and b/merge_parallel.png differ
diff --git a/parallelism_scemama.org b/parallelism_scemama.org
index a71fcb2..ad733fd 100644
--- a/parallelism_scemama.org
+++ b/parallelism_scemama.org
@@ -219,6 +219,59 @@
 ** Core
 #+ATTR_LATEX: :height 0.9\textheight
 [[./Nehalem.jpg]]
+** Why such an architecture?
+*** Moore's "Law"
+   - /The number of transistors doubles every two years/.
+   - Often interpreted as /the computational power doubles every two years/.
+   - An exponential law $\Longrightarrow$ it must eventually fail.
+   - 7~nm semiconductors in 2021: the atomic limit is approaching.
+
+** Why such an architecture?
+*** Main problem: energy
+   Dissipation: $D = F \times V^2 \times C$
+
+   - $F$ : Frequency ($\sim$3 GHz)
+   - $V$ : Voltage
+   - $C$ : Constant related to the size of the semiconductors (nm)
+
+   For the processor to be stable, the voltage needs to scale linearly with
+   the frequency, so $D = \mathcal{O}(F^3)$: doubling the frequency multiplies
+   the dissipation by 8 $\Longrightarrow$ the frequency has to be kept around
+   3 GHz. The remaining options are:
+
+   1. To double the Flops, double the number of processors.
+   2. Use accelerators (GPUs, FPGAs, TPUs, ...)
+   \pause
+*** Consequence
+   This requires re-thinking programming $\Longrightarrow$
+   parallel programming is *unavoidable*.
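+
+** Why such an architecture?
+*** Checking the numbers
+   A minimal numerical sketch (Python) of the scaling argument above, with $C$
+   set to 1 for illustration: doubling the frequency costs $8\times$ the power,
+   while doubling the number of cores at fixed frequency costs only $2\times$.
+
+#+begin_src python
+def dissipation(f, c=1.0):
+    v = f                # stability requires V to scale linearly with F
+    return f * v**2 * c  # D = F x V^2 x C
+
+base = dissipation(3.0)              # one core at ~3 GHz
+print(dissipation(6.0) / base)       # 8.0 -> doubling F: 8x the power
+print(2 * dissipation(3.0) / base)   # 2.0 -> two cores at 3 GHz: 2x the power
+#+end_src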
+
+** Today's main problems
+
+  1. Energy, physical limits
+  2. Interconnect technology
+  3. Computational power increases faster than memory capacity:
+     less and less memory per CPU core
+  4. Latencies can't be reduced much more: moving data becomes very expensive
+  5. File systems are *extremely* slow
+  6. Supercomputers are well tuned for benchmarks (dense linear algebra), but not
+     that much for general scientific applications
+  7. Fault tolerance
+
+** Practical solution
+
+*** GPUs
+   Use less powerful compute units, but in much larger numbers
+   $\Longrightarrow$ better Flops/Watt ratio
+
+*** But...
+   - A transfer from main memory to the GPU is as expensive as a network
+     communication
+   - More computational power but less RAM than a CPU node
+   - Not adapted to all algorithms
+   - Requires re-thinking and rewriting codes: years of debugging are lost
+   - The software stack is not as mature as the CPU stack $\Longrightarrow$
+     it needs hacking to get performance
+
+
 * Fundamentals of parallelization
@@ -228,7 +281,6 @@
 - Concurrency :: Running multiple computations at the same time.
 - Parallelism :: Running multiple computations *on different execution units*.
-
 *** Multiple levels
 | *Distributed* | Multiple machines |
@@ -241,9 +293,383 @@
 All levels of parallelism can be exploited in the same code,
 but every problem is not parallelizable at all levels.
+** Outline
+  1. Compute a PES with GNU parallel (Bash)
+  2. Compute $\pi$ with a Monte Carlo algorithm (Python, sockets)
+  3. Compute $\pi$ with a deterministic algorithm (Python, MPI)
+  4. Matrix multiplication (Fortran, OpenMP)
-
+** Problem 1: Potential energy surface
+   We want to compute the CCSD(T) potential energy surface of the water
+   molecule.
+
+   #+LATEX: \begin{columns}
+   #+LATEX: \begin{column}{0.3\textwidth}
+   [[./h2o.png]]
+   #+LATEX: \end{column}
+   #+LATEX: \begin{column}{0.6\textwidth}
+   [[./pes.png]]
+   #+LATEX: \end{column}
+   #+LATEX: \end{columns}
+
+** Problem 1: Potential energy surface
+*** Constraints
+   - We want to compute $25 \times 25 \times 25 = 15~625$ points
+   - We are allowed to use 100 CPU cores simultaneously
+   - We like to use GAU$$IAN to calculate the CCSD(T) energy
+
+*** But:
+   - The grid points are completely independent
+   - Any CPU core can calculate any point
+
+** Problem 1: Potential energy surface
+*** Optimal solution: /work stealing/
+
+   - One grid point is $E(r_1,r_2,\theta)$
+   - Build the list of all the arguments $(r_1,r_2,\theta)$:
+     ~[ (0.8,0.8,70.), ..., (1.1,1.1,140.) ]~ (the *queue*)
+   - Each CPU core, when idle, pops the head of the queue
+     and computes $E(r_1,r_2,\theta)$
+   - All the results are stored in a single file
+   - The results are sorted for plotting
+
+** GNU parallel
+
+   GNU Parallel executes Linux commands in parallel and can guarantee that
+   the output is the same as if the commands were executed sequentially.
+
+#+begin_src bash
+$ parallel echo ::: A B C
+A
+B
+C
+#+end_src
+
+   is equivalent to:
+
+#+begin_src bash
+$ echo A ; echo B ; echo C
+#+end_src
+
+Multiple input sources can be given:
+
+#+begin_src bash
+$ parallel echo ::: A B ::: C D
+A C
+A D
+B C
+B D
+#+end_src
+
+
+** GNU parallel
+
+   If no command is given after parallel, the arguments are treated as
+   commands:
+
+   #+begin_src bash
+$ parallel ::: pwd hostname "echo $TMPDIR"
+/home/scemama
+lpqdh82.ups-tlse.fr
+/tmp
+   #+end_src
+
+   Jobs can be run on remote servers:
+
+   #+begin_src bash
+$ parallel ::: "echo hello" hostname
+lpqdh82.ups-tlse.fr
+hello
+$ parallel -S lpqlx139.ups-tlse.fr ::: "echo hello" hostname
+hello
+lpqlx139.ups-tlse.fr
+   #+end_src
+
+** GNU parallel
+
+   Files can be transferred to the remote hosts:
+
+   #+begin_src bash
+$ echo Hello > input
+$ parallel -S lpqlx139.ups-tlse.fr cat ::: input
+cat: input: No such file or directory
+
+$ echo Hello > input
+$ parallel -S lpqlx139.ups-tlse.fr --transfer --cleanup cat ::: input
+Hello
+   #+end_src
+
+** GNU Parallel: example
+
+   Convert thousands of images from =.gif= to =.jpg=
+
+   #+begin_src bash
+$ ls
+img1000.gif img241.gif img394.gif img546.gif img699.gif img850.gif
+img1001.gif img242.gif img395.gif img547.gif img69.gif  img851.gif
+[...]
+img23.gif   img392.gif img544.gif img697.gif img849.gif
+img240.gif  img393.gif img545.gif img698.gif img84.gif
+   #+end_src
+
+   To convert one =.gif= into =.jpg= format:
+
+   #+begin_src bash
+$ time convert img1.gif img1.jpg
+real 0m0.008s
+user 0m0.000s
+sys 0m0.000s
+   #+end_src
+
+** GNU Parallel: example
+
+*** Sequential execution
+
+   #+begin_src bash
+$ time for i in {1..1011}
+> do
+>   convert img${i}.gif img${i}.jpg
+> done
+real 0m7.936s
+user 0m0.210s
+sys 0m0.270s
+   #+end_src
+
+*** Parallel execution on a quad-core:
+
+   #+begin_src bash
+$ time parallel convert {.}.gif {.}.jpg ::: *.gif
+real 0m2.051s
+user 0m1.000s
+sys 0m0.540s
+   #+end_src
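+
+** GNU Parallel: the scheduling model
+
+   GNU parallel schedules jobs dynamically: each worker takes the next pending
+   job as soon as it becomes idle, which is exactly the work-stealing queue
+   described earlier. A minimal sketch of the same idea with Python's
+   =multiprocessing= (not GNU parallel's actual implementation; the grid
+   values and the =energy= placeholder are illustrative):
+
+#+begin_src python
+from multiprocessing import Pool
+
+def energy(args):
+    r1, r2, theta = args
+    # placeholder for the real computation (e.g. a call to run_h2o.sh)
+    return (r1, r2, theta, -(r1 + r2 + theta / 100.0))
+
+# the queue of grid points
+queue = [(r1, r2, theta) for r1 in (0.8, 0.9, 1.0)
+                         for r2 in (0.8, 0.9, 1.0)
+                         for theta in (100., 104., 108.)]
+
+if __name__ == '__main__':
+    with Pool(processes=4) as pool:
+        # imap_unordered hands a new task to a worker as soon as it is idle
+        results = sorted(pool.imap_unordered(energy, queue))
+    for r in results:
+        print(r)
+#+end_src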
+
+** Computing the CCSD(T) surface
+
+   *1. Fetch the energy in an output file*
+
+   Running a CCSD(T) calculation with GAU$$IAN gives the energy
+   somewhere in the output:
+   #+begin_example
+ CCSD(T)= -0.76329294074D+02
+   #+end_example
+
+   To extract only the energy from the output, we can use the following
+   command:
+
+   #+begin_src bash
+$ g09 < input | grep "^ CCSD(T)=" | cut -d "=" -f 2
+-0.76329294074D+02
+   #+end_src
+
+
+** Computing the CCSD(T) surface
+
+   *2. Script =run_h2o.sh= that takes $r_1$, $r_2$ and $\theta$ as arguments*
+
+   #+LATEX: \begin{columns}
+   #+LATEX: \begin{column}{0.6\textwidth}
+   #+begin_src bash
+#!/bin/bash
+
+r1=$1 ; r2=$2 ; angle=$3
+
+# Create the Gaussian input file, pipe it to Gaussian,
+# and extract the CCSD(T) energy
+cat << EOF | g09 | grep "^ CCSD(T)=" | cut -d "=" -f 2
+# CCSD(T)/cc-pVTZ
+
+Water molecule r1=${r1} r2=${r2} angle=${angle}
+
+0 1
+h
+o 1 ${r1}
+h 2 ${r2} 1 ${angle}
+
+
+EOF
+   #+end_src
+   #+LATEX: \end{column}
+   #+LATEX: \begin{column}{0.4\textwidth}
+Example:
+
+#+begin_src text
+$ ./run_h2o.sh 1.08 1.08 104.
+-0.76310788178D+02
+
+$ ./run_h2o.sh 0.98 1.0 100.
+-0.76330291742D+02
+#+end_src
+   #+LATEX: \end{column}
+   #+LATEX: \end{columns}
+
+** Computing the CCSD(T) surface
+
+   *3. Files containing the parameters*
+
+   #+LATEX: \begin{columns}
+   #+LATEX: \begin{column}{0.5\textwidth}
+   #+begin_src text
+$ cat r1_file
+ 0.75
+ 0.80
+ 0.85
+ 0.90
+ 0.95
+ 1.00
+   #+end_src
+   #+LATEX: \end{column}
+   #+LATEX: \begin{column}{0.5\textwidth}
+   #+begin_src text
+$ cat angle_file
+ 100.
+ 101.
+ 102.
+ 103.
+ 104.
+ 105.
+ 106.
+   #+end_src
+   #+LATEX: \end{column}
+   #+LATEX: \end{columns}
+
+   #+LATEX: \begin{columns}
+   #+LATEX: \begin{column}{0.5\textwidth}
+   #+begin_src text
+$ cat nodefile
+ 2//usr/bin/ssh compute-0-10.local
+ 2//usr/bin/ssh compute-0-6.local
+ 16//usr/bin/ssh compute-0-12.local
+ 16//usr/bin/ssh compute-0-5.local
+ 16//usr/bin/ssh compute-0-7.local
+ 6//usr/bin/ssh compute-0-1.local
+ 2//usr/bin/ssh compute-0-13.local
+ 4//usr/bin/ssh compute-0-8.local
+   #+end_src
+   #+LATEX: \end{column}
+   #+LATEX: \begin{column}{0.5\textwidth}
+   =nodefile= gives, for each machine, the number of CPUs to use and the ssh
+   command to reach it.
+   #+LATEX: \end{column}
+   #+LATEX: \end{columns}
+
+** Computing the CCSD(T) surface
+   *4. Run with GNU parallel*
+
+   On a single CPU (the same grid file is reused for both $r_1$ and $r_2$):
+   #+begin_src text
+$ time parallel -a r1_file -a r1_file -a angle_file \
+    --keep-order --tag -j 1 $PWD/run_h2o.sh
+0.75 0.75 100. -0.76185942070D+02
+0.75 0.75 101. -0.76186697072D+02
+0.75 0.75 102. -0.76187387594D+02
+[...]
+0.80 1.00 106. -0.76294078963D+02
+0.85 0.75 100. -0.76243282762D+02
+0.85 0.75 101. -0.76243869316D+02
+[...]
+1.00 1.00 105. -0.76329165017D+02
+1.00 1.00 106. -0.76328988177D+02
+
+real 15m5.293s
+user 11m25.679s
+sys 2m20.194s
+   #+end_src
+
+** Computing the CCSD(T) surface
+   *4. Run with GNU parallel*
+
+   On 64 CPUs: $39\times$ faster! ($39/64 \approx 61\%$ parallel efficiency)
+
+   #+begin_src text
+$ time parallel -a r1_file -a r1_file -a angle_file \
+    --keep-order --tag --sshloginfile nodefile $PWD/run_h2o.sh
+0.75 0.75 100. -0.76185942070D+02
+0.75 0.75 101. -0.76186697072D+02
+0.75 0.75 102. -0.76187387594D+02
+[...]
+0.80 1.00 106. -0.76294078963D+02
+0.85 0.75 100. -0.76243282762D+02
+0.85 0.75 101. -0.76243869316D+02
+[...]
+1.00 1.00 105. -0.76329165017D+02
+1.00 1.00 106. -0.76328988177D+02
+
+real 0m23.848s
+user 0m3.359s
+sys 0m3.172s
+   #+end_src
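+
+** Computing the CCSD(T) surface
+   *5. Sort the results for plotting*
+
+   The =--tag= output above is plain text with one grid point per line, so
+   collecting the surface for plotting is a small post-processing step.
+   A hypothetical sketch (Python; the file name =pes.dat= is an assumption):
+
+#+begin_src python
+# Parse lines of the form "r1 r2 angle energy", convert the Fortran
+# 'D' exponent used by Gaussian into 'E', and sort the grid for plotting.
+points = []
+with open("pes.dat") as f:           # saved output of the parallel run
+    for line in f:
+        r1, r2, angle, e = line.split()
+        points.append((float(r1), float(r2), float(angle),
+                       float(e.replace("D", "E"))))
+points.sort()                        # sorted by (r1, r2, angle)
+#+end_src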
+
+** Amdahl's law
+
+*** Definition
+   If $P$ is the proportion of a program that can be made parallel, the
+   maximum speedup that can be achieved is:
+   #+LATEX: \begin{columns}
+   #+LATEX: \begin{column}{0.1\textwidth}
+   \[
+   S_{\max} = \frac{1}{(1-P)+P/n}
+   \]
+   #+LATEX: \end{column}
+   #+LATEX: \begin{column}{0.8\textwidth}
+   - $S_{\max}$ :: Maximum speedup
+   - $P$ :: Proportion of the program that can be made parallel
+   - $n$ :: Number of cores
+   #+LATEX: \end{column}
+   #+LATEX: \end{columns}
+
+*** Example
+
+   #+LATEX: \begin{columns}
+   #+LATEX: \begin{column}{0.4\textwidth}
+   #+ATTR_LATEX: :width 0.8\textwidth
+   [[./Amdahl2.pdf]]
+   #+LATEX: \end{column}
+   #+LATEX: \begin{column}{0.6\textwidth}
+   - Max speedup: 100$\times$
+   - Perfect scaling needs *all* of the program to be parallelized (difficult)
+   #+LATEX: \end{column}
+   #+LATEX: \end{columns}
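+
+** Amdahl's law
+*** Numerical check
+   A minimal sketch (Python) of the formula above. $P = 0.99$ is an assumption
+   chosen to match the $100\times$ maximum of the example: even with an
+   infinite number of cores, the speedup saturates at $1/(1-P) = 100$.
+
+#+begin_src python
+def amdahl(p, n):
+    """Maximum speedup for a parallel fraction p on n cores."""
+    return 1.0 / ((1.0 - p) + p / n)
+
+for n in (10, 100, 1000, 10**6):
+    print(n, round(amdahl(0.99, n), 1))
+# 10      9.2
+# 100     50.3
+# 1000    91.0
+# 1000000 100.0   -> saturates at 1/(1-P) = 100
+#+end_src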
+
+** Experiment
+*** The Human Parallel Machine
+   - Sort a deck of cards
+   - Do it as fast as you can
+   - I did it alone in 5'25'' (325 seconds)
+   - Each one of you is a /human compute node/
+   - How fast can all of you sort the same deck?
+
+** Disappointing result
+
+   There were many more of you than me, but you were not many times faster!
+
+   $\Longrightarrow$ Try the merge sort
+
+** Merge sort
+   #+ATTR_LATEX: :height 0.9\textheight
+   [[file:merge.png]]
+** Merge sort
+   #+ATTR_LATEX: :height 0.9\textheight
+   [[file:merge_parallel.png]]
+** Better, but still not perfect
+
+   - Moving the cards around the room takes time (communication)
+   - Sorting the sub-piles is very fast (computation)
+
+   The algorithm is *bounded by communication*: difficult to scale
+
+   #+LATEX: \begin{exampleblock}{If the assignment was}
+   - Consider the function $f(a,b,x) = a\,x^5 - \frac{b\,x^3}{a}$
+   - Each card has a value for $a$, $b$ and $x$
+   - Evaluate $f(a,b,x)$ for each card and write the result on the card
+   - Sort the results
+   #+LATEX: \end{exampleblock}
+
+   Same communication pattern, but more computational effort
+   $\Longrightarrow$ better scaling.
+
+   #+LATEX: \begin{alertblock}{Important}
+   The difficulty is /data movement/ (communication), not /computation/
+   #+LATEX: \end{alertblock}
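+
+** Merge sort in code
+
+   A minimal sketch (Python) of the parallel merge sort from the figure: the
+   two sub-piles are sorted by two workers (computation), then merged by one
+   (communication). The deck is illustrative.
+
+#+begin_src python
+from multiprocessing import Pool
+import random
+
+def merge(left, right):
+    """Merge two sorted piles into one (the communication step)."""
+    out, i, j = [], 0, 0
+    while i < len(left) and j < len(right):
+        if left[i] <= right[j]:
+            out.append(left[i]); i += 1
+        else:
+            out.append(right[j]); j += 1
+    return out + left[i:] + right[j:]
+
+if __name__ == '__main__':
+    deck = random.sample(range(52), 52)   # a shuffled deck of cards
+    half = len(deck) // 2
+    with Pool(2) as pool:                 # two "human compute nodes"
+        left, right = pool.map(sorted, [deck[:half], deck[half:]])
+    print(merge(left, right))
+#+end_src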
+
+
 ** Data movement
 
 * OpenMP
@@ -252,11 +678,226 @@ but every problem is not parallelizable at all levels.
+* Figures :noexport:
+
+  #+BEGIN_SRC dot :output file :file merge.png
+digraph G {
+  graph [layout=dot rankdir=TB]
+  node [shape=record]
+
+  subgraph clusterA {
+    label="A"
+    node37641025 [label="3|7|6|4|1|0|2|5"];
+    {rank=same ; node3764 [label="3|7|6|4"] ; node1025 [label="1|0|2|5"]}
+    {rank=same ; node37 [label="3|7"] ; node64 [label="6|4"]}
+    {rank=same ; node10 [label="1|0"] ; node25 [label="2|5"]}
+    {rank=same ; node3 [label="3"] ; node7 [label="7"]}
+    {rank=same ; node6 [label="6"] ; node4 [label="4"]}
+    {rank=same ; node1 [label="1"] ; node0 [label="0"]}
+    {rank=same ; node2 [label="2"] ; node5 [label="5"]}
+    {rank=same ; node37_ [label="3|7"] ; node46_ [label="4|6"]}
+    {rank=same ; node01_ [label="0|1"] ; node25_ [label="2|5"]}
+    {rank=same ; node3467_ [label="3|4|6|7"] ; node0125_ [label="0|1|2|5"]}
+    node01234567_ [label="0|1|2|3|4|5|6|7"];
+  }
+  node37641025 -> node3764;
+  node3764 -> node37;
+  node37 -> node3;
+  node37 -> node7;
+  node3764 -> node64;
+  node64 -> node6;
+  node64 -> node4;
+  node37641025 -> node1025;
+  node1025 -> node10;
+  node10 -> node1;
+  node10 -> node0;
+  node1025 -> node25;
+  node25 -> node2;
+  node25 -> node5;
+
+  node3 -> node37_;
+  node7 -> node37_;
+  node6 -> node46_;
+  node4 -> node46_;
+  node1 -> node01_;
+  node0 -> node01_;
+  node2 -> node25_;
+  node5 -> node25_;
+  node37_ -> node3467_;
+  node46_ -> node3467_;
+  node01_ -> node0125_;
+  node25_ -> node0125_;
+  node0125_ -> node01234567_;
+  node3467_ -> node01234567_;
+}
+  #+END_SRC
+
+  #+RESULTS:
+
+  #+BEGIN_SRC dot :output file :file merge_parallel.png
+digraph G {
+  graph [layout=dot rankdir=TB]
+  node [shape=record]
+
+  node37641025 [label="3|7|6|4|1|0|2|5"];
+  node3764 [label="3|7|6|4"] ; node1025 [label="1|0|2|5"]
+  node37 [label="3|7"] ; node64 [label="6|4"]
+  node10 [label="1|0"] ; node25 [label="2|5"]
+  node3 [label="3"] ; node7 [label="7"]
+  node6 [label="6"] ; node4 [label="4"]
+  node1 [label="1"] ; node0 [label="0"]
+  node2 [label="2"] ; node5 [label="5"]
+  node37_ [label="3|7"] ; node46_ [label="4|6"]
+  node01_ [label="0|1"] ; node25_ [label="2|5"]
+  node3467_ [label="3|4|6|7"] ; node0125_ [label="0|1|2|5"]
+  node01234567_ [label="0|1|2|3|4|5|6|7"];
+
+  subgraph clusterA {
+    label="A"
+    node37641025;
+    node3764;
+    node37;
+    node3;
+    node7;
+    node37_;
+    node3467_;
+    node01234567_;
+  }
+
+  subgraph clusterB {
+    label="B";
+    node1025;
+    node10;
+    node1;
+    node0;
+    node01_;
+    node0125_;
+  }
+
+  subgraph clusterC {
+    label="C";
+    node64;
+    node6;
+    node4;
+    node46_;
+  }
+
+  subgraph clusterD {
+    label="D";
+    node25;
+    node2;
+    node5;
+    node25_;
+  }
+
+  node37641025 -> node3764;
+  node3764 -> node37;
+  node37 -> node3;
+  node37 -> node7;
+  node3764 -> node64;
+  node64 -> node6;
+  node64 -> node4;
+  node37641025 -> node1025;
+  node1025 -> node10;
+  node10 -> node1;
+  node10 -> node0;
+  node1025 -> node25;
+  node25 -> node2;
+  node25 -> node5;
+
+  node3 -> node37_;
+  node7 -> node37_;
+  node6 -> node46_;
+  node4 -> node46_;
+  node1 -> node01_;
+  node0 -> node01_;
+  node2 -> node25_;
+  node5 -> node25_;
+  node37_ -> node3467_;
+  node46_ -> node3467_;
+  node01_ -> node0125_;
+  node25_ -> node0125_;
+  node0125_ -> node01234567_;
+  node3467_ -> node01234567_;
+}
+  #+END_SRC
+
+  #+RESULTS:
+
+* Exam questions :noexport:
+
+
+** Question 1
+   Consider a matrix multiplication. Which one of the following can /not/ be
+   executed safely in parallel?
+
+   a)
+   #+begin_src f90
+!$OMP PARALLEL DO PRIVATE(i,j,k) SHARED(A,B,C)
+do j=1,n
+  do i=1,n
+    do k=1,n
+      C(i,j) = C(i,j) + A(i,k) * B(k,j)
+    end do
+  end do
+end do
+!$OMP END PARALLEL DO
+   #+end_src
+
+   b)
+   #+begin_src f90
+!$OMP PARALLEL PRIVATE(i,j,k) SHARED(A,B,C)
+!$OMP DO
+do j=1,n
+  do i=1,n
+    do k=1,n
+      C(i,j) = C(i,j) + A(i,k) * B(k,j)
+    end do
+  end do
+end do
+!$OMP END DO
+!$OMP END PARALLEL
+   #+end_src
+
+   c)
+   #+begin_src f90
+!$OMP PARALLEL PRIVATE(i,j,k) SHARED(A,B,C)
+do j=1,n
+  !$OMP DO
+  do i=1,n
+    do k=1,n
+      C(i,j) = C(i,j) + A(i,k) * B(k,j)
+    end do
+  end do
+  !$OMP END DO
+end do
+!$OMP END PARALLEL
+   #+end_src
+
+   d)
+   #+begin_src f90
+!$OMP PARALLEL PRIVATE(i,j,k) SHARED(A,B,C)
+do j=1,n
+  do i=1,n
+    !$OMP DO
+    do k=1,n
+      C(i,j) = C(i,j) + A(i,k) * B(k,j)
+    end do
+    !$OMP END DO
+  end do
+end do
+!$OMP END PARALLEL
+   #+end_src
+
+** Question 2
+** Question 3
+** Question 4
 * Export :noexport:
 
 #+BEGIN_SRC elisp :output none
 (setq org-latex-listings 'minted)
 (setq org-latex-custom-lang-environments
  '(
+   (f90 "fortran")
   ))
 (setq org-latex-minted-options
@@ -274,65 +915,4 @@
 : /home/scemama/MEGA/TEX/Cours/TCCM/TCCM2022/Parallelism/parallelism_scemama.pdf
 
-* Figures :noexport:
-
-  #+BEGIN_SRC dot :output file :file interfaces.png
-digraph G {
-  QP [label="Quantum Package"];
-  DM [label="Density matrices"];
-  QMCCHEM [label="QMC=Chem"];
-  Turbo [label="TurboRVB"];
-  QP -> FCIDUMP;
-  FCIDUMP -> NECI;
-  NECI -> DM [style="dotted"];
-  NECI -> QMCCHEM [style="dotted"] ;
-  QP -> QMCCHEM;
-  QP -> CHAMP;
-  QP -> DM [style="dotted"];
-  QP -> Turbo [style="dotted"];
-  NECI -> Turbo [style="dotted"];
-  NECI -> CHAMP [style="dotted"];
-  QMCCHEM -> DM [style="dotted"];
-  CHAMP -> DM [style="dotted"];
-  Turbo -> DM [style="dotted"];
-  DM -> GammCor;
-
-  QP -> QML [style="dotted"];
-  NECI -> QML [style="dotted"];
-  QMCCHEM -> QML [style="dotted"];
-  CHAMP -> QML [style="dotted"];
-  Turbo -> QML [style="dotted"];
-  GammCor -> QML [style="dotted"];
-}
-  #+END_SRC
-  #+RESULTS:
-  [[file:interfaces.png]]
-
-  #+BEGIN_SRC dot :output file :file interfaces2.png
-digraph G {
-  layout=circo;
-  QP [label="Quantum Package"];
-  QMCCHEM [label="QMC=Chem"];
-  Turbo [label="TurboRVB"];
-  TREX [label="TREX Format"];
-  CHAMP -> TREX;
-  GammCor -> TREX;
-  NECI -> TREX;
-  QMCCHEM -> TREX;
-  QML -> TREX;
-  QP -> TREX;
-  Turbo -> TREX;
-
-  TREX -> CHAMP;
-  TREX -> GammCor;
-  TREX -> NECI;
-  TREX -> QMCCHEM;
-  TREX -> QML;
-  TREX -> QP;
-  TREX -> Turbo;
-}
-  #+END_SRC
-  #+RESULTS:
-  [[file:interfaces2.png]]
-
diff --git a/pes.png b/pes.png
new file mode 100644
index 0000000..a8c3c2d
Binary files /dev/null and b/pes.png differ
diff --git a/tgcc.jpg b/tgcc.jpg
index 3a46969..f60d1ec 100644
Binary files a/tgcc.jpg and b/tgcc.jpg differ
diff --git a/top500_21.png b/top500_21.png
new file mode 100644
index 0000000..481af6b
Binary files /dev/null and b/top500_21.png differ