diff --git a/fork.png b/fork.png new file mode 100644 index 0000000..47132f4 Binary files /dev/null and b/fork.png differ diff --git a/parallelism_scemama.org b/parallelism_scemama.org index ad733fd..dcf03a7 100644 --- a/parallelism_scemama.org +++ b/parallelism_scemama.org @@ -8,7 +8,6 @@ #+BEAMER_THEME: trex #+LaTeX_HEADER: \usepackage{minted} #+LaTeX_HEADER: \usemintedstyle{emacs} -#+LaTeX_HEADER: \newminted{f90}{fontsize=\footnotesize} #+LaTeX_HEADER: \usepackage[utf8]{inputenc} #+LaTeX_HEADER: \usepackage[T1]{fontenc} #+LaTeX_HEADER: \usepackage{hyperref} @@ -23,48 +22,26 @@ #+startup: beamer #+options: H:2 -* Program :noexport: +* Program NOEXPORT :noexport: -** Mardi apres-midi - - Je peux leur faire - - paralleliser un produit de matrices avec OpenMP en Fortran/C - - calculer pi avec un algorithme Monte Carlo avec MPI (Master-Worker) d'abord en Python puis en Fortran/C. - - calculer une integrale dans R^3 sur une grille de points avec MPI en Fortran/C. - - Ca va juste leur donner les bases avec OMP DO et MPI_Reduce, et on ne pourra pas - aller beaucoup plus loin. Mais ils pourront quand meme faire tourner les 112 - coeurs du cluster. - - Attention: il faut aussi prevoir qu'ils n'auront peut-etre jamais utilise un - cluster, et ils ne savent probablement pas ce que c'est. Donc je vais devoir - inclure un peu de hardware dans ma presentation sur le parallelisme et leur - expliquer qu'il faut faire sbatch pour lancer un calcul. - - C'est bien slurm to gestionnaire de batch? - - -** Mercredi + +14:00-14:30 : Supercomputers +14:30-15:30 : Parallelism +15:45-17:30 : OpenMP/MPI +09:00-10:30 : Presentation IRP + QP Pour IRPF90, je peux faire une presentation assez generale. - +10:30-12:00 : TP IRPF90 J'ai deja un tutoriel pour ecrire un programme de dynamique moleculaire avec un potentiel Lennard-Jones, je pense que ce sera plus facile puisqu'il n'y a pas beaucoup de temps pour faire les TPs. - Si ils vont suffisamment vite, on peut ensuite basculer sur la parallelisation - OpenMP des boucles dans le code, et sur le lancement de plusieurs trajectoires - avec MPI en reprenant le modele de parallelisation de pi de la veille. +13:45-17:00 : TP Parallel + - calculer pi avec un algorithme Monte Carlo avec MPI (Master-Worker) d'abord en Python puis en Fortran/C. + - paralleliser un produit de matrices avec OpenMP en Fortran/C + - calculer une integrale dans R^3 sur une grille de points avec MPI en Fortran/C. - Pour QP, je me dis que ce serait bien de le le presenter en 15 minutes une fois - qu'ils ont fait le TP sur IRPF90. Ensuite on peut leur faire une demo sur comment - implementer un SCF en 10 minutes, mais je ne pense pas qu'on aura le temps de - leur faire faire le travail dans QP, et ca simplifie les problemes de - compilation sur les machines. On peut aussi leur donner acces au compte ou QP - est installe pour ceux qui vont tres vite et qui veulent essayer de jouer avec. - - -* Supercomputers +* Supercomputers :noexport: ** Computers @@ -271,9 +248,7 @@ - Software stack is not as mature as the CPU stack $\Longrightarrow$ needs hacking to get performance - -* Fundamentals of parallelization - +* Fundamentals of parallelization :noexport: ** Introduction @@ -292,14 +267,87 @@ All levels of parallelism can be exploited in the same code, but every problem is not parallelizable at all levels. 
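+
+   A minimal sketch (an illustration added here, not taken from the original
+   slides): the first loop below has independent iterations and could be
+   parallelized at any level, while the second is a recurrence in which each
+   iteration needs the previous result, so it cannot be parallelized as written.
+
+   #+begin_src python
+# Toy example: data-parallel loop vs. sequential recurrence
+x = [0.1*i for i in range(10)]
+
+# Independent iterations: each y[i] depends only on x[i],
+# so the work can be distributed over cores or nodes.
+y = [xi*xi for xi in x]
+
+# Recurrence: s[i] depends on s[i-1], so the iterations
+# must run in order unless the algorithm is changed.
+s = [0.0]*len(x)
+s[0] = x[0]
+for i in range(1, len(x)):
+    s[i] = s[i-1] + x[i]
+
+print(y)
+print(s)
+   #+end_src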
+** Amdahl's law + +*** Definition + If P is the proportion of a program that can be made parallel, the maximum speedup + that can be achieved is: + #+LATEX: \begin{columns} + #+LATEX: \begin{column}{0.1\textwidth} + \[ + S_{\max} = \frac{1}{(1-P)+P/n} + \] + #+LATEX: \end{column} + #+LATEX: \begin{column}{0.8\textwidth} + - $S$ :: : Speedup + - $P$ :: : Proportion of a program that can be made parallel + - $n$ :: : Number of cores + #+LATEX: \end{column} + #+LATEX: \end{columns} + +*** Example + + #+LATEX: \begin{columns} + #+LATEX: \begin{column}{0.4\textwidth} + #+ATTR_LATEX: :width 0.8 \textwidth + [[./Amdahl2.pdf]] + #+LATEX: \end{column} + #+LATEX: \begin{column}{0.6\textwidth} + - Max speedup : 100$\times$ + - Perfect scaling needs *all* the program to be parallelized (difficult) + #+LATEX: \end{column} + #+LATEX: \end{columns} ** Outline - 1. Compute a PES with GNU parallel (Bash) - 2. Compute $\pi$ with a Monte Carlo algorithm (Python, sockets) - 3. Compute $\pi$ with a deterministic algorithm (Python, MPI) - 4. Matrix multiplication (Fortran, OpenMP) + 1. Human parallel machine + 2. Compute a PES with GNU parallel (Bash) + 3. Compute $\pi$ with a Monte Carlo algorithm (Python, sockets) + 4. Compute $\pi$ with a deterministic algorithm (Python, MPI) + 5. Matrix multiplication (Fortran, OpenMP) -** Problem 1: Potential energy surface +** Experiment +*** The Human Parallel Machine + - Sort a deck of cards + - Do it as fast as you can + - I did it alone in 5'25'' (325 seconds) + - Each one of you is a /human compute node/ + - How fast can all of you sort the same deck? +** Disappointing result + + You were many more people than me, but you didn't go many times faster ! + + $\Longrightarrow$ Try the Merge sort + +** Merge sort + #+ATTR_LATEX: :height 0.9\textheight + [[file:merge.png]] +** Merge sort + #+ATTR_LATEX: :height 0.9\textheight + [[file:merge_parallel.png]] +** Better, but still not perfect + + - Moving the cards around the room takes time (communication) + - Sorting the sub-piles is super-fast (computation) + + Algorithm is *bounded by communication* : Difficult to scale + + #+LATEX: \begin{exampleblock}{If the assignment was} + - Consider the function $f(a,b,x) = a\,x^5 - \frac{b\,x^3}{a}$ + - Each card has a value for $a$, $b$ and $x$ + - Evaluate $f(a,b,x)$ for each card and write the result on the card + - Sort the results + #+LATEX: \end{exampleblock} + + Same communication pattern, but more computational effort $\Longrightarrow$ better scaling. + + #+LATEX: \begin{alertblock}{Take-home messages} + - Parallel workloads may require different algorithms than scalar workloads + - Difficulty is /data movement/ (communication), not /computation/ + #+LATEX: \end{alertblock} + +* Embarrassingly parallel computations :noexport: + +** Example: Potential energy surface We want to create the CCSD(T) potential energy surface of the water molecule. #+LATEX: \begin{columns} @@ -311,7 +359,7 @@ but every problem is not parallelizable at all levels. #+LATEX: \end{column} #+LATEX: \end{columns} -** Problem 1: Potential energy surface +** Example: Potential energy surface *** Constraints - We want to compute $25 \times 25 \times 25 = 15~625$ points @@ -323,7 +371,7 @@ but every problem is not parallelizable at all levels. 
- The grid points are completely independent - Any CPU core can calculate any point -** Problem 1: Potential energy surface +** Example: Potential energy surface *** Optimal solution: /work stealing/ - One grid point is $E(r_1,r_2,\theta)$ @@ -362,7 +410,6 @@ B C B D #+end_src - ** GNU parallel If no command is given after parallel the arguments are treated as commands: @@ -461,7 +508,6 @@ $ g09 < input | grep "^ CCSD(T)=" | cut -d "=" -f 2 -0.76329294074D+02 #+end_src - ** Computing the CCSD(T) surface *2. Script =run_h2o.sh= that takes $r_1$, $r_2$ and $\theta$ as arguments* @@ -599,85 +645,376 @@ user 0m3.359s sys 0m3.172s #+end_src -** Amdahl's law +* Inter-process communication + +** Processes /vs/ threads + +*** Process + - Has its own memory address space + - Context switching between processes is slow + - Processes interact only through system-provided communication mechanisms + - Fork: creates a copy of the current process + - Exec: switches to running another binary executable + - Spawn: =Fork=, then =exec= the child +*** Thread + - Exist as subsets of a process + - Context switching between threads is fast + - Share the same memory address space : interact via shared memory + +** Inter-process communication + + - Processes exchange data via read/write on /File descriptors/ + - These file descriptors can point to: + - Files on disk: Simplest choice + - Named Pipes: Same program as with files + - Pipes: Same behavior as files + - Network sockets + +** Named pipes + A named pipe is a *virtual* file which is read by a process and + written by other processes. It allows processes to communicate using + standard I/O operations: + #+ATTR_LATEX: :height 0.7\textheight + [[./pipe.png]] + +** Example + #+LATEX: \begin{columns} + #+LATEX: \begin{column}{0.5\textwidth} + #+ATTR_LATEX: :height 0.8\textheight + [[./pipe_example.png]] + #+LATEX: \end{column} + #+LATEX: \begin{column}{0.5\textwidth} + - Unzip =input.gz= + - Sort the unzipped file + - Zip the result into =output.gz= + + Same as + #+begin_src bash +$ gunzip --to-stdout input.gz \ + | sort \ + | gzip > output.gz + #+end_src + #+LATEX: \end{column} + #+LATEX: \end{columns} + +** Example: Program 1 + + #+LATEX: \begin{columns} + #+LATEX: \begin{column}{0.5\textwidth} + #+begin_src bash +#!/bin/bash + +# Create two pipes using the mkfifo command +mkfifo /tmp/pipe /tmp/pipe2 + +# Unzip the input file and write the result +# in the 1st pipe +echo "Run gunzip" +gunzip --to-stdout input.gz > /tmp/pipe + +# Zip what comes from the second pipe +echo "Run gzip" +gzip < /tmp/pipe2 > output.gz + +# Clear the pipes in the filesystem +rm /tmp/pipe /tmp/pipe2 + #+end_src + #+LATEX: \end{column} + #+LATEX: \begin{column}{0.5\textwidth} + #+begin_src bash +#!/bin/bash + +# Read the 1st pipe, sort the result and write +# in the 2nd pipe +echo "Run sort" +sort < /tmp/pipe > /tmp/pipe2 + #+end_src + + #+LATEX: \begin{exampleblock}{Execution} + #+begin_src text +$ ./p1.sh & +Run gunzip +$ ./p2.sh +Run sort +Run gzip +[1]+ Done +./p1.sh + #+end_src + #+LATEX: \end{exampleblock} + #+LATEX: \end{column} + #+LATEX: \end{columns} + +** Pipe + #+LATEX: \begin{columns} + #+LATEX: \begin{column}{0.6\textwidth} + *Fork* : Copy the current process + + Pipes don't have an entry on the file system, and are opened/closed in the programs. + 1. Create the pipe + 2. Fork in parent/child processes + 3. 
Exchange data between parent and child + #+LATEX: \end{column} + #+LATEX: \begin{column}{0.4\textwidth} + [[./fork.png]] + #+LATEX: \end{column} + #+LATEX: \end{columns} + +** Pipe + #+LATEX: \begin{columns} + #+LATEX: \begin{column}{0.6\textwidth} + #+begin_src python :tangle Codes/fork.py +#!/usr/bin/env python +import sys,os + +def main(): + print("Process ID: %d" % (os.getpid())) + r, w = os.pipe() + new_pid = os.fork() + + if new_pid != 0: # This is the parent process + print("I am the parent, my PID is %d"%(os.getpid())) + print("and the PID of my child is %d"%(new_pid)) + + # Close write and open read file descriptors + os.close(w) + r = os.fdopen(r,'r') + + # Read data from the child + print("Reading from the child") + s = r.read() + r.close() + print("Read '%s' from the child"%(s)) + + #+end_src + #+LATEX: \end{column} + #+LATEX: \begin{column}{0.4\textwidth} + [[./fork.png]] + #+LATEX: \end{column} + #+LATEX: \end{columns} + +** Pipe + #+LATEX: \begin{columns} + #+LATEX: \begin{column}{0.6\textwidth} + #+begin_src python :tangle Codes/fork.py + else: # This is the child process + print(" I am the child, my PID is %d"%(os.getpid())) + + # Close read and open write file descriptors + os.close(r) + w = os.fdopen(w,'w') + + # Send 'Hello' to the parent + print(" Sending 'Hello' to the parent") + w.write( "Hello!" ) + w.close() + + print(" Sent 'Hello'") + +if __name__ == "__main__": + main() + + #+end_src + #+LATEX: \end{column} + #+LATEX: \begin{column}{0.4\textwidth} + [[./fork.png]] + #+LATEX: \end{column} + #+LATEX: \end{columns} + +** Computation of $\pi$ + + #+LATEX: \begin{columns} + #+LATEX: \begin{column}{0.4\textwidth} + #+ATTR_LATEX: width=0.9\textwidth + [[./pi.png]] + #+LATEX: \end{column} + #+LATEX: \begin{column}{0.6\textwidth} + - The surface of the circle is $\pi r^2$ $\Longrightarrow$ For a unit circle, the + surface is $\pi$ + - The function in the red square is $y = \sqrt{1-x^2}$ (the circle is $\sqrt{x^2 + y^2} = 1$) + - The surface in grey corresponds to \[ \int_0^1 \sqrt{1-x^2} dx = \frac{\pi}{4} \] + + #+LATEX: \end{column} + #+LATEX: \end{columns} + +** Monte Carlo computation of $\pi$ + + #+LATEX: \begin{columns} + #+LATEX: \begin{column}{0.4\textwidth} + #+ATTR_LATEX: width=0.9\textwidth + [[./pi_mc.png]] + #+LATEX: \end{column} + #+LATEX: \begin{column}{0.6\textwidth} + - Points $(x,y)$ are drawn randomly in the unit square + - Count how many times the points are inside the circle + + \[ + \frac{N_{\text{in}}}{N_{\text{in}}+N_{\text{out}}} = \frac{\pi}{4} + \] + + #+LATEX: \end{column} + #+LATEX: \end{columns} + +** Optimal algorithm + - Each core $1 \le i \le M$ computes its own average $X_i$ + - All $M$ results are independent $\Longrightarrow$ + Gaussian-distributed random variables (central-limit theorem) + + #+ATTR_LATEX: :height 0.6\textheight + [[./pi_convergence.png]] + +** Computation of $\pi$ with pipes in Python + + #+begin_src python :tangle Codes/pi_python.py +import os, sys +from random import random, seed +from math import sqrt + +NMAX = 10000000 # Nb of MC steps/process +error_threshold = 1.0e-4 # Stopping criterion +NPROC=4 # Use 4 processes + +def compute_pi(): + """Local Monte Carlo calculation of pi""" + seed(None) # Initialize random number generator -*** Definition - If P is the proportion of a program that can be made parallel, the maximum speedup - that can be achieved is: - #+LATEX: \begin{columns} - #+LATEX: \begin{column}{0.1\textwidth} - \[ - S_{\max} = \frac{1}{(1-P)+P/n} - \] - #+LATEX: \end{column} - #+LATEX: 
\begin{column}{0.8\textwidth} - - $S$ :: : Speedup - - $P$ :: : Proportion of a program that can be made parallel - - $n$ :: : Number of cores - #+LATEX: \end{column} - #+LATEX: \end{columns} + result = 0. + for i in range(NMAX): # Loop NMAX times + x,y = random(), random() # Draw 2 random numbers x and y + if x*x + y*y <= 1.: # Check if (x,y) is in the circle + result += 1 + return 4.* float(result)/float(NMAX) # Estimate of pi + #+end_src -*** Example +** Computation of $\pi$ with pipes in Python - #+LATEX: \begin{columns} - #+LATEX: \begin{column}{0.4\textwidth} - #+ATTR_LATEX: :width 0.8 \textwidth - [[./Amdahl2.pdf]] - #+LATEX: \end{column} - #+LATEX: \begin{column}{0.6\textwidth} - - Max speedup : 100$\times$ - - Perfect scaling needs *all* the program to be parallelized (difficult) - #+LATEX: \end{column} - #+LATEX: \end{columns} - -** Experiment -*** The Human Parallel Machine - - Sort a deck of cards - - Do it as fast as you can - - I did it alone in 5'25'' (325 seconds) - - Each one of you is a /human compute node/ - - How fast can all of you sort the same deck? -** Disappointing result - - You were many more people than me, but you didn't go many times faster ! - - $\Longrightarrow$ Try the Merge sort - -** Merge sort - #+ATTR_LATEX: :height 0.9\textheight - [[file:merge.png]] -** Merge sort - #+ATTR_LATEX: :height 0.9\textheight - [[file:merge_parallel.png]] -** Better, but still not perfect - - - Moving the cards around the room takes time (communication) - - Sorting the sub-piles is super-fast (computation) - - Algorithm is *bounded by communication* : Difficult to scale - - #+LATEX: \begin{exampleblock}{If the assignment was} - - Consider the function $f(a,b,x) = a\,x^5 - \frac{b\,x^3}{a}$ - - Each card has a value for $a$, $b$ and $x$ - - Evaluate $f(a,b,x)$ for each card and write the result on the card - - Sort the results - #+LATEX: \end{exampleblock} - - Same communication pattern, but more computational effort $\Longrightarrow$ better scaling. - - #+LATEX: \begin{alertblock}{Important} - Difficulty is /data movement/ (communication), not /computation/ - #+LATEX: \end{alertblock} + #+begin_src python :tangle Codes/pi_python.py +def main(): + r = [None]*NPROC # Reading edges of the pipes + pid = [None]*NPROC # Running processes + for i in range(NPROC): + r[i], w = os.pipe() # Create the pipe + pid[i] = os.fork() # Fork and save the PIDs + + if pid[i] != 0: # This is the parent process + os.close(w) + r[i] = os.fdopen(r[i],'r') + else: # This is the child process + os.close(r[i]) + w = os.fdopen(w,'w') + while True: # Compute pi on this process + X = compute_pi() + try: + w.write("%f\n"%(X)) # Write the result in the pipe + w.flush() + except IOError: # Child process exits here + sys.exit(0) + #+end_src -** Data movement +** Computation of $\pi$ with pipes in Python -* OpenMP + #+begin_src python :tangle Codes/pi_python.py + data = [] + while True: + for i in range(NPROC): # Read in the pipe of each process + data.append( float(r[i].readline()) ) + N = len(data) + average = sum(data)/N # Compute average + if N > 2: # Compute variance + l = [ (x-average)*(x-average) for x in data ] + variance = sum(l)/(N-1.) + else: + variance = 0. 
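+
+        # Added note: the NPROC estimates are independent and, by the
+        # central-limit theorem, approximately Gaussian-distributed, so the
+        # statistical uncertainty on the running average is the standard
+        # error of the mean, sqrt(variance/N), computed below.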
+        error = sqrt(variance)/sqrt(N)          # Compute error
+        print("%f +/- %f  %d"%(average, error, N))
+
+        if N > 2 and error < error_threshold:   # Stopping condition
+            for i in range(NPROC):              # Kill all children
+                try: os.kill(pid[i],9)
+                except: pass
+            sys.exit(0)
+
+if __name__ == '__main__': main()
+   #+end_src

 * Message Passing Interface (MPI)
+* OpenMP
+* Exercises
+
+** Monte Carlo
+
+   1. Write a Fortran src_fortran{double precision function compute_pi(M)} that computes
+      $\pi$ with the Monte Carlo algorithm using $M$ samples
+   2. Call it like this:
+   #+begin_src fortran
+program pi_mc
+  implicit none
+  integer                    :: M
+  logical                    :: iterate
+  double precision           :: sample
+  double precision, external :: compute_pi
+
+  call random_seed()       ! Initialize random number generator
+  read (*,*) M             ! Read number of samples in compute_pi
+
+  iterate = .True.
+  do while (iterate)       ! Compute pi over M samples until 'iterate=.False.'
+     sample = compute_pi(M)
+     write(*,*) sample
+     read (*,*) iterate
+  end do
+end program pi_mc
+   #+end_src
+
+** Monte Carlo
+
+   3. Write a Fortran src_fortran{double precision function compute_pi(M)} that computes
+      $\pi$ with the Monte Carlo algorithm using $M$ samples
+   #+begin_src fortran
+program pi_mc
+  implicit none
+  integer                    :: M
+  logical                    :: iterate
+  double precision           :: sample
+  double precision, external :: compute_pi
+
+  call random_seed()       ! Initialize random number generator
+  read (*,*) M             ! Read number of samples in compute_pi
+
+  iterate = .True.
+  do while (iterate)       ! Compute pi over M samples until 'iterate=.False.'
+     sample = compute_pi(M)
+     write(*,*) sample
+     read (*,*) iterate
+  end do
+end program pi_mc
+   #+end_src
+
+** Monte Carlo (solution)
+
+   #+begin_src fortran
+double precision function compute_pi(M)
+  implicit none
+  integer, intent(in) :: M
+  double precision    :: x, y, n_in
+  integer             :: i
+
+  n_in = 0.d0
+  do i=1, M
+    call random_number(x)
+    call random_number(y)
+    if (x*x + y*y <= 1.d0) then
+       n_in = n_in+1.d0
+    end if
+  end do
+  compute_pi = 4.d0*n_in/dble(M)
+
+end function compute_pi
+
+   #+end_src

 * Figures                                                          :noexport:

   #+BEGIN_SRC dot :output file :file merge.png
@@ -832,7 +1169,7 @@ digraph G {
     in parallel?
a) - #+begin_src f90 + #+begin_src fortran !$OMP PARALLEL DO PRIVATE(i,j,k) SHARED(A,B,C) do j=1,n do i=1,n @@ -845,7 +1182,7 @@ digraph G { #+end_src b) - #+begin_src f90 + #+begin_src fortran !$OMP PARALLEL PRIVATE(i,j,k) SHARED(A,B,C) !$OMP DO do j=1,n @@ -860,7 +1197,7 @@ digraph G { #+end_src c) - #+begin_src f90 + #+begin_src fortran !$OMP PARALLEL PRIVATE(i,j,k) SHARED(A,B,C) do j=1,n !$OMP DO @@ -875,7 +1212,7 @@ digraph G { #+end_src d) - #+begin_src f90 + #+begin_src fortran !$OMP PARALLEL PRIVATE(i,j,k) SHARED(A,B,C) do j=1,n do i=1,n @@ -895,11 +1232,7 @@ digraph G { * Export :noexport: #+BEGIN_SRC elisp :output none (setq org-latex-listings 'minted) -(setq org-latex-custom-lang-environments - '( - - (f90 "fortran") - )) +(setq org-latex-custom-lang-environments nil) (setq org-latex-minted-options '(("frame" "lines") ("fontsize" "\\scriptsize") @@ -913,6 +1246,3 @@ digraph G { #+RESULTS: : /home/scemama/MEGA/TEX/Cours/TCCM/TCCM2022/Parallelism/parallelism_scemama.pdf - - - diff --git a/pi.png b/pi.png new file mode 100644 index 0000000..668ea50 Binary files /dev/null and b/pi.png differ diff --git a/pi_convergence.png b/pi_convergence.png new file mode 100644 index 0000000..f4cd5ee Binary files /dev/null and b/pi_convergence.png differ diff --git a/pi_mc.png b/pi_mc.png new file mode 100644 index 0000000..8db5339 Binary files /dev/null and b/pi_mc.png differ diff --git a/pipe.png b/pipe.png new file mode 100644 index 0000000..86c7bfd Binary files /dev/null and b/pipe.png differ diff --git a/pipe_example.png b/pipe_example.png new file mode 100644 index 0000000..403e459 Binary files /dev/null and b/pipe_example.png differ