From daa408fc9ca22ad0970ab719a4a2721e7ff7565a Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Wed, 5 Feb 2020 23:32:55 +0100 Subject: [PATCH] Improved parallelism in F12 --- CI/CI.ml | 3 +- CI/F12CI.ml | 18 +++++++++--- MOBasis/HF12.ml | 77 ++++++++++++++++++++++++++++++++++++++++++++++++- Utils/Matrix.ml | 40 +++++++++++++++---------- Utils/Util.ml | 1 + 5 files changed, 117 insertions(+), 22 deletions(-) diff --git a/CI/CI.ml b/CI/CI.ml index 2a87baa..14034f3 100644 --- a/CI/CI.ml +++ b/CI/CI.ml @@ -666,7 +666,8 @@ let make ?(n_states=1) ?(algo=`Direct) det_space = in let matrix_prod psi = let result = - Matrix.mm ~transa:`T m_H psi + Matrix.parallel_mm ~transa:`T psi m_H + |> Matrix.transpose in Parallel.broadcast (lazy result) in diff --git a/CI/F12CI.ml b/CI/F12CI.ml index d7dfa52..c1cf903 100644 --- a/CI/F12CI.ml +++ b/CI/F12CI.ml @@ -25,7 +25,7 @@ let eigensystem t = Lazy.force t.eigensystem let dressing_vector ~frozen_core hf12_integrals f12_amplitudes ci = if Parallel.master then - Printf.printf "Building matrix\n%!"; + Printf.printf "Building dressing\n%!"; let det_space = ci.CI.det_space in @@ -50,11 +50,20 @@ let dressing_vector ~frozen_core hf12_integrals f12_amplitudes ci = | 3 -> f_3 ki kj | _ -> assert false ) det_space - + in + + let m_HF = + Lazy.force m_HF in - Matrix.mm (Lazy.force m_HF) (Matrix.dense_of_mat f12_amplitudes) + let result = + Matrix.parallel_mm m_HF (Matrix.dense_of_mat f12_amplitudes) + in + if Parallel.master then + Printf.printf "dressing done\n%!"; + + Parallel.broadcast (lazy result) let sum l f = List.fold_left (fun accu i -> accu +. f i) 0. l @@ -134,7 +143,8 @@ Format.printf "%a@." Matrix.pp_matrix @@ Matrix.dense_of_mat psi; let matrix_prod c = let w = - Matrix.mm ~transa:`T m_H c + Matrix.mm ~transa:`T c m_H + |> Matrix.transpose |> Matrix.to_mat in let c = Matrix.to_mat c in diff --git a/MOBasis/HF12.ml b/MOBasis/HF12.ml index 90a786e..4fcd438 100644 --- a/MOBasis/HF12.ml +++ b/MOBasis/HF12.ml @@ -15,7 +15,7 @@ type t = { } -let sum l f = List.fold_left (fun accu i -> accu +. f i) 0. l +let sum l f = Array.fold_left (fun accu i -> accu +. f i) 0. l let array_3_init d1 d2 d3 fx = let f k = @@ -225,6 +225,7 @@ let make ~frozen_core ~simulation ~mo_basis ~aux_basis_filename () = let mos_cabs = Util.list_range (mo_num+1) aux_num + |> Array.of_list in let n_core = @@ -235,18 +236,21 @@ let make ~frozen_core ~simulation ~mo_basis ~aux_basis_filename () = let mos_in = Util.list_range (n_core+1) mo_num + |> Array.of_list in let mos_a k = Determinant.alfa k |> Spindeterminant.to_list |> List.filter (fun i -> i > n_core) + |> Array.of_list in let mos_b k = Determinant.beta k |> Spindeterminant.to_list |> List.filter (fun i -> i > n_core) + |> Array.of_list in let h_one = @@ -643,11 +647,13 @@ let make ~frozen_core ~simulation ~mo_basis ~aux_basis_filename () = let i = Spindeterminant.bitstring @@ Determinant.alfa ki in let j = Spindeterminant.bitstring @@ Determinant.alfa kj in Bitstring.to_list (Bitstring.logor i j) + |> Array.of_list in let beta = let i = Spindeterminant.bitstring @@ Determinant.beta ki in let j = Spindeterminant.bitstring @@ Determinant.beta kj in Bitstring.to_list (Bitstring.logor i j) + |> Array.of_list in match s with | Spin.Alfa -> alfa, beta @@ -659,11 +665,13 @@ let make ~frozen_core ~simulation ~mo_basis ~aux_basis_filename () = let i = Spindeterminant.bitstring @@ Determinant.alfa ki in let j = Spindeterminant.bitstring @@ Determinant.alfa kj in Bitstring.to_list (Bitstring.logand i j) + |> Array.of_list in let beta = let i = Spindeterminant.bitstring @@ Determinant.beta ki in let j = Spindeterminant.bitstring @@ Determinant.beta kj in Bitstring.to_list (Bitstring.logand i j) + |> Array.of_list in match s with | Spin.Alfa -> alfa, beta @@ -906,11 +914,13 @@ let make ~frozen_core ~simulation ~mo_basis ~aux_basis_filename () = let i = Spindeterminant.bitstring @@ Determinant.alfa ki in let j = Spindeterminant.bitstring @@ Determinant.alfa kj in Bitstring.to_list (Bitstring.logand i j) + |> Array.of_list in let beta = let i = Spindeterminant.bitstring @@ Determinant.beta ki in let j = Spindeterminant.bitstring @@ Determinant.beta kj in Bitstring.to_list (Bitstring.logand i j) + |> Array.of_list in match s with | Spin.Alfa -> alfa, beta @@ -922,11 +932,13 @@ let make ~frozen_core ~simulation ~mo_basis ~aux_basis_filename () = let i = Spindeterminant.bitstring @@ Determinant.alfa ki in let j = Spindeterminant.bitstring @@ Determinant.alfa kj in Bitstring.to_list (Bitstring.logor i j) + |> Array.of_list in let beta = let i = Spindeterminant.bitstring @@ Determinant.beta ki in let j = Spindeterminant.bitstring @@ Determinant.beta kj in Bitstring.to_list (Bitstring.logor i j) + |> Array.of_list in match s with | Spin.Alfa -> alfa, beta @@ -961,6 +973,10 @@ let make ~frozen_core ~simulation ~mo_basis ~aux_basis_filename () = in + let ncabs = Array.length mos_cabs in + let tmp_h = Mat.make0 ncabs 9 in + let tmp_f = Mat.make0 ncabs 9 in + let f_3 ki kj = let i, j, m, k, l, n, s1, s2, s3, phase = match Excitation.of_det ki kj with @@ -977,6 +993,7 @@ let make ~frozen_core ~simulation ~mo_basis ~aux_basis_filename () = match s1, s2, s3 with | Alfa, Alfa, Alfa | Beta, Beta, Beta -> + (* sum mos_cabs (fun a -> h_two a k i j s1 s2 *. f_two a m n l s3 s3) +. sum mos_cabs (fun a -> h_two a n i j s1 s2 *. f_two a m l k s2 s3) -. sum mos_cabs (fun a -> h_two a l i j s1 s2 *. f_two a m n k s3 s3) @@ -986,8 +1003,29 @@ let make ~frozen_core ~simulation ~mo_basis ~aux_basis_filename () = +. sum mos_cabs (fun a -> h_two a n j m s2 s3 *. f_two a i l k s2 s1) +. sum mos_cabs (fun a -> h_two a k j m s2 s3 *. f_two a i n l s3 s1) -. sum mos_cabs (fun a -> h_two a l j m s2 s3 *. f_two a i n k s3 s1) + *) + Array.iteri (fun idx a -> tmp_f.{idx+1,7} <- f_two a i l k s2 s1) mos_cabs; + Array.iteri (fun idx a -> tmp_f.{idx+1,5} <- f_two a j l k s2 s2) mos_cabs; + Array.iteri (fun idx a -> tmp_f.{idx+1,2} <- f_two a m l k s2 s3) mos_cabs; + Array.iteri (fun idx a -> tmp_f.{idx+1,9} <- f_two a i n k s3 s1) mos_cabs; + Array.iteri (fun idx a -> tmp_f.{idx+1,4} <- f_two a j n k s3 s2) mos_cabs; + Array.iteri (fun idx a -> tmp_f.{idx+1,3} <- f_two a m n k s3 s3) mos_cabs; + Array.iteri (fun idx a -> tmp_f.{idx+1,8} <- f_two a i n l s3 s1) mos_cabs; + Array.iteri (fun idx a -> tmp_f.{idx+1,6} <- f_two a j n l s3 s2) mos_cabs; + Array.iteri (fun idx a -> tmp_f.{idx+1,1} <- f_two a m n l s3 s3) mos_cabs; + Array.iteri (fun idx a -> tmp_h.{idx+1,3} <- if tmp_f.{idx+1,3} = 0. then 0. else -. h_two a l i j s1 s2) mos_cabs; + Array.iteri (fun idx a -> tmp_h.{idx+1,1} <- if tmp_f.{idx+1,1} = 0. then 0. else h_two a k i j s1 s2) mos_cabs; + Array.iteri (fun idx a -> tmp_h.{idx+1,2} <- if tmp_f.{idx+1,2} = 0. then 0. else h_two a n i j s1 s2) mos_cabs; + Array.iteri (fun idx a -> tmp_h.{idx+1,5} <- if tmp_f.{idx+1,5} = 0. then 0. else -. h_two a n i m s1 s3) mos_cabs; + Array.iteri (fun idx a -> tmp_h.{idx+1,6} <- if tmp_f.{idx+1,6} = 0. then 0. else -. h_two a k i m s1 s3) mos_cabs; + Array.iteri (fun idx a -> tmp_h.{idx+1,4} <- if tmp_f.{idx+1,4} = 0. then 0. else h_two a l i m s1 s3) mos_cabs; + Array.iteri (fun idx a -> tmp_h.{idx+1,9} <- if tmp_f.{idx+1,9} = 0. then 0. else -. h_two a l j m s2 s3) mos_cabs; + Array.iteri (fun idx a -> tmp_h.{idx+1,7} <- if tmp_f.{idx+1,7} = 0. then 0. else h_two a n j m s2 s3) mos_cabs; + Array.iteri (fun idx a -> tmp_h.{idx+1,8} <- if tmp_f.{idx+1,8} = 0. then 0. else h_two a k j m s2 s3) mos_cabs; + dot (Mat.as_vec tmp_h) (Mat.as_vec tmp_f) | Alfa, Alfa, Beta | Beta, Beta, Alfa -> + (* sum mos_cabs (fun a -> h_two a l i j s1 s2 *. f_two a m k n s1 s3) -. sum mos_cabs (fun a -> h_two a k i j s1 s2 *. f_two a m l n s1 s3) +. sum mos_cabs (fun a -> h_two a l m j s3 s2 *. f_two a i n k s3 s1) @@ -996,8 +1034,27 @@ let make ~frozen_core ~simulation ~mo_basis ~aux_basis_filename () = -. sum mos_cabs (fun a -> h_two a l m i s3 s1 *. f_two a j n k s3 s2) +. sum mos_cabs (fun a -> h_two a n j m s2 s3 *. f_two a i l k s2 s1) -. sum mos_cabs (fun a -> h_two a n i m s1 s3 *. f_two a j l k s2 s2) + *) + Array.iteri (fun idx a -> tmp_f.{idx+1,1} <- f_two a m k n s1 s3) mos_cabs; + Array.iteri (fun idx a -> tmp_f.{idx+1,2} <- f_two a m l n s1 s3) mos_cabs; + Array.iteri (fun idx a -> tmp_f.{idx+1,4} <- f_two a i n l s3 s1) mos_cabs; + Array.iteri (fun idx a -> tmp_f.{idx+1,5} <- f_two a j n l s3 s2) mos_cabs; + Array.iteri (fun idx a -> tmp_f.{idx+1,3} <- f_two a i n k s3 s1) mos_cabs; + Array.iteri (fun idx a -> tmp_f.{idx+1,6} <- f_two a j n k s3 s2) mos_cabs; + Array.iteri (fun idx a -> tmp_f.{idx+1,7} <- f_two a i l k s2 s1) mos_cabs; + Array.iteri (fun idx a -> tmp_f.{idx+1,8} <- f_two a j l k s2 s2) mos_cabs; + Array.iteri (fun idx a -> tmp_h.{idx+1,1} <- if tmp_f.{idx+1,1} = 0. then 0. else h_two a l i j s1 s2) mos_cabs; + Array.iteri (fun idx a -> tmp_h.{idx+1,2} <- if tmp_f.{idx+1,2} = 0. then 0. else -. h_two a k i j s1 s2) mos_cabs; + Array.iteri (fun idx a -> tmp_h.{idx+1,3} <- if tmp_f.{idx+1,3} = 0. then 0. else +. h_two a l m j s3 s2) mos_cabs; + Array.iteri (fun idx a -> tmp_h.{idx+1,4} <- if tmp_f.{idx+1,4} = 0. then 0. else -. h_two a k m j s3 s2) mos_cabs; + Array.iteri (fun idx a -> tmp_h.{idx+1,5} <- if tmp_f.{idx+1,5} = 0. then 0. else +. h_two a k m i s3 s1) mos_cabs; + Array.iteri (fun idx a -> tmp_h.{idx+1,6} <- if tmp_f.{idx+1,6} = 0. then 0. else -. h_two a l m i s3 s1) mos_cabs; + Array.iteri (fun idx a -> tmp_h.{idx+1,7} <- if tmp_f.{idx+1,7} = 0. then 0. else +. h_two a n j m s2 s3) mos_cabs; + Array.iteri (fun idx a -> tmp_h.{idx+1,8} <- if tmp_f.{idx+1,8} = 0. then 0. else -. h_two a n i m s1 s3) mos_cabs; + dot ~n:(8*ncabs) (Mat.as_vec tmp_h) (Mat.as_vec tmp_f) | Alfa, Beta, Beta | Beta, Alfa, Alfa -> + (* sum mos_cabs (fun a -> h_two a l i j s1 s2 *. f_two a m k n s1 s3) -. sum mos_cabs (fun a -> h_two a n i j s1 s2 *. f_two a m k l s1 s2) +. sum mos_cabs (fun a -> h_two a n i m s1 s3 *. f_two a j k l s1 s2) @@ -1006,6 +1063,24 @@ let make ~frozen_core ~simulation ~mo_basis ~aux_basis_filename () = -. sum mos_cabs (fun a -> h_two a l j m s2 s3 *. f_two a i n k s3 s1) +. sum mos_cabs (fun a -> h_two a k m i s3 s1 *. f_two a j n l s3 s2) -. sum mos_cabs (fun a -> h_two a k j i s2 s1 *. f_two a m n l s3 s2) + *) + Array.iteri (fun idx a -> tmp_f.{idx+1,6} <- f_two a i n k s3 s1) mos_cabs; + Array.iteri (fun idx a -> tmp_f.{idx+1,4} <- f_two a i l k s2 s1) mos_cabs; + Array.iteri (fun idx a -> tmp_f.{idx+1,5} <- f_two a j k n s1 s3) mos_cabs; + Array.iteri (fun idx a -> tmp_f.{idx+1,1} <- f_two a m k n s1 s3) mos_cabs; + Array.iteri (fun idx a -> tmp_f.{idx+1,2} <- f_two a m k l s1 s2) mos_cabs; + Array.iteri (fun idx a -> tmp_f.{idx+1,3} <- f_two a j k l s1 s2) mos_cabs; + Array.iteri (fun idx a -> tmp_f.{idx+1,7} <- f_two a j n l s3 s2) mos_cabs; + Array.iteri (fun idx a -> tmp_f.{idx+1,8} <- f_two a m n l s3 s2) mos_cabs; + Array.iteri (fun idx a -> tmp_h.{idx+1,1} <- if tmp_f.{idx+1,1} = 0. then 0. else h_two a l i j s1 s2) mos_cabs; + Array.iteri (fun idx a -> tmp_h.{idx+1,2} <- if tmp_f.{idx+1,2} = 0. then 0. else -. h_two a n i j s1 s2) mos_cabs; + Array.iteri (fun idx a -> tmp_h.{idx+1,3} <- if tmp_f.{idx+1,3} = 0. then 0. else +. h_two a n i m s1 s3) mos_cabs; + Array.iteri (fun idx a -> tmp_h.{idx+1,4} <- if tmp_f.{idx+1,4} = 0. then 0. else +. h_two a n j m s2 s3) mos_cabs; + Array.iteri (fun idx a -> tmp_h.{idx+1,5} <- if tmp_f.{idx+1,5} = 0. then 0. else -. h_two a l i m s1 s3) mos_cabs; + Array.iteri (fun idx a -> tmp_h.{idx+1,6} <- if tmp_f.{idx+1,6} = 0. then 0. else -. h_two a l j m s2 s3) mos_cabs; + Array.iteri (fun idx a -> tmp_h.{idx+1,7} <- if tmp_f.{idx+1,7} = 0. then 0. else +. h_two a k m i s3 s1) mos_cabs; + Array.iteri (fun idx a -> tmp_h.{idx+1,8} <- if tmp_f.{idx+1,8} = 0. then 0. else -. h_two a k j i s2 s1) mos_cabs; + dot ~n:(8*ncabs) (Mat.as_vec tmp_h) (Mat.as_vec tmp_f) | Beta, Alfa, Beta | Alfa, Beta, Alfa -> assert false in diff --git a/Utils/Matrix.ml b/Utils/Matrix.ml index 21e6133..9a764cc 100644 --- a/Utils/Matrix.ml +++ b/Utils/Matrix.ml @@ -364,7 +364,7 @@ let rec mm ?(transa=`N) ?(transb=`N) ?(threshold=epsilon) a b = if a <> 0. then begin for i = 1 to m' do - Bigarray.Array1.unsafe_set v i (f' i (k+1)); + v.{i} <- (f' i (k+1)); done; axpy ~alpha:a v accu end @@ -595,31 +595,39 @@ let rec ax_eq_b ?(trans=`N) a b = (* ------- Parallel routines ---------- *) -let parallel_mm ?(transa=`N) ?(transb=`N) ?(threshold=epsilon) a b = +let rec parallel_mm ?(transa=`N) ?(transb=`N) ?(threshold=epsilon) a b = - let n = + let m = match transa with | `N -> dim2 a | `T -> dim1 a in - let n = n / (Parallel.size * 7) in let b = match transb with | `T -> transpose b | `N -> b in - split_cols n b - |> Stream.of_list - |> Farm.run ~ordered:true ~f:(fun b -> - match a, b with - | Computed _, Computed _ -> - mm ~transa ~threshold a b - |> sparse_of_computed ~threshold - | _ -> - mm ~transa ~threshold a b - ) - |> Util.stream_to_list - |> join_cols + assert (m = dim1 b); + let n = 1 + (m / (Parallel.size * 7)) in + let chunks = + split_cols n b + in + + let result = + Stream.of_list chunks + |> Farm.run ~ordered:true ~f:(fun b -> + match a, b with + | Computed _, Computed _ -> + mm ~transa ~threshold a b + |> sparse_of_computed ~threshold + | _ -> + mm ~transa ~threshold a b + ) + |> Util.stream_to_list + |> join_cols + in + + result (* ------------ Printers ------------ *) diff --git a/Utils/Util.ml b/Utils/Util.ml index 3b33011..6744357 100644 --- a/Utils/Util.ml +++ b/Utils/Util.ml @@ -220,6 +220,7 @@ let list_range first last = let list_pack n l = + assert (n>=0); let rec aux i accu1 accu2 = function | [] -> if accu1 = [] then List.rev accu2