dft_tools/triqs/arrays/blas_lapack/gemm.hpp

/*******************************************************************************
 *
 * TRIQS: a Toolbox for Research in Interacting Quantum Systems
 *
 * Copyright (C) 2012 by O. Parcollet
 *
 * TRIQS is free software: you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * TRIQS is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along with
 * TRIQS. If not, see <http://www.gnu.org/licenses/>.
 *
 ******************************************************************************/
#ifndef TRIQS_ARRAYS_BLAS_LAPACK_GEMM_H
#define TRIQS_ARRAYS_BLAS_LAPACK_GEMM_H
#include <complex>
#include "./tools.hpp"
#include "./qcache.hpp"

namespace triqs { namespace arrays { namespace blas { 
 
 using namespace blas_lapack_tools;
 namespace f77 { // overload

  // C-linkage declarations of the external dgemm (double) and zgemm
  // (std::complex<double>) routines. The actual symbol names are produced by
  // the TRIQS_FORTRAN_MANGLING macro, which is defined outside this file.
  // Argument order, following the names used by the gemm wrappers below :
  //   trans_a, trans_b, M, N, K, alpha, A, LDA, B, LDB, beta, C, LDC
  // The two flags are declared as char pointers and every scalar as a const
  // reference -- presumably to match the Fortran pass-by-reference calling
  // convention; confirm against the BLAS library actually linked.
  extern "C" { 
   void TRIQS_FORTRAN_MANGLING(dgemm) (char *, char *, const int & , const int & , const int & , const double &, 
     const double[], const int &, const double[], const int &, const double &, double[], const int & ); 
   void TRIQS_FORTRAN_MANGLING(zgemm) (char *, char *, const int & , const int & , const int & , const std::complex<double> &,
     const std::complex<double>[], const int &, const std::complex<double>[], const int &, const std::complex<double> &, std::complex<double>[], const int & );
  }

  // Overload for double : forwards every argument to the dgemm routine declared above.
  // The two transposition flags are received by value and handed over by address;
  // all other arguments are passed through untouched.
  inline void gemm (char trans_a, char trans_b,
    const int & M, const int & N, const int & K,
    const double & alpha,
    const double* A, const int & LDA,
    const double* B, const int & LDB,
    const double & beta,
    double* C, const int & LDC) {
   TRIQS_FORTRAN_MANGLING(dgemm)(&trans_a, &trans_b, M, N, K, alpha, A, LDA, B, LDB, beta, C, LDC);
  }

  // Shorthand for the complex scalar type used by the zgemm overload.
  typedef std::complex<double> dcomplex;

  // Overload for std::complex<double> : forwards every argument to the zgemm routine declared above.
  // The two transposition flags are received by value and handed over by address;
  // all other arguments are passed through untouched.
  inline void gemm (char trans_a, char trans_b,
    const int & M, const int & N, const int & K,
    const dcomplex & alpha,
    const dcomplex* A, const int & LDA,
    const dcomplex* B, const int & LDB,
    const dcomplex & beta,
    dcomplex* C, const int & LDC) {
   TRIQS_FORTRAN_MANGLING(zgemm)(&trans_a, &trans_b, M, N, K, alpha, A, LDA, B, LDB, beta, C, LDC);
  }
 }

 // Compile-time switch telling whether the product of an MT1 by an MT2, stored into an MTOut,
 // is dispatched to the BLAS gemm overload or to the generic one.
 //  - MTOut must be a matrix or a matrix_view : this is enforced by the static_assert.
 //  - value is true when is_blas_lapack_type accepts the value_type of MT1 and
 //    have_same_value_type holds for MT1, MT2 and MTOut.
 // The two factors are deliberately NOT required to be values or views (an earlier, stricter
 // condition asked for it) : e.g. for a lazy inverse it is ok, since gemm goes through a cache anyway.
 template<typename MT1, typename MT2, typename MTOut>
 struct use_blas_gemm {
  static_assert(is_amv_value_or_view_class<MTOut>::value, "output of matrix product must be a matrix or matrix_view");
  static constexpr bool value =
     is_blas_lapack_type<typename MT1::value_type>::value
     && have_same_value_type< MT1, MT2, MTOut>::value;
 };

 /**
  * BLAS-backed matrix product : forwards alpha, A, B, beta and C to f77::gemm
  * (C <- alpha * A * B + beta * C in the BLAS gemm convention).
  *
  * This overload is selected only when use_blas_gemm<MT1,MT2,MTOut>::value is true.
  *
  * - C is first resized to (first_dim(A), second_dim(B)) if it is a matrix, or checked if it is a view.
  * - A, B and C are accessed through qcache objects, which take care of making temporary copies if necessary.
  * - When C is stored in C order, the transposed product tC = tB tA is computed instead,
  *   hence the roles of A and B are exchanged in that branch.
  *
  * A TRIQS_RUNTIME_ERROR is emitted when second_dim(A) differs from first_dim(B).
  *
  * NOTE(review): nothing here protects against C aliasing A or B -- presumably the caller
  * has to guarantee that there is no aliasing; confirm.
  */
 template<typename MT1, typename MT2, typename MTOut> 
  typename std::enable_if< use_blas_gemm<MT1,MT2,MTOut>::value >::type
  gemm (typename MT1::value_type alpha, MT1 const & A, MT2 const & B, typename MT1::value_type beta, MTOut & C) { 
   // first resize if necessary and possible 
   resize_or_check_if_view(C,make_shape(first_dim(A),second_dim(B)));

   // now we use qcache instead of the matrix to make a copy if necessary ... 
   // not optimal : if stride == 1, N ---> use LDA parameters
   // change the condition in the qcache construction....
   reflexive_qcache<MTOut> Cc(C);

   if (C.memory_layout_is_c()) {
    // then tC = tB tA ! 
    const_qcache<MT1> Cb(A); // note the inversion  A <-> B
    const_qcache<MT2> Ca(B); // note the inversion  A <-> B
    // Because of the inversion, Cb holds A and Ca holds B : report each shape under the right label.
    if (!(first_dim(Ca()) == second_dim(Cb()))) TRIQS_RUNTIME_ERROR << "Dimension mismatch in gemm : A : "<< get_shape(Cb()) <<" while B : "<<get_shape(Ca());
    char trans_a= get_trans(Ca(), true); 
    char trans_b= get_trans(Cb(), true); 
    // sizes passed to gemm, chosen according to the transposition flag of each operand
    int m = (trans_a == 'N' ? get_n_rows(Ca()) : get_n_cols(Ca()));
    int n = (trans_b == 'N' ? get_n_cols(Cb()) : get_n_rows(Cb()));
    int k = (trans_a == 'N' ? get_n_cols(Ca()) : get_n_rows(Ca()));
    f77::gemm(trans_a,trans_b,m,n,k,
      alpha, Ca().data_start(), get_ld(Ca()) , Cb().data_start(), get_ld(Cb()), beta, Cc().data_start(), get_ld(Cc()));
   }
   else {
    // C is not in C order : A and B keep their natural roles
    const_qcache<MT1> Ca(A);
    const_qcache<MT2> Cb(B);
    if (!(second_dim(Ca()) == first_dim(Cb()))) TRIQS_RUNTIME_ERROR << "Dimension mismatch in gemm : A : "<< get_shape(Ca()) <<" while B : "<<get_shape(Cb());
    char trans_a= get_trans(Ca(), false); 
    char trans_b= get_trans(Cb(), false); 
    // sizes passed to gemm, chosen according to the transposition flag of each operand
    int m = (trans_a == 'N' ? get_n_rows(Ca()) : get_n_cols(Ca()));
    int n = (trans_b == 'N' ? get_n_cols(Cb()) : get_n_rows(Cb()));
    int k = (trans_a == 'N' ? get_n_cols(Ca()) : get_n_rows(Ca()));
    f77::gemm(trans_a,trans_b,m,n,k,
      alpha, Ca().data_start(), get_ld(Ca()) , Cb().data_start(), get_ld(Cb()), beta, Cc().data_start(), get_ld(Cc()));
   }
  }

 // Generic version for non lapack types or more complex types : plain triple loop, largely suboptimal.
 // Computes C <- alpha * A * B + beta * C, i.e. the same contract as the BLAS gemm.
 //  - alpha, beta : scalar factors, of the value_type of A.
 //  - A, B        : the two factors; second_dim(A) must be equal to first_dim(B),
 //                  otherwise a TRIQS_RUNTIME_ERROR is emitted.
 //  - C           : result; resized to (first_dim(A), second_dim(B)) if necessary and possible, or checked if it is a view.
 // As in BLAS, when beta is zero the previous content of C is not read, it is simply overwritten.
 template<typename MT1, typename MT2, typename MTOut> 
  void gemm_generic  (typename MT1::value_type alpha, MT1 const & A, MT2 const & B, typename MT1::value_type beta, MTOut & C) { 
   typedef typename MT1::value_type scalar_t;
   // first resize if necessary and possible 
   resize_or_check_if_view(C,make_shape(first_dim(A),second_dim(B)));
   if (second_dim(A) != first_dim(B)) TRIQS_RUNTIME_ERROR << "gemm generic : dimension mismatch "<< get_shape(A) << get_shape(B);
   // the dimensions do not change during the computation : read them once
   const int n_rows = first_dim(A);
   const int n_inner = second_dim(A);
   const int n_cols = second_dim(B);
   // beta * C part. For beta == 0, C is zeroed without being read, since it may contain anything (e.g. after a resize)
   if (beta == scalar_t(0)) C() = 0;
   else
    for (int i=0; i<n_rows; ++i)
     for (int j=0; j<n_cols; ++j)
      C(i,j) *= beta;
   // alpha * A * B part, accumulated into C
   for (int i=0; i<n_rows; ++i)
    for (int k=0; k<n_inner; ++k) {
     const scalar_t a_ik = alpha * A(i,k); // does not depend on j
     for (int j=0; j<n_cols; ++j)
      C(i,j) += a_ik*B(k,j);
    }
  }

 // Overload picked when use_blas_gemm<MT1,MT2,MTOut>::value is false :
 // the computation is simply delegated to gemm_generic.
 template<typename MT1, typename MT2, typename MTOut>
  typename std::enable_if< !use_blas_gemm<MT1,MT2,MTOut>::value >::type
  gemm (typename MT1::value_type alpha,
        MT1 const & A,
        MT2 const & B,
        typename MT1::value_type beta,
        MTOut & C) {
   gemm_generic(alpha, A, B, beta, C);
  }

 // to allow gemm (alpha, a, b, beta, M(..., ...)) i.e. a temporary view, which is not matched by previos templates
 // which require an lvalue. This is the only version which takes an && as last argument
 // indeed, in the routine, c is a *lvalue*, since it has a name, and hence we call *other* overload of the function
 template<typename A, typename MT1, typename MT2, typename B, typename V, typename To, bool W> 
  void gemm (A alpha, MT1 const & a, MT2 const & b, B beta, matrix_view<V,To,W> && c) { gemm(alpha,a,b,beta,c);}
 
}}}// namespace


#endif
First commit : triqs libs version 1.0 alpha1 for earlier commits, see TRIQS0.x repository. 2013-07-17 19:24:07 +02:00			`/*******************************************************************************`
			`*`
			`* TRIQS: a Toolbox for Research in Interacting Quantum Systems`
			`*`
			`* Copyright (C) 2012 by O. Parcollet`
			`*`
			`* TRIQS is free software: you can redistribute it and/or modify it under the`
			`* terms of the GNU General Public License as published by the Free Software`
			`* Foundation, either version 3 of the License, or (at your option) any later`
			`* version.`
			`*`
			`* TRIQS is distributed in the hope that it will be useful, but WITHOUT ANY`
			`* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS`
			`* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more`
			`* details.`
			`*`
			`* You should have received a copy of the GNU General Public License along with`
			`* TRIQS. If not, see <http://www.gnu.org/licenses/>.`
			`*`
			`******************************************************************************/`
			`#ifndef TRIQS_ARRAYS_BLAS_LAPACK_GEMM_H`
			`#define TRIQS_ARRAYS_BLAS_LAPACK_GEMM_H`
			`#include <complex>`
			`#include "./tools.hpp"`
			`#include "./qcache.hpp"`

			`namespace triqs { namespace arrays { namespace blas {`

			`using namespace blas_lapack_tools;`
			`namespace f77 { // overload`

			`extern "C" {`
			`void TRIQS_FORTRAN_MANGLING(dgemm) (char , char , const int & , const int & , const int & , const double &,`
			`const double[], const int &, const double[], const int &, const double &, double[], const int & );`
			`void TRIQS_FORTRAN_MANGLING(zgemm) (char , char , const int & , const int & , const int & , const std::complex<double> &,`
			`const std::complex<double>[], const int &, const std::complex<double>[], const int &, const std::complex<double> &, std::complex<double>[], const int & );`
			`}`

			`inline void gemm (char trans_a, char trans_b, const int & M, const int & N, const int & K, const double & alpha,`
			`const double* A, const int & LDA, const double* B, const int & LDB, const double & beta, double* C, const int & LDC) {`
			`TRIQS_FORTRAN_MANGLING(dgemm)(&trans_a,&trans_b,M,N,K,alpha, A, LDA, B, LDB, beta, C, LDC);`
			`}`

			`typedef std::complex<double> dcomplex;`
			`inline void gemm (char trans_a, char trans_b, const int & M, const int & N, const int & K, const dcomplex & alpha,`
			`const dcomplex* A, const int & LDA, const dcomplex* B, const int & LDB, const dcomplex & beta, dcomplex* C, const int & LDC) {`
			`TRIQS_FORTRAN_MANGLING(zgemm)(&trans_a,&trans_b,M,N,K,alpha, A, LDA, B, LDB, beta, C, LDC);`
			`}`
			`}`

			`template<typename MT1, typename MT2, typename MTOut>`
			`struct use_blas_gemm {`
			`static_assert(is_amv_value_or_view_class<MTOut>::value, "output of matrix product must be a matrix or matrix_view");`
			`//static constexpr bool are_both_value_view = is_amv_value_or_view_class<MT1>::value && is_amv_value_or_view_class<MT2>::value;`
			`//static constexpr bool value = are_both_value_view && is_blas_lapack_type<typename MT1::value_type>::value && have_same_value_type< MT1, MT2, MTOut>::value;`
			`static constexpr bool value = is_blas_lapack_type<typename MT1::value_type>::value && have_same_value_type< MT1, MT2, MTOut>::value;`
			`// if inverse_lazy e.g. it is ok, we will use a cache anyway....`
			`};`

			`/**`
			`* Calls gemm on a matrix or view`
			`* Takes care of making temporary copies if necessary`
			`*/`
			`template<typename MT1, typename MT2, typename MTOut>`
			`typename std::enable_if< use_blas_gemm<MT1,MT2,MTOut>::value >::type`
			`gemm (typename MT1::value_type alpha, MT1 const & A, MT2 const & B, typename MT1::value_type beta, MTOut & C) {`
			`//std::cerr << "gemm: blas call "<< std::endl ;`
			`// first resize if necessary and possible`
arrays: Remove dim0, dim1, .shape in various matrix object. Not in the concept, not needed, just an annoyance. replaced by free functions : first_dim(A), second_dim(A), get_shape(A) and so on... 2013-07-29 22:44:43 +02:00			`resize_or_check_if_view(C,make_shape(first_dim(A),second_dim(B)));`
First commit : triqs libs version 1.0 alpha1 for earlier commits, see TRIQS0.x repository. 2013-07-17 19:24:07 +02:00
			`// now we use qcache instead of the matrix to make a copy if necessary ...`
			`// not optimal : if stride == 1, N ---> use LDA parameters`
			`// change the condition in the qcache construction....`
			`reflexive_qcache<MTOut> Cc(C);`

			`if (C.memory_layout_is_c()) {`
			`// then tC = tB tA !`
			`const_qcache<MT1> Cb(A); // note the inversion A <-> B`
			`const_qcache<MT2> Ca(B); // note the inversion A <-> B`
arrays: Remove dim0, dim1, .shape in various matrix object. Not in the concept, not needed, just an annoyance. replaced by free functions : first_dim(A), second_dim(A), get_shape(A) and so on... 2013-07-29 22:44:43 +02:00			`if (!(first_dim(Ca()) == second_dim(Cb()))) TRIQS_RUNTIME_ERROR << "Dimension mismatch in gemm : A : "<< get_shape(Ca()) <<" while B : "<<get_shape(Cb());`
First commit : triqs libs version 1.0 alpha1 for earlier commits, see TRIQS0.x repository. 2013-07-17 19:24:07 +02:00			`char trans_a= get_trans(Ca(), true);`
			`char trans_b= get_trans(Cb(), true);`
			`int m = (trans_a == 'N' ? get_n_rows(Ca()) : get_n_cols(Ca()));`
			`int n = (trans_b == 'N' ? get_n_cols(Cb()) : get_n_rows(Cb()));`
			`int k = (trans_a == 'N' ? get_n_cols(Ca()) : get_n_rows(Ca()));`
			`//std::cerr<< " about to call GEMM"<< std::endl ;`
arrays: Remove dim0, dim1, .shape in various matrix object. Not in the concept, not needed, just an annoyance. replaced by free functions : first_dim(A), second_dim(A), get_shape(A) and so on... 2013-07-29 22:44:43 +02:00			`//std::cerr<< "A = "<< get_shape(Ca())<< Ca()<< std::endl;`
			`//std::cerr<< "B = "<< get_shape(Cb())<< Cb()<< std::endl;`
			`//std::cerr<< "C c" << get_shape(Cc()) << Cc().indexmap().strides() << std::endl;`
First commit : triqs libs version 1.0 alpha1 for earlier commits, see TRIQS0.x repository. 2013-07-17 19:24:07 +02:00			`//std::cerr<<Ca().memory_layout_is_c() <<Ca().memory_layout_is_fortran()<<std::endl;`
			`//std::cerr<< get_n_rows(Ca())<<get_n_cols(Cb())<<get_n_cols(Ca()) << std::endl ;`
			`f77::gemm(trans_a,trans_b,m,n,k,`
			`alpha, Ca().data_start(), get_ld(Ca()) , Cb().data_start(), get_ld(Cb()), beta, Cc().data_start(), get_ld(Cc()));`
			`//std::cerr << " gemm ok "<< std::endl ;`
			`}`
			`else {`
			`const_qcache<MT1> Ca(A);`
			`const_qcache<MT2> Cb(B);`
arrays: Remove dim0, dim1, .shape in various matrix object. Not in the concept, not needed, just an annoyance. replaced by free functions : first_dim(A), second_dim(A), get_shape(A) and so on... 2013-07-29 22:44:43 +02:00			`if (!(second_dim(Ca()) == first_dim(Cb()))) TRIQS_RUNTIME_ERROR << "Dimension mismatch in gemm : A : "<< get_shape(Ca()) <<" while B : "<<get_shape(Cb());`
First commit : triqs libs version 1.0 alpha1 for earlier commits, see TRIQS0.x repository. 2013-07-17 19:24:07 +02:00			`char trans_a= get_trans(Ca(), false);`
			`char trans_b= get_trans(Cb(), false);`
			`int m = (trans_a == 'N' ? get_n_rows(Ca()) : get_n_cols(Ca()));`
			`int n = (trans_b == 'N' ? get_n_cols(Cb()) : get_n_rows(Cb()));`
			`int k = (trans_a == 'N' ? get_n_cols(Ca()) : get_n_rows(Ca()));`
			`f77::gemm(trans_a,trans_b,m,n,k,`
			`alpha, Ca().data_start(), get_ld(Ca()) , Cb().data_start(), get_ld(Cb()), beta, Cc().data_start(), get_ld(Cc()));`
			`}`
			`}`

			`// make the generic version for non lapack types or more complex types`
			`// largely suboptimal`
			`template<typename MT1, typename MT2, typename MTOut>`
			`void gemm_generic (typename MT1::value_type alpha, MT1 const & A, MT2 const & B, typename MT1::value_type beta, MTOut & C) {`
			`//std::cerr << "gemm: generic call "<< std::endl ;`
			`// first resize if necessary and possible`
arrays: Remove dim0, dim1, .shape in various matrix object. Not in the concept, not needed, just an annoyance. replaced by free functions : first_dim(A), second_dim(A), get_shape(A) and so on... 2013-07-29 22:44:43 +02:00			`resize_or_check_if_view(C,make_shape(first_dim(A),second_dim(B)));`
			`if (second_dim(A) != first_dim(B)) TRIQS_RUNTIME_ERROR << "gemm generic : dimension mismatch "<< get_shape(A) << get_shape(B);`
First commit : triqs libs version 1.0 alpha1 for earlier commits, see TRIQS0.x repository. 2013-07-17 19:24:07 +02:00			`C() = 0;`
arrays: Remove dim0, dim1, .shape in various matrix object. Not in the concept, not needed, just an annoyance. replaced by free functions : first_dim(A), second_dim(A), get_shape(A) and so on... 2013-07-29 22:44:43 +02:00			`for (int i=0; i<first_dim(A); ++i)`
			`for (int k=0; k<second_dim(A); ++k)`
			`for (int j=0; j<second_dim(B); ++j)`
First commit : triqs libs version 1.0 alpha1 for earlier commits, see TRIQS0.x repository. 2013-07-17 19:24:07 +02:00			`C(i,j) += A(i,k)*B(k,j);`
			`}`

			`// generic version for non lapack`
			`template<typename MT1, typename MT2, typename MTOut>`
			`typename std::enable_if< !use_blas_gemm<MT1,MT2,MTOut>::value >::type`
			`gemm (typename MT1::value_type alpha, MT1 const & A, MT2 const & B, typename MT1::value_type beta, MTOut & C) {`
			`gemm_generic(alpha,A,B,beta,C);`
			`}`

Fix matrix * alias issue and adapt det_manip - The previous version of the * operator for matrix was too clever. It was giving a lazy object and then rewriting C = A B into gemm (a,A,B,0,C). The pb was in case of aliasing : when e.g. C = A, or is a part of A. gemm is not correct that case, and as a result generic code like a = a b may not be correct in matrix case, which is unacceptable. - So we revert to a simple * operator for matrix that does immediate computation. Same thing for matrix* vector - we also suppress a_x_ty class. -> for M = a * b, when M is a matrix, there is no overhead due to move assignment -> however, when M is a view, there is an additionnal copy. -Correctness comes first, hence the fix. However, if one wants more speed and one can guarantee that there is no aliasing possible, then one has to write a direct gemm call. -> det_manip class was adapted, since in that case, we can show there no alias, and we want the speed gain, so the * ops where replaced by direct blas call (using the array blas interface). -> also gemm, gemv, ger were overloaded in the case the return matrix/vector (i.e. last parameter of the function) is not an lvalue, but a temporary view created on the fly. 2013-09-10 21:41:17 +02:00			`// to allow gemm (alpha, a, b, beta, M(..., ...)) i.e. a temporary view, which is not matched by previos templates`
			`// which require an lvalue. This is the only version which takes an && as last argument`
			`// indeed, in the routine, c is a lvalue, since it has a name, and hence we call other overload of the function`
[arrays] Important changes in implementation. - Simplify group_indices - Only for C ordered, remove complex compile time. - Could be generalized to non C ordered, but no need. - Fix slice for custom orders. - Generalize the group_indices for the custom order. - Add c_ordered_transposed_view (useful ?) - Improve slice, special for ellipsis (quicker). - Simplify TraversalOrder - Assignement. Specialize one case for speed. - use FORCEINLINE in foreach, according to speed test for clang - add one speed test - Modify iterators for better speed. - along the lines decided for the foreach - update doc. 2014-09-07 14:34:10 +02:00			`template<typename A, typename MT1, typename MT2, typename B, typename V, typename To, bool W>`
			`void gemm (A alpha, MT1 const & a, MT2 const & b, B beta, matrix_view<V,To,W> && c) { gemm(alpha,a,b,beta,c);}`
Fix matrix * alias issue and adapt det_manip - The previous version of the * operator for matrix was too clever. It was giving a lazy object and then rewriting C = A B into gemm (a,A,B,0,C). The pb was in case of aliasing : when e.g. C = A, or is a part of A. gemm is not correct that case, and as a result generic code like a = a b may not be correct in matrix case, which is unacceptable. - So we revert to a simple * operator for matrix that does immediate computation. Same thing for matrix* vector - we also suppress a_x_ty class. -> for M = a * b, when M is a matrix, there is no overhead due to move assignment -> however, when M is a view, there is an additionnal copy. -Correctness comes first, hence the fix. However, if one wants more speed and one can guarantee that there is no aliasing possible, then one has to write a direct gemm call. -> det_manip class was adapted, since in that case, we can show there no alias, and we want the speed gain, so the * ops where replaced by direct blas call (using the array blas interface). -> also gemm, gemv, ger were overloaded in the case the return matrix/vector (i.e. last parameter of the function) is not an lvalue, but a temporary view created on the fly. 2013-09-10 21:41:17 +02:00
First commit : triqs libs version 1.0 alpha1 for earlier commits, see TRIQS0.x repository. 2013-07-17 19:24:07 +02:00			`}}}// namespace`


			`#endif`