#+TITLE: QMCkl dgemm #+NAME: header #+BEGIN_SRC text Generated from qmckl_dgemm.org #+END_SRC * Fortran interface #+BEGIN_SRC f90 :noweb yes :tangle qmckl_blas_f.f90 ! <
> module qmckl_blas use :: iso_c_binding interface subroutine qmckl_dgemm(transa, transb, m, n, k, & alpha, A, lda, B, ldb, beta, C, ldc) bind(C) use :: iso_c_binding implicit none character(kind=c_char ), value :: transa, transb integer (kind=c_int ), value :: m, n, k, lda, ldb, ldc real (kind=c_double), value :: alpha, beta real (kind=c_double) :: A(lda,*), B(ldb,*), C(ldc,*) end subroutine qmckl_dgemm end interface end module qmckl_blas #+END_SRC * TODO C code To avoid passing too many arguments to recursive subroutines, we put all the arguments in a struct. #+NAME: dgemm_args #+BEGIN_SRC c struct dgemm_args { double alpha; double beta; double* A; double* B; double* C; int m; int n; int k; int lda; int ldb; int ldc; CBLAS_LAYOUT transa; CBLAS_LAYOUT transb; }; #+END_SRC The driver routine packs the arguments in the struct and calls the recursive routine. #+NAME: dgemm #+BEGIN_SRC c void qmckl_dgemm(char transa, char transb, int m, int n, int k, double alpha, double* A, int lda, double* B, int ldb, double beta, double* C, int ldc) { struct dgemm_args args; args.alpha = alpha; args.beta = beta ; args.A = A; args.B = B; args.C = C; args.m = m; args.n = n; args.k = k; args.lda = lda; args.ldb = ldb; args.ldc = ldc; if (transa == 'T' || transa == 't') { args.transa = CblasTrans; } else { args.transa = CblasNoTrans; } CBLAS_LAYOUT tb; if (transa == 'T' || transa == 't') { args.transb = CblasTrans; } else { args.transb = CblasNoTrans; } #pragma omp parallel { #pragma omp single { qmckl_dgemm_rec(args); } #pragma omp taskwait } } #+END_SRC #+NAME: dgemm_rec #+BEGIN_SRC c #define MIN_SIZE 512 #include static void qmckl_dgemm_rec(struct dgemm_args args) { // printf("%5d %5d\n", args.m, args.n); if ( (args.m <= MIN_SIZE) || (args.n <= MIN_SIZE)) { #pragma omp task { cblas_dgemm(CblasColMajor, args.transa, args.transb, args.m, args.n, args.k, args.alpha, args.A, args.lda, args.B, args.ldb, args.beta, args.C, args.ldc); } } else { int m1 = args.m / 2; int m2 = args.m - m1; int n1 = args.n / 2; int n2 = args.n - n1; #pragma omp task { struct dgemm_args args_1 = args; args_1.m = m1; args_1.n = n1; qmckl_dgemm_rec(args_1); } #pragma omp task { // TODO: assuming 'N', 'N' here struct dgemm_args args_2 = args; args_2.B = args.B + args.ldb*n1; args_2.C = args.C + args.ldc*n1; args_2.m = m1; args_2.n = n2; qmckl_dgemm_rec(args_2); } #pragma omp task { struct dgemm_args args_3 = args; args_3.A = args.A + m1; args_3.C = args.C + m1; args_3.m = m2; args_3.n = n1; qmckl_dgemm_rec(args_3); } #pragma omp task { struct dgemm_args args_4 = args; args_4.A = args.A + m1; args_4.B = args.B + args.ldb*n1; args_4.C = args.C + m1 + args.ldc*n1; args_4.m = m2; args_4.n = n2; qmckl_dgemm_rec(args_4); } } } #+END_SRC #+BEGIN_SRC c :noweb yes :tangle qmckl_dgemm.c /* <
> */ #include <> <> <> #+END_SRC