#include #include #include #include #include #include #include #include "gpu.h" #define BLOCK_SIZE 16 void dgemm_(char*, char*, int*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int*); void gpu_dgemm(char transa, char transb, int m, int n, int k, double alpha, double* A, int lda, double* B, int ldb, double beta, double* C, int ldc) { cudaError_t cudaStat = cudaSuccess; cublasHandle_t handle; cublasCreate(&handle); double * d_A; double * d_B; double * d_C; cublasOperation_t ta, tb; if (transa == 'N') { cudaStat = gpu_malloc((void**)&d_A, (size_t) lda*k*sizeof(double)); assert(cudaStat == cudaSuccess); cudaStat = cublasSetMatrix(m, k, sizeof(double), A, lda, d_A, lda); assert(cudaStat == cudaSuccess); ta = CUBLAS_OP_N; } else { cudaStat = gpu_malloc((void**)&d_A, (size_t) lda*m*sizeof(double)); assert(cudaStat == cudaSuccess); cudaStat = cublasSetMatrix(k, m, sizeof(double), A, lda, d_A, lda); assert(cudaStat == cudaSuccess); ta = CUBLAS_OP_T; } if (transb == 'N') { cudaStat = gpu_malloc((void**)&d_B, (size_t) ldb*n*sizeof(double)); assert(cudaStat == cudaSuccess); cudaStat = cublasSetMatrix(k, n, sizeof(double), B, ldb, d_B, ldb); assert(cudaStat == cudaSuccess); tb = CUBLAS_OP_N; } else { cudaStat = gpu_malloc((void**)&d_B, (size_t) ldb*k*sizeof(double)); assert(cudaStat == cudaSuccess); cudaStat = cublasSetMatrix(n, k, sizeof(double), B, ldb, d_B, ldb); assert(cudaStat == cudaSuccess); tb = CUBLAS_OP_T; } cudaStat = gpu_malloc((void**)&d_C, (size_t) ldc*n*sizeof(double)); assert(cudaStat == cudaSuccess); if (beta != 0.) { cudaStat = cublasSetMatrix(m, n, sizeof(double), C, ldc, d_C, ldc); assert(cudaStat == cudaSuccess); } cudaStat = cublasDgemm(handle, ta, tb, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, ldc); assert(cudaStat == cudaSuccess); cudaFree(d_A); cudaFree(d_B); cudaStat = cublasGetMatrix(m, n, sizeof(double), d_C, ldc, C, ldc); assert(cudaStat == cudaSuccess); cudaFree(d_C); cublasDestroy(handle); }