2023-08-02 16:17:43 +02:00
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <string.h>
|
|
|
|
#include <omp.h>
|
|
|
|
#include <cublas_v2.h>
|
|
|
|
#include <cuda_runtime.h>
|
2023-08-21 12:34:55 +02:00
|
|
|
#include <assert.h>
|
2023-12-06 21:43:47 +01:00
|
|
|
#include "gpu.h"
|
2023-08-02 16:17:43 +02:00
|
|
|
|
|
|
|
/* Block size constant. It is not referenced by the code visible in this
 * section of the file -- TODO confirm where it is consumed. */
#define BLOCK_SIZE 16

/* Prototype of the Fortran-style BLAS DGEMM entry point (note the trailing
 * underscore). Every argument is passed by pointer. The parameter names below
 * follow the reference BLAS DGEMM argument order and are informational only;
 * presumably the symbol is supplied by the host BLAS library at link time. */
void dgemm_(char* transa, char* transb,
            int* m, int* n, int* k,
            double* alpha,
            double* a, int* lda,
            double* b, int* ldb,
            double* beta,
            double* c, int* ldc);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void gpu_dgemm(char transa, char transb, int m, int n, int k, double alpha,
|
|
|
|
double* A, int lda, double* B, int ldb, double beta, double* C, int ldc)
|
|
|
|
{
|
2023-08-21 12:34:55 +02:00
|
|
|
cudaError_t cudaStat = cudaSuccess;
|
2023-08-02 16:17:43 +02:00
|
|
|
cublasHandle_t handle;
|
|
|
|
cublasCreate(&handle);
|
|
|
|
|
|
|
|
double * d_A;
|
|
|
|
double * d_B;
|
|
|
|
double * d_C;
|
|
|
|
cublasOperation_t ta, tb;
|
|
|
|
|
|
|
|
if (transa == 'N') {
|
2023-12-06 21:40:20 +01:00
|
|
|
cudaStat = gpu_malloc((void**)&d_A, (size_t) lda*k*sizeof(double));
|
2023-08-21 12:34:55 +02:00
|
|
|
assert(cudaStat == cudaSuccess);
|
|
|
|
cudaStat = cublasSetMatrix(m, k, sizeof(double), A, lda, d_A, lda);
|
|
|
|
assert(cudaStat == cudaSuccess);
|
2023-08-02 16:17:43 +02:00
|
|
|
ta = CUBLAS_OP_N;
|
|
|
|
} else {
|
2023-12-06 21:40:20 +01:00
|
|
|
cudaStat = gpu_malloc((void**)&d_A, (size_t) lda*m*sizeof(double));
|
2023-08-21 12:34:55 +02:00
|
|
|
assert(cudaStat == cudaSuccess);
|
|
|
|
cudaStat = cublasSetMatrix(k, m, sizeof(double), A, lda, d_A, lda);
|
|
|
|
assert(cudaStat == cudaSuccess);
|
2023-08-02 16:17:43 +02:00
|
|
|
ta = CUBLAS_OP_T;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (transb == 'N') {
|
2023-12-06 21:40:20 +01:00
|
|
|
cudaStat = gpu_malloc((void**)&d_B, (size_t) ldb*n*sizeof(double));
|
2023-08-21 12:34:55 +02:00
|
|
|
assert(cudaStat == cudaSuccess);
|
|
|
|
cudaStat = cublasSetMatrix(k, n, sizeof(double), B, ldb, d_B, ldb);
|
|
|
|
assert(cudaStat == cudaSuccess);
|
2023-08-02 16:17:43 +02:00
|
|
|
tb = CUBLAS_OP_N;
|
|
|
|
} else {
|
2023-12-06 21:40:20 +01:00
|
|
|
cudaStat = gpu_malloc((void**)&d_B, (size_t) ldb*k*sizeof(double));
|
2023-08-21 12:34:55 +02:00
|
|
|
assert(cudaStat == cudaSuccess);
|
|
|
|
cudaStat = cublasSetMatrix(n, k, sizeof(double), B, ldb, d_B, ldb);
|
|
|
|
assert(cudaStat == cudaSuccess);
|
2023-08-02 16:17:43 +02:00
|
|
|
tb = CUBLAS_OP_T;
|
|
|
|
}
|
|
|
|
|
2023-12-06 21:40:20 +01:00
|
|
|
cudaStat = gpu_malloc((void**)&d_C, (size_t) ldc*n*sizeof(double));
|
2023-08-21 12:34:55 +02:00
|
|
|
assert(cudaStat == cudaSuccess);
|
2023-08-02 16:17:43 +02:00
|
|
|
if (beta != 0.) {
|
2023-08-21 12:34:55 +02:00
|
|
|
cudaStat = cublasSetMatrix(m, n, sizeof(double), C, ldc, d_C, ldc);
|
|
|
|
assert(cudaStat == cudaSuccess);
|
2023-08-02 16:17:43 +02:00
|
|
|
}
|
|
|
|
|
2023-08-21 12:34:55 +02:00
|
|
|
cudaStat = cublasDgemm(handle, ta, tb, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, ldc);
|
|
|
|
assert(cudaStat == cudaSuccess);
|
2023-08-02 16:17:43 +02:00
|
|
|
cudaFree(d_A);
|
|
|
|
cudaFree(d_B);
|
2023-08-21 12:34:55 +02:00
|
|
|
|
|
|
|
cudaStat = cublasGetMatrix(m, n, sizeof(double), d_C, ldc, C, ldc);
|
|
|
|
assert(cudaStat == cudaSuccess);
|
|
|
|
|
2023-08-02 16:17:43 +02:00
|
|
|
cudaFree(d_C);
|
|
|
|
cublasDestroy(handle);
|
|
|
|
}
|
|
|
|
|