qp_plugins_scemama/devel/ccsd_gpu/gpu_dgemm.c

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <omp.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 16

void dgemm_(char*, char*, int*, int*, int*, double*, double*, int*, double*, int*,
		double*, double*, int*);


void gpu_dgemm(char transa, char transb, int m, int n, int k, double alpha,
                  double* A, int lda, double* B, int ldb, double beta, double* C, int ldc)
{
    cublasHandle_t handle;
    cublasCreate(&handle);

    double * d_A;
    double * d_B;
    double * d_C;
    cublasOperation_t ta, tb;

    if (transa == 'N') {
      cudaMalloc((void**)&d_A, lda*k*sizeof(double));
      cublasSetMatrix(m, k, sizeof(double), A, lda, d_A, lda);
      ta = CUBLAS_OP_N;
    } else {
      cudaMalloc((void**)&d_A, lda*m*sizeof(double));
      cublasSetMatrix(k, m, sizeof(double), A, lda, d_A, lda);
      ta = CUBLAS_OP_T;
    }

    if (transb == 'N') {
      cudaMalloc((void**)&d_B, ldb*n*sizeof(double));
      cublasSetMatrix(k, n, sizeof(double), B, ldb, d_B, ldb);
      tb = CUBLAS_OP_N;
    } else {
      cudaMalloc((void**)&d_B, ldb*k*sizeof(double));
      cublasSetMatrix(n, k, sizeof(double), B, ldb, d_B, ldb);
      tb = CUBLAS_OP_T;
    }

    cudaMalloc((void**)&d_C, ldc*n*sizeof(double));
    if (beta != 0.) {
      cublasSetMatrix(m, n, sizeof(double), C, ldc, d_C, ldc);
    }

    cublasDgemm(handle, ta, tb, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, ldc);

    cublasGetMatrix(m, n,  sizeof(double), d_C, ldc, C, ldc);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cublasDestroy(handle);
}