QuantumPackage/plugins/local/gpu_nvidia/gpu.c

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

#include <cublas_v2.h>
#include <cuda_runtime.h>


/* Generic functions */

int gpu_ndevices() {
  int ngpus;
  cudaGetDeviceCount(&ngpus);
  return ngpus;
}

void gpu_set_device(int32_t igpu) {
  cudaSetDevice((int) igpu);
}


/* Allocation functions */

/* Allocate `size` bytes with cudaMallocManaged and store the address in *ptr.
 *
 *   ptr  : output location for the allocated pointer; must not be NULL.
 *   size : number of bytes requested; must not be negative.
 *
 * Managed memory is requested unconditionally. An earlier variant that chose
 * cudaMalloc when the request fit in free device memory is disabled, so the
 * free/total memory query it relied on is no longer performed. */
void gpu_allocate(void** ptr, const int64_t size) {
    assert (ptr != NULL);
    /* A negative size would silently wrap to a huge size_t below. */
    assert (size >= 0);

    cudaError_t rc = cudaMallocManaged(ptr, (size_t) size, cudaMemAttachGlobal);
    assert (rc == cudaSuccess);
}

void gpu_deallocate(void** ptr) {
  assert (*ptr != NULL);
  cudaFree(*ptr);
  *ptr = NULL;
}


/* Memory transfer functions */

void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
  cudaMemcpy (gpu_ptr, cpu_ptr, n, cudaMemcpyHostToDevice);
}

void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
  cudaMemcpy (cpu_ptr, gpu_ptr, n, cudaMemcpyDeviceToHost);
}

void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
  cudaMemcpy (gpu_ptr_dest, gpu_ptr_src, n, cudaMemcpyDeviceToDevice);
}


/* Streams */

/* Create a CUDA stream and store it in *ptr.
 * The creation status is asserted. */
void gpu_stream_create(cudaStream_t* ptr) {
  const cudaError_t status = cudaStreamCreate(ptr);
  assert (status == cudaSuccess);
}

/* Destroy the CUDA stream stored in *ptr and reset *ptr to NULL.
 *
 *   ptr : address of the stream to destroy; must not be NULL. */
void gpu_stream_destroy(cudaStream_t* ptr) {
  assert (ptr != NULL);

  const cudaError_t status = cudaStreamDestroy(*ptr);
  assert (status == cudaSuccess);

  /* Clear the caller's copy so the destroyed stream is not reused. */
  *ptr = NULL;
}

/* Bind `stream` to the cuBLAS `handle` via cublasSetStream.
 *
 *   handle : cuBLAS handle; must not be NULL.
 *   stream : stream to attach; not validated here.
 *
 * The cuBLAS status is asserted, as in the other BLAS wrappers. */
void gpu_set_stream(cublasHandle_t handle, cudaStream_t stream) {
  assert (handle != NULL);
  cublasStatus_t rc = cublasSetStream(handle, stream);
  assert (rc == CUBLAS_STATUS_SUCCESS);
}

void gpu_synchronize() {
  cudaDeviceSynchronize();
}


/* BLAS functions */

/* Create a cuBLAS handle and store it in *ptr.
 * The creation status is asserted. */
void gpu_blas_create(cublasHandle_t* ptr) {
  const cublasStatus_t status = cublasCreate(ptr);
  assert (status == CUBLAS_STATUS_SUCCESS);
}


/* Destroy the cuBLAS handle stored in *ptr and reset *ptr to NULL.
 *
 *   ptr : address of the handle to destroy; must not be NULL. */
void gpu_blas_destroy(cublasHandle_t* ptr) {
  assert (ptr != NULL);
  cublasStatus_t rc = cublasDestroy(*ptr);
  assert (rc == CUBLAS_STATUS_SUCCESS);
  /* Clear the caller's handle. Assigning to `ptr` itself would only change
     the local parameter and leave the caller with a dangling handle. */
  *ptr = NULL;
}


/* Double-precision dot product of x and y through cublasDdot.
 *
 *   handle     : cuBLAS handle; must not be NULL.
 *   n          : number of elements.
 *   x, incx    : first vector and its stride.
 *   y, incy    : second vector and its stride.
 *   result     : destination, handed to cublasDdot unchanged.
 *
 * cuBLAS takes `int` sizes, so each 64-bit argument is narrowed and the
 * narrowing is asserted to be lossless. */
void gpu_ddot(cublasHandle_t handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
  assert (handle != NULL);

  /* Narrow to the int arguments expected by cuBLAS */
  const int count    = (int) n;
  const int stride_x = (int) incx;
  const int stride_y = (int) incy;

  /* Check for integer overflows */
  assert ( (int64_t) count    == n   );
  assert ( (int64_t) stride_x == incx);
  assert ( (int64_t) stride_y == incy);

  const cublasStatus_t status = cublasDdot(handle, count, x, stride_x, y, stride_y, result);
  assert (status == CUBLAS_STATUS_SUCCESS);
}


/* Single-precision dot product of x and y through cublasSdot.
 *
 *   handle     : cuBLAS handle; must not be NULL.
 *   n          : number of elements.
 *   x, incx    : first vector and its stride.
 *   y, incy    : second vector and its stride.
 *   result     : receives the value computed by cuBLAS.
 *
 * cuBLAS takes `int` sizes, so each 64-bit argument is narrowed and the
 * narrowing is asserted to be lossless. */
void gpu_sdot(cublasHandle_t handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
  assert (handle != NULL);

  /* Narrow to the int arguments expected by cuBLAS */
  const int count    = (int) n;
  const int stride_x = (int) incx;
  const int stride_y = (int) incy;

  /* Check for integer overflows */
  assert ( (int64_t) count    == n   );
  assert ( (int64_t) stride_x == incx);
  assert ( (int64_t) stride_y == incy);

  /* cuBLAS writes into a local scalar, which is then copied to the caller. */
  float dot = 0.0f;
  const cublasStatus_t status = cublasSdot(handle, count, x, stride_x, y, stride_y, &dot);
  assert (status == CUBLAS_STATUS_SUCCESS);
  *result = dot;
}


/* Double-precision matrix-vector product through cublasDgemv
 * (y = alpha*op(A)*x + beta*y in cuBLAS terms).
 *
 *   handle      : cuBLAS handle; must not be NULL.
 *   transa      : "T"/"t" selects the transpose of A; any other first
 *                 character selects no transpose. Must not be NULL.
 *   m, n        : dimensions of A as passed to cuBLAS.
 *   alpha, beta : scalars, passed to cuBLAS by address of the local copies
 *                 (assumes the handle uses host pointer mode -- TODO confirm).
 *   a, lda      : matrix and its leading dimension.
 *   x, incx     : input vector and stride.
 *   y, incy     : input/output vector and stride.
 *
 * The cuBLAS status is asserted, as gpu_ddot/gpu_sdot already do. */
void gpu_dgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const double alpha,
               const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) {

  assert (handle != NULL);
  assert (transa != NULL);

  /* Convert to int */
  int m_, n_, lda_, incx_, incy_;

  m_    = (int) m;
  n_    = (int) n;
  lda_  = (int) lda;
  incx_ = (int) incx;
  incy_ = (int) incy;

  /* Check for integer overflows */
  assert ( (int64_t)    m_ == m   );
  assert ( (int64_t)    n_ == n   );
  assert ( (int64_t)  lda_ == lda );
  assert ( (int64_t) incx_ == incx);
  assert ( (int64_t) incy_ == incy);

  cublasOperation_t transa_ = CUBLAS_OP_N;
  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;

  cublasStatus_t rc = cublasDgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_);
  assert (rc == CUBLAS_STATUS_SUCCESS);
}


/* Single-precision matrix-vector product through cublasSgemv
 * (y = alpha*op(A)*x + beta*y in cuBLAS terms).
 *
 *   handle      : cuBLAS handle; must not be NULL.
 *   transa      : "T"/"t" selects the transpose of A; any other first
 *                 character selects no transpose. Must not be NULL.
 *   m, n        : dimensions of A as passed to cuBLAS.
 *   alpha, beta : scalars, passed to cuBLAS by address of the local copies
 *                 (assumes the handle uses host pointer mode -- TODO confirm).
 *   a, lda      : matrix and its leading dimension.
 *   x, incx     : input vector and stride.
 *   y, incy     : input/output vector and stride.
 *
 * The cuBLAS status is asserted, as gpu_ddot/gpu_sdot already do. */
void gpu_sgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const float alpha,
               const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) {

  assert (handle != NULL);
  assert (transa != NULL);

  /* Convert to int */
  int m_, n_, lda_, incx_, incy_;

  m_    = (int) m;
  n_    = (int) n;
  lda_  = (int) lda;
  incx_ = (int) incx;
  incy_ = (int) incy;

  /* Check for integer overflows */
  assert ( (int64_t)    m_ == m   );
  assert ( (int64_t)    n_ == n   );
  assert ( (int64_t)  lda_ == lda );
  assert ( (int64_t) incx_ == incx);
  assert ( (int64_t) incy_ == incy);

  cublasOperation_t transa_ = CUBLAS_OP_N;
  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;

  cublasStatus_t rc = cublasSgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_);
  assert (rc == CUBLAS_STATUS_SUCCESS);
}


/* Double-precision matrix-matrix product through cublasDgemm
 * (C = alpha*op(A)*op(B) + beta*C in cuBLAS terms).
 *
 *   handle         : cuBLAS handle; must not be NULL.
 *   transa, transb : "T"/"t" selects the transpose of A / B; any other first
 *                    character selects no transpose. Must not be NULL.
 *   m, n, k        : problem dimensions as passed to cuBLAS.
 *   alpha, beta    : scalars, passed to cuBLAS by address of the local copies
 *                    (assumes the handle uses host pointer mode -- TODO confirm).
 *   a, lda / b, ldb / c, ldc : matrices and their leading dimensions.
 *
 * The cuBLAS status is asserted, as gpu_ddot/gpu_sdot already do. */
void gpu_dgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) {

  assert (handle != NULL);
  assert (transa != NULL);
  assert (transb != NULL);

  /* Convert to int */
  int m_, n_, k_, lda_, ldb_, ldc_;

  m_   = (int) m;
  n_   = (int) n;
  k_   = (int) k;
  lda_ = (int) lda;
  ldb_ = (int) ldb;
  ldc_ = (int) ldc;

  /* Check for integer overflows */
  assert ( (int64_t)   m_ == m  );
  assert ( (int64_t)   n_ == n  );
  assert ( (int64_t)   k_ == k  );
  assert ( (int64_t) lda_ == lda);
  assert ( (int64_t) ldb_ == ldb);
  assert ( (int64_t) ldc_ == ldc);

  cublasOperation_t transa_ = CUBLAS_OP_N;
  cublasOperation_t transb_ = CUBLAS_OP_N;
  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
  if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;

  cublasStatus_t rc = cublasDgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_);
  assert (rc == CUBLAS_STATUS_SUCCESS);
}


/* Single-precision matrix-matrix product through cublasSgemm
 * (C = alpha*op(A)*op(B) + beta*C in cuBLAS terms).
 *
 *   handle         : cuBLAS handle; must not be NULL.
 *   transa, transb : "T"/"t" selects the transpose of A / B; any other first
 *                    character selects no transpose. Must not be NULL.
 *   m, n, k        : problem dimensions as passed to cuBLAS.
 *   alpha, beta    : scalars, passed to cuBLAS by address of the local copies
 *                    (assumes the handle uses host pointer mode -- TODO confirm).
 *   a, lda / b, ldb / c, ldc : matrices and their leading dimensions.
 *
 * The cuBLAS status is asserted, as gpu_ddot/gpu_sdot already do. */
void gpu_sgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) {

  assert (handle != NULL);
  assert (transa != NULL);
  assert (transb != NULL);

  /* Convert to int */
  int m_, n_, k_, lda_, ldb_, ldc_;

  m_   = (int) m;
  n_   = (int) n;
  k_   = (int) k;
  lda_ = (int) lda;
  ldb_ = (int) ldb;
  ldc_ = (int) ldc;

  /* Check for integer overflows */
  assert ( (int64_t)   m_ == m  );
  assert ( (int64_t)   n_ == n  );
  assert ( (int64_t)   k_ == k  );
  assert ( (int64_t) lda_ == lda);
  assert ( (int64_t) ldb_ == ldb);
  assert ( (int64_t) ldc_ == ldc);

  cublasOperation_t transa_ = CUBLAS_OP_N;
  cublasOperation_t transb_ = CUBLAS_OP_N;
  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
  if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;

  cublasStatus_t rc = cublasSgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_);
  assert (rc == CUBLAS_STATUS_SUCCESS);
}


/* Double-precision matrix addition/transposition through cublasDgeam
 * (C = alpha*op(A) + beta*op(B) in cuBLAS terms).
 *
 *   handle         : cuBLAS handle; must not be NULL.
 *   transa, transb : "T"/"t" selects the transpose of A / B; any other first
 *                    character selects no transpose. Must not be NULL.
 *   m, n           : dimensions as passed to cuBLAS.
 *   alpha, beta    : scalars, passed to cuBLAS by address of the local copies
 *                    (assumes the handle uses host pointer mode -- TODO confirm).
 *   a, lda / b, ldb / c, ldc : matrices and their leading dimensions.
 *
 * The cuBLAS status is asserted, as gpu_ddot/gpu_sdot already do. */
void gpu_dgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double alpha,
               const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
  assert (handle != NULL);
  assert (transa != NULL);
  assert (transb != NULL);

  /* Convert to int */
  int m_, n_, lda_, ldb_, ldc_;

  m_   = (int) m;
  n_   = (int) n;
  lda_ = (int) lda;
  ldb_ = (int) ldb;
  ldc_ = (int) ldc;

  /* Check for integer overflows */
  assert ( (int64_t)   m_ == m  );
  assert ( (int64_t)   n_ == n  );
  assert ( (int64_t) lda_ == lda);
  assert ( (int64_t) ldb_ == ldb);
  assert ( (int64_t) ldc_ == ldc);

  cublasOperation_t transa_ = CUBLAS_OP_N;
  cublasOperation_t transb_ = CUBLAS_OP_N;
  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
  if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;

  cublasStatus_t rc = cublasDgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_);
  assert (rc == CUBLAS_STATUS_SUCCESS);

}


/* Single-precision matrix addition/transposition through cublasSgeam
 * (C = alpha*op(A) + beta*op(B) in cuBLAS terms).
 *
 *   handle         : cuBLAS handle; must not be NULL.
 *   transa, transb : "T"/"t" selects the transpose of A / B; any other first
 *                    character selects no transpose. Must not be NULL.
 *   m, n           : dimensions as passed to cuBLAS.
 *   alpha, beta    : scalars, passed to cuBLAS by address of the local copies
 *                    (assumes the handle uses host pointer mode -- TODO confirm).
 *   a, lda / b, ldb / c, ldc : matrices and their leading dimensions.
 *
 * The cuBLAS status is asserted, as gpu_ddot/gpu_sdot already do. */
void gpu_sgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float alpha,
               const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
  assert (handle != NULL);
  assert (transa != NULL);
  assert (transb != NULL);

  /* Convert to int */
  int m_, n_, lda_, ldb_, ldc_;

  m_   = (int) m;
  n_   = (int) n;
  lda_ = (int) lda;
  ldb_ = (int) ldb;
  ldc_ = (int) ldc;

  /* Check for integer overflows */
  assert ( (int64_t)   m_ == m  );
  assert ( (int64_t)   n_ == n  );
  assert ( (int64_t) lda_ == lda);
  assert ( (int64_t) ldb_ == ldb);
  assert ( (int64_t) ldc_ == ldc);

  cublasOperation_t transa_ = CUBLAS_OP_N;
  cublasOperation_t transb_ = CUBLAS_OP_N;
  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
  if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;

  cublasStatus_t rc = cublasSgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_);
  assert (rc == CUBLAS_STATUS_SUCCESS);

}
Added Nvidia module 2024-06-28 11:00:58 +02:00			`#include <stdint.h>`
			`#include <stdio.h>`
H_oo on GPU 2024-06-29 02:27:50 +02:00			`#include <stdbool.h>`
Added Nvidia module 2024-06-28 11:00:58 +02:00			`#include <stdlib.h>`
			`#include <string.h>`
			`#include <assert.h>`

			`#include <cublas_v2.h>`
			`#include <cuda_runtime.h>`


			`/* Generic functions */`

			`int gpu_ndevices() {`
			`int ngpus;`
			`cudaGetDeviceCount(&ngpus);`
			`return ngpus;`
			`}`

			`void gpu_set_device(int32_t igpu) {`
H_oo on GPU 2024-06-29 02:27:50 +02:00			`cudaSetDevice((int) igpu);`
Added Nvidia module 2024-06-28 11:00:58 +02:00			`}`


			`/* Allocation functions */`

			`void gpu_allocate(void** ptr, const int64_t size) {`
			`size_t free, total;`
			`cudaError_t rc = cudaMemGetInfo( &free, &total );`
			`if (rc != cudaSuccess) {`
			`free = INT64_MAX;`
			`}`

H_vv 2024-07-01 18:04:48 +02:00			`rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);`
			`// /* Use managed memory if it does not fit on the GPU */`
			`// if (size < free && size < total/2) {`
Added Nvidia module 2024-06-28 11:00:58 +02:00			`// rc= cudaMalloc(ptr, size);`
H_vv 2024-07-01 18:04:48 +02:00			`// } else {`
			`// rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);`
			`// }`
Added Nvidia module 2024-06-28 11:00:58 +02:00			`assert (rc == cudaSuccess);`
			`}`

			`void gpu_deallocate(void** ptr) {`
			`assert (*ptr != NULL);`
			`cudaFree(*ptr);`
			`*ptr = NULL;`
			`}`


			`/* Memory transfer functions */`

			`void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {`
			`cudaMemcpy (gpu_ptr, cpu_ptr, n, cudaMemcpyHostToDevice);`
			`}`

			`void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {`
			`cudaMemcpy (cpu_ptr, gpu_ptr, n, cudaMemcpyDeviceToHost);`
			`}`

			`void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {`
			`cudaMemcpy (gpu_ptr_dest, gpu_ptr_src, n, cudaMemcpyDeviceToDevice);`
			`}`


			`/* Streams */`

H_oo on GPU 2024-06-29 02:27:50 +02:00			`void gpu_stream_create(cudaStream_t* ptr) {`
			`cudaError_t rc = cudaStreamCreate(ptr);`
Added Nvidia module 2024-06-28 11:00:58 +02:00			`assert (rc == cudaSuccess);`
			`}`

H_oo on GPU 2024-06-29 02:27:50 +02:00			`void gpu_stream_destroy(cudaStream_t* ptr) {`
			`assert (ptr != NULL);`
			`cudaError_t rc = cudaStreamDestroy(*ptr);`
Added Nvidia module 2024-06-28 11:00:58 +02:00			`assert (rc == cudaSuccess);`
			`*ptr = NULL;`
			`}`

H_oo on GPU 2024-06-29 02:27:50 +02:00			`void gpu_set_stream(cublasHandle_t handle, cudaStream_t stream) {`
			`cublasSetStream(handle, stream);`
Added Nvidia module 2024-06-28 11:00:58 +02:00			`}`

			`void gpu_synchronize() {`
			`cudaDeviceSynchronize();`
			`}`


			`/* BLAS functions */`

H_oo on GPU 2024-06-29 02:27:50 +02:00			`void gpu_blas_create(cublasHandle_t* ptr) {`
			`cublasStatus_t rc = cublasCreate(ptr);`
Added Nvidia module 2024-06-28 11:00:58 +02:00			`assert (rc == CUBLAS_STATUS_SUCCESS);`
			`}`


H_oo on GPU 2024-06-29 02:27:50 +02:00			`void gpu_blas_destroy(cublasHandle_t* ptr) {`
			`assert (ptr != NULL);`
			`cublasStatus_t rc = cublasDestroy(*ptr);`
Added Nvidia module 2024-06-28 11:00:58 +02:00			`assert (rc == CUBLAS_STATUS_SUCCESS);`
H_oo on GPU 2024-06-29 02:27:50 +02:00			`ptr = NULL;`
Added Nvidia module 2024-06-28 11:00:58 +02:00			`}`


H_oo on GPU 2024-06-29 02:27:50 +02:00			`void gpu_ddot(cublasHandle_t handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {`
			`assert (handle != NULL);`
			`/* Convert to int */`
			`int n_, incx_, incy_;`
Added Nvidia module 2024-06-28 11:00:58 +02:00
H_oo on GPU 2024-06-29 02:27:50 +02:00			`n_ = (int) n;`
			`incx_ = (int) incx;`
			`incy_ = (int) incy;`
Added Nvidia module 2024-06-28 11:00:58 +02:00
			`assert ( (int64_t) n_ == n );`
			`assert ( (int64_t) incx_ == incx);`
			`assert ( (int64_t) incy_ == incy);`

H_oo on GPU 2024-06-29 02:27:50 +02:00			`cublasStatus_t rc = cublasDdot(handle, n_, x, incx_, y, incy_, result);`
			`/*`
			`double alpha = 1.0;`
			`double beta = 0.0;`
			`cublasStatus_t rc = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, n_, &alpha, x, 1, y, n_, &beta, &result_, 1);`
			`*/`
			`assert (rc == CUBLAS_STATUS_SUCCESS);`
Added Nvidia module 2024-06-28 11:00:58 +02:00			`}`



H_oo on GPU 2024-06-29 02:27:50 +02:00			`void gpu_sdot(cublasHandle_t handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {`
			`assert (handle != NULL);`
Added Nvidia module 2024-06-28 11:00:58 +02:00
H_oo on GPU 2024-06-29 02:27:50 +02:00			`/* Convert to int */`
			`int n_, incx_, incy_;`
Added Nvidia module 2024-06-28 11:00:58 +02:00
H_oo on GPU 2024-06-29 02:27:50 +02:00			`n_ = (int) n;`
			`incx_ = (int) incx;`
			`incy_ = (int) incy;`
Added Nvidia module 2024-06-28 11:00:58 +02:00
			`/* Check for integer overflows */`
			`assert ( (int64_t) n_ == n );`
			`assert ( (int64_t) incx_ == incx);`
			`assert ( (int64_t) incy_ == incy);`

H_oo on GPU 2024-06-29 02:27:50 +02:00			`float result_ = 0.;`
			`cublasStatus_t rc = cublasSdot(handle, n_, x, incx_, y, incy_, &result_);`
			`assert (rc == CUBLAS_STATUS_SUCCESS);`
			`*result = result_;`
Added Nvidia module 2024-06-28 11:00:58 +02:00			`}`



Working on r1 2024-07-02 17:22:41 +02:00			`void gpu_dgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const double alpha,`
Added Nvidia module 2024-06-28 11:00:58 +02:00			`const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) {`

H_oo on GPU 2024-06-29 02:27:50 +02:00			`assert (handle != NULL);`
Added Nvidia module 2024-06-28 11:00:58 +02:00
H_oo on GPU 2024-06-29 02:27:50 +02:00			`/* Convert to int */`
			`int m_, n_, lda_, incx_, incy_;`
Added Nvidia module 2024-06-28 11:00:58 +02:00
H_oo on GPU 2024-06-29 02:27:50 +02:00			`m_ = (int) m;`
			`n_ = (int) n;`
			`lda_ = (int) lda;`
			`incx_ = (int) incx;`
			`incy_ = (int) incy;`
Added Nvidia module 2024-06-28 11:00:58 +02:00
			`/* Check for integer overflows */`
			`assert ( (int64_t) m_ == m );`
			`assert ( (int64_t) n_ == n );`
			`assert ( (int64_t) lda_ == lda );`
			`assert ( (int64_t) incx_ == incx);`
			`assert ( (int64_t) incy_ == incy);`

			`cublasOperation_t transa_ = CUBLAS_OP_N;`
Working on r1 2024-07-02 17:22:41 +02:00			`if (transa == 'T' \|\| transa == 't') transa_ = CUBLAS_OP_T;`
Added Nvidia module 2024-06-28 11:00:58 +02:00
H_oo on GPU 2024-06-29 02:27:50 +02:00			`cublasDgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_);`
Added Nvidia module 2024-06-28 11:00:58 +02:00			`}`



Working on r1 2024-07-02 17:22:41 +02:00			`void gpu_sgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const float alpha,`
Added Nvidia module 2024-06-28 11:00:58 +02:00			`const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) {`

H_oo on GPU 2024-06-29 02:27:50 +02:00			`assert (handle != NULL);`
Added Nvidia module 2024-06-28 11:00:58 +02:00
H_oo on GPU 2024-06-29 02:27:50 +02:00			`/* Convert to int */`
			`int m_, n_, lda_, incx_, incy_;`
Added Nvidia module 2024-06-28 11:00:58 +02:00
H_oo on GPU 2024-06-29 02:27:50 +02:00			`m_ = (int) m;`
			`n_ = (int) n;`
			`lda_ = (int) lda;`
			`incx_ = (int) incx;`
			`incy_ = (int) incy;`
Added Nvidia module 2024-06-28 11:00:58 +02:00
			`/* Check for integer overflows */`
			`assert ( (int64_t) m_ == m );`
			`assert ( (int64_t) n_ == n );`
			`assert ( (int64_t) lda_ == lda );`
			`assert ( (int64_t) incx_ == incx);`
			`assert ( (int64_t) incy_ == incy);`

			`cublasOperation_t transa_ = CUBLAS_OP_N;`
Working on r1 2024-07-02 17:22:41 +02:00			`if (transa == 'T' \|\| transa == 't') transa_ = CUBLAS_OP_T;`
Added Nvidia module 2024-06-28 11:00:58 +02:00
H_oo on GPU 2024-06-29 02:27:50 +02:00			`cublasSgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_);`
Added Nvidia module 2024-06-28 11:00:58 +02:00			`}`


Working on r1 2024-07-02 17:22:41 +02:00			`void gpu_dgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,`
Added Nvidia module 2024-06-28 11:00:58 +02:00			`const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) {`

H_oo on GPU 2024-06-29 02:27:50 +02:00			`assert (handle != NULL);`
Added Nvidia module 2024-06-28 11:00:58 +02:00
H_oo on GPU 2024-06-29 02:27:50 +02:00			`/* Convert to int */`
			`int m_, n_, k_, lda_, ldb_, ldc_;`
Added Nvidia module 2024-06-28 11:00:58 +02:00
H_oo on GPU 2024-06-29 02:27:50 +02:00			`m_ = (int) m;`
			`n_ = (int) n;`
			`k_ = (int) k;`
			`lda_ = (int) lda;`
			`ldb_ = (int) ldb;`
			`ldc_ = (int) ldc;`
Added Nvidia module 2024-06-28 11:00:58 +02:00
			`/* Check for integer overflows */`
			`assert ( (int64_t) m_ == m );`
			`assert ( (int64_t) n_ == n );`
			`assert ( (int64_t) k_ == k );`
			`assert ( (int64_t) lda_ == lda);`
			`assert ( (int64_t) ldb_ == ldb);`
			`assert ( (int64_t) ldc_ == ldc);`

			`cublasOperation_t transa_ = CUBLAS_OP_N;`
			`cublasOperation_t transb_ = CUBLAS_OP_N;`
Working on r1 2024-07-02 17:22:41 +02:00			`if (transa == 'T' \|\| transa == 't') transa_ = CUBLAS_OP_T;`
			`if (transb == 'T' \|\| transb == 't') transb_ = CUBLAS_OP_T;`
Added Nvidia module 2024-06-28 11:00:58 +02:00
H_oo on GPU 2024-06-29 02:27:50 +02:00			`cublasDgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_);`
Added Nvidia module 2024-06-28 11:00:58 +02:00			`}`



Working on r1 2024-07-02 17:22:41 +02:00			`void gpu_sgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,`
Added Nvidia module 2024-06-28 11:00:58 +02:00			`const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) {`

H_oo on GPU 2024-06-29 02:27:50 +02:00			`assert (handle != NULL);`
Added Nvidia module 2024-06-28 11:00:58 +02:00
H_oo on GPU 2024-06-29 02:27:50 +02:00			`/* Convert to int */`
			`int m_, n_, k_, lda_, ldb_, ldc_;`
Added Nvidia module 2024-06-28 11:00:58 +02:00
H_oo on GPU 2024-06-29 02:27:50 +02:00			`m_ = (int) m;`
			`n_ = (int) n;`
			`k_ = (int) k;`
			`lda_ = (int) lda;`
			`ldb_ = (int) ldb;`
			`ldc_ = (int) ldc;`
Added Nvidia module 2024-06-28 11:00:58 +02:00
			`/* Check for integer overflows */`
			`assert ( (int64_t) m_ == m );`
			`assert ( (int64_t) n_ == n );`
			`assert ( (int64_t) k_ == k );`
			`assert ( (int64_t) lda_ == lda);`
			`assert ( (int64_t) ldb_ == ldb);`
			`assert ( (int64_t) ldc_ == ldc);`

			`cublasOperation_t transa_ = CUBLAS_OP_N;`
			`cublasOperation_t transb_ = CUBLAS_OP_N;`
Working on r1 2024-07-02 17:22:41 +02:00			`if (transa == 'T' \|\| transa == 't') transa_ = CUBLAS_OP_T;`
			`if (transb == 'T' \|\| transb == 't') transb_ = CUBLAS_OP_T;`
Added Nvidia module 2024-06-28 11:00:58 +02:00
H_oo on GPU 2024-06-29 02:27:50 +02:00			`cublasSgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_);`
Added Nvidia module 2024-06-28 11:00:58 +02:00			`}`


Working on r1 2024-07-02 17:22:41 +02:00			`void gpu_dgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double alpha,`
Added Nvidia module 2024-06-28 11:00:58 +02:00			`const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {`
H_oo on GPU 2024-06-29 02:27:50 +02:00			`assert (handle != NULL);`
Added Nvidia module 2024-06-28 11:00:58 +02:00
H_oo on GPU 2024-06-29 02:27:50 +02:00			`/* Convert to int */`
			`int m_, n_, lda_, ldb_, ldc_;`
Added Nvidia module 2024-06-28 11:00:58 +02:00
H_oo on GPU 2024-06-29 02:27:50 +02:00			`m_ = (int) m;`
			`n_ = (int) n;`
			`lda_ = (int) lda;`
			`ldb_ = (int) ldb;`
			`ldc_ = (int) ldc;`
Added Nvidia module 2024-06-28 11:00:58 +02:00
			`/* Check for integer overflows */`
			`assert ( (int64_t) m_ == m );`
			`assert ( (int64_t) n_ == n );`
			`assert ( (int64_t) lda_ == lda);`
			`assert ( (int64_t) ldb_ == ldb);`
			`assert ( (int64_t) ldc_ == ldc);`

			`cublasOperation_t transa_ = CUBLAS_OP_N;`
			`cublasOperation_t transb_ = CUBLAS_OP_N;`
Working on r1 2024-07-02 17:22:41 +02:00			`if (transa == 'T' \|\| transa == 't') transa_ = CUBLAS_OP_T;`
			`if (transb == 'T' \|\| transb == 't') transb_ = CUBLAS_OP_T;`
Added Nvidia module 2024-06-28 11:00:58 +02:00
H_oo on GPU 2024-06-29 02:27:50 +02:00			`cublasDgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_);`
Added Nvidia module 2024-06-28 11:00:58 +02:00
			`}`


Working on r1 2024-07-02 17:22:41 +02:00			`void gpu_sgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float alpha,`
Added Nvidia module 2024-06-28 11:00:58 +02:00			`const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {`
H_oo on GPU 2024-06-29 02:27:50 +02:00			`assert (handle != NULL);`
Added Nvidia module 2024-06-28 11:00:58 +02:00
H_oo on GPU 2024-06-29 02:27:50 +02:00			`/* Convert to int */`
			`int m_, n_, lda_, ldb_, ldc_;`
Added Nvidia module 2024-06-28 11:00:58 +02:00
H_oo on GPU 2024-06-29 02:27:50 +02:00			`m_ = (int) m;`
			`n_ = (int) n;`
			`lda_ = (int) lda;`
			`ldb_ = (int) ldb;`
			`ldc_ = (int) ldc;`
Added Nvidia module 2024-06-28 11:00:58 +02:00
			`/* Check for integer overflows */`
			`assert ( (int64_t) m_ == m );`
			`assert ( (int64_t) n_ == n );`
			`assert ( (int64_t) lda_ == lda);`
			`assert ( (int64_t) ldb_ == ldb);`
			`assert ( (int64_t) ldc_ == ldc);`

			`cublasOperation_t transa_ = CUBLAS_OP_N;`
			`cublasOperation_t transb_ = CUBLAS_OP_N;`
Working on r1 2024-07-02 17:22:41 +02:00			`if (transa == 'T' \|\| transa == 't') transa_ = CUBLAS_OP_T;`
			`if (transb == 'T' \|\| transb == 't') transb_ = CUBLAS_OP_T;`
Added Nvidia module 2024-06-28 11:00:58 +02:00
H_oo on GPU 2024-06-29 02:27:50 +02:00			`cublasSgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_);`
Added Nvidia module 2024-06-28 11:00:58 +02:00
			`}`