mirror of
https://github.com/QuantumPackage/qp2.git
synced 2024-12-22 12:23:43 +01:00
H_oo on GPU
This commit is contained in:
parent
b467bef6dd
commit
860121d404
@ -1,5 +1,6 @@
|
|||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
#include <stdbool.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
@ -10,6 +11,10 @@
|
|||||||
|
|
||||||
/* Generic functions */
|
/* Generic functions */
|
||||||
|
|
||||||
|
bool no_gpu() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
int gpu_ndevices() {
|
int gpu_ndevices() {
|
||||||
int ngpus;
|
int ngpus;
|
||||||
cudaGetDeviceCount(&ngpus);
|
cudaGetDeviceCount(&ngpus);
|
||||||
@ -17,7 +22,7 @@ int gpu_ndevices() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void gpu_set_device(int32_t igpu) {
|
void gpu_set_device(int32_t igpu) {
|
||||||
cudaSetDevice(igpu);
|
cudaSetDevice((int) igpu);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -64,22 +69,20 @@ void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
|
|||||||
|
|
||||||
/* Streams */
|
/* Streams */
|
||||||
|
|
||||||
void gpu_stream_create(void** ptr) {
|
void gpu_stream_create(cudaStream_t* ptr) {
|
||||||
cudaStream_t stream;
|
cudaError_t rc = cudaStreamCreate(ptr);
|
||||||
cudaError_t rc = cudaStreamCreate(&stream);
|
|
||||||
assert (rc == cudaSuccess);
|
assert (rc == cudaSuccess);
|
||||||
*ptr = (void*) stream;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void gpu_stream_destroy(void** ptr) {
|
void gpu_stream_destroy(cudaStream_t* ptr) {
|
||||||
assert (*ptr != NULL);
|
assert (ptr != NULL);
|
||||||
cudaError_t rc = cudaStreamDestroy( (cudaStream_t) *ptr);
|
cudaError_t rc = cudaStreamDestroy(*ptr);
|
||||||
assert (rc == cudaSuccess);
|
assert (rc == cudaSuccess);
|
||||||
*ptr = NULL;
|
*ptr = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
void gpu_set_stream(void** handle, void** stream) {
|
void gpu_set_stream(cublasHandle_t handle, cudaStream_t stream) {
|
||||||
cublasSetStream( (cublasHandle_t) *handle, (cudaStream_t) *stream);
|
cublasSetStream(handle, stream);
|
||||||
}
|
}
|
||||||
|
|
||||||
void gpu_synchronize() {
|
void gpu_synchronize() {
|
||||||
@ -89,75 +92,80 @@ void gpu_synchronize() {
|
|||||||
|
|
||||||
/* BLAS functions */
|
/* BLAS functions */
|
||||||
|
|
||||||
void gpu_blas_create(void** handle) {
|
void gpu_blas_create(cublasHandle_t* ptr) {
|
||||||
cublasHandle_t cublas_handle;
|
cublasStatus_t rc = cublasCreate(ptr);
|
||||||
cublasStatus_t rc = cublasCreate(&cublas_handle);
|
|
||||||
assert (rc == CUBLAS_STATUS_SUCCESS);
|
assert (rc == CUBLAS_STATUS_SUCCESS);
|
||||||
*handle = (void*) cublas_handle;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void gpu_blas_destroy(void** handle) {
|
void gpu_blas_destroy(cublasHandle_t* ptr) {
|
||||||
assert (*handle != NULL);
|
assert (ptr != NULL);
|
||||||
cublasStatus_t rc = cublasDestroy( (cublasHandle_t) *handle);
|
cublasStatus_t rc = cublasDestroy(*ptr);
|
||||||
assert (rc == CUBLAS_STATUS_SUCCESS);
|
assert (rc == CUBLAS_STATUS_SUCCESS);
|
||||||
*handle = NULL;
|
ptr = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void gpu_ddot(void** handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
|
void gpu_ddot(cublasHandle_t handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
|
||||||
assert (*handle != NULL);
|
assert (handle != NULL);
|
||||||
|
/* Convert to int */
|
||||||
|
int n_, incx_, incy_;
|
||||||
|
|
||||||
/* Convert to int32_t */
|
n_ = (int) n;
|
||||||
int32_t n_, incx_, incy_;
|
incx_ = (int) incx;
|
||||||
|
incy_ = (int) incy;
|
||||||
|
|
||||||
n_ = (int32_t) n;
|
assert ( (int64_t) n_ == n );
|
||||||
incx_ = (int32_t) incx;
|
assert ( (int64_t) incx_ == incx);
|
||||||
incy_ = (int32_t) incy;
|
assert ( (int64_t) incy_ == incy);
|
||||||
|
|
||||||
|
cublasStatus_t rc = cublasDdot(handle, n_, x, incx_, y, incy_, result);
|
||||||
|
/*
|
||||||
|
double alpha = 1.0;
|
||||||
|
double beta = 0.0;
|
||||||
|
cublasStatus_t rc = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, n_, &alpha, x, 1, y, n_, &beta, &result_, 1);
|
||||||
|
*/
|
||||||
|
assert (rc == CUBLAS_STATUS_SUCCESS);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
void gpu_sdot(cublasHandle_t handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
|
||||||
|
assert (handle != NULL);
|
||||||
|
|
||||||
|
/* Convert to int */
|
||||||
|
int n_, incx_, incy_;
|
||||||
|
|
||||||
|
n_ = (int) n;
|
||||||
|
incx_ = (int) incx;
|
||||||
|
incy_ = (int) incy;
|
||||||
|
|
||||||
/* Check for integer overflows */
|
/* Check for integer overflows */
|
||||||
assert ( (int64_t) n_ == n );
|
assert ( (int64_t) n_ == n );
|
||||||
assert ( (int64_t) incx_ == incx);
|
assert ( (int64_t) incx_ == incx);
|
||||||
assert ( (int64_t) incy_ == incy);
|
assert ( (int64_t) incy_ == incy);
|
||||||
|
|
||||||
cublasDdot((cublasHandle_t) *handle, n_, x, incx_, y, incy_, result);
|
float result_ = 0.;
|
||||||
|
cublasStatus_t rc = cublasSdot(handle, n_, x, incx_, y, incy_, &result_);
|
||||||
|
assert (rc == CUBLAS_STATUS_SUCCESS);
|
||||||
|
*result = result_;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
void gpu_sdot(void** handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
|
void gpu_dgemv(cublasHandle_t handle, const char transa, const int64_t m, const int64_t n, const double alpha,
|
||||||
assert (*handle != NULL);
|
|
||||||
|
|
||||||
/* Convert to int32_t */
|
|
||||||
int32_t n_, incx_, incy_;
|
|
||||||
|
|
||||||
n_ = (int32_t) n;
|
|
||||||
incx_ = (int32_t) incx;
|
|
||||||
incy_ = (int32_t) incy;
|
|
||||||
|
|
||||||
/* Check for integer overflows */
|
|
||||||
assert ( (int64_t) n_ == n );
|
|
||||||
assert ( (int64_t) incx_ == incx);
|
|
||||||
assert ( (int64_t) incy_ == incy);
|
|
||||||
|
|
||||||
cublasSdot((cublasHandle_t) *handle, n_, x, incx_, y, incy_, result);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t n, const double alpha,
|
|
||||||
const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) {
|
const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) {
|
||||||
|
|
||||||
assert (*handle != NULL);
|
assert (handle != NULL);
|
||||||
|
|
||||||
/* Convert to int32_t */
|
/* Convert to int */
|
||||||
int32_t m_, n_, lda_, incx_, incy_;
|
int m_, n_, lda_, incx_, incy_;
|
||||||
|
|
||||||
m_ = (int32_t) m;
|
m_ = (int) m;
|
||||||
n_ = (int32_t) n;
|
n_ = (int) n;
|
||||||
lda_ = (int32_t) lda;
|
lda_ = (int) lda;
|
||||||
incx_ = (int32_t) incx;
|
incx_ = (int) incx;
|
||||||
incy_ = (int32_t) incy;
|
incy_ = (int) incy;
|
||||||
|
|
||||||
/* Check for integer overflows */
|
/* Check for integer overflows */
|
||||||
assert ( (int64_t) m_ == m );
|
assert ( (int64_t) m_ == m );
|
||||||
@ -169,24 +177,24 @@ void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t
|
|||||||
cublasOperation_t transa_ = CUBLAS_OP_N;
|
cublasOperation_t transa_ = CUBLAS_OP_N;
|
||||||
if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
|
if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
|
||||||
|
|
||||||
cublasDgemv((cublasHandle_t) *handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_);
|
cublasDgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t n, const float alpha,
|
void gpu_sgemv(cublasHandle_t handle, const char transa, const int64_t m, const int64_t n, const float alpha,
|
||||||
const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) {
|
const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) {
|
||||||
|
|
||||||
assert (*handle != NULL);
|
assert (handle != NULL);
|
||||||
|
|
||||||
/* Convert to int32_t */
|
/* Convert to int */
|
||||||
int32_t m_, n_, lda_, incx_, incy_;
|
int m_, n_, lda_, incx_, incy_;
|
||||||
|
|
||||||
m_ = (int32_t) m;
|
m_ = (int) m;
|
||||||
n_ = (int32_t) n;
|
n_ = (int) n;
|
||||||
lda_ = (int32_t) lda;
|
lda_ = (int) lda;
|
||||||
incx_ = (int32_t) incx;
|
incx_ = (int) incx;
|
||||||
incy_ = (int32_t) incy;
|
incy_ = (int) incy;
|
||||||
|
|
||||||
/* Check for integer overflows */
|
/* Check for integer overflows */
|
||||||
assert ( (int64_t) m_ == m );
|
assert ( (int64_t) m_ == m );
|
||||||
@ -198,24 +206,24 @@ void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t
|
|||||||
cublasOperation_t transa_ = CUBLAS_OP_N;
|
cublasOperation_t transa_ = CUBLAS_OP_N;
|
||||||
if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
|
if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
|
||||||
|
|
||||||
cublasSgemv((cublasHandle_t) *handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_);
|
cublasSgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void gpu_dgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
|
void gpu_dgemm(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
|
||||||
const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) {
|
const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) {
|
||||||
|
|
||||||
assert (*handle != NULL);
|
assert (handle != NULL);
|
||||||
|
|
||||||
/* Convert to int32_t */
|
/* Convert to int */
|
||||||
int32_t m_, n_, k_, lda_, ldb_, ldc_;
|
int m_, n_, k_, lda_, ldb_, ldc_;
|
||||||
|
|
||||||
m_ = (int32_t) m;
|
m_ = (int) m;
|
||||||
n_ = (int32_t) n;
|
n_ = (int) n;
|
||||||
k_ = (int32_t) k;
|
k_ = (int) k;
|
||||||
lda_ = (int32_t) lda;
|
lda_ = (int) lda;
|
||||||
ldb_ = (int32_t) ldb;
|
ldb_ = (int) ldb;
|
||||||
ldc_ = (int32_t) ldc;
|
ldc_ = (int) ldc;
|
||||||
|
|
||||||
/* Check for integer overflows */
|
/* Check for integer overflows */
|
||||||
assert ( (int64_t) m_ == m );
|
assert ( (int64_t) m_ == m );
|
||||||
@ -230,25 +238,25 @@ void gpu_dgemm(void** handle, const char transa, const char transb, const int64_
|
|||||||
if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
|
if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
|
||||||
if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
|
if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
|
||||||
|
|
||||||
cublasDgemm((cublasHandle_t) *handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_);
|
cublasDgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
void gpu_sgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
|
void gpu_sgemm(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
|
||||||
const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) {
|
const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) {
|
||||||
|
|
||||||
assert (*handle != NULL);
|
assert (handle != NULL);
|
||||||
|
|
||||||
/* Convert to int32_t */
|
/* Convert to int */
|
||||||
int32_t m_, n_, k_, lda_, ldb_, ldc_;
|
int m_, n_, k_, lda_, ldb_, ldc_;
|
||||||
|
|
||||||
m_ = (int32_t) m;
|
m_ = (int) m;
|
||||||
n_ = (int32_t) n;
|
n_ = (int) n;
|
||||||
k_ = (int32_t) k;
|
k_ = (int) k;
|
||||||
lda_ = (int32_t) lda;
|
lda_ = (int) lda;
|
||||||
ldb_ = (int32_t) ldb;
|
ldb_ = (int) ldb;
|
||||||
ldc_ = (int32_t) ldc;
|
ldc_ = (int) ldc;
|
||||||
|
|
||||||
/* Check for integer overflows */
|
/* Check for integer overflows */
|
||||||
assert ( (int64_t) m_ == m );
|
assert ( (int64_t) m_ == m );
|
||||||
@ -263,22 +271,22 @@ void gpu_sgemm(void** handle, const char transa, const char transb, const int64_
|
|||||||
if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
|
if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
|
||||||
if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
|
if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
|
||||||
|
|
||||||
cublasSgemm((cublasHandle_t) *handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_);
|
cublasSgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void gpu_dgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
|
void gpu_dgeam(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
|
||||||
const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
|
const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
|
||||||
assert (*handle != NULL);
|
assert (handle != NULL);
|
||||||
|
|
||||||
/* Convert to int32_t */
|
/* Convert to int */
|
||||||
int32_t m_, n_, lda_, ldb_, ldc_;
|
int m_, n_, lda_, ldb_, ldc_;
|
||||||
|
|
||||||
m_ = (int32_t) m;
|
m_ = (int) m;
|
||||||
n_ = (int32_t) n;
|
n_ = (int) n;
|
||||||
lda_ = (int32_t) lda;
|
lda_ = (int) lda;
|
||||||
ldb_ = (int32_t) ldb;
|
ldb_ = (int) ldb;
|
||||||
ldc_ = (int32_t) ldc;
|
ldc_ = (int) ldc;
|
||||||
|
|
||||||
/* Check for integer overflows */
|
/* Check for integer overflows */
|
||||||
assert ( (int64_t) m_ == m );
|
assert ( (int64_t) m_ == m );
|
||||||
@ -292,23 +300,23 @@ void gpu_dgeam(void** handle, const char transa, const char transb, const int64_
|
|||||||
if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
|
if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
|
||||||
if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
|
if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
|
||||||
|
|
||||||
cublasDgeam((cublasHandle_t) *handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_);
|
cublasDgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void gpu_sgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
|
void gpu_sgeam(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
|
||||||
const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
|
const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
|
||||||
assert (*handle != NULL);
|
assert (handle != NULL);
|
||||||
|
|
||||||
/* Convert to int32_t */
|
/* Convert to int */
|
||||||
int32_t m_, n_, lda_, ldb_, ldc_;
|
int m_, n_, lda_, ldb_, ldc_;
|
||||||
|
|
||||||
m_ = (int32_t) m;
|
m_ = (int) m;
|
||||||
n_ = (int32_t) n;
|
n_ = (int) n;
|
||||||
lda_ = (int32_t) lda;
|
lda_ = (int) lda;
|
||||||
ldb_ = (int32_t) ldb;
|
ldb_ = (int) ldb;
|
||||||
ldc_ = (int32_t) ldc;
|
ldc_ = (int) ldc;
|
||||||
|
|
||||||
/* Check for integer overflows */
|
/* Check for integer overflows */
|
||||||
assert ( (int64_t) m_ == m );
|
assert ( (int64_t) m_ == m );
|
||||||
@ -322,6 +330,6 @@ void gpu_sgeam(void** handle, const char transa, const char transb, const int64_
|
|||||||
if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
|
if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
|
||||||
if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
|
if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
|
||||||
|
|
||||||
cublasSgeam((cublasHandle_t) *handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_);
|
cublasSgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -2,8 +2,12 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
#include <stdbool.h>
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
|
|
||||||
|
bool no_gpu() {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
/* Generic functions */
|
/* Generic functions */
|
||||||
|
|
||||||
@ -56,7 +60,7 @@ void gpu_stream_destroy(void** ptr) {
|
|||||||
*ptr = NULL;
|
*ptr = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
void gpu_set_stream(void** handle, void** stream) {
|
void gpu_set_stream(void* handle, void* stream) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -79,8 +83,8 @@ void gpu_blas_destroy(void** handle) {
|
|||||||
|
|
||||||
double ddot_(const int32_t* n, const double* x, const int32_t* incx, const double* y, const int32_t* incy);
|
double ddot_(const int32_t* n, const double* x, const int32_t* incx, const double* y, const int32_t* incy);
|
||||||
|
|
||||||
void gpu_ddot(void** handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
|
void gpu_ddot(void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
|
||||||
assert (*handle != NULL);
|
assert (handle != NULL);
|
||||||
|
|
||||||
/* Convert to int32_t */
|
/* Convert to int32_t */
|
||||||
int32_t n_, incx_, incy_;
|
int32_t n_, incx_, incy_;
|
||||||
@ -100,8 +104,8 @@ void gpu_ddot(void** handle, const int64_t n, const double* x, const int64_t inc
|
|||||||
|
|
||||||
float sdot_(const int32_t* n, const float* x, const int32_t* incx, const float* y, const int32_t* incy);
|
float sdot_(const int32_t* n, const float* x, const int32_t* incx, const float* y, const int32_t* incy);
|
||||||
|
|
||||||
void gpu_sdot(void** handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
|
void gpu_sdot(void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
|
||||||
assert (*handle != NULL);
|
assert (handle != NULL);
|
||||||
|
|
||||||
/* Convert to int32_t */
|
/* Convert to int32_t */
|
||||||
int32_t n_, incx_, incy_;
|
int32_t n_, incx_, incy_;
|
||||||
@ -122,10 +126,10 @@ void gpu_sdot(void** handle, const int64_t n, const float* x, const int64_t incx
|
|||||||
void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha,
|
void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha,
|
||||||
const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy);
|
const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy);
|
||||||
|
|
||||||
void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t n, const double alpha,
|
void gpu_dgemv(void* handle, const char transa, const int64_t m, const int64_t n, const double alpha,
|
||||||
const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) {
|
const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) {
|
||||||
|
|
||||||
assert (*handle != NULL);
|
assert (handle != NULL);
|
||||||
|
|
||||||
/* Convert to int32_t */
|
/* Convert to int32_t */
|
||||||
int32_t m_, n_, lda_, incx_, incy_;
|
int32_t m_, n_, lda_, incx_, incy_;
|
||||||
@ -150,10 +154,10 @@ void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t
|
|||||||
void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha,
|
void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha,
|
||||||
const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy);
|
const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy);
|
||||||
|
|
||||||
void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t n, const float alpha,
|
void gpu_sgemv(void* handle, const char transa, const int64_t m, const int64_t n, const float alpha,
|
||||||
const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) {
|
const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) {
|
||||||
|
|
||||||
assert (*handle != NULL);
|
assert (handle != NULL);
|
||||||
|
|
||||||
/* Convert to int32_t */
|
/* Convert to int32_t */
|
||||||
int32_t m_, n_, lda_, incx_, incy_;
|
int32_t m_, n_, lda_, incx_, incy_;
|
||||||
@ -178,10 +182,10 @@ void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t
|
|||||||
void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha,
|
void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha,
|
||||||
const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc);
|
const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc);
|
||||||
|
|
||||||
void gpu_dgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
|
void gpu_dgemm(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
|
||||||
const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) {
|
const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) {
|
||||||
|
|
||||||
assert (*handle != NULL);
|
assert (handle != NULL);
|
||||||
|
|
||||||
/* Convert to int32_t */
|
/* Convert to int32_t */
|
||||||
int32_t m_, n_, k_, lda_, ldb_, ldc_;
|
int32_t m_, n_, k_, lda_, ldb_, ldc_;
|
||||||
@ -209,10 +213,10 @@ void gpu_dgemm(void** handle, const char transa, const char transb, const int64_
|
|||||||
void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha,
|
void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha,
|
||||||
const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc);
|
const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc);
|
||||||
|
|
||||||
void gpu_sgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
|
void gpu_sgemm(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
|
||||||
const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) {
|
const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) {
|
||||||
|
|
||||||
assert (*handle != NULL);
|
assert (handle != NULL);
|
||||||
|
|
||||||
/* Convert to int32_t */
|
/* Convert to int32_t */
|
||||||
int32_t m_, n_, k_, lda_, ldb_, ldc_;
|
int32_t m_, n_, k_, lda_, ldb_, ldc_;
|
||||||
@ -236,9 +240,9 @@ void gpu_sgemm(void** handle, const char transa, const char transb, const int64_
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void gpu_dgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
|
void gpu_dgeam(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
|
||||||
const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
|
const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
|
||||||
assert (*handle != NULL);
|
assert (handle != NULL);
|
||||||
|
|
||||||
if ( (transa == 'N' && transb == 'N') ||
|
if ( (transa == 'N' && transb == 'N') ||
|
||||||
(transa == 'n' && transb == 'N') ||
|
(transa == 'n' && transb == 'N') ||
|
||||||
@ -368,9 +372,9 @@ void gpu_dgeam(void** handle, const char transa, const char transb, const int64_
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void gpu_sgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
|
void gpu_sgeam(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
|
||||||
const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
|
const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
|
||||||
assert (*handle != NULL);
|
assert (handle != NULL);
|
||||||
|
|
||||||
if ( (transa == 'N' && transb == 'N') ||
|
if ( (transa == 'N' && transb == 'N') ||
|
||||||
(transa == 'n' && transb == 'N') ||
|
(transa == 'n' && transb == 'N') ||
|
||||||
|
@ -14,9 +14,15 @@ subroutine run_ccsd_space_orb
|
|||||||
type(gpu_double2) :: t1, r1
|
type(gpu_double2) :: t1, r1
|
||||||
type(gpu_double2) :: H_oo, H_vv, H_vo
|
type(gpu_double2) :: H_oo, H_vv, H_vo
|
||||||
|
|
||||||
type(gpu_double2) :: d_cc_space_f_vo
|
type(gpu_double2) :: d_cc_space_f_oo, d_cc_space_f_vo
|
||||||
|
type(gpu_double2) :: d_cc_space_f_ov, d_cc_space_f_vv
|
||||||
|
|
||||||
|
type(gpu_double3) :: d_cc_space_v_oo_chol, d_cc_space_v_vo_chol
|
||||||
|
type(gpu_double3) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol
|
||||||
|
|
||||||
type(gpu_double4) :: d_cc_space_v_oovv
|
type(gpu_double4) :: d_cc_space_v_oovv
|
||||||
|
|
||||||
|
|
||||||
double precision, allocatable :: all_err(:,:), all_t(:,:)
|
double precision, allocatable :: all_err(:,:), all_t(:,:)
|
||||||
integer, allocatable :: list_occ(:), list_vir(:)
|
integer, allocatable :: list_occ(:), list_vir(:)
|
||||||
integer(bit_kind) :: det(N_int,2)
|
integer(bit_kind) :: det(N_int,2)
|
||||||
@ -24,7 +30,7 @@ subroutine run_ccsd_space_orb
|
|||||||
|
|
||||||
call set_multiple_levels_omp(.False.)
|
call set_multiple_levels_omp(.False.)
|
||||||
|
|
||||||
if (do_ao_cholesky) then
|
if (do_mo_cholesky) then
|
||||||
PROVIDE cholesky_mo_transp
|
PROVIDE cholesky_mo_transp
|
||||||
FREE cholesky_ao
|
FREE cholesky_ao
|
||||||
else
|
else
|
||||||
@ -55,11 +61,36 @@ subroutine run_ccsd_space_orb
|
|||||||
!print*,'occ',list_occ
|
!print*,'occ',list_occ
|
||||||
!print*,'vir',list_vir
|
!print*,'vir',list_vir
|
||||||
|
|
||||||
|
! GPU arrays
|
||||||
|
call gpu_allocate(d_cc_space_f_oo, nO, nO)
|
||||||
call gpu_allocate(d_cc_space_f_vo, nV, nO)
|
call gpu_allocate(d_cc_space_f_vo, nV, nO)
|
||||||
call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV)
|
call gpu_allocate(d_cc_space_f_ov, nO, nV)
|
||||||
call gpu_upload(cc_space_f_vo, d_cc_space_f_vo)
|
call gpu_allocate(d_cc_space_f_vv, nV, nV)
|
||||||
call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv)
|
|
||||||
|
|
||||||
|
call gpu_upload(cc_space_f_oo, d_cc_space_f_oo)
|
||||||
|
call gpu_upload(cc_space_f_vo, d_cc_space_f_vo)
|
||||||
|
call gpu_upload(cc_space_f_vv, d_cc_space_f_vv)
|
||||||
|
|
||||||
|
! FREE cc_space_f_oo
|
||||||
|
! FREE cc_space_f_vo
|
||||||
|
! FREE cc_space_f_vv
|
||||||
|
|
||||||
|
if (do_mo_cholesky) then
|
||||||
|
call gpu_allocate(d_cc_space_v_oo_chol, cholesky_mo_num, nO, nO)
|
||||||
|
call gpu_allocate(d_cc_space_v_ov_chol, cholesky_mo_num, nO, nV)
|
||||||
|
call gpu_allocate(d_cc_space_v_vo_chol, cholesky_mo_num, nV, nO)
|
||||||
|
call gpu_allocate(d_cc_space_v_vv_chol, cholesky_mo_num, nV, nV)
|
||||||
|
|
||||||
|
call gpu_upload(cc_space_v_oo_chol, d_cc_space_v_oo_chol)
|
||||||
|
call gpu_upload(cc_space_v_ov_chol, d_cc_space_v_ov_chol)
|
||||||
|
call gpu_upload(cc_space_v_vo_chol, d_cc_space_v_vo_chol)
|
||||||
|
call gpu_upload(cc_space_v_vv_chol, d_cc_space_v_vv_chol)
|
||||||
|
|
||||||
|
! FREE cc_space_v_oo_chol
|
||||||
|
! FREE cc_space_v_ov_chol
|
||||||
|
! FREE cc_space_v_vo_chol
|
||||||
|
! FREE cc_space_v_vv_chol
|
||||||
|
endif
|
||||||
|
|
||||||
call gpu_allocate(t2, nO,nO,nV,nV)
|
call gpu_allocate(t2, nO,nO,nV,nV)
|
||||||
call gpu_allocate(r2, nO,nO,nV,nV)
|
call gpu_allocate(r2, nO,nO,nV,nV)
|
||||||
@ -120,6 +151,13 @@ subroutine run_ccsd_space_orb
|
|||||||
call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,h_t2)
|
call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,h_t2)
|
||||||
call gpu_upload(h_t2, t2)
|
call gpu_upload(h_t2, t2)
|
||||||
|
|
||||||
|
|
||||||
|
call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV)
|
||||||
|
call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv)
|
||||||
|
|
||||||
|
! FREE cc_space_v_oovv
|
||||||
|
|
||||||
|
|
||||||
call update_tau_space(nO,nV,h_t1,t1,t2,tau)
|
call update_tau_space(nO,nV,h_t1,t1,t2,tau)
|
||||||
call update_tau_x_space(nO,nV,tau,tau_x)
|
call update_tau_x_space(nO,nV,tau,tau_x)
|
||||||
!print*,'hf_energy', hf_energy
|
!print*,'hf_energy', hf_energy
|
||||||
@ -142,10 +180,10 @@ subroutine run_ccsd_space_orb
|
|||||||
do while (not_converged)
|
do while (not_converged)
|
||||||
|
|
||||||
! Residue
|
! Residue
|
||||||
if (do_ao_cholesky) then
|
if (do_mo_cholesky) then
|
||||||
! if (.False.) then
|
call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, &
|
||||||
call compute_H_oo_chol(nO,nV,tau_x,H_oo)
|
d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo)
|
||||||
call compute_H_vv_chol(nO,nV,tau_x%f,H_vv%f)
|
call compute_H_vv_chol(nO,nV,tau_x,H_vv)
|
||||||
call compute_H_vo_chol(nO,nV,t1%f,H_vo%f)
|
call compute_H_vo_chol(nO,nV,t1%f,H_vo%f)
|
||||||
|
|
||||||
call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1)
|
call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1)
|
||||||
@ -249,6 +287,12 @@ subroutine run_ccsd_space_orb
|
|||||||
call save_energy(uncorr_energy + energy, e_t)
|
call save_energy(uncorr_energy + energy, e_t)
|
||||||
|
|
||||||
deallocate(h_t1, h_t2)
|
deallocate(h_t1, h_t2)
|
||||||
|
if (do_mo_cholesky) then
|
||||||
|
call gpu_deallocate(d_cc_space_v_oo_chol)
|
||||||
|
call gpu_deallocate(d_cc_space_v_ov_chol)
|
||||||
|
call gpu_deallocate(d_cc_space_v_vo_chol)
|
||||||
|
call gpu_deallocate(d_cc_space_v_vv_chol)
|
||||||
|
endif
|
||||||
call gpu_deallocate(d_cc_space_f_vo)
|
call gpu_deallocate(d_cc_space_f_vo)
|
||||||
call gpu_deallocate(d_cc_space_v_oovv)
|
call gpu_deallocate(d_cc_space_v_oovv)
|
||||||
call gpu_deallocate(t1)
|
call gpu_deallocate(t1)
|
||||||
@ -302,8 +346,21 @@ subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,
|
|||||||
! !$omp end parallel
|
! !$omp end parallel
|
||||||
|
|
||||||
|
|
||||||
call gpu_ddot(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1, d_cc_space_v_oovv, 1, energy)
|
type(gpu_stream) :: s1, s2
|
||||||
call gpu_ddot(blas_handle, nO*nV*1_8, d_cc_space_f_vo, 1, t1, 1, e)
|
call gpu_stream_create(s1)
|
||||||
|
call gpu_stream_create(s2)
|
||||||
|
|
||||||
|
call gpu_set_stream(blas_handle,s1)
|
||||||
|
call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo, 1, t1, 1, e)
|
||||||
|
|
||||||
|
call gpu_set_stream(blas_handle,s2)
|
||||||
|
call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1_8, d_cc_space_v_oovv, 1_8, energy)
|
||||||
|
call gpu_synchronize()
|
||||||
|
call gpu_set_stream(blas_handle,gpu_default_stream)
|
||||||
|
|
||||||
|
call gpu_stream_destroy(s1)
|
||||||
|
call gpu_stream_destroy(s2)
|
||||||
|
|
||||||
energy = energy + 2.d0*e
|
energy = energy + 2.d0*e
|
||||||
|
|
||||||
end
|
end
|
||||||
@ -346,32 +403,29 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau)
|
|||||||
|
|
||||||
type(gpu_stream) :: stream(nV)
|
type(gpu_stream) :: stream(nV)
|
||||||
|
|
||||||
do b=1,nV
|
!$OMP PARALLEL if (no_gpu()) &
|
||||||
call gpu_stream_create(stream(b))
|
|
||||||
enddo
|
|
||||||
|
|
||||||
!$OMP PARALLEL &
|
|
||||||
!$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) &
|
!$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) &
|
||||||
!$OMP PRIVATE(i,j,a,b) &
|
!$OMP PRIVATE(i,j,a,b) &
|
||||||
!$OMP DEFAULT(NONE)
|
!$OMP DEFAULT(NONE)
|
||||||
!$OMP DO
|
!$OMP DO
|
||||||
do b=1,nV
|
do b=1,nV
|
||||||
|
call gpu_stream_create(stream(b))
|
||||||
call gpu_set_stream(blas_handle,stream(b))
|
call gpu_set_stream(blas_handle,stream(b))
|
||||||
do j=1,nO
|
do j=1,nO
|
||||||
call gpu_dgeam(blas_handle, 'N', 'N', nO*1_8, nV*1_8, &
|
call gpu_dgeam_f(blas_handle, 'N', 'N', nO, nV, &
|
||||||
1.d0, t2%f(1,j,1,b), nO*nO*1_8, &
|
1.d0, t2%f(1,j,1,b), nO*nO, &
|
||||||
h_t1(j,b), t1%f, nO*1_8, &
|
h_t1(j,b), t1%f, nO, &
|
||||||
tau%f(1,j,1,b), nO*nO*1_8)
|
tau%f(1,j,1,b), nO*nO)
|
||||||
enddo
|
enddo
|
||||||
enddo
|
enddo
|
||||||
!$OMP END DO
|
!$OMP END DO
|
||||||
!$OMP END PARALLEL
|
!$OMP END PARALLEL
|
||||||
|
|
||||||
call gpu_synchronize()
|
|
||||||
|
|
||||||
do b=1,nV
|
do b=1,nV
|
||||||
call gpu_stream_destroy(stream(b))
|
call gpu_stream_destroy(stream(b))
|
||||||
enddo
|
enddo
|
||||||
|
call gpu_set_stream(blas_handle,gpu_default_stream)
|
||||||
|
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
@ -412,7 +466,7 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x)
|
|||||||
call gpu_stream_create(stream(a))
|
call gpu_stream_create(stream(a))
|
||||||
enddo
|
enddo
|
||||||
|
|
||||||
!$OMP PARALLEL &
|
!$OMP PARALLEL if (no_gpu()) &
|
||||||
!$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) &
|
!$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) &
|
||||||
!$OMP PRIVATE(i,j,a,b) &
|
!$OMP PRIVATE(i,j,a,b) &
|
||||||
!$OMP DEFAULT(NONE)
|
!$OMP DEFAULT(NONE)
|
||||||
@ -420,20 +474,20 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x)
|
|||||||
do b=1,nV
|
do b=1,nV
|
||||||
do a=1,nV
|
do a=1,nV
|
||||||
call gpu_set_stream(blas_handle,stream(a))
|
call gpu_set_stream(blas_handle,stream(a))
|
||||||
call gpu_dgeam(blas_handle, 'N', 'N', nO*1_8, nO*1_8, &
|
call gpu_dgeam_f(blas_handle, 'N', 'N', nO, nO, &
|
||||||
2.d0, tau%f(1,1,a,b), nO*1_8, &
|
2.d0, tau%f(1,1,a,b), nO, &
|
||||||
-1.d0, tau%f(1,1,b,a), nO*1_8, &
|
-1.d0, tau%f(1,1,b,a), nO, &
|
||||||
tau_x%f(1,1,a,b), nO*1_8)
|
tau_x%f(1,1,a,b), nO)
|
||||||
enddo
|
enddo
|
||||||
enddo
|
enddo
|
||||||
!$OMP END DO
|
!$OMP END DO
|
||||||
!$OMP END PARALLEL
|
!$OMP END PARALLEL
|
||||||
|
|
||||||
call gpu_synchronize()
|
|
||||||
|
|
||||||
do b=1,nV
|
do b=1,nV
|
||||||
call gpu_stream_destroy(stream(b))
|
call gpu_stream_destroy(stream(b))
|
||||||
enddo
|
enddo
|
||||||
|
call gpu_set_stream(blas_handle,gpu_default_stream)
|
||||||
|
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -293,62 +293,115 @@ end
|
|||||||
|
|
||||||
! H_oo
|
! H_oo
|
||||||
|
|
||||||
subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo)
|
subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, &
|
||||||
|
d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo)
|
||||||
use gpu
|
use gpu
|
||||||
implicit none
|
implicit none
|
||||||
|
|
||||||
integer, intent(in) :: nO,nV
|
integer, intent(in) :: nO,nV
|
||||||
|
type(gpu_double2), intent(in) :: d_cc_space_f_oo
|
||||||
|
type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol
|
||||||
type(gpu_double4), intent(in) :: tau_x
|
type(gpu_double4), intent(in) :: tau_x
|
||||||
type(gpu_double2), intent(out) :: H_oo
|
type(gpu_double2), intent(out) :: H_oo
|
||||||
|
|
||||||
integer :: a,b,i,j,u,k
|
integer :: a,b,i,j,u,k
|
||||||
|
|
||||||
double precision, allocatable :: tau_kau(:,:,:), tmp_vov(:,:,:)
|
type(gpu_double3) :: tau_kau, tmp_vov, tmp_ovv
|
||||||
|
|
||||||
allocate(tau_kau(cholesky_mo_num,nV,nO))
|
call gpu_allocate(tau_kau, cholesky_mo_num, nV, nO)
|
||||||
!$omp parallel &
|
|
||||||
!$omp default(shared) &
|
! !$omp parallel &
|
||||||
!$omp private(i,u,j,k,a,b,tmp_vov)
|
! !$omp default(shared) &
|
||||||
allocate(tmp_vov(nV,nO,nV) )
|
! !$omp private(i,u,j,k,a,b,tmp_vov)
|
||||||
!$omp do
|
! call gpu_allocate(tmp_vov, nV, nO, nV)
|
||||||
do u = 1, nO
|
! !$omp do
|
||||||
|
! do u = 1, nO
|
||||||
|
! do b=1,nV
|
||||||
|
! do j=1,nO
|
||||||
|
! do a=1,nV
|
||||||
|
! tmp_vov%f(a,j,b) = tau_x%f(u,j,a,b)
|
||||||
|
! enddo
|
||||||
|
! enddo
|
||||||
|
! enddo
|
||||||
|
! call dgemm('N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
|
||||||
|
! d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, tmp_vov%f, nV, &
|
||||||
|
! 0.d0, tau_kau%f(1,1,u), cholesky_mo_num)
|
||||||
|
! enddo
|
||||||
|
! !$omp end do nowait
|
||||||
|
! call gpu_deallocate(tmp_vov)
|
||||||
|
! !$omp do
|
||||||
|
! do i = 1, nO
|
||||||
|
! do u = 1, nO
|
||||||
|
! H_oo%f(u,i) = d_cc_space_f_oo%f(u,i)
|
||||||
|
! enddo
|
||||||
|
! enddo
|
||||||
|
! !$omp end do nowait
|
||||||
|
!
|
||||||
|
! !$omp barrier
|
||||||
|
! !$omp end parallel
|
||||||
|
! call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
|
||||||
|
! tau_kau%f(1,1,1), cholesky_mo_num*nV, d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num*nV, &
|
||||||
|
! 1.d0, H_oo%f(1,1), nO)
|
||||||
|
!
|
||||||
|
|
||||||
|
type(gpu_stream) :: stream(nV)
|
||||||
|
|
||||||
|
do b=1,nV
|
||||||
|
call gpu_stream_create(stream(b))
|
||||||
|
enddo
|
||||||
|
|
||||||
|
!$OMP PARALLEL if (no_gpu()) &
|
||||||
|
!$OMP DEFAULT(SHARED) &
|
||||||
|
!$OMP PRIVATE(u,b,tmp_vov,tmp_ovv)
|
||||||
|
|
||||||
|
call gpu_allocate(tmp_vov, nV, nO, nV)
|
||||||
|
call gpu_allocate(tmp_ovv, nO, nV, nV)
|
||||||
|
|
||||||
|
!$OMP DO
|
||||||
|
do u=1,nO
|
||||||
|
call gpu_dgeam_f(blas_handle, 'N', 'N', 1, nO*nV*nV, 1.d0, &
|
||||||
|
tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f, nO, tmp_ovv%f, 1)
|
||||||
do b=1,nV
|
do b=1,nV
|
||||||
do j=1,nO
|
call gpu_set_stream(blas_handle,stream(b))
|
||||||
do a=1,nV
|
call gpu_dgeam_f(blas_handle, 'T', 'T', nV, nO, 1.d0, &
|
||||||
tmp_vov(a,j,b) = tau_x%f(u,j,a,b)
|
tmp_ovv%f(1,1,b), nO, 0.d0, &
|
||||||
enddo
|
tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV)
|
||||||
enddo
|
|
||||||
enddo
|
enddo
|
||||||
call dgemm('N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
|
call gpu_dgemm_f(blas_handle, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
|
||||||
cc_space_v_ov_chol, cholesky_mo_num, tmp_vov, nV, &
|
d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_vov%f, nV, &
|
||||||
0.d0, tau_kau(1,1,u), cholesky_mo_num)
|
0.d0, tau_kau%f(1,1,u), cholesky_mo_num)
|
||||||
|
call gpu_synchronize()
|
||||||
enddo
|
enddo
|
||||||
!$omp end do nowait
|
!$OMP END DO
|
||||||
deallocate(tmp_vov)
|
|
||||||
!$omp do
|
|
||||||
do i = 1, nO
|
|
||||||
do u = 1, nO
|
|
||||||
H_oo%f(u,i) = cc_space_f_oo(u,i)
|
|
||||||
enddo
|
|
||||||
enddo
|
|
||||||
!$omp end do nowait
|
|
||||||
!$omp barrier
|
|
||||||
!$omp end parallel
|
|
||||||
call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
|
|
||||||
tau_kau, cholesky_mo_num*nV, cc_space_v_vo_chol, cholesky_mo_num*nV, &
|
|
||||||
1.d0, H_oo%f, nO)
|
|
||||||
|
|
||||||
|
call gpu_deallocate(tmp_vov)
|
||||||
|
call gpu_deallocate(tmp_ovv)
|
||||||
|
!$OMP END PARALLEL
|
||||||
|
|
||||||
|
do b=1,nV
|
||||||
|
call gpu_stream_destroy(stream(b))
|
||||||
|
enddo
|
||||||
|
|
||||||
|
call gpu_set_stream(blas_handle,gpu_default_stream)
|
||||||
|
|
||||||
|
call gpu_copy(d_cc_space_f_oo, H_oo)
|
||||||
|
|
||||||
|
call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
|
||||||
|
tau_kau, cholesky_mo_num*nV, d_cc_space_v_vo_chol, cholesky_mo_num*nV, &
|
||||||
|
1.d0, H_oo, nO)
|
||||||
|
|
||||||
|
call gpu_deallocate(tau_kau)
|
||||||
end
|
end
|
||||||
|
|
||||||
! H_vv
|
! H_vv
|
||||||
|
|
||||||
subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv)
|
subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv)
|
||||||
|
use gpu
|
||||||
implicit none
|
implicit none
|
||||||
|
|
||||||
integer, intent(in) :: nO,nV
|
integer, intent(in) :: nO,nV
|
||||||
double precision, intent(in) :: tau_x(nO, nO, nV, nV)
|
type(gpu_double4), intent(in) :: tau_x
|
||||||
double precision, intent(out) :: H_vv(nV, nV)
|
type(gpu_double2), intent(out) :: H_vv
|
||||||
|
|
||||||
integer :: a,b,i,j,u,k, beta
|
integer :: a,b,i,j,u,k, beta
|
||||||
|
|
||||||
@ -364,7 +417,7 @@ subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv)
|
|||||||
do b=1,nV
|
do b=1,nV
|
||||||
do j=1,nO
|
do j=1,nO
|
||||||
do i=1,nO
|
do i=1,nO
|
||||||
tmp_oov(i,j,b) = tau_x(i,j,a,b)
|
tmp_oov(i,j,b) = tau_x%f(i,j,a,b)
|
||||||
enddo
|
enddo
|
||||||
enddo
|
enddo
|
||||||
enddo
|
enddo
|
||||||
@ -378,7 +431,7 @@ subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv)
|
|||||||
!$omp do
|
!$omp do
|
||||||
do beta = 1, nV
|
do beta = 1, nV
|
||||||
do a = 1, nV
|
do a = 1, nV
|
||||||
H_vv(a,beta) = cc_space_f_vv(a,beta)
|
H_vv%f(a,beta) = cc_space_f_vv(a,beta)
|
||||||
enddo
|
enddo
|
||||||
enddo
|
enddo
|
||||||
!$omp end do nowait
|
!$omp end do nowait
|
||||||
@ -386,7 +439,7 @@ subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv)
|
|||||||
!$omp end parallel
|
!$omp end parallel
|
||||||
call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
|
call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
|
||||||
tau_kia, cholesky_mo_num*nO, cc_space_v_ov_chol, cholesky_mo_num*nO, &
|
tau_kia, cholesky_mo_num*nO, cc_space_v_ov_chol, cholesky_mo_num*nO, &
|
||||||
1.d0, H_vv, nV)
|
1.d0, H_vv%f, nV)
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -8,4 +8,11 @@ BEGIN_PROVIDER [ type(gpu_blas), blas_handle ]
|
|||||||
call gpu_blas_create(blas_handle)
|
call gpu_blas_create(blas_handle)
|
||||||
END_PROVIDER
|
END_PROVIDER
|
||||||
|
|
||||||
|
BEGIN_PROVIDER [ type(gpu_stream), gpu_default_stream ]
|
||||||
|
implicit none
|
||||||
|
BEGIN_DOC
|
||||||
|
! Default stream
|
||||||
|
END_DOC
|
||||||
|
gpu_default_stream%c = C_NULL_PTR
|
||||||
|
END_PROVIDER
|
||||||
|
|
||||||
|
@ -49,7 +49,12 @@ module gpu
|
|||||||
! ------------
|
! ------------
|
||||||
|
|
||||||
interface
|
interface
|
||||||
|
logical(c_bool) function no_gpu() bind(C)
|
||||||
|
import
|
||||||
|
end function
|
||||||
|
|
||||||
integer function gpu_ndevices() bind(C)
|
integer function gpu_ndevices() bind(C)
|
||||||
|
import
|
||||||
end function
|
end function
|
||||||
|
|
||||||
subroutine gpu_set_device(id) bind(C)
|
subroutine gpu_set_device(id) bind(C)
|
||||||
@ -101,7 +106,7 @@ module gpu
|
|||||||
|
|
||||||
subroutine gpu_set_stream_c(handle, stream) bind(C, name='gpu_set_stream')
|
subroutine gpu_set_stream_c(handle, stream) bind(C, name='gpu_set_stream')
|
||||||
import
|
import
|
||||||
type(c_ptr) :: handle, stream
|
type(c_ptr), value :: handle, stream
|
||||||
end subroutine
|
end subroutine
|
||||||
|
|
||||||
subroutine gpu_synchronize() bind(C)
|
subroutine gpu_synchronize() bind(C)
|
||||||
@ -120,15 +125,15 @@ module gpu
|
|||||||
|
|
||||||
subroutine gpu_ddot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_ddot')
|
subroutine gpu_ddot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_ddot')
|
||||||
import
|
import
|
||||||
type(c_ptr), intent(in) :: handle
|
type(c_ptr), value, intent(in) :: handle
|
||||||
integer(c_int64_t), value :: n, incx, incy
|
integer(c_int64_t), value :: n, incx, incy
|
||||||
type(c_ptr), intent(in), value :: dx, dy
|
type(c_ptr), value :: dx, dy
|
||||||
real(c_double), intent(out) :: res
|
real(c_double), intent(out) :: res
|
||||||
end subroutine
|
end subroutine
|
||||||
|
|
||||||
subroutine gpu_sdot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_sdot')
|
subroutine gpu_sdot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_sdot')
|
||||||
import
|
import
|
||||||
type(c_ptr), intent(in) :: handle
|
type(c_ptr), value, intent(in) :: handle
|
||||||
integer(c_int64_t), value :: n, incx, incy
|
integer(c_int64_t), value :: n, incx, incy
|
||||||
type(c_ptr), intent(in), value :: dx, dy
|
type(c_ptr), intent(in), value :: dx, dy
|
||||||
real(c_float), intent(out) :: res
|
real(c_float), intent(out) :: res
|
||||||
@ -137,8 +142,8 @@ module gpu
|
|||||||
subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
|
subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
|
||||||
b, ldb, c, ldc) bind(C, name='gpu_dgeam')
|
b, ldb, c, ldc) bind(C, name='gpu_dgeam')
|
||||||
import
|
import
|
||||||
type(c_ptr), intent(in) :: handle
|
type(c_ptr), value, intent(in) :: handle
|
||||||
character(c_char), intent(in), value :: transa, transb
|
character(c_char), intent(in), value :: transa, transb
|
||||||
integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
|
integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
|
||||||
real(c_double), intent(in), value :: alpha, beta
|
real(c_double), intent(in), value :: alpha, beta
|
||||||
type(c_ptr), value :: a, b, c
|
type(c_ptr), value :: a, b, c
|
||||||
@ -147,13 +152,33 @@ module gpu
|
|||||||
subroutine gpu_sgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
|
subroutine gpu_sgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
|
||||||
b, ldb, c, ldc) bind(C, name='gpu_sgeam')
|
b, ldb, c, ldc) bind(C, name='gpu_sgeam')
|
||||||
import
|
import
|
||||||
type(c_ptr), intent(in) :: handle
|
type(c_ptr), value, intent(in) :: handle
|
||||||
character(c_char), intent(in), value :: transa, transb
|
character(c_char), intent(in), value :: transa, transb
|
||||||
integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
|
integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
|
||||||
real(c_float), intent(in), value :: alpha, beta
|
real(c_float), intent(in), value :: alpha, beta
|
||||||
type(c_ptr), value :: a, b, c
|
type(c_ptr), value :: a, b, c
|
||||||
end subroutine
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_dgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, &
|
||||||
|
b, ldb, beta, c, ldc) bind(C, name='gpu_dgemm')
|
||||||
|
import
|
||||||
|
type(c_ptr), value, intent(in) :: handle
|
||||||
|
character(c_char), intent(in), value :: transa, transb
|
||||||
|
integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc
|
||||||
|
real(c_double), intent(in), value :: alpha, beta
|
||||||
|
type(c_ptr), value :: a, b, c
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_sgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, &
|
||||||
|
b, ldb, beta, c, ldc) bind(C, name='gpu_sgemm')
|
||||||
|
import
|
||||||
|
type(c_ptr), value, intent(in) :: handle
|
||||||
|
character(c_char), intent(in), value :: transa, transb
|
||||||
|
integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc
|
||||||
|
real(c_float), intent(in), value :: alpha, beta
|
||||||
|
type(c_ptr), value :: a, b, c
|
||||||
|
end subroutine
|
||||||
|
|
||||||
end interface
|
end interface
|
||||||
|
|
||||||
|
|
||||||
@ -161,20 +186,26 @@ module gpu
|
|||||||
! ----------------------
|
! ----------------------
|
||||||
|
|
||||||
interface gpu_allocate
|
interface gpu_allocate
|
||||||
procedure gpu_allocate_double1 &
|
procedure gpu_allocate_double1 &
|
||||||
,gpu_allocate_double2 &
|
,gpu_allocate_double2 &
|
||||||
,gpu_allocate_double3 &
|
,gpu_allocate_double3 &
|
||||||
,gpu_allocate_double4 &
|
,gpu_allocate_double4 &
|
||||||
,gpu_allocate_double5 &
|
,gpu_allocate_double5 &
|
||||||
,gpu_allocate_double6
|
,gpu_allocate_double6 &
|
||||||
|
,gpu_allocate_double1_64 &
|
||||||
|
,gpu_allocate_double2_64 &
|
||||||
|
,gpu_allocate_double3_64 &
|
||||||
|
,gpu_allocate_double4_64 &
|
||||||
|
,gpu_allocate_double5_64 &
|
||||||
|
,gpu_allocate_double6_64
|
||||||
end interface gpu_allocate
|
end interface gpu_allocate
|
||||||
|
|
||||||
interface gpu_deallocate
|
interface gpu_deallocate
|
||||||
procedure gpu_deallocate_double1 &
|
procedure gpu_deallocate_double1 &
|
||||||
,gpu_deallocate_double2 &
|
,gpu_deallocate_double2 &
|
||||||
,gpu_deallocate_double3 &
|
,gpu_deallocate_double3 &
|
||||||
,gpu_deallocate_double4 &
|
,gpu_deallocate_double4 &
|
||||||
,gpu_deallocate_double5 &
|
,gpu_deallocate_double5 &
|
||||||
,gpu_deallocate_double6
|
,gpu_deallocate_double6
|
||||||
end interface gpu_deallocate
|
end interface gpu_deallocate
|
||||||
|
|
||||||
@ -267,6 +298,61 @@ module gpu
|
|||||||
end subroutine
|
end subroutine
|
||||||
|
|
||||||
|
|
||||||
|
subroutine gpu_allocate_double1_64(ptr, s)
|
||||||
|
implicit none
|
||||||
|
type(gpu_double1), intent(inout) :: ptr
|
||||||
|
integer*8, intent(in) :: s
|
||||||
|
|
||||||
|
call gpu_allocate_c(ptr%c, s)
|
||||||
|
call c_f_pointer(ptr%c, ptr%f, (/ s /))
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_allocate_double2_64(ptr, s1, s2)
|
||||||
|
implicit none
|
||||||
|
type(gpu_double2), intent(inout) :: ptr
|
||||||
|
integer*8, intent(in) :: s1, s2
|
||||||
|
|
||||||
|
call gpu_allocate_c(ptr%c, s1*s2*8_8)
|
||||||
|
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /))
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_allocate_double3_64(ptr, s1, s2, s3)
|
||||||
|
implicit none
|
||||||
|
type(gpu_double3), intent(inout) :: ptr
|
||||||
|
integer*8, intent(in) :: s1, s2, s3
|
||||||
|
|
||||||
|
call gpu_allocate_c(ptr%c, s1*s2*s3*8_8)
|
||||||
|
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /))
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_allocate_double4_64(ptr, s1, s2, s3, s4)
|
||||||
|
implicit none
|
||||||
|
type(gpu_double4), intent(inout) :: ptr
|
||||||
|
integer*8, intent(in) :: s1, s2, s3, s4
|
||||||
|
|
||||||
|
call gpu_allocate_c(ptr%c, s1*s2*s3*s4*8_8)
|
||||||
|
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /))
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_allocate_double5_64(ptr, s1, s2, s3, s4, s5)
|
||||||
|
implicit none
|
||||||
|
type(gpu_double5), intent(inout) :: ptr
|
||||||
|
integer*8, intent(in) :: s1, s2, s3, s4, s5
|
||||||
|
|
||||||
|
call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*8_8)
|
||||||
|
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /))
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_allocate_double6_64(ptr, s1, s2, s3, s4, s5, s6)
|
||||||
|
implicit none
|
||||||
|
type(gpu_double6), intent(inout) :: ptr
|
||||||
|
integer*8, intent(in) :: s1, s2, s3, s4, s5, s6
|
||||||
|
|
||||||
|
call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*s6*8_8)
|
||||||
|
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /))
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
|
||||||
! gpu_deallocate
|
! gpu_deallocate
|
||||||
! --------------
|
! --------------
|
||||||
|
|
||||||
@ -494,19 +580,38 @@ end module
|
|||||||
subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res)
|
subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res)
|
||||||
use gpu
|
use gpu
|
||||||
type(gpu_blas), intent(in) :: handle
|
type(gpu_blas), intent(in) :: handle
|
||||||
integer*8 :: n, incx, incy
|
integer*4 :: n, incx, incy
|
||||||
double precision, target, intent(in) :: dx(*), dy(*)
|
type(gpu_double1), intent(in) :: dx, dy
|
||||||
double precision, intent(out) :: res
|
double precision, intent(out) :: res
|
||||||
call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res)
|
call gpu_ddot_c(handle%c, int(n,c_int64_t), dx%c, int(incx,c_int64_t), dy%c, int(incy,c_int64_t), res)
|
||||||
end subroutine
|
end subroutine
|
||||||
|
|
||||||
subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res)
|
subroutine gpu_ddot_f(handle, n, dx, incx, dy, incy, res)
|
||||||
|
use gpu
|
||||||
|
type(gpu_blas), intent(in) :: handle
|
||||||
|
integer*4 :: n, incx, incy
|
||||||
|
double precision, target :: dx(*), dy(*)
|
||||||
|
double precision, intent(out) :: res
|
||||||
|
call gpu_ddot_c(handle%c, int(n,c_int64_t), c_loc(dx), int(incx,c_int64_t), c_loc(dy), int(incy,c_int64_t), res)
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
|
||||||
|
subroutine gpu_ddot_64(handle, n, dx, incx, dy, incy, res)
|
||||||
use gpu
|
use gpu
|
||||||
type(gpu_blas), intent(in) :: handle
|
type(gpu_blas), intent(in) :: handle
|
||||||
integer*8 :: n, incx, incy
|
integer*8 :: n, incx, incy
|
||||||
real, target, intent(in) :: dx(*), dy(*)
|
type(gpu_double1), intent(in) :: dx, dy
|
||||||
real, intent(out) :: res
|
double precision, intent(out) :: res
|
||||||
call gpu_sdot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res)
|
call gpu_ddot_c(handle%c, n, dx%c, incx, dy%c, incy, res)
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_ddot_f_64(handle, n, dx, incx, dy, incy, res)
|
||||||
|
use gpu
|
||||||
|
type(gpu_blas), intent(in) :: handle
|
||||||
|
integer*8 :: n, incx, incy
|
||||||
|
double precision, target :: dx(*), dy(*)
|
||||||
|
double precision, intent(out) :: res
|
||||||
|
call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res)
|
||||||
end subroutine
|
end subroutine
|
||||||
|
|
||||||
|
|
||||||
@ -518,22 +623,103 @@ subroutine gpu_dgeam(handle, transa, transb, m, n, alpha, a, lda, beta, &
|
|||||||
use gpu
|
use gpu
|
||||||
type(gpu_blas), intent(in) :: handle
|
type(gpu_blas), intent(in) :: handle
|
||||||
character, intent(in) :: transa, transb
|
character, intent(in) :: transa, transb
|
||||||
integer*8, intent(in) :: m, n, lda, ldb, ldc
|
integer*4, intent(in) :: m, n, lda, ldb, ldc
|
||||||
double precision, intent(in) :: alpha, beta
|
double precision, intent(in) :: alpha, beta
|
||||||
double precision, target :: a(lda,*), b(ldb,*), c(ldc,*)
|
type(gpu_double2) :: a, b, c
|
||||||
call gpu_dgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, &
|
call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, a%c, int(lda,c_int64_t), beta, &
|
||||||
c_loc(b), ldb, c_loc(c), ldc)
|
b%c, int(ldb,c_int64_t), c%c, int(ldc,c_int64_t))
|
||||||
end subroutine
|
end subroutine
|
||||||
|
|
||||||
subroutine gpu_sgeam(handle, transa, transb, m, n, alpha, a, lda, beta, &
|
|
||||||
|
subroutine gpu_dgeam_f(handle, transa, transb, m, n, alpha, a, lda, beta, &
|
||||||
b, ldb, c, ldc)
|
b, ldb, c, ldc)
|
||||||
use gpu
|
use gpu
|
||||||
|
type(gpu_blas), intent(in) :: handle
|
||||||
|
character, intent(in) :: transa, transb
|
||||||
|
integer*4, intent(in) :: m, n, lda, ldb, ldc
|
||||||
|
double precision, intent(in) :: alpha, beta
|
||||||
|
double precision, target :: a(*), b(*), c(*)
|
||||||
|
call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, &
|
||||||
|
c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t))
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
|
||||||
|
subroutine gpu_dgeam_64(handle, transa, transb, m, n, alpha, a, lda, beta, &
|
||||||
|
b, ldb, c, ldc)
|
||||||
|
use gpu
|
||||||
type(gpu_blas), intent(in) :: handle
|
type(gpu_blas), intent(in) :: handle
|
||||||
character, intent(in) :: transa, transb
|
character, intent(in) :: transa, transb
|
||||||
integer*8, intent(in) :: m, n, lda, ldb, ldc
|
integer*8, intent(in) :: m, n, lda, ldb, ldc
|
||||||
real, intent(in) :: alpha, beta
|
double precision, intent(in) :: alpha, beta
|
||||||
real, target :: a(lda,*), b(ldb,*), c(ldc,*)
|
type(gpu_double2) :: a, b, c
|
||||||
call gpu_sgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, &
|
call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, a%c, int(lda,c_int64_t), beta, &
|
||||||
c_loc(b), ldb, c_loc(c), ldc)
|
b%c, int(ldb,c_int64_t), c%c, int(ldc,c_int64_t))
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
|
||||||
|
subroutine gpu_dgeam_f_64(handle, transa, transb, m, n, alpha, a, lda, beta, &
|
||||||
|
b, ldb, c, ldc)
|
||||||
|
use gpu
|
||||||
|
type(gpu_blas), intent(in) :: handle
|
||||||
|
character, intent(in) :: transa, transb
|
||||||
|
integer*8, intent(in) :: m, n, lda, ldb, ldc
|
||||||
|
double precision, intent(in) :: alpha, beta
|
||||||
|
double precision, target :: a(*), b(*), c(*)
|
||||||
|
call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, &
|
||||||
|
c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t))
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
|
||||||
|
! gemm
|
||||||
|
! ----
|
||||||
|
|
||||||
|
subroutine gpu_dgemm(handle, transa, transb, m, n, k, alpha, a, lda, &
|
||||||
|
b, ldb, beta, c, ldc)
|
||||||
|
use gpu
|
||||||
|
type(gpu_blas), intent(in) :: handle
|
||||||
|
character, intent(in) :: transa, transb
|
||||||
|
integer*4, intent(in) :: m, n, k, lda, ldb, ldc
|
||||||
|
double precision, intent(in) :: alpha, beta
|
||||||
|
type(gpu_double2) :: a, b, c
|
||||||
|
call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), &
|
||||||
|
alpha, a%c, int(lda,c_int64_t), &
|
||||||
|
b%c, int(ldb,c_int64_t), beta, c%c, int(ldc,c_int64_t))
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_dgemm_64(handle, transa, transb, m, n, k, alpha, a, lda, &
|
||||||
|
b, ldb, beta, c, ldc)
|
||||||
|
use gpu
|
||||||
|
type(gpu_blas), intent(in) :: handle
|
||||||
|
character, intent(in) :: transa, transb
|
||||||
|
integer*8, intent(in) :: m, n, k, lda, ldb, ldc
|
||||||
|
double precision, intent(in) :: alpha, beta
|
||||||
|
type(gpu_double2) :: a, b, c
|
||||||
|
call gpu_dgemm_c(handle%c, transa, transb, m, n, k, &
|
||||||
|
alpha, a%c, lda, b%c, ldb, beta, c%c, ldc)
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_dgemm_f(handle, transa, transb, m, n, k, alpha, a, lda, &
|
||||||
|
b, ldb, beta, c, ldc)
|
||||||
|
use gpu
|
||||||
|
type(gpu_blas), intent(in) :: handle
|
||||||
|
character, intent(in) :: transa, transb
|
||||||
|
integer*4, intent(in) :: m, n, k, lda, ldb, ldc
|
||||||
|
double precision, intent(in) :: alpha, beta
|
||||||
|
double precision, target :: a(*), b(*), c(*)
|
||||||
|
call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), &
|
||||||
|
alpha, c_loc(a), int(lda,c_int64_t), &
|
||||||
|
c_loc(b), int(ldb,c_int64_t), beta, c_loc(c), int(ldc,c_int64_t))
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_dgemm_f_64(handle, transa, transb, m, n, k, alpha, a, lda, &
|
||||||
|
b, ldb, beta, c, ldc)
|
||||||
|
use gpu
|
||||||
|
type(gpu_blas), intent(in) :: handle
|
||||||
|
character, intent(in) :: transa, transb
|
||||||
|
integer*8, intent(in) :: m, n, k, lda, ldb, ldc
|
||||||
|
double precision, intent(in) :: alpha, beta
|
||||||
|
double precision, target :: a(*), b(*), c(*)
|
||||||
|
call gpu_dgemm_c(handle%c, transa, transb, m, n, k, &
|
||||||
|
alpha, c_loc(a), lda, c_loc(b), ldb, beta, c_loc(c), ldc)
|
||||||
end subroutine
|
end subroutine
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user