mirror of
https://github.com/QuantumPackage/qp2.git
synced 2025-01-03 10:05:52 +01:00
Finished r1
This commit is contained in:
parent
d686972988
commit
7ceb8fdcca
@ -116,11 +116,6 @@ void gpu_ddot(cublasHandle_t handle, const int64_t n, const double* x, const int
|
|||||||
assert ( (int64_t) incy_ == incy);
|
assert ( (int64_t) incy_ == incy);
|
||||||
|
|
||||||
cublasStatus_t rc = cublasDdot(handle, n_, x, incx_, y, incy_, result);
|
cublasStatus_t rc = cublasDdot(handle, n_, x, incx_, y, incy_, result);
|
||||||
/*
|
|
||||||
double alpha = 1.0;
|
|
||||||
double beta = 0.0;
|
|
||||||
cublasStatus_t rc = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, n_, &alpha, x, 1, y, n_, &beta, &result_, 1);
|
|
||||||
*/
|
|
||||||
assert (rc == CUBLAS_STATUS_SUCCESS);
|
assert (rc == CUBLAS_STATUS_SUCCESS);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -149,8 +144,8 @@ void gpu_sdot(cublasHandle_t handle, const int64_t n, const float* x, const int6
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
void gpu_dgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const double alpha,
|
void gpu_dgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const double* alpha,
|
||||||
const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) {
|
const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) {
|
||||||
|
|
||||||
assert (handle != NULL);
|
assert (handle != NULL);
|
||||||
|
|
||||||
@ -173,13 +168,13 @@ void gpu_dgemv(cublasHandle_t handle, const char* transa, const int64_t m, const
|
|||||||
cublasOperation_t transa_ = CUBLAS_OP_N;
|
cublasOperation_t transa_ = CUBLAS_OP_N;
|
||||||
if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
|
if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
|
||||||
|
|
||||||
cublasDgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_);
|
cublasDgemv(handle, transa_, m_, n_, alpha, a, lda_, x, incx_, beta, y, incy_);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
void gpu_sgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const float alpha,
|
void gpu_sgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const float* alpha,
|
||||||
const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) {
|
const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy) {
|
||||||
|
|
||||||
assert (handle != NULL);
|
assert (handle != NULL);
|
||||||
|
|
||||||
@ -202,12 +197,12 @@ void gpu_sgemv(cublasHandle_t handle, const char* transa, const int64_t m, const
|
|||||||
cublasOperation_t transa_ = CUBLAS_OP_N;
|
cublasOperation_t transa_ = CUBLAS_OP_N;
|
||||||
if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
|
if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
|
||||||
|
|
||||||
cublasSgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_);
|
cublasSgemv(handle, transa_, m_, n_, alpha, a, lda_, x, incx_, beta, y, incy_);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void gpu_dgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
|
void gpu_dgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha,
|
||||||
const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) {
|
const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) {
|
||||||
|
|
||||||
assert (handle != NULL);
|
assert (handle != NULL);
|
||||||
|
|
||||||
@ -234,13 +229,13 @@ void gpu_dgemm(cublasHandle_t handle, const char* transa, const char* transb, co
|
|||||||
if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
|
if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
|
||||||
if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
|
if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
|
||||||
|
|
||||||
cublasDgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_);
|
cublasDgemm(handle, transa_, transb_, m_, n_, k_, alpha, a, lda_, b, ldb_, beta, c, ldc_);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
void gpu_sgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
|
void gpu_sgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float* alpha,
|
||||||
const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) {
|
const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc) {
|
||||||
|
|
||||||
assert (handle != NULL);
|
assert (handle != NULL);
|
||||||
|
|
||||||
@ -267,12 +262,12 @@ void gpu_sgemm(cublasHandle_t handle, const char* transa, const char* transb, co
|
|||||||
if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
|
if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
|
||||||
if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
|
if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
|
||||||
|
|
||||||
cublasSgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_);
|
cublasSgemm(handle, transa_, transb_, m_, n_, k_, alpha, a, lda_, b, ldb_, beta, c, ldc_);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void gpu_dgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double alpha,
|
void gpu_dgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha,
|
||||||
const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
|
const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
|
||||||
assert (handle != NULL);
|
assert (handle != NULL);
|
||||||
|
|
||||||
/* Convert to int */
|
/* Convert to int */
|
||||||
@ -296,13 +291,13 @@ void gpu_dgeam(cublasHandle_t handle, const char* transa, const char* transb, co
|
|||||||
if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
|
if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
|
||||||
if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
|
if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
|
||||||
|
|
||||||
cublasDgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_);
|
cublasDgeam(handle, transa_, transb_, m_, n_, alpha, a, lda_, beta, b, ldb_, c, ldc_);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void gpu_sgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float alpha,
|
void gpu_sgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float* alpha,
|
||||||
const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
|
const float* a, const int64_t lda, const float* beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
|
||||||
assert (handle != NULL);
|
assert (handle != NULL);
|
||||||
|
|
||||||
/* Convert to int */
|
/* Convert to int */
|
||||||
@ -326,6 +321,6 @@ void gpu_sgeam(cublasHandle_t handle, const char* transa, const char* transb, co
|
|||||||
if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
|
if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
|
||||||
if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
|
if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
|
||||||
|
|
||||||
cublasSgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_);
|
cublasSgeam(handle, transa_, transb_, m_, n_, alpha, a, lda_, beta, b, ldb_, c, ldc_);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -124,8 +124,8 @@ void gpu_sdot(void* handle, const int64_t n, const float* x, const int64_t incx,
|
|||||||
void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha,
|
void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha,
|
||||||
const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy);
|
const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy);
|
||||||
|
|
||||||
void gpu_dgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const double alpha,
|
void gpu_dgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const double* alpha,
|
||||||
const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) {
|
const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) {
|
||||||
|
|
||||||
assert (handle != NULL);
|
assert (handle != NULL);
|
||||||
|
|
||||||
@ -145,15 +145,15 @@ void gpu_dgemv(void* handle, const char* transa, const int64_t m, const int64_t
|
|||||||
assert ( (int64_t) incx_ == incx);
|
assert ( (int64_t) incx_ == incx);
|
||||||
assert ( (int64_t) incy_ == incy);
|
assert ( (int64_t) incy_ == incy);
|
||||||
|
|
||||||
dgemv_(transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_);
|
dgemv_(transa, &m_, &n_, alpha, a, &lda_, x, &incx_, beta, y, &incy_);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha,
|
void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha,
|
||||||
const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy);
|
const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy);
|
||||||
|
|
||||||
void gpu_sgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const float alpha,
|
void gpu_sgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const float* alpha,
|
||||||
const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) {
|
const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy) {
|
||||||
|
|
||||||
assert (handle != NULL);
|
assert (handle != NULL);
|
||||||
|
|
||||||
@ -173,15 +173,15 @@ void gpu_sgemv(void* handle, const char* transa, const int64_t m, const int64_t
|
|||||||
assert ( (int64_t) incx_ == incx);
|
assert ( (int64_t) incx_ == incx);
|
||||||
assert ( (int64_t) incy_ == incy);
|
assert ( (int64_t) incy_ == incy);
|
||||||
|
|
||||||
sgemv_(transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_);
|
sgemv_(transa, &m_, &n_, alpha, a, &lda_, x, &incx_, beta, y, &incy_);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha,
|
void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha,
|
||||||
const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc);
|
const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc);
|
||||||
|
|
||||||
void gpu_dgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
|
void gpu_dgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha,
|
||||||
const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) {
|
const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) {
|
||||||
|
|
||||||
assert (handle != NULL);
|
assert (handle != NULL);
|
||||||
|
|
||||||
@ -203,7 +203,7 @@ void gpu_dgemm(void* handle, const char* transa, const char* transb, const int64
|
|||||||
assert ( (int64_t) ldb_ == ldb);
|
assert ( (int64_t) ldb_ == ldb);
|
||||||
assert ( (int64_t) ldc_ == ldc);
|
assert ( (int64_t) ldc_ == ldc);
|
||||||
|
|
||||||
dgemm_(transa, transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_);
|
dgemm_(transa, transb, &m_, &n_, &k_, alpha, a, &lda_, b, &ldb_, beta, c, &ldc_);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -211,8 +211,8 @@ void gpu_dgemm(void* handle, const char* transa, const char* transb, const int64
|
|||||||
void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha,
|
void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha,
|
||||||
const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc);
|
const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc);
|
||||||
|
|
||||||
void gpu_sgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
|
void gpu_sgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float* alpha,
|
||||||
const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) {
|
const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc) {
|
||||||
|
|
||||||
assert (handle != NULL);
|
assert (handle != NULL);
|
||||||
|
|
||||||
@ -234,12 +234,12 @@ void gpu_sgemm(void* handle, const char* transa, const char* transb, const int64
|
|||||||
assert ( (int64_t) ldb_ == ldb);
|
assert ( (int64_t) ldb_ == ldb);
|
||||||
assert ( (int64_t) ldc_ == ldc);
|
assert ( (int64_t) ldc_ == ldc);
|
||||||
|
|
||||||
sgemm_(transa, transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_);
|
sgemm_(transa, transb, &m_, &n_, &k_, alpha, a, &lda_, b, &ldb_, beta, c, &ldc_);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double alpha,
|
void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha,
|
||||||
const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
|
const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
|
||||||
assert (handle != NULL);
|
assert (handle != NULL);
|
||||||
|
|
||||||
if ( (*transa == 'N' && *transb == 'N') ||
|
if ( (*transa == 'N' && *transb == 'N') ||
|
||||||
@ -247,19 +247,19 @@ void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64
|
|||||||
(*transa == 'N' && *transb == 'n') ||
|
(*transa == 'N' && *transb == 'n') ||
|
||||||
(*transa == 'n' && *transb == 'n') ) {
|
(*transa == 'n' && *transb == 'n') ) {
|
||||||
|
|
||||||
if (alpha == 0.) {
|
if (*alpha == 0.) {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<m ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = beta * b[j*ldb+i];
|
c[j*ldc+i] = *beta * b[j*ldb+i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} else if (beta == 0.) {
|
} else if (*beta == 0.) {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<m ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = alpha * a[j*lda+i];
|
c[j*ldc+i] = *alpha * a[j*lda+i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -267,7 +267,7 @@ void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64
|
|||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<m ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[j*ldb+i];
|
c[j*ldc+i] = *alpha * a[j*lda+i] + *beta * b[j*ldb+i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -278,19 +278,19 @@ void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64
|
|||||||
(*transa == 'N' && *transb == 't') ||
|
(*transa == 'N' && *transb == 't') ||
|
||||||
(*transa == 'n' && *transb == 't') ) {
|
(*transa == 'n' && *transb == 't') ) {
|
||||||
|
|
||||||
if (alpha == 0.) {
|
if (*alpha == 0.) {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<m ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = beta * b[i*ldb+j];
|
c[j*ldc+i] = *beta * b[i*ldb+j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} else if (beta == 0.) {
|
} else if (*beta == 0.) {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<m ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = alpha * a[j*lda+i];
|
c[j*ldc+i] = *alpha * a[j*lda+i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -298,7 +298,7 @@ void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64
|
|||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<m ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[i*ldb+j];
|
c[j*ldc+i] = *alpha * a[j*lda+i] + *beta * b[i*ldb+j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -309,19 +309,19 @@ void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64
|
|||||||
(*transa == 'T' && *transb == 'n') ||
|
(*transa == 'T' && *transb == 'n') ||
|
||||||
(*transa == 't' && *transb == 'n') ) {
|
(*transa == 't' && *transb == 'n') ) {
|
||||||
|
|
||||||
if (alpha == 0.) {
|
if (*alpha == 0.) {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<m ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = beta * b[j*ldb+i];
|
c[j*ldc+i] = *beta * b[j*ldb+i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} else if (beta == 0.) {
|
} else if (*beta == 0.) {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<m ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = alpha * a[i*lda+j];
|
c[j*ldc+i] = *alpha * a[i*lda+j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -329,7 +329,7 @@ void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64
|
|||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<m ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[j*ldb+i];
|
c[j*ldc+i] = *alpha * a[i*lda+j] + *beta * b[j*ldb+i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -340,19 +340,19 @@ void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64
|
|||||||
(*transa == 'T' && *transb == 't') ||
|
(*transa == 'T' && *transb == 't') ||
|
||||||
(*transa == 't' && *transb == 't') ) {
|
(*transa == 't' && *transb == 't') ) {
|
||||||
|
|
||||||
if (alpha == 0.) {
|
if (*alpha == 0.) {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<m ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = beta * b[i*ldb+j];
|
c[j*ldc+i] = *beta * b[i*ldb+j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} else if (beta == 0.) {
|
} else if (*beta == 0.) {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<m ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = alpha * a[i*lda+j];
|
c[j*ldc+i] = *alpha * a[i*lda+j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -360,7 +360,7 @@ void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64
|
|||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<m ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[i*ldb+j];
|
c[j*ldc+i] = *alpha * a[i*lda+j] + *beta * b[i*ldb+j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -370,8 +370,8 @@ void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void gpu_sgeam(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float alpha,
|
void gpu_sgeam(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float* alpha,
|
||||||
const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
|
const float* a, const int64_t lda, const float* beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
|
||||||
assert (handle != NULL);
|
assert (handle != NULL);
|
||||||
|
|
||||||
if ( (*transa == 'N' && *transb == 'N') ||
|
if ( (*transa == 'N' && *transb == 'N') ||
|
||||||
@ -379,19 +379,19 @@ void gpu_sgeam(void* handle, const char* transa, const char* transb, const int64
|
|||||||
(*transa == 'N' && *transb == 'n') ||
|
(*transa == 'N' && *transb == 'n') ||
|
||||||
(*transa == 'n' && *transb == 'n') ) {
|
(*transa == 'n' && *transb == 'n') ) {
|
||||||
|
|
||||||
if (alpha == 0.) {
|
if (*alpha == 0.) {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<m ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = beta * b[j*ldb+i];
|
c[j*ldc+i] = *beta * b[j*ldb+i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} else if (beta == 0.) {
|
} else if (*beta == 0.) {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<m ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = alpha * a[j*lda+i];
|
c[j*ldc+i] = *alpha * a[j*lda+i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -399,7 +399,7 @@ void gpu_sgeam(void* handle, const char* transa, const char* transb, const int64
|
|||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<m ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[j*ldb+i];
|
c[j*ldc+i] = *alpha * a[j*lda+i] + *beta * b[j*ldb+i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -410,19 +410,19 @@ void gpu_sgeam(void* handle, const char* transa, const char* transb, const int64
|
|||||||
(*transa == 'N' && *transb == 't') ||
|
(*transa == 'N' && *transb == 't') ||
|
||||||
(*transa == 'n' && *transb == 't') ) {
|
(*transa == 'n' && *transb == 't') ) {
|
||||||
|
|
||||||
if (alpha == 0.) {
|
if (*alpha == 0.) {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<m ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = beta * b[i*ldb+j];
|
c[j*ldc+i] = *beta * b[i*ldb+j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} else if (beta == 0.) {
|
} else if (*beta == 0.) {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<m ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = alpha * a[j*lda+i];
|
c[j*ldc+i] = *alpha * a[j*lda+i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -430,7 +430,7 @@ void gpu_sgeam(void* handle, const char* transa, const char* transb, const int64
|
|||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<m ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[i*ldb+j];
|
c[j*ldc+i] = *alpha * a[j*lda+i] + *beta * b[i*ldb+j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -441,19 +441,19 @@ void gpu_sgeam(void* handle, const char* transa, const char* transb, const int64
|
|||||||
(*transa == 'T' && *transb == 'n') ||
|
(*transa == 'T' && *transb == 'n') ||
|
||||||
(*transa == 't' && *transb == 'n') ) {
|
(*transa == 't' && *transb == 'n') ) {
|
||||||
|
|
||||||
if (alpha == 0.) {
|
if (*alpha == 0.) {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<m ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = beta * b[j*ldb+i];
|
c[j*ldc+i] = *beta * b[j*ldb+i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} else if (beta == 0.) {
|
} else if (*beta == 0.) {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<m ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = alpha * a[i*lda+j];
|
c[j*ldc+i] = *alpha * a[i*lda+j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -461,7 +461,7 @@ void gpu_sgeam(void* handle, const char* transa, const char* transb, const int64
|
|||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<m ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[j*ldb+i];
|
c[j*ldc+i] = *alpha * a[i*lda+j] + *beta * b[j*ldb+i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -472,19 +472,19 @@ void gpu_sgeam(void* handle, const char* transa, const char* transb, const int64
|
|||||||
(*transa == 'T' && *transb == 't') ||
|
(*transa == 'T' && *transb == 't') ||
|
||||||
(*transa == 't' && *transb == 't') ) {
|
(*transa == 't' && *transb == 't') ) {
|
||||||
|
|
||||||
if (alpha == 0.) {
|
if (*alpha == 0.) {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<m ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = beta * b[i*ldb+j];
|
c[j*ldc+i] = *beta * b[i*ldb+j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} else if (beta == 0.) {
|
} else if (*beta == 0.) {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<m ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = alpha * a[i*lda+j];
|
c[j*ldc+i] = *alpha * a[i*lda+j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -492,7 +492,7 @@ void gpu_sgeam(void* handle, const char* transa, const char* transb, const int64
|
|||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<m ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[i*ldb+j];
|
c[j*ldc+i] = *alpha * a[i*lda+j] + *beta * b[i*ldb+j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -217,21 +217,18 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_s
|
|||||||
! internal
|
! internal
|
||||||
integer :: u,i,j,beta,a,b
|
integer :: u,i,j,beta,a,b
|
||||||
|
|
||||||
call gpu_copy(d_cc_space_f_ov, r1)
|
type(gpu_stream) :: stream(nV)
|
||||||
|
|
||||||
|
do a=1,nV
|
||||||
|
call gpu_stream_create(stream(a))
|
||||||
|
enddo
|
||||||
|
|
||||||
type(gpu_double2) :: X_oo
|
type(gpu_double2) :: X_oo
|
||||||
call gpu_allocate(X_oo,nO,nO)
|
call gpu_allocate(X_oo,nO,nO)
|
||||||
|
|
||||||
call gpu_dgemm(blas_handle, 'N','N', nO, nO, nV, &
|
call gpu_copy(d_cc_space_f_ov, r1)
|
||||||
-2d0, t1%f(1,1), size(t1%f,1), &
|
|
||||||
d_cc_space_f_vo%f(1,1), size(d_cc_space_f_vo%f,1), &
|
|
||||||
0d0, X_oo%f(1,1), size(X_oo%f,1))
|
|
||||||
|
|
||||||
call gpu_dgemm(blas_handle, 'T','N', nO, nV, nO, &
|
|
||||||
1d0, X_oo%f(1,1), size(X_oo%f,2), &
|
|
||||||
t1%f(1,1) , size(t1%f,1), &
|
|
||||||
1d0, r1%f(1,1) , size(r1%f,1))
|
|
||||||
|
|
||||||
|
call gpu_set_stream(blas_handle, stream(1))
|
||||||
call gpu_dgemm(blas_handle, 'N','N', nO, nV, nV, &
|
call gpu_dgemm(blas_handle, 'N','N', nO, nV, nV, &
|
||||||
1d0, t1%f(1,1) , size(t1%f,1), &
|
1d0, t1%f(1,1) , size(t1%f,1), &
|
||||||
H_vv%f(1,1), size(H_vv%f,1), &
|
H_vv%f(1,1), size(H_vv%f,1), &
|
||||||
@ -242,35 +239,32 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_s
|
|||||||
t1%f(1,1) , size(t1%f,1), &
|
t1%f(1,1) , size(t1%f,1), &
|
||||||
1d0, r1%f(1,1), size(r1%f,1))
|
1d0, r1%f(1,1), size(r1%f,1))
|
||||||
|
|
||||||
|
call gpu_set_stream(blas_handle, stream(nV))
|
||||||
|
call gpu_dgemm(blas_handle, 'N','N', nO, nO, nV, &
|
||||||
|
-2d0, t1%f(1,1), size(t1%f,1), &
|
||||||
|
d_cc_space_f_vo%f(1,1), size(d_cc_space_f_vo%f,1), &
|
||||||
|
0d0, X_oo%f(1,1), size(X_oo%f,1))
|
||||||
|
|
||||||
|
call gpu_synchronize()
|
||||||
|
call gpu_set_stream(blas_handle, gpu_default_stream)
|
||||||
|
|
||||||
|
call gpu_dgemm(blas_handle, 'T','N', nO, nV, nO, &
|
||||||
|
1d0, X_oo%f(1,1), size(X_oo%f,2), &
|
||||||
|
t1%f(1,1) , size(t1%f,1), &
|
||||||
|
1d0, r1%f(1,1) , size(r1%f,1))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
type(gpu_double4) :: X_voov
|
type(gpu_double4) :: X_voov
|
||||||
call gpu_allocate(X_voov, nV, nO, nO, nV)
|
call gpu_allocate(X_voov, nV, nO, nO, nV)
|
||||||
|
|
||||||
type(gpu_stream) :: stream(nV)
|
do i=1,nO
|
||||||
|
do beta=1,nV
|
||||||
do a=1,nV
|
call gpu_set_stream(blas_handle, stream(beta))
|
||||||
call gpu_stream_create(stream(a))
|
call gpu_dgeam(blas_handle, 'T', 'T', nV, nO, -1.d0, t2%f(1,i,1,beta), &
|
||||||
enddo
|
nO*nO, t1%f(i,beta), t1%f(1,1), nO, X_voov%f(1,i,1,beta), nV*nO)
|
||||||
|
|
||||||
call gpu_synchronize()
|
|
||||||
|
|
||||||
! do i=1,nO
|
|
||||||
! do beta=1,nV
|
|
||||||
! call gpu_set_stream(blas_handle, stream(beta))
|
|
||||||
! call gpu_dgeam(blas_handle, 'T', 'T', nV, nO, -1.d0, t2%f(1,i,1,beta), &
|
|
||||||
! nO*nO, t1%f(i,beta), t1%f(1,1), nO, X_voov%f(1,i,1,beta), nV)
|
|
||||||
! enddo
|
|
||||||
! enddo
|
|
||||||
|
|
||||||
do beta = 1, nV
|
|
||||||
do u = 1, nO
|
|
||||||
do i = 1, nO
|
|
||||||
do a = 1, nV
|
|
||||||
X_voov%f(a,i,u,beta) = - t2%f(u,i,a,beta) + t1%f(u,a) * t1%f(i,beta)
|
|
||||||
enddo
|
enddo
|
||||||
enddo
|
enddo
|
||||||
enddo
|
|
||||||
enddo
|
|
||||||
call gpu_synchronize()
|
|
||||||
|
|
||||||
do beta=1,nV
|
do beta=1,nV
|
||||||
call gpu_set_stream(blas_handle, stream(beta))
|
call gpu_set_stream(blas_handle, stream(beta))
|
||||||
@ -279,9 +273,8 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_s
|
|||||||
enddo
|
enddo
|
||||||
|
|
||||||
call gpu_synchronize()
|
call gpu_synchronize()
|
||||||
do a=1,nV
|
call gpu_deallocate(X_oo)
|
||||||
call gpu_stream_destroy(stream(a))
|
|
||||||
enddo
|
|
||||||
call gpu_set_stream(blas_handle, gpu_default_stream)
|
call gpu_set_stream(blas_handle, gpu_default_stream)
|
||||||
|
|
||||||
call gpu_dgemv(blas_handle, 'T', nV*nO, nO*nV, &
|
call gpu_dgemv(blas_handle, 'T', nV*nO, nO*nV, &
|
||||||
@ -289,65 +282,38 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_s
|
|||||||
H_vo%f(1,1) , 1, &
|
H_vo%f(1,1) , 1, &
|
||||||
1d0, r1%f(1,1) , 1)
|
1d0, r1%f(1,1) , 1)
|
||||||
|
|
||||||
call gpu_synchronize()
|
|
||||||
call gpu_deallocate(X_oo)
|
|
||||||
call gpu_deallocate(X_voov)
|
|
||||||
|
|
||||||
type(gpu_double4) :: X_ovov
|
type(gpu_double4) :: X_ovov
|
||||||
call gpu_allocate(X_ovov, nO, nV, nO, nV)
|
call gpu_allocate(X_ovov, nO, nV, nO, nV)
|
||||||
|
|
||||||
!$omp parallel &
|
|
||||||
!$omp shared(nO,nV,d_cc_space_v_ovov,d_cc_space_v_voov,X_ovov) &
|
|
||||||
!$omp private(u,beta,i,a) &
|
|
||||||
!$omp default(none)
|
|
||||||
!$omp do
|
|
||||||
do beta = 1, nV
|
do beta = 1, nV
|
||||||
do u = 1, nO
|
call gpu_set_stream(blas_handle, stream(beta))
|
||||||
do a = 1, nv
|
do u=1,nO
|
||||||
do i = 1, nO
|
call gpu_dgeam(blas_handle, 'N', 'T', nO, nV, -1.d0, d_cc_space_v_ovov%f(1,1,u,beta), &
|
||||||
X_ovov%f(i,a,u,beta) = 2d0 * d_cc_space_v_voov%f(a,u,i,beta) - d_cc_space_v_ovov%f(u,a,i,beta)
|
nO, 2.d0, d_cc_space_v_voov%f(1,u,1,beta), nV*nO, X_ovov%f(1,1,u,beta), nO)
|
||||||
enddo
|
enddo
|
||||||
enddo
|
enddo
|
||||||
enddo
|
|
||||||
enddo
|
|
||||||
!$omp end do
|
|
||||||
!$omp end parallel
|
|
||||||
|
|
||||||
! call dgemv('T', nO*nV, nO*nV, &
|
call gpu_set_stream(blas_handle, gpu_default_stream)
|
||||||
! 1d0, X_ovov%f, size(X_ovov%f,1) * size(X_ovov%f,2), &
|
call gpu_synchronize()
|
||||||
! t1%f, 1, &
|
call gpu_deallocate(X_voov)
|
||||||
! 1d0, r1%f, 1)
|
|
||||||
call gpu_dgemv(blas_handle, 'T', nO*nV, nO*nV, &
|
call gpu_dgemv(blas_handle, 'T', nO*nV, nO*nV, &
|
||||||
1d0, X_ovov%f(1,1,1,1), size(X_ovov%f,1) * size(X_ovov%f,2), &
|
1d0, X_ovov%f(1,1,1,1), size(X_ovov%f,1) * size(X_ovov%f,2), &
|
||||||
t1%f(1,1), 1, &
|
t1%f(1,1), 1, &
|
||||||
1d0, r1%f(1,1), 1)
|
1d0, r1%f(1,1), 1)
|
||||||
|
|
||||||
|
|
||||||
integer :: iblock, block_size, nVmax
|
integer :: iblock, block_size, nVmax
|
||||||
type(gpu_double4) :: W_vvov, W_vvov_tmp, T_vvoo
|
type(gpu_double4) :: W_vvov, W_vvov_tmp, T_vvoo
|
||||||
|
|
||||||
block_size = 16
|
block_size = 16
|
||||||
call gpu_allocate(W_vvov,nV, nV,nO,block_size)
|
|
||||||
call gpu_allocate(W_vvov_tmp, nV,nO,nV,block_size)
|
|
||||||
call gpu_allocate(T_vvoo, nV,nV,nO,nO)
|
call gpu_allocate(T_vvoo, nV,nV,nO,nO)
|
||||||
|
|
||||||
call gpu_synchronize()
|
call gpu_dgeam(blas_handle, 'T', 'N', nV*nV, nO*nO, 1.d0, tau%f(1,1,1,1), &
|
||||||
call gpu_deallocate(X_ovov)
|
nO*nO, 0.d0, T_vvoo%f(1,1,1,1), nV*nV, T_vvoo%f(1,1,1,1), nV*nV)
|
||||||
|
|
||||||
!$omp parallel &
|
call gpu_allocate(W_vvov,nV, nV,nO,block_size)
|
||||||
!$omp private(u,i,b,a) &
|
call gpu_allocate(W_vvov_tmp, nV,nO,nV,block_size)
|
||||||
!$omp default(shared)
|
|
||||||
!$omp do
|
|
||||||
do u = 1, nO
|
|
||||||
do i = 1, nO
|
|
||||||
do b = 1, nV
|
|
||||||
do a = 1, nV
|
|
||||||
T_vvoo%f(a,b,i,u) = tau%f(i,u,a,b)
|
|
||||||
enddo
|
|
||||||
enddo
|
|
||||||
enddo
|
|
||||||
enddo
|
|
||||||
!$omp end do
|
|
||||||
!$omp end parallel
|
|
||||||
|
|
||||||
do iblock = 1, nV, block_size
|
do iblock = 1, nV, block_size
|
||||||
nVmax = min(block_size,nV-iblock+1)
|
nVmax = min(block_size,nV-iblock+1)
|
||||||
@ -358,22 +324,22 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_s
|
|||||||
0.d0, W_vvov_tmp%f(1,1,1,1), nV*nO)
|
0.d0, W_vvov_tmp%f(1,1,1,1), nV*nO)
|
||||||
|
|
||||||
call gpu_synchronize()
|
call gpu_synchronize()
|
||||||
!$omp parallel &
|
do b=1,nV
|
||||||
!$omp private(b,i,a,beta) &
|
call gpu_set_stream(blas_handle, stream(b))
|
||||||
!$omp default(shared)
|
do i=1,nO
|
||||||
|
call gpu_dgeam(blas_handle, 'N', 'N', nV, nVmax, 2.d0, W_vvov_tmp%f(1,i,b,1), &
|
||||||
|
nV*nO*nV, 0.d0, W_vvov_tmp%f(1,i,b,1), nV*nO*nV, W_vvov%f(1,b,i,1), nV*nV*nO)
|
||||||
|
enddo
|
||||||
|
enddo
|
||||||
|
|
||||||
|
call gpu_synchronize()
|
||||||
|
|
||||||
do beta = 1, nVmax
|
do beta = 1, nVmax
|
||||||
do i = 1, nO
|
call gpu_set_stream(blas_handle, stream(beta))
|
||||||
!$omp do
|
call gpu_dgeam(blas_handle, 'N', 'T', nV, nV*nO, 1.d0, W_vvov%f(1,1,1,beta), &
|
||||||
do b = 1, nV
|
nV, -1.d0, W_vvov_tmp%f(1,1,1,beta), nV*nO, W_vvov%f(1,1,1,beta), nV)
|
||||||
do a = 1, nV
|
|
||||||
W_vvov%f(a,b,i,beta) = 2d0 * W_vvov_tmp%f(a,i,b,beta) - W_vvov_tmp%f(b,i,a,beta)
|
|
||||||
enddo
|
enddo
|
||||||
enddo
|
call gpu_synchronize()
|
||||||
!$omp end do nowait
|
|
||||||
enddo
|
|
||||||
enddo
|
|
||||||
!$omp barrier
|
|
||||||
!$omp end parallel
|
|
||||||
|
|
||||||
call gpu_dgemm(blas_handle, 'T','N',nO,nVmax,nO*nV*nV, &
|
call gpu_dgemm(blas_handle, 'T','N',nO,nVmax,nO*nV*nV, &
|
||||||
1d0, T_vvoo%f(1,1,1,1), nV*nV*nO, &
|
1d0, T_vvoo%f(1,1,1,1), nV*nV*nO, &
|
||||||
@ -381,30 +347,24 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_s
|
|||||||
1d0, r1%f(1,iblock), nO)
|
1d0, r1%f(1,iblock), nO)
|
||||||
enddo
|
enddo
|
||||||
|
|
||||||
call gpu_synchronize()
|
call gpu_deallocate(X_ovov)
|
||||||
call gpu_deallocate(W_vvov)
|
|
||||||
call gpu_deallocate(T_vvoo)
|
|
||||||
|
|
||||||
|
|
||||||
type(gpu_double4) :: W_oovo
|
type(gpu_double4) :: W_oovo
|
||||||
call gpu_allocate(W_oovo, nO,nO,nV,nO)
|
call gpu_allocate(W_oovo, nO,nO,nV,nO)
|
||||||
|
|
||||||
!$omp parallel &
|
|
||||||
!$omp shared(nO,nV,d_cc_space_v_oovo,W_oovo) &
|
|
||||||
!$omp private(u,a,i,j) &
|
|
||||||
!$omp default(none)
|
|
||||||
do u = 1, nO
|
do u = 1, nO
|
||||||
!$omp do
|
|
||||||
do a = 1, nV
|
do a = 1, nV
|
||||||
do j = 1, nO
|
call gpu_set_stream(blas_handle, stream(a))
|
||||||
do i = 1, nO
|
call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, 2.d0, d_cc_space_v_oovo%f(1,1,a,u), &
|
||||||
W_oovo%f(i,j,a,u) = 2d0 * d_cc_space_v_oovo%f(i,j,a,u) - d_cc_space_v_oovo%f(j,i,a,u)
|
nO, -1.d0, d_cc_space_v_oovo%f(1,1,a,u), nO, W_oovo%f(1,1,a,u), nO)
|
||||||
enddo
|
enddo
|
||||||
enddo
|
enddo
|
||||||
enddo
|
|
||||||
!$omp end do nowait
|
call gpu_set_stream(blas_handle, gpu_default_stream)
|
||||||
enddo
|
call gpu_synchronize()
|
||||||
!$omp end parallel
|
|
||||||
|
call gpu_deallocate(W_vvov)
|
||||||
|
call gpu_deallocate(T_vvoo)
|
||||||
|
|
||||||
! Change the sign for consistency with the code in spin orbitals
|
! Change the sign for consistency with the code in spin orbitals
|
||||||
call gpu_dgemm(blas_handle, 'T','N', nO, nV, nO*nO*nV, &
|
call gpu_dgemm(blas_handle, 'T','N', nO, nV, nO*nO*nV, &
|
||||||
@ -422,6 +382,9 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_s
|
|||||||
enddo
|
enddo
|
||||||
enddo
|
enddo
|
||||||
|
|
||||||
|
do a=1,nV
|
||||||
|
call gpu_stream_destroy(stream(a))
|
||||||
|
enddo
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -22,20 +22,20 @@ void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_
|
|||||||
|
|
||||||
void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result);
|
void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result);
|
||||||
|
|
||||||
void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double alpha,
|
void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double* alpha,
|
||||||
const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy);
|
const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy);
|
||||||
|
|
||||||
void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float alpha,
|
void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float* alpha,
|
||||||
const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy);
|
const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy);
|
||||||
|
|
||||||
void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
|
void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha,
|
||||||
const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc);
|
const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc);
|
||||||
|
|
||||||
void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
|
void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float* alpha,
|
||||||
const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc);
|
const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc);
|
||||||
|
|
||||||
void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
|
void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double* alpha,
|
||||||
const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc);
|
const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc);
|
||||||
|
|
||||||
void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
|
void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float* alpha,
|
||||||
const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc);
|
const float* a, const int64_t lda, const float* beta, const float* b, const int64_t ldb, float* c, const int64_t ldc);
|
||||||
|
@ -145,7 +145,7 @@ module gpu
|
|||||||
type(c_ptr), value, intent(in) :: handle
|
type(c_ptr), value, intent(in) :: handle
|
||||||
character(c_char), intent(in), value :: transa, transb
|
character(c_char), intent(in), value :: transa, transb
|
||||||
integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
|
integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
|
||||||
real(c_double), intent(in), value :: alpha, beta
|
real(c_double), intent(in) :: alpha, beta
|
||||||
type(c_ptr), value :: a, b, c
|
type(c_ptr), value :: a, b, c
|
||||||
end subroutine
|
end subroutine
|
||||||
|
|
||||||
@ -155,7 +155,7 @@ module gpu
|
|||||||
type(c_ptr), value, intent(in) :: handle
|
type(c_ptr), value, intent(in) :: handle
|
||||||
character(c_char), intent(in), value :: transa, transb
|
character(c_char), intent(in), value :: transa, transb
|
||||||
integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
|
integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
|
||||||
real(c_float), intent(in), value :: alpha, beta
|
real(c_float), intent(in) :: alpha, beta
|
||||||
real(c_float) :: a, b, c
|
real(c_float) :: a, b, c
|
||||||
end subroutine
|
end subroutine
|
||||||
|
|
||||||
@ -165,7 +165,7 @@ module gpu
|
|||||||
type(c_ptr), value, intent(in) :: handle
|
type(c_ptr), value, intent(in) :: handle
|
||||||
character(c_char), intent(in) :: transa
|
character(c_char), intent(in) :: transa
|
||||||
integer(c_int64_t), intent(in), value :: m, n, lda, incx, incy
|
integer(c_int64_t), intent(in), value :: m, n, lda, incx, incy
|
||||||
real(c_double), intent(in), value :: alpha, beta
|
real(c_double), intent(in) :: alpha, beta
|
||||||
real(c_double) :: a, x, y
|
real(c_double) :: a, x, y
|
||||||
end subroutine
|
end subroutine
|
||||||
|
|
||||||
@ -175,7 +175,7 @@ module gpu
|
|||||||
type(c_ptr), value, intent(in) :: handle
|
type(c_ptr), value, intent(in) :: handle
|
||||||
character(c_char), intent(in) :: transa
|
character(c_char), intent(in) :: transa
|
||||||
integer(c_int64_t), intent(in), value :: m, n, lda, incx, incy
|
integer(c_int64_t), intent(in), value :: m, n, lda, incx, incy
|
||||||
real(c_float), intent(in), value :: alpha, beta
|
real(c_float), intent(in) :: alpha, beta
|
||||||
real(c_float) :: a, x, y
|
real(c_float) :: a, x, y
|
||||||
end subroutine
|
end subroutine
|
||||||
|
|
||||||
@ -186,7 +186,7 @@ module gpu
|
|||||||
type(c_ptr), value, intent(in) :: handle
|
type(c_ptr), value, intent(in) :: handle
|
||||||
character(c_char), intent(in) :: transa, transb
|
character(c_char), intent(in) :: transa, transb
|
||||||
integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc
|
integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc
|
||||||
real(c_double), intent(in), value :: alpha, beta
|
real(c_double), intent(in) :: alpha, beta
|
||||||
real(c_double) :: a, b, c
|
real(c_double) :: a, b, c
|
||||||
end subroutine
|
end subroutine
|
||||||
|
|
||||||
@ -196,7 +196,7 @@ module gpu
|
|||||||
type(c_ptr), value, intent(in) :: handle
|
type(c_ptr), value, intent(in) :: handle
|
||||||
character(c_char), intent(in), value :: transa, transb
|
character(c_char), intent(in), value :: transa, transb
|
||||||
integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc
|
integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc
|
||||||
real(c_float), intent(in), value :: alpha, beta
|
real(c_float), intent(in) :: alpha, beta
|
||||||
real(c_float) :: a, b, c
|
real(c_float) :: a, b, c
|
||||||
end subroutine
|
end subroutine
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user