mirror of https://github.com/QuantumPackage/qp2.git synced 2024-11-03 12:43:48 +01:00

Merge branch 'dev-stable' of github.com:QuantumPackage/qp2 into dev-stable

Anthony Scemama 2024-07-03 14:50:14 +02:00
commit 6dfa8f8b32
25 changed files with 2827 additions and 388 deletions

configure
View File

@ -40,14 +40,16 @@ Usage:
   $(basename $0) -c <file>
   $(basename $0) -h
   $(basename $0) -i <package>
+  $(basename $0) -g [nvidia|none]

 Options:
   -c <file>         Define a COMPILATION configuration file,
                     in "${QP_ROOT}/config/".
   -h                Print the HELP message
   -i <package>      INSTALL <package>. Use at your OWN RISK:
                     no support will be provided for the installation of
                     dependencies.
+  -g [nvidia|none]  Choose GPU acceleration (experimental)

 Example:
   ./$(basename $0) -c config/gfortran.cfg
@ -83,7 +85,7 @@ function execute () {
 PACKAGES=""

-while getopts "d:c:i:h" c ; do
+while getopts "d:c:i:g:h" c ; do
   case "$c" in
     c)
       case "$OPTARG" in
@ -100,6 +102,9 @@ while getopts "d:c:i:h" c ; do
"") help ; break;; "") help ; break;;
*) PACKAGES="${PACKAGE} $OPTARG" *) PACKAGES="${PACKAGE} $OPTARG"
esac;; esac;;
g)
GPU=$OPTARG;
break;;
h) h)
help help
exit 0;; exit 0;;
@ -109,6 +114,23 @@ while getopts "d:c:i:h" c ; do
   esac
 done

+# Handle GPU acceleration
+rm -f ${QP_ROOT}/src/gpu_arch
+case "$GPU" in
+  amd) # AMD
+    echo "Activating AMD GPU acceleration"
+    ln -s ${QP_ROOT}/plugins/local/gpu_amd ${QP_ROOT}/src/gpu_arch
+    ;;
+  nvidia) # Nvidia
+    echo "Activating Nvidia GPU acceleration"
+    ln -s ${QP_ROOT}/plugins/local/gpu_nvidia ${QP_ROOT}/src/gpu_arch
+    ;;
+  *) # No acceleration
+    echo "Disabling GPU acceleration"
+    ln -s ${QP_ROOT}/plugins/local/gpu_x86 ${QP_ROOT}/src/gpu_arch
+    ;;
+esac
+
 # Trim leading and trailing spaces
 PACKAGES=$(echo $PACKAGES | xargs)

View File

@ -0,0 +1 @@
-lcudart -lcublas -lcublasLt

View File

@ -0,0 +1 @@

View File

@ -0,0 +1,5 @@
==========
gpu_nvidia
==========
Nvidia implementation of GPU routines. Uses CUDA and CUBLAS libraries.
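A minimal sketch of how these entry points chain together, written against the prototypes of the gpu.c listed below (illustrative only: in QP2 the actual callers are the Fortran gpu module bindings, not a C main):

#include <stdint.h>
#include <stdio.h>
#include <cublas_v2.h>

/* Prototypes copied from the gpu.c below */
int  gpu_ndevices();
void gpu_allocate(void** ptr, const int64_t size);
void gpu_deallocate(void** ptr);
void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n);
void gpu_blas_create(cublasHandle_t* ptr);
void gpu_blas_destroy(cublasHandle_t* ptr);
void gpu_ddot(cublasHandle_t handle, const int64_t n, const double* x,
              const int64_t incx, const double* y, const int64_t incy,
              double* result);

int main() {
  if (gpu_ndevices() < 1) return 1;                /* no CUDA device visible */

  const int64_t n = 1000;
  double x[1000], result;
  for (int i = 0; i < 1000; i++) x[i] = 1.0;

  double* d_x;
  gpu_allocate((void**) &d_x, n * sizeof(double)); /* managed memory, see gpu_allocate */
  gpu_upload(x, d_x, n * sizeof(double));          /* sizes are in bytes */

  cublasHandle_t handle;
  gpu_blas_create(&handle);
  gpu_ddot(handle, n, d_x, 1, d_x, 1, &result);    /* expect 1000.0 */
  printf("x.x = %f\n", result);

  gpu_blas_destroy(&handle);
  gpu_deallocate((void**) &d_x);
  return 0;
}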

View File

@ -0,0 +1,331 @@
#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
/* Generic functions */
int gpu_ndevices() {
int ngpus;
cudaGetDeviceCount(&ngpus);
return ngpus;
}
void gpu_set_device(int32_t igpu) {
cudaSetDevice((int) igpu);
}
/* Allocation functions */
void gpu_allocate(void** ptr, const int64_t size) {
size_t free, total;
cudaError_t rc = cudaMemGetInfo( &free, &total );
if (rc != cudaSuccess) {
free = INT64_MAX;
}
rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
// /* Use managed memory if it does not fit on the GPU */
// if (size < free && size < total/2) {
// rc= cudaMalloc(ptr, size);
// } else {
// rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
// }
assert (rc == cudaSuccess);
}
void gpu_deallocate(void** ptr) {
assert (*ptr != NULL);
cudaFree(*ptr);
*ptr = NULL;
}
/* Memory transfer functions */
void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
cudaMemcpy (gpu_ptr, cpu_ptr, n, cudaMemcpyHostToDevice);
}
void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
cudaMemcpy (cpu_ptr, gpu_ptr, n, cudaMemcpyDeviceToHost);
}
void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
cudaMemcpy (gpu_ptr_dest, gpu_ptr_src, n, cudaMemcpyDeviceToDevice);
}
/* Streams */
void gpu_stream_create(cudaStream_t* ptr) {
cudaError_t rc = cudaStreamCreate(ptr);
assert (rc == cudaSuccess);
}
void gpu_stream_destroy(cudaStream_t* ptr) {
assert (ptr != NULL);
cudaError_t rc = cudaStreamDestroy(*ptr);
assert (rc == cudaSuccess);
*ptr = NULL;
}
void gpu_set_stream(cublasHandle_t handle, cudaStream_t stream) {
cublasSetStream(handle, stream);
}
void gpu_synchronize() {
cudaDeviceSynchronize();
}
/* BLAS functions */
void gpu_blas_create(cublasHandle_t* ptr) {
cublasStatus_t rc = cublasCreate(ptr);
assert (rc == CUBLAS_STATUS_SUCCESS);
}
void gpu_blas_destroy(cublasHandle_t* ptr) {
assert (ptr != NULL);
cublasStatus_t rc = cublasDestroy(*ptr);
assert (rc == CUBLAS_STATUS_SUCCESS);
*ptr = NULL;  /* clear the caller's handle, not the local pointer copy */
}
void gpu_ddot(cublasHandle_t handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
assert (handle != NULL);
/* Convert to int */
int n_, incx_, incy_;
n_ = (int) n;
incx_ = (int) incx;
incy_ = (int) incy;
assert ( (int64_t) n_ == n );
assert ( (int64_t) incx_ == incx);
assert ( (int64_t) incy_ == incy);
cublasStatus_t rc = cublasDdot(handle, n_, x, incx_, y, incy_, result);
/*
double alpha = 1.0;
double beta = 0.0;
cublasStatus_t rc = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, n_, &alpha, x, 1, y, n_, &beta, &result_, 1);
*/
assert (rc == CUBLAS_STATUS_SUCCESS);
}
void gpu_sdot(cublasHandle_t handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
assert (handle != NULL);
/* Convert to int */
int n_, incx_, incy_;
n_ = (int) n;
incx_ = (int) incx;
incy_ = (int) incy;
/* Check for integer overflows */
assert ( (int64_t) n_ == n );
assert ( (int64_t) incx_ == incx);
assert ( (int64_t) incy_ == incy);
float result_ = 0.;
cublasStatus_t rc = cublasSdot(handle, n_, x, incx_, y, incy_, &result_);
assert (rc == CUBLAS_STATUS_SUCCESS);
*result = result_;
}
void gpu_dgemv(cublasHandle_t handle, const char transa, const int64_t m, const int64_t n, const double alpha,
const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) {
assert (handle != NULL);
/* Convert to int */
int m_, n_, lda_, incx_, incy_;
m_ = (int) m;
n_ = (int) n;
lda_ = (int) lda;
incx_ = (int) incx;
incy_ = (int) incy;
/* Check for integer overflows */
assert ( (int64_t) m_ == m );
assert ( (int64_t) n_ == n );
assert ( (int64_t) lda_ == lda );
assert ( (int64_t) incx_ == incx);
assert ( (int64_t) incy_ == incy);
cublasOperation_t transa_ = CUBLAS_OP_N;
if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
cublasDgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_);
}
void gpu_sgemv(cublasHandle_t handle, const char transa, const int64_t m, const int64_t n, const float alpha,
const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) {
assert (handle != NULL);
/* Convert to int */
int m_, n_, lda_, incx_, incy_;
m_ = (int) m;
n_ = (int) n;
lda_ = (int) lda;
incx_ = (int) incx;
incy_ = (int) incy;
/* Check for integer overflows */
assert ( (int64_t) m_ == m );
assert ( (int64_t) n_ == n );
assert ( (int64_t) lda_ == lda );
assert ( (int64_t) incx_ == incx);
assert ( (int64_t) incy_ == incy);
cublasOperation_t transa_ = CUBLAS_OP_N;
if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
cublasSgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_);
}
void gpu_dgemm(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) {
assert (handle != NULL);
/* Convert to int */
int m_, n_, k_, lda_, ldb_, ldc_;
m_ = (int) m;
n_ = (int) n;
k_ = (int) k;
lda_ = (int) lda;
ldb_ = (int) ldb;
ldc_ = (int) ldc;
/* Check for integer overflows */
assert ( (int64_t) m_ == m );
assert ( (int64_t) n_ == n );
assert ( (int64_t) k_ == k );
assert ( (int64_t) lda_ == lda);
assert ( (int64_t) ldb_ == ldb);
assert ( (int64_t) ldc_ == ldc);
cublasOperation_t transa_ = CUBLAS_OP_N;
cublasOperation_t transb_ = CUBLAS_OP_N;
if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
cublasDgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_);
}
void gpu_sgemm(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) {
assert (handle != NULL);
/* Convert to int */
int m_, n_, k_, lda_, ldb_, ldc_;
m_ = (int) m;
n_ = (int) n;
k_ = (int) k;
lda_ = (int) lda;
ldb_ = (int) ldb;
ldc_ = (int) ldc;
/* Check for integer overflows */
assert ( (int64_t) m_ == m );
assert ( (int64_t) n_ == n );
assert ( (int64_t) k_ == k );
assert ( (int64_t) lda_ == lda);
assert ( (int64_t) ldb_ == ldb);
assert ( (int64_t) ldc_ == ldc);
cublasOperation_t transa_ = CUBLAS_OP_N;
cublasOperation_t transb_ = CUBLAS_OP_N;
if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
cublasSgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_);
}
void gpu_dgeam(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
assert (handle != NULL);
/* Convert to int */
int m_, n_, lda_, ldb_, ldc_;
m_ = (int) m;
n_ = (int) n;
lda_ = (int) lda;
ldb_ = (int) ldb;
ldc_ = (int) ldc;
/* Check for integer overflows */
assert ( (int64_t) m_ == m );
assert ( (int64_t) n_ == n );
assert ( (int64_t) lda_ == lda);
assert ( (int64_t) ldb_ == ldb);
assert ( (int64_t) ldc_ == ldc);
cublasOperation_t transa_ = CUBLAS_OP_N;
cublasOperation_t transb_ = CUBLAS_OP_N;
if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
cublasDgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_);
}
void gpu_sgeam(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
assert (handle != NULL);
/* Convert to int */
int m_, n_, lda_, ldb_, ldc_;
m_ = (int) m;
n_ = (int) n;
lda_ = (int) lda;
ldb_ = (int) ldb;
ldc_ = (int) ldc;
/* Check for integer overflows */
assert ( (int64_t) m_ == m );
assert ( (int64_t) n_ == n );
assert ( (int64_t) lda_ == lda);
assert ( (int64_t) ldb_ == ldb);
assert ( (int64_t) ldc_ == ldc);
cublasOperation_t transa_ = CUBLAS_OP_N;
cublasOperation_t transb_ = CUBLAS_OP_N;
if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
cublasSgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_);
}

View File

@ -0,0 +1 @@

View File

@ -0,0 +1,5 @@
=======
gpu_x86
=======
x86 implementation of GPU routines. For use when GPUs are not available.
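This backend keeps the full contract of the CUDA version, but "device" pointers are plain host pointers and the handles are dummy non-NULL tokens (see the gpu.c below). A minimal sketch under that assumption, linking against this file and a BLAS library:

#include <assert.h>
#include <stdint.h>

/* Prototypes copied from the gpu.c below */
void gpu_allocate(void** ptr, const int64_t n);
void gpu_deallocate(void** ptr);
void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n);
void gpu_blas_create(void** handle);
void gpu_blas_destroy(void** handle);

int main() {
  double h[4] = {1., 2., 3., 4.};
  double* d;
  gpu_allocate((void**) &d, sizeof(h));  /* malloc under the hood */
  gpu_upload(h, d, sizeof(h));           /* memcpy under the hood */
  assert(d[1] == 2.);                    /* host memory: directly addressable */

  void* handle;
  gpu_blas_create(&handle);              /* dummy non-NULL token */
  assert(handle != NULL);

  gpu_blas_destroy(&handle);
  gpu_deallocate((void**) &d);
  return 0;
}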

plugins/local/gpu_x86/gpu.c Normal file
View File

@ -0,0 +1,500 @@
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <assert.h>
/* Generic functions */
int gpu_ndevices() {
return 0;
}
void gpu_set_device(int32_t i) {
return;
}
/* Allocation functions */
void gpu_allocate(void** ptr, const int64_t n) {
*ptr = malloc((size_t) n);
if (*ptr == NULL) {
perror("Allocation failed");
}
}
void gpu_deallocate(void** ptr) {
free(*ptr);
*ptr = NULL;
}
/* Memory transfer functions */
void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
memcpy(gpu_ptr, cpu_ptr, n);
}
void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
memcpy(cpu_ptr, gpu_ptr, n);
}
void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
memcpy(gpu_ptr_dest, gpu_ptr_src, n);
}
/* Streams */
void gpu_stream_create(void** ptr) {
*ptr = (void*) 2;
}
void gpu_stream_destroy(void** ptr) {
*ptr = NULL;
}
void gpu_set_stream(void* handle, void* stream) {
return;
}
void gpu_synchronize() {
return;
}
/* BLAS functions */
void gpu_blas_create(void** handle) {
*handle = (void*) 1;
}
void gpu_blas_destroy(void** handle) {
*handle = NULL;
}
double ddot_(const int32_t* n, const double* x, const int32_t* incx, const double* y, const int32_t* incy);
void gpu_ddot(void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
assert (handle != NULL);
/* Convert to int32_t */
int32_t n_, incx_, incy_;
n_ = (int32_t) n;
incx_ = (int32_t) incx;
incy_ = (int32_t) incy;
/* Check for integer overflows */
assert ( (int64_t) n_ == n );
assert ( (int64_t) incx_ == incx);
assert ( (int64_t) incy_ == incy);
*result = ddot_(&n_, x, &incx_, y, &incy_);
}
float sdot_(const int32_t* n, const float* x, const int32_t* incx, const float* y, const int32_t* incy);
void gpu_sdot(void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
assert (handle != NULL);
/* Convert to int32_t */
int32_t n_, incx_, incy_;
n_ = (int32_t) n;
incx_ = (int32_t) incx;
incy_ = (int32_t) incy;
/* Check for integer overflows */
assert ( (int64_t) n_ == n );
assert ( (int64_t) incx_ == incx);
assert ( (int64_t) incy_ == incy);
*result = sdot_(&n_, x, &incx_, y, &incy_);
}
void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha,
const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy);
void gpu_dgemv(void* handle, const char transa, const int64_t m, const int64_t n, const double alpha,
const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) {
assert (handle != NULL);
/* Convert to int32_t */
int32_t m_, n_, lda_, incx_, incy_;
m_ = (int32_t) m;
n_ = (int32_t) n;
lda_ = (int32_t) lda;
incx_ = (int32_t) incx;
incy_ = (int32_t) incy;
/* Check for integer overflows */
assert ( (int64_t) m_ == m );
assert ( (int64_t) n_ == n );
assert ( (int64_t) lda_ == lda );
assert ( (int64_t) incx_ == incx);
assert ( (int64_t) incy_ == incy);
dgemv_(&transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_);
}
void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha,
const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy);
void gpu_sgemv(void* handle, const char transa, const int64_t m, const int64_t n, const float alpha,
const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) {
assert (handle != NULL);
/* Convert to int32_t */
int32_t m_, n_, lda_, incx_, incy_;
m_ = (int32_t) m;
n_ = (int32_t) n;
lda_ = (int32_t) lda;
incx_ = (int32_t) incx;
incy_ = (int32_t) incy;
/* Check for integer overflows */
assert ( (int64_t) m_ == m );
assert ( (int64_t) n_ == n );
assert ( (int64_t) lda_ == lda );
assert ( (int64_t) incx_ == incx);
assert ( (int64_t) incy_ == incy);
sgemv_(&transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_);
}
void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha,
const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc);
void gpu_dgemm(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) {
assert (handle != NULL);
/* Convert to int32_t */
int32_t m_, n_, k_, lda_, ldb_, ldc_;
m_ = (int32_t) m;
n_ = (int32_t) n;
k_ = (int32_t) k;
lda_ = (int32_t) lda;
ldb_ = (int32_t) ldb;
ldc_ = (int32_t) ldc;
/* Check for integer overflows */
assert ( (int64_t) m_ == m );
assert ( (int64_t) n_ == n );
assert ( (int64_t) k_ == k );
assert ( (int64_t) lda_ == lda);
assert ( (int64_t) ldb_ == ldb);
assert ( (int64_t) ldc_ == ldc);
dgemm_(&transa, &transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_);
}
void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha,
const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc);
void gpu_sgemm(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) {
assert (handle != NULL);
/* Convert to int32_t */
int32_t m_, n_, k_, lda_, ldb_, ldc_;
m_ = (int32_t) m;
n_ = (int32_t) n;
k_ = (int32_t) k;
lda_ = (int32_t) lda;
ldb_ = (int32_t) ldb;
ldc_ = (int32_t) ldc;
/* Check for integer overflows */
assert ( (int64_t) m_ == m );
assert ( (int64_t) n_ == n );
assert ( (int64_t) k_ == k );
assert ( (int64_t) lda_ == lda);
assert ( (int64_t) ldb_ == ldb);
assert ( (int64_t) ldc_ == ldc);
sgemm_(&transa, &transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_);
}
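/* gpu_dgeam / gpu_sgeam below mirror the semantics of the cublas<t>geam
 * extension, C = alpha*op(A) + beta*op(B) with op in {N, T}; they are
 * written as explicit loops because standard BLAS has no equivalent. */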
void gpu_dgeam(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
assert (handle != NULL);
if ( (transa == 'N' && transb == 'N') ||
(transa == 'n' && transb == 'N') ||
(transa == 'N' && transb == 'n') ||
(transa == 'n' && transb == 'n') ) {
if (alpha == 0.) {
for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = beta * b[j*ldb+i];
}
}
} else if (beta == 0.) {
for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = alpha * a[j*lda+i];
}
}
} else {
for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[j*ldb+i];
}
}
}
} else if ( (transa == 'N' && transb == 'T') ||
(transa == 'n' && transb == 'T') ||
(transa == 'N' && transb == 't') ||
(transa == 'n' && transb == 't') ) {
if (alpha == 0.) {
for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = beta * b[i*ldb+j];
}
}
} else if (beta == 0.) {
for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = alpha * a[j*lda+i];
}
}
} else {
for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[i*ldb+j];
}
}
}
} else if ( (transa == 'T' && transb == 'N') ||
(transa == 't' && transb == 'N') ||
(transa == 'T' && transb == 'n') ||
(transa == 't' && transb == 'n') ) {
if (alpha == 0.) {
for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = beta * b[j*ldb+i];
}
}
} else if (beta == 0.) {
for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = alpha * a[i*lda+j];
}
}
} else {
for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[j*ldb+i];
}
}
}
} else if ( (transa == 'T' && transb == 'T') ||
(transa == 't' && transb == 'T') ||
(transa == 'T' && transb == 't') ||
(transa == 't' && transb == 't') ) {
if (alpha == 0.) {
for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = beta * b[i*ldb+j];
}
}
} else if (beta == 0.) {
for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = alpha * a[i*lda+j];
}
}
} else {
for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[i*ldb+j];
}
}
}
}
}
void gpu_sgeam(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
assert (handle != NULL);
if ( (transa == 'N' && transb == 'N') ||
(transa == 'n' && transb == 'N') ||
(transa == 'N' && transb == 'n') ||
(transa == 'n' && transb == 'n') ) {
if (alpha == 0.) {
for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = beta * b[j*ldb+i];
}
}
} else if (beta == 0.) {
for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = alpha * a[j*lda+i];
}
}
} else {
for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[j*ldb+i];
}
}
}
} else if ( (transa == 'N' && transb == 'T') ||
(transa == 'n' && transb == 'T') ||
(transa == 'N' && transb == 't') ||
(transa == 'n' && transb == 't') ) {
if (alpha == 0.) {
for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = beta * b[i*ldb+j];
}
}
} else if (beta == 0.) {
for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = alpha * a[j*lda+i];
}
}
} else {
for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[i*ldb+j];
}
}
}
} else if ( (transa == 'T' && transb == 'N') ||
(transa == 't' && transb == 'N') ||
(transa == 'T' && transb == 'n') ||
(transa == 't' && transb == 'n') ) {
if (alpha == 0.) {
for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = beta * b[j*ldb+i];
}
}
} else if (beta == 0.) {
for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = alpha * a[i*lda+j];
}
}
} else {
for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[j*ldb+i];
}
}
}
} else if ( (transa == 'T' && transb == 'T') ||
(transa == 't' && transb == 'T') ||
(transa == 'T' && transb == 't') ||
(transa == 't' && transb == 't') ) {
if (alpha == 0.) {
for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = beta * b[i*ldb+j];
}
}
} else if (beta == 0.) {
for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = alpha * a[i*lda+j];
}
}
} else {
for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[i*ldb+j];
}
}
}
}
}

View File

@ -31,6 +31,9 @@ subroutine print_aos()
   integer          :: i, ipoint
   double precision :: r(3)
   double precision :: ao_val, ao_der(3), ao_lap
+  double precision :: accu_vgl(5)
+  double precision :: accu_vgl_nrm(5)
   double precision :: mo_val, mo_der(3), mo_lap

   PROVIDE final_grid_points aos_in_r_array aos_grad_in_r_array aos_lapl_in_r_array
@ -40,9 +43,6 @@ subroutine print_aos()
     write(1000, '(3(f15.7, 3X))') r
   enddo

-  double precision :: accu_vgl(5)
-  double precision :: accu_vgl_nrm(5)

   do ipoint = 1, n_points_final_grid
     do i = 1, ao_num
       ao_val = aos_in_r_array (i,ipoint)

View File

@ -78,7 +78,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
   !$OMP PRIVATE (i, j, k, l, ipoint, ao_i_r, ao_k_r, weight1) &
   !$OMP SHARED (ao_num, n_points_final_grid, ao_two_e_tc_tot, &
   !$OMP         aos_in_r_array_transp, final_weight_at_r_vector, int2_grad1_u12_square_ao)
-  !$OMP DO COLLAPSE(4)
+  !$OMP DO COLLAPSE(3)
   do i = 1, ao_num
     do k = 1, ao_num
       do l = 1, ao_num
@ -188,7 +188,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
   !$OMP SHARED (ao_num, n_points_final_grid, ao_two_e_tc_tot, &
   !$OMP         aos_in_r_array_transp, final_weight_at_r_vector, &
   !$OMP         int2_grad1_u12_ao, aos_grad_in_r_array_transp_bis)
-  !$OMP DO COLLAPSE(4)
+  !$OMP DO COLLAPSE(3)
   do i = 1, ao_num
     do k = 1, ao_num
       do l = 1, ao_num
@ -270,7 +270,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
   !$OMP PARALLEL DEFAULT(NONE) &
   !$OMP PRIVATE(i, j, k, l, integ_zero, integ_val) &
   !$OMP SHARED(ao_num, ao_two_e_tc_tot)
-  !$OMP DO COLLAPSE(4)
+  !$OMP DO COLLAPSE(3)
   do j = 1, ao_num
     do l = 1, ao_num
       do i = 1, ao_num
@ -293,7 +293,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
   !$OMP PARALLEL DEFAULT(NONE) &
   !$OMP SHARED(ao_num, ao_two_e_tc_tot, ao_integrals_map) &
   !$OMP PRIVATE(i, j, k, l)
-  !$OMP DO COLLAPSE(4)
+  !$OMP DO COLLAPSE(3)
   do j = 1, ao_num
     do l = 1, ao_num
       do i = 1, ao_num
@ -306,7 +306,6 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
   enddo
   !$OMP END DO
   !$OMP END PARALLEL

-  !call clear_ao_map()
   FREE ao_integrals_map
   endif

View File

@ -0,0 +1,5 @@
tc_keywords
jastrow
qmckl
becke_numerical_grid
dft_utils_in_r

View File

@ -0,0 +1,4 @@
======
tc_int
======

View File

@ -0,0 +1,295 @@
! ---
subroutine provide_int2_grad1_u12_ao()
BEGIN_DOC
!
! int2_grad1_u12_ao(i,j,ipoint,1) = \int dr2 [\grad1 u(r1,r2)]_x1 \chi_i(r2) \chi_j(r2)
! int2_grad1_u12_ao(i,j,ipoint,2) = \int dr2 [\grad1 u(r1,r2)]_y1 \chi_i(r2) \chi_j(r2)
! int2_grad1_u12_ao(i,j,ipoint,3) = \int dr2 [\grad1 u(r1,r2)]_z1 \chi_i(r2) \chi_j(r2)
! int2_grad1_u12_ao(i,j,ipoint,4) = \int dr2 [-(1/2) [\grad1 u(r1,r2)]^2] \chi_i(r2) \chi_j(r2)
!
!
! tc_int_2e_ao(k,i,l,j) = (ki|V^TC(r_12)|lj)
! = <lk| V^TC(r_12) |ji> where V^TC(r_12) is the total TC operator
! = tc_grad_and_lapl_ao(k,i,l,j) + tc_grad_square_ao(k,i,l,j) + ao_two_e_coul(k,i,l,j)
! where:
!
! tc_grad_and_lapl_ao(k,i,l,j) = < k l | -1/2 \Delta_1 u(r1,r2) - \grad_1 u(r1,r2) . \grad_1 | ij >
! = -1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2)
! = 1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 (-1) \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2)
!
! tc_grad_square_ao(k,i,l,j) = -1/2 <kl | |\grad_1 u(r1,r2)|^2 + |\grad_2 u(r1,r2)|^2 | ij>
!
! ao_two_e_coul(k,i,l,j) = < l k | 1/r12 | j i > = ( k i | 1/r12 | l j )
!
END_DOC
implicit none
integer :: i, j, k, l, m, ipoint, jpoint
integer :: n_blocks, n_rest, n_pass
integer :: i_blocks, i_rest, i_pass, ii
double precision :: mem, n_double
double precision :: weight1, ao_k_r, ao_i_r
double precision :: der_envsq_x, der_envsq_y, der_envsq_z, lap_envsq
double precision :: time0, time1, time2, tc1, tc2, tc
double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:), tc_int_2e_ao(:,:,:,:)
double precision, allocatable :: tmp(:,:,:), c_mat(:,:,:), tmp_grad1_u12(:,:,:)
double precision, external :: get_ao_two_e_integral
PROVIDE final_weight_at_r_vector_extra aos_in_r_array_extra
PROVIDE final_weight_at_r_vector aos_grad_in_r_array_transp_bis aos_in_r_array_transp
print*, ' start provide_int2_grad1_u12_ao ...'
call wall_time(time0)
call total_memory(mem)
mem = max(1.d0, qp_max_mem - mem)
n_double = mem * 1.d8
n_blocks = int(min(n_double / (n_points_extra_final_grid * 4.d0), 1.d0*n_points_final_grid))
n_rest = int(mod(n_points_final_grid, n_blocks))
n_pass = int((n_points_final_grid - n_rest) / n_blocks)
call write_int(6, n_pass, 'Number of passes')
call write_int(6, n_blocks, 'Size of the blocks')
call write_int(6, n_rest, 'Size of the last block')
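! Illustrative walk-through of the blocking arithmetic above (an assumption:
! qp_max_mem is in GB, consistent with the conservative 1.d8 doubles per GB):
!   mem       = qp_max_mem - memory already used      e.g. 64 - 14 = 50
!   n_double  = 50 * 1.d8 = 5.d9 doubles available
!   n_blocks  = min(n_double / (4*n_points_extra_final_grid), n_points_final_grid)
!   n_points_final_grid = n_pass * n_blocks + n_rest  (n_rest = last partial block)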
! ---
! ---
! ---
allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,4))
allocate(tmp(n_points_extra_final_grid,ao_num,ao_num))
!$OMP PARALLEL &
!$OMP DEFAULT (NONE) &
!$OMP PRIVATE (j, i, jpoint) &
!$OMP SHARED (tmp, ao_num, n_points_extra_final_grid, final_weight_at_r_vector_extra, aos_in_r_array_extra_transp)
!$OMP DO SCHEDULE (static)
do j = 1, ao_num
do i = 1, ao_num
do jpoint = 1, n_points_extra_final_grid
tmp(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j)
enddo
enddo
enddo
!$OMP END DO
!$OMP END PARALLEL
allocate(tmp_grad1_u12(n_points_extra_final_grid,n_blocks,4))
tc = 0.d0
do i_pass = 1, n_pass
ii = (i_pass-1)*n_blocks + 1
call wall_time(tc1)
!$OMP PARALLEL &
!$OMP DEFAULT (NONE) &
!$OMP PRIVATE (i_blocks, ipoint) &
!$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12)
!$OMP DO
do i_blocks = 1, n_blocks
ipoint = ii - 1 + i_blocks ! r1
call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_blocks,1), tmp_grad1_u12(1,i_blocks,2), tmp_grad1_u12(1,i_blocks,3), tmp_grad1_u12(1,i_blocks,4))
enddo
!$OMP END DO
!$OMP END PARALLEL
call wall_time(tc2)
tc = tc + tc2 - tc1
do m = 1, 4
call dgemm( "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0 &
, tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid &
, 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num)
enddo
enddo
deallocate(tmp_grad1_u12)
if(n_rest .gt. 0) then
allocate(tmp_grad1_u12(n_points_extra_final_grid,n_rest,4))
ii = n_pass*n_blocks + 1
call wall_time(tc1)
!$OMP PARALLEL &
!$OMP DEFAULT (NONE) &
!$OMP PRIVATE (i_rest, ipoint) &
!$OMP SHARED (n_rest, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12)
!$OMP DO
do i_rest = 1, n_rest
ipoint = ii - 1 + i_rest ! r1
call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_rest,1), tmp_grad1_u12(1,i_rest,2), tmp_grad1_u12(1,i_rest,3), tmp_grad1_u12(1,i_rest,4))
enddo
!$OMP END DO
!$OMP END PARALLEL
call wall_time(tc2)
tc = tc + tc2 - tc1
do m = 1, 4
call dgemm( "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0 &
, tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid &
, 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num)
enddo
deallocate(tmp_grad1_u12)
endif
deallocate(tmp)
call wall_time(time1)
print*, ' wall time for int2_grad1_u12_ao (min) = ', (time1-time0) / 60.d0
print*, ' wall time Jastrow derivatives (min) = ', tc / 60.d0
call print_memory_usage()
! ---
! ---
! ---
allocate(tc_int_2e_ao(ao_num,ao_num,ao_num,ao_num))
call wall_time(time1)
allocate(c_mat(n_points_final_grid,ao_num,ao_num))
!$OMP PARALLEL &
!$OMP DEFAULT (NONE) &
!$OMP PRIVATE (i, k, ipoint) &
!$OMP SHARED (aos_in_r_array_transp, c_mat, ao_num, n_points_final_grid, final_weight_at_r_vector)
!$OMP DO SCHEDULE (static)
do i = 1, ao_num
do k = 1, ao_num
do ipoint = 1, n_points_final_grid
c_mat(ipoint,k,i) = final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,k)
enddo
enddo
enddo
!$OMP END DO
!$OMP END PARALLEL
call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 &
, int2_grad1_u12_ao(1,1,1,4), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
, 0.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num)
deallocate(c_mat)
call wall_time(time2)
print*, ' wall time of Hermitian part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0
call print_memory_usage()
! ---
call wall_time(time1)
allocate(c_mat(n_points_final_grid,ao_num,ao_num))
do m = 1, 3
!$OMP PARALLEL &
!$OMP DEFAULT (NONE) &
!$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r) &
!$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, c_mat, &
!$OMP ao_num, n_points_final_grid, final_weight_at_r_vector, m)
!$OMP DO SCHEDULE (static)
do i = 1, ao_num
do k = 1, ao_num
do ipoint = 1, n_points_final_grid
weight1 = 0.5d0 * final_weight_at_r_vector(ipoint)
ao_i_r = aos_in_r_array_transp(ipoint,i)
ao_k_r = aos_in_r_array_transp(ipoint,k)
c_mat(ipoint,k,i) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,m) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,m))
enddo
enddo
enddo
!$OMP END DO
!$OMP END PARALLEL
call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, -1.d0 &
, int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
, 1.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num)
enddo
deallocate(c_mat)
call wall_time(time2)
print*, ' wall time of non-Hermitian part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0
call print_memory_usage()
! ---
call wall_time(time1)
call sum_A_At(tc_int_2e_ao(1,1,1,1), ao_num*ao_num)
call wall_time(time2)
print*, ' lower- and upper-triangle of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0
call print_memory_usage()
! ---
call wall_time(time1)
PROVIDE ao_integrals_map
!$OMP PARALLEL DEFAULT(NONE) &
!$OMP SHARED(ao_num, tc_int_2e_ao, ao_integrals_map) &
!$OMP PRIVATE(i, j, k, l)
!$OMP DO COLLAPSE(3)
do j = 1, ao_num
do l = 1, ao_num
do i = 1, ao_num
do k = 1, ao_num
! < 1:i, 2:j | 1:k, 2:l >
tc_int_2e_ao(k,i,l,j) = tc_int_2e_ao(k,i,l,j) + get_ao_two_e_integral(i, j, k, l, ao_integrals_map)
enddo
enddo
enddo
enddo
!$OMP END DO
!$OMP END PARALLEL
call wall_time(time2)
print*, ' wall time of Coulomb part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0
call print_memory_usage()
! ---
print*, ' Writing int2_grad1_u12_ao in ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao'
open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="write")
call ezfio_set_work_empty(.False.)
write(11) int2_grad1_u12_ao(:,:,:,1:3)
close(11)
print*, ' Saving tc_int_2e_ao in ', trim(ezfio_filename) // '/work/ao_two_e_tc_tot'
open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/ao_two_e_tc_tot', action="write")
call ezfio_set_work_empty(.False.)
do i = 1, ao_num
write(11) tc_int_2e_ao(:,:,:,i)
enddo
close(11)
! ----
deallocate(int2_grad1_u12_ao)
deallocate(tc_int_2e_ao)
call wall_time(time2)
print*, ' wall time for tc_int_2e_ao (min) = ', (time2-time1) / 60.d0
call print_memory_usage()
! ---
call wall_time(time1)
print*, ' wall time for TC-integrals (min) = ', (time1-time0) / 60.d0
return
end
! ---

View File

@ -0,0 +1,245 @@
! ---
subroutine get_grad1_u12_for_tc(ipoint, n_grid2, resx, resy, resz, res)
BEGIN_DOC
!
! resx(ipoint) = [grad1 u(r1,r2)]_x1
! resy(ipoint) = [grad1 u(r1,r2)]_y1
! resz(ipoint) = [grad1 u(r1,r2)]_z1
! res (ipoint) = -0.5 [grad1 u(r1,r2)]^2
!
! We use:
! grid for r1
! extra_grid for r2
!
END_DOC
include 'constants.include.F'
implicit none
integer, intent(in) :: ipoint, n_grid2
double precision, intent(out) :: resx(n_grid2), resy(n_grid2), resz(n_grid2), res(n_grid2)
integer :: jpoint, i_nucl, p, mpA, npA, opA, pp
integer :: powmax1, powmax, powmax2
double precision :: r1(3), r2(3)
double precision :: tmp, tmp1, tmp2, tmp11, tmp22
double precision :: rn(3), f1A, grad1_f1A(3), f2A, grad2_f2A(3), g12, grad1_g12(3)
double precision, allocatable :: f1A_power(:), f2A_power(:), double_p(:), g12_power(:)
r1(1) = final_grid_points(1,ipoint)
r1(2) = final_grid_points(2,ipoint)
r1(3) = final_grid_points(3,ipoint)
call grad1_j12_r1_seq(r1, n_grid2, resx, resy, resz)
do jpoint = 1, n_grid2 ! r2
res(jpoint) = -0.5d0 * (resx(jpoint) * resx(jpoint) + resy(jpoint) * resy(jpoint) + resz(jpoint) * resz(jpoint))
enddo
return
end
! ---
subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
include 'constants.include.F'
implicit none
integer , intent(in) :: n_grid2
double precision, intent(in) :: r1(3)
double precision, intent(out) :: gradx(n_grid2)
double precision, intent(out) :: grady(n_grid2)
double precision, intent(out) :: gradz(n_grid2)
integer :: jpoint, i_nucl, p, mpA, npA, opA
double precision :: r2(3)
double precision :: dx, dy, dz, r12, tmp
double precision :: rn(3), f1A, grad1_f1A(3), f2A, grad2_f2A(3), g12, grad1_g12(3)
double precision :: tmp1, tmp2, dist
integer :: powmax1, powmax, powmax2
double precision, allocatable :: f1A_power(:), f2A_power(:), double_p(:), g12_power(:)
powmax1 = max(maxval(jBH_m), maxval(jBH_n))
powmax2 = maxval(jBH_o)
powmax = max(powmax1, powmax2)
allocate(f1A_power(-1:powmax), f2A_power(-1:powmax), g12_power(-1:powmax), double_p(0:powmax))
do p = 0, powmax
double_p(p) = dble(p)
enddo
f1A_power(-1) = 0.d0
f2A_power(-1) = 0.d0
g12_power(-1) = 0.d0
f1A_power(0) = 1.d0
f2A_power(0) = 1.d0
g12_power(0) = 1.d0
do jpoint = 1, n_grid2 ! r2
r2(1) = final_grid_points_extra(1,jpoint)
r2(2) = final_grid_points_extra(2,jpoint)
r2(3) = final_grid_points_extra(3,jpoint)
gradx(jpoint) = 0.d0
grady(jpoint) = 0.d0
gradz(jpoint) = 0.d0
call jBH_elem_fct_grad_alpha1(r1, r2, g12, grad1_g12)
! dist = (r1(1) - r2(1)) * (r1(1) - r2(1)) &
! + (r1(2) - r2(2)) * (r1(2) - r2(2)) &
! + (r1(3) - r2(3)) * (r1(3) - r2(3))
!
! if(dist .ge. 1d-15) then
! dist = dsqrt( dist )
!
! tmp1 = 1.d0 / (1.d0 + dist)
!
! g12 = dist * tmp1
! tmp2 = tmp1 * tmp1 / dist
! grad1_g12(1) = tmp2 * (r1(1) - r2(1))
! grad1_g12(2) = tmp2 * (r1(2) - r2(2))
! grad1_g12(3) = tmp2 * (r1(3) - r2(3))
!
! else
!
! grad1_g12(1) = 0.d0
! grad1_g12(2) = 0.d0
! grad1_g12(3) = 0.d0
! g12 = 0.d0
!
! endif
!
do p = 1, powmax2
g12_power(p) = g12_power(p-1) * g12
enddo
do i_nucl = 1, nucl_num
rn(1) = nucl_coord(i_nucl,1)
rn(2) = nucl_coord(i_nucl,2)
rn(3) = nucl_coord(i_nucl,3)
call jBH_elem_fct_grad_alpha1(r1, rn, f1A, grad1_f1A)
! dist = (r1(1) - rn(1)) * (r1(1) - rn(1)) &
! + (r1(2) - rn(2)) * (r1(2) - rn(2)) &
! + (r1(3) - rn(3)) * (r1(3) - rn(3))
! if (dist > 1.d-15) then
! dist = dsqrt( dist )
!
! tmp1 = 1.d0 / (1.d0 + dist)
!
! f1A = dist * tmp1
! tmp2 = tmp1 * tmp1 / dist
! grad1_f1A(1) = tmp2 * (r1(1) - rn(1))
! grad1_f1A(2) = tmp2 * (r1(2) - rn(2))
! grad1_f1A(3) = tmp2 * (r1(3) - rn(3))
!
! else
!
! grad1_f1A(1) = 0.d0
! grad1_f1A(2) = 0.d0
! grad1_f1A(3) = 0.d0
! f1A = 0.d0
!
! endif
call jBH_elem_fct_grad_alpha1(r2, rn, f2A, grad2_f2A)
! dist = (r2(1) - rn(1)) * (r2(1) - rn(1)) &
! + (r2(2) - rn(2)) * (r2(2) - rn(2)) &
! + (r2(3) - rn(3)) * (r2(3) - rn(3))
!
! if (dist > 1.d-15) then
! dist = dsqrt( dist )
!
! tmp1 = 1.d0 / (1.d0 + dist)
!
! f2A = dist * tmp1
! tmp2 = tmp1 * tmp1 / dist
! grad2_f2A(1) = tmp2 * (r2(1) - rn(1))
! grad2_f2A(2) = tmp2 * (r2(2) - rn(2))
! grad2_f2A(3) = tmp2 * (r2(3) - rn(3))
!
! else
!
! grad2_f2A(1) = 0.d0
! grad2_f2A(2) = 0.d0
! grad2_f2A(3) = 0.d0
! f2A = 0.d0
!
! endif
! Compute powers of f1A and f2A
do p = 1, powmax1
f1A_power(p) = f1A_power(p-1) * f1A
f2A_power(p) = f2A_power(p-1) * f2A
enddo
do p = 1, jBH_size
mpA = jBH_m(p,i_nucl)
npA = jBH_n(p,i_nucl)
opA = jBH_o(p,i_nucl)
tmp = jBH_c(p,i_nucl)
! if (dabs(tmp) <= 1.d-10) cycle
!
if(mpA .eq. npA) then
tmp = tmp * 0.5d0
endif
tmp1 = double_p(mpA) * f1A_power(mpA-1) * f2A_power(npA) + double_p(npA) * f1A_power(npA-1) * f2A_power(mpA)
tmp1 = tmp1 * g12_power(opA) * tmp
tmp2 = double_p(opA) * g12_power(opA-1) * (f1A_power(mpA) * f2A_power(npA) + f1A_power(npA) * f2A_power(mpA)) * tmp
gradx(jpoint) = gradx(jpoint) + tmp1 * grad1_f1A(1) + tmp2 * grad1_g12(1)
grady(jpoint) = grady(jpoint) + tmp1 * grad1_f1A(2) + tmp2 * grad1_g12(2)
gradz(jpoint) = gradz(jpoint) + tmp1 * grad1_f1A(3) + tmp2 * grad1_g12(3)
enddo ! p
enddo ! i_nucl
enddo ! jpoint
return
end
subroutine jBH_elem_fct_grad_alpha1(r1, r2, fct, grad1_fct)
implicit none
double precision, intent(in) :: r1(3), r2(3)
double precision, intent(out) :: fct, grad1_fct(3)
double precision :: dist, tmp1, tmp2
dist = (r1(1) - r2(1)) * (r1(1) - r2(1)) &
+ (r1(2) - r2(2)) * (r1(2) - r2(2)) &
+ (r1(3) - r2(3)) * (r1(3) - r2(3))
if(dist .ge. 1d-15) then
dist = dsqrt( dist )
tmp1 = 1.d0 / (1.d0 + dist)
fct = dist * tmp1
tmp2 = tmp1 * tmp1 / dist
grad1_fct(1) = tmp2 * (r1(1) - r2(1))
grad1_fct(2) = tmp2 * (r1(2) - r2(2))
grad1_fct(3) = tmp2 * (r1(3) - r2(3))
else
grad1_fct(1) = 0.d0
grad1_fct(2) = 0.d0
grad1_fct(3) = 0.d0
fct = 0.d0
endif
return
end
! ---

View File

@ -0,0 +1,43 @@
! ---
subroutine jBH_elem_fct_grad(alpha, r1, r2, fct, grad1_fct)
implicit none
double precision, intent(in) :: alpha, r1(3), r2(3)
double precision, intent(out) :: fct, grad1_fct(3)
double precision :: dist, tmp1, tmp2, dist_inv
dist = (r1(1) - r2(1)) * (r1(1) - r2(1)) &
+ (r1(2) - r2(2)) * (r1(2) - r2(2)) &
+ (r1(3) - r2(3)) * (r1(3) - r2(3))
if(dist .ge. 1d-15) then
dist_inv = 1.d0/dsqrt( dist )
dist = dist_inv * dist
tmp1 = 1.d0 / (1.d0 + alpha * dist)
fct = alpha * dist * tmp1
tmp2 = alpha * tmp1 * tmp1 * dist_inv
grad1_fct(1) = tmp2 * (r1(1) - r2(1))
grad1_fct(2) = tmp2 * (r1(2) - r2(2))
grad1_fct(3) = tmp2 * (r1(3) - r2(3))
else
grad1_fct(1) = 0.d0
grad1_fct(2) = 0.d0
grad1_fct(3) = 0.d0
fct = 0.d0
endif
return
end
! ---
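For reference, a sketch of the closed form implemented by this helper (derived from the code above; setting alpha = 1 recovers jBH_elem_fct_grad_alpha1 from the previous file):

\[
f(\mathbf{r}_1,\mathbf{r}_2) = \frac{\alpha r_{12}}{1+\alpha r_{12}},
\qquad
\nabla_1 f = \frac{\alpha\,(\mathbf{r}_1-\mathbf{r}_2)}{r_{12}\,(1+\alpha r_{12})^{2}},
\qquad
r_{12} = |\mathbf{r}_1-\mathbf{r}_2|
\]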

View File

@ -0,0 +1,56 @@
! ---
program write_tc_int
implicit none
print *, ' j2e_type = ', j2e_type
print *, ' j1e_type = ', j1e_type
print *, ' env_type = ', env_type
my_grid_becke = .True.
PROVIDE tc_grid1_a tc_grid1_r
my_n_pt_r_grid = tc_grid1_r
my_n_pt_a_grid = tc_grid1_a
touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
my_extra_grid_becke = .True.
PROVIDE tc_grid2_a tc_grid2_r
my_n_pt_r_extra_grid = tc_grid2_r
my_n_pt_a_extra_grid = tc_grid2_a
touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid
call write_int(6, my_n_pt_r_grid, 'radial external grid over')
call write_int(6, my_n_pt_a_grid, 'angular external grid over')
call write_int(6, my_n_pt_r_extra_grid, 'radial internal grid over')
call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over')
call main()
end
! ---
subroutine main()
implicit none
PROVIDE io_tc_integ
print*, 'io_tc_integ = ', io_tc_integ
if(io_tc_integ .ne. "Write") then
print*, 'io_tc_integ != Write'
print*, io_tc_integ
stop
endif
call provide_int2_grad1_u12_ao()
call ezfio_set_tc_keywords_io_tc_integ('Read')
end
! ---

View File

@ -1,2 +1,3 @@
+gpu
 hartree_fock
 utils_cc

View File

@ -1,4 +1,5 @@
 subroutine run_ccsd_space_orb
+  use gpu

 implicit none
@ -9,9 +10,18 @@ subroutine run_ccsd_space_orb
   double precision :: uncorr_energy,energy, max_elem, max_r, max_r1, max_r2,ta,tb
   logical :: not_converged

-  double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:), tau(:,:,:,:), tau_x(:,:,:,:)
-  double precision, allocatable :: t1(:,:), r1(:,:)
-  double precision, allocatable :: H_oo(:,:), H_vv(:,:), H_vo(:,:)
+  type(gpu_double4) :: t2, r2, tau, tau_x
+  type(gpu_double2) :: t1, r1
+  type(gpu_double2) :: H_oo, H_vv, H_vo
+
+  type(gpu_double2) :: d_cc_space_f_oo, d_cc_space_f_vo
+  type(gpu_double2) :: d_cc_space_f_ov, d_cc_space_f_vv
+  type(gpu_double3) :: d_cc_space_v_oo_chol, d_cc_space_v_vo_chol
+  type(gpu_double3) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol
+  type(gpu_double4) :: d_cc_space_v_oovv

   double precision, allocatable :: all_err(:,:), all_t(:,:)
   integer, allocatable          :: list_occ(:), list_vir(:)
@ -20,7 +30,7 @@ subroutine run_ccsd_space_orb
   call set_multiple_levels_omp(.False.)

-  if (do_ao_cholesky) then
+  if (do_mo_cholesky) then
     PROVIDE cholesky_mo_transp
     FREE cholesky_ao
   else
@ -51,11 +61,46 @@ subroutine run_ccsd_space_orb
   !print*,'occ',list_occ
   !print*,'vir',list_vir

-  allocate(t2(nO,nO,nV,nV), r2(nO,nO,nV,nV))
-  allocate(tau(nO,nO,nV,nV))
-  allocate(tau_x(nO,nO,nV,nV))
-  allocate(t1(nO,nV), r1(nO,nV))
-  allocate(H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO))
+  ! GPU arrays
+  call gpu_allocate(d_cc_space_f_oo, nO, nO)
+  call gpu_allocate(d_cc_space_f_vo, nV, nO)
+  call gpu_allocate(d_cc_space_f_ov, nO, nV)
+  call gpu_allocate(d_cc_space_f_vv, nV, nV)
+
+  call gpu_upload(cc_space_f_oo, d_cc_space_f_oo)
+  call gpu_upload(cc_space_f_vo, d_cc_space_f_vo)
+  call gpu_upload(cc_space_f_vv, d_cc_space_f_vv)
+
+!  FREE cc_space_f_oo
+!  FREE cc_space_f_vo
+!  FREE cc_space_f_vv
+
+  if (do_mo_cholesky) then
+    call gpu_allocate(d_cc_space_v_oo_chol, cholesky_mo_num, nO, nO)
+    call gpu_allocate(d_cc_space_v_ov_chol, cholesky_mo_num, nO, nV)
+    call gpu_allocate(d_cc_space_v_vo_chol, cholesky_mo_num, nV, nO)
+    call gpu_allocate(d_cc_space_v_vv_chol, cholesky_mo_num, nV, nV)
+
+    call gpu_upload(cc_space_v_oo_chol, d_cc_space_v_oo_chol)
+    call gpu_upload(cc_space_v_ov_chol, d_cc_space_v_ov_chol)
+    call gpu_upload(cc_space_v_vo_chol, d_cc_space_v_vo_chol)
+    call gpu_upload(cc_space_v_vv_chol, d_cc_space_v_vv_chol)
+
+!    FREE cc_space_v_oo_chol
+!    FREE cc_space_v_ov_chol
+!    FREE cc_space_v_vo_chol
+!    FREE cc_space_v_vv_chol
+  endif
+
+  call gpu_allocate(t2, nO,nO,nV,nV)
+  call gpu_allocate(r2, nO,nO,nV,nV)
+  call gpu_allocate(tau, nO,nO,nV,nV)
+  call gpu_allocate(tau_x, nO,nO,nV,nV)
+  call gpu_allocate(t1, nO,nV)
+  call gpu_allocate(r1, nO,nV)
+  call gpu_allocate(H_oo, nO, nO)
+  call gpu_allocate(H_vo, nV, nO)
+  call gpu_allocate(H_vv, nV, nV)

   if (cc_update_method == 'diis') then
     double precision :: rss, diis_mem, extra_mem
@ -97,14 +142,29 @@ subroutine run_ccsd_space_orb
   endif

   ! Init
-  call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,t1)
-  call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,t2)
-  call update_tau_space(nO,nV,t1,t2,tau)
+  double precision, allocatable :: h_t1(:,:), h_t2(:,:,:,:)
+  allocate(h_t1(nO,nV), h_t2(nO,nO,nV,nV))
+
+  call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,h_t1)
+  call gpu_upload(h_t1, t1)
+  call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,h_t2)
+  call gpu_upload(h_t2, t2)
+
+  call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV)
+  call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv)
+!  FREE cc_space_v_oovv
+
+  call update_tau_space(nO,nV,h_t1,t1,t2,tau)
   call update_tau_x_space(nO,nV,tau,tau_x)

   !print*,'hf_energy', hf_energy
   call det_energy(det,uncorr_energy)
   print*,'Det energy', uncorr_energy

-  call ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
+  call ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy)
   print*,'Guess energy', uncorr_energy+energy, energy

   nb_iter = 0
@ -120,43 +180,40 @@ subroutine run_ccsd_space_orb
   do while (not_converged)

     ! Residue
-    if (do_ao_cholesky) then
-!    if (.False.) then
-      call compute_H_oo_chol(nO,nV,tau_x,H_oo)
-      call compute_H_vv_chol(nO,nV,tau_x,H_vv)
-      call compute_H_vo_chol(nO,nV,t1,H_vo)
-      call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
-      call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
+    if (do_mo_cholesky) then
+      call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo)
+      call compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, d_cc_space_v_ov_chol,H_vv)
+      call compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol, H_vo)
+
+      call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1)
+      call compute_r2_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r2%f,max_r2)
     else
-      call compute_H_oo(nO,nV,t1,t2,tau,H_oo)
-      call compute_H_vv(nO,nV,t1,t2,tau,H_vv)
-      call compute_H_vo(nO,nV,t1,t2,H_vo)
-      call compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
-      call compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
+      call compute_H_oo(nO,nV,t1%f,t2%f,tau%f,H_oo%f)
+      call compute_H_vv(nO,nV,t1%f,t2%f,tau%f,H_vv%f)
+      call compute_H_vo(nO,nV,t1%f,t2%f,H_vo%f)
+
+      call compute_r1_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r1%f,max_r1)
+      call compute_r2_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r2%f,max_r2)
     endif
     max_r = max(max_r1,max_r2)

     ! Update
     if (cc_update_method == 'diis') then
-      !call update_t_ccsd(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2)
-      !call update_t_ccsd_diis(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2)
-      call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1,r2,t1,t2,all_err,all_t)
+      call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1%f,r2%f,t1%f,t2%f,all_err,all_t)

     ! Standard update as T = T - Delta
     elseif (cc_update_method == 'none') then
-      call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1,t1)
-      call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2,t2)
+      call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1%f,t1%f)
+      call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2%f,t2%f)
     else
       print*,'Unkown cc_method_method: '//cc_update_method
     endif

-    call update_tau_space(nO,nV,t1,t2,tau)
+    call update_tau_space(nO,nV,t1%f,t1,t2,tau)
     call update_tau_x_space(nO,nV,tau,tau_x)

     ! Energy
-    call ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
+    call ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy)
     write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |'

     nb_iter = nb_iter + 1
@ -181,8 +238,8 @@ subroutine run_ccsd_space_orb
   print*,''

   if (write_amplitudes) then
-    call write_t1(nO,nV,t1)
-    call write_t2(nO,nV,t2)
+    call write_t1(nO,nV,t1%f)
+    call write_t2(nO,nV,t2%f)
     call ezfio_set_utils_cc_io_amplitudes('Read')
   endif
@ -191,7 +248,14 @@ subroutine run_ccsd_space_orb
     deallocate(all_err,all_t)
   endif

-  deallocate(H_vv,H_oo,H_vo,r1,r2,tau)
+  call gpu_deallocate(H_oo)
+  call gpu_deallocate(H_vv)
+  call gpu_deallocate(H_vo)
+
+  call gpu_deallocate(r1)
+  call gpu_deallocate(r2)
+  call gpu_deallocate(tau)
+  call gpu_deallocate(tau_x)

   ! CCSD(T)
   double precision :: e_t, e_t_err
@ -199,28 +263,14 @@ subroutine run_ccsd_space_orb
   if (cc_par_t .and. elec_alpha_num + elec_beta_num > 2) then

-    ! Dumb way
-    !call wall_time(ta)
-    !call ccsd_par_t_space(nO,nV,t1,t2,e_t)
-    !call wall_time(tb)
-    !print*,'Time: ',tb-ta, ' s'
-    !print*,''
-    !write(*,'(A15,F18.12,A3)') ' E(CCSD(T))  = ', uncorr_energy + energy + e_t, ' Ha'
-    !write(*,'(A15,F18.12,A3)') ' E(T)        = ', e_t, ' Ha'
-    !write(*,'(A15,F18.12,A3)') ' Correlation = ', energy + e_t, ' Ha'
-    !print*,''

     ! New
     e_t = uncorr_energy + energy ! For print in (T) call
     e_t_err = 0.d0

     print*,'Computing (T) correction...'
     call wall_time(ta)

-!    call ccsd_par_t_space_v3(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v &
-!        ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t)
-    call ccsd_par_t_space_stoch(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v &
+    call ccsd_par_t_space_stoch(nO,nV,t1%f,t2%f,cc_space_f_o,cc_space_f_v &
         ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t, e_t_err)

     call wall_time(tb)
@ -235,168 +285,147 @@ subroutine run_ccsd_space_orb
call save_energy(uncorr_energy + energy, e_t) call save_energy(uncorr_energy + energy, e_t)
deallocate(t1,t2) deallocate(h_t1, h_t2)
if (do_mo_cholesky) then
call gpu_deallocate(d_cc_space_v_oo_chol)
call gpu_deallocate(d_cc_space_v_ov_chol)
call gpu_deallocate(d_cc_space_v_vo_chol)
call gpu_deallocate(d_cc_space_v_vv_chol)
endif
call gpu_deallocate(d_cc_space_f_vo)
call gpu_deallocate(d_cc_space_v_oovv)
call gpu_deallocate(t1)
call gpu_deallocate(t2)
end end
! Energy ! Energy
subroutine ccsd_energy_space(nO,nV,tau,t1,energy) subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy)
use gpu
implicit none implicit none
integer, intent(in) :: nO, nV integer, intent(in) :: nO, nV
double precision, intent(in) :: tau(nO,nO,nV,nV) type(gpu_double4), intent(in) :: tau_x, d_cc_space_v_oovv
double precision, intent(in) :: t1(nO,nV) type(gpu_double2), intent(in) :: t1, d_cc_space_f_vo
double precision, intent(out) :: energy double precision, intent(out) :: energy
! internal ! internal
integer :: i,j,a,b integer :: i,j,a,b
double precision :: e double precision :: e
energy = 0d0 type(gpu_stream) :: s1, s2
!$omp parallel & call gpu_stream_create(s1)
!$omp shared(nO,nV,energy,tau,t1,& call gpu_stream_create(s2)
!$omp cc_space_f_vo,cc_space_w_oovv) &
!$omp private(i,j,a,b,e) &
!$omp default(none)
e = 0d0
!$omp do
do a = 1, nV
do i = 1, nO
e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a)
enddo
enddo
!$omp end do nowait
!$omp do
do b = 1, nV
do a = 1, nV
do j = 1, nO
do i = 1, nO
e = e + tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b)
enddo
enddo
enddo
enddo
!$omp end do nowait
!$omp critical
energy = energy + e
!$omp end critical
!$omp end parallel
end call gpu_set_stream(blas_handle,s1)
call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo, 1, t1, 1, e)
subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy) call gpu_set_stream(blas_handle,s2)
call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1_8, d_cc_space_v_oovv, 1_8, energy)
call gpu_set_stream(blas_handle,gpu_default_stream)
implicit none call gpu_synchronize()
call gpu_stream_destroy(s1)
call gpu_stream_destroy(s2)
integer, intent(in) :: nO, nV energy = energy + 2.d0*e
double precision, intent(in) :: tau_x(nO,nO,nV,nV)
double precision, intent(in) :: t1(nO,nV)
double precision, intent(out) :: energy
! internal
integer :: i,j,a,b
double precision :: e
energy = 0d0
!$omp parallel &
!$omp shared(nO,nV,energy,tau_x,t1,&
!$omp cc_space_f_vo,cc_space_v_oovv) &
!$omp private(i,j,a,b,e) &
!$omp default(none)
e = 0d0
!$omp do
do a = 1, nV
do i = 1, nO
e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a)
enddo
enddo
!$omp end do nowait
!$omp do
do b = 1, nV
do a = 1, nV
do j = 1, nO
do i = 1, nO
e = e + tau_x(i,j,a,b) * cc_space_v_oovv(i,j,a,b)
enddo
enddo
enddo
enddo
!$omp end do nowait
!$omp critical
energy = energy + e
!$omp end critical
!$omp end parallel
end end
! Tau ! Tau
subroutine update_tau_space(nO,nV,t1,t2,tau) subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau)
use gpu
implicit none implicit none
! in ! in
integer, intent(in) :: nO, nV integer, intent(in) :: nO, nV
double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV) double precision, intent(in) :: h_t1(nO,nV)
type(gpu_double2), intent(in) :: t1
type(gpu_double4), intent(in) :: t2
! out ! out
double precision, intent(out) :: tau(nO,nO,nV,nV) type(gpu_double4) :: tau
! internal ! internal
integer :: i,j,a,b integer :: i,j,a,b
type(gpu_stream) :: stream(nV)
!$OMP PARALLEL & !$OMP PARALLEL &
!$OMP SHARED(nO,nV,tau,t2,t1) & !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) &
!$OMP PRIVATE(i,j,a,b) & !$OMP PRIVATE(i,j,a,b) &
!$OMP DEFAULT(NONE) !$OMP DEFAULT(NONE)
!$OMP DO !$OMP DO
do b = 1, nV do b=1,nV
do a = 1, nV call gpu_stream_create(stream(b))
do j = 1, nO call gpu_set_stream(blas_handle,stream(b))
do i = 1, nO do j=1,nO
tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b) call gpu_dgeam_f(blas_handle, 'N', 'N', nO, nV, &
enddo 1.d0, t2%f(1,j,1,b), nO*nO, &
enddo h_t1(j,b), t1%f, nO, &
tau%f(1,j,1,b), nO*nO)
enddo enddo
enddo enddo
!$OMP END DO !$OMP END DO
!$OMP END PARALLEL !$OMP END PARALLEL
call gpu_synchronize()
do b=1,nV
call gpu_stream_destroy(stream(b))
enddo
call gpu_set_stream(blas_handle,gpu_default_stream)
end end
! --- removed (old CPU version) ---

subroutine update_tau_x_space(nO,nV,tau,tau_x)

  implicit none

  ! in
  integer, intent(in)           :: nO, nV
  double precision, intent(in)  :: tau(nO,nO,nV,nV)

  ! out
  double precision, intent(out) :: tau_x(nO,nO,nV,nV)

  ! internal
  integer :: i,j,a,b

  !$OMP PARALLEL &
  !$OMP SHARED(nO,nV,tau,tau_x) &
  !$OMP PRIVATE(i,j,a,b) &
  !$OMP DEFAULT(NONE)
  !$OMP DO
  do b = 1, nV
    do a = 1, nV
      do j = 1, nO
        do i = 1, nO
          tau_x(i,j,a,b) = 2.d0*tau(i,j,a,b) - tau(i,j,b,a)
        enddo
      enddo
    enddo
  enddo
  !$OMP END DO
  !$OMP END PARALLEL
end

! --- added (new GPU version) ---

subroutine update_tau_x_space(nO,nV,tau,tau_x)
  use gpu

  implicit none

  ! in
  integer, intent(in)           :: nO, nV
  type(gpu_double4), intent(in) :: tau

  ! out
  type(gpu_double4) :: tau_x

  ! internal
  integer :: i,j,a,b

  type(gpu_stream) :: stream(nV)

  do a=1,nV
    call gpu_stream_create(stream(a))
  enddo

  !$OMP PARALLEL &
  !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) &
  !$OMP PRIVATE(a,b) &
  !$OMP DEFAULT(NONE)
  !$OMP DO
  do b=1,nV
    do a=1,nV
      call gpu_set_stream(blas_handle,stream(a))
      call gpu_dgeam_f(blas_handle, 'N', 'N', nO, nO, &
          2.d0, tau%f(1,1,a,b), nO, &
          -1.d0, tau%f(1,1,b,a), nO, &
          tau_x%f(1,1,a,b), nO)
    enddo
  enddo
  !$OMP END DO
  !$OMP END PARALLEL

  call gpu_set_stream(blas_handle,gpu_default_stream)
  call gpu_synchronize()

  do b=1,nV
    call gpu_stream_destroy(stream(b))
  enddo
end
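In both routine pairs the arrays are defined by the same two formulas; the GPU versions simply map each one onto a dgeam call (C = alpha*A + beta*B) per contiguous slice:

  \tau_{ijab} = t^{(2)}_{ijab} + t^{(1)}_{ia}\, t^{(1)}_{jb}, \qquad \tau^{x}_{ijab} = 2\,\tau_{ijab} - \tau_{ijba}

For update_tau_space, each (j,b) slice uses alpha = 1 on t2 and beta = h_t1(j,b) on the t1 block; for update_tau_x_space, each (a,b) slice uses alpha = 2 on tau(:,:,a,b) and beta = -1 on tau(:,:,b,a).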
! R1
@ -1,81 +1,200 @@
! --- removed (old CPU versions) ---

subroutine ccsd_energy_space_chol(nO,nV,tau,t1,energy)

  implicit none

  integer, intent(in)           :: nO, nV
  double precision, intent(in)  :: tau(nO,nO,nV,nV)
  double precision, intent(in)  :: t1(nO,nV)
  double precision, intent(out) :: energy

  ! internal
  integer          :: i,j,a,b
  double precision :: e

  energy = 0d0
  !$omp parallel &
  !$omp shared(nO,nV,energy,tau,t1,&
  !$omp cc_space_f_vo,cc_space_w_oovv) &
  !$omp private(i,j,a,b,e) &
  !$omp default(none)
  e = 0d0
  !$omp do
  do a = 1, nV
    do i = 1, nO
      e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a)
    enddo
  enddo
  !$omp end do nowait
  !$omp do
  do b = 1, nV
    do a = 1, nV
      do j = 1, nO
        do i = 1, nO
          e = e + tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b)
        enddo
      enddo
    enddo
  enddo
  !$omp end do nowait
  !$omp critical
  energy = energy + e
  !$omp end critical
  !$omp end parallel
end

! Tau

subroutine update_tau_space_chol(nO,nV,t1,t2,tau)

  implicit none

  ! in
  integer, intent(in)           :: nO, nV
  double precision, intent(in)  :: t1(nO,nV), t2(nO,nO,nV,nV)

  ! out
  double precision, intent(out) :: tau(nO,nO,nV,nV)

  ! internal
  integer :: i,j,a,b

  !$OMP PARALLEL &
  !$OMP SHARED(nO,nV,tau,t2,t1) &
  !$OMP PRIVATE(i,j,a,b) &
  !$OMP DEFAULT(NONE)
  !$OMP DO
  do b = 1, nV
    do a = 1, nV
      do j = 1, nO
        do i = 1, nO
          tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b)
        enddo
      enddo
    enddo
  enddo
  !$OMP END DO
  !$OMP END PARALLEL
end

! --- added (new GPU version) ---

! H_oo

subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, &
    d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo)
  use gpu
  implicit none

  integer, intent(in)            :: nO,nV
  type(gpu_double2), intent(in)  :: d_cc_space_f_oo
  type(gpu_double3), intent(in)  :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol
  type(gpu_double4), intent(in)  :: tau_x
  type(gpu_double2), intent(out) :: H_oo

  integer :: a,b,i,j,u,k

  type(gpu_double3) :: tau_kau, tmp_vov, tmp_ovv

  call gpu_allocate(tau_kau, cholesky_mo_num, nV, nO)

  type(gpu_blas) :: blas

  !$OMP PARALLEL &
  !$OMP DEFAULT(SHARED) &
  !$OMP PRIVATE(blas,u,b,tmp_vov,tmp_ovv)

  !$OMP SINGLE
  !$OMP TASK
  call gpu_copy(d_cc_space_f_oo, H_oo)
  !$OMP END TASK
  !$OMP END SINGLE

  call gpu_allocate(tmp_ovv, nO, nV, nV)
  call gpu_allocate(tmp_vov, nV, nO, nV)

  call gpu_blas_create(blas)

  !$OMP DO
  do u=1,nO
    call gpu_dgeam_f(blas, 'N', 'N', 1, nO*nV*nV, 1.d0, &
        tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f, nO, tmp_ovv%f, 1)
    do b=1,nV
      call gpu_dgeam_f(blas, 'T', 'T', nV, nO, 1.d0, &
          tmp_ovv%f(1,1,b), nO, 0.d0, &
          tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV)
    enddo
    call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
        d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_vov%f, nV, &
        0.d0, tau_kau%f(1,1,u), cholesky_mo_num)
  enddo
  !$OMP END DO
  call gpu_blas_destroy(blas)
  call gpu_deallocate(tmp_vov)
  call gpu_deallocate(tmp_ovv)
  !$OMP TASKWAIT
  !$OMP END PARALLEL

  call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
      tau_kau, cholesky_mo_num*nV, d_cc_space_v_vo_chol, cholesky_mo_num*nV, &
      1.d0, H_oo, nO)

  call gpu_synchronize()
  call gpu_deallocate(tau_kau)
end
! H_vv
subroutine compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, &
d_cc_space_v_ov_chol,H_vv)
use gpu
implicit none
integer, intent(in) :: nO,nV
type(gpu_double2), intent(in) :: d_cc_space_f_vv
type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol
type(gpu_double4), intent(in) :: tau_x
type(gpu_double2), intent(out) :: H_vv
integer :: a,b,i,j,u,k, beta
type(gpu_double3) :: tau_kia, tmp_oov
call gpu_allocate(tau_kia, cholesky_mo_num, nO, nV)
type(gpu_blas) :: blas
!$OMP PARALLEL &
!$OMP DEFAULT(SHARED) &
!$OMP PRIVATE(a,b,tmp_oov,blas)
!$OMP SINGLE
!$OMP TASK
call gpu_copy(d_cc_space_f_vv, H_vv)
!$OMP END TASK
!$OMP END SINGLE
call gpu_blas_create(blas)
call gpu_allocate(tmp_oov, nO, nO, nV)
!$OMP DO
do a = 1, nV
do b=1,nV
call gpu_dgeam_f(blas, 'N', 'N', nO, nO, 1.d0, &
tau_x%f(1,1,a,b), nO, 0.d0, &
tau_x%f(1,1,a,b), nO, tmp_oov%f(1,1,b), nO)
enddo
call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nO,nO*nV,1.d0, &
d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov%f, nO, &
0.d0, tau_kia%f(1,1,a), cholesky_mo_num)
enddo
!$OMP END DO
call gpu_blas_destroy(blas)
call gpu_deallocate(tmp_oov)
!$OMP TASKWAIT
!$OMP END PARALLEL
call gpu_dgemm(blas_handle,'T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
tau_kia, cholesky_mo_num*nO, d_cc_space_v_ov_chol, cholesky_mo_num*nO, &
1.d0, H_vv, nV)
call gpu_synchronize()
call gpu_deallocate(tau_kia)
end
! H_vo
subroutine compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, &
d_cc_space_v_ov_chol,d_cc_space_v_vo_chol, H_vo)
use gpu
implicit none
integer, intent(in) :: nO,nV
type(gpu_double2), intent(in) :: t1, d_cc_space_f_vo
type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol
type(gpu_double2), intent(out) :: H_vo
integer :: a,b,i,j,u,k
type(gpu_double1) :: tmp_k
type(gpu_double3) :: tmp, tmp2
call gpu_copy(d_cc_space_f_vo, H_vo)
call gpu_allocate(tmp_k, cholesky_mo_num)
call gpu_dgemm(blas_handle, 'N', 'N', cholesky_mo_num, 1, nO*nV, 2.d0, &
d_cc_space_v_ov_chol, cholesky_mo_num, &
t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num)
call gpu_dgemm(blas_handle, 'T','N',nV*nO,1,cholesky_mo_num,1.d0, &
d_cc_space_v_vo_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, &
H_vo, nV*nO)
call gpu_deallocate(tmp_k)
call gpu_allocate(tmp, cholesky_mo_num, nO, nO)
call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nO, nO, nV, 1.d0, &
d_cc_space_v_ov_chol, cholesky_mo_num*nO, t1, nO, 0.d0, tmp, cholesky_mo_num*nO)
call gpu_allocate(tmp2, cholesky_mo_num, nO, nO)
type(gpu_stream) :: stream(nO)
do i=1,nO
call gpu_stream_create(stream(i))
enddo
!$OMP PARALLEL DO COLLAPSE(2) PRIVATE(i,j)
do i=1,nO
do j=1,nO
call gpu_set_stream(blas_handle,stream(j))
call gpu_dgeam_f(blas_handle, 'N', 'N', cholesky_mo_num, 1, 1.d0, &
tmp%f(1,i,j), cholesky_mo_num, 0.d0, &
tmp%f(1,i,j), cholesky_mo_num, tmp2%f(1,j,i), cholesky_mo_num)
enddo
enddo
!$OMP END PARALLEL DO
call gpu_set_stream(blas_handle,gpu_default_stream)
call gpu_synchronize()
do i=1,nO
call gpu_stream_destroy(stream(i))
enddo
call gpu_deallocate(tmp)
call gpu_dgemm(blas_handle, 'T','N', nV, nO, cholesky_mo_num*nO, -1.d0, &
d_cc_space_v_ov_chol, cholesky_mo_num*nO, tmp2, cholesky_mo_num*nO, &
1.d0, H_vo, nV)
call gpu_synchronize()
call gpu_deallocate(tmp2)
end
! R1
@ -291,154 +410,6 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
end

! --- removed (old CPU versions of the H intermediates) ---

! H_oo
subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo)
implicit none
integer, intent(in) :: nO,nV
double precision, intent(in) :: tau_x(nO, nO, nV, nV)
double precision, intent(out) :: H_oo(nO, nO)
integer :: a,b,i,j,u,k
double precision, allocatable :: tau_kau(:,:,:), tmp_vov(:,:,:)
allocate(tau_kau(cholesky_mo_num,nV,nO))
!$omp parallel &
!$omp default(shared) &
!$omp private(i,u,j,k,a,b,tmp_vov)
allocate(tmp_vov(nV,nO,nV) )
!$omp do
do u = 1, nO
do b=1,nV
do j=1,nO
do a=1,nV
tmp_vov(a,j,b) = tau_x(u,j,a,b)
enddo
enddo
enddo
call dgemm('N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
cc_space_v_ov_chol, cholesky_mo_num, tmp_vov, nV, &
0.d0, tau_kau(1,1,u), cholesky_mo_num)
enddo
!$omp end do nowait
deallocate(tmp_vov)
!$omp do
do i = 1, nO
do u = 1, nO
H_oo(u,i) = cc_space_f_oo(u,i)
enddo
enddo
!$omp end do nowait
!$omp barrier
!$omp end parallel
call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
tau_kau, cholesky_mo_num*nV, cc_space_v_vo_chol, cholesky_mo_num*nV, &
1.d0, H_oo, nO)
end
! H_vv
subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv)
implicit none
integer, intent(in) :: nO,nV
double precision, intent(in) :: tau_x(nO, nO, nV, nV)
double precision, intent(out) :: H_vv(nV, nV)
integer :: a,b,i,j,u,k, beta
double precision, allocatable :: tau_kia(:,:,:), tmp_oov(:,:,:)
allocate(tau_kia(cholesky_mo_num,nO,nV))
!$omp parallel &
!$omp default(shared) &
!$omp private(i,beta,j,k,a,b,tmp_oov)
allocate(tmp_oov(nO,nO,nV) )
!$omp do
do a = 1, nV
do b=1,nV
do j=1,nO
do i=1,nO
tmp_oov(i,j,b) = tau_x(i,j,a,b)
enddo
enddo
enddo
call dgemm('N','T',cholesky_mo_num,nO,nO*nV,1.d0, &
cc_space_v_ov_chol, cholesky_mo_num, tmp_oov, nO, &
0.d0, tau_kia(1,1,a), cholesky_mo_num)
enddo
!$omp end do nowait
deallocate(tmp_oov)
!$omp do
do beta = 1, nV
do a = 1, nV
H_vv(a,beta) = cc_space_f_vv(a,beta)
enddo
enddo
!$omp end do nowait
!$omp barrier
!$omp end parallel
call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
tau_kia, cholesky_mo_num*nO, cc_space_v_ov_chol, cholesky_mo_num*nO, &
1.d0, H_vv, nV)
end
! H_vo
subroutine compute_H_vo_chol(nO,nV,t1,H_vo)
implicit none
integer, intent(in) :: nO,nV
double precision, intent(in) :: t1(nO, nV)
double precision, intent(out) :: H_vo(nV, nO)
integer :: a,b,i,j,u,k
double precision, allocatable :: tmp_k(:), tmp(:,:,:), tmp2(:,:,:)
do i=1,nO
do a=1,nV
H_vo(a,i) = cc_space_f_vo(a,i)
enddo
enddo
allocate(tmp_k(cholesky_mo_num))
call dgemm('N', 'N', cholesky_mo_num, 1, nO*nV, 2.d0, &
cc_space_v_ov_chol, cholesky_mo_num, &
t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num)
call dgemm('T','N',nV*nO,1,cholesky_mo_num,1.d0, &
cc_space_v_vo_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, &
H_vo, nV*nO)
deallocate(tmp_k)
allocate(tmp(cholesky_mo_num,nO,nO))
allocate(tmp2(cholesky_mo_num,nO,nO))
call dgemm('N','T', cholesky_mo_num*nO, nO, nV, 1.d0, &
cc_space_v_ov_chol, cholesky_mo_num*nO, t1, nO, 0.d0, tmp, cholesky_mo_num*nO)
do i=1,nO
do j=1,nO
do k=1,cholesky_mo_num
tmp2(k,j,i) = tmp(k,i,j)
enddo
enddo
enddo
deallocate(tmp)
call dgemm('T','N', nV, nO, cholesky_mo_num*nO, -1.d0, &
cc_space_v_ov_chol, cholesky_mo_num*nO, tmp2, cholesky_mo_num*nO, &
1.d0, H_vo, nV)
end
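With L_k(p,q) the Cholesky vectors of the two-electron integrals, (pq|rs) = \sum_k L_k(p,q) L_k(r,s), the dgemm sequences in the CPU versions above and in the GPU versions earlier in the patch appear to implement (a reading of the code, not stated in the patch):

  H^{oo}_{ui} = f^{oo}_{ui} + \sum_{jab} \tau^{x}_{ujab}\,(ai|jb), \qquad
  H^{vv}_{ab} = f^{vv}_{ab} - \sum_{ijc} \tau^{x}_{ijac}\,(ib|jc),
  \qquad
  H^{vo}_{ai} = f^{vo}_{ai} + \sum_{jb} \bigl[\, 2\,(ai|jb) - (ja|ib) \,\bigr]\, t_{jb}

The tmp_vov / tmp_oov / tmp2 transpositions only put tau_x and t1 into the layout expected by the contractions over the composite Cholesky index.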
! R2
src/gpu/NEED Normal file
@ -0,0 +1 @@
gpu_arch
src/gpu/README.rst Normal file
@ -0,0 +1,6 @@
===
gpu
===
Bindings for GPU routines (architecture independent).
Architecture-dependent files are in gpu_arch.
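For orientation, a minimal sketch of how these bindings are used from Fortran, based on the gpu module interfaces added below; the array names and sizes here are illustrative, not part of the patch:

  program gpu_demo
    use gpu
    implicit none
    type(gpu_double2) :: d_a, d_b, d_c   ! device arrays: c_ptr plus Fortran view %f
    type(gpu_blas)    :: handle
    double precision  :: a(4,4), c(4,4)

    a = 1.d0
    call gpu_blas_create(handle)
    call gpu_allocate(d_a, 4, 4)          ! generic interface -> gpu_allocate_double2
    call gpu_allocate(d_b, 4, 4)
    call gpu_allocate(d_c, 4, 4)
    call gpu_upload(a, d_a)               ! host -> device
    call gpu_copy(d_a, d_b)               ! device -> device
    call gpu_dgemm(handle, 'N', 'N', 4, 4, 4, &
        1.d0, d_a, 4, d_b, 4, 0.d0, d_c, 4)   ! C = A.B on the device
    call gpu_synchronize()
    call gpu_download(d_c, c)             ! device -> host
    call gpu_deallocate(d_a)
    call gpu_deallocate(d_b)
    call gpu_deallocate(d_c)
    call gpu_blas_destroy(handle)
  end program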
src/gpu/gpu.h Normal file
@ -0,0 +1,41 @@
#include <stdint.h>
int gpu_ndevices();
void gpu_set_device(int32_t i);
void gpu_allocate(void** ptr, const int64_t n);
void gpu_deallocate(void** ptr);
void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n);
void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n);
void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n);
void gpu_stream_create(void** ptr);
void gpu_stream_destroy(void** ptr);
void gpu_set_stream(void* handle, void* stream);
void gpu_synchronize();
void gpu_blas_create(void** handle);
void gpu_blas_destroy(void** handle);
void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result);
void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result);
void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double alpha,
const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy);
void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float alpha,
const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy);
void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc);
void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc);
void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc);
void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc);
src/gpu/gpu.irp.f Normal file
@ -0,0 +1,26 @@
use gpu
BEGIN_PROVIDER [ type(gpu_blas), blas_handle ]
implicit none
BEGIN_DOC
! Handle for cuBLAS or RocBLAS
END_DOC
call gpu_blas_create(blas_handle)
END_PROVIDER
BEGIN_PROVIDER [ type(gpu_stream), gpu_default_stream ]
implicit none
BEGIN_DOC
! Default stream
END_DOC
gpu_default_stream%c = C_NULL_PTR
END_PROVIDER
BEGIN_PROVIDER [ integer, gpu_num ]
implicit none
BEGIN_DOC
! Number of usable GPUs
END_DOC
gpu_num = gpu_ndevices()
END_PROVIDER
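These providers are what lets the CCSD kernels above overlap independent BLAS calls. A minimal sketch of the pattern (inside an .irp.f file, so blas_handle and gpu_default_stream are provided automatically):

  subroutine demo_streams()
    use gpu
    implicit none
    type(gpu_stream) :: s1

    call gpu_stream_create(s1)
    call gpu_set_stream(blas_handle, s1)   ! subsequent BLAS calls enqueue on s1
    ! ... asynchronous gpu_ddot / gpu_dgemm calls here ...
    call gpu_set_stream(blas_handle, gpu_default_stream)
    call gpu_synchronize()                 ! block until all queued work completes
    call gpu_stream_destroy(s1)
  end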
src/gpu/gpu_module.F90 Normal file
@ -0,0 +1,725 @@
module gpu
use, intrinsic :: iso_c_binding
implicit none
! Data types
! ----------
type gpu_double1
type(c_ptr) :: c
double precision, pointer :: f(:)
end type
type gpu_double2
type(c_ptr) :: c
double precision, pointer :: f(:,:)
end type
type gpu_double3
type(c_ptr) :: c
double precision, pointer :: f(:,:,:)
end type
type gpu_double4
type(c_ptr) :: c
double precision, pointer :: f(:,:,:,:)
end type
type gpu_double5
type(c_ptr) :: c
double precision, pointer :: f(:,:,:,:,:)
end type
type gpu_double6
type(c_ptr) :: c
double precision, pointer :: f(:,:,:,:,:,:)
end type
type gpu_blas
type(c_ptr) :: c
end type
type gpu_stream
type(c_ptr) :: c
end type
! C interfaces
! ------------
interface
logical(c_bool) function no_gpu() bind(C)
import
end function
integer function gpu_ndevices() bind(C)
import
end function
subroutine gpu_set_device(id) bind(C)
import
integer(c_int32_t), value :: id
end subroutine
subroutine gpu_allocate_c(ptr, n) bind(C, name='gpu_allocate')
import
type(c_ptr) :: ptr
integer(c_int64_t), value :: n
end subroutine
subroutine gpu_deallocate_c(ptr) bind(C, name='gpu_deallocate')
import
type(c_ptr) :: ptr
end subroutine
subroutine gpu_upload_c(cpu_ptr, gpu_ptr, n) bind(C, name='gpu_upload')
import
type(c_ptr), value :: cpu_ptr
type(c_ptr), value :: gpu_ptr
integer(c_int64_t), value :: n
end subroutine
subroutine gpu_download_c(gpu_ptr, cpu_ptr, n) bind(C, name='gpu_download')
import
type(c_ptr), value :: gpu_ptr
type(c_ptr), value :: cpu_ptr
integer(c_int64_t), value :: n
end subroutine
subroutine gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, n) bind(C, name='gpu_copy')
import
type(c_ptr), value :: gpu_ptr_src
type(c_ptr), value :: gpu_ptr_dest
integer(c_int64_t), value :: n
end subroutine
subroutine gpu_stream_create_c(stream) bind(C, name='gpu_stream_create')
import
type(c_ptr) :: stream
end subroutine
subroutine gpu_stream_destroy_c(stream) bind(C, name='gpu_stream_destroy')
import
type(c_ptr) :: stream
end subroutine
subroutine gpu_set_stream_c(handle, stream) bind(C, name='gpu_set_stream')
import
type(c_ptr), value :: handle, stream
end subroutine
subroutine gpu_synchronize() bind(C)
import
end subroutine
subroutine gpu_blas_create_c(handle) bind(C, name='gpu_blas_create')
import
type(c_ptr) :: handle
end subroutine
subroutine gpu_blas_destroy_c(handle) bind(C, name='gpu_blas_destroy')
import
type(c_ptr) :: handle
end subroutine
subroutine gpu_ddot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_ddot')
import
type(c_ptr), value, intent(in) :: handle
integer(c_int64_t), value :: n, incx, incy
type(c_ptr), value :: dx, dy
real(c_double), intent(out) :: res
end subroutine
subroutine gpu_sdot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_sdot')
import
type(c_ptr), value, intent(in) :: handle
integer(c_int64_t), value :: n, incx, incy
type(c_ptr), intent(in), value :: dx, dy
real(c_float), intent(out) :: res
end subroutine
subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
b, ldb, c, ldc) bind(C, name='gpu_dgeam')
import
type(c_ptr), value, intent(in) :: handle
character(c_char), intent(in), value :: transa, transb
integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
real(c_double), intent(in), value :: alpha, beta
type(c_ptr), value :: a, b, c
end subroutine
subroutine gpu_sgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
b, ldb, c, ldc) bind(C, name='gpu_sgeam')
import
type(c_ptr), value, intent(in) :: handle
character(c_char), intent(in), value :: transa, transb
integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
real(c_float), intent(in), value :: alpha, beta
type(c_ptr), value :: a, b, c
end subroutine
subroutine gpu_dgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, &
b, ldb, beta, c, ldc) bind(C, name='gpu_dgemm')
import
type(c_ptr), value, intent(in) :: handle
character(c_char), intent(in), value :: transa, transb
integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc
real(c_double), intent(in), value :: alpha, beta
type(c_ptr), value :: a, b, c
end subroutine
subroutine gpu_sgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, &
b, ldb, beta, c, ldc) bind(C, name='gpu_sgemm')
import
type(c_ptr), value, intent(in) :: handle
character(c_char), intent(in), value :: transa, transb
integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc
real(c_float), intent(in), value :: alpha, beta
type(c_ptr), value :: a, b, c
end subroutine
end interface
! Polymorphic interfaces
! ----------------------
interface gpu_allocate
procedure gpu_allocate_double1 &
,gpu_allocate_double2 &
,gpu_allocate_double3 &
,gpu_allocate_double4 &
,gpu_allocate_double5 &
,gpu_allocate_double6 &
,gpu_allocate_double1_64 &
,gpu_allocate_double2_64 &
,gpu_allocate_double3_64 &
,gpu_allocate_double4_64 &
,gpu_allocate_double5_64 &
,gpu_allocate_double6_64
end interface gpu_allocate
interface gpu_deallocate
procedure gpu_deallocate_double1 &
,gpu_deallocate_double2 &
,gpu_deallocate_double3 &
,gpu_deallocate_double4 &
,gpu_deallocate_double5 &
,gpu_deallocate_double6
end interface gpu_deallocate
interface gpu_upload
procedure gpu_upload_double1 &
,gpu_upload_double2 &
,gpu_upload_double3 &
,gpu_upload_double4 &
,gpu_upload_double5 &
,gpu_upload_double6
end interface gpu_upload
interface gpu_download
procedure gpu_download_double1 &
,gpu_download_double2 &
,gpu_download_double3 &
,gpu_download_double4 &
,gpu_download_double5 &
,gpu_download_double6
end interface gpu_download
interface gpu_copy
procedure gpu_copy_double1 &
,gpu_copy_double2 &
,gpu_copy_double3 &
,gpu_copy_double4 &
,gpu_copy_double5 &
,gpu_copy_double6
end interface gpu_copy
contains
! gpu_allocate
! ------------
subroutine gpu_allocate_double1(ptr, s)
implicit none
type(gpu_double1), intent(inout) :: ptr
integer, intent(in) :: s
call gpu_allocate_c(ptr%c, s*8_8)
call c_f_pointer(ptr%c, ptr%f, (/ s /))
end subroutine
subroutine gpu_allocate_double2(ptr, s1, s2)
implicit none
type(gpu_double2), intent(inout) :: ptr
integer, intent(in) :: s1, s2
call gpu_allocate_c(ptr%c, s1*s2*8_8)
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /))
end subroutine
subroutine gpu_allocate_double3(ptr, s1, s2, s3)
implicit none
type(gpu_double3), intent(inout) :: ptr
integer, intent(in) :: s1, s2, s3
call gpu_allocate_c(ptr%c, s1*s2*s3*8_8)
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /))
end subroutine
subroutine gpu_allocate_double4(ptr, s1, s2, s3, s4)
implicit none
type(gpu_double4), intent(inout) :: ptr
integer, intent(in) :: s1, s2, s3, s4
call gpu_allocate_c(ptr%c, s1*s2*s3*s4*8_8)
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /))
end subroutine
subroutine gpu_allocate_double5(ptr, s1, s2, s3, s4, s5)
implicit none
type(gpu_double5), intent(inout) :: ptr
integer, intent(in) :: s1, s2, s3, s4, s5
call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*8_8)
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /))
end subroutine
subroutine gpu_allocate_double6(ptr, s1, s2, s3, s4, s5, s6)
implicit none
type(gpu_double6), intent(inout) :: ptr
integer, intent(in) :: s1, s2, s3, s4, s5, s6
call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*s6*8_8)
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /))
end subroutine
subroutine gpu_allocate_double1_64(ptr, s)
implicit none
type(gpu_double1), intent(inout) :: ptr
integer*8, intent(in) :: s
call gpu_allocate_c(ptr%c, s*8_8)   ! size in bytes, consistent with the other wrappers
call c_f_pointer(ptr%c, ptr%f, (/ s /))
end subroutine
subroutine gpu_allocate_double2_64(ptr, s1, s2)
implicit none
type(gpu_double2), intent(inout) :: ptr
integer*8, intent(in) :: s1, s2
call gpu_allocate_c(ptr%c, s1*s2*8_8)
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /))
end subroutine
subroutine gpu_allocate_double3_64(ptr, s1, s2, s3)
implicit none
type(gpu_double3), intent(inout) :: ptr
integer*8, intent(in) :: s1, s2, s3
call gpu_allocate_c(ptr%c, s1*s2*s3*8_8)
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /))
end subroutine
subroutine gpu_allocate_double4_64(ptr, s1, s2, s3, s4)
implicit none
type(gpu_double4), intent(inout) :: ptr
integer*8, intent(in) :: s1, s2, s3, s4
call gpu_allocate_c(ptr%c, s1*s2*s3*s4*8_8)
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /))
end subroutine
subroutine gpu_allocate_double5_64(ptr, s1, s2, s3, s4, s5)
implicit none
type(gpu_double5), intent(inout) :: ptr
integer*8, intent(in) :: s1, s2, s3, s4, s5
call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*8_8)
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /))
end subroutine
subroutine gpu_allocate_double6_64(ptr, s1, s2, s3, s4, s5, s6)
implicit none
type(gpu_double6), intent(inout) :: ptr
integer*8, intent(in) :: s1, s2, s3, s4, s5, s6
call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*s6*8_8)
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /))
end subroutine
! gpu_deallocate
! --------------
subroutine gpu_deallocate_double1(ptr)
implicit none
type(gpu_double1), intent(inout) :: ptr
call gpu_deallocate_c(ptr%c)
NULLIFY(ptr%f)
end subroutine
subroutine gpu_deallocate_double2(ptr)
implicit none
type(gpu_double2), intent(inout) :: ptr
call gpu_deallocate_c(ptr%c)
NULLIFY(ptr%f)
end subroutine
subroutine gpu_deallocate_double3(ptr)
implicit none
type(gpu_double3), intent(inout) :: ptr
call gpu_deallocate_c(ptr%c)
NULLIFY(ptr%f)
end subroutine
subroutine gpu_deallocate_double4(ptr)
implicit none
type(gpu_double4), intent(inout) :: ptr
call gpu_deallocate_c(ptr%c)
NULLIFY(ptr%f)
end subroutine
subroutine gpu_deallocate_double5(ptr)
implicit none
type(gpu_double5), intent(inout) :: ptr
call gpu_deallocate_c(ptr%c)
NULLIFY(ptr%f)
end subroutine
subroutine gpu_deallocate_double6(ptr)
implicit none
type(gpu_double6), intent(inout) :: ptr
call gpu_deallocate_c(ptr%c)
NULLIFY(ptr%f)
end subroutine
! gpu_upload
! ----------
subroutine gpu_upload_double1(cpu_ptr, gpu_ptr)
implicit none
double precision, target, intent(in) :: cpu_ptr(*)
type(gpu_double1), intent(in) :: gpu_ptr
call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, 8_8*size(gpu_ptr%f))
end subroutine
subroutine gpu_upload_double2(cpu_ptr, gpu_ptr)
implicit none
double precision, target, intent(in) :: cpu_ptr(:,:)
type(gpu_double2), intent(in) :: gpu_ptr
call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
end subroutine
subroutine gpu_upload_double3(cpu_ptr, gpu_ptr)
implicit none
double precision, target, intent(in) :: cpu_ptr(:,:,:)
type(gpu_double3), intent(in) :: gpu_ptr
call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
end subroutine
subroutine gpu_upload_double4(cpu_ptr, gpu_ptr)
implicit none
double precision, target, intent(in) :: cpu_ptr(:,:,:,:)
type(gpu_double4), intent(in) :: gpu_ptr
call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
end subroutine
subroutine gpu_upload_double5(cpu_ptr, gpu_ptr)
implicit none
double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:)
type(gpu_double5), intent(in) :: gpu_ptr
call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
end subroutine
subroutine gpu_upload_double6(cpu_ptr, gpu_ptr)
implicit none
double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:,:)
type(gpu_double6), intent(in) :: gpu_ptr
call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
end subroutine
! gpu_download
! ------------
subroutine gpu_download_double1(gpu_ptr, cpu_ptr)
implicit none
type(gpu_double1), intent(in) :: gpu_ptr
double precision, target, intent(in) :: cpu_ptr(:)
call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*size(gpu_ptr%f))
end subroutine
subroutine gpu_download_double2(gpu_ptr, cpu_ptr)
implicit none
type(gpu_double2), intent(in) :: gpu_ptr
double precision, target, intent(in) :: cpu_ptr(:,:)
call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
end subroutine
subroutine gpu_download_double3(gpu_ptr, cpu_ptr)
implicit none
type(gpu_double3), intent(in) :: gpu_ptr
double precision, target, intent(in) :: cpu_ptr(:,:,:)
call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
end subroutine
subroutine gpu_download_double4(gpu_ptr, cpu_ptr)
implicit none
type(gpu_double4), intent(in) :: gpu_ptr
double precision, target, intent(in) :: cpu_ptr(:,:,:,:)
call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
end subroutine
subroutine gpu_download_double5(gpu_ptr, cpu_ptr)
implicit none
type(gpu_double5), intent(in) :: gpu_ptr
double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:)
call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
end subroutine
subroutine gpu_download_double6(gpu_ptr, cpu_ptr)
implicit none
type(gpu_double6), intent(in) :: gpu_ptr
double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:,:)
call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
end subroutine
! gpu_copy
! --------
subroutine gpu_copy_double1(gpu_ptr_src, gpu_ptr_dest)
implicit none
type(gpu_double1), intent(in) :: gpu_ptr_src
type(gpu_double1), intent(in) :: gpu_ptr_dest
call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*size(gpu_ptr_dest%f))
end subroutine
subroutine gpu_copy_double2(gpu_ptr_src, gpu_ptr_dest)
implicit none
type(gpu_double2), intent(in) :: gpu_ptr_src
type(gpu_double2), intent(in) :: gpu_ptr_dest
call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
end subroutine
subroutine gpu_copy_double3(gpu_ptr_src, gpu_ptr_dest)
implicit none
type(gpu_double3), intent(in) :: gpu_ptr_src
type(gpu_double3), intent(in) :: gpu_ptr_dest
call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
end subroutine
subroutine gpu_copy_double4(gpu_ptr_src, gpu_ptr_dest)
implicit none
type(gpu_double4), intent(in) :: gpu_ptr_src
type(gpu_double4), intent(in) :: gpu_ptr_dest
call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
end subroutine
subroutine gpu_copy_double5(gpu_ptr_src, gpu_ptr_dest)
implicit none
type(gpu_double5), intent(in) :: gpu_ptr_src
type(gpu_double5), intent(in) :: gpu_ptr_dest
call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
end subroutine
subroutine gpu_copy_double6(gpu_ptr_src, gpu_ptr_dest)
implicit none
type(gpu_double6), intent(in) :: gpu_ptr_src
type(gpu_double6), intent(in) :: gpu_ptr_dest
call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
end subroutine
! gpu_stream
! ----------
subroutine gpu_stream_create(stream)
type(gpu_stream) :: stream
call gpu_stream_create_c(stream%c)
end subroutine
subroutine gpu_stream_destroy(stream)
type(gpu_stream) :: stream
call gpu_stream_destroy_c(stream%c)
end subroutine
subroutine gpu_set_stream(handle, stream)
type(gpu_blas) :: handle
type(gpu_stream) :: stream
call gpu_set_stream_c(handle%c, stream%c)
end subroutine
! gpu_blas
! --------
subroutine gpu_blas_create(handle)
type(gpu_blas) :: handle
call gpu_blas_create_c(handle%c)
end subroutine
subroutine gpu_blas_destroy(handle)
type(gpu_blas) :: handle
call gpu_blas_destroy_c(handle%c)
end subroutine
end module
! dot
! ---
subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res)
use gpu
type(gpu_blas), intent(in) :: handle
integer*4 :: n, incx, incy
type(gpu_double1), intent(in) :: dx, dy
double precision, intent(out) :: res
call gpu_ddot_c(handle%c, int(n,c_int64_t), dx%c, int(incx,c_int64_t), dy%c, int(incy,c_int64_t), res)
end subroutine
subroutine gpu_ddot_f(handle, n, dx, incx, dy, incy, res)
use gpu
type(gpu_blas), intent(in) :: handle
integer*4 :: n, incx, incy
double precision, target :: dx(*), dy(*)
double precision, intent(out) :: res
call gpu_ddot_c(handle%c, int(n,c_int64_t), c_loc(dx), int(incx,c_int64_t), c_loc(dy), int(incy,c_int64_t), res)
end subroutine
subroutine gpu_ddot_64(handle, n, dx, incx, dy, incy, res)
use gpu
type(gpu_blas), intent(in) :: handle
integer*8 :: n, incx, incy
type(gpu_double1), intent(in) :: dx, dy
double precision, intent(out) :: res
call gpu_ddot_c(handle%c, n, dx%c, incx, dy%c, incy, res)
end subroutine
subroutine gpu_ddot_f_64(handle, n, dx, incx, dy, incy, res)
use gpu
type(gpu_blas), intent(in) :: handle
integer*8 :: n, incx, incy
double precision, target :: dx(*), dy(*)
double precision, intent(out) :: res
call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res)
end subroutine
! geam
! ----
subroutine gpu_dgeam(handle, transa, transb, m, n, alpha, a, lda, beta, &
b, ldb, c, ldc)
use gpu
type(gpu_blas), intent(in) :: handle
character, intent(in) :: transa, transb
integer*4, intent(in) :: m, n, lda, ldb, ldc
double precision, intent(in) :: alpha, beta
type(gpu_double2) :: a, b, c
call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, a%c, int(lda,c_int64_t), beta, &
b%c, int(ldb,c_int64_t), c%c, int(ldc,c_int64_t))
end subroutine
subroutine gpu_dgeam_f(handle, transa, transb, m, n, alpha, a, lda, beta, &
b, ldb, c, ldc)
use gpu
type(gpu_blas), intent(in) :: handle
character, intent(in) :: transa, transb
integer*4, intent(in) :: m, n, lda, ldb, ldc
double precision, intent(in) :: alpha, beta
double precision, target :: a(*), b(*), c(*)
call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, &
c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t))
end subroutine
subroutine gpu_dgeam_64(handle, transa, transb, m, n, alpha, a, lda, beta, &
b, ldb, c, ldc)
use gpu
type(gpu_blas), intent(in) :: handle
character, intent(in) :: transa, transb
integer*8, intent(in) :: m, n, lda, ldb, ldc
double precision, intent(in) :: alpha, beta
type(gpu_double2) :: a, b, c
call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, a%c, int(lda,c_int64_t), beta, &
b%c, int(ldb,c_int64_t), c%c, int(ldc,c_int64_t))
end subroutine
subroutine gpu_dgeam_f_64(handle, transa, transb, m, n, alpha, a, lda, beta, &
b, ldb, c, ldc)
use gpu
type(gpu_blas), intent(in) :: handle
character, intent(in) :: transa, transb
integer*8, intent(in) :: m, n, lda, ldb, ldc
double precision, intent(in) :: alpha, beta
double precision, target :: a(*), b(*), c(*)
call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, &
c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t))
end subroutine
! gemm
! ----
subroutine gpu_dgemm(handle, transa, transb, m, n, k, alpha, a, lda, &
b, ldb, beta, c, ldc)
use gpu
type(gpu_blas), intent(in) :: handle
character, intent(in) :: transa, transb
integer*4, intent(in) :: m, n, k, lda, ldb, ldc
double precision, intent(in) :: alpha, beta
type(gpu_double2) :: a, b, c
call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), &
alpha, a%c, int(lda,c_int64_t), &
b%c, int(ldb,c_int64_t), beta, c%c, int(ldc,c_int64_t))
end subroutine
subroutine gpu_dgemm_64(handle, transa, transb, m, n, k, alpha, a, lda, &
b, ldb, beta, c, ldc)
use gpu
type(gpu_blas), intent(in) :: handle
character, intent(in) :: transa, transb
integer*8, intent(in) :: m, n, k, lda, ldb, ldc
double precision, intent(in) :: alpha, beta
type(gpu_double2) :: a, b, c
call gpu_dgemm_c(handle%c, transa, transb, m, n, k, &
alpha, a%c, lda, b%c, ldb, beta, c%c, ldc)
end subroutine
subroutine gpu_dgemm_f(handle, transa, transb, m, n, k, alpha, a, lda, &
b, ldb, beta, c, ldc)
use gpu
type(gpu_blas), intent(in) :: handle
character, intent(in) :: transa, transb
integer*4, intent(in) :: m, n, k, lda, ldb, ldc
double precision, intent(in) :: alpha, beta
double precision, target :: a(*), b(*), c(*)
call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), &
alpha, c_loc(a), int(lda,c_int64_t), &
c_loc(b), int(ldb,c_int64_t), beta, c_loc(c), int(ldc,c_int64_t))
end subroutine
subroutine gpu_dgemm_f_64(handle, transa, transb, m, n, k, alpha, a, lda, &
b, ldb, beta, c, ldc)
use gpu
type(gpu_blas), intent(in) :: handle
character, intent(in) :: transa, transb
integer*8, intent(in) :: m, n, k, lda, ldb, ldc
double precision, intent(in) :: alpha, beta
double precision, target :: a(*), b(*), c(*)
call gpu_dgemm_c(handle%c, transa, transb, m, n, k, &
alpha, c_loc(a), lda, c_loc(b), ldb, beta, c_loc(c), ldc)
end subroutine
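Note the two flavors of each wrapper: the plain versions take gpu_double* handles, while the _f versions take raw Fortran arrays through c_loc, which is what lets the CCSD code above pass strided slices such as t2%f(1,j,1,b). A hypothetical sketch, assuming the arrays live in managed memory (as with the cudaMallocManaged path of the Nvidia implementation) and the blas_handle provider from gpu.irp.f:

  subroutine demo_dgeam(n)
    use gpu
    implicit none
    integer, intent(in) :: n
    type(gpu_double2)   :: a, b, c

    call gpu_allocate(a, n, n)
    call gpu_allocate(b, n, n)
    call gpu_allocate(c, n, n)
    a%f = 1.d0                ! managed memory: plain host-side initialization
    b%f = 2.d0
    ! C = 2*A - B through the raw-pointer variant
    call gpu_dgeam_f(blas_handle, 'N', 'N', n, n, &
        2.d0, a%f, n, -1.d0, b%f, n, c%f, n)
    call gpu_synchronize()
    call gpu_deallocate(a)
    call gpu_deallocate(b)
    call gpu_deallocate(c)
  end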
@ -18,7 +18,7 @@
 BEGIN_PROVIDER [double precision, multi_s_dipole_moment  , (N_states, N_states)]
&BEGIN_PROVIDER [double precision, multi_s_x_dipole_moment, (N_states, N_states)]
&BEGIN_PROVIDER [double precision, multi_s_y_dipole_moment, (N_states, N_states)]
&BEGIN_PROVIDER [double precision, multi_s_z_dipole_moment, (N_states, N_states)]
@ -40,27 +40,153 @@ BEGIN_PROVIDER [double precision, multi_s_dipole_moment, (N_states, N_states)]
  ! gamma^{nm}: density matrix \bra{\Psi^n} a^{\dagger}_a a_i \ket{\Psi^m}
  END_DOC

  integer          :: istate, jstate ! States
  integer          :: i, j           ! general spatial MOs
  double precision :: nuclei_part_x, nuclei_part_y, nuclei_part_z

  multi_s_x_dipole_moment = 0.d0
  multi_s_y_dipole_moment = 0.d0
  multi_s_z_dipole_moment = 0.d0

  if (8.d0*mo_num*mo_num*n_states*n_states*1d-9 .lt. 200.d0) then
    ! The dense transition density matrix fits in memory (< 200 GB)

    do jstate = 1, N_states
      do istate = 1, N_states
        do i = 1, mo_num
          do j = 1, mo_num
            multi_s_x_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_x(j,i)
            multi_s_y_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_y(j,i)
            multi_s_z_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_z(j,i)
          enddo
        enddo
      enddo
    enddo

  else
    ! Not enough memory: use an on-the-fly scheme
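    ! Sketch of the scheme below: rather than materializing the full
    ! transition density matrix one_e_tr_dm_mo (8 * mo_num^2 * N_states^2
    ! bytes), its contraction with the dipole integrals is accumulated
    ! determinant pair by determinant pair; only diagonal pairs and
    ! single excitations give nonzero one-electron contributions.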
PROVIDE psi_det_alpha_unique psi_det_beta_unique
integer :: l, k_a, k_b
integer :: occ(N_int*bit_kind_size,2)
integer :: h1, h2, p1, p2, degree
integer :: exc(0:2,2), n_occ(2)
integer :: krow, kcol, lrow, lcol
integer(bit_kind) :: tmp_det(N_int,2), tmp_det2(N_int)
double precision :: ck, ckl, phase
!$OMP PARALLEL DEFAULT(NONE) &
!$OMP PRIVATE(j, l, k_a, k_b, istate, jstate, occ, ck, ckl, h1, h2, p1, p2, exc, &
!$OMP phase, degree, n_occ, krow, kcol, lrow, lcol, tmp_det, tmp_det2) &
!$OMP SHARED(N_int, N_states, elec_alpha_num, elec_beta_num, N_det, &
!$OMP psi_bilinear_matrix_rows, psi_bilinear_matrix_columns, &
!$OMP psi_bilinear_matrix_transp_rows, psi_bilinear_matrix_transp_columns, &
!$OMP psi_det_alpha_unique, psi_det_beta_unique, &
!$OMP psi_bilinear_matrix_values, psi_bilinear_matrix_transp_values, &
!$OMP mo_dipole_x, mo_dipole_y, mo_dipole_z, &
!$OMP multi_s_x_dipole_moment, multi_s_y_dipole_moment, multi_s_z_dipole_moment)
!$OMP DO COLLAPSE(2)
do istate = 1, N_states
do jstate = 1, N_states
do k_a = 1, N_det
krow = psi_bilinear_matrix_rows (k_a)
kcol = psi_bilinear_matrix_columns(k_a)
tmp_det(1:N_int,1) = psi_det_alpha_unique(1:N_int,krow)
tmp_det(1:N_int,2) = psi_det_beta_unique (1:N_int,kcol)
! Diagonal part
call bitstring_to_list_ab(tmp_det, occ, n_occ, N_int)
ck = psi_bilinear_matrix_values(k_a,istate)*psi_bilinear_matrix_values(k_a,jstate)
do l = 1, elec_alpha_num
j = occ(l,1)
multi_s_x_dipole_moment(istate,jstate) -= ck * mo_dipole_x(j,j)
multi_s_y_dipole_moment(istate,jstate) -= ck * mo_dipole_y(j,j)
multi_s_z_dipole_moment(istate,jstate) -= ck * mo_dipole_z(j,j)
enddo
if (k_a == N_det) cycle
l = k_a + 1
lrow = psi_bilinear_matrix_rows (l)
lcol = psi_bilinear_matrix_columns(l)
! Fix beta determinant, loop over alphas
do while (lcol == kcol)
tmp_det2(:) = psi_det_alpha_unique(:,lrow)
call get_excitation_degree_spin(tmp_det(1,1), tmp_det2, degree, N_int)
if (degree == 1) then
exc = 0
call get_single_excitation_spin(tmp_det(1,1), tmp_det2, exc, phase, N_int)
call decode_exc_spin(exc, h1, p1, h2, p2)
ckl = psi_bilinear_matrix_values(k_a,istate)*psi_bilinear_matrix_values(l,jstate) * phase
multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(h1,p1)
multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(h1,p1)
multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(h1,p1)
ckl = psi_bilinear_matrix_values(k_a,jstate)*psi_bilinear_matrix_values(l,istate) * phase
multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(p1,h1)
multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(p1,h1)
multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(p1,h1)
endif
l = l+1
if (l > N_det) exit
lrow = psi_bilinear_matrix_rows (l)
lcol = psi_bilinear_matrix_columns(l)
enddo
enddo ! k_a
do k_b = 1, N_det
krow = psi_bilinear_matrix_transp_rows (k_b)
kcol = psi_bilinear_matrix_transp_columns(k_b)
tmp_det(1:N_int,1) = psi_det_alpha_unique(1:N_int,krow)
tmp_det(1:N_int,2) = psi_det_beta_unique (1:N_int,kcol)
! Diagonal part
call bitstring_to_list_ab(tmp_det, occ, n_occ, N_int)
ck = psi_bilinear_matrix_transp_values(k_b,istate)*psi_bilinear_matrix_transp_values(k_b,jstate)
do l = 1, elec_beta_num
j = occ(l,2)
multi_s_x_dipole_moment(istate,jstate) -= ck * mo_dipole_x(j,j)
multi_s_y_dipole_moment(istate,jstate) -= ck * mo_dipole_y(j,j)
multi_s_z_dipole_moment(istate,jstate) -= ck * mo_dipole_z(j,j)
enddo
if (k_b == N_det) cycle
l = k_b+1
lrow = psi_bilinear_matrix_transp_rows (l)
lcol = psi_bilinear_matrix_transp_columns(l)
! Fix alpha determinant, loop over betas
do while (lrow == krow)
tmp_det2(:) = psi_det_beta_unique(:,lcol)
call get_excitation_degree_spin(tmp_det(1,2), tmp_det2, degree, N_int)
if (degree == 1) then
exc = 0
call get_single_excitation_spin(tmp_det(1,2), tmp_det2, exc, phase, N_int)
call decode_exc_spin(exc, h1, p1, h2, p2)
ckl = psi_bilinear_matrix_transp_values(k_b,istate)*psi_bilinear_matrix_transp_values(l,jstate) * phase
multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(h1,p1)
multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(h1,p1)
multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(h1,p1)
ckl = psi_bilinear_matrix_transp_values(k_b,jstate)*psi_bilinear_matrix_transp_values(l,istate) * phase
multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(p1,h1)
multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(p1,h1)
multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(p1,h1)
endif
l = l+1
if (l > N_det) exit
lrow = psi_bilinear_matrix_transp_rows (l)
lcol = psi_bilinear_matrix_transp_columns(l)
enddo
enddo ! k_b
enddo ! istate
enddo ! jstate
!$OMP END DO
!$OMP END PARALLEL
endif ! memory condition
! Nuclei part
nuclei_part_x = 0.d0