diff --git a/plugins/local/gpu_nvidia/gpu.c b/plugins/local/gpu_nvidia/gpu.c index f0bd247a..189de64c 100644 --- a/plugins/local/gpu_nvidia/gpu.c +++ b/plugins/local/gpu_nvidia/gpu.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -10,6 +11,10 @@ /* Generic functions */ +bool no_gpu() { + return false; +} + int gpu_ndevices() { int ngpus; cudaGetDeviceCount(&ngpus); @@ -17,7 +22,7 @@ int gpu_ndevices() { } void gpu_set_device(int32_t igpu) { - cudaSetDevice(igpu); + cudaSetDevice((int) igpu); } @@ -64,22 +69,20 @@ void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) { /* Streams */ -void gpu_stream_create(void** ptr) { - cudaStream_t stream; - cudaError_t rc = cudaStreamCreate(&stream); +void gpu_stream_create(cudaStream_t* ptr) { + cudaError_t rc = cudaStreamCreate(ptr); assert (rc == cudaSuccess); - *ptr = (void*) stream; } -void gpu_stream_destroy(void** ptr) { - assert (*ptr != NULL); - cudaError_t rc = cudaStreamDestroy( (cudaStream_t) *ptr); +void gpu_stream_destroy(cudaStream_t* ptr) { + assert (ptr != NULL); + cudaError_t rc = cudaStreamDestroy(*ptr); assert (rc == cudaSuccess); *ptr = NULL; } -void gpu_set_stream(void** handle, void** stream) { - cublasSetStream( (cublasHandle_t) *handle, (cudaStream_t) *stream); +void gpu_set_stream(cublasHandle_t handle, cudaStream_t stream) { + cublasSetStream(handle, stream); } void gpu_synchronize() { @@ -89,75 +92,80 @@ void gpu_synchronize() { /* BLAS functions */ -void gpu_blas_create(void** handle) { - cublasHandle_t cublas_handle; - cublasStatus_t rc = cublasCreate(&cublas_handle); +void gpu_blas_create(cublasHandle_t* ptr) { + cublasStatus_t rc = cublasCreate(ptr); assert (rc == CUBLAS_STATUS_SUCCESS); - *handle = (void*) cublas_handle; } -void gpu_blas_destroy(void** handle) { - assert (*handle != NULL); - cublasStatus_t rc = cublasDestroy( (cublasHandle_t) *handle); +void gpu_blas_destroy(cublasHandle_t* ptr) { + assert (ptr != NULL); + cublasStatus_t rc = cublasDestroy(*ptr); assert (rc == CUBLAS_STATUS_SUCCESS); - *handle = NULL; + ptr = NULL; } -void gpu_ddot(void** handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { - assert (*handle != NULL); +void gpu_ddot(cublasHandle_t handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { + assert (handle != NULL); + /* Convert to int */ + int n_, incx_, incy_; - /* Convert to int32_t */ - int32_t n_, incx_, incy_; + n_ = (int) n; + incx_ = (int) incx; + incy_ = (int) incy; - n_ = (int32_t) n; - incx_ = (int32_t) incx; - incy_ = (int32_t) incy; + assert ( (int64_t) n_ == n ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + cublasStatus_t rc = cublasDdot(handle, n_, x, incx_, y, incy_, result); +/* + double alpha = 1.0; + double beta = 0.0; + cublasStatus_t rc = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, n_, &alpha, x, 1, y, n_, &beta, &result_, 1); +*/ + assert (rc == CUBLAS_STATUS_SUCCESS); +} + + + +void gpu_sdot(cublasHandle_t handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { + assert (handle != NULL); + + /* Convert to int */ + int n_, incx_, incy_; + + n_ = (int) n; + incx_ = (int) incx; + incy_ = (int) incy; /* Check for integer overflows */ assert ( (int64_t) n_ == n ); assert ( (int64_t) incx_ == incx); assert ( (int64_t) incy_ == incy); - cublasDdot((cublasHandle_t) *handle, n_, x, incx_, y, incy_, result); + float result_ = 0.; + cublasStatus_t rc = cublasSdot(handle, n_, x, incx_, y, incy_, &result_); + assert (rc == CUBLAS_STATUS_SUCCESS); + *result = result_; } -void gpu_sdot(void** handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { - assert (*handle != NULL); - - /* Convert to int32_t */ - int32_t n_, incx_, incy_; - - n_ = (int32_t) n; - incx_ = (int32_t) incx; - incy_ = (int32_t) incy; - - /* Check for integer overflows */ - assert ( (int64_t) n_ == n ); - assert ( (int64_t) incx_ == incx); - assert ( (int64_t) incy_ == incy); - - cublasSdot((cublasHandle_t) *handle, n_, x, incx_, y, incy_, result); -} - - - -void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t n, const double alpha, +void gpu_dgemv(cublasHandle_t handle, const char transa, const int64_t m, const int64_t n, const double alpha, const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) { - assert (*handle != NULL); + assert (handle != NULL); - /* Convert to int32_t */ - int32_t m_, n_, lda_, incx_, incy_; + /* Convert to int */ + int m_, n_, lda_, incx_, incy_; - m_ = (int32_t) m; - n_ = (int32_t) n; - lda_ = (int32_t) lda; - incx_ = (int32_t) incx; - incy_ = (int32_t) incy; + m_ = (int) m; + n_ = (int) n; + lda_ = (int) lda; + incx_ = (int) incx; + incy_ = (int) incy; /* Check for integer overflows */ assert ( (int64_t) m_ == m ); @@ -169,24 +177,24 @@ void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t cublasOperation_t transa_ = CUBLAS_OP_N; if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; - cublasDgemv((cublasHandle_t) *handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_); + cublasDgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_); } -void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t n, const float alpha, +void gpu_sgemv(cublasHandle_t handle, const char transa, const int64_t m, const int64_t n, const float alpha, const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) { - assert (*handle != NULL); + assert (handle != NULL); - /* Convert to int32_t */ - int32_t m_, n_, lda_, incx_, incy_; + /* Convert to int */ + int m_, n_, lda_, incx_, incy_; - m_ = (int32_t) m; - n_ = (int32_t) n; - lda_ = (int32_t) lda; - incx_ = (int32_t) incx; - incy_ = (int32_t) incy; + m_ = (int) m; + n_ = (int) n; + lda_ = (int) lda; + incx_ = (int) incx; + incy_ = (int) incy; /* Check for integer overflows */ assert ( (int64_t) m_ == m ); @@ -198,24 +206,24 @@ void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t cublasOperation_t transa_ = CUBLAS_OP_N; if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; - cublasSgemv((cublasHandle_t) *handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_); + cublasSgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_); } -void gpu_dgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, +void gpu_dgemm(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) { - assert (*handle != NULL); + assert (handle != NULL); - /* Convert to int32_t */ - int32_t m_, n_, k_, lda_, ldb_, ldc_; + /* Convert to int */ + int m_, n_, k_, lda_, ldb_, ldc_; - m_ = (int32_t) m; - n_ = (int32_t) n; - k_ = (int32_t) k; - lda_ = (int32_t) lda; - ldb_ = (int32_t) ldb; - ldc_ = (int32_t) ldc; + m_ = (int) m; + n_ = (int) n; + k_ = (int) k; + lda_ = (int) lda; + ldb_ = (int) ldb; + ldc_ = (int) ldc; /* Check for integer overflows */ assert ( (int64_t) m_ == m ); @@ -230,25 +238,25 @@ void gpu_dgemm(void** handle, const char transa, const char transb, const int64_ if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; - cublasDgemm((cublasHandle_t) *handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_); + cublasDgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_); } -void gpu_sgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, +void gpu_sgemm(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) { - assert (*handle != NULL); + assert (handle != NULL); - /* Convert to int32_t */ - int32_t m_, n_, k_, lda_, ldb_, ldc_; + /* Convert to int */ + int m_, n_, k_, lda_, ldb_, ldc_; - m_ = (int32_t) m; - n_ = (int32_t) n; - k_ = (int32_t) k; - lda_ = (int32_t) lda; - ldb_ = (int32_t) ldb; - ldc_ = (int32_t) ldc; + m_ = (int) m; + n_ = (int) n; + k_ = (int) k; + lda_ = (int) lda; + ldb_ = (int) ldb; + ldc_ = (int) ldc; /* Check for integer overflows */ assert ( (int64_t) m_ == m ); @@ -263,22 +271,22 @@ void gpu_sgemm(void** handle, const char transa, const char transb, const int64_ if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; - cublasSgemm((cublasHandle_t) *handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_); + cublasSgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_); } -void gpu_dgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, +void gpu_dgeam(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { - assert (*handle != NULL); + assert (handle != NULL); - /* Convert to int32_t */ - int32_t m_, n_, lda_, ldb_, ldc_; + /* Convert to int */ + int m_, n_, lda_, ldb_, ldc_; - m_ = (int32_t) m; - n_ = (int32_t) n; - lda_ = (int32_t) lda; - ldb_ = (int32_t) ldb; - ldc_ = (int32_t) ldc; + m_ = (int) m; + n_ = (int) n; + lda_ = (int) lda; + ldb_ = (int) ldb; + ldc_ = (int) ldc; /* Check for integer overflows */ assert ( (int64_t) m_ == m ); @@ -292,23 +300,23 @@ void gpu_dgeam(void** handle, const char transa, const char transb, const int64_ if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; - cublasDgeam((cublasHandle_t) *handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_); + cublasDgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_); } -void gpu_sgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, +void gpu_sgeam(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) { - assert (*handle != NULL); + assert (handle != NULL); - /* Convert to int32_t */ - int32_t m_, n_, lda_, ldb_, ldc_; + /* Convert to int */ + int m_, n_, lda_, ldb_, ldc_; - m_ = (int32_t) m; - n_ = (int32_t) n; - lda_ = (int32_t) lda; - ldb_ = (int32_t) ldb; - ldc_ = (int32_t) ldc; + m_ = (int) m; + n_ = (int) n; + lda_ = (int) lda; + ldb_ = (int) ldb; + ldc_ = (int) ldc; /* Check for integer overflows */ assert ( (int64_t) m_ == m ); @@ -322,6 +330,6 @@ void gpu_sgeam(void** handle, const char transa, const char transb, const int64_ if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; - cublasSgeam((cublasHandle_t) *handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_); + cublasSgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_); } diff --git a/plugins/local/gpu_x86/gpu.c b/plugins/local/gpu_x86/gpu.c index ac7c3620..53267a7c 100644 --- a/plugins/local/gpu_x86/gpu.c +++ b/plugins/local/gpu_x86/gpu.c @@ -2,8 +2,12 @@ #include #include #include +#include #include +bool no_gpu() { + return true; +} /* Generic functions */ @@ -56,7 +60,7 @@ void gpu_stream_destroy(void** ptr) { *ptr = NULL; } -void gpu_set_stream(void** handle, void** stream) { +void gpu_set_stream(void* handle, void* stream) { return; } @@ -79,8 +83,8 @@ void gpu_blas_destroy(void** handle) { double ddot_(const int32_t* n, const double* x, const int32_t* incx, const double* y, const int32_t* incy); -void gpu_ddot(void** handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { - assert (*handle != NULL); +void gpu_ddot(void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { + assert (handle != NULL); /* Convert to int32_t */ int32_t n_, incx_, incy_; @@ -100,8 +104,8 @@ void gpu_ddot(void** handle, const int64_t n, const double* x, const int64_t inc float sdot_(const int32_t* n, const float* x, const int32_t* incx, const float* y, const int32_t* incy); -void gpu_sdot(void** handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { - assert (*handle != NULL); +void gpu_sdot(void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { + assert (handle != NULL); /* Convert to int32_t */ int32_t n_, incx_, incy_; @@ -122,10 +126,10 @@ void gpu_sdot(void** handle, const int64_t n, const float* x, const int64_t incx void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha, const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy); -void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t n, const double alpha, +void gpu_dgemv(void* handle, const char transa, const int64_t m, const int64_t n, const double alpha, const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) { - assert (*handle != NULL); + assert (handle != NULL); /* Convert to int32_t */ int32_t m_, n_, lda_, incx_, incy_; @@ -150,10 +154,10 @@ void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha, const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy); -void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t n, const float alpha, +void gpu_sgemv(void* handle, const char transa, const int64_t m, const int64_t n, const float alpha, const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) { - assert (*handle != NULL); + assert (handle != NULL); /* Convert to int32_t */ int32_t m_, n_, lda_, incx_, incy_; @@ -178,10 +182,10 @@ void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha, const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc); -void gpu_dgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, +void gpu_dgemm(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) { - assert (*handle != NULL); + assert (handle != NULL); /* Convert to int32_t */ int32_t m_, n_, k_, lda_, ldb_, ldc_; @@ -209,10 +213,10 @@ void gpu_dgemm(void** handle, const char transa, const char transb, const int64_ void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha, const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc); -void gpu_sgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, +void gpu_sgemm(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) { - assert (*handle != NULL); + assert (handle != NULL); /* Convert to int32_t */ int32_t m_, n_, k_, lda_, ldb_, ldc_; @@ -236,9 +240,9 @@ void gpu_sgemm(void** handle, const char transa, const char transb, const int64_ } -void gpu_dgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, +void gpu_dgeam(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { - assert (*handle != NULL); + assert (handle != NULL); if ( (transa == 'N' && transb == 'N') || (transa == 'n' && transb == 'N') || @@ -368,9 +372,9 @@ void gpu_dgeam(void** handle, const char transa, const char transb, const int64_ } -void gpu_sgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, +void gpu_sgeam(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) { - assert (*handle != NULL); + assert (handle != NULL); if ( (transa == 'N' && transb == 'N') || (transa == 'n' && transb == 'N') || diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 5c2daa05..5ee7366e 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -14,9 +14,15 @@ subroutine run_ccsd_space_orb type(gpu_double2) :: t1, r1 type(gpu_double2) :: H_oo, H_vv, H_vo - type(gpu_double2) :: d_cc_space_f_vo + type(gpu_double2) :: d_cc_space_f_oo, d_cc_space_f_vo + type(gpu_double2) :: d_cc_space_f_ov, d_cc_space_f_vv + + type(gpu_double3) :: d_cc_space_v_oo_chol, d_cc_space_v_vo_chol + type(gpu_double3) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol + type(gpu_double4) :: d_cc_space_v_oovv + double precision, allocatable :: all_err(:,:), all_t(:,:) integer, allocatable :: list_occ(:), list_vir(:) integer(bit_kind) :: det(N_int,2) @@ -24,7 +30,7 @@ subroutine run_ccsd_space_orb call set_multiple_levels_omp(.False.) - if (do_ao_cholesky) then + if (do_mo_cholesky) then PROVIDE cholesky_mo_transp FREE cholesky_ao else @@ -55,11 +61,36 @@ subroutine run_ccsd_space_orb !print*,'occ',list_occ !print*,'vir',list_vir + ! GPU arrays + call gpu_allocate(d_cc_space_f_oo, nO, nO) call gpu_allocate(d_cc_space_f_vo, nV, nO) - call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV) - call gpu_upload(cc_space_f_vo, d_cc_space_f_vo) - call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv) + call gpu_allocate(d_cc_space_f_ov, nO, nV) + call gpu_allocate(d_cc_space_f_vv, nV, nV) + call gpu_upload(cc_space_f_oo, d_cc_space_f_oo) + call gpu_upload(cc_space_f_vo, d_cc_space_f_vo) + call gpu_upload(cc_space_f_vv, d_cc_space_f_vv) + +! FREE cc_space_f_oo +! FREE cc_space_f_vo +! FREE cc_space_f_vv + + if (do_mo_cholesky) then + call gpu_allocate(d_cc_space_v_oo_chol, cholesky_mo_num, nO, nO) + call gpu_allocate(d_cc_space_v_ov_chol, cholesky_mo_num, nO, nV) + call gpu_allocate(d_cc_space_v_vo_chol, cholesky_mo_num, nV, nO) + call gpu_allocate(d_cc_space_v_vv_chol, cholesky_mo_num, nV, nV) + + call gpu_upload(cc_space_v_oo_chol, d_cc_space_v_oo_chol) + call gpu_upload(cc_space_v_ov_chol, d_cc_space_v_ov_chol) + call gpu_upload(cc_space_v_vo_chol, d_cc_space_v_vo_chol) + call gpu_upload(cc_space_v_vv_chol, d_cc_space_v_vv_chol) + +! FREE cc_space_v_oo_chol +! FREE cc_space_v_ov_chol +! FREE cc_space_v_vo_chol +! FREE cc_space_v_vv_chol + endif call gpu_allocate(t2, nO,nO,nV,nV) call gpu_allocate(r2, nO,nO,nV,nV) @@ -120,6 +151,13 @@ subroutine run_ccsd_space_orb call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,h_t2) call gpu_upload(h_t2, t2) + + call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV) + call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv) + +! FREE cc_space_v_oovv + + call update_tau_space(nO,nV,h_t1,t1,t2,tau) call update_tau_x_space(nO,nV,tau,tau_x) !print*,'hf_energy', hf_energy @@ -142,10 +180,10 @@ subroutine run_ccsd_space_orb do while (not_converged) ! Residue - if (do_ao_cholesky) then -! if (.False.) then - call compute_H_oo_chol(nO,nV,tau_x,H_oo) - call compute_H_vv_chol(nO,nV,tau_x%f,H_vv%f) + if (do_mo_cholesky) then + call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & + d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo) + call compute_H_vv_chol(nO,nV,tau_x,H_vv) call compute_H_vo_chol(nO,nV,t1%f,H_vo%f) call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1) @@ -249,6 +287,12 @@ subroutine run_ccsd_space_orb call save_energy(uncorr_energy + energy, e_t) deallocate(h_t1, h_t2) + if (do_mo_cholesky) then + call gpu_deallocate(d_cc_space_v_oo_chol) + call gpu_deallocate(d_cc_space_v_ov_chol) + call gpu_deallocate(d_cc_space_v_vo_chol) + call gpu_deallocate(d_cc_space_v_vv_chol) + endif call gpu_deallocate(d_cc_space_f_vo) call gpu_deallocate(d_cc_space_v_oovv) call gpu_deallocate(t1) @@ -302,8 +346,21 @@ subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1, ! !$omp end parallel - call gpu_ddot(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1, d_cc_space_v_oovv, 1, energy) - call gpu_ddot(blas_handle, nO*nV*1_8, d_cc_space_f_vo, 1, t1, 1, e) + type(gpu_stream) :: s1, s2 + call gpu_stream_create(s1) + call gpu_stream_create(s2) + + call gpu_set_stream(blas_handle,s1) + call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo, 1, t1, 1, e) + + call gpu_set_stream(blas_handle,s2) + call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1_8, d_cc_space_v_oovv, 1_8, energy) + call gpu_synchronize() + call gpu_set_stream(blas_handle,gpu_default_stream) + + call gpu_stream_destroy(s1) + call gpu_stream_destroy(s2) + energy = energy + 2.d0*e end @@ -346,32 +403,29 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau) type(gpu_stream) :: stream(nV) - do b=1,nV - call gpu_stream_create(stream(b)) - enddo - - !$OMP PARALLEL & + !$OMP PARALLEL if (no_gpu()) & !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) !$OMP DO do b=1,nV + call gpu_stream_create(stream(b)) call gpu_set_stream(blas_handle,stream(b)) do j=1,nO - call gpu_dgeam(blas_handle, 'N', 'N', nO*1_8, nV*1_8, & - 1.d0, t2%f(1,j,1,b), nO*nO*1_8, & - h_t1(j,b), t1%f, nO*1_8, & - tau%f(1,j,1,b), nO*nO*1_8) + call gpu_dgeam_f(blas_handle, 'N', 'N', nO, nV, & + 1.d0, t2%f(1,j,1,b), nO*nO, & + h_t1(j,b), t1%f, nO, & + tau%f(1,j,1,b), nO*nO) enddo enddo !$OMP END DO !$OMP END PARALLEL - call gpu_synchronize() - do b=1,nV call gpu_stream_destroy(stream(b)) enddo + call gpu_set_stream(blas_handle,gpu_default_stream) + end @@ -412,7 +466,7 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x) call gpu_stream_create(stream(a)) enddo - !$OMP PARALLEL & + !$OMP PARALLEL if (no_gpu()) & !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) @@ -420,20 +474,20 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x) do b=1,nV do a=1,nV call gpu_set_stream(blas_handle,stream(a)) - call gpu_dgeam(blas_handle, 'N', 'N', nO*1_8, nO*1_8, & - 2.d0, tau%f(1,1,a,b), nO*1_8, & - -1.d0, tau%f(1,1,b,a), nO*1_8, & - tau_x%f(1,1,a,b), nO*1_8) + call gpu_dgeam_f(blas_handle, 'N', 'N', nO, nO, & + 2.d0, tau%f(1,1,a,b), nO, & + -1.d0, tau%f(1,1,b,a), nO, & + tau_x%f(1,1,a,b), nO) enddo enddo !$OMP END DO !$OMP END PARALLEL - call gpu_synchronize() - do b=1,nV call gpu_stream_destroy(stream(b)) enddo + call gpu_set_stream(blas_handle,gpu_default_stream) + end diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index 9b161001..288724f3 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -293,62 +293,115 @@ end ! H_oo -subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo) +subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & + d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo) use gpu implicit none integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: d_cc_space_f_oo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol type(gpu_double4), intent(in) :: tau_x type(gpu_double2), intent(out) :: H_oo integer :: a,b,i,j,u,k - double precision, allocatable :: tau_kau(:,:,:), tmp_vov(:,:,:) + type(gpu_double3) :: tau_kau, tmp_vov, tmp_ovv - allocate(tau_kau(cholesky_mo_num,nV,nO)) - !$omp parallel & - !$omp default(shared) & - !$omp private(i,u,j,k,a,b,tmp_vov) - allocate(tmp_vov(nV,nO,nV) ) - !$omp do - do u = 1, nO + call gpu_allocate(tau_kau, cholesky_mo_num, nV, nO) + +! !$omp parallel & +! !$omp default(shared) & +! !$omp private(i,u,j,k,a,b,tmp_vov) +! call gpu_allocate(tmp_vov, nV, nO, nV) +! !$omp do +! do u = 1, nO +! do b=1,nV +! do j=1,nO +! do a=1,nV +! tmp_vov%f(a,j,b) = tau_x%f(u,j,a,b) +! enddo +! enddo +! enddo +! call dgemm('N','T',cholesky_mo_num,nV,nO*nV,1.d0, & +! d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, tmp_vov%f, nV, & +! 0.d0, tau_kau%f(1,1,u), cholesky_mo_num) +! enddo +! !$omp end do nowait +! call gpu_deallocate(tmp_vov) +! !$omp do +! do i = 1, nO +! do u = 1, nO +! H_oo%f(u,i) = d_cc_space_f_oo%f(u,i) +! enddo +! enddo +! !$omp end do nowait +! +! !$omp barrier +! !$omp end parallel +! call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & +! tau_kau%f(1,1,1), cholesky_mo_num*nV, d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num*nV, & +! 1.d0, H_oo%f(1,1), nO) +! + + type(gpu_stream) :: stream(nV) + + do b=1,nV + call gpu_stream_create(stream(b)) + enddo + + !$OMP PARALLEL if (no_gpu()) & + !$OMP DEFAULT(SHARED) & + !$OMP PRIVATE(u,b,tmp_vov,tmp_ovv) + + call gpu_allocate(tmp_vov, nV, nO, nV) + call gpu_allocate(tmp_ovv, nO, nV, nV) + + !$OMP DO + do u=1,nO + call gpu_dgeam_f(blas_handle, 'N', 'N', 1, nO*nV*nV, 1.d0, & + tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f, nO, tmp_ovv%f, 1) do b=1,nV - do j=1,nO - do a=1,nV - tmp_vov(a,j,b) = tau_x%f(u,j,a,b) - enddo - enddo + call gpu_set_stream(blas_handle,stream(b)) + call gpu_dgeam_f(blas_handle, 'T', 'T', nV, nO, 1.d0, & + tmp_ovv%f(1,1,b), nO, 0.d0, & + tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV) enddo - call dgemm('N','T',cholesky_mo_num,nV,nO*nV,1.d0, & - cc_space_v_ov_chol, cholesky_mo_num, tmp_vov, nV, & - 0.d0, tau_kau(1,1,u), cholesky_mo_num) + call gpu_dgemm_f(blas_handle, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, & + d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_vov%f, nV, & + 0.d0, tau_kau%f(1,1,u), cholesky_mo_num) + call gpu_synchronize() enddo - !$omp end do nowait - deallocate(tmp_vov) - !$omp do - do i = 1, nO - do u = 1, nO - H_oo%f(u,i) = cc_space_f_oo(u,i) - enddo - enddo - !$omp end do nowait - !$omp barrier - !$omp end parallel - call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & - tau_kau, cholesky_mo_num*nV, cc_space_v_vo_chol, cholesky_mo_num*nV, & - 1.d0, H_oo%f, nO) + !$OMP END DO + call gpu_deallocate(tmp_vov) + call gpu_deallocate(tmp_ovv) + !$OMP END PARALLEL + + do b=1,nV + call gpu_stream_destroy(stream(b)) + enddo + + call gpu_set_stream(blas_handle,gpu_default_stream) + + call gpu_copy(d_cc_space_f_oo, H_oo) + + call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & + tau_kau, cholesky_mo_num*nV, d_cc_space_v_vo_chol, cholesky_mo_num*nV, & + 1.d0, H_oo, nO) + + call gpu_deallocate(tau_kau) end ! H_vv subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv) - + use gpu implicit none integer, intent(in) :: nO,nV - double precision, intent(in) :: tau_x(nO, nO, nV, nV) - double precision, intent(out) :: H_vv(nV, nV) + type(gpu_double4), intent(in) :: tau_x + type(gpu_double2), intent(out) :: H_vv integer :: a,b,i,j,u,k, beta @@ -364,7 +417,7 @@ subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv) do b=1,nV do j=1,nO do i=1,nO - tmp_oov(i,j,b) = tau_x(i,j,a,b) + tmp_oov(i,j,b) = tau_x%f(i,j,a,b) enddo enddo enddo @@ -378,7 +431,7 @@ subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv) !$omp do do beta = 1, nV do a = 1, nV - H_vv(a,beta) = cc_space_f_vv(a,beta) + H_vv%f(a,beta) = cc_space_f_vv(a,beta) enddo enddo !$omp end do nowait @@ -386,7 +439,7 @@ subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv) !$omp end parallel call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & tau_kia, cholesky_mo_num*nO, cc_space_v_ov_chol, cholesky_mo_num*nO, & - 1.d0, H_vv, nV) + 1.d0, H_vv%f, nV) end diff --git a/src/gpu/gpu.irp.f b/src/gpu/gpu.irp.f index e91d66f5..6ad0a075 100644 --- a/src/gpu/gpu.irp.f +++ b/src/gpu/gpu.irp.f @@ -8,4 +8,11 @@ BEGIN_PROVIDER [ type(gpu_blas), blas_handle ] call gpu_blas_create(blas_handle) END_PROVIDER +BEGIN_PROVIDER [ type(gpu_stream), gpu_default_stream ] + implicit none + BEGIN_DOC + ! Default stream + END_DOC + gpu_default_stream%c = C_NULL_PTR +END_PROVIDER diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90 index ecf79c83..2676b339 100644 --- a/src/gpu/gpu_module.F90 +++ b/src/gpu/gpu_module.F90 @@ -49,7 +49,12 @@ module gpu ! ------------ interface + logical(c_bool) function no_gpu() bind(C) + import + end function + integer function gpu_ndevices() bind(C) + import end function subroutine gpu_set_device(id) bind(C) @@ -101,7 +106,7 @@ module gpu subroutine gpu_set_stream_c(handle, stream) bind(C, name='gpu_set_stream') import - type(c_ptr) :: handle, stream + type(c_ptr), value :: handle, stream end subroutine subroutine gpu_synchronize() bind(C) @@ -120,15 +125,15 @@ module gpu subroutine gpu_ddot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_ddot') import - type(c_ptr), intent(in) :: handle + type(c_ptr), value, intent(in) :: handle integer(c_int64_t), value :: n, incx, incy - type(c_ptr), intent(in), value :: dx, dy + type(c_ptr), value :: dx, dy real(c_double), intent(out) :: res end subroutine subroutine gpu_sdot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_sdot') import - type(c_ptr), intent(in) :: handle + type(c_ptr), value, intent(in) :: handle integer(c_int64_t), value :: n, incx, incy type(c_ptr), intent(in), value :: dx, dy real(c_float), intent(out) :: res @@ -137,8 +142,8 @@ module gpu subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, & b, ldb, c, ldc) bind(C, name='gpu_dgeam') import - type(c_ptr), intent(in) :: handle - character(c_char), intent(in), value :: transa, transb + type(c_ptr), value, intent(in) :: handle + character(c_char), intent(in), value :: transa, transb integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc real(c_double), intent(in), value :: alpha, beta type(c_ptr), value :: a, b, c @@ -147,13 +152,33 @@ module gpu subroutine gpu_sgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, & b, ldb, c, ldc) bind(C, name='gpu_sgeam') import - type(c_ptr), intent(in) :: handle - character(c_char), intent(in), value :: transa, transb + type(c_ptr), value, intent(in) :: handle + character(c_char), intent(in), value :: transa, transb integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc real(c_float), intent(in), value :: alpha, beta type(c_ptr), value :: a, b, c end subroutine + subroutine gpu_dgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) bind(C, name='gpu_dgemm') + import + type(c_ptr), value, intent(in) :: handle + character(c_char), intent(in), value :: transa, transb + integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc + real(c_double), intent(in), value :: alpha, beta + type(c_ptr), value :: a, b, c + end subroutine + + subroutine gpu_sgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) bind(C, name='gpu_sgemm') + import + type(c_ptr), value, intent(in) :: handle + character(c_char), intent(in), value :: transa, transb + integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc + real(c_float), intent(in), value :: alpha, beta + type(c_ptr), value :: a, b, c + end subroutine + end interface @@ -161,20 +186,26 @@ module gpu ! ---------------------- interface gpu_allocate - procedure gpu_allocate_double1 & - ,gpu_allocate_double2 & - ,gpu_allocate_double3 & - ,gpu_allocate_double4 & - ,gpu_allocate_double5 & - ,gpu_allocate_double6 + procedure gpu_allocate_double1 & + ,gpu_allocate_double2 & + ,gpu_allocate_double3 & + ,gpu_allocate_double4 & + ,gpu_allocate_double5 & + ,gpu_allocate_double6 & + ,gpu_allocate_double1_64 & + ,gpu_allocate_double2_64 & + ,gpu_allocate_double3_64 & + ,gpu_allocate_double4_64 & + ,gpu_allocate_double5_64 & + ,gpu_allocate_double6_64 end interface gpu_allocate interface gpu_deallocate - procedure gpu_deallocate_double1 & - ,gpu_deallocate_double2 & - ,gpu_deallocate_double3 & - ,gpu_deallocate_double4 & - ,gpu_deallocate_double5 & + procedure gpu_deallocate_double1 & + ,gpu_deallocate_double2 & + ,gpu_deallocate_double3 & + ,gpu_deallocate_double4 & + ,gpu_deallocate_double5 & ,gpu_deallocate_double6 end interface gpu_deallocate @@ -267,6 +298,61 @@ module gpu end subroutine + subroutine gpu_allocate_double1_64(ptr, s) + implicit none + type(gpu_double1), intent(inout) :: ptr + integer*8, intent(in) :: s + + call gpu_allocate_c(ptr%c, s) + call c_f_pointer(ptr%c, ptr%f, (/ s /)) + end subroutine + + subroutine gpu_allocate_double2_64(ptr, s1, s2) + implicit none + type(gpu_double2), intent(inout) :: ptr + integer*8, intent(in) :: s1, s2 + + call gpu_allocate_c(ptr%c, s1*s2*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /)) + end subroutine + + subroutine gpu_allocate_double3_64(ptr, s1, s2, s3) + implicit none + type(gpu_double3), intent(inout) :: ptr + integer*8, intent(in) :: s1, s2, s3 + + call gpu_allocate_c(ptr%c, s1*s2*s3*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /)) + end subroutine + + subroutine gpu_allocate_double4_64(ptr, s1, s2, s3, s4) + implicit none + type(gpu_double4), intent(inout) :: ptr + integer*8, intent(in) :: s1, s2, s3, s4 + + call gpu_allocate_c(ptr%c, s1*s2*s3*s4*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /)) + end subroutine + + subroutine gpu_allocate_double5_64(ptr, s1, s2, s3, s4, s5) + implicit none + type(gpu_double5), intent(inout) :: ptr + integer*8, intent(in) :: s1, s2, s3, s4, s5 + + call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /)) + end subroutine + + subroutine gpu_allocate_double6_64(ptr, s1, s2, s3, s4, s5, s6) + implicit none + type(gpu_double6), intent(inout) :: ptr + integer*8, intent(in) :: s1, s2, s3, s4, s5, s6 + + call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*s6*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /)) + end subroutine + + ! gpu_deallocate ! -------------- @@ -494,19 +580,38 @@ end module subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) use gpu type(gpu_blas), intent(in) :: handle - integer*8 :: n, incx, incy - double precision, target, intent(in) :: dx(*), dy(*) - double precision, intent(out) :: res - call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res) + integer*4 :: n, incx, incy + type(gpu_double1), intent(in) :: dx, dy + double precision, intent(out) :: res + call gpu_ddot_c(handle%c, int(n,c_int64_t), dx%c, int(incx,c_int64_t), dy%c, int(incy,c_int64_t), res) end subroutine -subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res) +subroutine gpu_ddot_f(handle, n, dx, incx, dy, incy, res) + use gpu + type(gpu_blas), intent(in) :: handle + integer*4 :: n, incx, incy + double precision, target :: dx(*), dy(*) + double precision, intent(out) :: res + call gpu_ddot_c(handle%c, int(n,c_int64_t), c_loc(dx), int(incx,c_int64_t), c_loc(dy), int(incy,c_int64_t), res) +end subroutine + + +subroutine gpu_ddot_64(handle, n, dx, incx, dy, incy, res) use gpu type(gpu_blas), intent(in) :: handle integer*8 :: n, incx, incy - real, target, intent(in) :: dx(*), dy(*) - real, intent(out) :: res - call gpu_sdot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res) + type(gpu_double1), intent(in) :: dx, dy + double precision, intent(out) :: res + call gpu_ddot_c(handle%c, n, dx%c, incx, dy%c, incy, res) +end subroutine + +subroutine gpu_ddot_f_64(handle, n, dx, incx, dy, incy, res) + use gpu + type(gpu_blas), intent(in) :: handle + integer*8 :: n, incx, incy + double precision, target :: dx(*), dy(*) + double precision, intent(out) :: res + call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res) end subroutine @@ -518,22 +623,103 @@ subroutine gpu_dgeam(handle, transa, transb, m, n, alpha, a, lda, beta, & use gpu type(gpu_blas), intent(in) :: handle character, intent(in) :: transa, transb - integer*8, intent(in) :: m, n, lda, ldb, ldc + integer*4, intent(in) :: m, n, lda, ldb, ldc double precision, intent(in) :: alpha, beta - double precision, target :: a(lda,*), b(ldb,*), c(ldc,*) - call gpu_dgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, & - c_loc(b), ldb, c_loc(c), ldc) + type(gpu_double2) :: a, b, c + call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, a%c, int(lda,c_int64_t), beta, & + b%c, int(ldb,c_int64_t), c%c, int(ldc,c_int64_t)) end subroutine -subroutine gpu_sgeam(handle, transa, transb, m, n, alpha, a, lda, beta, & + +subroutine gpu_dgeam_f(handle, transa, transb, m, n, alpha, a, lda, beta, & b, ldb, c, ldc) - use gpu + use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*4, intent(in) :: m, n, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision, target :: a(*), b(*), c(*) + call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, & + c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t)) +end subroutine + + +subroutine gpu_dgeam_64(handle, transa, transb, m, n, alpha, a, lda, beta, & + b, ldb, c, ldc) + use gpu type(gpu_blas), intent(in) :: handle character, intent(in) :: transa, transb integer*8, intent(in) :: m, n, lda, ldb, ldc - real, intent(in) :: alpha, beta - real, target :: a(lda,*), b(ldb,*), c(ldc,*) - call gpu_sgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, & - c_loc(b), ldb, c_loc(c), ldc) + double precision, intent(in) :: alpha, beta + type(gpu_double2) :: a, b, c + call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, a%c, int(lda,c_int64_t), beta, & + b%c, int(ldb,c_int64_t), c%c, int(ldc,c_int64_t)) +end subroutine + + +subroutine gpu_dgeam_f_64(handle, transa, transb, m, n, alpha, a, lda, beta, & + b, ldb, c, ldc) + use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*8, intent(in) :: m, n, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision, target :: a(*), b(*), c(*) + call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, & + c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t)) +end subroutine + + +! gemm +! ---- + +subroutine gpu_dgemm(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) + use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*4, intent(in) :: m, n, k, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + type(gpu_double2) :: a, b, c + call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), & + alpha, a%c, int(lda,c_int64_t), & + b%c, int(ldb,c_int64_t), beta, c%c, int(ldc,c_int64_t)) +end subroutine + +subroutine gpu_dgemm_64(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) + use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*8, intent(in) :: m, n, k, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + type(gpu_double2) :: a, b, c + call gpu_dgemm_c(handle%c, transa, transb, m, n, k, & + alpha, a%c, lda, b%c, ldb, beta, c%c, ldc) +end subroutine + +subroutine gpu_dgemm_f(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) + use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*4, intent(in) :: m, n, k, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision, target :: a(*), b(*), c(*) + call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), & + alpha, c_loc(a), int(lda,c_int64_t), & + c_loc(b), int(ldb,c_int64_t), beta, c_loc(c), int(ldc,c_int64_t)) +end subroutine + +subroutine gpu_dgemm_f_64(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) + use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*8, intent(in) :: m, n, k, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision, target :: a(*), b(*), c(*) + call gpu_dgemm_c(handle%c, transa, transb, m, n, k, & + alpha, c_loc(a), lda, c_loc(b), ldb, beta, c_loc(c), ldc) end subroutine