diff --git a/configure b/configure index 41c0123d..014275eb 100755 --- a/configure +++ b/configure @@ -40,14 +40,16 @@ Usage: $(basename $0) -c $(basename $0) -h $(basename $0) -i + $(basename $0) -g [nvidia|none] Options: - -c Define a COMPILATION configuration file, - in "${QP_ROOT}/config/". - -h Print the HELP message - -i INSTALL . Use at your OWN RISK: - no support will be provided for the installation of - dependencies. + -c Define a COMPILATION configuration file, + in "${QP_ROOT}/config/". + -h Print the HELP message + -i INSTALL . Use at your OWN RISK: + no support will be provided for the installation of + dependencies. + -g [nvidia|none] Choose GPU acceleration (experimental) Example: ./$(basename $0) -c config/gfortran.cfg @@ -83,7 +85,7 @@ function execute () { PACKAGES="" -while getopts "d:c:i:h" c ; do +while getopts "d:c:i:g:h" c ; do case "$c" in c) case "$OPTARG" in @@ -100,6 +102,9 @@ while getopts "d:c:i:h" c ; do "") help ; break;; *) PACKAGES="${PACKAGE} $OPTARG" esac;; + g) + GPU=$OPTARG; + break;; h) help exit 0;; @@ -109,6 +114,23 @@ while getopts "d:c:i:h" c ; do esac done +# Handle GPU acceleration +rm -f ${QP_ROOT}/src/gpu +case "$GPU" in + amd) # Nvidia + echo "Activating AMD GPU acceleration" + ln -s ${QP_ROOT}/src/gpu_amd ${QP_ROOT}/src/gpu + ;; + nvidia) # Nvidia + echo "Activating Nvidia GPU acceleration" + ln -s ${QP_ROOT}/src/gpu_nvidia ${QP_ROOT}/src/gpu + ;; + *) # No Acceleration + echo "Disabling GPU acceleration" + ln -s ${QP_ROOT}/src/gpu_x86 ${QP_ROOT}/src/gpu + ;; +esac + # Trim leading and trailing spaces PACKAGES=$(echo $PACKAGES | xargs) diff --git a/plugins/local/tc_int/jast_grad_full.irp.f b/plugins/local/tc_int/jast_grad_full.irp.f index 78ed1edf..599d3779 100644 --- a/plugins/local/tc_int/jast_grad_full.irp.f +++ b/plugins/local/tc_int/jast_grad_full.irp.f @@ -4,7 +4,7 @@ subroutine get_grad1_u12_for_tc(ipoint, n_grid2, resx, resy, resz, res) BEGIN_DOC - ! + ! ! resx(ipoint) = [grad1 u(r1,r2)]_x1 ! resy(ipoint) = [grad1 u(r1,r2)]_y1 ! resz(ipoint) = [grad1 u(r1,r2)]_z1 @@ -59,7 +59,7 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz) double precision :: r2(3) double precision :: dx, dy, dz, r12, tmp double precision :: rn(3), f1A, grad1_f1A(3), f2A, grad2_f2A(3), g12, grad1_g12(3) - double precision :: tmp1, tmp2 + double precision :: tmp1, tmp2, dist integer :: powmax1, powmax, powmax2 double precision, allocatable :: f1A_power(:), f2A_power(:), double_p(:), g12_power(:) @@ -90,30 +90,105 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz) gradx(jpoint) = 0.d0 grady(jpoint) = 0.d0 gradz(jpoint) = 0.d0 + + call jBH_elem_fct_grad_alpha1(r1, r2, g12, grad1_g12) + +! dist = (r1(1) - r2(1)) * (r1(1) - r2(1)) & +! + (r1(2) - r2(2)) * (r1(2) - r2(2)) & +! + (r1(3) - r2(3)) * (r1(3) - r2(3)) +! +! if(dist .ge. 1d-15) then +! dist = dsqrt( dist ) +! +! tmp1 = 1.d0 / (1.d0 + dist) +! +! g12 = dist * tmp1 +! tmp2 = tmp1 * tmp1 / dist +! grad1_g12(1) = tmp2 * (r1(1) - r2(1)) +! grad1_g12(2) = tmp2 * (r1(2) - r2(2)) +! grad1_g12(3) = tmp2 * (r1(3) - r2(3)) +! +! else +! +! grad1_g12(1) = 0.d0 +! grad1_g12(2) = 0.d0 +! grad1_g12(3) = 0.d0 +! g12 = 0.d0 +! +! endif +! + do p = 1, powmax2 + g12_power(p) = g12_power(p-1) * g12 + enddo + do i_nucl = 1, nucl_num rn(1) = nucl_coord(i_nucl,1) rn(2) = nucl_coord(i_nucl,2) rn(3) = nucl_coord(i_nucl,3) - call jBH_elem_fct_grad(jBH_en(i_nucl), r1, rn, f1A, grad1_f1A) - call jBH_elem_fct_grad(jBH_en(i_nucl), r2, rn, f2A, grad2_f2A) - call jBH_elem_fct_grad(jBH_ee(i_nucl), r1, r2, g12, grad1_g12) + call jBH_elem_fct_grad_alpha1(r1, rn, f1A, grad1_f1A) +! dist = (r1(1) - rn(1)) * (r1(1) - rn(1)) & +! + (r1(2) - rn(2)) * (r1(2) - rn(2)) & +! + (r1(3) - rn(3)) * (r1(3) - rn(3)) +! if (dist > 1.d-15) then +! dist = dsqrt( dist ) +! +! tmp1 = 1.d0 / (1.d0 + dist) +! +! f1A = dist * tmp1 +! tmp2 = tmp1 * tmp1 / dist +! grad1_f1A(1) = tmp2 * (r1(1) - rn(1)) +! grad1_f1A(2) = tmp2 * (r1(2) - rn(2)) +! grad1_f1A(3) = tmp2 * (r1(3) - rn(3)) +! +! else +! +! grad1_f1A(1) = 0.d0 +! grad1_f1A(2) = 0.d0 +! grad1_f1A(3) = 0.d0 +! f1A = 0.d0 +! +! endif + + call jBH_elem_fct_grad_alpha1(r2, rn, f2A, grad2_f2A) +! dist = (r2(1) - rn(1)) * (r2(1) - rn(1)) & +! + (r2(2) - rn(2)) * (r2(2) - rn(2)) & +! + (r2(3) - rn(3)) * (r2(3) - rn(3)) +! +! if (dist > 1.d-15) then +! dist = dsqrt( dist ) +! +! tmp1 = 1.d0 / (1.d0 + dist) +! +! f2A = dist * tmp1 +! tmp2 = tmp1 * tmp1 / dist +! grad2_f2A(1) = tmp2 * (r2(1) - rn(1)) +! grad2_f2A(2) = tmp2 * (r2(2) - rn(2)) +! grad2_f2A(3) = tmp2 * (r2(3) - rn(3)) +! +! else +! +! grad2_f2A(1) = 0.d0 +! grad2_f2A(2) = 0.d0 +! grad2_f2A(3) = 0.d0 +! f2A = 0.d0 +! +! endif ! Compute powers of f1A and f2A do p = 1, powmax1 f1A_power(p) = f1A_power(p-1) * f1A f2A_power(p) = f2A_power(p-1) * f2A enddo - do p = 1, powmax2 - g12_power(p) = g12_power(p-1) * g12 - enddo do p = 1, jBH_size mpA = jBH_m(p,i_nucl) npA = jBH_n(p,i_nucl) opA = jBH_o(p,i_nucl) tmp = jBH_c(p,i_nucl) +! if (dabs(tmp) <= 1.d-10) cycle +! if(mpA .eq. npA) then tmp = tmp * 0.5d0 endif @@ -132,3 +207,39 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz) return end +subroutine jBH_elem_fct_grad_alpha1(r1, r2, fct, grad1_fct) + + implicit none + double precision, intent(in) :: r1(3), r2(3) + double precision, intent(out) :: fct, grad1_fct(3) + double precision :: dist, tmp1, tmp2 + + dist = (r1(1) - r2(1)) * (r1(1) - r2(1)) & + + (r1(2) - r2(2)) * (r1(2) - r2(2)) & + + (r1(3) - r2(3)) * (r1(3) - r2(3)) + + + if(dist .ge. 1d-15) then + dist = dsqrt( dist ) + + tmp1 = 1.d0 / (1.d0 + dist) + + fct = dist * tmp1 + tmp2 = tmp1 * tmp1 / dist + grad1_fct(1) = tmp2 * (r1(1) - r2(1)) + grad1_fct(2) = tmp2 * (r1(2) - r2(2)) + grad1_fct(3) = tmp2 * (r1(3) - r2(3)) + + else + + grad1_fct(1) = 0.d0 + grad1_fct(2) = 0.d0 + grad1_fct(3) = 0.d0 + fct = 0.d0 + + endif + + return +end + +! --- diff --git a/plugins/local/tc_int/jast_utils_bh.irp.f b/plugins/local/tc_int/jast_utils_bh.irp.f index 750ce90b..200bc5ff 100644 --- a/plugins/local/tc_int/jast_utils_bh.irp.f +++ b/plugins/local/tc_int/jast_utils_bh.irp.f @@ -1,35 +1,43 @@ ! --- + + subroutine jBH_elem_fct_grad(alpha, r1, r2, fct, grad1_fct) implicit none double precision, intent(in) :: alpha, r1(3), r2(3) double precision, intent(out) :: fct, grad1_fct(3) - double precision :: dist, tmp1, tmp2 + double precision :: dist, tmp1, tmp2, dist_inv - dist = dsqrt( (r1(1) - r2(1)) * (r1(1) - r2(1)) & - + (r1(2) - r2(2)) * (r1(2) - r2(2)) & - + (r1(3) - r2(3)) * (r1(3) - r2(3)) ) + dist = (r1(1) - r2(1)) * (r1(1) - r2(1)) & + + (r1(2) - r2(2)) * (r1(2) - r2(2)) & + + (r1(3) - r2(3)) * (r1(3) - r2(3)) - if(dist .ge. 1d-10) then + if(dist .ge. 1d-15) then + dist_inv = 1.d0/dsqrt( dist ) + dist = dist_inv * dist + tmp1 = 1.d0 / (1.d0 + alpha * dist) fct = alpha * dist * tmp1 - tmp2 = alpha * tmp1 * tmp1 / dist + tmp2 = alpha * tmp1 * tmp1 * dist_inv grad1_fct(1) = tmp2 * (r1(1) - r2(1)) grad1_fct(2) = tmp2 * (r1(2) - r2(2)) grad1_fct(3) = tmp2 * (r1(3) - r2(3)) + else + grad1_fct(1) = 0.d0 grad1_fct(2) = 0.d0 grad1_fct(3) = 0.d0 fct = 0.d0 + endif return -end +end ! --- diff --git a/src/gpu_x86/NEED b/src/gpu_x86/NEED new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/src/gpu_x86/NEED @@ -0,0 +1 @@ + diff --git a/src/gpu_x86/README.rst b/src/gpu_x86/README.rst new file mode 100644 index 00000000..f530bf29 --- /dev/null +++ b/src/gpu_x86/README.rst @@ -0,0 +1,5 @@ +======= +gpu_x86 +======= + +x86 implementation of GPU routines. For use when GPUs are not available. diff --git a/src/gpu_x86/gpu.c b/src/gpu_x86/gpu.c new file mode 100644 index 00000000..71505dbe --- /dev/null +++ b/src/gpu_x86/gpu.c @@ -0,0 +1,506 @@ +#include +#include +#include +#include +#include + + +/* Generic functions */ + +int gpu_ndevices() { + return 1; +} + +void gpu_set_device(int32_t i) { + return; +} + + +/* Allocation functions */ + +void gpu_allocate(void** ptr, const int64_t n) { + *ptr = malloc((size_t) n); + if (*ptr == NULL) { + perror("Allocation failed"); + } +} + +void gpu_free(void** ptr) { + free(*ptr); + *ptr = NULL; +} + + +/* Memory transfer functions */ + +void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) { + memcpy(gpu_ptr, cpu_ptr, n); +} + +void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) { + memcpy(cpu_ptr, gpu_ptr, n); +} + +void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) { + memcpy(gpu_ptr_dest, gpu_ptr_src, n); +} + + +/* Streams */ + +void gpu_stream_create(void** ptr) { + *ptr = (void*) 2; +} + +void gpu_stream_destroy(void** ptr) { + *ptr = NULL; +} + +void gpu_set_stream(void* handle, void* stream) { + return; +} + +void gpu_synchronize() { + return; +} + + +/* BLAS functions */ + +void gpu_blas_create(void** handle) { + *handle = (void*) 1; +} + + +void gpu_blas_destroy(void** handle) { + *handle = NULL; +} + + +double ddot_(const int32_t* n, const double* x, const int32_t* incx, const double* y, const int32_t* incy); + +void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { + assert (handle != NULL); + + /* Convert to int32_t */ + int32_t n_, incx_, incy_; + + n_ = (int32_t) n; + incx_ = (int32_t) incx; + incy_ = (int32_t) incy; + + /* Check for integer overflows */ + assert ( (int64_t) n_ == n ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + *result = ddot_(&n_, x, &incx_, y, &incy_); +} + + +float sdot_(const int32_t* n, const float* x, const int32_t* incx, const float* y, const int32_t* incy); + +void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { + assert (handle != NULL); + + /* Convert to int32_t */ + int32_t n_, incx_, incy_; + + n_ = (int32_t) n; + incx_ = (int32_t) incx; + incy_ = (int32_t) incy; + + /* Check for integer overflows */ + assert ( (int64_t) n_ == n ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + *result = sdot_(&n_, x, &incx_, y, &incy_); +} + + +void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha, + const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy); + +void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double alpha, + const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) { + + assert (handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, lda_, incx_, incy_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + lda_ = (int32_t) lda; + incx_ = (int32_t) incx; + incy_ = (int32_t) incy; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + dgemv_(&transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_); +} + + +void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha, + const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy); + +void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float alpha, + const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) { + + assert (handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, lda_, incx_, incy_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + lda_ = (int32_t) lda; + incx_ = (int32_t) incx; + incy_ = (int32_t) incy; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + sgemv_(&transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_); +} + + +void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha, + const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc); + +void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, + const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) { + + assert (handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, k_, lda_, ldb_, ldc_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + k_ = (int32_t) k; + lda_ = (int32_t) lda; + ldb_ = (int32_t) ldb; + ldc_ = (int32_t) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) k_ == k ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + dgemm_(&transa, &transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_); +} + + + +void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha, + const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc); + +void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, + const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) { + + assert (handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, k_, lda_, ldb_, ldc_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + k_ = (int32_t) k; + lda_ = (int32_t) lda; + ldb_ = (int32_t) ldb; + ldc_ = (int32_t) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) k_ == k ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + sgemm_(&transa, &transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_); +} + + +void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, + const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { + if (handle == NULL) { + perror("NULL handle"); + exit(-1); + } + + if ( (transa == 'N' && transb == 'N') || + (transa == 'n' && transb == 'N') || + (transa == 'N' && transb == 'n') || + (transa == 'n' && transb == 'n') ) { + + if (alpha == 0.) { + + for (int64_t j=0 ; j + +int gpu_ndevices(); +void gpu_set_device(int32_t i); + +void gpu_allocate(void** ptr, const int64_t n); +void gpu_free(void** ptr); + +void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n); +void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n); +void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n); + +void gpu_stream_create(void** ptr); +void gpu_stream_destroy(void** ptr); +void gpu_set_stream(void* handle, void* stream); +void gpu_synchronize(); + +void gpu_blas_create(void** handle); +void gpu_blas_destroy(void** handle); + +void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result); + +void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result); + +void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double alpha, + const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy); + +void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float alpha, + const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy); + +void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, + const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc); + +void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, + const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc); + +void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, + const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc); + +void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, + const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc); diff --git a/src/gpu_x86/gpu_module.F90 b/src/gpu_x86/gpu_module.F90 new file mode 100644 index 00000000..86ba3926 --- /dev/null +++ b/src/gpu_x86/gpu_module.F90 @@ -0,0 +1,141 @@ +module gpu + use, intrinsic :: iso_c_binding, only : c_int32_t, c_int64_t, c_double, c_size_t, c_char + implicit none + + interface + integer function gpu_ndevices() bind(C) + end function + + subroutine gpu_set_device(id) bind(C) + import + integer(c_int32_t), value :: id + end subroutine + + subroutine gpu_allocate_c(ptr, n) bind(C, name='gpu_allocate') + import + type(c_ptr) :: ptr + integer(c_int64_t), value :: n + end subroutine + + subroutine gpu_free_c(ptr) bind(C, name='gpu_free') + import + type(c_ptr) :: ptr + end subroutine + + subroutine gpu_upload_c(cpu_ptr, gpu_ptr, n) bind(C, name='gpu_upload') + import + type(c_ptr), value :: cpu_ptr + type(c_ptr), value :: gpu_ptr + integer(c_int64_t), value :: n + end subroutine + + subroutine gpu_download_c(gpu_ptr, cpu_ptr, n) bind(C, name='gpu_download') + import + type(c_ptr), value :: gpu_ptr + type(c_ptr), value :: cpu_ptr + integer(c_int64_t), value :: n + end subroutine + + subroutine gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, n) bind(C, name='gpu_copy') + import + type(c_ptr), value :: gpu_ptr_src + type(c_ptr), value :: gpu_ptr_dest + integer(c_int64_t), value :: n + end subroutine + + subroutine gpu_stream_create(stream) bind(C) + import + type(c_ptr) :: stream + end subroutine + + subroutine gpu_stream_destroy(stream) bind(C) + import + type(c_ptr) :: stream + end subroutine + + subroutine gpu_set_stream(handle, stream) bind(C) + import + type(c_ptr) :: handle, stream + end subroutine + + subroutine gpu_synchronize() + end subroutine + + subroutine gpu_blas_create(handle) bind(C) + import + type(c_ptr) :: handle + end subroutine + + subroutine gpu_blas_destroy(handle) bind(C) + import + type(c_ptr) :: handle + end subroutine + + subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) bind(C) + import + type(c_ptr), intent(in) :: handle + integer(c_int64_t), value :: n, incx, incy + real(c_double), intent(in) :: dx(*), dy(*) + real(c_double), intent(out) :: res + end subroutine + + subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res) bind(C) + import + type(c_ptr), intent(in) :: handle + integer(c_int64_t), value :: n, incx, incy + real(c_float), intent(in) :: dx(*), dy(*) + real(c_float), intent(out) :: res + end subroutine + + end interface + +end module + +subroutine gpu_allocate_double(ptr, s) + use gpu + implicit none + double precision, pointer, intent(inout) :: ptr + integer*8, intent(in) :: s(*) + type(c_ptr) :: cptr + + call gpu_allocate_c(cptr, sum(s)*8_8) + call c_f_pointer(cptr, ptr, s) +end subroutine + +subroutine gpu_free_double(ptr) + use gpu + implicit none + double precision, pointer, intent(inout) :: ptr + type(c_ptr) :: cptr + cptr = cloc(ptr) + call gpu_free(cptr) + NULLIFY(ptr) +end subroutine + +subroutine gpu_upload_double(cpu_ptr, gpu_ptr, n) + use gpu + implicit none + double precision, intent(in) :: cpu_ptr(*) + double precision, intent(out) :: gpu_ptr(*) + integer(c_int64_t), intent(in) :: n + call gpu_upload_c(cpu_ptr, gpu_ptr, 8_8*n) +end subroutine + +subroutine gpu_download_double(gpu_ptr, cpu_ptr, n) + use gpu + implicit none + double precision, intent(in) :: gpu_ptr(*) + double precision, intent(out) :: cpu_ptr(*) + integer(c_int64_t), intent(in) :: n + call gpu_download_c(gpu_ptr, cpu_ptr, 8_8*n) +end subroutine + +subroutine gpu_copy_double(gpu_ptr_src, gpu_ptr_dest, n) + use gpu + implicit none + double precision, intent(in) :: gpu_ptr_src(*) + double precision, intent(out) :: gpu_ptr_dest(*) + integer(c_int64_t), intent(in) :: n + call gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, 8_8*n) +end subroutine +