Merge pull request #38 from QuantumPackage/dev-stable

Dev stable
2025-04-28 11:14:43 +02:00 · 2024-06-26 20:18:24 +02:00 · 2024-06-26 20:18:24 +02:00 · 48c54fc20e
commit 48c54fc20e
parent 1d0bac25d0 a9d2f0e188
8 changed files with 857 additions and 22 deletions
--- a/24
+++ b/24
@ -40,6 +40,7 @@ Usage:
  $(basename $0) -c <file>    
  $(basename $0) -h           
  $(basename $0) -i <package> 
+  $(basename $0) -g [nvidia|none]

 Options:
  -c  <file>        Define a COMPILATION configuration file,
@ -48,6 +49,7 @@ Options:
  -i <package>      INSTALL <package>. Use at your OWN RISK:
                    no support will be provided for the installation of
                    dependencies.
+  -g [nvidia|none]  Choose GPU acceleration (experimental)

 Example:
  ./$(basename $0) -c config/gfortran.cfg
@ -83,7 +85,7 @@ function execute () {
 PACKAGES=""


-while getopts "d:c:i:h" c ; do
+while getopts "d:c:i:g:h" c ; do
    case "$c" in
        c)
            case "$OPTARG" in
@ -100,6 +102,9 @@ while getopts "d:c:i:h" c ; do
                "") help ; break;;
                *) PACKAGES="${PACKAGE} $OPTARG"
            esac;;
+        g)
+            GPU=$OPTARG;
+            break;;
        h)
            help
            exit 0;;
@ -109,6 +114,23 @@ while getopts "d:c:i:h" c ; do
    esac
 done

+# Handle GPU acceleration
+rm -f ${QP_ROOT}/src/gpu
+case "$GPU" in
+  amd) # Nvidia
+     echo "Activating AMD GPU acceleration"
+     ln -s ${QP_ROOT}/src/gpu_amd ${QP_ROOT}/src/gpu
+     ;;
+  nvidia) # Nvidia
+     echo "Activating Nvidia GPU acceleration"
+     ln -s ${QP_ROOT}/src/gpu_nvidia ${QP_ROOT}/src/gpu
+     ;;
+  *) # No Acceleration
+     echo "Disabling GPU acceleration"
+     ln -s ${QP_ROOT}/src/gpu_x86 ${QP_ROOT}/src/gpu
+     ;;
+esac
+
 # Trim leading and trailing spaces
 PACKAGES=$(echo $PACKAGES | xargs)

--- a/plugins/local/tc_int/jast_grad_full.irp.f
+++ b/plugins/local/tc_int/jast_grad_full.irp.f
@ -59,7 +59,7 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
  double precision              :: r2(3)
  double precision              :: dx, dy, dz, r12, tmp
  double precision              :: rn(3), f1A, grad1_f1A(3), f2A, grad2_f2A(3), g12, grad1_g12(3)
-  double precision              :: tmp1, tmp2
+  double precision              :: tmp1, tmp2, dist
  integer                       :: powmax1, powmax, powmax2
  double precision, allocatable :: f1A_power(:), f2A_power(:), double_p(:), g12_power(:)

@ -90,30 +90,105 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
    gradx(jpoint) = 0.d0
    grady(jpoint) = 0.d0
    gradz(jpoint) = 0.d0
+
+    call jBH_elem_fct_grad_alpha1(r1, r2, g12, grad1_g12)
+
+!    dist =   (r1(1) - r2(1)) * (r1(1) - r2(1)) &
+!           + (r1(2) - r2(2)) * (r1(2) - r2(2)) &
+!           + (r1(3) - r2(3)) * (r1(3) - r2(3))
+!
+!    if(dist .ge. 1d-15) then
+!      dist = dsqrt( dist )
+!
+!      tmp1 = 1.d0 / (1.d0 + dist)
+!
+!      g12 = dist * tmp1
+!      tmp2 = tmp1 * tmp1 / dist
+!      grad1_g12(1) = tmp2 * (r1(1) - r2(1))
+!      grad1_g12(2) = tmp2 * (r1(2) - r2(2))
+!      grad1_g12(3) = tmp2 * (r1(3) - r2(3))
+!
+!    else
+!
+!      grad1_g12(1) = 0.d0
+!      grad1_g12(2) = 0.d0
+!      grad1_g12(3) = 0.d0
+!      g12 = 0.d0
+!
+!    endif
+!
+    do p = 1, powmax2
+      g12_power(p) = g12_power(p-1) * g12
+    enddo
+
    do i_nucl = 1, nucl_num

      rn(1) = nucl_coord(i_nucl,1)
      rn(2) = nucl_coord(i_nucl,2)
      rn(3) = nucl_coord(i_nucl,3)

-      call jBH_elem_fct_grad(jBH_en(i_nucl), r1, rn, f1A, grad1_f1A)
-      call jBH_elem_fct_grad(jBH_en(i_nucl), r2, rn, f2A, grad2_f2A)
-      call jBH_elem_fct_grad(jBH_ee(i_nucl), r1, r2, g12, grad1_g12)
+        call jBH_elem_fct_grad_alpha1(r1, rn, f1A, grad1_f1A)
+!      dist =   (r1(1) - rn(1)) * (r1(1) - rn(1)) &
+!             + (r1(2) - rn(2)) * (r1(2) - rn(2)) &
+!             + (r1(3) - rn(3)) * (r1(3) - rn(3))
+!      if (dist > 1.d-15) then
+!        dist = dsqrt( dist )
+!
+!        tmp1 = 1.d0 / (1.d0 + dist)
+!
+!        f1A = dist * tmp1
+!        tmp2 = tmp1 * tmp1 / dist
+!        grad1_f1A(1) = tmp2 * (r1(1) - rn(1))
+!        grad1_f1A(2) = tmp2 * (r1(2) - rn(2))
+!        grad1_f1A(3) = tmp2 * (r1(3) - rn(3))
+!
+!      else
+!
+!        grad1_f1A(1) = 0.d0
+!        grad1_f1A(2) = 0.d0
+!        grad1_f1A(3) = 0.d0
+!        f1A = 0.d0
+!
+!      endif
+
+        call jBH_elem_fct_grad_alpha1(r2, rn, f2A, grad2_f2A)
+!      dist =   (r2(1) - rn(1)) * (r2(1) - rn(1)) &
+!             + (r2(2) - rn(2)) * (r2(2) - rn(2)) &
+!             + (r2(3) - rn(3)) * (r2(3) - rn(3))
+!
+!      if (dist > 1.d-15) then
+!        dist = dsqrt( dist )
+!
+!        tmp1 = 1.d0 / (1.d0 + dist)
+!
+!        f2A = dist * tmp1
+!        tmp2 = tmp1 * tmp1 / dist
+!        grad2_f2A(1) = tmp2 * (r2(1) - rn(1))
+!        grad2_f2A(2) = tmp2 * (r2(2) - rn(2))
+!        grad2_f2A(3) = tmp2 * (r2(3) - rn(3))
+!
+!      else
+!
+!        grad2_f2A(1) = 0.d0
+!        grad2_f2A(2) = 0.d0
+!        grad2_f2A(3) = 0.d0
+!        f2A = 0.d0
+!
+!      endif

      ! Compute powers of f1A and f2A
      do p = 1, powmax1
        f1A_power(p) = f1A_power(p-1) * f1A
        f2A_power(p) = f2A_power(p-1) * f2A
      enddo
-      do p = 1, powmax2
-        g12_power(p) = g12_power(p-1) * g12
-      enddo

      do p = 1, jBH_size
        mpA = jBH_m(p,i_nucl)
        npA = jBH_n(p,i_nucl)
        opA = jBH_o(p,i_nucl)
        tmp = jBH_c(p,i_nucl)
+!        if (dabs(tmp) <= 1.d-10) cycle
+!
        if(mpA .eq. npA) then
          tmp = tmp * 0.5d0
        endif
@ -132,3 +207,39 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
  return
 end

+subroutine jBH_elem_fct_grad_alpha1(r1, r2, fct, grad1_fct)
+
+  implicit none
+  double precision, intent(in)  :: r1(3), r2(3)
+  double precision, intent(out) :: fct, grad1_fct(3)
+  double precision              :: dist, tmp1, tmp2
+
+  dist =   (r1(1) - r2(1)) * (r1(1) - r2(1)) &
+         + (r1(2) - r2(2)) * (r1(2) - r2(2)) &
+         + (r1(3) - r2(3)) * (r1(3) - r2(3))
+
+
+  if(dist .ge. 1d-15) then
+    dist = dsqrt( dist )
+
+    tmp1 = 1.d0 / (1.d0 + dist)
+
+    fct = dist * tmp1
+    tmp2 = tmp1 * tmp1 / dist
+    grad1_fct(1) = tmp2 * (r1(1) - r2(1))
+    grad1_fct(2) = tmp2 * (r1(2) - r2(2))
+    grad1_fct(3) = tmp2 * (r1(3) - r2(3))
+
+  else
+
+    grad1_fct(1) = 0.d0
+    grad1_fct(2) = 0.d0
+    grad1_fct(3) = 0.d0
+    fct = 0.d0
+
+  endif
+
+  return
+end
+
+! ---
--- a/plugins/local/tc_int/jast_utils_bh.irp.f
+++ b/plugins/local/tc_int/jast_utils_bh.irp.f
@ -1,31 +1,39 @@

 ! ---

+
+
 subroutine jBH_elem_fct_grad(alpha, r1, r2, fct, grad1_fct)

  implicit none
  double precision, intent(in)  :: alpha, r1(3), r2(3)
  double precision, intent(out) :: fct, grad1_fct(3)
-  double precision              :: dist, tmp1, tmp2
+  double precision              :: dist, tmp1, tmp2, dist_inv

-  dist = dsqrt( (r1(1) - r2(1)) * (r1(1) - r2(1)) &
+  dist =   (r1(1) - r2(1)) * (r1(1) - r2(1)) &
         + (r1(2) - r2(2)) * (r1(2) - r2(2)) &
-              + (r1(3) - r2(3)) * (r1(3) - r2(3)) )
+         + (r1(3) - r2(3)) * (r1(3) - r2(3))


-  if(dist .ge. 1d-10) then
+  if(dist .ge. 1d-15) then
+    dist_inv = 1.d0/dsqrt( dist )
+    dist = dist_inv * dist
+
    tmp1 = 1.d0 / (1.d0 + alpha * dist)

    fct = alpha * dist * tmp1
-    tmp2 = alpha * tmp1 * tmp1 / dist
+    tmp2 = alpha * tmp1 * tmp1 * dist_inv
    grad1_fct(1) = tmp2 * (r1(1) - r2(1))
    grad1_fct(2) = tmp2 * (r1(2) - r2(2))
    grad1_fct(3) = tmp2 * (r1(3) - r2(3))
+
  else
+
    grad1_fct(1) = 0.d0
    grad1_fct(2) = 0.d0
    grad1_fct(3) = 0.d0
    fct = 0.d0
+
  endif

  return
--- a/src/gpu_x86/NEED
+++ b/src/gpu_x86/NEED
@ -0,0 +1 @@
+
--- a/src/gpu_x86/README.rst
+++ b/src/gpu_x86/README.rst
@ -0,0 +1,5 @@
+=======
+gpu_x86
+=======
+
+x86 implementation of GPU routines. For use when GPUs are not available.
--- a/src/gpu_x86/gpu.c
+++ b/src/gpu_x86/gpu.c
@ -0,0 +1,506 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+
+/* Generic functions */
+
+int gpu_ndevices() {
+  return 1;
+}
+
+void gpu_set_device(int32_t i) {
+  return;
+}
+
+
+/* Allocation functions */
+
+void gpu_allocate(void** ptr, const int64_t n) {
+  *ptr = malloc((size_t) n);
+  if (*ptr == NULL) {
+    perror("Allocation failed");
+  }
+}
+
+void gpu_free(void** ptr) {
+  free(*ptr);
+  *ptr = NULL;
+}
+
+
+/* Memory transfer functions */
+
+void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
+  memcpy(gpu_ptr, cpu_ptr, n);
+}
+
+void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
+  memcpy(cpu_ptr, gpu_ptr, n);
+}
+
+void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
+  memcpy(gpu_ptr_dest, gpu_ptr_src, n);
+}
+
+
+/* Streams */
+
+void gpu_stream_create(void** ptr) {
+  *ptr = (void*) 2;
+}
+
+void gpu_stream_destroy(void** ptr) {
+  *ptr = NULL;
+}
+
+void gpu_set_stream(void* handle, void* stream) {
+  return;
+}
+
+void gpu_synchronize() {
+  return;
+}
+
+
+/* BLAS functions */
+
+void gpu_blas_create(void** handle) {
+  *handle = (void*) 1;
+}
+
+
+void gpu_blas_destroy(void** handle) {
+  *handle = NULL;
+}
+
+
+double ddot_(const int32_t* n, const double* x, const int32_t* incx, const double* y, const int32_t* incy);
+
+void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
+  assert (handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t n_, incx_, incy_;
+
+  n_    = (int32_t) n;
+  incx_ = (int32_t) incx;
+  incy_ = (int32_t) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  *result = ddot_(&n_, x, &incx_, y, &incy_);
+}
+
+
+float sdot_(const int32_t* n, const float* x, const int32_t* incx, const float* y, const int32_t* incy);
+
+void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
+  assert (handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t n_, incx_, incy_;
+
+  n_    = (int32_t) n;
+  incx_ = (int32_t) incx;
+  incy_ = (int32_t) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  *result = sdot_(&n_, x, &incx_, y, &incy_);
+}
+
+
+void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha,
+            const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy);
+
+void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double alpha,
+               const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) {
+
+  assert (handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, lda_, incx_, incy_;
+
+  m_    = (int32_t) m;
+  n_    = (int32_t) n;
+  lda_  = (int32_t) lda;
+  incx_ = (int32_t) incx;
+  incy_ = (int32_t) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    m_ == m   );
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t)  lda_ == lda );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  dgemv_(&transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_);
+}
+
+
+void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha,
+               const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy);
+
+void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float alpha,
+               const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) {
+
+  assert (handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, lda_, incx_, incy_;
+
+  m_    = (int32_t) m;
+  n_    = (int32_t) n;
+  lda_  = (int32_t) lda;
+  incx_ = (int32_t) incx;
+  incy_ = (int32_t) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    m_ == m   );
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t)  lda_ == lda );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  sgemv_(&transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_);
+}
+
+
+void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha,
+            const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc);
+
+void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
+               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) {
+
+  assert (handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, k_, lda_, ldb_, ldc_;
+
+  m_   = (int32_t) m;
+  n_   = (int32_t) n;
+  k_   = (int32_t) k;
+  lda_ = (int32_t) lda;
+  ldb_ = (int32_t) ldb;
+  ldc_ = (int32_t) ldc;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)   m_ == m  );
+  assert ( (int64_t)   n_ == n  );
+  assert ( (int64_t)   k_ == k  );
+  assert ( (int64_t) lda_ == lda);
+  assert ( (int64_t) ldb_ == ldb);
+  assert ( (int64_t) ldc_ == ldc);
+
+  dgemm_(&transa, &transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_);
+}
+
+
+
+void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha,
+            const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc);
+
+void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
+               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) {
+
+  assert (handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, k_, lda_, ldb_, ldc_;
+
+  m_   = (int32_t) m;
+  n_   = (int32_t) n;
+  k_   = (int32_t) k;
+  lda_ = (int32_t) lda;
+  ldb_ = (int32_t) ldb;
+  ldc_ = (int32_t) ldc;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)   m_ == m  );
+  assert ( (int64_t)   n_ == n  );
+  assert ( (int64_t)   k_ == k  );
+  assert ( (int64_t) lda_ == lda);
+  assert ( (int64_t) ldb_ == ldb);
+  assert ( (int64_t) ldc_ == ldc);
+
+  sgemm_(&transa, &transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_);
+}
+
+
+void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
+               const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
+  if (handle == NULL) {
+    perror("NULL handle");
+    exit(-1);
+  }
+
+  if ( (transa == 'N' && transb == 'N') ||
+       (transa == 'n' && transb == 'N') ||
+       (transa == 'N' && transb == 'n') ||
+       (transa == 'n' && transb == 'n') ) {
+
+     if (alpha == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = beta * b[j*ldb+i];
+         }
+       }
+
+     } else if (beta == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[j*lda+i];
+         }
+       }
+
+     } else {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[j*ldb+i];
+         }
+       }
+
+     }
+
+  } else if ( (transa == 'N' && transb == 'T') ||
+              (transa == 'n' && transb == 'T') ||
+              (transa == 'N' && transb == 't') ||
+              (transa == 'n' && transb == 't') ) {
+
+     if (alpha == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = beta * b[i*ldb+j];
+         }
+       }
+
+     } else if (beta == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[j*lda+i];
+         }
+       }
+
+     } else {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[i*ldb+j];
+         }
+       }
+
+     }
+
+  } else if ( (transa == 'T' && transb == 'N') ||
+              (transa == 't' && transb == 'N') ||
+              (transa == 'T' && transb == 'n') ||
+              (transa == 't' && transb == 'n') ) {
+
+     if (alpha == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = beta * b[j*ldb+i];
+         }
+       }
+
+     } else if (beta == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[i*lda+j];
+         }
+       }
+
+     } else {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[j*ldb+i];
+         }
+       }
+
+     }
+
+  } else if ( (transa == 'T' && transb == 'T') ||
+              (transa == 't' && transb == 'T') ||
+              (transa == 'T' && transb == 't') ||
+              (transa == 't' && transb == 't') ) {
+
+     if (alpha == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = beta * b[i*ldb+j];
+         }
+       }
+
+     } else if (beta == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[i*lda+j];
+         }
+       }
+
+     } else {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[i*ldb+j];
+         }
+       }
+
+     }
+
+  }
+}
+
+
+void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
+               const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
+  if (handle == NULL) {
+    perror("NULL handle");
+    exit(-1);
+  }
+
+  if ( (transa == 'N' && transb == 'N') ||
+       (transa == 'n' && transb == 'N') ||
+       (transa == 'N' && transb == 'n') ||
+       (transa == 'n' && transb == 'n') ) {
+
+     if (alpha == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = beta * b[j*ldb+i];
+         }
+       }
+
+     } else if (beta == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[j*lda+i];
+         }
+       }
+
+     } else {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[j*ldb+i];
+         }
+       }
+
+     }
+
+  } else if ( (transa == 'N' && transb == 'T') ||
+              (transa == 'n' && transb == 'T') ||
+              (transa == 'N' && transb == 't') ||
+              (transa == 'n' && transb == 't') ) {
+
+     if (alpha == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = beta * b[i*ldb+j];
+         }
+       }
+
+     } else if (beta == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[j*lda+i];
+         }
+       }
+
+     } else {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[i*ldb+j];
+         }
+       }
+
+     }
+
+  } else if ( (transa == 'T' && transb == 'N') ||
+              (transa == 't' && transb == 'N') ||
+              (transa == 'T' && transb == 'n') ||
+              (transa == 't' && transb == 'n') ) {
+
+     if (alpha == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = beta * b[j*ldb+i];
+         }
+       }
+
+     } else if (beta == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[i*lda+j];
+         }
+       }
+
+     } else {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[j*ldb+i];
+         }
+       }
+
+     }
+
+  } else if ( (transa == 'T' && transb == 'T') ||
+              (transa == 't' && transb == 'T') ||
+              (transa == 'T' && transb == 't') ||
+              (transa == 't' && transb == 't') ) {
+
+     if (alpha == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = beta * b[i*ldb+j];
+         }
+       }
+
+     } else if (beta == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[i*lda+j];
+         }
+       }
+
+     } else {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[i*ldb+j];
+         }
+       }
+
+     }
+
+  }
+}
--- a/src/gpu_x86/gpu.h
+++ b/src/gpu_x86/gpu.h
@ -0,0 +1,41 @@
+#include <stdint.h>
+
+int  gpu_ndevices();
+void gpu_set_device(int32_t i);
+
+void gpu_allocate(void** ptr, const int64_t n);
+void gpu_free(void** ptr);
+
+void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n);
+void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n);
+void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n);
+
+void gpu_stream_create(void** ptr);
+void gpu_stream_destroy(void** ptr);
+void gpu_set_stream(void* handle, void* stream);
+void gpu_synchronize();
+
+void gpu_blas_create(void** handle);
+void gpu_blas_destroy(void** handle);
+
+void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result);
+
+void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result);
+
+void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double alpha,
+               const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy);
+
+void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float alpha,
+               const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy);
+
+void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
+               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc);
+
+void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
+               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc);
+
+void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
+               const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc);
+
+void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
+               const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc);
--- a/src/gpu_x86/gpu_module.F90
+++ b/src/gpu_x86/gpu_module.F90
@ -0,0 +1,141 @@
+module gpu
+  use, intrinsic :: iso_c_binding, only : c_int32_t, c_int64_t, c_double, c_size_t, c_char
+  implicit none
+
+  interface
+    integer function gpu_ndevices() bind(C)
+    end function
+
+    subroutine gpu_set_device(id) bind(C)
+      import
+      integer(c_int32_t), value :: id
+    end subroutine
+
+    subroutine gpu_allocate_c(ptr, n) bind(C, name='gpu_allocate')
+      import
+      type(c_ptr) :: ptr
+      integer(c_int64_t), value :: n
+    end subroutine
+
+    subroutine gpu_free_c(ptr) bind(C, name='gpu_free')
+      import
+      type(c_ptr) :: ptr
+    end subroutine
+
+    subroutine gpu_upload_c(cpu_ptr, gpu_ptr, n) bind(C, name='gpu_upload')
+      import
+      type(c_ptr), value :: cpu_ptr
+      type(c_ptr), value :: gpu_ptr
+      integer(c_int64_t), value :: n
+    end subroutine
+
+    subroutine gpu_download_c(gpu_ptr, cpu_ptr, n) bind(C, name='gpu_download')
+      import
+      type(c_ptr), value :: gpu_ptr
+      type(c_ptr), value :: cpu_ptr
+      integer(c_int64_t), value :: n
+    end subroutine
+
+    subroutine gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, n) bind(C, name='gpu_copy')
+      import
+      type(c_ptr), value :: gpu_ptr_src
+      type(c_ptr), value :: gpu_ptr_dest
+      integer(c_int64_t), value :: n
+    end subroutine
+
+    subroutine gpu_stream_create(stream) bind(C)
+      import
+      type(c_ptr) :: stream
+    end subroutine
+
+    subroutine gpu_stream_destroy(stream) bind(C)
+      import
+      type(c_ptr) :: stream
+    end subroutine
+
+    subroutine gpu_set_stream(handle, stream) bind(C)
+      import
+      type(c_ptr) :: handle, stream
+    end subroutine
+
+    subroutine gpu_synchronize()
+    end subroutine
+
+    subroutine gpu_blas_create(handle) bind(C)
+      import
+      type(c_ptr) :: handle
+    end subroutine
+
+    subroutine gpu_blas_destroy(handle) bind(C)
+      import
+      type(c_ptr) :: handle
+    end subroutine
+
+    subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) bind(C)
+      import
+      type(c_ptr), intent(in)     :: handle
+      integer(c_int64_t), value   :: n, incx, incy
+      real(c_double), intent(in)  :: dx(*), dy(*)
+      real(c_double), intent(out) :: res
+    end subroutine
+
+    subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res) bind(C)
+      import
+      type(c_ptr), intent(in)     :: handle
+      integer(c_int64_t), value   :: n, incx, incy
+      real(c_float), intent(in)   :: dx(*), dy(*)
+      real(c_float), intent(out)  :: res
+    end subroutine
+
+  end interface
+
+end module
+
+subroutine gpu_allocate_double(ptr, s)
+  use gpu
+  implicit none
+  double precision, pointer, intent(inout) :: ptr
+  integer*8, intent(in) :: s(*)
+  type(c_ptr) :: cptr
+
+  call gpu_allocate_c(cptr, sum(s)*8_8)
+  call c_f_pointer(cptr, ptr, s)
+end subroutine
+
+subroutine gpu_free_double(ptr)
+  use gpu
+  implicit none
+  double precision, pointer, intent(inout) :: ptr
+  type(c_ptr) :: cptr
+  cptr = cloc(ptr)
+  call gpu_free(cptr)
+  NULLIFY(ptr)
+end subroutine
+
+subroutine gpu_upload_double(cpu_ptr, gpu_ptr, n)
+  use gpu
+  implicit none
+  double precision, intent(in)   :: cpu_ptr(*)
+  double precision, intent(out)  :: gpu_ptr(*)
+  integer(c_int64_t), intent(in) :: n
+  call gpu_upload_c(cpu_ptr, gpu_ptr, 8_8*n)
+end subroutine
+
+subroutine gpu_download_double(gpu_ptr, cpu_ptr, n)
+  use gpu
+  implicit none
+  double precision, intent(in)   :: gpu_ptr(*)
+  double precision, intent(out)  :: cpu_ptr(*)
+  integer(c_int64_t), intent(in) :: n
+  call gpu_download_c(gpu_ptr, cpu_ptr, 8_8*n)
+end subroutine
+
+subroutine gpu_copy_double(gpu_ptr_src, gpu_ptr_dest, n)
+  use gpu
+  implicit none
+  double precision, intent(in)   :: gpu_ptr_src(*)
+  double precision, intent(out)  :: gpu_ptr_dest(*)
+  integer(c_int64_t), intent(in) :: n
+  call gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, 8_8*n)
+end subroutine
+