diff --git a/configure b/configure
index 41c0123d..43ca9f6d 100755
--- a/configure
+++ b/configure
@@ -40,14 +40,16 @@ Usage:
   $(basename $0) -c <file>
   $(basename $0) -h
   $(basename $0) -i <package>
+  $(basename $0) -g [nvidia|intel|none]
 
 Options:
-  -c <file>     Define a COMPILATION configuration file,
-                in "${QP_ROOT}/config/".
-  -h            Print the HELP message
-  -i <package>  INSTALL <package>. Use at your OWN RISK:
-                no support will be provided for the installation of
-                dependencies.
+  -c <file>              Define a COMPILATION configuration file,
+                         in "${QP_ROOT}/config/".
+  -h                     Print the HELP message
+  -i <package>           INSTALL <package>. Use at your OWN RISK:
+                         no support will be provided for the installation of
+                         dependencies.
+  -g [nvidia|intel|none] Choose GPU acceleration
 
 Example:
 ./$(basename $0) -c config/gfortran.cfg
@@ -83,7 +85,7 @@ function execute () {
 
 PACKAGES=""
 
-while getopts "d:c:i:h" c ; do
+while getopts "d:c:i:g:h" c ; do
   case "$c" in
     c)
       case "$OPTARG" in
@@ -100,6 +102,9 @@ while getopts "d:c:i:h" c ; do
         "") help ; break;;
         *) PACKAGES="${PACKAGE} $OPTARG"
       esac;;
+    g)
+      GPU=$OPTARG;
+      break;;
     h)
       help
       exit 0;;
@@ -109,6 +114,27 @@ while getopts "d:c:i:h" c ; do
   esac
 done
 
+# Handle GPU acceleration
+rm -f ${QP_ROOT}/src/gpu_arch
+case "$GPU" in
+  amd) # AMD
+    echo "Activating AMD GPU acceleration"
+    ln -s ${QP_ROOT}/plugins/local/gpu_amd ${QP_ROOT}/src/gpu_arch
+    ;;
+  intel) # Intel
+    echo "Activating Intel GPU acceleration (EXPERIMENTAL)"
+    ln -s ${QP_ROOT}/plugins/local/gpu_intel ${QP_ROOT}/src/gpu_arch
+    ;;
+  nvidia) # Nvidia
+    echo "Activating Nvidia GPU acceleration"
+    ln -s ${QP_ROOT}/plugins/local/gpu_nvidia ${QP_ROOT}/src/gpu_arch
+    ;;
+  *) # No Acceleration
+    echo "Disabling GPU acceleration"
+    ln -s ${QP_ROOT}/plugins/local/gpu_x86 ${QP_ROOT}/src/gpu_arch
+    ;;
+esac
+
 # Trim leading and trailing spaces
 PACKAGES=$(echo $PACKAGES | xargs)
 
diff --git a/plugins/local/gpu_intel/LIB b/plugins/local/gpu_intel/LIB
new file mode 100644
index 00000000..199b0f1c
--- /dev/null
+++ b/plugins/local/gpu_intel/LIB
@@ -0,0 +1,2 @@
+-ltbb -lsycl -lmkl_sycl -lgpu -limf -lintlc -lstdc++
+
diff --git a/plugins/local/gpu_intel/NEED b/plugins/local/gpu_intel/NEED
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/plugins/local/gpu_intel/NEED
@@ -0,0 +1 @@
+
diff --git a/plugins/local/gpu_intel/README.rst b/plugins/local/gpu_intel/README.rst
new file mode 100644
index 00000000..d42e2557
--- /dev/null
+++ b/plugins/local/gpu_intel/README.rst
@@ -0,0 +1,8 @@
+=========
+gpu_intel
+=========
+
+Intel implementation of GPU routines. Uses MKL and SYCL.
+```bash
+icpx -fsycl gpu.cxx -c -qmkl=sequential
+```
diff --git a/plugins/local/gpu_intel/gpu.sycl b/plugins/local/gpu_intel/gpu.sycl
new file mode 100644
index 00000000..1f9f89ce
--- /dev/null
+++ b/plugins/local/gpu_intel/gpu.sycl
@@ -0,0 +1,177 @@
+#include <sycl/sycl.hpp>
+#include <oneapi/mkl.hpp>
+#include <cassert>
+#include <iostream>
+
+extern "C" {
+
+/* Generic functions */
+
+int gpu_ndevices() {
+  return 1;
+}
+
+void gpu_set_device(int32_t igpu) {
+}
+
+
+/* Allocation functions */
+
+void gpu_allocate(void** ptr, int64_t size) {
+  auto queue = sycl::queue(sycl::default_selector_v);
+
+  try {
+    *ptr = sycl::malloc_shared(size, queue);
+    assert(*ptr != nullptr);
+  } catch (const sycl::exception& e) {
+    std::cerr << "SYCL exception caught: " << e.what() << std::endl;
+    *ptr = nullptr; // If allocation fails, set pointer to nullptr
+  }
+}
+
+void gpu_deallocate(void** ptr) {
+  assert(*ptr != nullptr);
+  sycl::free(*ptr, sycl::queue(sycl::default_selector_v));
+  *ptr = nullptr;
+}
+
+/* Upload data from host to device */
+void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
+  sycl::queue queue(sycl::default_selector_v);
+  queue.memcpy(gpu_ptr, cpu_ptr, n).wait();
+}
+
+/* Download data from device to host */
+void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
+  sycl::queue queue(sycl::default_selector_v);
+  queue.memcpy(cpu_ptr, gpu_ptr, n).wait();
+}
+
+/* Copy data from one GPU memory location to another */
+void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
+  sycl::queue queue(sycl::default_selector_v);
+  queue.memcpy(gpu_ptr_dest, gpu_ptr_src, n).wait();
+}
+
+/* Queues */
+
+/* SYCL queue as a replacement for CUDA stream */
+void gpu_stream_create(sycl::queue** ptr) {
+  *ptr = new sycl::queue(sycl::default_selector_v);
+}
+
+void gpu_stream_destroy(sycl::queue** ptr) {
+  assert(*ptr != nullptr);
+  delete *ptr;
+  *ptr = nullptr;
+}
+
+void gpu_synchronize() {
+  sycl::queue queue(sycl::default_selector_v);
+  queue.wait_and_throw();
+}
+
+/* BLAS functions */
+
+typedef struct {
+  sycl::queue* queue;
+} blasHandle_t;
+
+void gpu_set_stream(blasHandle_t* handle, sycl::queue* ptr) {
+  handle->queue = ptr;
+}
+
+void gpu_blas_create(blasHandle_t** ptr) {
+  *ptr = (blasHandle_t*) malloc(sizeof(blasHandle_t));
+  assert(*ptr != nullptr);
+  (*ptr)->queue = new sycl::queue(sycl::default_selector_v);
+  assert((*ptr)->queue != nullptr);
+}
+
+void gpu_blas_destroy(blasHandle_t** ptr) {
+  assert(*ptr != nullptr);
+  delete (*ptr)->queue;
+  free(*ptr);
+  *ptr = nullptr;
+}
+
+
+void gpu_ddot(blasHandle_t* handle, const int64_t n, const double* x, const int64_t incx,
+              const double* y, const int64_t incy, double* result) {
+  // Ensure input parameters are valid
+  assert(handle != nullptr);
+  assert(handle->queue != nullptr);
+  assert(n > 0);
+  assert(incx > 0);
+  assert(incy > 0);
+  assert(x != nullptr);
+  assert(y != nullptr);
+  assert(result != nullptr);
+
+  oneapi::mkl::blas::dot(*handle->queue, n, x, incx, y, incy, result);
+
+}
+
+void gpu_dgemv(blasHandle_t* handle, const char* transa, const int64_t m, const int64_t n, const double* alpha,
+               const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) {
+
+  assert(handle != nullptr);
+  assert(handle->queue != nullptr);
+
+  // Validate matrix dimensions and increments to be positive
+  assert(m > 0 && n > 0 && lda > 0 && incx > 0 && incy > 0);
+  assert(a != nullptr && x != nullptr && y != nullptr && alpha != nullptr && beta != nullptr);
+
+  // Determine the
operation type + oneapi::mkl::transpose transa_ = oneapi::mkl::transpose::nontrans; + if (*transa == 'T' || *transa == 't') { + transa_ = oneapi::mkl::transpose::trans; + } + + // Perform DGEMV operation using oneMKL + oneapi::mkl::blas::column_major::gemv(*handle->queue, transa_, m, n, *alpha, a, lda, x, incx, *beta, y, incy); + +} + +void gpu_dgemm(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha, + const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) { + + assert(handle != nullptr && handle->queue != nullptr); + assert(m > 0 && n > 0 && k > 0 && lda > 0 && ldb > 0 && ldc > 0); + assert(a != nullptr && b != nullptr && c != nullptr && alpha != nullptr && beta != nullptr); + + // Transpose operations + auto transa_ = (*transa == 'T' || *transa == 't') ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans; + auto transb_ = (*transb == 'T' || *transb == 't') ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans; + + oneapi::mkl::blas::column_major::gemm(*handle->queue, transa_, transb_, m, n, k, + *alpha, a, lda, b, ldb, *beta, c, ldc); + +} + + +void gpu_dgeam(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { + assert(handle != nullptr && handle->queue != nullptr); + assert(m > 0 && n > 0 && lda > 0 && ldb > 0 && ldc > 0); + assert(a != nullptr && b != nullptr && c != nullptr && alpha != nullptr && beta != nullptr); + + // Determine transpose operations + bool transA = (*transa == 'T' || *transa == 't'); + bool transB = (*transb == 'T' || *transb == 't'); + + handle->queue->submit([&](sycl::handler& cgh) { + cgh.parallel_for(sycl::range<2>(m, n), [=](sycl::id<2> idx) { + const int i = idx[0]; + const int j = idx[1]; + const int ai = transA ? j * lda + i : i * lda + j; + const int bi = transB ? j * ldb + i : i * ldb + j; + const int ci = i * ldc + j; + + c[ci] = (*alpha) * a[ai] + (*beta) * b[bi]; + }); + }); + +} + +} // extern C diff --git a/plugins/local/gpu_nvidia/LIB b/plugins/local/gpu_nvidia/LIB new file mode 100644 index 00000000..91f54e91 --- /dev/null +++ b/plugins/local/gpu_nvidia/LIB @@ -0,0 +1 @@ +-lcudart -lcublas -lcublasLt diff --git a/plugins/local/gpu_nvidia/NEED b/plugins/local/gpu_nvidia/NEED new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/plugins/local/gpu_nvidia/NEED @@ -0,0 +1 @@ + diff --git a/plugins/local/gpu_nvidia/README.rst b/plugins/local/gpu_nvidia/README.rst new file mode 100644 index 00000000..5dcfca92 --- /dev/null +++ b/plugins/local/gpu_nvidia/README.rst @@ -0,0 +1,5 @@ +========== +gpu_nvidia +========== + +Nvidia implementation of GPU routines. Uses CUDA and CUBLAS libraries. 
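
All three backends export the same C symbols, so the rest of the code links against whichever plugin `configure -g` symlinked into `src/gpu_arch`. Below is a minimal host-side sketch of that interface, assuming the x86 variant's opaque `void*` signatures (the CUDA and SYCL backends expose the same names with backend-specific handle types); the `main` here is illustrative only and not part of the patch.

```c
#include <stdint.h>
#include <stdio.h>

/* Prototypes of the uniform interface exported by every gpu_* plugin,
   shown with the opaque-pointer signatures of the x86 variant. */
void gpu_allocate(void** ptr, const int64_t n);
void gpu_deallocate(void** ptr);
void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n);
void gpu_blas_create(void** handle);
void gpu_blas_destroy(void** handle);
void gpu_ddot(void* handle, const int64_t n, const double* x, const int64_t incx,
              const double* y, const int64_t incy, double* result);

int main(void) {
  const int64_t n = 4;
  double x[4] = {1., 2., 3., 4.};
  double result;
  void *handle, *d_x;

  gpu_blas_create(&handle);
  gpu_allocate(&d_x, n * (int64_t) sizeof(double));
  gpu_upload(x, d_x, n * (int64_t) sizeof(double));

  /* <x,x> = 1 + 4 + 9 + 16 = 30 */
  gpu_ddot(handle, n, d_x, 1, d_x, 1, &result);
  printf("ddot = %f\n", result);

  gpu_deallocate(&d_x);
  gpu_blas_destroy(&handle);
  return 0;
}
```
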
diff --git a/plugins/local/gpu_nvidia/gpu.c b/plugins/local/gpu_nvidia/gpu.c
new file mode 100644
index 00000000..a775ab95
--- /dev/null
+++ b/plugins/local/gpu_nvidia/gpu.c
@@ -0,0 +1,326 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <assert.h>
+
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+
+
+/* Generic functions */
+
+int gpu_ndevices() {
+  int ngpus;
+  cudaGetDeviceCount(&ngpus);
+  return ngpus;
+}
+
+void gpu_set_device(int32_t igpu) {
+  cudaSetDevice((int) igpu);
+}
+
+
+/* Allocation functions */
+
+void gpu_allocate(void** ptr, const int64_t size) {
+  size_t free, total;
+  cudaError_t rc = cudaMemGetInfo( &free, &total );
+  if (rc != cudaSuccess) {
+    free = INT64_MAX;
+  }
+
+  rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
+//  /* Use managed memory if it does not fit on the GPU */
+//  if (size < free && size < total/2) {
+//    rc= cudaMalloc(ptr, size);
+//  } else {
+//    rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
+//  }
+  assert (rc == cudaSuccess);
+}
+
+void gpu_deallocate(void** ptr) {
+  assert (*ptr != NULL);
+  cudaFree(*ptr);
+  *ptr = NULL;
+}
+
+
+/* Memory transfer functions */
+
+void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
+  cudaMemcpy (gpu_ptr, cpu_ptr, n, cudaMemcpyHostToDevice);
+}
+
+void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
+  cudaMemcpy (cpu_ptr, gpu_ptr, n, cudaMemcpyDeviceToHost);
+}
+
+void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
+  cudaMemcpy (gpu_ptr_dest, gpu_ptr_src, n, cudaMemcpyDeviceToDevice);
+}
+
+
+/* Streams */
+
+void gpu_stream_create(cudaStream_t* ptr) {
+  cudaError_t rc = cudaStreamCreate(ptr);
+  assert (rc == cudaSuccess);
+}
+
+void gpu_stream_destroy(cudaStream_t* ptr) {
+  assert (ptr != NULL);
+  cudaError_t rc = cudaStreamDestroy(*ptr);
+  assert (rc == cudaSuccess);
+  *ptr = NULL;
+}
+
+void gpu_set_stream(cublasHandle_t handle, cudaStream_t stream) {
+  cublasSetStream(handle, stream);
+}
+
+void gpu_synchronize() {
+  cudaDeviceSynchronize();
+}
+
+
+/* BLAS functions */
+
+void gpu_blas_create(cublasHandle_t* ptr) {
+  cublasStatus_t rc = cublasCreate(ptr);
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+}
+
+
+void gpu_blas_destroy(cublasHandle_t* ptr) {
+  assert (ptr != NULL);
+  cublasStatus_t rc = cublasDestroy(*ptr);
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+  ptr = NULL;
+}
+
+
+void gpu_ddot(cublasHandle_t handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
+  assert (handle != NULL);
+  /* Convert to int */
+  int n_, incx_, incy_;
+
+  n_    = (int) n;
+  incx_ = (int) incx;
+  incy_ = (int) incy;
+
+  assert ( (int64_t) n_    == n   );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  cublasStatus_t rc = cublasDdot(handle, n_, x, incx_, y, incy_, result);
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+}
+
+
+
+void gpu_sdot(cublasHandle_t handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
+  assert (handle != NULL);
+
+  /* Convert to int */
+  int n_, incx_, incy_;
+
+  n_    = (int) n;
+  incx_ = (int) incx;
+  incy_ = (int) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t) n_    == n   );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  float result_ = 0.;
+  cublasStatus_t rc = cublasSdot(handle, n_, x, incx_, y, incy_, &result_);
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+  *result = result_;
+}
+
+
+
+void gpu_dgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, 
const double* alpha, + const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) { + + assert (handle != NULL); + + /* Convert to int */ + int m_, n_, lda_, incx_, incy_; + + m_ = (int) m; + n_ = (int) n; + lda_ = (int) lda; + incx_ = (int) incx; + incy_ = (int) incy; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + cublasOperation_t transa_ = CUBLAS_OP_N; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; + + cublasDgemv(handle, transa_, m_, n_, alpha, a, lda_, x, incx_, beta, y, incy_); +} + + + +void gpu_sgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const float* alpha, + const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy) { + + assert (handle != NULL); + + /* Convert to int */ + int m_, n_, lda_, incx_, incy_; + + m_ = (int) m; + n_ = (int) n; + lda_ = (int) lda; + incx_ = (int) incx; + incy_ = (int) incy; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + cublasOperation_t transa_ = CUBLAS_OP_N; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; + + cublasSgemv(handle, transa_, m_, n_, alpha, a, lda_, x, incx_, beta, y, incy_); +} + + +void gpu_dgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha, + const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) { + + assert (handle != NULL); + + /* Convert to int */ + int m_, n_, k_, lda_, ldb_, ldc_; + + m_ = (int) m; + n_ = (int) n; + k_ = (int) k; + lda_ = (int) lda; + ldb_ = (int) ldb; + ldc_ = (int) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) k_ == k ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + cublasOperation_t transa_ = CUBLAS_OP_N; + cublasOperation_t transb_ = CUBLAS_OP_N; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; + if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T; + + cublasDgemm(handle, transa_, transb_, m_, n_, k_, alpha, a, lda_, b, ldb_, beta, c, ldc_); +} + + + +void gpu_sgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float* alpha, + const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc) { + + assert (handle != NULL); + + /* Convert to int */ + int m_, n_, k_, lda_, ldb_, ldc_; + + m_ = (int) m; + n_ = (int) n; + k_ = (int) k; + lda_ = (int) lda; + ldb_ = (int) ldb; + ldc_ = (int) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) k_ == k ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + cublasOperation_t transa_ = CUBLAS_OP_N; + cublasOperation_t transb_ = CUBLAS_OP_N; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; + if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T; + + 
cublasSgemm(handle, transa_, transb_, m_, n_, k_, alpha, a, lda_, b, ldb_, beta, c, ldc_); +} + + +void gpu_dgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { + assert (handle != NULL); + + /* Convert to int */ + int m_, n_, lda_, ldb_, ldc_; + + m_ = (int) m; + n_ = (int) n; + lda_ = (int) lda; + ldb_ = (int) ldb; + ldc_ = (int) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + cublasOperation_t transa_ = CUBLAS_OP_N; + cublasOperation_t transb_ = CUBLAS_OP_N; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; + if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T; + + cublasDgeam(handle, transa_, transb_, m_, n_, alpha, a, lda_, beta, b, ldb_, c, ldc_); + +} + + +void gpu_sgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float* alpha, + const float* a, const int64_t lda, const float* beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) { + assert (handle != NULL); + + /* Convert to int */ + int m_, n_, lda_, ldb_, ldc_; + + m_ = (int) m; + n_ = (int) n; + lda_ = (int) lda; + ldb_ = (int) ldb; + ldc_ = (int) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + cublasOperation_t transa_ = CUBLAS_OP_N; + cublasOperation_t transb_ = CUBLAS_OP_N; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; + if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T; + + cublasSgeam(handle, transa_, transb_, m_, n_, alpha, a, lda_, beta, b, ldb_, c, ldc_); + +} diff --git a/plugins/local/gpu_x86/NEED b/plugins/local/gpu_x86/NEED new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/plugins/local/gpu_x86/NEED @@ -0,0 +1 @@ + diff --git a/plugins/local/gpu_x86/README.rst b/plugins/local/gpu_x86/README.rst new file mode 100644 index 00000000..f530bf29 --- /dev/null +++ b/plugins/local/gpu_x86/README.rst @@ -0,0 +1,5 @@ +======= +gpu_x86 +======= + +x86 implementation of GPU routines. For use when GPUs are not available. 
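
Every BLAS wrapper in these backends narrows its `int64_t` arguments to the 32-bit integers expected by the underlying (cu)BLAS call and asserts that the value survives the round trip. A condensed, self-contained sketch of that recurring pattern, with a hypothetical `narrow32` helper that is not part of the patch:

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper condensing the pattern used in gpu_ddot, gpu_dgemm, etc.:
   cast to 32 bits, then assert that widening back reproduces the value. */
static int32_t narrow32(const int64_t n) {
  int32_t n_ = (int32_t) n;
  assert ( (int64_t) n_ == n );  /* traps silent overflow for n >= 2^31 */
  return n_;
}

int main(void) {
  printf("%d\n", (int) narrow32(123456));         /* fine */
  /* narrow32(((int64_t) 1) << 40); would abort: the dimension
     no longer fits in the int expected by 32-bit BLAS. */
  return 0;
}
```
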
diff --git a/plugins/local/gpu_x86/gpu.c b/plugins/local/gpu_x86/gpu.c
new file mode 100644
index 00000000..49aec9d3
--- /dev/null
+++ b/plugins/local/gpu_x86/gpu.c
@@ -0,0 +1,502 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <assert.h>
+
+/* Generic functions */
+
+int gpu_ndevices() {
+  return 0;
+}
+
+void gpu_set_device(int32_t i) {
+  return;
+}
+
+
+/* Allocation functions */
+
+void gpu_allocate(void** ptr, const int64_t n) {
+  *ptr = malloc((size_t) n);
+  if (*ptr == NULL) {
+    perror("Allocation failed");
+  }
+}
+
+void gpu_deallocate(void** ptr) {
+  free(*ptr);
+  *ptr = NULL;
+}
+
+
+/* Memory transfer functions */
+
+void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
+  memcpy(gpu_ptr, cpu_ptr, n);
+}
+
+void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
+  memcpy(cpu_ptr, gpu_ptr, n);
+}
+
+void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
+  memcpy(gpu_ptr_dest, gpu_ptr_src, n);
+}
+
+
+/* Streams */
+
+void gpu_stream_create(void** ptr) {
+  *ptr = (void*) malloc(sizeof(char));
+}
+
+void gpu_stream_destroy(void** ptr) {
+  free(*ptr);
+  *ptr = NULL;
+}
+
+void gpu_set_stream(void* handle, void* stream) {
+  return;
+}
+
+void gpu_synchronize() {
+  return;
+}
+
+
+/* BLAS functions */
+
+void gpu_blas_create(void** handle) {
+  *handle = (void*) malloc(sizeof(char));
+}
+
+
+void gpu_blas_destroy(void** handle) {
+  free(*handle);
+  *handle = NULL;
+}
+
+
+double ddot_(const int32_t* n, const double* x, const int32_t* incx, const double* y, const int32_t* incy);
+
+void gpu_ddot(void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
+  assert (handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t n_, incx_, incy_;
+
+  n_    = (int32_t) n;
+  incx_ = (int32_t) incx;
+  incy_ = (int32_t) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t) n_    == n   );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  *result = ddot_(&n_, x, &incx_, y, &incy_);
+}
+
+
+float sdot_(const int32_t* n, const float* x, const int32_t* incx, const float* y, const int32_t* incy);
+
+void gpu_sdot(void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
+  assert (handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t n_, incx_, incy_;
+
+  n_    = (int32_t) n;
+  incx_ = (int32_t) incx;
+  incy_ = (int32_t) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t) n_    == n   );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  *result = sdot_(&n_, x, &incx_, y, &incy_);
+}
+
+
+void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha,
+            const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy);
+
+void gpu_dgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const double* alpha,
+               const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) {
+
+  assert (handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, lda_, incx_, incy_;
+
+  m_    = (int32_t) m;
+  n_    = (int32_t) n;
+  lda_  = (int32_t) lda;
+  incx_ = (int32_t) incx;
+  incy_ = (int32_t) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t) m_    == m   );
+  assert ( (int64_t) n_    == n   );
+  assert ( (int64_t) lda_  == lda );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+ 
dgemv_(transa, &m_, &n_, alpha, a, &lda_, x, &incx_, beta, y, &incy_); +} + + +void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha, + const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy); + +void gpu_sgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const float* alpha, + const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy) { + + assert (handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, lda_, incx_, incy_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + lda_ = (int32_t) lda; + incx_ = (int32_t) incx; + incy_ = (int32_t) incy; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + sgemv_(transa, &m_, &n_, alpha, a, &lda_, x, &incx_, beta, y, &incy_); +} + + +void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha, + const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc); + +void gpu_dgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha, + const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) { + + assert (handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, k_, lda_, ldb_, ldc_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + k_ = (int32_t) k; + lda_ = (int32_t) lda; + ldb_ = (int32_t) ldb; + ldc_ = (int32_t) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) k_ == k ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + dgemm_(transa, transb, &m_, &n_, &k_, alpha, a, &lda_, b, &ldb_, beta, c, &ldc_); +} + + + +void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha, + const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc); + +void gpu_sgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float* alpha, + const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc) { + + assert (handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, k_, lda_, ldb_, ldc_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + k_ = (int32_t) k; + lda_ = (int32_t) lda; + ldb_ = (int32_t) ldb; + ldc_ = (int32_t) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) k_ == k ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + sgemm_(transa, transb, &m_, &n_, &k_, alpha, a, &lda_, b, &ldb_, beta, c, &ldc_); +} + + +void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { + assert (handle != NULL); + + if ( (*transa == 'N' && *transb == 'N') || + (*transa == 'n' && 
*transb == 'N') ||
+       (*transa == 'N' && *transb == 'n') ||
+       (*transa == 'n' && *transb == 'n') ) {
+
+    if (*alpha == 0.) {
+
+      for (int64_t j=0 ; j<n ; ++j) {
[...]
+  ! tc_int_2e_ao(k,i,l,j) = <kl| V^TC(r_12) |ij> where V^TC(r_12) is the total TC operator
+  !                       = tc_grad_and_lapl_ao(k,i,l,j) + tc_grad_square_ao(k,i,l,j) + ao_two_e_coul(k,i,l,j)
+  ! where:
+  !
+  ! tc_grad_and_lapl_ao(k,i,l,j) = < k l | -1/2 \Delta_1 u(r1,r2) - \grad_1 u(r1,r2) . \grad_1 | ij >
+  !                              = -1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2)
+  !                              =  1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 (-1) \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2)
+  !
+  ! tc_grad_square_ao(k,i,l,j) = -1/2 < k l | |\grad_1 u(r1,r2)|^2 + |\grad_2 u(r1,r2)|^2 | ij >
+  !
+  ! ao_two_e_coul(k,i,l,j) = < l k | 1/r12 | j i > = ( k i | 1/r12 | l j )
+  !
+  END_DOC
+
+  implicit none
+
+  integer                       :: i, j, k, l, m, ipoint, jpoint
+  integer                       :: n_blocks, n_rest, n_pass
+  integer                       :: i_blocks, i_rest, i_pass, ii
+  double precision              :: mem, n_double
+  double precision              :: weight1, ao_k_r, ao_i_r
+  double precision              :: der_envsq_x, der_envsq_y, der_envsq_z, lap_envsq
+  double precision              :: time0, time1, time2, tc1, tc2, tc
+  double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:), tc_int_2e_ao(:,:,:,:)
+  double precision, allocatable :: tmp(:,:,:), c_mat(:,:,:), tmp_grad1_u12(:,:,:)
+
+  double precision, external :: get_ao_two_e_integral
+
+
+  PROVIDE final_weight_at_r_vector_extra aos_in_r_array_extra
+  PROVIDE final_weight_at_r_vector aos_grad_in_r_array_transp_bis final_weight_at_r_vector aos_in_r_array_transp
+
+
+
+  print*, ' start provide_int2_grad1_u12_ao ...'
+  call wall_time(time0)
+
+  call total_memory(mem)
+  mem      = max(1.d0, qp_max_mem - mem)
+  n_double = mem * 1.d8
+  n_blocks = int(min(n_double / (n_points_extra_final_grid * 4.d0), 1.d0*n_points_final_grid))
+  n_rest   = int(mod(n_points_final_grid, n_blocks))
+  n_pass   = int((n_points_final_grid - n_rest) / n_blocks)
+
+  call write_int(6, n_pass, 'Number of passes')
+  call write_int(6, n_blocks, 'Size of the blocks')
+  call write_int(6, n_rest, 'Size of the last block')
+
+  ! ---
+  ! ---
+  ! ---
+
+  allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,4))
+
+  allocate(tmp(n_points_extra_final_grid,ao_num,ao_num))
+  !$OMP PARALLEL               &
+  !$OMP DEFAULT (NONE)         &
+  !$OMP PRIVATE (j, i, jpoint) &
+  !$OMP SHARED (tmp, ao_num, n_points_extra_final_grid, final_weight_at_r_vector_extra, aos_in_r_array_extra_transp)
+  !$OMP DO SCHEDULE (static)
+  do j = 1, ao_num
+    do i = 1, ao_num
+      do jpoint = 1, n_points_extra_final_grid
+        tmp(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  allocate(tmp_grad1_u12(n_points_extra_final_grid,n_blocks,4))
+
+  tc = 0.d0
+
+  do i_pass = 1, n_pass
+    ii = (i_pass-1)*n_blocks + 1
+
+    call wall_time(tc1)
+    !$OMP PARALLEL                   &
+    !$OMP DEFAULT (NONE)             &
+    !$OMP PRIVATE (i_blocks, ipoint) &
+    !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12)
+    !$OMP DO
+    do i_blocks = 1, n_blocks
+      ipoint = ii - 1 + i_blocks ! 
r1 + call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_blocks,1), tmp_grad1_u12(1,i_blocks,2), tmp_grad1_u12(1,i_blocks,3), tmp_grad1_u12(1,i_blocks,4)) + enddo + !$OMP END DO + !$OMP END PARALLEL + call wall_time(tc2) + tc = tc + tc2 - tc1 + + do m = 1, 4 + call dgemm( "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0 & + , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid & + , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) + enddo + enddo + + deallocate(tmp_grad1_u12) + + + if(n_rest .gt. 0) then + + allocate(tmp_grad1_u12(n_points_extra_final_grid,n_rest,4)) + + ii = n_pass*n_blocks + 1 + + call wall_time(tc1) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i_rest, ipoint) & + !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12) + !$OMP DO + do i_rest = 1, n_rest + ipoint = ii - 1 + i_rest ! r1 + call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_rest,1), tmp_grad1_u12(1,i_rest,2), tmp_grad1_u12(1,i_rest,3), tmp_grad1_u12(1,i_rest,4)) + enddo + !$OMP END DO + !$OMP END PARALLEL + call wall_time(tc2) + tc = tc + tc2 - tc1 + + do m = 1, 4 + call dgemm( "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0 & + , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid & + , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) + enddo + + deallocate(tmp_grad1_u12) + endif + + deallocate(tmp) + + + call wall_time(time1) + print*, ' wall time for int2_grad1_u12_ao (min) = ', (time1-time0) / 60.d0 + print*, ' wall time Jastrow derivatives (min) = ', tc / 60.d0 + call print_memory_usage() + + ! --- + ! --- + ! --- + + + allocate(tc_int_2e_ao(ao_num,ao_num,ao_num,ao_num)) + + call wall_time(time1) + + allocate(c_mat(n_points_final_grid,ao_num,ao_num)) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, k, ipoint) & + !$OMP SHARED (aos_in_r_array_transp, c_mat, ao_num, n_points_final_grid, final_weight_at_r_vector) + !$OMP DO SCHEDULE (static) + do i = 1, ao_num + do k = 1, ao_num + do ipoint = 1, n_points_final_grid + c_mat(ipoint,k,i) = final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,k) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 & + , int2_grad1_u12_ao(1,1,1,4), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid & + , 0.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num) + deallocate(c_mat) + + call wall_time(time2) + print*, ' wall time of Hermitian part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0 + call print_memory_usage() + + ! 
--- + + call wall_time(time1) + + allocate(c_mat(n_points_final_grid,ao_num,ao_num)) + do m = 1, 3 + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r) & + !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, c_mat, & + !$OMP ao_num, n_points_final_grid, final_weight_at_r_vector, m) + !$OMP DO SCHEDULE (static) + do i = 1, ao_num + do k = 1, ao_num + do ipoint = 1, n_points_final_grid + + weight1 = 0.5d0 * final_weight_at_r_vector(ipoint) + ao_i_r = aos_in_r_array_transp(ipoint,i) + ao_k_r = aos_in_r_array_transp(ipoint,k) + + c_mat(ipoint,k,i) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,m) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,m)) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, -1.d0 & + , int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid & + , 1.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num) + enddo + deallocate(c_mat) + + call wall_time(time2) + print*, ' wall time of non-Hermitian part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0 + call print_memory_usage() + + ! --- + + call wall_time(time1) + + call sum_A_At(tc_int_2e_ao(1,1,1,1), ao_num*ao_num) + + call wall_time(time2) + print*, ' lower- and upper-triangle of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0 + call print_memory_usage() + + ! --- + + call wall_time(time1) + + PROVIDE ao_integrals_map + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP SHARED(ao_num, tc_int_2e_ao, ao_integrals_map) & + !$OMP PRIVATE(i, j, k, l) + !$OMP DO COLLAPSE(3) + do j = 1, ao_num + do l = 1, ao_num + do i = 1, ao_num + do k = 1, ao_num + ! < 1:i, 2:j | 1:k, 2:l > + tc_int_2e_ao(k,i,l,j) = tc_int_2e_ao(k,i,l,j) + get_ao_two_e_integral(i, j, k, l, ao_integrals_map) + enddo + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + call wall_time(time2) + print*, ' wall time of Coulomb part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0 + call print_memory_usage() + + ! --- + + print*, ' Writing int2_grad1_u12_ao in ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao' + open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="write") + call ezfio_set_work_empty(.False.) + write(11) int2_grad1_u12_ao(:,:,:,1:3) + close(11) + + print*, ' Saving tc_int_2e_ao in ', trim(ezfio_filename) // '/work/ao_two_e_tc_tot' + open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/ao_two_e_tc_tot', action="write") + call ezfio_set_work_empty(.False.) + do i = 1, ao_num + write(11) tc_int_2e_ao(:,:,:,i) + enddo + close(11) + + ! ---- + + deallocate(int2_grad1_u12_ao) + deallocate(tc_int_2e_ao) + + call wall_time(time2) + print*, ' wall time for tc_int_2e_ao (min) = ', (time2-time1) / 60.d0 + call print_memory_usage() + + ! --- + + call wall_time(time1) + print*, ' wall time for TC-integrals (min) = ', (time1-time0) / 60.d0 + + return +end + +! --- + diff --git a/plugins/local/tc_int/jast_grad_full.irp.f b/plugins/local/tc_int/jast_grad_full.irp.f new file mode 100644 index 00000000..599d3779 --- /dev/null +++ b/plugins/local/tc_int/jast_grad_full.irp.f @@ -0,0 +1,245 @@ + +! --- + +subroutine get_grad1_u12_for_tc(ipoint, n_grid2, resx, resy, resz, res) + + BEGIN_DOC + ! + ! resx(ipoint) = [grad1 u(r1,r2)]_x1 + ! resy(ipoint) = [grad1 u(r1,r2)]_y1 + ! resz(ipoint) = [grad1 u(r1,r2)]_z1 + ! res (ipoint) = -0.5 [grad1 u(r1,r2)]^2 + ! + ! We use: + ! grid for r1 + ! extra_grid for r2 + ! 
+ END_DOC + + include 'constants.include.F' + + implicit none + integer, intent(in) :: ipoint, n_grid2 + double precision, intent(out) :: resx(n_grid2), resy(n_grid2), resz(n_grid2), res(n_grid2) + + integer :: jpoint, i_nucl, p, mpA, npA, opA, pp + integer :: powmax1, powmax, powmax2 + double precision :: r1(3), r2(3) + double precision :: tmp, tmp1, tmp2, tmp11, tmp22 + double precision :: rn(3), f1A, grad1_f1A(3), f2A, grad2_f2A(3), g12, grad1_g12(3) + double precision, allocatable :: f1A_power(:), f2A_power(:), double_p(:), g12_power(:) + + r1(1) = final_grid_points(1,ipoint) + r1(2) = final_grid_points(2,ipoint) + r1(3) = final_grid_points(3,ipoint) + + call grad1_j12_r1_seq(r1, n_grid2, resx, resy, resz) + + do jpoint = 1, n_grid2 ! r2 + res(jpoint) = -0.5d0 * (resx(jpoint) * resx(jpoint) + resy(jpoint) * resy(jpoint) + resz(jpoint) * resz(jpoint)) + enddo + + return +end + +! --- + +subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz) + + include 'constants.include.F' + + implicit none + integer , intent(in) :: n_grid2 + double precision, intent(in) :: r1(3) + double precision, intent(out) :: gradx(n_grid2) + double precision, intent(out) :: grady(n_grid2) + double precision, intent(out) :: gradz(n_grid2) + + integer :: jpoint, i_nucl, p, mpA, npA, opA + double precision :: r2(3) + double precision :: dx, dy, dz, r12, tmp + double precision :: rn(3), f1A, grad1_f1A(3), f2A, grad2_f2A(3), g12, grad1_g12(3) + double precision :: tmp1, tmp2, dist + integer :: powmax1, powmax, powmax2 + double precision, allocatable :: f1A_power(:), f2A_power(:), double_p(:), g12_power(:) + + powmax1 = max(maxval(jBH_m), maxval(jBH_n)) + powmax2 = maxval(jBH_o) + powmax = max(powmax1, powmax2) + + allocate(f1A_power(-1:powmax), f2A_power(-1:powmax), g12_power(-1:powmax), double_p(0:powmax)) + + do p = 0, powmax + double_p(p) = dble(p) + enddo + + f1A_power(-1) = 0.d0 + f2A_power(-1) = 0.d0 + g12_power(-1) = 0.d0 + + f1A_power(0) = 1.d0 + f2A_power(0) = 1.d0 + g12_power(0) = 1.d0 + + do jpoint = 1, n_grid2 ! r2 + + r2(1) = final_grid_points_extra(1,jpoint) + r2(2) = final_grid_points_extra(2,jpoint) + r2(3) = final_grid_points_extra(3,jpoint) + + gradx(jpoint) = 0.d0 + grady(jpoint) = 0.d0 + gradz(jpoint) = 0.d0 + + call jBH_elem_fct_grad_alpha1(r1, r2, g12, grad1_g12) + +! dist = (r1(1) - r2(1)) * (r1(1) - r2(1)) & +! + (r1(2) - r2(2)) * (r1(2) - r2(2)) & +! + (r1(3) - r2(3)) * (r1(3) - r2(3)) +! +! if(dist .ge. 1d-15) then +! dist = dsqrt( dist ) +! +! tmp1 = 1.d0 / (1.d0 + dist) +! +! g12 = dist * tmp1 +! tmp2 = tmp1 * tmp1 / dist +! grad1_g12(1) = tmp2 * (r1(1) - r2(1)) +! grad1_g12(2) = tmp2 * (r1(2) - r2(2)) +! grad1_g12(3) = tmp2 * (r1(3) - r2(3)) +! +! else +! +! grad1_g12(1) = 0.d0 +! grad1_g12(2) = 0.d0 +! grad1_g12(3) = 0.d0 +! g12 = 0.d0 +! +! endif +! + do p = 1, powmax2 + g12_power(p) = g12_power(p-1) * g12 + enddo + + do i_nucl = 1, nucl_num + + rn(1) = nucl_coord(i_nucl,1) + rn(2) = nucl_coord(i_nucl,2) + rn(3) = nucl_coord(i_nucl,3) + + call jBH_elem_fct_grad_alpha1(r1, rn, f1A, grad1_f1A) +! dist = (r1(1) - rn(1)) * (r1(1) - rn(1)) & +! + (r1(2) - rn(2)) * (r1(2) - rn(2)) & +! + (r1(3) - rn(3)) * (r1(3) - rn(3)) +! if (dist > 1.d-15) then +! dist = dsqrt( dist ) +! +! tmp1 = 1.d0 / (1.d0 + dist) +! +! f1A = dist * tmp1 +! tmp2 = tmp1 * tmp1 / dist +! grad1_f1A(1) = tmp2 * (r1(1) - rn(1)) +! grad1_f1A(2) = tmp2 * (r1(2) - rn(2)) +! grad1_f1A(3) = tmp2 * (r1(3) - rn(3)) +! +! else +! +! grad1_f1A(1) = 0.d0 +! grad1_f1A(2) = 0.d0 +! grad1_f1A(3) = 0.d0 +! f1A = 0.d0 +! +! 
endif + + call jBH_elem_fct_grad_alpha1(r2, rn, f2A, grad2_f2A) +! dist = (r2(1) - rn(1)) * (r2(1) - rn(1)) & +! + (r2(2) - rn(2)) * (r2(2) - rn(2)) & +! + (r2(3) - rn(3)) * (r2(3) - rn(3)) +! +! if (dist > 1.d-15) then +! dist = dsqrt( dist ) +! +! tmp1 = 1.d0 / (1.d0 + dist) +! +! f2A = dist * tmp1 +! tmp2 = tmp1 * tmp1 / dist +! grad2_f2A(1) = tmp2 * (r2(1) - rn(1)) +! grad2_f2A(2) = tmp2 * (r2(2) - rn(2)) +! grad2_f2A(3) = tmp2 * (r2(3) - rn(3)) +! +! else +! +! grad2_f2A(1) = 0.d0 +! grad2_f2A(2) = 0.d0 +! grad2_f2A(3) = 0.d0 +! f2A = 0.d0 +! +! endif + + ! Compute powers of f1A and f2A + do p = 1, powmax1 + f1A_power(p) = f1A_power(p-1) * f1A + f2A_power(p) = f2A_power(p-1) * f2A + enddo + + do p = 1, jBH_size + mpA = jBH_m(p,i_nucl) + npA = jBH_n(p,i_nucl) + opA = jBH_o(p,i_nucl) + tmp = jBH_c(p,i_nucl) +! if (dabs(tmp) <= 1.d-10) cycle +! + if(mpA .eq. npA) then + tmp = tmp * 0.5d0 + endif + + tmp1 = double_p(mpA) * f1A_power(mpA-1) * f2A_power(npA) + double_p(npA) * f1A_power(npA-1) * f2A_power(mpA) + tmp1 = tmp1 * g12_power(opA) * tmp + tmp2 = double_p(opA) * g12_power(opA-1) * (f1A_power(mpA) * f2A_power(npA) + f1A_power(npA) * f2A_power(mpA)) * tmp + + gradx(jpoint) = gradx(jpoint) + tmp1 * grad1_f1A(1) + tmp2 * grad1_g12(1) + grady(jpoint) = grady(jpoint) + tmp1 * grad1_f1A(2) + tmp2 * grad1_g12(2) + gradz(jpoint) = gradz(jpoint) + tmp1 * grad1_f1A(3) + tmp2 * grad1_g12(3) + enddo ! p + enddo ! i_nucl + enddo ! jpoint + + return +end + +subroutine jBH_elem_fct_grad_alpha1(r1, r2, fct, grad1_fct) + + implicit none + double precision, intent(in) :: r1(3), r2(3) + double precision, intent(out) :: fct, grad1_fct(3) + double precision :: dist, tmp1, tmp2 + + dist = (r1(1) - r2(1)) * (r1(1) - r2(1)) & + + (r1(2) - r2(2)) * (r1(2) - r2(2)) & + + (r1(3) - r2(3)) * (r1(3) - r2(3)) + + + if(dist .ge. 1d-15) then + dist = dsqrt( dist ) + + tmp1 = 1.d0 / (1.d0 + dist) + + fct = dist * tmp1 + tmp2 = tmp1 * tmp1 / dist + grad1_fct(1) = tmp2 * (r1(1) - r2(1)) + grad1_fct(2) = tmp2 * (r1(2) - r2(2)) + grad1_fct(3) = tmp2 * (r1(3) - r2(3)) + + else + + grad1_fct(1) = 0.d0 + grad1_fct(2) = 0.d0 + grad1_fct(3) = 0.d0 + fct = 0.d0 + + endif + + return +end + +! --- diff --git a/plugins/local/tc_int/jast_utils_bh.irp.f b/plugins/local/tc_int/jast_utils_bh.irp.f new file mode 100644 index 00000000..200bc5ff --- /dev/null +++ b/plugins/local/tc_int/jast_utils_bh.irp.f @@ -0,0 +1,43 @@ + +! --- + + + +subroutine jBH_elem_fct_grad(alpha, r1, r2, fct, grad1_fct) + + implicit none + double precision, intent(in) :: alpha, r1(3), r2(3) + double precision, intent(out) :: fct, grad1_fct(3) + double precision :: dist, tmp1, tmp2, dist_inv + + dist = (r1(1) - r2(1)) * (r1(1) - r2(1)) & + + (r1(2) - r2(2)) * (r1(2) - r2(2)) & + + (r1(3) - r2(3)) * (r1(3) - r2(3)) + + + if(dist .ge. 1d-15) then + dist_inv = 1.d0/dsqrt( dist ) + dist = dist_inv * dist + + tmp1 = 1.d0 / (1.d0 + alpha * dist) + + fct = alpha * dist * tmp1 + tmp2 = alpha * tmp1 * tmp1 * dist_inv + grad1_fct(1) = tmp2 * (r1(1) - r2(1)) + grad1_fct(2) = tmp2 * (r1(2) - r2(2)) + grad1_fct(3) = tmp2 * (r1(3) - r2(3)) + + else + + grad1_fct(1) = 0.d0 + grad1_fct(2) = 0.d0 + grad1_fct(3) = 0.d0 + fct = 0.d0 + + endif + + return +end + +! --- + diff --git a/plugins/local/tc_int/write_tc_int.irp.f b/plugins/local/tc_int/write_tc_int.irp.f new file mode 100644 index 00000000..9f25a6fd --- /dev/null +++ b/plugins/local/tc_int/write_tc_int.irp.f @@ -0,0 +1,56 @@ +! 
--- + +program write_tc_int + + implicit none + + print *, ' j2e_type = ', j2e_type + print *, ' j1e_type = ', j1e_type + print *, ' env_type = ', env_type + + my_grid_becke = .True. + PROVIDE tc_grid1_a tc_grid1_r + my_n_pt_r_grid = tc_grid1_r + my_n_pt_a_grid = tc_grid1_a + touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid + + my_extra_grid_becke = .True. + PROVIDE tc_grid2_a tc_grid2_r + my_n_pt_r_extra_grid = tc_grid2_r + my_n_pt_a_extra_grid = tc_grid2_a + touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid + + call write_int(6, my_n_pt_r_grid, 'radial external grid over') + call write_int(6, my_n_pt_a_grid, 'angular external grid over') + + call write_int(6, my_n_pt_r_extra_grid, 'radial internal grid over') + call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over') + + call main() + +end + +! --- + +subroutine main() + + implicit none + + PROVIDE io_tc_integ + + print*, 'io_tc_integ = ', io_tc_integ + + if(io_tc_integ .ne. "Write") then + print*, 'io_tc_integ != Write' + print*, io_tc_integ + stop + endif + + call provide_int2_grad1_u12_ao() + + call ezfio_set_tc_keywords_io_tc_integ('Read') + +end + +! --- + diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f index 05f7115d..a2d9d043 100644 --- a/src/ao_two_e_ints/cholesky.irp.f +++ b/src/ao_two_e_ints/cholesky.irp.f @@ -25,20 +25,22 @@ END_PROVIDER ! Last dimension of cholesky_ao is cholesky_ao_num ! ! https://mogp-emulator.readthedocs.io/en/latest/methods/proc/ProcPivotedCholesky.html + ! ! https://doi.org/10.1016/j.apnum.2011.10.001 : Page 4, Algorithm 1 + ! + ! https://www.diva-portal.org/smash/get/diva2:396223/FULLTEXT01.pdf END_DOC integer*8 :: ndim8 integer :: rank double precision :: tau, tau2 - double precision, pointer :: L(:,:), Delta(:,:) + double precision, pointer :: L(:,:) double precision :: s - double precision :: dscale, dscale_tmp - double precision, allocatable :: D(:), Ltmp_p(:,:), Ltmp_q(:,:), D_sorted(:), Delta_col(:) + double precision, allocatable :: D(:), Ltmp_p(:,:), Ltmp_q(:,:), D_sorted(:), Delta_col(:), Delta(:,:) integer, allocatable :: addr1(:), addr2(:) - integer*8, allocatable :: Lset(:), Dset(:), addr3(:) + integer*8, allocatable :: Lset(:), Dset(:) logical, allocatable :: computed(:) integer :: i,j,k,m,p,q, dj, p2, q2, ii, jj @@ -64,11 +66,8 @@ END_PROVIDER type(c_ptr) :: c_pointer(2) integer :: fd(2) - logical :: delta_on_disk - integer :: dgemm_block_size, nqq - double precision, allocatable :: dgemm_buffer1(:,:), dgemm_buffer2(:,:) - PROVIDE nproc + PROVIDE nproc ao_cholesky_threshold do_direct_integrals qp_max_mem PROVIDE nucl_coord ao_two_e_integral_schwartz call set_multiple_levels_omp(.False.) @@ -88,19 +87,8 @@ END_PROVIDER else - PROVIDE nucl_coord ao_two_e_integral_schwartz call set_multiple_levels_omp(.False.) - call resident_memory(mem0) - - rank_max = min(ndim8,(qp_max_mem*1024_8*1024_8*1024_8/8_8)/ndim8) - call mmap(trim(ezfio_work_dir)//'cholesky_ao_tmp', (/ ndim8, rank_max /), 8, fd(1), .False., .True., c_pointer(1)) - call c_f_pointer(c_pointer(1), L, (/ ndim8, rank_max /)) - ! Deleting the file while it is open makes the file invisible on the filesystem, - ! and automatically deleted, even if the program crashes - iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_ao_tmp', 'R') - close(iunit,status='delete') - if (do_direct_integrals) then if (ao_two_e_integral(1,1,1,1) < huge(1.d0)) then ! 
Trigger providers inside ao_two_e_integral @@ -113,8 +101,12 @@ END_PROVIDER tau = ao_cholesky_threshold tau2 = tau*tau - mem = 6.d0 * memory_of_double8(ndim8) + 6.d0 * memory_of_int8(ndim8) - call check_mem(mem, irp_here) + rank = 0 + + allocate( D(ndim8), Lset(ndim8), Dset(ndim8), D_sorted(ndim8)) + allocate( addr1(ndim8), addr2(ndim8), Delta_col(ndim8), computed(ndim8) ) + + call resident_memory(mem0) call print_memory_usage() @@ -127,59 +119,58 @@ END_PROVIDER print *, '============ =============' - rank = 0 - - allocate( D(ndim8), Lset(ndim8), Dset(ndim8), D_sorted(ndim8)) - allocate( addr1(ndim8), addr2(ndim8), addr3(ndim8) ) -!print *, 'allocate : (D(ndim8))', memory_of_int8(ndim8) -!print *, 'allocate : (Lset(ndim8))', memory_of_int8(ndim8) -!print *, 'allocate : (Dset(ndim8))', memory_of_int8(ndim8) -!print *, 'allocate : (4,addr(ndim8))', memory_of_int8(4_8*ndim8) - ! 1. - k=0 + i8=0 do j=1,ao_num do i=1,ao_num - k = k+1 - addr1(k) = i - addr2(k) = j - addr3(k) = (i-1)*ao_num + j + i8 = i8+1 + addr1(i8) = i + addr2(i8) = j enddo enddo if (do_direct_integrals) then - !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i8) SCHEDULE(dynamic,16) + !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i8) SCHEDULE(dynamic,21) do i8=ndim8,1,-1 D(i8) = ao_two_e_integral(addr1(i8), addr2(i8), & addr1(i8), addr2(i8)) enddo !$OMP END PARALLEL DO else - !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i8) SCHEDULE(dynamic,16) + !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i8) SCHEDULE(dynamic,21) do i8=ndim8,1,-1 D(i8) = get_ao_two_e_integral(addr1(i8), addr1(i8), & - addr2(i8), addr2(i8), & - ao_integrals_map) + addr2(i8), addr2(i8), ao_integrals_map) enddo !$OMP END PARALLEL DO endif + D_sorted(:) = -D(:) call dsort_noidx_big(D_sorted,ndim8) - D_sorted(:) = dabs(D_sorted(:)) - + D_sorted(:) = -D_sorted(:) Dmax = D_sorted(1) ! 2. - dscale = 1.d0 - dscale_tmp = dscale*dscale*Dmax np8=0_8 do p8=1,ndim8 - if ( dscale_tmp*D(p8) > tau2 ) then + if ( Dmax*D(p8) >= tau2 ) then np8 = np8+1_8 Lset(np8) = p8 endif enddo - np = np8 + if (np8 > ndim8) stop 'np>ndim8' + np = int(np8,4) + if (np <= 0) stop 'np<=0' + + rank_max = min(np,20*elec_num*elec_num) + call mmap(trim(ezfio_work_dir)//'cholesky_ao_tmp', (/ ndim8, rank_max /), 8, fd(1), .False., .True., c_pointer(1)) + call c_f_pointer(c_pointer(1), L, (/ ndim8, rank_max /)) + + ! Deleting the file while it is open makes the file invisible on the filesystem, + ! and automatically deleted, even if the program crashes + iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_ao_tmp', 'R') + close(iunit,status='delete') + ! 3. N = 0 @@ -187,82 +178,66 @@ END_PROVIDER ! 4. i = 0 + mem = memory_of_double(np) & ! Delta(np,nq) + + (np+1)*memory_of_double(block_size) ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size) + +! call check_mem(mem) + ! 5. - do while ( (Dmax > tau).and.(rank*1_8 < min(ndim8,rank_max)) ) + do while ( (Dmax > tau).and.(np > 0) ) ! a. i = i+1 - ! Inrease s until the arrays fit in memory - s = 0.01d0 block_size = max(N,24) + + ! Determine nq so that Delta fits in memory + + s = 0.1d0 + Dmin = max(s*Dmax,tau) + do nq=2,np-1 + if (D_sorted(nq) < Dmin) exit + enddo + do while (.True.) - ! b. - Dmin = max(s*Dmax,tau) + mem = mem0 & + + np*memory_of_double(nq) & ! Delta(np,nq) + + (np+nq)*memory_of_double(block_size) ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size) - ! c. 
- nq=0 - do p=1,np - if ( D(Lset(p)) > Dmin ) then - nq = nq+1 - Dset(nq) = Lset(p) - endif - enddo - - - mem = mem0 & - + np*memory_of_double(nq) - -!print *, 'mem = ', mem - if (mem > qp_max_mem/2) then - s = s*2.d0 + if (mem > qp_max_mem*0.5d0) then + Dmin = D_sorted(nq/2) + do ii=nq/2,np-1 + if (D_sorted(ii) < Dmin) then + nq = ii + exit + endif + enddo else exit endif - if ((s > 1.d0).or.(nq == 0)) then - call print_memory_usage() - print *, 'Required peak memory: ', mem, 'Gb' - call resident_memory(mem) - print *, 'Already used memory: ', mem, 'Gb' - print *, 'Not enough memory. Reduce cholesky threshold' - stop -1 - endif + enddo +!call print_memory_usage +!print *, 'np, nq, Predicted memory: ', np, nq, mem - if (s > 0.1d0) then - exit - endif + if (nq <= 0) then + print *, nq + stop 'bug in cholesky: nq <= 0' + endif + Dmin = D_sorted(nq) + nq=0 + do p=1,np + if ( D(Lset(p)) >= Dmin ) then + nq = nq+1 + Dset(nq) = Lset(p) + endif enddo - ! d., e. - mem = mem0 & - + memory_of_int(nq) &! computed(nq) - + np*memory_of_int(nq) &! computed(nq) - + memory_of_double(np) &! Delta_col(np) - + 7*memory_of_double8(ndim8) &! D, Lset, Dset, D_sorted, addr[1-3] - + np*memory_of_double(nq) &! Delta(np,nq) - + (np+nq)*memory_of_double(block_size) ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size) - - if (mem > qp_max_mem) then - call mmap(trim(ezfio_work_dir)//'cholesky_delta', (/ np*1_8, nq*1_8 /), 8, fd(2), .False., .True., c_pointer(2)) - call c_f_pointer(c_pointer(2), Delta, (/ np, nq /)) - ! Deleting the file while it is open makes the file invisible on the filesystem, - ! and automatically deleted, even if the program crashes - iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_delta', 'R') - close(iunit,status='delete') - delta_on_disk = .True. - else - allocate(Delta(np,nq)) - delta_on_disk = .False. - endif -!print *, delta_on_disk - - allocate(Delta_col(np)) + allocate(Delta(np,nq)) allocate(Ltmp_p(np,block_size), stat=ierr) -!print *, 'allocate : Ltmp_p(np,block_size)', memory_of_double8(np*block_size*1_8), np, block_size if (ierr /= 0) then call print_memory_usage() @@ -271,7 +246,6 @@ END_PROVIDER endif allocate(Ltmp_q(nq,block_size), stat=ierr) -!print *, 'allocate : Ltmp_q(nq,block_size)', memory_of_double8(nq*block_size*1_8), nq, block_size if (ierr /= 0) then call print_memory_usage() @@ -280,11 +254,9 @@ END_PROVIDER endif - allocate(computed(nq)) - computed(:) = .False. + computed(1:nq) = .False. -!print *, 'N, rank, block_size', N, rank, block_size !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(k,p,q) do k=1,N !$OMP DO @@ -302,50 +274,11 @@ END_PROVIDER !$OMP BARRIER !$OMP END PARALLEL - PROVIDE nproc if (N>0) then - if (delta_on_disk) then - ! 
Blocking improves I/O performance - - dgemm_block_size = nproc*4 - - allocate (dgemm_buffer1(np,dgemm_block_size)) - allocate (dgemm_buffer2(dgemm_block_size,N)) - - do jj=1,nq,dgemm_block_size - - nqq = min(nq, jj+dgemm_block_size-1) - jj + 1 - - !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q,ii) - do ii=1,N - do q=jj,jj+nqq-1 - dgemm_buffer2(q-jj+1,ii) = Ltmp_q(q,ii) - enddo - enddo - !$OMP END PARALLEL DO - - call dgemm('N', 'T', np, nqq, N, 1.d0, & - Ltmp_p, np, dgemm_buffer2, dgemm_block_size, 0.d0, dgemm_buffer1, np) - - !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q) - do q=jj,jj+nqq-1 - Delta(:,q) = - dgemm_buffer1(:, q-jj+1) - enddo - !$OMP END PARALLEL DO - - enddo - - deallocate(dgemm_buffer1, dgemm_buffer2) - - else - call dgemm('N', 'T', np, nq, N, -1.d0, & Ltmp_p(1,1), np, Ltmp_q(1,1), nq, 0.d0, Delta, np) - endif - - else !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q,j) @@ -368,49 +301,20 @@ END_PROVIDER do j=1,nq if ( (Qmax <= Dmin).or.(N+j*1_8 > ndim8) ) exit + ! i. rank = N+j + if (rank == rank_max) then + print *, 'cholesky: rank_max reached' + exit + endif if (iblock == block_size) then - if (delta_on_disk) then - ! Blocking improves I/O performance - - dgemm_block_size = nproc*4 - allocate (dgemm_buffer1(np,dgemm_block_size)) - allocate (dgemm_buffer2(dgemm_block_size,block_size)) - - do jj=1,nq,dgemm_block_size - nqq = min(nq, jj+dgemm_block_size-1) - jj + 1 - - !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q,ii) - do ii=1,block_size - do q=jj,jj+nqq-1 - dgemm_buffer2(q-jj+1,ii) = Ltmp_q(q,ii) - enddo - enddo - !$OMP END PARALLEL DO - - call dgemm('N', 'T', np, nqq, block_size, 1.d0, & - Ltmp_p(1,1), np, dgemm_buffer2, dgemm_block_size, 0.d0, dgemm_buffer1, np) - - !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q) - do q=jj,jj+nqq-1 - Delta(:,q) = Delta(:,q) - dgemm_buffer1(:, q-jj+1) - enddo - !$OMP END PARALLEL DO - - enddo - deallocate(dgemm_buffer1, dgemm_buffer2) - - else - - call dgemm('N','T',np,nq,block_size,-1.d0, & + call dgemm('N','T',np,nq,block_size,-1.d0, & Ltmp_p, np, Ltmp_q, nq, 1.d0, Delta, np) - endif - - iblock = 0 + iblock = 0 endif @@ -438,26 +342,25 @@ END_PROVIDER if (do_direct_integrals) then !$OMP PARALLEL DO PRIVATE(k) SCHEDULE(dynamic,21) do k=1,np + Delta_col(k) = 0.d0 if (.not.ao_two_e_integral_zero( addr1(Lset(k)), addr1(Dset(m)),& addr2(Lset(k)), addr2(Dset(m)) ) ) then Delta_col(k) = & ao_two_e_integral(addr1(Lset(k)), addr2(Lset(k)),& addr1(Dset(m)), addr2(Dset(m))) - else - Delta_col(k) = 0.d0 endif enddo !$OMP END PARALLEL DO else + PROVIDE ao_integrals_map !$OMP PARALLEL DO PRIVATE(k) SCHEDULE(dynamic,21) do k=1,np + Delta_col(k) = 0.d0 if (.not.ao_two_e_integral_zero( addr1(Lset(k)), addr1(Dset(m)),& addr2(Lset(k)), addr2(Dset(m)) ) ) then Delta_col(k) = & get_ao_two_e_integral( addr1(Lset(k)), addr1(Dset(m)),& addr2(Lset(k)), addr2(Dset(m)), ao_integrals_map) - else - Delta_col(k) = 0.d0 endif enddo !$OMP END PARALLEL DO @@ -507,35 +410,28 @@ END_PROVIDER print '(I10, 4X, ES12.3)', rank, Qmax - deallocate(Delta_col) deallocate(Ltmp_p) deallocate(Ltmp_q) - deallocate(computed) - if (delta_on_disk) then - call munmap( (/ np*1_8, nq*1_8 /), 8, fd(2), c_pointer(2) ) - else - deallocate(Delta) - endif + deallocate(Delta) ! i. N = rank ! j. 
- Dmax = D(Lset(1)) - do p=1,np - Dmax = max(Dmax, D(Lset(p))) - enddo + D_sorted(:) = -D(:) + call dsort_noidx_big(D_sorted,ndim8) + D_sorted(:) = -D_sorted(:) + + Dmax = D_sorted(1) - dscale = 1.d0 - dscale_tmp = dscale*dscale*Dmax np8=0_8 do p8=1,ndim8 - if ( dscale_tmp*D(p8) > tau2 ) then + if ( Dmax*D(p8) >= tau2 ) then np8 = np8+1_8 Lset(np8) = p8 endif enddo - np = np8 + np = int(np8,4) enddo @@ -543,8 +439,11 @@ END_PROVIDER print *, '============ =============' print *, '' + deallocate( D, Lset, Dset, D_sorted ) + deallocate( addr1, addr2, Delta_col, computed ) + + allocate(cholesky_ao(ao_num,ao_num,rank), stat=ierr) -!print *, 'allocate : cholesky_ao(ao_num,ao_num,rank)', memory_of_double8(ao_num*ao_num*rank*1_8) if (ierr /= 0) then call print_memory_usage() @@ -556,7 +455,7 @@ END_PROVIDER !$OMP PARALLEL DO PRIVATE(k,j) do k=1,rank do j=1,ao_num - cholesky_ao(1:ao_num,j,k) = L((j-1)*ao_num+1:j*ao_num,k) + cholesky_ao(1:ao_num,j,k) = L((j-1_8)*ao_num+1_8:1_8*j*ao_num,k) enddo enddo !$OMP END PARALLEL DO @@ -581,5 +480,6 @@ END_PROVIDER call wall_time(wall1) print*,'Time to provide AO cholesky vectors = ',(wall1-wall0)/60.d0, ' min' + END_PROVIDER diff --git a/src/ccsd/NEED b/src/ccsd/NEED index e6e6bc59..8298f28e 100644 --- a/src/ccsd/NEED +++ b/src/ccsd/NEED @@ -1,2 +1,3 @@ +gpu hartree_fock utils_cc diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index b8cfab2a..d8131a9c 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -1,4 +1,5 @@ subroutine run_ccsd_space_orb + use gpu implicit none @@ -9,16 +10,28 @@ subroutine run_ccsd_space_orb double precision :: uncorr_energy,energy, max_elem, max_r, max_r1, max_r2,ta,tb logical :: not_converged - double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:), tau(:,:,:,:), tau_x(:,:,:,:) - double precision, allocatable :: t1(:,:), r1(:,:) - double precision, allocatable :: H_oo(:,:), H_vv(:,:), H_vo(:,:) + type(gpu_double4) :: t2, r2, tau, tau_x + type(gpu_double2) :: t1, r1 + type(gpu_double2) :: H_oo, H_vv, H_vo + + type(gpu_double2) :: d_cc_space_f_oo, d_cc_space_f_vo + type(gpu_double2) :: d_cc_space_f_ov, d_cc_space_f_vv + + type(gpu_double3) :: d_cc_space_v_oo_chol, d_cc_space_v_vo_chol + type(gpu_double3) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol + + type(gpu_double4) :: d_cc_space_v_oovv, d_cc_space_v_voov, d_cc_space_v_ovov + type(gpu_double4) :: d_cc_space_v_oovo, d_cc_space_v_vooo, d_cc_space_v_oooo + type(gpu_double4) :: d_cc_space_v_vvoo, d_cc_space_v_ovvo, d_cc_space_v_ovoo double precision, allocatable :: all_err(:,:), all_t(:,:) integer, allocatable :: list_occ(:), list_vir(:) integer(bit_kind) :: det(N_int,2) integer :: nO, nV, nOa, nVa - if (do_ao_cholesky) then + call set_multiple_levels_omp(.False.) + + if (do_mo_cholesky) then PROVIDE cholesky_mo_transp FREE cholesky_ao else @@ -49,11 +62,77 @@ subroutine run_ccsd_space_orb !print*,'occ',list_occ !print*,'vir',list_vir - allocate(t2(nO,nO,nV,nV), r2(nO,nO,nV,nV)) - allocate(tau(nO,nO,nV,nV)) - allocate(tau_x(nO,nO,nV,nV)) - allocate(t1(nO,nV), r1(nO,nV)) - allocate(H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO)) + ! 
GPU arrays + call gpu_allocate(d_cc_space_f_oo, nO, nO) + call gpu_allocate(d_cc_space_f_vo, nV, nO) + call gpu_allocate(d_cc_space_f_ov, nO, nV) + call gpu_allocate(d_cc_space_f_vv, nV, nV) + + call gpu_upload(cc_space_f_oo, d_cc_space_f_oo) + call gpu_upload(cc_space_f_vo, d_cc_space_f_vo) + call gpu_upload(cc_space_f_ov, d_cc_space_f_ov) + call gpu_upload(cc_space_f_vv, d_cc_space_f_vv) + +! FREE cc_space_f_oo +! FREE cc_space_f_vo +! FREE cc_space_f_vv + + if (do_mo_cholesky) then + call gpu_allocate(d_cc_space_v_oo_chol, cholesky_mo_num, nO, nO) + call gpu_allocate(d_cc_space_v_ov_chol, cholesky_mo_num, nO, nV) + call gpu_allocate(d_cc_space_v_vo_chol, cholesky_mo_num, nV, nO) + call gpu_allocate(d_cc_space_v_vv_chol, cholesky_mo_num, nV, nV) + + call gpu_upload(cc_space_v_oo_chol, d_cc_space_v_oo_chol) + call gpu_upload(cc_space_v_ov_chol, d_cc_space_v_ov_chol) + call gpu_upload(cc_space_v_vo_chol, d_cc_space_v_vo_chol) + call gpu_upload(cc_space_v_vv_chol, d_cc_space_v_vv_chol) + +! FREE cc_space_v_oo_chol +! FREE cc_space_v_ov_chol +! FREE cc_space_v_vo_chol +! FREE cc_space_v_vv_chol + endif + + call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV) + call gpu_allocate(d_cc_space_v_voov, nV, nO, nO, nV) + call gpu_allocate(d_cc_space_v_ovov, nO, nV, nO, nV) + call gpu_allocate(d_cc_space_v_oovo, nO, nO, nV, nO) + call gpu_allocate(d_cc_space_v_ovvo, nO, nV, nV, nO) + call gpu_allocate(d_cc_space_v_vooo, nV, nO, nO, nO) + call gpu_allocate(d_cc_space_v_oooo, nO, nO, nO, nO) + call gpu_allocate(d_cc_space_v_vvoo, nV, nV, nO, nO) + call gpu_allocate(d_cc_space_v_ovoo, nO, nV, nO, nO) + + call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv) + call gpu_upload(cc_space_v_voov, d_cc_space_v_voov) + call gpu_upload(cc_space_v_ovov, d_cc_space_v_ovov) + call gpu_upload(cc_space_v_oovo, d_cc_space_v_oovo) + call gpu_upload(cc_space_v_ovvo, d_cc_space_v_ovvo) + call gpu_upload(cc_space_v_vooo, d_cc_space_v_vooo) + call gpu_upload(cc_space_v_oooo, d_cc_space_v_oooo) + call gpu_upload(cc_space_v_vvoo, d_cc_space_v_vvoo) + call gpu_upload(cc_space_v_ovoo, d_cc_space_v_ovoo) + +! FREE cc_space_v_voov +! FREE cc_space_v_ovov +! FREE cc_space_v_oovo +! FREE cc_space_v_oovv +! FREE cc_space_v_vooo +! FREE cc_space_v_oooo +! FREE cc_space_v_vvoo +! FREE cc_space_v_ovvo +! FREE cc_space_v_ovoo + + call gpu_allocate(t2, nO,nO,nV,nV) + call gpu_allocate(r2, nO,nO,nV,nV) + call gpu_allocate(tau, nO,nO,nV,nV) + call gpu_allocate(tau_x, nO,nO,nV,nV) + call gpu_allocate(t1, nO,nV) + call gpu_allocate(r1, nO,nV) + call gpu_allocate(H_oo, nO, nO) + call gpu_allocate(H_vo, nV, nO) + call gpu_allocate(H_vv, nV, nV) if (cc_update_method == 'diis') then double precision :: rss, diis_mem, extra_mem @@ -95,14 +174,22 @@ subroutine run_ccsd_space_orb endif ! 
Init - call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,t1) - call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,t2) - call update_tau_space(nO,nV,t1,t2,tau) + double precision, allocatable :: h_t1(:,:), h_t2(:,:,:,:) + allocate(h_t1(nO,nV), h_t2(nO,nO,nV,nV)) + + call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,h_t1) + call gpu_upload(h_t1, t1) + + call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,h_t2) + call gpu_upload(h_t2, t2) + + + call update_tau_space(nO,nV,h_t1,t1,t2,tau) call update_tau_x_space(nO,nV,tau,tau_x) - !print*,'hf_energy', hf_energy call det_energy(det,uncorr_energy) print*,'Det energy', uncorr_energy - call ccsd_energy_space_x(nO,nV,tau_x,t1,energy) + + call ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy) print*,'Guess energy', uncorr_energy+energy, energy nb_iter = 0 @@ -118,43 +205,45 @@ subroutine run_ccsd_space_orb do while (not_converged) ! Residue - if (do_ao_cholesky) then -! if (.False.) then - call compute_H_oo_chol(nO,nV,tau_x,H_oo) - call compute_H_vv_chol(nO,nV,tau_x,H_vv) - call compute_H_vo_chol(nO,nV,t1,H_vo) + if (do_mo_cholesky) then + call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo) + call compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, d_cc_space_v_ov_chol,H_vv) + call compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol, H_vo) - call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) - call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) + call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_space_f_ov,d_cc_space_f_vo, & + d_cc_space_v_voov, d_cc_space_v_ovov, d_cc_space_v_oovo, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol) + call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & + d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, d_cc_space_v_oovo, d_cc_space_v_ovvo, d_cc_space_v_ovoo, & + d_cc_space_v_ovov, d_cc_space_v_vvoo, d_cc_space_v_oo_chol, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, & + d_cc_space_f_vo, & + r2, max_r2) else - call compute_H_oo(nO,nV,t1,t2,tau,H_oo) - call compute_H_vv(nO,nV,t1,t2,tau,H_vv) - call compute_H_vo(nO,nV,t1,t2,H_vo) + call compute_H_oo(nO,nV,t1%f,t2%f,tau%f,H_oo%f) + call compute_H_vv(nO,nV,t1%f,t2%f,tau%f,H_vv%f) + call compute_H_vo(nO,nV,t1%f,t2%f,H_vo%f) - call compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) - call compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) + call compute_r1_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r1%f,max_r1) + call compute_r2_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r2%f,max_r2) endif max_r = max(max_r1,max_r2) ! Update if (cc_update_method == 'diis') then - !call update_t_ccsd(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2) - !call update_t_ccsd_diis(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2) - call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1,r2,t1,t2,all_err,all_t) + call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1%f,r2%f,t1%f,t2%f,all_err,all_t) ! 
Standard update as T = T - Delta elseif (cc_update_method == 'none') then - call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1,t1) - call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2,t2) + call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1%f,t1%f) + call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2%f,t2%f) else print*,'Unkown cc_method_method: '//cc_update_method endif - call update_tau_space(nO,nV,t1,t2,tau) + call update_tau_space(nO,nV,t1%f,t1,t2,tau) call update_tau_x_space(nO,nV,tau,tau_x) ! Energy - call ccsd_energy_space_x(nO,nV,tau_x,t1,energy) + call ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy) write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |' nb_iter = nb_iter + 1 @@ -179,8 +268,8 @@ subroutine run_ccsd_space_orb print*,'' if (write_amplitudes) then - call write_t1(nO,nV,t1) - call write_t2(nO,nV,t2) + call write_t1(nO,nV,t1%f) + call write_t2(nO,nV,t2%f) call ezfio_set_utils_cc_io_amplitudes('Read') endif @@ -189,7 +278,14 @@ subroutine run_ccsd_space_orb deallocate(all_err,all_t) endif - deallocate(H_vv,H_oo,H_vo,r1,r2,tau) + call gpu_deallocate(H_oo) + call gpu_deallocate(H_vv) + call gpu_deallocate(H_vo) + + call gpu_deallocate(r1) + call gpu_deallocate(r2) + call gpu_deallocate(tau) + call gpu_deallocate(tau_x) ! CCSD(T) double precision :: e_t, e_t_err @@ -197,28 +293,14 @@ subroutine run_ccsd_space_orb if (cc_par_t .and. elec_alpha_num + elec_beta_num > 2) then - ! Dumb way - !call wall_time(ta) - !call ccsd_par_t_space(nO,nV,t1,t2,e_t) - !call wall_time(tb) - !print*,'Time: ',tb-ta, ' s' - - !print*,'' - !write(*,'(A15,F18.12,A3)') ' E(CCSD(T)) = ', uncorr_energy + energy + e_t, ' Ha' - !write(*,'(A15,F18.12,A3)') ' E(T) = ', e_t, ' Ha' - !write(*,'(A15,F18.12,A3)') ' Correlation = ', energy + e_t, ' Ha' - !print*,'' - ! New e_t = uncorr_energy + energy ! For print in (T) call e_t_err = 0.d0 print*,'Computing (T) correction...' call wall_time(ta) -! call ccsd_par_t_space_v3(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v & -! ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t) - call ccsd_par_t_space_stoch(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v & + call ccsd_par_t_space_stoch(nO,nV,t1%f,t2%f,cc_space_f_o,cc_space_f_v & ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t, e_t_err) call wall_time(tb) @@ -233,168 +315,161 @@ subroutine run_ccsd_space_orb call save_energy(uncorr_energy + energy, e_t) - deallocate(t1,t2) + deallocate(h_t1, h_t2) + if (do_mo_cholesky) then + call gpu_deallocate(d_cc_space_v_oo_chol) + call gpu_deallocate(d_cc_space_v_ov_chol) + call gpu_deallocate(d_cc_space_v_vo_chol) + call gpu_deallocate(d_cc_space_v_vv_chol) + endif + + call gpu_deallocate(d_cc_space_v_oovv) + call gpu_deallocate(d_cc_space_v_voov) + call gpu_deallocate(d_cc_space_v_ovov) + call gpu_deallocate(d_cc_space_v_oovo) + call gpu_deallocate(d_cc_space_v_ovvo) + call gpu_deallocate(d_cc_space_v_vooo) + call gpu_deallocate(d_cc_space_v_oooo) + call gpu_deallocate(d_cc_space_v_vvoo) + call gpu_deallocate(d_cc_space_v_ovoo) + + call gpu_deallocate(d_cc_space_f_oo) + call gpu_deallocate(d_cc_space_f_vo) + call gpu_deallocate(d_cc_space_f_ov) + call gpu_deallocate(d_cc_space_f_vv) + + call gpu_deallocate(t1) + call gpu_deallocate(t2) end ! 
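Everything `run_ccsd_space_orb` touches now lives in `gpu_*` containers: integral blocks and Fock sub-blocks are uploaded once before the iterations, the CPU fallback reaches the raw data through the `%f` component, and the containers are freed together at the end. A minimal sketch of that allocate/upload/compute/free life cycle, assuming the `gpu` module interfaces introduced by this patch (`gpu_double2`, `gpu_allocate`, `gpu_upload`, `gpu_deallocate`):

```fortran
subroutine device_lifecycle_sketch(nO, nV, f_ov)
  use gpu
  implicit none
  integer, intent(in) :: nO, nV
  double precision, intent(in) :: f_ov(nO,nV)   ! host-side data
  type(gpu_double2) :: d_f_ov                   ! device (or unified) container
  call gpu_allocate(d_f_ov, nO, nV)             ! reserve device memory
  call gpu_upload(f_ov, d_f_ov)                 ! host -> device copy
  ! ... GPU BLAS work here; CPU code paths would read d_f_ov%f directly ...
  call gpu_deallocate(d_f_ov)                   ! release device memory
end subroutine device_lifecycle_sketch
```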
Energy -subroutine ccsd_energy_space(nO,nV,tau,t1,energy) - +subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy) + use gpu implicit none - integer, intent(in) :: nO, nV - double precision, intent(in) :: tau(nO,nO,nV,nV) - double precision, intent(in) :: t1(nO,nV) - double precision, intent(out) :: energy + integer, intent(in) :: nO, nV + type(gpu_double4), intent(in) :: tau_x, d_cc_space_v_oovv + type(gpu_double2), intent(in) :: t1, d_cc_space_f_vo + double precision, intent(out) :: energy ! internal integer :: i,j,a,b double precision :: e - energy = 0d0 - !$omp parallel & - !$omp shared(nO,nV,energy,tau,t1,& - !$omp cc_space_f_vo,cc_space_w_oovv) & - !$omp private(i,j,a,b,e) & - !$omp default(none) - e = 0d0 - !$omp do - do a = 1, nV - do i = 1, nO - e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a) - enddo - enddo - !$omp end do nowait - !$omp do - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - e = e + tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b) - enddo - enddo - enddo - enddo - !$omp end do nowait - !$omp critical - energy = energy + e - !$omp end critical - !$omp end parallel + type(gpu_stream) :: s1, s2 + call gpu_stream_create(s1) + call gpu_stream_create(s2) -end + call gpu_set_stream(blas_handle,s1) + call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo%f(1,1), 1, t1%f(1,1), 1, e) -subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy) + call gpu_set_stream(blas_handle,s2) + call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x%f(1,1,1,1), 1_8, d_cc_space_v_oovv%f(1,1,1,1), 1_8, energy) + call gpu_set_stream(blas_handle,gpu_default_stream) - implicit none + call gpu_synchronize() + call gpu_stream_destroy(s1) + call gpu_stream_destroy(s2) - integer, intent(in) :: nO, nV - double precision, intent(in) :: tau_x(nO,nO,nV,nV) - double precision, intent(in) :: t1(nO,nV) - double precision, intent(out) :: energy - - ! internal - integer :: i,j,a,b - double precision :: e - - energy = 0d0 - !$omp parallel & - !$omp shared(nO,nV,energy,tau_x,t1,& - !$omp cc_space_f_vo,cc_space_v_oovv) & - !$omp private(i,j,a,b,e) & - !$omp default(none) - e = 0d0 - !$omp do - do a = 1, nV - do i = 1, nO - e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a) - enddo - enddo - !$omp end do nowait - !$omp do - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - e = e + tau_x(i,j,a,b) * cc_space_v_oovv(i,j,a,b) - enddo - enddo - enddo - enddo - !$omp end do nowait - !$omp critical - energy = energy + e - !$omp end critical - !$omp end parallel + energy = energy + 2.d0*e end ! Tau -subroutine update_tau_space(nO,nV,t1,t2,tau) - +subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau) + use gpu implicit none ! in integer, intent(in) :: nO, nV - double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV) + double precision, intent(in) :: h_t1(nO,nV) + type(gpu_double2), intent(in) :: t1 + type(gpu_double4), intent(in) :: t2 ! out - double precision, intent(out) :: tau(nO,nO,nV,nV) + type(gpu_double4) :: tau ! 
internal integer :: i,j,a,b + type(gpu_stream) :: stream(nV) + !$OMP PARALLEL & - !$OMP SHARED(nO,nV,tau,t2,t1) & + !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) !$OMP DO - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b) - enddo - enddo + do b=1,nV + call gpu_stream_create(stream(b)) + call gpu_set_stream(blas_handle,stream(b)) + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, & + 1.d0, t2%f(1,j,1,b), nO*nO, & + h_t1(j,b), t1%f(1,1), nO, & + tau%f(1,j,1,b), nO*nO) enddo enddo !$OMP END DO !$OMP END PARALLEL + call gpu_synchronize() + + do b=1,nV + call gpu_stream_destroy(stream(b)) + enddo + call gpu_set_stream(blas_handle,gpu_default_stream) + + end subroutine update_tau_x_space(nO,nV,tau,tau_x) - + use gpu implicit none ! in - integer, intent(in) :: nO, nV - double precision, intent(in) :: tau(nO,nO,nV,nV) + integer, intent(in) :: nO, nV + type(gpu_double4), intent(in) :: tau ! out - double precision, intent(out) :: tau_x(nO,nO,nV,nV) + type(gpu_double4) :: tau_x ! internal integer :: i,j,a,b + type(gpu_stream) :: stream(nV) + + do a=1,nV + call gpu_stream_create(stream(a)) + enddo + !$OMP PARALLEL & - !$OMP SHARED(nO,nV,tau,tau_x) & - !$OMP PRIVATE(i,j,a,b) & + !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) & + !$OMP PRIVATE(a,b) & !$OMP DEFAULT(NONE) !$OMP DO - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - tau_x(i,j,a,b) = 2.d0*tau(i,j,a,b) - tau(i,j,b,a) - enddo - enddo + do b=1,nV + do a=1,nV + call gpu_set_stream(blas_handle,stream(a)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, & + 2.d0, tau%f(1,1,a,b), nO, & + -1.d0, tau%f(1,1,b,a), nO, & + tau_x%f(1,1,a,b), nO) enddo enddo !$OMP END DO !$OMP END PARALLEL + call gpu_set_stream(blas_handle,gpu_default_stream) + call gpu_synchronize() + + do b=1,nV + call gpu_stream_destroy(stream(b)) + enddo + + end ! R1 diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index b59dc0bb..6f65ea79 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -1,989 +1,789 @@ -subroutine ccsd_energy_space_chol(nO,nV,tau,t1,energy) - - implicit none - - integer, intent(in) :: nO, nV - double precision, intent(in) :: tau(nO,nO,nV,nV) - double precision, intent(in) :: t1(nO,nV) - double precision, intent(out) :: energy - - ! internal - integer :: i,j,a,b - double precision :: e - - energy = 0d0 - !$omp parallel & - !$omp shared(nO,nV,energy,tau,t1,& - !$omp cc_space_f_vo,cc_space_w_oovv) & - !$omp private(i,j,a,b,e) & - !$omp default(none) - e = 0d0 - !$omp do - do a = 1, nV - do i = 1, nO - e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a) - enddo - enddo - !$omp end do nowait - !$omp do - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - e = e + tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b) - enddo - enddo - enddo - enddo - !$omp end do nowait - !$omp critical - energy = energy + e - !$omp end critical - !$omp end parallel - -end - -! Tau - -subroutine update_tau_space_chol(nO,nV,t1,t2,tau) - - implicit none - - ! in - integer, intent(in) :: nO, nV - double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV) - - ! out - double precision, intent(out) :: tau(nO,nO,nV,nV) - - ! 
internal - integer :: i,j,a,b - - !$OMP PARALLEL & - !$OMP SHARED(nO,nV,tau,t2,t1) & - !$OMP PRIVATE(i,j,a,b) & - !$OMP DEFAULT(NONE) - !$OMP DO - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b) - enddo - enddo - enddo - enddo - !$OMP END DO - !$OMP END PARALLEL - -end - -! R1 - -subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) - - implicit none - - ! in - integer, intent(in) :: nO, nV - double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV), tau(nO,nO,nV,nV) - double precision, intent(in) :: H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO) - - ! out - double precision, intent(out) :: r1(nO,nV), max_r1 - - ! internal - integer :: u,i,j,beta,a,b - - !$omp parallel & - !$omp shared(nO,nV,r1,cc_space_f_ov) & - !$omp private(u,beta) & - !$omp default(none) - !$omp do - do beta = 1, nV - do u = 1, nO - r1(u,beta) = cc_space_f_ov(u,beta) - enddo - enddo - !$omp end do - !$omp end parallel - - double precision, allocatable :: X_oo(:,:) - allocate(X_oo(nO,nO)) - call dgemm('N','N', nO, nO, nV, & - -2d0, t1 , size(t1,1), & - cc_space_f_vo, size(cc_space_f_vo,1), & - 0d0, X_oo , size(X_oo,1)) - - call dgemm('T','N', nO, nV, nO, & - 1d0, X_oo, size(X_oo,2), & - t1 , size(t1,1), & - 1d0, r1 , size(r1,1)) - deallocate(X_oo) - - call dgemm('N','N', nO, nV, nV, & - 1d0, t1 , size(t1,1), & - H_vv, size(H_vv,1), & - 1d0, r1 , size(r1,1)) - - call dgemm('N','N', nO, nV, nO, & - -1d0, H_oo, size(H_oo,1), & - t1 , size(t1,1), & - 1d0, r1, size(r1,1)) - - double precision, allocatable :: X_voov(:,:,:,:) - allocate(X_voov(nV, nO, nO, nV)) - - !$omp parallel & - !$omp shared(nO,nV,X_voov,t2,t1) & - !$omp private(u,beta,i,a) & - !$omp default(none) - !$omp do - do beta = 1, nV - do u = 1, nO - do i = 1, nO - do a = 1, nV - X_voov(a,i,u,beta) = 2d0 * t2(i,u,a,beta) - t2(u,i,a,beta) + t1(u,a) * t1(i,beta) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - call dgemv('T', nV*nO, nO*nV, & - 1d0, X_voov, size(X_voov,1) * size(X_voov,2), & - H_vo , 1, & - 1d0, r1 , 1) - - deallocate(X_voov) - - double precision, allocatable :: X_ovov(:,:,:,:) - allocate(X_ovov(nO, nV, nO, nV)) - - !$omp parallel & - !$omp shared(nO,nV,cc_space_v_ovov,cc_space_v_voov,X_ovov) & - !$omp private(u,beta,i,a) & - !$omp default(none) - !$omp do - do beta = 1, nV - do u = 1, nO - do a = 1, nv - do i = 1, nO - X_ovov(i,a,u,beta) = 2d0 * cc_space_v_voov(a,u,i,beta) - cc_space_v_ovov(u,a,i,beta) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - call dgemv('T', nO*nV, nO*nV, & - 1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), & - t1 , 1, & - 1d0, r1 , 1) - - deallocate(X_ovov) - - integer :: iblock, block_size, nVmax - double precision, allocatable :: W_vvov(:,:,:,:), W_vvov_tmp(:,:,:,:), T_vvoo(:,:,:,:) - block_size = 16 - allocate(W_vvov(nV,nV,nO,block_size), W_vvov_tmp(nV,nO,nV,block_size), T_vvoo(nV,nV,nO,nO)) - - !$omp parallel & - !$omp private(u,i,b,a) & - !$omp default(shared) - !$omp do - do u = 1, nO - do i = 1, nO - do b = 1, nV - do a = 1, nV - T_vvoo(a,b,i,u) = tau(i,u,a,b) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - do iblock = 1, nV, block_size - nVmax = min(block_size,nV-iblock+1) - - call dgemm('T','N', nV*nO, nV*nVmax, cholesky_mo_num, 1.d0, & - cc_space_v_vo_chol , cholesky_mo_num, & - cc_space_v_vv_chol(1,1,iblock), cholesky_mo_num, & - 0.d0, W_vvov_tmp, nV*nO) - - !$omp parallel & - !$omp private(b,i,a,beta) & - !$omp default(shared) - do beta = 1, nVmax - do i = 1, nO - !$omp do - 
do b = 1, nV - do a = 1, nV - W_vvov(a,b,i,beta) = 2d0 * W_vvov_tmp(a,i,b,beta) - W_vvov_tmp(b,i,a,beta) - enddo - enddo - !$omp end do nowait - enddo - enddo - !$omp barrier - !$omp end parallel - - call dgemm('T','N',nO,nVmax,nO*nV*nV, & - 1d0, T_vvoo, nV*nV*nO, & - W_vvov, nO*nV*nV, & - 1d0, r1(1,iblock), nO) - enddo - - deallocate(W_vvov,T_vvoo) - - - double precision, allocatable :: W_oovo(:,:,:,:) - allocate(W_oovo(nO,nO,nV,nO)) - - !$omp parallel & - !$omp shared(nO,nV,cc_space_v_oovo,W_oovo) & - !$omp private(u,a,i,j) & - !$omp default(none) - do u = 1, nO - !$omp do - do a = 1, nV - do j = 1, nO - do i = 1, nO -! W_oovo(i,j,a,u) = 2d0 * cc_space_v_vooo(a,u,i,j) - cc_space_v_vooo(a,u,j,i) - W_oovo(i,j,a,u) = 2d0 * cc_space_v_oovo(i,j,a,u) - cc_space_v_oovo(j,i,a,u) - enddo - enddo - enddo - !$omp end do nowait - enddo - !$omp end parallel - - call dgemm('T','N', nO, nV, nO*nO*nV, & - -1d0, W_oovo, size(W_oovo,1) * size(W_oovo,2) * size(W_oovo,3), & - tau , size(tau,1) * size(tau,2) * size(tau,3), & - 1d0, r1 , size(r1,1)) - - deallocate(W_oovo) - - max_r1 = 0d0 - do a = 1, nV - do i = 1, nO - max_r1 = max(dabs(r1(i,a)), max_r1) - enddo - enddo - - ! Change the sign for consistency with the code in spin orbitals - !$omp parallel & - !$omp shared(nO,nV,r1) & - !$omp private(a,i) & - !$omp default(none) - !$omp do - do a = 1, nV - do i = 1, nO - r1(i,a) = -r1(i,a) - enddo - enddo - !$omp end do - !$omp end parallel - -end - ! H_oo -subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo) - +subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & + d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo) + use gpu implicit none integer, intent(in) :: nO,nV - double precision, intent(in) :: tau_x(nO, nO, nV, nV) - double precision, intent(out) :: H_oo(nO, nO) + type(gpu_double2), intent(in) :: d_cc_space_f_oo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol + type(gpu_double4), intent(in) :: tau_x + type(gpu_double2), intent(out) :: H_oo integer :: a,b,i,j,u,k - double precision, allocatable :: tau_kau(:,:,:), tmp_vov(:,:,:) + type(gpu_double3) :: tau_kau, tmp_vov, tmp_ovv - allocate(tau_kau(cholesky_mo_num,nV,nO)) - !$omp parallel & - !$omp default(shared) & - !$omp private(i,u,j,k,a,b,tmp_vov) - allocate(tmp_vov(nV,nO,nV) ) - !$omp do - do u = 1, nO + call gpu_allocate(tau_kau, cholesky_mo_num, nV, nO) + + type(gpu_blas) :: blas + + + !$OMP PARALLEL & + !$OMP DEFAULT(SHARED) & + !$OMP PRIVATE(blas,u,b,tmp_vov,tmp_ovv) + + !$OMP SINGLE + !$OMP TASK + call gpu_copy(d_cc_space_f_oo, H_oo) + !$OMP END TASK + !$OMP END SINGLE + + call gpu_allocate(tmp_ovv, nO, nV, nV) + call gpu_allocate(tmp_vov, nV, nO, nV) + + call gpu_blas_create(blas) + + !$OMP DO + do u=1,nO + call gpu_dgeam(blas, 'N', 'N', 1, nO*nV*nV, 1.d0, & + tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f(1,1,1,1), nO, tmp_ovv%f(1,1,1), 1) do b=1,nV - do j=1,nO - do a=1,nV - tmp_vov(a,j,b) = tau_x(u,j,a,b) - enddo - enddo + call gpu_dgeam(blas, 'T', 'T', nV, nO, 1.d0, & + tmp_ovv%f(1,1,b), nO, 0.d0, & + tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV) enddo - call dgemm('N','T',cholesky_mo_num,nV,nO*nV,1.d0, & - cc_space_v_ov_chol, cholesky_mo_num, tmp_vov, nV, & - 0.d0, tau_kau(1,1,u), cholesky_mo_num) + call gpu_dgemm(blas, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, tmp_vov%f(1,1,1), nV, & + 0.d0, tau_kau%f(1,1,u), cholesky_mo_num) enddo - !$omp end do nowait - deallocate(tmp_vov) - !$omp do - do i = 1, nO - do u = 1, nO - H_oo(u,i) = cc_space_f_oo(u,i) - enddo - enddo - !$omp end do 
nowait - !$omp barrier - !$omp end parallel - call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & - tau_kau, cholesky_mo_num*nV, cc_space_v_vo_chol, cholesky_mo_num*nV, & - 1.d0, H_oo, nO) + !$OMP END DO + call gpu_blas_destroy(blas) + + call gpu_deallocate(tmp_vov) + call gpu_deallocate(tmp_ovv) + + !$OMP TASKWAIT + !$OMP END PARALLEL + + call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & + tau_kau%f(1,1,1), cholesky_mo_num*nV, d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num*nV, & + 1.d0, H_oo%f(1,1), nO) + + call gpu_synchronize() + call gpu_deallocate(tau_kau) end ! H_vv -subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv) - +subroutine compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, & + d_cc_space_v_ov_chol,H_vv) + use gpu implicit none - integer, intent(in) :: nO,nV - double precision, intent(in) :: tau_x(nO, nO, nV, nV) - double precision, intent(out) :: H_vv(nV, nV) + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: d_cc_space_f_vv + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol + type(gpu_double4), intent(in) :: tau_x + type(gpu_double2), intent(out) :: H_vv integer :: a,b,i,j,u,k, beta - double precision, allocatable :: tau_kia(:,:,:), tmp_oov(:,:,:) + type(gpu_double3) :: tau_kia, tmp_oov - allocate(tau_kia(cholesky_mo_num,nO,nV)) - !$omp parallel & - !$omp default(shared) & - !$omp private(i,beta,j,k,a,b,tmp_oov) - allocate(tmp_oov(nO,nO,nV) ) - !$omp do + call gpu_allocate(tau_kia, cholesky_mo_num, nO, nV) + + type(gpu_blas) :: blas + + !$OMP PARALLEL & + !$OMP DEFAULT(SHARED) & + !$OMP PRIVATE(a,b,tmp_oov,blas) + + !$OMP SINGLE + !$OMP TASK + call gpu_copy(d_cc_space_f_vv, H_vv) + !$OMP END TASK + !$OMP END SINGLE + + call gpu_blas_create(blas) + call gpu_allocate(tmp_oov, nO, nO, nV) + + !$OMP DO do a = 1, nV do b=1,nV - do j=1,nO - do i=1,nO - tmp_oov(i,j,b) = tau_x(i,j,a,b) - enddo - enddo + call gpu_dgeam(blas, 'N', 'N', nO, nO, 1.d0, & + tau_x%f(1,1,a,b), nO, 0.d0, & + tau_x%f(1,1,a,b), nO, tmp_oov%f(1,1,b), nO) enddo - call dgemm('N','T',cholesky_mo_num,nO,nO*nV,1.d0, & - cc_space_v_ov_chol, cholesky_mo_num, tmp_oov, nO, & - 0.d0, tau_kia(1,1,a), cholesky_mo_num) + call gpu_dgemm(blas, 'N', 'T', cholesky_mo_num, nO, nO*nV, 1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, tmp_oov%f(1,1,1), nO, & + 0.d0, tau_kia%f(1,1,a), cholesky_mo_num) enddo - !$omp end do nowait - deallocate(tmp_oov) + !$OMP END DO - !$omp do - do beta = 1, nV - do a = 1, nV - H_vv(a,beta) = cc_space_f_vv(a,beta) - enddo - enddo - !$omp end do nowait - !$omp barrier - !$omp end parallel - call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & - tau_kia, cholesky_mo_num*nO, cc_space_v_ov_chol, cholesky_mo_num*nO, & - 1.d0, H_vv, nV) + call gpu_blas_destroy(blas) + call gpu_deallocate(tmp_oov) + !$OMP TASKWAIT + !$OMP END PARALLEL + + call gpu_dgemm(blas_handle, 'T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & + tau_kia%f(1,1,1), cholesky_mo_num*nO, d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, & + 1.d0, H_vv%f(1,1), nV) + + call gpu_synchronize() + call gpu_deallocate(tau_kia) end ! 
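`compute_H_oo_chol` and `compute_H_vv_chol` share one shape: an OpenMP loop builds a `(cholesky_mo_num, nO, nV)`-style intermediate slice by slice with per-thread BLAS handles, and a single final `dgemm` over the fused `cholesky_mo_num*nV` (or `*nO`) index contracts it against the Cholesky vectors. A plain-Fortran reference for the `H_vv` contraction (names follow this file; `tau_kia` is the intermediate built from `tau_x`), a sketch meant only as a small-system check of the device path:

```fortran
subroutine h_vv_reference(nO, nV, nK, f_vv, tau_kia, L_ov, H_vv)
  ! H_vv(a,beta) = f_vv(a,beta) - sum_{k,i} tau_kia(k,i,a) * L_ov(k,i,beta)
  ! nK stands for cholesky_mo_num; L_ov for cc_space_v_ov_chol.
  implicit none
  integer, intent(in) :: nO, nV, nK
  double precision, intent(in)  :: f_vv(nV,nV), tau_kia(nK,nO,nV), L_ov(nK,nO,nV)
  double precision, intent(out) :: H_vv(nV,nV)
  integer :: a, beta, i, k
  do beta = 1, nV
    do a = 1, nV
      H_vv(a,beta) = f_vv(a,beta)
      do i = 1, nO
        do k = 1, nK
          H_vv(a,beta) = H_vv(a,beta) - tau_kia(k,i,a) * L_ov(k,i,beta)
        enddo
      enddo
    enddo
  enddo
end subroutine h_vv_reference
```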
H_vo -subroutine compute_H_vo_chol(nO,nV,t1,H_vo) - +subroutine compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, & + d_cc_space_v_ov_chol,d_cc_space_v_vo_chol, H_vo) + use gpu implicit none - integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV) - double precision, intent(out) :: H_vo(nV, nO) + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: t1, d_cc_space_f_vo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol + type(gpu_double2), intent(out) :: H_vo integer :: a,b,i,j,u,k - double precision, allocatable :: tmp_k(:), tmp(:,:,:), tmp2(:,:,:) + type(gpu_double1) :: tmp_k + type(gpu_double3) :: tmp, tmp2 + + call gpu_copy(d_cc_space_f_vo, H_vo) + + call gpu_allocate(tmp_k, cholesky_mo_num) + + call gpu_dgemm(blas_handle, 'N', 'N', cholesky_mo_num, 1, nO*nV, 2.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, & + t1%f(1,1), nO*nV, 0.d0, tmp_k%f(1), cholesky_mo_num) + + call gpu_dgemm(blas_handle, 'T', 'N', nV*nO, 1, cholesky_mo_num, 1.d0, & + d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num, tmp_k%f(1), cholesky_mo_num, 1.d0, & + H_vo%f(1,1), nV*nO) + + call gpu_deallocate(tmp_k) + + + call gpu_allocate(tmp, cholesky_mo_num, nO, nO) + + call gpu_dgemm(blas_handle, 'N', 'T', cholesky_mo_num*nO, nO, nV, 1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, t1%f(1,1), nO, 0.d0, tmp%f(1,1,1), cholesky_mo_num*nO) + + call gpu_allocate(tmp2, cholesky_mo_num, nO, nO) + + type(gpu_stream) :: stream(nO) do i=1,nO - do a=1,nV - H_vo(a,i) = cc_space_f_vo(a,i) - enddo + call gpu_stream_create(stream(i)) enddo - allocate(tmp_k(cholesky_mo_num)) - call dgemm('N', 'N', cholesky_mo_num, 1, nO*nV, 2.d0, & - cc_space_v_ov_chol, cholesky_mo_num, & - t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num) - - call dgemm('T','N',nV*nO,1,cholesky_mo_num,1.d0, & - cc_space_v_vo_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, & - H_vo, nV*nO) - deallocate(tmp_k) - - allocate(tmp(cholesky_mo_num,nO,nO)) - allocate(tmp2(cholesky_mo_num,nO,nO)) - - call dgemm('N','T', cholesky_mo_num*nO, nO, nV, 1.d0, & - cc_space_v_ov_chol, cholesky_mo_num*nO, t1, nO, 0.d0, tmp, cholesky_mo_num*nO) - + !$OMP PARALLEL DO COLLAPSE(2) PRIVATE(i,j) do i=1,nO do j=1,nO - do k=1,cholesky_mo_num - tmp2(k,j,i) = tmp(k,i,j) - enddo + call gpu_set_stream(blas_handle,stream(j)) + call gpu_dgeam(blas_handle, 'N', 'N', cholesky_mo_num, 1, 1.d0, & + tmp%f(1,i,j), cholesky_mo_num, 0.d0, & + tmp%f(1,i,j), cholesky_mo_num, tmp2%f(1,j,i), cholesky_mo_num) enddo enddo - deallocate(tmp) + !$OMP END PARALLEL DO - call dgemm('T','N', nV, nO, cholesky_mo_num*nO, -1.d0, & - cc_space_v_ov_chol, cholesky_mo_num*nO, tmp2, cholesky_mo_num*nO, & - 1.d0, H_vo, nV) + call gpu_set_stream(blas_handle,gpu_default_stream) + call gpu_synchronize() + + do i=1,nO + call gpu_stream_destroy(stream(i)) + enddo + call gpu_deallocate(tmp) + + call gpu_dgemm(blas_handle, 'T','N', nV, nO, cholesky_mo_num*nO, -1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, tmp2%f(1,1,1), cholesky_mo_num*nO, & + 1.d0, H_vo%f(1,1), nV) + + call gpu_synchronize() + call gpu_deallocate(tmp2) +end + +! R1 + +subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_space_f_ov,d_cc_space_f_vo, & + d_cc_space_v_voov, d_cc_space_v_ovov, d_cc_space_v_oovo, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol) + use gpu + implicit none + + ! 
in + integer, intent(in) :: nO, nV + type(gpu_double2), intent(in) :: t1, H_oo, H_vo, H_vv, d_cc_space_f_ov,d_cc_space_f_vo + type(gpu_double3), intent(in) :: d_cc_space_v_vo_chol, d_cc_space_v_vv_chol + type(gpu_double4), intent(in) :: t2, tau, d_cc_space_v_voov, d_cc_space_v_ovov, d_cc_space_v_oovo + + ! out + type(gpu_double2), intent(out) :: r1 + double precision, intent(out) :: max_r1 + + ! internal + integer :: u,i,j,beta,a,b + + type(gpu_stream) :: stream(nV) + + do a=1,nV + call gpu_stream_create(stream(a)) + enddo + + type(gpu_double2) :: X_oo + call gpu_allocate(X_oo,nO,nO) + + call gpu_copy(d_cc_space_f_ov, r1) + + call gpu_set_stream(blas_handle, stream(1)) + call gpu_dgemm(blas_handle, 'N','N', nO, nV, nV, & + 1d0, t1%f(1,1) , size(t1%f,1), & + H_vv%f(1,1), size(H_vv%f,1), & + 1d0, r1%f(1,1) , size(r1%f,1)) + + call gpu_dgemm(blas_handle, 'N','N', nO, nV, nO, & + -1d0, H_oo%f(1,1), size(H_oo%f,1), & + t1%f(1,1) , size(t1%f,1), & + 1d0, r1%f(1,1), size(r1%f,1)) + + call gpu_set_stream(blas_handle, stream(nV)) + call gpu_dgemm(blas_handle, 'N','N', nO, nO, nV, & + -2d0, t1%f(1,1), size(t1%f,1), & + d_cc_space_f_vo%f(1,1), size(d_cc_space_f_vo%f,1), & + 0d0, X_oo%f(1,1), size(X_oo%f,1)) + + call gpu_synchronize() + call gpu_set_stream(blas_handle, gpu_default_stream) + + call gpu_dgemm(blas_handle, 'T','N', nO, nV, nO, & + 1d0, X_oo%f(1,1), size(X_oo%f,2), & + t1%f(1,1) , size(t1%f,1), & + 1d0, r1%f(1,1) , size(r1%f,1)) + + + + type(gpu_double4) :: X_voov + call gpu_allocate(X_voov, nV, nO, nO, nV) + + do i=1,nO + do beta=1,nV + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'T', 'T', nV, nO, -1.d0, t2%f(1,i,1,beta), & + nO*nO, t1%f(i,beta), t1%f(1,1), nO, X_voov%f(1,i,1,beta), nV*nO) + enddo + enddo + + do beta=1,nV + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'T', nV, nO*nO, 1.d0, X_voov%f(1,1,1,beta), & + nV, 2.d0, t2%f(1,1,1,beta), nO*nO, X_voov%f(1,1,1,beta), nV) + enddo + + call gpu_synchronize() + call gpu_deallocate(X_oo) + + call gpu_set_stream(blas_handle, gpu_default_stream) + + call gpu_dgemv(blas_handle, 'T', nV*nO, nO*nV, & + 1d0, X_voov%f(1,1,1,1), size(X_voov%f,1) * size(X_voov%f,2), & + H_vo%f(1,1) , 1, & + 1d0, r1%f(1,1) , 1) + + type(gpu_double4) :: X_ovov + call gpu_allocate(X_ovov, nO, nV, nO, nV) + + do beta = 1, nV + call gpu_set_stream(blas_handle, stream(beta)) + do u=1,nO + call gpu_dgeam(blas_handle, 'N', 'T', nO, nV, -1.d0, d_cc_space_v_ovov%f(1,1,u,beta), & + nO, 2.d0, d_cc_space_v_voov%f(1,u,1,beta), nV*nO, X_ovov%f(1,1,u,beta), nO) + enddo + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + call gpu_synchronize() + call gpu_deallocate(X_voov) + + call gpu_dgemv(blas_handle, 'T', nO*nV, nO*nV, & + 1d0, X_ovov%f(1,1,1,1), size(X_ovov%f,1) * size(X_ovov%f,2), & + t1%f(1,1), 1, & + 1d0, r1%f(1,1), 1) + + + integer :: iblock, block_size, nVmax + type(gpu_double4) :: W_vvov, W_vvov_tmp, T_vvoo + + block_size = 16 + call gpu_allocate(T_vvoo, nV,nV,nO,nO) + + call gpu_dgeam(blas_handle, 'T', 'N', nV*nV, nO*nO, 1.d0, tau%f(1,1,1,1), & + nO*nO, 0.d0, T_vvoo%f(1,1,1,1), nV*nV, T_vvoo%f(1,1,1,1), nV*nV) + + call gpu_allocate(W_vvov,nV, nV,nO,block_size) + call gpu_allocate(W_vvov_tmp, nV,nO,nV,block_size) + + do iblock = 1, nV, block_size + nVmax = min(block_size,nV-iblock+1) + + call gpu_dgemm(blas_handle, 'T','N', nV*nO, nV*nVmax, cholesky_mo_num, 1.d0, & + d_cc_space_v_vo_chol%f(1,1,1) , cholesky_mo_num, & + d_cc_space_v_vv_chol%f(1,1,iblock), cholesky_mo_num, & + 0.d0, 
W_vvov_tmp%f(1,1,1,1), nV*nO) + + call gpu_synchronize() + do b=1,nV + call gpu_set_stream(blas_handle, stream(b)) + do i=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nV, nVmax, 2.d0, W_vvov_tmp%f(1,i,b,1), & + nV*nO*nV, 0.d0, W_vvov_tmp%f(1,i,b,1), nV*nO*nV, W_vvov%f(1,b,i,1), nV*nV*nO) + enddo + enddo + + call gpu_synchronize() + + do beta = 1, nVmax + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'T', nV, nV*nO, 1.d0, W_vvov%f(1,1,1,beta), & + nV, -1.d0, W_vvov_tmp%f(1,1,1,beta), nV*nO, W_vvov%f(1,1,1,beta), nV) + enddo + call gpu_synchronize() + + call gpu_dgemm(blas_handle, 'T','N',nO,nVmax,nO*nV*nV, & + 1d0, T_vvoo%f(1,1,1,1), nV*nV*nO, & + W_vvov%f(1,1,1,1), nO*nV*nV, & + 1d0, r1%f(1,iblock), nO) + enddo + + call gpu_deallocate(X_ovov) + + type(gpu_double4) :: W_oovo + call gpu_allocate(W_oovo, nO,nO,nV,nO) + + do u = 1, nO + do a = 1, nV + call gpu_set_stream(blas_handle, stream(a)) + call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, 2.d0, d_cc_space_v_oovo%f(1,1,a,u), & + nO, -1.d0, d_cc_space_v_oovo%f(1,1,a,u), nO, W_oovo%f(1,1,a,u), nO) + enddo + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + call gpu_synchronize() + + call gpu_deallocate(W_vvov) + call gpu_deallocate(T_vvoo) + + ! Change the sign for consistency with the code in spin orbitals + call gpu_dgemm(blas_handle, 'T','N', nO, nV, nO*nO*nV, & + 1d0, W_oovo%f(1,1,1,1), size(W_oovo%f,1) * size(W_oovo%f,2) * size(W_oovo%f,3), & + tau%f(1,1,1,1), size(tau%f,1) * size(tau%f,2) * size(tau%f,3), & + -1d0, r1%f(1,1), size(r1%f,1)) + + call gpu_synchronize() + call gpu_deallocate(W_oovo) + + max_r1 = 0d0 + do a = 1, nV + do i = 1, nO + max_r1 = max(dabs(r1%f(i,a)), max_r1) + enddo + enddo + + do a=1,nV + call gpu_stream_destroy(stream(a)) + enddo end ! R2 -subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) - +subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & + d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, d_cc_space_v_oovo, d_cc_space_v_ovvo, d_cc_space_v_ovoo, & + d_cc_space_v_ovov, d_cc_space_v_vvoo, d_cc_space_v_oo_chol, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, & + d_cc_space_f_vo, & + r2,max_r2) + use gpu implicit none ! in - integer, intent(in) :: nO, nV - double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV), tau(nO,nO,nV,nV) - double precision, intent(in) :: H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO) + integer, intent(in) :: nO, nV + type(gpu_double2), intent(in) :: t1, H_oo, H_vv, d_cc_space_f_vo + type(gpu_double4), intent(in) :: t2, tau, d_cc_space_v_oovv + type(gpu_double4), intent(in) :: d_cc_space_v_vooo, d_cc_space_v_oooo + type(gpu_double4), intent(in) :: d_cc_space_v_vvoo, d_cc_space_v_oovo + type(gpu_double4), intent(in) :: d_cc_space_v_ovvo, d_cc_space_v_ovoo + type(gpu_double4), intent(in) :: d_cc_space_v_ovov + type(gpu_double3), intent(in) :: d_cc_space_v_oo_chol, d_cc_space_v_ov_chol + type(gpu_double3), intent(in) :: d_cc_space_v_vo_chol, d_cc_space_v_vv_chol ! out - double precision, intent(out) :: r2(nO,nO,nV,nV), max_r2 + double precision, intent(out) :: max_r2 + type(gpu_double4), intent(out) :: r2 ! internal integer :: u,v,i,j,beta,gam,a,b double precision :: max_r2_local + type(gpu_stream) :: stream(nV) + call set_multiple_levels_omp(.False.) 
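Most of the old OpenMP copy and transpose loops in this file are replaced by `gpu_dgeam`, which, judging from the calls above, follows the `cublasDgeam` convention `C = alpha*op(A) + beta*op(B)`. A minimal sketch of the idiom under that assumption; with `beta = 0` and `op(A) = 'T'` it is an out-of-place transpose, the most common use here:

```fortran
subroutine transpose_on_device(handle, m, n, A, At)
  ! At = A**T via geam with beta = 0 (B's values are then irrelevant, so A
  ! is passed twice, as the patch itself does). A and At are assumed to
  ! live in gpu_allocate'd (unified) memory.
  use gpu
  implicit none
  type(gpu_blas), intent(in)    :: handle
  integer, intent(in)           :: m, n
  double precision, intent(in)  :: A(m,n)
  double precision, intent(out) :: At(n,m)
  call gpu_dgeam(handle, 'T', 'T', n, m, 1.d0, A, m, 0.d0, A, m, At, n)
end subroutine transpose_on_device
```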
- !$omp parallel & - !$omp shared(nO,nV,r2,cc_space_v_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = cc_space_v_oovv(u,v,beta,gam) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel + call gpu_copy(d_cc_space_v_oovv, r2) - double precision, allocatable :: A1(:,:,:,:) - allocate(A1(nO,nO,nO,nO)) - call compute_A1_chol(nO,nV,t1,t2,tau,A1) - call dgemm('N','N',nO*nO,nV*nV,nO*nO, & - 1d0, A1, size(A1,1) * size(A1,2), & - tau, size(tau,1) * size(tau,2), & - 1d0, r2, size(r2,1) * size(r2,2)) + type(gpu_double4) :: A1 + call gpu_allocate(A1,nO,nO,nO,nO) + call compute_A1_chol(nO,nV,t1,t2,tau,d_cc_space_v_vooo, & + d_cc_space_v_oooo, d_cc_space_v_vvoo, A1) + + call gpu_dgemm(blas_handle, 'N','N',nO*nO,nV*nV,nO*nO, & + 1d0, A1%f(1,1,1,1), size(A1%f,1) * size(A1%f,2), & + tau%f(1,1,1,1), size(tau%f,1) * size(tau%f,2), & + 1d0, r2%f(1,1,1,1), size(r2%f,1) * size(r2%f,2)) + + call gpu_deallocate(A1) - deallocate(A1) integer :: block_size, iblock, k block_size = 16 - double precision, dimension(:,:,:), allocatable :: B1, tmp_cc, tmpB1 - double precision, dimension(:,:), allocatable :: tmp_cc2 + type(gpu_double3) :: tmp_cc, B1, tmpB1 + type(gpu_double2) :: tmp_cc2 - allocate(tmp_cc(cholesky_mo_num,nV,nV)) - call dgemm('N','N', cholesky_mo_num*nV, nV, nO, 1.d0, & - cc_space_v_vo_chol, cholesky_mo_num*nV, t1, nO, 0.d0, tmp_cc, cholesky_mo_num*nV) + call gpu_allocate(tmp_cc,cholesky_mo_num,nV,nV) + call gpu_dgemm(blas_handle, 'N','N', cholesky_mo_num*nV, nV, nO, 1.d0, & + d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num*nV, t1%f(1,1), nO, 0.d0, tmp_cc%f(1,1,1), cholesky_mo_num*nV) call set_multiple_levels_omp(.False.) + call gpu_synchronize() + + type(gpu_blas) :: blas + + !$OMP PARALLEL PRIVATE(gam, iblock, B1, tmpB1, tmp_cc2, beta, b, a, blas) + call gpu_allocate(B1,nV,nV,block_size) + call gpu_allocate(tmpB1,nV,block_size,nV) + call gpu_allocate(tmp_cc2,cholesky_mo_num,nV) + + call gpu_blas_create(blas) - !$OMP PARALLEL PRIVATE(gam, iblock, B1, tmpB1, tmp_cc2, beta, b, a) - allocate(B1(nV,nV,block_size), tmpB1(nV,block_size,nV), tmp_cc2(cholesky_mo_num,nV)) !$OMP DO do gam = 1, nV - do a=1,nV - do k=1,cholesky_mo_num - tmp_cc2(k,a) = cc_space_v_vv_chol(k,a,gam) - tmp_cc(k,a,gam) - enddo - enddo + call gpu_dgeam(blas, 'N', 'N', cholesky_mo_num, nV, 1.d0, d_cc_space_v_vv_chol%f(1,1,gam), & + cholesky_mo_num, -1.d0, tmp_cc%f(1,1,gam), cholesky_mo_num, tmp_cc2%f(1,1), cholesky_mo_num) do iblock = 1, nV, block_size - call dgemm('T', 'N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, & - -1.d0, tmp_cc(1,1,iblock), cholesky_mo_num, & - cc_space_v_vv_chol(1,1,gam), cholesky_mo_num, & - 0.d0, tmpB1, nV*block_size) + call gpu_dgemm(blas, 'T', 'N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, & + -1.d0, tmp_cc%f(1,1,iblock), cholesky_mo_num, & + d_cc_space_v_vv_chol%f(1,1,gam), cholesky_mo_num, & + 0.d0, tmpB1%f(1,1,1), nV*block_size) - call dgemm('T','N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, & - 1.d0, cc_space_v_vv_chol(1,1,iblock), cholesky_mo_num, & - tmp_cc2, cholesky_mo_num, & - 1.d0, tmpB1, nV*block_size) + call gpu_dgemm(blas, 'T','N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, & + 1.d0, d_cc_space_v_vv_chol%f(1,1,iblock), cholesky_mo_num, & + tmp_cc2%f(1,1), cholesky_mo_num, & + 1.d0, tmpB1%f(1,1,1), nV*block_size) do beta = iblock, min(nV, iblock+block_size-1) - do b = 1, nV - do a = 1, nV - B1(a,b,beta-iblock+1) = 
tmpB1(a,beta-iblock+1,b) - enddo - enddo + call gpu_dgeam(blas, 'N', 'N', nV, nV, 1.d0, tmpB1%f(1,beta-iblock+1,1), & + nV*block_size, 0.d0, B1%f(1,1,beta-iblock+1), nV, B1%f(1,1,beta-iblock+1), nV) enddo - call dgemm('N','N',nO*nO,min(block_size, nV-iblock+1),nV*nV, & - 1d0, tau, size(tau,1) * size(tau,2), & - B1 , size(B1 ,1) * size(B1 ,2), & - 1d0, r2(1,1,iblock,gam), size(r2 ,1) * size(r2 ,2)) + call gpu_dgemm(blas, 'N','N',nO*nO,min(block_size, nV-iblock+1),nV*nV, & + 1d0, tau%f(1,1,1,1), size(tau%f,1) * size(tau%f,2), & + B1%f(1,1,1) , size(B1%f ,1) * size(B1%f ,2), & + 1d0, r2%f(1,1,iblock,gam), size(r2%f ,1) * size(r2%f ,2)) enddo enddo !$OMP ENDDO - deallocate(B1, tmpB1, tmp_cc2) + call gpu_blas_destroy(blas) + + call gpu_deallocate(B1) + call gpu_deallocate(tmpB1) + call gpu_deallocate(tmp_cc2) !$OMP END PARALLEL - deallocate(tmp_cc) + call gpu_deallocate(tmp_cc) + + type(gpu_double4) :: X_oovv + call gpu_allocate(X_oovv,nO,nO,nV,nV) + call gpu_copy(t2,X_oovv) + + type(gpu_double2) :: g_occ, g_vir + call gpu_allocate(g_vir,nV,nV) + call gpu_allocate(g_occ,nO,nO) + call compute_g_vir_chol(nO,nV,t1,t2,H_vv,d_cc_space_f_vo, & + d_cc_space_v_ov_chol, d_cc_space_v_vv_chol, g_vir) + call compute_g_occ_chol(nO,nV,t1,t2,H_oo, & + d_cc_space_f_vo, d_cc_space_v_ov_chol, d_cc_space_v_oo_chol, d_cc_space_v_ovoo, g_occ) + + type(gpu_double4) :: Y_oovv + call gpu_allocate(Y_oovv,nO,nO,nV,nV) + + call gpu_dgemm(blas_handle, 'N','N',nO*nO*nV,nV,nV, & + 1d0, X_oovv%f(1,1,1,1), size(X_oovv%f,1) * size(X_oovv%f,2) * size(X_oovv%f,3), & + g_vir%f(1,1), size(g_vir%f,1), & + 0d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1) * size(Y_oovv%f,2) * size(Y_oovv%f,3)) + + call gpu_dgemm(blas_handle, 'N','N',nO,nO*nV*nV,nO, & + -1d0, g_occ%f(1,1), size(g_occ%f,1), & + t2%f(1,1,1,1) , size(t2%f,1), & + 1d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1)) + + call gpu_dgemm(blas_handle, 'N','N',nO*nO*nV,nV,nO, & + -1d0, d_cc_space_v_oovo%f(1,1,1,1), size(cc_space_v_oovo,1) * size(cc_space_v_oovo,2) * size(cc_space_v_oovo,3), & + t1%f(1,1) , size(t1%f,1), & + 1d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1) * size(Y_oovv%f,2) * size(Y_oovv%f,3)) - double precision, allocatable :: X_oovv(:,:,:,:) - allocate(X_oovv(nO,nO,nV,nV)) - !$omp parallel & - !$omp shared(nO,nV,t2,X_oovv) & - !$omp private(u,v,gam,a) & - !$omp default(none) - !$omp do - do a = 1, nV + call gpu_dgeam(blas_handle, 'N', 'N', nO*nO, nV*nV, 1.d0, Y_oovv%f(1,1,1,1), & + nO*nO, 1.d0, r2%f(1,1,1,1), nO*nO, r2%f(1,1,1,1), nO*nO) + + call gpu_synchronize() + call gpu_deallocate(X_oovv) + + call gpu_deallocate(g_vir) + call gpu_deallocate(g_occ) + + type(gpu_double4) :: X_vovo, Y_oovo + call gpu_allocate(X_vovo,nV,nO,nV,nO) + + do a=1,nV + call gpu_stream_create(stream(a)) + enddo + + do gam = 1, nV + call gpu_set_stream(blas_handle, stream(gam)) + do beta = 1, nV + call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, 1.d0, r2%f(1,1,beta,gam), & + nO, 1.d0, Y_oovv%f(1,1,gam,beta), nO, r2%f(1,1,beta,gam), nO) + enddo + enddo + + do i = 1, nO do gam = 1, nV - do v = 1, nO - do u = 1, nO - X_oovv(u,v,gam,a) = t2(u,v,gam,a) - enddo - enddo + call gpu_set_stream(blas_handle, stream(gam)) + call gpu_dgeam(blas_handle, 'T', 'N', nV, nO, 1.d0, d_cc_space_v_ovvo%f(1,1,gam,i), & + nO, 0.d0, X_vovo%f(1,1,gam,i), nV, X_vovo%f(1,1,gam,i), nV) enddo enddo - !$omp end do - !$omp end parallel - double precision, allocatable :: g_vir(:,:) - allocate(g_vir(nV,nV)) - call compute_g_vir_chol(nO,nV,t1,t2,H_vv,g_vir) - - double precision, allocatable :: Y_oovv(:,:,:,:) - allocate(Y_oovv(nO,nO,nV,nV)) - - 
call dgemm('N','N',nO*nO*nV,nV,nV, & - 1d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3), & - g_vir, size(g_vir,1), & - 0d0, Y_oovv, size(Y_oovv,1) * size(Y_oovv,2) * size(Y_oovv,3)) - deallocate(g_vir) - deallocate(X_oovv) - - !$omp parallel & - !$omp shared(nO,nV,r2,Y_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) + Y_oovv(u,v,beta,gam) + Y_oovv(v,u,gam,beta) - enddo - enddo - enddo + do a=1,nV + call gpu_stream_destroy(stream(a)) enddo - !$omp end do - !$omp end parallel - deallocate(Y_oovv) + call gpu_set_stream(blas_handle, gpu_default_stream) - double precision, allocatable :: g_occ(:,:) - allocate(g_occ(nO,nO)) - call compute_g_occ_chol(nO,nV,t1,t2,H_oo,g_occ) - allocate(X_oovv(nO,nO,nV,nV)) - call dgemm('N','N',nO,nO*nV*nV,nO, & - 1d0, g_occ , size(g_occ,1), & - t2 , size(t2,1), & - 0d0, X_oovv, size(X_oovv,1)) - deallocate(g_occ) - !$omp parallel & - !$omp shared(nO,nV,r2,X_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(u,v,beta,gam) - X_oovv(v,u,gam,beta) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - deallocate(X_oovv) - - double precision, allocatable :: X_vovv(:,:,:,:) - - allocate(X_vovv(nV,nO,nV,block_size)) - allocate(Y_oovv(nO,nO,nV,nV)) + call gpu_allocate(Y_oovo,nO,nO,nV,nO) + !$OMP PARALLEL PRIVATE(blas, iblock, gam, X_vovv) + call gpu_blas_create(blas) + type(gpu_double4) :: X_vovv + call gpu_allocate(X_vovv,nV,nO,nV,block_size) + !$OMP DO do iblock = 1, nV, block_size do gam = iblock, min(nV, iblock+block_size-1) - call dgemm('T','N',nV, nO*nV, cholesky_mo_num, 1.d0, & - cc_space_v_vv_chol(1,1,gam), cholesky_mo_num, cc_space_v_ov_chol, & - cholesky_mo_num, 0.d0, X_vovv(1,1,1,gam-iblock+1), nV) + call gpu_dgemm(blas, 'T','N',nV, nO*nV, cholesky_mo_num, 1.d0, & + d_cc_space_v_vv_chol%f(1,1,gam), cholesky_mo_num, d_cc_space_v_ov_chol%f(1,1,1), & + cholesky_mo_num, 0.d0, X_vovv%f(1,1,1,gam-iblock+1), nV) enddo - call dgemm('N','N',nO,nO*nV*min(block_size, nV-iblock+1),nV, & - 1d0, t1 , size(t1,1), & - X_vovv, size(X_vovv,1), & - 0d0, Y_oovv(1,1,1,iblock), size(Y_oovv,1)) + call gpu_dgemm(blas, 'N','N', nO, & + nO*nV*min(block_size, nV-iblock+1),nV, & + 1.d0, t1%f(1,1) , size(t1%f,1), & + X_vovv%f(1,1,1,1), size(X_vovv%f,1), & + 0d0, Y_oovv%f(1,1,1,iblock), size(Y_oovv%f,1)) + enddo + !$OMP END DO + + call gpu_blas_destroy(blas) + call gpu_deallocate(X_vovv) + !$OMP END PARALLEL + + call gpu_dgemm(blas_handle, 'N','N',nO,nO*nV*nO,nV, & + 1d0, t1%f(1,1), size(t1%f,1), & + X_vovo%f(1,1,1,1), size(X_vovo%f,1), & + 0d0, Y_oovo%f(1,1,1,1), size(Y_oovo%f,1)) + + call gpu_dgemm(blas_handle, 'N','N',nO*nO*nV, nV, nO, & + -1d0, Y_oovo%f(1,1,1,1), size(Y_oovo%f,1) * size(Y_oovo%f,2) * size(Y_oovo%f,3), & + t1%f(1,1) , size(t1%f,1), & + 1d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1) * size(Y_oovv%f,2) * size(Y_oovv%f,3)) + + call gpu_synchronize() + call gpu_deallocate(X_vovo) + call gpu_deallocate(Y_oovo) + + do a=1,nV + call gpu_stream_create(stream(a)) enddo - deallocate(X_vovv) - !$omp parallel & - !$omp shared(nO,nV,r2,Y_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do do gam = 1, nV + call gpu_set_stream(blas_handle, stream(gam)) do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) + Y_oovv(v,u,beta,gam) + Y_oovv(u,v,gam,beta) - 
enddo - enddo + call gpu_dgeam(blas_handle, 'T', 'N', nO, nO, 1.d0, Y_oovv%f(1,1,beta,gam), & + nO, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO) + enddo + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, r2%f(1,j,1,gam), & + nO*nO, 1.d0, Y_oovv%f(1,j,gam,1), nO*nO*nV, r2%f(1,j,1,gam), nO*nO) enddo enddo - !$omp end do - !$omp end parallel - deallocate(Y_oovv) - double precision, allocatable :: X_ovvo(:,:,:,:) - double precision, allocatable :: tcc(:,:,:), tcc2(:,:,:) - allocate(tcc2(cholesky_mo_num,nV,nO), X_ovvo(nO,nV,nV,nO)) - allocate(tcc(cholesky_mo_num,nO,nV)) + call gpu_set_stream(blas_handle, gpu_default_stream) - call dgemm('N','T', cholesky_mo_num*nV, nO, nV, 1.d0, & - cc_space_v_vv_chol, cholesky_mo_num*nV, t1, nO, & - 0.d0, tcc2, cholesky_mo_num*nV) - call dgemm('N','N', cholesky_mo_num*nO, nV, nO, 1.d0, & - cc_space_v_oo_chol, cholesky_mo_num*nO, t1, nO, & - 0.d0, tcc, cholesky_mo_num*nO) + call gpu_synchronize() + call gpu_deallocate(Y_oovv) - call dgemm('T','N', nO*nV, nV*nO, cholesky_mo_num, 1.d0, & - tcc, cholesky_mo_num, tcc2, cholesky_mo_num, 0.d0, & - X_ovvo, nO*nV) + type(gpu_double4) :: X_ovvo + type(gpu_double3) :: tcc, tcc2 + call gpu_allocate(tcc2,cholesky_mo_num,nV,nO) + call gpu_allocate(X_ovvo,nO,nV,nV,nO) + call gpu_allocate(tcc,cholesky_mo_num,nO,nV) + + call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nV, nO, nV, 1.d0, & + d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num*nV, t1%f(1,1), nO, & + 0.d0, tcc2%f(1,1,1), cholesky_mo_num*nV) + + call gpu_dgemm(blas_handle, 'N','N', cholesky_mo_num*nO, nV, nO, 1.d0, & + d_cc_space_v_oo_chol%f(1,1,1), cholesky_mo_num*nO, t1%f(1,1), nO, & + 0.d0, tcc%f(1,1,1), cholesky_mo_num*nO) + + call gpu_dgemm(blas_handle, 'T','N', nO*nV, nV*nO, cholesky_mo_num, 1.d0, & + tcc%f(1,1,1), cholesky_mo_num, tcc2%f(1,1,1), cholesky_mo_num, 0.d0, & + X_ovvo%f(1,1,1,1), nO*nV) + + call gpu_synchronize() - deallocate(tcc, tcc2) - !$omp parallel & - !$omp shared(nO,nV,r2,X_ovvo) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do do gam = 1, nV + call gpu_set_stream(blas_handle, stream(gam)) + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, -1.d0, X_ovvo%f(1,1,gam,j), & + nO, 1.d0, r2%f(1,j,1,gam), nO*nO, r2%f(1,j,1,gam), nO*nO) + enddo do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_ovvo(u,beta,gam,v) - enddo - enddo + call gpu_dgeam(blas_handle, 'T', 'N', nO, nO, -1.d0, X_ovvo%f(1,gam,beta,1), & + nO*nV*nV, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO) enddo enddo - !$omp end do - !$omp do - do beta = 1, nV - do gam = 1, nV - do v = 1, nO - do u = 1, nO - r2(v,u,gam,beta) = r2(v,u,gam,beta) - X_ovvo(u,beta,gam,v) - enddo - enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + + call gpu_synchronize + call gpu_deallocate(tcc) + call gpu_deallocate(tcc2) + call gpu_deallocate(X_ovvo) + + + type(gpu_double4) :: J1, K1 + type(gpu_double4) :: Y_voov, Z_ovov + + + call gpu_allocate(J1,nO,nV,nV,nO) + call compute_J1_chol(nO,nV,t1,t2,d_cc_space_v_ovvo,d_cc_space_v_ovoo, & + d_cc_space_v_vvoo,d_cc_space_v_vo_chol,d_cc_space_v_vv_chol,J1) + + call gpu_allocate(K1,nO,nV,nO,nV) + call compute_K1_chol(nO,nV,t1,t2,d_cc_space_v_ovoo,d_cc_space_v_vvoo, & + d_cc_space_v_ovov,d_cc_space_v_ov_chol,d_cc_space_v_vv_chol,K1) + + + call gpu_allocate(X_ovvo,nO,nV,nV,nO) + call gpu_allocate(Y_voov,nV,nO,nO,nV) + + do a=1, nV + call gpu_set_stream(blas_handle, stream(a)) + do i=1, nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,a,1,i), & + 
nO*nV, -0.5d0, K1%f(1,a,i,1), nO*nV*nO, X_ovvo%f(1,1,a,i), nO) + call gpu_dgeam(blas_handle, 'T', 'T', nV, nO, 2.d0, t2%f(1,i,1,a), & + nO*nO, -1.d0, t2%f(1,i,a,1), nO*nO*nV, Y_voov%f(1,1,i,a), nV) enddo enddo - !$omp end do - !$omp end parallel - deallocate(X_ovvo) - !----- + call gpu_allocate(Z_ovov,nO,nV,nO,nV) - allocate(X_oovv(nO,nO,nV,nV)) + call gpu_synchronize() + call gpu_deallocate(J1) + call gpu_set_stream(blas_handle, gpu_default_stream) - call dgemm('N','N',nO*nO*nV,nV,nO, & - 1d0, cc_space_v_oovo, size(cc_space_v_oovo,1) * size(cc_space_v_oovo,2) * size(cc_space_v_oovo,3), & - t1 , size(t1,1), & - 0d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3)) - !$omp parallel & - !$omp shared(nO,nV,r2,X_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(u,v,beta,gam) - X_oovv(v,u,gam,beta) - enddo - enddo + call gpu_dgemm(blas_handle, 'N','N', nO*nV,nO*nV,nV*nO, & + 1d0, X_ovvo%f(1,1,1,1), size(X_ovvo%f,1) * size(X_ovvo%f,2), & + Y_voov%f(1,1,1,1), size(Y_voov%f,1) * size(Y_voov%f,2), & + 0d0, Z_ovov%f(1,1,1,1), size(Z_ovov%f,1) * size(Z_ovov%f,2)) + + call gpu_synchronize() + call gpu_deallocate(Y_voov) + call gpu_deallocate(X_ovvo) + + type(gpu_double4) :: Y_ovov, X_ovov + call gpu_allocate(X_ovov,nO,nV,nO,nV) + call gpu_allocate(Y_ovov,nO,nV,nO,nV) + + do a=1, nV + call gpu_set_stream(blas_handle, stream(a)) + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, t2%f(1,j,1,a), & + nO*nO, 0.d0, t2%f(1,j,1,a), nO*nO, Y_ovov%f(1,a,j,1), nO*nV*nO) + enddo + do beta=1, nV + call gpu_dgeam(blas_handle, 'T', 'T', nO, nO, 0.5d0, K1%f(1,a,1,beta), & + nO*nV, 0.d0, K1%f(1,a,1,beta), nO*nV, X_ovov%f(1,a,1,beta), nO*nV) enddo enddo - !$omp end do - !$omp end parallel - deallocate(X_oovv) + call gpu_set_stream(blas_handle, gpu_default_stream) - double precision, allocatable :: X_vovo(:,:,:,:), Y_oovo(:,:,:,:) - allocate(X_vovo(nV,nO,nV,nO)) + call gpu_synchronize() - !$omp parallel & - !$omp shared(nO,nV,X_vovo,cc_space_v_ovvo) & - !$omp private(a,v,gam,i) & - !$omp default(none) - do i = 1, nO - !$omp do - do gam = 1, nV - do v = 1, nO - do a = 1, nV - X_vovo(a,v,gam,i) = cc_space_v_ovvo(v,a,gam,i) - enddo - enddo + call gpu_dgemm(blas_handle, 'T','N',nO*nV,nO*nV,nO*nV, & + -1d0, X_ovov%f(1,1,1,1), size(X_ovov%f,1) * size(X_ovov%f,2), & + Y_ovov%f(1,1,1,1), size(Y_ovov%f,1) * size(Y_ovov%f,2), & + 1d0, Z_ovov%f(1,1,1,1), size(Z_ovov%f,1) * size(Z_ovov%f,2)) + + call gpu_synchronize() + + do gam=1, nV + call gpu_set_stream(blas_handle, stream(gam)) + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, r2%f(1,j,1,gam), & + nO*nO, 1.d0, Z_ovov%f(1,1,j,gam), nO, r2%f(1,j,1,gam), nO*nO) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, K1%f(1,1,j,gam), & + nO, 0.d0, K1%f(1,1,j,gam), nO, X_ovov%f(1,gam,j,1), nO*nV*nO) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, 1.d0, t2%f(1,j,1,gam), & + nO*nO, 0.d0, t2%f(1,j,1,gam), nO*nO, Y_ovov%f(1,gam,j,1), nO*nV*nO) enddo - !$omp end do nowait - enddo - !$omp end parallel - - allocate(Y_oovo(nO,nO,nV,nO)) - call dgemm('N','N',nO,nO*nV*nO,nV, & - 1d0, t1, size(t1,1), & - X_vovo, size(X_vovo,1), & - 0d0, Y_oovo, size(Y_oovo,1)) - - deallocate(X_vovo) - allocate(X_oovv(nO,nO,nV,nV)) - call dgemm('N','N',nO*nO*nV, nV, nO, & - 1d0, Y_oovo, size(Y_oovo,1) * size(Y_oovo,2) * size(Y_oovo,3), & - t1 , size(t1,1), & - 0d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3)) - deallocate(Y_oovo) 
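The slice-wise `gpu_dgeam` updates above are issued on one stream per virtual index, so the many small device operations can overlap, and one `gpu_synchronize` fences them all before the buffers are reused or freed. A compact sketch of that pattern, assuming the stream interfaces introduced by this patch (`blas_handle` and `gpu_default_stream` are the module-level defaults):

```fortran
subroutine scaled_slices_on_streams(ns, m, n, alpha, A, C)
  ! C(:,:,b) = alpha * A(:,:,b), one stream per slice b.
  ! A and C are assumed to live in gpu_allocate'd (unified) memory.
  use gpu
  implicit none
  integer, intent(in)           :: ns, m, n
  double precision, intent(in)  :: alpha, A(m,n,ns)
  double precision, intent(out) :: C(m,n,ns)
  type(gpu_stream) :: stream(ns)
  integer :: b
  do b = 1, ns
    call gpu_stream_create(stream(b))
  enddo
  do b = 1, ns
    call gpu_set_stream(blas_handle, stream(b))      ! bind handle to stream b
    call gpu_dgeam(blas_handle, 'N', 'N', m, n, alpha, A(1,1,b), m, &
                   0.d0, A(1,1,b), m, C(1,1,b), m)
  enddo
  call gpu_set_stream(blas_handle, gpu_default_stream)
  call gpu_synchronize()                             ! fence all streams
  do b = 1, ns
    call gpu_stream_destroy(stream(b))
  enddo
end subroutine scaled_slices_on_streams
```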
- - !$omp parallel & - !$omp shared(nO,nV,r2,X_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(u,v,gam,beta) - X_oovv(v,u,beta,gam) - enddo - enddo + do beta=1, nV + call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, 1.d0, r2%f(1,1,beta,gam), & + nO, 1.d0, Z_ovov%f(1,gam,1,beta), nO*nV, r2%f(1,1,beta,gam), nO) enddo enddo - !$omp end do - !$omp end parallel - deallocate(X_oovv) + call gpu_set_stream(blas_handle, gpu_default_stream) - double precision, allocatable :: J1(:,:,:,:) - allocate(J1(nO,nV,nV,nO)) - call compute_J1_chol(nO,nV,t1,t2,cc_space_v_ovvo,cc_space_v_ovoo, & - cc_space_v_vvoo,J1) + call gpu_deallocate(K1) - double precision, allocatable :: K1(:,:,:,:) - allocate(K1(nO,nV,nO,nV)) - call compute_K1_chol(nO,nV,t1,t2,cc_space_v_ovoo,cc_space_v_vvoo, & - cc_space_v_ovov,K1) + call gpu_dgemm(blas_handle, 'N','N',nO*nV,nO*nV,nO*nV, & + 1d0, X_ovov%f(1,1,1,1), size(X_ovov%f,1) * size(X_ovov%f,2), & + Y_ovov%f(1,1,1,1), size(Y_ovov%f,1) * size(Y_ovov%f,2), & + 0d0, Z_ovov%f(1,1,1,1), size(Z_ovov%f,1) * size(Z_ovov%f,2)) - allocate(X_ovvo(nO,nV,nV,nO)) - !$omp parallel & - !$omp private(u,v,gam,beta,i,a) & - !$omp default(shared) - do i = 1, nO - !$omp do - do a = 1, nV - do beta = 1, nV - do u = 1, nO - X_ovvo(u,beta,a,i) = (J1(u,a,beta,i) - 0.5d0 * K1(u,a,i,beta)) - enddo - enddo - enddo - !$omp end do nowait - enddo - !$omp end parallel - deallocate(J1) + call gpu_synchronize() - double precision, allocatable :: Y_voov(:,:,:,:) - allocate(Y_voov(nV,nO,nO,nV)) - - !$omp parallel & - !$omp private(u,v,gam,beta,i,a) & - !$omp default(shared) - !$omp do - do gam = 1, nV - do v = 1, nO - do i = 1, nO - do a = 1, nV - Y_voov(a,i,v,gam) = 2d0 * t2(i,v,a,gam) - t2(i,v,gam,a) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - double precision, allocatable :: Z_ovov(:,:,:,:) - allocate(Z_ovov(nO,nV,nO,nV)) - - call dgemm('N','N', nO*nV,nO*nV,nV*nO, & - 1d0, X_ovvo, size(X_ovvo,1) * size(X_ovvo,2), & - Y_voov, size(Y_voov,1) * size(Y_voov,2), & - 0d0, Z_ovov, size(Z_ovov,1) * size(Z_ovov,2)) - - deallocate(X_ovvo,Y_voov) - - !$omp parallel & - !$omp shared(nO,nV,r2,Z_ovov) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) + Z_ovov(u,beta,v,gam) + Z_ovov(v,gam,u,beta) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - deallocate(Z_ovov) - - double precision, allocatable :: Y_ovov(:,:,:,:), X_ovov(:,:,:,:) - allocate(X_ovov(nO,nV,nO,nV)) - allocate(Y_ovov(nO,nV,nO,nV)) - - !$omp parallel & - !$omp shared(nO,nV,r2,K1,X_ovov,Y_ovov,t2) & - !$omp private(u,a,i,beta,gam) & - !$omp default(none) - !$omp do - do beta = 1, nV - do u = 1, nO - do a = 1, nV - do i = 1, nO - X_ovov(i,a,u,beta) = 0.5d0 * K1(u,a,i,beta) - enddo - enddo - enddo - enddo - !$omp end do nowait - - !$omp do - do gam = 1, nV - do v = 1, nO - do a = 1, nV - do i = 1, nO - Y_ovov(i,a,v,gam) = t2(i,v,gam,a) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - allocate(Z_ovov(nO,nV,nO,nV)) - call dgemm('T','N',nO*nV,nO*nV,nO*nV, & - 1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), & - Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), & - 0d0, Z_ovov, size(Y_ovov,1) * size(Y_ovov,2)) - deallocate(X_ovov, Y_ovov) - - !$omp parallel & - !$omp shared(nO,nV,r2,Z_ovov) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do 
gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - Z_ovov(u,beta,v,gam) - Z_ovov(v,gam,u,beta) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - deallocate(Z_ovov) - - allocate(X_ovov(nO,nV,nO,nV),Y_ovov(nO,nV,nO,nV)) - !$omp parallel & - !$omp shared(nO,nV,K1,X_ovov,Y_ovov,t2) & - !$omp private(u,v,gam,beta,i,a) & - !$omp default(none) - !$omp do - do a = 1, nV - do i = 1, nO - do gam = 1, nV - do u = 1, nO - X_ovov(u,gam,i,a) = K1(u,a,i,gam) - enddo - enddo - enddo - enddo - !$omp end do nowait - - !$omp do - do beta = 1, nV - do v = 1, nO - do a = 1, nV - do i = 1, nO - Y_ovov(i,a,v,beta) = t2(i,v,beta,a) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - deallocate(K1) - - allocate(Z_ovov(nO,nV,nO,nV)) - call dgemm('N','N',nO*nV,nO*nV,nO*nV, & - 1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), & - Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), & - 0d0, Z_ovov, size(Y_ovov,1) * size(Y_ovov,2)) - - deallocate(X_ovov,Y_ovov) - - !$omp parallel & - !$omp shared(nO,nV,r2,Z_ovov) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - Z_ovov(u,gam,v,beta) - Z_ovov(v,beta,u,gam) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - deallocate(Z_ovov) + call gpu_deallocate(X_ovov) + call gpu_deallocate(Y_ovov) ! Change the sign for consistency with the code in spin orbitals + do gam = 1, nV + call gpu_set_stream(blas_handle, stream(gam)) + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, r2%f(1,j,1,gam), & + nO*nO, -1.d0, Z_ovov%f(1,gam,j,1), nO*nV*nO, r2%f(1,j,1,gam), nO*nO) + enddo + do beta = 1, nV + call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, -1.d0, r2%f(1,1,beta,gam), & + nO, 1.d0, Z_ovov%f(1,beta,1,gam), nO*nV, r2%f(1,1,beta,gam), nO) + enddo + enddo + + call gpu_deallocate(Z_ovov) max_r2 = 0d0 !$omp parallel & @@ -996,8 +796,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do a = 1, nV do j = 1, nO do i = 1, nO - r2(i,j,a,b) = -r2(i,j,a,b) - max_r2_local = max(r2(i,j,a,b), max_r2_local) + max_r2_local = max(r2%f(i,j,a,b), max_r2_local) enddo enddo enddo @@ -1012,447 +811,458 @@ end ! A1 -subroutine compute_A1_chol(nO,nV,t1,t2,tau,A1) - +subroutine compute_A1_chol(nO,nV,t1,t2,tau,d_cc_space_v_vooo, & + d_cc_space_v_oooo, d_cc_space_v_vvoo, A1) + use gpu implicit none - integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(in) :: tau(nO, nO, nV, nV) - double precision, intent(out) :: A1(nO, nO, nO, nO) + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: t1 + type(gpu_double4), intent(in) :: t2, tau + type(gpu_double4), intent(in) :: d_cc_space_v_vooo, d_cc_space_v_oooo, d_cc_space_v_vvoo + type(gpu_double4), intent(out) :: A1 integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta - double precision, allocatable :: Y_oooo(:,:,:,:) - allocate(Y_oooo(nO,nO,nO,nO)) + type(gpu_double4) :: Y_oooo + call gpu_allocate(Y_oooo,nO,nO,nO,nO) ! A1(u,v,i,j) = cc_space_v_oooo(u,v,i,j) ! 
A1(u,v,i,j) += cc_space_v_ovoo(u,a,i,j) * t1(v,a) & - call dgemm('N','N', nO, nO*nO*nO, nV, & - 1d0, t1 , size(t1,1), & - cc_space_v_vooo, size(cc_space_v_vooo,1), & - 0d0, Y_oooo, size(Y_oooo,1)) + call gpu_dgemm(blas_handle, 'N','N', nO, nO*nO*nO, nV, & + 1d0, t1%f(1,1) , size(t1%f,1), & + d_cc_space_v_vooo%f(1,1,1,1), size(d_cc_space_v_vooo%f,1), & + 0d0, Y_oooo%f(1,1,1,1), size(Y_oooo%f,1)) - !$omp parallel & - !$omp private(u,v,i,j) & - !$omp default(shared) - !$omp do collapse(2) - do j = 1, nO - do i = 1, nO - do v = 1, nO - do u = 1, nO - A1(u,v,i,j) = cc_space_v_oooo(u,v,i,j) + Y_oooo(v,u,j,i) + Y_oooo(u,v,i,j) - enddo - enddo - enddo + type(gpu_stream) :: stream(nO) + + do i=1, nO + call gpu_stream_create(stream(i)) enddo - !$omp end do - !$omp end parallel - deallocate(Y_oooo) + call gpu_synchronize() + + do j = 1, nO + call gpu_set_stream(blas_handle, stream(j)) + do i = 1, nO + call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, 1.d0, d_cc_space_v_oooo%f(1,1,i,j), & + nO, 1.d0, Y_oooo%f(1,1,j,i), nO, A1%f(1,1,i,j), nO) + enddo + call gpu_dgeam(blas_handle, 'N', 'N', nO, nO*nO, 1.d0, A1%f(1,1,1,j), & + nO, 1.d0, Y_oooo%f(1,1,1,j), nO, A1%f(1,1,1,j), nO) + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + do i=1, nO + call gpu_stream_destroy(stream(i)) + enddo + + call gpu_deallocate(Y_oooo) ! A1(u,v,i,j) += cc_space_v_vvoo(a,b,i,j) * tau(u,v,a,b) - call dgemm('N','N', nO*nO, nO*nO, nV*nV, & - 1d0, tau , size(tau,1) * size(tau,2), & - cc_space_v_vvoo, size(cc_space_v_vvoo,1) * size(cc_space_v_vvoo,2), & - 1d0, A1 , size(A1,1) * size(A1,2)) + call gpu_dgemm(blas_handle, 'N','N', nO*nO, nO*nO, nV*nV, & + 1d0, tau%f(1,1,1,1), size(tau%f,1) * size(tau%f,2), & + d_cc_space_v_vvoo%f(1,1,1,1), size(d_cc_space_v_vvoo%f,1) * size(d_cc_space_v_vvoo%f,2), & + 1d0, A1%f(1,1,1,1), size(A1%f,1) * size(A1%f,2)) + call gpu_synchronize() end ! 
g_occ -subroutine compute_g_occ_chol(nO,nV,t1,t2,H_oo,g_occ) +subroutine compute_g_occ_chol(nO,nV,t1,t2,H_oo, & + d_cc_space_f_vo, d_cc_space_v_ov_chol, d_cc_space_v_oo_chol, d_cc_space_v_ovoo, g_occ) + use gpu implicit none integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV), H_oo(nO, nO) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(out) :: g_occ(nO, nO) + type(gpu_double2), intent(in) :: t1, H_oo, d_cc_space_f_vo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_oo_chol + type(gpu_double4), intent(in) :: t2, d_cc_space_v_ovoo + type(gpu_double2), intent(out) :: g_occ - g_occ = H_oo + call gpu_copy(H_oo, g_occ) - call dgemm('N','N',nO,nO,nV, & - 1d0, t1, size(t1,1), & - cc_space_f_vo, size(cc_space_f_vo,1), & - 1d0, g_occ, size(g_occ,1)) + call gpu_dgemm(blas_handle, 'N','N',nO,nO,nV, & + 1d0, t1%f(1,1), size(t1%f,1), & + d_cc_space_f_vo%f(1,1), size(d_cc_space_f_vo%f,1), & + 1d0, g_occ%f(1,1), size(g_occ%f,1)) - double precision, allocatable :: X(:) - allocate(X(cholesky_mo_num)) - call dgemv('N',cholesky_mo_num,nO*nV,2.d0, & - cc_space_v_ov_chol, cholesky_mo_num, & - t1, 1, 0.d0, X, 1) + type(gpu_double1) :: X + call gpu_allocate(X,cholesky_mo_num) - call dgemv('T',cholesky_mo_num,nO*nO,1.d0, & - cc_space_v_oo_chol, cholesky_mo_num, & - X, 1, 1.d0, g_occ, 1) - deallocate(X) + call gpu_dgemv(blas_handle, 'N',cholesky_mo_num,nO*nV,2.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, & + t1%f(1,1), 1, 0.d0, X%f(1), 1) - call dgemv('T',nO*nV,nO*nO,-1.d0, & - cc_space_v_ovoo, nO*nV, & - t1, 1, 1.d0, g_occ, 1) + call gpu_dgemv(blas_handle, 'T',cholesky_mo_num,nO*nO,1.d0, & + d_cc_space_v_oo_chol%f(1,1,1), cholesky_mo_num, & + X%f(1), 1, 1.d0, g_occ%f(1,1), 1) + + call gpu_dgemv(blas_handle, 'T',nO*nV,nO*nO,-1.d0, & + d_cc_space_v_ovoo%f(1,1,1,1), nO*nV, & + t1%f(1,1), 1, 1.d0, g_occ%f(1,1), 1) + + call gpu_synchronize() + call gpu_deallocate(X) end ! 
g_vir -subroutine compute_g_vir_chol(nO,nV,t1,t2,H_vv,g_vir) +subroutine compute_g_vir_chol(nO,nV,t1,t2,H_vv,d_cc_space_f_vo, & + d_cc_space_v_ov_chol, d_cc_space_v_vv_chol, g_vir) + use gpu implicit none integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV), H_vv(nV, nV) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(out) :: g_vir(nV, nV) + type(gpu_double2), intent(in) :: t1, H_vv, d_cc_space_f_vo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol + type(gpu_double4), intent(in) :: t2 + type(gpu_double2), intent(out) :: g_vir integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam - call dgemm('N','N',nV,nV,nO, & - -1d0, cc_space_f_vo , size(cc_space_f_vo,1), & - t1 , size(t1,1), & - 0d0, g_vir, size(g_vir,1)) + type(gpu_stream) :: stream(max(nO,4)) - double precision, allocatable :: tmp_k(:), tmp_vo(:,:,:), tmp_vo2(:,:,:) - allocate(tmp_k(cholesky_mo_num)) - call dgemm('N','N', cholesky_mo_num, 1, nO*nV, 1.d0, & - cc_space_v_ov_chol, cholesky_mo_num, t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num) - - call dgemm('T','N', nV*nV, 1, cholesky_mo_num, 2.d0, & - cc_space_v_vv_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, & - g_vir, nV*nV) - deallocate(tmp_k) - - allocate(tmp_vo(cholesky_mo_num,nV,nO)) - call dgemm('N','T',cholesky_mo_num*nV, nO, nV, 1.d0, & - cc_space_v_vv_chol, cholesky_mo_num*nV, t1, nO, 0.d0, tmp_vo, cholesky_mo_num*nV) - - allocate(tmp_vo2(cholesky_mo_num,nO,nV)) - do beta=1,nV - do i=1,nO - do k=1,cholesky_mo_num - tmp_vo2(k,i,beta) = -tmp_vo(k,beta,i) - enddo - enddo - enddo - deallocate(tmp_vo) - - do beta = 1, nV - do a = 1, nV - g_vir(a,beta) = g_vir(a,beta) + H_vv(a,beta) - enddo + do i=1,max(nO,4) + call gpu_stream_create(stream(i)) enddo - call dgemm('T','N', nV, nV, nO*cholesky_mo_num, 1.d0, & - cc_space_v_ov_chol, cholesky_mo_num*nO, & - tmp_vo2, cholesky_mo_num*nO, 1.d0, g_vir, nV) + call gpu_set_stream(blas_handle, stream(1)) + call gpu_dgemm(blas_handle, 'N','N',nV,nV,nO, & + -1d0, d_cc_space_f_vo%f(1,1) , size(d_cc_space_f_vo%f,1), & + t1%f(1,1) , size(t1%f,1), & + 0d0, g_vir%f(1,1), size(g_vir%f,1)) + + type(gpu_double1) :: tmp_k + type(gpu_double3) :: tmp_vo, tmp_vo2 + + call gpu_allocate(tmp_k,cholesky_mo_num) + + call gpu_set_stream(blas_handle, stream(2)) + call gpu_dgemm(blas_handle, 'N','N', cholesky_mo_num, 1, nO*nV, 1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, t1%f(1,1), nO*nV, 0.d0, tmp_k%f(1), cholesky_mo_num) + + call gpu_dgemm(blas_handle, 'T','N', nV*nV, 1, cholesky_mo_num, 2.d0, & + d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num, tmp_k%f(1), cholesky_mo_num, 1.d0, & + g_vir%f(1,1), nV*nV) + + call gpu_set_stream(blas_handle, stream(3)) + call gpu_allocate(tmp_vo,cholesky_mo_num,nV,nO) + + call gpu_dgemm(blas_handle, 'N','T',cholesky_mo_num*nV, nO, nV, 1.d0, & + d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num*nV, t1%f(1,1), nO, 0.d0, tmp_vo%f(1,1,1), cholesky_mo_num*nV) + + call gpu_allocate(tmp_vo2,cholesky_mo_num,nO,nV) + + call gpu_synchronize() + call gpu_deallocate(tmp_k) + + do i=1,nO + call gpu_set_stream(blas_handle, stream(i)) + call gpu_dgeam(blas_handle, 'N', 'N', cholesky_mo_num, nV, -1.d0, tmp_vo%f(1,1,i), & + cholesky_mo_num, 0.d0, tmp_vo%f(1,1,i), cholesky_mo_num, tmp_vo2%f(1,i,1), cholesky_mo_num*nO) + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + + do i=1,max(nO,4) + call gpu_stream_destroy(stream(i)) + enddo + call gpu_deallocate(tmp_vo) + + call gpu_dgeam(blas_handle, 'N', 'N', nV, nV, 1.d0, g_vir%f(1,1), & + 
nV, 1.d0, H_vv%f(1,1), nV, g_vir%f(1,1), nV) + + call gpu_dgemm(blas_handle, 'T','N', nV, nV, nO*cholesky_mo_num, 1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, & + tmp_vo2%f(1,1,1), cholesky_mo_num*nO, 1.d0, g_vir%f(1,1), nV) + + call gpu_synchronize() + call gpu_deallocate(tmp_vo2) end ! J1 -subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvoo,J1) +subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvoo,d_cc_space_v_vo_chol,d_cc_space_v_vv_chol,J1) + use gpu implicit none - integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(in) :: v_ovvo(nO,nV,nV,nO), v_ovoo(nO,nV,nO,nO) - double precision, intent(in) :: v_vvoo(nV,nV,nO,nO) - double precision, intent(out) :: J1(nO, nV, nV, nO) + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: t1 + type(gpu_double4), intent(in) :: t2, v_ovvo, v_ovoo, v_vvoo + type(gpu_double3), intent(in) :: d_cc_space_v_vo_chol,d_cc_space_v_vv_chol + type(gpu_double4), intent(out) :: J1 integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam - double precision, allocatable :: X_ovoo(:,:,:,:), Y_ovov(:,:,:,:) - allocate(X_ovoo(nO,nV,nO,nO),Y_ovov(nO,nV,nO,nV)) + type(gpu_double4) :: X_ovoo, Y_ovov - !$omp parallel & - !$omp shared(nO,nV,J1,v_ovvo,v_ovoo,X_ovoo) & - !$omp private(i,j,a,u,beta) & - !$omp default(none) - do i = 1, nO - !$omp do - do beta = 1, nV - do a = 1, nV - do u = 1, nO - J1(u,a,beta,i) = v_ovvo(u,a,beta,i) - enddo - enddo - enddo - !$omp end do nowait - enddo + call gpu_allocate(X_ovoo,nO,nV,nO,nO) - !$omp do collapse(2) - do j = 1, nO - do i = 1, nO - do a = 1, nV - do u = 1, nO - X_ovoo(u,a,i,j) = v_ovoo(u,a,j,i) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel + type(gpu_stream) :: stream(nV) - call dgemm('N','N',nO*nV*nO,nV,nO, & - -1d0, X_ovoo, size(X_ovoo,1) * size(X_ovoo,2) * size(X_ovoo,3), & - t1 , size(t1,1), & - 0d0, Y_ovov, size(Y_ovov,1) * size(Y_ovov,2) * size(Y_ovov,3)) - - !$omp parallel & - !$omp shared(nO,nV,J1,Y_ovov) & - !$omp private(i,beta,a,u) & - !$omp default(none) - do i = 1, nO - !$omp do - do beta = 1, nV - do a = 1, nV - do u = 1, nO - J1(u,a,beta,i) = J1(u,a,beta,i) + Y_ovov(u,a,i,beta) - enddo - enddo - enddo - !$omp end do nowait - enddo - !$omp end parallel - deallocate(X_ovoo) - - double precision, allocatable :: tmp_cc(:,:,:), J1_tmp(:,:,:,:) - allocate(tmp_cc(cholesky_mo_num,nV,nO), J1_tmp(nV,nO,nV,nO)) - - call dgemm('N','T', cholesky_mo_num*nV, nO, nV, 1.d0, & - cc_space_v_vv_chol, cholesky_mo_num*nV, & - t1, nO, & - 0.d0, tmp_cc, cholesky_mo_num*nV) - - call dgemm('T','N', nV*nO, nV*nO, cholesky_mo_num, 1.d0, & - tmp_cc, cholesky_mo_num, cc_space_v_vo_chol, cholesky_mo_num, & - 0.d0, J1_tmp, nV*nO) - - deallocate(tmp_cc) do i=1,nO - do b=1,nV - do a=1,nV - do u=1,nO - J1(u,a,b,i) = J1(u,a,b,i) + J1_tmp(b,u,a,i) - enddo - enddo - enddo - enddo - - deallocate(J1_tmp) - - !- cc_space_v_vvoo(a,b,i,j) * (0.5d0 * t2(u,j,b,beta) + t1(u,b) * t1(j,beta)) & - double precision, allocatable :: X_voov(:,:,:,:), Z_ovvo(:,:,:,:) - allocate(X_voov(nV,nO,nO,nV), Z_ovvo(nO,nV,nV,nO)) - !$omp parallel & - !$omp shared(nO,nV,t2,t1,Y_ovov,X_voov,v_vvoo) & - !$omp private(i,beta,a,u,b,j) & - !$omp default(none) - !$omp do - do b = 1, nV - do j = 1, nO - do beta = 1, nV - do u = 1, nO - Y_ovov(u,beta,j,b) = 0.5d0 * t2(u,j,b,beta) + t1(u,b) * t1(j,beta) - enddo - enddo - enddo - enddo - !$omp end do nowait - - !$omp do - do b = 1, nV - do j = 1, nO - do i = 1, nO - do a = 
1, nV - X_voov(a,i,j,b) = v_vvoo(a,b,i,j) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - call dgemm('N','T',nO*nV,nV*nO,nO*nV, & - -1d0, Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), & - X_voov, size(X_voov,1) * size(X_voov,2), & - 0d0, Z_ovvo, size(Z_ovvo,1) * size(Z_ovvo,2)) - deallocate(X_voov) - - double precision, allocatable :: X_ovvo(:,:,:,:), Y_vovo(:,:,:,:) - allocate(X_ovvo(nO,nV,nV,nO),Y_vovo(nV,nO,nV,nO)) - !$omp parallel & - !$omp shared(nO,nV,J1,Z_ovvo,t2,Y_vovo,v_vvoo,X_ovvo) & - !$omp private(i,beta,a,u,j,b) & - !$omp default(none) - do i = 1, nO - !$omp do - do beta = 1, nV - do a = 1, nV - do u = 1, nO - J1(u,a,beta,i) = J1(u,a,beta,i) + Z_ovvo(u,beta,a,i) - enddo - enddo - enddo - !$omp end do nowait - enddo - - !+ 0.5d0 * (2d0 * cc_space_v_vvoo(a,b,i,j) - cc_space_v_vvoo(b,a,i,j)) * t2(u,j,beta,b) - do j = 1, nO - !$omp do - do b = 1, nV - do i = 1, nO - do a = 1, nV - Y_vovo(a,i,b,j) = 0.5d0 * (2d0 * v_vvoo(a,b,i,j) - v_vvoo(b,a,i,j)) - enddo - enddo - enddo - !$omp end do nowait + call gpu_stream_create(stream(i)) enddo do j = 1, nO - !$omp do - do b = 1, nV - do beta = 1, nV - do u = 1, nO - X_ovvo(u,beta,b,j) = t2(u,j,beta,b) - enddo - enddo + call gpu_set_stream(blas_handle, stream(j)) + do i = 1, nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, v_ovoo%f(1,1,j,i), & + nO, 0.d0, X_ovoo%f(1,1,i,j), nO, X_ovoo%f(1,1,i,j), nO) enddo - !$omp end do nowait enddo - !$omp end parallel - call dgemm('N','T',nO*nV,nV*nO,nV*nO, & - 1d0, X_ovvo, size(X_ovvo,1) * size(X_ovvo,2), & - Y_vovo, size(Y_vovo,1) * size(Y_vovo,2), & - 0d0, Z_ovvo, size(Z_ovvo,1) * size(Z_ovvo,2)) + call gpu_set_stream(blas_handle, gpu_default_stream) + + do i=1,nO + call gpu_stream_destroy(stream(i)) + enddo + + call gpu_allocate(Y_ovov,nO,nV,nO,nV) + + call gpu_dgemm(blas_handle, 'N','N',nO*nV*nO,nV,nO, & + -1d0, X_ovoo%f(1,1,1,1), size(X_ovoo%f,1) * size(X_ovoo%f,2) * size(X_ovoo%f,3), & + t1%f(1,1) , size(t1%f,1), & + 0d0, Y_ovov%f(1,1,1,1), size(Y_ovov%f,1) * size(Y_ovov%f,2) * size(Y_ovov%f,3)) + + + call gpu_copy(v_ovvo, J1) + + call gpu_synchronize() + + do a=1,nV + call gpu_stream_create(stream(a)) + enddo - !$omp parallel & - !$omp shared(nO,nV,J1,Z_ovvo) & - !$omp private(i,beta,a,u) & - !$omp default(none) do i = 1, nO - !$omp do do beta = 1, nV - do a = 1, nV - do u = 1, nO - J1(u,a,beta,i) = J1(u,a,beta,i) + Z_ovvo(u,beta,a,i) - enddo - enddo + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,1,beta,i), & + nO, 1.d0, Y_ovov%f(1,1,i,beta), nO, J1%f(1,1,beta,i), nO) enddo - !$omp end do nowait enddo - !$omp end parallel - deallocate(X_ovvo,Z_ovvo,Y_ovov) + call gpu_allocate(tmp_cc,cholesky_mo_num,nV,nO) + call gpu_allocate(J1_tmp,nV,nO,nV,nO) + + call gpu_set_stream(blas_handle, gpu_default_stream) + + type(gpu_double4) :: J1_tmp + type(gpu_double3) :: tmp_cc + + call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nV, nO, nV, 1.d0, & + d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num*nV, & + t1%f(1,1), nO, & + 0.d0, tmp_cc%f(1,1,1), cholesky_mo_num*nV) + + call gpu_dgemm(blas_handle, 'T','N', nV*nO, nV*nO, cholesky_mo_num, 1.d0, & + tmp_cc%f(1,1,1), cholesky_mo_num, d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num, & + 0.d0, J1_tmp%f(1,1,1,1), nV*nO) + + + call gpu_deallocate(X_ovoo) + + call gpu_synchronize() + call gpu_deallocate(tmp_cc) + + do i = 1, nO + do a = 1, nV + call gpu_set_stream(blas_handle, stream(a)) + call gpu_dgeam(blas_handle, 'N', 'T', nO, nV, 1.d0, J1%f(1,a,1,i), & + nO*nV, 1.d0, 
J1_tmp%f(1,1,a,i), nV, J1%f(1,a,1,i), nO*nV) + enddo + enddo + + type(gpu_double4) :: X_voov, Z_ovvo + + call gpu_allocate(X_voov,nV,nO,nO,nV) + call gpu_allocate(Z_ovvo,nO,nV,nV,nO) + + do j = 1, nO + do beta = 1, nV + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 0.5d0, t2%f(1,j,1,beta), & + nO*nO, t1%f(j,beta), t1%f(1,1), nO, Y_ovov%f(1,beta,j,1), nO*nV*nO) + enddo + enddo + + do b = 1, nV + call gpu_set_stream(blas_handle, stream(b)) + call gpu_dgeam(blas_handle, 'N', 'N', nV, nO*nO, 1.d0, v_vvoo%f(1,b,1,1), & + nV*nV, 0.d0, X_voov%f(1,1,1,b), nV, X_voov%f(1,1,1,b), nV) + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + + call gpu_synchronize() + call gpu_deallocate(J1_tmp) + + call gpu_dgemm(blas_handle, 'N','T',nO*nV,nV*nO,nO*nV, & + -1d0, Y_ovov%f(1,1,1,1), size(Y_ovov%f,1) * size(Y_ovov%f,2), & + X_voov%f(1,1,1,1), size(X_voov%f,1) * size(X_voov%f,2), & + 0d0, Z_ovvo%f(1,1,1,1), size(Z_ovvo%f,1) * size(Z_ovvo%f,2)) + + call gpu_synchronize() + + do i = 1, nO + do a = 1, nV + call gpu_set_stream(blas_handle, stream(a)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,a,1,i), & + nO*nV, 1.d0, Z_ovvo%f(1,1,a,i), nO, J1%f(1,a,1,i), nO*nV) + enddo + enddo + + type(gpu_double4) :: X_ovvo, Y_vovo + call gpu_allocate(Y_vovo,nV,nO,nV,nO) + + do j = 1, nO + do i = 1, nO + call gpu_set_stream(blas_handle, stream(i)) + call gpu_dgeam(blas_handle, 'N', 'T', nV, nV, 1.d0, v_vvoo%f(1,1,i,j), & + nV, -0.5d0, v_vvoo%f(1,1,i,j), nV, Y_vovo%f(1,i,1,j), nO*nV) + enddo + enddo + + call gpu_allocate(X_ovvo,nO,nV,nV,nO) + + do j = 1, nO + do b = 1, nV + call gpu_set_stream(blas_handle, stream(b)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, t2%f(1,j,1,b), & + nO*nO, 0.d0, t2%f(1,j,1,b), nO*nO, X_ovvo%f(1,1,b,j), nO) + enddo + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + call gpu_synchronize() + call gpu_deallocate(X_voov) + + call gpu_dgemm(blas_handle, 'N','T',nO*nV,nV*nO,nV*nO, & + 1d0, X_ovvo%f(1,1,1,1), size(X_ovvo%f,1) * size(X_ovvo%f,2), & + Y_vovo%f(1,1,1,1), size(Y_vovo%f,1) * size(Y_vovo%f,2), & + 0d0, Z_ovvo%f(1,1,1,1), size(Z_ovvo%f,1) * size(Z_ovvo%f,2)) + + call gpu_synchronize() + + do i = 1, nO + do beta = 1, nV + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,1,beta,i), & + nO, 1.d0, Z_ovvo%f(1,beta,1,i), nO*nV, J1%f(1,1,beta,i), nO) + enddo + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + call gpu_deallocate(Y_ovov) + call gpu_deallocate(X_ovvo) + + do a = 1, nV + call gpu_stream_destroy(stream(a)) + enddo + + call gpu_deallocate(Z_ovvo) end ! 
K1 -subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,K1) +subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov, & + d_cc_space_v_ov_chol,d_cc_space_v_vv_chol,K1) + use gpu implicit none - integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(in) :: v_vvoo(nV,nV,nO,nO), v_ovov(nO,nV,nO,nV) - double precision, intent(in) :: v_ovoo(nO,nV,nO,nO) - double precision, intent(out) :: K1(nO, nV, nO, nV) + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: t1 + type(gpu_double4), intent(in) :: t2, v_vvoo, v_ovov, v_ovoo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol + type(gpu_double4), intent(out) :: K1 - double precision, allocatable :: X(:,:,:,:), Y(:,:,:,:), Z(:,:,:,:) + type(gpu_double4) :: X, Y, Z integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam - allocate(X(nV,nO,nV,nO),Y(nO,nV,nV,nO),Z(nO,nV,nV,nO)) - !$omp parallel & - !$omp shared(nO,nV,K1,X,Y,v_vvoo,v_ovov,t1,t2) & - !$omp private(i,beta,a,u,j,b) & - !$omp default(none) - !$omp do - do beta = 1, nV + call gpu_copy(v_ovov, K1) + + type(gpu_stream) :: stream(nV) + do a = 1, nV + call gpu_stream_create(stream(a)) + enddo + + call gpu_allocate(X,nV,nO,nV,nO) + call gpu_allocate(Y,nO,nV,nV,nO) + + do a = 1, nV + call gpu_set_stream(blas_handle, stream(a)) do i = 1, nO - do a = 1, nV - do u = 1, nO - K1(u,a,i,beta) = v_ovov(u,a,i,beta) - enddo - enddo + call gpu_dgeam(blas_handle, 'N', 'N', nV, nO, -1.d0, v_vvoo%f(1,a,i,1), & + nV*nV*nO, 0.d0, v_vvoo%f(1,a,i,1), nV*nV*nO, X%f(1,1,a,i), nV) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 0.5d0, t2%f(1,i,1,a), & + nO*nO, t1%f(i,a), t1%f(1,1), nO, Y%f(1,a,1,i), nO*nV) enddo enddo - !$omp end do nowait - do i = 1, nO - !$omp do - do a = 1, nV - do j = 1, nO - do b = 1, nV - X(b,j,a,i) = - v_vvoo(b,a,i,j) - enddo - enddo - enddo - !$omp end do nowait - enddo + call gpu_set_stream(blas_handle, gpu_default_stream) - do j = 1, nO - !$omp do - do b = 1, nV - do beta = 1, nV - do u = 1, nO - Y(u,beta,b,j) = 0.5d0 * t2(u,j,b,beta) + t1(u,b) * t1(j,beta) - enddo - enddo - enddo - !$omp end do - enddo - !$omp end parallel + call gpu_dgemm(blas_handle, 'N','N',nO*nV*nO,nV,nO, & + -1d0, v_ovoo%f(1,1,1,1), size(v_ovoo%f,1) * size(v_ovoo%f,2) * size(v_ovoo%f,3), & + t1%f(1,1) , size(t1%f,1), & + 1d0, K1%f(1,1,1,1) , size(K1%f,1) * size(K1%f,2) * size(K1%f,3)) - call dgemm('N','N',nO*nV*nO,nV,nO, & - -1d0, v_ovoo, size(v_ovoo,1) * size(v_ovoo,2) * size(v_ovoo,3), & - t1 , size(t1,1), & - 1d0, K1 , size(K1,1) * size(K1,2) * size(K1,3)) + type(gpu_double4) :: K1tmp + type(gpu_double3) :: t1v - double precision, allocatable :: K1tmp(:,:,:,:), t1v(:,:,:) - allocate(K1tmp(nO,nO,nV,nV), t1v(cholesky_mo_num,nO,nO)) + call gpu_allocate(t1v,cholesky_mo_num,nO,nO) - call dgemm('N','T', cholesky_mo_num*nO, nO, nV, 1.d0, & - cc_space_v_ov_chol, cholesky_mo_num*nO, t1, nO, 0.d0, & - t1v, cholesky_mo_num*nO) + call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nO, nO, nV, 1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, t1%f(1,1), nO, 0.d0, & + t1v%f(1,1,1), cholesky_mo_num*nO) - call dgemm('T','N', nO*nO, nV*nV, cholesky_mo_num, 1.d0, & - t1v, cholesky_mo_num, cc_space_v_vv_chol, cholesky_mo_num, 0.d0, & - K1tmp, nO*nO) + call gpu_allocate(K1tmp,nO,nO,nV,nV) + + call gpu_dgemm(blas_handle, 'T','N', nO*nO, nV*nV, cholesky_mo_num, 1.d0, & + t1v%f(1,1,1), cholesky_mo_num, d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num, 0.d0, & + 
K1tmp%f(1,1,1,1), nO*nO) + + call gpu_allocate(Z,nO,nV,nV,nO) + call gpu_synchronize() - deallocate(t1v) ! Y(u,beta,b,j) * X(b,j,a,i) = Z(u,beta,a,i) - call dgemm('N','N',nV*nO,nO*nV,nV*nO, & - 1d0, Y, size(Y,1) * size(Y,2), & - X, size(X,1) * size(X,2), & - 0d0, Z, size(Z,1) * size(Z,2)) + call gpu_dgemm(blas_handle, 'N','N',nV*nO,nO*nV,nV*nO, & + 1d0, Y%f(1,1,1,1), size(Y%f,1) * size(Y%f,2), & + X%f(1,1,1,1), size(X%f,1) * size(X%f,2), & + 0d0, Z%f(1,1,1,1), size(Z%f,1) * size(Z%f,2)) - !$omp parallel & - !$omp shared(nO,nV,K1,Z,K1tmp) & - !$omp private(i,beta,a,u) & - !$omp default(none) - !$omp do - do beta = 1, nV - do i = 1, nO - do a = 1, nV - do u = 1, nO - K1(u,a,i,beta) = K1(u,a,i,beta) + K1tmp(u,i,a,beta) + Z(u,beta,a,i) - enddo - enddo + call gpu_synchronize() + call gpu_deallocate(t1v) + + do beta = 1, nV + call gpu_set_stream(blas_handle, stream(beta)) + do i = 1, nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, K1%f(1,1,i,beta), & + nO, 1.d0, K1tmp%f(1,i,1,beta), nO*nO, K1%f(1,1,i,beta), nO) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, K1%f(1,1,i,beta), & + nO, 1.d0, Z%f(1,beta,1,i), nO*nV, K1%f(1,1,i,beta), nO) enddo enddo - !$omp end do - !$omp end parallel - deallocate(K1tmp,X,Y,Z) + call gpu_deallocate(X) + call gpu_deallocate(Y) + + do a = 1, nV + call gpu_stream_destroy(stream(a)) + enddo + + call gpu_deallocate(K1tmp) + call gpu_deallocate(Z) end diff --git a/src/gpu/NEED b/src/gpu/NEED new file mode 100644 index 00000000..c2af78d2 --- /dev/null +++ b/src/gpu/NEED @@ -0,0 +1 @@ +gpu_arch diff --git a/src/gpu/README.rst b/src/gpu/README.rst new file mode 100644 index 00000000..17ee28a0 --- /dev/null +++ b/src/gpu/README.rst @@ -0,0 +1,6 @@ +=== +gpu +=== + +Bindings for GPU routines (architecture independent). +Architecture-dependent files are in gpu_arch. 
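+
+A minimal usage sketch (illustrative only: ``gpu_example`` is not part of this
+patch; it assumes the ``gpu`` module below and the ``blas_handle`` provider from
+``gpu.irp.f``, so it must be compiled as IRPF90 code):
+```fortran
+subroutine gpu_example(n, x, y, res)
+  use gpu
+  implicit none
+  integer, intent(in)           :: n
+  double precision, intent(in)  :: x(n), y(n)
+  double precision, intent(out) :: res
+  type(gpu_double1)             :: d_x, d_y
+
+  call gpu_allocate(d_x, n)        ! sizes are given in elements, not bytes
+  call gpu_allocate(d_y, n)
+  call gpu_upload(x, d_x)          ! host -> device
+  call gpu_upload(y, d_y)
+  call gpu_ddot(blas_handle, n, d_x%f(1), 1, d_y%f(1), 1, res)
+  call gpu_synchronize()
+  call gpu_deallocate(d_x)
+  call gpu_deallocate(d_y)
+end subroutine
+```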
diff --git a/src/gpu/gpu.h b/src/gpu/gpu.h
new file mode 100644
index 00000000..ac70e21a
--- /dev/null
+++ b/src/gpu/gpu.h
@@ -0,0 +1,41 @@
+#include <stdint.h>
+
+int gpu_ndevices();
+void gpu_set_device(int32_t i);
+
+void gpu_allocate(void** ptr, const int64_t n);
+void gpu_deallocate(void** ptr);
+
+void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n);
+void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n);
+void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n);
+
+void gpu_stream_create(void** ptr);
+void gpu_stream_destroy(void** ptr);
+void gpu_set_stream(void* handle, void* stream);
+void gpu_synchronize();
+
+void gpu_blas_create(void** handle);
+void gpu_blas_destroy(void** handle);
+
+void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result);
+
+void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result);
+
+void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double* alpha,
+               const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy);
+
+void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float* alpha,
+               const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy);
+
+void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha,
+               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc);
+
+void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float* alpha,
+               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc);
+
+void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double* alpha,
+               const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc);
+
+void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float* alpha,
+               const float* a, const int64_t lda, const float* beta, const float* b, const int64_t ldb, float* c, const int64_t ldc);
diff --git a/src/gpu/gpu.irp.f b/src/gpu/gpu.irp.f
new file mode 100644
index 00000000..3b2feeb6
--- /dev/null
+++ b/src/gpu/gpu.irp.f
@@ -0,0 +1,26 @@
+use gpu
+
+BEGIN_PROVIDER [ type(gpu_blas), blas_handle ]
+  implicit none
+  BEGIN_DOC
+  ! Handle for cuBLAS or RocBLAS
+  END_DOC
+  call gpu_blas_create(blas_handle)
+END_PROVIDER
+
+BEGIN_PROVIDER [ type(gpu_stream), gpu_default_stream ]
+  implicit none
+  BEGIN_DOC
+  ! Default stream
+  END_DOC
+  gpu_default_stream%c = C_NULL_PTR
+END_PROVIDER
+
+BEGIN_PROVIDER [ integer, gpu_num ]
+  implicit none
+  BEGIN_DOC
+  ! Number of usable GPUs
+  END_DOC
+  gpu_num = gpu_ndevices()
+END_PROVIDER
+
diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90
new file mode 100644
index 00000000..6050075f
--- /dev/null
+++ b/src/gpu/gpu_module.F90
@@ -0,0 +1,707 @@
+module gpu
+  use, intrinsic :: iso_c_binding
+  implicit none
+
+! Data types
+!
---------- + + type gpu_double1 + type(c_ptr) :: c + double precision, pointer :: f(:) + end type + + type gpu_double2 + type(c_ptr) :: c + double precision, pointer :: f(:,:) + end type + + type gpu_double3 + type(c_ptr) :: c + double precision, pointer :: f(:,:,:) + end type + + type gpu_double4 + type(c_ptr) :: c + double precision, pointer :: f(:,:,:,:) + end type + + type gpu_double5 + type(c_ptr) :: c + double precision, pointer :: f(:,:,:,:,:) + end type + + type gpu_double6 + type(c_ptr) :: c + double precision, pointer :: f(:,:,:,:,:,:) + end type + + + type gpu_blas + type(c_ptr) :: c + end type + + type gpu_stream + type(c_ptr) :: c + end type + + +! C interfaces +! ------------ + + interface + logical(c_bool) function no_gpu() bind(C) + import + end function + + integer function gpu_ndevices() bind(C) + import + end function + + subroutine gpu_set_device(id) bind(C) + import + integer(c_int32_t), value :: id + end subroutine + + subroutine gpu_allocate_c(ptr, n) bind(C, name='gpu_allocate') + import + type(c_ptr) :: ptr + integer(c_int64_t), value :: n + end subroutine + + subroutine gpu_deallocate_c(ptr) bind(C, name='gpu_deallocate') + import + type(c_ptr) :: ptr + end subroutine + + subroutine gpu_upload_c(cpu_ptr, gpu_ptr, n) bind(C, name='gpu_upload') + import + type(c_ptr), value :: cpu_ptr + type(c_ptr), value :: gpu_ptr + integer(c_int64_t), value :: n + end subroutine + + subroutine gpu_download_c(gpu_ptr, cpu_ptr, n) bind(C, name='gpu_download') + import + type(c_ptr), value :: gpu_ptr + type(c_ptr), value :: cpu_ptr + integer(c_int64_t), value :: n + end subroutine + + subroutine gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, n) bind(C, name='gpu_copy') + import + type(c_ptr), value :: gpu_ptr_src + type(c_ptr), value :: gpu_ptr_dest + integer(c_int64_t), value :: n + end subroutine + + subroutine gpu_stream_create_c(stream) bind(C, name='gpu_stream_create') + import + type(c_ptr) :: stream + end subroutine + + subroutine gpu_stream_destroy_c(stream) bind(C, name='gpu_stream_destroy') + import + type(c_ptr) :: stream + end subroutine + + subroutine gpu_set_stream_c(handle, stream) bind(C, name='gpu_set_stream') + import + type(c_ptr), value :: handle, stream + end subroutine + + subroutine gpu_synchronize() bind(C) + import + end subroutine + + subroutine gpu_blas_create_c(handle) bind(C, name='gpu_blas_create') + import + type(c_ptr) :: handle + end subroutine + + subroutine gpu_blas_destroy_c(handle) bind(C, name='gpu_blas_destroy') + import + type(c_ptr) :: handle + end subroutine + + subroutine gpu_ddot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_ddot') + import + type(c_ptr), value, intent(in) :: handle + integer(c_int64_t), value :: n, incx, incy + type(c_ptr), value :: dx, dy + real(c_double), intent(out) :: res + end subroutine + + subroutine gpu_sdot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_sdot') + import + type(c_ptr), value, intent(in) :: handle + integer(c_int64_t), value :: n, incx, incy + type(c_ptr), intent(in), value :: dx, dy + real(c_float), intent(out) :: res + end subroutine + + subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, & + b, ldb, c, ldc) bind(C, name='gpu_dgeam') + import + type(c_ptr), value, intent(in) :: handle + character(c_char), intent(in), value :: transa, transb + integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc + real(c_double), intent(in) :: alpha, beta + type(c_ptr), value :: a, b, c + end subroutine + + subroutine gpu_sgeam_c(handle, transa, transb, m, n, alpha, a, lda, 
beta, & + b, ldb, c, ldc) bind(C, name='gpu_sgeam') + import + type(c_ptr), value, intent(in) :: handle + character(c_char), intent(in), value :: transa, transb + integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc + real(c_float), intent(in) :: alpha, beta + real(c_float) :: a, b, c + end subroutine + + subroutine gpu_dgemv_c(handle, transa, m, n, alpha, a, lda, & + x, incx, beta, y, incy) bind(C, name='gpu_dgemv') + import + type(c_ptr), value, intent(in) :: handle + character(c_char), intent(in) :: transa + integer(c_int64_t), intent(in), value :: m, n, lda, incx, incy + real(c_double), intent(in) :: alpha, beta + real(c_double) :: a, x, y + end subroutine + + subroutine gpu_sgemv_c(handle, transa, m, n, alpha, a, lda, & + x, incx, beta, y, incy) bind(C, name='gpu_sgemv') + import + type(c_ptr), value, intent(in) :: handle + character(c_char), intent(in) :: transa + integer(c_int64_t), intent(in), value :: m, n, lda, incx, incy + real(c_float), intent(in) :: alpha, beta + real(c_float) :: a, x, y + end subroutine + + + subroutine gpu_dgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) bind(C, name='gpu_dgemm') + import + type(c_ptr), value, intent(in) :: handle + character(c_char), intent(in) :: transa, transb + integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc + real(c_double), intent(in) :: alpha, beta + real(c_double) :: a, b, c + end subroutine + + subroutine gpu_sgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) bind(C, name='gpu_sgemm') + import + type(c_ptr), value, intent(in) :: handle + character(c_char), intent(in), value :: transa, transb + integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc + real(c_float), intent(in) :: alpha, beta + real(c_float) :: a, b, c + end subroutine + + end interface + + +! Polymorphic interfaces +! ---------------------- + + interface gpu_allocate + procedure gpu_allocate_double1 & + ,gpu_allocate_double2 & + ,gpu_allocate_double3 & + ,gpu_allocate_double4 & + ,gpu_allocate_double5 & + ,gpu_allocate_double6 & + ,gpu_allocate_double1_64 & + ,gpu_allocate_double2_64 & + ,gpu_allocate_double3_64 & + ,gpu_allocate_double4_64 & + ,gpu_allocate_double5_64 & + ,gpu_allocate_double6_64 + end interface gpu_allocate + + interface gpu_deallocate + procedure gpu_deallocate_double1 & + ,gpu_deallocate_double2 & + ,gpu_deallocate_double3 & + ,gpu_deallocate_double4 & + ,gpu_deallocate_double5 & + ,gpu_deallocate_double6 + end interface gpu_deallocate + + interface gpu_upload + procedure gpu_upload_double1 & + ,gpu_upload_double2 & + ,gpu_upload_double3 & + ,gpu_upload_double4 & + ,gpu_upload_double5 & + ,gpu_upload_double6 + end interface gpu_upload + + interface gpu_download + procedure gpu_download_double1 & + ,gpu_download_double2 & + ,gpu_download_double3 & + ,gpu_download_double4 & + ,gpu_download_double5 & + ,gpu_download_double6 + end interface gpu_download + + interface gpu_copy + procedure gpu_copy_double1 & + ,gpu_copy_double2 & + ,gpu_copy_double3 & + ,gpu_copy_double4 & + ,gpu_copy_double5 & + ,gpu_copy_double6 + end interface gpu_copy + + + contains + + +! gpu_allocate +! 
------------
+
+  subroutine gpu_allocate_double1(ptr, s)
+    implicit none
+    type(gpu_double1), intent(inout) :: ptr
+    integer, intent(in) :: s
+
+    call gpu_allocate_c(ptr%c, s*8_8)
+    call c_f_pointer(ptr%c, ptr%f, (/ s /))
+  end subroutine
+
+  subroutine gpu_allocate_double2(ptr, s1, s2)
+    implicit none
+    type(gpu_double2), intent(inout) :: ptr
+    integer, intent(in) :: s1, s2
+
+    call gpu_allocate_c(ptr%c, s1*s2*8_8)
+    call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /))
+  end subroutine
+
+  subroutine gpu_allocate_double3(ptr, s1, s2, s3)
+    implicit none
+    type(gpu_double3), intent(inout) :: ptr
+    integer, intent(in) :: s1, s2, s3
+
+    call gpu_allocate_c(ptr%c, s1*s2*s3*8_8)
+    call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /))
+  end subroutine
+
+  subroutine gpu_allocate_double4(ptr, s1, s2, s3, s4)
+    implicit none
+    type(gpu_double4), intent(inout) :: ptr
+    integer, intent(in) :: s1, s2, s3, s4
+
+    call gpu_allocate_c(ptr%c, s1*s2*s3*s4*8_8)
+    call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /))
+  end subroutine
+
+  subroutine gpu_allocate_double5(ptr, s1, s2, s3, s4, s5)
+    implicit none
+    type(gpu_double5), intent(inout) :: ptr
+    integer, intent(in) :: s1, s2, s3, s4, s5
+
+    call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*8_8)
+    call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /))
+  end subroutine
+
+  subroutine gpu_allocate_double6(ptr, s1, s2, s3, s4, s5, s6)
+    implicit none
+    type(gpu_double6), intent(inout) :: ptr
+    integer, intent(in) :: s1, s2, s3, s4, s5, s6
+
+    call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*s6*8_8)
+    call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /))
+  end subroutine
+
+
+  subroutine gpu_allocate_double1_64(ptr, s)
+    implicit none
+    type(gpu_double1), intent(inout) :: ptr
+    integer*8, intent(in) :: s
+
+    call gpu_allocate_c(ptr%c, s*8_8)
+    call c_f_pointer(ptr%c, ptr%f, (/ s /))
+  end subroutine
+
+  subroutine gpu_allocate_double2_64(ptr, s1, s2)
+    implicit none
+    type(gpu_double2), intent(inout) :: ptr
+    integer*8, intent(in) :: s1, s2
+
+    call gpu_allocate_c(ptr%c, s1*s2*8_8)
+    call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /))
+  end subroutine
+
+  subroutine gpu_allocate_double3_64(ptr, s1, s2, s3)
+    implicit none
+    type(gpu_double3), intent(inout) :: ptr
+    integer*8, intent(in) :: s1, s2, s3
+
+    call gpu_allocate_c(ptr%c, s1*s2*s3*8_8)
+    call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /))
+  end subroutine
+
+  subroutine gpu_allocate_double4_64(ptr, s1, s2, s3, s4)
+    implicit none
+    type(gpu_double4), intent(inout) :: ptr
+    integer*8, intent(in) :: s1, s2, s3, s4
+
+    call gpu_allocate_c(ptr%c, s1*s2*s3*s4*8_8)
+    call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /))
+  end subroutine
+
+  subroutine gpu_allocate_double5_64(ptr, s1, s2, s3, s4, s5)
+    implicit none
+    type(gpu_double5), intent(inout) :: ptr
+    integer*8, intent(in) :: s1, s2, s3, s4, s5
+
+    call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*8_8)
+    call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /))
+  end subroutine
+
+  subroutine gpu_allocate_double6_64(ptr, s1, s2, s3, s4, s5, s6)
+    implicit none
+    type(gpu_double6), intent(inout) :: ptr
+    integer*8, intent(in) :: s1, s2, s3, s4, s5, s6
+
+    call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*s6*8_8)
+    call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /))
+  end subroutine
+
+
+! gpu_deallocate
+!
-------------- + + subroutine gpu_deallocate_double1(ptr) + implicit none + type(gpu_double1), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double2(ptr) + implicit none + type(gpu_double2), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double3(ptr) + implicit none + type(gpu_double3), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double4(ptr) + implicit none + type(gpu_double4), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double5(ptr) + implicit none + type(gpu_double5), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double6(ptr) + implicit none + type(gpu_double6), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + +! gpu_upload +! ---------- + + subroutine gpu_upload_double1(cpu_ptr, gpu_ptr) + implicit none + double precision, target, intent(in) :: cpu_ptr(*) + type(gpu_double1), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, 8_8*size(gpu_ptr%f)) + end subroutine + + subroutine gpu_upload_double2(cpu_ptr, gpu_ptr) + implicit none + double precision, target, intent(in) :: cpu_ptr(:,:) + type(gpu_double2), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + subroutine gpu_upload_double3(cpu_ptr, gpu_ptr) + implicit none + double precision, target, intent(in) :: cpu_ptr(:,:,:) + type(gpu_double3), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + subroutine gpu_upload_double4(cpu_ptr, gpu_ptr) + implicit none + double precision, target, intent(in) :: cpu_ptr(:,:,:,:) + type(gpu_double4), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + subroutine gpu_upload_double5(cpu_ptr, gpu_ptr) + implicit none + double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:) + type(gpu_double5), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + subroutine gpu_upload_double6(cpu_ptr, gpu_ptr) + implicit none + double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:,:) + type(gpu_double6), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + +! gpu_download +! 
------------ + + subroutine gpu_download_double1(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double1), intent(in) :: gpu_ptr + double precision, target, intent(in) :: cpu_ptr(:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*size(gpu_ptr%f)) + end subroutine + + subroutine gpu_download_double2(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double2), intent(in) :: gpu_ptr + double precision, target, intent(in) :: cpu_ptr(:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + + subroutine gpu_download_double3(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double3), intent(in) :: gpu_ptr + double precision, target, intent(in) :: cpu_ptr(:,:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + + subroutine gpu_download_double4(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double4), intent(in) :: gpu_ptr + double precision, target, intent(in) :: cpu_ptr(:,:,:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + + subroutine gpu_download_double5(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double5), intent(in) :: gpu_ptr + double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + + subroutine gpu_download_double6(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double6), intent(in) :: gpu_ptr + double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + +! gpu_copy +! -------- + + subroutine gpu_copy_double1(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double1), intent(in) :: gpu_ptr_src + type(gpu_double1), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*size(gpu_ptr_dest%f)) + end subroutine + + subroutine gpu_copy_double2(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double2), intent(in) :: gpu_ptr_src + type(gpu_double2), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + subroutine gpu_copy_double3(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double3), intent(in) :: gpu_ptr_src + type(gpu_double3), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + subroutine gpu_copy_double4(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double4), intent(in) :: gpu_ptr_src + type(gpu_double4), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + subroutine gpu_copy_double5(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double5), intent(in) :: gpu_ptr_src + type(gpu_double5), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + subroutine gpu_copy_double6(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double6), intent(in) :: gpu_ptr_src + type(gpu_double6), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + +! gpu_stream +! 
---------- + + subroutine gpu_stream_create(stream) + type(gpu_stream) :: stream + call gpu_stream_create_c(stream%c) + end subroutine + + subroutine gpu_stream_destroy(stream) + type(gpu_stream) :: stream + call gpu_stream_destroy_c(stream%c) + end subroutine + + subroutine gpu_set_stream(handle, stream) + type(gpu_blas) :: handle + type(gpu_stream) :: stream + call gpu_set_stream_c(handle%c, stream%c) + end subroutine + + +! gpu_blas +! -------- + + subroutine gpu_blas_create(handle) + type(gpu_blas) :: handle + call gpu_blas_create_c(handle%c) + end subroutine + + subroutine gpu_blas_destroy(handle) + type(gpu_blas) :: handle + call gpu_blas_destroy_c(handle%c) + end subroutine + + + + + +! dot +! --- + +subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) +! use gpu + type(gpu_blas), intent(in) :: handle + integer*4 :: n, incx, incy + double precision, target :: dx, dy + double precision, intent(out) :: res + call gpu_ddot_c(handle%c, int(n,c_int64_t), c_loc(dx), int(incx,c_int64_t), c_loc(dy), int(incy,c_int64_t), res) +end subroutine + + +subroutine gpu_ddot_64(handle, n, dx, incx, dy, incy, res) +! use gpu + type(gpu_blas), intent(in) :: handle + integer*8 :: n, incx, incy + double precision, target :: dx, dy + double precision, intent(out) :: res + call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res) +end subroutine + + +! geam +! ---- + +subroutine gpu_dgeam(handle, transa, transb, m, n, alpha, a, lda, beta, & + b, ldb, c, ldc) +! use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*4, intent(in) :: m, n, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision, target :: a, b, c + call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, & + c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t)) +end subroutine + + +subroutine gpu_dgeam_64(handle, transa, transb, m, n, alpha, a, lda, beta, & + b, ldb, c, ldc) +! use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*8, intent(in) :: m, n, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision, target :: a, b, c + call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, & + c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t)) +end subroutine + + +! gemv +! ---- + +subroutine gpu_dgemv(handle, transa, m, n, alpha, a, lda, & + x, incx, beta, y, incy) +! use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa + integer*4, intent(in) :: m, n, lda, incx, incy + double precision, intent(in) :: alpha, beta + double precision :: a, x, y + call gpu_dgemv_c(handle%c, transa, int(m,c_int64_t), int(n,c_int64_t), & + alpha, a, int(lda,c_int64_t), & + x, int(incx,c_int64_t), beta, y, int(incy,c_int64_t)) +end subroutine + +subroutine gpu_dgemv_64(handle, transa, m, n, alpha, a, lda, & + x, incx, beta, y, incy) +! use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa + integer*8, intent(in) :: m, n, lda, incx, incy + double precision, intent(in) :: alpha, beta + double precision :: a, x, y + call gpu_dgemv_c(handle%c, transa, int(m,c_int64_t), int(n,c_int64_t), & + alpha, a, int(lda,c_int64_t), & + x, int(incx,c_int64_t), beta, y, int(incy,c_int64_t)) +end subroutine + + +! gemm +! ---- + +subroutine gpu_dgemm(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) +! 
use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*4, intent(in) :: m, n, k, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision :: a, b, c + call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), & + alpha, a, int(lda,c_int64_t), & + b, int(ldb,c_int64_t), beta, c, int(ldc,c_int64_t)) +end subroutine + +subroutine gpu_dgemm_64(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) +! use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*8, intent(in) :: m, n, k, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision :: a, b, c + call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), & + alpha, a, int(lda,c_int64_t), b, int(ldb,c_int64_t), beta, c, int(ldc,c_int64_t)) +end subroutine + +end module diff --git a/src/hartree_fock/fock_matrix_hf.irp.f b/src/hartree_fock/fock_matrix_hf.irp.f index 65b3d63c..6d917322 100644 --- a/src/hartree_fock/fock_matrix_hf.irp.f +++ b/src/hartree_fock/fock_matrix_hf.irp.f @@ -194,17 +194,28 @@ END_PROVIDER endif - double precision :: rss + double precision :: rss, mem0, mem double precision :: memory_of_double integer :: iblock - integer, parameter :: block_size = 32 + integer :: block_size + + call resident_memory(mem0) + + block_size = 1024 + + rss = memory_of_double(2.d0*ao_num*ao_num) + do + mem = mem0 + block_size*rss + if ( (block_size < 2).or.(mem < qp_max_mem) ) exit + block_size = block_size/2 + enddo + + call check_mem(block_size*rss, irp_here) - rss = memory_of_double(ao_num*ao_num) - call check_mem(2.d0*block_size*rss, irp_here) allocate(X2(ao_num,ao_num,block_size,2)) allocate(X3(ao_num,block_size,ao_num,2)) - + ! ao_two_e_integral_alpha_chol (l,s) -= cholesky_ao(l,m,j) * SCF_density_matrix_ao_beta (m,n) * cholesky_ao(n,s,j) do iblock=1,cholesky_ao_num,block_size diff --git a/src/mo_two_e_ints/map_integrals.irp.f b/src/mo_two_e_ints/map_integrals.irp.f index 168c34b4..eeb4279f 100644 --- a/src/mo_two_e_ints/map_integrals.irp.f +++ b/src/mo_two_e_ints/map_integrals.irp.f @@ -40,7 +40,7 @@ end ! Min and max values of the MOs for which the integrals are in the cache END_DOC - mo_integrals_cache_size = 2_8**mo_integrals_cache_shift + mo_integrals_cache_size = 2**mo_integrals_cache_shift mo_integrals_cache_min = max(1,elec_alpha_num - (mo_integrals_cache_size/2 - 1) ) mo_integrals_cache_max = min(mo_num, mo_integrals_cache_min + mo_integrals_cache_size - 1) diff --git a/src/mol_properties/multi_s_dipole_moment.irp.f b/src/mol_properties/multi_s_dipole_moment.irp.f index c7216a61..8aae3bf4 100644 --- a/src/mol_properties/multi_s_dipole_moment.irp.f +++ b/src/mol_properties/multi_s_dipole_moment.irp.f @@ -18,7 +18,7 @@ -BEGIN_PROVIDER [double precision, multi_s_dipole_moment, (N_states, N_states)] + BEGIN_PROVIDER [double precision, multi_s_dipole_moment , (N_states, N_states)] &BEGIN_PROVIDER [double precision, multi_s_x_dipole_moment, (N_states, N_states)] &BEGIN_PROVIDER [double precision, multi_s_y_dipole_moment, (N_states, N_states)] &BEGIN_PROVIDER [double precision, multi_s_z_dipole_moment, (N_states, N_states)] @@ -40,27 +40,153 @@ BEGIN_PROVIDER [double precision, multi_s_dipole_moment, (N_states, N_states)] ! gamma^{nm}: density matrix \bra{\Psi^n} a^{\dagger}_a a_i \ket{\Psi^m} END_DOC - integer :: istate,jstate ! States - integer :: i,j ! general spatial MOs + integer :: istate, jstate ! 
States
+  integer                        :: i, j    ! general spatial MOs
   double precision               :: nuclei_part_x, nuclei_part_y, nuclei_part_z
   multi_s_x_dipole_moment = 0.d0
   multi_s_y_dipole_moment = 0.d0
   multi_s_z_dipole_moment = 0.d0
+
+  if(8.d0*mo_num*mo_num*n_states*n_states*1d-9 .lt. 200.d0) then
-  do jstate = 1, N_states
-    do istate = 1, N_states
-
-      do i = 1, mo_num
-        do j = 1, mo_num
-          multi_s_x_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_x(j,i)
-          multi_s_y_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_y(j,i)
-          multi_s_z_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_z(j,i)
-        enddo
+    do jstate = 1, N_states
+      do istate = 1, N_states
+        do i = 1, mo_num
+          do j = 1, mo_num
+            multi_s_x_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_x(j,i)
+            multi_s_y_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_y(j,i)
+            multi_s_z_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_z(j,i)
+          enddo
+        enddo
       enddo
-    enddo
-  enddo
+
+  else
+
+    ! Not enough memory:
+    ! compute on the fly
+
+    PROVIDE psi_det_alpha_unique psi_det_beta_unique
+
+    integer           :: l, k_a, k_b
+    integer           :: occ(N_int*bit_kind_size,2)
+    integer           :: h1, h2, p1, p2, degree
+    integer           :: exc(0:2,2), n_occ(2)
+    integer           :: krow, kcol, lrow, lcol
+    integer(bit_kind) :: tmp_det(N_int,2), tmp_det2(N_int)
+    double precision  :: ck, ckl, phase
+
+    !$OMP PARALLEL DEFAULT(NONE) &
+    !$OMP PRIVATE(j, l, k_a, k_b, istate, jstate, occ, ck, ckl, h1, h2, p1, p2, exc, &
+    !$OMP         phase, degree, n_occ, krow, kcol, lrow, lcol, tmp_det, tmp_det2) &
+    !$OMP SHARED(N_int, N_states, elec_alpha_num, elec_beta_num, N_det, &
+    !$OMP        psi_bilinear_matrix_rows, psi_bilinear_matrix_columns, &
+    !$OMP        psi_bilinear_matrix_transp_rows, psi_bilinear_matrix_transp_columns, &
+    !$OMP        psi_det_alpha_unique, psi_det_beta_unique, &
+    !$OMP        psi_bilinear_matrix_values, psi_bilinear_matrix_transp_values, &
+    !$OMP        mo_dipole_x, mo_dipole_y, mo_dipole_z, &
+    !$OMP        multi_s_x_dipole_moment, multi_s_y_dipole_moment, multi_s_z_dipole_moment)
+    !$OMP DO COLLAPSE(2)
+    do istate = 1, N_states
+      do jstate = 1, N_states
+
+        do k_a = 1, N_det
+          krow = psi_bilinear_matrix_rows   (k_a)
+          kcol = psi_bilinear_matrix_columns(k_a)
+
+          tmp_det(1:N_int,1) = psi_det_alpha_unique(1:N_int,krow)
+          tmp_det(1:N_int,2) = psi_det_beta_unique (1:N_int,kcol)
+
+          ! Diagonal part
+          call bitstring_to_list_ab(tmp_det, occ, n_occ, N_int)
+          ck = psi_bilinear_matrix_values(k_a,istate)*psi_bilinear_matrix_values(k_a,jstate)
+          do l = 1, elec_alpha_num
+            j = occ(l,1)
+            multi_s_x_dipole_moment(istate,jstate) -= ck * mo_dipole_x(j,j)
+            multi_s_y_dipole_moment(istate,jstate) -= ck * mo_dipole_y(j,j)
+            multi_s_z_dipole_moment(istate,jstate) -= ck * mo_dipole_z(j,j)
+          enddo
+
+          if (k_a == N_det) cycle
+          l = k_a + 1
+          lrow = psi_bilinear_matrix_rows   (l)
+          lcol = psi_bilinear_matrix_columns(l)
+          ! Fix beta determinant, loop over alphas
+          do while (lcol == kcol)
+            tmp_det2(:) = psi_det_alpha_unique(:,lrow)
+            call get_excitation_degree_spin(tmp_det(1,1), tmp_det2, degree, N_int)
+            if (degree == 1) then
+              exc = 0
+              call get_single_excitation_spin(tmp_det(1,1), tmp_det2, exc, phase, N_int)
+              call decode_exc_spin(exc, h1, p1, h2, p2)
+              ckl = psi_bilinear_matrix_values(k_a,istate)*psi_bilinear_matrix_values(l,jstate) * phase
+              multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(h1,p1)
+              multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(h1,p1)
+              multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(h1,p1)
+              ckl = psi_bilinear_matrix_values(k_a,jstate)*psi_bilinear_matrix_values(l,istate) * phase
+              multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(p1,h1)
+              multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(p1,h1)
+              multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(p1,h1)
+            endif
+            l = l+1
+            if (l > N_det) exit
+            lrow = psi_bilinear_matrix_rows   (l)
+            lcol = psi_bilinear_matrix_columns(l)
+          enddo
+        enddo ! k_a
+
+        do k_b = 1, N_det
+          krow = psi_bilinear_matrix_transp_rows   (k_b)
+          kcol = psi_bilinear_matrix_transp_columns(k_b)
+
+          tmp_det(1:N_int,1) = psi_det_alpha_unique(1:N_int,krow)
+          tmp_det(1:N_int,2) = psi_det_beta_unique (1:N_int,kcol)
+
+          ! Diagonal part
+          call bitstring_to_list_ab(tmp_det, occ, n_occ, N_int)
+          ck = psi_bilinear_matrix_transp_values(k_b,istate)*psi_bilinear_matrix_transp_values(k_b,jstate)
+          do l = 1, elec_beta_num
+            j = occ(l,2)
+            multi_s_x_dipole_moment(istate,jstate) -= ck * mo_dipole_x(j,j)
+            multi_s_y_dipole_moment(istate,jstate) -= ck * mo_dipole_y(j,j)
+            multi_s_z_dipole_moment(istate,jstate) -= ck * mo_dipole_z(j,j)
+          enddo
+
+          if (k_b == N_det) cycle
+          l = k_b+1
+          lrow = psi_bilinear_matrix_transp_rows   (l)
+          lcol = psi_bilinear_matrix_transp_columns(l)
+          ! Fix alpha determinant, loop over betas
+          do while (lrow == krow)
+            tmp_det2(:) = psi_det_beta_unique(:,lcol)
+            call get_excitation_degree_spin(tmp_det(1,2), tmp_det2, degree, N_int)
+            if (degree == 1) then
+              exc = 0
+              call get_single_excitation_spin(tmp_det(1,2), tmp_det2, exc, phase, N_int)
+              call decode_exc_spin(exc, h1, p1, h2, p2)
+              ckl = psi_bilinear_matrix_transp_values(k_b,istate)*psi_bilinear_matrix_transp_values(l,jstate) * phase
+              multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(h1,p1)
+              multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(h1,p1)
+              multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(h1,p1)
+              ckl = psi_bilinear_matrix_transp_values(k_b,jstate)*psi_bilinear_matrix_transp_values(l,istate) * phase
+              multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(p1,h1)
+              multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(p1,h1)
+              multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(p1,h1)
+            endif
+            l = l+1
+            if (l > N_det) exit
+            lrow = psi_bilinear_matrix_transp_rows   (l)
+            lcol = psi_bilinear_matrix_transp_columns(l)
+          enddo
+        enddo ! k_b
+
+      enddo ! jstate
+    enddo ! istate
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+  endif ! memory condition
 
   ! Nuclei part
   nuclei_part_x = 0.d0
diff --git a/src/tools/four_idx_transform.irp.f b/src/tools/four_idx_transform.irp.f
index 92e87cad..fc6bface 100644
--- a/src/tools/four_idx_transform.irp.f
+++ b/src/tools/four_idx_transform.irp.f
@@ -12,6 +12,9 @@ program four_idx_transform
 !
 END_DOC
+  if (do_mo_cholesky) then
+    stop 'Not implemented with Cholesky integrals'
+  endif
 io_mo_two_e_integrals = 'Write'
 SOFT_TOUCH io_mo_two_e_integrals
 if (.true.)
diff --git a/src/tools/four_idx_transform.irp.f b/src/tools/four_idx_transform.irp.f
index 92e87cad..fc6bface 100644
--- a/src/tools/four_idx_transform.irp.f
+++ b/src/tools/four_idx_transform.irp.f
@@ -12,6 +12,9 @@ program four_idx_transform
 !
 END_DOC

+  if (do_mo_cholesky) then
+    stop 'Not implemented with Cholesky integrals'
+  endif
   io_mo_two_e_integrals = 'Write'
   SOFT_TOUCH io_mo_two_e_integrals
   if (.true.) then
diff --git a/src/trexio/export_trexio_routines.irp.f b/src/trexio/export_trexio_routines.irp.f
index 63630243..0eec68bd 100644
--- a/src/trexio/export_trexio_routines.irp.f
+++ b/src/trexio/export_trexio_routines.irp.f
@@ -557,7 +557,7 @@ subroutine export_trexio(update,full_path)
       do k=1,cholesky_ao_num
         do j=1,mo_num
           do i=1,mo_num
-            integral = cholesky_mo(i,j,k)
+            integral = cholesky_mo_transp(k,i,j)
             if (integral == 0.d0) cycle
             icount += 1_8
             chol_buffer(icount) = integral
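Note: the export fix swaps `cholesky_mo(i,j,k)` for `cholesky_mo_transp(k,i,j)`; the two arrays hold the same elements, with the Cholesky index moved first. With the rank index first, reassembling an integral from the vectors is a stride-1 dot product in column-major Fortran, since a Cholesky decomposition of the ERIs gives (ij|kl) = sum over a of L_a(i,j)*L_a(k,l). A hypothetical helper (not QP code) illustrating why that layout is convenient:

```fortran
! Sketch: with V(rank,mo_num,mo_num) laid out like cholesky_mo_transp,
! (ij|kl) = sum_a V(a,i,j)*V(a,k,l) runs over contiguous memory.
double precision function eri_from_cholesky(V, rank, mo_num, i, j, k, l)
  implicit none
  integer, intent(in)          :: rank, mo_num, i, j, k, l
  double precision, intent(in) :: V(rank, mo_num, mo_num)
  double precision, external   :: ddot   ! BLAS
  eri_from_cholesky = ddot(rank, V(1,i,j), 1, V(1,k,l), 1)
end function eri_from_cholesky
```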
diff --git a/src/trexio/import_trexio_integrals.irp.f b/src/trexio/import_trexio_integrals.irp.f
index 5a6b3c03..556ed7bc 100644
--- a/src/trexio/import_trexio_integrals.irp.f
+++ b/src/trexio/import_trexio_integrals.irp.f
@@ -28,7 +28,7 @@ subroutine run(f)
   integer(trexio_t), intent(in) :: f ! TREXIO file handle
   integer(trexio_exit_code)     :: rc

-  integer ::i,j,k,l
+  integer :: i,j,k,l, iunit
   integer(8) :: m, n_integrals
   double precision :: integral
@@ -41,10 +41,12 @@ subroutine run(f)
   integer , allocatable :: Vi(:,:)
   double precision :: s

-! TODO:
-! - If Cholesky AO in trexio file, read cholesky ao vectors
-! - If Cholesky MO in trexio file, read cholesky mo vectors
-! - If Cholesky MO not in trexio file, force do_cholesky_mo to False
+  integer*4 :: BUFSIZE
+  integer :: rank
+  double precision, allocatable :: tmp(:,:,:)
+  integer*8 :: offset, icount
+
+  integer, external :: getUnitAndOpen

   if (trexio_has_nucleus_repulsion(f) == TREXIO_SUCCESS) then
     rc = trexio_read_nucleus_repulsion(f, s)
@@ -120,45 +122,88 @@
   rc = trexio_has_ao_2e_int(f)
   PROVIDE ao_num
   if (rc /= TREXIO_HAS_NOT) then
-    PROVIDE ao_integrals_map
-    integer*4 :: BUFSIZE
-    BUFSIZE=ao_num**2
-    allocate(buffer_i(BUFSIZE), buffer_values(BUFSIZE))
-    allocate(Vi(4,BUFSIZE), V(BUFSIZE))
+    rc = trexio_has_ao_2e_int_eri_cholesky(f)
+    if (rc /= TREXIO_HAS_NOT) then
-    integer*8 :: offset, icount
+      rc = trexio_read_ao_2e_int_eri_cholesky_num(f, rank)
+      call trexio_assert(rc, TREXIO_SUCCESS)
-    offset = 0_8
-    icount = BUFSIZE
-    rc = TREXIO_SUCCESS
-    do while (icount == size(V))
-      rc = trexio_read_ao_2e_int_eri(f, offset, icount, Vi, V)
-      do m=1,icount
-        i = Vi(1,m)
-        j = Vi(2,m)
-        k = Vi(3,m)
-        l = Vi(4,m)
-        integral = V(m)
-        call two_e_integrals_index(i, j, k, l, buffer_i(m) )
-        buffer_values(m) = integral
-      enddo
-      call insert_into_ao_integrals_map(int(icount,4),buffer_i,buffer_values)
-      offset = offset + icount
-      if (rc /= TREXIO_SUCCESS) then
-        exit
-      endif
-    end do
-    n_integrals = offset
+      allocate(tmp(ao_num,ao_num,rank))
+      tmp(:,:,:) = 0.d0
-    call map_sort(ao_integrals_map)
-    call map_unique(ao_integrals_map)
+      BUFSIZE=ao_num**2
+      allocate(Vi(3,BUFSIZE), V(BUFSIZE))
-    call map_save_to_disk(trim(ezfio_filename)//'/work/ao_ints',ao_integrals_map)
-    call ezfio_set_ao_two_e_ints_io_ao_two_e_integrals('Read')
-    deallocate(buffer_i, buffer_values, Vi, V)
-    print *, 'AO integrals read from TREXIO file'
+      offset = 0_8
+      icount = BUFSIZE
+      rc = TREXIO_SUCCESS
+      do while (icount == size(V))
+        rc = trexio_read_ao_2e_int_eri_cholesky(f, offset, icount, Vi, V)
+        do m=1,icount
+          i = Vi(1,m)
+          j = Vi(2,m)
+          k = Vi(3,m)
+          integral = V(m)
+          tmp(i,j,k) = integral
+        enddo
+        offset = offset + icount
+        if (rc /= TREXIO_SUCCESS) then
+          exit
+        endif
+      end do
+
+      print *, 'Writing Cholesky AO vectors to disk...'
+      iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_ao', 'W')
+      write(iunit) rank
+      write(iunit) tmp(:,:,:)
+      close(iunit)
+      call ezfio_set_ao_two_e_ints_io_ao_cholesky('Read')
+
+      deallocate(Vi, V, tmp)
+      print *, 'Cholesky AO integrals read from TREXIO file'
+    endif
+
+    rc = trexio_has_ao_2e_int_eri(f)
+    if (rc /= TREXIO_HAS_NOT) then
+      PROVIDE ao_integrals_map
+
+      BUFSIZE=ao_num**2
+      allocate(buffer_i(BUFSIZE), buffer_values(BUFSIZE))
+      allocate(Vi(4,BUFSIZE), V(BUFSIZE))
+
+      offset = 0_8
+      icount = BUFSIZE
+      rc = TREXIO_SUCCESS
+      do while (icount == size(V))
+        rc = trexio_read_ao_2e_int_eri(f, offset, icount, Vi, V)
+        do m=1,icount
+          i = Vi(1,m)
+          j = Vi(2,m)
+          k = Vi(3,m)
+          l = Vi(4,m)
+          integral = V(m)
+          call two_e_integrals_index(i, j, k, l, buffer_i(m) )
+          buffer_values(m) = integral
+        enddo
+        call insert_into_ao_integrals_map(int(icount,4),buffer_i,buffer_values)
+        offset = offset + icount
+        if (rc /= TREXIO_SUCCESS) then
+          exit
+        endif
+      end do
+      n_integrals = offset
+
+      call map_sort(ao_integrals_map)
+      call map_unique(ao_integrals_map)
+
+      call map_save_to_disk(trim(ezfio_filename)//'/work/ao_ints',ao_integrals_map)
+      call ezfio_set_ao_two_e_ints_io_ao_two_e_integrals('Read')
+
+      deallocate(buffer_i, buffer_values, Vi, V)
+      print *, 'AO integrals read from TREXIO file'
+    endif
   else
     print *, 'AO integrals not found in TREXIO file'
   endif
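Note: the Cholesky and plain-ERI branches above (and the MO branches below) all use the same buffered read idiom: request `BUFSIZE` elements, consume whatever arrives, and stop once a short or failing read signals the end of the data stream. Abstracted, with `read_chunk` and `consume` as hypothetical stand-ins for the `trexio_read_*` call and the per-buffer processing:

```fortran
! Generic form of the loops in this file. icount is in/out: the
! requested buffer size on entry, the number of elements actually
! delivered on return, so a short read terminates the loop.
offset = 0_8
icount = BUFSIZE
rc     = TREXIO_SUCCESS
do while (icount == size(V))
  rc = read_chunk(f, offset, icount, Vi, V)   ! hypothetical stand-in
  call consume(Vi, V, icount)                 ! hypothetical stand-in
  offset = offset + icount
  if (rc /= TREXIO_SUCCESS) exit
end do
```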
@@ -186,40 +231,85 @@
   rc = trexio_has_mo_2e_int(f)
   if (rc /= TREXIO_HAS_NOT) then
-    BUFSIZE=mo_num**2
-    allocate(buffer_i(BUFSIZE), buffer_values(BUFSIZE))
-    allocate(Vi(4,BUFSIZE), V(BUFSIZE))
+    rc = trexio_has_mo_2e_int_eri_cholesky(f)
+    if (rc /= TREXIO_HAS_NOT) then
+
+      rc = trexio_read_mo_2e_int_eri_cholesky_num(f, rank)
+      call trexio_assert(rc, TREXIO_SUCCESS)
+
+      allocate(tmp(rank,mo_num,mo_num))
+      tmp(:,:,:) = 0.d0
+
+      BUFSIZE=mo_num**2
+      allocate(Vi(3,BUFSIZE), V(BUFSIZE))
+
+      offset = 0_8
+      icount = BUFSIZE
+      rc = TREXIO_SUCCESS
+      do while (icount == size(V))
+        rc = trexio_read_mo_2e_int_eri_cholesky(f, offset, icount, Vi, V)
+        do m=1,icount
+          i = Vi(1,m)
+          j = Vi(2,m)
+          k = Vi(3,m)
+          integral = V(m)
+          tmp(k,i,j) = integral
+        enddo
+        offset = offset + icount
+        if (rc /= TREXIO_SUCCESS) then
+          exit
+        endif
+      end do
+
+      print *, 'Writing Cholesky MO vectors to disk...'
+      iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_mo_transp', 'W')
+      write(iunit) rank
+      write(iunit) tmp(:,:,:)
+      close(iunit)
+      call ezfio_set_mo_two_e_ints_io_mo_cholesky('Read')
+
+      deallocate(Vi, V, tmp)
+      print *, 'Cholesky MO integrals read from TREXIO file'
+    endif
+
+    rc = trexio_has_mo_2e_int_eri(f)
+    if (rc /= TREXIO_HAS_NOT) then
+      BUFSIZE=mo_num**2
+      allocate(buffer_i(BUFSIZE), buffer_values(BUFSIZE))
+      allocate(Vi(4,BUFSIZE), V(BUFSIZE))
-    offset = 0_8
-    icount = BUFSIZE
-    rc = TREXIO_SUCCESS
-    do while (icount == size(V))
-      rc = trexio_read_mo_2e_int_eri(f, offset, icount, Vi, V)
-      do m=1,icount
-        i = Vi(1,m)
-        j = Vi(2,m)
-        k = Vi(3,m)
-        l = Vi(4,m)
-        integral = V(m)
-        call two_e_integrals_index(i, j, k, l, buffer_i(m) )
-        buffer_values(m) = integral
-      enddo
-      call map_append(mo_integrals_map, buffer_i, buffer_values, int(icount,4))
-      offset = offset + icount
-      if (rc /= TREXIO_SUCCESS) then
-        exit
-      endif
-    end do
-    n_integrals = offset
+      offset = 0_8
+      icount = BUFSIZE
+      rc = TREXIO_SUCCESS
+      do while (icount == size(V))
+        rc = trexio_read_mo_2e_int_eri(f, offset, icount, Vi, V)
+        do m=1,icount
+          i = Vi(1,m)
+          j = Vi(2,m)
+          k = Vi(3,m)
+          l = Vi(4,m)
+          integral = V(m)
+          call two_e_integrals_index(i, j, k, l, buffer_i(m) )
+          buffer_values(m) = integral
+        enddo
+        call map_append(mo_integrals_map, buffer_i, buffer_values, int(icount,4))
+        offset = offset + icount
+        if (rc /= TREXIO_SUCCESS) then
+          exit
+        endif
+      end do
+      n_integrals = offset
-    call map_sort(mo_integrals_map)
-    call map_unique(mo_integrals_map)
+      call map_sort(mo_integrals_map)
+      call map_unique(mo_integrals_map)
+
+      call map_save_to_disk(trim(ezfio_filename)//'/work/mo_ints',mo_integrals_map)
+      call ezfio_set_mo_two_e_ints_io_mo_two_e_integrals('Read')
+      deallocate(buffer_i, buffer_values, Vi, V)
+      print *, 'MO integrals read from TREXIO file'
+    endif
-    call map_save_to_disk(trim(ezfio_filename)//'/work/mo_ints',mo_integrals_map)
-    call ezfio_set_mo_two_e_ints_io_mo_two_e_integrals('Read')
-    deallocate(buffer_i, buffer_values, Vi, V)
-    print *, 'MO integrals read from TREXIO file'
   else
     print *, 'MO integrals not found in TREXIO file'
   endif
diff --git a/src/utils/fortran_mmap.c b/src/utils/fortran_mmap.c
index 711a9c34..0306f64f 100644
--- a/src/utils/fortran_mmap.c
+++ b/src/utils/fortran_mmap.c
@@ -40,7 +40,7 @@ void* mmap_fortran(char* filename, size_t bytes, int* file_descr, int read_only,
         exit(EXIT_FAILURE);
     }

-    result = write(fd, "", 1);
+    result = write(fd, " ", 1);
     if (result != 1) {
         close(fd);
         printf("%s:\n", filename);
@@ -49,7 +49,13 @@ void* mmap_fortran(char* filename, size_t bytes, int* file_descr, int read_only,
     }

     if (single_node == 1) {
-        map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_POPULATE | MAP_NONBLOCK, fd, 0);
+        map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+/*
+        map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_POPULATE | MAP_NONBLOCK | MAP_NORESERVE, fd, 0);
+        if (map == MAP_FAILED) {
+            map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+        }
+*/
     }
     else {
         map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
     }
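Note on the fortran_mmap.c hunks: writing `" "` instead of `""` makes the one-byte write that extends the file explicit (with `""`, the byte written is the string's terminating NUL, which works but is easy to misread). Switching the single-node path from `MAP_PRIVATE | MAP_POPULATE | MAP_NONBLOCK` to `MAP_SHARED` presumably trades eagerly populated copy-on-write pages, which consume anonymous memory and are never written back, for demand-faulted pages backed by the file itself; the previous call, extended with `MAP_NORESERVE` and a plain `MAP_PRIVATE` fallback, is kept in a comment in case the private mapping needs to be restored.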
diff --git a/src/utils/linear_algebra.irp.f b/src/utils/linear_algebra.irp.f
index 20386b30..4e7ca87d 100644
--- a/src/utils/linear_algebra.irp.f
+++ b/src/utils/linear_algebra.irp.f
@@ -1856,7 +1856,7 @@ subroutine pivoted_cholesky( A, rank, tol, ndim, U)
 !
 ! matrix A is destroyed inside this subroutine
 ! Cholesky vectors are stored in U
-! dimension of U: U(1:rank, 1:n)
+! dimension of U: U(1:n, 1:rank)
 ! U is allocated inside this subroutine
 ! rank is the number of Cholesky vectors depending on tol
 !
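Note: with the corrected documentation, `U` comes back as `U(1:n, 1:rank)`, so `n` (not `rank`) is the leading dimension to pass to BLAS when the vectors are consumed. A sketch of a reconstruction check under that layout (illustrative values only; assumes `U` is allocatable so the routine can allocate it, and sets `rank` to an upper bound in case the interface also reads it on input):

```fortran
! A ~ U * U**T when U is ndim x rank (sketch, not QP code)
double precision, allocatable :: A(:,:), B(:,:), U(:,:)
integer                       :: ndim, rank
double precision, parameter   :: tol = 1.d-10
ndim = 100
rank = ndim   ! upper bound, in case rank is input/output
allocate(A(ndim,ndim), B(ndim,ndim))
! ... fill A with a symmetric positive semi-definite matrix ...
call pivoted_cholesky(A, rank, tol, ndim, U)  ! A destroyed, U allocated inside
! Leading dimension of U is ndim, matching U(1:ndim,1:rank):
call dgemm('N', 'T', ndim, ndim, rank, 1.d0, U, ndim, U, ndim, 0.d0, B, ndim)
! B now reproduces the original A to within tol
```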