Merge branch 'dev-stable' of github.com:QuantumPackage/qp2 into dev-stable

2025-03-14 12:46:45 +01:00 · 2024-07-03 14:50:14 +02:00 · 2024-07-03 14:50:14 +02:00 · 6dfa8f8b32
commit 6dfa8f8b32
parent 42746e17ef 44a7729f65
25 changed files with 2827 additions and 388 deletions
--- a/36
+++ b/36
@ -40,14 +40,16 @@ Usage:
  $(basename $0) -c <file>    
  $(basename $0) -h           
  $(basename $0) -i <package> 
+  $(basename $0) -g [nvidia|none]

 Options:
-  -c  <file>    Define a COMPILATION configuration file,
-                in "${QP_ROOT}/config/".
-  -h            Print the HELP message
-  -i <package>  INSTALL <package>. Use at your OWN RISK:
-                no support will be provided for the installation of
-                dependencies.
+  -c  <file>        Define a COMPILATION configuration file,
+                    in "${QP_ROOT}/config/".
+  -h                Print the HELP message
+  -i <package>      INSTALL <package>. Use at your OWN RISK:
+                    no support will be provided for the installation of
+                    dependencies.
+  -g [nvidia|none]  Choose GPU acceleration (experimental)

 Example:
  ./$(basename $0) -c config/gfortran.cfg
@ -83,7 +85,7 @@ function execute () {
 PACKAGES=""


-while getopts "d:c:i:h" c ; do
+while getopts "d:c:i:g:h" c ; do
    case "$c" in
        c)
            case "$OPTARG" in
@ -100,6 +102,9 @@ while getopts "d:c:i:h" c ; do
                "") help ; break;;
                *) PACKAGES="${PACKAGE} $OPTARG"
            esac;;
+        g)  # GPU backend; do not leave the loop so later options are still parsed
+            GPU="$OPTARG"
+            ;;
        h)
            help
            exit 0;;
@ -109,6 +114,23 @@ while getopts "d:c:i:h" c ; do
    esac
 done

+# Handle GPU acceleration: recreate the src/gpu_arch symlink for the backend in $GPU
+rm -f "${QP_ROOT}/src/gpu_arch"
+case "$GPU" in
+  amd) # AMD
+     echo "Activating AMD GPU acceleration"
+     ln -s "${QP_ROOT}/plugins/local/gpu_amd" "${QP_ROOT}/src/gpu_arch"
+     ;;
+  nvidia) # Nvidia
+     echo "Activating Nvidia GPU acceleration"
+     ln -s "${QP_ROOT}/plugins/local/gpu_nvidia" "${QP_ROOT}/src/gpu_arch"
+     ;;
+  *) # No Acceleration (also used when -g is absent or unrecognized)
+     echo "Disabling GPU acceleration"
+     ln -s "${QP_ROOT}/plugins/local/gpu_x86" "${QP_ROOT}/src/gpu_arch"
+     ;;
+esac
+
 # Trim leading and trailing spaces
 PACKAGES=$(echo $PACKAGES | xargs)

--- a/plugins/local/gpu_nvidia/LIB
+++ b/plugins/local/gpu_nvidia/LIB
@ -0,0 +1 @@
+-lcudart -lcublas -lcublasLt
--- a/plugins/local/gpu_nvidia/NEED
+++ b/plugins/local/gpu_nvidia/NEED
@ -0,0 +1 @@
+
--- a/plugins/local/gpu_nvidia/README.rst
+++ b/plugins/local/gpu_nvidia/README.rst
@ -0,0 +1,5 @@
+==========
+gpu_nvidia
+==========
+
+Nvidia implementation of GPU routines. Uses CUDA and CUBLAS libraries.
--- a/plugins/local/gpu_nvidia/gpu.c
+++ b/plugins/local/gpu_nvidia/gpu.c
@ -0,0 +1,331 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+
+
+/* Generic functions */
+
+/* Returns the device count reported by cudaGetDeviceCount, or 0 when the
+ * query fails, so that callers never receive an indeterminate value. */
+int gpu_ndevices() {
+  int ngpus = 0;
+  if (cudaGetDeviceCount(&ngpus) != cudaSuccess) {
+    return 0;
+  }
+  return ngpus;
+}
+
+/* Selects device number `igpu` through cudaSetDevice.
+ * A failing call trips an assertion, like the other wrappers of this file. */
+void gpu_set_device(int32_t igpu) {
+  cudaError_t rc = cudaSetDevice((int) igpu);
+  assert (rc == cudaSuccess);
+}
+
+
+/* Allocation functions */
+
+/* Allocates `size` bytes with cudaMallocManaged (cudaMemAttachGlobal) and
+ * stores the resulting address in *ptr.
+ * Managed memory is always used; no plain cudaMalloc path is taken.
+ * An allocation failure trips an assertion. */
+void gpu_allocate(void** ptr, const int64_t size) {
+    assert (ptr != NULL);
+    /* A negative size would wrap to a huge value once converted to size_t */
+    assert (size >= 0);
+
+    cudaError_t rc = cudaMallocManaged(ptr, (size_t) size, cudaMemAttachGlobal);
+    assert (rc == cudaSuccess);
+}
+
+/* Releases the memory referenced by *ptr with cudaFree, then resets *ptr
+ * to NULL. *ptr must not be NULL on entry. */
+void gpu_deallocate(void** ptr) {
+  void* const device_mem = *ptr;
+  assert (device_mem != NULL);
+  cudaFree(device_mem);
+  *ptr = NULL;
+}
+
+
+/* Memory transfer functions */
+
+/* Copies n bytes from cpu_ptr to gpu_ptr (cudaMemcpyHostToDevice).
+ * A failed copy trips an assertion instead of silently leaving stale data. */
+void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
+  assert (n >= 0);
+  cudaError_t rc = cudaMemcpy (gpu_ptr, cpu_ptr, (size_t) n, cudaMemcpyHostToDevice);
+  assert (rc == cudaSuccess);
+}
+
+/* Copies n bytes from gpu_ptr to cpu_ptr (cudaMemcpyDeviceToHost).
+ * A failed copy trips an assertion instead of silently leaving stale data. */
+void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
+  assert (n >= 0);
+  cudaError_t rc = cudaMemcpy (cpu_ptr, gpu_ptr, (size_t) n, cudaMemcpyDeviceToHost);
+  assert (rc == cudaSuccess);
+}
+
+/* Copies n bytes from gpu_ptr_src to gpu_ptr_dest (cudaMemcpyDeviceToDevice).
+ * A failed copy trips an assertion instead of silently leaving stale data. */
+void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
+  assert (n >= 0);
+  cudaError_t rc = cudaMemcpy (gpu_ptr_dest, gpu_ptr_src, (size_t) n, cudaMemcpyDeviceToDevice);
+  assert (rc == cudaSuccess);
+}
+
+
+/* Streams */
+
+/* Creates a stream with cudaStreamCreate and stores it in *ptr.
+ * A creation failure trips an assertion. */
+void gpu_stream_create(cudaStream_t* ptr) {
+  const cudaError_t status = cudaStreamCreate(ptr);
+  assert (status == cudaSuccess);
+}
+
+/* Destroys the stream stored in *ptr with cudaStreamDestroy and resets
+ * *ptr to NULL. Both a NULL `ptr` and a failed destruction trip an assertion. */
+void gpu_stream_destroy(cudaStream_t* ptr) {
+  assert (ptr != NULL);
+  const cudaError_t status = cudaStreamDestroy(*ptr);
+  assert (status == cudaSuccess);
+  *ptr = NULL;
+}
+
+/* Attaches `stream` to the cuBLAS `handle` through cublasSetStream.
+ * A failing call trips an assertion. */
+void gpu_set_stream(cublasHandle_t handle, cudaStream_t stream) {
+  cublasStatus_t rc = cublasSetStream(handle, stream);
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+}
+
+/* Calls cudaDeviceSynchronize and asserts that it succeeded, so that an
+ * error reported by the synchronization is not silently dropped. */
+void gpu_synchronize() {
+  cudaError_t rc = cudaDeviceSynchronize();
+  assert (rc == cudaSuccess);
+}
+
+
+/* BLAS functions */
+
+/* Creates a cuBLAS handle with cublasCreate and stores it in *ptr.
+ * A creation failure trips an assertion. */
+void gpu_blas_create(cublasHandle_t* ptr) {
+  const cublasStatus_t status = cublasCreate(ptr);
+  assert (status == CUBLAS_STATUS_SUCCESS);
+}
+
+
+/* Destroys the cuBLAS handle stored in *ptr and resets *ptr to NULL so the
+ * caller cannot reuse the stale handle. Both a NULL `ptr` and a failed
+ * destruction trip an assertion. */
+void gpu_blas_destroy(cublasHandle_t* ptr) {
+  assert (ptr != NULL);
+  cublasStatus_t rc = cublasDestroy(*ptr);
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+  /* Clear the caller's handle, not the local parameter */
+  *ptr = NULL;
+}
+
+
+/* Double-precision dot product of x and y computed by cublasDdot; the
+ * value is written to *result. The 64-bit sizes and strides are narrowed
+ * to int, and the narrowing is asserted to be lossless. */
+void gpu_ddot(cublasHandle_t handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
+  assert (handle != NULL);
+
+  /* Narrow to the int arguments taken by cublasDdot */
+  const int n32    = (int) n;
+  const int incx32 = (int) incx;
+  const int incy32 = (int) incy;
+
+  /* A value that does not survive the round trip has overflowed */
+  assert ( (int64_t)    n32 == n   );
+  assert ( (int64_t) incx32 == incx);
+  assert ( (int64_t) incy32 == incy);
+
+  const cublasStatus_t status = cublasDdot(handle, n32, x, incx32, y, incy32, result);
+  assert (status == CUBLAS_STATUS_SUCCESS);
+}
+
+
+
+/* Single-precision dot product of x and y computed by cublasSdot into a
+ * local variable, which is then copied to *result. The 64-bit sizes and
+ * strides are narrowed to int, and the narrowing is asserted to be lossless. */
+void gpu_sdot(cublasHandle_t handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
+  assert (handle != NULL);
+
+  /* Narrow to the int arguments taken by cublasSdot */
+  const int n32    = (int) n;
+  const int incx32 = (int) incx;
+  const int incy32 = (int) incy;
+
+  /* A value that does not survive the round trip has overflowed */
+  assert ( (int64_t)    n32 == n   );
+  assert ( (int64_t) incx32 == incx);
+  assert ( (int64_t) incy32 == incy);
+
+  float dot = 0.0f;
+  const cublasStatus_t status = cublasSdot(handle, n32, x, incx32, y, incy32, &dot);
+  assert (status == CUBLAS_STATUS_SUCCESS);
+  *result = dot;
+}
+
+
+
+/* Double-precision matrix-vector product forwarded to cublasDgemv.
+ * transa: 'T'/'t' selects CUBLAS_OP_T, anything else CUBLAS_OP_N.
+ * Sizes, leading dimension and strides are narrowed to int (asserted to be
+ * lossless), and the cuBLAS status is asserted as in gpu_ddot. */
+void gpu_dgemv(cublasHandle_t handle, const char transa, const int64_t m, const int64_t n, const double alpha,
+               const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) {
+
+  assert (handle != NULL);
+
+  /* Convert to int */
+  int m_, n_, lda_, incx_, incy_;
+
+  m_    = (int) m;
+  n_    = (int) n;
+  lda_  = (int) lda;
+  incx_ = (int) incx;
+  incy_ = (int) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    m_ == m   );
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t)  lda_ == lda );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
+
+  cublasStatus_t rc = cublasDgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_);
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+}
+
+
+
+/* Single-precision matrix-vector product forwarded to cublasSgemv.
+ * transa: 'T'/'t' selects CUBLAS_OP_T, anything else CUBLAS_OP_N.
+ * Sizes, leading dimension and strides are narrowed to int (asserted to be
+ * lossless), and the cuBLAS status is asserted as in gpu_sdot. */
+void gpu_sgemv(cublasHandle_t handle, const char transa, const int64_t m, const int64_t n, const float alpha,
+               const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) {
+
+  assert (handle != NULL);
+
+  /* Convert to int */
+  int m_, n_, lda_, incx_, incy_;
+
+  m_    = (int) m;
+  n_    = (int) n;
+  lda_  = (int) lda;
+  incx_ = (int) incx;
+  incy_ = (int) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    m_ == m   );
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t)  lda_ == lda );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
+
+  cublasStatus_t rc = cublasSgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_);
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+}
+
+
+/* Double-precision matrix-matrix product forwarded to cublasDgemm.
+ * transa/transb: 'T'/'t' selects CUBLAS_OP_T, anything else CUBLAS_OP_N.
+ * Sizes and leading dimensions are narrowed to int (asserted to be
+ * lossless), and the cuBLAS status is asserted as in gpu_ddot. */
+void gpu_dgemm(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
+               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) {
+
+  assert (handle != NULL);
+
+  /* Convert to int */
+  int m_, n_, k_, lda_, ldb_, ldc_;
+
+  m_   = (int) m;
+  n_   = (int) n;
+  k_   = (int) k;
+  lda_ = (int) lda;
+  ldb_ = (int) ldb;
+  ldc_ = (int) ldc;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)   m_ == m  );
+  assert ( (int64_t)   n_ == n  );
+  assert ( (int64_t)   k_ == k  );
+  assert ( (int64_t) lda_ == lda);
+  assert ( (int64_t) ldb_ == ldb);
+  assert ( (int64_t) ldc_ == ldc);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  cublasOperation_t transb_ = CUBLAS_OP_N;
+  if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
+  if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
+
+  cublasStatus_t rc = cublasDgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_);
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+}
+
+
+
+/* Single-precision matrix-matrix product forwarded to cublasSgemm.
+ * transa/transb: 'T'/'t' selects CUBLAS_OP_T, anything else CUBLAS_OP_N.
+ * Sizes and leading dimensions are narrowed to int (asserted to be
+ * lossless), and the cuBLAS status is asserted as in gpu_sdot. */
+void gpu_sgemm(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
+               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) {
+
+  assert (handle != NULL);
+
+  /* Convert to int */
+  int m_, n_, k_, lda_, ldb_, ldc_;
+
+  m_   = (int) m;
+  n_   = (int) n;
+  k_   = (int) k;
+  lda_ = (int) lda;
+  ldb_ = (int) ldb;
+  ldc_ = (int) ldc;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)   m_ == m  );
+  assert ( (int64_t)   n_ == n  );
+  assert ( (int64_t)   k_ == k  );
+  assert ( (int64_t) lda_ == lda);
+  assert ( (int64_t) ldb_ == ldb);
+  assert ( (int64_t) ldc_ == ldc);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  cublasOperation_t transb_ = CUBLAS_OP_N;
+  if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
+  if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
+
+  cublasStatus_t rc = cublasSgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_);
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+}
+
+
+/* Double-precision matrix addition/transposition forwarded to cublasDgeam.
+ * transa/transb: 'T'/'t' selects CUBLAS_OP_T, anything else CUBLAS_OP_N.
+ * Sizes and leading dimensions are narrowed to int (asserted to be
+ * lossless), and the cuBLAS status is asserted as in gpu_ddot. */
+void gpu_dgeam(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
+               const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
+  assert (handle != NULL);
+
+  /* Convert to int */
+  int m_, n_, lda_, ldb_, ldc_;
+
+  m_   = (int) m;
+  n_   = (int) n;
+  lda_ = (int) lda;
+  ldb_ = (int) ldb;
+  ldc_ = (int) ldc;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)   m_ == m  );
+  assert ( (int64_t)   n_ == n  );
+  assert ( (int64_t) lda_ == lda);
+  assert ( (int64_t) ldb_ == ldb);
+  assert ( (int64_t) ldc_ == ldc);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  cublasOperation_t transb_ = CUBLAS_OP_N;
+  if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
+  if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
+
+  cublasStatus_t rc = cublasDgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_);
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+
+}
+
+
+/* Single-precision matrix addition/transposition forwarded to cublasSgeam.
+ * transa/transb: 'T'/'t' selects CUBLAS_OP_T, anything else CUBLAS_OP_N.
+ * Sizes and leading dimensions are narrowed to int (asserted to be
+ * lossless), and the cuBLAS status is asserted as in gpu_sdot. */
+void gpu_sgeam(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
+               const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
+  assert (handle != NULL);
+
+  /* Convert to int */
+  int m_, n_, lda_, ldb_, ldc_;
+
+  m_   = (int) m;
+  n_   = (int) n;
+  lda_ = (int) lda;
+  ldb_ = (int) ldb;
+  ldc_ = (int) ldc;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)   m_ == m  );
+  assert ( (int64_t)   n_ == n  );
+  assert ( (int64_t) lda_ == lda);
+  assert ( (int64_t) ldb_ == ldb);
+  assert ( (int64_t) ldc_ == ldc);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  cublasOperation_t transb_ = CUBLAS_OP_N;
+  if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
+  if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
+
+  cublasStatus_t rc = cublasSgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_);
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+
+}
--- a/plugins/local/gpu_x86/NEED
+++ b/plugins/local/gpu_x86/NEED
@ -0,0 +1 @@
+
--- a/plugins/local/gpu_x86/README.rst
+++ b/plugins/local/gpu_x86/README.rst
@ -0,0 +1,5 @@
+=======
+gpu_x86
+=======
+
+x86 implementation of GPU routines. For use when GPUs are not available.
--- a/plugins/local/gpu_x86/gpu.c
+++ b/plugins/local/gpu_x86/gpu.c
@ -0,0 +1,500 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <assert.h>
+
+/* Generic functions */
+
+/* CPU fallback: always reports zero devices. */
+int gpu_ndevices() {
+  const int n_devices = 0;
+  return n_devices;
+}
+
+/* CPU fallback: there is no device to select, so the argument is ignored. */
+void gpu_set_device(int32_t i) {
+  (void) i;
+}
+
+
+/* Allocation functions */
+
+/* Allocates n bytes with malloc and stores the address in *ptr.
+ * When malloc returns NULL, a message is printed with perror and *ptr is
+ * left NULL. */
+void gpu_allocate(void** ptr, const int64_t n) {
+  void* const buffer = malloc((size_t) n);
+  *ptr = buffer;
+  if (!buffer) {
+    perror("Allocation failed");
+  }
+}
+
+/* Frees the buffer referenced by *ptr and resets *ptr to NULL. */
+void gpu_deallocate(void** ptr) {
+  void* const buffer = *ptr;
+  free(buffer);
+  *ptr = NULL;
+}
+
+
+/* Memory transfer functions */
+
+/* CPU fallback: copies n bytes from cpu_ptr to gpu_ptr with memcpy. */
+void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
+  const size_t n_bytes = (size_t) n;
+  (void) memcpy(gpu_ptr, cpu_ptr, n_bytes);
+}
+
+/* CPU fallback: copies n bytes from gpu_ptr to cpu_ptr with memcpy. */
+void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
+  const size_t n_bytes = (size_t) n;
+  (void) memcpy(cpu_ptr, gpu_ptr, n_bytes);
+}
+
+/* CPU fallback: copies n bytes from gpu_ptr_src to gpu_ptr_dest with memcpy. */
+void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
+  const size_t n_bytes = (size_t) n;
+  (void) memcpy(gpu_ptr_dest, gpu_ptr_src, n_bytes);
+}
+
+
+/* Streams */
+
+/* CPU fallback: no stream exists; *ptr receives the non-NULL placeholder
+ * value 2. */
+void gpu_stream_create(void** ptr) {
+  void* const placeholder_stream = (void*) 2;
+  *ptr = placeholder_stream;
+}
+
+/* CPU fallback: nothing to release; only resets *ptr to NULL. */
+void gpu_stream_destroy(void** ptr) {
+  ptr[0] = NULL;
+}
+
+/* CPU fallback: streams have no effect, so both arguments are ignored. */
+void gpu_set_stream(void* handle, void* stream) {
+  (void) handle;
+  (void) stream;
+}
+
+/* CPU fallback: does nothing. */
+void gpu_synchronize() {
+  /* intentionally empty */
+}
+
+
+/* BLAS functions */
+
+/* CPU fallback: no BLAS context exists; *handle receives the non-NULL
+ * placeholder value 1, which satisfies the `handle != NULL` assertions of
+ * the BLAS wrappers in this file. */
+void gpu_blas_create(void** handle) {
+  void* const placeholder_handle = (void*) 1;
+  *handle = placeholder_handle;
+}
+
+
+/* CPU fallback: nothing to release; only resets *handle to NULL. */
+void gpu_blas_destroy(void** handle) {
+  handle[0] = NULL;
+}
+
+
+double ddot_(const int32_t* n, const double* x, const int32_t* incx, const double* y, const int32_t* incy);
+
+/* Double-precision dot product of x and y computed by the external ddot_
+ * routine; the value is written to *result. The 64-bit size and strides
+ * are narrowed to int32_t, and the narrowing is asserted to be lossless. */
+void gpu_ddot(void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
+  assert (handle != NULL);
+
+  /* ddot_ takes its integer arguments as pointers to int32_t */
+  const int32_t n32    = (int32_t) n;
+  const int32_t incx32 = (int32_t) incx;
+  const int32_t incy32 = (int32_t) incy;
+
+  /* A value that does not survive the round trip has overflowed */
+  assert ( (int64_t)    n32 == n   );
+  assert ( (int64_t) incx32 == incx);
+  assert ( (int64_t) incy32 == incy);
+
+  const double dot = ddot_(&n32, x, &incx32, y, &incy32);
+  *result = dot;
+}
+
+
+float sdot_(const int32_t* n, const float* x, const int32_t* incx, const float* y, const int32_t* incy);
+
+/* Single-precision dot product of x and y computed by the external sdot_
+ * routine; the value is written to *result. The 64-bit size and strides
+ * are narrowed to int32_t, and the narrowing is asserted to be lossless. */
+void gpu_sdot(void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
+  assert (handle != NULL);
+
+  /* sdot_ takes its integer arguments as pointers to int32_t */
+  const int32_t n32    = (int32_t) n;
+  const int32_t incx32 = (int32_t) incx;
+  const int32_t incy32 = (int32_t) incy;
+
+  /* A value that does not survive the round trip has overflowed */
+  assert ( (int64_t)    n32 == n   );
+  assert ( (int64_t) incx32 == incx);
+  assert ( (int64_t) incy32 == incy);
+
+  const float dot = sdot_(&n32, x, &incx32, y, &incy32);
+  *result = dot;
+}
+
+
+void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha,
+            const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy);
+
+/* Double-precision matrix-vector product forwarded to the external dgemv_
+ * routine. `handle` is only checked to be non-NULL. The 64-bit sizes,
+ * leading dimension and strides are narrowed to int32_t, and the narrowing
+ * is asserted to be lossless. */
+void gpu_dgemv(void* handle, const char transa, const int64_t m, const int64_t n, const double alpha,
+               const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) {
+
+  assert (handle != NULL);
+
+  /* dgemv_ takes its integer arguments as pointers to int32_t */
+  const int32_t m32    = (int32_t) m;
+  const int32_t n32    = (int32_t) n;
+  const int32_t lda32  = (int32_t) lda;
+  const int32_t incx32 = (int32_t) incx;
+  const int32_t incy32 = (int32_t) incy;
+
+  /* A value that does not survive the round trip has overflowed */
+  assert ( (int64_t)    m32 == m   );
+  assert ( (int64_t)    n32 == n   );
+  assert ( (int64_t)  lda32 == lda );
+  assert ( (int64_t) incx32 == incx);
+  assert ( (int64_t) incy32 == incy);
+
+  dgemv_(&transa, &m32, &n32, &alpha, a, &lda32, x, &incx32, &beta, y, &incy32);
+}
+
+
+void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha,
+               const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy);
+
+/* Single-precision matrix-vector product forwarded to the external sgemv_
+ * routine. `handle` is only checked to be non-NULL. The 64-bit sizes,
+ * leading dimension and strides are narrowed to int32_t, and the narrowing
+ * is asserted to be lossless. */
+void gpu_sgemv(void* handle, const char transa, const int64_t m, const int64_t n, const float alpha,
+               const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) {
+
+  assert (handle != NULL);
+
+  /* sgemv_ takes its integer arguments as pointers to int32_t */
+  const int32_t m32    = (int32_t) m;
+  const int32_t n32    = (int32_t) n;
+  const int32_t lda32  = (int32_t) lda;
+  const int32_t incx32 = (int32_t) incx;
+  const int32_t incy32 = (int32_t) incy;
+
+  /* A value that does not survive the round trip has overflowed */
+  assert ( (int64_t)    m32 == m   );
+  assert ( (int64_t)    n32 == n   );
+  assert ( (int64_t)  lda32 == lda );
+  assert ( (int64_t) incx32 == incx);
+  assert ( (int64_t) incy32 == incy);
+
+  sgemv_(&transa, &m32, &n32, &alpha, a, &lda32, x, &incx32, &beta, y, &incy32);
+}
+
+
+void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha,
+            const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc);
+
+/* Double-precision matrix-matrix product forwarded to the external dgemm_
+ * routine. `handle` is only checked to be non-NULL. The 64-bit sizes and
+ * leading dimensions are narrowed to int32_t, and the narrowing is
+ * asserted to be lossless. */
+void gpu_dgemm(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
+               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) {
+
+  assert (handle != NULL);
+
+  /* dgemm_ takes its integer arguments as pointers to int32_t */
+  const int32_t m32   = (int32_t) m;
+  const int32_t n32   = (int32_t) n;
+  const int32_t k32   = (int32_t) k;
+  const int32_t lda32 = (int32_t) lda;
+  const int32_t ldb32 = (int32_t) ldb;
+  const int32_t ldc32 = (int32_t) ldc;
+
+  /* A value that does not survive the round trip has overflowed */
+  assert ( (int64_t)   m32 == m  );
+  assert ( (int64_t)   n32 == n  );
+  assert ( (int64_t)   k32 == k  );
+  assert ( (int64_t) lda32 == lda);
+  assert ( (int64_t) ldb32 == ldb);
+  assert ( (int64_t) ldc32 == ldc);
+
+  dgemm_(&transa, &transb, &m32, &n32, &k32, &alpha, a, &lda32, b, &ldb32, &beta, c, &ldc32);
+}
+
+
+
+void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha,
+            const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc);
+
+/* Single-precision matrix-matrix product forwarded to the external sgemm_
+ * routine. `handle` is only checked to be non-NULL. The 64-bit sizes and
+ * leading dimensions are narrowed to int32_t, and the narrowing is
+ * asserted to be lossless. */
+void gpu_sgemm(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
+               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) {
+
+  assert (handle != NULL);
+
+  /* sgemm_ takes its integer arguments as pointers to int32_t */
+  const int32_t m32   = (int32_t) m;
+  const int32_t n32   = (int32_t) n;
+  const int32_t k32   = (int32_t) k;
+  const int32_t lda32 = (int32_t) lda;
+  const int32_t ldb32 = (int32_t) ldb;
+  const int32_t ldc32 = (int32_t) ldc;
+
+  /* A value that does not survive the round trip has overflowed */
+  assert ( (int64_t)   m32 == m  );
+  assert ( (int64_t)   n32 == n  );
+  assert ( (int64_t)   k32 == k  );
+  assert ( (int64_t) lda32 == lda);
+  assert ( (int64_t) ldb32 == ldb);
+  assert ( (int64_t) ldc32 == ldc);
+
+  sgemm_(&transa, &transb, &m32, &n32, &k32, &alpha, a, &lda32, b, &ldb32, &beta, c, &ldc32);
+}
+
+
+/* CPU implementation of geam in double precision:
+ *
+ *     c(i,j) = alpha * op(a)(i,j) + beta * op(b)(i,j),   0 <= i < m, 0 <= j < n
+ *
+ * Element (i,j) of c is c[j*ldc+i]. op(a)(i,j) is a[i*lda+j] when transa is
+ * 'T'/'t' and a[j*lda+i] otherwise; likewise for b with transb and ldb.
+ * Any flag other than 'T'/'t' is treated as 'N', which is also what the
+ * gpu_nvidia implementation does, so c is always written.
+ *
+ * When alpha == 0, `a` is never read; when alpha != 0 and beta == 0, `b`
+ * is never read. `handle` is only checked to be non-NULL. */
+void gpu_dgeam(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
+               const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
+  assert (handle != NULL);
+
+  const bool ta = (transa == 'T' || transa == 't');
+  const bool tb = (transb == 'T' || transb == 't');
+
+  /* Offsets contributed by the row index i and by the column index j */
+  const int64_t a_i = ta ? lda : 1;
+  const int64_t a_j = ta ? 1   : lda;
+  const int64_t b_i = tb ? ldb : 1;
+  const int64_t b_j = tb ? 1   : ldb;
+
+  if (alpha == 0.) {
+
+    for (int64_t j=0 ; j<n ; ++j) {
+      for (int64_t i=0 ; i<m ; ++i) {
+        c[j*ldc+i] = beta * b[j*b_j + i*b_i];
+      }
+    }
+
+  } else if (beta == 0.) {
+
+    for (int64_t j=0 ; j<n ; ++j) {
+      for (int64_t i=0 ; i<m ; ++i) {
+        c[j*ldc+i] = alpha * a[j*a_j + i*a_i];
+      }
+    }
+
+  } else {
+
+    for (int64_t j=0 ; j<n ; ++j) {
+      for (int64_t i=0 ; i<m ; ++i) {
+        c[j*ldc+i] = alpha * a[j*a_j + i*a_i] + beta * b[j*b_j + i*b_i];
+      }
+    }
+
+  }
+}
+
+
+/* CPU implementation of geam in single precision:
+ *
+ *     c(i,j) = alpha * op(a)(i,j) + beta * op(b)(i,j),   0 <= i < m, 0 <= j < n
+ *
+ * Element (i,j) of c is c[j*ldc+i]. op(a)(i,j) is a[i*lda+j] when transa is
+ * 'T'/'t' and a[j*lda+i] otherwise; likewise for b with transb and ldb.
+ * Any flag other than 'T'/'t' is treated as 'N', which is also what the
+ * gpu_nvidia implementation does, so c is always written.
+ *
+ * When alpha == 0, `a` is never read; when alpha != 0 and beta == 0, `b`
+ * is never read. `handle` is only checked to be non-NULL. */
+void gpu_sgeam(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
+               const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
+  assert (handle != NULL);
+
+  const bool ta = (transa == 'T' || transa == 't');
+  const bool tb = (transb == 'T' || transb == 't');
+
+  /* Offsets contributed by the row index i and by the column index j */
+  const int64_t a_i = ta ? lda : 1;
+  const int64_t a_j = ta ? 1   : lda;
+  const int64_t b_i = tb ? ldb : 1;
+  const int64_t b_j = tb ? 1   : ldb;
+
+  if (alpha == 0.) {
+
+    for (int64_t j=0 ; j<n ; ++j) {
+      for (int64_t i=0 ; i<m ; ++i) {
+        c[j*ldc+i] = beta * b[j*b_j + i*b_i];
+      }
+    }
+
+  } else if (beta == 0.) {
+
+    for (int64_t j=0 ; j<n ; ++j) {
+      for (int64_t i=0 ; i<m ; ++i) {
+        c[j*ldc+i] = alpha * a[j*a_j + i*a_i];
+      }
+    }
+
+  } else {
+
+    for (int64_t j=0 ; j<n ; ++j) {
+      for (int64_t i=0 ; i<m ; ++i) {
+        c[j*ldc+i] = alpha * a[j*a_j + i*a_i] + beta * b[j*b_j + i*b_i];
+      }
+    }
+
+  }
+}
--- a/plugins/local/non_h_ints_mu/deb_aos.irp.f
+++ b/plugins/local/non_h_ints_mu/deb_aos.irp.f
@ -31,6 +31,9 @@ subroutine print_aos()
  integer          :: i, ipoint
  double precision :: r(3)
  double precision :: ao_val, ao_der(3), ao_lap
+  double precision :: accu_vgl(5)
+  double precision :: accu_vgl_nrm(5)
+
  double precision :: mo_val, mo_der(3), mo_lap

  PROVIDE final_grid_points aos_in_r_array aos_grad_in_r_array aos_lapl_in_r_array
@ -40,9 +43,6 @@ subroutine print_aos()
    write(1000, '(3(f15.7, 3X))') r
  enddo
  
-double precision :: accu_vgl(5)
-double precision :: accu_vgl_nrm(5)
-
  do ipoint = 1, n_points_final_grid
    do i = 1, ao_num
      ao_val    = aos_in_r_array     (i,ipoint)
--- a/plugins/local/non_h_ints_mu/total_tc_int.irp.f
+++ b/plugins/local/non_h_ints_mu/total_tc_int.irp.f
@ -78,7 +78,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
      !$OMP PRIVATE (i, j, k, l, ipoint, ao_i_r, ao_k_r, weight1) &
      !$OMP SHARED (ao_num, n_points_final_grid, ao_two_e_tc_tot, &
      !$OMP         aos_in_r_array_transp, final_weight_at_r_vector, int2_grad1_u12_square_ao)
-      !$OMP DO COLLAPSE(4)
+      !$OMP DO COLLAPSE(3)
      do i = 1, ao_num
        do k = 1, ao_num
          do l = 1, ao_num
@ -188,7 +188,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
        !$OMP SHARED (ao_num, n_points_final_grid, ao_two_e_tc_tot,    &
        !$OMP         aos_in_r_array_transp, final_weight_at_r_vector, &
        !$OMP         int2_grad1_u12_ao, aos_grad_in_r_array_transp_bis)
-        !$OMP DO COLLAPSE(4)
+        !$OMP DO COLLAPSE(3)
        do i = 1, ao_num
          do k = 1, ao_num
            do l = 1, ao_num
@ -270,7 +270,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
      !$OMP PARALLEL DEFAULT(NONE)                     &
      !$OMP PRIVATE(i, j, k, l, integ_zero, integ_val) & 
      !$OMP SHARED(ao_num, ao_two_e_tc_tot)
-      !$OMP DO COLLAPSE(4)
+      !$OMP DO COLLAPSE(3)
      do j = 1, ao_num
        do l = 1, ao_num
          do i = 1, ao_num
@ -293,7 +293,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
      !$OMP PARALLEL DEFAULT(NONE)                            &
      !$OMP SHARED(ao_num, ao_two_e_tc_tot, ao_integrals_map) &
      !$OMP PRIVATE(i, j, k, l)
-      !$OMP DO COLLAPSE(4)
+      !$OMP DO COLLAPSE(3)
      do j = 1, ao_num
        do l = 1, ao_num
          do i = 1, ao_num
@ -306,7 +306,6 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
      enddo
      !$OMP END DO
      !$OMP END PARALLEL
-      !call clear_ao_map()
      FREE ao_integrals_map
    endif

--- a/plugins/local/tc_int/NEED
+++ b/plugins/local/tc_int/NEED
@ -0,0 +1,5 @@
+tc_keywords
+jastrow
+qmckl
+becke_numerical_grid
+dft_utils_in_r
--- a/plugins/local/tc_int/README.rst
+++ b/plugins/local/tc_int/README.rst
@ -0,0 +1,4 @@
+======
+tc_int
+======
+
--- a/plugins/local/tc_int/compute_tc_int.irp.f
+++ b/plugins/local/tc_int/compute_tc_int.irp.f
@ -0,0 +1,295 @@
+
+! ---
+
+subroutine provide_int2_grad1_u12_ao()
+
+  BEGIN_DOC
+  !
+  ! Builds int2_grad1_u12_ao and tc_int_2e_ao on the numerical grids and writes them
+  ! as unformatted files into the work/ directory of the EZFIO database
+  ! (work/int2_grad1_u12_ao, components 1:3 only, and work/ao_two_e_tc_tot).
+  !
+  ! int2_grad1_u12_ao(i,j,ipoint,1) = \int dr2         [\grad1 u(r1,r2)]_x1 \chi_i(r2) \chi_j(r2)
+  ! int2_grad1_u12_ao(i,j,ipoint,2) = \int dr2         [\grad1 u(r1,r2)]_y1 \chi_i(r2) \chi_j(r2)
+  ! int2_grad1_u12_ao(i,j,ipoint,3) = \int dr2         [\grad1 u(r1,r2)]_z1 \chi_i(r2) \chi_j(r2)
+  ! int2_grad1_u12_ao(i,j,ipoint,4) = \int dr2 [-(1/2) [\grad1 u(r1,r2)]^2] \chi_i(r2) \chi_j(r2)
+  !
+  !
+  ! tc_int_2e_ao(k,i,l,j) = (ki|V^TC(r_12)|lj)
+  !                       = <lk| V^TC(r_12) |ji> where V^TC(r_12) is the total TC operator
+  !                       = tc_grad_and_lapl_ao(k,i,l,j) + tc_grad_square_ao(k,i,l,j) + ao_two_e_coul(k,i,l,j)
+  ! where:
+  !
+  ! tc_grad_and_lapl_ao(k,i,l,j) = < k l | -1/2 \Delta_1 u(r1,r2) - \grad_1 u(r1,r2) . \grad_1 | ij >
+  !                              = -1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2      \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2)
+  !                              =  1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 (-1) \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2)
+  !
+  ! tc_grad_square_ao(k,i,l,j) = -1/2 <kl | |\grad_1 u(r1,r2)|^2 + |\grad_2 u(r1,r2)|^2 | ij>
+  !
+  ! ao_two_e_coul(k,i,l,j) = < l k | 1/r12 | j i > = ( k i | 1/r12 | l j )
+  !
+  END_DOC
+
+  implicit none
+
+  integer                       :: i, j, k, l, m, ipoint, jpoint
+  integer                       :: n_blocks, n_rest, n_pass
+  integer                       :: i_blocks, i_rest, i_pass, ii
+  double precision              :: mem, n_double
+  double precision              :: weight1, ao_k_r, ao_i_r
+  double precision              :: time0, time1, time2, tc1, tc2, tc
+  double precision              :: time_tc0
+  double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:), tc_int_2e_ao(:,:,:,:)
+  double precision, allocatable :: tmp(:,:,:), c_mat(:,:,:), tmp_grad1_u12(:,:,:)
+
+  double precision, external    :: get_ao_two_e_integral
+
+
+  ! Provide up front every entity that is read inside the OpenMP regions below,
+  ! including the transposed extra-grid AOs actually used in the first loop.
+  PROVIDE final_weight_at_r_vector_extra aos_in_r_array_extra aos_in_r_array_extra_transp
+  PROVIDE final_weight_at_r_vector aos_grad_in_r_array_transp_bis aos_in_r_array_transp
+
+
+
+  print*, ' start provide_int2_grad1_u12_ao ...'
+  call wall_time(time0)
+
+  ! Size the r1 blocks from the memory still available (qp_max_mem minus current usage).
+  ! Each r1 point of a block needs 4 vectors of n_points_extra_final_grid doubles.
+  call total_memory(mem)
+  mem      = max(1.d0, qp_max_mem - mem)
+  n_double = mem * 1.d8
+  ! At least one point per block: a zero block size would divide by zero just below.
+  n_blocks = max(1, int(min(n_double / (n_points_extra_final_grid * 4.d0), 1.d0*n_points_final_grid)))
+  n_rest   = int(mod(n_points_final_grid, n_blocks))
+  n_pass   = int((n_points_final_grid - n_rest) / n_blocks)
+
+  call write_int(6, n_pass, 'Number of passes')
+  call write_int(6, n_blocks, 'Size of the blocks')
+  call write_int(6, n_rest, 'Size of the last block')
+
+  ! ---
+  ! int2_grad1_u12_ao
+  ! ---
+
+  allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,4))
+
+  ! tmp(jpoint,i,j) = w(r2) chi_i(r2) chi_j(r2) on the extra grid
+  allocate(tmp(n_points_extra_final_grid,ao_num,ao_num))
+  !$OMP PARALLEL               &
+  !$OMP DEFAULT (NONE)         &
+  !$OMP PRIVATE (j, i, jpoint) &
+  !$OMP SHARED (tmp, ao_num, n_points_extra_final_grid, final_weight_at_r_vector_extra, aos_in_r_array_extra_transp)
+  !$OMP DO SCHEDULE (static)
+  do j = 1, ao_num
+    do i = 1, ao_num
+      do jpoint = 1, n_points_extra_final_grid
+        tmp(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  allocate(tmp_grad1_u12(n_points_extra_final_grid,n_blocks,4))
+
+  ! tc accumulates the wall time spent in get_grad1_u12_for_tc only
+  tc = 0.d0
+
+  ! Full blocks of n_blocks r1 points
+  do i_pass = 1, n_pass
+    ii = (i_pass-1)*n_blocks + 1
+
+    call wall_time(tc1)
+    !$OMP PARALLEL                   &
+    !$OMP DEFAULT (NONE)             &
+    !$OMP PRIVATE (i_blocks, ipoint) &
+    !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12)
+    !$OMP DO
+    do i_blocks = 1, n_blocks
+      ipoint = ii - 1 + i_blocks ! r1
+      call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_blocks,1), tmp_grad1_u12(1,i_blocks,2), tmp_grad1_u12(1,i_blocks,3), tmp_grad1_u12(1,i_blocks,4))
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+    call wall_time(tc2)
+    tc = tc + tc2 - tc1
+
+    ! Contract over the extra grid: one dgemm per component m
+    do m = 1, 4
+      call dgemm( "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0                     &
+                , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid &
+                , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num)
+    enddo
+  enddo
+
+  deallocate(tmp_grad1_u12)
+
+
+  ! Remaining n_rest r1 points that do not fill a whole block
+  if(n_rest .gt. 0) then
+
+    allocate(tmp_grad1_u12(n_points_extra_final_grid,n_rest,4))
+
+    ii = n_pass*n_blocks + 1
+
+    call wall_time(tc1)
+    !$OMP PARALLEL                 &
+    !$OMP DEFAULT (NONE)           &
+    !$OMP PRIVATE (i_rest, ipoint) &
+    !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12)
+    !$OMP DO
+    do i_rest = 1, n_rest
+      ipoint = ii - 1 + i_rest ! r1
+      call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_rest,1), tmp_grad1_u12(1,i_rest,2), tmp_grad1_u12(1,i_rest,3), tmp_grad1_u12(1,i_rest,4))
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+    call wall_time(tc2)
+    tc = tc + tc2 - tc1
+
+    do m = 1, 4
+      call dgemm( "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0                       &
+                , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid &
+                , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num)
+    enddo
+
+    deallocate(tmp_grad1_u12)
+  endif
+
+  deallocate(tmp)
+
+
+  call wall_time(time1)
+  print*, ' wall time for int2_grad1_u12_ao (min) = ', (time1-time0) / 60.d0
+  print*, ' wall time Jastrow derivatives   (min) = ', tc / 60.d0
+  call print_memory_usage()
+
+  ! ---
+  ! tc_int_2e_ao
+  ! ---
+
+
+  allocate(tc_int_2e_ao(ao_num,ao_num,ao_num,ao_num))
+
+  call wall_time(time1)
+  ! time1 is reused as a per-section timer below; keep the section start separately
+  ! so that the total reported at the end covers the whole tc_int_2e_ao build.
+  time_tc0 = time1
+
+  ! Hermitian part: contraction of component 4 with w(r1) chi_i(r1) chi_k(r1)
+  allocate(c_mat(n_points_final_grid,ao_num,ao_num))
+  !$OMP PARALLEL               &
+  !$OMP DEFAULT (NONE)         &
+  !$OMP PRIVATE (i, k, ipoint) &
+  !$OMP SHARED (aos_in_r_array_transp, c_mat, ao_num, n_points_final_grid, final_weight_at_r_vector)
+  !$OMP DO SCHEDULE (static)
+  do i = 1, ao_num
+    do k = 1, ao_num
+      do ipoint = 1, n_points_final_grid
+        c_mat(ipoint,k,i) = final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,k)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+  call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0            &
+            , int2_grad1_u12_ao(1,1,1,4), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
+            , 0.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num)
+  deallocate(c_mat)
+
+  call wall_time(time2)
+  print*, ' wall time of Hermitian part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0
+  call print_memory_usage()
+
+  ! ---
+
+  call wall_time(time1)
+
+  ! Non-Hermitian part: for each Cartesian component m, contract int2_grad1_u12_ao(:,:,:,m)
+  ! with 1/2 w(r1) (chi_k grad_m chi_i - chi_i grad_m chi_k) and accumulate with factor -1.
+  allocate(c_mat(n_points_final_grid,ao_num,ao_num))
+  do m = 1, 3
+    !$OMP PARALLEL                                                              &
+    !$OMP DEFAULT (NONE)                                                        &
+    !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r)                       &
+    !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, c_mat, &
+    !$OMP         ao_num, n_points_final_grid, final_weight_at_r_vector, m)
+    !$OMP DO SCHEDULE (static)
+    do i = 1, ao_num
+      do k = 1, ao_num
+        do ipoint = 1, n_points_final_grid
+
+          weight1 = 0.5d0 * final_weight_at_r_vector(ipoint)
+          ao_i_r  = aos_in_r_array_transp(ipoint,i)
+          ao_k_r  = aos_in_r_array_transp(ipoint,k)
+
+          c_mat(ipoint,k,i) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,m) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,m))
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+    call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, -1.d0           &
+              , int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
+              , 1.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num)
+  enddo
+  deallocate(c_mat)
+
+  call wall_time(time2)
+  print*, ' wall time of non-Hermitian part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0
+  call print_memory_usage()
+
+  ! ---
+
+  call wall_time(time1)
+
+  ! tc_int_2e_ao is handled as an (ao_num^2 x ao_num^2) matrix by sum_A_At
+  call sum_A_At(tc_int_2e_ao(1,1,1,1), ao_num*ao_num)
+
+  call wall_time(time2)
+  print*, ' lower- and upper-triangle of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0
+  call print_memory_usage()
+
+  ! ---
+
+  call wall_time(time1)
+
+  ! Coulomb part
+  PROVIDE ao_integrals_map
+  !$OMP PARALLEL DEFAULT(NONE)                         &
+  !$OMP SHARED(ao_num, tc_int_2e_ao, ao_integrals_map) &
+  !$OMP PRIVATE(i, j, k, l)
+  !$OMP DO COLLAPSE(3)
+  do j = 1, ao_num
+    do l = 1, ao_num
+      do i = 1, ao_num
+        do k = 1, ao_num
+          !                                               < 1:i, 2:j | 1:k, 2:l >
+          tc_int_2e_ao(k,i,l,j) = tc_int_2e_ao(k,i,l,j) + get_ao_two_e_integral(i, j, k, l, ao_integrals_map)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  call wall_time(time2)
+  print*, ' wall time of Coulomb part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0
+  call print_memory_usage()
+
+  ! ---
+
+  ! Only the three gradient components are stored; component 4 is not written.
+  print*, ' Writing int2_grad1_u12_ao in ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao'
+  open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="write")
+  call ezfio_set_work_empty(.False.)
+    write(11) int2_grad1_u12_ao(:,:,:,1:3)
+  close(11)
+
+  ! One record per value of the last index
+  print*, ' Saving tc_int_2e_ao in ', trim(ezfio_filename) // '/work/ao_two_e_tc_tot'
+  open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/ao_two_e_tc_tot', action="write")
+  call ezfio_set_work_empty(.False.)
+  do i = 1, ao_num
+    write(11) tc_int_2e_ao(:,:,:,i)
+  enddo
+  close(11)
+
+  ! ----
+
+  deallocate(int2_grad1_u12_ao)
+  deallocate(tc_int_2e_ao)
+
+  call wall_time(time2)
+  print*, ' wall time for tc_int_2e_ao (min) = ', (time2-time_tc0) / 60.d0
+  call print_memory_usage()
+
+  ! ---
+
+  call wall_time(time1)
+  print*, ' wall time for TC-integrals (min) = ', (time1-time0) / 60.d0
+
+  return
+end
+
+! ---
+
--- a/plugins/local/tc_int/jast_grad_full.irp.f
+++ b/plugins/local/tc_int/jast_grad_full.irp.f
@ -0,0 +1,245 @@
+
+! ---
+
+subroutine get_grad1_u12_for_tc(ipoint, n_grid2, resx, resy, resz, res)
+
+  BEGIN_DOC
+  !
+  ! For r1 = final_grid_points(:,ipoint) and for each point jpoint = 1..n_grid2 of the r2 grid:
+  !
+  ! resx(jpoint) =      [grad1 u(r1,r2)]_x1
+  ! resy(jpoint) =      [grad1 u(r1,r2)]_y1
+  ! resz(jpoint) =      [grad1 u(r1,r2)]_z1
+  ! res (jpoint) = -0.5 [grad1 u(r1,r2)]^2
+  !
+  ! We use:
+  !       grid for r1
+  ! extra_grid for r2
+  !
+  END_DOC
+
+  include 'constants.include.F'
+
+  implicit none
+  integer,          intent(in)  :: ipoint, n_grid2
+  double precision, intent(out) :: resx(n_grid2), resy(n_grid2), resz(n_grid2), res(n_grid2)
+
+  integer                       :: jpoint
+  double precision              :: r1(3)
+
+  r1(1) = final_grid_points(1,ipoint)
+  r1(2) = final_grid_points(2,ipoint)
+  r1(3) = final_grid_points(3,ipoint)
+
+  ! The three gradient components are filled by grad1_j12_r1_seq
+  call grad1_j12_r1_seq(r1, n_grid2, resx, resy, resz)
+
+  ! res = -1/2 |grad1 u|^2, built from the components just computed
+  do jpoint = 1, n_grid2 ! r2
+    res(jpoint) = -0.5d0 * (resx(jpoint) * resx(jpoint) + resy(jpoint) * resy(jpoint) + resz(jpoint) * resz(jpoint))
+  enddo
+
+  return
+end
+
+! ---
+
+subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
+
+  BEGIN_DOC
+  !
+  ! For a fixed r1 and each of the first n_grid2 points r2 of final_grid_points_extra,
+  ! accumulates over the nuclei A and the terms p = 1..jBH_size the gradient with respect to r1 of
+  !
+  !   c * g12^o * (f1A^m * f2A^n + f1A^n * f2A^m)        (c is halved when m = n)
+  !
+  ! where (m, n, o, c) = (jBH_m, jBH_n, jBH_o, jBH_c)(p,A), and g12, f1A, f2A are the values
+  ! returned by jBH_elem_fct_grad_alpha1 for the pairs (r1,r2), (r1,rA), (r2,rA).
+  !
+  ! gradx(jpoint), grady(jpoint), gradz(jpoint): x, y, z components for the jpoint-th r2.
+  !
+  END_DOC
+
+  include 'constants.include.F'
+
+  implicit none
+  integer         , intent(in)  :: n_grid2
+  double precision, intent(in)  :: r1(3)
+  double precision, intent(out) :: gradx(n_grid2)
+  double precision, intent(out) :: grady(n_grid2)
+  double precision, intent(out) :: gradz(n_grid2)
+
+  integer                       :: jpoint, i_nucl, p, mpA, npA, opA
+  double precision              :: r2(3)
+  double precision              :: tmp, tmp1, tmp2
+  double precision              :: rn(3), f1A, grad1_f1A(3), f2A, grad2_f2A(3), g12, grad1_g12(3)
+  integer                       :: powmax1, powmax, powmax2
+  double precision, allocatable :: f1A_power(:), f2A_power(:), double_p(:), g12_power(:)
+
+  ! Largest exponents: powmax1 for f1A/f2A, powmax2 for g12
+  powmax1 = max(maxval(jBH_m), maxval(jBH_n))
+  powmax2 = maxval(jBH_o)
+  powmax  = max(powmax1, powmax2)
+
+  ! Index -1 exists so that x_power(e-1) is addressable when the exponent e is 0
+  allocate(f1A_power(-1:powmax), f2A_power(-1:powmax), g12_power(-1:powmax), double_p(0:powmax))
+
+  do p = 0, powmax
+    double_p(p) = dble(p)
+  enddo
+
+  f1A_power(-1) = 0.d0
+  f2A_power(-1) = 0.d0
+  g12_power(-1) = 0.d0
+
+  f1A_power(0) = 1.d0
+  f2A_power(0) = 1.d0
+  g12_power(0) = 1.d0
+
+  do jpoint = 1, n_grid2 ! r2
+
+    r2(1) = final_grid_points_extra(1,jpoint)
+    r2(2) = final_grid_points_extra(2,jpoint)
+    r2(3) = final_grid_points_extra(3,jpoint)
+
+    gradx(jpoint) = 0.d0
+    grady(jpoint) = 0.d0
+    gradz(jpoint) = 0.d0
+
+    call jBH_elem_fct_grad_alpha1(r1, r2, g12, grad1_g12)
+
+    do p = 1, powmax2
+      g12_power(p) = g12_power(p-1) * g12
+    enddo
+
+    do i_nucl = 1, nucl_num
+
+      rn(1) = nucl_coord(i_nucl,1)
+      rn(2) = nucl_coord(i_nucl,2)
+      rn(3) = nucl_coord(i_nucl,3)
+
+      ! NOTE(review): f1A and grad1_f1A depend only on r1 and the nucleus, not on r2;
+      ! they are recomputed for every jpoint here.
+      call jBH_elem_fct_grad_alpha1(r1, rn, f1A, grad1_f1A)
+      call jBH_elem_fct_grad_alpha1(r2, rn, f2A, grad2_f2A)
+
+      ! Compute powers of f1A and f2A
+      do p = 1, powmax1
+        f1A_power(p) = f1A_power(p-1) * f1A
+        f2A_power(p) = f2A_power(p-1) * f2A
+      enddo
+
+      do p = 1, jBH_size
+        mpA = jBH_m(p,i_nucl)
+        npA = jBH_n(p,i_nucl)
+        opA = jBH_o(p,i_nucl)
+        tmp = jBH_c(p,i_nucl)
+
+        ! The two symmetrised products coincide when m = n
+        if(mpA .eq. npA) then
+          tmp = tmp * 0.5d0
+        endif
+
+        ! tmp1 multiplies grad1 f1A (derivative through f1A),
+        ! tmp2 multiplies grad1 g12 (derivative through g12)
+        tmp1 = double_p(mpA) * f1A_power(mpA-1) * f2A_power(npA) + double_p(npA) * f1A_power(npA-1) * f2A_power(mpA)
+        tmp1 = tmp1 * g12_power(opA) * tmp
+        tmp2 = double_p(opA) * g12_power(opA-1) * (f1A_power(mpA) * f2A_power(npA) + f1A_power(npA) * f2A_power(mpA)) * tmp
+
+        gradx(jpoint) = gradx(jpoint) + tmp1 * grad1_f1A(1) + tmp2 * grad1_g12(1)
+        grady(jpoint) = grady(jpoint) + tmp1 * grad1_f1A(2) + tmp2 * grad1_g12(2)
+        gradz(jpoint) = gradz(jpoint) + tmp1 * grad1_f1A(3) + tmp2 * grad1_g12(3)
+      enddo ! p
+    enddo ! i_nucl
+  enddo ! jpoint
+
+  deallocate(f1A_power, f2A_power, g12_power, double_p)
+
+  return
+end
+
+subroutine jBH_elem_fct_grad_alpha1(r1, r2, fct, grad1_fct)
+
+  BEGIN_DOC
+  !
+  ! fct       = r12 / (1 + r12)                   with r12 = |r1 - r2|
+  ! grad1_fct = (r1 - r2) / (r12 * (1 + r12)^2)   i.e. the gradient of fct with respect to r1
+  !
+  ! Both are set to zero when r12^2 is not >= 1d-15.
+  !
+  END_DOC
+
+  implicit none
+  double precision, intent(in)  :: r1(3), r2(3)
+  double precision, intent(out) :: fct, grad1_fct(3)
+  double precision              :: dx, dy, dz, r12_sq, r12, inv_denom, scal
+
+  dx = r1(1) - r2(1)
+  dy = r1(2) - r2(2)
+  dz = r1(3) - r2(3)
+
+  r12_sq = dx * dx &
+         + dy * dy &
+         + dz * dz
+
+  ! Coincident (or numerically coincident) points: everything vanishes
+  if(.not. (r12_sq .ge. 1d-15)) then
+    fct          = 0.d0
+    grad1_fct(1) = 0.d0
+    grad1_fct(2) = 0.d0
+    grad1_fct(3) = 0.d0
+    return
+  endif
+
+  r12       = dsqrt( r12_sq )
+  inv_denom = 1.d0 / (1.d0 + r12)
+
+  fct  = r12 * inv_denom
+  scal = inv_denom * inv_denom / r12
+
+  grad1_fct(1) = scal * dx
+  grad1_fct(2) = scal * dy
+  grad1_fct(3) = scal * dz
+
+  return
+end
+
+! ---
--- a/plugins/local/tc_int/jast_utils_bh.irp.f
+++ b/plugins/local/tc_int/jast_utils_bh.irp.f
@ -0,0 +1,43 @@
+
+! ---
+
+
+
+subroutine jBH_elem_fct_grad(alpha, r1, r2, fct, grad1_fct)
+
+  BEGIN_DOC
+  !
+  ! fct       = alpha * r12 / (1 + alpha * r12)                 with r12 = |r1 - r2|
+  ! grad1_fct = alpha * (r1 - r2) / (r12 * (1 + alpha * r12)^2) i.e. the gradient of fct with respect to r1
+  !
+  ! Both are set to zero when r12^2 is not >= 1d-15.
+  !
+  END_DOC
+
+  implicit none
+  double precision, intent(in)  :: alpha, r1(3), r2(3)
+  double precision, intent(out) :: fct, grad1_fct(3)
+  double precision              :: dx, dy, dz, r12_sq, r12, r12_inv, inv_denom, scal
+
+  dx = r1(1) - r2(1)
+  dy = r1(2) - r2(2)
+  dz = r1(3) - r2(3)
+
+  r12_sq = dx * dx &
+         + dy * dy &
+         + dz * dz
+
+  ! Coincident (or numerically coincident) points: everything vanishes
+  if(.not. (r12_sq .ge. 1d-15)) then
+    fct          = 0.d0
+    grad1_fct(1) = 0.d0
+    grad1_fct(2) = 0.d0
+    grad1_fct(3) = 0.d0
+    return
+  endif
+
+  ! r12 is obtained as r12^2 / r12 so that its inverse is available for the gradient
+  r12_inv = 1.d0/dsqrt( r12_sq )
+  r12     = r12_inv * r12_sq
+
+  inv_denom = 1.d0 / (1.d0 + alpha * r12)
+
+  fct  = alpha * r12 * inv_denom
+  scal = alpha * inv_denom * inv_denom * r12_inv
+
+  grad1_fct(1) = scal * dx
+  grad1_fct(2) = scal * dy
+  grad1_fct(3) = scal * dz
+
+  return
+end
+
+! ---
+
--- a/plugins/local/tc_int/write_tc_int.irp.f
+++ b/plugins/local/tc_int/write_tc_int.irp.f
@ -0,0 +1,56 @@
+! ---
+
+program write_tc_int
+
+  ! Driver program: prints the j2e_type / j1e_type / env_type settings, sets the
+  ! radial and angular sizes of the two grids from tc_grid1_* and tc_grid2_*,
+  ! reports them, then hands over to main().
+
+  implicit none
+
+  print *, ' j2e_type = ', j2e_type
+  print *, ' j1e_type = ', j1e_type
+  print *, ' env_type = ', env_type
+
+  ! First grid: sizes taken from tc_grid1_r (radial) and tc_grid1_a (angular).
+  ! The values are assigned first and then touch'ed (IRPF90 keyword) so that
+  ! entities depending on them are rebuilt with the new values.
+  my_grid_becke  = .True.
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
+  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
+
+  ! Extra grid: same procedure with tc_grid2_r / tc_grid2_a.
+  my_extra_grid_becke  = .True.
+  PROVIDE tc_grid2_a tc_grid2_r
+  my_n_pt_r_extra_grid = tc_grid2_r
+  my_n_pt_a_extra_grid = tc_grid2_a
+  touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid
+
+  ! Report the grid sizes on unit 6
+  call write_int(6, my_n_pt_r_grid, 'radial  external grid over')
+  call write_int(6, my_n_pt_a_grid, 'angular external grid over')
+
+  call write_int(6, my_n_pt_r_extra_grid, 'radial  internal grid over')
+  call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over')
+
+  call main()
+
+end
+
+! ---
+
+subroutine main()
+
+  ! Runs provide_int2_grad1_u12_ao() only when io_tc_integ is "Write", then sets
+  ! io_tc_integ to 'Read' in the EZFIO database. For any other value of
+  ! io_tc_integ, the value is printed and the program stops.
+
+  implicit none
+
+  PROVIDE io_tc_integ
+
+  print*, 'io_tc_integ = ', io_tc_integ
+
+  if(io_tc_integ .eq. "Write") then
+
+    call provide_int2_grad1_u12_ao()
+
+    call ezfio_set_tc_keywords_io_tc_integ('Read')
+
+  else
+
+    print*, 'io_tc_integ != Write'
+    print*, io_tc_integ
+    stop
+
+  endif
+
+end
+
+! ---
+
--- a/src/ccsd/NEED
+++ b/src/ccsd/NEED
@ -1,2 +1,3 @@
+gpu
 hartree_fock
 utils_cc
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@ -1,4 +1,5 @@
 subroutine run_ccsd_space_orb
+  use gpu

  implicit none

@ -9,9 +10,18 @@ subroutine run_ccsd_space_orb
  double precision :: uncorr_energy,energy, max_elem, max_r, max_r1, max_r2,ta,tb
  logical :: not_converged

-  double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:), tau(:,:,:,:), tau_x(:,:,:,:)
-  double precision, allocatable :: t1(:,:), r1(:,:)
-  double precision, allocatable :: H_oo(:,:), H_vv(:,:), H_vo(:,:)
+  type(gpu_double4) :: t2, r2, tau, tau_x
+  type(gpu_double2) :: t1, r1
+  type(gpu_double2) :: H_oo, H_vv, H_vo
+
+  type(gpu_double2) :: d_cc_space_f_oo, d_cc_space_f_vo
+  type(gpu_double2) :: d_cc_space_f_ov, d_cc_space_f_vv
+
+  type(gpu_double3) :: d_cc_space_v_oo_chol, d_cc_space_v_vo_chol
+  type(gpu_double3) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol
+
+  type(gpu_double4) :: d_cc_space_v_oovv
+

  double precision, allocatable :: all_err(:,:), all_t(:,:)
  integer, allocatable          :: list_occ(:), list_vir(:)
@ -20,7 +30,7 @@ subroutine run_ccsd_space_orb

  call set_multiple_levels_omp(.False.)

-  if (do_ao_cholesky) then
+  if (do_mo_cholesky) then
    PROVIDE cholesky_mo_transp
    FREE cholesky_ao
  else
@ -51,11 +61,46 @@ subroutine run_ccsd_space_orb
  !print*,'occ',list_occ
  !print*,'vir',list_vir

-  allocate(t2(nO,nO,nV,nV), r2(nO,nO,nV,nV))
-  allocate(tau(nO,nO,nV,nV))
-  allocate(tau_x(nO,nO,nV,nV))
-  allocate(t1(nO,nV), r1(nO,nV))
-  allocate(H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO))
+  ! GPU arrays
+  call gpu_allocate(d_cc_space_f_oo, nO, nO)
+  call gpu_allocate(d_cc_space_f_vo, nV, nO)
+  call gpu_allocate(d_cc_space_f_ov, nO, nV)
+  call gpu_allocate(d_cc_space_f_vv, nV, nV)
+
+  call gpu_upload(cc_space_f_oo, d_cc_space_f_oo)
+  call gpu_upload(cc_space_f_vo, d_cc_space_f_vo)
+  call gpu_upload(cc_space_f_vv, d_cc_space_f_vv)
+
+!  FREE cc_space_f_oo
+!  FREE cc_space_f_vo
+!  FREE cc_space_f_vv
+
+  if (do_mo_cholesky) then
+    call gpu_allocate(d_cc_space_v_oo_chol, cholesky_mo_num, nO, nO)
+    call gpu_allocate(d_cc_space_v_ov_chol, cholesky_mo_num, nO, nV)
+    call gpu_allocate(d_cc_space_v_vo_chol, cholesky_mo_num, nV, nO)
+    call gpu_allocate(d_cc_space_v_vv_chol, cholesky_mo_num, nV, nV)
+
+    call gpu_upload(cc_space_v_oo_chol, d_cc_space_v_oo_chol)
+    call gpu_upload(cc_space_v_ov_chol, d_cc_space_v_ov_chol)
+    call gpu_upload(cc_space_v_vo_chol, d_cc_space_v_vo_chol)
+    call gpu_upload(cc_space_v_vv_chol, d_cc_space_v_vv_chol)
+
+!    FREE cc_space_v_oo_chol
+!    FREE cc_space_v_ov_chol
+!    FREE cc_space_v_vo_chol
+!    FREE cc_space_v_vv_chol
+  endif
+
+  call gpu_allocate(t2, nO,nO,nV,nV)
+  call gpu_allocate(r2, nO,nO,nV,nV)
+  call gpu_allocate(tau, nO,nO,nV,nV)
+  call gpu_allocate(tau_x, nO,nO,nV,nV)
+  call gpu_allocate(t1, nO,nV)
+  call gpu_allocate(r1, nO,nV)
+  call gpu_allocate(H_oo, nO, nO)
+  call gpu_allocate(H_vo, nV, nO)
+  call gpu_allocate(H_vv, nV, nV)

  if (cc_update_method == 'diis') then
    double precision :: rss, diis_mem, extra_mem
@ -97,14 +142,29 @@ subroutine run_ccsd_space_orb
  endif

  ! Init
-  call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,t1)
-  call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,t2)
-  call update_tau_space(nO,nV,t1,t2,tau)
+  double precision, allocatable :: h_t1(:,:), h_t2(:,:,:,:)
+  allocate(h_t1(nO,nV), h_t2(nO,nO,nV,nV))
+
+  call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,h_t1)
+  call gpu_upload(h_t1, t1)
+
+  call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,h_t2)
+  call gpu_upload(h_t2, t2)
+
+
+  call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV)
+  call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv)
+
+!  FREE cc_space_v_oovv
+
+
+  call update_tau_space(nO,nV,h_t1,t1,t2,tau)
  call update_tau_x_space(nO,nV,tau,tau_x)
  !print*,'hf_energy', hf_energy
  call det_energy(det,uncorr_energy)
  print*,'Det energy', uncorr_energy
-  call ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
+
+  call ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy)
  print*,'Guess energy', uncorr_energy+energy, energy

  nb_iter = 0
@ -120,43 +180,40 @@ subroutine run_ccsd_space_orb
  do while (not_converged)

    ! Residue
-    if (do_ao_cholesky) then
-!    if (.False.) then
-      call compute_H_oo_chol(nO,nV,tau_x,H_oo)
-      call compute_H_vv_chol(nO,nV,tau_x,H_vv)
-      call compute_H_vo_chol(nO,nV,t1,H_vo)
+    if (do_mo_cholesky) then
+      call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo)
+      call compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, d_cc_space_v_ov_chol,H_vv)
+      call compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol, H_vo)

-      call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
-      call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
+      call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1)
+      call compute_r2_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r2%f,max_r2)
    else
-      call compute_H_oo(nO,nV,t1,t2,tau,H_oo)
-      call compute_H_vv(nO,nV,t1,t2,tau,H_vv)
-      call compute_H_vo(nO,nV,t1,t2,H_vo)
+      call compute_H_oo(nO,nV,t1%f,t2%f,tau%f,H_oo%f)
+      call compute_H_vv(nO,nV,t1%f,t2%f,tau%f,H_vv%f)
+      call compute_H_vo(nO,nV,t1%f,t2%f,H_vo%f)

-      call compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
-      call compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
+      call compute_r1_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r1%f,max_r1)
+      call compute_r2_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r2%f,max_r2)
    endif
    max_r = max(max_r1,max_r2)

    ! Update
    if (cc_update_method == 'diis') then
-      !call update_t_ccsd(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2)
-      !call update_t_ccsd_diis(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2)
-      call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1,r2,t1,t2,all_err,all_t)
+      call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1%f,r2%f,t1%f,t2%f,all_err,all_t)

    ! Standard update as T = T - Delta
    elseif (cc_update_method == 'none') then
-      call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1,t1)
-      call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2,t2)
+      call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1%f,t1%f)
+      call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2%f,t2%f)
    else
      print*,'Unkown cc_method_method: '//cc_update_method
    endif

-    call update_tau_space(nO,nV,t1,t2,tau)
+    call update_tau_space(nO,nV,t1%f,t1,t2,tau)
    call update_tau_x_space(nO,nV,tau,tau_x)

    ! Energy
-    call ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
+    call ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy)
    write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |'

    nb_iter = nb_iter + 1
@ -181,8 +238,8 @@ subroutine run_ccsd_space_orb
  print*,''

  if (write_amplitudes) then
-    call write_t1(nO,nV,t1)
-    call write_t2(nO,nV,t2)
+    call write_t1(nO,nV,t1%f)
+    call write_t2(nO,nV,t2%f)
    call ezfio_set_utils_cc_io_amplitudes('Read')
  endif

@ -191,7 +248,14 @@ subroutine run_ccsd_space_orb
    deallocate(all_err,all_t)
  endif

-  deallocate(H_vv,H_oo,H_vo,r1,r2,tau)
+  call gpu_deallocate(H_oo)
+  call gpu_deallocate(H_vv)
+  call gpu_deallocate(H_vo)
+
+  call gpu_deallocate(r1)
+  call gpu_deallocate(r2)
+  call gpu_deallocate(tau)
+  call gpu_deallocate(tau_x)

  ! CCSD(T)
  double precision :: e_t, e_t_err
@ -199,28 +263,14 @@ subroutine run_ccsd_space_orb

  if (cc_par_t .and. elec_alpha_num + elec_beta_num > 2) then

-    ! Dumb way
-    !call wall_time(ta)
-    !call ccsd_par_t_space(nO,nV,t1,t2,e_t)
-    !call wall_time(tb)
-    !print*,'Time: ',tb-ta, ' s'
-
-    !print*,''
-    !write(*,'(A15,F18.12,A3)') ' E(CCSD(T))  = ', uncorr_energy + energy + e_t, ' Ha'
-    !write(*,'(A15,F18.12,A3)') ' E(T)        = ', e_t, ' Ha'
-    !write(*,'(A15,F18.12,A3)') ' Correlation = ', energy + e_t, ' Ha'
-    !print*,''
-
    ! New
    e_t = uncorr_energy + energy ! For print in (T) call
    e_t_err = 0.d0

    print*,'Computing (T) correction...'
    call wall_time(ta)
-!    call ccsd_par_t_space_v3(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v &
-!         ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t)

-    call ccsd_par_t_space_stoch(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v &
+    call ccsd_par_t_space_stoch(nO,nV,t1%f,t2%f,cc_space_f_o,cc_space_f_v &
         ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t, e_t_err)

    call wall_time(tb)
@ -235,168 +285,147 @@ subroutine run_ccsd_space_orb

  call save_energy(uncorr_energy + energy, e_t)

-  deallocate(t1,t2)
+  deallocate(h_t1, h_t2)
+  if (do_mo_cholesky) then
+    call gpu_deallocate(d_cc_space_v_oo_chol)
+    call gpu_deallocate(d_cc_space_v_ov_chol)
+    call gpu_deallocate(d_cc_space_v_vo_chol)
+    call gpu_deallocate(d_cc_space_v_vv_chol)
+  endif
+  call gpu_deallocate(d_cc_space_f_vo)
+  call gpu_deallocate(d_cc_space_v_oovv)
+  call gpu_deallocate(t1)
+  call gpu_deallocate(t2)

 end

 ! Energy

-subroutine ccsd_energy_space(nO,nV,tau,t1,energy)
-
+subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy)
+  ! CCSD correlation energy from GPU-held arrays:
+  !   energy = dot(tau_x, d_cc_space_v_oovv) + 2 * dot(d_cc_space_f_vo, t1)
+  ! Both dot products treat their operands as flat vectors.
+  !
+  ! nO, nV            : number of occupied / virtual orbitals
+  ! d_cc_space_v_oovv : integrals, nO*nO*nV*nV elements
+  ! d_cc_space_f_vo   : Fock block, nO*nV elements
+  ! tau_x, t1         : amplitudes
+  ! energy            : result (host scalar)
+  !
+  ! NOTE(review): the CPU loop this replaces contracted cc_space_f_vo(a,i)
+  ! with t1(i,a); a flat dot product only reproduces that if both operands
+  ! share the same memory layout - confirm how the caller uploads
+  ! d_cc_space_f_vo.
+  use gpu
  implicit none

-  integer, intent(in)           :: nO, nV
-  double precision, intent(in)  :: tau(nO,nO,nV,nV)
-  double precision, intent(in)  :: t1(nO,nV)
-  double precision, intent(out) :: energy
+  integer, intent(in)            :: nO, nV
+  type(gpu_double4), intent(in)  :: tau_x, d_cc_space_v_oovv
+  type(gpu_double2), intent(in)  :: t1, d_cc_space_f_vo
+  double precision, intent(out)  :: energy

  ! internal
  integer :: i,j,a,b
  double precision :: e

-  energy = 0d0
-  !$omp parallel &
-  !$omp shared(nO,nV,energy,tau,t1,&
-  !$omp cc_space_f_vo,cc_space_w_oovv) &
-  !$omp private(i,j,a,b,e) &
-  !$omp default(none)
-  e = 0d0
-  !$omp do
-  do a = 1, nV
-    do i = 1, nO
-      e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a)
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp do
-  do b = 1, nV
-    do a = 1, nV
-      do j = 1, nO
-        do i = 1, nO
-          e = e + tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b)
-       enddo
-      enddo
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp critical
-  energy = energy + e
-  !$omp end critical
-  !$omp end parallel
+  type(gpu_stream) :: s1, s2
+  call gpu_stream_create(s1)
+  call gpu_stream_create(s2)

-end
+  ! One-body term on stream s1; the factor 2 is applied at the end.
+  call gpu_set_stream(blas_handle,s1)
+  call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo, 1, t1, 1, e)

-subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
+  ! Two-body term on stream s2. Each extent is widened to 64-bit before
+  ! multiplying: nO*nO*nV*nV overflows a default integer for large systems.
+  call gpu_set_stream(blas_handle,s2)
+  call gpu_ddot_64(blas_handle, &
+       int(nO,8)*int(nO,8)*int(nV,8)*int(nV,8), &
+       tau_x, 1_8, d_cc_space_v_oovv, 1_8, energy)
+  call gpu_set_stream(blas_handle,gpu_default_stream)

-  implicit none
+  call gpu_synchronize()
+  call gpu_stream_destroy(s1)
+  call gpu_stream_destroy(s2)

-  integer, intent(in)           :: nO, nV
-  double precision, intent(in)  :: tau_x(nO,nO,nV,nV)
-  double precision, intent(in)  :: t1(nO,nV)
-  double precision, intent(out) :: energy
-
-  ! internal
-  integer :: i,j,a,b
-  double precision :: e
-
-  energy = 0d0
-  !$omp parallel &
-  !$omp shared(nO,nV,energy,tau_x,t1,&
-  !$omp cc_space_f_vo,cc_space_v_oovv) &
-  !$omp private(i,j,a,b,e) &
-  !$omp default(none)
-  e = 0d0
-  !$omp do
-  do a = 1, nV
-    do i = 1, nO
-      e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a)
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp do
-  do b = 1, nV
-    do a = 1, nV
-      do j = 1, nO
-        do i = 1, nO
-          e = e + tau_x(i,j,a,b) * cc_space_v_oovv(i,j,a,b)
-       enddo
-      enddo
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp critical
-  energy = energy + e
-  !$omp end critical
-  !$omp end parallel
+   energy = energy + 2.d0*e

 end

 ! Tau

-subroutine update_tau_space(nO,nV,t1,t2,tau)
-
+subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau)
+  ! Build tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b) with one geam call
+  ! per (j,b) pair.
+  !
+  ! h_t1 : host copy of t1, read for the scalar factor t1(j,b)
+  ! t1   : t1 amplitudes as a gpu_double2 (passed to gpu_dgeam_f via t1%f)
+  ! t2   : t2 amplitudes as a gpu_double4
+  ! tau  : output, written through tau%f
+  !
+  ! NOTE(review): blas_handle is shared by all OpenMP threads and each one
+  ! calls gpu_set_stream on it, so a geam call may be queued on a stream set
+  ! by another thread. The final gpu_synchronize still waits for all of them
+  ! - confirm this is the intended use of the handle.
+  use gpu
  implicit none

  ! in
  integer, intent(in)           :: nO, nV
-  double precision, intent(in)  :: t1(nO,nV), t2(nO,nO,nV,nV)
+  double precision, intent(in)  :: h_t1(nO,nV)
+  type(gpu_double2), intent(in) :: t1
+  type(gpu_double4), intent(in) :: t2

  ! out
-  double precision, intent(out) :: tau(nO,nO,nV,nV)
+  type(gpu_double4) :: tau

  ! internal
  integer                       :: i,j,a,b

+  ! One stream per value of b, created inside the parallel loop.
+  type(gpu_stream) :: stream(nV)
+
  !$OMP PARALLEL &
-  !$OMP SHARED(nO,nV,tau,t2,t1) &
+  !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) &
  !$OMP PRIVATE(i,j,a,b) &
  !$OMP DEFAULT(NONE)
  !$OMP DO
-  do b = 1, nV
-    do a = 1, nV
-      do j = 1, nO
-        do i = 1, nO
-          tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b)
-        enddo
-      enddo
+  do b=1,nV
+    call gpu_stream_create(stream(b))
+    call gpu_set_stream(blas_handle,stream(b))
+    do j=1,nO
+      ! For fixed (j,b): the (i,a) slice of t2, addressed with leading
+      ! dimension nO*nO, plus h_t1(j,b) times the whole t1 matrix.
+      call gpu_dgeam_f(blas_handle, 'N', 'N', nO, nV, &
+         1.d0, t2%f(1,j,1,b), nO*nO, &
+         h_t1(j,b), t1%f, nO, &
+         tau%f(1,j,1,b), nO*nO)
    enddo
  enddo
  !$OMP END DO
  !$OMP END PARALLEL

+  ! Wait for the queued work before the streams are destroyed.
+  call gpu_synchronize()
+
+  do b=1,nV
+    call gpu_stream_destroy(stream(b))
+  enddo
+  call gpu_set_stream(blas_handle,gpu_default_stream)
+
+
 end

 subroutine update_tau_x_space(nO,nV,tau,tau_x)
-
+  ! Build tau_x(i,j,a,b) = 2*tau(i,j,a,b) - tau(i,j,b,a) with one geam call
+  ! per (a,b) pair, each acting on an nO x nO slice.
+  !
+  ! tau   : input gpu_double4
+  ! tau_x : output gpu_double4, written through tau_x%f
+  !
+  ! NOTE(review): blas_handle is shared by all OpenMP threads while each one
+  ! calls gpu_set_stream on it; see whether a per-thread handle is preferable.
+  use gpu
  implicit none

  ! in
-  integer, intent(in)           :: nO, nV
-  double precision, intent(in)  :: tau(nO,nO,nV,nV)
+  integer, intent(in)         :: nO, nV
+  type(gpu_double4), intent(in)  :: tau

  ! out
-  double precision, intent(out) :: tau_x(nO,nO,nV,nV)
+  type(gpu_double4) :: tau_x

  ! internal
  integer                       :: i,j,a,b

+  ! One stream per value of a, created before the parallel region.
+  type(gpu_stream) :: stream(nV)
+
+  do a=1,nV
+    call gpu_stream_create(stream(a))
+  enddo
+
  !$OMP PARALLEL &
-  !$OMP SHARED(nO,nV,tau,tau_x) &
-  !$OMP PRIVATE(i,j,a,b) &
+  !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) &
+  !$OMP PRIVATE(a,b) &
  !$OMP DEFAULT(NONE)
  !$OMP DO
-  do b = 1, nV
-    do a = 1, nV
-      do j = 1, nO
-        do i = 1, nO
-          tau_x(i,j,a,b) = 2.d0*tau(i,j,a,b) - tau(i,j,b,a)
-        enddo
-      enddo
+  do b=1,nV
+    do a=1,nV
+      call gpu_set_stream(blas_handle,stream(a))
+      ! 2 * tau(:,:,a,b) - tau(:,:,b,a) -> tau_x(:,:,a,b)
+      call gpu_dgeam_f(blas_handle, 'N', 'N', nO, nO, &
+          2.d0, tau%f(1,1,a,b), nO, &
+         -1.d0, tau%f(1,1,b,a), nO, &
+         tau_x%f(1,1,a,b), nO)
    enddo
  enddo
  !$OMP END DO
  !$OMP END PARALLEL

+  ! Restore the default stream, then wait before destroying the streams.
+  call gpu_set_stream(blas_handle,gpu_default_stream)
+  call gpu_synchronize()
+
+  do b=1,nV
+    call gpu_stream_destroy(stream(b))
+  enddo
+
+
 end

 ! R1
--- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
@ -1,81 +1,200 @@
-subroutine ccsd_energy_space_chol(nO,nV,tau,t1,energy)
+! H_oo

+subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, &
+    d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo)
+  ! H_oo intermediate from Cholesky vectors:
+  !   tau_kau(k,a,u) = sum_{j,b} L_ov(k,j,b) * tau_x(u,j,a,b)
+  !   H_oo(u,i)      = f_oo(u,i) + sum_{k,a} tau_kau(k,a,u) * L_vo(k,a,i)
+  ! where L_ov / L_vo are d_cc_space_v_ov_chol / d_cc_space_v_vo_chol.
+  !
+  ! H_oo must already be allocated by the caller; only its contents are
+  ! overwritten.
+  use gpu
  implicit none

-  integer, intent(in)           :: nO, nV
-  double precision, intent(in)  :: tau(nO,nO,nV,nV)
-  double precision, intent(in)  :: t1(nO,nV)
-  double precision, intent(out) :: energy
+  integer, intent(in)           :: nO,nV
+  type(gpu_double2), intent(in)    :: d_cc_space_f_oo
+  type(gpu_double3), intent(in)    :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol
+  type(gpu_double4), intent(in)    :: tau_x
+  ! intent(inout), not intent(out): the pointer components H_oo%c / H_oo%f
+  ! set by the caller are read here, and intent(out) would make them
+  ! undefined on entry.
+  type(gpu_double2), intent(inout) :: H_oo

-  ! internal
-  integer :: i,j,a,b
-  double precision :: e
+  integer :: a,b,i,j,u,k

-  energy = 0d0
-  !$omp parallel &
-  !$omp shared(nO,nV,energy,tau,t1,&
-  !$omp cc_space_f_vo,cc_space_w_oovv) &
-  !$omp private(i,j,a,b,e) &
-  !$omp default(none)
-  e = 0d0
-  !$omp do
-  do a = 1, nV
-    do i = 1, nO
-      e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a)
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp do
-  do b = 1, nV
-    do a = 1, nV
-      do j = 1, nO
-        do i = 1, nO
-          e = e + tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b)
-       enddo
-      enddo
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp critical
-  energy = energy + e
-  !$omp end critical
-  !$omp end parallel
+  type(gpu_double3) :: tau_kau, tmp_vov, tmp_ovv

-end
+  call gpu_allocate(tau_kau, cholesky_mo_num, nV, nO)

-! Tau
+  type(gpu_blas) :: blas

-subroutine update_tau_space_chol(nO,nV,t1,t2,tau)

-  implicit none
+  !$OMP PARALLEL  &
+  !$OMP DEFAULT(SHARED) &
+  !$OMP PRIVATE(blas,u,b,tmp_vov,tmp_ovv)

-  ! in
-  integer, intent(in)           :: nO, nV
-  double precision, intent(in)  :: t1(nO,nV), t2(nO,nO,nV,nV)
+  ! H_oo <- f_oo, done once as a task alongside the loop below.
+  !$OMP SINGLE
+  !$OMP TASK
+  call gpu_copy(d_cc_space_f_oo, H_oo)
+  !$OMP END TASK
+  !$OMP END SINGLE

-  ! out
-  double precision, intent(out) :: tau(nO,nO,nV,nV)
+  ! Per-thread scratch buffers and BLAS handle.
+  call gpu_allocate(tmp_ovv, nO, nV, nV)
+  call gpu_allocate(tmp_vov, nV, nO, nV)

-  ! internal
-  integer                       :: i,j,a,b
+  call gpu_blas_create(blas)

-  !$OMP PARALLEL &
-  !$OMP SHARED(nO,nV,tau,t2,t1) &
-  !$OMP PRIVATE(i,j,a,b) &
-  !$OMP DEFAULT(NONE)
  !$OMP DO
-  do b = 1, nV
-    do a = 1, nV
-      do j = 1, nO
-        do i = 1, nO
-          tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b)
-        enddo
-      enddo
+  do u=1,nO
+    ! tmp_ovv(j,a,b) = tau_x(u,j,a,b): row u of tau_x seen as (nO, nO*nV*nV)
+    call gpu_dgeam_f(blas, 'N', 'N', 1, nO*nV*nV, 1.d0, &
+           tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f, nO, tmp_ovv%f, 1)
+    do b=1,nV
+      ! tmp_vov(a,j,b) = tmp_ovv(j,a,b): transpose each (nO,nV) slice
+      call gpu_dgeam_f(blas, 'T', 'T', nV, nO, 1.d0, &
+           tmp_ovv%f(1,1,b), nO, 0.d0, &
+           tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV)
    enddo
+    call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
+      d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_vov%f, nV, &
+      0.d0, tau_kau%f(1,1,u), cholesky_mo_num)
  enddo
  !$OMP END DO
+
+  call gpu_blas_destroy(blas)
+
+  call gpu_deallocate(tmp_vov)
+  call gpu_deallocate(tmp_ovv)
+
+  !$OMP TASKWAIT
  !$OMP END PARALLEL

+  ! H_oo += tau_kau^T * L_vo, contracting over (k,a)
+  call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
+    tau_kau, cholesky_mo_num*nV,  d_cc_space_v_vo_chol, cholesky_mo_num*nV, &
+    1.d0, H_oo, nO)
+
+  call gpu_synchronize()
+  call gpu_deallocate(tau_kau)
+end
+
+! H_vv
+
+subroutine compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, &
+         d_cc_space_v_ov_chol,H_vv)
+  ! H_vv intermediate from Cholesky vectors:
+  !   tau_kia(k,i,a) = sum_{j,b} L_ov(k,j,b) * tau_x(i,j,a,b)
+  !   H_vv(a,beta)   = f_vv(a,beta) - sum_{k,i} tau_kia(k,i,a) * L_ov(k,i,beta)
+  ! where L_ov is d_cc_space_v_ov_chol.
+  !
+  ! H_vv must already be allocated by the caller; only its contents are
+  ! overwritten.
+  use gpu
+  implicit none
+
+  integer, intent(in)              :: nO,nV
+  type(gpu_double2), intent(in)    :: d_cc_space_f_vv
+  type(gpu_double3), intent(in)    :: d_cc_space_v_ov_chol
+  type(gpu_double4), intent(in)    :: tau_x
+  ! intent(inout), not intent(out): the pointer components H_vv%c / H_vv%f
+  ! set by the caller are read here, and intent(out) would make them
+  ! undefined on entry.
+  type(gpu_double2), intent(inout) :: H_vv
+
+  integer :: a,b
+
+  type(gpu_double3) :: tau_kia, tmp_oov
+  type(gpu_blas) :: blas
+
+  call gpu_allocate(tau_kia, cholesky_mo_num, nO, nV)
+
+  !$OMP PARALLEL &
+  !$OMP DEFAULT(SHARED) &
+  !$OMP PRIVATE(a,b,tmp_oov,blas)
+
+  ! H_vv <- f_vv, done once as a task alongside the loop below.
+  !$OMP SINGLE
+  !$OMP TASK
+  call gpu_copy(d_cc_space_f_vv, H_vv)
+  !$OMP END TASK
+  !$OMP END SINGLE
+
+  ! Per-thread BLAS handle and scratch buffer.
+  call gpu_blas_create(blas)
+  call gpu_allocate(tmp_oov, nO, nO, nV)
+
+  !$OMP DO
+  do a = 1, nV
+    do b=1,nV
+      ! tmp_oov(:,:,b) = tau_x(:,:,a,b) (geam with beta = 0 acts as a copy)
+      call gpu_dgeam_f(blas, 'N', 'N', nO, nO, 1.d0, &
+        tau_x%f(1,1,a,b), nO, 0.d0, &
+        tau_x%f(1,1,a,b), nO, tmp_oov%f(1,1,b), nO)
+    enddo
+    call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nO,nO*nV,1.d0, &
+      d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov%f, nO, &
+      0.d0, tau_kia%f(1,1,a), cholesky_mo_num)
+  enddo
+  !$OMP END DO
+
+  call gpu_blas_destroy(blas)
+
+  call gpu_deallocate(tmp_oov)
+  !$OMP TASKWAIT
+  !$OMP END PARALLEL
+
+  ! H_vv -= tau_kia^T * L_ov, contracting over (k,i)
+  call gpu_dgemm(blas_handle,'T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
+    tau_kia, cholesky_mo_num*nO,  d_cc_space_v_ov_chol, cholesky_mo_num*nO, &
+    1.d0, H_vv, nV)
+
+  call gpu_synchronize()
+  call gpu_deallocate(tau_kia)
+end
+
+! H_vo
+subroutine compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, &
+         d_cc_space_v_ov_chol,d_cc_space_v_vo_chol, H_vo)
+  ! H_vo intermediate from Cholesky vectors, in three steps:
+  !   1. H_vo  <- f_vo
+  !   2. H_vo  += L_vo^T * tmp_k,  tmp_k(k) = 2 * sum_{j,b} L_ov(k,j,b) * t1(j,b)
+  !   3. H_vo(a,i) -= sum_{k,j} L_ov(k,j,a) * tmp2(k,j,i),
+  !      tmp2(k,j,i) = tmp(k,i,j) = sum_b L_ov(k,i,b) * t1(j,b)
+  ! where L_ov / L_vo are d_cc_space_v_ov_chol / d_cc_space_v_vo_chol.
+  !
+  ! H_vo must already be allocated by the caller; only its contents are
+  ! overwritten.
+  use gpu
+  implicit none
+
+  integer, intent(in)            :: nO,nV
+  type(gpu_double2), intent(in)  :: t1, d_cc_space_f_vo
+  type(gpu_double3), intent(in)  :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol
+  ! intent(inout), not intent(out): the pointer components H_vo%c / H_vo%f
+  ! set by the caller are read here, and intent(out) would make them
+  ! undefined on entry.
+  type(gpu_double2), intent(inout) :: H_vo
+
+  integer :: i,j
+
+  type(gpu_double1) :: tmp_k
+  type(gpu_double3) :: tmp, tmp2
+  type(gpu_stream) :: stream(nO)
+
+  call gpu_copy(d_cc_space_f_vo, H_vo)
+
+  call gpu_allocate(tmp_k, cholesky_mo_num)
+
+  call gpu_dgemm(blas_handle, 'N', 'N', cholesky_mo_num, 1, nO*nV, 2.d0, &
+     d_cc_space_v_ov_chol, cholesky_mo_num, &
+     t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num)
+
+  call gpu_dgemm(blas_handle, 'T','N',nV*nO,1,cholesky_mo_num,1.d0, &
+      d_cc_space_v_vo_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, &
+      H_vo, nV*nO)
+
+  call gpu_deallocate(tmp_k)
+
+
+  call gpu_allocate(tmp,  cholesky_mo_num, nO, nO)
+
+  call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nO, nO, nV, 1.d0, &
+    d_cc_space_v_ov_chol, cholesky_mo_num*nO, t1, nO, 0.d0, tmp, cholesky_mo_num*nO)
+
+  call gpu_allocate(tmp2, cholesky_mo_num, nO, nO)
+
+  do i=1,nO
+    call gpu_stream_create(stream(i))
+  enddo
+
+  ! tmp2(:,j,i) = tmp(:,i,j): swap the two occupied indices column by column.
+  !$OMP PARALLEL DO COLLAPSE(2) PRIVATE(i,j)
+  do i=1,nO
+    do j=1,nO
+      call gpu_set_stream(blas_handle,stream(j))
+      call gpu_dgeam_f(blas_handle, 'N', 'N', cholesky_mo_num, 1, 1.d0, &
+        tmp%f(1,i,j), cholesky_mo_num, 0.d0, &
+        tmp%f(1,i,j), cholesky_mo_num, tmp2%f(1,j,i), cholesky_mo_num)
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+  ! Restore the default stream, then wait before releasing streams and tmp.
+  call gpu_set_stream(blas_handle,gpu_default_stream)
+  call gpu_synchronize()
+
+  do i=1,nO
+    call gpu_stream_destroy(stream(i))
+  enddo
+  call gpu_deallocate(tmp)
+
+  call gpu_dgemm(blas_handle, 'T','N', nV, nO, cholesky_mo_num*nO, -1.d0, &
+    d_cc_space_v_ov_chol, cholesky_mo_num*nO, tmp2, cholesky_mo_num*nO, &
+    1.d0, H_vo, nV)
+
+  call gpu_synchronize()
+  call gpu_deallocate(tmp2)
 end

 ! R1
@ -291,154 +410,6 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)

 end

-! H_oo
-
-subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo)
-
-  implicit none
-
-  integer, intent(in)           :: nO,nV
-  double precision, intent(in)  :: tau_x(nO, nO, nV, nV)
-  double precision, intent(out) :: H_oo(nO, nO)
-
-  integer :: a,b,i,j,u,k
-
-  double precision, allocatable :: tau_kau(:,:,:), tmp_vov(:,:,:)
-
-  allocate(tau_kau(cholesky_mo_num,nV,nO))
-  !$omp parallel &
-  !$omp default(shared) &
-  !$omp private(i,u,j,k,a,b,tmp_vov)
-  allocate(tmp_vov(nV,nO,nV) )
-  !$omp do
-  do u = 1, nO
-    do b=1,nV
-      do j=1,nO
-        do a=1,nV
-          tmp_vov(a,j,b) = tau_x(u,j,a,b)
-        enddo
-      enddo
-    enddo
-    call dgemm('N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
-      cc_space_v_ov_chol, cholesky_mo_num, tmp_vov, nV, &
-      0.d0, tau_kau(1,1,u), cholesky_mo_num)
-  enddo
-  !$omp end do nowait
-  deallocate(tmp_vov)
-  !$omp do
-  do i = 1, nO
-    do u = 1, nO
-      H_oo(u,i) = cc_space_f_oo(u,i)
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp barrier
-  !$omp end  parallel
-  call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
-    tau_kau, cholesky_mo_num*nV,  cc_space_v_vo_chol, cholesky_mo_num*nV, &
-    1.d0, H_oo, nO)
-
-end
-
-! H_vv
-
-subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv)
-
-  implicit none
-
-  integer, intent(in)           :: nO,nV
-  double precision, intent(in)  :: tau_x(nO, nO, nV, nV)
-  double precision, intent(out) :: H_vv(nV, nV)
-
-  integer :: a,b,i,j,u,k, beta
-
-  double precision, allocatable :: tau_kia(:,:,:), tmp_oov(:,:,:)
-
-  allocate(tau_kia(cholesky_mo_num,nO,nV))
-  !$omp parallel &
-  !$omp default(shared) &
-  !$omp private(i,beta,j,k,a,b,tmp_oov)
-  allocate(tmp_oov(nO,nO,nV) )
-  !$omp do
-  do a = 1, nV
-    do b=1,nV
-      do j=1,nO
-        do i=1,nO
-          tmp_oov(i,j,b) = tau_x(i,j,a,b)
-        enddo
-      enddo
-    enddo
-    call dgemm('N','T',cholesky_mo_num,nO,nO*nV,1.d0, &
-      cc_space_v_ov_chol, cholesky_mo_num, tmp_oov, nO, &
-      0.d0, tau_kia(1,1,a), cholesky_mo_num)
-  enddo
-  !$omp end do nowait
-  deallocate(tmp_oov)
-
-  !$omp do
-  do beta = 1, nV
-    do a = 1, nV
-      H_vv(a,beta) = cc_space_f_vv(a,beta)
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp barrier
-  !$omp end  parallel
-  call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
-    tau_kia, cholesky_mo_num*nO,  cc_space_v_ov_chol, cholesky_mo_num*nO, &
-    1.d0, H_vv, nV)
-
-end
-
-! H_vo
-subroutine compute_H_vo_chol(nO,nV,t1,H_vo)
-
-  implicit none
-
-  integer, intent(in)           :: nO,nV
-  double precision, intent(in)  :: t1(nO, nV)
-  double precision, intent(out) :: H_vo(nV, nO)
-
-  integer :: a,b,i,j,u,k
-
-  double precision, allocatable :: tmp_k(:), tmp(:,:,:), tmp2(:,:,:)
-  do i=1,nO
-    do a=1,nV
-      H_vo(a,i) = cc_space_f_vo(a,i)
-    enddo
-  enddo
-
-  allocate(tmp_k(cholesky_mo_num))
-  call dgemm('N', 'N', cholesky_mo_num, 1, nO*nV, 2.d0, &
-     cc_space_v_ov_chol, cholesky_mo_num, &
-     t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num)
-
-  call dgemm('T','N',nV*nO,1,cholesky_mo_num,1.d0, &
-      cc_space_v_vo_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, &
-      H_vo, nV*nO)
-  deallocate(tmp_k)
-
-  allocate(tmp(cholesky_mo_num,nO,nO))
-  allocate(tmp2(cholesky_mo_num,nO,nO))
-
-  call dgemm('N','T', cholesky_mo_num*nO, nO, nV, 1.d0, &
-    cc_space_v_ov_chol, cholesky_mo_num*nO, t1, nO, 0.d0, tmp, cholesky_mo_num*nO)
-
-  do i=1,nO
-    do j=1,nO
-      do k=1,cholesky_mo_num
-        tmp2(k,j,i) = tmp(k,i,j)
-      enddo
-    enddo
-  enddo
-  deallocate(tmp)
-
-  call dgemm('T','N', nV, nO, cholesky_mo_num*nO, -1.d0, &
-    cc_space_v_ov_chol, cholesky_mo_num*nO, tmp2, cholesky_mo_num*nO, &
-    1.d0, H_vo, nV)
-
-end
-

 ! R2

--- a/src/gpu/NEED
+++ b/src/gpu/NEED
@ -0,0 +1 @@
+gpu_arch
--- a/src/gpu/README.rst
+++ b/src/gpu/README.rst
@ -0,0 +1,6 @@
+===
+gpu
+===
+
+Bindings for GPU routines (architecture independent).
+Architecture-dependent files are in gpu_arch.
--- a/src/gpu/gpu.h
+++ b/src/gpu/gpu.h
@ -0,0 +1,41 @@
+#ifndef QP_GPU_H
+#define QP_GPU_H
+
+/* C prototypes of the GPU back-end routines bound from the Fortran `gpu`
+ * module. Sizes passed to the memory routines are byte counts; BLAS-like
+ * routines take 64-bit dimensions and strides. */
+
+#include <stdint.h>
+
+int  gpu_ndevices(void);
+void gpu_set_device(int32_t i);
+
+void gpu_allocate(void** ptr, const int64_t n);
+/* Named gpu_deallocate to match the symbol bound by the Fortran module
+ * (bind(C, name='gpu_deallocate')). */
+void gpu_deallocate(void** ptr);
+
+void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n);
+void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n);
+void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n);
+
+void gpu_stream_create(void** ptr);
+void gpu_stream_destroy(void** ptr);
+void gpu_set_stream(void* handle, void* stream);
+void gpu_synchronize(void);
+
+void gpu_blas_create(void** handle);
+void gpu_blas_destroy(void** handle);
+
+void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result);
+
+void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result);
+
+void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double alpha,
+               const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy);
+
+void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float alpha,
+               const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy);
+
+void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
+               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc);
+
+void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
+               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc);
+
+void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
+               const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc);
+
+void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
+               const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc);
+
+#endif /* QP_GPU_H */
--- a/src/gpu/gpu.irp.f
+++ b/src/gpu/gpu.irp.f
@ -0,0 +1,26 @@
+use gpu
+
+BEGIN_PROVIDER [ type(gpu_blas), blas_handle ]
+ implicit none
+ BEGIN_DOC
+ ! Handle for cuBLAS or RocBLAS, created with gpu_blas_create.
+ ! NOTE(review): no matching gpu_blas_destroy is visible here - confirm the
+ ! handle is meant to live until the program exits.
+ END_DOC
+ call gpu_blas_create(blas_handle)
+END_PROVIDER
+
+BEGIN_PROVIDER [ type(gpu_stream), gpu_default_stream ]
+ implicit none
+ BEGIN_DOC
+ ! Default stream, represented by a null C pointer. Pass it to
+ ! gpu_set_stream to detach a BLAS handle from a user-created stream.
+ END_DOC
+ gpu_default_stream%c = C_NULL_PTR
+END_PROVIDER
+
+BEGIN_PROVIDER [ integer, gpu_num ]
+ implicit none
+ BEGIN_DOC
+ ! Number of usable GPUs, as returned by gpu_ndevices().
+ END_DOC
+ gpu_num = gpu_ndevices()
+END_PROVIDER
+
--- a/src/gpu/gpu_module.F90
+++ b/src/gpu/gpu_module.F90
@ -0,0 +1,725 @@
+module gpu
+  use, intrinsic :: iso_c_binding
+  implicit none
+
+! Data types
+! ----------
+
+  ! gpu_doubleN: a rank-N double precision array obtained from gpu_allocate.
+  !   c : raw C pointer filled in by gpu_allocate_c
+  !   f : Fortran pointer with the array shape, associated with `c` through
+  !       c_f_pointer; used to take the address of sub-blocks, e.g. x%f(1,1,k)
+  ! The components have no default initialisation, so an object is only
+  ! usable after a call to gpu_allocate.
+  type gpu_double1
+    type(c_ptr) :: c
+    double precision, pointer :: f(:)
+  end type
+
+  type gpu_double2
+    type(c_ptr) :: c
+    double precision, pointer :: f(:,:)
+  end type
+
+  type gpu_double3
+    type(c_ptr) :: c
+    double precision, pointer :: f(:,:,:)
+  end type
+
+  type gpu_double4
+    type(c_ptr) :: c
+    double precision, pointer :: f(:,:,:,:)
+  end type
+
+  type gpu_double5
+    type(c_ptr) :: c
+    double precision, pointer :: f(:,:,:,:,:)
+  end type
+
+  type gpu_double6
+    type(c_ptr) :: c
+    double precision, pointer :: f(:,:,:,:,:,:)
+  end type
+
+
+  ! Opaque BLAS handle, set by gpu_blas_create_c.
+  type gpu_blas
+    type(c_ptr) :: c
+  end type
+
+  ! Opaque stream, set by gpu_stream_create_c.
+  type gpu_stream
+    type(c_ptr) :: c
+  end type
+
+
+! C interfaces
+! ------------
+
+  ! Explicit interfaces to the C back-end. Pointer arguments declared with
+  ! `value` receive the address itself; those without `value` are passed by
+  ! reference, i.e. the C side receives a pointer to the pointer and may set
+  ! it. Sizes given to the memory routines are byte counts.
+  interface
+    logical(c_bool) function no_gpu() bind(C)
+      import
+    end function
+
+    ! Result kind is c_int to match the C prototype `int gpu_ndevices()`.
+    integer(c_int) function gpu_ndevices() bind(C)
+      import
+    end function
+
+    subroutine gpu_set_device(id) bind(C)
+      import
+      integer(c_int32_t), value :: id
+    end subroutine
+
+    ! n is the size in bytes; ptr receives the allocated address.
+    subroutine gpu_allocate_c(ptr, n) bind(C, name='gpu_allocate')
+      import
+      type(c_ptr) :: ptr
+      integer(c_int64_t), value :: n
+    end subroutine
+
+    subroutine gpu_deallocate_c(ptr) bind(C, name='gpu_deallocate')
+      import
+      type(c_ptr) :: ptr
+    end subroutine
+
+    ! n is the size in bytes for the three transfer routines below.
+    subroutine gpu_upload_c(cpu_ptr, gpu_ptr, n) bind(C, name='gpu_upload')
+      import
+      type(c_ptr), value :: cpu_ptr
+      type(c_ptr), value :: gpu_ptr
+      integer(c_int64_t), value :: n
+    end subroutine
+
+    subroutine gpu_download_c(gpu_ptr, cpu_ptr, n) bind(C, name='gpu_download')
+      import
+      type(c_ptr), value :: gpu_ptr
+      type(c_ptr), value :: cpu_ptr
+      integer(c_int64_t), value :: n
+    end subroutine
+
+    subroutine gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, n) bind(C, name='gpu_copy')
+      import
+      type(c_ptr), value :: gpu_ptr_src
+      type(c_ptr), value :: gpu_ptr_dest
+      integer(c_int64_t), value :: n
+    end subroutine
+
+    subroutine gpu_stream_create_c(stream) bind(C, name='gpu_stream_create')
+      import
+      type(c_ptr) :: stream
+    end subroutine
+
+    subroutine gpu_stream_destroy_c(stream) bind(C, name='gpu_stream_destroy')
+      import
+      type(c_ptr) :: stream
+    end subroutine
+
+    subroutine gpu_set_stream_c(handle, stream) bind(C, name='gpu_set_stream')
+      import
+      type(c_ptr), value :: handle, stream
+    end subroutine
+
+    subroutine gpu_synchronize() bind(C)
+      import
+    end subroutine
+
+    subroutine gpu_blas_create_c(handle) bind(C, name='gpu_blas_create')
+      import
+      type(c_ptr) :: handle
+    end subroutine
+
+    subroutine gpu_blas_destroy_c(handle) bind(C, name='gpu_blas_destroy')
+      import
+      type(c_ptr) :: handle
+    end subroutine
+
+    ! res is passed by reference and receives the dot product on the host.
+    subroutine gpu_ddot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_ddot')
+      import
+      type(c_ptr), value, intent(in) :: handle
+      integer(c_int64_t), value      :: n, incx, incy
+      type(c_ptr), value             :: dx, dy
+      real(c_double), intent(out)    :: res
+    end subroutine
+
+    subroutine gpu_sdot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_sdot')
+      import
+      type(c_ptr), value, intent(in) :: handle
+      integer(c_int64_t), value      :: n, incx, incy
+      type(c_ptr), intent(in), value :: dx, dy
+      real(c_float), intent(out)  :: res
+    end subroutine
+
+    subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
+      b, ldb, c, ldc) bind(C, name='gpu_dgeam')
+      import
+      type(c_ptr), value, intent(in)        :: handle
+      character(c_char), intent(in), value  :: transa, transb
+      integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
+      real(c_double), intent(in), value :: alpha, beta
+      type(c_ptr), value :: a, b, c
+    end subroutine
+
+    subroutine gpu_sgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
+      b, ldb, c, ldc) bind(C, name='gpu_sgeam')
+      import
+      type(c_ptr), value, intent(in)        :: handle
+      character(c_char), intent(in), value  :: transa, transb
+      integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
+      real(c_float), intent(in), value :: alpha, beta
+      type(c_ptr), value :: a, b, c
+    end subroutine
+
+    subroutine gpu_dgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, &
+      b, ldb, beta, c, ldc) bind(C, name='gpu_dgemm')
+      import
+      type(c_ptr), value, intent(in)        :: handle
+      character(c_char), intent(in), value  :: transa, transb
+      integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc
+      real(c_double), intent(in), value :: alpha, beta
+      type(c_ptr), value :: a, b, c
+    end subroutine
+
+    subroutine gpu_sgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, &
+      b, ldb, beta, c, ldc) bind(C, name='gpu_sgemm')
+      import
+      type(c_ptr), value, intent(in)        :: handle
+      character(c_char), intent(in), value  :: transa, transb
+      integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc
+      real(c_float), intent(in), value :: alpha, beta
+      type(c_ptr), value :: a, b, c
+    end subroutine
+
+  end interface
+
+
+! Polymorphic interfaces
+! ----------------------
+
+  ! gpu_allocate(x, n1, ..., nN): rank-generic allocation; the extents may be
+  ! default integers or integer*8.
+  interface gpu_allocate
+    module procedure gpu_allocate_double1, gpu_allocate_double1_64
+    module procedure gpu_allocate_double2, gpu_allocate_double2_64
+    module procedure gpu_allocate_double3, gpu_allocate_double3_64
+    module procedure gpu_allocate_double4, gpu_allocate_double4_64
+    module procedure gpu_allocate_double5, gpu_allocate_double5_64
+    module procedure gpu_allocate_double6, gpu_allocate_double6_64
+  end interface gpu_allocate
+
+  ! gpu_deallocate(x): rank-generic release of an array from gpu_allocate.
+  interface gpu_deallocate
+    module procedure gpu_deallocate_double1
+    module procedure gpu_deallocate_double2
+    module procedure gpu_deallocate_double3
+    module procedure gpu_deallocate_double4
+    module procedure gpu_deallocate_double5
+    module procedure gpu_deallocate_double6
+  end interface gpu_deallocate
+
+  ! gpu_upload(cpu_array, x): rank-generic transfer from a host array.
+  interface gpu_upload
+    module procedure gpu_upload_double1
+    module procedure gpu_upload_double2
+    module procedure gpu_upload_double3
+    module procedure gpu_upload_double4
+    module procedure gpu_upload_double5
+    module procedure gpu_upload_double6
+  end interface gpu_upload
+
+  ! gpu_download: rank-generic transfer back to a host array.
+  interface gpu_download
+    module procedure gpu_download_double1
+    module procedure gpu_download_double2
+    module procedure gpu_download_double3
+    module procedure gpu_download_double4
+    module procedure gpu_download_double5
+    module procedure gpu_download_double6
+  end interface gpu_download
+
+  ! gpu_copy: rank-generic copy between two gpu_doubleN objects.
+  interface gpu_copy
+    module procedure gpu_copy_double1
+    module procedure gpu_copy_double2
+    module procedure gpu_copy_double3
+    module procedure gpu_copy_double4
+    module procedure gpu_copy_double5
+    module procedure gpu_copy_double6
+  end interface gpu_copy
+
+
+  contains
+
+
+! gpu_allocate
+! ------------
+
+    subroutine gpu_allocate_double1(ptr, s)
+      ! Allocate s doubles (8*s bytes) through gpu_allocate_c and expose them
+      ! as the rank-1 array ptr%f(1:s).
+      implicit none
+      type(gpu_double1), intent(inout) :: ptr
+      integer, intent(in) :: s
+      integer(c_int64_t) :: nbytes
+
+      nbytes = 8_8 * s
+      call gpu_allocate_c(ptr%c, nbytes)
+      call c_f_pointer(ptr%c, ptr%f, [ s ])
+    end subroutine gpu_allocate_double1
+
+    subroutine gpu_allocate_double2(ptr, s1, s2)
+      ! Allocate s1*s2 doubles through gpu_allocate_c and expose them as
+      ! ptr%f(s1,s2). The extents are widened to 64-bit before multiplying so
+      ! the byte count cannot overflow default-integer arithmetic.
+      implicit none
+      type(gpu_double2), intent(inout) :: ptr
+      integer, intent(in) :: s1, s2
+
+      call gpu_allocate_c(ptr%c, &
+           8_c_int64_t * product(int((/ s1, s2 /), c_int64_t)))
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /))
+    end subroutine
+
+    subroutine gpu_allocate_double3(ptr, s1, s2, s3)
+      ! Allocate s1*s2*s3 doubles through gpu_allocate_c and expose them as
+      ! ptr%f(s1,s2,s3). The extents are widened to 64-bit before multiplying
+      ! so the byte count cannot overflow default-integer arithmetic.
+      implicit none
+      type(gpu_double3), intent(inout) :: ptr
+      integer, intent(in) :: s1, s2, s3
+
+      call gpu_allocate_c(ptr%c, &
+           8_c_int64_t * product(int((/ s1, s2, s3 /), c_int64_t)))
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /))
+    end subroutine
+
+    subroutine gpu_allocate_double4(ptr, s1, s2, s3, s4)
+      ! Allocate s1*s2*s3*s4 doubles through gpu_allocate_c and expose them
+      ! as ptr%f(s1,s2,s3,s4). The extents are widened to 64-bit before
+      ! multiplying so the byte count cannot overflow default-integer
+      ! arithmetic.
+      implicit none
+      type(gpu_double4), intent(inout) :: ptr
+      integer, intent(in) :: s1, s2, s3, s4
+
+      call gpu_allocate_c(ptr%c, &
+           8_c_int64_t * product(int((/ s1, s2, s3, s4 /), c_int64_t)))
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /))
+    end subroutine
+
+    subroutine gpu_allocate_double5(ptr, s1, s2, s3, s4, s5)
+      ! Allocate s1*...*s5 doubles through gpu_allocate_c and expose them as
+      ! ptr%f(s1,...,s5). The extents are widened to 64-bit before
+      ! multiplying so the byte count cannot overflow default-integer
+      ! arithmetic.
+      implicit none
+      type(gpu_double5), intent(inout) :: ptr
+      integer, intent(in) :: s1, s2, s3, s4, s5
+
+      call gpu_allocate_c(ptr%c, &
+           8_c_int64_t * product(int((/ s1, s2, s3, s4, s5 /), c_int64_t)))
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /))
+    end subroutine
+
+    subroutine gpu_allocate_double6(ptr, s1, s2, s3, s4, s5, s6)
+      ! Allocate s1*...*s6 doubles through gpu_allocate_c and expose them as
+      ! ptr%f(s1,...,s6). The extents are widened to 64-bit before
+      ! multiplying so the byte count cannot overflow default-integer
+      ! arithmetic.
+      implicit none
+      type(gpu_double6), intent(inout) :: ptr
+      integer, intent(in) :: s1, s2, s3, s4, s5, s6
+
+      call gpu_allocate_c(ptr%c, &
+           8_c_int64_t * product(int((/ s1, s2, s3, s4, s5, s6 /), c_int64_t)))
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /))
+    end subroutine
+
+
+    subroutine gpu_allocate_double1_64(ptr, s)
+      ! 64-bit-extent variant of gpu_allocate_double1: allocate s doubles and
+      ! expose them as ptr%f(1:s).
+      ! gpu_allocate_c takes a size in bytes, so the element count is
+      ! multiplied by 8 like in every other gpu_allocate_* routine.
+      implicit none
+      type(gpu_double1), intent(inout) :: ptr
+      integer*8, intent(in) :: s
+
+      call gpu_allocate_c(ptr%c, s*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s /))
+    end subroutine
+
+    subroutine gpu_allocate_double2_64(ptr, s1, s2)
+      ! 64-bit-extent variant: allocate s1*s2 doubles and expose them as
+      ! ptr%f(s1,s2).
+      implicit none
+      type(gpu_double2), intent(inout) :: ptr
+      integer*8, intent(in) :: s1, s2
+      integer(c_int64_t) :: nbytes
+
+      nbytes = 8_8 * s1 * s2
+      call gpu_allocate_c(ptr%c, nbytes)
+      call c_f_pointer(ptr%c, ptr%f, [ s1, s2 ])
+    end subroutine gpu_allocate_double2_64
+
+    subroutine gpu_allocate_double3_64(ptr, s1, s2, s3)
+      ! 64-bit-extent variant: allocate s1*s2*s3 doubles and expose them as
+      ! ptr%f(s1,s2,s3).
+      implicit none
+      type(gpu_double3), intent(inout) :: ptr
+      integer*8, intent(in) :: s1, s2, s3
+      integer(c_int64_t) :: nbytes
+
+      nbytes = 8_8 * s1 * s2 * s3
+      call gpu_allocate_c(ptr%c, nbytes)
+      call c_f_pointer(ptr%c, ptr%f, [ s1, s2, s3 ])
+    end subroutine gpu_allocate_double3_64
+
+    subroutine gpu_allocate_double4_64(ptr, s1, s2, s3, s4)
+      ! 64-bit-extent variant: allocate s1*s2*s3*s4 doubles and expose them
+      ! as ptr%f(s1,s2,s3,s4).
+      implicit none
+      type(gpu_double4), intent(inout) :: ptr
+      integer*8, intent(in) :: s1, s2, s3, s4
+      integer(c_int64_t) :: nbytes
+
+      nbytes = 8_8 * s1 * s2 * s3 * s4
+      call gpu_allocate_c(ptr%c, nbytes)
+      call c_f_pointer(ptr%c, ptr%f, [ s1, s2, s3, s4 ])
+    end subroutine gpu_allocate_double4_64
+
+    subroutine gpu_allocate_double5_64(ptr, s1, s2, s3, s4, s5)
+      ! 64-bit-extent variant: allocate s1*...*s5 doubles and expose them as
+      ! ptr%f(s1,...,s5).
+      implicit none
+      type(gpu_double5), intent(inout) :: ptr
+      integer*8, intent(in) :: s1, s2, s3, s4, s5
+      integer(c_int64_t) :: nbytes
+
+      nbytes = 8_8 * s1 * s2 * s3 * s4 * s5
+      call gpu_allocate_c(ptr%c, nbytes)
+      call c_f_pointer(ptr%c, ptr%f, [ s1, s2, s3, s4, s5 ])
+    end subroutine gpu_allocate_double5_64
+
+    subroutine gpu_allocate_double6_64(ptr, s1, s2, s3, s4, s5, s6)
+      ! 64-bit-extent variant: allocate s1*...*s6 doubles and expose them as
+      ! ptr%f(s1,...,s6).
+      implicit none
+      type(gpu_double6), intent(inout) :: ptr
+      integer*8, intent(in) :: s1, s2, s3, s4, s5, s6
+      integer(c_int64_t) :: nbytes
+
+      nbytes = 8_8 * s1 * s2 * s3 * s4 * s5 * s6
+      call gpu_allocate_c(ptr%c, nbytes)
+      call c_f_pointer(ptr%c, ptr%f, [ s1, s2, s3, s4, s5, s6 ])
+    end subroutine gpu_allocate_double6_64
+
+
+! gpu_deallocate
+! --------------
+
+    subroutine gpu_deallocate_double1(ptr)
+      ! Release the buffer held in ptr%c through gpu_deallocate_c, then
+      ! disassociate the Fortran view ptr%f.
+      implicit none
+      type(gpu_double1), intent(inout) :: ptr
+
+      call gpu_deallocate_c(ptr%c)
+      ptr%f => null()
+    end subroutine gpu_deallocate_double1
+
+    subroutine gpu_deallocate_double2(ptr)
+      ! Release the buffer held in ptr%c through gpu_deallocate_c, then
+      ! disassociate the Fortran view ptr%f.
+      implicit none
+      type(gpu_double2), intent(inout) :: ptr
+
+      call gpu_deallocate_c(ptr%c)
+      ptr%f => null()
+    end subroutine gpu_deallocate_double2
+
+    subroutine gpu_deallocate_double3(ptr)
+      ! Release the buffer held in ptr%c through gpu_deallocate_c, then
+      ! disassociate the Fortran view ptr%f.
+      implicit none
+      type(gpu_double3), intent(inout) :: ptr
+
+      call gpu_deallocate_c(ptr%c)
+      ptr%f => null()
+    end subroutine gpu_deallocate_double3
+
+    subroutine gpu_deallocate_double4(ptr)
+      ! Release the buffer held in ptr%c through gpu_deallocate_c, then
+      ! disassociate the Fortran view ptr%f.
+      implicit none
+      type(gpu_double4), intent(inout) :: ptr
+
+      call gpu_deallocate_c(ptr%c)
+      ptr%f => null()
+    end subroutine gpu_deallocate_double4
+
+    subroutine gpu_deallocate_double5(ptr)
+      ! Release the buffer held in ptr%c through gpu_deallocate_c, then
+      ! disassociate the Fortran view ptr%f.
+      implicit none
+      type(gpu_double5), intent(inout) :: ptr
+
+      call gpu_deallocate_c(ptr%c)
+      ptr%f => null()
+    end subroutine gpu_deallocate_double5
+
+    subroutine gpu_deallocate_double6(ptr)
+      ! Hands the raw rank-6 buffer ptr%c to gpu_deallocate_c, then leaves
+      ! the Fortran view ptr%f disassociated so it cannot be used afterwards.
+      implicit none
+      type(gpu_double6), intent(inout) :: ptr
+
+      call gpu_deallocate_c(ptr%c)
+      ptr%f => null()
+    end subroutine
+
+
+! gpu_upload
+! ----------
+
+    subroutine gpu_upload_double1(cpu_ptr, gpu_ptr)
+      ! Copy a rank-1 array into the buffer held by gpu_ptr via gpu_upload_c.
+      !
+      ! cpu_ptr : source array (assumed size); it must provide at least
+      !           size(gpu_ptr%f) elements, because that extent alone
+      !           determines the number of bytes transferred
+      ! gpu_ptr : destination handle; gpu_ptr%c is passed as the target
+      implicit none
+      double precision, target, intent(in)     :: cpu_ptr(*)
+      type(gpu_double1), intent(in)    :: gpu_ptr
+      ! Request a 64-bit result from SIZE: the default-kind result
+      ! overflows for buffers of 2**31 elements or more, which the
+      ! 64-bit allocation routines allow.
+      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, 8_8*size(gpu_ptr%f, kind=8))
+    end subroutine
+
+    subroutine gpu_upload_double2(cpu_ptr, gpu_ptr)
+      ! Copy a rank-2 array into the buffer held by gpu_ptr via gpu_upload_c.
+      !
+      ! The byte count is taken from the shape of gpu_ptr%f (8 bytes per
+      ! element), not from cpu_ptr, so cpu_ptr must hold at least that
+      ! many elements.
+      implicit none
+      double precision, target, intent(in)     :: cpu_ptr(:,:)
+      type(gpu_double2), intent(in)    :: gpu_ptr
+      integer*8 :: n_bytes
+
+      ! Widen each extent to 64 bits before multiplying.
+      n_bytes = 8_8 * product(int(shape(gpu_ptr%f), 8))
+      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, n_bytes)
+    end subroutine
+
+    subroutine gpu_upload_double3(cpu_ptr, gpu_ptr)
+      ! Copy a rank-3 array into the buffer held by gpu_ptr via gpu_upload_c.
+      !
+      ! The byte count is taken from the shape of gpu_ptr%f (8 bytes per
+      ! element), not from cpu_ptr, so cpu_ptr must hold at least that
+      ! many elements.
+      implicit none
+      double precision, target, intent(in)     :: cpu_ptr(:,:,:)
+      type(gpu_double3), intent(in)    :: gpu_ptr
+      integer*8 :: n_bytes
+
+      ! Widen each extent to 64 bits before multiplying.
+      n_bytes = 8_8 * product(int(shape(gpu_ptr%f), 8))
+      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, n_bytes)
+    end subroutine
+
+    subroutine gpu_upload_double4(cpu_ptr, gpu_ptr)
+      ! Copy a rank-4 array into the buffer held by gpu_ptr via gpu_upload_c.
+      !
+      ! The byte count is taken from the shape of gpu_ptr%f (8 bytes per
+      ! element), not from cpu_ptr, so cpu_ptr must hold at least that
+      ! many elements.
+      implicit none
+      double precision, target, intent(in)     :: cpu_ptr(:,:,:,:)
+      type(gpu_double4), intent(in)    :: gpu_ptr
+      integer*8 :: n_bytes
+
+      ! Widen each extent to 64 bits before multiplying.
+      n_bytes = 8_8 * product(int(shape(gpu_ptr%f), 8))
+      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, n_bytes)
+    end subroutine
+
+    subroutine gpu_upload_double5(cpu_ptr, gpu_ptr)
+      ! Copy a rank-5 array into the buffer held by gpu_ptr via gpu_upload_c.
+      !
+      ! The byte count is taken from the shape of gpu_ptr%f (8 bytes per
+      ! element), not from cpu_ptr, so cpu_ptr must hold at least that
+      ! many elements.
+      implicit none
+      double precision, target, intent(in)     :: cpu_ptr(:,:,:,:,:)
+      type(gpu_double5), intent(in)    :: gpu_ptr
+      integer*8 :: n_bytes
+
+      ! Widen each extent to 64 bits before multiplying.
+      n_bytes = 8_8 * product(int(shape(gpu_ptr%f), 8))
+      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, n_bytes)
+    end subroutine
+
+    subroutine gpu_upload_double6(cpu_ptr, gpu_ptr)
+      ! Copy a rank-6 array into the buffer held by gpu_ptr via gpu_upload_c.
+      !
+      ! The byte count is taken from the shape of gpu_ptr%f (8 bytes per
+      ! element), not from cpu_ptr, so cpu_ptr must hold at least that
+      ! many elements.
+      implicit none
+      double precision, target, intent(in)     :: cpu_ptr(:,:,:,:,:,:)
+      type(gpu_double6), intent(in)    :: gpu_ptr
+      integer*8 :: n_bytes
+
+      ! Widen each extent to 64 bits before multiplying.
+      n_bytes = 8_8 * product(int(shape(gpu_ptr%f), 8))
+      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, n_bytes)
+    end subroutine
+
+
+! gpu_download
+! ------------
+
+    subroutine gpu_download_double1(gpu_ptr, cpu_ptr)
+      ! Copy the rank-1 buffer held by gpu_ptr into cpu_ptr via gpu_download_c.
+      !
+      ! gpu_ptr : source handle; the extent of gpu_ptr%f fixes the byte count
+      ! cpu_ptr : destination array; must hold at least size(gpu_ptr%f)
+      !           elements
+      implicit none
+      type(gpu_double1), intent(in)  :: gpu_ptr
+      ! The destination is overwritten through its address, so it must not
+      ! be declared intent(in).
+      double precision, target, intent(inout) :: cpu_ptr(:)
+      ! Request a 64-bit result from SIZE: the default-kind result
+      ! overflows for buffers of 2**31 elements or more.
+      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*size(gpu_ptr%f, kind=8))
+    end subroutine
+
+    subroutine gpu_download_double2(gpu_ptr, cpu_ptr)
+      ! Copy the rank-2 buffer held by gpu_ptr into cpu_ptr via gpu_download_c.
+      !
+      ! gpu_ptr : source handle; the shape of gpu_ptr%f fixes the byte count
+      ! cpu_ptr : destination array; must hold at least as many elements
+      !           as gpu_ptr%f
+      implicit none
+      type(gpu_double2), intent(in)  :: gpu_ptr
+      ! The destination is overwritten through its address, so it must not
+      ! be declared intent(in).
+      double precision, target, intent(inout) :: cpu_ptr(:,:)
+      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
+    end subroutine
+
+    subroutine gpu_download_double3(gpu_ptr, cpu_ptr)
+      ! Copy the rank-3 buffer held by gpu_ptr into cpu_ptr via gpu_download_c.
+      !
+      ! gpu_ptr : source handle; the shape of gpu_ptr%f fixes the byte count
+      ! cpu_ptr : destination array; must hold at least as many elements
+      !           as gpu_ptr%f
+      implicit none
+      type(gpu_double3), intent(in)  :: gpu_ptr
+      ! The destination is overwritten through its address, so it must not
+      ! be declared intent(in).
+      double precision, target, intent(inout) :: cpu_ptr(:,:,:)
+      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
+    end subroutine
+
+    subroutine gpu_download_double4(gpu_ptr, cpu_ptr)
+      ! Copy the rank-4 buffer held by gpu_ptr into cpu_ptr via gpu_download_c.
+      !
+      ! gpu_ptr : source handle; the shape of gpu_ptr%f fixes the byte count
+      ! cpu_ptr : destination array; must hold at least as many elements
+      !           as gpu_ptr%f
+      implicit none
+      type(gpu_double4), intent(in)  :: gpu_ptr
+      ! The destination is overwritten through its address, so it must not
+      ! be declared intent(in).
+      double precision, target, intent(inout) :: cpu_ptr(:,:,:,:)
+      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
+    end subroutine
+
+    subroutine gpu_download_double5(gpu_ptr, cpu_ptr)
+      ! Copy the rank-5 buffer held by gpu_ptr into cpu_ptr via gpu_download_c.
+      !
+      ! gpu_ptr : source handle; the shape of gpu_ptr%f fixes the byte count
+      ! cpu_ptr : destination array; must hold at least as many elements
+      !           as gpu_ptr%f
+      implicit none
+      type(gpu_double5), intent(in)  :: gpu_ptr
+      ! The destination is overwritten through its address, so it must not
+      ! be declared intent(in).
+      double precision, target, intent(inout) :: cpu_ptr(:,:,:,:,:)
+      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
+    end subroutine
+
+    subroutine gpu_download_double6(gpu_ptr, cpu_ptr)
+      ! Copy the rank-6 buffer held by gpu_ptr into cpu_ptr via gpu_download_c.
+      !
+      ! gpu_ptr : source handle; the shape of gpu_ptr%f fixes the byte count
+      ! cpu_ptr : destination array; must hold at least as many elements
+      !           as gpu_ptr%f
+      implicit none
+      type(gpu_double6), intent(in)  :: gpu_ptr
+      ! The destination is overwritten through its address, so it must not
+      ! be declared intent(in).
+      double precision, target, intent(inout) :: cpu_ptr(:,:,:,:,:,:)
+      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
+    end subroutine
+
+! gpu_copy
+! --------
+
+    subroutine gpu_copy_double1(gpu_ptr_src, gpu_ptr_dest)
+      ! Copy one rank-1 GPU buffer into another via gpu_copy_c.
+      !
+      ! gpu_ptr_src  : source handle
+      ! gpu_ptr_dest : destination handle; the extent of gpu_ptr_dest%f
+      !                fixes the byte count, so the source must be at
+      !                least that large
+      implicit none
+      type(gpu_double1), intent(in)        :: gpu_ptr_src
+      type(gpu_double1), intent(in)        :: gpu_ptr_dest
+      ! Request a 64-bit result from SIZE: the default-kind result
+      ! overflows for buffers of 2**31 elements or more.
+      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*size(gpu_ptr_dest%f, kind=8))
+    end subroutine
+
+    subroutine gpu_copy_double2(gpu_ptr_src, gpu_ptr_dest)
+      ! Copy one rank-2 GPU buffer into another via gpu_copy_c.
+      !
+      ! The byte count comes from the shape of gpu_ptr_dest%f (8 bytes
+      ! per element), so the source must be at least that large.
+      implicit none
+      type(gpu_double2), intent(in)        :: gpu_ptr_src
+      type(gpu_double2), intent(in)        :: gpu_ptr_dest
+      integer*8 :: n_bytes
+
+      ! Widen each extent to 64 bits before multiplying.
+      n_bytes = 8_8 * product(int(shape(gpu_ptr_dest%f), 8))
+      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, n_bytes)
+    end subroutine
+
+    subroutine gpu_copy_double3(gpu_ptr_src, gpu_ptr_dest)
+      ! Copy one rank-3 GPU buffer into another via gpu_copy_c.
+      !
+      ! The byte count comes from the shape of gpu_ptr_dest%f (8 bytes
+      ! per element), so the source must be at least that large.
+      implicit none
+      type(gpu_double3), intent(in)        :: gpu_ptr_src
+      type(gpu_double3), intent(in)        :: gpu_ptr_dest
+      integer*8 :: n_bytes
+
+      ! Widen each extent to 64 bits before multiplying.
+      n_bytes = 8_8 * product(int(shape(gpu_ptr_dest%f), 8))
+      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, n_bytes)
+    end subroutine
+
+    subroutine gpu_copy_double4(gpu_ptr_src, gpu_ptr_dest)
+      ! Copy one rank-4 GPU buffer into another via gpu_copy_c.
+      !
+      ! The byte count comes from the shape of gpu_ptr_dest%f (8 bytes
+      ! per element), so the source must be at least that large.
+      implicit none
+      type(gpu_double4), intent(in)        :: gpu_ptr_src
+      type(gpu_double4), intent(in)        :: gpu_ptr_dest
+      integer*8 :: n_bytes
+
+      ! Widen each extent to 64 bits before multiplying.
+      n_bytes = 8_8 * product(int(shape(gpu_ptr_dest%f), 8))
+      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, n_bytes)
+    end subroutine
+
+    subroutine gpu_copy_double5(gpu_ptr_src, gpu_ptr_dest)
+      ! Copy one rank-5 GPU buffer into another via gpu_copy_c.
+      !
+      ! The byte count comes from the shape of gpu_ptr_dest%f (8 bytes
+      ! per element), so the source must be at least that large.
+      implicit none
+      type(gpu_double5), intent(in)        :: gpu_ptr_src
+      type(gpu_double5), intent(in)        :: gpu_ptr_dest
+      integer*8 :: n_bytes
+
+      ! Widen each extent to 64 bits before multiplying.
+      n_bytes = 8_8 * product(int(shape(gpu_ptr_dest%f), 8))
+      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, n_bytes)
+    end subroutine
+
+    subroutine gpu_copy_double6(gpu_ptr_src, gpu_ptr_dest)
+      ! Copy one rank-6 GPU buffer into another via gpu_copy_c.
+      !
+      ! The byte count comes from the shape of gpu_ptr_dest%f (8 bytes
+      ! per element), so the source must be at least that large.
+      implicit none
+      type(gpu_double6), intent(in)        :: gpu_ptr_src
+      type(gpu_double6), intent(in)        :: gpu_ptr_dest
+      integer*8 :: n_bytes
+
+      ! Widen each extent to 64 bits before multiplying.
+      n_bytes = 8_8 * product(int(shape(gpu_ptr_dest%f), 8))
+      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, n_bytes)
+    end subroutine
+
+
+! gpu_stream
+! ----------
+
+    subroutine gpu_stream_create(stream)
+      ! Create a stream: forwards the raw handle stream%c to
+      ! gpu_stream_create_c.
+      implicit none
+      type(gpu_stream) :: stream
+      call gpu_stream_create_c(stream%c)
+    end subroutine
+
+    subroutine gpu_stream_destroy(stream)
+      ! Destroy a stream: forwards the raw handle stream%c to
+      ! gpu_stream_destroy_c.
+      implicit none
+      type(gpu_stream) :: stream
+      call gpu_stream_destroy_c(stream%c)
+    end subroutine
+
+    subroutine gpu_set_stream(handle, stream)
+      ! Attach a stream to a BLAS handle: forwards both raw handles
+      ! (handle%c, stream%c) to gpu_set_stream_c.
+      implicit none
+      type(gpu_blas)   :: handle
+      type(gpu_stream) :: stream
+      call gpu_set_stream_c(handle%c, stream%c)
+    end subroutine
+
+
+! gpu_blas
+! --------
+
+    subroutine gpu_blas_create(handle)
+      ! Create a BLAS handle: forwards the raw handle handle%c to
+      ! gpu_blas_create_c.
+      implicit none
+      type(gpu_blas) :: handle
+      call gpu_blas_create_c(handle%c)
+    end subroutine
+
+    subroutine gpu_blas_destroy(handle)
+      ! Destroy a BLAS handle: forwards the raw handle handle%c to
+      ! gpu_blas_destroy_c.
+      implicit none
+      type(gpu_blas) :: handle
+      call gpu_blas_destroy_c(handle%c)
+    end subroutine
+
+
+end module
+
+
+
+! dot
+! ---
+
+subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res)
+  ! Dot-product wrapper taking 32-bit integers and GPU vector handles.
+  !
+  ! handle     : BLAS handle (handle%c is forwarded)
+  ! n          : number of elements
+  ! dx, dy     : vector handles; their raw pointers dx%c, dy%c are forwarded
+  ! incx, incy : strides of dx and dy
+  ! res        : receives the result written by gpu_ddot_c
+  !
+  ! n, incx and incy are widened to c_int64_t before the call.
+  use gpu
+  implicit none
+  type(gpu_blas), intent(in)     :: handle
+  integer*4, intent(in)          :: n, incx, incy
+  type(gpu_double1), intent(in)  :: dx, dy
+  double precision, intent(out)  :: res
+  call gpu_ddot_c(handle%c, int(n,c_int64_t), dx%c, int(incx,c_int64_t), dy%c, int(incy,c_int64_t), res)
+end subroutine
+
+subroutine gpu_ddot_f(handle, n, dx, incx, dy, incy, res)
+  ! Dot-product wrapper taking 32-bit integers and plain Fortran arrays.
+  !
+  ! handle     : BLAS handle (handle%c is forwarded)
+  ! n          : number of elements
+  ! dx, dy     : arrays whose addresses (c_loc) are forwarded
+  ! incx, incy : strides of dx and dy
+  ! res        : receives the result written by gpu_ddot_c
+  !
+  ! NOTE(review): whether dx and dy must live in GPU-accessible memory
+  ! depends on gpu_ddot_c -- confirm against the backend.
+  use gpu
+  implicit none
+  type(gpu_blas), intent(in)     :: handle
+  integer*4, intent(in)          :: n, incx, incy
+  double precision, target       :: dx(*), dy(*)
+  double precision, intent(out)  :: res
+  call gpu_ddot_c(handle%c, int(n,c_int64_t), c_loc(dx), int(incx,c_int64_t), c_loc(dy), int(incy,c_int64_t), res)
+end subroutine
+
+
+subroutine gpu_ddot_64(handle, n, dx, incx, dy, incy, res)
+  ! Dot-product wrapper taking 64-bit integers and GPU vector handles.
+  !
+  ! handle     : BLAS handle (handle%c is forwarded)
+  ! n          : number of elements
+  ! dx, dy     : vector handles; their raw pointers dx%c, dy%c are forwarded
+  ! incx, incy : strides of dx and dy
+  ! res        : receives the result written by gpu_ddot_c
+  use gpu
+  implicit none
+  type(gpu_blas), intent(in)     :: handle
+  integer*8, intent(in)          :: n, incx, incy
+  type(gpu_double1), intent(in)  :: dx, dy
+  double precision, intent(out)  :: res
+  call gpu_ddot_c(handle%c, n, dx%c, incx, dy%c, incy, res)
+end subroutine
+
+subroutine gpu_ddot_f_64(handle, n, dx, incx, dy, incy, res)
+  ! Dot-product wrapper taking 64-bit integers and plain Fortran arrays.
+  !
+  ! handle     : BLAS handle (handle%c is forwarded)
+  ! n          : number of elements
+  ! dx, dy     : arrays whose addresses (c_loc) are forwarded
+  ! incx, incy : strides of dx and dy
+  ! res        : receives the result written by gpu_ddot_c
+  !
+  ! NOTE(review): whether dx and dy must live in GPU-accessible memory
+  ! depends on gpu_ddot_c -- confirm against the backend.
+  use gpu
+  implicit none
+  type(gpu_blas), intent(in)     :: handle
+  integer*8, intent(in)          :: n, incx, incy
+  double precision, target       :: dx(*), dy(*)
+  double precision, intent(out)  :: res
+  call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res)
+end subroutine
+
+
+! geam
+! ----
+
+subroutine gpu_dgeam(handle, transa, transb, m, n, alpha, a, lda, beta, &
+  b, ldb, c, ldc)
+  ! GEAM wrapper taking 32-bit integers and GPU matrix handles.
+  !
+  ! handle         : BLAS handle (handle%c is forwarded)
+  ! transa, transb : operation selectors for a and b
+  ! m, n           : matrix dimensions
+  ! alpha, beta    : scalars applied to a and b
+  ! a, b, c        : matrix handles; their raw pointers (%c) are forwarded
+  ! lda, ldb, ldc  : leading dimensions
+  !
+  ! All integers are widened to c_int64_t before calling gpu_dgeam_c.
+  ! NOTE(review): presumably computes c = alpha*op(a) + beta*op(b) as in
+  ! the usual BLAS-extension GEAM -- confirm against gpu_dgeam_c.
+  use gpu
+  implicit none
+  type(gpu_blas), intent(in)   :: handle
+  character, intent(in)        :: transa, transb
+  integer*4, intent(in)        :: m, n, lda, ldb, ldc
+  double precision, intent(in) :: alpha, beta
+  type(gpu_double2)            :: a, b, c
+  call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, a%c, int(lda,c_int64_t), beta, &
+        b%c, int(ldb,c_int64_t), c%c, int(ldc,c_int64_t))
+end subroutine
+
+
+subroutine gpu_dgeam_f(handle, transa, transb, m, n, alpha, a, lda, beta, &
+  b, ldb, c, ldc)
+  ! GEAM wrapper taking 32-bit integers and plain Fortran arrays.
+  !
+  ! handle         : BLAS handle (handle%c is forwarded)
+  ! transa, transb : operation selectors for a and b
+  ! m, n           : matrix dimensions
+  ! alpha, beta    : scalars applied to a and b
+  ! a, b, c        : arrays whose addresses (c_loc) are forwarded
+  ! lda, ldb, ldc  : leading dimensions
+  !
+  ! All integers are widened to c_int64_t before calling gpu_dgeam_c.
+  ! NOTE(review): whether a, b, c must live in GPU-accessible memory
+  ! depends on gpu_dgeam_c -- confirm against the backend.
+  use gpu
+  implicit none
+  type(gpu_blas), intent(in)   :: handle
+  character, intent(in)        :: transa, transb
+  integer*4, intent(in)        :: m, n, lda, ldb, ldc
+  double precision, intent(in) :: alpha, beta
+  double precision, target     :: a(*), b(*), c(*)
+  call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, &
+        c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t))
+end subroutine
+
+
+subroutine gpu_dgeam_64(handle, transa, transb, m, n, alpha, a, lda, beta, &
+  b, ldb, c, ldc)
+  ! GEAM wrapper taking 64-bit integers and GPU matrix handles.
+  !
+  ! handle         : BLAS handle (handle%c is forwarded)
+  ! transa, transb : operation selectors for a and b
+  ! m, n           : matrix dimensions
+  ! alpha, beta    : scalars applied to a and b
+  ! a, b, c        : matrix handles; their raw pointers (%c) are forwarded
+  ! lda, ldb, ldc  : leading dimensions
+  !
+  ! NOTE(review): presumably computes c = alpha*op(a) + beta*op(b) as in
+  ! the usual BLAS-extension GEAM -- confirm against gpu_dgeam_c.
+  use gpu
+  implicit none
+  type(gpu_blas), intent(in)   :: handle
+  character, intent(in)        :: transa, transb
+  integer*8, intent(in)        :: m, n, lda, ldb, ldc
+  double precision, intent(in) :: alpha, beta
+  type(gpu_double2)            :: a, b, c
+  call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, a%c, int(lda,c_int64_t), beta, &
+        b%c, int(ldb,c_int64_t), c%c, int(ldc,c_int64_t))
+end subroutine
+
+
+subroutine gpu_dgeam_f_64(handle, transa, transb, m, n, alpha, a, lda, beta, &
+  b, ldb, c, ldc)
+  ! GEAM wrapper taking 64-bit integers and plain Fortran arrays.
+  !
+  ! handle         : BLAS handle (handle%c is forwarded)
+  ! transa, transb : operation selectors for a and b
+  ! m, n           : matrix dimensions
+  ! alpha, beta    : scalars applied to a and b
+  ! a, b, c        : arrays whose addresses (c_loc) are forwarded
+  ! lda, ldb, ldc  : leading dimensions
+  !
+  ! NOTE(review): whether a, b, c must live in GPU-accessible memory
+  ! depends on gpu_dgeam_c -- confirm against the backend.
+  use gpu
+  implicit none
+  type(gpu_blas), intent(in)   :: handle
+  character, intent(in)        :: transa, transb
+  integer*8, intent(in)        :: m, n, lda, ldb, ldc
+  double precision, intent(in) :: alpha, beta
+  double precision, target     :: a(*), b(*), c(*)
+  call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, &
+        c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t))
+end subroutine
+
+
+! gemm
+! ----
+
+subroutine gpu_dgemm(handle, transa, transb, m, n, k, alpha, a, lda, &
+  b, ldb, beta, c, ldc)
+  ! GEMM wrapper taking 32-bit integers and GPU matrix handles.
+  !
+  ! handle         : BLAS handle (handle%c is forwarded)
+  ! transa, transb : operation selectors for a and b
+  ! m, n, k        : matrix dimensions
+  ! alpha, beta    : scalars
+  ! a, b, c        : matrix handles; their raw pointers (%c) are forwarded
+  ! lda, ldb, ldc  : leading dimensions
+  !
+  ! All integers are widened to c_int64_t before calling gpu_dgemm_c.
+  ! NOTE(review): presumably follows the standard BLAS DGEMM convention
+  ! c = alpha*op(a)*op(b) + beta*c -- confirm against gpu_dgemm_c.
+  use gpu
+  implicit none
+  type(gpu_blas), intent(in)   :: handle
+  character, intent(in)        :: transa, transb
+  integer*4, intent(in)        :: m, n, k, lda, ldb, ldc
+  double precision, intent(in) :: alpha, beta
+  type(gpu_double2)            :: a, b, c
+  call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), &
+        alpha, a%c, int(lda,c_int64_t), &
+        b%c, int(ldb,c_int64_t), beta, c%c, int(ldc,c_int64_t))
+end subroutine
+
+subroutine gpu_dgemm_64(handle, transa, transb, m, n, k, alpha, a, lda, &
+  b, ldb, beta, c, ldc)
+  ! GEMM wrapper taking 64-bit integers and GPU matrix handles.
+  !
+  ! handle         : BLAS handle (handle%c is forwarded)
+  ! transa, transb : operation selectors for a and b
+  ! m, n, k        : matrix dimensions
+  ! alpha, beta    : scalars
+  ! a, b, c        : matrix handles; their raw pointers (%c) are forwarded
+  ! lda, ldb, ldc  : leading dimensions
+  !
+  ! NOTE(review): presumably follows the standard BLAS DGEMM convention
+  ! c = alpha*op(a)*op(b) + beta*c -- confirm against gpu_dgemm_c.
+  use gpu
+  implicit none
+  type(gpu_blas), intent(in)   :: handle
+  character, intent(in)        :: transa, transb
+  integer*8, intent(in)        :: m, n, k, lda, ldb, ldc
+  double precision, intent(in) :: alpha, beta
+  type(gpu_double2)            :: a, b, c
+  call gpu_dgemm_c(handle%c, transa, transb, m, n, k, &
+        alpha, a%c, lda, b%c, ldb, beta, c%c, ldc)
+end subroutine
+
+subroutine gpu_dgemm_f(handle, transa, transb, m, n, k, alpha, a, lda, &
+  b, ldb, beta, c, ldc)
+  ! GEMM wrapper taking 32-bit integers and plain Fortran arrays.
+  !
+  ! handle         : BLAS handle (handle%c is forwarded)
+  ! transa, transb : operation selectors for a and b
+  ! m, n, k        : matrix dimensions
+  ! alpha, beta    : scalars
+  ! a, b, c        : arrays whose addresses (c_loc) are forwarded
+  ! lda, ldb, ldc  : leading dimensions
+  !
+  ! All integers are widened to c_int64_t before calling gpu_dgemm_c.
+  ! NOTE(review): whether a, b, c must live in GPU-accessible memory
+  ! depends on gpu_dgemm_c -- confirm against the backend.
+  use gpu
+  implicit none
+  type(gpu_blas), intent(in)   :: handle
+  character, intent(in)        :: transa, transb
+  integer*4, intent(in)        :: m, n, k, lda, ldb, ldc
+  double precision, intent(in) :: alpha, beta
+  double precision, target     :: a(*), b(*), c(*)
+  call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), &
+        alpha, c_loc(a), int(lda,c_int64_t), &
+        c_loc(b), int(ldb,c_int64_t), beta, c_loc(c), int(ldc,c_int64_t))
+end subroutine
+
+subroutine gpu_dgemm_f_64(handle, transa, transb, m, n, k, alpha, a, lda, &
+  b, ldb, beta, c, ldc)
+  ! GEMM wrapper taking 64-bit integers and plain Fortran arrays.
+  !
+  ! handle         : BLAS handle (handle%c is forwarded)
+  ! transa, transb : operation selectors for a and b
+  ! m, n, k        : matrix dimensions
+  ! alpha, beta    : scalars
+  ! a, b, c        : arrays whose addresses (c_loc) are forwarded
+  ! lda, ldb, ldc  : leading dimensions
+  !
+  ! NOTE(review): whether a, b, c must live in GPU-accessible memory
+  ! depends on gpu_dgemm_c -- confirm against the backend.
+  use gpu
+  implicit none
+  type(gpu_blas), intent(in)   :: handle
+  character, intent(in)        :: transa, transb
+  integer*8, intent(in)        :: m, n, k, lda, ldb, ldc
+  double precision, intent(in) :: alpha, beta
+  double precision, target     :: a(*), b(*), c(*)
+  call gpu_dgemm_c(handle%c, transa, transb, m, n, k, &
+        alpha, c_loc(a), lda, c_loc(b), ldb, beta, c_loc(c), ldc)
+end subroutine
+
--- a/src/mol_properties/multi_s_dipole_moment.irp.f
+++ b/src/mol_properties/multi_s_dipole_moment.irp.f
@ -18,7 +18,7 @@



-BEGIN_PROVIDER [double precision, multi_s_dipole_moment, (N_states, N_states)]
+ BEGIN_PROVIDER [double precision, multi_s_dipole_moment  , (N_states, N_states)]
 &BEGIN_PROVIDER [double precision, multi_s_x_dipole_moment, (N_states, N_states)]
 &BEGIN_PROVIDER [double precision, multi_s_y_dipole_moment, (N_states, N_states)]
 &BEGIN_PROVIDER [double precision, multi_s_z_dipole_moment, (N_states, N_states)]
@ -40,27 +40,153 @@ BEGIN_PROVIDER [double precision, multi_s_dipole_moment, (N_states, N_states)]
  ! gamma^{nm}: density matrix \bra{\Psi^n} a^{\dagger}_a a_i \ket{\Psi^m}
  END_DOC

-  integer          :: istate,jstate ! States
-  integer          :: i,j           ! general spatial MOs
+  integer          :: istate, jstate ! States
+  integer          :: i, j           ! general spatial MOs
  double precision :: nuclei_part_x, nuclei_part_y, nuclei_part_z
 
  multi_s_x_dipole_moment = 0.d0
  multi_s_y_dipole_moment = 0.d0
  multi_s_z_dipole_moment = 0.d0
+
+  if(8.d0*mo_num*mo_num*n_states*n_states*1d-9 .lt. 200.d0) then
 
-  do jstate = 1, N_states
-    do istate = 1, N_states
- 
-      do i = 1, mo_num  
-        do j = 1, mo_num  
-          multi_s_x_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_x(j,i)  
-          multi_s_y_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_y(j,i) 
-          multi_s_z_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_z(j,i) 
-        enddo
+    do jstate = 1, N_states
+      do istate = 1, N_states
+        do i = 1, mo_num  
+          do j = 1, mo_num  
+            multi_s_x_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_x(j,i)  
+            multi_s_y_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_y(j,i) 
+            multi_s_z_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_z(j,i) 
+          enddo
+        enddo 
      enddo
- 
    enddo
-  enddo
+
+  else
+
+    ! not enough memory to store the transition density matrices:
+    ! use the on-the-fly scheme
+
+    PROVIDE psi_det_alpha_unique psi_det_beta_unique
+
+    integer           :: l, k_a, k_b
+    integer           :: occ(N_int*bit_kind_size,2)
+    integer           :: h1, h2, p1, p2, degree
+    integer           :: exc(0:2,2), n_occ(2)
+    integer           :: krow, kcol, lrow, lcol
+    integer(bit_kind) :: tmp_det(N_int,2), tmp_det2(N_int)
+    double precision  :: ck, ckl, phase
+
+    !$OMP PARALLEL DEFAULT(NONE)                                                      &
+    !$OMP PRIVATE(j, l, k_a, k_b, istate, jstate, occ, ck, ckl, h1, h2, p1, p2, exc,  & 
+    !$OMP         phase, degree, n_occ, krow, kcol, lrow, lcol, tmp_det, tmp_det2)    &
+    !$OMP SHARED(N_int, N_states, elec_alpha_num, elec_beta_num, N_det,               &
+    !$OMP        psi_bilinear_matrix_rows, psi_bilinear_matrix_columns,               &
+    !$OMP        psi_bilinear_matrix_transp_rows, psi_bilinear_matrix_transp_columns, &
+    !$OMP        psi_det_alpha_unique, psi_det_beta_unique,                           &
+    !$OMP        psi_bilinear_matrix_values, psi_bilinear_matrix_transp_values,       &
+    !$OMP        mo_dipole_x, mo_dipole_y, mo_dipole_z,                               &
+    !$OMP        multi_s_x_dipole_moment, multi_s_y_dipole_moment, multi_s_z_dipole_moment)
+    !$OMP DO COLLAPSE(2)
+    do istate = 1, N_states
+      do jstate = 1, N_states
+
+        do k_a = 1, N_det
+          krow = psi_bilinear_matrix_rows   (k_a)
+          kcol = psi_bilinear_matrix_columns(k_a)
+  
+          tmp_det(1:N_int,1) = psi_det_alpha_unique(1:N_int,krow)
+          tmp_det(1:N_int,2) = psi_det_beta_unique (1:N_int,kcol)
+  
+          ! Diagonal part
+          call bitstring_to_list_ab(tmp_det, occ, n_occ, N_int)
+          ck = psi_bilinear_matrix_values(k_a,istate)*psi_bilinear_matrix_values(k_a,jstate)
+          do l = 1, elec_alpha_num
+            j = occ(l,1)
+            multi_s_x_dipole_moment(istate,jstate) -= ck * mo_dipole_x(j,j) 
+            multi_s_y_dipole_moment(istate,jstate) -= ck * mo_dipole_y(j,j) 
+            multi_s_z_dipole_moment(istate,jstate) -= ck * mo_dipole_z(j,j) 
+          enddo
+  
+          if (k_a == N_det) cycle
+          l = k_a + 1
+          lrow = psi_bilinear_matrix_rows   (l)
+          lcol = psi_bilinear_matrix_columns(l)
+          ! Fix beta determinant, loop over alphas
+          do while (lcol == kcol)
+            tmp_det2(:) = psi_det_alpha_unique(:,lrow)
+            call get_excitation_degree_spin(tmp_det(1,1), tmp_det2, degree, N_int)
+            if (degree == 1) then
+              exc = 0
+              call get_single_excitation_spin(tmp_det(1,1), tmp_det2, exc, phase, N_int)
+              call decode_exc_spin(exc, h1, p1, h2, p2)
+              ckl = psi_bilinear_matrix_values(k_a,istate)*psi_bilinear_matrix_values(l,jstate) * phase
+              multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(h1,p1) 
+              multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(h1,p1) 
+              multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(h1,p1) 
+              ckl = psi_bilinear_matrix_values(k_a,jstate)*psi_bilinear_matrix_values(l,istate) * phase
+              multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(p1,h1) 
+              multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(p1,h1) 
+              multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(p1,h1) 
+            endif
+            l = l+1
+            if (l > N_det) exit
+            lrow = psi_bilinear_matrix_rows   (l)
+            lcol = psi_bilinear_matrix_columns(l)
+          enddo
+        enddo ! k_a
+  
+        do k_b = 1, N_det
+          krow = psi_bilinear_matrix_transp_rows   (k_b)
+          kcol = psi_bilinear_matrix_transp_columns(k_b)
+      
+          tmp_det(1:N_int,1) = psi_det_alpha_unique(1:N_int,krow)
+          tmp_det(1:N_int,2) = psi_det_beta_unique (1:N_int,kcol)
+      
+          ! Diagonal part
+          call bitstring_to_list_ab(tmp_det, occ, n_occ, N_int)
+          ck = psi_bilinear_matrix_transp_values(k_b,istate)*psi_bilinear_matrix_transp_values(k_b,jstate)
+          do l = 1, elec_beta_num
+            j = occ(l,2)
+            multi_s_x_dipole_moment(istate,jstate) -= ck * mo_dipole_x(j,j) 
+            multi_s_y_dipole_moment(istate,jstate) -= ck * mo_dipole_y(j,j) 
+            multi_s_z_dipole_moment(istate,jstate) -= ck * mo_dipole_z(j,j) 
+          enddo
+      
+          if (k_b == N_det) cycle
+          l = k_b+1
+          lrow = psi_bilinear_matrix_transp_rows   (l)
+          lcol = psi_bilinear_matrix_transp_columns(l)
+          ! Fix alpha determinant, loop over betas
+          do while (lrow == krow)
+            tmp_det2(:) = psi_det_beta_unique(:,lcol)
+            call get_excitation_degree_spin(tmp_det(1,2), tmp_det2, degree, N_int)
+            if (degree == 1) then
+              exc = 0
+              call get_single_excitation_spin(tmp_det(1,2), tmp_det2, exc, phase, N_int)
+              call decode_exc_spin(exc, h1, p1, h2, p2)
+              ckl = psi_bilinear_matrix_transp_values(k_b,istate)*psi_bilinear_matrix_transp_values(l,jstate) * phase
+              multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(h1,p1) 
+              multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(h1,p1) 
+              multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(h1,p1) 
+              ckl = psi_bilinear_matrix_transp_values(k_b,jstate)*psi_bilinear_matrix_transp_values(l,istate) * phase
+              multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(p1,h1) 
+              multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(p1,h1) 
+              multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(p1,h1) 
+            endif
+            l = l+1
+            if (l > N_det) exit
+            lrow = psi_bilinear_matrix_transp_rows   (l)
+            lcol = psi_bilinear_matrix_transp_columns(l)
+          enddo
+        enddo ! k_b
+
+      enddo ! jstate
+    enddo ! istate
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+  endif ! memory condition
 
  ! Nuclei part
  nuclei_part_x = 0.d0