Merge branch 'dev-stable' of https://github.com/QuantumPackage/qp2 into dev-stable

2024-12-23 03:53:29 +01:00 · 2024-07-12 16:20:12 +02:00 · 2024-07-12 16:20:12 +02:00 · d7bf334fc0
commit d7bf334fc0
parent 505d10084c f5cf674d7b
37 changed files with 4158 additions and 1653 deletions
--- a/28
+++ b/28
@ -40,6 +40,7 @@ Usage:
  $(basename $0) -c <file>    
  $(basename $0) -h           
  $(basename $0) -i <package> 
  $(basename $0) -g [nvidia|intel|none]
 Options:
  -c  <file>        Define a COMPILATION configuration file,
@ -48,6 +49,7 @@ Options:
  -i <package>      INSTALL <package>. Use at your OWN RISK:
                    no support will be provided for the installation of
                    dependencies.
  -g [nvidia|intel|none]  Choose GPU acceleration
 Example:
  ./$(basename $0) -c config/gfortran.cfg
@ -83,7 +85,7 @@ function execute () {
 PACKAGES=""
-while getopts "d:c:i:h" c ; do
+while getopts "d:c:i:g:h" c ; do
    case "$c" in
        c)
            case "$OPTARG" in
@ -100,6 +102,9 @@ while getopts "d:c:i:h" c ; do
                "") help ; break;;
                # Append to the list built so far (the variable is PACKAGES, not PACKAGE)
                *) PACKAGES="${PACKAGES} $OPTARG"
            esac;;
        g)
            # Record the requested GPU backend. No `break` here: it would leave
            # the getopts loop and drop every option given after -g.
            GPU=$OPTARG
            ;;
        h)
            help
            exit 0;;
@ -109,6 +114,27 @@ while getopts "d:c:i:h" c ; do
    esac
 done
 # Handle GPU acceleration: src/gpu_arch is re-created as a symlink to the
 # plugin directory matching the value given with -g.
 rm -f "${QP_ROOT}/src/gpu_arch"
 case "$GPU" in
  amd) # AMD
     echo "Activating AMD GPU acceleration"
     ln -s "${QP_ROOT}/plugins/local/gpu_amd" "${QP_ROOT}/src/gpu_arch"
     ;;
  intel) # Intel
     echo "Activating Intel GPU acceleration (EXPERIMENTAL)"
     ln -s "${QP_ROOT}/plugins/local/gpu_intel" "${QP_ROOT}/src/gpu_arch"
     ;;
  nvidia) # Nvidia
     echo "Activating Nvidia GPU acceleration"
     ln -s "${QP_ROOT}/plugins/local/gpu_nvidia" "${QP_ROOT}/src/gpu_arch"
     ;;
  *) # No Acceleration
     # Empty and "none" are the expected ways to get here; anything else is
     # most likely a typo, so say so before falling back to the x86 plugin.
     case "$GPU" in
       ""|none) ;;
       *) echo "Warning: unknown GPU type '$GPU'" >&2 ;;
     esac
     echo "Disabling GPU acceleration"
     ln -s "${QP_ROOT}/plugins/local/gpu_x86" "${QP_ROOT}/src/gpu_arch"
     ;;
 esac
 # Trim leading and trailing spaces
 PACKAGES=$(echo $PACKAGES | xargs)
--- a/plugins/local/gpu_intel/LIB
+++ b/plugins/local/gpu_intel/LIB
@ -0,0 +1,2 @@
 -ltbb -lsycl -lmkl_sycl -lgpu -limf -lintlc -lstdc++ 
--- a/plugins/local/gpu_intel/NEED
+++ b/plugins/local/gpu_intel/NEED
@ -0,0 +1 @@
--- a/plugins/local/gpu_intel/README.rst
+++ b/plugins/local/gpu_intel/README.rst
@ -0,0 +1,8 @@
 =========
 gpu_intel
 =========
 Intel implementation of GPU routines. Uses MKL and SYCL.
 .. code-block:: bash

    icpx -fsycl gpu.cxx -c -qmkl=sequential
--- a/plugins/local/gpu_intel/gpu.sycl
+++ b/plugins/local/gpu_intel/gpu.sycl
@ -0,0 +1,177 @@
 #include <CL/sycl.hpp>
 #include <cassert>
 #include <limits>
 #include <oneapi/mkl/blas.hpp>
 extern "C" {
 /* Generic functions */
 /* Device count reported by this backend: always one. */
 int gpu_ndevices() {
  constexpr int reported_devices = 1;
  return reported_devices;
 }
 /* Device selection hook: the requested index is ignored. */
 void gpu_set_device(int32_t igpu) {
  (void) igpu;
 }
 /* Allocation functions */
 void gpu_allocate(void** ptr, int64_t size) {
    auto queue = sycl::queue(sycl::default_selector_v);
    try {
        *ptr = sycl::malloc_shared(size, queue);
        assert(*ptr != nullptr);
    } catch (const sycl::exception& e) {
        std::cerr << "SYCL exception caught: " << e.what() << std::endl;
        *ptr = nullptr; // If allocation fails, set pointer to nullptr
    }
 }
 void gpu_deallocate(void** ptr) {
    assert(*ptr != nullptr);
    sycl::free(*ptr, sycl::queue(sycl::default_selector_v));
    *ptr = nullptr;
 }
 /* Upload data from host to device */
 void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
    sycl::queue queue(sycl::default_selector_v);
    queue.memcpy(gpu_ptr, cpu_ptr, n).wait();
 }
 /* Download data from device to host */
 void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
    sycl::queue queue(sycl::default_selector_v);
    queue.memcpy(cpu_ptr, gpu_ptr, n).wait();
 }
 /* Copy data from one GPU memory location to another */
 void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
    sycl::queue queue(sycl::default_selector_v);
    queue.memcpy(gpu_ptr_dest, gpu_ptr_src, n).wait();
 }
 /* Queues */
 /* SYCL queue as a replacement for CUDA stream */
 /* Create a heap-allocated queue on the default device and hand it back through *ptr.
    The caller releases it with gpu_stream_destroy. */
 void gpu_stream_create(sycl::queue** ptr) {
    sycl::queue* const stream = new sycl::queue(sycl::default_selector_v);
    *ptr = stream;
 }
 /* Delete a queue created by gpu_stream_create and reset the caller's pointer to nullptr. */
 void gpu_stream_destroy(sycl::queue** ptr) {
    assert(*ptr != nullptr);
    sycl::queue* const stream = *ptr;
    delete stream;
    *ptr = nullptr;
 }
 void gpu_synchronize() {
    sycl::queue queue(sycl::default_selector_v);
    queue.wait_and_throw();
 }
 /* BLAS functions */
 typedef struct {
  sycl::queue* queue;
 } blasHandle_t;
 void gpu_set_stream(blasHandle_t* handle, sycl::queue* ptr) {
  handle->queue = ptr;
 }
 void gpu_blas_create(blasHandle_t** ptr) {
    *ptr = (blasHandle_t*) malloc(sizeof(blasHandle_t));
    assert(*ptr != nullptr);
    (*ptr)->queue = new sycl::queue(sycl::default_selector_v);
    assert((*ptr)->queue != nullptr);
 }
 void gpu_blas_destroy(blasHandle_t** ptr) {
    assert(*ptr != nullptr);
    delete (*ptr)->queue;
    free(*ptr);
    *ptr = nullptr;
 }
 /* Double-precision dot product of x and y (n elements, strides incx / incy)
    computed with oneMKL on the handle's queue; the value is stored in *result.
    NOTE(review): `result` presumably has to be memory the device can write
    (e.g. from gpu_allocate) - confirm against the callers. */
 void gpu_ddot(blasHandle_t* handle, const int64_t n, const double* x, const int64_t incx, 
              const double* y, const int64_t incy, double* result) {
    // Ensure input parameters are valid
    assert(handle != nullptr);
    assert(handle->queue != nullptr);
    assert(n > 0);
    assert(incx > 0);
    assert(incy > 0);
    assert(x != nullptr);
    assert(y != nullptr);
    assert(result != nullptr);
    // The USM dot() only enqueues the work and returns an event. Wait on it so
    // that *result is valid as soon as this function returns.
    oneapi::mkl::blas::dot(*handle->queue, n, x, incx, y, incy, result).wait();
 }
 /* Column-major double-precision GEMV through oneMKL on the handle's queue.
    'T' or 't' in *transa selects the transposed operation; anything else is non-transposed.
    The call is enqueued and not waited for here. */
 void gpu_dgemv(blasHandle_t* handle, const char* transa, const int64_t m, const int64_t n, const double* alpha,
               const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) {
    assert(handle != nullptr);
    assert(handle->queue != nullptr);
    // Dimensions and increments must be strictly positive
    assert(m > 0 && n > 0 && lda > 0 && incx > 0 && incy > 0);
    assert(a != nullptr && x != nullptr && y != nullptr && alpha != nullptr && beta != nullptr);
    // Translate the BLAS character flag into the oneMKL enum
    const bool wants_transpose = (*transa == 'T' || *transa == 't');
    const oneapi::mkl::transpose transa_ =
        wants_transpose ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans;
    sycl::queue& target_queue = *handle->queue;
    oneapi::mkl::blas::column_major::gemv(target_queue, transa_, m, n, *alpha, a, lda, x, incx, *beta, y, incy);
 }
 /* Column-major double-precision GEMM through oneMKL on the handle's queue.
    'T' or 't' in *transa / *transb selects the transposed operand; anything else is non-transposed.
    The call is enqueued and not waited for here. */
 void gpu_dgemm(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha,
               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) {
    assert(handle != nullptr && handle->queue != nullptr);
    assert(m > 0 && n > 0 && k > 0 && lda > 0 && ldb > 0 && ldc > 0);
    assert(a != nullptr && b != nullptr && c != nullptr && alpha != nullptr && beta != nullptr);
    // Translate the BLAS character flags into oneMKL enums
    oneapi::mkl::transpose transa_ = oneapi::mkl::transpose::nontrans;
    if (*transa == 'T' || *transa == 't') {
        transa_ = oneapi::mkl::transpose::trans;
    }
    oneapi::mkl::transpose transb_ = oneapi::mkl::transpose::nontrans;
    if (*transb == 'T' || *transb == 't') {
        transb_ = oneapi::mkl::transpose::trans;
    }
    sycl::queue& target_queue = *handle->queue;
    oneapi::mkl::blas::column_major::gemm(target_queue, transa_, transb_, m, n, k,
                                          *alpha, a, lda, b, ldb, *beta, c, ldc);
 }
 /* C = alpha*op(A) + beta*op(B) for an m x n result, written as a SYCL kernel
    on the handle's queue. All matrices are column-major, so element (i,j) of C
    is c[j*ldc + i], the same layout the gemv/gemm wrappers request from oneMKL.
    'T' or 't' in *transa / *transb selects the transposed operand.
    The kernel is enqueued and not waited for here. */
 void gpu_dgeam(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha,
               const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
    assert(handle != nullptr && handle->queue != nullptr);
    assert(m > 0 && n > 0 && lda > 0 && ldb > 0 && ldc > 0);
    assert(a != nullptr && b != nullptr && c != nullptr && alpha != nullptr && beta != nullptr);
    // Determine transpose operations
    const bool transA = (*transa == 'T' || *transa == 't');
    const bool transB = (*transb == 'T' || *transb == 't');
    // Read the scalars on the host: the kernel must not dereference alpha/beta,
    // which are read as ordinary host pointers by the other wrappers.
    const double alpha_ = *alpha;
    const double beta_  = *beta;
    handle->queue->submit([&](sycl::handler& cgh) {
        // Range is (n, m) so that the row index i is the fastest-varying one,
        // matching the unit stride of a column-major column.
        cgh.parallel_for(sycl::range<2>(n, m), [=](sycl::id<2> idx) {
            // 64-bit offsets: j*ld + i overflows a 32-bit int for large matrices
            const int64_t j = idx[0];
            const int64_t i = idx[1];
            const int64_t ai = transA ? i * lda + j : j * lda + i;
            const int64_t bi = transB ? i * ldb + j : j * ldb + i;
            const int64_t ci = j * ldc + i;
            c[ci] = alpha_ * a[ai] + beta_ * b[bi];
        });
    });
 }
 }  // extern C
--- a/plugins/local/gpu_nvidia/LIB
+++ b/plugins/local/gpu_nvidia/LIB
@ -0,0 +1 @@
 -lcudart -lcublas -lcublasLt
--- a/plugins/local/gpu_nvidia/NEED
+++ b/plugins/local/gpu_nvidia/NEED
@ -0,0 +1 @@
--- a/plugins/local/gpu_nvidia/README.rst
+++ b/plugins/local/gpu_nvidia/README.rst
@ -0,0 +1,5 @@
 ==========
 gpu_nvidia
 ==========
 Nvidia implementation of GPU routines. Uses CUDA and CUBLAS libraries.
--- a/plugins/local/gpu_nvidia/gpu.c
+++ b/plugins/local/gpu_nvidia/gpu.c
@ -0,0 +1,326 @@
 #include <stdint.h>
 #include <stdio.h>
 #include <stdbool.h>
 #include <stdlib.h>
 #include <string.h>
 #include <assert.h>
 #include <cublas_v2.h>
 #include <cuda_runtime.h>
 /* Generic functions */
 int gpu_ndevices() {
  int ngpus;
  cudaGetDeviceCount(&ngpus);
  return ngpus;
 }
 /* Select device `igpu` through cudaSetDevice; the returned status is not examined. */
 void gpu_set_device(int32_t igpu) {
  const int device_index = (int) igpu;
  cudaSetDevice(device_index);
 }
 /* Allocation functions */
 /* Allocate `size` bytes with cudaMallocManaged (cudaMemAttachGlobal) and store
    the address in *ptr. Failure trips the assert. */
 void gpu_allocate(void** ptr, const int64_t size) {
    /* Managed memory is always used. The former cudaMemGetInfo() query only fed
       a disabled cudaMalloc() fast path, so it was dead work, and its local
       named `free` shadowed the C library function. */
    cudaError_t rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
    assert (rc == cudaSuccess);
 }
 /* Release memory obtained from gpu_allocate and reset the caller's pointer to NULL. */
 void gpu_deallocate(void** ptr) {
  assert (*ptr != NULL);
  void* const block = *ptr;
  cudaFree(block);
  *ptr = NULL;
 }
 /* Memory transfer functions */
 /* Host -> device: copy n bytes from cpu_ptr to gpu_ptr; the returned status is not examined. */
 void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
  const size_t nbytes = (size_t) n;
  cudaMemcpy (gpu_ptr, cpu_ptr, nbytes, cudaMemcpyHostToDevice);
 }
 /* Device -> host: copy n bytes from gpu_ptr to cpu_ptr; the returned status is not examined. */
 void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
  const size_t nbytes = (size_t) n;
  cudaMemcpy (cpu_ptr, gpu_ptr, nbytes, cudaMemcpyDeviceToHost);
 }
 /* Device -> device: copy n bytes from gpu_ptr_src to gpu_ptr_dest; the returned status is not examined. */
 void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
  const size_t nbytes = (size_t) n;
  cudaMemcpy (gpu_ptr_dest, gpu_ptr_src, nbytes, cudaMemcpyDeviceToDevice);
 }
 /* Streams */
 /* Create a CUDA stream in *ptr; failure trips the assert. */
 void gpu_stream_create(cudaStream_t* ptr) {
  cudaError_t rc;
  rc = cudaStreamCreate(ptr);
  assert (rc == cudaSuccess);
 }
 /* Destroy the stream stored in *ptr (failure trips the assert) and reset *ptr to NULL. */
 void gpu_stream_destroy(cudaStream_t* ptr) {
  assert (ptr != NULL);
  cudaError_t rc;
  rc = cudaStreamDestroy(*ptr);
  assert (rc == cudaSuccess);
  *ptr = NULL;
 }
 /* Attach `stream` to the cuBLAS handle via cublasSetStream; the returned status is not examined. */
 void gpu_set_stream(cublasHandle_t handle, cudaStream_t stream) {
  (void) cublasSetStream(handle, stream);
 }
 /* Call cudaDeviceSynchronize; the returned status is not examined. */
 void gpu_synchronize() {
  (void) cudaDeviceSynchronize();
 }
 /* BLAS functions */
 /* Create a cuBLAS handle in *ptr; failure trips the assert. */
 void gpu_blas_create(cublasHandle_t* ptr) {
  cublasStatus_t rc;
  rc = cublasCreate(ptr);
  assert (rc == CUBLAS_STATUS_SUCCESS);
 }
 /* Destroy the cuBLAS handle stored in *ptr (failure trips the assert) and
    reset the caller's handle to NULL, as gpu_stream_destroy does for streams. */
 void gpu_blas_destroy(cublasHandle_t* ptr) {
  assert (ptr != NULL);
  cublasStatus_t rc = cublasDestroy(*ptr);
  assert (rc == CUBLAS_STATUS_SUCCESS);
  /* Clear the caller's handle; assigning to `ptr` itself only changed the local copy */
  *ptr = NULL;
 }
 /* Double-precision dot product through cublasDdot; the value is stored in *result.
    The 64-bit sizes are narrowed to int and the asserts reject values that do not fit. */
 void gpu_ddot(cublasHandle_t handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
  assert (handle != NULL);
  /* Narrow to the int-based cuBLAS interface */
  const int n_    = (int) n;
  const int incx_ = (int) incx;
  const int incy_ = (int) incy;
  /* Narrowing must not have changed any value */
  assert ( (int64_t)    n_ == n   );
  assert ( (int64_t) incx_ == incx);
  assert ( (int64_t) incy_ == incy);
  cublasStatus_t rc;
  rc = cublasDdot(handle, n_, x, incx_, y, incy_, result);
  assert (rc == CUBLAS_STATUS_SUCCESS);
 }
 /* Single-precision dot product through cublasSdot. The value is computed into a
    local and then copied to *result.
    The 64-bit sizes are narrowed to int and the asserts reject values that do not fit. */
 void gpu_sdot(cublasHandle_t handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
  assert (handle != NULL);
  /* Narrow to the int-based cuBLAS interface */
  const int n_    = (int) n;
  const int incx_ = (int) incx;
  const int incy_ = (int) incy;
  /* Narrowing must not have changed any value */
  assert ( (int64_t)    n_ == n   );
  assert ( (int64_t) incx_ == incx);
  assert ( (int64_t) incy_ == incy);
  float result_ = 0.;
  cublasStatus_t rc;
  rc = cublasSdot(handle, n_, x, incx_, y, incy_, &result_);
  assert (rc == CUBLAS_STATUS_SUCCESS);
  *result = result_;
 }
 /* Double-precision matrix-vector product through cublasDgemv.
    'T' or 't' in *transa selects CUBLAS_OP_T; anything else is CUBLAS_OP_N.
    The 64-bit sizes are narrowed to int and the asserts reject values that do not fit. */
 void gpu_dgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const double* alpha,
               const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) {
  assert (handle != NULL);
  /* Convert to int */
  int m_, n_, lda_, incx_, incy_;
  m_    = (int) m;
  n_    = (int) n;
  lda_  = (int) lda;
  incx_ = (int) incx;
  incy_ = (int) incy;
  /* Check for integer overflows */
  assert ( (int64_t)    m_ == m   );
  assert ( (int64_t)    n_ == n   );
  assert ( (int64_t)  lda_ == lda );
  assert ( (int64_t) incx_ == incx);
  assert ( (int64_t) incy_ == incy);
  cublasOperation_t transa_ = CUBLAS_OP_N;
  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
  /* Check the cuBLAS status, as gpu_ddot does, instead of discarding it */
  cublasStatus_t rc = cublasDgemv(handle, transa_, m_, n_, alpha, a, lda_, x, incx_, beta, y, incy_);
  assert (rc == CUBLAS_STATUS_SUCCESS);
 }
 /* Single-precision matrix-vector product through cublasSgemv.
    'T' or 't' in *transa selects CUBLAS_OP_T; anything else is CUBLAS_OP_N.
    The 64-bit sizes are narrowed to int and the asserts reject values that do not fit. */
 void gpu_sgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const float* alpha,
               const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy) {
  assert (handle != NULL);
  /* Convert to int */
  int m_, n_, lda_, incx_, incy_;
  m_    = (int) m;
  n_    = (int) n;
  lda_  = (int) lda;
  incx_ = (int) incx;
  incy_ = (int) incy;
  /* Check for integer overflows */
  assert ( (int64_t)    m_ == m   );
  assert ( (int64_t)    n_ == n   );
  assert ( (int64_t)  lda_ == lda );
  assert ( (int64_t) incx_ == incx);
  assert ( (int64_t) incy_ == incy);
  cublasOperation_t transa_ = CUBLAS_OP_N;
  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
  /* Check the cuBLAS status, as gpu_sdot does, instead of discarding it */
  cublasStatus_t rc = cublasSgemv(handle, transa_, m_, n_, alpha, a, lda_, x, incx_, beta, y, incy_);
  assert (rc == CUBLAS_STATUS_SUCCESS);
 }
 /* Double-precision matrix-matrix product through cublasDgemm.
    'T' or 't' in *transa / *transb selects CUBLAS_OP_T; anything else is CUBLAS_OP_N.
    The 64-bit sizes are narrowed to int and the asserts reject values that do not fit. */
 void gpu_dgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha,
               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) {
  assert (handle != NULL);
  /* Convert to int */
  int m_, n_, k_, lda_, ldb_, ldc_;
  m_   = (int) m;
  n_   = (int) n;
  k_   = (int) k;
  lda_ = (int) lda;
  ldb_ = (int) ldb;
  ldc_ = (int) ldc;
  /* Check for integer overflows */
  assert ( (int64_t)   m_ == m  );
  assert ( (int64_t)   n_ == n  );
  assert ( (int64_t)   k_ == k  );
  assert ( (int64_t) lda_ == lda);
  assert ( (int64_t) ldb_ == ldb);
  assert ( (int64_t) ldc_ == ldc);
  cublasOperation_t transa_ = CUBLAS_OP_N;
  cublasOperation_t transb_ = CUBLAS_OP_N;
  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
  if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
  /* Check the cuBLAS status, as gpu_ddot does, instead of discarding it */
  cublasStatus_t rc = cublasDgemm(handle, transa_, transb_, m_, n_, k_, alpha, a, lda_, b, ldb_, beta, c, ldc_);
  assert (rc == CUBLAS_STATUS_SUCCESS);
 }
 /* Single-precision matrix-matrix product through cublasSgemm.
    'T' or 't' in *transa / *transb selects CUBLAS_OP_T; anything else is CUBLAS_OP_N.
    The 64-bit sizes are narrowed to int and the asserts reject values that do not fit. */
 void gpu_sgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float* alpha,
               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc) {
  assert (handle != NULL);
  /* Convert to int */
  int m_, n_, k_, lda_, ldb_, ldc_;
  m_   = (int) m;
  n_   = (int) n;
  k_   = (int) k;
  lda_ = (int) lda;
  ldb_ = (int) ldb;
  ldc_ = (int) ldc;
  /* Check for integer overflows */
  assert ( (int64_t)   m_ == m  );
  assert ( (int64_t)   n_ == n  );
  assert ( (int64_t)   k_ == k  );
  assert ( (int64_t) lda_ == lda);
  assert ( (int64_t) ldb_ == ldb);
  assert ( (int64_t) ldc_ == ldc);
  cublasOperation_t transa_ = CUBLAS_OP_N;
  cublasOperation_t transb_ = CUBLAS_OP_N;
  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
  if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
  /* Check the cuBLAS status, as gpu_sdot does, instead of discarding it */
  cublasStatus_t rc = cublasSgemm(handle, transa_, transb_, m_, n_, k_, alpha, a, lda_, b, ldb_, beta, c, ldc_);
  assert (rc == CUBLAS_STATUS_SUCCESS);
 }
 /* Double-precision matrix addition/transposition through cublasDgeam.
    'T' or 't' in *transa / *transb selects CUBLAS_OP_T; anything else is CUBLAS_OP_N.
    The 64-bit sizes are narrowed to int and the asserts reject values that do not fit. */
 void gpu_dgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha,
               const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
  assert (handle != NULL);
  /* Convert to int */
  int m_, n_, lda_, ldb_, ldc_;
  m_   = (int) m;
  n_   = (int) n;
  lda_ = (int) lda;
  ldb_ = (int) ldb;
  ldc_ = (int) ldc;
  /* Check for integer overflows */
  assert ( (int64_t)   m_ == m  );
  assert ( (int64_t)   n_ == n  );
  assert ( (int64_t) lda_ == lda);
  assert ( (int64_t) ldb_ == ldb);
  assert ( (int64_t) ldc_ == ldc);
  cublasOperation_t transa_ = CUBLAS_OP_N;
  cublasOperation_t transb_ = CUBLAS_OP_N;
  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
  if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
  /* Check the cuBLAS status, as gpu_ddot does, instead of discarding it */
  cublasStatus_t rc = cublasDgeam(handle, transa_, transb_, m_, n_, alpha, a, lda_, beta, b, ldb_, c, ldc_);
  assert (rc == CUBLAS_STATUS_SUCCESS);
 }
 /* Single-precision matrix addition/transposition through cublasSgeam.
    'T' or 't' in *transa / *transb selects CUBLAS_OP_T; anything else is CUBLAS_OP_N.
    The 64-bit sizes are narrowed to int and the asserts reject values that do not fit. */
 void gpu_sgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float* alpha,
               const float* a, const int64_t lda, const float* beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
  assert (handle != NULL);
  /* Convert to int */
  int m_, n_, lda_, ldb_, ldc_;
  m_   = (int) m;
  n_   = (int) n;
  lda_ = (int) lda;
  ldb_ = (int) ldb;
  ldc_ = (int) ldc;
  /* Check for integer overflows */
  assert ( (int64_t)   m_ == m  );
  assert ( (int64_t)   n_ == n  );
  assert ( (int64_t) lda_ == lda);
  assert ( (int64_t) ldb_ == ldb);
  assert ( (int64_t) ldc_ == ldc);
  cublasOperation_t transa_ = CUBLAS_OP_N;
  cublasOperation_t transb_ = CUBLAS_OP_N;
  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
  if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
  /* Check the cuBLAS status, as gpu_sdot does, instead of discarding it */
  cublasStatus_t rc = cublasSgeam(handle, transa_, transb_, m_, n_, alpha, a, lda_, beta, b, ldb_, c, ldc_);
  assert (rc == CUBLAS_STATUS_SUCCESS);
 }
--- a/plugins/local/gpu_x86/NEED
+++ b/plugins/local/gpu_x86/NEED
@ -0,0 +1 @@
--- a/plugins/local/gpu_x86/README.rst
+++ b/plugins/local/gpu_x86/README.rst
@ -0,0 +1,5 @@
 =======
 gpu_x86
 =======
 x86 implementation of GPU routines. For use when GPUs are not available.
--- a/plugins/local/gpu_x86/gpu.c
+++ b/plugins/local/gpu_x86/gpu.c
@ -0,0 +1,502 @@
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <stdbool.h>
 #include <assert.h>
 /* Generic functions */
 /* The CPU fallback reports zero devices. */
 int gpu_ndevices() {
  const int device_count = 0;
  return device_count;
 }
 /* Device selection is a no-op in the CPU fallback; the index is ignored. */
 void gpu_set_device(int32_t i) {
  (void) i;
 }
 /* Allocation functions */
 /* Allocate n bytes with malloc and store the address in *ptr.
    When malloc returns NULL, *ptr is NULL and a message is printed with perror. */
 void gpu_allocate(void** ptr, const int64_t n) {
  void* const block = malloc((size_t) n);
  *ptr = block;
  if (block == NULL) {
    perror("Allocation failed");
  }
 }
 /* Free the block stored in *ptr and reset the caller's pointer to NULL. */
 void gpu_deallocate(void** ptr) {
  void* const block = *ptr;
  *ptr = NULL;
  free(block);
 }
 /* Memory transfer functions */
 /* "Upload" in the CPU fallback: copy n bytes from cpu_ptr to gpu_ptr with memcpy. */
 void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
  const size_t nbytes = (size_t) n;
  memcpy(gpu_ptr, cpu_ptr, nbytes);
 }
 /* "Download" in the CPU fallback: copy n bytes from gpu_ptr to cpu_ptr with memcpy. */
 void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
  const size_t nbytes = (size_t) n;
  memcpy(cpu_ptr, gpu_ptr, nbytes);
 }
 /* Copy n bytes from gpu_ptr_src to gpu_ptr_dest with memcpy. */
 void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
  const size_t nbytes = (size_t) n;
  memcpy(gpu_ptr_dest, gpu_ptr_src, nbytes);
 }
 /* Streams */
 /* Streams carry no state in the CPU fallback: a one-byte allocation stands in for the stream. */
 void gpu_stream_create(void** ptr) {
  void* const placeholder = (void*) malloc(sizeof(char));
  *ptr = placeholder;
 }
 /* Free the stream placeholder and reset the caller's pointer to NULL. */
 void gpu_stream_destroy(void** ptr) {
  void* const placeholder = *ptr;
  *ptr = NULL;
  free(placeholder);
 }
 /* No-op in the CPU fallback: both arguments are ignored. */
 void gpu_set_stream(void* handle, void* stream) {
  (void) handle;
  (void) stream;
 }
 /* No-op in the CPU fallback. */
 void gpu_synchronize() {
  /* nothing to do */
 }
 /* BLAS functions */
 /* The CPU fallback keeps no BLAS state: a one-byte allocation stands in for the handle. */
 void gpu_blas_create(void** handle) {
  void* const placeholder = (void*) malloc(sizeof(char));
  *handle = placeholder;
 }
 /* Free the handle placeholder and reset the caller's pointer to NULL. */
 void gpu_blas_destroy(void** handle) {
  void* const placeholder = *handle;
  *handle = NULL;
  free(placeholder);
 }
 /* External ddot_ routine (presumably the Fortran BLAS symbol): every argument is passed by address. */
 double ddot_(const int32_t* n,
              const double* x, const int32_t* incx,
              const double* y, const int32_t* incy);
 /* Double-precision dot product: forwards to ddot_ and stores the value in *result.
    The 64-bit sizes are narrowed to int32_t and the asserts reject values that do not fit. */
 void gpu_ddot(void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
  assert (handle != NULL);
  /* Narrow to the 32-bit integers ddot_ is declared with */
  const int32_t n_    = (int32_t) n;
  const int32_t incx_ = (int32_t) incx;
  const int32_t incy_ = (int32_t) incy;
  /* Narrowing must not have changed any value */
  assert ( (int64_t)    n_ == n   );
  assert ( (int64_t) incx_ == incx);
  assert ( (int64_t) incy_ == incy);
  const double dot = ddot_(&n_, x, &incx_, y, &incy_);
  *result = dot;
 }
 /* External sdot_ routine (presumably the Fortran BLAS symbol): every argument is passed by address. */
 float sdot_(const int32_t* n,
             const float* x, const int32_t* incx,
             const float* y, const int32_t* incy);
 /* Single-precision dot product: forwards to sdot_ and stores the value in *result.
    The 64-bit sizes are narrowed to int32_t and the asserts reject values that do not fit. */
 void gpu_sdot(void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
  assert (handle != NULL);
  /* Narrow to the 32-bit integers sdot_ is declared with */
  const int32_t n_    = (int32_t) n;
  const int32_t incx_ = (int32_t) incx;
  const int32_t incy_ = (int32_t) incy;
  /* Narrowing must not have changed any value */
  assert ( (int64_t)    n_ == n   );
  assert ( (int64_t) incx_ == incx);
  assert ( (int64_t) incy_ == incy);
  const float dot = sdot_(&n_, x, &incx_, y, &incy_);
  *result = dot;
 }
 /* External dgemv_ routine (presumably the Fortran BLAS symbol): every argument is passed by address. */
 void dgemv_(const char* transa, const int32_t* m, const int32_t* n,
             const double* alpha, const double* a, const int32_t* lda,
             const double* x, const int32_t* incx,
             const double* beta, double* y, const int32_t* incy);
 /* Double-precision matrix-vector product: forwards to dgemv_, passing transa through unchanged.
    The 64-bit sizes are narrowed to int32_t and the asserts reject values that do not fit. */
 void gpu_dgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const double* alpha,
               const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) {
  assert (handle != NULL);
  /* Narrow to the 32-bit integers dgemv_ is declared with */
  const int32_t m_    = (int32_t) m;
  const int32_t n_    = (int32_t) n;
  const int32_t lda_  = (int32_t) lda;
  const int32_t incx_ = (int32_t) incx;
  const int32_t incy_ = (int32_t) incy;
  /* Narrowing must not have changed any value */
  assert ( (int64_t)    m_ == m   );
  assert ( (int64_t)    n_ == n   );
  assert ( (int64_t)  lda_ == lda );
  assert ( (int64_t) incx_ == incx);
  assert ( (int64_t) incy_ == incy);
  dgemv_(transa, &m_, &n_, alpha, a, &lda_, x, &incx_, beta, y, &incy_);
 }
 /* External sgemv_ routine (presumably the Fortran BLAS symbol): every argument is passed by address. */
 void sgemv_(const char* transa, const int32_t* m, const int32_t* n,
             const float* alpha, const float* a, const int32_t* lda,
             const float* x, const int32_t* incx,
             const float* beta, float* y, const int32_t* incy);
 /* Single-precision matrix-vector product: forwards to sgemv_, passing transa through unchanged.
    The 64-bit sizes are narrowed to int32_t and the asserts reject values that do not fit. */
 void gpu_sgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const float* alpha,
               const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy) {
  assert (handle != NULL);
  /* Narrow to the 32-bit integers sgemv_ is declared with */
  const int32_t m_    = (int32_t) m;
  const int32_t n_    = (int32_t) n;
  const int32_t lda_  = (int32_t) lda;
  const int32_t incx_ = (int32_t) incx;
  const int32_t incy_ = (int32_t) incy;
  /* Narrowing must not have changed any value */
  assert ( (int64_t)    m_ == m   );
  assert ( (int64_t)    n_ == n   );
  assert ( (int64_t)  lda_ == lda );
  assert ( (int64_t) incx_ == incx);
  assert ( (int64_t) incy_ == incy);
  sgemv_(transa, &m_, &n_, alpha, a, &lda_, x, &incx_, beta, y, &incy_);
 }
 /* External dgemm_ routine (presumably the Fortran BLAS symbol): every argument is passed by address. */
 void dgemm_(const char* transa, const char* transb,
             const int32_t* m, const int32_t* n, const int32_t* k,
             const double* alpha, const double* a, const int32_t* lda,
             const double* b, const int32_t* ldb,
             const double* beta, double* c, const int32_t* ldc);
 /* Double-precision matrix-matrix product: forwards to dgemm_, passing transa/transb through unchanged.
    The 64-bit sizes are narrowed to int32_t and the asserts reject values that do not fit. */
 void gpu_dgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha,
               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) {
  assert (handle != NULL);
  /* Narrow to the 32-bit integers dgemm_ is declared with */
  const int32_t m_   = (int32_t) m;
  const int32_t n_   = (int32_t) n;
  const int32_t k_   = (int32_t) k;
  const int32_t lda_ = (int32_t) lda;
  const int32_t ldb_ = (int32_t) ldb;
  const int32_t ldc_ = (int32_t) ldc;
  /* Narrowing must not have changed any value */
  assert ( (int64_t)   m_ == m  );
  assert ( (int64_t)   n_ == n  );
  assert ( (int64_t)   k_ == k  );
  assert ( (int64_t) lda_ == lda);
  assert ( (int64_t) ldb_ == ldb);
  assert ( (int64_t) ldc_ == ldc);
  dgemm_(transa, transb, &m_, &n_, &k_, alpha, a, &lda_, b, &ldb_, beta, c, &ldc_);
 }
 /* External sgemm_ routine (presumably the Fortran BLAS symbol): every argument is passed by address. */
 void sgemm_(const char* transa, const char* transb,
             const int32_t* m, const int32_t* n, const int32_t* k,
             const float* alpha, const float* a, const int32_t* lda,
             const float* b, const int32_t* ldb,
             const float* beta, float* c, const int32_t* ldc);
 /* Single-precision matrix-matrix product: forwards to sgemm_, passing transa/transb through unchanged.
    The 64-bit sizes are narrowed to int32_t and the asserts reject values that do not fit. */
 void gpu_sgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float* alpha,
               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc) {
  assert (handle != NULL);
  /* Narrow to the 32-bit integers sgemm_ is declared with */
  const int32_t m_   = (int32_t) m;
  const int32_t n_   = (int32_t) n;
  const int32_t k_   = (int32_t) k;
  const int32_t lda_ = (int32_t) lda;
  const int32_t ldb_ = (int32_t) ldb;
  const int32_t ldc_ = (int32_t) ldc;
  /* Narrowing must not have changed any value */
  assert ( (int64_t)   m_ == m  );
  assert ( (int64_t)   n_ == n  );
  assert ( (int64_t)   k_ == k  );
  assert ( (int64_t) lda_ == lda);
  assert ( (int64_t) ldb_ == ldb);
  assert ( (int64_t) ldc_ == ldc);
  sgemm_(transa, transb, &m_, &n_, &k_, alpha, a, &lda_, b, &ldb_, beta, c, &ldc_);
 }
 /* C = alpha*op(A) + beta*op(B) on an m x n result, all matrices column-major
    (element (i,j) of C is c[j*ldc+i]).
    op() is the identity for 'N'/'n' and the transpose for 'T'/'t'; with any
    other flag nothing is written.
    When *alpha == 0 only B is read; otherwise when *beta == 0 only A is read. */
 void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha,
               const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
  assert (handle != NULL);
  /* Decode the operation flags */
  const bool a_asis  = (*transa == 'N' || *transa == 'n');
  const bool a_trans = (*transa == 'T' || *transa == 't');
  const bool b_asis  = (*transb == 'N' || *transb == 'n');
  const bool b_trans = (*transb == 'T' || *transb == 't');
  if ( !(a_asis || a_trans) || !(b_asis || b_trans) ) {
    return;   /* unrecognised flag: c is left untouched */
  }
  /* Offset of op(X)(i,j) is i*x_di + j*x_dj: a transposed operand swaps the two strides */
  const int64_t a_di = a_trans ? lda : 1;
  const int64_t a_dj = a_trans ? 1   : lda;
  const int64_t b_di = b_trans ? ldb : 1;
  const int64_t b_dj = b_trans ? 1   : ldb;
  /* A zero coefficient means the corresponding matrix is never read */
  const bool only_b = (*alpha == 0.);
  const bool only_a = !only_b && (*beta == 0.);
  for (int64_t j=0 ; j<n ; ++j) {
    for (int64_t i=0 ; i<m ; ++i) {
      double* const out = &c[j*ldc+i];
      if (only_b) {
        *out = *beta * b[i*b_di + j*b_dj];
      } else if (only_a) {
        *out = *alpha * a[i*a_di + j*a_dj];
      } else {
        *out = *alpha * a[i*a_di + j*a_dj] + *beta * b[i*b_di + j*b_dj];
      }
    }
  }
 }
 /* Single-precision C = alpha*op(A) + beta*op(B) on an m x n result, all
    matrices column-major (element (i,j) of C is c[j*ldc+i]).
    op() is the identity for 'N'/'n' and the transpose for 'T'/'t'; with any
    other flag nothing is written.
    When *alpha == 0 only B is read; otherwise when *beta == 0 only A is read. */
 void gpu_sgeam(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float* alpha,
               const float* a, const int64_t lda, const float* beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
  assert (handle != NULL);
  /* Decode the operation flags */
  const bool a_asis  = (*transa == 'N' || *transa == 'n');
  const bool a_trans = (*transa == 'T' || *transa == 't');
  const bool b_asis  = (*transb == 'N' || *transb == 'n');
  const bool b_trans = (*transb == 'T' || *transb == 't');
  if ( !(a_asis || a_trans) || !(b_asis || b_trans) ) {
    return;   /* unrecognised flag: c is left untouched */
  }
  /* Offset of op(X)(i,j) is i*x_di + j*x_dj: a transposed operand swaps the two strides */
  const int64_t a_di = a_trans ? lda : 1;
  const int64_t a_dj = a_trans ? 1   : lda;
  const int64_t b_di = b_trans ? ldb : 1;
  const int64_t b_dj = b_trans ? 1   : ldb;
  /* A zero coefficient means the corresponding matrix is never read */
  const bool only_b = (*alpha == 0.);
  const bool only_a = !only_b && (*beta == 0.);
  for (int64_t j=0 ; j<n ; ++j) {
    for (int64_t i=0 ; i<m ; ++i) {
      float* const out = &c[j*ldc+i];
      if (only_b) {
        *out = *beta * b[i*b_di + j*b_dj];
      } else if (only_a) {
        *out = *alpha * a[i*a_di + j*a_dj];
      } else {
        *out = *alpha * a[i*a_di + j*a_dj] + *beta * b[i*b_di + j*b_dj];
      }
    }
  }
 }
--- a/plugins/local/non_h_ints_mu/deb_aos.irp.f
+++ b/plugins/local/non_h_ints_mu/deb_aos.irp.f
@ -31,6 +31,9 @@ subroutine print_aos()
  integer          :: i, ipoint
  double precision :: r(3)
  double precision :: ao_val, ao_der(3), ao_lap
  double precision :: accu_vgl(5)
  double precision :: accu_vgl_nrm(5)
  double precision :: mo_val, mo_der(3), mo_lap
  PROVIDE final_grid_points aos_in_r_array aos_grad_in_r_array aos_lapl_in_r_array
@ -40,9 +43,6 @@ subroutine print_aos()
    write(1000, '(3(f15.7, 3X))') r
  enddo
 double precision :: accu_vgl(5)
 double precision :: accu_vgl_nrm(5)
  do ipoint = 1, n_points_final_grid
    do i = 1, ao_num
      ao_val    = aos_in_r_array     (i,ipoint)
--- a/plugins/local/non_h_ints_mu/total_tc_int.irp.f
+++ b/plugins/local/non_h_ints_mu/total_tc_int.irp.f
@ -78,7 +78,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
      !$OMP PRIVATE (i, j, k, l, ipoint, ao_i_r, ao_k_r, weight1) &
      !$OMP SHARED (ao_num, n_points_final_grid, ao_two_e_tc_tot, &
      !$OMP         aos_in_r_array_transp, final_weight_at_r_vector, int2_grad1_u12_square_ao)
-      !$OMP DO COLLAPSE(4)
+      !$OMP DO COLLAPSE(3)
      do i = 1, ao_num
        do k = 1, ao_num
          do l = 1, ao_num
@ -188,7 +188,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
        !$OMP SHARED (ao_num, n_points_final_grid, ao_two_e_tc_tot,    &
        !$OMP         aos_in_r_array_transp, final_weight_at_r_vector, &
        !$OMP         int2_grad1_u12_ao, aos_grad_in_r_array_transp_bis)
-        !$OMP DO COLLAPSE(4)
+        !$OMP DO COLLAPSE(3)
        do i = 1, ao_num
          do k = 1, ao_num
            do l = 1, ao_num
@ -270,7 +270,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
      !$OMP PARALLEL DEFAULT(NONE)                     &
      !$OMP PRIVATE(i, j, k, l, integ_zero, integ_val) & 
      !$OMP SHARED(ao_num, ao_two_e_tc_tot)
-      !$OMP DO COLLAPSE(4)
+      !$OMP DO COLLAPSE(3)
      do j = 1, ao_num
        do l = 1, ao_num
          do i = 1, ao_num
@ -293,7 +293,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
      !$OMP PARALLEL DEFAULT(NONE)                            &
      !$OMP SHARED(ao_num, ao_two_e_tc_tot, ao_integrals_map) &
      !$OMP PRIVATE(i, j, k, l)
-      !$OMP DO COLLAPSE(4)
+      !$OMP DO COLLAPSE(3)
      do j = 1, ao_num
        do l = 1, ao_num
          do i = 1, ao_num
@ -306,7 +306,6 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
      enddo
      !$OMP END DO
      !$OMP END PARALLEL
      !call clear_ao_map()
      FREE ao_integrals_map
    endif
--- a/plugins/local/tc_int/NEED
+++ b/plugins/local/tc_int/NEED
@ -0,0 +1,5 @@
 tc_keywords
 jastrow
 qmckl
 becke_numerical_grid
 dft_utils_in_r
--- a/plugins/local/tc_int/README.rst
+++ b/plugins/local/tc_int/README.rst
@ -0,0 +1,4 @@
 ======
 tc_int
 ======
--- a/plugins/local/tc_int/compute_tc_int.irp.f
+++ b/plugins/local/tc_int/compute_tc_int.irp.f
@ -0,0 +1,295 @@
 ! ---
 subroutine provide_int2_grad1_u12_ao()
  BEGIN_DOC
  !
  ! Builds the Jastrow-gradient integrals int2_grad1_u12_ao on the grid and the
  ! total two-electron TC integrals tc_int_2e_ao in the AO basis, then writes
  ! both of them to the EZFIO work directory. Both arrays are local to this
  ! routine and are released before returning.
  !
  ! int2_grad1_u12_ao(i,j,ipoint,1) = \int dr2         [\grad1 u(r1,r2)]_x1 \chi_i(r2) \chi_j(r2)
  ! int2_grad1_u12_ao(i,j,ipoint,2) = \int dr2         [\grad1 u(r1,r2)]_y1 \chi_i(r2) \chi_j(r2)
  ! int2_grad1_u12_ao(i,j,ipoint,3) = \int dr2         [\grad1 u(r1,r2)]_z1 \chi_i(r2) \chi_j(r2)
  ! int2_grad1_u12_ao(i,j,ipoint,4) = \int dr2 [-(1/2) [\grad1 u(r1,r2)]^2] \chi_i(r2) \chi_j(r2)
  !
  !
  ! tc_int_2e_ao(k,i,l,j) = (ki|V^TC(r_12)|lj)
  !                       = <lk| V^TC(r_12) |ji> where V^TC(r_12) is the total TC operator
  !                       = tc_grad_and_lapl_ao(k,i,l,j) + tc_grad_square_ao(k,i,l,j) + ao_two_e_coul(k,i,l,j)
  ! where:
  !
  ! tc_grad_and_lapl_ao(k,i,l,j) = < k l | -1/2 \Delta_1 u(r1,r2) - \grad_1 u(r1,r2) . \grad_1 | ij >
  !                              = -1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2      \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2)
  !                              =  1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 (-1) \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2)
  !
  ! tc_grad_square_ao(k,i,l,j) = -1/2 <kl | |\grad_1 u(r1,r2)|^2 + |\grad_2 u(r1,r2)|^2 | ij>
  !
  ! ao_two_e_coul(k,i,l,j) = < l k | 1/r12 | j i > = ( k i | 1/r12 | l j )
  !
  ! Files written (unformatted, unit 11):
  !   <ezfio>/work/int2_grad1_u12_ao : components 1:3 of int2_grad1_u12_ao in a single record
  !   <ezfio>/work/ao_two_e_tc_tot   : tc_int_2e_ao, one record per value of the last index
  !
  END_DOC
  implicit none
  integer                       :: i, j, k, l, m, ipoint, jpoint
  integer                       :: n_blocks, n_rest, n_pass
  integer                       :: i_blocks, i_rest, i_pass, ii
  double precision              :: mem, n_double
  double precision              :: weight1, ao_k_r, ao_i_r
  double precision              :: time0, time1, time2, tc1, tc2, tc
  double precision              :: time_tc0
  double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:), tc_int_2e_ao(:,:,:,:)
  double precision, allocatable :: tmp(:,:,:), c_mat(:,:,:), tmp_grad1_u12(:,:,:)
  double precision, external    :: get_ao_two_e_integral
  ! Every entity read inside the OpenMP regions below is provided up front.
  PROVIDE final_weight_at_r_vector_extra aos_in_r_array_extra aos_in_r_array_extra_transp
  PROVIDE final_weight_at_r_vector aos_grad_in_r_array_transp_bis aos_in_r_array_transp
  print*, ' start provide_int2_grad1_u12_ao ...'
  call wall_time(time0)
  ! Size of the r1 blocks: 4 components x n_points_extra_final_grid doubles are
  ! needed per r1 point in tmp_grad1_u12.
  ! NOTE(review): mem is presumably expressed in GB, 1.d8 being a conservative
  ! number of doubles per GB -- confirm against total_memory / qp_max_mem.
  call total_memory(mem)
  mem      = max(1.d0, qp_max_mem - mem)
  n_double = mem * 1.d8
  ! At least one point per block: a zero block size would make the mod() and
  ! the division below undefined.
  n_blocks = max(1, int(min(n_double / (n_points_extra_final_grid * 4.d0), 1.d0*n_points_final_grid)))
  n_rest   = int(mod(n_points_final_grid, n_blocks))
  n_pass   = int((n_points_final_grid - n_rest) / n_blocks)
  call write_int(6, n_pass, 'Number of passes')
  call write_int(6, n_blocks, 'Size of the blocks')
  call write_int(6, n_rest, 'Size of the last block')
  ! ---
  ! Step 1: int2_grad1_u12_ao
  ! ---
  allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,4))
  ! tmp(jpoint,i,j) = w(r2) chi_i(r2) chi_j(r2) on the extra grid
  allocate(tmp(n_points_extra_final_grid,ao_num,ao_num))
  !$OMP PARALLEL               &
  !$OMP DEFAULT (NONE)         &
  !$OMP PRIVATE (j, i, jpoint) &
  !$OMP SHARED (tmp, ao_num, n_points_extra_final_grid, final_weight_at_r_vector_extra, aos_in_r_array_extra_transp)
  !$OMP DO SCHEDULE (static)
  do j = 1, ao_num
    do i = 1, ao_num
      do jpoint = 1, n_points_extra_final_grid
        tmp(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j)
      enddo
    enddo
  enddo
  !$OMP END DO
  !$OMP END PARALLEL
  ! Full blocks of n_blocks r1 points
  allocate(tmp_grad1_u12(n_points_extra_final_grid,n_blocks,4))
  tc = 0.d0
  do i_pass = 1, n_pass
    ii = (i_pass-1)*n_blocks + 1
    call wall_time(tc1)
    !$OMP PARALLEL                   &
    !$OMP DEFAULT (NONE)             &
    !$OMP PRIVATE (i_blocks, ipoint) &
    !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12)
    !$OMP DO
    do i_blocks = 1, n_blocks
      ipoint = ii - 1 + i_blocks ! r1
      call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_blocks,1), tmp_grad1_u12(1,i_blocks,2), tmp_grad1_u12(1,i_blocks,3), tmp_grad1_u12(1,i_blocks,4))
    enddo
    !$OMP END DO
    !$OMP END PARALLEL
    call wall_time(tc2)
    tc = tc + tc2 - tc1
    ! int2_grad1_u12_ao(:,:,ii:ii+n_blocks-1,m) = tmp^T . tmp_grad1_u12(:,:,m)
    do m = 1, 4
      call dgemm( "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0                     &
                , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid &
                , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num)
    enddo
  enddo
  deallocate(tmp_grad1_u12)
  ! Remaining n_rest r1 points, treated as one smaller block
  if(n_rest .gt. 0) then
    allocate(tmp_grad1_u12(n_points_extra_final_grid,n_rest,4))
    ii = n_pass*n_blocks + 1
    call wall_time(tc1)
    !$OMP PARALLEL                 &
    !$OMP DEFAULT (NONE)           &
    !$OMP PRIVATE (i_rest, ipoint) &
    !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12)
    !$OMP DO
    do i_rest = 1, n_rest
      ipoint = ii - 1 + i_rest ! r1
      call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_rest,1), tmp_grad1_u12(1,i_rest,2), tmp_grad1_u12(1,i_rest,3), tmp_grad1_u12(1,i_rest,4))
    enddo
    !$OMP END DO
    !$OMP END PARALLEL
    call wall_time(tc2)
    tc = tc + tc2 - tc1
    do m = 1, 4
      call dgemm( "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0                       &
                , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid &
                , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num)
    enddo
    deallocate(tmp_grad1_u12)
  endif
  deallocate(tmp)
  call wall_time(time1)
  print*, ' wall time for int2_grad1_u12_ao (min) = ', (time1-time0) / 60.d0
  print*, ' wall time Jastrow derivatives   (min) = ', tc / 60.d0
  call print_memory_usage()
  ! ---
  ! Step 2: tc_int_2e_ao
  ! ---
  allocate(tc_int_2e_ao(ao_num,ao_num,ao_num,ao_num))
  ! time_tc0 marks the start of the whole tc_int_2e_ao construction; time1 is
  ! reset before each sub-step below.
  call wall_time(time_tc0)
  time1 = time_tc0
  ! Contribution of component 4 (the -1/2 |grad1 u|^2 integrals):
  ! c_mat(ipoint,k,i) = w(r1) chi_i(r1) chi_k(r1)
  allocate(c_mat(n_points_final_grid,ao_num,ao_num))
  !$OMP PARALLEL               &
  !$OMP DEFAULT (NONE)         &
  !$OMP PRIVATE (i, k, ipoint) &
  !$OMP SHARED (aos_in_r_array_transp, c_mat, ao_num, n_points_final_grid, final_weight_at_r_vector)
  !$OMP DO SCHEDULE (static)
  do i = 1, ao_num
    do k = 1, ao_num
      do ipoint = 1, n_points_final_grid
        c_mat(ipoint,k,i) = final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,k)
      enddo
    enddo
  enddo
  !$OMP END DO
  !$OMP END PARALLEL
  ! beta = 0.d0: this call initializes tc_int_2e_ao
  call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0            &
            , int2_grad1_u12_ao(1,1,1,4), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
            , 0.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num)
  deallocate(c_mat)
  call wall_time(time2)
  print*, ' wall time of Hermitian part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0
  call print_memory_usage()
  ! ---
  ! Contribution of components 1:3 (gradient integrals), accumulated with
  ! alpha = -1.d0 and beta = 1.d0:
  ! c_mat(ipoint,k,i) = 1/2 w(r1) [chi_k(r1) d_m chi_i(r1) - chi_i(r1) d_m chi_k(r1)]
  call wall_time(time1)
  allocate(c_mat(n_points_final_grid,ao_num,ao_num))
  do m = 1, 3
    !$OMP PARALLEL                                                              &
    !$OMP DEFAULT (NONE)                                                        &
    !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r)                       &
    !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, c_mat, &
    !$OMP         ao_num, n_points_final_grid, final_weight_at_r_vector, m)
    !$OMP DO SCHEDULE (static)
    do i = 1, ao_num
      do k = 1, ao_num
        do ipoint = 1, n_points_final_grid
          weight1 = 0.5d0 * final_weight_at_r_vector(ipoint)
          ao_i_r  = aos_in_r_array_transp(ipoint,i)
          ao_k_r  = aos_in_r_array_transp(ipoint,k)
          c_mat(ipoint,k,i) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,m) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,m))
        enddo
      enddo
    enddo
    !$OMP END DO
    !$OMP END PARALLEL
    call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, -1.d0           &
              , int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
              , 1.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num)
  enddo
  deallocate(c_mat)
  call wall_time(time2)
  print*, ' wall time of non-Hermitian part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0
  call print_memory_usage()
  ! ---
  ! tc_int_2e_ao is handled as an (ao_num*ao_num) x (ao_num*ao_num) matrix.
  ! NOTE(review): sum_A_At presumably replaces A by A + A^T -- confirm.
  call wall_time(time1)
  call sum_A_At(tc_int_2e_ao(1,1,1,1), ao_num*ao_num)
  call wall_time(time2)
  print*, ' lower- and upper-triangle of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0
  call print_memory_usage()
  ! ---
  ! Coulomb contribution
  call wall_time(time1)
  PROVIDE ao_integrals_map
  !$OMP PARALLEL DEFAULT(NONE)                         &
  !$OMP SHARED(ao_num, tc_int_2e_ao, ao_integrals_map) &
  !$OMP PRIVATE(i, j, k, l)
  !$OMP DO COLLAPSE(3)
  do j = 1, ao_num
    do l = 1, ao_num
      do i = 1, ao_num
        do k = 1, ao_num
          !                                               < 1:i, 2:j | 1:k, 2:l >
          tc_int_2e_ao(k,i,l,j) = tc_int_2e_ao(k,i,l,j) + get_ao_two_e_integral(i, j, k, l, ao_integrals_map)
        enddo
      enddo
    enddo
  enddo
  !$OMP END DO
  !$OMP END PARALLEL
  call wall_time(time2)
  print*, ' wall time of Coulomb part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0
  call print_memory_usage()
  ! ---
  ! Step 3: write both arrays to the EZFIO work directory
  ! ---
  print*, ' Writing int2_grad1_u12_ao in ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao'
  open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="write")
  ! Flag the work directory as non-empty (once is enough for both files)
  call ezfio_set_work_empty(.False.)
    write(11) int2_grad1_u12_ao(:,:,:,1:3)
  close(11)
  print*, ' Saving tc_int_2e_ao in ', trim(ezfio_filename) // '/work/ao_two_e_tc_tot'
  open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/ao_two_e_tc_tot', action="write")
  do i = 1, ao_num
    write(11) tc_int_2e_ao(:,:,:,i)
  enddo
  close(11)
  ! ----
  deallocate(int2_grad1_u12_ao)
  deallocate(tc_int_2e_ao)
  call wall_time(time2)
  ! Measured from the start of step 2, so that every contribution is included
  print*, ' wall time for tc_int_2e_ao (min) = ', (time2-time_tc0) / 60.d0
  call print_memory_usage()
  ! ---
  call wall_time(time1)
  print*, ' wall time for TC-integrals (min) = ', (time1-time0) / 60.d0
  return
 end
 ! ---
--- a/plugins/local/tc_int/jast_grad_full.irp.f
+++ b/plugins/local/tc_int/jast_grad_full.irp.f
@ -0,0 +1,245 @@
 ! ---
 subroutine get_grad1_u12_for_tc(ipoint, n_grid2, resx, resy, resz, res)
  BEGIN_DOC
  !
  ! For r1 = final_grid_points(:,ipoint) and every r2 point jpoint = 1, n_grid2:
  !
  ! resx(jpoint) =      [grad1 u(r1,r2)]_x1
  ! resy(jpoint) =      [grad1 u(r1,r2)]_y1
  ! resz(jpoint) =      [grad1 u(r1,r2)]_z1
  ! res (jpoint) = -0.5 [grad1 u(r1,r2)]^2
  !
  ! We use:
  !       grid for r1
  ! extra_grid for r2
  !
  ! ipoint  : index of the r1 point in final_grid_points
  ! n_grid2 : number of r2 points, i.e. the size of the four output arrays
  !
  END_DOC
  include 'constants.include.F'
  implicit none
  integer,          intent(in)  :: ipoint, n_grid2
  double precision, intent(out) :: resx(n_grid2), resy(n_grid2), resz(n_grid2), res(n_grid2)
  integer                       :: jpoint
  double precision              :: r1(3)
  r1(1) = final_grid_points(1,ipoint)
  r1(2) = final_grid_points(2,ipoint)
  r1(3) = final_grid_points(3,ipoint)
  ! The three gradient components for all the r2 points at once
  call grad1_j12_r1_seq(r1, n_grid2, resx, resy, resz)
  ! -1/2 of the squared norm of the gradient
  do jpoint = 1, n_grid2 ! r2
    res(jpoint) = -0.5d0 * (resx(jpoint) * resx(jpoint) + resy(jpoint) * resy(jpoint) + resz(jpoint) * resz(jpoint))
  enddo
  return
 end
 ! ---
 subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
  !
  ! Gradient with respect to r1 of the two-electron Jastrow term, evaluated
  ! for one r1 and for the first n_grid2 points r2 of final_grid_points_extra.
  !
  ! With f1A = f(r1,R_A), f2A = f(r2,R_A) and g12 = f(r1,r2), all three given by
  ! jBH_elem_fct_grad_alpha1, each term p of nucleus A has exponents
  ! (m,n,o) = (jBH_m, jBH_n, jBH_o)(p,A) and coefficient c = jBH_c(p,A)
  ! (halved when m = n), and adds
  !
  !   c * g12^o * (m f1A^(m-1) f2A^n + n f1A^(n-1) f2A^m) * grad1 f1A
  ! + c * o g12^(o-1) * (f1A^m f2A^n + f1A^n f2A^m)        * grad1 g12
  !
  ! to the gradient.
  ! NOTE(review): "jBH" presumably stands for a Boys-Handy Jastrow -- confirm.
  !
  ! r1                  : position of electron 1
  ! n_grid2             : number of r2 points to process
  ! gradx, grady, gradz : x, y and z components of the gradient for each r2
  !
  include 'constants.include.F'
  implicit none
  integer         , intent(in)  :: n_grid2
  double precision, intent(in)  :: r1(3)
  double precision, intent(out) :: gradx(n_grid2)
  double precision, intent(out) :: grady(n_grid2)
  double precision, intent(out) :: gradz(n_grid2)
  integer                       :: jpoint, i_nucl, p, mpA, npA, opA
  double precision              :: r2(3)
  double precision              :: tmp, tmp1, tmp2
  double precision              :: rn(3), f1A, grad1_f1A(3), f2A, grad2_f2A(3), g12, grad1_g12(3)
  integer                       :: powmax1, powmax, powmax2
  double precision, allocatable :: f1A_power(:), f2A_power(:), double_p(:), g12_power(:)
  ! Largest exponents: powmax1 for f1A and f2A, powmax2 for g12
  powmax1 = max(maxval(jBH_m), maxval(jBH_n))
  powmax2 = maxval(jBH_o)
  powmax  = max(powmax1, powmax2)
  allocate(f1A_power(-1:powmax), f2A_power(-1:powmax), g12_power(-1:powmax), double_p(0:powmax))
  do p = 0, powmax
    double_p(p) = dble(p)
  enddo
  ! Index -1 holds 0 so that the derivative term p * x^(p-1) can be formed
  ! for p = 0 without a special case.
  f1A_power(-1) = 0.d0
  f2A_power(-1) = 0.d0
  g12_power(-1) = 0.d0
  f1A_power(0) = 1.d0
  f2A_power(0) = 1.d0
  g12_power(0) = 1.d0
  do jpoint = 1, n_grid2 ! r2
    r2(1) = final_grid_points_extra(1,jpoint)
    r2(2) = final_grid_points_extra(2,jpoint)
    r2(3) = final_grid_points_extra(3,jpoint)
    gradx(jpoint) = 0.d0
    grady(jpoint) = 0.d0
    gradz(jpoint) = 0.d0
    ! Electron-electron function and its gradient with respect to r1
    call jBH_elem_fct_grad_alpha1(r1, r2, g12, grad1_g12)
    do p = 1, powmax2
      g12_power(p) = g12_power(p-1) * g12
    enddo
    do i_nucl = 1, nucl_num
      rn(1) = nucl_coord(i_nucl,1)
      rn(2) = nucl_coord(i_nucl,2)
      rn(3) = nucl_coord(i_nucl,3)
      ! Electron-nucleus functions. grad2_f2A is returned by the call but is
      ! not needed for the gradient with respect to r1.
      ! NOTE(review): f1A and grad1_f1A depend on r1 and rn only, not on
      ! jpoint; they could be tabulated once per nucleus outside the r2 loop.
      call jBH_elem_fct_grad_alpha1(r1, rn, f1A, grad1_f1A)
      call jBH_elem_fct_grad_alpha1(r2, rn, f2A, grad2_f2A)
      ! Compute powers of f1A and f2A
      do p = 1, powmax1
        f1A_power(p) = f1A_power(p-1) * f1A
        f2A_power(p) = f2A_power(p-1) * f2A
      enddo
      do p = 1, jBH_size
        mpA = jBH_m(p,i_nucl)
        npA = jBH_n(p,i_nucl)
        opA = jBH_o(p,i_nucl)
        tmp = jBH_c(p,i_nucl)
        ! For m = n the two symmetrized products are identical: halve the
        ! coefficient instead of counting the term twice.
        if(mpA .eq. npA) then
          tmp = tmp * 0.5d0
        endif
        ! tmp1: factor of grad1 f1A ; tmp2: factor of grad1 g12
        tmp1 = double_p(mpA) * f1A_power(mpA-1) * f2A_power(npA) + double_p(npA) * f1A_power(npA-1) * f2A_power(mpA)
        tmp1 = tmp1 * g12_power(opA) * tmp
        tmp2 = double_p(opA) * g12_power(opA-1) * (f1A_power(mpA) * f2A_power(npA) + f1A_power(npA) * f2A_power(mpA)) * tmp
        gradx(jpoint) = gradx(jpoint) + tmp1 * grad1_f1A(1) + tmp2 * grad1_g12(1)
        grady(jpoint) = grady(jpoint) + tmp1 * grad1_f1A(2) + tmp2 * grad1_g12(2)
        gradz(jpoint) = gradz(jpoint) + tmp1 * grad1_f1A(3) + tmp2 * grad1_g12(3)
      enddo ! p
    enddo ! i_nucl
  enddo ! jpoint
  return
 end
 subroutine jBH_elem_fct_grad_alpha1(r1, r2, fct, grad1_fct)
  !
  ! Elementary function d / (1 + d), with d = |r1 - r2|, and its gradient
  ! with respect to r1.
  !
  ! r1, r2    : the two positions
  ! fct       : d / (1 + d)
  ! grad1_fct : (r1 - r2) / (d * (1 + d)^2)
  !
  ! Both outputs are set to zero when the squared distance is below 1d-15.
  !
  implicit none
  double precision, intent(in)  :: r1(3), r2(3)
  double precision, intent(out) :: fct, grad1_fct(3)
  double precision              :: dr(3), d2, d, inv_1pd, scal
  dr(1) = r1(1) - r2(1)
  dr(2) = r1(2) - r2(2)
  dr(3) = r1(3) - r2(3)
  d2 = dr(1) * dr(1) + dr(2) * dr(2) + dr(3) * dr(3)
  ! Coincident points: the gradient is not defined, return zeros
  if(.not. (d2 .ge. 1d-15)) then
    fct = 0.d0
    grad1_fct(1) = 0.d0
    grad1_fct(2) = 0.d0
    grad1_fct(3) = 0.d0
    return
  endif
  d       = dsqrt( d2 )
  inv_1pd = 1.d0 / (1.d0 + d)
  fct     = d * inv_1pd
  scal    = inv_1pd * inv_1pd / d
  grad1_fct(1) = scal * dr(1)
  grad1_fct(2) = scal * dr(2)
  grad1_fct(3) = scal * dr(3)
  return
 end
 ! ---
--- a/plugins/local/tc_int/jast_utils_bh.irp.f
+++ b/plugins/local/tc_int/jast_utils_bh.irp.f
@ -0,0 +1,43 @@
 ! ---
 subroutine jBH_elem_fct_grad(alpha, r1, r2, fct, grad1_fct)
  !
  ! Elementary function alpha d / (1 + alpha d), with d = |r1 - r2|, and its
  ! gradient with respect to r1.
  !
  ! alpha     : scaling parameter applied to the distance
  ! r1, r2    : the two positions
  ! fct       : alpha d / (1 + alpha d)
  ! grad1_fct : alpha (r1 - r2) / (d * (1 + alpha d)^2)
  !
  ! Both outputs are set to zero when the squared distance is below 1d-15.
  !
  implicit none
  double precision, intent(in)  :: alpha, r1(3), r2(3)
  double precision, intent(out) :: fct, grad1_fct(3)
  double precision              :: dr(3), d2, d, d_inv, denom_inv, scal
  dr(1) = r1(1) - r2(1)
  dr(2) = r1(2) - r2(2)
  dr(3) = r1(3) - r2(3)
  d2 = dr(1) * dr(1) + dr(2) * dr(2) + dr(3) * dr(3)
  ! Coincident points: the gradient is not defined, return zeros
  if(.not. (d2 .ge. 1d-15)) then
    fct = 0.d0
    grad1_fct(1) = 0.d0
    grad1_fct(2) = 0.d0
    grad1_fct(3) = 0.d0
    return
  endif
  ! d is obtained as d2 / sqrt(d2) so that 1/d is available without a second division
  d_inv     = 1.d0/dsqrt( d2 )
  d         = d_inv * d2
  denom_inv = 1.d0 / (1.d0 + alpha * d)
  fct       = alpha * d * denom_inv
  scal      = alpha * denom_inv * denom_inv * d_inv
  grad1_fct(1) = scal * dr(1)
  grad1_fct(2) = scal * dr(2)
  grad1_fct(3) = scal * dr(3)
  return
 end
 ! ---
--- a/plugins/local/tc_int/write_tc_int.irp.f
+++ b/plugins/local/tc_int/write_tc_int.irp.f
@ -0,0 +1,56 @@
 ! ---
 program write_tc_int
  ! Driver: prints the Jastrow settings, configures the two Becke grids from
  ! the tc_grid1_* / tc_grid2_* keywords, prints the grid sizes, then calls main().
  implicit none
  print *, ' j2e_type = ', j2e_type
  print *, ' j1e_type = ', j1e_type
  print *, ' env_type = ', env_type
  ! First grid (labelled "external" in the output below): radial / angular
  ! sizes taken from tc_grid1_r / tc_grid1_a
  my_grid_becke  = .True.
  PROVIDE tc_grid1_a tc_grid1_r
  my_n_pt_r_grid = tc_grid1_r
  my_n_pt_a_grid = tc_grid1_a
  ! touch: the entities were modified by hand, so that IRPF90 rebuilds
  ! whatever depends on them
  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
  ! Extra grid (labelled "internal" in the output below): radial / angular
  ! sizes taken from tc_grid2_r / tc_grid2_a
  my_extra_grid_becke  = .True.
  PROVIDE tc_grid2_a tc_grid2_r
  my_n_pt_r_extra_grid = tc_grid2_r
  my_n_pt_a_extra_grid = tc_grid2_a
  touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid
  call write_int(6, my_n_pt_r_grid, 'radial  external grid over')
  call write_int(6, my_n_pt_a_grid, 'angular external grid over')
  call write_int(6, my_n_pt_r_extra_grid, 'radial  internal grid over')
  call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over')
  call main()
 end
 ! ---
 subroutine main()
  ! Computes and stores the TC integrals when io_tc_integ is "Write", then
  ! sets the io_tc_integ keyword to 'Read'. For any other value of
  ! io_tc_integ, the value is printed and the program stops.
  implicit none
  PROVIDE io_tc_integ
  print*, 'io_tc_integ = ', io_tc_integ
  if(io_tc_integ .eq. "Write") then
    call provide_int2_grad1_u12_ao()
    ! The integrals are now on disk: set the keyword so that they are read
    call ezfio_set_tc_keywords_io_tc_integ('Read')
  else
    print*, 'io_tc_integ != Write'
    print*, io_tc_integ
    stop
  endif
 end
 ! ---
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@ -25,20 +25,22 @@ END_PROVIDER
   ! Last dimension of cholesky_ao is cholesky_ao_num
   !
   ! https://mogp-emulator.readthedocs.io/en/latest/methods/proc/ProcPivotedCholesky.html
   !
   ! https://doi.org/10.1016/j.apnum.2011.10.001 : Page 4, Algorithm 1
   !
   ! https://www.diva-portal.org/smash/get/diva2:396223/FULLTEXT01.pdf
   END_DOC
   integer*8                      :: ndim8
   integer                        :: rank
   double precision               :: tau, tau2
-   double precision, pointer      :: L(:,:), Delta(:,:)
+   double precision, pointer      :: L(:,:)
   double precision               :: s
   double precision               :: dscale, dscale_tmp
-   double precision, allocatable  :: D(:), Ltmp_p(:,:), Ltmp_q(:,:), D_sorted(:), Delta_col(:)
+   double precision, allocatable  :: D(:), Ltmp_p(:,:), Ltmp_q(:,:), D_sorted(:), Delta_col(:), Delta(:,:)
   integer, allocatable           :: addr1(:), addr2(:)
-   integer*8, allocatable         :: Lset(:), Dset(:), addr3(:)
+   integer*8, allocatable         :: Lset(:), Dset(:)
   logical, allocatable           :: computed(:)
   integer                        :: i,j,k,m,p,q, dj, p2, q2, ii, jj
@ -64,11 +66,8 @@ END_PROVIDER
   type(c_ptr)                    :: c_pointer(2)
   integer                        :: fd(2)
   logical                        :: delta_on_disk
   integer                        :: dgemm_block_size, nqq
   double precision, allocatable  :: dgemm_buffer1(:,:), dgemm_buffer2(:,:)
-   PROVIDE nproc
+   PROVIDE nproc ao_cholesky_threshold do_direct_integrals qp_max_mem
   PROVIDE nucl_coord ao_two_e_integral_schwartz
   call set_multiple_levels_omp(.False.)
@ -88,19 +87,8 @@ END_PROVIDER
   else
     PROVIDE nucl_coord ao_two_e_integral_schwartz
     call set_multiple_levels_omp(.False.)
     call resident_memory(mem0)
     rank_max = min(ndim8,(qp_max_mem*1024_8*1024_8*1024_8/8_8)/ndim8)
     call mmap(trim(ezfio_work_dir)//'cholesky_ao_tmp', (/ ndim8, rank_max /), 8, fd(1), .False., .True., c_pointer(1))
     call c_f_pointer(c_pointer(1), L, (/ ndim8, rank_max /))
     ! Deleting the file while it is open makes the file invisible on the filesystem,
     ! and automatically deleted, even if the program crashes
     iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_ao_tmp', 'R')
     close(iunit,status='delete')
     if (do_direct_integrals) then
       if (ao_two_e_integral(1,1,1,1) < huge(1.d0)) then
         ! Trigger providers inside ao_two_e_integral
@ -113,8 +101,12 @@ END_PROVIDER
     tau = ao_cholesky_threshold
     tau2 = tau*tau
-     mem = 6.d0 * memory_of_double8(ndim8) + 6.d0 * memory_of_int8(ndim8)
+     rank = 0
-     call check_mem(mem, irp_here)
+
     allocate( D(ndim8), Lset(ndim8), Dset(ndim8), D_sorted(ndim8))
     allocate( addr1(ndim8), addr2(ndim8), Delta_col(ndim8), computed(ndim8) )
     call resident_memory(mem0)
     call print_memory_usage()
@ -127,59 +119,58 @@ END_PROVIDER
     print *,  '============ ============='
     rank = 0
     allocate( D(ndim8), Lset(ndim8), Dset(ndim8), D_sorted(ndim8))
     allocate( addr1(ndim8), addr2(ndim8), addr3(ndim8) )
 !print *, 'allocate : (D(ndim8))', memory_of_int8(ndim8)
 !print *, 'allocate : (Lset(ndim8))', memory_of_int8(ndim8)
 !print *, 'allocate : (Dset(ndim8))', memory_of_int8(ndim8)
 !print *, 'allocate : (4,addr(ndim8))', memory_of_int8(4_8*ndim8)
     ! 1.
-     k=0
+     i8=0
     do j=1,ao_num
       do i=1,ao_num
-         k = k+1
+         i8 = i8+1
-         addr1(k) = i
+         addr1(i8) = i
-         addr2(k) = j
+         addr2(i8) = j
         addr3(k) = (i-1)*ao_num + j
       enddo
     enddo
     if (do_direct_integrals) then
-       !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i8) SCHEDULE(dynamic,16)
+       !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i8) SCHEDULE(dynamic,21)
       do i8=ndim8,1,-1
         D(i8) = ao_two_e_integral(addr1(i8), addr2(i8),              &
             addr1(i8), addr2(i8))
       enddo
       !$OMP END PARALLEL DO
     else
-       !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i8) SCHEDULE(dynamic,16)
+       !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i8) SCHEDULE(dynamic,21)
       do i8=ndim8,1,-1
         D(i8) = get_ao_two_e_integral(addr1(i8), addr1(i8),          &
-             addr2(i8), addr2(i8),                                   &
+             addr2(i8), addr2(i8), ao_integrals_map)
             ao_integrals_map)
       enddo
       !$OMP END PARALLEL DO
     endif
     D_sorted(:) = -D(:)
     call dsort_noidx_big(D_sorted,ndim8)
-     D_sorted(:) = dabs(D_sorted(:))
+     D_sorted(:) = -D_sorted(:)
     Dmax = D_sorted(1)
     ! 2.
     dscale = 1.d0
     dscale_tmp = dscale*dscale*Dmax
     np8=0_8
     do p8=1,ndim8
-       if ( dscale_tmp*D(p8) > tau2 ) then
+       if ( Dmax*D(p8) >= tau2 ) then
         np8 = np8+1_8
         Lset(np8) = p8
       endif
     enddo
-     np = np8
+     if (np8 > ndim8) stop 'np>ndim8'
     np = int(np8,4)
     if (np <= 0) stop 'np<=0'
     rank_max = min(np,20*elec_num*elec_num)
     call mmap(trim(ezfio_work_dir)//'cholesky_ao_tmp', (/ ndim8, rank_max /), 8, fd(1), .False., .True., c_pointer(1))
     call c_f_pointer(c_pointer(1), L, (/ ndim8, rank_max /))
     ! Deleting the file while it is open makes the file invisible on the filesystem,
     ! and automatically deleted, even if the program crashes
     iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_ao_tmp', 'R')
     close(iunit,status='delete')
     ! 3.
     N = 0
@ -187,82 +178,66 @@ END_PROVIDER
     ! 4.
     i = 0
     mem = memory_of_double(np)                & ! Delta(np,nq)
         + (np+1)*memory_of_double(block_size)   ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size)
 !     call check_mem(mem)
     ! 5.
-     do while ( (Dmax > tau).and.(rank*1_8 < min(ndim8,rank_max)) )
+     do while ( (Dmax > tau).and.(np > 0) )
       ! a.
       i = i+1
       ! Increase s until the arrays fit in memory
       s = 0.01d0
       block_size = max(N,24)
       ! Determine nq so that Delta fits in memory
       s = 0.1d0
       Dmin = max(s*Dmax,tau)
       do nq=2,np-1
         if (D_sorted(nq) < Dmin) exit
       enddo
       do while (.True.)
-         ! b.
+         mem = mem0                                 &
-         Dmin = max(s*Dmax,tau)
+             + np*memory_of_double(nq)              & ! Delta(np,nq)
             + (np+nq)*memory_of_double(block_size)   ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size)
-         ! c.
+         if (mem > qp_max_mem*0.5d0) then
           Dmin = D_sorted(nq/2)
           do ii=nq/2,np-1
             if (D_sorted(ii) < Dmin) then
               nq = ii
               exit
             endif
           enddo
         else
           exit
         endif
       enddo
 !call print_memory_usage
 !print *, 'np, nq, Predicted memory: ', np, nq, mem
       if (nq <= 0) then
         print *, nq
         stop 'bug in cholesky: nq <= 0'
       endif
       Dmin = D_sorted(nq)
       nq=0
       do p=1,np
-           if ( D(Lset(p)) > Dmin ) then
+         if ( D(Lset(p)) >= Dmin ) then
           nq = nq+1
           Dset(nq) = Lset(p)
         endif
       enddo
         mem = mem0                                &
             + np*memory_of_double(nq)
 !print *, 'mem = ', mem
         if (mem > qp_max_mem/2) then
           s = s*2.d0
         else
           exit
         endif
         if ((s > 1.d0).or.(nq == 0)) then
           call print_memory_usage()
           print *, 'Required peak memory: ', mem, 'Gb'
           call resident_memory(mem)
           print *, 'Already used  memory: ', mem, 'Gb'
           print *, 'Not enough memory. Reduce cholesky threshold'
           stop -1
         endif
         if (s > 0.1d0) then
           exit
         endif
       enddo
       ! d., e.
       mem = mem0                            &
             + memory_of_int(nq)             &! computed(nq)
             + np*memory_of_int(nq)          &! computed(nq)
             + memory_of_double(np)          &! Delta_col(np)
             + 7*memory_of_double8(ndim8)     &! D, Lset, Dset, D_sorted, addr[1-3]
             + np*memory_of_double(nq)       &! Delta(np,nq)
             + (np+nq)*memory_of_double(block_size) ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size)
       if (mem > qp_max_mem) then
         call mmap(trim(ezfio_work_dir)//'cholesky_delta', (/ np*1_8, nq*1_8 /), 8, fd(2), .False., .True., c_pointer(2))
         call c_f_pointer(c_pointer(2), Delta, (/ np, nq /))
         ! Deleting the file while it is open makes the file invisible on the filesystem,
         ! and automatically deleted, even if the program crashes
         iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_delta', 'R')
         close(iunit,status='delete')
         delta_on_disk = .True.
       else
       allocate(Delta(np,nq))
         delta_on_disk = .False.
       endif
 !print *, delta_on_disk
       allocate(Delta_col(np))
       allocate(Ltmp_p(np,block_size), stat=ierr)
 !print *, 'allocate : Ltmp_p(np,block_size)', memory_of_double8(np*block_size*1_8), np, block_size
       if (ierr /= 0) then
         call print_memory_usage()
@ -271,7 +246,6 @@ END_PROVIDER
       endif
       allocate(Ltmp_q(nq,block_size), stat=ierr)
 !print *, 'allocate : Ltmp_q(nq,block_size)', memory_of_double8(nq*block_size*1_8), nq, block_size
       if (ierr /= 0) then
         call print_memory_usage()
@ -280,11 +254,9 @@ END_PROVIDER
       endif
-       allocate(computed(nq))
+       computed(1:nq) = .False.
       computed(:) = .False.
 !print *, 'N, rank, block_size', N, rank, block_size
       !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(k,p,q)
       do k=1,N
         !$OMP DO
@ -302,50 +274,11 @@ END_PROVIDER
       !$OMP BARRIER
       !$OMP END PARALLEL
       PROVIDE nproc
       if (N>0) then
         if (delta_on_disk) then
           ! Blocking improves I/O performance
           dgemm_block_size = nproc*4
           allocate (dgemm_buffer1(np,dgemm_block_size))
           allocate (dgemm_buffer2(dgemm_block_size,N))
           do jj=1,nq,dgemm_block_size
             nqq = min(nq, jj+dgemm_block_size-1) - jj + 1
             !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q,ii)
             do ii=1,N
               do q=jj,jj+nqq-1
                 dgemm_buffer2(q-jj+1,ii) = Ltmp_q(q,ii)
               enddo
             enddo
             !$OMP END PARALLEL DO
             call dgemm('N', 'T', np, nqq, N, 1.d0,                       &
                   Ltmp_p, np, dgemm_buffer2, dgemm_block_size, 0.d0, dgemm_buffer1, np)
             !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q)
             do q=jj,jj+nqq-1
               Delta(:,q) = - dgemm_buffer1(:, q-jj+1)
             enddo
             !$OMP END PARALLEL DO
           enddo
           deallocate(dgemm_buffer1, dgemm_buffer2)
         else
           call dgemm('N', 'T', np, nq, N, -1.d0,                       &
                  Ltmp_p(1,1), np, Ltmp_q(1,1), nq, 0.d0, Delta, np)
         endif
       else
         !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q,j)
@ -368,48 +301,19 @@ END_PROVIDER
       do j=1,nq
         if ( (Qmax <= Dmin).or.(N+j*1_8 > ndim8) ) exit
         ! i.
         rank = N+j
         if (rank == rank_max) then
           print *, 'cholesky: rank_max reached'
           exit
         endif
         if (iblock == block_size) then
           if (delta_on_disk) then
             ! Blocking improves I/O performance
             dgemm_block_size = nproc*4
             allocate (dgemm_buffer1(np,dgemm_block_size))
             allocate (dgemm_buffer2(dgemm_block_size,block_size))
             do jj=1,nq,dgemm_block_size
               nqq = min(nq, jj+dgemm_block_size-1) - jj + 1
               !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q,ii)
               do ii=1,block_size
                 do q=jj,jj+nqq-1
                   dgemm_buffer2(q-jj+1,ii) = Ltmp_q(q,ii)
                 enddo
               enddo
               !$OMP END PARALLEL DO
               call dgemm('N', 'T', np, nqq, block_size, 1.d0,                       &
                     Ltmp_p(1,1), np, dgemm_buffer2, dgemm_block_size, 0.d0, dgemm_buffer1, np)
               !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q)
               do q=jj,jj+nqq-1
                 Delta(:,q) = Delta(:,q) - dgemm_buffer1(:, q-jj+1)
               enddo
               !$OMP END PARALLEL DO
             enddo
             deallocate(dgemm_buffer1, dgemm_buffer2)
           else
            call dgemm('N','T',np,nq,block_size,-1.d0,                &
                 Ltmp_p, np, Ltmp_q, nq, 1.d0, Delta, np)
           endif
            iblock = 0
         endif
@ -438,26 +342,25 @@ END_PROVIDER
           if (do_direct_integrals) then
               !$OMP PARALLEL DO PRIVATE(k) SCHEDULE(dynamic,21)
               do k=1,np
                 Delta_col(k) = 0.d0
                 if (.not.ao_two_e_integral_zero( addr1(Lset(k)), addr1(Dset(m)),&
                       addr2(Lset(k)), addr2(Dset(m)) ) ) then
                     Delta_col(k) = &
                         ao_two_e_integral(addr1(Lset(k)), addr2(Lset(k)),&
                         addr1(Dset(m)), addr2(Dset(m)))
                 else
                     Delta_col(k) = 0.d0
                 endif
               enddo
               !$OMP END PARALLEL DO
           else
               PROVIDE ao_integrals_map
               !$OMP PARALLEL DO PRIVATE(k) SCHEDULE(dynamic,21)
               do k=1,np
                 Delta_col(k) = 0.d0
                 if (.not.ao_two_e_integral_zero( addr1(Lset(k)), addr1(Dset(m)),&
                       addr2(Lset(k)), addr2(Dset(m)) ) ) then
                     Delta_col(k) = &
                         get_ao_two_e_integral( addr1(Lset(k)), addr1(Dset(m)),&
                         addr2(Lset(k)), addr2(Dset(m)), ao_integrals_map)
                 else
                     Delta_col(k) = 0.d0
                 endif
               enddo
               !$OMP END PARALLEL DO
@ -507,35 +410,28 @@ END_PROVIDER
       print '(I10, 4X, ES12.3)', rank, Qmax
       deallocate(Delta_col)
       deallocate(Ltmp_p)
       deallocate(Ltmp_q)
       deallocate(computed)
       if (delta_on_disk) then
         call munmap( (/ np*1_8, nq*1_8 /), 8, fd(2), c_pointer(2) )
       else
       deallocate(Delta)
       endif
       ! i.
       N = rank
       ! j.
-       Dmax = D(Lset(1))
+       D_sorted(:) = -D(:)
-       do p=1,np
+       call dsort_noidx_big(D_sorted,ndim8)
-         Dmax = max(Dmax, D(Lset(p)))
+       D_sorted(:) = -D_sorted(:)
-       enddo
+
       Dmax = D_sorted(1)
       dscale = 1.d0
       dscale_tmp = dscale*dscale*Dmax
       np8=0_8
       do p8=1,ndim8
-         if ( dscale_tmp*D(p8) > tau2 ) then
+         if ( Dmax*D(p8) >= tau2 ) then
           np8 = np8+1_8
           Lset(np8) = p8
         endif
       enddo
-       np = np8
+       np = int(np8,4)
     enddo
@ -543,8 +439,11 @@ END_PROVIDER
     print *,  '============ ============='
     print *,  ''
     deallocate( D, Lset, Dset, D_sorted )
     deallocate( addr1, addr2, Delta_col, computed )
     allocate(cholesky_ao(ao_num,ao_num,rank), stat=ierr)
 !print *, 'allocate : cholesky_ao(ao_num,ao_num,rank)', memory_of_double8(ao_num*ao_num*rank*1_8)
     if (ierr /= 0) then
       call print_memory_usage()
@ -556,7 +455,7 @@ END_PROVIDER
     !$OMP PARALLEL DO PRIVATE(k,j)
     do k=1,rank
       do j=1,ao_num
-           cholesky_ao(1:ao_num,j,k) = L((j-1)*ao_num+1:j*ao_num,k)
+           cholesky_ao(1:ao_num,j,k) = L((j-1_8)*ao_num+1_8:1_8*j*ao_num,k)
       enddo
     enddo
     !$OMP END PARALLEL DO
@ -581,5 +480,6 @@ END_PROVIDER
   call wall_time(wall1)
   print*,'Time to provide AO cholesky vectors = ',(wall1-wall0)/60.d0, ' min'
 END_PROVIDER
--- a/src/ccsd/NEED
+++ b/src/ccsd/NEED
@ -1,2 +1,3 @@
 gpu
 hartree_fock
 utils_cc
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@ -1,4 +1,5 @@
 subroutine run_ccsd_space_orb
  use gpu
  implicit none
@ -9,16 +10,28 @@ subroutine run_ccsd_space_orb
  double precision :: uncorr_energy,energy, max_elem, max_r, max_r1, max_r2,ta,tb
  logical :: not_converged
-  double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:), tau(:,:,:,:), tau_x(:,:,:,:)
+  type(gpu_double4) :: t2, r2, tau, tau_x
-  double precision, allocatable :: t1(:,:), r1(:,:)
+  type(gpu_double2) :: t1, r1
-  double precision, allocatable :: H_oo(:,:), H_vv(:,:), H_vo(:,:)
+  type(gpu_double2) :: H_oo, H_vv, H_vo
  type(gpu_double2) :: d_cc_space_f_oo, d_cc_space_f_vo
  type(gpu_double2) :: d_cc_space_f_ov, d_cc_space_f_vv
  type(gpu_double3) :: d_cc_space_v_oo_chol, d_cc_space_v_vo_chol
  type(gpu_double3) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol
  type(gpu_double4) :: d_cc_space_v_oovv, d_cc_space_v_voov, d_cc_space_v_ovov
  type(gpu_double4) :: d_cc_space_v_oovo, d_cc_space_v_vooo, d_cc_space_v_oooo
  type(gpu_double4) :: d_cc_space_v_vvoo, d_cc_space_v_ovvo, d_cc_space_v_ovoo
  double precision, allocatable :: all_err(:,:), all_t(:,:)
  integer, allocatable          :: list_occ(:), list_vir(:)
  integer(bit_kind)             :: det(N_int,2)
  integer                       :: nO, nV, nOa, nVa
-  if (do_ao_cholesky) then
+  call set_multiple_levels_omp(.False.)
  if (do_mo_cholesky) then
    PROVIDE cholesky_mo_transp
    FREE cholesky_ao
  else
@ -49,11 +62,77 @@ subroutine run_ccsd_space_orb
  !print*,'occ',list_occ
  !print*,'vir',list_vir
-  allocate(t2(nO,nO,nV,nV), r2(nO,nO,nV,nV))
+  ! GPU arrays
-  allocate(tau(nO,nO,nV,nV))
+  call gpu_allocate(d_cc_space_f_oo, nO, nO)
-  allocate(tau_x(nO,nO,nV,nV))
+  call gpu_allocate(d_cc_space_f_vo, nV, nO)
-  allocate(t1(nO,nV), r1(nO,nV))
+  call gpu_allocate(d_cc_space_f_ov, nO, nV)
-  allocate(H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO))
+  call gpu_allocate(d_cc_space_f_vv, nV, nV)
  call gpu_upload(cc_space_f_oo, d_cc_space_f_oo)
  call gpu_upload(cc_space_f_vo, d_cc_space_f_vo)
  call gpu_upload(cc_space_f_ov, d_cc_space_f_ov)
  call gpu_upload(cc_space_f_vv, d_cc_space_f_vv)
 !  FREE cc_space_f_oo
 !  FREE cc_space_f_vo
 !  FREE cc_space_f_vv
  if (do_mo_cholesky) then
    call gpu_allocate(d_cc_space_v_oo_chol, cholesky_mo_num, nO, nO)
    call gpu_allocate(d_cc_space_v_ov_chol, cholesky_mo_num, nO, nV)
    call gpu_allocate(d_cc_space_v_vo_chol, cholesky_mo_num, nV, nO)
    call gpu_allocate(d_cc_space_v_vv_chol, cholesky_mo_num, nV, nV)
    call gpu_upload(cc_space_v_oo_chol, d_cc_space_v_oo_chol)
    call gpu_upload(cc_space_v_ov_chol, d_cc_space_v_ov_chol)
    call gpu_upload(cc_space_v_vo_chol, d_cc_space_v_vo_chol)
    call gpu_upload(cc_space_v_vv_chol, d_cc_space_v_vv_chol)
 !    FREE cc_space_v_oo_chol
 !    FREE cc_space_v_ov_chol
 !    FREE cc_space_v_vo_chol
 !    FREE cc_space_v_vv_chol
  endif
  call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV)
  call gpu_allocate(d_cc_space_v_voov, nV, nO, nO, nV)
  call gpu_allocate(d_cc_space_v_ovov, nO, nV, nO, nV)
  call gpu_allocate(d_cc_space_v_oovo, nO, nO, nV, nO)
  call gpu_allocate(d_cc_space_v_ovvo, nO, nV, nV, nO)
  call gpu_allocate(d_cc_space_v_vooo, nV, nO, nO, nO)
  call gpu_allocate(d_cc_space_v_oooo, nO, nO, nO, nO)
  call gpu_allocate(d_cc_space_v_vvoo, nV, nV, nO, nO)
  call gpu_allocate(d_cc_space_v_ovoo, nO, nV, nO, nO)
  call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv)
  call gpu_upload(cc_space_v_voov, d_cc_space_v_voov)
  call gpu_upload(cc_space_v_ovov, d_cc_space_v_ovov)
  call gpu_upload(cc_space_v_oovo, d_cc_space_v_oovo)
  call gpu_upload(cc_space_v_ovvo, d_cc_space_v_ovvo)
  call gpu_upload(cc_space_v_vooo, d_cc_space_v_vooo)
  call gpu_upload(cc_space_v_oooo, d_cc_space_v_oooo)
  call gpu_upload(cc_space_v_vvoo, d_cc_space_v_vvoo)
  call gpu_upload(cc_space_v_ovoo, d_cc_space_v_ovoo)
 !  FREE cc_space_v_voov
 !  FREE cc_space_v_ovov
 !  FREE cc_space_v_oovo
 !  FREE cc_space_v_oovv
 !  FREE cc_space_v_vooo
 !  FREE cc_space_v_oooo
 !  FREE cc_space_v_vvoo
 !  FREE cc_space_v_ovvo
 !  FREE cc_space_v_ovoo
  call gpu_allocate(t2, nO,nO,nV,nV)
  call gpu_allocate(r2, nO,nO,nV,nV)
  call gpu_allocate(tau, nO,nO,nV,nV)
  call gpu_allocate(tau_x, nO,nO,nV,nV)
  call gpu_allocate(t1, nO,nV)
  call gpu_allocate(r1, nO,nV)
  call gpu_allocate(H_oo, nO, nO)
  call gpu_allocate(H_vo, nV, nO)
  call gpu_allocate(H_vv, nV, nV)
  if (cc_update_method == 'diis') then
    double precision :: rss, diis_mem, extra_mem
@ -95,14 +174,22 @@ subroutine run_ccsd_space_orb
  endif
  ! Init
-  call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,t1)
+  double precision, allocatable :: h_t1(:,:), h_t2(:,:,:,:)
-  call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,t2)
+  allocate(h_t1(nO,nV), h_t2(nO,nO,nV,nV))
-  call update_tau_space(nO,nV,t1,t2,tau)
+
  call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,h_t1)
  call gpu_upload(h_t1, t1)
  call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,h_t2)
  call gpu_upload(h_t2, t2)
  call update_tau_space(nO,nV,h_t1,t1,t2,tau)
  call update_tau_x_space(nO,nV,tau,tau_x)
  !print*,'hf_energy', hf_energy
  call det_energy(det,uncorr_energy)
  print*,'Det energy', uncorr_energy
-  call ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
+
  call ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy)
  print*,'Guess energy', uncorr_energy+energy, energy
  nb_iter = 0
@ -118,43 +205,45 @@ subroutine run_ccsd_space_orb
  do while (not_converged)
    ! Residue
-    if (do_ao_cholesky) then
+    if (do_mo_cholesky) then
-!    if (.False.) then
+      call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo)
-      call compute_H_oo_chol(nO,nV,tau_x,H_oo)
+      call compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, d_cc_space_v_ov_chol,H_vv)
-      call compute_H_vv_chol(nO,nV,tau_x,H_vv)
+      call compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol, H_vo)
      call compute_H_vo_chol(nO,nV,t1,H_vo)
-      call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
+      call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_space_f_ov,d_cc_space_f_vo, &
-      call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
+           d_cc_space_v_voov, d_cc_space_v_ovov, d_cc_space_v_oovo, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol)
      call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
           d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, d_cc_space_v_oovo, d_cc_space_v_ovvo, d_cc_space_v_ovoo, &
           d_cc_space_v_ovov, d_cc_space_v_vvoo, d_cc_space_v_oo_chol, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, &
           d_cc_space_f_vo, &
           r2, max_r2)
    else
-      call compute_H_oo(nO,nV,t1,t2,tau,H_oo)
+      call compute_H_oo(nO,nV,t1%f,t2%f,tau%f,H_oo%f)
-      call compute_H_vv(nO,nV,t1,t2,tau,H_vv)
+      call compute_H_vv(nO,nV,t1%f,t2%f,tau%f,H_vv%f)
-      call compute_H_vo(nO,nV,t1,t2,H_vo)
+      call compute_H_vo(nO,nV,t1%f,t2%f,H_vo%f)
-      call compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
+      call compute_r1_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r1%f,max_r1)
-      call compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
+      call compute_r2_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r2%f,max_r2)
    endif
    max_r = max(max_r1,max_r2)
    ! Update
    if (cc_update_method == 'diis') then
-      !call update_t_ccsd(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2)
+      call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1%f,r2%f,t1%f,t2%f,all_err,all_t)
      !call update_t_ccsd_diis(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2)
      call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1,r2,t1,t2,all_err,all_t)
    ! Standard update as T = T - Delta
    elseif (cc_update_method == 'none') then
-      call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1,t1)
+      call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1%f,t1%f)
-      call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2,t2)
+      call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2%f,t2%f)
    else
      print*,'Unkown cc_method_method: '//cc_update_method
    endif
-    call update_tau_space(nO,nV,t1,t2,tau)
+    call update_tau_space(nO,nV,t1%f,t1,t2,tau)
    call update_tau_x_space(nO,nV,tau,tau_x)
    ! Energy
-    call ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
+    call ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy)
    write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |'
    nb_iter = nb_iter + 1
@ -179,8 +268,8 @@ subroutine run_ccsd_space_orb
  print*,''
  if (write_amplitudes) then
-    call write_t1(nO,nV,t1)
+    call write_t1(nO,nV,t1%f)
-    call write_t2(nO,nV,t2)
+    call write_t2(nO,nV,t2%f)
    call ezfio_set_utils_cc_io_amplitudes('Read')
  endif
@ -189,7 +278,14 @@ subroutine run_ccsd_space_orb
    deallocate(all_err,all_t)
  endif
-  deallocate(H_vv,H_oo,H_vo,r1,r2,tau)
+  call gpu_deallocate(H_oo)
  call gpu_deallocate(H_vv)
  call gpu_deallocate(H_vo)
  call gpu_deallocate(r1)
  call gpu_deallocate(r2)
  call gpu_deallocate(tau)
  call gpu_deallocate(tau_x)
  ! CCSD(T)
  double precision :: e_t, e_t_err
@ -197,28 +293,14 @@ subroutine run_ccsd_space_orb
  if (cc_par_t .and. elec_alpha_num + elec_beta_num > 2) then
    ! Dumb way
    !call wall_time(ta)
    !call ccsd_par_t_space(nO,nV,t1,t2,e_t)
    !call wall_time(tb)
    !print*,'Time: ',tb-ta, ' s'
    !print*,''
    !write(*,'(A15,F18.12,A3)') ' E(CCSD(T))  = ', uncorr_energy + energy + e_t, ' Ha'
    !write(*,'(A15,F18.12,A3)') ' E(T)        = ', e_t, ' Ha'
    !write(*,'(A15,F18.12,A3)') ' Correlation = ', energy + e_t, ' Ha'
    !print*,''
    ! New
    e_t = uncorr_energy + energy ! For print in (T) call
    e_t_err = 0.d0
    print*,'Computing (T) correction...'
    call wall_time(ta)
 !    call ccsd_par_t_space_v3(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v &
 !         ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t)
-    call ccsd_par_t_space_stoch(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v &
+    call ccsd_par_t_space_stoch(nO,nV,t1%f,t2%f,cc_space_f_o,cc_space_f_v &
         ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t, e_t_err)
    call wall_time(tb)
@ -233,168 +315,161 @@ subroutine run_ccsd_space_orb
  call save_energy(uncorr_energy + energy, e_t)
-  deallocate(t1,t2)
+  deallocate(h_t1, h_t2)
  if (do_mo_cholesky) then
    call gpu_deallocate(d_cc_space_v_oo_chol)
    call gpu_deallocate(d_cc_space_v_ov_chol)
    call gpu_deallocate(d_cc_space_v_vo_chol)
    call gpu_deallocate(d_cc_space_v_vv_chol)
  endif
  call gpu_deallocate(d_cc_space_v_oovv)
  call gpu_deallocate(d_cc_space_v_voov)
  call gpu_deallocate(d_cc_space_v_ovov)
  call gpu_deallocate(d_cc_space_v_oovo)
  call gpu_deallocate(d_cc_space_v_ovvo)
  call gpu_deallocate(d_cc_space_v_vooo)
  call gpu_deallocate(d_cc_space_v_oooo)
  call gpu_deallocate(d_cc_space_v_vvoo)
  call gpu_deallocate(d_cc_space_v_ovoo)
  call gpu_deallocate(d_cc_space_f_oo)
  call gpu_deallocate(d_cc_space_f_vo)
  call gpu_deallocate(d_cc_space_f_ov)
  call gpu_deallocate(d_cc_space_f_vv)
  call gpu_deallocate(t1)
  call gpu_deallocate(t2)
 end
 ! Energy
-subroutine ccsd_energy_space(nO,nV,tau,t1,energy)
+subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy)
-
+  use gpu
  implicit none
  integer, intent(in)            :: nO, nV
-  double precision, intent(in)  :: tau(nO,nO,nV,nV)
+  type(gpu_double4), intent(in)  :: tau_x, d_cc_space_v_oovv
-  double precision, intent(in)  :: t1(nO,nV)
+  type(gpu_double2), intent(in)  :: t1, d_cc_space_f_vo
  double precision, intent(out)  :: energy
  ! internal
  integer :: i,j,a,b
  double precision :: e
-  energy = 0d0
+  type(gpu_stream) :: s1, s2
-  !$omp parallel &
+  call gpu_stream_create(s1)
-  !$omp shared(nO,nV,energy,tau,t1,&
+  call gpu_stream_create(s2)
  !$omp cc_space_f_vo,cc_space_w_oovv) &
  !$omp private(i,j,a,b,e) &
  !$omp default(none)
  e = 0d0
  !$omp do
  do a = 1, nV
    do i = 1, nO
      e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a)
    enddo
  enddo
  !$omp end do nowait
  !$omp do
  do b = 1, nV
    do a = 1, nV
      do j = 1, nO
        do i = 1, nO
          e = e + tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b)
       enddo
      enddo
    enddo
  enddo
  !$omp end do nowait
  !$omp critical
  energy = energy + e
  !$omp end critical
  !$omp end parallel
-end
+  call gpu_set_stream(blas_handle,s1)
  call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo%f(1,1), 1, t1%f(1,1), 1, e)
-subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
+  call gpu_set_stream(blas_handle,s2)
  call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x%f(1,1,1,1), 1_8, d_cc_space_v_oovv%f(1,1,1,1), 1_8, energy)
  call gpu_set_stream(blas_handle,gpu_default_stream)
-  implicit none
+  call gpu_synchronize()
  call gpu_stream_destroy(s1)
  call gpu_stream_destroy(s2)
-  integer, intent(in)           :: nO, nV
+   energy = energy + 2.d0*e
  double precision, intent(in)  :: tau_x(nO,nO,nV,nV)
  double precision, intent(in)  :: t1(nO,nV)
  double precision, intent(out) :: energy
  ! internal
  integer :: i,j,a,b
  double precision :: e
  energy = 0d0
  !$omp parallel &
  !$omp shared(nO,nV,energy,tau_x,t1,&
  !$omp cc_space_f_vo,cc_space_v_oovv) &
  !$omp private(i,j,a,b,e) &
  !$omp default(none)
  e = 0d0
  !$omp do
  do a = 1, nV
    do i = 1, nO
      e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a)
    enddo
  enddo
  !$omp end do nowait
  !$omp do
  do b = 1, nV
    do a = 1, nV
      do j = 1, nO
        do i = 1, nO
          e = e + tau_x(i,j,a,b) * cc_space_v_oovv(i,j,a,b)
       enddo
      enddo
    enddo
  enddo
  !$omp end do nowait
  !$omp critical
  energy = energy + e
  !$omp end critical
  !$omp end parallel
 end
 ! Tau
-subroutine update_tau_space(nO,nV,t1,t2,tau)
+subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau)
-
+  use gpu
  implicit none
  ! in
  integer, intent(in)           :: nO, nV
-  double precision, intent(in)  :: t1(nO,nV), t2(nO,nO,nV,nV)
+  double precision, intent(in)  :: h_t1(nO,nV)
  type(gpu_double2), intent(in) :: t1
  type(gpu_double4), intent(in) :: t2
  ! out
-  double precision, intent(out) :: tau(nO,nO,nV,nV)
+  type(gpu_double4) :: tau
  ! internal
  integer                       :: i,j,a,b
  type(gpu_stream) :: stream(nV)
  !$OMP PARALLEL &
-  !$OMP SHARED(nO,nV,tau,t2,t1) &
+  !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) &
  !$OMP PRIVATE(i,j,a,b) &
  !$OMP DEFAULT(NONE)
  !$OMP DO
-  do b = 1, nV
+  do b=1,nV
-    do a = 1, nV
+    call gpu_stream_create(stream(b))
-      do j = 1, nO
+    call gpu_set_stream(blas_handle,stream(b))
-        do i = 1, nO
+    do j=1,nO
-          tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b)
+      call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, &
-        enddo
+         1.d0, t2%f(1,j,1,b), nO*nO, &
-      enddo
+         h_t1(j,b), t1%f(1,1), nO, &
         tau%f(1,j,1,b), nO*nO)
    enddo
  enddo
  !$OMP END DO
  !$OMP END PARALLEL
  call gpu_synchronize()
  do b=1,nV
    call gpu_stream_destroy(stream(b))
  enddo
  call gpu_set_stream(blas_handle,gpu_default_stream)
 end
 subroutine update_tau_x_space(nO,nV,tau,tau_x)
-
+  use gpu
  implicit none
  ! in
  integer, intent(in)         :: nO, nV
-  double precision, intent(in)  :: tau(nO,nO,nV,nV)
+  type(gpu_double4), intent(in)  :: tau
  ! out
-  double precision, intent(out) :: tau_x(nO,nO,nV,nV)
+  type(gpu_double4) :: tau_x
  ! internal
  integer                       :: i,j,a,b
  type(gpu_stream) :: stream(nV)
  do a=1,nV
    call gpu_stream_create(stream(a))
  enddo
  !$OMP PARALLEL &
-  !$OMP SHARED(nO,nV,tau,tau_x) &
+  !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) &
-  !$OMP PRIVATE(i,j,a,b) &
+  !$OMP PRIVATE(a,b) &
  !$OMP DEFAULT(NONE)
  !$OMP DO
-  do b = 1, nV
+  do b=1,nV
-    do a = 1, nV
+    do a=1,nV
-      do j = 1, nO
+      call gpu_set_stream(blas_handle,stream(a))
-        do i = 1, nO
+      call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, &
-          tau_x(i,j,a,b) = 2.d0*tau(i,j,a,b) - tau(i,j,b,a)
+          2.d0, tau%f(1,1,a,b), nO, &
-        enddo
+         -1.d0, tau%f(1,1,b,a), nO, &
-      enddo
+         tau_x%f(1,1,a,b), nO)
    enddo
  enddo
  !$OMP END DO
  !$OMP END PARALLEL
  call gpu_set_stream(blas_handle,gpu_default_stream)
  call gpu_synchronize()
  do b=1,nV
    call gpu_stream_destroy(stream(b))
  enddo
 end
 ! R1
--- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
--- a/src/gpu/NEED
+++ b/src/gpu/NEED
@ -0,0 +1 @@
 gpu_arch
--- a/src/gpu/README.rst
+++ b/src/gpu/README.rst
@ -0,0 +1,6 @@
 ===
 gpu
 ===
 Bindings for GPU routines (architecture independent).
 Architecture-dependent files are in gpu_arch.
--- a/src/gpu/gpu.h
+++ b/src/gpu/gpu.h
@ -0,0 +1,41 @@
 /*
  * gpu.h -- architecture-independent prototypes of the GPU routines.
  *
  * The architecture-dependent implementations live in gpu_arch.
  * Sizes, dimensions and strides are passed as 64-bit integers.
  * NOTE(review): whether `n` in the memory routines counts bytes or
  * elements is not visible from this header -- confirm in the implementation.
  */
 #ifndef QP_GPU_H
 #define QP_GPU_H

 #include <stdint.h>

 /* Device management. `(void)` makes these real prototypes in C. */
 int  gpu_ndevices(void);
 void gpu_set_device(int32_t i);

 /* Memory management: the pointer is passed by address. */
 void gpu_allocate(void** ptr, const int64_t n);
 /* Symbol bound by the Fortran `gpu` module (bind(C, name='gpu_deallocate')). */
 void gpu_deallocate(void** ptr);
 /* NOTE(review): kept for backward compatibility; the Fortran module binds
  * gpu_deallocate, not gpu_free -- confirm an implementation still exists. */
 void gpu_free(void** ptr);

 /* Data transfers between host and device, and device-to-device copy. */
 void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n);
 void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n);
 void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n);

 /* Streams. gpu_set_stream attaches `stream` to the BLAS `handle`. */
 void gpu_stream_create(void** ptr);
 void gpu_stream_destroy(void** ptr);
 void gpu_set_stream(void* handle, void* stream);
 void gpu_synchronize(void);

 /* BLAS handle (cuBLAS or RocBLAS). */
 void gpu_blas_create(void** handle);
 void gpu_blas_destroy(void** handle);

 /* Dot products: the result is written to *result. */
 void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_t incx,
               const double* y, const int64_t incy, double* result);
 void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx,
               const float* y, const int64_t incy, float* result);

 /* Matrix-vector products; alpha and beta are passed by address. */
 void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n,
                const double* alpha, const double* a, const int64_t lda,
                const double* x, const int64_t incx,
                const double* beta, double* y, const int64_t incy);
 void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n,
                const float* alpha, const float* a, const int64_t lda,
                const float* x, const int64_t incx,
                const float* beta, float* y, const int64_t incy);

 /* Matrix-matrix products; alpha and beta are passed by address. */
 void gpu_dgemm(const void* handle, const char transa, const char transb,
                const int64_t m, const int64_t n, const int64_t k,
                const double* alpha, const double* a, const int64_t lda,
                const double* b, const int64_t ldb,
                const double* beta, double* c, const int64_t ldc);
 void gpu_sgemm(const void* handle, const char transa, const char transb,
                const int64_t m, const int64_t n, const int64_t k,
                const float* alpha, const float* a, const int64_t lda,
                const float* b, const int64_t ldb,
                const float* beta, float* c, const int64_t ldc);

 /* Matrix additions. Presumably c = alpha*op(a) + beta*op(b), following the
  * cuBLAS geam convention -- confirm in the implementation. */
 void gpu_dgeam(const void* handle, const char transa, const char transb,
                const int64_t m, const int64_t n,
                const double* alpha, const double* a, const int64_t lda,
                const double* beta, const double* b, const int64_t ldb,
                double* c, const int64_t ldc);
 void gpu_sgeam(const void* handle, const char transa, const char transb,
                const int64_t m, const int64_t n,
                const float* alpha, const float* a, const int64_t lda,
                const float* beta, const float* b, const int64_t ldb,
                float* c, const int64_t ldc);

 #endif /* QP_GPU_H */
--- a/src/gpu/gpu.irp.f
+++ b/src/gpu/gpu.irp.f
@ -0,0 +1,26 @@
 use gpu
 BEGIN_PROVIDER [ type(gpu_blas), blas_handle ]
 implicit none
 BEGIN_DOC
 ! Handle for cuBLAS or RocBLAS.
 !
 ! Built by a single call to gpu_blas_create. It is the handle passed as
 ! first argument to the gpu BLAS wrappers and to gpu_set_stream.
 END_DOC
 call gpu_blas_create(blas_handle)
 END_PROVIDER
 BEGIN_PROVIDER [ type(gpu_stream), gpu_default_stream ]
 implicit none
 BEGIN_DOC
 ! Default stream.
 !
 ! Its C pointer is set to C_NULL_PTR. Callers hand it to gpu_set_stream to
 ! detach the BLAS handle from a user-created stream.
 ! NOTE(review): presumably the backend treats a null stream as its default
 ! stream -- confirm in the gpu_arch implementation.
 END_DOC
 gpu_default_stream%c = C_NULL_PTR
 END_PROVIDER
 BEGIN_PROVIDER [ integer, gpu_num ]
 implicit none
 BEGIN_DOC
 ! Number of usable GPUs, as returned by the C routine gpu_ndevices().
 END_DOC
 gpu_num = gpu_ndevices()
 END_PROVIDER
--- a/src/gpu/gpu_module.F90
+++ b/src/gpu/gpu_module.F90
@ -0,0 +1,707 @@
 module gpu
  use, intrinsic :: iso_c_binding
  implicit none
 ! Data types
 ! ----------
  ! Array descriptors of rank 1 to 6.  Each one stores the same storage twice:
  !   c : the address as a C pointer (what the C routines receive)
  !   f : a Fortran pointer view of that address (set with c_f_pointer by the
  !       gpu_allocate_* routines, nullified by the gpu_deallocate_* routines)
  type gpu_double1
    type(c_ptr) :: c
    double precision, pointer :: f(:)
  end type
  type gpu_double2
    type(c_ptr) :: c
    double precision, pointer :: f(:,:)
  end type
  type gpu_double3
    type(c_ptr) :: c
    double precision, pointer :: f(:,:,:)
  end type
  type gpu_double4
    type(c_ptr) :: c
    double precision, pointer :: f(:,:,:,:)
  end type
  type gpu_double5
    type(c_ptr) :: c
    double precision, pointer :: f(:,:,:,:,:)
  end type
  type gpu_double6
    type(c_ptr) :: c
    double precision, pointer :: f(:,:,:,:,:,:)
  end type
  ! Opaque wrappers around a single C pointer.
  ! gpu_blas%c is filled by gpu_blas_create and passed as `handle` to the BLAS wrappers.
  type gpu_blas
    type(c_ptr) :: c
  end type
  ! gpu_stream%c is filled by gpu_stream_create and consumed by gpu_set_stream.
  type gpu_stream
    type(c_ptr) :: c
  end type
 ! C interfaces
 ! ------------
  interface
    ! Explicit interfaces to the C routines of the GPU back end.
    ! Dummy arguments declared with VALUE are passed by value; all the others
    ! are passed by reference, i.e. they are pointers on the C side.
    logical(c_bool) function no_gpu() bind(C)
      import
    end function
    integer function gpu_ndevices() bind(C)
      import
    end function
    subroutine gpu_set_device(id) bind(C)
      import
      integer(c_int32_t), value :: id
    end subroutine
    ! ptr is passed by reference: the gpu_allocate_* wrappers read ptr back
    ! after the call.  n is a byte count (the wrappers pass 8 bytes per double).
    subroutine gpu_allocate_c(ptr, n) bind(C, name='gpu_allocate')
      import
      type(c_ptr) :: ptr
      integer(c_int64_t), value :: n
    end subroutine
    subroutine gpu_deallocate_c(ptr) bind(C, name='gpu_deallocate')
      import
      type(c_ptr) :: ptr
    end subroutine
    ! Transfers: both addresses are passed by value, n is a byte count.
    subroutine gpu_upload_c(cpu_ptr, gpu_ptr, n) bind(C, name='gpu_upload')
      import
      type(c_ptr), value :: cpu_ptr
      type(c_ptr), value :: gpu_ptr
      integer(c_int64_t), value :: n
    end subroutine
    subroutine gpu_download_c(gpu_ptr, cpu_ptr, n) bind(C, name='gpu_download')
      import
      type(c_ptr), value :: gpu_ptr
      type(c_ptr), value :: cpu_ptr
      integer(c_int64_t), value :: n
    end subroutine
    subroutine gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, n) bind(C, name='gpu_copy')
      import
      type(c_ptr), value :: gpu_ptr_src
      type(c_ptr), value :: gpu_ptr_dest
      integer(c_int64_t), value :: n
    end subroutine
    ! Streams and BLAS handles: create/destroy take the pointer by reference,
    ! gpu_set_stream takes both pointers by value.
    subroutine gpu_stream_create_c(stream) bind(C, name='gpu_stream_create')
      import
      type(c_ptr) :: stream
    end subroutine
    subroutine gpu_stream_destroy_c(stream) bind(C, name='gpu_stream_destroy')
      import
      type(c_ptr) :: stream
    end subroutine
    subroutine gpu_set_stream_c(handle, stream) bind(C, name='gpu_set_stream')
      import
      type(c_ptr), value :: handle, stream
    end subroutine
    subroutine gpu_synchronize() bind(C)
      import
    end subroutine
    subroutine gpu_blas_create_c(handle) bind(C, name='gpu_blas_create')
      import
      type(c_ptr) :: handle
    end subroutine
    subroutine gpu_blas_destroy_c(handle) bind(C, name='gpu_blas_destroy')
      import
      type(c_ptr) :: handle
    end subroutine
    ! dot: vectors are given as C addresses, the result comes back in res.
    subroutine gpu_ddot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_ddot')
      import
      type(c_ptr), value, intent(in) :: handle
      integer(c_int64_t), value      :: n, incx, incy
      type(c_ptr), value             :: dx, dy
      real(c_double), intent(out)    :: res
    end subroutine
    subroutine gpu_sdot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_sdot')
      import
      type(c_ptr), value, intent(in) :: handle
      integer(c_int64_t), value      :: n, incx, incy
      type(c_ptr), intent(in), value :: dx, dy
      real(c_float), intent(out)     :: res
    end subroutine
    ! geam
    subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
      b, ldb, c, ldc) bind(C, name='gpu_dgeam')
      import
      type(c_ptr), value, intent(in)        :: handle
      character(c_char), intent(in), value  :: transa, transb
      integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
      real(c_double), intent(in)            :: alpha, beta
      type(c_ptr), value :: a, b, c
    end subroutine
    subroutine gpu_sgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
      b, ldb, c, ldc) bind(C, name='gpu_sgeam')
      import
      type(c_ptr), value, intent(in)        :: handle
      character(c_char), intent(in), value  :: transa, transb
      integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
      real(c_float), intent(in)             :: alpha, beta
      real(c_float) :: a, b, c
    end subroutine
    ! gemv / gemm: the C prototypes take `const char transa` (and transb) by
    ! value, exactly like the geam routines above, so the character arguments
    ! need the VALUE attribute; without it the address of the character would
    ! be passed where the C side expects the character itself.
    subroutine gpu_dgemv_c(handle, transa, m, n, alpha, a, lda, &
      x, incx, beta, y, incy) bind(C, name='gpu_dgemv')
      import
      type(c_ptr), value, intent(in)        :: handle
      character(c_char), intent(in), value  :: transa
      integer(c_int64_t), intent(in), value :: m, n, lda, incx, incy
      real(c_double), intent(in)            :: alpha, beta
      real(c_double)                        :: a, x, y
    end subroutine
    subroutine gpu_sgemv_c(handle, transa, m, n, alpha, a, lda, &
      x, incx, beta, y, incy) bind(C, name='gpu_sgemv')
      import
      type(c_ptr), value, intent(in)        :: handle
      character(c_char), intent(in), value  :: transa
      integer(c_int64_t), intent(in), value :: m, n, lda, incx, incy
      real(c_float), intent(in)             :: alpha, beta
      real(c_float)                         :: a, x, y
    end subroutine
    subroutine gpu_dgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, &
      b, ldb, beta, c, ldc) bind(C, name='gpu_dgemm')
      import
      type(c_ptr), value, intent(in)        :: handle
      character(c_char), intent(in), value  :: transa, transb
      integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc
      real(c_double), intent(in)            :: alpha, beta
      real(c_double) :: a, b, c
    end subroutine
    subroutine gpu_sgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, &
      b, ldb, beta, c, ldc) bind(C, name='gpu_sgemm')
      import
      type(c_ptr), value, intent(in)        :: handle
      character(c_char), intent(in), value  :: transa, transb
      integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc
      real(c_float), intent(in)             :: alpha, beta
      real(c_float) :: a, b, c
    end subroutine
  end interface
 ! Polymorphic interfaces
 ! ----------------------
  interface gpu_allocate
    ! Generic allocator: the specific routine is selected by the rank of the
    ! descriptor and by the integer kind of the extents (default or integer*8).
    procedure gpu_allocate_double1, gpu_allocate_double1_64
    procedure gpu_allocate_double2, gpu_allocate_double2_64
    procedure gpu_allocate_double3, gpu_allocate_double3_64
    procedure gpu_allocate_double4, gpu_allocate_double4_64
    procedure gpu_allocate_double5, gpu_allocate_double5_64
    procedure gpu_allocate_double6, gpu_allocate_double6_64
  end interface gpu_allocate
  interface gpu_deallocate
    ! Generic deallocator, one specific routine per descriptor rank.
    procedure gpu_deallocate_double1, gpu_deallocate_double2, gpu_deallocate_double3
    procedure gpu_deallocate_double4, gpu_deallocate_double5, gpu_deallocate_double6
  end interface gpu_deallocate
  interface gpu_upload
    ! Generic upload (cpu_ptr, gpu_ptr), one specific routine per descriptor rank.
    procedure gpu_upload_double1, gpu_upload_double2, gpu_upload_double3
    procedure gpu_upload_double4, gpu_upload_double5, gpu_upload_double6
  end interface gpu_upload
  interface gpu_download
    ! Generic download (gpu_ptr, cpu_ptr), one specific routine per descriptor rank.
    procedure gpu_download_double1, gpu_download_double2, gpu_download_double3
    procedure gpu_download_double4, gpu_download_double5, gpu_download_double6
  end interface gpu_download
  interface gpu_copy
    ! Generic copy (source, destination), one specific routine per descriptor rank.
    procedure gpu_copy_double1, gpu_copy_double2, gpu_copy_double3
    procedure gpu_copy_double4, gpu_copy_double5, gpu_copy_double6
  end interface gpu_copy
  contains
 ! gpu_allocate
 ! ------------
    subroutine gpu_allocate_double1(ptr, s)
      ! Request s doubles (8 bytes each) from gpu_allocate_c and expose the
      ! returned address as the rank-1 Fortran pointer ptr%f.
      !   ptr : descriptor to fill (ptr%c and ptr%f are both set)
      !   s   : number of elements (default integer)
      implicit none
      type(gpu_double1), intent(inout) :: ptr
      integer, intent(in) :: s
      integer(c_int64_t) :: nbytes
      nbytes = 8_8*s
      call gpu_allocate_c(ptr%c, nbytes)
      call c_f_pointer(ptr%c, ptr%f, [ s ])
    end subroutine gpu_allocate_double1
    subroutine gpu_allocate_double2(ptr, s1, s2)
      ! Request an s1 x s2 array of doubles (8 bytes each) from gpu_allocate_c
      ! and expose the returned address as the rank-2 Fortran pointer ptr%f.
      !   ptr    : descriptor to fill (ptr%c and ptr%f are both set)
      !   s1, s2 : extents (default integers)
      implicit none
      type(gpu_double2), intent(inout) :: ptr
      integer, intent(in) :: s1, s2
      ! The 64-bit literal comes first so that every product is evaluated in
      ! integer*8; s1*s2 alone is a default-integer product that can overflow.
      call gpu_allocate_c(ptr%c, 8_8*s1*s2)
      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /))
    end subroutine
    subroutine gpu_allocate_double3(ptr, s1, s2, s3)
      ! Request an s1 x s2 x s3 array of doubles (8 bytes each) from
      ! gpu_allocate_c and expose it as the rank-3 Fortran pointer ptr%f.
      !   ptr        : descriptor to fill (ptr%c and ptr%f are both set)
      !   s1, s2, s3 : extents (default integers)
      implicit none
      type(gpu_double3), intent(inout) :: ptr
      integer, intent(in) :: s1, s2, s3
      ! The 64-bit literal comes first so that every product is evaluated in
      ! integer*8; s1*s2*s3 alone is a default-integer product that can overflow.
      call gpu_allocate_c(ptr%c, 8_8*s1*s2*s3)
      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /))
    end subroutine
    subroutine gpu_allocate_double4(ptr, s1, s2, s3, s4)
      ! Request an s1 x s2 x s3 x s4 array of doubles (8 bytes each) from
      ! gpu_allocate_c and expose it as the rank-4 Fortran pointer ptr%f.
      !   ptr      : descriptor to fill (ptr%c and ptr%f are both set)
      !   s1 .. s4 : extents (default integers)
      implicit none
      type(gpu_double4), intent(inout) :: ptr
      integer, intent(in) :: s1, s2, s3, s4
      ! The 64-bit literal comes first so that every product is evaluated in
      ! integer*8; the product of the extents alone can overflow a default integer.
      call gpu_allocate_c(ptr%c, 8_8*s1*s2*s3*s4)
      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /))
    end subroutine
    subroutine gpu_allocate_double5(ptr, s1, s2, s3, s4, s5)
      ! Request a rank-5 array of doubles (8 bytes each) from gpu_allocate_c
      ! and expose it as the Fortran pointer ptr%f.
      !   ptr      : descriptor to fill (ptr%c and ptr%f are both set)
      !   s1 .. s5 : extents (default integers)
      implicit none
      type(gpu_double5), intent(inout) :: ptr
      integer, intent(in) :: s1, s2, s3, s4, s5
      ! The 64-bit literal comes first so that every product is evaluated in
      ! integer*8; the product of the extents alone can overflow a default integer.
      call gpu_allocate_c(ptr%c, 8_8*s1*s2*s3*s4*s5)
      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /))
    end subroutine
    subroutine gpu_allocate_double6(ptr, s1, s2, s3, s4, s5, s6)
      ! Request a rank-6 array of doubles (8 bytes each) from gpu_allocate_c
      ! and expose it as the Fortran pointer ptr%f.
      !   ptr      : descriptor to fill (ptr%c and ptr%f are both set)
      !   s1 .. s6 : extents (default integers)
      implicit none
      type(gpu_double6), intent(inout) :: ptr
      integer, intent(in) :: s1, s2, s3, s4, s5, s6
      ! The 64-bit literal comes first so that every product is evaluated in
      ! integer*8; the product of the extents alone can overflow a default integer.
      call gpu_allocate_c(ptr%c, 8_8*s1*s2*s3*s4*s5*s6)
      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /))
    end subroutine
    subroutine gpu_allocate_double1_64(ptr, s)
      ! integer*8 variant of the rank-1 allocator: request s doubles from
      ! gpu_allocate_c and expose them as the Fortran pointer ptr%f.
      !   ptr : descriptor to fill (ptr%c and ptr%f are both set)
      !   s   : number of elements (integer*8)
      implicit none
      type(gpu_double1), intent(inout) :: ptr
      integer*8, intent(in) :: s
      ! gpu_allocate_c takes a byte count: s elements of 8 bytes each, as in
      ! every other gpu_allocate_* routine (passing s alone would reserve only
      ! one eighth of the storage that ptr%f(1:s) then addresses).
      call gpu_allocate_c(ptr%c, s*8_8)
      call c_f_pointer(ptr%c, ptr%f, (/ s /))
    end subroutine
    subroutine gpu_allocate_double2_64(ptr, s1, s2)
      ! integer*8 variant of the rank-2 allocator: s1*s2 doubles of 8 bytes.
      !   ptr    : descriptor to fill (ptr%c and ptr%f are both set)
      !   s1, s2 : extents (integer*8)
      implicit none
      type(gpu_double2), intent(inout) :: ptr
      integer*8, intent(in) :: s1, s2
      integer*8 :: nbytes
      nbytes = 8_8*s1*s2
      call gpu_allocate_c(ptr%c, nbytes)
      call c_f_pointer(ptr%c, ptr%f, [ s1, s2 ])
    end subroutine gpu_allocate_double2_64
    subroutine gpu_allocate_double3_64(ptr, s1, s2, s3)
      ! integer*8 variant of the rank-3 allocator: s1*s2*s3 doubles of 8 bytes.
      !   ptr        : descriptor to fill (ptr%c and ptr%f are both set)
      !   s1, s2, s3 : extents (integer*8)
      implicit none
      type(gpu_double3), intent(inout) :: ptr
      integer*8, intent(in) :: s1, s2, s3
      integer*8 :: nbytes
      nbytes = 8_8*s1*s2*s3
      call gpu_allocate_c(ptr%c, nbytes)
      call c_f_pointer(ptr%c, ptr%f, [ s1, s2, s3 ])
    end subroutine gpu_allocate_double3_64
    subroutine gpu_allocate_double4_64(ptr, s1, s2, s3, s4)
      ! integer*8 variant of the rank-4 allocator (8 bytes per element).
      !   ptr      : descriptor to fill (ptr%c and ptr%f are both set)
      !   s1 .. s4 : extents (integer*8)
      implicit none
      type(gpu_double4), intent(inout) :: ptr
      integer*8, intent(in) :: s1, s2, s3, s4
      integer*8 :: nbytes
      nbytes = 8_8*s1*s2*s3*s4
      call gpu_allocate_c(ptr%c, nbytes)
      call c_f_pointer(ptr%c, ptr%f, [ s1, s2, s3, s4 ])
    end subroutine gpu_allocate_double4_64
    subroutine gpu_allocate_double5_64(ptr, s1, s2, s3, s4, s5)
      ! integer*8 variant of the rank-5 allocator (8 bytes per element).
      !   ptr      : descriptor to fill (ptr%c and ptr%f are both set)
      !   s1 .. s5 : extents (integer*8)
      implicit none
      type(gpu_double5), intent(inout) :: ptr
      integer*8, intent(in) :: s1, s2, s3, s4, s5
      integer*8 :: nbytes
      nbytes = 8_8*s1*s2*s3*s4*s5
      call gpu_allocate_c(ptr%c, nbytes)
      call c_f_pointer(ptr%c, ptr%f, [ s1, s2, s3, s4, s5 ])
    end subroutine gpu_allocate_double5_64
    subroutine gpu_allocate_double6_64(ptr, s1, s2, s3, s4, s5, s6)
      ! integer*8 variant of the rank-6 allocator (8 bytes per element).
      !   ptr      : descriptor to fill (ptr%c and ptr%f are both set)
      !   s1 .. s6 : extents (integer*8)
      implicit none
      type(gpu_double6), intent(inout) :: ptr
      integer*8, intent(in) :: s1, s2, s3, s4, s5, s6
      integer*8 :: nbytes
      nbytes = 8_8*s1*s2*s3*s4*s5*s6
      call gpu_allocate_c(ptr%c, nbytes)
      call c_f_pointer(ptr%c, ptr%f, [ s1, s2, s3, s4, s5, s6 ])
    end subroutine gpu_allocate_double6_64
 ! gpu_deallocate
 ! --------------
    subroutine gpu_deallocate_double1(ptr)
      ! Hand ptr%c to gpu_deallocate_c, then disassociate the Fortran view ptr%f.
      implicit none
      type(gpu_double1), intent(inout) :: ptr
      call gpu_deallocate_c(ptr%c)
      ptr%f => null()
    end subroutine gpu_deallocate_double1
    subroutine gpu_deallocate_double2(ptr)
      ! Hand ptr%c to gpu_deallocate_c, then disassociate the Fortran view ptr%f.
      implicit none
      type(gpu_double2), intent(inout) :: ptr
      call gpu_deallocate_c(ptr%c)
      ptr%f => null()
    end subroutine gpu_deallocate_double2
    subroutine gpu_deallocate_double3(ptr)
      ! Hand ptr%c to gpu_deallocate_c, then disassociate the Fortran view ptr%f.
      implicit none
      type(gpu_double3), intent(inout) :: ptr
      call gpu_deallocate_c(ptr%c)
      ptr%f => null()
    end subroutine gpu_deallocate_double3
    subroutine gpu_deallocate_double4(ptr)
      ! Hand ptr%c to gpu_deallocate_c, then disassociate the Fortran view ptr%f.
      implicit none
      type(gpu_double4), intent(inout) :: ptr
      call gpu_deallocate_c(ptr%c)
      ptr%f => null()
    end subroutine gpu_deallocate_double4
    subroutine gpu_deallocate_double5(ptr)
      ! Hand ptr%c to gpu_deallocate_c, then disassociate the Fortran view ptr%f.
      implicit none
      type(gpu_double5), intent(inout) :: ptr
      call gpu_deallocate_c(ptr%c)
      ptr%f => null()
    end subroutine gpu_deallocate_double5
    subroutine gpu_deallocate_double6(ptr)
      ! Hand ptr%c to gpu_deallocate_c, then disassociate the Fortran view ptr%f.
      implicit none
      type(gpu_double6), intent(inout) :: ptr
      call gpu_deallocate_c(ptr%c)
      ptr%f => null()
    end subroutine gpu_deallocate_double6
 ! gpu_upload
 ! ----------
    subroutine gpu_upload_double1(cpu_ptr, gpu_ptr)
      ! Pass the address of cpu_ptr and gpu_ptr%c to gpu_upload_c.
      !   cpu_ptr : source array (assumed size, so any contiguous buffer fits)
      !   gpu_ptr : rank-1 descriptor; its size fixes the number of bytes sent
      implicit none
      double precision, target, intent(in)     :: cpu_ptr(*)
      type(gpu_double1), intent(in)    :: gpu_ptr
      ! Ask SIZE for a 64-bit result: the integer*8 allocator allows more
      ! elements than a default integer can represent.
      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, 8_8*size(gpu_ptr%f, kind=c_int64_t))
    end subroutine
    subroutine gpu_upload_double2(cpu_ptr, gpu_ptr)
      ! Pass the address of cpu_ptr and gpu_ptr%c to gpu_upload_c.  The byte
      ! count comes from the shape of gpu_ptr%f (8 bytes per element), not
      ! from cpu_ptr.
      implicit none
      double precision, target, intent(in)     :: cpu_ptr(:,:)
      type(gpu_double2), intent(in)    :: gpu_ptr
      integer(c_int64_t) :: nbytes
      nbytes = 8_8*product(int(shape(gpu_ptr%f), c_int64_t))
      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, nbytes)
    end subroutine gpu_upload_double2
    subroutine gpu_upload_double3(cpu_ptr, gpu_ptr)
      ! Pass the address of cpu_ptr and gpu_ptr%c to gpu_upload_c.  The byte
      ! count comes from the shape of gpu_ptr%f (8 bytes per element), not
      ! from cpu_ptr.
      implicit none
      double precision, target, intent(in)     :: cpu_ptr(:,:,:)
      type(gpu_double3), intent(in)    :: gpu_ptr
      integer(c_int64_t) :: nbytes
      nbytes = 8_8*product(int(shape(gpu_ptr%f), c_int64_t))
      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, nbytes)
    end subroutine gpu_upload_double3
    subroutine gpu_upload_double4(cpu_ptr, gpu_ptr)
      ! Pass the address of cpu_ptr and gpu_ptr%c to gpu_upload_c.  The byte
      ! count comes from the shape of gpu_ptr%f (8 bytes per element), not
      ! from cpu_ptr.
      implicit none
      double precision, target, intent(in)     :: cpu_ptr(:,:,:,:)
      type(gpu_double4), intent(in)    :: gpu_ptr
      integer(c_int64_t) :: nbytes
      nbytes = 8_8*product(int(shape(gpu_ptr%f), c_int64_t))
      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, nbytes)
    end subroutine gpu_upload_double4
    subroutine gpu_upload_double5(cpu_ptr, gpu_ptr)
      ! Pass the address of cpu_ptr and gpu_ptr%c to gpu_upload_c.  The byte
      ! count comes from the shape of gpu_ptr%f (8 bytes per element), not
      ! from cpu_ptr.
      implicit none
      double precision, target, intent(in)     :: cpu_ptr(:,:,:,:,:)
      type(gpu_double5), intent(in)    :: gpu_ptr
      integer(c_int64_t) :: nbytes
      nbytes = 8_8*product(int(shape(gpu_ptr%f), c_int64_t))
      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, nbytes)
    end subroutine gpu_upload_double5
    subroutine gpu_upload_double6(cpu_ptr, gpu_ptr)
      ! Pass the address of cpu_ptr and gpu_ptr%c to gpu_upload_c.  The byte
      ! count comes from the shape of gpu_ptr%f (8 bytes per element), not
      ! from cpu_ptr.
      implicit none
      double precision, target, intent(in)     :: cpu_ptr(:,:,:,:,:,:)
      type(gpu_double6), intent(in)    :: gpu_ptr
      integer(c_int64_t) :: nbytes
      nbytes = 8_8*product(int(shape(gpu_ptr%f), c_int64_t))
      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, nbytes)
    end subroutine gpu_upload_double6
 ! gpu_download
 ! ------------
    subroutine gpu_download_double1(gpu_ptr, cpu_ptr)
      ! Pass gpu_ptr%c and the address of cpu_ptr to gpu_download_c.
      !   gpu_ptr : rank-1 descriptor; its size fixes the number of bytes moved
      !   cpu_ptr : destination array, must hold at least size(gpu_ptr%f) elements
      implicit none
      type(gpu_double1), intent(in)  :: gpu_ptr
      ! cpu_ptr is the buffer gpu_download_c is expected to write into, so it
      ! must not be declared intent(in).
      double precision, target, intent(inout) :: cpu_ptr(:)
      ! Ask SIZE for a 64-bit result: the integer*8 allocator allows more
      ! elements than a default integer can represent.
      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*size(gpu_ptr%f, kind=c_int64_t))
    end subroutine
    subroutine gpu_download_double2(gpu_ptr, cpu_ptr)
      ! Pass gpu_ptr%c and the address of cpu_ptr to gpu_download_c.
      !   gpu_ptr : rank-2 descriptor; its shape fixes the number of bytes moved
      !   cpu_ptr : destination array, must be at least as large as gpu_ptr%f
      implicit none
      type(gpu_double2), intent(in)  :: gpu_ptr
      ! cpu_ptr is the buffer gpu_download_c is expected to write into, so it
      ! must not be declared intent(in).
      double precision, target, intent(inout) :: cpu_ptr(:,:)
      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
    end subroutine
    subroutine gpu_download_double3(gpu_ptr, cpu_ptr)
      ! Pass gpu_ptr%c and the address of cpu_ptr to gpu_download_c.
      !   gpu_ptr : rank-3 descriptor; its shape fixes the number of bytes moved
      !   cpu_ptr : destination array, must be at least as large as gpu_ptr%f
      implicit none
      type(gpu_double3), intent(in)  :: gpu_ptr
      ! cpu_ptr is the buffer gpu_download_c is expected to write into, so it
      ! must not be declared intent(in).
      double precision, target, intent(inout) :: cpu_ptr(:,:,:)
      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
    end subroutine
    subroutine gpu_download_double4(gpu_ptr, cpu_ptr)
      ! Pass gpu_ptr%c and the address of cpu_ptr to gpu_download_c.
      !   gpu_ptr : rank-4 descriptor; its shape fixes the number of bytes moved
      !   cpu_ptr : destination array, must be at least as large as gpu_ptr%f
      implicit none
      type(gpu_double4), intent(in)  :: gpu_ptr
      ! cpu_ptr is the buffer gpu_download_c is expected to write into, so it
      ! must not be declared intent(in).
      double precision, target, intent(inout) :: cpu_ptr(:,:,:,:)
      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
    end subroutine
    subroutine gpu_download_double5(gpu_ptr, cpu_ptr)
      ! Pass gpu_ptr%c and the address of cpu_ptr to gpu_download_c.
      !   gpu_ptr : rank-5 descriptor; its shape fixes the number of bytes moved
      !   cpu_ptr : destination array, must be at least as large as gpu_ptr%f
      implicit none
      type(gpu_double5), intent(in)  :: gpu_ptr
      ! cpu_ptr is the buffer gpu_download_c is expected to write into, so it
      ! must not be declared intent(in).
      double precision, target, intent(inout) :: cpu_ptr(:,:,:,:,:)
      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
    end subroutine
    subroutine gpu_download_double6(gpu_ptr, cpu_ptr)
      ! Pass gpu_ptr%c and the address of cpu_ptr to gpu_download_c.
      !   gpu_ptr : rank-6 descriptor; its shape fixes the number of bytes moved
      !   cpu_ptr : destination array, must be at least as large as gpu_ptr%f
      implicit none
      type(gpu_double6), intent(in)  :: gpu_ptr
      ! cpu_ptr is the buffer gpu_download_c is expected to write into, so it
      ! must not be declared intent(in).
      double precision, target, intent(inout) :: cpu_ptr(:,:,:,:,:,:)
      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
    end subroutine
 ! gpu_copy
 ! --------
    subroutine gpu_copy_double1(gpu_ptr_src, gpu_ptr_dest)
      ! Pass both C addresses to gpu_copy_c.  The byte count is taken from the
      ! destination descriptor (8 bytes per element of gpu_ptr_dest%f).
      implicit none
      type(gpu_double1), intent(in)        :: gpu_ptr_src
      type(gpu_double1), intent(in)        :: gpu_ptr_dest
      ! Ask SIZE for a 64-bit result: the integer*8 allocator allows more
      ! elements than a default integer can represent.
      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*size(gpu_ptr_dest%f, kind=c_int64_t))
    end subroutine
    subroutine gpu_copy_double2(gpu_ptr_src, gpu_ptr_dest)
      ! Pass both C addresses to gpu_copy_c.  The byte count is taken from the
      ! shape of the destination descriptor (8 bytes per element).
      implicit none
      type(gpu_double2), intent(in)        :: gpu_ptr_src
      type(gpu_double2), intent(in)        :: gpu_ptr_dest
      integer(c_int64_t) :: nbytes
      nbytes = 8_8*product(int(shape(gpu_ptr_dest%f), c_int64_t))
      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, nbytes)
    end subroutine gpu_copy_double2
    subroutine gpu_copy_double3(gpu_ptr_src, gpu_ptr_dest)
      ! Pass both C addresses to gpu_copy_c.  The byte count is taken from the
      ! shape of the destination descriptor (8 bytes per element).
      implicit none
      type(gpu_double3), intent(in)        :: gpu_ptr_src
      type(gpu_double3), intent(in)        :: gpu_ptr_dest
      integer(c_int64_t) :: nbytes
      nbytes = 8_8*product(int(shape(gpu_ptr_dest%f), c_int64_t))
      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, nbytes)
    end subroutine gpu_copy_double3
    subroutine gpu_copy_double4(gpu_ptr_src, gpu_ptr_dest)
      ! Pass both C addresses to gpu_copy_c.  The byte count is taken from the
      ! shape of the destination descriptor (8 bytes per element).
      implicit none
      type(gpu_double4), intent(in)        :: gpu_ptr_src
      type(gpu_double4), intent(in)        :: gpu_ptr_dest
      integer(c_int64_t) :: nbytes
      nbytes = 8_8*product(int(shape(gpu_ptr_dest%f), c_int64_t))
      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, nbytes)
    end subroutine gpu_copy_double4
    subroutine gpu_copy_double5(gpu_ptr_src, gpu_ptr_dest)
      ! Pass both C addresses to gpu_copy_c.  The byte count is taken from the
      ! shape of the destination descriptor (8 bytes per element).
      implicit none
      type(gpu_double5), intent(in)        :: gpu_ptr_src
      type(gpu_double5), intent(in)        :: gpu_ptr_dest
      integer(c_int64_t) :: nbytes
      nbytes = 8_8*product(int(shape(gpu_ptr_dest%f), c_int64_t))
      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, nbytes)
    end subroutine gpu_copy_double5
    subroutine gpu_copy_double6(gpu_ptr_src, gpu_ptr_dest)
      ! Pass both C addresses to gpu_copy_c.  The byte count is taken from the
      ! shape of the destination descriptor (8 bytes per element).
      implicit none
      type(gpu_double6), intent(in)        :: gpu_ptr_src
      type(gpu_double6), intent(in)        :: gpu_ptr_dest
      integer(c_int64_t) :: nbytes
      nbytes = 8_8*product(int(shape(gpu_ptr_dest%f), c_int64_t))
      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, nbytes)
    end subroutine gpu_copy_double6
 ! gpu_stream
 ! ----------
    subroutine gpu_stream_create(stream)
      ! Forward the C pointer stored in stream to gpu_stream_create_c.
      type(gpu_stream) :: stream
      call gpu_stream_create_c(stream%c)
    end subroutine gpu_stream_create
    subroutine gpu_stream_destroy(stream)
      ! Forward the C pointer stored in stream to gpu_stream_destroy_c.
      type(gpu_stream) :: stream
      call gpu_stream_destroy_c(stream%c)
    end subroutine gpu_stream_destroy
    subroutine gpu_set_stream(handle, stream)
      ! Forward the C pointers of a BLAS handle and of a stream to gpu_set_stream_c.
      type(gpu_stream) :: stream
      type(gpu_blas)   :: handle
      call gpu_set_stream_c(handle%c, stream%c)
    end subroutine gpu_set_stream
 ! gpu_blas
 ! --------
    subroutine gpu_blas_create(handle)
      ! Forward the C pointer stored in handle to gpu_blas_create_c.
      type(gpu_blas) :: handle
      call gpu_blas_create_c(handle%c)
    end subroutine gpu_blas_create
    subroutine gpu_blas_destroy(handle)
      ! Forward the C pointer stored in handle to gpu_blas_destroy_c.
      type(gpu_blas) :: handle
      call gpu_blas_destroy_c(handle%c)
    end subroutine gpu_blas_destroy
 ! dot
 ! ---
 subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res)
  ! integer*4 front end of gpu_ddot_c: the integer arguments are widened to
  ! c_int64_t and dx, dy are passed as the C addresses of their first element.
  ! The result is returned in res.
  type(gpu_blas), intent(in)     :: handle
  integer*4                      :: n, incx, incy
  double precision, target       :: dx, dy
  double precision, intent(out)  :: res
  integer(c_int64_t)             :: n64, incx64, incy64
  n64    = int(n, c_int64_t)
  incx64 = int(incx, c_int64_t)
  incy64 = int(incy, c_int64_t)
  call gpu_ddot_c(handle%c, n64, c_loc(dx), incx64, c_loc(dy), incy64, res)
 end subroutine gpu_ddot
 subroutine gpu_ddot_64(handle, n, dx, incx, dy, incy, res)
  ! integer*8 front end of gpu_ddot_c: the integer arguments are forwarded
  ! unchanged and dx, dy are passed as the C addresses of their first element.
  ! The result is returned in res.
  type(gpu_blas), intent(in)     :: handle
  integer*8                      :: n, incx, incy
  double precision, target       :: dx, dy
  double precision, intent(out)  :: res
  type(c_ptr)                    :: dx_addr, dy_addr
  dx_addr = c_loc(dx)
  dy_addr = c_loc(dy)
  call gpu_ddot_c(handle%c, n, dx_addr, incx, dy_addr, incy, res)
 end subroutine gpu_ddot_64
 ! geam
 ! ----
 subroutine gpu_dgeam(handle, transa, transb, m, n, alpha, a, lda, beta, &
  b, ldb, c, ldc)
  ! integer*4 front end of gpu_dgeam_c: sizes and leading dimensions are
  ! widened to c_int64_t; a, b, c are passed as C addresses.
  type(gpu_blas), intent(in)   :: handle
  character, intent(in)        :: transa, transb
  integer*4, intent(in)        :: m, n, lda, ldb, ldc
  double precision, intent(in) :: alpha, beta
  double precision, target     :: a, b, c
  integer(c_int64_t)           :: m64, n64, lda64, ldb64, ldc64
  m64   = int(m, c_int64_t)
  n64   = int(n, c_int64_t)
  lda64 = int(lda, c_int64_t)
  ldb64 = int(ldb, c_int64_t)
  ldc64 = int(ldc, c_int64_t)
  call gpu_dgeam_c(handle%c, transa, transb, m64, n64, alpha, c_loc(a), lda64, beta, &
        c_loc(b), ldb64, c_loc(c), ldc64)
 end subroutine gpu_dgeam
 subroutine gpu_dgeam_64(handle, transa, transb, m, n, alpha, a, lda, beta, &
  b, ldb, c, ldc)
  ! integer*8 front end of gpu_dgeam_c: sizes and leading dimensions are
  ! converted to c_int64_t; a, b, c are passed as C addresses.
  type(gpu_blas), intent(in)   :: handle
  character, intent(in)        :: transa, transb
  integer*8, intent(in)        :: m, n, lda, ldb, ldc
  double precision, intent(in) :: alpha, beta
  double precision, target     :: a, b, c
  integer(c_int64_t)           :: m64, n64, lda64, ldb64, ldc64
  m64   = int(m, c_int64_t)
  n64   = int(n, c_int64_t)
  lda64 = int(lda, c_int64_t)
  ldb64 = int(ldb, c_int64_t)
  ldc64 = int(ldc, c_int64_t)
  call gpu_dgeam_c(handle%c, transa, transb, m64, n64, alpha, c_loc(a), lda64, beta, &
        c_loc(b), ldb64, c_loc(c), ldc64)
 end subroutine gpu_dgeam_64
 ! gemv
 ! ----
 subroutine gpu_dgemv(handle, transa, m, n, alpha, a, lda, &
  x, incx, beta, y, incy)
  ! integer*4 front end of gpu_dgemv_c: sizes, leading dimension and strides
  ! are widened to c_int64_t; a, x, y are forwarded by reference.
  type(gpu_blas), intent(in)   :: handle
  character, intent(in)        :: transa
  integer*4, intent(in)        :: m, n, lda, incx, incy
  double precision, intent(in) :: alpha, beta
  double precision             :: a, x, y
  integer(c_int64_t)           :: m64, n64, lda64, incx64, incy64
  m64    = int(m, c_int64_t)
  n64    = int(n, c_int64_t)
  lda64  = int(lda, c_int64_t)
  incx64 = int(incx, c_int64_t)
  incy64 = int(incy, c_int64_t)
  call gpu_dgemv_c(handle%c, transa, m64, n64, alpha, a, lda64, &
        x, incx64, beta, y, incy64)
 end subroutine gpu_dgemv
 subroutine gpu_dgemv_64(handle, transa, m, n, alpha, a, lda, &
  x, incx, beta, y, incy)
  ! integer*8 front end of gpu_dgemv_c: sizes, leading dimension and strides
  ! are converted to c_int64_t; a, x, y are forwarded by reference.
  type(gpu_blas), intent(in)   :: handle
  character, intent(in)        :: transa
  integer*8, intent(in)        :: m, n, lda, incx, incy
  double precision, intent(in) :: alpha, beta
  double precision             :: a, x, y
  integer(c_int64_t)           :: m64, n64, lda64, incx64, incy64
  m64    = int(m, c_int64_t)
  n64    = int(n, c_int64_t)
  lda64  = int(lda, c_int64_t)
  incx64 = int(incx, c_int64_t)
  incy64 = int(incy, c_int64_t)
  call gpu_dgemv_c(handle%c, transa, m64, n64, alpha, a, lda64, &
        x, incx64, beta, y, incy64)
 end subroutine gpu_dgemv_64
 ! gemm
 ! ----
 subroutine gpu_dgemm(handle, transa, transb, m, n, k, alpha, a, lda, &
  b, ldb, beta, c, ldc)
  ! integer*4 front end of gpu_dgemm_c: sizes and leading dimensions are
  ! widened to c_int64_t; a, b, c are forwarded by reference.
  type(gpu_blas), intent(in)   :: handle
  character, intent(in)        :: transa, transb
  integer*4, intent(in)        :: m, n, k, lda, ldb, ldc
  double precision, intent(in) :: alpha, beta
  double precision             :: a, b, c
  integer(c_int64_t)           :: m64, n64, k64, lda64, ldb64, ldc64
  m64   = int(m, c_int64_t)
  n64   = int(n, c_int64_t)
  k64   = int(k, c_int64_t)
  lda64 = int(lda, c_int64_t)
  ldb64 = int(ldb, c_int64_t)
  ldc64 = int(ldc, c_int64_t)
  call gpu_dgemm_c(handle%c, transa, transb, m64, n64, k64, &
        alpha, a, lda64, b, ldb64, beta, c, ldc64)
 end subroutine gpu_dgemm
 subroutine gpu_dgemm_64(handle, transa, transb, m, n, k, alpha, a, lda, &
  b, ldb, beta, c, ldc)
  ! integer*8 front end of gpu_dgemm_c: sizes and leading dimensions are
  ! converted to c_int64_t; a, b, c are forwarded by reference.
  type(gpu_blas), intent(in)   :: handle
  character, intent(in)        :: transa, transb
  integer*8, intent(in)        :: m, n, k, lda, ldb, ldc
  double precision, intent(in) :: alpha, beta
  double precision             :: a, b, c
  integer(c_int64_t)           :: m64, n64, k64, lda64, ldb64, ldc64
  m64   = int(m, c_int64_t)
  n64   = int(n, c_int64_t)
  k64   = int(k, c_int64_t)
  lda64 = int(lda, c_int64_t)
  ldb64 = int(ldb, c_int64_t)
  ldc64 = int(ldc, c_int64_t)
  call gpu_dgemm_c(handle%c, transa, transb, m64, n64, k64, &
        alpha, a, lda64, b, ldb64, beta, c, ldc64)
 end subroutine gpu_dgemm_64
 end module
--- a/src/hartree_fock/fock_matrix_hf.irp.f
+++ b/src/hartree_fock/fock_matrix_hf.irp.f
@ -194,14 +194,25 @@ END_PROVIDER
 endif
- double precision :: rss
+ double precision :: rss, mem0, mem
 double precision :: memory_of_double
 integer :: iblock
- integer, parameter :: block_size = 32
+ integer :: block_size
 call resident_memory(mem0)
 block_size = 1024
 rss = memory_of_double(2.d0*ao_num*ao_num)
 do
   mem = mem0 + block_size*rss
   if ( (block_size < 2).or.(mem < qp_max_mem) ) exit
   block_size = block_size/2
 enddo
 call check_mem(block_size*rss, irp_here)
 rss = memory_of_double(ao_num*ao_num)
 call check_mem(2.d0*block_size*rss, irp_here)
 allocate(X2(ao_num,ao_num,block_size,2))
 allocate(X3(ao_num,block_size,ao_num,2))
--- a/src/mo_two_e_ints/map_integrals.irp.f
+++ b/src/mo_two_e_ints/map_integrals.irp.f
@ -40,7 +40,7 @@ end
 ! Min and max values of the MOs for which the integrals are in the cache
 END_DOC
- mo_integrals_cache_size  = 2_8**mo_integrals_cache_shift
+ mo_integrals_cache_size  = 2**mo_integrals_cache_shift
 mo_integrals_cache_min = max(1,elec_alpha_num - (mo_integrals_cache_size/2 - 1) )
 mo_integrals_cache_max = min(mo_num, mo_integrals_cache_min + mo_integrals_cache_size - 1)
--- a/src/mol_properties/multi_s_dipole_moment.irp.f
+++ b/src/mol_properties/multi_s_dipole_moment.irp.f
@ -18,7 +18,7 @@
-BEGIN_PROVIDER [double precision, multi_s_dipole_moment, (N_states, N_states)]
+ BEGIN_PROVIDER [double precision, multi_s_dipole_moment  , (N_states, N_states)]
 &BEGIN_PROVIDER [double precision, multi_s_x_dipole_moment, (N_states, N_states)]
 &BEGIN_PROVIDER [double precision, multi_s_y_dipole_moment, (N_states, N_states)]
 &BEGIN_PROVIDER [double precision, multi_s_z_dipole_moment, (N_states, N_states)]
@ -40,17 +40,18 @@ BEGIN_PROVIDER [double precision, multi_s_dipole_moment, (N_states, N_states)]
  ! gamma^{nm}: density matrix \bra{\Psi^n} a^{\dagger}_a a_i \ket{\Psi^m}
  END_DOC
-  integer          :: istate,jstate ! States
+  integer          :: istate, jstate ! States
-  integer          :: i,j           ! general spatial MOs
+  integer          :: i, j           ! general spatial MOs
  double precision :: nuclei_part_x, nuclei_part_y, nuclei_part_z
  multi_s_x_dipole_moment = 0.d0
  multi_s_y_dipole_moment = 0.d0
  multi_s_z_dipole_moment = 0.d0
  if(8.d0*mo_num*mo_num*n_states*n_states*1d-9 .lt. 200.d0) then
    do jstate = 1, N_states
      do istate = 1, N_states
        do i = 1, mo_num  
          do j = 1, mo_num  
            multi_s_x_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_x(j,i)  
@ -58,9 +59,134 @@ BEGIN_PROVIDER [double precision, multi_s_dipole_moment, (N_states, N_states)]
            multi_s_z_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_z(j,i) 
          enddo
        enddo 
      enddo
    enddo
  else
    ! not enough memory to use the transition density matrices:
    ! fall back to the on-the-fly scheme
    PROVIDE psi_det_alpha_unique psi_det_beta_unique
    integer           :: l, k_a, k_b
    integer           :: occ(N_int*bit_kind_size,2)
    integer           :: h1, h2, p1, p2, degree
    integer           :: exc(0:2,2), n_occ(2)
    integer           :: krow, kcol, lrow, lcol
    integer(bit_kind) :: tmp_det(N_int,2), tmp_det2(N_int)
    double precision  :: ck, ckl, phase
    !$OMP PARALLEL DEFAULT(NONE)                                                      &
    !$OMP PRIVATE(j, l, k_a, k_b, istate, jstate, occ, ck, ckl, h1, h2, p1, p2, exc,  & 
    !$OMP         phase, degree, n_occ, krow, kcol, lrow, lcol, tmp_det, tmp_det2)    &
    !$OMP SHARED(N_int, N_states, elec_alpha_num, elec_beta_num, N_det,               &
    !$OMP        psi_bilinear_matrix_rows, psi_bilinear_matrix_columns,               &
    !$OMP        psi_bilinear_matrix_transp_rows, psi_bilinear_matrix_transp_columns, &
    !$OMP        psi_det_alpha_unique, psi_det_beta_unique,                           &
    !$OMP        psi_bilinear_matrix_values, psi_bilinear_matrix_transp_values,       &
    !$OMP        mo_dipole_x, mo_dipole_y, mo_dipole_z,                               &
    !$OMP        multi_s_x_dipole_moment, multi_s_y_dipole_moment, multi_s_z_dipole_moment)
    !$OMP DO COLLAPSE(2)
    do istate = 1, N_states
      do jstate = 1, N_states
        do k_a = 1, N_det
          krow = psi_bilinear_matrix_rows   (k_a)
          kcol = psi_bilinear_matrix_columns(k_a)
          tmp_det(1:N_int,1) = psi_det_alpha_unique(1:N_int,krow)
          tmp_det(1:N_int,2) = psi_det_beta_unique (1:N_int,kcol)
          ! Diagonal part
          call bitstring_to_list_ab(tmp_det, occ, n_occ, N_int)
          ck = psi_bilinear_matrix_values(k_a,istate)*psi_bilinear_matrix_values(k_a,jstate)
          do l = 1, elec_alpha_num
            j = occ(l,1)
            multi_s_x_dipole_moment(istate,jstate) -= ck * mo_dipole_x(j,j) 
            multi_s_y_dipole_moment(istate,jstate) -= ck * mo_dipole_y(j,j) 
            multi_s_z_dipole_moment(istate,jstate) -= ck * mo_dipole_z(j,j) 
          enddo
          if (k_a == N_det) cycle
          l = k_a + 1
          lrow = psi_bilinear_matrix_rows   (l)
          lcol = psi_bilinear_matrix_columns(l)
          ! Fix beta determinant, loop over alphas
          do while (lcol == kcol)
            tmp_det2(:) = psi_det_alpha_unique(:,lrow)
            call get_excitation_degree_spin(tmp_det(1,1), tmp_det2, degree, N_int)
            if (degree == 1) then
              exc = 0
              call get_single_excitation_spin(tmp_det(1,1), tmp_det2, exc, phase, N_int)
              call decode_exc_spin(exc, h1, p1, h2, p2)
              ckl = psi_bilinear_matrix_values(k_a,istate)*psi_bilinear_matrix_values(l,jstate) * phase
              multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(h1,p1) 
              multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(h1,p1) 
              multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(h1,p1) 
              ckl = psi_bilinear_matrix_values(k_a,jstate)*psi_bilinear_matrix_values(l,istate) * phase
              multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(p1,h1) 
              multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(p1,h1) 
              multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(p1,h1) 
            endif
            l = l+1
            if (l > N_det) exit
            lrow = psi_bilinear_matrix_rows   (l)
            lcol = psi_bilinear_matrix_columns(l)
          enddo
        enddo ! k_a
        do k_b = 1, N_det
          krow = psi_bilinear_matrix_transp_rows   (k_b)
          kcol = psi_bilinear_matrix_transp_columns(k_b)
          tmp_det(1:N_int,1) = psi_det_alpha_unique(1:N_int,krow)
          tmp_det(1:N_int,2) = psi_det_beta_unique (1:N_int,kcol)
          ! Diagonal part
          call bitstring_to_list_ab(tmp_det, occ, n_occ, N_int)
          ck = psi_bilinear_matrix_transp_values(k_b,istate)*psi_bilinear_matrix_transp_values(k_b,jstate)
          do l = 1, elec_beta_num
            j = occ(l,2)
            multi_s_x_dipole_moment(istate,jstate) -= ck * mo_dipole_x(j,j) 
            multi_s_y_dipole_moment(istate,jstate) -= ck * mo_dipole_y(j,j) 
            multi_s_z_dipole_moment(istate,jstate) -= ck * mo_dipole_z(j,j) 
          enddo
          if (k_b == N_det) cycle
          l = k_b+1
          lrow = psi_bilinear_matrix_transp_rows   (l)
          lcol = psi_bilinear_matrix_transp_columns(l)
          ! Fix alpha determinant, loop over betas
          do while (lrow == krow)
            tmp_det2(:) = psi_det_beta_unique(:,lcol)
            call get_excitation_degree_spin(tmp_det(1,2), tmp_det2, degree, N_int)
            if (degree == 1) then
              exc = 0
              call get_single_excitation_spin(tmp_det(1,2), tmp_det2, exc, phase, N_int)
              call decode_exc_spin(exc, h1, p1, h2, p2)
              ckl = psi_bilinear_matrix_transp_values(k_b,istate)*psi_bilinear_matrix_transp_values(l,jstate) * phase
              multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(h1,p1) 
              multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(h1,p1) 
              multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(h1,p1) 
              ckl = psi_bilinear_matrix_transp_values(k_b,jstate)*psi_bilinear_matrix_transp_values(l,istate) * phase
              multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(p1,h1) 
              multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(p1,h1) 
              multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(p1,h1) 
            endif
            l = l+1
            if (l > N_det) exit
            lrow = psi_bilinear_matrix_transp_rows   (l)
            lcol = psi_bilinear_matrix_transp_columns(l)
          enddo
        enddo ! k_b
      enddo ! istate
    enddo ! jstate
    !$OMP END DO
    !$OMP END PARALLEL
  endif ! memory condition
  ! Nuclei part
  nuclei_part_x = 0.d0
--- a/src/tools/four_idx_transform.irp.f
+++ b/src/tools/four_idx_transform.irp.f
@ -12,6 +12,9 @@ program four_idx_transform
 !
  END_DOC
  if (do_mo_cholesky) then
    stop 'Not implemented with Cholesky integrals'
  endif
  io_mo_two_e_integrals = 'Write'
  SOFT_TOUCH io_mo_two_e_integrals
  if (.true.) then
--- a/src/trexio/export_trexio_routines.irp.f
+++ b/src/trexio/export_trexio_routines.irp.f
@ -557,7 +557,7 @@ subroutine export_trexio(update,full_path)
    do k=1,cholesky_ao_num
     do j=1,mo_num
      do i=1,mo_num
-         integral = cholesky_mo(i,j,k)
+         integral = cholesky_mo_transp(k,i,j)
         if (integral == 0.d0) cycle
         icount += 1_8
         chol_buffer(icount) = integral
--- a/src/trexio/import_trexio_integrals.irp.f
+++ b/src/trexio/import_trexio_integrals.irp.f
@ -28,7 +28,7 @@ subroutine run(f)
  integer(trexio_t), intent(in)  :: f ! TREXIO file handle
  integer(trexio_exit_code)      :: rc
-  integer ::i,j,k,l
+  integer :: i,j,k,l, iunit
  integer(8) :: m, n_integrals
  double precision :: integral
@ -41,10 +41,12 @@ subroutine run(f)
  integer         , allocatable :: Vi(:,:)
  double precision              :: s
-! TODO:
+  integer*4 :: BUFSIZE
-! - If Cholesky AO in trexio file, read cholesky ao vectors
+  integer :: rank
-! - If Cholesky MO in trexio file, read cholesky mo vectors
+  double precision, allocatable :: tmp(:,:,:)
-! - If Cholesky MO not in trexio file, force do_cholesky_mo to False
+  integer*8 :: offset, icount
  integer, external :: getUnitAndOpen
  if (trexio_has_nucleus_repulsion(f) == TREXIO_SUCCESS) then
    rc = trexio_read_nucleus_repulsion(f, s)
@ -119,16 +121,58 @@ subroutine run(f)
  rc = trexio_has_ao_2e_int(f)
  PROVIDE ao_num
  if (rc /= TREXIO_HAS_NOT) then
      rc = trexio_has_ao_2e_int_eri_cholesky(f)
      if (rc /= TREXIO_HAS_NOT) then
          rc = trexio_read_ao_2e_int_eri_cholesky_num(f, rank)
          call trexio_assert(rc, TREXIO_SUCCESS)
          allocate(tmp(ao_num,ao_num,rank))
          tmp(:,:,:) = 0.d0
          BUFSIZE=ao_num**2
          allocate(Vi(3,BUFSIZE), V(BUFSIZE))
          offset = 0_8
          icount = BUFSIZE
          rc = TREXIO_SUCCESS
          do while (icount == size(V))
            rc = trexio_read_ao_2e_int_eri_cholesky(f, offset, icount, Vi, V)
            do m=1,icount
              i = Vi(1,m)
              j = Vi(2,m)
              k = Vi(3,m)
              integral = V(m)
              tmp(i,j,k) = integral
            enddo
            offset = offset + icount
            if (rc /= TREXIO_SUCCESS) then
                exit
            endif
          end do
          print *,  'Writing Cholesky AO vectors to disk...'
          iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_ao', 'W')
          write(iunit) rank
          write(iunit) tmp(:,:,:)
          close(iunit)
          call ezfio_set_ao_two_e_ints_io_ao_cholesky('Read')
          deallocate(Vi, V, tmp)
          print *, 'Cholesky AO integrals read from TREXIO file'
      endif
      rc = trexio_has_ao_2e_int_eri(f)
      if (rc /= TREXIO_HAS_NOT) then
          PROVIDE ao_integrals_map
      integer*4 :: BUFSIZE
          BUFSIZE=ao_num**2
          allocate(buffer_i(BUFSIZE), buffer_values(BUFSIZE))
          allocate(Vi(4,BUFSIZE), V(BUFSIZE))
      integer*8 :: offset, icount
          offset = 0_8
          icount = BUFSIZE
          rc = TREXIO_SUCCESS
@ -159,6 +203,7 @@ subroutine run(f)
          deallocate(buffer_i, buffer_values, Vi, V)
          print *, 'AO integrals read from TREXIO file'
      endif
  else
      print *, 'AO integrals not found in TREXIO file'
  endif
@ -186,6 +231,49 @@ subroutine run(f)
  rc = trexio_has_mo_2e_int(f)
  if (rc /= TREXIO_HAS_NOT) then
      rc = trexio_has_mo_2e_int_eri_cholesky(f)
      if (rc /= TREXIO_HAS_NOT) then
          rc = trexio_read_mo_2e_int_eri_cholesky_num(f, rank)
          call trexio_assert(rc, TREXIO_SUCCESS)
          allocate(tmp(rank,mo_num,mo_num))
          tmp(:,:,:) = 0.d0
          BUFSIZE=mo_num**2
          allocate(Vi(3,BUFSIZE), V(BUFSIZE))
          offset = 0_8
          icount = BUFSIZE
          rc = TREXIO_SUCCESS
          do while (icount == size(V))
            rc = trexio_read_mo_2e_int_eri_cholesky(f, offset, icount, Vi, V)
            do m=1,icount
              i = Vi(1,m)
              j = Vi(2,m)
              k = Vi(3,m)
              integral = V(m)
              tmp(k,i,j) = integral
            enddo
            offset = offset + icount
            if (rc /= TREXIO_SUCCESS) then
                exit
            endif
          end do
          print *,  'Writing Cholesky MO vectors to disk...'
          iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_mo_transp', 'W')
          write(iunit) rank
          write(iunit) tmp(:,:,:)
          close(iunit)
          call ezfio_set_mo_two_e_ints_io_mo_cholesky('Read')
          deallocate(Vi, V, tmp)
          print *, 'Cholesky MO integrals read from TREXIO file'
      endif
      rc = trexio_has_mo_2e_int_eri(f)
      if (rc /= TREXIO_HAS_NOT) then
          BUFSIZE=mo_num**2
          allocate(buffer_i(BUFSIZE), buffer_values(BUFSIZE))
          allocate(Vi(4,BUFSIZE), V(BUFSIZE))
@ -220,6 +308,8 @@ subroutine run(f)
          call ezfio_set_mo_two_e_ints_io_mo_two_e_integrals('Read')
          deallocate(buffer_i, buffer_values, Vi, V)
          print *, 'MO integrals read from TREXIO file'
      endif
  else
      print *, 'MO integrals not found in TREXIO file'
  endif
--- a/src/utils/fortran_mmap.c
+++ b/src/utils/fortran_mmap.c
@ -40,7 +40,7 @@ void* mmap_fortran(char* filename, size_t bytes, int* file_descr, int read_only,
            exit(EXIT_FAILURE);
        }
-        result = write(fd, "", 1);
+        result = write(fd, " ", 1);
        if (result != 1) {
            close(fd);
            printf("%s:\n", filename);
@ -49,7 +49,13 @@ void* mmap_fortran(char* filename, size_t bytes, int* file_descr, int read_only,
        }
        if (single_node == 1) {
-          map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_POPULATE | MAP_NONBLOCK, fd, 0);
+          map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 /*
          map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_POPULATE | MAP_NONBLOCK | MAP_NORESERVE, fd, 0);
          if (map == MAP_FAILED) {
             map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
          }
 */
        } else {
          map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        }
--- a/src/utils/linear_algebra.irp.f
+++ b/src/utils/linear_algebra.irp.f
@ -1856,7 +1856,7 @@ subroutine pivoted_cholesky( A, rank, tol, ndim, U)
 !
 ! matrix A is destroyed inside this subroutine
 ! Cholesky vectors are stored in U
-! dimension of U: U(1:rank, 1:n)
+! dimension of U: U(1:n, 1:rank)
 ! U is allocated inside this subroutine
 ! rank is the number of Cholesky vectors depending on tol
 !
		`@ -0,0 +1,2 @@`
							`-ltbb -lsycl -lmkl_sycl -lgpu -limf -lintlc -lstdc++`