Merge branch 'dev-stable' of https://github.com/AbdAmmar/qp2 into dev-stable

2025-04-26 10:14:45 +02:00 · 2024-07-29 17:22:53 +02:00 · 2024-07-29 17:22:53 +02:00 · eb236e0112
commit eb236e0112
parent 73066b4ac5 4d79bd135f
103 changed files with 4182 additions and 1978 deletions
--- a/18
+++ b/18
@ -40,7 +40,7 @@ Usage:
  $(basename $0) -c <file>    
  $(basename $0) -h           
  $(basename $0) -i <package> 
-  $(basename $0) -g [nvidia|none]
+  $(basename $0) -g [nvidia|intel|none]
 Options:
  -c  <file>        Define a COMPILATION configuration file,
@ -49,7 +49,7 @@ Options:
  -i <package>      INSTALL <package>. Use at your OWN RISK:
                    no support will be provided for the installation of
                    dependencies.
-  -g [nvidia|none]  Choose GPU acceleration (experimental)
+  -g [nvidia|intel|none]  Choose GPU acceleration
 Example:
  ./$(basename $0) -c config/gfortran.cfg
@ -115,19 +115,23 @@ while getopts "d:c:i:g:h" c ; do
 done
 # Handle GPU acceleration
-rm -f ${QP_ROOT}/src/gpu
+rm -f ${QP_ROOT}/src/gpu_arch
 case "$GPU" in
-  amd) # Nvidia
+  amd) # AMD
     echo "Activating AMD GPU acceleration"
-     ln -s ${QP_ROOT}/src/gpu_amd ${QP_ROOT}/src/gpu
+     ln -s ${QP_ROOT}/plugins/local/gpu_amd ${QP_ROOT}/src/gpu_arch
     ;;
  intel) # Intel
     echo "Activating Intel GPU acceleration (EXPERIMENTAL)"
     ln -s ${QP_ROOT}/plugins/local/gpu_intel ${QP_ROOT}/src/gpu_arch
     ;;
  nvidia) # Nvidia
     echo "Activating Nvidia GPU acceleration"
-     ln -s ${QP_ROOT}/src/gpu_nvidia ${QP_ROOT}/src/gpu
+     ln -s ${QP_ROOT}/plugins/local/gpu_nvidia ${QP_ROOT}/src/gpu_arch
     ;;
  *) # No Acceleration
     echo "Disabling GPU acceleration"
-     ln -s ${QP_ROOT}/src/gpu_x86 ${QP_ROOT}/src/gpu
+     ln -s ${QP_ROOT}/plugins/local/gpu_x86 ${QP_ROOT}/src/gpu_arch
     ;;
 esac
--- a/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f
+++ b/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f
@ -1,4 +1,36 @@
 ! ---
 subroutine run_pouet
  BEGIN_DOC
  ! Diagnostic routine: prints the diagonal elements
  ! Fock_matrix_tc_mo_tot(i,i) for i = 1, ..., mo_num on standard output.
  END_DOC
  implicit none
  integer :: i
  ! Request these entities up front, in the same order as before, so that
  ! they are built before the Fock matrix is read in the loop below.
  PROVIDE mo_l_coef mo_r_coef
  PROVIDE H_apply_buffer_allocated distributed_davidson 
  print*, ' Diagonal elements of the Fock matrix '
  do i = 1, mo_num
    write(*,*) i, Fock_matrix_tc_mo_tot(i,i)
  enddo
 end
 ! ---
 subroutine run_stochastic_cipsi
--- a/plugins/local/fci_tc_bi/fci_tc_bi_ortho.irp.f
+++ b/plugins/local/fci_tc_bi/fci_tc_bi_ortho.irp.f
@ -65,7 +65,15 @@ subroutine run_cipsi_tc()
  if (.not. is_zmq_slave) then
    if(.True.)then! DO NOT REMOVE THE IF(.TRUE.) !! 
     ! this has to be provided before mo_bi_ortho_tc_two_e to avoid twice the computation of ao_two_e_tc_tot
     PROVIDE Fock_matrix_tc_mo_tot 
     ! because Fock_matrix_tc_mo_tot depends on ao_two_e_tc_tot 
     ! and that mo_bi_ortho_tc_two_e erase ao_two_e_tc_tot after being provided 
    endif
    if(.True.)then ! DO NOT REMOVE THE IF(.TRUE.) !! 
     PROVIDE psi_det psi_coef mo_bi_ortho_tc_two_e mo_bi_ortho_tc_one_e 
    endif
    if((elec_alpha_num+elec_beta_num) .ge. 3) then
      if(three_body_h_tc) then
@ -90,8 +98,16 @@ subroutine run_cipsi_tc()
    call json_close
  else
    if(.True.)then! DO NOT REMOVE THE IF(.TRUE.) !! 
     ! this has to be provided before mo_bi_ortho_tc_two_e to avoid twice the computation of ao_two_e_tc_tot
     PROVIDE Fock_matrix_tc_mo_tot 
     ! because Fock_matrix_tc_mo_tot depends on ao_two_e_tc_tot 
     ! and that mo_bi_ortho_tc_two_e erase ao_two_e_tc_tot after being provided 
    endif
    if(.True.)then! DO NOT REMOVE THE IF(.TRUE.) !! 
     PROVIDE mo_bi_ortho_tc_one_e mo_bi_ortho_tc_two_e pt2_min_parallel_tasks
    endif
    if((elec_alpha_num+elec_beta_num) .ge. 3) then
      if(three_body_h_tc) then
--- a/plugins/local/gpu_intel/LIB
+++ b/plugins/local/gpu_intel/LIB
@ -0,0 +1,2 @@
 -ltbb -lsycl -lmkl_sycl -lgpu -limf -lintlc -lstdc++ 
--- a/plugins/local/gpu_intel/NEED
+++ b/plugins/local/gpu_intel/NEED
--- a/plugins/local/gpu_intel/README.rst
+++ b/plugins/local/gpu_intel/README.rst
@ -0,0 +1,8 @@
 =========
 gpu_intel
 =========
 Intel implementation of GPU routines. Uses MKL and SYCL.

 .. code-block:: bash

     icpx -fsycl gpu.cxx -c -qmkl=sequential
--- a/plugins/local/gpu_intel/gpu.sycl
+++ b/plugins/local/gpu_intel/gpu.sycl
@ -0,0 +1,177 @@
 #include <CL/sycl.hpp>
 #include <cassert>
 #include <cstdint>
 #include <cstdlib>
 #include <iostream>
 #include <limits>
 #include <oneapi/mkl/blas.hpp>
 extern "C" {
 /* Generic functions */
 /* Number of devices reported to the caller: this backend always answers 1. */
 int gpu_ndevices(void) {
  const int device_count = 1;
  return device_count;
 }
 /* Device selection hook: intentionally does nothing in this backend. */
 void gpu_set_device(int32_t igpu) {
    (void) igpu;  /* the requested index is ignored */
 }
 /* Allocation functions */
 void gpu_allocate(void** ptr, int64_t size) {
    auto queue = sycl::queue(sycl::default_selector_v);
    try {
        *ptr = sycl::malloc_shared(size, queue);
        assert(*ptr != nullptr);
    } catch (const sycl::exception& e) {
        std::cerr << "SYCL exception caught: " << e.what() << std::endl;
        *ptr = nullptr; // If allocation fails, set pointer to nullptr
    }
 }
 void gpu_deallocate(void** ptr) {
    assert(*ptr != nullptr);
    sycl::free(*ptr, sycl::queue(sycl::default_selector_v));
    *ptr = nullptr;
 }
 /* Upload data from host to device */
 void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
    sycl::queue queue(sycl::default_selector_v);
    queue.memcpy(gpu_ptr, cpu_ptr, n).wait();
 }
 /* Download data from device to host */
 void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
    sycl::queue queue(sycl::default_selector_v);
    queue.memcpy(cpu_ptr, gpu_ptr, n).wait();
 }
 /* Copy data from one GPU memory location to another */
 void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
    sycl::queue queue(sycl::default_selector_v);
    queue.memcpy(gpu_ptr_dest, gpu_ptr_src, n).wait();
 }
 /* Queues */
 /* A heap-allocated SYCL queue stands in for a CUDA stream; the new queue is
    returned through *ptr and must be released with gpu_stream_destroy. */
 void gpu_stream_create(sycl::queue** ptr) {
    sycl::queue* fresh = new sycl::queue(sycl::default_selector_v);
    *ptr = fresh;
 }
 /* Delete a queue created by gpu_stream_create and null the caller's pointer.
    *ptr must be non-null (asserted). */
 void gpu_stream_destroy(sycl::queue** ptr) {
    sycl::queue* doomed = *ptr;
    assert(doomed != nullptr);
    delete doomed;
    *ptr = nullptr;
 }
 /* Construct a default-selected queue and wait on it, rethrowing any
    asynchronous SYCL errors it reports.
    NOTE(review): the queue is created here, so work submitted on other queues
    (streams, BLAS handles) is presumably not waited for - confirm this is
    the intended semantics. */
 void gpu_synchronize() {
    sycl::queue(sycl::default_selector_v).wait_and_throw();
 }
 /* BLAS functions */
 typedef struct {
  sycl::queue* queue;
 } blasHandle_t;
 void gpu_set_stream(blasHandle_t* handle, sycl::queue* ptr) {
  handle->queue = ptr;
 }
 void gpu_blas_create(blasHandle_t** ptr) {
    *ptr = (blasHandle_t*) malloc(sizeof(blasHandle_t));
    assert(*ptr != nullptr);
    (*ptr)->queue = new sycl::queue(sycl::default_selector_v);
    assert((*ptr)->queue != nullptr);
 }
 void gpu_blas_destroy(blasHandle_t** ptr) {
    assert(*ptr != nullptr);
    delete (*ptr)->queue;
    free(*ptr);
    *ptr = nullptr;
 }
 /* Dot product of x and y (n elements, strides incx / incy) computed with
    oneMKL on the handle's queue; the value is stored in *result.
    The call is synchronous: *result is valid when the function returns, which
    matches the CUDA backend where the result lands in a host variable.
    The value is staged through a small shared USM scalar because `result`
    may be ordinary host memory that the device cannot write to. */
 void gpu_ddot(blasHandle_t* handle, const int64_t n, const double* x, const int64_t incx, 
              const double* y, const int64_t incy, double* result) {
    // Ensure input parameters are valid
    assert(handle != nullptr);
    assert(handle->queue != nullptr);
    assert(n > 0);
    assert(incx > 0);
    assert(incy > 0);
    assert(x != nullptr);
    assert(y != nullptr);
    assert(result != nullptr);
    sycl::queue& q = *handle->queue;
    double* staged = sycl::malloc_shared<double>(1, q);
    assert(staged != nullptr);
    // Wait for the reduction before reading it back on the host
    oneapi::mkl::blas::dot(q, n, x, incx, y, incy, staged).wait();
    *result = *staged;
    sycl::free(staged, q);
 }
 /* Double-precision matrix-vector product forwarded to the oneMKL column-major
    gemv on the handle's queue. *transa equal to 'T' or 't' selects the
    transposed operation; any other character means no transpose. alpha and
    beta are read on the host and passed by value. The call is only enqueued;
    this function does not wait for it. */
 void gpu_dgemv(blasHandle_t* handle, const char* transa, const int64_t m, const int64_t n, const double* alpha,
               const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) {
    assert(handle != nullptr);
    assert(handle->queue != nullptr);
    // Dimensions, leading dimension and strides must all be positive
    assert(m > 0 && n > 0 && lda > 0 && incx > 0 && incy > 0);
    assert(a != nullptr && x != nullptr && y != nullptr && alpha != nullptr && beta != nullptr);
    const char op = *transa;
    const bool transposed = (op == 'T') || (op == 't');
    const oneapi::mkl::transpose transa_ =
        transposed ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans;
    sycl::queue& q = *handle->queue;
    oneapi::mkl::blas::column_major::gemv(q, transa_, m, n, *alpha, a, lda, x, incx, *beta, y, incy);
 }
 /* Double-precision matrix-matrix product forwarded to the oneMKL column-major
    gemm on the handle's queue. *transa / *transb equal to 'T' or 't' select
    the transposed operand; any other character means no transpose. The call
    is only enqueued; this function does not wait for it. */
 void gpu_dgemm(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha,
               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) {
    assert(handle != nullptr && handle->queue != nullptr);
    assert(m > 0 && n > 0 && k > 0 && lda > 0 && ldb > 0 && ldc > 0);
    assert(a != nullptr && b != nullptr && c != nullptr && alpha != nullptr && beta != nullptr);
    // Map the BLAS-style character flags onto oneMKL transpose values
    oneapi::mkl::transpose transa_ = oneapi::mkl::transpose::nontrans;
    if (*transa == 'T' || *transa == 't') {
        transa_ = oneapi::mkl::transpose::trans;
    }
    oneapi::mkl::transpose transb_ = oneapi::mkl::transpose::nontrans;
    if (*transb == 'T' || *transb == 't') {
        transb_ = oneapi::mkl::transpose::trans;
    }
    sycl::queue& q = *handle->queue;
    oneapi::mkl::blas::column_major::gemm(q, transa_, transb_, m, n, k,
                                    *alpha, a, lda, b, ldb, *beta, c, ldc);
 }
 /* C <- alpha * op(A) + beta * op(B), with C an m x n matrix.
    All matrices are column-major (element (i,j) of C is c[j*ldc + i]), the
    same convention as the gemm/gemv wrappers in this file. *transa / *transb
    equal to 'T' or 't' select the transposed operand; any other character
    means no transpose. The kernel is only enqueued on the handle's queue;
    this function does not wait for it.
    Changes with respect to the previous version:
    - indexing is column-major instead of row-major, which gave wrong results
      for non-square matrices;
    - alpha and beta are read on the host, since the kernel must not
      dereference host pointers;
    - indices are 64-bit so that large matrices do not overflow;
    - a zero alpha (beta) skips reading A (B), so stale or NaN data in an
      unused operand cannot leak into C. */
 void gpu_dgeam(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha,
               const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
    assert(handle != nullptr && handle->queue != nullptr);
    assert(m > 0 && n > 0 && lda > 0 && ldb > 0 && ldc > 0);
    assert(a != nullptr && b != nullptr && c != nullptr && alpha != nullptr && beta != nullptr);
    // Determine transpose operations
    const bool transA = (*transa == 'T' || *transa == 't');
    const bool transB = (*transb == 'T' || *transb == 't');
    // Scalars are captured by value: alpha/beta point to host memory
    const double alpha_ = *alpha;
    const double beta_  = *beta;
    // The last range dimension is the row index i, contiguous in column-major storage
    const sycl::range<2> extent(static_cast<size_t>(n), static_cast<size_t>(m));
    handle->queue->submit([&](sycl::handler& cgh) {
        cgh.parallel_for(extent, [=](sycl::id<2> idx) {
            const int64_t j = static_cast<int64_t>(idx[0]);  // column of C
            const int64_t i = static_cast<int64_t>(idx[1]);  // row of C
            const int64_t ai = transA ? i * lda + j : j * lda + i;
            const int64_t bi = transB ? i * ldb + j : j * ldb + i;
            const int64_t ci = j * ldc + i;
            const double ta = (alpha_ == 0.0) ? 0.0 : alpha_ * a[ai];
            const double tb = (beta_  == 0.0) ? 0.0 : beta_  * b[bi];
            c[ci] = ta + tb;
        });
    });
 }
 }  // extern C
--- a/plugins/local/gpu_nvidia/LIB
+++ b/plugins/local/gpu_nvidia/LIB
@ -0,0 +1 @@
 -lcudart -lcublas -lcublasLt
--- a/plugins/local/gpu_nvidia/NEED
+++ b/plugins/local/gpu_nvidia/NEED
@ -0,0 +1 @@
--- a/plugins/local/gpu_nvidia/README.rst
+++ b/plugins/local/gpu_nvidia/README.rst
@ -0,0 +1,5 @@
 ==========
 gpu_nvidia
 ==========
 Nvidia implementation of GPU routines. Uses CUDA and CUBLAS libraries.
--- a/plugins/local/gpu_nvidia/gpu.c
+++ b/plugins/local/gpu_nvidia/gpu.c
@ -0,0 +1,326 @@
 #include <stdint.h>
 #include <stdio.h>
 #include <stdbool.h>
 #include <stdlib.h>
 #include <string.h>
 #include <assert.h>
 #include <cublas_v2.h>
 #include <cuda_runtime.h>
 /* Generic functions */
 int gpu_ndevices() {
  int ngpus;
  cudaGetDeviceCount(&ngpus);
  return ngpus;
 }
 void gpu_set_device(int32_t igpu) {
  cudaSetDevice((int) igpu);
 }
 /* Allocation functions */
 /* Allocate `size` bytes of CUDA managed memory and store the address in *ptr.
    Managed memory is used unconditionally. The previous revision also queried
    the free device memory for a cudaMalloc fast path that was commented out;
    that dead query (and its local named `free`, which shadowed the libc
    function) has been removed.
    Failure is reported through assert, as elsewhere in this file. */
 void gpu_allocate(void** ptr, const int64_t size) {
    assert (ptr != NULL);
    assert (size >= 0);
    cudaError_t rc = cudaMallocManaged(ptr, (size_t) size, cudaMemAttachGlobal);
    assert (rc == cudaSuccess);
 }
 void gpu_deallocate(void** ptr) {
  assert (*ptr != NULL);
  cudaFree(*ptr);
  *ptr = NULL;
 }
 /* Memory transfer functions */
 void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
  cudaMemcpy (gpu_ptr, cpu_ptr, n, cudaMemcpyHostToDevice);
 }
 void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
  cudaMemcpy (cpu_ptr, gpu_ptr, n, cudaMemcpyDeviceToHost);
 }
 void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
  cudaMemcpy (gpu_ptr_dest, gpu_ptr_src, n, cudaMemcpyDeviceToDevice);
 }
 /* Streams */
 /* Create a CUDA stream and return it through *ptr; creation failure trips
    the assert. */
 void gpu_stream_create(cudaStream_t* ptr) {
  cudaError_t rc;
  rc = cudaStreamCreate(ptr);
  assert (rc == cudaSuccess);
 }
 /* Destroy the stream stored in *ptr and reset the caller's variable to NULL.
    `ptr` must be non-NULL and the destruction must succeed (both asserted). */
 void gpu_stream_destroy(cudaStream_t* ptr) {
  cudaError_t rc;
  assert (ptr != NULL);
  rc = cudaStreamDestroy(*ptr);
  assert (rc == cudaSuccess);
  *ptr = NULL;
 }
 /* Make subsequent cuBLAS calls on `handle` run on `stream`.
    The cuBLAS status is asserted instead of being dropped. */
 void gpu_set_stream(cublasHandle_t handle, cudaStream_t stream) {
  cublasStatus_t rc = cublasSetStream(handle, stream);
  assert (rc == CUBLAS_STATUS_SUCCESS);
 }
 void gpu_synchronize() {
  cudaDeviceSynchronize();
 }
 /* BLAS functions */
 /* Create a cuBLAS handle and return it through *ptr; creation failure trips
    the assert. */
 void gpu_blas_create(cublasHandle_t* ptr) {
  cublasStatus_t rc;
  rc = cublasCreate(ptr);
  assert (rc == CUBLAS_STATUS_SUCCESS);
 }
 /* Destroy the cuBLAS handle stored in *ptr and reset the caller's variable.
    `ptr` must be non-NULL and the destruction must succeed (both asserted). */
 void gpu_blas_destroy(cublasHandle_t* ptr) {
  assert (ptr != NULL);
  cublasStatus_t rc = cublasDestroy(*ptr);
  assert (rc == CUBLAS_STATUS_SUCCESS);
  /* Clear the caller's handle, as gpu_stream_destroy does for streams.
     The previous `ptr = NULL` only changed the local parameter. */
  *ptr = NULL;
 }
 /* Double-precision dot product forwarded to cublasDdot.
    n, incx and incy arrive as 64-bit integers and are narrowed to int for
    cuBLAS; the asserts reject values that do not survive the narrowing.
    The result is written through `result`. */
 void gpu_ddot(cublasHandle_t handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
  assert (handle != NULL);
  /* Convert to int */
  int n_    = (int) n;
  int incx_ = (int) incx;
  int incy_ = (int) incy;
  /* Check for integer overflows */
  assert ( (int64_t)    n_ == n   );
  assert ( (int64_t) incx_ == incx);
  assert ( (int64_t) incy_ == incy);
  cublasStatus_t rc;
  rc = cublasDdot(handle, n_, x, incx_, y, incy_, result);
  assert (rc == CUBLAS_STATUS_SUCCESS);
 }
 /* Single-precision dot product forwarded to cublasSdot.
    n, incx and incy arrive as 64-bit integers and are narrowed to int for
    cuBLAS; the asserts reject values that do not survive the narrowing.
    cuBLAS writes into a local float that is then copied to *result. */
 void gpu_sdot(cublasHandle_t handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
  assert (handle != NULL);
  /* Convert to int */
  int n_    = (int) n;
  int incx_ = (int) incx;
  int incy_ = (int) incy;
  /* Check for integer overflows */
  assert ( (int64_t)    n_ == n   );
  assert ( (int64_t) incx_ == incx);
  assert ( (int64_t) incy_ == incy);
  float result_ = 0.;
  cublasStatus_t rc;
  rc = cublasSdot(handle, n_, x, incx_, y, incy_, &result_);
  assert (rc == CUBLAS_STATUS_SUCCESS);
  *result = result_;
 }
 /* Double-precision matrix-vector product forwarded to cublasDgemv.
    Sizes and strides arrive as 64-bit integers and are narrowed to int; the
    asserts reject values that do not survive the narrowing. *transa equal to
    'T' or 't' selects CUBLAS_OP_T, anything else CUBLAS_OP_N.
    The cuBLAS status is asserted, as in gpu_ddot, instead of being dropped. */
 void gpu_dgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const double* alpha,
               const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) {
  assert (handle != NULL);
  /* Convert to int */
  const int m_    = (int) m;
  const int n_    = (int) n;
  const int lda_  = (int) lda;
  const int incx_ = (int) incx;
  const int incy_ = (int) incy;
  /* Check for integer overflows */
  assert ( (int64_t)    m_ == m   );
  assert ( (int64_t)    n_ == n   );
  assert ( (int64_t)  lda_ == lda );
  assert ( (int64_t) incx_ == incx);
  assert ( (int64_t) incy_ == incy);
  cublasOperation_t transa_ = CUBLAS_OP_N;
  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
  cublasStatus_t rc = cublasDgemv(handle, transa_, m_, n_, alpha, a, lda_, x, incx_, beta, y, incy_);
  assert (rc == CUBLAS_STATUS_SUCCESS);
 }
 /* Single-precision matrix-vector product forwarded to cublasSgemv.
    Sizes and strides arrive as 64-bit integers and are narrowed to int; the
    asserts reject values that do not survive the narrowing. *transa equal to
    'T' or 't' selects CUBLAS_OP_T, anything else CUBLAS_OP_N.
    The cuBLAS status is asserted, as in gpu_sdot, instead of being dropped. */
 void gpu_sgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const float* alpha,
               const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy) {
  assert (handle != NULL);
  /* Convert to int */
  const int m_    = (int) m;
  const int n_    = (int) n;
  const int lda_  = (int) lda;
  const int incx_ = (int) incx;
  const int incy_ = (int) incy;
  /* Check for integer overflows */
  assert ( (int64_t)    m_ == m   );
  assert ( (int64_t)    n_ == n   );
  assert ( (int64_t)  lda_ == lda );
  assert ( (int64_t) incx_ == incx);
  assert ( (int64_t) incy_ == incy);
  cublasOperation_t transa_ = CUBLAS_OP_N;
  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
  cublasStatus_t rc = cublasSgemv(handle, transa_, m_, n_, alpha, a, lda_, x, incx_, beta, y, incy_);
  assert (rc == CUBLAS_STATUS_SUCCESS);
 }
 /* Double-precision matrix-matrix product forwarded to cublasDgemm.
    Sizes and leading dimensions arrive as 64-bit integers and are narrowed to
    int; the asserts reject values that do not survive the narrowing.
    *transa / *transb equal to 'T' or 't' select CUBLAS_OP_T, anything else
    CUBLAS_OP_N.
    The cuBLAS status is asserted, as in gpu_ddot, instead of being dropped. */
 void gpu_dgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha,
               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) {
  assert (handle != NULL);
  /* Convert to int */
  const int m_   = (int) m;
  const int n_   = (int) n;
  const int k_   = (int) k;
  const int lda_ = (int) lda;
  const int ldb_ = (int) ldb;
  const int ldc_ = (int) ldc;
  /* Check for integer overflows */
  assert ( (int64_t)   m_ == m  );
  assert ( (int64_t)   n_ == n  );
  assert ( (int64_t)   k_ == k  );
  assert ( (int64_t) lda_ == lda);
  assert ( (int64_t) ldb_ == ldb);
  assert ( (int64_t) ldc_ == ldc);
  cublasOperation_t transa_ = CUBLAS_OP_N;
  cublasOperation_t transb_ = CUBLAS_OP_N;
  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
  if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
  cublasStatus_t rc = cublasDgemm(handle, transa_, transb_, m_, n_, k_, alpha, a, lda_, b, ldb_, beta, c, ldc_);
  assert (rc == CUBLAS_STATUS_SUCCESS);
 }
 /* Single-precision matrix-matrix product forwarded to cublasSgemm.
    Sizes and leading dimensions arrive as 64-bit integers and are narrowed to
    int; the asserts reject values that do not survive the narrowing.
    *transa / *transb equal to 'T' or 't' select CUBLAS_OP_T, anything else
    CUBLAS_OP_N.
    The cuBLAS status is asserted, as in gpu_sdot, instead of being dropped. */
 void gpu_sgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float* alpha,
               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc) {
  assert (handle != NULL);
  /* Convert to int */
  const int m_   = (int) m;
  const int n_   = (int) n;
  const int k_   = (int) k;
  const int lda_ = (int) lda;
  const int ldb_ = (int) ldb;
  const int ldc_ = (int) ldc;
  /* Check for integer overflows */
  assert ( (int64_t)   m_ == m  );
  assert ( (int64_t)   n_ == n  );
  assert ( (int64_t)   k_ == k  );
  assert ( (int64_t) lda_ == lda);
  assert ( (int64_t) ldb_ == ldb);
  assert ( (int64_t) ldc_ == ldc);
  cublasOperation_t transa_ = CUBLAS_OP_N;
  cublasOperation_t transb_ = CUBLAS_OP_N;
  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
  if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
  cublasStatus_t rc = cublasSgemm(handle, transa_, transb_, m_, n_, k_, alpha, a, lda_, b, ldb_, beta, c, ldc_);
  assert (rc == CUBLAS_STATUS_SUCCESS);
 }
 /* Double-precision matrix addition/transposition forwarded to cublasDgeam.
    Sizes and leading dimensions arrive as 64-bit integers and are narrowed to
    int; the asserts reject values that do not survive the narrowing.
    *transa / *transb equal to 'T' or 't' select CUBLAS_OP_T, anything else
    CUBLAS_OP_N.
    The cuBLAS status is asserted, as in gpu_ddot, instead of being dropped. */
 void gpu_dgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha,
               const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
  assert (handle != NULL);
  /* Convert to int */
  const int m_   = (int) m;
  const int n_   = (int) n;
  const int lda_ = (int) lda;
  const int ldb_ = (int) ldb;
  const int ldc_ = (int) ldc;
  /* Check for integer overflows */
  assert ( (int64_t)   m_ == m  );
  assert ( (int64_t)   n_ == n  );
  assert ( (int64_t) lda_ == lda);
  assert ( (int64_t) ldb_ == ldb);
  assert ( (int64_t) ldc_ == ldc);
  cublasOperation_t transa_ = CUBLAS_OP_N;
  cublasOperation_t transb_ = CUBLAS_OP_N;
  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
  if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
  cublasStatus_t rc = cublasDgeam(handle, transa_, transb_, m_, n_, alpha, a, lda_, beta, b, ldb_, c, ldc_);
  assert (rc == CUBLAS_STATUS_SUCCESS);
 }
 /* Single-precision matrix addition/transposition forwarded to cublasSgeam.
    Sizes and leading dimensions arrive as 64-bit integers and are narrowed to
    int; the asserts reject values that do not survive the narrowing.
    *transa / *transb equal to 'T' or 't' select CUBLAS_OP_T, anything else
    CUBLAS_OP_N.
    The cuBLAS status is asserted, as in gpu_sdot, instead of being dropped. */
 void gpu_sgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float* alpha,
               const float* a, const int64_t lda, const float* beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
  assert (handle != NULL);
  /* Convert to int */
  const int m_   = (int) m;
  const int n_   = (int) n;
  const int lda_ = (int) lda;
  const int ldb_ = (int) ldb;
  const int ldc_ = (int) ldc;
  /* Check for integer overflows */
  assert ( (int64_t)   m_ == m  );
  assert ( (int64_t)   n_ == n  );
  assert ( (int64_t) lda_ == lda);
  assert ( (int64_t) ldb_ == ldb);
  assert ( (int64_t) ldc_ == ldc);
  cublasOperation_t transa_ = CUBLAS_OP_N;
  cublasOperation_t transb_ = CUBLAS_OP_N;
  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
  if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
  cublasStatus_t rc = cublasSgeam(handle, transa_, transb_, m_, n_, alpha, a, lda_, beta, b, ldb_, c, ldc_);
  assert (rc == CUBLAS_STATUS_SUCCESS);
 }
--- a/plugins/local/gpu_x86/NEED
+++ b/plugins/local/gpu_x86/NEED
@ -0,0 +1 @@
--- a/plugins/local/gpu_x86/README.rst
+++ b/plugins/local/gpu_x86/README.rst
--- a/plugins/local/gpu_x86/gpu.c
+++ b/plugins/local/gpu_x86/gpu.c
@ -2,13 +2,13 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <stdbool.h>
 #include <assert.h>
 /* Generic functions */
 int gpu_ndevices() {
-  return 1;
+  return 0;
 }
 void gpu_set_device(int32_t i) {
@ -25,7 +25,7 @@ void gpu_allocate(void** ptr, const int64_t n) {
  }
 }
-void gpu_free(void** ptr) {
+void gpu_deallocate(void** ptr) {
  free(*ptr);
  *ptr = NULL;
 }
@ -49,10 +49,11 @@ void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
 /* Streams */
 void gpu_stream_create(void** ptr) {
-  *ptr = (void*) 2;
+  *ptr = (void*) malloc(sizeof(char));
 }
 void gpu_stream_destroy(void** ptr) {
  free(*ptr);
  *ptr = NULL;
 }
@ -68,18 +69,19 @@ void gpu_synchronize() {
 /* BLAS functions */
 void gpu_blas_create(void** handle) {
-  *handle = (void*) 1;
+  *handle = (void*) malloc(sizeof(char));
 }
 void gpu_blas_destroy(void** handle) {
  free(*handle);
  *handle = NULL;
 }
 double ddot_(const int32_t* n, const double* x, const int32_t* incx, const double* y, const int32_t* incy);
-void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
+void gpu_ddot(void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
  assert (handle != NULL);
  /* Convert to int32_t */
@ -100,7 +102,7 @@ void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_
 float sdot_(const int32_t* n, const float* x, const int32_t* incx, const float* y, const int32_t* incy);
-void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
+void gpu_sdot(void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
  assert (handle != NULL);
  /* Convert to int32_t */
@ -122,8 +124,8 @@ void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t
 void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha,
            const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy);
-void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double alpha,
+void gpu_dgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const double* alpha,
-               const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) {
+               const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) {
  assert (handle != NULL);
@ -143,15 +145,15 @@ void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int
  assert ( (int64_t) incx_ == incx);
  assert ( (int64_t) incy_ == incy);
-  dgemv_(&transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_);
+  dgemv_(transa, &m_, &n_, alpha, a, &lda_, x, &incx_, beta, y, &incy_);
 }
 void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha,
               const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy);
-void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float alpha,
+void gpu_sgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const float* alpha,
-               const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) {
+               const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy) {
  assert (handle != NULL);
@ -171,15 +173,15 @@ void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int
  assert ( (int64_t) incx_ == incx);
  assert ( (int64_t) incy_ == incy);
-  sgemv_(&transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_);
+  sgemv_(transa, &m_, &n_, alpha, a, &lda_, x, &incx_, beta, y, &incy_);
 }
 void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha,
            const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc);
-void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
+void gpu_dgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha,
-               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) {
+               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) {
  assert (handle != NULL);
@ -201,7 +203,7 @@ void gpu_dgemm(const void* handle, const char transa, const char transb, const i
  assert ( (int64_t) ldb_ == ldb);
  assert ( (int64_t) ldc_ == ldc);
-  dgemm_(&transa, &transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_);
+  dgemm_(transa, transb, &m_, &n_, &k_, alpha, a, &lda_, b, &ldb_, beta, c, &ldc_);
 }
@ -209,8 +211,8 @@ void gpu_dgemm(const void* handle, const char transa, const char transb, const i
 void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha,
            const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc);
-void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
+void gpu_sgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float* alpha,
-               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) {
+               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc) {
  assert (handle != NULL);
@ -232,136 +234,133 @@ void gpu_sgemm(const void* handle, const char transa, const char transb, const i
  assert ( (int64_t) ldb_ == ldb);
  assert ( (int64_t) ldc_ == ldc);
-  sgemm_(&transa, &transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_);
+  sgemm_(transa, transb, &m_, &n_, &k_, alpha, a, &lda_, b, &ldb_, beta, c, &ldc_);
 }
-void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
+void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha,
-               const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
+               const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
-  if (handle == NULL) {
+  assert (handle != NULL);
-    perror("NULL handle");
-    exit(-1);
-  }
-  if ( (transa == 'N' && transb == 'N') ||
+  if ( (*transa == 'N' && *transb == 'N') ||
-       (transa == 'n' && transb == 'N') ||
+       (*transa == 'n' && *transb == 'N') ||
-       (transa == 'N' && transb == 'n') ||
+       (*transa == 'N' && *transb == 'n') ||
-       (transa == 'n' && transb == 'n') ) {
+       (*transa == 'n' && *transb == 'n') ) {
-     if (alpha == 0.) {
+     if (*alpha == 0.) {
       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = beta * b[j*ldb+i];
+           c[j*ldc+i] = *beta * b[j*ldb+i];
         }
       }
-     } else if (beta == 0.) {
+     } else if (*beta == 0.) {
       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = alpha * a[j*lda+i];
+           c[j*ldc+i] = *alpha * a[j*lda+i];
         }
       }
     } else {
       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[j*ldb+i];
+           c[j*ldc+i] = *alpha * a[j*lda+i] + *beta * b[j*ldb+i];
         }
       }
     }
-  } else if ( (transa == 'N' && transb == 'T') ||
+  } else if ( (*transa == 'N' && *transb == 'T') ||
-              (transa == 'n' && transb == 'T') ||
+              (*transa == 'n' && *transb == 'T') ||
-              (transa == 'N' && transb == 't') ||
+              (*transa == 'N' && *transb == 't') ||
-              (transa == 'n' && transb == 't') ) {
+              (*transa == 'n' && *transb == 't') ) {
-     if (alpha == 0.) {
+     if (*alpha == 0.) {
       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = beta * b[i*ldb+j];
+           c[j*ldc+i] = *beta * b[i*ldb+j];
         }
       }
-     } else if (beta == 0.) {
+     } else if (*beta == 0.) {
       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = alpha * a[j*lda+i];
+           c[j*ldc+i] = *alpha * a[j*lda+i];
         }
       }
     } else {
       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[i*ldb+j];
+           c[j*ldc+i] = *alpha * a[j*lda+i] + *beta * b[i*ldb+j];
         }
       }
     }
-  } else if ( (transa == 'T' && transb == 'N') ||
+  } else if ( (*transa == 'T' && *transb == 'N') ||
-              (transa == 't' && transb == 'N') ||
+              (*transa == 't' && *transb == 'N') ||
-              (transa == 'T' && transb == 'n') ||
+              (*transa == 'T' && *transb == 'n') ||
-              (transa == 't' && transb == 'n') ) {
+              (*transa == 't' && *transb == 'n') ) {
-     if (alpha == 0.) {
+     if (*alpha == 0.) {
       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = beta * b[j*ldb+i];
+           c[j*ldc+i] = *beta * b[j*ldb+i];
         }
       }
-     } else if (beta == 0.) {
+     } else if (*beta == 0.) {
       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = alpha * a[i*lda+j];
+           c[j*ldc+i] = *alpha * a[i*lda+j];
         }
       }
     } else {
       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[j*ldb+i];
+           c[j*ldc+i] = *alpha * a[i*lda+j] + *beta * b[j*ldb+i];
         }
       }
     }
-  } else if ( (transa == 'T' && transb == 'T') ||
+  } else if ( (*transa == 'T' && *transb == 'T') ||
-              (transa == 't' && transb == 'T') ||
+              (*transa == 't' && *transb == 'T') ||
-              (transa == 'T' && transb == 't') ||
+              (*transa == 'T' && *transb == 't') ||
-              (transa == 't' && transb == 't') ) {
+              (*transa == 't' && *transb == 't') ) {
-     if (alpha == 0.) {
+     if (*alpha == 0.) {
       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = beta * b[i*ldb+j];
+           c[j*ldc+i] = *beta * b[i*ldb+j];
         }
       }
-     } else if (beta == 0.) {
+     } else if (*beta == 0.) {
       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = alpha * a[i*lda+j];
+           c[j*ldc+i] = *alpha * a[i*lda+j];
         }
       }
     } else {
       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[i*ldb+j];
+           c[j*ldc+i] = *alpha * a[i*lda+j] + *beta * b[i*ldb+j];
         }
       }
@ -371,132 +370,129 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
 }
-void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
+void gpu_sgeam(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float* alpha,
-               const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
+               const float* a, const int64_t lda, const float* beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
-  if (handle == NULL) {
+  assert (handle != NULL);
    perror("NULL handle");
    exit(-1);
  }
-  if ( (transa == 'N' && transb == 'N') ||
+  if ( (*transa == 'N' && *transb == 'N') ||
-       (transa == 'n' && transb == 'N') ||
+       (*transa == 'n' && *transb == 'N') ||
-       (transa == 'N' && transb == 'n') ||
+       (*transa == 'N' && *transb == 'n') ||
-       (transa == 'n' && transb == 'n') ) {
+       (*transa == 'n' && *transb == 'n') ) {
-     if (alpha == 0.) {
+     if (*alpha == 0.) {
       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = beta * b[j*ldb+i];
+           c[j*ldc+i] = *beta * b[j*ldb+i];
         }
       }
-     } else if (beta == 0.) {
+     } else if (*beta == 0.) {
       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = alpha * a[j*lda+i];
+           c[j*ldc+i] = *alpha * a[j*lda+i];
         }
       }
     } else {
       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[j*ldb+i];
+           c[j*ldc+i] = *alpha * a[j*lda+i] + *beta * b[j*ldb+i];
         }
       }
     }
-  } else if ( (transa == 'N' && transb == 'T') ||
+  } else if ( (*transa == 'N' && *transb == 'T') ||
-              (transa == 'n' && transb == 'T') ||
+              (*transa == 'n' && *transb == 'T') ||
-              (transa == 'N' && transb == 't') ||
+              (*transa == 'N' && *transb == 't') ||
-              (transa == 'n' && transb == 't') ) {
+              (*transa == 'n' && *transb == 't') ) {
-     if (alpha == 0.) {
+     if (*alpha == 0.) {
       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = beta * b[i*ldb+j];
+           c[j*ldc+i] = *beta * b[i*ldb+j];
         }
       }
-     } else if (beta == 0.) {
+     } else if (*beta == 0.) {
       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = alpha * a[j*lda+i];
+           c[j*ldc+i] = *alpha * a[j*lda+i];
         }
       }
     } else {
       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[i*ldb+j];
+           c[j*ldc+i] = *alpha * a[j*lda+i] + *beta * b[i*ldb+j];
         }
       }
     }
-  } else if ( (transa == 'T' && transb == 'N') ||
+  } else if ( (*transa == 'T' && *transb == 'N') ||
-              (transa == 't' && transb == 'N') ||
+              (*transa == 't' && *transb == 'N') ||
-              (transa == 'T' && transb == 'n') ||
+              (*transa == 'T' && *transb == 'n') ||
-              (transa == 't' && transb == 'n') ) {
+              (*transa == 't' && *transb == 'n') ) {
-     if (alpha == 0.) {
+     if (*alpha == 0.) {
       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = beta * b[j*ldb+i];
+           c[j*ldc+i] = *beta * b[j*ldb+i];
         }
       }
-     } else if (beta == 0.) {
+     } else if (*beta == 0.) {
       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = alpha * a[i*lda+j];
+           c[j*ldc+i] = *alpha * a[i*lda+j];
         }
       }
     } else {
       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[j*ldb+i];
+           c[j*ldc+i] = *alpha * a[i*lda+j] + *beta * b[j*ldb+i];
         }
       }
     }
-  } else if ( (transa == 'T' && transb == 'T') ||
+  } else if ( (*transa == 'T' && *transb == 'T') ||
-              (transa == 't' && transb == 'T') ||
+              (*transa == 't' && *transb == 'T') ||
-              (transa == 'T' && transb == 't') ||
+              (*transa == 'T' && *transb == 't') ||
-              (transa == 't' && transb == 't') ) {
+              (*transa == 't' && *transb == 't') ) {
-     if (alpha == 0.) {
+     if (*alpha == 0.) {
       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = beta * b[i*ldb+j];
+           c[j*ldc+i] = *beta * b[i*ldb+j];
         }
       }
-     } else if (beta == 0.) {
+     } else if (*beta == 0.) {
       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = alpha * a[i*lda+j];
+           c[j*ldc+i] = *alpha * a[i*lda+j];
         }
       }
     } else {
       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[i*ldb+j];
+           c[j*ldc+i] = *alpha * a[i*lda+j] + *beta * b[i*ldb+j];
         }
       }
--- a/plugins/local/non_h_ints_mu/total_tc_int.irp.f
+++ b/plugins/local/non_h_ints_mu/total_tc_int.irp.f
@ -288,25 +288,31 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
      !$OMP END DO
      !$OMP END PARALLEL
    else
-      print*, ' ao_integrals_map will be used'
+!      print*, ' ao_integrals_map will be used'
-      PROVIDE ao_integrals_map
+!      PROVIDE ao_integrals_map
      print*,'Cholesky vectors will be used '
      double precision :: get_ao_integ_chol,eri
      eri = get_ao_integ_chol(1,1,1,1) ! FOR OPENMP 
      !$OMP PARALLEL DEFAULT(NONE)                            &
-      !$OMP SHARED(ao_num, ao_two_e_tc_tot, ao_integrals_map) &
+!!!    !$OMP SHARED(ao_num, ao_two_e_tc_tot, ao_integrals_map) &
-      !$OMP PRIVATE(i, j, k, l)
+      !$OMP SHARED(ao_num, ao_two_e_tc_tot) &
      !$OMP PRIVATE(i, j, k, l,eri)
      !$OMP DO COLLAPSE(3)
      do j = 1, ao_num
        do l = 1, ao_num
          do i = 1, ao_num
            do k = 1, ao_num
              !                                                     < 1:i, 2:j | 1:k, 2:l > 
-              ao_two_e_tc_tot(k,i,l,j) = ao_two_e_tc_tot(k,i,l,j) + get_ao_two_e_integral(i, j, k, l, ao_integrals_map)
+!              eri =  get_ao_two_e_integral(i, j, k, l, ao_integrals_map)
               eri = get_ao_integ_chol(i,k,j,l)
              ao_two_e_tc_tot(k,i,l,j) = ao_two_e_tc_tot(k,i,l,j) + eri
            enddo
          enddo
        enddo
      enddo
      !$OMP END DO
      !$OMP END PARALLEL
-      FREE ao_integrals_map
+!      FREE ao_integrals_map
    endif
    if((tc_integ_type .eq. "numeric") .and. (.not. tc_save_mem)) then
--- a/plugins/local/slater_tc/slater_tc_opt.irp.f
+++ b/plugins/local/slater_tc/slater_tc_opt.irp.f
@ -10,8 +10,6 @@ subroutine provide_all_three_ints_bi_ortho()
  implicit none
  double precision :: t1, t2
  PROVIDE ao_two_e_integrals_in_map
  print *, ' start provide_all_three_ints_bi_ortho'
  call wall_time(t1)
--- a/plugins/local/slater_tc/tc_hmat.irp.f
+++ b/plugins/local/slater_tc/tc_hmat.irp.f
@ -30,7 +30,9 @@ BEGIN_PROVIDER [double precision, htilde_matrix_elmt_bi_ortho, (N_det,N_det)]
  print *, ' PROVIDING htilde_matrix_elmt_bi_ortho ...'
  call wall_time(t1)
  if(three_body_h_tc)then
   call provide_all_three_ints_bi_ortho()
  endif
  i = 1
  j = 1
--- a/plugins/local/spher_harm/spher_harm.irp.f
+++ b/plugins/local/spher_harm/spher_harm.irp.f
@ -1,7 +1,7 @@
 program spher_harm
 implicit none
-! call test_spher_harm
+ call test_spher_harm
 ! call test_cart
- call test_brutal_spheric
+! call test_brutal_spheric
 end
--- a/plugins/local/spher_harm/spher_harm_func.irp.f
+++ b/plugins/local/spher_harm/spher_harm_func.irp.f
@ -7,6 +7,7 @@ subroutine spher_harm_func_r3(r,l,m,re_ylm, im_ylm)
 double precision :: theta, phi,r_abs
 call cartesian_to_spherical(r,theta,phi,r_abs)
 call spher_harm_func(l,m,theta,phi,re_ylm, im_ylm)
 ! call spher_harm_func_expl(l,m,theta,phi,re_ylm, im_ylm)
 end
@ -131,6 +132,10 @@ subroutine spher_harm_func_expl(l,m,theta,phi,re_ylm, im_ylm)
  tmp = - inv_sq_pi * dsqrt(3.d0/8.d0) * dsin(theta) 
  re_ylm = tmp * dcos(phi)
  im_ylm = tmp * dsin(phi)
 else if (l==1.and.m==-1)then
  tmp = - inv_sq_pi * dsqrt(3.d0/8.d0) * dsin(theta) 
  re_ylm = tmp * dcos(phi)
  im_ylm = -tmp * dsin(phi)
 else if(l==1.and.m==0)then
  tmp = inv_sq_pi * dsqrt(3.d0/4.d0) * dcos(theta) 
  re_ylm = tmp 
@ -139,10 +144,18 @@ subroutine spher_harm_func_expl(l,m,theta,phi,re_ylm, im_ylm)
  tmp = 0.25d0 * inv_sq_pi * dsqrt(0.5d0*15.d0) * dsin(theta)*dsin(theta)
  re_ylm = tmp * dcos(2.d0*phi)
  im_ylm = tmp * dsin(2.d0*phi)
 else if(l==2.and.m==-2)then
  tmp = 0.25d0 * inv_sq_pi * dsqrt(0.5d0*15.d0) * dsin(theta)*dsin(theta)
  re_ylm = tmp * dcos(2.d0*phi)
  im_ylm =-tmp * dsin(2.d0*phi)
 else if(l==2.and.m==1)then
  tmp = - inv_sq_pi * dsqrt(15.d0/8.d0) * dsin(theta) * dcos(theta)
  re_ylm = tmp * dcos(phi)
  im_ylm = tmp * dsin(phi)
 else if(l==2.and.m==-1)then
  tmp = - inv_sq_pi * dsqrt(15.d0/8.d0) * dsin(theta) * dcos(theta)
  re_ylm = tmp * dcos(phi)
  im_ylm =-tmp * dsin(phi)
 else if(l==2.and.m==0)then
  tmp = dsqrt(5.d0/4.d0) * inv_sq_pi* (1.5d0*dcos(theta)*dcos(theta)-0.5d0)
  re_ylm = tmp
--- a/plugins/local/tc_int/NEED
+++ b/plugins/local/tc_int/NEED
@ -1,3 +1,4 @@
 gpu
 tc_keywords
 jastrow
 qmckl
--- a/plugins/local/tc_int/compute_tc_int.irp.f
+++ b/plugins/local/tc_int/compute_tc_int.irp.f
@ -2,7 +2,7 @@
 ! ---
 subroutine provide_int2_grad1_u12_ao()
-
+  use gpu
  BEGIN_DOC
  !
  ! int2_grad1_u12_ao(i,j,ipoint,1) = \int dr2         [\grad1 u(r1,r2)]_x1 \chi_i(r2) \chi_j(r2)
@ -35,8 +35,9 @@ subroutine provide_int2_grad1_u12_ao()
  double precision              :: weight1, ao_k_r, ao_i_r
  double precision              :: der_envsq_x, der_envsq_y, der_envsq_z, lap_envsq
  double precision              :: time0, time1, time2, tc1, tc2, tc
-  double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:), tc_int_2e_ao(:,:,:,:)
+  type(gpu_double4)             :: int2_grad1_u12_ao
-  double precision, allocatable :: tmp(:,:,:), c_mat(:,:,:), tmp_grad1_u12(:,:,:)
+  type(gpu_double3)             :: tmp_grad1_u12, tmp_grad1_u12p, tmp
  double precision, allocatable :: c_mat(:,:,:), tc_int_2e_ao(:,:,:,:)
  double precision, external    :: get_ao_two_e_integral
@ -51,6 +52,7 @@ subroutine provide_int2_grad1_u12_ao()
  call total_memory(mem)
  mem      = max(1.d0, qp_max_mem - mem)
  mem = 6
  n_double = mem * 1.d8
  n_blocks = int(min(n_double / (n_points_extra_final_grid * 4.d0), 1.d0*n_points_final_grid))
  n_rest   = int(mod(n_points_final_grid, n_blocks))
@ -64,9 +66,9 @@ subroutine provide_int2_grad1_u12_ao()
  ! ---
  ! ---
-  allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,4))
+  call gpu_allocate(int2_grad1_u12_ao, ao_num,ao_num,n_points_final_grid,4)
-  allocate(tmp(n_points_extra_final_grid,ao_num,ao_num))
+  call gpu_allocate(tmp,n_points_extra_final_grid,ao_num,ao_num)
  !$OMP PARALLEL               &
  !$OMP DEFAULT (NONE)         &
  !$OMP PRIVATE (j, i, jpoint) &
@ -75,21 +77,28 @@ subroutine provide_int2_grad1_u12_ao()
  do j = 1, ao_num
    do i = 1, ao_num
      do jpoint = 1, n_points_extra_final_grid
-        tmp(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j)
+        tmp%f(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j)
      enddo
    enddo
  enddo
  !$OMP END DO
  !$OMP END PARALLEL
-  allocate(tmp_grad1_u12(n_points_extra_final_grid,n_blocks,4))
+  call gpu_allocate(tmp_grad1_u12,n_points_extra_final_grid,n_blocks,4)
  call gpu_allocate(tmp_grad1_u12p,n_points_extra_final_grid,n_blocks,4)
  tc = 0.d0
  type(gpu_stream) :: stream(4)
  do i=1,4
    call gpu_stream_create(stream(i))
  enddo
  do i_pass = 1, n_pass
    ii = (i_pass-1)*n_blocks + 1
    call wall_time(tc1)
    !$OMP PARALLEL                   &
    !$OMP DEFAULT (NONE)             &
    !$OMP PRIVATE (i_blocks, ipoint) &
@ -97,27 +106,26 @@ subroutine provide_int2_grad1_u12_ao()
    !$OMP DO
    do i_blocks = 1, n_blocks
      ipoint = ii - 1 + i_blocks ! r1
-      call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_blocks,1), tmp_grad1_u12(1,i_blocks,2), tmp_grad1_u12(1,i_blocks,3), tmp_grad1_u12(1,i_blocks,4))
+      call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12%f(1,i_blocks,1), tmp_grad1_u12%f(1,i_blocks,2), &
        tmp_grad1_u12%f(1,i_blocks,3), tmp_grad1_u12%f(1,i_blocks,4))
    enddo
    !$OMP END DO
    !$OMP END PARALLEL
    call wall_time(tc2)
    tc = tc + tc2 - tc1
    call gpu_synchronize()
    call gpu_copy(tmp_grad1_u12,tmp_grad1_u12p)
    do m = 1, 4
-      call dgemm( "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0                     &
+      call gpu_set_stream(blas_handle, stream(m))
-                , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid &
+      call gpu_dgemm(blas_handle, "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0                     &
-                , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) 
+                , tmp%f(1,1,1), n_points_extra_final_grid, tmp_grad1_u12p%f(1,1,m), n_points_extra_final_grid &
                , 0.d0, int2_grad1_u12_ao%f(1,1,ii,m), ao_num*ao_num)
    enddo
  enddo
  deallocate(tmp_grad1_u12)
  if(n_rest .gt. 0) then
    allocate(tmp_grad1_u12(n_points_extra_final_grid,n_rest,4))
    ii = n_pass*n_blocks + 1
    call wall_time(tc1)
@ -128,7 +136,8 @@ subroutine provide_int2_grad1_u12_ao()
    !$OMP DO
    do i_rest = 1, n_rest
      ipoint = ii - 1 + i_rest ! r1
-      call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_rest,1), tmp_grad1_u12(1,i_rest,2), tmp_grad1_u12(1,i_rest,3), tmp_grad1_u12(1,i_rest,4))
+      call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12%f(1,i_rest,1), tmp_grad1_u12%f(1,i_rest,2), &
        tmp_grad1_u12%f(1,i_rest,3), tmp_grad1_u12%f(1,i_rest,4))
    enddo
    !$OMP END DO
    !$OMP END PARALLEL
@ -136,15 +145,23 @@ subroutine provide_int2_grad1_u12_ao()
    tc = tc + tc2 - tc1
    do m = 1, 4
-      call dgemm( "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0                       &
+      call gpu_set_stream(blas_handle, stream(m))
-                , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid &
+      call gpu_dgemm(blas_handle, "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0                       &
-                , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) 
+                , tmp%f(1,1,1), n_points_extra_final_grid, tmp_grad1_u12%f(1,1,m), n_points_extra_final_grid &
                , 0.d0, int2_grad1_u12_ao%f(1,1,ii,m), ao_num*ao_num)
    enddo
    deallocate(tmp_grad1_u12)
  endif
  call gpu_synchronize()
  call gpu_deallocate(tmp_grad1_u12)
  call gpu_deallocate(tmp_grad1_u12p)
-  deallocate(tmp)
+  do i=1,4
    call gpu_stream_destroy(stream(i))
  enddo
  call gpu_deallocate(tmp)
  call wall_time(time1)
@ -152,6 +169,8 @@ subroutine provide_int2_grad1_u12_ao()
  print*, ' wall time Jastrow derivatives   (min) = ', tc / 60.d0
  call print_memory_usage()
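 !TODO: remove this temporary stop; it ends the program here, so the code below (tc_int_2e_ao construction and file output) never runs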
 stop
  ! ---
  ! ---
  ! ---
@ -177,7 +196,7 @@ subroutine provide_int2_grad1_u12_ao()
  !$OMP END DO
  !$OMP END PARALLEL
  call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0            &
-            , int2_grad1_u12_ao(1,1,1,4), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
+            , int2_grad1_u12_ao%f(1,1,1,4), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
            , 0.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num)
  deallocate(c_mat)
@ -213,7 +232,7 @@ subroutine provide_int2_grad1_u12_ao()
    !$OMP END PARALLEL
    call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, -1.d0           &
-              , int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
+              , int2_grad1_u12_ao%f(1,1,1,m), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
              , 1.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num)
  enddo
  deallocate(c_mat)
@ -263,7 +282,7 @@ subroutine provide_int2_grad1_u12_ao()
  print*, ' Writing int2_grad1_u12_ao in ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao'
  open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="write")
  call ezfio_set_work_empty(.False.)
-    write(11) int2_grad1_u12_ao(:,:,:,1:3)
+    write(11) int2_grad1_u12_ao%f(:,:,:,1:3)
  close(11)
  print*, ' Saving tc_int_2e_ao in ', trim(ezfio_filename) // '/work/ao_two_e_tc_tot'
@ -276,7 +295,7 @@ subroutine provide_int2_grad1_u12_ao()
  ! ----
-  deallocate(int2_grad1_u12_ao)
+  call gpu_deallocate(int2_grad1_u12_ao)
  deallocate(tc_int_2e_ao)
  call wall_time(time2)
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@ -1,3 +1,15 @@
 double precision function get_ao_integ_chol(i,j,k,l)
 implicit none
  BEGIN_DOC
  !  CHOLESKY representation of the integral of the AO basis <ik|jl> or (ij|kl)
  !     i(r1) j(r1) 1/r12 k(r2) l(r2)
  !
  !  Evaluated as the dot product, over the first index (length cholesky_ao_num),
  !  of cholesky_ao_transp(:,i,j) and cholesky_ao_transp(:,k,l):
  !  i,j select the first vector and k,l select the second.
  END_DOC
 integer, intent(in) :: i,j,k,l
 ! ddot is resolved externally; NOTE(review): presumably the BLAS dot product -- confirm
 double precision, external :: ddot                                                                                  
 ! Stride 1 on both operands: the summed index is the first dimension of cholesky_ao_transp
 get_ao_integ_chol = ddot(cholesky_ao_num, cholesky_ao_transp(1,i,j), 1, cholesky_ao_transp(1,k,l), 1)
 end
 BEGIN_PROVIDER [ double precision, cholesky_ao_transp, (cholesky_ao_num, ao_num, ao_num) ]
 implicit none
 BEGIN_DOC
@ -25,7 +37,10 @@ END_PROVIDER
   ! Last dimension of cholesky_ao is cholesky_ao_num
   !
   ! https://mogp-emulator.readthedocs.io/en/latest/methods/proc/ProcPivotedCholesky.html
   !
   ! https://doi.org/10.1016/j.apnum.2011.10.001 : Page 4, Algorithm 1
   !
   ! https://www.diva-portal.org/smash/get/diva2:396223/FULLTEXT01.pdf
   END_DOC
   integer*8                      :: ndim8
@ -155,11 +170,15 @@ END_PROVIDER
         Lset(np8) = p8
       endif
     enddo
-     np = np8
+     if (np8 > ndim8) stop 'np>ndim8'
     np = int(np8,4)
     if (np <= 0) stop 'np<=0'
     if (np > ndim8) stop 'np>ndim8'
     rank_max = np
     ! Avoid too large arrays when there are many electrons
     if (elec_num > 10) then
       rank_max = min(np,20*elec_num*elec_num)
     endif
     call mmap(trim(ezfio_work_dir)//'cholesky_ao_tmp', (/ ndim8, rank_max /), 8, fd(1), .False., .True., c_pointer(1))
     call c_f_pointer(c_pointer(1), L, (/ ndim8, rank_max /))
@ -428,7 +447,7 @@ END_PROVIDER
           Lset(np8) = p8
         endif
       enddo
-       np = np8
+       np = int(np8,4)
     enddo
--- a/src/casscf_cipsi/EZFIO.cfg
+++ b/src/casscf_cipsi/EZFIO.cfg
@ -79,3 +79,9 @@ type: logical
 doc: If |true|, the pt2_max value in the CIPSI is set to 10-10 and will not change
 interface: ezfio,provider,ocaml
 default: False
 [act_mos_opt]
 type: logical
 doc: If |true|, the active orbitals are also optimized variationally
 interface: ezfio,provider,ocaml
 default: False
--- a/src/casscf_cipsi/NEED
+++ b/src/casscf_cipsi/NEED
@ -3,3 +3,4 @@ selectors_full
 generators_cas
 two_body_rdm
 dav_general_mat
 mo_optimization_utils
--- a/src/casscf_cipsi/bielec.irp.f
+++ b/src/casscf_cipsi/bielec.irp.f
@ -1,18 +1,25 @@
-BEGIN_PROVIDER [real*8, bielec_PQxx, (mo_num, mo_num,n_core_inact_act_orb,n_core_inact_act_orb)]
+BEGIN_PROVIDER [real*8, bielec_PQxx_array, (mo_num, mo_num,n_core_inact_act_orb,n_core_inact_act_orb)]
  BEGIN_DOC
-  ! bielec_PQxx : integral (pq|xx) with p,q arbitrary, x core or active
+  ! WARNING !!! Old version !!! NOT USED ANYMORE IN THE PROGRAM !!! TOO BIG TO BE STORED ON LARGE SYSTEMS !!! 
  ! 
  ! Replaced by the Cholesky-based function bielec_PQxx
  !
  ! bielec_PQxx_array : integral (pq|xx) with p,q arbitrary, x core or active
  ! indices are unshifted orbital numbers
  END_DOC
  implicit none
  integer                        :: i,j,ii,jj,p,q,i3,j3,t3,v3
  real*8                         :: mo_two_e_integral
  print*,''
  print*,'Providing bielec_PQxx_array, WARNING IT CAN BE A VERY BIG ARRAY WHEN MO_NUM IS LARGE !!!'
  print*,''
-  bielec_PQxx(:,:,:,:) = 0.d0
+  bielec_PQxx_array(:,:,:,:) = 0.d0
  PROVIDE mo_two_e_integrals_in_map
  !$OMP PARALLEL DEFAULT(NONE) &
  !$OMP PRIVATE(i,ii,j,jj,i3,j3) &
-  !$OMP SHARED(n_core_inact_orb,list_core_inact,mo_num,bielec_PQxx, &
+  !$OMP SHARED(n_core_inact_orb,list_core_inact,mo_num,bielec_PQxx_array, &
  !$OMP  n_act_orb,mo_integrals_map,list_act)
  !$OMP DO
@ -20,14 +27,14 @@ BEGIN_PROVIDER [real*8, bielec_PQxx, (mo_num, mo_num,n_core_inact_act_orb,n_core
    ii=list_core_inact(i)
    do j=i,n_core_inact_orb
      jj=list_core_inact(j)
-      call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx(1,1,i,j),mo_integrals_map)
+      call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx_array(1,1,i,j),mo_integrals_map)
-      bielec_PQxx(:,:,j,i)=bielec_PQxx(:,:,i,j)
+      bielec_PQxx_array(:,:,j,i)=bielec_PQxx_array(:,:,i,j)
    end do
    do j=1,n_act_orb
      jj=list_act(j)
      j3=j+n_core_inact_orb
-      call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx(1,1,i,j3),mo_integrals_map)
+      call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx_array(1,1,i,j3),mo_integrals_map)
-      bielec_PQxx(:,:,j3,i)=bielec_PQxx(:,:,i,j3)
+      bielec_PQxx_array(:,:,j3,i)=bielec_PQxx_array(:,:,i,j3)
    end do
  end do
  !$OMP END DO
@ -40,8 +47,8 @@ BEGIN_PROVIDER [real*8, bielec_PQxx, (mo_num, mo_num,n_core_inact_act_orb,n_core
    do j=i,n_act_orb
      jj=list_act(j)
      j3=j+n_core_inact_orb
-      call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx(1,1,i3,j3),mo_integrals_map)
+      call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx_array(1,1,i3,j3),mo_integrals_map)
-      bielec_PQxx(:,:,j3,i3)=bielec_PQxx(:,:,i3,j3)
+      bielec_PQxx_array(:,:,j3,i3)=bielec_PQxx_array(:,:,i3,j3)
    end do
  end do
  !$OMP END DO
@ -52,9 +59,13 @@ END_PROVIDER
-BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_act_orb, mo_num)]
+BEGIN_PROVIDER [real*8, bielec_PxxQ_array, (mo_num,n_core_inact_act_orb,n_core_inact_act_orb, mo_num)]
  BEGIN_DOC
-  ! bielec_PxxQ : integral (px|xq) with p,q arbitrary, x core or active
+  ! WARNING !!! Old version !!! NOT USED ANYMORE IN THE PROGRAM !!! TOO BIG TO BE STORED ON LARGE SYSTEMS !!! 
  ! 
  ! Replaced by the Cholesky-based function bielec_PxxQ
  !
  ! bielec_PxxQ_array : integral (px|xq) with p,q arbitrary, x core or active
  ! indices are unshifted orbital numbers
  END_DOC
  implicit none
@ -62,12 +73,15 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_a
  double precision, allocatable  :: integrals_array(:,:)
  real*8                         :: mo_two_e_integral
  print*,''
  print*,'Providing bielec_PxxQ_array, WARNING IT CAN BE A VERY BIG ARRAY WHEN MO_NUM IS LARGE !!!'
  print*,''
  PROVIDE mo_two_e_integrals_in_map
-  bielec_PxxQ = 0.d0
+  bielec_PxxQ_array = 0.d0
  !$OMP PARALLEL DEFAULT(NONE) &
  !$OMP PRIVATE(i,ii,j,jj,i3,j3,integrals_array) &
-  !$OMP SHARED(n_core_inact_orb,list_core_inact,mo_num,bielec_PxxQ, &
+  !$OMP SHARED(n_core_inact_orb,list_core_inact,mo_num,bielec_PxxQ_array, &
  !$OMP  n_act_orb,mo_integrals_map,list_act)
  allocate(integrals_array(mo_num,mo_num))
@ -80,8 +94,8 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_a
      call get_mo_two_e_integrals_ij(ii,jj,mo_num,integrals_array,mo_integrals_map)
      do q=1,mo_num
        do p=1,mo_num
-          bielec_PxxQ(p,i,j,q)=integrals_array(p,q)
+          bielec_PxxQ_array(p,i,j,q)=integrals_array(p,q)
-          bielec_PxxQ(p,j,i,q)=integrals_array(q,p)
+          bielec_PxxQ_array(p,j,i,q)=integrals_array(q,p)
        end do
      end do
    end do
@ -91,8 +105,8 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_a
      call get_mo_two_e_integrals_ij(ii,jj,mo_num,integrals_array,mo_integrals_map)
      do q=1,mo_num
        do p=1,mo_num
-          bielec_PxxQ(p,i,j3,q)=integrals_array(p,q)
+          bielec_PxxQ_array(p,i,j3,q)=integrals_array(p,q)
-          bielec_PxxQ(p,j3,i,q)=integrals_array(q,p)
+          bielec_PxxQ_array(p,j3,i,q)=integrals_array(q,p)
        end do
      end do
    end do
@ -111,8 +125,8 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_a
      call get_mo_two_e_integrals_ij(ii,jj,mo_num,integrals_array,mo_integrals_map)
      do q=1,mo_num
        do p=1,mo_num
-          bielec_PxxQ(p,i3,j3,q)=integrals_array(p,q)
+          bielec_PxxQ_array(p,i3,j3,q)=integrals_array(p,q)
-          bielec_PxxQ(p,j3,i3,q)=integrals_array(q,p)
+          bielec_PxxQ_array(p,j3,i3,q)=integrals_array(q,p)
        end do
      end do
    end do
@ -129,10 +143,15 @@ BEGIN_PROVIDER [real*8, bielecCI, (n_act_orb,n_act_orb,n_act_orb, mo_num)]
  BEGIN_DOC
  ! bielecCI : integrals (tu|vp) with p arbitrary, tuv active
  ! index p runs over the whole basis, t,u,v only over the active orbitals
  ! 
  ! This array can be stored anyway. Ex: 50 active orbitals, 1500 MOs ==> 8 bytes x 50^3 x 1500 = 1.5 GB
  END_DOC
  implicit none
  integer                        :: i,j,k,p,t,u,v
  double precision, external     :: mo_two_e_integral
  double precision :: wall0, wall1 
  call wall_time(wall0)
  print*,'Providing bielecCI'
  PROVIDE mo_two_e_integrals_in_map
  !$OMP PARALLEL DO DEFAULT(NONE) &
@ -151,5 +170,7 @@ BEGIN_PROVIDER [real*8, bielecCI, (n_act_orb,n_act_orb,n_act_orb, mo_num)]
    end do
  end do
  !$OMP END PARALLEL DO
  call wall_time(wall1)
  print*,'Time to provide bielecCI = ',wall1 - wall0
 END_PROVIDER
--- a/src/casscf_cipsi/bielec_natorb.irp.f
+++ b/src/casscf_cipsi/bielec_natorb.irp.f
@ -1,30 +1,38 @@
- BEGIN_PROVIDER [real*8, bielec_PQxx_no, (mo_num, mo_num,n_core_inact_act_orb,n_core_inact_act_orb)]
+ BEGIN_PROVIDER [real*8, bielec_PQxx_no_array, (mo_num, mo_num,n_core_inact_act_orb,n_core_inact_act_orb)]
  BEGIN_DOC
  ! WARNING !!! Old version !!! NOT USED ANYMORE IN THE PROGRAM !!! TOO BIG TO BE STORED ON LARGE SYSTEMS !!! 
  ! 
  ! Replaced by the Cholesky-based function bielec_PQxx_no
  !
  ! integral (pq|xx) in the basis of natural MOs
  ! indices are unshifted orbital numbers
  ! 
  END_DOC
  implicit none
  integer                        :: i,j,k,l,t,u,p,q
  double precision, allocatable  :: f(:,:,:), d(:,:,:)
  print*,''
  print*,'Providing bielec_PQxx_no_array, WARNING IT CAN BE A VERY BIG ARRAY WHEN MO_NUM IS LARGE !!!'
  print*,''
  !$OMP PARALLEL DEFAULT(NONE) &
  !$OMP PRIVATE(j,k,l,p,d,f) &
  !$OMP SHARED(n_core_inact_act_orb,mo_num,n_act_orb,n_core_inact_orb, &
-  !$OMP   bielec_PQxx_no,bielec_PQxx,list_act,natorbsCI)
+  !$OMP   bielec_PQxx_no_array,bielec_PQxx_array,list_act,natorbsCI)
  allocate (f(n_act_orb,mo_num,n_core_inact_act_orb), &
      d(n_act_orb,mo_num,n_core_inact_act_orb))
  !$OMP DO
  do l=1,n_core_inact_act_orb
-    bielec_PQxx_no(:,:,:,l) = bielec_PQxx(:,:,:,l)
+    bielec_PQxx_no_array(:,:,:,l) = bielec_PQxx_array(:,:,:,l)
    do k=1,n_core_inact_act_orb
      do j=1,mo_num
        do p=1,n_act_orb
-          f(p,j,k)=bielec_PQxx_no(list_act(p),j,k,l)
+          f(p,j,k)=bielec_PQxx_no_array(list_act(p),j,k,l)
        end do
      end do
    end do
@ -36,13 +44,13 @@
    do k=1,n_core_inact_act_orb
      do j=1,mo_num
        do p=1,n_act_orb
-          bielec_PQxx_no(list_act(p),j,k,l)=d(p,j,k)
+          bielec_PQxx_no_array(list_act(p),j,k,l)=d(p,j,k)
        end do
      end do
      do j=1,mo_num
        do p=1,n_act_orb
-          f(p,j,k)=bielec_PQxx_no(j,list_act(p),k,l)
+          f(p,j,k)=bielec_PQxx_no_array(j,list_act(p),k,l)
        end do
      end do
    end do
@ -54,7 +62,7 @@
    do k=1,n_core_inact_act_orb
      do p=1,n_act_orb
        do j=1,mo_num
-          bielec_PQxx_no(j,list_act(p),k,l)=d(p,j,k)
+          bielec_PQxx_no_array(j,list_act(p),k,l)=d(p,j,k)
        end do
      end do
    end do
@ -71,7 +79,7 @@
    do p=1,n_act_orb
      do k=1,mo_num
        do j=1,mo_num
-          f(j,k,p) = bielec_PQxx_no(j,k,n_core_inact_orb+p,l)
+          f(j,k,p) = bielec_PQxx_no_array(j,k,n_core_inact_orb+p,l)
        end do
      end do
    end do
@ -83,7 +91,7 @@
    do p=1,n_act_orb
      do k=1,mo_num
        do j=1,mo_num
-          bielec_PQxx_no(j,k,n_core_inact_orb+p,l)=d(j,k,p)
+          bielec_PQxx_no_array(j,k,n_core_inact_orb+p,l)=d(j,k,p)
        end do
      end do
    end do
@ -97,7 +105,7 @@
    do p=1,n_act_orb
      do k=1,mo_num
        do j=1,mo_num
-          f(j,k,p) = bielec_PQxx_no(j,k,l,n_core_inact_orb+p)
+          f(j,k,p) = bielec_PQxx_no_array(j,k,l,n_core_inact_orb+p)
        end do
      end do
    end do
@ -109,7 +117,7 @@
    do p=1,n_act_orb
      do k=1,mo_num
        do j=1,mo_num
-          bielec_PQxx_no(j,k,l,n_core_inact_orb+p)=d(j,k,p)
+          bielec_PQxx_no_array(j,k,l,n_core_inact_orb+p)=d(j,k,p)
        end do
      end do
    end do
@ -123,8 +131,12 @@ END_PROVIDER
-BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inact_act_orb, mo_num)]
+BEGIN_PROVIDER [real*8, bielec_PxxQ_no_array, (mo_num,n_core_inact_act_orb,n_core_inact_act_orb, mo_num)]
  BEGIN_DOC
  ! WARNING !!! Old version !!! NOT USED ANYMORE IN THE PROGRAM !!! TOO BIG TO BE STORED ON LARGE SYSTEMS !!! 
  ! 
  ! Replaced by the Cholesky-based function bielec_PxxQ_no
  !
  ! integral (px|xq) in the basis of natural MOs
  ! indices are unshifted orbital numbers
  END_DOC
@ -132,10 +144,14 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
  integer                        :: i,j,k,l,t,u,p,q
  double precision, allocatable  :: f(:,:,:), d(:,:,:)
  print*,''
  print*,'Providing bielec_PxxQ_no_array, WARNING IT CAN BE A VERY BIG ARRAY WHEN MO_NUM IS LARGE !!!'
  print*,''
  !$OMP PARALLEL DEFAULT(NONE) &
  !$OMP PRIVATE(j,k,l,p,d,f) &
  !$OMP SHARED(n_core_inact_act_orb,mo_num,n_act_orb,n_core_inact_orb, &
-  !$OMP   bielec_PxxQ_no,bielec_PxxQ,list_act,natorbsCI)
+  !$OMP   bielec_PxxQ_no_array,bielec_PxxQ_array,list_act,natorbsCI)
  allocate (f(n_act_orb,n_core_inact_act_orb,n_core_inact_act_orb), &
@ -143,11 +159,11 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
  !$OMP DO
  do j=1,mo_num
-    bielec_PxxQ_no(:,:,:,j) = bielec_PxxQ(:,:,:,j)
+    bielec_PxxQ_no_array(:,:,:,j) = bielec_PxxQ_array(:,:,:,j)
    do l=1,n_core_inact_act_orb
      do k=1,n_core_inact_act_orb
        do p=1,n_act_orb
-            f(p,k,l) = bielec_PxxQ_no(list_act(p),k,l,j)
+            f(p,k,l) = bielec_PxxQ_no_array(list_act(p),k,l,j)
        end do
      end do
    end do
@ -159,7 +175,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
    do l=1,n_core_inact_act_orb
      do k=1,n_core_inact_act_orb
        do p=1,n_act_orb
-          bielec_PxxQ_no(list_act(p),k,l,j)=d(p,k,l)
+          bielec_PxxQ_no_array(list_act(p),k,l,j)=d(p,k,l)
        end do
      end do
    end do
@ -176,7 +192,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
    do l=1,n_core_inact_act_orb
      do j=1,mo_num
        do p=1,n_act_orb
-          f(p,j,l) = bielec_PxxQ_no(j,n_core_inact_orb+p,l,k)
+          f(p,j,l) = bielec_PxxQ_no_array(j,n_core_inact_orb+p,l,k)
        end do
      end do
    end do
@ -188,7 +204,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
    do l=1,n_core_inact_act_orb
      do j=1,mo_num
        do p=1,n_act_orb
-          bielec_PxxQ_no(j,n_core_inact_orb+p,l,k)=d(p,j,l)
+          bielec_PxxQ_no_array(j,n_core_inact_orb+p,l,k)=d(p,j,l)
        end do
      end do
    end do
@ -205,7 +221,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
    do p=1,n_act_orb
      do l=1,n_core_inact_act_orb
        do j=1,mo_num
-          f(j,l,p) = bielec_PxxQ_no(j,l,n_core_inact_orb+p,k)
+          f(j,l,p) = bielec_PxxQ_no_array(j,l,n_core_inact_orb+p,k)
        end do
      end do
    end do
@ -217,7 +233,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
    do p=1,n_act_orb
      do l=1,n_core_inact_act_orb
        do j=1,mo_num
-          bielec_PxxQ_no(j,l,n_core_inact_orb+p,k)=d(j,l,p)
+          bielec_PxxQ_no_array(j,l,n_core_inact_orb+p,k)=d(j,l,p)
        end do
      end do
    end do
@ -231,7 +247,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
    do p=1,n_act_orb
      do k=1,n_core_inact_act_orb
        do j=1,mo_num
-          f(j,k,p) = bielec_PxxQ_no(j,k,l,n_core_inact_orb+p)
+          f(j,k,p) = bielec_PxxQ_no_array(j,k,l,n_core_inact_orb+p)
        end do
      end do
    end do
@ -243,7 +259,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
    do p=1,n_act_orb
      do k=1,n_core_inact_act_orb
        do j=1,mo_num
-          bielec_PxxQ_no(j,k,l,n_core_inact_orb+p)=d(j,k,p)
+          bielec_PxxQ_no_array(j,k,l,n_core_inact_orb+p)=d(j,k,p)
        end do
      end do
    end do
@ -259,11 +275,17 @@ BEGIN_PROVIDER [real*8, bielecCI_no, (n_act_orb,n_act_orb,n_act_orb, mo_num)]
  BEGIN_DOC
  ! integrals (tu|vp) in the basis of natural MOs
  ! index p runs over the whole basis, t,u,v only over the active orbitals
  ! 
  ! This array can be stored anyway. Ex: 50 active orbitals, 1500 MOs ==> 8x50^3x1500 = 1.5 Gb
  END_DOC
  implicit none
  integer                        :: i,j,k,l,t,u,p,q
  double precision, allocatable  :: f(:,:,:), d(:,:,:)
  double precision :: wall0, wall1
  call wall_time(wall0)
  print*,'Providing bielecCI_no'
  !$OMP PARALLEL DEFAULT(NONE) &
  !$OMP PRIVATE(j,k,l,p,d,f) &
  !$OMP SHARED(n_core_inact_act_orb,mo_num,n_act_orb,n_core_inact_orb, &
@ -363,6 +385,8 @@ BEGIN_PROVIDER [real*8, bielecCI_no, (n_act_orb,n_act_orb,n_act_orb, mo_num)]
  deallocate(d,f)
  !$OMP END PARALLEL
  call wall_time(wall1)
  print*,'Time to provide bielecCI_no = ',wall1-wall0
 END_PROVIDER
--- a/src/casscf_cipsi/casscf.irp.f
+++ b/src/casscf_cipsi/casscf.irp.f
@ -11,7 +11,7 @@ program casscf
  if(small_active_space)then
   pt2_relative_error = 0.00001
  else
-   thresh_scf = 1.d-4
+   thresh_scf = max(1.d-4,thresh_scf)
   pt2_relative_error = 0.04
  endif
  touch pt2_relative_error 
@ -46,6 +46,11 @@ subroutine run
  do while (.not.converged)
    print*,'pt2_max = ',pt2_max
    call run_stochastic_cipsi(Ev,PT2)
 !    if(act_mos_opt)then DOES NOT WORK
 !     call run_orb_opt_trust_v2
 !     call run_stochastic_cipsi(Ev,PT2)
 !    endif
    if(.True.)then
     print*,'Ev,PT2',Ev(1),PT2(1)
     E_PT2(1:N_states) = Ev(1:N_states) + PT2(1:N_states)
     energy_old = energy
@ -55,9 +60,9 @@ subroutine run
     call write_time(6)
     call write_int(6,iteration,'CAS-SCF iteration = ')
     call write_double(6,energy,'State-average CAS-SCF energy = ')
-!    if(n_states == 1)then
+!!    if(n_states == 1)then
-!     call ezfio_get_casscf_cipsi_energy_pt2(E_PT2)
+!!     call ezfio_get_casscf_cipsi_energy_pt2(E_PT2)
-!     call ezfio_get_casscf_cipsi_energy(PT2)
+!!     call ezfio_get_casscf_cipsi_energy(PT2)
      double precision :: delta_E_istate, e_av
      e_av = 0.d0
      do istate=1,N_states
@ -68,12 +73,12 @@ subroutine run
       endif
       write(*,'(A6,I2,A18,F16.10)')'state ',istate,' E + PT2 energy = ',E_PT2(istate)
       write(*,'(A6,I2,A18,F16.10)')'state ',istate,'     PT2 energy = ',PT2(istate)
-!      call write_double(6,E_PT2(istate),'E + PT2 energy = ')
+!!      call write_double(6,E_PT2(istate),'E + PT2 energy = ')
-!      call write_double(6,PT2(istate),'  PT2          = ')
+!!      call write_double(6,PT2(istate),'  PT2          = ')
      enddo
      call write_double(6,e_av,'State-average CAS-SCF energy bis = ')
      call write_double(6,pt2_max,' PT2_MAX       = ')
-!    endif
+!!    endif
     print*,''
     call write_double(6,norm_grad_vec2,'Norm of gradients = ')
@ -132,8 +137,10 @@ subroutine run
       soft_touch state_following_casscf
      endif
     endif
    endif
  enddo
  if(.True.)then
     integer :: i
    print*,'Converged CASSCF '
    print*,'--------------------------'
@ -153,6 +160,7 @@ subroutine run
 !   write(*,*)mcscf_fock_alpha_mo(i,i)
  enddo
 endif
 end
--- a/src/casscf_cipsi/chol_bielec.irp.f
+++ b/src/casscf_cipsi/chol_bielec.irp.f
@ -0,0 +1,248 @@
 BEGIN_PROVIDER [double precision, cholesky_no_1_idx_transp, (cholesky_mo_num, n_act_orb, mo_num)]
 BEGIN_DOC
 ! Cholesky vectors in which ONE orbital index is expressed on the active
 ! natural orbital basis.
 !
 ! For each MO k_mo, the columns of cholesky_mo_transp attached to the active
 ! orbitals (list_act) are gathered and multiplied on the right by natorbsCI.
 END_DOC
 implicit none
 integer :: k_mo, k_act
 double precision, allocatable :: act_slice(:,:)
 double precision :: t_start, t_end
 call wall_time(t_start)
 print*,'Providing cholesky_no_1_idx_transp'
 allocate(act_slice(cholesky_mo_num,n_act_orb))
 cholesky_no_1_idx_transp = 0.D0
 do k_mo = 1, mo_num
  ! Gather the active-orbital columns associated with MO "k_mo"
  do k_act = 1, n_act_orb
   act_slice(1:cholesky_mo_num, k_act) = cholesky_mo_transp(1:cholesky_mo_num, list_act(k_act), k_mo)
  enddo
  ! act_slice (cholesky_mo_num x n_act_orb) times natorbsCI (n_act_orb x n_act_orb)
  call dgemm('N','N',cholesky_mo_num,n_act_orb,n_act_orb,1.d0,  &
        act_slice, size(act_slice,1),                            &
        natorbsCI, size(natorbsCI,1),                            &
        0.d0,                                                    &
        cholesky_no_1_idx_transp(1,1,k_mo), size(cholesky_no_1_idx_transp,1))
 enddo
 call wall_time(t_end)
 print*,'Time to provide cholesky_no_1_idx_transp = ', t_end - t_start
 END_PROVIDER 
 BEGIN_PROVIDER [double precision, cholesky_no_2_idx_transp, (cholesky_mo_num, n_act_orb, n_act_orb)]
 BEGIN_DOC
 ! Cholesky vectors with TWO orbitals on the active natural orbital basis 
 !
 ! Built from cholesky_no_1_idx_transp: for each already-transformed active
 ! index i_act, the columns on the active MOs (list_act) are gathered and
 ! multiplied on the right by natorbsCI.
 END_DOC
 implicit none
 integer :: i_chol,i_act,j_act,jj_act
 double precision, allocatable :: chol_tmp(:,:)
 double precision :: wall0,wall1
 call wall_time(wall0)
 print*,'Providing cholesky_no_2_idx_transp'
 ! Only one scratch buffer is needed for the gather + dgemm
 allocate(chol_tmp(cholesky_mo_num,n_act_orb))
 cholesky_no_2_idx_transp = 0.D0
 do i_act = 1, n_act_orb
  ! Get all the integrals corresponding to the "i_act" natural orbital
  do j_act = 1, n_act_orb
   jj_act = list_act(j_act)
   do i_chol = 1, cholesky_mo_num
    chol_tmp(i_chol, j_act) = cholesky_no_1_idx_transp(i_chol, i_act, jj_act)
   enddo
  enddo
  ! chol_tmp (cholesky_mo_num x n_act_orb) times natorbsCI (n_act_orb x n_act_orb)
  call dgemm('N','N',cholesky_mo_num,n_act_orb,n_act_orb,1.d0,  &
        chol_tmp, size(chol_tmp,1),                              &
        natorbsCI, size(natorbsCI,1),                            &
        0.d0,                                                    &
        cholesky_no_2_idx_transp(1,1,i_act), size(cholesky_no_2_idx_transp,1))
 enddo
 deallocate(chol_tmp)
 call wall_time(wall1)
 print*,'Time to provide  cholesky_no_2_idx_transp = ', wall1 - wall0
 END_PROVIDER 
 BEGIN_PROVIDER [ double precision, cholesky_no_total_transp, (cholesky_mo_num, mo_num, mo_num)]
 implicit none
 BEGIN_DOC
 ! Cholesky vectors defined on all basis including the NO basis
 !
 ! Both orbital indices are full MO indices taken from list_core_inact,
 ! list_act and list_virt. The blocks are filled as follows:
 !
 ! * no active index    : copied from cholesky_mo_transp
 ! * one active index   : copied from cholesky_no_1_idx_transp (both index orders)
 ! * two active indices : copied from cholesky_no_2_idx_transp
 END_DOC
 integer :: i_chol, i_act, ii_act, j_act, jj_act, i_core_inact, j_core_inact, ii_core_inact, jj_core_inact
 integer :: i_virt, ii_virt, j_virt, jj_virt
 double precision :: wall0,wall1
 call wall_time(wall0)
 print*,'Providing cholesky_no_total_transp '
 ! Block when two orbitals belong to the core/inact 
 do j_core_inact = 1, n_core_inact_orb
  jj_core_inact = list_core_inact(j_core_inact)
  do i_core_inact = 1, n_core_inact_orb
   ii_core_inact = list_core_inact(i_core_inact)
   do i_chol = 1, cholesky_mo_num
    cholesky_no_total_transp(i_chol, ii_core_inact, jj_core_inact) = cholesky_mo_transp(i_chol,ii_core_inact,jj_core_inact)
   enddo
  enddo
 enddo
 ! Block when one orbital belongs to the core/inact and one belongs to the active.
 ! The core/inact position is addressed with the full MO index jj_core_inact,
 ! i.e. the same index that is used to read cholesky_no_1_idx_transp.
 do j_core_inact = 1, n_core_inact_orb
  jj_core_inact = list_core_inact(j_core_inact)
  do i_act = 1, n_act_orb
   ii_act = list_act(i_act)
   do i_chol = 1, cholesky_mo_num 
    cholesky_no_total_transp(i_chol,ii_act,jj_core_inact) = cholesky_no_1_idx_transp(i_chol,i_act,jj_core_inact)
    cholesky_no_total_transp(i_chol,jj_core_inact,ii_act) = cholesky_no_1_idx_transp(i_chol,i_act,jj_core_inact)
   enddo
  enddo
 enddo
 ! Block when two orbitals belong to the active 
 do j_act = 1, n_act_orb
  jj_act = list_act(j_act)
  do i_act = 1, n_act_orb
   ii_act = list_act(i_act)
   do i_chol = 1, cholesky_mo_num 
    cholesky_no_total_transp(i_chol,ii_act,jj_act) = cholesky_no_2_idx_transp(i_chol,i_act,j_act)
   enddo
  enddo
 enddo
 ! Block when two orbitals belong to the virtuals 
 do i_virt = 1, n_virt_orb
  ii_virt = list_virt(i_virt)
  do j_virt = 1, n_virt_orb
   jj_virt = list_virt(j_virt)
   do i_chol = 1, cholesky_mo_num
    cholesky_no_total_transp(i_chol,jj_virt,ii_virt) = cholesky_mo_transp(i_chol,jj_virt,ii_virt)
   enddo
  enddo
 enddo
 ! Block when one orbital is in active and the other in the virtuals (both index orders)
 do i_virt = 1, n_virt_orb
  ii_virt = list_virt(i_virt)
  do i_act = 1, n_act_orb
   ii_act = list_act(i_act)
   do i_chol = 1, cholesky_mo_num
    cholesky_no_total_transp(i_chol,ii_act,ii_virt) = cholesky_no_1_idx_transp(i_chol, i_act,ii_virt)
    cholesky_no_total_transp(i_chol,ii_virt,ii_act) = cholesky_no_1_idx_transp(i_chol, i_act,ii_virt)
   enddo
  enddo
 enddo
 ! Block when one orbital is in the virtual and one in the core-inact (both index orders)
 do i_virt = 1, n_virt_orb
  ii_virt = list_virt(i_virt)
  do i_core_inact = 1, n_core_inact_orb
   ii_core_inact = list_core_inact(i_core_inact)
   do i_chol = 1, cholesky_mo_num
    cholesky_no_total_transp(i_chol, ii_core_inact, ii_virt) = cholesky_mo_transp(i_chol, ii_core_inact, ii_virt)
    cholesky_no_total_transp(i_chol, ii_virt, ii_core_inact) = cholesky_mo_transp(i_chol, ii_virt, ii_core_inact)
   enddo
  enddo
 enddo
 call wall_time(wall1)
 print*,'Time to provide cholesky_no_total_transp = ', wall1 - wall0
 END_PROVIDER 
 double precision function bielec_no_basis(i_1,j_1,i_2,j_2)
 implicit none
  BEGIN_DOC
  ! integral (i_1 j_1|i_2 j_2) in the mixed basis of both MOs and natural MOs
  !
  ! Evaluated as the sum over the Cholesky index k of
  ! cholesky_no_total_transp(k,i_1,j_1) * cholesky_no_total_transp(k,i_2,j_2)
  END_DOC
 integer, intent(in) :: i_1,j_1,i_2,j_2
 integer :: k_chol
 double precision :: accu
 accu = 0.d0
 do k_chol = 1, cholesky_mo_num
  accu = accu + cholesky_no_total_transp(k_chol,i_1,j_1) * cholesky_no_total_transp(k_chol,i_2,j_2)
 enddo
 bielec_no_basis = accu
 end
 double precision function bielec_PQxx_no(i_mo, j_mo, i_ca, j_ca)
 implicit none
 BEGIN_DOC
 ! Integral (i_mo j_mo| i_ca j_ca) obtained from bielec_no_basis, i.e. from the
 ! Cholesky vectors with the active orbitals on the NO basis.
 !
 ! i_mo, j_mo are passed unchanged to bielec_no_basis;
 ! i_ca, j_ca are in [1:n_core_inact_act_orb] and are converted to MO indices
 ! through list_core_inact_act.
 END_DOC
 integer, intent(in) :: i_ca, j_ca, i_mo, j_mo
 double precision :: bielec_no_basis
 bielec_PQxx_no = bielec_no_basis(i_mo, j_mo, list_core_inact_act(i_ca), list_core_inact_act(j_ca))
 end
 double precision function bielec_PxxQ_no(i_mo, j_ca, i_ca, j_mo)
 implicit none 
  BEGIN_DOC
  ! Integral (i_mo j_ca |i_ca j_mo) obtained from bielec_no_basis, i.e. from the
  ! Cholesky vectors with the active orbitals on the NO basis.
  !
  ! i_mo, j_mo are passed unchanged to bielec_no_basis;
  ! i_ca, j_ca are in [1:n_core_inact_act_orb] and are converted to MO indices
  ! through list_core_inact_act.
  END_DOC
 integer, intent(in) :: i_ca, j_ca, i_mo, j_mo
 double precision :: bielec_no_basis
 bielec_PxxQ_no = bielec_no_basis(i_mo, list_core_inact_act(j_ca), list_core_inact_act(i_ca), j_mo)
 end
 double precision function bielec_PQxx(i_mo, j_mo, i_ca, j_ca)
  BEGIN_DOC
  ! Integral (i_mo j_mo |i_ca j_ca), returned as
  ! mo_two_e_integral(i_mo, ii_ca, j_mo, jj_ca).
  !
  ! i_mo, j_mo are passed unchanged to mo_two_e_integral;
  ! i_ca, j_ca are in [1:n_core_inact_act_orb] and are converted to MO indices
  ! (ii_ca, jj_ca) through list_core_inact_act.
  !
  ! NOTE(review): whether a Cholesky decomposition is actually used depends on
  ! how mo_two_e_integral is implemented -- confirm.
  END_DOC
 implicit none 
 integer, intent(in) :: i_ca, j_ca, j_mo, i_mo
 double precision :: mo_two_e_integral
 bielec_PQxx = mo_two_e_integral(i_mo, list_core_inact_act(i_ca), j_mo, list_core_inact_act(j_ca))
 end
 double precision function bielec_PxxQ(i_mo, i_ca, j_ca, j_mo)
  BEGIN_DOC
  ! Integral (i_mo i_ca |j_ca j_mo), returned as
  ! mo_two_e_integral(i_mo, jj_ca, ii_ca, j_mo)
  ! (same argument convention as the call made in bielec_PQxx).
  !
  ! i_mo, j_mo are passed unchanged to mo_two_e_integral;
  ! i_ca, j_ca are in [1:n_core_inact_act_orb] and are converted to MO indices
  ! (ii_ca, jj_ca) through list_core_inact_act.
  !
  ! NOTE(review): whether a Cholesky decomposition is actually used depends on
  ! how mo_two_e_integral is implemented -- confirm.
  END_DOC
 implicit none
 integer, intent(in) :: i_ca, j_ca, j_mo, i_mo
 double precision :: mo_two_e_integral
 bielec_PxxQ = mo_two_e_integral(i_mo, list_core_inact_act(j_ca), list_core_inact_act(i_ca), j_mo)
 end
--- a/src/casscf_cipsi/chol_garb.irp.f
+++ b/src/casscf_cipsi/chol_garb.irp.f
@ -0,0 +1,34 @@
 !!!!! FUNCTIONS THAT WORK BUT WHICH ARE USELESS AS THE ARRAYS CAN ALWAYS BE STORED
 !double precision function bielecCI_chol(i_a, j_a, k_a, i_mo)
 !  BEGIN_DOC
 !  ! function that computes (i_a j_a |k_a j_mo) with Cholesky decomposition 
 !  ! 
 !  ! where i_a, j_a, k_a are in [1:n_act_orb] !!! ONLY ON ACTIVE 
 !  END_DOC
 ! implicit none
 ! integer, intent(in) :: i_a, j_a, k_a, i_mo
 ! integer :: ii_a, jj_a, kk_a
 ! double precision :: mo_two_e_integral
 ! ii_a = list_act(i_a)
 ! jj_a = list_act(j_a)
 ! kk_a = list_act(k_a)
 ! bielecCI_chol = mo_two_e_integral(ii_a,kk_a,jj_a,i_mo)
 !end
 !double precision function bielecCI_no_chol(i_ca, j_ca, k_ca, i_mo)
 !  BEGIN_DOC
 !  ! function that computes (i_ca j_ca |k_ca j_mo) with Cholesky decomposition on the NO basis for active orbitals 
 !  ! 
 !  ! where i_ca, j_ca, k_ca are in [1:n_act_orb] (they are used as indices of list_act below)
 !  END_DOC
 ! implicit none 
 ! integer, intent(in) :: i_ca, j_ca, k_ca, i_mo
 ! integer :: ii_ca, jj_ca, kk_ca
 ! double precision :: bielec_no_basis_chol
 ! ii_ca = list_act(i_ca)
 ! jj_ca = list_act(j_ca)
 ! kk_ca = list_act(k_ca)
 ! bielecCI_no_chol = bielec_no_basis_chol(ii_ca, jj_ca, kk_ca, i_mo)
 !
 !end
--- a/src/casscf_cipsi/gradient.irp.f
+++ b/src/casscf_cipsi/gradient.irp.f
@ -157,6 +157,7 @@ real*8 function gradvec_it(i,t)
  integer                        :: ii,tt,v,vv,x,y
  integer                        :: x3,y3
  double precision :: bielec_PQxx_no
  ii=list_core_inact(i)
  tt=list_act(t)
--- a/src/casscf_cipsi/hessian.irp.f
+++ b/src/casscf_cipsi/hessian.irp.f
@ -10,6 +10,7 @@ real*8 function hessmat_itju(i,t,j,u)
  implicit none
  integer                        :: i,t,j,u,ii,tt,uu,v,vv,x,xx,y,jj
  real*8                         :: term,t2
  double precision :: bielec_pqxx_no,bielec_pxxq_no
  ii=list_core_inact(i)
  tt=list_act(t)
@ -95,6 +96,7 @@ real*8 function hessmat_itja(i,t,j,a)
  implicit none
  integer                        :: i,t,j,a,ii,tt,jj,aa,v,vv,x,y
  real*8                         :: term
  double precision :: bielec_pqxx_no,bielec_pxxq_no
  ! it/ja
  ii=list_core_inact(i)
@ -128,6 +130,7 @@ real*8 function hessmat_itua(i,t,u,a)
  implicit none
  integer                        :: i,t,u,a,ii,tt,uu,aa,v,vv,x,xx,u3,t3,v3
  real*8                         :: term
  double precision :: bielec_pqxx_no,bielec_pxxq_no
  ii=list_core_inact(i)
  tt=list_act(t)
@ -169,6 +172,7 @@ real*8 function hessmat_iajb(i,a,j,b)
  implicit none
  integer                        :: i,a,j,b,ii,aa,jj,bb
  real*8                         :: term
  double precision :: bielec_pqxx_no,bielec_pxxq_no
  ii=list_core_inact(i)
  aa=list_virt(a)
@ -205,6 +209,7 @@ real*8 function hessmat_iatb(i,a,t,b)
  implicit none
  integer                        :: i,a,t,b,ii,aa,tt,bb,v,vv,x,y,v3,t3
  real*8                         :: term
  double precision :: bielec_pqxx_no,bielec_pxxq_no
  ii=list_core_inact(i)
  aa=list_virt(a)
@ -237,6 +242,7 @@ real*8 function hessmat_taub(t,a,u,b)
  integer                        :: t,a,u,b,tt,aa,uu,bb,v,vv,x,xx,y
  integer                        :: v3,x3
  real*8                         :: term,t1,t2,t3
  double precision :: bielec_pqxx_no,bielec_pxxq_no
  tt=list_act(t)
  aa=list_virt(a)
--- a/src/casscf_cipsi/mcscf_fock.irp.f
+++ b/src/casscf_cipsi/mcscf_fock.irp.f
@ -4,6 +4,7 @@ BEGIN_PROVIDER [real*8, Fipq, (mo_num,mo_num) ]
   END_DOC
   implicit none
   integer                        :: p,q,k,kk,t,tt,u,uu
   double precision :: bielec_pxxq_no, bielec_pqxx_no
   do q=1,mo_num
     do p=1,mo_num
@ -44,6 +45,7 @@ BEGIN_PROVIDER [real*8, Fapq, (mo_num,mo_num) ]
   END_DOC
   implicit none
   integer                        :: p,q,k,kk,t,tt,u,uu
   double precision :: bielec_pxxq_no, bielec_pqxx_no
   Fapq = 0.d0
--- a/src/casscf_cipsi/test_chol.irp.f
+++ b/src/casscf_cipsi/test_chol.irp.f
@ -0,0 +1,116 @@
 program test_chol
 ! Driver for the consistency checks between the stored integral arrays and
 ! the function-based integrals defined in this module.
 ! All checks are currently disabled (commented out below).
 implicit none
 ! Force read_wf to .True. and signal the change to IRPF90 with "touch"
 read_wf= .True.
 touch read_wf 
 ! Disabled checks. NOTE(review): some of these names/argument lists do not
 ! match the routines defined in this file -- adapt before re-enabling.
 ! call routine_bielec_PxxQ_no
 ! call routine_bielecCI_no
 ! call test_bielec_PxxQ_chol
 ! call test_bielecCI
 end
 subroutine routine_bielec_PQxx_no
 ! Compares every element of the stored array bielec_PQxx_no_array with the
 ! value returned by the function bielec_PQxx_no.
 ! Prints (exact, new, |exact-new|) for the elements with |exact| > 1.d-10,
 ! then the mean absolute error over all elements.
 implicit none
 integer :: i_core_inact, j_core_inact, i_mo, j_mo
 double precision :: exact, new, error, accu
 double precision :: bielec_PQxx_no
 accu = 0.d0
 do i_core_inact = 1, n_core_inact_act_orb
  do j_core_inact = 1, n_core_inact_act_orb
   do i_mo = 1, mo_num
    do j_mo = 1, mo_num
     exact = bielec_PQxx_no_array(j_mo,i_mo, j_core_inact, i_core_inact) 
     new   = bielec_PQxx_no(j_mo,i_mo, j_core_inact, i_core_inact) 
     error = dabs(exact-new)
     if(dabs(exact).gt.1.d-10)then
      print*,exact,new,error
     endif
     accu += error
    enddo
   enddo
  enddo
 enddo
 ! The number of elements is formed in double precision: the integer product
 ! mo_num*mo_num*n_core_inact_act_orb**2 can exceed the default integer range
 print*,'accu = ',accu/(dble(mo_num)*dble(mo_num)*dble(n_core_inact_act_orb)**2)
 end
 subroutine routine_bielec_PxxQ_no_array
 ! Compares every element of the stored array bielec_PxxQ_no_array with the
 ! value returned by the function bielec_PxxQ_no.
 ! Prints (exact, new, |exact-new|) for the elements with |exact| > 1.d-10,
 ! then the mean absolute error over all elements.
 implicit none
 integer :: i_core_inact, j_core_inact, i_mo, j_mo
 double precision :: exact, new, error, accu
 double precision :: bielec_PxxQ_no
 accu = 0.d0
 do i_mo = 1, mo_num
  do i_core_inact = 1, n_core_inact_act_orb
   do j_core_inact = 1, n_core_inact_act_orb
    do j_mo = 1, mo_num
     exact = bielec_PxxQ_no_array(j_mo, j_core_inact,  i_core_inact,i_mo) 
     new   = bielec_PxxQ_no(j_mo, j_core_inact,  i_core_inact,i_mo) 
     error = dabs(exact-new)
     accu += error
     if(dabs(exact).gt.1.d-10)then
      print*,exact,new,error
     endif
    enddo
   enddo
  enddo
 enddo
 ! The number of elements is formed in double precision: the integer product
 ! mo_num*mo_num*n_core_inact_act_orb**2 can exceed the default integer range
 print*,'accu = ',accu/(dble(mo_num)*dble(mo_num)*dble(n_core_inact_act_orb)**2)
 end
 subroutine test_bielec_PQxx(i_mo, j_mo, i_ca, j_ca)
 ! Compares every element of the stored array bielec_PQxx_array with the
 ! value returned by the function bielec_PQxx.
 ! Prints (exact, new, |exact-new|) for the elements with |exact| > 1.d-10,
 ! then the mean absolute error over all elements.
 !
 ! NOTE: the four dummy arguments are only used as loop counters; their input
 ! values are ignored and they are overwritten, so the caller must pass
 ! variables (not constants).
 implicit none
 integer :: i_mo, j_mo, i_ca, j_ca 
 double precision :: exact, new, error, accu
 double precision :: bielec_PQxx
 accu = 0.d0
 do j_ca = 1, n_core_inact_act_orb
  do i_ca = 1, n_core_inact_act_orb
   do j_mo = 1, mo_num
    do i_mo = 1, mo_num
     exact = bielec_PQxx_array(i_mo, j_mo, i_ca, j_ca)
     new   = bielec_PQxx(i_mo, j_mo, i_ca, j_ca)
     error = dabs(exact-new)
     accu += error
     if(dabs(exact).gt.1.d-10)then
      print*,exact,new,error
     endif
    enddo
   enddo
  enddo
 enddo
 ! The number of elements is formed in double precision: the integer product
 ! mo_num*mo_num*n_core_inact_act_orb**2 can exceed the default integer range
 print*,'accu = ',accu/(dble(mo_num)*dble(mo_num)*dble(n_core_inact_act_orb)**2)
 end
 subroutine test_bielec_PxxQ_chol(i_mo, i_ca, j_ca, j_mo)
 ! Compares every element of the stored array bielec_PxxQ_array with the
 ! value returned by the function bielec_PxxQ.
 ! Prints (exact, new, |exact-new|) for the elements with |exact| > 1.d-10,
 ! then the mean absolute error over all elements.
 !
 ! NOTE: the four dummy arguments are only used as loop counters; their input
 ! values are ignored and they are overwritten, so the caller must pass
 ! variables (not constants).
 implicit none
 integer :: i_mo, i_ca, j_ca, j_mo
 double precision :: exact, new, error, accu
 double precision :: bielec_PxxQ
 accu = 0.d0
 do j_mo = 1, mo_num
  do j_ca = 1, n_core_inact_act_orb
   do i_ca =1, n_core_inact_act_orb
    do i_mo = 1, mo_num
     exact = bielec_PxxQ_array(i_mo, i_ca, j_ca, j_mo)
     new   = bielec_PxxQ(i_mo, i_ca, j_ca, j_mo)
     error = dabs(exact-new)
     accu += error
     if(dabs(exact).gt.1.d-10)then
      print*,exact,new,error
     endif
    enddo
   enddo
  enddo
 enddo
 ! The number of elements is formed in double precision: the integer product
 ! mo_num*mo_num*n_core_inact_act_orb**2 can exceed the default integer range
 print*,'accu = ',accu/(dble(mo_num)*dble(mo_num)*dble(n_core_inact_act_orb)**2)
 end
--- a/src/casscf_cipsi/tot_en.irp.f
+++ b/src/casscf_cipsi/tot_en.irp.f
@ -8,6 +8,7 @@
   implicit none
   integer                        :: t,u,v,x,i,ii,tt,uu,vv,xx,j,jj,t3,u3,v3,x3
   real*8                         :: e_one_all,e_two_all
   double precision :: bielec_PQxx,bielec_PxxQ
   e_one_all=0.D0
   e_two_all=0.D0
   do i=1,n_core_inact_orb
--- a/src/ccsd/NEED
+++ b/src/ccsd/NEED
@ -1,2 +1,3 @@
 gpu
 hartree_fock
 utils_cc
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@ -1,4 +1,5 @@
 subroutine run_ccsd_space_orb
  use gpu
  implicit none
@ -9,9 +10,19 @@ subroutine run_ccsd_space_orb
  double precision :: uncorr_energy,energy, max_elem, max_r, max_r1, max_r2,ta,tb
  logical :: not_converged
-  double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:), tau(:,:,:,:), tau_x(:,:,:,:)
+  type(gpu_double4) :: t2, r2, tau, tau_x
-  double precision, allocatable :: t1(:,:), r1(:,:)
+  type(gpu_double2) :: t1, r1
-  double precision, allocatable :: H_oo(:,:), H_vv(:,:), H_vo(:,:)
+  type(gpu_double2) :: H_oo, H_vv, H_vo
  type(gpu_double2) :: d_cc_space_f_oo, d_cc_space_f_vo
  type(gpu_double2) :: d_cc_space_f_ov, d_cc_space_f_vv
  type(gpu_double3) :: d_cc_space_v_oo_chol, d_cc_space_v_vo_chol
  type(gpu_double3) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol
  type(gpu_double4) :: d_cc_space_v_oovv, d_cc_space_v_voov, d_cc_space_v_ovov
  type(gpu_double4) :: d_cc_space_v_oovo, d_cc_space_v_vooo, d_cc_space_v_oooo
  type(gpu_double4) :: d_cc_space_v_vvoo, d_cc_space_v_ovvo, d_cc_space_v_ovoo
  double precision, allocatable :: all_err(:,:), all_t(:,:)
  integer, allocatable          :: list_occ(:), list_vir(:)
@ -20,7 +31,7 @@ subroutine run_ccsd_space_orb
  call set_multiple_levels_omp(.False.)
-  if (do_ao_cholesky) then
+  if (do_mo_cholesky) then
    PROVIDE cholesky_mo_transp
    FREE cholesky_ao
  else
@ -51,11 +62,77 @@ subroutine run_ccsd_space_orb
  !print*,'occ',list_occ
  !print*,'vir',list_vir
-  allocate(t2(nO,nO,nV,nV), r2(nO,nO,nV,nV))
+  ! GPU arrays
-  allocate(tau(nO,nO,nV,nV))
+  call gpu_allocate(d_cc_space_f_oo, nO, nO)
-  allocate(tau_x(nO,nO,nV,nV))
+  call gpu_allocate(d_cc_space_f_vo, nV, nO)
-  allocate(t1(nO,nV), r1(nO,nV))
+  call gpu_allocate(d_cc_space_f_ov, nO, nV)
-  allocate(H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO))
+  call gpu_allocate(d_cc_space_f_vv, nV, nV)
  call gpu_upload(cc_space_f_oo, d_cc_space_f_oo)
  call gpu_upload(cc_space_f_vo, d_cc_space_f_vo)
  call gpu_upload(cc_space_f_ov, d_cc_space_f_ov)
  call gpu_upload(cc_space_f_vv, d_cc_space_f_vv)
 !  FREE cc_space_f_oo
 !  FREE cc_space_f_vo
 !  FREE cc_space_f_vv
  if (do_mo_cholesky) then
    call gpu_allocate(d_cc_space_v_oo_chol, cholesky_mo_num, nO, nO)
    call gpu_allocate(d_cc_space_v_ov_chol, cholesky_mo_num, nO, nV)
    call gpu_allocate(d_cc_space_v_vo_chol, cholesky_mo_num, nV, nO)
    call gpu_allocate(d_cc_space_v_vv_chol, cholesky_mo_num, nV, nV)
    call gpu_upload(cc_space_v_oo_chol, d_cc_space_v_oo_chol)
    call gpu_upload(cc_space_v_ov_chol, d_cc_space_v_ov_chol)
    call gpu_upload(cc_space_v_vo_chol, d_cc_space_v_vo_chol)
    call gpu_upload(cc_space_v_vv_chol, d_cc_space_v_vv_chol)
 !    FREE cc_space_v_oo_chol
 !    FREE cc_space_v_ov_chol
 !    FREE cc_space_v_vo_chol
 !    FREE cc_space_v_vv_chol
  endif
  call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV)
  call gpu_allocate(d_cc_space_v_voov, nV, nO, nO, nV)
  call gpu_allocate(d_cc_space_v_ovov, nO, nV, nO, nV)
  call gpu_allocate(d_cc_space_v_oovo, nO, nO, nV, nO)
  call gpu_allocate(d_cc_space_v_ovvo, nO, nV, nV, nO)
  call gpu_allocate(d_cc_space_v_vooo, nV, nO, nO, nO)
  call gpu_allocate(d_cc_space_v_oooo, nO, nO, nO, nO)
  call gpu_allocate(d_cc_space_v_vvoo, nV, nV, nO, nO)
  call gpu_allocate(d_cc_space_v_ovoo, nO, nV, nO, nO)
  call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv)
  call gpu_upload(cc_space_v_voov, d_cc_space_v_voov)
  call gpu_upload(cc_space_v_ovov, d_cc_space_v_ovov)
  call gpu_upload(cc_space_v_oovo, d_cc_space_v_oovo)
  call gpu_upload(cc_space_v_ovvo, d_cc_space_v_ovvo)
  call gpu_upload(cc_space_v_vooo, d_cc_space_v_vooo)
  call gpu_upload(cc_space_v_oooo, d_cc_space_v_oooo)
  call gpu_upload(cc_space_v_vvoo, d_cc_space_v_vvoo)
  call gpu_upload(cc_space_v_ovoo, d_cc_space_v_ovoo)
 !  FREE cc_space_v_voov
 !  FREE cc_space_v_ovov
 !  FREE cc_space_v_oovo
 !  FREE cc_space_v_oovv
 !  FREE cc_space_v_vooo
 !  FREE cc_space_v_oooo
 !  FREE cc_space_v_vvoo
 !  FREE cc_space_v_ovvo
 !  FREE cc_space_v_ovoo
  call gpu_allocate(t2, nO,nO,nV,nV)
  call gpu_allocate(r2, nO,nO,nV,nV)
  call gpu_allocate(tau, nO,nO,nV,nV)
  call gpu_allocate(tau_x, nO,nO,nV,nV)
  call gpu_allocate(t1, nO,nV)
  call gpu_allocate(r1, nO,nV)
  call gpu_allocate(H_oo, nO, nO)
  call gpu_allocate(H_vo, nV, nO)
  call gpu_allocate(H_vv, nV, nV)
  if (cc_update_method == 'diis') then
    double precision :: rss, diis_mem, extra_mem
@ -97,14 +174,22 @@ subroutine run_ccsd_space_orb
  endif
  ! Init
-  call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,t1)
+  double precision, allocatable :: h_t1(:,:), h_t2(:,:,:,:)
-  call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,t2)
+  allocate(h_t1(nO,nV), h_t2(nO,nO,nV,nV))
-  call update_tau_space(nO,nV,t1,t2,tau)
+
  call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,h_t1)
  call gpu_upload(h_t1, t1)
  call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,h_t2)
  call gpu_upload(h_t2, t2)
  call update_tau_space(nO,nV,h_t1,t1,t2,tau)
  call update_tau_x_space(nO,nV,tau,tau_x)
  !print*,'hf_energy', hf_energy
  call det_energy(det,uncorr_energy)
  print*,'Det energy', uncorr_energy
-  call ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
+
  call ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy)
  print*,'Guess energy', uncorr_energy+energy, energy
  nb_iter = 0
@ -120,43 +205,45 @@ subroutine run_ccsd_space_orb
  do while (not_converged)
    ! Residue
-    if (do_ao_cholesky) then
+    if (do_mo_cholesky) then
-!    if (.False.) then
+      call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo)
-      call compute_H_oo_chol(nO,nV,tau_x,H_oo)
+      call compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, d_cc_space_v_ov_chol,H_vv)
-      call compute_H_vv_chol(nO,nV,tau_x,H_vv)
+      call compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol, H_vo)
      call compute_H_vo_chol(nO,nV,t1,H_vo)
-      call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
+      call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_space_f_ov,d_cc_space_f_vo, &
-      call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
+           d_cc_space_v_voov, d_cc_space_v_ovov, d_cc_space_v_oovo, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol)
      call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
           d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, d_cc_space_v_oovo, d_cc_space_v_ovvo, d_cc_space_v_ovoo, &
           d_cc_space_v_ovov, d_cc_space_v_vvoo, d_cc_space_v_oo_chol, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, &
           d_cc_space_f_vo, &
           r2, max_r2)
    else
-      call compute_H_oo(nO,nV,t1,t2,tau,H_oo)
+      call compute_H_oo(nO,nV,t1%f,t2%f,tau%f,H_oo%f)
-      call compute_H_vv(nO,nV,t1,t2,tau,H_vv)
+      call compute_H_vv(nO,nV,t1%f,t2%f,tau%f,H_vv%f)
-      call compute_H_vo(nO,nV,t1,t2,H_vo)
+      call compute_H_vo(nO,nV,t1%f,t2%f,H_vo%f)
-      call compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
+      call compute_r1_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r1%f,max_r1)
-      call compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
+      call compute_r2_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r2%f,max_r2)
    endif
    max_r = max(max_r1,max_r2)
    ! Update
    if (cc_update_method == 'diis') then
-      !call update_t_ccsd(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2)
+      call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1%f,r2%f,t1%f,t2%f,all_err,all_t)
      !call update_t_ccsd_diis(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2)
      call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1,r2,t1,t2,all_err,all_t)
    ! Standard update as T = T - Delta
    elseif (cc_update_method == 'none') then
-      call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1,t1)
+      call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1%f,t1%f)
-      call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2,t2)
+      call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2%f,t2%f)
    else
      print*,'Unkown cc_method_method: '//cc_update_method
    endif
-    call update_tau_space(nO,nV,t1,t2,tau)
+    call update_tau_space(nO,nV,t1%f,t1,t2,tau)
    call update_tau_x_space(nO,nV,tau,tau_x)
    ! Energy
-    call ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
+    call ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy)
    write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |'
    nb_iter = nb_iter + 1
@ -181,8 +268,8 @@ subroutine run_ccsd_space_orb
  print*,''
  if (write_amplitudes) then
-    call write_t1(nO,nV,t1)
+    call write_t1(nO,nV,t1%f)
-    call write_t2(nO,nV,t2)
+    call write_t2(nO,nV,t2%f)
    call ezfio_set_utils_cc_io_amplitudes('Read')
  endif
@ -191,7 +278,14 @@ subroutine run_ccsd_space_orb
    deallocate(all_err,all_t)
  endif
-  deallocate(H_vv,H_oo,H_vo,r1,r2,tau)
+  call gpu_deallocate(H_oo)
  call gpu_deallocate(H_vv)
  call gpu_deallocate(H_vo)
  call gpu_deallocate(r1)
  call gpu_deallocate(r2)
  call gpu_deallocate(tau)
  call gpu_deallocate(tau_x)
  ! CCSD(T)
  double precision :: e_t, e_t_err
@ -199,28 +293,14 @@ subroutine run_ccsd_space_orb
  if (cc_par_t .and. elec_alpha_num + elec_beta_num > 2) then
    ! Dumb way
    !call wall_time(ta)
    !call ccsd_par_t_space(nO,nV,t1,t2,e_t)
    !call wall_time(tb)
    !print*,'Time: ',tb-ta, ' s'
    !print*,''
    !write(*,'(A15,F18.12,A3)') ' E(CCSD(T))  = ', uncorr_energy + energy + e_t, ' Ha'
    !write(*,'(A15,F18.12,A3)') ' E(T)        = ', e_t, ' Ha'
    !write(*,'(A15,F18.12,A3)') ' Correlation = ', energy + e_t, ' Ha'
    !print*,''
    ! New
    e_t = uncorr_energy + energy ! For print in (T) call
    e_t_err = 0.d0
    print*,'Computing (T) correction...'
    call wall_time(ta)
 !    call ccsd_par_t_space_v3(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v &
 !         ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t)
-    call ccsd_par_t_space_stoch(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v &
+    call ccsd_par_t_space_stoch(nO,nV,t1%f,t2%f,cc_space_f_o,cc_space_f_v &
         ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t, e_t_err)
    call wall_time(tb)
@ -235,168 +315,161 @@ subroutine run_ccsd_space_orb
  call save_energy(uncorr_energy + energy, e_t)
-  deallocate(t1,t2)
+  deallocate(h_t1, h_t2)
  if (do_mo_cholesky) then
    call gpu_deallocate(d_cc_space_v_oo_chol)
    call gpu_deallocate(d_cc_space_v_ov_chol)
    call gpu_deallocate(d_cc_space_v_vo_chol)
    call gpu_deallocate(d_cc_space_v_vv_chol)
  endif
  call gpu_deallocate(d_cc_space_v_oovv)
  call gpu_deallocate(d_cc_space_v_voov)
  call gpu_deallocate(d_cc_space_v_ovov)
  call gpu_deallocate(d_cc_space_v_oovo)
  call gpu_deallocate(d_cc_space_v_ovvo)
  call gpu_deallocate(d_cc_space_v_vooo)
  call gpu_deallocate(d_cc_space_v_oooo)
  call gpu_deallocate(d_cc_space_v_vvoo)
  call gpu_deallocate(d_cc_space_v_ovoo)
  call gpu_deallocate(d_cc_space_f_oo)
  call gpu_deallocate(d_cc_space_f_vo)
  call gpu_deallocate(d_cc_space_f_ov)
  call gpu_deallocate(d_cc_space_f_vv)
  call gpu_deallocate(t1)
  call gpu_deallocate(t2)
 end
 ! Energy
-subroutine ccsd_energy_space(nO,nV,tau,t1,energy)
+subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy)
-
+  use gpu
  implicit none
  integer, intent(in)            :: nO, nV
-  double precision, intent(in)  :: tau(nO,nO,nV,nV)
+  type(gpu_double4), intent(in)  :: tau_x, d_cc_space_v_oovv
-  double precision, intent(in)  :: t1(nO,nV)
+  type(gpu_double2), intent(in)  :: t1, d_cc_space_f_vo
  double precision, intent(out)  :: energy
  ! internal
  integer :: i,j,a,b
  double precision :: e
-  energy = 0d0
+  type(gpu_stream) :: s1, s2
-  !$omp parallel &
+  call gpu_stream_create(s1)
-  !$omp shared(nO,nV,energy,tau,t1,&
+  call gpu_stream_create(s2)
  !$omp cc_space_f_vo,cc_space_w_oovv) &
  !$omp private(i,j,a,b,e) &
  !$omp default(none)
  e = 0d0
  !$omp do
  do a = 1, nV
    do i = 1, nO
      e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a)
    enddo
  enddo
  !$omp end do nowait
  !$omp do
  do b = 1, nV
    do a = 1, nV
      do j = 1, nO
        do i = 1, nO
          e = e + tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b)
       enddo
      enddo
    enddo
  enddo
  !$omp end do nowait
  !$omp critical
  energy = energy + e
  !$omp end critical
  !$omp end parallel
-end
+  call gpu_set_stream(blas_handle,s1)
  call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo%f(1,1), 1, t1%f(1,1), 1, e)
-subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
+  call gpu_set_stream(blas_handle,s2)
  call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x%f(1,1,1,1), 1_8, d_cc_space_v_oovv%f(1,1,1,1), 1_8, energy)
  call gpu_set_stream(blas_handle,gpu_default_stream)
-  implicit none
+  call gpu_synchronize()
  call gpu_stream_destroy(s1)
  call gpu_stream_destroy(s2)
-  integer, intent(in)           :: nO, nV
+   energy = energy + 2.d0*e
  double precision, intent(in)  :: tau_x(nO,nO,nV,nV)
  double precision, intent(in)  :: t1(nO,nV)
  double precision, intent(out) :: energy
  ! internal
  integer :: i,j,a,b
  double precision :: e
  energy = 0d0
  !$omp parallel &
  !$omp shared(nO,nV,energy,tau_x,t1,&
  !$omp cc_space_f_vo,cc_space_v_oovv) &
  !$omp private(i,j,a,b,e) &
  !$omp default(none)
  e = 0d0
  !$omp do
  do a = 1, nV
    do i = 1, nO
      e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a)
    enddo
  enddo
  !$omp end do nowait
  !$omp do
  do b = 1, nV
    do a = 1, nV
      do j = 1, nO
        do i = 1, nO
          e = e + tau_x(i,j,a,b) * cc_space_v_oovv(i,j,a,b)
       enddo
      enddo
    enddo
  enddo
  !$omp end do nowait
  !$omp critical
  energy = energy + e
  !$omp end critical
  !$omp end parallel
 end
 ! Tau
-subroutine update_tau_space(nO,nV,t1,t2,tau)
+subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau)
-
+  use gpu
  implicit none
  ! in
  integer, intent(in)           :: nO, nV
-  double precision, intent(in)  :: t1(nO,nV), t2(nO,nO,nV,nV)
+  double precision, intent(in)  :: h_t1(nO,nV)
  type(gpu_double2), intent(in) :: t1
  type(gpu_double4), intent(in) :: t2
  ! out
-  double precision, intent(out) :: tau(nO,nO,nV,nV)
+  type(gpu_double4) :: tau
  ! internal
  integer                       :: i,j,a,b
  type(gpu_stream) :: stream(nV)
  !$OMP PARALLEL &
-  !$OMP SHARED(nO,nV,tau,t2,t1) &
+  !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) &
  !$OMP PRIVATE(i,j,a,b) &
  !$OMP DEFAULT(NONE)
  !$OMP DO
-  do b = 1, nV
+  do b=1,nV
-    do a = 1, nV
+    call gpu_stream_create(stream(b))
-      do j = 1, nO
+    call gpu_set_stream(blas_handle,stream(b))
-        do i = 1, nO
+    do j=1,nO
-          tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b)
+      call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, &
-        enddo
+         1.d0, t2%f(1,j,1,b), nO*nO, &
-      enddo
+         h_t1(j,b), t1%f(1,1), nO, &
         tau%f(1,j,1,b), nO*nO)
    enddo
  enddo
  !$OMP END DO
  !$OMP END PARALLEL
  call gpu_synchronize()
  do b=1,nV
    call gpu_stream_destroy(stream(b))
  enddo
  call gpu_set_stream(blas_handle,gpu_default_stream)
 end
 subroutine update_tau_x_space(nO,nV,tau,tau_x)
-
+  use gpu
  implicit none
  ! in
  integer, intent(in)         :: nO, nV
-  double precision, intent(in)  :: tau(nO,nO,nV,nV)
+  type(gpu_double4), intent(in)  :: tau
  ! out
-  double precision, intent(out) :: tau_x(nO,nO,nV,nV)
+  type(gpu_double4) :: tau_x
  ! internal
  integer                       :: i,j,a,b
  type(gpu_stream) :: stream(nV)
  do a=1,nV
    call gpu_stream_create(stream(a))
  enddo
  !$OMP PARALLEL &
-  !$OMP SHARED(nO,nV,tau,tau_x) &
+  !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) &
-  !$OMP PRIVATE(i,j,a,b) &
+  !$OMP PRIVATE(a,b) &
  !$OMP DEFAULT(NONE)
  !$OMP DO
-  do b = 1, nV
+  do b=1,nV
-    do a = 1, nV
+    do a=1,nV
-      do j = 1, nO
+      call gpu_set_stream(blas_handle,stream(a))
-        do i = 1, nO
+      call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, &
-          tau_x(i,j,a,b) = 2.d0*tau(i,j,a,b) - tau(i,j,b,a)
+          2.d0, tau%f(1,1,a,b), nO, &
-        enddo
+         -1.d0, tau%f(1,1,b,a), nO, &
-      enddo
+         tau_x%f(1,1,a,b), nO)
    enddo
  enddo
  !$OMP END DO
  !$OMP END PARALLEL
  call gpu_set_stream(blas_handle,gpu_default_stream)
  call gpu_synchronize()
  do b=1,nV
    call gpu_stream_destroy(stream(b))
  enddo
 end
 ! R1
--- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
--- a/src/gpu/NEED
+++ b/src/gpu/NEED
@ -0,0 +1 @@
 gpu_arch
--- a/src/gpu/README.rst
+++ b/src/gpu/README.rst
@ -0,0 +1,6 @@
 ===
 gpu
 ===
 Bindings for GPU routines (architecture independent).
 Architecture-dependent files are in gpu_arch.
--- a/src/gpu_x86/gpu.h
+++ b/src/gpu_x86/gpu.h
@ -22,20 +22,20 @@ void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_
 void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result);
-void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double alpha,
+void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double* alpha,
-               const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy);
+               const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy);
-void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float alpha,
+void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float* alpha,
-               const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy);
+               const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy);
-void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
+void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha,
-               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc);
+               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc);
-void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
+void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float* alpha,
-               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc);
+               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc);
-void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
+void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double* alpha,
-               const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc);
+               const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc);
-void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
+void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float* alpha,
-               const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc);
+               const float* a, const int64_t lda, const float* beta, const float* b, const int64_t ldb, float* c, const int64_t ldc);
--- a/src/gpu/gpu.irp.f
+++ b/src/gpu/gpu.irp.f
@ -0,0 +1,26 @@
 use gpu
 BEGIN_PROVIDER [ type(gpu_blas), blas_handle ]
 implicit none
 BEGIN_DOC
 ! Handle for cuBLAS or RocBLAS, created by gpu_blas_create.
 ! It is the handle passed as first argument to the gpu_* BLAS wrappers
 ! and to gpu_set_stream.
 END_DOC
 call gpu_blas_create(blas_handle)
 END_PROVIDER
 BEGIN_PROVIDER [ type(gpu_stream), gpu_default_stream ]
 implicit none
 BEGIN_DOC
 ! Default stream: a gpu_stream whose C handle is the null pointer.
 END_DOC
 gpu_default_stream%c = C_NULL_PTR
 END_PROVIDER
 BEGIN_PROVIDER [ integer, gpu_num ]
 implicit none
 BEGIN_DOC
 ! Number of usable GPUs, as returned by gpu_ndevices().
 END_DOC
 gpu_num = gpu_ndevices()
 END_PROVIDER
--- a/src/gpu/gpu_module.F90
+++ b/src/gpu/gpu_module.F90
@ -0,0 +1,707 @@
 module gpu
 ! Fortran bindings to the C GPU layer: device selection, memory management,
 ! streams, BLAS handles and a subset of BLAS (dot, geam, gemv, gemm).
 ! The C entry points are bound by name in the interface block below; the
 ! module procedures wrap them with the gpu_* derived types.
  use, intrinsic :: iso_c_binding
  implicit none
 ! Data types
 ! ----------
 ! gpu_doubleN describes a rank-N double precision array managed by the C layer:
 !   c : raw C pointer returned by gpu_allocate
 !   f : Fortran pointer associated with the same address (set by c_f_pointer),
 !       used for shape queries and to take the address of an element.
  type gpu_double1
    type(c_ptr) :: c
    double precision, pointer :: f(:)
  end type
  type gpu_double2
    type(c_ptr) :: c
    double precision, pointer :: f(:,:)
  end type
  type gpu_double3
    type(c_ptr) :: c
    double precision, pointer :: f(:,:,:)
  end type
  type gpu_double4
    type(c_ptr) :: c
    double precision, pointer :: f(:,:,:,:)
  end type
  type gpu_double5
    type(c_ptr) :: c
    double precision, pointer :: f(:,:,:,:,:)
  end type
  type gpu_double6
    type(c_ptr) :: c
    double precision, pointer :: f(:,:,:,:,:,:)
  end type
 ! Opaque BLAS handle (C pointer owned by the C layer).
  type gpu_blas
    type(c_ptr) :: c
  end type
 ! Opaque stream handle (C pointer owned by the C layer).
  type gpu_stream
    type(c_ptr) :: c
  end type
 ! C interfaces
 ! ------------
 ! Scalars that the C prototypes take by value carry the VALUE attribute;
 ! everything else is passed by reference (i.e. as a C pointer).
  interface
    logical(c_bool) function no_gpu() bind(C)
      import
    end function
    integer(c_int) function gpu_ndevices() bind(C)
      import
    end function
    subroutine gpu_set_device(id) bind(C)
      import
      integer(c_int32_t), value :: id
    end subroutine
    ! n is a size in bytes for allocate/upload/download/copy.
    subroutine gpu_allocate_c(ptr, n) bind(C, name='gpu_allocate')
      import
      type(c_ptr) :: ptr
      integer(c_int64_t), value :: n
    end subroutine
    subroutine gpu_deallocate_c(ptr) bind(C, name='gpu_deallocate')
      import
      type(c_ptr) :: ptr
    end subroutine
    subroutine gpu_upload_c(cpu_ptr, gpu_ptr, n) bind(C, name='gpu_upload')
      import
      type(c_ptr), value :: cpu_ptr
      type(c_ptr), value :: gpu_ptr
      integer(c_int64_t), value :: n
    end subroutine
    subroutine gpu_download_c(gpu_ptr, cpu_ptr, n) bind(C, name='gpu_download')
      import
      type(c_ptr), value :: gpu_ptr
      type(c_ptr), value :: cpu_ptr
      integer(c_int64_t), value :: n
    end subroutine
    subroutine gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, n) bind(C, name='gpu_copy')
      import
      type(c_ptr), value :: gpu_ptr_src
      type(c_ptr), value :: gpu_ptr_dest
      integer(c_int64_t), value :: n
    end subroutine
    subroutine gpu_stream_create_c(stream) bind(C, name='gpu_stream_create')
      import
      type(c_ptr) :: stream
    end subroutine
    subroutine gpu_stream_destroy_c(stream) bind(C, name='gpu_stream_destroy')
      import
      type(c_ptr) :: stream
    end subroutine
    subroutine gpu_set_stream_c(handle, stream) bind(C, name='gpu_set_stream')
      import
      type(c_ptr), value :: handle, stream
    end subroutine
    subroutine gpu_synchronize() bind(C)
      import
    end subroutine
    subroutine gpu_blas_create_c(handle) bind(C, name='gpu_blas_create')
      import
      type(c_ptr) :: handle
    end subroutine
    subroutine gpu_blas_destroy_c(handle) bind(C, name='gpu_blas_destroy')
      import
      type(c_ptr) :: handle
    end subroutine
    subroutine gpu_ddot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_ddot')
      import
      type(c_ptr), value, intent(in) :: handle
      integer(c_int64_t), value      :: n, incx, incy
      type(c_ptr), value             :: dx, dy
      real(c_double), intent(out)    :: res
    end subroutine
    subroutine gpu_sdot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_sdot')
      import
      type(c_ptr), value, intent(in) :: handle
      integer(c_int64_t), value      :: n, incx, incy
      type(c_ptr), intent(in), value :: dx, dy
      real(c_float), intent(out)     :: res
    end subroutine
    subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
      b, ldb, c, ldc) bind(C, name='gpu_dgeam')
      import
      type(c_ptr), value, intent(in)        :: handle
      character(c_char), intent(in), value  :: transa, transb
      integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
      real(c_double), intent(in)            :: alpha, beta
      type(c_ptr), value :: a, b, c
    end subroutine
    subroutine gpu_sgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
      b, ldb, c, ldc) bind(C, name='gpu_sgeam')
      import
      type(c_ptr), value, intent(in)        :: handle
      character(c_char), intent(in), value  :: transa, transb
      integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
      real(c_float), intent(in)             :: alpha, beta
      real(c_float) :: a, b, c
    end subroutine
    ! transa is a `const char` passed by value on the C side, hence VALUE
    ! (same convention as gpu_dgeam_c / gpu_sgemm_c).
    subroutine gpu_dgemv_c(handle, transa, m, n, alpha, a, lda, &
      x, incx, beta, y, incy) bind(C, name='gpu_dgemv')
      import
      type(c_ptr), value, intent(in)        :: handle
      character(c_char), intent(in), value  :: transa
      integer(c_int64_t), intent(in), value :: m, n, lda, incx, incy
      real(c_double), intent(in)            :: alpha, beta
      real(c_double)                        :: a, x, y
    end subroutine
    subroutine gpu_sgemv_c(handle, transa, m, n, alpha, a, lda, &
      x, incx, beta, y, incy) bind(C, name='gpu_sgemv')
      import
      type(c_ptr), value, intent(in)        :: handle
      character(c_char), intent(in), value  :: transa
      integer(c_int64_t), intent(in), value :: m, n, lda, incx, incy
      real(c_float), intent(in)             :: alpha, beta
      real(c_float)                         :: a, x, y
    end subroutine
    ! transa/transb are `const char` passed by value on the C side, hence VALUE.
    subroutine gpu_dgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, &
      b, ldb, beta, c, ldc) bind(C, name='gpu_dgemm')
      import
      type(c_ptr), value, intent(in)        :: handle
      character(c_char), intent(in), value  :: transa, transb
      integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc
      real(c_double), intent(in)            :: alpha, beta
      real(c_double) :: a, b, c
    end subroutine
    subroutine gpu_sgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, &
      b, ldb, beta, c, ldc) bind(C, name='gpu_sgemm')
      import
      type(c_ptr), value, intent(in)        :: handle
      character(c_char), intent(in), value  :: transa, transb
      integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc
      real(c_float), intent(in)             :: alpha, beta
      real(c_float) :: a, b, c
    end subroutine
  end interface
 ! Polymorphic interfaces
 ! ----------------------
 ! Generic names resolved on the rank of the gpu_doubleN argument and, for
 ! gpu_allocate, on the integer kind of the extents.
  interface gpu_allocate
    procedure gpu_allocate_double1     &
             ,gpu_allocate_double2     &
             ,gpu_allocate_double3     &
             ,gpu_allocate_double4     &
             ,gpu_allocate_double5     &
             ,gpu_allocate_double6     &
             ,gpu_allocate_double1_64  &
             ,gpu_allocate_double2_64  &
             ,gpu_allocate_double3_64  &
             ,gpu_allocate_double4_64  &
             ,gpu_allocate_double5_64  &
             ,gpu_allocate_double6_64
  end interface gpu_allocate
  interface gpu_deallocate
    procedure gpu_deallocate_double1     &
             ,gpu_deallocate_double2     &
             ,gpu_deallocate_double3     &
             ,gpu_deallocate_double4     &
             ,gpu_deallocate_double5     &
             ,gpu_deallocate_double6
  end interface gpu_deallocate
  interface gpu_upload
    procedure gpu_upload_double1  &
             ,gpu_upload_double2  &
             ,gpu_upload_double3  &
             ,gpu_upload_double4  &
             ,gpu_upload_double5  &
             ,gpu_upload_double6
  end interface gpu_upload
  interface gpu_download
    procedure gpu_download_double1  &
             ,gpu_download_double2  &
             ,gpu_download_double3  &
             ,gpu_download_double4  &
             ,gpu_download_double5  &
             ,gpu_download_double6
  end interface gpu_download
  interface gpu_copy
    procedure gpu_copy_double1  &
             ,gpu_copy_double2  &
             ,gpu_copy_double3  &
             ,gpu_copy_double4  &
             ,gpu_copy_double5  &
             ,gpu_copy_double6
  end interface gpu_copy
  contains
 ! gpu_allocate
 ! ------------
 ! Allocate product(extents) doubles (8 bytes each) through the C layer and
 ! associate ptr%f with the returned address using the requested shape.
 ! The byte count is always evaluated in 64-bit arithmetic: the first extent
 ! is widened before multiplying so that default-integer extents cannot
 ! overflow.
    subroutine gpu_allocate_double1(ptr, s)
      implicit none
      type(gpu_double1), intent(inout) :: ptr
      integer, intent(in) :: s
      call gpu_allocate_c(ptr%c, int(s,c_int64_t)*8_8)
      call c_f_pointer(ptr%c, ptr%f, (/ s /))
    end subroutine
    subroutine gpu_allocate_double2(ptr, s1, s2)
      implicit none
      type(gpu_double2), intent(inout) :: ptr
      integer, intent(in) :: s1, s2
      call gpu_allocate_c(ptr%c, int(s1,c_int64_t)*s2*8_8)
      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /))
    end subroutine
    subroutine gpu_allocate_double3(ptr, s1, s2, s3)
      implicit none
      type(gpu_double3), intent(inout) :: ptr
      integer, intent(in) :: s1, s2, s3
      call gpu_allocate_c(ptr%c, int(s1,c_int64_t)*s2*s3*8_8)
      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /))
    end subroutine
    subroutine gpu_allocate_double4(ptr, s1, s2, s3, s4)
      implicit none
      type(gpu_double4), intent(inout) :: ptr
      integer, intent(in) :: s1, s2, s3, s4
      call gpu_allocate_c(ptr%c, int(s1,c_int64_t)*s2*s3*s4*8_8)
      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /))
    end subroutine
    subroutine gpu_allocate_double5(ptr, s1, s2, s3, s4, s5)
      implicit none
      type(gpu_double5), intent(inout) :: ptr
      integer, intent(in) :: s1, s2, s3, s4, s5
      call gpu_allocate_c(ptr%c, int(s1,c_int64_t)*s2*s3*s4*s5*8_8)
      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /))
    end subroutine
    subroutine gpu_allocate_double6(ptr, s1, s2, s3, s4, s5, s6)
      implicit none
      type(gpu_double6), intent(inout) :: ptr
      integer, intent(in) :: s1, s2, s3, s4, s5, s6
      call gpu_allocate_c(ptr%c, int(s1,c_int64_t)*s2*s3*s4*s5*s6*8_8)
      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /))
    end subroutine
 ! Same as above with 64-bit extents.
    subroutine gpu_allocate_double1_64(ptr, s)
      implicit none
      type(gpu_double1), intent(inout) :: ptr
      integer*8, intent(in) :: s
      ! s is a number of elements: convert to bytes like every other variant.
      call gpu_allocate_c(ptr%c, s*8_8)
      call c_f_pointer(ptr%c, ptr%f, (/ s /))
    end subroutine
    subroutine gpu_allocate_double2_64(ptr, s1, s2)
      implicit none
      type(gpu_double2), intent(inout) :: ptr
      integer*8, intent(in) :: s1, s2
      call gpu_allocate_c(ptr%c, s1*s2*8_8)
      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /))
    end subroutine
    subroutine gpu_allocate_double3_64(ptr, s1, s2, s3)
      implicit none
      type(gpu_double3), intent(inout) :: ptr
      integer*8, intent(in) :: s1, s2, s3
      call gpu_allocate_c(ptr%c, s1*s2*s3*8_8)
      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /))
    end subroutine
    subroutine gpu_allocate_double4_64(ptr, s1, s2, s3, s4)
      implicit none
      type(gpu_double4), intent(inout) :: ptr
      integer*8, intent(in) :: s1, s2, s3, s4
      call gpu_allocate_c(ptr%c, s1*s2*s3*s4*8_8)
      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /))
    end subroutine
    subroutine gpu_allocate_double5_64(ptr, s1, s2, s3, s4, s5)
      implicit none
      type(gpu_double5), intent(inout) :: ptr
      integer*8, intent(in) :: s1, s2, s3, s4, s5
      call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*8_8)
      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /))
    end subroutine
    subroutine gpu_allocate_double6_64(ptr, s1, s2, s3, s4, s5, s6)
      implicit none
      type(gpu_double6), intent(inout) :: ptr
      integer*8, intent(in) :: s1, s2, s3, s4, s5, s6
      call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*s6*8_8)
      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /))
    end subroutine
 ! gpu_deallocate
 ! --------------
 ! Release the buffer through the C layer and disassociate the Fortran view.
    subroutine gpu_deallocate_double1(ptr)
      implicit none
      type(gpu_double1), intent(inout) :: ptr
      call gpu_deallocate_c(ptr%c)
      NULLIFY(ptr%f)
    end subroutine
    subroutine gpu_deallocate_double2(ptr)
      implicit none
      type(gpu_double2), intent(inout) :: ptr
      call gpu_deallocate_c(ptr%c)
      NULLIFY(ptr%f)
    end subroutine
    subroutine gpu_deallocate_double3(ptr)
      implicit none
      type(gpu_double3), intent(inout) :: ptr
      call gpu_deallocate_c(ptr%c)
      NULLIFY(ptr%f)
    end subroutine
    subroutine gpu_deallocate_double4(ptr)
      implicit none
      type(gpu_double4), intent(inout) :: ptr
      call gpu_deallocate_c(ptr%c)
      NULLIFY(ptr%f)
    end subroutine
    subroutine gpu_deallocate_double5(ptr)
      implicit none
      type(gpu_double5), intent(inout) :: ptr
      call gpu_deallocate_c(ptr%c)
      NULLIFY(ptr%f)
    end subroutine
    subroutine gpu_deallocate_double6(ptr)
      implicit none
      type(gpu_double6), intent(inout) :: ptr
      call gpu_deallocate_c(ptr%c)
      NULLIFY(ptr%f)
    end subroutine
 ! gpu_upload
 ! ----------
 ! Copy from the host array cpu_ptr into the buffer gpu_ptr%c.  The number
 ! of bytes transferred is taken from the shape of gpu_ptr%f (computed in
 ! 64-bit), so cpu_ptr must hold at least that many elements.
    subroutine gpu_upload_double1(cpu_ptr, gpu_ptr)
      implicit none
      double precision, target, intent(in)     :: cpu_ptr(*)
      type(gpu_double1), intent(in)    :: gpu_ptr
      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, 8_8*size(gpu_ptr%f, kind=c_int64_t))
    end subroutine
    subroutine gpu_upload_double2(cpu_ptr, gpu_ptr)
      implicit none
      double precision, target, intent(in)     :: cpu_ptr(:,:)
      type(gpu_double2), intent(in)    :: gpu_ptr
      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
    end subroutine
    subroutine gpu_upload_double3(cpu_ptr, gpu_ptr)
      implicit none
      double precision, target, intent(in)     :: cpu_ptr(:,:,:)
      type(gpu_double3), intent(in)    :: gpu_ptr
      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
    end subroutine
    subroutine gpu_upload_double4(cpu_ptr, gpu_ptr)
      implicit none
      double precision, target, intent(in)     :: cpu_ptr(:,:,:,:)
      type(gpu_double4), intent(in)    :: gpu_ptr
      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
    end subroutine
    subroutine gpu_upload_double5(cpu_ptr, gpu_ptr)
      implicit none
      double precision, target, intent(in)     :: cpu_ptr(:,:,:,:,:)
      type(gpu_double5), intent(in)    :: gpu_ptr
      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
    end subroutine
    subroutine gpu_upload_double6(cpu_ptr, gpu_ptr)
      implicit none
      double precision, target, intent(in)     :: cpu_ptr(:,:,:,:,:,:)
      type(gpu_double6), intent(in)    :: gpu_ptr
      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
    end subroutine
 ! gpu_download
 ! ------------
 ! Copy from the buffer gpu_ptr%c into the host array cpu_ptr.  The number
 ! of bytes is taken from the shape of gpu_ptr%f.  cpu_ptr is written by the
 ! C layer through its address, so it is declared intent(inout).
    subroutine gpu_download_double1(gpu_ptr, cpu_ptr)
      implicit none
      type(gpu_double1), intent(in)  :: gpu_ptr
      double precision, target, intent(inout)   :: cpu_ptr(:)
      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*size(gpu_ptr%f, kind=c_int64_t))
    end subroutine
    subroutine gpu_download_double2(gpu_ptr, cpu_ptr)
      implicit none
      type(gpu_double2), intent(in)  :: gpu_ptr
      double precision, target, intent(inout)   :: cpu_ptr(:,:)
      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
    end subroutine
    subroutine gpu_download_double3(gpu_ptr, cpu_ptr)
      implicit none
      type(gpu_double3), intent(in)  :: gpu_ptr
      double precision, target, intent(inout)   :: cpu_ptr(:,:,:)
      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
    end subroutine
    subroutine gpu_download_double4(gpu_ptr, cpu_ptr)
      implicit none
      type(gpu_double4), intent(in)  :: gpu_ptr
      double precision, target, intent(inout)   :: cpu_ptr(:,:,:,:)
      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
    end subroutine
    subroutine gpu_download_double5(gpu_ptr, cpu_ptr)
      implicit none
      type(gpu_double5), intent(in)  :: gpu_ptr
      double precision, target, intent(inout)   :: cpu_ptr(:,:,:,:,:)
      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
    end subroutine
    subroutine gpu_download_double6(gpu_ptr, cpu_ptr)
      implicit none
      type(gpu_double6), intent(in)  :: gpu_ptr
      double precision, target, intent(inout)   :: cpu_ptr(:,:,:,:,:,:)
      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
    end subroutine
 ! gpu_copy
 ! --------
 ! Copy between two buffers managed by the C layer.  The number of bytes is
 ! taken from the shape of the destination.
    subroutine gpu_copy_double1(gpu_ptr_src, gpu_ptr_dest)
      implicit none
      type(gpu_double1), intent(in)        :: gpu_ptr_src
      type(gpu_double1), intent(in)        :: gpu_ptr_dest
      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*size(gpu_ptr_dest%f, kind=c_int64_t))
    end subroutine
    subroutine gpu_copy_double2(gpu_ptr_src, gpu_ptr_dest)
      implicit none
      type(gpu_double2), intent(in)        :: gpu_ptr_src
      type(gpu_double2), intent(in)        :: gpu_ptr_dest
      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
    end subroutine
    subroutine gpu_copy_double3(gpu_ptr_src, gpu_ptr_dest)
      implicit none
      type(gpu_double3), intent(in)        :: gpu_ptr_src
      type(gpu_double3), intent(in)        :: gpu_ptr_dest
      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
    end subroutine
    subroutine gpu_copy_double4(gpu_ptr_src, gpu_ptr_dest)
      implicit none
      type(gpu_double4), intent(in)        :: gpu_ptr_src
      type(gpu_double4), intent(in)        :: gpu_ptr_dest
      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
    end subroutine
    subroutine gpu_copy_double5(gpu_ptr_src, gpu_ptr_dest)
      implicit none
      type(gpu_double5), intent(in)        :: gpu_ptr_src
      type(gpu_double5), intent(in)        :: gpu_ptr_dest
      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
    end subroutine
    subroutine gpu_copy_double6(gpu_ptr_src, gpu_ptr_dest)
      implicit none
      type(gpu_double6), intent(in)        :: gpu_ptr_src
      type(gpu_double6), intent(in)        :: gpu_ptr_dest
      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
    end subroutine
 ! gpu_stream
 ! ----------
 ! Thin wrappers forwarding the C handle stored in the derived types.
    subroutine gpu_stream_create(stream)
      type(gpu_stream) :: stream
      call gpu_stream_create_c(stream%c)
    end subroutine
    subroutine gpu_stream_destroy(stream)
      type(gpu_stream) :: stream
      call gpu_stream_destroy_c(stream%c)
    end subroutine
    ! Attach `stream` to the BLAS handle `handle`.
    subroutine gpu_set_stream(handle, stream)
      type(gpu_blas)   :: handle
      type(gpu_stream) :: stream
      call gpu_set_stream_c(handle%c, stream%c)
    end subroutine
 ! gpu_blas
 ! --------
    subroutine gpu_blas_create(handle)
      type(gpu_blas) :: handle
      call gpu_blas_create_c(handle%c)
    end subroutine
    subroutine gpu_blas_destroy(handle)
      type(gpu_blas) :: handle
      call gpu_blas_destroy_c(handle%c)
    end subroutine
 ! dot
 ! ---
 ! dx and dy are the first elements of the vectors (pass e.g. x%f(1)); their
 ! addresses are forwarded to the C routine.  The result is returned in res.
 subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res)
 !  use gpu
  type(gpu_blas), intent(in)     :: handle
  integer*4                      :: n, incx, incy
  double precision, target       :: dx, dy
  double precision, intent(out)  :: res
  call gpu_ddot_c(handle%c, int(n,c_int64_t), c_loc(dx), int(incx,c_int64_t), c_loc(dy), int(incy,c_int64_t), res)
 end subroutine
 ! Same as gpu_ddot with 64-bit length and strides.
 subroutine gpu_ddot_64(handle, n, dx, incx, dy, incy, res)
 !  use gpu
  type(gpu_blas), intent(in)     :: handle
  integer*8                      :: n, incx, incy
  double precision, target       :: dx, dy
  double precision, intent(out)  :: res
  call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res)
 end subroutine
 ! geam
 ! ----
 ! a, b and c are the first elements of the matrices; their addresses are
 ! forwarded to the C routine, integer arguments are widened to 64-bit.
 subroutine gpu_dgeam(handle, transa, transb, m, n, alpha, a, lda, beta, &
  b, ldb, c, ldc)
 !  use gpu
  type(gpu_blas), intent(in)   :: handle
  character, intent(in)        :: transa, transb
  integer*4, intent(in)        :: m, n, lda, ldb, ldc
  double precision, intent(in) :: alpha, beta
  double precision, target     :: a, b, c
  call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, &
        c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t))
 end subroutine
 subroutine gpu_dgeam_64(handle, transa, transb, m, n, alpha, a, lda, beta, &
  b, ldb, c, ldc)
 !  use gpu
  type(gpu_blas), intent(in)   :: handle
  character, intent(in)        :: transa, transb
  integer*8, intent(in)        :: m, n, lda, ldb, ldc
  double precision, intent(in) :: alpha, beta
  double precision, target     :: a, b, c
  call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, &
        c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t))
 end subroutine
 ! gemv
 ! ----
 ! a, x and y are the first elements of the operands and are passed by
 ! reference to the C routine; integer arguments are widened to 64-bit.
 subroutine gpu_dgemv(handle, transa, m, n, alpha, a, lda, &
  x, incx, beta, y, incy)
 !  use gpu
  type(gpu_blas), intent(in)   :: handle
  character, intent(in)        :: transa
  integer*4, intent(in)        :: m, n, lda, incx, incy
  double precision, intent(in) :: alpha, beta
  double precision             :: a, x, y
  call gpu_dgemv_c(handle%c, transa, int(m,c_int64_t), int(n,c_int64_t), &
        alpha, a, int(lda,c_int64_t), &
        x, int(incx,c_int64_t), beta, y, int(incy,c_int64_t))
 end subroutine
 subroutine gpu_dgemv_64(handle, transa, m, n, alpha, a, lda, &
  x, incx, beta, y, incy)
 !  use gpu
  type(gpu_blas), intent(in)   :: handle
  character, intent(in)        :: transa
  integer*8, intent(in)        :: m, n, lda, incx, incy
  double precision, intent(in) :: alpha, beta
  double precision             :: a, x, y
  call gpu_dgemv_c(handle%c, transa, int(m,c_int64_t), int(n,c_int64_t), &
        alpha, a, int(lda,c_int64_t), &
        x, int(incx,c_int64_t), beta, y, int(incy,c_int64_t))
 end subroutine
 ! gemm
 ! ----
 ! a, b and c are the first elements of the matrices and are passed by
 ! reference to the C routine; integer arguments are widened to 64-bit.
 subroutine gpu_dgemm(handle, transa, transb, m, n, k, alpha, a, lda, &
  b, ldb, beta, c, ldc)
 !  use gpu
  type(gpu_blas), intent(in)   :: handle
  character, intent(in)        :: transa, transb
  integer*4, intent(in)        :: m, n, k, lda, ldb, ldc
  double precision, intent(in) :: alpha, beta
  double precision             :: a, b, c
  call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), &
        alpha, a, int(lda,c_int64_t), &
        b, int(ldb,c_int64_t), beta, c, int(ldc,c_int64_t))
 end subroutine
 subroutine gpu_dgemm_64(handle, transa, transb, m, n, k, alpha, a, lda, &
  b, ldb, beta, c, ldc)
 !  use gpu
  type(gpu_blas), intent(in)   :: handle
  character, intent(in)        :: transa, transb
  integer*8, intent(in)        :: m, n, k, lda, ldb, ldc
  double precision, intent(in) :: alpha, beta
  double precision             :: a, b, c
  call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), &
        alpha, a, int(lda,c_int64_t), b, int(ldb,c_int64_t), beta, c, int(ldc,c_int64_t))
 end subroutine
 end module
--- a/src/gpu_x86/gpu_module.F90
+++ b/src/gpu_x86/gpu_module.F90
@ -1,141 +0,0 @@
 module gpu
  ! Explicit interfaces to the C routines of the GPU abstraction layer:
  ! device selection, memory allocation and transfers, streams, BLAS handles
  ! and dot products.  Only interface declarations live here; the
  ! implementations are external C functions.
  !
  ! c_ptr and c_float are part of the import list because the interfaces below
  ! declare type(c_ptr) and real(c_float) arguments.
  use, intrinsic :: iso_c_binding, only : c_int32_t, c_int64_t, c_double, c_float, &
                                          c_size_t, c_char, c_ptr
  implicit none
  interface
    ! Returns an integer (number of devices, judging by the name).
    integer function gpu_ndevices() bind(C)
    end function
    ! Takes a device id by value.
    subroutine gpu_set_device(id) bind(C)
      import
      integer(c_int32_t), value :: id
    end subroutine
    ! Bound to the C symbol gpu_allocate; ptr is passed by reference,
    ! n by value (callers in this file pass a byte count).
    subroutine gpu_allocate_c(ptr, n) bind(C, name='gpu_allocate')
      import
      type(c_ptr) :: ptr
      integer(c_int64_t), value :: n
    end subroutine
    ! Bound to the C symbol gpu_free; ptr is passed by reference.
    subroutine gpu_free_c(ptr) bind(C, name='gpu_free')
      import
      type(c_ptr) :: ptr
    end subroutine
    ! Bound to the C symbol gpu_upload; all arguments are passed by value.
    subroutine gpu_upload_c(cpu_ptr, gpu_ptr, n) bind(C, name='gpu_upload')
      import
      type(c_ptr), value :: cpu_ptr
      type(c_ptr), value :: gpu_ptr
      integer(c_int64_t), value :: n
    end subroutine
    ! Bound to the C symbol gpu_download; all arguments are passed by value.
    subroutine gpu_download_c(gpu_ptr, cpu_ptr, n) bind(C, name='gpu_download')
      import
      type(c_ptr), value :: gpu_ptr
      type(c_ptr), value :: cpu_ptr
      integer(c_int64_t), value :: n
    end subroutine
    ! Bound to the C symbol gpu_copy; all arguments are passed by value.
    subroutine gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, n) bind(C, name='gpu_copy')
      import
      type(c_ptr), value :: gpu_ptr_src
      type(c_ptr), value :: gpu_ptr_dest
      integer(c_int64_t), value :: n
    end subroutine
    subroutine gpu_stream_create(stream) bind(C)
      import
      type(c_ptr) :: stream
    end subroutine
    subroutine gpu_stream_destroy(stream) bind(C)
      import
      type(c_ptr) :: stream
    end subroutine
    subroutine gpu_set_stream(handle, stream) bind(C)
      import
      type(c_ptr) :: handle, stream
    end subroutine
    ! bind(C) added so that this resolves to the same kind of C symbol as
    ! every other routine in this interface block.
    subroutine gpu_synchronize() bind(C)
    end subroutine
    subroutine gpu_blas_create(handle) bind(C)
      import
      type(c_ptr) :: handle
    end subroutine
    subroutine gpu_blas_destroy(handle) bind(C)
      import
      type(c_ptr) :: handle
    end subroutine
    ! Double-precision dot product; n, incx, incy by value, result in res.
    subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) bind(C)
      import
      type(c_ptr), intent(in)     :: handle
      integer(c_int64_t), value   :: n, incx, incy
      real(c_double), intent(in)  :: dx(*), dy(*)
      real(c_double), intent(out) :: res
    end subroutine
    ! Single-precision dot product; n, incx, incy by value, result in res.
    subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res) bind(C)
      import
      type(c_ptr), intent(in)     :: handle
      integer(c_int64_t), value   :: n, incx, incy
      real(c_float), intent(in)   :: dx(*), dy(*)
      real(c_float), intent(out)  :: res
    end subroutine
  end interface
 end module
 subroutine gpu_allocate_double(ptr, s)
  ! Requests memory through gpu_allocate_c and associates the Fortran pointer
  ! ptr with it via c_f_pointer.
  !
  ! ptr : scalar double-precision pointer, associated on return
  ! s   : assumed-size array of 8-byte integers (presumably the requested
  !       shape -- TODO confirm against callers)
  !
  ! NOTE(review): sum(s) is applied to an assumed-size array, whose extent is
  ! unknown inside this routine -- confirm this is accepted by the compiler.
  ! NOTE(review): if s is a shape, the element count would be its product
  ! rather than its sum -- confirm the intended size.
  ! NOTE(review): ptr is a scalar pointer but a shape argument is passed to
  ! c_f_pointer -- confirm.
  use gpu
  implicit none
  double precision, pointer, intent(inout) :: ptr
  integer*8, intent(in) :: s(*)
  type(c_ptr) :: cptr
  ! Byte count requested: sum(s) elements of 8 bytes each
  call gpu_allocate_c(cptr, sum(s)*8_8)
  call c_f_pointer(cptr, ptr, s)
 end subroutine
 subroutine gpu_free_double(ptr)
  ! Releases the memory ptr points to through gpu_free_c (the interface bound
  ! to the C symbol gpu_free) and leaves ptr disassociated.
  !
  ! ptr : double-precision pointer; nullified on return
  use gpu
  use, intrinsic :: iso_c_binding, only : c_ptr, c_loc
  implicit none
  double precision, pointer, intent(inout) :: ptr
  type(c_ptr) :: cptr
  ! Nothing to release when the pointer is not associated
  if (.not. associated(ptr)) return
  ! c_loc is the iso_c_binding intrinsic giving the C address of the target
  cptr = c_loc(ptr)
  call gpu_free_c(cptr)
  nullify(ptr)
 end subroutine
 subroutine gpu_upload_double(cpu_ptr, gpu_ptr, n)
  ! Forwards cpu_ptr and gpu_ptr to gpu_upload_c with a byte count of 8*n.
  !
  ! cpu_ptr : source array (intent in)
  ! gpu_ptr : destination array (intent out)
  ! n       : number of double-precision elements
  !
  ! NOTE(review): the gpu_upload_c interface declares its first two arguments
  ! as type(c_ptr), value, whereas double-precision arrays are passed here --
  ! confirm that this type-checks and behaves as intended.
  use gpu
  implicit none
  double precision, intent(in)   :: cpu_ptr(*)
  double precision, intent(out)  :: gpu_ptr(*)
  integer(c_int64_t), intent(in) :: n
  call gpu_upload_c(cpu_ptr, gpu_ptr, 8_8*n)
 end subroutine
 subroutine gpu_download_double(gpu_ptr, cpu_ptr, n)
  ! Forwards gpu_ptr and cpu_ptr to gpu_download_c with a byte count of 8*n.
  !
  ! gpu_ptr : source array (intent in)
  ! cpu_ptr : destination array (intent out)
  ! n       : number of double-precision elements
  !
  ! NOTE(review): the gpu_download_c interface declares its first two
  ! arguments as type(c_ptr), value, whereas double-precision arrays are
  ! passed here -- confirm that this type-checks and behaves as intended.
  use gpu
  implicit none
  double precision, intent(in)   :: gpu_ptr(*)
  double precision, intent(out)  :: cpu_ptr(*)
  integer(c_int64_t), intent(in) :: n
  call gpu_download_c(gpu_ptr, cpu_ptr, 8_8*n)
 end subroutine
 subroutine gpu_copy_double(gpu_ptr_src, gpu_ptr_dest, n)
  ! Forwards both arrays to gpu_copy_c with a byte count of 8*n.
  !
  ! gpu_ptr_src  : source array (intent in)
  ! gpu_ptr_dest : destination array (intent out)
  ! n            : number of double-precision elements
  !
  ! NOTE(review): the gpu_copy_c interface declares its first two arguments
  ! as type(c_ptr), value, whereas double-precision arrays are passed here --
  ! confirm that this type-checks and behaves as intended.
  use gpu
  implicit none
  double precision, intent(in)   :: gpu_ptr_src(*)
  double precision, intent(out)  :: gpu_ptr_dest(*)
  integer(c_int64_t), intent(in) :: n
  call gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, 8_8*n)
 end subroutine
--- a/src/mo_optimization/NEED
+++ b/src/mo_optimization/NEED
@ -1,7 +1,3 @@
-two_body_rdm
+mo_optimization_utils
 hartree_fock
 cipsi
 davidson_undressed
 selectors_full
 generators_full
 utils_trust_region
--- a/src/mo_optimization/cipsi_orb_opt.irp.f
+++ b/src/mo_optimization/cipsi_orb_opt.irp.f
@ -2,87 +2,7 @@ program optimization
  read_wf = .true. ! must be True for the orbital optimization !!!
  TOUCH read_wf 
-  call run_optimization
+  call run_optimization_mos_CIPSI
 end
 subroutine run_optimization
  ! Alternates CIPSI steps (run_stochastic_cipsi) and orbital optimizations
  ! (run_orb_opt_trust_v2), logging the state-average energy before and after
  ! each orbital optimization in <ezfio_filename>/mo_optimization/result_opt.
  !
  ! The loop is left only through the two exit statements below:
  ! not_converged is never set to false in this routine.
  implicit none
  double precision :: e_cipsi, e_opt, delta_e
  double precision, allocatable :: Ev(:),PT2(:)
  integer :: nb_iter,i
  logical :: not_converged
  character (len=100) :: filename
  PROVIDE psi_det psi_coef mo_two_e_integrals_in_map ao_pseudo_integrals
  allocate(Ev(N_states),PT2(N_states))
  not_converged = .True.
  nb_iter = 0
  ! To start from the wf: the determinant target is the current n_det,
  ! but at least 5
  N_det_max = max(n_det,5)
  TOUCH N_det_max
  ! Create the result file with its header and a first line for the
  ! starting wave function (both energy columns equal, zero difference)
  open(unit=10, file=trim(ezfio_filename)//'/mo_optimization/result_opt')
  write(10,*) "   Ndet        E_cipsi         E_opt          Delta_e"
  call state_average_energy(e_cipsi)
  write(10,'(I10, 3F15.7)') n_det, e_cipsi, e_cipsi, 0d0
  close(10)
  do while (not_converged)
      print*,''
      print*,'======================'
      print*,' Cipsi step:', nb_iter
      print*,'======================'
      print*,'' 
      print*,'********** cipsi step **********'
      ! cipsi calculation
      call run_stochastic_cipsi(Ev,PT2)
      ! State average energy after the cipsi step
      call state_average_energy(e_cipsi)
      print*,''
      print*,'********** optimization step **********'
      ! orbital optimization
      call run_orb_opt_trust_v2
      ! State average energy after the orbital optimization
      call state_average_energy(e_opt)
      print*,''
      print*,'********** diff step **********'
      ! Gain in energy
      delta_e = e_opt - e_cipsi
      print*, 'Gain in energy during the orbital optimization:', delta_e
      ! Append one line per iteration to the result file
      open(unit=10, file=trim(ezfio_filename)//'/mo_optimization/result_opt', position='append')
      write(10,'(I10, 3F15.7)') n_det, e_cipsi, e_opt, delta_e
      close(10)
      ! Exit when the energy went up by more than 1d-12 during the
      ! orbital optimization
      if (delta_e > 1d-12) then
          print*, 'WARNING, something wrong happened'
          print*, 'The gain (delta_e) in energy during the optimization process'
          print*, 'is > 0, but it must be < 0'
          print*, 'The program will exit'
          exit
      endif
      ! Exit when the wave function exceeds n_det_max_opt determinants
      if (n_det > n_det_max_opt) then
          print*, 'The number of determinants in the wf > n_det_max_opt'
          print*, 'The program will exit'
          exit
      endif
      ! Next determinant target: 0.9 times twice the current n_det,
      ! truncated to an integer
      N_det_max = int(dble(n_det * 2)*0.9)
      TOUCH N_det_max
      nb_iter = nb_iter + 1
  enddo
 end
--- a/src/mo_optimization_utils/EZFIO.cfg
+++ b/src/mo_optimization_utils/EZFIO.cfg
--- a/src/mo_optimization_utils/NEED
+++ b/src/mo_optimization_utils/NEED
@ -0,0 +1,5 @@
 two_body_rdm
 hartree_fock
 cipsi
 davidson_undressed
 utils_trust_region
--- a/src/mo_optimization_utils/README.md
+++ b/src/mo_optimization_utils/README.md
@ -0,0 +1,74 @@
 # Orbital optimization
 ## Methods  
 Different methods are available:  
 - full hessian  
 ``` 
 qp set orbital_optimization optimization_method full  
 ```  
 - diagonal hessian  
 ``` 
 qp set orbital_optimization optimization_method diag  
 ``` 
 - identity matrix  
 ``` 
 qp set orbital_optimization optimization_method none  
 ``` 
 After the optimization the ezfio contains the optimized orbitals
 ## For a fixed number of determinants
 To optimize the MOs for the actual determinants:  
 ``` 
 qp run orb_opt
 ``` 
 ## For a complete optimization, i.e., with a larger and larger wave function
 To optimize the MOs with a larger and larger wave function:  
 ``` 
 qp run optimization  
 ``` 
 The results are stored in the EZFIO in "mo_optimization/result_opt",
 with the following format:  
 (1) (2) (3) (4)  
 1: Number of determinants in the wf,  
 2: Cipsi energy before the optimization,   
 3: Cipsi energy after the optimization,  
 4: Energy difference between (2) and (3).  
 The optimization process is the following: 
 - we do a first cipsi step to obtain a small number of determinants in the wf 
 - we run an orbital optimization for this wf 
 - we do a new cipsi step to double the number of determinants in the wf 
 - we run an orbital optimization for this wf 
 - ... 
 - we do that until the energy difference between (2) and (3) is  
  smaller than the targeted accuracy for the cipsi (targeted_accuracy_cipsi in qp edit) 
  or the wf is larger than a given size (n_det_max_opt in qp edit) 
 - after that you can reset your determinants (qp reset -d) and run a clean Cipsi calculation  
 ### End of the optimization
 You can choose the number of determinants after which the 
 optimization will stop:
 ```
 qp set orbital_optimization n_det_max_opt 1e5 # or any number
 ```
 ## Weight of the states
 You can change the weights of the different states directly in qp edit.  
 It will affect the weights used in the orbital optimization.
 # Tests
 To run the tests:  
 ``` 
 qp test
 ``` 
 # Org files
 The org files are stored in the directory org in order to avoid overwriting user changes.
 The org files can be modified, to export the change to the source code, run
 ```
 ./TANGLE_org_mode.sh
 mv *.irp.f ../.
 ```
--- a/src/mo_optimization_utils/constants.h
+++ b/src/mo_optimization_utils/constants.h
--- a/src/mo_optimization_utils/diagonal_hessian_list_opt.irp.f
+++ b/src/mo_optimization_utils/diagonal_hessian_list_opt.irp.f
--- a/src/mo_optimization_utils/diagonal_hessian_opt.irp.f
+++ b/src/mo_optimization_utils/diagonal_hessian_opt.irp.f
--- a/src/mo_optimization_utils/diagonalization_hessian.irp.f
+++ b/src/mo_optimization_utils/diagonalization_hessian.irp.f
--- a/src/mo_optimization_utils/first_diagonal_hessian_list_opt.irp.f
+++ b/src/mo_optimization_utils/first_diagonal_hessian_list_opt.irp.f
--- a/src/mo_optimization_utils/first_diagonal_hessian_opt.irp.f
+++ b/src/mo_optimization_utils/first_diagonal_hessian_opt.irp.f
--- a/src/mo_optimization_utils/first_gradient_list_opt.irp.f
+++ b/src/mo_optimization_utils/first_gradient_list_opt.irp.f
--- a/src/mo_optimization_utils/first_gradient_opt.irp.f
+++ b/src/mo_optimization_utils/first_gradient_opt.irp.f
--- a/src/mo_optimization_utils/first_hessian_list_opt.irp.f
+++ b/src/mo_optimization_utils/first_hessian_list_opt.irp.f
--- a/src/mo_optimization_utils/first_hessian_opt.irp.f
+++ b/src/mo_optimization_utils/first_hessian_opt.irp.f
--- a/src/mo_optimization_utils/gradient_list_opt.irp.f
+++ b/src/mo_optimization_utils/gradient_list_opt.irp.f
--- a/src/mo_optimization_utils/gradient_opt.irp.f
+++ b/src/mo_optimization_utils/gradient_opt.irp.f
--- a/src/mo_optimization_utils/hessian_list_opt.irp.f
+++ b/src/mo_optimization_utils/hessian_list_opt.irp.f
--- a/src/mo_optimization_utils/hessian_opt.irp.f
+++ b/src/mo_optimization_utils/hessian_opt.irp.f
--- a/src/mo_optimization_utils/org/TODO.org
+++ b/src/mo_optimization_utils/org/TODO.org
--- a/src/mo_optimization_utils/org/debug_gradient_list_opt.org
+++ b/src/mo_optimization_utils/org/debug_gradient_list_opt.org
--- a/src/mo_optimization_utils/org/debug_gradient_opt.org
+++ b/src/mo_optimization_utils/org/debug_gradient_opt.org
--- a/src/mo_optimization_utils/org/debug_hessian_list_opt.org
+++ b/src/mo_optimization_utils/org/debug_hessian_list_opt.org
--- a/src/mo_optimization_utils/org/debug_hessian_opt.org
+++ b/src/mo_optimization_utils/org/debug_hessian_opt.org
--- a/src/mo_optimization_utils/org/diagonal_hessian_list_opt.org
+++ b/src/mo_optimization_utils/org/diagonal_hessian_list_opt.org
--- a/src/mo_optimization_utils/org/diagonal_hessian_opt.org
+++ b/src/mo_optimization_utils/org/diagonal_hessian_opt.org
--- a/src/mo_optimization_utils/org/diagonalization_hessian.org
+++ b/src/mo_optimization_utils/org/diagonalization_hessian.org
--- a/src/mo_optimization_utils/org/first_diagonal_hessian_list_opt.org
+++ b/src/mo_optimization_utils/org/first_diagonal_hessian_list_opt.org
--- a/src/mo_optimization_utils/org/first_diagonal_hessian_opt.org
+++ b/src/mo_optimization_utils/org/first_diagonal_hessian_opt.org
--- a/src/mo_optimization_utils/org/first_gradient_list_opt.org
+++ b/src/mo_optimization_utils/org/first_gradient_list_opt.org
--- a/src/mo_optimization_utils/org/first_gradient_opt.org
+++ b/src/mo_optimization_utils/org/first_gradient_opt.org
--- a/src/mo_optimization_utils/org/first_hessian_list_opt.org
+++ b/src/mo_optimization_utils/org/first_hessian_list_opt.org
--- a/src/mo_optimization_utils/org/first_hessian_opt.org
+++ b/src/mo_optimization_utils/org/first_hessian_opt.org
--- a/src/mo_optimization_utils/org/gradient_list_opt.org
+++ b/src/mo_optimization_utils/org/gradient_list_opt.org
--- a/src/mo_optimization_utils/org/gradient_opt.org
+++ b/src/mo_optimization_utils/org/gradient_opt.org
--- a/src/mo_optimization_utils/org/hessian_list_opt.org
+++ b/src/mo_optimization_utils/org/hessian_list_opt.org
--- a/src/mo_optimization_utils/org/hessian_opt.org
+++ b/src/mo_optimization_utils/org/hessian_opt.org
--- a/src/mo_optimization_utils/org/my_providers.org
+++ b/src/mo_optimization_utils/org/my_providers.org
--- a/src/mo_optimization_utils/org/optimization.org
+++ b/src/mo_optimization_utils/org/optimization.org
--- a/src/mo_optimization_utils/org/orb_opt_trust_v2.org
+++ b/src/mo_optimization_utils/org/orb_opt_trust_v2.org
--- a/src/mo_optimization_utils/org/state_average_energy.org
+++ b/src/mo_optimization_utils/org/state_average_energy.org
--- a/src/mo_optimization_utils/org/state_weight_normalization.org
+++ b/src/mo_optimization_utils/org/state_weight_normalization.org
--- a/src/mo_optimization_utils/org/update_parameters.org
+++ b/src/mo_optimization_utils/org/update_parameters.org
--- a/src/mo_optimization_utils/org/update_st_av_ci_energy.org
+++ b/src/mo_optimization_utils/org/update_st_av_ci_energy.org
--- a/src/mo_optimization_utils/routine_opt_mos.irp.f
+++ b/src/mo_optimization_utils/routine_opt_mos.irp.f
@ -0,0 +1,81 @@
 subroutine run_optimization_mos_CIPSI
  implicit none
  ! Driver alternating CIPSI steps and orbital optimizations.
  !
  ! Each cycle runs run_stochastic_cipsi, then run_orb_opt_trust_v2, and
  ! appends the number of determinants, the state-average energies before and
  ! after the orbital optimization and their difference to
  ! <ezfio_filename>/mo_optimization/result_opt.
  !
  ! The cycle stops when the orbital optimization raised the energy by more
  ! than 1d-12, or when n_det exceeds n_det_max_opt.
  integer, parameter            :: res_unit_loc = 10
  double precision              :: en_cipsi_loc, en_opt_loc, en_gain_loc
  double precision, allocatable :: Ev(:), PT2(:)
  integer                       :: istep_loc
  logical                       :: keep_looping_loc

  PROVIDE psi_det psi_coef mo_two_e_integrals_in_map ao_pseudo_integrals

  allocate(Ev(N_states), PT2(N_states))

  istep_loc        = 0
  keep_looping_loc = .True.

  ! Start from the current wave function (at least 5 determinants)
  N_det_max = max(n_det,5)
  TOUCH N_det_max

  ! Header and reference line of the result file
  call state_average_energy(en_cipsi_loc)
  open(unit=res_unit_loc, file=trim(ezfio_filename)//'/mo_optimization/result_opt')
  write(res_unit_loc,*) "   Ndet        E_cipsi         E_opt          Delta_e"
  write(res_unit_loc,'(I10, 3F15.7)') n_det, en_cipsi_loc, en_cipsi_loc, 0d0
  close(res_unit_loc)

  ! keep_looping_loc stays true: the loop is left through the exits below
  do while (keep_looping_loc)

      print *, ''
      print *, '======================'
      print *, ' Cipsi step:', istep_loc
      print *, '======================'
      print *, ''
      print *, '********** cipsi step **********'

      ! Selection step, then energy of the enlarged wave function
      call run_stochastic_cipsi(Ev,PT2)
      call state_average_energy(en_cipsi_loc)

      print *, ''
      print *, '********** optimization step **********'

      ! Orbital optimization, then energy with the new orbitals
      call run_orb_opt_trust_v2
      call state_average_energy(en_opt_loc)

      print *, ''
      print *, '********** diff step **********'

      ! Energy change brought by the orbital optimization
      en_gain_loc = en_opt_loc - en_cipsi_loc
      print *, 'Gain in energy during the orbital optimization:', en_gain_loc

      open(unit=res_unit_loc, file=trim(ezfio_filename)//'/mo_optimization/result_opt', position='append')
      write(res_unit_loc,'(I10, 3F15.7)') n_det, en_cipsi_loc, en_opt_loc, en_gain_loc
      close(res_unit_loc)

      ! Stop: the energy must not increase
      if (en_gain_loc > 1d-12) then
          print *, 'WARNING, something wrong happened'
          print *, 'The gain (delta_e) in energy during the optimization process'
          print *, 'is > 0, but it must be < 0'
          print *, 'The program will exit'
          exit
      end if

      ! Stop: wave function larger than allowed
      if (n_det > n_det_max_opt) then
          print *, 'The number of determinants in the wf > n_det_max_opt'
          print *, 'The program will exit'
          exit
      end if

      ! Determinant target of the next selection step
      N_det_max = int(dble(n_det * 2)*0.9)
      TOUCH N_det_max

      istep_loc = istep_loc + 1
  end do
 end
--- a/src/mo_optimization_utils/run_orb_opt_trust_v2.irp.f
+++ b/src/mo_optimization_utils/run_orb_opt_trust_v2.irp.f
--- a/src/mo_optimization_utils/save_energy.irp.f
+++ b/src/mo_optimization_utils/save_energy.irp.f
--- a/src/mo_optimization_utils/state_average_energy.irp.f
+++ b/src/mo_optimization_utils/state_average_energy.irp.f
--- a/src/mo_optimization_utils/state_weight_normalization.irp.f
+++ b/src/mo_optimization_utils/state_weight_normalization.irp.f
--- a/src/mo_optimization_utils/update_parameters.irp.f
+++ b/src/mo_optimization_utils/update_parameters.irp.f
--- a/src/mo_optimization_utils/update_st_av_ci_energy.irp.f
+++ b/src/mo_optimization_utils/update_st_av_ci_energy.irp.f
--- a/src/mo_two_e_ints/cholesky.irp.f
+++ b/src/mo_two_e_ints/cholesky.irp.f
@ -101,3 +101,34 @@ BEGIN_PROVIDER [ double precision, cholesky_mo_transp, (cholesky_mo_num, mo_num,
 END_PROVIDER
 BEGIN_PROVIDER [ double precision, cholesky_semi_mo_transp_simple, (cholesky_mo_num, ao_num, mo_num) ]
 implicit none
 BEGIN_DOC
 ! Cholesky vectors with one MO index contracted with the MO coefficients:
 !
 ! cholesky_semi_mo_transp_simple(i_chol,i_ao,i_mo) =
 !   \sum_{j_mo} cholesky_mo_transp(i_chol,j_mo,i_mo) * mo_coef_transp(j_mo,i_ao)
 END_DOC
 integer :: i_chol, i_mo, j_mo, i_ao

 print *, 'Semi AO->MO Transformation of Cholesky vectors'

 ! No scratch array is needed: the contraction accumulates directly into
 ! the provided array (the former work array was allocated but never used).
 cholesky_semi_mo_transp_simple = 0.d0
 do i_mo = 1, mo_num
  do i_ao = 1, ao_num
   do j_mo = 1, mo_num
    ! innermost loop runs over the first (contiguous) index
    do i_chol = 1, cholesky_mo_num
     cholesky_semi_mo_transp_simple(i_chol, i_ao,i_mo) += cholesky_mo_transp(i_chol,j_mo,i_mo) * mo_coef_transp(j_mo,i_ao)
    enddo
   enddo
  enddo
 enddo
 END_PROVIDER
--- a/src/mo_two_e_ints/map_integrals.irp.f
+++ b/src/mo_two_e_ints/map_integrals.irp.f
@ -40,7 +40,7 @@ end
 ! Min and max values of the MOs for which the integrals are in the cache
 END_DOC
- mo_integrals_cache_size  = 2_8**mo_integrals_cache_shift
+ mo_integrals_cache_size  = 2**mo_integrals_cache_shift
 mo_integrals_cache_min = max(1,elec_alpha_num - (mo_integrals_cache_size/2 - 1) )
 mo_integrals_cache_max = min(mo_num, mo_integrals_cache_min + mo_integrals_cache_size - 1)
--- a/src/mu_of_r/f_hf_cholesky.irp.f
+++ b/src/mu_of_r/f_hf_cholesky.irp.f
@ -289,6 +289,106 @@ BEGIN_PROVIDER [ double precision, f_hf_cholesky_sparse, (n_points_final_grid)]
 endif
 END_PROVIDER 
 BEGIN_PROVIDER [ double precision, f_hf_cholesky_sparse_bis, (n_points_final_grid)]
 implicit none
 BEGIN_DOC
 ! Function f evaluated at each grid point from Cholesky vectors, skipping
 ! small contributions.
 !
 ! f(R) =  \sum_{I} \sum_{J} Phi_I(R) Phi_J(R) V_IJ
 !      =  \sum_{I}\sum_{J}\sum_A Phi_I(R) Phi_J(R) V_AI V_AJ
 !      =  \sum_A \sum_{I}Phi_I(R)V_AI \sum_{J}V_AJ Phi_J(R)
 !      =  \sum_A V_AR G_AR
 ! V_AR = \sum_{I}Phi_IR V_AI = \sum_{I}Phi^t_RI V_AI
 !
 ! When elec_alpha_num == elec_beta_num the half-transformed vectors
 ! cholesky_semi_mo_transp_simple are contracted with the AO values;
 ! otherwise cholesky_mo_transp is contracted with the MO values and an
 ! extra vector (delta_vec) collects the orbitals listed between
 ! n_occ_val_orb_for_hf(2)+1 and n_occ_val_orb_for_hf(1).
 END_DOC
 integer :: ipoint,m,mm,i,ii,p
 double precision :: wall0,wall1,mo_i_r1,mo_b_r1
 double precision :: thresh_1,thresh_2
 double precision, allocatable :: accu_vec(:),delta_vec(:)
 ! thresh_2 screens products of two orbital values, thresh_1 (its square
 ! root) screens a single orbital value
 thresh_2 = ao_cholesky_threshold * 100.d0
 thresh_1 = dsqrt(thresh_2)
 provide cholesky_mo_transp
 if(elec_alpha_num == elec_beta_num)then
  call wall_time(wall0)
  !$OMP PARALLEL DEFAULT(NONE)                                      &
  !$OMP PRIVATE (accu_vec,ipoint,p,ii,i,mm,m,mo_i_r1,mo_b_r1) &
  !$OMP SHARED (n_occ_val_orb_for_hf,list_valence_orb_for_hf,mos_in_r_array_omp,aos_in_r_array,thresh_1,thresh_2) &
  !$OMP SHARED (cholesky_mo_num,f_hf_cholesky_sparse_bis,n_points_final_grid,cholesky_semi_mo_transp_simple,ao_num)
  ! one accumulation vector per thread
  allocate(accu_vec(cholesky_mo_num))
  !$OMP DO
   do ipoint = 1, n_points_final_grid
    f_hf_cholesky_sparse_bis(ipoint) = 0.d0
     accu_vec = 0.d0
     do ii = 1, n_occ_val_orb_for_hf(1)
      i = list_valence_orb_for_hf(ii,1)
      mo_i_r1 = mos_in_r_array_omp(i,ipoint)
      if(dabs(mo_i_r1).lt.thresh_1)cycle
      do mm = 1, ao_num ! electron 1
       mo_b_r1 = aos_in_r_array(mm,ipoint)*mo_i_r1
       if(dabs(mo_b_r1).lt.thresh_2)cycle
       do p = 1, cholesky_mo_num
        accu_vec(p) = accu_vec(p) + mo_b_r1 * cholesky_semi_mo_transp_simple(p,mm,i)
       enddo
      enddo
     enddo
     do p = 1, cholesky_mo_num
      f_hf_cholesky_sparse_bis(ipoint) = f_hf_cholesky_sparse_bis(ipoint) + accu_vec(p) * accu_vec(p)
     enddo
    f_hf_cholesky_sparse_bis(ipoint) *= 2.D0
   enddo
  !$OMP END DO
  deallocate(accu_vec)
  !$OMP END PARALLEL
  call wall_time(wall1)
  print*,'Time to provide f_hf_cholesky_sparse_bis = ',wall1-wall0
 else
  call wall_time(wall0)
  !$OMP PARALLEL DEFAULT(NONE)                                      &
  !$OMP PRIVATE (accu_vec,delta_vec,ipoint,p,ii,i,mm,m,mo_i_r1,mo_b_r1) &
  !$OMP SHARED (n_occ_val_orb_for_hf,list_valence_orb_for_hf,list_basis,mos_in_r_array_omp,thresh_1,thresh_2) &
  !$OMP SHARED (cholesky_mo_num,f_hf_cholesky_sparse_bis,n_points_final_grid,cholesky_mo_transp,n_basis_orb)
  ! two accumulation vectors per thread
  allocate(accu_vec(cholesky_mo_num),delta_vec(cholesky_mo_num))
  !$OMP DO
   do ipoint = 1, n_points_final_grid
    f_hf_cholesky_sparse_bis(ipoint) = 0.d0
     ! orbitals of the second list (index 2)
     accu_vec = 0.d0
     do ii = 1, n_occ_val_orb_for_hf(2)
      i = list_valence_orb_for_hf(ii,2)
      mo_i_r1 = mos_in_r_array_omp(i,ipoint)
      if(dabs(mo_i_r1).lt.thresh_1)cycle
      do mm = 1, n_basis_orb ! electron 1
       m = list_basis(mm)
       mo_b_r1 = mos_in_r_array_omp(m,ipoint)
       if(dabs(mo_i_r1*mo_b_r1).lt.thresh_2)cycle
       do p = 1, cholesky_mo_num
        accu_vec(p) = accu_vec(p) + mo_i_r1 * mo_b_r1 * cholesky_mo_transp(p,m,i)
       enddo
      enddo
     enddo
     ! remaining orbitals of the first list (index 1)
     delta_vec = 0.d0
     do ii =  n_occ_val_orb_for_hf(2)+1,n_occ_val_orb_for_hf(1)
      i = list_valence_orb_for_hf(ii,1)
      mo_i_r1 = mos_in_r_array_omp(i,ipoint)
      if(dabs(mo_i_r1).lt.thresh_1)cycle
      do mm = 1, n_basis_orb ! electron 1
       m = list_basis(mm)
       mo_b_r1 = mos_in_r_array_omp(m,ipoint)
       if(dabs(mo_i_r1*mo_b_r1).lt.thresh_2)cycle
       do p = 1, cholesky_mo_num
        delta_vec(p) = delta_vec(p) + mo_i_r1 * mo_b_r1 * cholesky_mo_transp(p,m,i)
       enddo
      enddo
     enddo
     do p = 1, cholesky_mo_num
      f_hf_cholesky_sparse_bis(ipoint) = f_hf_cholesky_sparse_bis(ipoint) + accu_vec(p) * accu_vec(p) + accu_vec(p) * delta_vec(p)
     enddo
    f_hf_cholesky_sparse_bis(ipoint) *= 2.D0
   enddo
  !$OMP END DO
  ! release both thread-private work vectors
  deallocate(accu_vec,delta_vec)
  !$OMP END PARALLEL
  call wall_time(wall1)
  print*,'Time to provide f_hf_cholesky_sparse_bis = ',wall1-wall0
 endif
 END_PROVIDER 
 BEGIN_PROVIDER [ double precision, on_top_hf_grid, (n_points_final_grid)]
 implicit none
 integer :: ipoint,i,ii
--- a/src/mu_of_r/mu_of_r_mean_field.irp.f
+++ b/src/mu_of_r/mu_of_r_mean_field.irp.f
@ -0,0 +1,171 @@
 BEGIN_PROVIDER [ double precision, two_e_int_mf, (elec_beta_num,elec_alpha_num,elec_beta_num,elec_alpha_num)]
 implicit none
 BEGIN_DOC
 ! Two-electron integrals get_two_e_integral(l,k,j,i,mo_integrals_map) stored
 ! in a dense array, with l and j running over 1..elec_beta_num and
 ! k and i running over 1..elec_alpha_num.
 END_DOC
 double precision, external :: get_two_e_integral
 integer :: i_a, j_b, k_a, l_b

 ! Loop nest ordered so that the first array index varies fastest
 do i_a = 1, elec_alpha_num
   do j_b = 1, elec_beta_num
     do k_a = 1, elec_alpha_num
       do l_b = 1, elec_beta_num
         two_e_int_mf(l_b,k_a,j_b,i_a) = get_two_e_integral(l_b,k_a,j_b,i_a,mo_integrals_map)
       end do
     end do
   end do
 end do
 END_PROVIDER 
 subroutine get_f_mf_ab(r,f_mf_ab,two_bod_dens, dm_a, dm_b)
 implicit none
 ! Evaluates at point r, from the first elec_alpha_num / elec_beta_num MOs:
 !
 ! r            : point in space (input)
 ! dm_a, dm_b   : sums of squared MO values over the first elec_alpha_num
 !                and elec_beta_num MOs, respectively
 ! two_bod_dens : 2 * dm_a * dm_b
 ! f_mf_ab      : 2 * \sum_{i,k <= elec_alpha_num} \sum_{j,l <= elec_beta_num}
 !                    two_e_int_mf(l,k,j,i) * phi_i phi_k phi_j phi_l
 double precision, intent(in) :: r(3)
 double precision, intent(out):: f_mf_ab,two_bod_dens, dm_a, dm_b
 double precision, allocatable :: mos_array_r(:),mos_array_a(:), mos_array_b(:)
 integer :: i,j,k,l
 ! mos_array_b holds exactly the elec_beta_num values that are filled and
 ! read below (same sizing as in get_grad_f_mf_ab)
 allocate(mos_array_r(mo_num), mos_array_a(elec_alpha_num), mos_array_b(elec_beta_num))
 call give_all_mos_at_r(r,mos_array_r) 
 do i = 1, elec_alpha_num
  mos_array_a(i) = mos_array_r(i)
 enddo
 do i = 1, elec_beta_num
  mos_array_b(i) = mos_array_r(i)
 enddo
 dm_a = 0.d0
 do i = 1, elec_alpha_num
  dm_a += mos_array_a(i) * mos_array_a(i) 
 enddo
 dm_b = 0.d0
 do i = 1, elec_beta_num
  dm_b += mos_array_b(i) * mos_array_b(i) 
 enddo
 two_bod_dens = dm_a * dm_b
 f_mf_ab = 0.d0
 do i = 1, elec_alpha_num
  do j = 1, elec_beta_num
   do k = 1, elec_alpha_num
    do l = 1, elec_beta_num
     f_mf_ab += two_e_int_mf(l,k,j,i) * mos_array_a(i) * mos_array_a(k) * mos_array_b(j) * mos_array_b(l)
    enddo
   enddo
  enddo
 enddo
 ! multiply by two to adapt to the N(N-1) normalization condition of the active two-rdm
 f_mf_ab *= 2.d0 
 two_bod_dens *= 2.d0
 deallocate(mos_array_r, mos_array_a, mos_array_b)
 end
 subroutine get_grad_f_mf_ab(r,grad_f_mf_ab, grad_two_bod_dens,f_mf_ab,two_bod_dens, dm_a, dm_b,grad_dm_a, grad_dm_b)
 implicit none
 ! Same quantities as get_f_mf_ab at point r, together with their gradients.
 !
 ! r                      : point in space (input)
 ! dm_a, dm_b             : sums of squared MO values over the first
 !                          elec_alpha_num / elec_beta_num MOs
 ! grad_dm_a, grad_dm_b   : gradients of dm_a and dm_b
 ! two_bod_dens           : 2 * dm_a * dm_b
 ! grad_two_bod_dens      : gradient of two_bod_dens
 ! f_mf_ab                : 2 * \sum two_e_int_mf(l,k,j,i) phi_i phi_j phi_k phi_l
 ! grad_f_mf_ab           : gradient of f_mf_ab (product rule on the four MOs)
 double precision, intent(in)  :: r(3)
 double precision, intent(out) :: f_mf_ab, two_bod_dens
 double precision, intent(out) :: grad_two_bod_dens(3), grad_f_mf_ab(3)
 double precision, intent(out) :: dm_a, dm_b, grad_dm_a(3), grad_dm_b(3)

 double precision, allocatable :: mo_all_loc(:), dmo_all_loc(:,:)
 double precision, allocatable :: mo_a_loc(:), mo_b_loc(:)
 double precision, allocatable :: dmo_a_loc(:,:), dmo_b_loc(:,:)
 double precision :: phi_i, phi_j, phi_k, phi_l, v_lkji
 double precision :: dphi_i(3), dphi_j(3), dphi_k(3), dphi_l(3)
 integer :: i,j,k,l

 allocate(mo_all_loc(mo_num), dmo_all_loc(3,mo_num))
 allocate(mo_a_loc(elec_alpha_num), dmo_a_loc(3,elec_alpha_num))
 allocate(mo_b_loc(elec_beta_num), dmo_b_loc(3,elec_beta_num))

 call give_all_mos_and_grad_at_r(r,mo_all_loc,dmo_all_loc)

 ! Keep the first elec_alpha_num / elec_beta_num orbitals and their gradients
 mo_a_loc(1:elec_alpha_num)      = mo_all_loc(1:elec_alpha_num)
 dmo_a_loc(1:3,1:elec_alpha_num) = dmo_all_loc(1:3,1:elec_alpha_num)
 mo_b_loc(1:elec_beta_num)       = mo_all_loc(1:elec_beta_num)
 dmo_b_loc(1:3,1:elec_beta_num)  = dmo_all_loc(1:3,1:elec_beta_num)

 ! Alpha density and gradient
 dm_a = 0.d0
 grad_dm_a = 0.d0
 do i = 1, elec_alpha_num
   dm_a = dm_a + (mo_a_loc(i) * mo_a_loc(i))
   grad_dm_a(1:3) = grad_dm_a(1:3) + (2.d0 * mo_a_loc(i) * dmo_a_loc(1:3,i))
 end do

 ! Beta density and gradient
 dm_b = 0.d0
 grad_dm_b = 0.d0
 do i = 1, elec_beta_num
   dm_b = dm_b + (mo_b_loc(i) * mo_b_loc(i))
   grad_dm_b(1:3) = grad_dm_b(1:3) + (2.d0 * mo_b_loc(i) * dmo_b_loc(1:3,i))
 end do

 ! Two-body density and gradient
 two_bod_dens = dm_a * dm_b
 grad_two_bod_dens(1:3) = dm_a * grad_dm_b(1:3) + dm_b * grad_dm_a(1:3)

 ! f and its gradient
 f_mf_ab = 0.d0
 grad_f_mf_ab = 0.d0
 do i = 1, elec_alpha_num
   phi_i = mo_a_loc(i)
   dphi_i(1:3) = dmo_a_loc(1:3,i)
   do j = 1, elec_beta_num
     phi_j = mo_b_loc(j)
     dphi_j(1:3) = dmo_b_loc(1:3,j)
     do k = 1, elec_alpha_num
       phi_k = mo_a_loc(k)
       dphi_k(1:3) = dmo_a_loc(1:3,k)
       do l = 1, elec_beta_num
         phi_l = mo_b_loc(l)
         dphi_l(1:3) = dmo_b_loc(1:3,l)
         v_lkji = two_e_int_mf(l,k,j,i)
         f_mf_ab = f_mf_ab + (v_lkji * phi_i * phi_j * phi_k * phi_l)
         ! one term per differentiated orbital
         grad_f_mf_ab(1:3) = grad_f_mf_ab(1:3) + (v_lkji *                          &
             (phi_i * phi_j * phi_k * dphi_l(1:3) + phi_i * phi_j * dphi_k(1:3) * phi_l &
             +phi_i * dphi_j(1:3) * phi_k * phi_l + dphi_i(1:3) * phi_j * phi_k * phi_l))
       end do
     end do
   end do
 end do

 ! Common factor of two applied to the values and to the gradients
 f_mf_ab = f_mf_ab * 2.d0
 two_bod_dens = two_bod_dens * 2.d0
 grad_f_mf_ab = grad_f_mf_ab * 2.D0
 grad_two_bod_dens = grad_two_bod_dens * 2.d0
 end
 subroutine mu_of_r_mean_field(r,mu_mf, dm)
 implicit none
 ! Computes at point r:
 !   dm    = dm_a + dm_b as returned by get_f_mf_ab
 !   mu_mf = 0.5 * sqpi * f_mf_ab / two_bod_dens,
 !           or 1.d+10 when |two_bod_dens| is below 1.d-10
 ! sqpi is not declared here (presumably from constants.include.F -- TODO confirm)
 include 'constants.include.F'
 double precision, intent(in)  :: r(3)
 double precision, intent(out) :: mu_mf, dm
 double precision, parameter   :: n2_tiny_loc = 1.d-10
 double precision, parameter   :: mu_huge_loc = 1.d+10
 double precision :: f_loc, n2_loc, rho_a_loc, rho_b_loc

 call get_f_mf_ab(r,f_loc,n2_loc, rho_a_loc, rho_b_loc)

 dm = rho_a_loc + rho_b_loc

 if (dabs(n2_loc) .lt. n2_tiny_loc) then
   ! vanishing two-body density: return a large constant instead of dividing
   mu_mf = mu_huge_loc
 else
   mu_mf = 0.5d0 * sqpi * f_loc/n2_loc
 end if
 end
 subroutine grad_mu_of_r_mean_field(r,mu_mf, dm, grad_mu_mf, grad_dm)
 implicit none
 ! Computes at point r, from the output of get_grad_f_mf_ab:
 !   dm, grad_dm       : dm_a + dm_b and its gradient
 !   mu_mf             : 0.5 * sqpi * f_mf_ab / two_bod_dens
 !   grad_mu_mf        : gradient of mu_mf (quotient rule)
 ! When |two_bod_dens| is below 1.d-10, mu_mf is set to 1.d+10 and
 ! grad_mu_mf to zero.
 ! sqpi is not declared here (presumably from constants.include.F -- TODO confirm)
 include 'constants.include.F'
 double precision, intent(in)  :: r(3)
 double precision, intent(out) :: grad_mu_mf(3), grad_dm(3)
 double precision, intent(out) :: mu_mf, dm
 double precision, parameter   :: n2_tiny_loc = 1.d-10
 double precision, parameter   :: mu_huge_loc = 1.d+10
 double precision :: f_loc, n2_loc, rho_a_loc, rho_b_loc
 double precision :: df_loc(3), dn2_loc(3), drho_a_loc(3), drho_b_loc(3)

 call get_grad_f_mf_ab(r,df_loc, dn2_loc,f_loc,n2_loc, rho_a_loc, rho_b_loc,drho_a_loc, drho_b_loc)

 dm = rho_a_loc + rho_b_loc
 grad_dm(1:3) = drho_a_loc(1:3) + drho_b_loc(1:3)

 if (dabs(n2_loc) .lt. n2_tiny_loc) then
   mu_mf = mu_huge_loc
   grad_mu_mf = 0.d0
 else
   mu_mf = 0.5d0 * sqpi * f_loc/n2_loc
   ! d(f/n2) = (df * n2 - f * dn2) / n2**2
   grad_mu_mf(1:3) = 0.5d0 * sqpi * (df_loc(1:3) * n2_loc - f_loc * dn2_loc(1:3)) &
                                    /(n2_loc*n2_loc)
 end if
 end
--- a/src/mu_of_r/test_proj_op.irp.f
+++ b/src/mu_of_r/test_proj_op.irp.f
@ -15,7 +15,162 @@ program projected_operators
 !  call test_f_HF_valence_ab
 !  call routine_full_mos
 !   call test_f_ii_valence_ab
-   call test_f_ia_valence_ab
+!   call test_f_ia_valence_ab
-  call test_f_ii_ia_aa_valence_ab
+!  call test_f_ii_ia_aa_valence_ab
 ! call test
 !  call test_f_mean_field
 ! call test_grad_f_mean_field
 call test_grad_mu_mf
 end
 subroutine test
 implicit none
 ! Prints the grid-weighted sum of |f_hf_cholesky_sparse - f_hf_cholesky_sparse_bis|
 ! over all points of the final grid.
 integer :: ipoint
 double precision :: abs_err_loc

 abs_err_loc = 0.d0
 do ipoint = 1, n_points_final_grid
   abs_err_loc = abs_err_loc + (dabs(f_hf_cholesky_sparse(ipoint) - f_hf_cholesky_sparse_bis(ipoint)) &
                                * final_weight_at_r_vector(ipoint))
 end do

 print*,'accu = ',abs_err_loc
 end
 subroutine test_f_mean_field
 implicit none
 ! Compares get_f_mf_ab with f_HF_valence_ab (called with both positions
 ! equal to r) on the final grid and prints the grid-weighted absolute
 ! differences of the two returned quantities.
 integer :: ipoint
 double precision :: w_loc, r(3)
 double precision :: f_new_loc, f_ref_loc, err_f_loc
 double precision :: n2_new_loc, n2_ref_loc, err_n2_loc
 double precision :: rho_a_loc, rho_b_loc

 err_f_loc  = 0.d0
 err_n2_loc = 0.d0

 do ipoint = 1, n_points_final_grid
   r(1:3) = final_grid_points(1:3,ipoint)
   w_loc  = final_weight_at_r_vector(ipoint)

   call get_f_mf_ab(r,f_new_loc,n2_new_loc, rho_a_loc, rho_b_loc)
   call f_HF_valence_ab(r,r,f_ref_loc,n2_ref_loc)

   err_f_loc  = err_f_loc  + (w_loc * dabs(f_new_loc- f_ref_loc))
   err_n2_loc = err_n2_loc + (w_loc * dabs(n2_new_loc - n2_ref_loc))
 end do

 print*,'accu_f        = ',err_f_loc
 print*,'accu_two_dens = ',err_n2_loc
 end
 subroutine test_grad_f_mean_field
 implicit none
 ! Checks get_grad_f_mf_ab on the final grid:
 !  - its values are compared with those of get_f_mf_ab,
 !  - its gradients are compared with centered finite differences of
 !    get_f_mf_ab (step dr along each Cartesian direction).
 ! Grid-weighted absolute differences are accumulated and printed.
 integer :: i_point,k
 double precision :: weight,r(3)
 double precision :: grad_f_mf_ab(3), grad_two_bod_dens(3)
 double precision :: grad_dm_a(3), grad_dm_b(3)
 double precision :: f_mf_ab,two_bod_dens, dm_a, dm_b
 double precision :: num_grad_f_mf_ab(3), num_grad_two_bod_dens(3)
 double precision :: num_grad_dm_a(3), num_grad_dm_b(3)
 double precision :: f_mf_ab_p,f_mf_ab_m
 double precision :: two_bod_dens_p, two_bod_dens_m
 double precision :: dm_a_p, dm_a_m
 double precision :: dm_b_p, dm_b_m
 double precision :: rbis(3), dr
 double precision :: accu_grad_f_mf_ab(3),accu_grad_two_bod_dens(3)
 double precision :: accu_grad_dm_a(3),accu_grad_dm_b(3)
 double precision :: accu_f_mf_ab, accu_two_bod_dens, accu_dm_a, accu_dm_b
 ! finite-difference step
 dr = 0.00001d0
 accu_f_mf_ab = 0.d0 
 accu_two_bod_dens = 0.d0 
 accu_dm_a = 0.d0 
 accu_dm_b = 0.d0
 accu_grad_f_mf_ab = 0.d0
 accu_grad_two_bod_dens = 0.d0
 accu_grad_dm_a = 0.d0
 accu_grad_dm_b = 0.d0
 do i_point = 1, n_points_final_grid
  r(1:3)   = final_grid_points(1:3,i_point)
  weight = final_weight_at_r_vector(i_point)
  ! values: gradient routine versus value-only routine at the same point
  call get_grad_f_mf_ab(r,grad_f_mf_ab, grad_two_bod_dens,f_mf_ab,two_bod_dens, dm_a, dm_b,grad_dm_a, grad_dm_b)
  call get_f_mf_ab(r,f_mf_ab_p,two_bod_dens_p, dm_a_p, dm_b_p)
  accu_f_mf_ab += weight * dabs(f_mf_ab - f_mf_ab_p)
  accu_two_bod_dens += weight * dabs(two_bod_dens - two_bod_dens_p)
  accu_dm_a += weight*dabs(dm_a - dm_a_p)
  accu_dm_b += weight*dabs(dm_b - dm_b_p)
  ! centered finite differences along x, y, z
  do k = 1, 3
   rbis = r
   rbis(k) += dr
   call get_f_mf_ab(rbis,f_mf_ab_p,two_bod_dens_p, dm_a_p, dm_b_p)
   rbis = r
   rbis(k) -= dr
   call get_f_mf_ab(rbis,f_mf_ab_m,two_bod_dens_m, dm_a_m, dm_b_m)
   num_grad_f_mf_ab(k) = (f_mf_ab_p - f_mf_ab_m)/(2.d0*dr)
   num_grad_two_bod_dens(k) = (two_bod_dens_p - two_bod_dens_m)/(2.d0*dr)
   num_grad_dm_a(k) = (dm_a_p - dm_a_m)/(2.d0*dr)
   num_grad_dm_b(k) = (dm_b_p - dm_b_m)/(2.d0*dr)
  enddo
  do k = 1, 3
   accu_grad_f_mf_ab(k) += weight * dabs(grad_f_mf_ab(k) - num_grad_f_mf_ab(k))
   accu_grad_two_bod_dens(k) += weight * dabs(grad_two_bod_dens(k) - num_grad_two_bod_dens(k))
   accu_grad_dm_a(k) += weight * dabs(grad_dm_a(k) - num_grad_dm_a(k))
   accu_grad_dm_b(k) += weight * dabs(grad_dm_b(k) - num_grad_dm_b(k))
  enddo
 enddo
 print*,'accu_f_mf_ab = ',accu_f_mf_ab
 print*,'accu_two_bod_dens = ',accu_two_bod_dens
 print*,'accu_dm_a = ',accu_dm_a
 print*,'accu_dm_b = ',accu_dm_b
 print*,'accu_grad_f_mf_ab = '
 print*,accu_grad_f_mf_ab
 print*,'accu_grad_two_bod_dens = '
 print*,accu_grad_two_bod_dens
 ! labels name the gradient accumulators that are actually printed
 print*,'accu_grad_dm_a = '
 print*,accu_grad_dm_a
 print*,'accu_grad_dm_b = '
 print*,accu_grad_dm_b
 end
 subroutine test_grad_mu_mf
 implicit none
 ! Checks grad_mu_of_r_mean_field on the final grid:
 !  - mu and dm are compared with the output of mu_of_r_mean_field,
 !  - both gradients are compared with centered finite differences of
 !    mu_of_r_mean_field (step dr along each Cartesian direction).
 ! Grid-weighted absolute differences are accumulated and printed.
 integer :: ipoint, idir
 double precision :: w_loc, r(3), r_shift(3), dr
 double precision :: mu_mf, dm, grad_mu_mf(3), grad_dm(3)
 double precision :: mu_plus_loc, mu_minus_loc, dm_plus_loc, dm_minus_loc
 double precision :: fd_grad_mu_loc(3), fd_grad_dm_loc(3)
 double precision :: err_mu_loc, err_dm_loc, err_grad_dm_loc(3), err_grad_mu_loc(3)

 ! finite-difference step
 dr = 0.00001d0

 err_mu_loc      = 0.d0
 err_dm_loc      = 0.d0
 err_grad_mu_loc = 0.d0
 err_grad_dm_loc = 0.d0

 do ipoint = 1, n_points_final_grid
   r(1:3) = final_grid_points(1:3,ipoint)
   w_loc  = final_weight_at_r_vector(ipoint)

   ! values: gradient routine versus value-only routine at the same point
   call grad_mu_of_r_mean_field(r,mu_mf, dm, grad_mu_mf, grad_dm)
   call mu_of_r_mean_field(r,mu_plus_loc, dm_plus_loc)
   err_mu_loc = err_mu_loc + (w_loc*dabs(mu_plus_loc - mu_mf))
   err_dm_loc = err_dm_loc + (w_loc*dabs(dm_plus_loc - dm))

   ! centered finite differences along x, y, z
   do idir = 1, 3
     r_shift = r
     r_shift(idir) = r_shift(idir) + (dr)
     call mu_of_r_mean_field(r_shift,mu_plus_loc, dm_plus_loc)
     r_shift = r
     r_shift(idir) = r_shift(idir) - (dr)
     call mu_of_r_mean_field(r_shift,mu_minus_loc, dm_minus_loc)
     fd_grad_mu_loc(idir) = (mu_plus_loc - mu_minus_loc)/(2.d0*dr)
     fd_grad_dm_loc(idir) = (dm_plus_loc - dm_minus_loc)/(2.d0*dr)
   end do

   do idir = 1, 3
     err_grad_dm_loc(idir) = err_grad_dm_loc(idir) + (w_loc *dabs(fd_grad_dm_loc(idir) - grad_dm(idir)))
     err_grad_mu_loc(idir) = err_grad_mu_loc(idir) + (w_loc *dabs(fd_grad_mu_loc(idir) - grad_mu_mf(idir)))
   end do
 end do

 print*,'accu_mu = ',err_mu_loc
 print*,'accu_dm = ',err_dm_loc
 print*,'accu_grad_dm = '
 print*, err_grad_dm_loc
 print*,'accu_grad_mu_mf = '
 print*, err_grad_mu_loc
 end
--- a/src/tools/four_idx_transform.irp.f
+++ b/src/tools/four_idx_transform.irp.f
@ -12,6 +12,9 @@ program four_idx_transform
 !
  END_DOC
  if (do_mo_cholesky) then
    stop 'Not implemented with Cholesky integrals'
  endif
  io_mo_two_e_integrals = 'Write'
  SOFT_TOUCH io_mo_two_e_integrals
  if (.true.) then
--- a/Show More
+++ b/Show More
		`@ -0,0 +1,2 @@`
							`-ltbb -lsycl -lmkl_sycl -lgpu -limf -lintlc -lstdc++`