Merge branch 'dev-stable' of https://github.com/AbdAmmar/qp2 into dev-stable

2024-11-19 12:32:30 +01:00 · 2024-07-29 17:22:53 +02:00 · 2024-07-29 17:22:53 +02:00 · eb236e0112
commit eb236e0112
parent 73066b4ac5 4d79bd135f
103 changed files with 4182 additions and 1978 deletions
--- a/18
+++ b/18
@ -40,7 +40,7 @@ Usage:
  $(basename $0) -c <file>    
  $(basename $0) -h           
  $(basename $0) -i <package> 
-  $(basename $0) -g [nvidia|none]
+  $(basename $0) -g [nvidia|intel|none]

 Options:
  -c  <file>        Define a COMPILATION configuration file,
@ -49,7 +49,7 @@ Options:
  -i <package>      INSTALL <package>. Use at your OWN RISK:
                    no support will be provided for the installation of
                    dependencies.
-  -g [nvidia|none]  Choose GPU acceleration (experimental)
+  -g [nvidia|intel|none]  Choose GPU acceleration

 Example:
  ./$(basename $0) -c config/gfortran.cfg
@ -115,19 +115,23 @@ while getopts "d:c:i:g:h" c ; do
 done

 # Handle GPU acceleration
-rm -f ${QP_ROOT}/src/gpu
+rm -f ${QP_ROOT}/src/gpu_arch
 case "$GPU" in
-  amd) # Nvidia
+  amd) # AMD
     echo "Activating AMD GPU acceleration"
-     ln -s ${QP_ROOT}/src/gpu_amd ${QP_ROOT}/src/gpu
+     ln -s ${QP_ROOT}/plugins/local/gpu_amd ${QP_ROOT}/src/gpu_arch
+     ;;
+  intel) # Intel
+     echo "Activating Intel GPU acceleration (EXPERIMENTAL)"
+     ln -s ${QP_ROOT}/plugins/local/gpu_intel ${QP_ROOT}/src/gpu_arch
     ;;
  nvidia) # Nvidia
     echo "Activating Nvidia GPU acceleration"
-     ln -s ${QP_ROOT}/src/gpu_nvidia ${QP_ROOT}/src/gpu
+     ln -s ${QP_ROOT}/plugins/local/gpu_nvidia ${QP_ROOT}/src/gpu_arch
     ;;
  *) # No Acceleration
     echo "Disabling GPU acceleration"
-     ln -s ${QP_ROOT}/src/gpu_x86 ${QP_ROOT}/src/gpu
+     ln -s ${QP_ROOT}/plugins/local/gpu_x86 ${QP_ROOT}/src/gpu_arch
     ;;
 esac

--- a/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f
+++ b/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f
@ -1,4 +1,36 @@

+
+! ---
+
+subroutine run_pouet
+
+  BEGIN_DOC
+  ! Selected Full Configuration Interaction with Stochastic selection and PT2.
+  END_DOC
+
+  use selection_types
+  implicit none
+  integer                       :: i, j, k, ndet
+  integer                       :: to_select
+  logical                       :: has
+  type(pt2_type)                :: pt2_data, pt2_data_err
+  double precision              :: rss
+  double precision              :: correlation_energy_ratio
+  double precision              :: hf_energy_ref
+  double precision              :: relative_error
+  double precision, allocatable :: zeros(:),E_tc(:), norm(:)
+
+  logical,          external    :: qp_stop
+  double precision, external    :: memory_of_double
+
+  PROVIDE mo_l_coef mo_r_coef
+  PROVIDE H_apply_buffer_allocated distributed_davidson 
+
+  print*, ' Diagonal elements of the Fock matrix '
+  do i = 1, mo_num
+    write(*,*) i, Fock_matrix_tc_mo_tot(i,i)
+  enddo
+end
 ! ---

 subroutine run_stochastic_cipsi
--- a/plugins/local/fci_tc_bi/fci_tc_bi_ortho.irp.f
+++ b/plugins/local/fci_tc_bi/fci_tc_bi_ortho.irp.f
@ -65,7 +65,15 @@ subroutine run_cipsi_tc()

  if (.not. is_zmq_slave) then

+    if(.True.)then! DO NOT REMOVE THE IF(.TRUE.) !! 
+     ! this has to be provided before mo_bi_ortho_tc_two_e to avoid twice the computation of ao_two_e_tc_tot
+     PROVIDE Fock_matrix_tc_mo_tot 
+     ! because Fock_matrix_tc_mo_tot depends on ao_two_e_tc_tot 
+     ! and that mo_bi_ortho_tc_two_e erase ao_two_e_tc_tot after being provided 
+    endif
+    if(.True.)then ! DO NOT REMOVE THE IF(.TRUE.) !! 
     PROVIDE psi_det psi_coef mo_bi_ortho_tc_two_e mo_bi_ortho_tc_one_e 
+    endif

    if((elec_alpha_num+elec_beta_num) .ge. 3) then
      if(three_body_h_tc) then
@ -90,8 +98,16 @@ subroutine run_cipsi_tc()
    call json_close

  else
+    if(.True.)then! DO NOT REMOVE THE IF(.TRUE.) !! 
+     ! this has to be provided before mo_bi_ortho_tc_two_e to avoid twice the computation of ao_two_e_tc_tot
+     PROVIDE Fock_matrix_tc_mo_tot 
+     ! because Fock_matrix_tc_mo_tot depends on ao_two_e_tc_tot 
+     ! and that mo_bi_ortho_tc_two_e erase ao_two_e_tc_tot after being provided 
+    endif

+    if(.True.)then! DO NOT REMOVE THE IF(.TRUE.) !! 
     PROVIDE mo_bi_ortho_tc_one_e mo_bi_ortho_tc_two_e pt2_min_parallel_tasks
+    endif

    if((elec_alpha_num+elec_beta_num) .ge. 3) then
      if(three_body_h_tc) then
--- a/plugins/local/gpu_intel/LIB
+++ b/plugins/local/gpu_intel/LIB
@ -0,0 +1,2 @@
+-ltbb -lsycl -lmkl_sycl -lgpu -limf -lintlc -lstdc++ 
+
--- a/plugins/local/gpu_intel/NEED
+++ b/plugins/local/gpu_intel/NEED
--- a/plugins/local/gpu_intel/README.rst
+++ b/plugins/local/gpu_intel/README.rst
@ -0,0 +1,8 @@
+=========
+gpu_intel
+=========
+
+Intel implementation of GPU routines. Uses MKL and SYCL.
+```bash
+icpx -fsycl gpu.cxx -c -qmkl=sequential
+```
--- a/plugins/local/gpu_intel/gpu.sycl
+++ b/plugins/local/gpu_intel/gpu.sycl
@ -0,0 +1,177 @@
+#include <CL/sycl.hpp>
+#include <cassert>
+#include <limits>
+#include <oneapi/mkl/blas.hpp>
+
+extern "C" {
+
+/* Generic functions */
+
+int gpu_ndevices() {
+  return 1;
+}
+
+void gpu_set_device(int32_t igpu) {
+}
+
+
+/* Allocation functions */
+
+void gpu_allocate(void** ptr, int64_t size) {
+    auto queue = sycl::queue(sycl::default_selector_v);
+
+    try {
+        *ptr = sycl::malloc_shared(size, queue);
+        assert(*ptr != nullptr);
+    } catch (const sycl::exception& e) {
+        std::cerr << "SYCL exception caught: " << e.what() << std::endl;
+        *ptr = nullptr; // If allocation fails, set pointer to nullptr
+    }
+}
+
+void gpu_deallocate(void** ptr) {
+    assert(*ptr != nullptr);
+    sycl::free(*ptr, sycl::queue(sycl::default_selector_v));
+    *ptr = nullptr;
+}
+
+/* Upload data from host to device */
+void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
+    sycl::queue queue(sycl::default_selector_v);
+    queue.memcpy(gpu_ptr, cpu_ptr, n).wait();
+}
+
+/* Download data from device to host */
+void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
+    sycl::queue queue(sycl::default_selector_v);
+    queue.memcpy(cpu_ptr, gpu_ptr, n).wait();
+}
+
+/* Copy data from one GPU memory location to another */
+void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
+    sycl::queue queue(sycl::default_selector_v);
+    queue.memcpy(gpu_ptr_dest, gpu_ptr_src, n).wait();
+}
+
+/* Queues */
+
+/* SYCL queue as a replacement for CUDA stream */
+void gpu_stream_create(sycl::queue** ptr) {
+    *ptr = new sycl::queue(sycl::default_selector_v);
+}
+
+void gpu_stream_destroy(sycl::queue** ptr) {
+    assert(*ptr != nullptr);
+    delete *ptr;
+    *ptr = nullptr;
+}
+
+void gpu_synchronize() {
+    sycl::queue queue(sycl::default_selector_v);
+    queue.wait_and_throw();
+}
+
+/* BLAS functions */
+
+typedef struct {
+  sycl::queue* queue;
+} blasHandle_t;
+
+void gpu_set_stream(blasHandle_t* handle, sycl::queue* ptr) {
+  handle->queue = ptr;
+}
+
+void gpu_blas_create(blasHandle_t** ptr) {
+    *ptr = (blasHandle_t*) malloc(sizeof(blasHandle_t));
+    assert(*ptr != nullptr);
+    (*ptr)->queue = new sycl::queue(sycl::default_selector_v);
+    assert((*ptr)->queue != nullptr);
+}
+
+void gpu_blas_destroy(blasHandle_t** ptr) {
+    assert(*ptr != nullptr);
+    delete (*ptr)->queue;
+    free(*ptr);
+    *ptr = nullptr;
+}
+
+
+void gpu_ddot(blasHandle_t* handle, const int64_t n, const double* x, const int64_t incx, 
+              const double* y, const int64_t incy, double* result) {
+    // Ensure input parameters are valid
+    assert(handle != nullptr);
+    assert(handle->queue != nullptr);
+    assert(n > 0);
+    assert(incx > 0);
+    assert(incy > 0);
+    assert(x != nullptr);
+    assert(y != nullptr);
+    assert(result != nullptr);
+
+    oneapi::mkl::blas::dot(*handle->queue, n, x, incx, y, incy, result);
+
+}
+
+void gpu_dgemv(blasHandle_t* handle, const char* transa, const int64_t m, const int64_t n, const double* alpha,
+               const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) {
+
+    assert(handle != nullptr);
+    assert(handle->queue != nullptr);
+
+    // Validate matrix dimensions and increments to be positive
+    assert(m > 0 && n > 0 && lda > 0 && incx > 0 && incy > 0);
+    assert(a != nullptr && x != nullptr && y != nullptr && alpha != nullptr && beta != nullptr);
+
+    // Determine the operation type
+    oneapi::mkl::transpose transa_ = oneapi::mkl::transpose::nontrans;
+    if (*transa == 'T' || *transa == 't') {
+        transa_ = oneapi::mkl::transpose::trans;
+    }
+
+    // Perform DGEMV operation using oneMKL
+    oneapi::mkl::blas::column_major::gemv(*handle->queue, transa_, m, n, *alpha, a, lda, x, incx, *beta, y, incy);
+
+}
+
+void gpu_dgemm(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha,
+               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) {
+
+    assert(handle != nullptr && handle->queue != nullptr);
+    assert(m > 0 && n > 0 && k > 0 && lda > 0 && ldb > 0 && ldc > 0);
+    assert(a != nullptr && b != nullptr && c != nullptr && alpha != nullptr && beta != nullptr);
+
+    // Transpose operations
+    auto transa_ = (*transa == 'T' || *transa == 't') ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans;
+    auto transb_ = (*transb == 'T' || *transb == 't') ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans;
+
+    oneapi::mkl::blas::column_major::gemm(*handle->queue, transa_, transb_, m, n, k,
+                                    *alpha, a, lda, b, ldb, *beta, c, ldc);
+
+}
+
+
+void gpu_dgeam(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha,
+               const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
+    assert(handle != nullptr && handle->queue != nullptr);
+    assert(m > 0 && n > 0 && lda > 0 && ldb > 0 && ldc > 0);
+    assert(a != nullptr && b != nullptr && c != nullptr && alpha != nullptr && beta != nullptr);
+
+    // Determine transpose operations
+    bool transA = (*transa == 'T' || *transa == 't');
+    bool transB = (*transb == 'T' || *transb == 't');
+
+    handle->queue->submit([&](sycl::handler& cgh) {
+        cgh.parallel_for(sycl::range<2>(m, n), [=](sycl::id<2> idx) {
+            const int i = idx[0];
+            const int j = idx[1];
+            const int ai = transA ? j * lda + i : i * lda + j;
+            const int bi = transB ? j * ldb + i : i * ldb + j;
+            const int ci = i * ldc + j;
+
+            c[ci] = (*alpha) * a[ai] + (*beta) * b[bi];
+        });
+    });
+
+}
+
+}  // extern C
--- a/plugins/local/gpu_nvidia/LIB
+++ b/plugins/local/gpu_nvidia/LIB
@ -0,0 +1 @@
+-lcudart -lcublas -lcublasLt
--- a/plugins/local/gpu_nvidia/NEED
+++ b/plugins/local/gpu_nvidia/NEED
@ -0,0 +1 @@
+
--- a/plugins/local/gpu_nvidia/README.rst
+++ b/plugins/local/gpu_nvidia/README.rst
@ -0,0 +1,5 @@
+==========
+gpu_nvidia
+==========
+
+Nvidia implementation of GPU routines. Uses CUDA and CUBLAS libraries.
--- a/plugins/local/gpu_nvidia/gpu.c
+++ b/plugins/local/gpu_nvidia/gpu.c
@ -0,0 +1,326 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+
+
+/* Generic functions */
+
+int gpu_ndevices() {
+  int ngpus;
+  cudaGetDeviceCount(&ngpus);
+  return ngpus;
+}
+
+void gpu_set_device(int32_t igpu) {
+  cudaSetDevice((int) igpu);
+}
+
+
+/* Allocation functions */
+
+void gpu_allocate(void** ptr, const int64_t size) {
+    size_t free, total;
+    cudaError_t rc = cudaMemGetInfo( &free, &total );
+    if (rc != cudaSuccess) {
+      free = INT64_MAX;
+    }
+
+    rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
+//    /* Use managed memory if it does not fit on the GPU */
+//    if (size < free && size < total/2) {
+//      rc= cudaMalloc(ptr, size);
+//    } else {
+//      rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
+//    }
+    assert (rc == cudaSuccess);
+}
+
+void gpu_deallocate(void** ptr) {
+  assert (*ptr != NULL);
+  cudaFree(*ptr);
+  *ptr = NULL;
+}
+
+
+/* Memory transfer functions */
+
+void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
+  cudaMemcpy (gpu_ptr, cpu_ptr, n, cudaMemcpyHostToDevice);
+}
+
+void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
+  cudaMemcpy (cpu_ptr, gpu_ptr, n, cudaMemcpyDeviceToHost);
+}
+
+void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
+  cudaMemcpy (gpu_ptr_dest, gpu_ptr_src, n, cudaMemcpyDeviceToDevice);
+}
+
+
+/* Streams */
+
+void gpu_stream_create(cudaStream_t* ptr) {
+  cudaError_t rc = cudaStreamCreate(ptr);
+  assert (rc == cudaSuccess);
+}
+
+void gpu_stream_destroy(cudaStream_t* ptr) {
+  assert (ptr != NULL);
+  cudaError_t rc = cudaStreamDestroy(*ptr);
+  assert (rc == cudaSuccess);
+  *ptr = NULL;
+}
+
+void gpu_set_stream(cublasHandle_t handle, cudaStream_t stream) {
+  cublasSetStream(handle, stream);
+}
+
+void gpu_synchronize() {
+  cudaDeviceSynchronize();
+}
+
+
+/* BLAS functions */
+
+void gpu_blas_create(cublasHandle_t* ptr) {
+  cublasStatus_t rc = cublasCreate(ptr);
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+}
+
+
+void gpu_blas_destroy(cublasHandle_t* ptr) {
+  assert (ptr != NULL);
+  cublasStatus_t rc = cublasDestroy(*ptr);
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+  ptr = NULL;
+}
+
+
+void gpu_ddot(cublasHandle_t handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
+  assert (handle != NULL);
+  /* Convert to int */
+  int n_, incx_, incy_;
+
+  n_    = (int) n;
+  incx_ = (int) incx;
+  incy_ = (int) incy;
+
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  cublasStatus_t rc = cublasDdot(handle, n_, x, incx_, y, incy_, result);
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+}
+
+
+
+void gpu_sdot(cublasHandle_t handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
+  assert (handle != NULL);
+
+  /* Convert to int */
+  int n_, incx_, incy_;
+
+  n_    = (int) n;
+  incx_ = (int) incx;
+  incy_ = (int) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  float result_ = 0.;
+  cublasStatus_t rc = cublasSdot(handle, n_, x, incx_, y, incy_, &result_);
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+  *result = result_;
+}
+
+
+
+void gpu_dgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const double* alpha,
+               const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) {
+
+  assert (handle != NULL);
+
+  /* Convert to int */
+  int m_, n_, lda_, incx_, incy_;
+
+  m_    = (int) m;
+  n_    = (int) n;
+  lda_  = (int) lda;
+  incx_ = (int) incx;
+  incy_ = (int) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    m_ == m   );
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t)  lda_ == lda );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
+
+  cublasDgemv(handle, transa_, m_, n_, alpha, a, lda_, x, incx_, beta, y, incy_);
+}
+
+
+
+void gpu_sgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const float* alpha,
+               const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy) {
+
+  assert (handle != NULL);
+
+  /* Convert to int */
+  int m_, n_, lda_, incx_, incy_;
+
+  m_    = (int) m;
+  n_    = (int) n;
+  lda_  = (int) lda;
+  incx_ = (int) incx;
+  incy_ = (int) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    m_ == m   );
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t)  lda_ == lda );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
+
+  cublasSgemv(handle, transa_, m_, n_, alpha, a, lda_, x, incx_, beta, y, incy_);
+}
+
+
+void gpu_dgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha,
+               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) {
+
+  assert (handle != NULL);
+
+  /* Convert to int */
+  int m_, n_, k_, lda_, ldb_, ldc_;
+
+  m_   = (int) m;
+  n_   = (int) n;
+  k_   = (int) k;
+  lda_ = (int) lda;
+  ldb_ = (int) ldb;
+  ldc_ = (int) ldc;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)   m_ == m  );
+  assert ( (int64_t)   n_ == n  );
+  assert ( (int64_t)   k_ == k  );
+  assert ( (int64_t) lda_ == lda);
+  assert ( (int64_t) ldb_ == ldb);
+  assert ( (int64_t) ldc_ == ldc);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  cublasOperation_t transb_ = CUBLAS_OP_N;
+  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
+  if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
+
+  cublasDgemm(handle, transa_, transb_, m_, n_, k_, alpha, a, lda_, b, ldb_, beta, c, ldc_);
+}
+
+
+
+void gpu_sgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float* alpha,
+               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc) {
+
+  assert (handle != NULL);
+
+  /* Convert to int */
+  int m_, n_, k_, lda_, ldb_, ldc_;
+
+  m_   = (int) m;
+  n_   = (int) n;
+  k_   = (int) k;
+  lda_ = (int) lda;
+  ldb_ = (int) ldb;
+  ldc_ = (int) ldc;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)   m_ == m  );
+  assert ( (int64_t)   n_ == n  );
+  assert ( (int64_t)   k_ == k  );
+  assert ( (int64_t) lda_ == lda);
+  assert ( (int64_t) ldb_ == ldb);
+  assert ( (int64_t) ldc_ == ldc);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  cublasOperation_t transb_ = CUBLAS_OP_N;
+  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
+  if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
+
+  cublasSgemm(handle, transa_, transb_, m_, n_, k_, alpha, a, lda_, b, ldb_, beta, c, ldc_);
+}
+
+
+void gpu_dgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha,
+               const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
+  assert (handle != NULL);
+
+  /* Convert to int */
+  int m_, n_, lda_, ldb_, ldc_;
+
+  m_   = (int) m;
+  n_   = (int) n;
+  lda_ = (int) lda;
+  ldb_ = (int) ldb;
+  ldc_ = (int) ldc;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)   m_ == m  );
+  assert ( (int64_t)   n_ == n  );
+  assert ( (int64_t) lda_ == lda);
+  assert ( (int64_t) ldb_ == ldb);
+  assert ( (int64_t) ldc_ == ldc);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  cublasOperation_t transb_ = CUBLAS_OP_N;
+  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
+  if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
+
+  cublasDgeam(handle, transa_, transb_, m_, n_, alpha, a, lda_, beta, b, ldb_, c, ldc_);
+
+}
+
+
+void gpu_sgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float* alpha,
+               const float* a, const int64_t lda, const float* beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
+  assert (handle != NULL);
+
+  /* Convert to int */
+  int m_, n_, lda_, ldb_, ldc_;
+
+  m_   = (int) m;
+  n_   = (int) n;
+  lda_ = (int) lda;
+  ldb_ = (int) ldb;
+  ldc_ = (int) ldc;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)   m_ == m  );
+  assert ( (int64_t)   n_ == n  );
+  assert ( (int64_t) lda_ == lda);
+  assert ( (int64_t) ldb_ == ldb);
+  assert ( (int64_t) ldc_ == ldc);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  cublasOperation_t transb_ = CUBLAS_OP_N;
+  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
+  if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
+
+  cublasSgeam(handle, transa_, transb_, m_, n_, alpha, a, lda_, beta, b, ldb_, c, ldc_);
+
+}
--- a/plugins/local/gpu_x86/NEED
+++ b/plugins/local/gpu_x86/NEED
@ -0,0 +1 @@
+
--- a/plugins/local/gpu_x86/README.rst
+++ b/plugins/local/gpu_x86/README.rst
--- a/plugins/local/gpu_x86/gpu.c
+++ b/plugins/local/gpu_x86/gpu.c
@ -2,13 +2,13 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <stdbool.h>
 #include <assert.h>

-
 /* Generic functions */

 int gpu_ndevices() {
-  return 1;
+  return 0;
 }

 void gpu_set_device(int32_t i) {
@ -25,7 +25,7 @@ void gpu_allocate(void** ptr, const int64_t n) {
  }
 }

-void gpu_free(void** ptr) {
+void gpu_deallocate(void** ptr) {
  free(*ptr);
  *ptr = NULL;
 }
@ -49,10 +49,11 @@ void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
 /* Streams */

 void gpu_stream_create(void** ptr) {
-  *ptr = (void*) 2;
+  *ptr = (void*) malloc(sizeof(char));
 }

 void gpu_stream_destroy(void** ptr) {
+  free(*ptr);
  *ptr = NULL;
 }

@ -68,18 +69,19 @@ void gpu_synchronize() {
 /* BLAS functions */

 void gpu_blas_create(void** handle) {
-  *handle = (void*) 1;
+  *handle = (void*) malloc(sizeof(char));
 }


 void gpu_blas_destroy(void** handle) {
+  free(*handle);
  *handle = NULL;
 }


 double ddot_(const int32_t* n, const double* x, const int32_t* incx, const double* y, const int32_t* incy);

-void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
+void gpu_ddot(void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
  assert (handle != NULL);

  /* Convert to int32_t */
@ -100,7 +102,7 @@ void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_

 float sdot_(const int32_t* n, const float* x, const int32_t* incx, const float* y, const int32_t* incy);

-void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
+void gpu_sdot(void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
  assert (handle != NULL);

  /* Convert to int32_t */
@ -122,8 +124,8 @@ void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t
 void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha,
            const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy);

-void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double alpha,
-               const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) {
+void gpu_dgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const double* alpha,
+               const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) {

  assert (handle != NULL);

@ -143,15 +145,15 @@ void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int
  assert ( (int64_t) incx_ == incx);
  assert ( (int64_t) incy_ == incy);

-  dgemv_(&transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_);
+  dgemv_(transa, &m_, &n_, alpha, a, &lda_, x, &incx_, beta, y, &incy_);
 }


 void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha,
               const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy);

-void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float alpha,
-               const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) {
+void gpu_sgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const float* alpha,
+               const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy) {

  assert (handle != NULL);

@ -171,15 +173,15 @@ void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int
  assert ( (int64_t) incx_ == incx);
  assert ( (int64_t) incy_ == incy);

-  sgemv_(&transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_);
+  sgemv_(transa, &m_, &n_, alpha, a, &lda_, x, &incx_, beta, y, &incy_);
 }


 void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha,
            const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc);

-void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
-               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) {
+void gpu_dgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha,
+               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) {

  assert (handle != NULL);

@ -201,7 +203,7 @@ void gpu_dgemm(const void* handle, const char transa, const char transb, const i
  assert ( (int64_t) ldb_ == ldb);
  assert ( (int64_t) ldc_ == ldc);

-  dgemm_(&transa, &transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_);
+  dgemm_(transa, transb, &m_, &n_, &k_, alpha, a, &lda_, b, &ldb_, beta, c, &ldc_);
 }


@ -209,8 +211,8 @@ void gpu_dgemm(const void* handle, const char transa, const char transb, const i
 void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha,
            const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc);

-void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
-               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) {
+void gpu_sgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float* alpha,
+               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc) {

  assert (handle != NULL);

@ -232,136 +234,133 @@ void gpu_sgemm(const void* handle, const char transa, const char transb, const i
  assert ( (int64_t) ldb_ == ldb);
  assert ( (int64_t) ldc_ == ldc);

-  sgemm_(&transa, &transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_);
+  sgemm_(transa, transb, &m_, &n_, &k_, alpha, a, &lda_, b, &ldb_, beta, c, &ldc_);
 }


-void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
-               const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
-  if (handle == NULL) {
-    perror("NULL handle");
-    exit(-1);
-  }
+void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha,
+               const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
+  assert (handle != NULL);

-  if ( (transa == 'N' && transb == 'N') ||
-       (transa == 'n' && transb == 'N') ||
-       (transa == 'N' && transb == 'n') ||
-       (transa == 'n' && transb == 'n') ) {
+  if ( (*transa == 'N' && *transb == 'N') ||
+       (*transa == 'n' && *transb == 'N') ||
+       (*transa == 'N' && *transb == 'n') ||
+       (*transa == 'n' && *transb == 'n') ) {

-     if (alpha == 0.) {
+     if (*alpha == 0.) {

       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
-           c[j*ldc+i] = beta * b[j*ldb+i];
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *beta * b[j*ldb+i];
         }
       }

-     } else if (beta == 0.) {
+     } else if (*beta == 0.) {

       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
-           c[j*ldc+i] = alpha * a[j*lda+i];
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *alpha * a[j*lda+i];
         }
       }

     } else {

       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
-           c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[j*ldb+i];
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *alpha * a[j*lda+i] + *beta * b[j*ldb+i];
         }
       }

     }

-  } else if ( (transa == 'N' && transb == 'T') ||
-              (transa == 'n' && transb == 'T') ||
-              (transa == 'N' && transb == 't') ||
-              (transa == 'n' && transb == 't') ) {
+  } else if ( (*transa == 'N' && *transb == 'T') ||
+              (*transa == 'n' && *transb == 'T') ||
+              (*transa == 'N' && *transb == 't') ||
+              (*transa == 'n' && *transb == 't') ) {

-     if (alpha == 0.) {
+     if (*alpha == 0.) {

       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
-           c[j*ldc+i] = beta * b[i*ldb+j];
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *beta * b[i*ldb+j];
         }
       }

-     } else if (beta == 0.) {
+     } else if (*beta == 0.) {

       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
-           c[j*ldc+i] = alpha * a[j*lda+i];
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *alpha * a[j*lda+i];
         }
       }

     } else {

       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
-           c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[i*ldb+j];
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *alpha * a[j*lda+i] + *beta * b[i*ldb+j];
         }
       }

     }

-  } else if ( (transa == 'T' && transb == 'N') ||
-              (transa == 't' && transb == 'N') ||
-              (transa == 'T' && transb == 'n') ||
-              (transa == 't' && transb == 'n') ) {
+  } else if ( (*transa == 'T' && *transb == 'N') ||
+              (*transa == 't' && *transb == 'N') ||
+              (*transa == 'T' && *transb == 'n') ||
+              (*transa == 't' && *transb == 'n') ) {

-     if (alpha == 0.) {
+     if (*alpha == 0.) {

       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
-           c[j*ldc+i] = beta * b[j*ldb+i];
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *beta * b[j*ldb+i];
         }
       }

-     } else if (beta == 0.) {
+     } else if (*beta == 0.) {

       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
-           c[j*ldc+i] = alpha * a[i*lda+j];
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *alpha * a[i*lda+j];
         }
       }

     } else {

       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
-           c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[j*ldb+i];
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *alpha * a[i*lda+j] + *beta * b[j*ldb+i];
         }
       }

     }

-  } else if ( (transa == 'T' && transb == 'T') ||
-              (transa == 't' && transb == 'T') ||
-              (transa == 'T' && transb == 't') ||
-              (transa == 't' && transb == 't') ) {
+  } else if ( (*transa == 'T' && *transb == 'T') ||
+              (*transa == 't' && *transb == 'T') ||
+              (*transa == 'T' && *transb == 't') ||
+              (*transa == 't' && *transb == 't') ) {

-     if (alpha == 0.) {
+     if (*alpha == 0.) {

       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
-           c[j*ldc+i] = beta * b[i*ldb+j];
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *beta * b[i*ldb+j];
         }
       }

-     } else if (beta == 0.) {
+     } else if (*beta == 0.) {

       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
-           c[j*ldc+i] = alpha * a[i*lda+j];
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *alpha * a[i*lda+j];
         }
       }

     } else {

       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
-           c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[i*ldb+j];
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *alpha * a[i*lda+j] + *beta * b[i*ldb+j];
         }
       }

@ -371,132 +370,129 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
 }


-void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
-               const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
-  if (handle == NULL) {
-    perror("NULL handle");
-    exit(-1);
-  }
+void gpu_sgeam(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float* alpha,
+               const float* a, const int64_t lda, const float* beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
+  assert (handle != NULL);

-  if ( (transa == 'N' && transb == 'N') ||
-       (transa == 'n' && transb == 'N') ||
-       (transa == 'N' && transb == 'n') ||
-       (transa == 'n' && transb == 'n') ) {
+  if ( (*transa == 'N' && *transb == 'N') ||
+       (*transa == 'n' && *transb == 'N') ||
+       (*transa == 'N' && *transb == 'n') ||
+       (*transa == 'n' && *transb == 'n') ) {

-     if (alpha == 0.) {
+     if (*alpha == 0.) {

       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
-           c[j*ldc+i] = beta * b[j*ldb+i];
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *beta * b[j*ldb+i];
         }
       }

-     } else if (beta == 0.) {
+     } else if (*beta == 0.) {

       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
-           c[j*ldc+i] = alpha * a[j*lda+i];
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *alpha * a[j*lda+i];
         }
       }

     } else {

       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
-           c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[j*ldb+i];
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *alpha * a[j*lda+i] + *beta * b[j*ldb+i];
         }
       }

     }

-  } else if ( (transa == 'N' && transb == 'T') ||
-              (transa == 'n' && transb == 'T') ||
-              (transa == 'N' && transb == 't') ||
-              (transa == 'n' && transb == 't') ) {
+  } else if ( (*transa == 'N' && *transb == 'T') ||
+              (*transa == 'n' && *transb == 'T') ||
+              (*transa == 'N' && *transb == 't') ||
+              (*transa == 'n' && *transb == 't') ) {

-     if (alpha == 0.) {
+     if (*alpha == 0.) {

       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
-           c[j*ldc+i] = beta * b[i*ldb+j];
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *beta * b[i*ldb+j];
         }
       }

-     } else if (beta == 0.) {
+     } else if (*beta == 0.) {

       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
-           c[j*ldc+i] = alpha * a[j*lda+i];
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *alpha * a[j*lda+i];
         }
       }

     } else {

       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
-           c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[i*ldb+j];
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *alpha * a[j*lda+i] + *beta * b[i*ldb+j];
         }
       }

     }

-  } else if ( (transa == 'T' && transb == 'N') ||
-              (transa == 't' && transb == 'N') ||
-              (transa == 'T' && transb == 'n') ||
-              (transa == 't' && transb == 'n') ) {
+  } else if ( (*transa == 'T' && *transb == 'N') ||
+              (*transa == 't' && *transb == 'N') ||
+              (*transa == 'T' && *transb == 'n') ||
+              (*transa == 't' && *transb == 'n') ) {

-     if (alpha == 0.) {
+     if (*alpha == 0.) {

       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
-           c[j*ldc+i] = beta * b[j*ldb+i];
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *beta * b[j*ldb+i];
         }
       }

-     } else if (beta == 0.) {
+     } else if (*beta == 0.) {

       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
-           c[j*ldc+i] = alpha * a[i*lda+j];
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *alpha * a[i*lda+j];
         }
       }

     } else {

       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
-           c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[j*ldb+i];
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *alpha * a[i*lda+j] + *beta * b[j*ldb+i];
         }
       }

     }

-  } else if ( (transa == 'T' && transb == 'T') ||
-              (transa == 't' && transb == 'T') ||
-              (transa == 'T' && transb == 't') ||
-              (transa == 't' && transb == 't') ) {
+  } else if ( (*transa == 'T' && *transb == 'T') ||
+              (*transa == 't' && *transb == 'T') ||
+              (*transa == 'T' && *transb == 't') ||
+              (*transa == 't' && *transb == 't') ) {

-     if (alpha == 0.) {
+     if (*alpha == 0.) {

       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
-           c[j*ldc+i] = beta * b[i*ldb+j];
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *beta * b[i*ldb+j];
         }
       }

-     } else if (beta == 0.) {
+     } else if (*beta == 0.) {

       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
-           c[j*ldc+i] = alpha * a[i*lda+j];
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *alpha * a[i*lda+j];
         }
       }

     } else {

       for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
-           c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[i*ldb+j];
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *alpha * a[i*lda+j] + *beta * b[i*ldb+j];
         }
       }

--- a/plugins/local/non_h_ints_mu/total_tc_int.irp.f
+++ b/plugins/local/non_h_ints_mu/total_tc_int.irp.f
@ -288,25 +288,31 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
      !$OMP END DO
      !$OMP END PARALLEL
    else
-      print*, ' ao_integrals_map will be used'
-      PROVIDE ao_integrals_map
+!      print*, ' ao_integrals_map will be used'
+!      PROVIDE ao_integrals_map
+      print*,'Cholesky vectors will be used '
+      double precision :: get_ao_integ_chol,eri
+      eri = get_ao_integ_chol(1,1,1,1) ! FOR OPENMP 
      !$OMP PARALLEL DEFAULT(NONE)                            &
-      !$OMP SHARED(ao_num, ao_two_e_tc_tot, ao_integrals_map) &
-      !$OMP PRIVATE(i, j, k, l)
+!!!    !$OMP SHARED(ao_num, ao_two_e_tc_tot, ao_integrals_map) &
+      !$OMP SHARED(ao_num, ao_two_e_tc_tot) &
+      !$OMP PRIVATE(i, j, k, l,eri)
      !$OMP DO COLLAPSE(3)
      do j = 1, ao_num
        do l = 1, ao_num
          do i = 1, ao_num
            do k = 1, ao_num
              !                                                     < 1:i, 2:j | 1:k, 2:l > 
-              ao_two_e_tc_tot(k,i,l,j) = ao_two_e_tc_tot(k,i,l,j) + get_ao_two_e_integral(i, j, k, l, ao_integrals_map)
+!              eri =  get_ao_two_e_integral(i, j, k, l, ao_integrals_map)
+               eri = get_ao_integ_chol(i,k,j,l)
+              ao_two_e_tc_tot(k,i,l,j) = ao_two_e_tc_tot(k,i,l,j) + eri
            enddo
          enddo
        enddo
      enddo
      !$OMP END DO
      !$OMP END PARALLEL
-      FREE ao_integrals_map
+!      FREE ao_integrals_map
    endif

    if((tc_integ_type .eq. "numeric") .and. (.not. tc_save_mem)) then
--- a/plugins/local/slater_tc/slater_tc_opt.irp.f
+++ b/plugins/local/slater_tc/slater_tc_opt.irp.f
@ -10,8 +10,6 @@ subroutine provide_all_three_ints_bi_ortho()
  implicit none
  double precision :: t1, t2

-  PROVIDE ao_two_e_integrals_in_map
-
  print *, ' start provide_all_three_ints_bi_ortho'
  call wall_time(t1)

--- a/plugins/local/slater_tc/tc_hmat.irp.f
+++ b/plugins/local/slater_tc/tc_hmat.irp.f
@ -30,7 +30,9 @@ BEGIN_PROVIDER [double precision, htilde_matrix_elmt_bi_ortho, (N_det,N_det)]
  print *, ' PROVIDING htilde_matrix_elmt_bi_ortho ...'
  call wall_time(t1)

+  if(three_body_h_tc)then
   call provide_all_three_ints_bi_ortho()
+  endif

  i = 1
  j = 1
--- a/plugins/local/spher_harm/spher_harm.irp.f
+++ b/plugins/local/spher_harm/spher_harm.irp.f
@ -1,7 +1,7 @@
 program spher_harm
 implicit none
-! call test_spher_harm
+ call test_spher_harm
 ! call test_cart
- call test_brutal_spheric
+! call test_brutal_spheric
 end

--- a/plugins/local/spher_harm/spher_harm_func.irp.f
+++ b/plugins/local/spher_harm/spher_harm_func.irp.f
@ -7,6 +7,7 @@ subroutine spher_harm_func_r3(r,l,m,re_ylm, im_ylm)
 double precision :: theta, phi,r_abs
 call cartesian_to_spherical(r,theta,phi,r_abs)
 call spher_harm_func(l,m,theta,phi,re_ylm, im_ylm)
+! call spher_harm_func_expl(l,m,theta,phi,re_ylm, im_ylm)
 end


@ -131,6 +132,10 @@ subroutine spher_harm_func_expl(l,m,theta,phi,re_ylm, im_ylm)
  tmp = - inv_sq_pi * dsqrt(3.d0/8.d0) * dsin(theta) 
  re_ylm = tmp * dcos(phi)
  im_ylm = tmp * dsin(phi)
+ else if (l==1.and.m==-1)then
+  tmp = - inv_sq_pi * dsqrt(3.d0/8.d0) * dsin(theta) 
+  re_ylm = tmp * dcos(phi)
+  im_ylm = -tmp * dsin(phi)
 else if(l==1.and.m==0)then
  tmp = inv_sq_pi * dsqrt(3.d0/4.d0) * dcos(theta) 
  re_ylm = tmp 
@ -139,10 +144,18 @@ subroutine spher_harm_func_expl(l,m,theta,phi,re_ylm, im_ylm)
  tmp = 0.25d0 * inv_sq_pi * dsqrt(0.5d0*15.d0) * dsin(theta)*dsin(theta)
  re_ylm = tmp * dcos(2.d0*phi)
  im_ylm = tmp * dsin(2.d0*phi)
+ else if(l==2.and.m==-2)then
+  tmp = 0.25d0 * inv_sq_pi * dsqrt(0.5d0*15.d0) * dsin(theta)*dsin(theta)
+  re_ylm = tmp * dcos(2.d0*phi)
+  im_ylm =-tmp * dsin(2.d0*phi)
 else if(l==2.and.m==1)then
  tmp = - inv_sq_pi * dsqrt(15.d0/8.d0) * dsin(theta) * dcos(theta)
  re_ylm = tmp * dcos(phi)
  im_ylm = tmp * dsin(phi)
+ else if(l==2.and.m==-1)then
+  tmp = - inv_sq_pi * dsqrt(15.d0/8.d0) * dsin(theta) * dcos(theta)
+  re_ylm = tmp * dcos(phi)
+  im_ylm =-tmp * dsin(phi)
 else if(l==2.and.m==0)then
  tmp = dsqrt(5.d0/4.d0) * inv_sq_pi* (1.5d0*dcos(theta)*dcos(theta)-0.5d0)
  re_ylm = tmp
--- a/plugins/local/tc_int/NEED
+++ b/plugins/local/tc_int/NEED
@ -1,3 +1,4 @@
+gpu
 tc_keywords
 jastrow
 qmckl
--- a/plugins/local/tc_int/compute_tc_int.irp.f
+++ b/plugins/local/tc_int/compute_tc_int.irp.f
@ -2,7 +2,7 @@
 ! ---

 subroutine provide_int2_grad1_u12_ao()
-
+  use gpu
  BEGIN_DOC
  !
  ! int2_grad1_u12_ao(i,j,ipoint,1) = \int dr2         [\grad1 u(r1,r2)]_x1 \chi_i(r2) \chi_j(r2)
@ -35,8 +35,9 @@ subroutine provide_int2_grad1_u12_ao()
  double precision              :: weight1, ao_k_r, ao_i_r
  double precision              :: der_envsq_x, der_envsq_y, der_envsq_z, lap_envsq
  double precision              :: time0, time1, time2, tc1, tc2, tc
-  double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:), tc_int_2e_ao(:,:,:,:)
-  double precision, allocatable :: tmp(:,:,:), c_mat(:,:,:), tmp_grad1_u12(:,:,:)
+  type(gpu_double4)             :: int2_grad1_u12_ao
+  type(gpu_double3)             :: tmp_grad1_u12, tmp_grad1_u12p, tmp
+  double precision, allocatable :: c_mat(:,:,:), tc_int_2e_ao(:,:,:,:)

  double precision, external    :: get_ao_two_e_integral

@ -51,6 +52,7 @@ subroutine provide_int2_grad1_u12_ao()

  call total_memory(mem)
  mem      = max(1.d0, qp_max_mem - mem)
+  mem = 6
  n_double = mem * 1.d8
  n_blocks = int(min(n_double / (n_points_extra_final_grid * 4.d0), 1.d0*n_points_final_grid))
  n_rest   = int(mod(n_points_final_grid, n_blocks))
@ -64,9 +66,9 @@ subroutine provide_int2_grad1_u12_ao()
  ! ---
  ! ---

-  allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,4))
+  call gpu_allocate(int2_grad1_u12_ao, ao_num,ao_num,n_points_final_grid,4)

-  allocate(tmp(n_points_extra_final_grid,ao_num,ao_num))
+  call gpu_allocate(tmp,n_points_extra_final_grid,ao_num,ao_num)
  !$OMP PARALLEL               &
  !$OMP DEFAULT (NONE)         &
  !$OMP PRIVATE (j, i, jpoint) &
@ -75,21 +77,28 @@ subroutine provide_int2_grad1_u12_ao()
  do j = 1, ao_num
    do i = 1, ao_num
      do jpoint = 1, n_points_extra_final_grid
-        tmp(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j)
+        tmp%f(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j)
      enddo
    enddo
  enddo
  !$OMP END DO
  !$OMP END PARALLEL

-  allocate(tmp_grad1_u12(n_points_extra_final_grid,n_blocks,4))
+  call gpu_allocate(tmp_grad1_u12,n_points_extra_final_grid,n_blocks,4)
+  call gpu_allocate(tmp_grad1_u12p,n_points_extra_final_grid,n_blocks,4)

  tc = 0.d0

+  type(gpu_stream) :: stream(4)
+  do i=1,4
+    call gpu_stream_create(stream(i))
+  enddo
+
  do i_pass = 1, n_pass
    ii = (i_pass-1)*n_blocks + 1

    call wall_time(tc1)
+
    !$OMP PARALLEL                   &
    !$OMP DEFAULT (NONE)             &
    !$OMP PRIVATE (i_blocks, ipoint) &
@ -97,27 +106,26 @@ subroutine provide_int2_grad1_u12_ao()
    !$OMP DO
    do i_blocks = 1, n_blocks
      ipoint = ii - 1 + i_blocks ! r1
-      call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_blocks,1), tmp_grad1_u12(1,i_blocks,2), tmp_grad1_u12(1,i_blocks,3), tmp_grad1_u12(1,i_blocks,4))
+      call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12%f(1,i_blocks,1), tmp_grad1_u12%f(1,i_blocks,2), &
+        tmp_grad1_u12%f(1,i_blocks,3), tmp_grad1_u12%f(1,i_blocks,4))
    enddo
    !$OMP END DO
    !$OMP END PARALLEL
    call wall_time(tc2)
    tc = tc + tc2 - tc1

+    call gpu_synchronize()
+    call gpu_copy(tmp_grad1_u12,tmp_grad1_u12p)
    do m = 1, 4
-      call dgemm( "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0                     &
-                , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid &
-                , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) 
+      call gpu_set_stream(blas_handle, stream(m))
+      call gpu_dgemm(blas_handle, "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0                     &
+                , tmp%f(1,1,1), n_points_extra_final_grid, tmp_grad1_u12p%f(1,1,m), n_points_extra_final_grid &
+                , 0.d0, int2_grad1_u12_ao%f(1,1,ii,m), ao_num*ao_num)
    enddo
  enddo

-  deallocate(tmp_grad1_u12)
-
-  
  if(n_rest .gt. 0) then

-    allocate(tmp_grad1_u12(n_points_extra_final_grid,n_rest,4))
-  
    ii = n_pass*n_blocks + 1

    call wall_time(tc1)
@ -128,7 +136,8 @@ subroutine provide_int2_grad1_u12_ao()
    !$OMP DO
    do i_rest = 1, n_rest
      ipoint = ii - 1 + i_rest ! r1
-      call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_rest,1), tmp_grad1_u12(1,i_rest,2), tmp_grad1_u12(1,i_rest,3), tmp_grad1_u12(1,i_rest,4))
+      call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12%f(1,i_rest,1), tmp_grad1_u12%f(1,i_rest,2), &
+        tmp_grad1_u12%f(1,i_rest,3), tmp_grad1_u12%f(1,i_rest,4))
    enddo
    !$OMP END DO
    !$OMP END PARALLEL
@ -136,15 +145,23 @@ subroutine provide_int2_grad1_u12_ao()
    tc = tc + tc2 - tc1

    do m = 1, 4
-      call dgemm( "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0                       &
-                , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid &
-                , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) 
+      call gpu_set_stream(blas_handle, stream(m))
+      call gpu_dgemm(blas_handle, "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0                       &
+                , tmp%f(1,1,1), n_points_extra_final_grid, tmp_grad1_u12%f(1,1,m), n_points_extra_final_grid &
+                , 0.d0, int2_grad1_u12_ao%f(1,1,ii,m), ao_num*ao_num)
    enddo

-    deallocate(tmp_grad1_u12)
  endif
+  call gpu_synchronize()
+  call gpu_deallocate(tmp_grad1_u12)
+  call gpu_deallocate(tmp_grad1_u12p)

-  deallocate(tmp)
+  do i=1,4
+    call gpu_stream_destroy(stream(i))
+  enddo
+
+
+  call gpu_deallocate(tmp)


  call wall_time(time1)
@ -152,6 +169,8 @@ subroutine provide_int2_grad1_u12_ao()
  print*, ' wall time Jastrow derivatives   (min) = ', tc / 60.d0
  call print_memory_usage()

+!TODO
+stop
  ! ---
  ! ---
  ! ---
@ -177,7 +196,7 @@ subroutine provide_int2_grad1_u12_ao()
  !$OMP END DO
  !$OMP END PARALLEL
  call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0            &
-            , int2_grad1_u12_ao(1,1,1,4), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
+            , int2_grad1_u12_ao%f(1,1,1,4), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
            , 0.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num)
  deallocate(c_mat)

@ -213,7 +232,7 @@ subroutine provide_int2_grad1_u12_ao()
    !$OMP END PARALLEL

    call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, -1.d0           &
-              , int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
+              , int2_grad1_u12_ao%f(1,1,1,m), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
              , 1.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num)
  enddo
  deallocate(c_mat)
@ -263,7 +282,7 @@ subroutine provide_int2_grad1_u12_ao()
  print*, ' Writing int2_grad1_u12_ao in ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao'
  open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="write")
  call ezfio_set_work_empty(.False.)
-    write(11) int2_grad1_u12_ao(:,:,:,1:3)
+    write(11) int2_grad1_u12_ao%f(:,:,:,1:3)
  close(11)

  print*, ' Saving tc_int_2e_ao in ', trim(ezfio_filename) // '/work/ao_two_e_tc_tot'
@ -276,7 +295,7 @@ subroutine provide_int2_grad1_u12_ao()

  ! ----

-  deallocate(int2_grad1_u12_ao)
+  call gpu_deallocate(int2_grad1_u12_ao)
  deallocate(tc_int_2e_ao)

  call wall_time(time2)
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@ -1,3 +1,15 @@
+double precision function get_ao_integ_chol(i,j,k,l)
+ implicit none
+  BEGIN_DOC
+  !  CHOLESKY representation of the integral of the AO basis <ik|jl> or (ij|kl)
+  !     i(r1) j(r1) 1/r12 k(r2) l(r2)
+  END_DOC
+ integer, intent(in) :: i,j,k,l
+ double precision, external :: ddot                                                                                  
+ get_ao_integ_chol = ddot(cholesky_ao_num, cholesky_ao_transp(1,i,j), 1, cholesky_ao_transp(1,k,l), 1)
+
+end
+
 BEGIN_PROVIDER [ double precision, cholesky_ao_transp, (cholesky_ao_num, ao_num, ao_num) ]
 implicit none
 BEGIN_DOC
@ -25,7 +37,10 @@ END_PROVIDER
   ! Last dimension of cholesky_ao is cholesky_ao_num
   !
   ! https://mogp-emulator.readthedocs.io/en/latest/methods/proc/ProcPivotedCholesky.html
+   !
   ! https://doi.org/10.1016/j.apnum.2011.10.001 : Page 4, Algorithm 1
+   !
+   ! https://www.diva-portal.org/smash/get/diva2:396223/FULLTEXT01.pdf
   END_DOC

   integer*8                      :: ndim8
@ -155,11 +170,15 @@ END_PROVIDER
         Lset(np8) = p8
       endif
     enddo
-     np = np8
+     if (np8 > ndim8) stop 'np>ndim8'
+     np = int(np8,4)
     if (np <= 0) stop 'np<=0'
-     if (np > ndim8) stop 'np>ndim8'

+     rank_max = np
+     ! Avoid too large arrays when there are many electrons
+     if (elec_num > 10) then
       rank_max = min(np,20*elec_num*elec_num)
+     endif
     call mmap(trim(ezfio_work_dir)//'cholesky_ao_tmp', (/ ndim8, rank_max /), 8, fd(1), .False., .True., c_pointer(1))
     call c_f_pointer(c_pointer(1), L, (/ ndim8, rank_max /))

@ -428,7 +447,7 @@ END_PROVIDER
           Lset(np8) = p8
         endif
       enddo
-       np = np8
+       np = int(np8,4)

     enddo

--- a/src/casscf_cipsi/EZFIO.cfg
+++ b/src/casscf_cipsi/EZFIO.cfg
@ -79,3 +79,9 @@ type: logical
 doc: If |true|, the pt2_max value in the CIPSI is set to 10-10 and will not change
 interface: ezfio,provider,ocaml
 default: False
+
+[act_mos_opt]
+type: logical
+doc: If |true|, the active orbitals are also optimized variationally
+interface: ezfio,provider,ocaml
+default: False
--- a/src/casscf_cipsi/NEED
+++ b/src/casscf_cipsi/NEED
@ -3,3 +3,4 @@ selectors_full
 generators_cas
 two_body_rdm
 dav_general_mat
+mo_optimization_utils
--- a/src/casscf_cipsi/bielec.irp.f
+++ b/src/casscf_cipsi/bielec.irp.f
@ -1,18 +1,25 @@
-BEGIN_PROVIDER [real*8, bielec_PQxx, (mo_num, mo_num,n_core_inact_act_orb,n_core_inact_act_orb)]
+BEGIN_PROVIDER [real*8, bielec_PQxx_array, (mo_num, mo_num,n_core_inact_act_orb,n_core_inact_act_orb)]
  BEGIN_DOC
-  ! bielec_PQxx : integral (pq|xx) with p,q arbitrary, x core or active
+  ! WARNING !!! Old version !!! NOT USED ANYMORE IN THE PROGRAM !!! TOO BIG TO BE STORED ON LARGE SYSTEMS !!! 
+  ! 
+  ! Replaced by the Cholesky-based function bielec_PQxx
+  !
+  ! bielec_PQxx_array : integral (pq|xx) with p,q arbitrary, x core or active
  ! indices are unshifted orbital numbers
  END_DOC
  implicit none
  integer                        :: i,j,ii,jj,p,q,i3,j3,t3,v3
  real*8                         :: mo_two_e_integral
+  print*,''
+  print*,'Providing bielec_PQxx_array, WARNING IT CAN BE A VERY BIG ARRAY WHEN MO_NUM IS LARGE !!!'
+  print*,''
  
-  bielec_PQxx(:,:,:,:) = 0.d0
+  bielec_PQxx_array(:,:,:,:) = 0.d0
  PROVIDE mo_two_e_integrals_in_map
  
  !$OMP PARALLEL DEFAULT(NONE) &
  !$OMP PRIVATE(i,ii,j,jj,i3,j3) &
-  !$OMP SHARED(n_core_inact_orb,list_core_inact,mo_num,bielec_PQxx, &
+  !$OMP SHARED(n_core_inact_orb,list_core_inact,mo_num,bielec_PQxx_array, &
  !$OMP  n_act_orb,mo_integrals_map,list_act)

  !$OMP DO
@ -20,14 +27,14 @@ BEGIN_PROVIDER [real*8, bielec_PQxx, (mo_num, mo_num,n_core_inact_act_orb,n_core
    ii=list_core_inact(i)
    do j=i,n_core_inact_orb
      jj=list_core_inact(j)
-      call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx(1,1,i,j),mo_integrals_map)
-      bielec_PQxx(:,:,j,i)=bielec_PQxx(:,:,i,j)
+      call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx_array(1,1,i,j),mo_integrals_map)
+      bielec_PQxx_array(:,:,j,i)=bielec_PQxx_array(:,:,i,j)
    end do
    do j=1,n_act_orb
      jj=list_act(j)
      j3=j+n_core_inact_orb
-      call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx(1,1,i,j3),mo_integrals_map)
-      bielec_PQxx(:,:,j3,i)=bielec_PQxx(:,:,i,j3)
+      call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx_array(1,1,i,j3),mo_integrals_map)
+      bielec_PQxx_array(:,:,j3,i)=bielec_PQxx_array(:,:,i,j3)
    end do
  end do
  !$OMP END DO
@ -40,8 +47,8 @@ BEGIN_PROVIDER [real*8, bielec_PQxx, (mo_num, mo_num,n_core_inact_act_orb,n_core
    do j=i,n_act_orb
      jj=list_act(j)
      j3=j+n_core_inact_orb
-      call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx(1,1,i3,j3),mo_integrals_map)
-      bielec_PQxx(:,:,j3,i3)=bielec_PQxx(:,:,i3,j3)
+      call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx_array(1,1,i3,j3),mo_integrals_map)
+      bielec_PQxx_array(:,:,j3,i3)=bielec_PQxx_array(:,:,i3,j3)
    end do
  end do
  !$OMP END DO
@ -52,9 +59,13 @@ END_PROVIDER



-BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_act_orb, mo_num)]
+BEGIN_PROVIDER [real*8, bielec_PxxQ_array, (mo_num,n_core_inact_act_orb,n_core_inact_act_orb, mo_num)]
  BEGIN_DOC
-  ! bielec_PxxQ : integral (px|xq) with p,q arbitrary, x core or active
+  ! WARNING !!! Old version !!! NOT USED ANYMORE IN THE PROGRAM !!! TOO BIG TO BE STORED ON LARGE SYSTEMS !!! 
+  ! 
+  ! Replaced by the Cholesky-based function bielec_PxxQ
+  !
+  ! bielec_PxxQ_array : integral (px|xq) with p,q arbitrary, x core or active
  ! indices are unshifted orbital numbers
  END_DOC
  implicit none
@ -62,12 +73,15 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_a
  double precision, allocatable  :: integrals_array(:,:)
  real*8                         :: mo_two_e_integral
  
+  print*,''
+  print*,'Providing bielec_PxxQ_array, WARNING IT CAN BE A VERY BIG ARRAY WHEN MO_NUM IS LARGE !!!'
+  print*,''
  PROVIDE mo_two_e_integrals_in_map
-  bielec_PxxQ = 0.d0
+  bielec_PxxQ_array = 0.d0

  !$OMP PARALLEL DEFAULT(NONE) &
  !$OMP PRIVATE(i,ii,j,jj,i3,j3,integrals_array) &
-  !$OMP SHARED(n_core_inact_orb,list_core_inact,mo_num,bielec_PxxQ, &
+  !$OMP SHARED(n_core_inact_orb,list_core_inact,mo_num,bielec_PxxQ_array, &
  !$OMP  n_act_orb,mo_integrals_map,list_act)

  allocate(integrals_array(mo_num,mo_num))
@ -80,8 +94,8 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_a
      call get_mo_two_e_integrals_ij(ii,jj,mo_num,integrals_array,mo_integrals_map)
      do q=1,mo_num
        do p=1,mo_num
-          bielec_PxxQ(p,i,j,q)=integrals_array(p,q)
-          bielec_PxxQ(p,j,i,q)=integrals_array(q,p)
+          bielec_PxxQ_array(p,i,j,q)=integrals_array(p,q)
+          bielec_PxxQ_array(p,j,i,q)=integrals_array(q,p)
        end do
      end do
    end do
@ -91,8 +105,8 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_a
      call get_mo_two_e_integrals_ij(ii,jj,mo_num,integrals_array,mo_integrals_map)
      do q=1,mo_num
        do p=1,mo_num
-          bielec_PxxQ(p,i,j3,q)=integrals_array(p,q)
-          bielec_PxxQ(p,j3,i,q)=integrals_array(q,p)
+          bielec_PxxQ_array(p,i,j3,q)=integrals_array(p,q)
+          bielec_PxxQ_array(p,j3,i,q)=integrals_array(q,p)
        end do
      end do
    end do
@ -111,8 +125,8 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_a
      call get_mo_two_e_integrals_ij(ii,jj,mo_num,integrals_array,mo_integrals_map)
      do q=1,mo_num
        do p=1,mo_num
-          bielec_PxxQ(p,i3,j3,q)=integrals_array(p,q)
-          bielec_PxxQ(p,j3,i3,q)=integrals_array(q,p)
+          bielec_PxxQ_array(p,i3,j3,q)=integrals_array(p,q)
+          bielec_PxxQ_array(p,j3,i3,q)=integrals_array(q,p)
        end do
      end do
    end do
@ -129,10 +143,15 @@ BEGIN_PROVIDER [real*8, bielecCI, (n_act_orb,n_act_orb,n_act_orb, mo_num)]
  BEGIN_DOC
  ! bielecCI : integrals (tu|vp) with p arbitrary, tuv active
  ! index p runs over the whole basis, t,u,v only over the active orbitals
+  ! 
+  ! This array can be stored anyway. Ex: 50 active orbitals, 1500 MOs ==> 8x50^3x1500 = 1.5 Gb
  END_DOC
  implicit none
  integer                        :: i,j,k,p,t,u,v
  double precision, external     :: mo_two_e_integral
+  double precision :: wall0, wall1 
+  call wall_time(wall0)
+  print*,'Providing bielecCI'
  PROVIDE mo_two_e_integrals_in_map
  
  !$OMP PARALLEL DO DEFAULT(NONE) &
@ -151,5 +170,7 @@ BEGIN_PROVIDER [real*8, bielecCI, (n_act_orb,n_act_orb,n_act_orb, mo_num)]
    end do
  end do
  !$OMP END PARALLEL DO
+  call wall_time(wall1)
+  print*,'Time to provide bielecCI = ',wall1 - wall0

 END_PROVIDER
--- a/src/casscf_cipsi/bielec_natorb.irp.f
+++ b/src/casscf_cipsi/bielec_natorb.irp.f
@ -1,30 +1,38 @@
- BEGIN_PROVIDER [real*8, bielec_PQxx_no, (mo_num, mo_num,n_core_inact_act_orb,n_core_inact_act_orb)]
+ BEGIN_PROVIDER [real*8, bielec_PQxx_no_array, (mo_num, mo_num,n_core_inact_act_orb,n_core_inact_act_orb)]
  BEGIN_DOC
+  ! WARNING !!! Old version !!! NOT USED ANYMORE IN THE PROGRAM !!! TOO BIG TO BE STORED ON LARGE SYSTEMS !!! 
+  ! 
+  ! Replaced by the Cholesky-based function bielec_PQxx_no
+  !
  ! integral (pq|xx) in the basis of natural MOs
  ! indices are unshifted orbital numbers
+  ! 
  END_DOC
  implicit none
  integer                        :: i,j,k,l,t,u,p,q
  double precision, allocatable  :: f(:,:,:), d(:,:,:)

+  print*,''
+  print*,'Providing bielec_PQxx_no_array, WARNING IT CAN BE A VERY BIG ARRAY WHEN MO_NUM IS LARGE !!!'
+  print*,''


  !$OMP PARALLEL DEFAULT(NONE) &
  !$OMP PRIVATE(j,k,l,p,d,f) &
  !$OMP SHARED(n_core_inact_act_orb,mo_num,n_act_orb,n_core_inact_orb, &
-  !$OMP   bielec_PQxx_no,bielec_PQxx,list_act,natorbsCI)
+  !$OMP   bielec_PQxx_no_array,bielec_PQxx_array,list_act,natorbsCI)

  allocate (f(n_act_orb,mo_num,n_core_inact_act_orb), &
      d(n_act_orb,mo_num,n_core_inact_act_orb))

  !$OMP DO
  do l=1,n_core_inact_act_orb
-    bielec_PQxx_no(:,:,:,l) = bielec_PQxx(:,:,:,l)
+    bielec_PQxx_no_array(:,:,:,l) = bielec_PQxx_array(:,:,:,l)

    do k=1,n_core_inact_act_orb
      do j=1,mo_num
        do p=1,n_act_orb
-          f(p,j,k)=bielec_PQxx_no(list_act(p),j,k,l)
+          f(p,j,k)=bielec_PQxx_no_array(list_act(p),j,k,l)
        end do
      end do
    end do
@ -36,13 +44,13 @@
    do k=1,n_core_inact_act_orb
      do j=1,mo_num
        do p=1,n_act_orb
-          bielec_PQxx_no(list_act(p),j,k,l)=d(p,j,k)
+          bielec_PQxx_no_array(list_act(p),j,k,l)=d(p,j,k)
        end do
      end do

      do j=1,mo_num
        do p=1,n_act_orb
-          f(p,j,k)=bielec_PQxx_no(j,list_act(p),k,l)
+          f(p,j,k)=bielec_PQxx_no_array(j,list_act(p),k,l)
        end do
      end do
    end do
@ -54,7 +62,7 @@
    do k=1,n_core_inact_act_orb
      do p=1,n_act_orb
        do j=1,mo_num
-          bielec_PQxx_no(j,list_act(p),k,l)=d(p,j,k)
+          bielec_PQxx_no_array(j,list_act(p),k,l)=d(p,j,k)
        end do
      end do
    end do
@ -71,7 +79,7 @@
    do p=1,n_act_orb
      do k=1,mo_num
        do j=1,mo_num
-          f(j,k,p) = bielec_PQxx_no(j,k,n_core_inact_orb+p,l)
+          f(j,k,p) = bielec_PQxx_no_array(j,k,n_core_inact_orb+p,l)
        end do
      end do
    end do
@ -83,7 +91,7 @@
    do p=1,n_act_orb
      do k=1,mo_num
        do j=1,mo_num
-          bielec_PQxx_no(j,k,n_core_inact_orb+p,l)=d(j,k,p)
+          bielec_PQxx_no_array(j,k,n_core_inact_orb+p,l)=d(j,k,p)
        end do
      end do
    end do
@ -97,7 +105,7 @@
    do p=1,n_act_orb
      do k=1,mo_num
        do j=1,mo_num
-          f(j,k,p) = bielec_PQxx_no(j,k,l,n_core_inact_orb+p)
+          f(j,k,p) = bielec_PQxx_no_array(j,k,l,n_core_inact_orb+p)
        end do
      end do
    end do
@ -109,7 +117,7 @@
    do p=1,n_act_orb
      do k=1,mo_num
        do j=1,mo_num
-          bielec_PQxx_no(j,k,l,n_core_inact_orb+p)=d(j,k,p)
+          bielec_PQxx_no_array(j,k,l,n_core_inact_orb+p)=d(j,k,p)
        end do
      end do
    end do
@ -123,8 +131,12 @@ END_PROVIDER



-BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inact_act_orb, mo_num)]
+BEGIN_PROVIDER [real*8, bielec_PxxQ_no_array, (mo_num,n_core_inact_act_orb,n_core_inact_act_orb, mo_num)]
  BEGIN_DOC
+  ! WARNING !!! Old version !!! NOT USED ANYMORE IN THE PROGRAM !!! TOO BIG TO BE STORED ON LARGE SYSTEMS !!! 
+  ! 
+  ! Replaced by the Cholesky-based function bielec_PxxQ_no
+  !
  ! integral (px|xq) in the basis of natural MOs
  ! indices are unshifted orbital numbers
  END_DOC
@ -132,10 +144,14 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
  integer                        :: i,j,k,l,t,u,p,q
  double precision, allocatable  :: f(:,:,:), d(:,:,:)

+  print*,''
+  print*,'Providing bielec_PxxQ_no_array, WARNING IT CAN BE A VERY BIG ARRAY WHEN MO_NUM IS LARGE !!!'
+  print*,''
+
  !$OMP PARALLEL DEFAULT(NONE) &
  !$OMP PRIVATE(j,k,l,p,d,f) &
  !$OMP SHARED(n_core_inact_act_orb,mo_num,n_act_orb,n_core_inact_orb, &
-  !$OMP   bielec_PxxQ_no,bielec_PxxQ,list_act,natorbsCI)
+  !$OMP   bielec_PxxQ_no_array,bielec_PxxQ_array,list_act,natorbsCI)


  allocate (f(n_act_orb,n_core_inact_act_orb,n_core_inact_act_orb), &
@ -143,11 +159,11 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac

  !$OMP DO
  do j=1,mo_num
-    bielec_PxxQ_no(:,:,:,j) = bielec_PxxQ(:,:,:,j)
+    bielec_PxxQ_no_array(:,:,:,j) = bielec_PxxQ_array(:,:,:,j)
    do l=1,n_core_inact_act_orb
      do k=1,n_core_inact_act_orb
        do p=1,n_act_orb
-            f(p,k,l) = bielec_PxxQ_no(list_act(p),k,l,j)
+            f(p,k,l) = bielec_PxxQ_no_array(list_act(p),k,l,j)
        end do
      end do
    end do
@ -159,7 +175,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
    do l=1,n_core_inact_act_orb
      do k=1,n_core_inact_act_orb
        do p=1,n_act_orb
-          bielec_PxxQ_no(list_act(p),k,l,j)=d(p,k,l)
+          bielec_PxxQ_no_array(list_act(p),k,l,j)=d(p,k,l)
        end do
      end do
    end do
@ -176,7 +192,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
    do l=1,n_core_inact_act_orb
      do j=1,mo_num
        do p=1,n_act_orb
-          f(p,j,l) = bielec_PxxQ_no(j,n_core_inact_orb+p,l,k)
+          f(p,j,l) = bielec_PxxQ_no_array(j,n_core_inact_orb+p,l,k)
        end do
      end do
    end do
@ -188,7 +204,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
    do l=1,n_core_inact_act_orb
      do j=1,mo_num
        do p=1,n_act_orb
-          bielec_PxxQ_no(j,n_core_inact_orb+p,l,k)=d(p,j,l)
+          bielec_PxxQ_no_array(j,n_core_inact_orb+p,l,k)=d(p,j,l)
        end do
      end do
    end do
@ -205,7 +221,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
    do p=1,n_act_orb
      do l=1,n_core_inact_act_orb
        do j=1,mo_num
-          f(j,l,p) = bielec_PxxQ_no(j,l,n_core_inact_orb+p,k)
+          f(j,l,p) = bielec_PxxQ_no_array(j,l,n_core_inact_orb+p,k)
        end do
      end do
    end do
@ -217,7 +233,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
    do p=1,n_act_orb
      do l=1,n_core_inact_act_orb
        do j=1,mo_num
-          bielec_PxxQ_no(j,l,n_core_inact_orb+p,k)=d(j,l,p)
+          bielec_PxxQ_no_array(j,l,n_core_inact_orb+p,k)=d(j,l,p)
        end do
      end do
    end do
@ -231,7 +247,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
    do p=1,n_act_orb
      do k=1,n_core_inact_act_orb
        do j=1,mo_num
-          f(j,k,p) = bielec_PxxQ_no(j,k,l,n_core_inact_orb+p)
+          f(j,k,p) = bielec_PxxQ_no_array(j,k,l,n_core_inact_orb+p)
        end do
      end do
    end do
@ -243,7 +259,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
    do p=1,n_act_orb
      do k=1,n_core_inact_act_orb
        do j=1,mo_num
-          bielec_PxxQ_no(j,k,l,n_core_inact_orb+p)=d(j,k,p)
+          bielec_PxxQ_no_array(j,k,l,n_core_inact_orb+p)=d(j,k,p)
        end do
      end do
    end do
@ -259,11 +275,17 @@ BEGIN_PROVIDER [real*8, bielecCI_no, (n_act_orb,n_act_orb,n_act_orb, mo_num)]
  BEGIN_DOC
  ! integrals (tu|vp) in the basis of natural MOs
  ! index p runs over the whole basis, t,u,v only over the active orbitals
+  ! 
+  ! This array can be stored anyway. Ex: 50 active orbitals, 1500 MOs ==> 8x50^3x1500 = 1.5 Gb
  END_DOC
  implicit none
  integer                        :: i,j,k,l,t,u,p,q
  double precision, allocatable  :: f(:,:,:), d(:,:,:)
 
+  double precision :: wall0, wall1
+  call wall_time(wall0)
+  print*,'Providing bielecCI_no'
+  
  !$OMP PARALLEL DEFAULT(NONE) &
  !$OMP PRIVATE(j,k,l,p,d,f) &
  !$OMP SHARED(n_core_inact_act_orb,mo_num,n_act_orb,n_core_inact_orb, &
@ -363,6 +385,8 @@ BEGIN_PROVIDER [real*8, bielecCI_no, (n_act_orb,n_act_orb,n_act_orb, mo_num)]
  
  deallocate(d,f)
  !$OMP END PARALLEL
+  call wall_time(wall1)
+  print*,'Time to provide bielecCI_no = ',wall1-wall0


 END_PROVIDER
--- a/src/casscf_cipsi/casscf.irp.f
+++ b/src/casscf_cipsi/casscf.irp.f
@ -11,7 +11,7 @@ program casscf
  if(small_active_space)then
   pt2_relative_error = 0.00001
  else
-   thresh_scf = 1.d-4
+   thresh_scf = max(1.d-4,thresh_scf)
   pt2_relative_error = 0.04
  endif
  touch pt2_relative_error 
@ -46,6 +46,11 @@ subroutine run
  do while (.not.converged)
    print*,'pt2_max = ',pt2_max
    call run_stochastic_cipsi(Ev,PT2)
+!    if(act_mos_opt)then DOES NOT WORK
+!     call run_orb_opt_trust_v2
+!     call run_stochastic_cipsi(Ev,PT2)
+!    endif
+    if(.True.)then
     print*,'Ev,PT2',Ev(1),PT2(1)
     E_PT2(1:N_states) = Ev(1:N_states) + PT2(1:N_states)
     energy_old = energy
@ -55,9 +60,9 @@ subroutine run
     call write_time(6)
     call write_int(6,iteration,'CAS-SCF iteration = ')
     call write_double(6,energy,'State-average CAS-SCF energy = ')
-!    if(n_states == 1)then
-!     call ezfio_get_casscf_cipsi_energy_pt2(E_PT2)
-!     call ezfio_get_casscf_cipsi_energy(PT2)
+!!    if(n_states == 1)then
+!!     call ezfio_get_casscf_cipsi_energy_pt2(E_PT2)
+!!     call ezfio_get_casscf_cipsi_energy(PT2)
      double precision :: delta_E_istate, e_av
      e_av = 0.d0
      do istate=1,N_states
@ -68,12 +73,12 @@ subroutine run
       endif
       write(*,'(A6,I2,A18,F16.10)')'state ',istate,' E + PT2 energy = ',E_PT2(istate)
       write(*,'(A6,I2,A18,F16.10)')'state ',istate,'     PT2 energy = ',PT2(istate)
-!      call write_double(6,E_PT2(istate),'E + PT2 energy = ')
-!      call write_double(6,PT2(istate),'  PT2          = ')
+!!      call write_double(6,E_PT2(istate),'E + PT2 energy = ')
+!!      call write_double(6,PT2(istate),'  PT2          = ')
      enddo
      call write_double(6,e_av,'State-average CAS-SCF energy bis = ')
      call write_double(6,pt2_max,' PT2_MAX       = ')
-!    endif
+!!    endif
 
     print*,''
     call write_double(6,norm_grad_vec2,'Norm of gradients = ')
@ -132,8 +137,10 @@ subroutine run
       soft_touch state_following_casscf
      endif
     endif
+    endif
 
  enddo
+  if(.True.)then
     integer :: i
    print*,'Converged CASSCF '
    print*,'--------------------------'
@ -153,6 +160,7 @@ subroutine run
 !   write(*,*)mcscf_fock_alpha_mo(i,i)
  enddo

+ endif

 end

--- a/src/casscf_cipsi/chol_bielec.irp.f
+++ b/src/casscf_cipsi/chol_bielec.irp.f
@ -0,0 +1,248 @@
+
+BEGIN_PROVIDER [double precision, cholesky_no_1_idx_transp, (cholesky_mo_num, n_act_orb, mo_num)]
+ BEGIN_DOC
+ ! Cholesky vectors with ONE orbital on the active natural orbital basis 
+ END_DOC
+ implicit none
+ integer :: i_chol,i_act,i_mo,jj_act
+ double precision, allocatable :: chol_tmp(:,:)
+ double precision :: wall0,wall1
+ call wall_time(wall0)
+ print*,'Providing cholesky_no_1_idx_transp'
+ allocate(chol_tmp(cholesky_mo_num,n_act_orb))
+ cholesky_no_1_idx_transp = 0.D0
+ do i_mo = 1, mo_num
+  ! Get all the integrals corresponding to the "i_mo"
+  do i_act = 1, n_act_orb
+   jj_act = list_act(i_act)
+   do i_chol = 1, cholesky_mo_num
+    chol_tmp(i_chol, i_act) = cholesky_mo_transp(i_chol, jj_act, i_mo)
+   enddo
+  enddo
+  call dgemm('N','N',cholesky_mo_num,n_act_orb,n_act_orb,1.d0,  &
+        chol_tmp, size(chol_tmp,1),                              &
+        natorbsCI, size(natorbsCI,1),                                              &
+        0.d0,                                                      &
+        cholesky_no_1_idx_transp(1,1,i_mo), size(cholesky_no_1_idx_transp,1))
+ enddo
+ call wall_time(wall1)
+ print*,'Time to provide cholesky_no_1_idx_transp = ', wall1 - wall0
+
+END_PROVIDER 
+
+BEGIN_PROVIDER [double precision, cholesky_no_2_idx_transp, (cholesky_mo_num, n_act_orb, n_act_orb)]
+ BEGIN_DOC
+ ! Cholesky vectors with TWO orbital on the active natural orbital basis 
+ END_DOC
+ implicit none
+ integer :: i_chol,i_act,j_act,jj_act
+ double precision, allocatable :: chol_tmp(:,:),chol_tmp_bis(:,:)
+ allocate(chol_tmp(cholesky_mo_num,n_act_orb),chol_tmp_bis(cholesky_mo_num,n_act_orb))
+ double precision :: wall0,wall1
+ call wall_time(wall0)
+ print*,'Providing cholesky_no_2_idx_transp'
+ cholesky_no_2_idx_transp = 0.D0
+ do i_act = 1, n_act_orb
+  ! Get all the integrals corresponding to the "j_act"
+  do j_act = 1, n_act_orb
+   jj_act = list_act(j_act)
+   do i_chol = 1, cholesky_mo_num
+    chol_tmp(i_chol, j_act) = cholesky_no_1_idx_transp(i_chol, i_act, jj_act)
+   enddo
+  enddo
+  call dgemm('N','N',cholesky_mo_num,n_act_orb,n_act_orb,1.d0,  &
+        chol_tmp, size(chol_tmp,1),                              &
+        natorbsCI, size(natorbsCI,1),                                              &
+        0.d0,                                                      &
+        cholesky_no_2_idx_transp(1,1,i_act), size(cholesky_no_2_idx_transp,1))
+ enddo
+ call wall_time(wall1)
+ print*,'Time to provide  cholesky_no_2_idx_transp = ', wall1 - wall0
+
+END_PROVIDER 
+
+BEGIN_PROVIDER [ double precision, cholesky_no_total_transp, (cholesky_mo_num, mo_num, mo_num)]
+ implicit none
+ BEGIN_DOC
+ ! Cholesky vectors defined on all basis including the NO basis
+ END_DOC
+ integer :: i_chol, i_act, ii_act, j_act, jj_act, i_core_inact, j_core_inact, ii_core_inact, jj_core_inact
+ integer :: i_virt, ii_virt, j_virt, jj_virt
+ double precision :: wall0,wall1
+ call wall_time(wall0)
+ print*,'Providing cholesky_no_total_transp '
+ ! Block when two orbitals belong to the core/inact 
+ do j_core_inact = 1, n_core_inact_orb
+  jj_core_inact = list_core_inact(j_core_inact)
+  do i_core_inact = 1, n_core_inact_orb
+   ii_core_inact = list_core_inact(i_core_inact)
+   do i_chol = 1, cholesky_mo_num
+    cholesky_no_total_transp(i_chol, ii_core_inact, jj_core_inact) = cholesky_mo_transp(i_chol,ii_core_inact,jj_core_inact)
+   enddo
+  enddo
+ enddo
+
+ ! Block when one orbitals belongs to the core/inact and one belongs to the active
+ do j_core_inact = 1, n_core_inact_orb
+  jj_core_inact = list_core_inact(j_core_inact)
+  do i_act = 1, n_act_orb
+   ii_act = list_act(i_act)
+   do i_chol = 1, cholesky_mo_num 
+    cholesky_no_total_transp(i_chol,ii_act,j_core_inact) = cholesky_no_1_idx_transp(i_chol,i_act,jj_core_inact)
+   enddo
+  enddo
+ enddo
+ do j_core_inact = 1, n_core_inact_orb
+  jj_core_inact = list_core_inact(j_core_inact)
+  do i_act = 1, n_act_orb
+   ii_act = list_act(i_act)
+   do i_chol = 1, cholesky_mo_num 
+    cholesky_no_total_transp(i_chol,j_core_inact,ii_act) = cholesky_no_1_idx_transp(i_chol,i_act,jj_core_inact)
+   enddo
+  enddo
+ enddo
+ 
+ ! Block when two orbitals belong to the active 
+ do j_act = 1, n_act_orb
+  jj_act = list_act(j_act)
+  do i_act = 1, n_act_orb
+   ii_act = list_act(i_act)
+   do i_chol = 1, cholesky_mo_num 
+    cholesky_no_total_transp(i_chol,ii_act,jj_act) = cholesky_no_2_idx_transp(i_chol,i_act,j_act)
+   enddo
+  enddo
+ enddo
+
+ ! Block when two orbitals belong to the virtuals 
+ do i_virt = 1, n_virt_orb
+  ii_virt = list_virt(i_virt)
+  do j_virt = 1, n_virt_orb
+   jj_virt = list_virt(j_virt)
+   do i_chol = 1, cholesky_mo_num
+    cholesky_no_total_transp(i_chol,jj_virt,ii_virt) = cholesky_mo_transp(i_chol,jj_virt,ii_virt)
+   enddo
+  enddo
+ enddo
+ 
+ ! Block when one orbital is in active and the other in the virtuals 
+ do i_virt = 1, n_virt_orb
+  ii_virt = list_virt(i_virt)
+  do i_act = 1, n_act_orb
+   ii_act = list_act(i_act)
+   do i_chol = 1, cholesky_mo_num
+    cholesky_no_total_transp(i_chol,ii_act,ii_virt) = cholesky_no_1_idx_transp(i_chol, i_act,ii_virt)
+   enddo
+  enddo
+ enddo
+ do i_virt = 1, n_virt_orb
+  ii_virt = list_virt(i_virt)
+  do i_act = 1, n_act_orb
+   ii_act = list_act(i_act)
+   do i_chol = 1, cholesky_mo_num
+    cholesky_no_total_transp(i_chol,ii_virt,ii_act) = cholesky_no_1_idx_transp(i_chol, i_act,ii_virt)
+   enddo
+  enddo
+ enddo
+ ! Block when one orbital is in the virtual and one in the core-inact 
+ do i_virt = 1, n_virt_orb
+  ii_virt = list_virt(i_virt)
+  do i_core_inact = 1, n_core_inact_orb
+   ii_core_inact = list_core_inact(i_core_inact)
+   do i_chol = 1, cholesky_mo_num
+    cholesky_no_total_transp(i_chol, ii_core_inact, ii_virt) = cholesky_mo_transp(i_chol, ii_core_inact, ii_virt)
+   enddo
+  enddo
+ enddo
+ do i_core_inact = 1, n_core_inact_orb
+  ii_core_inact = list_core_inact(i_core_inact)
+  do i_virt = 1, n_virt_orb
+   ii_virt = list_virt(i_virt)
+   do i_chol = 1, cholesky_mo_num
+    cholesky_no_total_transp(i_chol, ii_virt, ii_core_inact) = cholesky_mo_transp(i_chol, ii_virt, ii_core_inact)
+   enddo
+  enddo
+ enddo
+
+ call wall_time(wall1)
+ print*,'Time to provide cholesky_no_total_transp = ', wall1 - wall0
+END_PROVIDER 
+
+
+double precision function bielec_no_basis(i_1,j_1,i_2,j_2)
+ implicit none
+ integer, intent(in) :: i_1,j_1,i_2,j_2
+  BEGIN_DOC
+  ! integral (i_1 j_1|i_2 j_2) in the mixed basis of both MOs and natural MOs
+  ! 
+  END_DOC
+  integer :: i 
+ bielec_no_basis = 0.d0
+ do i = 1, cholesky_mo_num
+  bielec_no_basis += cholesky_no_total_transp(i,i_1, j_1) * cholesky_no_total_transp(i,i_2,j_2)
+ enddo
+end
+
+double precision function bielec_PQxx_no(i_mo, j_mo, i_ca, j_ca)
+ implicit none
+ BEGIN_DOC
+ ! function that computes (i_mo j_mo| i_ca j_ca) with Cholesky decomposition  on the NO basis for active orbitals 
+ ! 
+ ! where i_ca, j_ca are in [1:n_core_inact_act_orb]
+ END_DOC
+ integer, intent(in) :: i_ca, j_ca, i_mo, j_mo
+ integer :: ii_ca, jj_ca
+ double precision :: bielec_no_basis
+ ii_ca = list_core_inact_act(i_ca)
+ jj_ca = list_core_inact_act(j_ca)
+ bielec_PQxx_no = bielec_no_basis(i_mo,j_mo,ii_ca,jj_ca)
+end
+
+double precision function bielec_PxxQ_no(i_mo, j_ca, i_ca, j_mo)
+ implicit none 
+  BEGIN_DOC
+  ! function that computes (i_mo j_ca |i_ca j_mo) with Cholesky decomposition  on the NO basis for active orbitals 
+  ! 
+  ! where i_ca, j_ca are in [1:n_core_inact_act_orb]
+  END_DOC
+ integer, intent(in) :: i_ca, j_ca, i_mo, j_mo
+ integer :: ii_ca, jj_ca
+ double precision :: bielec_no_basis
+ ii_ca = list_core_inact_act(i_ca)
+ jj_ca = list_core_inact_act(j_ca)
+ bielec_PxxQ_no = bielec_no_basis(i_mo, jj_ca, ii_ca, j_mo)
+
+end
+
+
+double precision function bielec_PQxx(i_mo, j_mo, i_ca, j_ca)
+  BEGIN_DOC
+  ! function that computes (i_mo j_mo |i_ca j_ca) with Cholesky decomposition 
+  ! 
+  ! indices are unshifted orbital numbers
+  ! 
+  ! where i_ca, j_ca are in [1:n_core_inact_act_orb]
+  END_DOC
+ implicit none 
+ integer, intent(in) :: i_ca, j_ca, j_mo, i_mo
+ double precision :: mo_two_e_integral
+ integer :: ii_ca, jj_ca
+ ii_ca = list_core_inact_act(i_ca)
+ jj_ca = list_core_inact_act(j_ca)
+ bielec_PQxx = mo_two_e_integral(i_mo,ii_ca,j_mo,jj_ca)
+end
+
+double precision function bielec_PxxQ(i_mo, i_ca, j_ca, j_mo)
+  BEGIN_DOC
+  ! function that computes (i_mo j_mo |i_ca j_ca) with Cholesky decomposition 
+  ! 
+  ! where i_ca, j_ca are in [1:n_core_inact_act_orb]
+  END_DOC
+ implicit none
+ integer, intent(in) :: i_ca, j_ca, j_mo, i_mo
+ double precision :: mo_two_e_integral
+ integer :: ii_ca, jj_ca
+ ii_ca = list_core_inact_act(i_ca)
+ jj_ca = list_core_inact_act(j_ca)
+ bielec_PxxQ = mo_two_e_integral(i_mo,jj_ca,ii_ca,j_mo)
+end
+
--- a/src/casscf_cipsi/chol_garb.irp.f
+++ b/src/casscf_cipsi/chol_garb.irp.f
@ -0,0 +1,34 @@
+
+!!!!! FUNCTIONS THAT WORK BUT WHICH ARE USELESS AS THE ARRAYS CAN ALWAYS BE STORED
+!double precision function bielecCI_chol(i_a, j_a, k_a, i_mo)
+!  BEGIN_DOC
+!  ! function that computes (i_a j_a |k_a j_mo) with Cholesky decomposition 
+!  ! 
+!  ! where i_a, j_a, k_a are in [1:n_act_orb] !!! ONLY ON ACTIVE 
+!  END_DOC
+! implicit none
+! integer, intent(in) :: i_a, j_a, k_a, i_mo
+! integer :: ii_a, jj_a, kk_a
+! double precision :: mo_two_e_integral
+! ii_a = list_act(i_a)
+! jj_a = list_act(j_a)
+! kk_a = list_act(k_a)
+! bielecCI_chol = mo_two_e_integral(ii_a,kk_a,jj_a,i_mo)
+!end
+
+!double precision function bielecCI_no_chol(i_ca, j_ca, k_ca, i_mo)
+!  BEGIN_DOC
+!  ! function that computes (i_ca j_ca |k_ca j_mo) with Cholesky decomposition on the NO basis for active orbitals 
+!  ! 
+!  ! where i_ca, j_ca, k_ca are in [1:n_core_inact_act_orb]
+!  END_DOC
+! implicit none 
+! integer, intent(in) :: i_ca, j_ca, k_ca, i_mo
+! integer :: ii_ca, jj_ca, kk_ca
+! double precision :: bielec_no_basis_chol
+! ii_ca = list_act(i_ca)
+! jj_ca = list_act(j_ca)
+! kk_ca = list_act(k_ca)
+! bielecCI_no_chol = bielec_no_basis_chol(ii_ca, jj_ca, kk_ca, i_mo)
+!
+!end
--- a/src/casscf_cipsi/gradient.irp.f
+++ b/src/casscf_cipsi/gradient.irp.f
@ -157,6 +157,7 @@ real*8 function gradvec_it(i,t)
  
  integer                        :: ii,tt,v,vv,x,y
  integer                        :: x3,y3
+  double precision :: bielec_PQxx_no
  
  ii=list_core_inact(i)
  tt=list_act(t)
--- a/src/casscf_cipsi/hessian.irp.f
+++ b/src/casscf_cipsi/hessian.irp.f
@ -10,6 +10,7 @@ real*8 function hessmat_itju(i,t,j,u)
  implicit none
  integer                        :: i,t,j,u,ii,tt,uu,v,vv,x,xx,y,jj
  real*8                         :: term,t2
+  double precision :: bielec_pqxx_no,bielec_pxxq_no
  
  ii=list_core_inact(i)
  tt=list_act(t)
@ -95,6 +96,7 @@ real*8 function hessmat_itja(i,t,j,a)
  implicit none
  integer                        :: i,t,j,a,ii,tt,jj,aa,v,vv,x,y
  real*8                         :: term
+  double precision :: bielec_pqxx_no,bielec_pxxq_no
  
  ! it/ja
  ii=list_core_inact(i)
@ -128,6 +130,7 @@ real*8 function hessmat_itua(i,t,u,a)
  implicit none
  integer                        :: i,t,u,a,ii,tt,uu,aa,v,vv,x,xx,u3,t3,v3
  real*8                         :: term
+  double precision :: bielec_pqxx_no,bielec_pxxq_no
  
  ii=list_core_inact(i)
  tt=list_act(t)
@ -169,6 +172,7 @@ real*8 function hessmat_iajb(i,a,j,b)
  implicit none
  integer                        :: i,a,j,b,ii,aa,jj,bb
  real*8                         :: term
+  double precision :: bielec_pqxx_no,bielec_pxxq_no
  
  ii=list_core_inact(i)
  aa=list_virt(a)
@ -205,6 +209,7 @@ real*8 function hessmat_iatb(i,a,t,b)
  implicit none
  integer                        :: i,a,t,b,ii,aa,tt,bb,v,vv,x,y,v3,t3
  real*8                         :: term
+  double precision :: bielec_pqxx_no,bielec_pxxq_no
  
  ii=list_core_inact(i)
  aa=list_virt(a)
@ -237,6 +242,7 @@ real*8 function hessmat_taub(t,a,u,b)
  integer                        :: t,a,u,b,tt,aa,uu,bb,v,vv,x,xx,y
  integer                        :: v3,x3
  real*8                         :: term,t1,t2,t3
+  double precision :: bielec_pqxx_no,bielec_pxxq_no
  
  tt=list_act(t)
  aa=list_virt(a)
--- a/src/casscf_cipsi/mcscf_fock.irp.f
+++ b/src/casscf_cipsi/mcscf_fock.irp.f
@ -4,6 +4,7 @@ BEGIN_PROVIDER [real*8, Fipq, (mo_num,mo_num) ]
   END_DOC
   implicit none
   integer                        :: p,q,k,kk,t,tt,u,uu
+   double precision :: bielec_pxxq_no, bielec_pqxx_no
   
   do q=1,mo_num
     do p=1,mo_num
@ -44,6 +45,7 @@ BEGIN_PROVIDER [real*8, Fapq, (mo_num,mo_num) ]
   END_DOC
   implicit none
   integer                        :: p,q,k,kk,t,tt,u,uu
+   double precision :: bielec_pxxq_no, bielec_pqxx_no
   
   Fapq = 0.d0
   
--- a/src/casscf_cipsi/test_chol.irp.f
+++ b/src/casscf_cipsi/test_chol.irp.f
@ -0,0 +1,116 @@
+program test_chol
+ implicit none
+ read_wf= .True.
+ touch read_wf 
+! call routine_bielec_PxxQ_no
+! call routine_bielecCI_no
+! call test_bielec_PxxQ_chol
+! call test_bielecCI
+
+end
+
+subroutine routine_bielec_PQxx_no
+ implicit none
+ integer :: i_chol, i_act, ii_act, j_act, jj_act, i_core_inact, j_core_inact, ii_core_inact, jj_core_inact
+ integer :: i_virt, ii_virt, j_virt, jj_virt, i_mo, j_mo
+ double precision :: exact, new, error, accu, bielec_no_basis_chol
+ double precision :: bielec_PQxx_no
+ 
+ accu = 0.d0
+ do i_core_inact = 1, n_core_inact_act_orb
+  ii_core_inact = list_core_inact_act(i_core_inact)
+  do j_core_inact = 1, n_core_inact_act_orb
+   jj_core_inact = list_core_inact_act(j_core_inact)
+   do i_mo = 1, mo_num
+    do j_mo = 1, mo_num
+     exact = bielec_PQxx_no_array(j_mo,i_mo, j_core_inact, i_core_inact) 
+     new   = bielec_PQxx_no(j_mo,i_mo, j_core_inact, i_core_inact) 
+     error = dabs(exact-new)
+     if(dabs(exact).gt.1.d-10)then
+      print*,exact,new,error
+     endif
+     accu += error
+    enddo
+   enddo
+  enddo
+ enddo
+ print*,'accu = ',accu/(dble(mo_num*mo_num*n_core_inact_act_orb**2))
+end
+
+subroutine routine_bielec_PxxQ_no_array
+ implicit none
+ integer :: i_chol, i_act, ii_act, j_act, jj_act, i_core_inact, j_core_inact, ii_core_inact, jj_core_inact
+ integer :: i_virt, ii_virt, j_virt, jj_virt, i_mo, j_mo
+ double precision :: exact, new, error, accu, bielec_no_basis_chol
+ double precision :: bielec_PxxQ_no
+ 
+ accu = 0.d0
+ do i_mo = 1, mo_num
+  do i_core_inact = 1, n_core_inact_act_orb
+  ii_core_inact = list_core_inact_act(i_core_inact)
+   do j_core_inact = 1, n_core_inact_act_orb
+   jj_core_inact = list_core_inact_act(j_core_inact)
+    do j_mo = 1, mo_num
+     exact = bielec_PxxQ_no_array(j_mo, j_core_inact,  i_core_inact,i_mo) 
+!     new   = bielec_no_basis_chol(j_mo,i_mo, jj_core_inact, ii_core_inact) 
+     new   = bielec_PxxQ_no(j_mo, j_core_inact,  i_core_inact,i_mo) 
+     error = dabs(exact-new)
+     accu += error
+     if(dabs(exact).gt.1.d-10)then
+      print*,exact,new,error
+     endif
+    enddo
+   enddo
+  enddo
+ enddo
+ print*,'accu = ',accu/(dble(mo_num*mo_num*n_core_inact_act_orb**2))
+end
+
+subroutine test_bielec_PQxx(i_mo, j_mo, i_ca, j_ca)
+ implicit none
+ integer :: i_mo, j_mo, i_ca, j_ca 
+ double precision :: exact, new, error, accu
+ double precision :: bielec_PQxx
+ 
+ accu = 0.d0
+ do j_ca = 1, n_core_inact_act_orb
+  do i_ca = 1, n_core_inact_act_orb
+   do j_mo = 1, mo_num
+    do i_mo = 1, mo_num
+     exact = bielec_PQxx_array(i_mo, j_mo, i_ca, j_ca)
+     new   = bielec_PQxx(i_mo, j_mo, i_ca, j_ca)
+     error = dabs(exact-new)
+     accu += error
+     if(dabs(exact).gt.1.d-10)then
+      print*,exact,new,error
+     endif
+    enddo
+   enddo
+  enddo
+ enddo
+ print*,'accu = ',accu/(dble(mo_num*mo_num*n_core_inact_act_orb**2))
+end
+
+subroutine test_bielec_PxxQ_chol(i_mo, i_ca, j_ca, j_mo)
+ implicit none
+ integer :: i_mo, i_ca, j_ca, j_mo
+ double precision :: exact, new, error, accu
+ double precision :: bielec_PxxQ
+ accu = 0.d0
+ do j_mo = 1, mo_num
+  do j_ca = 1, n_core_inact_act_orb
+   do i_ca =1, n_core_inact_act_orb
+    do i_mo = 1, mo_num
+     exact = bielec_PxxQ_array(i_mo, i_ca, j_ca, j_mo)
+     new   = bielec_PxxQ(i_mo, i_ca, j_ca, j_mo)
+     error = dabs(exact-new)
+     accu += error
+     if(dabs(exact).gt.1.d-10)then
+      print*,exact,new,error
+     endif
+    enddo
+   enddo
+  enddo
+ enddo
+ print*,'accu = ',accu/(dble(mo_num*mo_num*n_core_inact_act_orb**2))
+end
--- a/src/casscf_cipsi/tot_en.irp.f
+++ b/src/casscf_cipsi/tot_en.irp.f
@ -8,6 +8,7 @@
   implicit none
   integer                        :: t,u,v,x,i,ii,tt,uu,vv,xx,j,jj,t3,u3,v3,x3
   real*8                         :: e_one_all,e_two_all
+   double precision :: bielec_PQxx,bielec_PxxQ
   e_one_all=0.D0
   e_two_all=0.D0
   do i=1,n_core_inact_orb
--- a/src/ccsd/NEED
+++ b/src/ccsd/NEED
@ -1,2 +1,3 @@
+gpu
 hartree_fock
 utils_cc
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@ -1,4 +1,5 @@
 subroutine run_ccsd_space_orb
+  use gpu

  implicit none

@ -9,9 +10,19 @@ subroutine run_ccsd_space_orb
  double precision :: uncorr_energy,energy, max_elem, max_r, max_r1, max_r2,ta,tb
  logical :: not_converged

-  double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:), tau(:,:,:,:), tau_x(:,:,:,:)
-  double precision, allocatable :: t1(:,:), r1(:,:)
-  double precision, allocatable :: H_oo(:,:), H_vv(:,:), H_vo(:,:)
+  type(gpu_double4) :: t2, r2, tau, tau_x
+  type(gpu_double2) :: t1, r1
+  type(gpu_double2) :: H_oo, H_vv, H_vo
+
+  type(gpu_double2) :: d_cc_space_f_oo, d_cc_space_f_vo
+  type(gpu_double2) :: d_cc_space_f_ov, d_cc_space_f_vv
+
+  type(gpu_double3) :: d_cc_space_v_oo_chol, d_cc_space_v_vo_chol
+  type(gpu_double3) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol
+
+  type(gpu_double4) :: d_cc_space_v_oovv, d_cc_space_v_voov, d_cc_space_v_ovov
+  type(gpu_double4) :: d_cc_space_v_oovo, d_cc_space_v_vooo, d_cc_space_v_oooo
+  type(gpu_double4) :: d_cc_space_v_vvoo, d_cc_space_v_ovvo, d_cc_space_v_ovoo

  double precision, allocatable :: all_err(:,:), all_t(:,:)
  integer, allocatable          :: list_occ(:), list_vir(:)
@ -20,7 +31,7 @@ subroutine run_ccsd_space_orb

  call set_multiple_levels_omp(.False.)

-  if (do_ao_cholesky) then
+  if (do_mo_cholesky) then
    PROVIDE cholesky_mo_transp
    FREE cholesky_ao
  else
@ -51,11 +62,77 @@ subroutine run_ccsd_space_orb
  !print*,'occ',list_occ
  !print*,'vir',list_vir

-  allocate(t2(nO,nO,nV,nV), r2(nO,nO,nV,nV))
-  allocate(tau(nO,nO,nV,nV))
-  allocate(tau_x(nO,nO,nV,nV))
-  allocate(t1(nO,nV), r1(nO,nV))
-  allocate(H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO))
+  ! GPU arrays
+  call gpu_allocate(d_cc_space_f_oo, nO, nO)
+  call gpu_allocate(d_cc_space_f_vo, nV, nO)
+  call gpu_allocate(d_cc_space_f_ov, nO, nV)
+  call gpu_allocate(d_cc_space_f_vv, nV, nV)
+
+  call gpu_upload(cc_space_f_oo, d_cc_space_f_oo)
+  call gpu_upload(cc_space_f_vo, d_cc_space_f_vo)
+  call gpu_upload(cc_space_f_ov, d_cc_space_f_ov)
+  call gpu_upload(cc_space_f_vv, d_cc_space_f_vv)
+
+!  FREE cc_space_f_oo
+!  FREE cc_space_f_vo
+!  FREE cc_space_f_vv
+
+  if (do_mo_cholesky) then
+    call gpu_allocate(d_cc_space_v_oo_chol, cholesky_mo_num, nO, nO)
+    call gpu_allocate(d_cc_space_v_ov_chol, cholesky_mo_num, nO, nV)
+    call gpu_allocate(d_cc_space_v_vo_chol, cholesky_mo_num, nV, nO)
+    call gpu_allocate(d_cc_space_v_vv_chol, cholesky_mo_num, nV, nV)
+
+    call gpu_upload(cc_space_v_oo_chol, d_cc_space_v_oo_chol)
+    call gpu_upload(cc_space_v_ov_chol, d_cc_space_v_ov_chol)
+    call gpu_upload(cc_space_v_vo_chol, d_cc_space_v_vo_chol)
+    call gpu_upload(cc_space_v_vv_chol, d_cc_space_v_vv_chol)
+
+!    FREE cc_space_v_oo_chol
+!    FREE cc_space_v_ov_chol
+!    FREE cc_space_v_vo_chol
+!    FREE cc_space_v_vv_chol
+  endif
+
+  call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV)
+  call gpu_allocate(d_cc_space_v_voov, nV, nO, nO, nV)
+  call gpu_allocate(d_cc_space_v_ovov, nO, nV, nO, nV)
+  call gpu_allocate(d_cc_space_v_oovo, nO, nO, nV, nO)
+  call gpu_allocate(d_cc_space_v_ovvo, nO, nV, nV, nO)
+  call gpu_allocate(d_cc_space_v_vooo, nV, nO, nO, nO)
+  call gpu_allocate(d_cc_space_v_oooo, nO, nO, nO, nO)
+  call gpu_allocate(d_cc_space_v_vvoo, nV, nV, nO, nO)
+  call gpu_allocate(d_cc_space_v_ovoo, nO, nV, nO, nO)
+
+  call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv)
+  call gpu_upload(cc_space_v_voov, d_cc_space_v_voov)
+  call gpu_upload(cc_space_v_ovov, d_cc_space_v_ovov)
+  call gpu_upload(cc_space_v_oovo, d_cc_space_v_oovo)
+  call gpu_upload(cc_space_v_ovvo, d_cc_space_v_ovvo)
+  call gpu_upload(cc_space_v_vooo, d_cc_space_v_vooo)
+  call gpu_upload(cc_space_v_oooo, d_cc_space_v_oooo)
+  call gpu_upload(cc_space_v_vvoo, d_cc_space_v_vvoo)
+  call gpu_upload(cc_space_v_ovoo, d_cc_space_v_ovoo)
+
+!  FREE cc_space_v_voov
+!  FREE cc_space_v_ovov
+!  FREE cc_space_v_oovo
+!  FREE cc_space_v_oovv
+!  FREE cc_space_v_vooo
+!  FREE cc_space_v_oooo
+!  FREE cc_space_v_vvoo
+!  FREE cc_space_v_ovvo
+!  FREE cc_space_v_ovoo
+
+  call gpu_allocate(t2, nO,nO,nV,nV)
+  call gpu_allocate(r2, nO,nO,nV,nV)
+  call gpu_allocate(tau, nO,nO,nV,nV)
+  call gpu_allocate(tau_x, nO,nO,nV,nV)
+  call gpu_allocate(t1, nO,nV)
+  call gpu_allocate(r1, nO,nV)
+  call gpu_allocate(H_oo, nO, nO)
+  call gpu_allocate(H_vo, nV, nO)
+  call gpu_allocate(H_vv, nV, nV)

  if (cc_update_method == 'diis') then
    double precision :: rss, diis_mem, extra_mem
@ -97,14 +174,22 @@ subroutine run_ccsd_space_orb
  endif

  ! Init
-  call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,t1)
-  call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,t2)
-  call update_tau_space(nO,nV,t1,t2,tau)
+  double precision, allocatable :: h_t1(:,:), h_t2(:,:,:,:)
+  allocate(h_t1(nO,nV), h_t2(nO,nO,nV,nV))
+
+  call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,h_t1)
+  call gpu_upload(h_t1, t1)
+
+  call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,h_t2)
+  call gpu_upload(h_t2, t2)
+
+
+  call update_tau_space(nO,nV,h_t1,t1,t2,tau)
  call update_tau_x_space(nO,nV,tau,tau_x)
-  !print*,'hf_energy', hf_energy
  call det_energy(det,uncorr_energy)
  print*,'Det energy', uncorr_energy
-  call ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
+
+  call ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy)
  print*,'Guess energy', uncorr_energy+energy, energy

  nb_iter = 0
@ -120,43 +205,45 @@ subroutine run_ccsd_space_orb
  do while (not_converged)

    ! Residue
-    if (do_ao_cholesky) then
-!    if (.False.) then
-      call compute_H_oo_chol(nO,nV,tau_x,H_oo)
-      call compute_H_vv_chol(nO,nV,tau_x,H_vv)
-      call compute_H_vo_chol(nO,nV,t1,H_vo)
+    if (do_mo_cholesky) then
+      call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo)
+      call compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, d_cc_space_v_ov_chol,H_vv)
+      call compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol, H_vo)

-      call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
-      call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
+      call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_space_f_ov,d_cc_space_f_vo, &
+           d_cc_space_v_voov, d_cc_space_v_ovov, d_cc_space_v_oovo, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol)
+      call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
+           d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, d_cc_space_v_oovo, d_cc_space_v_ovvo, d_cc_space_v_ovoo, &
+           d_cc_space_v_ovov, d_cc_space_v_vvoo, d_cc_space_v_oo_chol, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, &
+           d_cc_space_f_vo, &
+           r2, max_r2)
    else
-      call compute_H_oo(nO,nV,t1,t2,tau,H_oo)
-      call compute_H_vv(nO,nV,t1,t2,tau,H_vv)
-      call compute_H_vo(nO,nV,t1,t2,H_vo)
+      call compute_H_oo(nO,nV,t1%f,t2%f,tau%f,H_oo%f)
+      call compute_H_vv(nO,nV,t1%f,t2%f,tau%f,H_vv%f)
+      call compute_H_vo(nO,nV,t1%f,t2%f,H_vo%f)

-      call compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
-      call compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
+      call compute_r1_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r1%f,max_r1)
+      call compute_r2_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r2%f,max_r2)
    endif
    max_r = max(max_r1,max_r2)

    ! Update
    if (cc_update_method == 'diis') then
-      !call update_t_ccsd(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2)
-      !call update_t_ccsd_diis(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2)
-      call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1,r2,t1,t2,all_err,all_t)
+      call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1%f,r2%f,t1%f,t2%f,all_err,all_t)

    ! Standard update as T = T - Delta
    elseif (cc_update_method == 'none') then
-      call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1,t1)
-      call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2,t2)
+      call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1%f,t1%f)
+      call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2%f,t2%f)
    else
      print*,'Unkown cc_method_method: '//cc_update_method
    endif

-    call update_tau_space(nO,nV,t1,t2,tau)
+    call update_tau_space(nO,nV,t1%f,t1,t2,tau)
    call update_tau_x_space(nO,nV,tau,tau_x)

    ! Energy
-    call ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
+    call ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy)
    write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |'

    nb_iter = nb_iter + 1
@ -181,8 +268,8 @@ subroutine run_ccsd_space_orb
  print*,''

  if (write_amplitudes) then
-    call write_t1(nO,nV,t1)
-    call write_t2(nO,nV,t2)
+    call write_t1(nO,nV,t1%f)
+    call write_t2(nO,nV,t2%f)
    call ezfio_set_utils_cc_io_amplitudes('Read')
  endif

@ -191,7 +278,14 @@ subroutine run_ccsd_space_orb
    deallocate(all_err,all_t)
  endif

-  deallocate(H_vv,H_oo,H_vo,r1,r2,tau)
+  call gpu_deallocate(H_oo)
+  call gpu_deallocate(H_vv)
+  call gpu_deallocate(H_vo)
+
+  call gpu_deallocate(r1)
+  call gpu_deallocate(r2)
+  call gpu_deallocate(tau)
+  call gpu_deallocate(tau_x)

  ! CCSD(T)
  double precision :: e_t, e_t_err
@ -199,28 +293,14 @@ subroutine run_ccsd_space_orb

  if (cc_par_t .and. elec_alpha_num + elec_beta_num > 2) then

-    ! Dumb way
-    !call wall_time(ta)
-    !call ccsd_par_t_space(nO,nV,t1,t2,e_t)
-    !call wall_time(tb)
-    !print*,'Time: ',tb-ta, ' s'
-
-    !print*,''
-    !write(*,'(A15,F18.12,A3)') ' E(CCSD(T))  = ', uncorr_energy + energy + e_t, ' Ha'
-    !write(*,'(A15,F18.12,A3)') ' E(T)        = ', e_t, ' Ha'
-    !write(*,'(A15,F18.12,A3)') ' Correlation = ', energy + e_t, ' Ha'
-    !print*,''
-
    ! New
    e_t = uncorr_energy + energy ! For print in (T) call
    e_t_err = 0.d0

    print*,'Computing (T) correction...'
    call wall_time(ta)
-!    call ccsd_par_t_space_v3(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v &
-!         ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t)

-    call ccsd_par_t_space_stoch(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v &
+    call ccsd_par_t_space_stoch(nO,nV,t1%f,t2%f,cc_space_f_o,cc_space_f_v &
         ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t, e_t_err)

    call wall_time(tb)
@ -235,168 +315,161 @@ subroutine run_ccsd_space_orb

  call save_energy(uncorr_energy + energy, e_t)

-  deallocate(t1,t2)
+  deallocate(h_t1, h_t2)
+  if (do_mo_cholesky) then
+    call gpu_deallocate(d_cc_space_v_oo_chol)
+    call gpu_deallocate(d_cc_space_v_ov_chol)
+    call gpu_deallocate(d_cc_space_v_vo_chol)
+    call gpu_deallocate(d_cc_space_v_vv_chol)
+  endif
+
+  call gpu_deallocate(d_cc_space_v_oovv)
+  call gpu_deallocate(d_cc_space_v_voov)
+  call gpu_deallocate(d_cc_space_v_ovov)
+  call gpu_deallocate(d_cc_space_v_oovo)
+  call gpu_deallocate(d_cc_space_v_ovvo)
+  call gpu_deallocate(d_cc_space_v_vooo)
+  call gpu_deallocate(d_cc_space_v_oooo)
+  call gpu_deallocate(d_cc_space_v_vvoo)
+  call gpu_deallocate(d_cc_space_v_ovoo)
+
+  call gpu_deallocate(d_cc_space_f_oo)
+  call gpu_deallocate(d_cc_space_f_vo)
+  call gpu_deallocate(d_cc_space_f_ov)
+  call gpu_deallocate(d_cc_space_f_vv)
+
+  call gpu_deallocate(t1)
+  call gpu_deallocate(t2)

 end

 ! Energy

-subroutine ccsd_energy_space(nO,nV,tau,t1,energy)
-
+subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy)
+  use gpu
  implicit none

  integer, intent(in)            :: nO, nV
-  double precision, intent(in)  :: tau(nO,nO,nV,nV)
-  double precision, intent(in)  :: t1(nO,nV)
+  type(gpu_double4), intent(in)  :: tau_x, d_cc_space_v_oovv
+  type(gpu_double2), intent(in)  :: t1, d_cc_space_f_vo
  double precision, intent(out)  :: energy

  ! internal
  integer :: i,j,a,b
  double precision :: e

-  energy = 0d0
-  !$omp parallel &
-  !$omp shared(nO,nV,energy,tau,t1,&
-  !$omp cc_space_f_vo,cc_space_w_oovv) &
-  !$omp private(i,j,a,b,e) &
-  !$omp default(none)
-  e = 0d0
-  !$omp do
-  do a = 1, nV
-    do i = 1, nO
-      e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a)
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp do
-  do b = 1, nV
-    do a = 1, nV
-      do j = 1, nO
-        do i = 1, nO
-          e = e + tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b)
-       enddo
-      enddo
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp critical
-  energy = energy + e
-  !$omp end critical
-  !$omp end parallel
+  type(gpu_stream) :: s1, s2
+  call gpu_stream_create(s1)
+  call gpu_stream_create(s2)

-end
+  call gpu_set_stream(blas_handle,s1)
+  call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo%f(1,1), 1, t1%f(1,1), 1, e)

-subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
+  call gpu_set_stream(blas_handle,s2)
+  call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x%f(1,1,1,1), 1_8, d_cc_space_v_oovv%f(1,1,1,1), 1_8, energy)
+  call gpu_set_stream(blas_handle,gpu_default_stream)

-  implicit none
+  call gpu_synchronize()
+  call gpu_stream_destroy(s1)
+  call gpu_stream_destroy(s2)

-  integer, intent(in)           :: nO, nV
-  double precision, intent(in)  :: tau_x(nO,nO,nV,nV)
-  double precision, intent(in)  :: t1(nO,nV)
-  double precision, intent(out) :: energy
-
-  ! internal
-  integer :: i,j,a,b
-  double precision :: e
-
-  energy = 0d0
-  !$omp parallel &
-  !$omp shared(nO,nV,energy,tau_x,t1,&
-  !$omp cc_space_f_vo,cc_space_v_oovv) &
-  !$omp private(i,j,a,b,e) &
-  !$omp default(none)
-  e = 0d0
-  !$omp do
-  do a = 1, nV
-    do i = 1, nO
-      e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a)
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp do
-  do b = 1, nV
-    do a = 1, nV
-      do j = 1, nO
-        do i = 1, nO
-          e = e + tau_x(i,j,a,b) * cc_space_v_oovv(i,j,a,b)
-       enddo
-      enddo
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp critical
-  energy = energy + e
-  !$omp end critical
-  !$omp end parallel
+   energy = energy + 2.d0*e

 end

 ! Tau

-subroutine update_tau_space(nO,nV,t1,t2,tau)
-
+subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau)
+  use gpu
  implicit none

  ! in
  integer, intent(in)           :: nO, nV
-  double precision, intent(in)  :: t1(nO,nV), t2(nO,nO,nV,nV)
+  double precision, intent(in)  :: h_t1(nO,nV)
+  type(gpu_double2), intent(in) :: t1
+  type(gpu_double4), intent(in) :: t2

  ! out
-  double precision, intent(out) :: tau(nO,nO,nV,nV)
+  type(gpu_double4) :: tau

  ! internal
  integer                       :: i,j,a,b

+  type(gpu_stream) :: stream(nV)
+
  !$OMP PARALLEL &
-  !$OMP SHARED(nO,nV,tau,t2,t1) &
+  !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) &
  !$OMP PRIVATE(i,j,a,b) &
  !$OMP DEFAULT(NONE)
  !$OMP DO
  do b=1,nV
-    do a = 1, nV
+    call gpu_stream_create(stream(b))
+    call gpu_set_stream(blas_handle,stream(b))
    do j=1,nO
-        do i = 1, nO
-          tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b)
-        enddo
-      enddo
+      call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, &
+         1.d0, t2%f(1,j,1,b), nO*nO, &
+         h_t1(j,b), t1%f(1,1), nO, &
+         tau%f(1,j,1,b), nO*nO)
    enddo
  enddo
  !$OMP END DO
  !$OMP END PARALLEL

+  call gpu_synchronize()
+
+  do b=1,nV
+    call gpu_stream_destroy(stream(b))
+  enddo
+  call gpu_set_stream(blas_handle,gpu_default_stream)
+
+
 end

 subroutine update_tau_x_space(nO,nV,tau,tau_x)
-
+  use gpu
  implicit none

  ! in
  integer, intent(in)         :: nO, nV
-  double precision, intent(in)  :: tau(nO,nO,nV,nV)
+  type(gpu_double4), intent(in)  :: tau

  ! out
-  double precision, intent(out) :: tau_x(nO,nO,nV,nV)
+  type(gpu_double4) :: tau_x

  ! internal
  integer                       :: i,j,a,b

+  type(gpu_stream) :: stream(nV)
+
+  do a=1,nV
+    call gpu_stream_create(stream(a))
+  enddo
+
  !$OMP PARALLEL &
-  !$OMP SHARED(nO,nV,tau,tau_x) &
-  !$OMP PRIVATE(i,j,a,b) &
+  !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) &
+  !$OMP PRIVATE(a,b) &
  !$OMP DEFAULT(NONE)
  !$OMP DO
  do b=1,nV
    do a=1,nV
-      do j = 1, nO
-        do i = 1, nO
-          tau_x(i,j,a,b) = 2.d0*tau(i,j,a,b) - tau(i,j,b,a)
-        enddo
-      enddo
+      call gpu_set_stream(blas_handle,stream(a))
+      call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, &
+          2.d0, tau%f(1,1,a,b), nO, &
+         -1.d0, tau%f(1,1,b,a), nO, &
+         tau_x%f(1,1,a,b), nO)
    enddo
  enddo
  !$OMP END DO
  !$OMP END PARALLEL

+  call gpu_set_stream(blas_handle,gpu_default_stream)
+  call gpu_synchronize()
+
+  do b=1,nV
+    call gpu_stream_destroy(stream(b))
+  enddo
+
+
 end

 ! R1
--- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
--- a/src/gpu/NEED
+++ b/src/gpu/NEED
@ -0,0 +1 @@
+gpu_arch
--- a/src/gpu/README.rst
+++ b/src/gpu/README.rst
@ -0,0 +1,6 @@
+===
+gpu
+===
+
+Bindings for GPU routines (architecture independent).
+Architecture-dependent files are in gpu_arch.
--- a/src/gpu_x86/gpu.h
+++ b/src/gpu_x86/gpu.h
@ -22,20 +22,20 @@ void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_

 void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result);

-void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double alpha,
-               const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy);
+void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double* alpha,
+               const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy);

-void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float alpha,
-               const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy);
+void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float* alpha,
+               const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy);

-void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
-               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc);
+void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha,
+               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc);

-void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
-               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc);
+void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float* alpha,
+               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc);

-void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
-               const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc);
+void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double* alpha,
+               const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc);

-void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
-               const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc);
+void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float* alpha,
+               const float* a, const int64_t lda, const float* beta, const float* b, const int64_t ldb, float* c, const int64_t ldc);
--- a/src/gpu/gpu.irp.f
+++ b/src/gpu/gpu.irp.f
@ -0,0 +1,26 @@
+use gpu
+
+BEGIN_PROVIDER [ type(gpu_blas), blas_handle ]
+ implicit none
+ BEGIN_DOC
+ ! Handle for cuBLAS or RocBLAS
+ END_DOC
+ call gpu_blas_create(blas_handle)
+END_PROVIDER
+
+BEGIN_PROVIDER [ type(gpu_stream), gpu_default_stream ]
+ implicit none
+ BEGIN_DOC
+ ! Default stream
+ END_DOC
+ gpu_default_stream%c = C_NULL_PTR
+END_PROVIDER
+
+BEGIN_PROVIDER [ integer, gpu_num ]
+ implicit none
+ BEGIN_DOC
+ ! Number of usable GPUs
+ END_DOC
+ gpu_num = gpu_ndevices()
+END_PROVIDER
+
--- a/src/gpu/gpu_module.F90
+++ b/src/gpu/gpu_module.F90
@ -0,0 +1,707 @@
+module gpu
+  use, intrinsic :: iso_c_binding
+  implicit none
+
+! Data types
+! ----------
+
+  type gpu_double1
+    type(c_ptr) :: c
+    double precision, pointer :: f(:)
+  end type
+
+  type gpu_double2
+    type(c_ptr) :: c
+    double precision, pointer :: f(:,:)
+  end type
+
+  type gpu_double3
+    type(c_ptr) :: c
+    double precision, pointer :: f(:,:,:)
+  end type
+
+  type gpu_double4
+    type(c_ptr) :: c
+    double precision, pointer :: f(:,:,:,:)
+  end type
+
+  type gpu_double5
+    type(c_ptr) :: c
+    double precision, pointer :: f(:,:,:,:,:)
+  end type
+
+  type gpu_double6
+    type(c_ptr) :: c
+    double precision, pointer :: f(:,:,:,:,:,:)
+  end type
+
+
+  type gpu_blas
+    type(c_ptr) :: c
+  end type
+
+  type gpu_stream
+    type(c_ptr) :: c
+  end type
+
+
+! C interfaces
+! ------------
+
+  interface
+    logical(c_bool) function no_gpu() bind(C)
+      import
+    end function
+
+    integer function gpu_ndevices() bind(C)
+      import
+    end function
+
+    subroutine gpu_set_device(id) bind(C)
+      import
+      integer(c_int32_t), value :: id
+    end subroutine
+
+    subroutine gpu_allocate_c(ptr, n) bind(C, name='gpu_allocate')
+      import
+      type(c_ptr) :: ptr
+      integer(c_int64_t), value :: n
+    end subroutine
+
+    subroutine gpu_deallocate_c(ptr) bind(C, name='gpu_deallocate')
+      import
+      type(c_ptr) :: ptr
+    end subroutine
+
+    subroutine gpu_upload_c(cpu_ptr, gpu_ptr, n) bind(C, name='gpu_upload')
+      import
+      type(c_ptr), value :: cpu_ptr
+      type(c_ptr), value :: gpu_ptr
+      integer(c_int64_t), value :: n
+    end subroutine
+
+    subroutine gpu_download_c(gpu_ptr, cpu_ptr, n) bind(C, name='gpu_download')
+      import
+      type(c_ptr), value :: gpu_ptr
+      type(c_ptr), value :: cpu_ptr
+      integer(c_int64_t), value :: n
+    end subroutine
+
+    subroutine gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, n) bind(C, name='gpu_copy')
+      import
+      type(c_ptr), value :: gpu_ptr_src
+      type(c_ptr), value :: gpu_ptr_dest
+      integer(c_int64_t), value :: n
+    end subroutine
+
+    subroutine gpu_stream_create_c(stream) bind(C, name='gpu_stream_create')
+      import
+      type(c_ptr) :: stream
+    end subroutine
+
+    subroutine gpu_stream_destroy_c(stream) bind(C, name='gpu_stream_destroy')
+      import
+      type(c_ptr) :: stream
+    end subroutine
+
+    subroutine gpu_set_stream_c(handle, stream) bind(C, name='gpu_set_stream')
+      import
+      type(c_ptr), value :: handle, stream
+    end subroutine
+
+    subroutine gpu_synchronize() bind(C)
+      import
+    end subroutine
+
+    subroutine gpu_blas_create_c(handle) bind(C, name='gpu_blas_create')
+      import
+      type(c_ptr) :: handle
+    end subroutine
+
+    subroutine gpu_blas_destroy_c(handle) bind(C, name='gpu_blas_destroy')
+      import
+      type(c_ptr) :: handle
+    end subroutine
+
+    subroutine gpu_ddot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_ddot')
+      import
+      type(c_ptr), value, intent(in) :: handle
+      integer(c_int64_t), value      :: n, incx, incy
+      type(c_ptr), value             :: dx, dy
+      real(c_double), intent(out)    :: res
+    end subroutine
+
+    subroutine gpu_sdot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_sdot')
+      import
+      type(c_ptr), value, intent(in) :: handle
+      integer(c_int64_t), value      :: n, incx, incy
+      type(c_ptr), intent(in), value :: dx, dy
+      real(c_float), intent(out)     :: res
+    end subroutine
+
+    subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
+      b, ldb, c, ldc) bind(C, name='gpu_dgeam')
+      import
+      type(c_ptr), value, intent(in)        :: handle
+      character(c_char), intent(in), value  :: transa, transb
+      integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
+      real(c_double), intent(in)            :: alpha, beta
+      type(c_ptr), value :: a, b, c
+    end subroutine
+
+    subroutine gpu_sgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
+      b, ldb, c, ldc) bind(C, name='gpu_sgeam')
+      import
+      type(c_ptr), value, intent(in)        :: handle
+      character(c_char), intent(in), value  :: transa, transb
+      integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
+      real(c_float), intent(in)             :: alpha, beta
+      real(c_float) :: a, b, c
+    end subroutine
+
+    subroutine gpu_dgemv_c(handle, transa, m, n, alpha, a, lda, &
+      x, incx, beta, y, incy) bind(C, name='gpu_dgemv')
+      import
+      type(c_ptr), value, intent(in)        :: handle
+      character(c_char), intent(in)         :: transa
+      integer(c_int64_t), intent(in), value :: m, n, lda, incx, incy
+      real(c_double), intent(in)            :: alpha, beta
+      real(c_double)                        :: a, x, y
+    end subroutine
+
+    subroutine gpu_sgemv_c(handle, transa, m, n, alpha, a, lda, &
+      x, incx, beta, y, incy) bind(C, name='gpu_sgemv')
+      import
+      type(c_ptr), value, intent(in)        :: handle
+      character(c_char), intent(in)         :: transa
+      integer(c_int64_t), intent(in), value :: m, n, lda, incx, incy
+      real(c_float), intent(in)             :: alpha, beta
+      real(c_float)                         :: a, x, y
+    end subroutine
+
+
+    subroutine gpu_dgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, &
+      b, ldb, beta, c, ldc) bind(C, name='gpu_dgemm')
+      import
+      type(c_ptr), value, intent(in)        :: handle
+      character(c_char), intent(in)         :: transa, transb
+      integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc
+      real(c_double), intent(in)            :: alpha, beta
+      real(c_double) :: a, b, c
+    end subroutine
+
+    subroutine gpu_sgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, &
+      b, ldb, beta, c, ldc) bind(C, name='gpu_sgemm')
+      import
+      type(c_ptr), value, intent(in)        :: handle
+      character(c_char), intent(in), value  :: transa, transb
+      integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc
+      real(c_float), intent(in)             :: alpha, beta
+      real(c_float) :: a, b, c
+    end subroutine
+
+  end interface
+
+
+! Polymorphic interfaces
+! ----------------------
+
+  interface gpu_allocate
+    procedure gpu_allocate_double1     &
+             ,gpu_allocate_double2     &
+             ,gpu_allocate_double3     &
+             ,gpu_allocate_double4     &
+             ,gpu_allocate_double5     &
+             ,gpu_allocate_double6     &
+             ,gpu_allocate_double1_64  &
+             ,gpu_allocate_double2_64  &
+             ,gpu_allocate_double3_64  &
+             ,gpu_allocate_double4_64  &
+             ,gpu_allocate_double5_64  &
+             ,gpu_allocate_double6_64
+  end interface gpu_allocate
+
+  interface gpu_deallocate
+    procedure gpu_deallocate_double1     &
+             ,gpu_deallocate_double2     &
+             ,gpu_deallocate_double3     &
+             ,gpu_deallocate_double4     &
+             ,gpu_deallocate_double5     &
+             ,gpu_deallocate_double6
+  end interface gpu_deallocate
+
+  interface gpu_upload
+    procedure gpu_upload_double1  &
+             ,gpu_upload_double2  &
+             ,gpu_upload_double3  &
+             ,gpu_upload_double4  &
+             ,gpu_upload_double5  &
+             ,gpu_upload_double6
+  end interface gpu_upload
+
+  interface gpu_download
+    procedure gpu_download_double1  &
+             ,gpu_download_double2  &
+             ,gpu_download_double3  &
+             ,gpu_download_double4  &
+             ,gpu_download_double5  &
+             ,gpu_download_double6
+  end interface gpu_download
+
+  interface gpu_copy
+    procedure gpu_copy_double1  &
+             ,gpu_copy_double2  &
+             ,gpu_copy_double3  &
+             ,gpu_copy_double4  &
+             ,gpu_copy_double5  &
+             ,gpu_copy_double6
+  end interface gpu_copy
+
+
+  contains
+
+
+! gpu_allocate
+! ------------
+
+    subroutine gpu_allocate_double1(ptr, s)
+      implicit none
+      type(gpu_double1), intent(inout) :: ptr
+      integer, intent(in) :: s
+
+      call gpu_allocate_c(ptr%c, s*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s /))
+    end subroutine
+
+    subroutine gpu_allocate_double2(ptr, s1, s2)
+      implicit none
+      type(gpu_double2), intent(inout) :: ptr
+      integer, intent(in) :: s1, s2
+
+      call gpu_allocate_c(ptr%c, s1*s2*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /))
+    end subroutine
+
+    subroutine gpu_allocate_double3(ptr, s1, s2, s3)
+      implicit none
+      type(gpu_double3), intent(inout) :: ptr
+      integer, intent(in) :: s1, s2, s3
+
+      call gpu_allocate_c(ptr%c, s1*s2*s3*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /))
+    end subroutine
+
+    subroutine gpu_allocate_double4(ptr, s1, s2, s3, s4)
+      implicit none
+      type(gpu_double4), intent(inout) :: ptr
+      integer, intent(in) :: s1, s2, s3, s4
+
+      call gpu_allocate_c(ptr%c, s1*s2*s3*s4*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /))
+    end subroutine
+
+    subroutine gpu_allocate_double5(ptr, s1, s2, s3, s4, s5)
+      implicit none
+      type(gpu_double5), intent(inout) :: ptr
+      integer, intent(in) :: s1, s2, s3, s4, s5
+
+      call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /))
+    end subroutine
+
+    subroutine gpu_allocate_double6(ptr, s1, s2, s3, s4, s5, s6)
+      implicit none
+      type(gpu_double6), intent(inout) :: ptr
+      integer, intent(in) :: s1, s2, s3, s4, s5, s6
+
+      call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*s6*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /))
+    end subroutine
+
+
+    subroutine gpu_allocate_double1_64(ptr, s)
+      implicit none
+      type(gpu_double1), intent(inout) :: ptr
+      integer*8, intent(in) :: s
+
+      call gpu_allocate_c(ptr%c, s)
+      call c_f_pointer(ptr%c, ptr%f, (/ s /))
+    end subroutine
+
+    subroutine gpu_allocate_double2_64(ptr, s1, s2)
+      implicit none
+      type(gpu_double2), intent(inout) :: ptr
+      integer*8, intent(in) :: s1, s2
+
+      call gpu_allocate_c(ptr%c, s1*s2*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /))
+    end subroutine
+
+    subroutine gpu_allocate_double3_64(ptr, s1, s2, s3)
+      implicit none
+      type(gpu_double3), intent(inout) :: ptr
+      integer*8, intent(in) :: s1, s2, s3
+
+      call gpu_allocate_c(ptr%c, s1*s2*s3*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /))
+    end subroutine
+
+    subroutine gpu_allocate_double4_64(ptr, s1, s2, s3, s4)
+      implicit none
+      type(gpu_double4), intent(inout) :: ptr
+      integer*8, intent(in) :: s1, s2, s3, s4
+
+      call gpu_allocate_c(ptr%c, s1*s2*s3*s4*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /))
+    end subroutine
+
+    subroutine gpu_allocate_double5_64(ptr, s1, s2, s3, s4, s5)
+      implicit none
+      type(gpu_double5), intent(inout) :: ptr
+      integer*8, intent(in) :: s1, s2, s3, s4, s5
+
+      call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /))
+    end subroutine
+
+    subroutine gpu_allocate_double6_64(ptr, s1, s2, s3, s4, s5, s6)
+      implicit none
+      type(gpu_double6), intent(inout) :: ptr
+      integer*8, intent(in) :: s1, s2, s3, s4, s5, s6
+
+      call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*s6*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /))
+    end subroutine
+
+
+! gpu_deallocate
+! --------------
+
+    subroutine gpu_deallocate_double1(ptr)
+      implicit none
+      type(gpu_double1), intent(inout) :: ptr
+      call gpu_deallocate_c(ptr%c)
+      NULLIFY(ptr%f)
+    end subroutine
+
+    subroutine gpu_deallocate_double2(ptr)
+      implicit none
+      type(gpu_double2), intent(inout) :: ptr
+      call gpu_deallocate_c(ptr%c)
+      NULLIFY(ptr%f)
+    end subroutine
+
+    subroutine gpu_deallocate_double3(ptr)
+      implicit none
+      type(gpu_double3), intent(inout) :: ptr
+      call gpu_deallocate_c(ptr%c)
+      NULLIFY(ptr%f)
+    end subroutine
+
+    subroutine gpu_deallocate_double4(ptr)
+      implicit none
+      type(gpu_double4), intent(inout) :: ptr
+      call gpu_deallocate_c(ptr%c)
+      NULLIFY(ptr%f)
+    end subroutine
+
+    subroutine gpu_deallocate_double5(ptr)
+      implicit none
+      type(gpu_double5), intent(inout) :: ptr
+      call gpu_deallocate_c(ptr%c)
+      NULLIFY(ptr%f)
+    end subroutine
+
+    subroutine gpu_deallocate_double6(ptr)
+      implicit none
+      type(gpu_double6), intent(inout) :: ptr
+      call gpu_deallocate_c(ptr%c)
+      NULLIFY(ptr%f)
+    end subroutine
+
+
+! gpu_upload
+! ----------
+
+    subroutine gpu_upload_double1(cpu_ptr, gpu_ptr)
+      implicit none
+      double precision, target, intent(in)     :: cpu_ptr(*)
+      type(gpu_double1), intent(in)    :: gpu_ptr
+      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, 8_8*size(gpu_ptr%f))
+    end subroutine
+
+    subroutine gpu_upload_double2(cpu_ptr, gpu_ptr)
+      implicit none
+      double precision, target, intent(in)     :: cpu_ptr(:,:)
+      type(gpu_double2), intent(in)    :: gpu_ptr
+      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
+    end subroutine
+
+    subroutine gpu_upload_double3(cpu_ptr, gpu_ptr)
+      implicit none
+      double precision, target, intent(in)     :: cpu_ptr(:,:,:)
+      type(gpu_double3), intent(in)    :: gpu_ptr
+      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
+    end subroutine
+
+    subroutine gpu_upload_double4(cpu_ptr, gpu_ptr)
+      implicit none
+      double precision, target, intent(in)     :: cpu_ptr(:,:,:,:)
+      type(gpu_double4), intent(in)    :: gpu_ptr
+      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
+    end subroutine
+
+    subroutine gpu_upload_double5(cpu_ptr, gpu_ptr)
+      implicit none
+      double precision, target, intent(in)     :: cpu_ptr(:,:,:,:,:)
+      type(gpu_double5), intent(in)    :: gpu_ptr
+      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
+    end subroutine
+
+    subroutine gpu_upload_double6(cpu_ptr, gpu_ptr)
+      implicit none
+      double precision, target, intent(in)     :: cpu_ptr(:,:,:,:,:,:)
+      type(gpu_double6), intent(in)    :: gpu_ptr
+      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
+    end subroutine
+
+
+! gpu_download
+! ------------
+
+    subroutine gpu_download_double1(gpu_ptr, cpu_ptr)
+      implicit none
+      type(gpu_double1), intent(in)  :: gpu_ptr
+      double precision, target, intent(in)   :: cpu_ptr(:)
+      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*size(gpu_ptr%f))
+    end subroutine
+
+    subroutine gpu_download_double2(gpu_ptr, cpu_ptr)
+      implicit none
+      type(gpu_double2), intent(in)  :: gpu_ptr
+      double precision, target, intent(in)   :: cpu_ptr(:,:)
+      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
+    end subroutine
+
+    subroutine gpu_download_double3(gpu_ptr, cpu_ptr)
+      implicit none
+      type(gpu_double3), intent(in)  :: gpu_ptr
+      double precision, target, intent(in)   :: cpu_ptr(:,:,:)
+      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
+    end subroutine
+
+    subroutine gpu_download_double4(gpu_ptr, cpu_ptr)
+      implicit none
+      type(gpu_double4), intent(in)  :: gpu_ptr
+      double precision, target, intent(in)   :: cpu_ptr(:,:,:,:)
+      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
+    end subroutine
+
+    subroutine gpu_download_double5(gpu_ptr, cpu_ptr)
+      implicit none
+      type(gpu_double5), intent(in)  :: gpu_ptr
+      double precision, target, intent(in)   :: cpu_ptr(:,:,:,:,:)
+      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
+    end subroutine
+
+    subroutine gpu_download_double6(gpu_ptr, cpu_ptr)
+      implicit none
+      type(gpu_double6), intent(in)  :: gpu_ptr
+      double precision, target, intent(in)   :: cpu_ptr(:,:,:,:,:,:)
+      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
+    end subroutine
+
+! gpu_copy
+! --------
+
+    subroutine gpu_copy_double1(gpu_ptr_src, gpu_ptr_dest)
+      implicit none
+      type(gpu_double1), intent(in)        :: gpu_ptr_src
+      type(gpu_double1), intent(in)        :: gpu_ptr_dest
+      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*size(gpu_ptr_dest%f))
+    end subroutine
+
+    subroutine gpu_copy_double2(gpu_ptr_src, gpu_ptr_dest)
+      implicit none
+      type(gpu_double2), intent(in)        :: gpu_ptr_src
+      type(gpu_double2), intent(in)        :: gpu_ptr_dest
+      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
+    end subroutine
+
+    subroutine gpu_copy_double3(gpu_ptr_src, gpu_ptr_dest)
+      implicit none
+      type(gpu_double3), intent(in)        :: gpu_ptr_src
+      type(gpu_double3), intent(in)        :: gpu_ptr_dest
+      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
+    end subroutine
+
+    subroutine gpu_copy_double4(gpu_ptr_src, gpu_ptr_dest)
+      implicit none
+      type(gpu_double4), intent(in)        :: gpu_ptr_src
+      type(gpu_double4), intent(in)        :: gpu_ptr_dest
+      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
+    end subroutine
+
+    subroutine gpu_copy_double5(gpu_ptr_src, gpu_ptr_dest)
+      implicit none
+      type(gpu_double5), intent(in)        :: gpu_ptr_src
+      type(gpu_double5), intent(in)        :: gpu_ptr_dest
+      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
+    end subroutine
+
+    subroutine gpu_copy_double6(gpu_ptr_src, gpu_ptr_dest)
+      implicit none
+      type(gpu_double6), intent(in)        :: gpu_ptr_src
+      type(gpu_double6), intent(in)        :: gpu_ptr_dest
+      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
+    end subroutine
+
+
+! gpu_stream
+! ----------
+
+    subroutine gpu_stream_create(stream)
+      type(gpu_stream) :: stream
+      call gpu_stream_create_c(stream%c)
+    end subroutine
+
+    subroutine gpu_stream_destroy(stream)
+      type(gpu_stream) :: stream
+      call gpu_stream_destroy_c(stream%c)
+    end subroutine
+
+    subroutine gpu_set_stream(handle, stream)
+      type(gpu_blas)   :: handle
+      type(gpu_stream) :: stream
+      call gpu_set_stream_c(handle%c, stream%c)
+    end subroutine
+
+
+! gpu_blas
+! --------
+
+    subroutine gpu_blas_create(handle)
+      type(gpu_blas) :: handle
+      call gpu_blas_create_c(handle%c)
+    end subroutine
+
+    subroutine gpu_blas_destroy(handle)
+      type(gpu_blas) :: handle
+      call gpu_blas_destroy_c(handle%c)
+    end subroutine
+
+
+
+
+
+! dot
+! ---
+
+subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res)
+!  use gpu
+  type(gpu_blas), intent(in)     :: handle
+  integer*4                      :: n, incx, incy
+  double precision, target       :: dx, dy
+  double precision, intent(out)  :: res
+  call gpu_ddot_c(handle%c, int(n,c_int64_t), c_loc(dx), int(incx,c_int64_t), c_loc(dy), int(incy,c_int64_t), res)
+end subroutine
+
+
+subroutine gpu_ddot_64(handle, n, dx, incx, dy, incy, res)
+!  use gpu
+  type(gpu_blas), intent(in)     :: handle
+  integer*8                      :: n, incx, incy
+  double precision, target       :: dx, dy
+  double precision, intent(out)  :: res
+  call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res)
+end subroutine
+
+
+! geam
+! ----
+
+subroutine gpu_dgeam(handle, transa, transb, m, n, alpha, a, lda, beta, &
+  b, ldb, c, ldc)
+!  use gpu
+  type(gpu_blas), intent(in)   :: handle
+  character, intent(in)        :: transa, transb
+  integer*4, intent(in)        :: m, n, lda, ldb, ldc
+  double precision, intent(in) :: alpha, beta
+  double precision, target     :: a, b, c
+  call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, &
+        c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t))
+end subroutine
+
+
+subroutine gpu_dgeam_64(handle, transa, transb, m, n, alpha, a, lda, beta, &
+  b, ldb, c, ldc)
+!  use gpu
+  type(gpu_blas), intent(in)   :: handle
+  character, intent(in)        :: transa, transb
+  integer*8, intent(in)        :: m, n, lda, ldb, ldc
+  double precision, intent(in) :: alpha, beta
+  double precision, target     :: a, b, c
+  call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, &
+        c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t))
+end subroutine
+
+
+! gemv
+! ----
+
+subroutine gpu_dgemv(handle, transa, m, n, alpha, a, lda, &
+  x, incx, beta, y, incy)
+!  use gpu
+  type(gpu_blas), intent(in)   :: handle
+  character, intent(in)        :: transa
+  integer*4, intent(in)        :: m, n, lda, incx, incy
+  double precision, intent(in) :: alpha, beta
+  double precision             :: a, x, y
+  call gpu_dgemv_c(handle%c, transa, int(m,c_int64_t), int(n,c_int64_t), &
+        alpha, a, int(lda,c_int64_t), &
+        x, int(incx,c_int64_t), beta, y, int(incy,c_int64_t))
+end subroutine
+
+subroutine gpu_dgemv_64(handle, transa, m, n, alpha, a, lda, &
+  x, incx, beta, y, incy)
+!  use gpu
+  type(gpu_blas), intent(in)   :: handle
+  character, intent(in)        :: transa
+  integer*8, intent(in)        :: m, n, lda, incx, incy
+  double precision, intent(in) :: alpha, beta
+  double precision             :: a, x, y
+  call gpu_dgemv_c(handle%c, transa, int(m,c_int64_t), int(n,c_int64_t), &
+        alpha, a, int(lda,c_int64_t), &
+        x, int(incx,c_int64_t), beta, y, int(incy,c_int64_t))
+end subroutine
+
+
+! gemm
+! ----
+
+subroutine gpu_dgemm(handle, transa, transb, m, n, k, alpha, a, lda, &
+  b, ldb, beta, c, ldc)
+!  use gpu
+  type(gpu_blas), intent(in)   :: handle
+  character, intent(in)        :: transa, transb
+  integer*4, intent(in)        :: m, n, k, lda, ldb, ldc
+  double precision, intent(in) :: alpha, beta
+  double precision             :: a, b, c
+  call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), &
+        alpha, a, int(lda,c_int64_t), &
+        b, int(ldb,c_int64_t), beta, c, int(ldc,c_int64_t))
+end subroutine
+
+subroutine gpu_dgemm_64(handle, transa, transb, m, n, k, alpha, a, lda, &
+  b, ldb, beta, c, ldc)
+!  use gpu
+  type(gpu_blas), intent(in)   :: handle
+  character, intent(in)        :: transa, transb
+  integer*8, intent(in)        :: m, n, k, lda, ldb, ldc
+  double precision, intent(in) :: alpha, beta
+  double precision             :: a, b, c
+  call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), &
+        alpha, a, int(lda,c_int64_t), b, int(ldb,c_int64_t), beta, c, int(ldc,c_int64_t))
+end subroutine
+
+end module
--- a/src/gpu_x86/gpu_module.F90
+++ b/src/gpu_x86/gpu_module.F90
@ -1,141 +0,0 @@
-module gpu
-  use, intrinsic :: iso_c_binding, only : c_int32_t, c_int64_t, c_double, c_size_t, c_char
-  implicit none
-
-  interface
-    integer function gpu_ndevices() bind(C)
-    end function
-
-    subroutine gpu_set_device(id) bind(C)
-      import
-      integer(c_int32_t), value :: id
-    end subroutine
-
-    subroutine gpu_allocate_c(ptr, n) bind(C, name='gpu_allocate')
-      import
-      type(c_ptr) :: ptr
-      integer(c_int64_t), value :: n
-    end subroutine
-
-    subroutine gpu_free_c(ptr) bind(C, name='gpu_free')
-      import
-      type(c_ptr) :: ptr
-    end subroutine
-
-    subroutine gpu_upload_c(cpu_ptr, gpu_ptr, n) bind(C, name='gpu_upload')
-      import
-      type(c_ptr), value :: cpu_ptr
-      type(c_ptr), value :: gpu_ptr
-      integer(c_int64_t), value :: n
-    end subroutine
-
-    subroutine gpu_download_c(gpu_ptr, cpu_ptr, n) bind(C, name='gpu_download')
-      import
-      type(c_ptr), value :: gpu_ptr
-      type(c_ptr), value :: cpu_ptr
-      integer(c_int64_t), value :: n
-    end subroutine
-
-    subroutine gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, n) bind(C, name='gpu_copy')
-      import
-      type(c_ptr), value :: gpu_ptr_src
-      type(c_ptr), value :: gpu_ptr_dest
-      integer(c_int64_t), value :: n
-    end subroutine
-
-    subroutine gpu_stream_create(stream) bind(C)
-      import
-      type(c_ptr) :: stream
-    end subroutine
-
-    subroutine gpu_stream_destroy(stream) bind(C)
-      import
-      type(c_ptr) :: stream
-    end subroutine
-
-    subroutine gpu_set_stream(handle, stream) bind(C)
-      import
-      type(c_ptr) :: handle, stream
-    end subroutine
-
-    subroutine gpu_synchronize()
-    end subroutine
-
-    subroutine gpu_blas_create(handle) bind(C)
-      import
-      type(c_ptr) :: handle
-    end subroutine
-
-    subroutine gpu_blas_destroy(handle) bind(C)
-      import
-      type(c_ptr) :: handle
-    end subroutine
-
-    subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) bind(C)
-      import
-      type(c_ptr), intent(in)     :: handle
-      integer(c_int64_t), value   :: n, incx, incy
-      real(c_double), intent(in)  :: dx(*), dy(*)
-      real(c_double), intent(out) :: res
-    end subroutine
-
-    subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res) bind(C)
-      import
-      type(c_ptr), intent(in)     :: handle
-      integer(c_int64_t), value   :: n, incx, incy
-      real(c_float), intent(in)   :: dx(*), dy(*)
-      real(c_float), intent(out)  :: res
-    end subroutine
-
-  end interface
-
-end module
-
-subroutine gpu_allocate_double(ptr, s)
-  use gpu
-  implicit none
-  double precision, pointer, intent(inout) :: ptr
-  integer*8, intent(in) :: s(*)
-  type(c_ptr) :: cptr
-
-  call gpu_allocate_c(cptr, sum(s)*8_8)
-  call c_f_pointer(cptr, ptr, s)
-end subroutine
-
-subroutine gpu_free_double(ptr)
-  use gpu
-  implicit none
-  double precision, pointer, intent(inout) :: ptr
-  type(c_ptr) :: cptr
-  cptr = cloc(ptr)
-  call gpu_free(cptr)
-  NULLIFY(ptr)
-end subroutine
-
-subroutine gpu_upload_double(cpu_ptr, gpu_ptr, n)
-  use gpu
-  implicit none
-  double precision, intent(in)   :: cpu_ptr(*)
-  double precision, intent(out)  :: gpu_ptr(*)
-  integer(c_int64_t), intent(in) :: n
-  call gpu_upload_c(cpu_ptr, gpu_ptr, 8_8*n)
-end subroutine
-
-subroutine gpu_download_double(gpu_ptr, cpu_ptr, n)
-  use gpu
-  implicit none
-  double precision, intent(in)   :: gpu_ptr(*)
-  double precision, intent(out)  :: cpu_ptr(*)
-  integer(c_int64_t), intent(in) :: n
-  call gpu_download_c(gpu_ptr, cpu_ptr, 8_8*n)
-end subroutine
-
-subroutine gpu_copy_double(gpu_ptr_src, gpu_ptr_dest, n)
-  use gpu
-  implicit none
-  double precision, intent(in)   :: gpu_ptr_src(*)
-  double precision, intent(out)  :: gpu_ptr_dest(*)
-  integer(c_int64_t), intent(in) :: n
-  call gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, 8_8*n)
-end subroutine
-
--- a/src/mo_optimization/NEED
+++ b/src/mo_optimization/NEED
@ -1,7 +1,3 @@
-two_body_rdm
-hartree_fock
-cipsi
-davidson_undressed
+mo_optimization_utils
 selectors_full
 generators_full
-utils_trust_region
--- a/src/mo_optimization/cipsi_orb_opt.irp.f
+++ b/src/mo_optimization/cipsi_orb_opt.irp.f
@ -2,87 +2,7 @@ program optimization
  
  read_wf = .true. ! must be True for the orbital optimization !!!
  TOUCH read_wf 
-  call run_optimization
+  call run_optimization_mos_CIPSI

 end

-subroutine run_optimization
-
-  implicit none
-
-  double precision :: e_cipsi, e_opt, delta_e
-  double precision, allocatable :: Ev(:),PT2(:)
-  integer :: nb_iter,i
-  logical :: not_converged
-  character (len=100) :: filename
-
-  PROVIDE psi_det psi_coef mo_two_e_integrals_in_map ao_pseudo_integrals
-  allocate(Ev(N_states),PT2(N_states))
-
-  not_converged = .True.
-  nb_iter = 0
-
-  ! To start from the wf
-  N_det_max = max(n_det,5)
-  TOUCH N_det_max
-
-  open(unit=10, file=trim(ezfio_filename)//'/mo_optimization/result_opt')
-  write(10,*) "   Ndet        E_cipsi         E_opt          Delta_e"
-  call state_average_energy(e_cipsi)
-  write(10,'(I10, 3F15.7)') n_det, e_cipsi, e_cipsi, 0d0
-  close(10)
-
-  do while (not_converged)
-      print*,''
-      print*,'======================'
-      print*,' Cipsi step:', nb_iter
-      print*,'======================'
-      print*,'' 
-      print*,'********** cipsi step **********'
-      ! cispi calculation
-      call run_stochastic_cipsi(Ev,PT2)
-
-      ! State average energy after the cipsi step
-      call state_average_energy(e_cipsi)
-
-      print*,''
-      print*,'********** optimization step **********'
-      ! orbital optimization
-      call run_orb_opt_trust_v2
-
-      ! State average energy after the orbital optimization
-      call state_average_energy(e_opt)
-
-      print*,''
-      print*,'********** diff step **********'
-      ! Gain in energy
-      delta_e = e_opt - e_cipsi
-      print*, 'Gain in energy during the orbital optimization:', delta_e
-
-      open(unit=10, file=trim(ezfio_filename)//'/mo_optimization/result_opt', position='append')
-      write(10,'(I10, 3F15.7)') n_det, e_cipsi, e_opt, delta_e
-      close(10)
-
-      ! Exit
-      if (delta_e > 1d-12) then
-          print*, 'WARNING, something wrong happened'
-          print*, 'The gain (delta_e) in energy during the optimization process'
-          print*, 'is > 0, but it must be < 0'
-          print*, 'The program will exit'
-          exit
-      endif
-
-      if (n_det > n_det_max_opt) then
-          print*, 'The number of determinants in the wf > n_det_max_opt'
-          print*, 'The program will exit'
-          exit
-      endif
-      
-      ! To double the number of determinants in the wf
-      N_det_max = int(dble(n_det * 2)*0.9)
-      TOUCH N_det_max
-
-      nb_iter = nb_iter + 1
-  enddo
-
-end
--- a/src/mo_optimization_utils/EZFIO.cfg
+++ b/src/mo_optimization_utils/EZFIO.cfg
--- a/src/mo_optimization_utils/NEED
+++ b/src/mo_optimization_utils/NEED
@ -0,0 +1,5 @@
+two_body_rdm
+hartree_fock
+cipsi
+davidson_undressed
+utils_trust_region
--- a/src/mo_optimization_utils/README.md
+++ b/src/mo_optimization_utils/README.md
@ -0,0 +1,74 @@
+# Orbital optimization
+
+## Methods  
+Different methods are available:  
+- full hessian  
+``` 
+qp set orbital_optimization optimization_method full  
+```  
+- diagonal hessian  
+``` 
+qp set orbital_optimization optimization_method diag  
+``` 
+- identity matrix  
+``` 
+qp set orbital_optimization optimization_method none  
+``` 
+
+After the optimization the ezfio contains the optimized orbitals
+ 
+## For a fixed number of determinants
+To optimize the MOs for the actual determinants:  
+``` 
+qp run orb_opt
+``` 
+ 
+## For a complete optimization, i.e, with a larger and larger wave function
+To optimize the MOs with a larger and larger wave function:  
+``` 
+qp run optimization  
+``` 
+
+The results are stored in the EZFIO in "mo_optimization/result_opt",
+with the following format:  
+(1) (2) (3) (4)  
+1: Number of determinants in the wf,  
+2: Cispi energy before the optimization,   
+3: Cipsi energy after the optimization,  
+4: Energy difference between (2) and (3).  
+ 
+The optimization process if the following: 
+- we do a first cipsi step to obtain a small number of determinants in the wf 
+- we run an orbital optimization for this wf 
+- we do a new cipsi step to double the number of determinants in the wf 
+- we run an orbital optimization for this wf 
+- ... 
+- we do that until the energy difference between (2) and (3) is  
+  smaller than the targeted accuracy for the cispi (targeted_accuracy_cipsi in qp edit) 
+  or the wf is larger than a given size (n_det_max_opt in qp_edit) 
+- after that you can reset your determinants (qp reset -d) and run a clean Cispi calculation  
+  
+### End of the optimization
+You can choos the number of determinants after what the 
+optimization will stop:
+```
+qp set orbital_optimization n_det_max_opt 1e5 # or any number
+```
+## Weight of the states
+You can change the weights of the differents states directly in qp edit.  
+It will affect ths weights used in the orbital optimization.
+
+# Tests
+To run the tests:  
+``` 
+qp test
+``` 
+
+# Org files
+The org files are stored in the directory org in order to avoid overwriting on user changes.
+The org files can be modified, to export the change to the source code, run
+```
+./TANGLE_org_mode.sh
+mv *.irp.f ../.
+```
+
--- a/src/mo_optimization_utils/constants.h
+++ b/src/mo_optimization_utils/constants.h
--- a/src/mo_optimization_utils/diagonal_hessian_list_opt.irp.f
+++ b/src/mo_optimization_utils/diagonal_hessian_list_opt.irp.f
--- a/src/mo_optimization_utils/diagonal_hessian_opt.irp.f
+++ b/src/mo_optimization_utils/diagonal_hessian_opt.irp.f
--- a/src/mo_optimization_utils/diagonalization_hessian.irp.f
+++ b/src/mo_optimization_utils/diagonalization_hessian.irp.f
--- a/src/mo_optimization_utils/first_diagonal_hessian_list_opt.irp.f
+++ b/src/mo_optimization_utils/first_diagonal_hessian_list_opt.irp.f
--- a/src/mo_optimization_utils/first_diagonal_hessian_opt.irp.f
+++ b/src/mo_optimization_utils/first_diagonal_hessian_opt.irp.f
--- a/src/mo_optimization_utils/first_gradient_list_opt.irp.f
+++ b/src/mo_optimization_utils/first_gradient_list_opt.irp.f
--- a/src/mo_optimization_utils/first_gradient_opt.irp.f
+++ b/src/mo_optimization_utils/first_gradient_opt.irp.f
--- a/src/mo_optimization_utils/first_hessian_list_opt.irp.f
+++ b/src/mo_optimization_utils/first_hessian_list_opt.irp.f
--- a/src/mo_optimization_utils/first_hessian_opt.irp.f
+++ b/src/mo_optimization_utils/first_hessian_opt.irp.f
--- a/src/mo_optimization_utils/gradient_list_opt.irp.f
+++ b/src/mo_optimization_utils/gradient_list_opt.irp.f
--- a/src/mo_optimization_utils/gradient_opt.irp.f
+++ b/src/mo_optimization_utils/gradient_opt.irp.f
--- a/src/mo_optimization_utils/hessian_list_opt.irp.f
+++ b/src/mo_optimization_utils/hessian_list_opt.irp.f
--- a/src/mo_optimization_utils/hessian_opt.irp.f
+++ b/src/mo_optimization_utils/hessian_opt.irp.f
--- a/src/mo_optimization_utils/org/TODO.org
+++ b/src/mo_optimization_utils/org/TODO.org
--- a/src/mo_optimization_utils/org/debug_gradient_list_opt.org
+++ b/src/mo_optimization_utils/org/debug_gradient_list_opt.org
--- a/src/mo_optimization_utils/org/debug_gradient_opt.org
+++ b/src/mo_optimization_utils/org/debug_gradient_opt.org
--- a/src/mo_optimization_utils/org/debug_hessian_list_opt.org
+++ b/src/mo_optimization_utils/org/debug_hessian_list_opt.org
--- a/src/mo_optimization_utils/org/debug_hessian_opt.org
+++ b/src/mo_optimization_utils/org/debug_hessian_opt.org
--- a/src/mo_optimization_utils/org/diagonal_hessian_list_opt.org
+++ b/src/mo_optimization_utils/org/diagonal_hessian_list_opt.org
--- a/src/mo_optimization_utils/org/diagonal_hessian_opt.org
+++ b/src/mo_optimization_utils/org/diagonal_hessian_opt.org
--- a/src/mo_optimization_utils/org/diagonalization_hessian.org
+++ b/src/mo_optimization_utils/org/diagonalization_hessian.org
--- a/src/mo_optimization_utils/org/first_diagonal_hessian_list_opt.org
+++ b/src/mo_optimization_utils/org/first_diagonal_hessian_list_opt.org
--- a/src/mo_optimization_utils/org/first_diagonal_hessian_opt.org
+++ b/src/mo_optimization_utils/org/first_diagonal_hessian_opt.org
--- a/src/mo_optimization_utils/org/first_gradient_list_opt.org
+++ b/src/mo_optimization_utils/org/first_gradient_list_opt.org
--- a/src/mo_optimization_utils/org/first_gradient_opt.org
+++ b/src/mo_optimization_utils/org/first_gradient_opt.org
--- a/src/mo_optimization_utils/org/first_hessian_list_opt.org
+++ b/src/mo_optimization_utils/org/first_hessian_list_opt.org
--- a/src/mo_optimization_utils/org/first_hessian_opt.org
+++ b/src/mo_optimization_utils/org/first_hessian_opt.org
--- a/src/mo_optimization_utils/org/gradient_list_opt.org
+++ b/src/mo_optimization_utils/org/gradient_list_opt.org
--- a/src/mo_optimization_utils/org/gradient_opt.org
+++ b/src/mo_optimization_utils/org/gradient_opt.org
--- a/src/mo_optimization_utils/org/hessian_list_opt.org
+++ b/src/mo_optimization_utils/org/hessian_list_opt.org
--- a/src/mo_optimization_utils/org/hessian_opt.org
+++ b/src/mo_optimization_utils/org/hessian_opt.org
--- a/src/mo_optimization_utils/org/my_providers.org
+++ b/src/mo_optimization_utils/org/my_providers.org
--- a/src/mo_optimization_utils/org/optimization.org
+++ b/src/mo_optimization_utils/org/optimization.org
--- a/src/mo_optimization_utils/org/orb_opt_trust_v2.org
+++ b/src/mo_optimization_utils/org/orb_opt_trust_v2.org
--- a/src/mo_optimization_utils/org/state_average_energy.org
+++ b/src/mo_optimization_utils/org/state_average_energy.org
--- a/src/mo_optimization_utils/org/state_weight_normalization.org
+++ b/src/mo_optimization_utils/org/state_weight_normalization.org
--- a/src/mo_optimization_utils/org/update_parameters.org
+++ b/src/mo_optimization_utils/org/update_parameters.org
--- a/src/mo_optimization_utils/org/update_st_av_ci_energy.org
+++ b/src/mo_optimization_utils/org/update_st_av_ci_energy.org
--- a/src/mo_optimization_utils/routine_opt_mos.irp.f
+++ b/src/mo_optimization_utils/routine_opt_mos.irp.f
@ -0,0 +1,81 @@
+
+subroutine run_optimization_mos_CIPSI
+
+  implicit none
+
+  double precision :: e_cipsi, e_opt, delta_e
+  double precision, allocatable :: Ev(:),PT2(:)
+  integer :: nb_iter,i
+  logical :: not_converged
+  character (len=100) :: filename
+
+  PROVIDE psi_det psi_coef mo_two_e_integrals_in_map ao_pseudo_integrals
+  allocate(Ev(N_states),PT2(N_states))
+
+  not_converged = .True.
+  nb_iter = 0
+
+  ! To start from the wf
+  N_det_max = max(n_det,5)
+  TOUCH N_det_max
+
+  open(unit=10, file=trim(ezfio_filename)//'/mo_optimization/result_opt')
+  write(10,*) "   Ndet        E_cipsi         E_opt          Delta_e"
+  call state_average_energy(e_cipsi)
+  write(10,'(I10, 3F15.7)') n_det, e_cipsi, e_cipsi, 0d0
+  close(10)
+
+  do while (not_converged)
+      print*,''
+      print*,'======================'
+      print*,' Cipsi step:', nb_iter
+      print*,'======================'
+      print*,'' 
+      print*,'********** cipsi step **********'
+      ! cispi calculation
+      call run_stochastic_cipsi(Ev,PT2)
+
+      ! State average energy after the cipsi step
+      call state_average_energy(e_cipsi)
+
+      print*,''
+      print*,'********** optimization step **********'
+      ! orbital optimization
+      call run_orb_opt_trust_v2
+
+      ! State average energy after the orbital optimization
+      call state_average_energy(e_opt)
+
+      print*,''
+      print*,'********** diff step **********'
+      ! Gain in energy
+      delta_e = e_opt - e_cipsi
+      print*, 'Gain in energy during the orbital optimization:', delta_e
+
+      open(unit=10, file=trim(ezfio_filename)//'/mo_optimization/result_opt', position='append')
+      write(10,'(I10, 3F15.7)') n_det, e_cipsi, e_opt, delta_e
+      close(10)
+
+      ! Exit
+      if (delta_e > 1d-12) then
+          print*, 'WARNING, something wrong happened'
+          print*, 'The gain (delta_e) in energy during the optimization process'
+          print*, 'is > 0, but it must be < 0'
+          print*, 'The program will exit'
+          exit
+      endif
+
+      if (n_det > n_det_max_opt) then
+          print*, 'The number of determinants in the wf > n_det_max_opt'
+          print*, 'The program will exit'
+          exit
+      endif
+      
+      ! To double the number of determinants in the wf
+      N_det_max = int(dble(n_det * 2)*0.9)
+      TOUCH N_det_max
+
+      nb_iter = nb_iter + 1
+  enddo
+
+end
--- a/src/mo_optimization_utils/run_orb_opt_trust_v2.irp.f
+++ b/src/mo_optimization_utils/run_orb_opt_trust_v2.irp.f
--- a/src/mo_optimization_utils/save_energy.irp.f
+++ b/src/mo_optimization_utils/save_energy.irp.f
--- a/src/mo_optimization_utils/state_average_energy.irp.f
+++ b/src/mo_optimization_utils/state_average_energy.irp.f
--- a/src/mo_optimization_utils/state_weight_normalization.irp.f
+++ b/src/mo_optimization_utils/state_weight_normalization.irp.f
--- a/src/mo_optimization_utils/update_parameters.irp.f
+++ b/src/mo_optimization_utils/update_parameters.irp.f
--- a/src/mo_optimization_utils/update_st_av_ci_energy.irp.f
+++ b/src/mo_optimization_utils/update_st_av_ci_energy.irp.f
--- a/src/mo_two_e_ints/cholesky.irp.f
+++ b/src/mo_two_e_ints/cholesky.irp.f
@ -101,3 +101,34 @@ BEGIN_PROVIDER [ double precision, cholesky_mo_transp, (cholesky_mo_num, mo_num,

 END_PROVIDER

+
+BEGIN_PROVIDER [ double precision, cholesky_semi_mo_transp_simple, (cholesky_mo_num, ao_num, mo_num) ]
+ implicit none
+ BEGIN_DOC
+ ! Cholesky vectors in MO basis
+ END_DOC
+
+ double precision, allocatable :: X(:,:,:)
+ double precision :: wall0, wall1
+ integer :: ierr
+ print *, 'Semi AO->MO Transformation of Cholesky vectors'
+  call wall_time(wall0)
+
+ allocate(X(mo_num,cholesky_mo_num,ao_num), stat=ierr)
+ if (ierr /= 0) then
+   print *, irp_here, ': Allocation failed'
+ endif
+ integer :: i_chol, i_mo, j_mo, i_ao 
+ cholesky_semi_mo_transp_simple = 0.d0
+ do i_mo = 1, mo_num
+  do i_ao = 1, ao_num
+   do j_mo = 1, mo_num
+    do i_chol = 1, cholesky_mo_num
+     cholesky_semi_mo_transp_simple(i_chol, i_ao,i_mo) += cholesky_mo_transp(i_chol,j_mo,i_mo) * mo_coef_transp(j_mo,i_ao)
+    enddo
+   enddo
+  enddo
+ enddo
+
+END_PROVIDER
+
--- a/src/mo_two_e_ints/map_integrals.irp.f
+++ b/src/mo_two_e_ints/map_integrals.irp.f
@ -40,7 +40,7 @@ end
 ! Min and max values of the MOs for which the integrals are in the cache
 END_DOC

- mo_integrals_cache_size  = 2_8**mo_integrals_cache_shift
+ mo_integrals_cache_size  = 2**mo_integrals_cache_shift

 mo_integrals_cache_min = max(1,elec_alpha_num - (mo_integrals_cache_size/2 - 1) )
 mo_integrals_cache_max = min(mo_num, mo_integrals_cache_min + mo_integrals_cache_size - 1)
--- a/src/mu_of_r/f_hf_cholesky.irp.f
+++ b/src/mu_of_r/f_hf_cholesky.irp.f
@ -289,6 +289,106 @@ BEGIN_PROVIDER [ double precision, f_hf_cholesky_sparse, (n_points_final_grid)]
 endif
 END_PROVIDER 

+BEGIN_PROVIDER [ double precision, f_hf_cholesky_sparse_bis, (n_points_final_grid)]
+ implicit none
+ integer :: ipoint,m,mm,i,ii,p
+ !!f(R) =  \sum_{I} \sum_{J} Phi_I(R) Phi_J(R) V_IJ
+ !!     =  \sum_{I}\sum_{J}\sum_A Phi_I(R) Phi_J(R) V_AI V_AJ
+ !!     =  \sum_A \sum_{I}Phi_I(R)V_AI \sum_{J}V_AJ Phi_J(R)
+ !!     =  \sum_A V_AR G_AR 
+ !! V_AR = \sum_{I}Phi_IR V_AI = \sum_{I}Phi^t_RI V_AI
+ double precision :: u_dot_v,wall0,wall1,accu_1, accu_2,mo_i_r1,mo_b_r1
+ double precision :: thresh_1,thresh_2
+ double precision, allocatable :: accu_vec(:),delta_vec(:)
+ thresh_2 = ao_cholesky_threshold * 100.d0
+ thresh_1 = dsqrt(thresh_2)
+ provide cholesky_mo_transp
+ if(elec_alpha_num == elec_beta_num)then
+  call wall_time(wall0)
+  !$OMP PARALLEL DEFAULT(NONE)                                      &
+  !$OMP PRIVATE (accu_vec,ipoint,p,ii,i,mm,m,mo_i_r1,mo_b_r1) & 
+  !$OMP ShARED (n_occ_val_orb_for_hf,list_valence_orb_for_hf,mos_in_r_array_omp,aos_in_r_array,thresh_1,thresh_2) & 
+  !$OMP ShARED (cholesky_mo_num,f_hf_cholesky_sparse_bis,n_points_final_grid,cholesky_semi_mo_transp_simple,ao_num) 
+  allocate(accu_vec(cholesky_mo_num))
+  !$OMP DO 
+   do ipoint = 1, n_points_final_grid
+    f_hf_cholesky_sparse_bis(ipoint) = 0.d0
+     accu_vec = 0.d0
+     do ii = 1, n_occ_val_orb_for_hf(1)
+      i = list_valence_orb_for_hf(ii,1)
+      mo_i_r1 = mos_in_r_array_omp(i,ipoint)
+      if(dabs(mo_i_r1).lt.thresh_1)cycle
+      do mm = 1, ao_num ! electron 1 
+       mo_b_r1 = aos_in_r_array(mm,ipoint)*mo_i_r1
+       if(dabs(mo_b_r1).lt.thresh_2)cycle
+       do p = 1, cholesky_mo_num
+        accu_vec(p) = accu_vec(p) + mo_b_r1 * cholesky_semi_mo_transp_simple(p,mm,i)
+       enddo
+      enddo
+     enddo
+     do p = 1, cholesky_mo_num
+      f_hf_cholesky_sparse_bis(ipoint) = f_hf_cholesky_sparse_bis(ipoint) + accu_vec(p) * accu_vec(p)
+     enddo
+    f_hf_cholesky_sparse_bis(ipoint) *= 2.D0
+   enddo
+  !$OMP END DO
+  deallocate(accu_vec)
+  !$OMP END PARALLEL
+  
+  call wall_time(wall1)
+  print*,'Time to provide f_hf_cholesky_sparse_bis = ',wall1-wall0
+ else
+  call wall_time(wall0)
+  !$OMP PARALLEL DEFAULT(NONE)                                      &
+  !$OMP PRIVATE (accu_vec,delta_vec,ipoint,p,ii,i,mm,m,mo_i_r1,mo_b_r1) & 
+  !$OMP ShARED (n_occ_val_orb_for_hf,list_valence_orb_for_hf,list_basis,mos_in_r_array_omp,thresh_1,thresh_2) & 
+  !$OMP ShARED (cholesky_mo_num,f_hf_cholesky_sparse_bis,n_points_final_grid,cholesky_mo_transp,n_basis_orb) 
+  allocate(accu_vec(cholesky_mo_num),delta_vec(cholesky_mo_num))
+  !$OMP DO 
+   do ipoint = 1, n_points_final_grid
+    f_hf_cholesky_sparse_bis(ipoint) = 0.d0
+     accu_vec = 0.d0
+     do ii = 1, n_occ_val_orb_for_hf(2)
+      i = list_valence_orb_for_hf(ii,2)
+      mo_i_r1 = mos_in_r_array_omp(i,ipoint)
+      if(dabs(mo_i_r1).lt.thresh_1)cycle
+      do mm = 1, n_basis_orb ! electron 1 
+       m = list_basis(mm)
+       mo_b_r1 = mos_in_r_array_omp(m,ipoint)
+       if(dabs(mo_i_r1*mo_b_r1).lt.thresh_2)cycle
+       do p = 1, cholesky_mo_num
+        accu_vec(p) = accu_vec(p) + mo_i_r1 * mo_b_r1 * cholesky_mo_transp(p,m,i)
+       enddo
+      enddo
+     enddo
+     delta_vec = 0.d0
+     do ii =  n_occ_val_orb_for_hf(2)+1,n_occ_val_orb_for_hf(1)
+      i = list_valence_orb_for_hf(ii,1)
+      mo_i_r1 = mos_in_r_array_omp(i,ipoint)
+      if(dabs(mo_i_r1).lt.thresh_1)cycle
+      do mm = 1, n_basis_orb ! electron 1 
+       m = list_basis(mm)
+       mo_b_r1 = mos_in_r_array_omp(m,ipoint)
+       if(dabs(mo_i_r1*mo_b_r1).lt.thresh_2)cycle
+       do p = 1, cholesky_mo_num
+        delta_vec(p) = delta_vec(p) + mo_i_r1 * mo_b_r1 * cholesky_mo_transp(p,m,i)
+       enddo
+      enddo
+     enddo
+     do p = 1, cholesky_mo_num
+      f_hf_cholesky_sparse_bis(ipoint) = f_hf_cholesky_sparse_bis(ipoint) + accu_vec(p) * accu_vec(p) + accu_vec(p) * delta_vec(p)
+     enddo
+    f_hf_cholesky_sparse_bis(ipoint) *= 2.D0
+   enddo
+  !$OMP END DO
+  deallocate(accu_vec)
+  !$OMP END PARALLEL
+  call wall_time(wall1)
+  print*,'Time to provide f_hf_cholesky_sparse_bis = ',wall1-wall0
+ endif
+END_PROVIDER 
+
+
 BEGIN_PROVIDER [ double precision, on_top_hf_grid, (n_points_final_grid)]
 implicit none
 integer :: ipoint,i,ii
--- a/src/mu_of_r/mu_of_r_mean_field.irp.f
+++ b/src/mu_of_r/mu_of_r_mean_field.irp.f
@ -0,0 +1,171 @@
+BEGIN_PROVIDER [ double precision, two_e_int_mf, (elec_beta_num,elec_alpha_num,elec_beta_num,elec_alpha_num)]
+ implicit none
+ integer :: i,j,k,l 
+ double precision :: get_two_e_integral
+ do i = 1, elec_alpha_num
+  do j = 1, elec_beta_num
+   do k = 1, elec_alpha_num
+    do l = 1, elec_beta_num
+     two_e_int_mf(l,k,j,i) = get_two_e_integral(l,k,j,i,mo_integrals_map) 
+    enddo
+   enddo
+  enddo
+ enddo
+END_PROVIDER 
+
+subroutine get_f_mf_ab(r,f_mf_ab,two_bod_dens, dm_a, dm_b)
+ implicit none
+ double precision, intent(in) :: r(3)
+ double precision, intent(out):: f_mf_ab,two_bod_dens, dm_a, dm_b
+ double precision, allocatable :: mos_array_r(:),mos_array_a(:), mos_array_b(:)
+ integer :: i,j,k,l
+ allocate(mos_array_r(mo_num), mos_array_a(elec_alpha_num), mos_array_b(elec_alpha_num))
+ call give_all_mos_at_r(r,mos_array_r) 
+ do i = 1, elec_alpha_num
+  mos_array_a(i) = mos_array_r(i)
+ enddo
+ do i = 1, elec_beta_num
+  mos_array_b(i) = mos_array_r(i)
+ enddo
+
+ dm_a = 0.d0
+ do i = 1, elec_alpha_num
+  dm_a += mos_array_a(i) * mos_array_a(i) 
+ enddo
+
+ dm_b = 0.d0
+ do i = 1, elec_beta_num
+  dm_b += mos_array_b(i) * mos_array_b(i) 
+ enddo
+ two_bod_dens = dm_a * dm_b
+ 
+ f_mf_ab = 0.d0
+ do i = 1, elec_alpha_num
+  do j = 1, elec_beta_num
+   do k = 1, elec_alpha_num
+    do l = 1, elec_beta_num
+     f_mf_ab += two_e_int_mf(l,k,j,i) * mos_array_a(i) * mos_array_a(k) * mos_array_b(j) * mos_array_b(l)
+    enddo
+   enddo
+  enddo
+ enddo
+ ! multiply by two to adapt to the N(N-1) normalization condition of the active two-rdm
+ f_mf_ab *= 2.d0 
+ two_bod_dens *= 2.d0
+
+end
+
+subroutine get_grad_f_mf_ab(r,grad_f_mf_ab, grad_two_bod_dens,f_mf_ab,two_bod_dens, dm_a, dm_b,grad_dm_a, grad_dm_b)
+ implicit none
+ double precision, intent(in)  :: r(3)
+ double precision, intent(out) :: f_mf_ab, two_bod_dens
+ double precision, intent(out) :: grad_two_bod_dens(3), grad_f_mf_ab(3)
+ double precision, intent(out) :: dm_a, dm_b, grad_dm_a(3), grad_dm_b(3)
+
+ double precision, allocatable :: mos_array_r(:), mos_grad_array_r(:,:)
+ double precision, allocatable :: mos_array_a(:), mos_array_b(:)
+ double precision, allocatable :: mos_grad_array_a(:,:), mos_grad_array_b(:,:)
+ double precision :: mo_i, mo_j, mo_k, mo_l
+ double precision :: grad_mo_i(3), grad_mo_j(3), grad_mo_k(3), grad_mo_l(3)
+ 
+ integer :: i,j,k,l
+ allocate(mos_array_r(mo_num),mos_grad_array_r(3,mo_num))
+ allocate(mos_array_a(elec_alpha_num), mos_array_b(elec_beta_num))
+ allocate(mos_grad_array_a(3,elec_alpha_num), mos_grad_array_b(3,elec_beta_num))
+ call give_all_mos_and_grad_at_r(r,mos_array_r,mos_grad_array_r)
+ do i = 1, elec_alpha_num
+  mos_array_a(i) = mos_array_r(i)
+  mos_grad_array_a(1:3,i) = mos_grad_array_r(1:3,i)
+ enddo
+ do i = 1, elec_beta_num
+  mos_array_b(i) = mos_array_r(i)
+  mos_grad_array_b(1:3,i) = mos_grad_array_r(1:3,i)
+ enddo
+
+ ! ALPHA DENSITY AND GRADIENT 
+ dm_a = 0.d0
+ grad_dm_a = 0.d0
+ do i = 1, elec_alpha_num
+  dm_a += mos_array_a(i) * mos_array_a(i) 
+  grad_dm_a(1:3) += 2.d0 * mos_array_a(i) * mos_grad_array_a(1:3,i)  
+ enddo
+
+ ! BETA DENSITY AND GRADIENT 
+ dm_b = 0.d0
+ grad_dm_b = 0.d0
+ do i = 1, elec_beta_num
+  dm_b += mos_array_b(i) * mos_array_b(i) 
+  grad_dm_b(1:3) += 2.d0 * mos_array_b(i) * mos_grad_array_b(1:3,i)  
+ enddo
+ ! TWO-BODY DENSITY AND GRADIENT 
+ two_bod_dens = dm_a * dm_b
+ grad_two_bod_dens(1:3) = dm_a * grad_dm_b(1:3) + dm_b * grad_dm_a(1:3)
+
+ ! F_MF and GRADIENT 
+ grad_f_mf_ab = 0.d0
+ f_mf_ab  = 0.d0
+ do i = 1, elec_alpha_num
+  mo_i = mos_array_a(i)
+  grad_mo_i(1:3) = mos_grad_array_a(1:3,i)
+  do j = 1, elec_beta_num
+   mo_j = mos_array_b(j)
+   grad_mo_j(1:3) = mos_grad_array_b(1:3,j)
+   do k = 1, elec_alpha_num
+    mo_k = mos_array_a(k)
+    grad_mo_k(1:3) = mos_grad_array_a(1:3,k)
+    do l = 1, elec_beta_num
+     mo_l = mos_array_b(l)
+     grad_mo_l(1:3) = mos_grad_array_b(1:3,l)
+     f_mf_ab += two_e_int_mf(l,k,j,i) * mo_i * mo_j * mo_k * mo_l
+     grad_f_mf_ab(1:3) += two_e_int_mf(l,k,j,i) * & 
+     (mo_i * mo_j * mo_k * grad_mo_l(1:3) + mo_i * mo_j * grad_mo_k(1:3) * mo_l & 
+     +mo_i * grad_mo_j(1:3) * mo_k * mo_l + grad_mo_i(1:3) * mo_j * mo_k * mo_l)
+    enddo
+   enddo
+  enddo
+ enddo
+
+ f_mf_ab *= 2.d0 
+ two_bod_dens *= 2.d0
+ grad_f_mf_ab *= 2.D0
+ grad_two_bod_dens *= 2.d0
+end
+
+subroutine mu_of_r_mean_field(r,mu_mf, dm)
+ implicit none
+  include 'constants.include.F'
+ double precision, intent(in) :: r(3)
+ double precision, intent(out):: mu_mf, dm
+ double precision :: f_mf_ab,two_bod_dens, dm_a, dm_b
+ call get_f_mf_ab(r,f_mf_ab,two_bod_dens, dm_a, dm_b)
+ dm = dm_a + dm_b
+ if(dabs(two_bod_dens).lt.1.d-10)then
+  mu_mf = 1.d+10
+ else
+  mu_mf = 0.5d0 * sqpi * f_mf_ab/two_bod_dens
+ endif
+end
+
+subroutine grad_mu_of_r_mean_field(r,mu_mf, dm, grad_mu_mf, grad_dm)
+ implicit none
+ include 'constants.include.F'
+ double precision, intent(in) :: r(3)
+ double precision, intent(out):: grad_mu_mf(3), grad_dm(3)
+ double precision, intent(out):: mu_mf, dm
+ double precision :: grad_f_mf_ab(3), grad_two_bod_dens(3),grad_dm_a(3), grad_dm_b(3)
+ double precision :: f_mf_ab,two_bod_dens, dm_a, dm_b
+ call get_grad_f_mf_ab(r,grad_f_mf_ab, grad_two_bod_dens,f_mf_ab,two_bod_dens, dm_a, dm_b,grad_dm_a, grad_dm_b)
+
+ dm = dm_a + dm_b
+ grad_dm(1:3) = grad_dm_a(1:3) + grad_dm_b(1:3)
+
+ if(dabs(two_bod_dens).lt.1.d-10)then
+  mu_mf = 1.d+10
+  grad_mu_mf = 0.d0
+ else
+  mu_mf = 0.5d0 * sqpi * f_mf_ab/two_bod_dens
+  grad_mu_mf(1:3) = 0.5d0 * sqpi * (grad_f_mf_ab(1:3) * two_bod_dens - f_mf_ab * grad_two_bod_dens(1:3))& 
+                                   /(two_bod_dens*two_bod_dens)
+ endif 
+
+end
--- a/src/mu_of_r/test_proj_op.irp.f
+++ b/src/mu_of_r/test_proj_op.irp.f
@ -15,7 +15,162 @@ program projected_operators
 !  call test_f_HF_valence_ab
 !  call routine_full_mos
 !   call test_f_ii_valence_ab
-   call test_f_ia_valence_ab
-  call test_f_ii_ia_aa_valence_ab
+!   call test_f_ia_valence_ab
+!  call test_f_ii_ia_aa_valence_ab
+! call test
+!  call test_f_mean_field
+! call test_grad_f_mean_field
+ call test_grad_mu_mf
 end

+
+subroutine test
+ implicit none
+ integer :: i_point
+ double precision :: ref, new, accu, weight
+ accu = 0.d0
+ do i_point = 1, n_points_final_grid
+  ref = f_hf_cholesky_sparse(i_point)
+  new = f_hf_cholesky_sparse_bis(i_point)
+  weight = final_weight_at_r_vector(i_point)
+  accu += dabs(ref - new) * weight
+ enddo
+ print*,'accu = ',accu
+
+end
+
+subroutine test_f_mean_field
+ implicit none
+ integer :: i_point
+ double precision :: weight,r(3)
+ double precision :: ref_f, new_f, accu_f
+ double precision :: ref_two_dens, new_two_dens, accu_two_dens, dm_a, dm_b
+ accu_f = 0.d0
+ accu_two_dens = 0.d0
+ do i_point = 1, n_points_final_grid
+  r(1:3)   = final_grid_points(1:3,i_point)
+  weight = final_weight_at_r_vector(i_point)
+  call get_f_mf_ab(r,new_f,new_two_dens, dm_a, dm_b)
+  call f_HF_valence_ab(r,r,ref_f,ref_two_dens)
+  accu_f += weight * dabs(new_f- ref_f)
+  accu_two_dens += weight * dabs(new_two_dens - ref_two_dens)
+ enddo
+ print*,'accu_f        = ',accu_f
+ print*,'accu_two_dens = ',accu_two_dens
+
+end
+
+subroutine test_grad_f_mean_field
+ implicit none
+ integer :: i_point,k
+ double precision :: weight,r(3)
+ double precision :: grad_f_mf_ab(3), grad_two_bod_dens(3)
+ double precision :: grad_dm_a(3), grad_dm_b(3)
+ double precision :: f_mf_ab,two_bod_dens, dm_a, dm_b
+
+ double precision :: num_grad_f_mf_ab(3), num_grad_two_bod_dens(3)
+ double precision :: num_grad_dm_a(3), num_grad_dm_b(3)
+ double precision :: f_mf_ab_p,f_mf_ab_m
+ double precision :: two_bod_dens_p, two_bod_dens_m
+ double precision :: dm_a_p, dm_a_m
+ double precision :: dm_b_p, dm_b_m
+ double precision :: rbis(3), dr
+ double precision :: accu_grad_f_mf_ab(3),accu_grad_two_bod_dens(3)
+ double precision :: accu_grad_dm_a(3),accu_grad_dm_b(3)
+ double precision :: accu_f_mf_ab, accu_two_bod_dens, accu_dm_a, accu_dm_b
+ dr = 0.00001d0
+ accu_f_mf_ab = 0.d0 
+ accu_two_bod_dens = 0.d0 
+ accu_dm_a = 0.d0 
+ accu_dm_b = 0.d0
+
+ accu_grad_f_mf_ab = 0.d0
+ accu_grad_two_bod_dens = 0.d0
+ accu_grad_dm_a = 0.d0
+ accu_grad_dm_b = 0.d0
+ do i_point = 1, n_points_final_grid
+  r(1:3)   = final_grid_points(1:3,i_point)
+  weight = final_weight_at_r_vector(i_point)
+  call get_grad_f_mf_ab(r,grad_f_mf_ab, grad_two_bod_dens,f_mf_ab,two_bod_dens, dm_a, dm_b,grad_dm_a, grad_dm_b)
+  call get_f_mf_ab(r,f_mf_ab_p,two_bod_dens_p, dm_a_p, dm_b_p)
+  accu_f_mf_ab += weight * dabs(f_mf_ab - f_mf_ab_p)
+  accu_two_bod_dens += weight * dabs(two_bod_dens - two_bod_dens_p)
+  accu_dm_a += weight*dabs(dm_a - dm_a_p)
+  accu_dm_b += weight*dabs(dm_b - dm_b_p)
+  do k = 1, 3
+   rbis = r
+   rbis(k) += dr
+   call get_f_mf_ab(rbis,f_mf_ab_p,two_bod_dens_p, dm_a_p, dm_b_p)
+   rbis = r
+   rbis(k) -= dr
+   call get_f_mf_ab(rbis,f_mf_ab_m,two_bod_dens_m, dm_a_m, dm_b_m)
+   num_grad_f_mf_ab(k) = (f_mf_ab_p - f_mf_ab_m)/(2.d0*dr)
+   num_grad_two_bod_dens(k) = (two_bod_dens_p - two_bod_dens_m)/(2.d0*dr)
+   num_grad_dm_a(k) = (dm_a_p - dm_a_m)/(2.d0*dr)
+   num_grad_dm_b(k) = (dm_b_p - dm_b_m)/(2.d0*dr)
+  enddo
+  do k = 1, 3
+   accu_grad_f_mf_ab(k) += weight * dabs(grad_f_mf_ab(k) - num_grad_f_mf_ab(k))
+   accu_grad_two_bod_dens(k) += weight * dabs(grad_two_bod_dens(k) - num_grad_two_bod_dens(k))
+   accu_grad_dm_a(k) += weight * dabs(grad_dm_a(k) - num_grad_dm_a(k))
+   accu_grad_dm_b(k) += weight * dabs(grad_dm_b(k) - num_grad_dm_b(k))
+  enddo
+ enddo
+ print*,'accu_f_mf_ab = ',accu_f_mf_ab
+ print*,'accu_two_bod_dens = ',accu_two_bod_dens
+ print*,'accu_dm_a = ',accu_dm_a
+ print*,'accu_dm_b = ',accu_dm_b
+ print*,'accu_grad_f_mf_ab = '
+ print*,accu_grad_f_mf_ab
+ print*,'accu_grad_two_bod_dens = '
+ print*,accu_grad_two_bod_dens
+ print*,'accu_dm_a = '
+ print*,accu_grad_dm_a
+ print*,'accu_dm_b = '
+ print*,accu_grad_dm_b
+
+end
+
+subroutine test_grad_mu_mf
+ implicit none
+ integer :: i_point,k
+ double precision :: weight,r(3),rbis(3)
+ double precision :: mu_mf, dm,grad_mu_mf(3), grad_dm(3)
+ double precision :: mu_mf_p, mu_mf_m, dm_m, dm_p, num_grad_mu_mf(3),dr, num_grad_dm(3)
+ double precision :: accu_mu, accu_dm, accu_grad_dm(3), accu_grad_mu_mf(3)
+ dr = 0.00001d0
+ accu_grad_mu_mf = 0.d0
+ accu_mu = 0.d0
+ accu_grad_dm = 0.d0
+ accu_dm = 0.d0
+ do i_point = 1, n_points_final_grid
+  r(1:3)   = final_grid_points(1:3,i_point)
+  weight = final_weight_at_r_vector(i_point)
+  call grad_mu_of_r_mean_field(r,mu_mf, dm, grad_mu_mf, grad_dm)
+  call mu_of_r_mean_field(r,mu_mf_p, dm_p)
+  accu_mu += weight*dabs(mu_mf_p - mu_mf)
+  accu_dm += weight*dabs(dm_p - dm)
+  do k = 1, 3
+   rbis = r
+   rbis(k) += dr
+   call mu_of_r_mean_field(rbis,mu_mf_p, dm_p)
+   rbis = r
+   rbis(k) -= dr
+   call mu_of_r_mean_field(rbis,mu_mf_m, dm_m)
+
+   num_grad_mu_mf(k) = (mu_mf_p - mu_mf_m)/(2.d0*dr)
+   num_grad_dm(k) = (dm_p - dm_m)/(2.d0*dr)
+  enddo
+  do k = 1, 3
+   accu_grad_dm(k)+= weight *dabs(num_grad_dm(k) - grad_dm(k))
+   accu_grad_mu_mf(k)+= weight *dabs(num_grad_mu_mf(k) - grad_mu_mf(k))
+  enddo
+ enddo
+ print*,'accu_mu = ',accu_mu
+ print*,'accu_dm = ',accu_dm
+ print*,'accu_grad_dm = '
+ print*, accu_grad_dm
+ print*,'accu_grad_mu_mf = '
+ print*, accu_grad_mu_mf
+
+end
--- a/src/tools/four_idx_transform.irp.f
+++ b/src/tools/four_idx_transform.irp.f
@ -12,6 +12,9 @@ program four_idx_transform
 !
  END_DOC

+  if (do_mo_cholesky) then
+    stop 'Not implemented with Cholesky integrals'
+  endif
  io_mo_two_e_integrals = 'Write'
  SOFT_TOUCH io_mo_two_e_integrals
  if (.true.) then
--- a/Show More
+++ b/Show More
				`@ -0,0 +1,2 @@`
				`-ltbb -lsycl -lmkl_sycl -lgpu -limf -lintlc -lstdc++`