diff --git a/configure b/configure
index 014275eb..43ca9f6d 100755
--- a/configure
+++ b/configure
@@ -40,7 +40,7 @@ Usage:
   $(basename $0) -c 
   $(basename $0) -h
   $(basename $0) -i 
-  $(basename $0) -g [nvidia|none]
+  $(basename $0) -g [nvidia|intel|none]
 
 Options:
   -c    Define a COMPILATION configuration file,
@@ -49,7 +49,7 @@ Options:
   -i    INSTALL . Use at your OWN RISK:
         no support will be provided for the installation of dependencies.
-  -g [nvidia|none]    Choose GPU acceleration (experimental)
+  -g [nvidia|intel|none]    Choose GPU acceleration
 
 Example:
 ./$(basename $0) -c config/gfortran.cfg
@@ -115,19 +115,23 @@ while getopts "d:c:i:g:h" c ; do
 done
 
 # Handle GPU acceleration
-rm -f ${QP_ROOT}/src/gpu
+rm -f ${QP_ROOT}/src/gpu_arch
 case "$GPU" in
-    amd) # Nvidia
+    amd) # AMD
         echo "Activating AMD GPU acceleration"
-        ln -s ${QP_ROOT}/src/gpu_amd ${QP_ROOT}/src/gpu
+        ln -s ${QP_ROOT}/plugins/local/gpu_amd ${QP_ROOT}/src/gpu_arch
+        ;;
+    intel) # Intel
+        echo "Activating Intel GPU acceleration (EXPERIMENTAL)"
+        ln -s ${QP_ROOT}/plugins/local/gpu_intel ${QP_ROOT}/src/gpu_arch
         ;;
     nvidia) # Nvidia
         echo "Activating Nvidia GPU acceleration"
-        ln -s ${QP_ROOT}/src/gpu_nvidia ${QP_ROOT}/src/gpu
+        ln -s ${QP_ROOT}/plugins/local/gpu_nvidia ${QP_ROOT}/src/gpu_arch
         ;;
     *) # No Acceleration
         echo "Disabling GPU acceleration"
-        ln -s ${QP_ROOT}/src/gpu_x86 ${QP_ROOT}/src/gpu
+        ln -s ${QP_ROOT}/plugins/local/gpu_x86 ${QP_ROOT}/src/gpu_arch
         ;;
 esac
diff --git a/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f b/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f
index e363830d..6b8f3b42 100644
--- a/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f
+++ b/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f
@@ -1,4 +1,36 @@
+
+! ---
+
+subroutine run_pouet
+
+  BEGIN_DOC
+  ! Debugging routine: prints the diagonal elements of the bi-orthonormal TC Fock matrix.
+  END_DOC
+
+  use selection_types
+  implicit none
+  integer :: i, j, k, ndet
+  integer :: to_select
+  logical :: has
+  type(pt2_type) :: pt2_data, pt2_data_err
+  double precision :: rss
+  double precision :: correlation_energy_ratio
+  double precision :: hf_energy_ref
+  double precision :: relative_error
+  double precision, allocatable :: zeros(:),E_tc(:), norm(:)
+
+  logical, external :: qp_stop
+  double precision, external :: memory_of_double
+
+  PROVIDE mo_l_coef mo_r_coef
+  PROVIDE H_apply_buffer_allocated distributed_davidson
+
+  print*, ' Diagonal elements of the Fock matrix '
+  do i = 1, mo_num
+    write(*,*) i, Fock_matrix_tc_mo_tot(i,i)
+  enddo
+end
 
 ! ---
 
 subroutine run_stochastic_cipsi
diff --git a/plugins/local/fci_tc_bi/fci_tc_bi_ortho.irp.f b/plugins/local/fci_tc_bi/fci_tc_bi_ortho.irp.f
index 1c1c0411..f1de0fe3 100644
--- a/plugins/local/fci_tc_bi/fci_tc_bi_ortho.irp.f
+++ b/plugins/local/fci_tc_bi/fci_tc_bi_ortho.irp.f
@@ -65,7 +65,15 @@ subroutine run_cipsi_tc()
 
   if (.not. is_zmq_slave) then
 
-    PROVIDE psi_det psi_coef mo_bi_ortho_tc_two_e mo_bi_ortho_tc_one_e
+    if(.True.)then ! DO NOT REMOVE THE IF(.TRUE.) !!
+      ! Fock_matrix_tc_mo_tot has to be provided before mo_bi_ortho_tc_two_e to avoid computing ao_two_e_tc_tot twice,
+      PROVIDE Fock_matrix_tc_mo_tot
+      ! because Fock_matrix_tc_mo_tot depends on ao_two_e_tc_tot,
+      ! and mo_bi_ortho_tc_two_e erases ao_two_e_tc_tot after it has been provided
+    endif
+    if(.True.)then ! DO NOT REMOVE THE IF(.TRUE.) !!
+      PROVIDE psi_det psi_coef mo_bi_ortho_tc_two_e mo_bi_ortho_tc_one_e
+    endif
 
     if((elec_alpha_num+elec_beta_num) .ge. 3) then
       if(three_body_h_tc) then
@@ -90,8 +98,16 @@ subroutine run_cipsi_tc()
     call json_close
 
   else
+    if(.True.)then ! DO NOT REMOVE THE IF(.TRUE.) !!
+      ! Fock_matrix_tc_mo_tot has to be provided before mo_bi_ortho_tc_two_e to avoid computing ao_two_e_tc_tot twice,
+      PROVIDE Fock_matrix_tc_mo_tot
+      ! because Fock_matrix_tc_mo_tot depends on ao_two_e_tc_tot,
+      ! and mo_bi_ortho_tc_two_e erases ao_two_e_tc_tot after it has been provided
+    endif
 
-    PROVIDE mo_bi_ortho_tc_one_e mo_bi_ortho_tc_two_e pt2_min_parallel_tasks
+    if(.True.)then ! DO NOT REMOVE THE IF(.TRUE.) !!
+      PROVIDE mo_bi_ortho_tc_one_e mo_bi_ortho_tc_two_e pt2_min_parallel_tasks
+    endif
 
     if((elec_alpha_num+elec_beta_num) .ge. 3) then
       if(three_body_h_tc) then
diff --git a/plugins/local/gpu_intel/LIB b/plugins/local/gpu_intel/LIB
new file mode 100644
index 00000000..199b0f1c
--- /dev/null
+++ b/plugins/local/gpu_intel/LIB
@@ -0,0 +1,2 @@
+-ltbb -lsycl -lmkl_sycl -lgpu -limf -lintlc -lstdc++
+
diff --git a/src/gpu_x86/NEED b/plugins/local/gpu_intel/NEED
similarity index 100%
rename from src/gpu_x86/NEED
rename to plugins/local/gpu_intel/NEED
diff --git a/plugins/local/gpu_intel/README.rst b/plugins/local/gpu_intel/README.rst
new file mode 100644
index 00000000..d42e2557
--- /dev/null
+++ b/plugins/local/gpu_intel/README.rst
@@ -0,0 +1,8 @@
+=========
+gpu_intel
+=========
+
+Intel implementation of GPU routines. Uses MKL and SYCL.
+```bash
+icpx -fsycl gpu.cxx -c -qmkl=sequential
+```
diff --git a/plugins/local/gpu_intel/gpu.sycl b/plugins/local/gpu_intel/gpu.sycl
new file mode 100644
index 00000000..1f9f89ce
--- /dev/null
+++ b/plugins/local/gpu_intel/gpu.sycl
@@ -0,0 +1,177 @@
+#include <sycl/sycl.hpp>
+#include <oneapi/mkl.hpp>
+#include <cassert>
+#include <iostream>
+
+extern "C" {
+
+/* Generic functions */
+
+int gpu_ndevices() {
+  return 1;
+}
+
+void gpu_set_device(int32_t igpu) {
+}
+
+
+/* Allocation functions */
+
+void gpu_allocate(void** ptr, int64_t size) {
+  auto queue = sycl::queue(sycl::default_selector_v);
+
+  try {
+    *ptr = sycl::malloc_shared(size, queue);
+    assert(*ptr != nullptr);
+  } catch (const sycl::exception& e) {
+    std::cerr << "SYCL exception caught: " << e.what() << std::endl;
+    *ptr = nullptr; // If allocation fails, set pointer to nullptr
+  }
+}
+
+void gpu_deallocate(void** ptr) {
+  assert(*ptr != nullptr);
+  sycl::free(*ptr, sycl::queue(sycl::default_selector_v));
+  *ptr = nullptr;
+}
+
+/* Upload data from host to device */
+void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
+  sycl::queue queue(sycl::default_selector_v);
+  queue.memcpy(gpu_ptr, cpu_ptr, n).wait();
+}
+
+/* Download data from device to host */
+void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
+  sycl::queue queue(sycl::default_selector_v);
+  queue.memcpy(cpu_ptr, gpu_ptr, n).wait();
+}
+
+/* Copy data from one GPU memory location to another */
+void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
+  sycl::queue queue(sycl::default_selector_v);
+  queue.memcpy(gpu_ptr_dest, gpu_ptr_src, n).wait();
+}
+
+/* Queues */
+
+/* SYCL queue as a replacement for CUDA stream */
+void gpu_stream_create(sycl::queue** ptr) {
+  *ptr = new sycl::queue(sycl::default_selector_v);
+}
+
+void gpu_stream_destroy(sycl::queue** ptr) {
+  assert(*ptr != nullptr);
+  delete *ptr;
+  *ptr = nullptr;
+}
+
+void gpu_synchronize() {
+  sycl::queue queue(sycl::default_selector_v);
+  queue.wait_and_throw();
+}
+
+/* BLAS functions */
+
+typedef struct {
+  sycl::queue* queue;
+} blasHandle_t;
+
+void gpu_set_stream(blasHandle_t* handle, sycl::queue* ptr) {
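+  // Attach the given SYCL queue to this BLAS handle; mirrors cublasSetStream in the Nvidia backend.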
+  handle->queue = ptr;
+}
+
+void gpu_blas_create(blasHandle_t** ptr) {
+  *ptr = (blasHandle_t*) malloc(sizeof(blasHandle_t));
+  assert(*ptr != nullptr);
+  (*ptr)->queue = new sycl::queue(sycl::default_selector_v);
+  assert((*ptr)->queue != nullptr);
+}
+
+void gpu_blas_destroy(blasHandle_t** ptr) {
+  assert(*ptr != nullptr);
+  delete (*ptr)->queue;
+  free(*ptr);
+  *ptr = nullptr;
+}
+
+
+void gpu_ddot(blasHandle_t* handle, const int64_t n, const double* x, const int64_t incx,
+              const double* y, const int64_t incy, double* result) {
+  // Ensure input parameters are valid
+  assert(handle != nullptr);
+  assert(handle->queue != nullptr);
+  assert(n > 0);
+  assert(incx > 0);
+  assert(incy > 0);
+  assert(x != nullptr);
+  assert(y != nullptr);
+  assert(result != nullptr);
+
+  oneapi::mkl::blas::dot(*handle->queue, n, x, incx, y, incy, result);
+
+}
+
+void gpu_dgemv(blasHandle_t* handle, const char* transa, const int64_t m, const int64_t n, const double* alpha,
+               const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) {
+
+  assert(handle != nullptr);
+  assert(handle->queue != nullptr);
+
+  // Validate matrix dimensions and increments to be positive
+  assert(m > 0 && n > 0 && lda > 0 && incx > 0 && incy > 0);
+  assert(a != nullptr && x != nullptr && y != nullptr && alpha != nullptr && beta != nullptr);
+
+  // Determine the operation type
+  oneapi::mkl::transpose transa_ = oneapi::mkl::transpose::nontrans;
+  if (*transa == 'T' || *transa == 't') {
+    transa_ = oneapi::mkl::transpose::trans;
+  }
+
+  // Perform DGEMV operation using oneMKL
+  oneapi::mkl::blas::column_major::gemv(*handle->queue, transa_, m, n, *alpha, a, lda, x, incx, *beta, y, incy);
+
+}
+
+void gpu_dgemm(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha,
+               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) {
+
+  assert(handle != nullptr && handle->queue != nullptr);
+  assert(m > 0 && n > 0 && k > 0 && lda > 0 && ldb > 0 && ldc > 0);
+  assert(a != nullptr && b != nullptr && c != nullptr && alpha != nullptr && beta != nullptr);
+
+  // Transpose operations
+  auto transa_ = (*transa == 'T' || *transa == 't') ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans;
+  auto transb_ = (*transb == 'T' || *transb == 't') ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans;
+
+  oneapi::mkl::blas::column_major::gemm(*handle->queue, transa_, transb_, m, n, k,
+                                        *alpha, a, lda, b, ldb, *beta, c, ldc);
+
+}
+
+
+void gpu_dgeam(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha,
+               const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
+  assert(handle != nullptr && handle->queue != nullptr);
+  assert(m > 0 && n > 0 && lda > 0 && ldb > 0 && ldc > 0);
+  assert(a != nullptr && b != nullptr && c != nullptr && alpha != nullptr && beta != nullptr);
+
+  // Determine transpose operations
+  bool transA = (*transa == 'T' || *transa == 't');
+  bool transB = (*transb == 'T' || *transb == 't');
+
+  handle->queue->submit([&](sycl::handler& cgh) {
+    cgh.parallel_for(sycl::range<2>(m, n), [=](sycl::id<2> idx) {
+      const int i = idx[0];
+      const int j = idx[1];
+      const int ai = transA ? j * lda + i : i * lda + j;
+      const int bi = transB ? j * ldb + i : i * ldb + j;
+      const int ci = i * ldc + j;
+
+      c[ci] = (*alpha) * a[ai] + (*beta) * b[bi];
+    });
+  });
+
+}
+
+} // extern C
diff --git a/plugins/local/gpu_nvidia/LIB b/plugins/local/gpu_nvidia/LIB
new file mode 100644
index 00000000..91f54e91
--- /dev/null
+++ b/plugins/local/gpu_nvidia/LIB
@@ -0,0 +1 @@
+-lcudart -lcublas -lcublasLt
diff --git a/plugins/local/gpu_nvidia/NEED b/plugins/local/gpu_nvidia/NEED
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/plugins/local/gpu_nvidia/NEED
@@ -0,0 +1 @@
+
diff --git a/plugins/local/gpu_nvidia/README.rst b/plugins/local/gpu_nvidia/README.rst
new file mode 100644
index 00000000..5dcfca92
--- /dev/null
+++ b/plugins/local/gpu_nvidia/README.rst
@@ -0,0 +1,5 @@
+==========
+gpu_nvidia
+==========
+
+Nvidia implementation of GPU routines. Uses CUDA and CUBLAS libraries.
diff --git a/plugins/local/gpu_nvidia/gpu.c b/plugins/local/gpu_nvidia/gpu.c
new file mode 100644
index 00000000..a775ab95
--- /dev/null
+++ b/plugins/local/gpu_nvidia/gpu.c
@@ -0,0 +1,326 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <assert.h>
+
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+
+
+/* Generic functions */
+
+int gpu_ndevices() {
+  int ngpus;
+  cudaGetDeviceCount(&ngpus);
+  return ngpus;
+}
+
+void gpu_set_device(int32_t igpu) {
+  cudaSetDevice((int) igpu);
+}
+
+
+/* Allocation functions */
+
+void gpu_allocate(void** ptr, const int64_t size) {
+  size_t free, total;
+  cudaError_t rc = cudaMemGetInfo( &free, &total );
+  if (rc != cudaSuccess) {
+    free = INT64_MAX;
+  }
+
+  rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
+//  /* Use managed memory if it does not fit on the GPU */
+//  if (size < free && size < total/2) {
+//    rc= cudaMalloc(ptr, size);
+//  } else {
+//    rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
+//  }
+  assert (rc == cudaSuccess);
+}
+
+void gpu_deallocate(void** ptr) {
+  assert (*ptr != NULL);
+  cudaFree(*ptr);
+  *ptr = NULL;
+}
+
+
+/* Memory transfer functions */
+
+void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
+  cudaMemcpy (gpu_ptr, cpu_ptr, n, cudaMemcpyHostToDevice);
+}
+
+void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
+  cudaMemcpy (cpu_ptr, gpu_ptr, n, cudaMemcpyDeviceToHost);
+}
+
+void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
+  cudaMemcpy (gpu_ptr_dest, gpu_ptr_src, n, cudaMemcpyDeviceToDevice);
+}
+
+
+/* Streams */
+
+void gpu_stream_create(cudaStream_t* ptr) {
+  cudaError_t rc = cudaStreamCreate(ptr);
+  assert (rc == cudaSuccess);
+}
+
+void gpu_stream_destroy(cudaStream_t* ptr) {
+  assert (ptr != NULL);
+  cudaError_t rc = cudaStreamDestroy(*ptr);
+  assert (rc == cudaSuccess);
+  *ptr = NULL;
+}
+
+void gpu_set_stream(cublasHandle_t handle, cudaStream_t stream) {
+  cublasSetStream(handle, stream);
+}
+
+void gpu_synchronize() {
+  cudaDeviceSynchronize();
+}
+
+
+/* BLAS functions */
+
+void gpu_blas_create(cublasHandle_t* ptr) {
+  cublasStatus_t rc = cublasCreate(ptr);
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+}
+
+
+void gpu_blas_destroy(cublasHandle_t* ptr) {
+  assert (ptr != NULL);
+  cublasStatus_t rc = cublasDestroy(*ptr);
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+  *ptr = NULL;
+}
+
+
+void gpu_ddot(cublasHandle_t handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
+  assert (handle != NULL);
+  /* Convert to int */
+  int n_, incx_, incy_;
+
+  n_    = (int) n;
+  incx_ = (int) incx;
+  incy_ = (int) incy;
+
+  assert ( (int64_t) n_    == n   );
+  assert ( (int64_t) incx_ == incx);
+
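+  /* Like the checks in gpu_sdot below, these asserts catch int64 -> int overflow: cuBLAS takes 32-bit arguments. */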
assert ( (int64_t) incy_ == incy); + + cublasStatus_t rc = cublasDdot(handle, n_, x, incx_, y, incy_, result); + assert (rc == CUBLAS_STATUS_SUCCESS); +} + + + +void gpu_sdot(cublasHandle_t handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { + assert (handle != NULL); + + /* Convert to int */ + int n_, incx_, incy_; + + n_ = (int) n; + incx_ = (int) incx; + incy_ = (int) incy; + + /* Check for integer overflows */ + assert ( (int64_t) n_ == n ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + float result_ = 0.; + cublasStatus_t rc = cublasSdot(handle, n_, x, incx_, y, incy_, &result_); + assert (rc == CUBLAS_STATUS_SUCCESS); + *result = result_; +} + + + +void gpu_dgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) { + + assert (handle != NULL); + + /* Convert to int */ + int m_, n_, lda_, incx_, incy_; + + m_ = (int) m; + n_ = (int) n; + lda_ = (int) lda; + incx_ = (int) incx; + incy_ = (int) incy; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + cublasOperation_t transa_ = CUBLAS_OP_N; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; + + cublasDgemv(handle, transa_, m_, n_, alpha, a, lda_, x, incx_, beta, y, incy_); +} + + + +void gpu_sgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const float* alpha, + const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy) { + + assert (handle != NULL); + + /* Convert to int */ + int m_, n_, lda_, incx_, incy_; + + m_ = (int) m; + n_ = (int) n; + lda_ = (int) lda; + incx_ = (int) incx; + incy_ = (int) incy; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + cublasOperation_t transa_ = CUBLAS_OP_N; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; + + cublasSgemv(handle, transa_, m_, n_, alpha, a, lda_, x, incx_, beta, y, incy_); +} + + +void gpu_dgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha, + const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) { + + assert (handle != NULL); + + /* Convert to int */ + int m_, n_, k_, lda_, ldb_, ldc_; + + m_ = (int) m; + n_ = (int) n; + k_ = (int) k; + lda_ = (int) lda; + ldb_ = (int) ldb; + ldc_ = (int) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) k_ == k ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + cublasOperation_t transa_ = CUBLAS_OP_N; + cublasOperation_t transb_ = CUBLAS_OP_N; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; + if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T; + + cublasDgemm(handle, transa_, transb_, m_, n_, k_, alpha, a, lda_, b, ldb_, beta, c, ldc_); +} + + + +void gpu_sgemm(cublasHandle_t handle, const char* transa, const char* transb, const 
int64_t m, const int64_t n, const int64_t k, const float* alpha, + const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc) { + + assert (handle != NULL); + + /* Convert to int */ + int m_, n_, k_, lda_, ldb_, ldc_; + + m_ = (int) m; + n_ = (int) n; + k_ = (int) k; + lda_ = (int) lda; + ldb_ = (int) ldb; + ldc_ = (int) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) k_ == k ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + cublasOperation_t transa_ = CUBLAS_OP_N; + cublasOperation_t transb_ = CUBLAS_OP_N; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; + if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T; + + cublasSgemm(handle, transa_, transb_, m_, n_, k_, alpha, a, lda_, b, ldb_, beta, c, ldc_); +} + + +void gpu_dgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { + assert (handle != NULL); + + /* Convert to int */ + int m_, n_, lda_, ldb_, ldc_; + + m_ = (int) m; + n_ = (int) n; + lda_ = (int) lda; + ldb_ = (int) ldb; + ldc_ = (int) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + cublasOperation_t transa_ = CUBLAS_OP_N; + cublasOperation_t transb_ = CUBLAS_OP_N; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; + if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T; + + cublasDgeam(handle, transa_, transb_, m_, n_, alpha, a, lda_, beta, b, ldb_, c, ldc_); + +} + + +void gpu_sgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float* alpha, + const float* a, const int64_t lda, const float* beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) { + assert (handle != NULL); + + /* Convert to int */ + int m_, n_, lda_, ldb_, ldc_; + + m_ = (int) m; + n_ = (int) n; + lda_ = (int) lda; + ldb_ = (int) ldb; + ldc_ = (int) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + cublasOperation_t transa_ = CUBLAS_OP_N; + cublasOperation_t transb_ = CUBLAS_OP_N; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; + if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T; + + cublasSgeam(handle, transa_, transb_, m_, n_, alpha, a, lda_, beta, b, ldb_, c, ldc_); + +} diff --git a/plugins/local/gpu_x86/NEED b/plugins/local/gpu_x86/NEED new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/plugins/local/gpu_x86/NEED @@ -0,0 +1 @@ + diff --git a/src/gpu_x86/README.rst b/plugins/local/gpu_x86/README.rst similarity index 100% rename from src/gpu_x86/README.rst rename to plugins/local/gpu_x86/README.rst diff --git a/src/gpu_x86/gpu.c b/plugins/local/gpu_x86/gpu.c similarity index 51% rename from src/gpu_x86/gpu.c rename to plugins/local/gpu_x86/gpu.c index 71505dbe..49aec9d3 100644 --- a/src/gpu_x86/gpu.c +++ b/plugins/local/gpu_x86/gpu.c @@ -2,13 +2,13 @@ #include #include #include +#include #include - /* Generic functions */ int gpu_ndevices() { - 
return 1; + return 0; } void gpu_set_device(int32_t i) { @@ -25,7 +25,7 @@ void gpu_allocate(void** ptr, const int64_t n) { } } -void gpu_free(void** ptr) { +void gpu_deallocate(void** ptr) { free(*ptr); *ptr = NULL; } @@ -49,10 +49,11 @@ void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) { /* Streams */ void gpu_stream_create(void** ptr) { - *ptr = (void*) 2; + *ptr = (void*) malloc(sizeof(char)); } void gpu_stream_destroy(void** ptr) { + free(*ptr); *ptr = NULL; } @@ -68,18 +69,19 @@ void gpu_synchronize() { /* BLAS functions */ void gpu_blas_create(void** handle) { - *handle = (void*) 1; + *handle = (void*) malloc(sizeof(char)); } void gpu_blas_destroy(void** handle) { + free(*handle); *handle = NULL; } double ddot_(const int32_t* n, const double* x, const int32_t* incx, const double* y, const int32_t* incy); -void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { +void gpu_ddot(void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { assert (handle != NULL); /* Convert to int32_t */ @@ -100,7 +102,7 @@ void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_ float sdot_(const int32_t* n, const float* x, const int32_t* incx, const float* y, const int32_t* incy); -void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { +void gpu_sdot(void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { assert (handle != NULL); /* Convert to int32_t */ @@ -122,8 +124,8 @@ void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha, const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy); -void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double alpha, - const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) { +void gpu_dgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) { assert (handle != NULL); @@ -143,15 +145,15 @@ void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int assert ( (int64_t) incx_ == incx); assert ( (int64_t) incy_ == incy); - dgemv_(&transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_); + dgemv_(transa, &m_, &n_, alpha, a, &lda_, x, &incx_, beta, y, &incy_); } void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha, const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy); -void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float alpha, - const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) { +void gpu_sgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const float* alpha, + const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy) { assert (handle != NULL); @@ 
-171,15 +173,15 @@ void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int assert ( (int64_t) incx_ == incx); assert ( (int64_t) incy_ == incy); - sgemv_(&transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_); + sgemv_(transa, &m_, &n_, alpha, a, &lda_, x, &incx_, beta, y, &incy_); } void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha, const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc); -void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, - const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) { +void gpu_dgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha, + const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) { assert (handle != NULL); @@ -201,7 +203,7 @@ void gpu_dgemm(const void* handle, const char transa, const char transb, const i assert ( (int64_t) ldb_ == ldb); assert ( (int64_t) ldc_ == ldc); - dgemm_(&transa, &transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_); + dgemm_(transa, transb, &m_, &n_, &k_, alpha, a, &lda_, b, &ldb_, beta, c, &ldc_); } @@ -209,8 +211,8 @@ void gpu_dgemm(const void* handle, const char transa, const char transb, const i void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha, const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc); -void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, - const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) { +void gpu_sgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float* alpha, + const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc) { assert (handle != NULL); @@ -232,136 +234,133 @@ void gpu_sgemm(const void* handle, const char transa, const char transb, const i assert ( (int64_t) ldb_ == ldb); assert ( (int64_t) ldc_ == ldc); - sgemm_(&transa, &transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_); + sgemm_(transa, transb, &m_, &n_, &k_, alpha, a, &lda_, b, &ldb_, beta, c, &ldc_); } -void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, - const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { - if (handle == NULL) { - perror("NULL handle"); - exit(-1); - } +void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { + assert (handle != NULL); - if ( (transa == 'N' && transb == 'N') || - (transa == 'n' && transb == 'N') || - (transa == 'N' && transb == 'n') || - (transa == 'n' && transb == 'n') ) { + if ( (*transa == 'N' && *transb == 'N') || + (*transa == 'n' && *transb 
== 'N') || + (*transa == 'N' && *transb == 'n') || + (*transa == 'n' && *transb == 'n') ) { - if (alpha == 0.) { + if (*alpha == 0.) { for (int64_t j=0 ; j - ao_two_e_tc_tot(k,i,l,j) = ao_two_e_tc_tot(k,i,l,j) + get_ao_two_e_integral(i, j, k, l, ao_integrals_map) +! eri = get_ao_two_e_integral(i, j, k, l, ao_integrals_map) + eri = get_ao_integ_chol(i,k,j,l) + ao_two_e_tc_tot(k,i,l,j) = ao_two_e_tc_tot(k,i,l,j) + eri enddo enddo enddo enddo !$OMP END DO !$OMP END PARALLEL - FREE ao_integrals_map +! FREE ao_integrals_map endif if((tc_integ_type .eq. "numeric") .and. (.not. tc_save_mem)) then diff --git a/plugins/local/slater_tc/slater_tc_opt.irp.f b/plugins/local/slater_tc/slater_tc_opt.irp.f index 5651a299..3c4421f8 100644 --- a/plugins/local/slater_tc/slater_tc_opt.irp.f +++ b/plugins/local/slater_tc/slater_tc_opt.irp.f @@ -10,8 +10,6 @@ subroutine provide_all_three_ints_bi_ortho() implicit none double precision :: t1, t2 - PROVIDE ao_two_e_integrals_in_map - print *, ' start provide_all_three_ints_bi_ortho' call wall_time(t1) diff --git a/plugins/local/slater_tc/tc_hmat.irp.f b/plugins/local/slater_tc/tc_hmat.irp.f index cc780364..6323d129 100644 --- a/plugins/local/slater_tc/tc_hmat.irp.f +++ b/plugins/local/slater_tc/tc_hmat.irp.f @@ -30,7 +30,9 @@ BEGIN_PROVIDER [double precision, htilde_matrix_elmt_bi_ortho, (N_det,N_det)] print *, ' PROVIDING htilde_matrix_elmt_bi_ortho ...' call wall_time(t1) - call provide_all_three_ints_bi_ortho() + if(three_body_h_tc)then + call provide_all_three_ints_bi_ortho() + endif i = 1 j = 1 diff --git a/plugins/local/spher_harm/spher_harm.irp.f b/plugins/local/spher_harm/spher_harm.irp.f index 7a2eea06..e8deafb9 100644 --- a/plugins/local/spher_harm/spher_harm.irp.f +++ b/plugins/local/spher_harm/spher_harm.irp.f @@ -1,7 +1,7 @@ program spher_harm implicit none -! call test_spher_harm + call test_spher_harm ! call test_cart - call test_brutal_spheric +! call test_brutal_spheric end diff --git a/plugins/local/spher_harm/spher_harm_func.irp.f b/plugins/local/spher_harm/spher_harm_func.irp.f index 825bd8ac..f12c8fb9 100644 --- a/plugins/local/spher_harm/spher_harm_func.irp.f +++ b/plugins/local/spher_harm/spher_harm_func.irp.f @@ -7,6 +7,7 @@ subroutine spher_harm_func_r3(r,l,m,re_ylm, im_ylm) double precision :: theta, phi,r_abs call cartesian_to_spherical(r,theta,phi,r_abs) call spher_harm_func(l,m,theta,phi,re_ylm, im_ylm) +! 
call spher_harm_func_expl(l,m,theta,phi,re_ylm, im_ylm) end @@ -131,6 +132,10 @@ subroutine spher_harm_func_expl(l,m,theta,phi,re_ylm, im_ylm) tmp = - inv_sq_pi * dsqrt(3.d0/8.d0) * dsin(theta) re_ylm = tmp * dcos(phi) im_ylm = tmp * dsin(phi) + else if (l==1.and.m==-1)then + tmp = - inv_sq_pi * dsqrt(3.d0/8.d0) * dsin(theta) + re_ylm = tmp * dcos(phi) + im_ylm = -tmp * dsin(phi) else if(l==1.and.m==0)then tmp = inv_sq_pi * dsqrt(3.d0/4.d0) * dcos(theta) re_ylm = tmp @@ -139,10 +144,18 @@ subroutine spher_harm_func_expl(l,m,theta,phi,re_ylm, im_ylm) tmp = 0.25d0 * inv_sq_pi * dsqrt(0.5d0*15.d0) * dsin(theta)*dsin(theta) re_ylm = tmp * dcos(2.d0*phi) im_ylm = tmp * dsin(2.d0*phi) + else if(l==2.and.m==-2)then + tmp = 0.25d0 * inv_sq_pi * dsqrt(0.5d0*15.d0) * dsin(theta)*dsin(theta) + re_ylm = tmp * dcos(2.d0*phi) + im_ylm =-tmp * dsin(2.d0*phi) else if(l==2.and.m==1)then tmp = - inv_sq_pi * dsqrt(15.d0/8.d0) * dsin(theta) * dcos(theta) re_ylm = tmp * dcos(phi) im_ylm = tmp * dsin(phi) + else if(l==2.and.m==-1)then + tmp = - inv_sq_pi * dsqrt(15.d0/8.d0) * dsin(theta) * dcos(theta) + re_ylm = tmp * dcos(phi) + im_ylm =-tmp * dsin(phi) else if(l==2.and.m==0)then tmp = dsqrt(5.d0/4.d0) * inv_sq_pi* (1.5d0*dcos(theta)*dcos(theta)-0.5d0) re_ylm = tmp diff --git a/plugins/local/tc_int/NEED b/plugins/local/tc_int/NEED index 8a4caf5b..34d2e183 100644 --- a/plugins/local/tc_int/NEED +++ b/plugins/local/tc_int/NEED @@ -1,3 +1,4 @@ +gpu tc_keywords jastrow qmckl diff --git a/plugins/local/tc_int/compute_tc_int.irp.f b/plugins/local/tc_int/compute_tc_int.irp.f index 02f21570..92c90d03 100644 --- a/plugins/local/tc_int/compute_tc_int.irp.f +++ b/plugins/local/tc_int/compute_tc_int.irp.f @@ -2,23 +2,23 @@ ! --- subroutine provide_int2_grad1_u12_ao() - + use gpu BEGIN_DOC ! - ! int2_grad1_u12_ao(i,j,ipoint,1) = \int dr2 [\grad1 u(r1,r2)]_x1 \chi_i(r2) \chi_j(r2) - ! int2_grad1_u12_ao(i,j,ipoint,2) = \int dr2 [\grad1 u(r1,r2)]_y1 \chi_i(r2) \chi_j(r2) - ! int2_grad1_u12_ao(i,j,ipoint,3) = \int dr2 [\grad1 u(r1,r2)]_z1 \chi_i(r2) \chi_j(r2) - ! int2_grad1_u12_ao(i,j,ipoint,4) = \int dr2 [-(1/2) [\grad1 u(r1,r2)]^2] \chi_i(r2) \chi_j(r2) + ! int2_grad1_u12_ao(i,j,ipoint,1) = \int dr2 [\grad1 u(r1,r2)]_x1 \chi_i(r2) \chi_j(r2) + ! int2_grad1_u12_ao(i,j,ipoint,2) = \int dr2 [\grad1 u(r1,r2)]_y1 \chi_i(r2) \chi_j(r2) + ! int2_grad1_u12_ao(i,j,ipoint,3) = \int dr2 [\grad1 u(r1,r2)]_z1 \chi_i(r2) \chi_j(r2) + ! int2_grad1_u12_ao(i,j,ipoint,4) = \int dr2 [-(1/2) [\grad1 u(r1,r2)]^2] \chi_i(r2) \chi_j(r2) ! ! - ! tc_int_2e_ao(k,i,l,j) = (ki|V^TC(r_12)|lj) - ! = where V^TC(r_12) is the total TC operator + ! tc_int_2e_ao(k,i,l,j) = (ki|V^TC(r_12)|lj) + ! = where V^TC(r_12) is the total TC operator ! = tc_grad_and_lapl_ao(k,i,l,j) + tc_grad_square_ao(k,i,l,j) + ao_two_e_coul(k,i,l,j) ! where: ! ! tc_grad_and_lapl_ao(k,i,l,j) = < k l | -1/2 \Delta_1 u(r1,r2) - \grad_1 u(r1,r2) . \grad_1 | ij > - ! = -1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2) - ! = 1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 (-1) \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2) + ! = -1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2) + ! = 1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 (-1) \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2) ! ! tc_grad_square_ao(k,i,l,j) = -1/2 ! 
@@ -35,8 +35,9 @@ subroutine provide_int2_grad1_u12_ao() double precision :: weight1, ao_k_r, ao_i_r double precision :: der_envsq_x, der_envsq_y, der_envsq_z, lap_envsq double precision :: time0, time1, time2, tc1, tc2, tc - double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:), tc_int_2e_ao(:,:,:,:) - double precision, allocatable :: tmp(:,:,:), c_mat(:,:,:), tmp_grad1_u12(:,:,:) + type(gpu_double4) :: int2_grad1_u12_ao + type(gpu_double3) :: tmp_grad1_u12, tmp_grad1_u12p, tmp + double precision, allocatable :: c_mat(:,:,:), tc_int_2e_ao(:,:,:,:) double precision, external :: get_ao_two_e_integral @@ -51,6 +52,7 @@ subroutine provide_int2_grad1_u12_ao() call total_memory(mem) mem = max(1.d0, qp_max_mem - mem) + mem = 6 n_double = mem * 1.d8 n_blocks = int(min(n_double / (n_points_extra_final_grid * 4.d0), 1.d0*n_points_final_grid)) n_rest = int(mod(n_points_final_grid, n_blocks)) @@ -64,9 +66,9 @@ subroutine provide_int2_grad1_u12_ao() ! --- ! --- - allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,4)) + call gpu_allocate(int2_grad1_u12_ao, ao_num,ao_num,n_points_final_grid,4) - allocate(tmp(n_points_extra_final_grid,ao_num,ao_num)) + call gpu_allocate(tmp,n_points_extra_final_grid,ao_num,ao_num) !$OMP PARALLEL & !$OMP DEFAULT (NONE) & !$OMP PRIVATE (j, i, jpoint) & @@ -75,49 +77,55 @@ subroutine provide_int2_grad1_u12_ao() do j = 1, ao_num do i = 1, ao_num do jpoint = 1, n_points_extra_final_grid - tmp(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j) + tmp%f(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j) enddo enddo enddo !$OMP END DO !$OMP END PARALLEL - allocate(tmp_grad1_u12(n_points_extra_final_grid,n_blocks,4)) + call gpu_allocate(tmp_grad1_u12,n_points_extra_final_grid,n_blocks,4) + call gpu_allocate(tmp_grad1_u12p,n_points_extra_final_grid,n_blocks,4) tc = 0.d0 - + + type(gpu_stream) :: stream(4) + do i=1,4 + call gpu_stream_create(stream(i)) + enddo + do i_pass = 1, n_pass ii = (i_pass-1)*n_blocks + 1 - + call wall_time(tc1) + !$OMP PARALLEL & !$OMP DEFAULT (NONE) & !$OMP PRIVATE (i_blocks, ipoint) & !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12) - !$OMP DO + !$OMP DO do i_blocks = 1, n_blocks ipoint = ii - 1 + i_blocks ! r1 - call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_blocks,1), tmp_grad1_u12(1,i_blocks,2), tmp_grad1_u12(1,i_blocks,3), tmp_grad1_u12(1,i_blocks,4)) + call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12%f(1,i_blocks,1), tmp_grad1_u12%f(1,i_blocks,2), & + tmp_grad1_u12%f(1,i_blocks,3), tmp_grad1_u12%f(1,i_blocks,4)) enddo !$OMP END DO !$OMP END PARALLEL call wall_time(tc2) - tc = tc + tc2 - tc1 + tc = tc + tc2 - tc1 + call gpu_synchronize() + call gpu_copy(tmp_grad1_u12,tmp_grad1_u12p) do m = 1, 4 - call dgemm( "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0 & - , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid & - , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) + call gpu_set_stream(blas_handle, stream(m)) + call gpu_dgemm(blas_handle, "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0 & + , tmp%f(1,1,1), n_points_extra_final_grid, tmp_grad1_u12p%f(1,1,m), n_points_extra_final_grid & + , 0.d0, int2_grad1_u12_ao%f(1,1,ii,m), ao_num*ao_num) enddo enddo - - deallocate(tmp_grad1_u12) - if(n_rest .gt. 
0) then - - allocate(tmp_grad1_u12(n_points_extra_final_grid,n_rest,4)) - + ii = n_pass*n_blocks + 1 call wall_time(tc1) @@ -125,26 +133,35 @@ subroutine provide_int2_grad1_u12_ao() !$OMP DEFAULT (NONE) & !$OMP PRIVATE (i_rest, ipoint) & !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12) - !$OMP DO + !$OMP DO do i_rest = 1, n_rest ipoint = ii - 1 + i_rest ! r1 - call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_rest,1), tmp_grad1_u12(1,i_rest,2), tmp_grad1_u12(1,i_rest,3), tmp_grad1_u12(1,i_rest,4)) + call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12%f(1,i_rest,1), tmp_grad1_u12%f(1,i_rest,2), & + tmp_grad1_u12%f(1,i_rest,3), tmp_grad1_u12%f(1,i_rest,4)) enddo !$OMP END DO !$OMP END PARALLEL call wall_time(tc2) - tc = tc + tc2 - tc1 - + tc = tc + tc2 - tc1 + do m = 1, 4 - call dgemm( "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0 & - , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid & - , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) + call gpu_set_stream(blas_handle, stream(m)) + call gpu_dgemm(blas_handle, "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0 & + , tmp%f(1,1,1), n_points_extra_final_grid, tmp_grad1_u12%f(1,1,m), n_points_extra_final_grid & + , 0.d0, int2_grad1_u12_ao%f(1,1,ii,m), ao_num*ao_num) enddo - deallocate(tmp_grad1_u12) endif + call gpu_synchronize() + call gpu_deallocate(tmp_grad1_u12) + call gpu_deallocate(tmp_grad1_u12p) - deallocate(tmp) + do i=1,4 + call gpu_stream_destroy(stream(i)) + enddo + + + call gpu_deallocate(tmp) call wall_time(time1) @@ -152,6 +169,8 @@ subroutine provide_int2_grad1_u12_ao() print*, ' wall time Jastrow derivatives (min) = ', tc / 60.d0 call print_memory_usage() +!TODO +stop ! --- ! --- ! --- @@ -177,7 +196,7 @@ subroutine provide_int2_grad1_u12_ao() !$OMP END DO !$OMP END PARALLEL call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 & - , int2_grad1_u12_ao(1,1,1,4), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid & + , int2_grad1_u12_ao%f(1,1,1,4), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid & , 0.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num) deallocate(c_mat) @@ -188,23 +207,23 @@ subroutine provide_int2_grad1_u12_ao() ! 
---
 
   call wall_time(time1)
-
+
   allocate(c_mat(n_points_final_grid,ao_num,ao_num))
   do m = 1, 3
     !$OMP PARALLEL &
     !$OMP DEFAULT (NONE) &
-    !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r) &
-    !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, c_mat, &
+    !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r) &
+    !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, c_mat, &
     !$OMP         ao_num, n_points_final_grid, final_weight_at_r_vector, m)
     !$OMP DO SCHEDULE (static)
     do i = 1, ao_num
       do k = 1, ao_num
         do ipoint = 1, n_points_final_grid
-
+
           weight1 = 0.5d0 * final_weight_at_r_vector(ipoint)
           ao_i_r = aos_in_r_array_transp(ipoint,i)
           ao_k_r = aos_in_r_array_transp(ipoint,k)
-
+
           c_mat(ipoint,k,i) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,m) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,m))
         enddo
       enddo
@@ -213,7 +232,7 @@
     !$OMP END PARALLEL
 
     call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, -1.d0 &
-              , int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
+              , int2_grad1_u12_ao%f(1,1,1,m), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
               , 1.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num)
   enddo
   deallocate(c_mat)
@@ -234,7 +253,7 @@
   ! ---
 
-  call wall_time(time1)
+  call wall_time(time1)
 
   PROVIDE ao_integrals_map
   !$OMP PARALLEL DEFAULT(NONE) &
@@ -245,7 +264,7 @@
   do l = 1, ao_num
     do i = 1, ao_num
       do k = 1, ao_num
-          ! < 1:i, 2:j | 1:k, 2:l >
+          ! < 1:i, 2:j | 1:k, 2:l >
           tc_int_2e_ao(k,i,l,j) = tc_int_2e_ao(k,i,l,j) + get_ao_two_e_integral(i, j, k, l, ao_integrals_map)
         enddo
       enddo
@@ -263,7 +282,7 @@
   print*, ' Writing int2_grad1_u12_ao in ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao'
   open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="write")
   call ezfio_set_work_empty(.False.)
-  write(11) int2_grad1_u12_ao(:,:,:,1:3)
+  write(11) int2_grad1_u12_ao%f(:,:,:,1:3)
   close(11)
 
   print*, ' Saving tc_int_2e_ao in ', trim(ezfio_filename) // '/work/ao_two_e_tc_tot'
@@ -276,7 +295,7 @@
   ! ----
 
-  deallocate(int2_grad1_u12_ao)
+  call gpu_deallocate(int2_grad1_u12_ao)
   deallocate(tc_int_2e_ao)
 
   call wall_time(time2)
diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f
index 063cc365..ccaa7239 100644
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@@ -1,3 +1,15 @@
+double precision function get_ao_integ_chol(i,j,k,l)
+  implicit none
+  BEGIN_DOC
+  ! Cholesky representation of the AO two-electron integral <ik|jl> or (ij|kl):
+  !   i(r1) j(r1) 1/r12 k(r2) l(r2)
+  END_DOC
+  integer, intent(in) :: i,j,k,l
+  double precision, external :: ddot
+  get_ao_integ_chol = ddot(cholesky_ao_num, cholesky_ao_transp(1,i,j), 1, cholesky_ao_transp(1,k,l), 1)
+
+end
+
 BEGIN_PROVIDER [ double precision, cholesky_ao_transp, (cholesky_ao_num, ao_num, ao_num) ]
  implicit none
  BEGIN_DOC
@@ -25,7 +37,10 @@ END_PROVIDER
   ! Last dimension of cholesky_ao is cholesky_ao_num
   !
   ! https://mogp-emulator.readthedocs.io/en/latest/methods/proc/ProcPivotedCholesky.html
+  !
   ! https://doi.org/10.1016/j.apnum.2011.10.001 : Page 4, Algorithm 1
+  !
+  ! 
https://www.diva-portal.org/smash/get/diva2:396223/FULLTEXT01.pdf END_DOC integer*8 :: ndim8 @@ -155,11 +170,15 @@ END_PROVIDER Lset(np8) = p8 endif enddo - np = np8 + if (np8 > ndim8) stop 'np>ndim8' + np = int(np8,4) if (np <= 0) stop 'np<=0' - if (np > ndim8) stop 'np>ndim8' - rank_max = min(np,20*elec_num*elec_num) + rank_max = np + ! Avoid too large arrays when there are many electrons + if (elec_num > 10) then + rank_max = min(np,20*elec_num*elec_num) + endif call mmap(trim(ezfio_work_dir)//'cholesky_ao_tmp', (/ ndim8, rank_max /), 8, fd(1), .False., .True., c_pointer(1)) call c_f_pointer(c_pointer(1), L, (/ ndim8, rank_max /)) @@ -428,7 +447,7 @@ END_PROVIDER Lset(np8) = p8 endif enddo - np = np8 + np = int(np8,4) enddo diff --git a/src/casscf_cipsi/EZFIO.cfg b/src/casscf_cipsi/EZFIO.cfg index 18e0b6b1..5b72d906 100644 --- a/src/casscf_cipsi/EZFIO.cfg +++ b/src/casscf_cipsi/EZFIO.cfg @@ -79,3 +79,9 @@ type: logical doc: If |true|, the pt2_max value in the CIPSI is set to 10-10 and will not change interface: ezfio,provider,ocaml default: False + +[act_mos_opt] +type: logical +doc: If |true|, the active orbitals are also optimized variationally +interface: ezfio,provider,ocaml +default: False diff --git a/src/casscf_cipsi/NEED b/src/casscf_cipsi/NEED index dd91c7bd..32f5ae90 100644 --- a/src/casscf_cipsi/NEED +++ b/src/casscf_cipsi/NEED @@ -3,3 +3,4 @@ selectors_full generators_cas two_body_rdm dav_general_mat +mo_optimization_utils diff --git a/src/casscf_cipsi/bielec.irp.f b/src/casscf_cipsi/bielec.irp.f index 0a44f994..a4901985 100644 --- a/src/casscf_cipsi/bielec.irp.f +++ b/src/casscf_cipsi/bielec.irp.f @@ -1,18 +1,25 @@ -BEGIN_PROVIDER [real*8, bielec_PQxx, (mo_num, mo_num,n_core_inact_act_orb,n_core_inact_act_orb)] +BEGIN_PROVIDER [real*8, bielec_PQxx_array, (mo_num, mo_num,n_core_inact_act_orb,n_core_inact_act_orb)] BEGIN_DOC - ! bielec_PQxx : integral (pq|xx) with p,q arbitrary, x core or active + ! WARNING !!! Old version !!! NOT USED ANYMORE IN THE PROGRAM !!! TOO BIG TO BE STORED ON LARGE SYSTEMS !!! + ! + ! Replaced by the Cholesky-based function bielec_PQxx + ! + ! bielec_PQxx_array : integral (pq|xx) with p,q arbitrary, x core or active ! indices are unshifted orbital numbers END_DOC implicit none integer :: i,j,ii,jj,p,q,i3,j3,t3,v3 real*8 :: mo_two_e_integral + print*,'' + print*,'Providing bielec_PQxx_array, WARNING IT CAN BE A VERY BIG ARRAY WHEN MO_NUM IS LARGE !!!' 
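+  ! Rough size estimate with illustrative numbers: mo_num = 1500 and n_core_inact_act_orb = 60
+  ! give 8 * 1500^2 * 60^2 bytes, i.e. about 65 GB.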
+ print*,'' - bielec_PQxx(:,:,:,:) = 0.d0 + bielec_PQxx_array(:,:,:,:) = 0.d0 PROVIDE mo_two_e_integrals_in_map !$OMP PARALLEL DEFAULT(NONE) & !$OMP PRIVATE(i,ii,j,jj,i3,j3) & - !$OMP SHARED(n_core_inact_orb,list_core_inact,mo_num,bielec_PQxx, & + !$OMP SHARED(n_core_inact_orb,list_core_inact,mo_num,bielec_PQxx_array, & !$OMP n_act_orb,mo_integrals_map,list_act) !$OMP DO @@ -20,14 +27,14 @@ BEGIN_PROVIDER [real*8, bielec_PQxx, (mo_num, mo_num,n_core_inact_act_orb,n_core ii=list_core_inact(i) do j=i,n_core_inact_orb jj=list_core_inact(j) - call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx(1,1,i,j),mo_integrals_map) - bielec_PQxx(:,:,j,i)=bielec_PQxx(:,:,i,j) + call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx_array(1,1,i,j),mo_integrals_map) + bielec_PQxx_array(:,:,j,i)=bielec_PQxx_array(:,:,i,j) end do do j=1,n_act_orb jj=list_act(j) j3=j+n_core_inact_orb - call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx(1,1,i,j3),mo_integrals_map) - bielec_PQxx(:,:,j3,i)=bielec_PQxx(:,:,i,j3) + call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx_array(1,1,i,j3),mo_integrals_map) + bielec_PQxx_array(:,:,j3,i)=bielec_PQxx_array(:,:,i,j3) end do end do !$OMP END DO @@ -40,8 +47,8 @@ BEGIN_PROVIDER [real*8, bielec_PQxx, (mo_num, mo_num,n_core_inact_act_orb,n_core do j=i,n_act_orb jj=list_act(j) j3=j+n_core_inact_orb - call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx(1,1,i3,j3),mo_integrals_map) - bielec_PQxx(:,:,j3,i3)=bielec_PQxx(:,:,i3,j3) + call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx_array(1,1,i3,j3),mo_integrals_map) + bielec_PQxx_array(:,:,j3,i3)=bielec_PQxx_array(:,:,i3,j3) end do end do !$OMP END DO @@ -52,9 +59,13 @@ END_PROVIDER -BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_act_orb, mo_num)] +BEGIN_PROVIDER [real*8, bielec_PxxQ_array, (mo_num,n_core_inact_act_orb,n_core_inact_act_orb, mo_num)] BEGIN_DOC - ! bielec_PxxQ : integral (px|xq) with p,q arbitrary, x core or active + ! WARNING !!! Old version !!! NOT USED ANYMORE IN THE PROGRAM !!! TOO BIG TO BE STORED ON LARGE SYSTEMS !!! + ! + ! Replaced by the Cholesky-based function bielec_PxxQ + ! + ! bielec_PxxQ_array : integral (px|xq) with p,q arbitrary, x core or active ! indices are unshifted orbital numbers END_DOC implicit none @@ -62,12 +73,15 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_a double precision, allocatable :: integrals_array(:,:) real*8 :: mo_two_e_integral + print*,'' + print*,'Providing bielec_PxxQ_array, WARNING IT CAN BE A VERY BIG ARRAY WHEN MO_NUM IS LARGE !!!' 
+ print*,'' PROVIDE mo_two_e_integrals_in_map - bielec_PxxQ = 0.d0 + bielec_PxxQ_array = 0.d0 !$OMP PARALLEL DEFAULT(NONE) & !$OMP PRIVATE(i,ii,j,jj,i3,j3,integrals_array) & - !$OMP SHARED(n_core_inact_orb,list_core_inact,mo_num,bielec_PxxQ, & + !$OMP SHARED(n_core_inact_orb,list_core_inact,mo_num,bielec_PxxQ_array, & !$OMP n_act_orb,mo_integrals_map,list_act) allocate(integrals_array(mo_num,mo_num)) @@ -80,8 +94,8 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_a call get_mo_two_e_integrals_ij(ii,jj,mo_num,integrals_array,mo_integrals_map) do q=1,mo_num do p=1,mo_num - bielec_PxxQ(p,i,j,q)=integrals_array(p,q) - bielec_PxxQ(p,j,i,q)=integrals_array(q,p) + bielec_PxxQ_array(p,i,j,q)=integrals_array(p,q) + bielec_PxxQ_array(p,j,i,q)=integrals_array(q,p) end do end do end do @@ -91,8 +105,8 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_a call get_mo_two_e_integrals_ij(ii,jj,mo_num,integrals_array,mo_integrals_map) do q=1,mo_num do p=1,mo_num - bielec_PxxQ(p,i,j3,q)=integrals_array(p,q) - bielec_PxxQ(p,j3,i,q)=integrals_array(q,p) + bielec_PxxQ_array(p,i,j3,q)=integrals_array(p,q) + bielec_PxxQ_array(p,j3,i,q)=integrals_array(q,p) end do end do end do @@ -111,8 +125,8 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_a call get_mo_two_e_integrals_ij(ii,jj,mo_num,integrals_array,mo_integrals_map) do q=1,mo_num do p=1,mo_num - bielec_PxxQ(p,i3,j3,q)=integrals_array(p,q) - bielec_PxxQ(p,j3,i3,q)=integrals_array(q,p) + bielec_PxxQ_array(p,i3,j3,q)=integrals_array(p,q) + bielec_PxxQ_array(p,j3,i3,q)=integrals_array(q,p) end do end do end do @@ -129,10 +143,15 @@ BEGIN_PROVIDER [real*8, bielecCI, (n_act_orb,n_act_orb,n_act_orb, mo_num)] BEGIN_DOC ! bielecCI : integrals (tu|vp) with p arbitrary, tuv active ! index p runs over the whole basis, t,u,v only over the active orbitals + ! + ! This array can be stored anyway. Ex: 50 active orbitals, 1500 MOs ==> 8x50^3x1500 = 1.5 Gb END_DOC implicit none integer :: i,j,k,p,t,u,v double precision, external :: mo_two_e_integral + double precision :: wall0, wall1 + call wall_time(wall0) + print*,'Providing bielecCI' PROVIDE mo_two_e_integrals_in_map !$OMP PARALLEL DO DEFAULT(NONE) & @@ -151,5 +170,7 @@ BEGIN_PROVIDER [real*8, bielecCI, (n_act_orb,n_act_orb,n_act_orb, mo_num)] end do end do !$OMP END PARALLEL DO + call wall_time(wall1) + print*,'Time to provide bielecCI = ',wall1 - wall0 END_PROVIDER diff --git a/src/casscf_cipsi/bielec_natorb.irp.f b/src/casscf_cipsi/bielec_natorb.irp.f index 9968530c..99734a0b 100644 --- a/src/casscf_cipsi/bielec_natorb.irp.f +++ b/src/casscf_cipsi/bielec_natorb.irp.f @@ -1,30 +1,38 @@ - BEGIN_PROVIDER [real*8, bielec_PQxx_no, (mo_num, mo_num,n_core_inact_act_orb,n_core_inact_act_orb)] + BEGIN_PROVIDER [real*8, bielec_PQxx_no_array, (mo_num, mo_num,n_core_inact_act_orb,n_core_inact_act_orb)] BEGIN_DOC + ! WARNING !!! Old version !!! NOT USED ANYMORE IN THE PROGRAM !!! TOO BIG TO BE STORED ON LARGE SYSTEMS !!! + ! + ! Replaced by the Cholesky-based function bielec_PQxx_no + ! ! integral (pq|xx) in the basis of natural MOs ! indices are unshifted orbital numbers + ! END_DOC implicit none integer :: i,j,k,l,t,u,p,q double precision, allocatable :: f(:,:,:), d(:,:,:) + print*,'' + print*,'Providing bielec_PQxx_no_array, WARNING IT CAN BE A VERY BIG ARRAY WHEN MO_NUM IS LARGE !!!' 
+ print*,'' !$OMP PARALLEL DEFAULT(NONE) & !$OMP PRIVATE(j,k,l,p,d,f) & !$OMP SHARED(n_core_inact_act_orb,mo_num,n_act_orb,n_core_inact_orb, & - !$OMP bielec_PQxx_no,bielec_PQxx,list_act,natorbsCI) + !$OMP bielec_PQxx_no_array,bielec_PQxx_array,list_act,natorbsCI) allocate (f(n_act_orb,mo_num,n_core_inact_act_orb), & d(n_act_orb,mo_num,n_core_inact_act_orb)) !$OMP DO do l=1,n_core_inact_act_orb - bielec_PQxx_no(:,:,:,l) = bielec_PQxx(:,:,:,l) + bielec_PQxx_no_array(:,:,:,l) = bielec_PQxx_array(:,:,:,l) do k=1,n_core_inact_act_orb do j=1,mo_num do p=1,n_act_orb - f(p,j,k)=bielec_PQxx_no(list_act(p),j,k,l) + f(p,j,k)=bielec_PQxx_no_array(list_act(p),j,k,l) end do end do end do @@ -36,13 +44,13 @@ do k=1,n_core_inact_act_orb do j=1,mo_num do p=1,n_act_orb - bielec_PQxx_no(list_act(p),j,k,l)=d(p,j,k) + bielec_PQxx_no_array(list_act(p),j,k,l)=d(p,j,k) end do end do do j=1,mo_num do p=1,n_act_orb - f(p,j,k)=bielec_PQxx_no(j,list_act(p),k,l) + f(p,j,k)=bielec_PQxx_no_array(j,list_act(p),k,l) end do end do end do @@ -54,7 +62,7 @@ do k=1,n_core_inact_act_orb do p=1,n_act_orb do j=1,mo_num - bielec_PQxx_no(j,list_act(p),k,l)=d(p,j,k) + bielec_PQxx_no_array(j,list_act(p),k,l)=d(p,j,k) end do end do end do @@ -71,7 +79,7 @@ do p=1,n_act_orb do k=1,mo_num do j=1,mo_num - f(j,k,p) = bielec_PQxx_no(j,k,n_core_inact_orb+p,l) + f(j,k,p) = bielec_PQxx_no_array(j,k,n_core_inact_orb+p,l) end do end do end do @@ -83,7 +91,7 @@ do p=1,n_act_orb do k=1,mo_num do j=1,mo_num - bielec_PQxx_no(j,k,n_core_inact_orb+p,l)=d(j,k,p) + bielec_PQxx_no_array(j,k,n_core_inact_orb+p,l)=d(j,k,p) end do end do end do @@ -97,7 +105,7 @@ do p=1,n_act_orb do k=1,mo_num do j=1,mo_num - f(j,k,p) = bielec_PQxx_no(j,k,l,n_core_inact_orb+p) + f(j,k,p) = bielec_PQxx_no_array(j,k,l,n_core_inact_orb+p) end do end do end do @@ -109,7 +117,7 @@ do p=1,n_act_orb do k=1,mo_num do j=1,mo_num - bielec_PQxx_no(j,k,l,n_core_inact_orb+p)=d(j,k,p) + bielec_PQxx_no_array(j,k,l,n_core_inact_orb+p)=d(j,k,p) end do end do end do @@ -123,8 +131,12 @@ END_PROVIDER -BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inact_act_orb, mo_num)] +BEGIN_PROVIDER [real*8, bielec_PxxQ_no_array, (mo_num,n_core_inact_act_orb,n_core_inact_act_orb, mo_num)] BEGIN_DOC + ! WARNING !!! Old version !!! NOT USED ANYMORE IN THE PROGRAM !!! TOO BIG TO BE STORED ON LARGE SYSTEMS !!! + ! + ! Replaced by the Cholesky-based function bielec_PxxQ_no + ! ! integral (px|xq) in the basis of natural MOs ! indices are unshifted orbital numbers END_DOC @@ -132,10 +144,14 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac integer :: i,j,k,l,t,u,p,q double precision, allocatable :: f(:,:,:), d(:,:,:) + print*,'' + print*,'Providing bielec_PxxQ_no_array, WARNING IT CAN BE A VERY BIG ARRAY WHEN MO_NUM IS LARGE !!!' 
+ print*,'' + !$OMP PARALLEL DEFAULT(NONE) & !$OMP PRIVATE(j,k,l,p,d,f) & !$OMP SHARED(n_core_inact_act_orb,mo_num,n_act_orb,n_core_inact_orb, & - !$OMP bielec_PxxQ_no,bielec_PxxQ,list_act,natorbsCI) + !$OMP bielec_PxxQ_no_array,bielec_PxxQ_array,list_act,natorbsCI) allocate (f(n_act_orb,n_core_inact_act_orb,n_core_inact_act_orb), & @@ -143,11 +159,11 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac !$OMP DO do j=1,mo_num - bielec_PxxQ_no(:,:,:,j) = bielec_PxxQ(:,:,:,j) + bielec_PxxQ_no_array(:,:,:,j) = bielec_PxxQ_array(:,:,:,j) do l=1,n_core_inact_act_orb do k=1,n_core_inact_act_orb do p=1,n_act_orb - f(p,k,l) = bielec_PxxQ_no(list_act(p),k,l,j) + f(p,k,l) = bielec_PxxQ_no_array(list_act(p),k,l,j) end do end do end do @@ -159,7 +175,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac do l=1,n_core_inact_act_orb do k=1,n_core_inact_act_orb do p=1,n_act_orb - bielec_PxxQ_no(list_act(p),k,l,j)=d(p,k,l) + bielec_PxxQ_no_array(list_act(p),k,l,j)=d(p,k,l) end do end do end do @@ -176,7 +192,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac do l=1,n_core_inact_act_orb do j=1,mo_num do p=1,n_act_orb - f(p,j,l) = bielec_PxxQ_no(j,n_core_inact_orb+p,l,k) + f(p,j,l) = bielec_PxxQ_no_array(j,n_core_inact_orb+p,l,k) end do end do end do @@ -188,7 +204,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac do l=1,n_core_inact_act_orb do j=1,mo_num do p=1,n_act_orb - bielec_PxxQ_no(j,n_core_inact_orb+p,l,k)=d(p,j,l) + bielec_PxxQ_no_array(j,n_core_inact_orb+p,l,k)=d(p,j,l) end do end do end do @@ -205,7 +221,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac do p=1,n_act_orb do l=1,n_core_inact_act_orb do j=1,mo_num - f(j,l,p) = bielec_PxxQ_no(j,l,n_core_inact_orb+p,k) + f(j,l,p) = bielec_PxxQ_no_array(j,l,n_core_inact_orb+p,k) end do end do end do @@ -217,7 +233,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac do p=1,n_act_orb do l=1,n_core_inact_act_orb do j=1,mo_num - bielec_PxxQ_no(j,l,n_core_inact_orb+p,k)=d(j,l,p) + bielec_PxxQ_no_array(j,l,n_core_inact_orb+p,k)=d(j,l,p) end do end do end do @@ -231,7 +247,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac do p=1,n_act_orb do k=1,n_core_inact_act_orb do j=1,mo_num - f(j,k,p) = bielec_PxxQ_no(j,k,l,n_core_inact_orb+p) + f(j,k,p) = bielec_PxxQ_no_array(j,k,l,n_core_inact_orb+p) end do end do end do @@ -243,7 +259,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac do p=1,n_act_orb do k=1,n_core_inact_act_orb do j=1,mo_num - bielec_PxxQ_no(j,k,l,n_core_inact_orb+p)=d(j,k,p) + bielec_PxxQ_no_array(j,k,l,n_core_inact_orb+p)=d(j,k,p) end do end do end do @@ -259,10 +275,16 @@ BEGIN_PROVIDER [real*8, bielecCI_no, (n_act_orb,n_act_orb,n_act_orb, mo_num)] BEGIN_DOC ! integrals (tu|vp) in the basis of natural MOs ! index p runs over the whole basis, t,u,v only over the active orbitals + ! + ! This array can be stored anyway. 
Ex: 50 active orbitals, 1500 MOs ==> 8x50^3x1500 = 1.5 Gb END_DOC implicit none integer :: i,j,k,l,t,u,p,q double precision, allocatable :: f(:,:,:), d(:,:,:) + + double precision :: wall0, wall1 + call wall_time(wall0) + print*,'Providing bielecCI_no' !$OMP PARALLEL DEFAULT(NONE) & !$OMP PRIVATE(j,k,l,p,d,f) & @@ -363,6 +385,8 @@ BEGIN_PROVIDER [real*8, bielecCI_no, (n_act_orb,n_act_orb,n_act_orb, mo_num)] deallocate(d,f) !$OMP END PARALLEL + call wall_time(wall1) + print*,'Time to provide bielecCI_no = ',wall1-wall0 END_PROVIDER diff --git a/src/casscf_cipsi/casscf.irp.f b/src/casscf_cipsi/casscf.irp.f index d0a26d36..b64a9d8f 100644 --- a/src/casscf_cipsi/casscf.irp.f +++ b/src/casscf_cipsi/casscf.irp.f @@ -11,7 +11,7 @@ program casscf if(small_active_space)then pt2_relative_error = 0.00001 else - thresh_scf = 1.d-4 + thresh_scf = max(1.d-4,thresh_scf) pt2_relative_error = 0.04 endif touch pt2_relative_error @@ -46,94 +46,101 @@ subroutine run do while (.not.converged) print*,'pt2_max = ',pt2_max call run_stochastic_cipsi(Ev,PT2) - print*,'Ev,PT2',Ev(1),PT2(1) - E_PT2(1:N_states) = Ev(1:N_states) + PT2(1:N_states) - energy_old = energy - energy = eone+etwo+ecore - pt2_max_before = pt2_max - - call write_time(6) - call write_int(6,iteration,'CAS-SCF iteration = ') - call write_double(6,energy,'State-average CAS-SCF energy = ') -! if(n_states == 1)then -! call ezfio_get_casscf_cipsi_energy_pt2(E_PT2) -! call ezfio_get_casscf_cipsi_energy(PT2) - double precision :: delta_E_istate, e_av - e_av = 0.d0 - do istate=1,N_states - e_av += state_average_weight(istate) * Ev(istate) - if(istate.gt.1)then - delta_E_istate = E_PT2(istate) - E_PT2(1) - write(*,'(A6,I2,A18,F16.10)')'state ',istate,' Delta E+PT2 = ',delta_E_istate - endif - write(*,'(A6,I2,A18,F16.10)')'state ',istate,' E + PT2 energy = ',E_PT2(istate) - write(*,'(A6,I2,A18,F16.10)')'state ',istate,' PT2 energy = ',PT2(istate) -! call write_double(6,E_PT2(istate),'E + PT2 energy = ') -! call write_double(6,PT2(istate),' PT2 = ') - enddo - call write_double(6,e_av,'State-average CAS-SCF energy bis = ') - call write_double(6,pt2_max,' PT2_MAX = ') +! if(act_mos_opt)then DOES NOT WORK +! call run_orb_opt_trust_v2 +! call run_stochastic_cipsi(Ev,PT2) ! 
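The 1.5 GB quoted in the BEGIN_DOC of bielecCI_no above is simply 8 bytes per double precision element times the array dimensions; a one-liner to reproduce the estimate (the 50/1500 values come from the comment, not from an actual run):

```fortran
program bielecci_no_size
  implicit none
  integer, parameter :: n_act_orb = 50, mo_num = 1500
  ! bielecCI_no is (n_act_orb, n_act_orb, n_act_orb, mo_num), double precision
  print '(a,f5.2,a)', 'bielecCI_no: ', &
      8.d0 * dble(n_act_orb)**3 * dble(mo_num) / 1.d9, ' GB'   ! -> 1.50 GB
end program bielecci_no_size
```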
endif - - print*,'' - call write_double(6,norm_grad_vec2,'Norm of gradients = ') - call write_double(6,norm_grad_vec2_tab(1), ' Core-active gradients = ') - call write_double(6,norm_grad_vec2_tab(2), ' Core-virtual gradients = ') - call write_double(6,norm_grad_vec2_tab(3), ' Active-virtual gradients = ') - print*,'' - call write_double(6,energy_improvement, 'Predicted energy improvement = ') - - if(criterion_casscf == "energy")then - converged = dabs(energy_improvement) < thresh_scf - else if (criterion_casscf == "gradients")then - converged = norm_grad_vec2 < thresh_scf - else if (criterion_casscf == "e_pt2")then - delta_E = 0.d0 - do istate = 1, N_states - delta_E += dabs(E_PT2(istate) - ept2_before(istate)) - enddo - converged = dabs(delta_E) < thresh_casscf - endif - ept2_before = E_PT2 - if(.not.small_active_space)then - if(adaptive_pt2_max)then - pt2_max = dabs(energy_improvement / (pt2_relative_error)) - pt2_max = min(pt2_max, pt2_max_before) - if(n_act_orb.ge.n_big_act_orb)then - pt2_max = max(pt2_max,pt2_min_casscf) - endif + if(.True.)then + print*,'Ev,PT2',Ev(1),PT2(1) + E_PT2(1:N_states) = Ev(1:N_states) + PT2(1:N_states) + energy_old = energy + energy = eone+etwo+ecore + pt2_max_before = pt2_max + + call write_time(6) + call write_int(6,iteration,'CAS-SCF iteration = ') + call write_double(6,energy,'State-average CAS-SCF energy = ') +!! if(n_states == 1)then +!! call ezfio_get_casscf_cipsi_energy_pt2(E_PT2) +!! call ezfio_get_casscf_cipsi_energy(PT2) + double precision :: delta_E_istate, e_av + e_av = 0.d0 + do istate=1,N_states + e_av += state_average_weight(istate) * Ev(istate) + if(istate.gt.1)then + delta_E_istate = E_PT2(istate) - E_PT2(1) + write(*,'(A6,I2,A18,F16.10)')'state ',istate,' Delta E+PT2 = ',delta_E_istate + endif + write(*,'(A6,I2,A18,F16.10)')'state ',istate,' E + PT2 energy = ',E_PT2(istate) + write(*,'(A6,I2,A18,F16.10)')'state ',istate,' PT2 energy = ',PT2(istate) +!! call write_double(6,E_PT2(istate),'E + PT2 energy = ') +!! call write_double(6,PT2(istate),' PT2 = ') + enddo + call write_double(6,e_av,'State-average CAS-SCF energy bis = ') + call write_double(6,pt2_max,' PT2_MAX = ') +!! endif + + print*,'' + call write_double(6,norm_grad_vec2,'Norm of gradients = ') + call write_double(6,norm_grad_vec2_tab(1), ' Core-active gradients = ') + call write_double(6,norm_grad_vec2_tab(2), ' Core-virtual gradients = ') + call write_double(6,norm_grad_vec2_tab(3), ' Active-virtual gradients = ') + print*,'' + call write_double(6,energy_improvement, 'Predicted energy improvement = ') + + if(criterion_casscf == "energy")then + converged = dabs(energy_improvement) < thresh_scf + else if (criterion_casscf == "gradients")then + converged = norm_grad_vec2 < thresh_scf + else if (criterion_casscf == "e_pt2")then + delta_E = 0.d0 + do istate = 1, N_states + delta_E += dabs(E_PT2(istate) - ept2_before(istate)) + enddo + converged = dabs(delta_E) < thresh_casscf endif - endif - print*,'' - call write_double(6,pt2_max, 'PT2_MAX for next iteration = ') - - mo_coef = NewOrbs - mo_occ = occnum - if(.not.converged)then - call save_mos - iteration += 1 - if(norm_grad_vec2.gt.0.01d0)then - N_det = N_states - else - N_det = max(N_det/8 ,N_states) - endif - psi_det = psi_det_sorted - psi_coef = psi_coef_sorted - read_wf = .True. 
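The adaptive pt2_max rule that this hunk re-indents can be read as a small pure function: scale the threshold with the predicted energy improvement, never let it grow from one iteration to the next, and keep a floor for large active spaces. An illustrative restatement (the function itself is not part of QP; the names mirror the variables in the loop):

```fortran
double precision function next_pt2_max(e_improv, rel_err, pt2_max_prev, &
                                       n_act, n_big_act, pt2_min)
  implicit none
  double precision, intent(in) :: e_improv, rel_err, pt2_max_prev, pt2_min
  integer, intent(in)          :: n_act, n_big_act
  next_pt2_max = dabs(e_improv / rel_err)           ! scale with the predicted gain
  next_pt2_max = min(next_pt2_max, pt2_max_prev)    ! never loosen the selection
  if (n_act >= n_big_act) then
    next_pt2_max = max(next_pt2_max, pt2_min)       ! floor for big active spaces
  endif
end function next_pt2_max
```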
- call clear_mo_map - SOFT_TOUCH mo_coef N_det psi_det psi_coef + ept2_before = E_PT2 if(.not.small_active_space)then if(adaptive_pt2_max)then - SOFT_TOUCH pt2_max + pt2_max = dabs(energy_improvement / (pt2_relative_error)) + pt2_max = min(pt2_max, pt2_max_before) + if(n_act_orb.ge.n_big_act_orb)then + pt2_max = max(pt2_max,pt2_min_casscf) + endif endif endif - if(iteration .gt. 3)then - state_following_casscf = state_following_casscf_cipsi_save - soft_touch state_following_casscf + print*,'' + call write_double(6,pt2_max, 'PT2_MAX for next iteration = ') + + mo_coef = NewOrbs + mo_occ = occnum + if(.not.converged)then + call save_mos + iteration += 1 + if(norm_grad_vec2.gt.0.01d0)then + N_det = N_states + else + N_det = max(N_det/8 ,N_states) + endif + psi_det = psi_det_sorted + psi_coef = psi_coef_sorted + read_wf = .True. + call clear_mo_map + SOFT_TOUCH mo_coef N_det psi_det psi_coef + if(.not.small_active_space)then + if(adaptive_pt2_max)then + SOFT_TOUCH pt2_max + endif + endif + if(iteration .gt. 3)then + state_following_casscf = state_following_casscf_cipsi_save + soft_touch state_following_casscf + endif endif endif - + enddo + if(.True.)then integer :: i print*,'Converged CASSCF ' print*,'--------------------------' @@ -153,6 +160,7 @@ subroutine run ! write(*,*)mcscf_fock_alpha_mo(i,i) enddo + endif end diff --git a/src/casscf_cipsi/chol_bielec.irp.f b/src/casscf_cipsi/chol_bielec.irp.f new file mode 100644 index 00000000..f69832c1 --- /dev/null +++ b/src/casscf_cipsi/chol_bielec.irp.f @@ -0,0 +1,248 @@ + +BEGIN_PROVIDER [double precision, cholesky_no_1_idx_transp, (cholesky_mo_num, n_act_orb, mo_num)] + BEGIN_DOC + ! Cholesky vectors with ONE orbital on the active natural orbital basis + END_DOC + implicit none + integer :: i_chol,i_act,i_mo,jj_act + double precision, allocatable :: chol_tmp(:,:) + double precision :: wall0,wall1 + call wall_time(wall0) + print*,'Providing cholesky_no_1_idx_transp' + allocate(chol_tmp(cholesky_mo_num,n_act_orb)) + cholesky_no_1_idx_transp = 0.D0 + do i_mo = 1, mo_num + ! Get all the integrals corresponding to the "i_mo" + do i_act = 1, n_act_orb + jj_act = list_act(i_act) + do i_chol = 1, cholesky_mo_num + chol_tmp(i_chol, i_act) = cholesky_mo_transp(i_chol, jj_act, i_mo) + enddo + enddo + call dgemm('N','N',cholesky_mo_num,n_act_orb,n_act_orb,1.d0, & + chol_tmp, size(chol_tmp,1), & + natorbsCI, size(natorbsCI,1), & + 0.d0, & + cholesky_no_1_idx_transp(1,1,i_mo), size(cholesky_no_1_idx_transp,1)) + enddo + call wall_time(wall1) + print*,'Time to provide cholesky_no_1_idx_transp = ', wall1 - wall0 + +END_PROVIDER + +BEGIN_PROVIDER [double precision, cholesky_no_2_idx_transp, (cholesky_mo_num, n_act_orb, n_act_orb)] + BEGIN_DOC + ! Cholesky vectors with TWO orbital on the active natural orbital basis + END_DOC + implicit none + integer :: i_chol,i_act,j_act,jj_act + double precision, allocatable :: chol_tmp(:,:),chol_tmp_bis(:,:) + allocate(chol_tmp(cholesky_mo_num,n_act_orb),chol_tmp_bis(cholesky_mo_num,n_act_orb)) + double precision :: wall0,wall1 + call wall_time(wall0) + print*,'Providing cholesky_no_2_idx_transp' + cholesky_no_2_idx_transp = 0.D0 + do i_act = 1, n_act_orb + ! 
Gather the one-index-transformed Cholesky vectors for all active "j_act"
+  do j_act = 1, n_act_orb
+   jj_act = list_act(j_act)
+   do i_chol = 1, cholesky_mo_num
+    chol_tmp(i_chol, j_act) = cholesky_no_1_idx_transp(i_chol, i_act, jj_act)
+   enddo
+  enddo
+  call dgemm('N','N',cholesky_mo_num,n_act_orb,n_act_orb,1.d0, &
+             chol_tmp, size(chol_tmp,1), &
+             natorbsCI, size(natorbsCI,1), &
+             0.d0, &
+             cholesky_no_2_idx_transp(1,1,i_act), size(cholesky_no_2_idx_transp,1))
+ enddo
+ call wall_time(wall1)
+ print*,'Time to provide cholesky_no_2_idx_transp = ', wall1 - wall0
+
+END_PROVIDER
+
+BEGIN_PROVIDER [ double precision, cholesky_no_total_transp, (cholesky_mo_num, mo_num, mo_num)]
+ implicit none
+ BEGIN_DOC
+ ! Cholesky vectors for the full MO range, with the active block expressed in the natural-orbital (NO) basis
+ END_DOC
+ integer :: i_chol, i_act, ii_act, j_act, jj_act, i_core_inact, j_core_inact, ii_core_inact, jj_core_inact
+ integer :: i_virt, ii_virt, j_virt, jj_virt
+ double precision :: wall0,wall1
+ call wall_time(wall0)
+ print*,'Providing cholesky_no_total_transp '
+ ! Block when two orbitals belong to the core/inact
+ do j_core_inact = 1, n_core_inact_orb
+  jj_core_inact = list_core_inact(j_core_inact)
+  do i_core_inact = 1, n_core_inact_orb
+   ii_core_inact = list_core_inact(i_core_inact)
+   do i_chol = 1, cholesky_mo_num
+    cholesky_no_total_transp(i_chol, ii_core_inact, jj_core_inact) = cholesky_mo_transp(i_chol,ii_core_inact,jj_core_inact)
+   enddo
+  enddo
+ enddo
+
+ ! Block when one orbital belongs to the core/inact and the other to the active
+ do j_core_inact = 1, n_core_inact_orb
+  jj_core_inact = list_core_inact(j_core_inact)
+  do i_act = 1, n_act_orb
+   ii_act = list_act(i_act)
+   do i_chol = 1, cholesky_mo_num
+    cholesky_no_total_transp(i_chol,ii_act,jj_core_inact) = cholesky_no_1_idx_transp(i_chol,i_act,jj_core_inact)
+   enddo
+  enddo
+ enddo
+ do j_core_inact = 1, n_core_inact_orb
+  jj_core_inact = list_core_inact(j_core_inact)
+  do i_act = 1, n_act_orb
+   ii_act = list_act(i_act)
+   do i_chol = 1, cholesky_mo_num
+    cholesky_no_total_transp(i_chol,jj_core_inact,ii_act) = cholesky_no_1_idx_transp(i_chol,i_act,jj_core_inact)
+   enddo
+  enddo
+ enddo
+
+ ! Block when two orbitals belong to the active
+ do j_act = 1, n_act_orb
+  jj_act = list_act(j_act)
+  do i_act = 1, n_act_orb
+   ii_act = list_act(i_act)
+   do i_chol = 1, cholesky_mo_num
+    cholesky_no_total_transp(i_chol,ii_act,jj_act) = cholesky_no_2_idx_transp(i_chol,i_act,j_act)
+   enddo
+  enddo
+ enddo
+
+ ! Block when two orbitals belong to the virtuals
+ do i_virt = 1, n_virt_orb
+  ii_virt = list_virt(i_virt)
+  do j_virt = 1, n_virt_orb
+   jj_virt = list_virt(j_virt)
+   do i_chol = 1, cholesky_mo_num
+    cholesky_no_total_transp(i_chol,jj_virt,ii_virt) = cholesky_mo_transp(i_chol,jj_virt,ii_virt)
+   enddo
+  enddo
+ enddo
+
+ ! Block when one orbital is in the active and the other in the virtuals
+ do i_virt = 1, n_virt_orb
+  ii_virt = list_virt(i_virt)
+  do i_act = 1, n_act_orb
+   ii_act = list_act(i_act)
+   do i_chol = 1, cholesky_mo_num
+    cholesky_no_total_transp(i_chol,ii_act,ii_virt) = cholesky_no_1_idx_transp(i_chol, i_act,ii_virt)
+   enddo
+  enddo
+ enddo
+ do i_virt = 1, n_virt_orb
+  ii_virt = list_virt(i_virt)
+  do i_act = 1, n_act_orb
+   ii_act = list_act(i_act)
+   do i_chol = 1, cholesky_mo_num
+    cholesky_no_total_transp(i_chol,ii_virt,ii_act) = cholesky_no_1_idx_transp(i_chol, i_act,ii_virt)
+   enddo
+  enddo
+ enddo
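The block structure above follows from one fact: the natural-orbital rotation only mixes active orbitals, so a Cholesky vector with zero active indices is copied from cholesky_mo_transp unchanged, one active index reads from cholesky_no_1_idx_transp, and two active indices read from cholesky_no_2_idx_transp. A tiny illustrative helper (not part of the provider) that counts how many indices need the NO transform:

```fortran
! 0 -> copy cholesky_mo_transp, 1 -> cholesky_no_1_idx_transp,
! 2 -> cholesky_no_2_idx_transp
pure integer function no_transform_level(p_is_active, q_is_active)
  implicit none
  logical, intent(in) :: p_is_active, q_is_active
  no_transform_level = merge(1,0,p_is_active) + merge(1,0,q_is_active)
end function no_transform_level
```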
+ ! Block when one orbital is in the virtual and the other in the core-inact
+ do i_virt = 1, n_virt_orb
+  ii_virt = list_virt(i_virt)
+  do i_core_inact = 1, n_core_inact_orb
+   ii_core_inact = list_core_inact(i_core_inact)
+   do i_chol = 1, cholesky_mo_num
+    cholesky_no_total_transp(i_chol, ii_core_inact, ii_virt) = cholesky_mo_transp(i_chol, ii_core_inact, ii_virt)
+   enddo
+  enddo
+ enddo
+ do i_core_inact = 1, n_core_inact_orb
+  ii_core_inact = list_core_inact(i_core_inact)
+  do i_virt = 1, n_virt_orb
+   ii_virt = list_virt(i_virt)
+   do i_chol = 1, cholesky_mo_num
+    cholesky_no_total_transp(i_chol, ii_virt, ii_core_inact) = cholesky_mo_transp(i_chol, ii_virt, ii_core_inact)
+   enddo
+  enddo
+ enddo
+
+ call wall_time(wall1)
+ print*,'Time to provide cholesky_no_total_transp = ', wall1 - wall0
+END_PROVIDER
+
+
+double precision function bielec_no_basis(i_1,j_1,i_2,j_2)
+ implicit none
+ integer, intent(in) :: i_1,j_1,i_2,j_2
+ BEGIN_DOC
+ ! integral (i_1 j_1|i_2 j_2) in the mixed basis of both MOs and natural MOs,
+ ! computed as sum_k cholesky_no_total_transp(k,i_1,j_1) * cholesky_no_total_transp(k,i_2,j_2)
+ END_DOC
+ integer :: i
+ bielec_no_basis = 0.d0
+ do i = 1, cholesky_mo_num
+  bielec_no_basis += cholesky_no_total_transp(i,i_1, j_1) * cholesky_no_total_transp(i,i_2,j_2)
+ enddo
+end
+
+double precision function bielec_PQxx_no(i_mo, j_mo, i_ca, j_ca)
+ implicit none
+ BEGIN_DOC
+ ! function that computes (i_mo j_mo|i_ca j_ca) from the Cholesky decomposition, with active orbitals in the NO basis
+ !
+ ! where i_ca, j_ca are in [1:n_core_inact_act_orb]
+ END_DOC
+ integer, intent(in) :: i_ca, j_ca, i_mo, j_mo
+ integer :: ii_ca, jj_ca
+ double precision :: bielec_no_basis
+ ii_ca = list_core_inact_act(i_ca)
+ jj_ca = list_core_inact_act(j_ca)
+ bielec_PQxx_no = bielec_no_basis(i_mo,j_mo,ii_ca,jj_ca)
+end
+
+double precision function bielec_PxxQ_no(i_mo, j_ca, i_ca, j_mo)
+ implicit none
+ BEGIN_DOC
+ ! function that computes (i_mo j_ca|i_ca j_mo) from the Cholesky decomposition, with active orbitals in the NO basis
+ !
+ ! where i_ca, j_ca are in [1:n_core_inact_act_orb]
+ END_DOC
+ integer, intent(in) :: i_ca, j_ca, i_mo, j_mo
+ integer :: ii_ca, jj_ca
+ double precision :: bielec_no_basis
+ ii_ca = list_core_inact_act(i_ca)
+ jj_ca = list_core_inact_act(j_ca)
+ bielec_PxxQ_no = bielec_no_basis(i_mo, jj_ca, ii_ca, j_mo)
+
+end
+
+
+double precision function bielec_PQxx(i_mo, j_mo, i_ca, j_ca)
+ BEGIN_DOC
+ ! function that computes (i_mo j_mo|i_ca j_ca) from the conventional MO two-electron integrals
+ !
+ ! i_mo, j_mo are unshifted orbital numbers; i_ca, j_ca are in [1:n_core_inact_act_orb]
+ END_DOC
+ implicit none
+ integer, intent(in) :: i_ca, j_ca, j_mo, i_mo
+ double precision :: mo_two_e_integral
+ integer :: ii_ca, jj_ca
+ ii_ca = list_core_inact_act(i_ca)
+ jj_ca = list_core_inact_act(j_ca)
+ bielec_PQxx = mo_two_e_integral(i_mo,ii_ca,j_mo,jj_ca)
+end
+
+double precision function bielec_PxxQ(i_mo, i_ca, j_ca, j_mo)
+ BEGIN_DOC
+ ! function that computes (i_mo i_ca|j_ca j_mo) from the conventional MO two-electron integrals
+ !
+ ! where i_ca, j_ca are in [1:n_core_inact_act_orb]
+ END_DOC
+ implicit none
+ integer, intent(in) :: i_ca, j_ca, j_mo, i_mo
+ double precision :: mo_two_e_integral
+ integer :: ii_ca, jj_ca
+ ii_ca = list_core_inact_act(i_ca)
+ jj_ca = list_core_inact_act(j_ca)
+ bielec_PxxQ = mo_two_e_integral(i_mo,jj_ca,ii_ca,j_mo)
+end
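bielec_no_basis above makes explicit that, once the Cholesky vectors are transformed, any two-electron integral is a dot product over the auxiliary index: (i1 j1|i2 j2) = sum_K L(K,i1,j1) * L(K,i2,j2). The same contraction written with BLAS ddot, on a random stand-in for cholesky_no_total_transp:

```fortran
program chol_dot_product
  implicit none
  integer, parameter :: n_chol = 100, n_mo = 4
  double precision :: L(n_chol,n_mo,n_mo)
  double precision, external :: ddot
  call random_number(L)
  ! (1 2 | 3 4) = sum_K L(K,1,2) * L(K,3,4)
  print *, '(1 2|3 4) =', ddot(n_chol, L(1,1,2), 1, L(1,3,4), 1)
end program chol_dot_product
```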
diff --git a/src/casscf_cipsi/chol_garb.irp.f b/src/casscf_cipsi/chol_garb.irp.f
new file mode 100644
index 00000000..c4a8fa59
--- /dev/null
+++ b/src/casscf_cipsi/chol_garb.irp.f
@@ -0,0 +1,34 @@
+
+!!!!! FUNCTIONS THAT WORK, BUT ARE NOT NEEDED: THESE QUANTITIES ARE SMALL ENOUGH TO ALWAYS BE STORED AS ARRAYS
+!double precision function bielecCI_chol(i_a, j_a, k_a, i_mo)
+! BEGIN_DOC
+! ! function that computes (i_a j_a|k_a i_mo) from the conventional MO two-electron integrals
+! !
+! ! where i_a, j_a, k_a are in [1:n_act_orb] !!! ONLY ON ACTIVE
+! END_DOC
+! implicit none
+! integer, intent(in) :: i_a, j_a, k_a, i_mo
+! integer :: ii_a, jj_a, kk_a
+! double precision :: mo_two_e_integral
+! ii_a = list_act(i_a)
+! jj_a = list_act(j_a)
+! kk_a = list_act(k_a)
+! bielecCI_chol = mo_two_e_integral(ii_a,kk_a,jj_a,i_mo)
+!end
+
+!double precision function bielecCI_no_chol(i_ca, j_ca, k_ca, i_mo)
+! BEGIN_DOC
+! ! function that computes (i_ca j_ca|k_ca i_mo) with the Cholesky decomposition, on the NO basis for active orbitals
+! !
+! ! where i_ca, j_ca, k_ca are in [1:n_act_orb] (mapped through list_act)
+! END_DOC
+! implicit none
+! integer, intent(in) :: i_ca, j_ca, k_ca, i_mo
+! integer :: ii_ca, jj_ca, kk_ca
+! double precision :: bielec_no_basis_chol
+! ii_ca = list_act(i_ca)
+! jj_ca = list_act(j_ca)
+! kk_ca = list_act(k_ca)
+! bielecCI_no_chol = bielec_no_basis_chol(ii_ca, jj_ca, kk_ca, i_mo)
+!
+!end
diff --git a/src/casscf_cipsi/gradient.irp.f b/src/casscf_cipsi/gradient.irp.f
index a1c5e947..961d260d 100644
--- a/src/casscf_cipsi/gradient.irp.f
+++ b/src/casscf_cipsi/gradient.irp.f
@@ -157,6 +157,7 @@ real*8 function gradvec_it(i,t)
   integer :: ii,tt,v,vv,x,y
   integer :: x3,y3
+  double precision :: bielec_PQxx_no
 
   ii=list_core_inact(i)
   tt=list_act(t)
diff --git a/src/casscf_cipsi/hessian.irp.f b/src/casscf_cipsi/hessian.irp.f
index 458c6aa6..9a7a9031 100644
--- a/src/casscf_cipsi/hessian.irp.f
+++ b/src/casscf_cipsi/hessian.irp.f
@@ -10,6 +10,7 @@ real*8 function hessmat_itju(i,t,j,u)
   implicit none
   integer :: i,t,j,u,ii,tt,uu,v,vv,x,xx,y,jj
   real*8 :: term,t2
+  double precision :: bielec_pqxx_no,bielec_pxxq_no
 
   ii=list_core_inact(i)
   tt=list_act(t)
@@ -95,6 +96,7 @@ real*8 function hessmat_itja(i,t,j,a)
   implicit none
   integer :: i,t,j,a,ii,tt,jj,aa,v,vv,x,y
   real*8 :: term
+  double precision :: bielec_pqxx_no,bielec_pxxq_no
 
 !
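The one-line hunks in gradient.irp.f and hessian.irp.f above are all the same fix: once bielec_PQxx_no and friends became functions instead of provided arrays, every caller compiled with IMPLICIT NONE has to declare the result type of the external function. A two-unit illustration with generic names:

```fortran
double precision function twice(x)
  implicit none
  double precision, intent(in) :: x
  twice = 2.d0*x
end function twice

program caller
  implicit none
  double precision, external :: twice  ! required under implicit none
  print *, twice(1.5d0)                ! prints 3.0
end program caller
```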
it/ja ii=list_core_inact(i) @@ -128,6 +130,7 @@ real*8 function hessmat_itua(i,t,u,a) implicit none integer :: i,t,u,a,ii,tt,uu,aa,v,vv,x,xx,u3,t3,v3 real*8 :: term + double precision :: bielec_pqxx_no,bielec_pxxq_no ii=list_core_inact(i) tt=list_act(t) @@ -169,6 +172,7 @@ real*8 function hessmat_iajb(i,a,j,b) implicit none integer :: i,a,j,b,ii,aa,jj,bb real*8 :: term + double precision :: bielec_pqxx_no,bielec_pxxq_no ii=list_core_inact(i) aa=list_virt(a) @@ -205,6 +209,7 @@ real*8 function hessmat_iatb(i,a,t,b) implicit none integer :: i,a,t,b,ii,aa,tt,bb,v,vv,x,y,v3,t3 real*8 :: term + double precision :: bielec_pqxx_no,bielec_pxxq_no ii=list_core_inact(i) aa=list_virt(a) @@ -237,6 +242,7 @@ real*8 function hessmat_taub(t,a,u,b) integer :: t,a,u,b,tt,aa,uu,bb,v,vv,x,xx,y integer :: v3,x3 real*8 :: term,t1,t2,t3 + double precision :: bielec_pqxx_no,bielec_pxxq_no tt=list_act(t) aa=list_virt(a) diff --git a/src/casscf_cipsi/mcscf_fock.irp.f b/src/casscf_cipsi/mcscf_fock.irp.f index 0f4b7a99..82b710a7 100644 --- a/src/casscf_cipsi/mcscf_fock.irp.f +++ b/src/casscf_cipsi/mcscf_fock.irp.f @@ -4,6 +4,7 @@ BEGIN_PROVIDER [real*8, Fipq, (mo_num,mo_num) ] END_DOC implicit none integer :: p,q,k,kk,t,tt,u,uu + double precision :: bielec_pxxq_no, bielec_pqxx_no do q=1,mo_num do p=1,mo_num @@ -44,6 +45,7 @@ BEGIN_PROVIDER [real*8, Fapq, (mo_num,mo_num) ] END_DOC implicit none integer :: p,q,k,kk,t,tt,u,uu + double precision :: bielec_pxxq_no, bielec_pqxx_no Fapq = 0.d0 diff --git a/src/casscf_cipsi/test_chol.irp.f b/src/casscf_cipsi/test_chol.irp.f new file mode 100644 index 00000000..bcce7cf7 --- /dev/null +++ b/src/casscf_cipsi/test_chol.irp.f @@ -0,0 +1,116 @@ +program test_chol + implicit none + read_wf= .True. + touch read_wf +! call routine_bielec_PxxQ_no +! call routine_bielecCI_no +! call test_bielec_PxxQ_chol +! call test_bielecCI + +end + +subroutine routine_bielec_PQxx_no + implicit none + integer :: i_chol, i_act, ii_act, j_act, jj_act, i_core_inact, j_core_inact, ii_core_inact, jj_core_inact + integer :: i_virt, ii_virt, j_virt, jj_virt, i_mo, j_mo + double precision :: exact, new, error, accu, bielec_no_basis_chol + double precision :: bielec_PQxx_no + + accu = 0.d0 + do i_core_inact = 1, n_core_inact_act_orb + ii_core_inact = list_core_inact_act(i_core_inact) + do j_core_inact = 1, n_core_inact_act_orb + jj_core_inact = list_core_inact_act(j_core_inact) + do i_mo = 1, mo_num + do j_mo = 1, mo_num + exact = bielec_PQxx_no_array(j_mo,i_mo, j_core_inact, i_core_inact) + new = bielec_PQxx_no(j_mo,i_mo, j_core_inact, i_core_inact) + error = dabs(exact-new) + if(dabs(exact).gt.1.d-10)then + print*,exact,new,error + endif + accu += error + enddo + enddo + enddo + enddo + print*,'accu = ',accu/(dble(mo_num*mo_num*n_core_inact_act_orb**2)) +end + +subroutine routine_bielec_PxxQ_no_array + implicit none + integer :: i_chol, i_act, ii_act, j_act, jj_act, i_core_inact, j_core_inact, ii_core_inact, jj_core_inact + integer :: i_virt, ii_virt, j_virt, jj_virt, i_mo, j_mo + double precision :: exact, new, error, accu, bielec_no_basis_chol + double precision :: bielec_PxxQ_no + + accu = 0.d0 + do i_mo = 1, mo_num + do i_core_inact = 1, n_core_inact_act_orb + ii_core_inact = list_core_inact_act(i_core_inact) + do j_core_inact = 1, n_core_inact_act_orb + jj_core_inact = list_core_inact_act(j_core_inact) + do j_mo = 1, mo_num + exact = bielec_PxxQ_no_array(j_mo, j_core_inact, i_core_inact,i_mo) +! 
new = bielec_no_basis_chol(j_mo,i_mo, jj_core_inact, ii_core_inact) + new = bielec_PxxQ_no(j_mo, j_core_inact, i_core_inact,i_mo) + error = dabs(exact-new) + accu += error + if(dabs(exact).gt.1.d-10)then + print*,exact,new,error + endif + enddo + enddo + enddo + enddo + print*,'accu = ',accu/(dble(mo_num*mo_num*n_core_inact_act_orb**2)) +end + +subroutine test_bielec_PQxx(i_mo, j_mo, i_ca, j_ca) + implicit none + integer :: i_mo, j_mo, i_ca, j_ca + double precision :: exact, new, error, accu + double precision :: bielec_PQxx + + accu = 0.d0 + do j_ca = 1, n_core_inact_act_orb + do i_ca = 1, n_core_inact_act_orb + do j_mo = 1, mo_num + do i_mo = 1, mo_num + exact = bielec_PQxx_array(i_mo, j_mo, i_ca, j_ca) + new = bielec_PQxx(i_mo, j_mo, i_ca, j_ca) + error = dabs(exact-new) + accu += error + if(dabs(exact).gt.1.d-10)then + print*,exact,new,error + endif + enddo + enddo + enddo + enddo + print*,'accu = ',accu/(dble(mo_num*mo_num*n_core_inact_act_orb**2)) +end + +subroutine test_bielec_PxxQ_chol(i_mo, i_ca, j_ca, j_mo) + implicit none + integer :: i_mo, i_ca, j_ca, j_mo + double precision :: exact, new, error, accu + double precision :: bielec_PxxQ + accu = 0.d0 + do j_mo = 1, mo_num + do j_ca = 1, n_core_inact_act_orb + do i_ca =1, n_core_inact_act_orb + do i_mo = 1, mo_num + exact = bielec_PxxQ_array(i_mo, i_ca, j_ca, j_mo) + new = bielec_PxxQ(i_mo, i_ca, j_ca, j_mo) + error = dabs(exact-new) + accu += error + if(dabs(exact).gt.1.d-10)then + print*,exact,new,error + endif + enddo + enddo + enddo + enddo + print*,'accu = ',accu/(dble(mo_num*mo_num*n_core_inact_act_orb**2)) +end diff --git a/src/casscf_cipsi/tot_en.irp.f b/src/casscf_cipsi/tot_en.irp.f index 1d70e087..37ceac05 100644 --- a/src/casscf_cipsi/tot_en.irp.f +++ b/src/casscf_cipsi/tot_en.irp.f @@ -8,6 +8,7 @@ implicit none integer :: t,u,v,x,i,ii,tt,uu,vv,xx,j,jj,t3,u3,v3,x3 real*8 :: e_one_all,e_two_all + double precision :: bielec_PQxx,bielec_PxxQ e_one_all=0.D0 e_two_all=0.D0 do i=1,n_core_inact_orb diff --git a/src/ccsd/NEED b/src/ccsd/NEED index e6e6bc59..8298f28e 100644 --- a/src/ccsd/NEED +++ b/src/ccsd/NEED @@ -1,2 +1,3 @@ +gpu hartree_fock utils_cc diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 9d4ae7f9..d8131a9c 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -1,4 +1,5 @@ subroutine run_ccsd_space_orb + use gpu implicit none @@ -9,9 +10,19 @@ subroutine run_ccsd_space_orb double precision :: uncorr_energy,energy, max_elem, max_r, max_r1, max_r2,ta,tb logical :: not_converged - double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:), tau(:,:,:,:), tau_x(:,:,:,:) - double precision, allocatable :: t1(:,:), r1(:,:) - double precision, allocatable :: H_oo(:,:), H_vv(:,:), H_vo(:,:) + type(gpu_double4) :: t2, r2, tau, tau_x + type(gpu_double2) :: t1, r1 + type(gpu_double2) :: H_oo, H_vv, H_vo + + type(gpu_double2) :: d_cc_space_f_oo, d_cc_space_f_vo + type(gpu_double2) :: d_cc_space_f_ov, d_cc_space_f_vv + + type(gpu_double3) :: d_cc_space_v_oo_chol, d_cc_space_v_vo_chol + type(gpu_double3) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol + + type(gpu_double4) :: d_cc_space_v_oovv, d_cc_space_v_voov, d_cc_space_v_ovov + type(gpu_double4) :: d_cc_space_v_oovo, d_cc_space_v_vooo, d_cc_space_v_oooo + type(gpu_double4) :: d_cc_space_v_vvoo, d_cc_space_v_ovvo, d_cc_space_v_ovoo double precision, allocatable :: all_err(:,:), all_t(:,:) integer, allocatable :: list_occ(:), list_vir(:) @@ -20,7 +31,7 @@ subroutine run_ccsd_space_orb call 
set_multiple_levels_omp(.False.) - if (do_ao_cholesky) then + if (do_mo_cholesky) then PROVIDE cholesky_mo_transp FREE cholesky_ao else @@ -51,11 +62,77 @@ subroutine run_ccsd_space_orb !print*,'occ',list_occ !print*,'vir',list_vir - allocate(t2(nO,nO,nV,nV), r2(nO,nO,nV,nV)) - allocate(tau(nO,nO,nV,nV)) - allocate(tau_x(nO,nO,nV,nV)) - allocate(t1(nO,nV), r1(nO,nV)) - allocate(H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO)) + ! GPU arrays + call gpu_allocate(d_cc_space_f_oo, nO, nO) + call gpu_allocate(d_cc_space_f_vo, nV, nO) + call gpu_allocate(d_cc_space_f_ov, nO, nV) + call gpu_allocate(d_cc_space_f_vv, nV, nV) + + call gpu_upload(cc_space_f_oo, d_cc_space_f_oo) + call gpu_upload(cc_space_f_vo, d_cc_space_f_vo) + call gpu_upload(cc_space_f_ov, d_cc_space_f_ov) + call gpu_upload(cc_space_f_vv, d_cc_space_f_vv) + +! FREE cc_space_f_oo +! FREE cc_space_f_vo +! FREE cc_space_f_vv + + if (do_mo_cholesky) then + call gpu_allocate(d_cc_space_v_oo_chol, cholesky_mo_num, nO, nO) + call gpu_allocate(d_cc_space_v_ov_chol, cholesky_mo_num, nO, nV) + call gpu_allocate(d_cc_space_v_vo_chol, cholesky_mo_num, nV, nO) + call gpu_allocate(d_cc_space_v_vv_chol, cholesky_mo_num, nV, nV) + + call gpu_upload(cc_space_v_oo_chol, d_cc_space_v_oo_chol) + call gpu_upload(cc_space_v_ov_chol, d_cc_space_v_ov_chol) + call gpu_upload(cc_space_v_vo_chol, d_cc_space_v_vo_chol) + call gpu_upload(cc_space_v_vv_chol, d_cc_space_v_vv_chol) + +! FREE cc_space_v_oo_chol +! FREE cc_space_v_ov_chol +! FREE cc_space_v_vo_chol +! FREE cc_space_v_vv_chol + endif + + call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV) + call gpu_allocate(d_cc_space_v_voov, nV, nO, nO, nV) + call gpu_allocate(d_cc_space_v_ovov, nO, nV, nO, nV) + call gpu_allocate(d_cc_space_v_oovo, nO, nO, nV, nO) + call gpu_allocate(d_cc_space_v_ovvo, nO, nV, nV, nO) + call gpu_allocate(d_cc_space_v_vooo, nV, nO, nO, nO) + call gpu_allocate(d_cc_space_v_oooo, nO, nO, nO, nO) + call gpu_allocate(d_cc_space_v_vvoo, nV, nV, nO, nO) + call gpu_allocate(d_cc_space_v_ovoo, nO, nV, nO, nO) + + call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv) + call gpu_upload(cc_space_v_voov, d_cc_space_v_voov) + call gpu_upload(cc_space_v_ovov, d_cc_space_v_ovov) + call gpu_upload(cc_space_v_oovo, d_cc_space_v_oovo) + call gpu_upload(cc_space_v_ovvo, d_cc_space_v_ovvo) + call gpu_upload(cc_space_v_vooo, d_cc_space_v_vooo) + call gpu_upload(cc_space_v_oooo, d_cc_space_v_oooo) + call gpu_upload(cc_space_v_vvoo, d_cc_space_v_vvoo) + call gpu_upload(cc_space_v_ovoo, d_cc_space_v_ovoo) + +! FREE cc_space_v_voov +! FREE cc_space_v_ovov +! FREE cc_space_v_oovo +! FREE cc_space_v_oovv +! FREE cc_space_v_vooo +! FREE cc_space_v_oooo +! FREE cc_space_v_vvoo +! FREE cc_space_v_ovvo +! FREE cc_space_v_ovoo + + call gpu_allocate(t2, nO,nO,nV,nV) + call gpu_allocate(r2, nO,nO,nV,nV) + call gpu_allocate(tau, nO,nO,nV,nV) + call gpu_allocate(tau_x, nO,nO,nV,nV) + call gpu_allocate(t1, nO,nV) + call gpu_allocate(r1, nO,nV) + call gpu_allocate(H_oo, nO, nO) + call gpu_allocate(H_vo, nV, nO) + call gpu_allocate(H_vv, nV, nV) if (cc_update_method == 'diis') then double precision :: rss, diis_mem, extra_mem @@ -97,14 +174,22 @@ subroutine run_ccsd_space_orb endif ! 
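Every cc_space_* array used on the accelerator goes through the same lifecycle above: gpu_allocate a typed mirror (gpu_double2/3/4), gpu_upload the host array once, work on the %f component, gpu_deallocate at the end. A condensed fragment of that pattern (assumes QP's gpu module; not a standalone program):

```fortran
subroutine gpu_roundtrip_sketch(n)
  use gpu                     ! QP gpu module, as in run_ccsd_space_orb
  implicit none
  integer, intent(in) :: n
  double precision, allocatable :: h(:,:)  ! host buffer
  type(gpu_double2) :: d                   ! device-side mirror
  allocate(h(n,n))
  call random_number(h)
  call gpu_allocate(d, n, n)  ! reserve accelerator memory
  call gpu_upload(h, d)       ! host -> device copy
  ! ... operate on d%f with gpu_dgemm / gpu_dgeam / gpu_ddot ...
  call gpu_deallocate(d)
  deallocate(h)
end subroutine gpu_roundtrip_sketch
```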
Init - call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,t1) - call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,t2) - call update_tau_space(nO,nV,t1,t2,tau) + double precision, allocatable :: h_t1(:,:), h_t2(:,:,:,:) + allocate(h_t1(nO,nV), h_t2(nO,nO,nV,nV)) + + call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,h_t1) + call gpu_upload(h_t1, t1) + + call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,h_t2) + call gpu_upload(h_t2, t2) + + + call update_tau_space(nO,nV,h_t1,t1,t2,tau) call update_tau_x_space(nO,nV,tau,tau_x) - !print*,'hf_energy', hf_energy call det_energy(det,uncorr_energy) print*,'Det energy', uncorr_energy - call ccsd_energy_space_x(nO,nV,tau_x,t1,energy) + + call ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy) print*,'Guess energy', uncorr_energy+energy, energy nb_iter = 0 @@ -120,43 +205,45 @@ subroutine run_ccsd_space_orb do while (not_converged) ! Residue - if (do_ao_cholesky) then -! if (.False.) then - call compute_H_oo_chol(nO,nV,tau_x,H_oo) - call compute_H_vv_chol(nO,nV,tau_x,H_vv) - call compute_H_vo_chol(nO,nV,t1,H_vo) + if (do_mo_cholesky) then + call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo) + call compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, d_cc_space_v_ov_chol,H_vv) + call compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol, H_vo) - call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) - call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) + call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_space_f_ov,d_cc_space_f_vo, & + d_cc_space_v_voov, d_cc_space_v_ovov, d_cc_space_v_oovo, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol) + call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & + d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, d_cc_space_v_oovo, d_cc_space_v_ovvo, d_cc_space_v_ovoo, & + d_cc_space_v_ovov, d_cc_space_v_vvoo, d_cc_space_v_oo_chol, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, & + d_cc_space_f_vo, & + r2, max_r2) else - call compute_H_oo(nO,nV,t1,t2,tau,H_oo) - call compute_H_vv(nO,nV,t1,t2,tau,H_vv) - call compute_H_vo(nO,nV,t1,t2,H_vo) + call compute_H_oo(nO,nV,t1%f,t2%f,tau%f,H_oo%f) + call compute_H_vv(nO,nV,t1%f,t2%f,tau%f,H_vv%f) + call compute_H_vo(nO,nV,t1%f,t2%f,H_vo%f) - call compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) - call compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) + call compute_r1_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r1%f,max_r1) + call compute_r2_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r2%f,max_r2) endif max_r = max(max_r1,max_r2) ! Update if (cc_update_method == 'diis') then - !call update_t_ccsd(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2) - !call update_t_ccsd_diis(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2) - call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1,r2,t1,t2,all_err,all_t) + call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1%f,r2%f,t1%f,t2%f,all_err,all_t) ! 
Standard update as T = T - Delta elseif (cc_update_method == 'none') then - call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1,t1) - call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2,t2) + call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1%f,t1%f) + call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2%f,t2%f) else print*,'Unkown cc_method_method: '//cc_update_method endif - call update_tau_space(nO,nV,t1,t2,tau) + call update_tau_space(nO,nV,t1%f,t1,t2,tau) call update_tau_x_space(nO,nV,tau,tau_x) ! Energy - call ccsd_energy_space_x(nO,nV,tau_x,t1,energy) + call ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy) write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |' nb_iter = nb_iter + 1 @@ -181,8 +268,8 @@ subroutine run_ccsd_space_orb print*,'' if (write_amplitudes) then - call write_t1(nO,nV,t1) - call write_t2(nO,nV,t2) + call write_t1(nO,nV,t1%f) + call write_t2(nO,nV,t2%f) call ezfio_set_utils_cc_io_amplitudes('Read') endif @@ -191,7 +278,14 @@ subroutine run_ccsd_space_orb deallocate(all_err,all_t) endif - deallocate(H_vv,H_oo,H_vo,r1,r2,tau) + call gpu_deallocate(H_oo) + call gpu_deallocate(H_vv) + call gpu_deallocate(H_vo) + + call gpu_deallocate(r1) + call gpu_deallocate(r2) + call gpu_deallocate(tau) + call gpu_deallocate(tau_x) ! CCSD(T) double precision :: e_t, e_t_err @@ -199,28 +293,14 @@ subroutine run_ccsd_space_orb if (cc_par_t .and. elec_alpha_num + elec_beta_num > 2) then - ! Dumb way - !call wall_time(ta) - !call ccsd_par_t_space(nO,nV,t1,t2,e_t) - !call wall_time(tb) - !print*,'Time: ',tb-ta, ' s' - - !print*,'' - !write(*,'(A15,F18.12,A3)') ' E(CCSD(T)) = ', uncorr_energy + energy + e_t, ' Ha' - !write(*,'(A15,F18.12,A3)') ' E(T) = ', e_t, ' Ha' - !write(*,'(A15,F18.12,A3)') ' Correlation = ', energy + e_t, ' Ha' - !print*,'' - ! New e_t = uncorr_energy + energy ! For print in (T) call e_t_err = 0.d0 print*,'Computing (T) correction...' call wall_time(ta) -! call ccsd_par_t_space_v3(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v & -! ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t) - call ccsd_par_t_space_stoch(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v & + call ccsd_par_t_space_stoch(nO,nV,t1%f,t2%f,cc_space_f_o,cc_space_f_v & ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t, e_t_err) call wall_time(tb) @@ -235,168 +315,161 @@ subroutine run_ccsd_space_orb call save_energy(uncorr_energy + energy, e_t) - deallocate(t1,t2) + deallocate(h_t1, h_t2) + if (do_mo_cholesky) then + call gpu_deallocate(d_cc_space_v_oo_chol) + call gpu_deallocate(d_cc_space_v_ov_chol) + call gpu_deallocate(d_cc_space_v_vo_chol) + call gpu_deallocate(d_cc_space_v_vv_chol) + endif + + call gpu_deallocate(d_cc_space_v_oovv) + call gpu_deallocate(d_cc_space_v_voov) + call gpu_deallocate(d_cc_space_v_ovov) + call gpu_deallocate(d_cc_space_v_oovo) + call gpu_deallocate(d_cc_space_v_ovvo) + call gpu_deallocate(d_cc_space_v_vooo) + call gpu_deallocate(d_cc_space_v_oooo) + call gpu_deallocate(d_cc_space_v_vvoo) + call gpu_deallocate(d_cc_space_v_ovoo) + + call gpu_deallocate(d_cc_space_f_oo) + call gpu_deallocate(d_cc_space_f_vo) + call gpu_deallocate(d_cc_space_f_ov) + call gpu_deallocate(d_cc_space_f_vv) + + call gpu_deallocate(t1) + call gpu_deallocate(t2) end ! 
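For reference, the two device dot products in ccsd_energy_space_x below assemble the usual closed-shell CCSD correlation energy, written in the notation of the arrays above:

  energy = 2 * sum_{i,a} f_vo(a,i)*t1(i,a) + sum_{i,j,a,b} tau_x(i,j,a,b)*v_oovv(i,j,a,b)

with tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a)*t1(j,b) and tau_x(i,j,a,b) = 2*tau(i,j,a,b) - tau(i,j,b,a), exactly the quantities built by update_tau_space and update_tau_x_space further down.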
Energy -subroutine ccsd_energy_space(nO,nV,tau,t1,energy) - +subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy) + use gpu implicit none - integer, intent(in) :: nO, nV - double precision, intent(in) :: tau(nO,nO,nV,nV) - double precision, intent(in) :: t1(nO,nV) - double precision, intent(out) :: energy + integer, intent(in) :: nO, nV + type(gpu_double4), intent(in) :: tau_x, d_cc_space_v_oovv + type(gpu_double2), intent(in) :: t1, d_cc_space_f_vo + double precision, intent(out) :: energy ! internal integer :: i,j,a,b double precision :: e - energy = 0d0 - !$omp parallel & - !$omp shared(nO,nV,energy,tau,t1,& - !$omp cc_space_f_vo,cc_space_w_oovv) & - !$omp private(i,j,a,b,e) & - !$omp default(none) - e = 0d0 - !$omp do - do a = 1, nV - do i = 1, nO - e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a) - enddo - enddo - !$omp end do nowait - !$omp do - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - e = e + tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b) - enddo - enddo - enddo - enddo - !$omp end do nowait - !$omp critical - energy = energy + e - !$omp end critical - !$omp end parallel + type(gpu_stream) :: s1, s2 + call gpu_stream_create(s1) + call gpu_stream_create(s2) -end + call gpu_set_stream(blas_handle,s1) + call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo%f(1,1), 1, t1%f(1,1), 1, e) -subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy) + call gpu_set_stream(blas_handle,s2) + call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x%f(1,1,1,1), 1_8, d_cc_space_v_oovv%f(1,1,1,1), 1_8, energy) + call gpu_set_stream(blas_handle,gpu_default_stream) - implicit none + call gpu_synchronize() + call gpu_stream_destroy(s1) + call gpu_stream_destroy(s2) - integer, intent(in) :: nO, nV - double precision, intent(in) :: tau_x(nO,nO,nV,nV) - double precision, intent(in) :: t1(nO,nV) - double precision, intent(out) :: energy - - ! internal - integer :: i,j,a,b - double precision :: e - - energy = 0d0 - !$omp parallel & - !$omp shared(nO,nV,energy,tau_x,t1,& - !$omp cc_space_f_vo,cc_space_v_oovv) & - !$omp private(i,j,a,b,e) & - !$omp default(none) - e = 0d0 - !$omp do - do a = 1, nV - do i = 1, nO - e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a) - enddo - enddo - !$omp end do nowait - !$omp do - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - e = e + tau_x(i,j,a,b) * cc_space_v_oovv(i,j,a,b) - enddo - enddo - enddo - enddo - !$omp end do nowait - !$omp critical - energy = energy + e - !$omp end critical - !$omp end parallel + energy = energy + 2.d0*e end ! Tau -subroutine update_tau_space(nO,nV,t1,t2,tau) - +subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau) + use gpu implicit none ! in integer, intent(in) :: nO, nV - double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV) + double precision, intent(in) :: h_t1(nO,nV) + type(gpu_double2), intent(in) :: t1 + type(gpu_double4), intent(in) :: t2 ! out - double precision, intent(out) :: tau(nO,nO,nV,nV) + type(gpu_double4) :: tau ! 
internal integer :: i,j,a,b + type(gpu_stream) :: stream(nV) + !$OMP PARALLEL & - !$OMP SHARED(nO,nV,tau,t2,t1) & + !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) !$OMP DO - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b) - enddo - enddo + do b=1,nV + call gpu_stream_create(stream(b)) + call gpu_set_stream(blas_handle,stream(b)) + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, & + 1.d0, t2%f(1,j,1,b), nO*nO, & + h_t1(j,b), t1%f(1,1), nO, & + tau%f(1,j,1,b), nO*nO) enddo enddo !$OMP END DO !$OMP END PARALLEL + call gpu_synchronize() + + do b=1,nV + call gpu_stream_destroy(stream(b)) + enddo + call gpu_set_stream(blas_handle,gpu_default_stream) + + end subroutine update_tau_x_space(nO,nV,tau,tau_x) - + use gpu implicit none ! in - integer, intent(in) :: nO, nV - double precision, intent(in) :: tau(nO,nO,nV,nV) + integer, intent(in) :: nO, nV + type(gpu_double4), intent(in) :: tau ! out - double precision, intent(out) :: tau_x(nO,nO,nV,nV) + type(gpu_double4) :: tau_x ! internal integer :: i,j,a,b + type(gpu_stream) :: stream(nV) + + do a=1,nV + call gpu_stream_create(stream(a)) + enddo + !$OMP PARALLEL & - !$OMP SHARED(nO,nV,tau,tau_x) & - !$OMP PRIVATE(i,j,a,b) & + !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) & + !$OMP PRIVATE(a,b) & !$OMP DEFAULT(NONE) !$OMP DO - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - tau_x(i,j,a,b) = 2.d0*tau(i,j,a,b) - tau(i,j,b,a) - enddo - enddo + do b=1,nV + do a=1,nV + call gpu_set_stream(blas_handle,stream(a)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, & + 2.d0, tau%f(1,1,a,b), nO, & + -1.d0, tau%f(1,1,b,a), nO, & + tau_x%f(1,1,a,b), nO) enddo enddo !$OMP END DO !$OMP END PARALLEL + call gpu_set_stream(blas_handle,gpu_default_stream) + call gpu_synchronize() + + do b=1,nV + call gpu_stream_destroy(stream(b)) + enddo + + end ! R1 diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index b59dc0bb..6f65ea79 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -1,989 +1,789 @@ -subroutine ccsd_energy_space_chol(nO,nV,tau,t1,energy) - - implicit none - - integer, intent(in) :: nO, nV - double precision, intent(in) :: tau(nO,nO,nV,nV) - double precision, intent(in) :: t1(nO,nV) - double precision, intent(out) :: energy - - ! internal - integer :: i,j,a,b - double precision :: e - - energy = 0d0 - !$omp parallel & - !$omp shared(nO,nV,energy,tau,t1,& - !$omp cc_space_f_vo,cc_space_w_oovv) & - !$omp private(i,j,a,b,e) & - !$omp default(none) - e = 0d0 - !$omp do - do a = 1, nV - do i = 1, nO - e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a) - enddo - enddo - !$omp end do nowait - !$omp do - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - e = e + tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b) - enddo - enddo - enddo - enddo - !$omp end do nowait - !$omp critical - energy = energy + e - !$omp end critical - !$omp end parallel - -end - -! Tau - -subroutine update_tau_space_chol(nO,nV,t1,t2,tau) - - implicit none - - ! in - integer, intent(in) :: nO, nV - double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV) - - ! out - double precision, intent(out) :: tau(nO,nO,nV,nV) - - ! 
internal - integer :: i,j,a,b - - !$OMP PARALLEL & - !$OMP SHARED(nO,nV,tau,t2,t1) & - !$OMP PRIVATE(i,j,a,b) & - !$OMP DEFAULT(NONE) - !$OMP DO - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b) - enddo - enddo - enddo - enddo - !$OMP END DO - !$OMP END PARALLEL - -end - -! R1 - -subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) - - implicit none - - ! in - integer, intent(in) :: nO, nV - double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV), tau(nO,nO,nV,nV) - double precision, intent(in) :: H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO) - - ! out - double precision, intent(out) :: r1(nO,nV), max_r1 - - ! internal - integer :: u,i,j,beta,a,b - - !$omp parallel & - !$omp shared(nO,nV,r1,cc_space_f_ov) & - !$omp private(u,beta) & - !$omp default(none) - !$omp do - do beta = 1, nV - do u = 1, nO - r1(u,beta) = cc_space_f_ov(u,beta) - enddo - enddo - !$omp end do - !$omp end parallel - - double precision, allocatable :: X_oo(:,:) - allocate(X_oo(nO,nO)) - call dgemm('N','N', nO, nO, nV, & - -2d0, t1 , size(t1,1), & - cc_space_f_vo, size(cc_space_f_vo,1), & - 0d0, X_oo , size(X_oo,1)) - - call dgemm('T','N', nO, nV, nO, & - 1d0, X_oo, size(X_oo,2), & - t1 , size(t1,1), & - 1d0, r1 , size(r1,1)) - deallocate(X_oo) - - call dgemm('N','N', nO, nV, nV, & - 1d0, t1 , size(t1,1), & - H_vv, size(H_vv,1), & - 1d0, r1 , size(r1,1)) - - call dgemm('N','N', nO, nV, nO, & - -1d0, H_oo, size(H_oo,1), & - t1 , size(t1,1), & - 1d0, r1, size(r1,1)) - - double precision, allocatable :: X_voov(:,:,:,:) - allocate(X_voov(nV, nO, nO, nV)) - - !$omp parallel & - !$omp shared(nO,nV,X_voov,t2,t1) & - !$omp private(u,beta,i,a) & - !$omp default(none) - !$omp do - do beta = 1, nV - do u = 1, nO - do i = 1, nO - do a = 1, nV - X_voov(a,i,u,beta) = 2d0 * t2(i,u,a,beta) - t2(u,i,a,beta) + t1(u,a) * t1(i,beta) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - call dgemv('T', nV*nO, nO*nV, & - 1d0, X_voov, size(X_voov,1) * size(X_voov,2), & - H_vo , 1, & - 1d0, r1 , 1) - - deallocate(X_voov) - - double precision, allocatable :: X_ovov(:,:,:,:) - allocate(X_ovov(nO, nV, nO, nV)) - - !$omp parallel & - !$omp shared(nO,nV,cc_space_v_ovov,cc_space_v_voov,X_ovov) & - !$omp private(u,beta,i,a) & - !$omp default(none) - !$omp do - do beta = 1, nV - do u = 1, nO - do a = 1, nv - do i = 1, nO - X_ovov(i,a,u,beta) = 2d0 * cc_space_v_voov(a,u,i,beta) - cc_space_v_ovov(u,a,i,beta) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - call dgemv('T', nO*nV, nO*nV, & - 1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), & - t1 , 1, & - 1d0, r1 , 1) - - deallocate(X_ovov) - - integer :: iblock, block_size, nVmax - double precision, allocatable :: W_vvov(:,:,:,:), W_vvov_tmp(:,:,:,:), T_vvoo(:,:,:,:) - block_size = 16 - allocate(W_vvov(nV,nV,nO,block_size), W_vvov_tmp(nV,nO,nV,block_size), T_vvoo(nV,nV,nO,nO)) - - !$omp parallel & - !$omp private(u,i,b,a) & - !$omp default(shared) - !$omp do - do u = 1, nO - do i = 1, nO - do b = 1, nV - do a = 1, nV - T_vvoo(a,b,i,u) = tau(i,u,a,b) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - do iblock = 1, nV, block_size - nVmax = min(block_size,nV-iblock+1) - - call dgemm('T','N', nV*nO, nV*nVmax, cholesky_mo_num, 1.d0, & - cc_space_v_vo_chol , cholesky_mo_num, & - cc_space_v_vv_chol(1,1,iblock), cholesky_mo_num, & - 0.d0, W_vvov_tmp, nV*nO) - - !$omp parallel & - !$omp private(b,i,a,beta) & - !$omp default(shared) - do beta = 1, nVmax - do i = 1, nO - !$omp do - 
do b = 1, nV - do a = 1, nV - W_vvov(a,b,i,beta) = 2d0 * W_vvov_tmp(a,i,b,beta) - W_vvov_tmp(b,i,a,beta) - enddo - enddo - !$omp end do nowait - enddo - enddo - !$omp barrier - !$omp end parallel - - call dgemm('T','N',nO,nVmax,nO*nV*nV, & - 1d0, T_vvoo, nV*nV*nO, & - W_vvov, nO*nV*nV, & - 1d0, r1(1,iblock), nO) - enddo - - deallocate(W_vvov,T_vvoo) - - - double precision, allocatable :: W_oovo(:,:,:,:) - allocate(W_oovo(nO,nO,nV,nO)) - - !$omp parallel & - !$omp shared(nO,nV,cc_space_v_oovo,W_oovo) & - !$omp private(u,a,i,j) & - !$omp default(none) - do u = 1, nO - !$omp do - do a = 1, nV - do j = 1, nO - do i = 1, nO -! W_oovo(i,j,a,u) = 2d0 * cc_space_v_vooo(a,u,i,j) - cc_space_v_vooo(a,u,j,i) - W_oovo(i,j,a,u) = 2d0 * cc_space_v_oovo(i,j,a,u) - cc_space_v_oovo(j,i,a,u) - enddo - enddo - enddo - !$omp end do nowait - enddo - !$omp end parallel - - call dgemm('T','N', nO, nV, nO*nO*nV, & - -1d0, W_oovo, size(W_oovo,1) * size(W_oovo,2) * size(W_oovo,3), & - tau , size(tau,1) * size(tau,2) * size(tau,3), & - 1d0, r1 , size(r1,1)) - - deallocate(W_oovo) - - max_r1 = 0d0 - do a = 1, nV - do i = 1, nO - max_r1 = max(dabs(r1(i,a)), max_r1) - enddo - enddo - - ! Change the sign for consistency with the code in spin orbitals - !$omp parallel & - !$omp shared(nO,nV,r1) & - !$omp private(a,i) & - !$omp default(none) - !$omp do - do a = 1, nV - do i = 1, nO - r1(i,a) = -r1(i,a) - enddo - enddo - !$omp end do - !$omp end parallel - -end - ! H_oo -subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo) - +subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & + d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo) + use gpu implicit none integer, intent(in) :: nO,nV - double precision, intent(in) :: tau_x(nO, nO, nV, nV) - double precision, intent(out) :: H_oo(nO, nO) + type(gpu_double2), intent(in) :: d_cc_space_f_oo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol + type(gpu_double4), intent(in) :: tau_x + type(gpu_double2), intent(out) :: H_oo integer :: a,b,i,j,u,k - double precision, allocatable :: tau_kau(:,:,:), tmp_vov(:,:,:) + type(gpu_double3) :: tau_kau, tmp_vov, tmp_ovv - allocate(tau_kau(cholesky_mo_num,nV,nO)) - !$omp parallel & - !$omp default(shared) & - !$omp private(i,u,j,k,a,b,tmp_vov) - allocate(tmp_vov(nV,nO,nV) ) - !$omp do - do u = 1, nO + call gpu_allocate(tau_kau, cholesky_mo_num, nV, nO) + + type(gpu_blas) :: blas + + + !$OMP PARALLEL & + !$OMP DEFAULT(SHARED) & + !$OMP PRIVATE(blas,u,b,tmp_vov,tmp_ovv) + + !$OMP SINGLE + !$OMP TASK + call gpu_copy(d_cc_space_f_oo, H_oo) + !$OMP END TASK + !$OMP END SINGLE + + call gpu_allocate(tmp_ovv, nO, nV, nV) + call gpu_allocate(tmp_vov, nV, nO, nV) + + call gpu_blas_create(blas) + + !$OMP DO + do u=1,nO + call gpu_dgeam(blas, 'N', 'N', 1, nO*nV*nV, 1.d0, & + tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f(1,1,1,1), nO, tmp_ovv%f(1,1,1), 1) do b=1,nV - do j=1,nO - do a=1,nV - tmp_vov(a,j,b) = tau_x(u,j,a,b) - enddo - enddo + call gpu_dgeam(blas, 'T', 'T', nV, nO, 1.d0, & + tmp_ovv%f(1,1,b), nO, 0.d0, & + tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV) enddo - call dgemm('N','T',cholesky_mo_num,nV,nO*nV,1.d0, & - cc_space_v_ov_chol, cholesky_mo_num, tmp_vov, nV, & - 0.d0, tau_kau(1,1,u), cholesky_mo_num) + call gpu_dgemm(blas, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, tmp_vov%f(1,1,1), nV, & + 0.d0, tau_kau%f(1,1,u), cholesky_mo_num) enddo - !$omp end do nowait - deallocate(tmp_vov) - !$omp do - do i = 1, nO - do u = 1, nO - H_oo(u,i) = cc_space_f_oo(u,i) - enddo - enddo - !$omp end do 
nowait - !$omp barrier - !$omp end parallel - call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & - tau_kau, cholesky_mo_num*nV, cc_space_v_vo_chol, cholesky_mo_num*nV, & - 1.d0, H_oo, nO) + !$OMP END DO + call gpu_blas_destroy(blas) + + call gpu_deallocate(tmp_vov) + call gpu_deallocate(tmp_ovv) + + !$OMP TASKWAIT + !$OMP END PARALLEL + + call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & + tau_kau%f(1,1,1), cholesky_mo_num*nV, d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num*nV, & + 1.d0, H_oo%f(1,1), nO) + + call gpu_synchronize() + call gpu_deallocate(tau_kau) end ! H_vv -subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv) - +subroutine compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, & + d_cc_space_v_ov_chol,H_vv) + use gpu implicit none - integer, intent(in) :: nO,nV - double precision, intent(in) :: tau_x(nO, nO, nV, nV) - double precision, intent(out) :: H_vv(nV, nV) + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: d_cc_space_f_vv + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol + type(gpu_double4), intent(in) :: tau_x + type(gpu_double2), intent(out) :: H_vv integer :: a,b,i,j,u,k, beta - double precision, allocatable :: tau_kia(:,:,:), tmp_oov(:,:,:) + type(gpu_double3) :: tau_kia, tmp_oov - allocate(tau_kia(cholesky_mo_num,nO,nV)) - !$omp parallel & - !$omp default(shared) & - !$omp private(i,beta,j,k,a,b,tmp_oov) - allocate(tmp_oov(nO,nO,nV) ) - !$omp do + call gpu_allocate(tau_kia, cholesky_mo_num, nO, nV) + + type(gpu_blas) :: blas + + !$OMP PARALLEL & + !$OMP DEFAULT(SHARED) & + !$OMP PRIVATE(a,b,tmp_oov,blas) + + !$OMP SINGLE + !$OMP TASK + call gpu_copy(d_cc_space_f_vv, H_vv) + !$OMP END TASK + !$OMP END SINGLE + + call gpu_blas_create(blas) + call gpu_allocate(tmp_oov, nO, nO, nV) + + !$OMP DO do a = 1, nV do b=1,nV - do j=1,nO - do i=1,nO - tmp_oov(i,j,b) = tau_x(i,j,a,b) - enddo - enddo + call gpu_dgeam(blas, 'N', 'N', nO, nO, 1.d0, & + tau_x%f(1,1,a,b), nO, 0.d0, & + tau_x%f(1,1,a,b), nO, tmp_oov%f(1,1,b), nO) enddo - call dgemm('N','T',cholesky_mo_num,nO,nO*nV,1.d0, & - cc_space_v_ov_chol, cholesky_mo_num, tmp_oov, nO, & - 0.d0, tau_kia(1,1,a), cholesky_mo_num) + call gpu_dgemm(blas, 'N', 'T', cholesky_mo_num, nO, nO*nV, 1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, tmp_oov%f(1,1,1), nO, & + 0.d0, tau_kia%f(1,1,a), cholesky_mo_num) enddo - !$omp end do nowait - deallocate(tmp_oov) + !$OMP END DO - !$omp do - do beta = 1, nV - do a = 1, nV - H_vv(a,beta) = cc_space_f_vv(a,beta) - enddo - enddo - !$omp end do nowait - !$omp barrier - !$omp end parallel - call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & - tau_kia, cholesky_mo_num*nO, cc_space_v_ov_chol, cholesky_mo_num*nO, & - 1.d0, H_vv, nV) + call gpu_blas_destroy(blas) + call gpu_deallocate(tmp_oov) + !$OMP TASKWAIT + !$OMP END PARALLEL + + call gpu_dgemm(blas_handle, 'T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & + tau_kia%f(1,1,1), cholesky_mo_num*nO, d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, & + 1.d0, H_vv%f(1,1), nV) + + call gpu_synchronize() + call gpu_deallocate(tau_kia) end ! 
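compute_H_oo_chol and compute_H_vv_chol above replace the explicit transpose/copy loops of the CPU version with gpu_dgeam, which (in the cuBLAS dgeam style used here) computes C = alpha*op(A) + beta*op(B); with beta = 0 and op = 'T' it is an out-of-place transpose. A fragment of that idiom (assumes QP's gpu module):

```fortran
subroutine transpose_on_gpu(m, n, X, Y)
  use gpu
  implicit none
  integer, intent(in) :: m, n
  type(gpu_double2), intent(in)    :: X   ! m x n
  type(gpu_double2), intent(inout) :: Y   ! n x m
  ! Y = 1.0 * X**T + 0.0 * X**T  (B is ignored when beta = 0)
  call gpu_dgeam(blas_handle, 'T', 'T', n, m, 1.d0, &
                 X%f(1,1), m, 0.d0, X%f(1,1), m, Y%f(1,1), n)
end subroutine transpose_on_gpu
```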
H_vo -subroutine compute_H_vo_chol(nO,nV,t1,H_vo) - +subroutine compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, & + d_cc_space_v_ov_chol,d_cc_space_v_vo_chol, H_vo) + use gpu implicit none - integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV) - double precision, intent(out) :: H_vo(nV, nO) + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: t1, d_cc_space_f_vo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol + type(gpu_double2), intent(out) :: H_vo integer :: a,b,i,j,u,k - double precision, allocatable :: tmp_k(:), tmp(:,:,:), tmp2(:,:,:) + type(gpu_double1) :: tmp_k + type(gpu_double3) :: tmp, tmp2 + + call gpu_copy(d_cc_space_f_vo, H_vo) + + call gpu_allocate(tmp_k, cholesky_mo_num) + + call gpu_dgemm(blas_handle, 'N', 'N', cholesky_mo_num, 1, nO*nV, 2.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, & + t1%f(1,1), nO*nV, 0.d0, tmp_k%f(1), cholesky_mo_num) + + call gpu_dgemm(blas_handle, 'T', 'N', nV*nO, 1, cholesky_mo_num, 1.d0, & + d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num, tmp_k%f(1), cholesky_mo_num, 1.d0, & + H_vo%f(1,1), nV*nO) + + call gpu_deallocate(tmp_k) + + + call gpu_allocate(tmp, cholesky_mo_num, nO, nO) + + call gpu_dgemm(blas_handle, 'N', 'T', cholesky_mo_num*nO, nO, nV, 1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, t1%f(1,1), nO, 0.d0, tmp%f(1,1,1), cholesky_mo_num*nO) + + call gpu_allocate(tmp2, cholesky_mo_num, nO, nO) + + type(gpu_stream) :: stream(nO) do i=1,nO - do a=1,nV - H_vo(a,i) = cc_space_f_vo(a,i) - enddo + call gpu_stream_create(stream(i)) enddo - allocate(tmp_k(cholesky_mo_num)) - call dgemm('N', 'N', cholesky_mo_num, 1, nO*nV, 2.d0, & - cc_space_v_ov_chol, cholesky_mo_num, & - t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num) - - call dgemm('T','N',nV*nO,1,cholesky_mo_num,1.d0, & - cc_space_v_vo_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, & - H_vo, nV*nO) - deallocate(tmp_k) - - allocate(tmp(cholesky_mo_num,nO,nO)) - allocate(tmp2(cholesky_mo_num,nO,nO)) - - call dgemm('N','T', cholesky_mo_num*nO, nO, nV, 1.d0, & - cc_space_v_ov_chol, cholesky_mo_num*nO, t1, nO, 0.d0, tmp, cholesky_mo_num*nO) - + !$OMP PARALLEL DO COLLAPSE(2) PRIVATE(i,j) do i=1,nO do j=1,nO - do k=1,cholesky_mo_num - tmp2(k,j,i) = tmp(k,i,j) - enddo + call gpu_set_stream(blas_handle,stream(j)) + call gpu_dgeam(blas_handle, 'N', 'N', cholesky_mo_num, 1, 1.d0, & + tmp%f(1,i,j), cholesky_mo_num, 0.d0, & + tmp%f(1,i,j), cholesky_mo_num, tmp2%f(1,j,i), cholesky_mo_num) enddo enddo - deallocate(tmp) + !$OMP END PARALLEL DO - call dgemm('T','N', nV, nO, cholesky_mo_num*nO, -1.d0, & - cc_space_v_ov_chol, cholesky_mo_num*nO, tmp2, cholesky_mo_num*nO, & - 1.d0, H_vo, nV) + call gpu_set_stream(blas_handle,gpu_default_stream) + call gpu_synchronize() + + do i=1,nO + call gpu_stream_destroy(stream(i)) + enddo + call gpu_deallocate(tmp) + + call gpu_dgemm(blas_handle, 'T','N', nV, nO, cholesky_mo_num*nO, -1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, tmp2%f(1,1,1), cholesky_mo_num*nO, & + 1.d0, H_vo%f(1,1), nV) + + call gpu_synchronize() + call gpu_deallocate(tmp2) +end + +! R1 + +subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_space_f_ov,d_cc_space_f_vo, & + d_cc_space_v_voov, d_cc_space_v_ovov, d_cc_space_v_oovo, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol) + use gpu + implicit none + + ! 
in + integer, intent(in) :: nO, nV + type(gpu_double2), intent(in) :: t1, H_oo, H_vo, H_vv, d_cc_space_f_ov,d_cc_space_f_vo + type(gpu_double3), intent(in) :: d_cc_space_v_vo_chol, d_cc_space_v_vv_chol + type(gpu_double4), intent(in) :: t2, tau, d_cc_space_v_voov, d_cc_space_v_ovov, d_cc_space_v_oovo + + ! out + type(gpu_double2), intent(out) :: r1 + double precision, intent(out) :: max_r1 + + ! internal + integer :: u,i,j,beta,a,b + + type(gpu_stream) :: stream(nV) + + do a=1,nV + call gpu_stream_create(stream(a)) + enddo + + type(gpu_double2) :: X_oo + call gpu_allocate(X_oo,nO,nO) + + call gpu_copy(d_cc_space_f_ov, r1) + + call gpu_set_stream(blas_handle, stream(1)) + call gpu_dgemm(blas_handle, 'N','N', nO, nV, nV, & + 1d0, t1%f(1,1) , size(t1%f,1), & + H_vv%f(1,1), size(H_vv%f,1), & + 1d0, r1%f(1,1) , size(r1%f,1)) + + call gpu_dgemm(blas_handle, 'N','N', nO, nV, nO, & + -1d0, H_oo%f(1,1), size(H_oo%f,1), & + t1%f(1,1) , size(t1%f,1), & + 1d0, r1%f(1,1), size(r1%f,1)) + + call gpu_set_stream(blas_handle, stream(nV)) + call gpu_dgemm(blas_handle, 'N','N', nO, nO, nV, & + -2d0, t1%f(1,1), size(t1%f,1), & + d_cc_space_f_vo%f(1,1), size(d_cc_space_f_vo%f,1), & + 0d0, X_oo%f(1,1), size(X_oo%f,1)) + + call gpu_synchronize() + call gpu_set_stream(blas_handle, gpu_default_stream) + + call gpu_dgemm(blas_handle, 'T','N', nO, nV, nO, & + 1d0, X_oo%f(1,1), size(X_oo%f,2), & + t1%f(1,1) , size(t1%f,1), & + 1d0, r1%f(1,1) , size(r1%f,1)) + + + + type(gpu_double4) :: X_voov + call gpu_allocate(X_voov, nV, nO, nO, nV) + + do i=1,nO + do beta=1,nV + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'T', 'T', nV, nO, -1.d0, t2%f(1,i,1,beta), & + nO*nO, t1%f(i,beta), t1%f(1,1), nO, X_voov%f(1,i,1,beta), nV*nO) + enddo + enddo + + do beta=1,nV + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'T', nV, nO*nO, 1.d0, X_voov%f(1,1,1,beta), & + nV, 2.d0, t2%f(1,1,1,beta), nO*nO, X_voov%f(1,1,1,beta), nV) + enddo + + call gpu_synchronize() + call gpu_deallocate(X_oo) + + call gpu_set_stream(blas_handle, gpu_default_stream) + + call gpu_dgemv(blas_handle, 'T', nV*nO, nO*nV, & + 1d0, X_voov%f(1,1,1,1), size(X_voov%f,1) * size(X_voov%f,2), & + H_vo%f(1,1) , 1, & + 1d0, r1%f(1,1) , 1) + + type(gpu_double4) :: X_ovov + call gpu_allocate(X_ovov, nO, nV, nO, nV) + + do beta = 1, nV + call gpu_set_stream(blas_handle, stream(beta)) + do u=1,nO + call gpu_dgeam(blas_handle, 'N', 'T', nO, nV, -1.d0, d_cc_space_v_ovov%f(1,1,u,beta), & + nO, 2.d0, d_cc_space_v_voov%f(1,u,1,beta), nV*nO, X_ovov%f(1,1,u,beta), nO) + enddo + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + call gpu_synchronize() + call gpu_deallocate(X_voov) + + call gpu_dgemv(blas_handle, 'T', nO*nV, nO*nV, & + 1d0, X_ovov%f(1,1,1,1), size(X_ovov%f,1) * size(X_ovov%f,2), & + t1%f(1,1), 1, & + 1d0, r1%f(1,1), 1) + + + integer :: iblock, block_size, nVmax + type(gpu_double4) :: W_vvov, W_vvov_tmp, T_vvoo + + block_size = 16 + call gpu_allocate(T_vvoo, nV,nV,nO,nO) + + call gpu_dgeam(blas_handle, 'T', 'N', nV*nV, nO*nO, 1.d0, tau%f(1,1,1,1), & + nO*nO, 0.d0, T_vvoo%f(1,1,1,1), nV*nV, T_vvoo%f(1,1,1,1), nV*nV) + + call gpu_allocate(W_vvov,nV, nV,nO,block_size) + call gpu_allocate(W_vvov_tmp, nV,nO,nV,block_size) + + do iblock = 1, nV, block_size + nVmax = min(block_size,nV-iblock+1) + + call gpu_dgemm(blas_handle, 'T','N', nV*nO, nV*nVmax, cholesky_mo_num, 1.d0, & + d_cc_space_v_vo_chol%f(1,1,1) , cholesky_mo_num, & + d_cc_space_v_vv_chol%f(1,1,iblock), cholesky_mo_num, & + 0.d0, 
W_vvov_tmp%f(1,1,1,1), nV*nO) + + call gpu_synchronize() + do b=1,nV + call gpu_set_stream(blas_handle, stream(b)) + do i=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nV, nVmax, 2.d0, W_vvov_tmp%f(1,i,b,1), & + nV*nO*nV, 0.d0, W_vvov_tmp%f(1,i,b,1), nV*nO*nV, W_vvov%f(1,b,i,1), nV*nV*nO) + enddo + enddo + + call gpu_synchronize() + + do beta = 1, nVmax + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'T', nV, nV*nO, 1.d0, W_vvov%f(1,1,1,beta), & + nV, -1.d0, W_vvov_tmp%f(1,1,1,beta), nV*nO, W_vvov%f(1,1,1,beta), nV) + enddo + call gpu_synchronize() + + call gpu_dgemm(blas_handle, 'T','N',nO,nVmax,nO*nV*nV, & + 1d0, T_vvoo%f(1,1,1,1), nV*nV*nO, & + W_vvov%f(1,1,1,1), nO*nV*nV, & + 1d0, r1%f(1,iblock), nO) + enddo + + call gpu_deallocate(X_ovov) + + type(gpu_double4) :: W_oovo + call gpu_allocate(W_oovo, nO,nO,nV,nO) + + do u = 1, nO + do a = 1, nV + call gpu_set_stream(blas_handle, stream(a)) + call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, 2.d0, d_cc_space_v_oovo%f(1,1,a,u), & + nO, -1.d0, d_cc_space_v_oovo%f(1,1,a,u), nO, W_oovo%f(1,1,a,u), nO) + enddo + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + call gpu_synchronize() + + call gpu_deallocate(W_vvov) + call gpu_deallocate(T_vvoo) + + ! Change the sign for consistency with the code in spin orbitals + call gpu_dgemm(blas_handle, 'T','N', nO, nV, nO*nO*nV, & + 1d0, W_oovo%f(1,1,1,1), size(W_oovo%f,1) * size(W_oovo%f,2) * size(W_oovo%f,3), & + tau%f(1,1,1,1), size(tau%f,1) * size(tau%f,2) * size(tau%f,3), & + -1d0, r1%f(1,1), size(r1%f,1)) + + call gpu_synchronize() + call gpu_deallocate(W_oovo) + + max_r1 = 0d0 + do a = 1, nV + do i = 1, nO + max_r1 = max(dabs(r1%f(i,a)), max_r1) + enddo + enddo + + do a=1,nV + call gpu_stream_destroy(stream(a)) + enddo end ! R2 -subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) - +subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & + d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, d_cc_space_v_oovo, d_cc_space_v_ovvo, d_cc_space_v_ovoo, & + d_cc_space_v_ovov, d_cc_space_v_vvoo, d_cc_space_v_oo_chol, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, & + d_cc_space_f_vo, & + r2,max_r2) + use gpu implicit none ! in - integer, intent(in) :: nO, nV - double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV), tau(nO,nO,nV,nV) - double precision, intent(in) :: H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO) + integer, intent(in) :: nO, nV + type(gpu_double2), intent(in) :: t1, H_oo, H_vv, d_cc_space_f_vo + type(gpu_double4), intent(in) :: t2, tau, d_cc_space_v_oovv + type(gpu_double4), intent(in) :: d_cc_space_v_vooo, d_cc_space_v_oooo + type(gpu_double4), intent(in) :: d_cc_space_v_vvoo, d_cc_space_v_oovo + type(gpu_double4), intent(in) :: d_cc_space_v_ovvo, d_cc_space_v_ovoo + type(gpu_double4), intent(in) :: d_cc_space_v_ovov + type(gpu_double3), intent(in) :: d_cc_space_v_oo_chol, d_cc_space_v_ov_chol + type(gpu_double3), intent(in) :: d_cc_space_v_vo_chol, d_cc_space_v_vv_chol ! out - double precision, intent(out) :: r2(nO,nO,nV,nV), max_r2 + double precision, intent(out) :: max_r2 + type(gpu_double4), intent(out) :: r2 ! internal integer :: u,v,i,j,beta,gam,a,b double precision :: max_r2_local + type(gpu_stream) :: stream(nV) + call set_multiple_levels_omp(.False.) 
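! --- [editor's note: illustrative aside, not part of the patch] -------------
! The hunk below repeatedly collapses OpenMP initialisation loops such as
!   r2(u,v,beta,gam) = cc_space_v_oovv(u,v,beta,gam)
! into a single gpu_copy between device-resident containers (gpu_double4 etc.,
! whose %f component aliases the buffer on the Fortran side). A minimal sketch
! of that container life cycle, assuming the gpu module of
! src/gpu/gpu_module.F90 and a (host,device)/(device,host) argument order in
! the upload/download overloads, as the gpu_upload_c/gpu_download_c bindings
! suggest:
program copy_pattern
  use gpu
  implicit none
  integer, parameter :: n = 8
  type(gpu_double2)  :: a, b
  double precision   :: h(n,n)

  h = 1.d0
  call gpu_allocate(a, n, n)   ! device buffer (plain malloc for the x86 backend)
  call gpu_allocate(b, n, n)
  call gpu_upload(h, a)        ! host -> device
  call gpu_copy(a, b)          ! device -> device: replaces the explicit loop nest
  call gpu_download(b, h)      ! device -> host
  print *, h(1,1)              ! 1.0
  call gpu_deallocate(a)
  call gpu_deallocate(b)
end program
! ----------------------------------------------------------------------------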
- !$omp parallel & - !$omp shared(nO,nV,r2,cc_space_v_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = cc_space_v_oovv(u,v,beta,gam) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel + call gpu_copy(d_cc_space_v_oovv, r2) - double precision, allocatable :: A1(:,:,:,:) - allocate(A1(nO,nO,nO,nO)) - call compute_A1_chol(nO,nV,t1,t2,tau,A1) - call dgemm('N','N',nO*nO,nV*nV,nO*nO, & - 1d0, A1, size(A1,1) * size(A1,2), & - tau, size(tau,1) * size(tau,2), & - 1d0, r2, size(r2,1) * size(r2,2)) + type(gpu_double4) :: A1 + call gpu_allocate(A1,nO,nO,nO,nO) + call compute_A1_chol(nO,nV,t1,t2,tau,d_cc_space_v_vooo, & + d_cc_space_v_oooo, d_cc_space_v_vvoo, A1) + + call gpu_dgemm(blas_handle, 'N','N',nO*nO,nV*nV,nO*nO, & + 1d0, A1%f(1,1,1,1), size(A1%f,1) * size(A1%f,2), & + tau%f(1,1,1,1), size(tau%f,1) * size(tau%f,2), & + 1d0, r2%f(1,1,1,1), size(r2%f,1) * size(r2%f,2)) + + call gpu_deallocate(A1) - deallocate(A1) integer :: block_size, iblock, k block_size = 16 - double precision, dimension(:,:,:), allocatable :: B1, tmp_cc, tmpB1 - double precision, dimension(:,:), allocatable :: tmp_cc2 + type(gpu_double3) :: tmp_cc, B1, tmpB1 + type(gpu_double2) :: tmp_cc2 - allocate(tmp_cc(cholesky_mo_num,nV,nV)) - call dgemm('N','N', cholesky_mo_num*nV, nV, nO, 1.d0, & - cc_space_v_vo_chol, cholesky_mo_num*nV, t1, nO, 0.d0, tmp_cc, cholesky_mo_num*nV) + call gpu_allocate(tmp_cc,cholesky_mo_num,nV,nV) + call gpu_dgemm(blas_handle, 'N','N', cholesky_mo_num*nV, nV, nO, 1.d0, & + d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num*nV, t1%f(1,1), nO, 0.d0, tmp_cc%f(1,1,1), cholesky_mo_num*nV) call set_multiple_levels_omp(.False.) + call gpu_synchronize() + + type(gpu_blas) :: blas + + !$OMP PARALLEL PRIVATE(gam, iblock, B1, tmpB1, tmp_cc2, beta, b, a, blas) + call gpu_allocate(B1,nV,nV,block_size) + call gpu_allocate(tmpB1,nV,block_size,nV) + call gpu_allocate(tmp_cc2,cholesky_mo_num,nV) + + call gpu_blas_create(blas) - !$OMP PARALLEL PRIVATE(gam, iblock, B1, tmpB1, tmp_cc2, beta, b, a) - allocate(B1(nV,nV,block_size), tmpB1(nV,block_size,nV), tmp_cc2(cholesky_mo_num,nV)) !$OMP DO do gam = 1, nV - do a=1,nV - do k=1,cholesky_mo_num - tmp_cc2(k,a) = cc_space_v_vv_chol(k,a,gam) - tmp_cc(k,a,gam) - enddo - enddo + call gpu_dgeam(blas, 'N', 'N', cholesky_mo_num, nV, 1.d0, d_cc_space_v_vv_chol%f(1,1,gam), & + cholesky_mo_num, -1.d0, tmp_cc%f(1,1,gam), cholesky_mo_num, tmp_cc2%f(1,1), cholesky_mo_num) do iblock = 1, nV, block_size - call dgemm('T', 'N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, & - -1.d0, tmp_cc(1,1,iblock), cholesky_mo_num, & - cc_space_v_vv_chol(1,1,gam), cholesky_mo_num, & - 0.d0, tmpB1, nV*block_size) + call gpu_dgemm(blas, 'T', 'N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, & + -1.d0, tmp_cc%f(1,1,iblock), cholesky_mo_num, & + d_cc_space_v_vv_chol%f(1,1,gam), cholesky_mo_num, & + 0.d0, tmpB1%f(1,1,1), nV*block_size) - call dgemm('T','N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, & - 1.d0, cc_space_v_vv_chol(1,1,iblock), cholesky_mo_num, & - tmp_cc2, cholesky_mo_num, & - 1.d0, tmpB1, nV*block_size) + call gpu_dgemm(blas, 'T','N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, & + 1.d0, d_cc_space_v_vv_chol%f(1,1,iblock), cholesky_mo_num, & + tmp_cc2%f(1,1), cholesky_mo_num, & + 1.d0, tmpB1%f(1,1,1), nV*block_size) do beta = iblock, min(nV, iblock+block_size-1) - do b = 1, nV - do a = 1, nV - B1(a,b,beta-iblock+1) = 
tmpB1(a,beta-iblock+1,b) - enddo - enddo + call gpu_dgeam(blas, 'N', 'N', nV, nV, 1.d0, tmpB1%f(1,beta-iblock+1,1), & + nV*block_size, 0.d0, B1%f(1,1,beta-iblock+1), nV, B1%f(1,1,beta-iblock+1), nV) enddo - call dgemm('N','N',nO*nO,min(block_size, nV-iblock+1),nV*nV, & - 1d0, tau, size(tau,1) * size(tau,2), & - B1 , size(B1 ,1) * size(B1 ,2), & - 1d0, r2(1,1,iblock,gam), size(r2 ,1) * size(r2 ,2)) + call gpu_dgemm(blas, 'N','N',nO*nO,min(block_size, nV-iblock+1),nV*nV, & + 1d0, tau%f(1,1,1,1), size(tau%f,1) * size(tau%f,2), & + B1%f(1,1,1) , size(B1%f ,1) * size(B1%f ,2), & + 1d0, r2%f(1,1,iblock,gam), size(r2%f ,1) * size(r2%f ,2)) enddo enddo !$OMP ENDDO - deallocate(B1, tmpB1, tmp_cc2) + call gpu_blas_destroy(blas) + + call gpu_deallocate(B1) + call gpu_deallocate(tmpB1) + call gpu_deallocate(tmp_cc2) !$OMP END PARALLEL - deallocate(tmp_cc) + call gpu_deallocate(tmp_cc) + + type(gpu_double4) :: X_oovv + call gpu_allocate(X_oovv,nO,nO,nV,nV) + call gpu_copy(t2,X_oovv) + + type(gpu_double2) :: g_occ, g_vir + call gpu_allocate(g_vir,nV,nV) + call gpu_allocate(g_occ,nO,nO) + call compute_g_vir_chol(nO,nV,t1,t2,H_vv,d_cc_space_f_vo, & + d_cc_space_v_ov_chol, d_cc_space_v_vv_chol, g_vir) + call compute_g_occ_chol(nO,nV,t1,t2,H_oo, & + d_cc_space_f_vo, d_cc_space_v_ov_chol, d_cc_space_v_oo_chol, d_cc_space_v_ovoo, g_occ) + + type(gpu_double4) :: Y_oovv + call gpu_allocate(Y_oovv,nO,nO,nV,nV) + + call gpu_dgemm(blas_handle, 'N','N',nO*nO*nV,nV,nV, & + 1d0, X_oovv%f(1,1,1,1), size(X_oovv%f,1) * size(X_oovv%f,2) * size(X_oovv%f,3), & + g_vir%f(1,1), size(g_vir%f,1), & + 0d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1) * size(Y_oovv%f,2) * size(Y_oovv%f,3)) + + call gpu_dgemm(blas_handle, 'N','N',nO,nO*nV*nV,nO, & + -1d0, g_occ%f(1,1), size(g_occ%f,1), & + t2%f(1,1,1,1) , size(t2%f,1), & + 1d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1)) + + call gpu_dgemm(blas_handle, 'N','N',nO*nO*nV,nV,nO, & + -1d0, d_cc_space_v_oovo%f(1,1,1,1), size(cc_space_v_oovo,1) * size(cc_space_v_oovo,2) * size(cc_space_v_oovo,3), & + t1%f(1,1) , size(t1%f,1), & + 1d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1) * size(Y_oovv%f,2) * size(Y_oovv%f,3)) - double precision, allocatable :: X_oovv(:,:,:,:) - allocate(X_oovv(nO,nO,nV,nV)) - !$omp parallel & - !$omp shared(nO,nV,t2,X_oovv) & - !$omp private(u,v,gam,a) & - !$omp default(none) - !$omp do - do a = 1, nV + call gpu_dgeam(blas_handle, 'N', 'N', nO*nO, nV*nV, 1.d0, Y_oovv%f(1,1,1,1), & + nO*nO, 1.d0, r2%f(1,1,1,1), nO*nO, r2%f(1,1,1,1), nO*nO) + + call gpu_synchronize() + call gpu_deallocate(X_oovv) + + call gpu_deallocate(g_vir) + call gpu_deallocate(g_occ) + + type(gpu_double4) :: X_vovo, Y_oovo + call gpu_allocate(X_vovo,nV,nO,nV,nO) + + do a=1,nV + call gpu_stream_create(stream(a)) + enddo + + do gam = 1, nV + call gpu_set_stream(blas_handle, stream(gam)) + do beta = 1, nV + call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, 1.d0, r2%f(1,1,beta,gam), & + nO, 1.d0, Y_oovv%f(1,1,gam,beta), nO, r2%f(1,1,beta,gam), nO) + enddo + enddo + + do i = 1, nO do gam = 1, nV - do v = 1, nO - do u = 1, nO - X_oovv(u,v,gam,a) = t2(u,v,gam,a) - enddo - enddo + call gpu_set_stream(blas_handle, stream(gam)) + call gpu_dgeam(blas_handle, 'T', 'N', nV, nO, 1.d0, d_cc_space_v_ovvo%f(1,1,gam,i), & + nO, 0.d0, X_vovo%f(1,1,gam,i), nV, X_vovo%f(1,1,gam,i), nV) enddo enddo - !$omp end do - !$omp end parallel - double precision, allocatable :: g_vir(:,:) - allocate(g_vir(nV,nV)) - call compute_g_vir_chol(nO,nV,t1,t2,H_vv,g_vir) - - double precision, allocatable :: Y_oovv(:,:,:,:) - allocate(Y_oovv(nO,nO,nV,nV)) - - 
call dgemm('N','N',nO*nO*nV,nV,nV, & - 1d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3), & - g_vir, size(g_vir,1), & - 0d0, Y_oovv, size(Y_oovv,1) * size(Y_oovv,2) * size(Y_oovv,3)) - deallocate(g_vir) - deallocate(X_oovv) - - !$omp parallel & - !$omp shared(nO,nV,r2,Y_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) + Y_oovv(u,v,beta,gam) + Y_oovv(v,u,gam,beta) - enddo - enddo - enddo + do a=1,nV + call gpu_stream_destroy(stream(a)) enddo - !$omp end do - !$omp end parallel - deallocate(Y_oovv) + call gpu_set_stream(blas_handle, gpu_default_stream) - double precision, allocatable :: g_occ(:,:) - allocate(g_occ(nO,nO)) - call compute_g_occ_chol(nO,nV,t1,t2,H_oo,g_occ) - allocate(X_oovv(nO,nO,nV,nV)) - call dgemm('N','N',nO,nO*nV*nV,nO, & - 1d0, g_occ , size(g_occ,1), & - t2 , size(t2,1), & - 0d0, X_oovv, size(X_oovv,1)) - deallocate(g_occ) - !$omp parallel & - !$omp shared(nO,nV,r2,X_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(u,v,beta,gam) - X_oovv(v,u,gam,beta) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - deallocate(X_oovv) - - double precision, allocatable :: X_vovv(:,:,:,:) - - allocate(X_vovv(nV,nO,nV,block_size)) - allocate(Y_oovv(nO,nO,nV,nV)) + call gpu_allocate(Y_oovo,nO,nO,nV,nO) + !$OMP PARALLEL PRIVATE(blas, iblock, gam, X_vovv) + call gpu_blas_create(blas) + type(gpu_double4) :: X_vovv + call gpu_allocate(X_vovv,nV,nO,nV,block_size) + !$OMP DO do iblock = 1, nV, block_size do gam = iblock, min(nV, iblock+block_size-1) - call dgemm('T','N',nV, nO*nV, cholesky_mo_num, 1.d0, & - cc_space_v_vv_chol(1,1,gam), cholesky_mo_num, cc_space_v_ov_chol, & - cholesky_mo_num, 0.d0, X_vovv(1,1,1,gam-iblock+1), nV) + call gpu_dgemm(blas, 'T','N',nV, nO*nV, cholesky_mo_num, 1.d0, & + d_cc_space_v_vv_chol%f(1,1,gam), cholesky_mo_num, d_cc_space_v_ov_chol%f(1,1,1), & + cholesky_mo_num, 0.d0, X_vovv%f(1,1,1,gam-iblock+1), nV) enddo - call dgemm('N','N',nO,nO*nV*min(block_size, nV-iblock+1),nV, & - 1d0, t1 , size(t1,1), & - X_vovv, size(X_vovv,1), & - 0d0, Y_oovv(1,1,1,iblock), size(Y_oovv,1)) + call gpu_dgemm(blas, 'N','N', nO, & + nO*nV*min(block_size, nV-iblock+1),nV, & + 1.d0, t1%f(1,1) , size(t1%f,1), & + X_vovv%f(1,1,1,1), size(X_vovv%f,1), & + 0d0, Y_oovv%f(1,1,1,iblock), size(Y_oovv%f,1)) + enddo + !$OMP END DO + + call gpu_blas_destroy(blas) + call gpu_deallocate(X_vovv) + !$OMP END PARALLEL + + call gpu_dgemm(blas_handle, 'N','N',nO,nO*nV*nO,nV, & + 1d0, t1%f(1,1), size(t1%f,1), & + X_vovo%f(1,1,1,1), size(X_vovo%f,1), & + 0d0, Y_oovo%f(1,1,1,1), size(Y_oovo%f,1)) + + call gpu_dgemm(blas_handle, 'N','N',nO*nO*nV, nV, nO, & + -1d0, Y_oovo%f(1,1,1,1), size(Y_oovo%f,1) * size(Y_oovo%f,2) * size(Y_oovo%f,3), & + t1%f(1,1) , size(t1%f,1), & + 1d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1) * size(Y_oovv%f,2) * size(Y_oovv%f,3)) + + call gpu_synchronize() + call gpu_deallocate(X_vovo) + call gpu_deallocate(Y_oovo) + + do a=1,nV + call gpu_stream_create(stream(a)) enddo - deallocate(X_vovv) - !$omp parallel & - !$omp shared(nO,nV,r2,Y_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do do gam = 1, nV + call gpu_set_stream(blas_handle, stream(gam)) do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) + Y_oovv(v,u,beta,gam) + Y_oovv(u,v,gam,beta) - 
enddo - enddo + call gpu_dgeam(blas_handle, 'T', 'N', nO, nO, 1.d0, Y_oovv%f(1,1,beta,gam), & + nO, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO) + enddo + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, r2%f(1,j,1,gam), & + nO*nO, 1.d0, Y_oovv%f(1,j,gam,1), nO*nO*nV, r2%f(1,j,1,gam), nO*nO) enddo enddo - !$omp end do - !$omp end parallel - deallocate(Y_oovv) - double precision, allocatable :: X_ovvo(:,:,:,:) - double precision, allocatable :: tcc(:,:,:), tcc2(:,:,:) - allocate(tcc2(cholesky_mo_num,nV,nO), X_ovvo(nO,nV,nV,nO)) - allocate(tcc(cholesky_mo_num,nO,nV)) + call gpu_set_stream(blas_handle, gpu_default_stream) - call dgemm('N','T', cholesky_mo_num*nV, nO, nV, 1.d0, & - cc_space_v_vv_chol, cholesky_mo_num*nV, t1, nO, & - 0.d0, tcc2, cholesky_mo_num*nV) - call dgemm('N','N', cholesky_mo_num*nO, nV, nO, 1.d0, & - cc_space_v_oo_chol, cholesky_mo_num*nO, t1, nO, & - 0.d0, tcc, cholesky_mo_num*nO) + call gpu_synchronize() + call gpu_deallocate(Y_oovv) - call dgemm('T','N', nO*nV, nV*nO, cholesky_mo_num, 1.d0, & - tcc, cholesky_mo_num, tcc2, cholesky_mo_num, 0.d0, & - X_ovvo, nO*nV) + type(gpu_double4) :: X_ovvo + type(gpu_double3) :: tcc, tcc2 + call gpu_allocate(tcc2,cholesky_mo_num,nV,nO) + call gpu_allocate(X_ovvo,nO,nV,nV,nO) + call gpu_allocate(tcc,cholesky_mo_num,nO,nV) + + call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nV, nO, nV, 1.d0, & + d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num*nV, t1%f(1,1), nO, & + 0.d0, tcc2%f(1,1,1), cholesky_mo_num*nV) + + call gpu_dgemm(blas_handle, 'N','N', cholesky_mo_num*nO, nV, nO, 1.d0, & + d_cc_space_v_oo_chol%f(1,1,1), cholesky_mo_num*nO, t1%f(1,1), nO, & + 0.d0, tcc%f(1,1,1), cholesky_mo_num*nO) + + call gpu_dgemm(blas_handle, 'T','N', nO*nV, nV*nO, cholesky_mo_num, 1.d0, & + tcc%f(1,1,1), cholesky_mo_num, tcc2%f(1,1,1), cholesky_mo_num, 0.d0, & + X_ovvo%f(1,1,1,1), nO*nV) + + call gpu_synchronize() - deallocate(tcc, tcc2) - !$omp parallel & - !$omp shared(nO,nV,r2,X_ovvo) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do do gam = 1, nV + call gpu_set_stream(blas_handle, stream(gam)) + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, -1.d0, X_ovvo%f(1,1,gam,j), & + nO, 1.d0, r2%f(1,j,1,gam), nO*nO, r2%f(1,j,1,gam), nO*nO) + enddo do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_ovvo(u,beta,gam,v) - enddo - enddo + call gpu_dgeam(blas_handle, 'T', 'N', nO, nO, -1.d0, X_ovvo%f(1,gam,beta,1), & + nO*nV*nV, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO) enddo enddo - !$omp end do - !$omp do - do beta = 1, nV - do gam = 1, nV - do v = 1, nO - do u = 1, nO - r2(v,u,gam,beta) = r2(v,u,gam,beta) - X_ovvo(u,beta,gam,v) - enddo - enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + + call gpu_synchronize + call gpu_deallocate(tcc) + call gpu_deallocate(tcc2) + call gpu_deallocate(X_ovvo) + + + type(gpu_double4) :: J1, K1 + type(gpu_double4) :: Y_voov, Z_ovov + + + call gpu_allocate(J1,nO,nV,nV,nO) + call compute_J1_chol(nO,nV,t1,t2,d_cc_space_v_ovvo,d_cc_space_v_ovoo, & + d_cc_space_v_vvoo,d_cc_space_v_vo_chol,d_cc_space_v_vv_chol,J1) + + call gpu_allocate(K1,nO,nV,nO,nV) + call compute_K1_chol(nO,nV,t1,t2,d_cc_space_v_ovoo,d_cc_space_v_vvoo, & + d_cc_space_v_ovov,d_cc_space_v_ov_chol,d_cc_space_v_vv_chol,K1) + + + call gpu_allocate(X_ovvo,nO,nV,nV,nO) + call gpu_allocate(Y_voov,nV,nO,nO,nV) + + do a=1, nV + call gpu_set_stream(blas_handle, stream(a)) + do i=1, nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,a,1,i), & + 
nO*nV, -0.5d0, K1%f(1,a,i,1), nO*nV*nO, X_ovvo%f(1,1,a,i), nO) + call gpu_dgeam(blas_handle, 'T', 'T', nV, nO, 2.d0, t2%f(1,i,1,a), & + nO*nO, -1.d0, t2%f(1,i,a,1), nO*nO*nV, Y_voov%f(1,1,i,a), nV) enddo enddo - !$omp end do - !$omp end parallel - deallocate(X_ovvo) - !----- + call gpu_allocate(Z_ovov,nO,nV,nO,nV) - allocate(X_oovv(nO,nO,nV,nV)) + call gpu_synchronize() + call gpu_deallocate(J1) + call gpu_set_stream(blas_handle, gpu_default_stream) - call dgemm('N','N',nO*nO*nV,nV,nO, & - 1d0, cc_space_v_oovo, size(cc_space_v_oovo,1) * size(cc_space_v_oovo,2) * size(cc_space_v_oovo,3), & - t1 , size(t1,1), & - 0d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3)) - !$omp parallel & - !$omp shared(nO,nV,r2,X_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(u,v,beta,gam) - X_oovv(v,u,gam,beta) - enddo - enddo + call gpu_dgemm(blas_handle, 'N','N', nO*nV,nO*nV,nV*nO, & + 1d0, X_ovvo%f(1,1,1,1), size(X_ovvo%f,1) * size(X_ovvo%f,2), & + Y_voov%f(1,1,1,1), size(Y_voov%f,1) * size(Y_voov%f,2), & + 0d0, Z_ovov%f(1,1,1,1), size(Z_ovov%f,1) * size(Z_ovov%f,2)) + + call gpu_synchronize() + call gpu_deallocate(Y_voov) + call gpu_deallocate(X_ovvo) + + type(gpu_double4) :: Y_ovov, X_ovov + call gpu_allocate(X_ovov,nO,nV,nO,nV) + call gpu_allocate(Y_ovov,nO,nV,nO,nV) + + do a=1, nV + call gpu_set_stream(blas_handle, stream(a)) + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, t2%f(1,j,1,a), & + nO*nO, 0.d0, t2%f(1,j,1,a), nO*nO, Y_ovov%f(1,a,j,1), nO*nV*nO) + enddo + do beta=1, nV + call gpu_dgeam(blas_handle, 'T', 'T', nO, nO, 0.5d0, K1%f(1,a,1,beta), & + nO*nV, 0.d0, K1%f(1,a,1,beta), nO*nV, X_ovov%f(1,a,1,beta), nO*nV) enddo enddo - !$omp end do - !$omp end parallel - deallocate(X_oovv) + call gpu_set_stream(blas_handle, gpu_default_stream) - double precision, allocatable :: X_vovo(:,:,:,:), Y_oovo(:,:,:,:) - allocate(X_vovo(nV,nO,nV,nO)) + call gpu_synchronize() - !$omp parallel & - !$omp shared(nO,nV,X_vovo,cc_space_v_ovvo) & - !$omp private(a,v,gam,i) & - !$omp default(none) - do i = 1, nO - !$omp do - do gam = 1, nV - do v = 1, nO - do a = 1, nV - X_vovo(a,v,gam,i) = cc_space_v_ovvo(v,a,gam,i) - enddo - enddo + call gpu_dgemm(blas_handle, 'T','N',nO*nV,nO*nV,nO*nV, & + -1d0, X_ovov%f(1,1,1,1), size(X_ovov%f,1) * size(X_ovov%f,2), & + Y_ovov%f(1,1,1,1), size(Y_ovov%f,1) * size(Y_ovov%f,2), & + 1d0, Z_ovov%f(1,1,1,1), size(Z_ovov%f,1) * size(Z_ovov%f,2)) + + call gpu_synchronize() + + do gam=1, nV + call gpu_set_stream(blas_handle, stream(gam)) + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, r2%f(1,j,1,gam), & + nO*nO, 1.d0, Z_ovov%f(1,1,j,gam), nO, r2%f(1,j,1,gam), nO*nO) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, K1%f(1,1,j,gam), & + nO, 0.d0, K1%f(1,1,j,gam), nO, X_ovov%f(1,gam,j,1), nO*nV*nO) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, 1.d0, t2%f(1,j,1,gam), & + nO*nO, 0.d0, t2%f(1,j,1,gam), nO*nO, Y_ovov%f(1,gam,j,1), nO*nV*nO) enddo - !$omp end do nowait - enddo - !$omp end parallel - - allocate(Y_oovo(nO,nO,nV,nO)) - call dgemm('N','N',nO,nO*nV*nO,nV, & - 1d0, t1, size(t1,1), & - X_vovo, size(X_vovo,1), & - 0d0, Y_oovo, size(Y_oovo,1)) - - deallocate(X_vovo) - allocate(X_oovv(nO,nO,nV,nV)) - call dgemm('N','N',nO*nO*nV, nV, nO, & - 1d0, Y_oovo, size(Y_oovo,1) * size(Y_oovo,2) * size(Y_oovo,3), & - t1 , size(t1,1), & - 0d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3)) - deallocate(Y_oovo) 
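! --- [editor's note: illustrative aside, not part of the patch] -------------
! Throughout this routine, transposition loop nests such as
!   r2(u,v,beta,gam) = r2(u,v,beta,gam) + Y_oovv(v,u,beta,gam)
! are mapped onto gpu_dgeam calls computing C = alpha*op(A) + beta*op(B):
! the leading dimensions stride over the non-contiguous axes of the 4-index
! arrays, and beta = 0.d0 degenerates the call into a pure strided copy or
! transpose. A plain-Fortran reference of these semantics (not the library
! routine), useful for checking the lda/ldb choices above and below:
subroutine dgeam_ref(transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc)
  implicit none
  character, intent(in)         :: transa, transb
  integer, intent(in)           :: m, n, lda, ldb, ldc
  double precision, intent(in)  :: alpha, beta
  double precision, intent(in)  :: a(lda,*), b(ldb,*)
  double precision, intent(out) :: c(ldc,*)
  integer          :: i, j
  double precision :: aij, bij

  do j = 1, n
    do i = 1, m
      if (transa == 'N') then
        aij = a(i,j)   ! op(A) = A
      else
        aij = a(j,i)   ! op(A) = A**T
      endif
      if (transb == 'N') then
        bij = b(i,j)
      else
        bij = b(j,i)
      endif
      c(i,j) = alpha*aij + beta*bij
    enddo
  enddo
end subroutine
! ----------------------------------------------------------------------------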
- - !$omp parallel & - !$omp shared(nO,nV,r2,X_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(u,v,gam,beta) - X_oovv(v,u,beta,gam) - enddo - enddo + do beta=1, nV + call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, 1.d0, r2%f(1,1,beta,gam), & + nO, 1.d0, Z_ovov%f(1,gam,1,beta), nO*nV, r2%f(1,1,beta,gam), nO) enddo enddo - !$omp end do - !$omp end parallel - deallocate(X_oovv) + call gpu_set_stream(blas_handle, gpu_default_stream) - double precision, allocatable :: J1(:,:,:,:) - allocate(J1(nO,nV,nV,nO)) - call compute_J1_chol(nO,nV,t1,t2,cc_space_v_ovvo,cc_space_v_ovoo, & - cc_space_v_vvoo,J1) + call gpu_deallocate(K1) - double precision, allocatable :: K1(:,:,:,:) - allocate(K1(nO,nV,nO,nV)) - call compute_K1_chol(nO,nV,t1,t2,cc_space_v_ovoo,cc_space_v_vvoo, & - cc_space_v_ovov,K1) + call gpu_dgemm(blas_handle, 'N','N',nO*nV,nO*nV,nO*nV, & + 1d0, X_ovov%f(1,1,1,1), size(X_ovov%f,1) * size(X_ovov%f,2), & + Y_ovov%f(1,1,1,1), size(Y_ovov%f,1) * size(Y_ovov%f,2), & + 0d0, Z_ovov%f(1,1,1,1), size(Z_ovov%f,1) * size(Z_ovov%f,2)) - allocate(X_ovvo(nO,nV,nV,nO)) - !$omp parallel & - !$omp private(u,v,gam,beta,i,a) & - !$omp default(shared) - do i = 1, nO - !$omp do - do a = 1, nV - do beta = 1, nV - do u = 1, nO - X_ovvo(u,beta,a,i) = (J1(u,a,beta,i) - 0.5d0 * K1(u,a,i,beta)) - enddo - enddo - enddo - !$omp end do nowait - enddo - !$omp end parallel - deallocate(J1) + call gpu_synchronize() - double precision, allocatable :: Y_voov(:,:,:,:) - allocate(Y_voov(nV,nO,nO,nV)) - - !$omp parallel & - !$omp private(u,v,gam,beta,i,a) & - !$omp default(shared) - !$omp do - do gam = 1, nV - do v = 1, nO - do i = 1, nO - do a = 1, nV - Y_voov(a,i,v,gam) = 2d0 * t2(i,v,a,gam) - t2(i,v,gam,a) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - double precision, allocatable :: Z_ovov(:,:,:,:) - allocate(Z_ovov(nO,nV,nO,nV)) - - call dgemm('N','N', nO*nV,nO*nV,nV*nO, & - 1d0, X_ovvo, size(X_ovvo,1) * size(X_ovvo,2), & - Y_voov, size(Y_voov,1) * size(Y_voov,2), & - 0d0, Z_ovov, size(Z_ovov,1) * size(Z_ovov,2)) - - deallocate(X_ovvo,Y_voov) - - !$omp parallel & - !$omp shared(nO,nV,r2,Z_ovov) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) + Z_ovov(u,beta,v,gam) + Z_ovov(v,gam,u,beta) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - deallocate(Z_ovov) - - double precision, allocatable :: Y_ovov(:,:,:,:), X_ovov(:,:,:,:) - allocate(X_ovov(nO,nV,nO,nV)) - allocate(Y_ovov(nO,nV,nO,nV)) - - !$omp parallel & - !$omp shared(nO,nV,r2,K1,X_ovov,Y_ovov,t2) & - !$omp private(u,a,i,beta,gam) & - !$omp default(none) - !$omp do - do beta = 1, nV - do u = 1, nO - do a = 1, nV - do i = 1, nO - X_ovov(i,a,u,beta) = 0.5d0 * K1(u,a,i,beta) - enddo - enddo - enddo - enddo - !$omp end do nowait - - !$omp do - do gam = 1, nV - do v = 1, nO - do a = 1, nV - do i = 1, nO - Y_ovov(i,a,v,gam) = t2(i,v,gam,a) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - allocate(Z_ovov(nO,nV,nO,nV)) - call dgemm('T','N',nO*nV,nO*nV,nO*nV, & - 1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), & - Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), & - 0d0, Z_ovov, size(Y_ovov,1) * size(Y_ovov,2)) - deallocate(X_ovov, Y_ovov) - - !$omp parallel & - !$omp shared(nO,nV,r2,Z_ovov) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do 
gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - Z_ovov(u,beta,v,gam) - Z_ovov(v,gam,u,beta) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - deallocate(Z_ovov) - - allocate(X_ovov(nO,nV,nO,nV),Y_ovov(nO,nV,nO,nV)) - !$omp parallel & - !$omp shared(nO,nV,K1,X_ovov,Y_ovov,t2) & - !$omp private(u,v,gam,beta,i,a) & - !$omp default(none) - !$omp do - do a = 1, nV - do i = 1, nO - do gam = 1, nV - do u = 1, nO - X_ovov(u,gam,i,a) = K1(u,a,i,gam) - enddo - enddo - enddo - enddo - !$omp end do nowait - - !$omp do - do beta = 1, nV - do v = 1, nO - do a = 1, nV - do i = 1, nO - Y_ovov(i,a,v,beta) = t2(i,v,beta,a) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - deallocate(K1) - - allocate(Z_ovov(nO,nV,nO,nV)) - call dgemm('N','N',nO*nV,nO*nV,nO*nV, & - 1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), & - Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), & - 0d0, Z_ovov, size(Y_ovov,1) * size(Y_ovov,2)) - - deallocate(X_ovov,Y_ovov) - - !$omp parallel & - !$omp shared(nO,nV,r2,Z_ovov) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - Z_ovov(u,gam,v,beta) - Z_ovov(v,beta,u,gam) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - deallocate(Z_ovov) + call gpu_deallocate(X_ovov) + call gpu_deallocate(Y_ovov) ! Change the sign for consistency with the code in spin orbitals + do gam = 1, nV + call gpu_set_stream(blas_handle, stream(gam)) + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, r2%f(1,j,1,gam), & + nO*nO, -1.d0, Z_ovov%f(1,gam,j,1), nO*nV*nO, r2%f(1,j,1,gam), nO*nO) + enddo + do beta = 1, nV + call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, -1.d0, r2%f(1,1,beta,gam), & + nO, 1.d0, Z_ovov%f(1,beta,1,gam), nO*nV, r2%f(1,1,beta,gam), nO) + enddo + enddo + + call gpu_deallocate(Z_ovov) max_r2 = 0d0 !$omp parallel & @@ -996,8 +796,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do a = 1, nV do j = 1, nO do i = 1, nO - r2(i,j,a,b) = -r2(i,j,a,b) - max_r2_local = max(r2(i,j,a,b), max_r2_local) + max_r2_local = max(r2%f(i,j,a,b), max_r2_local) enddo enddo enddo @@ -1012,447 +811,458 @@ end ! A1 -subroutine compute_A1_chol(nO,nV,t1,t2,tau,A1) - +subroutine compute_A1_chol(nO,nV,t1,t2,tau,d_cc_space_v_vooo, & + d_cc_space_v_oooo, d_cc_space_v_vvoo, A1) + use gpu implicit none - integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(in) :: tau(nO, nO, nV, nV) - double precision, intent(out) :: A1(nO, nO, nO, nO) + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: t1 + type(gpu_double4), intent(in) :: t2, tau + type(gpu_double4), intent(in) :: d_cc_space_v_vooo, d_cc_space_v_oooo, d_cc_space_v_vvoo + type(gpu_double4), intent(out) :: A1 integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta - double precision, allocatable :: Y_oooo(:,:,:,:) - allocate(Y_oooo(nO,nO,nO,nO)) + type(gpu_double4) :: Y_oooo + call gpu_allocate(Y_oooo,nO,nO,nO,nO) ! A1(u,v,i,j) = cc_space_v_oooo(u,v,i,j) ! 
A1(u,v,i,j) += cc_space_v_ovoo(u,a,i,j) * t1(v,a) & - call dgemm('N','N', nO, nO*nO*nO, nV, & - 1d0, t1 , size(t1,1), & - cc_space_v_vooo, size(cc_space_v_vooo,1), & - 0d0, Y_oooo, size(Y_oooo,1)) + call gpu_dgemm(blas_handle, 'N','N', nO, nO*nO*nO, nV, & + 1d0, t1%f(1,1) , size(t1%f,1), & + d_cc_space_v_vooo%f(1,1,1,1), size(d_cc_space_v_vooo%f,1), & + 0d0, Y_oooo%f(1,1,1,1), size(Y_oooo%f,1)) - !$omp parallel & - !$omp private(u,v,i,j) & - !$omp default(shared) - !$omp do collapse(2) - do j = 1, nO - do i = 1, nO - do v = 1, nO - do u = 1, nO - A1(u,v,i,j) = cc_space_v_oooo(u,v,i,j) + Y_oooo(v,u,j,i) + Y_oooo(u,v,i,j) - enddo - enddo - enddo + type(gpu_stream) :: stream(nO) + + do i=1, nO + call gpu_stream_create(stream(i)) enddo - !$omp end do - !$omp end parallel - deallocate(Y_oooo) + call gpu_synchronize() + + do j = 1, nO + call gpu_set_stream(blas_handle, stream(j)) + do i = 1, nO + call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, 1.d0, d_cc_space_v_oooo%f(1,1,i,j), & + nO, 1.d0, Y_oooo%f(1,1,j,i), nO, A1%f(1,1,i,j), nO) + enddo + call gpu_dgeam(blas_handle, 'N', 'N', nO, nO*nO, 1.d0, A1%f(1,1,1,j), & + nO, 1.d0, Y_oooo%f(1,1,1,j), nO, A1%f(1,1,1,j), nO) + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + do i=1, nO + call gpu_stream_destroy(stream(i)) + enddo + + call gpu_deallocate(Y_oooo) ! A1(u,v,i,j) += cc_space_v_vvoo(a,b,i,j) * tau(u,v,a,b) - call dgemm('N','N', nO*nO, nO*nO, nV*nV, & - 1d0, tau , size(tau,1) * size(tau,2), & - cc_space_v_vvoo, size(cc_space_v_vvoo,1) * size(cc_space_v_vvoo,2), & - 1d0, A1 , size(A1,1) * size(A1,2)) + call gpu_dgemm(blas_handle, 'N','N', nO*nO, nO*nO, nV*nV, & + 1d0, tau%f(1,1,1,1), size(tau%f,1) * size(tau%f,2), & + d_cc_space_v_vvoo%f(1,1,1,1), size(d_cc_space_v_vvoo%f,1) * size(d_cc_space_v_vvoo%f,2), & + 1d0, A1%f(1,1,1,1), size(A1%f,1) * size(A1%f,2)) + call gpu_synchronize() end ! 
g_occ -subroutine compute_g_occ_chol(nO,nV,t1,t2,H_oo,g_occ) +subroutine compute_g_occ_chol(nO,nV,t1,t2,H_oo, & + d_cc_space_f_vo, d_cc_space_v_ov_chol, d_cc_space_v_oo_chol, d_cc_space_v_ovoo, g_occ) + use gpu implicit none integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV), H_oo(nO, nO) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(out) :: g_occ(nO, nO) + type(gpu_double2), intent(in) :: t1, H_oo, d_cc_space_f_vo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_oo_chol + type(gpu_double4), intent(in) :: t2, d_cc_space_v_ovoo + type(gpu_double2), intent(out) :: g_occ - g_occ = H_oo + call gpu_copy(H_oo, g_occ) - call dgemm('N','N',nO,nO,nV, & - 1d0, t1, size(t1,1), & - cc_space_f_vo, size(cc_space_f_vo,1), & - 1d0, g_occ, size(g_occ,1)) + call gpu_dgemm(blas_handle, 'N','N',nO,nO,nV, & + 1d0, t1%f(1,1), size(t1%f,1), & + d_cc_space_f_vo%f(1,1), size(d_cc_space_f_vo%f,1), & + 1d0, g_occ%f(1,1), size(g_occ%f,1)) - double precision, allocatable :: X(:) - allocate(X(cholesky_mo_num)) - call dgemv('N',cholesky_mo_num,nO*nV,2.d0, & - cc_space_v_ov_chol, cholesky_mo_num, & - t1, 1, 0.d0, X, 1) + type(gpu_double1) :: X + call gpu_allocate(X,cholesky_mo_num) - call dgemv('T',cholesky_mo_num,nO*nO,1.d0, & - cc_space_v_oo_chol, cholesky_mo_num, & - X, 1, 1.d0, g_occ, 1) - deallocate(X) + call gpu_dgemv(blas_handle, 'N',cholesky_mo_num,nO*nV,2.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, & + t1%f(1,1), 1, 0.d0, X%f(1), 1) - call dgemv('T',nO*nV,nO*nO,-1.d0, & - cc_space_v_ovoo, nO*nV, & - t1, 1, 1.d0, g_occ, 1) + call gpu_dgemv(blas_handle, 'T',cholesky_mo_num,nO*nO,1.d0, & + d_cc_space_v_oo_chol%f(1,1,1), cholesky_mo_num, & + X%f(1), 1, 1.d0, g_occ%f(1,1), 1) + + call gpu_dgemv(blas_handle, 'T',nO*nV,nO*nO,-1.d0, & + d_cc_space_v_ovoo%f(1,1,1,1), nO*nV, & + t1%f(1,1), 1, 1.d0, g_occ%f(1,1), 1) + + call gpu_synchronize() + call gpu_deallocate(X) end ! 
g_vir -subroutine compute_g_vir_chol(nO,nV,t1,t2,H_vv,g_vir) +subroutine compute_g_vir_chol(nO,nV,t1,t2,H_vv,d_cc_space_f_vo, & + d_cc_space_v_ov_chol, d_cc_space_v_vv_chol, g_vir) + use gpu implicit none integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV), H_vv(nV, nV) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(out) :: g_vir(nV, nV) + type(gpu_double2), intent(in) :: t1, H_vv, d_cc_space_f_vo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol + type(gpu_double4), intent(in) :: t2 + type(gpu_double2), intent(out) :: g_vir integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam - call dgemm('N','N',nV,nV,nO, & - -1d0, cc_space_f_vo , size(cc_space_f_vo,1), & - t1 , size(t1,1), & - 0d0, g_vir, size(g_vir,1)) + type(gpu_stream) :: stream(max(nO,4)) - double precision, allocatable :: tmp_k(:), tmp_vo(:,:,:), tmp_vo2(:,:,:) - allocate(tmp_k(cholesky_mo_num)) - call dgemm('N','N', cholesky_mo_num, 1, nO*nV, 1.d0, & - cc_space_v_ov_chol, cholesky_mo_num, t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num) - - call dgemm('T','N', nV*nV, 1, cholesky_mo_num, 2.d0, & - cc_space_v_vv_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, & - g_vir, nV*nV) - deallocate(tmp_k) - - allocate(tmp_vo(cholesky_mo_num,nV,nO)) - call dgemm('N','T',cholesky_mo_num*nV, nO, nV, 1.d0, & - cc_space_v_vv_chol, cholesky_mo_num*nV, t1, nO, 0.d0, tmp_vo, cholesky_mo_num*nV) - - allocate(tmp_vo2(cholesky_mo_num,nO,nV)) - do beta=1,nV - do i=1,nO - do k=1,cholesky_mo_num - tmp_vo2(k,i,beta) = -tmp_vo(k,beta,i) - enddo - enddo - enddo - deallocate(tmp_vo) - - do beta = 1, nV - do a = 1, nV - g_vir(a,beta) = g_vir(a,beta) + H_vv(a,beta) - enddo + do i=1,max(nO,4) + call gpu_stream_create(stream(i)) enddo - call dgemm('T','N', nV, nV, nO*cholesky_mo_num, 1.d0, & - cc_space_v_ov_chol, cholesky_mo_num*nO, & - tmp_vo2, cholesky_mo_num*nO, 1.d0, g_vir, nV) + call gpu_set_stream(blas_handle, stream(1)) + call gpu_dgemm(blas_handle, 'N','N',nV,nV,nO, & + -1d0, d_cc_space_f_vo%f(1,1) , size(d_cc_space_f_vo%f,1), & + t1%f(1,1) , size(t1%f,1), & + 0d0, g_vir%f(1,1), size(g_vir%f,1)) + + type(gpu_double1) :: tmp_k + type(gpu_double3) :: tmp_vo, tmp_vo2 + + call gpu_allocate(tmp_k,cholesky_mo_num) + + call gpu_set_stream(blas_handle, stream(2)) + call gpu_dgemm(blas_handle, 'N','N', cholesky_mo_num, 1, nO*nV, 1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, t1%f(1,1), nO*nV, 0.d0, tmp_k%f(1), cholesky_mo_num) + + call gpu_dgemm(blas_handle, 'T','N', nV*nV, 1, cholesky_mo_num, 2.d0, & + d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num, tmp_k%f(1), cholesky_mo_num, 1.d0, & + g_vir%f(1,1), nV*nV) + + call gpu_set_stream(blas_handle, stream(3)) + call gpu_allocate(tmp_vo,cholesky_mo_num,nV,nO) + + call gpu_dgemm(blas_handle, 'N','T',cholesky_mo_num*nV, nO, nV, 1.d0, & + d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num*nV, t1%f(1,1), nO, 0.d0, tmp_vo%f(1,1,1), cholesky_mo_num*nV) + + call gpu_allocate(tmp_vo2,cholesky_mo_num,nO,nV) + + call gpu_synchronize() + call gpu_deallocate(tmp_k) + + do i=1,nO + call gpu_set_stream(blas_handle, stream(i)) + call gpu_dgeam(blas_handle, 'N', 'N', cholesky_mo_num, nV, -1.d0, tmp_vo%f(1,1,i), & + cholesky_mo_num, 0.d0, tmp_vo%f(1,1,i), cholesky_mo_num, tmp_vo2%f(1,i,1), cholesky_mo_num*nO) + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + + do i=1,max(nO,4) + call gpu_stream_destroy(stream(i)) + enddo + call gpu_deallocate(tmp_vo) + + call gpu_dgeam(blas_handle, 'N', 'N', nV, nV, 1.d0, g_vir%f(1,1), & + 
nV, 1.d0, H_vv%f(1,1), nV, g_vir%f(1,1), nV) + + call gpu_dgemm(blas_handle, 'T','N', nV, nV, nO*cholesky_mo_num, 1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, & + tmp_vo2%f(1,1,1), cholesky_mo_num*nO, 1.d0, g_vir%f(1,1), nV) + + call gpu_synchronize() + call gpu_deallocate(tmp_vo2) end ! J1 -subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvoo,J1) +subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvoo,d_cc_space_v_vo_chol,d_cc_space_v_vv_chol,J1) + use gpu implicit none - integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(in) :: v_ovvo(nO,nV,nV,nO), v_ovoo(nO,nV,nO,nO) - double precision, intent(in) :: v_vvoo(nV,nV,nO,nO) - double precision, intent(out) :: J1(nO, nV, nV, nO) + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: t1 + type(gpu_double4), intent(in) :: t2, v_ovvo, v_ovoo, v_vvoo + type(gpu_double3), intent(in) :: d_cc_space_v_vo_chol,d_cc_space_v_vv_chol + type(gpu_double4), intent(out) :: J1 integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam - double precision, allocatable :: X_ovoo(:,:,:,:), Y_ovov(:,:,:,:) - allocate(X_ovoo(nO,nV,nO,nO),Y_ovov(nO,nV,nO,nV)) + type(gpu_double4) :: X_ovoo, Y_ovov - !$omp parallel & - !$omp shared(nO,nV,J1,v_ovvo,v_ovoo,X_ovoo) & - !$omp private(i,j,a,u,beta) & - !$omp default(none) - do i = 1, nO - !$omp do - do beta = 1, nV - do a = 1, nV - do u = 1, nO - J1(u,a,beta,i) = v_ovvo(u,a,beta,i) - enddo - enddo - enddo - !$omp end do nowait - enddo + call gpu_allocate(X_ovoo,nO,nV,nO,nO) - !$omp do collapse(2) - do j = 1, nO - do i = 1, nO - do a = 1, nV - do u = 1, nO - X_ovoo(u,a,i,j) = v_ovoo(u,a,j,i) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel + type(gpu_stream) :: stream(nV) - call dgemm('N','N',nO*nV*nO,nV,nO, & - -1d0, X_ovoo, size(X_ovoo,1) * size(X_ovoo,2) * size(X_ovoo,3), & - t1 , size(t1,1), & - 0d0, Y_ovov, size(Y_ovov,1) * size(Y_ovov,2) * size(Y_ovov,3)) - - !$omp parallel & - !$omp shared(nO,nV,J1,Y_ovov) & - !$omp private(i,beta,a,u) & - !$omp default(none) - do i = 1, nO - !$omp do - do beta = 1, nV - do a = 1, nV - do u = 1, nO - J1(u,a,beta,i) = J1(u,a,beta,i) + Y_ovov(u,a,i,beta) - enddo - enddo - enddo - !$omp end do nowait - enddo - !$omp end parallel - deallocate(X_ovoo) - - double precision, allocatable :: tmp_cc(:,:,:), J1_tmp(:,:,:,:) - allocate(tmp_cc(cholesky_mo_num,nV,nO), J1_tmp(nV,nO,nV,nO)) - - call dgemm('N','T', cholesky_mo_num*nV, nO, nV, 1.d0, & - cc_space_v_vv_chol, cholesky_mo_num*nV, & - t1, nO, & - 0.d0, tmp_cc, cholesky_mo_num*nV) - - call dgemm('T','N', nV*nO, nV*nO, cholesky_mo_num, 1.d0, & - tmp_cc, cholesky_mo_num, cc_space_v_vo_chol, cholesky_mo_num, & - 0.d0, J1_tmp, nV*nO) - - deallocate(tmp_cc) do i=1,nO - do b=1,nV - do a=1,nV - do u=1,nO - J1(u,a,b,i) = J1(u,a,b,i) + J1_tmp(b,u,a,i) - enddo - enddo - enddo - enddo - - deallocate(J1_tmp) - - !- cc_space_v_vvoo(a,b,i,j) * (0.5d0 * t2(u,j,b,beta) + t1(u,b) * t1(j,beta)) & - double precision, allocatable :: X_voov(:,:,:,:), Z_ovvo(:,:,:,:) - allocate(X_voov(nV,nO,nO,nV), Z_ovvo(nO,nV,nV,nO)) - !$omp parallel & - !$omp shared(nO,nV,t2,t1,Y_ovov,X_voov,v_vvoo) & - !$omp private(i,beta,a,u,b,j) & - !$omp default(none) - !$omp do - do b = 1, nV - do j = 1, nO - do beta = 1, nV - do u = 1, nO - Y_ovov(u,beta,j,b) = 0.5d0 * t2(u,j,b,beta) + t1(u,b) * t1(j,beta) - enddo - enddo - enddo - enddo - !$omp end do nowait - - !$omp do - do b = 1, nV - do j = 1, nO - do i = 1, nO - do a = 
1, nV - X_voov(a,i,j,b) = v_vvoo(a,b,i,j) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - call dgemm('N','T',nO*nV,nV*nO,nO*nV, & - -1d0, Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), & - X_voov, size(X_voov,1) * size(X_voov,2), & - 0d0, Z_ovvo, size(Z_ovvo,1) * size(Z_ovvo,2)) - deallocate(X_voov) - - double precision, allocatable :: X_ovvo(:,:,:,:), Y_vovo(:,:,:,:) - allocate(X_ovvo(nO,nV,nV,nO),Y_vovo(nV,nO,nV,nO)) - !$omp parallel & - !$omp shared(nO,nV,J1,Z_ovvo,t2,Y_vovo,v_vvoo,X_ovvo) & - !$omp private(i,beta,a,u,j,b) & - !$omp default(none) - do i = 1, nO - !$omp do - do beta = 1, nV - do a = 1, nV - do u = 1, nO - J1(u,a,beta,i) = J1(u,a,beta,i) + Z_ovvo(u,beta,a,i) - enddo - enddo - enddo - !$omp end do nowait - enddo - - !+ 0.5d0 * (2d0 * cc_space_v_vvoo(a,b,i,j) - cc_space_v_vvoo(b,a,i,j)) * t2(u,j,beta,b) - do j = 1, nO - !$omp do - do b = 1, nV - do i = 1, nO - do a = 1, nV - Y_vovo(a,i,b,j) = 0.5d0 * (2d0 * v_vvoo(a,b,i,j) - v_vvoo(b,a,i,j)) - enddo - enddo - enddo - !$omp end do nowait + call gpu_stream_create(stream(i)) enddo do j = 1, nO - !$omp do - do b = 1, nV - do beta = 1, nV - do u = 1, nO - X_ovvo(u,beta,b,j) = t2(u,j,beta,b) - enddo - enddo + call gpu_set_stream(blas_handle, stream(j)) + do i = 1, nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, v_ovoo%f(1,1,j,i), & + nO, 0.d0, X_ovoo%f(1,1,i,j), nO, X_ovoo%f(1,1,i,j), nO) enddo - !$omp end do nowait enddo - !$omp end parallel - call dgemm('N','T',nO*nV,nV*nO,nV*nO, & - 1d0, X_ovvo, size(X_ovvo,1) * size(X_ovvo,2), & - Y_vovo, size(Y_vovo,1) * size(Y_vovo,2), & - 0d0, Z_ovvo, size(Z_ovvo,1) * size(Z_ovvo,2)) + call gpu_set_stream(blas_handle, gpu_default_stream) + + do i=1,nO + call gpu_stream_destroy(stream(i)) + enddo + + call gpu_allocate(Y_ovov,nO,nV,nO,nV) + + call gpu_dgemm(blas_handle, 'N','N',nO*nV*nO,nV,nO, & + -1d0, X_ovoo%f(1,1,1,1), size(X_ovoo%f,1) * size(X_ovoo%f,2) * size(X_ovoo%f,3), & + t1%f(1,1) , size(t1%f,1), & + 0d0, Y_ovov%f(1,1,1,1), size(Y_ovov%f,1) * size(Y_ovov%f,2) * size(Y_ovov%f,3)) + + + call gpu_copy(v_ovvo, J1) + + call gpu_synchronize() + + do a=1,nV + call gpu_stream_create(stream(a)) + enddo - !$omp parallel & - !$omp shared(nO,nV,J1,Z_ovvo) & - !$omp private(i,beta,a,u) & - !$omp default(none) do i = 1, nO - !$omp do do beta = 1, nV - do a = 1, nV - do u = 1, nO - J1(u,a,beta,i) = J1(u,a,beta,i) + Z_ovvo(u,beta,a,i) - enddo - enddo + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,1,beta,i), & + nO, 1.d0, Y_ovov%f(1,1,i,beta), nO, J1%f(1,1,beta,i), nO) enddo - !$omp end do nowait enddo - !$omp end parallel - deallocate(X_ovvo,Z_ovvo,Y_ovov) + call gpu_allocate(tmp_cc,cholesky_mo_num,nV,nO) + call gpu_allocate(J1_tmp,nV,nO,nV,nO) + + call gpu_set_stream(blas_handle, gpu_default_stream) + + type(gpu_double4) :: J1_tmp + type(gpu_double3) :: tmp_cc + + call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nV, nO, nV, 1.d0, & + d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num*nV, & + t1%f(1,1), nO, & + 0.d0, tmp_cc%f(1,1,1), cholesky_mo_num*nV) + + call gpu_dgemm(blas_handle, 'T','N', nV*nO, nV*nO, cholesky_mo_num, 1.d0, & + tmp_cc%f(1,1,1), cholesky_mo_num, d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num, & + 0.d0, J1_tmp%f(1,1,1,1), nV*nO) + + + call gpu_deallocate(X_ovoo) + + call gpu_synchronize() + call gpu_deallocate(tmp_cc) + + do i = 1, nO + do a = 1, nV + call gpu_set_stream(blas_handle, stream(a)) + call gpu_dgeam(blas_handle, 'N', 'T', nO, nV, 1.d0, J1%f(1,a,1,i), & + nO*nV, 1.d0, 
J1_tmp%f(1,1,a,i), nV, J1%f(1,a,1,i), nO*nV) + enddo + enddo + + type(gpu_double4) :: X_voov, Z_ovvo + + call gpu_allocate(X_voov,nV,nO,nO,nV) + call gpu_allocate(Z_ovvo,nO,nV,nV,nO) + + do j = 1, nO + do beta = 1, nV + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 0.5d0, t2%f(1,j,1,beta), & + nO*nO, t1%f(j,beta), t1%f(1,1), nO, Y_ovov%f(1,beta,j,1), nO*nV*nO) + enddo + enddo + + do b = 1, nV + call gpu_set_stream(blas_handle, stream(b)) + call gpu_dgeam(blas_handle, 'N', 'N', nV, nO*nO, 1.d0, v_vvoo%f(1,b,1,1), & + nV*nV, 0.d0, X_voov%f(1,1,1,b), nV, X_voov%f(1,1,1,b), nV) + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + + call gpu_synchronize() + call gpu_deallocate(J1_tmp) + + call gpu_dgemm(blas_handle, 'N','T',nO*nV,nV*nO,nO*nV, & + -1d0, Y_ovov%f(1,1,1,1), size(Y_ovov%f,1) * size(Y_ovov%f,2), & + X_voov%f(1,1,1,1), size(X_voov%f,1) * size(X_voov%f,2), & + 0d0, Z_ovvo%f(1,1,1,1), size(Z_ovvo%f,1) * size(Z_ovvo%f,2)) + + call gpu_synchronize() + + do i = 1, nO + do a = 1, nV + call gpu_set_stream(blas_handle, stream(a)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,a,1,i), & + nO*nV, 1.d0, Z_ovvo%f(1,1,a,i), nO, J1%f(1,a,1,i), nO*nV) + enddo + enddo + + type(gpu_double4) :: X_ovvo, Y_vovo + call gpu_allocate(Y_vovo,nV,nO,nV,nO) + + do j = 1, nO + do i = 1, nO + call gpu_set_stream(blas_handle, stream(i)) + call gpu_dgeam(blas_handle, 'N', 'T', nV, nV, 1.d0, v_vvoo%f(1,1,i,j), & + nV, -0.5d0, v_vvoo%f(1,1,i,j), nV, Y_vovo%f(1,i,1,j), nO*nV) + enddo + enddo + + call gpu_allocate(X_ovvo,nO,nV,nV,nO) + + do j = 1, nO + do b = 1, nV + call gpu_set_stream(blas_handle, stream(b)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, t2%f(1,j,1,b), & + nO*nO, 0.d0, t2%f(1,j,1,b), nO*nO, X_ovvo%f(1,1,b,j), nO) + enddo + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + call gpu_synchronize() + call gpu_deallocate(X_voov) + + call gpu_dgemm(blas_handle, 'N','T',nO*nV,nV*nO,nV*nO, & + 1d0, X_ovvo%f(1,1,1,1), size(X_ovvo%f,1) * size(X_ovvo%f,2), & + Y_vovo%f(1,1,1,1), size(Y_vovo%f,1) * size(Y_vovo%f,2), & + 0d0, Z_ovvo%f(1,1,1,1), size(Z_ovvo%f,1) * size(Z_ovvo%f,2)) + + call gpu_synchronize() + + do i = 1, nO + do beta = 1, nV + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,1,beta,i), & + nO, 1.d0, Z_ovvo%f(1,beta,1,i), nO*nV, J1%f(1,1,beta,i), nO) + enddo + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + call gpu_deallocate(Y_ovov) + call gpu_deallocate(X_ovvo) + + do a = 1, nV + call gpu_stream_destroy(stream(a)) + enddo + + call gpu_deallocate(Z_ovvo) end ! 
K1 -subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,K1) +subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov, & + d_cc_space_v_ov_chol,d_cc_space_v_vv_chol,K1) + use gpu implicit none - integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(in) :: v_vvoo(nV,nV,nO,nO), v_ovov(nO,nV,nO,nV) - double precision, intent(in) :: v_ovoo(nO,nV,nO,nO) - double precision, intent(out) :: K1(nO, nV, nO, nV) + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: t1 + type(gpu_double4), intent(in) :: t2, v_vvoo, v_ovov, v_ovoo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol + type(gpu_double4), intent(out) :: K1 - double precision, allocatable :: X(:,:,:,:), Y(:,:,:,:), Z(:,:,:,:) + type(gpu_double4) :: X, Y, Z integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam - allocate(X(nV,nO,nV,nO),Y(nO,nV,nV,nO),Z(nO,nV,nV,nO)) - !$omp parallel & - !$omp shared(nO,nV,K1,X,Y,v_vvoo,v_ovov,t1,t2) & - !$omp private(i,beta,a,u,j,b) & - !$omp default(none) - !$omp do - do beta = 1, nV + call gpu_copy(v_ovov, K1) + + type(gpu_stream) :: stream(nV) + do a = 1, nV + call gpu_stream_create(stream(a)) + enddo + + call gpu_allocate(X,nV,nO,nV,nO) + call gpu_allocate(Y,nO,nV,nV,nO) + + do a = 1, nV + call gpu_set_stream(blas_handle, stream(a)) do i = 1, nO - do a = 1, nV - do u = 1, nO - K1(u,a,i,beta) = v_ovov(u,a,i,beta) - enddo - enddo + call gpu_dgeam(blas_handle, 'N', 'N', nV, nO, -1.d0, v_vvoo%f(1,a,i,1), & + nV*nV*nO, 0.d0, v_vvoo%f(1,a,i,1), nV*nV*nO, X%f(1,1,a,i), nV) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 0.5d0, t2%f(1,i,1,a), & + nO*nO, t1%f(i,a), t1%f(1,1), nO, Y%f(1,a,1,i), nO*nV) enddo enddo - !$omp end do nowait - do i = 1, nO - !$omp do - do a = 1, nV - do j = 1, nO - do b = 1, nV - X(b,j,a,i) = - v_vvoo(b,a,i,j) - enddo - enddo - enddo - !$omp end do nowait - enddo + call gpu_set_stream(blas_handle, gpu_default_stream) - do j = 1, nO - !$omp do - do b = 1, nV - do beta = 1, nV - do u = 1, nO - Y(u,beta,b,j) = 0.5d0 * t2(u,j,b,beta) + t1(u,b) * t1(j,beta) - enddo - enddo - enddo - !$omp end do - enddo - !$omp end parallel + call gpu_dgemm(blas_handle, 'N','N',nO*nV*nO,nV,nO, & + -1d0, v_ovoo%f(1,1,1,1), size(v_ovoo%f,1) * size(v_ovoo%f,2) * size(v_ovoo%f,3), & + t1%f(1,1) , size(t1%f,1), & + 1d0, K1%f(1,1,1,1) , size(K1%f,1) * size(K1%f,2) * size(K1%f,3)) - call dgemm('N','N',nO*nV*nO,nV,nO, & - -1d0, v_ovoo, size(v_ovoo,1) * size(v_ovoo,2) * size(v_ovoo,3), & - t1 , size(t1,1), & - 1d0, K1 , size(K1,1) * size(K1,2) * size(K1,3)) + type(gpu_double4) :: K1tmp + type(gpu_double3) :: t1v - double precision, allocatable :: K1tmp(:,:,:,:), t1v(:,:,:) - allocate(K1tmp(nO,nO,nV,nV), t1v(cholesky_mo_num,nO,nO)) + call gpu_allocate(t1v,cholesky_mo_num,nO,nO) - call dgemm('N','T', cholesky_mo_num*nO, nO, nV, 1.d0, & - cc_space_v_ov_chol, cholesky_mo_num*nO, t1, nO, 0.d0, & - t1v, cholesky_mo_num*nO) + call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nO, nO, nV, 1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, t1%f(1,1), nO, 0.d0, & + t1v%f(1,1,1), cholesky_mo_num*nO) - call dgemm('T','N', nO*nO, nV*nV, cholesky_mo_num, 1.d0, & - t1v, cholesky_mo_num, cc_space_v_vv_chol, cholesky_mo_num, 0.d0, & - K1tmp, nO*nO) + call gpu_allocate(K1tmp,nO,nO,nV,nV) + + call gpu_dgemm(blas_handle, 'T','N', nO*nO, nV*nV, cholesky_mo_num, 1.d0, & + t1v%f(1,1,1), cholesky_mo_num, d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num, 0.d0, & + 
K1tmp%f(1,1,1,1), nO*nO) + + call gpu_allocate(Z,nO,nV,nV,nO) + call gpu_synchronize() - deallocate(t1v) ! Y(u,beta,b,j) * X(b,j,a,i) = Z(u,beta,a,i) - call dgemm('N','N',nV*nO,nO*nV,nV*nO, & - 1d0, Y, size(Y,1) * size(Y,2), & - X, size(X,1) * size(X,2), & - 0d0, Z, size(Z,1) * size(Z,2)) + call gpu_dgemm(blas_handle, 'N','N',nV*nO,nO*nV,nV*nO, & + 1d0, Y%f(1,1,1,1), size(Y%f,1) * size(Y%f,2), & + X%f(1,1,1,1), size(X%f,1) * size(X%f,2), & + 0d0, Z%f(1,1,1,1), size(Z%f,1) * size(Z%f,2)) - !$omp parallel & - !$omp shared(nO,nV,K1,Z,K1tmp) & - !$omp private(i,beta,a,u) & - !$omp default(none) - !$omp do - do beta = 1, nV - do i = 1, nO - do a = 1, nV - do u = 1, nO - K1(u,a,i,beta) = K1(u,a,i,beta) + K1tmp(u,i,a,beta) + Z(u,beta,a,i) - enddo - enddo + call gpu_synchronize() + call gpu_deallocate(t1v) + + do beta = 1, nV + call gpu_set_stream(blas_handle, stream(beta)) + do i = 1, nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, K1%f(1,1,i,beta), & + nO, 1.d0, K1tmp%f(1,i,1,beta), nO*nO, K1%f(1,1,i,beta), nO) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, K1%f(1,1,i,beta), & + nO, 1.d0, Z%f(1,beta,1,i), nO*nV, K1%f(1,1,i,beta), nO) enddo enddo - !$omp end do - !$omp end parallel - deallocate(K1tmp,X,Y,Z) + call gpu_deallocate(X) + call gpu_deallocate(Y) + + do a = 1, nV + call gpu_stream_destroy(stream(a)) + enddo + + call gpu_deallocate(K1tmp) + call gpu_deallocate(Z) end diff --git a/src/gpu/NEED b/src/gpu/NEED new file mode 100644 index 00000000..c2af78d2 --- /dev/null +++ b/src/gpu/NEED @@ -0,0 +1 @@ +gpu_arch diff --git a/src/gpu/README.rst b/src/gpu/README.rst new file mode 100644 index 00000000..17ee28a0 --- /dev/null +++ b/src/gpu/README.rst @@ -0,0 +1,6 @@ +=== +gpu +=== + +Bindings for GPU routines (architecture independent). +Architecture-dependent files are in gpu_arch. 
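A note on the header change below: every BLAS-like entry point in gpu.h now receives alpha and beta through `const double*`/`const float*` instead of by value. This is presumably what lets the Fortran interfaces of src/gpu/gpu_module.F90 bind to these symbols directly, since a scalar dummy without the VALUE attribute is passed by address in a `bind(C)` interface, and it also matches cuBLAS/rocBLAS, which take alpha and beta by pointer. A minimal sketch of the convention, with a hypothetical C routine `axpby` standing in for the gpu_* functions:

```fortran
! Hypothetical binding sketch: assumes a C routine compiled alongside with
!   void axpby(const double* alpha, const double* x, double* y);
! Omitting VALUE makes Fortran pass the scalars' addresses, which is exactly
! what the revised const double* prototypes below expect.
module axpby_binding
  use, intrinsic :: iso_c_binding
  implicit none
  interface
    subroutine axpby(alpha, x, y) bind(C, name='axpby')
      import
      real(c_double), intent(in) :: alpha   ! no VALUE -> received as const double*
      real(c_double), intent(in) :: x       ! no VALUE -> received as const double*
      real(c_double)             :: y       ! no VALUE -> received as double*
    end subroutine
  end interface
end module
```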
diff --git a/src/gpu_x86/gpu.h b/src/gpu/gpu.h similarity index 66% rename from src/gpu_x86/gpu.h rename to src/gpu/gpu.h index 03aeeab7..ac70e21a 100644 --- a/src/gpu_x86/gpu.h +++ b/src/gpu/gpu.h @@ -22,20 +22,20 @@ void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_ void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result); -void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double alpha, - const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy); +void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy); -void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float alpha, - const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy); +void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float* alpha, + const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy); -void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, - const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc); +void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha, + const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc); -void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, - const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc); +void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float* alpha, + const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc); -void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, - const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc); +void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc); -void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, - const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc); +void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float* alpha, + const float* a, const int64_t lda, const float* beta, const float* b, const int64_t ldb, float* c, const int64_t ldc); diff --git a/src/gpu/gpu.irp.f b/src/gpu/gpu.irp.f new file mode 100644 index 00000000..3b2feeb6 --- /dev/null 
+++ b/src/gpu/gpu.irp.f @@ -0,0 +1,26 @@ +use gpu + +BEGIN_PROVIDER [ type(gpu_blas), blas_handle ] + implicit none + BEGIN_DOC + ! Handle for cuBLAS or RocBLAS + END_DOC + call gpu_blas_create(blas_handle) +END_PROVIDER + +BEGIN_PROVIDER [ type(gpu_stream), gpu_default_stream ] + implicit none + BEGIN_DOC + ! Default stream + END_DOC + gpu_default_stream%c = C_NULL_PTR +END_PROVIDER + +BEGIN_PROVIDER [ integer, gpu_num ] + implicit none + BEGIN_DOC + ! Number of usable GPUs + END_DOC + gpu_num = gpu_ndevices() +END_PROVIDER + diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90 new file mode 100644 index 00000000..6050075f --- /dev/null +++ b/src/gpu/gpu_module.F90 @@ -0,0 +1,707 @@ +module gpu + use, intrinsic :: iso_c_binding + implicit none + +! Data types +! ---------- + + type gpu_double1 + type(c_ptr) :: c + double precision, pointer :: f(:) + end type + + type gpu_double2 + type(c_ptr) :: c + double precision, pointer :: f(:,:) + end type + + type gpu_double3 + type(c_ptr) :: c + double precision, pointer :: f(:,:,:) + end type + + type gpu_double4 + type(c_ptr) :: c + double precision, pointer :: f(:,:,:,:) + end type + + type gpu_double5 + type(c_ptr) :: c + double precision, pointer :: f(:,:,:,:,:) + end type + + type gpu_double6 + type(c_ptr) :: c + double precision, pointer :: f(:,:,:,:,:,:) + end type + + + type gpu_blas + type(c_ptr) :: c + end type + + type gpu_stream + type(c_ptr) :: c + end type + + +! C interfaces +! ------------ + + interface + logical(c_bool) function no_gpu() bind(C) + import + end function + + integer function gpu_ndevices() bind(C) + import + end function + + subroutine gpu_set_device(id) bind(C) + import + integer(c_int32_t), value :: id + end subroutine + + subroutine gpu_allocate_c(ptr, n) bind(C, name='gpu_allocate') + import + type(c_ptr) :: ptr + integer(c_int64_t), value :: n + end subroutine + + subroutine gpu_deallocate_c(ptr) bind(C, name='gpu_deallocate') + import + type(c_ptr) :: ptr + end subroutine + + subroutine gpu_upload_c(cpu_ptr, gpu_ptr, n) bind(C, name='gpu_upload') + import + type(c_ptr), value :: cpu_ptr + type(c_ptr), value :: gpu_ptr + integer(c_int64_t), value :: n + end subroutine + + subroutine gpu_download_c(gpu_ptr, cpu_ptr, n) bind(C, name='gpu_download') + import + type(c_ptr), value :: gpu_ptr + type(c_ptr), value :: cpu_ptr + integer(c_int64_t), value :: n + end subroutine + + subroutine gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, n) bind(C, name='gpu_copy') + import + type(c_ptr), value :: gpu_ptr_src + type(c_ptr), value :: gpu_ptr_dest + integer(c_int64_t), value :: n + end subroutine + + subroutine gpu_stream_create_c(stream) bind(C, name='gpu_stream_create') + import + type(c_ptr) :: stream + end subroutine + + subroutine gpu_stream_destroy_c(stream) bind(C, name='gpu_stream_destroy') + import + type(c_ptr) :: stream + end subroutine + + subroutine gpu_set_stream_c(handle, stream) bind(C, name='gpu_set_stream') + import + type(c_ptr), value :: handle, stream + end subroutine + + subroutine gpu_synchronize() bind(C) + import + end subroutine + + subroutine gpu_blas_create_c(handle) bind(C, name='gpu_blas_create') + import + type(c_ptr) :: handle + end subroutine + + subroutine gpu_blas_destroy_c(handle) bind(C, name='gpu_blas_destroy') + import + type(c_ptr) :: handle + end subroutine + + subroutine gpu_ddot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_ddot') + import + type(c_ptr), value, intent(in) :: handle + integer(c_int64_t), value :: n, incx, incy + type(c_ptr), value :: dx, dy + 
+      real(c_double), intent(out) :: res
+    end subroutine
+
+    subroutine gpu_sdot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_sdot')
+      import
+      type(c_ptr), value, intent(in) :: handle
+      integer(c_int64_t), value :: n, incx, incy
+      type(c_ptr), intent(in), value :: dx, dy
+      real(c_float), intent(out) :: res
+    end subroutine
+
+    subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
+        b, ldb, c, ldc) bind(C, name='gpu_dgeam')
+      import
+      type(c_ptr), value, intent(in) :: handle
+      character(c_char), intent(in), value :: transa, transb
+      integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
+      real(c_double), intent(in) :: alpha, beta
+      type(c_ptr), value :: a, b, c
+    end subroutine
+
+    subroutine gpu_sgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
+        b, ldb, c, ldc) bind(C, name='gpu_sgeam')
+      import
+      type(c_ptr), value, intent(in) :: handle
+      character(c_char), intent(in), value :: transa, transb
+      integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
+      real(c_float), intent(in) :: alpha, beta
+      type(c_ptr), value :: a, b, c
+    end subroutine
+
+    subroutine gpu_dgemv_c(handle, transa, m, n, alpha, a, lda, &
+        x, incx, beta, y, incy) bind(C, name='gpu_dgemv')
+      import
+      type(c_ptr), value, intent(in) :: handle
+      character(c_char), intent(in), value :: transa
+      integer(c_int64_t), intent(in), value :: m, n, lda, incx, incy
+      real(c_double), intent(in) :: alpha, beta
+      real(c_double) :: a, x, y
+    end subroutine
+
+    subroutine gpu_sgemv_c(handle, transa, m, n, alpha, a, lda, &
+        x, incx, beta, y, incy) bind(C, name='gpu_sgemv')
+      import
+      type(c_ptr), value, intent(in) :: handle
+      character(c_char), intent(in), value :: transa
+      integer(c_int64_t), intent(in), value :: m, n, lda, incx, incy
+      real(c_float), intent(in) :: alpha, beta
+      real(c_float) :: a, x, y
+    end subroutine
+
+
+    subroutine gpu_dgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, &
+        b, ldb, beta, c, ldc) bind(C, name='gpu_dgemm')
+      import
+      type(c_ptr), value, intent(in) :: handle
+      character(c_char), intent(in), value :: transa, transb
+      integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc
+      real(c_double), intent(in) :: alpha, beta
+      real(c_double) :: a, b, c
+    end subroutine
+
+    subroutine gpu_sgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, &
+        b, ldb, beta, c, ldc) bind(C, name='gpu_sgemm')
+      import
+      type(c_ptr), value, intent(in) :: handle
+      character(c_char), intent(in), value :: transa, transb
+      integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc
+      real(c_float), intent(in) :: alpha, beta
+      real(c_float) :: a, b, c
+    end subroutine
+
+  end interface
+
+
+! Polymorphic interfaces
! ----------------------
+
+  interface gpu_allocate
+    procedure gpu_allocate_double1 &
+             ,gpu_allocate_double2 &
+             ,gpu_allocate_double3 &
+             ,gpu_allocate_double4 &
+             ,gpu_allocate_double5 &
+             ,gpu_allocate_double6 &
+             ,gpu_allocate_double1_64 &
+             ,gpu_allocate_double2_64 &
+             ,gpu_allocate_double3_64 &
+             ,gpu_allocate_double4_64 &
+             ,gpu_allocate_double5_64 &
+             ,gpu_allocate_double6_64
+  end interface gpu_allocate
+
+  interface gpu_deallocate
+    procedure gpu_deallocate_double1 &
+             ,gpu_deallocate_double2 &
+             ,gpu_deallocate_double3 &
+             ,gpu_deallocate_double4 &
+             ,gpu_deallocate_double5 &
+             ,gpu_deallocate_double6
+  end interface gpu_deallocate
+
+  interface gpu_upload
+    procedure gpu_upload_double1 &
+             ,gpu_upload_double2 &
+             ,gpu_upload_double3 &
+             ,gpu_upload_double4 &
+             ,gpu_upload_double5 &
+             ,gpu_upload_double6
+  end interface gpu_upload
+
+  interface gpu_download
+    procedure gpu_download_double1 &
+             ,gpu_download_double2 &
+             ,gpu_download_double3 &
+             ,gpu_download_double4 &
+             ,gpu_download_double5 &
+             ,gpu_download_double6
+  end interface gpu_download
+
+  interface gpu_copy
+    procedure gpu_copy_double1 &
+             ,gpu_copy_double2 &
+             ,gpu_copy_double3 &
+             ,gpu_copy_double4 &
+             ,gpu_copy_double5 &
+             ,gpu_copy_double6
+  end interface gpu_copy
+
+
+  contains
+
+
+! gpu_allocate
+! ------------
+
+  subroutine gpu_allocate_double1(ptr, s)
+    implicit none
+    type(gpu_double1), intent(inout) :: ptr
+    integer, intent(in) :: s
+
+    call gpu_allocate_c(ptr%c, s*8_8)
+    call c_f_pointer(ptr%c, ptr%f, (/ s /))
+  end subroutine
+
+  subroutine gpu_allocate_double2(ptr, s1, s2)
+    implicit none
+    type(gpu_double2), intent(inout) :: ptr
+    integer, intent(in) :: s1, s2
+
+    call gpu_allocate_c(ptr%c, s1*s2*8_8)
+    call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /))
+  end subroutine
+
+  subroutine gpu_allocate_double3(ptr, s1, s2, s3)
+    implicit none
+    type(gpu_double3), intent(inout) :: ptr
+    integer, intent(in) :: s1, s2, s3
+
+    call gpu_allocate_c(ptr%c, s1*s2*s3*8_8)
+    call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /))
+  end subroutine
+
+  subroutine gpu_allocate_double4(ptr, s1, s2, s3, s4)
+    implicit none
+    type(gpu_double4), intent(inout) :: ptr
+    integer, intent(in) :: s1, s2, s3, s4
+
+    call gpu_allocate_c(ptr%c, s1*s2*s3*s4*8_8)
+    call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /))
+  end subroutine
+
+  subroutine gpu_allocate_double5(ptr, s1, s2, s3, s4, s5)
+    implicit none
+    type(gpu_double5), intent(inout) :: ptr
+    integer, intent(in) :: s1, s2, s3, s4, s5
+
+    call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*8_8)
+    call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /))
+  end subroutine
+
+  subroutine gpu_allocate_double6(ptr, s1, s2, s3, s4, s5, s6)
+    implicit none
+    type(gpu_double6), intent(inout) :: ptr
+    integer, intent(in) :: s1, s2, s3, s4, s5, s6
+
+    call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*s6*8_8)
+    call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /))
+  end subroutine
+
+
+  subroutine gpu_allocate_double1_64(ptr, s)
+    implicit none
+    type(gpu_double1), intent(inout) :: ptr
+    integer*8, intent(in) :: s
+
+    call gpu_allocate_c(ptr%c, s*8_8)
+    call c_f_pointer(ptr%c, ptr%f, (/ s /))
+  end subroutine
+
+  subroutine gpu_allocate_double2_64(ptr, s1, s2)
+    implicit none
+    type(gpu_double2), intent(inout) :: ptr
+    integer*8, intent(in) :: s1, s2
+
+    call gpu_allocate_c(ptr%c, s1*s2*8_8)
+    call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /))
+  end subroutine
+
+  subroutine gpu_allocate_double3_64(ptr, s1, s2, s3)
+    implicit none
+    type(gpu_double3), intent(inout) :: ptr
+    integer*8, intent(in) :: s1, s2, s3
+
+    call
gpu_allocate_c(ptr%c, s1*s2*s3*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /)) + end subroutine + + subroutine gpu_allocate_double4_64(ptr, s1, s2, s3, s4) + implicit none + type(gpu_double4), intent(inout) :: ptr + integer*8, intent(in) :: s1, s2, s3, s4 + + call gpu_allocate_c(ptr%c, s1*s2*s3*s4*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /)) + end subroutine + + subroutine gpu_allocate_double5_64(ptr, s1, s2, s3, s4, s5) + implicit none + type(gpu_double5), intent(inout) :: ptr + integer*8, intent(in) :: s1, s2, s3, s4, s5 + + call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /)) + end subroutine + + subroutine gpu_allocate_double6_64(ptr, s1, s2, s3, s4, s5, s6) + implicit none + type(gpu_double6), intent(inout) :: ptr + integer*8, intent(in) :: s1, s2, s3, s4, s5, s6 + + call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*s6*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /)) + end subroutine + + +! gpu_deallocate +! -------------- + + subroutine gpu_deallocate_double1(ptr) + implicit none + type(gpu_double1), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double2(ptr) + implicit none + type(gpu_double2), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double3(ptr) + implicit none + type(gpu_double3), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double4(ptr) + implicit none + type(gpu_double4), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double5(ptr) + implicit none + type(gpu_double5), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double6(ptr) + implicit none + type(gpu_double6), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + +! gpu_upload +! 
---------- + + subroutine gpu_upload_double1(cpu_ptr, gpu_ptr) + implicit none + double precision, target, intent(in) :: cpu_ptr(*) + type(gpu_double1), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, 8_8*size(gpu_ptr%f)) + end subroutine + + subroutine gpu_upload_double2(cpu_ptr, gpu_ptr) + implicit none + double precision, target, intent(in) :: cpu_ptr(:,:) + type(gpu_double2), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + subroutine gpu_upload_double3(cpu_ptr, gpu_ptr) + implicit none + double precision, target, intent(in) :: cpu_ptr(:,:,:) + type(gpu_double3), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + subroutine gpu_upload_double4(cpu_ptr, gpu_ptr) + implicit none + double precision, target, intent(in) :: cpu_ptr(:,:,:,:) + type(gpu_double4), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + subroutine gpu_upload_double5(cpu_ptr, gpu_ptr) + implicit none + double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:) + type(gpu_double5), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + subroutine gpu_upload_double6(cpu_ptr, gpu_ptr) + implicit none + double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:,:) + type(gpu_double6), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + +! gpu_download +! ------------ + + subroutine gpu_download_double1(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double1), intent(in) :: gpu_ptr + double precision, target, intent(in) :: cpu_ptr(:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*size(gpu_ptr%f)) + end subroutine + + subroutine gpu_download_double2(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double2), intent(in) :: gpu_ptr + double precision, target, intent(in) :: cpu_ptr(:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + + subroutine gpu_download_double3(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double3), intent(in) :: gpu_ptr + double precision, target, intent(in) :: cpu_ptr(:,:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + + subroutine gpu_download_double4(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double4), intent(in) :: gpu_ptr + double precision, target, intent(in) :: cpu_ptr(:,:,:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + + subroutine gpu_download_double5(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double5), intent(in) :: gpu_ptr + double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + + subroutine gpu_download_double6(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double6), intent(in) :: gpu_ptr + double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + +! gpu_copy +! 
-------- + + subroutine gpu_copy_double1(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double1), intent(in) :: gpu_ptr_src + type(gpu_double1), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*size(gpu_ptr_dest%f)) + end subroutine + + subroutine gpu_copy_double2(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double2), intent(in) :: gpu_ptr_src + type(gpu_double2), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + subroutine gpu_copy_double3(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double3), intent(in) :: gpu_ptr_src + type(gpu_double3), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + subroutine gpu_copy_double4(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double4), intent(in) :: gpu_ptr_src + type(gpu_double4), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + subroutine gpu_copy_double5(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double5), intent(in) :: gpu_ptr_src + type(gpu_double5), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + subroutine gpu_copy_double6(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double6), intent(in) :: gpu_ptr_src + type(gpu_double6), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + +! gpu_stream +! ---------- + + subroutine gpu_stream_create(stream) + type(gpu_stream) :: stream + call gpu_stream_create_c(stream%c) + end subroutine + + subroutine gpu_stream_destroy(stream) + type(gpu_stream) :: stream + call gpu_stream_destroy_c(stream%c) + end subroutine + + subroutine gpu_set_stream(handle, stream) + type(gpu_blas) :: handle + type(gpu_stream) :: stream + call gpu_set_stream_c(handle%c, stream%c) + end subroutine + + +! gpu_blas +! -------- + + subroutine gpu_blas_create(handle) + type(gpu_blas) :: handle + call gpu_blas_create_c(handle%c) + end subroutine + + subroutine gpu_blas_destroy(handle) + type(gpu_blas) :: handle + call gpu_blas_destroy_c(handle%c) + end subroutine + + + + + +! dot +! --- + +subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) +! use gpu + type(gpu_blas), intent(in) :: handle + integer*4 :: n, incx, incy + double precision, target :: dx, dy + double precision, intent(out) :: res + call gpu_ddot_c(handle%c, int(n,c_int64_t), c_loc(dx), int(incx,c_int64_t), c_loc(dy), int(incy,c_int64_t), res) +end subroutine + + +subroutine gpu_ddot_64(handle, n, dx, incx, dy, incy, res) +! use gpu + type(gpu_blas), intent(in) :: handle + integer*8 :: n, incx, incy + double precision, target :: dx, dy + double precision, intent(out) :: res + call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res) +end subroutine + + +! geam +! ---- + +subroutine gpu_dgeam(handle, transa, transb, m, n, alpha, a, lda, beta, & + b, ldb, c, ldc) +! 
use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*4, intent(in) :: m, n, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision, target :: a, b, c + call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, & + c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t)) +end subroutine + + +subroutine gpu_dgeam_64(handle, transa, transb, m, n, alpha, a, lda, beta, & + b, ldb, c, ldc) +! use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*8, intent(in) :: m, n, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision, target :: a, b, c + call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, & + c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t)) +end subroutine + + +! gemv +! ---- + +subroutine gpu_dgemv(handle, transa, m, n, alpha, a, lda, & + x, incx, beta, y, incy) +! use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa + integer*4, intent(in) :: m, n, lda, incx, incy + double precision, intent(in) :: alpha, beta + double precision :: a, x, y + call gpu_dgemv_c(handle%c, transa, int(m,c_int64_t), int(n,c_int64_t), & + alpha, a, int(lda,c_int64_t), & + x, int(incx,c_int64_t), beta, y, int(incy,c_int64_t)) +end subroutine + +subroutine gpu_dgemv_64(handle, transa, m, n, alpha, a, lda, & + x, incx, beta, y, incy) +! use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa + integer*8, intent(in) :: m, n, lda, incx, incy + double precision, intent(in) :: alpha, beta + double precision :: a, x, y + call gpu_dgemv_c(handle%c, transa, int(m,c_int64_t), int(n,c_int64_t), & + alpha, a, int(lda,c_int64_t), & + x, int(incx,c_int64_t), beta, y, int(incy,c_int64_t)) +end subroutine + + +! gemm +! ---- + +subroutine gpu_dgemm(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) +! use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*4, intent(in) :: m, n, k, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision :: a, b, c + call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), & + alpha, a, int(lda,c_int64_t), & + b, int(ldb,c_int64_t), beta, c, int(ldc,c_int64_t)) +end subroutine + +subroutine gpu_dgemm_64(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) +! 
use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*8, intent(in) :: m, n, k, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision :: a, b, c + call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), & + alpha, a, int(lda,c_int64_t), b, int(ldb,c_int64_t), beta, c, int(ldc,c_int64_t)) +end subroutine + +end module diff --git a/src/gpu_x86/gpu_module.F90 b/src/gpu_x86/gpu_module.F90 deleted file mode 100644 index 86ba3926..00000000 --- a/src/gpu_x86/gpu_module.F90 +++ /dev/null @@ -1,141 +0,0 @@ -module gpu - use, intrinsic :: iso_c_binding, only : c_int32_t, c_int64_t, c_double, c_size_t, c_char - implicit none - - interface - integer function gpu_ndevices() bind(C) - end function - - subroutine gpu_set_device(id) bind(C) - import - integer(c_int32_t), value :: id - end subroutine - - subroutine gpu_allocate_c(ptr, n) bind(C, name='gpu_allocate') - import - type(c_ptr) :: ptr - integer(c_int64_t), value :: n - end subroutine - - subroutine gpu_free_c(ptr) bind(C, name='gpu_free') - import - type(c_ptr) :: ptr - end subroutine - - subroutine gpu_upload_c(cpu_ptr, gpu_ptr, n) bind(C, name='gpu_upload') - import - type(c_ptr), value :: cpu_ptr - type(c_ptr), value :: gpu_ptr - integer(c_int64_t), value :: n - end subroutine - - subroutine gpu_download_c(gpu_ptr, cpu_ptr, n) bind(C, name='gpu_download') - import - type(c_ptr), value :: gpu_ptr - type(c_ptr), value :: cpu_ptr - integer(c_int64_t), value :: n - end subroutine - - subroutine gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, n) bind(C, name='gpu_copy') - import - type(c_ptr), value :: gpu_ptr_src - type(c_ptr), value :: gpu_ptr_dest - integer(c_int64_t), value :: n - end subroutine - - subroutine gpu_stream_create(stream) bind(C) - import - type(c_ptr) :: stream - end subroutine - - subroutine gpu_stream_destroy(stream) bind(C) - import - type(c_ptr) :: stream - end subroutine - - subroutine gpu_set_stream(handle, stream) bind(C) - import - type(c_ptr) :: handle, stream - end subroutine - - subroutine gpu_synchronize() - end subroutine - - subroutine gpu_blas_create(handle) bind(C) - import - type(c_ptr) :: handle - end subroutine - - subroutine gpu_blas_destroy(handle) bind(C) - import - type(c_ptr) :: handle - end subroutine - - subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) bind(C) - import - type(c_ptr), intent(in) :: handle - integer(c_int64_t), value :: n, incx, incy - real(c_double), intent(in) :: dx(*), dy(*) - real(c_double), intent(out) :: res - end subroutine - - subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res) bind(C) - import - type(c_ptr), intent(in) :: handle - integer(c_int64_t), value :: n, incx, incy - real(c_float), intent(in) :: dx(*), dy(*) - real(c_float), intent(out) :: res - end subroutine - - end interface - -end module - -subroutine gpu_allocate_double(ptr, s) - use gpu - implicit none - double precision, pointer, intent(inout) :: ptr - integer*8, intent(in) :: s(*) - type(c_ptr) :: cptr - - call gpu_allocate_c(cptr, sum(s)*8_8) - call c_f_pointer(cptr, ptr, s) -end subroutine - -subroutine gpu_free_double(ptr) - use gpu - implicit none - double precision, pointer, intent(inout) :: ptr - type(c_ptr) :: cptr - cptr = cloc(ptr) - call gpu_free(cptr) - NULLIFY(ptr) -end subroutine - -subroutine gpu_upload_double(cpu_ptr, gpu_ptr, n) - use gpu - implicit none - double precision, intent(in) :: cpu_ptr(*) - double precision, intent(out) :: gpu_ptr(*) - integer(c_int64_t), intent(in) :: n - call 
gpu_upload_c(cpu_ptr, gpu_ptr, 8_8*n) -end subroutine - -subroutine gpu_download_double(gpu_ptr, cpu_ptr, n) - use gpu - implicit none - double precision, intent(in) :: gpu_ptr(*) - double precision, intent(out) :: cpu_ptr(*) - integer(c_int64_t), intent(in) :: n - call gpu_download_c(gpu_ptr, cpu_ptr, 8_8*n) -end subroutine - -subroutine gpu_copy_double(gpu_ptr_src, gpu_ptr_dest, n) - use gpu - implicit none - double precision, intent(in) :: gpu_ptr_src(*) - double precision, intent(out) :: gpu_ptr_dest(*) - integer(c_int64_t), intent(in) :: n - call gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, 8_8*n) -end subroutine - diff --git a/src/mo_optimization/NEED b/src/mo_optimization/NEED index 91f41ee3..33f770c3 100644 --- a/src/mo_optimization/NEED +++ b/src/mo_optimization/NEED @@ -1,7 +1,3 @@ -two_body_rdm -hartree_fock -cipsi -davidson_undressed +mo_optimization_utils selectors_full generators_full -utils_trust_region diff --git a/src/mo_optimization/cipsi_orb_opt.irp.f b/src/mo_optimization/cipsi_orb_opt.irp.f index 7e3a79eb..19b3e9db 100644 --- a/src/mo_optimization/cipsi_orb_opt.irp.f +++ b/src/mo_optimization/cipsi_orb_opt.irp.f @@ -2,87 +2,7 @@ program optimization read_wf = .true. ! must be True for the orbital optimization !!! TOUCH read_wf - call run_optimization + call run_optimization_mos_CIPSI end -subroutine run_optimization - - implicit none - - double precision :: e_cipsi, e_opt, delta_e - double precision, allocatable :: Ev(:),PT2(:) - integer :: nb_iter,i - logical :: not_converged - character (len=100) :: filename - - PROVIDE psi_det psi_coef mo_two_e_integrals_in_map ao_pseudo_integrals - allocate(Ev(N_states),PT2(N_states)) - - not_converged = .True. - nb_iter = 0 - - ! To start from the wf - N_det_max = max(n_det,5) - TOUCH N_det_max - - open(unit=10, file=trim(ezfio_filename)//'/mo_optimization/result_opt') - write(10,*) " Ndet E_cipsi E_opt Delta_e" - call state_average_energy(e_cipsi) - write(10,'(I10, 3F15.7)') n_det, e_cipsi, e_cipsi, 0d0 - close(10) - - do while (not_converged) - print*,'' - print*,'======================' - print*,' Cipsi step:', nb_iter - print*,'======================' - print*,'' - print*,'********** cipsi step **********' - ! cispi calculation - call run_stochastic_cipsi(Ev,PT2) - - ! State average energy after the cipsi step - call state_average_energy(e_cipsi) - - print*,'' - print*,'********** optimization step **********' - ! orbital optimization - call run_orb_opt_trust_v2 - - ! State average energy after the orbital optimization - call state_average_energy(e_opt) - - print*,'' - print*,'********** diff step **********' - ! Gain in energy - delta_e = e_opt - e_cipsi - print*, 'Gain in energy during the orbital optimization:', delta_e - - open(unit=10, file=trim(ezfio_filename)//'/mo_optimization/result_opt', position='append') - write(10,'(I10, 3F15.7)') n_det, e_cipsi, e_opt, delta_e - close(10) - - ! Exit - if (delta_e > 1d-12) then - print*, 'WARNING, something wrong happened' - print*, 'The gain (delta_e) in energy during the optimization process' - print*, 'is > 0, but it must be < 0' - print*, 'The program will exit' - exit - endif - - if (n_det > n_det_max_opt) then - print*, 'The number of determinants in the wf > n_det_max_opt' - print*, 'The program will exit' - exit - endif - - ! 
To double the number of determinants in the wf
-  N_det_max = int(dble(n_det * 2)*0.9)
-  TOUCH N_det_max
-
-  nb_iter = nb_iter + 1
- enddo
-
-end
diff --git a/src/mo_optimization/EZFIO.cfg b/src/mo_optimization_utils/EZFIO.cfg
similarity index 100%
rename from src/mo_optimization/EZFIO.cfg
rename to src/mo_optimization_utils/EZFIO.cfg
diff --git a/src/mo_optimization_utils/NEED b/src/mo_optimization_utils/NEED
new file mode 100644
index 00000000..1a78a17f
--- /dev/null
+++ b/src/mo_optimization_utils/NEED
@@ -0,0 +1,5 @@
+two_body_rdm
+hartree_fock
+cipsi
+davidson_undressed
+utils_trust_region
diff --git a/src/mo_optimization_utils/README.md b/src/mo_optimization_utils/README.md
new file mode 100644
index 00000000..94f29aee
--- /dev/null
+++ b/src/mo_optimization_utils/README.md
@@ -0,0 +1,74 @@
+# Orbital optimization
+
+## Methods
+Different methods are available:
+- full hessian
+```
+qp set orbital_optimization optimization_method full
+```
+- diagonal hessian
+```
+qp set orbital_optimization optimization_method diag
+```
+- identity matrix
+```
+qp set orbital_optimization optimization_method none
+```
+
+After the optimization, the ezfio contains the optimized orbitals.
+
+## For a fixed number of determinants
+To optimize the MOs for the current determinants:
+```
+qp run orb_opt
+```
+
+## For a complete optimization, i.e., with a larger and larger wave function
+To optimize the MOs with a larger and larger wave function:
+```
+qp run optimization
+```
+
+The results are stored in the EZFIO in "mo_optimization/result_opt",
+with the following format:
+(1) (2) (3) (4)
+1: Number of determinants in the wf,
+2: Cipsi energy before the optimization,
+3: Cipsi energy after the optimization,
+4: Energy difference between (2) and (3).
+
+The optimization process is the following:
+- we do a first cipsi step to obtain a small number of determinants in the wf
+- we run an orbital optimization for this wf
+- we do a new cipsi step to double the number of determinants in the wf
+- we run an orbital optimization for this wf
+- ...
+- we do that until the energy difference between (2) and (3) is
+  smaller than the targeted accuracy for the cipsi (targeted_accuracy_cipsi in qp edit)
+  or the wf is larger than a given size (n_det_max_opt in qp_edit)
+- after that you can reset your determinants (qp reset -d) and run a clean Cipsi calculation
+
+### End of the optimization
+You can choose the number of determinants after which the
+optimization will stop:
+```
+qp set orbital_optimization n_det_max_opt 1e5 # or any number
+```
+## Weight of the states
+You can change the weights of the different states directly in qp edit.
+It will affect the weights used in the orbital optimization.
+
+# Tests
+To run the tests:
+```
+qp test
+```
+
+# Org files
+The org files are stored in the directory org in order to avoid overwriting user changes.
+The org files can be modified; to export the changes to the source code, run
+```
+./TANGLE_org_mode.sh
+mv *.irp.f ../.
+``` + diff --git a/src/mo_optimization/constants.h b/src/mo_optimization_utils/constants.h similarity index 100% rename from src/mo_optimization/constants.h rename to src/mo_optimization_utils/constants.h diff --git a/src/mo_optimization/diagonal_hessian_list_opt.irp.f b/src/mo_optimization_utils/diagonal_hessian_list_opt.irp.f similarity index 100% rename from src/mo_optimization/diagonal_hessian_list_opt.irp.f rename to src/mo_optimization_utils/diagonal_hessian_list_opt.irp.f diff --git a/src/mo_optimization/diagonal_hessian_opt.irp.f b/src/mo_optimization_utils/diagonal_hessian_opt.irp.f similarity index 100% rename from src/mo_optimization/diagonal_hessian_opt.irp.f rename to src/mo_optimization_utils/diagonal_hessian_opt.irp.f diff --git a/src/mo_optimization/diagonalization_hessian.irp.f b/src/mo_optimization_utils/diagonalization_hessian.irp.f similarity index 100% rename from src/mo_optimization/diagonalization_hessian.irp.f rename to src/mo_optimization_utils/diagonalization_hessian.irp.f diff --git a/src/mo_optimization/first_diagonal_hessian_list_opt.irp.f b/src/mo_optimization_utils/first_diagonal_hessian_list_opt.irp.f similarity index 100% rename from src/mo_optimization/first_diagonal_hessian_list_opt.irp.f rename to src/mo_optimization_utils/first_diagonal_hessian_list_opt.irp.f diff --git a/src/mo_optimization/first_diagonal_hessian_opt.irp.f b/src/mo_optimization_utils/first_diagonal_hessian_opt.irp.f similarity index 100% rename from src/mo_optimization/first_diagonal_hessian_opt.irp.f rename to src/mo_optimization_utils/first_diagonal_hessian_opt.irp.f diff --git a/src/mo_optimization/first_gradient_list_opt.irp.f b/src/mo_optimization_utils/first_gradient_list_opt.irp.f similarity index 100% rename from src/mo_optimization/first_gradient_list_opt.irp.f rename to src/mo_optimization_utils/first_gradient_list_opt.irp.f diff --git a/src/mo_optimization/first_gradient_opt.irp.f b/src/mo_optimization_utils/first_gradient_opt.irp.f similarity index 100% rename from src/mo_optimization/first_gradient_opt.irp.f rename to src/mo_optimization_utils/first_gradient_opt.irp.f diff --git a/src/mo_optimization/first_hessian_list_opt.irp.f b/src/mo_optimization_utils/first_hessian_list_opt.irp.f similarity index 100% rename from src/mo_optimization/first_hessian_list_opt.irp.f rename to src/mo_optimization_utils/first_hessian_list_opt.irp.f diff --git a/src/mo_optimization/first_hessian_opt.irp.f b/src/mo_optimization_utils/first_hessian_opt.irp.f similarity index 100% rename from src/mo_optimization/first_hessian_opt.irp.f rename to src/mo_optimization_utils/first_hessian_opt.irp.f diff --git a/src/mo_optimization/gradient_list_opt.irp.f b/src/mo_optimization_utils/gradient_list_opt.irp.f similarity index 100% rename from src/mo_optimization/gradient_list_opt.irp.f rename to src/mo_optimization_utils/gradient_list_opt.irp.f diff --git a/src/mo_optimization/gradient_opt.irp.f b/src/mo_optimization_utils/gradient_opt.irp.f similarity index 100% rename from src/mo_optimization/gradient_opt.irp.f rename to src/mo_optimization_utils/gradient_opt.irp.f diff --git a/src/mo_optimization/hessian_list_opt.irp.f b/src/mo_optimization_utils/hessian_list_opt.irp.f similarity index 100% rename from src/mo_optimization/hessian_list_opt.irp.f rename to src/mo_optimization_utils/hessian_list_opt.irp.f diff --git a/src/mo_optimization/hessian_opt.irp.f b/src/mo_optimization_utils/hessian_opt.irp.f similarity index 100% rename from src/mo_optimization/hessian_opt.irp.f rename to 
src/mo_optimization_utils/hessian_opt.irp.f diff --git a/src/mo_optimization/org/TODO.org b/src/mo_optimization_utils/org/TODO.org similarity index 100% rename from src/mo_optimization/org/TODO.org rename to src/mo_optimization_utils/org/TODO.org diff --git a/src/mo_optimization/org/debug_gradient_list_opt.org b/src/mo_optimization_utils/org/debug_gradient_list_opt.org similarity index 100% rename from src/mo_optimization/org/debug_gradient_list_opt.org rename to src/mo_optimization_utils/org/debug_gradient_list_opt.org diff --git a/src/mo_optimization/org/debug_gradient_opt.org b/src/mo_optimization_utils/org/debug_gradient_opt.org similarity index 100% rename from src/mo_optimization/org/debug_gradient_opt.org rename to src/mo_optimization_utils/org/debug_gradient_opt.org diff --git a/src/mo_optimization/org/debug_hessian_list_opt.org b/src/mo_optimization_utils/org/debug_hessian_list_opt.org similarity index 100% rename from src/mo_optimization/org/debug_hessian_list_opt.org rename to src/mo_optimization_utils/org/debug_hessian_list_opt.org diff --git a/src/mo_optimization/org/debug_hessian_opt.org b/src/mo_optimization_utils/org/debug_hessian_opt.org similarity index 100% rename from src/mo_optimization/org/debug_hessian_opt.org rename to src/mo_optimization_utils/org/debug_hessian_opt.org diff --git a/src/mo_optimization/org/diagonal_hessian_list_opt.org b/src/mo_optimization_utils/org/diagonal_hessian_list_opt.org similarity index 100% rename from src/mo_optimization/org/diagonal_hessian_list_opt.org rename to src/mo_optimization_utils/org/diagonal_hessian_list_opt.org diff --git a/src/mo_optimization/org/diagonal_hessian_opt.org b/src/mo_optimization_utils/org/diagonal_hessian_opt.org similarity index 100% rename from src/mo_optimization/org/diagonal_hessian_opt.org rename to src/mo_optimization_utils/org/diagonal_hessian_opt.org diff --git a/src/mo_optimization/org/diagonalization_hessian.org b/src/mo_optimization_utils/org/diagonalization_hessian.org similarity index 100% rename from src/mo_optimization/org/diagonalization_hessian.org rename to src/mo_optimization_utils/org/diagonalization_hessian.org diff --git a/src/mo_optimization/org/first_diagonal_hessian_list_opt.org b/src/mo_optimization_utils/org/first_diagonal_hessian_list_opt.org similarity index 100% rename from src/mo_optimization/org/first_diagonal_hessian_list_opt.org rename to src/mo_optimization_utils/org/first_diagonal_hessian_list_opt.org diff --git a/src/mo_optimization/org/first_diagonal_hessian_opt.org b/src/mo_optimization_utils/org/first_diagonal_hessian_opt.org similarity index 100% rename from src/mo_optimization/org/first_diagonal_hessian_opt.org rename to src/mo_optimization_utils/org/first_diagonal_hessian_opt.org diff --git a/src/mo_optimization/org/first_gradient_list_opt.org b/src/mo_optimization_utils/org/first_gradient_list_opt.org similarity index 100% rename from src/mo_optimization/org/first_gradient_list_opt.org rename to src/mo_optimization_utils/org/first_gradient_list_opt.org diff --git a/src/mo_optimization/org/first_gradient_opt.org b/src/mo_optimization_utils/org/first_gradient_opt.org similarity index 100% rename from src/mo_optimization/org/first_gradient_opt.org rename to src/mo_optimization_utils/org/first_gradient_opt.org diff --git a/src/mo_optimization/org/first_hessian_list_opt.org b/src/mo_optimization_utils/org/first_hessian_list_opt.org similarity index 100% rename from src/mo_optimization/org/first_hessian_list_opt.org rename to 
src/mo_optimization_utils/org/first_hessian_list_opt.org diff --git a/src/mo_optimization/org/first_hessian_opt.org b/src/mo_optimization_utils/org/first_hessian_opt.org similarity index 100% rename from src/mo_optimization/org/first_hessian_opt.org rename to src/mo_optimization_utils/org/first_hessian_opt.org diff --git a/src/mo_optimization/org/gradient_list_opt.org b/src/mo_optimization_utils/org/gradient_list_opt.org similarity index 100% rename from src/mo_optimization/org/gradient_list_opt.org rename to src/mo_optimization_utils/org/gradient_list_opt.org diff --git a/src/mo_optimization/org/gradient_opt.org b/src/mo_optimization_utils/org/gradient_opt.org similarity index 100% rename from src/mo_optimization/org/gradient_opt.org rename to src/mo_optimization_utils/org/gradient_opt.org diff --git a/src/mo_optimization/org/hessian_list_opt.org b/src/mo_optimization_utils/org/hessian_list_opt.org similarity index 100% rename from src/mo_optimization/org/hessian_list_opt.org rename to src/mo_optimization_utils/org/hessian_list_opt.org diff --git a/src/mo_optimization/org/hessian_opt.org b/src/mo_optimization_utils/org/hessian_opt.org similarity index 100% rename from src/mo_optimization/org/hessian_opt.org rename to src/mo_optimization_utils/org/hessian_opt.org diff --git a/src/mo_optimization/org/my_providers.org b/src/mo_optimization_utils/org/my_providers.org similarity index 100% rename from src/mo_optimization/org/my_providers.org rename to src/mo_optimization_utils/org/my_providers.org diff --git a/src/mo_optimization/org/optimization.org b/src/mo_optimization_utils/org/optimization.org similarity index 100% rename from src/mo_optimization/org/optimization.org rename to src/mo_optimization_utils/org/optimization.org diff --git a/src/mo_optimization/org/orb_opt_trust_v2.org b/src/mo_optimization_utils/org/orb_opt_trust_v2.org similarity index 100% rename from src/mo_optimization/org/orb_opt_trust_v2.org rename to src/mo_optimization_utils/org/orb_opt_trust_v2.org diff --git a/src/mo_optimization/org/state_average_energy.org b/src/mo_optimization_utils/org/state_average_energy.org similarity index 100% rename from src/mo_optimization/org/state_average_energy.org rename to src/mo_optimization_utils/org/state_average_energy.org diff --git a/src/mo_optimization/org/state_weight_normalization.org b/src/mo_optimization_utils/org/state_weight_normalization.org similarity index 100% rename from src/mo_optimization/org/state_weight_normalization.org rename to src/mo_optimization_utils/org/state_weight_normalization.org diff --git a/src/mo_optimization/org/update_parameters.org b/src/mo_optimization_utils/org/update_parameters.org similarity index 100% rename from src/mo_optimization/org/update_parameters.org rename to src/mo_optimization_utils/org/update_parameters.org diff --git a/src/mo_optimization/org/update_st_av_ci_energy.org b/src/mo_optimization_utils/org/update_st_av_ci_energy.org similarity index 100% rename from src/mo_optimization/org/update_st_av_ci_energy.org rename to src/mo_optimization_utils/org/update_st_av_ci_energy.org diff --git a/src/mo_optimization_utils/routine_opt_mos.irp.f b/src/mo_optimization_utils/routine_opt_mos.irp.f new file mode 100644 index 00000000..fceba2c5 --- /dev/null +++ b/src/mo_optimization_utils/routine_opt_mos.irp.f @@ -0,0 +1,81 @@ + +subroutine run_optimization_mos_CIPSI + + implicit none + + double precision :: e_cipsi, e_opt, delta_e + double precision, allocatable :: Ev(:),PT2(:) + integer :: nb_iter,i + logical :: not_converged + character 
(len=100) :: filename
+
+  PROVIDE psi_det psi_coef mo_two_e_integrals_in_map ao_pseudo_integrals
+  allocate(Ev(N_states),PT2(N_states))
+
+  not_converged = .True.
+  nb_iter = 0
+
+  ! To start from the wf
+  N_det_max = max(n_det,5)
+  TOUCH N_det_max
+
+  open(unit=10, file=trim(ezfio_filename)//'/mo_optimization/result_opt')
+  write(10,*) " Ndet        E_cipsi         E_opt          Delta_e"
+  call state_average_energy(e_cipsi)
+  write(10,'(I10, 3F15.7)') n_det, e_cipsi, e_cipsi, 0d0
+  close(10)
+
+  do while (not_converged)
+      print*,''
+      print*,'======================'
+      print*,' Cipsi step:', nb_iter
+      print*,'======================'
+      print*,''
+      print*,'********** cipsi step **********'
+      ! cipsi calculation
+      call run_stochastic_cipsi(Ev,PT2)
+
+      ! State average energy after the cipsi step
+      call state_average_energy(e_cipsi)
+
+      print*,''
+      print*,'********** optimization step **********'
+      ! orbital optimization
+      call run_orb_opt_trust_v2
+
+      ! State average energy after the orbital optimization
+      call state_average_energy(e_opt)
+
+      print*,''
+      print*,'********** diff step **********'
+      ! Gain in energy
+      delta_e = e_opt - e_cipsi
+      print*, 'Gain in energy during the orbital optimization:', delta_e
+
+      open(unit=10, file=trim(ezfio_filename)//'/mo_optimization/result_opt', position='append')
+      write(10,'(I10, 3F15.7)') n_det, e_cipsi, e_opt, delta_e
+      close(10)
+
+      ! Exit
+      if (delta_e > 1d-12) then
+          print*, 'WARNING, something went wrong'
+          print*, 'The gain (delta_e) in energy during the optimization process'
+          print*, 'is > 0, but it must be < 0'
+          print*, 'The program will exit'
+          exit
+      endif
+
+      if (n_det > n_det_max_opt) then
+          print*, 'The number of determinants in the wf > n_det_max_opt'
+          print*, 'The program will exit'
+          exit
+      endif
+
+      ! To roughly double the number of determinants in the wf (factor 1.8)
+      N_det_max = int(dble(n_det * 2)*0.9)
+      TOUCH N_det_max
+
+      nb_iter = nb_iter + 1
+  enddo
+
+end
diff --git a/src/mo_optimization/run_orb_opt_trust_v2.irp.f b/src/mo_optimization_utils/run_orb_opt_trust_v2.irp.f
similarity index 100%
rename from src/mo_optimization/run_orb_opt_trust_v2.irp.f
rename to src/mo_optimization_utils/run_orb_opt_trust_v2.irp.f
diff --git a/src/mo_optimization/save_energy.irp.f b/src/mo_optimization_utils/save_energy.irp.f
similarity index 100%
rename from src/mo_optimization/save_energy.irp.f
rename to src/mo_optimization_utils/save_energy.irp.f
diff --git a/src/mo_optimization/state_average_energy.irp.f b/src/mo_optimization_utils/state_average_energy.irp.f
similarity index 100%
rename from src/mo_optimization/state_average_energy.irp.f
rename to src/mo_optimization_utils/state_average_energy.irp.f
diff --git a/src/mo_optimization/state_weight_normalization.irp.f b/src/mo_optimization_utils/state_weight_normalization.irp.f
similarity index 100%
rename from src/mo_optimization/state_weight_normalization.irp.f
rename to src/mo_optimization_utils/state_weight_normalization.irp.f
diff --git a/src/mo_optimization/update_parameters.irp.f b/src/mo_optimization_utils/update_parameters.irp.f
similarity index 100%
rename from src/mo_optimization/update_parameters.irp.f
rename to src/mo_optimization_utils/update_parameters.irp.f
diff --git a/src/mo_optimization/update_st_av_ci_energy.irp.f b/src/mo_optimization_utils/update_st_av_ci_energy.irp.f
similarity index 100%
rename from src/mo_optimization/update_st_av_ci_energy.irp.f
rename to src/mo_optimization_utils/update_st_av_ci_energy.irp.f
diff --git a/src/mo_two_e_ints/cholesky.irp.f b/src/mo_two_e_ints/cholesky.irp.f
index 7e2c8b37..1fed949d 100644
--- a/src/mo_two_e_ints/cholesky.irp.f
+++ b/src/mo_two_e_ints/cholesky.irp.f
@@ -101,3 +101,34 @@ BEGIN_PROVIDER [ double precision, cholesky_mo_transp, (cholesky_mo_num, mo_num,
 END_PROVIDER
 
+
+BEGIN_PROVIDER [ double precision, cholesky_semi_mo_transp_simple, (cholesky_mo_num, ao_num, mo_num) ]
+ implicit none
+ BEGIN_DOC
+ ! Semi-transformed Cholesky vectors: one index in the AO basis, one in the MO basis
+ END_DOC
+
+ double precision, allocatable :: X(:,:,:)
+ double precision :: wall0, wall1
+ integer :: ierr
+ print *, 'Semi AO->MO Transformation of Cholesky vectors'
+ call wall_time(wall0)
+
+ allocate(X(mo_num,cholesky_mo_num,ao_num), stat=ierr)
+ if (ierr /= 0) then
+   print *, irp_here, ': Allocation failed'
+ endif
+ integer :: i_chol, i_mo, j_mo, i_ao
+ cholesky_semi_mo_transp_simple = 0.d0
+ do i_mo = 1, mo_num
+  do i_ao = 1, ao_num
+   do j_mo = 1, mo_num
+    do i_chol = 1, cholesky_mo_num
+     cholesky_semi_mo_transp_simple(i_chol, i_ao,i_mo) += cholesky_mo_transp(i_chol,j_mo,i_mo) * mo_coef_transp(j_mo,i_ao)
+    enddo
+   enddo
+  enddo
+ enddo
+
+END_PROVIDER
+
diff --git a/src/mo_two_e_ints/map_integrals.irp.f b/src/mo_two_e_ints/map_integrals.irp.f
index 168c34b4..eeb4279f 100644
--- a/src/mo_two_e_ints/map_integrals.irp.f
+++ b/src/mo_two_e_ints/map_integrals.irp.f
@@ -40,7 +40,7 @@ end
 ! Min and max values of the MOs for which the integrals are in the cache
 END_DOC
 
-  mo_integrals_cache_size = 2_8**mo_integrals_cache_shift
+  mo_integrals_cache_size = 2**mo_integrals_cache_shift
 
  mo_integrals_cache_min = max(1,elec_alpha_num - (mo_integrals_cache_size/2 - 1) )
  mo_integrals_cache_max = min(mo_num, mo_integrals_cache_min + mo_integrals_cache_size - 1)
diff --git a/src/mu_of_r/f_hf_cholesky.irp.f b/src/mu_of_r/f_hf_cholesky.irp.f
index 5dd69eb6..179b80dd 100644
--- a/src/mu_of_r/f_hf_cholesky.irp.f
+++ b/src/mu_of_r/f_hf_cholesky.irp.f
@@ -289,6 +289,106 @@ BEGIN_PROVIDER [ double precision, f_hf_cholesky_sparse, (n_points_final_grid)]
  endif
 END_PROVIDER
 
+BEGIN_PROVIDER [ double precision, f_hf_cholesky_sparse_bis, (n_points_final_grid)]
+ implicit none
+ integer :: ipoint,m,mm,i,ii,p
+ !!f(R) = \sum_{I} \sum_{J} Phi_I(R) Phi_J(R) V_IJ
+ !!     = \sum_{I}\sum_{J}\sum_A Phi_I(R) Phi_J(R) V_AI V_AJ
+ !!     = \sum_A \sum_{I}Phi_I(R)V_AI \sum_{J}V_AJ Phi_J(R)
+ !!     = \sum_A V_AR G_AR
+ !! V_AR = \sum_{I}Phi_IR V_AI = \sum_{I}Phi^t_RI V_AI
+ double precision :: u_dot_v,wall0,wall1,accu_1, accu_2,mo_i_r1,mo_b_r1
+ double precision :: thresh_1,thresh_2
+ double precision, allocatable :: accu_vec(:),delta_vec(:)
+ thresh_2 = ao_cholesky_threshold * 100.d0
+ thresh_1 = dsqrt(thresh_2)
+ provide cholesky_mo_transp
+ if(elec_alpha_num == elec_beta_num)then
+  call wall_time(wall0)
+  !$OMP PARALLEL DEFAULT(NONE) &
+  !$OMP PRIVATE (accu_vec,ipoint,p,ii,i,mm,m,mo_i_r1,mo_b_r1) &
+  !$OMP SHARED (n_occ_val_orb_for_hf,list_valence_orb_for_hf,mos_in_r_array_omp,aos_in_r_array,thresh_1,thresh_2) &
+  !$OMP SHARED (cholesky_mo_num,f_hf_cholesky_sparse_bis,n_points_final_grid,cholesky_semi_mo_transp_simple,ao_num)
+  allocate(accu_vec(cholesky_mo_num))
+  !$OMP DO
+  do ipoint = 1, n_points_final_grid
+   f_hf_cholesky_sparse_bis(ipoint) = 0.d0
+   accu_vec = 0.d0
+   do ii = 1, n_occ_val_orb_for_hf(1)
+    i = list_valence_orb_for_hf(ii,1)
+    mo_i_r1 = mos_in_r_array_omp(i,ipoint)
+    if(dabs(mo_i_r1).lt.thresh_1)cycle
+    do mm = 1, ao_num ! electron 1
+     mo_b_r1 = aos_in_r_array(mm,ipoint)*mo_i_r1
+     if(dabs(mo_b_r1).lt.thresh_2)cycle
+     do p = 1, cholesky_mo_num
+      accu_vec(p) = accu_vec(p) + mo_b_r1 * cholesky_semi_mo_transp_simple(p,mm,i)
+     enddo
+    enddo
+   enddo
+   do p = 1, cholesky_mo_num
+    f_hf_cholesky_sparse_bis(ipoint) = f_hf_cholesky_sparse_bis(ipoint) + accu_vec(p) * accu_vec(p)
+   enddo
+   f_hf_cholesky_sparse_bis(ipoint) *= 2.D0
+  enddo
+  !$OMP END DO
+  deallocate(accu_vec)
+  !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print*,'Time to provide f_hf_cholesky_sparse_bis = ',wall1-wall0
+ else
+  call wall_time(wall0)
+  !$OMP PARALLEL DEFAULT(NONE) &
+  !$OMP PRIVATE (accu_vec,delta_vec,ipoint,p,ii,i,mm,m,mo_i_r1,mo_b_r1) &
+  !$OMP SHARED (n_occ_val_orb_for_hf,list_valence_orb_for_hf,list_basis,mos_in_r_array_omp,thresh_1,thresh_2) &
+  !$OMP SHARED (cholesky_mo_num,f_hf_cholesky_sparse_bis,n_points_final_grid,cholesky_mo_transp,n_basis_orb)
+  allocate(accu_vec(cholesky_mo_num),delta_vec(cholesky_mo_num))
+  !$OMP DO
+  do ipoint = 1, n_points_final_grid
+   f_hf_cholesky_sparse_bis(ipoint) = 0.d0
+   accu_vec = 0.d0
+   do ii = 1, n_occ_val_orb_for_hf(2)
+    i = list_valence_orb_for_hf(ii,2)
+    mo_i_r1 = mos_in_r_array_omp(i,ipoint)
+    if(dabs(mo_i_r1).lt.thresh_1)cycle
+    do mm = 1, n_basis_orb ! electron 1
+     m = list_basis(mm)
+     mo_b_r1 = mos_in_r_array_omp(m,ipoint)
+     if(dabs(mo_i_r1*mo_b_r1).lt.thresh_2)cycle
+     do p = 1, cholesky_mo_num
+      accu_vec(p) = accu_vec(p) + mo_i_r1 * mo_b_r1 * cholesky_mo_transp(p,m,i)
+     enddo
+    enddo
+   enddo
+   delta_vec = 0.d0
+   do ii = n_occ_val_orb_for_hf(2)+1,n_occ_val_orb_for_hf(1)
+    i = list_valence_orb_for_hf(ii,1)
+    mo_i_r1 = mos_in_r_array_omp(i,ipoint)
+    if(dabs(mo_i_r1).lt.thresh_1)cycle
+    do mm = 1, n_basis_orb ! electron 1
+     m = list_basis(mm)
+     mo_b_r1 = mos_in_r_array_omp(m,ipoint)
+     if(dabs(mo_i_r1*mo_b_r1).lt.thresh_2)cycle
+     do p = 1, cholesky_mo_num
+      delta_vec(p) = delta_vec(p) + mo_i_r1 * mo_b_r1 * cholesky_mo_transp(p,m,i)
+     enddo
+    enddo
+   enddo
+   do p = 1, cholesky_mo_num
+    f_hf_cholesky_sparse_bis(ipoint) = f_hf_cholesky_sparse_bis(ipoint) + accu_vec(p) * accu_vec(p) + accu_vec(p) * delta_vec(p)
+   enddo
+   f_hf_cholesky_sparse_bis(ipoint) *= 2.D0
+  enddo
+  !$OMP END DO
+  deallocate(accu_vec,delta_vec)
+  !$OMP END PARALLEL
+  call wall_time(wall1)
+  print*,'Time to provide f_hf_cholesky_sparse_bis = ',wall1-wall0
+ endif
+END_PROVIDER
+
+
 BEGIN_PROVIDER [ double precision, on_top_hf_grid, (n_points_final_grid)]
 implicit none
 integer :: ipoint,i,ii
diff --git a/src/mu_of_r/mu_of_r_mean_field.irp.f b/src/mu_of_r/mu_of_r_mean_field.irp.f
new file mode 100644
index 00000000..6abc7e4f
--- /dev/null
+++ b/src/mu_of_r/mu_of_r_mean_field.irp.f
@@ -0,0 +1,171 @@
+BEGIN_PROVIDER [ double precision, two_e_int_mf, (elec_beta_num,elec_alpha_num,elec_beta_num,elec_alpha_num)]
+ implicit none
+ integer :: i,j,k,l
+ double precision :: get_two_e_integral
+ do i = 1, elec_alpha_num
+  do j = 1, elec_beta_num
+   do k = 1, elec_alpha_num
+    do l = 1, elec_beta_num
+     two_e_int_mf(l,k,j,i) = get_two_e_integral(l,k,j,i,mo_integrals_map)
+    enddo
+   enddo
+  enddo
+ enddo
+END_PROVIDER
+
+subroutine get_f_mf_ab(r,f_mf_ab,two_bod_dens, dm_a, dm_b)
+ implicit none
+ double precision, intent(in)  :: r(3)
+ double precision, intent(out) :: f_mf_ab,two_bod_dens, dm_a, dm_b
+ double precision, allocatable :: mos_array_r(:),mos_array_a(:), mos_array_b(:)
+ integer :: i,j,k,l
+ allocate(mos_array_r(mo_num), mos_array_a(elec_alpha_num), mos_array_b(elec_beta_num))
+ call give_all_mos_at_r(r,mos_array_r)
+ do i = 1, elec_alpha_num
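+  ! only the orbitals occupied in the HF determinant contribute:
+  ! the mean-field f(r) and the densities are sums over occupied MOs
+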
mos_array_a(i) = mos_array_r(i) + enddo + do i = 1, elec_beta_num + mos_array_b(i) = mos_array_r(i) + enddo + + dm_a = 0.d0 + do i = 1, elec_alpha_num + dm_a += mos_array_a(i) * mos_array_a(i) + enddo + + dm_b = 0.d0 + do i = 1, elec_beta_num + dm_b += mos_array_b(i) * mos_array_b(i) + enddo + two_bod_dens = dm_a * dm_b + + f_mf_ab = 0.d0 + do i = 1, elec_alpha_num + do j = 1, elec_beta_num + do k = 1, elec_alpha_num + do l = 1, elec_beta_num + f_mf_ab += two_e_int_mf(l,k,j,i) * mos_array_a(i) * mos_array_a(k) * mos_array_b(j) * mos_array_b(l) + enddo + enddo + enddo + enddo + ! multiply by two to adapt to the N(N-1) normalization condition of the active two-rdm + f_mf_ab *= 2.d0 + two_bod_dens *= 2.d0 + +end + +subroutine get_grad_f_mf_ab(r,grad_f_mf_ab, grad_two_bod_dens,f_mf_ab,two_bod_dens, dm_a, dm_b,grad_dm_a, grad_dm_b) + implicit none + double precision, intent(in) :: r(3) + double precision, intent(out) :: f_mf_ab, two_bod_dens + double precision, intent(out) :: grad_two_bod_dens(3), grad_f_mf_ab(3) + double precision, intent(out) :: dm_a, dm_b, grad_dm_a(3), grad_dm_b(3) + + double precision, allocatable :: mos_array_r(:), mos_grad_array_r(:,:) + double precision, allocatable :: mos_array_a(:), mos_array_b(:) + double precision, allocatable :: mos_grad_array_a(:,:), mos_grad_array_b(:,:) + double precision :: mo_i, mo_j, mo_k, mo_l + double precision :: grad_mo_i(3), grad_mo_j(3), grad_mo_k(3), grad_mo_l(3) + + integer :: i,j,k,l + allocate(mos_array_r(mo_num),mos_grad_array_r(3,mo_num)) + allocate(mos_array_a(elec_alpha_num), mos_array_b(elec_beta_num)) + allocate(mos_grad_array_a(3,elec_alpha_num), mos_grad_array_b(3,elec_beta_num)) + call give_all_mos_and_grad_at_r(r,mos_array_r,mos_grad_array_r) + do i = 1, elec_alpha_num + mos_array_a(i) = mos_array_r(i) + mos_grad_array_a(1:3,i) = mos_grad_array_r(1:3,i) + enddo + do i = 1, elec_beta_num + mos_array_b(i) = mos_array_r(i) + mos_grad_array_b(1:3,i) = mos_grad_array_r(1:3,i) + enddo + + ! ALPHA DENSITY AND GRADIENT + dm_a = 0.d0 + grad_dm_a = 0.d0 + do i = 1, elec_alpha_num + dm_a += mos_array_a(i) * mos_array_a(i) + grad_dm_a(1:3) += 2.d0 * mos_array_a(i) * mos_grad_array_a(1:3,i) + enddo + + ! BETA DENSITY AND GRADIENT + dm_b = 0.d0 + grad_dm_b = 0.d0 + do i = 1, elec_beta_num + dm_b += mos_array_b(i) * mos_array_b(i) + grad_dm_b(1:3) += 2.d0 * mos_array_b(i) * mos_grad_array_b(1:3,i) + enddo + ! TWO-BODY DENSITY AND GRADIENT + two_bod_dens = dm_a * dm_b + grad_two_bod_dens(1:3) = dm_a * grad_dm_b(1:3) + dm_b * grad_dm_a(1:3) + + ! 
F_MF and GRADIENT + grad_f_mf_ab = 0.d0 + f_mf_ab = 0.d0 + do i = 1, elec_alpha_num + mo_i = mos_array_a(i) + grad_mo_i(1:3) = mos_grad_array_a(1:3,i) + do j = 1, elec_beta_num + mo_j = mos_array_b(j) + grad_mo_j(1:3) = mos_grad_array_b(1:3,j) + do k = 1, elec_alpha_num + mo_k = mos_array_a(k) + grad_mo_k(1:3) = mos_grad_array_a(1:3,k) + do l = 1, elec_beta_num + mo_l = mos_array_b(l) + grad_mo_l(1:3) = mos_grad_array_b(1:3,l) + f_mf_ab += two_e_int_mf(l,k,j,i) * mo_i * mo_j * mo_k * mo_l + grad_f_mf_ab(1:3) += two_e_int_mf(l,k,j,i) * & + (mo_i * mo_j * mo_k * grad_mo_l(1:3) + mo_i * mo_j * grad_mo_k(1:3) * mo_l & + +mo_i * grad_mo_j(1:3) * mo_k * mo_l + grad_mo_i(1:3) * mo_j * mo_k * mo_l) + enddo + enddo + enddo + enddo + + f_mf_ab *= 2.d0 + two_bod_dens *= 2.d0 + grad_f_mf_ab *= 2.D0 + grad_two_bod_dens *= 2.d0 +end + +subroutine mu_of_r_mean_field(r,mu_mf, dm) + implicit none + include 'constants.include.F' + double precision, intent(in) :: r(3) + double precision, intent(out):: mu_mf, dm + double precision :: f_mf_ab,two_bod_dens, dm_a, dm_b + call get_f_mf_ab(r,f_mf_ab,two_bod_dens, dm_a, dm_b) + dm = dm_a + dm_b + if(dabs(two_bod_dens).lt.1.d-10)then + mu_mf = 1.d+10 + else + mu_mf = 0.5d0 * sqpi * f_mf_ab/two_bod_dens + endif +end + +subroutine grad_mu_of_r_mean_field(r,mu_mf, dm, grad_mu_mf, grad_dm) + implicit none + include 'constants.include.F' + double precision, intent(in) :: r(3) + double precision, intent(out):: grad_mu_mf(3), grad_dm(3) + double precision, intent(out):: mu_mf, dm + double precision :: grad_f_mf_ab(3), grad_two_bod_dens(3),grad_dm_a(3), grad_dm_b(3) + double precision :: f_mf_ab,two_bod_dens, dm_a, dm_b + call get_grad_f_mf_ab(r,grad_f_mf_ab, grad_two_bod_dens,f_mf_ab,two_bod_dens, dm_a, dm_b,grad_dm_a, grad_dm_b) + + dm = dm_a + dm_b + grad_dm(1:3) = grad_dm_a(1:3) + grad_dm_b(1:3) + + if(dabs(two_bod_dens).lt.1.d-10)then + mu_mf = 1.d+10 + grad_mu_mf = 0.d0 + else + mu_mf = 0.5d0 * sqpi * f_mf_ab/two_bod_dens + grad_mu_mf(1:3) = 0.5d0 * sqpi * (grad_f_mf_ab(1:3) * two_bod_dens - f_mf_ab * grad_two_bod_dens(1:3))& + /(two_bod_dens*two_bod_dens) + endif + +end diff --git a/src/mu_of_r/test_proj_op.irp.f b/src/mu_of_r/test_proj_op.irp.f index f9aba094..cf53c772 100644 --- a/src/mu_of_r/test_proj_op.irp.f +++ b/src/mu_of_r/test_proj_op.irp.f @@ -15,7 +15,162 @@ program projected_operators ! call test_f_HF_valence_ab ! call routine_full_mos ! call test_f_ii_valence_ab - call test_f_ia_valence_ab - call test_f_ii_ia_aa_valence_ab +! call test_f_ia_valence_ab +! call test_f_ii_ia_aa_valence_ab +! call test +! call test_f_mean_field +! 
call test_grad_f_mean_field + call test_grad_mu_mf end + +subroutine test + implicit none + integer :: i_point + double precision :: ref, new, accu, weight + accu = 0.d0 + do i_point = 1, n_points_final_grid + ref = f_hf_cholesky_sparse(i_point) + new = f_hf_cholesky_sparse_bis(i_point) + weight = final_weight_at_r_vector(i_point) + accu += dabs(ref - new) * weight + enddo + print*,'accu = ',accu + +end + +subroutine test_f_mean_field + implicit none + integer :: i_point + double precision :: weight,r(3) + double precision :: ref_f, new_f, accu_f + double precision :: ref_two_dens, new_two_dens, accu_two_dens, dm_a, dm_b + accu_f = 0.d0 + accu_two_dens = 0.d0 + do i_point = 1, n_points_final_grid + r(1:3) = final_grid_points(1:3,i_point) + weight = final_weight_at_r_vector(i_point) + call get_f_mf_ab(r,new_f,new_two_dens, dm_a, dm_b) + call f_HF_valence_ab(r,r,ref_f,ref_two_dens) + accu_f += weight * dabs(new_f- ref_f) + accu_two_dens += weight * dabs(new_two_dens - ref_two_dens) + enddo + print*,'accu_f = ',accu_f + print*,'accu_two_dens = ',accu_two_dens + +end + +subroutine test_grad_f_mean_field + implicit none + integer :: i_point,k + double precision :: weight,r(3) + double precision :: grad_f_mf_ab(3), grad_two_bod_dens(3) + double precision :: grad_dm_a(3), grad_dm_b(3) + double precision :: f_mf_ab,two_bod_dens, dm_a, dm_b + + double precision :: num_grad_f_mf_ab(3), num_grad_two_bod_dens(3) + double precision :: num_grad_dm_a(3), num_grad_dm_b(3) + double precision :: f_mf_ab_p,f_mf_ab_m + double precision :: two_bod_dens_p, two_bod_dens_m + double precision :: dm_a_p, dm_a_m + double precision :: dm_b_p, dm_b_m + double precision :: rbis(3), dr + double precision :: accu_grad_f_mf_ab(3),accu_grad_two_bod_dens(3) + double precision :: accu_grad_dm_a(3),accu_grad_dm_b(3) + double precision :: accu_f_mf_ab, accu_two_bod_dens, accu_dm_a, accu_dm_b + dr = 0.00001d0 + accu_f_mf_ab = 0.d0 + accu_two_bod_dens = 0.d0 + accu_dm_a = 0.d0 + accu_dm_b = 0.d0 + + accu_grad_f_mf_ab = 0.d0 + accu_grad_two_bod_dens = 0.d0 + accu_grad_dm_a = 0.d0 + accu_grad_dm_b = 0.d0 + do i_point = 1, n_points_final_grid + r(1:3) = final_grid_points(1:3,i_point) + weight = final_weight_at_r_vector(i_point) + call get_grad_f_mf_ab(r,grad_f_mf_ab, grad_two_bod_dens,f_mf_ab,two_bod_dens, dm_a, dm_b,grad_dm_a, grad_dm_b) + call get_f_mf_ab(r,f_mf_ab_p,two_bod_dens_p, dm_a_p, dm_b_p) + accu_f_mf_ab += weight * dabs(f_mf_ab - f_mf_ab_p) + accu_two_bod_dens += weight * dabs(two_bod_dens - two_bod_dens_p) + accu_dm_a += weight*dabs(dm_a - dm_a_p) + accu_dm_b += weight*dabs(dm_b - dm_b_p) + do k = 1, 3 + rbis = r + rbis(k) += dr + call get_f_mf_ab(rbis,f_mf_ab_p,two_bod_dens_p, dm_a_p, dm_b_p) + rbis = r + rbis(k) -= dr + call get_f_mf_ab(rbis,f_mf_ab_m,two_bod_dens_m, dm_a_m, dm_b_m) + num_grad_f_mf_ab(k) = (f_mf_ab_p - f_mf_ab_m)/(2.d0*dr) + num_grad_two_bod_dens(k) = (two_bod_dens_p - two_bod_dens_m)/(2.d0*dr) + num_grad_dm_a(k) = (dm_a_p - dm_a_m)/(2.d0*dr) + num_grad_dm_b(k) = (dm_b_p - dm_b_m)/(2.d0*dr) + enddo + do k = 1, 3 + accu_grad_f_mf_ab(k) += weight * dabs(grad_f_mf_ab(k) - num_grad_f_mf_ab(k)) + accu_grad_two_bod_dens(k) += weight * dabs(grad_two_bod_dens(k) - num_grad_two_bod_dens(k)) + accu_grad_dm_a(k) += weight * dabs(grad_dm_a(k) - num_grad_dm_a(k)) + accu_grad_dm_b(k) += weight * dabs(grad_dm_b(k) - num_grad_dm_b(k)) + enddo + enddo + print*,'accu_f_mf_ab = ',accu_f_mf_ab + print*,'accu_two_bod_dens = ',accu_two_bod_dens + print*,'accu_dm_a = ',accu_dm_a + print*,'accu_dm_b = ',accu_dm_b + 
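! below: integrated |analytical - centered finite-difference| errors for the
+ ! gradients (finite-difference step dr = 1d-5)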
+ print*,'accu_grad_f_mf_ab = '
+ print*,accu_grad_f_mf_ab
+ print*,'accu_grad_two_bod_dens = '
+ print*,accu_grad_two_bod_dens
+ print*,'accu_grad_dm_a = '
+ print*,accu_grad_dm_a
+ print*,'accu_grad_dm_b = '
+ print*,accu_grad_dm_b
+
+end
+
+subroutine test_grad_mu_mf
+ implicit none
+ integer :: i_point,k
+ double precision :: weight,r(3),rbis(3)
+ double precision :: mu_mf, dm,grad_mu_mf(3), grad_dm(3)
+ double precision :: mu_mf_p, mu_mf_m, dm_m, dm_p, num_grad_mu_mf(3),dr, num_grad_dm(3)
+ double precision :: accu_mu, accu_dm, accu_grad_dm(3), accu_grad_mu_mf(3)
+ dr = 0.00001d0
+ accu_grad_mu_mf = 0.d0
+ accu_mu = 0.d0
+ accu_grad_dm = 0.d0
+ accu_dm = 0.d0
+ do i_point = 1, n_points_final_grid
+  r(1:3) = final_grid_points(1:3,i_point)
+  weight = final_weight_at_r_vector(i_point)
+  call grad_mu_of_r_mean_field(r,mu_mf, dm, grad_mu_mf, grad_dm)
+  call mu_of_r_mean_field(r,mu_mf_p, dm_p)
+  accu_mu += weight*dabs(mu_mf_p - mu_mf)
+  accu_dm += weight*dabs(dm_p - dm)
+  do k = 1, 3
+   rbis = r
+   rbis(k) += dr
+   call mu_of_r_mean_field(rbis,mu_mf_p, dm_p)
+   rbis = r
+   rbis(k) -= dr
+   call mu_of_r_mean_field(rbis,mu_mf_m, dm_m)
+
+   num_grad_mu_mf(k) = (mu_mf_p - mu_mf_m)/(2.d0*dr)
+   num_grad_dm(k) = (dm_p - dm_m)/(2.d0*dr)
+  enddo
+  do k = 1, 3
+   accu_grad_dm(k)+= weight *dabs(num_grad_dm(k) - grad_dm(k))
+   accu_grad_mu_mf(k)+= weight *dabs(num_grad_mu_mf(k) - grad_mu_mf(k))
+  enddo
+ enddo
+ print*,'accu_mu = ',accu_mu
+ print*,'accu_dm = ',accu_dm
+ print*,'accu_grad_dm = '
+ print*, accu_grad_dm
+ print*,'accu_grad_mu_mf = '
+ print*, accu_grad_mu_mf
+
+end
diff --git a/src/tools/four_idx_transform.irp.f b/src/tools/four_idx_transform.irp.f
index 92e87cad..fc6bface 100644
--- a/src/tools/four_idx_transform.irp.f
+++ b/src/tools/four_idx_transform.irp.f
@@ -12,6 +12,9 @@ program four_idx_transform
 !
 END_DOC
 
+  if (do_mo_cholesky) then
+    stop 'Not implemented with Cholesky integrals'
+  endif
  io_mo_two_e_integrals = 'Write'
  SOFT_TOUCH io_mo_two_e_integrals
  if (.true.) then
diff --git a/src/trexio/export_trexio_routines.irp.f b/src/trexio/export_trexio_routines.irp.f
index 63630243..0eec68bd 100644
--- a/src/trexio/export_trexio_routines.irp.f
+++ b/src/trexio/export_trexio_routines.irp.f
@@ -557,7 +557,7 @@ subroutine export_trexio(update,full_path)
        do k=1,cholesky_ao_num
          do j=1,mo_num
            do i=1,mo_num
-             integral = cholesky_mo(i,j,k)
+             integral = cholesky_mo_transp(k,i,j)
              if (integral == 0.d0) cycle
              icount += 1_8
              chol_buffer(icount) = integral
diff --git a/src/trexio/import_trexio_integrals.irp.f b/src/trexio/import_trexio_integrals.irp.f
index 5a6b3c03..556ed7bc 100644
--- a/src/trexio/import_trexio_integrals.irp.f
+++ b/src/trexio/import_trexio_integrals.irp.f
@@ -28,7 +28,7 @@ subroutine run(f)
   integer(trexio_t), intent(in) :: f ! TREXIO file handle
   integer(trexio_exit_code) :: rc
 
-  integer ::i,j,k,l
+  integer :: i,j,k,l, iunit
   integer(8) :: m, n_integrals
   double precision :: integral
 
@@ -41,10 +41,12 @@ subroutine run(f)
   integer , allocatable :: Vi(:,:)
   double precision :: s
 
-! TODO:
-! - If Cholesky AO in trexio file, read cholesky ao vectors
-! - If Cholesky MO in trexio file, read cholesky mo vectors
-!
- If Cholesky MO not in trexio file, force do_cholesky_mo to False + integer*4 :: BUFSIZE + integer :: rank + double precision, allocatable :: tmp(:,:,:) + integer*8 :: offset, icount + + integer, external :: getUnitAndOpen if (trexio_has_nucleus_repulsion(f) == TREXIO_SUCCESS) then rc = trexio_read_nucleus_repulsion(f, s) @@ -120,45 +122,88 @@ subroutine run(f) rc = trexio_has_ao_2e_int(f) PROVIDE ao_num if (rc /= TREXIO_HAS_NOT) then - PROVIDE ao_integrals_map - integer*4 :: BUFSIZE - BUFSIZE=ao_num**2 - allocate(buffer_i(BUFSIZE), buffer_values(BUFSIZE)) - allocate(Vi(4,BUFSIZE), V(BUFSIZE)) + rc = trexio_has_ao_2e_int_eri_cholesky(f) + if (rc /= TREXIO_HAS_NOT) then - integer*8 :: offset, icount + rc = trexio_read_ao_2e_int_eri_cholesky_num(f, rank) + call trexio_assert(rc, TREXIO_SUCCESS) - offset = 0_8 - icount = BUFSIZE - rc = TREXIO_SUCCESS - do while (icount == size(V)) - rc = trexio_read_ao_2e_int_eri(f, offset, icount, Vi, V) - do m=1,icount - i = Vi(1,m) - j = Vi(2,m) - k = Vi(3,m) - l = Vi(4,m) - integral = V(m) - call two_e_integrals_index(i, j, k, l, buffer_i(m) ) - buffer_values(m) = integral - enddo - call insert_into_ao_integrals_map(int(icount,4),buffer_i,buffer_values) - offset = offset + icount - if (rc /= TREXIO_SUCCESS) then - exit - endif - end do - n_integrals = offset + allocate(tmp(ao_num,ao_num,rank)) + tmp(:,:,:) = 0.d0 - call map_sort(ao_integrals_map) - call map_unique(ao_integrals_map) + BUFSIZE=ao_num**2 + allocate(Vi(3,BUFSIZE), V(BUFSIZE)) - call map_save_to_disk(trim(ezfio_filename)//'/work/ao_ints',ao_integrals_map) - call ezfio_set_ao_two_e_ints_io_ao_two_e_integrals('Read') - deallocate(buffer_i, buffer_values, Vi, V) - print *, 'AO integrals read from TREXIO file' + offset = 0_8 + icount = BUFSIZE + rc = TREXIO_SUCCESS + do while (icount == size(V)) + rc = trexio_read_ao_2e_int_eri_cholesky(f, offset, icount, Vi, V) + do m=1,icount + i = Vi(1,m) + j = Vi(2,m) + k = Vi(3,m) + integral = V(m) + tmp(i,j,k) = integral + enddo + offset = offset + icount + if (rc /= TREXIO_SUCCESS) then + exit + endif + end do + + print *, 'Writing Cholesky AO vectors to disk...' 
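+      ! Work-file layout: a first record with the Cholesky rank, then a
+      ! single record holding the whole (ao_num,ao_num,rank) array; the
+      ! io flag is switched to 'Read' below so these vectors are reused
+      ! instead of being recomputed.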
+ iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_ao', 'W') + write(iunit) rank + write(iunit) tmp(:,:,:) + close(iunit) + call ezfio_set_ao_two_e_ints_io_ao_cholesky('Read') + + deallocate(Vi, V, tmp) + print *, 'Cholesky AO integrals read from TREXIO file' + endif + + rc = trexio_has_ao_2e_int_eri(f) + if (rc /= TREXIO_HAS_NOT) then + PROVIDE ao_integrals_map + + BUFSIZE=ao_num**2 + allocate(buffer_i(BUFSIZE), buffer_values(BUFSIZE)) + allocate(Vi(4,BUFSIZE), V(BUFSIZE)) + + offset = 0_8 + icount = BUFSIZE + rc = TREXIO_SUCCESS + do while (icount == size(V)) + rc = trexio_read_ao_2e_int_eri(f, offset, icount, Vi, V) + do m=1,icount + i = Vi(1,m) + j = Vi(2,m) + k = Vi(3,m) + l = Vi(4,m) + integral = V(m) + call two_e_integrals_index(i, j, k, l, buffer_i(m) ) + buffer_values(m) = integral + enddo + call insert_into_ao_integrals_map(int(icount,4),buffer_i,buffer_values) + offset = offset + icount + if (rc /= TREXIO_SUCCESS) then + exit + endif + end do + n_integrals = offset + + call map_sort(ao_integrals_map) + call map_unique(ao_integrals_map) + + call map_save_to_disk(trim(ezfio_filename)//'/work/ao_ints',ao_integrals_map) + call ezfio_set_ao_two_e_ints_io_ao_two_e_integrals('Read') + + deallocate(buffer_i, buffer_values, Vi, V) + print *, 'AO integrals read from TREXIO file' + endif else print *, 'AO integrals not found in TREXIO file' endif @@ -186,40 +231,85 @@ subroutine run(f) rc = trexio_has_mo_2e_int(f) if (rc /= TREXIO_HAS_NOT) then - BUFSIZE=mo_num**2 - allocate(buffer_i(BUFSIZE), buffer_values(BUFSIZE)) - allocate(Vi(4,BUFSIZE), V(BUFSIZE)) + rc = trexio_has_mo_2e_int_eri_cholesky(f) + if (rc /= TREXIO_HAS_NOT) then + + rc = trexio_read_mo_2e_int_eri_cholesky_num(f, rank) + call trexio_assert(rc, TREXIO_SUCCESS) + + allocate(tmp(rank,mo_num,mo_num)) + tmp(:,:,:) = 0.d0 + + BUFSIZE=mo_num**2 + allocate(Vi(3,BUFSIZE), V(BUFSIZE)) + + offset = 0_8 + icount = BUFSIZE + rc = TREXIO_SUCCESS + do while (icount == size(V)) + rc = trexio_read_mo_2e_int_eri_cholesky(f, offset, icount, Vi, V) + do m=1,icount + i = Vi(1,m) + j = Vi(2,m) + k = Vi(3,m) + integral = V(m) + tmp(k,i,j) = integral + enddo + offset = offset + icount + if (rc /= TREXIO_SUCCESS) then + exit + endif + end do + + print *, 'Writing Cholesky MO vectors to disk...' 
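+        ! Same two-record layout as the AO case, but stored transposed as
+        ! (rank,mo_num,mo_num), matching the cholesky_mo_transp convention
+        ! used in export_trexio above.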
+ iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_mo_transp', 'W') + write(iunit) rank + write(iunit) tmp(:,:,:) + close(iunit) + call ezfio_set_mo_two_e_ints_io_mo_cholesky('Read') + + deallocate(Vi, V, tmp) + print *, 'Cholesky MO integrals read from TREXIO file' + endif + + rc = trexio_has_mo_2e_int_eri(f) + if (rc /= TREXIO_HAS_NOT) then + BUFSIZE=mo_num**2 + allocate(buffer_i(BUFSIZE), buffer_values(BUFSIZE)) + allocate(Vi(4,BUFSIZE), V(BUFSIZE)) - offset = 0_8 - icount = BUFSIZE - rc = TREXIO_SUCCESS - do while (icount == size(V)) - rc = trexio_read_mo_2e_int_eri(f, offset, icount, Vi, V) - do m=1,icount - i = Vi(1,m) - j = Vi(2,m) - k = Vi(3,m) - l = Vi(4,m) - integral = V(m) - call two_e_integrals_index(i, j, k, l, buffer_i(m) ) - buffer_values(m) = integral - enddo - call map_append(mo_integrals_map, buffer_i, buffer_values, int(icount,4)) - offset = offset + icount - if (rc /= TREXIO_SUCCESS) then - exit - endif - end do - n_integrals = offset + offset = 0_8 + icount = BUFSIZE + rc = TREXIO_SUCCESS + do while (icount == size(V)) + rc = trexio_read_mo_2e_int_eri(f, offset, icount, Vi, V) + do m=1,icount + i = Vi(1,m) + j = Vi(2,m) + k = Vi(3,m) + l = Vi(4,m) + integral = V(m) + call two_e_integrals_index(i, j, k, l, buffer_i(m) ) + buffer_values(m) = integral + enddo + call map_append(mo_integrals_map, buffer_i, buffer_values, int(icount,4)) + offset = offset + icount + if (rc /= TREXIO_SUCCESS) then + exit + endif + end do + n_integrals = offset - call map_sort(mo_integrals_map) - call map_unique(mo_integrals_map) + call map_sort(mo_integrals_map) + call map_unique(mo_integrals_map) + + call map_save_to_disk(trim(ezfio_filename)//'/work/mo_ints',mo_integrals_map) + call ezfio_set_mo_two_e_ints_io_mo_two_e_integrals('Read') + deallocate(buffer_i, buffer_values, Vi, V) + print *, 'MO integrals read from TREXIO file' + endif - call map_save_to_disk(trim(ezfio_filename)//'/work/mo_ints',mo_integrals_map) - call ezfio_set_mo_two_e_ints_io_mo_two_e_integrals('Read') - deallocate(buffer_i, buffer_values, Vi, V) - print *, 'MO integrals read from TREXIO file' else print *, 'MO integrals not found in TREXIO file' endif diff --git a/src/utils/linear_algebra.irp.f b/src/utils/linear_algebra.irp.f index 20386b30..4e7ca87d 100644 --- a/src/utils/linear_algebra.irp.f +++ b/src/utils/linear_algebra.irp.f @@ -1856,7 +1856,7 @@ subroutine pivoted_cholesky( A, rank, tol, ndim, U) ! ! matrix A is destroyed inside this subroutine ! Cholesky vectors are stored in U -! dimension of U: U(1:rank, 1:n) +! dimension of U: U(1:n, 1:rank) ! U is allocated inside this subroutine ! rank is the number of Cholesky vectors depending on tol !
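For reference, Cholesky-decomposed integrals such as those imported above reconstruct the four-index ERIs by a contraction over the Cholesky index, (ij|kl) = sum_p L_p(i,j) * L_p(k,l). The following is a minimal sketch of that contraction, assuming the transposed (rank, mo_num, mo_num) layout used for cholesky_mo_transp in this patch; the function name and interface are illustrative and not part of the patch.

```fortran
! Minimal sketch: rebuild one two-electron integral from Cholesky
! vectors stored in the transposed (rank, mo_num, mo_num) layout.
! eri_from_cholesky and its interface are illustrative names only.
double precision function eri_from_cholesky(chol, rank, mo_num, i, j, k, l)
  implicit none
  integer, intent(in)          :: rank, mo_num, i, j, k, l
  double precision, intent(in) :: chol(rank, mo_num, mo_num)
  double precision, external   :: ddot   ! BLAS level-1 dot product
  ! (ij|kl) = sum_p L_p(i,j) * L_p(k,l); the Cholesky index is the
  ! fastest-varying one here, so a single stride-1 ddot does the sum.
  eri_from_cholesky = ddot(rank, chol(1,i,j), 1, chol(1,k,l), 1)
end function eri_from_cholesky
```

With the (ao_num,ao_num,rank) layout used for the AO vectors, the same sum runs over the last index instead, so the dot product would use a stride of ao_num**2; the transposed storage is what makes the contraction stride-1, which is why both the export and the import above favor it.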