From 6e2f28b97fbf3c961c9461689a8db9adf81bd6f8 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Sat, 11 May 2024 10:27:03 +0200
Subject: [PATCH 01/19] COLLAPSE(4) -> COLLAPSE(3)

---
 plugins/local/non_h_ints_mu/deb_aos.irp.f      | 6 +++---
 plugins/local/non_h_ints_mu/total_tc_int.irp.f | 9 ++++-----
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/plugins/local/non_h_ints_mu/deb_aos.irp.f b/plugins/local/non_h_ints_mu/deb_aos.irp.f
index 4012f47c..70604f54 100644
--- a/plugins/local/non_h_ints_mu/deb_aos.irp.f
+++ b/plugins/local/non_h_ints_mu/deb_aos.irp.f
@@ -31,6 +31,9 @@ subroutine print_aos()
   integer          :: i, ipoint
   double precision :: r(3)
   double precision :: ao_val, ao_der(3), ao_lap
+  double precision :: accu_vgl(5)
+  double precision :: accu_vgl_nrm(5)
+
   double precision :: mo_val, mo_der(3), mo_lap
 
   PROVIDE final_grid_points aos_in_r_array aos_grad_in_r_array aos_lapl_in_r_array
@@ -40,9 +43,6 @@ subroutine print_aos()
     write(1000, '(3(f15.7, 3X))') r
   enddo
   
-double precision :: accu_vgl(5)
-double precision :: accu_vgl_nrm(5)
-
   do ipoint = 1, n_points_final_grid
     do i = 1, ao_num
       ao_val    = aos_in_r_array     (i,ipoint)
diff --git a/plugins/local/non_h_ints_mu/total_tc_int.irp.f b/plugins/local/non_h_ints_mu/total_tc_int.irp.f
index a1bbd6e0..656f5f16 100644
--- a/plugins/local/non_h_ints_mu/total_tc_int.irp.f
+++ b/plugins/local/non_h_ints_mu/total_tc_int.irp.f
@@ -78,7 +78,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
       !$OMP PRIVATE (i, j, k, l, ipoint, ao_i_r, ao_k_r, weight1) &
       !$OMP SHARED (ao_num, n_points_final_grid, ao_two_e_tc_tot, &
       !$OMP         aos_in_r_array_transp, final_weight_at_r_vector, int2_grad1_u12_square_ao)
-      !$OMP DO COLLAPSE(4)
+      !$OMP DO COLLAPSE(3)
       do i = 1, ao_num
         do k = 1, ao_num
           do l = 1, ao_num
@@ -188,7 +188,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
         !$OMP SHARED (ao_num, n_points_final_grid, ao_two_e_tc_tot,    &
         !$OMP         aos_in_r_array_transp, final_weight_at_r_vector, &
         !$OMP         int2_grad1_u12_ao, aos_grad_in_r_array_transp_bis)
-        !$OMP DO COLLAPSE(4)
+        !$OMP DO COLLAPSE(3)
         do i = 1, ao_num
           do k = 1, ao_num
             do l = 1, ao_num
@@ -270,7 +270,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
       !$OMP PARALLEL DEFAULT(NONE)                     &
       !$OMP PRIVATE(i, j, k, l, integ_zero, integ_val) & 
       !$OMP SHARED(ao_num, ao_two_e_tc_tot)
-      !$OMP DO COLLAPSE(4)
+      !$OMP DO COLLAPSE(3)
       do j = 1, ao_num
         do l = 1, ao_num
           do i = 1, ao_num
@@ -293,7 +293,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
       !$OMP PARALLEL DEFAULT(NONE)                            &
       !$OMP SHARED(ao_num, ao_two_e_tc_tot, ao_integrals_map) &
       !$OMP PRIVATE(i, j, k, l)
-      !$OMP DO COLLAPSE(4)
+      !$OMP DO COLLAPSE(3)
       do j = 1, ao_num
         do l = 1, ao_num
           do i = 1, ao_num
@@ -306,7 +306,6 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
       enddo
       !$OMP END DO
       !$OMP END PARALLEL
-      !call clear_ao_map()
       FREE ao_integrals_map
     endif
 

From 29da3b6542cdfeb52f9d1b7f8c23f3967018bf0f Mon Sep 17 00:00:00 2001
From: AbdAmmar <abd.ammar.phys@gmail.com>
Date: Thu, 23 May 2024 00:45:56 +0200
Subject: [PATCH 02/19] bypass one_e_tr_dm_mo for large mo_num & n_states

---
 .../multi_s_dipole_moment.irp.f               | 154 ++++++++++++++++--
 1 file changed, 140 insertions(+), 14 deletions(-)

diff --git a/src/mol_properties/multi_s_dipole_moment.irp.f b/src/mol_properties/multi_s_dipole_moment.irp.f
index c7216a61..8aae3bf4 100644
--- a/src/mol_properties/multi_s_dipole_moment.irp.f
+++ b/src/mol_properties/multi_s_dipole_moment.irp.f
@@ -18,7 +18,7 @@
 
 
 
-BEGIN_PROVIDER [double precision, multi_s_dipole_moment, (N_states, N_states)]
+ BEGIN_PROVIDER [double precision, multi_s_dipole_moment  , (N_states, N_states)]
 &BEGIN_PROVIDER [double precision, multi_s_x_dipole_moment, (N_states, N_states)]
 &BEGIN_PROVIDER [double precision, multi_s_y_dipole_moment, (N_states, N_states)]
 &BEGIN_PROVIDER [double precision, multi_s_z_dipole_moment, (N_states, N_states)]
@@ -40,27 +40,153 @@ BEGIN_PROVIDER [double precision, multi_s_dipole_moment, (N_states, N_states)]
   ! gamma^{nm}: density matrix \bra{\Psi^n} a^{\dagger}_a a_i \ket{\Psi^m}
   END_DOC
 
-  integer          :: istate,jstate ! States
-  integer          :: i,j           ! general spatial MOs
+  integer          :: istate, jstate ! States
+  integer          :: i, j           ! general spatial MOs
   double precision :: nuclei_part_x, nuclei_part_y, nuclei_part_z
  
   multi_s_x_dipole_moment = 0.d0
   multi_s_y_dipole_moment = 0.d0
   multi_s_z_dipole_moment = 0.d0
+
+  if(8.d0*mo_num*mo_num*n_states*n_states*1d-9 .lt. 200.d0) then
  
-  do jstate = 1, N_states
-    do istate = 1, N_states
- 
-      do i = 1, mo_num  
-        do j = 1, mo_num  
-          multi_s_x_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_x(j,i)  
-          multi_s_y_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_y(j,i) 
-          multi_s_z_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_z(j,i) 
-        enddo
+    do jstate = 1, N_states
+      do istate = 1, N_states
+        do i = 1, mo_num  
+          do j = 1, mo_num  
+            multi_s_x_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_x(j,i)  
+            multi_s_y_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_y(j,i) 
+            multi_s_z_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_z(j,i) 
+          enddo
+        enddo 
       enddo
- 
     enddo
-  enddo
+
+  else
+
+    ! no enouph memory
+    ! on the fly scheme
+
+    PROVIDE psi_det_alpha_unique psi_det_beta_unique
+
+    integer           :: l, k_a, k_b
+    integer           :: occ(N_int*bit_kind_size,2)
+    integer           :: h1, h2, p1, p2, degree
+    integer           :: exc(0:2,2), n_occ(2)
+    integer           :: krow, kcol, lrow, lcol
+    integer(bit_kind) :: tmp_det(N_int,2), tmp_det2(N_int)
+    double precision  :: ck, ckl, phase
+
+    !$OMP PARALLEL DEFAULT(NONE)                                                      &
+    !$OMP PRIVATE(j, l, k_a, k_b, istate, jstate, occ, ck, ckl, h1, h2, p1, p2, exc,  & 
+    !$OMP         phase, degree, n_occ, krow, kcol, lrow, lcol, tmp_det, tmp_det2)    &
+    !$OMP SHARED(N_int, N_states, elec_alpha_num, elec_beta_num, N_det,               &
+    !$OMP        psi_bilinear_matrix_rows, psi_bilinear_matrix_columns,               &
+    !$OMP        psi_bilinear_matrix_transp_rows, psi_bilinear_matrix_transp_columns, &
+    !$OMP        psi_det_alpha_unique, psi_det_beta_unique,                           &
+    !$OMP        psi_bilinear_matrix_values, psi_bilinear_matrix_transp_values,       &
+    !$OMP        mo_dipole_x, mo_dipole_y, mo_dipole_z,                               &
+    !$OMP        multi_s_x_dipole_moment, multi_s_y_dipole_moment, multi_s_z_dipole_moment)
+    !$OMP DO COLLAPSE(2)
+    do istate = 1, N_states
+      do jstate = 1, N_states
+
+        do k_a = 1, N_det
+          krow = psi_bilinear_matrix_rows   (k_a)
+          kcol = psi_bilinear_matrix_columns(k_a)
+  
+          tmp_det(1:N_int,1) = psi_det_alpha_unique(1:N_int,krow)
+          tmp_det(1:N_int,2) = psi_det_beta_unique (1:N_int,kcol)
+  
+          ! Diagonal part
+          call bitstring_to_list_ab(tmp_det, occ, n_occ, N_int)
+          ck = psi_bilinear_matrix_values(k_a,istate)*psi_bilinear_matrix_values(k_a,jstate)
+          do l = 1, elec_alpha_num
+            j = occ(l,1)
+            multi_s_x_dipole_moment(istate,jstate) -= ck * mo_dipole_x(j,j) 
+            multi_s_y_dipole_moment(istate,jstate) -= ck * mo_dipole_y(j,j) 
+            multi_s_z_dipole_moment(istate,jstate) -= ck * mo_dipole_z(j,j) 
+          enddo
+  
+          if (k_a == N_det) cycle
+          l = k_a + 1
+          lrow = psi_bilinear_matrix_rows   (l)
+          lcol = psi_bilinear_matrix_columns(l)
+          ! Fix beta determinant, loop over alphas
+          do while (lcol == kcol)
+            tmp_det2(:) = psi_det_alpha_unique(:,lrow)
+            call get_excitation_degree_spin(tmp_det(1,1), tmp_det2, degree, N_int)
+            if (degree == 1) then
+              exc = 0
+              call get_single_excitation_spin(tmp_det(1,1), tmp_det2, exc, phase, N_int)
+              call decode_exc_spin(exc, h1, p1, h2, p2)
+              ckl = psi_bilinear_matrix_values(k_a,istate)*psi_bilinear_matrix_values(l,jstate) * phase
+              multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(h1,p1) 
+              multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(h1,p1) 
+              multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(h1,p1) 
+              ckl = psi_bilinear_matrix_values(k_a,jstate)*psi_bilinear_matrix_values(l,istate) * phase
+              multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(p1,h1) 
+              multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(p1,h1) 
+              multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(p1,h1) 
+            endif
+            l = l+1
+            if (l > N_det) exit
+            lrow = psi_bilinear_matrix_rows   (l)
+            lcol = psi_bilinear_matrix_columns(l)
+          enddo
+        enddo ! k_a
+  
+        do k_b = 1, N_det
+          krow = psi_bilinear_matrix_transp_rows   (k_b)
+          kcol = psi_bilinear_matrix_transp_columns(k_b)
+      
+          tmp_det(1:N_int,1) = psi_det_alpha_unique(1:N_int,krow)
+          tmp_det(1:N_int,2) = psi_det_beta_unique (1:N_int,kcol)
+      
+          ! Diagonal part
+          call bitstring_to_list_ab(tmp_det, occ, n_occ, N_int)
+          ck = psi_bilinear_matrix_transp_values(k_b,istate)*psi_bilinear_matrix_transp_values(k_b,jstate)
+          do l = 1, elec_beta_num
+            j = occ(l,2)
+            multi_s_x_dipole_moment(istate,jstate) -= ck * mo_dipole_x(j,j) 
+            multi_s_y_dipole_moment(istate,jstate) -= ck * mo_dipole_y(j,j) 
+            multi_s_z_dipole_moment(istate,jstate) -= ck * mo_dipole_z(j,j) 
+          enddo
+      
+          if (k_b == N_det) cycle
+          l = k_b+1
+          lrow = psi_bilinear_matrix_transp_rows   (l)
+          lcol = psi_bilinear_matrix_transp_columns(l)
+          ! Fix beta determinant, loop over alphas
+          do while (lrow == krow)
+            tmp_det2(:) = psi_det_beta_unique(:,lcol)
+            call get_excitation_degree_spin(tmp_det(1,2), tmp_det2, degree, N_int)
+            if (degree == 1) then
+              exc = 0
+              call get_single_excitation_spin(tmp_det(1,2), tmp_det2, exc, phase, N_int)
+              call decode_exc_spin(exc, h1, p1, h2, p2)
+              ckl = psi_bilinear_matrix_transp_values(k_b,istate)*psi_bilinear_matrix_transp_values(l,jstate) * phase
+              multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(h1,p1) 
+              multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(h1,p1) 
+              multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(h1,p1) 
+              ckl = psi_bilinear_matrix_transp_values(k_b,jstate)*psi_bilinear_matrix_transp_values(l,istate) * phase
+              multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(p1,h1) 
+              multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(p1,h1) 
+              multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(p1,h1) 
+            endif
+            l = l+1
+            if (l > N_det) exit
+            lrow = psi_bilinear_matrix_transp_rows   (l)
+            lcol = psi_bilinear_matrix_transp_columns(l)
+          enddo
+        enddo ! k_b
+
+      enddo ! istate
+    enddo ! jstate
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+  endif ! memory condition
  
   ! Nuclei part
   nuclei_part_x = 0.d0

From 7e45c517d981adcd77883a39a361be38b470ff20 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Tue, 25 Jun 2024 18:32:44 +0200
Subject: [PATCH 03/19] Introducing gpu_x86

---
 src/gpu_x86/NEED           |   1 +
 src/gpu_x86/README.rst     |   5 +
 src/gpu_x86/gpu.c          | 506 +++++++++++++++++++++++++++++++++++++
 src/gpu_x86/gpu.h          |  41 +++
 src/gpu_x86/gpu_module.F90 | 141 +++++++++++
 5 files changed, 694 insertions(+)
 create mode 100644 src/gpu_x86/NEED
 create mode 100644 src/gpu_x86/README.rst
 create mode 100644 src/gpu_x86/gpu.c
 create mode 100644 src/gpu_x86/gpu.h
 create mode 100644 src/gpu_x86/gpu_module.F90

diff --git a/src/gpu_x86/NEED b/src/gpu_x86/NEED
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/src/gpu_x86/NEED
@@ -0,0 +1 @@
+
diff --git a/src/gpu_x86/README.rst b/src/gpu_x86/README.rst
new file mode 100644
index 00000000..f530bf29
--- /dev/null
+++ b/src/gpu_x86/README.rst
@@ -0,0 +1,5 @@
+=======
+gpu_x86
+=======
+
+x86 implementation of GPU routines. For use when GPUs are not available.
diff --git a/src/gpu_x86/gpu.c b/src/gpu_x86/gpu.c
new file mode 100644
index 00000000..71505dbe
--- /dev/null
+++ b/src/gpu_x86/gpu.c
@@ -0,0 +1,506 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+
+/* Generic functions */
+
+int gpu_ndevices() {
+  return 1;
+}
+
+void gpu_set_device(int32_t i) {
+  return;
+}
+
+
+/* Allocation functions */
+
+void gpu_allocate(void** ptr, const int64_t n) {
+  *ptr = malloc((size_t) n);
+  if (*ptr == NULL) {
+    perror("Allocation failed");
+  }
+}
+
+void gpu_free(void** ptr) {
+  free(*ptr);
+  *ptr = NULL;
+}
+
+
+/* Memory transfer functions */
+
+void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
+  memcpy(gpu_ptr, cpu_ptr, n);
+}
+
+void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
+  memcpy(cpu_ptr, gpu_ptr, n);
+}
+
+void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
+  memcpy(gpu_ptr_dest, gpu_ptr_src, n);
+}
+
+
+/* Streams */
+
+void gpu_stream_create(void** ptr) {
+  *ptr = (void*) 2;
+}
+
+void gpu_stream_destroy(void** ptr) {
+  *ptr = NULL;
+}
+
+void gpu_set_stream(void* handle, void* stream) {
+  return;
+}
+
+void gpu_synchronize() {
+  return;
+}
+
+
+/* BLAS functions */
+
+void gpu_blas_create(void** handle) {
+  *handle = (void*) 1;
+}
+
+
+void gpu_blas_destroy(void** handle) {
+  *handle = NULL;
+}
+
+
+double ddot_(const int32_t* n, const double* x, const int32_t* incx, const double* y, const int32_t* incy);
+
+void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
+  assert (handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t n_, incx_, incy_;
+
+  n_    = (int32_t) n;
+  incx_ = (int32_t) incx;
+  incy_ = (int32_t) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  *result = ddot_(&n_, x, &incx_, y, &incy_);
+}
+
+
+float sdot_(const int32_t* n, const float* x, const int32_t* incx, const float* y, const int32_t* incy);
+
+void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
+  assert (handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t n_, incx_, incy_;
+
+  n_    = (int32_t) n;
+  incx_ = (int32_t) incx;
+  incy_ = (int32_t) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  *result = sdot_(&n_, x, &incx_, y, &incy_);
+}
+
+
+void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha,
+            const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy);
+
+void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double alpha,
+               const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) {
+
+  assert (handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, lda_, incx_, incy_;
+
+  m_    = (int32_t) m;
+  n_    = (int32_t) n;
+  lda_  = (int32_t) lda;
+  incx_ = (int32_t) incx;
+  incy_ = (int32_t) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    m_ == m   );
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t)  lda_ == lda );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  dgemv_(&transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_);
+}
+
+
+void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha,
+               const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy);
+
+void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float alpha,
+               const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) {
+
+  assert (handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, lda_, incx_, incy_;
+
+  m_    = (int32_t) m;
+  n_    = (int32_t) n;
+  lda_  = (int32_t) lda;
+  incx_ = (int32_t) incx;
+  incy_ = (int32_t) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    m_ == m   );
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t)  lda_ == lda );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  sgemv_(&transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_);
+}
+
+
+void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha,
+            const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc);
+
+void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
+               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) {
+
+  assert (handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, k_, lda_, ldb_, ldc_;
+
+  m_   = (int32_t) m;
+  n_   = (int32_t) n;
+  k_   = (int32_t) k;
+  lda_ = (int32_t) lda;
+  ldb_ = (int32_t) ldb;
+  ldc_ = (int32_t) ldc;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)   m_ == m  );
+  assert ( (int64_t)   n_ == n  );
+  assert ( (int64_t)   k_ == k  );
+  assert ( (int64_t) lda_ == lda);
+  assert ( (int64_t) ldb_ == ldb);
+  assert ( (int64_t) ldc_ == ldc);
+
+  dgemm_(&transa, &transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_);
+}
+
+
+
+void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha,
+            const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc);
+
+void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
+               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) {
+
+  assert (handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, k_, lda_, ldb_, ldc_;
+
+  m_   = (int32_t) m;
+  n_   = (int32_t) n;
+  k_   = (int32_t) k;
+  lda_ = (int32_t) lda;
+  ldb_ = (int32_t) ldb;
+  ldc_ = (int32_t) ldc;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)   m_ == m  );
+  assert ( (int64_t)   n_ == n  );
+  assert ( (int64_t)   k_ == k  );
+  assert ( (int64_t) lda_ == lda);
+  assert ( (int64_t) ldb_ == ldb);
+  assert ( (int64_t) ldc_ == ldc);
+
+  sgemm_(&transa, &transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_);
+}
+
+
+void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
+               const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
+  if (handle == NULL) {
+    perror("NULL handle");
+    exit(-1);
+  }
+
+  if ( (transa == 'N' && transb == 'N') ||
+       (transa == 'n' && transb == 'N') ||
+       (transa == 'N' && transb == 'n') ||
+       (transa == 'n' && transb == 'n') ) {
+
+     if (alpha == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = beta * b[j*ldb+i];
+         }
+       }
+
+     } else if (beta == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[j*lda+i];
+         }
+       }
+
+     } else {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[j*ldb+i];
+         }
+       }
+
+     }
+
+  } else if ( (transa == 'N' && transb == 'T') ||
+              (transa == 'n' && transb == 'T') ||
+              (transa == 'N' && transb == 't') ||
+              (transa == 'n' && transb == 't') ) {
+
+     if (alpha == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = beta * b[i*ldb+j];
+         }
+       }
+
+     } else if (beta == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[j*lda+i];
+         }
+       }
+
+     } else {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[i*ldb+j];
+         }
+       }
+
+     }
+
+  } else if ( (transa == 'T' && transb == 'N') ||
+              (transa == 't' && transb == 'N') ||
+              (transa == 'T' && transb == 'n') ||
+              (transa == 't' && transb == 'n') ) {
+
+     if (alpha == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = beta * b[j*ldb+i];
+         }
+       }
+
+     } else if (beta == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[i*lda+j];
+         }
+       }
+
+     } else {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[j*ldb+i];
+         }
+       }
+
+     }
+
+  } else if ( (transa == 'T' && transb == 'T') ||
+              (transa == 't' && transb == 'T') ||
+              (transa == 'T' && transb == 't') ||
+              (transa == 't' && transb == 't') ) {
+
+     if (alpha == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = beta * b[i*ldb+j];
+         }
+       }
+
+     } else if (beta == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[i*lda+j];
+         }
+       }
+
+     } else {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[i*ldb+j];
+         }
+       }
+
+     }
+
+  }
+}
+
+
+void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
+               const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
+  if (handle == NULL) {
+    perror("NULL handle");
+    exit(-1);
+  }
+
+  if ( (transa == 'N' && transb == 'N') ||
+       (transa == 'n' && transb == 'N') ||
+       (transa == 'N' && transb == 'n') ||
+       (transa == 'n' && transb == 'n') ) {
+
+     if (alpha == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = beta * b[j*ldb+i];
+         }
+       }
+
+     } else if (beta == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[j*lda+i];
+         }
+       }
+
+     } else {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[j*ldb+i];
+         }
+       }
+
+     }
+
+  } else if ( (transa == 'N' && transb == 'T') ||
+              (transa == 'n' && transb == 'T') ||
+              (transa == 'N' && transb == 't') ||
+              (transa == 'n' && transb == 't') ) {
+
+     if (alpha == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = beta * b[i*ldb+j];
+         }
+       }
+
+     } else if (beta == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[j*lda+i];
+         }
+       }
+
+     } else {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[i*ldb+j];
+         }
+       }
+
+     }
+
+  } else if ( (transa == 'T' && transb == 'N') ||
+              (transa == 't' && transb == 'N') ||
+              (transa == 'T' && transb == 'n') ||
+              (transa == 't' && transb == 'n') ) {
+
+     if (alpha == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = beta * b[j*ldb+i];
+         }
+       }
+
+     } else if (beta == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[i*lda+j];
+         }
+       }
+
+     } else {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[j*ldb+i];
+         }
+       }
+
+     }
+
+  } else if ( (transa == 'T' && transb == 'T') ||
+              (transa == 't' && transb == 'T') ||
+              (transa == 'T' && transb == 't') ||
+              (transa == 't' && transb == 't') ) {
+
+     if (alpha == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = beta * b[i*ldb+j];
+         }
+       }
+
+     } else if (beta == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[i*lda+j];
+         }
+       }
+
+     } else {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[i*ldb+j];
+         }
+       }
+
+     }
+
+  }
+}
diff --git a/src/gpu_x86/gpu.h b/src/gpu_x86/gpu.h
new file mode 100644
index 00000000..03aeeab7
--- /dev/null
+++ b/src/gpu_x86/gpu.h
@@ -0,0 +1,41 @@
+#include <stdint.h>
+
+int  gpu_ndevices();
+void gpu_set_device(int32_t i);
+
+void gpu_allocate(void** ptr, const int64_t n);
+void gpu_free(void** ptr);
+
+void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n);
+void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n);
+void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n);
+
+void gpu_stream_create(void** ptr);
+void gpu_stream_destroy(void** ptr);
+void gpu_set_stream(void* handle, void* stream);
+void gpu_synchronize();
+
+void gpu_blas_create(void** handle);
+void gpu_blas_destroy(void** handle);
+
+void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result);
+
+void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result);
+
+void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double alpha,
+               const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy);
+
+void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float alpha,
+               const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy);
+
+void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
+               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc);
+
+void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
+               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc);
+
+void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
+               const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc);
+
+void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
+               const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc);
diff --git a/src/gpu_x86/gpu_module.F90 b/src/gpu_x86/gpu_module.F90
new file mode 100644
index 00000000..86ba3926
--- /dev/null
+++ b/src/gpu_x86/gpu_module.F90
@@ -0,0 +1,141 @@
+module gpu
+  use, intrinsic :: iso_c_binding, only : c_int32_t, c_int64_t, c_double, c_size_t, c_char
+  implicit none
+
+  interface
+    integer function gpu_ndevices() bind(C)
+    end function
+
+    subroutine gpu_set_device(id) bind(C)
+      import
+      integer(c_int32_t), value :: id
+    end subroutine
+
+    subroutine gpu_allocate_c(ptr, n) bind(C, name='gpu_allocate')
+      import
+      type(c_ptr) :: ptr
+      integer(c_int64_t), value :: n
+    end subroutine
+
+    subroutine gpu_free_c(ptr) bind(C, name='gpu_free')
+      import
+      type(c_ptr) :: ptr
+    end subroutine
+
+    subroutine gpu_upload_c(cpu_ptr, gpu_ptr, n) bind(C, name='gpu_upload')
+      import
+      type(c_ptr), value :: cpu_ptr
+      type(c_ptr), value :: gpu_ptr
+      integer(c_int64_t), value :: n
+    end subroutine
+
+    subroutine gpu_download_c(gpu_ptr, cpu_ptr, n) bind(C, name='gpu_download')
+      import
+      type(c_ptr), value :: gpu_ptr
+      type(c_ptr), value :: cpu_ptr
+      integer(c_int64_t), value :: n
+    end subroutine
+
+    subroutine gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, n) bind(C, name='gpu_copy')
+      import
+      type(c_ptr), value :: gpu_ptr_src
+      type(c_ptr), value :: gpu_ptr_dest
+      integer(c_int64_t), value :: n
+    end subroutine
+
+    subroutine gpu_stream_create(stream) bind(C)
+      import
+      type(c_ptr) :: stream
+    end subroutine
+
+    subroutine gpu_stream_destroy(stream) bind(C)
+      import
+      type(c_ptr) :: stream
+    end subroutine
+
+    subroutine gpu_set_stream(handle, stream) bind(C)
+      import
+      type(c_ptr) :: handle, stream
+    end subroutine
+
+    subroutine gpu_synchronize()
+    end subroutine
+
+    subroutine gpu_blas_create(handle) bind(C)
+      import
+      type(c_ptr) :: handle
+    end subroutine
+
+    subroutine gpu_blas_destroy(handle) bind(C)
+      import
+      type(c_ptr) :: handle
+    end subroutine
+
+    subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) bind(C)
+      import
+      type(c_ptr), intent(in)     :: handle
+      integer(c_int64_t), value   :: n, incx, incy
+      real(c_double), intent(in)  :: dx(*), dy(*)
+      real(c_double), intent(out) :: res
+    end subroutine
+
+    subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res) bind(C)
+      import
+      type(c_ptr), intent(in)     :: handle
+      integer(c_int64_t), value   :: n, incx, incy
+      real(c_float), intent(in)   :: dx(*), dy(*)
+      real(c_float), intent(out)  :: res
+    end subroutine
+
+  end interface
+
+end module
+
+subroutine gpu_allocate_double(ptr, s)
+  use gpu
+  implicit none
+  double precision, pointer, intent(inout) :: ptr
+  integer*8, intent(in) :: s(*)
+  type(c_ptr) :: cptr
+
+  call gpu_allocate_c(cptr, sum(s)*8_8)
+  call c_f_pointer(cptr, ptr, s)
+end subroutine
+
+subroutine gpu_free_double(ptr)
+  use gpu
+  implicit none
+  double precision, pointer, intent(inout) :: ptr
+  type(c_ptr) :: cptr
+  cptr = cloc(ptr)
+  call gpu_free(cptr)
+  NULLIFY(ptr)
+end subroutine
+
+subroutine gpu_upload_double(cpu_ptr, gpu_ptr, n)
+  use gpu
+  implicit none
+  double precision, intent(in)   :: cpu_ptr(*)
+  double precision, intent(out)  :: gpu_ptr(*)
+  integer(c_int64_t), intent(in) :: n
+  call gpu_upload_c(cpu_ptr, gpu_ptr, 8_8*n)
+end subroutine
+
+subroutine gpu_download_double(gpu_ptr, cpu_ptr, n)
+  use gpu
+  implicit none
+  double precision, intent(in)   :: gpu_ptr(*)
+  double precision, intent(out)  :: cpu_ptr(*)
+  integer(c_int64_t), intent(in) :: n
+  call gpu_download_c(gpu_ptr, cpu_ptr, 8_8*n)
+end subroutine
+
+subroutine gpu_copy_double(gpu_ptr_src, gpu_ptr_dest, n)
+  use gpu
+  implicit none
+  double precision, intent(in)   :: gpu_ptr_src(*)
+  double precision, intent(out)  :: gpu_ptr_dest(*)
+  integer(c_int64_t), intent(in) :: n
+  call gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, 8_8*n)
+end subroutine
+

From 646607ada4a0a58c9cd5e0593c04bce7bc9bd02e Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Wed, 26 Jun 2024 11:15:30 +0200
Subject: [PATCH 04/19] 1st commit

---
 plugins/local/tc_int/NEED                 |   5 +
 plugins/local/tc_int/README.rst           |   4 +
 plugins/local/tc_int/int2_grad1_u12.irp.f | 134 ++++++++++++++++++++++
 plugins/local/tc_int/jast_grad_2e.irp.f   | 102 ++++++++++++++++
 plugins/local/tc_int/jast_grad_full.irp.f |  51 ++++++++
 plugins/local/tc_int/jast_utils_bh.irp.f  |  35 ++++++
 plugins/local/tc_int/write_tc_int.irp.f   |  58 ++++++++++
 7 files changed, 389 insertions(+)
 create mode 100644 plugins/local/tc_int/NEED
 create mode 100644 plugins/local/tc_int/README.rst
 create mode 100644 plugins/local/tc_int/int2_grad1_u12.irp.f
 create mode 100644 plugins/local/tc_int/jast_grad_2e.irp.f
 create mode 100644 plugins/local/tc_int/jast_grad_full.irp.f
 create mode 100644 plugins/local/tc_int/jast_utils_bh.irp.f
 create mode 100644 plugins/local/tc_int/write_tc_int.irp.f

diff --git a/plugins/local/tc_int/NEED b/plugins/local/tc_int/NEED
new file mode 100644
index 00000000..8a4caf5b
--- /dev/null
+++ b/plugins/local/tc_int/NEED
@@ -0,0 +1,5 @@
+tc_keywords
+jastrow
+qmckl
+becke_numerical_grid
+dft_utils_in_r
diff --git a/plugins/local/tc_int/README.rst b/plugins/local/tc_int/README.rst
new file mode 100644
index 00000000..bc9e8483
--- /dev/null
+++ b/plugins/local/tc_int/README.rst
@@ -0,0 +1,4 @@
+======
+tc_int
+======
+
diff --git a/plugins/local/tc_int/int2_grad1_u12.irp.f b/plugins/local/tc_int/int2_grad1_u12.irp.f
new file mode 100644
index 00000000..0cf0d775
--- /dev/null
+++ b/plugins/local/tc_int/int2_grad1_u12.irp.f
@@ -0,0 +1,134 @@
+
+! ---
+
+subroutine provide_int2_grad1_u12_ao()
+
+  implicit none
+  integer                       :: ipoint, i, j, m, jpoint
+  integer                       :: n_blocks, n_rest, n_pass
+  integer                       :: i_blocks, i_rest, i_pass, ii
+  double precision              :: time0, time1
+  double precision              :: mem, n_double
+  double precision, allocatable :: tmp(:,:,:)
+  double precision, allocatable :: tmp_grad1_u12(:,:,:)
+  double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:)
+
+  PROVIDE final_weight_at_r_vector_extra aos_in_r_array_extra
+
+  print*, ' start provide_int2_grad1_u12_ao ...'
+  call wall_time(time0)
+
+
+  ! int2_grad1_u12_ao(i,j,ipoint,1) =        \int dr2 [\grad1 u(r1,r2)]_x1 \chi_i(r2) \chi_j(r2) 
+  ! int2_grad1_u12_ao(i,j,ipoint,2) =        \int dr2 [\grad1 u(r1,r2)]_y1 \chi_i(r2) \chi_j(r2) 
+  ! int2_grad1_u12_ao(i,j,ipoint,3) =        \int dr2 [\grad1 u(r1,r2)]_z1 \chi_i(r2) \chi_j(r2) 
+  ! int2_grad1_u12_ao(i,j,ipoint,4) = -(1/2) \int dr2 [\grad1 u(r1,r2)]^2  \chi_i(r2) \chi_j(r2) 
+  allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,4))
+
+
+
+  call total_memory(mem)
+  mem      = max(1.d0, qp_max_mem - mem)
+  n_double = mem * 1.d8
+  n_blocks = int(min(n_double / (n_points_extra_final_grid * 4.d0), 1.d0*n_points_final_grid))
+  n_rest   = int(mod(n_points_final_grid, n_blocks))
+  n_pass   = int((n_points_final_grid - n_rest) / n_blocks)
+
+  call write_int(6, n_pass, 'Number of passes')
+  call write_int(6, n_blocks, 'Size of the blocks')
+  call write_int(6, n_rest, 'Size of the last block')
+
+
+  allocate(tmp(n_points_extra_final_grid,ao_num,ao_num))
+  !$OMP PARALLEL               &
+  !$OMP DEFAULT (NONE)         &
+  !$OMP PRIVATE (j, i, jpoint) &
+  !$OMP SHARED (tmp, ao_num, n_points_extra_final_grid, final_weight_at_r_vector_extra, aos_in_r_array_extra_transp)
+  !$OMP DO SCHEDULE (static)
+  do j = 1, ao_num
+    do i = 1, ao_num
+      do jpoint = 1, n_points_extra_final_grid
+        tmp(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  
+  allocate(tmp_grad1_u12(n_points_extra_final_grid,n_blocks,4))
+  
+  do i_pass = 1, n_pass
+    ii = (i_pass-1)*n_blocks + 1
+  
+    !$OMP PARALLEL                   &
+    !$OMP DEFAULT (NONE)             &
+    !$OMP PRIVATE (i_blocks, ipoint) &
+    !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12)
+    !$OMP DO 
+    do i_blocks = 1, n_blocks
+      ipoint = ii - 1 + i_blocks ! r1
+      call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_blocks,1), tmp_grad1_u12(1,i_blocks,2), tmp_grad1_u12(1,i_blocks,3), tmp_grad1_u12(1,i_blocks,4))
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+    do m = 1, 4
+      call dgemm( "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0                     &
+                , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid &
+                , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) 
+    enddo
+  enddo
+  
+  deallocate(tmp_grad1_u12)
+
+  
+  if(n_rest .gt. 0) then
+  
+    allocate(tmp_grad1_u12(n_points_extra_final_grid,n_rest,4))
+  
+    ii = n_pass*n_blocks + 1
+
+    !$OMP PARALLEL                 &
+    !$OMP DEFAULT (NONE)           &
+    !$OMP PRIVATE (i_rest, ipoint) &
+    !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12)
+    !$OMP DO 
+    do i_rest = 1, n_rest
+      ipoint = ii - 1 + i_rest ! r1
+      call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_rest,1), tmp_grad1_u12(1,i_rest,2), tmp_grad1_u12(1,i_rest,3), tmp_grad1_u12(1,i_rest,4))
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+  
+    do m = 1, 4
+      call dgemm( "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0                       &
+                , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid &
+                , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) 
+    enddo
+
+    deallocate(tmp_grad1_u12)
+  endif
+
+  deallocate(tmp)
+
+
+  ! ---
+
+  print*, ' Writing int2_grad1_u12_ao in ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao'
+  open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="write")
+  call ezfio_set_work_empty(.False.)
+    write(11) int2_grad1_u12_ao(:,:,:,1:3)
+  close(11)
+
+  deallocate(int2_grad1_u12_ao)
+
+  call wall_time(time1)
+  print*, ' wall time for provide_int2_grad1_u12_ao (min) = ', (time1-time0) / 60.d0
+  call print_memory_usage()
+
+end
+
+! ---
+
+
diff --git a/plugins/local/tc_int/jast_grad_2e.irp.f b/plugins/local/tc_int/jast_grad_2e.irp.f
new file mode 100644
index 00000000..b18b9d62
--- /dev/null
+++ b/plugins/local/tc_int/jast_grad_2e.irp.f
@@ -0,0 +1,102 @@
+
+! ---
+
+subroutine get_grad1_u12_r1_2e(r1, n_grid2, gradx, grady, gradz)
+
+  BEGIN_DOC
+  !
+  !  d/dx1 j_2e(1,2)
+  !  d/dy1 j_2e(1,2)
+  !  d/dz1 j_2e(1,2)
+  !
+  END_DOC
+
+  include 'constants.include.F'
+
+  implicit none
+  integer         , intent(in)  :: n_grid2
+  double precision, intent(in)  :: r1(3)
+  double precision, intent(out) :: gradx(n_grid2)
+  double precision, intent(out) :: grady(n_grid2)
+  double precision, intent(out) :: gradz(n_grid2)
+
+  integer                       :: jpoint
+  integer                       :: i_nucl, p, mpA, npA, opA
+  integer                       :: powmax1, powmax, powmax2
+  double precision              :: r2(3)
+  double precision              :: tmp, tmp1, tmp2
+  double precision              :: rn(3), f1A, grad1_f1A(3), f2A, grad2_f2A(3), g12, grad1_g12(3)
+  double precision, allocatable :: f1A_power(:), f2A_power(:), double_p(:), g12_power(:)
+
+
+  powmax1 = max(maxval(jBH_m), maxval(jBH_n))
+  powmax2 = maxval(jBH_o)
+  powmax  = max(powmax1, powmax2)
+
+  allocate(f1A_power(-1:powmax), f2A_power(-1:powmax), g12_power(-1:powmax), double_p(0:powmax))
+
+  do p = 0, powmax
+    double_p(p) = dble(p)
+  enddo
+
+  f1A_power(-1) = 0.d0
+  f2A_power(-1) = 0.d0
+  g12_power(-1) = 0.d0
+
+  f1A_power(0) = 1.d0
+  f2A_power(0) = 1.d0
+  g12_power(0) = 1.d0
+
+  do jpoint = 1, n_points_extra_final_grid ! r2
+
+    r2(1) = final_grid_points_extra(1,jpoint)
+    r2(2) = final_grid_points_extra(2,jpoint)
+    r2(3) = final_grid_points_extra(3,jpoint)
+
+    gradx(jpoint) = 0.d0 
+    grady(jpoint) = 0.d0 
+    gradz(jpoint) = 0.d0 
+    do i_nucl = 1, nucl_num 
+
+      rn(1) = nucl_coord(i_nucl,1)
+      rn(2) = nucl_coord(i_nucl,2)
+      rn(3) = nucl_coord(i_nucl,3)
+
+      call jBH_elem_fct_grad(jBH_en(i_nucl), r1, rn, f1A, grad1_f1A)
+      call jBH_elem_fct_grad(jBH_en(i_nucl), r2, rn, f2A, grad2_f2A)
+      call jBH_elem_fct_grad(jBH_ee(i_nucl), r1, r2, g12, grad1_g12)
+
+      ! Compute powers of f1A and f2A
+      do p = 1, powmax1
+        f1A_power(p) = f1A_power(p-1) * f1A
+        f2A_power(p) = f2A_power(p-1) * f2A
+      enddo
+      do p = 1, powmax2
+        g12_power(p) = g12_power(p-1) * g12
+      enddo
+
+      do p = 1, jBH_size
+        mpA = jBH_m(p,i_nucl)
+        npA = jBH_n(p,i_nucl)
+        opA = jBH_o(p,i_nucl)
+        tmp = jBH_c(p,i_nucl)
+        if(mpA .eq. npA) then
+          tmp = tmp * 0.5d0
+        endif
+
+        tmp1 = double_p(mpA) * f1A_power(mpA-1) * f2A_power(npA) + double_p(npA) * f1A_power(npA-1) * f2A_power(mpA)
+        tmp1 = tmp1 * g12_power(opA) * tmp
+        tmp2 = double_p(opA) * g12_power(opA-1) * (f1A_power(mpA) * f2A_power(npA) + f1A_power(npA) * f2A_power(mpA)) * tmp
+
+        gradx(jpoint) = gradx(jpoint) + tmp1 * grad1_f1A(1) + tmp2 * grad1_g12(1)
+        grady(jpoint) = grady(jpoint) + tmp1 * grad1_f1A(2) + tmp2 * grad1_g12(2)
+        gradz(jpoint) = gradz(jpoint) + tmp1 * grad1_f1A(3) + tmp2 * grad1_g12(3)
+      enddo ! p
+    enddo ! i_nucl
+  enddo ! jpoint
+
+  return
+end
+
+! ---
+
diff --git a/plugins/local/tc_int/jast_grad_full.irp.f b/plugins/local/tc_int/jast_grad_full.irp.f
new file mode 100644
index 00000000..f63ee3e4
--- /dev/null
+++ b/plugins/local/tc_int/jast_grad_full.irp.f
@@ -0,0 +1,51 @@
+
+! ---
+
+subroutine get_grad1_u12_for_tc(ipoint, n_grid2, resx, resy, resz, res)
+
+  BEGIN_DOC
+  ! 
+  ! resx(ipoint) =      [grad1 u(r1,r2)]_x1
+  ! resy(ipoint) =      [grad1 u(r1,r2)]_y1
+  ! resz(ipoint) =      [grad1 u(r1,r2)]_z1
+  ! res (ipoint) = -0.5 [grad1 u(r1,r2)]^2
+  !
+  ! We use:
+  !       grid for r1
+  ! extra_grid for r2
+  !
+  END_DOC
+
+  implicit none
+  integer,          intent(in)  :: ipoint, n_grid2
+  double precision, intent(out) :: resx(n_grid2), resy(n_grid2), resz(n_grid2), res(n_grid2)
+
+  integer                       :: jpoint
+  double precision              :: env_r1, tmp
+  double precision              :: grad1_env(3), r1(3)
+  double precision, allocatable :: env_r2(:)
+  double precision, allocatable :: u2b_r12(:), gradx1_u2b(:), grady1_u2b(:), gradz1_u2b(:)
+  double precision, allocatable :: u2b_mu(:), gradx1_mu(:), grady1_mu(:), gradz1_mu(:)
+  double precision, allocatable :: u2b_nu(:), gradx1_nu(:), grady1_nu(:), gradz1_nu(:)
+  double precision, external    :: env_nucl
+
+  r1(1) = final_grid_points(1,ipoint)
+  r1(2) = final_grid_points(2,ipoint)
+  r1(3) = final_grid_points(3,ipoint)
+
+
+  ! j2e_type .eq. "Boys_Handy"
+  ! env_type .eq. "None"
+  ! j1e_type .eq "None"
+
+  call get_grad1_u12_r1_2e(r1, n_grid2, resx(1), resy(1), resz(1))
+
+  do jpoint = 1, n_points_extra_final_grid
+    res(jpoint) = -0.5d0 * (resx(jpoint) * resx(jpoint) + resy(jpoint) * resy(jpoint) + resz(jpoint) * resz(jpoint))
+  enddo
+
+  return
+end
+
+! ---
+
diff --git a/plugins/local/tc_int/jast_utils_bh.irp.f b/plugins/local/tc_int/jast_utils_bh.irp.f
new file mode 100644
index 00000000..750ce90b
--- /dev/null
+++ b/plugins/local/tc_int/jast_utils_bh.irp.f
@@ -0,0 +1,35 @@
+
+! ---
+
+subroutine jBH_elem_fct_grad(alpha, r1, r2, fct, grad1_fct)
+
+  implicit none
+  double precision, intent(in)  :: alpha, r1(3), r2(3)
+  double precision, intent(out) :: fct, grad1_fct(3)
+  double precision              :: dist, tmp1, tmp2
+
+  dist = dsqrt( (r1(1) - r2(1)) * (r1(1) - r2(1)) &
+              + (r1(2) - r2(2)) * (r1(2) - r2(2)) &
+              + (r1(3) - r2(3)) * (r1(3) - r2(3)) )
+
+
+  if(dist .ge. 1d-10) then
+    tmp1 = 1.d0 / (1.d0 + alpha * dist)
+
+    fct = alpha * dist * tmp1
+    tmp2 = alpha * tmp1 * tmp1 / dist
+    grad1_fct(1) = tmp2 * (r1(1) - r2(1))
+    grad1_fct(2) = tmp2 * (r1(2) - r2(2))
+    grad1_fct(3) = tmp2 * (r1(3) - r2(3))
+  else
+    grad1_fct(1) = 0.d0
+    grad1_fct(2) = 0.d0
+    grad1_fct(3) = 0.d0
+    fct = 0.d0
+  endif
+
+  return
+end 
+
+! ---
+
diff --git a/plugins/local/tc_int/write_tc_int.irp.f b/plugins/local/tc_int/write_tc_int.irp.f
new file mode 100644
index 00000000..ebdce6f2
--- /dev/null
+++ b/plugins/local/tc_int/write_tc_int.irp.f
@@ -0,0 +1,58 @@
+! ---
+
+program write_tc_int
+
+  implicit none
+
+  print *, ' j2e_type = ', j2e_type
+  print *, ' j1e_type = ', j1e_type
+  print *, ' env_type = ', env_type
+
+  my_grid_becke  = .True.
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
+  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
+
+  call write_int(6, my_n_pt_r_grid, 'radial  external grid over')
+  call write_int(6, my_n_pt_a_grid, 'angular external grid over')
+
+  if(tc_integ_type .eq. "numeric") then
+    my_extra_grid_becke  = .True.
+    PROVIDE tc_grid2_a tc_grid2_r
+    my_n_pt_r_extra_grid = tc_grid2_r
+    my_n_pt_a_extra_grid = tc_grid2_a
+    touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid
+
+    call write_int(6, my_n_pt_r_extra_grid, 'radial  internal grid over')
+    call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over')
+  endif
+
+  call main()
+
+end
+
+! ---
+
+subroutine main()
+
+  implicit none
+
+  PROVIDE io_tc_integ
+
+  print*, 'io_tc_integ = ', io_tc_integ
+
+  if(io_tc_integ .ne. "Write") then
+    print*, 'io_tc_integ != Write'
+    print*, io_tc_integ
+    stop
+  endif
+
+  call provide_int2_grad1_u12_ao()
+
+  call ezfio_set_tc_keywords_io_tc_integ('Read')
+
+end
+
+! ---
+

From a2f4bc218d207d4c588b3d0d1c1d4c5f7448b334 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Wed, 26 Jun 2024 13:44:45 +0200
Subject: [PATCH 05/19] GPU acceleration selection in configure

---
 configure | 36 +++++++++++++++++++++++++++++-------
 1 file changed, 29 insertions(+), 7 deletions(-)

diff --git a/configure b/configure
index 41c0123d..014275eb 100755
--- a/configure
+++ b/configure
@@ -40,14 +40,16 @@ Usage:
   $(basename $0) -c <file>    
   $(basename $0) -h           
   $(basename $0) -i <package> 
+  $(basename $0) -g [nvidia|none]
 
 Options:
-  -c  <file>    Define a COMPILATION configuration file,
-                in "${QP_ROOT}/config/".
-  -h            Print the HELP message
-  -i <package>  INSTALL <package>. Use at your OWN RISK:
-                no support will be provided for the installation of
-                dependencies.
+  -c  <file>        Define a COMPILATION configuration file,
+                    in "${QP_ROOT}/config/".
+  -h                Print the HELP message
+  -i <package>      INSTALL <package>. Use at your OWN RISK:
+                    no support will be provided for the installation of
+                    dependencies.
+  -g [nvidia|none]  Choose GPU acceleration (experimental)
 
 Example:
   ./$(basename $0) -c config/gfortran.cfg
@@ -83,7 +85,7 @@ function execute () {
 PACKAGES=""
 
 
-while getopts "d:c:i:h" c ; do
+while getopts "d:c:i:g:h" c ; do
     case "$c" in
         c)
             case "$OPTARG" in
@@ -100,6 +102,9 @@ while getopts "d:c:i:h" c ; do
                 "") help ; break;;
                 *) PACKAGES="${PACKAGE} $OPTARG"
             esac;;
+        g)
+            GPU=$OPTARG;
+            break;;
         h)
             help
             exit 0;;
@@ -109,6 +114,23 @@ while getopts "d:c:i:h" c ; do
     esac
 done
 
+# Handle GPU acceleration
+rm -f ${QP_ROOT}/src/gpu
+case "$GPU" in
+  amd) # Nvidia
+     echo "Activating AMD GPU acceleration"
+     ln -s ${QP_ROOT}/src/gpu_amd ${QP_ROOT}/src/gpu
+     ;;
+  nvidia) # Nvidia
+     echo "Activating Nvidia GPU acceleration"
+     ln -s ${QP_ROOT}/src/gpu_nvidia ${QP_ROOT}/src/gpu
+     ;;
+  *) # No Acceleration
+     echo "Disabling GPU acceleration"
+     ln -s ${QP_ROOT}/src/gpu_x86 ${QP_ROOT}/src/gpu
+     ;;
+esac
+
 # Trim leading and trailing spaces
 PACKAGES=$(echo $PACKAGES | xargs)
 

From 1d0bac25d081d76177c6efeba88117f397c5de3c Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Wed, 26 Jun 2024 15:31:44 +0200
Subject: [PATCH 06/19] v0

---
 plugins/local/tc_int/compute_tc_int.irp.f | 295 ++++++++++++++++++++++
 plugins/local/tc_int/int2_grad1_u12.irp.f | 134 ----------
 plugins/local/tc_int/jast_grad_2e.irp.f   | 102 --------
 plugins/local/tc_int/jast_grad_full.irp.f | 113 +++++++--
 plugins/local/tc_int/write_tc_int.irp.f   |  18 +-
 5 files changed, 401 insertions(+), 261 deletions(-)
 create mode 100644 plugins/local/tc_int/compute_tc_int.irp.f
 delete mode 100644 plugins/local/tc_int/int2_grad1_u12.irp.f
 delete mode 100644 plugins/local/tc_int/jast_grad_2e.irp.f

diff --git a/plugins/local/tc_int/compute_tc_int.irp.f b/plugins/local/tc_int/compute_tc_int.irp.f
new file mode 100644
index 00000000..02f21570
--- /dev/null
+++ b/plugins/local/tc_int/compute_tc_int.irp.f
@@ -0,0 +1,295 @@
+
+! ---
+
+subroutine provide_int2_grad1_u12_ao()
+
+  BEGIN_DOC
+  !
+  ! int2_grad1_u12_ao(i,j,ipoint,1) = \int dr2         [\grad1 u(r1,r2)]_x1 \chi_i(r2) \chi_j(r2) 
+  ! int2_grad1_u12_ao(i,j,ipoint,2) = \int dr2         [\grad1 u(r1,r2)]_y1 \chi_i(r2) \chi_j(r2) 
+  ! int2_grad1_u12_ao(i,j,ipoint,3) = \int dr2         [\grad1 u(r1,r2)]_z1 \chi_i(r2) \chi_j(r2) 
+  ! int2_grad1_u12_ao(i,j,ipoint,4) = \int dr2 [-(1/2) [\grad1 u(r1,r2)]^2] \chi_i(r2) \chi_j(r2) 
+  !
+  !
+  ! tc_int_2e_ao(k,i,l,j) = (ki|V^TC(r_12)|lj) 
+  !                       = <lk| V^TC(r_12) |ji> where V^TC(r_12) is the total TC operator 
+  !                       = tc_grad_and_lapl_ao(k,i,l,j) + tc_grad_square_ao(k,i,l,j) + ao_two_e_coul(k,i,l,j)
+  ! where:
+  !
+  ! tc_grad_and_lapl_ao(k,i,l,j) = < k l | -1/2 \Delta_1 u(r1,r2) - \grad_1 u(r1,r2) . \grad_1 | ij >
+  !                              = -1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2      \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2) 
+  !                              =  1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 (-1) \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2) 
+  !
+  ! tc_grad_square_ao(k,i,l,j) = -1/2 <kl | |\grad_1 u(r1,r2)|^2 + |\grad_2 u(r1,r2)|^2 | ij>
+  !
+  ! ao_two_e_coul(k,i,l,j) = < l k | 1/r12 | j i > = ( k i | 1/r12 | l j )
+  !
+  END_DOC
+
+  implicit none
+
+  integer                       :: i, j, k, l, m, ipoint, jpoint
+  integer                       :: n_blocks, n_rest, n_pass
+  integer                       :: i_blocks, i_rest, i_pass, ii
+  double precision              :: mem, n_double
+  double precision              :: weight1, ao_k_r, ao_i_r
+  double precision              :: der_envsq_x, der_envsq_y, der_envsq_z, lap_envsq
+  double precision              :: time0, time1, time2, tc1, tc2, tc
+  double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:), tc_int_2e_ao(:,:,:,:)
+  double precision, allocatable :: tmp(:,:,:), c_mat(:,:,:), tmp_grad1_u12(:,:,:)
+
+  double precision, external    :: get_ao_two_e_integral
+
+
+  PROVIDE final_weight_at_r_vector_extra aos_in_r_array_extra
+  PROVIDE final_weight_at_r_vector aos_grad_in_r_array_transp_bis final_weight_at_r_vector aos_in_r_array_transp
+
+
+
+  print*, ' start provide_int2_grad1_u12_ao ...'
+  call wall_time(time0)
+
+  call total_memory(mem)
+  mem      = max(1.d0, qp_max_mem - mem)
+  n_double = mem * 1.d8
+  n_blocks = int(min(n_double / (n_points_extra_final_grid * 4.d0), 1.d0*n_points_final_grid))
+  n_rest   = int(mod(n_points_final_grid, n_blocks))
+  n_pass   = int((n_points_final_grid - n_rest) / n_blocks)
+
+  call write_int(6, n_pass, 'Number of passes')
+  call write_int(6, n_blocks, 'Size of the blocks')
+  call write_int(6, n_rest, 'Size of the last block')
+
+  ! ---
+  ! ---
+  ! ---
+
+  allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,4))
+
+  allocate(tmp(n_points_extra_final_grid,ao_num,ao_num))
+  !$OMP PARALLEL               &
+  !$OMP DEFAULT (NONE)         &
+  !$OMP PRIVATE (j, i, jpoint) &
+  !$OMP SHARED (tmp, ao_num, n_points_extra_final_grid, final_weight_at_r_vector_extra, aos_in_r_array_extra_transp)
+  !$OMP DO SCHEDULE (static)
+  do j = 1, ao_num
+    do i = 1, ao_num
+      do jpoint = 1, n_points_extra_final_grid
+        tmp(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  allocate(tmp_grad1_u12(n_points_extra_final_grid,n_blocks,4))
+
+  tc = 0.d0
+  
+  do i_pass = 1, n_pass
+    ii = (i_pass-1)*n_blocks + 1
+  
+    call wall_time(tc1)
+    !$OMP PARALLEL                   &
+    !$OMP DEFAULT (NONE)             &
+    !$OMP PRIVATE (i_blocks, ipoint) &
+    !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12)
+    !$OMP DO 
+    do i_blocks = 1, n_blocks
+      ipoint = ii - 1 + i_blocks ! r1
+      call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_blocks,1), tmp_grad1_u12(1,i_blocks,2), tmp_grad1_u12(1,i_blocks,3), tmp_grad1_u12(1,i_blocks,4))
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+    call wall_time(tc2)
+    tc = tc + tc2 - tc1 
+
+    do m = 1, 4
+      call dgemm( "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0                     &
+                , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid &
+                , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) 
+    enddo
+  enddo
+  
+  deallocate(tmp_grad1_u12)
+
+  
+  if(n_rest .gt. 0) then
+  
+    allocate(tmp_grad1_u12(n_points_extra_final_grid,n_rest,4))
+  
+    ii = n_pass*n_blocks + 1
+
+    call wall_time(tc1)
+    !$OMP PARALLEL                 &
+    !$OMP DEFAULT (NONE)           &
+    !$OMP PRIVATE (i_rest, ipoint) &
+    !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12)
+    !$OMP DO 
+    do i_rest = 1, n_rest
+      ipoint = ii - 1 + i_rest ! r1
+      call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_rest,1), tmp_grad1_u12(1,i_rest,2), tmp_grad1_u12(1,i_rest,3), tmp_grad1_u12(1,i_rest,4))
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+    call wall_time(tc2)
+    tc = tc + tc2 - tc1 
+  
+    do m = 1, 4
+      call dgemm( "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0                       &
+                , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid &
+                , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) 
+    enddo
+
+    deallocate(tmp_grad1_u12)
+  endif
+
+  deallocate(tmp)
+
+
+  call wall_time(time1)
+  print*, ' wall time for int2_grad1_u12_ao (min) = ', (time1-time0) / 60.d0
+  print*, ' wall time Jastrow derivatives   (min) = ', tc / 60.d0
+  call print_memory_usage()
+
+  ! ---
+  ! ---
+  ! ---
+
+
+  allocate(tc_int_2e_ao(ao_num,ao_num,ao_num,ao_num))
+
+  call wall_time(time1)
+
+  allocate(c_mat(n_points_final_grid,ao_num,ao_num))
+  !$OMP PARALLEL               &
+  !$OMP DEFAULT (NONE)         &
+  !$OMP PRIVATE (i, k, ipoint) &
+  !$OMP SHARED (aos_in_r_array_transp, c_mat, ao_num, n_points_final_grid, final_weight_at_r_vector)
+  !$OMP DO SCHEDULE (static)
+  do i = 1, ao_num
+    do k = 1, ao_num
+      do ipoint = 1, n_points_final_grid
+        c_mat(ipoint,k,i) = final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,k)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+  call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0            &
+            , int2_grad1_u12_ao(1,1,1,4), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
+            , 0.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num)
+  deallocate(c_mat)
+
+  call wall_time(time2)
+  print*, ' wall time of Hermitian part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0
+  call print_memory_usage()
+
+  ! ---
+
+  call wall_time(time1)
+  
+  allocate(c_mat(n_points_final_grid,ao_num,ao_num))
+  do m = 1, 3
+    !$OMP PARALLEL                                                              &
+    !$OMP DEFAULT (NONE)                                                        &
+    !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r)                       & 
+    !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, c_mat, & 
+    !$OMP         ao_num, n_points_final_grid, final_weight_at_r_vector, m)
+    !$OMP DO SCHEDULE (static)
+    do i = 1, ao_num
+      do k = 1, ao_num
+        do ipoint = 1, n_points_final_grid
+  
+          weight1 = 0.5d0 * final_weight_at_r_vector(ipoint)
+          ao_i_r  = aos_in_r_array_transp(ipoint,i)
+          ao_k_r  = aos_in_r_array_transp(ipoint,k)
+  
+          c_mat(ipoint,k,i) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,m) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,m))
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+    call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, -1.d0           &
+              , int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
+              , 1.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num)
+  enddo
+  deallocate(c_mat)
+
+  call wall_time(time2)
+  print*, ' wall time of non-Hermitian part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0
+  call print_memory_usage()
+
+  ! ---
+
+  call wall_time(time1)
+
+  call sum_A_At(tc_int_2e_ao(1,1,1,1), ao_num*ao_num)
+
+  call wall_time(time2)
+  print*, ' lower- and upper-triangle of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0
+  call print_memory_usage()
+
+  ! ---
+
+  call wall_time(time1)  
+
+  PROVIDE ao_integrals_map
+  !$OMP PARALLEL DEFAULT(NONE)                         &
+  !$OMP SHARED(ao_num, tc_int_2e_ao, ao_integrals_map) &
+  !$OMP PRIVATE(i, j, k, l)
+  !$OMP DO COLLAPSE(3)
+  do j = 1, ao_num
+    do l = 1, ao_num
+      do i = 1, ao_num
+        do k = 1, ao_num
+          !                                               < 1:i, 2:j | 1:k, 2:l > 
+          tc_int_2e_ao(k,i,l,j) = tc_int_2e_ao(k,i,l,j) + get_ao_two_e_integral(i, j, k, l, ao_integrals_map)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  call wall_time(time2)
+  print*, ' wall time of Coulomb part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0
+  call print_memory_usage()
+
+  ! ---
+
+  print*, ' Writing int2_grad1_u12_ao in ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao'
+  open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="write")
+  call ezfio_set_work_empty(.False.)
+    write(11) int2_grad1_u12_ao(:,:,:,1:3)
+  close(11)
+
+  print*, ' Saving tc_int_2e_ao in ', trim(ezfio_filename) // '/work/ao_two_e_tc_tot'
+  open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/ao_two_e_tc_tot', action="write")
+  call ezfio_set_work_empty(.False.)
+  do i = 1, ao_num
+    write(11) tc_int_2e_ao(:,:,:,i)
+  enddo
+  close(11)
+
+  ! ----
+
+  deallocate(int2_grad1_u12_ao)
+  deallocate(tc_int_2e_ao)
+
+  call wall_time(time2)
+  print*, ' wall time for tc_int_2e_ao (min) = ', (time2-time1) / 60.d0
+  call print_memory_usage()
+
+  ! ---
+
+  call wall_time(time1)
+  print*, ' wall time for TC-integrals (min) = ', (time1-time0) / 60.d0
+
+  return
+end
+
+! ---
+
diff --git a/plugins/local/tc_int/int2_grad1_u12.irp.f b/plugins/local/tc_int/int2_grad1_u12.irp.f
deleted file mode 100644
index 0cf0d775..00000000
--- a/plugins/local/tc_int/int2_grad1_u12.irp.f
+++ /dev/null
@@ -1,134 +0,0 @@
-
-! ---
-
-subroutine provide_int2_grad1_u12_ao()
-
-  implicit none
-  integer                       :: ipoint, i, j, m, jpoint
-  integer                       :: n_blocks, n_rest, n_pass
-  integer                       :: i_blocks, i_rest, i_pass, ii
-  double precision              :: time0, time1
-  double precision              :: mem, n_double
-  double precision, allocatable :: tmp(:,:,:)
-  double precision, allocatable :: tmp_grad1_u12(:,:,:)
-  double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:)
-
-  PROVIDE final_weight_at_r_vector_extra aos_in_r_array_extra
-
-  print*, ' start provide_int2_grad1_u12_ao ...'
-  call wall_time(time0)
-
-
-  ! int2_grad1_u12_ao(i,j,ipoint,1) =        \int dr2 [\grad1 u(r1,r2)]_x1 \chi_i(r2) \chi_j(r2) 
-  ! int2_grad1_u12_ao(i,j,ipoint,2) =        \int dr2 [\grad1 u(r1,r2)]_y1 \chi_i(r2) \chi_j(r2) 
-  ! int2_grad1_u12_ao(i,j,ipoint,3) =        \int dr2 [\grad1 u(r1,r2)]_z1 \chi_i(r2) \chi_j(r2) 
-  ! int2_grad1_u12_ao(i,j,ipoint,4) = -(1/2) \int dr2 [\grad1 u(r1,r2)]^2  \chi_i(r2) \chi_j(r2) 
-  allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,4))
-
-
-
-  call total_memory(mem)
-  mem      = max(1.d0, qp_max_mem - mem)
-  n_double = mem * 1.d8
-  n_blocks = int(min(n_double / (n_points_extra_final_grid * 4.d0), 1.d0*n_points_final_grid))
-  n_rest   = int(mod(n_points_final_grid, n_blocks))
-  n_pass   = int((n_points_final_grid - n_rest) / n_blocks)
-
-  call write_int(6, n_pass, 'Number of passes')
-  call write_int(6, n_blocks, 'Size of the blocks')
-  call write_int(6, n_rest, 'Size of the last block')
-
-
-  allocate(tmp(n_points_extra_final_grid,ao_num,ao_num))
-  !$OMP PARALLEL               &
-  !$OMP DEFAULT (NONE)         &
-  !$OMP PRIVATE (j, i, jpoint) &
-  !$OMP SHARED (tmp, ao_num, n_points_extra_final_grid, final_weight_at_r_vector_extra, aos_in_r_array_extra_transp)
-  !$OMP DO SCHEDULE (static)
-  do j = 1, ao_num
-    do i = 1, ao_num
-      do jpoint = 1, n_points_extra_final_grid
-        tmp(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j)
-      enddo
-    enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-
-  
-  allocate(tmp_grad1_u12(n_points_extra_final_grid,n_blocks,4))
-  
-  do i_pass = 1, n_pass
-    ii = (i_pass-1)*n_blocks + 1
-  
-    !$OMP PARALLEL                   &
-    !$OMP DEFAULT (NONE)             &
-    !$OMP PRIVATE (i_blocks, ipoint) &
-    !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12)
-    !$OMP DO 
-    do i_blocks = 1, n_blocks
-      ipoint = ii - 1 + i_blocks ! r1
-      call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_blocks,1), tmp_grad1_u12(1,i_blocks,2), tmp_grad1_u12(1,i_blocks,3), tmp_grad1_u12(1,i_blocks,4))
-    enddo
-    !$OMP END DO
-    !$OMP END PARALLEL
-
-    do m = 1, 4
-      call dgemm( "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0                     &
-                , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid &
-                , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) 
-    enddo
-  enddo
-  
-  deallocate(tmp_grad1_u12)
-
-  
-  if(n_rest .gt. 0) then
-  
-    allocate(tmp_grad1_u12(n_points_extra_final_grid,n_rest,4))
-  
-    ii = n_pass*n_blocks + 1
-
-    !$OMP PARALLEL                 &
-    !$OMP DEFAULT (NONE)           &
-    !$OMP PRIVATE (i_rest, ipoint) &
-    !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12)
-    !$OMP DO 
-    do i_rest = 1, n_rest
-      ipoint = ii - 1 + i_rest ! r1
-      call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_rest,1), tmp_grad1_u12(1,i_rest,2), tmp_grad1_u12(1,i_rest,3), tmp_grad1_u12(1,i_rest,4))
-    enddo
-    !$OMP END DO
-    !$OMP END PARALLEL
-  
-    do m = 1, 4
-      call dgemm( "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0                       &
-                , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid &
-                , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) 
-    enddo
-
-    deallocate(tmp_grad1_u12)
-  endif
-
-  deallocate(tmp)
-
-
-  ! ---
-
-  print*, ' Writing int2_grad1_u12_ao in ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao'
-  open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="write")
-  call ezfio_set_work_empty(.False.)
-    write(11) int2_grad1_u12_ao(:,:,:,1:3)
-  close(11)
-
-  deallocate(int2_grad1_u12_ao)
-
-  call wall_time(time1)
-  print*, ' wall time for provide_int2_grad1_u12_ao (min) = ', (time1-time0) / 60.d0
-  call print_memory_usage()
-
-end
-
-! ---
-
-
diff --git a/plugins/local/tc_int/jast_grad_2e.irp.f b/plugins/local/tc_int/jast_grad_2e.irp.f
deleted file mode 100644
index b18b9d62..00000000
--- a/plugins/local/tc_int/jast_grad_2e.irp.f
+++ /dev/null
@@ -1,102 +0,0 @@
-
-! ---
-
-subroutine get_grad1_u12_r1_2e(r1, n_grid2, gradx, grady, gradz)
-
-  BEGIN_DOC
-  !
-  !  d/dx1 j_2e(1,2)
-  !  d/dy1 j_2e(1,2)
-  !  d/dz1 j_2e(1,2)
-  !
-  END_DOC
-
-  include 'constants.include.F'
-
-  implicit none
-  integer         , intent(in)  :: n_grid2
-  double precision, intent(in)  :: r1(3)
-  double precision, intent(out) :: gradx(n_grid2)
-  double precision, intent(out) :: grady(n_grid2)
-  double precision, intent(out) :: gradz(n_grid2)
-
-  integer                       :: jpoint
-  integer                       :: i_nucl, p, mpA, npA, opA
-  integer                       :: powmax1, powmax, powmax2
-  double precision              :: r2(3)
-  double precision              :: tmp, tmp1, tmp2
-  double precision              :: rn(3), f1A, grad1_f1A(3), f2A, grad2_f2A(3), g12, grad1_g12(3)
-  double precision, allocatable :: f1A_power(:), f2A_power(:), double_p(:), g12_power(:)
-
-
-  powmax1 = max(maxval(jBH_m), maxval(jBH_n))
-  powmax2 = maxval(jBH_o)
-  powmax  = max(powmax1, powmax2)
-
-  allocate(f1A_power(-1:powmax), f2A_power(-1:powmax), g12_power(-1:powmax), double_p(0:powmax))
-
-  do p = 0, powmax
-    double_p(p) = dble(p)
-  enddo
-
-  f1A_power(-1) = 0.d0
-  f2A_power(-1) = 0.d0
-  g12_power(-1) = 0.d0
-
-  f1A_power(0) = 1.d0
-  f2A_power(0) = 1.d0
-  g12_power(0) = 1.d0
-
-  do jpoint = 1, n_points_extra_final_grid ! r2
-
-    r2(1) = final_grid_points_extra(1,jpoint)
-    r2(2) = final_grid_points_extra(2,jpoint)
-    r2(3) = final_grid_points_extra(3,jpoint)
-
-    gradx(jpoint) = 0.d0 
-    grady(jpoint) = 0.d0 
-    gradz(jpoint) = 0.d0 
-    do i_nucl = 1, nucl_num 
-
-      rn(1) = nucl_coord(i_nucl,1)
-      rn(2) = nucl_coord(i_nucl,2)
-      rn(3) = nucl_coord(i_nucl,3)
-
-      call jBH_elem_fct_grad(jBH_en(i_nucl), r1, rn, f1A, grad1_f1A)
-      call jBH_elem_fct_grad(jBH_en(i_nucl), r2, rn, f2A, grad2_f2A)
-      call jBH_elem_fct_grad(jBH_ee(i_nucl), r1, r2, g12, grad1_g12)
-
-      ! Compute powers of f1A and f2A
-      do p = 1, powmax1
-        f1A_power(p) = f1A_power(p-1) * f1A
-        f2A_power(p) = f2A_power(p-1) * f2A
-      enddo
-      do p = 1, powmax2
-        g12_power(p) = g12_power(p-1) * g12
-      enddo
-
-      do p = 1, jBH_size
-        mpA = jBH_m(p,i_nucl)
-        npA = jBH_n(p,i_nucl)
-        opA = jBH_o(p,i_nucl)
-        tmp = jBH_c(p,i_nucl)
-        if(mpA .eq. npA) then
-          tmp = tmp * 0.5d0
-        endif
-
-        tmp1 = double_p(mpA) * f1A_power(mpA-1) * f2A_power(npA) + double_p(npA) * f1A_power(npA-1) * f2A_power(mpA)
-        tmp1 = tmp1 * g12_power(opA) * tmp
-        tmp2 = double_p(opA) * g12_power(opA-1) * (f1A_power(mpA) * f2A_power(npA) + f1A_power(npA) * f2A_power(mpA)) * tmp
-
-        gradx(jpoint) = gradx(jpoint) + tmp1 * grad1_f1A(1) + tmp2 * grad1_g12(1)
-        grady(jpoint) = grady(jpoint) + tmp1 * grad1_f1A(2) + tmp2 * grad1_g12(2)
-        gradz(jpoint) = gradz(jpoint) + tmp1 * grad1_f1A(3) + tmp2 * grad1_g12(3)
-      enddo ! p
-    enddo ! i_nucl
-  enddo ! jpoint
-
-  return
-end
-
-! ---
-
diff --git a/plugins/local/tc_int/jast_grad_full.irp.f b/plugins/local/tc_int/jast_grad_full.irp.f
index f63ee3e4..78ed1edf 100644
--- a/plugins/local/tc_int/jast_grad_full.irp.f
+++ b/plugins/local/tc_int/jast_grad_full.irp.f
@@ -16,31 +16,26 @@ subroutine get_grad1_u12_for_tc(ipoint, n_grid2, resx, resy, resz, res)
   !
   END_DOC
 
+  include 'constants.include.F'
+
   implicit none
   integer,          intent(in)  :: ipoint, n_grid2
   double precision, intent(out) :: resx(n_grid2), resy(n_grid2), resz(n_grid2), res(n_grid2)
 
-  integer                       :: jpoint
-  double precision              :: env_r1, tmp
-  double precision              :: grad1_env(3), r1(3)
-  double precision, allocatable :: env_r2(:)
-  double precision, allocatable :: u2b_r12(:), gradx1_u2b(:), grady1_u2b(:), gradz1_u2b(:)
-  double precision, allocatable :: u2b_mu(:), gradx1_mu(:), grady1_mu(:), gradz1_mu(:)
-  double precision, allocatable :: u2b_nu(:), gradx1_nu(:), grady1_nu(:), gradz1_nu(:)
-  double precision, external    :: env_nucl
+  integer                       :: jpoint, i_nucl, p, mpA, npA, opA, pp
+  integer                       :: powmax1, powmax, powmax2
+  double precision              :: r1(3), r2(3)
+  double precision              :: tmp, tmp1, tmp2, tmp11, tmp22
+  double precision              :: rn(3), f1A, grad1_f1A(3), f2A, grad2_f2A(3), g12, grad1_g12(3)
+  double precision, allocatable :: f1A_power(:), f2A_power(:), double_p(:), g12_power(:)
 
   r1(1) = final_grid_points(1,ipoint)
   r1(2) = final_grid_points(2,ipoint)
   r1(3) = final_grid_points(3,ipoint)
 
+  call grad1_j12_r1_seq(r1, n_grid2, resx, resy, resz)
 
-  ! j2e_type .eq. "Boys_Handy"
-  ! env_type .eq. "None"
-  ! j1e_type .eq "None"
-
-  call get_grad1_u12_r1_2e(r1, n_grid2, resx(1), resy(1), resz(1))
-
-  do jpoint = 1, n_points_extra_final_grid
+  do jpoint = 1, n_grid2 ! r2
     res(jpoint) = -0.5d0 * (resx(jpoint) * resx(jpoint) + resy(jpoint) * resy(jpoint) + resz(jpoint) * resz(jpoint))
   enddo
 
@@ -49,3 +44,91 @@ end
 
 ! ---
 
+subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
+
+  include 'constants.include.F'
+
+  implicit none
+  integer         , intent(in)  :: n_grid2
+  double precision, intent(in)  :: r1(3)
+  double precision, intent(out) :: gradx(n_grid2)
+  double precision, intent(out) :: grady(n_grid2)
+  double precision, intent(out) :: gradz(n_grid2)
+
+  integer                       :: jpoint, i_nucl, p, mpA, npA, opA
+  double precision              :: r2(3)
+  double precision              :: dx, dy, dz, r12, tmp
+  double precision              :: rn(3), f1A, grad1_f1A(3), f2A, grad2_f2A(3), g12, grad1_g12(3)
+  double precision              :: tmp1, tmp2
+  integer                       :: powmax1, powmax, powmax2
+  double precision, allocatable :: f1A_power(:), f2A_power(:), double_p(:), g12_power(:)
+
+  powmax1 = max(maxval(jBH_m), maxval(jBH_n))
+  powmax2 = maxval(jBH_o)
+  powmax  = max(powmax1, powmax2)
+
+  allocate(f1A_power(-1:powmax), f2A_power(-1:powmax), g12_power(-1:powmax), double_p(0:powmax))
+
+  do p = 0, powmax
+    double_p(p) = dble(p)
+  enddo
+
+  f1A_power(-1) = 0.d0
+  f2A_power(-1) = 0.d0
+  g12_power(-1) = 0.d0
+
+  f1A_power(0) = 1.d0
+  f2A_power(0) = 1.d0
+  g12_power(0) = 1.d0
+
+  do jpoint = 1, n_grid2 ! r2
+
+    r2(1) = final_grid_points_extra(1,jpoint)
+    r2(2) = final_grid_points_extra(2,jpoint)
+    r2(3) = final_grid_points_extra(3,jpoint)
+
+    gradx(jpoint) = 0.d0
+    grady(jpoint) = 0.d0
+    gradz(jpoint) = 0.d0
+    do i_nucl = 1, nucl_num
+
+      rn(1) = nucl_coord(i_nucl,1)
+      rn(2) = nucl_coord(i_nucl,2)
+      rn(3) = nucl_coord(i_nucl,3)
+
+      call jBH_elem_fct_grad(jBH_en(i_nucl), r1, rn, f1A, grad1_f1A)
+      call jBH_elem_fct_grad(jBH_en(i_nucl), r2, rn, f2A, grad2_f2A)
+      call jBH_elem_fct_grad(jBH_ee(i_nucl), r1, r2, g12, grad1_g12)
+
+      ! Compute powers of f1A and f2A
+      do p = 1, powmax1
+        f1A_power(p) = f1A_power(p-1) * f1A
+        f2A_power(p) = f2A_power(p-1) * f2A
+      enddo
+      do p = 1, powmax2
+        g12_power(p) = g12_power(p-1) * g12
+      enddo
+
+      do p = 1, jBH_size
+        mpA = jBH_m(p,i_nucl)
+        npA = jBH_n(p,i_nucl)
+        opA = jBH_o(p,i_nucl)
+        tmp = jBH_c(p,i_nucl)
+        if(mpA .eq. npA) then
+          tmp = tmp * 0.5d0
+        endif
+
+        tmp1 = double_p(mpA) * f1A_power(mpA-1) * f2A_power(npA) + double_p(npA) * f1A_power(npA-1) * f2A_power(mpA)
+        tmp1 = tmp1 * g12_power(opA) * tmp
+        tmp2 = double_p(opA) * g12_power(opA-1) * (f1A_power(mpA) * f2A_power(npA) + f1A_power(npA) * f2A_power(mpA)) * tmp
+
+        gradx(jpoint) = gradx(jpoint) + tmp1 * grad1_f1A(1) + tmp2 * grad1_g12(1)
+        grady(jpoint) = grady(jpoint) + tmp1 * grad1_f1A(2) + tmp2 * grad1_g12(2)
+        gradz(jpoint) = gradz(jpoint) + tmp1 * grad1_f1A(3) + tmp2 * grad1_g12(3)
+      enddo ! p
+    enddo ! i_nucl
+  enddo ! jpoint
+
+  return
+end
+
diff --git a/plugins/local/tc_int/write_tc_int.irp.f b/plugins/local/tc_int/write_tc_int.irp.f
index ebdce6f2..9f25a6fd 100644
--- a/plugins/local/tc_int/write_tc_int.irp.f
+++ b/plugins/local/tc_int/write_tc_int.irp.f
@@ -14,19 +14,17 @@ program write_tc_int
   my_n_pt_a_grid = tc_grid1_a
   touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
 
+  my_extra_grid_becke  = .True.
+  PROVIDE tc_grid2_a tc_grid2_r
+  my_n_pt_r_extra_grid = tc_grid2_r
+  my_n_pt_a_extra_grid = tc_grid2_a
+  touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid
+
   call write_int(6, my_n_pt_r_grid, 'radial  external grid over')
   call write_int(6, my_n_pt_a_grid, 'angular external grid over')
 
-  if(tc_integ_type .eq. "numeric") then
-    my_extra_grid_becke  = .True.
-    PROVIDE tc_grid2_a tc_grid2_r
-    my_n_pt_r_extra_grid = tc_grid2_r
-    my_n_pt_a_extra_grid = tc_grid2_a
-    touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid
-
-    call write_int(6, my_n_pt_r_extra_grid, 'radial  internal grid over')
-    call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over')
-  endif
+  call write_int(6, my_n_pt_r_extra_grid, 'radial  internal grid over')
+  call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over')
 
   call main()
 

From a9d2f0e188cdc88e1cfe1387de4f1c118bb17a5d Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Wed, 26 Jun 2024 17:55:56 +0200
Subject: [PATCH 07/19] Working on TC ints. Not well tested

---
 plugins/local/tc_int/jast_grad_full.irp.f | 127 ++++++++++++++++++++--
 plugins/local/tc_int/jast_utils_bh.irp.f  |  22 ++--
 2 files changed, 134 insertions(+), 15 deletions(-)

diff --git a/plugins/local/tc_int/jast_grad_full.irp.f b/plugins/local/tc_int/jast_grad_full.irp.f
index 78ed1edf..599d3779 100644
--- a/plugins/local/tc_int/jast_grad_full.irp.f
+++ b/plugins/local/tc_int/jast_grad_full.irp.f
@@ -4,7 +4,7 @@
 subroutine get_grad1_u12_for_tc(ipoint, n_grid2, resx, resy, resz, res)
 
   BEGIN_DOC
-  ! 
+  !
   ! resx(ipoint) =      [grad1 u(r1,r2)]_x1
   ! resy(ipoint) =      [grad1 u(r1,r2)]_y1
   ! resz(ipoint) =      [grad1 u(r1,r2)]_z1
@@ -59,7 +59,7 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
   double precision              :: r2(3)
   double precision              :: dx, dy, dz, r12, tmp
   double precision              :: rn(3), f1A, grad1_f1A(3), f2A, grad2_f2A(3), g12, grad1_g12(3)
-  double precision              :: tmp1, tmp2
+  double precision              :: tmp1, tmp2, dist
   integer                       :: powmax1, powmax, powmax2
   double precision, allocatable :: f1A_power(:), f2A_power(:), double_p(:), g12_power(:)
 
@@ -90,30 +90,105 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
     gradx(jpoint) = 0.d0
     grady(jpoint) = 0.d0
     gradz(jpoint) = 0.d0
+
+    call jBH_elem_fct_grad_alpha1(r1, r2, g12, grad1_g12)
+
+!    dist =   (r1(1) - r2(1)) * (r1(1) - r2(1)) &
+!           + (r1(2) - r2(2)) * (r1(2) - r2(2)) &
+!           + (r1(3) - r2(3)) * (r1(3) - r2(3))
+!
+!    if(dist .ge. 1d-15) then
+!      dist = dsqrt( dist )
+!
+!      tmp1 = 1.d0 / (1.d0 + dist)
+!
+!      g12 = dist * tmp1
+!      tmp2 = tmp1 * tmp1 / dist
+!      grad1_g12(1) = tmp2 * (r1(1) - r2(1))
+!      grad1_g12(2) = tmp2 * (r1(2) - r2(2))
+!      grad1_g12(3) = tmp2 * (r1(3) - r2(3))
+!
+!    else
+!
+!      grad1_g12(1) = 0.d0
+!      grad1_g12(2) = 0.d0
+!      grad1_g12(3) = 0.d0
+!      g12 = 0.d0
+!
+!    endif
+!
+    do p = 1, powmax2
+      g12_power(p) = g12_power(p-1) * g12
+    enddo
+
     do i_nucl = 1, nucl_num
 
       rn(1) = nucl_coord(i_nucl,1)
       rn(2) = nucl_coord(i_nucl,2)
       rn(3) = nucl_coord(i_nucl,3)
 
-      call jBH_elem_fct_grad(jBH_en(i_nucl), r1, rn, f1A, grad1_f1A)
-      call jBH_elem_fct_grad(jBH_en(i_nucl), r2, rn, f2A, grad2_f2A)
-      call jBH_elem_fct_grad(jBH_ee(i_nucl), r1, r2, g12, grad1_g12)
+        call jBH_elem_fct_grad_alpha1(r1, rn, f1A, grad1_f1A)
+!      dist =   (r1(1) - rn(1)) * (r1(1) - rn(1)) &
+!             + (r1(2) - rn(2)) * (r1(2) - rn(2)) &
+!             + (r1(3) - rn(3)) * (r1(3) - rn(3))
+!      if (dist > 1.d-15) then
+!        dist = dsqrt( dist )
+!
+!        tmp1 = 1.d0 / (1.d0 + dist)
+!
+!        f1A = dist * tmp1
+!        tmp2 = tmp1 * tmp1 / dist
+!        grad1_f1A(1) = tmp2 * (r1(1) - rn(1))
+!        grad1_f1A(2) = tmp2 * (r1(2) - rn(2))
+!        grad1_f1A(3) = tmp2 * (r1(3) - rn(3))
+!
+!      else
+!
+!        grad1_f1A(1) = 0.d0
+!        grad1_f1A(2) = 0.d0
+!        grad1_f1A(3) = 0.d0
+!        f1A = 0.d0
+!
+!      endif
+
+        call jBH_elem_fct_grad_alpha1(r2, rn, f2A, grad2_f2A)
+!      dist =   (r2(1) - rn(1)) * (r2(1) - rn(1)) &
+!             + (r2(2) - rn(2)) * (r2(2) - rn(2)) &
+!             + (r2(3) - rn(3)) * (r2(3) - rn(3))
+!
+!      if (dist > 1.d-15) then
+!        dist = dsqrt( dist )
+!
+!        tmp1 = 1.d0 / (1.d0 + dist)
+!
+!        f2A = dist * tmp1
+!        tmp2 = tmp1 * tmp1 / dist
+!        grad2_f2A(1) = tmp2 * (r2(1) - rn(1))
+!        grad2_f2A(2) = tmp2 * (r2(2) - rn(2))
+!        grad2_f2A(3) = tmp2 * (r2(3) - rn(3))
+!
+!      else
+!
+!        grad2_f2A(1) = 0.d0
+!        grad2_f2A(2) = 0.d0
+!        grad2_f2A(3) = 0.d0
+!        f2A = 0.d0
+!
+!      endif
 
       ! Compute powers of f1A and f2A
       do p = 1, powmax1
         f1A_power(p) = f1A_power(p-1) * f1A
         f2A_power(p) = f2A_power(p-1) * f2A
       enddo
-      do p = 1, powmax2
-        g12_power(p) = g12_power(p-1) * g12
-      enddo
 
       do p = 1, jBH_size
         mpA = jBH_m(p,i_nucl)
         npA = jBH_n(p,i_nucl)
         opA = jBH_o(p,i_nucl)
         tmp = jBH_c(p,i_nucl)
+!        if (dabs(tmp) <= 1.d-10) cycle
+!
         if(mpA .eq. npA) then
           tmp = tmp * 0.5d0
         endif
@@ -132,3 +207,39 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
   return
 end
 
+subroutine jBH_elem_fct_grad_alpha1(r1, r2, fct, grad1_fct)
+
+  implicit none
+  double precision, intent(in)  :: r1(3), r2(3)
+  double precision, intent(out) :: fct, grad1_fct(3)
+  double precision              :: dist, tmp1, tmp2
+
+  dist =   (r1(1) - r2(1)) * (r1(1) - r2(1)) &
+         + (r1(2) - r2(2)) * (r1(2) - r2(2)) &
+         + (r1(3) - r2(3)) * (r1(3) - r2(3))
+
+
+  if(dist .ge. 1d-15) then
+    dist = dsqrt( dist )
+
+    tmp1 = 1.d0 / (1.d0 + dist)
+
+    fct = dist * tmp1
+    tmp2 = tmp1 * tmp1 / dist
+    grad1_fct(1) = tmp2 * (r1(1) - r2(1))
+    grad1_fct(2) = tmp2 * (r1(2) - r2(2))
+    grad1_fct(3) = tmp2 * (r1(3) - r2(3))
+
+  else
+
+    grad1_fct(1) = 0.d0
+    grad1_fct(2) = 0.d0
+    grad1_fct(3) = 0.d0
+    fct = 0.d0
+
+  endif
+
+  return
+end
+
+! ---
diff --git a/plugins/local/tc_int/jast_utils_bh.irp.f b/plugins/local/tc_int/jast_utils_bh.irp.f
index 750ce90b..200bc5ff 100644
--- a/plugins/local/tc_int/jast_utils_bh.irp.f
+++ b/plugins/local/tc_int/jast_utils_bh.irp.f
@@ -1,35 +1,43 @@
 
 ! ---
 
+
+
 subroutine jBH_elem_fct_grad(alpha, r1, r2, fct, grad1_fct)
 
   implicit none
   double precision, intent(in)  :: alpha, r1(3), r2(3)
   double precision, intent(out) :: fct, grad1_fct(3)
-  double precision              :: dist, tmp1, tmp2
+  double precision              :: dist, tmp1, tmp2, dist_inv
 
-  dist = dsqrt( (r1(1) - r2(1)) * (r1(1) - r2(1)) &
-              + (r1(2) - r2(2)) * (r1(2) - r2(2)) &
-              + (r1(3) - r2(3)) * (r1(3) - r2(3)) )
+  dist =   (r1(1) - r2(1)) * (r1(1) - r2(1)) &
+         + (r1(2) - r2(2)) * (r1(2) - r2(2)) &
+         + (r1(3) - r2(3)) * (r1(3) - r2(3))
 
 
-  if(dist .ge. 1d-10) then
+  if(dist .ge. 1d-15) then
+    dist_inv = 1.d0/dsqrt( dist )
+    dist = dist_inv * dist
+
     tmp1 = 1.d0 / (1.d0 + alpha * dist)
 
     fct = alpha * dist * tmp1
-    tmp2 = alpha * tmp1 * tmp1 / dist
+    tmp2 = alpha * tmp1 * tmp1 * dist_inv
     grad1_fct(1) = tmp2 * (r1(1) - r2(1))
     grad1_fct(2) = tmp2 * (r1(2) - r2(2))
     grad1_fct(3) = tmp2 * (r1(3) - r2(3))
+
   else
+
     grad1_fct(1) = 0.d0
     grad1_fct(2) = 0.d0
     grad1_fct(3) = 0.d0
     fct = 0.d0
+
   endif
 
   return
-end 
+end
 
 ! ---
 

From 5d80cb7b2dd53bdd9eb713e507912e6fce3cadd7 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Thu, 27 Jun 2024 12:06:06 +0200
Subject: [PATCH 08/19] Separated gpu and gpu_arch

---
 configure                           |  8 ++--
 src/ccsd/NEED                       |  1 +
 src/ccsd/ccsd_space_orb_sub.irp.f   | 14 +++++--
 src/gpu/NEED                        |  1 +
 src/gpu/README.rst                  |  6 +++
 src/{gpu_x86 => gpu}/gpu.h          |  0
 src/{gpu_x86 => gpu}/gpu_module.F90 | 59 +++++++++++++++--------------
 src/gpu_x86/gpu.c                   |  2 +-
 8 files changed, 54 insertions(+), 37 deletions(-)
 create mode 100644 src/gpu/NEED
 create mode 100644 src/gpu/README.rst
 rename src/{gpu_x86 => gpu}/gpu.h (100%)
 rename src/{gpu_x86 => gpu}/gpu_module.F90 (74%)

diff --git a/configure b/configure
index 014275eb..db158966 100755
--- a/configure
+++ b/configure
@@ -115,19 +115,19 @@ while getopts "d:c:i:g:h" c ; do
 done
 
 # Handle GPU acceleration
-rm -f ${QP_ROOT}/src/gpu
+rm -f ${QP_ROOT}/src/gpu_arch
 case "$GPU" in
   amd) # Nvidia
      echo "Activating AMD GPU acceleration"
-     ln -s ${QP_ROOT}/src/gpu_amd ${QP_ROOT}/src/gpu
+     ln -s ${QP_ROOT}/src/gpu_amd ${QP_ROOT}/src/gpu_arch
      ;;
   nvidia) # Nvidia
      echo "Activating Nvidia GPU acceleration"
-     ln -s ${QP_ROOT}/src/gpu_nvidia ${QP_ROOT}/src/gpu
+     ln -s ${QP_ROOT}/src/gpu_nvidia ${QP_ROOT}/src/gpu_arch
      ;;
   *) # No Acceleration
      echo "Disabling GPU acceleration"
-     ln -s ${QP_ROOT}/src/gpu_x86 ${QP_ROOT}/src/gpu
+     ln -s ${QP_ROOT}/src/gpu_x86 ${QP_ROOT}/src/gpu_arch
      ;;
 esac
 
diff --git a/src/ccsd/NEED b/src/ccsd/NEED
index e6e6bc59..8298f28e 100644
--- a/src/ccsd/NEED
+++ b/src/ccsd/NEED
@@ -1,2 +1,3 @@
+gpu
 hartree_fock
 utils_cc
diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f
index 9d4ae7f9..84aab08a 100644
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@@ -1,4 +1,5 @@
 subroutine run_ccsd_space_orb
+  use gpu
 
   implicit none
 
@@ -11,7 +12,7 @@ subroutine run_ccsd_space_orb
 
   double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:), tau(:,:,:,:), tau_x(:,:,:,:)
   double precision, allocatable :: t1(:,:), r1(:,:)
-  double precision, allocatable :: H_oo(:,:), H_vv(:,:), H_vo(:,:)
+  double precision, pointer     :: H_oo, H_vv, H_vo
 
   double precision, allocatable :: all_err(:,:), all_t(:,:)
   integer, allocatable          :: list_occ(:), list_vir(:)
@@ -55,7 +56,10 @@ subroutine run_ccsd_space_orb
   allocate(tau(nO,nO,nV,nV))
   allocate(tau_x(nO,nO,nV,nV))
   allocate(t1(nO,nV), r1(nO,nV))
-  allocate(H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO))
+
+  call gpu_allocate_double(H_oo, (/ nO, nO /) )
+  call gpu_allocate_double(H_vv, (/ nV, nV /) )
+  call gpu_allocate_double(H_vo, (/ nV, nO /) )
 
   if (cc_update_method == 'diis') then
     double precision :: rss, diis_mem, extra_mem
@@ -191,7 +195,11 @@ subroutine run_ccsd_space_orb
     deallocate(all_err,all_t)
   endif
 
-  deallocate(H_vv,H_oo,H_vo,r1,r2,tau)
+  call gpu_deallocate_double(H_oo)
+  call gpu_deallocate_double(H_vv)
+  call gpu_deallocate_double(H_vo)
+
+  deallocate(r1,r2,tau)
 
   ! CCSD(T)
   double precision :: e_t, e_t_err
diff --git a/src/gpu/NEED b/src/gpu/NEED
new file mode 100644
index 00000000..c2af78d2
--- /dev/null
+++ b/src/gpu/NEED
@@ -0,0 +1 @@
+gpu_arch
diff --git a/src/gpu/README.rst b/src/gpu/README.rst
new file mode 100644
index 00000000..17ee28a0
--- /dev/null
+++ b/src/gpu/README.rst
@@ -0,0 +1,6 @@
+===
+gpu
+===
+
+Bindings for GPU routines (architecture independent).
+Architecture-dependent files are in gpu_arch.
diff --git a/src/gpu_x86/gpu.h b/src/gpu/gpu.h
similarity index 100%
rename from src/gpu_x86/gpu.h
rename to src/gpu/gpu.h
diff --git a/src/gpu_x86/gpu_module.F90 b/src/gpu/gpu_module.F90
similarity index 74%
rename from src/gpu_x86/gpu_module.F90
rename to src/gpu/gpu_module.F90
index 86ba3926..f35ebc97 100644
--- a/src/gpu_x86/gpu_module.F90
+++ b/src/gpu/gpu_module.F90
@@ -1,5 +1,5 @@
 module gpu
-  use, intrinsic :: iso_c_binding, only : c_int32_t, c_int64_t, c_double, c_size_t, c_char
+  use, intrinsic :: iso_c_binding
   implicit none
 
   interface
@@ -17,7 +17,7 @@ module gpu
       integer(c_int64_t), value :: n
     end subroutine
 
-    subroutine gpu_free_c(ptr) bind(C, name='gpu_free')
+    subroutine gpu_deallocate_c(ptr) bind(C, name='gpu_deallocate')
       import
       type(c_ptr) :: ptr
     end subroutine
@@ -89,53 +89,54 @@ module gpu
 
   end interface
 
+  contains
+
+
+    subroutine gpu_allocate_double(ptr, s)
+      implicit none
+      double precision, pointer, intent(inout) :: ptr
+      integer, intent(in) :: s(:)
+      type(c_ptr) :: cptr
+
+      call gpu_allocate_c(cptr, sum(s*1_8)*8_8)
+      call c_f_pointer(cptr, ptr, s)
+    end subroutine
+
+    subroutine gpu_deallocate_double(ptr)
+      implicit none
+      double precision, pointer, intent(inout) :: ptr
+      type(c_ptr) :: cptr
+      cptr = c_loc(ptr)
+      call gpu_deallocate(cptr)
+      NULLIFY(ptr)
+    end subroutine
+
 end module
 
-subroutine gpu_allocate_double(ptr, s)
-  use gpu
-  implicit none
-  double precision, pointer, intent(inout) :: ptr
-  integer*8, intent(in) :: s(*)
-  type(c_ptr) :: cptr
-
-  call gpu_allocate_c(cptr, sum(s)*8_8)
-  call c_f_pointer(cptr, ptr, s)
-end subroutine
-
-subroutine gpu_free_double(ptr)
-  use gpu
-  implicit none
-  double precision, pointer, intent(inout) :: ptr
-  type(c_ptr) :: cptr
-  cptr = cloc(ptr)
-  call gpu_free(cptr)
-  NULLIFY(ptr)
-end subroutine
-
 subroutine gpu_upload_double(cpu_ptr, gpu_ptr, n)
   use gpu
   implicit none
   double precision, intent(in)   :: cpu_ptr(*)
-  double precision, intent(out)  :: gpu_ptr(*)
+  double precision, intent(in)   :: gpu_ptr(*)
   integer(c_int64_t), intent(in) :: n
-  call gpu_upload_c(cpu_ptr, gpu_ptr, 8_8*n)
+  call gpu_upload_c(c_loc(cpu_ptr), c_loc(gpu_ptr), 8_8*n)
 end subroutine
 
 subroutine gpu_download_double(gpu_ptr, cpu_ptr, n)
   use gpu
   implicit none
   double precision, intent(in)   :: gpu_ptr(*)
-  double precision, intent(out)  :: cpu_ptr(*)
+  double precision, intent(in)   :: cpu_ptr(*)
   integer(c_int64_t), intent(in) :: n
-  call gpu_download_c(gpu_ptr, cpu_ptr, 8_8*n)
+  call gpu_download_c(c_loc(gpu_ptr), c_loc(cpu_ptr), 8_8*n)
 end subroutine
 
 subroutine gpu_copy_double(gpu_ptr_src, gpu_ptr_dest, n)
   use gpu
   implicit none
   double precision, intent(in)   :: gpu_ptr_src(*)
-  double precision, intent(out)  :: gpu_ptr_dest(*)
+  double precision, intent(in)   :: gpu_ptr_dest(*)
   integer(c_int64_t), intent(in) :: n
-  call gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, 8_8*n)
+  call gpu_copy_c(c_loc(gpu_ptr_src), c_loc(gpu_ptr_dest), 8_8*n)
 end subroutine
 
diff --git a/src/gpu_x86/gpu.c b/src/gpu_x86/gpu.c
index 71505dbe..41ede396 100644
--- a/src/gpu_x86/gpu.c
+++ b/src/gpu_x86/gpu.c
@@ -25,7 +25,7 @@ void gpu_allocate(void** ptr, const int64_t n) {
   }
 }
 
-void gpu_free(void** ptr) {
+void gpu_deallocate(void** ptr) {
   free(*ptr);
   *ptr = NULL;
 }

From 6c02ac0f0b05ea3cc16e0fde66e23c9a0de14246 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Thu, 27 Jun 2024 12:07:48 +0200
Subject: [PATCH 09/19] Separated gpu and gpu_arch

---
 src/gpu/gpu_module.F90 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90
index f35ebc97..43754454 100644
--- a/src/gpu/gpu_module.F90
+++ b/src/gpu/gpu_module.F90
@@ -107,7 +107,7 @@ module gpu
       double precision, pointer, intent(inout) :: ptr
       type(c_ptr) :: cptr
       cptr = c_loc(ptr)
-      call gpu_deallocate(cptr)
+      call gpu_deallocate_c(cptr)
       NULLIFY(ptr)
     end subroutine
 

From fa6d1419496d271a4715efc776790ce7fc152064 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Thu, 27 Jun 2024 15:45:52 +0200
Subject: [PATCH 10/19] Introducing GPU in CCSD

---
 src/ccsd/ccsd_space_orb_sub.irp.f      | 224 +++++++-----
 src/ccsd/ccsd_space_orb_sub_chol.irp.f |  12 +-
 src/gpu/gpu_module.F90                 | 450 ++++++++++++++++++++++---
 src/gpu_x86/gpu.c                      |  48 +--
 4 files changed, 570 insertions(+), 164 deletions(-)

diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f
index 84aab08a..455d62f7 100644
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@@ -10,9 +10,9 @@ subroutine run_ccsd_space_orb
   double precision :: uncorr_energy,energy, max_elem, max_r, max_r1, max_r2,ta,tb
   logical :: not_converged
 
-  double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:), tau(:,:,:,:), tau_x(:,:,:,:)
-  double precision, allocatable :: t1(:,:), r1(:,:)
-  double precision, pointer     :: H_oo, H_vv, H_vo
+  type(gpu_double4) :: t2, r2, tau, tau_x
+  type(gpu_double2) :: t1, r1
+  type(gpu_double2) :: H_oo, H_vv, H_vo
 
   double precision, allocatable :: all_err(:,:), all_t(:,:)
   integer, allocatable          :: list_occ(:), list_vir(:)
@@ -52,14 +52,15 @@ subroutine run_ccsd_space_orb
   !print*,'occ',list_occ
   !print*,'vir',list_vir
 
-  allocate(t2(nO,nO,nV,nV), r2(nO,nO,nV,nV))
-  allocate(tau(nO,nO,nV,nV))
-  allocate(tau_x(nO,nO,nV,nV))
-  allocate(t1(nO,nV), r1(nO,nV))
-
-  call gpu_allocate_double(H_oo, (/ nO, nO /) )
-  call gpu_allocate_double(H_vv, (/ nV, nV /) )
-  call gpu_allocate_double(H_vo, (/ nV, nO /) )
+  call gpu_allocate(t2, nO,nO,nV,nV)
+  call gpu_allocate(r2, nO,nO,nV,nV)
+  call gpu_allocate(tau, nO,nO,nV,nV)
+  call gpu_allocate(tau_x, nO,nO,nV,nV)
+  call gpu_allocate(t1, nO,nV)
+  call gpu_allocate(r1, nO,nV)
+  call gpu_allocate(H_oo, nO, nO)
+  call gpu_allocate(H_vo, nV, nO)
+  call gpu_allocate(H_vv, nV, nV)
 
   if (cc_update_method == 'diis') then
     double precision :: rss, diis_mem, extra_mem
@@ -101,14 +102,21 @@ subroutine run_ccsd_space_orb
   endif
 
   ! Init
-  call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,t1)
-  call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,t2)
-  call update_tau_space(nO,nV,t1,t2,tau)
+  double precision, allocatable :: h_t1(:,:), h_t2(:,:,:,:)
+  allocate(h_t1(nO,nV), h_t2(nO,nO,nV,nV))
+
+  call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,h_t1)
+  call gpu_upload(h_t1, t1)
+
+  call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,h_t2)
+  call gpu_upload(h_t2, t2)
+
+  call update_tau_space(nO,nV,h_t1,t1,t2,tau)
   call update_tau_x_space(nO,nV,tau,tau_x)
   !print*,'hf_energy', hf_energy
   call det_energy(det,uncorr_energy)
   print*,'Det energy', uncorr_energy
-  call ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
+  call ccsd_energy_space_x(nO,nV,tau_x%f,t1%f,energy)
   print*,'Guess energy', uncorr_energy+energy, energy
 
   nb_iter = 0
@@ -127,40 +135,38 @@ subroutine run_ccsd_space_orb
     if (do_ao_cholesky) then
 !    if (.False.) then
       call compute_H_oo_chol(nO,nV,tau_x,H_oo)
-      call compute_H_vv_chol(nO,nV,tau_x,H_vv)
-      call compute_H_vo_chol(nO,nV,t1,H_vo)
+      call compute_H_vv_chol(nO,nV,tau_x%f,H_vv%f)
+      call compute_H_vo_chol(nO,nV,t1%f,H_vo%f)
 
-      call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
-      call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
+      call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1)
+      call compute_r2_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r2%f,max_r2)
     else
-      call compute_H_oo(nO,nV,t1,t2,tau,H_oo)
-      call compute_H_vv(nO,nV,t1,t2,tau,H_vv)
-      call compute_H_vo(nO,nV,t1,t2,H_vo)
+      call compute_H_oo(nO,nV,t1%f,t2%f,tau%f,H_oo%f)
+      call compute_H_vv(nO,nV,t1%f,t2%f,tau%f,H_vv%f)
+      call compute_H_vo(nO,nV,t1%f,t2%f,H_vo%f)
 
-      call compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
-      call compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
+      call compute_r1_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r1%f,max_r1)
+      call compute_r2_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r2%f,max_r2)
     endif
     max_r = max(max_r1,max_r2)
 
     ! Update
     if (cc_update_method == 'diis') then
-      !call update_t_ccsd(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2)
-      !call update_t_ccsd_diis(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2)
-      call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1,r2,t1,t2,all_err,all_t)
+      call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1%f,r2%f,t1%f,t2%f,all_err,all_t)
 
     ! Standard update as T = T - Delta
     elseif (cc_update_method == 'none') then
-      call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1,t1)
-      call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2,t2)
+      call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1%f,t1%f)
+      call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2%f,t2%f)
     else
       print*,'Unkown cc_method_method: '//cc_update_method
     endif
 
-    call update_tau_space(nO,nV,t1,t2,tau)
+    call update_tau_space(nO,nV,t1%f,t1,t2,tau)
     call update_tau_x_space(nO,nV,tau,tau_x)
 
     ! Energy
-    call ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
+    call ccsd_energy_space_x(nO,nV,tau_x%f,t1%f,energy)
     write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |'
 
     nb_iter = nb_iter + 1
@@ -185,8 +191,8 @@ subroutine run_ccsd_space_orb
   print*,''
 
   if (write_amplitudes) then
-    call write_t1(nO,nV,t1)
-    call write_t2(nO,nV,t2)
+    call write_t1(nO,nV,t1%f)
+    call write_t2(nO,nV,t2%f)
     call ezfio_set_utils_cc_io_amplitudes('Read')
   endif
 
@@ -195,11 +201,14 @@ subroutine run_ccsd_space_orb
     deallocate(all_err,all_t)
   endif
 
-  call gpu_deallocate_double(H_oo)
-  call gpu_deallocate_double(H_vv)
-  call gpu_deallocate_double(H_vo)
+  call gpu_deallocate(H_oo)
+  call gpu_deallocate(H_vv)
+  call gpu_deallocate(H_vo)
 
-  deallocate(r1,r2,tau)
+  call gpu_deallocate(r1)
+  call gpu_deallocate(r2)
+  call gpu_deallocate(tau)
+  call gpu_deallocate(tau_x)
 
   ! CCSD(T)
   double precision :: e_t, e_t_err
@@ -207,28 +216,14 @@ subroutine run_ccsd_space_orb
 
   if (cc_par_t .and. elec_alpha_num + elec_beta_num > 2) then
 
-    ! Dumb way
-    !call wall_time(ta)
-    !call ccsd_par_t_space(nO,nV,t1,t2,e_t)
-    !call wall_time(tb)
-    !print*,'Time: ',tb-ta, ' s'
-
-    !print*,''
-    !write(*,'(A15,F18.12,A3)') ' E(CCSD(T))  = ', uncorr_energy + energy + e_t, ' Ha'
-    !write(*,'(A15,F18.12,A3)') ' E(T)        = ', e_t, ' Ha'
-    !write(*,'(A15,F18.12,A3)') ' Correlation = ', energy + e_t, ' Ha'
-    !print*,''
-
     ! New
     e_t = uncorr_energy + energy ! For print in (T) call
     e_t_err = 0.d0
 
     print*,'Computing (T) correction...'
     call wall_time(ta)
-!    call ccsd_par_t_space_v3(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v &
-!         ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t)
 
-    call ccsd_par_t_space_stoch(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v &
+    call ccsd_par_t_space_stoch(nO,nV,t1%f,t2%f,cc_space_f_o,cc_space_f_v &
          ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t, e_t_err)
 
     call wall_time(tb)
@@ -243,7 +238,9 @@ subroutine run_ccsd_space_orb
 
   call save_energy(uncorr_energy + energy, e_t)
 
-  deallocate(t1,t2)
+  deallocate(h_t1, h_t2)
+  call gpu_deallocate(t1)
+  call gpu_deallocate(t2)
 
 end
 
@@ -341,70 +338,139 @@ end
 
 ! Tau
 
-subroutine update_tau_space(nO,nV,t1,t2,tau)
-
+subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau)
+  use gpu
   implicit none
 
   ! in
   integer, intent(in)           :: nO, nV
-  double precision, intent(in)  :: t1(nO,nV), t2(nO,nO,nV,nV)
+  double precision, intent(in)  :: h_t1(nO,nV)
+  type(gpu_double2), intent(in) :: t1
+  type(gpu_double4), intent(in) :: t2
 
   ! out
-  double precision, intent(out) :: tau(nO,nO,nV,nV)
+  type(gpu_double4) :: tau
 
   ! internal
   integer                       :: i,j,a,b
 
+!  !$OMP PARALLEL &
+!  !$OMP SHARED(nO,nV,tau,t2,t1,h_t1) &
+!  !$OMP PRIVATE(i,j,a,b) &
+!  !$OMP DEFAULT(NONE)
+!  !$OMP DO
+!  do b = 1, nV
+!    do a = 1, nV
+!      do j = 1, nO
+!        do i = 1, nO
+!          tau%f(i,j,a,b) = t2%f(i,j,a,b) + t1%f(i,a) * h_t1(j,b)
+!        enddo
+!      enddo
+!    enddo
+!  enddo
+!  !$OMP END DO
+!  !$OMP END PARALLEL
+
+
+  type(gpu_blas) :: blas
+  type(gpu_stream) :: stream(nV)
+
+  call gpu_blas_create(blas)
+  do b=1,nV
+    call gpu_stream_create(stream(b))
+  enddo
+
   !$OMP PARALLEL &
-  !$OMP SHARED(nO,nV,tau,t2,t1) &
+  !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas) &
   !$OMP PRIVATE(i,j,a,b) &
   !$OMP DEFAULT(NONE)
-  !$OMP DO
-  do b = 1, nV
-    do a = 1, nV
-      do j = 1, nO
-        do i = 1, nO
-          tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b)
-        enddo
-      enddo
+  do j=1,nO
+    !$OMP DO
+    do b=1,nV
+      call gpu_set_stream(blas,stream(b))
+      call gpu_dgeam_c(blas%c, 'N', 'N', nO*1_8, nV*1_8, &
+         1.d0, c_loc(t2%f(1,j,1,b)), nO*nO*1_8, &
+         h_t1(j,b), t1%c, nO*1_8, &
+         c_loc(tau%f(1,j,1,b)), nO*nO*1_8)
     enddo
+    !$OMP END DO
   enddo
-  !$OMP END DO
   !$OMP END PARALLEL
 
+  call gpu_synchronize()
+
+  do b=1,nV
+    call gpu_stream_destroy(stream(b))
+  enddo
+
+  call gpu_blas_destroy(blas)
+
 end
 
 subroutine update_tau_x_space(nO,nV,tau,tau_x)
-
+  use gpu
   implicit none
 
   ! in
-  integer, intent(in)           :: nO, nV
-  double precision, intent(in)  :: tau(nO,nO,nV,nV)
+  integer, intent(in)         :: nO, nV
+  type(gpu_double4), intent(in)  :: tau
 
   ! out
-  double precision, intent(out) :: tau_x(nO,nO,nV,nV)
+  type(gpu_double4) :: tau_x
 
   ! internal
   integer                       :: i,j,a,b
 
+!  !$OMP PARALLEL &
+!  !$OMP SHARED(nO,nV,tau,tau_x) &
+!  !$OMP PRIVATE(i,j,a,b) &
+!  !$OMP DEFAULT(NONE)
+!  !$OMP DO
+!  do b = 1, nV
+!    do a = 1, nV
+!      do j = 1, nO
+!        do i = 1, nO
+!          tau_x%f(i,j,a,b) = 2.d0*tau%f(i,j,a,b) - tau%f(i,j,b,a)
+!        enddo
+!      enddo
+!    enddo
+!  enddo
+!  !$OMP END DO
+!  !$OMP END PARALLEL
+
+  type(gpu_blas) :: blas
+  type(gpu_stream) :: stream(nV)
+
+  call gpu_blas_create(blas)
+  do a=1,nV
+    call gpu_stream_create(stream(a))
+  enddo
+
   !$OMP PARALLEL &
-  !$OMP SHARED(nO,nV,tau,tau_x) &
+  !$OMP SHARED(nO,nV,tau,tau_x,stream,blas) &
   !$OMP PRIVATE(i,j,a,b) &
   !$OMP DEFAULT(NONE)
   !$OMP DO
-  do b = 1, nV
-    do a = 1, nV
-      do j = 1, nO
-        do i = 1, nO
-          tau_x(i,j,a,b) = 2.d0*tau(i,j,a,b) - tau(i,j,b,a)
-        enddo
-      enddo
+  do b=1,nV
+    do a=1,nV
+      call gpu_set_stream(blas,stream(a))
+      call gpu_dgeam_c(blas%c, 'N', 'N', nO*1_8, nO*1_8, &
+          2.d0, c_loc(tau%f(1,1,a,b)), nO*1_8, &
+         -1.d0, c_loc(tau%f(1,1,b,a)), nO*1_8, &
+         c_loc(tau_x%f(1,1,a,b)), nO*1_8)
     enddo
   enddo
   !$OMP END DO
   !$OMP END PARALLEL
 
+  call gpu_synchronize()
+
+  do b=1,nV
+    call gpu_stream_destroy(stream(b))
+  enddo
+
+  call gpu_blas_destroy(blas)
+
 end
 
 ! R1
diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
index b59dc0bb..9b161001 100644
--- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
@@ -294,12 +294,12 @@ end
 ! H_oo
 
 subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo)
-
+  use gpu
   implicit none
 
   integer, intent(in)           :: nO,nV
-  double precision, intent(in)  :: tau_x(nO, nO, nV, nV)
-  double precision, intent(out) :: H_oo(nO, nO)
+  type(gpu_double4), intent(in)    :: tau_x
+  type(gpu_double2), intent(out)   :: H_oo
 
   integer :: a,b,i,j,u,k
 
@@ -315,7 +315,7 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo)
     do b=1,nV
       do j=1,nO
         do a=1,nV
-          tmp_vov(a,j,b) = tau_x(u,j,a,b)
+          tmp_vov(a,j,b) = tau_x%f(u,j,a,b)
         enddo
       enddo
     enddo
@@ -328,7 +328,7 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo)
   !$omp do
   do i = 1, nO
     do u = 1, nO
-      H_oo(u,i) = cc_space_f_oo(u,i)
+      H_oo%f(u,i) = cc_space_f_oo(u,i)
     enddo
   enddo
   !$omp end do nowait
@@ -336,7 +336,7 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo)
   !$omp end  parallel
   call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
     tau_kau, cholesky_mo_num*nV,  cc_space_v_vo_chol, cholesky_mo_num*nV, &
-    1.d0, H_oo, nO)
+    1.d0, H_oo%f, nO)
 
 end
 
diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90
index 43754454..51f80ac0 100644
--- a/src/gpu/gpu_module.F90
+++ b/src/gpu/gpu_module.F90
@@ -2,6 +2,52 @@ module gpu
   use, intrinsic :: iso_c_binding
   implicit none
 
+! Data types
+! ----------
+
+  type gpu_double1
+    type(c_ptr) :: c
+    double precision, pointer :: f(:)
+  end type
+
+  type gpu_double2
+    type(c_ptr) :: c
+    double precision, pointer :: f(:,:)
+  end type
+
+  type gpu_double3
+    type(c_ptr) :: c
+    double precision, pointer :: f(:,:,:)
+  end type
+
+  type gpu_double4
+    type(c_ptr) :: c
+    double precision, pointer :: f(:,:,:,:)
+  end type
+
+  type gpu_double5
+    type(c_ptr) :: c
+    double precision, pointer :: f(:,:,:,:,:)
+  end type
+
+  type gpu_double6
+    type(c_ptr) :: c
+    double precision, pointer :: f(:,:,:,:,:,:)
+  end type
+
+
+  type gpu_blas
+    type(c_ptr) :: c
+  end type
+
+  type gpu_stream
+    type(c_ptr) :: c
+  end type
+
+
+! C interfaces
+! ------------
+
   interface
     integer function gpu_ndevices() bind(C)
     end function
@@ -43,100 +89,394 @@ module gpu
       integer(c_int64_t), value :: n
     end subroutine
 
-    subroutine gpu_stream_create(stream) bind(C)
+    subroutine gpu_stream_create_c(stream) bind(C, name='gpu_stream_create')
       import
       type(c_ptr) :: stream
     end subroutine
 
-    subroutine gpu_stream_destroy(stream) bind(C)
+    subroutine gpu_stream_destroy_c(stream) bind(C, name='gpu_stream_destroy')
       import
       type(c_ptr) :: stream
     end subroutine
 
-    subroutine gpu_set_stream(handle, stream) bind(C)
+    subroutine gpu_set_stream_c(handle, stream) bind(C, name='gpu_set_stream')
       import
       type(c_ptr) :: handle, stream
     end subroutine
 
-    subroutine gpu_synchronize()
+    subroutine gpu_synchronize() bind(C)
+      import
     end subroutine
 
-    subroutine gpu_blas_create(handle) bind(C)
+    subroutine gpu_blas_create_c(handle) bind(C, name='gpu_blas_create')
       import
       type(c_ptr) :: handle
     end subroutine
 
-    subroutine gpu_blas_destroy(handle) bind(C)
+    subroutine gpu_blas_destroy_c(handle) bind(C, name='gpu_blas_destroy')
       import
       type(c_ptr) :: handle
     end subroutine
 
-    subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) bind(C)
+    subroutine gpu_ddot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_ddot')
       import
-      type(c_ptr), intent(in)     :: handle
-      integer(c_int64_t), value   :: n, incx, incy
-      real(c_double), intent(in)  :: dx(*), dy(*)
-      real(c_double), intent(out) :: res
+      type(c_ptr), intent(in), value :: handle
+      integer(c_int64_t), value      :: n, incx, incy
+      type(c_ptr), intent(in), value :: dx, dy
+      real(c_double), intent(out)    :: res
     end subroutine
 
-    subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res) bind(C)
+    subroutine gpu_sdot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_sdot')
       import
-      type(c_ptr), intent(in)     :: handle
-      integer(c_int64_t), value   :: n, incx, incy
-      real(c_float), intent(in)   :: dx(*), dy(*)
+      type(c_ptr), intent(in), value :: handle
+      integer(c_int64_t), value      :: n, incx, incy
+      type(c_ptr), intent(in), value :: dx, dy
       real(c_float), intent(out)  :: res
     end subroutine
 
+    subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
+      b, ldb, c, ldc) bind(C, name='gpu_dgeam')
+      import
+      type(c_ptr), intent(in), value :: handle
+      character(c_char), intent(in), value :: transa, transb
+      integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
+      real(c_double), intent(in), value :: alpha, beta
+      type(c_ptr), value :: a, b, c
+    end subroutine
+
   end interface
 
+
+! Polymorphic interfaces
+! ----------------------
+
+  interface gpu_allocate
+    procedure gpu_allocate_double1  &
+             ,gpu_allocate_double2  &
+             ,gpu_allocate_double3  &
+             ,gpu_allocate_double4  &
+             ,gpu_allocate_double5  &
+             ,gpu_allocate_double6
+  end interface gpu_allocate
+
+  interface gpu_deallocate
+    procedure gpu_deallocate_double1  &
+             ,gpu_deallocate_double2  &
+             ,gpu_deallocate_double3  &
+             ,gpu_deallocate_double4  &
+             ,gpu_deallocate_double5  &
+             ,gpu_deallocate_double6
+  end interface gpu_deallocate
+
+  interface gpu_upload
+    procedure gpu_upload_double1  &
+             ,gpu_upload_double2  &
+             ,gpu_upload_double3  &
+             ,gpu_upload_double4  &
+             ,gpu_upload_double5  &
+             ,gpu_upload_double6
+  end interface gpu_upload
+
+  interface gpu_download
+    procedure gpu_download_double1  &
+             ,gpu_download_double2  &
+             ,gpu_download_double3  &
+             ,gpu_download_double4  &
+             ,gpu_download_double5  &
+             ,gpu_download_double6
+  end interface gpu_download
+
+  interface gpu_copy
+    procedure gpu_copy_double1  &
+             ,gpu_copy_double2  &
+             ,gpu_copy_double3  &
+             ,gpu_copy_double4  &
+             ,gpu_copy_double5  &
+             ,gpu_copy_double6
+  end interface gpu_copy
+
+
   contains
 
 
-    subroutine gpu_allocate_double(ptr, s)
-      implicit none
-      double precision, pointer, intent(inout) :: ptr
-      integer, intent(in) :: s(:)
-      type(c_ptr) :: cptr
+! gpu_allocate
+! ------------
 
-      call gpu_allocate_c(cptr, sum(s*1_8)*8_8)
-      call c_f_pointer(cptr, ptr, s)
+    subroutine gpu_allocate_double1(ptr, s)
+      implicit none
+      type(gpu_double1), intent(inout) :: ptr
+      integer, intent(in) :: s
+
+      call gpu_allocate_c(ptr%c, s*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s /))
     end subroutine
 
-    subroutine gpu_deallocate_double(ptr)
+    subroutine gpu_allocate_double2(ptr, s1, s2)
       implicit none
-      double precision, pointer, intent(inout) :: ptr
-      type(c_ptr) :: cptr
-      cptr = c_loc(ptr)
-      call gpu_deallocate_c(cptr)
-      NULLIFY(ptr)
+      type(gpu_double2), intent(inout) :: ptr
+      integer, intent(in) :: s1, s2
+
+      call gpu_allocate_c(ptr%c, s1*s2*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /))
+    end subroutine
+
+    subroutine gpu_allocate_double3(ptr, s1, s2, s3)
+      implicit none
+      type(gpu_double3), intent(inout) :: ptr
+      integer, intent(in) :: s1, s2, s3
+
+      call gpu_allocate_c(ptr%c, s1*s2*s3*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /))
+    end subroutine
+
+    subroutine gpu_allocate_double4(ptr, s1, s2, s3, s4)
+      implicit none
+      type(gpu_double4), intent(inout) :: ptr
+      integer, intent(in) :: s1, s2, s3, s4
+
+      call gpu_allocate_c(ptr%c, s1*s2*s3*s4*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /))
+    end subroutine
+
+    subroutine gpu_allocate_double5(ptr, s1, s2, s3, s4, s5)
+      implicit none
+      type(gpu_double5), intent(inout) :: ptr
+      integer, intent(in) :: s1, s2, s3, s4, s5
+
+      call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /))
+    end subroutine
+
+    subroutine gpu_allocate_double6(ptr, s1, s2, s3, s4, s5, s6)
+      implicit none
+      type(gpu_double6), intent(inout) :: ptr
+      integer, intent(in) :: s1, s2, s3, s4, s5, s6
+
+      call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*s6*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /))
+    end subroutine
+
+
+! gpu_deallocate
+! --------------
+
+    subroutine gpu_deallocate_double1(ptr)
+      implicit none
+      type(gpu_double1), intent(inout) :: ptr
+      call gpu_deallocate_c(ptr%c)
+      NULLIFY(ptr%f)
+    end subroutine
+
+    subroutine gpu_deallocate_double2(ptr)
+      implicit none
+      type(gpu_double2), intent(inout) :: ptr
+      call gpu_deallocate_c(ptr%c)
+      NULLIFY(ptr%f)
+    end subroutine
+
+    subroutine gpu_deallocate_double3(ptr)
+      implicit none
+      type(gpu_double3), intent(inout) :: ptr
+      call gpu_deallocate_c(ptr%c)
+      NULLIFY(ptr%f)
+    end subroutine
+
+    subroutine gpu_deallocate_double4(ptr)
+      implicit none
+      type(gpu_double4), intent(inout) :: ptr
+      call gpu_deallocate_c(ptr%c)
+      NULLIFY(ptr%f)
+    end subroutine
+
+    subroutine gpu_deallocate_double5(ptr)
+      implicit none
+      type(gpu_double5), intent(inout) :: ptr
+      call gpu_deallocate_c(ptr%c)
+      NULLIFY(ptr%f)
+    end subroutine
+
+    subroutine gpu_deallocate_double6(ptr)
+      implicit none
+      type(gpu_double6), intent(inout) :: ptr
+      call gpu_deallocate_c(ptr%c)
+      NULLIFY(ptr%f)
+    end subroutine
+
+
+! gpu_upload
+! ----------
+
+    subroutine gpu_upload_double1(cpu_ptr, gpu_ptr)
+      implicit none
+      double precision, intent(in)     :: cpu_ptr(:)
+      type(gpu_double1), intent(in)    :: gpu_ptr
+      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, 8_8*size(gpu_ptr%f))
+    end subroutine
+
+    subroutine gpu_upload_double2(cpu_ptr, gpu_ptr)
+      implicit none
+      double precision, intent(in)     :: cpu_ptr(:,:)
+      type(gpu_double2), intent(in)    :: gpu_ptr
+      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
+    end subroutine
+
+    subroutine gpu_upload_double3(cpu_ptr, gpu_ptr)
+      implicit none
+      double precision, intent(in)     :: cpu_ptr(:,:,:)
+      type(gpu_double3), intent(in)    :: gpu_ptr
+      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
+    end subroutine
+
+    subroutine gpu_upload_double4(cpu_ptr, gpu_ptr)
+      implicit none
+      double precision, intent(in)     :: cpu_ptr(:,:,:,:)
+      type(gpu_double4), intent(in)    :: gpu_ptr
+      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
+    end subroutine
+
+    subroutine gpu_upload_double5(cpu_ptr, gpu_ptr)
+      implicit none
+      double precision, intent(in)     :: cpu_ptr(:,:,:,:,:)
+      type(gpu_double5), intent(in)    :: gpu_ptr
+      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
+    end subroutine
+
+    subroutine gpu_upload_double6(cpu_ptr, gpu_ptr)
+      implicit none
+      double precision, intent(in)     :: cpu_ptr(:,:,:,:,:,:)
+      type(gpu_double6), intent(in)    :: gpu_ptr
+      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
+    end subroutine
+
+
+! gpu_download
+! ------------
+
+    subroutine gpu_download_double1(gpu_ptr, cpu_ptr)
+      implicit none
+      type(gpu_double1), intent(in)  :: gpu_ptr
+      double precision, intent(in)   :: cpu_ptr(:)
+      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*size(gpu_ptr%f))
+    end subroutine
+
+    subroutine gpu_download_double2(gpu_ptr, cpu_ptr)
+      implicit none
+      type(gpu_double2), intent(in)  :: gpu_ptr
+      double precision, intent(in)   :: cpu_ptr(:,:)
+      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
+    end subroutine
+
+    subroutine gpu_download_double3(gpu_ptr, cpu_ptr)
+      implicit none
+      type(gpu_double3), intent(in)  :: gpu_ptr
+      double precision, intent(in)   :: cpu_ptr(:,:,:)
+      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
+    end subroutine
+
+    subroutine gpu_download_double4(gpu_ptr, cpu_ptr)
+      implicit none
+      type(gpu_double4), intent(in)  :: gpu_ptr
+      double precision, intent(in)   :: cpu_ptr(:,:,:,:)
+      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
+    end subroutine
+
+    subroutine gpu_download_double5(gpu_ptr, cpu_ptr)
+      implicit none
+      type(gpu_double5), intent(in)  :: gpu_ptr
+      double precision, intent(in)   :: cpu_ptr(:,:,:,:,:)
+      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
+    end subroutine
+
+    subroutine gpu_download_double6(gpu_ptr, cpu_ptr)
+      implicit none
+      type(gpu_double6), intent(in)  :: gpu_ptr
+      double precision, intent(in)   :: cpu_ptr(:,:,:,:,:,:)
+      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
+    end subroutine
+
+! gpu_copy
+! --------
+
+    subroutine gpu_copy_double1(gpu_ptr_src, gpu_ptr_dest)
+      implicit none
+      type(gpu_double1), intent(in)        :: gpu_ptr_src
+      type(gpu_double1), intent(in)        :: gpu_ptr_dest
+      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*size(gpu_ptr_dest%f))
+    end subroutine
+
+    subroutine gpu_copy_double2(gpu_ptr_src, gpu_ptr_dest)
+      implicit none
+      type(gpu_double2), intent(in)        :: gpu_ptr_src
+      type(gpu_double2), intent(in)        :: gpu_ptr_dest
+      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
+    end subroutine
+
+    subroutine gpu_copy_double3(gpu_ptr_src, gpu_ptr_dest)
+      implicit none
+      type(gpu_double3), intent(in)        :: gpu_ptr_src
+      type(gpu_double3), intent(in)        :: gpu_ptr_dest
+      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
+    end subroutine
+
+    subroutine gpu_copy_double4(gpu_ptr_src, gpu_ptr_dest)
+      implicit none
+      type(gpu_double4), intent(in)        :: gpu_ptr_src
+      type(gpu_double4), intent(in)        :: gpu_ptr_dest
+      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
+    end subroutine
+
+    subroutine gpu_copy_double5(gpu_ptr_src, gpu_ptr_dest)
+      implicit none
+      type(gpu_double5), intent(in)        :: gpu_ptr_src
+      type(gpu_double5), intent(in)        :: gpu_ptr_dest
+      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
+    end subroutine
+
+    subroutine gpu_copy_double6(gpu_ptr_src, gpu_ptr_dest)
+      implicit none
+      type(gpu_double6), intent(in)        :: gpu_ptr_src
+      type(gpu_double6), intent(in)        :: gpu_ptr_dest
+      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
+    end subroutine
+
+
+! gpu_stream
+! ----------
+
+    subroutine gpu_stream_create(stream)
+      import
+      type(gpu_stream) :: stream
+      call gpu_stream_create_c(stream%c)
+    end subroutine
+
+    subroutine gpu_stream_destroy(stream)
+      import
+      type(gpu_stream) :: stream
+      call gpu_stream_destroy_c(stream%c)
+    end subroutine
+
+    subroutine gpu_set_stream(handle, stream)
+      import
+      type(gpu_blas)   :: handle
+      type(gpu_stream) :: stream
+      call gpu_set_stream_c(handle%c, stream%c)
+    end subroutine
+
+
+! gpu_blas
+! --------
+
+    subroutine gpu_blas_create(handle)
+      import
+      type(gpu_blas) :: handle
+      call gpu_blas_create_c(handle%c)
+    end subroutine
+
+    subroutine gpu_blas_destroy(handle)
+      import
+      type(gpu_blas) :: handle
+      call gpu_blas_destroy_c(handle%c)
     end subroutine
 
 end module
 
-subroutine gpu_upload_double(cpu_ptr, gpu_ptr, n)
-  use gpu
-  implicit none
-  double precision, intent(in)   :: cpu_ptr(*)
-  double precision, intent(in)   :: gpu_ptr(*)
-  integer(c_int64_t), intent(in) :: n
-  call gpu_upload_c(c_loc(cpu_ptr), c_loc(gpu_ptr), 8_8*n)
-end subroutine
-
-subroutine gpu_download_double(gpu_ptr, cpu_ptr, n)
-  use gpu
-  implicit none
-  double precision, intent(in)   :: gpu_ptr(*)
-  double precision, intent(in)   :: cpu_ptr(*)
-  integer(c_int64_t), intent(in) :: n
-  call gpu_download_c(c_loc(gpu_ptr), c_loc(cpu_ptr), 8_8*n)
-end subroutine
-
-subroutine gpu_copy_double(gpu_ptr_src, gpu_ptr_dest, n)
-  use gpu
-  implicit none
-  double precision, intent(in)   :: gpu_ptr_src(*)
-  double precision, intent(in)   :: gpu_ptr_dest(*)
-  integer(c_int64_t), intent(in) :: n
-  call gpu_copy_c(c_loc(gpu_ptr_src), c_loc(gpu_ptr_dest), 8_8*n)
-end subroutine
-
diff --git a/src/gpu_x86/gpu.c b/src/gpu_x86/gpu.c
index 41ede396..5f42cb0d 100644
--- a/src/gpu_x86/gpu.c
+++ b/src/gpu_x86/gpu.c
@@ -251,7 +251,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
      if (alpha == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = beta * b[j*ldb+i];
          }
        }
@@ -259,7 +259,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
      } else if (beta == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = alpha * a[j*lda+i];
          }
        }
@@ -267,7 +267,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
      } else {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[j*ldb+i];
          }
        }
@@ -282,7 +282,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
      if (alpha == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = beta * b[i*ldb+j];
          }
        }
@@ -290,7 +290,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
      } else if (beta == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = alpha * a[j*lda+i];
          }
        }
@@ -298,7 +298,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
      } else {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[i*ldb+j];
          }
        }
@@ -313,7 +313,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
      if (alpha == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = beta * b[j*ldb+i];
          }
        }
@@ -321,7 +321,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
      } else if (beta == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = alpha * a[i*lda+j];
          }
        }
@@ -329,7 +329,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
      } else {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[j*ldb+i];
          }
        }
@@ -344,7 +344,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
      if (alpha == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = beta * b[i*ldb+j];
          }
        }
@@ -352,7 +352,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
      } else if (beta == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = alpha * a[i*lda+j];
          }
        }
@@ -360,7 +360,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
      } else {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[i*ldb+j];
          }
        }
@@ -386,7 +386,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
      if (alpha == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = beta * b[j*ldb+i];
          }
        }
@@ -394,7 +394,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
      } else if (beta == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = alpha * a[j*lda+i];
          }
        }
@@ -402,7 +402,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
      } else {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[j*ldb+i];
          }
        }
@@ -417,7 +417,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
      if (alpha == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = beta * b[i*ldb+j];
          }
        }
@@ -425,7 +425,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
      } else if (beta == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = alpha * a[j*lda+i];
          }
        }
@@ -433,7 +433,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
      } else {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[i*ldb+j];
          }
        }
@@ -448,7 +448,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
      if (alpha == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = beta * b[j*ldb+i];
          }
        }
@@ -456,7 +456,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
      } else if (beta == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = alpha * a[i*lda+j];
          }
        }
@@ -464,7 +464,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
      } else {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[j*ldb+i];
          }
        }
@@ -479,7 +479,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
      if (alpha == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = beta * b[i*ldb+j];
          }
        }
@@ -487,7 +487,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
      } else if (beta == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = alpha * a[i*lda+j];
          }
        }
@@ -495,7 +495,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
      } else {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[i*ldb+j];
          }
        }

From caec5183841840a7933e7fa4cf46bcb0ff3f14d9 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 28 Jun 2024 11:00:58 +0200
Subject: [PATCH 11/19] Added Nvidia module

---
 src/ccsd/ccsd_space_orb_sub.irp.f |  10 +-
 src/gpu/gpu_module.F90            |   6 +-
 src/gpu_nvidia/LIB                |   1 +
 src/gpu_nvidia/NEED               |   1 +
 src/gpu_nvidia/README.rst         |   5 +
 src/gpu_nvidia/gpu.c              | 327 ++++++++++++++++++++++++++++++
 src/gpu_x86/gpu.c                 |  40 ++--
 7 files changed, 359 insertions(+), 31 deletions(-)
 create mode 100644 src/gpu_nvidia/LIB
 create mode 100644 src/gpu_nvidia/NEED
 create mode 100644 src/gpu_nvidia/README.rst
 create mode 100644 src/gpu_nvidia/gpu.c

diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f
index 455d62f7..e7c9b1ab 100644
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@@ -384,17 +384,17 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau)
   !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas) &
   !$OMP PRIVATE(i,j,a,b) &
   !$OMP DEFAULT(NONE)
-  do j=1,nO
-    !$OMP DO
-    do b=1,nV
-      call gpu_set_stream(blas,stream(b))
+  !$OMP DO
+  do b=1,nV
+    call gpu_set_stream(blas,stream(b))
+    do j=1,nO
       call gpu_dgeam_c(blas%c, 'N', 'N', nO*1_8, nV*1_8, &
          1.d0, c_loc(t2%f(1,j,1,b)), nO*nO*1_8, &
          h_t1(j,b), t1%c, nO*1_8, &
          c_loc(tau%f(1,j,1,b)), nO*nO*1_8)
     enddo
-    !$OMP END DO
   enddo
+  !$OMP END DO
   !$OMP END PARALLEL
 
   call gpu_synchronize()
diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90
index 51f80ac0..d1ddad4c 100644
--- a/src/gpu/gpu_module.F90
+++ b/src/gpu/gpu_module.F90
@@ -120,7 +120,7 @@ module gpu
 
     subroutine gpu_ddot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_ddot')
       import
-      type(c_ptr), intent(in), value :: handle
+      type(c_ptr), intent(in)        :: handle
       integer(c_int64_t), value      :: n, incx, incy
       type(c_ptr), intent(in), value :: dx, dy
       real(c_double), intent(out)    :: res
@@ -128,7 +128,7 @@ module gpu
 
     subroutine gpu_sdot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_sdot')
       import
-      type(c_ptr), intent(in), value :: handle
+      type(c_ptr), intent(in)        :: handle
       integer(c_int64_t), value      :: n, incx, incy
       type(c_ptr), intent(in), value :: dx, dy
       real(c_float), intent(out)  :: res
@@ -137,7 +137,7 @@ module gpu
     subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
       b, ldb, c, ldc) bind(C, name='gpu_dgeam')
       import
-      type(c_ptr), intent(in), value :: handle
+      type(c_ptr), intent(in)        :: handle
       character(c_char), intent(in), value :: transa, transb
       integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
       real(c_double), intent(in), value :: alpha, beta
diff --git a/src/gpu_nvidia/LIB b/src/gpu_nvidia/LIB
new file mode 100644
index 00000000..91f54e91
--- /dev/null
+++ b/src/gpu_nvidia/LIB
@@ -0,0 +1 @@
+-lcudart -lcublas -lcublasLt
diff --git a/src/gpu_nvidia/NEED b/src/gpu_nvidia/NEED
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/src/gpu_nvidia/NEED
@@ -0,0 +1 @@
+
diff --git a/src/gpu_nvidia/README.rst b/src/gpu_nvidia/README.rst
new file mode 100644
index 00000000..5dcfca92
--- /dev/null
+++ b/src/gpu_nvidia/README.rst
@@ -0,0 +1,5 @@
+==========
+gpu_nvidia
+==========
+
+Nvidia implementation of GPU routines. Uses CUDA and CUBLAS libraries.
diff --git a/src/gpu_nvidia/gpu.c b/src/gpu_nvidia/gpu.c
new file mode 100644
index 00000000..f0bd247a
--- /dev/null
+++ b/src/gpu_nvidia/gpu.c
@@ -0,0 +1,327 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+
+
+/* Generic functions */
+
+int gpu_ndevices() {
+  int ngpus;
+  cudaGetDeviceCount(&ngpus);
+  return ngpus;
+}
+
+void gpu_set_device(int32_t igpu) {
+  cudaSetDevice(igpu);
+}
+
+
+/* Allocation functions */
+
+void gpu_allocate(void** ptr, const int64_t size) {
+    size_t free, total;
+    cudaError_t rc = cudaMemGetInfo( &free, &total );
+    if (rc != cudaSuccess) {
+      free = INT64_MAX;
+    }
+
+    /* Use managed memory if it does not fit on the GPU */
+    if (size < free && size < total/2) {
+//      rc= cudaMalloc(ptr, size);
+      rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
+    } else {
+      rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
+    }
+    assert (rc == cudaSuccess);
+}
+
+void gpu_deallocate(void** ptr) {
+  assert (*ptr != NULL);
+  cudaFree(*ptr);
+  *ptr = NULL;
+}
+
+
+/* Memory transfer functions */
+
+void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
+  cudaMemcpy (gpu_ptr, cpu_ptr, n, cudaMemcpyHostToDevice);
+}
+
+void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
+  cudaMemcpy (cpu_ptr, gpu_ptr, n, cudaMemcpyDeviceToHost);
+}
+
+void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
+  cudaMemcpy (gpu_ptr_dest, gpu_ptr_src, n, cudaMemcpyDeviceToDevice);
+}
+
+
+/* Streams */
+
+void gpu_stream_create(void** ptr) {
+  cudaStream_t stream;
+  cudaError_t rc = cudaStreamCreate(&stream);
+  assert (rc == cudaSuccess);
+  *ptr = (void*) stream;
+}
+
+void gpu_stream_destroy(void** ptr) {
+  assert (*ptr != NULL);
+  cudaError_t rc = cudaStreamDestroy( (cudaStream_t) *ptr);
+  assert (rc == cudaSuccess);
+  *ptr = NULL;
+}
+
+void gpu_set_stream(void** handle, void** stream) {
+  cublasSetStream( (cublasHandle_t) *handle, (cudaStream_t) *stream);
+}
+
+void gpu_synchronize() {
+  cudaDeviceSynchronize();
+}
+
+
+/* BLAS functions */
+
+void gpu_blas_create(void** handle) {
+  cublasHandle_t cublas_handle;
+  cublasStatus_t rc = cublasCreate(&cublas_handle);
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+  *handle = (void*) cublas_handle;
+}
+
+
+void gpu_blas_destroy(void** handle) {
+  assert (*handle != NULL);
+  cublasStatus_t rc = cublasDestroy( (cublasHandle_t) *handle);
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+  *handle = NULL;
+}
+
+
+void gpu_ddot(void** handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
+  assert (*handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t n_, incx_, incy_;
+
+  n_    = (int32_t) n;
+  incx_ = (int32_t) incx;
+  incy_ = (int32_t) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  cublasDdot((cublasHandle_t) *handle, n_, x, incx_, y, incy_, result);
+}
+
+
+
+void gpu_sdot(void** handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
+  assert (*handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t n_, incx_, incy_;
+
+  n_    = (int32_t) n;
+  incx_ = (int32_t) incx;
+  incy_ = (int32_t) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  cublasSdot((cublasHandle_t) *handle, n_, x, incx_, y, incy_, result);
+}
+
+
+
+void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t n, const double alpha,
+               const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) {
+
+  assert (*handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, lda_, incx_, incy_;
+
+  m_    = (int32_t) m;
+  n_    = (int32_t) n;
+  lda_  = (int32_t) lda;
+  incx_ = (int32_t) incx;
+  incy_ = (int32_t) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    m_ == m   );
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t)  lda_ == lda );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
+
+  cublasDgemv((cublasHandle_t) *handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_);
+}
+
+
+
+void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t n, const float alpha,
+               const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) {
+
+  assert (*handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, lda_, incx_, incy_;
+
+  m_    = (int32_t) m;
+  n_    = (int32_t) n;
+  lda_  = (int32_t) lda;
+  incx_ = (int32_t) incx;
+  incy_ = (int32_t) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    m_ == m   );
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t)  lda_ == lda );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
+
+  cublasSgemv((cublasHandle_t) *handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_);
+}
+
+
+void gpu_dgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
+               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) {
+
+  assert (*handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, k_, lda_, ldb_, ldc_;
+
+  m_   = (int32_t) m;
+  n_   = (int32_t) n;
+  k_   = (int32_t) k;
+  lda_ = (int32_t) lda;
+  ldb_ = (int32_t) ldb;
+  ldc_ = (int32_t) ldc;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)   m_ == m  );
+  assert ( (int64_t)   n_ == n  );
+  assert ( (int64_t)   k_ == k  );
+  assert ( (int64_t) lda_ == lda);
+  assert ( (int64_t) ldb_ == ldb);
+  assert ( (int64_t) ldc_ == ldc);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  cublasOperation_t transb_ = CUBLAS_OP_N;
+  if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
+  if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
+
+  cublasDgemm((cublasHandle_t) *handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_);
+}
+
+
+
+void gpu_sgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
+               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) {
+
+  assert (*handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, k_, lda_, ldb_, ldc_;
+
+  m_   = (int32_t) m;
+  n_   = (int32_t) n;
+  k_   = (int32_t) k;
+  lda_ = (int32_t) lda;
+  ldb_ = (int32_t) ldb;
+  ldc_ = (int32_t) ldc;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)   m_ == m  );
+  assert ( (int64_t)   n_ == n  );
+  assert ( (int64_t)   k_ == k  );
+  assert ( (int64_t) lda_ == lda);
+  assert ( (int64_t) ldb_ == ldb);
+  assert ( (int64_t) ldc_ == ldc);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  cublasOperation_t transb_ = CUBLAS_OP_N;
+  if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
+  if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
+
+  cublasSgemm((cublasHandle_t) *handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_);
+}
+
+
+void gpu_dgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
+               const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
+  assert (*handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, lda_, ldb_, ldc_;
+
+  m_   = (int32_t) m;
+  n_   = (int32_t) n;
+  lda_ = (int32_t) lda;
+  ldb_ = (int32_t) ldb;
+  ldc_ = (int32_t) ldc;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)   m_ == m  );
+  assert ( (int64_t)   n_ == n  );
+  assert ( (int64_t) lda_ == lda);
+  assert ( (int64_t) ldb_ == ldb);
+  assert ( (int64_t) ldc_ == ldc);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  cublasOperation_t transb_ = CUBLAS_OP_N;
+  if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
+  if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
+
+  cublasDgeam((cublasHandle_t) *handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_);
+
+}
+
+
+void gpu_sgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
+               const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
+  assert (*handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, lda_, ldb_, ldc_;
+
+  m_   = (int32_t) m;
+  n_   = (int32_t) n;
+  lda_ = (int32_t) lda;
+  ldb_ = (int32_t) ldb;
+  ldc_ = (int32_t) ldc;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)   m_ == m  );
+  assert ( (int64_t)   n_ == n  );
+  assert ( (int64_t) lda_ == lda);
+  assert ( (int64_t) ldb_ == ldb);
+  assert ( (int64_t) ldc_ == ldc);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  cublasOperation_t transb_ = CUBLAS_OP_N;
+  if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
+  if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
+
+  cublasSgeam((cublasHandle_t) *handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_);
+
+}
diff --git a/src/gpu_x86/gpu.c b/src/gpu_x86/gpu.c
index 5f42cb0d..ac7c3620 100644
--- a/src/gpu_x86/gpu.c
+++ b/src/gpu_x86/gpu.c
@@ -56,7 +56,7 @@ void gpu_stream_destroy(void** ptr) {
   *ptr = NULL;
 }
 
-void gpu_set_stream(void* handle, void* stream) {
+void gpu_set_stream(void** handle, void** stream) {
   return;
 }
 
@@ -79,8 +79,8 @@ void gpu_blas_destroy(void** handle) {
 
 double ddot_(const int32_t* n, const double* x, const int32_t* incx, const double* y, const int32_t* incy);
 
-void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
-  assert (handle != NULL);
+void gpu_ddot(void** handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
+  assert (*handle != NULL);
 
   /* Convert to int32_t */
   int32_t n_, incx_, incy_;
@@ -100,8 +100,8 @@ void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_
 
 float sdot_(const int32_t* n, const float* x, const int32_t* incx, const float* y, const int32_t* incy);
 
-void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
-  assert (handle != NULL);
+void gpu_sdot(void** handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
+  assert (*handle != NULL);
 
   /* Convert to int32_t */
   int32_t n_, incx_, incy_;
@@ -122,10 +122,10 @@ void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t
 void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha,
             const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy);
 
-void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double alpha,
+void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t n, const double alpha,
                const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) {
 
-  assert (handle != NULL);
+  assert (*handle != NULL);
 
   /* Convert to int32_t */
   int32_t m_, n_, lda_, incx_, incy_;
@@ -150,10 +150,10 @@ void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int
 void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha,
                const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy);
 
-void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float alpha,
+void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t n, const float alpha,
                const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) {
 
-  assert (handle != NULL);
+  assert (*handle != NULL);
 
   /* Convert to int32_t */
   int32_t m_, n_, lda_, incx_, incy_;
@@ -178,10 +178,10 @@ void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int
 void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha,
             const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc);
 
-void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
+void gpu_dgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
                const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) {
 
-  assert (handle != NULL);
+  assert (*handle != NULL);
 
   /* Convert to int32_t */
   int32_t m_, n_, k_, lda_, ldb_, ldc_;
@@ -209,10 +209,10 @@ void gpu_dgemm(const void* handle, const char transa, const char transb, const i
 void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha,
             const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc);
 
-void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
+void gpu_sgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
                const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) {
 
-  assert (handle != NULL);
+  assert (*handle != NULL);
 
   /* Convert to int32_t */
   int32_t m_, n_, k_, lda_, ldb_, ldc_;
@@ -236,12 +236,9 @@ void gpu_sgemm(const void* handle, const char transa, const char transb, const i
 }
 
 
-void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
+void gpu_dgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
                const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
-  if (handle == NULL) {
-    perror("NULL handle");
-    exit(-1);
-  }
+  assert (*handle != NULL);
 
   if ( (transa == 'N' && transb == 'N') ||
        (transa == 'n' && transb == 'N') ||
@@ -371,12 +368,9 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
 }
 
 
-void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
+void gpu_sgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
                const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
-  if (handle == NULL) {
-    perror("NULL handle");
-    exit(-1);
-  }
+  assert (*handle != NULL);
 
   if ( (transa == 'N' && transb == 'N') ||
        (transa == 'n' && transb == 'N') ||

From d3d89022bc8092ab0c6131904f85475f160dfa53 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 28 Jun 2024 16:50:52 +0200
Subject: [PATCH 12/19] Move GPU modules to plugins

---
 configure                                    | 6 +++---
 {src => plugins/local}/gpu_nvidia/LIB        | 0
 {src => plugins/local}/gpu_nvidia/NEED       | 0
 {src => plugins/local}/gpu_nvidia/README.rst | 0
 {src => plugins/local}/gpu_nvidia/gpu.c      | 0
 {src => plugins/local}/gpu_x86/NEED          | 0
 {src => plugins/local}/gpu_x86/README.rst    | 0
 {src => plugins/local}/gpu_x86/gpu.c         | 0
 8 files changed, 3 insertions(+), 3 deletions(-)
 rename {src => plugins/local}/gpu_nvidia/LIB (100%)
 rename {src => plugins/local}/gpu_nvidia/NEED (100%)
 rename {src => plugins/local}/gpu_nvidia/README.rst (100%)
 rename {src => plugins/local}/gpu_nvidia/gpu.c (100%)
 rename {src => plugins/local}/gpu_x86/NEED (100%)
 rename {src => plugins/local}/gpu_x86/README.rst (100%)
 rename {src => plugins/local}/gpu_x86/gpu.c (100%)

diff --git a/configure b/configure
index db158966..08dac444 100755
--- a/configure
+++ b/configure
@@ -119,15 +119,15 @@ rm -f ${QP_ROOT}/src/gpu_arch
 case "$GPU" in
   amd) # Nvidia
      echo "Activating AMD GPU acceleration"
-     ln -s ${QP_ROOT}/src/gpu_amd ${QP_ROOT}/src/gpu_arch
+     ln -s ${QP_ROOT}/plugins/local/gpu_amd ${QP_ROOT}/src/gpu_arch
      ;;
   nvidia) # Nvidia
      echo "Activating Nvidia GPU acceleration"
-     ln -s ${QP_ROOT}/src/gpu_nvidia ${QP_ROOT}/src/gpu_arch
+     ln -s ${QP_ROOT}/plugins/local/gpu_nvidia ${QP_ROOT}/src/gpu_arch
      ;;
   *) # No Acceleration
      echo "Disabling GPU acceleration"
-     ln -s ${QP_ROOT}/src/gpu_x86 ${QP_ROOT}/src/gpu_arch
+     ln -s ${QP_ROOT}/plugins/local/gpu_x86 ${QP_ROOT}/src/gpu_arch
      ;;
 esac
 
diff --git a/src/gpu_nvidia/LIB b/plugins/local/gpu_nvidia/LIB
similarity index 100%
rename from src/gpu_nvidia/LIB
rename to plugins/local/gpu_nvidia/LIB
diff --git a/src/gpu_nvidia/NEED b/plugins/local/gpu_nvidia/NEED
similarity index 100%
rename from src/gpu_nvidia/NEED
rename to plugins/local/gpu_nvidia/NEED
diff --git a/src/gpu_nvidia/README.rst b/plugins/local/gpu_nvidia/README.rst
similarity index 100%
rename from src/gpu_nvidia/README.rst
rename to plugins/local/gpu_nvidia/README.rst
diff --git a/src/gpu_nvidia/gpu.c b/plugins/local/gpu_nvidia/gpu.c
similarity index 100%
rename from src/gpu_nvidia/gpu.c
rename to plugins/local/gpu_nvidia/gpu.c
diff --git a/src/gpu_x86/NEED b/plugins/local/gpu_x86/NEED
similarity index 100%
rename from src/gpu_x86/NEED
rename to plugins/local/gpu_x86/NEED
diff --git a/src/gpu_x86/README.rst b/plugins/local/gpu_x86/README.rst
similarity index 100%
rename from src/gpu_x86/README.rst
rename to plugins/local/gpu_x86/README.rst
diff --git a/src/gpu_x86/gpu.c b/plugins/local/gpu_x86/gpu.c
similarity index 100%
rename from src/gpu_x86/gpu.c
rename to plugins/local/gpu_x86/gpu.c

From 85b1035cfba778559e629045961cb542631841bd Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 28 Jun 2024 17:33:08 +0200
Subject: [PATCH 13/19] Working on CCSD

---
 src/ccsd/ccsd_space_orb_sub.irp.f | 117 +++++++++++-------------------
 src/gpu/gpu_module.F90            |  62 ++++++++++++++++
 2 files changed, 103 insertions(+), 76 deletions(-)

diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f
index e7c9b1ab..1329f172 100644
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@@ -14,6 +14,9 @@ subroutine run_ccsd_space_orb
   type(gpu_double2) :: t1, r1
   type(gpu_double2) :: H_oo, H_vv, H_vo
 
+  type(gpu_double2) :: d_cc_space_f_vo
+  type(gpu_double4) :: d_cc_space_v_oovv
+
   double precision, allocatable :: all_err(:,:), all_t(:,:)
   integer, allocatable          :: list_occ(:), list_vir(:)
   integer(bit_kind)             :: det(N_int,2)
@@ -52,6 +55,12 @@ subroutine run_ccsd_space_orb
   !print*,'occ',list_occ
   !print*,'vir',list_vir
 
+  call gpu_allocate(d_cc_space_f_vo, nV, nO)
+  call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV)
+  call gpu_upload(cc_space_f_vo, d_cc_space_f_vo)
+  call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv)
+
+
   call gpu_allocate(t2, nO,nO,nV,nV)
   call gpu_allocate(r2, nO,nO,nV,nV)
   call gpu_allocate(tau, nO,nO,nV,nV)
@@ -116,7 +125,8 @@ subroutine run_ccsd_space_orb
   !print*,'hf_energy', hf_energy
   call det_energy(det,uncorr_energy)
   print*,'Det energy', uncorr_energy
-  call ccsd_energy_space_x(nO,nV,tau_x%f,t1%f,energy)
+
+  call ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy)
   print*,'Guess energy', uncorr_energy+energy, energy
 
   nb_iter = 0
@@ -166,7 +176,7 @@ subroutine run_ccsd_space_orb
     call update_tau_x_space(nO,nV,tau,tau_x)
 
     ! Energy
-    call ccsd_energy_space_x(nO,nV,tau_x%f,t1%f,energy)
+    call ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy)
     write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |'
 
     nb_iter = nb_iter + 1
@@ -239,6 +249,8 @@ subroutine run_ccsd_space_orb
   call save_energy(uncorr_energy + energy, e_t)
 
   deallocate(h_t1, h_t2)
+  call gpu_deallocate(d_cc_space_f_vo)
+  call gpu_deallocate(d_cc_space_v_oovv)
   call gpu_deallocate(t1)
   call gpu_deallocate(t2)
 
@@ -246,59 +258,14 @@ end
 
 ! Energy
 
-subroutine ccsd_energy_space(nO,nV,tau,t1,energy)
-
+subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy)
+  use gpu
   implicit none
 
-  integer, intent(in)           :: nO, nV
-  double precision, intent(in)  :: tau(nO,nO,nV,nV)
-  double precision, intent(in)  :: t1(nO,nV)
-  double precision, intent(out) :: energy
-
-  ! internal
-  integer :: i,j,a,b
-  double precision :: e
-
-  energy = 0d0
-  !$omp parallel &
-  !$omp shared(nO,nV,energy,tau,t1,&
-  !$omp cc_space_f_vo,cc_space_w_oovv) &
-  !$omp private(i,j,a,b,e) &
-  !$omp default(none)
-  e = 0d0
-  !$omp do
-  do a = 1, nV
-    do i = 1, nO
-      e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a)
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp do
-  do b = 1, nV
-    do a = 1, nV
-      do j = 1, nO
-        do i = 1, nO
-          e = e + tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b)
-       enddo
-      enddo
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp critical
-  energy = energy + e
-  !$omp end critical
-  !$omp end parallel
-
-end
-
-subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
-
-  implicit none
-
-  integer, intent(in)           :: nO, nV
-  double precision, intent(in)  :: tau_x(nO,nO,nV,nV)
-  double precision, intent(in)  :: t1(nO,nV)
-  double precision, intent(out) :: energy
+  integer, intent(in)            :: nO, nV
+  type(gpu_double4), intent(in)  :: tau_x, d_cc_space_v_oovv
+  type(gpu_double2), intent(in)  :: t1, d_cc_space_f_vo
+  double precision, intent(out)  :: energy
 
   ! internal
   integer :: i,j,a,b
@@ -307,14 +274,14 @@ subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
   energy = 0d0
   !$omp parallel &
   !$omp shared(nO,nV,energy,tau_x,t1,&
-  !$omp cc_space_f_vo,cc_space_v_oovv) &
+  !$omp d_cc_space_f_vo,d_cc_space_v_oovv) &
   !$omp private(i,j,a,b,e) &
   !$omp default(none)
   e = 0d0
   !$omp do
   do a = 1, nV
     do i = 1, nO
-      e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a)
+      e = e + 2d0 * d_cc_space_f_vo%f(a,i) * t1%f(i,a)
     enddo
   enddo
   !$omp end do nowait
@@ -323,7 +290,7 @@ subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
     do a = 1, nV
       do j = 1, nO
         do i = 1, nO
-          e = e + tau_x(i,j,a,b) * cc_space_v_oovv(i,j,a,b)
+          e = e + tau_x%f(i,j,a,b) * d_cc_space_v_oovv%f(i,j,a,b)
        enddo
       enddo
     enddo
@@ -333,6 +300,12 @@ subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
   energy = energy + e
   !$omp end critical
   !$omp end parallel
+!
+!
+!   call gpu_ddot(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1, d_cc_space_v_oovv, 1, energy)
+!   call gpu_ddot(blas_handle, nO*nV*1_8, d_cc_space_f_vo, 1, t1, 1, e)
+!   energy = energy + 2.d0*e
+
 
 end
 
@@ -372,26 +345,24 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau)
 !  !$OMP END PARALLEL
 
 
-  type(gpu_blas) :: blas
   type(gpu_stream) :: stream(nV)
 
-  call gpu_blas_create(blas)
   do b=1,nV
     call gpu_stream_create(stream(b))
   enddo
 
   !$OMP PARALLEL &
-  !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas) &
+  !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) &
   !$OMP PRIVATE(i,j,a,b) &
   !$OMP DEFAULT(NONE)
   !$OMP DO
   do b=1,nV
-    call gpu_set_stream(blas,stream(b))
+    call gpu_set_stream(blas_handle,stream(b))
     do j=1,nO
-      call gpu_dgeam_c(blas%c, 'N', 'N', nO*1_8, nV*1_8, &
-         1.d0, c_loc(t2%f(1,j,1,b)), nO*nO*1_8, &
-         h_t1(j,b), t1%c, nO*1_8, &
-         c_loc(tau%f(1,j,1,b)), nO*nO*1_8)
+      call gpu_dgeam(blas_handle, 'N', 'N', nO*1_8, nV*1_8, &
+         1.d0, t2%f(1,j,1,b), nO*nO*1_8, &
+         h_t1(j,b), t1%f, nO*1_8, &
+         tau%f(1,j,1,b), nO*nO*1_8)
     enddo
   enddo
   !$OMP END DO
@@ -403,8 +374,6 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau)
     call gpu_stream_destroy(stream(b))
   enddo
 
-  call gpu_blas_destroy(blas)
-
 end
 
 subroutine update_tau_x_space(nO,nV,tau,tau_x)
@@ -438,26 +407,24 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x)
 !  !$OMP END DO
 !  !$OMP END PARALLEL
 
-  type(gpu_blas) :: blas
   type(gpu_stream) :: stream(nV)
 
-  call gpu_blas_create(blas)
   do a=1,nV
     call gpu_stream_create(stream(a))
   enddo
 
   !$OMP PARALLEL &
-  !$OMP SHARED(nO,nV,tau,tau_x,stream,blas) &
+  !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) &
   !$OMP PRIVATE(i,j,a,b) &
   !$OMP DEFAULT(NONE)
   !$OMP DO
   do b=1,nV
     do a=1,nV
-      call gpu_set_stream(blas,stream(a))
-      call gpu_dgeam_c(blas%c, 'N', 'N', nO*1_8, nO*1_8, &
-          2.d0, c_loc(tau%f(1,1,a,b)), nO*1_8, &
-         -1.d0, c_loc(tau%f(1,1,b,a)), nO*1_8, &
-         c_loc(tau_x%f(1,1,a,b)), nO*1_8)
+      call gpu_set_stream(blas_handle,stream(a))
+      call gpu_dgeam(blas_handle, 'N', 'N', nO*1_8, nO*1_8, &
+          2.d0, tau%f(1,1,a,b), nO*1_8, &
+         -1.d0, tau%f(1,1,b,a), nO*1_8, &
+         tau_x%f(1,1,a,b), nO*1_8)
     enddo
   enddo
   !$OMP END DO
@@ -469,8 +436,6 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x)
     call gpu_stream_destroy(stream(b))
   enddo
 
-  call gpu_blas_destroy(blas)
-
 end
 
 ! R1
diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90
index d1ddad4c..2057d1eb 100644
--- a/src/gpu/gpu_module.F90
+++ b/src/gpu/gpu_module.F90
@@ -144,6 +144,16 @@ module gpu
       type(c_ptr), value :: a, b, c
     end subroutine
 
+    subroutine gpu_sgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
+      b, ldb, c, ldc) bind(C, name='gpu_sgeam')
+      import
+      type(c_ptr), intent(in)        :: handle
+      character(c_char), intent(in), value :: transa, transb
+      integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
+      real(c_float), intent(in), value :: alpha, beta
+      type(c_ptr), value :: a, b, c
+    end subroutine
+
   end interface
 
 
@@ -478,5 +488,57 @@ module gpu
       call gpu_blas_destroy_c(handle%c)
     end subroutine
 
+
 end module
 
+
+
+! dot
+! ---
+
+subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res)
+  use gpu
+  type(gpu_blas), intent(in)     :: handle
+  integer*8                      :: n, incx, incy
+  double precision, intent(in)   :: dx(*), dy(*)
+  double precision, intent(out)    :: res
+  call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res)
+end subroutine
+
+subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res)
+  use gpu
+  type(gpu_blas), intent(in)     :: handle
+  integer*8                      :: n, incx, incy
+  real, intent(in)               :: dx(*), dy(*)
+  real, intent(out)              :: res
+  call gpu_sdot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res)
+end subroutine
+
+
+! geam
+! ----
+
+subroutine gpu_dgeam(handle, transa, transb, m, n, alpha, a, lda, beta, &
+  b, ldb, c, ldc)
+  use gpu
+  type(gpu_blas), intent(in)   :: handle
+  character, intent(in)        :: transa, transb
+  integer*8, intent(in)        :: m, n, lda, ldb, ldc
+  double precision, intent(in) :: alpha, beta
+  double precision             :: a(lda,*), b(ldb,*), c(ldc,*)
+  call gpu_dgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, &
+        c_loc(b), ldb, c_loc(c), ldc)
+end subroutine
+
+subroutine gpu_sgeam(handle, transa, transb, m, n, alpha, a, lda, beta, &
+  b, ldb, c, ldc)
+ use gpu 
+  type(gpu_blas), intent(in)   :: handle
+  character, intent(in)        :: transa, transb
+  integer*8, intent(in)        :: m, n, lda, ldb, ldc
+  real, intent(in)             :: alpha, beta
+  real                         :: a(lda,*), b(ldb,*), c(ldc,*)
+  call gpu_sgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, &
+        c_loc(b), ldb, c_loc(c), ldc)
+end subroutine
+

From a5f4f0516eec9f17438474529616368a6f9e5de4 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 28 Jun 2024 17:39:43 +0200
Subject: [PATCH 14/19] Fixing compile

---
 src/ccsd/ccsd_space_orb_sub.irp.f |  1 -
 src/gpu/gpu_module.F90            | 23 +++++++++--------------
 2 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f
index 1329f172..4e06e31d 100644
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@@ -306,7 +306,6 @@ subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,
 !   call gpu_ddot(blas_handle, nO*nV*1_8, d_cc_space_f_vo, 1, t1, 1, e)
 !   energy = energy + 2.d0*e
 
-
 end
 
 ! Tau
diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90
index 2057d1eb..d7c26ba6 100644
--- a/src/gpu/gpu_module.F90
+++ b/src/gpu/gpu_module.F90
@@ -365,42 +365,42 @@ module gpu
     subroutine gpu_download_double1(gpu_ptr, cpu_ptr)
       implicit none
       type(gpu_double1), intent(in)  :: gpu_ptr
-      double precision, intent(in)   :: cpu_ptr(:)
+      double precision, target, intent(in)   :: cpu_ptr(:)
       call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*size(gpu_ptr%f))
     end subroutine
 
     subroutine gpu_download_double2(gpu_ptr, cpu_ptr)
       implicit none
       type(gpu_double2), intent(in)  :: gpu_ptr
-      double precision, intent(in)   :: cpu_ptr(:,:)
+      double precision, target, intent(in)   :: cpu_ptr(:,:)
       call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
     end subroutine
 
     subroutine gpu_download_double3(gpu_ptr, cpu_ptr)
       implicit none
       type(gpu_double3), intent(in)  :: gpu_ptr
-      double precision, intent(in)   :: cpu_ptr(:,:,:)
+      double precision, target, intent(in)   :: cpu_ptr(:,:,:)
       call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
     end subroutine
 
     subroutine gpu_download_double4(gpu_ptr, cpu_ptr)
       implicit none
       type(gpu_double4), intent(in)  :: gpu_ptr
-      double precision, intent(in)   :: cpu_ptr(:,:,:,:)
+      double precision, target, intent(in)   :: cpu_ptr(:,:,:,:)
       call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
     end subroutine
 
     subroutine gpu_download_double5(gpu_ptr, cpu_ptr)
       implicit none
       type(gpu_double5), intent(in)  :: gpu_ptr
-      double precision, intent(in)   :: cpu_ptr(:,:,:,:,:)
+      double precision, target, intent(in)   :: cpu_ptr(:,:,:,:,:)
       call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
     end subroutine
 
     subroutine gpu_download_double6(gpu_ptr, cpu_ptr)
       implicit none
       type(gpu_double6), intent(in)  :: gpu_ptr
-      double precision, intent(in)   :: cpu_ptr(:,:,:,:,:,:)
+      double precision, target, intent(in)   :: cpu_ptr(:,:,:,:,:,:)
       call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
     end subroutine
 
@@ -454,19 +454,16 @@ module gpu
 ! ----------
 
     subroutine gpu_stream_create(stream)
-      import
       type(gpu_stream) :: stream
       call gpu_stream_create_c(stream%c)
     end subroutine
 
     subroutine gpu_stream_destroy(stream)
-      import
       type(gpu_stream) :: stream
       call gpu_stream_destroy_c(stream%c)
     end subroutine
 
     subroutine gpu_set_stream(handle, stream)
-      import
       type(gpu_blas)   :: handle
       type(gpu_stream) :: stream
       call gpu_set_stream_c(handle%c, stream%c)
@@ -477,13 +474,11 @@ module gpu
 ! --------
 
     subroutine gpu_blas_create(handle)
-      import
       type(gpu_blas) :: handle
       call gpu_blas_create_c(handle%c)
     end subroutine
 
     subroutine gpu_blas_destroy(handle)
-      import
       type(gpu_blas) :: handle
       call gpu_blas_destroy_c(handle%c)
     end subroutine
@@ -500,7 +495,7 @@ subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res)
   use gpu
   type(gpu_blas), intent(in)     :: handle
   integer*8                      :: n, incx, incy
-  double precision, intent(in)   :: dx(*), dy(*)
+  double precision, target, intent(in)   :: dx(*), dy(*)
   double precision, intent(out)    :: res
   call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res)
 end subroutine
@@ -525,7 +520,7 @@ subroutine gpu_dgeam(handle, transa, transb, m, n, alpha, a, lda, beta, &
   character, intent(in)        :: transa, transb
   integer*8, intent(in)        :: m, n, lda, ldb, ldc
   double precision, intent(in) :: alpha, beta
-  double precision             :: a(lda,*), b(ldb,*), c(ldc,*)
+  double precision, target     :: a(lda,*), b(ldb,*), c(ldc,*)
   call gpu_dgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, &
         c_loc(b), ldb, c_loc(c), ldc)
 end subroutine
@@ -537,7 +532,7 @@ subroutine gpu_sgeam(handle, transa, transb, m, n, alpha, a, lda, beta, &
   character, intent(in)        :: transa, transb
   integer*8, intent(in)        :: m, n, lda, ldb, ldc
   real, intent(in)             :: alpha, beta
-  real                         :: a(lda,*), b(ldb,*), c(ldc,*)
+  real, target                 :: a(lda,*), b(ldb,*), c(ldc,*)
   call gpu_sgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, &
         c_loc(b), ldb, c_loc(c), ldc)
 end subroutine

From c7df9a72cc68a7f5dfded36aa94ac50d5188a5a1 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 28 Jun 2024 21:32:04 +0200
Subject: [PATCH 15/19] Fixing again actions

---
 src/gpu/gpu_module.F90 | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90
index d7c26ba6..ecf79c83 100644
--- a/src/gpu/gpu_module.F90
+++ b/src/gpu/gpu_module.F90
@@ -318,42 +318,42 @@ module gpu
 
     subroutine gpu_upload_double1(cpu_ptr, gpu_ptr)
       implicit none
-      double precision, intent(in)     :: cpu_ptr(:)
+      double precision, target, intent(in)     :: cpu_ptr(*)
       type(gpu_double1), intent(in)    :: gpu_ptr
       call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, 8_8*size(gpu_ptr%f))
     end subroutine
 
     subroutine gpu_upload_double2(cpu_ptr, gpu_ptr)
       implicit none
-      double precision, intent(in)     :: cpu_ptr(:,:)
+      double precision, target, intent(in)     :: cpu_ptr(:,:)
       type(gpu_double2), intent(in)    :: gpu_ptr
       call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
     end subroutine
 
     subroutine gpu_upload_double3(cpu_ptr, gpu_ptr)
       implicit none
-      double precision, intent(in)     :: cpu_ptr(:,:,:)
+      double precision, target, intent(in)     :: cpu_ptr(:,:,:)
       type(gpu_double3), intent(in)    :: gpu_ptr
       call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
     end subroutine
 
     subroutine gpu_upload_double4(cpu_ptr, gpu_ptr)
       implicit none
-      double precision, intent(in)     :: cpu_ptr(:,:,:,:)
+      double precision, target, intent(in)     :: cpu_ptr(:,:,:,:)
       type(gpu_double4), intent(in)    :: gpu_ptr
       call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
     end subroutine
 
     subroutine gpu_upload_double5(cpu_ptr, gpu_ptr)
       implicit none
-      double precision, intent(in)     :: cpu_ptr(:,:,:,:,:)
+      double precision, target, intent(in)     :: cpu_ptr(:,:,:,:,:)
       type(gpu_double5), intent(in)    :: gpu_ptr
       call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
     end subroutine
 
     subroutine gpu_upload_double6(cpu_ptr, gpu_ptr)
       implicit none
-      double precision, intent(in)     :: cpu_ptr(:,:,:,:,:,:)
+      double precision, target, intent(in)     :: cpu_ptr(:,:,:,:,:,:)
       type(gpu_double6), intent(in)    :: gpu_ptr
       call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
     end subroutine
@@ -504,7 +504,7 @@ subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res)
   use gpu
   type(gpu_blas), intent(in)     :: handle
   integer*8                      :: n, incx, incy
-  real, intent(in)               :: dx(*), dy(*)
+  real, target, intent(in)       :: dx(*), dy(*)
   real, intent(out)              :: res
   call gpu_sdot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res)
 end subroutine

From b467bef6dd1e14c5914cc6508aa898d5f1665e3a Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 28 Jun 2024 21:37:14 +0200
Subject: [PATCH 16/19] Forgot file

---
 src/ccsd/ccsd_space_orb_sub.irp.f | 68 +++++++++++++++----------------
 src/gpu/gpu.irp.f                 | 11 +++++
 2 files changed, 45 insertions(+), 34 deletions(-)
 create mode 100644 src/gpu/gpu.irp.f

diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f
index 4e06e31d..5c2daa05 100644
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@@ -271,40 +271,40 @@ subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,
   integer :: i,j,a,b
   double precision :: e
 
-  energy = 0d0
-  !$omp parallel &
-  !$omp shared(nO,nV,energy,tau_x,t1,&
-  !$omp d_cc_space_f_vo,d_cc_space_v_oovv) &
-  !$omp private(i,j,a,b,e) &
-  !$omp default(none)
-  e = 0d0
-  !$omp do
-  do a = 1, nV
-    do i = 1, nO
-      e = e + 2d0 * d_cc_space_f_vo%f(a,i) * t1%f(i,a)
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp do
-  do b = 1, nV
-    do a = 1, nV
-      do j = 1, nO
-        do i = 1, nO
-          e = e + tau_x%f(i,j,a,b) * d_cc_space_v_oovv%f(i,j,a,b)
-       enddo
-      enddo
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp critical
-  energy = energy + e
-  !$omp end critical
-  !$omp end parallel
-!
-!
-!   call gpu_ddot(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1, d_cc_space_v_oovv, 1, energy)
-!   call gpu_ddot(blas_handle, nO*nV*1_8, d_cc_space_f_vo, 1, t1, 1, e)
-!   energy = energy + 2.d0*e
+!  energy = 0d0
+!  !$omp parallel &
+!  !$omp shared(nO,nV,energy,tau_x,t1,&
+!  !$omp d_cc_space_f_vo,d_cc_space_v_oovv) &
+!  !$omp private(i,j,a,b,e) &
+!  !$omp default(none)
+!  e = 0d0
+!  !$omp do
+!  do a = 1, nV
+!    do i = 1, nO
+!      e = e + 2d0 * d_cc_space_f_vo%f(a,i) * t1%f(i,a)
+!    enddo
+!  enddo
+!  !$omp end do nowait
+!  !$omp do
+!  do b = 1, nV
+!    do a = 1, nV
+!      do j = 1, nO
+!        do i = 1, nO
+!          e = e + tau_x%f(i,j,a,b) * d_cc_space_v_oovv%f(i,j,a,b)
+!       enddo
+!      enddo
+!    enddo
+!  enddo
+!  !$omp end do nowait
+!  !$omp critical
+!  energy = energy + e
+!  !$omp end critical
+!  !$omp end parallel
+
+
+   call gpu_ddot(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1, d_cc_space_v_oovv, 1, energy)
+   call gpu_ddot(blas_handle, nO*nV*1_8, d_cc_space_f_vo, 1, t1, 1, e)
+   energy = energy + 2.d0*e
 
 end
 
diff --git a/src/gpu/gpu.irp.f b/src/gpu/gpu.irp.f
new file mode 100644
index 00000000..e91d66f5
--- /dev/null
+++ b/src/gpu/gpu.irp.f
@@ -0,0 +1,11 @@
+use gpu
+
+BEGIN_PROVIDER [ type(gpu_blas), blas_handle ]
+ implicit none
+ BEGIN_DOC
+ ! Handle for cuBLAS or RocBLAS
+ END_DOC
+ call gpu_blas_create(blas_handle)
+END_PROVIDER
+
+

From 860121d404f7ae255790cd12136139103bdc48d0 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Sat, 29 Jun 2024 02:27:50 +0200
Subject: [PATCH 17/19] H_oo on GPU

---
 plugins/local/gpu_nvidia/gpu.c         | 224 +++++++++++----------
 plugins/local/gpu_x86/gpu.c            |  38 ++--
 src/ccsd/ccsd_space_orb_sub.irp.f      | 112 ++++++++---
 src/ccsd/ccsd_space_orb_sub_chol.irp.f | 127 ++++++++----
 src/gpu/gpu.irp.f                      |   7 +
 src/gpu/gpu_module.F90                 | 260 +++++++++++++++++++++----
 6 files changed, 540 insertions(+), 228 deletions(-)

diff --git a/plugins/local/gpu_nvidia/gpu.c b/plugins/local/gpu_nvidia/gpu.c
index f0bd247a..189de64c 100644
--- a/plugins/local/gpu_nvidia/gpu.c
+++ b/plugins/local/gpu_nvidia/gpu.c
@@ -1,5 +1,6 @@
 #include <stdint.h>
 #include <stdio.h>
+#include <stdbool.h>
 #include <stdlib.h>
 #include <string.h>
 #include <assert.h>
@@ -10,6 +11,10 @@
 
 /* Generic functions */
 
+bool no_gpu() {
+ return false;
+}
+
 int gpu_ndevices() {
   int ngpus;
   cudaGetDeviceCount(&ngpus);
@@ -17,7 +22,7 @@ int gpu_ndevices() {
 }
 
 void gpu_set_device(int32_t igpu) {
-  cudaSetDevice(igpu);
+  cudaSetDevice((int) igpu);
 }
 
 
@@ -64,22 +69,20 @@ void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
 
 /* Streams */
 
-void gpu_stream_create(void** ptr) {
-  cudaStream_t stream;
-  cudaError_t rc = cudaStreamCreate(&stream);
+void gpu_stream_create(cudaStream_t* ptr) {
+  cudaError_t rc = cudaStreamCreate(ptr);
   assert (rc == cudaSuccess);
-  *ptr = (void*) stream;
 }
 
-void gpu_stream_destroy(void** ptr) {
-  assert (*ptr != NULL);
-  cudaError_t rc = cudaStreamDestroy( (cudaStream_t) *ptr);
+void gpu_stream_destroy(cudaStream_t* ptr) {
+  assert (ptr != NULL);
+  cudaError_t rc = cudaStreamDestroy(*ptr);
   assert (rc == cudaSuccess);
   *ptr = NULL;
 }
 
-void gpu_set_stream(void** handle, void** stream) {
-  cublasSetStream( (cublasHandle_t) *handle, (cudaStream_t) *stream);
+void gpu_set_stream(cublasHandle_t handle, cudaStream_t stream) {
+  cublasSetStream(handle, stream);
 }
 
 void gpu_synchronize() {
@@ -89,75 +92,80 @@ void gpu_synchronize() {
 
 /* BLAS functions */
 
-void gpu_blas_create(void** handle) {
-  cublasHandle_t cublas_handle;
-  cublasStatus_t rc = cublasCreate(&cublas_handle);
+void gpu_blas_create(cublasHandle_t* ptr) {
+  cublasStatus_t rc = cublasCreate(ptr);
   assert (rc == CUBLAS_STATUS_SUCCESS);
-  *handle = (void*) cublas_handle;
 }
 
 
-void gpu_blas_destroy(void** handle) {
-  assert (*handle != NULL);
-  cublasStatus_t rc = cublasDestroy( (cublasHandle_t) *handle);
+void gpu_blas_destroy(cublasHandle_t* ptr) {
+  assert (ptr != NULL);
+  cublasStatus_t rc = cublasDestroy(*ptr);
   assert (rc == CUBLAS_STATUS_SUCCESS);
-  *handle = NULL;
+  ptr = NULL;
 }
 
 
-void gpu_ddot(void** handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
-  assert (*handle != NULL);
+void gpu_ddot(cublasHandle_t handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
+  assert (handle != NULL);
+  /* Convert to int */
+  int n_, incx_, incy_;
 
-  /* Convert to int32_t */
-  int32_t n_, incx_, incy_;
+  n_    = (int) n;
+  incx_ = (int) incx;
+  incy_ = (int) incy;
 
-  n_    = (int32_t) n;
-  incx_ = (int32_t) incx;
-  incy_ = (int32_t) incy;
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  cublasStatus_t rc = cublasDdot(handle, n_, x, incx_, y, incy_, result);
+/*
+  double alpha = 1.0;
+  double beta = 0.0;
+  cublasStatus_t rc = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, n_, &alpha, x, 1, y, n_, &beta, &result_, 1);
+*/
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+}
+
+
+
+void gpu_sdot(cublasHandle_t handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
+  assert (handle != NULL);
+
+  /* Convert to int */
+  int n_, incx_, incy_;
+
+  n_    = (int) n;
+  incx_ = (int) incx;
+  incy_ = (int) incy;
 
   /* Check for integer overflows */
   assert ( (int64_t)    n_ == n   );
   assert ( (int64_t) incx_ == incx);
   assert ( (int64_t) incy_ == incy);
 
-  cublasDdot((cublasHandle_t) *handle, n_, x, incx_, y, incy_, result);
+  float result_ = 0.;
+  cublasStatus_t rc = cublasSdot(handle, n_, x, incx_, y, incy_, &result_);
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+  *result = result_;
 }
 
 
 
-void gpu_sdot(void** handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
-  assert (*handle != NULL);
-
-  /* Convert to int32_t */
-  int32_t n_, incx_, incy_;
-
-  n_    = (int32_t) n;
-  incx_ = (int32_t) incx;
-  incy_ = (int32_t) incy;
-
-  /* Check for integer overflows */
-  assert ( (int64_t)    n_ == n   );
-  assert ( (int64_t) incx_ == incx);
-  assert ( (int64_t) incy_ == incy);
-
-  cublasSdot((cublasHandle_t) *handle, n_, x, incx_, y, incy_, result);
-}
-
-
-
-void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t n, const double alpha,
+void gpu_dgemv(cublasHandle_t handle, const char transa, const int64_t m, const int64_t n, const double alpha,
                const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) {
 
-  assert (*handle != NULL);
+  assert (handle != NULL);
 
-  /* Convert to int32_t */
-  int32_t m_, n_, lda_, incx_, incy_;
+  /* Convert to int */
+  int m_, n_, lda_, incx_, incy_;
 
-  m_    = (int32_t) m;
-  n_    = (int32_t) n;
-  lda_  = (int32_t) lda;
-  incx_ = (int32_t) incx;
-  incy_ = (int32_t) incy;
+  m_    = (int) m;
+  n_    = (int) n;
+  lda_  = (int) lda;
+  incx_ = (int) incx;
+  incy_ = (int) incy;
 
   /* Check for integer overflows */
   assert ( (int64_t)    m_ == m   );
@@ -169,24 +177,24 @@ void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t
   cublasOperation_t transa_ = CUBLAS_OP_N;
   if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
 
-  cublasDgemv((cublasHandle_t) *handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_);
+  cublasDgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_);
 }
 
 
 
-void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t n, const float alpha,
+void gpu_sgemv(cublasHandle_t handle, const char transa, const int64_t m, const int64_t n, const float alpha,
                const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) {
 
-  assert (*handle != NULL);
+  assert (handle != NULL);
 
-  /* Convert to int32_t */
-  int32_t m_, n_, lda_, incx_, incy_;
+  /* Convert to int */
+  int m_, n_, lda_, incx_, incy_;
 
-  m_    = (int32_t) m;
-  n_    = (int32_t) n;
-  lda_  = (int32_t) lda;
-  incx_ = (int32_t) incx;
-  incy_ = (int32_t) incy;
+  m_    = (int) m;
+  n_    = (int) n;
+  lda_  = (int) lda;
+  incx_ = (int) incx;
+  incy_ = (int) incy;
 
   /* Check for integer overflows */
   assert ( (int64_t)    m_ == m   );
@@ -198,24 +206,24 @@ void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t
   cublasOperation_t transa_ = CUBLAS_OP_N;
   if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
 
-  cublasSgemv((cublasHandle_t) *handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_);
+  cublasSgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_);
 }
 
 
-void gpu_dgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
+void gpu_dgemm(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
                const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) {
 
-  assert (*handle != NULL);
+  assert (handle != NULL);
 
-  /* Convert to int32_t */
-  int32_t m_, n_, k_, lda_, ldb_, ldc_;
+  /* Convert to int */
+  int m_, n_, k_, lda_, ldb_, ldc_;
 
-  m_   = (int32_t) m;
-  n_   = (int32_t) n;
-  k_   = (int32_t) k;
-  lda_ = (int32_t) lda;
-  ldb_ = (int32_t) ldb;
-  ldc_ = (int32_t) ldc;
+  m_   = (int) m;
+  n_   = (int) n;
+  k_   = (int) k;
+  lda_ = (int) lda;
+  ldb_ = (int) ldb;
+  ldc_ = (int) ldc;
 
   /* Check for integer overflows */
   assert ( (int64_t)   m_ == m  );
@@ -230,25 +238,25 @@ void gpu_dgemm(void** handle, const char transa, const char transb, const int64_
   if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
   if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
 
-  cublasDgemm((cublasHandle_t) *handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_);
+  cublasDgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_);
 }
 
 
 
-void gpu_sgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
+void gpu_sgemm(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
                const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) {
 
-  assert (*handle != NULL);
+  assert (handle != NULL);
 
-  /* Convert to int32_t */
-  int32_t m_, n_, k_, lda_, ldb_, ldc_;
+  /* Convert to int */
+  int m_, n_, k_, lda_, ldb_, ldc_;
 
-  m_   = (int32_t) m;
-  n_   = (int32_t) n;
-  k_   = (int32_t) k;
-  lda_ = (int32_t) lda;
-  ldb_ = (int32_t) ldb;
-  ldc_ = (int32_t) ldc;
+  m_   = (int) m;
+  n_   = (int) n;
+  k_   = (int) k;
+  lda_ = (int) lda;
+  ldb_ = (int) ldb;
+  ldc_ = (int) ldc;
 
   /* Check for integer overflows */
   assert ( (int64_t)   m_ == m  );
@@ -263,22 +271,22 @@ void gpu_sgemm(void** handle, const char transa, const char transb, const int64_
   if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
   if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
 
-  cublasSgemm((cublasHandle_t) *handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_);
+  cublasSgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_);
 }
 
 
-void gpu_dgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
+void gpu_dgeam(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
                const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
-  assert (*handle != NULL);
+  assert (handle != NULL);
 
-  /* Convert to int32_t */
-  int32_t m_, n_, lda_, ldb_, ldc_;
+  /* Convert to int */
+  int m_, n_, lda_, ldb_, ldc_;
 
-  m_   = (int32_t) m;
-  n_   = (int32_t) n;
-  lda_ = (int32_t) lda;
-  ldb_ = (int32_t) ldb;
-  ldc_ = (int32_t) ldc;
+  m_   = (int) m;
+  n_   = (int) n;
+  lda_ = (int) lda;
+  ldb_ = (int) ldb;
+  ldc_ = (int) ldc;
 
   /* Check for integer overflows */
   assert ( (int64_t)   m_ == m  );
@@ -292,23 +300,23 @@ void gpu_dgeam(void** handle, const char transa, const char transb, const int64_
   if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
   if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
 
-  cublasDgeam((cublasHandle_t) *handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_);
+  cublasDgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_);
 
 }
 
 
-void gpu_sgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
+void gpu_sgeam(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
                const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
-  assert (*handle != NULL);
+  assert (handle != NULL);
 
-  /* Convert to int32_t */
-  int32_t m_, n_, lda_, ldb_, ldc_;
+  /* Convert to int */
+  int m_, n_, lda_, ldb_, ldc_;
 
-  m_   = (int32_t) m;
-  n_   = (int32_t) n;
-  lda_ = (int32_t) lda;
-  ldb_ = (int32_t) ldb;
-  ldc_ = (int32_t) ldc;
+  m_   = (int) m;
+  n_   = (int) n;
+  lda_ = (int) lda;
+  ldb_ = (int) ldb;
+  ldc_ = (int) ldc;
 
   /* Check for integer overflows */
   assert ( (int64_t)   m_ == m  );
@@ -322,6 +330,6 @@ void gpu_sgeam(void** handle, const char transa, const char transb, const int64_
   if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
   if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
 
-  cublasSgeam((cublasHandle_t) *handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_);
+  cublasSgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_);
 
 }
diff --git a/plugins/local/gpu_x86/gpu.c b/plugins/local/gpu_x86/gpu.c
index ac7c3620..53267a7c 100644
--- a/plugins/local/gpu_x86/gpu.c
+++ b/plugins/local/gpu_x86/gpu.c
@@ -2,8 +2,12 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <stdbool.h>
 #include <assert.h>
 
+bool no_gpu() {
+ return true;
+}
 
 /* Generic functions */
 
@@ -56,7 +60,7 @@ void gpu_stream_destroy(void** ptr) {
   *ptr = NULL;
 }
 
-void gpu_set_stream(void** handle, void** stream) {
+void gpu_set_stream(void* handle, void* stream) {
   return;
 }
 
@@ -79,8 +83,8 @@ void gpu_blas_destroy(void** handle) {
 
 double ddot_(const int32_t* n, const double* x, const int32_t* incx, const double* y, const int32_t* incy);
 
-void gpu_ddot(void** handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
-  assert (*handle != NULL);
+void gpu_ddot(void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
+  assert (handle != NULL);
 
   /* Convert to int32_t */
   int32_t n_, incx_, incy_;
@@ -100,8 +104,8 @@ void gpu_ddot(void** handle, const int64_t n, const double* x, const int64_t inc
 
 float sdot_(const int32_t* n, const float* x, const int32_t* incx, const float* y, const int32_t* incy);
 
-void gpu_sdot(void** handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
-  assert (*handle != NULL);
+void gpu_sdot(void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
+  assert (handle != NULL);
 
   /* Convert to int32_t */
   int32_t n_, incx_, incy_;
@@ -122,10 +126,10 @@ void gpu_sdot(void** handle, const int64_t n, const float* x, const int64_t incx
 void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha,
             const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy);
 
-void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t n, const double alpha,
+void gpu_dgemv(void* handle, const char transa, const int64_t m, const int64_t n, const double alpha,
                const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) {
 
-  assert (*handle != NULL);
+  assert (handle != NULL);
 
   /* Convert to int32_t */
   int32_t m_, n_, lda_, incx_, incy_;
@@ -150,10 +154,10 @@ void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t
 void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha,
                const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy);
 
-void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t n, const float alpha,
+void gpu_sgemv(void* handle, const char transa, const int64_t m, const int64_t n, const float alpha,
                const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) {
 
-  assert (*handle != NULL);
+  assert (handle != NULL);
 
   /* Convert to int32_t */
   int32_t m_, n_, lda_, incx_, incy_;
@@ -178,10 +182,10 @@ void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t
 void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha,
             const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc);
 
-void gpu_dgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
+void gpu_dgemm(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
                const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) {
 
-  assert (*handle != NULL);
+  assert (handle != NULL);
 
   /* Convert to int32_t */
   int32_t m_, n_, k_, lda_, ldb_, ldc_;
@@ -209,10 +213,10 @@ void gpu_dgemm(void** handle, const char transa, const char transb, const int64_
 void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha,
             const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc);
 
-void gpu_sgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
+void gpu_sgemm(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
                const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) {
 
-  assert (*handle != NULL);
+  assert (handle != NULL);
 
   /* Convert to int32_t */
   int32_t m_, n_, k_, lda_, ldb_, ldc_;
@@ -236,9 +240,9 @@ void gpu_sgemm(void** handle, const char transa, const char transb, const int64_
 }
 
 
-void gpu_dgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
+void gpu_dgeam(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
                const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
-  assert (*handle != NULL);
+  assert (handle != NULL);
 
   if ( (transa == 'N' && transb == 'N') ||
        (transa == 'n' && transb == 'N') ||
@@ -368,9 +372,9 @@ void gpu_dgeam(void** handle, const char transa, const char transb, const int64_
 }
 
 
-void gpu_sgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
+void gpu_sgeam(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
                const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
-  assert (*handle != NULL);
+  assert (handle != NULL);
 
   if ( (transa == 'N' && transb == 'N') ||
        (transa == 'n' && transb == 'N') ||
diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f
index 5c2daa05..5ee7366e 100644
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@@ -14,9 +14,15 @@ subroutine run_ccsd_space_orb
   type(gpu_double2) :: t1, r1
   type(gpu_double2) :: H_oo, H_vv, H_vo
 
-  type(gpu_double2) :: d_cc_space_f_vo
+  type(gpu_double2) :: d_cc_space_f_oo, d_cc_space_f_vo
+  type(gpu_double2) :: d_cc_space_f_ov, d_cc_space_f_vv
+
+  type(gpu_double3) :: d_cc_space_v_oo_chol, d_cc_space_v_vo_chol
+  type(gpu_double3) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol
+
   type(gpu_double4) :: d_cc_space_v_oovv
 
+
   double precision, allocatable :: all_err(:,:), all_t(:,:)
   integer, allocatable          :: list_occ(:), list_vir(:)
   integer(bit_kind)             :: det(N_int,2)
@@ -24,7 +30,7 @@ subroutine run_ccsd_space_orb
 
   call set_multiple_levels_omp(.False.)
 
-  if (do_ao_cholesky) then
+  if (do_mo_cholesky) then
     PROVIDE cholesky_mo_transp
     FREE cholesky_ao
   else
@@ -55,11 +61,36 @@ subroutine run_ccsd_space_orb
   !print*,'occ',list_occ
   !print*,'vir',list_vir
 
+  ! GPU arrays
+  call gpu_allocate(d_cc_space_f_oo, nO, nO)
   call gpu_allocate(d_cc_space_f_vo, nV, nO)
-  call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV)
-  call gpu_upload(cc_space_f_vo, d_cc_space_f_vo)
-  call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv)
+  call gpu_allocate(d_cc_space_f_ov, nO, nV)
+  call gpu_allocate(d_cc_space_f_vv, nV, nV)
 
+  call gpu_upload(cc_space_f_oo, d_cc_space_f_oo)
+  call gpu_upload(cc_space_f_vo, d_cc_space_f_vo)
+  call gpu_upload(cc_space_f_vv, d_cc_space_f_vv)
+
+!  FREE cc_space_f_oo
+!  FREE cc_space_f_vo
+!  FREE cc_space_f_vv
+
+  if (do_mo_cholesky) then
+    call gpu_allocate(d_cc_space_v_oo_chol, cholesky_mo_num, nO, nO)
+    call gpu_allocate(d_cc_space_v_ov_chol, cholesky_mo_num, nO, nV)
+    call gpu_allocate(d_cc_space_v_vo_chol, cholesky_mo_num, nV, nO)
+    call gpu_allocate(d_cc_space_v_vv_chol, cholesky_mo_num, nV, nV)
+
+    call gpu_upload(cc_space_v_oo_chol, d_cc_space_v_oo_chol)
+    call gpu_upload(cc_space_v_ov_chol, d_cc_space_v_ov_chol)
+    call gpu_upload(cc_space_v_vo_chol, d_cc_space_v_vo_chol)
+    call gpu_upload(cc_space_v_vv_chol, d_cc_space_v_vv_chol)
+
+!    FREE cc_space_v_oo_chol
+!    FREE cc_space_v_ov_chol
+!    FREE cc_space_v_vo_chol
+!    FREE cc_space_v_vv_chol
+  endif
 
   call gpu_allocate(t2, nO,nO,nV,nV)
   call gpu_allocate(r2, nO,nO,nV,nV)
@@ -120,6 +151,13 @@ subroutine run_ccsd_space_orb
   call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,h_t2)
   call gpu_upload(h_t2, t2)
 
+
+  call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV)
+  call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv)
+
+!  FREE cc_space_v_oovv
+
+
   call update_tau_space(nO,nV,h_t1,t1,t2,tau)
   call update_tau_x_space(nO,nV,tau,tau_x)
   !print*,'hf_energy', hf_energy
@@ -142,10 +180,10 @@ subroutine run_ccsd_space_orb
   do while (not_converged)
 
     ! Residue
-    if (do_ao_cholesky) then
-!    if (.False.) then
-      call compute_H_oo_chol(nO,nV,tau_x,H_oo)
-      call compute_H_vv_chol(nO,nV,tau_x%f,H_vv%f)
+    if (do_mo_cholesky) then
+      call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, &
+              d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo)
+      call compute_H_vv_chol(nO,nV,tau_x,H_vv)
       call compute_H_vo_chol(nO,nV,t1%f,H_vo%f)
 
       call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1)
@@ -249,6 +287,12 @@ subroutine run_ccsd_space_orb
   call save_energy(uncorr_energy + energy, e_t)
 
   deallocate(h_t1, h_t2)
+  if (do_mo_cholesky) then
+    call gpu_deallocate(d_cc_space_v_oo_chol)
+    call gpu_deallocate(d_cc_space_v_ov_chol)
+    call gpu_deallocate(d_cc_space_v_vo_chol)
+    call gpu_deallocate(d_cc_space_v_vv_chol)
+  endif
   call gpu_deallocate(d_cc_space_f_vo)
   call gpu_deallocate(d_cc_space_v_oovv)
   call gpu_deallocate(t1)
@@ -302,8 +346,21 @@ subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,
 !  !$omp end parallel
 
 
-   call gpu_ddot(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1, d_cc_space_v_oovv, 1, energy)
-   call gpu_ddot(blas_handle, nO*nV*1_8, d_cc_space_f_vo, 1, t1, 1, e)
+   type(gpu_stream) :: s1, s2
+   call gpu_stream_create(s1)
+   call gpu_stream_create(s2)
+
+   call gpu_set_stream(blas_handle,s1)
+   call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo, 1, t1, 1, e)
+
+   call gpu_set_stream(blas_handle,s2)
+   call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1_8, d_cc_space_v_oovv, 1_8, energy)
+   call gpu_synchronize()
+   call gpu_set_stream(blas_handle,gpu_default_stream)
+
+   call gpu_stream_destroy(s1)
+   call gpu_stream_destroy(s2)
+
    energy = energy + 2.d0*e
 
 end
@@ -346,32 +403,29 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau)
 
   type(gpu_stream) :: stream(nV)
 
-  do b=1,nV
-    call gpu_stream_create(stream(b))
-  enddo
-
-  !$OMP PARALLEL &
+  !$OMP PARALLEL if (no_gpu()) &
   !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) &
   !$OMP PRIVATE(i,j,a,b) &
   !$OMP DEFAULT(NONE)
   !$OMP DO
   do b=1,nV
+    call gpu_stream_create(stream(b))
     call gpu_set_stream(blas_handle,stream(b))
     do j=1,nO
-      call gpu_dgeam(blas_handle, 'N', 'N', nO*1_8, nV*1_8, &
-         1.d0, t2%f(1,j,1,b), nO*nO*1_8, &
-         h_t1(j,b), t1%f, nO*1_8, &
-         tau%f(1,j,1,b), nO*nO*1_8)
+      call gpu_dgeam_f(blas_handle, 'N', 'N', nO, nV, &
+         1.d0, t2%f(1,j,1,b), nO*nO, &
+         h_t1(j,b), t1%f, nO, &
+         tau%f(1,j,1,b), nO*nO)
     enddo
   enddo
   !$OMP END DO
   !$OMP END PARALLEL
 
-  call gpu_synchronize()
-
   do b=1,nV
     call gpu_stream_destroy(stream(b))
   enddo
+  call gpu_set_stream(blas_handle,gpu_default_stream)
+
 
 end
 
@@ -412,7 +466,7 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x)
     call gpu_stream_create(stream(a))
   enddo
 
-  !$OMP PARALLEL &
+  !$OMP PARALLEL if (no_gpu()) &
   !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) &
   !$OMP PRIVATE(i,j,a,b) &
   !$OMP DEFAULT(NONE)
@@ -420,20 +474,20 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x)
   do b=1,nV
     do a=1,nV
       call gpu_set_stream(blas_handle,stream(a))
-      call gpu_dgeam(blas_handle, 'N', 'N', nO*1_8, nO*1_8, &
-          2.d0, tau%f(1,1,a,b), nO*1_8, &
-         -1.d0, tau%f(1,1,b,a), nO*1_8, &
-         tau_x%f(1,1,a,b), nO*1_8)
+      call gpu_dgeam_f(blas_handle, 'N', 'N', nO, nO, &
+          2.d0, tau%f(1,1,a,b), nO, &
+         -1.d0, tau%f(1,1,b,a), nO, &
+         tau_x%f(1,1,a,b), nO)
     enddo
   enddo
   !$OMP END DO
   !$OMP END PARALLEL
 
-  call gpu_synchronize()
-
   do b=1,nV
     call gpu_stream_destroy(stream(b))
   enddo
+  call gpu_set_stream(blas_handle,gpu_default_stream)
+
 
 end
 
diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
index 9b161001..288724f3 100644
--- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
@@ -293,62 +293,115 @@ end
 
 ! H_oo
 
-subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo)
+subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, &
+    d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo)
   use gpu
   implicit none
 
   integer, intent(in)           :: nO,nV
+  type(gpu_double2), intent(in)    :: d_cc_space_f_oo
+  type(gpu_double3), intent(in)    :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol
   type(gpu_double4), intent(in)    :: tau_x
   type(gpu_double2), intent(out)   :: H_oo
 
   integer :: a,b,i,j,u,k
 
-  double precision, allocatable :: tau_kau(:,:,:), tmp_vov(:,:,:)
+  type(gpu_double3) :: tau_kau, tmp_vov, tmp_ovv
 
-  allocate(tau_kau(cholesky_mo_num,nV,nO))
-  !$omp parallel &
-  !$omp default(shared) &
-  !$omp private(i,u,j,k,a,b,tmp_vov)
-  allocate(tmp_vov(nV,nO,nV) )
-  !$omp do
-  do u = 1, nO
+  call gpu_allocate(tau_kau, cholesky_mo_num, nV, nO)
+
+!  !$omp parallel &
+!  !$omp default(shared) &
+!  !$omp private(i,u,j,k,a,b,tmp_vov)
+!  call gpu_allocate(tmp_vov, nV, nO, nV)
+!  !$omp do
+!  do u = 1, nO
+!    do b=1,nV
+!      do j=1,nO
+!        do a=1,nV
+!          tmp_vov%f(a,j,b) = tau_x%f(u,j,a,b)
+!        enddo
+!      enddo
+!    enddo
+!    call dgemm('N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
+!      d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, tmp_vov%f, nV, &
+!      0.d0, tau_kau%f(1,1,u), cholesky_mo_num)
+!  enddo
+!  !$omp end do nowait
+!  call gpu_deallocate(tmp_vov)
+!  !$omp do
+!  do i = 1, nO
+!    do u = 1, nO
+!      H_oo%f(u,i) = d_cc_space_f_oo%f(u,i)
+!    enddo
+!  enddo
+!  !$omp end do nowait
+!
+!  !$omp barrier
+!  !$omp end  parallel
+!  call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
+!    tau_kau%f(1,1,1), cholesky_mo_num*nV,  d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num*nV, &
+!    1.d0, H_oo%f(1,1), nO)
+!
+
+  type(gpu_stream) :: stream(nV)
+
+  do b=1,nV
+    call gpu_stream_create(stream(b))
+  enddo
+
+  !$OMP PARALLEL if (no_gpu()) &
+  !$OMP DEFAULT(SHARED) &
+  !$OMP PRIVATE(u,b,tmp_vov,tmp_ovv)
+
+  call gpu_allocate(tmp_vov, nV, nO, nV)
+  call gpu_allocate(tmp_ovv, nO, nV, nV)
+
+  !$OMP DO
+  do u=1,nO
+    call gpu_dgeam_f(blas_handle, 'N', 'N', 1, nO*nV*nV, 1.d0, &
+           tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f, nO, tmp_ovv%f, 1)
     do b=1,nV
-      do j=1,nO
-        do a=1,nV
-          tmp_vov(a,j,b) = tau_x%f(u,j,a,b)
-        enddo
-      enddo
+      call gpu_set_stream(blas_handle,stream(b))
+      call gpu_dgeam_f(blas_handle, 'T', 'T', nV, nO, 1.d0, &
+           tmp_ovv%f(1,1,b), nO, 0.d0, &
+           tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV)
     enddo
-    call dgemm('N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
-      cc_space_v_ov_chol, cholesky_mo_num, tmp_vov, nV, &
-      0.d0, tau_kau(1,1,u), cholesky_mo_num)
+    call gpu_dgemm_f(blas_handle, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
+      d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_vov%f, nV, &
+      0.d0, tau_kau%f(1,1,u), cholesky_mo_num)
+    call gpu_synchronize()
   enddo
-  !$omp end do nowait
-  deallocate(tmp_vov)
-  !$omp do
-  do i = 1, nO
-    do u = 1, nO
-      H_oo%f(u,i) = cc_space_f_oo(u,i)
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp barrier
-  !$omp end  parallel
-  call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
-    tau_kau, cholesky_mo_num*nV,  cc_space_v_vo_chol, cholesky_mo_num*nV, &
-    1.d0, H_oo%f, nO)
+  !$OMP END DO
 
+  call gpu_deallocate(tmp_vov)
+  call gpu_deallocate(tmp_ovv)
+  !$OMP END PARALLEL
+
+  do b=1,nV
+    call gpu_stream_destroy(stream(b))
+  enddo
+
+  call gpu_set_stream(blas_handle,gpu_default_stream)
+
+  call gpu_copy(d_cc_space_f_oo, H_oo)
+
+  call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
+    tau_kau, cholesky_mo_num*nV,  d_cc_space_v_vo_chol, cholesky_mo_num*nV, &
+    1.d0, H_oo, nO)
+
+  call gpu_deallocate(tau_kau)
 end
 
 ! H_vv
 
 subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv)
-
+  use gpu
   implicit none
 
   integer, intent(in)           :: nO,nV
-  double precision, intent(in)  :: tau_x(nO, nO, nV, nV)
-  double precision, intent(out) :: H_vv(nV, nV)
+  type(gpu_double4), intent(in)    :: tau_x
+  type(gpu_double2), intent(out)   :: H_vv
 
   integer :: a,b,i,j,u,k, beta
 
@@ -364,7 +417,7 @@ subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv)
     do b=1,nV
       do j=1,nO
         do i=1,nO
-          tmp_oov(i,j,b) = tau_x(i,j,a,b)
+          tmp_oov(i,j,b) = tau_x%f(i,j,a,b)
         enddo
       enddo
     enddo
@@ -378,7 +431,7 @@ subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv)
   !$omp do
   do beta = 1, nV
     do a = 1, nV
-      H_vv(a,beta) = cc_space_f_vv(a,beta)
+      H_vv%f(a,beta) = cc_space_f_vv(a,beta)
     enddo
   enddo
   !$omp end do nowait
@@ -386,7 +439,7 @@ subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv)
   !$omp end  parallel
   call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
     tau_kia, cholesky_mo_num*nO,  cc_space_v_ov_chol, cholesky_mo_num*nO, &
-    1.d0, H_vv, nV)
+    1.d0, H_vv%f, nV)
 
 end
 
diff --git a/src/gpu/gpu.irp.f b/src/gpu/gpu.irp.f
index e91d66f5..6ad0a075 100644
--- a/src/gpu/gpu.irp.f
+++ b/src/gpu/gpu.irp.f
@@ -8,4 +8,11 @@ BEGIN_PROVIDER [ type(gpu_blas), blas_handle ]
  call gpu_blas_create(blas_handle)
 END_PROVIDER
 
+BEGIN_PROVIDER [ type(gpu_stream), gpu_default_stream ]
+ implicit none
+ BEGIN_DOC
+ ! Default stream
+ END_DOC
+ gpu_default_stream%c = C_NULL_PTR
+END_PROVIDER
 
diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90
index ecf79c83..2676b339 100644
--- a/src/gpu/gpu_module.F90
+++ b/src/gpu/gpu_module.F90
@@ -49,7 +49,12 @@ module gpu
 ! ------------
 
   interface
+    logical(c_bool) function no_gpu() bind(C)
+      import
+    end function
+
     integer function gpu_ndevices() bind(C)
+      import
     end function
 
     subroutine gpu_set_device(id) bind(C)
@@ -101,7 +106,7 @@ module gpu
 
     subroutine gpu_set_stream_c(handle, stream) bind(C, name='gpu_set_stream')
       import
-      type(c_ptr) :: handle, stream
+      type(c_ptr), value :: handle, stream
     end subroutine
 
     subroutine gpu_synchronize() bind(C)
@@ -120,15 +125,15 @@ module gpu
 
     subroutine gpu_ddot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_ddot')
       import
-      type(c_ptr), intent(in)        :: handle
+      type(c_ptr), value, intent(in) :: handle
       integer(c_int64_t), value      :: n, incx, incy
-      type(c_ptr), intent(in), value :: dx, dy
+      type(c_ptr), value             :: dx, dy
       real(c_double), intent(out)    :: res
     end subroutine
 
     subroutine gpu_sdot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_sdot')
       import
-      type(c_ptr), intent(in)        :: handle
+      type(c_ptr), value, intent(in) :: handle
       integer(c_int64_t), value      :: n, incx, incy
       type(c_ptr), intent(in), value :: dx, dy
       real(c_float), intent(out)  :: res
@@ -137,8 +142,8 @@ module gpu
     subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
       b, ldb, c, ldc) bind(C, name='gpu_dgeam')
       import
-      type(c_ptr), intent(in)        :: handle
-      character(c_char), intent(in), value :: transa, transb
+      type(c_ptr), value, intent(in)        :: handle
+      character(c_char), intent(in), value  :: transa, transb
       integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
       real(c_double), intent(in), value :: alpha, beta
       type(c_ptr), value :: a, b, c
@@ -147,13 +152,33 @@ module gpu
     subroutine gpu_sgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
       b, ldb, c, ldc) bind(C, name='gpu_sgeam')
       import
-      type(c_ptr), intent(in)        :: handle
-      character(c_char), intent(in), value :: transa, transb
+      type(c_ptr), value, intent(in)        :: handle
+      character(c_char), intent(in), value  :: transa, transb
       integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
       real(c_float), intent(in), value :: alpha, beta
       type(c_ptr), value :: a, b, c
     end subroutine
 
+    subroutine gpu_dgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, &
+      b, ldb, beta, c, ldc) bind(C, name='gpu_dgemm')
+      import
+      type(c_ptr), value, intent(in)        :: handle
+      character(c_char), intent(in), value  :: transa, transb
+      integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc
+      real(c_double), intent(in), value :: alpha, beta
+      type(c_ptr), value :: a, b, c
+    end subroutine
+
+    subroutine gpu_sgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, &
+      b, ldb, beta, c, ldc) bind(C, name='gpu_sgemm')
+      import
+      type(c_ptr), value, intent(in)        :: handle
+      character(c_char), intent(in), value  :: transa, transb
+      integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc
+      real(c_float), intent(in), value :: alpha, beta
+      type(c_ptr), value :: a, b, c
+    end subroutine
+
   end interface
 
 
@@ -161,20 +186,26 @@ module gpu
 ! ----------------------
 
   interface gpu_allocate
-    procedure gpu_allocate_double1  &
-             ,gpu_allocate_double2  &
-             ,gpu_allocate_double3  &
-             ,gpu_allocate_double4  &
-             ,gpu_allocate_double5  &
-             ,gpu_allocate_double6
+    procedure gpu_allocate_double1     &
+             ,gpu_allocate_double2     &
+             ,gpu_allocate_double3     &
+             ,gpu_allocate_double4     &
+             ,gpu_allocate_double5     &
+             ,gpu_allocate_double6     &
+             ,gpu_allocate_double1_64  &
+             ,gpu_allocate_double2_64  &
+             ,gpu_allocate_double3_64  &
+             ,gpu_allocate_double4_64  &
+             ,gpu_allocate_double5_64  &
+             ,gpu_allocate_double6_64
   end interface gpu_allocate
 
   interface gpu_deallocate
-    procedure gpu_deallocate_double1  &
-             ,gpu_deallocate_double2  &
-             ,gpu_deallocate_double3  &
-             ,gpu_deallocate_double4  &
-             ,gpu_deallocate_double5  &
+    procedure gpu_deallocate_double1     &
+             ,gpu_deallocate_double2     &
+             ,gpu_deallocate_double3     &
+             ,gpu_deallocate_double4     &
+             ,gpu_deallocate_double5     &
              ,gpu_deallocate_double6
   end interface gpu_deallocate
 
@@ -267,6 +298,61 @@ module gpu
     end subroutine
 
 
+    subroutine gpu_allocate_double1_64(ptr, s)
+      implicit none
+      type(gpu_double1), intent(inout) :: ptr
+      integer*8, intent(in) :: s
+
+      call gpu_allocate_c(ptr%c, s)
+      call c_f_pointer(ptr%c, ptr%f, (/ s /))
+    end subroutine
+
+    subroutine gpu_allocate_double2_64(ptr, s1, s2)
+      implicit none
+      type(gpu_double2), intent(inout) :: ptr
+      integer*8, intent(in) :: s1, s2
+
+      call gpu_allocate_c(ptr%c, s1*s2*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /))
+    end subroutine
+
+    subroutine gpu_allocate_double3_64(ptr, s1, s2, s3)
+      implicit none
+      type(gpu_double3), intent(inout) :: ptr
+      integer*8, intent(in) :: s1, s2, s3
+
+      call gpu_allocate_c(ptr%c, s1*s2*s3*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /))
+    end subroutine
+
+    subroutine gpu_allocate_double4_64(ptr, s1, s2, s3, s4)
+      implicit none
+      type(gpu_double4), intent(inout) :: ptr
+      integer*8, intent(in) :: s1, s2, s3, s4
+
+      call gpu_allocate_c(ptr%c, s1*s2*s3*s4*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /))
+    end subroutine
+
+    subroutine gpu_allocate_double5_64(ptr, s1, s2, s3, s4, s5)
+      implicit none
+      type(gpu_double5), intent(inout) :: ptr
+      integer*8, intent(in) :: s1, s2, s3, s4, s5
+
+      call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /))
+    end subroutine
+
+    subroutine gpu_allocate_double6_64(ptr, s1, s2, s3, s4, s5, s6)
+      implicit none
+      type(gpu_double6), intent(inout) :: ptr
+      integer*8, intent(in) :: s1, s2, s3, s4, s5, s6
+
+      call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*s6*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /))
+    end subroutine
+
+
 ! gpu_deallocate
 ! --------------
 
@@ -494,19 +580,38 @@ end module
 subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res)
   use gpu
   type(gpu_blas), intent(in)     :: handle
-  integer*8                      :: n, incx, incy
-  double precision, target, intent(in)   :: dx(*), dy(*)
-  double precision, intent(out)    :: res
-  call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res)
+  integer*4                      :: n, incx, incy
+  type(gpu_double1), intent(in)  :: dx, dy
+  double precision, intent(out)  :: res
+  call gpu_ddot_c(handle%c, int(n,c_int64_t), dx%c, int(incx,c_int64_t), dy%c, int(incy,c_int64_t), res)
 end subroutine
 
-subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res)
+subroutine gpu_ddot_f(handle, n, dx, incx, dy, incy, res)
+  use gpu
+  type(gpu_blas), intent(in)     :: handle
+  integer*4                      :: n, incx, incy
+  double precision, target       :: dx(*), dy(*)
+  double precision, intent(out)  :: res
+  call gpu_ddot_c(handle%c, int(n,c_int64_t), c_loc(dx), int(incx,c_int64_t), c_loc(dy), int(incy,c_int64_t), res)
+end subroutine
+
+
+subroutine gpu_ddot_64(handle, n, dx, incx, dy, incy, res)
   use gpu
   type(gpu_blas), intent(in)     :: handle
   integer*8                      :: n, incx, incy
-  real, target, intent(in)       :: dx(*), dy(*)
-  real, intent(out)              :: res
-  call gpu_sdot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res)
+  type(gpu_double1), intent(in)  :: dx, dy
+  double precision, intent(out)  :: res
+  call gpu_ddot_c(handle%c, n, dx%c, incx, dy%c, incy, res)
+end subroutine
+
+subroutine gpu_ddot_f_64(handle, n, dx, incx, dy, incy, res)
+  use gpu
+  type(gpu_blas), intent(in)     :: handle
+  integer*8                      :: n, incx, incy
+  double precision, target       :: dx(*), dy(*)
+  double precision, intent(out)  :: res
+  call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res)
 end subroutine
 
 
@@ -518,22 +623,103 @@ subroutine gpu_dgeam(handle, transa, transb, m, n, alpha, a, lda, beta, &
   use gpu
   type(gpu_blas), intent(in)   :: handle
   character, intent(in)        :: transa, transb
-  integer*8, intent(in)        :: m, n, lda, ldb, ldc
+  integer*4, intent(in)        :: m, n, lda, ldb, ldc
   double precision, intent(in) :: alpha, beta
-  double precision, target     :: a(lda,*), b(ldb,*), c(ldc,*)
-  call gpu_dgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, &
-        c_loc(b), ldb, c_loc(c), ldc)
+  type(gpu_double2)            :: a, b, c
+  call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, a%c, int(lda,c_int64_t), beta, &
+        b%c, int(ldb,c_int64_t), c%c, int(ldc,c_int64_t))
 end subroutine
 
-subroutine gpu_sgeam(handle, transa, transb, m, n, alpha, a, lda, beta, &
+
+subroutine gpu_dgeam_f(handle, transa, transb, m, n, alpha, a, lda, beta, &
   b, ldb, c, ldc)
- use gpu 
+  use gpu
+  type(gpu_blas), intent(in)   :: handle
+  character, intent(in)        :: transa, transb
+  integer*4, intent(in)        :: m, n, lda, ldb, ldc
+  double precision, intent(in) :: alpha, beta
+  double precision, target     :: a(*), b(*), c(*)
+  call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, &
+        c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t))
+end subroutine
+
+
+subroutine gpu_dgeam_64(handle, transa, transb, m, n, alpha, a, lda, beta, &
+  b, ldb, c, ldc)
+  use gpu
   type(gpu_blas), intent(in)   :: handle
   character, intent(in)        :: transa, transb
   integer*8, intent(in)        :: m, n, lda, ldb, ldc
-  real, intent(in)             :: alpha, beta
-  real, target                 :: a(lda,*), b(ldb,*), c(ldc,*)
-  call gpu_sgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, &
-        c_loc(b), ldb, c_loc(c), ldc)
+  double precision, intent(in) :: alpha, beta
+  type(gpu_double2)            :: a, b, c
+  call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, a%c, int(lda,c_int64_t), beta, &
+        b%c, int(ldb,c_int64_t), c%c, int(ldc,c_int64_t))
+end subroutine
+
+
+subroutine gpu_dgeam_f_64(handle, transa, transb, m, n, alpha, a, lda, beta, &
+  b, ldb, c, ldc)
+  use gpu
+  type(gpu_blas), intent(in)   :: handle
+  character, intent(in)        :: transa, transb
+  integer*8, intent(in)        :: m, n, lda, ldb, ldc
+  double precision, intent(in) :: alpha, beta
+  double precision, target     :: a(*), b(*), c(*)
+  call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, &
+        c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t))
+end subroutine
+
+
+! gemm
+! ----
+
+subroutine gpu_dgemm(handle, transa, transb, m, n, k, alpha, a, lda, &
+  b, ldb, beta, c, ldc)
+  use gpu
+  type(gpu_blas), intent(in)   :: handle
+  character, intent(in)        :: transa, transb
+  integer*4, intent(in)        :: m, n, k, lda, ldb, ldc
+  double precision, intent(in) :: alpha, beta
+  type(gpu_double2)            :: a, b, c
+  call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), &
+        alpha, a%c, int(lda,c_int64_t), &
+        b%c, int(ldb,c_int64_t), beta, c%c, int(ldc,c_int64_t))
+end subroutine
+
+subroutine gpu_dgemm_64(handle, transa, transb, m, n, k, alpha, a, lda, &
+  b, ldb, beta, c, ldc)
+  use gpu
+  type(gpu_blas), intent(in)   :: handle
+  character, intent(in)        :: transa, transb
+  integer*8, intent(in)        :: m, n, k, lda, ldb, ldc
+  double precision, intent(in) :: alpha, beta
+  type(gpu_double2)            :: a, b, c
+  call gpu_dgemm_c(handle%c, transa, transb, m, n, k, &
+        alpha, a%c, lda, b%c, ldb, beta, c%c, ldc)
+end subroutine
+
+subroutine gpu_dgemm_f(handle, transa, transb, m, n, k, alpha, a, lda, &
+  b, ldb, beta, c, ldc)
+  use gpu
+  type(gpu_blas), intent(in)   :: handle
+  character, intent(in)        :: transa, transb
+  integer*4, intent(in)        :: m, n, k, lda, ldb, ldc
+  double precision, intent(in) :: alpha, beta
+  double precision, target     :: a(*), b(*), c(*)
+  call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), &
+        alpha, c_loc(a), int(lda,c_int64_t), &
+        c_loc(b), int(ldb,c_int64_t), beta, c_loc(c), int(ldc,c_int64_t))
+end subroutine
+
+subroutine gpu_dgemm_f_64(handle, transa, transb, m, n, k, alpha, a, lda, &
+  b, ldb, beta, c, ldc)
+  use gpu
+  type(gpu_blas), intent(in)   :: handle
+  character, intent(in)        :: transa, transb
+  integer*8, intent(in)        :: m, n, k, lda, ldb, ldc
+  double precision, intent(in) :: alpha, beta
+  double precision, target     :: a(*), b(*), c(*)
+  call gpu_dgemm_c(handle%c, transa, transb, m, n, k, &
+        alpha, c_loc(a), lda, c_loc(b), ldb, beta, c_loc(c), ldc)
 end subroutine
 

From d3c1994c64ed9ae9914ce605a6b7c364ac518d9b Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Mon, 1 Jul 2024 18:04:48 +0200
Subject: [PATCH 18/19] H_vv

---
 plugins/local/gpu_nvidia/gpu.c         |  16 +--
 plugins/local/gpu_x86/gpu.c            |   6 +-
 src/ccsd/ccsd_space_orb_sub.irp.f      |   7 +-
 src/ccsd/ccsd_space_orb_sub_chol.irp.f | 146 ++++++++++++++++---------
 src/gpu/gpu.irp.f                      |   8 ++
 5 files changed, 114 insertions(+), 69 deletions(-)

diff --git a/plugins/local/gpu_nvidia/gpu.c b/plugins/local/gpu_nvidia/gpu.c
index 189de64c..39a82984 100644
--- a/plugins/local/gpu_nvidia/gpu.c
+++ b/plugins/local/gpu_nvidia/gpu.c
@@ -11,10 +11,6 @@
 
 /* Generic functions */
 
-bool no_gpu() {
- return false;
-}
-
 int gpu_ndevices() {
   int ngpus;
   cudaGetDeviceCount(&ngpus);
@@ -35,13 +31,13 @@ void gpu_allocate(void** ptr, const int64_t size) {
       free = INT64_MAX;
     }
 
-    /* Use managed memory if it does not fit on the GPU */
-    if (size < free && size < total/2) {
+    rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
+//    /* Use managed memory if it does not fit on the GPU */
+//    if (size < free && size < total/2) {
 //      rc= cudaMalloc(ptr, size);
-      rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
-    } else {
-      rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
-    }
+//    } else {
+//      rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
+//    }
     assert (rc == cudaSuccess);
 }
 
diff --git a/plugins/local/gpu_x86/gpu.c b/plugins/local/gpu_x86/gpu.c
index 53267a7c..dab23a25 100644
--- a/plugins/local/gpu_x86/gpu.c
+++ b/plugins/local/gpu_x86/gpu.c
@@ -5,14 +5,10 @@
 #include <stdbool.h>
 #include <assert.h>
 
-bool no_gpu() {
- return true;
-}
-
 /* Generic functions */
 
 int gpu_ndevices() {
-  return 1;
+  return 0;
 }
 
 void gpu_set_device(int32_t i) {
diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f
index 5ee7366e..0b3636ac 100644
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@@ -183,7 +183,8 @@ subroutine run_ccsd_space_orb
     if (do_mo_cholesky) then
       call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, &
               d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo)
-      call compute_H_vv_chol(nO,nV,tau_x,H_vv)
+      call compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, &
+              d_cc_space_v_ov_chol,H_vv)
       call compute_H_vo_chol(nO,nV,t1%f,H_vo%f)
 
       call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1)
@@ -403,7 +404,7 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau)
 
   type(gpu_stream) :: stream(nV)
 
-  !$OMP PARALLEL if (no_gpu()) &
+  !$OMP PARALLEL if (gpu_num == 0) &
   !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) &
   !$OMP PRIVATE(i,j,a,b) &
   !$OMP DEFAULT(NONE)
@@ -466,7 +467,7 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x)
     call gpu_stream_create(stream(a))
   enddo
 
-  !$OMP PARALLEL if (no_gpu()) &
+  !$OMP PARALLEL if (gpu_num == 0) &
   !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) &
   !$OMP PRIVATE(i,j,a,b) &
   !$OMP DEFAULT(NONE)
diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
index 288724f3..458016fb 100644
--- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
@@ -344,48 +344,47 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, &
 !    1.d0, H_oo%f(1,1), nO)
 !
 
-  type(gpu_stream) :: stream(nV)
+  type(gpu_blas) :: blas
 
-  do b=1,nV
-    call gpu_stream_create(stream(b))
-  enddo
 
-  !$OMP PARALLEL if (no_gpu()) &
+  !$OMP PARALLEL  &
   !$OMP DEFAULT(SHARED) &
-  !$OMP PRIVATE(u,b,tmp_vov,tmp_ovv)
+  !$OMP PRIVATE(blas,u,b,tmp_vov,tmp_ovv)
+
+  !$OMP SINGLE
+  !$OMP TASK
+  call gpu_copy(d_cc_space_f_oo, H_oo)
+  !$OMP END TASK
+  !$OMP END SINGLE
 
-  call gpu_allocate(tmp_vov, nV, nO, nV)
   call gpu_allocate(tmp_ovv, nO, nV, nV)
+  call gpu_allocate(tmp_vov, nV, nO, nV)
+
+  call gpu_blas_create(blas)
 
   !$OMP DO
   do u=1,nO
-    call gpu_dgeam_f(blas_handle, 'N', 'N', 1, nO*nV*nV, 1.d0, &
+    call gpu_dgeam_f(blas, 'N', 'N', 1, nO*nV*nV, 1.d0, &
            tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f, nO, tmp_ovv%f, 1)
     do b=1,nV
-      call gpu_set_stream(blas_handle,stream(b))
-      call gpu_dgeam_f(blas_handle, 'T', 'T', nV, nO, 1.d0, &
+      call gpu_dgeam_f(blas, 'T', 'T', nV, nO, 1.d0, &
            tmp_ovv%f(1,1,b), nO, 0.d0, &
            tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV)
     enddo
-    call gpu_dgemm_f(blas_handle, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
+    call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
       d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_vov%f, nV, &
       0.d0, tau_kau%f(1,1,u), cholesky_mo_num)
-    call gpu_synchronize()
   enddo
   !$OMP END DO
 
+  call gpu_blas_destroy(blas)
+
   call gpu_deallocate(tmp_vov)
   call gpu_deallocate(tmp_ovv)
+
+  !$OMP TASKWAIT
   !$OMP END PARALLEL
 
-  do b=1,nV
-    call gpu_stream_destroy(stream(b))
-  enddo
-
-  call gpu_set_stream(blas_handle,gpu_default_stream)
-
-  call gpu_copy(d_cc_space_f_oo, H_oo)
-
   call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
     tau_kau, cholesky_mo_num*nV,  d_cc_space_v_vo_chol, cholesky_mo_num*nV, &
     1.d0, H_oo, nO)
@@ -395,52 +394,97 @@ end
 
 ! H_vv
 
-subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv)
+subroutine compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, &
+         d_cc_space_v_ov_chol,H_vv)
   use gpu
   implicit none
 
-  integer, intent(in)           :: nO,nV
+  integer, intent(in)              :: nO,nV
+  type(gpu_double2), intent(in)    :: d_cc_space_f_vv
+  type(gpu_double3), intent(in)    :: d_cc_space_v_ov_chol
   type(gpu_double4), intent(in)    :: tau_x
   type(gpu_double2), intent(out)   :: H_vv
 
   integer :: a,b,i,j,u,k, beta
 
-  double precision, allocatable :: tau_kia(:,:,:), tmp_oov(:,:,:)
+  type(gpu_double3) :: tau_kia, tmp_oov
 
-  allocate(tau_kia(cholesky_mo_num,nO,nV))
-  !$omp parallel &
-  !$omp default(shared) &
-  !$omp private(i,beta,j,k,a,b,tmp_oov)
-  allocate(tmp_oov(nO,nO,nV) )
-  !$omp do
+  call gpu_allocate(tau_kia, cholesky_mo_num, nO, nV)
+
+!  !$omp parallel &
+!  !$omp default(shared) &
+!  !$omp private(i,beta,j,k,a,b,tmp_oov)
+!  allocate(tmp_oov(nO,nO,nV) )
+!  !$omp do
+!  do a = 1, nV
+!    do b=1,nV
+!      do j=1,nO
+!        do i=1,nO
+!          tmp_oov(i,j,b) = tau_x%f(i,j,a,b)
+!        enddo
+!      enddo
+!    enddo
+!    call dgemm('N','T',cholesky_mo_num,nO,nO*nV,1.d0, &
+!      d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov, nO, &
+!      0.d0, tau_kia(1,1,a), cholesky_mo_num)
+!  enddo
+!  !$omp end do nowait
+!  deallocate(tmp_oov)
+
+!  !$omp do
+!  do beta = 1, nV
+!    do a = 1, nV
+!      H_vv%f(a,beta) = cc_space_f_vv(a,beta)
+!    enddo
+!  enddo
+!  !$omp end do nowait
+!  !$omp barrier
+!  !$omp end  parallel
+!  call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
+!    tau_kia, cholesky_mo_num*nO,  d_cc_space_v_ov_chol%f, cholesky_mo_num*nO, &
+!    1.d0, H_vv%f, nV)
+
+  type(gpu_blas) :: blas
+
+
+  PROVIDE gpu_num
+  !$OMP PARALLEL &
+  !$OMP DEFAULT(SHARED) &
+  !$OMP PRIVATE(a,b,tmp_oov,blas)
+
+  !$OMP SINGLE
+  !$OMP TASK
+  call gpu_copy(d_cc_space_f_vv, H_vv)
+  !$OMP END TASK
+  !$OMP END SINGLE
+
+  call gpu_blas_create(blas)
+  call gpu_allocate(tmp_oov, nO, nO, nV)
+
+  !$OMP DO
   do a = 1, nV
     do b=1,nV
-      do j=1,nO
-        do i=1,nO
-          tmp_oov(i,j,b) = tau_x%f(i,j,a,b)
-        enddo
-      enddo
+      call gpu_dgeam_f(blas, 'N', 'N', nO, nO, 1.d0, &
+        tau_x%f(1,1,a,b), nO, 0.d0, &
+        tau_x%f(1,1,a,b), nO, tmp_oov%f(1,1,b), nO)
     enddo
-    call dgemm('N','T',cholesky_mo_num,nO,nO*nV,1.d0, &
-      cc_space_v_ov_chol, cholesky_mo_num, tmp_oov, nO, &
-      0.d0, tau_kia(1,1,a), cholesky_mo_num)
+    call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nO,nO*nV,1.d0, &
+      d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov%f, nO, &
+      0.d0, tau_kia%f(1,1,a), cholesky_mo_num)
   enddo
-  !$omp end do nowait
-  deallocate(tmp_oov)
+  !$OMP END DO
 
-  !$omp do
-  do beta = 1, nV
-    do a = 1, nV
-      H_vv%f(a,beta) = cc_space_f_vv(a,beta)
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp barrier
-  !$omp end  parallel
-  call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
-    tau_kia, cholesky_mo_num*nO,  cc_space_v_ov_chol, cholesky_mo_num*nO, &
-    1.d0, H_vv%f, nV)
+  call gpu_blas_destroy(blas)
 
+  call gpu_deallocate(tmp_oov)
+  !$OMP TASKWAIT
+  !$OMP END PARALLEL
+
+  call gpu_dgemm(blas_handle,'T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
+    tau_kia, cholesky_mo_num*nO,  d_cc_space_v_ov_chol, cholesky_mo_num*nO, &
+    1.d0, H_vv, nV)
+
+  call gpu_deallocate(tau_kia)
 end
 
 ! H_vo
diff --git a/src/gpu/gpu.irp.f b/src/gpu/gpu.irp.f
index 6ad0a075..3b2feeb6 100644
--- a/src/gpu/gpu.irp.f
+++ b/src/gpu/gpu.irp.f
@@ -16,3 +16,11 @@ BEGIN_PROVIDER [ type(gpu_stream), gpu_default_stream ]
  gpu_default_stream%c = C_NULL_PTR
 END_PROVIDER
 
+BEGIN_PROVIDER [ integer, gpu_num ]
+ implicit none
+ BEGIN_DOC
+ ! Number of usable GPUs
+ END_DOC
+ gpu_num = gpu_ndevices()
+END_PROVIDER
+

From 44a7729f65a37cc3a7c35ae55f462bb1d61e411b Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Mon, 1 Jul 2024 19:00:27 +0200
Subject: [PATCH 19/19] H_ finished in CCSD

---
 src/ccsd/ccsd_space_orb_sub.irp.f      | 108 ++----
 src/ccsd/ccsd_space_orb_sub_chol.irp.f | 482 +++++++++----------------
 2 files changed, 200 insertions(+), 390 deletions(-)

diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f
index 0b3636ac..13b974be 100644
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@@ -181,11 +181,9 @@ subroutine run_ccsd_space_orb
 
     ! Residue
     if (do_mo_cholesky) then
-      call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, &
-              d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo)
-      call compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, &
-              d_cc_space_v_ov_chol,H_vv)
-      call compute_H_vo_chol(nO,nV,t1%f,H_vo%f)
+      call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo)
+      call compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, d_cc_space_v_ov_chol,H_vv)
+      call compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol, H_vo)
 
       call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1)
       call compute_r2_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r2%f,max_r2)
@@ -316,51 +314,20 @@ subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,
   integer :: i,j,a,b
   double precision :: e
 
-!  energy = 0d0
-!  !$omp parallel &
-!  !$omp shared(nO,nV,energy,tau_x,t1,&
-!  !$omp d_cc_space_f_vo,d_cc_space_v_oovv) &
-!  !$omp private(i,j,a,b,e) &
-!  !$omp default(none)
-!  e = 0d0
-!  !$omp do
-!  do a = 1, nV
-!    do i = 1, nO
-!      e = e + 2d0 * d_cc_space_f_vo%f(a,i) * t1%f(i,a)
-!    enddo
-!  enddo
-!  !$omp end do nowait
-!  !$omp do
-!  do b = 1, nV
-!    do a = 1, nV
-!      do j = 1, nO
-!        do i = 1, nO
-!          e = e + tau_x%f(i,j,a,b) * d_cc_space_v_oovv%f(i,j,a,b)
-!       enddo
-!      enddo
-!    enddo
-!  enddo
-!  !$omp end do nowait
-!  !$omp critical
-!  energy = energy + e
-!  !$omp end critical
-!  !$omp end parallel
+  type(gpu_stream) :: s1, s2
+  call gpu_stream_create(s1)
+  call gpu_stream_create(s2)
 
+  call gpu_set_stream(blas_handle,s1)
+  call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo, 1, t1, 1, e)
 
-   type(gpu_stream) :: s1, s2
-   call gpu_stream_create(s1)
-   call gpu_stream_create(s2)
+  call gpu_set_stream(blas_handle,s2)
+  call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1_8, d_cc_space_v_oovv, 1_8, energy)
+  call gpu_set_stream(blas_handle,gpu_default_stream)
 
-   call gpu_set_stream(blas_handle,s1)
-   call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo, 1, t1, 1, e)
-
-   call gpu_set_stream(blas_handle,s2)
-   call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1_8, d_cc_space_v_oovv, 1_8, energy)
-   call gpu_synchronize()
-   call gpu_set_stream(blas_handle,gpu_default_stream)
-
-   call gpu_stream_destroy(s1)
-   call gpu_stream_destroy(s2)
+  call gpu_synchronize()
+  call gpu_stream_destroy(s1)
+  call gpu_stream_destroy(s2)
 
    energy = energy + 2.d0*e
 
@@ -384,27 +351,9 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau)
   ! internal
   integer                       :: i,j,a,b
 
-!  !$OMP PARALLEL &
-!  !$OMP SHARED(nO,nV,tau,t2,t1,h_t1) &
-!  !$OMP PRIVATE(i,j,a,b) &
-!  !$OMP DEFAULT(NONE)
-!  !$OMP DO
-!  do b = 1, nV
-!    do a = 1, nV
-!      do j = 1, nO
-!        do i = 1, nO
-!          tau%f(i,j,a,b) = t2%f(i,j,a,b) + t1%f(i,a) * h_t1(j,b)
-!        enddo
-!      enddo
-!    enddo
-!  enddo
-!  !$OMP END DO
-!  !$OMP END PARALLEL
-
-
   type(gpu_stream) :: stream(nV)
 
-  !$OMP PARALLEL if (gpu_num == 0) &
+  !$OMP PARALLEL &
   !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) &
   !$OMP PRIVATE(i,j,a,b) &
   !$OMP DEFAULT(NONE)
@@ -422,6 +371,8 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau)
   !$OMP END DO
   !$OMP END PARALLEL
 
+  call gpu_synchronize()
+
   do b=1,nV
     call gpu_stream_destroy(stream(b))
   enddo
@@ -444,32 +395,15 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x)
   ! internal
   integer                       :: i,j,a,b
 
-!  !$OMP PARALLEL &
-!  !$OMP SHARED(nO,nV,tau,tau_x) &
-!  !$OMP PRIVATE(i,j,a,b) &
-!  !$OMP DEFAULT(NONE)
-!  !$OMP DO
-!  do b = 1, nV
-!    do a = 1, nV
-!      do j = 1, nO
-!        do i = 1, nO
-!          tau_x%f(i,j,a,b) = 2.d0*tau%f(i,j,a,b) - tau%f(i,j,b,a)
-!        enddo
-!      enddo
-!    enddo
-!  enddo
-!  !$OMP END DO
-!  !$OMP END PARALLEL
-
   type(gpu_stream) :: stream(nV)
 
   do a=1,nV
     call gpu_stream_create(stream(a))
   enddo
 
-  !$OMP PARALLEL if (gpu_num == 0) &
+  !$OMP PARALLEL &
   !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) &
-  !$OMP PRIVATE(i,j,a,b) &
+  !$OMP PRIVATE(a,b) &
   !$OMP DEFAULT(NONE)
   !$OMP DO
   do b=1,nV
@@ -484,10 +418,12 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x)
   !$OMP END DO
   !$OMP END PARALLEL
 
+  call gpu_set_stream(blas_handle,gpu_default_stream)
+  call gpu_synchronize()
+
   do b=1,nV
     call gpu_stream_destroy(stream(b))
   enddo
-  call gpu_set_stream(blas_handle,gpu_default_stream)
 
 
 end
diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
index 458016fb..5eb95a06 100644
--- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
@@ -1,81 +1,200 @@
-subroutine ccsd_energy_space_chol(nO,nV,tau,t1,energy)
+! H_oo
 
+subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, &
+    d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo)
+  use gpu
   implicit none
 
-  integer, intent(in)           :: nO, nV
-  double precision, intent(in)  :: tau(nO,nO,nV,nV)
-  double precision, intent(in)  :: t1(nO,nV)
-  double precision, intent(out) :: energy
+  integer, intent(in)           :: nO,nV
+  type(gpu_double2), intent(in)    :: d_cc_space_f_oo
+  type(gpu_double3), intent(in)    :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol
+  type(gpu_double4), intent(in)    :: tau_x
+  type(gpu_double2), intent(out)   :: H_oo
 
-  ! internal
-  integer :: i,j,a,b
-  double precision :: e
+  integer :: a,b,i,j,u,k
 
-  energy = 0d0
-  !$omp parallel &
-  !$omp shared(nO,nV,energy,tau,t1,&
-  !$omp cc_space_f_vo,cc_space_w_oovv) &
-  !$omp private(i,j,a,b,e) &
-  !$omp default(none)
-  e = 0d0
-  !$omp do
-  do a = 1, nV
-    do i = 1, nO
-      e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a)
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp do
-  do b = 1, nV
-    do a = 1, nV
-      do j = 1, nO
-        do i = 1, nO
-          e = e + tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b)
-       enddo
-      enddo
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp critical
-  energy = energy + e
-  !$omp end critical
-  !$omp end parallel
+  type(gpu_double3) :: tau_kau, tmp_vov, tmp_ovv
 
-end
+  call gpu_allocate(tau_kau, cholesky_mo_num, nV, nO)
 
-! Tau
+  type(gpu_blas) :: blas
 
-subroutine update_tau_space_chol(nO,nV,t1,t2,tau)
 
-  implicit none
+  !$OMP PARALLEL  &
+  !$OMP DEFAULT(SHARED) &
+  !$OMP PRIVATE(blas,u,b,tmp_vov,tmp_ovv)
 
-  ! in
-  integer, intent(in)           :: nO, nV
-  double precision, intent(in)  :: t1(nO,nV), t2(nO,nO,nV,nV)
+  !$OMP SINGLE
+  !$OMP TASK
+  call gpu_copy(d_cc_space_f_oo, H_oo)
+  !$OMP END TASK
+  !$OMP END SINGLE
 
-  ! out
-  double precision, intent(out) :: tau(nO,nO,nV,nV)
+  call gpu_allocate(tmp_ovv, nO, nV, nV)
+  call gpu_allocate(tmp_vov, nV, nO, nV)
 
-  ! internal
-  integer                       :: i,j,a,b
+  call gpu_blas_create(blas)
 
-  !$OMP PARALLEL &
-  !$OMP SHARED(nO,nV,tau,t2,t1) &
-  !$OMP PRIVATE(i,j,a,b) &
-  !$OMP DEFAULT(NONE)
   !$OMP DO
-  do b = 1, nV
-    do a = 1, nV
-      do j = 1, nO
-        do i = 1, nO
-          tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b)
-        enddo
-      enddo
+  do u=1,nO
+    call gpu_dgeam_f(blas, 'N', 'N', 1, nO*nV*nV, 1.d0, &
+           tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f, nO, tmp_ovv%f, 1)
+    do b=1,nV
+      call gpu_dgeam_f(blas, 'T', 'T', nV, nO, 1.d0, &
+           tmp_ovv%f(1,1,b), nO, 0.d0, &
+           tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV)
     enddo
+    call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
+      d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_vov%f, nV, &
+      0.d0, tau_kau%f(1,1,u), cholesky_mo_num)
   enddo
   !$OMP END DO
+
+  call gpu_blas_destroy(blas)
+
+  call gpu_deallocate(tmp_vov)
+  call gpu_deallocate(tmp_ovv)
+
+  !$OMP TASKWAIT
   !$OMP END PARALLEL
 
+  call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
+    tau_kau, cholesky_mo_num*nV,  d_cc_space_v_vo_chol, cholesky_mo_num*nV, &
+    1.d0, H_oo, nO)
+
+  call gpu_synchronize()
+  call gpu_deallocate(tau_kau)
+end
+
+! H_vv
+
+subroutine compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, &
+         d_cc_space_v_ov_chol,H_vv)
+  use gpu
+  implicit none
+
+  integer, intent(in)              :: nO,nV
+  type(gpu_double2), intent(in)    :: d_cc_space_f_vv
+  type(gpu_double3), intent(in)    :: d_cc_space_v_ov_chol
+  type(gpu_double4), intent(in)    :: tau_x
+  type(gpu_double2), intent(out)   :: H_vv
+
+  integer :: a,b,i,j,u,k, beta
+
+  type(gpu_double3) :: tau_kia, tmp_oov
+
+  call gpu_allocate(tau_kia, cholesky_mo_num, nO, nV)
+
+  type(gpu_blas) :: blas
+
+  !$OMP PARALLEL &
+  !$OMP DEFAULT(SHARED) &
+  !$OMP PRIVATE(a,b,tmp_oov,blas)
+
+  !$OMP SINGLE
+  !$OMP TASK
+  call gpu_copy(d_cc_space_f_vv, H_vv)
+  !$OMP END TASK
+  !$OMP END SINGLE
+
+  call gpu_blas_create(blas)
+  call gpu_allocate(tmp_oov, nO, nO, nV)
+
+  !$OMP DO
+  do a = 1, nV
+    do b=1,nV
+      call gpu_dgeam_f(blas, 'N', 'N', nO, nO, 1.d0, &
+        tau_x%f(1,1,a,b), nO, 0.d0, &
+        tau_x%f(1,1,a,b), nO, tmp_oov%f(1,1,b), nO)
+    enddo
+    call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nO,nO*nV,1.d0, &
+      d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov%f, nO, &
+      0.d0, tau_kia%f(1,1,a), cholesky_mo_num)
+  enddo
+  !$OMP END DO
+
+  call gpu_blas_destroy(blas)
+
+  call gpu_deallocate(tmp_oov)
+  !$OMP TASKWAIT
+  !$OMP END PARALLEL
+
+  call gpu_dgemm(blas_handle,'T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
+    tau_kia, cholesky_mo_num*nO,  d_cc_space_v_ov_chol, cholesky_mo_num*nO, &
+    1.d0, H_vv, nV)
+
+  call gpu_synchronize()
+  call gpu_deallocate(tau_kia)
+end
+
+! H_vo
+subroutine compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, &
+         d_cc_space_v_ov_chol,d_cc_space_v_vo_chol, H_vo)
+  use gpu
+  implicit none
+
+  integer, intent(in)            :: nO,nV
+  type(gpu_double2), intent(in)  :: t1, d_cc_space_f_vo
+  type(gpu_double3), intent(in)  :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol
+  type(gpu_double2), intent(out) :: H_vo
+
+  integer :: a,b,i,j,u,k
+
+  type(gpu_double1) :: tmp_k
+  type(gpu_double3) :: tmp, tmp2
+
+  call gpu_copy(d_cc_space_f_vo, H_vo)
+
+  call gpu_allocate(tmp_k, cholesky_mo_num)
+
+  call gpu_dgemm(blas_handle, 'N', 'N', cholesky_mo_num, 1, nO*nV, 2.d0, &
+     d_cc_space_v_ov_chol, cholesky_mo_num, &
+     t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num)
+
+  call gpu_dgemm(blas_handle, 'T','N',nV*nO,1,cholesky_mo_num,1.d0, &
+      d_cc_space_v_vo_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, &
+      H_vo, nV*nO)
+
+  call gpu_deallocate(tmp_k)
+
+
+  call gpu_allocate(tmp,  cholesky_mo_num, nO, nO)
+
+  call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nO, nO, nV, 1.d0, &
+    d_cc_space_v_ov_chol, cholesky_mo_num*nO, t1, nO, 0.d0, tmp, cholesky_mo_num*nO)
+
+  call gpu_allocate(tmp2, cholesky_mo_num, nO, nO)
+
+  type(gpu_stream) :: stream(nO)
+  do i=1,nO
+    call gpu_stream_create(stream(i))
+  enddo
+
+  !$OMP PARALLEL DO COLLAPSE(2) PRIVATE(i,j)
+  do i=1,nO
+    do j=1,nO
+      call gpu_set_stream(blas_handle,stream(j))
+      call gpu_dgeam_f(blas_handle, 'N', 'N', cholesky_mo_num, 1, 1.d0, &
+        tmp%f(1,i,j), cholesky_mo_num, 0.d0, &
+        tmp%f(1,i,j), cholesky_mo_num, tmp2%f(1,j,i), cholesky_mo_num)
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+  call gpu_set_stream(blas_handle,gpu_default_stream)
+  call gpu_synchronize()
+
+  do i=1,nO
+    call gpu_stream_destroy(stream(i))
+  enddo
+  call gpu_deallocate(tmp)
+
+  call gpu_dgemm(blas_handle, 'T','N', nV, nO, cholesky_mo_num*nO, -1.d0, &
+    d_cc_space_v_ov_chol, cholesky_mo_num*nO, tmp2, cholesky_mo_num*nO, &
+    1.d0, H_vo, nV)
+
+  call gpu_synchronize()
+  call gpu_deallocate(tmp2)
 end
 
 ! R1
@@ -291,251 +410,6 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
 
 end
 
-! H_oo
-
-subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, &
-    d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo)
-  use gpu
-  implicit none
-
-  integer, intent(in)           :: nO,nV
-  type(gpu_double2), intent(in)    :: d_cc_space_f_oo
-  type(gpu_double3), intent(in)    :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol
-  type(gpu_double4), intent(in)    :: tau_x
-  type(gpu_double2), intent(out)   :: H_oo
-
-  integer :: a,b,i,j,u,k
-
-  type(gpu_double3) :: tau_kau, tmp_vov, tmp_ovv
-
-  call gpu_allocate(tau_kau, cholesky_mo_num, nV, nO)
-
-!  !$omp parallel &
-!  !$omp default(shared) &
-!  !$omp private(i,u,j,k,a,b,tmp_vov)
-!  call gpu_allocate(tmp_vov, nV, nO, nV)
-!  !$omp do
-!  do u = 1, nO
-!    do b=1,nV
-!      do j=1,nO
-!        do a=1,nV
-!          tmp_vov%f(a,j,b) = tau_x%f(u,j,a,b)
-!        enddo
-!      enddo
-!    enddo
-!    call dgemm('N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
-!      d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, tmp_vov%f, nV, &
-!      0.d0, tau_kau%f(1,1,u), cholesky_mo_num)
-!  enddo
-!  !$omp end do nowait
-!  call gpu_deallocate(tmp_vov)
-!  !$omp do
-!  do i = 1, nO
-!    do u = 1, nO
-!      H_oo%f(u,i) = d_cc_space_f_oo%f(u,i)
-!    enddo
-!  enddo
-!  !$omp end do nowait
-!
-!  !$omp barrier
-!  !$omp end  parallel
-!  call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
-!    tau_kau%f(1,1,1), cholesky_mo_num*nV,  d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num*nV, &
-!    1.d0, H_oo%f(1,1), nO)
-!
-
-  type(gpu_blas) :: blas
-
-
-  !$OMP PARALLEL  &
-  !$OMP DEFAULT(SHARED) &
-  !$OMP PRIVATE(blas,u,b,tmp_vov,tmp_ovv)
-
-  !$OMP SINGLE
-  !$OMP TASK
-  call gpu_copy(d_cc_space_f_oo, H_oo)
-  !$OMP END TASK
-  !$OMP END SINGLE
-
-  call gpu_allocate(tmp_ovv, nO, nV, nV)
-  call gpu_allocate(tmp_vov, nV, nO, nV)
-
-  call gpu_blas_create(blas)
-
-  !$OMP DO
-  do u=1,nO
-    call gpu_dgeam_f(blas, 'N', 'N', 1, nO*nV*nV, 1.d0, &
-           tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f, nO, tmp_ovv%f, 1)
-    do b=1,nV
-      call gpu_dgeam_f(blas, 'T', 'T', nV, nO, 1.d0, &
-           tmp_ovv%f(1,1,b), nO, 0.d0, &
-           tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV)
-    enddo
-    call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
-      d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_vov%f, nV, &
-      0.d0, tau_kau%f(1,1,u), cholesky_mo_num)
-  enddo
-  !$OMP END DO
-
-  call gpu_blas_destroy(blas)
-
-  call gpu_deallocate(tmp_vov)
-  call gpu_deallocate(tmp_ovv)
-
-  !$OMP TASKWAIT
-  !$OMP END PARALLEL
-
-  call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
-    tau_kau, cholesky_mo_num*nV,  d_cc_space_v_vo_chol, cholesky_mo_num*nV, &
-    1.d0, H_oo, nO)
-
-  call gpu_deallocate(tau_kau)
-end
-
-! H_vv
-
-subroutine compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, &
-         d_cc_space_v_ov_chol,H_vv)
-  use gpu
-  implicit none
-
-  integer, intent(in)              :: nO,nV
-  type(gpu_double2), intent(in)    :: d_cc_space_f_vv
-  type(gpu_double3), intent(in)    :: d_cc_space_v_ov_chol
-  type(gpu_double4), intent(in)    :: tau_x
-  type(gpu_double2), intent(out)   :: H_vv
-
-  integer :: a,b,i,j,u,k, beta
-
-  type(gpu_double3) :: tau_kia, tmp_oov
-
-  call gpu_allocate(tau_kia, cholesky_mo_num, nO, nV)
-
-!  !$omp parallel &
-!  !$omp default(shared) &
-!  !$omp private(i,beta,j,k,a,b,tmp_oov)
-!  allocate(tmp_oov(nO,nO,nV) )
-!  !$omp do
-!  do a = 1, nV
-!    do b=1,nV
-!      do j=1,nO
-!        do i=1,nO
-!          tmp_oov(i,j,b) = tau_x%f(i,j,a,b)
-!        enddo
-!      enddo
-!    enddo
-!    call dgemm('N','T',cholesky_mo_num,nO,nO*nV,1.d0, &
-!      d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov, nO, &
-!      0.d0, tau_kia(1,1,a), cholesky_mo_num)
-!  enddo
-!  !$omp end do nowait
-!  deallocate(tmp_oov)
-
-!  !$omp do
-!  do beta = 1, nV
-!    do a = 1, nV
-!      H_vv%f(a,beta) = cc_space_f_vv(a,beta)
-!    enddo
-!  enddo
-!  !$omp end do nowait
-!  !$omp barrier
-!  !$omp end  parallel
-!  call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
-!    tau_kia, cholesky_mo_num*nO,  d_cc_space_v_ov_chol%f, cholesky_mo_num*nO, &
-!    1.d0, H_vv%f, nV)
-
-  type(gpu_blas) :: blas
-
-
-  PROVIDE gpu_num
-  !$OMP PARALLEL &
-  !$OMP DEFAULT(SHARED) &
-  !$OMP PRIVATE(a,b,tmp_oov,blas)
-
-  !$OMP SINGLE
-  !$OMP TASK
-  call gpu_copy(d_cc_space_f_vv, H_vv)
-  !$OMP END TASK
-  !$OMP END SINGLE
-
-  call gpu_blas_create(blas)
-  call gpu_allocate(tmp_oov, nO, nO, nV)
-
-  !$OMP DO
-  do a = 1, nV
-    do b=1,nV
-      call gpu_dgeam_f(blas, 'N', 'N', nO, nO, 1.d0, &
-        tau_x%f(1,1,a,b), nO, 0.d0, &
-        tau_x%f(1,1,a,b), nO, tmp_oov%f(1,1,b), nO)
-    enddo
-    call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nO,nO*nV,1.d0, &
-      d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov%f, nO, &
-      0.d0, tau_kia%f(1,1,a), cholesky_mo_num)
-  enddo
-  !$OMP END DO
-
-  call gpu_blas_destroy(blas)
-
-  call gpu_deallocate(tmp_oov)
-  !$OMP TASKWAIT
-  !$OMP END PARALLEL
-
-  call gpu_dgemm(blas_handle,'T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
-    tau_kia, cholesky_mo_num*nO,  d_cc_space_v_ov_chol, cholesky_mo_num*nO, &
-    1.d0, H_vv, nV)
-
-  call gpu_deallocate(tau_kia)
-end
-
-! H_vo
-subroutine compute_H_vo_chol(nO,nV,t1,H_vo)
-
-  implicit none
-
-  integer, intent(in)           :: nO,nV
-  double precision, intent(in)  :: t1(nO, nV)
-  double precision, intent(out) :: H_vo(nV, nO)
-
-  integer :: a,b,i,j,u,k
-
-  double precision, allocatable :: tmp_k(:), tmp(:,:,:), tmp2(:,:,:)
-  do i=1,nO
-    do a=1,nV
-      H_vo(a,i) = cc_space_f_vo(a,i)
-    enddo
-  enddo
-
-  allocate(tmp_k(cholesky_mo_num))
-  call dgemm('N', 'N', cholesky_mo_num, 1, nO*nV, 2.d0, &
-     cc_space_v_ov_chol, cholesky_mo_num, &
-     t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num)
-
-  call dgemm('T','N',nV*nO,1,cholesky_mo_num,1.d0, &
-      cc_space_v_vo_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, &
-      H_vo, nV*nO)
-  deallocate(tmp_k)
-
-  allocate(tmp(cholesky_mo_num,nO,nO))
-  allocate(tmp2(cholesky_mo_num,nO,nO))
-
-  call dgemm('N','T', cholesky_mo_num*nO, nO, nV, 1.d0, &
-    cc_space_v_ov_chol, cholesky_mo_num*nO, t1, nO, 0.d0, tmp, cholesky_mo_num*nO)
-
-  do i=1,nO
-    do j=1,nO
-      do k=1,cholesky_mo_num
-        tmp2(k,j,i) = tmp(k,i,j)
-      enddo
-    enddo
-  enddo
-  deallocate(tmp)
-
-  call dgemm('T','N', nV, nO, cholesky_mo_num*nO, -1.d0, &
-    cc_space_v_ov_chol, cholesky_mo_num*nO, tmp2, cholesky_mo_num*nO, &
-    1.d0, H_vo, nV)
-
-end
-
 
 ! R2