Mirror of https://gitlab.com/scemama/qp_plugins_scemama.git, synced 2024-11-08 23:23:42 +01:00

Compare commits


5 Commits

Author SHA1 Message Date
971a0ff160 Refactoring 2023-08-04 16:54:48 +02:00
ec6e5fde68 Merge turpan:~/qp2/plugins/qp_plugins_scemama 2023-08-04 16:43:42 +02:00
ac2614a0f3 r1 on GPU 2023-08-04 16:42:46 +02:00
d61ecb35c4 r1 on GPU 2023-08-04 16:19:41 +02:00
a7e0832dae Starting r1 on GPU 2023-08-04 14:48:08 +02:00
14 changed files with 2006 additions and 2359 deletions


@ -1,3 +1,75 @@
[cc_thresh_conv]
type: double precision
doc: Threshold for the convergence of the residual equations.
interface: ezfio,ocaml,provider
default: 1e-6
[cc_max_iter]
type: integer
doc: Maximum number of iterations.
interface: ezfio,ocaml,provider
default: 100
[cc_diis_depth]
type: integer
doc: Maximum depth of the DIIS, i.e., the maximum number of previous iterations kept in memory. Warning: the DIIS matrices are allocated with this depth at the start and are not resized afterwards, so the calculation will crash if there is not enough memory for the requested depth.
interface: ezfio,ocaml,provider
default: 8
[cc_level_shift]
type: double precision
doc: Level shift for the CC
interface: ezfio,ocaml,provider
default: 0.0
[cc_level_shift_guess]
type: double precision
doc: Level shift for the guess of the CC amplitudes
interface: ezfio,ocaml,provider
default: 0.0
[cc_update_method]
type: character*(32)
doc: Method used to update the CC amplitudes. none -> standard update, diis -> DIIS extrapolation.
interface: ezfio,ocaml,provider
default: diis
[cc_guess_t1]
type: character*(32)
doc: Guess used to initialize the T1 amplitudes. none -> 0, MP -> perturbation theory, read -> read from disk.
interface: ezfio,ocaml,provider
default: MP
[cc_guess_t2]
type: character*(32)
doc: Guess used to initialize the T2 amplitudes. none -> 0, MP -> perturbation theory, read -> read from disk.
interface: ezfio,ocaml,provider
default: MP
[io_amplitudes]
type: Disk_access
doc: Read/Write |CCSD| amplitudes from/to disk [ Write | Read | None ]
interface: ezfio,provider,ocaml
default: None
[cc_par_t]
type: logical
doc: If true, the CCSD(T) will be computed.
interface: ezfio,ocaml,provider
default: False
[cc_dev]
type: logical
doc: Only for dev purposes.
interface: ezfio,ocaml,provider
default: False
[cc_ref]
type: integer
doc: Index of the reference determinant in psi_det for CC calculation.
interface: ezfio,ocaml,provider
default: 1
[energy]
type: double precision
doc: CCSD energy
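These keywords are exposed as IRPF90 providers, so the CC routines below use them directly by name (cc_level_shift, cc_diis_depth, cc_update_method, ...). As a rough, hypothetical sketch of how cc_thresh_conv and cc_max_iter might be consulted by a driver loop (the function name cc_should_stop and the arguments max_r and nb_iter are illustrative, not taken from this diff):

  ! Hypothetical helper: .True. when the CC iterations should stop,
  ! based only on the cc_thresh_conv and cc_max_iter providers above.
  logical function cc_should_stop(max_r, nb_iter)
    implicit none
    double precision, intent(in) :: max_r    ! largest residual element
    integer, intent(in)          :: nb_iter  ! iterations done so far
    cc_should_stop = (max_r < cc_thresh_conv) .or. (nb_iter >= cc_max_iter)
  end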


@ -1,2 +1,2 @@
hartree_fock
utils_cc_gpu
determinants

File diff suppressed because it is too large.


@ -80,11 +80,12 @@ end
! R1
subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
subroutine compute_r1_space_chol(gpu_data, nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
use gpu_module
implicit none
! in
type(c_ptr), intent(in) :: gpu_data
integer, intent(in) :: nO, nV
double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV), tau(nO,nO,nV,nV)
double precision, intent(in) :: H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO)
@ -95,177 +96,40 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
! internal
integer :: u,i,j,beta,a,b
!$omp parallel &
!$omp shared(nO,nV,r1,cc_space_f_ov) &
!$omp private(u,beta) &
!$omp default(none)
!$omp do
do beta = 1, nV
do u = 1, nO
r1(u,beta) = cc_space_f_ov(u,beta)
enddo
enddo
!$omp end do
!$omp end parallel
double precision, allocatable :: X_oo(:,:)
allocate(X_oo(nO,nO))
call dgemm('N','N', nO, nO, nV, &
-2d0, t1 , size(t1,1), &
cc_space_f_vo, size(cc_space_f_vo,1), &
0d0, X_oo , size(X_oo,1))
call dgemm('T','N', nO, nV, nO, &
1d0, X_oo, size(X_oo,2), &
t1 , size(t1,1), &
1d0, r1 , size(r1,1))
deallocate(X_oo)
call dgemm('N','N', nO, nV, nV, &
1d0, t1 , size(t1,1), &
H_vv, size(H_vv,1), &
1d0, r1 , size(r1,1))
call dgemm('N','N', nO, nV, nO, &
-1d0, H_oo, size(H_oo,1), &
t1 , size(t1,1), &
1d0, r1, size(r1,1))
double precision, allocatable :: X_voov(:,:,:,:)
allocate(X_voov(nV, nO, nO, nV))
!$omp parallel &
!$omp shared(nO,nV,X_voov,t2,t1) &
!$omp private(u,beta,i,a) &
!$omp default(none)
!$omp do
do beta = 1, nV
do u = 1, nO
do i = 1, nO
do a = 1, nV
X_voov(a,i,u,beta) = 2d0 * t2(i,u,a,beta) - t2(u,i,a,beta) + t1(u,a) * t1(i,beta)
enddo
enddo
enddo
enddo
!$omp end do
!$omp end parallel
call dgemv('T', nV*nO, nO*nV, &
1d0, X_voov, size(X_voov,1) * size(X_voov,2), &
H_vo , 1, &
1d0, r1 , 1)
deallocate(X_voov)
call compute_r1_space_chol_gpu(gpu_data, nO, nV, t1, r1, max_r1)
double precision, allocatable :: X_ovov(:,:,:,:)
allocate(X_ovov(nO, nV, nO, nV))
!$omp parallel &
!$omp shared(nO,nV,cc_space_v_ovov,cc_space_v_voov,X_ovov) &
!$omp private(u,beta,i,a) &
!$omp default(none)
!$omp do
do beta = 1, nV
do u = 1, nO
do a = 1, nv
do i = 1, nO
X_ovov(i,a,u,beta) = 2d0 * cc_space_v_voov(a,u,i,beta) - cc_space_v_ovov(u,a,i,beta)
enddo
enddo
enddo
enddo
!$omp end do
!$omp end parallel
call dgemv('T', nO*nV, nO*nV, &
1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), &
t1 , 1, &
1d0, r1 , 1)
deallocate(X_ovov)
integer :: iblock, block_size, nVmax
double precision, allocatable :: W_vvov(:,:,:,:), W_vvov_tmp(:,:,:,:), T_vvoo(:,:,:,:)
block_size = 16
allocate(W_vvov(nV,nV,nO,block_size), W_vvov_tmp(nV,nO,nV,block_size), T_vvoo(nV,nV,nO,nO))
!$omp parallel &
!$omp private(u,i,b,a) &
!$omp default(shared)
!$omp do
do u = 1, nO
do i = 1, nO
do b = 1, nV
do a = 1, nV
T_vvoo(a,b,i,u) = tau(i,u,a,b)
enddo
enddo
enddo
enddo
!$omp end do
!$omp end parallel
do iblock = 1, nV, block_size
nVmax = min(block_size,nV-iblock+1)
call dgemm('T','N', nV*nO, nV*nVmax, cholesky_mo_num, 1.d0, &
cc_space_v_vo_chol , cholesky_mo_num, &
cc_space_v_vv_chol(1,1,iblock), cholesky_mo_num, &
0.d0, W_vvov_tmp, nV*nO)
!$omp parallel &
!$omp private(b,i,a,beta) &
!$omp default(shared)
do beta = 1, nVmax
do i = 1, nO
!$omp do
do b = 1, nV
do a = 1, nV
W_vvov(a,b,i,beta) = 2d0 * W_vvov_tmp(a,i,b,beta) - W_vvov_tmp(b,i,a,beta)
enddo
enddo
!$omp end do nowait
enddo
enddo
!$omp barrier
!$omp end parallel
call dgemm('T','N',nO,nVmax,nO*nV*nV, &
1d0, T_vvoo, nV*nV*nO, &
W_vvov, nO*nV*nV, &
1d0, r1(1,iblock), nO)
enddo
deallocate(W_vvov,T_vvoo)
double precision, allocatable :: W_oovo(:,:,:,:)
allocate(W_oovo(nO,nO,nV,nO))
!$omp parallel &
!$omp shared(nO,nV,cc_space_v_vooo,W_oovo) &
!$omp private(u,a,i,j) &
!$omp default(none)
do u = 1, nO
!$omp do
do a = 1, nV
do j = 1, nO
do i = 1, nO
W_oovo(i,j,a,u) = 2d0 * cc_space_v_vooo(a,u,i,j) - cc_space_v_vooo(a,u,j,i)
enddo
enddo
enddo
!$omp end do nowait
enddo
!$omp end parallel
! !$omp parallel &
! !$omp shared(nO,nV,cc_space_v_oovo,W_oovo) &
! !$omp private(u,a,i,j) &
! !$omp default(none)
! do u = 1, nO
! !$omp do
! do a = 1, nV
! do j = 1, nO
! do i = 1, nO
! W_oovo(i,j,a,u) = 2d0 * cc_space_v_oovo(i,j,a,u) - cc_space_v_oovo(j,i,a,u)
! enddo
! enddo
! enddo
! !$omp end do nowait
! enddo
! !$omp end parallel
call dgemm('T','N', nO, nV, nO*nO*nV, &
-1d0, W_oovo, size(W_oovo,1) * size(W_oovo,2) * size(W_oovo,3), &
tau , size(tau,1) * size(tau,2) * size(tau,3), &
1d0, r1 , size(r1,1))
deallocate(W_oovo)
! call dgemm('T','N', nO, nV, nO*nO*nV, &
! -1d0, W_oovo, nO * nO * nV, &
! tau , nO * nO * nV, &
! 1d0, r1 , nO)
!
! deallocate(W_oovo)
max_r1 = 0d0
do a = 1, nV

devel/ccsd_gpu/diis.irp.f (new file, 529 lines)

@ -0,0 +1,529 @@
! Code
subroutine diis_cc(all_err,all_t,sze,m,iter,t)
implicit none
BEGIN_DOC
! DIIS. Take the error vectors and the amplitudes of the previous
! iterations to compute the new amplitudes
END_DOC
! {err_i}_{i=1}^{m_it} -> B -> c
! {t_i}_{i=1}^{m_it}, c, {err_i}_{i=1}^{m_it} -> t_{m_it+1}
integer, intent(in) :: m,iter,sze
double precision, intent(in) :: all_err(sze,m)
double precision, intent(in) :: all_t(sze,m)
double precision, intent(out) :: t(sze)
double precision, allocatable :: B(:,:), c(:), zero(:)
integer :: m_iter
integer :: i,j,k
integer :: info
integer, allocatable :: ipiv(:)
double precision :: accu
m_iter = min(m,iter)
!print*,'m_iter',m_iter
allocate(B(m_iter+1,m_iter+1), c(m_iter), zero(m_iter+1))
allocate(ipiv(m+1))
! B(i,j) = < err(iter-m_iter+j),err(iter-m_iter+i) > ! iter-m_iter will be zero for us
B = 0d0
!$OMP PARALLEL &
!$OMP SHARED(B,m,m_iter,sze,all_err) &
!$OMP PRIVATE(i,j,k,accu) &
!$OMP DEFAULT(NONE)
do j = 1, m_iter
do i = 1, m_iter
accu = 0d0
!$OMP DO
do k = 1, sze
! the errors of the ith iteration are in all_err(:,m+1-i)
accu = accu + all_err(k,m+1-i) * all_err(k,m+1-j)
enddo
!$OMP END DO NOWAIT
!$OMP CRITICAL
B(i,j) = B(i,j) + accu
!$OMP END CRITICAL
enddo
enddo
!$OMP END PARALLEL
do i = 1, m_iter
B(i,m_iter+1) = -1
enddo
do j = 1, m_iter
B(m_iter+1,j) = -1
enddo
! Debug
!print*,'B'
!do i = 1, m_iter+1
! write(*,'(100(F10.6))') B(i,:)
!enddo
! (0 0 .... 0 -1)
zero = 0d0
zero(m_iter+1) = -1d0
! Solve B.c = zero
call dgesv(m_iter+1, 1, B, size(B,1), ipiv, zero, size(zero,1), info)
if (info /= 0) then
print*,'DIIS error in dgesv:', info
call abort
endif
! c corresponds to the m_iter first solutions
c = zero(1:m_iter)
! Debug
!print*,'c',c
!print*,'all_t'
!do i = 1, m
! write(*,'(100(F10.6))') all_t(:,i)
!enddo
!print*,'all_err'
!do i = 1, m
! write(*,'(100(F10.6))') all_err(:,i)
!enddo
! update T
!$OMP PARALLEL &
!$OMP SHARED(t,c,m,all_err,all_t,sze,m_iter) &
!$OMP PRIVATE(i,j,accu) &
!$OMP DEFAULT(NONE)
!$OMP DO
do i = 1, sze
t(i) = 0d0
enddo
!$OMP END DO
do i = 1, m_iter
!$OMP DO
do j = 1, sze
t(j) = t(j) + c(i) * (all_t(j,m+1-i) + all_err(j,m+1-i))
enddo
!$OMP END DO
enddo
!$OMP END PARALLEL
!print*,'new t',t
deallocate(ipiv,B,c,zero)
end
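For reference, the linear system assembled and solved above is the usual DIIS (Pulay) system; a sketch of the equations implied by the code, with m = m_iter stored iterations, error vectors e_i and amplitude vectors t_i:

  B_{ij} = \langle e_i, e_j \rangle \quad (i,j \le m), \qquad
  \begin{pmatrix} B & -\mathbf{1} \\ -\mathbf{1}^{T} & 0 \end{pmatrix}
  \begin{pmatrix} c \\ \lambda \end{pmatrix}
  =
  \begin{pmatrix} \mathbf{0} \\ -1 \end{pmatrix}, \qquad
  t_{\mathrm{new}} = \sum_{i=1}^{m} c_i \, ( t_i + e_i )

The dgesv call solves the bordered system, and the final loop forms t_new from the m most recent vectors, which are stored in the last columns of all_t and all_err.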
! Update all err
subroutine update_all_err(err,all_err,sze,m,iter)
implicit none
BEGIN_DOC
! Shift all the err vectors of the previous iterations to add the new one
! The last err vector is placed in the last position and all the others are
! moved toward the first one.
END_DOC
integer, intent(in) :: m, iter, sze
double precision, intent(in) :: err(sze)
double precision, intent(inout) :: all_err(sze,m)
integer :: i,j
integer :: m_iter
m_iter = min(m,iter)
! Shift
!$OMP PARALLEL &
!$OMP SHARED(m,all_err,err,sze) &
!$OMP PRIVATE(i,j) &
!$OMP DEFAULT(NONE)
do i = 1, m-1
!$OMP DO
do j = 1, sze
all_err(j,i) = all_err(j,i+1)
enddo
!$OMP END DO
enddo
! Debug
!print*,'shift err'
!do i = 1, m
! print*,i, all_err(:,i)
!enddo
! New
!$OMP DO
do i = 1, sze
all_err(i,m) = err(i)
enddo
!$OMP END DO
!$OMP END PARALLEL
! Debug
!print*,'Updated err'
!do i = 1, m
! print*,i, all_err(:,i)
!enddo
end
! Update all t
subroutine update_all_t(t,all_t,sze,m,iter)
implicit none
BEGIN_DOC
! Shift all the t vectors of the previous iterations to add the new one
! The last t vector is placed in the last position and all the others are
! moved toward the first one.
END_DOC
integer, intent(in) :: m, iter, sze
double precision, intent(in) :: t(sze)
double precision, intent(inout) :: all_t(sze,m)
integer :: i,j
integer :: m_iter
m_iter = min(m,iter)
! Shift
!$OMP PARALLEL &
!$OMP SHARED(m,all_t,t,sze) &
!$OMP PRIVATE(i,j) &
!$OMP DEFAULT(NONE)
do i = 1, m-1
!$OMP DO
do j = 1, sze
all_t(j,i) = all_t(j,i+1)
enddo
!$OMP END DO
enddo
! New
!$OMP DO
do i = 1, sze
all_t(i,m) = t(i)
enddo
!$OMP END DO
!$OMP END PARALLEL
! Debug
!print*,'Updated t'
!do i = 1, m
! print*,i, all_t(:,i)
!enddo
end
! Err1
subroutine compute_err1(nO,nV,f_o,f_v,r1,err1)
implicit none
BEGIN_DOC
! Compute the error vector for the t1
END_DOC
integer, intent(in) :: nO, nV
double precision, intent(in) :: f_o(nO), f_v(nV), r1(nO,nV)
double precision, intent(out) :: err1(nO,nV)
integer :: i,a
!$OMP PARALLEL &
!$OMP SHARED(err1,r1,f_o,f_v,nO,nV,cc_level_shift) &
!$OMP PRIVATE(i,a) &
!$OMP DEFAULT(NONE)
!$OMP DO
do a = 1, nV
do i = 1, nO
err1(i,a) = - r1(i,a) / (f_o(i) - f_v(a) - cc_level_shift)
enddo
enddo
!$OMP END DO
!$OMP END PARALLEL
end
! Err2
subroutine compute_err2(nO,nV,f_o,f_v,r2,err2)
implicit none
BEGIN_DOC
! Compute the error vector for the t2
END_DOC
integer, intent(in) :: nO, nV
double precision, intent(in) :: f_o(nO), f_v(nV), r2(nO,nO,nV,nV)
double precision, intent(out) :: err2(nO,nO,nV,nV)
integer :: i,j,a,b
!$OMP PARALLEL &
!$OMP SHARED(err2,r2,f_o,f_v,nO,nV,cc_level_shift) &
!$OMP PRIVATE(i,j,a,b) &
!$OMP DEFAULT(NONE)
!$OMP DO collapse(3)
do b = 1, nV
do a = 1, nV
do j = 1, nO
do i = 1, nO
err2(i,j,a,b) = - r2(i,j,a,b) / (f_o(i) + f_o(j) - f_v(a) - f_v(b) - cc_level_shift)
enddo
enddo
enddo
enddo
!$OMP END DO
!$OMP END PARALLEL
end
! Update t
subroutine update_t_ccsd(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2)
implicit none
integer, intent(in) :: nO,nV,nb_iter
double precision, intent(in) :: f_o(nO), f_v(nV)
double precision, intent(in) :: r1(nO,nV), r2(nO,nO,nV,nV)
double precision, intent(inout) :: t1(nO,nV), t2(nO,nO,nV,nV)
double precision, intent(inout) :: all_err1(nO*nV, cc_diis_depth), all_err2(nO*nO*nV*nV, cc_diis_depth)
double precision, intent(inout) :: all_t1(nO*nV, cc_diis_depth), all_t2(nO*nO*nV*nV, cc_diis_depth)
double precision, allocatable :: err1(:,:), err2(:,:,:,:)
double precision, allocatable :: tmp_err1(:), tmp_err2(:)
double precision, allocatable :: tmp_t1(:), tmp_t2(:)
if (cc_update_method == 'diis') then
allocate(err1(nO,nV), err2(nO,nO,nV,nV))
allocate(tmp_err1(nO*nV), tmp_err2(nO*nO*nV*nV))
allocate(tmp_t1(nO*nV), tmp_t2(nO*nO*nV*nV))
! DIIS extrapolation of T1 is not always beneficial, since the T1 amplitudes can be small.
! That is why T1 is updated in the standard way here (the DIIS code below is kept commented out).
! T1 error tensor
!call compute_err1(nO,nV,f_o,f_v,r1,err1)
! Transfo errors and parameters in vectors
!tmp_err1 = reshape(err1,(/nO*nV/))
!tmp_t1 = reshape(t1 ,(/nO*nV/))
! Add the error and parameter vectors with those of the previous iterations
!call update_all_err(tmp_err1,all_err1,nO*nV,cc_diis_depth,nb_iter+1)
!call update_all_t (tmp_t1 ,all_t1 ,nO*nV,cc_diis_depth,nb_iter+1)
! Diis and reshape T as a tensor
!call diis_cc(all_err1,all_t1,nO*nV,cc_diis_depth,nb_iter+1,tmp_t1)
!t1 = reshape(tmp_t1 ,(/nO,nV/))
call update_t1(nO,nV,f_o,f_v,r1,t1)
! DIIS T2
! T2 error tensor
call compute_err2(nO,nV,f_o,f_v,r2,err2)
! Transfo errors and parameters in vectors
tmp_err2 = reshape(err2,(/nO*nO*nV*nV/))
tmp_t2 = reshape(t2 ,(/nO*nO*nV*nV/))
! Add the error and parameter vectors with those of the previous iterations
call update_all_err(tmp_err2,all_err2,nO*nO*nV*nV,cc_diis_depth,nb_iter+1)
call update_all_t (tmp_t2 ,all_t2 ,nO*nO*nV*nV,cc_diis_depth,nb_iter+1)
! Diis and reshape T as a tensor
call diis_cc(all_err2,all_t2,nO*nO*nV*nV,cc_diis_depth,nb_iter+1,tmp_t2)
t2 = reshape(tmp_t2 ,(/nO,nO,nV,nV/))
deallocate(tmp_t1,tmp_t2,tmp_err1,tmp_err2,err1,err2)
! Standard update as T = T - Delta
elseif (cc_update_method == 'none') then
call update_t1(nO,nV,f_o,f_v,r1,t1)
call update_t2(nO,nV,f_o,f_v,r2,t2)
else
print*,'Unknown cc_update_method: '//cc_update_method
endif
end
! Update t v2
subroutine update_t_ccsd_diis(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2)
implicit none
integer, intent(in) :: nO,nV,nb_iter
double precision, intent(in) :: f_o(nO), f_v(nV)
double precision, intent(in) :: r1(nO,nV), r2(nO,nO,nV,nV)
double precision, intent(inout) :: t1(nO,nV), t2(nO,nO,nV,nV)
double precision, intent(inout) :: all_err1(nO*nV, cc_diis_depth), all_err2(nO*nO*nV*nV, cc_diis_depth)
double precision, intent(inout) :: all_t1(nO*nV, cc_diis_depth), all_t2(nO*nO*nV*nV, cc_diis_depth)
double precision, allocatable :: all_t(:,:), all_err(:,:), tmp_t(:)
double precision, allocatable :: err1(:,:), err2(:,:,:,:)
double precision, allocatable :: tmp_err1(:), tmp_err2(:)
double precision, allocatable :: tmp_t1(:), tmp_t2(:)
integer :: i,j
! Allocate
allocate(all_err(nO*nV+nO*nO*nV*nV,cc_diis_depth), all_t(nO*nV+nO*nO*nV*nV,cc_diis_depth))
allocate(tmp_t(nO*nV+nO*nO*nV*nV))
allocate(err1(nO,nV), err2(nO,nO,nV,nV))
allocate(tmp_err1(nO*nV), tmp_err2(nO*nO*nV*nV))
allocate(tmp_t1(nO*nV), tmp_t2(nO*nO*nV*nV))
! Compute the errors and reshape them as vector
call compute_err1(nO,nV,f_o,f_v,r1,err1)
call compute_err2(nO,nV,f_o,f_v,r2,err2)
tmp_err1 = reshape(err1,(/nO*nV/))
tmp_err2 = reshape(err2,(/nO*nO*nV*nV/))
tmp_t1 = reshape(t1 ,(/nO*nV/))
tmp_t2 = reshape(t2 ,(/nO*nO*nV*nV/))
! Update the errors and parameters for the diis
call update_all_err(tmp_err1,all_err1,nO*nV,cc_diis_depth,nb_iter+1)
call update_all_t (tmp_t1 ,all_t1 ,nO*nV,cc_diis_depth,nb_iter+1)
call update_all_err(tmp_err2,all_err2,nO*nO*nV*nV,cc_diis_depth,nb_iter+1)
call update_all_t (tmp_t2 ,all_t2 ,nO*nO*nV*nV,cc_diis_depth,nb_iter+1)
! Gather the different parameters and errors
!$OMP PARALLEL &
!$OMP SHARED(nO,nV,all_err,all_err1,all_err2,cc_diis_depth,&
!$OMP all_t,all_t1,all_t2) &
!$OMP PRIVATE(i,j) &
!$OMP DEFAULT(NONE)
do j = 1, cc_diis_depth
!$OMP DO
do i = 1, nO*nV
all_err(i,j) = all_err1(i,j)
enddo
!$OMP END DO NOWAIT
enddo
do j = 1, cc_diis_depth
!$OMP DO
do i = 1, nO*nO*nV*nV
all_err(i+nO*nV,j) = all_err2(i,j)
enddo
!$OMP END DO NOWAIT
enddo
do j = 1, cc_diis_depth
!$OMP DO
do i = 1, nO*nV
all_t(i,j) = all_t1(i,j)
enddo
!$OMP END DO NOWAIT
enddo
do j = 1, cc_diis_depth
!$OMP DO
do i = 1, nO*nO*nV*nV
all_t(i+nO*nV,j) = all_t2(i,j)
enddo
!$OMP END DO
enddo
!$OMP END PARALLEL
! Diis
call diis_cc(all_err,all_t,nO*nV+nO*nO*nV*nV,cc_diis_depth,nb_iter+1,tmp_t)
! Split the resulting vector
!$OMP PARALLEL &
!$OMP SHARED(nO,nV,tmp_t,tmp_t1,tmp_t2) &
!$OMP PRIVATE(i) &
!$OMP DEFAULT(NONE)
!$OMP DO
do i = 1, nO*nV
tmp_t1(i) = tmp_t(i)
enddo
!$OMP END DO NOWAIT
!$OMP DO
do i = 1, nO*nO*nV*nV
tmp_t2(i) = tmp_t(i+nO*nV)
enddo
!$OMP END DO
!$OMP END PARALLEL
! Reshape as tensors
t1 = reshape(tmp_t1 ,(/nO,nV/))
t2 = reshape(tmp_t2 ,(/nO,nO,nV,nV/))
! Deallocate
deallocate(tmp_t1,tmp_t2,tmp_err1,tmp_err2,err1,err2,all_t,all_err)
end
! Update t v3
subroutine update_t_ccsd_diis_v3(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err,all_t)
implicit none
integer, intent(in) :: nO,nV,nb_iter
double precision, intent(in) :: f_o(nO), f_v(nV)
double precision, intent(in) :: r1(nO,nV), r2(nO,nO,nV,nV)
double precision, intent(inout) :: t1(nO*nV), t2(nO*nO*nV*nV)
double precision, intent(inout) :: all_err(nO*nV+nO*nO*nV*nV, cc_diis_depth)
double precision, intent(inout) :: all_t(nO*nV+nO*nO*nV*nV, cc_diis_depth)
double precision, allocatable :: tmp(:)
integer :: i,j
! Allocate
allocate(tmp(nO*nV+nO*nO*nV*nV))
! Compute the errors
call compute_err1(nO,nV,f_o,f_v,r1,tmp(1:nO*nV))
call compute_err2(nO,nV,f_o,f_v,r2,tmp(nO*nV+1:nO*nV+nO*nO*nV*nV))
! Update the errors and parameters for the diis
call update_all_err(tmp,all_err,nO*nV+nO*nO*nV*nV,cc_diis_depth,nb_iter+1)
!$OMP PARALLEL &
!$OMP SHARED(nO,nV,tmp,t1,t2) &
!$OMP PRIVATE(i) &
!$OMP DEFAULT(NONE)
!$OMP DO
do i = 1, nO*nV
tmp(i) = t1(i)
enddo
!$OMP END DO NOWAIT
!$OMP DO
do i = 1, nO*nO*nV*nV
tmp(i+nO*nV) = t2(i)
enddo
!$OMP END DO
!$OMP END PARALLEL
call update_all_t(tmp,all_t,nO*nV+nO*nO*nV*nV,cc_diis_depth,nb_iter+1)
! Diis
call diis_cc(all_err,all_t,nO*nV+nO*nO*nV*nV,cc_diis_depth,nb_iter+1,tmp)
! Split the resulting vector
!$OMP PARALLEL &
!$OMP SHARED(nO,nV,tmp,t1,t2) &
!$OMP PRIVATE(i) &
!$OMP DEFAULT(NONE)
!$OMP DO
do i = 1, nO*nV
t1(i) = tmp(i)
enddo
!$OMP END DO NOWAIT
!$OMP DO
do i = 1, nO*nO*nV*nV
t2(i) = tmp(i+nO*nV)
enddo
!$OMP END DO
!$OMP END PARALLEL
! Deallocate
deallocate(tmp)
end


@ -0,0 +1,12 @@
subroutine det_energy(det,energy)
implicit none
integer(bit_kind), intent(in) :: det
double precision, intent(out) :: energy
double precision, external :: diag_H_mat_elem
energy = diag_H_mat_elem(det,N_int) + nuclear_repulsion
end
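Assuming diag_H_mat_elem returns the diagonal electronic matrix element of the determinant, as its name suggests, the routine simply evaluates

  E(D) = \langle D \,|\, \hat{H} \,|\, D \rangle + E_{\mathrm{nuc}}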


@ -43,6 +43,292 @@ void gpu_upload(gpu_data* data,
}
void compute_h_oo_chol_gpu(gpu_data* data, int igpu)
{
int ngpus = 1;
if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
igpu = igpu % ngpus;
const int cholesky_mo_num = data[igpu].cholesky_mo_num;
const int nO = data[igpu].nO;
const int nV = data[igpu].nV;
cudaSetDevice(igpu);
int m,n,k, lda, ldb, ldc;
double alpha, beta;
double* A;
double* B;
double* C;
cudaStream_t stream[nV];
cublasHandle_t handle;
cublasCreate(&handle);
double* d_H_oo = data[igpu].H_oo;
double* d_tau_x = data[igpu].tau_x;
double* d_cc_space_f_oo = data[igpu].cc_space_f_oo;
double* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol;
double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;
double* d_tau_kau;
cudaMalloc((void **)&d_tau_kau, cholesky_mo_num*nV*nO * sizeof(double));
double* d_tmp_ovv;
cudaMalloc((void **)&d_tmp_ovv, nO*nV*nV * sizeof(double));
double* d_tmp_vov;
cudaMalloc((void **)&d_tmp_vov, nV*nO*nV * sizeof(double));
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
alpha = 1.0;
beta = 0.0;
for (int u=0 ; u<nO ; ++u) {
cublasDcopy(handle, nO*nV*nV, &(d_tau_x[u]), nO, d_tmp_ovv, 1);
for (int b=0 ; b<nV ; ++b) {
cublasSetStream(handle, stream[b]);
A = &(d_tmp_ovv[nO*nV*b]); lda = nO;
B = &(d_tmp_ovv[nO*nV*b]); ldb = nO;
C = &(d_tmp_vov[nV*nO*b]); ldc = nV;
cublasDgeam(handle, CUBLAS_OP_T, CUBLAS_OP_T, nV, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
cudaDeviceSynchronize();
cublasSetStream(handle, NULL);
alpha = 1.0;
beta = 0.0;
m=cholesky_mo_num; n=nV; k=nO*nV;
A=d_cc_space_v_ov_chol; lda=cholesky_mo_num;
B=d_tmp_vov; ldb=nV;
C=&(d_tau_kau[cholesky_mo_num*nV*u]); ldc=cholesky_mo_num;
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
}
for (int i=0 ; i<nV ; ++i) {
cudaStreamDestroy(stream[i]);
}
cudaFree(d_tmp_vov);
cudaFree(d_tmp_ovv);
cublasDcopy(handle, nO*nO, d_cc_space_f_oo, 1, d_H_oo, 1);
alpha = 1.0;
beta = 1.0;
m=nO; n=nO; k=cholesky_mo_num*nV;
A=d_tau_kau; lda=cholesky_mo_num*nV;
B=d_cc_space_v_vo_chol; ldb=cholesky_mo_num*nV;
C=d_H_oo; ldc=nO;
cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
cudaFree(d_tau_kau);
double* H_oo = malloc(nO*nO*sizeof(double));
cublasGetMatrix(nO, nO, sizeof(double), d_H_oo, nO, H_oo, nO);
for (int i=0 ; i<ngpus ; ++i) {
if (i != igpu) {
double* d_H_oo = data[i].H_oo;
cudaSetDevice(i);
cublasSetMatrix(nO, nO, sizeof(double), H_oo, nO, d_H_oo, nO);
}
}
free(H_oo);
cublasDestroy(handle);
}
void compute_h_vo_chol_gpu(gpu_data* data, int igpu)
{
int ngpus = 1;
if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
igpu = igpu % ngpus;
const int cholesky_mo_num = data[igpu].cholesky_mo_num;
const int nO = data[igpu].nO;
const int nV = data[igpu].nV;
cudaSetDevice(igpu);
int m,n,k, lda, ldb, ldc;
double alpha, beta;
double* A;
double* B;
double* C;
cudaStream_t stream[nV];
cublasHandle_t handle;
cublasCreate(&handle);
double* d_t1 = data[igpu].t1;
double* d_H_vo = data[igpu].H_vo;
double* d_tau_x = data[igpu].tau_x;
double* d_cc_space_f_vo = data[igpu].cc_space_f_vo;
double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;
double* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol;
cublasDcopy(handle, nV*nO, d_cc_space_f_vo, 1, d_H_vo, 1);
double* d_tmp_k;
cudaMalloc((void **)&d_tmp_k, cholesky_mo_num * sizeof(double));
alpha = 2.0;
beta = 0.0;
m=cholesky_mo_num; n=1; k=nO*nV;
A=d_cc_space_v_ov_chol; lda=cholesky_mo_num;
B=d_t1; ldb=nO*nV;
C=d_tmp_k; ldc=cholesky_mo_num;
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
alpha = 1.0;
beta = 1.0;
m=nV*nO; n=1; k=cholesky_mo_num;
A=d_cc_space_v_vo_chol; lda=cholesky_mo_num;
B=d_tmp_k; ldb=cholesky_mo_num;
C=d_H_vo; ldc=nV*nO;
cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
cudaFree(d_tmp_k);
double* d_tmp;
cudaMalloc((void **)&d_tmp, cholesky_mo_num*nO*nO * sizeof(double));
alpha = 1.0;
beta = 0.0;
m=cholesky_mo_num*nO; n=nO; k=nV;
A=d_cc_space_v_ov_chol; lda=cholesky_mo_num*nO;
B=d_t1; ldb=nO;
C=d_tmp; ldc=cholesky_mo_num*nO;
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
double* d_tmp2;
cudaMalloc((void **)&d_tmp2, cholesky_mo_num*nO*nO * sizeof(double));
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
for (int i=0 ; i<nO ; ++i) {
for (int j=0 ; j<nO ; ++j) {
cublasSetStream(handle, stream[j]);
cublasDcopy(handle, cholesky_mo_num, &(d_tmp [cholesky_mo_num*(i+nO*j)]), 1,
&(d_tmp2[cholesky_mo_num*(j+nO*i)]), 1);
}
}
for (int i=0 ; i<nV ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
alpha = -1.0;
beta = 1.0;
m=nV; n=nO; k=cholesky_mo_num*nO;
A=d_cc_space_v_ov_chol; lda=cholesky_mo_num*nO;
B=d_tmp2; ldb=cholesky_mo_num*nO;
C=d_H_vo; ldc=nV;
cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
double* H_vo = malloc(nV*nO*sizeof(double));
cublasGetMatrix(nV, nO, sizeof(double), d_H_vo, nV, H_vo, nV);
for (int i=0 ; i<ngpus ; ++i) {
if (i != igpu) {
double* d_H_vo = data[i].H_vo;
cudaSetDevice(i);
cublasSetMatrix(nV, nO, sizeof(double), H_vo, nV, d_H_vo, nV);
}
}
free(H_vo);
cublasDestroy(handle);
}
void compute_h_vv_chol_gpu(gpu_data* data, int igpu)
{
int ngpus = 1;
if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
igpu = igpu % ngpus;
const int cholesky_mo_num = data[igpu].cholesky_mo_num;
const int nO = data[igpu].nO;
const int nV = data[igpu].nV;
cudaSetDevice(igpu);
int m,n,k, lda, ldb, ldc;
double alpha, beta;
double* A;
double* B;
double* C;
cudaStream_t stream[nV];
cublasHandle_t handle;
cublasCreate(&handle);
double* d_H_vv = data[igpu].H_vv;
double* d_tau_x = data[igpu].tau_x;
double* d_cc_space_f_vv = data[igpu].cc_space_f_vv;
double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;
double* d_tau_kia;
cudaMalloc((void **)&d_tau_kia, cholesky_mo_num*nO*nV * sizeof(double));
double* d_tmp_oov;
cudaMalloc((void **)&d_tmp_oov, nO*nO*nV * sizeof(double));
alpha = 1.0;
beta = 0.0;
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
for (int a=0 ; a<nV ; ++a) {
for (int b=0 ; b<nV ; ++b) {
cublasSetStream(handle, stream[b]);
cublasDcopy(handle, nO*nO, &(d_tau_x[nO*nO*(a+nV*b)]), 1, &(d_tmp_oov[nO*nO*b]), 1);
}
cudaDeviceSynchronize();
cublasSetStream(handle, NULL);
alpha = 1.0;
beta = 0.0;
m=cholesky_mo_num; n=nO; k=nO*nV;
A=d_cc_space_v_ov_chol; lda=cholesky_mo_num;
B=d_tmp_oov; ldb=nO;
C=&(d_tau_kia[cholesky_mo_num*nO*a]); ldc=cholesky_mo_num;
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
}
for (int i=0 ; i<nV ; ++i) {
cudaStreamDestroy(stream[i]);
}
cudaFree(d_tmp_oov);
cublasDcopy(handle, nV*nV, d_cc_space_f_vv, 1, d_H_vv, 1);
alpha = -1.0;
beta = 1.0;
m=nV; n=nV; k=cholesky_mo_num*nO;
A=d_tau_kia; lda=cholesky_mo_num*nO;
B=d_cc_space_v_ov_chol; ldb=cholesky_mo_num*nO;
C=d_H_vv; ldc=nV;
cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
cudaFree(d_tau_kia);
double* H_vv = malloc(nV*nV*sizeof(double));
cublasGetMatrix(nV, nV, sizeof(double), d_H_vv, nV, H_vv, nV);
for (int i=0 ; i<ngpus ; ++i) {
if (i != igpu) {
double* d_H_vv = data[i].H_vv;
cudaSetDevice(i);
cublasSetMatrix(nV, nV, sizeof(double), H_vv, nV, d_H_vv, nV);
}
}
free(H_vv);
cublasDestroy(handle);
}
void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, double* r2, double* max_r2)
{
const int cholesky_mo_num = data->cholesky_mo_num;
@ -1294,7 +1580,6 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
for (size_t bet=iblock ; bet<(nV < iblock+BLOCK_SIZE ? nV : iblock+BLOCK_SIZE) ; ++bet)
{
alpha = 1.0;
beta = 0.0;
A = &(d_tmpB1[nV*(bet-iblock)]); lda = nV*BLOCK_SIZE;
@ -1344,15 +1629,19 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
}
void compute_h_oo_chol_gpu(gpu_data* data, int nO, int nV, int igpu, double* H_oo)
void compute_r1_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, double* r1, double* max_r1)
{
const int cholesky_mo_num = data->cholesky_mo_num;
int ngpus = 1;
if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
igpu = igpu % ngpus;
const int cholesky_mo_num = data[igpu].cholesky_mo_num;
cudaSetDevice(igpu);
#pragma omp parallel num_threads(ngpus)
{
int m,n,k, lda, ldb, ldc;
double alpha, beta;
double* A;
@ -1360,238 +1649,248 @@ void compute_h_oo_chol_gpu(gpu_data* data, int nO, int nV, int igpu, double* H_o
double* C;
cudaStream_t stream[nV];
int igpu = omp_get_thread_num();
cudaSetDevice(igpu);
cublasHandle_t handle;
cublasCreate(&handle);
double* d_H_oo = data[igpu].H_oo;
double* d_tau_x = data[igpu].tau_x;
double* d_cc_space_f_oo = data[igpu].cc_space_f_oo;
double* d_r1;
lda = nO ;
cudaMalloc((void **)&d_r1, lda * nV * sizeof(double));
cudaMemset(d_r1, 0, nO*nV*sizeof(double));
memset(r1, 0, nO*nV*sizeof(double));
double* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol;
double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;
double* d_tau_kau;
cudaMalloc((void **)&d_tau_kau, cholesky_mo_num*nV*nO * sizeof(double));
double* d_tmp_ovv;
cudaMalloc((void **)&d_tmp_ovv, nO*nV*nV * sizeof(double));
double* d_tmp_vov;
cudaMalloc((void **)&d_tmp_vov, nV*nO*nV * sizeof(double));
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
alpha = 1.0;
beta = 0.0;
for (int u=0 ; u<nO ; ++u) {
cublasDcopy(handle, nO*nV*nV, &(d_tau_x[u]), nO, d_tmp_ovv, 1);
for (int b=0 ; b<nV ; ++b) {
cublasSetStream(handle, stream[b]);
A = &(d_tmp_ovv[nO*nV*b]); lda = nO;
B = &(d_tmp_ovv[nO*nV*b]); ldb = nO;
C = &(d_tmp_vov[nV*nO*b]); ldc = nV;
cublasDgeam(handle, CUBLAS_OP_T, CUBLAS_OP_T, nV, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
cudaDeviceSynchronize();
cublasSetStream(handle, NULL);
alpha = 1.0;
beta = 0.0;
m=cholesky_mo_num; n=nV; k=nO*nV;
A=d_cc_space_v_ov_chol; lda=cholesky_mo_num;
B=d_tmp_vov; ldb=nV;
C=&(d_tau_kau[cholesky_mo_num*nV*u]); ldc=cholesky_mo_num;
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
}
for (int i=0 ; i<nV ; ++i) {
cudaStreamDestroy(stream[i]);
}
cudaFree(d_tmp_vov);
cudaFree(d_tmp_ovv);
cublasDcopy(handle, nO*nO, d_cc_space_f_oo, 1, d_H_oo, 1);
alpha = 1.0;
beta = 1.0;
m=nO; n=nO; k=cholesky_mo_num*nV;
A=d_tau_kau; lda=cholesky_mo_num*nV;
B=d_cc_space_v_vo_chol; ldb=cholesky_mo_num*nV;
C=d_H_oo; ldc=nO;
cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
cudaFree(d_tau_kau);
// double* H_oo = malloc(nO*nO*sizeof(double));
cublasGetMatrix(nO, nO, sizeof(double), d_H_oo, nO, H_oo, nO);
for (int i=0 ; i<ngpus ; ++i) {
if (i != igpu) {
double* d_H_oo = data[i].H_oo;
cudaSetDevice(i);
cublasSetMatrix(nO, nO, sizeof(double), H_oo, nO, d_H_oo, nO);
}
}
// free(H_oo);
cublasDestroy(handle);
}
void compute_h_vv_chol_gpu(gpu_data* data, int nO, int nV, int igpu, double* H_vv)
{
int ngpus = 1;
if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
igpu = igpu % ngpus;
const int cholesky_mo_num = data[igpu].cholesky_mo_num;
cudaSetDevice(igpu);
int m,n,k, lda, ldb, ldc;
double alpha, beta;
double* A;
double* B;
double* C;
cudaStream_t stream[nV];
cublasHandle_t handle;
cublasCreate(&handle);
double* d_H_vv = data[igpu].H_vv;
double* d_tau_x = data[igpu].tau_x;
double* d_cc_space_f_vv = data[igpu].cc_space_f_vv;
double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;
double* d_tau_kia;
cudaMalloc((void **)&d_tau_kia, cholesky_mo_num*nO*nV * sizeof(double));
double* d_tmp_oov;
cudaMalloc((void **)&d_tmp_oov, nO*nO*nV * sizeof(double));
alpha = 1.0;
beta = 0.0;
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
for (int a=0 ; a<nV ; ++a) {
for (int b=0 ; b<nV ; ++b) {
cublasSetStream(handle, stream[b]);
cublasDcopy(handle, nO*nO, &(d_tau_x[nO*nO*(a+nV*b)]), 1, &(d_tmp_oov[nO*nO*b]), 1);
}
cudaDeviceSynchronize();
cublasSetStream(handle, NULL);
alpha = 1.0;
beta = 0.0;
m=cholesky_mo_num; n=nO; k=nO*nV;
A=d_cc_space_v_ov_chol; lda=cholesky_mo_num;
B=d_tmp_oov; ldb=nO;
C=&(d_tau_kia[cholesky_mo_num*nO*a]); ldc=cholesky_mo_num;
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
}
for (int i=0 ; i<nV ; ++i) {
cudaStreamDestroy(stream[i]);
}
cudaFree(d_tmp_oov);
cublasDcopy(handle, nV*nV, d_cc_space_f_vv, 1, d_H_vv, 1);
alpha = -1.0;
beta = 1.0;
m=nV; n=nV; k=cholesky_mo_num*nO;
A=d_tau_kia; lda=cholesky_mo_num*nO;
B=d_cc_space_v_ov_chol; ldb=cholesky_mo_num*nO;
C=d_H_vv; ldc=nV;
cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
cudaFree(d_tau_kia);
// double* H_vv = malloc(nO*nO*sizeof(double));
cublasGetMatrix(nV, nV, sizeof(double), d_H_vv, nV, H_vv, nV);
for (int i=0 ; i<ngpus ; ++i) {
if (i != igpu) {
double* d_H_vv = data[i].H_vv;
cudaSetDevice(i);
cublasSetMatrix(nV, nV, sizeof(double), H_vv, nV, d_H_vv, nV);
}
}
// free(H_vv);
cublasDestroy(handle);
}
void compute_h_vo_chol_gpu(gpu_data* data, int nO, int nV, int igpu, double* H_vo)
{
int ngpus = 1;
if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
igpu = igpu % ngpus;
const int cholesky_mo_num = data[igpu].cholesky_mo_num;
cudaSetDevice(igpu);
int m,n,k, lda, ldb, ldc;
double alpha, beta;
double* A;
double* B;
double* C;
cudaStream_t stream[nV];
cublasHandle_t handle;
cublasCreate(&handle);
double* d_t1 = data[igpu].t1;
double* d_H_vo = data[igpu].H_vo;
double* d_tau_x = data[igpu].tau_x;
double* d_cc_space_v_vv_chol = data[igpu].cc_space_v_vv_chol;
double* d_cc_space_v_oovo = data[igpu].cc_space_v_oovo;
double* d_cc_space_v_ovov = data[igpu].cc_space_v_ovov;
double* d_cc_space_v_voov = data[igpu].cc_space_v_voov;
double* d_cc_space_f_ov = data[igpu].cc_space_f_ov;
double* d_cc_space_f_vo = data[igpu].cc_space_f_vo;
double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;
double* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol;
double* d_tau = data[igpu].tau;
double* d_t1 = data[igpu].t1;
double* d_t2 = data[igpu].t2;
double* d_H_oo = data[igpu].H_oo;
double* d_H_vo = data[igpu].H_vo;
double* d_H_vv = data[igpu].H_vv;
cublasDcopy(handle, nV*nO, d_cc_space_f_vo, 1, d_H_vo, 1);
#pragma omp sections
{
double* d_tmp_k;
cudaMalloc((void **)&d_tmp_k, cholesky_mo_num * sizeof(double));
#pragma omp section
{
cublasDcopy(handle, nO*nV, d_cc_space_f_ov, 1, d_r1, 1);
alpha = 2.0;
double* d_X_oo;
cudaMalloc((void **)&d_X_oo, nO*nO * sizeof(double));
alpha = -2.0;
beta = 0.0;
m=cholesky_mo_num; n=1; k=nO*nV;
A=d_cc_space_v_ov_chol; lda=cholesky_mo_num;
B=d_t1; ldb=nO*nV;
C=d_tmp_k; ldc=cholesky_mo_num;
m=nO; n=nO; k=nV;
A=d_t1; lda=nO;
B=d_cc_space_f_vo; ldb=nV;
C=d_X_oo; ldc=nO;
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
alpha = 1.0;
beta = 1.0;
m=nV*nO; n=1; k=cholesky_mo_num;
A=d_cc_space_v_vo_chol; lda=cholesky_mo_num;
B=d_tmp_k; ldb=cholesky_mo_num;
C=d_H_vo; ldc=nV*nO;
m=nO; n=nV; k=nO;
A=d_X_oo; lda=nO;
B=d_t1; ldb=nO;
C=d_r1; ldc=nO;
cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
cudaFree(d_tmp_k);
double* d_tmp;
cudaMalloc((void **)&d_tmp, cholesky_mo_num*nO*nO * sizeof(double));
cudaFree(d_X_oo);
}
#pragma omp section
{
alpha = 1.0;
beta = 0.0;
m=cholesky_mo_num*nO; n=nO; k=nV;
A=d_cc_space_v_ov_chol; lda=cholesky_mo_num*nO;
B=d_t1; ldb=nO;
C=d_tmp; ldc=cholesky_mo_num*nO;
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
beta = 1.0;
m=nO; n=nV; k=nV;
A=d_t1; lda=nO;
B=d_H_vv; ldb=nV;
C=d_r1; ldc=nO;
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
}
double* d_tmp2;
cudaMalloc((void **)&d_tmp2, cholesky_mo_num*nO*nO * sizeof(double));
#pragma omp section
{
alpha = -1.0;
beta = 1.0;
m=nO; n=nV; k=nO;
A=d_H_oo; lda=nO;
B=d_t1; ldb=nO;
C=d_r1; ldc=nO;
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
}
#pragma omp section
{
double* d_X_voov;
cudaMalloc((void **)&d_X_voov, nV* nO* nO* nV * sizeof(double));
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
alpha = -1.0;
for (int i=0 ; i<nO ; ++i) {
for (int j=0 ; j<nO ; ++j) {
cublasSetStream(handle, stream[j]);
cublasDcopy(handle, cholesky_mo_num, &(d_tmp [cholesky_mo_num*(i+nO*j)]), 1,
&(d_tmp2[cholesky_mo_num*(j+nO*i)]), 1);
for (int bet=0 ; bet<nV ; ++bet) {
cublasSetStream(handle, stream[bet]);
beta = t1[i+bet*nO];
A = &(d_t2[nO*(i+nO*nV*bet)]); lda = nO*nO;
B = &(d_t1[0]); ldb = nO;
C = &(d_X_voov[nV*(i+nO*nO*bet)]); ldc = nV*nO;
cublasDgeam(handle, CUBLAS_OP_T, CUBLAS_OP_T, nV, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
}
cudaDeviceSynchronize();
alpha = 1.0;
beta = 2.0;
for (int bet=0 ; bet<nV ; ++bet) {
cublasSetStream(handle, stream[bet]);
A = &(d_X_voov[nV*nO*nO*bet]); lda = nV;
B = &(d_t2[nO*nO*nV*bet]); ldb = nO*nO;
C = A ; ldc = lda;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nV, nO*nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
for (int i=0 ; i<nV ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
alpha = 1.0;
beta = 1.0;
m=nV*nO; n=nO*nV;
A=d_X_voov; lda=nV * nO;
B=d_H_vo; ldb=1;
C=d_r1; ldc=1;
cublasDgemv(handle, CUBLAS_OP_T, m, n, &alpha, A, lda, B, ldb, &beta, C, ldc);
cudaFree(d_X_voov);
}
#pragma omp section
{
double* d_X_ovov;
cudaMalloc((void **)&d_X_ovov, nO* nV* nO* nV * sizeof(double));
cublasDcopy(handle, nO*nV*nO*nV, d_cc_space_v_ovov, 1, d_X_ovov, 1);
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
alpha = -1.0;
beta = 2.0;
for (int u=0 ; u<nO ; ++u) {
for (int bet=0 ; bet<nV ; ++bet) {
cublasSetStream(handle, stream[bet]);
A = &(d_X_ovov[nO*nV*(u+nO*bet)]); lda = nO;
B = &(d_cc_space_v_voov[(nV*(u+nO*nO*bet))]); ldb = nV*nO;
C = A ; ldc = lda;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nO, nV, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
}
for (int i=0 ; i<nV ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
alpha = 1.0;
beta = 1.0;
m=nO*nV; n=nO*nV;
A=d_X_ovov; lda=nO * nV;
B=d_t1; ldb=1;
C=d_r1; ldc=1;
cublasDgemv(handle, CUBLAS_OP_T, m, n, &alpha, A, lda, B, ldb, &beta, C, ldc);
cudaFree(d_X_ovov);
}
#pragma omp section
{
double* d_T_vvoo;
cudaMalloc((void **)&d_T_vvoo, nV*nV*nO*nO * sizeof(double));
alpha = 0.0;
beta = 1.0;
A = d_T_vvoo; lda = nV*nV;
B = d_tau; ldb = nO*nO;
C = A ; ldc = lda;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nV*nV, nO*nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
double* d_W_vvov;
cudaMalloc((void **)&d_W_vvov, nV*nV*nO*BLOCK_SIZE * sizeof(double));
double* d_W_vvov_tmp;
cudaMalloc((void **)&d_W_vvov_tmp, nV*nO*nV*BLOCK_SIZE * sizeof(double));
for (int iblock=0 ; iblock<nV ; iblock += BLOCK_SIZE) {
const int mbs = BLOCK_SIZE < nV-iblock ? BLOCK_SIZE : nV-iblock;
alpha = 1.0;
beta = 0.0;
m=nV*nO; n=nV*mbs; k=cholesky_mo_num;
A=d_cc_space_v_vo_chol; lda=cholesky_mo_num;
B=&(d_cc_space_v_vv_chol[cholesky_mo_num*nV*iblock]); ldb=cholesky_mo_num;
C=d_W_vvov_tmp; ldc=nV*nO;
cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
alpha = 2.0;
beta = -1.0;
int kk=0;
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
for (int i=0 ; i<nO ; ++i) {
for (int bet=0 ; bet<mbs ; ++bet) {
cublasSetStream(handle, stream[kk]);
++kk;
if (kk >= nV) kk = 0;
A = &(d_W_vvov_tmp[nV*(i+nO*nV*bet)]); lda = nV*nO;
B = &(d_W_vvov_tmp[nV*(i+nO*nV*bet)]); ldb = nV*nO;
C = &(d_W_vvov[nV*nV*(i+nO*bet)]); ldc = nV;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nV, nV, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
}
for (int i=0 ; i<nV ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
alpha = 1.0;
beta = 1.0;
m=nO; n=mbs; k=nO*nV*nV;
A=d_T_vvoo; lda=nV*nV*nO;
B=d_W_vvov; ldb=nO*nV*nV;
C=&(d_r1[nO*iblock]); ldc=nO;
cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
}
cudaFree(d_W_vvov);
cudaFree(d_W_vvov_tmp);
cudaFree(d_T_vvoo);
}
#pragma omp section
{
double* d_W_oovo;
cudaMalloc((void **)&d_W_oovo, nO*nO*nV*nO * sizeof(double));
alpha = 2.0;
beta = -1.0;
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
for (int u=0 ; u<nO ; ++u) {
for (int a=0 ; a<nV ; ++a) {
cublasSetStream(handle, stream[a]);
A = &(d_cc_space_v_oovo[nO*nO*(a+nV*u)]); lda = nO;
B = &(d_cc_space_v_oovo[nO*nO*(a+nV*u)]); ldb = nO;
C = &(d_W_oovo[nO*nO*(a+nV*u)]); ldc = nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
}
for (int i=0 ; i<nV ; ++i) {
@ -1601,24 +1900,35 @@ void compute_h_vo_chol_gpu(gpu_data* data, int nO, int nV, int igpu, double* H_v
alpha = -1.0;
beta = 1.0;
m=nV; n=nO; k=cholesky_mo_num*nO;
A=d_cc_space_v_ov_chol; lda=cholesky_mo_num*nO;
B=d_tmp2; ldb=cholesky_mo_num*nO;
C=d_H_vo; ldc=nV;
m=nO; n=nV; k=nO*nO*nV;
A=d_W_oovo; lda=nO * nO * nV;
B=d_tau; ldb=nO * nO * nV;
C=d_r1; ldc=nO;
cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
// double* H_vo = malloc(nO*nO*sizeof(double));
cublasGetMatrix(nV, nO, sizeof(double), d_H_vo, nV, H_vo, nV);
for (int i=0 ; i<ngpus ; ++i) {
if (i != igpu) {
double* d_H_vo = data[i].H_vo;
cudaSetDevice(i);
cublasSetMatrix(nV, nO, sizeof(double), H_vo, nV, d_H_vo, nV);
}
}
// free(H_vo);
double * r1_tmp = malloc(nO*nV*sizeof(double));
lda=nO;
cublasGetMatrix(nO, nV, sizeof(double), d_r1, lda, r1_tmp, lda);
#pragma omp critical
{
for (size_t i=0 ; i<(size_t) nO*nV ; ++i) {
r1[i] -= r1_tmp[i];
}
}
free(r1_tmp);
cudaFree(d_r1);
cublasDestroy(handle);
}
*max_r1 = 0.;
for (size_t i=0 ; i<(size_t) nO*nV ; ++i) {
const double x = r1[i] > 0. ? r1[i] : -r1[i];
*max_r1 = *max_r1 > x ? *max_r1 : x;
}
}


@ -5,6 +5,7 @@ typedef struct {
double* cc_space_v_vv_chol;
double* cc_space_v_oooo;
double* cc_space_v_vooo;
double* cc_space_v_voov;
double* cc_space_v_oovv;
double* cc_space_v_vvoo;
double* cc_space_v_oovo;
@ -12,6 +13,7 @@ typedef struct {
double* cc_space_v_ovov;
double* cc_space_v_ovoo;
double* cc_space_f_oo;
double* cc_space_f_ov;
double* cc_space_f_vo;
double* cc_space_f_vv;
double* tau;


@ -10,12 +10,12 @@ gpu_data* gpu_init(
int nO, int nV, int cholesky_mo_num,
double* cc_space_v_oo_chol, double* cc_space_v_ov_chol,
double* cc_space_v_vo_chol, double* cc_space_v_vv_chol,
double* cc_space_v_oooo, double* cc_space_v_vooo,
double* cc_space_v_oooo, double* cc_space_v_vooo, double* cc_space_v_voov,
double* cc_space_v_oovv, double* cc_space_v_vvoo,
double* cc_space_v_oovo, double* cc_space_v_ovvo,
double* cc_space_v_ovov, double* cc_space_v_ovoo,
double* cc_space_f_oo, double* cc_space_f_vo,
double* cc_space_f_vv)
double* cc_space_f_oo, double* cc_space_f_ov,
double* cc_space_f_vo, double* cc_space_f_vv)
{
int ngpus = 1;
cudaGetDeviceCount(&ngpus);
@ -59,6 +59,10 @@ gpu_data* gpu_init(
cudaMalloc((void**)&d_cc_space_v_vooo, nV*nO*nO*nO*sizeof(double));
cublasSetMatrix(nV*nO, nO*nO, sizeof(double), cc_space_v_vooo, nV*nO, d_cc_space_v_vooo, nV*nO);
double* d_cc_space_v_voov;
cudaMalloc((void**)&d_cc_space_v_voov, nV*nO*nO*nV*sizeof(double));
cublasSetMatrix(nV*nO, nO*nV, sizeof(double), cc_space_v_voov, nV*nO, d_cc_space_v_voov, nV*nO);
double* d_cc_space_v_oovv;
cudaMalloc((void**)&d_cc_space_v_oovv, nO*nO*nV*nV*sizeof(double));
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), cc_space_v_oovv, nO*nO, d_cc_space_v_oovv, nO*nO);
@ -95,6 +99,10 @@ gpu_data* gpu_init(
cudaMalloc((void**)&d_cc_space_f_vo, nV*nO*sizeof(double));
cublasSetMatrix(nV, nO, sizeof(double), cc_space_f_vo, nV, d_cc_space_f_vo, nV);
double* d_cc_space_f_ov;
cudaMalloc((void**)&d_cc_space_f_ov, nV*nO*sizeof(double));
cublasSetMatrix(nO, nV, sizeof(double), cc_space_f_ov, nO, d_cc_space_f_ov, nO);
double* d_cc_space_f_vv;
cudaMalloc((void**)&d_cc_space_f_vv, nV*nV*sizeof(double));
cublasSetMatrix(nV, nV, sizeof(double), cc_space_f_vv, nV, d_cc_space_f_vv, nV);
@ -128,6 +136,7 @@ gpu_data* gpu_init(
data[igpu].cc_space_v_vv_chol = d_cc_space_v_vv_chol;
data[igpu].cc_space_v_oooo = d_cc_space_v_oooo;
data[igpu].cc_space_v_vooo = d_cc_space_v_vooo;
data[igpu].cc_space_v_voov = d_cc_space_v_voov;
data[igpu].cc_space_v_oovv = d_cc_space_v_oovv;
data[igpu].cc_space_v_vvoo = d_cc_space_v_vvoo;
data[igpu].cc_space_v_oovo = d_cc_space_v_oovo;
@ -135,6 +144,7 @@ gpu_data* gpu_init(
data[igpu].cc_space_v_ovov = d_cc_space_v_ovov;
data[igpu].cc_space_v_ovoo = d_cc_space_v_ovoo;
data[igpu].cc_space_f_oo = d_cc_space_f_oo;
data[igpu].cc_space_f_ov = d_cc_space_f_ov;
data[igpu].cc_space_f_vo = d_cc_space_f_vo;
data[igpu].cc_space_f_vv = d_cc_space_f_vv;
data[igpu].tau = d_tau;


@ -6,9 +6,9 @@ module gpu_module
interface
type(c_ptr) function gpu_init(nO, nV, cholesky_mo_num, &
cc_space_v_oo_chol, cc_space_v_ov_chol, cc_space_v_vo_chol, cc_space_v_vv_chol, &
cc_space_v_oooo, cc_space_v_vooo, cc_space_v_oovv, cc_space_v_vvoo, &
cc_space_v_oooo, cc_space_v_vooo, cc_space_v_voov, cc_space_v_oovv, cc_space_v_vvoo, &
cc_space_v_oovo, cc_space_v_ovvo, cc_space_v_ovov, cc_space_v_ovoo, &
cc_space_f_oo, cc_space_f_vo, cc_space_f_vv) bind(C)
cc_space_f_oo, cc_space_f_ov, cc_space_f_vo, cc_space_f_vv) bind(C)
import c_int, c_double, c_ptr
integer(c_int), intent(in), value :: nO, nV, cholesky_mo_num
real(c_double), intent(in) :: cc_space_v_oo_chol(cholesky_mo_num,nO,nO)
@ -17,6 +17,7 @@ module gpu_module
real(c_double), intent(in) :: cc_space_v_vv_chol(cholesky_mo_num,nV,nV)
real(c_double), intent(in) :: cc_space_v_oooo(nO,nO,nO,nO)
real(c_double), intent(in) :: cc_space_v_vooo(nV,nO,nO,nO)
real(c_double), intent(in) :: cc_space_v_voov(nV,nO,nO,nV)
real(c_double), intent(in) :: cc_space_v_oovv(nO,nO,nV,nV)
real(c_double), intent(in) :: cc_space_v_vvoo(nV,nV,nO,nO)
real(c_double), intent(in) :: cc_space_v_oovo(nO,nO,nV,nO)
@ -24,6 +25,7 @@ module gpu_module
real(c_double), intent(in) :: cc_space_v_ovov(nO,nV,nO,nV)
real(c_double), intent(in) :: cc_space_v_ovoo(nO,nV,nO,nO)
real(c_double), intent(in) :: cc_space_f_oo(nO,nO)
real(c_double), intent(in) :: cc_space_f_ov(nO,nV)
real(c_double), intent(in) :: cc_space_f_vo(nV,nO)
real(c_double), intent(in) :: cc_space_f_vv(nV,nV)
end function
@ -38,25 +40,31 @@ module gpu_module
real(c_double), intent(in) :: tau_x(nO,nO,nV,nV)
end subroutine
subroutine compute_H_oo_chol_gpu(gpu_data, nO, nV, igpu, H_oo) bind(C)
subroutine compute_H_oo_chol_gpu(gpu_data, igpu) bind(C)
import c_int, c_double, c_ptr
type(c_ptr), value :: gpu_data
integer(c_int), intent(in), value :: nO, nV, igpu
real(c_double), intent(out) :: H_oo(nO,nO)
integer(c_int), intent(in), value :: igpu
end subroutine
subroutine compute_H_vo_chol_gpu(gpu_data, nO, nV, igpu, H_vo) bind(C)
subroutine compute_H_vo_chol_gpu(gpu_data, igpu) bind(C)
import c_int, c_double, c_ptr
type(c_ptr), value :: gpu_data
integer(c_int), intent(in), value :: nO, nV, igpu
real(c_double), intent(out) :: H_vo(nV,nO)
integer(c_int), intent(in), value :: igpu
end subroutine
subroutine compute_H_vv_chol_gpu(gpu_data, nO, nV, igpu, H_vv) bind(C)
subroutine compute_H_vv_chol_gpu(gpu_data, igpu) bind(C)
import c_int, c_double, c_ptr
type(c_ptr), value :: gpu_data
integer(c_int), intent(in), value :: nO, nV, igpu
real(c_double), intent(out) :: H_vv(nO,nO)
integer(c_int), intent(in), value :: igpu
end subroutine
subroutine compute_r1_space_chol_gpu(gpu_data, nO, nV, t1, r1, max_r1) bind(C)
import c_int, c_double, c_ptr
type(c_ptr), value :: gpu_data
integer(c_int), intent(in), value :: nO, nV
real(c_double), intent(in) :: t1(nO,nV)
real(c_double), intent(out) :: r1(nO,nV)
real(c_double), intent(out) :: max_r1
end subroutine
subroutine compute_r2_space_chol_gpu(gpu_data, nO, nV, t1, r2, max_r2) bind(C)
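On the Fortran side, the GPU path for the r1 residual reduces to calls through the interfaces above. A hypothetical sketch of how the kernels could be chained (the igpu value of 0, the wrapper name sketch_gpu_r1, and the call ordering are illustrative assumptions; only the interfaces themselves come from this diff):

  ! Hypothetical wrapper showing how the GPU kernels could be chained
  subroutine sketch_gpu_r1(gpu_data, nO, nV, t1, r1, max_r1)
    use gpu_module
    implicit none
    type(c_ptr), intent(in)       :: gpu_data   ! returned by gpu_init
    integer, intent(in)           :: nO, nV
    double precision, intent(in)  :: t1(nO,nV)
    double precision, intent(out) :: r1(nO,nV), max_r1

    call compute_H_oo_chol_gpu(gpu_data, 0)     ! H_oo intermediate, kept on device
    call compute_H_vv_chol_gpu(gpu_data, 0)     ! H_vv intermediate, kept on device
    call compute_H_vo_chol_gpu(gpu_data, 0)     ! H_vo intermediate, kept on device
    call compute_r1_space_chol_gpu(gpu_data, nO, nV, t1, r1, max_r1)
  end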


@ -0,0 +1,208 @@
! T1
subroutine guess_t1(nO,nV,f_o,f_v,f_ov,t1)
implicit none
BEGIN_DOC
! Guess the T1 amplitudes for CC
END_DOC
! in
integer, intent(in) :: nO, nV
double precision, intent(in) :: f_o(nO), f_v(nV), f_ov(nO,nV)
! inout
double precision, intent(out) :: t1(nO, nV)
! internal
integer :: i,a
if (trim(cc_guess_t1) == 'none') then
t1 = 0d0
else if (trim(cc_guess_t1) == 'MP') then
do a = 1, nV
do i = 1, nO
t1(i,a) = f_ov(i,a) / (f_o(i) - f_v(a) - cc_level_shift_guess)
enddo
enddo
else if (trim(cc_guess_t1) == 'read') then
call read_t1(nO,nV,t1)
else
print*, 'Unknown cc_guess_t1 type: '//trim(cc_guess_t1)
call abort
endif
end
! T2
subroutine guess_t2(nO,nV,f_o,f_v,v_oovv,t2)
implicit none
BEGIN_DOC
! Guess the T2 amplitudes for CC
END_DOC
! in
integer, intent(in) :: nO, nV
double precision, intent(in) :: f_o(nO), f_v(nV), v_oovv(nO, nO, nV, nV)
! inout
double precision, intent(out) :: t2(nO, nO, nV, nV)
! internal
integer :: i,j,a,b
if (trim(cc_guess_t2) == 'none') then
t2 = 0d0
else if (trim(cc_guess_t2) == 'MP') then
do b = 1, nV
do a = 1, nV
do j = 1, nO
do i = 1, nO
t2(i,j,a,b) = v_oovv(i,j,a,b) / (f_o(i) + f_o(j) - f_v(a) - f_v(b) - cc_level_shift_guess)
enddo
enddo
enddo
enddo
else if (trim(cc_guess_t2) == 'read') then
call read_t2(nO,nV,t2)
else
print*, 'Unknown cc_guess_t2 type: '//trim(cc_guess_t2)
call abort
endif
end
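With the MP option, the guesses coded above are the standard first-order (MP-like) amplitudes, written here with eps_i = f_o(i), eps_a = f_v(a), the integral v_oovv(i,j,a,b) denoted ⟨ij|ab⟩, and the shift s_g = cc_level_shift_guess:

  t_i^{a} = \frac{f_{ia}}{\varepsilon_i - \varepsilon_a - s_g}, \qquad
  t_{ij}^{ab} = \frac{\langle ij \,|\, ab \rangle}{\varepsilon_i + \varepsilon_j - \varepsilon_a - \varepsilon_b - s_g}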
! T1
subroutine write_t1(nO,nV,t1)
implicit none
BEGIN_DOC
! Write the T1 amplitudes for CC
END_DOC
! in
integer, intent(in) :: nO, nV
double precision, intent(in) :: t1(nO, nV)
! internal
integer :: i,a, iunit
integer, external :: getunitandopen
if (write_amplitudes) then
iunit = getUnitAndOpen(trim(ezfio_filename)//'/work/T1','w')
do a = 1, nV
do i = 1, nO
write(iunit,'(F20.12)') t1(i,a)
enddo
enddo
close(iunit)
endif
end
! T2
subroutine write_t2(nO,nV,t2)
implicit none
BEGIN_DOC
! Write the T2 amplitudes for CC
END_DOC
! in
integer, intent(in) :: nO, nV
double precision, intent(in) :: t2(nO, nO, nV, nV)
! internal
integer :: i,j,a,b, iunit
integer, external :: getunitandopen
if (write_amplitudes) then
iunit = getUnitAndOpen(trim(ezfio_filename)//'/work/T2','w')
do b = 1, nV
do a = 1, nV
do j = 1, nO
do i = 1, nO
write(iunit,'(F20.12)') t2(i,j,a,b)
enddo
enddo
enddo
enddo
close(iunit)
endif
end
! T1
subroutine read_t1(nO,nV,t1)
implicit none
BEGIN_DOC
! Read the T1 amplitudes for CC
END_DOC
! in
integer, intent(in) :: nO, nV
double precision, intent(out) :: t1(nO, nV)
! internal
integer :: i,a, iunit
logical :: ok
integer, external :: getunitandopen
if (read_amplitudes) then
iunit = getUnitAndOpen(trim(ezfio_filename)//'/work/T1','r')
do a = 1, nV
do i = 1, nO
read(iunit,'(F20.12)') t1(i,a)
enddo
enddo
close(iunit)
endif
end
! T2
subroutine read_t2(nO,nV,t2)
implicit none
BEGIN_DOC
! Read the T2 amplitudes for CC
END_DOC
! in
integer, intent(in) :: nO, nV
double precision, intent(out) :: t2(nO, nO, nV, nV)
! internal
integer :: i,j,a,b, iunit
logical :: ok
integer, external :: getunitandopen
if (read_amplitudes) then
iunit = getUnitAndOpen(trim(ezfio_filename)//'/work/T2','r')
do b = 1, nV
do a = 1, nV
do j = 1, nO
do i = 1, nO
read(iunit,'(F20.12)') t2(i,j,a,b)
enddo
enddo
enddo
enddo
close(iunit)
endif
end


@ -0,0 +1,328 @@
! N spin orb
subroutine extract_n_spin(det,n)
implicit none
BEGIN_DOC
! Returns the number of occupied alpha, occupied beta, virtual alpha, virtual beta spin orbitals
! in det without counting the core and deleted orbitals in the format n(nOa,nOb,nVa,nVb)
END_DOC
integer(bit_kind), intent(in) :: det(N_int,2)
integer, intent(out) :: n(4)
integer(bit_kind) :: res(N_int,2)
integer :: i, si
logical :: ok, is_core, is_del
! Init
n = 0
! Loop over the spin
do si = 1, 2
do i = 1, mo_num
call apply_hole(det, si, i, res, ok, N_int)
! in core ?
if (is_core(i)) cycle
! in del ?
if (is_del(i)) cycle
if (ok) then
! particle
n(si) = n(si) + 1
else
! hole
n(si+2) = n(si+2) + 1
endif
enddo
enddo
!print*,n(1),n(2),n(3),n(4)
end
! Spin
subroutine extract_list_orb_spin(det,nO_m,nV_m,list_occ,list_vir)
implicit none
BEGIN_DOC
! Returns the lists of occupied alpha/beta and virtual alpha/beta spin orbitals
! nO_m must be max(nOa,nOb) and nV_m must be max(nVa,nVb) (leading dimensions of list_occ and list_vir)
END_DOC
integer, intent(in) :: nO_m, nV_m
integer(bit_kind), intent(in) :: det(N_int,2)
integer, intent(out) :: list_occ(nO_m,2), list_vir(nV_m,2)
integer(bit_kind) :: res(N_int,2)
integer :: i, si, idx_o, idx_v, idx_i, idx_b
logical :: ok, is_core, is_del
list_occ = 0
list_vir = 0
! List of occ/vir alpha/beta
! occ alpha -> list_occ(:,1)
! occ beta -> list_occ(:,2)
! vir alpha -> list_vir(:,1)
! vir beta -> list_vir(:,2)
! Loop over the spin
do si = 1, 2
! tmp idx
idx_o = 1
idx_v = 1
do i = 1, mo_num
call apply_hole(det, si, i, res, ok, N_int)
! in core ?
if (is_core(i)) cycle
! in del ?
if (is_del(i)) cycle
if (ok) then
! particle
list_occ(idx_o,si) = i
idx_o = idx_o + 1
else
! hole
list_vir(idx_v,si) = i
idx_v = idx_v + 1
endif
enddo
enddo
end
! Space
subroutine extract_list_orb_space(det,nO,nV,list_occ,list_vir)
implicit none
BEGIN_DOC
! Returns the lists of occupied and virtual alpha spin orbitals
END_DOC
integer, intent(in) :: nO, nV
integer(bit_kind), intent(in) :: det(N_int,2)
integer, intent(out) :: list_occ(nO), list_vir(nV)
integer(bit_kind) :: res(N_int,2)
integer :: i, si, idx_o, idx_v, idx_i, idx_b
logical :: ok, is_core, is_del
if (elec_alpha_num /= elec_beta_num) then
print*,'Error elec_alpha_num /= elec_beta_num, impossible to create cc_list_occ and cc_list_vir, abort'
call abort
endif
list_occ = 0
list_vir = 0
! List of occ/vir alpha
! occ alpha -> list_occ(:,1)
! vir alpha -> list_vir(:,1)
! tmp idx
idx_o = 1
idx_v = 1
do i = 1, mo_num
call apply_hole(det, 1, i, res, ok, N_int)
! in core ?
if (is_core(i)) cycle
! in del ?
if (is_del(i)) cycle
if (ok) then
! particle
list_occ(idx_o) = i
idx_o = idx_o + 1
else
! hole
list_vir(idx_v) = i
idx_v = idx_v + 1
endif
enddo
end
! is_core
function is_core(i)
implicit none
BEGIN_DOC
! True if the orbital i is a core orbital
END_DOC
integer, intent(in) :: i
logical :: is_core
integer :: j
! Init
is_core = .False.
! Search
do j = 1, dim_list_core_orb
if (list_core(j) == i) then
is_core = .True.
exit
endif
enddo
end
! is_del
function is_del(i)
implicit none
BEGIN_DOC
! True if the orbital i is a deleted orbital
END_DOC
integer, intent(in) :: i
logical :: is_del
integer :: j
! Init
is_del = .False.
! Search
do j = 1, dim_list_del_orb
if (list_del(j) == i) then
is_del = .True.
exit
endif
enddo
end
! N orb
BEGIN_PROVIDER [integer, cc_nO_m]
&BEGIN_PROVIDER [integer, cc_nOa]
&BEGIN_PROVIDER [integer, cc_nOb]
&BEGIN_PROVIDER [integer, cc_nOab]
&BEGIN_PROVIDER [integer, cc_nV_m]
&BEGIN_PROVIDER [integer, cc_nVa]
&BEGIN_PROVIDER [integer, cc_nVb]
&BEGIN_PROVIDER [integer, cc_nVab]
&BEGIN_PROVIDER [integer, cc_n_mo]
&BEGIN_PROVIDER [integer, cc_nO_S, (2)]
&BEGIN_PROVIDER [integer, cc_nV_S, (2)]
implicit none
BEGIN_DOC
! Numbers of occupied and virtual orbitals of the cc_ref determinant in psi_det, excluding core and deleted orbitals
! a: alpha, b: beta
! nO_m: max(a,b) occupied
! nOa: nb a occupied
! nOb: nb b occupied
! nOab: nb a+b occupied
! nV_m: max(a,b) virtual
! nVa: nb a virtual
! nVb: nb b virtual
! nVab: nb a+b virtual
END_DOC
integer :: n_spin(4)
! Extract number of occ/vir alpha/beta spin orbitals
call extract_n_spin(psi_det(1,1,cc_ref),n_spin)
cc_nOa = n_spin(1)
cc_nOb = n_spin(2)
cc_nOab = cc_nOa + cc_nOb !n_spin(1) + n_spin(2)
cc_nO_m = max(cc_nOa,cc_nOb) !max(n_spin(1), n_spin(2))
cc_nVa = n_spin(3)
cc_nVb = n_spin(4)
cc_nVab = cc_nVa + cc_nVb !n_spin(3) + n_spin(4)
cc_nV_m = max(cc_nVa,cc_nVb) !max(n_spin(3), n_spin(4))
cc_n_mo = cc_nVa + cc_nVb !n_spin(1) + n_spin(3)
cc_nO_S = (/cc_nOa,cc_nOb/)
cc_nV_S = (/cc_nVa,cc_nVb/)
END_PROVIDER
! General
BEGIN_PROVIDER [integer, cc_list_gen, (cc_n_mo)]
implicit none
BEGIN_DOC
! List of general orbitals without core and deleted ones
END_DOC
integer :: i,j
logical :: is_core, is_del
j = 1
do i = 1, mo_num
! in core ?
if (is_core(i)) cycle
! in del ?
if (is_del(i)) cycle
cc_list_gen(j) = i
j = j+1
enddo
END_PROVIDER
! Space
BEGIN_PROVIDER [integer, cc_list_occ, (cc_nOa)]
&BEGIN_PROVIDER [integer, cc_list_vir, (cc_nVa)]
implicit none
BEGIN_DOC
! List of occupied and virtual spatial orbitals without core and deleted ones
END_DOC
call extract_list_orb_space(psi_det(1,1,cc_ref),cc_nOa,cc_nVa,cc_list_occ,cc_list_vir)
END_PROVIDER
! Spin
BEGIN_PROVIDER [integer, cc_list_occ_spin, (cc_nO_m,2)]
&BEGIN_PROVIDER [integer, cc_list_vir_spin, (cc_nV_m,2)]
&BEGIN_PROVIDER [logical, cc_ref_is_open_shell]
implicit none
BEGIN_DOC
! List of occupied and virtual spin orbitals without core and deleted ones
END_DOC
integer :: i
call extract_list_orb_spin(psi_det(1,1,cc_ref),cc_nO_m,cc_nV_m,cc_list_occ_spin,cc_list_vir_spin)
cc_ref_is_open_shell = .False.
do i = 1, cc_nO_m
if (cc_list_occ_spin(i,1) /= cc_list_occ_spin(i,2)) then
cc_ref_is_open_shell = .True.
endif
enddo
END_PROVIDER

devel/ccsd_gpu/phase.irp.f (new file, 137 lines)

@ -0,0 +1,137 @@
! phase
subroutine get_phase_general(det1,det2,phase,degree,Nint)
implicit none
integer, intent(in) :: Nint
integer(bit_kind), intent(in) :: det1(Nint,2), det2(Nint,2)
double precision, intent(out) :: phase
integer, intent(out) :: degree
integer :: n(2)
integer, allocatable :: list_anni(:,:), list_crea(:,:)
allocate(list_anni(N_int*bit_kind_size,2))
allocate(list_crea(N_int*bit_kind_size,2))
call get_excitation_general(det1,det2,degree,n,list_anni,list_crea,phase,Nint)
end
! Get excitation general
subroutine get_excitation_general(det1,det2,degree,n,list_anni,list_crea,phase,Nint)
use bitmasks
implicit none
integer, intent(in) :: Nint
integer(bit_kind), intent(in) :: det1(Nint,2), det2(Nint,2)
double precision, intent(out) :: phase
integer, intent(out) :: list_crea(Nint*bit_kind_size,2)
integer, intent(out) :: list_anni(Nint*bit_kind_size,2)
integer, intent(out) :: degree, n(2)
integer, allocatable :: l1(:,:), l2(:,:)
integer(bit_kind), allocatable :: det_crea(:,:), det_anni(:,:)
integer, allocatable :: pos_anni(:,:), pos_crea(:,:)
integer :: n1(2),n2(2),n_crea(2),n_anni(2),i,j,k,d
allocate(l1(Nint*bit_kind_size,2))
allocate(l2(Nint*bit_kind_size,2))
allocate(det_crea(Nint,2),det_anni(Nint,2))
! 1 111010
! 2 110101
!
!not 1-> 000101
! 2 110101
!and 000101 -> crea
!
! 1 111010
!not 2-> 001010
! 001010 -> anni
do j = 1, 2
do i = 1, Nint
det_crea(i,j) = iand(not(det1(i,j)),det2(i,j))
enddo
enddo
do j = 1, 2
do i = 1, Nint
det_anni(i,j) = iand(det1(i,j),not(det2(i,j)))
enddo
enddo
call bitstring_to_list_ab(det1,l1,n1,Nint)
call bitstring_to_list_ab(det2,l2,n2,Nint)
call bitstring_to_list_ab(det_crea,list_crea,n_crea,Nint)
call bitstring_to_list_ab(det_anni,list_anni,n_anni,Nint)
do i = 1, 2
if (n_crea(i) /= n_anni(i)) then
print*,'Error in get_excitation_general: the numbers of created and annihilated orbitals differ.'
call abort
endif
enddo
!1 11110011001 1 2 3 4 7 8 11
!pos 1 2 3 4 5 6 7
!2 11100101011 1 2 3 6 8 10 11
!anni 00010010000 4 7
!pos 4 5
!crea 00000100010 6 10
!pos 4 6
!4 -> 6 pos(4 -> 4)
!7 -> 10 pos(5 -> 6)
n = n_anni
degree = n_anni(1) + n_anni(2)
allocate(pos_anni(max(n(1),n(2)),2))
allocate(pos_crea(max(n(1),n(2)),2))
! Search pos anni
do j = 1, 2
k = 1
do i = 1, n1(j)
if (k > n_anni(j)) exit
if (l1(i,j) /= list_anni(k,j)) cycle
pos_anni(k,j) = i
k = k + 1
enddo
enddo
! Search pos crea
do j = 1, 2
k = 1
do i = 1, n2(j)
if (k > n_crea(j)) exit
if (l2(i,j) /= list_crea(k,j)) cycle
pos_crea(k,j) = i
k = k + 1
enddo
enddo
! Distance between the ith anni and the ith crea op
! By doing so there is no crossing between the different pairs of anni/crea
! and the phase is determined by the sum of the distances
! -> (-1)^{sum of the distances}
d = 0
do j = 1, 2
do i = 1, n(j)
d = d + abs(pos_anni(i,j) - pos_crea(i,j))
enddo
enddo
phase = dble((-1)**d)
! Debug
!print*,l2(1:n2(1),1)
!print*,l2(1:n2(2),2)
!!call print_det(det1,Nint)
!!call print_det(det2,Nint)
!print*,phase
!print*,''
end
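Written out, the phase returned above is determined by the total displacement between paired annihilation and creation positions (pos_anni and pos_crea in the code):

  d = \sum_{\sigma=\alpha,\beta} \; \sum_{i=1}^{n_\sigma} \bigl| \mathrm{pos\_anni}(i,\sigma) - \mathrm{pos\_crea}(i,\sigma) \bigr|, \qquad
  \mathrm{phase} = (-1)^{d}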


@ -0,0 +1,73 @@
! T1
subroutine update_t1(nO,nV,f_o,f_v,r1,t1)
implicit none
BEGIN_DOC
! Update the T1 amplitudes for CC
END_DOC
! in
integer, intent(in) :: nO, nV
double precision, intent(in) :: f_o(nO), f_v(nV), r1(nO, nV)
! inout
double precision, intent(inout) :: t1(nO, nV)
! internal
integer :: i,a
!$OMP PARALLEL &
!$OMP SHARED(nO,nV,t1,r1,cc_level_shift,f_o,f_v) &
!$OMP PRIVATE(i,a) &
!$OMP DEFAULT(NONE)
!$OMP DO
do a = 1, nV
do i = 1, nO
t1(i,a) = t1(i,a) - r1(i,a) / (f_o(i) - f_v(a) - cc_level_shift)
enddo
enddo
!$OMP END DO
!$OMP END PARALLEL
end
! T2
subroutine update_t2(nO,nV,f_o,f_v,r2,t2)
implicit none
BEGIN_DOC
! Update the T2 amplitudes for CC
END_DOC
! in
integer, intent(in) :: nO, nV
double precision, intent(in) :: f_o(nO), f_v(nV), r2(nO, nO, nV, nV)
! inout
double precision, intent(inout) :: t2(nO, nO, nV, nV)
! internal
integer :: i,j,a,b
!$OMP PARALLEL &
!$OMP SHARED(nO,nV,t2,r2,cc_level_shift,f_o,f_v) &
!$OMP PRIVATE(i,j,a,b) &
!$OMP DEFAULT(NONE)
!$OMP DO
do b = 1, nV
do a = 1, nV
do j = 1, nO
do i = 1, nO
t2(i,j,a,b) = t2(i,j,a,b) - r2(i,j,a,b) / (f_o(i) + f_o(j) - f_v(a) - f_v(b) - cc_level_shift)
enddo
enddo
enddo
enddo
!$OMP END DO
!$OMP END PARALLEL
end
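Both routines apply the same level-shifted quasi-Newton step, dividing each residual element by its orbital-energy denominator (with eps_i = f_o(i), eps_a = f_v(a) and s = cc_level_shift):

  t_i^{a} \leftarrow t_i^{a} - \frac{r_i^{a}}{\varepsilon_i - \varepsilon_a - s}, \qquad
  t_{ij}^{ab} \leftarrow t_{ij}^{ab} - \frac{r_{ij}^{ab}}{\varepsilon_i + \varepsilon_j - \varepsilon_a - \varepsilon_b - s}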