Refactoring

Merge turpan:~/qp2/plugins/qp_plugins_scemama
r1 on GPU
2024-12-22 12:23:37 +01:00 · 2023-08-04 16:54:48 +02:00 · 2023-08-04 16:43:42 +02:00 · 2023-08-04 16:42:46 +02:00 · 2023-08-04 16:19:41 +02:00 · 2023-08-04 14:48:08 +02:00
14 changed files with 2006 additions and 2359 deletions
--- a/devel/ccsd_gpu/EZFIO.cfg
+++ b/devel/ccsd_gpu/EZFIO.cfg
@ -1,3 +1,75 @@
 [cc_thresh_conv]
 type: double precision
 doc: Threshold for the convergence of the residual equations.
 interface: ezfio,ocaml,provider
 default: 1e-6
 [cc_max_iter]
 type: integer
 doc: Maximum number of iterations.
 interface: ezfio,ocaml,provider
 default: 100
 [cc_diis_depth]
 type: integer
 doc: Maximum depth of the DIIS, i.e., the maximum number of iterations that the DIIS keeps in memory. Warning: the DIIS matrices are allocated once at the beginning with this depth and are never resized, so the calculation will crash if there is not enough memory.
 interface: ezfio,ocaml,provider
 default: 8
 [cc_level_shift]
 type: double precision
 doc: Level shift for the CC
 interface: ezfio,ocaml,provider
 default: 0.0
 [cc_level_shift_guess]
 type: double precision
 doc: Level shift for the guess of the CC amplitudes
 interface: ezfio,ocaml,provider
 default: 0.0
 [cc_update_method]
 type: character*(32)
 doc: Method used to update the CC amplitudes. none -> normal, diis -> with diis.
 interface: ezfio,ocaml,provider
 default: diis
 [cc_guess_t1]
 type: character*(32)
 doc: Guess used to initialize the T1 amplitudes. none -> 0, MP -> perturbation theory, read -> read from disk.
 interface: ezfio,ocaml,provider
 default: MP
 [cc_guess_t2]
 type: character*(32)
 doc: Guess used to initialize the T2 amplitudes. none -> 0, MP -> perturbation theory, read -> read from disk.
 interface: ezfio,ocaml,provider
 default: MP
 [io_amplitudes]
 type: Disk_access
 doc: Read/Write |CCSD| amplitudes from/to disk [ Write | Read | None ]
 interface: ezfio,provider,ocaml
 default: None
 [cc_par_t]
 type: logical
 doc: If true, the CCSD(T) will be computed.
 interface: ezfio,ocaml,provider
 default: False
 [cc_dev]
 type: logical
 doc: Only for dev purposes.
 interface: ezfio,ocaml,provider
 default: False
 [cc_ref]
 type: integer
 doc: Index of the reference determinant in psi_det for CC calculation.
 interface: ezfio,ocaml,provider
 default: 1
 [energy]
 type: double precision
 doc: CCSD energy
--- a/devel/ccsd_gpu/NEED
+++ b/devel/ccsd_gpu/NEED
@ -1,2 +1,2 @@
 hartree_fock
-utils_cc_gpu
+determinants
--- a/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f
+++ b/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f
--- a/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f
+++ b/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f
@ -80,11 +80,12 @@ end
 ! R1
-subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
+subroutine compute_r1_space_chol(gpu_data, nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
-
+  use gpu_module
  implicit none
  ! in
  type(c_ptr), intent(in) :: gpu_data
  integer, intent(in)           :: nO, nV
  double precision, intent(in)  :: t1(nO,nV), t2(nO,nO,nV,nV), tau(nO,nO,nV,nV)
  double precision, intent(in)  :: H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO)
@ -95,177 +96,40 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
  ! internal
  integer                       :: u,i,j,beta,a,b
  !$omp parallel &
  !$omp shared(nO,nV,r1,cc_space_f_ov) &
  !$omp private(u,beta) &
  !$omp default(none)
  !$omp do
  do beta = 1, nV
    do u = 1, nO
      r1(u,beta) = cc_space_f_ov(u,beta)
    enddo
  enddo
  !$omp end do
  !$omp end parallel
-  double precision, allocatable :: X_oo(:,:)
+  call compute_r1_space_chol_gpu(gpu_data, nO, nV, t1, r1, max_r1)
  allocate(X_oo(nO,nO))
  call dgemm('N','N', nO, nO, nV, &
             -2d0, t1    , size(t1,1), &
                   cc_space_f_vo, size(cc_space_f_vo,1), &
              0d0, X_oo  , size(X_oo,1))
  call dgemm('T','N', nO, nV, nO, &
             1d0, X_oo, size(X_oo,2), &
                  t1  , size(t1,1), &
             1d0, r1  , size(r1,1))
  deallocate(X_oo)
  call dgemm('N','N', nO, nV, nV, &
             1d0, t1  , size(t1,1), &
                  H_vv, size(H_vv,1), &
             1d0, r1  , size(r1,1))
  call dgemm('N','N', nO, nV, nO, &
             -1d0, H_oo, size(H_oo,1), &
                   t1  , size(t1,1), &
              1d0, r1, size(r1,1))
  double precision, allocatable :: X_voov(:,:,:,:)
  allocate(X_voov(nV, nO, nO, nV))
  !$omp parallel &
  !$omp shared(nO,nV,X_voov,t2,t1) &
  !$omp private(u,beta,i,a) &
  !$omp default(none)
  !$omp do
  do beta = 1, nV
    do u = 1, nO
      do i = 1, nO
        do a = 1, nV
          X_voov(a,i,u,beta) = 2d0 * t2(i,u,a,beta) - t2(u,i,a,beta) + t1(u,a) * t1(i,beta)
        enddo
      enddo
    enddo
  enddo
  !$omp end do
  !$omp end parallel
  call dgemv('T', nV*nO, nO*nV, &
             1d0, X_voov, size(X_voov,1) * size(X_voov,2), &
                  H_vo  , 1, &
             1d0, r1    , 1)
  deallocate(X_voov)
  double precision, allocatable :: X_ovov(:,:,:,:)
  allocate(X_ovov(nO, nV, nO, nV))
  !$omp parallel &
  !$omp shared(nO,nV,cc_space_v_ovov,cc_space_v_voov,X_ovov) &
  !$omp private(u,beta,i,a) &
  !$omp default(none)
  !$omp do
  do beta = 1, nV
    do u = 1, nO
      do a = 1, nv
        do i = 1, nO
          X_ovov(i,a,u,beta) = 2d0 * cc_space_v_voov(a,u,i,beta) - cc_space_v_ovov(u,a,i,beta)
        enddo
      enddo
    enddo
  enddo
  !$omp end do
  !$omp end parallel
  call dgemv('T', nO*nV, nO*nV, &
             1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), &
                  t1     , 1, &
             1d0, r1     , 1)
  deallocate(X_ovov)
  integer :: iblock, block_size, nVmax
  double precision, allocatable :: W_vvov(:,:,:,:), W_vvov_tmp(:,:,:,:), T_vvoo(:,:,:,:)
  block_size = 16
  allocate(W_vvov(nV,nV,nO,block_size), W_vvov_tmp(nV,nO,nV,block_size), T_vvoo(nV,nV,nO,nO))
  !$omp parallel &
  !$omp private(u,i,b,a) &
  !$omp default(shared)
  !$omp do
  do u = 1, nO
    do i = 1, nO
      do b = 1, nV
        do a = 1, nV
          T_vvoo(a,b,i,u) = tau(i,u,a,b)
        enddo
      enddo
    enddo
  enddo
  !$omp end do
  !$omp end parallel
  do iblock = 1, nV, block_size
    nVmax = min(block_size,nV-iblock+1)
    call dgemm('T','N', nV*nO, nV*nVmax, cholesky_mo_num, 1.d0, &
      cc_space_v_vo_chol            , cholesky_mo_num, &
      cc_space_v_vv_chol(1,1,iblock), cholesky_mo_num, &
      0.d0, W_vvov_tmp, nV*nO)
    !$omp parallel &
    !$omp private(b,i,a,beta) &
    !$omp default(shared)
    do beta = 1,  nVmax
      do i = 1, nO
        !$omp do
        do b = 1, nV
          do a = 1, nV
            W_vvov(a,b,i,beta) = 2d0 * W_vvov_tmp(a,i,b,beta) - W_vvov_tmp(b,i,a,beta)
          enddo
        enddo
        !$omp end do nowait
      enddo
    enddo
    !$omp barrier
    !$omp end parallel
    call dgemm('T','N',nO,nVmax,nO*nV*nV, &
             1d0, T_vvoo, nV*nV*nO, &
                  W_vvov, nO*nV*nV, &
             1d0, r1(1,iblock), nO)
  enddo
  deallocate(W_vvov,T_vvoo)
  double precision, allocatable :: W_oovo(:,:,:,:)
  allocate(W_oovo(nO,nO,nV,nO))
-  !$omp parallel &
+!  !$omp parallel &
-  !$omp shared(nO,nV,cc_space_v_vooo,W_oovo) &
+!  !$omp shared(nO,nV,cc_space_v_oovo,W_oovo) &
-  !$omp private(u,a,i,j) &
+!  !$omp private(u,a,i,j) &
-  !$omp default(none)
+!  !$omp default(none)
-  do u = 1, nO
+!  do u = 1, nO
-    !$omp do
+!    !$omp do
-    do a = 1, nV
+!    do a = 1, nV
-      do j = 1, nO
+!      do j = 1, nO
-        do i = 1, nO
+!        do i = 1, nO
-          W_oovo(i,j,a,u) = 2d0 * cc_space_v_vooo(a,u,i,j) - cc_space_v_vooo(a,u,j,i)
+!          W_oovo(i,j,a,u) = 2d0 * cc_space_v_oovo(i,j,a,u) - cc_space_v_oovo(j,i,a,u)
-        enddo
+!        enddo
-      enddo
+!      enddo
-    enddo
+!    enddo
-    !$omp end do nowait
+!    !$omp end do nowait
-  enddo
+!  enddo
-  !$omp end parallel
+!  !$omp end parallel
-  call dgemm('T','N', nO, nV, nO*nO*nV, &
+!  call dgemm('T','N', nO, nV, nO*nO*nV, &
-             -1d0, W_oovo, size(W_oovo,1) * size(W_oovo,2) * size(W_oovo,3), &
+!             -1d0, W_oovo, nO * nO * nV, &
-                   tau   , size(tau,1) * size(tau,2) * size(tau,3), &
+!                   tau   , nO * nO * nV, &
-              1d0, r1    , size(r1,1))
+!              1d0, r1    , nO)
-
+!
-  deallocate(W_oovo)
+!  deallocate(W_oovo)
  max_r1 = 0d0
  do a = 1, nV
--- a/devel/ccsd_gpu/diis.irp.f
+++ b/devel/ccsd_gpu/diis.irp.f
@ -0,0 +1,529 @@
 ! Code
 subroutine diis_cc(all_err,all_t,sze,m,iter,t)
  implicit none
  BEGIN_DOC
  ! DIIS extrapolation of the amplitudes.
  !
  ! Only the n_kept = min(m,iter) most recent vectors are used; the i-th
  ! most recent one is read from column m+1-i of all_err / all_t.
  !
  ! 1. B(i,j) = <err_i|err_j>, bordered by a last row and a last column
  !    of -1 (the bottom-right corner stays 0).
  ! 2. Solve B.x = (0,...,0,-1) with dgesv; the first n_kept components
  !    of x are the DIIS coefficients c.
  ! 3. t = sum_i c(i) * (t_i + err_i)
  !
  ! Aborts if dgesv returns a non-zero info.
  END_DOC
  integer, intent(in)             :: m,iter,sze
  double precision, intent(in)    :: all_err(sze,m)
  double precision, intent(in)    :: all_t(sze,m)
  double precision, intent(out)   :: t(sze)
  integer                         :: n_kept, n_lin
  integer                         :: p,q,k
  integer                         :: info
  integer, allocatable            :: ipiv(:)
  double precision                :: partial_dot
  double precision, allocatable   :: B(:,:), c(:), rhs_vec(:)
  n_kept = min(m,iter)
  n_lin  = n_kept + 1
  allocate(B(n_lin,n_lin), c(n_kept), rhs_vec(n_lin))
  allocate(ipiv(m+1))
  ! Overlap of the error vectors. Every thread sums its own share of the
  ! k loop in partial_dot, then adds it to B(p,q) inside a critical section.
  B = 0d0
  !$OMP PARALLEL &
  !$OMP SHARED(B,m,n_kept,sze,all_err) &
  !$OMP PRIVATE(p,q,k,partial_dot) &
  !$OMP DEFAULT(NONE)
  do q = 1, n_kept
    do p = 1, n_kept
      partial_dot = 0d0
      !$OMP DO
      do k = 1, sze
        partial_dot = partial_dot + all_err(k,m+1-p) * all_err(k,m+1-q)
      enddo
      !$OMP END DO NOWAIT
      !$OMP CRITICAL
      B(p,q) = B(p,q) + partial_dot
      !$OMP END CRITICAL
    enddo
  enddo
  !$OMP END PARALLEL
  ! Border of -1 on the last column and the last row
  B(1:n_kept,n_lin) = -1d0
  B(n_lin,1:n_kept) = -1d0
  ! Right-hand side (0 0 .... 0 -1)
  rhs_vec = 0d0
  rhs_vec(n_lin) = -1d0
  ! Solve B.x = rhs_vec, the solution overwrites rhs_vec
  call dgesv(n_lin, 1, B, size(B,1), ipiv, rhs_vec, size(rhs_vec,1), info)
  if (info /= 0) then
    print*,'DIIS error in dgesv:', info
    call abort
  endif
  ! The DIIS coefficients are the n_kept first components of the solution
  c = rhs_vec(1:n_kept)
  ! Extrapolated amplitudes
  !$OMP PARALLEL &
  !$OMP SHARED(t,c,m,all_err,all_t,sze,n_kept) &
  !$OMP PRIVATE(p,k) &
  !$OMP DEFAULT(NONE)
  !$OMP DO
  do k = 1, sze
    t(k) = 0d0
  enddo
  !$OMP END DO
  do p = 1, n_kept
    !$OMP DO
    do k = 1, sze
      t(k) = t(k) + c(p) * (all_t(k,m+1-p) + all_err(k,m+1-p))
    enddo
    !$OMP END DO
  enddo
  !$OMP END PARALLEL
  deallocate(ipiv,B,c,rhs_vec)
 end
 ! Update all err
 subroutine update_all_err(err,all_err,sze,m,iter)
  implicit none
  BEGIN_DOC
  ! Push a new error vector into the DIIS history.
  ! Columns 2..m of all_err are moved to columns 1..m-1 (the former
  ! column 1 is dropped) and err is stored in column m.
  ! iter is not used by this routine.
  END_DOC
  integer, intent(in)             :: m, iter, sze
  double precision, intent(in)    :: err(sze)
  double precision, intent(inout) :: all_err(sze,m)
  integer                         :: icol, k
  !$OMP PARALLEL &
  !$OMP SHARED(m,all_err,err,sze) &
  !$OMP PRIVATE(icol,k) &
  !$OMP DEFAULT(NONE)
  ! Shift the history by one column toward the first one. The implicit
  ! barrier of each END DO orders the successive column copies.
  do icol = 2, m
    !$OMP DO
    do k = 1, sze
      all_err(k,icol-1) = all_err(k,icol)
    enddo
    !$OMP END DO
  enddo
  ! Newest vector in the last column
  !$OMP DO
  do k = 1, sze
    all_err(k,m) = err(k)
  enddo
  !$OMP END DO
  !$OMP END PARALLEL
 end
 ! Update all t
 subroutine update_all_t(t,all_t,sze,m,iter)
  implicit none
  BEGIN_DOC
  ! Push a new amplitude vector into the DIIS history.
  ! Columns 2..m of all_t are moved to columns 1..m-1 (the former
  ! column 1 is dropped) and t is stored in column m.
  ! iter is not used by this routine.
  END_DOC
  integer, intent(in)             :: m, iter, sze
  double precision, intent(in)    :: t(sze)
  double precision, intent(inout) :: all_t(sze,m)
  integer                         :: icol, k
  !$OMP PARALLEL &
  !$OMP SHARED(m,all_t,t,sze) &
  !$OMP PRIVATE(icol,k) &
  !$OMP DEFAULT(NONE)
  ! Shift the history by one column toward the first one. The implicit
  ! barrier of each END DO orders the successive column copies.
  do icol = 2, m
    !$OMP DO
    do k = 1, sze
      all_t(k,icol-1) = all_t(k,icol)
    enddo
    !$OMP END DO
  enddo
  ! Newest vector in the last column
  !$OMP DO
  do k = 1, sze
    all_t(k,m) = t(k)
  enddo
  !$OMP END DO
  !$OMP END PARALLEL
 end
 ! Err1
 subroutine compute_err1(nO,nV,f_o,f_v,r1,err1)
  implicit none
  BEGIN_DOC
  ! Error vector for the t1 amplitudes:
  ! err1(i,a) = - r1(i,a) / (f_o(i) - f_v(a) - cc_level_shift)
  END_DOC
  integer, intent(in)           :: nO, nV
  double precision, intent(in)  :: f_o(nO), f_v(nV), r1(nO,nV)
  double precision, intent(out) :: err1(nO,nV)
  integer                       :: i,a
  double precision              :: e_denom
  !$OMP PARALLEL DO &
  !$OMP SHARED(err1,r1,f_o,f_v,nO,nV,cc_level_shift) &
  !$OMP PRIVATE(i,a,e_denom) &
  !$OMP DEFAULT(NONE)
  do a = 1, nV
    do i = 1, nO
      e_denom   = f_o(i) - f_v(a) - cc_level_shift
      err1(i,a) = - r1(i,a) / e_denom
    enddo
  enddo
  !$OMP END PARALLEL DO
 end
 ! Err2
 subroutine compute_err2(nO,nV,f_o,f_v,r2,err2)
  implicit none
  BEGIN_DOC
  ! Error vector for the t2 amplitudes:
  ! err2(i,j,a,b) = - r2(i,j,a,b) / (f_o(i) + f_o(j) - f_v(a) - f_v(b) - cc_level_shift)
  END_DOC
  integer, intent(in)           :: nO, nV
  double precision, intent(in)  :: f_o(nO), f_v(nV), r2(nO,nO,nV,nV)
  double precision, intent(out) :: err2(nO,nO,nV,nV)
  integer                       :: i,j,a,b
  double precision              :: e_denom
  !$OMP PARALLEL DO collapse(3) &
  !$OMP SHARED(err2,r2,f_o,f_v,nO,nV,cc_level_shift) &
  !$OMP PRIVATE(i,j,a,b,e_denom) &
  !$OMP DEFAULT(NONE)
  do b = 1, nV
    do a = 1, nV
      do j = 1, nO
        do i = 1, nO
          e_denom       = f_o(i) + f_o(j) - f_v(a) - f_v(b) - cc_level_shift
          err2(i,j,a,b) = - r2(i,j,a,b) / e_denom
        enddo
      enddo
    enddo
  enddo
  !$OMP END PARALLEL DO
 end
 ! Update t
 subroutine update_t_ccsd(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2)
  implicit none
  BEGIN_DOC
  ! Update the CCSD amplitudes t1 and t2 from the residuals r1 and r2.
  !
  ! cc_update_method == 'diis': t1 gets the standard update (update_t1) and
  !   t2 is extrapolated by DIIS, using and updating the history stored in
  !   all_err2 / all_t2.
  ! cc_update_method == 'none': t1 and t2 both get the standard update.
  ! Any other value is a fatal error: the amplitudes would otherwise be
  !   left unchanged without any update.
  !
  ! all_err1 and all_t1 are kept in the interface but are not touched,
  ! because the DIIS on t1 is disabled (see below).
  END_DOC
  integer, intent(in)             :: nO,nV,nb_iter
  double precision, intent(in)    :: f_o(nO), f_v(nV)
  double precision, intent(in)    :: r1(nO,nV), r2(nO,nO,nV,nV)
  double precision, intent(inout) :: t1(nO,nV), t2(nO,nO,nV,nV)
  double precision, intent(inout) :: all_err1(nO*nV, cc_diis_depth), all_err2(nO*nO*nV*nV, cc_diis_depth)
  double precision, intent(inout) :: all_t1(nO*nV, cc_diis_depth), all_t2(nO*nO*nV*nV, cc_diis_depth)
  double precision, allocatable   :: err2(:,:,:,:)
  double precision, allocatable   :: tmp_err2(:), tmp_t2(:)
  if (cc_update_method == 'diis') then
    allocate(err2(nO,nO,nV,nV))
    allocate(tmp_err2(nO*nO*nV*nV), tmp_t2(nO*nO*nV*nV))
    ! DIIS T1, it is not always good since the t1 can be small
    ! That's why there is a call to update the t1 in the standard way.
    ! Disabled T1 path, kept for reference (it needs err1, tmp_err1 and
    ! tmp_t1 work arrays to be declared and allocated again):
    !call compute_err1(nO,nV,f_o,f_v,r1,err1)
    !tmp_err1 = reshape(err1,(/nO*nV/))
    !tmp_t1   = reshape(t1  ,(/nO*nV/))
    !call update_all_err(tmp_err1,all_err1,nO*nV,cc_diis_depth,nb_iter+1)
    !call update_all_t  (tmp_t1  ,all_t1  ,nO*nV,cc_diis_depth,nb_iter+1)
    !call diis_cc(all_err1,all_t1,nO*nV,cc_diis_depth,nb_iter+1,tmp_t1)
    !t1 = reshape(tmp_t1  ,(/nO,nV/))
    call update_t1(nO,nV,f_o,f_v,r1,t1)
    ! DIIS T2
    ! T2 error tensor
    call compute_err2(nO,nV,f_o,f_v,r2,err2)
    ! Flatten the errors and the amplitudes into vectors
    tmp_err2 = reshape(err2,(/nO*nO*nV*nV/))
    tmp_t2   = reshape(t2  ,(/nO*nO*nV*nV/))
    ! Add the error and parameter vectors to those of the previous iterations
    call update_all_err(tmp_err2,all_err2,nO*nO*nV*nV,cc_diis_depth,nb_iter+1)
    call update_all_t  (tmp_t2  ,all_t2  ,nO*nO*nV*nV,cc_diis_depth,nb_iter+1)
    ! DIIS and reshape T as a tensor
    call diis_cc(all_err2,all_t2,nO*nO*nV*nV,cc_diis_depth,nb_iter+1,tmp_t2)
    t2 = reshape(tmp_t2  ,(/nO,nO,nV,nV/))
    deallocate(tmp_t2,tmp_err2,err2)
  ! Standard update as T = T - Delta
  elseif (cc_update_method == 'none') then
    call update_t1(nO,nV,f_o,f_v,r1,t1)
    call update_t2(nO,nV,f_o,f_v,r2,t2)
  else
    ! Returning here would leave t1 and t2 untouched at every iteration
    print*,'Unknown cc_update_method: '//cc_update_method
    call abort
  endif
 end
 ! Update t v2
 subroutine update_t_ccsd_diis(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2)
  implicit none
  BEGIN_DOC
  ! DIIS update of t1 and t2 together.
  !
  ! The t1 and t2 histories (all_err1/all_t1 and all_err2/all_t2) are first
  ! updated separately with the new errors and amplitudes, then gathered
  ! column by column into single vectors (t1 part first, t2 part after),
  ! on which one DIIS extrapolation is done. The result is split back
  ! into t1 and t2.
  END_DOC
  integer, intent(in)             :: nO,nV,nb_iter
  double precision, intent(in)    :: f_o(nO), f_v(nV)
  double precision, intent(in)    :: r1(nO,nV), r2(nO,nO,nV,nV)
  double precision, intent(inout) :: t1(nO,nV), t2(nO,nO,nV,nV)
  double precision, intent(inout) :: all_err1(nO*nV, cc_diis_depth), all_err2(nO*nO*nV*nV, cc_diis_depth)
  double precision, intent(inout) :: all_t1(nO*nV, cc_diis_depth), all_t2(nO*nO*nV*nV, cc_diis_depth)
  double precision, allocatable   :: all_t(:,:), all_err(:,:), tmp_t(:)
  double precision, allocatable   :: err1(:,:), err2(:,:,:,:)
  double precision, allocatable   :: tmp_err1(:), tmp_err2(:)
  double precision, allocatable   :: tmp_t1(:), tmp_t2(:)
  integer                         :: k, icol
  integer                         :: sze1, sze2
  ! Lengths of the flattened t1 and t2
  sze1 = nO*nV
  sze2 = nO*nO*nV*nV
  allocate(all_err(sze1+sze2,cc_diis_depth), all_t(sze1+sze2,cc_diis_depth))
  allocate(tmp_t(sze1+sze2))
  allocate(err1(nO,nV), err2(nO,nO,nV,nV))
  allocate(tmp_err1(sze1), tmp_err2(sze2))
  allocate(tmp_t1(sze1), tmp_t2(sze2))
  ! New errors and current amplitudes, flattened
  call compute_err1(nO,nV,f_o,f_v,r1,err1)
  call compute_err2(nO,nV,f_o,f_v,r2,err2)
  tmp_err1 = reshape(err1,(/sze1/))
  tmp_err2 = reshape(err2,(/sze2/))
  tmp_t1   = reshape(t1  ,(/sze1/))
  tmp_t2   = reshape(t2  ,(/sze2/))
  ! Push them into the separate t1 and t2 histories
  call update_all_err(tmp_err1,all_err1,sze1,cc_diis_depth,nb_iter+1)
  call update_all_t  (tmp_t1  ,all_t1  ,sze1,cc_diis_depth,nb_iter+1)
  call update_all_err(tmp_err2,all_err2,sze2,cc_diis_depth,nb_iter+1)
  call update_all_t  (tmp_t2  ,all_t2  ,sze2,cc_diis_depth,nb_iter+1)
  ! Gather the histories: rows 1..sze1 <- t1 part, rows sze1+1.. <- t2 part.
  ! All the copies are independent, hence the NOWAIT.
  !$OMP PARALLEL &
  !$OMP SHARED(sze1,sze2,all_err,all_err1,all_err2,cc_diis_depth,&
  !$OMP all_t,all_t1,all_t2) &
  !$OMP PRIVATE(k,icol) &
  !$OMP DEFAULT(NONE)
  do icol = 1, cc_diis_depth
    !$OMP DO
    do k = 1, sze1
      all_err(k,icol) = all_err1(k,icol)
      all_t  (k,icol) = all_t1  (k,icol)
    enddo
    !$OMP END DO NOWAIT
    !$OMP DO
    do k = 1, sze2
      all_err(sze1+k,icol) = all_err2(k,icol)
      all_t  (sze1+k,icol) = all_t2  (k,icol)
    enddo
    !$OMP END DO NOWAIT
  enddo
  !$OMP END PARALLEL
  ! Diis
  call diis_cc(all_err,all_t,sze1+sze2,cc_diis_depth,nb_iter+1,tmp_t)
  ! Split the extrapolated vector
  !$OMP PARALLEL &
  !$OMP SHARED(sze1,sze2,tmp_t,tmp_t1,tmp_t2) &
  !$OMP PRIVATE(k) &
  !$OMP DEFAULT(NONE)
  !$OMP DO
  do k = 1, sze1
    tmp_t1(k) = tmp_t(k)
  enddo
  !$OMP END DO NOWAIT
  !$OMP DO
  do k = 1, sze2
    tmp_t2(k) = tmp_t(sze1+k)
  enddo
  !$OMP END DO
  !$OMP END PARALLEL
  ! Back to tensors
  t1 = reshape(tmp_t1 ,(/nO,nV/))
  t2 = reshape(tmp_t2 ,(/nO,nO,nV,nV/))
  deallocate(tmp_t,tmp_t1,tmp_t2,tmp_err1,tmp_err2,err1,err2,all_t,all_err)
 end
 ! Update t v3
 subroutine update_t_ccsd_diis_v3(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err,all_t)
  implicit none
  BEGIN_DOC
  ! DIIS update of t1 and t2 handled as one vector (t1 first, then t2).
  !
  ! all_err and all_t hold the DIIS history of the concatenated vectors
  ! and are updated in place. t1 and t2 are given as flat arrays and are
  ! overwritten with the extrapolated amplitudes.
  END_DOC
  integer, intent(in)             :: nO,nV,nb_iter
  double precision, intent(in)    :: f_o(nO), f_v(nV)
  double precision, intent(in)    :: r1(nO,nV), r2(nO,nO,nV,nV)
  double precision, intent(inout) :: t1(nO*nV), t2(nO*nO*nV*nV)
  double precision, intent(inout) :: all_err(nO*nV+nO*nO*nV*nV, cc_diis_depth)
  double precision, intent(inout) :: all_t(nO*nV+nO*nO*nV*nV, cc_diis_depth)
  double precision, allocatable   :: work(:)
  integer                         :: k
  integer                         :: sze1, sze2, sze_tot
  sze1    = nO*nV
  sze2    = nO*nO*nV*nV
  sze_tot = sze1 + sze2
  ! Single work vector, used in turn for the errors, the current
  ! amplitudes and the extrapolated amplitudes
  allocate(work(sze_tot))
  ! Errors: t1 part then t2 part
  call compute_err1(nO,nV,f_o,f_v,r1,work(1:sze1))
  call compute_err2(nO,nV,f_o,f_v,r2,work(sze1+1:sze_tot))
  call update_all_err(work,all_err,sze_tot,cc_diis_depth,nb_iter+1)
  ! Current amplitudes, concatenated
  !$OMP PARALLEL &
  !$OMP SHARED(sze1,sze2,work,t1,t2) &
  !$OMP PRIVATE(k) &
  !$OMP DEFAULT(NONE)
  !$OMP DO
  do k = 1, sze1
    work(k) = t1(k)
  enddo
  !$OMP END DO NOWAIT
  !$OMP DO
  do k = 1, sze2
    work(sze1+k) = t2(k)
  enddo
  !$OMP END DO
  !$OMP END PARALLEL
  call update_all_t(work,all_t,sze_tot,cc_diis_depth,nb_iter+1)
  ! Diis
  call diis_cc(all_err,all_t,sze_tot,cc_diis_depth,nb_iter+1,work)
  ! Split the extrapolated vector back into t1 and t2
  !$OMP PARALLEL &
  !$OMP SHARED(sze1,sze2,work,t1,t2) &
  !$OMP PRIVATE(k) &
  !$OMP DEFAULT(NONE)
  !$OMP DO
  do k = 1, sze1
    t1(k) = work(k)
  enddo
  !$OMP END DO NOWAIT
  !$OMP DO
  do k = 1, sze2
    t2(k) = work(sze1+k)
  enddo
  !$OMP END DO
  !$OMP END PARALLEL
  deallocate(work)
 end
--- a/devel/ccsd_gpu/energy.irp.f
+++ b/devel/ccsd_gpu/energy.irp.f
@ -0,0 +1,12 @@
 subroutine det_energy(det,energy)
  implicit none
  BEGIN_DOC
  ! Energy of a determinant: diag_H_mat_elem(det,N_int) + nuclear_repulsion
  END_DOC
  ! det is forwarded with N_int to diag_H_mat_elem, so it is declared as a
  ! determinant array rather than as a scalar.
  ! NOTE(review): shape (N_int,2) assumed from the usual determinant
  ! storage -- confirm against the declaration in diag_H_mat_elem.
  integer(bit_kind), intent(in) :: det(N_int,2)
  double precision, intent(out) :: energy
  double precision, external :: diag_H_mat_elem
  energy = diag_H_mat_elem(det,N_int) + nuclear_repulsion
 end
--- a/devel/ccsd_gpu/gpu.c
+++ b/devel/ccsd_gpu/gpu.c
@ -43,6 +43,292 @@ void gpu_upload(gpu_data* data,
 }
 /*
  * Compute H_oo on GPU `igpu` (taken modulo the number of GPUs) and copy
  * the result into data[i].H_oo of every other GPU.
  *
  * On the selected device:
  *   - for each u < nO, nO*nV*nV elements of tau_x are gathered with stride
  *     nO starting at offset u, each nO x nV block is transposed, and the
  *     result is contracted with cc_space_v_ov_chol into slab u of tau_kau;
  *   - H_oo = cc_space_f_oo + tau_kau^T . cc_space_v_vo_chol.
  * NOTE(review): the index meaning of tau_x (presumably (nO,nO,nV,nV),
  * column-major) is not visible here -- confirm against gpu_upload.
  */
 void compute_h_oo_chol_gpu(gpu_data* data, int igpu)
 {
    int ngpus = 1;
    if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
    igpu = igpu % ngpus;
    const int cholesky_mo_num = data[igpu].cholesky_mo_num;
    const int nO = data[igpu].nO;
    const int nV = data[igpu].nV;
    cudaSetDevice(igpu);
    int m,n,k, lda, ldb, ldc;
    double alpha, beta;
    double* A;
    double* B;
    double* C;
    /* One stream per virtual index b, used for the block transpositions */
    cudaStream_t stream[nV];
    cublasHandle_t handle;
    cublasCreate(&handle);
    double* d_H_oo               = data[igpu].H_oo;
    double* d_tau_x              = data[igpu].tau_x;
    double* d_cc_space_f_oo      = data[igpu].cc_space_f_oo;
    double* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol;
    double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;
    double* d_tau_kau;
    cudaMalloc((void **)&d_tau_kau, cholesky_mo_num*nV*nO * sizeof(double));
    double* d_tmp_ovv;
    cudaMalloc((void **)&d_tmp_ovv, nO*nV*nV  * sizeof(double));
    double* d_tmp_vov;
    cudaMalloc((void **)&d_tmp_vov, nV*nO*nV  * sizeof(double));
    for (int i=0 ; i<nV ; ++i) {
      cudaStreamCreate(&(stream[i]));
    }
    alpha = 1.0;
    beta = 0.0;
    for (int u=0 ; u<nO ; ++u) {
      /* Strided gather of the u-th slice of tau_x into a contiguous buffer */
      cublasDcopy(handle, nO*nV*nV, &(d_tau_x[u]), nO, d_tmp_ovv, 1);
      /* Transpose every (nO x nV) block into (nV x nO): with beta = 0 the
         geam reduces to C = A^T */
      for (int b=0 ; b<nV ; ++b) {
            cublasSetStream(handle, stream[b]);
            A = &(d_tmp_ovv[nO*nV*b]); lda = nO;
            B = &(d_tmp_ovv[nO*nV*b]); ldb = nO;
            C = &(d_tmp_vov[nV*nO*b]); ldc = nV;
            cublasDgeam(handle, CUBLAS_OP_T, CUBLAS_OP_T, nV, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
      }
      cudaDeviceSynchronize();
      cublasSetStream(handle, NULL);
      alpha = 1.0;
      beta = 0.0;
      m=cholesky_mo_num;  n=nV;  k=nO*nV;
      A=d_cc_space_v_ov_chol;  lda=cholesky_mo_num;
      B=d_tmp_vov;  ldb=nV;
      C=&(d_tau_kau[cholesky_mo_num*nV*u]);  ldc=cholesky_mo_num;
      cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
    }
    for (int i=0 ; i<nV ; ++i) {
      cudaStreamDestroy(stream[i]);
    }
    cudaFree(d_tmp_vov);
    cudaFree(d_tmp_ovv);
    /* H_oo = f_oo + tau_kau^T . v_vo_chol */
    cublasDcopy(handle, nO*nO, d_cc_space_f_oo, 1, d_H_oo, 1);
    alpha = 1.0;
    beta  = 1.0;
    m=nO;  n=nO;  k=cholesky_mo_num*nV;
    A=d_tau_kau;  lda=cholesky_mo_num*nV;
    B=d_cc_space_v_vo_chol;  ldb=cholesky_mo_num*nV;
    C=d_H_oo;  ldc=nO;
    cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
    cudaFree(d_tau_kau);
    /* Broadcast through a host buffer to the other GPUs */
    double* H_oo = malloc(nO*nO*sizeof(double));
    cublasGetMatrix(nO, nO, sizeof(double), d_H_oo, nO, H_oo, nO);
    for (int i=0 ; i<ngpus ; ++i) {
       if (i != igpu) {
         double* d_H_oo = data[i].H_oo;
         cudaSetDevice(i);
         cublasSetMatrix(nO, nO, sizeof(double), H_oo, nO, d_H_oo, nO);
       }
    }
    free(H_oo);
    /* The broadcast may have changed the current device: go back to the one
       the handle was created on before destroying it */
    cudaSetDevice(igpu);
    cublasDestroy(handle);
 }
 /*
  * Compute H_vo on GPU `igpu` (taken modulo the number of GPUs) and copy
  * the result into data[i].H_vo of every other GPU.
  *
  * On the selected device:
  *   H_vo  = cc_space_f_vo
  *   H_vo += cc_space_v_vo_chol^T . (2 * cc_space_v_ov_chol . t1)
  *   H_vo -= cc_space_v_ov_chol^T . tmp2
  * where tmp = cc_space_v_ov_chol . t1^T and tmp2 is tmp with its two
  * occupied block indices swapped.
  */
 void compute_h_vo_chol_gpu(gpu_data* data, int igpu)
 {
    int ngpus = 1;
    if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
    igpu = igpu % ngpus;
    const int cholesky_mo_num = data[igpu].cholesky_mo_num;
    const int nO = data[igpu].nO;
    const int nV = data[igpu].nV;
    cudaSetDevice(igpu);
    int m,n,k, lda, ldb, ldc;
    double alpha, beta;
    double* A;
    double* B;
    double* C;
    /* The block swap below uses stream[j] with j < nO, so nO streams are
       needed (sizing by nV overflows the array when nO > nV) */
    cudaStream_t stream[nO];
    cublasHandle_t handle;
    cublasCreate(&handle);
    double* d_t1                 = data[igpu].t1;
    double* d_H_vo               = data[igpu].H_vo;
    double* d_cc_space_f_vo      = data[igpu].cc_space_f_vo;
    double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;
    double* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol;
    cublasDcopy(handle, nV*nO, d_cc_space_f_vo, 1, d_H_vo, 1);
    /* tmp_k = 2 * v_ov_chol . t1 (t1 seen as a vector of length nO*nV) */
    double* d_tmp_k;
    cudaMalloc((void **)&d_tmp_k, cholesky_mo_num * sizeof(double));
    alpha = 2.0;
    beta  = 0.0;
    m=cholesky_mo_num;  n=1;  k=nO*nV;
    A=d_cc_space_v_ov_chol;  lda=cholesky_mo_num;
    B=d_t1;  ldb=nO*nV;
    C=d_tmp_k;  ldc=cholesky_mo_num;
    cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
    /* H_vo += v_vo_chol^T . tmp_k */
    alpha = 1.0;
    beta  = 1.0;
    m=nV*nO;  n=1;  k=cholesky_mo_num;
    A=d_cc_space_v_vo_chol;  lda=cholesky_mo_num;
    B=d_tmp_k;  ldb=cholesky_mo_num;
    C=d_H_vo;  ldc=nV*nO;
    cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
    cudaFree(d_tmp_k);
    /* tmp = v_ov_chol . t1^T, viewed as nO x nO blocks of length cholesky_mo_num */
    double* d_tmp;
    cudaMalloc((void **)&d_tmp, cholesky_mo_num*nO*nO * sizeof(double));
    alpha = 1.0;
    beta  = 0.0;
    m=cholesky_mo_num*nO;  n=nO;  k=nV;
    A=d_cc_space_v_ov_chol;  lda=cholesky_mo_num*nO;
    B=d_t1;  ldb=nO;
    C=d_tmp;  ldc=cholesky_mo_num*nO;
    cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
    /* tmp2 = tmp with the two occupied block indices swapped */
    double* d_tmp2;
    cudaMalloc((void **)&d_tmp2, cholesky_mo_num*nO*nO * sizeof(double));
    for (int i=0 ; i<nO ; ++i) {
      cudaStreamCreate(&(stream[i]));
    }
    for (int i=0 ; i<nO ; ++i) {
      for (int j=0 ; j<nO ; ++j) {
         cublasSetStream(handle, stream[j]);
         cublasDcopy(handle, cholesky_mo_num, &(d_tmp [cholesky_mo_num*(i+nO*j)]), 1,
                                              &(d_tmp2[cholesky_mo_num*(j+nO*i)]), 1);
      }
    }
    /* Wait for all the copies before releasing the streams and the source */
    cudaDeviceSynchronize();
    for (int i=0 ; i<nO ; ++i) {
      cudaStreamDestroy(stream[i]);
    }
    cublasSetStream(handle, NULL);
    cudaFree(d_tmp);
    /* H_vo -= v_ov_chol^T . tmp2 */
    alpha = -1.0;
    beta  = 1.0;
    m=nV;  n=nO;  k=cholesky_mo_num*nO;
    A=d_cc_space_v_ov_chol;  lda=cholesky_mo_num*nO;
    B=d_tmp2;  ldb=cholesky_mo_num*nO;
    C=d_H_vo;  ldc=nV;
    cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
    /* Broadcast through a host buffer to the other GPUs */
    double* H_vo = malloc(nV*nO*sizeof(double));
    cublasGetMatrix(nV, nO, sizeof(double), d_H_vo, nV, H_vo, nV);
    /* H_vo has been read back, so the last Dgemm no longer needs tmp2 */
    cudaFree(d_tmp2);
    for (int i=0 ; i<ngpus ; ++i) {
       if (i != igpu) {
         double* d_H_vo = data[i].H_vo;
         cudaSetDevice(i);
         cublasSetMatrix(nV, nO, sizeof(double), H_vo, nV, d_H_vo, nV);
       }
    }
    free(H_vo);
    /* The broadcast may have changed the current device: go back to the one
       the handle was created on before destroying it */
    cudaSetDevice(igpu);
    cublasDestroy(handle);
 }
 /*
  * Compute H_vv on GPU `igpu` (taken modulo the number of GPUs) and copy
  * the result into data[i].H_vv of every other GPU.
  *
  * On the selected device:
  *   - for each a < nV, the nV blocks of nO*nO elements of tau_x at offsets
  *     nO*nO*(a+nV*b) are gathered into tmp_oov and contracted with
  *     cc_space_v_ov_chol into slab a of tau_kia;
  *   - H_vv = cc_space_f_vv - tau_kia^T . cc_space_v_ov_chol.
  * NOTE(review): the index meaning of tau_x (presumably (nO,nO,nV,nV),
  * column-major) is not visible here -- confirm against gpu_upload.
  */
 void compute_h_vv_chol_gpu(gpu_data* data, int igpu)
 {
    int ngpus = 1;
    if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
    igpu = igpu % ngpus;
    const int cholesky_mo_num = data[igpu].cholesky_mo_num;
    const int nO = data[igpu].nO;
    const int nV = data[igpu].nV;
    cudaSetDevice(igpu);
    int m,n,k, lda, ldb, ldc;
    double alpha, beta;
    double* A;
    double* B;
    double* C;
    /* One stream per virtual index b, used for the block gathers */
    cudaStream_t stream[nV];
    cublasHandle_t handle;
    cublasCreate(&handle);
    double* d_H_vv               = data[igpu].H_vv;
    double* d_tau_x              = data[igpu].tau_x;
    double* d_cc_space_f_vv      = data[igpu].cc_space_f_vv;
    double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;
    double* d_tau_kia;
    cudaMalloc((void **)&d_tau_kia, cholesky_mo_num*nO*nV * sizeof(double));
    double* d_tmp_oov;
    cudaMalloc((void **)&d_tmp_oov, nO*nO*nV  * sizeof(double));
    alpha = 1.0;
    beta = 0.0;
    for (int i=0 ; i<nV ; ++i) {
      cudaStreamCreate(&(stream[i]));
    }
    for (int a=0 ; a<nV ; ++a) {
      /* Gather the nO*nO blocks of tau_x for this a, one stream per b */
      for (int b=0 ; b<nV ; ++b) {
         cublasSetStream(handle, stream[b]);
         cublasDcopy(handle, nO*nO, &(d_tau_x[nO*nO*(a+nV*b)]), 1, &(d_tmp_oov[nO*nO*b]), 1);
      }
      cudaDeviceSynchronize();
      cublasSetStream(handle, NULL);
      alpha = 1.0;
      beta = 0.0;
      m=cholesky_mo_num;  n=nO;  k=nO*nV;
      A=d_cc_space_v_ov_chol;  lda=cholesky_mo_num;
      B=d_tmp_oov;  ldb=nO;
      C=&(d_tau_kia[cholesky_mo_num*nO*a]);  ldc=cholesky_mo_num;
      cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
    }
    for (int i=0 ; i<nV ; ++i) {
      cudaStreamDestroy(stream[i]);
    }
    cudaFree(d_tmp_oov);
    /* H_vv = f_vv - tau_kia^T . v_ov_chol */
    cublasDcopy(handle, nV*nV, d_cc_space_f_vv, 1, d_H_vv, 1);
    alpha = -1.0;
    beta  = 1.0;
    m=nV;  n=nV;  k=cholesky_mo_num*nO;
    A=d_tau_kia;  lda=cholesky_mo_num*nO;
    B=d_cc_space_v_ov_chol;  ldb=cholesky_mo_num*nO;
    C=d_H_vv;  ldc=nV;
    cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
    cudaFree(d_tau_kia);
    /* Broadcast through a host buffer to the other GPUs */
    double* H_vv = malloc(nV*nV*sizeof(double));
    cublasGetMatrix(nV, nV, sizeof(double), d_H_vv, nV, H_vv, nV);
    for (int i=0 ; i<ngpus ; ++i) {
       if (i != igpu) {
         double* d_H_vv = data[i].H_vv;
         cudaSetDevice(i);
         cublasSetMatrix(nV, nV, sizeof(double), H_vv, nV, d_H_vv, nV);
       }
    }
    free(H_vv);
    /* The broadcast may have changed the current device: go back to the one
       the handle was created on before destroying it */
    cudaSetDevice(igpu);
    cublasDestroy(handle);
 }
 void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, double* r2, double* max_r2)
 {
    const int cholesky_mo_num = data->cholesky_mo_num;
@ -1294,7 +1580,6 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
              for (size_t bet=iblock ; bet<(nV < iblock+BLOCK_SIZE ? nV : iblock+BLOCK_SIZE) ; ++bet)
              {
                alpha = 1.0;
                beta = 0.0;
                A = &(d_tmpB1[nV*(bet-iblock)]); lda = nV*BLOCK_SIZE;
@ -1344,281 +1629,306 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
 }
-void compute_h_oo_chol_gpu(gpu_data* data, int nO, int nV, int igpu, double* H_oo)
+
 void compute_r1_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, double* r1, double* max_r1)
 {
    const int cholesky_mo_num = data->cholesky_mo_num;
    int ngpus = 1;
    if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
    igpu = igpu % ngpus;
-    const int cholesky_mo_num = data[igpu].cholesky_mo_num;
+    #pragma omp parallel num_threads(ngpus)
-    cudaSetDevice(igpu);
+    {
      int m,n,k, lda, ldb, ldc;
      double alpha, beta;
      double* A;
      double* B;
      double* C;
      cudaStream_t stream[nV];
-    int m,n,k, lda, ldb, ldc;
+      int igpu = omp_get_thread_num();
-    double alpha, beta;
+      cudaSetDevice(igpu);
    double* A;
    double* B;
    double* C;
    cudaStream_t stream[nV];
-    cublasHandle_t handle;
+      cublasHandle_t handle;
-    cublasCreate(&handle);
+      cublasCreate(&handle);
-    double* d_H_oo               = data[igpu].H_oo;
+      double* d_r1;
-    double* d_tau_x              = data[igpu].tau_x;
+      lda = nO ;
-    double* d_cc_space_f_oo      = data[igpu].cc_space_f_oo;
+      cudaMalloc((void **)&d_r1, lda * nV * sizeof(double));
-    double* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol;
+      cudaMemset(d_r1, 0, nO*nV*sizeof(double));
-    double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;
+      memset(r1, 0, nO*nV*sizeof(double));
-    double* d_tau_kau;
+      double* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol;
-    cudaMalloc((void **)&d_tau_kau, cholesky_mo_num*nV*nO * sizeof(double));
+      double* d_cc_space_v_vv_chol = data[igpu].cc_space_v_vv_chol;
      double* d_cc_space_v_oovo    = data[igpu].cc_space_v_oovo;
      double* d_cc_space_v_ovov    = data[igpu].cc_space_v_ovov;
      double* d_cc_space_v_voov    = data[igpu].cc_space_v_voov;
      double* d_cc_space_f_ov      = data[igpu].cc_space_f_ov;
      double* d_cc_space_f_vo      = data[igpu].cc_space_f_vo;
      double* d_tau                = data[igpu].tau;
      double* d_t1                 = data[igpu].t1;
      double* d_t2                 = data[igpu].t2;
      double* d_H_oo               = data[igpu].H_oo;
      double* d_H_vo               = data[igpu].H_vo;
      double* d_H_vv               = data[igpu].H_vv;
-    double* d_tmp_ovv;
+      #pragma omp sections
-    cudaMalloc((void **)&d_tmp_ovv, nO*nV*nV  * sizeof(double));
+      {
-    double* d_tmp_vov;
+        #pragma omp section
-    cudaMalloc((void **)&d_tmp_vov, nV*nO*nV  * sizeof(double));
+        {
          cublasDcopy(handle, nO*nV, d_cc_space_f_ov, 1, d_r1, 1);
-    for (int i=0 ; i<nV ; ++i) {
+          double* d_X_oo;
-      cudaStreamCreate(&(stream[i]));
+          cudaMalloc((void **)&d_X_oo, nO*nO * sizeof(double));
-    }
+
-    alpha = 1.0;
+          alpha = -2.0;
-    beta = 0.0;
+          beta  = 0.0;
-    for (int u=0 ; u<nO ; ++u) {
+          m=nO;  n=nO;  k=nV;
-      cublasDcopy(handle, nO*nV*nV, &(d_tau_x[u]), nO, d_tmp_ovv, 1);
+          A=d_t1;  lda=nO;
-      for (int b=0 ; b<nV ; ++b) {
+          B=d_cc_space_f_vo;  ldb=nV;
-            cublasSetStream(handle, stream[b]);
+          C=d_X_oo;  ldc=nO;
-            A = &(d_tmp_ovv[nO*nV*b]); lda = nO;
+          cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
-            B = &(d_tmp_ovv[nO*nV*b]); ldb = nO;
+
-            C = &(d_tmp_vov[nV*nO*b]); ldc = nV;
+          alpha = 1.0;
-            cublasDgeam(handle, CUBLAS_OP_T, CUBLAS_OP_T, nV, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
+          beta  = 1.0;
          m=nO;  n=nV;  k=nO;
          A=d_X_oo;  lda=nO;
          B=d_t1;  ldb=nO;
          C=d_r1;  ldc=nO;
          cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
          cudaFree(d_X_oo);
        }
        #pragma omp section
        {
          alpha = 1.0;
          beta  = 1.0;
          m=nO;  n=nV;  k=nV;
          A=d_t1;  lda=nO;
          B=d_H_vv;  ldb=nV;
          C=d_r1;  ldc=nO;
          cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
        }
        #pragma omp section
        {
          alpha = -1.0;
          beta  = 1.0;
          m=nO;  n=nV;  k=nO;
          A=d_H_oo;  lda=nO;
          B=d_t1;  ldb=nO;
          C=d_r1;  ldc=nO;
          cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
        }
        #pragma omp section
        {
          double* d_X_voov;
          cudaMalloc((void **)&d_X_voov, nV* nO* nO* nV * sizeof(double));
          for (int i=0 ; i<nV ; ++i) {
            cudaStreamCreate(&(stream[i]));
          }
          alpha = -1.0;
          for (int i=0 ; i<nO ; ++i) {
            for (int bet=0 ; bet<nV ; ++bet) {
              cublasSetStream(handle, stream[bet]);
              beta = t1[i+bet*nO];
              A = &(d_t2[nO*(i+nO*nV*bet)]); lda = nO*nO;
              B = &(d_t1[0]); ldb = nO;
              C = &(d_X_voov[nV*(i+nO*nO*bet)]); ldc = nV*nO;
              cublasDgeam(handle, CUBLAS_OP_T, CUBLAS_OP_T, nV, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
            }
          }
          cudaDeviceSynchronize();
          alpha = 1.0;
          beta  = 2.0;
          for (int bet=0 ; bet<nV ; ++bet) {
            cublasSetStream(handle, stream[bet]);
            A = &(d_X_voov[nV*nO*nO*bet]); lda = nV;
            B = &(d_t2[nO*nO*nV*bet]); ldb = nO*nO;
            C = A ; ldc = lda;
            cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nV, nO*nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
          }
          for (int i=0 ; i<nV ; ++i) {
            cudaStreamDestroy(stream[i]);
          }
          cublasSetStream(handle, NULL);
          alpha = 1.0;
          beta  = 1.0;
          m=nV*nO;  n=nO*nV;
          A=d_X_voov;  lda=nV * nO;
          B=d_H_vo;  ldb=1;
          C=d_r1;  ldc=1;
          cublasDgemv(handle, CUBLAS_OP_T, m, n, &alpha, A, lda, B, ldb, &beta, C, ldc);
          cudaFree(d_X_voov);
        }
        #pragma omp section
        {
          double* d_X_ovov;
          cudaMalloc((void **)&d_X_ovov, nO* nV* nO* nV * sizeof(double));
          cublasDcopy(handle, nO*nV*nO*nV, d_cc_space_v_ovov, 1, d_X_ovov, 1);
          for (int i=0 ; i<nV ; ++i) {
            cudaStreamCreate(&(stream[i]));
          }
          alpha = -1.0;
          beta  = 2.0;
          for (int u=0 ; u<nO ; ++u) {
            for (int bet=0 ; bet<nV ; ++bet) {
              cublasSetStream(handle, stream[bet]);
              A = &(d_X_ovov[nO*nV*(u+nO*bet)]); lda = nO;
              B = &(d_cc_space_v_voov[(nV*(u+nO*nO*bet))]); ldb = nV*nO;
              C = A ; ldc = lda;
              cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nO, nV, &alpha, A, lda, &beta, B, ldb, C, ldc);
            }
          }
          for (int i=0 ; i<nV ; ++i) {
            cudaStreamDestroy(stream[i]);
          }
          cublasSetStream(handle, NULL);
          alpha = 1.0;
          beta  = 1.0;
          m=nO*nV;  n=nO*nV;
          A=d_X_ovov;  lda=nO * nV;
          B=d_t1;  ldb=1;
          C=d_r1;  ldc=1;
          cublasDgemv(handle, CUBLAS_OP_T, m, n, &alpha, A, lda, B, ldb, &beta, C, ldc);
          cudaFree(d_X_ovov);
        }
        #pragma omp section
        {
          double* d_T_vvoo;
          cudaMalloc((void **)&d_T_vvoo, nV*nV*nO*nO * sizeof(double));
          alpha = 0.0;
          beta  = 1.0;
          A = d_T_vvoo; lda = nV*nV;
          B = d_tau; ldb = nO*nO;
          C = A ; ldc = lda;
          cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nV*nV, nO*nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
          double* d_W_vvov;
          cudaMalloc((void **)&d_W_vvov, nV*nV*nO*BLOCK_SIZE * sizeof(double));
          double* d_W_vvov_tmp;
          cudaMalloc((void **)&d_W_vvov_tmp, nV*nO*nV*BLOCK_SIZE * sizeof(double));
          for (int iblock=0 ; iblock<nV ; iblock += BLOCK_SIZE) {
            const int mbs = BLOCK_SIZE < nV-iblock ? BLOCK_SIZE : nV-iblock;
            alpha = 1.0;
            beta  = 0.0;
            m=nV*nO;  n=nV*mbs;  k=cholesky_mo_num;
            A=d_cc_space_v_vo_chol;  lda=cholesky_mo_num;
            B=&(d_cc_space_v_vv_chol[cholesky_mo_num*nV*iblock]);  ldb=cholesky_mo_num;
            C=d_W_vvov_tmp;  ldc=nV*nO;
            cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
            alpha = 2.0;
            beta = -1.0;
            int kk=0;
            for (int i=0 ; i<nV ; ++i) {
              cudaStreamCreate(&(stream[i]));
            }
            for (int i=0 ; i<nO ; ++i) {
              for (int bet=0 ; bet<mbs ; ++bet) {
                cublasSetStream(handle, stream[kk]);
                ++kk;
                if (kk >= nV) kk = 0;
                A = &(d_W_vvov_tmp[nV*(i+nO*nV*bet)]); lda = nV*nO;
                B = &(d_W_vvov_tmp[nV*(i+nO*nV*bet)]); ldb = nV*nO;
                C = &(d_W_vvov[nV*nV*(i+nO*bet)]); ldc = nV;
                cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nV, nV, &alpha, A, lda, &beta, B, ldb, C, ldc);
              }
            }
            for (int i=0 ; i<nV ; ++i) {
              cudaStreamDestroy(stream[i]);
            }
            cublasSetStream(handle, NULL);
            alpha = 1.0;
            beta  = 1.0;
            m=nO;  n=mbs;  k=nO*nV*nV;
            A=d_T_vvoo;  lda=nV*nV*nO;
            B=d_W_vvov;  ldb=nO*nV*nV;
            C=&(d_r1[nO*iblock]);  ldc=nO;
            cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
          }
          cudaFree(d_W_vvov);
          cudaFree(d_W_vvov_tmp);
          cudaFree(d_T_vvoo);
        }
        #pragma omp section
        {
          double* d_W_oovo;
          cudaMalloc((void **)&d_W_oovo, nO*nO*nV*nO * sizeof(double));
          alpha = 2.0;
          beta  = -1.0;
          for (int i=0 ; i<nV ; ++i) {
            cudaStreamCreate(&(stream[i]));
          }
          for (int u=0 ; u<nO ; ++u) {
            for (int a=0 ; a<nV ; ++a) {
              cublasSetStream(handle, stream[a]);
              A = &(d_cc_space_v_oovo[nO*nO*(a+nV*u)]); lda = nO;
              B = &(d_cc_space_v_oovo[nO*nO*(a+nV*u)]); ldb = nO;
              C = &(d_W_oovo[nO*nO*(a+nV*u)]); ldc = nO;
              cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
            }
          }
          for (int i=0 ; i<nV ; ++i) {
            cudaStreamDestroy(stream[i]);
          }
          cublasSetStream(handle, NULL);
          alpha = -1.0;
          beta  = 1.0;
          m=nO;  n=nV;  k=nO*nO*nV;
          A=d_W_oovo;  lda=nO * nO * nV;
          B=d_tau;  ldb=nO * nO * nV;
          C=d_r1;  ldc=nO;
          cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
        }
      }
      cudaDeviceSynchronize();
      cublasSetStream(handle, NULL);
      alpha = 1.0;
      beta = 0.0;
      m=cholesky_mo_num;  n=nV;  k=nO*nV;
      A=d_cc_space_v_ov_chol;  lda=cholesky_mo_num;
      B=d_tmp_vov;  ldb=nV;
      C=&(d_tau_kau[cholesky_mo_num*nV*u]);  ldc=cholesky_mo_num;
      cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
    }
    for (int i=0 ; i<nV ; ++i) {
      cudaStreamDestroy(stream[i]);
    }
-    cudaFree(d_tmp_vov);
+      double * r1_tmp = malloc(nO*nV*sizeof(double));
-    cudaFree(d_tmp_ovv);
+      lda=nO;
      cublasGetMatrix(nO, nV, sizeof(double), d_r1, lda, r1_tmp, lda);
      #pragma omp critical
      {
         for (size_t i=0 ; i<(size_t) nO*nV ; ++i) {
           r1[i] -= r1_tmp[i];
         }
      }
      free(r1_tmp);
-    cublasDcopy(handle, nO*nO, d_cc_space_f_oo, 1, d_H_oo, 1);
+      cudaFree(d_r1);
    alpha = 1.0;
    beta  = 1.0;
    m=nO;  n=nO;  k=cholesky_mo_num*nV;
    A=d_tau_kau;  lda=cholesky_mo_num*nV;
    B=d_cc_space_v_vo_chol;  ldb=cholesky_mo_num*nV;
    C=d_H_oo;  ldc=nO;
    cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
-    cudaFree(d_tau_kau);
+      cublasDestroy(handle);
-
+  }
 //    double* H_oo = malloc(nO*nO*sizeof(double));
    cublasGetMatrix(nO, nO, sizeof(double), d_H_oo, nO, H_oo, nO);
    for (int i=0 ; i<ngpus ; ++i) {
       if (i != igpu) {
         double* d_H_oo = data[i].H_oo;
         cudaSetDevice(i);
         cublasSetMatrix(nO, nO, sizeof(double), H_oo, nO, d_H_oo, nO);
       }
    }
 //    free(H_oo);
    cublasDestroy(handle);
 }
 // Builds the H_vv intermediate on one GPU and replicates it.
 //
 //   data : array of per-device gpu_data records (indexed by device number)
 //   nO   : used as the occupied dimension of the device buffers
 //   nV   : used as the virtual dimension of the device buffers
 //   igpu : requested device; reduced modulo the number of devices in use
 //   H_vv : host buffer receiving the nV x nV result (leading dimension nV)
 //
 // Steps: gather slabs of tau_x, contract them with cc_space_v_ov_chol into
 // d_tau_kia, then H_vv = cc_space_f_vv - d_tau_kia^T * cc_space_v_ov_chol.
 // The result is written to data[igpu].H_vv, copied to the host buffer and
 // pushed to data[i].H_vv of every other device.
 // No CUDA/cuBLAS return status is checked in this function.
 void compute_h_vv_chol_gpu(gpu_data* data, int nO, int nV, int igpu, double* H_vv)
 {
    // Only one device is used unless MULTIGPU == 1.
    int ngpus = 1;
    if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
    igpu = igpu % ngpus;
    const int cholesky_mo_num = data[igpu].cholesky_mo_num;
    cudaSetDevice(igpu);
    int m,n,k, lda, ldb, ldc;
    double alpha, beta;
    double* A;
    double* B;
    double* C;
    // One stream per value of the inner loop index b below (variable-length array).
    cudaStream_t stream[nV];
    cublasHandle_t handle;
    cublasCreate(&handle);
    double* d_H_vv               = data[igpu].H_vv;
    double* d_tau_x              = data[igpu].tau_x;
    double* d_cc_space_f_vv      = data[igpu].cc_space_f_vv;
    double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;
    // Scratch buffers, released further down once they are no longer needed.
    double* d_tau_kia;
    cudaMalloc((void **)&d_tau_kia, cholesky_mo_num*nO*nV * sizeof(double));
    double* d_tmp_oov;
    cudaMalloc((void **)&d_tmp_oov, nO*nO*nV  * sizeof(double));
    alpha = 1.0;
    beta = 0.0;
    for (int i=0 ; i<nV ; ++i) {
      cudaStreamCreate(&(stream[i]));
    }
    for (int a=0 ; a<nV ; ++a) {
      // For the current a, gather the nO*nO slab of tau_x for every b into
      // d_tmp_oov, one copy per stream.
      for (int b=0 ; b<nV ; ++b) {
         cublasSetStream(handle, stream[b]);
         cublasDcopy(handle, nO*nO, &(d_tau_x[nO*nO*(a+nV*b)]), 1, &(d_tmp_oov[nO*nO*b]), 1);
      }
      // Wait for all copies before the GEMM reads d_tmp_oov.
      cudaDeviceSynchronize();
      cublasSetStream(handle, NULL);
      alpha = 1.0;
      beta = 0.0;
      // d_tau_kia(:, :, a) = cc_space_v_ov_chol * d_tmp_oov^T
      // (cholesky_mo_num x nO result, contraction length nO*nV).
      m=cholesky_mo_num;  n=nO;  k=nO*nV;
      A=d_cc_space_v_ov_chol;  lda=cholesky_mo_num;
      B=d_tmp_oov;  ldb=nO;
      C=&(d_tau_kia[cholesky_mo_num*nO*a]);  ldc=cholesky_mo_num;
      cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
    }
    for (int i=0 ; i<nV ; ++i) {
      cudaStreamDestroy(stream[i]);
    }
    cudaFree(d_tmp_oov);
    // Start from the vv Fock block ...
    cublasDcopy(handle, nV*nV, d_cc_space_f_vv, 1, d_H_vv, 1);
    // ... and subtract d_tau_kia^T * cc_space_v_ov_chol (contraction length cholesky_mo_num*nO).
    alpha = -1.0;
    beta  = 1.0;
    m=nV;  n=nV;  k=cholesky_mo_num*nO;
    A=d_tau_kia;  lda=cholesky_mo_num*nO;
    B=d_cc_space_v_ov_chol;  ldb=cholesky_mo_num*nO;
    C=d_H_vv;  ldc=nV;
    cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
    cudaFree(d_tau_kia);
 //    double* H_vv = malloc(nO*nO*sizeof(double));
    // Bring the result back to the caller-provided host buffer.
    cublasGetMatrix(nV, nV, sizeof(double), d_H_vv, nV, H_vv, nV);
    // Replicate the host copy into the H_vv buffer of every other device.
    // NOTE(review): this leaves the last visited device current, so the
    // cublasDestroy below may run with a device other than igpu selected - confirm.
    for (int i=0 ; i<ngpus ; ++i) {
       if (i != igpu) {
         double* d_H_vv = data[i].H_vv;
         cudaSetDevice(i);
         cublasSetMatrix(nV, nV, sizeof(double), H_vv, nV, d_H_vv, nV);
       }
    }
 //    free(H_vv);
    cublasDestroy(handle);
 }
 // Builds the H_vo intermediate on one GPU and replicates it.
 //
 //   data : array of per-device gpu_data records (indexed by device number)
 //   nO   : used as the occupied dimension of the device buffers
 //   nV   : used as the virtual dimension of the device buffers
 //   igpu : requested device; reduced modulo the number of devices in use
 //   H_vo : host buffer receiving the nV x nO result (leading dimension nV)
 //
 // Steps: H_vo = cc_space_f_vo
 //              + cc_space_v_vo_chol^T * (2 * cc_space_v_ov_chol * t1)
 //              - cc_space_v_ov_chol^T * d_tmp2,
 // where d_tmp2 is d_tmp = cc_space_v_ov_chol * t1^T with its two trailing
 // nO-sized indices swapped.
 // No CUDA/cuBLAS return status is checked in this function.
 void compute_h_vo_chol_gpu(gpu_data* data, int nO, int nV, int igpu, double* H_vo)
 {
    // Only one device is used unless MULTIGPU == 1.
    int ngpus = 1;
    if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
    igpu = igpu % ngpus;
    const int cholesky_mo_num = data[igpu].cholesky_mo_num;
    cudaSetDevice(igpu);
    int m,n,k, lda, ldb, ldc;
    double alpha, beta;
    double* A;
    double* B;
    double* C;
    cudaStream_t stream[nV];
    cublasHandle_t handle;
    cublasCreate(&handle);
    double* d_t1                 = data[igpu].t1;
    double* d_H_vo               = data[igpu].H_vo;
    // NOTE(review): d_tau_x is not referenced anywhere below.
    double* d_tau_x              = data[igpu].tau_x;
    double* d_cc_space_f_vo      = data[igpu].cc_space_f_vo;
    double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;
    double* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol;
    // Start from the vo Fock block.
    cublasDcopy(handle, nV*nO, d_cc_space_f_vo, 1, d_H_vo, 1);
    double* d_tmp_k;
    cudaMalloc((void **)&d_tmp_k, cholesky_mo_num * sizeof(double));
    // d_tmp_k = 2 * cc_space_v_ov_chol * t1, with t1 used as a vector of length nO*nV.
    alpha = 2.0;
    beta  = 0.0;
    m=cholesky_mo_num;  n=1;  k=nO*nV;
    A=d_cc_space_v_ov_chol;  lda=cholesky_mo_num;
    B=d_t1;  ldb=nO*nV;
    C=d_tmp_k;  ldc=cholesky_mo_num;
    cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
    // H_vo += cc_space_v_vo_chol^T * d_tmp_k, with H_vo used as a vector of length nV*nO.
    alpha = 1.0;
    beta  = 1.0;
    m=nV*nO;  n=1;  k=cholesky_mo_num;
    A=d_cc_space_v_vo_chol;  lda=cholesky_mo_num;
    B=d_tmp_k;  ldb=cholesky_mo_num;
    C=d_H_vo;  ldc=nV*nO;
    cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
    cudaFree(d_tmp_k);
    // NOTE(review): no cudaFree(d_tmp) is visible in this function - possible device memory leak.
    double* d_tmp;
    cudaMalloc((void **)&d_tmp, cholesky_mo_num*nO*nO * sizeof(double));
    // d_tmp = cc_space_v_ov_chol * t1^T : (cholesky_mo_num*nO) x nO, contraction length nV.
    alpha = 1.0;
    beta  = 0.0;
    m=cholesky_mo_num*nO;  n=nO;  k=nV;
    A=d_cc_space_v_ov_chol;  lda=cholesky_mo_num*nO;
    B=d_t1;  ldb=nO;
    C=d_tmp;  ldc=cholesky_mo_num*nO;
    cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
    // NOTE(review): no cudaFree(d_tmp2) is visible in this function either.
    double* d_tmp2;
    cudaMalloc((void **)&d_tmp2, cholesky_mo_num*nO*nO * sizeof(double));
    for (int i=0 ; i<nV ; ++i) {
      cudaStreamCreate(&(stream[i]));
    }
    // d_tmp2(:, j, i) = d_tmp(:, i, j): swap the two nO-sized indices, one
    // column of length cholesky_mo_num per copy.
    // NOTE(review): stream[] holds nV entries but is indexed with j < nO;
    // this stays in range only if nO <= nV - confirm.
    for (int i=0 ; i<nO ; ++i) {
      for (int j=0 ; j<nO ; ++j) {
         cublasSetStream(handle, stream[j]);
         cublasDcopy(handle, cholesky_mo_num, &(d_tmp [cholesky_mo_num*(i+nO*j)]), 1,
                                              &(d_tmp2[cholesky_mo_num*(j+nO*i)]), 1);
      }
    }
    for (int i=0 ; i<nV ; ++i) {
      cudaStreamDestroy(stream[i]);
    }
    cublasSetStream(handle, NULL);
    // H_vo -= cc_space_v_ov_chol^T * d_tmp2 (contraction length cholesky_mo_num*nO).
    alpha = -1.0;
    beta  = 1.0;
    m=nV;  n=nO;  k=cholesky_mo_num*nO;
    A=d_cc_space_v_ov_chol;  lda=cholesky_mo_num*nO;
    B=d_tmp2;  ldb=cholesky_mo_num*nO;
    C=d_H_vo;  ldc=nV;
    cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
 //    double* H_vo = malloc(nO*nO*sizeof(double));
    // Bring the result back to the caller-provided host buffer.
    cublasGetMatrix(nV, nO, sizeof(double), d_H_vo, nV, H_vo, nV);
    // Replicate the host copy into the H_vo buffer of every other device.
    // NOTE(review): this leaves the last visited device current when the
    // handle is destroyed below - confirm that is intended.
    for (int i=0 ; i<ngpus ; ++i) {
       if (i != igpu) {
         double* d_H_vo = data[i].H_vo;
         cudaSetDevice(i);
         cublasSetMatrix(nV, nO, sizeof(double), H_vo, nV, d_H_vo, nV);
       }
    }
 //    free(H_vo);
    cublasDestroy(handle);
  // NOTE(review): r1 and max_r1 are not parameters or locals of this function.
  // The statements below compute the largest absolute value over nO*nV entries
  // of r1; they look like the tail of compute_r1_space_chol_gpu - confirm
  // against the actual file.
  *max_r1 = 0.;
  for (size_t i=0 ; i<(size_t) nO*nV ; ++i) {
    const double x = r1[i] > 0. ? r1[i] : -r1[i];
    *max_r1 = *max_r1 > x ? *max_r1 : x;
  }
 }
--- a/devel/ccsd_gpu/gpu.h
+++ b/devel/ccsd_gpu/gpu.h
@ -5,6 +5,7 @@ typedef struct {
   double* cc_space_v_vv_chol;
   double* cc_space_v_oooo;
   double* cc_space_v_vooo;
   double* cc_space_v_voov;
   double* cc_space_v_oovv;
   double* cc_space_v_vvoo;
   double* cc_space_v_oovo;
@ -12,6 +13,7 @@ typedef struct {
   double* cc_space_v_ovov;
   double* cc_space_v_ovoo;
   double* cc_space_f_oo;
   double* cc_space_f_ov;
   double* cc_space_f_vo;
   double* cc_space_f_vv;
   double* tau;
--- a/devel/ccsd_gpu/gpu_init.c
+++ b/devel/ccsd_gpu/gpu_init.c
@ -10,12 +10,12 @@ gpu_data* gpu_init(
   int nO, int nV, int cholesky_mo_num,
   double* cc_space_v_oo_chol, double* cc_space_v_ov_chol,
   double* cc_space_v_vo_chol, double* cc_space_v_vv_chol,
-   double* cc_space_v_oooo, double* cc_space_v_vooo,
+   double* cc_space_v_oooo, double* cc_space_v_vooo, double* cc_space_v_voov,
   double* cc_space_v_oovv, double* cc_space_v_vvoo,
   double* cc_space_v_oovo, double* cc_space_v_ovvo,
   double* cc_space_v_ovov, double* cc_space_v_ovoo,
-   double* cc_space_f_oo, double* cc_space_f_vo,
+   double* cc_space_f_oo, double* cc_space_f_ov,
-   double* cc_space_f_vv)
+   double* cc_space_f_vo, double* cc_space_f_vv)
 {
    int ngpus = 1;
    cudaGetDeviceCount(&ngpus);
@ -59,6 +59,10 @@ gpu_data* gpu_init(
      cudaMalloc((void**)&d_cc_space_v_vooo, nV*nO*nO*nO*sizeof(double));
      cublasSetMatrix(nV*nO, nO*nO, sizeof(double), cc_space_v_vooo, nV*nO, d_cc_space_v_vooo, nV*nO);
      double* d_cc_space_v_voov;
      cudaMalloc((void**)&d_cc_space_v_voov, nV*nO*nO*nV*sizeof(double));
      cublasSetMatrix(nV*nO, nO*nV, sizeof(double), cc_space_v_voov, nV*nO, d_cc_space_v_voov, nV*nO);
      double* d_cc_space_v_oovv;
      cudaMalloc((void**)&d_cc_space_v_oovv, nO*nO*nV*nV*sizeof(double));
      cublasSetMatrix(nO*nO, nV*nV, sizeof(double), cc_space_v_oovv, nO*nO, d_cc_space_v_oovv, nO*nO);
@ -95,6 +99,10 @@ gpu_data* gpu_init(
      cudaMalloc((void**)&d_cc_space_f_vo, nV*nO*sizeof(double));
      cublasSetMatrix(nV, nO, sizeof(double), cc_space_f_vo, nV, d_cc_space_f_vo, nV);
      double* d_cc_space_f_ov;
      cudaMalloc((void**)&d_cc_space_f_ov, nV*nO*sizeof(double));
      cublasSetMatrix(nO, nV, sizeof(double), cc_space_f_ov, nO, d_cc_space_f_ov, nO);
      double* d_cc_space_f_vv;
      cudaMalloc((void**)&d_cc_space_f_vv, nV*nV*sizeof(double));
      cublasSetMatrix(nV, nV, sizeof(double), cc_space_f_vv, nV, d_cc_space_f_vv, nV);
@ -128,6 +136,7 @@ gpu_data* gpu_init(
      data[igpu].cc_space_v_vv_chol = d_cc_space_v_vv_chol;
      data[igpu].cc_space_v_oooo = d_cc_space_v_oooo;
      data[igpu].cc_space_v_vooo = d_cc_space_v_vooo;
      data[igpu].cc_space_v_voov = d_cc_space_v_voov;
      data[igpu].cc_space_v_oovv = d_cc_space_v_oovv;
      data[igpu].cc_space_v_vvoo = d_cc_space_v_vvoo;
      data[igpu].cc_space_v_oovo = d_cc_space_v_oovo;
@ -135,6 +144,7 @@ gpu_data* gpu_init(
      data[igpu].cc_space_v_ovov = d_cc_space_v_ovov;
      data[igpu].cc_space_v_ovoo = d_cc_space_v_ovoo;
      data[igpu].cc_space_f_oo   = d_cc_space_f_oo;
      data[igpu].cc_space_f_ov   = d_cc_space_f_ov;
      data[igpu].cc_space_f_vo   = d_cc_space_f_vo;
      data[igpu].cc_space_f_vv   = d_cc_space_f_vv;
      data[igpu].tau   = d_tau;
--- a/devel/ccsd_gpu/gpu_module.f90
+++ b/devel/ccsd_gpu/gpu_module.f90
@ -6,9 +6,9 @@ module gpu_module
  interface
    type(c_ptr) function gpu_init(nO, nV, cholesky_mo_num, &
      cc_space_v_oo_chol, cc_space_v_ov_chol, cc_space_v_vo_chol, cc_space_v_vv_chol, &
-      cc_space_v_oooo, cc_space_v_vooo, cc_space_v_oovv, cc_space_v_vvoo, &
+      cc_space_v_oooo, cc_space_v_vooo, cc_space_v_voov, cc_space_v_oovv, cc_space_v_vvoo, &
      cc_space_v_oovo, cc_space_v_ovvo, cc_space_v_ovov, cc_space_v_ovoo, &
-      cc_space_f_oo, cc_space_f_vo, cc_space_f_vv) bind(C)
+      cc_space_f_oo, cc_space_f_ov, cc_space_f_vo, cc_space_f_vv) bind(C)
        import c_int, c_double, c_ptr
        integer(c_int), intent(in), value :: nO, nV, cholesky_mo_num
        real(c_double), intent(in)  :: cc_space_v_oo_chol(cholesky_mo_num,nO,nO)
@ -17,6 +17,7 @@ module gpu_module
        real(c_double), intent(in)  :: cc_space_v_vv_chol(cholesky_mo_num,nV,nV)
        real(c_double), intent(in)  :: cc_space_v_oooo(nO,nO,nO,nO)
        real(c_double), intent(in)  :: cc_space_v_vooo(nV,nO,nO,nO)
        real(c_double), intent(in)  :: cc_space_v_voov(nV,nO,nO,nV)
        real(c_double), intent(in)  :: cc_space_v_oovv(nO,nO,nV,nV)
        real(c_double), intent(in)  :: cc_space_v_vvoo(nV,nV,nO,nO)
        real(c_double), intent(in)  :: cc_space_v_oovo(nO,nO,nV,nO)
@ -24,6 +25,7 @@ module gpu_module
        real(c_double), intent(in)  :: cc_space_v_ovov(nO,nV,nO,nV)
        real(c_double), intent(in)  :: cc_space_v_ovoo(nO,nV,nO,nO)
        real(c_double), intent(in)  :: cc_space_f_oo(nO,nO)
        real(c_double), intent(in)  :: cc_space_f_ov(nO,nV)
        real(c_double), intent(in)  :: cc_space_f_vo(nV,nO)
        real(c_double), intent(in)  :: cc_space_f_vv(nV,nV)
    end function
@ -38,25 +40,31 @@ module gpu_module
        real(c_double), intent(in) :: tau_x(nO,nO,nV,nV)
    end subroutine
-    subroutine compute_H_oo_chol_gpu(gpu_data, nO, nV, igpu, H_oo) bind(C)
+    subroutine compute_H_oo_chol_gpu(gpu_data, igpu) bind(C)
        import c_int, c_double, c_ptr
        type(c_ptr), value    :: gpu_data
-        integer(c_int), intent(in), value  :: nO, nV, igpu
+        integer(c_int), intent(in), value  :: igpu
        real(c_double), intent(out) :: H_oo(nO,nO)
    end subroutine
-    subroutine compute_H_vo_chol_gpu(gpu_data, nO, nV, igpu, H_vo) bind(C)
+    subroutine compute_H_vo_chol_gpu(gpu_data, igpu) bind(C)
        import c_int, c_double, c_ptr
        type(c_ptr), value    :: gpu_data
-        integer(c_int), intent(in), value  :: nO, nV, igpu
+        integer(c_int), intent(in), value  :: igpu
        real(c_double), intent(out) :: H_vo(nV,nO)
    end subroutine
-    subroutine compute_H_vv_chol_gpu(gpu_data, nO, nV, igpu, H_vv) bind(C)
+    subroutine compute_H_vv_chol_gpu(gpu_data, igpu) bind(C)
        import c_int, c_double, c_ptr
        type(c_ptr), value    :: gpu_data
-        integer(c_int), intent(in), value  :: nO, nV, igpu
+        integer(c_int), intent(in), value  :: igpu
-        real(c_double), intent(out) :: H_vv(nO,nO)
+    end subroutine
    subroutine compute_r1_space_chol_gpu(gpu_data, nO, nV, t1, r1, max_r1) bind(C)
        ! Interface to the C routine compute_r1_space_chol_gpu.
        !   gpu_data : opaque handle, passed by value
        !   nO, nV   : array dimensions, passed by value
        !   t1       : input array of shape (nO,nV)
        !   r1       : output array of shape (nO,nV); the C side zeroes,
        !              accumulates and scans exactly nO*nV elements of it
        !   max_r1   : output scalar, largest absolute value found in r1
        import c_int, c_double, c_ptr
        type(c_ptr), value    :: gpu_data
        integer(c_int), intent(in), value  :: nO, nV
        real(c_double), intent(in)  :: t1(nO,nV)
        ! Was r1(nO,nO,nV,nV): a larger shape than the buffer the C code uses.
        real(c_double), intent(out) :: r1(nO,nV)
        real(c_double), intent(out) :: max_r1
    end subroutine
    subroutine compute_r2_space_chol_gpu(gpu_data, nO, nV, t1, r2, max_r2) bind(C)
--- a/devel/ccsd_gpu/guess_t.irp.f
+++ b/devel/ccsd_gpu/guess_t.irp.f
@ -0,0 +1,208 @@
 ! T1
 subroutine guess_t1(nO,nV,f_o,f_v,f_ov,t1)
  implicit none
  BEGIN_DOC
  ! Initial guess for the T1 amplitudes, selected by cc_guess_t1:
  !   'none' : t1 = 0
  !   'MP'   : t1(i,a) = f_ov(i,a) / (f_o(i) - f_v(a) - cc_level_shift_guess)
  !   'read' : delegated to read_t1
  ! Any other value prints a message and aborts.
  END_DOC
  ! Dimensions and input quantities
  integer, intent(in)           :: nO, nV
  double precision, intent(in)  :: f_o(nO), f_v(nV), f_ov(nO,nV)
  ! Resulting amplitudes
  double precision, intent(out) :: t1(nO, nV)
  ! Loop index over the second dimension of t1
  integer                       :: a
  select case (trim(cc_guess_t1))
    case ('none')
      t1 = 0d0
    case ('MP')
      ! One column of t1 per iteration; the denominator is evaluated element-wise.
      do a = 1, nV
        t1(:,a) = f_ov(:,a) / (f_o(:) - f_v(a) - cc_level_shift_guess)
      enddo
    case ('read')
      call read_t1(nO,nV,t1)
    case default
      print*, 'Unknown cc_guess_t1 type: '//trim(cc_guess_t1)
      call abort
  end select
 end
 ! T2
 subroutine guess_t2(nO,nV,f_o,f_v,v_oovv,t2)
  implicit none
  BEGIN_DOC
  ! Initial guess for the T2 amplitudes, selected by cc_guess_t2:
  !   'none' : t2 = 0
  !   'MP'   : t2(i,j,a,b) = v_oovv(i,j,a,b) /
  !            (f_o(i) + f_o(j) - f_v(a) - f_v(b) - cc_level_shift_guess)
  !   'read' : delegated to read_t2
  ! Any other value prints a message and aborts.
  END_DOC
  ! in
  integer, intent(in)           :: nO, nV
  double precision, intent(in)  :: f_o(nO), f_v(nV), v_oovv(nO, nO, nV, nV)
  ! out
  double precision, intent(out) :: t2(nO, nO, nV, nV)
  ! internal
  integer                       :: i,j,a,b
  if (trim(cc_guess_t2) == 'none') then
    t2 = 0d0
  else if (trim(cc_guess_t2) == 'MP') then
    ! Leftmost index innermost, so t2 and v_oovv are traversed contiguously.
    do b = 1, nV
      do a = 1, nV
        do j = 1, nO
          do i = 1, nO
            t2(i,j,a,b) = v_oovv(i,j,a,b) / (f_o(i) + f_o(j) - f_v(a) - f_v(b) - cc_level_shift_guess)
          enddo
        enddo
      enddo
    enddo
  else if (trim(cc_guess_t2) == 'read') then
    call read_t2(nO,nV,t2)
  else
    ! The message must name cc_guess_t2, the option actually tested here.
    print*, 'Unknown cc_guess_t2 type: '//trim(cc_guess_t2)
    call abort
  endif
 end
 ! T1
 subroutine write_t1(nO,nV,t1)
  implicit none
  BEGIN_DOC
  ! Dump the T1 amplitudes to the file work/T1 inside the directory named by
  ! ezfio_filename, one value per line (F20.12), first index running fastest.
  ! Does nothing unless write_amplitudes is true.
  END_DOC
  integer, intent(in)          :: nO, nV
  double precision, intent(in) :: t1(nO, nV)
  integer                      :: p, q, io_unit
  integer, external            :: getunitandopen
  ! Writing is optional
  if (.not. write_amplitudes) return
  io_unit = getUnitAndOpen(trim(ezfio_filename)//'/work/T1','w')
  do q = 1, nV
    do p = 1, nO
      write(io_unit,'(F20.12)') t1(p,q)
    enddo
  enddo
  close(io_unit)
 end
 ! T2
 subroutine write_t2(nO,nV,t2)
  implicit none
  BEGIN_DOC
  ! Dump the T2 amplitudes to the file work/T2 inside the directory named by
  ! ezfio_filename, one value per line (F20.12), first index running fastest.
  ! Does nothing unless write_amplitudes is true.
  END_DOC
  integer, intent(in)          :: nO, nV
  double precision, intent(in) :: t2(nO, nO, nV, nV)
  integer                      :: p, q, r, s, io_unit
  integer, external            :: getunitandopen
  ! Writing is optional
  if (.not. write_amplitudes) return
  io_unit = getUnitAndOpen(trim(ezfio_filename)//'/work/T2','w')
  do s = 1, nV
    do r = 1, nV
      do q = 1, nO
        do p = 1, nO
          write(io_unit,'(F20.12)') t2(p,q,r,s)
        enddo
      enddo
    enddo
  enddo
  close(io_unit)
 end
 ! T1
 subroutine read_t1(nO,nV,t1)
  implicit none
  BEGIN_DOC
  ! Load the T1 amplitudes from the file work/T1 inside the directory named by
  ! ezfio_filename, one value per line (F20.12), first index running fastest.
  ! When read_amplitudes is false nothing is read and t1 is not assigned here.
  END_DOC
  integer, intent(in)           :: nO, nV
  double precision, intent(out) :: t1(nO, nV)
  integer                       :: p, q, io_unit
  integer, external             :: getunitandopen
  ! Reading is optional
  if (.not. read_amplitudes) return
  io_unit = getUnitAndOpen(trim(ezfio_filename)//'/work/T1','r')
  do q = 1, nV
    do p = 1, nO
      read(io_unit,'(F20.12)') t1(p,q)
    enddo
  enddo
  close(io_unit)
 end
 ! T2
 subroutine read_t2(nO,nV,t2)
  implicit none
  BEGIN_DOC
  ! Load the T2 amplitudes from the file work/T2 inside the directory named by
  ! ezfio_filename, one value per line (F20.12), first index running fastest.
  ! When read_amplitudes is false nothing is read and t2 is not assigned here.
  END_DOC
  integer, intent(in)           :: nO, nV
  double precision, intent(out) :: t2(nO, nO, nV, nV)
  integer                       :: p, q, r, s, io_unit
  integer, external             :: getunitandopen
  ! Reading is optional
  if (.not. read_amplitudes) return
  io_unit = getUnitAndOpen(trim(ezfio_filename)//'/work/T2','r')
  do s = 1, nV
    do r = 1, nV
      do q = 1, nO
        do p = 1, nO
          read(io_unit,'(F20.12)') t2(p,q,r,s)
        enddo
      enddo
    enddo
  enddo
  close(io_unit)
 end
--- a/devel/ccsd_gpu/occupancy.irp.f
+++ b/devel/ccsd_gpu/occupancy.irp.f
@ -0,0 +1,328 @@
 ! N spin orb
 subroutine extract_n_spin(det,n)
  implicit none
  BEGIN_DOC
  ! Count the spin orbitals of det, leaving out the core and deleted orbitals.
  ! On exit n = (nOa, nOb, nVa, nVb): occupied alpha, occupied beta,
  ! virtual alpha and virtual beta.
  END_DOC
  integer(bit_kind), intent(in) :: det(N_int,2)
  integer, intent(out)          :: n(4)
  integer(bit_kind)             :: tmp_det(N_int,2)
  integer                       :: i_orb, i_spin, slot
  logical                       :: hole_ok
  logical, external             :: is_core, is_del
  n(1:4) = 0
  do i_spin = 1, 2
    do i_orb = 1, mo_num
      ! hole_ok is the success flag returned by apply_hole for (i_spin,i_orb)
      call apply_hole(det, i_spin, i_orb, tmp_det, hole_ok, N_int)
      ! Core and deleted orbitals are never counted
      if (is_core(i_orb) .or. is_del(i_orb)) cycle
      ! Success -> occupied counter n(i_spin),
      ! failure -> virtual counter n(i_spin+2)
      slot = merge(i_spin, i_spin+2, hole_ok)
      n(slot) = n(slot) + 1
    enddo
  enddo
 end
 ! Spin
 subroutine extract_list_orb_spin(det,nO_m,nV_m,list_occ,list_vir)
  implicit none
  BEGIN_DOC
  ! Build the lists of occupied and virtual orbitals of det for each spin,
  ! leaving out the core and deleted orbitals.
  ! list_occ(:,1) / list_occ(:,2): occupied alpha / beta
  ! list_vir(:,1) / list_vir(:,2): virtual alpha / beta
  ! nO_m must be max(nOa,nOb) and nV_m must be max(nVa,nVb);
  ! entries that are not filled are left at 0.
  END_DOC
  integer, intent(in)           :: nO_m, nV_m
  integer(bit_kind), intent(in) :: det(N_int,2)
  integer, intent(out)          :: list_occ(nO_m,2), list_vir(nV_m,2)
  integer(bit_kind)             :: tmp_det(N_int,2)
  integer                       :: i_orb, i_spin, cnt_o, cnt_v
  logical                       :: hole_ok
  logical, external             :: is_core, is_del
  list_occ(:,:) = 0
  list_vir(:,:) = 0
  do i_spin = 1, 2
    ! Number of entries already stored for this spin
    cnt_o = 0
    cnt_v = 0
    do i_orb = 1, mo_num
      ! hole_ok is the success flag returned by apply_hole for (i_spin,i_orb)
      call apply_hole(det, i_spin, i_orb, tmp_det, hole_ok, N_int)
      ! Core and deleted orbitals are never listed
      if (is_core(i_orb) .or. is_del(i_orb)) cycle
      if (hole_ok) then
        ! Success: the orbital goes to the occupied list
        cnt_o = cnt_o + 1
        list_occ(cnt_o,i_spin) = i_orb
      else
        ! Failure: the orbital goes to the virtual list
        cnt_v = cnt_v + 1
        list_vir(cnt_v,i_spin) = i_orb
      endif
    enddo
  enddo
 end
 ! Space
 subroutine extract_list_orb_space(det,nO,nV,list_occ,list_vir)
  implicit none
  BEGIN_DOC
  ! Build the lists of occupied and virtual orbitals of det from its alpha
  ! part only, leaving out the core and deleted orbitals.
  ! The run is aborted when elec_alpha_num differs from elec_beta_num.
  ! Entries that are not filled are left at 0.
  END_DOC
  integer, intent(in)           :: nO, nV
  integer(bit_kind), intent(in) :: det(N_int,2)
  integer, intent(out)          :: list_occ(nO), list_vir(nV)
  integer(bit_kind)             :: tmp_det(N_int,2)
  integer                       :: i_orb, cnt_o, cnt_v
  logical                       :: hole_ok
  logical, external             :: is_core, is_del
  if (elec_alpha_num /= elec_beta_num) then
    print*,'Error elec_alpha_num /= elec_beta_num, impossible to create cc_list_occ and cc_list_vir, abort'
    call abort
  endif
  list_occ(:) = 0
  list_vir(:) = 0
  ! Number of entries already stored in each list
  cnt_o = 0
  cnt_v = 0
  do i_orb = 1, mo_num
    ! Spin index 1 is used: only the alpha part of det is inspected
    call apply_hole(det, 1, i_orb, tmp_det, hole_ok, N_int)
    ! Core and deleted orbitals are never listed
    if (is_core(i_orb) .or. is_del(i_orb)) cycle
    if (hole_ok) then
      ! Success: the orbital goes to the occupied list
      cnt_o = cnt_o + 1
      list_occ(cnt_o) = i_orb
    else
      ! Failure: the orbital goes to the virtual list
      cnt_v = cnt_v + 1
      list_vir(cnt_v) = i_orb
    endif
  enddo
 end
 ! is_core
 function is_core(i)
  implicit none
  BEGIN_DOC
  ! True if the orbital i is a core orbital, i.e. if i is found among the
  ! first dim_list_core_orb entries of list_core
  END_DOC
  integer, intent(in) :: i
  logical             :: is_core
  is_core = any(list_core(1:dim_list_core_orb) == i)
 end
 ! is_del
 function is_del(i)
  implicit none
  BEGIN_DOC
  ! True if the orbital i is a deleted orbital, i.e. if i is found among the
  ! first dim_list_del_orb entries of list_del
  END_DOC
  integer, intent(in) :: i
  logical             :: is_del
  is_del = any(list_del(1:dim_list_del_orb) == i)
 end
 ! N orb
 BEGIN_PROVIDER [integer, cc_nO_m]
 &BEGIN_PROVIDER [integer, cc_nOa]
 &BEGIN_PROVIDER [integer, cc_nOb]
 &BEGIN_PROVIDER [integer, cc_nOab]
 &BEGIN_PROVIDER [integer, cc_nV_m]
 &BEGIN_PROVIDER [integer, cc_nVa]
 &BEGIN_PROVIDER [integer, cc_nVb]
 &BEGIN_PROVIDER [integer, cc_nVab]
 &BEGIN_PROVIDER [integer, cc_n_mo]
 &BEGIN_PROVIDER [integer, cc_nO_S, (2)]
 &BEGIN_PROVIDER [integer, cc_nV_S, (2)]
  implicit none
  BEGIN_DOC
  ! Number of orbitals without core and deleted ones of the cc_ref det in psi_det
  ! a: alpha, b: beta
  ! nO_m: max(a,b) occupied
  ! nOa: nb a occupied
  ! nOb: nb b occupied
  ! nOab: nb a+b occupied
  ! nV_m: max(a,b) virtual
  ! nVa: nb a virtual
  ! nVb: nb b virtual
  ! nVab: nb a+b virtual
  ! n_mo: nb a occupied + nb a virtual, i.e. the number of orbitals
  !       that are neither core nor deleted
  ! nO_S: (nOa, nOb)
  ! nV_S: (nVa, nVb)
  END_DOC
  integer :: n_spin(4)
  ! Extract number of occ/vir alpha/beta spin orbitals:
  ! n_spin = (nOa, nOb, nVa, nVb)
  call extract_n_spin(psi_det(1,1,cc_ref),n_spin)
  cc_nOa  = n_spin(1)
  cc_nOb  = n_spin(2)
  cc_nOab = cc_nOa + cc_nOb
  cc_nO_m = max(cc_nOa,cc_nOb)
  cc_nVa  = n_spin(3)
  cc_nVb  = n_spin(4)
  cc_nVab = cc_nVa + cc_nVb
  cc_nV_m = max(cc_nVa,cc_nVb)
  ! cc_n_mo is the dimension of cc_list_gen, which stores every orbital that
  ! is neither core nor deleted: occupied alpha + virtual alpha.
  ! (The former value cc_nVa + cc_nVb under-sized that list when nOa > nVb.)
  cc_n_mo = cc_nOa + cc_nVa
  cc_nO_S = (/cc_nOa,cc_nOb/)
  cc_nV_S = (/cc_nVa,cc_nVb/)
 END_PROVIDER
 ! General
 BEGIN_PROVIDER [integer, cc_list_gen, (cc_n_mo)]
  implicit none
  BEGIN_DOC
  ! Indices of the orbitals that are neither core nor deleted,
  ! stored in increasing order
  END_DOC
  integer           :: i_orb, cnt
  logical, external :: is_core, is_del
  ! Number of orbitals already stored
  cnt = 0
  do i_orb = 1, mo_num
    ! Skip the core and deleted orbitals
    if (is_core(i_orb) .or. is_del(i_orb)) cycle
    cnt = cnt + 1
    cc_list_gen(cnt) = i_orb
  enddo
 END_PROVIDER
 ! Space
 BEGIN_PROVIDER [integer, cc_list_occ, (cc_nOa)]
 &BEGIN_PROVIDER [integer, cc_list_vir, (cc_nVa)]
  implicit none
  BEGIN_DOC
  ! Occupied (cc_list_occ) and virtual (cc_list_vir) spatial orbitals of the
  ! cc_ref determinant of psi_det, core and deleted orbitals excluded.
  ! Both lists are filled by extract_list_orb_space.
  END_DOC
  call extract_list_orb_space(psi_det(1,1,cc_ref), &
                              cc_nOa, cc_nVa,      &
                              cc_list_occ, cc_list_vir)
 END_PROVIDER
 ! Spin
 BEGIN_PROVIDER [integer, cc_list_occ_spin, (cc_nO_m,2)]
 &BEGIN_PROVIDER [integer, cc_list_vir_spin, (cc_nV_m,2)]
 &BEGIN_PROVIDER [logical, cc_ref_is_open_shell]
  implicit none
  BEGIN_DOC
  ! Occupied (cc_list_occ_spin) and virtual (cc_list_vir_spin) spin orbitals
  ! of the cc_ref determinant of psi_det, core and deleted orbitals excluded.
  ! Column 1 is alpha, column 2 is beta.
  ! cc_ref_is_open_shell is true when the alpha and beta occupied lists
  ! differ in at least one position.
  END_DOC
  call extract_list_orb_spin(psi_det(1,1,cc_ref), &
                             cc_nO_m, cc_nV_m,    &
                             cc_list_occ_spin, cc_list_vir_spin)
  ! Position-by-position comparison of the two occupied lists
  cc_ref_is_open_shell = any(cc_list_occ_spin(1:cc_nO_m,1) /= cc_list_occ_spin(1:cc_nO_m,2))
 END_PROVIDER
--- a/devel/ccsd_gpu/phase.irp.f
+++ b/devel/ccsd_gpu/phase.irp.f
@ -0,0 +1,137 @@
 ! phase
 subroutine get_phase_general(det1,det2,phase,degree,Nint)
  use bitmasks
  implicit none
  ! Thin wrapper around get_excitation_general: returns only the phase and
  ! the excitation degree between det1 and det2, and discards the lists of
  ! annihilated and created orbitals.
  !
  ! in:  Nint        number of integers per spin in a determinant
  !      det1, det2  the two determinants
  ! out: phase       phase computed by get_excitation_general
  !      degree      excitation degree computed by get_excitation_general
  integer, intent(in)           :: Nint
  integer(bit_kind), intent(in) :: det1(Nint,2), det2(Nint,2)
  double precision, intent(out) :: phase
  integer, intent(out)          :: degree
  integer :: n(2)
  integer, allocatable :: list_anni(:,:), list_crea(:,:)
  ! The work arrays are sized with the dummy argument Nint so that they match
  ! the shape get_excitation_general declares for them
  ! (Nint*bit_kind_size,2), whatever value of Nint the caller passes.
  allocate(list_anni(Nint*bit_kind_size,2))
  allocate(list_crea(Nint*bit_kind_size,2))
  call get_excitation_general(det1,det2,degree,n,list_anni,list_crea,phase,Nint)
  deallocate(list_anni, list_crea)
 end
 ! Get excitation general
 subroutine get_excitation_general(det1,det2,degree,n,list_anni,list_crea,phase,Nint)
  use bitmasks
  implicit none
  ! Describe the excitation that turns det1 into det2.
  !
  ! out: list_anni  list produced by bitstring_to_list_ab from the bits that
  !                 are set in det1 and not in det2 (annihilated orbitals)
  !      list_crea  list produced by bitstring_to_list_ab from the bits that
  !                 are set in det2 and not in det1 (created orbitals)
  !      n(s)       number of annihilated (= created) orbitals for spin s
  !      degree     n(1) + n(2)
  !      phase      +1 or -1, see the end of the routine
  !
  ! The run is aborted if, for one spin, the numbers of annihilated and
  ! created orbitals differ.
  integer, intent(in) :: Nint
  integer(bit_kind), intent(in)  :: det1(Nint,2), det2(Nint,2)
  double precision, intent(out)  :: phase
  integer, intent(out)           :: list_crea(Nint*bit_kind_size,2)
  integer, intent(out)           :: list_anni(Nint*bit_kind_size,2)
  integer, intent(out)           :: degree, n(2)
  integer, allocatable           :: orbs1(:,:), orbs2(:,:)
  integer(bit_kind), allocatable :: mask_crea(:,:), mask_anni(:,:)
  integer, allocatable           :: rank_anni(:,:), rank_crea(:,:)
  integer :: n_orbs1(2), n_orbs2(2), n_crea(2), n_anni(2)
  integer :: p, s, k, parity

  allocate(orbs1(Nint*bit_kind_size,2), orbs2(Nint*bit_kind_size,2))
  allocate(mask_crea(Nint,2), mask_anni(Nint,2))

  ! Example with det1 = 111010 and det2 = 110101:
  !   created     = (not det1) and det2 = 000101
  !   annihilated = det1 and (not det2) = 001010
  mask_crea(:,:) = iand(not(det1(:,:)), det2(:,:))
  mask_anni(:,:) = iand(det1(:,:), not(det2(:,:)))

  ! Bit strings -> lists of orbital indices, for both spins
  call bitstring_to_list_ab(det1,orbs1,n_orbs1,Nint)
  call bitstring_to_list_ab(det2,orbs2,n_orbs2,Nint)
  call bitstring_to_list_ab(mask_crea,list_crea,n_crea,Nint)
  call bitstring_to_list_ab(mask_anni,list_anni,n_anni,Nint)

  ! For each spin there must be as many created as annihilated orbitals
  if (any(n_crea /= n_anni)) then
    print*,'Well, it seems we have a problem here...'
    call abort
  endif

  n(1:2) = n_anni(1:2)
  degree = n_anni(1) + n_anni(2)

  ! Example:
  !   det1        11110011001 -> orbitals 1 2 3 4 7 8  11  (ranks 1..7)
  !   det2        11100101011 -> orbitals 1 2 3 6 8 10 11  (ranks 1..7)
  !   annihilated 00010010000 -> orbitals 4 7,  ranks 4 5 in det1
  !   created     00000100010 -> orbitals 6 10, ranks 4 6 in det2
  !   so 4 -> 6 is rank 4 -> 4 and 7 -> 10 is rank 5 -> 6
  allocate(rank_anni(max(n(1),n(2)),2))
  allocate(rank_crea(max(n(1),n(2)),2))

  ! Rank of each annihilated orbital in the orbital list of det1
  do s = 1, 2
    k = 1
    do p = 1, n_orbs1(s)
      if (k > n_anni(s)) exit
      if (orbs1(p,s) == list_anni(k,s)) then
        rank_anni(k,s) = p
        k = k + 1
      endif
    enddo
  enddo

  ! Rank of each created orbital in the orbital list of det2
  do s = 1, 2
    k = 1
    do p = 1, n_orbs2(s)
      if (k > n_crea(s)) exit
      if (orbs2(p,s) == list_crea(k,s)) then
        rank_crea(k,s) = p
        k = k + 1
      endif
    enddo
  enddo

  ! The k-th annihilated orbital is paired with the k-th created one, so
  ! that the pairs do not cross; the phase is then (-1) to the power of the
  ! summed rank distances.
  parity = 0
  do s = 1, 2
    parity = parity + sum(abs(rank_anni(1:n(s),s) - rank_crea(1:n(s),s)))
  enddo
  if (mod(parity,2) == 0) then
    phase = 1.d0
  else
    phase = -1.d0
  endif
 end
--- a/devel/ccsd_gpu/update_t.irp.f
+++ b/devel/ccsd_gpu/update_t.irp.f
@ -0,0 +1,73 @@
 ! T1
 subroutine update_t1(nO,nV,f_o,f_v,r1,t1)
  implicit none
  BEGIN_DOC
  ! Update the T1 amplitudes for CC from the residual r1:
  ! t1(i,a) <- t1(i,a) - r1(i,a) / (f_o(i) - f_v(a) - cc_level_shift)
  END_DOC
  ! in
  integer, intent(in)             :: nO, nV
  double precision, intent(in)    :: f_o(nO), f_v(nV), r1(nO, nV)
  ! inout
  double precision, intent(inout) :: t1(nO, nV)
  ! internal
  integer                         :: i_occ, i_vir
  double precision                :: denom_ia
  ! The loop over the second index of t1 is shared among the threads
  !$OMP PARALLEL DO DEFAULT(NONE) &
  !$OMP SHARED(nO,nV,t1,r1,cc_level_shift,f_o,f_v) &
  !$OMP PRIVATE(i_occ,i_vir,denom_ia)
  do i_vir = 1, nV
    do i_occ = 1, nO
      denom_ia = f_o(i_occ) - f_v(i_vir) - cc_level_shift
      t1(i_occ,i_vir) = t1(i_occ,i_vir) - r1(i_occ,i_vir) / denom_ia
    enddo
  enddo
  !$OMP END PARALLEL DO
 end
 ! T2
 subroutine update_t2(nO,nV,f_o,f_v,r2,t2)
  implicit none
  BEGIN_DOC
  ! Update the T2 amplitudes for CC from the residual r2:
  ! t2(i,j,a,b) <- t2(i,j,a,b)
  !              - r2(i,j,a,b) / (f_o(i) + f_o(j) - f_v(a) - f_v(b) - cc_level_shift)
  END_DOC
  ! in
  integer, intent(in)             :: nO, nV
  double precision, intent(in)    :: f_o(nO), f_v(nV), r2(nO, nO, nV, nV)
  ! inout
  double precision, intent(inout) :: t2(nO, nO, nV, nV)
  ! internal
  integer                         :: i_occ, j_occ, i_vir, j_vir
  double precision                :: denom_ijab
  ! The loop over the last index of t2 is shared among the threads
  !$OMP PARALLEL DO DEFAULT(NONE) &
  !$OMP SHARED(nO,nV,t2,r2,cc_level_shift,f_o,f_v) &
  !$OMP PRIVATE(i_occ,j_occ,i_vir,j_vir,denom_ijab)
  do j_vir = 1, nV
    do i_vir = 1, nV
      do j_occ = 1, nO
        do i_occ = 1, nO
          denom_ijab = f_o(i_occ) + f_o(j_occ) - f_v(i_vir) - f_v(j_vir) - cc_level_shift
          t2(i_occ,j_occ,i_vir,j_vir) = t2(i_occ,j_occ,i_vir,j_vir) - r2(i_occ,j_occ,i_vir,j_vir) / denom_ijab
        enddo
      enddo
    enddo
  enddo
  !$OMP END PARALLEL DO
 end
Author	SHA1	Message	Date
Anthony Scemama	971a0ff160	Refactoring	2023-08-04 16:54:48 +02:00
Anthony Scemama	ec6e5fde68	Merge turpan:~/qp2/plugins/qp_plugins_scemama	2023-08-04 16:43:42 +02:00
Anthony Scemama	ac2614a0f3	r1 on GPU	2023-08-04 16:42:46 +02:00
Anthony Scemama	d61ecb35c4	r1 on GPU	2023-08-04 16:19:41 +02:00
Anthony Scemama	a7e0832dae	Starting r1 on GPU	2023-08-04 14:48:08 +02:00
`@ -1,2 +1,2 @@`
	`hartree_fock`	`hartree_fock`
	`utils_cc_gpu`	`determinants`