H_ finished in CCSD

2025-04-16 13:39:27 +02:00 · 2024-07-01 19:00:27 +02:00 · 2024-07-01 19:00:27 +02:00 · 44a7729f65
commit 44a7729f65
parent d3c1994c64
2 changed files with 200 additions and 390 deletions
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@ -181,11 +181,9 @@ subroutine run_ccsd_space_orb

    ! Residue
    if (do_mo_cholesky) then
-      call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, &
-              d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo)
-      call compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, &
-              d_cc_space_v_ov_chol,H_vv)
-      call compute_H_vo_chol(nO,nV,t1%f,H_vo%f)
+      call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo)
+      call compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, d_cc_space_v_ov_chol,H_vv)
+      call compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol, H_vo)

      call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1)
      call compute_r2_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r2%f,max_r2)
@ -316,51 +314,20 @@ subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,
  integer :: i,j,a,b
  double precision :: e

-!  energy = 0d0
-!  !$omp parallel &
-!  !$omp shared(nO,nV,energy,tau_x,t1,&
-!  !$omp d_cc_space_f_vo,d_cc_space_v_oovv) &
-!  !$omp private(i,j,a,b,e) &
-!  !$omp default(none)
-!  e = 0d0
-!  !$omp do
-!  do a = 1, nV
-!    do i = 1, nO
-!      e = e + 2d0 * d_cc_space_f_vo%f(a,i) * t1%f(i,a)
-!    enddo
-!  enddo
-!  !$omp end do nowait
-!  !$omp do
-!  do b = 1, nV
-!    do a = 1, nV
-!      do j = 1, nO
-!        do i = 1, nO
-!          e = e + tau_x%f(i,j,a,b) * d_cc_space_v_oovv%f(i,j,a,b)
-!       enddo
-!      enddo
-!    enddo
-!  enddo
-!  !$omp end do nowait
-!  !$omp critical
-!  energy = energy + e
-!  !$omp end critical
-!  !$omp end parallel
+  type(gpu_stream) :: s1, s2
+  call gpu_stream_create(s1)
+  call gpu_stream_create(s2)

+  call gpu_set_stream(blas_handle,s1)
+  call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo, 1, t1, 1, e)

-   type(gpu_stream) :: s1, s2
-   call gpu_stream_create(s1)
-   call gpu_stream_create(s2)
+  call gpu_set_stream(blas_handle,s2)
+  call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1_8, d_cc_space_v_oovv, 1_8, energy)
+  call gpu_set_stream(blas_handle,gpu_default_stream)

-   call gpu_set_stream(blas_handle,s1)
-   call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo, 1, t1, 1, e)
-
-   call gpu_set_stream(blas_handle,s2)
-   call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1_8, d_cc_space_v_oovv, 1_8, energy)
-   call gpu_synchronize()
-   call gpu_set_stream(blas_handle,gpu_default_stream)
-
-   call gpu_stream_destroy(s1)
-   call gpu_stream_destroy(s2)
+  call gpu_synchronize()
+  call gpu_stream_destroy(s1)
+  call gpu_stream_destroy(s2)

   energy = energy + 2.d0*e

@ -384,27 +351,9 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau)
  ! internal
  integer                       :: i,j,a,b

-!  !$OMP PARALLEL &
-!  !$OMP SHARED(nO,nV,tau,t2,t1,h_t1) &
-!  !$OMP PRIVATE(i,j,a,b) &
-!  !$OMP DEFAULT(NONE)
-!  !$OMP DO
-!  do b = 1, nV
-!    do a = 1, nV
-!      do j = 1, nO
-!        do i = 1, nO
-!          tau%f(i,j,a,b) = t2%f(i,j,a,b) + t1%f(i,a) * h_t1(j,b)
-!        enddo
-!      enddo
-!    enddo
-!  enddo
-!  !$OMP END DO
-!  !$OMP END PARALLEL
-
-
  type(gpu_stream) :: stream(nV)

-  !$OMP PARALLEL if (gpu_num == 0) &
+  !$OMP PARALLEL &
  !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) &
  !$OMP PRIVATE(i,j,a,b) &
  !$OMP DEFAULT(NONE)
@ -422,6 +371,8 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau)
  !$OMP END DO
  !$OMP END PARALLEL

+  call gpu_synchronize()
+
  do b=1,nV
    call gpu_stream_destroy(stream(b))
  enddo
@ -444,32 +395,15 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x)
  ! internal
  integer                       :: i,j,a,b

-!  !$OMP PARALLEL &
-!  !$OMP SHARED(nO,nV,tau,tau_x) &
-!  !$OMP PRIVATE(i,j,a,b) &
-!  !$OMP DEFAULT(NONE)
-!  !$OMP DO
-!  do b = 1, nV
-!    do a = 1, nV
-!      do j = 1, nO
-!        do i = 1, nO
-!          tau_x%f(i,j,a,b) = 2.d0*tau%f(i,j,a,b) - tau%f(i,j,b,a)
-!        enddo
-!      enddo
-!    enddo
-!  enddo
-!  !$OMP END DO
-!  !$OMP END PARALLEL
-
  type(gpu_stream) :: stream(nV)

  do a=1,nV
    call gpu_stream_create(stream(a))
  enddo

-  !$OMP PARALLEL if (gpu_num == 0) &
+  !$OMP PARALLEL &
  !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) &
-  !$OMP PRIVATE(i,j,a,b) &
+  !$OMP PRIVATE(a,b) &
  !$OMP DEFAULT(NONE)
  !$OMP DO
  do b=1,nV
@ -484,10 +418,12 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x)
  !$OMP END DO
  !$OMP END PARALLEL

+  call gpu_set_stream(blas_handle,gpu_default_stream)
+  call gpu_synchronize()
+
  do b=1,nV
    call gpu_stream_destroy(stream(b))
  enddo
-  call gpu_set_stream(blas_handle,gpu_default_stream)


 end
--- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
@ -1,81 +1,200 @@
-subroutine ccsd_energy_space_chol(nO,nV,tau,t1,energy)
+! H_oo

+subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, &
+    d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo)
+  use gpu
  implicit none

-  integer, intent(in)           :: nO, nV
-  double precision, intent(in)  :: tau(nO,nO,nV,nV)
-  double precision, intent(in)  :: t1(nO,nV)
-  double precision, intent(out) :: energy
+  integer, intent(in)           :: nO,nV
+  type(gpu_double2), intent(in)    :: d_cc_space_f_oo
+  type(gpu_double3), intent(in)    :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol
+  type(gpu_double4), intent(in)    :: tau_x
+  type(gpu_double2), intent(out)   :: H_oo

-  ! internal
-  integer :: i,j,a,b
-  double precision :: e
+  integer :: a,b,i,j,u,k

-  energy = 0d0
-  !$omp parallel &
-  !$omp shared(nO,nV,energy,tau,t1,&
-  !$omp cc_space_f_vo,cc_space_w_oovv) &
-  !$omp private(i,j,a,b,e) &
-  !$omp default(none)
-  e = 0d0
-  !$omp do
-  do a = 1, nV
-    do i = 1, nO
-      e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a)
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp do
-  do b = 1, nV
-    do a = 1, nV
-      do j = 1, nO
-        do i = 1, nO
-          e = e + tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b)
-       enddo
-      enddo
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp critical
-  energy = energy + e
-  !$omp end critical
-  !$omp end parallel
+  type(gpu_double3) :: tau_kau, tmp_vov, tmp_ovv

-end
+  call gpu_allocate(tau_kau, cholesky_mo_num, nV, nO)

-! Tau
+  type(gpu_blas) :: blas

-subroutine update_tau_space_chol(nO,nV,t1,t2,tau)

-  implicit none
+  !$OMP PARALLEL  &
+  !$OMP DEFAULT(SHARED) &
+  !$OMP PRIVATE(blas,u,b,tmp_vov,tmp_ovv)

-  ! in
-  integer, intent(in)           :: nO, nV
-  double precision, intent(in)  :: t1(nO,nV), t2(nO,nO,nV,nV)
+  !$OMP SINGLE
+  !$OMP TASK
+  call gpu_copy(d_cc_space_f_oo, H_oo)
+  !$OMP END TASK
+  !$OMP END SINGLE

-  ! out
-  double precision, intent(out) :: tau(nO,nO,nV,nV)
+  call gpu_allocate(tmp_ovv, nO, nV, nV)
+  call gpu_allocate(tmp_vov, nV, nO, nV)

-  ! internal
-  integer                       :: i,j,a,b
+  call gpu_blas_create(blas)

-  !$OMP PARALLEL &
-  !$OMP SHARED(nO,nV,tau,t2,t1) &
-  !$OMP PRIVATE(i,j,a,b) &
-  !$OMP DEFAULT(NONE)
  !$OMP DO
-  do b = 1, nV
-    do a = 1, nV
-      do j = 1, nO
-        do i = 1, nO
-          tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b)
-        enddo
-      enddo
+  do u=1,nO
+    call gpu_dgeam_f(blas, 'N', 'N', 1, nO*nV*nV, 1.d0, &
+           tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f, nO, tmp_ovv%f, 1)
+    do b=1,nV
+      call gpu_dgeam_f(blas, 'T', 'T', nV, nO, 1.d0, &
+           tmp_ovv%f(1,1,b), nO, 0.d0, &
+           tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV)
    enddo
+    call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
+      d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_vov%f, nV, &
+      0.d0, tau_kau%f(1,1,u), cholesky_mo_num)
  enddo
  !$OMP END DO
+
+  call gpu_blas_destroy(blas)
+
+  call gpu_deallocate(tmp_vov)
+  call gpu_deallocate(tmp_ovv)
+
+  !$OMP TASKWAIT
  !$OMP END PARALLEL

+  call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
+    tau_kau, cholesky_mo_num*nV,  d_cc_space_v_vo_chol, cholesky_mo_num*nV, &
+    1.d0, H_oo, nO)
+
+  call gpu_synchronize()
+  call gpu_deallocate(tau_kau)
+end
+
+! H_vv
+
+subroutine compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, &
+         d_cc_space_v_ov_chol,H_vv)
+  use gpu
+  implicit none
+
+  integer, intent(in)              :: nO,nV
+  type(gpu_double2), intent(in)    :: d_cc_space_f_vv
+  type(gpu_double3), intent(in)    :: d_cc_space_v_ov_chol
+  type(gpu_double4), intent(in)    :: tau_x
+  type(gpu_double2), intent(out)   :: H_vv
+
+  integer :: a,b,i,j,u,k, beta
+
+  type(gpu_double3) :: tau_kia, tmp_oov
+
+  call gpu_allocate(tau_kia, cholesky_mo_num, nO, nV)
+
+  type(gpu_blas) :: blas
+
+  !$OMP PARALLEL &
+  !$OMP DEFAULT(SHARED) &
+  !$OMP PRIVATE(a,b,tmp_oov,blas)
+
+  !$OMP SINGLE
+  !$OMP TASK
+  call gpu_copy(d_cc_space_f_vv, H_vv)
+  !$OMP END TASK
+  !$OMP END SINGLE
+
+  call gpu_blas_create(blas)
+  call gpu_allocate(tmp_oov, nO, nO, nV)
+
+  !$OMP DO
+  do a = 1, nV
+    do b=1,nV
+      call gpu_dgeam_f(blas, 'N', 'N', nO, nO, 1.d0, &
+        tau_x%f(1,1,a,b), nO, 0.d0, &
+        tau_x%f(1,1,a,b), nO, tmp_oov%f(1,1,b), nO)
+    enddo
+    call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nO,nO*nV,1.d0, &
+      d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov%f, nO, &
+      0.d0, tau_kia%f(1,1,a), cholesky_mo_num)
+  enddo
+  !$OMP END DO
+
+  call gpu_blas_destroy(blas)
+
+  call gpu_deallocate(tmp_oov)
+  !$OMP TASKWAIT
+  !$OMP END PARALLEL
+
+  call gpu_dgemm(blas_handle,'T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
+    tau_kia, cholesky_mo_num*nO,  d_cc_space_v_ov_chol, cholesky_mo_num*nO, &
+    1.d0, H_vv, nV)
+
+  call gpu_synchronize()
+  call gpu_deallocate(tau_kia)
+end
+
+! H_vo
+subroutine compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, &
+         d_cc_space_v_ov_chol,d_cc_space_v_vo_chol, H_vo)
+  use gpu
+  implicit none
+
+  integer, intent(in)            :: nO,nV
+  type(gpu_double2), intent(in)  :: t1, d_cc_space_f_vo
+  type(gpu_double3), intent(in)  :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol
+  type(gpu_double2), intent(out) :: H_vo
+
+  integer :: a,b,i,j,u,k
+
+  type(gpu_double1) :: tmp_k
+  type(gpu_double3) :: tmp, tmp2
+
+  call gpu_copy(d_cc_space_f_vo, H_vo)
+
+  call gpu_allocate(tmp_k, cholesky_mo_num)
+
+  call gpu_dgemm(blas_handle, 'N', 'N', cholesky_mo_num, 1, nO*nV, 2.d0, &
+     d_cc_space_v_ov_chol, cholesky_mo_num, &
+     t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num)
+
+  call gpu_dgemm(blas_handle, 'T','N',nV*nO,1,cholesky_mo_num,1.d0, &
+      d_cc_space_v_vo_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, &
+      H_vo, nV*nO)
+
+  call gpu_deallocate(tmp_k)
+
+
+  call gpu_allocate(tmp,  cholesky_mo_num, nO, nO)
+
+  call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nO, nO, nV, 1.d0, &
+    d_cc_space_v_ov_chol, cholesky_mo_num*nO, t1, nO, 0.d0, tmp, cholesky_mo_num*nO)
+
+  call gpu_allocate(tmp2, cholesky_mo_num, nO, nO)
+
+  type(gpu_stream) :: stream(nO)
+  do i=1,nO
+    call gpu_stream_create(stream(i))
+  enddo
+
+  !$OMP PARALLEL DO COLLAPSE(2) PRIVATE(i,j)
+  do i=1,nO
+    do j=1,nO
+      call gpu_set_stream(blas_handle,stream(j))
+      call gpu_dgeam_f(blas_handle, 'N', 'N', cholesky_mo_num, 1, 1.d0, &
+        tmp%f(1,i,j), cholesky_mo_num, 0.d0, &
+        tmp%f(1,i,j), cholesky_mo_num, tmp2%f(1,j,i), cholesky_mo_num)
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+  call gpu_set_stream(blas_handle,gpu_default_stream)
+  call gpu_synchronize()
+
+  do i=1,nO
+    call gpu_stream_destroy(stream(i))
+  enddo
+  call gpu_deallocate(tmp)
+
+  call gpu_dgemm(blas_handle, 'T','N', nV, nO, cholesky_mo_num*nO, -1.d0, &
+    d_cc_space_v_ov_chol, cholesky_mo_num*nO, tmp2, cholesky_mo_num*nO, &
+    1.d0, H_vo, nV)
+
+  call gpu_synchronize()
+  call gpu_deallocate(tmp2)
 end

 ! R1
@ -291,251 +410,6 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)

 end

-! H_oo
-
-subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, &
-    d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo)
-  use gpu
-  implicit none
-
-  integer, intent(in)           :: nO,nV
-  type(gpu_double2), intent(in)    :: d_cc_space_f_oo
-  type(gpu_double3), intent(in)    :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol
-  type(gpu_double4), intent(in)    :: tau_x
-  type(gpu_double2), intent(out)   :: H_oo
-
-  integer :: a,b,i,j,u,k
-
-  type(gpu_double3) :: tau_kau, tmp_vov, tmp_ovv
-
-  call gpu_allocate(tau_kau, cholesky_mo_num, nV, nO)
-
-!  !$omp parallel &
-!  !$omp default(shared) &
-!  !$omp private(i,u,j,k,a,b,tmp_vov)
-!  call gpu_allocate(tmp_vov, nV, nO, nV)
-!  !$omp do
-!  do u = 1, nO
-!    do b=1,nV
-!      do j=1,nO
-!        do a=1,nV
-!          tmp_vov%f(a,j,b) = tau_x%f(u,j,a,b)
-!        enddo
-!      enddo
-!    enddo
-!    call dgemm('N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
-!      d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, tmp_vov%f, nV, &
-!      0.d0, tau_kau%f(1,1,u), cholesky_mo_num)
-!  enddo
-!  !$omp end do nowait
-!  call gpu_deallocate(tmp_vov)
-!  !$omp do
-!  do i = 1, nO
-!    do u = 1, nO
-!      H_oo%f(u,i) = d_cc_space_f_oo%f(u,i)
-!    enddo
-!  enddo
-!  !$omp end do nowait
-!
-!  !$omp barrier
-!  !$omp end  parallel
-!  call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
-!    tau_kau%f(1,1,1), cholesky_mo_num*nV,  d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num*nV, &
-!    1.d0, H_oo%f(1,1), nO)
-!
-
-  type(gpu_blas) :: blas
-
-
-  !$OMP PARALLEL  &
-  !$OMP DEFAULT(SHARED) &
-  !$OMP PRIVATE(blas,u,b,tmp_vov,tmp_ovv)
-
-  !$OMP SINGLE
-  !$OMP TASK
-  call gpu_copy(d_cc_space_f_oo, H_oo)
-  !$OMP END TASK
-  !$OMP END SINGLE
-
-  call gpu_allocate(tmp_ovv, nO, nV, nV)
-  call gpu_allocate(tmp_vov, nV, nO, nV)
-
-  call gpu_blas_create(blas)
-
-  !$OMP DO
-  do u=1,nO
-    call gpu_dgeam_f(blas, 'N', 'N', 1, nO*nV*nV, 1.d0, &
-           tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f, nO, tmp_ovv%f, 1)
-    do b=1,nV
-      call gpu_dgeam_f(blas, 'T', 'T', nV, nO, 1.d0, &
-           tmp_ovv%f(1,1,b), nO, 0.d0, &
-           tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV)
-    enddo
-    call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
-      d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_vov%f, nV, &
-      0.d0, tau_kau%f(1,1,u), cholesky_mo_num)
-  enddo
-  !$OMP END DO
-
-  call gpu_blas_destroy(blas)
-
-  call gpu_deallocate(tmp_vov)
-  call gpu_deallocate(tmp_ovv)
-
-  !$OMP TASKWAIT
-  !$OMP END PARALLEL
-
-  call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
-    tau_kau, cholesky_mo_num*nV,  d_cc_space_v_vo_chol, cholesky_mo_num*nV, &
-    1.d0, H_oo, nO)
-
-  call gpu_deallocate(tau_kau)
-end
-
-! H_vv
-
-subroutine compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, &
-         d_cc_space_v_ov_chol,H_vv)
-  use gpu
-  implicit none
-
-  integer, intent(in)              :: nO,nV
-  type(gpu_double2), intent(in)    :: d_cc_space_f_vv
-  type(gpu_double3), intent(in)    :: d_cc_space_v_ov_chol
-  type(gpu_double4), intent(in)    :: tau_x
-  type(gpu_double2), intent(out)   :: H_vv
-
-  integer :: a,b,i,j,u,k, beta
-
-  type(gpu_double3) :: tau_kia, tmp_oov
-
-  call gpu_allocate(tau_kia, cholesky_mo_num, nO, nV)
-
-!  !$omp parallel &
-!  !$omp default(shared) &
-!  !$omp private(i,beta,j,k,a,b,tmp_oov)
-!  allocate(tmp_oov(nO,nO,nV) )
-!  !$omp do
-!  do a = 1, nV
-!    do b=1,nV
-!      do j=1,nO
-!        do i=1,nO
-!          tmp_oov(i,j,b) = tau_x%f(i,j,a,b)
-!        enddo
-!      enddo
-!    enddo
-!    call dgemm('N','T',cholesky_mo_num,nO,nO*nV,1.d0, &
-!      d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov, nO, &
-!      0.d0, tau_kia(1,1,a), cholesky_mo_num)
-!  enddo
-!  !$omp end do nowait
-!  deallocate(tmp_oov)
-
-!  !$omp do
-!  do beta = 1, nV
-!    do a = 1, nV
-!      H_vv%f(a,beta) = cc_space_f_vv(a,beta)
-!    enddo
-!  enddo
-!  !$omp end do nowait
-!  !$omp barrier
-!  !$omp end  parallel
-!  call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
-!    tau_kia, cholesky_mo_num*nO,  d_cc_space_v_ov_chol%f, cholesky_mo_num*nO, &
-!    1.d0, H_vv%f, nV)
-
-  type(gpu_blas) :: blas
-
-
-  PROVIDE gpu_num
-  !$OMP PARALLEL &
-  !$OMP DEFAULT(SHARED) &
-  !$OMP PRIVATE(a,b,tmp_oov,blas)
-
-  !$OMP SINGLE
-  !$OMP TASK
-  call gpu_copy(d_cc_space_f_vv, H_vv)
-  !$OMP END TASK
-  !$OMP END SINGLE
-
-  call gpu_blas_create(blas)
-  call gpu_allocate(tmp_oov, nO, nO, nV)
-
-  !$OMP DO
-  do a = 1, nV
-    do b=1,nV
-      call gpu_dgeam_f(blas, 'N', 'N', nO, nO, 1.d0, &
-        tau_x%f(1,1,a,b), nO, 0.d0, &
-        tau_x%f(1,1,a,b), nO, tmp_oov%f(1,1,b), nO)
-    enddo
-    call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nO,nO*nV,1.d0, &
-      d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov%f, nO, &
-      0.d0, tau_kia%f(1,1,a), cholesky_mo_num)
-  enddo
-  !$OMP END DO
-
-  call gpu_blas_destroy(blas)
-
-  call gpu_deallocate(tmp_oov)
-  !$OMP TASKWAIT
-  !$OMP END PARALLEL
-
-  call gpu_dgemm(blas_handle,'T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
-    tau_kia, cholesky_mo_num*nO,  d_cc_space_v_ov_chol, cholesky_mo_num*nO, &
-    1.d0, H_vv, nV)
-
-  call gpu_deallocate(tau_kia)
-end
-
-! H_vo
-subroutine compute_H_vo_chol(nO,nV,t1,H_vo)
-
-  implicit none
-
-  integer, intent(in)           :: nO,nV
-  double precision, intent(in)  :: t1(nO, nV)
-  double precision, intent(out) :: H_vo(nV, nO)
-
-  integer :: a,b,i,j,u,k
-
-  double precision, allocatable :: tmp_k(:), tmp(:,:,:), tmp2(:,:,:)
-  do i=1,nO
-    do a=1,nV
-      H_vo(a,i) = cc_space_f_vo(a,i)
-    enddo
-  enddo
-
-  allocate(tmp_k(cholesky_mo_num))
-  call dgemm('N', 'N', cholesky_mo_num, 1, nO*nV, 2.d0, &
-     cc_space_v_ov_chol, cholesky_mo_num, &
-     t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num)
-
-  call dgemm('T','N',nV*nO,1,cholesky_mo_num,1.d0, &
-      cc_space_v_vo_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, &
-      H_vo, nV*nO)
-  deallocate(tmp_k)
-
-  allocate(tmp(cholesky_mo_num,nO,nO))
-  allocate(tmp2(cholesky_mo_num,nO,nO))
-
-  call dgemm('N','T', cholesky_mo_num*nO, nO, nV, 1.d0, &
-    cc_space_v_ov_chol, cholesky_mo_num*nO, t1, nO, 0.d0, tmp, cholesky_mo_num*nO)
-
-  do i=1,nO
-    do j=1,nO
-      do k=1,cholesky_mo_num
-        tmp2(k,j,i) = tmp(k,i,j)
-      enddo
-    enddo
-  enddo
-  deallocate(tmp)
-
-  call dgemm('T','N', nV, nO, cholesky_mo_num*nO, -1.d0, &
-    cc_space_v_ov_chol, cholesky_mo_num*nO, tmp2, cholesky_mo_num*nO, &
-    1.d0, H_vo, nV)
-
-end
-

 ! R2