From b6b169c1cd5ad510364cf5b33800cad8a6b5272c Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Thu, 20 Jun 2024 17:27:21 +0200 Subject: [PATCH 01/38] Updated documentation --- src/ao_two_e_ints/cholesky.irp.f | 3 +++ src/utils/linear_algebra.irp.f | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f index 41cdb80d..cdd64a8c 100644 --- a/src/ao_two_e_ints/cholesky.irp.f +++ b/src/ao_two_e_ints/cholesky.irp.f @@ -25,7 +25,10 @@ END_PROVIDER ! Last dimension of cholesky_ao is cholesky_ao_num ! ! https://mogp-emulator.readthedocs.io/en/latest/methods/proc/ProcPivotedCholesky.html + ! ! https://doi.org/10.1016/j.apnum.2011.10.001 : Page 4, Algorithm 1 + ! + ! https://www.diva-portal.org/smash/get/diva2:396223/FULLTEXT01.pdf END_DOC integer*8 :: ndim8 diff --git a/src/utils/linear_algebra.irp.f b/src/utils/linear_algebra.irp.f index 20386b30..4e7ca87d 100644 --- a/src/utils/linear_algebra.irp.f +++ b/src/utils/linear_algebra.irp.f @@ -1856,7 +1856,7 @@ subroutine pivoted_cholesky( A, rank, tol, ndim, U) ! ! matrix A is destroyed inside this subroutine ! Cholesky vectors are stored in U -! dimension of U: U(1:rank, 1:n) +! dimension of U: U(1:n, 1:rank) ! U is allocated inside this subroutine ! rank is the number of Cholesky vectors depending on tol ! From 5d80cb7b2dd53bdd9eb713e507912e6fce3cadd7 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Thu, 27 Jun 2024 12:06:06 +0200 Subject: [PATCH 02/38] Separated gpu and gpu_arch --- configure | 8 ++-- src/ccsd/NEED | 1 + src/ccsd/ccsd_space_orb_sub.irp.f | 14 +++++-- src/gpu/NEED | 1 + src/gpu/README.rst | 6 +++ src/{gpu_x86 => gpu}/gpu.h | 0 src/{gpu_x86 => gpu}/gpu_module.F90 | 59 +++++++++++++++-------------- src/gpu_x86/gpu.c | 2 +- 8 files changed, 54 insertions(+), 37 deletions(-) create mode 100644 src/gpu/NEED create mode 100644 src/gpu/README.rst rename src/{gpu_x86 => gpu}/gpu.h (100%) rename src/{gpu_x86 => gpu}/gpu_module.F90 (74%) diff --git a/configure b/configure index 014275eb..db158966 100755 --- a/configure +++ b/configure @@ -115,19 +115,19 @@ while getopts "d:c:i:g:h" c ; do done # Handle GPU acceleration -rm -f ${QP_ROOT}/src/gpu +rm -f ${QP_ROOT}/src/gpu_arch case "$GPU" in amd) # Nvidia echo "Activating AMD GPU acceleration" - ln -s ${QP_ROOT}/src/gpu_amd ${QP_ROOT}/src/gpu + ln -s ${QP_ROOT}/src/gpu_amd ${QP_ROOT}/src/gpu_arch ;; nvidia) # Nvidia echo "Activating Nvidia GPU acceleration" - ln -s ${QP_ROOT}/src/gpu_nvidia ${QP_ROOT}/src/gpu + ln -s ${QP_ROOT}/src/gpu_nvidia ${QP_ROOT}/src/gpu_arch ;; *) # No Acceleration echo "Disabling GPU acceleration" - ln -s ${QP_ROOT}/src/gpu_x86 ${QP_ROOT}/src/gpu + ln -s ${QP_ROOT}/src/gpu_x86 ${QP_ROOT}/src/gpu_arch ;; esac diff --git a/src/ccsd/NEED b/src/ccsd/NEED index e6e6bc59..8298f28e 100644 --- a/src/ccsd/NEED +++ b/src/ccsd/NEED @@ -1,2 +1,3 @@ +gpu hartree_fock utils_cc diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 9d4ae7f9..84aab08a 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -1,4 +1,5 @@ subroutine run_ccsd_space_orb + use gpu implicit none @@ -11,7 +12,7 @@ subroutine run_ccsd_space_orb double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:), tau(:,:,:,:), tau_x(:,:,:,:) double precision, allocatable :: t1(:,:), r1(:,:) - double precision, allocatable :: H_oo(:,:), H_vv(:,:), H_vo(:,:) + double precision, pointer :: H_oo, H_vv, H_vo double precision, allocatable :: all_err(:,:), all_t(:,:) integer, allocatable :: list_occ(:), list_vir(:) @@ -55,7 +56,10 @@ subroutine run_ccsd_space_orb allocate(tau(nO,nO,nV,nV)) allocate(tau_x(nO,nO,nV,nV)) allocate(t1(nO,nV), r1(nO,nV)) - allocate(H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO)) + + call gpu_allocate_double(H_oo, (/ nO, nO /) ) + call gpu_allocate_double(H_vv, (/ nV, nV /) ) + call gpu_allocate_double(H_vo, (/ nV, nO /) ) if (cc_update_method == 'diis') then double precision :: rss, diis_mem, extra_mem @@ -191,7 +195,11 @@ subroutine run_ccsd_space_orb deallocate(all_err,all_t) endif - deallocate(H_vv,H_oo,H_vo,r1,r2,tau) + call gpu_deallocate_double(H_oo) + call gpu_deallocate_double(H_vv) + call gpu_deallocate_double(H_vo) + + deallocate(r1,r2,tau) ! CCSD(T) double precision :: e_t, e_t_err diff --git a/src/gpu/NEED b/src/gpu/NEED new file mode 100644 index 00000000..c2af78d2 --- /dev/null +++ b/src/gpu/NEED @@ -0,0 +1 @@ +gpu_arch diff --git a/src/gpu/README.rst b/src/gpu/README.rst new file mode 100644 index 00000000..17ee28a0 --- /dev/null +++ b/src/gpu/README.rst @@ -0,0 +1,6 @@ +=== +gpu +=== + +Bindings for GPU routines (architecture independent). +Architecture-dependent files are in gpu_arch. diff --git a/src/gpu_x86/gpu.h b/src/gpu/gpu.h similarity index 100% rename from src/gpu_x86/gpu.h rename to src/gpu/gpu.h diff --git a/src/gpu_x86/gpu_module.F90 b/src/gpu/gpu_module.F90 similarity index 74% rename from src/gpu_x86/gpu_module.F90 rename to src/gpu/gpu_module.F90 index 86ba3926..f35ebc97 100644 --- a/src/gpu_x86/gpu_module.F90 +++ b/src/gpu/gpu_module.F90 @@ -1,5 +1,5 @@ module gpu - use, intrinsic :: iso_c_binding, only : c_int32_t, c_int64_t, c_double, c_size_t, c_char + use, intrinsic :: iso_c_binding implicit none interface @@ -17,7 +17,7 @@ module gpu integer(c_int64_t), value :: n end subroutine - subroutine gpu_free_c(ptr) bind(C, name='gpu_free') + subroutine gpu_deallocate_c(ptr) bind(C, name='gpu_deallocate') import type(c_ptr) :: ptr end subroutine @@ -89,53 +89,54 @@ module gpu end interface + contains + + + subroutine gpu_allocate_double(ptr, s) + implicit none + double precision, pointer, intent(inout) :: ptr + integer, intent(in) :: s(:) + type(c_ptr) :: cptr + + call gpu_allocate_c(cptr, sum(s*1_8)*8_8) + call c_f_pointer(cptr, ptr, s) + end subroutine + + subroutine gpu_deallocate_double(ptr) + implicit none + double precision, pointer, intent(inout) :: ptr + type(c_ptr) :: cptr + cptr = c_loc(ptr) + call gpu_deallocate(cptr) + NULLIFY(ptr) + end subroutine + end module -subroutine gpu_allocate_double(ptr, s) - use gpu - implicit none - double precision, pointer, intent(inout) :: ptr - integer*8, intent(in) :: s(*) - type(c_ptr) :: cptr - - call gpu_allocate_c(cptr, sum(s)*8_8) - call c_f_pointer(cptr, ptr, s) -end subroutine - -subroutine gpu_free_double(ptr) - use gpu - implicit none - double precision, pointer, intent(inout) :: ptr - type(c_ptr) :: cptr - cptr = cloc(ptr) - call gpu_free(cptr) - NULLIFY(ptr) -end subroutine - subroutine gpu_upload_double(cpu_ptr, gpu_ptr, n) use gpu implicit none double precision, intent(in) :: cpu_ptr(*) - double precision, intent(out) :: gpu_ptr(*) + double precision, intent(in) :: gpu_ptr(*) integer(c_int64_t), intent(in) :: n - call gpu_upload_c(cpu_ptr, gpu_ptr, 8_8*n) + call gpu_upload_c(c_loc(cpu_ptr), c_loc(gpu_ptr), 8_8*n) end subroutine subroutine gpu_download_double(gpu_ptr, cpu_ptr, n) use gpu implicit none double precision, intent(in) :: gpu_ptr(*) - double precision, intent(out) :: cpu_ptr(*) + double precision, intent(in) :: cpu_ptr(*) integer(c_int64_t), intent(in) :: n - call gpu_download_c(gpu_ptr, cpu_ptr, 8_8*n) + call gpu_download_c(c_loc(gpu_ptr), c_loc(cpu_ptr), 8_8*n) end subroutine subroutine gpu_copy_double(gpu_ptr_src, gpu_ptr_dest, n) use gpu implicit none double precision, intent(in) :: gpu_ptr_src(*) - double precision, intent(out) :: gpu_ptr_dest(*) + double precision, intent(in) :: gpu_ptr_dest(*) integer(c_int64_t), intent(in) :: n - call gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, 8_8*n) + call gpu_copy_c(c_loc(gpu_ptr_src), c_loc(gpu_ptr_dest), 8_8*n) end subroutine diff --git a/src/gpu_x86/gpu.c b/src/gpu_x86/gpu.c index 71505dbe..41ede396 100644 --- a/src/gpu_x86/gpu.c +++ b/src/gpu_x86/gpu.c @@ -25,7 +25,7 @@ void gpu_allocate(void** ptr, const int64_t n) { } } -void gpu_free(void** ptr) { +void gpu_deallocate(void** ptr) { free(*ptr); *ptr = NULL; } From 6c02ac0f0b05ea3cc16e0fde66e23c9a0de14246 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Thu, 27 Jun 2024 12:07:48 +0200 Subject: [PATCH 03/38] Separated gpu and gpu_arch --- src/gpu/gpu_module.F90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90 index f35ebc97..43754454 100644 --- a/src/gpu/gpu_module.F90 +++ b/src/gpu/gpu_module.F90 @@ -107,7 +107,7 @@ module gpu double precision, pointer, intent(inout) :: ptr type(c_ptr) :: cptr cptr = c_loc(ptr) - call gpu_deallocate(cptr) + call gpu_deallocate_c(cptr) NULLIFY(ptr) end subroutine From fa6d1419496d271a4715efc776790ce7fc152064 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Thu, 27 Jun 2024 15:45:52 +0200 Subject: [PATCH 04/38] Introducing GPU in CCSD --- src/ccsd/ccsd_space_orb_sub.irp.f | 224 +++++++----- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 12 +- src/gpu/gpu_module.F90 | 450 ++++++++++++++++++++++--- src/gpu_x86/gpu.c | 48 +-- 4 files changed, 570 insertions(+), 164 deletions(-) diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 84aab08a..455d62f7 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -10,9 +10,9 @@ subroutine run_ccsd_space_orb double precision :: uncorr_energy,energy, max_elem, max_r, max_r1, max_r2,ta,tb logical :: not_converged - double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:), tau(:,:,:,:), tau_x(:,:,:,:) - double precision, allocatable :: t1(:,:), r1(:,:) - double precision, pointer :: H_oo, H_vv, H_vo + type(gpu_double4) :: t2, r2, tau, tau_x + type(gpu_double2) :: t1, r1 + type(gpu_double2) :: H_oo, H_vv, H_vo double precision, allocatable :: all_err(:,:), all_t(:,:) integer, allocatable :: list_occ(:), list_vir(:) @@ -52,14 +52,15 @@ subroutine run_ccsd_space_orb !print*,'occ',list_occ !print*,'vir',list_vir - allocate(t2(nO,nO,nV,nV), r2(nO,nO,nV,nV)) - allocate(tau(nO,nO,nV,nV)) - allocate(tau_x(nO,nO,nV,nV)) - allocate(t1(nO,nV), r1(nO,nV)) - - call gpu_allocate_double(H_oo, (/ nO, nO /) ) - call gpu_allocate_double(H_vv, (/ nV, nV /) ) - call gpu_allocate_double(H_vo, (/ nV, nO /) ) + call gpu_allocate(t2, nO,nO,nV,nV) + call gpu_allocate(r2, nO,nO,nV,nV) + call gpu_allocate(tau, nO,nO,nV,nV) + call gpu_allocate(tau_x, nO,nO,nV,nV) + call gpu_allocate(t1, nO,nV) + call gpu_allocate(r1, nO,nV) + call gpu_allocate(H_oo, nO, nO) + call gpu_allocate(H_vo, nV, nO) + call gpu_allocate(H_vv, nV, nV) if (cc_update_method == 'diis') then double precision :: rss, diis_mem, extra_mem @@ -101,14 +102,21 @@ subroutine run_ccsd_space_orb endif ! Init - call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,t1) - call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,t2) - call update_tau_space(nO,nV,t1,t2,tau) + double precision, allocatable :: h_t1(:,:), h_t2(:,:,:,:) + allocate(h_t1(nO,nV), h_t2(nO,nO,nV,nV)) + + call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,h_t1) + call gpu_upload(h_t1, t1) + + call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,h_t2) + call gpu_upload(h_t2, t2) + + call update_tau_space(nO,nV,h_t1,t1,t2,tau) call update_tau_x_space(nO,nV,tau,tau_x) !print*,'hf_energy', hf_energy call det_energy(det,uncorr_energy) print*,'Det energy', uncorr_energy - call ccsd_energy_space_x(nO,nV,tau_x,t1,energy) + call ccsd_energy_space_x(nO,nV,tau_x%f,t1%f,energy) print*,'Guess energy', uncorr_energy+energy, energy nb_iter = 0 @@ -127,40 +135,38 @@ subroutine run_ccsd_space_orb if (do_ao_cholesky) then ! if (.False.) then call compute_H_oo_chol(nO,nV,tau_x,H_oo) - call compute_H_vv_chol(nO,nV,tau_x,H_vv) - call compute_H_vo_chol(nO,nV,t1,H_vo) + call compute_H_vv_chol(nO,nV,tau_x%f,H_vv%f) + call compute_H_vo_chol(nO,nV,t1%f,H_vo%f) - call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) - call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) + call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1) + call compute_r2_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r2%f,max_r2) else - call compute_H_oo(nO,nV,t1,t2,tau,H_oo) - call compute_H_vv(nO,nV,t1,t2,tau,H_vv) - call compute_H_vo(nO,nV,t1,t2,H_vo) + call compute_H_oo(nO,nV,t1%f,t2%f,tau%f,H_oo%f) + call compute_H_vv(nO,nV,t1%f,t2%f,tau%f,H_vv%f) + call compute_H_vo(nO,nV,t1%f,t2%f,H_vo%f) - call compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) - call compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) + call compute_r1_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r1%f,max_r1) + call compute_r2_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r2%f,max_r2) endif max_r = max(max_r1,max_r2) ! Update if (cc_update_method == 'diis') then - !call update_t_ccsd(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2) - !call update_t_ccsd_diis(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2) - call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1,r2,t1,t2,all_err,all_t) + call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1%f,r2%f,t1%f,t2%f,all_err,all_t) ! Standard update as T = T - Delta elseif (cc_update_method == 'none') then - call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1,t1) - call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2,t2) + call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1%f,t1%f) + call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2%f,t2%f) else print*,'Unkown cc_method_method: '//cc_update_method endif - call update_tau_space(nO,nV,t1,t2,tau) + call update_tau_space(nO,nV,t1%f,t1,t2,tau) call update_tau_x_space(nO,nV,tau,tau_x) ! Energy - call ccsd_energy_space_x(nO,nV,tau_x,t1,energy) + call ccsd_energy_space_x(nO,nV,tau_x%f,t1%f,energy) write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |' nb_iter = nb_iter + 1 @@ -185,8 +191,8 @@ subroutine run_ccsd_space_orb print*,'' if (write_amplitudes) then - call write_t1(nO,nV,t1) - call write_t2(nO,nV,t2) + call write_t1(nO,nV,t1%f) + call write_t2(nO,nV,t2%f) call ezfio_set_utils_cc_io_amplitudes('Read') endif @@ -195,11 +201,14 @@ subroutine run_ccsd_space_orb deallocate(all_err,all_t) endif - call gpu_deallocate_double(H_oo) - call gpu_deallocate_double(H_vv) - call gpu_deallocate_double(H_vo) + call gpu_deallocate(H_oo) + call gpu_deallocate(H_vv) + call gpu_deallocate(H_vo) - deallocate(r1,r2,tau) + call gpu_deallocate(r1) + call gpu_deallocate(r2) + call gpu_deallocate(tau) + call gpu_deallocate(tau_x) ! CCSD(T) double precision :: e_t, e_t_err @@ -207,28 +216,14 @@ subroutine run_ccsd_space_orb if (cc_par_t .and. elec_alpha_num + elec_beta_num > 2) then - ! Dumb way - !call wall_time(ta) - !call ccsd_par_t_space(nO,nV,t1,t2,e_t) - !call wall_time(tb) - !print*,'Time: ',tb-ta, ' s' - - !print*,'' - !write(*,'(A15,F18.12,A3)') ' E(CCSD(T)) = ', uncorr_energy + energy + e_t, ' Ha' - !write(*,'(A15,F18.12,A3)') ' E(T) = ', e_t, ' Ha' - !write(*,'(A15,F18.12,A3)') ' Correlation = ', energy + e_t, ' Ha' - !print*,'' - ! New e_t = uncorr_energy + energy ! For print in (T) call e_t_err = 0.d0 print*,'Computing (T) correction...' call wall_time(ta) -! call ccsd_par_t_space_v3(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v & -! ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t) - call ccsd_par_t_space_stoch(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v & + call ccsd_par_t_space_stoch(nO,nV,t1%f,t2%f,cc_space_f_o,cc_space_f_v & ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t, e_t_err) call wall_time(tb) @@ -243,7 +238,9 @@ subroutine run_ccsd_space_orb call save_energy(uncorr_energy + energy, e_t) - deallocate(t1,t2) + deallocate(h_t1, h_t2) + call gpu_deallocate(t1) + call gpu_deallocate(t2) end @@ -341,70 +338,139 @@ end ! Tau -subroutine update_tau_space(nO,nV,t1,t2,tau) - +subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau) + use gpu implicit none ! in integer, intent(in) :: nO, nV - double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV) + double precision, intent(in) :: h_t1(nO,nV) + type(gpu_double2), intent(in) :: t1 + type(gpu_double4), intent(in) :: t2 ! out - double precision, intent(out) :: tau(nO,nO,nV,nV) + type(gpu_double4) :: tau ! internal integer :: i,j,a,b +! !$OMP PARALLEL & +! !$OMP SHARED(nO,nV,tau,t2,t1,h_t1) & +! !$OMP PRIVATE(i,j,a,b) & +! !$OMP DEFAULT(NONE) +! !$OMP DO +! do b = 1, nV +! do a = 1, nV +! do j = 1, nO +! do i = 1, nO +! tau%f(i,j,a,b) = t2%f(i,j,a,b) + t1%f(i,a) * h_t1(j,b) +! enddo +! enddo +! enddo +! enddo +! !$OMP END DO +! !$OMP END PARALLEL + + + type(gpu_blas) :: blas + type(gpu_stream) :: stream(nV) + + call gpu_blas_create(blas) + do b=1,nV + call gpu_stream_create(stream(b)) + enddo + !$OMP PARALLEL & - !$OMP SHARED(nO,nV,tau,t2,t1) & + !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) - !$OMP DO - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b) - enddo - enddo + do j=1,nO + !$OMP DO + do b=1,nV + call gpu_set_stream(blas,stream(b)) + call gpu_dgeam_c(blas%c, 'N', 'N', nO*1_8, nV*1_8, & + 1.d0, c_loc(t2%f(1,j,1,b)), nO*nO*1_8, & + h_t1(j,b), t1%c, nO*1_8, & + c_loc(tau%f(1,j,1,b)), nO*nO*1_8) enddo + !$OMP END DO enddo - !$OMP END DO !$OMP END PARALLEL + call gpu_synchronize() + + do b=1,nV + call gpu_stream_destroy(stream(b)) + enddo + + call gpu_blas_destroy(blas) + end subroutine update_tau_x_space(nO,nV,tau,tau_x) - + use gpu implicit none ! in - integer, intent(in) :: nO, nV - double precision, intent(in) :: tau(nO,nO,nV,nV) + integer, intent(in) :: nO, nV + type(gpu_double4), intent(in) :: tau ! out - double precision, intent(out) :: tau_x(nO,nO,nV,nV) + type(gpu_double4) :: tau_x ! internal integer :: i,j,a,b +! !$OMP PARALLEL & +! !$OMP SHARED(nO,nV,tau,tau_x) & +! !$OMP PRIVATE(i,j,a,b) & +! !$OMP DEFAULT(NONE) +! !$OMP DO +! do b = 1, nV +! do a = 1, nV +! do j = 1, nO +! do i = 1, nO +! tau_x%f(i,j,a,b) = 2.d0*tau%f(i,j,a,b) - tau%f(i,j,b,a) +! enddo +! enddo +! enddo +! enddo +! !$OMP END DO +! !$OMP END PARALLEL + + type(gpu_blas) :: blas + type(gpu_stream) :: stream(nV) + + call gpu_blas_create(blas) + do a=1,nV + call gpu_stream_create(stream(a)) + enddo + !$OMP PARALLEL & - !$OMP SHARED(nO,nV,tau,tau_x) & + !$OMP SHARED(nO,nV,tau,tau_x,stream,blas) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) !$OMP DO - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - tau_x(i,j,a,b) = 2.d0*tau(i,j,a,b) - tau(i,j,b,a) - enddo - enddo + do b=1,nV + do a=1,nV + call gpu_set_stream(blas,stream(a)) + call gpu_dgeam_c(blas%c, 'N', 'N', nO*1_8, nO*1_8, & + 2.d0, c_loc(tau%f(1,1,a,b)), nO*1_8, & + -1.d0, c_loc(tau%f(1,1,b,a)), nO*1_8, & + c_loc(tau_x%f(1,1,a,b)), nO*1_8) enddo enddo !$OMP END DO !$OMP END PARALLEL + call gpu_synchronize() + + do b=1,nV + call gpu_stream_destroy(stream(b)) + enddo + + call gpu_blas_destroy(blas) + end ! R1 diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index b59dc0bb..9b161001 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -294,12 +294,12 @@ end ! H_oo subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo) - + use gpu implicit none integer, intent(in) :: nO,nV - double precision, intent(in) :: tau_x(nO, nO, nV, nV) - double precision, intent(out) :: H_oo(nO, nO) + type(gpu_double4), intent(in) :: tau_x + type(gpu_double2), intent(out) :: H_oo integer :: a,b,i,j,u,k @@ -315,7 +315,7 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo) do b=1,nV do j=1,nO do a=1,nV - tmp_vov(a,j,b) = tau_x(u,j,a,b) + tmp_vov(a,j,b) = tau_x%f(u,j,a,b) enddo enddo enddo @@ -328,7 +328,7 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo) !$omp do do i = 1, nO do u = 1, nO - H_oo(u,i) = cc_space_f_oo(u,i) + H_oo%f(u,i) = cc_space_f_oo(u,i) enddo enddo !$omp end do nowait @@ -336,7 +336,7 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo) !$omp end parallel call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & tau_kau, cholesky_mo_num*nV, cc_space_v_vo_chol, cholesky_mo_num*nV, & - 1.d0, H_oo, nO) + 1.d0, H_oo%f, nO) end diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90 index 43754454..51f80ac0 100644 --- a/src/gpu/gpu_module.F90 +++ b/src/gpu/gpu_module.F90 @@ -2,6 +2,52 @@ module gpu use, intrinsic :: iso_c_binding implicit none +! Data types +! ---------- + + type gpu_double1 + type(c_ptr) :: c + double precision, pointer :: f(:) + end type + + type gpu_double2 + type(c_ptr) :: c + double precision, pointer :: f(:,:) + end type + + type gpu_double3 + type(c_ptr) :: c + double precision, pointer :: f(:,:,:) + end type + + type gpu_double4 + type(c_ptr) :: c + double precision, pointer :: f(:,:,:,:) + end type + + type gpu_double5 + type(c_ptr) :: c + double precision, pointer :: f(:,:,:,:,:) + end type + + type gpu_double6 + type(c_ptr) :: c + double precision, pointer :: f(:,:,:,:,:,:) + end type + + + type gpu_blas + type(c_ptr) :: c + end type + + type gpu_stream + type(c_ptr) :: c + end type + + +! C interfaces +! ------------ + interface integer function gpu_ndevices() bind(C) end function @@ -43,100 +89,394 @@ module gpu integer(c_int64_t), value :: n end subroutine - subroutine gpu_stream_create(stream) bind(C) + subroutine gpu_stream_create_c(stream) bind(C, name='gpu_stream_create') import type(c_ptr) :: stream end subroutine - subroutine gpu_stream_destroy(stream) bind(C) + subroutine gpu_stream_destroy_c(stream) bind(C, name='gpu_stream_destroy') import type(c_ptr) :: stream end subroutine - subroutine gpu_set_stream(handle, stream) bind(C) + subroutine gpu_set_stream_c(handle, stream) bind(C, name='gpu_set_stream') import type(c_ptr) :: handle, stream end subroutine - subroutine gpu_synchronize() + subroutine gpu_synchronize() bind(C) + import end subroutine - subroutine gpu_blas_create(handle) bind(C) + subroutine gpu_blas_create_c(handle) bind(C, name='gpu_blas_create') import type(c_ptr) :: handle end subroutine - subroutine gpu_blas_destroy(handle) bind(C) + subroutine gpu_blas_destroy_c(handle) bind(C, name='gpu_blas_destroy') import type(c_ptr) :: handle end subroutine - subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) bind(C) + subroutine gpu_ddot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_ddot') import - type(c_ptr), intent(in) :: handle - integer(c_int64_t), value :: n, incx, incy - real(c_double), intent(in) :: dx(*), dy(*) - real(c_double), intent(out) :: res + type(c_ptr), intent(in), value :: handle + integer(c_int64_t), value :: n, incx, incy + type(c_ptr), intent(in), value :: dx, dy + real(c_double), intent(out) :: res end subroutine - subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res) bind(C) + subroutine gpu_sdot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_sdot') import - type(c_ptr), intent(in) :: handle - integer(c_int64_t), value :: n, incx, incy - real(c_float), intent(in) :: dx(*), dy(*) + type(c_ptr), intent(in), value :: handle + integer(c_int64_t), value :: n, incx, incy + type(c_ptr), intent(in), value :: dx, dy real(c_float), intent(out) :: res end subroutine + subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, & + b, ldb, c, ldc) bind(C, name='gpu_dgeam') + import + type(c_ptr), intent(in), value :: handle + character(c_char), intent(in), value :: transa, transb + integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc + real(c_double), intent(in), value :: alpha, beta + type(c_ptr), value :: a, b, c + end subroutine + end interface + +! Polymorphic interfaces +! ---------------------- + + interface gpu_allocate + procedure gpu_allocate_double1 & + ,gpu_allocate_double2 & + ,gpu_allocate_double3 & + ,gpu_allocate_double4 & + ,gpu_allocate_double5 & + ,gpu_allocate_double6 + end interface gpu_allocate + + interface gpu_deallocate + procedure gpu_deallocate_double1 & + ,gpu_deallocate_double2 & + ,gpu_deallocate_double3 & + ,gpu_deallocate_double4 & + ,gpu_deallocate_double5 & + ,gpu_deallocate_double6 + end interface gpu_deallocate + + interface gpu_upload + procedure gpu_upload_double1 & + ,gpu_upload_double2 & + ,gpu_upload_double3 & + ,gpu_upload_double4 & + ,gpu_upload_double5 & + ,gpu_upload_double6 + end interface gpu_upload + + interface gpu_download + procedure gpu_download_double1 & + ,gpu_download_double2 & + ,gpu_download_double3 & + ,gpu_download_double4 & + ,gpu_download_double5 & + ,gpu_download_double6 + end interface gpu_download + + interface gpu_copy + procedure gpu_copy_double1 & + ,gpu_copy_double2 & + ,gpu_copy_double3 & + ,gpu_copy_double4 & + ,gpu_copy_double5 & + ,gpu_copy_double6 + end interface gpu_copy + + contains - subroutine gpu_allocate_double(ptr, s) - implicit none - double precision, pointer, intent(inout) :: ptr - integer, intent(in) :: s(:) - type(c_ptr) :: cptr +! gpu_allocate +! ------------ - call gpu_allocate_c(cptr, sum(s*1_8)*8_8) - call c_f_pointer(cptr, ptr, s) + subroutine gpu_allocate_double1(ptr, s) + implicit none + type(gpu_double1), intent(inout) :: ptr + integer, intent(in) :: s + + call gpu_allocate_c(ptr%c, s*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s /)) end subroutine - subroutine gpu_deallocate_double(ptr) + subroutine gpu_allocate_double2(ptr, s1, s2) implicit none - double precision, pointer, intent(inout) :: ptr - type(c_ptr) :: cptr - cptr = c_loc(ptr) - call gpu_deallocate_c(cptr) - NULLIFY(ptr) + type(gpu_double2), intent(inout) :: ptr + integer, intent(in) :: s1, s2 + + call gpu_allocate_c(ptr%c, s1*s2*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /)) + end subroutine + + subroutine gpu_allocate_double3(ptr, s1, s2, s3) + implicit none + type(gpu_double3), intent(inout) :: ptr + integer, intent(in) :: s1, s2, s3 + + call gpu_allocate_c(ptr%c, s1*s2*s3*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /)) + end subroutine + + subroutine gpu_allocate_double4(ptr, s1, s2, s3, s4) + implicit none + type(gpu_double4), intent(inout) :: ptr + integer, intent(in) :: s1, s2, s3, s4 + + call gpu_allocate_c(ptr%c, s1*s2*s3*s4*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /)) + end subroutine + + subroutine gpu_allocate_double5(ptr, s1, s2, s3, s4, s5) + implicit none + type(gpu_double5), intent(inout) :: ptr + integer, intent(in) :: s1, s2, s3, s4, s5 + + call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /)) + end subroutine + + subroutine gpu_allocate_double6(ptr, s1, s2, s3, s4, s5, s6) + implicit none + type(gpu_double6), intent(inout) :: ptr + integer, intent(in) :: s1, s2, s3, s4, s5, s6 + + call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*s6*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /)) + end subroutine + + +! gpu_deallocate +! -------------- + + subroutine gpu_deallocate_double1(ptr) + implicit none + type(gpu_double1), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double2(ptr) + implicit none + type(gpu_double2), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double3(ptr) + implicit none + type(gpu_double3), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double4(ptr) + implicit none + type(gpu_double4), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double5(ptr) + implicit none + type(gpu_double5), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double6(ptr) + implicit none + type(gpu_double6), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + +! gpu_upload +! ---------- + + subroutine gpu_upload_double1(cpu_ptr, gpu_ptr) + implicit none + double precision, intent(in) :: cpu_ptr(:) + type(gpu_double1), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, 8_8*size(gpu_ptr%f)) + end subroutine + + subroutine gpu_upload_double2(cpu_ptr, gpu_ptr) + implicit none + double precision, intent(in) :: cpu_ptr(:,:) + type(gpu_double2), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + subroutine gpu_upload_double3(cpu_ptr, gpu_ptr) + implicit none + double precision, intent(in) :: cpu_ptr(:,:,:) + type(gpu_double3), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + subroutine gpu_upload_double4(cpu_ptr, gpu_ptr) + implicit none + double precision, intent(in) :: cpu_ptr(:,:,:,:) + type(gpu_double4), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + subroutine gpu_upload_double5(cpu_ptr, gpu_ptr) + implicit none + double precision, intent(in) :: cpu_ptr(:,:,:,:,:) + type(gpu_double5), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + subroutine gpu_upload_double6(cpu_ptr, gpu_ptr) + implicit none + double precision, intent(in) :: cpu_ptr(:,:,:,:,:,:) + type(gpu_double6), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + +! gpu_download +! ------------ + + subroutine gpu_download_double1(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double1), intent(in) :: gpu_ptr + double precision, intent(in) :: cpu_ptr(:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*size(gpu_ptr%f)) + end subroutine + + subroutine gpu_download_double2(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double2), intent(in) :: gpu_ptr + double precision, intent(in) :: cpu_ptr(:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + + subroutine gpu_download_double3(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double3), intent(in) :: gpu_ptr + double precision, intent(in) :: cpu_ptr(:,:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + + subroutine gpu_download_double4(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double4), intent(in) :: gpu_ptr + double precision, intent(in) :: cpu_ptr(:,:,:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + + subroutine gpu_download_double5(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double5), intent(in) :: gpu_ptr + double precision, intent(in) :: cpu_ptr(:,:,:,:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + + subroutine gpu_download_double6(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double6), intent(in) :: gpu_ptr + double precision, intent(in) :: cpu_ptr(:,:,:,:,:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + +! gpu_copy +! -------- + + subroutine gpu_copy_double1(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double1), intent(in) :: gpu_ptr_src + type(gpu_double1), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*size(gpu_ptr_dest%f)) + end subroutine + + subroutine gpu_copy_double2(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double2), intent(in) :: gpu_ptr_src + type(gpu_double2), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + subroutine gpu_copy_double3(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double3), intent(in) :: gpu_ptr_src + type(gpu_double3), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + subroutine gpu_copy_double4(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double4), intent(in) :: gpu_ptr_src + type(gpu_double4), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + subroutine gpu_copy_double5(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double5), intent(in) :: gpu_ptr_src + type(gpu_double5), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + subroutine gpu_copy_double6(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double6), intent(in) :: gpu_ptr_src + type(gpu_double6), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + +! gpu_stream +! ---------- + + subroutine gpu_stream_create(stream) + import + type(gpu_stream) :: stream + call gpu_stream_create_c(stream%c) + end subroutine + + subroutine gpu_stream_destroy(stream) + import + type(gpu_stream) :: stream + call gpu_stream_destroy_c(stream%c) + end subroutine + + subroutine gpu_set_stream(handle, stream) + import + type(gpu_blas) :: handle + type(gpu_stream) :: stream + call gpu_set_stream_c(handle%c, stream%c) + end subroutine + + +! gpu_blas +! -------- + + subroutine gpu_blas_create(handle) + import + type(gpu_blas) :: handle + call gpu_blas_create_c(handle%c) + end subroutine + + subroutine gpu_blas_destroy(handle) + import + type(gpu_blas) :: handle + call gpu_blas_destroy_c(handle%c) end subroutine end module -subroutine gpu_upload_double(cpu_ptr, gpu_ptr, n) - use gpu - implicit none - double precision, intent(in) :: cpu_ptr(*) - double precision, intent(in) :: gpu_ptr(*) - integer(c_int64_t), intent(in) :: n - call gpu_upload_c(c_loc(cpu_ptr), c_loc(gpu_ptr), 8_8*n) -end subroutine - -subroutine gpu_download_double(gpu_ptr, cpu_ptr, n) - use gpu - implicit none - double precision, intent(in) :: gpu_ptr(*) - double precision, intent(in) :: cpu_ptr(*) - integer(c_int64_t), intent(in) :: n - call gpu_download_c(c_loc(gpu_ptr), c_loc(cpu_ptr), 8_8*n) -end subroutine - -subroutine gpu_copy_double(gpu_ptr_src, gpu_ptr_dest, n) - use gpu - implicit none - double precision, intent(in) :: gpu_ptr_src(*) - double precision, intent(in) :: gpu_ptr_dest(*) - integer(c_int64_t), intent(in) :: n - call gpu_copy_c(c_loc(gpu_ptr_src), c_loc(gpu_ptr_dest), 8_8*n) -end subroutine - diff --git a/src/gpu_x86/gpu.c b/src/gpu_x86/gpu.c index 41ede396..5f42cb0d 100644 --- a/src/gpu_x86/gpu.c +++ b/src/gpu_x86/gpu.c @@ -251,7 +251,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i if (alpha == 0.) { for (int64_t j=0 ; j Date: Fri, 28 Jun 2024 11:00:58 +0200 Subject: [PATCH 05/38] Added Nvidia module --- src/ccsd/ccsd_space_orb_sub.irp.f | 10 +- src/gpu/gpu_module.F90 | 6 +- src/gpu_nvidia/LIB | 1 + src/gpu_nvidia/NEED | 1 + src/gpu_nvidia/README.rst | 5 + src/gpu_nvidia/gpu.c | 327 ++++++++++++++++++++++++++++++ src/gpu_x86/gpu.c | 40 ++-- 7 files changed, 359 insertions(+), 31 deletions(-) create mode 100644 src/gpu_nvidia/LIB create mode 100644 src/gpu_nvidia/NEED create mode 100644 src/gpu_nvidia/README.rst create mode 100644 src/gpu_nvidia/gpu.c diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 455d62f7..e7c9b1ab 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -384,17 +384,17 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau) !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) - do j=1,nO - !$OMP DO - do b=1,nV - call gpu_set_stream(blas,stream(b)) + !$OMP DO + do b=1,nV + call gpu_set_stream(blas,stream(b)) + do j=1,nO call gpu_dgeam_c(blas%c, 'N', 'N', nO*1_8, nV*1_8, & 1.d0, c_loc(t2%f(1,j,1,b)), nO*nO*1_8, & h_t1(j,b), t1%c, nO*1_8, & c_loc(tau%f(1,j,1,b)), nO*nO*1_8) enddo - !$OMP END DO enddo + !$OMP END DO !$OMP END PARALLEL call gpu_synchronize() diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90 index 51f80ac0..d1ddad4c 100644 --- a/src/gpu/gpu_module.F90 +++ b/src/gpu/gpu_module.F90 @@ -120,7 +120,7 @@ module gpu subroutine gpu_ddot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_ddot') import - type(c_ptr), intent(in), value :: handle + type(c_ptr), intent(in) :: handle integer(c_int64_t), value :: n, incx, incy type(c_ptr), intent(in), value :: dx, dy real(c_double), intent(out) :: res @@ -128,7 +128,7 @@ module gpu subroutine gpu_sdot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_sdot') import - type(c_ptr), intent(in), value :: handle + type(c_ptr), intent(in) :: handle integer(c_int64_t), value :: n, incx, incy type(c_ptr), intent(in), value :: dx, dy real(c_float), intent(out) :: res @@ -137,7 +137,7 @@ module gpu subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, & b, ldb, c, ldc) bind(C, name='gpu_dgeam') import - type(c_ptr), intent(in), value :: handle + type(c_ptr), intent(in) :: handle character(c_char), intent(in), value :: transa, transb integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc real(c_double), intent(in), value :: alpha, beta diff --git a/src/gpu_nvidia/LIB b/src/gpu_nvidia/LIB new file mode 100644 index 00000000..91f54e91 --- /dev/null +++ b/src/gpu_nvidia/LIB @@ -0,0 +1 @@ +-lcudart -lcublas -lcublasLt diff --git a/src/gpu_nvidia/NEED b/src/gpu_nvidia/NEED new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/src/gpu_nvidia/NEED @@ -0,0 +1 @@ + diff --git a/src/gpu_nvidia/README.rst b/src/gpu_nvidia/README.rst new file mode 100644 index 00000000..5dcfca92 --- /dev/null +++ b/src/gpu_nvidia/README.rst @@ -0,0 +1,5 @@ +========== +gpu_nvidia +========== + +Nvidia implementation of GPU routines. Uses CUDA and CUBLAS libraries. diff --git a/src/gpu_nvidia/gpu.c b/src/gpu_nvidia/gpu.c new file mode 100644 index 00000000..f0bd247a --- /dev/null +++ b/src/gpu_nvidia/gpu.c @@ -0,0 +1,327 @@ +#include +#include +#include +#include +#include + +#include +#include + + +/* Generic functions */ + +int gpu_ndevices() { + int ngpus; + cudaGetDeviceCount(&ngpus); + return ngpus; +} + +void gpu_set_device(int32_t igpu) { + cudaSetDevice(igpu); +} + + +/* Allocation functions */ + +void gpu_allocate(void** ptr, const int64_t size) { + size_t free, total; + cudaError_t rc = cudaMemGetInfo( &free, &total ); + if (rc != cudaSuccess) { + free = INT64_MAX; + } + + /* Use managed memory if it does not fit on the GPU */ + if (size < free && size < total/2) { +// rc= cudaMalloc(ptr, size); + rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal); + } else { + rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal); + } + assert (rc == cudaSuccess); +} + +void gpu_deallocate(void** ptr) { + assert (*ptr != NULL); + cudaFree(*ptr); + *ptr = NULL; +} + + +/* Memory transfer functions */ + +void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) { + cudaMemcpy (gpu_ptr, cpu_ptr, n, cudaMemcpyHostToDevice); +} + +void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) { + cudaMemcpy (cpu_ptr, gpu_ptr, n, cudaMemcpyDeviceToHost); +} + +void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) { + cudaMemcpy (gpu_ptr_dest, gpu_ptr_src, n, cudaMemcpyDeviceToDevice); +} + + +/* Streams */ + +void gpu_stream_create(void** ptr) { + cudaStream_t stream; + cudaError_t rc = cudaStreamCreate(&stream); + assert (rc == cudaSuccess); + *ptr = (void*) stream; +} + +void gpu_stream_destroy(void** ptr) { + assert (*ptr != NULL); + cudaError_t rc = cudaStreamDestroy( (cudaStream_t) *ptr); + assert (rc == cudaSuccess); + *ptr = NULL; +} + +void gpu_set_stream(void** handle, void** stream) { + cublasSetStream( (cublasHandle_t) *handle, (cudaStream_t) *stream); +} + +void gpu_synchronize() { + cudaDeviceSynchronize(); +} + + +/* BLAS functions */ + +void gpu_blas_create(void** handle) { + cublasHandle_t cublas_handle; + cublasStatus_t rc = cublasCreate(&cublas_handle); + assert (rc == CUBLAS_STATUS_SUCCESS); + *handle = (void*) cublas_handle; +} + + +void gpu_blas_destroy(void** handle) { + assert (*handle != NULL); + cublasStatus_t rc = cublasDestroy( (cublasHandle_t) *handle); + assert (rc == CUBLAS_STATUS_SUCCESS); + *handle = NULL; +} + + +void gpu_ddot(void** handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { + assert (*handle != NULL); + + /* Convert to int32_t */ + int32_t n_, incx_, incy_; + + n_ = (int32_t) n; + incx_ = (int32_t) incx; + incy_ = (int32_t) incy; + + /* Check for integer overflows */ + assert ( (int64_t) n_ == n ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + cublasDdot((cublasHandle_t) *handle, n_, x, incx_, y, incy_, result); +} + + + +void gpu_sdot(void** handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { + assert (*handle != NULL); + + /* Convert to int32_t */ + int32_t n_, incx_, incy_; + + n_ = (int32_t) n; + incx_ = (int32_t) incx; + incy_ = (int32_t) incy; + + /* Check for integer overflows */ + assert ( (int64_t) n_ == n ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + cublasSdot((cublasHandle_t) *handle, n_, x, incx_, y, incy_, result); +} + + + +void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t n, const double alpha, + const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) { + + assert (*handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, lda_, incx_, incy_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + lda_ = (int32_t) lda; + incx_ = (int32_t) incx; + incy_ = (int32_t) incy; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + cublasOperation_t transa_ = CUBLAS_OP_N; + if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; + + cublasDgemv((cublasHandle_t) *handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_); +} + + + +void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t n, const float alpha, + const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) { + + assert (*handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, lda_, incx_, incy_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + lda_ = (int32_t) lda; + incx_ = (int32_t) incx; + incy_ = (int32_t) incy; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + cublasOperation_t transa_ = CUBLAS_OP_N; + if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; + + cublasSgemv((cublasHandle_t) *handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_); +} + + +void gpu_dgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, + const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) { + + assert (*handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, k_, lda_, ldb_, ldc_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + k_ = (int32_t) k; + lda_ = (int32_t) lda; + ldb_ = (int32_t) ldb; + ldc_ = (int32_t) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) k_ == k ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + cublasOperation_t transa_ = CUBLAS_OP_N; + cublasOperation_t transb_ = CUBLAS_OP_N; + if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; + if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; + + cublasDgemm((cublasHandle_t) *handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_); +} + + + +void gpu_sgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, + const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) { + + assert (*handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, k_, lda_, ldb_, ldc_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + k_ = (int32_t) k; + lda_ = (int32_t) lda; + ldb_ = (int32_t) ldb; + ldc_ = (int32_t) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) k_ == k ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + cublasOperation_t transa_ = CUBLAS_OP_N; + cublasOperation_t transb_ = CUBLAS_OP_N; + if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; + if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; + + cublasSgemm((cublasHandle_t) *handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_); +} + + +void gpu_dgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, + const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { + assert (*handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, lda_, ldb_, ldc_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + lda_ = (int32_t) lda; + ldb_ = (int32_t) ldb; + ldc_ = (int32_t) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + cublasOperation_t transa_ = CUBLAS_OP_N; + cublasOperation_t transb_ = CUBLAS_OP_N; + if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; + if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; + + cublasDgeam((cublasHandle_t) *handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_); + +} + + +void gpu_sgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, + const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) { + assert (*handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, lda_, ldb_, ldc_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + lda_ = (int32_t) lda; + ldb_ = (int32_t) ldb; + ldc_ = (int32_t) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + cublasOperation_t transa_ = CUBLAS_OP_N; + cublasOperation_t transb_ = CUBLAS_OP_N; + if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; + if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; + + cublasSgeam((cublasHandle_t) *handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_); + +} diff --git a/src/gpu_x86/gpu.c b/src/gpu_x86/gpu.c index 5f42cb0d..ac7c3620 100644 --- a/src/gpu_x86/gpu.c +++ b/src/gpu_x86/gpu.c @@ -56,7 +56,7 @@ void gpu_stream_destroy(void** ptr) { *ptr = NULL; } -void gpu_set_stream(void* handle, void* stream) { +void gpu_set_stream(void** handle, void** stream) { return; } @@ -79,8 +79,8 @@ void gpu_blas_destroy(void** handle) { double ddot_(const int32_t* n, const double* x, const int32_t* incx, const double* y, const int32_t* incy); -void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { - assert (handle != NULL); +void gpu_ddot(void** handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { + assert (*handle != NULL); /* Convert to int32_t */ int32_t n_, incx_, incy_; @@ -100,8 +100,8 @@ void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_ float sdot_(const int32_t* n, const float* x, const int32_t* incx, const float* y, const int32_t* incy); -void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { - assert (handle != NULL); +void gpu_sdot(void** handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { + assert (*handle != NULL); /* Convert to int32_t */ int32_t n_, incx_, incy_; @@ -122,10 +122,10 @@ void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha, const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy); -void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double alpha, +void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t n, const double alpha, const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) { - assert (handle != NULL); + assert (*handle != NULL); /* Convert to int32_t */ int32_t m_, n_, lda_, incx_, incy_; @@ -150,10 +150,10 @@ void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha, const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy); -void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float alpha, +void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t n, const float alpha, const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) { - assert (handle != NULL); + assert (*handle != NULL); /* Convert to int32_t */ int32_t m_, n_, lda_, incx_, incy_; @@ -178,10 +178,10 @@ void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha, const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc); -void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, +void gpu_dgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) { - assert (handle != NULL); + assert (*handle != NULL); /* Convert to int32_t */ int32_t m_, n_, k_, lda_, ldb_, ldc_; @@ -209,10 +209,10 @@ void gpu_dgemm(const void* handle, const char transa, const char transb, const i void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha, const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc); -void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, +void gpu_sgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) { - assert (handle != NULL); + assert (*handle != NULL); /* Convert to int32_t */ int32_t m_, n_, k_, lda_, ldb_, ldc_; @@ -236,12 +236,9 @@ void gpu_sgemm(const void* handle, const char transa, const char transb, const i } -void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, +void gpu_dgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { - if (handle == NULL) { - perror("NULL handle"); - exit(-1); - } + assert (*handle != NULL); if ( (transa == 'N' && transb == 'N') || (transa == 'n' && transb == 'N') || @@ -371,12 +368,9 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i } -void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, +void gpu_sgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) { - if (handle == NULL) { - perror("NULL handle"); - exit(-1); - } + assert (*handle != NULL); if ( (transa == 'N' && transb == 'N') || (transa == 'n' && transb == 'N') || From d3d89022bc8092ab0c6131904f85475f160dfa53 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Fri, 28 Jun 2024 16:50:52 +0200 Subject: [PATCH 06/38] Move GPU modules to plugins --- configure | 6 +++--- {src => plugins/local}/gpu_nvidia/LIB | 0 {src => plugins/local}/gpu_nvidia/NEED | 0 {src => plugins/local}/gpu_nvidia/README.rst | 0 {src => plugins/local}/gpu_nvidia/gpu.c | 0 {src => plugins/local}/gpu_x86/NEED | 0 {src => plugins/local}/gpu_x86/README.rst | 0 {src => plugins/local}/gpu_x86/gpu.c | 0 8 files changed, 3 insertions(+), 3 deletions(-) rename {src => plugins/local}/gpu_nvidia/LIB (100%) rename {src => plugins/local}/gpu_nvidia/NEED (100%) rename {src => plugins/local}/gpu_nvidia/README.rst (100%) rename {src => plugins/local}/gpu_nvidia/gpu.c (100%) rename {src => plugins/local}/gpu_x86/NEED (100%) rename {src => plugins/local}/gpu_x86/README.rst (100%) rename {src => plugins/local}/gpu_x86/gpu.c (100%) diff --git a/configure b/configure index db158966..08dac444 100755 --- a/configure +++ b/configure @@ -119,15 +119,15 @@ rm -f ${QP_ROOT}/src/gpu_arch case "$GPU" in amd) # Nvidia echo "Activating AMD GPU acceleration" - ln -s ${QP_ROOT}/src/gpu_amd ${QP_ROOT}/src/gpu_arch + ln -s ${QP_ROOT}/plugins/local/gpu_amd ${QP_ROOT}/src/gpu_arch ;; nvidia) # Nvidia echo "Activating Nvidia GPU acceleration" - ln -s ${QP_ROOT}/src/gpu_nvidia ${QP_ROOT}/src/gpu_arch + ln -s ${QP_ROOT}/plugins/local/gpu_nvidia ${QP_ROOT}/src/gpu_arch ;; *) # No Acceleration echo "Disabling GPU acceleration" - ln -s ${QP_ROOT}/src/gpu_x86 ${QP_ROOT}/src/gpu_arch + ln -s ${QP_ROOT}/plugins/local/gpu_x86 ${QP_ROOT}/src/gpu_arch ;; esac diff --git a/src/gpu_nvidia/LIB b/plugins/local/gpu_nvidia/LIB similarity index 100% rename from src/gpu_nvidia/LIB rename to plugins/local/gpu_nvidia/LIB diff --git a/src/gpu_nvidia/NEED b/plugins/local/gpu_nvidia/NEED similarity index 100% rename from src/gpu_nvidia/NEED rename to plugins/local/gpu_nvidia/NEED diff --git a/src/gpu_nvidia/README.rst b/plugins/local/gpu_nvidia/README.rst similarity index 100% rename from src/gpu_nvidia/README.rst rename to plugins/local/gpu_nvidia/README.rst diff --git a/src/gpu_nvidia/gpu.c b/plugins/local/gpu_nvidia/gpu.c similarity index 100% rename from src/gpu_nvidia/gpu.c rename to plugins/local/gpu_nvidia/gpu.c diff --git a/src/gpu_x86/NEED b/plugins/local/gpu_x86/NEED similarity index 100% rename from src/gpu_x86/NEED rename to plugins/local/gpu_x86/NEED diff --git a/src/gpu_x86/README.rst b/plugins/local/gpu_x86/README.rst similarity index 100% rename from src/gpu_x86/README.rst rename to plugins/local/gpu_x86/README.rst diff --git a/src/gpu_x86/gpu.c b/plugins/local/gpu_x86/gpu.c similarity index 100% rename from src/gpu_x86/gpu.c rename to plugins/local/gpu_x86/gpu.c From 85b1035cfba778559e629045961cb542631841bd Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Fri, 28 Jun 2024 17:33:08 +0200 Subject: [PATCH 07/38] Working on CCSD --- src/ccsd/ccsd_space_orb_sub.irp.f | 117 +++++++++++------------------- src/gpu/gpu_module.F90 | 62 ++++++++++++++++ 2 files changed, 103 insertions(+), 76 deletions(-) diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index e7c9b1ab..1329f172 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -14,6 +14,9 @@ subroutine run_ccsd_space_orb type(gpu_double2) :: t1, r1 type(gpu_double2) :: H_oo, H_vv, H_vo + type(gpu_double2) :: d_cc_space_f_vo + type(gpu_double4) :: d_cc_space_v_oovv + double precision, allocatable :: all_err(:,:), all_t(:,:) integer, allocatable :: list_occ(:), list_vir(:) integer(bit_kind) :: det(N_int,2) @@ -52,6 +55,12 @@ subroutine run_ccsd_space_orb !print*,'occ',list_occ !print*,'vir',list_vir + call gpu_allocate(d_cc_space_f_vo, nV, nO) + call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV) + call gpu_upload(cc_space_f_vo, d_cc_space_f_vo) + call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv) + + call gpu_allocate(t2, nO,nO,nV,nV) call gpu_allocate(r2, nO,nO,nV,nV) call gpu_allocate(tau, nO,nO,nV,nV) @@ -116,7 +125,8 @@ subroutine run_ccsd_space_orb !print*,'hf_energy', hf_energy call det_energy(det,uncorr_energy) print*,'Det energy', uncorr_energy - call ccsd_energy_space_x(nO,nV,tau_x%f,t1%f,energy) + + call ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy) print*,'Guess energy', uncorr_energy+energy, energy nb_iter = 0 @@ -166,7 +176,7 @@ subroutine run_ccsd_space_orb call update_tau_x_space(nO,nV,tau,tau_x) ! Energy - call ccsd_energy_space_x(nO,nV,tau_x%f,t1%f,energy) + call ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy) write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |' nb_iter = nb_iter + 1 @@ -239,6 +249,8 @@ subroutine run_ccsd_space_orb call save_energy(uncorr_energy + energy, e_t) deallocate(h_t1, h_t2) + call gpu_deallocate(d_cc_space_f_vo) + call gpu_deallocate(d_cc_space_v_oovv) call gpu_deallocate(t1) call gpu_deallocate(t2) @@ -246,59 +258,14 @@ end ! Energy -subroutine ccsd_energy_space(nO,nV,tau,t1,energy) - +subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy) + use gpu implicit none - integer, intent(in) :: nO, nV - double precision, intent(in) :: tau(nO,nO,nV,nV) - double precision, intent(in) :: t1(nO,nV) - double precision, intent(out) :: energy - - ! internal - integer :: i,j,a,b - double precision :: e - - energy = 0d0 - !$omp parallel & - !$omp shared(nO,nV,energy,tau,t1,& - !$omp cc_space_f_vo,cc_space_w_oovv) & - !$omp private(i,j,a,b,e) & - !$omp default(none) - e = 0d0 - !$omp do - do a = 1, nV - do i = 1, nO - e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a) - enddo - enddo - !$omp end do nowait - !$omp do - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - e = e + tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b) - enddo - enddo - enddo - enddo - !$omp end do nowait - !$omp critical - energy = energy + e - !$omp end critical - !$omp end parallel - -end - -subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy) - - implicit none - - integer, intent(in) :: nO, nV - double precision, intent(in) :: tau_x(nO,nO,nV,nV) - double precision, intent(in) :: t1(nO,nV) - double precision, intent(out) :: energy + integer, intent(in) :: nO, nV + type(gpu_double4), intent(in) :: tau_x, d_cc_space_v_oovv + type(gpu_double2), intent(in) :: t1, d_cc_space_f_vo + double precision, intent(out) :: energy ! internal integer :: i,j,a,b @@ -307,14 +274,14 @@ subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy) energy = 0d0 !$omp parallel & !$omp shared(nO,nV,energy,tau_x,t1,& - !$omp cc_space_f_vo,cc_space_v_oovv) & + !$omp d_cc_space_f_vo,d_cc_space_v_oovv) & !$omp private(i,j,a,b,e) & !$omp default(none) e = 0d0 !$omp do do a = 1, nV do i = 1, nO - e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a) + e = e + 2d0 * d_cc_space_f_vo%f(a,i) * t1%f(i,a) enddo enddo !$omp end do nowait @@ -323,7 +290,7 @@ subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy) do a = 1, nV do j = 1, nO do i = 1, nO - e = e + tau_x(i,j,a,b) * cc_space_v_oovv(i,j,a,b) + e = e + tau_x%f(i,j,a,b) * d_cc_space_v_oovv%f(i,j,a,b) enddo enddo enddo @@ -333,6 +300,12 @@ subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy) energy = energy + e !$omp end critical !$omp end parallel +! +! +! call gpu_ddot(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1, d_cc_space_v_oovv, 1, energy) +! call gpu_ddot(blas_handle, nO*nV*1_8, d_cc_space_f_vo, 1, t1, 1, e) +! energy = energy + 2.d0*e + end @@ -372,26 +345,24 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau) ! !$OMP END PARALLEL - type(gpu_blas) :: blas type(gpu_stream) :: stream(nV) - call gpu_blas_create(blas) do b=1,nV call gpu_stream_create(stream(b)) enddo !$OMP PARALLEL & - !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas) & + !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) !$OMP DO do b=1,nV - call gpu_set_stream(blas,stream(b)) + call gpu_set_stream(blas_handle,stream(b)) do j=1,nO - call gpu_dgeam_c(blas%c, 'N', 'N', nO*1_8, nV*1_8, & - 1.d0, c_loc(t2%f(1,j,1,b)), nO*nO*1_8, & - h_t1(j,b), t1%c, nO*1_8, & - c_loc(tau%f(1,j,1,b)), nO*nO*1_8) + call gpu_dgeam(blas_handle, 'N', 'N', nO*1_8, nV*1_8, & + 1.d0, t2%f(1,j,1,b), nO*nO*1_8, & + h_t1(j,b), t1%f, nO*1_8, & + tau%f(1,j,1,b), nO*nO*1_8) enddo enddo !$OMP END DO @@ -403,8 +374,6 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau) call gpu_stream_destroy(stream(b)) enddo - call gpu_blas_destroy(blas) - end subroutine update_tau_x_space(nO,nV,tau,tau_x) @@ -438,26 +407,24 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x) ! !$OMP END DO ! !$OMP END PARALLEL - type(gpu_blas) :: blas type(gpu_stream) :: stream(nV) - call gpu_blas_create(blas) do a=1,nV call gpu_stream_create(stream(a)) enddo !$OMP PARALLEL & - !$OMP SHARED(nO,nV,tau,tau_x,stream,blas) & + !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) !$OMP DO do b=1,nV do a=1,nV - call gpu_set_stream(blas,stream(a)) - call gpu_dgeam_c(blas%c, 'N', 'N', nO*1_8, nO*1_8, & - 2.d0, c_loc(tau%f(1,1,a,b)), nO*1_8, & - -1.d0, c_loc(tau%f(1,1,b,a)), nO*1_8, & - c_loc(tau_x%f(1,1,a,b)), nO*1_8) + call gpu_set_stream(blas_handle,stream(a)) + call gpu_dgeam(blas_handle, 'N', 'N', nO*1_8, nO*1_8, & + 2.d0, tau%f(1,1,a,b), nO*1_8, & + -1.d0, tau%f(1,1,b,a), nO*1_8, & + tau_x%f(1,1,a,b), nO*1_8) enddo enddo !$OMP END DO @@ -469,8 +436,6 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x) call gpu_stream_destroy(stream(b)) enddo - call gpu_blas_destroy(blas) - end ! R1 diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90 index d1ddad4c..2057d1eb 100644 --- a/src/gpu/gpu_module.F90 +++ b/src/gpu/gpu_module.F90 @@ -144,6 +144,16 @@ module gpu type(c_ptr), value :: a, b, c end subroutine + subroutine gpu_sgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, & + b, ldb, c, ldc) bind(C, name='gpu_sgeam') + import + type(c_ptr), intent(in) :: handle + character(c_char), intent(in), value :: transa, transb + integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc + real(c_float), intent(in), value :: alpha, beta + type(c_ptr), value :: a, b, c + end subroutine + end interface @@ -478,5 +488,57 @@ module gpu call gpu_blas_destroy_c(handle%c) end subroutine + end module + + +! dot +! --- + +subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) + use gpu + type(gpu_blas), intent(in) :: handle + integer*8 :: n, incx, incy + double precision, intent(in) :: dx(*), dy(*) + double precision, intent(out) :: res + call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res) +end subroutine + +subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res) + use gpu + type(gpu_blas), intent(in) :: handle + integer*8 :: n, incx, incy + real, intent(in) :: dx(*), dy(*) + real, intent(out) :: res + call gpu_sdot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res) +end subroutine + + +! geam +! ---- + +subroutine gpu_dgeam(handle, transa, transb, m, n, alpha, a, lda, beta, & + b, ldb, c, ldc) + use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*8, intent(in) :: m, n, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision :: a(lda,*), b(ldb,*), c(ldc,*) + call gpu_dgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, & + c_loc(b), ldb, c_loc(c), ldc) +end subroutine + +subroutine gpu_sgeam(handle, transa, transb, m, n, alpha, a, lda, beta, & + b, ldb, c, ldc) + use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*8, intent(in) :: m, n, lda, ldb, ldc + real, intent(in) :: alpha, beta + real :: a(lda,*), b(ldb,*), c(ldc,*) + call gpu_sgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, & + c_loc(b), ldb, c_loc(c), ldc) +end subroutine + From a5f4f0516eec9f17438474529616368a6f9e5de4 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Fri, 28 Jun 2024 17:39:43 +0200 Subject: [PATCH 08/38] Fixing compile --- src/ccsd/ccsd_space_orb_sub.irp.f | 1 - src/gpu/gpu_module.F90 | 23 +++++++++-------------- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 1329f172..4e06e31d 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -306,7 +306,6 @@ subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1, ! call gpu_ddot(blas_handle, nO*nV*1_8, d_cc_space_f_vo, 1, t1, 1, e) ! energy = energy + 2.d0*e - end ! Tau diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90 index 2057d1eb..d7c26ba6 100644 --- a/src/gpu/gpu_module.F90 +++ b/src/gpu/gpu_module.F90 @@ -365,42 +365,42 @@ module gpu subroutine gpu_download_double1(gpu_ptr, cpu_ptr) implicit none type(gpu_double1), intent(in) :: gpu_ptr - double precision, intent(in) :: cpu_ptr(:) + double precision, target, intent(in) :: cpu_ptr(:) call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*size(gpu_ptr%f)) end subroutine subroutine gpu_download_double2(gpu_ptr, cpu_ptr) implicit none type(gpu_double2), intent(in) :: gpu_ptr - double precision, intent(in) :: cpu_ptr(:,:) + double precision, target, intent(in) :: cpu_ptr(:,:) call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) end subroutine subroutine gpu_download_double3(gpu_ptr, cpu_ptr) implicit none type(gpu_double3), intent(in) :: gpu_ptr - double precision, intent(in) :: cpu_ptr(:,:,:) + double precision, target, intent(in) :: cpu_ptr(:,:,:) call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) end subroutine subroutine gpu_download_double4(gpu_ptr, cpu_ptr) implicit none type(gpu_double4), intent(in) :: gpu_ptr - double precision, intent(in) :: cpu_ptr(:,:,:,:) + double precision, target, intent(in) :: cpu_ptr(:,:,:,:) call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) end subroutine subroutine gpu_download_double5(gpu_ptr, cpu_ptr) implicit none type(gpu_double5), intent(in) :: gpu_ptr - double precision, intent(in) :: cpu_ptr(:,:,:,:,:) + double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:) call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) end subroutine subroutine gpu_download_double6(gpu_ptr, cpu_ptr) implicit none type(gpu_double6), intent(in) :: gpu_ptr - double precision, intent(in) :: cpu_ptr(:,:,:,:,:,:) + double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:,:) call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) end subroutine @@ -454,19 +454,16 @@ module gpu ! ---------- subroutine gpu_stream_create(stream) - import type(gpu_stream) :: stream call gpu_stream_create_c(stream%c) end subroutine subroutine gpu_stream_destroy(stream) - import type(gpu_stream) :: stream call gpu_stream_destroy_c(stream%c) end subroutine subroutine gpu_set_stream(handle, stream) - import type(gpu_blas) :: handle type(gpu_stream) :: stream call gpu_set_stream_c(handle%c, stream%c) @@ -477,13 +474,11 @@ module gpu ! -------- subroutine gpu_blas_create(handle) - import type(gpu_blas) :: handle call gpu_blas_create_c(handle%c) end subroutine subroutine gpu_blas_destroy(handle) - import type(gpu_blas) :: handle call gpu_blas_destroy_c(handle%c) end subroutine @@ -500,7 +495,7 @@ subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) use gpu type(gpu_blas), intent(in) :: handle integer*8 :: n, incx, incy - double precision, intent(in) :: dx(*), dy(*) + double precision, target, intent(in) :: dx(*), dy(*) double precision, intent(out) :: res call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res) end subroutine @@ -525,7 +520,7 @@ subroutine gpu_dgeam(handle, transa, transb, m, n, alpha, a, lda, beta, & character, intent(in) :: transa, transb integer*8, intent(in) :: m, n, lda, ldb, ldc double precision, intent(in) :: alpha, beta - double precision :: a(lda,*), b(ldb,*), c(ldc,*) + double precision, target :: a(lda,*), b(ldb,*), c(ldc,*) call gpu_dgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, & c_loc(b), ldb, c_loc(c), ldc) end subroutine @@ -537,7 +532,7 @@ subroutine gpu_sgeam(handle, transa, transb, m, n, alpha, a, lda, beta, & character, intent(in) :: transa, transb integer*8, intent(in) :: m, n, lda, ldb, ldc real, intent(in) :: alpha, beta - real :: a(lda,*), b(ldb,*), c(ldc,*) + real, target :: a(lda,*), b(ldb,*), c(ldc,*) call gpu_sgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, & c_loc(b), ldb, c_loc(c), ldc) end subroutine From c7df9a72cc68a7f5dfded36aa94ac50d5188a5a1 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Fri, 28 Jun 2024 21:32:04 +0200 Subject: [PATCH 09/38] Fixing again actions --- src/gpu/gpu_module.F90 | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90 index d7c26ba6..ecf79c83 100644 --- a/src/gpu/gpu_module.F90 +++ b/src/gpu/gpu_module.F90 @@ -318,42 +318,42 @@ module gpu subroutine gpu_upload_double1(cpu_ptr, gpu_ptr) implicit none - double precision, intent(in) :: cpu_ptr(:) + double precision, target, intent(in) :: cpu_ptr(*) type(gpu_double1), intent(in) :: gpu_ptr call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, 8_8*size(gpu_ptr%f)) end subroutine subroutine gpu_upload_double2(cpu_ptr, gpu_ptr) implicit none - double precision, intent(in) :: cpu_ptr(:,:) + double precision, target, intent(in) :: cpu_ptr(:,:) type(gpu_double2), intent(in) :: gpu_ptr call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) end subroutine subroutine gpu_upload_double3(cpu_ptr, gpu_ptr) implicit none - double precision, intent(in) :: cpu_ptr(:,:,:) + double precision, target, intent(in) :: cpu_ptr(:,:,:) type(gpu_double3), intent(in) :: gpu_ptr call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) end subroutine subroutine gpu_upload_double4(cpu_ptr, gpu_ptr) implicit none - double precision, intent(in) :: cpu_ptr(:,:,:,:) + double precision, target, intent(in) :: cpu_ptr(:,:,:,:) type(gpu_double4), intent(in) :: gpu_ptr call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) end subroutine subroutine gpu_upload_double5(cpu_ptr, gpu_ptr) implicit none - double precision, intent(in) :: cpu_ptr(:,:,:,:,:) + double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:) type(gpu_double5), intent(in) :: gpu_ptr call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) end subroutine subroutine gpu_upload_double6(cpu_ptr, gpu_ptr) implicit none - double precision, intent(in) :: cpu_ptr(:,:,:,:,:,:) + double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:,:) type(gpu_double6), intent(in) :: gpu_ptr call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) end subroutine @@ -504,7 +504,7 @@ subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res) use gpu type(gpu_blas), intent(in) :: handle integer*8 :: n, incx, incy - real, intent(in) :: dx(*), dy(*) + real, target, intent(in) :: dx(*), dy(*) real, intent(out) :: res call gpu_sdot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res) end subroutine From b467bef6dd1e14c5914cc6508aa898d5f1665e3a Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Fri, 28 Jun 2024 21:37:14 +0200 Subject: [PATCH 10/38] Forgot file --- src/ccsd/ccsd_space_orb_sub.irp.f | 68 +++++++++++++++---------------- src/gpu/gpu.irp.f | 11 +++++ 2 files changed, 45 insertions(+), 34 deletions(-) create mode 100644 src/gpu/gpu.irp.f diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 4e06e31d..5c2daa05 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -271,40 +271,40 @@ subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1, integer :: i,j,a,b double precision :: e - energy = 0d0 - !$omp parallel & - !$omp shared(nO,nV,energy,tau_x,t1,& - !$omp d_cc_space_f_vo,d_cc_space_v_oovv) & - !$omp private(i,j,a,b,e) & - !$omp default(none) - e = 0d0 - !$omp do - do a = 1, nV - do i = 1, nO - e = e + 2d0 * d_cc_space_f_vo%f(a,i) * t1%f(i,a) - enddo - enddo - !$omp end do nowait - !$omp do - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - e = e + tau_x%f(i,j,a,b) * d_cc_space_v_oovv%f(i,j,a,b) - enddo - enddo - enddo - enddo - !$omp end do nowait - !$omp critical - energy = energy + e - !$omp end critical - !$omp end parallel -! -! -! call gpu_ddot(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1, d_cc_space_v_oovv, 1, energy) -! call gpu_ddot(blas_handle, nO*nV*1_8, d_cc_space_f_vo, 1, t1, 1, e) -! energy = energy + 2.d0*e +! energy = 0d0 +! !$omp parallel & +! !$omp shared(nO,nV,energy,tau_x,t1,& +! !$omp d_cc_space_f_vo,d_cc_space_v_oovv) & +! !$omp private(i,j,a,b,e) & +! !$omp default(none) +! e = 0d0 +! !$omp do +! do a = 1, nV +! do i = 1, nO +! e = e + 2d0 * d_cc_space_f_vo%f(a,i) * t1%f(i,a) +! enddo +! enddo +! !$omp end do nowait +! !$omp do +! do b = 1, nV +! do a = 1, nV +! do j = 1, nO +! do i = 1, nO +! e = e + tau_x%f(i,j,a,b) * d_cc_space_v_oovv%f(i,j,a,b) +! enddo +! enddo +! enddo +! enddo +! !$omp end do nowait +! !$omp critical +! energy = energy + e +! !$omp end critical +! !$omp end parallel + + + call gpu_ddot(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1, d_cc_space_v_oovv, 1, energy) + call gpu_ddot(blas_handle, nO*nV*1_8, d_cc_space_f_vo, 1, t1, 1, e) + energy = energy + 2.d0*e end diff --git a/src/gpu/gpu.irp.f b/src/gpu/gpu.irp.f new file mode 100644 index 00000000..e91d66f5 --- /dev/null +++ b/src/gpu/gpu.irp.f @@ -0,0 +1,11 @@ +use gpu + +BEGIN_PROVIDER [ type(gpu_blas), blas_handle ] + implicit none + BEGIN_DOC + ! Handle for cuBLAS or RocBLAS + END_DOC + call gpu_blas_create(blas_handle) +END_PROVIDER + + From 860121d404f7ae255790cd12136139103bdc48d0 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Sat, 29 Jun 2024 02:27:50 +0200 Subject: [PATCH 11/38] H_oo on GPU --- plugins/local/gpu_nvidia/gpu.c | 224 +++++++++++---------- plugins/local/gpu_x86/gpu.c | 38 ++-- src/ccsd/ccsd_space_orb_sub.irp.f | 112 ++++++++--- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 127 ++++++++---- src/gpu/gpu.irp.f | 7 + src/gpu/gpu_module.F90 | 260 +++++++++++++++++++++---- 6 files changed, 540 insertions(+), 228 deletions(-) diff --git a/plugins/local/gpu_nvidia/gpu.c b/plugins/local/gpu_nvidia/gpu.c index f0bd247a..189de64c 100644 --- a/plugins/local/gpu_nvidia/gpu.c +++ b/plugins/local/gpu_nvidia/gpu.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -10,6 +11,10 @@ /* Generic functions */ +bool no_gpu() { + return false; +} + int gpu_ndevices() { int ngpus; cudaGetDeviceCount(&ngpus); @@ -17,7 +22,7 @@ int gpu_ndevices() { } void gpu_set_device(int32_t igpu) { - cudaSetDevice(igpu); + cudaSetDevice((int) igpu); } @@ -64,22 +69,20 @@ void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) { /* Streams */ -void gpu_stream_create(void** ptr) { - cudaStream_t stream; - cudaError_t rc = cudaStreamCreate(&stream); +void gpu_stream_create(cudaStream_t* ptr) { + cudaError_t rc = cudaStreamCreate(ptr); assert (rc == cudaSuccess); - *ptr = (void*) stream; } -void gpu_stream_destroy(void** ptr) { - assert (*ptr != NULL); - cudaError_t rc = cudaStreamDestroy( (cudaStream_t) *ptr); +void gpu_stream_destroy(cudaStream_t* ptr) { + assert (ptr != NULL); + cudaError_t rc = cudaStreamDestroy(*ptr); assert (rc == cudaSuccess); *ptr = NULL; } -void gpu_set_stream(void** handle, void** stream) { - cublasSetStream( (cublasHandle_t) *handle, (cudaStream_t) *stream); +void gpu_set_stream(cublasHandle_t handle, cudaStream_t stream) { + cublasSetStream(handle, stream); } void gpu_synchronize() { @@ -89,75 +92,80 @@ void gpu_synchronize() { /* BLAS functions */ -void gpu_blas_create(void** handle) { - cublasHandle_t cublas_handle; - cublasStatus_t rc = cublasCreate(&cublas_handle); +void gpu_blas_create(cublasHandle_t* ptr) { + cublasStatus_t rc = cublasCreate(ptr); assert (rc == CUBLAS_STATUS_SUCCESS); - *handle = (void*) cublas_handle; } -void gpu_blas_destroy(void** handle) { - assert (*handle != NULL); - cublasStatus_t rc = cublasDestroy( (cublasHandle_t) *handle); +void gpu_blas_destroy(cublasHandle_t* ptr) { + assert (ptr != NULL); + cublasStatus_t rc = cublasDestroy(*ptr); assert (rc == CUBLAS_STATUS_SUCCESS); - *handle = NULL; + ptr = NULL; } -void gpu_ddot(void** handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { - assert (*handle != NULL); +void gpu_ddot(cublasHandle_t handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { + assert (handle != NULL); + /* Convert to int */ + int n_, incx_, incy_; - /* Convert to int32_t */ - int32_t n_, incx_, incy_; + n_ = (int) n; + incx_ = (int) incx; + incy_ = (int) incy; - n_ = (int32_t) n; - incx_ = (int32_t) incx; - incy_ = (int32_t) incy; + assert ( (int64_t) n_ == n ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + cublasStatus_t rc = cublasDdot(handle, n_, x, incx_, y, incy_, result); +/* + double alpha = 1.0; + double beta = 0.0; + cublasStatus_t rc = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, n_, &alpha, x, 1, y, n_, &beta, &result_, 1); +*/ + assert (rc == CUBLAS_STATUS_SUCCESS); +} + + + +void gpu_sdot(cublasHandle_t handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { + assert (handle != NULL); + + /* Convert to int */ + int n_, incx_, incy_; + + n_ = (int) n; + incx_ = (int) incx; + incy_ = (int) incy; /* Check for integer overflows */ assert ( (int64_t) n_ == n ); assert ( (int64_t) incx_ == incx); assert ( (int64_t) incy_ == incy); - cublasDdot((cublasHandle_t) *handle, n_, x, incx_, y, incy_, result); + float result_ = 0.; + cublasStatus_t rc = cublasSdot(handle, n_, x, incx_, y, incy_, &result_); + assert (rc == CUBLAS_STATUS_SUCCESS); + *result = result_; } -void gpu_sdot(void** handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { - assert (*handle != NULL); - - /* Convert to int32_t */ - int32_t n_, incx_, incy_; - - n_ = (int32_t) n; - incx_ = (int32_t) incx; - incy_ = (int32_t) incy; - - /* Check for integer overflows */ - assert ( (int64_t) n_ == n ); - assert ( (int64_t) incx_ == incx); - assert ( (int64_t) incy_ == incy); - - cublasSdot((cublasHandle_t) *handle, n_, x, incx_, y, incy_, result); -} - - - -void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t n, const double alpha, +void gpu_dgemv(cublasHandle_t handle, const char transa, const int64_t m, const int64_t n, const double alpha, const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) { - assert (*handle != NULL); + assert (handle != NULL); - /* Convert to int32_t */ - int32_t m_, n_, lda_, incx_, incy_; + /* Convert to int */ + int m_, n_, lda_, incx_, incy_; - m_ = (int32_t) m; - n_ = (int32_t) n; - lda_ = (int32_t) lda; - incx_ = (int32_t) incx; - incy_ = (int32_t) incy; + m_ = (int) m; + n_ = (int) n; + lda_ = (int) lda; + incx_ = (int) incx; + incy_ = (int) incy; /* Check for integer overflows */ assert ( (int64_t) m_ == m ); @@ -169,24 +177,24 @@ void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t cublasOperation_t transa_ = CUBLAS_OP_N; if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; - cublasDgemv((cublasHandle_t) *handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_); + cublasDgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_); } -void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t n, const float alpha, +void gpu_sgemv(cublasHandle_t handle, const char transa, const int64_t m, const int64_t n, const float alpha, const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) { - assert (*handle != NULL); + assert (handle != NULL); - /* Convert to int32_t */ - int32_t m_, n_, lda_, incx_, incy_; + /* Convert to int */ + int m_, n_, lda_, incx_, incy_; - m_ = (int32_t) m; - n_ = (int32_t) n; - lda_ = (int32_t) lda; - incx_ = (int32_t) incx; - incy_ = (int32_t) incy; + m_ = (int) m; + n_ = (int) n; + lda_ = (int) lda; + incx_ = (int) incx; + incy_ = (int) incy; /* Check for integer overflows */ assert ( (int64_t) m_ == m ); @@ -198,24 +206,24 @@ void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t cublasOperation_t transa_ = CUBLAS_OP_N; if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; - cublasSgemv((cublasHandle_t) *handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_); + cublasSgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_); } -void gpu_dgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, +void gpu_dgemm(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) { - assert (*handle != NULL); + assert (handle != NULL); - /* Convert to int32_t */ - int32_t m_, n_, k_, lda_, ldb_, ldc_; + /* Convert to int */ + int m_, n_, k_, lda_, ldb_, ldc_; - m_ = (int32_t) m; - n_ = (int32_t) n; - k_ = (int32_t) k; - lda_ = (int32_t) lda; - ldb_ = (int32_t) ldb; - ldc_ = (int32_t) ldc; + m_ = (int) m; + n_ = (int) n; + k_ = (int) k; + lda_ = (int) lda; + ldb_ = (int) ldb; + ldc_ = (int) ldc; /* Check for integer overflows */ assert ( (int64_t) m_ == m ); @@ -230,25 +238,25 @@ void gpu_dgemm(void** handle, const char transa, const char transb, const int64_ if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; - cublasDgemm((cublasHandle_t) *handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_); + cublasDgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_); } -void gpu_sgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, +void gpu_sgemm(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) { - assert (*handle != NULL); + assert (handle != NULL); - /* Convert to int32_t */ - int32_t m_, n_, k_, lda_, ldb_, ldc_; + /* Convert to int */ + int m_, n_, k_, lda_, ldb_, ldc_; - m_ = (int32_t) m; - n_ = (int32_t) n; - k_ = (int32_t) k; - lda_ = (int32_t) lda; - ldb_ = (int32_t) ldb; - ldc_ = (int32_t) ldc; + m_ = (int) m; + n_ = (int) n; + k_ = (int) k; + lda_ = (int) lda; + ldb_ = (int) ldb; + ldc_ = (int) ldc; /* Check for integer overflows */ assert ( (int64_t) m_ == m ); @@ -263,22 +271,22 @@ void gpu_sgemm(void** handle, const char transa, const char transb, const int64_ if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; - cublasSgemm((cublasHandle_t) *handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_); + cublasSgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_); } -void gpu_dgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, +void gpu_dgeam(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { - assert (*handle != NULL); + assert (handle != NULL); - /* Convert to int32_t */ - int32_t m_, n_, lda_, ldb_, ldc_; + /* Convert to int */ + int m_, n_, lda_, ldb_, ldc_; - m_ = (int32_t) m; - n_ = (int32_t) n; - lda_ = (int32_t) lda; - ldb_ = (int32_t) ldb; - ldc_ = (int32_t) ldc; + m_ = (int) m; + n_ = (int) n; + lda_ = (int) lda; + ldb_ = (int) ldb; + ldc_ = (int) ldc; /* Check for integer overflows */ assert ( (int64_t) m_ == m ); @@ -292,23 +300,23 @@ void gpu_dgeam(void** handle, const char transa, const char transb, const int64_ if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; - cublasDgeam((cublasHandle_t) *handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_); + cublasDgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_); } -void gpu_sgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, +void gpu_sgeam(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) { - assert (*handle != NULL); + assert (handle != NULL); - /* Convert to int32_t */ - int32_t m_, n_, lda_, ldb_, ldc_; + /* Convert to int */ + int m_, n_, lda_, ldb_, ldc_; - m_ = (int32_t) m; - n_ = (int32_t) n; - lda_ = (int32_t) lda; - ldb_ = (int32_t) ldb; - ldc_ = (int32_t) ldc; + m_ = (int) m; + n_ = (int) n; + lda_ = (int) lda; + ldb_ = (int) ldb; + ldc_ = (int) ldc; /* Check for integer overflows */ assert ( (int64_t) m_ == m ); @@ -322,6 +330,6 @@ void gpu_sgeam(void** handle, const char transa, const char transb, const int64_ if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; - cublasSgeam((cublasHandle_t) *handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_); + cublasSgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_); } diff --git a/plugins/local/gpu_x86/gpu.c b/plugins/local/gpu_x86/gpu.c index ac7c3620..53267a7c 100644 --- a/plugins/local/gpu_x86/gpu.c +++ b/plugins/local/gpu_x86/gpu.c @@ -2,8 +2,12 @@ #include #include #include +#include #include +bool no_gpu() { + return true; +} /* Generic functions */ @@ -56,7 +60,7 @@ void gpu_stream_destroy(void** ptr) { *ptr = NULL; } -void gpu_set_stream(void** handle, void** stream) { +void gpu_set_stream(void* handle, void* stream) { return; } @@ -79,8 +83,8 @@ void gpu_blas_destroy(void** handle) { double ddot_(const int32_t* n, const double* x, const int32_t* incx, const double* y, const int32_t* incy); -void gpu_ddot(void** handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { - assert (*handle != NULL); +void gpu_ddot(void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { + assert (handle != NULL); /* Convert to int32_t */ int32_t n_, incx_, incy_; @@ -100,8 +104,8 @@ void gpu_ddot(void** handle, const int64_t n, const double* x, const int64_t inc float sdot_(const int32_t* n, const float* x, const int32_t* incx, const float* y, const int32_t* incy); -void gpu_sdot(void** handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { - assert (*handle != NULL); +void gpu_sdot(void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { + assert (handle != NULL); /* Convert to int32_t */ int32_t n_, incx_, incy_; @@ -122,10 +126,10 @@ void gpu_sdot(void** handle, const int64_t n, const float* x, const int64_t incx void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha, const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy); -void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t n, const double alpha, +void gpu_dgemv(void* handle, const char transa, const int64_t m, const int64_t n, const double alpha, const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) { - assert (*handle != NULL); + assert (handle != NULL); /* Convert to int32_t */ int32_t m_, n_, lda_, incx_, incy_; @@ -150,10 +154,10 @@ void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha, const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy); -void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t n, const float alpha, +void gpu_sgemv(void* handle, const char transa, const int64_t m, const int64_t n, const float alpha, const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) { - assert (*handle != NULL); + assert (handle != NULL); /* Convert to int32_t */ int32_t m_, n_, lda_, incx_, incy_; @@ -178,10 +182,10 @@ void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha, const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc); -void gpu_dgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, +void gpu_dgemm(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) { - assert (*handle != NULL); + assert (handle != NULL); /* Convert to int32_t */ int32_t m_, n_, k_, lda_, ldb_, ldc_; @@ -209,10 +213,10 @@ void gpu_dgemm(void** handle, const char transa, const char transb, const int64_ void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha, const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc); -void gpu_sgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, +void gpu_sgemm(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) { - assert (*handle != NULL); + assert (handle != NULL); /* Convert to int32_t */ int32_t m_, n_, k_, lda_, ldb_, ldc_; @@ -236,9 +240,9 @@ void gpu_sgemm(void** handle, const char transa, const char transb, const int64_ } -void gpu_dgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, +void gpu_dgeam(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { - assert (*handle != NULL); + assert (handle != NULL); if ( (transa == 'N' && transb == 'N') || (transa == 'n' && transb == 'N') || @@ -368,9 +372,9 @@ void gpu_dgeam(void** handle, const char transa, const char transb, const int64_ } -void gpu_sgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, +void gpu_sgeam(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) { - assert (*handle != NULL); + assert (handle != NULL); if ( (transa == 'N' && transb == 'N') || (transa == 'n' && transb == 'N') || diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 5c2daa05..5ee7366e 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -14,9 +14,15 @@ subroutine run_ccsd_space_orb type(gpu_double2) :: t1, r1 type(gpu_double2) :: H_oo, H_vv, H_vo - type(gpu_double2) :: d_cc_space_f_vo + type(gpu_double2) :: d_cc_space_f_oo, d_cc_space_f_vo + type(gpu_double2) :: d_cc_space_f_ov, d_cc_space_f_vv + + type(gpu_double3) :: d_cc_space_v_oo_chol, d_cc_space_v_vo_chol + type(gpu_double3) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol + type(gpu_double4) :: d_cc_space_v_oovv + double precision, allocatable :: all_err(:,:), all_t(:,:) integer, allocatable :: list_occ(:), list_vir(:) integer(bit_kind) :: det(N_int,2) @@ -24,7 +30,7 @@ subroutine run_ccsd_space_orb call set_multiple_levels_omp(.False.) - if (do_ao_cholesky) then + if (do_mo_cholesky) then PROVIDE cholesky_mo_transp FREE cholesky_ao else @@ -55,11 +61,36 @@ subroutine run_ccsd_space_orb !print*,'occ',list_occ !print*,'vir',list_vir + ! GPU arrays + call gpu_allocate(d_cc_space_f_oo, nO, nO) call gpu_allocate(d_cc_space_f_vo, nV, nO) - call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV) - call gpu_upload(cc_space_f_vo, d_cc_space_f_vo) - call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv) + call gpu_allocate(d_cc_space_f_ov, nO, nV) + call gpu_allocate(d_cc_space_f_vv, nV, nV) + call gpu_upload(cc_space_f_oo, d_cc_space_f_oo) + call gpu_upload(cc_space_f_vo, d_cc_space_f_vo) + call gpu_upload(cc_space_f_vv, d_cc_space_f_vv) + +! FREE cc_space_f_oo +! FREE cc_space_f_vo +! FREE cc_space_f_vv + + if (do_mo_cholesky) then + call gpu_allocate(d_cc_space_v_oo_chol, cholesky_mo_num, nO, nO) + call gpu_allocate(d_cc_space_v_ov_chol, cholesky_mo_num, nO, nV) + call gpu_allocate(d_cc_space_v_vo_chol, cholesky_mo_num, nV, nO) + call gpu_allocate(d_cc_space_v_vv_chol, cholesky_mo_num, nV, nV) + + call gpu_upload(cc_space_v_oo_chol, d_cc_space_v_oo_chol) + call gpu_upload(cc_space_v_ov_chol, d_cc_space_v_ov_chol) + call gpu_upload(cc_space_v_vo_chol, d_cc_space_v_vo_chol) + call gpu_upload(cc_space_v_vv_chol, d_cc_space_v_vv_chol) + +! FREE cc_space_v_oo_chol +! FREE cc_space_v_ov_chol +! FREE cc_space_v_vo_chol +! FREE cc_space_v_vv_chol + endif call gpu_allocate(t2, nO,nO,nV,nV) call gpu_allocate(r2, nO,nO,nV,nV) @@ -120,6 +151,13 @@ subroutine run_ccsd_space_orb call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,h_t2) call gpu_upload(h_t2, t2) + + call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV) + call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv) + +! FREE cc_space_v_oovv + + call update_tau_space(nO,nV,h_t1,t1,t2,tau) call update_tau_x_space(nO,nV,tau,tau_x) !print*,'hf_energy', hf_energy @@ -142,10 +180,10 @@ subroutine run_ccsd_space_orb do while (not_converged) ! Residue - if (do_ao_cholesky) then -! if (.False.) then - call compute_H_oo_chol(nO,nV,tau_x,H_oo) - call compute_H_vv_chol(nO,nV,tau_x%f,H_vv%f) + if (do_mo_cholesky) then + call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & + d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo) + call compute_H_vv_chol(nO,nV,tau_x,H_vv) call compute_H_vo_chol(nO,nV,t1%f,H_vo%f) call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1) @@ -249,6 +287,12 @@ subroutine run_ccsd_space_orb call save_energy(uncorr_energy + energy, e_t) deallocate(h_t1, h_t2) + if (do_mo_cholesky) then + call gpu_deallocate(d_cc_space_v_oo_chol) + call gpu_deallocate(d_cc_space_v_ov_chol) + call gpu_deallocate(d_cc_space_v_vo_chol) + call gpu_deallocate(d_cc_space_v_vv_chol) + endif call gpu_deallocate(d_cc_space_f_vo) call gpu_deallocate(d_cc_space_v_oovv) call gpu_deallocate(t1) @@ -302,8 +346,21 @@ subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1, ! !$omp end parallel - call gpu_ddot(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1, d_cc_space_v_oovv, 1, energy) - call gpu_ddot(blas_handle, nO*nV*1_8, d_cc_space_f_vo, 1, t1, 1, e) + type(gpu_stream) :: s1, s2 + call gpu_stream_create(s1) + call gpu_stream_create(s2) + + call gpu_set_stream(blas_handle,s1) + call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo, 1, t1, 1, e) + + call gpu_set_stream(blas_handle,s2) + call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1_8, d_cc_space_v_oovv, 1_8, energy) + call gpu_synchronize() + call gpu_set_stream(blas_handle,gpu_default_stream) + + call gpu_stream_destroy(s1) + call gpu_stream_destroy(s2) + energy = energy + 2.d0*e end @@ -346,32 +403,29 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau) type(gpu_stream) :: stream(nV) - do b=1,nV - call gpu_stream_create(stream(b)) - enddo - - !$OMP PARALLEL & + !$OMP PARALLEL if (no_gpu()) & !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) !$OMP DO do b=1,nV + call gpu_stream_create(stream(b)) call gpu_set_stream(blas_handle,stream(b)) do j=1,nO - call gpu_dgeam(blas_handle, 'N', 'N', nO*1_8, nV*1_8, & - 1.d0, t2%f(1,j,1,b), nO*nO*1_8, & - h_t1(j,b), t1%f, nO*1_8, & - tau%f(1,j,1,b), nO*nO*1_8) + call gpu_dgeam_f(blas_handle, 'N', 'N', nO, nV, & + 1.d0, t2%f(1,j,1,b), nO*nO, & + h_t1(j,b), t1%f, nO, & + tau%f(1,j,1,b), nO*nO) enddo enddo !$OMP END DO !$OMP END PARALLEL - call gpu_synchronize() - do b=1,nV call gpu_stream_destroy(stream(b)) enddo + call gpu_set_stream(blas_handle,gpu_default_stream) + end @@ -412,7 +466,7 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x) call gpu_stream_create(stream(a)) enddo - !$OMP PARALLEL & + !$OMP PARALLEL if (no_gpu()) & !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) @@ -420,20 +474,20 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x) do b=1,nV do a=1,nV call gpu_set_stream(blas_handle,stream(a)) - call gpu_dgeam(blas_handle, 'N', 'N', nO*1_8, nO*1_8, & - 2.d0, tau%f(1,1,a,b), nO*1_8, & - -1.d0, tau%f(1,1,b,a), nO*1_8, & - tau_x%f(1,1,a,b), nO*1_8) + call gpu_dgeam_f(blas_handle, 'N', 'N', nO, nO, & + 2.d0, tau%f(1,1,a,b), nO, & + -1.d0, tau%f(1,1,b,a), nO, & + tau_x%f(1,1,a,b), nO) enddo enddo !$OMP END DO !$OMP END PARALLEL - call gpu_synchronize() - do b=1,nV call gpu_stream_destroy(stream(b)) enddo + call gpu_set_stream(blas_handle,gpu_default_stream) + end diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index 9b161001..288724f3 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -293,62 +293,115 @@ end ! H_oo -subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo) +subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & + d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo) use gpu implicit none integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: d_cc_space_f_oo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol type(gpu_double4), intent(in) :: tau_x type(gpu_double2), intent(out) :: H_oo integer :: a,b,i,j,u,k - double precision, allocatable :: tau_kau(:,:,:), tmp_vov(:,:,:) + type(gpu_double3) :: tau_kau, tmp_vov, tmp_ovv - allocate(tau_kau(cholesky_mo_num,nV,nO)) - !$omp parallel & - !$omp default(shared) & - !$omp private(i,u,j,k,a,b,tmp_vov) - allocate(tmp_vov(nV,nO,nV) ) - !$omp do - do u = 1, nO + call gpu_allocate(tau_kau, cholesky_mo_num, nV, nO) + +! !$omp parallel & +! !$omp default(shared) & +! !$omp private(i,u,j,k,a,b,tmp_vov) +! call gpu_allocate(tmp_vov, nV, nO, nV) +! !$omp do +! do u = 1, nO +! do b=1,nV +! do j=1,nO +! do a=1,nV +! tmp_vov%f(a,j,b) = tau_x%f(u,j,a,b) +! enddo +! enddo +! enddo +! call dgemm('N','T',cholesky_mo_num,nV,nO*nV,1.d0, & +! d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, tmp_vov%f, nV, & +! 0.d0, tau_kau%f(1,1,u), cholesky_mo_num) +! enddo +! !$omp end do nowait +! call gpu_deallocate(tmp_vov) +! !$omp do +! do i = 1, nO +! do u = 1, nO +! H_oo%f(u,i) = d_cc_space_f_oo%f(u,i) +! enddo +! enddo +! !$omp end do nowait +! +! !$omp barrier +! !$omp end parallel +! call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & +! tau_kau%f(1,1,1), cholesky_mo_num*nV, d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num*nV, & +! 1.d0, H_oo%f(1,1), nO) +! + + type(gpu_stream) :: stream(nV) + + do b=1,nV + call gpu_stream_create(stream(b)) + enddo + + !$OMP PARALLEL if (no_gpu()) & + !$OMP DEFAULT(SHARED) & + !$OMP PRIVATE(u,b,tmp_vov,tmp_ovv) + + call gpu_allocate(tmp_vov, nV, nO, nV) + call gpu_allocate(tmp_ovv, nO, nV, nV) + + !$OMP DO + do u=1,nO + call gpu_dgeam_f(blas_handle, 'N', 'N', 1, nO*nV*nV, 1.d0, & + tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f, nO, tmp_ovv%f, 1) do b=1,nV - do j=1,nO - do a=1,nV - tmp_vov(a,j,b) = tau_x%f(u,j,a,b) - enddo - enddo + call gpu_set_stream(blas_handle,stream(b)) + call gpu_dgeam_f(blas_handle, 'T', 'T', nV, nO, 1.d0, & + tmp_ovv%f(1,1,b), nO, 0.d0, & + tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV) enddo - call dgemm('N','T',cholesky_mo_num,nV,nO*nV,1.d0, & - cc_space_v_ov_chol, cholesky_mo_num, tmp_vov, nV, & - 0.d0, tau_kau(1,1,u), cholesky_mo_num) + call gpu_dgemm_f(blas_handle, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, & + d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_vov%f, nV, & + 0.d0, tau_kau%f(1,1,u), cholesky_mo_num) + call gpu_synchronize() enddo - !$omp end do nowait - deallocate(tmp_vov) - !$omp do - do i = 1, nO - do u = 1, nO - H_oo%f(u,i) = cc_space_f_oo(u,i) - enddo - enddo - !$omp end do nowait - !$omp barrier - !$omp end parallel - call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & - tau_kau, cholesky_mo_num*nV, cc_space_v_vo_chol, cholesky_mo_num*nV, & - 1.d0, H_oo%f, nO) + !$OMP END DO + call gpu_deallocate(tmp_vov) + call gpu_deallocate(tmp_ovv) + !$OMP END PARALLEL + + do b=1,nV + call gpu_stream_destroy(stream(b)) + enddo + + call gpu_set_stream(blas_handle,gpu_default_stream) + + call gpu_copy(d_cc_space_f_oo, H_oo) + + call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & + tau_kau, cholesky_mo_num*nV, d_cc_space_v_vo_chol, cholesky_mo_num*nV, & + 1.d0, H_oo, nO) + + call gpu_deallocate(tau_kau) end ! H_vv subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv) - + use gpu implicit none integer, intent(in) :: nO,nV - double precision, intent(in) :: tau_x(nO, nO, nV, nV) - double precision, intent(out) :: H_vv(nV, nV) + type(gpu_double4), intent(in) :: tau_x + type(gpu_double2), intent(out) :: H_vv integer :: a,b,i,j,u,k, beta @@ -364,7 +417,7 @@ subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv) do b=1,nV do j=1,nO do i=1,nO - tmp_oov(i,j,b) = tau_x(i,j,a,b) + tmp_oov(i,j,b) = tau_x%f(i,j,a,b) enddo enddo enddo @@ -378,7 +431,7 @@ subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv) !$omp do do beta = 1, nV do a = 1, nV - H_vv(a,beta) = cc_space_f_vv(a,beta) + H_vv%f(a,beta) = cc_space_f_vv(a,beta) enddo enddo !$omp end do nowait @@ -386,7 +439,7 @@ subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv) !$omp end parallel call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & tau_kia, cholesky_mo_num*nO, cc_space_v_ov_chol, cholesky_mo_num*nO, & - 1.d0, H_vv, nV) + 1.d0, H_vv%f, nV) end diff --git a/src/gpu/gpu.irp.f b/src/gpu/gpu.irp.f index e91d66f5..6ad0a075 100644 --- a/src/gpu/gpu.irp.f +++ b/src/gpu/gpu.irp.f @@ -8,4 +8,11 @@ BEGIN_PROVIDER [ type(gpu_blas), blas_handle ] call gpu_blas_create(blas_handle) END_PROVIDER +BEGIN_PROVIDER [ type(gpu_stream), gpu_default_stream ] + implicit none + BEGIN_DOC + ! Default stream + END_DOC + gpu_default_stream%c = C_NULL_PTR +END_PROVIDER diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90 index ecf79c83..2676b339 100644 --- a/src/gpu/gpu_module.F90 +++ b/src/gpu/gpu_module.F90 @@ -49,7 +49,12 @@ module gpu ! ------------ interface + logical(c_bool) function no_gpu() bind(C) + import + end function + integer function gpu_ndevices() bind(C) + import end function subroutine gpu_set_device(id) bind(C) @@ -101,7 +106,7 @@ module gpu subroutine gpu_set_stream_c(handle, stream) bind(C, name='gpu_set_stream') import - type(c_ptr) :: handle, stream + type(c_ptr), value :: handle, stream end subroutine subroutine gpu_synchronize() bind(C) @@ -120,15 +125,15 @@ module gpu subroutine gpu_ddot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_ddot') import - type(c_ptr), intent(in) :: handle + type(c_ptr), value, intent(in) :: handle integer(c_int64_t), value :: n, incx, incy - type(c_ptr), intent(in), value :: dx, dy + type(c_ptr), value :: dx, dy real(c_double), intent(out) :: res end subroutine subroutine gpu_sdot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_sdot') import - type(c_ptr), intent(in) :: handle + type(c_ptr), value, intent(in) :: handle integer(c_int64_t), value :: n, incx, incy type(c_ptr), intent(in), value :: dx, dy real(c_float), intent(out) :: res @@ -137,8 +142,8 @@ module gpu subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, & b, ldb, c, ldc) bind(C, name='gpu_dgeam') import - type(c_ptr), intent(in) :: handle - character(c_char), intent(in), value :: transa, transb + type(c_ptr), value, intent(in) :: handle + character(c_char), intent(in), value :: transa, transb integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc real(c_double), intent(in), value :: alpha, beta type(c_ptr), value :: a, b, c @@ -147,13 +152,33 @@ module gpu subroutine gpu_sgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, & b, ldb, c, ldc) bind(C, name='gpu_sgeam') import - type(c_ptr), intent(in) :: handle - character(c_char), intent(in), value :: transa, transb + type(c_ptr), value, intent(in) :: handle + character(c_char), intent(in), value :: transa, transb integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc real(c_float), intent(in), value :: alpha, beta type(c_ptr), value :: a, b, c end subroutine + subroutine gpu_dgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) bind(C, name='gpu_dgemm') + import + type(c_ptr), value, intent(in) :: handle + character(c_char), intent(in), value :: transa, transb + integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc + real(c_double), intent(in), value :: alpha, beta + type(c_ptr), value :: a, b, c + end subroutine + + subroutine gpu_sgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) bind(C, name='gpu_sgemm') + import + type(c_ptr), value, intent(in) :: handle + character(c_char), intent(in), value :: transa, transb + integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc + real(c_float), intent(in), value :: alpha, beta + type(c_ptr), value :: a, b, c + end subroutine + end interface @@ -161,20 +186,26 @@ module gpu ! ---------------------- interface gpu_allocate - procedure gpu_allocate_double1 & - ,gpu_allocate_double2 & - ,gpu_allocate_double3 & - ,gpu_allocate_double4 & - ,gpu_allocate_double5 & - ,gpu_allocate_double6 + procedure gpu_allocate_double1 & + ,gpu_allocate_double2 & + ,gpu_allocate_double3 & + ,gpu_allocate_double4 & + ,gpu_allocate_double5 & + ,gpu_allocate_double6 & + ,gpu_allocate_double1_64 & + ,gpu_allocate_double2_64 & + ,gpu_allocate_double3_64 & + ,gpu_allocate_double4_64 & + ,gpu_allocate_double5_64 & + ,gpu_allocate_double6_64 end interface gpu_allocate interface gpu_deallocate - procedure gpu_deallocate_double1 & - ,gpu_deallocate_double2 & - ,gpu_deallocate_double3 & - ,gpu_deallocate_double4 & - ,gpu_deallocate_double5 & + procedure gpu_deallocate_double1 & + ,gpu_deallocate_double2 & + ,gpu_deallocate_double3 & + ,gpu_deallocate_double4 & + ,gpu_deallocate_double5 & ,gpu_deallocate_double6 end interface gpu_deallocate @@ -267,6 +298,61 @@ module gpu end subroutine + subroutine gpu_allocate_double1_64(ptr, s) + implicit none + type(gpu_double1), intent(inout) :: ptr + integer*8, intent(in) :: s + + call gpu_allocate_c(ptr%c, s) + call c_f_pointer(ptr%c, ptr%f, (/ s /)) + end subroutine + + subroutine gpu_allocate_double2_64(ptr, s1, s2) + implicit none + type(gpu_double2), intent(inout) :: ptr + integer*8, intent(in) :: s1, s2 + + call gpu_allocate_c(ptr%c, s1*s2*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /)) + end subroutine + + subroutine gpu_allocate_double3_64(ptr, s1, s2, s3) + implicit none + type(gpu_double3), intent(inout) :: ptr + integer*8, intent(in) :: s1, s2, s3 + + call gpu_allocate_c(ptr%c, s1*s2*s3*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /)) + end subroutine + + subroutine gpu_allocate_double4_64(ptr, s1, s2, s3, s4) + implicit none + type(gpu_double4), intent(inout) :: ptr + integer*8, intent(in) :: s1, s2, s3, s4 + + call gpu_allocate_c(ptr%c, s1*s2*s3*s4*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /)) + end subroutine + + subroutine gpu_allocate_double5_64(ptr, s1, s2, s3, s4, s5) + implicit none + type(gpu_double5), intent(inout) :: ptr + integer*8, intent(in) :: s1, s2, s3, s4, s5 + + call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /)) + end subroutine + + subroutine gpu_allocate_double6_64(ptr, s1, s2, s3, s4, s5, s6) + implicit none + type(gpu_double6), intent(inout) :: ptr + integer*8, intent(in) :: s1, s2, s3, s4, s5, s6 + + call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*s6*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /)) + end subroutine + + ! gpu_deallocate ! -------------- @@ -494,19 +580,38 @@ end module subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) use gpu type(gpu_blas), intent(in) :: handle - integer*8 :: n, incx, incy - double precision, target, intent(in) :: dx(*), dy(*) - double precision, intent(out) :: res - call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res) + integer*4 :: n, incx, incy + type(gpu_double1), intent(in) :: dx, dy + double precision, intent(out) :: res + call gpu_ddot_c(handle%c, int(n,c_int64_t), dx%c, int(incx,c_int64_t), dy%c, int(incy,c_int64_t), res) end subroutine -subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res) +subroutine gpu_ddot_f(handle, n, dx, incx, dy, incy, res) + use gpu + type(gpu_blas), intent(in) :: handle + integer*4 :: n, incx, incy + double precision, target :: dx(*), dy(*) + double precision, intent(out) :: res + call gpu_ddot_c(handle%c, int(n,c_int64_t), c_loc(dx), int(incx,c_int64_t), c_loc(dy), int(incy,c_int64_t), res) +end subroutine + + +subroutine gpu_ddot_64(handle, n, dx, incx, dy, incy, res) use gpu type(gpu_blas), intent(in) :: handle integer*8 :: n, incx, incy - real, target, intent(in) :: dx(*), dy(*) - real, intent(out) :: res - call gpu_sdot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res) + type(gpu_double1), intent(in) :: dx, dy + double precision, intent(out) :: res + call gpu_ddot_c(handle%c, n, dx%c, incx, dy%c, incy, res) +end subroutine + +subroutine gpu_ddot_f_64(handle, n, dx, incx, dy, incy, res) + use gpu + type(gpu_blas), intent(in) :: handle + integer*8 :: n, incx, incy + double precision, target :: dx(*), dy(*) + double precision, intent(out) :: res + call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res) end subroutine @@ -518,22 +623,103 @@ subroutine gpu_dgeam(handle, transa, transb, m, n, alpha, a, lda, beta, & use gpu type(gpu_blas), intent(in) :: handle character, intent(in) :: transa, transb - integer*8, intent(in) :: m, n, lda, ldb, ldc + integer*4, intent(in) :: m, n, lda, ldb, ldc double precision, intent(in) :: alpha, beta - double precision, target :: a(lda,*), b(ldb,*), c(ldc,*) - call gpu_dgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, & - c_loc(b), ldb, c_loc(c), ldc) + type(gpu_double2) :: a, b, c + call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, a%c, int(lda,c_int64_t), beta, & + b%c, int(ldb,c_int64_t), c%c, int(ldc,c_int64_t)) end subroutine -subroutine gpu_sgeam(handle, transa, transb, m, n, alpha, a, lda, beta, & + +subroutine gpu_dgeam_f(handle, transa, transb, m, n, alpha, a, lda, beta, & b, ldb, c, ldc) - use gpu + use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*4, intent(in) :: m, n, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision, target :: a(*), b(*), c(*) + call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, & + c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t)) +end subroutine + + +subroutine gpu_dgeam_64(handle, transa, transb, m, n, alpha, a, lda, beta, & + b, ldb, c, ldc) + use gpu type(gpu_blas), intent(in) :: handle character, intent(in) :: transa, transb integer*8, intent(in) :: m, n, lda, ldb, ldc - real, intent(in) :: alpha, beta - real, target :: a(lda,*), b(ldb,*), c(ldc,*) - call gpu_sgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, & - c_loc(b), ldb, c_loc(c), ldc) + double precision, intent(in) :: alpha, beta + type(gpu_double2) :: a, b, c + call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, a%c, int(lda,c_int64_t), beta, & + b%c, int(ldb,c_int64_t), c%c, int(ldc,c_int64_t)) +end subroutine + + +subroutine gpu_dgeam_f_64(handle, transa, transb, m, n, alpha, a, lda, beta, & + b, ldb, c, ldc) + use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*8, intent(in) :: m, n, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision, target :: a(*), b(*), c(*) + call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, & + c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t)) +end subroutine + + +! gemm +! ---- + +subroutine gpu_dgemm(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) + use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*4, intent(in) :: m, n, k, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + type(gpu_double2) :: a, b, c + call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), & + alpha, a%c, int(lda,c_int64_t), & + b%c, int(ldb,c_int64_t), beta, c%c, int(ldc,c_int64_t)) +end subroutine + +subroutine gpu_dgemm_64(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) + use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*8, intent(in) :: m, n, k, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + type(gpu_double2) :: a, b, c + call gpu_dgemm_c(handle%c, transa, transb, m, n, k, & + alpha, a%c, lda, b%c, ldb, beta, c%c, ldc) +end subroutine + +subroutine gpu_dgemm_f(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) + use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*4, intent(in) :: m, n, k, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision, target :: a(*), b(*), c(*) + call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), & + alpha, c_loc(a), int(lda,c_int64_t), & + c_loc(b), int(ldb,c_int64_t), beta, c_loc(c), int(ldc,c_int64_t)) +end subroutine + +subroutine gpu_dgemm_f_64(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) + use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*8, intent(in) :: m, n, k, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision, target :: a(*), b(*), c(*) + call gpu_dgemm_c(handle%c, transa, transb, m, n, k, & + alpha, c_loc(a), lda, c_loc(b), ldb, beta, c_loc(c), ldc) end subroutine From d3c1994c64ed9ae9914ce605a6b7c364ac518d9b Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Mon, 1 Jul 2024 18:04:48 +0200 Subject: [PATCH 12/38] H_vv --- plugins/local/gpu_nvidia/gpu.c | 16 +-- plugins/local/gpu_x86/gpu.c | 6 +- src/ccsd/ccsd_space_orb_sub.irp.f | 7 +- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 146 ++++++++++++++++--------- src/gpu/gpu.irp.f | 8 ++ 5 files changed, 114 insertions(+), 69 deletions(-) diff --git a/plugins/local/gpu_nvidia/gpu.c b/plugins/local/gpu_nvidia/gpu.c index 189de64c..39a82984 100644 --- a/plugins/local/gpu_nvidia/gpu.c +++ b/plugins/local/gpu_nvidia/gpu.c @@ -11,10 +11,6 @@ /* Generic functions */ -bool no_gpu() { - return false; -} - int gpu_ndevices() { int ngpus; cudaGetDeviceCount(&ngpus); @@ -35,13 +31,13 @@ void gpu_allocate(void** ptr, const int64_t size) { free = INT64_MAX; } - /* Use managed memory if it does not fit on the GPU */ - if (size < free && size < total/2) { + rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal); +// /* Use managed memory if it does not fit on the GPU */ +// if (size < free && size < total/2) { // rc= cudaMalloc(ptr, size); - rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal); - } else { - rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal); - } +// } else { +// rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal); +// } assert (rc == cudaSuccess); } diff --git a/plugins/local/gpu_x86/gpu.c b/plugins/local/gpu_x86/gpu.c index 53267a7c..dab23a25 100644 --- a/plugins/local/gpu_x86/gpu.c +++ b/plugins/local/gpu_x86/gpu.c @@ -5,14 +5,10 @@ #include #include -bool no_gpu() { - return true; -} - /* Generic functions */ int gpu_ndevices() { - return 1; + return 0; } void gpu_set_device(int32_t i) { diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 5ee7366e..0b3636ac 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -183,7 +183,8 @@ subroutine run_ccsd_space_orb if (do_mo_cholesky) then call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo) - call compute_H_vv_chol(nO,nV,tau_x,H_vv) + call compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, & + d_cc_space_v_ov_chol,H_vv) call compute_H_vo_chol(nO,nV,t1%f,H_vo%f) call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1) @@ -403,7 +404,7 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau) type(gpu_stream) :: stream(nV) - !$OMP PARALLEL if (no_gpu()) & + !$OMP PARALLEL if (gpu_num == 0) & !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) @@ -466,7 +467,7 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x) call gpu_stream_create(stream(a)) enddo - !$OMP PARALLEL if (no_gpu()) & + !$OMP PARALLEL if (gpu_num == 0) & !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index 288724f3..458016fb 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -344,48 +344,47 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & ! 1.d0, H_oo%f(1,1), nO) ! - type(gpu_stream) :: stream(nV) + type(gpu_blas) :: blas - do b=1,nV - call gpu_stream_create(stream(b)) - enddo - !$OMP PARALLEL if (no_gpu()) & + !$OMP PARALLEL & !$OMP DEFAULT(SHARED) & - !$OMP PRIVATE(u,b,tmp_vov,tmp_ovv) + !$OMP PRIVATE(blas,u,b,tmp_vov,tmp_ovv) + + !$OMP SINGLE + !$OMP TASK + call gpu_copy(d_cc_space_f_oo, H_oo) + !$OMP END TASK + !$OMP END SINGLE - call gpu_allocate(tmp_vov, nV, nO, nV) call gpu_allocate(tmp_ovv, nO, nV, nV) + call gpu_allocate(tmp_vov, nV, nO, nV) + + call gpu_blas_create(blas) !$OMP DO do u=1,nO - call gpu_dgeam_f(blas_handle, 'N', 'N', 1, nO*nV*nV, 1.d0, & + call gpu_dgeam_f(blas, 'N', 'N', 1, nO*nV*nV, 1.d0, & tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f, nO, tmp_ovv%f, 1) do b=1,nV - call gpu_set_stream(blas_handle,stream(b)) - call gpu_dgeam_f(blas_handle, 'T', 'T', nV, nO, 1.d0, & + call gpu_dgeam_f(blas, 'T', 'T', nV, nO, 1.d0, & tmp_ovv%f(1,1,b), nO, 0.d0, & tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV) enddo - call gpu_dgemm_f(blas_handle, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, & + call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, & d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_vov%f, nV, & 0.d0, tau_kau%f(1,1,u), cholesky_mo_num) - call gpu_synchronize() enddo !$OMP END DO + call gpu_blas_destroy(blas) + call gpu_deallocate(tmp_vov) call gpu_deallocate(tmp_ovv) + + !$OMP TASKWAIT !$OMP END PARALLEL - do b=1,nV - call gpu_stream_destroy(stream(b)) - enddo - - call gpu_set_stream(blas_handle,gpu_default_stream) - - call gpu_copy(d_cc_space_f_oo, H_oo) - call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & tau_kau, cholesky_mo_num*nV, d_cc_space_v_vo_chol, cholesky_mo_num*nV, & 1.d0, H_oo, nO) @@ -395,52 +394,97 @@ end ! H_vv -subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv) +subroutine compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, & + d_cc_space_v_ov_chol,H_vv) use gpu implicit none - integer, intent(in) :: nO,nV + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: d_cc_space_f_vv + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol type(gpu_double4), intent(in) :: tau_x type(gpu_double2), intent(out) :: H_vv integer :: a,b,i,j,u,k, beta - double precision, allocatable :: tau_kia(:,:,:), tmp_oov(:,:,:) + type(gpu_double3) :: tau_kia, tmp_oov - allocate(tau_kia(cholesky_mo_num,nO,nV)) - !$omp parallel & - !$omp default(shared) & - !$omp private(i,beta,j,k,a,b,tmp_oov) - allocate(tmp_oov(nO,nO,nV) ) - !$omp do + call gpu_allocate(tau_kia, cholesky_mo_num, nO, nV) + +! !$omp parallel & +! !$omp default(shared) & +! !$omp private(i,beta,j,k,a,b,tmp_oov) +! allocate(tmp_oov(nO,nO,nV) ) +! !$omp do +! do a = 1, nV +! do b=1,nV +! do j=1,nO +! do i=1,nO +! tmp_oov(i,j,b) = tau_x%f(i,j,a,b) +! enddo +! enddo +! enddo +! call dgemm('N','T',cholesky_mo_num,nO,nO*nV,1.d0, & +! d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov, nO, & +! 0.d0, tau_kia(1,1,a), cholesky_mo_num) +! enddo +! !$omp end do nowait +! deallocate(tmp_oov) + +! !$omp do +! do beta = 1, nV +! do a = 1, nV +! H_vv%f(a,beta) = cc_space_f_vv(a,beta) +! enddo +! enddo +! !$omp end do nowait +! !$omp barrier +! !$omp end parallel +! call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & +! tau_kia, cholesky_mo_num*nO, d_cc_space_v_ov_chol%f, cholesky_mo_num*nO, & +! 1.d0, H_vv%f, nV) + + type(gpu_blas) :: blas + + + PROVIDE gpu_num + !$OMP PARALLEL & + !$OMP DEFAULT(SHARED) & + !$OMP PRIVATE(a,b,tmp_oov,blas) + + !$OMP SINGLE + !$OMP TASK + call gpu_copy(d_cc_space_f_vv, H_vv) + !$OMP END TASK + !$OMP END SINGLE + + call gpu_blas_create(blas) + call gpu_allocate(tmp_oov, nO, nO, nV) + + !$OMP DO do a = 1, nV do b=1,nV - do j=1,nO - do i=1,nO - tmp_oov(i,j,b) = tau_x%f(i,j,a,b) - enddo - enddo + call gpu_dgeam_f(blas, 'N', 'N', nO, nO, 1.d0, & + tau_x%f(1,1,a,b), nO, 0.d0, & + tau_x%f(1,1,a,b), nO, tmp_oov%f(1,1,b), nO) enddo - call dgemm('N','T',cholesky_mo_num,nO,nO*nV,1.d0, & - cc_space_v_ov_chol, cholesky_mo_num, tmp_oov, nO, & - 0.d0, tau_kia(1,1,a), cholesky_mo_num) + call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nO,nO*nV,1.d0, & + d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov%f, nO, & + 0.d0, tau_kia%f(1,1,a), cholesky_mo_num) enddo - !$omp end do nowait - deallocate(tmp_oov) + !$OMP END DO - !$omp do - do beta = 1, nV - do a = 1, nV - H_vv%f(a,beta) = cc_space_f_vv(a,beta) - enddo - enddo - !$omp end do nowait - !$omp barrier - !$omp end parallel - call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & - tau_kia, cholesky_mo_num*nO, cc_space_v_ov_chol, cholesky_mo_num*nO, & - 1.d0, H_vv%f, nV) + call gpu_blas_destroy(blas) + call gpu_deallocate(tmp_oov) + !$OMP TASKWAIT + !$OMP END PARALLEL + + call gpu_dgemm(blas_handle,'T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & + tau_kia, cholesky_mo_num*nO, d_cc_space_v_ov_chol, cholesky_mo_num*nO, & + 1.d0, H_vv, nV) + + call gpu_deallocate(tau_kia) end ! H_vo diff --git a/src/gpu/gpu.irp.f b/src/gpu/gpu.irp.f index 6ad0a075..3b2feeb6 100644 --- a/src/gpu/gpu.irp.f +++ b/src/gpu/gpu.irp.f @@ -16,3 +16,11 @@ BEGIN_PROVIDER [ type(gpu_stream), gpu_default_stream ] gpu_default_stream%c = C_NULL_PTR END_PROVIDER +BEGIN_PROVIDER [ integer, gpu_num ] + implicit none + BEGIN_DOC + ! Number of usable GPUs + END_DOC + gpu_num = gpu_ndevices() +END_PROVIDER + From 44a7729f65a37cc3a7c35ae55f462bb1d61e411b Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Mon, 1 Jul 2024 19:00:27 +0200 Subject: [PATCH 13/38] H_ finished in CCSD --- src/ccsd/ccsd_space_orb_sub.irp.f | 108 ++---- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 482 +++++++++---------------- 2 files changed, 200 insertions(+), 390 deletions(-) diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 0b3636ac..13b974be 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -181,11 +181,9 @@ subroutine run_ccsd_space_orb ! Residue if (do_mo_cholesky) then - call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & - d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo) - call compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, & - d_cc_space_v_ov_chol,H_vv) - call compute_H_vo_chol(nO,nV,t1%f,H_vo%f) + call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo) + call compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, d_cc_space_v_ov_chol,H_vv) + call compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol, H_vo) call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1) call compute_r2_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r2%f,max_r2) @@ -316,51 +314,20 @@ subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1, integer :: i,j,a,b double precision :: e -! energy = 0d0 -! !$omp parallel & -! !$omp shared(nO,nV,energy,tau_x,t1,& -! !$omp d_cc_space_f_vo,d_cc_space_v_oovv) & -! !$omp private(i,j,a,b,e) & -! !$omp default(none) -! e = 0d0 -! !$omp do -! do a = 1, nV -! do i = 1, nO -! e = e + 2d0 * d_cc_space_f_vo%f(a,i) * t1%f(i,a) -! enddo -! enddo -! !$omp end do nowait -! !$omp do -! do b = 1, nV -! do a = 1, nV -! do j = 1, nO -! do i = 1, nO -! e = e + tau_x%f(i,j,a,b) * d_cc_space_v_oovv%f(i,j,a,b) -! enddo -! enddo -! enddo -! enddo -! !$omp end do nowait -! !$omp critical -! energy = energy + e -! !$omp end critical -! !$omp end parallel + type(gpu_stream) :: s1, s2 + call gpu_stream_create(s1) + call gpu_stream_create(s2) + call gpu_set_stream(blas_handle,s1) + call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo, 1, t1, 1, e) - type(gpu_stream) :: s1, s2 - call gpu_stream_create(s1) - call gpu_stream_create(s2) + call gpu_set_stream(blas_handle,s2) + call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1_8, d_cc_space_v_oovv, 1_8, energy) + call gpu_set_stream(blas_handle,gpu_default_stream) - call gpu_set_stream(blas_handle,s1) - call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo, 1, t1, 1, e) - - call gpu_set_stream(blas_handle,s2) - call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1_8, d_cc_space_v_oovv, 1_8, energy) - call gpu_synchronize() - call gpu_set_stream(blas_handle,gpu_default_stream) - - call gpu_stream_destroy(s1) - call gpu_stream_destroy(s2) + call gpu_synchronize() + call gpu_stream_destroy(s1) + call gpu_stream_destroy(s2) energy = energy + 2.d0*e @@ -384,27 +351,9 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau) ! internal integer :: i,j,a,b -! !$OMP PARALLEL & -! !$OMP SHARED(nO,nV,tau,t2,t1,h_t1) & -! !$OMP PRIVATE(i,j,a,b) & -! !$OMP DEFAULT(NONE) -! !$OMP DO -! do b = 1, nV -! do a = 1, nV -! do j = 1, nO -! do i = 1, nO -! tau%f(i,j,a,b) = t2%f(i,j,a,b) + t1%f(i,a) * h_t1(j,b) -! enddo -! enddo -! enddo -! enddo -! !$OMP END DO -! !$OMP END PARALLEL - - type(gpu_stream) :: stream(nV) - !$OMP PARALLEL if (gpu_num == 0) & + !$OMP PARALLEL & !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) @@ -422,6 +371,8 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau) !$OMP END DO !$OMP END PARALLEL + call gpu_synchronize() + do b=1,nV call gpu_stream_destroy(stream(b)) enddo @@ -444,32 +395,15 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x) ! internal integer :: i,j,a,b -! !$OMP PARALLEL & -! !$OMP SHARED(nO,nV,tau,tau_x) & -! !$OMP PRIVATE(i,j,a,b) & -! !$OMP DEFAULT(NONE) -! !$OMP DO -! do b = 1, nV -! do a = 1, nV -! do j = 1, nO -! do i = 1, nO -! tau_x%f(i,j,a,b) = 2.d0*tau%f(i,j,a,b) - tau%f(i,j,b,a) -! enddo -! enddo -! enddo -! enddo -! !$OMP END DO -! !$OMP END PARALLEL - type(gpu_stream) :: stream(nV) do a=1,nV call gpu_stream_create(stream(a)) enddo - !$OMP PARALLEL if (gpu_num == 0) & + !$OMP PARALLEL & !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) & - !$OMP PRIVATE(i,j,a,b) & + !$OMP PRIVATE(a,b) & !$OMP DEFAULT(NONE) !$OMP DO do b=1,nV @@ -484,10 +418,12 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x) !$OMP END DO !$OMP END PARALLEL + call gpu_set_stream(blas_handle,gpu_default_stream) + call gpu_synchronize() + do b=1,nV call gpu_stream_destroy(stream(b)) enddo - call gpu_set_stream(blas_handle,gpu_default_stream) end diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index 458016fb..5eb95a06 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -1,81 +1,200 @@ -subroutine ccsd_energy_space_chol(nO,nV,tau,t1,energy) +! H_oo +subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & + d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo) + use gpu implicit none - integer, intent(in) :: nO, nV - double precision, intent(in) :: tau(nO,nO,nV,nV) - double precision, intent(in) :: t1(nO,nV) - double precision, intent(out) :: energy + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: d_cc_space_f_oo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol + type(gpu_double4), intent(in) :: tau_x + type(gpu_double2), intent(out) :: H_oo - ! internal - integer :: i,j,a,b - double precision :: e + integer :: a,b,i,j,u,k - energy = 0d0 - !$omp parallel & - !$omp shared(nO,nV,energy,tau,t1,& - !$omp cc_space_f_vo,cc_space_w_oovv) & - !$omp private(i,j,a,b,e) & - !$omp default(none) - e = 0d0 - !$omp do - do a = 1, nV - do i = 1, nO - e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a) - enddo - enddo - !$omp end do nowait - !$omp do - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - e = e + tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b) - enddo - enddo - enddo - enddo - !$omp end do nowait - !$omp critical - energy = energy + e - !$omp end critical - !$omp end parallel + type(gpu_double3) :: tau_kau, tmp_vov, tmp_ovv -end + call gpu_allocate(tau_kau, cholesky_mo_num, nV, nO) -! Tau + type(gpu_blas) :: blas -subroutine update_tau_space_chol(nO,nV,t1,t2,tau) - implicit none + !$OMP PARALLEL & + !$OMP DEFAULT(SHARED) & + !$OMP PRIVATE(blas,u,b,tmp_vov,tmp_ovv) - ! in - integer, intent(in) :: nO, nV - double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV) + !$OMP SINGLE + !$OMP TASK + call gpu_copy(d_cc_space_f_oo, H_oo) + !$OMP END TASK + !$OMP END SINGLE - ! out - double precision, intent(out) :: tau(nO,nO,nV,nV) + call gpu_allocate(tmp_ovv, nO, nV, nV) + call gpu_allocate(tmp_vov, nV, nO, nV) - ! internal - integer :: i,j,a,b + call gpu_blas_create(blas) - !$OMP PARALLEL & - !$OMP SHARED(nO,nV,tau,t2,t1) & - !$OMP PRIVATE(i,j,a,b) & - !$OMP DEFAULT(NONE) !$OMP DO - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b) - enddo - enddo + do u=1,nO + call gpu_dgeam_f(blas, 'N', 'N', 1, nO*nV*nV, 1.d0, & + tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f, nO, tmp_ovv%f, 1) + do b=1,nV + call gpu_dgeam_f(blas, 'T', 'T', nV, nO, 1.d0, & + tmp_ovv%f(1,1,b), nO, 0.d0, & + tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV) enddo + call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, & + d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_vov%f, nV, & + 0.d0, tau_kau%f(1,1,u), cholesky_mo_num) enddo !$OMP END DO + + call gpu_blas_destroy(blas) + + call gpu_deallocate(tmp_vov) + call gpu_deallocate(tmp_ovv) + + !$OMP TASKWAIT !$OMP END PARALLEL + call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & + tau_kau, cholesky_mo_num*nV, d_cc_space_v_vo_chol, cholesky_mo_num*nV, & + 1.d0, H_oo, nO) + + call gpu_synchronize() + call gpu_deallocate(tau_kau) +end + +! H_vv + +subroutine compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, & + d_cc_space_v_ov_chol,H_vv) + use gpu + implicit none + + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: d_cc_space_f_vv + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol + type(gpu_double4), intent(in) :: tau_x + type(gpu_double2), intent(out) :: H_vv + + integer :: a,b,i,j,u,k, beta + + type(gpu_double3) :: tau_kia, tmp_oov + + call gpu_allocate(tau_kia, cholesky_mo_num, nO, nV) + + type(gpu_blas) :: blas + + !$OMP PARALLEL & + !$OMP DEFAULT(SHARED) & + !$OMP PRIVATE(a,b,tmp_oov,blas) + + !$OMP SINGLE + !$OMP TASK + call gpu_copy(d_cc_space_f_vv, H_vv) + !$OMP END TASK + !$OMP END SINGLE + + call gpu_blas_create(blas) + call gpu_allocate(tmp_oov, nO, nO, nV) + + !$OMP DO + do a = 1, nV + do b=1,nV + call gpu_dgeam_f(blas, 'N', 'N', nO, nO, 1.d0, & + tau_x%f(1,1,a,b), nO, 0.d0, & + tau_x%f(1,1,a,b), nO, tmp_oov%f(1,1,b), nO) + enddo + call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nO,nO*nV,1.d0, & + d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov%f, nO, & + 0.d0, tau_kia%f(1,1,a), cholesky_mo_num) + enddo + !$OMP END DO + + call gpu_blas_destroy(blas) + + call gpu_deallocate(tmp_oov) + !$OMP TASKWAIT + !$OMP END PARALLEL + + call gpu_dgemm(blas_handle,'T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & + tau_kia, cholesky_mo_num*nO, d_cc_space_v_ov_chol, cholesky_mo_num*nO, & + 1.d0, H_vv, nV) + + call gpu_synchronize() + call gpu_deallocate(tau_kia) +end + +! H_vo +subroutine compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, & + d_cc_space_v_ov_chol,d_cc_space_v_vo_chol, H_vo) + use gpu + implicit none + + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: t1, d_cc_space_f_vo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol + type(gpu_double2), intent(out) :: H_vo + + integer :: a,b,i,j,u,k + + type(gpu_double1) :: tmp_k + type(gpu_double3) :: tmp, tmp2 + + call gpu_copy(d_cc_space_f_vo, H_vo) + + call gpu_allocate(tmp_k, cholesky_mo_num) + + call gpu_dgemm(blas_handle, 'N', 'N', cholesky_mo_num, 1, nO*nV, 2.d0, & + d_cc_space_v_ov_chol, cholesky_mo_num, & + t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num) + + call gpu_dgemm(blas_handle, 'T','N',nV*nO,1,cholesky_mo_num,1.d0, & + d_cc_space_v_vo_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, & + H_vo, nV*nO) + + call gpu_deallocate(tmp_k) + + + call gpu_allocate(tmp, cholesky_mo_num, nO, nO) + + call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nO, nO, nV, 1.d0, & + d_cc_space_v_ov_chol, cholesky_mo_num*nO, t1, nO, 0.d0, tmp, cholesky_mo_num*nO) + + call gpu_allocate(tmp2, cholesky_mo_num, nO, nO) + + type(gpu_stream) :: stream(nO) + do i=1,nO + call gpu_stream_create(stream(i)) + enddo + + !$OMP PARALLEL DO COLLAPSE(2) PRIVATE(i,j) + do i=1,nO + do j=1,nO + call gpu_set_stream(blas_handle,stream(j)) + call gpu_dgeam_f(blas_handle, 'N', 'N', cholesky_mo_num, 1, 1.d0, & + tmp%f(1,i,j), cholesky_mo_num, 0.d0, & + tmp%f(1,i,j), cholesky_mo_num, tmp2%f(1,j,i), cholesky_mo_num) + enddo + enddo + !$OMP END PARALLEL DO + + call gpu_set_stream(blas_handle,gpu_default_stream) + call gpu_synchronize() + + do i=1,nO + call gpu_stream_destroy(stream(i)) + enddo + call gpu_deallocate(tmp) + + call gpu_dgemm(blas_handle, 'T','N', nV, nO, cholesky_mo_num*nO, -1.d0, & + d_cc_space_v_ov_chol, cholesky_mo_num*nO, tmp2, cholesky_mo_num*nO, & + 1.d0, H_vo, nV) + + call gpu_synchronize() + call gpu_deallocate(tmp2) end ! R1 @@ -291,251 +410,6 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) end -! H_oo - -subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & - d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo) - use gpu - implicit none - - integer, intent(in) :: nO,nV - type(gpu_double2), intent(in) :: d_cc_space_f_oo - type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol - type(gpu_double4), intent(in) :: tau_x - type(gpu_double2), intent(out) :: H_oo - - integer :: a,b,i,j,u,k - - type(gpu_double3) :: tau_kau, tmp_vov, tmp_ovv - - call gpu_allocate(tau_kau, cholesky_mo_num, nV, nO) - -! !$omp parallel & -! !$omp default(shared) & -! !$omp private(i,u,j,k,a,b,tmp_vov) -! call gpu_allocate(tmp_vov, nV, nO, nV) -! !$omp do -! do u = 1, nO -! do b=1,nV -! do j=1,nO -! do a=1,nV -! tmp_vov%f(a,j,b) = tau_x%f(u,j,a,b) -! enddo -! enddo -! enddo -! call dgemm('N','T',cholesky_mo_num,nV,nO*nV,1.d0, & -! d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, tmp_vov%f, nV, & -! 0.d0, tau_kau%f(1,1,u), cholesky_mo_num) -! enddo -! !$omp end do nowait -! call gpu_deallocate(tmp_vov) -! !$omp do -! do i = 1, nO -! do u = 1, nO -! H_oo%f(u,i) = d_cc_space_f_oo%f(u,i) -! enddo -! enddo -! !$omp end do nowait -! -! !$omp barrier -! !$omp end parallel -! call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & -! tau_kau%f(1,1,1), cholesky_mo_num*nV, d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num*nV, & -! 1.d0, H_oo%f(1,1), nO) -! - - type(gpu_blas) :: blas - - - !$OMP PARALLEL & - !$OMP DEFAULT(SHARED) & - !$OMP PRIVATE(blas,u,b,tmp_vov,tmp_ovv) - - !$OMP SINGLE - !$OMP TASK - call gpu_copy(d_cc_space_f_oo, H_oo) - !$OMP END TASK - !$OMP END SINGLE - - call gpu_allocate(tmp_ovv, nO, nV, nV) - call gpu_allocate(tmp_vov, nV, nO, nV) - - call gpu_blas_create(blas) - - !$OMP DO - do u=1,nO - call gpu_dgeam_f(blas, 'N', 'N', 1, nO*nV*nV, 1.d0, & - tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f, nO, tmp_ovv%f, 1) - do b=1,nV - call gpu_dgeam_f(blas, 'T', 'T', nV, nO, 1.d0, & - tmp_ovv%f(1,1,b), nO, 0.d0, & - tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV) - enddo - call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, & - d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_vov%f, nV, & - 0.d0, tau_kau%f(1,1,u), cholesky_mo_num) - enddo - !$OMP END DO - - call gpu_blas_destroy(blas) - - call gpu_deallocate(tmp_vov) - call gpu_deallocate(tmp_ovv) - - !$OMP TASKWAIT - !$OMP END PARALLEL - - call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & - tau_kau, cholesky_mo_num*nV, d_cc_space_v_vo_chol, cholesky_mo_num*nV, & - 1.d0, H_oo, nO) - - call gpu_deallocate(tau_kau) -end - -! H_vv - -subroutine compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, & - d_cc_space_v_ov_chol,H_vv) - use gpu - implicit none - - integer, intent(in) :: nO,nV - type(gpu_double2), intent(in) :: d_cc_space_f_vv - type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol - type(gpu_double4), intent(in) :: tau_x - type(gpu_double2), intent(out) :: H_vv - - integer :: a,b,i,j,u,k, beta - - type(gpu_double3) :: tau_kia, tmp_oov - - call gpu_allocate(tau_kia, cholesky_mo_num, nO, nV) - -! !$omp parallel & -! !$omp default(shared) & -! !$omp private(i,beta,j,k,a,b,tmp_oov) -! allocate(tmp_oov(nO,nO,nV) ) -! !$omp do -! do a = 1, nV -! do b=1,nV -! do j=1,nO -! do i=1,nO -! tmp_oov(i,j,b) = tau_x%f(i,j,a,b) -! enddo -! enddo -! enddo -! call dgemm('N','T',cholesky_mo_num,nO,nO*nV,1.d0, & -! d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov, nO, & -! 0.d0, tau_kia(1,1,a), cholesky_mo_num) -! enddo -! !$omp end do nowait -! deallocate(tmp_oov) - -! !$omp do -! do beta = 1, nV -! do a = 1, nV -! H_vv%f(a,beta) = cc_space_f_vv(a,beta) -! enddo -! enddo -! !$omp end do nowait -! !$omp barrier -! !$omp end parallel -! call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & -! tau_kia, cholesky_mo_num*nO, d_cc_space_v_ov_chol%f, cholesky_mo_num*nO, & -! 1.d0, H_vv%f, nV) - - type(gpu_blas) :: blas - - - PROVIDE gpu_num - !$OMP PARALLEL & - !$OMP DEFAULT(SHARED) & - !$OMP PRIVATE(a,b,tmp_oov,blas) - - !$OMP SINGLE - !$OMP TASK - call gpu_copy(d_cc_space_f_vv, H_vv) - !$OMP END TASK - !$OMP END SINGLE - - call gpu_blas_create(blas) - call gpu_allocate(tmp_oov, nO, nO, nV) - - !$OMP DO - do a = 1, nV - do b=1,nV - call gpu_dgeam_f(blas, 'N', 'N', nO, nO, 1.d0, & - tau_x%f(1,1,a,b), nO, 0.d0, & - tau_x%f(1,1,a,b), nO, tmp_oov%f(1,1,b), nO) - enddo - call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nO,nO*nV,1.d0, & - d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov%f, nO, & - 0.d0, tau_kia%f(1,1,a), cholesky_mo_num) - enddo - !$OMP END DO - - call gpu_blas_destroy(blas) - - call gpu_deallocate(tmp_oov) - !$OMP TASKWAIT - !$OMP END PARALLEL - - call gpu_dgemm(blas_handle,'T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & - tau_kia, cholesky_mo_num*nO, d_cc_space_v_ov_chol, cholesky_mo_num*nO, & - 1.d0, H_vv, nV) - - call gpu_deallocate(tau_kia) -end - -! H_vo -subroutine compute_H_vo_chol(nO,nV,t1,H_vo) - - implicit none - - integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV) - double precision, intent(out) :: H_vo(nV, nO) - - integer :: a,b,i,j,u,k - - double precision, allocatable :: tmp_k(:), tmp(:,:,:), tmp2(:,:,:) - do i=1,nO - do a=1,nV - H_vo(a,i) = cc_space_f_vo(a,i) - enddo - enddo - - allocate(tmp_k(cholesky_mo_num)) - call dgemm('N', 'N', cholesky_mo_num, 1, nO*nV, 2.d0, & - cc_space_v_ov_chol, cholesky_mo_num, & - t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num) - - call dgemm('T','N',nV*nO,1,cholesky_mo_num,1.d0, & - cc_space_v_vo_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, & - H_vo, nV*nO) - deallocate(tmp_k) - - allocate(tmp(cholesky_mo_num,nO,nO)) - allocate(tmp2(cholesky_mo_num,nO,nO)) - - call dgemm('N','T', cholesky_mo_num*nO, nO, nV, 1.d0, & - cc_space_v_ov_chol, cholesky_mo_num*nO, t1, nO, 0.d0, tmp, cholesky_mo_num*nO) - - do i=1,nO - do j=1,nO - do k=1,cholesky_mo_num - tmp2(k,j,i) = tmp(k,i,j) - enddo - enddo - enddo - deallocate(tmp) - - call dgemm('T','N', nV, nO, cholesky_mo_num*nO, -1.d0, & - cc_space_v_ov_chol, cholesky_mo_num*nO, tmp2, cholesky_mo_num*nO, & - 1.d0, H_vo, nV) - -end - ! R2 From 2bead959d0eee7790162df656e3781e4dcdedb7d Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 2 Jul 2024 13:58:19 +0200 Subject: [PATCH 14/38] Fxied GPU interface for gfortran --- plugins/local/gpu_x86/gpu.c | 90 ++++++++++---------- src/ccsd/ccsd_space_orb_sub.irp.f | 10 +-- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 46 +++++------ src/gpu/gpu_module.F90 | 109 +++++-------------------- 4 files changed, 94 insertions(+), 161 deletions(-) diff --git a/plugins/local/gpu_x86/gpu.c b/plugins/local/gpu_x86/gpu.c index dab23a25..fe3cadc5 100644 --- a/plugins/local/gpu_x86/gpu.c +++ b/plugins/local/gpu_x86/gpu.c @@ -49,10 +49,11 @@ void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) { /* Streams */ void gpu_stream_create(void** ptr) { - *ptr = (void*) 2; + *ptr = (void*) malloc(sizeof(char)); } void gpu_stream_destroy(void** ptr) { + free(*ptr); *ptr = NULL; } @@ -68,11 +69,12 @@ void gpu_synchronize() { /* BLAS functions */ void gpu_blas_create(void** handle) { - *handle = (void*) 1; + *handle = (void*) malloc(sizeof(char)); } void gpu_blas_destroy(void** handle) { + free(*handle); *handle = NULL; } @@ -122,7 +124,7 @@ void gpu_sdot(void* handle, const int64_t n, const float* x, const int64_t incx, void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha, const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy); -void gpu_dgemv(void* handle, const char transa, const int64_t m, const int64_t n, const double alpha, +void gpu_dgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const double alpha, const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) { assert (handle != NULL); @@ -143,14 +145,14 @@ void gpu_dgemv(void* handle, const char transa, const int64_t m, const int64_t n assert ( (int64_t) incx_ == incx); assert ( (int64_t) incy_ == incy); - dgemv_(&transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_); + dgemv_(transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_); } void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha, const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy); -void gpu_sgemv(void* handle, const char transa, const int64_t m, const int64_t n, const float alpha, +void gpu_sgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const float alpha, const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) { assert (handle != NULL); @@ -171,14 +173,14 @@ void gpu_sgemv(void* handle, const char transa, const int64_t m, const int64_t n assert ( (int64_t) incx_ == incx); assert ( (int64_t) incy_ == incy); - sgemv_(&transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_); + sgemv_(transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_); } void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha, const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc); -void gpu_dgemm(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, +void gpu_dgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) { assert (handle != NULL); @@ -201,7 +203,7 @@ void gpu_dgemm(void* handle, const char transa, const char transb, const int64_t assert ( (int64_t) ldb_ == ldb); assert ( (int64_t) ldc_ == ldc); - dgemm_(&transa, &transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_); + dgemm_(transa, transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_); } @@ -209,7 +211,7 @@ void gpu_dgemm(void* handle, const char transa, const char transb, const int64_t void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha, const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc); -void gpu_sgemm(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, +void gpu_sgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) { assert (handle != NULL); @@ -232,18 +234,18 @@ void gpu_sgemm(void* handle, const char transa, const char transb, const int64_t assert ( (int64_t) ldb_ == ldb); assert ( (int64_t) ldc_ == ldc); - sgemm_(&transa, &transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_); + sgemm_(transa, transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_); } -void gpu_dgeam(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, +void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double alpha, const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { assert (handle != NULL); - if ( (transa == 'N' && transb == 'N') || - (transa == 'n' && transb == 'N') || - (transa == 'N' && transb == 'n') || - (transa == 'n' && transb == 'n') ) { + if ( (*transa == 'N' && *transb == 'N') || + (*transa == 'n' && *transb == 'N') || + (*transa == 'N' && *transb == 'n') || + (*transa == 'n' && *transb == 'n') ) { if (alpha == 0.) { @@ -271,10 +273,10 @@ void gpu_dgeam(void* handle, const char transa, const char transb, const int64_t } - } else if ( (transa == 'N' && transb == 'T') || - (transa == 'n' && transb == 'T') || - (transa == 'N' && transb == 't') || - (transa == 'n' && transb == 't') ) { + } else if ( (*transa == 'N' && *transb == 'T') || + (*transa == 'n' && *transb == 'T') || + (*transa == 'N' && *transb == 't') || + (*transa == 'n' && *transb == 't') ) { if (alpha == 0.) { @@ -302,10 +304,10 @@ void gpu_dgeam(void* handle, const char transa, const char transb, const int64_t } - } else if ( (transa == 'T' && transb == 'N') || - (transa == 't' && transb == 'N') || - (transa == 'T' && transb == 'n') || - (transa == 't' && transb == 'n') ) { + } else if ( (*transa == 'T' && *transb == 'N') || + (*transa == 't' && *transb == 'N') || + (*transa == 'T' && *transb == 'n') || + (*transa == 't' && *transb == 'n') ) { if (alpha == 0.) { @@ -333,10 +335,10 @@ void gpu_dgeam(void* handle, const char transa, const char transb, const int64_t } - } else if ( (transa == 'T' && transb == 'T') || - (transa == 't' && transb == 'T') || - (transa == 'T' && transb == 't') || - (transa == 't' && transb == 't') ) { + } else if ( (*transa == 'T' && *transb == 'T') || + (*transa == 't' && *transb == 'T') || + (*transa == 'T' && *transb == 't') || + (*transa == 't' && *transb == 't') ) { if (alpha == 0.) { @@ -368,14 +370,14 @@ void gpu_dgeam(void* handle, const char transa, const char transb, const int64_t } -void gpu_sgeam(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, +void gpu_sgeam(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float alpha, const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) { assert (handle != NULL); - if ( (transa == 'N' && transb == 'N') || - (transa == 'n' && transb == 'N') || - (transa == 'N' && transb == 'n') || - (transa == 'n' && transb == 'n') ) { + if ( (*transa == 'N' && *transb == 'N') || + (*transa == 'n' && *transb == 'N') || + (*transa == 'N' && *transb == 'n') || + (*transa == 'n' && *transb == 'n') ) { if (alpha == 0.) { @@ -403,10 +405,10 @@ void gpu_sgeam(void* handle, const char transa, const char transb, const int64_t } - } else if ( (transa == 'N' && transb == 'T') || - (transa == 'n' && transb == 'T') || - (transa == 'N' && transb == 't') || - (transa == 'n' && transb == 't') ) { + } else if ( (*transa == 'N' && *transb == 'T') || + (*transa == 'n' && *transb == 'T') || + (*transa == 'N' && *transb == 't') || + (*transa == 'n' && *transb == 't') ) { if (alpha == 0.) { @@ -434,10 +436,10 @@ void gpu_sgeam(void* handle, const char transa, const char transb, const int64_t } - } else if ( (transa == 'T' && transb == 'N') || - (transa == 't' && transb == 'N') || - (transa == 'T' && transb == 'n') || - (transa == 't' && transb == 'n') ) { + } else if ( (*transa == 'T' && *transb == 'N') || + (*transa == 't' && *transb == 'N') || + (*transa == 'T' && *transb == 'n') || + (*transa == 't' && *transb == 'n') ) { if (alpha == 0.) { @@ -465,10 +467,10 @@ void gpu_sgeam(void* handle, const char transa, const char transb, const int64_t } - } else if ( (transa == 'T' && transb == 'T') || - (transa == 't' && transb == 'T') || - (transa == 'T' && transb == 't') || - (transa == 't' && transb == 't') ) { + } else if ( (*transa == 'T' && *transb == 'T') || + (*transa == 't' && *transb == 'T') || + (*transa == 'T' && *transb == 't') || + (*transa == 't' && *transb == 't') ) { if (alpha == 0.) { diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 13b974be..de109cea 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -319,10 +319,10 @@ subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1, call gpu_stream_create(s2) call gpu_set_stream(blas_handle,s1) - call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo, 1, t1, 1, e) + call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo%f(1,1), 1, t1%f(1,1), 1, e) call gpu_set_stream(blas_handle,s2) - call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1_8, d_cc_space_v_oovv, 1_8, energy) + call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x%f(1,1,1,1), 1_8, d_cc_space_v_oovv%f(1,1,1,1), 1_8, energy) call gpu_set_stream(blas_handle,gpu_default_stream) call gpu_synchronize() @@ -362,9 +362,9 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau) call gpu_stream_create(stream(b)) call gpu_set_stream(blas_handle,stream(b)) do j=1,nO - call gpu_dgeam_f(blas_handle, 'N', 'N', nO, nV, & + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, & 1.d0, t2%f(1,j,1,b), nO*nO, & - h_t1(j,b), t1%f, nO, & + h_t1(j,b), t1%f(1,1), nO, & tau%f(1,j,1,b), nO*nO) enddo enddo @@ -409,7 +409,7 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x) do b=1,nV do a=1,nV call gpu_set_stream(blas_handle,stream(a)) - call gpu_dgeam_f(blas_handle, 'N', 'N', nO, nO, & + call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, & 2.d0, tau%f(1,1,a,b), nO, & -1.d0, tau%f(1,1,b,a), nO, & tau_x%f(1,1,a,b), nO) diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index 5eb95a06..a3490589 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -37,15 +37,15 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & !$OMP DO do u=1,nO - call gpu_dgeam_f(blas, 'N', 'N', 1, nO*nV*nV, 1.d0, & - tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f, nO, tmp_ovv%f, 1) + call gpu_dgeam(blas, 'N', 'N', 1, nO*nV*nV, 1.d0, & + tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f(1,1,1,1), nO, tmp_ovv%f(1,1,1), 1) do b=1,nV - call gpu_dgeam_f(blas, 'T', 'T', nV, nO, 1.d0, & + call gpu_dgeam(blas, 'T', 'T', nV, nO, 1.d0, & tmp_ovv%f(1,1,b), nO, 0.d0, & tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV) enddo - call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, & - d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_vov%f, nV, & + call gpu_dgemm(blas, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, tmp_vov%f(1,1,1), nV, & 0.d0, tau_kau%f(1,1,u), cholesky_mo_num) enddo !$OMP END DO @@ -59,8 +59,8 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & !$OMP END PARALLEL call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & - tau_kau, cholesky_mo_num*nV, d_cc_space_v_vo_chol, cholesky_mo_num*nV, & - 1.d0, H_oo, nO) + tau_kau%f(1,1,1), cholesky_mo_num*nV, d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num*nV, & + 1.d0, H_oo%f(1,1), nO) call gpu_synchronize() call gpu_deallocate(tau_kau) @@ -103,12 +103,12 @@ subroutine compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, & !$OMP DO do a = 1, nV do b=1,nV - call gpu_dgeam_f(blas, 'N', 'N', nO, nO, 1.d0, & + call gpu_dgeam(blas, 'N', 'N', nO, nO, 1.d0, & tau_x%f(1,1,a,b), nO, 0.d0, & tau_x%f(1,1,a,b), nO, tmp_oov%f(1,1,b), nO) enddo - call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nO,nO*nV,1.d0, & - d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov%f, nO, & + call gpu_dgemm(blas, 'N', 'T', cholesky_mo_num, nO, nO*nV, 1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, tmp_oov%f(1,1,1), nO, & 0.d0, tau_kia%f(1,1,a), cholesky_mo_num) enddo !$OMP END DO @@ -119,9 +119,9 @@ subroutine compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, & !$OMP TASKWAIT !$OMP END PARALLEL - call gpu_dgemm(blas_handle,'T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & - tau_kia, cholesky_mo_num*nO, d_cc_space_v_ov_chol, cholesky_mo_num*nO, & - 1.d0, H_vv, nV) + call gpu_dgemm(blas_handle, 'T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & + tau_kia%f(1,1,1), cholesky_mo_num*nO, d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, & + 1.d0, H_vv%f(1,1), nV) call gpu_synchronize() call gpu_deallocate(tau_kia) @@ -148,20 +148,20 @@ subroutine compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, & call gpu_allocate(tmp_k, cholesky_mo_num) call gpu_dgemm(blas_handle, 'N', 'N', cholesky_mo_num, 1, nO*nV, 2.d0, & - d_cc_space_v_ov_chol, cholesky_mo_num, & - t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num) + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, & + t1%f(1,1), nO*nV, 0.d0, tmp_k%f(1), cholesky_mo_num) - call gpu_dgemm(blas_handle, 'T','N',nV*nO,1,cholesky_mo_num,1.d0, & - d_cc_space_v_vo_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, & - H_vo, nV*nO) + call gpu_dgemm(blas_handle, 'T', 'N', nV*nO, 1, cholesky_mo_num, 1.d0, & + d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num, tmp_k%f(1), cholesky_mo_num, 1.d0, & + H_vo%f(1,1), nV*nO) call gpu_deallocate(tmp_k) call gpu_allocate(tmp, cholesky_mo_num, nO, nO) - call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nO, nO, nV, 1.d0, & - d_cc_space_v_ov_chol, cholesky_mo_num*nO, t1, nO, 0.d0, tmp, cholesky_mo_num*nO) + call gpu_dgemm(blas_handle, 'N', 'T', cholesky_mo_num*nO, nO, nV, 1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, t1%f(1,1), nO, 0.d0, tmp%f(1,1,1), cholesky_mo_num*nO) call gpu_allocate(tmp2, cholesky_mo_num, nO, nO) @@ -174,7 +174,7 @@ subroutine compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, & do i=1,nO do j=1,nO call gpu_set_stream(blas_handle,stream(j)) - call gpu_dgeam_f(blas_handle, 'N', 'N', cholesky_mo_num, 1, 1.d0, & + call gpu_dgeam(blas_handle, 'N', 'N', cholesky_mo_num, 1, 1.d0, & tmp%f(1,i,j), cholesky_mo_num, 0.d0, & tmp%f(1,i,j), cholesky_mo_num, tmp2%f(1,j,i), cholesky_mo_num) enddo @@ -190,8 +190,8 @@ subroutine compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, & call gpu_deallocate(tmp) call gpu_dgemm(blas_handle, 'T','N', nV, nO, cholesky_mo_num*nO, -1.d0, & - d_cc_space_v_ov_chol, cholesky_mo_num*nO, tmp2, cholesky_mo_num*nO, & - 1.d0, H_vo, nV) + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, tmp2%f(1,1,1), cholesky_mo_num*nO, & + 1.d0, H_vo%f(1,1), nV) call gpu_synchronize() call gpu_deallocate(tmp2) diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90 index 2676b339..20d99ede 100644 --- a/src/gpu/gpu_module.F90 +++ b/src/gpu/gpu_module.F90 @@ -156,17 +156,17 @@ module gpu character(c_char), intent(in), value :: transa, transb integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc real(c_float), intent(in), value :: alpha, beta - type(c_ptr), value :: a, b, c + real(c_float) :: a, b, c end subroutine subroutine gpu_dgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, & b, ldb, beta, c, ldc) bind(C, name='gpu_dgemm') import type(c_ptr), value, intent(in) :: handle - character(c_char), intent(in), value :: transa, transb + character(c_char), intent(in) :: transa, transb integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc real(c_double), intent(in), value :: alpha, beta - type(c_ptr), value :: a, b, c + real(c_double) :: a, b, c end subroutine subroutine gpu_sgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, & @@ -176,7 +176,7 @@ module gpu character(c_char), intent(in), value :: transa, transb integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc real(c_float), intent(in), value :: alpha, beta - type(c_ptr), value :: a, b, c + real(c_float) :: a, b, c end subroutine end interface @@ -570,7 +570,6 @@ module gpu end subroutine -end module @@ -578,38 +577,20 @@ end module ! --- subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) - use gpu +! use gpu type(gpu_blas), intent(in) :: handle integer*4 :: n, incx, incy - type(gpu_double1), intent(in) :: dx, dy - double precision, intent(out) :: res - call gpu_ddot_c(handle%c, int(n,c_int64_t), dx%c, int(incx,c_int64_t), dy%c, int(incy,c_int64_t), res) -end subroutine - -subroutine gpu_ddot_f(handle, n, dx, incx, dy, incy, res) - use gpu - type(gpu_blas), intent(in) :: handle - integer*4 :: n, incx, incy - double precision, target :: dx(*), dy(*) + double precision, target :: dx, dy double precision, intent(out) :: res call gpu_ddot_c(handle%c, int(n,c_int64_t), c_loc(dx), int(incx,c_int64_t), c_loc(dy), int(incy,c_int64_t), res) end subroutine subroutine gpu_ddot_64(handle, n, dx, incx, dy, incy, res) - use gpu +! use gpu type(gpu_blas), intent(in) :: handle integer*8 :: n, incx, incy - type(gpu_double1), intent(in) :: dx, dy - double precision, intent(out) :: res - call gpu_ddot_c(handle%c, n, dx%c, incx, dy%c, incy, res) -end subroutine - -subroutine gpu_ddot_f_64(handle, n, dx, incx, dy, incy, res) - use gpu - type(gpu_blas), intent(in) :: handle - integer*8 :: n, incx, incy - double precision, target :: dx(*), dy(*) + double precision, target :: dx, dy double precision, intent(out) :: res call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res) end subroutine @@ -620,25 +601,12 @@ end subroutine subroutine gpu_dgeam(handle, transa, transb, m, n, alpha, a, lda, beta, & b, ldb, c, ldc) - use gpu +! use gpu type(gpu_blas), intent(in) :: handle character, intent(in) :: transa, transb integer*4, intent(in) :: m, n, lda, ldb, ldc double precision, intent(in) :: alpha, beta - type(gpu_double2) :: a, b, c - call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, a%c, int(lda,c_int64_t), beta, & - b%c, int(ldb,c_int64_t), c%c, int(ldc,c_int64_t)) -end subroutine - - -subroutine gpu_dgeam_f(handle, transa, transb, m, n, alpha, a, lda, beta, & - b, ldb, c, ldc) - use gpu - type(gpu_blas), intent(in) :: handle - character, intent(in) :: transa, transb - integer*4, intent(in) :: m, n, lda, ldb, ldc - double precision, intent(in) :: alpha, beta - double precision, target :: a(*), b(*), c(*) + double precision, target :: a, b, c call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, & c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t)) end subroutine @@ -646,25 +614,12 @@ end subroutine subroutine gpu_dgeam_64(handle, transa, transb, m, n, alpha, a, lda, beta, & b, ldb, c, ldc) - use gpu +! use gpu type(gpu_blas), intent(in) :: handle character, intent(in) :: transa, transb integer*8, intent(in) :: m, n, lda, ldb, ldc double precision, intent(in) :: alpha, beta - type(gpu_double2) :: a, b, c - call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, a%c, int(lda,c_int64_t), beta, & - b%c, int(ldb,c_int64_t), c%c, int(ldc,c_int64_t)) -end subroutine - - -subroutine gpu_dgeam_f_64(handle, transa, transb, m, n, alpha, a, lda, beta, & - b, ldb, c, ldc) - use gpu - type(gpu_blas), intent(in) :: handle - character, intent(in) :: transa, transb - integer*8, intent(in) :: m, n, lda, ldb, ldc - double precision, intent(in) :: alpha, beta - double precision, target :: a(*), b(*), c(*) + double precision, target :: a, b, c call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, & c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t)) end subroutine @@ -675,51 +630,27 @@ end subroutine subroutine gpu_dgemm(handle, transa, transb, m, n, k, alpha, a, lda, & b, ldb, beta, c, ldc) - use gpu +! use gpu type(gpu_blas), intent(in) :: handle character, intent(in) :: transa, transb integer*4, intent(in) :: m, n, k, lda, ldb, ldc double precision, intent(in) :: alpha, beta - type(gpu_double2) :: a, b, c + double precision :: a, b, c call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), & - alpha, a%c, int(lda,c_int64_t), & - b%c, int(ldb,c_int64_t), beta, c%c, int(ldc,c_int64_t)) + alpha, a, int(lda,c_int64_t), & + b, int(ldb,c_int64_t), beta, c, int(ldc,c_int64_t)) end subroutine subroutine gpu_dgemm_64(handle, transa, transb, m, n, k, alpha, a, lda, & b, ldb, beta, c, ldc) - use gpu +! use gpu type(gpu_blas), intent(in) :: handle character, intent(in) :: transa, transb integer*8, intent(in) :: m, n, k, lda, ldb, ldc double precision, intent(in) :: alpha, beta - type(gpu_double2) :: a, b, c - call gpu_dgemm_c(handle%c, transa, transb, m, n, k, & - alpha, a%c, lda, b%c, ldb, beta, c%c, ldc) -end subroutine - -subroutine gpu_dgemm_f(handle, transa, transb, m, n, k, alpha, a, lda, & - b, ldb, beta, c, ldc) - use gpu - type(gpu_blas), intent(in) :: handle - character, intent(in) :: transa, transb - integer*4, intent(in) :: m, n, k, lda, ldb, ldc - double precision, intent(in) :: alpha, beta - double precision, target :: a(*), b(*), c(*) + double precision :: a, b, c call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), & - alpha, c_loc(a), int(lda,c_int64_t), & - c_loc(b), int(ldb,c_int64_t), beta, c_loc(c), int(ldc,c_int64_t)) -end subroutine - -subroutine gpu_dgemm_f_64(handle, transa, transb, m, n, k, alpha, a, lda, & - b, ldb, beta, c, ldc) - use gpu - type(gpu_blas), intent(in) :: handle - character, intent(in) :: transa, transb - integer*8, intent(in) :: m, n, k, lda, ldb, ldc - double precision, intent(in) :: alpha, beta - double precision, target :: a(*), b(*), c(*) - call gpu_dgemm_c(handle%c, transa, transb, m, n, k, & - alpha, c_loc(a), lda, c_loc(b), ldb, beta, c_loc(c), ldc) + alpha, a, int(lda,c_int64_t), b, int(ldb,c_int64_t), beta, c, int(ldc,c_int64_t)) end subroutine +end module From 447cdcd907dd864252777423763ed6947efc32d8 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 2 Jul 2024 17:22:41 +0200 Subject: [PATCH 15/38] Working on r1 --- plugins/local/gpu_nvidia/gpu.c | 32 ++--- src/ccsd/ccsd_space_orb_sub.irp.f | 31 ++++- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 176 ++++++++++++------------- src/gpu/gpu_module.F90 | 57 +++++++- 4 files changed, 182 insertions(+), 114 deletions(-) diff --git a/plugins/local/gpu_nvidia/gpu.c b/plugins/local/gpu_nvidia/gpu.c index 39a82984..e77847a6 100644 --- a/plugins/local/gpu_nvidia/gpu.c +++ b/plugins/local/gpu_nvidia/gpu.c @@ -149,7 +149,7 @@ void gpu_sdot(cublasHandle_t handle, const int64_t n, const float* x, const int6 -void gpu_dgemv(cublasHandle_t handle, const char transa, const int64_t m, const int64_t n, const double alpha, +void gpu_dgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const double alpha, const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) { assert (handle != NULL); @@ -171,14 +171,14 @@ void gpu_dgemv(cublasHandle_t handle, const char transa, const int64_t m, const assert ( (int64_t) incy_ == incy); cublasOperation_t transa_ = CUBLAS_OP_N; - if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; cublasDgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_); } -void gpu_sgemv(cublasHandle_t handle, const char transa, const int64_t m, const int64_t n, const float alpha, +void gpu_sgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const float alpha, const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) { assert (handle != NULL); @@ -200,13 +200,13 @@ void gpu_sgemv(cublasHandle_t handle, const char transa, const int64_t m, const assert ( (int64_t) incy_ == incy); cublasOperation_t transa_ = CUBLAS_OP_N; - if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; cublasSgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_); } -void gpu_dgemm(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, +void gpu_dgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) { assert (handle != NULL); @@ -231,15 +231,15 @@ void gpu_dgemm(cublasHandle_t handle, const char transa, const char transb, cons cublasOperation_t transa_ = CUBLAS_OP_N; cublasOperation_t transb_ = CUBLAS_OP_N; - if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; - if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; + if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T; cublasDgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_); } -void gpu_sgemm(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, +void gpu_sgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) { assert (handle != NULL); @@ -264,14 +264,14 @@ void gpu_sgemm(cublasHandle_t handle, const char transa, const char transb, cons cublasOperation_t transa_ = CUBLAS_OP_N; cublasOperation_t transb_ = CUBLAS_OP_N; - if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; - if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; + if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T; cublasSgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_); } -void gpu_dgeam(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, +void gpu_dgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double alpha, const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { assert (handle != NULL); @@ -293,15 +293,15 @@ void gpu_dgeam(cublasHandle_t handle, const char transa, const char transb, cons cublasOperation_t transa_ = CUBLAS_OP_N; cublasOperation_t transb_ = CUBLAS_OP_N; - if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; - if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; + if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T; cublasDgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_); } -void gpu_sgeam(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, +void gpu_sgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float alpha, const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) { assert (handle != NULL); @@ -323,8 +323,8 @@ void gpu_sgeam(cublasHandle_t handle, const char transa, const char transb, cons cublasOperation_t transa_ = CUBLAS_OP_N; cublasOperation_t transb_ = CUBLAS_OP_N; - if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; - if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; + if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T; cublasSgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_); diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index de109cea..256f743b 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -20,8 +20,8 @@ subroutine run_ccsd_space_orb type(gpu_double3) :: d_cc_space_v_oo_chol, d_cc_space_v_vo_chol type(gpu_double3) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol - type(gpu_double4) :: d_cc_space_v_oovv - + type(gpu_double4) :: d_cc_space_v_oovv, d_cc_space_v_voov, d_cc_space_v_ovov + type(gpu_double4) :: d_cc_space_v_oovo double precision, allocatable :: all_err(:,:), all_t(:,:) integer, allocatable :: list_occ(:), list_vir(:) @@ -69,6 +69,7 @@ subroutine run_ccsd_space_orb call gpu_upload(cc_space_f_oo, d_cc_space_f_oo) call gpu_upload(cc_space_f_vo, d_cc_space_f_vo) + call gpu_upload(cc_space_f_ov, d_cc_space_f_ov) call gpu_upload(cc_space_f_vv, d_cc_space_f_vv) ! FREE cc_space_f_oo @@ -92,6 +93,18 @@ subroutine run_ccsd_space_orb ! FREE cc_space_v_vv_chol endif + call gpu_allocate(d_cc_space_v_voov, nV, nO, nO, nV) + call gpu_allocate(d_cc_space_v_ovov, nO, nV, nO, nV) + call gpu_allocate(d_cc_space_v_oovo, nO, nO, nV, nO) + + call gpu_upload(cc_space_v_voov, d_cc_space_v_voov) + call gpu_upload(cc_space_v_ovov, d_cc_space_v_ovov) + call gpu_upload(cc_space_v_oovo, d_cc_space_v_oovo) + +! FREE cc_space_v_voov +! FREE cc_space_v_ovov +! FREE cc_space_v_oovo + call gpu_allocate(t2, nO,nO,nV,nV) call gpu_allocate(r2, nO,nO,nV,nV) call gpu_allocate(tau, nO,nO,nV,nV) @@ -185,7 +198,8 @@ subroutine run_ccsd_space_orb call compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, d_cc_space_v_ov_chol,H_vv) call compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol, H_vo) - call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1) + call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_space_f_ov,d_cc_space_f_vo, & + d_cc_space_v_voov, d_cc_space_v_ovov, d_cc_space_v_oovo, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol) call compute_r2_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r2%f,max_r2) else call compute_H_oo(nO,nV,t1%f,t2%f,tau%f,H_oo%f) @@ -292,8 +306,17 @@ subroutine run_ccsd_space_orb call gpu_deallocate(d_cc_space_v_vo_chol) call gpu_deallocate(d_cc_space_v_vv_chol) endif - call gpu_deallocate(d_cc_space_f_vo) + call gpu_deallocate(d_cc_space_v_oovv) + call gpu_deallocate(d_cc_space_v_voov) + call gpu_deallocate(d_cc_space_v_ovov) + call gpu_deallocate(d_cc_space_v_oovo) + + call gpu_deallocate(d_cc_space_f_oo) + call gpu_deallocate(d_cc_space_f_vo) + call gpu_deallocate(d_cc_space_f_ov) + call gpu_deallocate(d_cc_space_f_vv) + call gpu_deallocate(t1) call gpu_deallocate(t2) diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index a3490589..6190e985 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -199,59 +199,52 @@ end ! R1 -subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) - +subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_space_f_ov,d_cc_space_f_vo, & + d_cc_space_v_voov, d_cc_space_v_ovov, d_cc_space_v_oovo, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol) + use gpu implicit none ! in integer, intent(in) :: nO, nV - double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV), tau(nO,nO,nV,nV) - double precision, intent(in) :: H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO) + type(gpu_double2), intent(in) :: t1, H_oo, H_vo, H_vv, d_cc_space_f_ov,d_cc_space_f_vo + type(gpu_double3), intent(in) :: d_cc_space_v_vo_chol, d_cc_space_v_vv_chol + type(gpu_double4), intent(in) :: t2, tau, d_cc_space_v_voov, d_cc_space_v_ovov, d_cc_space_v_oovo ! out - double precision, intent(out) :: r1(nO,nV), max_r1 + type(gpu_double2), intent(out) :: r1 + double precision, intent(out) :: max_r1 ! internal integer :: u,i,j,beta,a,b - !$omp parallel & - !$omp shared(nO,nV,r1,cc_space_f_ov) & - !$omp private(u,beta) & - !$omp default(none) - !$omp do - do beta = 1, nV - do u = 1, nO - r1(u,beta) = cc_space_f_ov(u,beta) - enddo - enddo - !$omp end do - !$omp end parallel + call gpu_copy(d_cc_space_f_ov, r1) - double precision, allocatable :: X_oo(:,:) - allocate(X_oo(nO,nO)) - call dgemm('N','N', nO, nO, nV, & - -2d0, t1 , size(t1,1), & - cc_space_f_vo, size(cc_space_f_vo,1), & - 0d0, X_oo , size(X_oo,1)) + type(gpu_double2) :: X_oo + call gpu_allocate(X_oo,nO,nO) - call dgemm('T','N', nO, nV, nO, & - 1d0, X_oo, size(X_oo,2), & - t1 , size(t1,1), & - 1d0, r1 , size(r1,1)) - deallocate(X_oo) + call gpu_dgemm(blas_handle, 'N','N', nO, nO, nV, & + -2d0, t1%f(1,1), size(t1%f,1), & + d_cc_space_f_vo%f(1,1), size(d_cc_space_f_vo%f,1), & + 0d0, X_oo%f(1,1), size(X_oo%f,1)) - call dgemm('N','N', nO, nV, nV, & - 1d0, t1 , size(t1,1), & - H_vv, size(H_vv,1), & - 1d0, r1 , size(r1,1)) + call gpu_dgemm(blas_handle, 'T','N', nO, nV, nO, & + 1d0, X_oo%f(1,1), size(X_oo%f,2), & + t1%f(1,1) , size(t1%f,1), & + 1d0, r1%f(1,1) , size(r1%f,1)) - call dgemm('N','N', nO, nV, nO, & - -1d0, H_oo, size(H_oo,1), & - t1 , size(t1,1), & - 1d0, r1, size(r1,1)) + call gpu_dgemm(blas_handle, 'N','N', nO, nV, nV, & + 1d0, t1%f(1,1) , size(t1%f,1), & + H_vv%f(1,1), size(H_vv%f,1), & + 1d0, r1%f(1,1) , size(r1%f,1)) + + call gpu_dgemm(blas_handle, 'N','N', nO, nV, nO, & + -1d0, H_oo%f(1,1), size(H_oo%f,1), & + t1%f(1,1) , size(t1%f,1), & + 1d0, r1%f(1,1), size(r1%f,1)) + + type(gpu_double4) :: X_voov + call gpu_allocate(X_voov, nV, nO, nO, nV) - double precision, allocatable :: X_voov(:,:,:,:) - allocate(X_voov(nV, nO, nO, nV)) !$omp parallel & !$omp shared(nO,nV,X_voov,t2,t1) & @@ -262,7 +255,7 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) do u = 1, nO do i = 1, nO do a = 1, nV - X_voov(a,i,u,beta) = 2d0 * t2(i,u,a,beta) - t2(u,i,a,beta) + t1(u,a) * t1(i,beta) + X_voov%f(a,i,u,beta) = 2d0 * t2%f(i,u,a,beta) - t2%f(u,i,a,beta) + t1%f(u,a) * t1%f(i,beta) enddo enddo enddo @@ -270,18 +263,20 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) !$omp end do !$omp end parallel - call dgemv('T', nV*nO, nO*nV, & - 1d0, X_voov, size(X_voov,1) * size(X_voov,2), & - H_vo , 1, & - 1d0, r1 , 1) + call gpu_dgemv(blas_handle, 'T', nV*nO, nO*nV, & + 1d0, X_voov%f(1,1,1,1), size(X_voov%f,1) * size(X_voov%f,2), & + H_vo%f(1,1) , 1, & + 1d0, r1%f(1,1) , 1) - deallocate(X_voov) + call gpu_synchronize() + call gpu_deallocate(X_oo) + call gpu_deallocate(X_voov) - double precision, allocatable :: X_ovov(:,:,:,:) - allocate(X_ovov(nO, nV, nO, nV)) + type(gpu_double4) :: X_ovov + call gpu_allocate(X_ovov, nO, nV, nO, nV) !$omp parallel & - !$omp shared(nO,nV,cc_space_v_ovov,cc_space_v_voov,X_ovov) & + !$omp shared(nO,nV,d_cc_space_v_ovov,d_cc_space_v_voov,X_ovov) & !$omp private(u,beta,i,a) & !$omp default(none) !$omp do @@ -289,7 +284,7 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) do u = 1, nO do a = 1, nv do i = 1, nO - X_ovov(i,a,u,beta) = 2d0 * cc_space_v_voov(a,u,i,beta) - cc_space_v_ovov(u,a,i,beta) + X_ovov%f(i,a,u,beta) = 2d0 * d_cc_space_v_voov%f(a,u,i,beta) - d_cc_space_v_ovov%f(u,a,i,beta) enddo enddo enddo @@ -297,17 +292,25 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) !$omp end do !$omp end parallel - call dgemv('T', nO*nV, nO*nV, & - 1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), & - t1 , 1, & - 1d0, r1 , 1) - - deallocate(X_ovov) +! call dgemv('T', nO*nV, nO*nV, & +! 1d0, X_ovov%f, size(X_ovov%f,1) * size(X_ovov%f,2), & +! t1%f, 1, & +! 1d0, r1%f, 1) + call gpu_dgemv(blas_handle, 'T', nO*nV, nO*nV, & + 1d0, X_ovov%f(1,1,1,1), size(X_ovov%f,1) * size(X_ovov%f,2), & + t1%f(1,1), 1, & + 1d0, r1%f(1,1), 1) integer :: iblock, block_size, nVmax - double precision, allocatable :: W_vvov(:,:,:,:), W_vvov_tmp(:,:,:,:), T_vvoo(:,:,:,:) + type(gpu_double4) :: W_vvov, W_vvov_tmp, T_vvoo + block_size = 16 - allocate(W_vvov(nV,nV,nO,block_size), W_vvov_tmp(nV,nO,nV,block_size), T_vvoo(nV,nV,nO,nO)) + call gpu_allocate(W_vvov,nV, nV,nO,block_size) + call gpu_allocate(W_vvov_tmp, nV,nO,nV,block_size) + call gpu_allocate(T_vvoo, nV,nV,nO,nO) + + call gpu_synchronize() + call gpu_deallocate(X_ovov) !$omp parallel & !$omp private(u,i,b,a) & @@ -317,7 +320,7 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) do i = 1, nO do b = 1, nV do a = 1, nV - T_vvoo(a,b,i,u) = tau(i,u,a,b) + T_vvoo%f(a,b,i,u) = tau%f(i,u,a,b) enddo enddo enddo @@ -328,11 +331,12 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) do iblock = 1, nV, block_size nVmax = min(block_size,nV-iblock+1) - call dgemm('T','N', nV*nO, nV*nVmax, cholesky_mo_num, 1.d0, & - cc_space_v_vo_chol , cholesky_mo_num, & - cc_space_v_vv_chol(1,1,iblock), cholesky_mo_num, & - 0.d0, W_vvov_tmp, nV*nO) + call gpu_dgemm(blas_handle, 'T','N', nV*nO, nV*nVmax, cholesky_mo_num, 1.d0, & + d_cc_space_v_vo_chol%f(1,1,1) , cholesky_mo_num, & + d_cc_space_v_vv_chol%f(1,1,iblock), cholesky_mo_num, & + 0.d0, W_vvov_tmp%f(1,1,1,1), nV*nO) + call gpu_synchronize() !$omp parallel & !$omp private(b,i,a,beta) & !$omp default(shared) @@ -341,7 +345,7 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) !$omp do do b = 1, nV do a = 1, nV - W_vvov(a,b,i,beta) = 2d0 * W_vvov_tmp(a,i,b,beta) - W_vvov_tmp(b,i,a,beta) + W_vvov%f(a,b,i,beta) = 2d0 * W_vvov_tmp%f(a,i,b,beta) - W_vvov_tmp%f(b,i,a,beta) enddo enddo !$omp end do nowait @@ -350,20 +354,22 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) !$omp barrier !$omp end parallel - call dgemm('T','N',nO,nVmax,nO*nV*nV, & - 1d0, T_vvoo, nV*nV*nO, & - W_vvov, nO*nV*nV, & - 1d0, r1(1,iblock), nO) + call gpu_dgemm(blas_handle, 'T','N',nO,nVmax,nO*nV*nV, & + 1d0, T_vvoo%f(1,1,1,1), nV*nV*nO, & + W_vvov%f(1,1,1,1), nO*nV*nV, & + 1d0, r1%f(1,iblock), nO) enddo - deallocate(W_vvov,T_vvoo) + call gpu_synchronize() + call gpu_deallocate(W_vvov) + call gpu_deallocate(T_vvoo) - double precision, allocatable :: W_oovo(:,:,:,:) - allocate(W_oovo(nO,nO,nV,nO)) + type(gpu_double4) :: W_oovo + call gpu_allocate(W_oovo, nO,nO,nV,nO) !$omp parallel & - !$omp shared(nO,nV,cc_space_v_oovo,W_oovo) & + !$omp shared(nO,nV,d_cc_space_v_oovo,W_oovo) & !$omp private(u,a,i,j) & !$omp default(none) do u = 1, nO @@ -371,8 +377,7 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) do a = 1, nV do j = 1, nO do i = 1, nO -! W_oovo(i,j,a,u) = 2d0 * cc_space_v_vooo(a,u,i,j) - cc_space_v_vooo(a,u,j,i) - W_oovo(i,j,a,u) = 2d0 * cc_space_v_oovo(i,j,a,u) - cc_space_v_oovo(j,i,a,u) + W_oovo%f(i,j,a,u) = 2d0 * d_cc_space_v_oovo%f(i,j,a,u) - d_cc_space_v_oovo%f(j,i,a,u) enddo enddo enddo @@ -380,33 +385,22 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) enddo !$omp end parallel - call dgemm('T','N', nO, nV, nO*nO*nV, & - -1d0, W_oovo, size(W_oovo,1) * size(W_oovo,2) * size(W_oovo,3), & - tau , size(tau,1) * size(tau,2) * size(tau,3), & - 1d0, r1 , size(r1,1)) + ! Change the sign for consistency with the code in spin orbitals + call gpu_dgemm(blas_handle, 'T','N', nO, nV, nO*nO*nV, & + 1d0, W_oovo%f(1,1,1,1), size(W_oovo%f,1) * size(W_oovo%f,2) * size(W_oovo%f,3), & + tau%f(1,1,1,1), size(tau%f,1) * size(tau%f,2) * size(tau%f,3), & + -1d0, r1%f(1,1), size(r1%f,1)) - deallocate(W_oovo) + call gpu_synchronize() + call gpu_deallocate(W_oovo) max_r1 = 0d0 do a = 1, nV do i = 1, nO - max_r1 = max(dabs(r1(i,a)), max_r1) + max_r1 = max(dabs(r1%f(i,a)), max_r1) enddo enddo - ! Change the sign for consistency with the code in spin orbitals - !$omp parallel & - !$omp shared(nO,nV,r1) & - !$omp private(a,i) & - !$omp default(none) - !$omp do - do a = 1, nV - do i = 1, nO - r1(i,a) = -r1(i,a) - enddo - enddo - !$omp end do - !$omp end parallel end diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90 index 20d99ede..949ae4fc 100644 --- a/src/gpu/gpu_module.F90 +++ b/src/gpu/gpu_module.F90 @@ -136,7 +136,7 @@ module gpu type(c_ptr), value, intent(in) :: handle integer(c_int64_t), value :: n, incx, incy type(c_ptr), intent(in), value :: dx, dy - real(c_float), intent(out) :: res + real(c_float), intent(out) :: res end subroutine subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, & @@ -145,7 +145,7 @@ module gpu type(c_ptr), value, intent(in) :: handle character(c_char), intent(in), value :: transa, transb integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc - real(c_double), intent(in), value :: alpha, beta + real(c_double), intent(in), value :: alpha, beta type(c_ptr), value :: a, b, c end subroutine @@ -155,10 +155,31 @@ module gpu type(c_ptr), value, intent(in) :: handle character(c_char), intent(in), value :: transa, transb integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc - real(c_float), intent(in), value :: alpha, beta + real(c_float), intent(in), value :: alpha, beta real(c_float) :: a, b, c end subroutine + subroutine gpu_dgemv_c(handle, transa, m, n, alpha, a, lda, & + x, incx, beta, y, incy) bind(C, name='gpu_dgemv') + import + type(c_ptr), value, intent(in) :: handle + character(c_char), intent(in) :: transa + integer(c_int64_t), intent(in), value :: m, n, lda, incx, incy + real(c_double), intent(in), value :: alpha, beta + real(c_double) :: a, x, y + end subroutine + + subroutine gpu_sgemv_c(handle, transa, m, n, alpha, a, lda, & + x, incx, beta, y, incy) bind(C, name='gpu_sgemv') + import + type(c_ptr), value, intent(in) :: handle + character(c_char), intent(in) :: transa + integer(c_int64_t), intent(in), value :: m, n, lda, incx, incy + real(c_float), intent(in), value :: alpha, beta + real(c_float) :: a, x, y + end subroutine + + subroutine gpu_dgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, & b, ldb, beta, c, ldc) bind(C, name='gpu_dgemm') import @@ -625,6 +646,36 @@ subroutine gpu_dgeam_64(handle, transa, transb, m, n, alpha, a, lda, beta, & end subroutine +! gemv +! ---- + +subroutine gpu_dgemv(handle, transa, m, n, alpha, a, lda, & + x, incx, beta, y, incy) +! use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa + integer*4, intent(in) :: m, n, lda, incx, incy + double precision, intent(in) :: alpha, beta + double precision :: a, x, y + call gpu_dgemv_c(handle%c, transa, int(m,c_int64_t), int(n,c_int64_t), & + alpha, a, int(lda,c_int64_t), & + x, int(incx,c_int64_t), beta, y, int(incy,c_int64_t)) +end subroutine + +subroutine gpu_dgemv_64(handle, transa, m, n, alpha, a, lda, & + x, incx, beta, y, incy) +! use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa + integer*8, intent(in) :: m, n, lda, incx, incy + double precision, intent(in) :: alpha, beta + double precision :: a, x, y + call gpu_dgemv_c(handle%c, transa, int(m,c_int64_t), int(n,c_int64_t), & + alpha, a, int(lda,c_int64_t), & + x, int(incx,c_int64_t), beta, y, int(incy,c_int64_t)) +end subroutine + + ! gemm ! ---- From 92fe3a6f84b0af99bf554602528969699c206cde Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 2 Jul 2024 18:36:19 +0200 Subject: [PATCH 16/38] Working on r1 --- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 37 ++++++++++++++++++++------ 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index 6190e985..e0048637 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -245,23 +245,44 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_s type(gpu_double4) :: X_voov call gpu_allocate(X_voov, nV, nO, nO, nV) + type(gpu_stream) :: stream(nV) + + do a=1,nV + call gpu_stream_create(stream(a)) + enddo + + call gpu_synchronize() + +! do i=1,nO +! do beta=1,nV +! call gpu_set_stream(blas_handle, stream(beta)) +! call gpu_dgeam(blas_handle, 'T', 'T', nV, nO, -1.d0, t2%f(1,i,1,beta), & +! nO*nO, t1%f(i,beta), t1%f(1,1), nO, X_voov%f(1,i,1,beta), nV) +! enddo +! enddo - !$omp parallel & - !$omp shared(nO,nV,X_voov,t2,t1) & - !$omp private(u,beta,i,a) & - !$omp default(none) - !$omp do do beta = 1, nV do u = 1, nO do i = 1, nO do a = 1, nV - X_voov%f(a,i,u,beta) = 2d0 * t2%f(i,u,a,beta) - t2%f(u,i,a,beta) + t1%f(u,a) * t1%f(i,beta) + X_voov%f(a,i,u,beta) = - t2%f(u,i,a,beta) + t1%f(u,a) * t1%f(i,beta) enddo enddo enddo enddo - !$omp end do - !$omp end parallel + call gpu_synchronize() + + do beta=1,nV + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'T', nV, nO*nO, 1.d0, X_voov%f(1,1,1,beta), & + nV, 2.d0, t2%f(1,1,1,beta), nO*nO, X_voov%f(1,1,1,beta), nV) + enddo + + call gpu_synchronize() + do a=1,nV + call gpu_stream_destroy(stream(a)) + enddo + call gpu_set_stream(blas_handle, gpu_default_stream) call gpu_dgemv(blas_handle, 'T', nV*nO, nO*nV, & 1d0, X_voov%f(1,1,1,1), size(X_voov%f,1) * size(X_voov%f,2), & From cc09f8c61a0e8a29e7d2a2933d9659c1ad70a7b5 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Wed, 3 Jul 2024 14:52:11 +0200 Subject: [PATCH 17/38] Minor changes in Cholesky --- src/ao_two_e_ints/cholesky.irp.f | 6 +++--- src/mo_two_e_ints/map_integrals.irp.f | 2 +- src/tools/four_idx_transform.irp.f | 3 +++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f index acb0872b..a2d9d043 100644 --- a/src/ao_two_e_ints/cholesky.irp.f +++ b/src/ao_two_e_ints/cholesky.irp.f @@ -158,9 +158,9 @@ END_PROVIDER Lset(np8) = p8 endif enddo - np = np8 + if (np8 > ndim8) stop 'np>ndim8' + np = int(np8,4) if (np <= 0) stop 'np<=0' - if (np > ndim8) stop 'np>ndim8' rank_max = min(np,20*elec_num*elec_num) call mmap(trim(ezfio_work_dir)//'cholesky_ao_tmp', (/ ndim8, rank_max /), 8, fd(1), .False., .True., c_pointer(1)) @@ -431,7 +431,7 @@ END_PROVIDER Lset(np8) = p8 endif enddo - np = np8 + np = int(np8,4) enddo diff --git a/src/mo_two_e_ints/map_integrals.irp.f b/src/mo_two_e_ints/map_integrals.irp.f index 168c34b4..eeb4279f 100644 --- a/src/mo_two_e_ints/map_integrals.irp.f +++ b/src/mo_two_e_ints/map_integrals.irp.f @@ -40,7 +40,7 @@ end ! Min and max values of the MOs for which the integrals are in the cache END_DOC - mo_integrals_cache_size = 2_8**mo_integrals_cache_shift + mo_integrals_cache_size = 2**mo_integrals_cache_shift mo_integrals_cache_min = max(1,elec_alpha_num - (mo_integrals_cache_size/2 - 1) ) mo_integrals_cache_max = min(mo_num, mo_integrals_cache_min + mo_integrals_cache_size - 1) diff --git a/src/tools/four_idx_transform.irp.f b/src/tools/four_idx_transform.irp.f index 92e87cad..fc6bface 100644 --- a/src/tools/four_idx_transform.irp.f +++ b/src/tools/four_idx_transform.irp.f @@ -12,6 +12,9 @@ program four_idx_transform ! END_DOC + if (do_mo_cholesky) then + stop 'Not implemented with Cholesky integrals' + endif io_mo_two_e_integrals = 'Write' SOFT_TOUCH io_mo_two_e_integrals if (.true.) then From 2f8e7bd4f79108476bcc1b04165912d629d7d924 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Wed, 3 Jul 2024 15:32:38 +0200 Subject: [PATCH 18/38] Updated to read CHolesky MO integrals from TREXIO --- src/trexio/export_trexio_routines.irp.f | 2 +- src/trexio/import_trexio_integrals.irp.f | 226 ++++++++++++++++------- 2 files changed, 159 insertions(+), 69 deletions(-) diff --git a/src/trexio/export_trexio_routines.irp.f b/src/trexio/export_trexio_routines.irp.f index 63630243..0eec68bd 100644 --- a/src/trexio/export_trexio_routines.irp.f +++ b/src/trexio/export_trexio_routines.irp.f @@ -557,7 +557,7 @@ subroutine export_trexio(update,full_path) do k=1,cholesky_ao_num do j=1,mo_num do i=1,mo_num - integral = cholesky_mo(i,j,k) + integral = cholesky_mo_transp(k,i,j) if (integral == 0.d0) cycle icount += 1_8 chol_buffer(icount) = integral diff --git a/src/trexio/import_trexio_integrals.irp.f b/src/trexio/import_trexio_integrals.irp.f index 5a6b3c03..556ed7bc 100644 --- a/src/trexio/import_trexio_integrals.irp.f +++ b/src/trexio/import_trexio_integrals.irp.f @@ -28,7 +28,7 @@ subroutine run(f) integer(trexio_t), intent(in) :: f ! TREXIO file handle integer(trexio_exit_code) :: rc - integer ::i,j,k,l + integer :: i,j,k,l, iunit integer(8) :: m, n_integrals double precision :: integral @@ -41,10 +41,12 @@ subroutine run(f) integer , allocatable :: Vi(:,:) double precision :: s -! TODO: -! - If Cholesky AO in trexio file, read cholesky ao vectors -! - If Cholesky MO in trexio file, read cholesky mo vectors -! - If Cholesky MO not in trexio file, force do_cholesky_mo to False + integer*4 :: BUFSIZE + integer :: rank + double precision, allocatable :: tmp(:,:,:) + integer*8 :: offset, icount + + integer, external :: getUnitAndOpen if (trexio_has_nucleus_repulsion(f) == TREXIO_SUCCESS) then rc = trexio_read_nucleus_repulsion(f, s) @@ -120,45 +122,88 @@ subroutine run(f) rc = trexio_has_ao_2e_int(f) PROVIDE ao_num if (rc /= TREXIO_HAS_NOT) then - PROVIDE ao_integrals_map - integer*4 :: BUFSIZE - BUFSIZE=ao_num**2 - allocate(buffer_i(BUFSIZE), buffer_values(BUFSIZE)) - allocate(Vi(4,BUFSIZE), V(BUFSIZE)) + rc = trexio_has_ao_2e_int_eri_cholesky(f) + if (rc /= TREXIO_HAS_NOT) then - integer*8 :: offset, icount + rc = trexio_read_ao_2e_int_eri_cholesky_num(f, rank) + call trexio_assert(rc, TREXIO_SUCCESS) - offset = 0_8 - icount = BUFSIZE - rc = TREXIO_SUCCESS - do while (icount == size(V)) - rc = trexio_read_ao_2e_int_eri(f, offset, icount, Vi, V) - do m=1,icount - i = Vi(1,m) - j = Vi(2,m) - k = Vi(3,m) - l = Vi(4,m) - integral = V(m) - call two_e_integrals_index(i, j, k, l, buffer_i(m) ) - buffer_values(m) = integral - enddo - call insert_into_ao_integrals_map(int(icount,4),buffer_i,buffer_values) - offset = offset + icount - if (rc /= TREXIO_SUCCESS) then - exit - endif - end do - n_integrals = offset + allocate(tmp(ao_num,ao_num,rank)) + tmp(:,:,:) = 0.d0 - call map_sort(ao_integrals_map) - call map_unique(ao_integrals_map) + BUFSIZE=ao_num**2 + allocate(Vi(3,BUFSIZE), V(BUFSIZE)) - call map_save_to_disk(trim(ezfio_filename)//'/work/ao_ints',ao_integrals_map) - call ezfio_set_ao_two_e_ints_io_ao_two_e_integrals('Read') - deallocate(buffer_i, buffer_values, Vi, V) - print *, 'AO integrals read from TREXIO file' + offset = 0_8 + icount = BUFSIZE + rc = TREXIO_SUCCESS + do while (icount == size(V)) + rc = trexio_read_ao_2e_int_eri_cholesky(f, offset, icount, Vi, V) + do m=1,icount + i = Vi(1,m) + j = Vi(2,m) + k = Vi(3,m) + integral = V(m) + tmp(i,j,k) = integral + enddo + offset = offset + icount + if (rc /= TREXIO_SUCCESS) then + exit + endif + end do + + print *, 'Writing Cholesky AO vectors to disk...' + iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_ao', 'W') + write(iunit) rank + write(iunit) tmp(:,:,:) + close(iunit) + call ezfio_set_ao_two_e_ints_io_ao_cholesky('Read') + + deallocate(Vi, V, tmp) + print *, 'Cholesky AO integrals read from TREXIO file' + endif + + rc = trexio_has_ao_2e_int_eri(f) + if (rc /= TREXIO_HAS_NOT) then + PROVIDE ao_integrals_map + + BUFSIZE=ao_num**2 + allocate(buffer_i(BUFSIZE), buffer_values(BUFSIZE)) + allocate(Vi(4,BUFSIZE), V(BUFSIZE)) + + offset = 0_8 + icount = BUFSIZE + rc = TREXIO_SUCCESS + do while (icount == size(V)) + rc = trexio_read_ao_2e_int_eri(f, offset, icount, Vi, V) + do m=1,icount + i = Vi(1,m) + j = Vi(2,m) + k = Vi(3,m) + l = Vi(4,m) + integral = V(m) + call two_e_integrals_index(i, j, k, l, buffer_i(m) ) + buffer_values(m) = integral + enddo + call insert_into_ao_integrals_map(int(icount,4),buffer_i,buffer_values) + offset = offset + icount + if (rc /= TREXIO_SUCCESS) then + exit + endif + end do + n_integrals = offset + + call map_sort(ao_integrals_map) + call map_unique(ao_integrals_map) + + call map_save_to_disk(trim(ezfio_filename)//'/work/ao_ints',ao_integrals_map) + call ezfio_set_ao_two_e_ints_io_ao_two_e_integrals('Read') + + deallocate(buffer_i, buffer_values, Vi, V) + print *, 'AO integrals read from TREXIO file' + endif else print *, 'AO integrals not found in TREXIO file' endif @@ -186,40 +231,85 @@ subroutine run(f) rc = trexio_has_mo_2e_int(f) if (rc /= TREXIO_HAS_NOT) then - BUFSIZE=mo_num**2 - allocate(buffer_i(BUFSIZE), buffer_values(BUFSIZE)) - allocate(Vi(4,BUFSIZE), V(BUFSIZE)) + rc = trexio_has_mo_2e_int_eri_cholesky(f) + if (rc /= TREXIO_HAS_NOT) then + + rc = trexio_read_mo_2e_int_eri_cholesky_num(f, rank) + call trexio_assert(rc, TREXIO_SUCCESS) + + allocate(tmp(rank,mo_num,mo_num)) + tmp(:,:,:) = 0.d0 + + BUFSIZE=mo_num**2 + allocate(Vi(3,BUFSIZE), V(BUFSIZE)) + + offset = 0_8 + icount = BUFSIZE + rc = TREXIO_SUCCESS + do while (icount == size(V)) + rc = trexio_read_mo_2e_int_eri_cholesky(f, offset, icount, Vi, V) + do m=1,icount + i = Vi(1,m) + j = Vi(2,m) + k = Vi(3,m) + integral = V(m) + tmp(k,i,j) = integral + enddo + offset = offset + icount + if (rc /= TREXIO_SUCCESS) then + exit + endif + end do + + print *, 'Writing Cholesky MO vectors to disk...' + iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_mo_transp', 'W') + write(iunit) rank + write(iunit) tmp(:,:,:) + close(iunit) + call ezfio_set_mo_two_e_ints_io_mo_cholesky('Read') + + deallocate(Vi, V, tmp) + print *, 'Cholesky MO integrals read from TREXIO file' + endif + + rc = trexio_has_mo_2e_int_eri(f) + if (rc /= TREXIO_HAS_NOT) then + BUFSIZE=mo_num**2 + allocate(buffer_i(BUFSIZE), buffer_values(BUFSIZE)) + allocate(Vi(4,BUFSIZE), V(BUFSIZE)) - offset = 0_8 - icount = BUFSIZE - rc = TREXIO_SUCCESS - do while (icount == size(V)) - rc = trexio_read_mo_2e_int_eri(f, offset, icount, Vi, V) - do m=1,icount - i = Vi(1,m) - j = Vi(2,m) - k = Vi(3,m) - l = Vi(4,m) - integral = V(m) - call two_e_integrals_index(i, j, k, l, buffer_i(m) ) - buffer_values(m) = integral - enddo - call map_append(mo_integrals_map, buffer_i, buffer_values, int(icount,4)) - offset = offset + icount - if (rc /= TREXIO_SUCCESS) then - exit - endif - end do - n_integrals = offset + offset = 0_8 + icount = BUFSIZE + rc = TREXIO_SUCCESS + do while (icount == size(V)) + rc = trexio_read_mo_2e_int_eri(f, offset, icount, Vi, V) + do m=1,icount + i = Vi(1,m) + j = Vi(2,m) + k = Vi(3,m) + l = Vi(4,m) + integral = V(m) + call two_e_integrals_index(i, j, k, l, buffer_i(m) ) + buffer_values(m) = integral + enddo + call map_append(mo_integrals_map, buffer_i, buffer_values, int(icount,4)) + offset = offset + icount + if (rc /= TREXIO_SUCCESS) then + exit + endif + end do + n_integrals = offset - call map_sort(mo_integrals_map) - call map_unique(mo_integrals_map) + call map_sort(mo_integrals_map) + call map_unique(mo_integrals_map) + + call map_save_to_disk(trim(ezfio_filename)//'/work/mo_ints',mo_integrals_map) + call ezfio_set_mo_two_e_ints_io_mo_two_e_integrals('Read') + deallocate(buffer_i, buffer_values, Vi, V) + print *, 'MO integrals read from TREXIO file' + endif - call map_save_to_disk(trim(ezfio_filename)//'/work/mo_ints',mo_integrals_map) - call ezfio_set_mo_two_e_ints_io_mo_two_e_integrals('Read') - deallocate(buffer_i, buffer_values, Vi, V) - print *, 'MO integrals read from TREXIO file' else print *, 'MO integrals not found in TREXIO file' endif From 7ceb8fdcca5cd7bff3984c816eef8e47aa681a0b Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Wed, 3 Jul 2024 18:24:13 +0200 Subject: [PATCH 19/38] Finished r1 --- plugins/local/gpu_nvidia/gpu.c | 41 +++--- plugins/local/gpu_x86/gpu.c | 112 ++++++++-------- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 177 ++++++++++--------------- src/gpu/gpu.h | 24 ++-- src/gpu/gpu_module.F90 | 12 +- 5 files changed, 162 insertions(+), 204 deletions(-) diff --git a/plugins/local/gpu_nvidia/gpu.c b/plugins/local/gpu_nvidia/gpu.c index e77847a6..a775ab95 100644 --- a/plugins/local/gpu_nvidia/gpu.c +++ b/plugins/local/gpu_nvidia/gpu.c @@ -116,11 +116,6 @@ void gpu_ddot(cublasHandle_t handle, const int64_t n, const double* x, const int assert ( (int64_t) incy_ == incy); cublasStatus_t rc = cublasDdot(handle, n_, x, incx_, y, incy_, result); -/* - double alpha = 1.0; - double beta = 0.0; - cublasStatus_t rc = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, n_, &alpha, x, 1, y, n_, &beta, &result_, 1); -*/ assert (rc == CUBLAS_STATUS_SUCCESS); } @@ -149,8 +144,8 @@ void gpu_sdot(cublasHandle_t handle, const int64_t n, const float* x, const int6 -void gpu_dgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const double alpha, - const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) { +void gpu_dgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) { assert (handle != NULL); @@ -173,13 +168,13 @@ void gpu_dgemv(cublasHandle_t handle, const char* transa, const int64_t m, const cublasOperation_t transa_ = CUBLAS_OP_N; if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; - cublasDgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_); + cublasDgemv(handle, transa_, m_, n_, alpha, a, lda_, x, incx_, beta, y, incy_); } -void gpu_sgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const float alpha, - const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) { +void gpu_sgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const float* alpha, + const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy) { assert (handle != NULL); @@ -202,12 +197,12 @@ void gpu_sgemv(cublasHandle_t handle, const char* transa, const int64_t m, const cublasOperation_t transa_ = CUBLAS_OP_N; if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; - cublasSgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_); + cublasSgemv(handle, transa_, m_, n_, alpha, a, lda_, x, incx_, beta, y, incy_); } -void gpu_dgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, - const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) { +void gpu_dgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha, + const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) { assert (handle != NULL); @@ -234,13 +229,13 @@ void gpu_dgemm(cublasHandle_t handle, const char* transa, const char* transb, co if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T; - cublasDgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_); + cublasDgemm(handle, transa_, transb_, m_, n_, k_, alpha, a, lda_, b, ldb_, beta, c, ldc_); } -void gpu_sgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, - const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) { +void gpu_sgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float* alpha, + const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc) { assert (handle != NULL); @@ -267,12 +262,12 @@ void gpu_sgemm(cublasHandle_t handle, const char* transa, const char* transb, co if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T; - cublasSgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_); + cublasSgemm(handle, transa_, transb_, m_, n_, k_, alpha, a, lda_, b, ldb_, beta, c, ldc_); } -void gpu_dgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double alpha, - const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { +void gpu_dgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { assert (handle != NULL); /* Convert to int */ @@ -296,13 +291,13 @@ void gpu_dgeam(cublasHandle_t handle, const char* transa, const char* transb, co if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T; - cublasDgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_); + cublasDgeam(handle, transa_, transb_, m_, n_, alpha, a, lda_, beta, b, ldb_, c, ldc_); } -void gpu_sgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float alpha, - const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) { +void gpu_sgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float* alpha, + const float* a, const int64_t lda, const float* beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) { assert (handle != NULL); /* Convert to int */ @@ -326,6 +321,6 @@ void gpu_sgeam(cublasHandle_t handle, const char* transa, const char* transb, co if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T; - cublasSgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_); + cublasSgeam(handle, transa_, transb_, m_, n_, alpha, a, lda_, beta, b, ldb_, c, ldc_); } diff --git a/plugins/local/gpu_x86/gpu.c b/plugins/local/gpu_x86/gpu.c index fe3cadc5..49aec9d3 100644 --- a/plugins/local/gpu_x86/gpu.c +++ b/plugins/local/gpu_x86/gpu.c @@ -124,8 +124,8 @@ void gpu_sdot(void* handle, const int64_t n, const float* x, const int64_t incx, void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha, const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy); -void gpu_dgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const double alpha, - const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) { +void gpu_dgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) { assert (handle != NULL); @@ -145,15 +145,15 @@ void gpu_dgemv(void* handle, const char* transa, const int64_t m, const int64_t assert ( (int64_t) incx_ == incx); assert ( (int64_t) incy_ == incy); - dgemv_(transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_); + dgemv_(transa, &m_, &n_, alpha, a, &lda_, x, &incx_, beta, y, &incy_); } void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha, const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy); -void gpu_sgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const float alpha, - const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) { +void gpu_sgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const float* alpha, + const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy) { assert (handle != NULL); @@ -173,15 +173,15 @@ void gpu_sgemv(void* handle, const char* transa, const int64_t m, const int64_t assert ( (int64_t) incx_ == incx); assert ( (int64_t) incy_ == incy); - sgemv_(transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_); + sgemv_(transa, &m_, &n_, alpha, a, &lda_, x, &incx_, beta, y, &incy_); } void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha, const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc); -void gpu_dgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, - const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) { +void gpu_dgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha, + const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) { assert (handle != NULL); @@ -203,7 +203,7 @@ void gpu_dgemm(void* handle, const char* transa, const char* transb, const int64 assert ( (int64_t) ldb_ == ldb); assert ( (int64_t) ldc_ == ldc); - dgemm_(transa, transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_); + dgemm_(transa, transb, &m_, &n_, &k_, alpha, a, &lda_, b, &ldb_, beta, c, &ldc_); } @@ -211,8 +211,8 @@ void gpu_dgemm(void* handle, const char* transa, const char* transb, const int64 void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha, const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc); -void gpu_sgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, - const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) { +void gpu_sgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float* alpha, + const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc) { assert (handle != NULL); @@ -234,12 +234,12 @@ void gpu_sgemm(void* handle, const char* transa, const char* transb, const int64 assert ( (int64_t) ldb_ == ldb); assert ( (int64_t) ldc_ == ldc); - sgemm_(transa, transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_); + sgemm_(transa, transb, &m_, &n_, &k_, alpha, a, &lda_, b, &ldb_, beta, c, &ldc_); } -void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double alpha, - const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { +void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { assert (handle != NULL); if ( (*transa == 'N' && *transb == 'N') || @@ -247,19 +247,19 @@ void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64 (*transa == 'N' && *transb == 'n') || (*transa == 'n' && *transb == 'n') ) { - if (alpha == 0.) { + if (*alpha == 0.) { for (int64_t j=0 ; j Date: Thu, 4 Jul 2024 12:01:16 +0200 Subject: [PATCH 20/38] Working on r2 --- src/ccsd/ccsd_space_orb_sub.irp.f | 27 +- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 353 +++++++++++++------------ 2 files changed, 200 insertions(+), 180 deletions(-) diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 256f743b..59b9ebd2 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -21,7 +21,8 @@ subroutine run_ccsd_space_orb type(gpu_double3) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol type(gpu_double4) :: d_cc_space_v_oovv, d_cc_space_v_voov, d_cc_space_v_ovov - type(gpu_double4) :: d_cc_space_v_oovo + type(gpu_double4) :: d_cc_space_v_oovo, d_cc_space_v_vooo, d_cc_space_v_oooo + type(gpu_double4) :: d_cc_space_v_vvoo double precision, allocatable :: all_err(:,:), all_t(:,:) integer, allocatable :: list_occ(:), list_vir(:) @@ -93,17 +94,29 @@ subroutine run_ccsd_space_orb ! FREE cc_space_v_vv_chol endif + call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV) call gpu_allocate(d_cc_space_v_voov, nV, nO, nO, nV) call gpu_allocate(d_cc_space_v_ovov, nO, nV, nO, nV) call gpu_allocate(d_cc_space_v_oovo, nO, nO, nV, nO) + call gpu_allocate(d_cc_space_v_vooo, nV, nO, nO, nO) + call gpu_allocate(d_cc_space_v_oooo, nO, nO, nO, nO) + call gpu_allocate(d_cc_space_v_vvoo, nV, nV, nO, nO) + call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv) call gpu_upload(cc_space_v_voov, d_cc_space_v_voov) call gpu_upload(cc_space_v_ovov, d_cc_space_v_ovov) call gpu_upload(cc_space_v_oovo, d_cc_space_v_oovo) + call gpu_upload(cc_space_v_vooo, d_cc_space_v_vooo) + call gpu_upload(cc_space_v_oooo, d_cc_space_v_oooo) + call gpu_upload(cc_space_v_vvoo, d_cc_space_v_vvoo) ! FREE cc_space_v_voov ! FREE cc_space_v_ovov ! FREE cc_space_v_oovo +! FREE cc_space_v_oovv +! FREE cc_space_v_vooo +! FREE cc_space_v_oooo +! FREE cc_space_v_vvoo call gpu_allocate(t2, nO,nO,nV,nV) call gpu_allocate(r2, nO,nO,nV,nV) @@ -165,15 +178,8 @@ subroutine run_ccsd_space_orb call gpu_upload(h_t2, t2) - call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV) - call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv) - -! FREE cc_space_v_oovv - - call update_tau_space(nO,nV,h_t1,t1,t2,tau) call update_tau_x_space(nO,nV,tau,tau_x) - !print*,'hf_energy', hf_energy call det_energy(det,uncorr_energy) print*,'Det energy', uncorr_energy @@ -200,7 +206,10 @@ subroutine run_ccsd_space_orb call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_space_f_ov,d_cc_space_f_vo, & d_cc_space_v_voov, d_cc_space_v_ovov, d_cc_space_v_oovo, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol) - call compute_r2_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r2%f,max_r2) + call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & + d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, & + d_cc_space_v_vvoo, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, & + r2, max_r2) else call compute_H_oo(nO,nV,t1%f,t2%f,tau%f,H_oo%f) call compute_H_vv(nO,nV,t1%f,t2%f,tau%f,H_vv%f) diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index c34b390b..0474dcec 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -391,168 +391,162 @@ end ! R2 -subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) - +subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & + d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, & + d_cc_space_v_vvoo, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, & + r2,max_r2) + use gpu implicit none ! in - integer, intent(in) :: nO, nV - double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV), tau(nO,nO,nV,nV) - double precision, intent(in) :: H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO) + integer, intent(in) :: nO, nV + type(gpu_double2), intent(in) :: t1, H_oo, H_vv + type(gpu_double4), intent(in) :: t2, tau, d_cc_space_v_oovv + type(gpu_double4), intent(in) :: d_cc_space_v_vooo, d_cc_space_v_oooo + type(gpu_double4), intent(in) :: d_cc_space_v_vvoo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol + type(gpu_double3), intent(in) :: d_cc_space_v_vo_chol ! out - double precision, intent(out) :: r2(nO,nO,nV,nV), max_r2 + double precision, intent(out) :: max_r2 + type(gpu_double4), intent(out) :: r2 ! internal integer :: u,v,i,j,beta,gam,a,b double precision :: max_r2_local + type(gpu_stream) :: stream(nV) + call set_multiple_levels_omp(.False.) - !$omp parallel & - !$omp shared(nO,nV,r2,cc_space_v_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = cc_space_v_oovv(u,v,beta,gam) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel + call gpu_copy(d_cc_space_v_oovv, r2) - double precision, allocatable :: A1(:,:,:,:) - allocate(A1(nO,nO,nO,nO)) - call compute_A1_chol(nO,nV,t1,t2,tau,A1) - call dgemm('N','N',nO*nO,nV*nV,nO*nO, & - 1d0, A1, size(A1,1) * size(A1,2), & - tau, size(tau,1) * size(tau,2), & - 1d0, r2, size(r2,1) * size(r2,2)) + type(gpu_double4) :: A1 + call gpu_allocate(A1,nO,nO,nO,nO) + call compute_A1_chol(nO,nV,t1,t2,tau,d_cc_space_v_vooo, & + d_cc_space_v_oooo, d_cc_space_v_vvoo, A1) + + call gpu_dgemm(blas_handle, 'N','N',nO*nO,nV*nV,nO*nO, & + 1d0, A1%f(1,1,1,1), size(A1%f,1) * size(A1%f,2), & + tau%f(1,1,1,1), size(tau%f,1) * size(tau%f,2), & + 1d0, r2%f(1,1,1,1), size(r2%f,1) * size(r2%f,2)) + + call gpu_deallocate(A1) - deallocate(A1) integer :: block_size, iblock, k block_size = 16 - double precision, dimension(:,:,:), allocatable :: B1, tmp_cc, tmpB1 - double precision, dimension(:,:), allocatable :: tmp_cc2 + type(gpu_double3) :: tmp_cc, B1, tmpB1 + type(gpu_double2) :: tmp_cc2 - allocate(tmp_cc(cholesky_mo_num,nV,nV)) - call dgemm('N','N', cholesky_mo_num*nV, nV, nO, 1.d0, & - cc_space_v_vo_chol, cholesky_mo_num*nV, t1, nO, 0.d0, tmp_cc, cholesky_mo_num*nV) + call gpu_allocate(tmp_cc,cholesky_mo_num,nV,nV) + call gpu_dgemm(blas_handle, 'N','N', cholesky_mo_num*nV, nV, nO, 1.d0, & + d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num*nV, t1%f(1,1), nO, 0.d0, tmp_cc%f(1,1,1), cholesky_mo_num*nV) call set_multiple_levels_omp(.False.) + call gpu_synchronize() + + type(gpu_blas) :: blas + + !$OMP PARALLEL PRIVATE(gam, iblock, B1, tmpB1, tmp_cc2, beta, b, a, blas) + call gpu_allocate(B1,nV,nV,block_size) + call gpu_allocate(tmpB1,nV,block_size,nV) + call gpu_allocate(tmp_cc2,cholesky_mo_num,nV) + + call gpu_blas_create(blas) - !$OMP PARALLEL PRIVATE(gam, iblock, B1, tmpB1, tmp_cc2, beta, b, a) - allocate(B1(nV,nV,block_size), tmpB1(nV,block_size,nV), tmp_cc2(cholesky_mo_num,nV)) !$OMP DO do gam = 1, nV - do a=1,nV - do k=1,cholesky_mo_num - tmp_cc2(k,a) = cc_space_v_vv_chol(k,a,gam) - tmp_cc(k,a,gam) - enddo - enddo + call gpu_dgeam(blas, 'N', 'N', cholesky_mo_num, nV, 1.d0, d_cc_space_v_vv_chol%f(1,1,gam), & + cholesky_mo_num, -1.d0, tmp_cc%f(1,1,gam), cholesky_mo_num, tmp_cc2%f(1,1), cholesky_mo_num) do iblock = 1, nV, block_size - call dgemm('T', 'N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, & - -1.d0, tmp_cc(1,1,iblock), cholesky_mo_num, & - cc_space_v_vv_chol(1,1,gam), cholesky_mo_num, & - 0.d0, tmpB1, nV*block_size) + call gpu_dgemm(blas, 'T', 'N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, & + -1.d0, tmp_cc%f(1,1,iblock), cholesky_mo_num, & + d_cc_space_v_vv_chol%f(1,1,gam), cholesky_mo_num, & + 0.d0, tmpB1%f(1,1,1), nV*block_size) - call dgemm('T','N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, & - 1.d0, cc_space_v_vv_chol(1,1,iblock), cholesky_mo_num, & - tmp_cc2, cholesky_mo_num, & - 1.d0, tmpB1, nV*block_size) + call gpu_dgemm(blas, 'T','N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, & + 1.d0, d_cc_space_v_vv_chol%f(1,1,iblock), cholesky_mo_num, & + tmp_cc2%f(1,1), cholesky_mo_num, & + 1.d0, tmpB1%f(1,1,1), nV*block_size) do beta = iblock, min(nV, iblock+block_size-1) - do b = 1, nV - do a = 1, nV - B1(a,b,beta-iblock+1) = tmpB1(a,beta-iblock+1,b) - enddo - enddo + call gpu_dgeam(blas, 'N', 'N', nV, nV, 1.d0, tmpB1%f(1,beta-iblock+1,1), & + nV*block_size, 0.d0, B1%f(1,1,beta-iblock+1), nV, B1%f(1,1,beta-iblock+1), nV) enddo - call dgemm('N','N',nO*nO,min(block_size, nV-iblock+1),nV*nV, & - 1d0, tau, size(tau,1) * size(tau,2), & - B1 , size(B1 ,1) * size(B1 ,2), & - 1d0, r2(1,1,iblock,gam), size(r2 ,1) * size(r2 ,2)) + call gpu_dgemm(blas, 'N','N',nO*nO,min(block_size, nV-iblock+1),nV*nV, & + 1d0, tau%f(1,1,1,1), size(tau%f,1) * size(tau%f,2), & + B1%f(1,1,1) , size(B1%f ,1) * size(B1%f ,2), & + 1d0, r2%f(1,1,iblock,gam), size(r2%f ,1) * size(r2%f ,2)) enddo enddo !$OMP ENDDO - deallocate(B1, tmpB1, tmp_cc2) + call gpu_blas_destroy(blas) + + call gpu_deallocate(B1) + call gpu_deallocate(tmpB1) + call gpu_deallocate(tmp_cc2) !$OMP END PARALLEL - deallocate(tmp_cc) + call gpu_deallocate(tmp_cc) + type(gpu_double4) :: X_oovv + call gpu_allocate(X_oovv,nO,nO,nV,nV) + call gpu_copy(t2,X_oovv) - double precision, allocatable :: X_oovv(:,:,:,:) - allocate(X_oovv(nO,nO,nV,nV)) - !$omp parallel & - !$omp shared(nO,nV,t2,X_oovv) & - !$omp private(u,v,gam,a) & - !$omp default(none) - !$omp do - do a = 1, nV - do gam = 1, nV - do v = 1, nO - do u = 1, nO - X_oovv(u,v,gam,a) = t2(u,v,gam,a) - enddo - enddo - enddo + type(gpu_double2) :: g_vir + call gpu_allocate(g_vir,nV,nV) + call compute_g_vir_chol(nO,nV,t1%f,t2%f,H_vv%f,g_vir%f) + + type(gpu_double4) :: Y_oovv + call gpu_allocate(Y_oovv,nO,nO,nV,nV) + + call gpu_dgemm(blas_handle, 'N','N',nO*nO*nV,nV,nV, & + 1d0, X_oovv%f(1,1,1,1), size(X_oovv%f,1) * size(X_oovv%f,2) * size(X_oovv%f,3), & + g_vir%f(1,1), size(g_vir%f,1), & + 0d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1) * size(Y_oovv%f,2) * size(Y_oovv%f,3)) + + call gpu_synchronize() + + do a=1,nV + call gpu_stream_create(stream(a)) enddo - !$omp end do - !$omp end parallel - double precision, allocatable :: g_vir(:,:) - allocate(g_vir(nV,nV)) - call compute_g_vir_chol(nO,nV,t1,t2,H_vv,g_vir) - - double precision, allocatable :: Y_oovv(:,:,:,:) - allocate(Y_oovv(nO,nO,nV,nV)) - - call dgemm('N','N',nO*nO*nV,nV,nV, & - 1d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3), & - g_vir, size(g_vir,1), & - 0d0, Y_oovv, size(Y_oovv,1) * size(Y_oovv,2) * size(Y_oovv,3)) - deallocate(g_vir) - deallocate(X_oovv) - - !$omp parallel & - !$omp shared(nO,nV,r2,Y_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do do gam = 1, nV do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) + Y_oovv(u,v,beta,gam) + Y_oovv(v,u,gam,beta) - enddo - enddo + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, 1.d0, Y_oovv%f(1,1,beta,gam), & + nO, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO) + call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, 1.d0, r2%f(1,1,beta,gam), & + nO, 1.d0, Y_oovv%f(1,1,gam,beta), nO, r2%f(1,1,beta,gam), nO) enddo enddo - !$omp end do - !$omp end parallel - deallocate(Y_oovv) - double precision, allocatable :: g_occ(:,:) - allocate(g_occ(nO,nO)) - call compute_g_occ_chol(nO,nV,t1,t2,H_oo,g_occ) + call gpu_deallocate(g_vir) + call gpu_set_stream(blas_handle, gpu_default_stream) - allocate(X_oovv(nO,nO,nV,nV)) - call dgemm('N','N',nO,nO*nV*nV,nO, & - 1d0, g_occ , size(g_occ,1), & - t2 , size(t2,1), & - 0d0, X_oovv, size(X_oovv,1)) - deallocate(g_occ) + do a=1,nV + call gpu_stream_destroy(stream(a)) + enddo + + call gpu_deallocate(Y_oovv) + + type(gpu_double2) :: g_occ + call gpu_allocate(g_occ,nO,nO) + + call compute_g_occ_chol(nO,nV,t1%f,t2%f,H_oo%f,g_occ%f) + + call gpu_dgemm(blas_handle, 'N','N',nO,nO*nV*nV,nO, & + 1d0, g_occ%f(1,1), size(g_occ%f,1), & + t2%f(1,1,1,1) , size(t2%f,1), & + 0d0, X_oovv%f(1,1,1,1), size(X_oovv%f,1)) + + call gpu_synchronize() !$omp parallel & !$omp shared(nO,nV,r2,X_oovv) & @@ -563,7 +557,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do beta = 1, nV do v = 1, nO do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(u,v,beta,gam) - X_oovv(v,u,gam,beta) + r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - X_oovv%f(u,v,beta,gam) - X_oovv%f(v,u,gam,beta) enddo enddo enddo @@ -571,27 +565,39 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) !$omp end do !$omp end parallel - deallocate(X_oovv) + call gpu_deallocate(g_occ) + call gpu_deallocate(X_oovv) - double precision, allocatable :: X_vovv(:,:,:,:) + type(gpu_double4) :: X_vovv + + call gpu_allocate(X_vovv,nV,nO,nV,block_size) + call gpu_allocate(Y_oovv,nO,nO,nV,nV) - allocate(X_vovv(nV,nO,nV,block_size)) - allocate(Y_oovv(nO,nO,nV,nV)) do iblock = 1, nV, block_size do gam = iblock, min(nV, iblock+block_size-1) - call dgemm('T','N',nV, nO*nV, cholesky_mo_num, 1.d0, & - cc_space_v_vv_chol(1,1,gam), cholesky_mo_num, cc_space_v_ov_chol, & - cholesky_mo_num, 0.d0, X_vovv(1,1,1,gam-iblock+1), nV) + call gpu_stream_create(stream(gam)) + call gpu_set_stream(blas_handle, stream(gam)) + call gpu_dgemm(blas_handle, 'T','N',nV, nO*nV, cholesky_mo_num, 1.d0, & + d_cc_space_v_vv_chol%f(1,1,gam), cholesky_mo_num, d_cc_space_v_ov_chol%f(1,1,1), & + cholesky_mo_num, 0.d0, X_vovv%f(1,1,1,gam-iblock+1), nV) enddo + do gam = iblock, min(nV, iblock+block_size-1) + call gpu_stream_destroy(stream(gam)) + enddo + + call gpu_synchronize() + + call gpu_set_stream(blas_handle, gpu_default_stream) call dgemm('N','N',nO,nO*nV*min(block_size, nV-iblock+1),nV, & - 1d0, t1 , size(t1,1), & - X_vovv, size(X_vovv,1), & - 0d0, Y_oovv(1,1,1,iblock), size(Y_oovv,1)) + 1d0, t1%f , size(t1%f,1), & + X_vovv%f, size(X_vovv%f,1), & + 0d0, Y_oovv%f(1,1,1,iblock), size(Y_oovv%f,1)) enddo - deallocate(X_vovv) + call gpu_synchronize() + call gpu_deallocate(X_vovv) !$omp parallel & !$omp shared(nO,nV,r2,Y_oovv) & @@ -602,14 +608,14 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do beta = 1, nV do v = 1, nO do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) + Y_oovv(v,u,beta,gam) + Y_oovv(u,v,gam,beta) + r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) + Y_oovv%f(v,u,beta,gam) + Y_oovv%f(u,v,gam,beta) enddo enddo enddo enddo !$omp end do !$omp end parallel - deallocate(Y_oovv) + call gpu_deallocate(Y_oovv) double precision, allocatable :: X_ovvo(:,:,:,:) double precision, allocatable :: tcc(:,:,:), tcc2(:,:,:) @@ -617,11 +623,11 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) allocate(tcc(cholesky_mo_num,nO,nV)) call dgemm('N','T', cholesky_mo_num*nV, nO, nV, 1.d0, & - cc_space_v_vv_chol, cholesky_mo_num*nV, t1, nO, & + d_cc_space_v_vv_chol%f, cholesky_mo_num*nV, t1%f, nO, & 0.d0, tcc2, cholesky_mo_num*nV) call dgemm('N','N', cholesky_mo_num*nO, nV, nO, 1.d0, & - cc_space_v_oo_chol, cholesky_mo_num*nO, t1, nO, & + cc_space_v_oo_chol, cholesky_mo_num*nO, t1%f, nO, & 0.d0, tcc, cholesky_mo_num*nO) call dgemm('T','N', nO*nV, nV*nO, cholesky_mo_num, 1.d0, & @@ -639,7 +645,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do beta = 1, nV do v = 1, nO do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_ovvo(u,beta,gam,v) + r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - X_ovvo(u,beta,gam,v) enddo enddo enddo @@ -650,7 +656,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do gam = 1, nV do v = 1, nO do u = 1, nO - r2(v,u,gam,beta) = r2(v,u,gam,beta) - X_ovvo(u,beta,gam,v) + r2%f(v,u,gam,beta) = r2%f(v,u,gam,beta) - X_ovvo(u,beta,gam,v) enddo enddo enddo @@ -661,12 +667,12 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) deallocate(X_ovvo) !----- - allocate(X_oovv(nO,nO,nV,nV)) + call gpu_allocate(X_oovv,nO,nO,nV,nV) call dgemm('N','N',nO*nO*nV,nV,nO, & 1d0, cc_space_v_oovo, size(cc_space_v_oovo,1) * size(cc_space_v_oovo,2) * size(cc_space_v_oovo,3), & - t1 , size(t1,1), & - 0d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3)) + t1%f , size(t1%f,1), & + 0d0, X_oovv%f, size(X_oovv%f,1) * size(X_oovv%f,2) * size(X_oovv%f,3)) !$omp parallel & !$omp shared(nO,nV,r2,X_oovv) & @@ -677,14 +683,14 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do beta = 1, nV do v = 1, nO do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(u,v,beta,gam) - X_oovv(v,u,gam,beta) + r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - X_oovv%f(u,v,beta,gam) - X_oovv%f(v,u,gam,beta) enddo enddo enddo enddo !$omp end do !$omp end parallel - deallocate(X_oovv) + call gpu_deallocate(X_oovv) double precision, allocatable :: X_vovo(:,:,:,:), Y_oovo(:,:,:,:) allocate(X_vovo(nV,nO,nV,nO)) @@ -708,16 +714,16 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) allocate(Y_oovo(nO,nO,nV,nO)) call dgemm('N','N',nO,nO*nV*nO,nV, & - 1d0, t1, size(t1,1), & + 1d0, t1%f, size(t1%f,1), & X_vovo, size(X_vovo,1), & 0d0, Y_oovo, size(Y_oovo,1)) deallocate(X_vovo) - allocate(X_oovv(nO,nO,nV,nV)) + call gpu_allocate(X_oovv,nO,nO,nV,nV) call dgemm('N','N',nO*nO*nV, nV, nO, & 1d0, Y_oovo, size(Y_oovo,1) * size(Y_oovo,2) * size(Y_oovo,3), & - t1 , size(t1,1), & - 0d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3)) + t1%f , size(t1%f,1), & + 0d0, X_oovv%f, size(X_oovv%f,1) * size(X_oovv%f,2) * size(X_oovv%f,3)) deallocate(Y_oovo) !$omp parallel & @@ -729,24 +735,24 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do beta = 1, nV do v = 1, nO do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(u,v,gam,beta) - X_oovv(v,u,beta,gam) + r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - X_oovv%f(u,v,gam,beta) - X_oovv%f(v,u,beta,gam) enddo enddo enddo enddo !$omp end do !$omp end parallel - deallocate(X_oovv) + call gpu_deallocate(X_oovv) double precision, allocatable :: J1(:,:,:,:) allocate(J1(nO,nV,nV,nO)) - call compute_J1_chol(nO,nV,t1,t2,cc_space_v_ovvo,cc_space_v_ovoo, & + call compute_J1_chol(nO,nV,t1%f,t2%f,cc_space_v_ovvo,cc_space_v_ovoo, & cc_space_v_vvoo,J1) double precision, allocatable :: K1(:,:,:,:) allocate(K1(nO,nV,nO,nV)) - call compute_K1_chol(nO,nV,t1,t2,cc_space_v_ovoo,cc_space_v_vvoo, & + call compute_K1_chol(nO,nV,t1%f,t2%f,cc_space_v_ovoo,cc_space_v_vvoo, & cc_space_v_ovov,K1) allocate(X_ovvo(nO,nV,nV,nO)) @@ -778,7 +784,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do v = 1, nO do i = 1, nO do a = 1, nV - Y_voov(a,i,v,gam) = 2d0 * t2(i,v,a,gam) - t2(i,v,gam,a) + Y_voov(a,i,v,gam) = 2d0 * t2%f(i,v,a,gam) - t2%f(i,v,gam,a) enddo enddo enddo @@ -805,7 +811,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do beta = 1, nV do v = 1, nO do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) + Z_ovov(u,beta,v,gam) + Z_ovov(v,gam,u,beta) + r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) + Z_ovov(u,beta,v,gam) + Z_ovov(v,gam,u,beta) enddo enddo enddo @@ -820,7 +826,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) allocate(Y_ovov(nO,nV,nO,nV)) !$omp parallel & - !$omp shared(nO,nV,r2,K1,X_ovov,Y_ovov,t2) & + !$omp shared(nO,nV,K1,X_ovov,Y_ovov,t2) & !$omp private(u,a,i,beta,gam) & !$omp default(none) !$omp do @@ -840,7 +846,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do v = 1, nO do a = 1, nV do i = 1, nO - Y_ovov(i,a,v,gam) = t2(i,v,gam,a) + Y_ovov(i,a,v,gam) = t2%f(i,v,gam,a) enddo enddo enddo @@ -864,7 +870,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do beta = 1, nV do v = 1, nO do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - Z_ovov(u,beta,v,gam) - Z_ovov(v,gam,u,beta) + r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - Z_ovov(u,beta,v,gam) - Z_ovov(v,gam,u,beta) enddo enddo enddo @@ -895,7 +901,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do v = 1, nO do a = 1, nV do i = 1, nO - Y_ovov(i,a,v,beta) = t2(i,v,beta,a) + Y_ovov(i,a,v,beta) = t2%f(i,v,beta,a) enddo enddo enddo @@ -922,7 +928,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do beta = 1, nV do v = 1, nO do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - Z_ovov(u,gam,v,beta) - Z_ovov(v,beta,u,gam) + r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - Z_ovov(u,gam,v,beta) - Z_ovov(v,beta,u,gam) enddo enddo enddo @@ -945,8 +951,8 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do a = 1, nV do j = 1, nO do i = 1, nO - r2(i,j,a,b) = -r2(i,j,a,b) - max_r2_local = max(r2(i,j,a,b), max_r2_local) + r2%f(i,j,a,b) = -r2%f(i,j,a,b) + max_r2_local = max(r2%f(i,j,a,b), max_r2_local) enddo enddo enddo @@ -961,28 +967,29 @@ end ! A1 -subroutine compute_A1_chol(nO,nV,t1,t2,tau,A1) - +subroutine compute_A1_chol(nO,nV,t1,t2,tau,d_cc_space_v_vooo, & + d_cc_space_v_oooo, d_cc_space_v_vvoo, A1) + use gpu implicit none - integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(in) :: tau(nO, nO, nV, nV) - double precision, intent(out) :: A1(nO, nO, nO, nO) + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: t1 + type(gpu_double4), intent(in) :: t2, tau + type(gpu_double4), intent(in) :: d_cc_space_v_vooo, d_cc_space_v_oooo, d_cc_space_v_vvoo + type(gpu_double4), intent(out) :: A1 integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta - double precision, allocatable :: Y_oooo(:,:,:,:) - allocate(Y_oooo(nO,nO,nO,nO)) + type(gpu_double4) :: Y_oooo + call gpu_allocate(Y_oooo,nO,nO,nO,nO) ! A1(u,v,i,j) = cc_space_v_oooo(u,v,i,j) ! A1(u,v,i,j) += cc_space_v_ovoo(u,a,i,j) * t1(v,a) & call dgemm('N','N', nO, nO*nO*nO, nV, & - 1d0, t1 , size(t1,1), & - cc_space_v_vooo, size(cc_space_v_vooo,1), & - 0d0, Y_oooo, size(Y_oooo,1)) + 1d0, t1%f , size(t1%f,1), & + d_cc_space_v_vooo%f, size(d_cc_space_v_vooo%f,1), & + 0d0, Y_oooo%f, size(Y_oooo%f,1)) !$omp parallel & !$omp private(u,v,i,j) & @@ -992,7 +999,7 @@ subroutine compute_A1_chol(nO,nV,t1,t2,tau,A1) do i = 1, nO do v = 1, nO do u = 1, nO - A1(u,v,i,j) = cc_space_v_oooo(u,v,i,j) + Y_oooo(v,u,j,i) + Y_oooo(u,v,i,j) + A1%f(u,v,i,j) = d_cc_space_v_oooo%f(u,v,i,j) + Y_oooo%f(v,u,j,i) + Y_oooo%f(u,v,i,j) enddo enddo enddo @@ -1000,19 +1007,20 @@ subroutine compute_A1_chol(nO,nV,t1,t2,tau,A1) !$omp end do !$omp end parallel - deallocate(Y_oooo) + call gpu_deallocate(Y_oooo) ! A1(u,v,i,j) += cc_space_v_vvoo(a,b,i,j) * tau(u,v,a,b) call dgemm('N','N', nO*nO, nO*nO, nV*nV, & - 1d0, tau , size(tau,1) * size(tau,2), & - cc_space_v_vvoo, size(cc_space_v_vvoo,1) * size(cc_space_v_vvoo,2), & - 1d0, A1 , size(A1,1) * size(A1,2)) + 1d0, tau%f , size(tau%f,1) * size(tau%f,2), & + d_cc_space_v_vvoo%f, size(d_cc_space_v_vvoo%f,1) * size(d_cc_space_v_vvoo%f,2), & + 1d0, A1%f , size(A1%f,1) * size(A1%f,2)) end ! g_occ subroutine compute_g_occ_chol(nO,nV,t1,t2,H_oo,g_occ) + use gpu implicit none @@ -1048,6 +1056,7 @@ end ! g_vir subroutine compute_g_vir_chol(nO,nV,t1,t2,H_vv,g_vir) + use gpu implicit none @@ -1102,6 +1111,7 @@ end ! J1 subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvoo,J1) + use gpu implicit none integer, intent(in) :: nO,nV @@ -1305,6 +1315,7 @@ end ! K1 subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,K1) + use gpu implicit none From 5b1e5f84e6defc8857b5d29c54449e3b8d35cb67 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Thu, 4 Jul 2024 14:52:09 +0200 Subject: [PATCH 21/38] Working on r2 --- src/ccsd/ccsd_space_orb_sub.irp.f | 17 +- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 469 +++++++++++-------------- 2 files changed, 220 insertions(+), 266 deletions(-) diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 59b9ebd2..e97c2325 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -22,7 +22,7 @@ subroutine run_ccsd_space_orb type(gpu_double4) :: d_cc_space_v_oovv, d_cc_space_v_voov, d_cc_space_v_ovov type(gpu_double4) :: d_cc_space_v_oovo, d_cc_space_v_vooo, d_cc_space_v_oooo - type(gpu_double4) :: d_cc_space_v_vvoo + type(gpu_double4) :: d_cc_space_v_vvoo, d_cc_space_v_ovvo, d_cc_space_v_ovoo double precision, allocatable :: all_err(:,:), all_t(:,:) integer, allocatable :: list_occ(:), list_vir(:) @@ -98,17 +98,21 @@ subroutine run_ccsd_space_orb call gpu_allocate(d_cc_space_v_voov, nV, nO, nO, nV) call gpu_allocate(d_cc_space_v_ovov, nO, nV, nO, nV) call gpu_allocate(d_cc_space_v_oovo, nO, nO, nV, nO) + call gpu_allocate(d_cc_space_v_ovvo, nO, nV, nV, nO) call gpu_allocate(d_cc_space_v_vooo, nV, nO, nO, nO) call gpu_allocate(d_cc_space_v_oooo, nO, nO, nO, nO) call gpu_allocate(d_cc_space_v_vvoo, nV, nV, nO, nO) + call gpu_allocate(d_cc_space_v_ovoo, nO, nV, nO, nO) call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv) call gpu_upload(cc_space_v_voov, d_cc_space_v_voov) call gpu_upload(cc_space_v_ovov, d_cc_space_v_ovov) call gpu_upload(cc_space_v_oovo, d_cc_space_v_oovo) + call gpu_upload(cc_space_v_ovvo, d_cc_space_v_ovvo) call gpu_upload(cc_space_v_vooo, d_cc_space_v_vooo) call gpu_upload(cc_space_v_oooo, d_cc_space_v_oooo) call gpu_upload(cc_space_v_vvoo, d_cc_space_v_vvoo) + call gpu_upload(cc_space_v_ovoo, d_cc_space_v_ovoo) ! FREE cc_space_v_voov ! FREE cc_space_v_ovov @@ -117,6 +121,8 @@ subroutine run_ccsd_space_orb ! FREE cc_space_v_vooo ! FREE cc_space_v_oooo ! FREE cc_space_v_vvoo +! FREE cc_space_v_ovvo +! FREE cc_space_v_ovoo call gpu_allocate(t2, nO,nO,nV,nV) call gpu_allocate(r2, nO,nO,nV,nV) @@ -207,8 +213,8 @@ subroutine run_ccsd_space_orb call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_space_f_ov,d_cc_space_f_vo, & d_cc_space_v_voov, d_cc_space_v_ovov, d_cc_space_v_oovo, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol) call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & - d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, & - d_cc_space_v_vvoo, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, & + d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, d_cc_space_v_oovo, d_cc_space_v_ovvo, d_cc_space_v_ovoo, & + d_cc_space_v_ovov, d_cc_space_v_vvoo, d_cc_space_v_oo_chol, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, & r2, max_r2) else call compute_H_oo(nO,nV,t1%f,t2%f,tau%f,H_oo%f) @@ -320,6 +326,11 @@ subroutine run_ccsd_space_orb call gpu_deallocate(d_cc_space_v_voov) call gpu_deallocate(d_cc_space_v_ovov) call gpu_deallocate(d_cc_space_v_oovo) + call gpu_deallocate(d_cc_space_v_ovvo) + call gpu_deallocate(d_cc_space_v_vooo) + call gpu_deallocate(d_cc_space_v_oooo) + call gpu_deallocate(d_cc_space_v_vvoo) + call gpu_deallocate(d_cc_space_v_ovoo) call gpu_deallocate(d_cc_space_f_oo) call gpu_deallocate(d_cc_space_f_vo) diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index 0474dcec..abb9909b 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -392,8 +392,8 @@ end ! R2 subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & - d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, & - d_cc_space_v_vvoo, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, & + d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, d_cc_space_v_oovo, d_cc_space_v_ovvo, d_cc_space_v_ovoo, & + d_cc_space_v_ovov, d_cc_space_v_vvoo, d_cc_space_v_oo_chol, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, & r2,max_r2) use gpu implicit none @@ -403,9 +403,11 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & type(gpu_double2), intent(in) :: t1, H_oo, H_vv type(gpu_double4), intent(in) :: t2, tau, d_cc_space_v_oovv type(gpu_double4), intent(in) :: d_cc_space_v_vooo, d_cc_space_v_oooo - type(gpu_double4), intent(in) :: d_cc_space_v_vvoo - type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol - type(gpu_double3), intent(in) :: d_cc_space_v_vo_chol + type(gpu_double4), intent(in) :: d_cc_space_v_vvoo, d_cc_space_v_oovo + type(gpu_double4), intent(in) :: d_cc_space_v_ovvo, d_cc_space_v_ovoo + type(gpu_double4), intent(in) :: d_cc_space_v_ovov + type(gpu_double3), intent(in) :: d_cc_space_v_oo_chol, d_cc_space_v_ov_chol + type(gpu_double3), intent(in) :: d_cc_space_v_vo_chol, d_cc_space_v_vv_chol ! out double precision, intent(out) :: max_r2 @@ -499,9 +501,11 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & call gpu_allocate(X_oovv,nO,nO,nV,nV) call gpu_copy(t2,X_oovv) - type(gpu_double2) :: g_vir + type(gpu_double2) :: g_occ, g_vir call gpu_allocate(g_vir,nV,nV) + call gpu_allocate(g_occ,nO,nO) call compute_g_vir_chol(nO,nV,t1%f,t2%f,H_vv%f,g_vir%f) + call compute_g_occ_chol(nO,nV,t1%f,t2%f,H_oo%f,g_occ%f) type(gpu_double4) :: Y_oovv call gpu_allocate(Y_oovv,nO,nO,nV,nV) @@ -511,7 +515,41 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & g_vir%f(1,1), size(g_vir%f,1), & 0d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1) * size(Y_oovv%f,2) * size(Y_oovv%f,3)) + call gpu_dgemm(blas_handle, 'N','N',nO,nO*nV*nV,nO, & + -1d0, g_occ%f(1,1), size(g_occ%f,1), & + t2%f(1,1,1,1) , size(t2%f,1), & + 1d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1)) + + call gpu_dgemm(blas_handle, 'N','N',nO*nO*nV,nV,nO, & + -1d0, d_cc_space_v_oovo%f(1,1,1,1), size(cc_space_v_oovo,1) * size(cc_space_v_oovo,2) * size(cc_space_v_oovo,3), & + t1%f(1,1) , size(t1%f,1), & + 1d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1) * size(Y_oovv%f,2) * size(Y_oovv%f,3)) + call gpu_synchronize() + call gpu_deallocate(X_oovv) + + call gpu_deallocate(g_vir) + call gpu_deallocate(g_occ) + + type(gpu_double4) :: X_vovo, Y_oovo + call gpu_allocate(X_vovo,nV,nO,nV,nO) + +! !$omp parallel & +! !$omp shared(nO,nV,r2,Y_oovv) & +! !$omp private(u,v,gam,beta) & +! !$omp default(none) +! !$omp do +! do gam = 1, nV +! do beta = 1, nV +! do v = 1, nO +! do u = 1, nO +! r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) + Y_oovv%f(u,v,beta,gam) + Y_oovv%f(v,u,gam,beta) +! enddo +! enddo +! enddo +! enddo +! !$omp end do +! !$omp end parallel do a=1,nV call gpu_stream_create(stream(a)) @@ -527,52 +565,24 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & enddo enddo - call gpu_deallocate(g_vir) - call gpu_set_stream(blas_handle, gpu_default_stream) + do i = 1, nO + do gam = 1, nV + call gpu_set_stream(blas_handle, stream(gam)) + call gpu_dgeam(blas_handle, 'T', 'N', nV, nO, 1.d0, d_cc_space_v_ovvo%f(1,1,gam,i), & + nO, 0.d0, X_vovo%f(1,1,gam,i), nV, X_vovo%f(1,1,gam,i), nV) + enddo + enddo do a=1,nV call gpu_stream_destroy(stream(a)) enddo + call gpu_set_stream(blas_handle, gpu_default_stream) - call gpu_deallocate(Y_oovv) - - type(gpu_double2) :: g_occ - call gpu_allocate(g_occ,nO,nO) - - call compute_g_occ_chol(nO,nV,t1%f,t2%f,H_oo%f,g_occ%f) - - call gpu_dgemm(blas_handle, 'N','N',nO,nO*nV*nV,nO, & - 1d0, g_occ%f(1,1), size(g_occ%f,1), & - t2%f(1,1,1,1) , size(t2%f,1), & - 0d0, X_oovv%f(1,1,1,1), size(X_oovv%f,1)) - - call gpu_synchronize() - - !$omp parallel & - !$omp shared(nO,nV,r2,X_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - X_oovv%f(u,v,beta,gam) - X_oovv%f(v,u,gam,beta) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - call gpu_deallocate(g_occ) - call gpu_deallocate(X_oovv) type(gpu_double4) :: X_vovv call gpu_allocate(X_vovv,nV,nO,nV,block_size) - call gpu_allocate(Y_oovv,nO,nO,nV,nV) - + call gpu_allocate(Y_oovo,nO,nO,nV,nO) do iblock = 1, nV, block_size do gam = iblock, min(nV, iblock+block_size-1) @@ -590,241 +600,176 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & call gpu_synchronize() call gpu_set_stream(blas_handle, gpu_default_stream) - call dgemm('N','N',nO,nO*nV*min(block_size, nV-iblock+1),nV, & - 1d0, t1%f , size(t1%f,1), & - X_vovv%f, size(X_vovv%f,1), & + call gpu_dgemm(blas_handle, 'N','N', nO, & + nO*nV*min(block_size, nV-iblock+1),nV, & + 1.d0, t1%f(1,1) , size(t1%f,1), & + X_vovv%f(1,1,1,1), size(X_vovv%f,1), & 0d0, Y_oovv%f(1,1,1,iblock), size(Y_oovv%f,1)) enddo + + call gpu_dgemm(blas_handle, 'N','N',nO,nO*nV*nO,nV, & + 1d0, t1%f(1,1), size(t1%f,1), & + X_vovo%f(1,1,1,1), size(X_vovo%f,1), & + 0d0, Y_oovo%f(1,1,1,1), size(Y_oovo%f,1)) + + call gpu_dgemm(blas_handle, 'N','N',nO*nO*nV, nV, nO, & + -1d0, Y_oovo%f(1,1,1,1), size(Y_oovo%f,1) * size(Y_oovo%f,2) * size(Y_oovo%f,3), & + t1%f(1,1) , size(t1%f,1), & + 1d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1) * size(Y_oovv%f,2) * size(Y_oovv%f,3)) + call gpu_synchronize() call gpu_deallocate(X_vovv) + call gpu_deallocate(X_vovo) + call gpu_deallocate(Y_oovo) + +! !$omp parallel & +! !$omp shared(nO,nV,r2,Y_oovv) & +! !$omp private(u,v,gam,beta) & +! !$omp default(none) +! !$omp do +! do gam = 1, nV +! do beta = 1, nV +! do v = 1, nO +! do u = 1, nO +! r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) + Y_oovv%f(u,v,gam,beta) + Y_oovv%f(v,u,beta,gam) +! enddo +! enddo +! enddo +! enddo +! !$omp end do +! !$omp end parallel + + do a=1,nV + call gpu_stream_create(stream(a)) + enddo - !$omp parallel & - !$omp shared(nO,nV,r2,Y_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do do gam = 1, nV do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) + Y_oovv%f(v,u,beta,gam) + Y_oovv%f(u,v,gam,beta) - enddo - enddo + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'T', 'N', nO, nO, 1.d0, Y_oovv%f(1,1,beta,gam), & + nO, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, 1.d0, r2%f(1,1,beta,gam), & + nO, 1.d0, Y_oovv%f(1,1,gam,beta), nO, r2%f(1,1,beta,gam), nO) enddo enddo - !$omp end do - !$omp end parallel + + call gpu_set_stream(blas_handle, gpu_default_stream) + + do a=1,nV + call gpu_stream_destroy(stream(a)) + enddo + + + call gpu_synchronize() call gpu_deallocate(Y_oovv) - double precision, allocatable :: X_ovvo(:,:,:,:) - double precision, allocatable :: tcc(:,:,:), tcc2(:,:,:) - allocate(tcc2(cholesky_mo_num,nV,nO), X_ovvo(nO,nV,nV,nO)) - allocate(tcc(cholesky_mo_num,nO,nV)) + type(gpu_double4) :: X_ovvo + type(gpu_double3) :: tcc, tcc2 + call gpu_allocate(tcc2,cholesky_mo_num,nV,nO) + call gpu_allocate(X_ovvo,nO,nV,nV,nO) + call gpu_allocate(tcc,cholesky_mo_num,nO,nV) - call dgemm('N','T', cholesky_mo_num*nV, nO, nV, 1.d0, & - d_cc_space_v_vv_chol%f, cholesky_mo_num*nV, t1%f, nO, & - 0.d0, tcc2, cholesky_mo_num*nV) + call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nV, nO, nV, 1.d0, & + d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num*nV, t1%f(1,1), nO, & + 0.d0, tcc2%f(1,1,1), cholesky_mo_num*nV) - call dgemm('N','N', cholesky_mo_num*nO, nV, nO, 1.d0, & - cc_space_v_oo_chol, cholesky_mo_num*nO, t1%f, nO, & - 0.d0, tcc, cholesky_mo_num*nO) + call gpu_dgemm(blas_handle, 'N','N', cholesky_mo_num*nO, nV, nO, 1.d0, & + d_cc_space_v_oo_chol%f(1,1,1), cholesky_mo_num*nO, t1%f(1,1), nO, & + 0.d0, tcc%f(1,1,1), cholesky_mo_num*nO) - call dgemm('T','N', nO*nV, nV*nO, cholesky_mo_num, 1.d0, & - tcc, cholesky_mo_num, tcc2, cholesky_mo_num, 0.d0, & - X_ovvo, nO*nV) + call gpu_dgemm(blas_handle, 'T','N', nO*nV, nV*nO, cholesky_mo_num, 1.d0, & + tcc%f(1,1,1), cholesky_mo_num, tcc2%f(1,1,1), cholesky_mo_num, 0.d0, & + X_ovvo%f(1,1,1,1), nO*nV) - deallocate(tcc, tcc2) + call gpu_synchronize() + + do a=1,nV + call gpu_stream_create(stream(a)) + enddo - !$omp parallel & - !$omp shared(nO,nV,r2,X_ovvo) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do do gam = 1, nV do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - X_ovvo(u,beta,gam,v) - enddo - enddo + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, -1.d0, X_ovvo%f(1,beta,gam,1), & + nO*nV*nV, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO) + call gpu_dgeam(blas_handle, 'T', 'N', nO, nO, -1.d0, X_ovvo%f(1,gam,beta,1), & + nO*nV*nV, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO) enddo enddo - !$omp end do - !$omp do - do beta = 1, nV - do gam = 1, nV - do v = 1, nO - do u = 1, nO - r2%f(v,u,gam,beta) = r2%f(v,u,gam,beta) - X_ovvo(u,beta,gam,v) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - deallocate(X_ovvo) + call gpu_set_stream(blas_handle, gpu_default_stream) + + do a=1,nV + call gpu_stream_destroy(stream(a)) + enddo + + call gpu_synchronize() + call gpu_deallocate(tcc) + call gpu_deallocate(tcc2) + call gpu_deallocate(X_ovvo) + !----- - call gpu_allocate(X_oovv,nO,nO,nV,nV) + type(gpu_double4) :: J1, K1 + type(gpu_double4) :: Y_voov, Z_ovov - call dgemm('N','N',nO*nO*nV,nV,nO, & - 1d0, cc_space_v_oovo, size(cc_space_v_oovo,1) * size(cc_space_v_oovo,2) * size(cc_space_v_oovo,3), & - t1%f , size(t1%f,1), & - 0d0, X_oovv%f, size(X_oovv%f,1) * size(X_oovv%f,2) * size(X_oovv%f,3)) + call gpu_allocate(J1,nO,nV,nV,nO) + call compute_J1_chol(nO,nV,t1%f,t2%f,d_cc_space_v_ovvo%f,d_cc_space_v_ovoo%f, & + d_cc_space_v_vvoo%f,J1%f) - !$omp parallel & - !$omp shared(nO,nV,r2,X_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - X_oovv%f(u,v,beta,gam) - X_oovv%f(v,u,gam,beta) - enddo - enddo + call gpu_allocate(K1,nO,nV,nO,nV) + call compute_K1_chol(nO,nV,t1%f,t2%f,d_cc_space_v_ovoo%f,d_cc_space_v_vvoo%f, & + d_cc_space_v_ovov%f,K1%f) + + + call gpu_allocate(X_ovvo,nO,nV,nV,nO) + call gpu_allocate(Y_voov,nV,nO,nO,nV) + + do a=1,nV + call gpu_stream_create(stream(a)) + enddo + + do i=1, nO + do a=1, nV + call gpu_set_stream(blas_handle, stream(a)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,a,1,i), & + nO*nV, -0.5d0, K1%f(1,a,i,1), nO*nV*nO, X_ovvo%f(1,1,a,i), nO) enddo enddo - !$omp end do - !$omp end parallel - call gpu_deallocate(X_oovv) - double precision, allocatable :: X_vovo(:,:,:,:), Y_oovo(:,:,:,:) - allocate(X_vovo(nV,nO,nV,nO)) - - !$omp parallel & - !$omp shared(nO,nV,X_vovo,cc_space_v_ovvo) & - !$omp private(a,v,gam,i) & - !$omp default(none) - do i = 1, nO - !$omp do - do gam = 1, nV - do v = 1, nO - do a = 1, nV - X_vovo(a,v,gam,i) = cc_space_v_ovvo(v,a,gam,i) - enddo - enddo - enddo - !$omp end do nowait - enddo - !$omp end parallel - - allocate(Y_oovo(nO,nO,nV,nO)) - call dgemm('N','N',nO,nO*nV*nO,nV, & - 1d0, t1%f, size(t1%f,1), & - X_vovo, size(X_vovo,1), & - 0d0, Y_oovo, size(Y_oovo,1)) - - deallocate(X_vovo) - call gpu_allocate(X_oovv,nO,nO,nV,nV) - call dgemm('N','N',nO*nO*nV, nV, nO, & - 1d0, Y_oovo, size(Y_oovo,1) * size(Y_oovo,2) * size(Y_oovo,3), & - t1%f , size(t1%f,1), & - 0d0, X_oovv%f, size(X_oovv%f,1) * size(X_oovv%f,2) * size(X_oovv%f,3)) - deallocate(Y_oovo) - - !$omp parallel & - !$omp shared(nO,nV,r2,X_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - X_oovv%f(u,v,gam,beta) - X_oovv%f(v,u,beta,gam) - enddo - enddo + do gam=1, nV + call gpu_set_stream(blas_handle, stream(gam)) + do v=1, nO + call gpu_dgeam(blas_handle, 'T', 'T', nV, nO, 2.d0, t2%f(1,v,1,gam), & + nO*nO, -1.d0, t2%f(1,v,gam,1), nO*nO*nV, Y_voov%f(1,1,v,gam), nV) enddo enddo - !$omp end do - !$omp end parallel - call gpu_deallocate(X_oovv) + call gpu_allocate(Z_ovov,nO,nV,nO,nV) - double precision, allocatable :: J1(:,:,:,:) - allocate(J1(nO,nV,nV,nO)) - call compute_J1_chol(nO,nV,t1%f,t2%f,cc_space_v_ovvo,cc_space_v_ovoo, & - cc_space_v_vvoo,J1) - - double precision, allocatable :: K1(:,:,:,:) - allocate(K1(nO,nV,nO,nV)) - call compute_K1_chol(nO,nV,t1%f,t2%f,cc_space_v_ovoo,cc_space_v_vvoo, & - cc_space_v_ovov,K1) - - allocate(X_ovvo(nO,nV,nV,nO)) - !$omp parallel & - !$omp private(u,v,gam,beta,i,a) & - !$omp default(shared) - do i = 1, nO - !$omp do - do a = 1, nV - do beta = 1, nV - do u = 1, nO - X_ovvo(u,beta,a,i) = (J1(u,a,beta,i) - 0.5d0 * K1(u,a,i,beta)) - enddo - enddo - enddo - !$omp end do nowait + do a=1,nV + call gpu_stream_destroy(stream(a)) enddo - !$omp end parallel - deallocate(J1) - double precision, allocatable :: Y_voov(:,:,:,:) - allocate(Y_voov(nV,nO,nO,nV)) + call gpu_deallocate(J1) + call gpu_set_stream(blas_handle, gpu_default_stream) - !$omp parallel & - !$omp private(u,v,gam,beta,i,a) & - !$omp default(shared) - !$omp do - do gam = 1, nV - do v = 1, nO - do i = 1, nO - do a = 1, nV - Y_voov(a,i,v,gam) = 2d0 * t2%f(i,v,a,gam) - t2%f(i,v,gam,a) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - double precision, allocatable :: Z_ovov(:,:,:,:) - allocate(Z_ovov(nO,nV,nO,nV)) + call gpu_dgemm(blas_handle, 'N','N', nO*nV,nO*nV,nV*nO, & + 1d0, X_ovvo%f(1,1,1,1), size(X_ovvo%f,1) * size(X_ovvo%f,2), & + Y_voov%f(1,1,1,1), size(Y_voov%f,1) * size(Y_voov%f,2), & + 0d0, Z_ovov%f(1,1,1,1), size(Z_ovov%f,1) * size(Z_ovov%f,2)) - call dgemm('N','N', nO*nV,nO*nV,nV*nO, & - 1d0, X_ovvo, size(X_ovvo,1) * size(X_ovvo,2), & - Y_voov, size(Y_voov,1) * size(Y_voov,2), & - 0d0, Z_ovov, size(Z_ovov,1) * size(Z_ovov,2)) + call gpu_synchronize() + call gpu_deallocate(Y_voov) + call gpu_deallocate(X_ovvo) - deallocate(X_ovvo,Y_voov) - - !$omp parallel & - !$omp shared(nO,nV,r2,Z_ovov) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) + Z_ovov(u,beta,v,gam) + Z_ovov(v,gam,u,beta) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - deallocate(Z_ovov) - - double precision, allocatable :: Y_ovov(:,:,:,:), X_ovov(:,:,:,:) - allocate(X_ovov(nO,nV,nO,nV)) - allocate(Y_ovov(nO,nV,nO,nV)) + type(gpu_double4) :: Y_ovov, X_ovov + call gpu_allocate(X_ovov,nO,nV,nO,nV) + call gpu_allocate(Y_ovov,nO,nV,nO,nV) +!TODO !$omp parallel & !$omp shared(nO,nV,K1,X_ovov,Y_ovov,t2) & !$omp private(u,a,i,beta,gam) & @@ -834,7 +779,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & do u = 1, nO do a = 1, nV do i = 1, nO - X_ovov(i,a,u,beta) = 0.5d0 * K1(u,a,i,beta) + X_ovov%f(i,a,u,beta) = 0.5d0 * K1%f(u,a,i,beta) enddo enddo enddo @@ -846,7 +791,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & do v = 1, nO do a = 1, nV do i = 1, nO - Y_ovov(i,a,v,gam) = t2%f(i,v,gam,a) + Y_ovov%f(i,a,v,gam) = t2%f(i,v,gam,a) enddo enddo enddo @@ -854,12 +799,12 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & !$omp end do !$omp end parallel - allocate(Z_ovov(nO,nV,nO,nV)) - call dgemm('T','N',nO*nV,nO*nV,nO*nV, & - 1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), & - Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), & - 0d0, Z_ovov, size(Y_ovov,1) * size(Y_ovov,2)) - deallocate(X_ovov, Y_ovov) + call gpu_dgemm(blas_handle, 'T','N',nO*nV,nO*nV,nO*nV, & + -1d0, X_ovov%f(1,1,1,1), size(X_ovov%f,1) * size(X_ovov%f,2), & + Y_ovov%f(1,1,1,1), size(Y_ovov%f,1) * size(Y_ovov%f,2), & + 1d0, Z_ovov%f(1,1,1,1), size(Z_ovov%f,1) * size(Z_ovov%f,2)) + + call gpu_synchronize() !$omp parallel & !$omp shared(nO,nV,r2,Z_ovov) & @@ -870,16 +815,14 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & do beta = 1, nV do v = 1, nO do u = 1, nO - r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - Z_ovov(u,beta,v,gam) - Z_ovov(v,gam,u,beta) + r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) + Z_ovov%f(u,beta,v,gam) + Z_ovov%f(v,gam,u,beta) enddo enddo enddo enddo !$omp end do !$omp end parallel - deallocate(Z_ovov) - allocate(X_ovov(nO,nV,nO,nV),Y_ovov(nO,nV,nO,nV)) !$omp parallel & !$omp shared(nO,nV,K1,X_ovov,Y_ovov,t2) & !$omp private(u,v,gam,beta,i,a) & @@ -889,7 +832,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & do i = 1, nO do gam = 1, nV do u = 1, nO - X_ovov(u,gam,i,a) = K1(u,a,i,gam) + X_ovov%f(u,gam,i,a) = K1%f(u,a,i,gam) enddo enddo enddo @@ -901,7 +844,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & do v = 1, nO do a = 1, nV do i = 1, nO - Y_ovov(i,a,v,beta) = t2%f(i,v,beta,a) + Y_ovov%f(i,a,v,beta) = t2%f(i,v,beta,a) enddo enddo enddo @@ -909,16 +852,19 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & !$omp end do !$omp end parallel - deallocate(K1) + call gpu_deallocate(K1) - allocate(Z_ovov(nO,nV,nO,nV)) - call dgemm('N','N',nO*nV,nO*nV,nO*nV, & - 1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), & - Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), & - 0d0, Z_ovov, size(Y_ovov,1) * size(Y_ovov,2)) + call gpu_dgemm(blas_handle, 'N','N',nO*nV,nO*nV,nO*nV, & + 1d0, X_ovov%f(1,1,1,1), size(X_ovov%f,1) * size(X_ovov%f,2), & + Y_ovov%f(1,1,1,1), size(Y_ovov%f,1) * size(Y_ovov%f,2), & + 0d0, Z_ovov%f(1,1,1,1), size(Z_ovov%f,1) * size(Z_ovov%f,2)) - deallocate(X_ovov,Y_ovov) + call gpu_synchronize() + call gpu_deallocate(X_ovov) + call gpu_deallocate(Y_ovov) + + ! Change the sign for consistency with the code in spin orbitals !$omp parallel & !$omp shared(nO,nV,r2,Z_ovov) & !$omp private(u,v,gam,beta) & @@ -928,7 +874,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & do beta = 1, nV do v = 1, nO do u = 1, nO - r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - Z_ovov(u,gam,v,beta) - Z_ovov(v,beta,u,gam) + r2%f(u,v,beta,gam) = -r2%f(u,v,beta,gam) + Z_ovov%f(u,gam,v,beta) + Z_ovov%f(v,beta,u,gam) enddo enddo enddo @@ -936,9 +882,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & !$omp end do !$omp end parallel - deallocate(Z_ovov) - - ! Change the sign for consistency with the code in spin orbitals + call gpu_deallocate(Z_ovov) max_r2 = 0d0 !$omp parallel & @@ -951,7 +895,6 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & do a = 1, nV do j = 1, nO do i = 1, nO - r2%f(i,j,a,b) = -r2%f(i,j,a,b) max_r2_local = max(r2%f(i,j,a,b), max_r2_local) enddo enddo From f09e91cb2296b3c1fb5cf854dc06372c879c4104 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Mon, 8 Jul 2024 12:44:32 +0200 Subject: [PATCH 22/38] Working on CCSD GPU --- src/ccsd/ccsd_space_orb_sub.irp.f | 1 + src/ccsd/ccsd_space_orb_sub_chol.irp.f | 874 +++++++++++-------------- 2 files changed, 392 insertions(+), 483 deletions(-) diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index e97c2325..d8131a9c 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -215,6 +215,7 @@ subroutine run_ccsd_space_orb call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, d_cc_space_v_oovo, d_cc_space_v_ovvo, d_cc_space_v_ovoo, & d_cc_space_v_ovov, d_cc_space_v_vvoo, d_cc_space_v_oo_chol, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, & + d_cc_space_f_vo, & r2, max_r2) else call compute_H_oo(nO,nV,t1%f,t2%f,tau%f,H_oo%f) diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index abb9909b..a185df13 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -394,13 +394,14 @@ end subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, d_cc_space_v_oovo, d_cc_space_v_ovvo, d_cc_space_v_ovoo, & d_cc_space_v_ovov, d_cc_space_v_vvoo, d_cc_space_v_oo_chol, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, & + d_cc_space_f_vo, & r2,max_r2) use gpu implicit none ! in integer, intent(in) :: nO, nV - type(gpu_double2), intent(in) :: t1, H_oo, H_vv + type(gpu_double2), intent(in) :: t1, H_oo, H_vv, d_cc_space_f_vo type(gpu_double4), intent(in) :: t2, tau, d_cc_space_v_oovv type(gpu_double4), intent(in) :: d_cc_space_v_vooo, d_cc_space_v_oooo type(gpu_double4), intent(in) :: d_cc_space_v_vvoo, d_cc_space_v_oovo @@ -504,7 +505,8 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & type(gpu_double2) :: g_occ, g_vir call gpu_allocate(g_vir,nV,nV) call gpu_allocate(g_occ,nO,nO) - call compute_g_vir_chol(nO,nV,t1%f,t2%f,H_vv%f,g_vir%f) + call compute_g_vir_chol(nO,nV,t1,t2,H_vv,d_cc_space_f_vo, & + d_cc_space_v_ov_chol, d_cc_space_v_vv_chol, g_vir) call compute_g_occ_chol(nO,nV,t1%f,t2%f,H_oo%f,g_occ%f) type(gpu_double4) :: Y_oovv @@ -525,6 +527,10 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & t1%f(1,1) , size(t1%f,1), & 1d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1) * size(Y_oovv%f,2) * size(Y_oovv%f,3)) + + call gpu_dgeam(blas_handle, 'N', 'N', nO*nO, nV*nV, 1.d0, Y_oovv%f(1,1,1,1), & + nO*nO, 1.d0, r2%f(1,1,1,1), nO*nO, r2%f(1,1,1,1), nO*nO) + call gpu_synchronize() call gpu_deallocate(X_oovv) @@ -534,32 +540,13 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & type(gpu_double4) :: X_vovo, Y_oovo call gpu_allocate(X_vovo,nV,nO,nV,nO) -! !$omp parallel & -! !$omp shared(nO,nV,r2,Y_oovv) & -! !$omp private(u,v,gam,beta) & -! !$omp default(none) -! !$omp do -! do gam = 1, nV -! do beta = 1, nV -! do v = 1, nO -! do u = 1, nO -! r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) + Y_oovv%f(u,v,beta,gam) + Y_oovv%f(v,u,gam,beta) -! enddo -! enddo -! enddo -! enddo -! !$omp end do -! !$omp end parallel - do a=1,nV call gpu_stream_create(stream(a)) enddo do gam = 1, nV + call gpu_set_stream(blas_handle, stream(gam)) do beta = 1, nV - call gpu_set_stream(blas_handle, stream(beta)) - call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, 1.d0, Y_oovv%f(1,1,beta,gam), & - nO, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO) call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, 1.d0, r2%f(1,1,beta,gam), & nO, 1.d0, Y_oovv%f(1,1,gam,beta), nO, r2%f(1,1,beta,gam), nO) enddo @@ -579,34 +566,33 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & call gpu_set_stream(blas_handle, gpu_default_stream) - type(gpu_double4) :: X_vovv - call gpu_allocate(X_vovv,nV,nO,nV,block_size) call gpu_allocate(Y_oovo,nO,nO,nV,nO) + !$OMP PARALLEL PRIVATE(blas, iblock, gam, X_vovv) + call gpu_blas_create(blas) + type(gpu_double4) :: X_vovv + call gpu_allocate(X_vovv,nV,nO,nV,block_size) + !$OMP DO do iblock = 1, nV, block_size do gam = iblock, min(nV, iblock+block_size-1) - call gpu_stream_create(stream(gam)) - call gpu_set_stream(blas_handle, stream(gam)) - call gpu_dgemm(blas_handle, 'T','N',nV, nO*nV, cholesky_mo_num, 1.d0, & + call gpu_dgemm(blas, 'T','N',nV, nO*nV, cholesky_mo_num, 1.d0, & d_cc_space_v_vv_chol%f(1,1,gam), cholesky_mo_num, d_cc_space_v_ov_chol%f(1,1,1), & cholesky_mo_num, 0.d0, X_vovv%f(1,1,1,gam-iblock+1), nV) enddo - do gam = iblock, min(nV, iblock+block_size-1) - call gpu_stream_destroy(stream(gam)) - enddo - call gpu_synchronize() - - call gpu_set_stream(blas_handle, gpu_default_stream) - call gpu_dgemm(blas_handle, 'N','N', nO, & + call gpu_dgemm(blas, 'N','N', nO, & nO*nV*min(block_size, nV-iblock+1),nV, & 1.d0, t1%f(1,1) , size(t1%f,1), & X_vovv%f(1,1,1,1), size(X_vovv%f,1), & 0d0, Y_oovv%f(1,1,1,iblock), size(Y_oovv%f,1)) - enddo + !$OMP END DO + + call gpu_blas_destroy(blas) + call gpu_deallocate(X_vovv) + !$OMP END PARALLEL call gpu_dgemm(blas_handle, 'N','N',nO,nO*nV*nO,nV, & 1d0, t1%f(1,1), size(t1%f,1), & @@ -619,47 +605,27 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & 1d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1) * size(Y_oovv%f,2) * size(Y_oovv%f,3)) call gpu_synchronize() - call gpu_deallocate(X_vovv) call gpu_deallocate(X_vovo) call gpu_deallocate(Y_oovo) -! !$omp parallel & -! !$omp shared(nO,nV,r2,Y_oovv) & -! !$omp private(u,v,gam,beta) & -! !$omp default(none) -! !$omp do -! do gam = 1, nV -! do beta = 1, nV -! do v = 1, nO -! do u = 1, nO -! r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) + Y_oovv%f(u,v,gam,beta) + Y_oovv%f(v,u,beta,gam) -! enddo -! enddo -! enddo -! enddo -! !$omp end do -! !$omp end parallel - do a=1,nV call gpu_stream_create(stream(a)) enddo do gam = 1, nV + call gpu_set_stream(blas_handle, stream(gam)) do beta = 1, nV - call gpu_set_stream(blas_handle, stream(beta)) call gpu_dgeam(blas_handle, 'T', 'N', nO, nO, 1.d0, Y_oovv%f(1,1,beta,gam), & nO, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO) - call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, 1.d0, r2%f(1,1,beta,gam), & - nO, 1.d0, Y_oovv%f(1,1,gam,beta), nO, r2%f(1,1,beta,gam), nO) + enddo + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, r2%f(1,j,1,gam), & + nO*nO, 1.d0, Y_oovv%f(1,j,gam,1), nO*nO*nV, r2%f(1,j,1,gam), nO*nO) enddo enddo call gpu_set_stream(blas_handle, gpu_default_stream) - do a=1,nV - call gpu_stream_destroy(stream(a)) - enddo - call gpu_synchronize() call gpu_deallocate(Y_oovv) @@ -684,15 +650,14 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & call gpu_synchronize() - do a=1,nV - call gpu_stream_create(stream(a)) - enddo do gam = 1, nV + call gpu_set_stream(blas_handle, stream(gam)) + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, -1.d0, X_ovvo%f(1,1,gam,j), & + nO, 1.d0, r2%f(1,j,1,gam), nO*nO, r2%f(1,j,1,gam), nO*nO) + enddo do beta = 1, nV - call gpu_set_stream(blas_handle, stream(beta)) - call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, -1.d0, X_ovvo%f(1,beta,gam,1), & - nO*nV*nV, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO) call gpu_dgeam(blas_handle, 'T', 'N', nO, nO, -1.d0, X_ovvo%f(1,gam,beta,1), & nO*nV*nV, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO) enddo @@ -700,58 +665,41 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & call gpu_set_stream(blas_handle, gpu_default_stream) - do a=1,nV - call gpu_stream_destroy(stream(a)) - enddo - - call gpu_synchronize() + call gpu_synchronize call gpu_deallocate(tcc) call gpu_deallocate(tcc2) call gpu_deallocate(X_ovvo) - !----- type(gpu_double4) :: J1, K1 type(gpu_double4) :: Y_voov, Z_ovov + call gpu_allocate(J1,nO,nV,nV,nO) - call compute_J1_chol(nO,nV,t1%f,t2%f,d_cc_space_v_ovvo%f,d_cc_space_v_ovoo%f, & - d_cc_space_v_vvoo%f,J1%f) + call compute_J1_chol(nO,nV,t1,t2,d_cc_space_v_ovvo,d_cc_space_v_ovoo, & + d_cc_space_v_vvoo,d_cc_space_v_vo_chol,d_cc_space_v_vv_chol,J1) call gpu_allocate(K1,nO,nV,nO,nV) - call compute_K1_chol(nO,nV,t1%f,t2%f,d_cc_space_v_ovoo%f,d_cc_space_v_vvoo%f, & - d_cc_space_v_ovov%f,K1%f) + call compute_K1_chol(nO,nV,t1,t2,d_cc_space_v_ovoo,d_cc_space_v_vvoo, & + d_cc_space_v_ovov,d_cc_space_v_ov_chol,d_cc_space_v_vv_chol,K1) call gpu_allocate(X_ovvo,nO,nV,nV,nO) call gpu_allocate(Y_voov,nV,nO,nO,nV) - do a=1,nV - call gpu_stream_create(stream(a)) - enddo - - do i=1, nO - do a=1, nV - call gpu_set_stream(blas_handle, stream(a)) + do a=1, nV + call gpu_set_stream(blas_handle, stream(a)) + do i=1, nO call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,a,1,i), & nO*nV, -0.5d0, K1%f(1,a,i,1), nO*nV*nO, X_ovvo%f(1,1,a,i), nO) - enddo - enddo - - do gam=1, nV - call gpu_set_stream(blas_handle, stream(gam)) - do v=1, nO - call gpu_dgeam(blas_handle, 'T', 'T', nV, nO, 2.d0, t2%f(1,v,1,gam), & - nO*nO, -1.d0, t2%f(1,v,gam,1), nO*nO*nV, Y_voov%f(1,1,v,gam), nV) + call gpu_dgeam(blas_handle, 'T', 'T', nV, nO, 2.d0, t2%f(1,i,1,a), & + nO*nO, -1.d0, t2%f(1,i,a,1), nO*nO*nV, Y_voov%f(1,1,i,a), nV) enddo enddo call gpu_allocate(Z_ovov,nO,nV,nO,nV) - do a=1,nV - call gpu_stream_destroy(stream(a)) - enddo - + call gpu_synchronize() call gpu_deallocate(J1) call gpu_set_stream(blas_handle, gpu_default_stream) @@ -769,35 +717,20 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & call gpu_allocate(X_ovov,nO,nV,nO,nV) call gpu_allocate(Y_ovov,nO,nV,nO,nV) -!TODO - !$omp parallel & - !$omp shared(nO,nV,K1,X_ovov,Y_ovov,t2) & - !$omp private(u,a,i,beta,gam) & - !$omp default(none) - !$omp do - do beta = 1, nV - do u = 1, nO - do a = 1, nV - do i = 1, nO - X_ovov%f(i,a,u,beta) = 0.5d0 * K1%f(u,a,i,beta) - enddo - enddo + do a=1, nV + call gpu_set_stream(blas_handle, stream(a)) + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, t2%f(1,j,1,a), & + nO*nO, 0.d0, t2%f(1,j,1,a), nO*nO, Y_ovov%f(1,a,j,1), nO*nV*nO) + enddo + do beta=1, nV + call gpu_dgeam(blas_handle, 'T', 'T', nO, nO, 0.5d0, K1%f(1,a,1,beta), & + nO*nV, 0.d0, K1%f(1,a,1,beta), nO*nV, X_ovov%f(1,a,1,beta), nO*nV) enddo enddo - !$omp end do nowait + call gpu_set_stream(blas_handle, gpu_default_stream) - !$omp do - do gam = 1, nV - do v = 1, nO - do a = 1, nV - do i = 1, nO - Y_ovov%f(i,a,v,gam) = t2%f(i,v,gam,a) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel + call gpu_synchronize() call gpu_dgemm(blas_handle, 'T','N',nO*nV,nO*nV,nO*nV, & -1d0, X_ovov%f(1,1,1,1), size(X_ovov%f,1) * size(X_ovov%f,2), & @@ -806,51 +739,23 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & call gpu_synchronize() - !$omp parallel & - !$omp shared(nO,nV,r2,Z_ovov) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) + Z_ovov%f(u,beta,v,gam) + Z_ovov%f(v,gam,u,beta) - enddo - enddo + do gam=1, nV + call gpu_set_stream(blas_handle, stream(gam)) + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, r2%f(1,j,1,gam), & + nO*nO, 1.d0, Z_ovov%f(1,1,j,gam), nO, r2%f(1,j,1,gam), nO*nO) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, K1%f(1,1,j,gam), & + nO, 0.d0, K1%f(1,1,j,gam), nO, X_ovov%f(1,gam,j,1), nO*nV*nO) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, 1.d0, t2%f(1,j,1,gam), & + nO*nO, 0.d0, t2%f(1,j,1,gam), nO*nO, Y_ovov%f(1,gam,j,1), nO*nV*nO) + enddo + do beta=1, nV + call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, 1.d0, r2%f(1,1,beta,gam), & + nO, 1.d0, Z_ovov%f(1,gam,1,beta), nO*nV, r2%f(1,1,beta,gam), nO) enddo enddo - !$omp end do - !$omp end parallel - !$omp parallel & - !$omp shared(nO,nV,K1,X_ovov,Y_ovov,t2) & - !$omp private(u,v,gam,beta,i,a) & - !$omp default(none) - !$omp do - do a = 1, nV - do i = 1, nO - do gam = 1, nV - do u = 1, nO - X_ovov%f(u,gam,i,a) = K1%f(u,a,i,gam) - enddo - enddo - enddo - enddo - !$omp end do nowait - - !$omp do - do beta = 1, nV - do v = 1, nO - do a = 1, nV - do i = 1, nO - Y_ovov%f(i,a,v,beta) = t2%f(i,v,beta,a) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel + call gpu_set_stream(blas_handle, gpu_default_stream) call gpu_deallocate(K1) @@ -865,22 +770,17 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & call gpu_deallocate(Y_ovov) ! Change the sign for consistency with the code in spin orbitals - !$omp parallel & - !$omp shared(nO,nV,r2,Z_ovov) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do do gam = 1, nV + call gpu_set_stream(blas_handle, stream(gam)) + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, r2%f(1,j,1,gam), & + nO*nO, -1.d0, Z_ovov%f(1,gam,j,1), nO*nV*nO, r2%f(1,j,1,gam), nO*nO) + enddo do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2%f(u,v,beta,gam) = -r2%f(u,v,beta,gam) + Z_ovov%f(u,gam,v,beta) + Z_ovov%f(v,beta,u,gam) - enddo - enddo + call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, -1.d0, r2%f(1,1,beta,gam), & + nO, 1.d0, Z_ovov%f(1,beta,1,gam), nO*nV, r2%f(1,1,beta,gam), nO) enddo enddo - !$omp end do - !$omp end parallel call gpu_deallocate(Z_ovov) @@ -929,34 +829,42 @@ subroutine compute_A1_chol(nO,nV,t1,t2,tau,d_cc_space_v_vooo, & ! A1(u,v,i,j) = cc_space_v_oooo(u,v,i,j) ! A1(u,v,i,j) += cc_space_v_ovoo(u,a,i,j) * t1(v,a) & - call dgemm('N','N', nO, nO*nO*nO, nV, & - 1d0, t1%f , size(t1%f,1), & - d_cc_space_v_vooo%f, size(d_cc_space_v_vooo%f,1), & - 0d0, Y_oooo%f, size(Y_oooo%f,1)) + call gpu_dgemm(blas_handle, 'N','N', nO, nO*nO*nO, nV, & + 1d0, t1%f(1,1) , size(t1%f,1), & + d_cc_space_v_vooo%f(1,1,1,1), size(d_cc_space_v_vooo%f,1), & + 0d0, Y_oooo%f(1,1,1,1), size(Y_oooo%f,1)) - !$omp parallel & - !$omp private(u,v,i,j) & - !$omp default(shared) - !$omp do collapse(2) - do j = 1, nO - do i = 1, nO - do v = 1, nO - do u = 1, nO - A1%f(u,v,i,j) = d_cc_space_v_oooo%f(u,v,i,j) + Y_oooo%f(v,u,j,i) + Y_oooo%f(u,v,i,j) - enddo - enddo - enddo + type(gpu_stream) :: stream(nO) + + do i=1, nO + call gpu_stream_create(stream(i)) + enddo + + call gpu_synchronize() + + do j = 1, nO + call gpu_set_stream(blas_handle, stream(j)) + do i = 1, nO + call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, 1.d0, d_cc_space_v_oooo%f(1,1,i,j), & + nO, 1.d0, Y_oooo%f(1,1,j,i), nO, A1%f(1,1,i,j), nO) + enddo + call gpu_dgeam(blas_handle, 'N', 'N', nO, nO*nO, 1.d0, A1%f(1,1,1,j), & + nO, 1.d0, Y_oooo%f(1,1,1,j), nO, A1%f(1,1,1,j), nO) + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + do i=1, nO + call gpu_stream_destroy(stream(i)) enddo - !$omp end do - !$omp end parallel call gpu_deallocate(Y_oooo) ! A1(u,v,i,j) += cc_space_v_vvoo(a,b,i,j) * tau(u,v,a,b) - call dgemm('N','N', nO*nO, nO*nO, nV*nV, & - 1d0, tau%f , size(tau%f,1) * size(tau%f,2), & - d_cc_space_v_vvoo%f, size(d_cc_space_v_vvoo%f,1) * size(d_cc_space_v_vvoo%f,2), & - 1d0, A1%f , size(A1%f,1) * size(A1%f,2)) + call gpu_dgemm(blas_handle, 'N','N', nO*nO, nO*nO, nV*nV, & + 1d0, tau%f(1,1,1,1), size(tau%f,1) * size(tau%f,2), & + d_cc_space_v_vvoo%f(1,1,1,1), size(d_cc_space_v_vvoo%f,1) * size(d_cc_space_v_vvoo%f,2), & + 1d0, A1%f(1,1,1,1), size(A1%f,1) * size(A1%f,2)) + call gpu_synchronize() end @@ -998,364 +906,364 @@ end ! g_vir -subroutine compute_g_vir_chol(nO,nV,t1,t2,H_vv,g_vir) +subroutine compute_g_vir_chol(nO,nV,t1,t2,H_vv,d_cc_space_f_vo, & + d_cc_space_v_ov_chol, d_cc_space_v_vv_chol, g_vir) use gpu implicit none integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV), H_vv(nV, nV) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(out) :: g_vir(nV, nV) + type(gpu_double2), intent(in) :: t1, H_vv, d_cc_space_f_vo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol + type(gpu_double4), intent(in) :: t2 + type(gpu_double2), intent(out) :: g_vir integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam - call dgemm('N','N',nV,nV,nO, & - -1d0, cc_space_f_vo , size(cc_space_f_vo,1), & - t1 , size(t1,1), & - 0d0, g_vir, size(g_vir,1)) + type(gpu_stream) :: stream(max(nO,4)) - double precision, allocatable :: tmp_k(:), tmp_vo(:,:,:), tmp_vo2(:,:,:) - allocate(tmp_k(cholesky_mo_num)) - call dgemm('N','N', cholesky_mo_num, 1, nO*nV, 1.d0, & - cc_space_v_ov_chol, cholesky_mo_num, t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num) - - call dgemm('T','N', nV*nV, 1, cholesky_mo_num, 2.d0, & - cc_space_v_vv_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, & - g_vir, nV*nV) - deallocate(tmp_k) - - allocate(tmp_vo(cholesky_mo_num,nV,nO)) - call dgemm('N','T',cholesky_mo_num*nV, nO, nV, 1.d0, & - cc_space_v_vv_chol, cholesky_mo_num*nV, t1, nO, 0.d0, tmp_vo, cholesky_mo_num*nV) - - allocate(tmp_vo2(cholesky_mo_num,nO,nV)) - do beta=1,nV - do i=1,nO - do k=1,cholesky_mo_num - tmp_vo2(k,i,beta) = -tmp_vo(k,beta,i) - enddo - enddo - enddo - deallocate(tmp_vo) - - do beta = 1, nV - do a = 1, nV - g_vir(a,beta) = g_vir(a,beta) + H_vv(a,beta) - enddo + do i=1,max(nO,4) + call gpu_stream_create(stream(i)) enddo - call dgemm('T','N', nV, nV, nO*cholesky_mo_num, 1.d0, & - cc_space_v_ov_chol, cholesky_mo_num*nO, & - tmp_vo2, cholesky_mo_num*nO, 1.d0, g_vir, nV) + call gpu_set_stream(blas_handle, stream(1)) + call gpu_dgemm(blas_handle, 'N','N',nV,nV,nO, & + -1d0, d_cc_space_f_vo%f(1,1) , size(d_cc_space_f_vo%f,1), & + t1%f(1,1) , size(t1%f,1), & + 0d0, g_vir%f(1,1), size(g_vir%f,1)) + + type(gpu_double1) :: tmp_k + type(gpu_double3) :: tmp_vo, tmp_vo2 + + call gpu_allocate(tmp_k,cholesky_mo_num) + + call gpu_set_stream(blas_handle, stream(2)) + call gpu_dgemm(blas_handle, 'N','N', cholesky_mo_num, 1, nO*nV, 1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, t1%f(1,1), nO*nV, 0.d0, tmp_k%f(1), cholesky_mo_num) + + call gpu_dgemm(blas_handle, 'T','N', nV*nV, 1, cholesky_mo_num, 2.d0, & + d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num, tmp_k%f(1), cholesky_mo_num, 1.d0, & + g_vir%f(1,1), nV*nV) + + call gpu_set_stream(blas_handle, stream(3)) + call gpu_allocate(tmp_vo,cholesky_mo_num,nV,nO) + + call gpu_dgemm(blas_handle, 'N','T',cholesky_mo_num*nV, nO, nV, 1.d0, & + d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num*nV, t1%f(1,1), nO, 0.d0, tmp_vo%f(1,1,1), cholesky_mo_num*nV) + + call gpu_allocate(tmp_vo2,cholesky_mo_num,nO,nV) + + call gpu_synchronize() + call gpu_deallocate(tmp_k) + + do i=1,nO + call gpu_set_stream(blas_handle, stream(i)) + call gpu_dgeam(blas_handle, 'N', 'N', cholesky_mo_num, nV, -1.d0, tmp_vo%f(1,1,i), & + cholesky_mo_num, 0.d0, tmp_vo%f(1,1,i), cholesky_mo_num, tmp_vo2%f(1,i,1), cholesky_mo_num*nO) + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + + do i=1,max(nO,4) + call gpu_stream_destroy(stream(i)) + enddo + call gpu_deallocate(tmp_vo) + + call gpu_dgeam(blas_handle, 'N', 'N', nV, nV, 1.d0, g_vir%f(1,1), & + nV, 1.d0, H_vv%f(1,1), nV, g_vir%f(1,1), nV) + + call gpu_dgemm(blas_handle, 'T','N', nV, nV, nO*cholesky_mo_num, 1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, & + tmp_vo2%f(1,1,1), cholesky_mo_num*nO, 1.d0, g_vir%f(1,1), nV) + + call gpu_synchronize() + call gpu_deallocate(tmp_vo2) end ! J1 -subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvoo,J1) +subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvoo,d_cc_space_v_vo_chol,d_cc_space_v_vv_chol,J1) use gpu implicit none - integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(in) :: v_ovvo(nO,nV,nV,nO), v_ovoo(nO,nV,nO,nO) - double precision, intent(in) :: v_vvoo(nV,nV,nO,nO) - double precision, intent(out) :: J1(nO, nV, nV, nO) + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: t1 + type(gpu_double4), intent(in) :: t2, v_ovvo, v_ovoo, v_vvoo + type(gpu_double4), intent(out) :: J1 + type(gpu_double3), intent(out) :: d_cc_space_v_vo_chol,d_cc_space_v_vv_chol integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam - double precision, allocatable :: X_ovoo(:,:,:,:), Y_ovov(:,:,:,:) - allocate(X_ovoo(nO,nV,nO,nO),Y_ovov(nO,nV,nO,nV)) + type(gpu_double4) :: X_ovoo, Y_ovov - !$omp parallel & - !$omp shared(nO,nV,J1,v_ovvo,v_ovoo,X_ovoo) & - !$omp private(i,j,a,u,beta) & - !$omp default(none) - do i = 1, nO - !$omp do - do beta = 1, nV - do a = 1, nV - do u = 1, nO - J1(u,a,beta,i) = v_ovvo(u,a,beta,i) - enddo - enddo - enddo - !$omp end do nowait - enddo + call gpu_allocate(X_ovoo,nO,nV,nO,nO) - !$omp do collapse(2) - do j = 1, nO - do i = 1, nO - do a = 1, nV - do u = 1, nO - X_ovoo(u,a,i,j) = v_ovoo(u,a,j,i) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel + type(gpu_stream) :: stream(nV) - call dgemm('N','N',nO*nV*nO,nV,nO, & - -1d0, X_ovoo, size(X_ovoo,1) * size(X_ovoo,2) * size(X_ovoo,3), & - t1 , size(t1,1), & - 0d0, Y_ovov, size(Y_ovov,1) * size(Y_ovov,2) * size(Y_ovov,3)) - - !$omp parallel & - !$omp shared(nO,nV,J1,Y_ovov) & - !$omp private(i,beta,a,u) & - !$omp default(none) - do i = 1, nO - !$omp do - do beta = 1, nV - do a = 1, nV - do u = 1, nO - J1(u,a,beta,i) = J1(u,a,beta,i) + Y_ovov(u,a,i,beta) - enddo - enddo - enddo - !$omp end do nowait - enddo - !$omp end parallel - deallocate(X_ovoo) - - double precision, allocatable :: tmp_cc(:,:,:), J1_tmp(:,:,:,:) - allocate(tmp_cc(cholesky_mo_num,nV,nO), J1_tmp(nV,nO,nV,nO)) - - call dgemm('N','T', cholesky_mo_num*nV, nO, nV, 1.d0, & - cc_space_v_vv_chol, cholesky_mo_num*nV, & - t1, nO, & - 0.d0, tmp_cc, cholesky_mo_num*nV) - - call dgemm('T','N', nV*nO, nV*nO, cholesky_mo_num, 1.d0, & - tmp_cc, cholesky_mo_num, cc_space_v_vo_chol, cholesky_mo_num, & - 0.d0, J1_tmp, nV*nO) - - deallocate(tmp_cc) do i=1,nO - do b=1,nV - do a=1,nV - do u=1,nO - J1(u,a,b,i) = J1(u,a,b,i) + J1_tmp(b,u,a,i) - enddo - enddo - enddo - enddo - - deallocate(J1_tmp) - - !- cc_space_v_vvoo(a,b,i,j) * (0.5d0 * t2(u,j,b,beta) + t1(u,b) * t1(j,beta)) & - double precision, allocatable :: X_voov(:,:,:,:), Z_ovvo(:,:,:,:) - allocate(X_voov(nV,nO,nO,nV), Z_ovvo(nO,nV,nV,nO)) - !$omp parallel & - !$omp shared(nO,nV,t2,t1,Y_ovov,X_voov,v_vvoo) & - !$omp private(i,beta,a,u,b,j) & - !$omp default(none) - !$omp do - do b = 1, nV - do j = 1, nO - do beta = 1, nV - do u = 1, nO - Y_ovov(u,beta,j,b) = 0.5d0 * t2(u,j,b,beta) + t1(u,b) * t1(j,beta) - enddo - enddo - enddo - enddo - !$omp end do nowait - - !$omp do - do b = 1, nV - do j = 1, nO - do i = 1, nO - do a = 1, nV - X_voov(a,i,j,b) = v_vvoo(a,b,i,j) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - call dgemm('N','T',nO*nV,nV*nO,nO*nV, & - -1d0, Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), & - X_voov, size(X_voov,1) * size(X_voov,2), & - 0d0, Z_ovvo, size(Z_ovvo,1) * size(Z_ovvo,2)) - deallocate(X_voov) - - double precision, allocatable :: X_ovvo(:,:,:,:), Y_vovo(:,:,:,:) - allocate(X_ovvo(nO,nV,nV,nO),Y_vovo(nV,nO,nV,nO)) - !$omp parallel & - !$omp shared(nO,nV,J1,Z_ovvo,t2,Y_vovo,v_vvoo,X_ovvo) & - !$omp private(i,beta,a,u,j,b) & - !$omp default(none) - do i = 1, nO - !$omp do - do beta = 1, nV - do a = 1, nV - do u = 1, nO - J1(u,a,beta,i) = J1(u,a,beta,i) + Z_ovvo(u,beta,a,i) - enddo - enddo - enddo - !$omp end do nowait - enddo - - !+ 0.5d0 * (2d0 * cc_space_v_vvoo(a,b,i,j) - cc_space_v_vvoo(b,a,i,j)) * t2(u,j,beta,b) - do j = 1, nO - !$omp do - do b = 1, nV - do i = 1, nO - do a = 1, nV - Y_vovo(a,i,b,j) = 0.5d0 * (2d0 * v_vvoo(a,b,i,j) - v_vvoo(b,a,i,j)) - enddo - enddo - enddo - !$omp end do nowait + call gpu_stream_create(stream(i)) enddo do j = 1, nO - !$omp do - do b = 1, nV - do beta = 1, nV - do u = 1, nO - X_ovvo(u,beta,b,j) = t2(u,j,beta,b) - enddo - enddo + call gpu_set_stream(blas_handle, stream(j)) + do i = 1, nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, v_ovoo%f(1,1,j,i), & + nO, 0.d0, X_ovoo%f(1,1,i,j), nO, X_ovoo%f(1,1,i,j), nO) enddo - !$omp end do nowait enddo - !$omp end parallel - call dgemm('N','T',nO*nV,nV*nO,nV*nO, & - 1d0, X_ovvo, size(X_ovvo,1) * size(X_ovvo,2), & - Y_vovo, size(Y_vovo,1) * size(Y_vovo,2), & - 0d0, Z_ovvo, size(Z_ovvo,1) * size(Z_ovvo,2)) + call gpu_set_stream(blas_handle, gpu_default_stream) + + do i=1,nO + call gpu_stream_destroy(stream(i)) + enddo + + call gpu_allocate(Y_ovov,nO,nV,nO,nV) + + call gpu_dgemm(blas_handle, 'N','N',nO*nV*nO,nV,nO, & + -1d0, X_ovoo%f(1,1,1,1), size(X_ovoo%f,1) * size(X_ovoo%f,2) * size(X_ovoo%f,3), & + t1%f(1,1) , size(t1%f,1), & + 0d0, Y_ovov%f(1,1,1,1), size(Y_ovov%f,1) * size(Y_ovov%f,2) * size(Y_ovov%f,3)) + + + call gpu_copy(v_ovvo, J1) + + call gpu_synchronize() + + do a=1,nV + call gpu_stream_create(stream(a)) + enddo - !$omp parallel & - !$omp shared(nO,nV,J1,Z_ovvo) & - !$omp private(i,beta,a,u) & - !$omp default(none) do i = 1, nO - !$omp do do beta = 1, nV - do a = 1, nV - do u = 1, nO - J1(u,a,beta,i) = J1(u,a,beta,i) + Z_ovvo(u,beta,a,i) - enddo - enddo + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,1,beta,i), & + nO, 1.d0, Y_ovov%f(1,1,i,beta), nO, J1%f(1,1,beta,i), nO) enddo - !$omp end do nowait enddo - !$omp end parallel - deallocate(X_ovvo,Z_ovvo,Y_ovov) + call gpu_allocate(tmp_cc,cholesky_mo_num,nV,nO) + call gpu_allocate(J1_tmp,nV,nO,nV,nO) + + call gpu_set_stream(blas_handle, gpu_default_stream) + + type(gpu_double4) :: J1_tmp + type(gpu_double3) :: tmp_cc + + call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nV, nO, nV, 1.d0, & + d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num*nV, & + t1%f(1,1), nO, & + 0.d0, tmp_cc%f(1,1,1), cholesky_mo_num*nV) + + call gpu_dgemm(blas_handle, 'T','N', nV*nO, nV*nO, cholesky_mo_num, 1.d0, & + tmp_cc%f(1,1,1), cholesky_mo_num, d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num, & + 0.d0, J1_tmp%f(1,1,1,1), nV*nO) + + + call gpu_deallocate(X_ovoo) + + call gpu_synchronize() + call gpu_deallocate(tmp_cc) + + do i = 1, nO + do a = 1, nV + call gpu_set_stream(blas_handle, stream(a)) + call gpu_dgeam(blas_handle, 'N', 'T', nO, nV, 1.d0, J1%f(1,a,1,i), & + nO*nV, 1.d0, J1_tmp%f(1,1,a,i), nV, J1%f(1,a,1,i), nO*nV) + enddo + enddo + + type(gpu_double4) :: X_voov, Z_ovvo + + call gpu_allocate(X_voov,nV,nO,nO,nV) + call gpu_allocate(Z_ovvo,nO,nV,nV,nO) + + do j = 1, nO + do beta = 1, nV + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 0.5d0, t2%f(1,j,1,beta), & + nO*nO, t1%f(j,beta), t1%f(1,1), nO, Y_ovov%f(1,beta,j,1), nO*nV*nO) + enddo + enddo + + do b = 1, nV + call gpu_set_stream(blas_handle, stream(b)) + call gpu_dgeam(blas_handle, 'N', 'N', nV, nO*nO, 1.d0, v_vvoo%f(1,b,1,1), & + nV*nV, 0.d0, X_voov%f(1,1,1,b), nV, X_voov%f(1,1,1,b), nV) + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + + call gpu_synchronize() + call gpu_deallocate(J1_tmp) + + call gpu_dgemm(blas_handle, 'N','T',nO*nV,nV*nO,nO*nV, & + -1d0, Y_ovov%f(1,1,1,1), size(Y_ovov%f,1) * size(Y_ovov%f,2), & + X_voov%f(1,1,1,1), size(X_voov%f,1) * size(X_voov%f,2), & + 0d0, Z_ovvo%f(1,1,1,1), size(Z_ovvo%f,1) * size(Z_ovvo%f,2)) + + call gpu_synchronize() + + do i = 1, nO + do a = 1, nV + call gpu_set_stream(blas_handle, stream(a)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,a,1,i), & + nO*nV, 1.d0, Z_ovvo%f(1,1,a,i), nO, J1%f(1,a,1,i), nO*nV) + enddo + enddo + + type(gpu_double4) :: X_ovvo, Y_vovo + call gpu_allocate(Y_vovo,nV,nO,nV,nO) + + do j = 1, nO + do i = 1, nO + call gpu_set_stream(blas_handle, stream(i)) + call gpu_dgeam(blas_handle, 'N', 'T', nV, nV, 1.d0, v_vvoo%f(1,1,i,j), & + nV, -0.5d0, v_vvoo%f(1,1,i,j), nV, Y_vovo%f(1,i,1,j), nO*nV) + enddo + enddo + + call gpu_allocate(X_ovvo,nO,nV,nV,nO) + + do j = 1, nO + do b = 1, nV + call gpu_set_stream(blas_handle, stream(b)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, t2%f(1,j,1,b), & + nO*nO, 0.d0, t2%f(1,j,1,b), nO*nO, X_ovvo%f(1,1,b,j), nO) + enddo + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + call gpu_synchronize() + call gpu_deallocate(X_voov) + + call gpu_dgemm(blas_handle, 'N','T',nO*nV,nV*nO,nV*nO, & + 1d0, X_ovvo%f(1,1,1,1), size(X_ovvo%f,1) * size(X_ovvo%f,2), & + Y_vovo%f(1,1,1,1), size(Y_vovo%f,1) * size(Y_vovo%f,2), & + 0d0, Z_ovvo%f(1,1,1,1), size(Z_ovvo%f,1) * size(Z_ovvo%f,2)) + + call gpu_synchronize() + + do i = 1, nO + do beta = 1, nV + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,1,beta,i), & + nO, 1.d0, Z_ovvo%f(1,beta,1,i), nO*nV, J1%f(1,1,beta,i), nO) + enddo + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + call gpu_deallocate(Y_ovov) + call gpu_deallocate(X_ovvo) + + do a = 1, nV + call gpu_stream_destroy(stream(a)) + enddo + + call gpu_deallocate(Z_ovvo) end ! K1 -subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,K1) +subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov, & + d_cc_space_v_ov_chol,d_cc_space_v_vv_chol,K1) use gpu implicit none - integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(in) :: v_vvoo(nV,nV,nO,nO), v_ovov(nO,nV,nO,nV) - double precision, intent(in) :: v_ovoo(nO,nV,nO,nO) - double precision, intent(out) :: K1(nO, nV, nO, nV) + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: t1 + type(gpu_double4), intent(in) :: t2, v_vvoo, v_ovov, v_ovoo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol + type(gpu_double4), intent(out) :: K1 - double precision, allocatable :: X(:,:,:,:), Y(:,:,:,:), Z(:,:,:,:) + type(gpu_double4) :: X, Y, Z integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam - allocate(X(nV,nO,nV,nO),Y(nO,nV,nV,nO),Z(nO,nV,nV,nO)) - !$omp parallel & - !$omp shared(nO,nV,K1,X,Y,v_vvoo,v_ovov,t1,t2) & - !$omp private(i,beta,a,u,j,b) & - !$omp default(none) - !$omp do - do beta = 1, nV - do i = 1, nO - do a = 1, nV - do u = 1, nO - K1(u,a,i,beta) = v_ovov(u,a,i,beta) - enddo - enddo - enddo + call gpu_copy(v_ovov, K1) + + type(gpu_stream) :: stream(nV) + do a = 1, nV + call gpu_stream_create(stream(a)) enddo - !$omp end do nowait + + call gpu_allocate(X,nV,nO,nV,nO) do i = 1, nO - !$omp do do a = 1, nV - do j = 1, nO - do b = 1, nV - X(b,j,a,i) = - v_vvoo(b,a,i,j) - enddo - enddo + call gpu_set_stream(blas_handle, stream(a)) + call gpu_dgeam(blas_handle, 'N', 'N', nV, nO, -1.d0, v_vvoo%f(1,a,i,1), & + nV*nV*nO, 0.d0, v_vvoo%f(1,a,i,1), nV*nV*nO, X%f(1,1,a,i), nV) enddo - !$omp end do nowait enddo + call gpu_allocate(Y,nO,nV,nV,nO) + do j = 1, nO - !$omp do - do b = 1, nV - do beta = 1, nV - do u = 1, nO - Y(u,beta,b,j) = 0.5d0 * t2(u,j,b,beta) + t1(u,b) * t1(j,beta) - enddo - enddo + do beta = 1, nV + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 0.5d0, t2%f(1,j,1,beta), & + nO*nO, t1%f(j,beta), t1%f(1,1), nO, Y%f(1,beta,1,j), nO*nV) enddo - !$omp end do enddo - !$omp end parallel - call dgemm('N','N',nO*nV*nO,nV,nO, & - -1d0, v_ovoo, size(v_ovoo,1) * size(v_ovoo,2) * size(v_ovoo,3), & - t1 , size(t1,1), & - 1d0, K1 , size(K1,1) * size(K1,2) * size(K1,3)) + call gpu_set_stream(blas_handle, gpu_default_stream) - double precision, allocatable :: K1tmp(:,:,:,:), t1v(:,:,:) - allocate(K1tmp(nO,nO,nV,nV), t1v(cholesky_mo_num,nO,nO)) + call gpu_dgemm(blas_handle, 'N','N',nO*nV*nO,nV,nO, & + -1d0, v_ovoo%f(1,1,1,1), size(v_ovoo%f,1) * size(v_ovoo%f,2) * size(v_ovoo%f,3), & + t1%f(1,1) , size(t1%f,1), & + 1d0, K1%f(1,1,1,1) , size(K1%f,1) * size(K1%f,2) * size(K1%f,3)) - call dgemm('N','T', cholesky_mo_num*nO, nO, nV, 1.d0, & - cc_space_v_ov_chol, cholesky_mo_num*nO, t1, nO, 0.d0, & - t1v, cholesky_mo_num*nO) + type(gpu_double4) :: K1tmp + type(gpu_double3) :: t1v - call dgemm('T','N', nO*nO, nV*nV, cholesky_mo_num, 1.d0, & - t1v, cholesky_mo_num, cc_space_v_vv_chol, cholesky_mo_num, 0.d0, & - K1tmp, nO*nO) + call gpu_allocate(t1v,cholesky_mo_num,nO,nO) + + call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nO, nO, nV, 1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, t1%f(1,1), nO, 0.d0, & + t1v%f(1,1,1), cholesky_mo_num*nO) + + call gpu_allocate(K1tmp,nO,nO,nV,nV) + + call gpu_dgemm(blas_handle, 'T','N', nO*nO, nV*nV, cholesky_mo_num, 1.d0, & + t1v%f(1,1,1), cholesky_mo_num, d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num, 0.d0, & + K1tmp%f(1,1,1,1), nO*nO) + + call gpu_allocate(Z,nO,nV,nV,nO) + call gpu_synchronize() - deallocate(t1v) ! Y(u,beta,b,j) * X(b,j,a,i) = Z(u,beta,a,i) - call dgemm('N','N',nV*nO,nO*nV,nV*nO, & - 1d0, Y, size(Y,1) * size(Y,2), & - X, size(X,1) * size(X,2), & - 0d0, Z, size(Z,1) * size(Z,2)) + call gpu_dgemm(blas_handle, 'N','N',nV*nO,nO*nV,nV*nO, & + 1d0, Y%f(1,1,1,1), size(Y%f,1) * size(Y%f,2), & + X%f(1,1,1,1), size(X%f,1) * size(X%f,2), & + 0d0, Z%f(1,1,1,1), size(Z%f,1) * size(Z%f,2)) - !$omp parallel & - !$omp shared(nO,nV,K1,Z,K1tmp) & - !$omp private(i,beta,a,u) & - !$omp default(none) - !$omp do - do beta = 1, nV - do i = 1, nO - do a = 1, nV - do u = 1, nO - K1(u,a,i,beta) = K1(u,a,i,beta) + K1tmp(u,i,a,beta) + Z(u,beta,a,i) - enddo - enddo + call gpu_synchronize() + call gpu_deallocate(t1v) + + do i = 1, nO + do beta = 1, nV + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, K1%f(1,1,i,beta), & + nO, 1.d0, K1tmp%f(1,i,1,beta), nO*nO, K1%f(1,1,i,beta), nO) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, K1%f(1,1,i,beta), & + nO, 1.d0, Z%f(1,beta,1,i), nO*nV, K1%f(1,1,i,beta), nO) enddo enddo - !$omp end do - !$omp end parallel - deallocate(K1tmp,X,Y,Z) + call gpu_deallocate(X) + call gpu_deallocate(Y) + + do a = 1, nV + call gpu_stream_destroy(stream(a)) + enddo + + call gpu_deallocate(K1tmp) + call gpu_deallocate(Z) end From 9ad69bb27dc4195ae2ae2c9ea2f280156c20366e Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 9 Jul 2024 03:27:54 +0200 Subject: [PATCH 23/38] GPU accelerated CCSD --- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 75 +++++++++++++------------- 1 file changed, 37 insertions(+), 38 deletions(-) diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index a185df13..24fcc5af 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -507,7 +507,8 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & call gpu_allocate(g_occ,nO,nO) call compute_g_vir_chol(nO,nV,t1,t2,H_vv,d_cc_space_f_vo, & d_cc_space_v_ov_chol, d_cc_space_v_vv_chol, g_vir) - call compute_g_occ_chol(nO,nV,t1%f,t2%f,H_oo%f,g_occ%f) + call compute_g_occ_chol(nO,nV,t1,t2,H_oo, & + d_cc_space_f_vo, d_cc_space_v_ov_chol, d_cc_space_v_oo_chol, d_cc_space_v_ovoo, g_occ) type(gpu_double4) :: Y_oovv call gpu_allocate(Y_oovv,nO,nO,nV,nV) @@ -870,37 +871,42 @@ end ! g_occ -subroutine compute_g_occ_chol(nO,nV,t1,t2,H_oo,g_occ) +subroutine compute_g_occ_chol(nO,nV,t1,t2,H_oo, & + d_cc_space_f_vo, d_cc_space_v_ov_chol, d_cc_space_v_oo_chol, d_cc_space_v_ovoo, g_occ) use gpu implicit none integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV), H_oo(nO, nO) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(out) :: g_occ(nO, nO) + type(gpu_double2), intent(in) :: t1, H_oo, d_cc_space_f_vo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_oo_chol + type(gpu_double4), intent(in) :: t2, d_cc_space_v_ovoo + type(gpu_double2), intent(out) :: g_occ - g_occ = H_oo + call gpu_copy(H_oo, g_occ) - call dgemm('N','N',nO,nO,nV, & - 1d0, t1, size(t1,1), & - cc_space_f_vo, size(cc_space_f_vo,1), & - 1d0, g_occ, size(g_occ,1)) + call gpu_dgemm(blas_handle, 'N','N',nO,nO,nV, & + 1d0, t1%f(1,1), size(t1%f,1), & + d_cc_space_f_vo%f(1,1), size(d_cc_space_f_vo%f,1), & + 1d0, g_occ%f(1,1), size(g_occ%f,1)) - double precision, allocatable :: X(:) - allocate(X(cholesky_mo_num)) - call dgemv('N',cholesky_mo_num,nO*nV,2.d0, & - cc_space_v_ov_chol, cholesky_mo_num, & - t1, 1, 0.d0, X, 1) + type(gpu_double1) :: X + call gpu_allocate(X,cholesky_mo_num) - call dgemv('T',cholesky_mo_num,nO*nO,1.d0, & - cc_space_v_oo_chol, cholesky_mo_num, & - X, 1, 1.d0, g_occ, 1) - deallocate(X) + call gpu_dgemv(blas_handle, 'N',cholesky_mo_num,nO*nV,2.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, & + t1%f(1,1), 1, 0.d0, X%f(1), 1) - call dgemv('T',nO*nV,nO*nO,-1.d0, & - cc_space_v_ovoo, nO*nV, & - t1, 1, 1.d0, g_occ, 1) + call gpu_dgemv(blas_handle, 'T',cholesky_mo_num,nO*nO,1.d0, & + d_cc_space_v_oo_chol%f(1,1,1), cholesky_mo_num, & + X%f(1), 1, 1.d0, g_occ%f(1,1), 1) + + call gpu_dgemv(blas_handle, 'T',nO*nV,nO*nO,-1.d0, & + d_cc_space_v_ovoo%f(1,1,1,1), nO*nV, & + t1%f(1,1), 1, 1.d0, g_occ%f(1,1), 1) + + call gpu_synchronize() + call gpu_deallocate(X) end @@ -1193,22 +1199,15 @@ subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov, & enddo call gpu_allocate(X,nV,nO,nV,nO) - - do i = 1, nO - do a = 1, nV - call gpu_set_stream(blas_handle, stream(a)) - call gpu_dgeam(blas_handle, 'N', 'N', nV, nO, -1.d0, v_vvoo%f(1,a,i,1), & - nV*nV*nO, 0.d0, v_vvoo%f(1,a,i,1), nV*nV*nO, X%f(1,1,a,i), nV) - enddo - enddo - call gpu_allocate(Y,nO,nV,nV,nO) - do j = 1, nO - do beta = 1, nV - call gpu_set_stream(blas_handle, stream(beta)) - call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 0.5d0, t2%f(1,j,1,beta), & - nO*nO, t1%f(j,beta), t1%f(1,1), nO, Y%f(1,beta,1,j), nO*nV) + do a = 1, nV + call gpu_set_stream(blas_handle, stream(a)) + do i = 1, nO + call gpu_dgeam(blas_handle, 'N', 'N', nV, nO, -1.d0, v_vvoo%f(1,a,i,1), & + nV*nV*nO, 0.d0, v_vvoo%f(1,a,i,1), nV*nV*nO, X%f(1,1,a,i), nV) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 0.5d0, t2%f(1,i,1,a), & + nO*nO, t1%f(i,a), t1%f(1,1), nO, Y%f(1,a,1,i), nO*nV) enddo enddo @@ -1246,9 +1245,9 @@ subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov, & call gpu_synchronize() call gpu_deallocate(t1v) + do beta = 1, nV + call gpu_set_stream(blas_handle, stream(beta)) do i = 1, nO - do beta = 1, nV - call gpu_set_stream(blas_handle, stream(beta)) call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, K1%f(1,1,i,beta), & nO, 1.d0, K1tmp%f(1,i,1,beta), nO*nO, K1%f(1,1,i,beta), nO) call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, K1%f(1,1,i,beta), & From dd9c6dcc03e6d24a78ed5651e5c825e81d68a9c6 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 9 Jul 2024 21:11:13 +0200 Subject: [PATCH 24/38] Introducing dpcpp --- configure | 8 +- plugins/local/gpu_intel/LIB | 1 + plugins/local/gpu_intel/NEED | 1 + plugins/local/gpu_intel/README.rst | 8 + plugins/local/gpu_intel/gpu.sycl | 266 +++++++++++++++++++++++++++++ 5 files changed, 282 insertions(+), 2 deletions(-) create mode 100644 plugins/local/gpu_intel/LIB create mode 100644 plugins/local/gpu_intel/NEED create mode 100644 plugins/local/gpu_intel/README.rst create mode 100644 plugins/local/gpu_intel/gpu.sycl diff --git a/configure b/configure index 08dac444..3e3390e1 100755 --- a/configure +++ b/configure @@ -40,7 +40,7 @@ Usage: $(basename $0) -c $(basename $0) -h $(basename $0) -i - $(basename $0) -g [nvidia|none] + $(basename $0) -g [nvidia|intel|none] Options: -c Define a COMPILATION configuration file, @@ -49,7 +49,7 @@ Options: -i INSTALL . Use at your OWN RISK: no support will be provided for the installation of dependencies. - -g [nvidia|none] Choose GPU acceleration (experimental) + -g [nvidia|intel|none] Choose GPU acceleration Example: ./$(basename $0) -c config/gfortran.cfg @@ -121,6 +121,10 @@ case "$GPU" in echo "Activating AMD GPU acceleration" ln -s ${QP_ROOT}/plugins/local/gpu_amd ${QP_ROOT}/src/gpu_arch ;; + intel) # Intel + echo "Activating Intel GPU acceleration" + ln -s ${QP_ROOT}/plugins/local/gpu_intel ${QP_ROOT}/src/gpu_arch + ;; nvidia) # Nvidia echo "Activating Nvidia GPU acceleration" ln -s ${QP_ROOT}/plugins/local/gpu_nvidia ${QP_ROOT}/src/gpu_arch diff --git a/plugins/local/gpu_intel/LIB b/plugins/local/gpu_intel/LIB new file mode 100644 index 00000000..027c35b0 --- /dev/null +++ b/plugins/local/gpu_intel/LIB @@ -0,0 +1 @@ +-lmkl_sycl -lsycl diff --git a/plugins/local/gpu_intel/NEED b/plugins/local/gpu_intel/NEED new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/plugins/local/gpu_intel/NEED @@ -0,0 +1 @@ + diff --git a/plugins/local/gpu_intel/README.rst b/plugins/local/gpu_intel/README.rst new file mode 100644 index 00000000..3a4653de --- /dev/null +++ b/plugins/local/gpu_intel/README.rst @@ -0,0 +1,8 @@ +========= +gpu_intel +========= + +Intel implementation of GPU routines. Uses MKL and SYCL. +```bash +dpcpp -O3 -c gpu.o gpu.sycl +``` diff --git a/plugins/local/gpu_intel/gpu.sycl b/plugins/local/gpu_intel/gpu.sycl new file mode 100644 index 00000000..7b589490 --- /dev/null +++ b/plugins/local/gpu_intel/gpu.sycl @@ -0,0 +1,266 @@ +#include +#include +#include +#include + +extern "C" { + +/* Generic functions */ + +int gpu_ndevices() { + return 1; +} + +void gpu_set_device(int32_t igpu) { +} + + +/* Allocation functions */ + +void gpu_allocate(void** ptr, int64_t size) { + auto queue = sycl::queue(sycl::default_selector{}); + + try { + *ptr = sycl::malloc_shared(size, queue); + assert(*ptr != nullptr); + } catch (const sycl::exception& e) { + std::cerr << "SYCL exception caught: " << e.what() << std::endl; + *ptr = nullptr; // If allocation fails, set pointer to nullptr + } +} + +void gpu_deallocate(void** ptr) { + assert(*ptr != nullptr); + sycl::free(*ptr, sycl::queue(sycl::default_selector{})); + *ptr = nullptr; +} + +/* Upload data from host to device */ +void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) { + sycl::queue queue(sycl::default_selector{}); + queue.memcpy(gpu_ptr, cpu_ptr, n).wait(); +} + +/* Download data from device to host */ +void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) { + sycl::queue queue(sycl::default_selector{}); + queue.memcpy(cpu_ptr, gpu_ptr, n).wait(); +} + +/* Copy data from one GPU memory location to another */ +void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) { + sycl::queue queue(sycl::default_selector{}); + queue.memcpy(gpu_ptr_dest, gpu_ptr_src, n).wait(); +} + +/* Queues */ + +/* SYCL queue as a replacement for CUDA stream */ +void gpu_stream_create(sycl::queue** ptr) { + *ptr = new sycl::queue(sycl::default_selector{}); +} + +void gpu_stream_destroy(sycl::queue** ptr) { + assert(*ptr != nullptr); + delete *ptr; + *ptr = nullptr; +} + +To translate the CUDA functions related to stream management to SYCL, you will need to adapt to SYCL's approach to command groups and queues. SYCL uses queues to manage execution order and parallelism, similar to CUDA streams but integrated within the SYCL ecosystem. + +### Original CUDA Code + +```c +/* Create a CUDA stream */ +void gpu_stream_create(cudaStream_t* ptr) { + cudaError_t rc = cudaStreamCreate(ptr); + assert(rc == cudaSuccess); +} + +/* Destroy a CUDA stream */ +void gpu_stream_destroy(cudaStream_t* ptr) { + assert(ptr != NULL); + cudaError_t rc = cudaStreamDestroy(*ptr); + assert(rc == cudaSuccess); + *ptr = NULL; +} + +/* Set a specific stream for cuBLAS operations */ +void gpu_set_stream(cublasHandle_t handle, cudaStream_t stream) { + cublasSetStream(handle, stream); +} + +/* Synchronize all streams */ +void gpu_synchronize() { + cudaDeviceSynchronize(); +} +``` + +### Translated SYCL Code + +```cpp +#include +#include + +/* SYCL queue as a replacement for CUDA stream */ +void gpu_stream_create(sycl::queue** ptr) { + *ptr = new sycl::queue(sycl::default_selector{}); +} + +void gpu_stream_destroy(sycl::queue** ptr) { + *ptr->wait_and_throw(); + assert(*ptr != nullptr); + delete *ptr; + *ptr = nullptr; +} + +/* SYCL does not need an equivalent for setting a stream on a cuBLAS handle, + because each SYCL queue acts independently and can be used directly. */ + +void gpu_synchronize() { + sycl::queue queue(sycl::default_selector{}); + queue.wait_and_throw(); +} + +/* BLAS functions */ + +typedef struct { + sycl::queue* queue; +} blasHandle_t; + +void gpu_set_stream(blasHandle_t* handle, sycl::queue* ptr) { + handle->queue = ptr; +} + +void gpu_blas_create(blasHandle_t* ptr) { + *ptr = new blasHandle_t; + assert(*ptr != nullptr); + ptr->queue = new sycl::queue(sycl::default_selector{}); + assert(ptr->queue != nullptr); +} + +void gpu_blas_destroy(blasHandle_t* ptr) { + assert(*ptr != nullptr); + delete ptr->queue; + delete *ptr; + *ptr = nullptr; +} + + +void gpu_ddot(blasHandle_t* handle, const int64_t n, const double* x, const int64_t incx, + const double* y, const int64_t incy, double* result) { + // Ensure input parameters are valid + assert(handle != nullptr); + assert(handle->queue != nullptr); + assert(n > 0); + assert(incx > 0); + assert(incy > 0); + assert(x != nullptr); + assert(y != nullptr); + assert(result != nullptr); + + // SYCL buffer for the result + sycl::buffer result_buf(result, sycl::range<1>(1)); + + sycl::queue& queue = handle->queue; + + // Perform the dot product operation + queue.submit([&](sycl::handler& cgh) { + // Accessors for the buffers + auto result_acc = result_buf.get_access(cgh); + + // This is an asynchronous call to compute dot product + cgh.single_task([=]() { + result_acc[0] = oneapi::mkl::blas::dot(cgh, n, x, incx, y, incy); + }); + }); + +} + +void gpu_dgemv(blasHandle_t* handle, const char* transa, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) { + + assert(handle != nullptr); + assert(handle->queue != nullptr); + + // Validate matrix dimensions and increments to be positive + assert(m > 0 && n > 0 && lda > 0 && incx > 0 && incy > 0); + assert(a != nullptr && x != nullptr && y != nullptr && alpha != nullptr && beta != nullptr); + + // Determine the operation type + oneapi::mkl::transpose transa_ = oneapi::mkl::transpose::nontrans; + if (*transa == 'T' || *transa == 't') { + transa_ = oneapi::mkl::transpose::trans; + } + + // Perform DGEMV operation using oneMKL + handle->queue->submit([&](sycl::handler& cgh) { + // Use accessors to ensure data consistency and dependency resolution + auto a_acc = sycl::accessor(a, sycl::range(m * lda), sycl::read_only, cgh); + auto x_acc = sycl::accessor(x, sycl::range(n * incx), sycl::read_only, cgh); + auto y_acc = sycl::accessor(y, sycl::range(m * incy), sycl::read_write, cgh); + + cgh.parallel_for(sycl::range(1), [=](sycl::id<1>) { + oneapi::mkl::blas::gemv(*handle->queue, transa_, m, n, *alpha, a_acc.get_pointer(), lda, x_acc.get_pointer(), incx, *beta, y_acc.get_pointer(), incy); + }); + }); + +} + +void gpu_dgemm(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha, + const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) { + + assert(handle != nullptr && handle->queue != nullptr); + assert(m > 0 && n > 0 && k > 0 && lda > 0 && ldb > 0 && ldc > 0); + assert(a != nullptr && b != nullptr && c != nullptr && alpha != nullptr && beta != nullptr); + + // Transpose operations + auto transa_ = (*transa == 'T' || *transa == 't') ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans; + auto transb_ = (*transb == 'T' || *transb == 't') ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans; + + // Ensure queue is ready + handle->queue->submit([&](sycl::handler& cgh) { + // Accessors for matrices + auto a_acc = sycl::accessor(a, sycl::range<1>(m * lda), sycl::read_only, cgh); + auto b_acc = sycl::accessor(b, sycl::range<1>(k * ldb), sycl::read_only, cgh); + auto c_acc = sycl::accessor(c, sycl::range<1>(m * ldc), sycl::read_write, cgh); + + cgh.parallel_for(sycl::range(1), [=](sycl::id<1>) { + oneapi::mkl::blas::gemm(*handle->queue, transa_, transb_, m, n, k, + *alpha, a_acc.get_pointer(), lda, + b_acc.get_pointer(), ldb, + *beta, c_acc.get_pointer(), ldc); + }); + }); + +} + +void gpu_dgeam(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { + assert(handle != nullptr && handle->queue != nullptr); + assert(m > 0 && n > 0 && lda > 0 && ldb > 0 && ldc > 0); + assert(a != nullptr && b != nullptr && c != nullptr && alpha != nullptr && beta != nullptr); + + // Determine transpose operations + bool transA = (*transa == 'T' || *transa == 't'); + bool transB = (*transb == 'T' || *transb == 't'); + + handle->queue->submit([&](sycl::handler& cgh) { + auto a_acc = sycl::accessor(a, sycl::range(m * lda), sycl::read_only, cgh); + auto b_acc = sycl::accessor(b, sycl::range(n * ldb), sycl::read_only, cgh); + auto c_acc = sycl::accessor(c, sycl::range(m * ldc), sycl::read_write, cgh); + + cgh.parallel_for(sycl::range<2>(m, n), [=](sycl::id<2> idx) { + int i = idx[0]; + int j = idx[1]; + int ai = transA ? j * lda + i : i * lda + j; + int bi = transB ? j * ldb + i : i * ldb + j; + int ci = i * ldc + j; + + c_acc[ci] = (*alpha) * a_acc[ai] + (*beta) * b_acc[bi]; + }); + }); + +} + +} // extern C From 44b8e22e7aebf4dd89874549eac8bb8aef2fb16d Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 9 Jul 2024 22:02:13 +0200 Subject: [PATCH 25/38] Fixed sycl --- plugins/local/gpu_intel/README.rst | 2 +- plugins/local/gpu_intel/gpu.sycl | 139 ++++++----------------------- 2 files changed, 26 insertions(+), 115 deletions(-) diff --git a/plugins/local/gpu_intel/README.rst b/plugins/local/gpu_intel/README.rst index 3a4653de..d42e2557 100644 --- a/plugins/local/gpu_intel/README.rst +++ b/plugins/local/gpu_intel/README.rst @@ -4,5 +4,5 @@ gpu_intel Intel implementation of GPU routines. Uses MKL and SYCL. ```bash -dpcpp -O3 -c gpu.o gpu.sycl +icpx -fsycl gpu.cxx -c -qmkl=sequential ``` diff --git a/plugins/local/gpu_intel/gpu.sycl b/plugins/local/gpu_intel/gpu.sycl index 7b589490..1f9f89ce 100644 --- a/plugins/local/gpu_intel/gpu.sycl +++ b/plugins/local/gpu_intel/gpu.sycl @@ -18,7 +18,7 @@ void gpu_set_device(int32_t igpu) { /* Allocation functions */ void gpu_allocate(void** ptr, int64_t size) { - auto queue = sycl::queue(sycl::default_selector{}); + auto queue = sycl::queue(sycl::default_selector_v); try { *ptr = sycl::malloc_shared(size, queue); @@ -31,25 +31,25 @@ void gpu_allocate(void** ptr, int64_t size) { void gpu_deallocate(void** ptr) { assert(*ptr != nullptr); - sycl::free(*ptr, sycl::queue(sycl::default_selector{})); + sycl::free(*ptr, sycl::queue(sycl::default_selector_v)); *ptr = nullptr; } /* Upload data from host to device */ void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) { - sycl::queue queue(sycl::default_selector{}); + sycl::queue queue(sycl::default_selector_v); queue.memcpy(gpu_ptr, cpu_ptr, n).wait(); } /* Download data from device to host */ void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) { - sycl::queue queue(sycl::default_selector{}); + sycl::queue queue(sycl::default_selector_v); queue.memcpy(cpu_ptr, gpu_ptr, n).wait(); } /* Copy data from one GPU memory location to another */ void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) { - sycl::queue queue(sycl::default_selector{}); + sycl::queue queue(sycl::default_selector_v); queue.memcpy(gpu_ptr_dest, gpu_ptr_src, n).wait(); } @@ -57,7 +57,7 @@ void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) { /* SYCL queue as a replacement for CUDA stream */ void gpu_stream_create(sycl::queue** ptr) { - *ptr = new sycl::queue(sycl::default_selector{}); + *ptr = new sycl::queue(sycl::default_selector_v); } void gpu_stream_destroy(sycl::queue** ptr) { @@ -66,59 +66,8 @@ void gpu_stream_destroy(sycl::queue** ptr) { *ptr = nullptr; } -To translate the CUDA functions related to stream management to SYCL, you will need to adapt to SYCL's approach to command groups and queues. SYCL uses queues to manage execution order and parallelism, similar to CUDA streams but integrated within the SYCL ecosystem. - -### Original CUDA Code - -```c -/* Create a CUDA stream */ -void gpu_stream_create(cudaStream_t* ptr) { - cudaError_t rc = cudaStreamCreate(ptr); - assert(rc == cudaSuccess); -} - -/* Destroy a CUDA stream */ -void gpu_stream_destroy(cudaStream_t* ptr) { - assert(ptr != NULL); - cudaError_t rc = cudaStreamDestroy(*ptr); - assert(rc == cudaSuccess); - *ptr = NULL; -} - -/* Set a specific stream for cuBLAS operations */ -void gpu_set_stream(cublasHandle_t handle, cudaStream_t stream) { - cublasSetStream(handle, stream); -} - -/* Synchronize all streams */ void gpu_synchronize() { - cudaDeviceSynchronize(); -} -``` - -### Translated SYCL Code - -```cpp -#include -#include - -/* SYCL queue as a replacement for CUDA stream */ -void gpu_stream_create(sycl::queue** ptr) { - *ptr = new sycl::queue(sycl::default_selector{}); -} - -void gpu_stream_destroy(sycl::queue** ptr) { - *ptr->wait_and_throw(); - assert(*ptr != nullptr); - delete *ptr; - *ptr = nullptr; -} - -/* SYCL does not need an equivalent for setting a stream on a cuBLAS handle, - because each SYCL queue acts independently and can be used directly. */ - -void gpu_synchronize() { - sycl::queue queue(sycl::default_selector{}); + sycl::queue queue(sycl::default_selector_v); queue.wait_and_throw(); } @@ -132,17 +81,17 @@ void gpu_set_stream(blasHandle_t* handle, sycl::queue* ptr) { handle->queue = ptr; } -void gpu_blas_create(blasHandle_t* ptr) { - *ptr = new blasHandle_t; +void gpu_blas_create(blasHandle_t** ptr) { + *ptr = (blasHandle_t*) malloc(sizeof(blasHandle_t)); assert(*ptr != nullptr); - ptr->queue = new sycl::queue(sycl::default_selector{}); - assert(ptr->queue != nullptr); + (*ptr)->queue = new sycl::queue(sycl::default_selector_v); + assert((*ptr)->queue != nullptr); } -void gpu_blas_destroy(blasHandle_t* ptr) { +void gpu_blas_destroy(blasHandle_t** ptr) { assert(*ptr != nullptr); - delete ptr->queue; - delete *ptr; + delete (*ptr)->queue; + free(*ptr); *ptr = nullptr; } @@ -159,21 +108,7 @@ void gpu_ddot(blasHandle_t* handle, const int64_t n, const double* x, const int6 assert(y != nullptr); assert(result != nullptr); - // SYCL buffer for the result - sycl::buffer result_buf(result, sycl::range<1>(1)); - - sycl::queue& queue = handle->queue; - - // Perform the dot product operation - queue.submit([&](sycl::handler& cgh) { - // Accessors for the buffers - auto result_acc = result_buf.get_access(cgh); - - // This is an asynchronous call to compute dot product - cgh.single_task([=]() { - result_acc[0] = oneapi::mkl::blas::dot(cgh, n, x, incx, y, incy); - }); - }); + oneapi::mkl::blas::dot(*handle->queue, n, x, incx, y, incy, result); } @@ -194,16 +129,7 @@ void gpu_dgemv(blasHandle_t* handle, const char* transa, const int64_t m, const } // Perform DGEMV operation using oneMKL - handle->queue->submit([&](sycl::handler& cgh) { - // Use accessors to ensure data consistency and dependency resolution - auto a_acc = sycl::accessor(a, sycl::range(m * lda), sycl::read_only, cgh); - auto x_acc = sycl::accessor(x, sycl::range(n * incx), sycl::read_only, cgh); - auto y_acc = sycl::accessor(y, sycl::range(m * incy), sycl::read_write, cgh); - - cgh.parallel_for(sycl::range(1), [=](sycl::id<1>) { - oneapi::mkl::blas::gemv(*handle->queue, transa_, m, n, *alpha, a_acc.get_pointer(), lda, x_acc.get_pointer(), incx, *beta, y_acc.get_pointer(), incy); - }); - }); + oneapi::mkl::blas::column_major::gemv(*handle->queue, transa_, m, n, *alpha, a, lda, x, incx, *beta, y, incy); } @@ -218,23 +144,12 @@ void gpu_dgemm(blasHandle_t* handle, const char* transa, const char* transb, con auto transa_ = (*transa == 'T' || *transa == 't') ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans; auto transb_ = (*transb == 'T' || *transb == 't') ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans; - // Ensure queue is ready - handle->queue->submit([&](sycl::handler& cgh) { - // Accessors for matrices - auto a_acc = sycl::accessor(a, sycl::range<1>(m * lda), sycl::read_only, cgh); - auto b_acc = sycl::accessor(b, sycl::range<1>(k * ldb), sycl::read_only, cgh); - auto c_acc = sycl::accessor(c, sycl::range<1>(m * ldc), sycl::read_write, cgh); - - cgh.parallel_for(sycl::range(1), [=](sycl::id<1>) { - oneapi::mkl::blas::gemm(*handle->queue, transa_, transb_, m, n, k, - *alpha, a_acc.get_pointer(), lda, - b_acc.get_pointer(), ldb, - *beta, c_acc.get_pointer(), ldc); - }); - }); + oneapi::mkl::blas::column_major::gemm(*handle->queue, transa_, transb_, m, n, k, + *alpha, a, lda, b, ldb, *beta, c, ldc); } + void gpu_dgeam(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha, const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { assert(handle != nullptr && handle->queue != nullptr); @@ -246,18 +161,14 @@ void gpu_dgeam(blasHandle_t* handle, const char* transa, const char* transb, con bool transB = (*transb == 'T' || *transb == 't'); handle->queue->submit([&](sycl::handler& cgh) { - auto a_acc = sycl::accessor(a, sycl::range(m * lda), sycl::read_only, cgh); - auto b_acc = sycl::accessor(b, sycl::range(n * ldb), sycl::read_only, cgh); - auto c_acc = sycl::accessor(c, sycl::range(m * ldc), sycl::read_write, cgh); - cgh.parallel_for(sycl::range<2>(m, n), [=](sycl::id<2> idx) { - int i = idx[0]; - int j = idx[1]; - int ai = transA ? j * lda + i : i * lda + j; - int bi = transB ? j * ldb + i : i * ldb + j; - int ci = i * ldc + j; + const int i = idx[0]; + const int j = idx[1]; + const int ai = transA ? j * lda + i : i * lda + j; + const int bi = transB ? j * ldb + i : i * ldb + j; + const int ci = i * ldc + j; - c_acc[ci] = (*alpha) * a_acc[ai] + (*beta) * b_acc[bi]; + c[ci] = (*alpha) * a[ai] + (*beta) * b[bi]; }); }); From 6c275d54ef050ec8d210a35aa4bbb2c93d176f34 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 9 Jul 2024 22:14:19 +0200 Subject: [PATCH 26/38] Fix intent --- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index 24fcc5af..6f65ea79 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -996,8 +996,8 @@ subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvoo,d_cc_space_v_vo_chol integer, intent(in) :: nO,nV type(gpu_double2), intent(in) :: t1 type(gpu_double4), intent(in) :: t2, v_ovvo, v_ovoo, v_vvoo + type(gpu_double3), intent(in) :: d_cc_space_v_vo_chol,d_cc_space_v_vv_chol type(gpu_double4), intent(out) :: J1 - type(gpu_double3), intent(out) :: d_cc_space_v_vo_chol,d_cc_space_v_vv_chol integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam From f5cf674d7b4eb98637bde7eb07d1119cfeccc557 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 9 Jul 2024 23:04:22 +0200 Subject: [PATCH 27/38] Fix link stage for intel gpus --- configure | 4 ++-- plugins/local/gpu_intel/LIB | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/configure b/configure index 3e3390e1..43ca9f6d 100755 --- a/configure +++ b/configure @@ -117,12 +117,12 @@ done # Handle GPU acceleration rm -f ${QP_ROOT}/src/gpu_arch case "$GPU" in - amd) # Nvidia + amd) # AMD echo "Activating AMD GPU acceleration" ln -s ${QP_ROOT}/plugins/local/gpu_amd ${QP_ROOT}/src/gpu_arch ;; intel) # Intel - echo "Activating Intel GPU acceleration" + echo "Activating Intel GPU acceleration (EXPERIMENTAL)" ln -s ${QP_ROOT}/plugins/local/gpu_intel ${QP_ROOT}/src/gpu_arch ;; nvidia) # Nvidia diff --git a/plugins/local/gpu_intel/LIB b/plugins/local/gpu_intel/LIB index 027c35b0..199b0f1c 100644 --- a/plugins/local/gpu_intel/LIB +++ b/plugins/local/gpu_intel/LIB @@ -1 +1,2 @@ --lmkl_sycl -lsycl +-ltbb -lsycl -lmkl_sycl -lgpu -limf -lintlc -lstdc++ + From d219dc10267c0fe86fbae4683c00f5051229c8c0 Mon Sep 17 00:00:00 2001 From: eginer Date: Thu, 11 Jul 2024 13:57:28 +0200 Subject: [PATCH 28/38] beginning to put cholesky in CASSCF --- plugins/local/spher_harm/spher_harm.irp.f | 4 +- .../local/spher_harm/spher_harm_func.irp.f | 13 +++ src/casscf_cipsi/chol_bielec.irp.f | 93 ++++++++++++++++ src/casscf_cipsi/test_chol.irp.f | 23 ++++ src/mo_two_e_ints/cholesky.irp.f | 31 ++++++ src/mu_of_r/f_hf_cholesky.irp.f | 100 ++++++++++++++++++ src/mu_of_r/test_proj_op.irp.f | 20 +++- 7 files changed, 280 insertions(+), 4 deletions(-) create mode 100644 src/casscf_cipsi/chol_bielec.irp.f create mode 100644 src/casscf_cipsi/test_chol.irp.f diff --git a/plugins/local/spher_harm/spher_harm.irp.f b/plugins/local/spher_harm/spher_harm.irp.f index 7a2eea06..e8deafb9 100644 --- a/plugins/local/spher_harm/spher_harm.irp.f +++ b/plugins/local/spher_harm/spher_harm.irp.f @@ -1,7 +1,7 @@ program spher_harm implicit none -! call test_spher_harm + call test_spher_harm ! call test_cart - call test_brutal_spheric +! call test_brutal_spheric end diff --git a/plugins/local/spher_harm/spher_harm_func.irp.f b/plugins/local/spher_harm/spher_harm_func.irp.f index 825bd8ac..f12c8fb9 100644 --- a/plugins/local/spher_harm/spher_harm_func.irp.f +++ b/plugins/local/spher_harm/spher_harm_func.irp.f @@ -7,6 +7,7 @@ subroutine spher_harm_func_r3(r,l,m,re_ylm, im_ylm) double precision :: theta, phi,r_abs call cartesian_to_spherical(r,theta,phi,r_abs) call spher_harm_func(l,m,theta,phi,re_ylm, im_ylm) +! call spher_harm_func_expl(l,m,theta,phi,re_ylm, im_ylm) end @@ -131,6 +132,10 @@ subroutine spher_harm_func_expl(l,m,theta,phi,re_ylm, im_ylm) tmp = - inv_sq_pi * dsqrt(3.d0/8.d0) * dsin(theta) re_ylm = tmp * dcos(phi) im_ylm = tmp * dsin(phi) + else if (l==1.and.m==-1)then + tmp = - inv_sq_pi * dsqrt(3.d0/8.d0) * dsin(theta) + re_ylm = tmp * dcos(phi) + im_ylm = -tmp * dsin(phi) else if(l==1.and.m==0)then tmp = inv_sq_pi * dsqrt(3.d0/4.d0) * dcos(theta) re_ylm = tmp @@ -139,10 +144,18 @@ subroutine spher_harm_func_expl(l,m,theta,phi,re_ylm, im_ylm) tmp = 0.25d0 * inv_sq_pi * dsqrt(0.5d0*15.d0) * dsin(theta)*dsin(theta) re_ylm = tmp * dcos(2.d0*phi) im_ylm = tmp * dsin(2.d0*phi) + else if(l==2.and.m==-2)then + tmp = 0.25d0 * inv_sq_pi * dsqrt(0.5d0*15.d0) * dsin(theta)*dsin(theta) + re_ylm = tmp * dcos(2.d0*phi) + im_ylm =-tmp * dsin(2.d0*phi) else if(l==2.and.m==1)then tmp = - inv_sq_pi * dsqrt(15.d0/8.d0) * dsin(theta) * dcos(theta) re_ylm = tmp * dcos(phi) im_ylm = tmp * dsin(phi) + else if(l==2.and.m==-1)then + tmp = - inv_sq_pi * dsqrt(15.d0/8.d0) * dsin(theta) * dcos(theta) + re_ylm = tmp * dcos(phi) + im_ylm =-tmp * dsin(phi) else if(l==2.and.m==0)then tmp = dsqrt(5.d0/4.d0) * inv_sq_pi* (1.5d0*dcos(theta)*dcos(theta)-0.5d0) re_ylm = tmp diff --git a/src/casscf_cipsi/chol_bielec.irp.f b/src/casscf_cipsi/chol_bielec.irp.f new file mode 100644 index 00000000..1fe985ad --- /dev/null +++ b/src/casscf_cipsi/chol_bielec.irp.f @@ -0,0 +1,93 @@ + +BEGIN_PROVIDER [double precision, cholesky_no_1_idx_transp, (cholesky_mo_num, n_act_orb, mo_num)] + BEGIN_DOC + ! Cholesky vectors with ONE orbital on the active natural orbital basis + END_DOC + implicit none + integer :: i_chol,i_act,i_mo,jj_act + double precision, allocatable :: chol_tmp(:,:) + allocate(chol_tmp(cholesky_mo_num,n_act_orb)) + cholesky_no_1_idx_transp = 0.D0 + do i_mo = 1, mo_num + ! Get all the integrals corresponding to the "i_mo" + do i_act = 1, n_act_orb + jj_act = list_act(i_act) + do i_chol = 1, cholesky_mo_num + chol_tmp(i_chol, i_act) = cholesky_mo_transp(i_chol, jj_act, i_mo) + enddo + enddo +! ! Do the matrix product +! do i_act = 1, n_act_orb +! do jj_act = 1, n_act_orb +! do i_chol = 1, cholesky_mo_num +! cholesky_no_1_idx_transp(i_chol, i_act, i_mo) += chol_tmp(i_chol, jj_act) * natorbsCI(jj_act,i_act) +! enddo +! enddo +! enddo + call dgemm('N','N',cholesky_mo_num,n_act_orb,n_act_orb,1.d0, & + chol_tmp, size(chol_tmp,1), & + natorbsCI, size(natorbsCI,1), & + 0.d0, & + cholesky_no_1_idx_transp(1,1,i_mo), size(cholesky_no_1_idx_transp,1)) + enddo + +END_PROVIDER + + +BEGIN_PROVIDER [double precision, cholesky_no_2_idx_transp, (cholesky_mo_num, n_act_orb, n_act_orb)] + BEGIN_DOC + ! Cholesky vectors with TWO orbital on the active natural orbital basis + END_DOC + implicit none + integer :: i_chol,i_act,j_act,jj_act + double precision, allocatable :: chol_tmp(:,:) + allocate(chol_tmp(cholesky_mo_num,n_act_orb)) + cholesky_no_2_idx_transp = 0.D0 + do j_act = 1, n_act_orb + do i_act = 1, n_act_orb + do jj_act = 1, n_act_orb + do i_chol = 1, cholesky_mo_num + cholesky_no_2_idx_transp(i_chol, i_act, j_act) += cholesky_no_1_idx_transp(i_chol, i_act,jj_act) * natorbsCI(jj_act,i_act) + enddo + enddo + enddo + enddo + +END_PROVIDER + + +BEGIN_PROVIDER [double precision, cholesky_no_2_idx_transp_dgemm, (cholesky_mo_num, n_act_orb, n_act_orb)] + BEGIN_DOC + ! Cholesky vectors with TWO orbital on the active natural orbital basis + END_DOC + implicit none + integer :: i_chol,i_act,j_act,jj_act + double precision, allocatable :: chol_tmp(:,:) + allocate(chol_tmp(cholesky_mo_num,n_act_orb)) + cholesky_no_2_idx_transp_dgemm = 0.D0 + do j_act = 1, n_act_orb + ! Get all the integrals corresponding to the "j_act" + do i_act = 1, n_act_orb + jj_act = list_act(i_act) + do i_chol = 1, cholesky_mo_num + chol_tmp(i_chol, i_act) = cholesky_no_1_idx_transp(i_chol, j_act, jj_act) + enddo + enddo +! ! Do the matrix product +! do i_act = 1, n_act_orb +! do jj_act = 1, n_act_orb +! do i_chol = 1, cholesky_mo_num +! cholesky_no_1_idx_transp(i_chol, i_act, j_act) += chol_tmp(i_chol, jj_act) * natorbsCI(jj_act,i_act) +! enddo +! enddo +! enddo + call dgemm('N','N',cholesky_mo_num,n_act_orb,n_act_orb,1.d0, & + chol_tmp, size(chol_tmp,1), & + natorbsCI, size(natorbsCI,1), & + 0.d0, & + cholesky_no_2_idx_transp_dgemm(1,1,j_act), size(cholesky_no_2_idx_transp_dgemm,1)) + enddo + +END_PROVIDER + + diff --git a/src/casscf_cipsi/test_chol.irp.f b/src/casscf_cipsi/test_chol.irp.f new file mode 100644 index 00000000..b94851f9 --- /dev/null +++ b/src/casscf_cipsi/test_chol.irp.f @@ -0,0 +1,23 @@ +program test_chol + implicit none + read_wf= .True. + touch read_wf + call routine + +end + +subroutine routine + implicit none + integer :: i_chol, i_act, i_mo + double precision :: accu + accu = 0.d0 + do i_mo = 1, n_act_orb + do i_act = 1, n_act_orb + do i_chol = 1, cholesky_mo_num + accu += dabs(cholesky_no_2_idx_transp_dgemm(i_chol,i_act,i_mo) - cholesky_no_2_idx_transp(i_chol,i_act,i_mo)) + print*,cholesky_no_2_idx_transp_dgemm(i_chol,i_act,i_mo) , cholesky_no_2_idx_transp(i_chol,i_act,i_mo) + enddo + enddo + enddo + print*,'accu =', accu +end diff --git a/src/mo_two_e_ints/cholesky.irp.f b/src/mo_two_e_ints/cholesky.irp.f index 7e2c8b37..1fed949d 100644 --- a/src/mo_two_e_ints/cholesky.irp.f +++ b/src/mo_two_e_ints/cholesky.irp.f @@ -101,3 +101,34 @@ BEGIN_PROVIDER [ double precision, cholesky_mo_transp, (cholesky_mo_num, mo_num, END_PROVIDER + +BEGIN_PROVIDER [ double precision, cholesky_semi_mo_transp_simple, (cholesky_mo_num, ao_num, mo_num) ] + implicit none + BEGIN_DOC + ! Cholesky vectors in MO basis + END_DOC + + double precision, allocatable :: X(:,:,:) + double precision :: wall0, wall1 + integer :: ierr + print *, 'Semi AO->MO Transformation of Cholesky vectors' + call wall_time(wall0) + + allocate(X(mo_num,cholesky_mo_num,ao_num), stat=ierr) + if (ierr /= 0) then + print *, irp_here, ': Allocation failed' + endif + integer :: i_chol, i_mo, j_mo, i_ao + cholesky_semi_mo_transp_simple = 0.d0 + do i_mo = 1, mo_num + do i_ao = 1, ao_num + do j_mo = 1, mo_num + do i_chol = 1, cholesky_mo_num + cholesky_semi_mo_transp_simple(i_chol, i_ao,i_mo) += cholesky_mo_transp(i_chol,j_mo,i_mo) * mo_coef_transp(j_mo,i_ao) + enddo + enddo + enddo + enddo + +END_PROVIDER + diff --git a/src/mu_of_r/f_hf_cholesky.irp.f b/src/mu_of_r/f_hf_cholesky.irp.f index 5dd69eb6..179b80dd 100644 --- a/src/mu_of_r/f_hf_cholesky.irp.f +++ b/src/mu_of_r/f_hf_cholesky.irp.f @@ -289,6 +289,106 @@ BEGIN_PROVIDER [ double precision, f_hf_cholesky_sparse, (n_points_final_grid)] endif END_PROVIDER +BEGIN_PROVIDER [ double precision, f_hf_cholesky_sparse_bis, (n_points_final_grid)] + implicit none + integer :: ipoint,m,mm,i,ii,p + !!f(R) = \sum_{I} \sum_{J} Phi_I(R) Phi_J(R) V_IJ + !! = \sum_{I}\sum_{J}\sum_A Phi_I(R) Phi_J(R) V_AI V_AJ + !! = \sum_A \sum_{I}Phi_I(R)V_AI \sum_{J}V_AJ Phi_J(R) + !! = \sum_A V_AR G_AR + !! V_AR = \sum_{I}Phi_IR V_AI = \sum_{I}Phi^t_RI V_AI + double precision :: u_dot_v,wall0,wall1,accu_1, accu_2,mo_i_r1,mo_b_r1 + double precision :: thresh_1,thresh_2 + double precision, allocatable :: accu_vec(:),delta_vec(:) + thresh_2 = ao_cholesky_threshold * 100.d0 + thresh_1 = dsqrt(thresh_2) + provide cholesky_mo_transp + if(elec_alpha_num == elec_beta_num)then + call wall_time(wall0) + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP PRIVATE (accu_vec,ipoint,p,ii,i,mm,m,mo_i_r1,mo_b_r1) & + !$OMP ShARED (n_occ_val_orb_for_hf,list_valence_orb_for_hf,mos_in_r_array_omp,aos_in_r_array,thresh_1,thresh_2) & + !$OMP ShARED (cholesky_mo_num,f_hf_cholesky_sparse_bis,n_points_final_grid,cholesky_semi_mo_transp_simple,ao_num) + allocate(accu_vec(cholesky_mo_num)) + !$OMP DO + do ipoint = 1, n_points_final_grid + f_hf_cholesky_sparse_bis(ipoint) = 0.d0 + accu_vec = 0.d0 + do ii = 1, n_occ_val_orb_for_hf(1) + i = list_valence_orb_for_hf(ii,1) + mo_i_r1 = mos_in_r_array_omp(i,ipoint) + if(dabs(mo_i_r1).lt.thresh_1)cycle + do mm = 1, ao_num ! electron 1 + mo_b_r1 = aos_in_r_array(mm,ipoint)*mo_i_r1 + if(dabs(mo_b_r1).lt.thresh_2)cycle + do p = 1, cholesky_mo_num + accu_vec(p) = accu_vec(p) + mo_b_r1 * cholesky_semi_mo_transp_simple(p,mm,i) + enddo + enddo + enddo + do p = 1, cholesky_mo_num + f_hf_cholesky_sparse_bis(ipoint) = f_hf_cholesky_sparse_bis(ipoint) + accu_vec(p) * accu_vec(p) + enddo + f_hf_cholesky_sparse_bis(ipoint) *= 2.D0 + enddo + !$OMP END DO + deallocate(accu_vec) + !$OMP END PARALLEL + + call wall_time(wall1) + print*,'Time to provide f_hf_cholesky_sparse_bis = ',wall1-wall0 + else + call wall_time(wall0) + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP PRIVATE (accu_vec,delta_vec,ipoint,p,ii,i,mm,m,mo_i_r1,mo_b_r1) & + !$OMP ShARED (n_occ_val_orb_for_hf,list_valence_orb_for_hf,list_basis,mos_in_r_array_omp,thresh_1,thresh_2) & + !$OMP ShARED (cholesky_mo_num,f_hf_cholesky_sparse_bis,n_points_final_grid,cholesky_mo_transp,n_basis_orb) + allocate(accu_vec(cholesky_mo_num),delta_vec(cholesky_mo_num)) + !$OMP DO + do ipoint = 1, n_points_final_grid + f_hf_cholesky_sparse_bis(ipoint) = 0.d0 + accu_vec = 0.d0 + do ii = 1, n_occ_val_orb_for_hf(2) + i = list_valence_orb_for_hf(ii,2) + mo_i_r1 = mos_in_r_array_omp(i,ipoint) + if(dabs(mo_i_r1).lt.thresh_1)cycle + do mm = 1, n_basis_orb ! electron 1 + m = list_basis(mm) + mo_b_r1 = mos_in_r_array_omp(m,ipoint) + if(dabs(mo_i_r1*mo_b_r1).lt.thresh_2)cycle + do p = 1, cholesky_mo_num + accu_vec(p) = accu_vec(p) + mo_i_r1 * mo_b_r1 * cholesky_mo_transp(p,m,i) + enddo + enddo + enddo + delta_vec = 0.d0 + do ii = n_occ_val_orb_for_hf(2)+1,n_occ_val_orb_for_hf(1) + i = list_valence_orb_for_hf(ii,1) + mo_i_r1 = mos_in_r_array_omp(i,ipoint) + if(dabs(mo_i_r1).lt.thresh_1)cycle + do mm = 1, n_basis_orb ! electron 1 + m = list_basis(mm) + mo_b_r1 = mos_in_r_array_omp(m,ipoint) + if(dabs(mo_i_r1*mo_b_r1).lt.thresh_2)cycle + do p = 1, cholesky_mo_num + delta_vec(p) = delta_vec(p) + mo_i_r1 * mo_b_r1 * cholesky_mo_transp(p,m,i) + enddo + enddo + enddo + do p = 1, cholesky_mo_num + f_hf_cholesky_sparse_bis(ipoint) = f_hf_cholesky_sparse_bis(ipoint) + accu_vec(p) * accu_vec(p) + accu_vec(p) * delta_vec(p) + enddo + f_hf_cholesky_sparse_bis(ipoint) *= 2.D0 + enddo + !$OMP END DO + deallocate(accu_vec) + !$OMP END PARALLEL + call wall_time(wall1) + print*,'Time to provide f_hf_cholesky_sparse_bis = ',wall1-wall0 + endif +END_PROVIDER + + BEGIN_PROVIDER [ double precision, on_top_hf_grid, (n_points_final_grid)] implicit none integer :: ipoint,i,ii diff --git a/src/mu_of_r/test_proj_op.irp.f b/src/mu_of_r/test_proj_op.irp.f index f9aba094..fd5e976b 100644 --- a/src/mu_of_r/test_proj_op.irp.f +++ b/src/mu_of_r/test_proj_op.irp.f @@ -15,7 +15,23 @@ program projected_operators ! call test_f_HF_valence_ab ! call routine_full_mos ! call test_f_ii_valence_ab - call test_f_ia_valence_ab - call test_f_ii_ia_aa_valence_ab +! call test_f_ia_valence_ab +! call test_f_ii_ia_aa_valence_ab + call test end + +subroutine test + implicit none + integer :: i_point + double precision :: ref, new, accu, weight + accu = 0.d0 + do i_point = 1, n_points_final_grid + ref = f_hf_cholesky_sparse(i_point) + new = f_hf_cholesky_sparse_bis(i_point) + weight = final_weight_at_r_vector(i_point) + accu += dabs(ref - new) * weight + enddo + print*,'accu = ',accu + +end From 31ec3ace0540177c2476e478d763a068d73bd41b Mon Sep 17 00:00:00 2001 From: eginer Date: Thu, 11 Jul 2024 14:22:27 +0200 Subject: [PATCH 29/38] correct transformation of cholesky vectors on the NO basis --- src/casscf_cipsi/chol_bielec.irp.f | 41 +++++++++++++----------------- src/casscf_cipsi/test_chol.irp.f | 14 +++++++--- 2 files changed, 27 insertions(+), 28 deletions(-) diff --git a/src/casscf_cipsi/chol_bielec.irp.f b/src/casscf_cipsi/chol_bielec.irp.f index 1fe985ad..3104fe5f 100644 --- a/src/casscf_cipsi/chol_bielec.irp.f +++ b/src/casscf_cipsi/chol_bielec.irp.f @@ -34,20 +34,21 @@ BEGIN_PROVIDER [double precision, cholesky_no_1_idx_transp, (cholesky_mo_num, n_ END_PROVIDER -BEGIN_PROVIDER [double precision, cholesky_no_2_idx_transp, (cholesky_mo_num, n_act_orb, n_act_orb)] +BEGIN_PROVIDER [double precision, cholesky_no_2_idx_transp_old, (cholesky_mo_num, n_act_orb, n_act_orb)] BEGIN_DOC ! Cholesky vectors with TWO orbital on the active natural orbital basis END_DOC implicit none - integer :: i_chol,i_act,j_act,jj_act + integer :: i_chol,i_act,j_act,jj_act,jjj_act double precision, allocatable :: chol_tmp(:,:) allocate(chol_tmp(cholesky_mo_num,n_act_orb)) - cholesky_no_2_idx_transp = 0.D0 - do j_act = 1, n_act_orb - do i_act = 1, n_act_orb - do jj_act = 1, n_act_orb + cholesky_no_2_idx_transp_old = 0.D0 + do jj_act = 1, n_act_orb + jjj_act = list_act(jj_act) + do j_act = 1, n_act_orb + do i_act = 1, n_act_orb do i_chol = 1, cholesky_mo_num - cholesky_no_2_idx_transp(i_chol, i_act, j_act) += cholesky_no_1_idx_transp(i_chol, i_act,jj_act) * natorbsCI(jj_act,i_act) + cholesky_no_2_idx_transp_old(i_chol, i_act, j_act) += cholesky_no_1_idx_transp(i_chol, i_act,jjj_act) * natorbsCI(jj_act,j_act) enddo enddo enddo @@ -56,36 +57,28 @@ BEGIN_PROVIDER [double precision, cholesky_no_2_idx_transp, (cholesky_mo_num, n_ END_PROVIDER -BEGIN_PROVIDER [double precision, cholesky_no_2_idx_transp_dgemm, (cholesky_mo_num, n_act_orb, n_act_orb)] +BEGIN_PROVIDER [double precision, cholesky_no_2_idx_transp, (cholesky_mo_num, n_act_orb, n_act_orb)] BEGIN_DOC ! Cholesky vectors with TWO orbital on the active natural orbital basis END_DOC implicit none integer :: i_chol,i_act,j_act,jj_act - double precision, allocatable :: chol_tmp(:,:) - allocate(chol_tmp(cholesky_mo_num,n_act_orb)) - cholesky_no_2_idx_transp_dgemm = 0.D0 - do j_act = 1, n_act_orb + double precision, allocatable :: chol_tmp(:,:),chol_tmp_bis(:,:) + allocate(chol_tmp(cholesky_mo_num,n_act_orb),chol_tmp_bis(cholesky_mo_num,n_act_orb)) + cholesky_no_2_idx_transp = 0.D0 + do i_act = 1, n_act_orb ! Get all the integrals corresponding to the "j_act" - do i_act = 1, n_act_orb - jj_act = list_act(i_act) + do j_act = 1, n_act_orb + jj_act = list_act(j_act) do i_chol = 1, cholesky_mo_num - chol_tmp(i_chol, i_act) = cholesky_no_1_idx_transp(i_chol, j_act, jj_act) + chol_tmp(i_chol, j_act) = cholesky_no_1_idx_transp(i_chol, i_act, jj_act) enddo enddo -! ! Do the matrix product -! do i_act = 1, n_act_orb -! do jj_act = 1, n_act_orb -! do i_chol = 1, cholesky_mo_num -! cholesky_no_1_idx_transp(i_chol, i_act, j_act) += chol_tmp(i_chol, jj_act) * natorbsCI(jj_act,i_act) -! enddo -! enddo -! enddo call dgemm('N','N',cholesky_mo_num,n_act_orb,n_act_orb,1.d0, & chol_tmp, size(chol_tmp,1), & natorbsCI, size(natorbsCI,1), & 0.d0, & - cholesky_no_2_idx_transp_dgemm(1,1,j_act), size(cholesky_no_2_idx_transp_dgemm,1)) + cholesky_no_2_idx_transp(1,1,i_act), size(cholesky_no_2_idx_transp,1)) enddo END_PROVIDER diff --git a/src/casscf_cipsi/test_chol.irp.f b/src/casscf_cipsi/test_chol.irp.f index b94851f9..8d978817 100644 --- a/src/casscf_cipsi/test_chol.irp.f +++ b/src/casscf_cipsi/test_chol.irp.f @@ -9,15 +9,21 @@ end subroutine routine implicit none integer :: i_chol, i_act, i_mo - double precision :: accu + double precision :: accu,error,exact accu = 0.d0 do i_mo = 1, n_act_orb do i_act = 1, n_act_orb do i_chol = 1, cholesky_mo_num - accu += dabs(cholesky_no_2_idx_transp_dgemm(i_chol,i_act,i_mo) - cholesky_no_2_idx_transp(i_chol,i_act,i_mo)) - print*,cholesky_no_2_idx_transp_dgemm(i_chol,i_act,i_mo) , cholesky_no_2_idx_transp(i_chol,i_act,i_mo) + error = dabs(cholesky_no_2_idx_transp(i_chol,i_act,i_mo) - cholesky_no_2_idx_transp_old(i_chol,i_act,i_mo)) + exact = dabs(cholesky_no_2_idx_transp_old(i_chol,i_act,i_mo)) + accu += error + if(exact.gt.1.d-10)then + if(error/exact.gt.1.d-7)then + write(*,'(4(E16.10,X))')cholesky_no_2_idx_transp(i_chol,i_act,i_mo) , cholesky_no_2_idx_transp_old(i_chol,i_act,i_mo),error,error/exact + endif + endif enddo enddo enddo - print*,'accu =', accu + print*,'accu =', accu/(dble(n_act_orb*n_act_orb*cholesky_mo_num)) end From 56450ed0436c4c1be26f33757ea3a4ca35238b57 Mon Sep 17 00:00:00 2001 From: eginer Date: Thu, 11 Jul 2024 19:09:20 +0200 Subject: [PATCH 30/38] introduced functions mimicking the arrays --- src/casscf_cipsi/chol_bielec.irp.f | 158 ++++++++++++++++++++++++++++- src/casscf_cipsi/test_chol.irp.f | 92 ++++++++++++++--- 2 files changed, 234 insertions(+), 16 deletions(-) diff --git a/src/casscf_cipsi/chol_bielec.irp.f b/src/casscf_cipsi/chol_bielec.irp.f index 3104fe5f..94a76453 100644 --- a/src/casscf_cipsi/chol_bielec.irp.f +++ b/src/casscf_cipsi/chol_bielec.irp.f @@ -33,7 +33,6 @@ BEGIN_PROVIDER [double precision, cholesky_no_1_idx_transp, (cholesky_mo_num, n_ END_PROVIDER - BEGIN_PROVIDER [double precision, cholesky_no_2_idx_transp_old, (cholesky_mo_num, n_act_orb, n_act_orb)] BEGIN_DOC ! Cholesky vectors with TWO orbital on the active natural orbital basis @@ -83,4 +82,161 @@ BEGIN_PROVIDER [double precision, cholesky_no_2_idx_transp, (cholesky_mo_num, n_ END_PROVIDER +BEGIN_PROVIDER [ double precision, cholesky_no_total_transp, (cholesky_mo_num, mo_num, mo_num)] + implicit none + BEGIN_DOC + ! Cholesky vectors defined on all basis including the NO basis + END_DOC + integer :: i_chol, i_act, ii_act, j_act, jj_act, i_core_inact, j_core_inact, ii_core_inact, jj_core_inact + integer :: i_virt, ii_virt, j_virt, jj_virt + ! Block when two orbitals belong to the core/inact + do j_core_inact = 1, n_core_inact_orb + jj_core_inact = list_core_inact(j_core_inact) + do i_core_inact = 1, n_core_inact_orb + ii_core_inact = list_core_inact(i_core_inact) + do i_chol = 1, cholesky_mo_num + cholesky_no_total_transp(i_chol, ii_core_inact, jj_core_inact) = cholesky_mo_transp(i_chol,ii_core_inact,jj_core_inact) + enddo + enddo + enddo + ! Block when one orbitals belongs to the core/inact and one belongs to the active + do j_core_inact = 1, n_core_inact_orb + jj_core_inact = list_core_inact(j_core_inact) + do i_act = 1, n_act_orb + ii_act = list_act(i_act) + do i_chol = 1, cholesky_mo_num + cholesky_no_total_transp(i_chol,ii_act,j_core_inact) = cholesky_no_1_idx_transp(i_chol,i_act,jj_core_inact) + enddo + enddo + enddo + do j_core_inact = 1, n_core_inact_orb + jj_core_inact = list_core_inact(j_core_inact) + do i_act = 1, n_act_orb + ii_act = list_act(i_act) + do i_chol = 1, cholesky_mo_num + cholesky_no_total_transp(i_chol,j_core_inact,ii_act) = cholesky_no_1_idx_transp(i_chol,i_act,jj_core_inact) + enddo + enddo + enddo + + ! Block when two orbitals belong to the active + do j_act = 1, n_act_orb + jj_act = list_act(j_act) + do i_act = 1, n_act_orb + ii_act = list_act(i_act) + do i_chol = 1, cholesky_mo_num + cholesky_no_total_transp(i_chol,ii_act,jj_act) = cholesky_no_2_idx_transp(i_chol,i_act,j_act) + enddo + enddo + enddo + + ! Block when two orbitals belong to the virtuals + do i_virt = 1, n_virt_orb + ii_virt = list_virt(i_virt) + do j_virt = 1, n_virt_orb + jj_virt = list_virt(j_virt) + do i_chol = 1, cholesky_mo_num + cholesky_no_total_transp(i_chol,jj_virt,ii_virt) = cholesky_mo_transp(i_chol,jj_virt,ii_virt) + enddo + enddo + enddo + + ! Block when one orbital is in active and the other in the virtuals + do i_virt = 1, n_virt_orb + ii_virt = list_virt(i_virt) + do i_act = 1, n_act_orb + ii_act = list_act(i_act) + do i_chol = 1, cholesky_mo_num + cholesky_no_total_transp(i_chol,ii_act,ii_virt) = cholesky_no_1_idx_transp(i_chol, i_act,ii_virt) + enddo + enddo + enddo + do i_virt = 1, n_virt_orb + ii_virt = list_virt(i_virt) + do i_act = 1, n_act_orb + ii_act = list_act(i_act) + do i_chol = 1, cholesky_mo_num + cholesky_no_total_transp(i_chol,ii_virt,ii_act) = cholesky_no_1_idx_transp(i_chol, i_act,ii_virt) + enddo + enddo + enddo + ! Block when one orbital is in the virtual and one in the core-inact + do i_virt = 1, n_virt_orb + ii_virt = list_virt(i_virt) + do i_core_inact = 1, n_core_inact_orb + ii_core_inact = list_core_inact(i_core_inact) + do i_chol = 1, cholesky_mo_num + cholesky_no_total_transp(i_chol, ii_core_inact, ii_virt) = cholesky_mo_transp(i_chol, ii_core_inact, ii_virt) + enddo + enddo + enddo + do i_core_inact = 1, n_core_inact_orb + ii_core_inact = list_core_inact(i_core_inact) + do i_virt = 1, n_virt_orb + ii_virt = list_virt(i_virt) + do i_chol = 1, cholesky_mo_num + cholesky_no_total_transp(i_chol, ii_virt, ii_core_inact) = cholesky_mo_transp(i_chol, ii_virt, ii_core_inact) + enddo + enddo + enddo +END_PROVIDER + + +double precision function bielec_no_basis_chol(i_1,j_1,i_2,j_2) + implicit none + integer, intent(in) :: i_1,j_1,i_2,j_2 + BEGIN_DOC + ! integral (i_1 j_1|i_2 j_2) in the mixed basis of both MOs and natural MOs + ! + END_DOC + integer :: i_chol + bielec_no_basis_chol = 0.d0 + do i_chol = 1, cholesky_mo_num + bielec_no_basis_chol += cholesky_no_total_transp(i_chol,i_1, j_1) * cholesky_no_total_transp(i_chol,i_2,j_2) + enddo +end + +double precision function bielec_PQxx_no_chol(i_mo, j_mo, i_ca, j_ca) + implicit none + BEGIN_DOC + ! function that computes (i_mo j_mo| i_ca j_ca) with Cholesky decomposition + ! + ! indices are unshifted orbital numbers + END_DOC + integer, intent(in) :: i_ca, j_ca, i_mo, j_mo + integer :: ii_ca, jj_ca + double precision :: bielec_no_basis_chol + ii_ca = list_core_inact_act(i_ca) + jj_ca = list_core_inact_act(j_ca) + bielec_PQxx_no_chol = bielec_no_basis_chol(i_mo,j_mo,ii_ca,jj_ca) + +end + +double precision function bielec_PxxQ_no_chol(i_mo, j_ca, i_ca, j_mo) + implicit none + BEGIN_DOC + ! function that computes (i_mo j_ca |i_ca j_mo) with Cholesky decomposition + ! + ! indices are unshifted orbital numbers + END_DOC + integer, intent(in) :: i_ca, j_ca, i_mo, j_mo + integer :: ii_ca, jj_ca + double precision :: bielec_no_basis_chol + ii_ca = list_core_inact_act(i_ca) + jj_ca = list_core_inact_act(j_ca) + bielec_PxxQ_no_chol = bielec_no_basis_chol(i_mo, jj_ca, ii_ca, j_mo) + +end + +double precision function bielecCI_no_chol(i_ca, j_ca, k_ca, i_mo) + implicit none + integer, intent(in) :: i_ca, j_ca, k_ca, i_mo + integer :: ii_ca, jj_ca, kk_ca + double precision :: bielec_no_basis_chol + ii_ca = list_act(i_ca) + jj_ca = list_act(j_ca) + kk_ca = list_act(k_ca) + bielecCI_no_chol = bielec_no_basis_chol(ii_ca, jj_ca, kk_ca, i_mo) + +end diff --git a/src/casscf_cipsi/test_chol.irp.f b/src/casscf_cipsi/test_chol.irp.f index 8d978817..87c5c352 100644 --- a/src/casscf_cipsi/test_chol.irp.f +++ b/src/casscf_cipsi/test_chol.irp.f @@ -2,28 +2,90 @@ program test_chol implicit none read_wf= .True. touch read_wf - call routine +! call routine_bielec_PxxQ_no + call routine_bielecCI_no end -subroutine routine +subroutine routine_bielec_PQxx_no implicit none - integer :: i_chol, i_act, i_mo - double precision :: accu,error,exact + integer :: i_chol, i_act, ii_act, j_act, jj_act, i_core_inact, j_core_inact, ii_core_inact, jj_core_inact + integer :: i_virt, ii_virt, j_virt, jj_virt, i_mo, j_mo + double precision :: exact, new, error, accu, bielec_no_basis_chol + double precision :: bielec_PQxx_no_chol + accu = 0.d0 - do i_mo = 1, n_act_orb - do i_act = 1, n_act_orb - do i_chol = 1, cholesky_mo_num - error = dabs(cholesky_no_2_idx_transp(i_chol,i_act,i_mo) - cholesky_no_2_idx_transp_old(i_chol,i_act,i_mo)) - exact = dabs(cholesky_no_2_idx_transp_old(i_chol,i_act,i_mo)) - accu += error - if(exact.gt.1.d-10)then - if(error/exact.gt.1.d-7)then - write(*,'(4(E16.10,X))')cholesky_no_2_idx_transp(i_chol,i_act,i_mo) , cholesky_no_2_idx_transp_old(i_chol,i_act,i_mo),error,error/exact + do i_core_inact = 1, n_core_inact_act_orb + ii_core_inact = list_core_inact_act(i_core_inact) + do j_core_inact = 1, n_core_inact_act_orb + jj_core_inact = list_core_inact_act(j_core_inact) + do i_mo = 1, mo_num + do j_mo = 1, mo_num + exact = bielec_PQxx_no(j_mo,i_mo, j_core_inact, i_core_inact) +! new = bielec_no_basis_chol(j_mo,i_mo, jj_core_inact, ii_core_inact) + new = bielec_PQxx_no_chol(j_mo,i_mo, j_core_inact, i_core_inact) + error = dabs(exact-new) + if(dabs(exact).gt.1.d-10)then + print*,exact,new,error endif - endif + accu += error + enddo enddo enddo enddo - print*,'accu =', accu/(dble(n_act_orb*n_act_orb*cholesky_mo_num)) + print*,'accu = ',accu/(dble(mo_num*mo_num*n_core_inact_act_orb**2)) +end + +subroutine routine_bielec_PxxQ_no + implicit none + integer :: i_chol, i_act, ii_act, j_act, jj_act, i_core_inact, j_core_inact, ii_core_inact, jj_core_inact + integer :: i_virt, ii_virt, j_virt, jj_virt, i_mo, j_mo + double precision :: exact, new, error, accu, bielec_no_basis_chol + double precision :: bielec_PxxQ_no_chol + + accu = 0.d0 + do i_mo = 1, mo_num + do i_core_inact = 1, n_core_inact_act_orb + ii_core_inact = list_core_inact_act(i_core_inact) + do j_core_inact = 1, n_core_inact_act_orb + jj_core_inact = list_core_inact_act(j_core_inact) + do j_mo = 1, mo_num + exact = bielec_PxxQ_no(j_mo, j_core_inact, i_core_inact,i_mo) +! new = bielec_no_basis_chol(j_mo,i_mo, jj_core_inact, ii_core_inact) + new = bielec_PxxQ_no_chol(j_mo, j_core_inact, i_core_inact,i_mo) + error = dabs(exact-new) + accu += error + if(dabs(exact).gt.1.d-10)then + print*,exact,new,error + endif + enddo + enddo + enddo + enddo + print*,'accu = ',accu/(dble(mo_num*mo_num*n_core_inact_act_orb**2)) +end + +subroutine routine_bielecCI_no + implicit none + integer :: i_ca, j_ca, k_ca, i_mo + double precision :: exact, new, error, accu, bielec_no_basis_chol + double precision :: bielecCI_no_chol + + accu = 0.d0 + do i_mo = 1, mo_num + do i_ca = 1, n_act_orb + do j_ca = 1, n_act_orb + do k_ca = 1, n_act_orb + exact =bielecCI_no(k_ca, j_ca, i_ca, i_mo) + new = bielecCI_no_chol(k_ca, j_ca, i_ca, i_mo) + error = dabs(exact-new) + accu += error + if(dabs(exact).gt.1.d-10)then + print*,exact,new,error + endif + enddo + enddo + enddo + enddo + print*,'accu = ',accu/(dble(mo_num*mo_num*n_core_inact_act_orb**2)) end From 505d10084c8331c36ea6a2244bdc47ea8fb33a81 Mon Sep 17 00:00:00 2001 From: eginer Date: Fri, 12 Jul 2024 16:19:53 +0200 Subject: [PATCH 31/38] Choleskization of the CASSCF --- src/casscf_cipsi/bielec.irp.f | 61 +++++++++----- src/casscf_cipsi/bielec_natorb.irp.f | 68 ++++++++++----- src/casscf_cipsi/casscf.irp.f | 2 +- src/casscf_cipsi/chol_bielec.irp.f | 118 ++++++++++++++------------- src/casscf_cipsi/chol_garb.irp.f | 34 ++++++++ src/casscf_cipsi/gradient.irp.f | 1 + src/casscf_cipsi/hessian.irp.f | 6 ++ src/casscf_cipsi/mcscf_fock.irp.f | 2 + src/casscf_cipsi/test_chol.irp.f | 63 +++++++++----- src/casscf_cipsi/tot_en.irp.f | 1 + 10 files changed, 238 insertions(+), 118 deletions(-) create mode 100644 src/casscf_cipsi/chol_garb.irp.f diff --git a/src/casscf_cipsi/bielec.irp.f b/src/casscf_cipsi/bielec.irp.f index 0a44f994..a4901985 100644 --- a/src/casscf_cipsi/bielec.irp.f +++ b/src/casscf_cipsi/bielec.irp.f @@ -1,18 +1,25 @@ -BEGIN_PROVIDER [real*8, bielec_PQxx, (mo_num, mo_num,n_core_inact_act_orb,n_core_inact_act_orb)] +BEGIN_PROVIDER [real*8, bielec_PQxx_array, (mo_num, mo_num,n_core_inact_act_orb,n_core_inact_act_orb)] BEGIN_DOC - ! bielec_PQxx : integral (pq|xx) with p,q arbitrary, x core or active + ! WARNING !!! Old version !!! NOT USED ANYMORE IN THE PROGRAM !!! TOO BIG TO BE STORED ON LARGE SYSTEMS !!! + ! + ! Replaced by the Cholesky-based function bielec_PQxx + ! + ! bielec_PQxx_array : integral (pq|xx) with p,q arbitrary, x core or active ! indices are unshifted orbital numbers END_DOC implicit none integer :: i,j,ii,jj,p,q,i3,j3,t3,v3 real*8 :: mo_two_e_integral + print*,'' + print*,'Providing bielec_PQxx_array, WARNING IT CAN BE A VERY BIG ARRAY WHEN MO_NUM IS LARGE !!!' + print*,'' - bielec_PQxx(:,:,:,:) = 0.d0 + bielec_PQxx_array(:,:,:,:) = 0.d0 PROVIDE mo_two_e_integrals_in_map !$OMP PARALLEL DEFAULT(NONE) & !$OMP PRIVATE(i,ii,j,jj,i3,j3) & - !$OMP SHARED(n_core_inact_orb,list_core_inact,mo_num,bielec_PQxx, & + !$OMP SHARED(n_core_inact_orb,list_core_inact,mo_num,bielec_PQxx_array, & !$OMP n_act_orb,mo_integrals_map,list_act) !$OMP DO @@ -20,14 +27,14 @@ BEGIN_PROVIDER [real*8, bielec_PQxx, (mo_num, mo_num,n_core_inact_act_orb,n_core ii=list_core_inact(i) do j=i,n_core_inact_orb jj=list_core_inact(j) - call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx(1,1,i,j),mo_integrals_map) - bielec_PQxx(:,:,j,i)=bielec_PQxx(:,:,i,j) + call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx_array(1,1,i,j),mo_integrals_map) + bielec_PQxx_array(:,:,j,i)=bielec_PQxx_array(:,:,i,j) end do do j=1,n_act_orb jj=list_act(j) j3=j+n_core_inact_orb - call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx(1,1,i,j3),mo_integrals_map) - bielec_PQxx(:,:,j3,i)=bielec_PQxx(:,:,i,j3) + call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx_array(1,1,i,j3),mo_integrals_map) + bielec_PQxx_array(:,:,j3,i)=bielec_PQxx_array(:,:,i,j3) end do end do !$OMP END DO @@ -40,8 +47,8 @@ BEGIN_PROVIDER [real*8, bielec_PQxx, (mo_num, mo_num,n_core_inact_act_orb,n_core do j=i,n_act_orb jj=list_act(j) j3=j+n_core_inact_orb - call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx(1,1,i3,j3),mo_integrals_map) - bielec_PQxx(:,:,j3,i3)=bielec_PQxx(:,:,i3,j3) + call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx_array(1,1,i3,j3),mo_integrals_map) + bielec_PQxx_array(:,:,j3,i3)=bielec_PQxx_array(:,:,i3,j3) end do end do !$OMP END DO @@ -52,9 +59,13 @@ END_PROVIDER -BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_act_orb, mo_num)] +BEGIN_PROVIDER [real*8, bielec_PxxQ_array, (mo_num,n_core_inact_act_orb,n_core_inact_act_orb, mo_num)] BEGIN_DOC - ! bielec_PxxQ : integral (px|xq) with p,q arbitrary, x core or active + ! WARNING !!! Old version !!! NOT USED ANYMORE IN THE PROGRAM !!! TOO BIG TO BE STORED ON LARGE SYSTEMS !!! + ! + ! Replaced by the Cholesky-based function bielec_PxxQ + ! + ! bielec_PxxQ_array : integral (px|xq) with p,q arbitrary, x core or active ! indices are unshifted orbital numbers END_DOC implicit none @@ -62,12 +73,15 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_a double precision, allocatable :: integrals_array(:,:) real*8 :: mo_two_e_integral + print*,'' + print*,'Providing bielec_PxxQ_array, WARNING IT CAN BE A VERY BIG ARRAY WHEN MO_NUM IS LARGE !!!' + print*,'' PROVIDE mo_two_e_integrals_in_map - bielec_PxxQ = 0.d0 + bielec_PxxQ_array = 0.d0 !$OMP PARALLEL DEFAULT(NONE) & !$OMP PRIVATE(i,ii,j,jj,i3,j3,integrals_array) & - !$OMP SHARED(n_core_inact_orb,list_core_inact,mo_num,bielec_PxxQ, & + !$OMP SHARED(n_core_inact_orb,list_core_inact,mo_num,bielec_PxxQ_array, & !$OMP n_act_orb,mo_integrals_map,list_act) allocate(integrals_array(mo_num,mo_num)) @@ -80,8 +94,8 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_a call get_mo_two_e_integrals_ij(ii,jj,mo_num,integrals_array,mo_integrals_map) do q=1,mo_num do p=1,mo_num - bielec_PxxQ(p,i,j,q)=integrals_array(p,q) - bielec_PxxQ(p,j,i,q)=integrals_array(q,p) + bielec_PxxQ_array(p,i,j,q)=integrals_array(p,q) + bielec_PxxQ_array(p,j,i,q)=integrals_array(q,p) end do end do end do @@ -91,8 +105,8 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_a call get_mo_two_e_integrals_ij(ii,jj,mo_num,integrals_array,mo_integrals_map) do q=1,mo_num do p=1,mo_num - bielec_PxxQ(p,i,j3,q)=integrals_array(p,q) - bielec_PxxQ(p,j3,i,q)=integrals_array(q,p) + bielec_PxxQ_array(p,i,j3,q)=integrals_array(p,q) + bielec_PxxQ_array(p,j3,i,q)=integrals_array(q,p) end do end do end do @@ -111,8 +125,8 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_a call get_mo_two_e_integrals_ij(ii,jj,mo_num,integrals_array,mo_integrals_map) do q=1,mo_num do p=1,mo_num - bielec_PxxQ(p,i3,j3,q)=integrals_array(p,q) - bielec_PxxQ(p,j3,i3,q)=integrals_array(q,p) + bielec_PxxQ_array(p,i3,j3,q)=integrals_array(p,q) + bielec_PxxQ_array(p,j3,i3,q)=integrals_array(q,p) end do end do end do @@ -129,10 +143,15 @@ BEGIN_PROVIDER [real*8, bielecCI, (n_act_orb,n_act_orb,n_act_orb, mo_num)] BEGIN_DOC ! bielecCI : integrals (tu|vp) with p arbitrary, tuv active ! index p runs over the whole basis, t,u,v only over the active orbitals + ! + ! This array can be stored anyway. Ex: 50 active orbitals, 1500 MOs ==> 8x50^3x1500 = 1.5 Gb END_DOC implicit none integer :: i,j,k,p,t,u,v double precision, external :: mo_two_e_integral + double precision :: wall0, wall1 + call wall_time(wall0) + print*,'Providing bielecCI' PROVIDE mo_two_e_integrals_in_map !$OMP PARALLEL DO DEFAULT(NONE) & @@ -151,5 +170,7 @@ BEGIN_PROVIDER [real*8, bielecCI, (n_act_orb,n_act_orb,n_act_orb, mo_num)] end do end do !$OMP END PARALLEL DO + call wall_time(wall1) + print*,'Time to provide bielecCI = ',wall1 - wall0 END_PROVIDER diff --git a/src/casscf_cipsi/bielec_natorb.irp.f b/src/casscf_cipsi/bielec_natorb.irp.f index 9968530c..99734a0b 100644 --- a/src/casscf_cipsi/bielec_natorb.irp.f +++ b/src/casscf_cipsi/bielec_natorb.irp.f @@ -1,30 +1,38 @@ - BEGIN_PROVIDER [real*8, bielec_PQxx_no, (mo_num, mo_num,n_core_inact_act_orb,n_core_inact_act_orb)] + BEGIN_PROVIDER [real*8, bielec_PQxx_no_array, (mo_num, mo_num,n_core_inact_act_orb,n_core_inact_act_orb)] BEGIN_DOC + ! WARNING !!! Old version !!! NOT USED ANYMORE IN THE PROGRAM !!! TOO BIG TO BE STORED ON LARGE SYSTEMS !!! + ! + ! Replaced by the Cholesky-based function bielec_PQxx_no + ! ! integral (pq|xx) in the basis of natural MOs ! indices are unshifted orbital numbers + ! END_DOC implicit none integer :: i,j,k,l,t,u,p,q double precision, allocatable :: f(:,:,:), d(:,:,:) + print*,'' + print*,'Providing bielec_PQxx_no_array, WARNING IT CAN BE A VERY BIG ARRAY WHEN MO_NUM IS LARGE !!!' + print*,'' !$OMP PARALLEL DEFAULT(NONE) & !$OMP PRIVATE(j,k,l,p,d,f) & !$OMP SHARED(n_core_inact_act_orb,mo_num,n_act_orb,n_core_inact_orb, & - !$OMP bielec_PQxx_no,bielec_PQxx,list_act,natorbsCI) + !$OMP bielec_PQxx_no_array,bielec_PQxx_array,list_act,natorbsCI) allocate (f(n_act_orb,mo_num,n_core_inact_act_orb), & d(n_act_orb,mo_num,n_core_inact_act_orb)) !$OMP DO do l=1,n_core_inact_act_orb - bielec_PQxx_no(:,:,:,l) = bielec_PQxx(:,:,:,l) + bielec_PQxx_no_array(:,:,:,l) = bielec_PQxx_array(:,:,:,l) do k=1,n_core_inact_act_orb do j=1,mo_num do p=1,n_act_orb - f(p,j,k)=bielec_PQxx_no(list_act(p),j,k,l) + f(p,j,k)=bielec_PQxx_no_array(list_act(p),j,k,l) end do end do end do @@ -36,13 +44,13 @@ do k=1,n_core_inact_act_orb do j=1,mo_num do p=1,n_act_orb - bielec_PQxx_no(list_act(p),j,k,l)=d(p,j,k) + bielec_PQxx_no_array(list_act(p),j,k,l)=d(p,j,k) end do end do do j=1,mo_num do p=1,n_act_orb - f(p,j,k)=bielec_PQxx_no(j,list_act(p),k,l) + f(p,j,k)=bielec_PQxx_no_array(j,list_act(p),k,l) end do end do end do @@ -54,7 +62,7 @@ do k=1,n_core_inact_act_orb do p=1,n_act_orb do j=1,mo_num - bielec_PQxx_no(j,list_act(p),k,l)=d(p,j,k) + bielec_PQxx_no_array(j,list_act(p),k,l)=d(p,j,k) end do end do end do @@ -71,7 +79,7 @@ do p=1,n_act_orb do k=1,mo_num do j=1,mo_num - f(j,k,p) = bielec_PQxx_no(j,k,n_core_inact_orb+p,l) + f(j,k,p) = bielec_PQxx_no_array(j,k,n_core_inact_orb+p,l) end do end do end do @@ -83,7 +91,7 @@ do p=1,n_act_orb do k=1,mo_num do j=1,mo_num - bielec_PQxx_no(j,k,n_core_inact_orb+p,l)=d(j,k,p) + bielec_PQxx_no_array(j,k,n_core_inact_orb+p,l)=d(j,k,p) end do end do end do @@ -97,7 +105,7 @@ do p=1,n_act_orb do k=1,mo_num do j=1,mo_num - f(j,k,p) = bielec_PQxx_no(j,k,l,n_core_inact_orb+p) + f(j,k,p) = bielec_PQxx_no_array(j,k,l,n_core_inact_orb+p) end do end do end do @@ -109,7 +117,7 @@ do p=1,n_act_orb do k=1,mo_num do j=1,mo_num - bielec_PQxx_no(j,k,l,n_core_inact_orb+p)=d(j,k,p) + bielec_PQxx_no_array(j,k,l,n_core_inact_orb+p)=d(j,k,p) end do end do end do @@ -123,8 +131,12 @@ END_PROVIDER -BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inact_act_orb, mo_num)] +BEGIN_PROVIDER [real*8, bielec_PxxQ_no_array, (mo_num,n_core_inact_act_orb,n_core_inact_act_orb, mo_num)] BEGIN_DOC + ! WARNING !!! Old version !!! NOT USED ANYMORE IN THE PROGRAM !!! TOO BIG TO BE STORED ON LARGE SYSTEMS !!! + ! + ! Replaced by the Cholesky-based function bielec_PxxQ_no + ! ! integral (px|xq) in the basis of natural MOs ! indices are unshifted orbital numbers END_DOC @@ -132,10 +144,14 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac integer :: i,j,k,l,t,u,p,q double precision, allocatable :: f(:,:,:), d(:,:,:) + print*,'' + print*,'Providing bielec_PxxQ_no_array, WARNING IT CAN BE A VERY BIG ARRAY WHEN MO_NUM IS LARGE !!!' + print*,'' + !$OMP PARALLEL DEFAULT(NONE) & !$OMP PRIVATE(j,k,l,p,d,f) & !$OMP SHARED(n_core_inact_act_orb,mo_num,n_act_orb,n_core_inact_orb, & - !$OMP bielec_PxxQ_no,bielec_PxxQ,list_act,natorbsCI) + !$OMP bielec_PxxQ_no_array,bielec_PxxQ_array,list_act,natorbsCI) allocate (f(n_act_orb,n_core_inact_act_orb,n_core_inact_act_orb), & @@ -143,11 +159,11 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac !$OMP DO do j=1,mo_num - bielec_PxxQ_no(:,:,:,j) = bielec_PxxQ(:,:,:,j) + bielec_PxxQ_no_array(:,:,:,j) = bielec_PxxQ_array(:,:,:,j) do l=1,n_core_inact_act_orb do k=1,n_core_inact_act_orb do p=1,n_act_orb - f(p,k,l) = bielec_PxxQ_no(list_act(p),k,l,j) + f(p,k,l) = bielec_PxxQ_no_array(list_act(p),k,l,j) end do end do end do @@ -159,7 +175,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac do l=1,n_core_inact_act_orb do k=1,n_core_inact_act_orb do p=1,n_act_orb - bielec_PxxQ_no(list_act(p),k,l,j)=d(p,k,l) + bielec_PxxQ_no_array(list_act(p),k,l,j)=d(p,k,l) end do end do end do @@ -176,7 +192,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac do l=1,n_core_inact_act_orb do j=1,mo_num do p=1,n_act_orb - f(p,j,l) = bielec_PxxQ_no(j,n_core_inact_orb+p,l,k) + f(p,j,l) = bielec_PxxQ_no_array(j,n_core_inact_orb+p,l,k) end do end do end do @@ -188,7 +204,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac do l=1,n_core_inact_act_orb do j=1,mo_num do p=1,n_act_orb - bielec_PxxQ_no(j,n_core_inact_orb+p,l,k)=d(p,j,l) + bielec_PxxQ_no_array(j,n_core_inact_orb+p,l,k)=d(p,j,l) end do end do end do @@ -205,7 +221,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac do p=1,n_act_orb do l=1,n_core_inact_act_orb do j=1,mo_num - f(j,l,p) = bielec_PxxQ_no(j,l,n_core_inact_orb+p,k) + f(j,l,p) = bielec_PxxQ_no_array(j,l,n_core_inact_orb+p,k) end do end do end do @@ -217,7 +233,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac do p=1,n_act_orb do l=1,n_core_inact_act_orb do j=1,mo_num - bielec_PxxQ_no(j,l,n_core_inact_orb+p,k)=d(j,l,p) + bielec_PxxQ_no_array(j,l,n_core_inact_orb+p,k)=d(j,l,p) end do end do end do @@ -231,7 +247,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac do p=1,n_act_orb do k=1,n_core_inact_act_orb do j=1,mo_num - f(j,k,p) = bielec_PxxQ_no(j,k,l,n_core_inact_orb+p) + f(j,k,p) = bielec_PxxQ_no_array(j,k,l,n_core_inact_orb+p) end do end do end do @@ -243,7 +259,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac do p=1,n_act_orb do k=1,n_core_inact_act_orb do j=1,mo_num - bielec_PxxQ_no(j,k,l,n_core_inact_orb+p)=d(j,k,p) + bielec_PxxQ_no_array(j,k,l,n_core_inact_orb+p)=d(j,k,p) end do end do end do @@ -259,10 +275,16 @@ BEGIN_PROVIDER [real*8, bielecCI_no, (n_act_orb,n_act_orb,n_act_orb, mo_num)] BEGIN_DOC ! integrals (tu|vp) in the basis of natural MOs ! index p runs over the whole basis, t,u,v only over the active orbitals + ! + ! This array can be stored anyway. Ex: 50 active orbitals, 1500 MOs ==> 8x50^3x1500 = 1.5 Gb END_DOC implicit none integer :: i,j,k,l,t,u,p,q double precision, allocatable :: f(:,:,:), d(:,:,:) + + double precision :: wall0, wall1 + call wall_time(wall0) + print*,'Providing bielecCI_no' !$OMP PARALLEL DEFAULT(NONE) & !$OMP PRIVATE(j,k,l,p,d,f) & @@ -363,6 +385,8 @@ BEGIN_PROVIDER [real*8, bielecCI_no, (n_act_orb,n_act_orb,n_act_orb, mo_num)] deallocate(d,f) !$OMP END PARALLEL + call wall_time(wall1) + print*,'Time to provide bielecCI_no = ',wall1-wall0 END_PROVIDER diff --git a/src/casscf_cipsi/casscf.irp.f b/src/casscf_cipsi/casscf.irp.f index d0a26d36..dc3e2245 100644 --- a/src/casscf_cipsi/casscf.irp.f +++ b/src/casscf_cipsi/casscf.irp.f @@ -11,7 +11,7 @@ program casscf if(small_active_space)then pt2_relative_error = 0.00001 else - thresh_scf = 1.d-4 + thresh_scf = max(1.d-4,thresh_scf) pt2_relative_error = 0.04 endif touch pt2_relative_error diff --git a/src/casscf_cipsi/chol_bielec.irp.f b/src/casscf_cipsi/chol_bielec.irp.f index 94a76453..f69832c1 100644 --- a/src/casscf_cipsi/chol_bielec.irp.f +++ b/src/casscf_cipsi/chol_bielec.irp.f @@ -6,6 +6,9 @@ BEGIN_PROVIDER [double precision, cholesky_no_1_idx_transp, (cholesky_mo_num, n_ implicit none integer :: i_chol,i_act,i_mo,jj_act double precision, allocatable :: chol_tmp(:,:) + double precision :: wall0,wall1 + call wall_time(wall0) + print*,'Providing cholesky_no_1_idx_transp' allocate(chol_tmp(cholesky_mo_num,n_act_orb)) cholesky_no_1_idx_transp = 0.D0 do i_mo = 1, mo_num @@ -16,46 +19,17 @@ BEGIN_PROVIDER [double precision, cholesky_no_1_idx_transp, (cholesky_mo_num, n_ chol_tmp(i_chol, i_act) = cholesky_mo_transp(i_chol, jj_act, i_mo) enddo enddo -! ! Do the matrix product -! do i_act = 1, n_act_orb -! do jj_act = 1, n_act_orb -! do i_chol = 1, cholesky_mo_num -! cholesky_no_1_idx_transp(i_chol, i_act, i_mo) += chol_tmp(i_chol, jj_act) * natorbsCI(jj_act,i_act) -! enddo -! enddo -! enddo call dgemm('N','N',cholesky_mo_num,n_act_orb,n_act_orb,1.d0, & chol_tmp, size(chol_tmp,1), & natorbsCI, size(natorbsCI,1), & 0.d0, & cholesky_no_1_idx_transp(1,1,i_mo), size(cholesky_no_1_idx_transp,1)) enddo + call wall_time(wall1) + print*,'Time to provide cholesky_no_1_idx_transp = ', wall1 - wall0 END_PROVIDER -BEGIN_PROVIDER [double precision, cholesky_no_2_idx_transp_old, (cholesky_mo_num, n_act_orb, n_act_orb)] - BEGIN_DOC - ! Cholesky vectors with TWO orbital on the active natural orbital basis - END_DOC - implicit none - integer :: i_chol,i_act,j_act,jj_act,jjj_act - double precision, allocatable :: chol_tmp(:,:) - allocate(chol_tmp(cholesky_mo_num,n_act_orb)) - cholesky_no_2_idx_transp_old = 0.D0 - do jj_act = 1, n_act_orb - jjj_act = list_act(jj_act) - do j_act = 1, n_act_orb - do i_act = 1, n_act_orb - do i_chol = 1, cholesky_mo_num - cholesky_no_2_idx_transp_old(i_chol, i_act, j_act) += cholesky_no_1_idx_transp(i_chol, i_act,jjj_act) * natorbsCI(jj_act,j_act) - enddo - enddo - enddo - enddo - -END_PROVIDER - - BEGIN_PROVIDER [double precision, cholesky_no_2_idx_transp, (cholesky_mo_num, n_act_orb, n_act_orb)] BEGIN_DOC ! Cholesky vectors with TWO orbital on the active natural orbital basis @@ -64,6 +38,9 @@ BEGIN_PROVIDER [double precision, cholesky_no_2_idx_transp, (cholesky_mo_num, n_ integer :: i_chol,i_act,j_act,jj_act double precision, allocatable :: chol_tmp(:,:),chol_tmp_bis(:,:) allocate(chol_tmp(cholesky_mo_num,n_act_orb),chol_tmp_bis(cholesky_mo_num,n_act_orb)) + double precision :: wall0,wall1 + call wall_time(wall0) + print*,'Providing cholesky_no_2_idx_transp' cholesky_no_2_idx_transp = 0.D0 do i_act = 1, n_act_orb ! Get all the integrals corresponding to the "j_act" @@ -79,6 +56,8 @@ BEGIN_PROVIDER [double precision, cholesky_no_2_idx_transp, (cholesky_mo_num, n_ 0.d0, & cholesky_no_2_idx_transp(1,1,i_act), size(cholesky_no_2_idx_transp,1)) enddo + call wall_time(wall1) + print*,'Time to provide cholesky_no_2_idx_transp = ', wall1 - wall0 END_PROVIDER @@ -89,6 +68,9 @@ BEGIN_PROVIDER [ double precision, cholesky_no_total_transp, (cholesky_mo_num, m END_DOC integer :: i_chol, i_act, ii_act, j_act, jj_act, i_core_inact, j_core_inact, ii_core_inact, jj_core_inact integer :: i_virt, ii_virt, j_virt, jj_virt + double precision :: wall0,wall1 + call wall_time(wall0) + print*,'Providing cholesky_no_total_transp ' ! Block when two orbitals belong to the core/inact do j_core_inact = 1, n_core_inact_orb jj_core_inact = list_core_inact(j_core_inact) @@ -180,63 +162,87 @@ BEGIN_PROVIDER [ double precision, cholesky_no_total_transp, (cholesky_mo_num, m enddo enddo enddo + + call wall_time(wall1) + print*,'Time to provide cholesky_no_total_transp = ', wall1 - wall0 END_PROVIDER -double precision function bielec_no_basis_chol(i_1,j_1,i_2,j_2) +double precision function bielec_no_basis(i_1,j_1,i_2,j_2) implicit none integer, intent(in) :: i_1,j_1,i_2,j_2 BEGIN_DOC ! integral (i_1 j_1|i_2 j_2) in the mixed basis of both MOs and natural MOs ! END_DOC - integer :: i_chol - bielec_no_basis_chol = 0.d0 - do i_chol = 1, cholesky_mo_num - bielec_no_basis_chol += cholesky_no_total_transp(i_chol,i_1, j_1) * cholesky_no_total_transp(i_chol,i_2,j_2) + integer :: i + bielec_no_basis = 0.d0 + do i = 1, cholesky_mo_num + bielec_no_basis += cholesky_no_total_transp(i,i_1, j_1) * cholesky_no_total_transp(i,i_2,j_2) enddo end -double precision function bielec_PQxx_no_chol(i_mo, j_mo, i_ca, j_ca) +double precision function bielec_PQxx_no(i_mo, j_mo, i_ca, j_ca) implicit none BEGIN_DOC - ! function that computes (i_mo j_mo| i_ca j_ca) with Cholesky decomposition + ! function that computes (i_mo j_mo| i_ca j_ca) with Cholesky decomposition on the NO basis for active orbitals ! - ! indices are unshifted orbital numbers + ! where i_ca, j_ca are in [1:n_core_inact_act_orb] END_DOC integer, intent(in) :: i_ca, j_ca, i_mo, j_mo integer :: ii_ca, jj_ca - double precision :: bielec_no_basis_chol + double precision :: bielec_no_basis ii_ca = list_core_inact_act(i_ca) jj_ca = list_core_inact_act(j_ca) - bielec_PQxx_no_chol = bielec_no_basis_chol(i_mo,j_mo,ii_ca,jj_ca) - + bielec_PQxx_no = bielec_no_basis(i_mo,j_mo,ii_ca,jj_ca) end -double precision function bielec_PxxQ_no_chol(i_mo, j_ca, i_ca, j_mo) +double precision function bielec_PxxQ_no(i_mo, j_ca, i_ca, j_mo) implicit none BEGIN_DOC - ! function that computes (i_mo j_ca |i_ca j_mo) with Cholesky decomposition + ! function that computes (i_mo j_ca |i_ca j_mo) with Cholesky decomposition on the NO basis for active orbitals ! - ! indices are unshifted orbital numbers + ! where i_ca, j_ca are in [1:n_core_inact_act_orb] END_DOC integer, intent(in) :: i_ca, j_ca, i_mo, j_mo integer :: ii_ca, jj_ca - double precision :: bielec_no_basis_chol + double precision :: bielec_no_basis ii_ca = list_core_inact_act(i_ca) jj_ca = list_core_inact_act(j_ca) - bielec_PxxQ_no_chol = bielec_no_basis_chol(i_mo, jj_ca, ii_ca, j_mo) + bielec_PxxQ_no = bielec_no_basis(i_mo, jj_ca, ii_ca, j_mo) end -double precision function bielecCI_no_chol(i_ca, j_ca, k_ca, i_mo) + +double precision function bielec_PQxx(i_mo, j_mo, i_ca, j_ca) + BEGIN_DOC + ! function that computes (i_mo j_mo |i_ca j_ca) with Cholesky decomposition + ! + ! indices are unshifted orbital numbers + ! + ! where i_ca, j_ca are in [1:n_core_inact_act_orb] + END_DOC implicit none - integer, intent(in) :: i_ca, j_ca, k_ca, i_mo - integer :: ii_ca, jj_ca, kk_ca - double precision :: bielec_no_basis_chol - ii_ca = list_act(i_ca) - jj_ca = list_act(j_ca) - kk_ca = list_act(k_ca) - bielecCI_no_chol = bielec_no_basis_chol(ii_ca, jj_ca, kk_ca, i_mo) - + integer, intent(in) :: i_ca, j_ca, j_mo, i_mo + double precision :: mo_two_e_integral + integer :: ii_ca, jj_ca + ii_ca = list_core_inact_act(i_ca) + jj_ca = list_core_inact_act(j_ca) + bielec_PQxx = mo_two_e_integral(i_mo,ii_ca,j_mo,jj_ca) end + +double precision function bielec_PxxQ(i_mo, i_ca, j_ca, j_mo) + BEGIN_DOC + ! function that computes (i_mo j_mo |i_ca j_ca) with Cholesky decomposition + ! + ! where i_ca, j_ca are in [1:n_core_inact_act_orb] + END_DOC + implicit none + integer, intent(in) :: i_ca, j_ca, j_mo, i_mo + double precision :: mo_two_e_integral + integer :: ii_ca, jj_ca + ii_ca = list_core_inact_act(i_ca) + jj_ca = list_core_inact_act(j_ca) + bielec_PxxQ = mo_two_e_integral(i_mo,jj_ca,ii_ca,j_mo) +end + diff --git a/src/casscf_cipsi/chol_garb.irp.f b/src/casscf_cipsi/chol_garb.irp.f new file mode 100644 index 00000000..c4a8fa59 --- /dev/null +++ b/src/casscf_cipsi/chol_garb.irp.f @@ -0,0 +1,34 @@ + +!!!!! FUNCTIONS THAT WORK BUT WHICH ARE USELESS AS THE ARRAYS CAN ALWAYS BE STORED +!double precision function bielecCI_chol(i_a, j_a, k_a, i_mo) +! BEGIN_DOC +! ! function that computes (i_a j_a |k_a j_mo) with Cholesky decomposition +! ! +! ! where i_a, j_a, k_a are in [1:n_act_orb] !!! ONLY ON ACTIVE +! END_DOC +! implicit none +! integer, intent(in) :: i_a, j_a, k_a, i_mo +! integer :: ii_a, jj_a, kk_a +! double precision :: mo_two_e_integral +! ii_a = list_act(i_a) +! jj_a = list_act(j_a) +! kk_a = list_act(k_a) +! bielecCI_chol = mo_two_e_integral(ii_a,kk_a,jj_a,i_mo) +!end + +!double precision function bielecCI_no_chol(i_ca, j_ca, k_ca, i_mo) +! BEGIN_DOC +! ! function that computes (i_ca j_ca |k_ca j_mo) with Cholesky decomposition on the NO basis for active orbitals +! ! +! ! where i_ca, j_ca, k_ca are in [1:n_core_inact_act_orb] +! END_DOC +! implicit none +! integer, intent(in) :: i_ca, j_ca, k_ca, i_mo +! integer :: ii_ca, jj_ca, kk_ca +! double precision :: bielec_no_basis_chol +! ii_ca = list_act(i_ca) +! jj_ca = list_act(j_ca) +! kk_ca = list_act(k_ca) +! bielecCI_no_chol = bielec_no_basis_chol(ii_ca, jj_ca, kk_ca, i_mo) +! +!end diff --git a/src/casscf_cipsi/gradient.irp.f b/src/casscf_cipsi/gradient.irp.f index a1c5e947..961d260d 100644 --- a/src/casscf_cipsi/gradient.irp.f +++ b/src/casscf_cipsi/gradient.irp.f @@ -157,6 +157,7 @@ real*8 function gradvec_it(i,t) integer :: ii,tt,v,vv,x,y integer :: x3,y3 + double precision :: bielec_PQxx_no ii=list_core_inact(i) tt=list_act(t) diff --git a/src/casscf_cipsi/hessian.irp.f b/src/casscf_cipsi/hessian.irp.f index 458c6aa6..9a7a9031 100644 --- a/src/casscf_cipsi/hessian.irp.f +++ b/src/casscf_cipsi/hessian.irp.f @@ -10,6 +10,7 @@ real*8 function hessmat_itju(i,t,j,u) implicit none integer :: i,t,j,u,ii,tt,uu,v,vv,x,xx,y,jj real*8 :: term,t2 + double precision :: bielec_pqxx_no,bielec_pxxq_no ii=list_core_inact(i) tt=list_act(t) @@ -95,6 +96,7 @@ real*8 function hessmat_itja(i,t,j,a) implicit none integer :: i,t,j,a,ii,tt,jj,aa,v,vv,x,y real*8 :: term + double precision :: bielec_pqxx_no,bielec_pxxq_no ! it/ja ii=list_core_inact(i) @@ -128,6 +130,7 @@ real*8 function hessmat_itua(i,t,u,a) implicit none integer :: i,t,u,a,ii,tt,uu,aa,v,vv,x,xx,u3,t3,v3 real*8 :: term + double precision :: bielec_pqxx_no,bielec_pxxq_no ii=list_core_inact(i) tt=list_act(t) @@ -169,6 +172,7 @@ real*8 function hessmat_iajb(i,a,j,b) implicit none integer :: i,a,j,b,ii,aa,jj,bb real*8 :: term + double precision :: bielec_pqxx_no,bielec_pxxq_no ii=list_core_inact(i) aa=list_virt(a) @@ -205,6 +209,7 @@ real*8 function hessmat_iatb(i,a,t,b) implicit none integer :: i,a,t,b,ii,aa,tt,bb,v,vv,x,y,v3,t3 real*8 :: term + double precision :: bielec_pqxx_no,bielec_pxxq_no ii=list_core_inact(i) aa=list_virt(a) @@ -237,6 +242,7 @@ real*8 function hessmat_taub(t,a,u,b) integer :: t,a,u,b,tt,aa,uu,bb,v,vv,x,xx,y integer :: v3,x3 real*8 :: term,t1,t2,t3 + double precision :: bielec_pqxx_no,bielec_pxxq_no tt=list_act(t) aa=list_virt(a) diff --git a/src/casscf_cipsi/mcscf_fock.irp.f b/src/casscf_cipsi/mcscf_fock.irp.f index 0f4b7a99..82b710a7 100644 --- a/src/casscf_cipsi/mcscf_fock.irp.f +++ b/src/casscf_cipsi/mcscf_fock.irp.f @@ -4,6 +4,7 @@ BEGIN_PROVIDER [real*8, Fipq, (mo_num,mo_num) ] END_DOC implicit none integer :: p,q,k,kk,t,tt,u,uu + double precision :: bielec_pxxq_no, bielec_pqxx_no do q=1,mo_num do p=1,mo_num @@ -44,6 +45,7 @@ BEGIN_PROVIDER [real*8, Fapq, (mo_num,mo_num) ] END_DOC implicit none integer :: p,q,k,kk,t,tt,u,uu + double precision :: bielec_pxxq_no, bielec_pqxx_no Fapq = 0.d0 diff --git a/src/casscf_cipsi/test_chol.irp.f b/src/casscf_cipsi/test_chol.irp.f index 87c5c352..bcce7cf7 100644 --- a/src/casscf_cipsi/test_chol.irp.f +++ b/src/casscf_cipsi/test_chol.irp.f @@ -3,7 +3,9 @@ program test_chol read_wf= .True. touch read_wf ! call routine_bielec_PxxQ_no - call routine_bielecCI_no +! call routine_bielecCI_no +! call test_bielec_PxxQ_chol +! call test_bielecCI end @@ -12,7 +14,7 @@ subroutine routine_bielec_PQxx_no integer :: i_chol, i_act, ii_act, j_act, jj_act, i_core_inact, j_core_inact, ii_core_inact, jj_core_inact integer :: i_virt, ii_virt, j_virt, jj_virt, i_mo, j_mo double precision :: exact, new, error, accu, bielec_no_basis_chol - double precision :: bielec_PQxx_no_chol + double precision :: bielec_PQxx_no accu = 0.d0 do i_core_inact = 1, n_core_inact_act_orb @@ -21,9 +23,8 @@ subroutine routine_bielec_PQxx_no jj_core_inact = list_core_inact_act(j_core_inact) do i_mo = 1, mo_num do j_mo = 1, mo_num - exact = bielec_PQxx_no(j_mo,i_mo, j_core_inact, i_core_inact) -! new = bielec_no_basis_chol(j_mo,i_mo, jj_core_inact, ii_core_inact) - new = bielec_PQxx_no_chol(j_mo,i_mo, j_core_inact, i_core_inact) + exact = bielec_PQxx_no_array(j_mo,i_mo, j_core_inact, i_core_inact) + new = bielec_PQxx_no(j_mo,i_mo, j_core_inact, i_core_inact) error = dabs(exact-new) if(dabs(exact).gt.1.d-10)then print*,exact,new,error @@ -36,12 +37,12 @@ subroutine routine_bielec_PQxx_no print*,'accu = ',accu/(dble(mo_num*mo_num*n_core_inact_act_orb**2)) end -subroutine routine_bielec_PxxQ_no +subroutine routine_bielec_PxxQ_no_array implicit none integer :: i_chol, i_act, ii_act, j_act, jj_act, i_core_inact, j_core_inact, ii_core_inact, jj_core_inact integer :: i_virt, ii_virt, j_virt, jj_virt, i_mo, j_mo double precision :: exact, new, error, accu, bielec_no_basis_chol - double precision :: bielec_PxxQ_no_chol + double precision :: bielec_PxxQ_no accu = 0.d0 do i_mo = 1, mo_num @@ -50,9 +51,9 @@ subroutine routine_bielec_PxxQ_no do j_core_inact = 1, n_core_inact_act_orb jj_core_inact = list_core_inact_act(j_core_inact) do j_mo = 1, mo_num - exact = bielec_PxxQ_no(j_mo, j_core_inact, i_core_inact,i_mo) + exact = bielec_PxxQ_no_array(j_mo, j_core_inact, i_core_inact,i_mo) ! new = bielec_no_basis_chol(j_mo,i_mo, jj_core_inact, ii_core_inact) - new = bielec_PxxQ_no_chol(j_mo, j_core_inact, i_core_inact,i_mo) + new = bielec_PxxQ_no(j_mo, j_core_inact, i_core_inact,i_mo) error = dabs(exact-new) accu += error if(dabs(exact).gt.1.d-10)then @@ -65,19 +66,43 @@ subroutine routine_bielec_PxxQ_no print*,'accu = ',accu/(dble(mo_num*mo_num*n_core_inact_act_orb**2)) end -subroutine routine_bielecCI_no +subroutine test_bielec_PQxx(i_mo, j_mo, i_ca, j_ca) implicit none - integer :: i_ca, j_ca, k_ca, i_mo - double precision :: exact, new, error, accu, bielec_no_basis_chol - double precision :: bielecCI_no_chol + integer :: i_mo, j_mo, i_ca, j_ca + double precision :: exact, new, error, accu + double precision :: bielec_PQxx accu = 0.d0 - do i_mo = 1, mo_num - do i_ca = 1, n_act_orb - do j_ca = 1, n_act_orb - do k_ca = 1, n_act_orb - exact =bielecCI_no(k_ca, j_ca, i_ca, i_mo) - new = bielecCI_no_chol(k_ca, j_ca, i_ca, i_mo) + do j_ca = 1, n_core_inact_act_orb + do i_ca = 1, n_core_inact_act_orb + do j_mo = 1, mo_num + do i_mo = 1, mo_num + exact = bielec_PQxx_array(i_mo, j_mo, i_ca, j_ca) + new = bielec_PQxx(i_mo, j_mo, i_ca, j_ca) + error = dabs(exact-new) + accu += error + if(dabs(exact).gt.1.d-10)then + print*,exact,new,error + endif + enddo + enddo + enddo + enddo + print*,'accu = ',accu/(dble(mo_num*mo_num*n_core_inact_act_orb**2)) +end + +subroutine test_bielec_PxxQ_chol(i_mo, i_ca, j_ca, j_mo) + implicit none + integer :: i_mo, i_ca, j_ca, j_mo + double precision :: exact, new, error, accu + double precision :: bielec_PxxQ + accu = 0.d0 + do j_mo = 1, mo_num + do j_ca = 1, n_core_inact_act_orb + do i_ca =1, n_core_inact_act_orb + do i_mo = 1, mo_num + exact = bielec_PxxQ_array(i_mo, i_ca, j_ca, j_mo) + new = bielec_PxxQ(i_mo, i_ca, j_ca, j_mo) error = dabs(exact-new) accu += error if(dabs(exact).gt.1.d-10)then diff --git a/src/casscf_cipsi/tot_en.irp.f b/src/casscf_cipsi/tot_en.irp.f index 1d70e087..37ceac05 100644 --- a/src/casscf_cipsi/tot_en.irp.f +++ b/src/casscf_cipsi/tot_en.irp.f @@ -8,6 +8,7 @@ implicit none integer :: t,u,v,x,i,ii,tt,uu,vv,xx,j,jj,t3,u3,v3,x3 real*8 :: e_one_all,e_two_all + double precision :: bielec_PQxx,bielec_PxxQ e_one_all=0.D0 e_two_all=0.D0 do i=1,n_core_inact_orb From 4a9a11c630c94528fb429255d5d9767d9b2edefc Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Fri, 12 Jul 2024 17:32:41 +0200 Subject: [PATCH 32/38] GPU acceleration compute_tc_int --- plugins/local/tc_int/NEED | 1 + plugins/local/tc_int/compute_tc_int.irp.f | 117 +++++++++++++--------- 2 files changed, 69 insertions(+), 49 deletions(-) diff --git a/plugins/local/tc_int/NEED b/plugins/local/tc_int/NEED index 8a4caf5b..34d2e183 100644 --- a/plugins/local/tc_int/NEED +++ b/plugins/local/tc_int/NEED @@ -1,3 +1,4 @@ +gpu tc_keywords jastrow qmckl diff --git a/plugins/local/tc_int/compute_tc_int.irp.f b/plugins/local/tc_int/compute_tc_int.irp.f index 02f21570..92c90d03 100644 --- a/plugins/local/tc_int/compute_tc_int.irp.f +++ b/plugins/local/tc_int/compute_tc_int.irp.f @@ -2,23 +2,23 @@ ! --- subroutine provide_int2_grad1_u12_ao() - + use gpu BEGIN_DOC ! - ! int2_grad1_u12_ao(i,j,ipoint,1) = \int dr2 [\grad1 u(r1,r2)]_x1 \chi_i(r2) \chi_j(r2) - ! int2_grad1_u12_ao(i,j,ipoint,2) = \int dr2 [\grad1 u(r1,r2)]_y1 \chi_i(r2) \chi_j(r2) - ! int2_grad1_u12_ao(i,j,ipoint,3) = \int dr2 [\grad1 u(r1,r2)]_z1 \chi_i(r2) \chi_j(r2) - ! int2_grad1_u12_ao(i,j,ipoint,4) = \int dr2 [-(1/2) [\grad1 u(r1,r2)]^2] \chi_i(r2) \chi_j(r2) + ! int2_grad1_u12_ao(i,j,ipoint,1) = \int dr2 [\grad1 u(r1,r2)]_x1 \chi_i(r2) \chi_j(r2) + ! int2_grad1_u12_ao(i,j,ipoint,2) = \int dr2 [\grad1 u(r1,r2)]_y1 \chi_i(r2) \chi_j(r2) + ! int2_grad1_u12_ao(i,j,ipoint,3) = \int dr2 [\grad1 u(r1,r2)]_z1 \chi_i(r2) \chi_j(r2) + ! int2_grad1_u12_ao(i,j,ipoint,4) = \int dr2 [-(1/2) [\grad1 u(r1,r2)]^2] \chi_i(r2) \chi_j(r2) ! ! - ! tc_int_2e_ao(k,i,l,j) = (ki|V^TC(r_12)|lj) - ! = where V^TC(r_12) is the total TC operator + ! tc_int_2e_ao(k,i,l,j) = (ki|V^TC(r_12)|lj) + ! = where V^TC(r_12) is the total TC operator ! = tc_grad_and_lapl_ao(k,i,l,j) + tc_grad_square_ao(k,i,l,j) + ao_two_e_coul(k,i,l,j) ! where: ! ! tc_grad_and_lapl_ao(k,i,l,j) = < k l | -1/2 \Delta_1 u(r1,r2) - \grad_1 u(r1,r2) . \grad_1 | ij > - ! = -1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2) - ! = 1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 (-1) \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2) + ! = -1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2) + ! = 1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 (-1) \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2) ! ! tc_grad_square_ao(k,i,l,j) = -1/2 ! @@ -35,8 +35,9 @@ subroutine provide_int2_grad1_u12_ao() double precision :: weight1, ao_k_r, ao_i_r double precision :: der_envsq_x, der_envsq_y, der_envsq_z, lap_envsq double precision :: time0, time1, time2, tc1, tc2, tc - double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:), tc_int_2e_ao(:,:,:,:) - double precision, allocatable :: tmp(:,:,:), c_mat(:,:,:), tmp_grad1_u12(:,:,:) + type(gpu_double4) :: int2_grad1_u12_ao + type(gpu_double3) :: tmp_grad1_u12, tmp_grad1_u12p, tmp + double precision, allocatable :: c_mat(:,:,:), tc_int_2e_ao(:,:,:,:) double precision, external :: get_ao_two_e_integral @@ -51,6 +52,7 @@ subroutine provide_int2_grad1_u12_ao() call total_memory(mem) mem = max(1.d0, qp_max_mem - mem) + mem = 6 n_double = mem * 1.d8 n_blocks = int(min(n_double / (n_points_extra_final_grid * 4.d0), 1.d0*n_points_final_grid)) n_rest = int(mod(n_points_final_grid, n_blocks)) @@ -64,9 +66,9 @@ subroutine provide_int2_grad1_u12_ao() ! --- ! --- - allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,4)) + call gpu_allocate(int2_grad1_u12_ao, ao_num,ao_num,n_points_final_grid,4) - allocate(tmp(n_points_extra_final_grid,ao_num,ao_num)) + call gpu_allocate(tmp,n_points_extra_final_grid,ao_num,ao_num) !$OMP PARALLEL & !$OMP DEFAULT (NONE) & !$OMP PRIVATE (j, i, jpoint) & @@ -75,49 +77,55 @@ subroutine provide_int2_grad1_u12_ao() do j = 1, ao_num do i = 1, ao_num do jpoint = 1, n_points_extra_final_grid - tmp(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j) + tmp%f(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j) enddo enddo enddo !$OMP END DO !$OMP END PARALLEL - allocate(tmp_grad1_u12(n_points_extra_final_grid,n_blocks,4)) + call gpu_allocate(tmp_grad1_u12,n_points_extra_final_grid,n_blocks,4) + call gpu_allocate(tmp_grad1_u12p,n_points_extra_final_grid,n_blocks,4) tc = 0.d0 - + + type(gpu_stream) :: stream(4) + do i=1,4 + call gpu_stream_create(stream(i)) + enddo + do i_pass = 1, n_pass ii = (i_pass-1)*n_blocks + 1 - + call wall_time(tc1) + !$OMP PARALLEL & !$OMP DEFAULT (NONE) & !$OMP PRIVATE (i_blocks, ipoint) & !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12) - !$OMP DO + !$OMP DO do i_blocks = 1, n_blocks ipoint = ii - 1 + i_blocks ! r1 - call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_blocks,1), tmp_grad1_u12(1,i_blocks,2), tmp_grad1_u12(1,i_blocks,3), tmp_grad1_u12(1,i_blocks,4)) + call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12%f(1,i_blocks,1), tmp_grad1_u12%f(1,i_blocks,2), & + tmp_grad1_u12%f(1,i_blocks,3), tmp_grad1_u12%f(1,i_blocks,4)) enddo !$OMP END DO !$OMP END PARALLEL call wall_time(tc2) - tc = tc + tc2 - tc1 + tc = tc + tc2 - tc1 + call gpu_synchronize() + call gpu_copy(tmp_grad1_u12,tmp_grad1_u12p) do m = 1, 4 - call dgemm( "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0 & - , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid & - , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) + call gpu_set_stream(blas_handle, stream(m)) + call gpu_dgemm(blas_handle, "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0 & + , tmp%f(1,1,1), n_points_extra_final_grid, tmp_grad1_u12p%f(1,1,m), n_points_extra_final_grid & + , 0.d0, int2_grad1_u12_ao%f(1,1,ii,m), ao_num*ao_num) enddo enddo - - deallocate(tmp_grad1_u12) - if(n_rest .gt. 0) then - - allocate(tmp_grad1_u12(n_points_extra_final_grid,n_rest,4)) - + ii = n_pass*n_blocks + 1 call wall_time(tc1) @@ -125,26 +133,35 @@ subroutine provide_int2_grad1_u12_ao() !$OMP DEFAULT (NONE) & !$OMP PRIVATE (i_rest, ipoint) & !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12) - !$OMP DO + !$OMP DO do i_rest = 1, n_rest ipoint = ii - 1 + i_rest ! r1 - call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_rest,1), tmp_grad1_u12(1,i_rest,2), tmp_grad1_u12(1,i_rest,3), tmp_grad1_u12(1,i_rest,4)) + call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12%f(1,i_rest,1), tmp_grad1_u12%f(1,i_rest,2), & + tmp_grad1_u12%f(1,i_rest,3), tmp_grad1_u12%f(1,i_rest,4)) enddo !$OMP END DO !$OMP END PARALLEL call wall_time(tc2) - tc = tc + tc2 - tc1 - + tc = tc + tc2 - tc1 + do m = 1, 4 - call dgemm( "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0 & - , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid & - , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) + call gpu_set_stream(blas_handle, stream(m)) + call gpu_dgemm(blas_handle, "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0 & + , tmp%f(1,1,1), n_points_extra_final_grid, tmp_grad1_u12%f(1,1,m), n_points_extra_final_grid & + , 0.d0, int2_grad1_u12_ao%f(1,1,ii,m), ao_num*ao_num) enddo - deallocate(tmp_grad1_u12) endif + call gpu_synchronize() + call gpu_deallocate(tmp_grad1_u12) + call gpu_deallocate(tmp_grad1_u12p) - deallocate(tmp) + do i=1,4 + call gpu_stream_destroy(stream(i)) + enddo + + + call gpu_deallocate(tmp) call wall_time(time1) @@ -152,6 +169,8 @@ subroutine provide_int2_grad1_u12_ao() print*, ' wall time Jastrow derivatives (min) = ', tc / 60.d0 call print_memory_usage() +!TODO +stop ! --- ! --- ! --- @@ -177,7 +196,7 @@ subroutine provide_int2_grad1_u12_ao() !$OMP END DO !$OMP END PARALLEL call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 & - , int2_grad1_u12_ao(1,1,1,4), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid & + , int2_grad1_u12_ao%f(1,1,1,4), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid & , 0.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num) deallocate(c_mat) @@ -188,23 +207,23 @@ subroutine provide_int2_grad1_u12_ao() ! --- call wall_time(time1) - + allocate(c_mat(n_points_final_grid,ao_num,ao_num)) do m = 1, 3 !$OMP PARALLEL & !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r) & - !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, c_mat, & + !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r) & + !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, c_mat, & !$OMP ao_num, n_points_final_grid, final_weight_at_r_vector, m) !$OMP DO SCHEDULE (static) do i = 1, ao_num do k = 1, ao_num do ipoint = 1, n_points_final_grid - + weight1 = 0.5d0 * final_weight_at_r_vector(ipoint) ao_i_r = aos_in_r_array_transp(ipoint,i) ao_k_r = aos_in_r_array_transp(ipoint,k) - + c_mat(ipoint,k,i) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,m) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,m)) enddo enddo @@ -213,7 +232,7 @@ subroutine provide_int2_grad1_u12_ao() !$OMP END PARALLEL call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, -1.d0 & - , int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid & + , int2_grad1_u12_ao%f(1,1,1,m), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid & , 1.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num) enddo deallocate(c_mat) @@ -234,7 +253,7 @@ subroutine provide_int2_grad1_u12_ao() ! --- - call wall_time(time1) + call wall_time(time1) PROVIDE ao_integrals_map !$OMP PARALLEL DEFAULT(NONE) & @@ -245,7 +264,7 @@ subroutine provide_int2_grad1_u12_ao() do l = 1, ao_num do i = 1, ao_num do k = 1, ao_num - ! < 1:i, 2:j | 1:k, 2:l > + ! < 1:i, 2:j | 1:k, 2:l > tc_int_2e_ao(k,i,l,j) = tc_int_2e_ao(k,i,l,j) + get_ao_two_e_integral(i, j, k, l, ao_integrals_map) enddo enddo @@ -263,7 +282,7 @@ subroutine provide_int2_grad1_u12_ao() print*, ' Writing int2_grad1_u12_ao in ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao' open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="write") call ezfio_set_work_empty(.False.) - write(11) int2_grad1_u12_ao(:,:,:,1:3) + write(11) int2_grad1_u12_ao%f(:,:,:,1:3) close(11) print*, ' Saving tc_int_2e_ao in ', trim(ezfio_filename) // '/work/ao_two_e_tc_tot' @@ -276,7 +295,7 @@ subroutine provide_int2_grad1_u12_ao() ! ---- - deallocate(int2_grad1_u12_ao) + call gpu_deallocate(int2_grad1_u12_ao) deallocate(tc_int_2e_ao) call wall_time(time2) From 228796cff525bc6420b2b961c803fda2aa0094bc Mon Sep 17 00:00:00 2001 From: eginer Date: Fri, 12 Jul 2024 17:41:15 +0200 Subject: [PATCH 33/38] split the mo_optimization into mo_optimizatio_utils and mo_optimization --- src/casscf_cipsi/NEED | 1 + src/mo_optimization/NEED | 6 +- src/mo_optimization/cipsi_orb_opt.irp.f | 82 +------------------ .../EZFIO.cfg | 0 src/mo_optimization_utils/NEED | 5 ++ src/mo_optimization_utils/README.md | 74 +++++++++++++++++ .../class.irp.f | 0 .../constants.h | 0 .../diagonal_hessian_list_opt.irp.f | 0 .../diagonal_hessian_opt.irp.f | 0 .../diagonalization_hessian.irp.f | 0 .../first_diagonal_hessian_list_opt.irp.f | 0 .../first_diagonal_hessian_opt.irp.f | 0 .../first_gradient_list_opt.irp.f | 0 .../first_gradient_opt.irp.f | 0 .../first_hessian_list_opt.irp.f | 0 .../first_hessian_opt.irp.f | 0 .../gradient_list_opt.irp.f | 0 .../gradient_opt.irp.f | 0 .../hessian_list_opt.irp.f | 0 .../hessian_opt.irp.f | 0 .../org/TODO.org | 0 .../org/debug_gradient_list_opt.org | 0 .../org/debug_gradient_opt.org | 0 .../org/debug_hessian_list_opt.org | 0 .../org/debug_hessian_opt.org | 0 .../org/diagonal_hessian_list_opt.org | 0 .../org/diagonal_hessian_opt.org | 0 .../org/diagonalization_hessian.org | 0 .../org/first_diagonal_hessian_list_opt.org | 0 .../org/first_diagonal_hessian_opt.org | 0 .../org/first_gradient_list_opt.org | 0 .../org/first_gradient_opt.org | 0 .../org/first_hessian_list_opt.org | 0 .../org/first_hessian_opt.org | 0 .../org/gradient_list_opt.org | 0 .../org/gradient_opt.org | 0 .../org/hessian_list_opt.org | 0 .../org/hessian_opt.org | 0 .../org/my_providers.org | 0 .../org/optimization.org | 0 .../org/orb_opt_trust_v2.org | 0 .../org/state_average_energy.org | 0 .../org/state_weight_normalization.org | 0 .../org/update_parameters.org | 0 .../org/update_st_av_ci_energy.org | 0 .../routine_opt_mos.irp.f | 81 ++++++++++++++++++ .../run_orb_opt_trust_v2.irp.f | 0 .../save_energy.irp.f | 0 .../state_average_energy.irp.f | 0 .../state_weight_normalization.irp.f | 0 .../update_parameters.irp.f | 0 .../update_st_av_ci_energy.irp.f | 0 53 files changed, 163 insertions(+), 86 deletions(-) rename src/{mo_optimization => mo_optimization_utils}/EZFIO.cfg (100%) create mode 100644 src/mo_optimization_utils/NEED create mode 100644 src/mo_optimization_utils/README.md rename src/{mo_optimization => mo_optimization_utils}/class.irp.f (100%) rename src/{mo_optimization => mo_optimization_utils}/constants.h (100%) rename src/{mo_optimization => mo_optimization_utils}/diagonal_hessian_list_opt.irp.f (100%) rename src/{mo_optimization => mo_optimization_utils}/diagonal_hessian_opt.irp.f (100%) rename src/{mo_optimization => mo_optimization_utils}/diagonalization_hessian.irp.f (100%) rename src/{mo_optimization => mo_optimization_utils}/first_diagonal_hessian_list_opt.irp.f (100%) rename src/{mo_optimization => mo_optimization_utils}/first_diagonal_hessian_opt.irp.f (100%) rename src/{mo_optimization => mo_optimization_utils}/first_gradient_list_opt.irp.f (100%) rename src/{mo_optimization => mo_optimization_utils}/first_gradient_opt.irp.f (100%) rename src/{mo_optimization => mo_optimization_utils}/first_hessian_list_opt.irp.f (100%) rename src/{mo_optimization => mo_optimization_utils}/first_hessian_opt.irp.f (100%) rename src/{mo_optimization => mo_optimization_utils}/gradient_list_opt.irp.f (100%) rename src/{mo_optimization => mo_optimization_utils}/gradient_opt.irp.f (100%) rename src/{mo_optimization => mo_optimization_utils}/hessian_list_opt.irp.f (100%) rename src/{mo_optimization => mo_optimization_utils}/hessian_opt.irp.f (100%) rename src/{mo_optimization => mo_optimization_utils}/org/TODO.org (100%) rename src/{mo_optimization => mo_optimization_utils}/org/debug_gradient_list_opt.org (100%) rename src/{mo_optimization => mo_optimization_utils}/org/debug_gradient_opt.org (100%) rename src/{mo_optimization => mo_optimization_utils}/org/debug_hessian_list_opt.org (100%) rename src/{mo_optimization => mo_optimization_utils}/org/debug_hessian_opt.org (100%) rename src/{mo_optimization => mo_optimization_utils}/org/diagonal_hessian_list_opt.org (100%) rename src/{mo_optimization => mo_optimization_utils}/org/diagonal_hessian_opt.org (100%) rename src/{mo_optimization => mo_optimization_utils}/org/diagonalization_hessian.org (100%) rename src/{mo_optimization => mo_optimization_utils}/org/first_diagonal_hessian_list_opt.org (100%) rename src/{mo_optimization => mo_optimization_utils}/org/first_diagonal_hessian_opt.org (100%) rename src/{mo_optimization => mo_optimization_utils}/org/first_gradient_list_opt.org (100%) rename src/{mo_optimization => mo_optimization_utils}/org/first_gradient_opt.org (100%) rename src/{mo_optimization => mo_optimization_utils}/org/first_hessian_list_opt.org (100%) rename src/{mo_optimization => mo_optimization_utils}/org/first_hessian_opt.org (100%) rename src/{mo_optimization => mo_optimization_utils}/org/gradient_list_opt.org (100%) rename src/{mo_optimization => mo_optimization_utils}/org/gradient_opt.org (100%) rename src/{mo_optimization => mo_optimization_utils}/org/hessian_list_opt.org (100%) rename src/{mo_optimization => mo_optimization_utils}/org/hessian_opt.org (100%) rename src/{mo_optimization => mo_optimization_utils}/org/my_providers.org (100%) rename src/{mo_optimization => mo_optimization_utils}/org/optimization.org (100%) rename src/{mo_optimization => mo_optimization_utils}/org/orb_opt_trust_v2.org (100%) rename src/{mo_optimization => mo_optimization_utils}/org/state_average_energy.org (100%) rename src/{mo_optimization => mo_optimization_utils}/org/state_weight_normalization.org (100%) rename src/{mo_optimization => mo_optimization_utils}/org/update_parameters.org (100%) rename src/{mo_optimization => mo_optimization_utils}/org/update_st_av_ci_energy.org (100%) create mode 100644 src/mo_optimization_utils/routine_opt_mos.irp.f rename src/{mo_optimization => mo_optimization_utils}/run_orb_opt_trust_v2.irp.f (100%) rename src/{mo_optimization => mo_optimization_utils}/save_energy.irp.f (100%) rename src/{mo_optimization => mo_optimization_utils}/state_average_energy.irp.f (100%) rename src/{mo_optimization => mo_optimization_utils}/state_weight_normalization.irp.f (100%) rename src/{mo_optimization => mo_optimization_utils}/update_parameters.irp.f (100%) rename src/{mo_optimization => mo_optimization_utils}/update_st_av_ci_energy.irp.f (100%) diff --git a/src/casscf_cipsi/NEED b/src/casscf_cipsi/NEED index dd91c7bd..11d1a78c 100644 --- a/src/casscf_cipsi/NEED +++ b/src/casscf_cipsi/NEED @@ -3,3 +3,4 @@ selectors_full generators_cas two_body_rdm dav_general_mat +mo_optimization diff --git a/src/mo_optimization/NEED b/src/mo_optimization/NEED index 91f41ee3..33f770c3 100644 --- a/src/mo_optimization/NEED +++ b/src/mo_optimization/NEED @@ -1,7 +1,3 @@ -two_body_rdm -hartree_fock -cipsi -davidson_undressed +mo_optimization_utils selectors_full generators_full -utils_trust_region diff --git a/src/mo_optimization/cipsi_orb_opt.irp.f b/src/mo_optimization/cipsi_orb_opt.irp.f index 7e3a79eb..19b3e9db 100644 --- a/src/mo_optimization/cipsi_orb_opt.irp.f +++ b/src/mo_optimization/cipsi_orb_opt.irp.f @@ -2,87 +2,7 @@ program optimization read_wf = .true. ! must be True for the orbital optimization !!! TOUCH read_wf - call run_optimization + call run_optimization_mos_CIPSI end -subroutine run_optimization - - implicit none - - double precision :: e_cipsi, e_opt, delta_e - double precision, allocatable :: Ev(:),PT2(:) - integer :: nb_iter,i - logical :: not_converged - character (len=100) :: filename - - PROVIDE psi_det psi_coef mo_two_e_integrals_in_map ao_pseudo_integrals - allocate(Ev(N_states),PT2(N_states)) - - not_converged = .True. - nb_iter = 0 - - ! To start from the wf - N_det_max = max(n_det,5) - TOUCH N_det_max - - open(unit=10, file=trim(ezfio_filename)//'/mo_optimization/result_opt') - write(10,*) " Ndet E_cipsi E_opt Delta_e" - call state_average_energy(e_cipsi) - write(10,'(I10, 3F15.7)') n_det, e_cipsi, e_cipsi, 0d0 - close(10) - - do while (not_converged) - print*,'' - print*,'======================' - print*,' Cipsi step:', nb_iter - print*,'======================' - print*,'' - print*,'********** cipsi step **********' - ! cispi calculation - call run_stochastic_cipsi(Ev,PT2) - - ! State average energy after the cipsi step - call state_average_energy(e_cipsi) - - print*,'' - print*,'********** optimization step **********' - ! orbital optimization - call run_orb_opt_trust_v2 - - ! State average energy after the orbital optimization - call state_average_energy(e_opt) - - print*,'' - print*,'********** diff step **********' - ! Gain in energy - delta_e = e_opt - e_cipsi - print*, 'Gain in energy during the orbital optimization:', delta_e - - open(unit=10, file=trim(ezfio_filename)//'/mo_optimization/result_opt', position='append') - write(10,'(I10, 3F15.7)') n_det, e_cipsi, e_opt, delta_e - close(10) - - ! Exit - if (delta_e > 1d-12) then - print*, 'WARNING, something wrong happened' - print*, 'The gain (delta_e) in energy during the optimization process' - print*, 'is > 0, but it must be < 0' - print*, 'The program will exit' - exit - endif - - if (n_det > n_det_max_opt) then - print*, 'The number of determinants in the wf > n_det_max_opt' - print*, 'The program will exit' - exit - endif - - ! To double the number of determinants in the wf - N_det_max = int(dble(n_det * 2)*0.9) - TOUCH N_det_max - - nb_iter = nb_iter + 1 - enddo - -end diff --git a/src/mo_optimization/EZFIO.cfg b/src/mo_optimization_utils/EZFIO.cfg similarity index 100% rename from src/mo_optimization/EZFIO.cfg rename to src/mo_optimization_utils/EZFIO.cfg diff --git a/src/mo_optimization_utils/NEED b/src/mo_optimization_utils/NEED new file mode 100644 index 00000000..1a78a17f --- /dev/null +++ b/src/mo_optimization_utils/NEED @@ -0,0 +1,5 @@ +two_body_rdm +hartree_fock +cipsi +davidson_undressed +utils_trust_region diff --git a/src/mo_optimization_utils/README.md b/src/mo_optimization_utils/README.md new file mode 100644 index 00000000..94f29aee --- /dev/null +++ b/src/mo_optimization_utils/README.md @@ -0,0 +1,74 @@ +# Orbital optimization + +## Methods +Different methods are available: +- full hessian +``` +qp set orbital_optimization optimization_method full +``` +- diagonal hessian +``` +qp set orbital_optimization optimization_method diag +``` +- identity matrix +``` +qp set orbital_optimization optimization_method none +``` + +After the optimization the ezfio contains the optimized orbitals + +## For a fixed number of determinants +To optimize the MOs for the actual determinants: +``` +qp run orb_opt +``` + +## For a complete optimization, i.e, with a larger and larger wave function +To optimize the MOs with a larger and larger wave function: +``` +qp run optimization +``` + +The results are stored in the EZFIO in "mo_optimization/result_opt", +with the following format: +(1) (2) (3) (4) +1: Number of determinants in the wf, +2: Cispi energy before the optimization, +3: Cipsi energy after the optimization, +4: Energy difference between (2) and (3). + +The optimization process if the following: +- we do a first cipsi step to obtain a small number of determinants in the wf +- we run an orbital optimization for this wf +- we do a new cipsi step to double the number of determinants in the wf +- we run an orbital optimization for this wf +- ... +- we do that until the energy difference between (2) and (3) is + smaller than the targeted accuracy for the cispi (targeted_accuracy_cipsi in qp edit) + or the wf is larger than a given size (n_det_max_opt in qp_edit) +- after that you can reset your determinants (qp reset -d) and run a clean Cispi calculation + +### End of the optimization +You can choos the number of determinants after what the +optimization will stop: +``` +qp set orbital_optimization n_det_max_opt 1e5 # or any number +``` +## Weight of the states +You can change the weights of the differents states directly in qp edit. +It will affect ths weights used in the orbital optimization. + +# Tests +To run the tests: +``` +qp test +``` + +# Org files +The org files are stored in the directory org in order to avoid overwriting on user changes. +The org files can be modified, to export the change to the source code, run +``` +./TANGLE_org_mode.sh +mv *.irp.f ../. +``` + diff --git a/src/mo_optimization/class.irp.f b/src/mo_optimization_utils/class.irp.f similarity index 100% rename from src/mo_optimization/class.irp.f rename to src/mo_optimization_utils/class.irp.f diff --git a/src/mo_optimization/constants.h b/src/mo_optimization_utils/constants.h similarity index 100% rename from src/mo_optimization/constants.h rename to src/mo_optimization_utils/constants.h diff --git a/src/mo_optimization/diagonal_hessian_list_opt.irp.f b/src/mo_optimization_utils/diagonal_hessian_list_opt.irp.f similarity index 100% rename from src/mo_optimization/diagonal_hessian_list_opt.irp.f rename to src/mo_optimization_utils/diagonal_hessian_list_opt.irp.f diff --git a/src/mo_optimization/diagonal_hessian_opt.irp.f b/src/mo_optimization_utils/diagonal_hessian_opt.irp.f similarity index 100% rename from src/mo_optimization/diagonal_hessian_opt.irp.f rename to src/mo_optimization_utils/diagonal_hessian_opt.irp.f diff --git a/src/mo_optimization/diagonalization_hessian.irp.f b/src/mo_optimization_utils/diagonalization_hessian.irp.f similarity index 100% rename from src/mo_optimization/diagonalization_hessian.irp.f rename to src/mo_optimization_utils/diagonalization_hessian.irp.f diff --git a/src/mo_optimization/first_diagonal_hessian_list_opt.irp.f b/src/mo_optimization_utils/first_diagonal_hessian_list_opt.irp.f similarity index 100% rename from src/mo_optimization/first_diagonal_hessian_list_opt.irp.f rename to src/mo_optimization_utils/first_diagonal_hessian_list_opt.irp.f diff --git a/src/mo_optimization/first_diagonal_hessian_opt.irp.f b/src/mo_optimization_utils/first_diagonal_hessian_opt.irp.f similarity index 100% rename from src/mo_optimization/first_diagonal_hessian_opt.irp.f rename to src/mo_optimization_utils/first_diagonal_hessian_opt.irp.f diff --git a/src/mo_optimization/first_gradient_list_opt.irp.f b/src/mo_optimization_utils/first_gradient_list_opt.irp.f similarity index 100% rename from src/mo_optimization/first_gradient_list_opt.irp.f rename to src/mo_optimization_utils/first_gradient_list_opt.irp.f diff --git a/src/mo_optimization/first_gradient_opt.irp.f b/src/mo_optimization_utils/first_gradient_opt.irp.f similarity index 100% rename from src/mo_optimization/first_gradient_opt.irp.f rename to src/mo_optimization_utils/first_gradient_opt.irp.f diff --git a/src/mo_optimization/first_hessian_list_opt.irp.f b/src/mo_optimization_utils/first_hessian_list_opt.irp.f similarity index 100% rename from src/mo_optimization/first_hessian_list_opt.irp.f rename to src/mo_optimization_utils/first_hessian_list_opt.irp.f diff --git a/src/mo_optimization/first_hessian_opt.irp.f b/src/mo_optimization_utils/first_hessian_opt.irp.f similarity index 100% rename from src/mo_optimization/first_hessian_opt.irp.f rename to src/mo_optimization_utils/first_hessian_opt.irp.f diff --git a/src/mo_optimization/gradient_list_opt.irp.f b/src/mo_optimization_utils/gradient_list_opt.irp.f similarity index 100% rename from src/mo_optimization/gradient_list_opt.irp.f rename to src/mo_optimization_utils/gradient_list_opt.irp.f diff --git a/src/mo_optimization/gradient_opt.irp.f b/src/mo_optimization_utils/gradient_opt.irp.f similarity index 100% rename from src/mo_optimization/gradient_opt.irp.f rename to src/mo_optimization_utils/gradient_opt.irp.f diff --git a/src/mo_optimization/hessian_list_opt.irp.f b/src/mo_optimization_utils/hessian_list_opt.irp.f similarity index 100% rename from src/mo_optimization/hessian_list_opt.irp.f rename to src/mo_optimization_utils/hessian_list_opt.irp.f diff --git a/src/mo_optimization/hessian_opt.irp.f b/src/mo_optimization_utils/hessian_opt.irp.f similarity index 100% rename from src/mo_optimization/hessian_opt.irp.f rename to src/mo_optimization_utils/hessian_opt.irp.f diff --git a/src/mo_optimization/org/TODO.org b/src/mo_optimization_utils/org/TODO.org similarity index 100% rename from src/mo_optimization/org/TODO.org rename to src/mo_optimization_utils/org/TODO.org diff --git a/src/mo_optimization/org/debug_gradient_list_opt.org b/src/mo_optimization_utils/org/debug_gradient_list_opt.org similarity index 100% rename from src/mo_optimization/org/debug_gradient_list_opt.org rename to src/mo_optimization_utils/org/debug_gradient_list_opt.org diff --git a/src/mo_optimization/org/debug_gradient_opt.org b/src/mo_optimization_utils/org/debug_gradient_opt.org similarity index 100% rename from src/mo_optimization/org/debug_gradient_opt.org rename to src/mo_optimization_utils/org/debug_gradient_opt.org diff --git a/src/mo_optimization/org/debug_hessian_list_opt.org b/src/mo_optimization_utils/org/debug_hessian_list_opt.org similarity index 100% rename from src/mo_optimization/org/debug_hessian_list_opt.org rename to src/mo_optimization_utils/org/debug_hessian_list_opt.org diff --git a/src/mo_optimization/org/debug_hessian_opt.org b/src/mo_optimization_utils/org/debug_hessian_opt.org similarity index 100% rename from src/mo_optimization/org/debug_hessian_opt.org rename to src/mo_optimization_utils/org/debug_hessian_opt.org diff --git a/src/mo_optimization/org/diagonal_hessian_list_opt.org b/src/mo_optimization_utils/org/diagonal_hessian_list_opt.org similarity index 100% rename from src/mo_optimization/org/diagonal_hessian_list_opt.org rename to src/mo_optimization_utils/org/diagonal_hessian_list_opt.org diff --git a/src/mo_optimization/org/diagonal_hessian_opt.org b/src/mo_optimization_utils/org/diagonal_hessian_opt.org similarity index 100% rename from src/mo_optimization/org/diagonal_hessian_opt.org rename to src/mo_optimization_utils/org/diagonal_hessian_opt.org diff --git a/src/mo_optimization/org/diagonalization_hessian.org b/src/mo_optimization_utils/org/diagonalization_hessian.org similarity index 100% rename from src/mo_optimization/org/diagonalization_hessian.org rename to src/mo_optimization_utils/org/diagonalization_hessian.org diff --git a/src/mo_optimization/org/first_diagonal_hessian_list_opt.org b/src/mo_optimization_utils/org/first_diagonal_hessian_list_opt.org similarity index 100% rename from src/mo_optimization/org/first_diagonal_hessian_list_opt.org rename to src/mo_optimization_utils/org/first_diagonal_hessian_list_opt.org diff --git a/src/mo_optimization/org/first_diagonal_hessian_opt.org b/src/mo_optimization_utils/org/first_diagonal_hessian_opt.org similarity index 100% rename from src/mo_optimization/org/first_diagonal_hessian_opt.org rename to src/mo_optimization_utils/org/first_diagonal_hessian_opt.org diff --git a/src/mo_optimization/org/first_gradient_list_opt.org b/src/mo_optimization_utils/org/first_gradient_list_opt.org similarity index 100% rename from src/mo_optimization/org/first_gradient_list_opt.org rename to src/mo_optimization_utils/org/first_gradient_list_opt.org diff --git a/src/mo_optimization/org/first_gradient_opt.org b/src/mo_optimization_utils/org/first_gradient_opt.org similarity index 100% rename from src/mo_optimization/org/first_gradient_opt.org rename to src/mo_optimization_utils/org/first_gradient_opt.org diff --git a/src/mo_optimization/org/first_hessian_list_opt.org b/src/mo_optimization_utils/org/first_hessian_list_opt.org similarity index 100% rename from src/mo_optimization/org/first_hessian_list_opt.org rename to src/mo_optimization_utils/org/first_hessian_list_opt.org diff --git a/src/mo_optimization/org/first_hessian_opt.org b/src/mo_optimization_utils/org/first_hessian_opt.org similarity index 100% rename from src/mo_optimization/org/first_hessian_opt.org rename to src/mo_optimization_utils/org/first_hessian_opt.org diff --git a/src/mo_optimization/org/gradient_list_opt.org b/src/mo_optimization_utils/org/gradient_list_opt.org similarity index 100% rename from src/mo_optimization/org/gradient_list_opt.org rename to src/mo_optimization_utils/org/gradient_list_opt.org diff --git a/src/mo_optimization/org/gradient_opt.org b/src/mo_optimization_utils/org/gradient_opt.org similarity index 100% rename from src/mo_optimization/org/gradient_opt.org rename to src/mo_optimization_utils/org/gradient_opt.org diff --git a/src/mo_optimization/org/hessian_list_opt.org b/src/mo_optimization_utils/org/hessian_list_opt.org similarity index 100% rename from src/mo_optimization/org/hessian_list_opt.org rename to src/mo_optimization_utils/org/hessian_list_opt.org diff --git a/src/mo_optimization/org/hessian_opt.org b/src/mo_optimization_utils/org/hessian_opt.org similarity index 100% rename from src/mo_optimization/org/hessian_opt.org rename to src/mo_optimization_utils/org/hessian_opt.org diff --git a/src/mo_optimization/org/my_providers.org b/src/mo_optimization_utils/org/my_providers.org similarity index 100% rename from src/mo_optimization/org/my_providers.org rename to src/mo_optimization_utils/org/my_providers.org diff --git a/src/mo_optimization/org/optimization.org b/src/mo_optimization_utils/org/optimization.org similarity index 100% rename from src/mo_optimization/org/optimization.org rename to src/mo_optimization_utils/org/optimization.org diff --git a/src/mo_optimization/org/orb_opt_trust_v2.org b/src/mo_optimization_utils/org/orb_opt_trust_v2.org similarity index 100% rename from src/mo_optimization/org/orb_opt_trust_v2.org rename to src/mo_optimization_utils/org/orb_opt_trust_v2.org diff --git a/src/mo_optimization/org/state_average_energy.org b/src/mo_optimization_utils/org/state_average_energy.org similarity index 100% rename from src/mo_optimization/org/state_average_energy.org rename to src/mo_optimization_utils/org/state_average_energy.org diff --git a/src/mo_optimization/org/state_weight_normalization.org b/src/mo_optimization_utils/org/state_weight_normalization.org similarity index 100% rename from src/mo_optimization/org/state_weight_normalization.org rename to src/mo_optimization_utils/org/state_weight_normalization.org diff --git a/src/mo_optimization/org/update_parameters.org b/src/mo_optimization_utils/org/update_parameters.org similarity index 100% rename from src/mo_optimization/org/update_parameters.org rename to src/mo_optimization_utils/org/update_parameters.org diff --git a/src/mo_optimization/org/update_st_av_ci_energy.org b/src/mo_optimization_utils/org/update_st_av_ci_energy.org similarity index 100% rename from src/mo_optimization/org/update_st_av_ci_energy.org rename to src/mo_optimization_utils/org/update_st_av_ci_energy.org diff --git a/src/mo_optimization_utils/routine_opt_mos.irp.f b/src/mo_optimization_utils/routine_opt_mos.irp.f new file mode 100644 index 00000000..fceba2c5 --- /dev/null +++ b/src/mo_optimization_utils/routine_opt_mos.irp.f @@ -0,0 +1,81 @@ + +subroutine run_optimization_mos_CIPSI + + implicit none + + double precision :: e_cipsi, e_opt, delta_e + double precision, allocatable :: Ev(:),PT2(:) + integer :: nb_iter,i + logical :: not_converged + character (len=100) :: filename + + PROVIDE psi_det psi_coef mo_two_e_integrals_in_map ao_pseudo_integrals + allocate(Ev(N_states),PT2(N_states)) + + not_converged = .True. + nb_iter = 0 + + ! To start from the wf + N_det_max = max(n_det,5) + TOUCH N_det_max + + open(unit=10, file=trim(ezfio_filename)//'/mo_optimization/result_opt') + write(10,*) " Ndet E_cipsi E_opt Delta_e" + call state_average_energy(e_cipsi) + write(10,'(I10, 3F15.7)') n_det, e_cipsi, e_cipsi, 0d0 + close(10) + + do while (not_converged) + print*,'' + print*,'======================' + print*,' Cipsi step:', nb_iter + print*,'======================' + print*,'' + print*,'********** cipsi step **********' + ! cispi calculation + call run_stochastic_cipsi(Ev,PT2) + + ! State average energy after the cipsi step + call state_average_energy(e_cipsi) + + print*,'' + print*,'********** optimization step **********' + ! orbital optimization + call run_orb_opt_trust_v2 + + ! State average energy after the orbital optimization + call state_average_energy(e_opt) + + print*,'' + print*,'********** diff step **********' + ! Gain in energy + delta_e = e_opt - e_cipsi + print*, 'Gain in energy during the orbital optimization:', delta_e + + open(unit=10, file=trim(ezfio_filename)//'/mo_optimization/result_opt', position='append') + write(10,'(I10, 3F15.7)') n_det, e_cipsi, e_opt, delta_e + close(10) + + ! Exit + if (delta_e > 1d-12) then + print*, 'WARNING, something wrong happened' + print*, 'The gain (delta_e) in energy during the optimization process' + print*, 'is > 0, but it must be < 0' + print*, 'The program will exit' + exit + endif + + if (n_det > n_det_max_opt) then + print*, 'The number of determinants in the wf > n_det_max_opt' + print*, 'The program will exit' + exit + endif + + ! To double the number of determinants in the wf + N_det_max = int(dble(n_det * 2)*0.9) + TOUCH N_det_max + + nb_iter = nb_iter + 1 + enddo + +end diff --git a/src/mo_optimization/run_orb_opt_trust_v2.irp.f b/src/mo_optimization_utils/run_orb_opt_trust_v2.irp.f similarity index 100% rename from src/mo_optimization/run_orb_opt_trust_v2.irp.f rename to src/mo_optimization_utils/run_orb_opt_trust_v2.irp.f diff --git a/src/mo_optimization/save_energy.irp.f b/src/mo_optimization_utils/save_energy.irp.f similarity index 100% rename from src/mo_optimization/save_energy.irp.f rename to src/mo_optimization_utils/save_energy.irp.f diff --git a/src/mo_optimization/state_average_energy.irp.f b/src/mo_optimization_utils/state_average_energy.irp.f similarity index 100% rename from src/mo_optimization/state_average_energy.irp.f rename to src/mo_optimization_utils/state_average_energy.irp.f diff --git a/src/mo_optimization/state_weight_normalization.irp.f b/src/mo_optimization_utils/state_weight_normalization.irp.f similarity index 100% rename from src/mo_optimization/state_weight_normalization.irp.f rename to src/mo_optimization_utils/state_weight_normalization.irp.f diff --git a/src/mo_optimization/update_parameters.irp.f b/src/mo_optimization_utils/update_parameters.irp.f similarity index 100% rename from src/mo_optimization/update_parameters.irp.f rename to src/mo_optimization_utils/update_parameters.irp.f diff --git a/src/mo_optimization/update_st_av_ci_energy.irp.f b/src/mo_optimization_utils/update_st_av_ci_energy.irp.f similarity index 100% rename from src/mo_optimization/update_st_av_ci_energy.irp.f rename to src/mo_optimization_utils/update_st_av_ci_energy.irp.f From 6985d4d5493a6204f95a8d1d1cccbccc80c12071 Mon Sep 17 00:00:00 2001 From: eginer Date: Fri, 12 Jul 2024 18:25:17 +0200 Subject: [PATCH 34/38] the casscf does not work with mo optimization ... --- src/casscf_cipsi/EZFIO.cfg | 6 + src/casscf_cipsi/NEED | 2 +- src/casscf_cipsi/casscf.irp.f | 166 +++++++++--------- .../class.irp.f | 0 4 files changed, 94 insertions(+), 80 deletions(-) rename src/{mo_optimization_utils => mo_optimization}/class.irp.f (100%) diff --git a/src/casscf_cipsi/EZFIO.cfg b/src/casscf_cipsi/EZFIO.cfg index 18e0b6b1..5b72d906 100644 --- a/src/casscf_cipsi/EZFIO.cfg +++ b/src/casscf_cipsi/EZFIO.cfg @@ -79,3 +79,9 @@ type: logical doc: If |true|, the pt2_max value in the CIPSI is set to 10-10 and will not change interface: ezfio,provider,ocaml default: False + +[act_mos_opt] +type: logical +doc: If |true|, the active orbitals are also optimized variationally +interface: ezfio,provider,ocaml +default: False diff --git a/src/casscf_cipsi/NEED b/src/casscf_cipsi/NEED index 11d1a78c..32f5ae90 100644 --- a/src/casscf_cipsi/NEED +++ b/src/casscf_cipsi/NEED @@ -3,4 +3,4 @@ selectors_full generators_cas two_body_rdm dav_general_mat -mo_optimization +mo_optimization_utils diff --git a/src/casscf_cipsi/casscf.irp.f b/src/casscf_cipsi/casscf.irp.f index dc3e2245..b64a9d8f 100644 --- a/src/casscf_cipsi/casscf.irp.f +++ b/src/casscf_cipsi/casscf.irp.f @@ -46,94 +46,101 @@ subroutine run do while (.not.converged) print*,'pt2_max = ',pt2_max call run_stochastic_cipsi(Ev,PT2) - print*,'Ev,PT2',Ev(1),PT2(1) - E_PT2(1:N_states) = Ev(1:N_states) + PT2(1:N_states) - energy_old = energy - energy = eone+etwo+ecore - pt2_max_before = pt2_max - - call write_time(6) - call write_int(6,iteration,'CAS-SCF iteration = ') - call write_double(6,energy,'State-average CAS-SCF energy = ') -! if(n_states == 1)then -! call ezfio_get_casscf_cipsi_energy_pt2(E_PT2) -! call ezfio_get_casscf_cipsi_energy(PT2) - double precision :: delta_E_istate, e_av - e_av = 0.d0 - do istate=1,N_states - e_av += state_average_weight(istate) * Ev(istate) - if(istate.gt.1)then - delta_E_istate = E_PT2(istate) - E_PT2(1) - write(*,'(A6,I2,A18,F16.10)')'state ',istate,' Delta E+PT2 = ',delta_E_istate - endif - write(*,'(A6,I2,A18,F16.10)')'state ',istate,' E + PT2 energy = ',E_PT2(istate) - write(*,'(A6,I2,A18,F16.10)')'state ',istate,' PT2 energy = ',PT2(istate) -! call write_double(6,E_PT2(istate),'E + PT2 energy = ') -! call write_double(6,PT2(istate),' PT2 = ') - enddo - call write_double(6,e_av,'State-average CAS-SCF energy bis = ') - call write_double(6,pt2_max,' PT2_MAX = ') +! if(act_mos_opt)then DOES NOT WORK +! call run_orb_opt_trust_v2 +! call run_stochastic_cipsi(Ev,PT2) ! endif - - print*,'' - call write_double(6,norm_grad_vec2,'Norm of gradients = ') - call write_double(6,norm_grad_vec2_tab(1), ' Core-active gradients = ') - call write_double(6,norm_grad_vec2_tab(2), ' Core-virtual gradients = ') - call write_double(6,norm_grad_vec2_tab(3), ' Active-virtual gradients = ') - print*,'' - call write_double(6,energy_improvement, 'Predicted energy improvement = ') - - if(criterion_casscf == "energy")then - converged = dabs(energy_improvement) < thresh_scf - else if (criterion_casscf == "gradients")then - converged = norm_grad_vec2 < thresh_scf - else if (criterion_casscf == "e_pt2")then - delta_E = 0.d0 - do istate = 1, N_states - delta_E += dabs(E_PT2(istate) - ept2_before(istate)) - enddo - converged = dabs(delta_E) < thresh_casscf - endif - ept2_before = E_PT2 - if(.not.small_active_space)then - if(adaptive_pt2_max)then - pt2_max = dabs(energy_improvement / (pt2_relative_error)) - pt2_max = min(pt2_max, pt2_max_before) - if(n_act_orb.ge.n_big_act_orb)then - pt2_max = max(pt2_max,pt2_min_casscf) - endif + if(.True.)then + print*,'Ev,PT2',Ev(1),PT2(1) + E_PT2(1:N_states) = Ev(1:N_states) + PT2(1:N_states) + energy_old = energy + energy = eone+etwo+ecore + pt2_max_before = pt2_max + + call write_time(6) + call write_int(6,iteration,'CAS-SCF iteration = ') + call write_double(6,energy,'State-average CAS-SCF energy = ') +!! if(n_states == 1)then +!! call ezfio_get_casscf_cipsi_energy_pt2(E_PT2) +!! call ezfio_get_casscf_cipsi_energy(PT2) + double precision :: delta_E_istate, e_av + e_av = 0.d0 + do istate=1,N_states + e_av += state_average_weight(istate) * Ev(istate) + if(istate.gt.1)then + delta_E_istate = E_PT2(istate) - E_PT2(1) + write(*,'(A6,I2,A18,F16.10)')'state ',istate,' Delta E+PT2 = ',delta_E_istate + endif + write(*,'(A6,I2,A18,F16.10)')'state ',istate,' E + PT2 energy = ',E_PT2(istate) + write(*,'(A6,I2,A18,F16.10)')'state ',istate,' PT2 energy = ',PT2(istate) +!! call write_double(6,E_PT2(istate),'E + PT2 energy = ') +!! call write_double(6,PT2(istate),' PT2 = ') + enddo + call write_double(6,e_av,'State-average CAS-SCF energy bis = ') + call write_double(6,pt2_max,' PT2_MAX = ') +!! endif + + print*,'' + call write_double(6,norm_grad_vec2,'Norm of gradients = ') + call write_double(6,norm_grad_vec2_tab(1), ' Core-active gradients = ') + call write_double(6,norm_grad_vec2_tab(2), ' Core-virtual gradients = ') + call write_double(6,norm_grad_vec2_tab(3), ' Active-virtual gradients = ') + print*,'' + call write_double(6,energy_improvement, 'Predicted energy improvement = ') + + if(criterion_casscf == "energy")then + converged = dabs(energy_improvement) < thresh_scf + else if (criterion_casscf == "gradients")then + converged = norm_grad_vec2 < thresh_scf + else if (criterion_casscf == "e_pt2")then + delta_E = 0.d0 + do istate = 1, N_states + delta_E += dabs(E_PT2(istate) - ept2_before(istate)) + enddo + converged = dabs(delta_E) < thresh_casscf endif - endif - print*,'' - call write_double(6,pt2_max, 'PT2_MAX for next iteration = ') - - mo_coef = NewOrbs - mo_occ = occnum - if(.not.converged)then - call save_mos - iteration += 1 - if(norm_grad_vec2.gt.0.01d0)then - N_det = N_states - else - N_det = max(N_det/8 ,N_states) - endif - psi_det = psi_det_sorted - psi_coef = psi_coef_sorted - read_wf = .True. - call clear_mo_map - SOFT_TOUCH mo_coef N_det psi_det psi_coef + ept2_before = E_PT2 if(.not.small_active_space)then if(adaptive_pt2_max)then - SOFT_TOUCH pt2_max + pt2_max = dabs(energy_improvement / (pt2_relative_error)) + pt2_max = min(pt2_max, pt2_max_before) + if(n_act_orb.ge.n_big_act_orb)then + pt2_max = max(pt2_max,pt2_min_casscf) + endif endif endif - if(iteration .gt. 3)then - state_following_casscf = state_following_casscf_cipsi_save - soft_touch state_following_casscf + print*,'' + call write_double(6,pt2_max, 'PT2_MAX for next iteration = ') + + mo_coef = NewOrbs + mo_occ = occnum + if(.not.converged)then + call save_mos + iteration += 1 + if(norm_grad_vec2.gt.0.01d0)then + N_det = N_states + else + N_det = max(N_det/8 ,N_states) + endif + psi_det = psi_det_sorted + psi_coef = psi_coef_sorted + read_wf = .True. + call clear_mo_map + SOFT_TOUCH mo_coef N_det psi_det psi_coef + if(.not.small_active_space)then + if(adaptive_pt2_max)then + SOFT_TOUCH pt2_max + endif + endif + if(iteration .gt. 3)then + state_following_casscf = state_following_casscf_cipsi_save + soft_touch state_following_casscf + endif endif endif - + enddo + if(.True.)then integer :: i print*,'Converged CASSCF ' print*,'--------------------------' @@ -153,6 +160,7 @@ subroutine run ! write(*,*)mcscf_fock_alpha_mo(i,i) enddo + endif end diff --git a/src/mo_optimization_utils/class.irp.f b/src/mo_optimization/class.irp.f similarity index 100% rename from src/mo_optimization_utils/class.irp.f rename to src/mo_optimization/class.irp.f From 31028f8979189d1cc2822d4dc60eb527ed639932 Mon Sep 17 00:00:00 2001 From: eginer Date: Tue, 16 Jul 2024 17:44:48 +0200 Subject: [PATCH 35/38] fixed some weird dependencies in TC, introduced an AO cholesky 2e function --- .../cipsi_tc_bi_ortho/stochastic_cipsi.irp.f | 32 +++++++++++++++++++ plugins/local/fci_tc_bi/fci_tc_bi_ortho.irp.f | 20 ++++++++++-- .../local/non_h_ints_mu/total_tc_int.irp.f | 18 +++++++---- plugins/local/slater_tc/slater_tc_opt.irp.f | 2 -- plugins/local/slater_tc/tc_hmat.irp.f | 4 ++- src/ao_two_e_ints/cholesky.irp.f | 15 ++++++++- 6 files changed, 79 insertions(+), 12 deletions(-) diff --git a/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f b/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f index e363830d..6b8f3b42 100644 --- a/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f +++ b/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f @@ -1,4 +1,36 @@ + +! --- + +subroutine run_pouet + + BEGIN_DOC + ! Selected Full Configuration Interaction with Stochastic selection and PT2. + END_DOC + + use selection_types + implicit none + integer :: i, j, k, ndet + integer :: to_select + logical :: has + type(pt2_type) :: pt2_data, pt2_data_err + double precision :: rss + double precision :: correlation_energy_ratio + double precision :: hf_energy_ref + double precision :: relative_error + double precision, allocatable :: zeros(:),E_tc(:), norm(:) + + logical, external :: qp_stop + double precision, external :: memory_of_double + + PROVIDE mo_l_coef mo_r_coef + PROVIDE H_apply_buffer_allocated distributed_davidson + + print*, ' Diagonal elements of the Fock matrix ' + do i = 1, mo_num + write(*,*) i, Fock_matrix_tc_mo_tot(i,i) + enddo +end ! --- subroutine run_stochastic_cipsi diff --git a/plugins/local/fci_tc_bi/fci_tc_bi_ortho.irp.f b/plugins/local/fci_tc_bi/fci_tc_bi_ortho.irp.f index 1c1c0411..f1de0fe3 100644 --- a/plugins/local/fci_tc_bi/fci_tc_bi_ortho.irp.f +++ b/plugins/local/fci_tc_bi/fci_tc_bi_ortho.irp.f @@ -65,7 +65,15 @@ subroutine run_cipsi_tc() if (.not. is_zmq_slave) then - PROVIDE psi_det psi_coef mo_bi_ortho_tc_two_e mo_bi_ortho_tc_one_e + if(.True.)then! DO NOT REMOVE THE IF(.TRUE.) !! + ! this has to be provided before mo_bi_ortho_tc_two_e to avoid twice the computation of ao_two_e_tc_tot + PROVIDE Fock_matrix_tc_mo_tot + ! because Fock_matrix_tc_mo_tot depends on ao_two_e_tc_tot + ! and that mo_bi_ortho_tc_two_e erase ao_two_e_tc_tot after being provided + endif + if(.True.)then ! DO NOT REMOVE THE IF(.TRUE.) !! + PROVIDE psi_det psi_coef mo_bi_ortho_tc_two_e mo_bi_ortho_tc_one_e + endif if((elec_alpha_num+elec_beta_num) .ge. 3) then if(three_body_h_tc) then @@ -90,8 +98,16 @@ subroutine run_cipsi_tc() call json_close else + if(.True.)then! DO NOT REMOVE THE IF(.TRUE.) !! + ! this has to be provided before mo_bi_ortho_tc_two_e to avoid twice the computation of ao_two_e_tc_tot + PROVIDE Fock_matrix_tc_mo_tot + ! because Fock_matrix_tc_mo_tot depends on ao_two_e_tc_tot + ! and that mo_bi_ortho_tc_two_e erase ao_two_e_tc_tot after being provided + endif - PROVIDE mo_bi_ortho_tc_one_e mo_bi_ortho_tc_two_e pt2_min_parallel_tasks + if(.True.)then! DO NOT REMOVE THE IF(.TRUE.) !! + PROVIDE mo_bi_ortho_tc_one_e mo_bi_ortho_tc_two_e pt2_min_parallel_tasks + endif if((elec_alpha_num+elec_beta_num) .ge. 3) then if(three_body_h_tc) then diff --git a/plugins/local/non_h_ints_mu/total_tc_int.irp.f b/plugins/local/non_h_ints_mu/total_tc_int.irp.f index 656f5f16..fb09168e 100644 --- a/plugins/local/non_h_ints_mu/total_tc_int.irp.f +++ b/plugins/local/non_h_ints_mu/total_tc_int.irp.f @@ -288,25 +288,31 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n !$OMP END DO !$OMP END PARALLEL else - print*, ' ao_integrals_map will be used' - PROVIDE ao_integrals_map +! print*, ' ao_integrals_map will be used' +! PROVIDE ao_integrals_map + print*,'Cholesky vectors will be used ' + double precision :: get_ao_integ_chol,eri + eri = get_ao_integ_chol(1,1,1,1) ! FOR OPENMP !$OMP PARALLEL DEFAULT(NONE) & - !$OMP SHARED(ao_num, ao_two_e_tc_tot, ao_integrals_map) & - !$OMP PRIVATE(i, j, k, l) +!!! !$OMP SHARED(ao_num, ao_two_e_tc_tot, ao_integrals_map) & + !$OMP SHARED(ao_num, ao_two_e_tc_tot) & + !$OMP PRIVATE(i, j, k, l,eri) !$OMP DO COLLAPSE(3) do j = 1, ao_num do l = 1, ao_num do i = 1, ao_num do k = 1, ao_num ! < 1:i, 2:j | 1:k, 2:l > - ao_two_e_tc_tot(k,i,l,j) = ao_two_e_tc_tot(k,i,l,j) + get_ao_two_e_integral(i, j, k, l, ao_integrals_map) +! eri = get_ao_two_e_integral(i, j, k, l, ao_integrals_map) + eri = get_ao_integ_chol(i,k,j,l) + ao_two_e_tc_tot(k,i,l,j) = ao_two_e_tc_tot(k,i,l,j) + eri enddo enddo enddo enddo !$OMP END DO !$OMP END PARALLEL - FREE ao_integrals_map +! FREE ao_integrals_map endif if((tc_integ_type .eq. "numeric") .and. (.not. tc_save_mem)) then diff --git a/plugins/local/slater_tc/slater_tc_opt.irp.f b/plugins/local/slater_tc/slater_tc_opt.irp.f index 5651a299..3c4421f8 100644 --- a/plugins/local/slater_tc/slater_tc_opt.irp.f +++ b/plugins/local/slater_tc/slater_tc_opt.irp.f @@ -10,8 +10,6 @@ subroutine provide_all_three_ints_bi_ortho() implicit none double precision :: t1, t2 - PROVIDE ao_two_e_integrals_in_map - print *, ' start provide_all_three_ints_bi_ortho' call wall_time(t1) diff --git a/plugins/local/slater_tc/tc_hmat.irp.f b/plugins/local/slater_tc/tc_hmat.irp.f index cc780364..6323d129 100644 --- a/plugins/local/slater_tc/tc_hmat.irp.f +++ b/plugins/local/slater_tc/tc_hmat.irp.f @@ -30,7 +30,9 @@ BEGIN_PROVIDER [double precision, htilde_matrix_elmt_bi_ortho, (N_det,N_det)] print *, ' PROVIDING htilde_matrix_elmt_bi_ortho ...' call wall_time(t1) - call provide_all_three_ints_bi_ortho() + if(three_body_h_tc)then + call provide_all_three_ints_bi_ortho() + endif i = 1 j = 1 diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f index a2d9d043..bfa6bd0a 100644 --- a/src/ao_two_e_ints/cholesky.irp.f +++ b/src/ao_two_e_ints/cholesky.irp.f @@ -1,3 +1,15 @@ +double precision function get_ao_integ_chol(i,j,k,l) + implicit none + BEGIN_DOC + ! CHOLESKY representation of the integral of the AO basis or (ij|kl) + ! i(r1) j(r1) 1/r12 k(r2) l(r2) + END_DOC + integer, intent(in) :: i,j,k,l + double precision, external :: ddot + get_ao_integ_chol = ddot(cholesky_ao_num, cholesky_ao_transp(1,i,j), 1, cholesky_ao_transp(1,k,l), 1) + +end + BEGIN_PROVIDER [ double precision, cholesky_ao_transp, (cholesky_ao_num, ao_num, ao_num) ] implicit none BEGIN_DOC @@ -162,7 +174,8 @@ END_PROVIDER np = int(np8,4) if (np <= 0) stop 'np<=0' - rank_max = min(np,20*elec_num*elec_num) +! rank_max = min(np,20*elec_num*elec_num) + rank_max = np call mmap(trim(ezfio_work_dir)//'cholesky_ao_tmp', (/ ndim8, rank_max /), 8, fd(1), .False., .True., c_pointer(1)) call c_f_pointer(c_pointer(1), L, (/ ndim8, rank_max /)) From a0140b9b0af59712494de5b443846c70a432465b Mon Sep 17 00:00:00 2001 From: eginer Date: Wed, 24 Jul 2024 12:20:16 +0200 Subject: [PATCH 36/38] added mu_of_r_mean_field.irp.f --- src/mu_of_r/mu_of_r_mean_field.irp.f | 132 +++++++++++++++++++++++++++ src/mu_of_r/test_proj_op.irp.f | 96 ++++++++++++++++++- 2 files changed, 227 insertions(+), 1 deletion(-) create mode 100644 src/mu_of_r/mu_of_r_mean_field.irp.f diff --git a/src/mu_of_r/mu_of_r_mean_field.irp.f b/src/mu_of_r/mu_of_r_mean_field.irp.f new file mode 100644 index 00000000..9b9c2e20 --- /dev/null +++ b/src/mu_of_r/mu_of_r_mean_field.irp.f @@ -0,0 +1,132 @@ +BEGIN_PROVIDER [ double precision, two_e_int_mf, (elec_beta_num,elec_alpha_num,elec_beta_num,elec_alpha_num)] + implicit none + integer :: i,j,k,l + double precision :: get_two_e_integral + do i = 1, elec_alpha_num + do j = 1, elec_beta_num + do k = 1, elec_alpha_num + do l = 1, elec_beta_num + two_e_int_mf(l,k,j,i) = get_two_e_integral(l,k,j,i,mo_integrals_map) + enddo + enddo + enddo + enddo +END_PROVIDER + +subroutine get_f_mf_ab(r,f_mf_ab,two_bod_dens, dm_a, dm_b) + implicit none + double precision, intent(in) :: r(3) + double precision, intent(out):: f_mf_ab,two_bod_dens, dm_a, dm_b + double precision, allocatable :: mos_array_r(:),mos_array_a(:), mos_array_b(:) + integer :: i,j,k,l + allocate(mos_array_r(mo_num), mos_array_a(elec_alpha_num), mos_array_b(elec_alpha_num)) + call give_all_mos_at_r(r,mos_array_r) + do i = 1, elec_alpha_num + mos_array_a(i) = mos_array_r(i) + enddo + do i = 1, elec_beta_num + mos_array_b(i) = mos_array_r(i) + enddo + + dm_a = 0.d0 + do i = 1, elec_alpha_num + dm_a += mos_array_a(i) * mos_array_a(i) + enddo + + dm_b = 0.d0 + do i = 1, elec_beta_num + dm_b += mos_array_b(i) * mos_array_b(i) + enddo + two_bod_dens = dm_a * dm_b + + f_mf_ab = 0.d0 + do i = 1, elec_alpha_num + do j = 1, elec_beta_num + do k = 1, elec_alpha_num + do l = 1, elec_beta_num + f_mf_ab += two_e_int_mf(l,k,j,i) * mos_array_a(i) * mos_array_a(k) * mos_array_b(j) * mos_array_b(l) + enddo + enddo + enddo + enddo + ! multiply by two to adapt to the N(N-1) normalization condition of the active two-rdm + f_mf_ab *= 2.d0 + two_bod_dens *= 2.d0 + +end + +subroutine get_grad_f_mf_ab(r,grad_f_mf_ab, grad_two_bod_dens,f_mf_ab,two_bod_dens, dm_a, dm_b,grad_dm_a, grad_dm_b) + implicit none + double precision, intent(in) :: r(3) + double precision, intent(out) :: f_mf_ab, two_bod_dens + double precision, intent(out) :: grad_two_bod_dens(3), grad_f_mf_ab(3) + double precision, intent(out) :: dm_a, dm_b, grad_dm_a(3), grad_dm_b(3) + + double precision, allocatable :: mos_array_r(:), mos_grad_array_r(:,:) + double precision, allocatable :: mos_array_a(:), mos_array_b(:) + double precision, allocatable :: mos_grad_array_a(:,:), mos_grad_array_b(:,:) + double precision :: mo_i, mo_j, mo_k, mo_l + double precision :: grad_mo_i(3), grad_mo_j(3), grad_mo_k(3), grad_mo_l(3) + + integer :: i,j,k,l + allocate(mos_array_r(mo_num),mos_grad_array_r(3,mo_num)) + allocate(mos_array_a(elec_alpha_num), mos_array_b(elec_beta_num)) + allocate(mos_grad_array_a(3,elec_alpha_num), mos_grad_array_b(3,elec_beta_num)) + call give_all_mos_and_grad_at_r(r,mos_array_r,mos_grad_array_r) + do i = 1, elec_alpha_num + mos_array_a(i) = mos_array_r(i) + mos_grad_array_a(1:3,i) = mos_grad_array_r(1:3,i) + enddo + do i = 1, elec_beta_num + mos_array_b(i) = mos_array_r(i) + mos_grad_array_b(1:3,i) = mos_grad_array_r(1:3,i) + enddo + + ! ALPHA DENSITY AND GRADIENT + dm_a = 0.d0 + grad_dm_a = 0.d0 + do i = 1, elec_alpha_num + dm_a += mos_array_a(i) * mos_array_a(i) + grad_dm_a(1:3) += 2.d0 * mos_array_a(i) * mos_grad_array_a(1:3,i) + enddo + + ! BETA DENSITY AND GRADIENT + dm_b = 0.d0 + grad_dm_b = 0.d0 + do i = 1, elec_beta_num + dm_b += mos_array_b(i) * mos_array_b(i) + grad_dm_b(1:3) += 2.d0 * mos_array_b(i) * mos_grad_array_b(1:3,i) + enddo + ! TWO-BODY DENSITY AND GRADIENT + two_bod_dens = dm_a * dm_b + grad_two_bod_dens(1:3) = dm_a * grad_dm_b(1:3) + dm_b * grad_dm_a(1:3) + + ! F_MF and GRADIENT + grad_f_mf_ab = 0.d0 + f_mf_ab = 0.d0 + do i = 1, elec_alpha_num + mo_i = mos_array_a(i) + grad_mo_i(1:3) = mos_grad_array_a(1:3,i) + do j = 1, elec_beta_num + mo_j = mos_array_b(j) + grad_mo_j(1:3) = mos_grad_array_b(1:3,j) + do k = 1, elec_alpha_num + mo_k = mos_array_a(k) + grad_mo_k(1:3) = mos_grad_array_a(1:3,k) + do l = 1, elec_beta_num + mo_l = mos_array_b(l) + grad_mo_l(1:3) = mos_grad_array_b(1:3,l) + f_mf_ab += two_e_int_mf(l,k,j,i) * mo_i * mo_j * mo_k * mo_l + grad_f_mf_ab(1:3) += two_e_int_mf(l,k,j,i) * & + (mo_i * mo_j * mo_k * grad_mo_l(1:3) + mo_i * mo_j * grad_mo_k(1:3) * mo_l & + +mo_i * grad_mo_j(1:3) * mo_k * mo_l + grad_mo_i(1:3) * mo_j * mo_k * mo_l) + enddo + enddo + enddo + enddo + + f_mf_ab *= 2.d0 + two_bod_dens *= 2.d0 + grad_f_mf_ab *= 2.D0 + grad_two_bod_dens *= 2.d0 +end diff --git a/src/mu_of_r/test_proj_op.irp.f b/src/mu_of_r/test_proj_op.irp.f index fd5e976b..bd2f3b4f 100644 --- a/src/mu_of_r/test_proj_op.irp.f +++ b/src/mu_of_r/test_proj_op.irp.f @@ -17,7 +17,9 @@ program projected_operators ! call test_f_ii_valence_ab ! call test_f_ia_valence_ab ! call test_f_ii_ia_aa_valence_ab - call test +! call test +! call test_f_mean_field + call test_grad_f_mean_field end @@ -35,3 +37,95 @@ subroutine test print*,'accu = ',accu end + +subroutine test_f_mean_field + implicit none + integer :: i_point + double precision :: weight,r(3) + double precision :: ref_f, new_f, accu_f + double precision :: ref_two_dens, new_two_dens, accu_two_dens, dm_a, dm_b + accu_f = 0.d0 + accu_two_dens = 0.d0 + do i_point = 1, n_points_final_grid + r(1:3) = final_grid_points(1:3,i_point) + weight = final_weight_at_r_vector(i_point) + call get_f_mf_ab(r,new_f,new_two_dens, dm_a, dm_b) + call f_HF_valence_ab(r,r,ref_f,ref_two_dens) + accu_f += weight * dabs(new_f- ref_f) + accu_two_dens += weight * dabs(new_two_dens - ref_two_dens) + enddo + print*,'accu_f = ',accu_f + print*,'accu_two_dens = ',accu_two_dens + +end + +subroutine test_grad_f_mean_field + implicit none + integer :: i_point,k + double precision :: weight,r(3) + double precision :: grad_f_mf_ab(3), grad_two_bod_dens(3) + double precision :: grad_dm_a(3), grad_dm_b(3) + double precision :: f_mf_ab,two_bod_dens, dm_a, dm_b + + double precision :: num_grad_f_mf_ab(3), num_grad_two_bod_dens(3) + double precision :: num_grad_dm_a(3), num_grad_dm_b(3) + double precision :: f_mf_ab_p,f_mf_ab_m + double precision :: two_bod_dens_p, two_bod_dens_m + double precision :: dm_a_p, dm_a_m + double precision :: dm_b_p, dm_b_m + double precision :: rbis(3), dr + double precision :: accu_grad_f_mf_ab(3),accu_grad_two_bod_dens(3) + double precision :: accu_grad_dm_a(3),accu_grad_dm_b(3) + double precision :: accu_f_mf_ab, accu_two_bod_dens, accu_dm_a, accu_dm_b + dr = 0.00001d0 + accu_f_mf_ab = 0.d0 + accu_two_bod_dens = 0.d0 + accu_dm_a = 0.d0 + accu_dm_b = 0.d0 + + accu_grad_f_mf_ab = 0.d0 + accu_grad_two_bod_dens = 0.d0 + accu_grad_dm_a = 0.d0 + accu_grad_dm_b = 0.d0 + do i_point = 1, n_points_final_grid + r(1:3) = final_grid_points(1:3,i_point) + weight = final_weight_at_r_vector(i_point) + call get_grad_f_mf_ab(r,grad_f_mf_ab, grad_two_bod_dens,f_mf_ab,two_bod_dens, dm_a, dm_b,grad_dm_a, grad_dm_b) + call get_f_mf_ab(r,f_mf_ab_p,two_bod_dens_p, dm_a_p, dm_b_p) + accu_f_mf_ab += weight * dabs(f_mf_ab - f_mf_ab_p) + accu_two_bod_dens += weight * dabs(two_bod_dens - two_bod_dens_p) + accu_dm_a += weight*dabs(dm_a - dm_a_p) + accu_dm_b += weight*dabs(dm_b - dm_b_p) + do k = 1, 3 + rbis = r + rbis(k) += dr + call get_f_mf_ab(rbis,f_mf_ab_p,two_bod_dens_p, dm_a_p, dm_b_p) + rbis = r + rbis(k) -= dr + call get_f_mf_ab(rbis,f_mf_ab_m,two_bod_dens_m, dm_a_m, dm_b_m) + num_grad_f_mf_ab(k) = (f_mf_ab_p - f_mf_ab_m)/(2.d0*dr) + num_grad_two_bod_dens(k) = (two_bod_dens_p - two_bod_dens_m)/(2.d0*dr) + num_grad_dm_a(k) = (dm_a_p - dm_a_m)/(2.d0*dr) + num_grad_dm_b(k) = (dm_b_p - dm_b_m)/(2.d0*dr) + enddo + do k = 1, 3 + accu_grad_f_mf_ab(k) += weight * dabs(grad_f_mf_ab(k) - num_grad_f_mf_ab(k)) + accu_grad_two_bod_dens(k) += weight * dabs(grad_two_bod_dens(k) - num_grad_two_bod_dens(k)) + accu_grad_dm_a(k) += weight * dabs(grad_dm_a(k) - num_grad_dm_a(k)) + accu_grad_dm_b(k) += weight * dabs(grad_dm_b(k) - num_grad_dm_b(k)) + enddo + enddo + print*,'accu_f_mf_ab = ',accu_f_mf_ab + print*,'accu_two_bod_dens = ',accu_two_bod_dens + print*,'accu_dm_a = ',accu_dm_a + print*,'accu_dm_b = ',accu_dm_b + print*,'accu_grad_f_mf_ab = ' + print*,accu_grad_f_mf_ab + print*,'accu_grad_two_bod_dens = ' + print*,accu_grad_two_bod_dens + print*,'accu_dm_a = ' + print*,accu_grad_dm_a + print*,'accu_dm_b = ' + print*,accu_grad_dm_b + +end From cb8bef2ecda20f5e8b6076ece8cd922aa063db65 Mon Sep 17 00:00:00 2001 From: eginer Date: Wed, 24 Jul 2024 12:43:20 +0200 Subject: [PATCH 37/38] added gradients of mu_mf --- src/mu_of_r/mu_of_r_mean_field.irp.f | 39 +++++++++++++++++++++++ src/mu_of_r/test_proj_op.irp.f | 47 +++++++++++++++++++++++++++- 2 files changed, 85 insertions(+), 1 deletion(-) diff --git a/src/mu_of_r/mu_of_r_mean_field.irp.f b/src/mu_of_r/mu_of_r_mean_field.irp.f index 9b9c2e20..6abc7e4f 100644 --- a/src/mu_of_r/mu_of_r_mean_field.irp.f +++ b/src/mu_of_r/mu_of_r_mean_field.irp.f @@ -130,3 +130,42 @@ subroutine get_grad_f_mf_ab(r,grad_f_mf_ab, grad_two_bod_dens,f_mf_ab,two_bod_de grad_f_mf_ab *= 2.D0 grad_two_bod_dens *= 2.d0 end + +subroutine mu_of_r_mean_field(r,mu_mf, dm) + implicit none + include 'constants.include.F' + double precision, intent(in) :: r(3) + double precision, intent(out):: mu_mf, dm + double precision :: f_mf_ab,two_bod_dens, dm_a, dm_b + call get_f_mf_ab(r,f_mf_ab,two_bod_dens, dm_a, dm_b) + dm = dm_a + dm_b + if(dabs(two_bod_dens).lt.1.d-10)then + mu_mf = 1.d+10 + else + mu_mf = 0.5d0 * sqpi * f_mf_ab/two_bod_dens + endif +end + +subroutine grad_mu_of_r_mean_field(r,mu_mf, dm, grad_mu_mf, grad_dm) + implicit none + include 'constants.include.F' + double precision, intent(in) :: r(3) + double precision, intent(out):: grad_mu_mf(3), grad_dm(3) + double precision, intent(out):: mu_mf, dm + double precision :: grad_f_mf_ab(3), grad_two_bod_dens(3),grad_dm_a(3), grad_dm_b(3) + double precision :: f_mf_ab,two_bod_dens, dm_a, dm_b + call get_grad_f_mf_ab(r,grad_f_mf_ab, grad_two_bod_dens,f_mf_ab,two_bod_dens, dm_a, dm_b,grad_dm_a, grad_dm_b) + + dm = dm_a + dm_b + grad_dm(1:3) = grad_dm_a(1:3) + grad_dm_b(1:3) + + if(dabs(two_bod_dens).lt.1.d-10)then + mu_mf = 1.d+10 + grad_mu_mf = 0.d0 + else + mu_mf = 0.5d0 * sqpi * f_mf_ab/two_bod_dens + grad_mu_mf(1:3) = 0.5d0 * sqpi * (grad_f_mf_ab(1:3) * two_bod_dens - f_mf_ab * grad_two_bod_dens(1:3))& + /(two_bod_dens*two_bod_dens) + endif + +end diff --git a/src/mu_of_r/test_proj_op.irp.f b/src/mu_of_r/test_proj_op.irp.f index bd2f3b4f..cf53c772 100644 --- a/src/mu_of_r/test_proj_op.irp.f +++ b/src/mu_of_r/test_proj_op.irp.f @@ -19,7 +19,8 @@ program projected_operators ! call test_f_ii_ia_aa_valence_ab ! call test ! call test_f_mean_field - call test_grad_f_mean_field +! call test_grad_f_mean_field + call test_grad_mu_mf end @@ -129,3 +130,47 @@ subroutine test_grad_f_mean_field print*,accu_grad_dm_b end + +subroutine test_grad_mu_mf + implicit none + integer :: i_point,k + double precision :: weight,r(3),rbis(3) + double precision :: mu_mf, dm,grad_mu_mf(3), grad_dm(3) + double precision :: mu_mf_p, mu_mf_m, dm_m, dm_p, num_grad_mu_mf(3),dr, num_grad_dm(3) + double precision :: accu_mu, accu_dm, accu_grad_dm(3), accu_grad_mu_mf(3) + dr = 0.00001d0 + accu_grad_mu_mf = 0.d0 + accu_mu = 0.d0 + accu_grad_dm = 0.d0 + accu_dm = 0.d0 + do i_point = 1, n_points_final_grid + r(1:3) = final_grid_points(1:3,i_point) + weight = final_weight_at_r_vector(i_point) + call grad_mu_of_r_mean_field(r,mu_mf, dm, grad_mu_mf, grad_dm) + call mu_of_r_mean_field(r,mu_mf_p, dm_p) + accu_mu += weight*dabs(mu_mf_p - mu_mf) + accu_dm += weight*dabs(dm_p - dm) + do k = 1, 3 + rbis = r + rbis(k) += dr + call mu_of_r_mean_field(rbis,mu_mf_p, dm_p) + rbis = r + rbis(k) -= dr + call mu_of_r_mean_field(rbis,mu_mf_m, dm_m) + + num_grad_mu_mf(k) = (mu_mf_p - mu_mf_m)/(2.d0*dr) + num_grad_dm(k) = (dm_p - dm_m)/(2.d0*dr) + enddo + do k = 1, 3 + accu_grad_dm(k)+= weight *dabs(num_grad_dm(k) - grad_dm(k)) + accu_grad_mu_mf(k)+= weight *dabs(num_grad_mu_mf(k) - grad_mu_mf(k)) + enddo + enddo + print*,'accu_mu = ',accu_mu + print*,'accu_dm = ',accu_dm + print*,'accu_grad_dm = ' + print*, accu_grad_dm + print*,'accu_grad_mu_mf = ' + print*, accu_grad_mu_mf + +end From edf3a27534e531a1866eac30202bf26ca305123a Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Mon, 29 Jul 2024 16:15:48 +0200 Subject: [PATCH 38/38] rank_max --- src/ao_two_e_ints/cholesky.irp.f | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f index bfa6bd0a..ccaa7239 100644 --- a/src/ao_two_e_ints/cholesky.irp.f +++ b/src/ao_two_e_ints/cholesky.irp.f @@ -174,8 +174,11 @@ END_PROVIDER np = int(np8,4) if (np <= 0) stop 'np<=0' -! rank_max = min(np,20*elec_num*elec_num) rank_max = np + ! Avoid too large arrays when there are many electrons + if (elec_num > 10) then + rank_max = min(np,20*elec_num*elec_num) + endif call mmap(trim(ezfio_work_dir)//'cholesky_ao_tmp', (/ ndim8, rank_max /), 8, fd(1), .False., .True., c_pointer(1)) call c_f_pointer(c_pointer(1), L, (/ ndim8, rank_max /))