From 6e2f28b97fbf3c961c9461689a8db9adf81bd6f8 Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Sat, 11 May 2024 10:27:03 +0200 Subject: [PATCH 01/38] COLLAPSE(4) -> COLLAPSE(3) --- plugins/local/non_h_ints_mu/deb_aos.irp.f | 6 +++--- plugins/local/non_h_ints_mu/total_tc_int.irp.f | 9 ++++----- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/plugins/local/non_h_ints_mu/deb_aos.irp.f b/plugins/local/non_h_ints_mu/deb_aos.irp.f index 4012f47c..70604f54 100644 --- a/plugins/local/non_h_ints_mu/deb_aos.irp.f +++ b/plugins/local/non_h_ints_mu/deb_aos.irp.f @@ -31,6 +31,9 @@ subroutine print_aos() integer :: i, ipoint double precision :: r(3) double precision :: ao_val, ao_der(3), ao_lap + double precision :: accu_vgl(5) + double precision :: accu_vgl_nrm(5) + double precision :: mo_val, mo_der(3), mo_lap PROVIDE final_grid_points aos_in_r_array aos_grad_in_r_array aos_lapl_in_r_array @@ -40,9 +43,6 @@ subroutine print_aos() write(1000, '(3(f15.7, 3X))') r enddo -double precision :: accu_vgl(5) -double precision :: accu_vgl_nrm(5) - do ipoint = 1, n_points_final_grid do i = 1, ao_num ao_val = aos_in_r_array (i,ipoint) diff --git a/plugins/local/non_h_ints_mu/total_tc_int.irp.f b/plugins/local/non_h_ints_mu/total_tc_int.irp.f index a1bbd6e0..656f5f16 100644 --- a/plugins/local/non_h_ints_mu/total_tc_int.irp.f +++ b/plugins/local/non_h_ints_mu/total_tc_int.irp.f @@ -78,7 +78,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n !$OMP PRIVATE (i, j, k, l, ipoint, ao_i_r, ao_k_r, weight1) & !$OMP SHARED (ao_num, n_points_final_grid, ao_two_e_tc_tot, & !$OMP aos_in_r_array_transp, final_weight_at_r_vector, int2_grad1_u12_square_ao) - !$OMP DO COLLAPSE(4) + !$OMP DO COLLAPSE(3) do i = 1, ao_num do k = 1, ao_num do l = 1, ao_num @@ -188,7 +188,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n !$OMP SHARED (ao_num, n_points_final_grid, ao_two_e_tc_tot, & !$OMP aos_in_r_array_transp, final_weight_at_r_vector, & !$OMP int2_grad1_u12_ao, aos_grad_in_r_array_transp_bis) - !$OMP DO COLLAPSE(4) + !$OMP DO COLLAPSE(3) do i = 1, ao_num do k = 1, ao_num do l = 1, ao_num @@ -270,7 +270,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n !$OMP PARALLEL DEFAULT(NONE) & !$OMP PRIVATE(i, j, k, l, integ_zero, integ_val) & !$OMP SHARED(ao_num, ao_two_e_tc_tot) - !$OMP DO COLLAPSE(4) + !$OMP DO COLLAPSE(3) do j = 1, ao_num do l = 1, ao_num do i = 1, ao_num @@ -293,7 +293,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n !$OMP PARALLEL DEFAULT(NONE) & !$OMP SHARED(ao_num, ao_two_e_tc_tot, ao_integrals_map) & !$OMP PRIVATE(i, j, k, l) - !$OMP DO COLLAPSE(4) + !$OMP DO COLLAPSE(3) do j = 1, ao_num do l = 1, ao_num do i = 1, ao_num @@ -306,7 +306,6 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n enddo !$OMP END DO !$OMP END PARALLEL - !call clear_ao_map() FREE ao_integrals_map endif From 29da3b6542cdfeb52f9d1b7f8c23f3967018bf0f Mon Sep 17 00:00:00 2001 From: AbdAmmar Date: Thu, 23 May 2024 00:45:56 +0200 Subject: [PATCH 02/38] bypass one_e_tr_dm_mo for large mo_num & n_states --- .../multi_s_dipole_moment.irp.f | 154 ++++++++++++++++-- 1 file changed, 140 insertions(+), 14 deletions(-) diff --git a/src/mol_properties/multi_s_dipole_moment.irp.f b/src/mol_properties/multi_s_dipole_moment.irp.f index c7216a61..8aae3bf4 100644 --- a/src/mol_properties/multi_s_dipole_moment.irp.f +++ b/src/mol_properties/multi_s_dipole_moment.irp.f @@ -18,7 
+18,7 @@ -BEGIN_PROVIDER [double precision, multi_s_dipole_moment, (N_states, N_states)] + BEGIN_PROVIDER [double precision, multi_s_dipole_moment , (N_states, N_states)] &BEGIN_PROVIDER [double precision, multi_s_x_dipole_moment, (N_states, N_states)] &BEGIN_PROVIDER [double precision, multi_s_y_dipole_moment, (N_states, N_states)] &BEGIN_PROVIDER [double precision, multi_s_z_dipole_moment, (N_states, N_states)] @@ -40,27 +40,153 @@ BEGIN_PROVIDER [double precision, multi_s_dipole_moment, (N_states, N_states)] ! gamma^{nm}: density matrix \bra{\Psi^n} a^{\dagger}_a a_i \ket{\Psi^m} END_DOC - integer :: istate,jstate ! States - integer :: i,j ! general spatial MOs + integer :: istate, jstate ! States + integer :: i, j ! general spatial MOs double precision :: nuclei_part_x, nuclei_part_y, nuclei_part_z multi_s_x_dipole_moment = 0.d0 multi_s_y_dipole_moment = 0.d0 multi_s_z_dipole_moment = 0.d0 + + if(8.d0*mo_num*mo_num*n_states*n_states*1d-9 .lt. 200.d0) then - do jstate = 1, N_states - do istate = 1, N_states - - do i = 1, mo_num - do j = 1, mo_num - multi_s_x_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_x(j,i) - multi_s_y_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_y(j,i) - multi_s_z_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_z(j,i) - enddo + do jstate = 1, N_states + do istate = 1, N_states + do i = 1, mo_num + do j = 1, mo_num + multi_s_x_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_x(j,i) + multi_s_y_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_y(j,i) + multi_s_z_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_z(j,i) + enddo + enddo enddo - enddo - enddo + + else + + ! not enough memory: + ! on-the-fly scheme + + PROVIDE psi_det_alpha_unique psi_det_beta_unique + + integer :: l, k_a, k_b + integer :: occ(N_int*bit_kind_size,2) + integer :: h1, h2, p1, p2, degree + integer :: exc(0:2,2), n_occ(2) + integer :: krow, kcol, lrow, lcol + integer(bit_kind) :: tmp_det(N_int,2), tmp_det2(N_int) + double precision :: ck, ckl, phase + + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP PRIVATE(j, l, k_a, k_b, istate, jstate, occ, ck, ckl, h1, h2, p1, p2, exc, & + !$OMP phase, degree, n_occ, krow, kcol, lrow, lcol, tmp_det, tmp_det2) & + !$OMP SHARED(N_int, N_states, elec_alpha_num, elec_beta_num, N_det, & + !$OMP psi_bilinear_matrix_rows, psi_bilinear_matrix_columns, & + !$OMP psi_bilinear_matrix_transp_rows, psi_bilinear_matrix_transp_columns, & + !$OMP psi_det_alpha_unique, psi_det_beta_unique, & + !$OMP psi_bilinear_matrix_values, psi_bilinear_matrix_transp_values, & + !$OMP mo_dipole_x, mo_dipole_y, mo_dipole_z, & + !$OMP multi_s_x_dipole_moment, multi_s_y_dipole_moment, multi_s_z_dipole_moment) + !$OMP DO COLLAPSE(2) + do istate = 1, N_states + do jstate = 1, N_states + + do k_a = 1, N_det + krow = psi_bilinear_matrix_rows (k_a) + kcol = psi_bilinear_matrix_columns(k_a) + + tmp_det(1:N_int,1) = psi_det_alpha_unique(1:N_int,krow) + tmp_det(1:N_int,2) = psi_det_beta_unique (1:N_int,kcol) + + ! 
Diagonal part + call bitstring_to_list_ab(tmp_det, occ, n_occ, N_int) + ck = psi_bilinear_matrix_values(k_a,istate)*psi_bilinear_matrix_values(k_a,jstate) + do l = 1, elec_alpha_num + j = occ(l,1) + multi_s_x_dipole_moment(istate,jstate) -= ck * mo_dipole_x(j,j) + multi_s_y_dipole_moment(istate,jstate) -= ck * mo_dipole_y(j,j) + multi_s_z_dipole_moment(istate,jstate) -= ck * mo_dipole_z(j,j) + enddo + + if (k_a == N_det) cycle + l = k_a + 1 + lrow = psi_bilinear_matrix_rows (l) + lcol = psi_bilinear_matrix_columns(l) + ! Fix beta determinant, loop over alphas + do while (lcol == kcol) + tmp_det2(:) = psi_det_alpha_unique(:,lrow) + call get_excitation_degree_spin(tmp_det(1,1), tmp_det2, degree, N_int) + if (degree == 1) then + exc = 0 + call get_single_excitation_spin(tmp_det(1,1), tmp_det2, exc, phase, N_int) + call decode_exc_spin(exc, h1, p1, h2, p2) + ckl = psi_bilinear_matrix_values(k_a,istate)*psi_bilinear_matrix_values(l,jstate) * phase + multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(h1,p1) + multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(h1,p1) + multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(h1,p1) + ckl = psi_bilinear_matrix_values(k_a,jstate)*psi_bilinear_matrix_values(l,istate) * phase + multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(p1,h1) + multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(p1,h1) + multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(p1,h1) + endif + l = l+1 + if (l > N_det) exit + lrow = psi_bilinear_matrix_rows (l) + lcol = psi_bilinear_matrix_columns(l) + enddo + enddo ! k_a + + do k_b = 1, N_det + krow = psi_bilinear_matrix_transp_rows (k_b) + kcol = psi_bilinear_matrix_transp_columns(k_b) + + tmp_det(1:N_int,1) = psi_det_alpha_unique(1:N_int,krow) + tmp_det(1:N_int,2) = psi_det_beta_unique (1:N_int,kcol) + + ! Diagonal part + call bitstring_to_list_ab(tmp_det, occ, n_occ, N_int) + ck = psi_bilinear_matrix_transp_values(k_b,istate)*psi_bilinear_matrix_transp_values(k_b,jstate) + do l = 1, elec_beta_num + j = occ(l,2) + multi_s_x_dipole_moment(istate,jstate) -= ck * mo_dipole_x(j,j) + multi_s_y_dipole_moment(istate,jstate) -= ck * mo_dipole_y(j,j) + multi_s_z_dipole_moment(istate,jstate) -= ck * mo_dipole_z(j,j) + enddo + + if (k_b == N_det) cycle + l = k_b+1 + lrow = psi_bilinear_matrix_transp_rows (l) + lcol = psi_bilinear_matrix_transp_columns(l) + ! Fix alpha determinant, loop over betas + do while (lrow == krow) + tmp_det2(:) = psi_det_beta_unique(:,lcol) + call get_excitation_degree_spin(tmp_det(1,2), tmp_det2, degree, N_int) + if (degree == 1) then + exc = 0 + call get_single_excitation_spin(tmp_det(1,2), tmp_det2, exc, phase, N_int) + call decode_exc_spin(exc, h1, p1, h2, p2) + ckl = psi_bilinear_matrix_transp_values(k_b,istate)*psi_bilinear_matrix_transp_values(l,jstate) * phase + multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(h1,p1) + multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(h1,p1) + multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(h1,p1) + ckl = psi_bilinear_matrix_transp_values(k_b,jstate)*psi_bilinear_matrix_transp_values(l,istate) * phase + multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(p1,h1) + multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(p1,h1) + multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(p1,h1) + endif + l = l+1 + if (l > N_det) exit + lrow = psi_bilinear_matrix_transp_rows (l) + lcol = psi_bilinear_matrix_transp_columns(l) + enddo + enddo ! k_b + + enddo ! istate + enddo ! 
jstate + !$OMP END DO + !$OMP END PARALLEL + + endif ! memory condition ! Nuclei part nuclei_part_x = 0.d0 From 38aa8ef547c2b4d68b6caf2d7cca40a3590f69ed Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Thu, 6 Jun 2024 16:36:45 +0200 Subject: [PATCH 03/38] Update ccsd_space_orb_sub.irp.f --- src/ccsd/ccsd_space_orb_sub.irp.f | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index b48ca7da..555a2552 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -18,6 +18,8 @@ subroutine run_ccsd_space_orb integer(bit_kind) :: det(N_int,2) integer :: nO, nV, nOa, nVa + call set_multiple_levels_omp(.False.) + if (do_ao_cholesky) then PROVIDE cholesky_mo_transp FREE cholesky_ao From e876f635d636b1cb878cdb9baf9e7b3906bf955f Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Fri, 14 Jun 2024 16:26:23 +0200 Subject: [PATCH 04/38] Async Fortran I/O --- src/ao_two_e_ints/cholesky.irp.f | 161 ++++++++++++++++++++++--------- src/utils/fortran_mmap.c | 7 +- 2 files changed, 118 insertions(+), 50 deletions(-) diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f index 05f7115d..d731ef04 100644 --- a/src/ao_two_e_ints/cholesky.irp.f +++ b/src/ao_two_e_ints/cholesky.irp.f @@ -31,12 +31,12 @@ END_PROVIDER integer*8 :: ndim8 integer :: rank double precision :: tau, tau2 - double precision, pointer :: L(:,:), Delta(:,:) + double precision, pointer :: L(:,:) double precision :: s double precision :: dscale, dscale_tmp - double precision, allocatable :: D(:), Ltmp_p(:,:), Ltmp_q(:,:), D_sorted(:), Delta_col(:) + double precision, allocatable :: D(:), Ltmp_p(:,:), Ltmp_q(:,:), D_sorted(:), Delta_col(:), Delta(:,:) integer, allocatable :: addr1(:), addr2(:) integer*8, allocatable :: Lset(:), Dset(:), addr3(:) logical, allocatable :: computed(:) @@ -66,7 +66,7 @@ END_PROVIDER integer :: fd(2) logical :: delta_on_disk integer :: dgemm_block_size, nqq - double precision, allocatable :: dgemm_buffer1(:,:), dgemm_buffer2(:,:) + double precision, allocatable :: dgemm_buffer1(:,:), dgemm_buffer2(:,:), dgemm_buffer3(:,:) PROVIDE nproc PROVIDE nucl_coord ao_two_e_integral_schwartz @@ -230,7 +230,7 @@ END_PROVIDER stop -1 endif - if (s > 0.1d0) then + if (s > 0.3d0) then exit endif @@ -245,13 +245,16 @@ END_PROVIDER + np*memory_of_double(nq) &! Delta(np,nq) + (np+nq)*memory_of_double(block_size) ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size) - if (mem > qp_max_mem) then - call mmap(trim(ezfio_work_dir)//'cholesky_delta', (/ np*1_8, nq*1_8 /), 8, fd(2), .False., .True., c_pointer(2)) - call c_f_pointer(c_pointer(2), Delta, (/ np, nq /)) - ! Deleting the file while it is open makes the file invisible on the filesystem, - ! and automatically deleted, even if the program crashes + if (1.1*mem > qp_max_mem) then +! call mmap(trim(ezfio_work_dir)//'cholesky_delta', (/ np*1_8, nq*1_8 /), 8, fd(2), .False., .True., c_pointer(2)) +! call c_f_pointer(c_pointer(2), Delta, (/ np, nq /)) +! ! Deleting the file while it is open makes the file invisible on the filesystem, +! ! and automatically deleted, even if the program crashes iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_delta', 'R') close(iunit,status='delete') + open(unit=iunit, access='DIRECT', form='UNFORMATTED', RECL=(np+1)*8, & + ASYNCHRONOUS='YES', file=trim(ezfio_work_dir)//'cholesky_delta') delta_on_disk = .True. 
else allocate(Delta(np,nq)) @@ -303,15 +306,18 @@ END_PROVIDER !$OMP END PARALLEL PROVIDE nproc - if (N>0) then - if (delta_on_disk) then + if (delta_on_disk) then + + dgemm_block_size = nproc*4 + + allocate (dgemm_buffer1(np,dgemm_block_size)) + allocate (dgemm_buffer3(np,dgemm_block_size)) + allocate (dgemm_buffer2(dgemm_block_size,N)) + + if (N>0) then ! Blocking improves I/O performance - dgemm_block_size = nproc*4 - - allocate (dgemm_buffer1(np,dgemm_block_size)) - allocate (dgemm_buffer2(dgemm_block_size,N)) do jj=1,nq,dgemm_block_size @@ -325,34 +331,55 @@ END_PROVIDER enddo !$OMP END PARALLEL DO - call dgemm('N', 'T', np, nqq, N, 1.d0, & +print *, np, nq, jj, nqq, N + call dgemm('N', 'T', np, nqq, N, -1.d0, & Ltmp_p, np, dgemm_buffer2, dgemm_block_size, 0.d0, dgemm_buffer1, np) - !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q) + wait(iunit) + dgemm_buffer3(:,:) = dgemm_buffer1(:,:) +! !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q) do q=jj,jj+nqq-1 - Delta(:,q) = - dgemm_buffer1(:, q-jj+1) + write(iunit, ASYNCHRONOUS='YES', rec=q) dgemm_buffer3(1:np, q-jj+1) enddo - !$OMP END PARALLEL DO +! !$OMP END PARALLEL DO enddo - - deallocate(dgemm_buffer1, dgemm_buffer2) +print *, 'ok1' else - call dgemm('N', 'T', np, nq, N, -1.d0, & - Ltmp_p(1,1), np, Ltmp_q(1,1), nq, 0.d0, Delta, np) + + dgemm_buffer1(1:np,1) = 0.d0 + +! !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q) + do q=1,nq + write(iunit, ASYNCHRONOUS='YES', rec=q) dgemm_buffer1(1:np, 1) + enddo +! !$OMP END PARALLEL DO endif + deallocate(dgemm_buffer1, dgemm_buffer2) + if (delta_on_disk) wait(iunit) + deallocate(dgemm_buffer3) + else - !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q,j) - do q=1,nq - Delta(:,q) = 0.d0 - enddo - !$OMP END PARALLEL DO + if (N>0) then + + call dgemm('N', 'T', np, nq, N, -1.d0, & + Ltmp_p(1,1), np, Ltmp_q(1,1), nq, 0.d0, Delta, np) + + else + + !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q,j) + do q=1,nq + Delta(:,q) = 0.d0 + enddo + !$OMP END PARALLEL DO + + endif endif @@ -383,25 +410,40 @@ END_PROVIDER do jj=1,nq,dgemm_block_size nqq = min(nq, jj+dgemm_block_size-1) - jj + 1 - !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q,ii) + !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(q,ii) + !$OMP DO do ii=1,block_size do q=jj,jj+nqq-1 dgemm_buffer2(q-jj+1,ii) = Ltmp_q(q,ii) enddo enddo - !$OMP END PARALLEL DO + !$OMP END DO + !$OMP END PARALLEL - call dgemm('N', 'T', np, nqq, block_size, 1.d0, & - Ltmp_p(1,1), np, dgemm_buffer2, dgemm_block_size, 0.d0, dgemm_buffer1, np) - - !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q) +! !$OMP DO do q=jj,jj+nqq-1 - Delta(:,q) = Delta(:,q) - dgemm_buffer1(:, q-jj+1) + read(iunit, rec=q) dgemm_buffer1(1:np, q-jj+1) enddo - !$OMP END PARALLEL DO +! !$OMP END DO + +print *, np, nq, jj, nqq, block_size + call dgemm('N', 'T', np, nqq, block_size, -1.d0, & + Ltmp_p(1,1), np, dgemm_buffer2, dgemm_block_size, 1.d0, dgemm_buffer1, np) + + wait(iunit) + dgemm_buffer3 = dgemm_buffer1 + +! !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q) + do q=jj,jj+nqq-1 + write(iunit, ASYNCHRONOUS='YES', rec=q) dgemm_buffer3(:, q-jj+1) + enddo +! 
!$OMP END PARALLEL DO enddo +print *, 'ok' deallocate(dgemm_buffer1, dgemm_buffer2) + wait(iunit) + deallocate(dgemm_buffer3) else @@ -427,11 +469,20 @@ END_PROVIDER enddo iblock = iblock+1 - !$OMP PARALLEL DO PRIVATE(p) - do p=1,np - Ltmp_p(p,iblock) = Delta(p,dj) - enddo - !$OMP END PARALLEL DO + + if (delta_on_disk) then + + read(iunit,rec=dj) Ltmp_p(1:np,iblock) + + else + + !$OMP PARALLEL DO PRIVATE(p) + do p=1,np + Ltmp_p(p,iblock) = Delta(p,dj) + enddo + !$OMP END PARALLEL DO + + endif if (.not.computed(dj)) then m = dj @@ -463,12 +514,26 @@ END_PROVIDER !$OMP END PARALLEL DO endif - !$OMP PARALLEL DO PRIVATE(p) - do p=1,np - Ltmp_p(p,iblock) = Ltmp_p(p,iblock) + Delta_col(p) - Delta(p,dj) = Ltmp_p(p,iblock) - enddo - !$OMP END PARALLEL DO + if (delta_on_disk) then + + !$OMP PARALLEL DO PRIVATE(p) + do p=1,np + Ltmp_p(p,iblock) = Ltmp_p(p,iblock) + Delta_col(p) + enddo + !$OMP END PARALLEL DO + + write(iunit, ASYNCHRONOUS='YES', rec=dj) Ltmp_p(1:np,iblock) + + else + + !$OMP PARALLEL DO PRIVATE(p) + do p=1,np + Ltmp_p(p,iblock) = Ltmp_p(p,iblock) + Delta_col(p) + Delta(p,dj) = Ltmp_p(p,iblock) + enddo + !$OMP END PARALLEL DO + + endif computed(dj) = .True. endif @@ -512,7 +577,7 @@ END_PROVIDER deallocate(Ltmp_q) deallocate(computed) if (delta_on_disk) then - call munmap( (/ np*1_8, nq*1_8 /), 8, fd(2), c_pointer(2) ) + close(iunit, status='delete') else deallocate(Delta) endif diff --git a/src/utils/fortran_mmap.c b/src/utils/fortran_mmap.c index 711a9c34..2dbe42b8 100644 --- a/src/utils/fortran_mmap.c +++ b/src/utils/fortran_mmap.c @@ -40,7 +40,7 @@ void* mmap_fortran(char* filename, size_t bytes, int* file_descr, int read_only, exit(EXIT_FAILURE); } - result = write(fd, "", 1); + result = write(fd, " ", 1); if (result != 1) { close(fd); printf("%s:\n", filename); @@ -49,7 +49,10 @@ void* mmap_fortran(char* filename, size_t bytes, int* file_descr, int read_only, } if (single_node == 1) { - map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_POPULATE | MAP_NONBLOCK, fd, 0); + map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_POPULATE | MAP_NONBLOCK | MAP_NORESERVE, fd, 0); + if (map == MAP_FAILED) { + map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + } } else { map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); } From f671c669f8cae460911c1e016e9e44297e817d79 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Mon, 17 Jun 2024 14:57:48 +0200 Subject: [PATCH 05/38] Use less memory in Cholesky --- src/ao_two_e_ints/cholesky.irp.f | 378 +++++++------------------- src/hartree_fock/fock_matrix_hf.irp.f | 21 +- 2 files changed, 119 insertions(+), 280 deletions(-) diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f index d731ef04..a680e7ee 100644 --- a/src/ao_two_e_ints/cholesky.irp.f +++ b/src/ao_two_e_ints/cholesky.irp.f @@ -31,14 +31,14 @@ END_PROVIDER integer*8 :: ndim8 integer :: rank double precision :: tau, tau2 - double precision, pointer :: L(:,:) + double precision, pointer :: L(:,:), Delta(:,:) double precision :: s double precision :: dscale, dscale_tmp - double precision, allocatable :: D(:), Ltmp_p(:,:), Ltmp_q(:,:), D_sorted(:), Delta_col(:), Delta(:,:) + double precision, allocatable :: D(:), Ltmp_p(:,:), Ltmp_q(:,:), D_sorted(:), Delta_col(:) integer, allocatable :: addr1(:), addr2(:) - integer*8, allocatable :: Lset(:), Dset(:), addr3(:) + integer*8, allocatable :: Lset(:), Dset(:) logical, allocatable :: computed(:) integer :: i,j,k,m,p,q, dj, p2, q2, ii, jj @@ -64,11 +64,8 
@@ END_PROVIDER type(c_ptr) :: c_pointer(2) integer :: fd(2) - logical :: delta_on_disk - integer :: dgemm_block_size, nqq - double precision, allocatable :: dgemm_buffer1(:,:), dgemm_buffer2(:,:), dgemm_buffer3(:,:) - PROVIDE nproc + PROVIDE nproc ao_cholesky_threshold do_direct_integrals qp_max_mem PROVIDE nucl_coord ao_two_e_integral_schwartz call set_multiple_levels_omp(.False.) @@ -88,19 +85,8 @@ END_PROVIDER else - PROVIDE nucl_coord ao_two_e_integral_schwartz call set_multiple_levels_omp(.False.) - call resident_memory(mem0) - - rank_max = min(ndim8,(qp_max_mem*1024_8*1024_8*1024_8/8_8)/ndim8) - call mmap(trim(ezfio_work_dir)//'cholesky_ao_tmp', (/ ndim8, rank_max /), 8, fd(1), .False., .True., c_pointer(1)) - call c_f_pointer(c_pointer(1), L, (/ ndim8, rank_max /)) - ! Deleting the file while it is open makes the file invisible on the filesystem, - ! and automatically deleted, even if the program crashes - iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_ao_tmp', 'R') - close(iunit,status='delete') - if (do_direct_integrals) then if (ao_two_e_integral(1,1,1,1) < huge(1.d0)) then ! Trigger providers inside ao_two_e_integral @@ -113,8 +99,12 @@ END_PROVIDER tau = ao_cholesky_threshold tau2 = tau*tau - mem = 6.d0 * memory_of_double8(ndim8) + 6.d0 * memory_of_int8(ndim8) - call check_mem(mem, irp_here) + rank = 0 + + allocate( D(ndim8), Lset(ndim8), Dset(ndim8), D_sorted(ndim8)) + allocate( addr1(ndim8), addr2(ndim8), Delta_col(ndim8) ) + + call resident_memory(mem0) call print_memory_usage() @@ -127,46 +117,35 @@ END_PROVIDER print *, '============ =============' - rank = 0 - - allocate( D(ndim8), Lset(ndim8), Dset(ndim8), D_sorted(ndim8)) - allocate( addr1(ndim8), addr2(ndim8), addr3(ndim8) ) -!print *, 'allocate : (D(ndim8))', memory_of_int8(ndim8) -!print *, 'allocate : (Lset(ndim8))', memory_of_int8(ndim8) -!print *, 'allocate : (Dset(ndim8))', memory_of_int8(ndim8) -!print *, 'allocate : (4,addr(ndim8))', memory_of_int8(4_8*ndim8) - ! 1. - k=0 + i8=0 do j=1,ao_num do i=1,ao_num - k = k+1 - addr1(k) = i - addr2(k) = j - addr3(k) = (i-1)*ao_num + j + i8 = i8+1 + addr1(i8) = i + addr2(i8) = j enddo enddo if (do_direct_integrals) then - !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i8) SCHEDULE(dynamic,16) + !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i8) SCHEDULE(dynamic,21) do i8=ndim8,1,-1 D(i8) = ao_two_e_integral(addr1(i8), addr2(i8), & addr1(i8), addr2(i8)) enddo !$OMP END PARALLEL DO else - !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i8) SCHEDULE(dynamic,16) + !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i8) SCHEDULE(dynamic,21) do i8=ndim8,1,-1 D(i8) = get_ao_two_e_integral(addr1(i8), addr1(i8), & - addr2(i8), addr2(i8), & - ao_integrals_map) + addr2(i8), addr2(i8), ao_integrals_map) enddo !$OMP END PARALLEL DO endif + D_sorted(:) = -D(:) call dsort_noidx_big(D_sorted,ndim8) - D_sorted(:) = dabs(D_sorted(:)) - + D_sorted(:) = -D_sorted(:) Dmax = D_sorted(1) ! 2. @@ -174,12 +153,24 @@ END_PROVIDER dscale_tmp = dscale*dscale*Dmax np8=0_8 do p8=1,ndim8 - if ( dscale_tmp*D(p8) > tau2 ) then + if ( dscale_tmp*D(p8) >= tau2 ) then np8 = np8+1_8 Lset(np8) = p8 endif enddo np = np8 + if (np <= 0) stop 'np<=0' + if (np > ndim8) stop 'np>ndim8' + + rank_max = min(np,20*elec_num*elec_num) + call mmap(trim(ezfio_work_dir)//'cholesky_ao_tmp', (/ ndim8, rank_max /), 8, fd(1), .False., .True., c_pointer(1)) + call c_f_pointer(c_pointer(1), L, (/ ndim8, rank_max /)) + + ! Deleting the file while it is open makes the file invisible on the filesystem, + ! 
and automatically deleted, even if the program crashes + iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_ao_tmp', 'R') + close(iunit,status='delete') + ! 3. N = 0 @@ -187,85 +178,59 @@ END_PROVIDER ! 4. i = 0 + mem = memory_of_double(np) & ! Delta(np,nq) + + (np+1)*memory_of_double(block_size) ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size) + +! call check_mem(mem) + ! 5. - do while ( (Dmax > tau).and.(rank*1_8 < min(ndim8,rank_max)) ) + do while ( (Dmax > tau).and.(np > 0) ) ! a. i = i+1 - ! Inrease s until the arrays fit in memory - s = 0.01d0 block_size = max(N,24) + + ! Determine nq so that Delta fits in memory + + s = 0.1d0 + Dmin = max(s*Dmax,tau) + do nq=2,np-1 + if (D_sorted(nq) < Dmin) exit + enddo + do while (.True.) - ! b. - Dmin = max(s*Dmax,tau) + mem = mem0 & + + np*memory_of_double(nq) & ! Delta(np,nq) + + (np+nq)*memory_of_double(block_size) & ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size) + + memory_of_int(nq) ! computed(nq) - ! c. - nq=0 - do p=1,np - if ( D(Lset(p)) > Dmin ) then - nq = nq+1 - Dset(nq) = Lset(p) - endif - enddo - - - mem = mem0 & - + np*memory_of_double(nq) - -!print *, 'mem = ', mem - if (mem > qp_max_mem/2) then - s = s*2.d0 + if (mem > qp_max_mem*0.5d0) then + nq = nq/2 else exit endif - if ((s > 1.d0).or.(nq == 0)) then - call print_memory_usage() - print *, 'Required peak memory: ', mem, 'Gb' - call resident_memory(mem) - print *, 'Already used memory: ', mem, 'Gb' - print *, 'Not enough memory. Reduce cholesky threshold' - stop -1 - endif - - if (s > 0.3d0) then - exit - endif - enddo - ! d., e. - mem = mem0 & - + memory_of_int(nq) &! computed(nq) - + np*memory_of_int(nq) &! computed(nq) - + memory_of_double(np) &! Delta_col(np) - + 7*memory_of_double8(ndim8) &! D, Lset, Dset, D_sorted, addr[1-3] - + np*memory_of_double(nq) &! Delta(np,nq) - + (np+nq)*memory_of_double(block_size) ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size) - - if (1.1*mem > qp_max_mem) then -! call mmap(trim(ezfio_work_dir)//'cholesky_delta', (/ np*1_8, nq*1_8 /), 8, fd(2), .False., .True., c_pointer(2)) -! call c_f_pointer(c_pointer(2), Delta, (/ np, nq /)) - -! ! Deleting the file while it is open makes the file invisible on the filesystem, -! ! and automatically deleted, even if the program crashes - iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_delta', 'R') - close(iunit,status='delete') - open(unit=iunit, access='DIRECT', form='UNFORMATTED', RECL=(np+1)*8, & - ASYNCHRONOUS='YES', file=trim(ezfio_work_dir)//'cholesky_delta') - delta_on_disk = .True. - else - allocate(Delta(np,nq)) - delta_on_disk = .False. + if (nq <= 0) then + print *, nq + stop 'bug in cholesky: nq <= 0' endif -!print *, delta_on_disk - allocate(Delta_col(np)) + Dmin = D_sorted(nq) + nq=0 + do p=1,np + if ( D(Lset(p)) >= Dmin ) then + nq = nq+1 + Dset(nq) = Lset(p) + endif + enddo + + allocate(Delta(np,nq)) allocate(Ltmp_p(np,block_size), stat=ierr) -!print *, 'allocate : Ltmp_p(np,block_size)', memory_of_double8(np*block_size*1_8), np, block_size if (ierr /= 0) then call print_memory_usage() @@ -274,7 +239,6 @@ END_PROVIDER endif allocate(Ltmp_q(nq,block_size), stat=ierr) -!print *, 'allocate : Ltmp_q(nq,block_size)', memory_of_double8(nq*block_size*1_8), nq, block_size if (ierr /= 0) then call print_memory_usage() @@ -287,7 +251,6 @@ END_PROVIDER computed(:) = .False. 
-!print *, 'N, rank, block_size', N, rank, block_size !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(k,p,q) do k=1,N !$OMP DO @@ -305,81 +268,18 @@ END_PROVIDER !$OMP BARRIER !$OMP END PARALLEL - PROVIDE nproc - - if (delta_on_disk) then - - dgemm_block_size = nproc*4 - - allocate (dgemm_buffer1(np,dgemm_block_size)) - allocate (dgemm_buffer3(np,dgemm_block_size)) - allocate (dgemm_buffer2(dgemm_block_size,N)) - - if (N>0) then - ! Blocking improves I/O performance - - - do jj=1,nq,dgemm_block_size - - nqq = min(nq, jj+dgemm_block_size-1) - jj + 1 - - !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q,ii) - do ii=1,N - do q=jj,jj+nqq-1 - dgemm_buffer2(q-jj+1,ii) = Ltmp_q(q,ii) - enddo - enddo - !$OMP END PARALLEL DO - -print *, np, nq, jj, nqq, N - call dgemm('N', 'T', np, nqq, N, -1.d0, & - Ltmp_p, np, dgemm_buffer2, dgemm_block_size, 0.d0, dgemm_buffer1, np) - - wait(iunit) - dgemm_buffer3(:,:) = dgemm_buffer1(:,:) -! !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q) - do q=jj,jj+nqq-1 - write(iunit, ASYNCHRONOUS='YES', rec=q) dgemm_buffer3(1:np, q-jj+1) - enddo -! !$OMP END PARALLEL DO - - enddo -print *, 'ok1' - - else - - - dgemm_buffer1(1:np,1) = 0.d0 - -! !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q) - do q=1,nq - write(iunit, ASYNCHRONOUS='YES', rec=q) dgemm_buffer1(1:np, 1) - enddo -! !$OMP END PARALLEL DO - - endif - - deallocate(dgemm_buffer1, dgemm_buffer2) - if (delta_on_disk) wait(iunit) - deallocate(dgemm_buffer3) - - - else - - if (N>0) then + if (N>0) then call dgemm('N', 'T', np, nq, N, -1.d0, & Ltmp_p(1,1), np, Ltmp_q(1,1), nq, 0.d0, Delta, np) - else + else - !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q,j) - do q=1,nq - Delta(:,q) = 0.d0 - enddo - !$OMP END PARALLEL DO - - endif + !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q,j) + do q=1,nq + Delta(:,q) = 0.d0 + enddo + !$OMP END PARALLEL DO endif @@ -395,64 +295,20 @@ print *, 'ok1' do j=1,nq if ( (Qmax <= Dmin).or.(N+j*1_8 > ndim8) ) exit + ! i. rank = N+j + if (rank == rank_max) then + print *, 'cholesky: rank_max reached' + exit + endif if (iblock == block_size) then - if (delta_on_disk) then - ! Blocking improves I/O performance - - dgemm_block_size = nproc*4 - allocate (dgemm_buffer1(np,dgemm_block_size)) - allocate (dgemm_buffer2(dgemm_block_size,block_size)) - - do jj=1,nq,dgemm_block_size - nqq = min(nq, jj+dgemm_block_size-1) - jj + 1 - - !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(q,ii) - !$OMP DO - do ii=1,block_size - do q=jj,jj+nqq-1 - dgemm_buffer2(q-jj+1,ii) = Ltmp_q(q,ii) - enddo - enddo - !$OMP END DO - !$OMP END PARALLEL - -! !$OMP DO - do q=jj,jj+nqq-1 - read(iunit, rec=q) dgemm_buffer1(1:np, q-jj+1) - enddo -! !$OMP END DO - -print *, np, nq, jj, nqq, block_size - call dgemm('N', 'T', np, nqq, block_size, -1.d0, & - Ltmp_p(1,1), np, dgemm_buffer2, dgemm_block_size, 1.d0, dgemm_buffer1, np) - - wait(iunit) - dgemm_buffer3 = dgemm_buffer1 - -! !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q) - do q=jj,jj+nqq-1 - write(iunit, ASYNCHRONOUS='YES', rec=q) dgemm_buffer3(:, q-jj+1) - enddo -! 
!$OMP END PARALLEL DO - - enddo -print *, 'ok' - deallocate(dgemm_buffer1, dgemm_buffer2) - wait(iunit) - deallocate(dgemm_buffer3) - - else - - call dgemm('N','T',np,nq,block_size,-1.d0, & + call dgemm('N','T',np,nq,block_size,-1.d0, & Ltmp_p, np, Ltmp_q, nq, 1.d0, Delta, np) - endif - - iblock = 0 + iblock = 0 endif @@ -469,71 +325,47 @@ print *, 'ok' enddo iblock = iblock+1 - - if (delta_on_disk) then - - read(iunit,rec=dj) Ltmp_p(1:np,iblock) - - else - - !$OMP PARALLEL DO PRIVATE(p) - do p=1,np - Ltmp_p(p,iblock) = Delta(p,dj) - enddo - !$OMP END PARALLEL DO - - endif + !$OMP PARALLEL DO PRIVATE(p) + do p=1,np + Ltmp_p(p,iblock) = Delta(p,dj) + enddo + !$OMP END PARALLEL DO if (.not.computed(dj)) then m = dj if (do_direct_integrals) then !$OMP PARALLEL DO PRIVATE(k) SCHEDULE(dynamic,21) do k=1,np + Delta_col(k) = 0.d0 if (.not.ao_two_e_integral_zero( addr1(Lset(k)), addr1(Dset(m)),& addr2(Lset(k)), addr2(Dset(m)) ) ) then Delta_col(k) = & ao_two_e_integral(addr1(Lset(k)), addr2(Lset(k)),& addr1(Dset(m)), addr2(Dset(m))) - else - Delta_col(k) = 0.d0 endif enddo !$OMP END PARALLEL DO else + PROVIDE ao_integrals_map !$OMP PARALLEL DO PRIVATE(k) SCHEDULE(dynamic,21) do k=1,np + Delta_col(k) = 0.d0 if (.not.ao_two_e_integral_zero( addr1(Lset(k)), addr1(Dset(m)),& addr2(Lset(k)), addr2(Dset(m)) ) ) then Delta_col(k) = & get_ao_two_e_integral( addr1(Lset(k)), addr1(Dset(m)),& addr2(Lset(k)), addr2(Dset(m)), ao_integrals_map) - else - Delta_col(k) = 0.d0 endif enddo !$OMP END PARALLEL DO endif - if (delta_on_disk) then - - !$OMP PARALLEL DO PRIVATE(p) - do p=1,np - Ltmp_p(p,iblock) = Ltmp_p(p,iblock) + Delta_col(p) - enddo - !$OMP END PARALLEL DO - - write(iunit, ASYNCHRONOUS='YES', rec=dj) Ltmp_p(1:np,iblock) - - else - - !$OMP PARALLEL DO PRIVATE(p) - do p=1,np - Ltmp_p(p,iblock) = Ltmp_p(p,iblock) + Delta_col(p) - Delta(p,dj) = Ltmp_p(p,iblock) - enddo - !$OMP END PARALLEL DO - - endif + !$OMP PARALLEL DO PRIVATE(p) + do p=1,np + Ltmp_p(p,iblock) = Ltmp_p(p,iblock) + Delta_col(p) + Delta(p,dj) = Ltmp_p(p,iblock) + enddo + !$OMP END PARALLEL DO computed(dj) = .True. endif @@ -572,30 +404,26 @@ print *, 'ok' print '(I10, 4X, ES12.3)', rank, Qmax - deallocate(Delta_col) deallocate(Ltmp_p) deallocate(Ltmp_q) deallocate(computed) - if (delta_on_disk) then - close(iunit, status='delete') - else - deallocate(Delta) - endif + deallocate(Delta) ! i. N = rank ! j. 
- Dmax = D(Lset(1)) - do p=1,np - Dmax = max(Dmax, D(Lset(p))) - enddo + D_sorted(:) = -D(:) + call dsort_noidx_big(D_sorted,ndim8) + D_sorted(:) = -D_sorted(:) + + Dmax = D_sorted(1) dscale = 1.d0 dscale_tmp = dscale*dscale*Dmax np8=0_8 do p8=1,ndim8 - if ( dscale_tmp*D(p8) > tau2 ) then + if ( dscale_tmp*D(p8) >= tau2 ) then np8 = np8+1_8 Lset(np8) = p8 endif @@ -609,7 +437,6 @@ print *, 'ok' print *, '' allocate(cholesky_ao(ao_num,ao_num,rank), stat=ierr) -!print *, 'allocate : cholesky_ao(ao_num,ao_num,rank)', memory_of_double8(ao_num*ao_num*rank*1_8) if (ierr /= 0) then call print_memory_usage() @@ -621,7 +448,7 @@ print *, 'ok' !$OMP PARALLEL DO PRIVATE(k,j) do k=1,rank do j=1,ao_num - cholesky_ao(1:ao_num,j,k) = L((j-1)*ao_num+1:j*ao_num,k) + cholesky_ao(1:ao_num,j,k) = L((j-1_8)*ao_num+1_8:1_8*j*ao_num,k) enddo enddo !$OMP END PARALLEL DO @@ -646,5 +473,6 @@ print *, 'ok' call wall_time(wall1) print*,'Time to provide AO cholesky vectors = ',(wall1-wall0)/60.d0, ' min' + END_PROVIDER diff --git a/src/hartree_fock/fock_matrix_hf.irp.f b/src/hartree_fock/fock_matrix_hf.irp.f index 65b3d63c..6d917322 100644 --- a/src/hartree_fock/fock_matrix_hf.irp.f +++ b/src/hartree_fock/fock_matrix_hf.irp.f @@ -194,17 +194,28 @@ END_PROVIDER endif - double precision :: rss + double precision :: rss, mem0, mem double precision :: memory_of_double integer :: iblock - integer, parameter :: block_size = 32 + integer :: block_size + + call resident_memory(mem0) + + block_size = 1024 + + rss = memory_of_double(2.d0*ao_num*ao_num) + do + mem = mem0 + block_size*rss + if ( (block_size < 2).or.(mem < qp_max_mem) ) exit + block_size = block_size/2 + enddo + + call check_mem(block_size*rss, irp_here) - rss = memory_of_double(ao_num*ao_num) - call check_mem(2.d0*block_size*rss, irp_here) allocate(X2(ao_num,ao_num,block_size,2)) allocate(X3(ao_num,block_size,ao_num,2)) - + ! ao_two_e_integral_alpha_chol (l,s) -= cholesky_ao(l,m,j) * SCF_density_matrix_ao_beta (m,n) * cholesky_ao(n,s,j) do iblock=1,cholesky_ao_num,block_size From 4b578d9849df7fa548a0b8627f714df6248a8440 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Thu, 20 Jun 2024 13:43:46 +0200 Subject: [PATCH 06/38] mmap is now shared in cholesky --- src/ao_two_e_ints/cholesky.irp.f | 36 ++++++++++++++++++-------------- src/utils/fortran_mmap.c | 3 +++ 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f index a680e7ee..063cc365 100644 --- a/src/ao_two_e_ints/cholesky.irp.f +++ b/src/ao_two_e_ints/cholesky.irp.f @@ -31,12 +31,11 @@ END_PROVIDER integer*8 :: ndim8 integer :: rank double precision :: tau, tau2 - double precision, pointer :: L(:,:), Delta(:,:) + double precision, pointer :: L(:,:) double precision :: s - double precision :: dscale, dscale_tmp - double precision, allocatable :: D(:), Ltmp_p(:,:), Ltmp_q(:,:), D_sorted(:), Delta_col(:) + double precision, allocatable :: D(:), Ltmp_p(:,:), Ltmp_q(:,:), D_sorted(:), Delta_col(:), Delta(:,:) integer, allocatable :: addr1(:), addr2(:) integer*8, allocatable :: Lset(:), Dset(:) logical, allocatable :: computed(:) @@ -102,7 +101,7 @@ END_PROVIDER rank = 0 allocate( D(ndim8), Lset(ndim8), Dset(ndim8), D_sorted(ndim8)) - allocate( addr1(ndim8), addr2(ndim8), Delta_col(ndim8) ) + allocate( addr1(ndim8), addr2(ndim8), Delta_col(ndim8), computed(ndim8) ) call resident_memory(mem0) @@ -149,11 +148,9 @@ END_PROVIDER Dmax = D_sorted(1) ! 2. 
- dscale = 1.d0 - dscale_tmp = dscale*dscale*Dmax np8=0_8 do p8=1,ndim8 - if ( dscale_tmp*D(p8) >= tau2 ) then + if ( Dmax*D(p8) >= tau2 ) then np8 = np8+1_8 Lset(np8) = p8 endif @@ -203,16 +200,23 @@ END_PROVIDER mem = mem0 & + np*memory_of_double(nq) & ! Delta(np,nq) - + (np+nq)*memory_of_double(block_size) & ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size) - + memory_of_int(nq) ! computed(nq) + + (np+nq)*memory_of_double(block_size) ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size) if (mem > qp_max_mem*0.5d0) then - nq = nq/2 + Dmin = D_sorted(nq/2) + do ii=nq/2,np-1 + if (D_sorted(ii) < Dmin) then + nq = ii + exit + endif + enddo else exit endif enddo +!call print_memory_usage +!print *, 'np, nq, Predicted memory: ', np, nq, mem if (nq <= 0) then print *, nq @@ -247,8 +251,7 @@ END_PROVIDER endif - allocate(computed(nq)) - computed(:) = .False. + computed(1:nq) = .False. !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(k,p,q) @@ -406,7 +409,6 @@ END_PROVIDER deallocate(Ltmp_p) deallocate(Ltmp_q) - deallocate(computed) deallocate(Delta) ! i. @@ -419,11 +421,9 @@ END_PROVIDER Dmax = D_sorted(1) - dscale = 1.d0 - dscale_tmp = dscale*dscale*Dmax np8=0_8 do p8=1,ndim8 - if ( dscale_tmp*D(p8) >= tau2 ) then + if ( Dmax*D(p8) >= tau2 ) then np8 = np8+1_8 Lset(np8) = p8 endif @@ -436,6 +436,10 @@ END_PROVIDER print *, '============ =============' print *, '' + deallocate( D, Lset, Dset, D_sorted ) + deallocate( addr1, addr2, Delta_col, computed ) + + allocate(cholesky_ao(ao_num,ao_num,rank), stat=ierr) if (ierr /= 0) then diff --git a/src/utils/fortran_mmap.c b/src/utils/fortran_mmap.c index 2dbe42b8..0306f64f 100644 --- a/src/utils/fortran_mmap.c +++ b/src/utils/fortran_mmap.c @@ -49,10 +49,13 @@ void* mmap_fortran(char* filename, size_t bytes, int* file_descr, int read_only, } if (single_node == 1) { + map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); +/* map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_POPULATE | MAP_NONBLOCK | MAP_NORESERVE, fd, 0); if (map == MAP_FAILED) { map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); } +*/ } else { map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); } From b6b169c1cd5ad510364cf5b33800cad8a6b5272c Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Thu, 20 Jun 2024 17:27:21 +0200 Subject: [PATCH 07/38] Updated documentation --- src/ao_two_e_ints/cholesky.irp.f | 3 +++ src/utils/linear_algebra.irp.f | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f index 41cdb80d..cdd64a8c 100644 --- a/src/ao_two_e_ints/cholesky.irp.f +++ b/src/ao_two_e_ints/cholesky.irp.f @@ -25,7 +25,10 @@ END_PROVIDER ! Last dimension of cholesky_ao is cholesky_ao_num ! ! https://mogp-emulator.readthedocs.io/en/latest/methods/proc/ProcPivotedCholesky.html + ! ! https://doi.org/10.1016/j.apnum.2011.10.001 : Page 4, Algorithm 1 + ! + ! https://www.diva-portal.org/smash/get/diva2:396223/FULLTEXT01.pdf END_DOC integer*8 :: ndim8 diff --git a/src/utils/linear_algebra.irp.f b/src/utils/linear_algebra.irp.f index 20386b30..4e7ca87d 100644 --- a/src/utils/linear_algebra.irp.f +++ b/src/utils/linear_algebra.irp.f @@ -1856,7 +1856,7 @@ subroutine pivoted_cholesky( A, rank, tol, ndim, U) ! ! matrix A is destroyed inside this subroutine ! Cholesky vectors are stored in U -! dimension of U: U(1:rank, 1:n) +! dimension of U: U(1:n, 1:rank) ! U is allocated inside this subroutine ! rank is the number of Cholesky vectors depending on tol ! 
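Patches 04 through 07 all revolve around the same kernel: the pivoted Cholesky decomposition of the AO two-electron integral matrix, Algorithm 1 of the reference cited in cholesky.irp.f. As a reading aid, here is a minimal dense-matrix sketch of that algorithm; it is not taken from the QP2 sources, it omits the out-of-core, screening and OpenMP machinery the patches above are tuning, and the names are illustrative, assuming A is symmetric positive semi-definite:

      ! Illustrative sketch only (not from the QP2 sources): dense pivoted
      ! Cholesky, the algorithm ao_two_e_ints/cholesky.irp.f runs out-of-core.
      subroutine pivoted_cholesky_sketch(A, n, tol, L, rank)
        implicit none
        integer         , intent(in)  :: n
        double precision, intent(in)  :: A(n,n)  ! symmetric positive semi-definite
        double precision, intent(in)  :: tol     ! plays the role of tau
        double precision, intent(out) :: L(n,n)  ! vectors returned in L(1:n,1:rank)
        integer         , intent(out) :: rank
        integer                       :: i, j, k
        double precision              :: D(n)    ! diagonal of the residual A - L.L^T

        L(:,:) = 0.d0
        do i = 1, n
          D(i) = A(i,i)
        enddo

        rank = 0
        do k = 1, n
          i = maxloc(D, 1)                       ! 1. pivot: largest residual diagonal
          if (D(i) <= tol) exit                  ! 2. converged to the requested threshold
          rank = rank + 1
          L(:,rank) = A(:,i)                     ! 3. residual column (the "Delta" of the provider)
          do j = 1, rank-1
            L(:,rank) = L(:,rank) - L(i,j) * L(:,j)
          enddo
          L(:,rank) = L(:,rank) / dsqrt(D(i))
          do j = 1, n                            ! 4. downdate the residual diagonal
            D(j) = D(j) - L(j,rank) * L(j,rank)
          enddo
        enddo
      end

In the provider, step 3 is the Delta update performed with blocked dgemm calls, steps 1 and 2 correspond to the D_sorted/Dmax bookkeeping, and the Lset/Dset index arrays restrict the work to columns whose residual diagonal still exceeds tau, which is exactly the part the memory logic of patches 05 and 06 reorganizes.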
From 7e45c517d981adcd77883a39a361be38b470ff20 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 25 Jun 2024 18:32:44 +0200 Subject: [PATCH 08/38] Introducing gpu_x86 --- src/gpu_x86/NEED | 1 + src/gpu_x86/README.rst | 5 + src/gpu_x86/gpu.c | 506 +++++++++++++++++++++++++++++++++++++ src/gpu_x86/gpu.h | 41 +++ src/gpu_x86/gpu_module.F90 | 141 +++++++++++ 5 files changed, 694 insertions(+) create mode 100644 src/gpu_x86/NEED create mode 100644 src/gpu_x86/README.rst create mode 100644 src/gpu_x86/gpu.c create mode 100644 src/gpu_x86/gpu.h create mode 100644 src/gpu_x86/gpu_module.F90 diff --git a/src/gpu_x86/NEED b/src/gpu_x86/NEED new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/src/gpu_x86/NEED @@ -0,0 +1 @@ + diff --git a/src/gpu_x86/README.rst b/src/gpu_x86/README.rst new file mode 100644 index 00000000..f530bf29 --- /dev/null +++ b/src/gpu_x86/README.rst @@ -0,0 +1,5 @@ +======= +gpu_x86 +======= + +x86 implementation of GPU routines. For use when GPUs are not available. diff --git a/src/gpu_x86/gpu.c b/src/gpu_x86/gpu.c new file mode 100644 index 00000000..71505dbe --- /dev/null +++ b/src/gpu_x86/gpu.c @@ -0,0 +1,506 @@ +#include +#include +#include +#include +#include + + +/* Generic functions */ + +int gpu_ndevices() { + return 1; +} + +void gpu_set_device(int32_t i) { + return; +} + + +/* Allocation functions */ + +void gpu_allocate(void** ptr, const int64_t n) { + *ptr = malloc((size_t) n); + if (*ptr == NULL) { + perror("Allocation failed"); + } +} + +void gpu_free(void** ptr) { + free(*ptr); + *ptr = NULL; +} + + +/* Memory transfer functions */ + +void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) { + memcpy(gpu_ptr, cpu_ptr, n); +} + +void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) { + memcpy(cpu_ptr, gpu_ptr, n); +} + +void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) { + memcpy(gpu_ptr_dest, gpu_ptr_src, n); +} + + +/* Streams */ + +void gpu_stream_create(void** ptr) { + *ptr = (void*) 2; +} + +void gpu_stream_destroy(void** ptr) { + *ptr = NULL; +} + +void gpu_set_stream(void* handle, void* stream) { + return; +} + +void gpu_synchronize() { + return; +} + + +/* BLAS functions */ + +void gpu_blas_create(void** handle) { + *handle = (void*) 1; +} + + +void gpu_blas_destroy(void** handle) { + *handle = NULL; +} + + +double ddot_(const int32_t* n, const double* x, const int32_t* incx, const double* y, const int32_t* incy); + +void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { + assert (handle != NULL); + + /* Convert to int32_t */ + int32_t n_, incx_, incy_; + + n_ = (int32_t) n; + incx_ = (int32_t) incx; + incy_ = (int32_t) incy; + + /* Check for integer overflows */ + assert ( (int64_t) n_ == n ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + *result = ddot_(&n_, x, &incx_, y, &incy_); +} + + +float sdot_(const int32_t* n, const float* x, const int32_t* incx, const float* y, const int32_t* incy); + +void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { + assert (handle != NULL); + + /* Convert to int32_t */ + int32_t n_, incx_, incy_; + + n_ = (int32_t) n; + incx_ = (int32_t) incx; + incy_ = (int32_t) incy; + + /* Check for integer overflows */ + assert ( (int64_t) n_ == n ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + *result = sdot_(&n_, x, 
&incx_, y, &incy_); +} + + +void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha, + const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy); + +void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double alpha, + const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) { + + assert (handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, lda_, incx_, incy_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + lda_ = (int32_t) lda; + incx_ = (int32_t) incx; + incy_ = (int32_t) incy; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + dgemv_(&transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_); +} + + +void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha, + const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy); + +void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float alpha, + const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) { + + assert (handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, lda_, incx_, incy_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + lda_ = (int32_t) lda; + incx_ = (int32_t) incx; + incy_ = (int32_t) incy; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + sgemv_(&transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_); +} + + +void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha, + const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc); + +void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, + const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) { + + assert (handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, k_, lda_, ldb_, ldc_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + k_ = (int32_t) k; + lda_ = (int32_t) lda; + ldb_ = (int32_t) ldb; + ldc_ = (int32_t) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) k_ == k ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + dgemm_(&transa, &transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_); +} + + + +void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha, + const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc); + +void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, + const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, 
const int64_t ldc) { + + assert (handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, k_, lda_, ldb_, ldc_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + k_ = (int32_t) k; + lda_ = (int32_t) lda; + ldb_ = (int32_t) ldb; + ldc_ = (int32_t) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) k_ == k ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + sgemm_(&transa, &transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_); +} + + +void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, + const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { + if (handle == NULL) { + perror("NULL handle"); + exit(-1); + } + + if ( (transa == 'N' && transb == 'N') || + (transa == 'n' && transb == 'N') || + (transa == 'N' && transb == 'n') || + (transa == 'n' && transb == 'n') ) { + + if (alpha == 0.) { + + for (int64_t j=0 ; j + +int gpu_ndevices(); +void gpu_set_device(int32_t i); + +void gpu_allocate(void** ptr, const int64_t n); +void gpu_free(void** ptr); + +void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n); +void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n); +void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n); + +void gpu_stream_create(void** ptr); +void gpu_stream_destroy(void** ptr); +void gpu_set_stream(void* handle, void* stream); +void gpu_synchronize(); + +void gpu_blas_create(void** handle); +void gpu_blas_destroy(void** handle); + +void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result); + +void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result); + +void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double alpha, + const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy); + +void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float alpha, + const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy); + +void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, + const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc); + +void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, + const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc); + +void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, + const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc); + +void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, + const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc); diff --git a/src/gpu_x86/gpu_module.F90 
b/src/gpu_x86/gpu_module.F90 new file mode 100644 index 00000000..86ba3926 --- /dev/null +++ b/src/gpu_x86/gpu_module.F90 @@ -0,0 +1,141 @@ +module gpu + use, intrinsic :: iso_c_binding, only : c_int32_t, c_int64_t, c_double, c_size_t, c_char + implicit none + + interface + integer function gpu_ndevices() bind(C) + end function + + subroutine gpu_set_device(id) bind(C) + import + integer(c_int32_t), value :: id + end subroutine + + subroutine gpu_allocate_c(ptr, n) bind(C, name='gpu_allocate') + import + type(c_ptr) :: ptr + integer(c_int64_t), value :: n + end subroutine + + subroutine gpu_free_c(ptr) bind(C, name='gpu_free') + import + type(c_ptr) :: ptr + end subroutine + + subroutine gpu_upload_c(cpu_ptr, gpu_ptr, n) bind(C, name='gpu_upload') + import + type(c_ptr), value :: cpu_ptr + type(c_ptr), value :: gpu_ptr + integer(c_int64_t), value :: n + end subroutine + + subroutine gpu_download_c(gpu_ptr, cpu_ptr, n) bind(C, name='gpu_download') + import + type(c_ptr), value :: gpu_ptr + type(c_ptr), value :: cpu_ptr + integer(c_int64_t), value :: n + end subroutine + + subroutine gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, n) bind(C, name='gpu_copy') + import + type(c_ptr), value :: gpu_ptr_src + type(c_ptr), value :: gpu_ptr_dest + integer(c_int64_t), value :: n + end subroutine + + subroutine gpu_stream_create(stream) bind(C) + import + type(c_ptr) :: stream + end subroutine + + subroutine gpu_stream_destroy(stream) bind(C) + import + type(c_ptr) :: stream + end subroutine + + subroutine gpu_set_stream(handle, stream) bind(C) + import + type(c_ptr) :: handle, stream + end subroutine + + subroutine gpu_synchronize() + end subroutine + + subroutine gpu_blas_create(handle) bind(C) + import + type(c_ptr) :: handle + end subroutine + + subroutine gpu_blas_destroy(handle) bind(C) + import + type(c_ptr) :: handle + end subroutine + + subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) bind(C) + import + type(c_ptr), intent(in) :: handle + integer(c_int64_t), value :: n, incx, incy + real(c_double), intent(in) :: dx(*), dy(*) + real(c_double), intent(out) :: res + end subroutine + + subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res) bind(C) + import + type(c_ptr), intent(in) :: handle + integer(c_int64_t), value :: n, incx, incy + real(c_float), intent(in) :: dx(*), dy(*) + real(c_float), intent(out) :: res + end subroutine + + end interface + +end module + +subroutine gpu_allocate_double(ptr, s) + use gpu + implicit none + double precision, pointer, intent(inout) :: ptr + integer*8, intent(in) :: s(*) + type(c_ptr) :: cptr + + call gpu_allocate_c(cptr, sum(s)*8_8) + call c_f_pointer(cptr, ptr, s) +end subroutine + +subroutine gpu_free_double(ptr) + use gpu + implicit none + double precision, pointer, intent(inout) :: ptr + type(c_ptr) :: cptr + cptr = cloc(ptr) + call gpu_free(cptr) + NULLIFY(ptr) +end subroutine + +subroutine gpu_upload_double(cpu_ptr, gpu_ptr, n) + use gpu + implicit none + double precision, intent(in) :: cpu_ptr(*) + double precision, intent(out) :: gpu_ptr(*) + integer(c_int64_t), intent(in) :: n + call gpu_upload_c(cpu_ptr, gpu_ptr, 8_8*n) +end subroutine + +subroutine gpu_download_double(gpu_ptr, cpu_ptr, n) + use gpu + implicit none + double precision, intent(in) :: gpu_ptr(*) + double precision, intent(out) :: cpu_ptr(*) + integer(c_int64_t), intent(in) :: n + call gpu_download_c(gpu_ptr, cpu_ptr, 8_8*n) +end subroutine + +subroutine gpu_copy_double(gpu_ptr_src, gpu_ptr_dest, n) + use gpu + implicit none + double precision, intent(in) :: gpu_ptr_src(*) + double 
precision, intent(out) :: gpu_ptr_dest(*) + integer(c_int64_t), intent(in) :: n + call gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, 8_8*n) +end subroutine + From 646607ada4a0a58c9cd5e0593c04bce7bc9bd02e Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Wed, 26 Jun 2024 11:15:30 +0200 Subject: [PATCH 09/38] 1st commit --- plugins/local/tc_int/NEED | 5 + plugins/local/tc_int/README.rst | 4 + plugins/local/tc_int/int2_grad1_u12.irp.f | 134 ++++++++++++++++++++++ plugins/local/tc_int/jast_grad_2e.irp.f | 102 ++++++++++++++++ plugins/local/tc_int/jast_grad_full.irp.f | 51 ++++++++ plugins/local/tc_int/jast_utils_bh.irp.f | 35 ++++++ plugins/local/tc_int/write_tc_int.irp.f | 58 ++++++++++ 7 files changed, 389 insertions(+) create mode 100644 plugins/local/tc_int/NEED create mode 100644 plugins/local/tc_int/README.rst create mode 100644 plugins/local/tc_int/int2_grad1_u12.irp.f create mode 100644 plugins/local/tc_int/jast_grad_2e.irp.f create mode 100644 plugins/local/tc_int/jast_grad_full.irp.f create mode 100644 plugins/local/tc_int/jast_utils_bh.irp.f create mode 100644 plugins/local/tc_int/write_tc_int.irp.f diff --git a/plugins/local/tc_int/NEED b/plugins/local/tc_int/NEED new file mode 100644 index 00000000..8a4caf5b --- /dev/null +++ b/plugins/local/tc_int/NEED @@ -0,0 +1,5 @@ +tc_keywords +jastrow +qmckl +becke_numerical_grid +dft_utils_in_r diff --git a/plugins/local/tc_int/README.rst b/plugins/local/tc_int/README.rst new file mode 100644 index 00000000..bc9e8483 --- /dev/null +++ b/plugins/local/tc_int/README.rst @@ -0,0 +1,4 @@ +====== +tc_int +====== + diff --git a/plugins/local/tc_int/int2_grad1_u12.irp.f b/plugins/local/tc_int/int2_grad1_u12.irp.f new file mode 100644 index 00000000..0cf0d775 --- /dev/null +++ b/plugins/local/tc_int/int2_grad1_u12.irp.f @@ -0,0 +1,134 @@ + +! --- + +subroutine provide_int2_grad1_u12_ao() + + implicit none + integer :: ipoint, i, j, m, jpoint + integer :: n_blocks, n_rest, n_pass + integer :: i_blocks, i_rest, i_pass, ii + double precision :: time0, time1 + double precision :: mem, n_double + double precision, allocatable :: tmp(:,:,:) + double precision, allocatable :: tmp_grad1_u12(:,:,:) + double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:) + + PROVIDE final_weight_at_r_vector_extra aos_in_r_array_extra + + print*, ' start provide_int2_grad1_u12_ao ...' + call wall_time(time0) + + + ! int2_grad1_u12_ao(i,j,ipoint,1) = \int dr2 [\grad1 u(r1,r2)]_x1 \chi_i(r2) \chi_j(r2) + ! int2_grad1_u12_ao(i,j,ipoint,2) = \int dr2 [\grad1 u(r1,r2)]_y1 \chi_i(r2) \chi_j(r2) + ! int2_grad1_u12_ao(i,j,ipoint,3) = \int dr2 [\grad1 u(r1,r2)]_z1 \chi_i(r2) \chi_j(r2) + ! 
int2_grad1_u12_ao(i,j,ipoint,4) = -(1/2) \int dr2 [\grad1 u(r1,r2)]^2 \chi_i(r2) \chi_j(r2) + allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,4)) + + + + call total_memory(mem) + mem = max(1.d0, qp_max_mem - mem) + n_double = mem * 1.d8 + n_blocks = int(min(n_double / (n_points_extra_final_grid * 4.d0), 1.d0*n_points_final_grid)) + n_rest = int(mod(n_points_final_grid, n_blocks)) + n_pass = int((n_points_final_grid - n_rest) / n_blocks) + + call write_int(6, n_pass, 'Number of passes') + call write_int(6, n_blocks, 'Size of the blocks') + call write_int(6, n_rest, 'Size of the last block') + + + allocate(tmp(n_points_extra_final_grid,ao_num,ao_num)) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (j, i, jpoint) & + !$OMP SHARED (tmp, ao_num, n_points_extra_final_grid, final_weight_at_r_vector_extra, aos_in_r_array_extra_transp) + !$OMP DO SCHEDULE (static) + do j = 1, ao_num + do i = 1, ao_num + do jpoint = 1, n_points_extra_final_grid + tmp(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + + allocate(tmp_grad1_u12(n_points_extra_final_grid,n_blocks,4)) + + do i_pass = 1, n_pass + ii = (i_pass-1)*n_blocks + 1 + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i_blocks, ipoint) & + !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12) + !$OMP DO + do i_blocks = 1, n_blocks + ipoint = ii - 1 + i_blocks ! r1 + call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_blocks,1), tmp_grad1_u12(1,i_blocks,2), tmp_grad1_u12(1,i_blocks,3), tmp_grad1_u12(1,i_blocks,4)) + enddo + !$OMP END DO + !$OMP END PARALLEL + + do m = 1, 4 + call dgemm( "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0 & + , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid & + , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) + enddo + enddo + + deallocate(tmp_grad1_u12) + + + if(n_rest .gt. 0) then + + allocate(tmp_grad1_u12(n_points_extra_final_grid,n_rest,4)) + + ii = n_pass*n_blocks + 1 + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i_rest, ipoint) & + !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12) + !$OMP DO + do i_rest = 1, n_rest + ipoint = ii - 1 + i_rest ! r1 + call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_rest,1), tmp_grad1_u12(1,i_rest,2), tmp_grad1_u12(1,i_rest,3), tmp_grad1_u12(1,i_rest,4)) + enddo + !$OMP END DO + !$OMP END PARALLEL + + do m = 1, 4 + call dgemm( "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0 & + , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid & + , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) + enddo + + deallocate(tmp_grad1_u12) + endif + + deallocate(tmp) + + + ! --- + + print*, ' Writing int2_grad1_u12_ao in ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao' + open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="write") + call ezfio_set_work_empty(.False.) + write(11) int2_grad1_u12_ao(:,:,:,1:3) + close(11) + + deallocate(int2_grad1_u12_ao) + + call wall_time(time1) + print*, ' wall time for provide_int2_grad1_u12_ao (min) = ', (time1-time0) / 60.d0 + call print_memory_usage() + +end + +! 
--- + + diff --git a/plugins/local/tc_int/jast_grad_2e.irp.f b/plugins/local/tc_int/jast_grad_2e.irp.f new file mode 100644 index 00000000..b18b9d62 --- /dev/null +++ b/plugins/local/tc_int/jast_grad_2e.irp.f @@ -0,0 +1,102 @@ + +! --- + +subroutine get_grad1_u12_r1_2e(r1, n_grid2, gradx, grady, gradz) + + BEGIN_DOC + ! + ! d/dx1 j_2e(1,2) + ! d/dy1 j_2e(1,2) + ! d/dz1 j_2e(1,2) + ! + END_DOC + + include 'constants.include.F' + + implicit none + integer , intent(in) :: n_grid2 + double precision, intent(in) :: r1(3) + double precision, intent(out) :: gradx(n_grid2) + double precision, intent(out) :: grady(n_grid2) + double precision, intent(out) :: gradz(n_grid2) + + integer :: jpoint + integer :: i_nucl, p, mpA, npA, opA + integer :: powmax1, powmax, powmax2 + double precision :: r2(3) + double precision :: tmp, tmp1, tmp2 + double precision :: rn(3), f1A, grad1_f1A(3), f2A, grad2_f2A(3), g12, grad1_g12(3) + double precision, allocatable :: f1A_power(:), f2A_power(:), double_p(:), g12_power(:) + + + powmax1 = max(maxval(jBH_m), maxval(jBH_n)) + powmax2 = maxval(jBH_o) + powmax = max(powmax1, powmax2) + + allocate(f1A_power(-1:powmax), f2A_power(-1:powmax), g12_power(-1:powmax), double_p(0:powmax)) + + do p = 0, powmax + double_p(p) = dble(p) + enddo + + f1A_power(-1) = 0.d0 + f2A_power(-1) = 0.d0 + g12_power(-1) = 0.d0 + + f1A_power(0) = 1.d0 + f2A_power(0) = 1.d0 + g12_power(0) = 1.d0 + + do jpoint = 1, n_points_extra_final_grid ! r2 + + r2(1) = final_grid_points_extra(1,jpoint) + r2(2) = final_grid_points_extra(2,jpoint) + r2(3) = final_grid_points_extra(3,jpoint) + + gradx(jpoint) = 0.d0 + grady(jpoint) = 0.d0 + gradz(jpoint) = 0.d0 + do i_nucl = 1, nucl_num + + rn(1) = nucl_coord(i_nucl,1) + rn(2) = nucl_coord(i_nucl,2) + rn(3) = nucl_coord(i_nucl,3) + + call jBH_elem_fct_grad(jBH_en(i_nucl), r1, rn, f1A, grad1_f1A) + call jBH_elem_fct_grad(jBH_en(i_nucl), r2, rn, f2A, grad2_f2A) + call jBH_elem_fct_grad(jBH_ee(i_nucl), r1, r2, g12, grad1_g12) + + ! Compute powers of f1A and f2A + do p = 1, powmax1 + f1A_power(p) = f1A_power(p-1) * f1A + f2A_power(p) = f2A_power(p-1) * f2A + enddo + do p = 1, powmax2 + g12_power(p) = g12_power(p-1) * g12 + enddo + + do p = 1, jBH_size + mpA = jBH_m(p,i_nucl) + npA = jBH_n(p,i_nucl) + opA = jBH_o(p,i_nucl) + tmp = jBH_c(p,i_nucl) + if(mpA .eq. npA) then + tmp = tmp * 0.5d0 + endif + + tmp1 = double_p(mpA) * f1A_power(mpA-1) * f2A_power(npA) + double_p(npA) * f1A_power(npA-1) * f2A_power(mpA) + tmp1 = tmp1 * g12_power(opA) * tmp + tmp2 = double_p(opA) * g12_power(opA-1) * (f1A_power(mpA) * f2A_power(npA) + f1A_power(npA) * f2A_power(mpA)) * tmp + + gradx(jpoint) = gradx(jpoint) + tmp1 * grad1_f1A(1) + tmp2 * grad1_g12(1) + grady(jpoint) = grady(jpoint) + tmp1 * grad1_f1A(2) + tmp2 * grad1_g12(2) + gradz(jpoint) = gradz(jpoint) + tmp1 * grad1_f1A(3) + tmp2 * grad1_g12(3) + enddo ! p + enddo ! i_nucl + enddo ! jpoint + + return +end + +! --- + diff --git a/plugins/local/tc_int/jast_grad_full.irp.f b/plugins/local/tc_int/jast_grad_full.irp.f new file mode 100644 index 00000000..f63ee3e4 --- /dev/null +++ b/plugins/local/tc_int/jast_grad_full.irp.f @@ -0,0 +1,51 @@ + +! --- + +subroutine get_grad1_u12_for_tc(ipoint, n_grid2, resx, resy, resz, res) + + BEGIN_DOC + ! + ! resx(ipoint) = [grad1 u(r1,r2)]_x1 + ! resy(ipoint) = [grad1 u(r1,r2)]_y1 + ! resz(ipoint) = [grad1 u(r1,r2)]_z1 + ! res (ipoint) = -0.5 [grad1 u(r1,r2)]^2 + ! + ! We use: + ! grid for r1 + ! extra_grid for r2 + ! 
+ END_DOC + + implicit none + integer, intent(in) :: ipoint, n_grid2 + double precision, intent(out) :: resx(n_grid2), resy(n_grid2), resz(n_grid2), res(n_grid2) + + integer :: jpoint + double precision :: env_r1, tmp + double precision :: grad1_env(3), r1(3) + double precision, allocatable :: env_r2(:) + double precision, allocatable :: u2b_r12(:), gradx1_u2b(:), grady1_u2b(:), gradz1_u2b(:) + double precision, allocatable :: u2b_mu(:), gradx1_mu(:), grady1_mu(:), gradz1_mu(:) + double precision, allocatable :: u2b_nu(:), gradx1_nu(:), grady1_nu(:), gradz1_nu(:) + double precision, external :: env_nucl + + r1(1) = final_grid_points(1,ipoint) + r1(2) = final_grid_points(2,ipoint) + r1(3) = final_grid_points(3,ipoint) + + + ! j2e_type .eq. "Boys_Handy" + ! env_type .eq. "None" + ! j1e_type .eq "None" + + call get_grad1_u12_r1_2e(r1, n_grid2, resx(1), resy(1), resz(1)) + + do jpoint = 1, n_points_extra_final_grid + res(jpoint) = -0.5d0 * (resx(jpoint) * resx(jpoint) + resy(jpoint) * resy(jpoint) + resz(jpoint) * resz(jpoint)) + enddo + + return +end + +! --- + diff --git a/plugins/local/tc_int/jast_utils_bh.irp.f b/plugins/local/tc_int/jast_utils_bh.irp.f new file mode 100644 index 00000000..750ce90b --- /dev/null +++ b/plugins/local/tc_int/jast_utils_bh.irp.f @@ -0,0 +1,35 @@ + +! --- + +subroutine jBH_elem_fct_grad(alpha, r1, r2, fct, grad1_fct) + + implicit none + double precision, intent(in) :: alpha, r1(3), r2(3) + double precision, intent(out) :: fct, grad1_fct(3) + double precision :: dist, tmp1, tmp2 + + dist = dsqrt( (r1(1) - r2(1)) * (r1(1) - r2(1)) & + + (r1(2) - r2(2)) * (r1(2) - r2(2)) & + + (r1(3) - r2(3)) * (r1(3) - r2(3)) ) + + + if(dist .ge. 1d-10) then + tmp1 = 1.d0 / (1.d0 + alpha * dist) + + fct = alpha * dist * tmp1 + tmp2 = alpha * tmp1 * tmp1 / dist + grad1_fct(1) = tmp2 * (r1(1) - r2(1)) + grad1_fct(2) = tmp2 * (r1(2) - r2(2)) + grad1_fct(3) = tmp2 * (r1(3) - r2(3)) + else + grad1_fct(1) = 0.d0 + grad1_fct(2) = 0.d0 + grad1_fct(3) = 0.d0 + fct = 0.d0 + endif + + return +end + +! --- + diff --git a/plugins/local/tc_int/write_tc_int.irp.f b/plugins/local/tc_int/write_tc_int.irp.f new file mode 100644 index 00000000..ebdce6f2 --- /dev/null +++ b/plugins/local/tc_int/write_tc_int.irp.f @@ -0,0 +1,58 @@ +! --- + +program write_tc_int + + implicit none + + print *, ' j2e_type = ', j2e_type + print *, ' j1e_type = ', j1e_type + print *, ' env_type = ', env_type + + my_grid_becke = .True. + PROVIDE tc_grid1_a tc_grid1_r + my_n_pt_r_grid = tc_grid1_r + my_n_pt_a_grid = tc_grid1_a + touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid + + call write_int(6, my_n_pt_r_grid, 'radial external grid over') + call write_int(6, my_n_pt_a_grid, 'angular external grid over') + + if(tc_integ_type .eq. "numeric") then + my_extra_grid_becke = .True. + PROVIDE tc_grid2_a tc_grid2_r + my_n_pt_r_extra_grid = tc_grid2_r + my_n_pt_a_extra_grid = tc_grid2_a + touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid + + call write_int(6, my_n_pt_r_extra_grid, 'radial internal grid over') + call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over') + endif + + call main() + +end + +! --- + +subroutine main() + + implicit none + + PROVIDE io_tc_integ + + print*, 'io_tc_integ = ', io_tc_integ + + if(io_tc_integ .ne. "Write") then + print*, 'io_tc_integ != Write' + print*, io_tc_integ + stop + endif + + call provide_int2_grad1_u12_ao() + + call ezfio_set_tc_keywords_io_tc_integ('Read') + +end + +! 
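! A short derivation of the tmp1/tmp2 factors used in get_grad1_u12_r1_2e, as
! read from the code (a sketch, not authoritative). For one nucleus A and one
! term p of the Boys-Handy expansion, the symmetrised term is
!
!   u_p = c_p * [f1A^mpA * f2A^npA + f1A^npA * f2A^mpA] * g12^opA
!
! with c_p halved when mpA = npA to avoid double counting. Only f1A and g12
! depend on r1 (grad2_f2A is returned but unused here, since \grad1 f2A = 0), so
!
!   grad1 u_p = c_p * [mpA*f1A^(mpA-1)*f2A^npA + npA*f1A^(npA-1)*f2A^mpA] * g12^opA * grad1_f1A
!             + c_p * opA * g12^(opA-1) * [f1A^mpA*f2A^npA + f1A^npA*f2A^mpA] * grad1_g12
!
! which are exactly the tmp1 and tmp2 contributions accumulated into
! gradx/grady/gradz.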
---

From a2f4bc218d207d4c588b3d0d1c1d4c5f7448b334 Mon Sep 17 00:00:00 2001
From: Anthony Scemama
Date: Wed, 26 Jun 2024 13:44:45 +0200
Subject: [PATCH 10/38] GPU acceleration selection in configure

---
 configure | 36 +++++++++++++++++++++++++++++-------
 1 file changed, 29 insertions(+), 7 deletions(-)

diff --git a/configure b/configure
index 41c0123d..014275eb 100755
--- a/configure
+++ b/configure
@@ -40,14 +40,16 @@ Usage:
   $(basename $0) -c <file>
   $(basename $0) -h
   $(basename $0) -i <package>
+  $(basename $0) -g [nvidia|none]
 
 Options:
-  -c <file>        Define a COMPILATION configuration file,
-                   in "${QP_ROOT}/config/".
-  -h               Print the HELP message
-  -i <package>     INSTALL <package>. Use at your OWN RISK:
-                   no support will be provided for the installation of
-                   dependencies.
+  -c <file>         Define a COMPILATION configuration file,
+                    in "${QP_ROOT}/config/".
+  -h                Print the HELP message
+  -i <package>      INSTALL <package>. Use at your OWN RISK:
+                    no support will be provided for the installation of
+                    dependencies.
+  -g [nvidia|none]  Choose GPU acceleration (experimental)
 
 Example:
   ./$(basename $0) -c config/gfortran.cfg
@@ -83,7 +85,7 @@ function execute () {
 
 PACKAGES=""
 
-while getopts "d:c:i:h" c ; do
+while getopts "d:c:i:g:h" c ; do
   case "$c" in
     c)
       case "$OPTARG" in
@@ -100,6 +102,9 @@ while getopts "d:c:i:h" c ; do
         "") help ; break;;
         *) PACKAGES="${PACKAGE} $OPTARG"
       esac;;
+    g)
+      GPU=$OPTARG;
+      break;;
     h)
       help
       exit 0;;
@@ -109,6 +114,23 @@
   esac
 done
 
+# Handle GPU acceleration
+rm -f ${QP_ROOT}/src/gpu
+case "$GPU" in
+  amd) # AMD
+    echo "Activating AMD GPU acceleration"
+    ln -s ${QP_ROOT}/src/gpu_amd ${QP_ROOT}/src/gpu
+    ;;
+  nvidia) # Nvidia
+    echo "Activating Nvidia GPU acceleration"
+    ln -s ${QP_ROOT}/src/gpu_nvidia ${QP_ROOT}/src/gpu
+    ;;
+  *) # No Acceleration
+    echo "Disabling GPU acceleration"
+    ln -s ${QP_ROOT}/src/gpu_x86 ${QP_ROOT}/src/gpu
+    ;;
+esac
+
 # Trim leading and trailing spaces
 PACKAGES=$(echo $PACKAGES | xargs)

From 1d0bac25d081d76177c6efeba88117f397c5de3c Mon Sep 17 00:00:00 2001
From: Abdallah Ammar
Date: Wed, 26 Jun 2024 15:31:44 +0200
Subject: [PATCH 11/38] v0

---
 plugins/local/tc_int/compute_tc_int.irp.f | 295 ++++++++++++++++++++++
 plugins/local/tc_int/int2_grad1_u12.irp.f | 134 ----------
 plugins/local/tc_int/jast_grad_2e.irp.f   | 102 --------
 plugins/local/tc_int/jast_grad_full.irp.f | 113 +++++++--
 plugins/local/tc_int/write_tc_int.irp.f   |  18 +-
 5 files changed, 401 insertions(+), 261 deletions(-)
 create mode 100644 plugins/local/tc_int/compute_tc_int.irp.f
 delete mode 100644 plugins/local/tc_int/int2_grad1_u12.irp.f
 delete mode 100644 plugins/local/tc_int/jast_grad_2e.irp.f

diff --git a/plugins/local/tc_int/compute_tc_int.irp.f b/plugins/local/tc_int/compute_tc_int.irp.f
new file mode 100644
index 00000000..02f21570
--- /dev/null
+++ b/plugins/local/tc_int/compute_tc_int.irp.f
@@ -0,0 +1,295 @@
+
+! ---
+
+subroutine provide_int2_grad1_u12_ao()
+
+  BEGIN_DOC
+  !
+  ! int2_grad1_u12_ao(i,j,ipoint,1) = \int dr2 [\grad1 u(r1,r2)]_x1 \chi_i(r2) \chi_j(r2)
+  ! int2_grad1_u12_ao(i,j,ipoint,2) = \int dr2 [\grad1 u(r1,r2)]_y1 \chi_i(r2) \chi_j(r2)
+  ! int2_grad1_u12_ao(i,j,ipoint,3) = \int dr2 [\grad1 u(r1,r2)]_z1 \chi_i(r2) \chi_j(r2)
+  ! int2_grad1_u12_ao(i,j,ipoint,4) = \int dr2 [-(1/2) [\grad1 u(r1,r2)]^2] \chi_i(r2) \chi_j(r2)
+  !
+  !
+  ! tc_int_2e_ao(k,i,l,j) = (ki|V^TC(r_12)|lj)
+  !                       = < l k | V^TC(r_12) | j i > where V^TC(r_12) is the total TC operator
+  !                       = tc_grad_and_lapl_ao(k,i,l,j) + tc_grad_square_ao(k,i,l,j) + ao_two_e_coul(k,i,l,j)
+  ! where:
+  !
+  ! tc_grad_and_lapl_ao(k,i,l,j) = < k l | -1/2 \Delta_1 u(r1,r2) - \grad_1 u(r1,r2) . \grad_1 | ij >
+  !                              = -1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2)
+  !                              =  1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 (-1) \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2)
+  !
+  ! tc_grad_square_ao(k,i,l,j) = -1/2 < k l | [\grad_1 u(r1,r2)]^2 + [\grad_2 u(r1,r2)]^2 | i j >
+  !
+  ! ao_two_e_coul(k,i,l,j) = < l k | 1/r12 | j i > = ( k i | 1/r12 | l j )
+  !
+  END_DOC
+
+  implicit none
+
+  integer                       :: i, j, k, l, m, ipoint, jpoint
+  integer                       :: n_blocks, n_rest, n_pass
+  integer                       :: i_blocks, i_rest, i_pass, ii
+  double precision              :: mem, n_double
+  double precision              :: weight1, ao_k_r, ao_i_r
+  double precision              :: der_envsq_x, der_envsq_y, der_envsq_z, lap_envsq
+  double precision              :: time0, time1, time2, tc1, tc2, tc
+  double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:), tc_int_2e_ao(:,:,:,:)
+  double precision, allocatable :: tmp(:,:,:), c_mat(:,:,:), tmp_grad1_u12(:,:,:)
+
+  double precision, external   :: get_ao_two_e_integral
+
+  PROVIDE final_weight_at_r_vector_extra aos_in_r_array_extra
+  PROVIDE final_weight_at_r_vector aos_grad_in_r_array_transp_bis final_weight_at_r_vector aos_in_r_array_transp
+
+  print*, ' start provide_int2_grad1_u12_ao ...'
+  call wall_time(time0)
+
+  call total_memory(mem)
+  mem      = max(1.d0, qp_max_mem - mem)
+  n_double = mem * 1.d8
+  n_blocks = int(min(n_double / (n_points_extra_final_grid * 4.d0), 1.d0*n_points_final_grid))
+  n_rest   = int(mod(n_points_final_grid, n_blocks))
+  n_pass   = int((n_points_final_grid - n_rest) / n_blocks)
+
+  call write_int(6, n_pass, 'Number of passes')
+  call write_int(6, n_blocks, 'Size of the blocks')
+  call write_int(6, n_rest, 'Size of the last block')
+
+  ! ---
+  ! ---
+  ! ---
+
+  allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,4))
+
+  allocate(tmp(n_points_extra_final_grid,ao_num,ao_num))
+ !$OMP PARALLEL               &
+ !$OMP DEFAULT (NONE)         &
+ !$OMP PRIVATE (j, i, jpoint) &
+ !$OMP SHARED (tmp, ao_num, n_points_extra_final_grid, final_weight_at_r_vector_extra, aos_in_r_array_extra_transp)
+ !$OMP DO SCHEDULE (static)
+  do j = 1, ao_num
+    do i = 1, ao_num
+      do jpoint = 1, n_points_extra_final_grid
+        tmp(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j)
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  allocate(tmp_grad1_u12(n_points_extra_final_grid,n_blocks,4))
+
+  tc = 0.d0
+
+  do i_pass = 1, n_pass
+    ii = (i_pass-1)*n_blocks + 1
+
+    call wall_time(tc1)
+ !$OMP PARALLEL                   &
+ !$OMP DEFAULT (NONE)             &
+ !$OMP PRIVATE (i_blocks, ipoint) &
+ !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12)
+ !$OMP DO
+    do i_blocks = 1, n_blocks
+      ipoint = ii - 1 + i_blocks ! r1
+      call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_blocks,1), tmp_grad1_u12(1,i_blocks,2), tmp_grad1_u12(1,i_blocks,3), tmp_grad1_u12(1,i_blocks,4))
+    enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+    call wall_time(tc2)
+    tc = tc + tc2 - tc1
+
+    do m = 1, 4
+      call dgemm( "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0                     &
+                , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid &
+                , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num)
+    enddo
+  enddo
+
+  deallocate(tmp_grad1_u12)
+
+  if(n_rest .gt. 0) then
+
+    allocate(tmp_grad1_u12(n_points_extra_final_grid,n_rest,4))
+
+    ii = n_pass*n_blocks + 1
+
+    call wall_time(tc1)
+ !$OMP PARALLEL                 &
+ !$OMP DEFAULT (NONE)           &
+ !$OMP PRIVATE (i_rest, ipoint) &
+ !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12)
+ !$OMP DO
+    do i_rest = 1, n_rest
+      ipoint = ii - 1 + i_rest ! r1
+      call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_rest,1), tmp_grad1_u12(1,i_rest,2), tmp_grad1_u12(1,i_rest,3), tmp_grad1_u12(1,i_rest,4))
+    enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+    call wall_time(tc2)
+    tc = tc + tc2 - tc1
+
+    do m = 1, 4
+      call dgemm( "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0                       &
+                , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid &
+                , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num)
+    enddo
+
+    deallocate(tmp_grad1_u12)
+  endif
+
+  deallocate(tmp)
+
+  call wall_time(time1)
+  print*, ' wall time for int2_grad1_u12_ao (min) = ', (time1-time0) / 60.d0
+  print*, ' wall time Jastrow derivatives (min) = ', tc / 60.d0
+  call print_memory_usage()
+
+  ! ---
+  ! ---
+  ! ---
+
+  allocate(tc_int_2e_ao(ao_num,ao_num,ao_num,ao_num))
+
+  call wall_time(time1)
+
+  allocate(c_mat(n_points_final_grid,ao_num,ao_num))
+ !$OMP PARALLEL               &
+ !$OMP DEFAULT (NONE)         &
+ !$OMP PRIVATE (i, k, ipoint) &
+ !$OMP SHARED (aos_in_r_array_transp, c_mat, ao_num, n_points_final_grid, final_weight_at_r_vector)
+ !$OMP DO SCHEDULE (static)
+  do i = 1, ao_num
+    do k = 1, ao_num
+      do ipoint = 1, n_points_final_grid
+        c_mat(ipoint,k,i) = final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,k)
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+  call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0            &
+            , int2_grad1_u12_ao(1,1,1,4), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
+            , 0.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num)
+  deallocate(c_mat)
+
+  call wall_time(time2)
+  print*, ' wall time of Hermitian part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0
+  call print_memory_usage()
+
+  ! ---
+
+  call wall_time(time1)
+
+  allocate(c_mat(n_points_final_grid,ao_num,ao_num))
+  do m = 1, 3
+ !$OMP PARALLEL                                                              &
+ !$OMP DEFAULT (NONE)                                                        &
+ !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r)                       &
+ !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, c_mat, &
+ !$OMP         ao_num, n_points_final_grid, final_weight_at_r_vector, m)
+ !$OMP DO SCHEDULE (static)
+    do i = 1, ao_num
+      do k = 1, ao_num
+        do ipoint = 1, n_points_final_grid
+
+          weight1 = 0.5d0 * final_weight_at_r_vector(ipoint)
+          ao_i_r  = aos_in_r_array_transp(ipoint,i)
+          ao_k_r  = aos_in_r_array_transp(ipoint,k)
+
+          c_mat(ipoint,k,i) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,m) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,m))
+        enddo
+      enddo
+    enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+    call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, -1.d0           &
+              , int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
+              , 1.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num)
+  enddo
+  deallocate(c_mat)
+
+  call wall_time(time2)
+  print*, ' wall time of non-Hermitian part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0
+  call print_memory_usage()
+
+  !
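! The two grid contractions above are quadratures cast as single GEMMs: as read
! from the index layout (a sketch of my reading, not authoritative),
!
!   tc_int_2e_ao(k,i,l,j) = \sum_ipoint int2_grad1_u12_ao(k,i,ipoint,4) * c_mat(ipoint,l,j)
!
! with c_mat(ipoint,l,j) = w(ipoint) * \chi_l(r_ipoint) * \chi_j(r_ipoint). A
! minimal, self-contained sketch of this quadrature-as-GEMM pattern (toy sizes,
! hypothetical names; link against a BLAS library):

program quadrature_gemm_sketch
  implicit none
  integer, parameter :: nbf = 2, npts = 5       ! toy basis / grid sizes
  double precision   :: imat(nbf*nbf, npts)     ! I((k,i), ipoint)
  double precision   :: cmat(npts, nbf*nbf)     ! w_p * chi_l(r_p) * chi_j(r_p)
  double precision   :: vmat(nbf*nbf, nbf*nbf)  ! out((k,i),(l,j))
  imat = 1.d0
  cmat = 0.5d0
  call dgemm('N', 'N', nbf*nbf, nbf*nbf, npts, 1.d0, &
             imat, nbf*nbf, cmat, npts, 0.d0, vmat, nbf*nbf)
  print *, vmat(1,1)   ! each entry = sum over the 5 grid points = 2.5
end program quadrature_gemm_sketch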
--- + + call wall_time(time1) + + call sum_A_At(tc_int_2e_ao(1,1,1,1), ao_num*ao_num) + + call wall_time(time2) + print*, ' lower- and upper-triangle of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0 + call print_memory_usage() + + ! --- + + call wall_time(time1) + + PROVIDE ao_integrals_map + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP SHARED(ao_num, tc_int_2e_ao, ao_integrals_map) & + !$OMP PRIVATE(i, j, k, l) + !$OMP DO COLLAPSE(3) + do j = 1, ao_num + do l = 1, ao_num + do i = 1, ao_num + do k = 1, ao_num + ! < 1:i, 2:j | 1:k, 2:l > + tc_int_2e_ao(k,i,l,j) = tc_int_2e_ao(k,i,l,j) + get_ao_two_e_integral(i, j, k, l, ao_integrals_map) + enddo + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + call wall_time(time2) + print*, ' wall time of Coulomb part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0 + call print_memory_usage() + + ! --- + + print*, ' Writing int2_grad1_u12_ao in ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao' + open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="write") + call ezfio_set_work_empty(.False.) + write(11) int2_grad1_u12_ao(:,:,:,1:3) + close(11) + + print*, ' Saving tc_int_2e_ao in ', trim(ezfio_filename) // '/work/ao_two_e_tc_tot' + open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/ao_two_e_tc_tot', action="write") + call ezfio_set_work_empty(.False.) + do i = 1, ao_num + write(11) tc_int_2e_ao(:,:,:,i) + enddo + close(11) + + ! ---- + + deallocate(int2_grad1_u12_ao) + deallocate(tc_int_2e_ao) + + call wall_time(time2) + print*, ' wall time for tc_int_2e_ao (min) = ', (time2-time1) / 60.d0 + call print_memory_usage() + + ! --- + + call wall_time(time1) + print*, ' wall time for TC-integrals (min) = ', (time1-time0) / 60.d0 + + return +end + +! --- + diff --git a/plugins/local/tc_int/int2_grad1_u12.irp.f b/plugins/local/tc_int/int2_grad1_u12.irp.f deleted file mode 100644 index 0cf0d775..00000000 --- a/plugins/local/tc_int/int2_grad1_u12.irp.f +++ /dev/null @@ -1,134 +0,0 @@ - -! --- - -subroutine provide_int2_grad1_u12_ao() - - implicit none - integer :: ipoint, i, j, m, jpoint - integer :: n_blocks, n_rest, n_pass - integer :: i_blocks, i_rest, i_pass, ii - double precision :: time0, time1 - double precision :: mem, n_double - double precision, allocatable :: tmp(:,:,:) - double precision, allocatable :: tmp_grad1_u12(:,:,:) - double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:) - - PROVIDE final_weight_at_r_vector_extra aos_in_r_array_extra - - print*, ' start provide_int2_grad1_u12_ao ...' - call wall_time(time0) - - - ! int2_grad1_u12_ao(i,j,ipoint,1) = \int dr2 [\grad1 u(r1,r2)]_x1 \chi_i(r2) \chi_j(r2) - ! int2_grad1_u12_ao(i,j,ipoint,2) = \int dr2 [\grad1 u(r1,r2)]_y1 \chi_i(r2) \chi_j(r2) - ! int2_grad1_u12_ao(i,j,ipoint,3) = \int dr2 [\grad1 u(r1,r2)]_z1 \chi_i(r2) \chi_j(r2) - ! 
int2_grad1_u12_ao(i,j,ipoint,4) = -(1/2) \int dr2 [\grad1 u(r1,r2)]^2 \chi_i(r2) \chi_j(r2) - allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,4)) - - - - call total_memory(mem) - mem = max(1.d0, qp_max_mem - mem) - n_double = mem * 1.d8 - n_blocks = int(min(n_double / (n_points_extra_final_grid * 4.d0), 1.d0*n_points_final_grid)) - n_rest = int(mod(n_points_final_grid, n_blocks)) - n_pass = int((n_points_final_grid - n_rest) / n_blocks) - - call write_int(6, n_pass, 'Number of passes') - call write_int(6, n_blocks, 'Size of the blocks') - call write_int(6, n_rest, 'Size of the last block') - - - allocate(tmp(n_points_extra_final_grid,ao_num,ao_num)) - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (j, i, jpoint) & - !$OMP SHARED (tmp, ao_num, n_points_extra_final_grid, final_weight_at_r_vector_extra, aos_in_r_array_extra_transp) - !$OMP DO SCHEDULE (static) - do j = 1, ao_num - do i = 1, ao_num - do jpoint = 1, n_points_extra_final_grid - tmp(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j) - enddo - enddo - enddo - !$OMP END DO - !$OMP END PARALLEL - - - allocate(tmp_grad1_u12(n_points_extra_final_grid,n_blocks,4)) - - do i_pass = 1, n_pass - ii = (i_pass-1)*n_blocks + 1 - - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (i_blocks, ipoint) & - !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12) - !$OMP DO - do i_blocks = 1, n_blocks - ipoint = ii - 1 + i_blocks ! r1 - call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_blocks,1), tmp_grad1_u12(1,i_blocks,2), tmp_grad1_u12(1,i_blocks,3), tmp_grad1_u12(1,i_blocks,4)) - enddo - !$OMP END DO - !$OMP END PARALLEL - - do m = 1, 4 - call dgemm( "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0 & - , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid & - , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) - enddo - enddo - - deallocate(tmp_grad1_u12) - - - if(n_rest .gt. 0) then - - allocate(tmp_grad1_u12(n_points_extra_final_grid,n_rest,4)) - - ii = n_pass*n_blocks + 1 - - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (i_rest, ipoint) & - !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12) - !$OMP DO - do i_rest = 1, n_rest - ipoint = ii - 1 + i_rest ! r1 - call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_rest,1), tmp_grad1_u12(1,i_rest,2), tmp_grad1_u12(1,i_rest,3), tmp_grad1_u12(1,i_rest,4)) - enddo - !$OMP END DO - !$OMP END PARALLEL - - do m = 1, 4 - call dgemm( "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0 & - , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid & - , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) - enddo - - deallocate(tmp_grad1_u12) - endif - - deallocate(tmp) - - - ! --- - - print*, ' Writing int2_grad1_u12_ao in ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao' - open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="write") - call ezfio_set_work_empty(.False.) - write(11) int2_grad1_u12_ao(:,:,:,1:3) - close(11) - - deallocate(int2_grad1_u12_ao) - - call wall_time(time1) - print*, ' wall time for provide_int2_grad1_u12_ao (min) = ', (time1-time0) / 60.d0 - call print_memory_usage() - -end - -! 
--- - - diff --git a/plugins/local/tc_int/jast_grad_2e.irp.f b/plugins/local/tc_int/jast_grad_2e.irp.f deleted file mode 100644 index b18b9d62..00000000 --- a/plugins/local/tc_int/jast_grad_2e.irp.f +++ /dev/null @@ -1,102 +0,0 @@ - -! --- - -subroutine get_grad1_u12_r1_2e(r1, n_grid2, gradx, grady, gradz) - - BEGIN_DOC - ! - ! d/dx1 j_2e(1,2) - ! d/dy1 j_2e(1,2) - ! d/dz1 j_2e(1,2) - ! - END_DOC - - include 'constants.include.F' - - implicit none - integer , intent(in) :: n_grid2 - double precision, intent(in) :: r1(3) - double precision, intent(out) :: gradx(n_grid2) - double precision, intent(out) :: grady(n_grid2) - double precision, intent(out) :: gradz(n_grid2) - - integer :: jpoint - integer :: i_nucl, p, mpA, npA, opA - integer :: powmax1, powmax, powmax2 - double precision :: r2(3) - double precision :: tmp, tmp1, tmp2 - double precision :: rn(3), f1A, grad1_f1A(3), f2A, grad2_f2A(3), g12, grad1_g12(3) - double precision, allocatable :: f1A_power(:), f2A_power(:), double_p(:), g12_power(:) - - - powmax1 = max(maxval(jBH_m), maxval(jBH_n)) - powmax2 = maxval(jBH_o) - powmax = max(powmax1, powmax2) - - allocate(f1A_power(-1:powmax), f2A_power(-1:powmax), g12_power(-1:powmax), double_p(0:powmax)) - - do p = 0, powmax - double_p(p) = dble(p) - enddo - - f1A_power(-1) = 0.d0 - f2A_power(-1) = 0.d0 - g12_power(-1) = 0.d0 - - f1A_power(0) = 1.d0 - f2A_power(0) = 1.d0 - g12_power(0) = 1.d0 - - do jpoint = 1, n_points_extra_final_grid ! r2 - - r2(1) = final_grid_points_extra(1,jpoint) - r2(2) = final_grid_points_extra(2,jpoint) - r2(3) = final_grid_points_extra(3,jpoint) - - gradx(jpoint) = 0.d0 - grady(jpoint) = 0.d0 - gradz(jpoint) = 0.d0 - do i_nucl = 1, nucl_num - - rn(1) = nucl_coord(i_nucl,1) - rn(2) = nucl_coord(i_nucl,2) - rn(3) = nucl_coord(i_nucl,3) - - call jBH_elem_fct_grad(jBH_en(i_nucl), r1, rn, f1A, grad1_f1A) - call jBH_elem_fct_grad(jBH_en(i_nucl), r2, rn, f2A, grad2_f2A) - call jBH_elem_fct_grad(jBH_ee(i_nucl), r1, r2, g12, grad1_g12) - - ! Compute powers of f1A and f2A - do p = 1, powmax1 - f1A_power(p) = f1A_power(p-1) * f1A - f2A_power(p) = f2A_power(p-1) * f2A - enddo - do p = 1, powmax2 - g12_power(p) = g12_power(p-1) * g12 - enddo - - do p = 1, jBH_size - mpA = jBH_m(p,i_nucl) - npA = jBH_n(p,i_nucl) - opA = jBH_o(p,i_nucl) - tmp = jBH_c(p,i_nucl) - if(mpA .eq. npA) then - tmp = tmp * 0.5d0 - endif - - tmp1 = double_p(mpA) * f1A_power(mpA-1) * f2A_power(npA) + double_p(npA) * f1A_power(npA-1) * f2A_power(mpA) - tmp1 = tmp1 * g12_power(opA) * tmp - tmp2 = double_p(opA) * g12_power(opA-1) * (f1A_power(mpA) * f2A_power(npA) + f1A_power(npA) * f2A_power(mpA)) * tmp - - gradx(jpoint) = gradx(jpoint) + tmp1 * grad1_f1A(1) + tmp2 * grad1_g12(1) - grady(jpoint) = grady(jpoint) + tmp1 * grad1_f1A(2) + tmp2 * grad1_g12(2) - gradz(jpoint) = gradz(jpoint) + tmp1 * grad1_f1A(3) + tmp2 * grad1_g12(3) - enddo ! p - enddo ! i_nucl - enddo ! jpoint - - return -end - -! --- - diff --git a/plugins/local/tc_int/jast_grad_full.irp.f b/plugins/local/tc_int/jast_grad_full.irp.f index f63ee3e4..78ed1edf 100644 --- a/plugins/local/tc_int/jast_grad_full.irp.f +++ b/plugins/local/tc_int/jast_grad_full.irp.f @@ -16,31 +16,26 @@ subroutine get_grad1_u12_for_tc(ipoint, n_grid2, resx, resy, resz, res) ! 
END_DOC + include 'constants.include.F' + implicit none integer, intent(in) :: ipoint, n_grid2 double precision, intent(out) :: resx(n_grid2), resy(n_grid2), resz(n_grid2), res(n_grid2) - integer :: jpoint - double precision :: env_r1, tmp - double precision :: grad1_env(3), r1(3) - double precision, allocatable :: env_r2(:) - double precision, allocatable :: u2b_r12(:), gradx1_u2b(:), grady1_u2b(:), gradz1_u2b(:) - double precision, allocatable :: u2b_mu(:), gradx1_mu(:), grady1_mu(:), gradz1_mu(:) - double precision, allocatable :: u2b_nu(:), gradx1_nu(:), grady1_nu(:), gradz1_nu(:) - double precision, external :: env_nucl + integer :: jpoint, i_nucl, p, mpA, npA, opA, pp + integer :: powmax1, powmax, powmax2 + double precision :: r1(3), r2(3) + double precision :: tmp, tmp1, tmp2, tmp11, tmp22 + double precision :: rn(3), f1A, grad1_f1A(3), f2A, grad2_f2A(3), g12, grad1_g12(3) + double precision, allocatable :: f1A_power(:), f2A_power(:), double_p(:), g12_power(:) r1(1) = final_grid_points(1,ipoint) r1(2) = final_grid_points(2,ipoint) r1(3) = final_grid_points(3,ipoint) + call grad1_j12_r1_seq(r1, n_grid2, resx, resy, resz) - ! j2e_type .eq. "Boys_Handy" - ! env_type .eq. "None" - ! j1e_type .eq "None" - - call get_grad1_u12_r1_2e(r1, n_grid2, resx(1), resy(1), resz(1)) - - do jpoint = 1, n_points_extra_final_grid + do jpoint = 1, n_grid2 ! r2 res(jpoint) = -0.5d0 * (resx(jpoint) * resx(jpoint) + resy(jpoint) * resy(jpoint) + resz(jpoint) * resz(jpoint)) enddo @@ -49,3 +44,91 @@ end ! --- +subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz) + + include 'constants.include.F' + + implicit none + integer , intent(in) :: n_grid2 + double precision, intent(in) :: r1(3) + double precision, intent(out) :: gradx(n_grid2) + double precision, intent(out) :: grady(n_grid2) + double precision, intent(out) :: gradz(n_grid2) + + integer :: jpoint, i_nucl, p, mpA, npA, opA + double precision :: r2(3) + double precision :: dx, dy, dz, r12, tmp + double precision :: rn(3), f1A, grad1_f1A(3), f2A, grad2_f2A(3), g12, grad1_g12(3) + double precision :: tmp1, tmp2 + integer :: powmax1, powmax, powmax2 + double precision, allocatable :: f1A_power(:), f2A_power(:), double_p(:), g12_power(:) + + powmax1 = max(maxval(jBH_m), maxval(jBH_n)) + powmax2 = maxval(jBH_o) + powmax = max(powmax1, powmax2) + + allocate(f1A_power(-1:powmax), f2A_power(-1:powmax), g12_power(-1:powmax), double_p(0:powmax)) + + do p = 0, powmax + double_p(p) = dble(p) + enddo + + f1A_power(-1) = 0.d0 + f2A_power(-1) = 0.d0 + g12_power(-1) = 0.d0 + + f1A_power(0) = 1.d0 + f2A_power(0) = 1.d0 + g12_power(0) = 1.d0 + + do jpoint = 1, n_grid2 ! r2 + + r2(1) = final_grid_points_extra(1,jpoint) + r2(2) = final_grid_points_extra(2,jpoint) + r2(3) = final_grid_points_extra(3,jpoint) + + gradx(jpoint) = 0.d0 + grady(jpoint) = 0.d0 + gradz(jpoint) = 0.d0 + do i_nucl = 1, nucl_num + + rn(1) = nucl_coord(i_nucl,1) + rn(2) = nucl_coord(i_nucl,2) + rn(3) = nucl_coord(i_nucl,3) + + call jBH_elem_fct_grad(jBH_en(i_nucl), r1, rn, f1A, grad1_f1A) + call jBH_elem_fct_grad(jBH_en(i_nucl), r2, rn, f2A, grad2_f2A) + call jBH_elem_fct_grad(jBH_ee(i_nucl), r1, r2, g12, grad1_g12) + + ! Compute powers of f1A and f2A + do p = 1, powmax1 + f1A_power(p) = f1A_power(p-1) * f1A + f2A_power(p) = f2A_power(p-1) * f2A + enddo + do p = 1, powmax2 + g12_power(p) = g12_power(p-1) * g12 + enddo + + do p = 1, jBH_size + mpA = jBH_m(p,i_nucl) + npA = jBH_n(p,i_nucl) + opA = jBH_o(p,i_nucl) + tmp = jBH_c(p,i_nucl) + if(mpA .eq. 
npA) then + tmp = tmp * 0.5d0 + endif + + tmp1 = double_p(mpA) * f1A_power(mpA-1) * f2A_power(npA) + double_p(npA) * f1A_power(npA-1) * f2A_power(mpA) + tmp1 = tmp1 * g12_power(opA) * tmp + tmp2 = double_p(opA) * g12_power(opA-1) * (f1A_power(mpA) * f2A_power(npA) + f1A_power(npA) * f2A_power(mpA)) * tmp + + gradx(jpoint) = gradx(jpoint) + tmp1 * grad1_f1A(1) + tmp2 * grad1_g12(1) + grady(jpoint) = grady(jpoint) + tmp1 * grad1_f1A(2) + tmp2 * grad1_g12(2) + gradz(jpoint) = gradz(jpoint) + tmp1 * grad1_f1A(3) + tmp2 * grad1_g12(3) + enddo ! p + enddo ! i_nucl + enddo ! jpoint + + return +end + diff --git a/plugins/local/tc_int/write_tc_int.irp.f b/plugins/local/tc_int/write_tc_int.irp.f index ebdce6f2..9f25a6fd 100644 --- a/plugins/local/tc_int/write_tc_int.irp.f +++ b/plugins/local/tc_int/write_tc_int.irp.f @@ -14,19 +14,17 @@ program write_tc_int my_n_pt_a_grid = tc_grid1_a touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid + my_extra_grid_becke = .True. + PROVIDE tc_grid2_a tc_grid2_r + my_n_pt_r_extra_grid = tc_grid2_r + my_n_pt_a_extra_grid = tc_grid2_a + touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid + call write_int(6, my_n_pt_r_grid, 'radial external grid over') call write_int(6, my_n_pt_a_grid, 'angular external grid over') - if(tc_integ_type .eq. "numeric") then - my_extra_grid_becke = .True. - PROVIDE tc_grid2_a tc_grid2_r - my_n_pt_r_extra_grid = tc_grid2_r - my_n_pt_a_extra_grid = tc_grid2_a - touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid - - call write_int(6, my_n_pt_r_extra_grid, 'radial internal grid over') - call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over') - endif + call write_int(6, my_n_pt_r_extra_grid, 'radial internal grid over') + call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over') call main() From a9d2f0e188cdc88e1cfe1387de4f1c118bb17a5d Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Wed, 26 Jun 2024 17:55:56 +0200 Subject: [PATCH 12/38] Working on TC ints. Not well tested --- plugins/local/tc_int/jast_grad_full.irp.f | 127 ++++++++++++++++++++-- plugins/local/tc_int/jast_utils_bh.irp.f | 22 ++-- 2 files changed, 134 insertions(+), 15 deletions(-) diff --git a/plugins/local/tc_int/jast_grad_full.irp.f b/plugins/local/tc_int/jast_grad_full.irp.f index 78ed1edf..599d3779 100644 --- a/plugins/local/tc_int/jast_grad_full.irp.f +++ b/plugins/local/tc_int/jast_grad_full.irp.f @@ -4,7 +4,7 @@ subroutine get_grad1_u12_for_tc(ipoint, n_grid2, resx, resy, resz, res) BEGIN_DOC - ! + ! ! resx(ipoint) = [grad1 u(r1,r2)]_x1 ! resy(ipoint) = [grad1 u(r1,r2)]_y1 ! resz(ipoint) = [grad1 u(r1,r2)]_z1 @@ -59,7 +59,7 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz) double precision :: r2(3) double precision :: dx, dy, dz, r12, tmp double precision :: rn(3), f1A, grad1_f1A(3), f2A, grad2_f2A(3), g12, grad1_g12(3) - double precision :: tmp1, tmp2 + double precision :: tmp1, tmp2, dist integer :: powmax1, powmax, powmax2 double precision, allocatable :: f1A_power(:), f2A_power(:), double_p(:), g12_power(:) @@ -90,30 +90,105 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz) gradx(jpoint) = 0.d0 grady(jpoint) = 0.d0 gradz(jpoint) = 0.d0 + + call jBH_elem_fct_grad_alpha1(r1, r2, g12, grad1_g12) + +! dist = (r1(1) - r2(1)) * (r1(1) - r2(1)) & +! + (r1(2) - r2(2)) * (r1(2) - r2(2)) & +! + (r1(3) - r2(3)) * (r1(3) - r2(3)) +! +! if(dist .ge. 1d-15) then +! dist = dsqrt( dist ) +! +! tmp1 = 1.d0 / (1.d0 + dist) +! +! g12 = dist * tmp1 +! tmp2 = tmp1 * tmp1 / dist +! 
grad1_g12(1) = tmp2 * (r1(1) - r2(1)) +! grad1_g12(2) = tmp2 * (r1(2) - r2(2)) +! grad1_g12(3) = tmp2 * (r1(3) - r2(3)) +! +! else +! +! grad1_g12(1) = 0.d0 +! grad1_g12(2) = 0.d0 +! grad1_g12(3) = 0.d0 +! g12 = 0.d0 +! +! endif +! + do p = 1, powmax2 + g12_power(p) = g12_power(p-1) * g12 + enddo + do i_nucl = 1, nucl_num rn(1) = nucl_coord(i_nucl,1) rn(2) = nucl_coord(i_nucl,2) rn(3) = nucl_coord(i_nucl,3) - call jBH_elem_fct_grad(jBH_en(i_nucl), r1, rn, f1A, grad1_f1A) - call jBH_elem_fct_grad(jBH_en(i_nucl), r2, rn, f2A, grad2_f2A) - call jBH_elem_fct_grad(jBH_ee(i_nucl), r1, r2, g12, grad1_g12) + call jBH_elem_fct_grad_alpha1(r1, rn, f1A, grad1_f1A) +! dist = (r1(1) - rn(1)) * (r1(1) - rn(1)) & +! + (r1(2) - rn(2)) * (r1(2) - rn(2)) & +! + (r1(3) - rn(3)) * (r1(3) - rn(3)) +! if (dist > 1.d-15) then +! dist = dsqrt( dist ) +! +! tmp1 = 1.d0 / (1.d0 + dist) +! +! f1A = dist * tmp1 +! tmp2 = tmp1 * tmp1 / dist +! grad1_f1A(1) = tmp2 * (r1(1) - rn(1)) +! grad1_f1A(2) = tmp2 * (r1(2) - rn(2)) +! grad1_f1A(3) = tmp2 * (r1(3) - rn(3)) +! +! else +! +! grad1_f1A(1) = 0.d0 +! grad1_f1A(2) = 0.d0 +! grad1_f1A(3) = 0.d0 +! f1A = 0.d0 +! +! endif + + call jBH_elem_fct_grad_alpha1(r2, rn, f2A, grad2_f2A) +! dist = (r2(1) - rn(1)) * (r2(1) - rn(1)) & +! + (r2(2) - rn(2)) * (r2(2) - rn(2)) & +! + (r2(3) - rn(3)) * (r2(3) - rn(3)) +! +! if (dist > 1.d-15) then +! dist = dsqrt( dist ) +! +! tmp1 = 1.d0 / (1.d0 + dist) +! +! f2A = dist * tmp1 +! tmp2 = tmp1 * tmp1 / dist +! grad2_f2A(1) = tmp2 * (r2(1) - rn(1)) +! grad2_f2A(2) = tmp2 * (r2(2) - rn(2)) +! grad2_f2A(3) = tmp2 * (r2(3) - rn(3)) +! +! else +! +! grad2_f2A(1) = 0.d0 +! grad2_f2A(2) = 0.d0 +! grad2_f2A(3) = 0.d0 +! f2A = 0.d0 +! +! endif ! Compute powers of f1A and f2A do p = 1, powmax1 f1A_power(p) = f1A_power(p-1) * f1A f2A_power(p) = f2A_power(p-1) * f2A enddo - do p = 1, powmax2 - g12_power(p) = g12_power(p-1) * g12 - enddo do p = 1, jBH_size mpA = jBH_m(p,i_nucl) npA = jBH_n(p,i_nucl) opA = jBH_o(p,i_nucl) tmp = jBH_c(p,i_nucl) +! if (dabs(tmp) <= 1.d-10) cycle +! if(mpA .eq. npA) then tmp = tmp * 0.5d0 endif @@ -132,3 +207,39 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz) return end +subroutine jBH_elem_fct_grad_alpha1(r1, r2, fct, grad1_fct) + + implicit none + double precision, intent(in) :: r1(3), r2(3) + double precision, intent(out) :: fct, grad1_fct(3) + double precision :: dist, tmp1, tmp2 + + dist = (r1(1) - r2(1)) * (r1(1) - r2(1)) & + + (r1(2) - r2(2)) * (r1(2) - r2(2)) & + + (r1(3) - r2(3)) * (r1(3) - r2(3)) + + + if(dist .ge. 1d-15) then + dist = dsqrt( dist ) + + tmp1 = 1.d0 / (1.d0 + dist) + + fct = dist * tmp1 + tmp2 = tmp1 * tmp1 / dist + grad1_fct(1) = tmp2 * (r1(1) - r2(1)) + grad1_fct(2) = tmp2 * (r1(2) - r2(2)) + grad1_fct(3) = tmp2 * (r1(3) - r2(3)) + + else + + grad1_fct(1) = 0.d0 + grad1_fct(2) = 0.d0 + grad1_fct(3) = 0.d0 + fct = 0.d0 + + endif + + return +end + +! --- diff --git a/plugins/local/tc_int/jast_utils_bh.irp.f b/plugins/local/tc_int/jast_utils_bh.irp.f index 750ce90b..200bc5ff 100644 --- a/plugins/local/tc_int/jast_utils_bh.irp.f +++ b/plugins/local/tc_int/jast_utils_bh.irp.f @@ -1,35 +1,43 @@ ! 
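! Both jBH_elem_fct_grad_alpha1 above and the rewritten jBH_elem_fct_grad below
! evaluate f(r12) = alpha*r12 / (1 + alpha*r12) (alpha = 1 hard-coded in the
! specialised version) and its gradient with respect to r1. Since
! f'(r) = alpha / (1 + alpha*r)^2 and grad1 r12 = (r1 - r2)/r12,
!
!   grad1 f = [alpha / (1 + alpha*r12)^2] * (r1 - r2) / r12
!
! which is the tmp2 prefactor in both routines. The rewrite below also trades
! the final division by r12 for a multiplication by a precomputed inverse
! (dist_inv), obtaining r12 itself as dist_inv * dist = sqrt(dist).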
--- + + subroutine jBH_elem_fct_grad(alpha, r1, r2, fct, grad1_fct) implicit none double precision, intent(in) :: alpha, r1(3), r2(3) double precision, intent(out) :: fct, grad1_fct(3) - double precision :: dist, tmp1, tmp2 + double precision :: dist, tmp1, tmp2, dist_inv - dist = dsqrt( (r1(1) - r2(1)) * (r1(1) - r2(1)) & - + (r1(2) - r2(2)) * (r1(2) - r2(2)) & - + (r1(3) - r2(3)) * (r1(3) - r2(3)) ) + dist = (r1(1) - r2(1)) * (r1(1) - r2(1)) & + + (r1(2) - r2(2)) * (r1(2) - r2(2)) & + + (r1(3) - r2(3)) * (r1(3) - r2(3)) - if(dist .ge. 1d-10) then + if(dist .ge. 1d-15) then + dist_inv = 1.d0/dsqrt( dist ) + dist = dist_inv * dist + tmp1 = 1.d0 / (1.d0 + alpha * dist) fct = alpha * dist * tmp1 - tmp2 = alpha * tmp1 * tmp1 / dist + tmp2 = alpha * tmp1 * tmp1 * dist_inv grad1_fct(1) = tmp2 * (r1(1) - r2(1)) grad1_fct(2) = tmp2 * (r1(2) - r2(2)) grad1_fct(3) = tmp2 * (r1(3) - r2(3)) + else + grad1_fct(1) = 0.d0 grad1_fct(2) = 0.d0 grad1_fct(3) = 0.d0 fct = 0.d0 + endif return -end +end ! --- From 5d80cb7b2dd53bdd9eb713e507912e6fce3cadd7 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Thu, 27 Jun 2024 12:06:06 +0200 Subject: [PATCH 13/38] Separated gpu and gpu_arch --- configure | 8 ++-- src/ccsd/NEED | 1 + src/ccsd/ccsd_space_orb_sub.irp.f | 14 +++++-- src/gpu/NEED | 1 + src/gpu/README.rst | 6 +++ src/{gpu_x86 => gpu}/gpu.h | 0 src/{gpu_x86 => gpu}/gpu_module.F90 | 59 +++++++++++++++-------------- src/gpu_x86/gpu.c | 2 +- 8 files changed, 54 insertions(+), 37 deletions(-) create mode 100644 src/gpu/NEED create mode 100644 src/gpu/README.rst rename src/{gpu_x86 => gpu}/gpu.h (100%) rename src/{gpu_x86 => gpu}/gpu_module.F90 (74%) diff --git a/configure b/configure index 014275eb..db158966 100755 --- a/configure +++ b/configure @@ -115,19 +115,19 @@ while getopts "d:c:i:g:h" c ; do done # Handle GPU acceleration -rm -f ${QP_ROOT}/src/gpu +rm -f ${QP_ROOT}/src/gpu_arch case "$GPU" in amd) # Nvidia echo "Activating AMD GPU acceleration" - ln -s ${QP_ROOT}/src/gpu_amd ${QP_ROOT}/src/gpu + ln -s ${QP_ROOT}/src/gpu_amd ${QP_ROOT}/src/gpu_arch ;; nvidia) # Nvidia echo "Activating Nvidia GPU acceleration" - ln -s ${QP_ROOT}/src/gpu_nvidia ${QP_ROOT}/src/gpu + ln -s ${QP_ROOT}/src/gpu_nvidia ${QP_ROOT}/src/gpu_arch ;; *) # No Acceleration echo "Disabling GPU acceleration" - ln -s ${QP_ROOT}/src/gpu_x86 ${QP_ROOT}/src/gpu + ln -s ${QP_ROOT}/src/gpu_x86 ${QP_ROOT}/src/gpu_arch ;; esac diff --git a/src/ccsd/NEED b/src/ccsd/NEED index e6e6bc59..8298f28e 100644 --- a/src/ccsd/NEED +++ b/src/ccsd/NEED @@ -1,2 +1,3 @@ +gpu hartree_fock utils_cc diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 9d4ae7f9..84aab08a 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -1,4 +1,5 @@ subroutine run_ccsd_space_orb + use gpu implicit none @@ -11,7 +12,7 @@ subroutine run_ccsd_space_orb double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:), tau(:,:,:,:), tau_x(:,:,:,:) double precision, allocatable :: t1(:,:), r1(:,:) - double precision, allocatable :: H_oo(:,:), H_vv(:,:), H_vo(:,:) + double precision, pointer :: H_oo, H_vv, H_vo double precision, allocatable :: all_err(:,:), all_t(:,:) integer, allocatable :: list_occ(:), list_vir(:) @@ -55,7 +56,10 @@ subroutine run_ccsd_space_orb allocate(tau(nO,nO,nV,nV)) allocate(tau_x(nO,nO,nV,nV)) allocate(t1(nO,nV), r1(nO,nV)) - allocate(H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO)) + + call gpu_allocate_double(H_oo, (/ nO, nO /) ) + call gpu_allocate_double(H_vv, (/ nV, nV /) ) + call 
gpu_allocate_double(H_vo, (/ nV, nO /) ) if (cc_update_method == 'diis') then double precision :: rss, diis_mem, extra_mem @@ -191,7 +195,11 @@ subroutine run_ccsd_space_orb deallocate(all_err,all_t) endif - deallocate(H_vv,H_oo,H_vo,r1,r2,tau) + call gpu_deallocate_double(H_oo) + call gpu_deallocate_double(H_vv) + call gpu_deallocate_double(H_vo) + + deallocate(r1,r2,tau) ! CCSD(T) double precision :: e_t, e_t_err diff --git a/src/gpu/NEED b/src/gpu/NEED new file mode 100644 index 00000000..c2af78d2 --- /dev/null +++ b/src/gpu/NEED @@ -0,0 +1 @@ +gpu_arch diff --git a/src/gpu/README.rst b/src/gpu/README.rst new file mode 100644 index 00000000..17ee28a0 --- /dev/null +++ b/src/gpu/README.rst @@ -0,0 +1,6 @@ +=== +gpu +=== + +Bindings for GPU routines (architecture independent). +Architecture-dependent files are in gpu_arch. diff --git a/src/gpu_x86/gpu.h b/src/gpu/gpu.h similarity index 100% rename from src/gpu_x86/gpu.h rename to src/gpu/gpu.h diff --git a/src/gpu_x86/gpu_module.F90 b/src/gpu/gpu_module.F90 similarity index 74% rename from src/gpu_x86/gpu_module.F90 rename to src/gpu/gpu_module.F90 index 86ba3926..f35ebc97 100644 --- a/src/gpu_x86/gpu_module.F90 +++ b/src/gpu/gpu_module.F90 @@ -1,5 +1,5 @@ module gpu - use, intrinsic :: iso_c_binding, only : c_int32_t, c_int64_t, c_double, c_size_t, c_char + use, intrinsic :: iso_c_binding implicit none interface @@ -17,7 +17,7 @@ module gpu integer(c_int64_t), value :: n end subroutine - subroutine gpu_free_c(ptr) bind(C, name='gpu_free') + subroutine gpu_deallocate_c(ptr) bind(C, name='gpu_deallocate') import type(c_ptr) :: ptr end subroutine @@ -89,53 +89,54 @@ module gpu end interface + contains + + + subroutine gpu_allocate_double(ptr, s) + implicit none + double precision, pointer, intent(inout) :: ptr + integer, intent(in) :: s(:) + type(c_ptr) :: cptr + + call gpu_allocate_c(cptr, sum(s*1_8)*8_8) + call c_f_pointer(cptr, ptr, s) + end subroutine + + subroutine gpu_deallocate_double(ptr) + implicit none + double precision, pointer, intent(inout) :: ptr + type(c_ptr) :: cptr + cptr = c_loc(ptr) + call gpu_deallocate(cptr) + NULLIFY(ptr) + end subroutine + end module -subroutine gpu_allocate_double(ptr, s) - use gpu - implicit none - double precision, pointer, intent(inout) :: ptr - integer*8, intent(in) :: s(*) - type(c_ptr) :: cptr - - call gpu_allocate_c(cptr, sum(s)*8_8) - call c_f_pointer(cptr, ptr, s) -end subroutine - -subroutine gpu_free_double(ptr) - use gpu - implicit none - double precision, pointer, intent(inout) :: ptr - type(c_ptr) :: cptr - cptr = cloc(ptr) - call gpu_free(cptr) - NULLIFY(ptr) -end subroutine - subroutine gpu_upload_double(cpu_ptr, gpu_ptr, n) use gpu implicit none double precision, intent(in) :: cpu_ptr(*) - double precision, intent(out) :: gpu_ptr(*) + double precision, intent(in) :: gpu_ptr(*) integer(c_int64_t), intent(in) :: n - call gpu_upload_c(cpu_ptr, gpu_ptr, 8_8*n) + call gpu_upload_c(c_loc(cpu_ptr), c_loc(gpu_ptr), 8_8*n) end subroutine subroutine gpu_download_double(gpu_ptr, cpu_ptr, n) use gpu implicit none double precision, intent(in) :: gpu_ptr(*) - double precision, intent(out) :: cpu_ptr(*) + double precision, intent(in) :: cpu_ptr(*) integer(c_int64_t), intent(in) :: n - call gpu_download_c(gpu_ptr, cpu_ptr, 8_8*n) + call gpu_download_c(c_loc(gpu_ptr), c_loc(cpu_ptr), 8_8*n) end subroutine subroutine gpu_copy_double(gpu_ptr_src, gpu_ptr_dest, n) use gpu implicit none double precision, intent(in) :: gpu_ptr_src(*) - double precision, intent(out) :: gpu_ptr_dest(*) + double precision, 
intent(in) :: gpu_ptr_dest(*) integer(c_int64_t), intent(in) :: n - call gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, 8_8*n) + call gpu_copy_c(c_loc(gpu_ptr_src), c_loc(gpu_ptr_dest), 8_8*n) end subroutine diff --git a/src/gpu_x86/gpu.c b/src/gpu_x86/gpu.c index 71505dbe..41ede396 100644 --- a/src/gpu_x86/gpu.c +++ b/src/gpu_x86/gpu.c @@ -25,7 +25,7 @@ void gpu_allocate(void** ptr, const int64_t n) { } } -void gpu_free(void** ptr) { +void gpu_deallocate(void** ptr) { free(*ptr); *ptr = NULL; } From 6c02ac0f0b05ea3cc16e0fde66e23c9a0de14246 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Thu, 27 Jun 2024 12:07:48 +0200 Subject: [PATCH 14/38] Separated gpu and gpu_arch --- src/gpu/gpu_module.F90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90 index f35ebc97..43754454 100644 --- a/src/gpu/gpu_module.F90 +++ b/src/gpu/gpu_module.F90 @@ -107,7 +107,7 @@ module gpu double precision, pointer, intent(inout) :: ptr type(c_ptr) :: cptr cptr = c_loc(ptr) - call gpu_deallocate(cptr) + call gpu_deallocate_c(cptr) NULLIFY(ptr) end subroutine From fa6d1419496d271a4715efc776790ce7fc152064 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Thu, 27 Jun 2024 15:45:52 +0200 Subject: [PATCH 15/38] Introducing GPU in CCSD --- src/ccsd/ccsd_space_orb_sub.irp.f | 224 +++++++----- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 12 +- src/gpu/gpu_module.F90 | 450 ++++++++++++++++++++++--- src/gpu_x86/gpu.c | 48 +-- 4 files changed, 570 insertions(+), 164 deletions(-) diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 84aab08a..455d62f7 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -10,9 +10,9 @@ subroutine run_ccsd_space_orb double precision :: uncorr_energy,energy, max_elem, max_r, max_r1, max_r2,ta,tb logical :: not_converged - double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:), tau(:,:,:,:), tau_x(:,:,:,:) - double precision, allocatable :: t1(:,:), r1(:,:) - double precision, pointer :: H_oo, H_vv, H_vo + type(gpu_double4) :: t2, r2, tau, tau_x + type(gpu_double2) :: t1, r1 + type(gpu_double2) :: H_oo, H_vv, H_vo double precision, allocatable :: all_err(:,:), all_t(:,:) integer, allocatable :: list_occ(:), list_vir(:) @@ -52,14 +52,15 @@ subroutine run_ccsd_space_orb !print*,'occ',list_occ !print*,'vir',list_vir - allocate(t2(nO,nO,nV,nV), r2(nO,nO,nV,nV)) - allocate(tau(nO,nO,nV,nV)) - allocate(tau_x(nO,nO,nV,nV)) - allocate(t1(nO,nV), r1(nO,nV)) - - call gpu_allocate_double(H_oo, (/ nO, nO /) ) - call gpu_allocate_double(H_vv, (/ nV, nV /) ) - call gpu_allocate_double(H_vo, (/ nV, nO /) ) + call gpu_allocate(t2, nO,nO,nV,nV) + call gpu_allocate(r2, nO,nO,nV,nV) + call gpu_allocate(tau, nO,nO,nV,nV) + call gpu_allocate(tau_x, nO,nO,nV,nV) + call gpu_allocate(t1, nO,nV) + call gpu_allocate(r1, nO,nV) + call gpu_allocate(H_oo, nO, nO) + call gpu_allocate(H_vo, nV, nO) + call gpu_allocate(H_vv, nV, nV) if (cc_update_method == 'diis') then double precision :: rss, diis_mem, extra_mem @@ -101,14 +102,21 @@ subroutine run_ccsd_space_orb endif ! 
Init - call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,t1) - call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,t2) - call update_tau_space(nO,nV,t1,t2,tau) + double precision, allocatable :: h_t1(:,:), h_t2(:,:,:,:) + allocate(h_t1(nO,nV), h_t2(nO,nO,nV,nV)) + + call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,h_t1) + call gpu_upload(h_t1, t1) + + call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,h_t2) + call gpu_upload(h_t2, t2) + + call update_tau_space(nO,nV,h_t1,t1,t2,tau) call update_tau_x_space(nO,nV,tau,tau_x) !print*,'hf_energy', hf_energy call det_energy(det,uncorr_energy) print*,'Det energy', uncorr_energy - call ccsd_energy_space_x(nO,nV,tau_x,t1,energy) + call ccsd_energy_space_x(nO,nV,tau_x%f,t1%f,energy) print*,'Guess energy', uncorr_energy+energy, energy nb_iter = 0 @@ -127,40 +135,38 @@ subroutine run_ccsd_space_orb if (do_ao_cholesky) then ! if (.False.) then call compute_H_oo_chol(nO,nV,tau_x,H_oo) - call compute_H_vv_chol(nO,nV,tau_x,H_vv) - call compute_H_vo_chol(nO,nV,t1,H_vo) + call compute_H_vv_chol(nO,nV,tau_x%f,H_vv%f) + call compute_H_vo_chol(nO,nV,t1%f,H_vo%f) - call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) - call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) + call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1) + call compute_r2_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r2%f,max_r2) else - call compute_H_oo(nO,nV,t1,t2,tau,H_oo) - call compute_H_vv(nO,nV,t1,t2,tau,H_vv) - call compute_H_vo(nO,nV,t1,t2,H_vo) + call compute_H_oo(nO,nV,t1%f,t2%f,tau%f,H_oo%f) + call compute_H_vv(nO,nV,t1%f,t2%f,tau%f,H_vv%f) + call compute_H_vo(nO,nV,t1%f,t2%f,H_vo%f) - call compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) - call compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) + call compute_r1_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r1%f,max_r1) + call compute_r2_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r2%f,max_r2) endif max_r = max(max_r1,max_r2) ! Update if (cc_update_method == 'diis') then - !call update_t_ccsd(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2) - !call update_t_ccsd_diis(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2) - call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1,r2,t1,t2,all_err,all_t) + call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1%f,r2%f,t1%f,t2%f,all_err,all_t) ! Standard update as T = T - Delta elseif (cc_update_method == 'none') then - call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1,t1) - call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2,t2) + call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1%f,t1%f) + call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2%f,t2%f) else print*,'Unkown cc_method_method: '//cc_update_method endif - call update_tau_space(nO,nV,t1,t2,tau) + call update_tau_space(nO,nV,t1%f,t1,t2,tau) call update_tau_x_space(nO,nV,tau,tau_x) ! 
Energy - call ccsd_energy_space_x(nO,nV,tau_x,t1,energy) + call ccsd_energy_space_x(nO,nV,tau_x%f,t1%f,energy) write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |' nb_iter = nb_iter + 1 @@ -185,8 +191,8 @@ subroutine run_ccsd_space_orb print*,'' if (write_amplitudes) then - call write_t1(nO,nV,t1) - call write_t2(nO,nV,t2) + call write_t1(nO,nV,t1%f) + call write_t2(nO,nV,t2%f) call ezfio_set_utils_cc_io_amplitudes('Read') endif @@ -195,11 +201,14 @@ subroutine run_ccsd_space_orb deallocate(all_err,all_t) endif - call gpu_deallocate_double(H_oo) - call gpu_deallocate_double(H_vv) - call gpu_deallocate_double(H_vo) + call gpu_deallocate(H_oo) + call gpu_deallocate(H_vv) + call gpu_deallocate(H_vo) - deallocate(r1,r2,tau) + call gpu_deallocate(r1) + call gpu_deallocate(r2) + call gpu_deallocate(tau) + call gpu_deallocate(tau_x) ! CCSD(T) double precision :: e_t, e_t_err @@ -207,28 +216,14 @@ subroutine run_ccsd_space_orb if (cc_par_t .and. elec_alpha_num + elec_beta_num > 2) then - ! Dumb way - !call wall_time(ta) - !call ccsd_par_t_space(nO,nV,t1,t2,e_t) - !call wall_time(tb) - !print*,'Time: ',tb-ta, ' s' - - !print*,'' - !write(*,'(A15,F18.12,A3)') ' E(CCSD(T)) = ', uncorr_energy + energy + e_t, ' Ha' - !write(*,'(A15,F18.12,A3)') ' E(T) = ', e_t, ' Ha' - !write(*,'(A15,F18.12,A3)') ' Correlation = ', energy + e_t, ' Ha' - !print*,'' - ! New e_t = uncorr_energy + energy ! For print in (T) call e_t_err = 0.d0 print*,'Computing (T) correction...' call wall_time(ta) -! call ccsd_par_t_space_v3(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v & -! ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t) - call ccsd_par_t_space_stoch(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v & + call ccsd_par_t_space_stoch(nO,nV,t1%f,t2%f,cc_space_f_o,cc_space_f_v & ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t, e_t_err) call wall_time(tb) @@ -243,7 +238,9 @@ subroutine run_ccsd_space_orb call save_energy(uncorr_energy + energy, e_t) - deallocate(t1,t2) + deallocate(h_t1, h_t2) + call gpu_deallocate(t1) + call gpu_deallocate(t2) end @@ -341,70 +338,139 @@ end ! Tau -subroutine update_tau_space(nO,nV,t1,t2,tau) - +subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau) + use gpu implicit none ! in integer, intent(in) :: nO, nV - double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV) + double precision, intent(in) :: h_t1(nO,nV) + type(gpu_double2), intent(in) :: t1 + type(gpu_double4), intent(in) :: t2 ! out - double precision, intent(out) :: tau(nO,nO,nV,nV) + type(gpu_double4) :: tau ! internal integer :: i,j,a,b +! !$OMP PARALLEL & +! !$OMP SHARED(nO,nV,tau,t2,t1,h_t1) & +! !$OMP PRIVATE(i,j,a,b) & +! !$OMP DEFAULT(NONE) +! !$OMP DO +! do b = 1, nV +! do a = 1, nV +! do j = 1, nO +! do i = 1, nO +! tau%f(i,j,a,b) = t2%f(i,j,a,b) + t1%f(i,a) * h_t1(j,b) +! enddo +! enddo +! enddo +! enddo +! !$OMP END DO +! 
!$OMP END PARALLEL + + + type(gpu_blas) :: blas + type(gpu_stream) :: stream(nV) + + call gpu_blas_create(blas) + do b=1,nV + call gpu_stream_create(stream(b)) + enddo + !$OMP PARALLEL & - !$OMP SHARED(nO,nV,tau,t2,t1) & + !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) - !$OMP DO - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b) - enddo - enddo + do j=1,nO + !$OMP DO + do b=1,nV + call gpu_set_stream(blas,stream(b)) + call gpu_dgeam_c(blas%c, 'N', 'N', nO*1_8, nV*1_8, & + 1.d0, c_loc(t2%f(1,j,1,b)), nO*nO*1_8, & + h_t1(j,b), t1%c, nO*1_8, & + c_loc(tau%f(1,j,1,b)), nO*nO*1_8) enddo + !$OMP END DO enddo - !$OMP END DO !$OMP END PARALLEL + call gpu_synchronize() + + do b=1,nV + call gpu_stream_destroy(stream(b)) + enddo + + call gpu_blas_destroy(blas) + end subroutine update_tau_x_space(nO,nV,tau,tau_x) - + use gpu implicit none ! in - integer, intent(in) :: nO, nV - double precision, intent(in) :: tau(nO,nO,nV,nV) + integer, intent(in) :: nO, nV + type(gpu_double4), intent(in) :: tau ! out - double precision, intent(out) :: tau_x(nO,nO,nV,nV) + type(gpu_double4) :: tau_x ! internal integer :: i,j,a,b +! !$OMP PARALLEL & +! !$OMP SHARED(nO,nV,tau,tau_x) & +! !$OMP PRIVATE(i,j,a,b) & +! !$OMP DEFAULT(NONE) +! !$OMP DO +! do b = 1, nV +! do a = 1, nV +! do j = 1, nO +! do i = 1, nO +! tau_x%f(i,j,a,b) = 2.d0*tau%f(i,j,a,b) - tau%f(i,j,b,a) +! enddo +! enddo +! enddo +! enddo +! !$OMP END DO +! !$OMP END PARALLEL + + type(gpu_blas) :: blas + type(gpu_stream) :: stream(nV) + + call gpu_blas_create(blas) + do a=1,nV + call gpu_stream_create(stream(a)) + enddo + !$OMP PARALLEL & - !$OMP SHARED(nO,nV,tau,tau_x) & + !$OMP SHARED(nO,nV,tau,tau_x,stream,blas) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) !$OMP DO - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - tau_x(i,j,a,b) = 2.d0*tau(i,j,a,b) - tau(i,j,b,a) - enddo - enddo + do b=1,nV + do a=1,nV + call gpu_set_stream(blas,stream(a)) + call gpu_dgeam_c(blas%c, 'N', 'N', nO*1_8, nO*1_8, & + 2.d0, c_loc(tau%f(1,1,a,b)), nO*1_8, & + -1.d0, c_loc(tau%f(1,1,b,a)), nO*1_8, & + c_loc(tau_x%f(1,1,a,b)), nO*1_8) enddo enddo !$OMP END DO !$OMP END PARALLEL + call gpu_synchronize() + + do b=1,nV + call gpu_stream_destroy(stream(b)) + enddo + + call gpu_blas_destroy(blas) + end ! R1 diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index b59dc0bb..9b161001 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -294,12 +294,12 @@ end ! 
H_oo subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo) - + use gpu implicit none integer, intent(in) :: nO,nV - double precision, intent(in) :: tau_x(nO, nO, nV, nV) - double precision, intent(out) :: H_oo(nO, nO) + type(gpu_double4), intent(in) :: tau_x + type(gpu_double2), intent(out) :: H_oo integer :: a,b,i,j,u,k @@ -315,7 +315,7 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo) do b=1,nV do j=1,nO do a=1,nV - tmp_vov(a,j,b) = tau_x(u,j,a,b) + tmp_vov(a,j,b) = tau_x%f(u,j,a,b) enddo enddo enddo @@ -328,7 +328,7 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo) !$omp do do i = 1, nO do u = 1, nO - H_oo(u,i) = cc_space_f_oo(u,i) + H_oo%f(u,i) = cc_space_f_oo(u,i) enddo enddo !$omp end do nowait @@ -336,7 +336,7 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo) !$omp end parallel call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & tau_kau, cholesky_mo_num*nV, cc_space_v_vo_chol, cholesky_mo_num*nV, & - 1.d0, H_oo, nO) + 1.d0, H_oo%f, nO) end diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90 index 43754454..51f80ac0 100644 --- a/src/gpu/gpu_module.F90 +++ b/src/gpu/gpu_module.F90 @@ -2,6 +2,52 @@ module gpu use, intrinsic :: iso_c_binding implicit none +! Data types +! ---------- + + type gpu_double1 + type(c_ptr) :: c + double precision, pointer :: f(:) + end type + + type gpu_double2 + type(c_ptr) :: c + double precision, pointer :: f(:,:) + end type + + type gpu_double3 + type(c_ptr) :: c + double precision, pointer :: f(:,:,:) + end type + + type gpu_double4 + type(c_ptr) :: c + double precision, pointer :: f(:,:,:,:) + end type + + type gpu_double5 + type(c_ptr) :: c + double precision, pointer :: f(:,:,:,:,:) + end type + + type gpu_double6 + type(c_ptr) :: c + double precision, pointer :: f(:,:,:,:,:,:) + end type + + + type gpu_blas + type(c_ptr) :: c + end type + + type gpu_stream + type(c_ptr) :: c + end type + + +! C interfaces +! 
------------ + interface integer function gpu_ndevices() bind(C) end function @@ -43,100 +89,394 @@ module gpu integer(c_int64_t), value :: n end subroutine - subroutine gpu_stream_create(stream) bind(C) + subroutine gpu_stream_create_c(stream) bind(C, name='gpu_stream_create') import type(c_ptr) :: stream end subroutine - subroutine gpu_stream_destroy(stream) bind(C) + subroutine gpu_stream_destroy_c(stream) bind(C, name='gpu_stream_destroy') import type(c_ptr) :: stream end subroutine - subroutine gpu_set_stream(handle, stream) bind(C) + subroutine gpu_set_stream_c(handle, stream) bind(C, name='gpu_set_stream') import type(c_ptr) :: handle, stream end subroutine - subroutine gpu_synchronize() + subroutine gpu_synchronize() bind(C) + import end subroutine - subroutine gpu_blas_create(handle) bind(C) + subroutine gpu_blas_create_c(handle) bind(C, name='gpu_blas_create') import type(c_ptr) :: handle end subroutine - subroutine gpu_blas_destroy(handle) bind(C) + subroutine gpu_blas_destroy_c(handle) bind(C, name='gpu_blas_destroy') import type(c_ptr) :: handle end subroutine - subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) bind(C) + subroutine gpu_ddot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_ddot') import - type(c_ptr), intent(in) :: handle - integer(c_int64_t), value :: n, incx, incy - real(c_double), intent(in) :: dx(*), dy(*) - real(c_double), intent(out) :: res + type(c_ptr), intent(in), value :: handle + integer(c_int64_t), value :: n, incx, incy + type(c_ptr), intent(in), value :: dx, dy + real(c_double), intent(out) :: res end subroutine - subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res) bind(C) + subroutine gpu_sdot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_sdot') import - type(c_ptr), intent(in) :: handle - integer(c_int64_t), value :: n, incx, incy - real(c_float), intent(in) :: dx(*), dy(*) + type(c_ptr), intent(in), value :: handle + integer(c_int64_t), value :: n, incx, incy + type(c_ptr), intent(in), value :: dx, dy real(c_float), intent(out) :: res end subroutine + subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, & + b, ldb, c, ldc) bind(C, name='gpu_dgeam') + import + type(c_ptr), intent(in), value :: handle + character(c_char), intent(in), value :: transa, transb + integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc + real(c_double), intent(in), value :: alpha, beta + type(c_ptr), value :: a, b, c + end subroutine + end interface + +! Polymorphic interfaces +! 
---------------------- + + interface gpu_allocate + procedure gpu_allocate_double1 & + ,gpu_allocate_double2 & + ,gpu_allocate_double3 & + ,gpu_allocate_double4 & + ,gpu_allocate_double5 & + ,gpu_allocate_double6 + end interface gpu_allocate + + interface gpu_deallocate + procedure gpu_deallocate_double1 & + ,gpu_deallocate_double2 & + ,gpu_deallocate_double3 & + ,gpu_deallocate_double4 & + ,gpu_deallocate_double5 & + ,gpu_deallocate_double6 + end interface gpu_deallocate + + interface gpu_upload + procedure gpu_upload_double1 & + ,gpu_upload_double2 & + ,gpu_upload_double3 & + ,gpu_upload_double4 & + ,gpu_upload_double5 & + ,gpu_upload_double6 + end interface gpu_upload + + interface gpu_download + procedure gpu_download_double1 & + ,gpu_download_double2 & + ,gpu_download_double3 & + ,gpu_download_double4 & + ,gpu_download_double5 & + ,gpu_download_double6 + end interface gpu_download + + interface gpu_copy + procedure gpu_copy_double1 & + ,gpu_copy_double2 & + ,gpu_copy_double3 & + ,gpu_copy_double4 & + ,gpu_copy_double5 & + ,gpu_copy_double6 + end interface gpu_copy + + contains - subroutine gpu_allocate_double(ptr, s) - implicit none - double precision, pointer, intent(inout) :: ptr - integer, intent(in) :: s(:) - type(c_ptr) :: cptr +! gpu_allocate +! ------------ - call gpu_allocate_c(cptr, sum(s*1_8)*8_8) - call c_f_pointer(cptr, ptr, s) + subroutine gpu_allocate_double1(ptr, s) + implicit none + type(gpu_double1), intent(inout) :: ptr + integer, intent(in) :: s + + call gpu_allocate_c(ptr%c, s*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s /)) end subroutine - subroutine gpu_deallocate_double(ptr) + subroutine gpu_allocate_double2(ptr, s1, s2) implicit none - double precision, pointer, intent(inout) :: ptr - type(c_ptr) :: cptr - cptr = c_loc(ptr) - call gpu_deallocate_c(cptr) - NULLIFY(ptr) + type(gpu_double2), intent(inout) :: ptr + integer, intent(in) :: s1, s2 + + call gpu_allocate_c(ptr%c, s1*s2*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /)) + end subroutine + + subroutine gpu_allocate_double3(ptr, s1, s2, s3) + implicit none + type(gpu_double3), intent(inout) :: ptr + integer, intent(in) :: s1, s2, s3 + + call gpu_allocate_c(ptr%c, s1*s2*s3*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /)) + end subroutine + + subroutine gpu_allocate_double4(ptr, s1, s2, s3, s4) + implicit none + type(gpu_double4), intent(inout) :: ptr + integer, intent(in) :: s1, s2, s3, s4 + + call gpu_allocate_c(ptr%c, s1*s2*s3*s4*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /)) + end subroutine + + subroutine gpu_allocate_double5(ptr, s1, s2, s3, s4, s5) + implicit none + type(gpu_double5), intent(inout) :: ptr + integer, intent(in) :: s1, s2, s3, s4, s5 + + call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /)) + end subroutine + + subroutine gpu_allocate_double6(ptr, s1, s2, s3, s4, s5, s6) + implicit none + type(gpu_double6), intent(inout) :: ptr + integer, intent(in) :: s1, s2, s3, s4, s5, s6 + + call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*s6*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /)) + end subroutine + + +! gpu_deallocate +! 
-------------- + + subroutine gpu_deallocate_double1(ptr) + implicit none + type(gpu_double1), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double2(ptr) + implicit none + type(gpu_double2), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double3(ptr) + implicit none + type(gpu_double3), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double4(ptr) + implicit none + type(gpu_double4), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double5(ptr) + implicit none + type(gpu_double5), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double6(ptr) + implicit none + type(gpu_double6), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + +! gpu_upload +! ---------- + + subroutine gpu_upload_double1(cpu_ptr, gpu_ptr) + implicit none + double precision, intent(in) :: cpu_ptr(:) + type(gpu_double1), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, 8_8*size(gpu_ptr%f)) + end subroutine + + subroutine gpu_upload_double2(cpu_ptr, gpu_ptr) + implicit none + double precision, intent(in) :: cpu_ptr(:,:) + type(gpu_double2), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + subroutine gpu_upload_double3(cpu_ptr, gpu_ptr) + implicit none + double precision, intent(in) :: cpu_ptr(:,:,:) + type(gpu_double3), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + subroutine gpu_upload_double4(cpu_ptr, gpu_ptr) + implicit none + double precision, intent(in) :: cpu_ptr(:,:,:,:) + type(gpu_double4), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + subroutine gpu_upload_double5(cpu_ptr, gpu_ptr) + implicit none + double precision, intent(in) :: cpu_ptr(:,:,:,:,:) + type(gpu_double5), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + subroutine gpu_upload_double6(cpu_ptr, gpu_ptr) + implicit none + double precision, intent(in) :: cpu_ptr(:,:,:,:,:,:) + type(gpu_double6), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + +! gpu_download +! 
------------ + + subroutine gpu_download_double1(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double1), intent(in) :: gpu_ptr + double precision, intent(in) :: cpu_ptr(:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*size(gpu_ptr%f)) + end subroutine + + subroutine gpu_download_double2(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double2), intent(in) :: gpu_ptr + double precision, intent(in) :: cpu_ptr(:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + + subroutine gpu_download_double3(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double3), intent(in) :: gpu_ptr + double precision, intent(in) :: cpu_ptr(:,:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + + subroutine gpu_download_double4(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double4), intent(in) :: gpu_ptr + double precision, intent(in) :: cpu_ptr(:,:,:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + + subroutine gpu_download_double5(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double5), intent(in) :: gpu_ptr + double precision, intent(in) :: cpu_ptr(:,:,:,:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + + subroutine gpu_download_double6(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double6), intent(in) :: gpu_ptr + double precision, intent(in) :: cpu_ptr(:,:,:,:,:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + +! gpu_copy +! -------- + + subroutine gpu_copy_double1(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double1), intent(in) :: gpu_ptr_src + type(gpu_double1), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*size(gpu_ptr_dest%f)) + end subroutine + + subroutine gpu_copy_double2(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double2), intent(in) :: gpu_ptr_src + type(gpu_double2), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + subroutine gpu_copy_double3(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double3), intent(in) :: gpu_ptr_src + type(gpu_double3), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + subroutine gpu_copy_double4(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double4), intent(in) :: gpu_ptr_src + type(gpu_double4), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + subroutine gpu_copy_double5(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double5), intent(in) :: gpu_ptr_src + type(gpu_double5), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + subroutine gpu_copy_double6(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double6), intent(in) :: gpu_ptr_src + type(gpu_double6), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + +! gpu_stream +! 
---------- + + subroutine gpu_stream_create(stream) + import + type(gpu_stream) :: stream + call gpu_stream_create_c(stream%c) + end subroutine + + subroutine gpu_stream_destroy(stream) + import + type(gpu_stream) :: stream + call gpu_stream_destroy_c(stream%c) + end subroutine + + subroutine gpu_set_stream(handle, stream) + import + type(gpu_blas) :: handle + type(gpu_stream) :: stream + call gpu_set_stream_c(handle%c, stream%c) + end subroutine + + +! gpu_blas +! -------- + + subroutine gpu_blas_create(handle) + import + type(gpu_blas) :: handle + call gpu_blas_create_c(handle%c) + end subroutine + + subroutine gpu_blas_destroy(handle) + import + type(gpu_blas) :: handle + call gpu_blas_destroy_c(handle%c) end subroutine end module -subroutine gpu_upload_double(cpu_ptr, gpu_ptr, n) - use gpu - implicit none - double precision, intent(in) :: cpu_ptr(*) - double precision, intent(in) :: gpu_ptr(*) - integer(c_int64_t), intent(in) :: n - call gpu_upload_c(c_loc(cpu_ptr), c_loc(gpu_ptr), 8_8*n) -end subroutine - -subroutine gpu_download_double(gpu_ptr, cpu_ptr, n) - use gpu - implicit none - double precision, intent(in) :: gpu_ptr(*) - double precision, intent(in) :: cpu_ptr(*) - integer(c_int64_t), intent(in) :: n - call gpu_download_c(c_loc(gpu_ptr), c_loc(cpu_ptr), 8_8*n) -end subroutine - -subroutine gpu_copy_double(gpu_ptr_src, gpu_ptr_dest, n) - use gpu - implicit none - double precision, intent(in) :: gpu_ptr_src(*) - double precision, intent(in) :: gpu_ptr_dest(*) - integer(c_int64_t), intent(in) :: n - call gpu_copy_c(c_loc(gpu_ptr_src), c_loc(gpu_ptr_dest), 8_8*n) -end subroutine - diff --git a/src/gpu_x86/gpu.c b/src/gpu_x86/gpu.c index 41ede396..5f42cb0d 100644 --- a/src/gpu_x86/gpu.c +++ b/src/gpu_x86/gpu.c @@ -251,7 +251,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i if (alpha == 0.) 
{ for (int64_t j=0 ; j Date: Fri, 28 Jun 2024 11:00:58 +0200 Subject: [PATCH 16/38] Added Nvidia module --- src/ccsd/ccsd_space_orb_sub.irp.f | 10 +- src/gpu/gpu_module.F90 | 6 +- src/gpu_nvidia/LIB | 1 + src/gpu_nvidia/NEED | 1 + src/gpu_nvidia/README.rst | 5 + src/gpu_nvidia/gpu.c | 327 ++++++++++++++++++++++++++++++ src/gpu_x86/gpu.c | 40 ++-- 7 files changed, 359 insertions(+), 31 deletions(-) create mode 100644 src/gpu_nvidia/LIB create mode 100644 src/gpu_nvidia/NEED create mode 100644 src/gpu_nvidia/README.rst create mode 100644 src/gpu_nvidia/gpu.c diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 455d62f7..e7c9b1ab 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -384,17 +384,17 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau) !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) - do j=1,nO - !$OMP DO - do b=1,nV - call gpu_set_stream(blas,stream(b)) + !$OMP DO + do b=1,nV + call gpu_set_stream(blas,stream(b)) + do j=1,nO call gpu_dgeam_c(blas%c, 'N', 'N', nO*1_8, nV*1_8, & 1.d0, c_loc(t2%f(1,j,1,b)), nO*nO*1_8, & h_t1(j,b), t1%c, nO*1_8, & c_loc(tau%f(1,j,1,b)), nO*nO*1_8) enddo - !$OMP END DO enddo + !$OMP END DO !$OMP END PARALLEL call gpu_synchronize() diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90 index 51f80ac0..d1ddad4c 100644 --- a/src/gpu/gpu_module.F90 +++ b/src/gpu/gpu_module.F90 @@ -120,7 +120,7 @@ module gpu subroutine gpu_ddot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_ddot') import - type(c_ptr), intent(in), value :: handle + type(c_ptr), intent(in) :: handle integer(c_int64_t), value :: n, incx, incy type(c_ptr), intent(in), value :: dx, dy real(c_double), intent(out) :: res @@ -128,7 +128,7 @@ module gpu subroutine gpu_sdot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_sdot') import - type(c_ptr), intent(in), value :: handle + type(c_ptr), intent(in) :: handle integer(c_int64_t), value :: n, incx, incy type(c_ptr), intent(in), value :: dx, dy real(c_float), intent(out) :: res @@ -137,7 +137,7 @@ module gpu subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, & b, ldb, c, ldc) bind(C, name='gpu_dgeam') import - type(c_ptr), intent(in), value :: handle + type(c_ptr), intent(in) :: handle character(c_char), intent(in), value :: transa, transb integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc real(c_double), intent(in), value :: alpha, beta diff --git a/src/gpu_nvidia/LIB b/src/gpu_nvidia/LIB new file mode 100644 index 00000000..91f54e91 --- /dev/null +++ b/src/gpu_nvidia/LIB @@ -0,0 +1 @@ +-lcudart -lcublas -lcublasLt diff --git a/src/gpu_nvidia/NEED b/src/gpu_nvidia/NEED new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/src/gpu_nvidia/NEED @@ -0,0 +1 @@ + diff --git a/src/gpu_nvidia/README.rst b/src/gpu_nvidia/README.rst new file mode 100644 index 00000000..5dcfca92 --- /dev/null +++ b/src/gpu_nvidia/README.rst @@ -0,0 +1,5 @@ +========== +gpu_nvidia +========== + +Nvidia implementation of GPU routines. Uses CUDA and CUBLAS libraries. 
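A reading aid before the new file: the sketch below strings the entry points of this module together from plain host C code -- create the cuBLAS handle, allocate device buffers, upload, run one dgeam, synchronize, download. The prototypes are copied from the gpu.c introduced just below (the void**-handle convention of this commit; patch 22 later reworks it). The main() program itself, with its 4x4 matrices and expected value, is a hypothetical illustration, not code from the repository.

#include <stdint.h>
#include <stdio.h>

/* Prototypes as declared by this patch (src/gpu_nvidia/gpu.c). */
void gpu_blas_create(void** handle);
void gpu_blas_destroy(void** handle);
void gpu_allocate(void** ptr, const int64_t size);
void gpu_deallocate(void** ptr);
void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n);
void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n);
void gpu_synchronize();
void gpu_dgeam(void** handle, const char transa, const char transb,
               const int64_t m, const int64_t n, const double alpha,
               const double* a, const int64_t lda, const double beta,
               const double* b, const int64_t ldb, double* c, const int64_t ldc);

int main() {
  const int64_t m = 4, n = 4;
  double a[16], b[16], c[16];
  for (int i = 0; i < 16; i++) { a[i] = (double) i; b[i] = 1.0; }

  void *handle, *d_a, *d_b, *d_c;
  gpu_blas_create(&handle);
  gpu_allocate(&d_a, m*n*8);  /* sizes are in bytes, 8 per double */
  gpu_allocate(&d_b, m*n*8);
  gpu_allocate(&d_c, m*n*8);
  gpu_upload(a, d_a, m*n*8);
  gpu_upload(b, d_b, m*n*8);

  /* C = 2*A + 3*B, both operands untransposed */
  gpu_dgeam(&handle, 'N', 'N', m, n, 2.0, (double*) d_a, m,
            3.0, (double*) d_b, m, (double*) d_c, m);
  gpu_synchronize();

  gpu_download(d_c, c, m*n*8);
  printf("c[5] = %f\n", c[5]);  /* 2*5 + 3*1 = 13 */

  gpu_deallocate(&d_a); gpu_deallocate(&d_b); gpu_deallocate(&d_c);
  gpu_blas_destroy(&handle);
  return 0;
}

Keeping the handles opaque (a void* in C, a type(c_ptr) inside the Fortran derived types) is what lets the x86, Nvidia and AMD backends stay interchangeable at link time.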
diff --git a/src/gpu_nvidia/gpu.c b/src/gpu_nvidia/gpu.c new file mode 100644 index 00000000..f0bd247a --- /dev/null +++ b/src/gpu_nvidia/gpu.c @@ -0,0 +1,327 @@ +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> + +#include <cuda_runtime.h> +#include <cublas_v2.h> + + +/* Generic functions */ + +int gpu_ndevices() { + int ngpus; + cudaGetDeviceCount(&ngpus); + return ngpus; +} + +void gpu_set_device(int32_t igpu) { + cudaSetDevice(igpu); +} + + +/* Allocation functions */ + +void gpu_allocate(void** ptr, const int64_t size) { + size_t free, total; + cudaError_t rc = cudaMemGetInfo( &free, &total ); + if (rc != cudaSuccess) { + free = INT64_MAX; + } + + /* Use managed memory if it does not fit on the GPU */ + if (size < free && size < total/2) { +// rc= cudaMalloc(ptr, size); + rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal); + } else { + rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal); + } + assert (rc == cudaSuccess); +} + +void gpu_deallocate(void** ptr) { + assert (*ptr != NULL); + cudaFree(*ptr); + *ptr = NULL; +} + + +/* Memory transfer functions */ + +void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) { + cudaMemcpy (gpu_ptr, cpu_ptr, n, cudaMemcpyHostToDevice); +} + +void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) { + cudaMemcpy (cpu_ptr, gpu_ptr, n, cudaMemcpyDeviceToHost); +} + +void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) { + cudaMemcpy (gpu_ptr_dest, gpu_ptr_src, n, cudaMemcpyDeviceToDevice); +} + + +/* Streams */ + +void gpu_stream_create(void** ptr) { + cudaStream_t stream; + cudaError_t rc = cudaStreamCreate(&stream); + assert (rc == cudaSuccess); + *ptr = (void*) stream; } + +void gpu_stream_destroy(void** ptr) { + assert (*ptr != NULL); + cudaError_t rc = cudaStreamDestroy( (cudaStream_t) *ptr); + assert (rc == cudaSuccess); + *ptr = NULL; } + +void gpu_set_stream(void** handle, void** stream) { + cublasSetStream( (cublasHandle_t) *handle, (cudaStream_t) *stream); +} + +void gpu_synchronize() { + cudaDeviceSynchronize(); +} + + +/* BLAS functions */ + +void gpu_blas_create(void** handle) { + cublasHandle_t cublas_handle; + cublasStatus_t rc = cublasCreate(&cublas_handle); + assert (rc == CUBLAS_STATUS_SUCCESS); + *handle = (void*) cublas_handle; +} + + +void gpu_blas_destroy(void** handle) { + assert (*handle != NULL); + cublasStatus_t rc = cublasDestroy( (cublasHandle_t) *handle); + assert (rc == CUBLAS_STATUS_SUCCESS); + *handle = NULL; +} + + +void gpu_ddot(void** handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { + assert (*handle != NULL); + + /* Convert to int32_t */ + int32_t n_, incx_, incy_; + + n_ = (int32_t) n; + incx_ = (int32_t) incx; + incy_ = (int32_t) incy; + + /* Check for integer overflows */ + assert ( (int64_t) n_ == n ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + cublasDdot((cublasHandle_t) *handle, n_, x, incx_, y, incy_, result); +} + + + +void gpu_sdot(void** handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { + assert (*handle != NULL); + + /* Convert to int32_t */ + int32_t n_, incx_, incy_; + + n_ = (int32_t) n; + incx_ = (int32_t) incx; + incy_ = (int32_t) incy; + + /* Check for integer overflows */ + assert ( (int64_t) n_ == n ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + cublasSdot((cublasHandle_t) *handle, n_, x, incx_, y, incy_, result); +} + + + +void gpu_dgemv(void** handle, const char 
transa, const int64_t m, const int64_t n, const double alpha, + const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) { + + assert (*handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, lda_, incx_, incy_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + lda_ = (int32_t) lda; + incx_ = (int32_t) incx; + incy_ = (int32_t) incy; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + cublasOperation_t transa_ = CUBLAS_OP_N; + if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; + + cublasDgemv((cublasHandle_t) *handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_); +} + + + +void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t n, const float alpha, + const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) { + + assert (*handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, lda_, incx_, incy_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + lda_ = (int32_t) lda; + incx_ = (int32_t) incx; + incy_ = (int32_t) incy; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + cublasOperation_t transa_ = CUBLAS_OP_N; + if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; + + cublasSgemv((cublasHandle_t) *handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_); +} + + +void gpu_dgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, + const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) { + + assert (*handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, k_, lda_, ldb_, ldc_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + k_ = (int32_t) k; + lda_ = (int32_t) lda; + ldb_ = (int32_t) ldb; + ldc_ = (int32_t) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) k_ == k ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + cublasOperation_t transa_ = CUBLAS_OP_N; + cublasOperation_t transb_ = CUBLAS_OP_N; + if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; + if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; + + cublasDgemm((cublasHandle_t) *handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_); +} + + + +void gpu_sgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, + const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) { + + assert (*handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, k_, lda_, ldb_, ldc_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + k_ = (int32_t) k; + lda_ = (int32_t) lda; + ldb_ = (int32_t) ldb; + ldc_ = (int32_t) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) k_ == k ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + cublasOperation_t transa_ = CUBLAS_OP_N; + 
cublasOperation_t transb_ = CUBLAS_OP_N; + if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; + if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; + + cublasSgemm((cublasHandle_t) *handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_); +} + + +void gpu_dgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, + const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { + assert (*handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, lda_, ldb_, ldc_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + lda_ = (int32_t) lda; + ldb_ = (int32_t) ldb; + ldc_ = (int32_t) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + cublasOperation_t transa_ = CUBLAS_OP_N; + cublasOperation_t transb_ = CUBLAS_OP_N; + if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; + if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; + + cublasDgeam((cublasHandle_t) *handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_); + +} + + +void gpu_sgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, + const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) { + assert (*handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, lda_, ldb_, ldc_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + lda_ = (int32_t) lda; + ldb_ = (int32_t) ldb; + ldc_ = (int32_t) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + cublasOperation_t transa_ = CUBLAS_OP_N; + cublasOperation_t transb_ = CUBLAS_OP_N; + if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; + if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; + + cublasSgeam((cublasHandle_t) *handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_); + +} diff --git a/src/gpu_x86/gpu.c b/src/gpu_x86/gpu.c index 5f42cb0d..ac7c3620 100644 --- a/src/gpu_x86/gpu.c +++ b/src/gpu_x86/gpu.c @@ -56,7 +56,7 @@ void gpu_stream_destroy(void** ptr) { *ptr = NULL; } -void gpu_set_stream(void* handle, void* stream) { +void gpu_set_stream(void** handle, void** stream) { return; } @@ -79,8 +79,8 @@ void gpu_blas_destroy(void** handle) { double ddot_(const int32_t* n, const double* x, const int32_t* incx, const double* y, const int32_t* incy); -void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { - assert (handle != NULL); +void gpu_ddot(void** handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { + assert (*handle != NULL); /* Convert to int32_t */ int32_t n_, incx_, incy_; @@ -100,8 +100,8 @@ void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_ float sdot_(const int32_t* n, const float* x, const int32_t* incx, const float* y, const int32_t* incy); -void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { - assert (handle != NULL); +void gpu_sdot(void** handle, const int64_t n, 
const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { + assert (*handle != NULL); /* Convert to int32_t */ int32_t n_, incx_, incy_; @@ -122,10 +122,10 @@ void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha, const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy); -void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double alpha, +void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t n, const double alpha, const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) { - assert (handle != NULL); + assert (*handle != NULL); /* Convert to int32_t */ int32_t m_, n_, lda_, incx_, incy_; @@ -150,10 +150,10 @@ void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha, const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy); -void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float alpha, +void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t n, const float alpha, const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) { - assert (handle != NULL); + assert (*handle != NULL); /* Convert to int32_t */ int32_t m_, n_, lda_, incx_, incy_; @@ -178,10 +178,10 @@ void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha, const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc); -void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, +void gpu_dgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) { - assert (handle != NULL); + assert (*handle != NULL); /* Convert to int32_t */ int32_t m_, n_, k_, lda_, ldb_, ldc_; @@ -209,10 +209,10 @@ void gpu_dgemm(const void* handle, const char transa, const char transb, const i void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha, const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc); -void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, +void gpu_sgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) { - assert (handle != NULL); + assert (*handle != NULL); /* Convert to int32_t */ int32_t m_, n_, k_, lda_, ldb_, ldc_; @@ -236,12 +236,9 @@ void gpu_sgemm(const void* handle, const char transa, const char transb, 
const i } -void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, +void gpu_dgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { - if (handle == NULL) { - perror("NULL handle"); - exit(-1); - } + assert (*handle != NULL); if ( (transa == 'N' && transb == 'N') || (transa == 'n' && transb == 'N') || @@ -371,12 +368,9 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i } -void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, +void gpu_sgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) { - if (handle == NULL) { - perror("NULL handle"); - exit(-1); - } + assert (*handle != NULL); if ( (transa == 'N' && transb == 'N') || (transa == 'n' && transb == 'N') || From d3d89022bc8092ab0c6131904f85475f160dfa53 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Fri, 28 Jun 2024 16:50:52 +0200 Subject: [PATCH 17/38] Move GPU modules to plugins --- configure | 6 +++--- {src => plugins/local}/gpu_nvidia/LIB | 0 {src => plugins/local}/gpu_nvidia/NEED | 0 {src => plugins/local}/gpu_nvidia/README.rst | 0 {src => plugins/local}/gpu_nvidia/gpu.c | 0 {src => plugins/local}/gpu_x86/NEED | 0 {src => plugins/local}/gpu_x86/README.rst | 0 {src => plugins/local}/gpu_x86/gpu.c | 0 8 files changed, 3 insertions(+), 3 deletions(-) rename {src => plugins/local}/gpu_nvidia/LIB (100%) rename {src => plugins/local}/gpu_nvidia/NEED (100%) rename {src => plugins/local}/gpu_nvidia/README.rst (100%) rename {src => plugins/local}/gpu_nvidia/gpu.c (100%) rename {src => plugins/local}/gpu_x86/NEED (100%) rename {src => plugins/local}/gpu_x86/README.rst (100%) rename {src => plugins/local}/gpu_x86/gpu.c (100%) diff --git a/configure b/configure index db158966..08dac444 100755 --- a/configure +++ b/configure @@ -119,15 +119,15 @@ rm -f ${QP_ROOT}/src/gpu_arch case "$GPU" in amd) # Nvidia echo "Activating AMD GPU acceleration" - ln -s ${QP_ROOT}/src/gpu_amd ${QP_ROOT}/src/gpu_arch + ln -s ${QP_ROOT}/plugins/local/gpu_amd ${QP_ROOT}/src/gpu_arch ;; nvidia) # Nvidia echo "Activating Nvidia GPU acceleration" - ln -s ${QP_ROOT}/src/gpu_nvidia ${QP_ROOT}/src/gpu_arch + ln -s ${QP_ROOT}/plugins/local/gpu_nvidia ${QP_ROOT}/src/gpu_arch ;; *) # No Acceleration echo "Disabling GPU acceleration" - ln -s ${QP_ROOT}/src/gpu_x86 ${QP_ROOT}/src/gpu_arch + ln -s ${QP_ROOT}/plugins/local/gpu_x86 ${QP_ROOT}/src/gpu_arch ;; esac diff --git a/src/gpu_nvidia/LIB b/plugins/local/gpu_nvidia/LIB similarity index 100% rename from src/gpu_nvidia/LIB rename to plugins/local/gpu_nvidia/LIB diff --git a/src/gpu_nvidia/NEED b/plugins/local/gpu_nvidia/NEED similarity index 100% rename from src/gpu_nvidia/NEED rename to plugins/local/gpu_nvidia/NEED diff --git a/src/gpu_nvidia/README.rst b/plugins/local/gpu_nvidia/README.rst similarity index 100% rename from src/gpu_nvidia/README.rst rename to plugins/local/gpu_nvidia/README.rst diff --git a/src/gpu_nvidia/gpu.c b/plugins/local/gpu_nvidia/gpu.c similarity index 100% rename from src/gpu_nvidia/gpu.c rename to plugins/local/gpu_nvidia/gpu.c diff --git a/src/gpu_x86/NEED 
b/plugins/local/gpu_x86/NEED similarity index 100% rename from src/gpu_x86/NEED rename to plugins/local/gpu_x86/NEED diff --git a/src/gpu_x86/README.rst b/plugins/local/gpu_x86/README.rst similarity index 100% rename from src/gpu_x86/README.rst rename to plugins/local/gpu_x86/README.rst diff --git a/src/gpu_x86/gpu.c b/plugins/local/gpu_x86/gpu.c similarity index 100% rename from src/gpu_x86/gpu.c rename to plugins/local/gpu_x86/gpu.c From 85b1035cfba778559e629045961cb542631841bd Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Fri, 28 Jun 2024 17:33:08 +0200 Subject: [PATCH 18/38] Working on CCSD --- src/ccsd/ccsd_space_orb_sub.irp.f | 117 +++++++++++------------------- src/gpu/gpu_module.F90 | 62 ++++++++++++++++ 2 files changed, 103 insertions(+), 76 deletions(-) diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index e7c9b1ab..1329f172 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -14,6 +14,9 @@ subroutine run_ccsd_space_orb type(gpu_double2) :: t1, r1 type(gpu_double2) :: H_oo, H_vv, H_vo + type(gpu_double2) :: d_cc_space_f_vo + type(gpu_double4) :: d_cc_space_v_oovv + double precision, allocatable :: all_err(:,:), all_t(:,:) integer, allocatable :: list_occ(:), list_vir(:) integer(bit_kind) :: det(N_int,2) @@ -52,6 +55,12 @@ subroutine run_ccsd_space_orb !print*,'occ',list_occ !print*,'vir',list_vir + call gpu_allocate(d_cc_space_f_vo, nV, nO) + call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV) + call gpu_upload(cc_space_f_vo, d_cc_space_f_vo) + call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv) + + call gpu_allocate(t2, nO,nO,nV,nV) call gpu_allocate(r2, nO,nO,nV,nV) call gpu_allocate(tau, nO,nO,nV,nV) @@ -116,7 +125,8 @@ subroutine run_ccsd_space_orb !print*,'hf_energy', hf_energy call det_energy(det,uncorr_energy) print*,'Det energy', uncorr_energy - call ccsd_energy_space_x(nO,nV,tau_x%f,t1%f,energy) + + call ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy) print*,'Guess energy', uncorr_energy+energy, energy nb_iter = 0 @@ -166,7 +176,7 @@ subroutine run_ccsd_space_orb call update_tau_x_space(nO,nV,tau,tau_x) ! Energy - call ccsd_energy_space_x(nO,nV,tau_x%f,t1%f,energy) + call ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy) write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |' nb_iter = nb_iter + 1 @@ -239,6 +249,8 @@ subroutine run_ccsd_space_orb call save_energy(uncorr_energy + energy, e_t) deallocate(h_t1, h_t2) + call gpu_deallocate(d_cc_space_f_vo) + call gpu_deallocate(d_cc_space_v_oovv) call gpu_deallocate(t1) call gpu_deallocate(t2) @@ -246,59 +258,14 @@ end ! Energy -subroutine ccsd_energy_space(nO,nV,tau,t1,energy) - +subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy) + use gpu implicit none - integer, intent(in) :: nO, nV - double precision, intent(in) :: tau(nO,nO,nV,nV) - double precision, intent(in) :: t1(nO,nV) - double precision, intent(out) :: energy - - ! 
internal - integer :: i,j,a,b - double precision :: e - - energy = 0d0 - !$omp parallel & - !$omp shared(nO,nV,energy,tau,t1,& - !$omp cc_space_f_vo,cc_space_w_oovv) & - !$omp private(i,j,a,b,e) & - !$omp default(none) - e = 0d0 - !$omp do - do a = 1, nV - do i = 1, nO - e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a) - enddo - enddo - !$omp end do nowait - !$omp do - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - e = e + tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b) - enddo - enddo - enddo - enddo - !$omp end do nowait - !$omp critical - energy = energy + e - !$omp end critical - !$omp end parallel - -end - -subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy) - - implicit none - - integer, intent(in) :: nO, nV - double precision, intent(in) :: tau_x(nO,nO,nV,nV) - double precision, intent(in) :: t1(nO,nV) - double precision, intent(out) :: energy + integer, intent(in) :: nO, nV + type(gpu_double4), intent(in) :: tau_x, d_cc_space_v_oovv + type(gpu_double2), intent(in) :: t1, d_cc_space_f_vo + double precision, intent(out) :: energy ! internal integer :: i,j,a,b @@ -307,14 +274,14 @@ subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy) energy = 0d0 !$omp parallel & !$omp shared(nO,nV,energy,tau_x,t1,& - !$omp cc_space_f_vo,cc_space_v_oovv) & + !$omp d_cc_space_f_vo,d_cc_space_v_oovv) & !$omp private(i,j,a,b,e) & !$omp default(none) e = 0d0 !$omp do do a = 1, nV do i = 1, nO - e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a) + e = e + 2d0 * d_cc_space_f_vo%f(a,i) * t1%f(i,a) enddo enddo !$omp end do nowait @@ -323,7 +290,7 @@ subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy) do a = 1, nV do j = 1, nO do i = 1, nO - e = e + tau_x(i,j,a,b) * cc_space_v_oovv(i,j,a,b) + e = e + tau_x%f(i,j,a,b) * d_cc_space_v_oovv%f(i,j,a,b) enddo enddo enddo @@ -333,6 +300,12 @@ subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy) energy = energy + e !$omp end critical !$omp end parallel +! +! +! call gpu_ddot(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1, d_cc_space_v_oovv, 1, energy) +! call gpu_ddot(blas_handle, nO*nV*1_8, d_cc_space_f_vo, 1, t1, 1, e) +! energy = energy + 2.d0*e + end @@ -372,26 +345,24 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau) ! !$OMP END PARALLEL - type(gpu_blas) :: blas type(gpu_stream) :: stream(nV) - call gpu_blas_create(blas) do b=1,nV call gpu_stream_create(stream(b)) enddo !$OMP PARALLEL & - !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas) & + !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) !$OMP DO do b=1,nV - call gpu_set_stream(blas,stream(b)) + call gpu_set_stream(blas_handle,stream(b)) do j=1,nO - call gpu_dgeam_c(blas%c, 'N', 'N', nO*1_8, nV*1_8, & - 1.d0, c_loc(t2%f(1,j,1,b)), nO*nO*1_8, & - h_t1(j,b), t1%c, nO*1_8, & - c_loc(tau%f(1,j,1,b)), nO*nO*1_8) + call gpu_dgeam(blas_handle, 'N', 'N', nO*1_8, nV*1_8, & + 1.d0, t2%f(1,j,1,b), nO*nO*1_8, & + h_t1(j,b), t1%f, nO*1_8, & + tau%f(1,j,1,b), nO*nO*1_8) enddo enddo !$OMP END DO @@ -403,8 +374,6 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau) call gpu_stream_destroy(stream(b)) enddo - call gpu_blas_destroy(blas) - end subroutine update_tau_x_space(nO,nV,tau,tau_x) @@ -438,26 +407,24 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x) ! !$OMP END DO ! 
!$OMP END PARALLEL - type(gpu_blas) :: blas type(gpu_stream) :: stream(nV) - call gpu_blas_create(blas) do a=1,nV call gpu_stream_create(stream(a)) enddo !$OMP PARALLEL & - !$OMP SHARED(nO,nV,tau,tau_x,stream,blas) & + !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) !$OMP DO do b=1,nV do a=1,nV - call gpu_set_stream(blas,stream(a)) - call gpu_dgeam_c(blas%c, 'N', 'N', nO*1_8, nO*1_8, & - 2.d0, c_loc(tau%f(1,1,a,b)), nO*1_8, & - -1.d0, c_loc(tau%f(1,1,b,a)), nO*1_8, & - c_loc(tau_x%f(1,1,a,b)), nO*1_8) + call gpu_set_stream(blas_handle,stream(a)) + call gpu_dgeam(blas_handle, 'N', 'N', nO*1_8, nO*1_8, & + 2.d0, tau%f(1,1,a,b), nO*1_8, & + -1.d0, tau%f(1,1,b,a), nO*1_8, & + tau_x%f(1,1,a,b), nO*1_8) enddo enddo !$OMP END DO @@ -469,8 +436,6 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x) call gpu_stream_destroy(stream(b)) enddo - call gpu_blas_destroy(blas) - end ! R1 diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90 index d1ddad4c..2057d1eb 100644 --- a/src/gpu/gpu_module.F90 +++ b/src/gpu/gpu_module.F90 @@ -144,6 +144,16 @@ module gpu type(c_ptr), value :: a, b, c end subroutine + subroutine gpu_sgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, & + b, ldb, c, ldc) bind(C, name='gpu_sgeam') + import + type(c_ptr), intent(in) :: handle + character(c_char), intent(in), value :: transa, transb + integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc + real(c_float), intent(in), value :: alpha, beta + type(c_ptr), value :: a, b, c + end subroutine + end interface @@ -478,5 +488,57 @@ module gpu call gpu_blas_destroy_c(handle%c) end subroutine + end module + + +! dot +! --- + +subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) + use gpu + type(gpu_blas), intent(in) :: handle + integer*8 :: n, incx, incy + double precision, intent(in) :: dx(*), dy(*) + double precision, intent(out) :: res + call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res) +end subroutine + +subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res) + use gpu + type(gpu_blas), intent(in) :: handle + integer*8 :: n, incx, incy + real, intent(in) :: dx(*), dy(*) + real, intent(out) :: res + call gpu_sdot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res) +end subroutine + + +! geam +! 
---- + +subroutine gpu_dgeam(handle, transa, transb, m, n, alpha, a, lda, beta, & + b, ldb, c, ldc) + use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*8, intent(in) :: m, n, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision :: a(lda,*), b(ldb,*), c(ldc,*) + call gpu_dgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, & + c_loc(b), ldb, c_loc(c), ldc) +end subroutine + +subroutine gpu_sgeam(handle, transa, transb, m, n, alpha, a, lda, beta, & + b, ldb, c, ldc) + use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*8, intent(in) :: m, n, lda, ldb, ldc + real, intent(in) :: alpha, beta + real :: a(lda,*), b(ldb,*), c(ldc,*) + call gpu_sgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, & + c_loc(b), ldb, c_loc(c), ldc) +end subroutine + From a5f4f0516eec9f17438474529616368a6f9e5de4 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Fri, 28 Jun 2024 17:39:43 +0200 Subject: [PATCH 19/38] Fixing compile --- src/ccsd/ccsd_space_orb_sub.irp.f | 1 - src/gpu/gpu_module.F90 | 23 +++++++++-------------- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 1329f172..4e06e31d 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -306,7 +306,6 @@ subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1, ! call gpu_ddot(blas_handle, nO*nV*1_8, d_cc_space_f_vo, 1, t1, 1, e) ! energy = energy + 2.d0*e - end ! Tau diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90 index 2057d1eb..d7c26ba6 100644 --- a/src/gpu/gpu_module.F90 +++ b/src/gpu/gpu_module.F90 @@ -365,42 +365,42 @@ module gpu subroutine gpu_download_double1(gpu_ptr, cpu_ptr) implicit none type(gpu_double1), intent(in) :: gpu_ptr - double precision, intent(in) :: cpu_ptr(:) + double precision, target, intent(in) :: cpu_ptr(:) call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*size(gpu_ptr%f)) end subroutine subroutine gpu_download_double2(gpu_ptr, cpu_ptr) implicit none type(gpu_double2), intent(in) :: gpu_ptr - double precision, intent(in) :: cpu_ptr(:,:) + double precision, target, intent(in) :: cpu_ptr(:,:) call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) end subroutine subroutine gpu_download_double3(gpu_ptr, cpu_ptr) implicit none type(gpu_double3), intent(in) :: gpu_ptr - double precision, intent(in) :: cpu_ptr(:,:,:) + double precision, target, intent(in) :: cpu_ptr(:,:,:) call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) end subroutine subroutine gpu_download_double4(gpu_ptr, cpu_ptr) implicit none type(gpu_double4), intent(in) :: gpu_ptr - double precision, intent(in) :: cpu_ptr(:,:,:,:) + double precision, target, intent(in) :: cpu_ptr(:,:,:,:) call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) end subroutine subroutine gpu_download_double5(gpu_ptr, cpu_ptr) implicit none type(gpu_double5), intent(in) :: gpu_ptr - double precision, intent(in) :: cpu_ptr(:,:,:,:,:) + double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:) call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) end subroutine subroutine gpu_download_double6(gpu_ptr, cpu_ptr) implicit none type(gpu_double6), intent(in) :: gpu_ptr - double precision, intent(in) :: cpu_ptr(:,:,:,:,:,:) + double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:,:) 
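! Note on the `target` attributes added throughout this hunk: c_loc()
! from iso_c_binding is only well-defined for an argument carrying the
! TARGET (or POINTER) attribute, so every cpu_ptr handed to
! gpu_download_c here must be declared target -- that is the compile
! error this patch fixes (patch 20 below applies the same fix to the
! gpu_upload_* and dot/geam wrappers).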
call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) end subroutine @@ -454,19 +454,16 @@ module gpu ! ---------- subroutine gpu_stream_create(stream) - import type(gpu_stream) :: stream call gpu_stream_create_c(stream%c) end subroutine subroutine gpu_stream_destroy(stream) - import type(gpu_stream) :: stream call gpu_stream_destroy_c(stream%c) end subroutine subroutine gpu_set_stream(handle, stream) - import type(gpu_blas) :: handle type(gpu_stream) :: stream call gpu_set_stream_c(handle%c, stream%c) @@ -477,13 +474,11 @@ module gpu ! -------- subroutine gpu_blas_create(handle) - import type(gpu_blas) :: handle call gpu_blas_create_c(handle%c) end subroutine subroutine gpu_blas_destroy(handle) - import type(gpu_blas) :: handle call gpu_blas_destroy_c(handle%c) end subroutine @@ -500,7 +495,7 @@ subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) use gpu type(gpu_blas), intent(in) :: handle integer*8 :: n, incx, incy - double precision, intent(in) :: dx(*), dy(*) + double precision, target, intent(in) :: dx(*), dy(*) double precision, intent(out) :: res call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res) end subroutine @@ -525,7 +520,7 @@ subroutine gpu_dgeam(handle, transa, transb, m, n, alpha, a, lda, beta, & character, intent(in) :: transa, transb integer*8, intent(in) :: m, n, lda, ldb, ldc double precision, intent(in) :: alpha, beta - double precision :: a(lda,*), b(ldb,*), c(ldc,*) + double precision, target :: a(lda,*), b(ldb,*), c(ldc,*) call gpu_dgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, & c_loc(b), ldb, c_loc(c), ldc) end subroutine @@ -537,7 +532,7 @@ subroutine gpu_sgeam(handle, transa, transb, m, n, alpha, a, lda, beta, & character, intent(in) :: transa, transb integer*8, intent(in) :: m, n, lda, ldb, ldc real, intent(in) :: alpha, beta - real :: a(lda,*), b(ldb,*), c(ldc,*) + real, target :: a(lda,*), b(ldb,*), c(ldc,*) call gpu_sgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, & c_loc(b), ldb, c_loc(c), ldc) end subroutine From c7df9a72cc68a7f5dfded36aa94ac50d5188a5a1 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Fri, 28 Jun 2024 21:32:04 +0200 Subject: [PATCH 20/38] Fixing again actions --- src/gpu/gpu_module.F90 | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90 index d7c26ba6..ecf79c83 100644 --- a/src/gpu/gpu_module.F90 +++ b/src/gpu/gpu_module.F90 @@ -318,42 +318,42 @@ module gpu subroutine gpu_upload_double1(cpu_ptr, gpu_ptr) implicit none - double precision, intent(in) :: cpu_ptr(:) + double precision, target, intent(in) :: cpu_ptr(*) type(gpu_double1), intent(in) :: gpu_ptr call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, 8_8*size(gpu_ptr%f)) end subroutine subroutine gpu_upload_double2(cpu_ptr, gpu_ptr) implicit none - double precision, intent(in) :: cpu_ptr(:,:) + double precision, target, intent(in) :: cpu_ptr(:,:) type(gpu_double2), intent(in) :: gpu_ptr call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) end subroutine subroutine gpu_upload_double3(cpu_ptr, gpu_ptr) implicit none - double precision, intent(in) :: cpu_ptr(:,:,:) + double precision, target, intent(in) :: cpu_ptr(:,:,:) type(gpu_double3), intent(in) :: gpu_ptr call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) end subroutine subroutine gpu_upload_double4(cpu_ptr, gpu_ptr) implicit none - double precision, intent(in) :: cpu_ptr(:,:,:,:) + double precision, target, 
intent(in) :: cpu_ptr(:,:,:,:) type(gpu_double4), intent(in) :: gpu_ptr call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) end subroutine subroutine gpu_upload_double5(cpu_ptr, gpu_ptr) implicit none - double precision, intent(in) :: cpu_ptr(:,:,:,:,:) + double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:) type(gpu_double5), intent(in) :: gpu_ptr call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) end subroutine subroutine gpu_upload_double6(cpu_ptr, gpu_ptr) implicit none - double precision, intent(in) :: cpu_ptr(:,:,:,:,:,:) + double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:,:) type(gpu_double6), intent(in) :: gpu_ptr call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) end subroutine @@ -504,7 +504,7 @@ subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res) use gpu type(gpu_blas), intent(in) :: handle integer*8 :: n, incx, incy - real, intent(in) :: dx(*), dy(*) + real, target, intent(in) :: dx(*), dy(*) real, intent(out) :: res call gpu_sdot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res) end subroutine From b467bef6dd1e14c5914cc6508aa898d5f1665e3a Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Fri, 28 Jun 2024 21:37:14 +0200 Subject: [PATCH 21/38] Forgot file --- src/ccsd/ccsd_space_orb_sub.irp.f | 68 +++++++++++++++---------------- src/gpu/gpu.irp.f | 11 +++++ 2 files changed, 45 insertions(+), 34 deletions(-) create mode 100644 src/gpu/gpu.irp.f diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 4e06e31d..5c2daa05 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -271,40 +271,40 @@ subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1, integer :: i,j,a,b double precision :: e - energy = 0d0 - !$omp parallel & - !$omp shared(nO,nV,energy,tau_x,t1,& - !$omp d_cc_space_f_vo,d_cc_space_v_oovv) & - !$omp private(i,j,a,b,e) & - !$omp default(none) - e = 0d0 - !$omp do - do a = 1, nV - do i = 1, nO - e = e + 2d0 * d_cc_space_f_vo%f(a,i) * t1%f(i,a) - enddo - enddo - !$omp end do nowait - !$omp do - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - e = e + tau_x%f(i,j,a,b) * d_cc_space_v_oovv%f(i,j,a,b) - enddo - enddo - enddo - enddo - !$omp end do nowait - !$omp critical - energy = energy + e - !$omp end critical - !$omp end parallel -! -! -! call gpu_ddot(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1, d_cc_space_v_oovv, 1, energy) -! call gpu_ddot(blas_handle, nO*nV*1_8, d_cc_space_f_vo, 1, t1, 1, e) -! energy = energy + 2.d0*e +! energy = 0d0 +! !$omp parallel & +! !$omp shared(nO,nV,energy,tau_x,t1,& +! !$omp d_cc_space_f_vo,d_cc_space_v_oovv) & +! !$omp private(i,j,a,b,e) & +! !$omp default(none) +! e = 0d0 +! !$omp do +! do a = 1, nV +! do i = 1, nO +! e = e + 2d0 * d_cc_space_f_vo%f(a,i) * t1%f(i,a) +! enddo +! enddo +! !$omp end do nowait +! !$omp do +! do b = 1, nV +! do a = 1, nV +! do j = 1, nO +! do i = 1, nO +! e = e + tau_x%f(i,j,a,b) * d_cc_space_v_oovv%f(i,j,a,b) +! enddo +! enddo +! enddo +! enddo +! !$omp end do nowait +! !$omp critical +! energy = energy + e +! !$omp end critical +! 
!$omp end parallel + + + call gpu_ddot(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1, d_cc_space_v_oovv, 1, energy) + call gpu_ddot(blas_handle, nO*nV*1_8, d_cc_space_f_vo, 1, t1, 1, e) + energy = energy + 2.d0*e end diff --git a/src/gpu/gpu.irp.f b/src/gpu/gpu.irp.f new file mode 100644 index 00000000..e91d66f5 --- /dev/null +++ b/src/gpu/gpu.irp.f @@ -0,0 +1,11 @@ +use gpu + +BEGIN_PROVIDER [ type(gpu_blas), blas_handle ] + implicit none + BEGIN_DOC + ! Handle for cuBLAS or RocBLAS + END_DOC + call gpu_blas_create(blas_handle) +END_PROVIDER + + From 860121d404f7ae255790cd12136139103bdc48d0 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Sat, 29 Jun 2024 02:27:50 +0200 Subject: [PATCH 22/38] H_oo on GPU --- plugins/local/gpu_nvidia/gpu.c | 224 +++++++++++---------- plugins/local/gpu_x86/gpu.c | 38 ++-- src/ccsd/ccsd_space_orb_sub.irp.f | 112 ++++++++--- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 127 ++++++++---- src/gpu/gpu.irp.f | 7 + src/gpu/gpu_module.F90 | 260 +++++++++++++++++++++---- 6 files changed, 540 insertions(+), 228 deletions(-) diff --git a/plugins/local/gpu_nvidia/gpu.c b/plugins/local/gpu_nvidia/gpu.c index f0bd247a..189de64c 100644 --- a/plugins/local/gpu_nvidia/gpu.c +++ b/plugins/local/gpu_nvidia/gpu.c @@ -1,5 +1,6 @@ #include <stdint.h> #include <stdio.h> +#include <stdbool.h> #include <stdlib.h> #include <string.h> #include <assert.h> @@ -10,6 +11,10 @@ /* Generic functions */ +bool no_gpu() { + return false; +} + int gpu_ndevices() { int ngpus; cudaGetDeviceCount(&ngpus); @@ -17,7 +22,7 @@ int gpu_ndevices() { } void gpu_set_device(int32_t igpu) { - cudaSetDevice(igpu); + cudaSetDevice((int) igpu); } @@ -64,22 +69,20 @@ void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) { /* Streams */ -void gpu_stream_create(void** ptr) { - cudaStream_t stream; - cudaError_t rc = cudaStreamCreate(&stream); +void gpu_stream_create(cudaStream_t* ptr) { + cudaError_t rc = cudaStreamCreate(ptr); assert (rc == cudaSuccess); - *ptr = (void*) stream; } -void gpu_stream_destroy(void** ptr) { - assert (*ptr != NULL); - cudaError_t rc = cudaStreamDestroy( (cudaStream_t) *ptr); +void gpu_stream_destroy(cudaStream_t* ptr) { + assert (ptr != NULL); + cudaError_t rc = cudaStreamDestroy(*ptr); assert (rc == cudaSuccess); *ptr = NULL; } -void gpu_set_stream(void** handle, void** stream) { - cublasSetStream( (cublasHandle_t) *handle, (cudaStream_t) *stream); +void gpu_set_stream(cublasHandle_t handle, cudaStream_t stream) { + cublasSetStream(handle, stream); } void gpu_synchronize() { @@ -89,75 +92,80 @@ void gpu_synchronize() { /* BLAS functions */ -void gpu_blas_create(void** handle) { - cublasHandle_t cublas_handle; - cublasStatus_t rc = cublasCreate(&cublas_handle); +void gpu_blas_create(cublasHandle_t* ptr) { + cublasStatus_t rc = cublasCreate(ptr); assert (rc == CUBLAS_STATUS_SUCCESS); - *handle = (void*) cublas_handle; } -void gpu_blas_destroy(void** handle) { - assert (*handle != NULL); - cublasStatus_t rc = cublasDestroy( (cublasHandle_t) *handle); +void gpu_blas_destroy(cublasHandle_t* ptr) { + assert (ptr != NULL); + cublasStatus_t rc = cublasDestroy(*ptr); assert (rc == CUBLAS_STATUS_SUCCESS); - *handle = NULL; + ptr = NULL; } -void gpu_ddot(void** handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { - assert (*handle != NULL); +void gpu_ddot(cublasHandle_t handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { + assert (handle != NULL); + /* Convert to int */ + int n_, incx_, incy_; - /* Convert to int32_t 
*/ - int32_t n_, incx_, incy_; + n_ = (int) n; + incx_ = (int) incx; + incy_ = (int) incy; - n_ = (int32_t) n; - incx_ = (int32_t) incx; - incy_ = (int32_t) incy; + assert ( (int64_t) n_ == n ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + cublasStatus_t rc = cublasDdot(handle, n_, x, incx_, y, incy_, result); +/* + double alpha = 1.0; + double beta = 0.0; + cublasStatus_t rc = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, n_, &alpha, x, 1, y, n_, &beta, &result_, 1); +*/ + assert (rc == CUBLAS_STATUS_SUCCESS); +} + + + +void gpu_sdot(cublasHandle_t handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { + assert (handle != NULL); + + /* Convert to int */ + int n_, incx_, incy_; + + n_ = (int) n; + incx_ = (int) incx; + incy_ = (int) incy; /* Check for integer overflows */ assert ( (int64_t) n_ == n ); assert ( (int64_t) incx_ == incx); assert ( (int64_t) incy_ == incy); - cublasDdot((cublasHandle_t) *handle, n_, x, incx_, y, incy_, result); + float result_ = 0.; + cublasStatus_t rc = cublasSdot(handle, n_, x, incx_, y, incy_, &result_); + assert (rc == CUBLAS_STATUS_SUCCESS); + *result = result_; } -void gpu_sdot(void** handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { - assert (*handle != NULL); - - /* Convert to int32_t */ - int32_t n_, incx_, incy_; - - n_ = (int32_t) n; - incx_ = (int32_t) incx; - incy_ = (int32_t) incy; - - /* Check for integer overflows */ - assert ( (int64_t) n_ == n ); - assert ( (int64_t) incx_ == incx); - assert ( (int64_t) incy_ == incy); - - cublasSdot((cublasHandle_t) *handle, n_, x, incx_, y, incy_, result); -} - - - -void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t n, const double alpha, +void gpu_dgemv(cublasHandle_t handle, const char transa, const int64_t m, const int64_t n, const double alpha, const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) { - assert (*handle != NULL); + assert (handle != NULL); - /* Convert to int32_t */ - int32_t m_, n_, lda_, incx_, incy_; + /* Convert to int */ + int m_, n_, lda_, incx_, incy_; - m_ = (int32_t) m; - n_ = (int32_t) n; - lda_ = (int32_t) lda; - incx_ = (int32_t) incx; - incy_ = (int32_t) incy; + m_ = (int) m; + n_ = (int) n; + lda_ = (int) lda; + incx_ = (int) incx; + incy_ = (int) incy; /* Check for integer overflows */ assert ( (int64_t) m_ == m ); @@ -169,24 +177,24 @@ void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t cublasOperation_t transa_ = CUBLAS_OP_N; if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; - cublasDgemv((cublasHandle_t) *handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_); + cublasDgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_); } -void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t n, const float alpha, +void gpu_sgemv(cublasHandle_t handle, const char transa, const int64_t m, const int64_t n, const float alpha, const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) { - assert (*handle != NULL); + assert (handle != NULL); - /* Convert to int32_t */ - int32_t m_, n_, lda_, incx_, incy_; + /* Convert to int */ + int m_, n_, lda_, incx_, incy_; - m_ = (int32_t) m; - n_ = (int32_t) n; - lda_ = (int32_t) lda; - incx_ = (int32_t) incx; - incy_ = (int32_t) incy; + m_ = 
(int) m; + n_ = (int) n; + lda_ = (int) lda; + incx_ = (int) incx; + incy_ = (int) incy; /* Check for integer overflows */ assert ( (int64_t) m_ == m ); @@ -198,24 +206,24 @@ void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t cublasOperation_t transa_ = CUBLAS_OP_N; if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; - cublasSgemv((cublasHandle_t) *handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_); + cublasSgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_); } -void gpu_dgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, +void gpu_dgemm(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) { - assert (*handle != NULL); + assert (handle != NULL); - /* Convert to int32_t */ - int32_t m_, n_, k_, lda_, ldb_, ldc_; + /* Convert to int */ + int m_, n_, k_, lda_, ldb_, ldc_; - m_ = (int32_t) m; - n_ = (int32_t) n; - k_ = (int32_t) k; - lda_ = (int32_t) lda; - ldb_ = (int32_t) ldb; - ldc_ = (int32_t) ldc; + m_ = (int) m; + n_ = (int) n; + k_ = (int) k; + lda_ = (int) lda; + ldb_ = (int) ldb; + ldc_ = (int) ldc; /* Check for integer overflows */ assert ( (int64_t) m_ == m ); @@ -230,25 +238,25 @@ void gpu_dgemm(void** handle, const char transa, const char transb, const int64_ if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; - cublasDgemm((cublasHandle_t) *handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_); + cublasDgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_); } -void gpu_sgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, +void gpu_sgemm(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) { - assert (*handle != NULL); + assert (handle != NULL); - /* Convert to int32_t */ - int32_t m_, n_, k_, lda_, ldb_, ldc_; + /* Convert to int */ + int m_, n_, k_, lda_, ldb_, ldc_; - m_ = (int32_t) m; - n_ = (int32_t) n; - k_ = (int32_t) k; - lda_ = (int32_t) lda; - ldb_ = (int32_t) ldb; - ldc_ = (int32_t) ldc; + m_ = (int) m; + n_ = (int) n; + k_ = (int) k; + lda_ = (int) lda; + ldb_ = (int) ldb; + ldc_ = (int) ldc; /* Check for integer overflows */ assert ( (int64_t) m_ == m ); @@ -263,22 +271,22 @@ void gpu_sgemm(void** handle, const char transa, const char transb, const int64_ if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; - cublasSgemm((cublasHandle_t) *handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_); + cublasSgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_); } -void gpu_dgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, +void gpu_dgeam(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const 
int64_t ldc) { - assert (*handle != NULL); + assert (handle != NULL); - /* Convert to int32_t */ - int32_t m_, n_, lda_, ldb_, ldc_; + /* Convert to int */ + int m_, n_, lda_, ldb_, ldc_; - m_ = (int32_t) m; - n_ = (int32_t) n; - lda_ = (int32_t) lda; - ldb_ = (int32_t) ldb; - ldc_ = (int32_t) ldc; + m_ = (int) m; + n_ = (int) n; + lda_ = (int) lda; + ldb_ = (int) ldb; + ldc_ = (int) ldc; /* Check for integer overflows */ assert ( (int64_t) m_ == m ); @@ -292,23 +300,23 @@ void gpu_dgeam(void** handle, const char transa, const char transb, const int64_ if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; - cublasDgeam((cublasHandle_t) *handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_); + cublasDgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_); } -void gpu_sgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, +void gpu_sgeam(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) { - assert (*handle != NULL); + assert (handle != NULL); - /* Convert to int32_t */ - int32_t m_, n_, lda_, ldb_, ldc_; + /* Convert to int */ + int m_, n_, lda_, ldb_, ldc_; - m_ = (int32_t) m; - n_ = (int32_t) n; - lda_ = (int32_t) lda; - ldb_ = (int32_t) ldb; - ldc_ = (int32_t) ldc; + m_ = (int) m; + n_ = (int) n; + lda_ = (int) lda; + ldb_ = (int) ldb; + ldc_ = (int) ldc; /* Check for integer overflows */ assert ( (int64_t) m_ == m ); @@ -322,6 +330,6 @@ void gpu_sgeam(void** handle, const char transa, const char transb, const int64_ if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; - cublasSgeam((cublasHandle_t) *handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_); + cublasSgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_); } diff --git a/plugins/local/gpu_x86/gpu.c b/plugins/local/gpu_x86/gpu.c index ac7c3620..53267a7c 100644 --- a/plugins/local/gpu_x86/gpu.c +++ b/plugins/local/gpu_x86/gpu.c @@ -2,8 +2,12 @@ #include #include #include +#include #include +bool no_gpu() { + return true; +} /* Generic functions */ @@ -56,7 +60,7 @@ void gpu_stream_destroy(void** ptr) { *ptr = NULL; } -void gpu_set_stream(void** handle, void** stream) { +void gpu_set_stream(void* handle, void* stream) { return; } @@ -79,8 +83,8 @@ void gpu_blas_destroy(void** handle) { double ddot_(const int32_t* n, const double* x, const int32_t* incx, const double* y, const int32_t* incy); -void gpu_ddot(void** handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { - assert (*handle != NULL); +void gpu_ddot(void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { + assert (handle != NULL); /* Convert to int32_t */ int32_t n_, incx_, incy_; @@ -100,8 +104,8 @@ void gpu_ddot(void** handle, const int64_t n, const double* x, const int64_t inc float sdot_(const int32_t* n, const float* x, const int32_t* incx, const float* y, const int32_t* incy); -void gpu_sdot(void** handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { - assert (*handle != NULL); +void gpu_sdot(void* handle, const int64_t n, 
const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { + assert (handle != NULL); /* Convert to int32_t */ int32_t n_, incx_, incy_; @@ -122,10 +126,10 @@ void gpu_sdot(void** handle, const int64_t n, const float* x, const int64_t incx void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha, const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy); -void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t n, const double alpha, +void gpu_dgemv(void* handle, const char transa, const int64_t m, const int64_t n, const double alpha, const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) { - assert (*handle != NULL); + assert (handle != NULL); /* Convert to int32_t */ int32_t m_, n_, lda_, incx_, incy_; @@ -150,10 +154,10 @@ void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha, const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy); -void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t n, const float alpha, +void gpu_sgemv(void* handle, const char transa, const int64_t m, const int64_t n, const float alpha, const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) { - assert (*handle != NULL); + assert (handle != NULL); /* Convert to int32_t */ int32_t m_, n_, lda_, incx_, incy_; @@ -178,10 +182,10 @@ void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha, const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc); -void gpu_dgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, +void gpu_dgemm(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) { - assert (*handle != NULL); + assert (handle != NULL); /* Convert to int32_t */ int32_t m_, n_, k_, lda_, ldb_, ldc_; @@ -209,10 +213,10 @@ void gpu_dgemm(void** handle, const char transa, const char transb, const int64_ void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha, const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc); -void gpu_sgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, +void gpu_sgemm(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) { - assert (*handle != NULL); + assert (handle != NULL); /* Convert to int32_t */ int32_t m_, n_, k_, lda_, ldb_, ldc_; @@ -236,9 +240,9 @@ void gpu_sgemm(void** handle, const char transa, const char transb, const int64_ } -void 
gpu_dgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, +void gpu_dgeam(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { - assert (*handle != NULL); + assert (handle != NULL); if ( (transa == 'N' && transb == 'N') || (transa == 'n' && transb == 'N') || @@ -368,9 +372,9 @@ void gpu_dgeam(void** handle, const char transa, const char transb, const int64_ } -void gpu_sgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, +void gpu_sgeam(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) { - assert (*handle != NULL); + assert (handle != NULL); if ( (transa == 'N' && transb == 'N') || (transa == 'n' && transb == 'N') || diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 5c2daa05..5ee7366e 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -14,9 +14,15 @@ subroutine run_ccsd_space_orb type(gpu_double2) :: t1, r1 type(gpu_double2) :: H_oo, H_vv, H_vo - type(gpu_double2) :: d_cc_space_f_vo + type(gpu_double2) :: d_cc_space_f_oo, d_cc_space_f_vo + type(gpu_double2) :: d_cc_space_f_ov, d_cc_space_f_vv + + type(gpu_double3) :: d_cc_space_v_oo_chol, d_cc_space_v_vo_chol + type(gpu_double3) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol + type(gpu_double4) :: d_cc_space_v_oovv + double precision, allocatable :: all_err(:,:), all_t(:,:) integer, allocatable :: list_occ(:), list_vir(:) integer(bit_kind) :: det(N_int,2) @@ -24,7 +30,7 @@ subroutine run_ccsd_space_orb call set_multiple_levels_omp(.False.) - if (do_ao_cholesky) then + if (do_mo_cholesky) then PROVIDE cholesky_mo_transp FREE cholesky_ao else @@ -55,11 +61,36 @@ subroutine run_ccsd_space_orb !print*,'occ',list_occ !print*,'vir',list_vir + ! GPU arrays + call gpu_allocate(d_cc_space_f_oo, nO, nO) call gpu_allocate(d_cc_space_f_vo, nV, nO) - call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV) - call gpu_upload(cc_space_f_vo, d_cc_space_f_vo) - call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv) + call gpu_allocate(d_cc_space_f_ov, nO, nV) + call gpu_allocate(d_cc_space_f_vv, nV, nV) + call gpu_upload(cc_space_f_oo, d_cc_space_f_oo) + call gpu_upload(cc_space_f_vo, d_cc_space_f_vo) + call gpu_upload(cc_space_f_vv, d_cc_space_f_vv) + +! FREE cc_space_f_oo +! FREE cc_space_f_vo +! FREE cc_space_f_vv + + if (do_mo_cholesky) then + call gpu_allocate(d_cc_space_v_oo_chol, cholesky_mo_num, nO, nO) + call gpu_allocate(d_cc_space_v_ov_chol, cholesky_mo_num, nO, nV) + call gpu_allocate(d_cc_space_v_vo_chol, cholesky_mo_num, nV, nO) + call gpu_allocate(d_cc_space_v_vv_chol, cholesky_mo_num, nV, nV) + + call gpu_upload(cc_space_v_oo_chol, d_cc_space_v_oo_chol) + call gpu_upload(cc_space_v_ov_chol, d_cc_space_v_ov_chol) + call gpu_upload(cc_space_v_vo_chol, d_cc_space_v_vo_chol) + call gpu_upload(cc_space_v_vv_chol, d_cc_space_v_vv_chol) + +! FREE cc_space_v_oo_chol +! FREE cc_space_v_ov_chol +! FREE cc_space_v_vo_chol +! 
FREE cc_space_v_vv_chol + endif call gpu_allocate(t2, nO,nO,nV,nV) call gpu_allocate(r2, nO,nO,nV,nV) @@ -120,6 +151,13 @@ subroutine run_ccsd_space_orb call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,h_t2) call gpu_upload(h_t2, t2) + + call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV) + call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv) + +! FREE cc_space_v_oovv + + call update_tau_space(nO,nV,h_t1,t1,t2,tau) call update_tau_x_space(nO,nV,tau,tau_x) !print*,'hf_energy', hf_energy @@ -142,10 +180,10 @@ subroutine run_ccsd_space_orb do while (not_converged) ! Residue - if (do_ao_cholesky) then -! if (.False.) then - call compute_H_oo_chol(nO,nV,tau_x,H_oo) - call compute_H_vv_chol(nO,nV,tau_x%f,H_vv%f) + if (do_mo_cholesky) then + call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & + d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo) + call compute_H_vv_chol(nO,nV,tau_x,H_vv) call compute_H_vo_chol(nO,nV,t1%f,H_vo%f) call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1) @@ -249,6 +287,12 @@ subroutine run_ccsd_space_orb call save_energy(uncorr_energy + energy, e_t) deallocate(h_t1, h_t2) + if (do_mo_cholesky) then + call gpu_deallocate(d_cc_space_v_oo_chol) + call gpu_deallocate(d_cc_space_v_ov_chol) + call gpu_deallocate(d_cc_space_v_vo_chol) + call gpu_deallocate(d_cc_space_v_vv_chol) + endif call gpu_deallocate(d_cc_space_f_vo) call gpu_deallocate(d_cc_space_v_oovv) call gpu_deallocate(t1) @@ -302,8 +346,21 @@ subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1, ! !$omp end parallel - call gpu_ddot(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1, d_cc_space_v_oovv, 1, energy) - call gpu_ddot(blas_handle, nO*nV*1_8, d_cc_space_f_vo, 1, t1, 1, e) + type(gpu_stream) :: s1, s2 + call gpu_stream_create(s1) + call gpu_stream_create(s2) + + call gpu_set_stream(blas_handle,s1) + call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo, 1, t1, 1, e) + + call gpu_set_stream(blas_handle,s2) + call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1_8, d_cc_space_v_oovv, 1_8, energy) + call gpu_synchronize() + call gpu_set_stream(blas_handle,gpu_default_stream) + + call gpu_stream_destroy(s1) + call gpu_stream_destroy(s2) + energy = energy + 2.d0*e end @@ -346,32 +403,29 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau) type(gpu_stream) :: stream(nV) - do b=1,nV - call gpu_stream_create(stream(b)) - enddo - - !$OMP PARALLEL & + !$OMP PARALLEL if (no_gpu()) & !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) !$OMP DO do b=1,nV + call gpu_stream_create(stream(b)) call gpu_set_stream(blas_handle,stream(b)) do j=1,nO - call gpu_dgeam(blas_handle, 'N', 'N', nO*1_8, nV*1_8, & - 1.d0, t2%f(1,j,1,b), nO*nO*1_8, & - h_t1(j,b), t1%f, nO*1_8, & - tau%f(1,j,1,b), nO*nO*1_8) + call gpu_dgeam_f(blas_handle, 'N', 'N', nO, nV, & + 1.d0, t2%f(1,j,1,b), nO*nO, & + h_t1(j,b), t1%f, nO, & + tau%f(1,j,1,b), nO*nO) enddo enddo !$OMP END DO !$OMP END PARALLEL - call gpu_synchronize() - do b=1,nV call gpu_stream_destroy(stream(b)) enddo + call gpu_set_stream(blas_handle,gpu_default_stream) + end @@ -412,7 +466,7 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x) call gpu_stream_create(stream(a)) enddo - !$OMP PARALLEL & + !$OMP PARALLEL if (no_gpu()) & !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) @@ -420,20 +474,20 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x) do b=1,nV do a=1,nV call gpu_set_stream(blas_handle,stream(a)) - call gpu_dgeam(blas_handle, 'N', 'N', 
nO*1_8, nO*1_8, & - 2.d0, tau%f(1,1,a,b), nO*1_8, & - -1.d0, tau%f(1,1,b,a), nO*1_8, & - tau_x%f(1,1,a,b), nO*1_8) + call gpu_dgeam_f(blas_handle, 'N', 'N', nO, nO, & + 2.d0, tau%f(1,1,a,b), nO, & + -1.d0, tau%f(1,1,b,a), nO, & + tau_x%f(1,1,a,b), nO) enddo enddo !$OMP END DO !$OMP END PARALLEL - call gpu_synchronize() - do b=1,nV call gpu_stream_destroy(stream(b)) enddo + call gpu_set_stream(blas_handle,gpu_default_stream) + end diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index 9b161001..288724f3 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -293,62 +293,115 @@ end ! H_oo -subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo) +subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & + d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo) use gpu implicit none integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: d_cc_space_f_oo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol type(gpu_double4), intent(in) :: tau_x type(gpu_double2), intent(out) :: H_oo integer :: a,b,i,j,u,k - double precision, allocatable :: tau_kau(:,:,:), tmp_vov(:,:,:) + type(gpu_double3) :: tau_kau, tmp_vov, tmp_ovv - allocate(tau_kau(cholesky_mo_num,nV,nO)) - !$omp parallel & - !$omp default(shared) & - !$omp private(i,u,j,k,a,b,tmp_vov) - allocate(tmp_vov(nV,nO,nV) ) - !$omp do - do u = 1, nO + call gpu_allocate(tau_kau, cholesky_mo_num, nV, nO) + +! !$omp parallel & +! !$omp default(shared) & +! !$omp private(i,u,j,k,a,b,tmp_vov) +! call gpu_allocate(tmp_vov, nV, nO, nV) +! !$omp do +! do u = 1, nO +! do b=1,nV +! do j=1,nO +! do a=1,nV +! tmp_vov%f(a,j,b) = tau_x%f(u,j,a,b) +! enddo +! enddo +! enddo +! call dgemm('N','T',cholesky_mo_num,nV,nO*nV,1.d0, & +! d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, tmp_vov%f, nV, & +! 0.d0, tau_kau%f(1,1,u), cholesky_mo_num) +! enddo +! !$omp end do nowait +! call gpu_deallocate(tmp_vov) +! !$omp do +! do i = 1, nO +! do u = 1, nO +! H_oo%f(u,i) = d_cc_space_f_oo%f(u,i) +! enddo +! enddo +! !$omp end do nowait +! +! !$omp barrier +! !$omp end parallel +! call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & +! tau_kau%f(1,1,1), cholesky_mo_num*nV, d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num*nV, & +! 1.d0, H_oo%f(1,1), nO) +! 
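The gpu_dgeam_f calls in update_tau_x_space above build each slice tau_x(:,:,a,b) = 2*tau(:,:,a,b) - tau(:,:,b,a) in a single out-of-place scaled addition; the (b,a) swap is done by choosing the second input pointer, so both operands stay in 'N' form. The gpu_nvidia backend shown earlier in this series implements gpu_dgeam with cublasDgeam, so one slice amounts to the following sketch (the helper name and argument layout are illustrative, not from the patch):

    /* One tau_x slice: C = 2*A + (-1)*B, all nO x nO, column-major. */
    #include <cublas_v2.h>
    #include <assert.h>

    void tau_x_slice(cublasHandle_t h, int nO,
                     const double* tau_ab,   /* tau(:,:,a,b), ld = nO */
                     const double* tau_ba,   /* tau(:,:,b,a), ld = nO */
                     double* tau_x_ab) {     /* output slice          */
      const double two = 2.0, minus_one = -1.0;
      /* No transpose needed: the index swap is in the pointer choice. */
      cublasStatus_t rc = cublasDgeam(h, CUBLAS_OP_N, CUBLAS_OP_N, nO, nO,
                                      &two, tau_ab, nO,
                                      &minus_one, tau_ba, nO,
                                      tau_x_ab, nO);
      assert (rc == CUBLAS_STATUS_SUCCESS);
    }

The stream-per-column H_oo implementation of this patch resumes below.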
+ + type(gpu_stream) :: stream(nV) + + do b=1,nV + call gpu_stream_create(stream(b)) + enddo + + !$OMP PARALLEL if (no_gpu()) & + !$OMP DEFAULT(SHARED) & + !$OMP PRIVATE(u,b,tmp_vov,tmp_ovv) + + call gpu_allocate(tmp_vov, nV, nO, nV) + call gpu_allocate(tmp_ovv, nO, nV, nV) + + !$OMP DO + do u=1,nO + call gpu_dgeam_f(blas_handle, 'N', 'N', 1, nO*nV*nV, 1.d0, & + tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f, nO, tmp_ovv%f, 1) do b=1,nV - do j=1,nO - do a=1,nV - tmp_vov(a,j,b) = tau_x%f(u,j,a,b) - enddo - enddo + call gpu_set_stream(blas_handle,stream(b)) + call gpu_dgeam_f(blas_handle, 'T', 'T', nV, nO, 1.d0, & + tmp_ovv%f(1,1,b), nO, 0.d0, & + tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV) enddo - call dgemm('N','T',cholesky_mo_num,nV,nO*nV,1.d0, & - cc_space_v_ov_chol, cholesky_mo_num, tmp_vov, nV, & - 0.d0, tau_kau(1,1,u), cholesky_mo_num) + call gpu_dgemm_f(blas_handle, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, & + d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_vov%f, nV, & + 0.d0, tau_kau%f(1,1,u), cholesky_mo_num) + call gpu_synchronize() enddo - !$omp end do nowait - deallocate(tmp_vov) - !$omp do - do i = 1, nO - do u = 1, nO - H_oo%f(u,i) = cc_space_f_oo(u,i) - enddo - enddo - !$omp end do nowait - !$omp barrier - !$omp end parallel - call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & - tau_kau, cholesky_mo_num*nV, cc_space_v_vo_chol, cholesky_mo_num*nV, & - 1.d0, H_oo%f, nO) + !$OMP END DO + call gpu_deallocate(tmp_vov) + call gpu_deallocate(tmp_ovv) + !$OMP END PARALLEL + + do b=1,nV + call gpu_stream_destroy(stream(b)) + enddo + + call gpu_set_stream(blas_handle,gpu_default_stream) + + call gpu_copy(d_cc_space_f_oo, H_oo) + + call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & + tau_kau, cholesky_mo_num*nV, d_cc_space_v_vo_chol, cholesky_mo_num*nV, & + 1.d0, H_oo, nO) + + call gpu_deallocate(tau_kau) end ! H_vv subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv) - + use gpu implicit none integer, intent(in) :: nO,nV - double precision, intent(in) :: tau_x(nO, nO, nV, nV) - double precision, intent(out) :: H_vv(nV, nV) + type(gpu_double4), intent(in) :: tau_x + type(gpu_double2), intent(out) :: H_vv integer :: a,b,i,j,u,k, beta @@ -364,7 +417,7 @@ subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv) do b=1,nV do j=1,nO do i=1,nO - tmp_oov(i,j,b) = tau_x(i,j,a,b) + tmp_oov(i,j,b) = tau_x%f(i,j,a,b) enddo enddo enddo @@ -378,7 +431,7 @@ subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv) !$omp do do beta = 1, nV do a = 1, nV - H_vv(a,beta) = cc_space_f_vv(a,beta) + H_vv%f(a,beta) = cc_space_f_vv(a,beta) enddo enddo !$omp end do nowait @@ -386,7 +439,7 @@ subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv) !$omp end parallel call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & tau_kia, cholesky_mo_num*nO, cc_space_v_ov_chol, cholesky_mo_num*nO, & - 1.d0, H_vv, nV) + 1.d0, H_vv%f, nV) end diff --git a/src/gpu/gpu.irp.f b/src/gpu/gpu.irp.f index e91d66f5..6ad0a075 100644 --- a/src/gpu/gpu.irp.f +++ b/src/gpu/gpu.irp.f @@ -8,4 +8,11 @@ BEGIN_PROVIDER [ type(gpu_blas), blas_handle ] call gpu_blas_create(blas_handle) END_PROVIDER +BEGIN_PROVIDER [ type(gpu_stream), gpu_default_stream ] + implicit none + BEGIN_DOC + ! Default stream + END_DOC + gpu_default_stream%c = C_NULL_PTR +END_PROVIDER diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90 index ecf79c83..2676b339 100644 --- a/src/gpu/gpu_module.F90 +++ b/src/gpu/gpu_module.F90 @@ -49,7 +49,12 @@ module gpu ! 
------------ interface + logical(c_bool) function no_gpu() bind(C) + import + end function + integer function gpu_ndevices() bind(C) + import end function subroutine gpu_set_device(id) bind(C) @@ -101,7 +106,7 @@ module gpu subroutine gpu_set_stream_c(handle, stream) bind(C, name='gpu_set_stream') import - type(c_ptr) :: handle, stream + type(c_ptr), value :: handle, stream end subroutine subroutine gpu_synchronize() bind(C) @@ -120,15 +125,15 @@ module gpu subroutine gpu_ddot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_ddot') import - type(c_ptr), intent(in) :: handle + type(c_ptr), value, intent(in) :: handle integer(c_int64_t), value :: n, incx, incy - type(c_ptr), intent(in), value :: dx, dy + type(c_ptr), value :: dx, dy real(c_double), intent(out) :: res end subroutine subroutine gpu_sdot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_sdot') import - type(c_ptr), intent(in) :: handle + type(c_ptr), value, intent(in) :: handle integer(c_int64_t), value :: n, incx, incy type(c_ptr), intent(in), value :: dx, dy real(c_float), intent(out) :: res @@ -137,8 +142,8 @@ module gpu subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, & b, ldb, c, ldc) bind(C, name='gpu_dgeam') import - type(c_ptr), intent(in) :: handle - character(c_char), intent(in), value :: transa, transb + type(c_ptr), value, intent(in) :: handle + character(c_char), intent(in), value :: transa, transb integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc real(c_double), intent(in), value :: alpha, beta type(c_ptr), value :: a, b, c @@ -147,13 +152,33 @@ module gpu subroutine gpu_sgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, & b, ldb, c, ldc) bind(C, name='gpu_sgeam') import - type(c_ptr), intent(in) :: handle - character(c_char), intent(in), value :: transa, transb + type(c_ptr), value, intent(in) :: handle + character(c_char), intent(in), value :: transa, transb integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc real(c_float), intent(in), value :: alpha, beta type(c_ptr), value :: a, b, c end subroutine + subroutine gpu_dgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) bind(C, name='gpu_dgemm') + import + type(c_ptr), value, intent(in) :: handle + character(c_char), intent(in), value :: transa, transb + integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc + real(c_double), intent(in), value :: alpha, beta + type(c_ptr), value :: a, b, c + end subroutine + + subroutine gpu_sgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) bind(C, name='gpu_sgemm') + import + type(c_ptr), value, intent(in) :: handle + character(c_char), intent(in), value :: transa, transb + integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc + real(c_float), intent(in), value :: alpha, beta + type(c_ptr), value :: a, b, c + end subroutine + end interface @@ -161,20 +186,26 @@ module gpu ! 
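The recurring change in these bindings, adding the VALUE attribute to type(c_ptr) and character(c_char) dummy arguments, is the interoperability point of this series: without VALUE, Fortran passes the address of the c_ptr, so C sees one extra level of indirection (void**); with VALUE, C receives the pointer itself (void*). A minimal sketch of the rule, with made-up function names:

    /* Sketch only: how a bind(C) type(c_ptr) dummy arrives in C.
     *   type(c_ptr)        :: h   ->  C prototype: void** h   (old wrappers)
     *   type(c_ptr), value :: h   ->  C prototype: void*  h   (new wrappers)
     */
    #include <stdio.h>

    void handle_by_reference(void** h) { printf("%p\n", *h); }  /* old */
    void handle_by_value    (void*  h) { printf("%p\n",  h); }  /* new */

This is also why the C implementations can drop the (cublasHandle_t) *handle dereference at the same time. The allocator interfaces continue below.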
---------------------- interface gpu_allocate - procedure gpu_allocate_double1 & - ,gpu_allocate_double2 & - ,gpu_allocate_double3 & - ,gpu_allocate_double4 & - ,gpu_allocate_double5 & - ,gpu_allocate_double6 + procedure gpu_allocate_double1 & + ,gpu_allocate_double2 & + ,gpu_allocate_double3 & + ,gpu_allocate_double4 & + ,gpu_allocate_double5 & + ,gpu_allocate_double6 & + ,gpu_allocate_double1_64 & + ,gpu_allocate_double2_64 & + ,gpu_allocate_double3_64 & + ,gpu_allocate_double4_64 & + ,gpu_allocate_double5_64 & + ,gpu_allocate_double6_64 end interface gpu_allocate interface gpu_deallocate - procedure gpu_deallocate_double1 & - ,gpu_deallocate_double2 & - ,gpu_deallocate_double3 & - ,gpu_deallocate_double4 & - ,gpu_deallocate_double5 & + procedure gpu_deallocate_double1 & + ,gpu_deallocate_double2 & + ,gpu_deallocate_double3 & + ,gpu_deallocate_double4 & + ,gpu_deallocate_double5 & ,gpu_deallocate_double6 end interface gpu_deallocate @@ -267,6 +298,61 @@ module gpu end subroutine + subroutine gpu_allocate_double1_64(ptr, s) + implicit none + type(gpu_double1), intent(inout) :: ptr + integer*8, intent(in) :: s + + call gpu_allocate_c(ptr%c, s) + call c_f_pointer(ptr%c, ptr%f, (/ s /)) + end subroutine + + subroutine gpu_allocate_double2_64(ptr, s1, s2) + implicit none + type(gpu_double2), intent(inout) :: ptr + integer*8, intent(in) :: s1, s2 + + call gpu_allocate_c(ptr%c, s1*s2*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /)) + end subroutine + + subroutine gpu_allocate_double3_64(ptr, s1, s2, s3) + implicit none + type(gpu_double3), intent(inout) :: ptr + integer*8, intent(in) :: s1, s2, s3 + + call gpu_allocate_c(ptr%c, s1*s2*s3*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /)) + end subroutine + + subroutine gpu_allocate_double4_64(ptr, s1, s2, s3, s4) + implicit none + type(gpu_double4), intent(inout) :: ptr + integer*8, intent(in) :: s1, s2, s3, s4 + + call gpu_allocate_c(ptr%c, s1*s2*s3*s4*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /)) + end subroutine + + subroutine gpu_allocate_double5_64(ptr, s1, s2, s3, s4, s5) + implicit none + type(gpu_double5), intent(inout) :: ptr + integer*8, intent(in) :: s1, s2, s3, s4, s5 + + call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /)) + end subroutine + + subroutine gpu_allocate_double6_64(ptr, s1, s2, s3, s4, s5, s6) + implicit none + type(gpu_double6), intent(inout) :: ptr + integer*8, intent(in) :: s1, s2, s3, s4, s5, s6 + + call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*s6*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /)) + end subroutine + + ! gpu_deallocate ! 
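The new *_64 allocators take integer*8 extents and form the byte count with the 8-byte literal 8_8, so products such as s1*s2*s3*s4 cannot wrap in 32-bit arithmetic; for CCSD-sized arrays (nO*nO*nV*nV doubles) the count easily exceeds 2^31. A hedged C equivalent of the same precaution (alloc_double3 is a placeholder, not a function from this patch):

    #include <stdint.h>
    #include <stdlib.h>
    #include <assert.h>

    /* Form the size in 64-bit before narrowing; the assert plays the
       role of the overflow checks in the BLAS wrappers above. */
    void* alloc_double3(int64_t s1, int64_t s2, int64_t s3) {
      int64_t bytes = s1 * s2 * s3 * (int64_t) sizeof(double);
      assert (bytes > 0);
      return malloc((size_t) bytes);   /* stands in for gpu_allocate_c */
    }

The matching gpu_deallocate section follows.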
-------------- @@ -494,19 +580,38 @@ end module subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) use gpu type(gpu_blas), intent(in) :: handle - integer*8 :: n, incx, incy - double precision, target, intent(in) :: dx(*), dy(*) - double precision, intent(out) :: res - call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res) + integer*4 :: n, incx, incy + type(gpu_double1), intent(in) :: dx, dy + double precision, intent(out) :: res + call gpu_ddot_c(handle%c, int(n,c_int64_t), dx%c, int(incx,c_int64_t), dy%c, int(incy,c_int64_t), res) end subroutine -subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res) +subroutine gpu_ddot_f(handle, n, dx, incx, dy, incy, res) + use gpu + type(gpu_blas), intent(in) :: handle + integer*4 :: n, incx, incy + double precision, target :: dx(*), dy(*) + double precision, intent(out) :: res + call gpu_ddot_c(handle%c, int(n,c_int64_t), c_loc(dx), int(incx,c_int64_t), c_loc(dy), int(incy,c_int64_t), res) +end subroutine + + +subroutine gpu_ddot_64(handle, n, dx, incx, dy, incy, res) use gpu type(gpu_blas), intent(in) :: handle integer*8 :: n, incx, incy - real, target, intent(in) :: dx(*), dy(*) - real, intent(out) :: res - call gpu_sdot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res) + type(gpu_double1), intent(in) :: dx, dy + double precision, intent(out) :: res + call gpu_ddot_c(handle%c, n, dx%c, incx, dy%c, incy, res) +end subroutine + +subroutine gpu_ddot_f_64(handle, n, dx, incx, dy, incy, res) + use gpu + type(gpu_blas), intent(in) :: handle + integer*8 :: n, incx, incy + double precision, target :: dx(*), dy(*) + double precision, intent(out) :: res + call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res) end subroutine @@ -518,22 +623,103 @@ subroutine gpu_dgeam(handle, transa, transb, m, n, alpha, a, lda, beta, & use gpu type(gpu_blas), intent(in) :: handle character, intent(in) :: transa, transb - integer*8, intent(in) :: m, n, lda, ldb, ldc + integer*4, intent(in) :: m, n, lda, ldb, ldc double precision, intent(in) :: alpha, beta - double precision, target :: a(lda,*), b(ldb,*), c(ldc,*) - call gpu_dgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, & - c_loc(b), ldb, c_loc(c), ldc) + type(gpu_double2) :: a, b, c + call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, a%c, int(lda,c_int64_t), beta, & + b%c, int(ldb,c_int64_t), c%c, int(ldc,c_int64_t)) end subroutine -subroutine gpu_sgeam(handle, transa, transb, m, n, alpha, a, lda, beta, & + +subroutine gpu_dgeam_f(handle, transa, transb, m, n, alpha, a, lda, beta, & b, ldb, c, ldc) - use gpu + use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*4, intent(in) :: m, n, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision, target :: a(*), b(*), c(*) + call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, & + c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t)) +end subroutine + + +subroutine gpu_dgeam_64(handle, transa, transb, m, n, alpha, a, lda, beta, & + b, ldb, c, ldc) + use gpu type(gpu_blas), intent(in) :: handle character, intent(in) :: transa, transb integer*8, intent(in) :: m, n, lda, ldb, ldc - real, intent(in) :: alpha, beta - real, target :: a(lda,*), b(ldb,*), c(ldc,*) - call gpu_sgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, & - c_loc(b), ldb, c_loc(c), ldc) + double precision, intent(in) :: alpha, beta + type(gpu_double2) :: a, b, c + 
call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, a%c, int(lda,c_int64_t), beta, & + b%c, int(ldb,c_int64_t), c%c, int(ldc,c_int64_t)) +end subroutine + + +subroutine gpu_dgeam_f_64(handle, transa, transb, m, n, alpha, a, lda, beta, & + b, ldb, c, ldc) + use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*8, intent(in) :: m, n, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision, target :: a(*), b(*), c(*) + call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, & + c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t)) +end subroutine + + +! gemm +! ---- + +subroutine gpu_dgemm(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) + use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*4, intent(in) :: m, n, k, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + type(gpu_double2) :: a, b, c + call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), & + alpha, a%c, int(lda,c_int64_t), & + b%c, int(ldb,c_int64_t), beta, c%c, int(ldc,c_int64_t)) +end subroutine + +subroutine gpu_dgemm_64(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) + use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*8, intent(in) :: m, n, k, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + type(gpu_double2) :: a, b, c + call gpu_dgemm_c(handle%c, transa, transb, m, n, k, & + alpha, a%c, lda, b%c, ldb, beta, c%c, ldc) +end subroutine + +subroutine gpu_dgemm_f(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) + use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*4, intent(in) :: m, n, k, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision, target :: a(*), b(*), c(*) + call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), & + alpha, c_loc(a), int(lda,c_int64_t), & + c_loc(b), int(ldb,c_int64_t), beta, c_loc(c), int(ldc,c_int64_t)) +end subroutine + +subroutine gpu_dgemm_f_64(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) + use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*8, intent(in) :: m, n, k, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision, target :: a(*), b(*), c(*) + call gpu_dgemm_c(handle%c, transa, transb, m, n, k, & + alpha, c_loc(a), lda, c_loc(b), ldb, beta, c_loc(c), ldc) end subroutine From d3c1994c64ed9ae9914ce605a6b7c364ac518d9b Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Mon, 1 Jul 2024 18:04:48 +0200 Subject: [PATCH 23/38] H_vv --- plugins/local/gpu_nvidia/gpu.c | 16 +-- plugins/local/gpu_x86/gpu.c | 6 +- src/ccsd/ccsd_space_orb_sub.irp.f | 7 +- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 146 ++++++++++++++++--------- src/gpu/gpu.irp.f | 8 ++ 5 files changed, 114 insertions(+), 69 deletions(-) diff --git a/plugins/local/gpu_nvidia/gpu.c b/plugins/local/gpu_nvidia/gpu.c index 189de64c..39a82984 100644 --- a/plugins/local/gpu_nvidia/gpu.c +++ b/plugins/local/gpu_nvidia/gpu.c @@ -11,10 +11,6 @@ /* Generic functions */ -bool no_gpu() { - return false; -} - int gpu_ndevices() { int ngpus; cudaGetDeviceCount(&ngpus); @@ -35,13 +31,13 @@ void gpu_allocate(void** ptr, const int64_t size) 
{ free = INT64_MAX; } - /* Use managed memory if it does not fit on the GPU */ - if (size < free && size < total/2) { + rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal); +// /* Use managed memory if it does not fit on the GPU */ +// if (size < free && size < total/2) { // rc= cudaMalloc(ptr, size); - rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal); - } else { - rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal); - } +// } else { +// rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal); +// } assert (rc == cudaSuccess); } diff --git a/plugins/local/gpu_x86/gpu.c b/plugins/local/gpu_x86/gpu.c index 53267a7c..dab23a25 100644 --- a/plugins/local/gpu_x86/gpu.c +++ b/plugins/local/gpu_x86/gpu.c @@ -5,14 +5,10 @@ #include #include -bool no_gpu() { - return true; -} - /* Generic functions */ int gpu_ndevices() { - return 1; + return 0; } void gpu_set_device(int32_t i) { diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 5ee7366e..0b3636ac 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -183,7 +183,8 @@ subroutine run_ccsd_space_orb if (do_mo_cholesky) then call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo) - call compute_H_vv_chol(nO,nV,tau_x,H_vv) + call compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, & + d_cc_space_v_ov_chol,H_vv) call compute_H_vo_chol(nO,nV,t1%f,H_vo%f) call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1) @@ -403,7 +404,7 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau) type(gpu_stream) :: stream(nV) - !$OMP PARALLEL if (no_gpu()) & + !$OMP PARALLEL if (gpu_num == 0) & !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) @@ -466,7 +467,7 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x) call gpu_stream_create(stream(a)) enddo - !$OMP PARALLEL if (no_gpu()) & + !$OMP PARALLEL if (gpu_num == 0) & !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index 288724f3..458016fb 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -344,48 +344,47 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & ! 1.d0, H_oo%f(1,1), nO) ! 
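In the reworked compute_H_oo_chol below, the per-column streams are replaced by one private BLAS handle per OpenMP thread, so each thread can drive its own dgeam/dgemm sequence over the u loop independently. A rough sketch of that pattern on the CUDA backend, assuming cuBLAS (the helper and loop body are placeholders):

    #include <cublas_v2.h>
    #include <omp.h>

    void threaded_contractions(int nO) {
      #pragma omp parallel
      {
        cublasHandle_t blas;
        cublasCreate(&blas);            /* one handle per thread */
        #pragma omp for
        for (int u = 0; u < nO; u++) {
          /* ... cublasDgeam/cublasDgemm(blas, ...) on slice u ... */
        }
        cublasDestroy(blas);
      }
    }

The corresponding diff hunk starts here: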
- type(gpu_stream) :: stream(nV) + type(gpu_blas) :: blas - do b=1,nV - call gpu_stream_create(stream(b)) - enddo - !$OMP PARALLEL if (no_gpu()) & + !$OMP PARALLEL & !$OMP DEFAULT(SHARED) & - !$OMP PRIVATE(u,b,tmp_vov,tmp_ovv) + !$OMP PRIVATE(blas,u,b,tmp_vov,tmp_ovv) + + !$OMP SINGLE + !$OMP TASK + call gpu_copy(d_cc_space_f_oo, H_oo) + !$OMP END TASK + !$OMP END SINGLE - call gpu_allocate(tmp_vov, nV, nO, nV) call gpu_allocate(tmp_ovv, nO, nV, nV) + call gpu_allocate(tmp_vov, nV, nO, nV) + + call gpu_blas_create(blas) !$OMP DO do u=1,nO - call gpu_dgeam_f(blas_handle, 'N', 'N', 1, nO*nV*nV, 1.d0, & + call gpu_dgeam_f(blas, 'N', 'N', 1, nO*nV*nV, 1.d0, & tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f, nO, tmp_ovv%f, 1) do b=1,nV - call gpu_set_stream(blas_handle,stream(b)) - call gpu_dgeam_f(blas_handle, 'T', 'T', nV, nO, 1.d0, & + call gpu_dgeam_f(blas, 'T', 'T', nV, nO, 1.d0, & tmp_ovv%f(1,1,b), nO, 0.d0, & tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV) enddo - call gpu_dgemm_f(blas_handle, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, & + call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, & d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_vov%f, nV, & 0.d0, tau_kau%f(1,1,u), cholesky_mo_num) - call gpu_synchronize() enddo !$OMP END DO + call gpu_blas_destroy(blas) + call gpu_deallocate(tmp_vov) call gpu_deallocate(tmp_ovv) + + !$OMP TASKWAIT !$OMP END PARALLEL - do b=1,nV - call gpu_stream_destroy(stream(b)) - enddo - - call gpu_set_stream(blas_handle,gpu_default_stream) - - call gpu_copy(d_cc_space_f_oo, H_oo) - call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & tau_kau, cholesky_mo_num*nV, d_cc_space_v_vo_chol, cholesky_mo_num*nV, & 1.d0, H_oo, nO) @@ -395,52 +394,97 @@ end ! H_vv -subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv) +subroutine compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, & + d_cc_space_v_ov_chol,H_vv) use gpu implicit none - integer, intent(in) :: nO,nV + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: d_cc_space_f_vv + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol type(gpu_double4), intent(in) :: tau_x type(gpu_double2), intent(out) :: H_vv integer :: a,b,i,j,u,k, beta - double precision, allocatable :: tau_kia(:,:,:), tmp_oov(:,:,:) + type(gpu_double3) :: tau_kia, tmp_oov - allocate(tau_kia(cholesky_mo_num,nO,nV)) - !$omp parallel & - !$omp default(shared) & - !$omp private(i,beta,j,k,a,b,tmp_oov) - allocate(tmp_oov(nO,nO,nV) ) - !$omp do + call gpu_allocate(tau_kia, cholesky_mo_num, nO, nV) + +! !$omp parallel & +! !$omp default(shared) & +! !$omp private(i,beta,j,k,a,b,tmp_oov) +! allocate(tmp_oov(nO,nO,nV) ) +! !$omp do +! do a = 1, nV +! do b=1,nV +! do j=1,nO +! do i=1,nO +! tmp_oov(i,j,b) = tau_x%f(i,j,a,b) +! enddo +! enddo +! enddo +! call dgemm('N','T',cholesky_mo_num,nO,nO*nV,1.d0, & +! d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov, nO, & +! 0.d0, tau_kia(1,1,a), cholesky_mo_num) +! enddo +! !$omp end do nowait +! deallocate(tmp_oov) + +! !$omp do +! do beta = 1, nV +! do a = 1, nV +! H_vv%f(a,beta) = cc_space_f_vv(a,beta) +! enddo +! enddo +! !$omp end do nowait +! !$omp barrier +! !$omp end parallel +! call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & +! tau_kia, cholesky_mo_num*nO, d_cc_space_v_ov_chol%f, cholesky_mo_num*nO, & +! 
1.d0, H_vv%f, nV) + + type(gpu_blas) :: blas + + + PROVIDE gpu_num + !$OMP PARALLEL & + !$OMP DEFAULT(SHARED) & + !$OMP PRIVATE(a,b,tmp_oov,blas) + + !$OMP SINGLE + !$OMP TASK + call gpu_copy(d_cc_space_f_vv, H_vv) + !$OMP END TASK + !$OMP END SINGLE + + call gpu_blas_create(blas) + call gpu_allocate(tmp_oov, nO, nO, nV) + + !$OMP DO do a = 1, nV do b=1,nV - do j=1,nO - do i=1,nO - tmp_oov(i,j,b) = tau_x%f(i,j,a,b) - enddo - enddo + call gpu_dgeam_f(blas, 'N', 'N', nO, nO, 1.d0, & + tau_x%f(1,1,a,b), nO, 0.d0, & + tau_x%f(1,1,a,b), nO, tmp_oov%f(1,1,b), nO) enddo - call dgemm('N','T',cholesky_mo_num,nO,nO*nV,1.d0, & - cc_space_v_ov_chol, cholesky_mo_num, tmp_oov, nO, & - 0.d0, tau_kia(1,1,a), cholesky_mo_num) + call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nO,nO*nV,1.d0, & + d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov%f, nO, & + 0.d0, tau_kia%f(1,1,a), cholesky_mo_num) enddo - !$omp end do nowait - deallocate(tmp_oov) + !$OMP END DO - !$omp do - do beta = 1, nV - do a = 1, nV - H_vv%f(a,beta) = cc_space_f_vv(a,beta) - enddo - enddo - !$omp end do nowait - !$omp barrier - !$omp end parallel - call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & - tau_kia, cholesky_mo_num*nO, cc_space_v_ov_chol, cholesky_mo_num*nO, & - 1.d0, H_vv%f, nV) + call gpu_blas_destroy(blas) + call gpu_deallocate(tmp_oov) + !$OMP TASKWAIT + !$OMP END PARALLEL + + call gpu_dgemm(blas_handle,'T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & + tau_kia, cholesky_mo_num*nO, d_cc_space_v_ov_chol, cholesky_mo_num*nO, & + 1.d0, H_vv, nV) + + call gpu_deallocate(tau_kia) end ! H_vo diff --git a/src/gpu/gpu.irp.f b/src/gpu/gpu.irp.f index 6ad0a075..3b2feeb6 100644 --- a/src/gpu/gpu.irp.f +++ b/src/gpu/gpu.irp.f @@ -16,3 +16,11 @@ BEGIN_PROVIDER [ type(gpu_stream), gpu_default_stream ] gpu_default_stream%c = C_NULL_PTR END_PROVIDER +BEGIN_PROVIDER [ integer, gpu_num ] + implicit none + BEGIN_DOC + ! Number of usable GPUs + END_DOC + gpu_num = gpu_ndevices() +END_PROVIDER + From 44a7729f65a37cc3a7c35ae55f462bb1d61e411b Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Mon, 1 Jul 2024 19:00:27 +0200 Subject: [PATCH 24/38] H_ finished in CCSD --- src/ccsd/ccsd_space_orb_sub.irp.f | 108 ++---- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 482 +++++++++---------------- 2 files changed, 200 insertions(+), 390 deletions(-) diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 0b3636ac..13b974be 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -181,11 +181,9 @@ subroutine run_ccsd_space_orb ! Residue if (do_mo_cholesky) then - call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & - d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo) - call compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, & - d_cc_space_v_ov_chol,H_vv) - call compute_H_vo_chol(nO,nV,t1%f,H_vo%f) + call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo) + call compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, d_cc_space_v_ov_chol,H_vv) + call compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol, H_vo) call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1) call compute_r2_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r2%f,max_r2) @@ -316,51 +314,20 @@ subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1, integer :: i,j,a,b double precision :: e -! energy = 0d0 -! !$omp parallel & -! !$omp shared(nO,nV,energy,tau_x,t1,& -! 
!$omp d_cc_space_f_vo,d_cc_space_v_oovv) & -! !$omp private(i,j,a,b,e) & -! !$omp default(none) -! e = 0d0 -! !$omp do -! do a = 1, nV -! do i = 1, nO -! e = e + 2d0 * d_cc_space_f_vo%f(a,i) * t1%f(i,a) -! enddo -! enddo -! !$omp end do nowait -! !$omp do -! do b = 1, nV -! do a = 1, nV -! do j = 1, nO -! do i = 1, nO -! e = e + tau_x%f(i,j,a,b) * d_cc_space_v_oovv%f(i,j,a,b) -! enddo -! enddo -! enddo -! enddo -! !$omp end do nowait -! !$omp critical -! energy = energy + e -! !$omp end critical -! !$omp end parallel + type(gpu_stream) :: s1, s2 + call gpu_stream_create(s1) + call gpu_stream_create(s2) + call gpu_set_stream(blas_handle,s1) + call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo, 1, t1, 1, e) - type(gpu_stream) :: s1, s2 - call gpu_stream_create(s1) - call gpu_stream_create(s2) + call gpu_set_stream(blas_handle,s2) + call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1_8, d_cc_space_v_oovv, 1_8, energy) + call gpu_set_stream(blas_handle,gpu_default_stream) - call gpu_set_stream(blas_handle,s1) - call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo, 1, t1, 1, e) - - call gpu_set_stream(blas_handle,s2) - call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1_8, d_cc_space_v_oovv, 1_8, energy) - call gpu_synchronize() - call gpu_set_stream(blas_handle,gpu_default_stream) - - call gpu_stream_destroy(s1) - call gpu_stream_destroy(s2) + call gpu_synchronize() + call gpu_stream_destroy(s1) + call gpu_stream_destroy(s2) energy = energy + 2.d0*e @@ -384,27 +351,9 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau) ! internal integer :: i,j,a,b -! !$OMP PARALLEL & -! !$OMP SHARED(nO,nV,tau,t2,t1,h_t1) & -! !$OMP PRIVATE(i,j,a,b) & -! !$OMP DEFAULT(NONE) -! !$OMP DO -! do b = 1, nV -! do a = 1, nV -! do j = 1, nO -! do i = 1, nO -! tau%f(i,j,a,b) = t2%f(i,j,a,b) + t1%f(i,a) * h_t1(j,b) -! enddo -! enddo -! enddo -! enddo -! !$OMP END DO -! !$OMP END PARALLEL - - type(gpu_stream) :: stream(nV) - !$OMP PARALLEL if (gpu_num == 0) & + !$OMP PARALLEL & !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) @@ -422,6 +371,8 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau) !$OMP END DO !$OMP END PARALLEL + call gpu_synchronize() + do b=1,nV call gpu_stream_destroy(stream(b)) enddo @@ -444,32 +395,15 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x) ! internal integer :: i,j,a,b -! !$OMP PARALLEL & -! !$OMP SHARED(nO,nV,tau,tau_x) & -! !$OMP PRIVATE(i,j,a,b) & -! !$OMP DEFAULT(NONE) -! !$OMP DO -! do b = 1, nV -! do a = 1, nV -! do j = 1, nO -! do i = 1, nO -! tau_x%f(i,j,a,b) = 2.d0*tau%f(i,j,a,b) - tau%f(i,j,b,a) -! enddo -! enddo -! enddo -! enddo -! !$OMP END DO -! !$OMP END PARALLEL - type(gpu_stream) :: stream(nV) do a=1,nV call gpu_stream_create(stream(a)) enddo - !$OMP PARALLEL if (gpu_num == 0) & + !$OMP PARALLEL & !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) & - !$OMP PRIVATE(i,j,a,b) & + !$OMP PRIVATE(a,b) & !$OMP DEFAULT(NONE) !$OMP DO do b=1,nV @@ -484,10 +418,12 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x) !$OMP END DO !$OMP END PARALLEL + call gpu_set_stream(blas_handle,gpu_default_stream) + call gpu_synchronize() + do b=1,nV call gpu_stream_destroy(stream(b)) enddo - call gpu_set_stream(blas_handle,gpu_default_stream) end diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index 458016fb..5eb95a06 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -1,81 +1,200 @@ -subroutine ccsd_energy_space_chol(nO,nV,tau,t1,energy) +! 
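In ccsd_energy_space_x above, the two dot products are independent, so each is bound to its own stream before being issued through the shared handle. One caveat worth noting: with cuBLAS's default host pointer mode a ddot blocks until its scalar result reaches the host, so genuine overlap of the two reductions would additionally need CUBLAS_POINTER_MODE_DEVICE. A sketch with placeholder array names, assuming the CUDA backend:

    #include <cublas_v2.h>
    #include <cuda_runtime.h>

    void energy_dots(cublasHandle_t h,
                     int n1, const double* f_vo,  const double* t1,     double* e,
                     int n2, const double* tau_x, const double* v_oovv, double* energy) {
      cudaStream_t s1, s2;
      cudaStreamCreate(&s1);
      cudaStreamCreate(&s2);

      cublasSetStream(h, s1);
      cublasDdot(h, n1, f_vo, 1, t1, 1, e);             /* f_vo . t1      */
      cublasSetStream(h, s2);
      cublasDdot(h, n2, tau_x, 1, v_oovv, 1, energy);   /* tau_x . v_oovv */

      cudaDeviceSynchronize();     /* gpu_synchronize()          */
      cublasSetStream(h, NULL);    /* back to the default stream */
      cudaStreamDestroy(s1);
      cudaStreamDestroy(s2);
    }

The rewritten Cholesky intermediates start with H_oo: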
H_oo +subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & + d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo) + use gpu implicit none - integer, intent(in) :: nO, nV - double precision, intent(in) :: tau(nO,nO,nV,nV) - double precision, intent(in) :: t1(nO,nV) - double precision, intent(out) :: energy + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: d_cc_space_f_oo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol + type(gpu_double4), intent(in) :: tau_x + type(gpu_double2), intent(out) :: H_oo - ! internal - integer :: i,j,a,b - double precision :: e + integer :: a,b,i,j,u,k - energy = 0d0 - !$omp parallel & - !$omp shared(nO,nV,energy,tau,t1,& - !$omp cc_space_f_vo,cc_space_w_oovv) & - !$omp private(i,j,a,b,e) & - !$omp default(none) - e = 0d0 - !$omp do - do a = 1, nV - do i = 1, nO - e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a) - enddo - enddo - !$omp end do nowait - !$omp do - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - e = e + tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b) - enddo - enddo - enddo - enddo - !$omp end do nowait - !$omp critical - energy = energy + e - !$omp end critical - !$omp end parallel + type(gpu_double3) :: tau_kau, tmp_vov, tmp_ovv -end + call gpu_allocate(tau_kau, cholesky_mo_num, nV, nO) -! Tau + type(gpu_blas) :: blas -subroutine update_tau_space_chol(nO,nV,t1,t2,tau) - implicit none + !$OMP PARALLEL & + !$OMP DEFAULT(SHARED) & + !$OMP PRIVATE(blas,u,b,tmp_vov,tmp_ovv) - ! in - integer, intent(in) :: nO, nV - double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV) + !$OMP SINGLE + !$OMP TASK + call gpu_copy(d_cc_space_f_oo, H_oo) + !$OMP END TASK + !$OMP END SINGLE - ! out - double precision, intent(out) :: tau(nO,nO,nV,nV) + call gpu_allocate(tmp_ovv, nO, nV, nV) + call gpu_allocate(tmp_vov, nV, nO, nV) - ! internal - integer :: i,j,a,b + call gpu_blas_create(blas) - !$OMP PARALLEL & - !$OMP SHARED(nO,nV,tau,t2,t1) & - !$OMP PRIVATE(i,j,a,b) & - !$OMP DEFAULT(NONE) !$OMP DO - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b) - enddo - enddo + do u=1,nO + call gpu_dgeam_f(blas, 'N', 'N', 1, nO*nV*nV, 1.d0, & + tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f, nO, tmp_ovv%f, 1) + do b=1,nV + call gpu_dgeam_f(blas, 'T', 'T', nV, nO, 1.d0, & + tmp_ovv%f(1,1,b), nO, 0.d0, & + tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV) enddo + call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, & + d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_vov%f, nV, & + 0.d0, tau_kau%f(1,1,u), cholesky_mo_num) enddo !$OMP END DO + + call gpu_blas_destroy(blas) + + call gpu_deallocate(tmp_vov) + call gpu_deallocate(tmp_ovv) + + !$OMP TASKWAIT !$OMP END PARALLEL + call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & + tau_kau, cholesky_mo_num*nV, d_cc_space_v_vo_chol, cholesky_mo_num*nV, & + 1.d0, H_oo, nO) + + call gpu_synchronize() + call gpu_deallocate(tau_kau) +end + +! 
H_vv + +subroutine compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, & + d_cc_space_v_ov_chol,H_vv) + use gpu + implicit none + + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: d_cc_space_f_vv + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol + type(gpu_double4), intent(in) :: tau_x + type(gpu_double2), intent(out) :: H_vv + + integer :: a,b,i,j,u,k, beta + + type(gpu_double3) :: tau_kia, tmp_oov + + call gpu_allocate(tau_kia, cholesky_mo_num, nO, nV) + + type(gpu_blas) :: blas + + !$OMP PARALLEL & + !$OMP DEFAULT(SHARED) & + !$OMP PRIVATE(a,b,tmp_oov,blas) + + !$OMP SINGLE + !$OMP TASK + call gpu_copy(d_cc_space_f_vv, H_vv) + !$OMP END TASK + !$OMP END SINGLE + + call gpu_blas_create(blas) + call gpu_allocate(tmp_oov, nO, nO, nV) + + !$OMP DO + do a = 1, nV + do b=1,nV + call gpu_dgeam_f(blas, 'N', 'N', nO, nO, 1.d0, & + tau_x%f(1,1,a,b), nO, 0.d0, & + tau_x%f(1,1,a,b), nO, tmp_oov%f(1,1,b), nO) + enddo + call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nO,nO*nV,1.d0, & + d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov%f, nO, & + 0.d0, tau_kia%f(1,1,a), cholesky_mo_num) + enddo + !$OMP END DO + + call gpu_blas_destroy(blas) + + call gpu_deallocate(tmp_oov) + !$OMP TASKWAIT + !$OMP END PARALLEL + + call gpu_dgemm(blas_handle,'T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & + tau_kia, cholesky_mo_num*nO, d_cc_space_v_ov_chol, cholesky_mo_num*nO, & + 1.d0, H_vv, nV) + + call gpu_synchronize() + call gpu_deallocate(tau_kia) +end + +! H_vo +subroutine compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, & + d_cc_space_v_ov_chol,d_cc_space_v_vo_chol, H_vo) + use gpu + implicit none + + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: t1, d_cc_space_f_vo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol + type(gpu_double2), intent(out) :: H_vo + + integer :: a,b,i,j,u,k + + type(gpu_double1) :: tmp_k + type(gpu_double3) :: tmp, tmp2 + + call gpu_copy(d_cc_space_f_vo, H_vo) + + call gpu_allocate(tmp_k, cholesky_mo_num) + + call gpu_dgemm(blas_handle, 'N', 'N', cholesky_mo_num, 1, nO*nV, 2.d0, & + d_cc_space_v_ov_chol, cholesky_mo_num, & + t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num) + + call gpu_dgemm(blas_handle, 'T','N',nV*nO,1,cholesky_mo_num,1.d0, & + d_cc_space_v_vo_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, & + H_vo, nV*nO) + + call gpu_deallocate(tmp_k) + + + call gpu_allocate(tmp, cholesky_mo_num, nO, nO) + + call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nO, nO, nV, 1.d0, & + d_cc_space_v_ov_chol, cholesky_mo_num*nO, t1, nO, 0.d0, tmp, cholesky_mo_num*nO) + + call gpu_allocate(tmp2, cholesky_mo_num, nO, nO) + + type(gpu_stream) :: stream(nO) + do i=1,nO + call gpu_stream_create(stream(i)) + enddo + + !$OMP PARALLEL DO COLLAPSE(2) PRIVATE(i,j) + do i=1,nO + do j=1,nO + call gpu_set_stream(blas_handle,stream(j)) + call gpu_dgeam_f(blas_handle, 'N', 'N', cholesky_mo_num, 1, 1.d0, & + tmp%f(1,i,j), cholesky_mo_num, 0.d0, & + tmp%f(1,i,j), cholesky_mo_num, tmp2%f(1,j,i), cholesky_mo_num) + enddo + enddo + !$OMP END PARALLEL DO + + call gpu_set_stream(blas_handle,gpu_default_stream) + call gpu_synchronize() + + do i=1,nO + call gpu_stream_destroy(stream(i)) + enddo + call gpu_deallocate(tmp) + + call gpu_dgemm(blas_handle, 'T','N', nV, nO, cholesky_mo_num*nO, -1.d0, & + d_cc_space_v_ov_chol, cholesky_mo_num*nO, tmp2, cholesky_mo_num*nO, & + 1.d0, H_vo, nV) + + call gpu_synchronize() + call gpu_deallocate(tmp2) end ! R1 @@ -291,251 +410,6 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) end -! 
H_oo - -subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & - d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo) - use gpu - implicit none - - integer, intent(in) :: nO,nV - type(gpu_double2), intent(in) :: d_cc_space_f_oo - type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol - type(gpu_double4), intent(in) :: tau_x - type(gpu_double2), intent(out) :: H_oo - - integer :: a,b,i,j,u,k - - type(gpu_double3) :: tau_kau, tmp_vov, tmp_ovv - - call gpu_allocate(tau_kau, cholesky_mo_num, nV, nO) - -! !$omp parallel & -! !$omp default(shared) & -! !$omp private(i,u,j,k,a,b,tmp_vov) -! call gpu_allocate(tmp_vov, nV, nO, nV) -! !$omp do -! do u = 1, nO -! do b=1,nV -! do j=1,nO -! do a=1,nV -! tmp_vov%f(a,j,b) = tau_x%f(u,j,a,b) -! enddo -! enddo -! enddo -! call dgemm('N','T',cholesky_mo_num,nV,nO*nV,1.d0, & -! d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, tmp_vov%f, nV, & -! 0.d0, tau_kau%f(1,1,u), cholesky_mo_num) -! enddo -! !$omp end do nowait -! call gpu_deallocate(tmp_vov) -! !$omp do -! do i = 1, nO -! do u = 1, nO -! H_oo%f(u,i) = d_cc_space_f_oo%f(u,i) -! enddo -! enddo -! !$omp end do nowait -! -! !$omp barrier -! !$omp end parallel -! call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & -! tau_kau%f(1,1,1), cholesky_mo_num*nV, d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num*nV, & -! 1.d0, H_oo%f(1,1), nO) -! - - type(gpu_blas) :: blas - - - !$OMP PARALLEL & - !$OMP DEFAULT(SHARED) & - !$OMP PRIVATE(blas,u,b,tmp_vov,tmp_ovv) - - !$OMP SINGLE - !$OMP TASK - call gpu_copy(d_cc_space_f_oo, H_oo) - !$OMP END TASK - !$OMP END SINGLE - - call gpu_allocate(tmp_ovv, nO, nV, nV) - call gpu_allocate(tmp_vov, nV, nO, nV) - - call gpu_blas_create(blas) - - !$OMP DO - do u=1,nO - call gpu_dgeam_f(blas, 'N', 'N', 1, nO*nV*nV, 1.d0, & - tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f, nO, tmp_ovv%f, 1) - do b=1,nV - call gpu_dgeam_f(blas, 'T', 'T', nV, nO, 1.d0, & - tmp_ovv%f(1,1,b), nO, 0.d0, & - tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV) - enddo - call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, & - d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_vov%f, nV, & - 0.d0, tau_kau%f(1,1,u), cholesky_mo_num) - enddo - !$OMP END DO - - call gpu_blas_destroy(blas) - - call gpu_deallocate(tmp_vov) - call gpu_deallocate(tmp_ovv) - - !$OMP TASKWAIT - !$OMP END PARALLEL - - call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & - tau_kau, cholesky_mo_num*nV, d_cc_space_v_vo_chol, cholesky_mo_num*nV, & - 1.d0, H_oo, nO) - - call gpu_deallocate(tau_kau) -end - -! H_vv - -subroutine compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, & - d_cc_space_v_ov_chol,H_vv) - use gpu - implicit none - - integer, intent(in) :: nO,nV - type(gpu_double2), intent(in) :: d_cc_space_f_vv - type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol - type(gpu_double4), intent(in) :: tau_x - type(gpu_double2), intent(out) :: H_vv - - integer :: a,b,i,j,u,k, beta - - type(gpu_double3) :: tau_kia, tmp_oov - - call gpu_allocate(tau_kia, cholesky_mo_num, nO, nV) - -! !$omp parallel & -! !$omp default(shared) & -! !$omp private(i,beta,j,k,a,b,tmp_oov) -! allocate(tmp_oov(nO,nO,nV) ) -! !$omp do -! do a = 1, nV -! do b=1,nV -! do j=1,nO -! do i=1,nO -! tmp_oov(i,j,b) = tau_x%f(i,j,a,b) -! enddo -! enddo -! enddo -! call dgemm('N','T',cholesky_mo_num,nO,nO*nV,1.d0, & -! d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov, nO, & -! 0.d0, tau_kia(1,1,a), cholesky_mo_num) -! enddo -! !$omp end do nowait -! deallocate(tmp_oov) - -! !$omp do -! do beta = 1, nV -! do a = 1, nV -! 
H_vv%f(a,beta) = cc_space_f_vv(a,beta) -! enddo -! enddo -! !$omp end do nowait -! !$omp barrier -! !$omp end parallel -! call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & -! tau_kia, cholesky_mo_num*nO, d_cc_space_v_ov_chol%f, cholesky_mo_num*nO, & -! 1.d0, H_vv%f, nV) - - type(gpu_blas) :: blas - - - PROVIDE gpu_num - !$OMP PARALLEL & - !$OMP DEFAULT(SHARED) & - !$OMP PRIVATE(a,b,tmp_oov,blas) - - !$OMP SINGLE - !$OMP TASK - call gpu_copy(d_cc_space_f_vv, H_vv) - !$OMP END TASK - !$OMP END SINGLE - - call gpu_blas_create(blas) - call gpu_allocate(tmp_oov, nO, nO, nV) - - !$OMP DO - do a = 1, nV - do b=1,nV - call gpu_dgeam_f(blas, 'N', 'N', nO, nO, 1.d0, & - tau_x%f(1,1,a,b), nO, 0.d0, & - tau_x%f(1,1,a,b), nO, tmp_oov%f(1,1,b), nO) - enddo - call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nO,nO*nV,1.d0, & - d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov%f, nO, & - 0.d0, tau_kia%f(1,1,a), cholesky_mo_num) - enddo - !$OMP END DO - - call gpu_blas_destroy(blas) - - call gpu_deallocate(tmp_oov) - !$OMP TASKWAIT - !$OMP END PARALLEL - - call gpu_dgemm(blas_handle,'T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & - tau_kia, cholesky_mo_num*nO, d_cc_space_v_ov_chol, cholesky_mo_num*nO, & - 1.d0, H_vv, nV) - - call gpu_deallocate(tau_kia) -end - -! H_vo -subroutine compute_H_vo_chol(nO,nV,t1,H_vo) - - implicit none - - integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV) - double precision, intent(out) :: H_vo(nV, nO) - - integer :: a,b,i,j,u,k - - double precision, allocatable :: tmp_k(:), tmp(:,:,:), tmp2(:,:,:) - do i=1,nO - do a=1,nV - H_vo(a,i) = cc_space_f_vo(a,i) - enddo - enddo - - allocate(tmp_k(cholesky_mo_num)) - call dgemm('N', 'N', cholesky_mo_num, 1, nO*nV, 2.d0, & - cc_space_v_ov_chol, cholesky_mo_num, & - t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num) - - call dgemm('T','N',nV*nO,1,cholesky_mo_num,1.d0, & - cc_space_v_vo_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, & - H_vo, nV*nO) - deallocate(tmp_k) - - allocate(tmp(cholesky_mo_num,nO,nO)) - allocate(tmp2(cholesky_mo_num,nO,nO)) - - call dgemm('N','T', cholesky_mo_num*nO, nO, nV, 1.d0, & - cc_space_v_ov_chol, cholesky_mo_num*nO, t1, nO, 0.d0, tmp, cholesky_mo_num*nO) - - do i=1,nO - do j=1,nO - do k=1,cholesky_mo_num - tmp2(k,j,i) = tmp(k,i,j) - enddo - enddo - enddo - deallocate(tmp) - - call dgemm('T','N', nV, nO, cholesky_mo_num*nO, -1.d0, & - cc_space_v_ov_chol, cholesky_mo_num*nO, tmp2, cholesky_mo_num*nO, & - 1.d0, H_vo, nV) - -end - ! 
R2 From 2bead959d0eee7790162df656e3781e4dcdedb7d Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 2 Jul 2024 13:58:19 +0200 Subject: [PATCH 25/38] Fixed GPU interface for gfortran --- plugins/local/gpu_x86/gpu.c | 90 ++++++++++---------- src/ccsd/ccsd_space_orb_sub.irp.f | 10 +-- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 46 +++++------ src/gpu/gpu_module.F90 | 109 +++++-------------------- 4 files changed, 94 insertions(+), 161 deletions(-) diff --git a/plugins/local/gpu_x86/gpu.c b/plugins/local/gpu_x86/gpu.c index dab23a25..fe3cadc5 100644 --- a/plugins/local/gpu_x86/gpu.c +++ b/plugins/local/gpu_x86/gpu.c @@ -49,10 +49,11 @@ void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) { /* Streams */ void gpu_stream_create(void** ptr) { - *ptr = (void*) 2; + *ptr = (void*) malloc(sizeof(char)); } void gpu_stream_destroy(void** ptr) { + free(*ptr); *ptr = NULL; } @@ -68,11 +69,12 @@ void gpu_synchronize() { /* BLAS functions */ void gpu_blas_create(void** handle) { - *handle = (void*) 1; + *handle = (void*) malloc(sizeof(char)); } void gpu_blas_destroy(void** handle) { + free(*handle); *handle = NULL; } @@ -122,7 +124,7 @@ void gpu_sdot(void* handle, const int64_t n, const float* x, const int64_t incx, void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha, const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy); -void gpu_dgemv(void* handle, const char transa, const int64_t m, const int64_t n, const double alpha, +void gpu_dgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const double alpha, const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) { assert (handle != NULL); @@ -143,14 +145,14 @@ void gpu_dgemv(void* handle, const char transa, const int64_t m, const int64_t n assert ( (int64_t) incx_ == incx); assert ( (int64_t) incy_ == incy); - dgemv_(&transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_); + dgemv_(transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_); } void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha, const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy); -void gpu_sgemv(void* handle, const char transa, const int64_t m, const int64_t n, const float alpha, +void gpu_sgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const float alpha, const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) { assert (handle != NULL); @@ -171,14 +173,14 @@ void gpu_sgemv(void* handle, const char transa, const int64_t m, const int64_t n assert ( (int64_t) incx_ == incx); assert ( (int64_t) incy_ == incy); - sgemv_(&transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_); + sgemv_(transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_); } void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha, const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc); -void gpu_dgemm(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, +void gpu_dgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const
double alpha, const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) { assert (handle != NULL); @@ -201,7 +203,7 @@ void gpu_dgemm(void* handle, const char transa, const char transb, const int64_t assert ( (int64_t) ldb_ == ldb); assert ( (int64_t) ldc_ == ldc); - dgemm_(&transa, &transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_); + dgemm_(transa, transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_); } @@ -209,7 +211,7 @@ void gpu_dgemm(void* handle, const char transa, const char transb, const int64_t void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha, const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc); -void gpu_sgemm(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, +void gpu_sgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) { assert (handle != NULL); @@ -232,18 +234,18 @@ void gpu_sgemm(void* handle, const char transa, const char transb, const int64_t assert ( (int64_t) ldb_ == ldb); assert ( (int64_t) ldc_ == ldc); - sgemm_(&transa, &transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_); + sgemm_(transa, transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_); } -void gpu_dgeam(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, +void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double alpha, const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { assert (handle != NULL); - if ( (transa == 'N' && transb == 'N') || - (transa == 'n' && transb == 'N') || - (transa == 'N' && transb == 'n') || - (transa == 'n' && transb == 'n') ) { + if ( (*transa == 'N' && *transb == 'N') || + (*transa == 'n' && *transb == 'N') || + (*transa == 'N' && *transb == 'n') || + (*transa == 'n' && *transb == 'n') ) { if (alpha == 0.) { @@ -271,10 +273,10 @@ void gpu_dgeam(void* handle, const char transa, const char transb, const int64_t } - } else if ( (transa == 'N' && transb == 'T') || - (transa == 'n' && transb == 'T') || - (transa == 'N' && transb == 't') || - (transa == 'n' && transb == 't') ) { + } else if ( (*transa == 'N' && *transb == 'T') || + (*transa == 'n' && *transb == 'T') || + (*transa == 'N' && *transb == 't') || + (*transa == 'n' && *transb == 't') ) { if (alpha == 0.) { @@ -302,10 +304,10 @@ void gpu_dgeam(void* handle, const char transa, const char transb, const int64_t } - } else if ( (transa == 'T' && transb == 'N') || - (transa == 't' && transb == 'N') || - (transa == 'T' && transb == 'n') || - (transa == 't' && transb == 'n') ) { + } else if ( (*transa == 'T' && *transb == 'N') || + (*transa == 't' && *transb == 'N') || + (*transa == 'T' && *transb == 'n') || + (*transa == 't' && *transb == 'n') ) { if (alpha == 0.) 
{ @@ -333,10 +335,10 @@ void gpu_dgeam(void* handle, const char transa, const char transb, const int64_t } - } else if ( (transa == 'T' && transb == 'T') || - (transa == 't' && transb == 'T') || - (transa == 'T' && transb == 't') || - (transa == 't' && transb == 't') ) { + } else if ( (*transa == 'T' && *transb == 'T') || + (*transa == 't' && *transb == 'T') || + (*transa == 'T' && *transb == 't') || + (*transa == 't' && *transb == 't') ) { if (alpha == 0.) { @@ -368,14 +370,14 @@ void gpu_dgeam(void* handle, const char transa, const char transb, const int64_t } -void gpu_sgeam(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, +void gpu_sgeam(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float alpha, const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) { assert (handle != NULL); - if ( (transa == 'N' && transb == 'N') || - (transa == 'n' && transb == 'N') || - (transa == 'N' && transb == 'n') || - (transa == 'n' && transb == 'n') ) { + if ( (*transa == 'N' && *transb == 'N') || + (*transa == 'n' && *transb == 'N') || + (*transa == 'N' && *transb == 'n') || + (*transa == 'n' && *transb == 'n') ) { if (alpha == 0.) { @@ -403,10 +405,10 @@ void gpu_sgeam(void* handle, const char transa, const char transb, const int64_t } - } else if ( (transa == 'N' && transb == 'T') || - (transa == 'n' && transb == 'T') || - (transa == 'N' && transb == 't') || - (transa == 'n' && transb == 't') ) { + } else if ( (*transa == 'N' && *transb == 'T') || + (*transa == 'n' && *transb == 'T') || + (*transa == 'N' && *transb == 't') || + (*transa == 'n' && *transb == 't') ) { if (alpha == 0.) { @@ -434,10 +436,10 @@ void gpu_sgeam(void* handle, const char transa, const char transb, const int64_t } - } else if ( (transa == 'T' && transb == 'N') || - (transa == 't' && transb == 'N') || - (transa == 'T' && transb == 'n') || - (transa == 't' && transb == 'n') ) { + } else if ( (*transa == 'T' && *transb == 'N') || + (*transa == 't' && *transb == 'N') || + (*transa == 'T' && *transb == 'n') || + (*transa == 't' && *transb == 'n') ) { if (alpha == 0.) { @@ -465,10 +467,10 @@ void gpu_sgeam(void* handle, const char transa, const char transb, const int64_t } - } else if ( (transa == 'T' && transb == 'T') || - (transa == 't' && transb == 'T') || - (transa == 'T' && transb == 't') || - (transa == 't' && transb == 't') ) { + } else if ( (*transa == 'T' && *transb == 'T') || + (*transa == 't' && *transb == 'T') || + (*transa == 'T' && *transb == 't') || + (*transa == 't' && *transb == 't') ) { if (alpha == 0.) 
{ diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 13b974be..de109cea 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -319,10 +319,10 @@ subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1, call gpu_stream_create(s2) call gpu_set_stream(blas_handle,s1) - call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo, 1, t1, 1, e) + call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo%f(1,1), 1, t1%f(1,1), 1, e) call gpu_set_stream(blas_handle,s2) - call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1_8, d_cc_space_v_oovv, 1_8, energy) + call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x%f(1,1,1,1), 1_8, d_cc_space_v_oovv%f(1,1,1,1), 1_8, energy) call gpu_set_stream(blas_handle,gpu_default_stream) call gpu_synchronize() @@ -362,9 +362,9 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau) call gpu_stream_create(stream(b)) call gpu_set_stream(blas_handle,stream(b)) do j=1,nO - call gpu_dgeam_f(blas_handle, 'N', 'N', nO, nV, & + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, & 1.d0, t2%f(1,j,1,b), nO*nO, & - h_t1(j,b), t1%f, nO, & + h_t1(j,b), t1%f(1,1), nO, & tau%f(1,j,1,b), nO*nO) enddo enddo @@ -409,7 +409,7 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x) do b=1,nV do a=1,nV call gpu_set_stream(blas_handle,stream(a)) - call gpu_dgeam_f(blas_handle, 'N', 'N', nO, nO, & + call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, & 2.d0, tau%f(1,1,a,b), nO, & -1.d0, tau%f(1,1,b,a), nO, & tau_x%f(1,1,a,b), nO) diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index 5eb95a06..a3490589 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -37,15 +37,15 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & !$OMP DO do u=1,nO - call gpu_dgeam_f(blas, 'N', 'N', 1, nO*nV*nV, 1.d0, & - tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f, nO, tmp_ovv%f, 1) + call gpu_dgeam(blas, 'N', 'N', 1, nO*nV*nV, 1.d0, & + tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f(1,1,1,1), nO, tmp_ovv%f(1,1,1), 1) do b=1,nV - call gpu_dgeam_f(blas, 'T', 'T', nV, nO, 1.d0, & + call gpu_dgeam(blas, 'T', 'T', nV, nO, 1.d0, & tmp_ovv%f(1,1,b), nO, 0.d0, & tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV) enddo - call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, & - d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_vov%f, nV, & + call gpu_dgemm(blas, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, tmp_vov%f(1,1,1), nV, & 0.d0, tau_kau%f(1,1,u), cholesky_mo_num) enddo !$OMP END DO @@ -59,8 +59,8 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & !$OMP END PARALLEL call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & - tau_kau, cholesky_mo_num*nV, d_cc_space_v_vo_chol, cholesky_mo_num*nV, & - 1.d0, H_oo, nO) + tau_kau%f(1,1,1), cholesky_mo_num*nV, d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num*nV, & + 1.d0, H_oo%f(1,1), nO) call gpu_synchronize() call gpu_deallocate(tau_kau) @@ -103,12 +103,12 @@ subroutine compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, & !$OMP DO do a = 1, nV do b=1,nV - call gpu_dgeam_f(blas, 'N', 'N', nO, nO, 1.d0, & + call gpu_dgeam(blas, 'N', 'N', nO, nO, 1.d0, & tau_x%f(1,1,a,b), nO, 0.d0, & tau_x%f(1,1,a,b), nO, tmp_oov%f(1,1,b), nO) enddo - call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nO,nO*nV,1.d0, & - d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov%f, nO, & + call gpu_dgemm(blas, 'N', 'T', cholesky_mo_num, nO, nO*nV, 1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), 
cholesky_mo_num, tmp_oov%f(1,1,1), nO, & 0.d0, tau_kia%f(1,1,a), cholesky_mo_num) enddo !$OMP END DO @@ -119,9 +119,9 @@ subroutine compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, & !$OMP TASKWAIT !$OMP END PARALLEL - call gpu_dgemm(blas_handle,'T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & - tau_kia, cholesky_mo_num*nO, d_cc_space_v_ov_chol, cholesky_mo_num*nO, & - 1.d0, H_vv, nV) + call gpu_dgemm(blas_handle, 'T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & + tau_kia%f(1,1,1), cholesky_mo_num*nO, d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, & + 1.d0, H_vv%f(1,1), nV) call gpu_synchronize() call gpu_deallocate(tau_kia) @@ -148,20 +148,20 @@ subroutine compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, & call gpu_allocate(tmp_k, cholesky_mo_num) call gpu_dgemm(blas_handle, 'N', 'N', cholesky_mo_num, 1, nO*nV, 2.d0, & - d_cc_space_v_ov_chol, cholesky_mo_num, & - t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num) + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, & + t1%f(1,1), nO*nV, 0.d0, tmp_k%f(1), cholesky_mo_num) - call gpu_dgemm(blas_handle, 'T','N',nV*nO,1,cholesky_mo_num,1.d0, & - d_cc_space_v_vo_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, & - H_vo, nV*nO) + call gpu_dgemm(blas_handle, 'T', 'N', nV*nO, 1, cholesky_mo_num, 1.d0, & + d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num, tmp_k%f(1), cholesky_mo_num, 1.d0, & + H_vo%f(1,1), nV*nO) call gpu_deallocate(tmp_k) call gpu_allocate(tmp, cholesky_mo_num, nO, nO) - call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nO, nO, nV, 1.d0, & - d_cc_space_v_ov_chol, cholesky_mo_num*nO, t1, nO, 0.d0, tmp, cholesky_mo_num*nO) + call gpu_dgemm(blas_handle, 'N', 'T', cholesky_mo_num*nO, nO, nV, 1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, t1%f(1,1), nO, 0.d0, tmp%f(1,1,1), cholesky_mo_num*nO) call gpu_allocate(tmp2, cholesky_mo_num, nO, nO) @@ -174,7 +174,7 @@ subroutine compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, & do i=1,nO do j=1,nO call gpu_set_stream(blas_handle,stream(j)) - call gpu_dgeam_f(blas_handle, 'N', 'N', cholesky_mo_num, 1, 1.d0, & + call gpu_dgeam(blas_handle, 'N', 'N', cholesky_mo_num, 1, 1.d0, & tmp%f(1,i,j), cholesky_mo_num, 0.d0, & tmp%f(1,i,j), cholesky_mo_num, tmp2%f(1,j,i), cholesky_mo_num) enddo @@ -190,8 +190,8 @@ subroutine compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, & call gpu_deallocate(tmp) call gpu_dgemm(blas_handle, 'T','N', nV, nO, cholesky_mo_num*nO, -1.d0, & - d_cc_space_v_ov_chol, cholesky_mo_num*nO, tmp2, cholesky_mo_num*nO, & - 1.d0, H_vo, nV) + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, tmp2%f(1,1,1), cholesky_mo_num*nO, & + 1.d0, H_vo%f(1,1), nV) call gpu_synchronize() call gpu_deallocate(tmp2) diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90 index 2676b339..20d99ede 100644 --- a/src/gpu/gpu_module.F90 +++ b/src/gpu/gpu_module.F90 @@ -156,17 +156,17 @@ module gpu character(c_char), intent(in), value :: transa, transb integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc real(c_float), intent(in), value :: alpha, beta - type(c_ptr), value :: a, b, c + real(c_float) :: a, b, c end subroutine subroutine gpu_dgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, & b, ldb, beta, c, ldc) bind(C, name='gpu_dgemm') import type(c_ptr), value, intent(in) :: handle - character(c_char), intent(in), value :: transa, transb + character(c_char), intent(in) :: transa, transb integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc real(c_double), intent(in), value :: alpha, beta - type(c_ptr), value :: a, b, c + real(c_double) :: a, b, c end subroutine subroutine 
gpu_sgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, & @@ -176,7 +176,7 @@ module gpu character(c_char), intent(in), value :: transa, transb integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc real(c_float), intent(in), value :: alpha, beta - type(c_ptr), value :: a, b, c + real(c_float) :: a, b, c end subroutine end interface @@ -570,7 +570,6 @@ module gpu end subroutine -end module @@ -578,38 +577,20 @@ end module ! --- subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) - use gpu +! use gpu type(gpu_blas), intent(in) :: handle integer*4 :: n, incx, incy - type(gpu_double1), intent(in) :: dx, dy - double precision, intent(out) :: res - call gpu_ddot_c(handle%c, int(n,c_int64_t), dx%c, int(incx,c_int64_t), dy%c, int(incy,c_int64_t), res) -end subroutine - -subroutine gpu_ddot_f(handle, n, dx, incx, dy, incy, res) - use gpu - type(gpu_blas), intent(in) :: handle - integer*4 :: n, incx, incy - double precision, target :: dx(*), dy(*) + double precision, target :: dx, dy double precision, intent(out) :: res call gpu_ddot_c(handle%c, int(n,c_int64_t), c_loc(dx), int(incx,c_int64_t), c_loc(dy), int(incy,c_int64_t), res) end subroutine subroutine gpu_ddot_64(handle, n, dx, incx, dy, incy, res) - use gpu +! use gpu type(gpu_blas), intent(in) :: handle integer*8 :: n, incx, incy - type(gpu_double1), intent(in) :: dx, dy - double precision, intent(out) :: res - call gpu_ddot_c(handle%c, n, dx%c, incx, dy%c, incy, res) -end subroutine - -subroutine gpu_ddot_f_64(handle, n, dx, incx, dy, incy, res) - use gpu - type(gpu_blas), intent(in) :: handle - integer*8 :: n, incx, incy - double precision, target :: dx(*), dy(*) + double precision, target :: dx, dy double precision, intent(out) :: res call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res) end subroutine @@ -620,25 +601,12 @@ end subroutine subroutine gpu_dgeam(handle, transa, transb, m, n, alpha, a, lda, beta, & b, ldb, c, ldc) - use gpu +! use gpu type(gpu_blas), intent(in) :: handle character, intent(in) :: transa, transb integer*4, intent(in) :: m, n, lda, ldb, ldc double precision, intent(in) :: alpha, beta - type(gpu_double2) :: a, b, c - call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, a%c, int(lda,c_int64_t), beta, & - b%c, int(ldb,c_int64_t), c%c, int(ldc,c_int64_t)) -end subroutine - - -subroutine gpu_dgeam_f(handle, transa, transb, m, n, alpha, a, lda, beta, & - b, ldb, c, ldc) - use gpu - type(gpu_blas), intent(in) :: handle - character, intent(in) :: transa, transb - integer*4, intent(in) :: m, n, lda, ldb, ldc - double precision, intent(in) :: alpha, beta - double precision, target :: a(*), b(*), c(*) + double precision, target :: a, b, c call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, & c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t)) end subroutine @@ -646,25 +614,12 @@ end subroutine subroutine gpu_dgeam_64(handle, transa, transb, m, n, alpha, a, lda, beta, & b, ldb, c, ldc) - use gpu +! 
use gpu type(gpu_blas), intent(in) :: handle character, intent(in) :: transa, transb integer*8, intent(in) :: m, n, lda, ldb, ldc double precision, intent(in) :: alpha, beta - type(gpu_double2) :: a, b, c - call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, a%c, int(lda,c_int64_t), beta, & - b%c, int(ldb,c_int64_t), c%c, int(ldc,c_int64_t)) -end subroutine - - -subroutine gpu_dgeam_f_64(handle, transa, transb, m, n, alpha, a, lda, beta, & - b, ldb, c, ldc) - use gpu - type(gpu_blas), intent(in) :: handle - character, intent(in) :: transa, transb - integer*8, intent(in) :: m, n, lda, ldb, ldc - double precision, intent(in) :: alpha, beta - double precision, target :: a(*), b(*), c(*) + double precision, target :: a, b, c call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, & c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t)) end subroutine @@ -675,51 +630,27 @@ end subroutine subroutine gpu_dgemm(handle, transa, transb, m, n, k, alpha, a, lda, & b, ldb, beta, c, ldc) - use gpu +! use gpu type(gpu_blas), intent(in) :: handle character, intent(in) :: transa, transb integer*4, intent(in) :: m, n, k, lda, ldb, ldc double precision, intent(in) :: alpha, beta - type(gpu_double2) :: a, b, c + double precision :: a, b, c call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), & - alpha, a%c, int(lda,c_int64_t), & - b%c, int(ldb,c_int64_t), beta, c%c, int(ldc,c_int64_t)) + alpha, a, int(lda,c_int64_t), & + b, int(ldb,c_int64_t), beta, c, int(ldc,c_int64_t)) end subroutine subroutine gpu_dgemm_64(handle, transa, transb, m, n, k, alpha, a, lda, & b, ldb, beta, c, ldc) - use gpu +! use gpu type(gpu_blas), intent(in) :: handle character, intent(in) :: transa, transb integer*8, intent(in) :: m, n, k, lda, ldb, ldc double precision, intent(in) :: alpha, beta - type(gpu_double2) :: a, b, c - call gpu_dgemm_c(handle%c, transa, transb, m, n, k, & - alpha, a%c, lda, b%c, ldb, beta, c%c, ldc) -end subroutine - -subroutine gpu_dgemm_f(handle, transa, transb, m, n, k, alpha, a, lda, & - b, ldb, beta, c, ldc) - use gpu - type(gpu_blas), intent(in) :: handle - character, intent(in) :: transa, transb - integer*4, intent(in) :: m, n, k, lda, ldb, ldc - double precision, intent(in) :: alpha, beta - double precision, target :: a(*), b(*), c(*) + double precision :: a, b, c call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), & - alpha, c_loc(a), int(lda,c_int64_t), & - c_loc(b), int(ldb,c_int64_t), beta, c_loc(c), int(ldc,c_int64_t)) -end subroutine - -subroutine gpu_dgemm_f_64(handle, transa, transb, m, n, k, alpha, a, lda, & - b, ldb, beta, c, ldc) - use gpu - type(gpu_blas), intent(in) :: handle - character, intent(in) :: transa, transb - integer*8, intent(in) :: m, n, k, lda, ldb, ldc - double precision, intent(in) :: alpha, beta - double precision, target :: a(*), b(*), c(*) - call gpu_dgemm_c(handle%c, transa, transb, m, n, k, & - alpha, c_loc(a), lda, c_loc(b), ldb, beta, c_loc(c), ldc) + alpha, a, int(lda,c_int64_t), b, int(ldb,c_int64_t), beta, c, int(ldc,c_int64_t)) end subroutine +end module From 447cdcd907dd864252777423763ed6947efc32d8 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 2 Jul 2024 17:22:41 +0200 Subject: [PATCH 26/38] Working on r1 --- plugins/local/gpu_nvidia/gpu.c | 32 ++--- src/ccsd/ccsd_space_orb_sub.irp.f | 31 ++++- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 176 
++++++++++++------------- src/gpu/gpu_module.F90 | 57 +++++++- 4 files changed, 182 insertions(+), 114 deletions(-) diff --git a/plugins/local/gpu_nvidia/gpu.c b/plugins/local/gpu_nvidia/gpu.c index 39a82984..e77847a6 100644 --- a/plugins/local/gpu_nvidia/gpu.c +++ b/plugins/local/gpu_nvidia/gpu.c @@ -149,7 +149,7 @@ void gpu_sdot(cublasHandle_t handle, const int64_t n, const float* x, const int6 -void gpu_dgemv(cublasHandle_t handle, const char transa, const int64_t m, const int64_t n, const double alpha, +void gpu_dgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const double alpha, const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) { assert (handle != NULL); @@ -171,14 +171,14 @@ void gpu_dgemv(cublasHandle_t handle, const char transa, const int64_t m, const assert ( (int64_t) incy_ == incy); cublasOperation_t transa_ = CUBLAS_OP_N; - if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; cublasDgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_); } -void gpu_sgemv(cublasHandle_t handle, const char transa, const int64_t m, const int64_t n, const float alpha, +void gpu_sgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const float alpha, const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) { assert (handle != NULL); @@ -200,13 +200,13 @@ void gpu_sgemv(cublasHandle_t handle, const char transa, const int64_t m, const assert ( (int64_t) incy_ == incy); cublasOperation_t transa_ = CUBLAS_OP_N; - if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; cublasSgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_); } -void gpu_dgemm(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, +void gpu_dgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) { assert (handle != NULL); @@ -231,15 +231,15 @@ void gpu_dgemm(cublasHandle_t handle, const char transa, const char transb, cons cublasOperation_t transa_ = CUBLAS_OP_N; cublasOperation_t transb_ = CUBLAS_OP_N; - if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; - if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; + if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T; cublasDgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_); } -void gpu_sgemm(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, +void gpu_sgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) { assert (handle != NULL); @@ -264,14 +264,14 @@ void gpu_sgemm(cublasHandle_t handle, const char transa, const char transb, cons cublasOperation_t transa_ = CUBLAS_OP_N; cublasOperation_t transb_ = CUBLAS_OP_N; - if (transa == 'T' || transa == 't') 
transa_ = CUBLAS_OP_T; - if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; + if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T; cublasSgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_); } -void gpu_dgeam(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, +void gpu_dgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double alpha, const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { assert (handle != NULL); @@ -293,15 +293,15 @@ void gpu_dgeam(cublasHandle_t handle, const char transa, const char transb, cons cublasOperation_t transa_ = CUBLAS_OP_N; cublasOperation_t transb_ = CUBLAS_OP_N; - if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; - if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; + if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T; cublasDgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_); } -void gpu_sgeam(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, +void gpu_sgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float alpha, const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) { assert (handle != NULL); @@ -323,8 +323,8 @@ void gpu_sgeam(cublasHandle_t handle, const char transa, const char transb, cons cublasOperation_t transa_ = CUBLAS_OP_N; cublasOperation_t transb_ = CUBLAS_OP_N; - if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; - if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; + if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T; cublasSgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_); diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index de109cea..256f743b 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -20,8 +20,8 @@ subroutine run_ccsd_space_orb type(gpu_double3) :: d_cc_space_v_oo_chol, d_cc_space_v_vo_chol type(gpu_double3) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol - type(gpu_double4) :: d_cc_space_v_oovv - + type(gpu_double4) :: d_cc_space_v_oovv, d_cc_space_v_voov, d_cc_space_v_ovov + type(gpu_double4) :: d_cc_space_v_oovo double precision, allocatable :: all_err(:,:), all_t(:,:) integer, allocatable :: list_occ(:), list_vir(:) @@ -69,6 +69,7 @@ subroutine run_ccsd_space_orb call gpu_upload(cc_space_f_oo, d_cc_space_f_oo) call gpu_upload(cc_space_f_vo, d_cc_space_f_vo) + call gpu_upload(cc_space_f_ov, d_cc_space_f_ov) call gpu_upload(cc_space_f_vv, d_cc_space_f_vv) ! FREE cc_space_f_oo @@ -92,6 +93,18 @@ subroutine run_ccsd_space_orb ! FREE cc_space_v_vv_chol endif + call gpu_allocate(d_cc_space_v_voov, nV, nO, nO, nV) + call gpu_allocate(d_cc_space_v_ovov, nO, nV, nO, nV) + call gpu_allocate(d_cc_space_v_oovo, nO, nO, nV, nO) + + call gpu_upload(cc_space_v_voov, d_cc_space_v_voov) + call gpu_upload(cc_space_v_ovov, d_cc_space_v_ovov) + call gpu_upload(cc_space_v_oovo, d_cc_space_v_oovo) + +! FREE cc_space_v_voov +! FREE cc_space_v_ovov +! 
FREE cc_space_v_oovo + call gpu_allocate(t2, nO,nO,nV,nV) call gpu_allocate(r2, nO,nO,nV,nV) call gpu_allocate(tau, nO,nO,nV,nV) @@ -185,7 +198,8 @@ subroutine run_ccsd_space_orb call compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, d_cc_space_v_ov_chol,H_vv) call compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol, H_vo) - call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1) + call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_space_f_ov,d_cc_space_f_vo, & + d_cc_space_v_voov, d_cc_space_v_ovov, d_cc_space_v_oovo, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol) call compute_r2_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r2%f,max_r2) else call compute_H_oo(nO,nV,t1%f,t2%f,tau%f,H_oo%f) @@ -292,8 +306,17 @@ subroutine run_ccsd_space_orb call gpu_deallocate(d_cc_space_v_vo_chol) call gpu_deallocate(d_cc_space_v_vv_chol) endif - call gpu_deallocate(d_cc_space_f_vo) + call gpu_deallocate(d_cc_space_v_oovv) + call gpu_deallocate(d_cc_space_v_voov) + call gpu_deallocate(d_cc_space_v_ovov) + call gpu_deallocate(d_cc_space_v_oovo) + + call gpu_deallocate(d_cc_space_f_oo) + call gpu_deallocate(d_cc_space_f_vo) + call gpu_deallocate(d_cc_space_f_ov) + call gpu_deallocate(d_cc_space_f_vv) + call gpu_deallocate(t1) call gpu_deallocate(t2) diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index a3490589..6190e985 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -199,59 +199,52 @@ end ! R1 -subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) - +subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_space_f_ov,d_cc_space_f_vo, & + d_cc_space_v_voov, d_cc_space_v_ovov, d_cc_space_v_oovo, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol) + use gpu implicit none ! in integer, intent(in) :: nO, nV - double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV), tau(nO,nO,nV,nV) - double precision, intent(in) :: H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO) + type(gpu_double2), intent(in) :: t1, H_oo, H_vo, H_vv, d_cc_space_f_ov,d_cc_space_f_vo + type(gpu_double3), intent(in) :: d_cc_space_v_vo_chol, d_cc_space_v_vv_chol + type(gpu_double4), intent(in) :: t2, tau, d_cc_space_v_voov, d_cc_space_v_ovov, d_cc_space_v_oovo ! out - double precision, intent(out) :: r1(nO,nV), max_r1 + type(gpu_double2), intent(out) :: r1 + double precision, intent(out) :: max_r1 ! 
internal integer :: u,i,j,beta,a,b - !$omp parallel & - !$omp shared(nO,nV,r1,cc_space_f_ov) & - !$omp private(u,beta) & - !$omp default(none) - !$omp do - do beta = 1, nV - do u = 1, nO - r1(u,beta) = cc_space_f_ov(u,beta) - enddo - enddo - !$omp end do - !$omp end parallel + call gpu_copy(d_cc_space_f_ov, r1) - double precision, allocatable :: X_oo(:,:) - allocate(X_oo(nO,nO)) - call dgemm('N','N', nO, nO, nV, & - -2d0, t1 , size(t1,1), & - cc_space_f_vo, size(cc_space_f_vo,1), & - 0d0, X_oo , size(X_oo,1)) + type(gpu_double2) :: X_oo + call gpu_allocate(X_oo,nO,nO) - call dgemm('T','N', nO, nV, nO, & - 1d0, X_oo, size(X_oo,2), & - t1 , size(t1,1), & - 1d0, r1 , size(r1,1)) - deallocate(X_oo) + call gpu_dgemm(blas_handle, 'N','N', nO, nO, nV, & + -2d0, t1%f(1,1), size(t1%f,1), & + d_cc_space_f_vo%f(1,1), size(d_cc_space_f_vo%f,1), & + 0d0, X_oo%f(1,1), size(X_oo%f,1)) - call dgemm('N','N', nO, nV, nV, & - 1d0, t1 , size(t1,1), & - H_vv, size(H_vv,1), & - 1d0, r1 , size(r1,1)) + call gpu_dgemm(blas_handle, 'T','N', nO, nV, nO, & + 1d0, X_oo%f(1,1), size(X_oo%f,2), & + t1%f(1,1) , size(t1%f,1), & + 1d0, r1%f(1,1) , size(r1%f,1)) - call dgemm('N','N', nO, nV, nO, & - -1d0, H_oo, size(H_oo,1), & - t1 , size(t1,1), & - 1d0, r1, size(r1,1)) + call gpu_dgemm(blas_handle, 'N','N', nO, nV, nV, & + 1d0, t1%f(1,1) , size(t1%f,1), & + H_vv%f(1,1), size(H_vv%f,1), & + 1d0, r1%f(1,1) , size(r1%f,1)) + + call gpu_dgemm(blas_handle, 'N','N', nO, nV, nO, & + -1d0, H_oo%f(1,1), size(H_oo%f,1), & + t1%f(1,1) , size(t1%f,1), & + 1d0, r1%f(1,1), size(r1%f,1)) + + type(gpu_double4) :: X_voov + call gpu_allocate(X_voov, nV, nO, nO, nV) - double precision, allocatable :: X_voov(:,:,:,:) - allocate(X_voov(nV, nO, nO, nV)) !$omp parallel & !$omp shared(nO,nV,X_voov,t2,t1) & @@ -262,7 +255,7 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) do u = 1, nO do i = 1, nO do a = 1, nV - X_voov(a,i,u,beta) = 2d0 * t2(i,u,a,beta) - t2(u,i,a,beta) + t1(u,a) * t1(i,beta) + X_voov%f(a,i,u,beta) = 2d0 * t2%f(i,u,a,beta) - t2%f(u,i,a,beta) + t1%f(u,a) * t1%f(i,beta) enddo enddo enddo @@ -270,18 +263,20 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) !$omp end do !$omp end parallel - call dgemv('T', nV*nO, nO*nV, & - 1d0, X_voov, size(X_voov,1) * size(X_voov,2), & - H_vo , 1, & - 1d0, r1 , 1) + call gpu_dgemv(blas_handle, 'T', nV*nO, nO*nV, & + 1d0, X_voov%f(1,1,1,1), size(X_voov%f,1) * size(X_voov%f,2), & + H_vo%f(1,1) , 1, & + 1d0, r1%f(1,1) , 1) - deallocate(X_voov) + call gpu_synchronize() + call gpu_deallocate(X_oo) + call gpu_deallocate(X_voov) - double precision, allocatable :: X_ovov(:,:,:,:) - allocate(X_ovov(nO, nV, nO, nV)) + type(gpu_double4) :: X_ovov + call gpu_allocate(X_ovov, nO, nV, nO, nV) !$omp parallel & - !$omp shared(nO,nV,cc_space_v_ovov,cc_space_v_voov,X_ovov) & + !$omp shared(nO,nV,d_cc_space_v_ovov,d_cc_space_v_voov,X_ovov) & !$omp private(u,beta,i,a) & !$omp default(none) !$omp do @@ -289,7 +284,7 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) do u = 1, nO do a = 1, nv do i = 1, nO - X_ovov(i,a,u,beta) = 2d0 * cc_space_v_voov(a,u,i,beta) - cc_space_v_ovov(u,a,i,beta) + X_ovov%f(i,a,u,beta) = 2d0 * d_cc_space_v_voov%f(a,u,i,beta) - d_cc_space_v_ovov%f(u,a,i,beta) enddo enddo enddo @@ -297,17 +292,25 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) !$omp end do !$omp end parallel - call dgemv('T', nO*nV, nO*nV, & - 1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), & - t1 , 1, & - 1d0, r1 , 
1) - - deallocate(X_ovov) +! call dgemv('T', nO*nV, nO*nV, & +! 1d0, X_ovov%f, size(X_ovov%f,1) * size(X_ovov%f,2), & +! t1%f, 1, & +! 1d0, r1%f, 1) + call gpu_dgemv(blas_handle, 'T', nO*nV, nO*nV, & + 1d0, X_ovov%f(1,1,1,1), size(X_ovov%f,1) * size(X_ovov%f,2), & + t1%f(1,1), 1, & + 1d0, r1%f(1,1), 1) integer :: iblock, block_size, nVmax - double precision, allocatable :: W_vvov(:,:,:,:), W_vvov_tmp(:,:,:,:), T_vvoo(:,:,:,:) + type(gpu_double4) :: W_vvov, W_vvov_tmp, T_vvoo + block_size = 16 - allocate(W_vvov(nV,nV,nO,block_size), W_vvov_tmp(nV,nO,nV,block_size), T_vvoo(nV,nV,nO,nO)) + call gpu_allocate(W_vvov,nV, nV,nO,block_size) + call gpu_allocate(W_vvov_tmp, nV,nO,nV,block_size) + call gpu_allocate(T_vvoo, nV,nV,nO,nO) + + call gpu_synchronize() + call gpu_deallocate(X_ovov) !$omp parallel & !$omp private(u,i,b,a) & @@ -317,7 +320,7 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) do i = 1, nO do b = 1, nV do a = 1, nV - T_vvoo(a,b,i,u) = tau(i,u,a,b) + T_vvoo%f(a,b,i,u) = tau%f(i,u,a,b) enddo enddo enddo @@ -328,11 +331,12 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) do iblock = 1, nV, block_size nVmax = min(block_size,nV-iblock+1) - call dgemm('T','N', nV*nO, nV*nVmax, cholesky_mo_num, 1.d0, & - cc_space_v_vo_chol , cholesky_mo_num, & - cc_space_v_vv_chol(1,1,iblock), cholesky_mo_num, & - 0.d0, W_vvov_tmp, nV*nO) + call gpu_dgemm(blas_handle, 'T','N', nV*nO, nV*nVmax, cholesky_mo_num, 1.d0, & + d_cc_space_v_vo_chol%f(1,1,1) , cholesky_mo_num, & + d_cc_space_v_vv_chol%f(1,1,iblock), cholesky_mo_num, & + 0.d0, W_vvov_tmp%f(1,1,1,1), nV*nO) + call gpu_synchronize() !$omp parallel & !$omp private(b,i,a,beta) & !$omp default(shared) @@ -341,7 +345,7 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) !$omp do do b = 1, nV do a = 1, nV - W_vvov(a,b,i,beta) = 2d0 * W_vvov_tmp(a,i,b,beta) - W_vvov_tmp(b,i,a,beta) + W_vvov%f(a,b,i,beta) = 2d0 * W_vvov_tmp%f(a,i,b,beta) - W_vvov_tmp%f(b,i,a,beta) enddo enddo !$omp end do nowait @@ -350,20 +354,22 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) !$omp barrier !$omp end parallel - call dgemm('T','N',nO,nVmax,nO*nV*nV, & - 1d0, T_vvoo, nV*nV*nO, & - W_vvov, nO*nV*nV, & - 1d0, r1(1,iblock), nO) + call gpu_dgemm(blas_handle, 'T','N',nO,nVmax,nO*nV*nV, & + 1d0, T_vvoo%f(1,1,1,1), nV*nV*nO, & + W_vvov%f(1,1,1,1), nO*nV*nV, & + 1d0, r1%f(1,iblock), nO) enddo - deallocate(W_vvov,T_vvoo) + call gpu_synchronize() + call gpu_deallocate(W_vvov) + call gpu_deallocate(T_vvoo) - double precision, allocatable :: W_oovo(:,:,:,:) - allocate(W_oovo(nO,nO,nV,nO)) + type(gpu_double4) :: W_oovo + call gpu_allocate(W_oovo, nO,nO,nV,nO) !$omp parallel & - !$omp shared(nO,nV,cc_space_v_oovo,W_oovo) & + !$omp shared(nO,nV,d_cc_space_v_oovo,W_oovo) & !$omp private(u,a,i,j) & !$omp default(none) do u = 1, nO @@ -371,8 +377,7 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) do a = 1, nV do j = 1, nO do i = 1, nO -! 
W_oovo(i,j,a,u) = 2d0 * cc_space_v_vooo(a,u,i,j) - cc_space_v_vooo(a,u,j,i) - W_oovo(i,j,a,u) = 2d0 * cc_space_v_oovo(i,j,a,u) - cc_space_v_oovo(j,i,a,u) + W_oovo%f(i,j,a,u) = 2d0 * d_cc_space_v_oovo%f(i,j,a,u) - d_cc_space_v_oovo%f(j,i,a,u) enddo enddo enddo @@ -380,33 +385,22 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) enddo !$omp end parallel - call dgemm('T','N', nO, nV, nO*nO*nV, & - -1d0, W_oovo, size(W_oovo,1) * size(W_oovo,2) * size(W_oovo,3), & - tau , size(tau,1) * size(tau,2) * size(tau,3), & - 1d0, r1 , size(r1,1)) + ! Change the sign for consistency with the code in spin orbitals + call gpu_dgemm(blas_handle, 'T','N', nO, nV, nO*nO*nV, & + 1d0, W_oovo%f(1,1,1,1), size(W_oovo%f,1) * size(W_oovo%f,2) * size(W_oovo%f,3), & + tau%f(1,1,1,1), size(tau%f,1) * size(tau%f,2) * size(tau%f,3), & + -1d0, r1%f(1,1), size(r1%f,1)) - deallocate(W_oovo) + call gpu_synchronize() + call gpu_deallocate(W_oovo) max_r1 = 0d0 do a = 1, nV do i = 1, nO - max_r1 = max(dabs(r1(i,a)), max_r1) + max_r1 = max(dabs(r1%f(i,a)), max_r1) enddo enddo - ! Change the sign for consistency with the code in spin orbitals - !$omp parallel & - !$omp shared(nO,nV,r1) & - !$omp private(a,i) & - !$omp default(none) - !$omp do - do a = 1, nV - do i = 1, nO - r1(i,a) = -r1(i,a) - enddo - enddo - !$omp end do - !$omp end parallel end diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90 index 20d99ede..949ae4fc 100644 --- a/src/gpu/gpu_module.F90 +++ b/src/gpu/gpu_module.F90 @@ -136,7 +136,7 @@ module gpu type(c_ptr), value, intent(in) :: handle integer(c_int64_t), value :: n, incx, incy type(c_ptr), intent(in), value :: dx, dy - real(c_float), intent(out) :: res + real(c_float), intent(out) :: res end subroutine subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, & @@ -145,7 +145,7 @@ module gpu type(c_ptr), value, intent(in) :: handle character(c_char), intent(in), value :: transa, transb integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc - real(c_double), intent(in), value :: alpha, beta + real(c_double), intent(in), value :: alpha, beta type(c_ptr), value :: a, b, c end subroutine @@ -155,10 +155,31 @@ module gpu type(c_ptr), value, intent(in) :: handle character(c_char), intent(in), value :: transa, transb integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc - real(c_float), intent(in), value :: alpha, beta + real(c_float), intent(in), value :: alpha, beta real(c_float) :: a, b, c end subroutine + subroutine gpu_dgemv_c(handle, transa, m, n, alpha, a, lda, & + x, incx, beta, y, incy) bind(C, name='gpu_dgemv') + import + type(c_ptr), value, intent(in) :: handle + character(c_char), intent(in) :: transa + integer(c_int64_t), intent(in), value :: m, n, lda, incx, incy + real(c_double), intent(in), value :: alpha, beta + real(c_double) :: a, x, y + end subroutine + + subroutine gpu_sgemv_c(handle, transa, m, n, alpha, a, lda, & + x, incx, beta, y, incy) bind(C, name='gpu_sgemv') + import + type(c_ptr), value, intent(in) :: handle + character(c_char), intent(in) :: transa + integer(c_int64_t), intent(in), value :: m, n, lda, incx, incy + real(c_float), intent(in), value :: alpha, beta + real(c_float) :: a, x, y + end subroutine + + subroutine gpu_dgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, & b, ldb, beta, c, ldc) bind(C, name='gpu_dgemm') import @@ -625,6 +646,36 @@ subroutine gpu_dgeam_64(handle, transa, transb, m, n, alpha, a, lda, beta, & end subroutine +! gemv +! 
---- + +subroutine gpu_dgemv(handle, transa, m, n, alpha, a, lda, & + x, incx, beta, y, incy) +! use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa + integer*4, intent(in) :: m, n, lda, incx, incy + double precision, intent(in) :: alpha, beta + double precision :: a, x, y + call gpu_dgemv_c(handle%c, transa, int(m,c_int64_t), int(n,c_int64_t), & + alpha, a, int(lda,c_int64_t), & + x, int(incx,c_int64_t), beta, y, int(incy,c_int64_t)) +end subroutine + +subroutine gpu_dgemv_64(handle, transa, m, n, alpha, a, lda, & + x, incx, beta, y, incy) +! use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa + integer*8, intent(in) :: m, n, lda, incx, incy + double precision, intent(in) :: alpha, beta + double precision :: a, x, y + call gpu_dgemv_c(handle%c, transa, int(m,c_int64_t), int(n,c_int64_t), & + alpha, a, int(lda,c_int64_t), & + x, int(incx,c_int64_t), beta, y, int(incy,c_int64_t)) +end subroutine + + ! gemm ! ---- From 92fe3a6f84b0af99bf554602528969699c206cde Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 2 Jul 2024 18:36:19 +0200 Subject: [PATCH 27/38] Working on r1 --- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 37 ++++++++++++++++++++------ 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index 6190e985..e0048637 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -245,23 +245,44 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_s type(gpu_double4) :: X_voov call gpu_allocate(X_voov, nV, nO, nO, nV) + type(gpu_stream) :: stream(nV) + + do a=1,nV + call gpu_stream_create(stream(a)) + enddo + + call gpu_synchronize() + +! do i=1,nO +! do beta=1,nV +! call gpu_set_stream(blas_handle, stream(beta)) +! call gpu_dgeam(blas_handle, 'T', 'T', nV, nO, -1.d0, t2%f(1,i,1,beta), & +! nO*nO, t1%f(i,beta), t1%f(1,1), nO, X_voov%f(1,i,1,beta), nV) +! enddo +! 
enddo - !$omp parallel & - !$omp shared(nO,nV,X_voov,t2,t1) & - !$omp private(u,beta,i,a) & - !$omp default(none) - !$omp do do beta = 1, nV do u = 1, nO do i = 1, nO do a = 1, nV - X_voov%f(a,i,u,beta) = 2d0 * t2%f(i,u,a,beta) - t2%f(u,i,a,beta) + t1%f(u,a) * t1%f(i,beta) + X_voov%f(a,i,u,beta) = - t2%f(u,i,a,beta) + t1%f(u,a) * t1%f(i,beta) enddo enddo enddo enddo - !$omp end do - !$omp end parallel + call gpu_synchronize() + + do beta=1,nV + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'T', nV, nO*nO, 1.d0, X_voov%f(1,1,1,beta), & + nV, 2.d0, t2%f(1,1,1,beta), nO*nO, X_voov%f(1,1,1,beta), nV) + enddo + + call gpu_synchronize() + do a=1,nV + call gpu_stream_destroy(stream(a)) + enddo + call gpu_set_stream(blas_handle, gpu_default_stream) call gpu_dgemv(blas_handle, 'T', nV*nO, nO*nV, & 1d0, X_voov%f(1,1,1,1), size(X_voov%f,1) * size(X_voov%f,2), & From cc09f8c61a0e8a29e7d2a2933d9659c1ad70a7b5 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Wed, 3 Jul 2024 14:52:11 +0200 Subject: [PATCH 28/38] Minor changes in Cholesky --- src/ao_two_e_ints/cholesky.irp.f | 6 +++--- src/mo_two_e_ints/map_integrals.irp.f | 2 +- src/tools/four_idx_transform.irp.f | 3 +++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f index acb0872b..a2d9d043 100644 --- a/src/ao_two_e_ints/cholesky.irp.f +++ b/src/ao_two_e_ints/cholesky.irp.f @@ -158,9 +158,9 @@ END_PROVIDER Lset(np8) = p8 endif enddo - np = np8 + if (np8 > ndim8) stop 'np>ndim8' + np = int(np8,4) if (np <= 0) stop 'np<=0' - if (np > ndim8) stop 'np>ndim8' rank_max = min(np,20*elec_num*elec_num) call mmap(trim(ezfio_work_dir)//'cholesky_ao_tmp', (/ ndim8, rank_max /), 8, fd(1), .False., .True., c_pointer(1)) @@ -431,7 +431,7 @@ END_PROVIDER Lset(np8) = p8 endif enddo - np = np8 + np = int(np8,4) enddo diff --git a/src/mo_two_e_ints/map_integrals.irp.f b/src/mo_two_e_ints/map_integrals.irp.f index 168c34b4..eeb4279f 100644 --- a/src/mo_two_e_ints/map_integrals.irp.f +++ b/src/mo_two_e_ints/map_integrals.irp.f @@ -40,7 +40,7 @@ end ! Min and max values of the MOs for which the integrals are in the cache END_DOC - mo_integrals_cache_size = 2_8**mo_integrals_cache_shift + mo_integrals_cache_size = 2**mo_integrals_cache_shift mo_integrals_cache_min = max(1,elec_alpha_num - (mo_integrals_cache_size/2 - 1) ) mo_integrals_cache_max = min(mo_num, mo_integrals_cache_min + mo_integrals_cache_size - 1) diff --git a/src/tools/four_idx_transform.irp.f b/src/tools/four_idx_transform.irp.f index 92e87cad..fc6bface 100644 --- a/src/tools/four_idx_transform.irp.f +++ b/src/tools/four_idx_transform.irp.f @@ -12,6 +12,9 @@ program four_idx_transform ! END_DOC + if (do_mo_cholesky) then + stop 'Not implemented with Cholesky integrals' + endif io_mo_two_e_integrals = 'Write' SOFT_TOUCH io_mo_two_e_integrals if (.true.) 
then From 2f8e7bd4f79108476bcc1b04165912d629d7d924 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Wed, 3 Jul 2024 15:32:38 +0200 Subject: [PATCH 29/38] Updated to read Cholesky MO integrals from TREXIO --- src/trexio/export_trexio_routines.irp.f | 2 +- src/trexio/import_trexio_integrals.irp.f | 226 ++++++++++++++++------- 2 files changed, 159 insertions(+), 69 deletions(-) diff --git a/src/trexio/export_trexio_routines.irp.f b/src/trexio/export_trexio_routines.irp.f index 63630243..0eec68bd 100644 --- a/src/trexio/export_trexio_routines.irp.f +++ b/src/trexio/export_trexio_routines.irp.f @@ -557,7 +557,7 @@ subroutine export_trexio(update,full_path) do k=1,cholesky_ao_num do j=1,mo_num do i=1,mo_num - integral = cholesky_mo(i,j,k) + integral = cholesky_mo_transp(k,i,j) if (integral == 0.d0) cycle icount += 1_8 chol_buffer(icount) = integral diff --git a/src/trexio/import_trexio_integrals.irp.f b/src/trexio/import_trexio_integrals.irp.f index 5a6b3c03..556ed7bc 100644 --- a/src/trexio/import_trexio_integrals.irp.f +++ b/src/trexio/import_trexio_integrals.irp.f @@ -28,7 +28,7 @@ subroutine run(f) integer(trexio_t), intent(in) :: f ! TREXIO file handle integer(trexio_exit_code) :: rc - integer ::i,j,k,l + integer :: i,j,k,l, iunit integer(8) :: m, n_integrals double precision :: integral @@ -41,10 +41,12 @@ subroutine run(f) integer , allocatable :: Vi(:,:) double precision :: s -! TODO: -! - If Cholesky AO in trexio file, read cholesky ao vectors -! - If Cholesky MO in trexio file, read cholesky mo vectors -! - If Cholesky MO not in trexio file, force do_cholesky_mo to False + integer*4 :: BUFSIZE + integer :: rank + double precision, allocatable :: tmp(:,:,:) + integer*8 :: offset, icount + + integer, external :: getUnitAndOpen if (trexio_has_nucleus_repulsion(f) == TREXIO_SUCCESS) then rc = trexio_read_nucleus_repulsion(f, s) @@ -120,45 +122,88 @@ subroutine run(f) rc = trexio_has_ao_2e_int(f) PROVIDE ao_num if (rc /= TREXIO_HAS_NOT) then - PROVIDE ao_integrals_map - integer*4 :: BUFSIZE - BUFSIZE=ao_num**2 - allocate(buffer_i(BUFSIZE), buffer_values(BUFSIZE)) - allocate(Vi(4,BUFSIZE), V(BUFSIZE)) + rc = trexio_has_ao_2e_int_eri_cholesky(f) + if (rc /= TREXIO_HAS_NOT) then - integer*8 :: offset, icount + rc = trexio_read_ao_2e_int_eri_cholesky_num(f, rank) + call trexio_assert(rc, TREXIO_SUCCESS) - offset = 0_8 - icount = BUFSIZE - rc = TREXIO_SUCCESS - do while (icount == size(V)) - rc = trexio_read_ao_2e_int_eri(f, offset, icount, Vi, V) - do m=1,icount - i = Vi(1,m) - j = Vi(2,m) - k = Vi(3,m) - l = Vi(4,m) - integral = V(m) - call two_e_integrals_index(i, j, k, l, buffer_i(m) ) - buffer_values(m) = integral - enddo - call insert_into_ao_integrals_map(int(icount,4),buffer_i,buffer_values) - offset = offset + icount - if (rc /= TREXIO_SUCCESS) then - exit - endif - end do - n_integrals = offset + allocate(tmp(ao_num,ao_num,rank)) + tmp(:,:,:) = 0.d0 - call map_sort(ao_integrals_map) - call map_unique(ao_integrals_map) + BUFSIZE=ao_num**2 + allocate(Vi(3,BUFSIZE), V(BUFSIZE)) - call map_save_to_disk(trim(ezfio_filename)//'/work/ao_ints',ao_integrals_map) - call ezfio_set_ao_two_e_ints_io_ao_two_e_integrals('Read') - deallocate(buffer_i, buffer_values, Vi, V) - print *, 'AO integrals read from TREXIO file' + offset = 0_8 + icount = BUFSIZE + rc = TREXIO_SUCCESS + do while (icount == size(V)) + rc = trexio_read_ao_2e_int_eri_cholesky(f, offset, icount, Vi, V) + do m=1,icount + i = Vi(1,m) + j = Vi(2,m) + k = Vi(3,m) + integral = V(m) + tmp(i,j,k) = integral + enddo + offset = offset + 
icount + if (rc /= TREXIO_SUCCESS) then + exit + endif + end do + + print *, 'Writing Cholesky AO vectors to disk...' + iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_ao', 'W') + write(iunit) rank + write(iunit) tmp(:,:,:) + close(iunit) + call ezfio_set_ao_two_e_ints_io_ao_cholesky('Read') + + deallocate(Vi, V, tmp) + print *, 'Cholesky AO integrals read from TREXIO file' + endif + + rc = trexio_has_ao_2e_int_eri(f) + if (rc /= TREXIO_HAS_NOT) then + PROVIDE ao_integrals_map + + BUFSIZE=ao_num**2 + allocate(buffer_i(BUFSIZE), buffer_values(BUFSIZE)) + allocate(Vi(4,BUFSIZE), V(BUFSIZE)) + + offset = 0_8 + icount = BUFSIZE + rc = TREXIO_SUCCESS + do while (icount == size(V)) + rc = trexio_read_ao_2e_int_eri(f, offset, icount, Vi, V) + do m=1,icount + i = Vi(1,m) + j = Vi(2,m) + k = Vi(3,m) + l = Vi(4,m) + integral = V(m) + call two_e_integrals_index(i, j, k, l, buffer_i(m) ) + buffer_values(m) = integral + enddo + call insert_into_ao_integrals_map(int(icount,4),buffer_i,buffer_values) + offset = offset + icount + if (rc /= TREXIO_SUCCESS) then + exit + endif + end do + n_integrals = offset + + call map_sort(ao_integrals_map) + call map_unique(ao_integrals_map) + + call map_save_to_disk(trim(ezfio_filename)//'/work/ao_ints',ao_integrals_map) + call ezfio_set_ao_two_e_ints_io_ao_two_e_integrals('Read') + + deallocate(buffer_i, buffer_values, Vi, V) + print *, 'AO integrals read from TREXIO file' + endif else print *, 'AO integrals not found in TREXIO file' endif @@ -186,40 +231,85 @@ subroutine run(f) rc = trexio_has_mo_2e_int(f) if (rc /= TREXIO_HAS_NOT) then - BUFSIZE=mo_num**2 - allocate(buffer_i(BUFSIZE), buffer_values(BUFSIZE)) - allocate(Vi(4,BUFSIZE), V(BUFSIZE)) + rc = trexio_has_mo_2e_int_eri_cholesky(f) + if (rc /= TREXIO_HAS_NOT) then + + rc = trexio_read_mo_2e_int_eri_cholesky_num(f, rank) + call trexio_assert(rc, TREXIO_SUCCESS) + + allocate(tmp(rank,mo_num,mo_num)) + tmp(:,:,:) = 0.d0 + + BUFSIZE=mo_num**2 + allocate(Vi(3,BUFSIZE), V(BUFSIZE)) + + offset = 0_8 + icount = BUFSIZE + rc = TREXIO_SUCCESS + do while (icount == size(V)) + rc = trexio_read_mo_2e_int_eri_cholesky(f, offset, icount, Vi, V) + do m=1,icount + i = Vi(1,m) + j = Vi(2,m) + k = Vi(3,m) + integral = V(m) + tmp(k,i,j) = integral + enddo + offset = offset + icount + if (rc /= TREXIO_SUCCESS) then + exit + endif + end do + + print *, 'Writing Cholesky MO vectors to disk...' 
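+ ! Note (descriptive comment, not part of the upstream patch): tmp was
+ ! filled above as tmp(k,i,j), with the Cholesky index k first, so the
+ ! file written below already holds the transposed layout that the
+ ! 'cholesky_mo_transp' file name suggests; the AO branch earlier in this
+ ! routine stores tmp(i,j,k) and writes 'cholesky_ao' instead.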
+ iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_mo_transp', 'W') + write(iunit) rank + write(iunit) tmp(:,:,:) + close(iunit) + call ezfio_set_mo_two_e_ints_io_mo_cholesky('Read') + + deallocate(Vi, V, tmp) + print *, 'Cholesky MO integrals read from TREXIO file' + endif + + rc = trexio_has_mo_2e_int_eri(f) + if (rc /= TREXIO_HAS_NOT) then + BUFSIZE=mo_num**2 + allocate(buffer_i(BUFSIZE), buffer_values(BUFSIZE)) + allocate(Vi(4,BUFSIZE), V(BUFSIZE)) - offset = 0_8 - icount = BUFSIZE - rc = TREXIO_SUCCESS - do while (icount == size(V)) - rc = trexio_read_mo_2e_int_eri(f, offset, icount, Vi, V) - do m=1,icount - i = Vi(1,m) - j = Vi(2,m) - k = Vi(3,m) - l = Vi(4,m) - integral = V(m) - call two_e_integrals_index(i, j, k, l, buffer_i(m) ) - buffer_values(m) = integral - enddo - call map_append(mo_integrals_map, buffer_i, buffer_values, int(icount,4)) - offset = offset + icount - if (rc /= TREXIO_SUCCESS) then - exit - endif - end do - n_integrals = offset + offset = 0_8 + icount = BUFSIZE + rc = TREXIO_SUCCESS + do while (icount == size(V)) + rc = trexio_read_mo_2e_int_eri(f, offset, icount, Vi, V) + do m=1,icount + i = Vi(1,m) + j = Vi(2,m) + k = Vi(3,m) + l = Vi(4,m) + integral = V(m) + call two_e_integrals_index(i, j, k, l, buffer_i(m) ) + buffer_values(m) = integral + enddo + call map_append(mo_integrals_map, buffer_i, buffer_values, int(icount,4)) + offset = offset + icount + if (rc /= TREXIO_SUCCESS) then + exit + endif + end do + n_integrals = offset - call map_sort(mo_integrals_map) - call map_unique(mo_integrals_map) + call map_sort(mo_integrals_map) + call map_unique(mo_integrals_map) + + call map_save_to_disk(trim(ezfio_filename)//'/work/mo_ints',mo_integrals_map) + call ezfio_set_mo_two_e_ints_io_mo_two_e_integrals('Read') + deallocate(buffer_i, buffer_values, Vi, V) + print *, 'MO integrals read from TREXIO file' + endif - call map_save_to_disk(trim(ezfio_filename)//'/work/mo_ints',mo_integrals_map) - call ezfio_set_mo_two_e_ints_io_mo_two_e_integrals('Read') - deallocate(buffer_i, buffer_values, Vi, V) - print *, 'MO integrals read from TREXIO file' else print *, 'MO integrals not found in TREXIO file' endif From 7ceb8fdcca5cd7bff3984c816eef8e47aa681a0b Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Wed, 3 Jul 2024 18:24:13 +0200 Subject: [PATCH 30/38] Finished r1 --- plugins/local/gpu_nvidia/gpu.c | 41 +++--- plugins/local/gpu_x86/gpu.c | 112 ++++++++-------- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 177 ++++++++++--------------- src/gpu/gpu.h | 24 ++-- src/gpu/gpu_module.F90 | 12 +- 5 files changed, 162 insertions(+), 204 deletions(-) diff --git a/plugins/local/gpu_nvidia/gpu.c b/plugins/local/gpu_nvidia/gpu.c index e77847a6..a775ab95 100644 --- a/plugins/local/gpu_nvidia/gpu.c +++ b/plugins/local/gpu_nvidia/gpu.c @@ -116,11 +116,6 @@ void gpu_ddot(cublasHandle_t handle, const int64_t n, const double* x, const int assert ( (int64_t) incy_ == incy); cublasStatus_t rc = cublasDdot(handle, n_, x, incx_, y, incy_, result); -/* - double alpha = 1.0; - double beta = 0.0; - cublasStatus_t rc = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, n_, &alpha, x, 1, y, n_, &beta, &result_, 1); -*/ assert (rc == CUBLAS_STATUS_SUCCESS); } @@ -149,8 +144,8 @@ void gpu_sdot(cublasHandle_t handle, const int64_t n, const float* x, const int6 -void gpu_dgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const double alpha, - const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) { 
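+/* Descriptive comment, not part of the upstream patch: alpha and beta are
+   now taken by pointer, and the wrapper forwards the host addresses to
+   cublasDgemv directly instead of passing &alpha / &beta to a by-value
+   argument. Presumably the matching Fortran bindings in
+   src/gpu/gpu_module.F90 (also touched by this patch, per the diffstat)
+   now pass these scalars by reference. */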
+void gpu_dgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) { assert (handle != NULL); @@ -173,13 +168,13 @@ void gpu_dgemv(cublasHandle_t handle, const char* transa, const int64_t m, const cublasOperation_t transa_ = CUBLAS_OP_N; if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; - cublasDgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_); + cublasDgemv(handle, transa_, m_, n_, alpha, a, lda_, x, incx_, beta, y, incy_); } -void gpu_sgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const float alpha, - const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) { +void gpu_sgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const float* alpha, + const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy) { assert (handle != NULL); @@ -202,12 +197,12 @@ void gpu_sgemv(cublasHandle_t handle, const char* transa, const int64_t m, const cublasOperation_t transa_ = CUBLAS_OP_N; if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; - cublasSgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_); + cublasSgemv(handle, transa_, m_, n_, alpha, a, lda_, x, incx_, beta, y, incy_); } -void gpu_dgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, - const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) { +void gpu_dgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha, + const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) { assert (handle != NULL); @@ -234,13 +229,13 @@ void gpu_dgemm(cublasHandle_t handle, const char* transa, const char* transb, co if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T; - cublasDgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_); + cublasDgemm(handle, transa_, transb_, m_, n_, k_, alpha, a, lda_, b, ldb_, beta, c, ldc_); } -void gpu_sgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, - const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) { +void gpu_sgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float* alpha, + const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc) { assert (handle != NULL); @@ -267,12 +262,12 @@ void gpu_sgemm(cublasHandle_t handle, const char* transa, const char* transb, co if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T; - cublasSgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_); + cublasSgemm(handle, transa_, transb_, m_, n_, k_, alpha, a, lda_, b, ldb_, beta, c, ldc_); } -void gpu_dgeam(cublasHandle_t handle, const char* transa, const char* 
transb, const int64_t m, const int64_t n, const double alpha, - const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { +void gpu_dgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { assert (handle != NULL); /* Convert to int */ @@ -296,13 +291,13 @@ void gpu_dgeam(cublasHandle_t handle, const char* transa, const char* transb, co if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T; - cublasDgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_); + cublasDgeam(handle, transa_, transb_, m_, n_, alpha, a, lda_, beta, b, ldb_, c, ldc_); } -void gpu_sgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float alpha, - const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) { +void gpu_sgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float* alpha, + const float* a, const int64_t lda, const float* beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) { assert (handle != NULL); /* Convert to int */ @@ -326,6 +321,6 @@ void gpu_sgeam(cublasHandle_t handle, const char* transa, const char* transb, co if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T; - cublasSgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_); + cublasSgeam(handle, transa_, transb_, m_, n_, alpha, a, lda_, beta, b, ldb_, c, ldc_); } diff --git a/plugins/local/gpu_x86/gpu.c b/plugins/local/gpu_x86/gpu.c index fe3cadc5..49aec9d3 100644 --- a/plugins/local/gpu_x86/gpu.c +++ b/plugins/local/gpu_x86/gpu.c @@ -124,8 +124,8 @@ void gpu_sdot(void* handle, const int64_t n, const float* x, const int64_t incx, void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha, const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy); -void gpu_dgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const double alpha, - const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) { +void gpu_dgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) { assert (handle != NULL); @@ -145,15 +145,15 @@ void gpu_dgemv(void* handle, const char* transa, const int64_t m, const int64_t assert ( (int64_t) incx_ == incx); assert ( (int64_t) incy_ == incy); - dgemv_(transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_); + dgemv_(transa, &m_, &n_, alpha, a, &lda_, x, &incx_, beta, y, &incy_); } void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha, const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy); -void gpu_sgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const float alpha, - const float* a, const int64_t lda, const float* x, const int64_t incx, const 
float beta, float* y, const int64_t incy) { +void gpu_sgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const float* alpha, + const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy) { assert (handle != NULL); @@ -173,15 +173,15 @@ void gpu_sgemv(void* handle, const char* transa, const int64_t m, const int64_t assert ( (int64_t) incx_ == incx); assert ( (int64_t) incy_ == incy); - sgemv_(transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_); + sgemv_(transa, &m_, &n_, alpha, a, &lda_, x, &incx_, beta, y, &incy_); } void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha, const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc); -void gpu_dgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, - const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) { +void gpu_dgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha, + const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) { assert (handle != NULL); @@ -203,7 +203,7 @@ void gpu_dgemm(void* handle, const char* transa, const char* transb, const int64 assert ( (int64_t) ldb_ == ldb); assert ( (int64_t) ldc_ == ldc); - dgemm_(transa, transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_); + dgemm_(transa, transb, &m_, &n_, &k_, alpha, a, &lda_, b, &ldb_, beta, c, &ldc_); } @@ -211,8 +211,8 @@ void gpu_dgemm(void* handle, const char* transa, const char* transb, const int64 void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha, const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc); -void gpu_sgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, - const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) { +void gpu_sgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float* alpha, + const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc) { assert (handle != NULL); @@ -234,12 +234,12 @@ void gpu_sgemm(void* handle, const char* transa, const char* transb, const int64 assert ( (int64_t) ldb_ == ldb); assert ( (int64_t) ldc_ == ldc); - sgemm_(transa, transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_); + sgemm_(transa, transb, &m_, &n_, &k_, alpha, a, &lda_, b, &ldb_, beta, c, &ldc_); } -void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double alpha, - const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { +void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { assert (handle != NULL); if ( (*transa == 'N' 
&& *transb == 'N') || @@ -247,19 +247,19 @@ void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64 (*transa == 'N' && *transb == 'n') || (*transa == 'n' && *transb == 'n') ) { - if (alpha == 0.) { + if (*alpha == 0.) { for (int64_t j=0 ; j Date: Thu, 4 Jul 2024 12:01:16 +0200 Subject: [PATCH 31/38] Working on r2 --- src/ccsd/ccsd_space_orb_sub.irp.f | 27 +- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 353 +++++++++++++------------ 2 files changed, 200 insertions(+), 180 deletions(-) diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 256f743b..59b9ebd2 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -21,7 +21,8 @@ subroutine run_ccsd_space_orb type(gpu_double3) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol type(gpu_double4) :: d_cc_space_v_oovv, d_cc_space_v_voov, d_cc_space_v_ovov - type(gpu_double4) :: d_cc_space_v_oovo + type(gpu_double4) :: d_cc_space_v_oovo, d_cc_space_v_vooo, d_cc_space_v_oooo + type(gpu_double4) :: d_cc_space_v_vvoo double precision, allocatable :: all_err(:,:), all_t(:,:) integer, allocatable :: list_occ(:), list_vir(:) @@ -93,17 +94,29 @@ subroutine run_ccsd_space_orb ! FREE cc_space_v_vv_chol endif + call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV) call gpu_allocate(d_cc_space_v_voov, nV, nO, nO, nV) call gpu_allocate(d_cc_space_v_ovov, nO, nV, nO, nV) call gpu_allocate(d_cc_space_v_oovo, nO, nO, nV, nO) + call gpu_allocate(d_cc_space_v_vooo, nV, nO, nO, nO) + call gpu_allocate(d_cc_space_v_oooo, nO, nO, nO, nO) + call gpu_allocate(d_cc_space_v_vvoo, nV, nV, nO, nO) + call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv) call gpu_upload(cc_space_v_voov, d_cc_space_v_voov) call gpu_upload(cc_space_v_ovov, d_cc_space_v_ovov) call gpu_upload(cc_space_v_oovo, d_cc_space_v_oovo) + call gpu_upload(cc_space_v_vooo, d_cc_space_v_vooo) + call gpu_upload(cc_space_v_oooo, d_cc_space_v_oooo) + call gpu_upload(cc_space_v_vvoo, d_cc_space_v_vvoo) ! FREE cc_space_v_voov ! FREE cc_space_v_ovov ! FREE cc_space_v_oovo +! FREE cc_space_v_oovv +! FREE cc_space_v_vooo +! FREE cc_space_v_oooo +! FREE cc_space_v_vvoo call gpu_allocate(t2, nO,nO,nV,nV) call gpu_allocate(r2, nO,nO,nV,nV) @@ -165,15 +178,8 @@ subroutine run_ccsd_space_orb call gpu_upload(h_t2, t2) - call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV) - call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv) - -! FREE cc_space_v_oovv - - call update_tau_space(nO,nV,h_t1,t1,t2,tau) call update_tau_x_space(nO,nV,tau,tau_x) - !print*,'hf_energy', hf_energy call det_energy(det,uncorr_energy) print*,'Det energy', uncorr_energy @@ -200,7 +206,10 @@ subroutine run_ccsd_space_orb call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_space_f_ov,d_cc_space_f_vo, & d_cc_space_v_voov, d_cc_space_v_ovov, d_cc_space_v_oovo, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol) - call compute_r2_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r2%f,max_r2) + call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & + d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, & + d_cc_space_v_vvoo, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, & + r2, max_r2) else call compute_H_oo(nO,nV,t1%f,t2%f,tau%f,H_oo%f) call compute_H_vv(nO,nV,t1%f,t2%f,tau%f,H_vv%f) diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index c34b390b..0474dcec 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -391,168 +391,162 @@ end ! 
R2 -subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) - +subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & + d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, & + d_cc_space_v_vvoo, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, & + r2,max_r2) + use gpu implicit none ! in - integer, intent(in) :: nO, nV - double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV), tau(nO,nO,nV,nV) - double precision, intent(in) :: H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO) + integer, intent(in) :: nO, nV + type(gpu_double2), intent(in) :: t1, H_oo, H_vv + type(gpu_double4), intent(in) :: t2, tau, d_cc_space_v_oovv + type(gpu_double4), intent(in) :: d_cc_space_v_vooo, d_cc_space_v_oooo + type(gpu_double4), intent(in) :: d_cc_space_v_vvoo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol + type(gpu_double3), intent(in) :: d_cc_space_v_vo_chol ! out - double precision, intent(out) :: r2(nO,nO,nV,nV), max_r2 + double precision, intent(out) :: max_r2 + type(gpu_double4), intent(out) :: r2 ! internal integer :: u,v,i,j,beta,gam,a,b double precision :: max_r2_local + type(gpu_stream) :: stream(nV) + call set_multiple_levels_omp(.False.) - !$omp parallel & - !$omp shared(nO,nV,r2,cc_space_v_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = cc_space_v_oovv(u,v,beta,gam) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel + call gpu_copy(d_cc_space_v_oovv, r2) - double precision, allocatable :: A1(:,:,:,:) - allocate(A1(nO,nO,nO,nO)) - call compute_A1_chol(nO,nV,t1,t2,tau,A1) - call dgemm('N','N',nO*nO,nV*nV,nO*nO, & - 1d0, A1, size(A1,1) * size(A1,2), & - tau, size(tau,1) * size(tau,2), & - 1d0, r2, size(r2,1) * size(r2,2)) + type(gpu_double4) :: A1 + call gpu_allocate(A1,nO,nO,nO,nO) + call compute_A1_chol(nO,nV,t1,t2,tau,d_cc_space_v_vooo, & + d_cc_space_v_oooo, d_cc_space_v_vvoo, A1) + + call gpu_dgemm(blas_handle, 'N','N',nO*nO,nV*nV,nO*nO, & + 1d0, A1%f(1,1,1,1), size(A1%f,1) * size(A1%f,2), & + tau%f(1,1,1,1), size(tau%f,1) * size(tau%f,2), & + 1d0, r2%f(1,1,1,1), size(r2%f,1) * size(r2%f,2)) + + call gpu_deallocate(A1) - deallocate(A1) integer :: block_size, iblock, k block_size = 16 - double precision, dimension(:,:,:), allocatable :: B1, tmp_cc, tmpB1 - double precision, dimension(:,:), allocatable :: tmp_cc2 + type(gpu_double3) :: tmp_cc, B1, tmpB1 + type(gpu_double2) :: tmp_cc2 - allocate(tmp_cc(cholesky_mo_num,nV,nV)) - call dgemm('N','N', cholesky_mo_num*nV, nV, nO, 1.d0, & - cc_space_v_vo_chol, cholesky_mo_num*nV, t1, nO, 0.d0, tmp_cc, cholesky_mo_num*nV) + call gpu_allocate(tmp_cc,cholesky_mo_num,nV,nV) + call gpu_dgemm(blas_handle, 'N','N', cholesky_mo_num*nV, nV, nO, 1.d0, & + d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num*nV, t1%f(1,1), nO, 0.d0, tmp_cc%f(1,1,1), cholesky_mo_num*nV) call set_multiple_levels_omp(.False.) 
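Throughout this rewrite, host-side copy/transpose/accumulate loops are traded for gpu_dgeam, which evaluates C := alpha*op(A) + beta*op(B) on the device. The two recurring idioms, shown schematically (X, Y, Z, R, m, n are placeholders, not code from this patch):

      ! Out-of-place transpose: Y(n,m) := transpose(X(m,n))   (beta = 0)
      call gpu_dgeam(blas_handle, 'T', 'N', n, m, 1.d0, X, m, 0.d0, Y, n, Y, n)

      ! In-place accumulation:  R := R + Z   (alpha = beta = 1, C aliased to B)
      call gpu_dgeam(blas_handle, 'N', 'N', m, n, 1.d0, Z, m, 1.d0, R, m, R, m)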
+ call gpu_synchronize() + + type(gpu_blas) :: blas + + !$OMP PARALLEL PRIVATE(gam, iblock, B1, tmpB1, tmp_cc2, beta, b, a, blas) + call gpu_allocate(B1,nV,nV,block_size) + call gpu_allocate(tmpB1,nV,block_size,nV) + call gpu_allocate(tmp_cc2,cholesky_mo_num,nV) + + call gpu_blas_create(blas) - !$OMP PARALLEL PRIVATE(gam, iblock, B1, tmpB1, tmp_cc2, beta, b, a) - allocate(B1(nV,nV,block_size), tmpB1(nV,block_size,nV), tmp_cc2(cholesky_mo_num,nV)) !$OMP DO do gam = 1, nV - do a=1,nV - do k=1,cholesky_mo_num - tmp_cc2(k,a) = cc_space_v_vv_chol(k,a,gam) - tmp_cc(k,a,gam) - enddo - enddo + call gpu_dgeam(blas, 'N', 'N', cholesky_mo_num, nV, 1.d0, d_cc_space_v_vv_chol%f(1,1,gam), & + cholesky_mo_num, -1.d0, tmp_cc%f(1,1,gam), cholesky_mo_num, tmp_cc2%f(1,1), cholesky_mo_num) do iblock = 1, nV, block_size - call dgemm('T', 'N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, & - -1.d0, tmp_cc(1,1,iblock), cholesky_mo_num, & - cc_space_v_vv_chol(1,1,gam), cholesky_mo_num, & - 0.d0, tmpB1, nV*block_size) + call gpu_dgemm(blas, 'T', 'N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, & + -1.d0, tmp_cc%f(1,1,iblock), cholesky_mo_num, & + d_cc_space_v_vv_chol%f(1,1,gam), cholesky_mo_num, & + 0.d0, tmpB1%f(1,1,1), nV*block_size) - call dgemm('T','N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, & - 1.d0, cc_space_v_vv_chol(1,1,iblock), cholesky_mo_num, & - tmp_cc2, cholesky_mo_num, & - 1.d0, tmpB1, nV*block_size) + call gpu_dgemm(blas, 'T','N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, & + 1.d0, d_cc_space_v_vv_chol%f(1,1,iblock), cholesky_mo_num, & + tmp_cc2%f(1,1), cholesky_mo_num, & + 1.d0, tmpB1%f(1,1,1), nV*block_size) do beta = iblock, min(nV, iblock+block_size-1) - do b = 1, nV - do a = 1, nV - B1(a,b,beta-iblock+1) = tmpB1(a,beta-iblock+1,b) - enddo - enddo + call gpu_dgeam(blas, 'N', 'N', nV, nV, 1.d0, tmpB1%f(1,beta-iblock+1,1), & + nV*block_size, 0.d0, B1%f(1,1,beta-iblock+1), nV, B1%f(1,1,beta-iblock+1), nV) enddo - call dgemm('N','N',nO*nO,min(block_size, nV-iblock+1),nV*nV, & - 1d0, tau, size(tau,1) * size(tau,2), & - B1 , size(B1 ,1) * size(B1 ,2), & - 1d0, r2(1,1,iblock,gam), size(r2 ,1) * size(r2 ,2)) + call gpu_dgemm(blas, 'N','N',nO*nO,min(block_size, nV-iblock+1),nV*nV, & + 1d0, tau%f(1,1,1,1), size(tau%f,1) * size(tau%f,2), & + B1%f(1,1,1) , size(B1%f ,1) * size(B1%f ,2), & + 1d0, r2%f(1,1,iblock,gam), size(r2%f ,1) * size(r2%f ,2)) enddo enddo !$OMP ENDDO - deallocate(B1, tmpB1, tmp_cc2) + call gpu_blas_destroy(blas) + + call gpu_deallocate(B1) + call gpu_deallocate(tmpB1) + call gpu_deallocate(tmp_cc2) !$OMP END PARALLEL - deallocate(tmp_cc) + call gpu_deallocate(tmp_cc) + type(gpu_double4) :: X_oovv + call gpu_allocate(X_oovv,nO,nO,nV,nV) + call gpu_copy(t2,X_oovv) - double precision, allocatable :: X_oovv(:,:,:,:) - allocate(X_oovv(nO,nO,nV,nV)) - !$omp parallel & - !$omp shared(nO,nV,t2,X_oovv) & - !$omp private(u,v,gam,a) & - !$omp default(none) - !$omp do - do a = 1, nV - do gam = 1, nV - do v = 1, nO - do u = 1, nO - X_oovv(u,v,gam,a) = t2(u,v,gam,a) - enddo - enddo - enddo + type(gpu_double2) :: g_vir + call gpu_allocate(g_vir,nV,nV) + call compute_g_vir_chol(nO,nV,t1%f,t2%f,H_vv%f,g_vir%f) + + type(gpu_double4) :: Y_oovv + call gpu_allocate(Y_oovv,nO,nO,nV,nV) + + call gpu_dgemm(blas_handle, 'N','N',nO*nO*nV,nV,nV, & + 1d0, X_oovv%f(1,1,1,1), size(X_oovv%f,1) * size(X_oovv%f,2) * size(X_oovv%f,3), & + g_vir%f(1,1), size(g_vir%f,1), & + 0d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1) * size(Y_oovv%f,2) * size(Y_oovv%f,3)) + + call 
gpu_synchronize() + + do a=1,nV + call gpu_stream_create(stream(a)) enddo - !$omp end do - !$omp end parallel - double precision, allocatable :: g_vir(:,:) - allocate(g_vir(nV,nV)) - call compute_g_vir_chol(nO,nV,t1,t2,H_vv,g_vir) - - double precision, allocatable :: Y_oovv(:,:,:,:) - allocate(Y_oovv(nO,nO,nV,nV)) - - call dgemm('N','N',nO*nO*nV,nV,nV, & - 1d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3), & - g_vir, size(g_vir,1), & - 0d0, Y_oovv, size(Y_oovv,1) * size(Y_oovv,2) * size(Y_oovv,3)) - deallocate(g_vir) - deallocate(X_oovv) - - !$omp parallel & - !$omp shared(nO,nV,r2,Y_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do do gam = 1, nV do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) + Y_oovv(u,v,beta,gam) + Y_oovv(v,u,gam,beta) - enddo - enddo + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, 1.d0, Y_oovv%f(1,1,beta,gam), & + nO, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO) + call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, 1.d0, r2%f(1,1,beta,gam), & + nO, 1.d0, Y_oovv%f(1,1,gam,beta), nO, r2%f(1,1,beta,gam), nO) enddo enddo - !$omp end do - !$omp end parallel - deallocate(Y_oovv) - double precision, allocatable :: g_occ(:,:) - allocate(g_occ(nO,nO)) - call compute_g_occ_chol(nO,nV,t1,t2,H_oo,g_occ) + call gpu_deallocate(g_vir) + call gpu_set_stream(blas_handle, gpu_default_stream) - allocate(X_oovv(nO,nO,nV,nV)) - call dgemm('N','N',nO,nO*nV*nV,nO, & - 1d0, g_occ , size(g_occ,1), & - t2 , size(t2,1), & - 0d0, X_oovv, size(X_oovv,1)) - deallocate(g_occ) + do a=1,nV + call gpu_stream_destroy(stream(a)) + enddo + + call gpu_deallocate(Y_oovv) + + type(gpu_double2) :: g_occ + call gpu_allocate(g_occ,nO,nO) + + call compute_g_occ_chol(nO,nV,t1%f,t2%f,H_oo%f,g_occ%f) + + call gpu_dgemm(blas_handle, 'N','N',nO,nO*nV*nV,nO, & + 1d0, g_occ%f(1,1), size(g_occ%f,1), & + t2%f(1,1,1,1) , size(t2%f,1), & + 0d0, X_oovv%f(1,1,1,1), size(X_oovv%f,1)) + + call gpu_synchronize() !$omp parallel & !$omp shared(nO,nV,r2,X_oovv) & @@ -563,7 +557,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do beta = 1, nV do v = 1, nO do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(u,v,beta,gam) - X_oovv(v,u,gam,beta) + r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - X_oovv%f(u,v,beta,gam) - X_oovv%f(v,u,gam,beta) enddo enddo enddo @@ -571,27 +565,39 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) !$omp end do !$omp end parallel - deallocate(X_oovv) + call gpu_deallocate(g_occ) + call gpu_deallocate(X_oovv) - double precision, allocatable :: X_vovv(:,:,:,:) + type(gpu_double4) :: X_vovv + + call gpu_allocate(X_vovv,nV,nO,nV,block_size) + call gpu_allocate(Y_oovv,nO,nO,nV,nV) - allocate(X_vovv(nV,nO,nV,block_size)) - allocate(Y_oovv(nO,nO,nV,nV)) do iblock = 1, nV, block_size do gam = iblock, min(nV, iblock+block_size-1) - call dgemm('T','N',nV, nO*nV, cholesky_mo_num, 1.d0, & - cc_space_v_vv_chol(1,1,gam), cholesky_mo_num, cc_space_v_ov_chol, & - cholesky_mo_num, 0.d0, X_vovv(1,1,1,gam-iblock+1), nV) + call gpu_stream_create(stream(gam)) + call gpu_set_stream(blas_handle, stream(gam)) + call gpu_dgemm(blas_handle, 'T','N',nV, nO*nV, cholesky_mo_num, 1.d0, & + d_cc_space_v_vv_chol%f(1,1,gam), cholesky_mo_num, d_cc_space_v_ov_chol%f(1,1,1), & + cholesky_mo_num, 0.d0, X_vovv%f(1,1,1,gam-iblock+1), nV) enddo + do gam = iblock, min(nV, iblock+block_size-1) + call gpu_stream_destroy(stream(gam)) + enddo + + call gpu_synchronize() 
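      ! Note on the pattern at this point: each gam in the block got its own
      ! stream so the independent gpu_dgemm launches can overlap, and the
      ! gpu_synchronize() above is the join point before X_vovv is consumed
      ! by the dgemm that follows. Schematically (s, b, nbatch, C are
      ! placeholders, not code from this patch):
      !   do b = 1, nbatch
      !     call gpu_stream_create(s(b))
      !     call gpu_set_stream(blas_handle, s(b))
      !     call gpu_dgemm(blas_handle, ..., C(1,1,b), ...)  ! one launch per stream
      !   enddo
      !   call gpu_synchronize()                             ! join before use
      !   call gpu_set_stream(blas_handle, gpu_default_stream)
      !   do b = 1, nbatch
      !     call gpu_stream_destroy(s(b))
      !   enddo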
+ + call gpu_set_stream(blas_handle, gpu_default_stream) call dgemm('N','N',nO,nO*nV*min(block_size, nV-iblock+1),nV, & - 1d0, t1 , size(t1,1), & - X_vovv, size(X_vovv,1), & - 0d0, Y_oovv(1,1,1,iblock), size(Y_oovv,1)) + 1d0, t1%f , size(t1%f,1), & + X_vovv%f, size(X_vovv%f,1), & + 0d0, Y_oovv%f(1,1,1,iblock), size(Y_oovv%f,1)) enddo - deallocate(X_vovv) + call gpu_synchronize() + call gpu_deallocate(X_vovv) !$omp parallel & !$omp shared(nO,nV,r2,Y_oovv) & @@ -602,14 +608,14 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do beta = 1, nV do v = 1, nO do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) + Y_oovv(v,u,beta,gam) + Y_oovv(u,v,gam,beta) + r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) + Y_oovv%f(v,u,beta,gam) + Y_oovv%f(u,v,gam,beta) enddo enddo enddo enddo !$omp end do !$omp end parallel - deallocate(Y_oovv) + call gpu_deallocate(Y_oovv) double precision, allocatable :: X_ovvo(:,:,:,:) double precision, allocatable :: tcc(:,:,:), tcc2(:,:,:) @@ -617,11 +623,11 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) allocate(tcc(cholesky_mo_num,nO,nV)) call dgemm('N','T', cholesky_mo_num*nV, nO, nV, 1.d0, & - cc_space_v_vv_chol, cholesky_mo_num*nV, t1, nO, & + d_cc_space_v_vv_chol%f, cholesky_mo_num*nV, t1%f, nO, & 0.d0, tcc2, cholesky_mo_num*nV) call dgemm('N','N', cholesky_mo_num*nO, nV, nO, 1.d0, & - cc_space_v_oo_chol, cholesky_mo_num*nO, t1, nO, & + cc_space_v_oo_chol, cholesky_mo_num*nO, t1%f, nO, & 0.d0, tcc, cholesky_mo_num*nO) call dgemm('T','N', nO*nV, nV*nO, cholesky_mo_num, 1.d0, & @@ -639,7 +645,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do beta = 1, nV do v = 1, nO do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_ovvo(u,beta,gam,v) + r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - X_ovvo(u,beta,gam,v) enddo enddo enddo @@ -650,7 +656,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do gam = 1, nV do v = 1, nO do u = 1, nO - r2(v,u,gam,beta) = r2(v,u,gam,beta) - X_ovvo(u,beta,gam,v) + r2%f(v,u,gam,beta) = r2%f(v,u,gam,beta) - X_ovvo(u,beta,gam,v) enddo enddo enddo @@ -661,12 +667,12 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) deallocate(X_ovvo) !----- - allocate(X_oovv(nO,nO,nV,nV)) + call gpu_allocate(X_oovv,nO,nO,nV,nV) call dgemm('N','N',nO*nO*nV,nV,nO, & 1d0, cc_space_v_oovo, size(cc_space_v_oovo,1) * size(cc_space_v_oovo,2) * size(cc_space_v_oovo,3), & - t1 , size(t1,1), & - 0d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3)) + t1%f , size(t1%f,1), & + 0d0, X_oovv%f, size(X_oovv%f,1) * size(X_oovv%f,2) * size(X_oovv%f,3)) !$omp parallel & !$omp shared(nO,nV,r2,X_oovv) & @@ -677,14 +683,14 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do beta = 1, nV do v = 1, nO do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(u,v,beta,gam) - X_oovv(v,u,gam,beta) + r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - X_oovv%f(u,v,beta,gam) - X_oovv%f(v,u,gam,beta) enddo enddo enddo enddo !$omp end do !$omp end parallel - deallocate(X_oovv) + call gpu_deallocate(X_oovv) double precision, allocatable :: X_vovo(:,:,:,:), Y_oovo(:,:,:,:) allocate(X_vovo(nV,nO,nV,nO)) @@ -708,16 +714,16 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) allocate(Y_oovo(nO,nO,nV,nO)) call dgemm('N','N',nO,nO*nV*nO,nV, & - 1d0, t1, size(t1,1), & + 1d0, t1%f, size(t1%f,1), & X_vovo, size(X_vovo,1), & 0d0, Y_oovo, size(Y_oovo,1)) deallocate(X_vovo) - allocate(X_oovv(nO,nO,nV,nV)) + call 
gpu_allocate(X_oovv,nO,nO,nV,nV) call dgemm('N','N',nO*nO*nV, nV, nO, & 1d0, Y_oovo, size(Y_oovo,1) * size(Y_oovo,2) * size(Y_oovo,3), & - t1 , size(t1,1), & - 0d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3)) + t1%f , size(t1%f,1), & + 0d0, X_oovv%f, size(X_oovv%f,1) * size(X_oovv%f,2) * size(X_oovv%f,3)) deallocate(Y_oovo) !$omp parallel & @@ -729,24 +735,24 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do beta = 1, nV do v = 1, nO do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(u,v,gam,beta) - X_oovv(v,u,beta,gam) + r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - X_oovv%f(u,v,gam,beta) - X_oovv%f(v,u,beta,gam) enddo enddo enddo enddo !$omp end do !$omp end parallel - deallocate(X_oovv) + call gpu_deallocate(X_oovv) double precision, allocatable :: J1(:,:,:,:) allocate(J1(nO,nV,nV,nO)) - call compute_J1_chol(nO,nV,t1,t2,cc_space_v_ovvo,cc_space_v_ovoo, & + call compute_J1_chol(nO,nV,t1%f,t2%f,cc_space_v_ovvo,cc_space_v_ovoo, & cc_space_v_vvoo,J1) double precision, allocatable :: K1(:,:,:,:) allocate(K1(nO,nV,nO,nV)) - call compute_K1_chol(nO,nV,t1,t2,cc_space_v_ovoo,cc_space_v_vvoo, & + call compute_K1_chol(nO,nV,t1%f,t2%f,cc_space_v_ovoo,cc_space_v_vvoo, & cc_space_v_ovov,K1) allocate(X_ovvo(nO,nV,nV,nO)) @@ -778,7 +784,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do v = 1, nO do i = 1, nO do a = 1, nV - Y_voov(a,i,v,gam) = 2d0 * t2(i,v,a,gam) - t2(i,v,gam,a) + Y_voov(a,i,v,gam) = 2d0 * t2%f(i,v,a,gam) - t2%f(i,v,gam,a) enddo enddo enddo @@ -805,7 +811,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do beta = 1, nV do v = 1, nO do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) + Z_ovov(u,beta,v,gam) + Z_ovov(v,gam,u,beta) + r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) + Z_ovov(u,beta,v,gam) + Z_ovov(v,gam,u,beta) enddo enddo enddo @@ -820,7 +826,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) allocate(Y_ovov(nO,nV,nO,nV)) !$omp parallel & - !$omp shared(nO,nV,r2,K1,X_ovov,Y_ovov,t2) & + !$omp shared(nO,nV,K1,X_ovov,Y_ovov,t2) & !$omp private(u,a,i,beta,gam) & !$omp default(none) !$omp do @@ -840,7 +846,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do v = 1, nO do a = 1, nV do i = 1, nO - Y_ovov(i,a,v,gam) = t2(i,v,gam,a) + Y_ovov(i,a,v,gam) = t2%f(i,v,gam,a) enddo enddo enddo @@ -864,7 +870,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do beta = 1, nV do v = 1, nO do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - Z_ovov(u,beta,v,gam) - Z_ovov(v,gam,u,beta) + r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - Z_ovov(u,beta,v,gam) - Z_ovov(v,gam,u,beta) enddo enddo enddo @@ -895,7 +901,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do v = 1, nO do a = 1, nV do i = 1, nO - Y_ovov(i,a,v,beta) = t2(i,v,beta,a) + Y_ovov(i,a,v,beta) = t2%f(i,v,beta,a) enddo enddo enddo @@ -922,7 +928,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do beta = 1, nV do v = 1, nO do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - Z_ovov(u,gam,v,beta) - Z_ovov(v,beta,u,gam) + r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - Z_ovov(u,gam,v,beta) - Z_ovov(v,beta,u,gam) enddo enddo enddo @@ -945,8 +951,8 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do a = 1, nV do j = 1, nO do i = 1, nO - r2(i,j,a,b) = -r2(i,j,a,b) - max_r2_local = max(r2(i,j,a,b), max_r2_local) + r2%f(i,j,a,b) = -r2%f(i,j,a,b) + max_r2_local = 
max(r2%f(i,j,a,b), max_r2_local) enddo enddo enddo @@ -961,28 +967,29 @@ end ! A1 -subroutine compute_A1_chol(nO,nV,t1,t2,tau,A1) - +subroutine compute_A1_chol(nO,nV,t1,t2,tau,d_cc_space_v_vooo, & + d_cc_space_v_oooo, d_cc_space_v_vvoo, A1) + use gpu implicit none - integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(in) :: tau(nO, nO, nV, nV) - double precision, intent(out) :: A1(nO, nO, nO, nO) + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: t1 + type(gpu_double4), intent(in) :: t2, tau + type(gpu_double4), intent(in) :: d_cc_space_v_vooo, d_cc_space_v_oooo, d_cc_space_v_vvoo + type(gpu_double4), intent(out) :: A1 integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta - double precision, allocatable :: Y_oooo(:,:,:,:) - allocate(Y_oooo(nO,nO,nO,nO)) + type(gpu_double4) :: Y_oooo + call gpu_allocate(Y_oooo,nO,nO,nO,nO) ! A1(u,v,i,j) = cc_space_v_oooo(u,v,i,j) ! A1(u,v,i,j) += cc_space_v_ovoo(u,a,i,j) * t1(v,a) & call dgemm('N','N', nO, nO*nO*nO, nV, & - 1d0, t1 , size(t1,1), & - cc_space_v_vooo, size(cc_space_v_vooo,1), & - 0d0, Y_oooo, size(Y_oooo,1)) + 1d0, t1%f , size(t1%f,1), & + d_cc_space_v_vooo%f, size(d_cc_space_v_vooo%f,1), & + 0d0, Y_oooo%f, size(Y_oooo%f,1)) !$omp parallel & !$omp private(u,v,i,j) & @@ -992,7 +999,7 @@ subroutine compute_A1_chol(nO,nV,t1,t2,tau,A1) do i = 1, nO do v = 1, nO do u = 1, nO - A1(u,v,i,j) = cc_space_v_oooo(u,v,i,j) + Y_oooo(v,u,j,i) + Y_oooo(u,v,i,j) + A1%f(u,v,i,j) = d_cc_space_v_oooo%f(u,v,i,j) + Y_oooo%f(v,u,j,i) + Y_oooo%f(u,v,i,j) enddo enddo enddo @@ -1000,19 +1007,20 @@ subroutine compute_A1_chol(nO,nV,t1,t2,tau,A1) !$omp end do !$omp end parallel - deallocate(Y_oooo) + call gpu_deallocate(Y_oooo) ! A1(u,v,i,j) += cc_space_v_vvoo(a,b,i,j) * tau(u,v,a,b) call dgemm('N','N', nO*nO, nO*nO, nV*nV, & - 1d0, tau , size(tau,1) * size(tau,2), & - cc_space_v_vvoo, size(cc_space_v_vvoo,1) * size(cc_space_v_vvoo,2), & - 1d0, A1 , size(A1,1) * size(A1,2)) + 1d0, tau%f , size(tau%f,1) * size(tau%f,2), & + d_cc_space_v_vvoo%f, size(d_cc_space_v_vvoo%f,1) * size(d_cc_space_v_vvoo%f,2), & + 1d0, A1%f , size(A1%f,1) * size(A1%f,2)) end ! g_occ subroutine compute_g_occ_chol(nO,nV,t1,t2,H_oo,g_occ) + use gpu implicit none @@ -1048,6 +1056,7 @@ end ! g_vir subroutine compute_g_vir_chol(nO,nV,t1,t2,H_vv,g_vir) + use gpu implicit none @@ -1102,6 +1111,7 @@ end ! J1 subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvoo,J1) + use gpu implicit none integer, intent(in) :: nO,nV @@ -1305,6 +1315,7 @@ end ! 
K1 subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,K1) + use gpu implicit none From 5b1e5f84e6defc8857b5d29c54449e3b8d35cb67 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Thu, 4 Jul 2024 14:52:09 +0200 Subject: [PATCH 32/38] Working on r2 --- src/ccsd/ccsd_space_orb_sub.irp.f | 17 +- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 469 +++++++++++-------------- 2 files changed, 220 insertions(+), 266 deletions(-) diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 59b9ebd2..e97c2325 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -22,7 +22,7 @@ subroutine run_ccsd_space_orb type(gpu_double4) :: d_cc_space_v_oovv, d_cc_space_v_voov, d_cc_space_v_ovov type(gpu_double4) :: d_cc_space_v_oovo, d_cc_space_v_vooo, d_cc_space_v_oooo - type(gpu_double4) :: d_cc_space_v_vvoo + type(gpu_double4) :: d_cc_space_v_vvoo, d_cc_space_v_ovvo, d_cc_space_v_ovoo double precision, allocatable :: all_err(:,:), all_t(:,:) integer, allocatable :: list_occ(:), list_vir(:) @@ -98,17 +98,21 @@ subroutine run_ccsd_space_orb call gpu_allocate(d_cc_space_v_voov, nV, nO, nO, nV) call gpu_allocate(d_cc_space_v_ovov, nO, nV, nO, nV) call gpu_allocate(d_cc_space_v_oovo, nO, nO, nV, nO) + call gpu_allocate(d_cc_space_v_ovvo, nO, nV, nV, nO) call gpu_allocate(d_cc_space_v_vooo, nV, nO, nO, nO) call gpu_allocate(d_cc_space_v_oooo, nO, nO, nO, nO) call gpu_allocate(d_cc_space_v_vvoo, nV, nV, nO, nO) + call gpu_allocate(d_cc_space_v_ovoo, nO, nV, nO, nO) call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv) call gpu_upload(cc_space_v_voov, d_cc_space_v_voov) call gpu_upload(cc_space_v_ovov, d_cc_space_v_ovov) call gpu_upload(cc_space_v_oovo, d_cc_space_v_oovo) + call gpu_upload(cc_space_v_ovvo, d_cc_space_v_ovvo) call gpu_upload(cc_space_v_vooo, d_cc_space_v_vooo) call gpu_upload(cc_space_v_oooo, d_cc_space_v_oooo) call gpu_upload(cc_space_v_vvoo, d_cc_space_v_vvoo) + call gpu_upload(cc_space_v_ovoo, d_cc_space_v_ovoo) ! FREE cc_space_v_voov ! FREE cc_space_v_ovov @@ -117,6 +121,8 @@ subroutine run_ccsd_space_orb ! FREE cc_space_v_vooo ! FREE cc_space_v_oooo ! FREE cc_space_v_vvoo +! FREE cc_space_v_ovvo +! 
FREE cc_space_v_ovoo call gpu_allocate(t2, nO,nO,nV,nV) call gpu_allocate(r2, nO,nO,nV,nV) @@ -207,8 +213,8 @@ subroutine run_ccsd_space_orb call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_space_f_ov,d_cc_space_f_vo, & d_cc_space_v_voov, d_cc_space_v_ovov, d_cc_space_v_oovo, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol) call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & - d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, & - d_cc_space_v_vvoo, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, & + d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, d_cc_space_v_oovo, d_cc_space_v_ovvo, d_cc_space_v_ovoo, & + d_cc_space_v_ovov, d_cc_space_v_vvoo, d_cc_space_v_oo_chol, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, & r2, max_r2) else call compute_H_oo(nO,nV,t1%f,t2%f,tau%f,H_oo%f) @@ -320,6 +326,11 @@ subroutine run_ccsd_space_orb call gpu_deallocate(d_cc_space_v_voov) call gpu_deallocate(d_cc_space_v_ovov) call gpu_deallocate(d_cc_space_v_oovo) + call gpu_deallocate(d_cc_space_v_ovvo) + call gpu_deallocate(d_cc_space_v_vooo) + call gpu_deallocate(d_cc_space_v_oooo) + call gpu_deallocate(d_cc_space_v_vvoo) + call gpu_deallocate(d_cc_space_v_ovoo) call gpu_deallocate(d_cc_space_f_oo) call gpu_deallocate(d_cc_space_f_vo) diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index 0474dcec..abb9909b 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -392,8 +392,8 @@ end ! R2 subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & - d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, & - d_cc_space_v_vvoo, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, & + d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, d_cc_space_v_oovo, d_cc_space_v_ovvo, d_cc_space_v_ovoo, & + d_cc_space_v_ovov, d_cc_space_v_vvoo, d_cc_space_v_oo_chol, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, & r2,max_r2) use gpu implicit none @@ -403,9 +403,11 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & type(gpu_double2), intent(in) :: t1, H_oo, H_vv type(gpu_double4), intent(in) :: t2, tau, d_cc_space_v_oovv type(gpu_double4), intent(in) :: d_cc_space_v_vooo, d_cc_space_v_oooo - type(gpu_double4), intent(in) :: d_cc_space_v_vvoo - type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol - type(gpu_double3), intent(in) :: d_cc_space_v_vo_chol + type(gpu_double4), intent(in) :: d_cc_space_v_vvoo, d_cc_space_v_oovo + type(gpu_double4), intent(in) :: d_cc_space_v_ovvo, d_cc_space_v_ovoo + type(gpu_double4), intent(in) :: d_cc_space_v_ovov + type(gpu_double3), intent(in) :: d_cc_space_v_oo_chol, d_cc_space_v_ov_chol + type(gpu_double3), intent(in) :: d_cc_space_v_vo_chol, d_cc_space_v_vv_chol ! 
out double precision, intent(out) :: max_r2 @@ -499,9 +501,11 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & call gpu_allocate(X_oovv,nO,nO,nV,nV) call gpu_copy(t2,X_oovv) - type(gpu_double2) :: g_vir + type(gpu_double2) :: g_occ, g_vir call gpu_allocate(g_vir,nV,nV) + call gpu_allocate(g_occ,nO,nO) call compute_g_vir_chol(nO,nV,t1%f,t2%f,H_vv%f,g_vir%f) + call compute_g_occ_chol(nO,nV,t1%f,t2%f,H_oo%f,g_occ%f) type(gpu_double4) :: Y_oovv call gpu_allocate(Y_oovv,nO,nO,nV,nV) @@ -511,7 +515,41 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & g_vir%f(1,1), size(g_vir%f,1), & 0d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1) * size(Y_oovv%f,2) * size(Y_oovv%f,3)) + call gpu_dgemm(blas_handle, 'N','N',nO,nO*nV*nV,nO, & + -1d0, g_occ%f(1,1), size(g_occ%f,1), & + t2%f(1,1,1,1) , size(t2%f,1), & + 1d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1)) + + call gpu_dgemm(blas_handle, 'N','N',nO*nO*nV,nV,nO, & + -1d0, d_cc_space_v_oovo%f(1,1,1,1), size(cc_space_v_oovo,1) * size(cc_space_v_oovo,2) * size(cc_space_v_oovo,3), & + t1%f(1,1) , size(t1%f,1), & + 1d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1) * size(Y_oovv%f,2) * size(Y_oovv%f,3)) + call gpu_synchronize() + call gpu_deallocate(X_oovv) + + call gpu_deallocate(g_vir) + call gpu_deallocate(g_occ) + + type(gpu_double4) :: X_vovo, Y_oovo + call gpu_allocate(X_vovo,nV,nO,nV,nO) + +! !$omp parallel & +! !$omp shared(nO,nV,r2,Y_oovv) & +! !$omp private(u,v,gam,beta) & +! !$omp default(none) +! !$omp do +! do gam = 1, nV +! do beta = 1, nV +! do v = 1, nO +! do u = 1, nO +! r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) + Y_oovv%f(u,v,beta,gam) + Y_oovv%f(v,u,gam,beta) +! enddo +! enddo +! enddo +! enddo +! !$omp end do +! !$omp end parallel do a=1,nV call gpu_stream_create(stream(a)) @@ -527,52 +565,24 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & enddo enddo - call gpu_deallocate(g_vir) - call gpu_set_stream(blas_handle, gpu_default_stream) + do i = 1, nO + do gam = 1, nV + call gpu_set_stream(blas_handle, stream(gam)) + call gpu_dgeam(blas_handle, 'T', 'N', nV, nO, 1.d0, d_cc_space_v_ovvo%f(1,1,gam,i), & + nO, 0.d0, X_vovo%f(1,1,gam,i), nV, X_vovo%f(1,1,gam,i), nV) + enddo + enddo do a=1,nV call gpu_stream_destroy(stream(a)) enddo + call gpu_set_stream(blas_handle, gpu_default_stream) - call gpu_deallocate(Y_oovv) - - type(gpu_double2) :: g_occ - call gpu_allocate(g_occ,nO,nO) - - call compute_g_occ_chol(nO,nV,t1%f,t2%f,H_oo%f,g_occ%f) - - call gpu_dgemm(blas_handle, 'N','N',nO,nO*nV*nV,nO, & - 1d0, g_occ%f(1,1), size(g_occ%f,1), & - t2%f(1,1,1,1) , size(t2%f,1), & - 0d0, X_oovv%f(1,1,1,1), size(X_oovv%f,1)) - - call gpu_synchronize() - - !$omp parallel & - !$omp shared(nO,nV,r2,X_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - X_oovv%f(u,v,beta,gam) - X_oovv%f(v,u,gam,beta) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - call gpu_deallocate(g_occ) - call gpu_deallocate(X_oovv) type(gpu_double4) :: X_vovv call gpu_allocate(X_vovv,nV,nO,nV,block_size) - call gpu_allocate(Y_oovv,nO,nO,nV,nV) - + call gpu_allocate(Y_oovo,nO,nO,nV,nO) do iblock = 1, nV, block_size do gam = iblock, min(nV, iblock+block_size-1) @@ -590,241 +600,176 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & call gpu_synchronize() call gpu_set_stream(blas_handle, gpu_default_stream) - call dgemm('N','N',nO,nO*nV*min(block_size, nV-iblock+1),nV, & - 1d0, t1%f , size(t1%f,1), & - X_vovv%f, 
size(X_vovv%f,1), & + call gpu_dgemm(blas_handle, 'N','N', nO, & + nO*nV*min(block_size, nV-iblock+1),nV, & + 1.d0, t1%f(1,1) , size(t1%f,1), & + X_vovv%f(1,1,1,1), size(X_vovv%f,1), & 0d0, Y_oovv%f(1,1,1,iblock), size(Y_oovv%f,1)) enddo + + call gpu_dgemm(blas_handle, 'N','N',nO,nO*nV*nO,nV, & + 1d0, t1%f(1,1), size(t1%f,1), & + X_vovo%f(1,1,1,1), size(X_vovo%f,1), & + 0d0, Y_oovo%f(1,1,1,1), size(Y_oovo%f,1)) + + call gpu_dgemm(blas_handle, 'N','N',nO*nO*nV, nV, nO, & + -1d0, Y_oovo%f(1,1,1,1), size(Y_oovo%f,1) * size(Y_oovo%f,2) * size(Y_oovo%f,3), & + t1%f(1,1) , size(t1%f,1), & + 1d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1) * size(Y_oovv%f,2) * size(Y_oovv%f,3)) + call gpu_synchronize() call gpu_deallocate(X_vovv) + call gpu_deallocate(X_vovo) + call gpu_deallocate(Y_oovo) + +! !$omp parallel & +! !$omp shared(nO,nV,r2,Y_oovv) & +! !$omp private(u,v,gam,beta) & +! !$omp default(none) +! !$omp do +! do gam = 1, nV +! do beta = 1, nV +! do v = 1, nO +! do u = 1, nO +! r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) + Y_oovv%f(u,v,gam,beta) + Y_oovv%f(v,u,beta,gam) +! enddo +! enddo +! enddo +! enddo +! !$omp end do +! !$omp end parallel + + do a=1,nV + call gpu_stream_create(stream(a)) + enddo - !$omp parallel & - !$omp shared(nO,nV,r2,Y_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do do gam = 1, nV do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) + Y_oovv%f(v,u,beta,gam) + Y_oovv%f(u,v,gam,beta) - enddo - enddo + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'T', 'N', nO, nO, 1.d0, Y_oovv%f(1,1,beta,gam), & + nO, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, 1.d0, r2%f(1,1,beta,gam), & + nO, 1.d0, Y_oovv%f(1,1,gam,beta), nO, r2%f(1,1,beta,gam), nO) enddo enddo - !$omp end do - !$omp end parallel + + call gpu_set_stream(blas_handle, gpu_default_stream) + + do a=1,nV + call gpu_stream_destroy(stream(a)) + enddo + + + call gpu_synchronize() call gpu_deallocate(Y_oovv) - double precision, allocatable :: X_ovvo(:,:,:,:) - double precision, allocatable :: tcc(:,:,:), tcc2(:,:,:) - allocate(tcc2(cholesky_mo_num,nV,nO), X_ovvo(nO,nV,nV,nO)) - allocate(tcc(cholesky_mo_num,nO,nV)) + type(gpu_double4) :: X_ovvo + type(gpu_double3) :: tcc, tcc2 + call gpu_allocate(tcc2,cholesky_mo_num,nV,nO) + call gpu_allocate(X_ovvo,nO,nV,nV,nO) + call gpu_allocate(tcc,cholesky_mo_num,nO,nV) - call dgemm('N','T', cholesky_mo_num*nV, nO, nV, 1.d0, & - d_cc_space_v_vv_chol%f, cholesky_mo_num*nV, t1%f, nO, & - 0.d0, tcc2, cholesky_mo_num*nV) + call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nV, nO, nV, 1.d0, & + d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num*nV, t1%f(1,1), nO, & + 0.d0, tcc2%f(1,1,1), cholesky_mo_num*nV) - call dgemm('N','N', cholesky_mo_num*nO, nV, nO, 1.d0, & - cc_space_v_oo_chol, cholesky_mo_num*nO, t1%f, nO, & - 0.d0, tcc, cholesky_mo_num*nO) + call gpu_dgemm(blas_handle, 'N','N', cholesky_mo_num*nO, nV, nO, 1.d0, & + d_cc_space_v_oo_chol%f(1,1,1), cholesky_mo_num*nO, t1%f(1,1), nO, & + 0.d0, tcc%f(1,1,1), cholesky_mo_num*nO) - call dgemm('T','N', nO*nV, nV*nO, cholesky_mo_num, 1.d0, & - tcc, cholesky_mo_num, tcc2, cholesky_mo_num, 0.d0, & - X_ovvo, nO*nV) + call gpu_dgemm(blas_handle, 'T','N', nO*nV, nV*nO, cholesky_mo_num, 1.d0, & + tcc%f(1,1,1), cholesky_mo_num, tcc2%f(1,1,1), cholesky_mo_num, 0.d0, & + X_ovvo%f(1,1,1,1), nO*nV) - deallocate(tcc, tcc2) + call gpu_synchronize() + + do a=1,nV + call gpu_stream_create(stream(a)) + enddo - !$omp parallel 
& - !$omp shared(nO,nV,r2,X_ovvo) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do do gam = 1, nV do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - X_ovvo(u,beta,gam,v) - enddo - enddo + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, -1.d0, X_ovvo%f(1,beta,gam,1), & + nO*nV*nV, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO) + call gpu_dgeam(blas_handle, 'T', 'N', nO, nO, -1.d0, X_ovvo%f(1,gam,beta,1), & + nO*nV*nV, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO) enddo enddo - !$omp end do - !$omp do - do beta = 1, nV - do gam = 1, nV - do v = 1, nO - do u = 1, nO - r2%f(v,u,gam,beta) = r2%f(v,u,gam,beta) - X_ovvo(u,beta,gam,v) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - deallocate(X_ovvo) + call gpu_set_stream(blas_handle, gpu_default_stream) + + do a=1,nV + call gpu_stream_destroy(stream(a)) + enddo + + call gpu_synchronize() + call gpu_deallocate(tcc) + call gpu_deallocate(tcc2) + call gpu_deallocate(X_ovvo) + !----- - call gpu_allocate(X_oovv,nO,nO,nV,nV) + type(gpu_double4) :: J1, K1 + type(gpu_double4) :: Y_voov, Z_ovov - call dgemm('N','N',nO*nO*nV,nV,nO, & - 1d0, cc_space_v_oovo, size(cc_space_v_oovo,1) * size(cc_space_v_oovo,2) * size(cc_space_v_oovo,3), & - t1%f , size(t1%f,1), & - 0d0, X_oovv%f, size(X_oovv%f,1) * size(X_oovv%f,2) * size(X_oovv%f,3)) + call gpu_allocate(J1,nO,nV,nV,nO) + call compute_J1_chol(nO,nV,t1%f,t2%f,d_cc_space_v_ovvo%f,d_cc_space_v_ovoo%f, & + d_cc_space_v_vvoo%f,J1%f) - !$omp parallel & - !$omp shared(nO,nV,r2,X_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - X_oovv%f(u,v,beta,gam) - X_oovv%f(v,u,gam,beta) - enddo - enddo + call gpu_allocate(K1,nO,nV,nO,nV) + call compute_K1_chol(nO,nV,t1%f,t2%f,d_cc_space_v_ovoo%f,d_cc_space_v_vvoo%f, & + d_cc_space_v_ovov%f,K1%f) + + + call gpu_allocate(X_ovvo,nO,nV,nV,nO) + call gpu_allocate(Y_voov,nV,nO,nO,nV) + + do a=1,nV + call gpu_stream_create(stream(a)) + enddo + + do i=1, nO + do a=1, nV + call gpu_set_stream(blas_handle, stream(a)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,a,1,i), & + nO*nV, -0.5d0, K1%f(1,a,i,1), nO*nV*nO, X_ovvo%f(1,1,a,i), nO) enddo enddo - !$omp end do - !$omp end parallel - call gpu_deallocate(X_oovv) - double precision, allocatable :: X_vovo(:,:,:,:), Y_oovo(:,:,:,:) - allocate(X_vovo(nV,nO,nV,nO)) - - !$omp parallel & - !$omp shared(nO,nV,X_vovo,cc_space_v_ovvo) & - !$omp private(a,v,gam,i) & - !$omp default(none) - do i = 1, nO - !$omp do - do gam = 1, nV - do v = 1, nO - do a = 1, nV - X_vovo(a,v,gam,i) = cc_space_v_ovvo(v,a,gam,i) - enddo - enddo - enddo - !$omp end do nowait - enddo - !$omp end parallel - - allocate(Y_oovo(nO,nO,nV,nO)) - call dgemm('N','N',nO,nO*nV*nO,nV, & - 1d0, t1%f, size(t1%f,1), & - X_vovo, size(X_vovo,1), & - 0d0, Y_oovo, size(Y_oovo,1)) - - deallocate(X_vovo) - call gpu_allocate(X_oovv,nO,nO,nV,nV) - call dgemm('N','N',nO*nO*nV, nV, nO, & - 1d0, Y_oovo, size(Y_oovo,1) * size(Y_oovo,2) * size(Y_oovo,3), & - t1%f , size(t1%f,1), & - 0d0, X_oovv%f, size(X_oovv%f,1) * size(X_oovv%f,2) * size(X_oovv%f,3)) - deallocate(Y_oovo) - - !$omp parallel & - !$omp shared(nO,nV,r2,X_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - 
X_oovv%f(u,v,gam,beta) - X_oovv%f(v,u,beta,gam) - enddo - enddo + do gam=1, nV + call gpu_set_stream(blas_handle, stream(gam)) + do v=1, nO + call gpu_dgeam(blas_handle, 'T', 'T', nV, nO, 2.d0, t2%f(1,v,1,gam), & + nO*nO, -1.d0, t2%f(1,v,gam,1), nO*nO*nV, Y_voov%f(1,1,v,gam), nV) enddo enddo - !$omp end do - !$omp end parallel - call gpu_deallocate(X_oovv) + call gpu_allocate(Z_ovov,nO,nV,nO,nV) - double precision, allocatable :: J1(:,:,:,:) - allocate(J1(nO,nV,nV,nO)) - call compute_J1_chol(nO,nV,t1%f,t2%f,cc_space_v_ovvo,cc_space_v_ovoo, & - cc_space_v_vvoo,J1) - - double precision, allocatable :: K1(:,:,:,:) - allocate(K1(nO,nV,nO,nV)) - call compute_K1_chol(nO,nV,t1%f,t2%f,cc_space_v_ovoo,cc_space_v_vvoo, & - cc_space_v_ovov,K1) - - allocate(X_ovvo(nO,nV,nV,nO)) - !$omp parallel & - !$omp private(u,v,gam,beta,i,a) & - !$omp default(shared) - do i = 1, nO - !$omp do - do a = 1, nV - do beta = 1, nV - do u = 1, nO - X_ovvo(u,beta,a,i) = (J1(u,a,beta,i) - 0.5d0 * K1(u,a,i,beta)) - enddo - enddo - enddo - !$omp end do nowait + do a=1,nV + call gpu_stream_destroy(stream(a)) enddo - !$omp end parallel - deallocate(J1) - double precision, allocatable :: Y_voov(:,:,:,:) - allocate(Y_voov(nV,nO,nO,nV)) + call gpu_deallocate(J1) + call gpu_set_stream(blas_handle, gpu_default_stream) - !$omp parallel & - !$omp private(u,v,gam,beta,i,a) & - !$omp default(shared) - !$omp do - do gam = 1, nV - do v = 1, nO - do i = 1, nO - do a = 1, nV - Y_voov(a,i,v,gam) = 2d0 * t2%f(i,v,a,gam) - t2%f(i,v,gam,a) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - double precision, allocatable :: Z_ovov(:,:,:,:) - allocate(Z_ovov(nO,nV,nO,nV)) + call gpu_dgemm(blas_handle, 'N','N', nO*nV,nO*nV,nV*nO, & + 1d0, X_ovvo%f(1,1,1,1), size(X_ovvo%f,1) * size(X_ovvo%f,2), & + Y_voov%f(1,1,1,1), size(Y_voov%f,1) * size(Y_voov%f,2), & + 0d0, Z_ovov%f(1,1,1,1), size(Z_ovov%f,1) * size(Z_ovov%f,2)) - call dgemm('N','N', nO*nV,nO*nV,nV*nO, & - 1d0, X_ovvo, size(X_ovvo,1) * size(X_ovvo,2), & - Y_voov, size(Y_voov,1) * size(Y_voov,2), & - 0d0, Z_ovov, size(Z_ovov,1) * size(Z_ovov,2)) + call gpu_synchronize() + call gpu_deallocate(Y_voov) + call gpu_deallocate(X_ovvo) - deallocate(X_ovvo,Y_voov) - - !$omp parallel & - !$omp shared(nO,nV,r2,Z_ovov) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) + Z_ovov(u,beta,v,gam) + Z_ovov(v,gam,u,beta) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - deallocate(Z_ovov) - - double precision, allocatable :: Y_ovov(:,:,:,:), X_ovov(:,:,:,:) - allocate(X_ovov(nO,nV,nO,nV)) - allocate(Y_ovov(nO,nV,nO,nV)) + type(gpu_double4) :: Y_ovov, X_ovov + call gpu_allocate(X_ovov,nO,nV,nO,nV) + call gpu_allocate(Y_ovov,nO,nV,nO,nV) +!TODO !$omp parallel & !$omp shared(nO,nV,K1,X_ovov,Y_ovov,t2) & !$omp private(u,a,i,beta,gam) & @@ -834,7 +779,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & do u = 1, nO do a = 1, nV do i = 1, nO - X_ovov(i,a,u,beta) = 0.5d0 * K1(u,a,i,beta) + X_ovov%f(i,a,u,beta) = 0.5d0 * K1%f(u,a,i,beta) enddo enddo enddo @@ -846,7 +791,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & do v = 1, nO do a = 1, nV do i = 1, nO - Y_ovov(i,a,v,gam) = t2%f(i,v,gam,a) + Y_ovov%f(i,a,v,gam) = t2%f(i,v,gam,a) enddo enddo enddo @@ -854,12 +799,12 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & !$omp end do !$omp end parallel - allocate(Z_ovov(nO,nV,nO,nV)) - call 
dgemm('T','N',nO*nV,nO*nV,nO*nV, & - 1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), & - Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), & - 0d0, Z_ovov, size(Y_ovov,1) * size(Y_ovov,2)) - deallocate(X_ovov, Y_ovov) + call gpu_dgemm(blas_handle, 'T','N',nO*nV,nO*nV,nO*nV, & + -1d0, X_ovov%f(1,1,1,1), size(X_ovov%f,1) * size(X_ovov%f,2), & + Y_ovov%f(1,1,1,1), size(Y_ovov%f,1) * size(Y_ovov%f,2), & + 1d0, Z_ovov%f(1,1,1,1), size(Z_ovov%f,1) * size(Z_ovov%f,2)) + + call gpu_synchronize() !$omp parallel & !$omp shared(nO,nV,r2,Z_ovov) & @@ -870,16 +815,14 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & do beta = 1, nV do v = 1, nO do u = 1, nO - r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - Z_ovov(u,beta,v,gam) - Z_ovov(v,gam,u,beta) + r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) + Z_ovov%f(u,beta,v,gam) + Z_ovov%f(v,gam,u,beta) enddo enddo enddo enddo !$omp end do !$omp end parallel - deallocate(Z_ovov) - allocate(X_ovov(nO,nV,nO,nV),Y_ovov(nO,nV,nO,nV)) !$omp parallel & !$omp shared(nO,nV,K1,X_ovov,Y_ovov,t2) & !$omp private(u,v,gam,beta,i,a) & @@ -889,7 +832,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & do i = 1, nO do gam = 1, nV do u = 1, nO - X_ovov(u,gam,i,a) = K1(u,a,i,gam) + X_ovov%f(u,gam,i,a) = K1%f(u,a,i,gam) enddo enddo enddo @@ -901,7 +844,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & do v = 1, nO do a = 1, nV do i = 1, nO - Y_ovov(i,a,v,beta) = t2%f(i,v,beta,a) + Y_ovov%f(i,a,v,beta) = t2%f(i,v,beta,a) enddo enddo enddo @@ -909,16 +852,19 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & !$omp end do !$omp end parallel - deallocate(K1) + call gpu_deallocate(K1) - allocate(Z_ovov(nO,nV,nO,nV)) - call dgemm('N','N',nO*nV,nO*nV,nO*nV, & - 1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), & - Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), & - 0d0, Z_ovov, size(Y_ovov,1) * size(Y_ovov,2)) + call gpu_dgemm(blas_handle, 'N','N',nO*nV,nO*nV,nO*nV, & + 1d0, X_ovov%f(1,1,1,1), size(X_ovov%f,1) * size(X_ovov%f,2), & + Y_ovov%f(1,1,1,1), size(Y_ovov%f,1) * size(Y_ovov%f,2), & + 0d0, Z_ovov%f(1,1,1,1), size(Z_ovov%f,1) * size(Z_ovov%f,2)) - deallocate(X_ovov,Y_ovov) + call gpu_synchronize() + call gpu_deallocate(X_ovov) + call gpu_deallocate(Y_ovov) + + ! Change the sign for consistency with the code in spin orbitals !$omp parallel & !$omp shared(nO,nV,r2,Z_ovov) & !$omp private(u,v,gam,beta) & @@ -928,7 +874,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & do beta = 1, nV do v = 1, nO do u = 1, nO - r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - Z_ovov(u,gam,v,beta) - Z_ovov(v,beta,u,gam) + r2%f(u,v,beta,gam) = -r2%f(u,v,beta,gam) + Z_ovov%f(u,gam,v,beta) + Z_ovov%f(v,beta,u,gam) enddo enddo enddo @@ -936,9 +882,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & !$omp end do !$omp end parallel - deallocate(Z_ovov) - - ! 
Change the sign for consistency with the code in spin orbitals + call gpu_deallocate(Z_ovov) max_r2 = 0d0 !$omp parallel & @@ -951,7 +895,6 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & do a = 1, nV do j = 1, nO do i = 1, nO - r2%f(i,j,a,b) = -r2%f(i,j,a,b) max_r2_local = max(r2%f(i,j,a,b), max_r2_local) enddo enddo From f09e91cb2296b3c1fb5cf854dc06372c879c4104 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Mon, 8 Jul 2024 12:44:32 +0200 Subject: [PATCH 33/38] Working on CCSD GPU --- src/ccsd/ccsd_space_orb_sub.irp.f | 1 + src/ccsd/ccsd_space_orb_sub_chol.irp.f | 874 +++++++++++-------------- 2 files changed, 392 insertions(+), 483 deletions(-) diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index e97c2325..d8131a9c 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -215,6 +215,7 @@ subroutine run_ccsd_space_orb call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, d_cc_space_v_oovo, d_cc_space_v_ovvo, d_cc_space_v_ovoo, & d_cc_space_v_ovov, d_cc_space_v_vvoo, d_cc_space_v_oo_chol, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, & + d_cc_space_f_vo, & r2, max_r2) else call compute_H_oo(nO,nV,t1%f,t2%f,tau%f,H_oo%f) diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index abb9909b..a185df13 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -394,13 +394,14 @@ end subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, d_cc_space_v_oovo, d_cc_space_v_ovvo, d_cc_space_v_ovoo, & d_cc_space_v_ovov, d_cc_space_v_vvoo, d_cc_space_v_oo_chol, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, & + d_cc_space_f_vo, & r2,max_r2) use gpu implicit none ! in integer, intent(in) :: nO, nV - type(gpu_double2), intent(in) :: t1, H_oo, H_vv + type(gpu_double2), intent(in) :: t1, H_oo, H_vv, d_cc_space_f_vo type(gpu_double4), intent(in) :: t2, tau, d_cc_space_v_oovv type(gpu_double4), intent(in) :: d_cc_space_v_vooo, d_cc_space_v_oooo type(gpu_double4), intent(in) :: d_cc_space_v_vvoo, d_cc_space_v_oovo @@ -504,7 +505,8 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & type(gpu_double2) :: g_occ, g_vir call gpu_allocate(g_vir,nV,nV) call gpu_allocate(g_occ,nO,nO) - call compute_g_vir_chol(nO,nV,t1%f,t2%f,H_vv%f,g_vir%f) + call compute_g_vir_chol(nO,nV,t1,t2,H_vv,d_cc_space_f_vo, & + d_cc_space_v_ov_chol, d_cc_space_v_vv_chol, g_vir) call compute_g_occ_chol(nO,nV,t1%f,t2%f,H_oo%f,g_occ%f) type(gpu_double4) :: Y_oovv @@ -525,6 +527,10 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & t1%f(1,1) , size(t1%f,1), & 1d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1) * size(Y_oovv%f,2) * size(Y_oovv%f,3)) + + call gpu_dgeam(blas_handle, 'N', 'N', nO*nO, nV*nV, 1.d0, Y_oovv%f(1,1,1,1), & + nO*nO, 1.d0, r2%f(1,1,1,1), nO*nO, r2%f(1,1,1,1), nO*nO) + call gpu_synchronize() call gpu_deallocate(X_oovv) @@ -534,32 +540,13 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & type(gpu_double4) :: X_vovo, Y_oovo call gpu_allocate(X_vovo,nV,nO,nV,nO) -! !$omp parallel & -! !$omp shared(nO,nV,r2,Y_oovv) & -! !$omp private(u,v,gam,beta) & -! !$omp default(none) -! !$omp do -! do gam = 1, nV -! do beta = 1, nV -! do v = 1, nO -! do u = 1, nO -! r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) + Y_oovv%f(u,v,beta,gam) + Y_oovv%f(v,u,gam,beta) -! enddo -! enddo -! 
enddo -! enddo -! !$omp end do -! !$omp end parallel - do a=1,nV call gpu_stream_create(stream(a)) enddo do gam = 1, nV + call gpu_set_stream(blas_handle, stream(gam)) do beta = 1, nV - call gpu_set_stream(blas_handle, stream(beta)) - call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, 1.d0, Y_oovv%f(1,1,beta,gam), & - nO, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO) call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, 1.d0, r2%f(1,1,beta,gam), & nO, 1.d0, Y_oovv%f(1,1,gam,beta), nO, r2%f(1,1,beta,gam), nO) enddo @@ -579,34 +566,33 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & call gpu_set_stream(blas_handle, gpu_default_stream) - type(gpu_double4) :: X_vovv - call gpu_allocate(X_vovv,nV,nO,nV,block_size) call gpu_allocate(Y_oovo,nO,nO,nV,nO) + !$OMP PARALLEL PRIVATE(blas, iblock, gam, X_vovv) + call gpu_blas_create(blas) + type(gpu_double4) :: X_vovv + call gpu_allocate(X_vovv,nV,nO,nV,block_size) + !$OMP DO do iblock = 1, nV, block_size do gam = iblock, min(nV, iblock+block_size-1) - call gpu_stream_create(stream(gam)) - call gpu_set_stream(blas_handle, stream(gam)) - call gpu_dgemm(blas_handle, 'T','N',nV, nO*nV, cholesky_mo_num, 1.d0, & + call gpu_dgemm(blas, 'T','N',nV, nO*nV, cholesky_mo_num, 1.d0, & d_cc_space_v_vv_chol%f(1,1,gam), cholesky_mo_num, d_cc_space_v_ov_chol%f(1,1,1), & cholesky_mo_num, 0.d0, X_vovv%f(1,1,1,gam-iblock+1), nV) enddo - do gam = iblock, min(nV, iblock+block_size-1) - call gpu_stream_destroy(stream(gam)) - enddo - call gpu_synchronize() - - call gpu_set_stream(blas_handle, gpu_default_stream) - call gpu_dgemm(blas_handle, 'N','N', nO, & + call gpu_dgemm(blas, 'N','N', nO, & nO*nV*min(block_size, nV-iblock+1),nV, & 1.d0, t1%f(1,1) , size(t1%f,1), & X_vovv%f(1,1,1,1), size(X_vovv%f,1), & 0d0, Y_oovv%f(1,1,1,iblock), size(Y_oovv%f,1)) - enddo + !$OMP END DO + + call gpu_blas_destroy(blas) + call gpu_deallocate(X_vovv) + !$OMP END PARALLEL call gpu_dgemm(blas_handle, 'N','N',nO,nO*nV*nO,nV, & 1d0, t1%f(1,1), size(t1%f,1), & @@ -619,47 +605,27 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & 1d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1) * size(Y_oovv%f,2) * size(Y_oovv%f,3)) call gpu_synchronize() - call gpu_deallocate(X_vovv) call gpu_deallocate(X_vovo) call gpu_deallocate(Y_oovo) -! !$omp parallel & -! !$omp shared(nO,nV,r2,Y_oovv) & -! !$omp private(u,v,gam,beta) & -! !$omp default(none) -! !$omp do -! do gam = 1, nV -! do beta = 1, nV -! do v = 1, nO -! do u = 1, nO -! r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) + Y_oovv%f(u,v,gam,beta) + Y_oovv%f(v,u,beta,gam) -! enddo -! enddo -! enddo -! enddo -! !$omp end do -! 
!$omp end parallel - do a=1,nV call gpu_stream_create(stream(a)) enddo do gam = 1, nV + call gpu_set_stream(blas_handle, stream(gam)) do beta = 1, nV - call gpu_set_stream(blas_handle, stream(beta)) call gpu_dgeam(blas_handle, 'T', 'N', nO, nO, 1.d0, Y_oovv%f(1,1,beta,gam), & nO, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO) - call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, 1.d0, r2%f(1,1,beta,gam), & - nO, 1.d0, Y_oovv%f(1,1,gam,beta), nO, r2%f(1,1,beta,gam), nO) + enddo + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, r2%f(1,j,1,gam), & + nO*nO, 1.d0, Y_oovv%f(1,j,gam,1), nO*nO*nV, r2%f(1,j,1,gam), nO*nO) enddo enddo call gpu_set_stream(blas_handle, gpu_default_stream) - do a=1,nV - call gpu_stream_destroy(stream(a)) - enddo - call gpu_synchronize() call gpu_deallocate(Y_oovv) @@ -684,15 +650,14 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & call gpu_synchronize() - do a=1,nV - call gpu_stream_create(stream(a)) - enddo do gam = 1, nV + call gpu_set_stream(blas_handle, stream(gam)) + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, -1.d0, X_ovvo%f(1,1,gam,j), & + nO, 1.d0, r2%f(1,j,1,gam), nO*nO, r2%f(1,j,1,gam), nO*nO) + enddo do beta = 1, nV - call gpu_set_stream(blas_handle, stream(beta)) - call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, -1.d0, X_ovvo%f(1,beta,gam,1), & - nO*nV*nV, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO) call gpu_dgeam(blas_handle, 'T', 'N', nO, nO, -1.d0, X_ovvo%f(1,gam,beta,1), & nO*nV*nV, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO) enddo @@ -700,58 +665,41 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & call gpu_set_stream(blas_handle, gpu_default_stream) - do a=1,nV - call gpu_stream_destroy(stream(a)) - enddo - - call gpu_synchronize() + call gpu_synchronize call gpu_deallocate(tcc) call gpu_deallocate(tcc2) call gpu_deallocate(X_ovvo) - !----- type(gpu_double4) :: J1, K1 type(gpu_double4) :: Y_voov, Z_ovov + call gpu_allocate(J1,nO,nV,nV,nO) - call compute_J1_chol(nO,nV,t1%f,t2%f,d_cc_space_v_ovvo%f,d_cc_space_v_ovoo%f, & - d_cc_space_v_vvoo%f,J1%f) + call compute_J1_chol(nO,nV,t1,t2,d_cc_space_v_ovvo,d_cc_space_v_ovoo, & + d_cc_space_v_vvoo,d_cc_space_v_vo_chol,d_cc_space_v_vv_chol,J1) call gpu_allocate(K1,nO,nV,nO,nV) - call compute_K1_chol(nO,nV,t1%f,t2%f,d_cc_space_v_ovoo%f,d_cc_space_v_vvoo%f, & - d_cc_space_v_ovov%f,K1%f) + call compute_K1_chol(nO,nV,t1,t2,d_cc_space_v_ovoo,d_cc_space_v_vvoo, & + d_cc_space_v_ovov,d_cc_space_v_ov_chol,d_cc_space_v_vv_chol,K1) call gpu_allocate(X_ovvo,nO,nV,nV,nO) call gpu_allocate(Y_voov,nV,nO,nO,nV) - do a=1,nV - call gpu_stream_create(stream(a)) - enddo - - do i=1, nO - do a=1, nV - call gpu_set_stream(blas_handle, stream(a)) + do a=1, nV + call gpu_set_stream(blas_handle, stream(a)) + do i=1, nO call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,a,1,i), & nO*nV, -0.5d0, K1%f(1,a,i,1), nO*nV*nO, X_ovvo%f(1,1,a,i), nO) - enddo - enddo - - do gam=1, nV - call gpu_set_stream(blas_handle, stream(gam)) - do v=1, nO - call gpu_dgeam(blas_handle, 'T', 'T', nV, nO, 2.d0, t2%f(1,v,1,gam), & - nO*nO, -1.d0, t2%f(1,v,gam,1), nO*nO*nV, Y_voov%f(1,1,v,gam), nV) + call gpu_dgeam(blas_handle, 'T', 'T', nV, nO, 2.d0, t2%f(1,i,1,a), & + nO*nO, -1.d0, t2%f(1,i,a,1), nO*nO*nV, Y_voov%f(1,1,i,a), nV) enddo enddo call gpu_allocate(Z_ovov,nO,nV,nO,nV) - do a=1,nV - call gpu_stream_destroy(stream(a)) - enddo - + call gpu_synchronize() call gpu_deallocate(J1) call gpu_set_stream(blas_handle, gpu_default_stream) @@ -769,35 +717,20 @@ subroutine 
compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & call gpu_allocate(X_ovov,nO,nV,nO,nV) call gpu_allocate(Y_ovov,nO,nV,nO,nV) -!TODO - !$omp parallel & - !$omp shared(nO,nV,K1,X_ovov,Y_ovov,t2) & - !$omp private(u,a,i,beta,gam) & - !$omp default(none) - !$omp do - do beta = 1, nV - do u = 1, nO - do a = 1, nV - do i = 1, nO - X_ovov%f(i,a,u,beta) = 0.5d0 * K1%f(u,a,i,beta) - enddo - enddo + do a=1, nV + call gpu_set_stream(blas_handle, stream(a)) + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, t2%f(1,j,1,a), & + nO*nO, 0.d0, t2%f(1,j,1,a), nO*nO, Y_ovov%f(1,a,j,1), nO*nV*nO) + enddo + do beta=1, nV + call gpu_dgeam(blas_handle, 'T', 'T', nO, nO, 0.5d0, K1%f(1,a,1,beta), & + nO*nV, 0.d0, K1%f(1,a,1,beta), nO*nV, X_ovov%f(1,a,1,beta), nO*nV) enddo enddo - !$omp end do nowait + call gpu_set_stream(blas_handle, gpu_default_stream) - !$omp do - do gam = 1, nV - do v = 1, nO - do a = 1, nV - do i = 1, nO - Y_ovov%f(i,a,v,gam) = t2%f(i,v,gam,a) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel + call gpu_synchronize() call gpu_dgemm(blas_handle, 'T','N',nO*nV,nO*nV,nO*nV, & -1d0, X_ovov%f(1,1,1,1), size(X_ovov%f,1) * size(X_ovov%f,2), & @@ -806,51 +739,23 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & call gpu_synchronize() - !$omp parallel & - !$omp shared(nO,nV,r2,Z_ovov) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) + Z_ovov%f(u,beta,v,gam) + Z_ovov%f(v,gam,u,beta) - enddo - enddo + do gam=1, nV + call gpu_set_stream(blas_handle, stream(gam)) + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, r2%f(1,j,1,gam), & + nO*nO, 1.d0, Z_ovov%f(1,1,j,gam), nO, r2%f(1,j,1,gam), nO*nO) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, K1%f(1,1,j,gam), & + nO, 0.d0, K1%f(1,1,j,gam), nO, X_ovov%f(1,gam,j,1), nO*nV*nO) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, 1.d0, t2%f(1,j,1,gam), & + nO*nO, 0.d0, t2%f(1,j,1,gam), nO*nO, Y_ovov%f(1,gam,j,1), nO*nV*nO) + enddo + do beta=1, nV + call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, 1.d0, r2%f(1,1,beta,gam), & + nO, 1.d0, Z_ovov%f(1,gam,1,beta), nO*nV, r2%f(1,1,beta,gam), nO) enddo enddo - !$omp end do - !$omp end parallel - !$omp parallel & - !$omp shared(nO,nV,K1,X_ovov,Y_ovov,t2) & - !$omp private(u,v,gam,beta,i,a) & - !$omp default(none) - !$omp do - do a = 1, nV - do i = 1, nO - do gam = 1, nV - do u = 1, nO - X_ovov%f(u,gam,i,a) = K1%f(u,a,i,gam) - enddo - enddo - enddo - enddo - !$omp end do nowait - - !$omp do - do beta = 1, nV - do v = 1, nO - do a = 1, nV - do i = 1, nO - Y_ovov%f(i,a,v,beta) = t2%f(i,v,beta,a) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel + call gpu_set_stream(blas_handle, gpu_default_stream) call gpu_deallocate(K1) @@ -865,22 +770,17 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & call gpu_deallocate(Y_ovov) ! 
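These hunks rely on a detail that is easy to miss: gpu_dgeam takes explicit leading dimensions, so a slice of a 4-index tensor can be presented as a matrix whose columns are strided, and the old OpenMP permutation loops (for example Y_ovov(i,a,v,gam) = t2(i,v,gam,a)) collapse into one call per slice. A sketch of the idiom with placeholder containers t2(nO,nO,nV,nV) and Y(nO,nV,nO,nV), reusing the stream array created earlier in the routine:

```fortran
! Y(i,a,j,b) <- t2(i,j,b,a), one (j,a) slice at a time. Each slice is an
! nO x nV matrix: columns of the source advance the 3rd index of t2
! (stride nO*nO), columns of the target advance the 4th index of Y
! (stride nO*nV*nO). The 0.d0 coefficient makes this a pure copy.
do a = 1, nV
  call gpu_set_stream(blas_handle, stream(a))
  do j = 1, nO
    call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, t2%f(1,j,1,a), &
        nO*nO, 0.d0, t2%f(1,j,1,a), nO*nO, Y%f(1,a,j,1), nO*nV*nO)
  enddo
enddo
```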
Change the sign for consistency with the code in spin orbitals - !$omp parallel & - !$omp shared(nO,nV,r2,Z_ovov) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do do gam = 1, nV + call gpu_set_stream(blas_handle, stream(gam)) + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, r2%f(1,j,1,gam), & + nO*nO, -1.d0, Z_ovov%f(1,gam,j,1), nO*nV*nO, r2%f(1,j,1,gam), nO*nO) + enddo do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2%f(u,v,beta,gam) = -r2%f(u,v,beta,gam) + Z_ovov%f(u,gam,v,beta) + Z_ovov%f(v,beta,u,gam) - enddo - enddo + call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, -1.d0, r2%f(1,1,beta,gam), & + nO, 1.d0, Z_ovov%f(1,beta,1,gam), nO*nV, r2%f(1,1,beta,gam), nO) enddo enddo - !$omp end do - !$omp end parallel call gpu_deallocate(Z_ovov) @@ -929,34 +829,42 @@ subroutine compute_A1_chol(nO,nV,t1,t2,tau,d_cc_space_v_vooo, & ! A1(u,v,i,j) = cc_space_v_oooo(u,v,i,j) ! A1(u,v,i,j) += cc_space_v_ovoo(u,a,i,j) * t1(v,a) & - call dgemm('N','N', nO, nO*nO*nO, nV, & - 1d0, t1%f , size(t1%f,1), & - d_cc_space_v_vooo%f, size(d_cc_space_v_vooo%f,1), & - 0d0, Y_oooo%f, size(Y_oooo%f,1)) + call gpu_dgemm(blas_handle, 'N','N', nO, nO*nO*nO, nV, & + 1d0, t1%f(1,1) , size(t1%f,1), & + d_cc_space_v_vooo%f(1,1,1,1), size(d_cc_space_v_vooo%f,1), & + 0d0, Y_oooo%f(1,1,1,1), size(Y_oooo%f,1)) - !$omp parallel & - !$omp private(u,v,i,j) & - !$omp default(shared) - !$omp do collapse(2) - do j = 1, nO - do i = 1, nO - do v = 1, nO - do u = 1, nO - A1%f(u,v,i,j) = d_cc_space_v_oooo%f(u,v,i,j) + Y_oooo%f(v,u,j,i) + Y_oooo%f(u,v,i,j) - enddo - enddo - enddo + type(gpu_stream) :: stream(nO) + + do i=1, nO + call gpu_stream_create(stream(i)) + enddo + + call gpu_synchronize() + + do j = 1, nO + call gpu_set_stream(blas_handle, stream(j)) + do i = 1, nO + call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, 1.d0, d_cc_space_v_oooo%f(1,1,i,j), & + nO, 1.d0, Y_oooo%f(1,1,j,i), nO, A1%f(1,1,i,j), nO) + enddo + call gpu_dgeam(blas_handle, 'N', 'N', nO, nO*nO, 1.d0, A1%f(1,1,1,j), & + nO, 1.d0, Y_oooo%f(1,1,1,j), nO, A1%f(1,1,1,j), nO) + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + do i=1, nO + call gpu_stream_destroy(stream(i)) enddo - !$omp end do - !$omp end parallel call gpu_deallocate(Y_oooo) ! A1(u,v,i,j) += cc_space_v_vvoo(a,b,i,j) * tau(u,v,a,b) - call dgemm('N','N', nO*nO, nO*nO, nV*nV, & - 1d0, tau%f , size(tau%f,1) * size(tau%f,2), & - d_cc_space_v_vvoo%f, size(d_cc_space_v_vvoo%f,1) * size(d_cc_space_v_vvoo%f,2), & - 1d0, A1%f , size(A1%f,1) * size(A1%f,2)) + call gpu_dgemm(blas_handle, 'N','N', nO*nO, nO*nO, nV*nV, & + 1d0, tau%f(1,1,1,1), size(tau%f,1) * size(tau%f,2), & + d_cc_space_v_vvoo%f(1,1,1,1), size(d_cc_space_v_vvoo%f,1) * size(d_cc_space_v_vvoo%f,2), & + 1d0, A1%f(1,1,1,1), size(A1%f,1) * size(A1%f,2)) + call gpu_synchronize() end @@ -998,364 +906,364 @@ end ! 
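As in the r2 updates, compute_A1_chol now expresses its symmetrization with gpu_dgeam, which follows the cuBLAS-extension semantics C = alpha*op(A) + beta*op(B). For reference, the correspondence on nO x nO matrices, where A, B, C stand for device-resident arrays (in the real code, %f slices of gpu containers) and alpha, beta are placeholder scalars:

```fortran
! CPU reference loop:
do j = 1, nO
  do i = 1, nO
    C(i,j) = alpha * A(i,j) + beta * B(j,i)   ! transpose on B
  enddo
enddo

! Equivalent single call; it is queued on whichever stream is currently
! attached to blas_handle:
call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, alpha, A, nO, beta, B, nO, C, nO)
```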
g_vir -subroutine compute_g_vir_chol(nO,nV,t1,t2,H_vv,g_vir) +subroutine compute_g_vir_chol(nO,nV,t1,t2,H_vv,d_cc_space_f_vo, & + d_cc_space_v_ov_chol, d_cc_space_v_vv_chol, g_vir) use gpu implicit none integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV), H_vv(nV, nV) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(out) :: g_vir(nV, nV) + type(gpu_double2), intent(in) :: t1, H_vv, d_cc_space_f_vo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol + type(gpu_double4), intent(in) :: t2 + type(gpu_double2), intent(out) :: g_vir integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam - call dgemm('N','N',nV,nV,nO, & - -1d0, cc_space_f_vo , size(cc_space_f_vo,1), & - t1 , size(t1,1), & - 0d0, g_vir, size(g_vir,1)) + type(gpu_stream) :: stream(max(nO,4)) - double precision, allocatable :: tmp_k(:), tmp_vo(:,:,:), tmp_vo2(:,:,:) - allocate(tmp_k(cholesky_mo_num)) - call dgemm('N','N', cholesky_mo_num, 1, nO*nV, 1.d0, & - cc_space_v_ov_chol, cholesky_mo_num, t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num) - - call dgemm('T','N', nV*nV, 1, cholesky_mo_num, 2.d0, & - cc_space_v_vv_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, & - g_vir, nV*nV) - deallocate(tmp_k) - - allocate(tmp_vo(cholesky_mo_num,nV,nO)) - call dgemm('N','T',cholesky_mo_num*nV, nO, nV, 1.d0, & - cc_space_v_vv_chol, cholesky_mo_num*nV, t1, nO, 0.d0, tmp_vo, cholesky_mo_num*nV) - - allocate(tmp_vo2(cholesky_mo_num,nO,nV)) - do beta=1,nV - do i=1,nO - do k=1,cholesky_mo_num - tmp_vo2(k,i,beta) = -tmp_vo(k,beta,i) - enddo - enddo - enddo - deallocate(tmp_vo) - - do beta = 1, nV - do a = 1, nV - g_vir(a,beta) = g_vir(a,beta) + H_vv(a,beta) - enddo + do i=1,max(nO,4) + call gpu_stream_create(stream(i)) enddo - call dgemm('T','N', nV, nV, nO*cholesky_mo_num, 1.d0, & - cc_space_v_ov_chol, cholesky_mo_num*nO, & - tmp_vo2, cholesky_mo_num*nO, 1.d0, g_vir, nV) + call gpu_set_stream(blas_handle, stream(1)) + call gpu_dgemm(blas_handle, 'N','N',nV,nV,nO, & + -1d0, d_cc_space_f_vo%f(1,1) , size(d_cc_space_f_vo%f,1), & + t1%f(1,1) , size(t1%f,1), & + 0d0, g_vir%f(1,1), size(g_vir%f,1)) + + type(gpu_double1) :: tmp_k + type(gpu_double3) :: tmp_vo, tmp_vo2 + + call gpu_allocate(tmp_k,cholesky_mo_num) + + call gpu_set_stream(blas_handle, stream(2)) + call gpu_dgemm(blas_handle, 'N','N', cholesky_mo_num, 1, nO*nV, 1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, t1%f(1,1), nO*nV, 0.d0, tmp_k%f(1), cholesky_mo_num) + + call gpu_dgemm(blas_handle, 'T','N', nV*nV, 1, cholesky_mo_num, 2.d0, & + d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num, tmp_k%f(1), cholesky_mo_num, 1.d0, & + g_vir%f(1,1), nV*nV) + + call gpu_set_stream(blas_handle, stream(3)) + call gpu_allocate(tmp_vo,cholesky_mo_num,nV,nO) + + call gpu_dgemm(blas_handle, 'N','T',cholesky_mo_num*nV, nO, nV, 1.d0, & + d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num*nV, t1%f(1,1), nO, 0.d0, tmp_vo%f(1,1,1), cholesky_mo_num*nV) + + call gpu_allocate(tmp_vo2,cholesky_mo_num,nO,nV) + + call gpu_synchronize() + call gpu_deallocate(tmp_k) + + do i=1,nO + call gpu_set_stream(blas_handle, stream(i)) + call gpu_dgeam(blas_handle, 'N', 'N', cholesky_mo_num, nV, -1.d0, tmp_vo%f(1,1,i), & + cholesky_mo_num, 0.d0, tmp_vo%f(1,1,i), cholesky_mo_num, tmp_vo2%f(1,i,1), cholesky_mo_num*nO) + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + + do i=1,max(nO,4) + call gpu_stream_destroy(stream(i)) + enddo + call gpu_deallocate(tmp_vo) + + call gpu_dgeam(blas_handle, 'N', 'N', nV, nV, 1.d0, g_vir%f(1,1), & + nV, 
1.d0, H_vv%f(1,1), nV, g_vir%f(1,1), nV) + + call gpu_dgemm(blas_handle, 'T','N', nV, nV, nO*cholesky_mo_num, 1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, & + tmp_vo2%f(1,1,1), cholesky_mo_num*nO, 1.d0, g_vir%f(1,1), nV) + + call gpu_synchronize() + call gpu_deallocate(tmp_vo2) end ! J1 -subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvoo,J1) +subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvoo,d_cc_space_v_vo_chol,d_cc_space_v_vv_chol,J1) use gpu implicit none - integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(in) :: v_ovvo(nO,nV,nV,nO), v_ovoo(nO,nV,nO,nO) - double precision, intent(in) :: v_vvoo(nV,nV,nO,nO) - double precision, intent(out) :: J1(nO, nV, nV, nO) + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: t1 + type(gpu_double4), intent(in) :: t2, v_ovvo, v_ovoo, v_vvoo + type(gpu_double4), intent(out) :: J1 + type(gpu_double3), intent(out) :: d_cc_space_v_vo_chol,d_cc_space_v_vv_chol integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam - double precision, allocatable :: X_ovoo(:,:,:,:), Y_ovov(:,:,:,:) - allocate(X_ovoo(nO,nV,nO,nO),Y_ovov(nO,nV,nO,nV)) + type(gpu_double4) :: X_ovoo, Y_ovov - !$omp parallel & - !$omp shared(nO,nV,J1,v_ovvo,v_ovoo,X_ovoo) & - !$omp private(i,j,a,u,beta) & - !$omp default(none) - do i = 1, nO - !$omp do - do beta = 1, nV - do a = 1, nV - do u = 1, nO - J1(u,a,beta,i) = v_ovvo(u,a,beta,i) - enddo - enddo - enddo - !$omp end do nowait - enddo + call gpu_allocate(X_ovoo,nO,nV,nO,nO) - !$omp do collapse(2) - do j = 1, nO - do i = 1, nO - do a = 1, nV - do u = 1, nO - X_ovoo(u,a,i,j) = v_ovoo(u,a,j,i) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel + type(gpu_stream) :: stream(nV) - call dgemm('N','N',nO*nV*nO,nV,nO, & - -1d0, X_ovoo, size(X_ovoo,1) * size(X_ovoo,2) * size(X_ovoo,3), & - t1 , size(t1,1), & - 0d0, Y_ovov, size(Y_ovov,1) * size(Y_ovov,2) * size(Y_ovov,3)) - - !$omp parallel & - !$omp shared(nO,nV,J1,Y_ovov) & - !$omp private(i,beta,a,u) & - !$omp default(none) - do i = 1, nO - !$omp do - do beta = 1, nV - do a = 1, nV - do u = 1, nO - J1(u,a,beta,i) = J1(u,a,beta,i) + Y_ovov(u,a,i,beta) - enddo - enddo - enddo - !$omp end do nowait - enddo - !$omp end parallel - deallocate(X_ovoo) - - double precision, allocatable :: tmp_cc(:,:,:), J1_tmp(:,:,:,:) - allocate(tmp_cc(cholesky_mo_num,nV,nO), J1_tmp(nV,nO,nV,nO)) - - call dgemm('N','T', cholesky_mo_num*nV, nO, nV, 1.d0, & - cc_space_v_vv_chol, cholesky_mo_num*nV, & - t1, nO, & - 0.d0, tmp_cc, cholesky_mo_num*nV) - - call dgemm('T','N', nV*nO, nV*nO, cholesky_mo_num, 1.d0, & - tmp_cc, cholesky_mo_num, cc_space_v_vo_chol, cholesky_mo_num, & - 0.d0, J1_tmp, nV*nO) - - deallocate(tmp_cc) do i=1,nO - do b=1,nV - do a=1,nV - do u=1,nO - J1(u,a,b,i) = J1(u,a,b,i) + J1_tmp(b,u,a,i) - enddo - enddo - enddo - enddo - - deallocate(J1_tmp) - - !- cc_space_v_vvoo(a,b,i,j) * (0.5d0 * t2(u,j,b,beta) + t1(u,b) * t1(j,beta)) & - double precision, allocatable :: X_voov(:,:,:,:), Z_ovvo(:,:,:,:) - allocate(X_voov(nV,nO,nO,nV), Z_ovvo(nO,nV,nV,nO)) - !$omp parallel & - !$omp shared(nO,nV,t2,t1,Y_ovov,X_voov,v_vvoo) & - !$omp private(i,beta,a,u,b,j) & - !$omp default(none) - !$omp do - do b = 1, nV - do j = 1, nO - do beta = 1, nV - do u = 1, nO - Y_ovov(u,beta,j,b) = 0.5d0 * t2(u,j,b,beta) + t1(u,b) * t1(j,beta) - enddo - enddo - enddo - enddo - !$omp end do nowait - - !$omp do - do b = 1, nV - do j = 1, nO - do i = 1, nO - do a = 1, nV 
- X_voov(a,i,j,b) = v_vvoo(a,b,i,j) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - call dgemm('N','T',nO*nV,nV*nO,nO*nV, & - -1d0, Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), & - X_voov, size(X_voov,1) * size(X_voov,2), & - 0d0, Z_ovvo, size(Z_ovvo,1) * size(Z_ovvo,2)) - deallocate(X_voov) - - double precision, allocatable :: X_ovvo(:,:,:,:), Y_vovo(:,:,:,:) - allocate(X_ovvo(nO,nV,nV,nO),Y_vovo(nV,nO,nV,nO)) - !$omp parallel & - !$omp shared(nO,nV,J1,Z_ovvo,t2,Y_vovo,v_vvoo,X_ovvo) & - !$omp private(i,beta,a,u,j,b) & - !$omp default(none) - do i = 1, nO - !$omp do - do beta = 1, nV - do a = 1, nV - do u = 1, nO - J1(u,a,beta,i) = J1(u,a,beta,i) + Z_ovvo(u,beta,a,i) - enddo - enddo - enddo - !$omp end do nowait - enddo - - !+ 0.5d0 * (2d0 * cc_space_v_vvoo(a,b,i,j) - cc_space_v_vvoo(b,a,i,j)) * t2(u,j,beta,b) - do j = 1, nO - !$omp do - do b = 1, nV - do i = 1, nO - do a = 1, nV - Y_vovo(a,i,b,j) = 0.5d0 * (2d0 * v_vvoo(a,b,i,j) - v_vvoo(b,a,i,j)) - enddo - enddo - enddo - !$omp end do nowait + call gpu_stream_create(stream(i)) enddo do j = 1, nO - !$omp do - do b = 1, nV - do beta = 1, nV - do u = 1, nO - X_ovvo(u,beta,b,j) = t2(u,j,beta,b) - enddo - enddo + call gpu_set_stream(blas_handle, stream(j)) + do i = 1, nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, v_ovoo%f(1,1,j,i), & + nO, 0.d0, X_ovoo%f(1,1,i,j), nO, X_ovoo%f(1,1,i,j), nO) enddo - !$omp end do nowait enddo - !$omp end parallel - call dgemm('N','T',nO*nV,nV*nO,nV*nO, & - 1d0, X_ovvo, size(X_ovvo,1) * size(X_ovvo,2), & - Y_vovo, size(Y_vovo,1) * size(Y_vovo,2), & - 0d0, Z_ovvo, size(Z_ovvo,1) * size(Z_ovvo,2)) + call gpu_set_stream(blas_handle, gpu_default_stream) + + do i=1,nO + call gpu_stream_destroy(stream(i)) + enddo + + call gpu_allocate(Y_ovov,nO,nV,nO,nV) + + call gpu_dgemm(blas_handle, 'N','N',nO*nV*nO,nV,nO, & + -1d0, X_ovoo%f(1,1,1,1), size(X_ovoo%f,1) * size(X_ovoo%f,2) * size(X_ovoo%f,3), & + t1%f(1,1) , size(t1%f,1), & + 0d0, Y_ovov%f(1,1,1,1), size(Y_ovov%f,1) * size(Y_ovov%f,2) * size(Y_ovov%f,3)) + + + call gpu_copy(v_ovvo, J1) + + call gpu_synchronize() + + do a=1,nV + call gpu_stream_create(stream(a)) + enddo - !$omp parallel & - !$omp shared(nO,nV,J1,Z_ovvo) & - !$omp private(i,beta,a,u) & - !$omp default(none) do i = 1, nO - !$omp do do beta = 1, nV - do a = 1, nV - do u = 1, nO - J1(u,a,beta,i) = J1(u,a,beta,i) + Z_ovvo(u,beta,a,i) - enddo - enddo + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,1,beta,i), & + nO, 1.d0, Y_ovov%f(1,1,i,beta), nO, J1%f(1,1,beta,i), nO) enddo - !$omp end do nowait enddo - !$omp end parallel - deallocate(X_ovvo,Z_ovvo,Y_ovov) + call gpu_allocate(tmp_cc,cholesky_mo_num,nV,nO) + call gpu_allocate(J1_tmp,nV,nO,nV,nO) + + call gpu_set_stream(blas_handle, gpu_default_stream) + + type(gpu_double4) :: J1_tmp + type(gpu_double3) :: tmp_cc + + call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nV, nO, nV, 1.d0, & + d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num*nV, & + t1%f(1,1), nO, & + 0.d0, tmp_cc%f(1,1,1), cholesky_mo_num*nV) + + call gpu_dgemm(blas_handle, 'T','N', nV*nO, nV*nO, cholesky_mo_num, 1.d0, & + tmp_cc%f(1,1,1), cholesky_mo_num, d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num, & + 0.d0, J1_tmp%f(1,1,1,1), nV*nO) + + + call gpu_deallocate(X_ovoo) + + call gpu_synchronize() + call gpu_deallocate(tmp_cc) + + do i = 1, nO + do a = 1, nV + call gpu_set_stream(blas_handle, stream(a)) + call gpu_dgeam(blas_handle, 'N', 'T', nO, nV, 1.d0, J1%f(1,a,1,i), & + nO*nV, 1.d0, 
J1_tmp%f(1,1,a,i), nV, J1%f(1,a,1,i), nO*nV) + enddo + enddo + + type(gpu_double4) :: X_voov, Z_ovvo + + call gpu_allocate(X_voov,nV,nO,nO,nV) + call gpu_allocate(Z_ovvo,nO,nV,nV,nO) + + do j = 1, nO + do beta = 1, nV + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 0.5d0, t2%f(1,j,1,beta), & + nO*nO, t1%f(j,beta), t1%f(1,1), nO, Y_ovov%f(1,beta,j,1), nO*nV*nO) + enddo + enddo + + do b = 1, nV + call gpu_set_stream(blas_handle, stream(b)) + call gpu_dgeam(blas_handle, 'N', 'N', nV, nO*nO, 1.d0, v_vvoo%f(1,b,1,1), & + nV*nV, 0.d0, X_voov%f(1,1,1,b), nV, X_voov%f(1,1,1,b), nV) + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + + call gpu_synchronize() + call gpu_deallocate(J1_tmp) + + call gpu_dgemm(blas_handle, 'N','T',nO*nV,nV*nO,nO*nV, & + -1d0, Y_ovov%f(1,1,1,1), size(Y_ovov%f,1) * size(Y_ovov%f,2), & + X_voov%f(1,1,1,1), size(X_voov%f,1) * size(X_voov%f,2), & + 0d0, Z_ovvo%f(1,1,1,1), size(Z_ovvo%f,1) * size(Z_ovvo%f,2)) + + call gpu_synchronize() + + do i = 1, nO + do a = 1, nV + call gpu_set_stream(blas_handle, stream(a)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,a,1,i), & + nO*nV, 1.d0, Z_ovvo%f(1,1,a,i), nO, J1%f(1,a,1,i), nO*nV) + enddo + enddo + + type(gpu_double4) :: X_ovvo, Y_vovo + call gpu_allocate(Y_vovo,nV,nO,nV,nO) + + do j = 1, nO + do i = 1, nO + call gpu_set_stream(blas_handle, stream(i)) + call gpu_dgeam(blas_handle, 'N', 'T', nV, nV, 1.d0, v_vvoo%f(1,1,i,j), & + nV, -0.5d0, v_vvoo%f(1,1,i,j), nV, Y_vovo%f(1,i,1,j), nO*nV) + enddo + enddo + + call gpu_allocate(X_ovvo,nO,nV,nV,nO) + + do j = 1, nO + do b = 1, nV + call gpu_set_stream(blas_handle, stream(b)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, t2%f(1,j,1,b), & + nO*nO, 0.d0, t2%f(1,j,1,b), nO*nO, X_ovvo%f(1,1,b,j), nO) + enddo + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + call gpu_synchronize() + call gpu_deallocate(X_voov) + + call gpu_dgemm(blas_handle, 'N','T',nO*nV,nV*nO,nV*nO, & + 1d0, X_ovvo%f(1,1,1,1), size(X_ovvo%f,1) * size(X_ovvo%f,2), & + Y_vovo%f(1,1,1,1), size(Y_vovo%f,1) * size(Y_vovo%f,2), & + 0d0, Z_ovvo%f(1,1,1,1), size(Z_ovvo%f,1) * size(Z_ovvo%f,2)) + + call gpu_synchronize() + + do i = 1, nO + do beta = 1, nV + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,1,beta,i), & + nO, 1.d0, Z_ovvo%f(1,beta,1,i), nO*nV, J1%f(1,1,beta,i), nO) + enddo + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + call gpu_deallocate(Y_ovov) + call gpu_deallocate(X_ovvo) + + do a = 1, nV + call gpu_stream_destroy(stream(a)) + enddo + + call gpu_deallocate(Z_ovvo) end ! 
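Two recurring devices in compute_J1_chol deserve a note. First, the removed loop Y_ovov(u,beta,j,b) = 0.5d0*t2(u,j,b,beta) + t1(u,b)*t1(j,beta) is reproduced by passing the scalar element t1%f(j,beta) as the beta coefficient of a dgeam whose B operand is the whole t1 matrix, so the outer-product term costs no extra kernel. Second, one stream per virtual index stays alive across several updates. A condensed sketch of that stream lifecycle, using the names from the code above:

```fortran
type(gpu_stream) :: stream(nV)

do a = 1, nV
  call gpu_stream_create(stream(a))
enddo

do a = 1, nV
  ! Everything issued while stream(a) is attached to the handle is queued
  ! on that stream and may overlap with work queued on the other streams.
  call gpu_set_stream(blas_handle, stream(a))
  ! ... gpu_dgeam / gpu_dgemm on the a-th slice ...
enddo

call gpu_set_stream(blas_handle, gpu_default_stream)
call gpu_synchronize()   ! barrier: wait for every queued operation

do a = 1, nV
  call gpu_stream_destroy(stream(a))
enddo
```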
K1 -subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,K1) +subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov, & + d_cc_space_v_ov_chol,d_cc_space_v_vv_chol,K1) use gpu implicit none - integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(in) :: v_vvoo(nV,nV,nO,nO), v_ovov(nO,nV,nO,nV) - double precision, intent(in) :: v_ovoo(nO,nV,nO,nO) - double precision, intent(out) :: K1(nO, nV, nO, nV) + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: t1 + type(gpu_double4), intent(in) :: t2, v_vvoo, v_ovov, v_ovoo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol + type(gpu_double4), intent(out) :: K1 - double precision, allocatable :: X(:,:,:,:), Y(:,:,:,:), Z(:,:,:,:) + type(gpu_double4) :: X, Y, Z integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam - allocate(X(nV,nO,nV,nO),Y(nO,nV,nV,nO),Z(nO,nV,nV,nO)) - !$omp parallel & - !$omp shared(nO,nV,K1,X,Y,v_vvoo,v_ovov,t1,t2) & - !$omp private(i,beta,a,u,j,b) & - !$omp default(none) - !$omp do - do beta = 1, nV - do i = 1, nO - do a = 1, nV - do u = 1, nO - K1(u,a,i,beta) = v_ovov(u,a,i,beta) - enddo - enddo - enddo + call gpu_copy(v_ovov, K1) + + type(gpu_stream) :: stream(nV) + do a = 1, nV + call gpu_stream_create(stream(a)) enddo - !$omp end do nowait + + call gpu_allocate(X,nV,nO,nV,nO) do i = 1, nO - !$omp do do a = 1, nV - do j = 1, nO - do b = 1, nV - X(b,j,a,i) = - v_vvoo(b,a,i,j) - enddo - enddo + call gpu_set_stream(blas_handle, stream(a)) + call gpu_dgeam(blas_handle, 'N', 'N', nV, nO, -1.d0, v_vvoo%f(1,a,i,1), & + nV*nV*nO, 0.d0, v_vvoo%f(1,a,i,1), nV*nV*nO, X%f(1,1,a,i), nV) enddo - !$omp end do nowait enddo + call gpu_allocate(Y,nO,nV,nV,nO) + do j = 1, nO - !$omp do - do b = 1, nV - do beta = 1, nV - do u = 1, nO - Y(u,beta,b,j) = 0.5d0 * t2(u,j,b,beta) + t1(u,b) * t1(j,beta) - enddo - enddo + do beta = 1, nV + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 0.5d0, t2%f(1,j,1,beta), & + nO*nO, t1%f(j,beta), t1%f(1,1), nO, Y%f(1,beta,1,j), nO*nV) enddo - !$omp end do enddo - !$omp end parallel - call dgemm('N','N',nO*nV*nO,nV,nO, & - -1d0, v_ovoo, size(v_ovoo,1) * size(v_ovoo,2) * size(v_ovoo,3), & - t1 , size(t1,1), & - 1d0, K1 , size(K1,1) * size(K1,2) * size(K1,3)) + call gpu_set_stream(blas_handle, gpu_default_stream) - double precision, allocatable :: K1tmp(:,:,:,:), t1v(:,:,:) - allocate(K1tmp(nO,nO,nV,nV), t1v(cholesky_mo_num,nO,nO)) + call gpu_dgemm(blas_handle, 'N','N',nO*nV*nO,nV,nO, & + -1d0, v_ovoo%f(1,1,1,1), size(v_ovoo%f,1) * size(v_ovoo%f,2) * size(v_ovoo%f,3), & + t1%f(1,1) , size(t1%f,1), & + 1d0, K1%f(1,1,1,1) , size(K1%f,1) * size(K1%f,2) * size(K1%f,3)) - call dgemm('N','T', cholesky_mo_num*nO, nO, nV, 1.d0, & - cc_space_v_ov_chol, cholesky_mo_num*nO, t1, nO, 0.d0, & - t1v, cholesky_mo_num*nO) + type(gpu_double4) :: K1tmp + type(gpu_double3) :: t1v - call dgemm('T','N', nO*nO, nV*nV, cholesky_mo_num, 1.d0, & - t1v, cholesky_mo_num, cc_space_v_vv_chol, cholesky_mo_num, 0.d0, & - K1tmp, nO*nO) + call gpu_allocate(t1v,cholesky_mo_num,nO,nO) + + call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nO, nO, nV, 1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, t1%f(1,1), nO, 0.d0, & + t1v%f(1,1,1), cholesky_mo_num*nO) + + call gpu_allocate(K1tmp,nO,nO,nV,nV) + + call gpu_dgemm(blas_handle, 'T','N', nO*nO, nV*nV, cholesky_mo_num, 1.d0, & + t1v%f(1,1,1), cholesky_mo_num, 
d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num, 0.d0, & + K1tmp%f(1,1,1,1), nO*nO) + + call gpu_allocate(Z,nO,nV,nV,nO) + call gpu_synchronize() - deallocate(t1v) ! Y(u,beta,b,j) * X(b,j,a,i) = Z(u,beta,a,i) - call dgemm('N','N',nV*nO,nO*nV,nV*nO, & - 1d0, Y, size(Y,1) * size(Y,2), & - X, size(X,1) * size(X,2), & - 0d0, Z, size(Z,1) * size(Z,2)) + call gpu_dgemm(blas_handle, 'N','N',nV*nO,nO*nV,nV*nO, & + 1d0, Y%f(1,1,1,1), size(Y%f,1) * size(Y%f,2), & + X%f(1,1,1,1), size(X%f,1) * size(X%f,2), & + 0d0, Z%f(1,1,1,1), size(Z%f,1) * size(Z%f,2)) - !$omp parallel & - !$omp shared(nO,nV,K1,Z,K1tmp) & - !$omp private(i,beta,a,u) & - !$omp default(none) - !$omp do - do beta = 1, nV - do i = 1, nO - do a = 1, nV - do u = 1, nO - K1(u,a,i,beta) = K1(u,a,i,beta) + K1tmp(u,i,a,beta) + Z(u,beta,a,i) - enddo - enddo + call gpu_synchronize() + call gpu_deallocate(t1v) + + do i = 1, nO + do beta = 1, nV + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, K1%f(1,1,i,beta), & + nO, 1.d0, K1tmp%f(1,i,1,beta), nO*nO, K1%f(1,1,i,beta), nO) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, K1%f(1,1,i,beta), & + nO, 1.d0, Z%f(1,beta,1,i), nO*nV, K1%f(1,1,i,beta), nO) enddo enddo - !$omp end do - !$omp end parallel - deallocate(K1tmp,X,Y,Z) + call gpu_deallocate(X) + call gpu_deallocate(Y) + + do a = 1, nV + call gpu_stream_destroy(stream(a)) + enddo + + call gpu_deallocate(K1tmp) + call gpu_deallocate(Z) end From 9ad69bb27dc4195ae2ae2c9ea2f280156c20366e Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 9 Jul 2024 03:27:54 +0200 Subject: [PATCH 34/38] GPU accelerated CCSD --- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 75 +++++++++++++------------- 1 file changed, 37 insertions(+), 38 deletions(-) diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index a185df13..24fcc5af 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -507,7 +507,8 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & call gpu_allocate(g_occ,nO,nO) call compute_g_vir_chol(nO,nV,t1,t2,H_vv,d_cc_space_f_vo, & d_cc_space_v_ov_chol, d_cc_space_v_vv_chol, g_vir) - call compute_g_occ_chol(nO,nV,t1%f,t2%f,H_oo%f,g_occ%f) + call compute_g_occ_chol(nO,nV,t1,t2,H_oo, & + d_cc_space_f_vo, d_cc_space_v_ov_chol, d_cc_space_v_oo_chol, d_cc_space_v_ovoo, g_occ) type(gpu_double4) :: Y_oovv call gpu_allocate(Y_oovv,nO,nO,nV,nV) @@ -870,37 +871,42 @@ end ! 
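PATCH 34 continues the interface migration: compute_g_occ_chol now receives gpu_double containers rather than their %f payloads. The convention assumed throughout the series is that a gpu_double2/3/4 bundles a device-visible allocation with a Fortran pointer %f of matching rank, and %f is unwrapped only at the BLAS call site. A small sketch of the resulting calling pattern, with g as a placeholder result container and t1, d_cc_space_f_vo as in the routine that follows:

```fortran
type(gpu_double2) :: g
call gpu_allocate(g, nO, nO)

! Containers cross subroutine boundaries whole; %f appears only where the
! BLAS wrapper needs a raw array and a leading dimension.
call gpu_dgemm(blas_handle, 'N', 'N', nO, nO, nV, &
    1.d0, t1%f(1,1), size(t1%f,1), &
          d_cc_space_f_vo%f(1,1), size(d_cc_space_f_vo%f,1), &
    0.d0, g%f(1,1), size(g%f,1))

call gpu_synchronize()
call gpu_deallocate(g)
```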
g_occ -subroutine compute_g_occ_chol(nO,nV,t1,t2,H_oo,g_occ) +subroutine compute_g_occ_chol(nO,nV,t1,t2,H_oo, & + d_cc_space_f_vo, d_cc_space_v_ov_chol, d_cc_space_v_oo_chol, d_cc_space_v_ovoo, g_occ) use gpu implicit none integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV), H_oo(nO, nO) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(out) :: g_occ(nO, nO) + type(gpu_double2), intent(in) :: t1, H_oo, d_cc_space_f_vo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_oo_chol + type(gpu_double4), intent(in) :: t2, d_cc_space_v_ovoo + type(gpu_double2), intent(out) :: g_occ - g_occ = H_oo + call gpu_copy(H_oo, g_occ) - call dgemm('N','N',nO,nO,nV, & - 1d0, t1, size(t1,1), & - cc_space_f_vo, size(cc_space_f_vo,1), & - 1d0, g_occ, size(g_occ,1)) + call gpu_dgemm(blas_handle, 'N','N',nO,nO,nV, & + 1d0, t1%f(1,1), size(t1%f,1), & + d_cc_space_f_vo%f(1,1), size(d_cc_space_f_vo%f,1), & + 1d0, g_occ%f(1,1), size(g_occ%f,1)) - double precision, allocatable :: X(:) - allocate(X(cholesky_mo_num)) - call dgemv('N',cholesky_mo_num,nO*nV,2.d0, & - cc_space_v_ov_chol, cholesky_mo_num, & - t1, 1, 0.d0, X, 1) + type(gpu_double1) :: X + call gpu_allocate(X,cholesky_mo_num) - call dgemv('T',cholesky_mo_num,nO*nO,1.d0, & - cc_space_v_oo_chol, cholesky_mo_num, & - X, 1, 1.d0, g_occ, 1) - deallocate(X) + call gpu_dgemv(blas_handle, 'N',cholesky_mo_num,nO*nV,2.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, & + t1%f(1,1), 1, 0.d0, X%f(1), 1) - call dgemv('T',nO*nV,nO*nO,-1.d0, & - cc_space_v_ovoo, nO*nV, & - t1, 1, 1.d0, g_occ, 1) + call gpu_dgemv(blas_handle, 'T',cholesky_mo_num,nO*nO,1.d0, & + d_cc_space_v_oo_chol%f(1,1,1), cholesky_mo_num, & + X%f(1), 1, 1.d0, g_occ%f(1,1), 1) + + call gpu_dgemv(blas_handle, 'T',nO*nV,nO*nO,-1.d0, & + d_cc_space_v_ovoo%f(1,1,1,1), nO*nV, & + t1%f(1,1), 1, 1.d0, g_occ%f(1,1), 1) + + call gpu_synchronize() + call gpu_deallocate(X) end @@ -1193,22 +1199,15 @@ subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov, & enddo call gpu_allocate(X,nV,nO,nV,nO) - - do i = 1, nO - do a = 1, nV - call gpu_set_stream(blas_handle, stream(a)) - call gpu_dgeam(blas_handle, 'N', 'N', nV, nO, -1.d0, v_vvoo%f(1,a,i,1), & - nV*nV*nO, 0.d0, v_vvoo%f(1,a,i,1), nV*nV*nO, X%f(1,1,a,i), nV) - enddo - enddo - call gpu_allocate(Y,nO,nV,nV,nO) - do j = 1, nO - do beta = 1, nV - call gpu_set_stream(blas_handle, stream(beta)) - call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 0.5d0, t2%f(1,j,1,beta), & - nO*nO, t1%f(j,beta), t1%f(1,1), nO, Y%f(1,beta,1,j), nO*nV) + do a = 1, nV + call gpu_set_stream(blas_handle, stream(a)) + do i = 1, nO + call gpu_dgeam(blas_handle, 'N', 'N', nV, nO, -1.d0, v_vvoo%f(1,a,i,1), & + nV*nV*nO, 0.d0, v_vvoo%f(1,a,i,1), nV*nV*nO, X%f(1,1,a,i), nV) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 0.5d0, t2%f(1,i,1,a), & + nO*nO, t1%f(i,a), t1%f(1,1), nO, Y%f(1,a,1,i), nO*nV) enddo enddo @@ -1246,9 +1245,9 @@ subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov, & call gpu_synchronize() call gpu_deallocate(t1v) + do beta = 1, nV + call gpu_set_stream(blas_handle, stream(beta)) do i = 1, nO - do beta = 1, nV - call gpu_set_stream(blas_handle, stream(beta)) call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, K1%f(1,1,i,beta), & nO, 1.d0, K1tmp%f(1,i,1,beta), nO*nO, K1%f(1,1,i,beta), nO) call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, K1%f(1,1,i,beta), & From dd9c6dcc03e6d24a78ed5651e5c825e81d68a9c6 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 9 Jul 2024 21:11:13 +0200 Subject: 
[PATCH 35/38] Introducing dpcpp --- configure | 8 +- plugins/local/gpu_intel/LIB | 1 + plugins/local/gpu_intel/NEED | 1 + plugins/local/gpu_intel/README.rst | 8 + plugins/local/gpu_intel/gpu.sycl | 266 +++++++++++++++++++++++++++++ 5 files changed, 282 insertions(+), 2 deletions(-) create mode 100644 plugins/local/gpu_intel/LIB create mode 100644 plugins/local/gpu_intel/NEED create mode 100644 plugins/local/gpu_intel/README.rst create mode 100644 plugins/local/gpu_intel/gpu.sycl diff --git a/configure b/configure index 08dac444..3e3390e1 100755 --- a/configure +++ b/configure @@ -40,7 +40,7 @@ Usage: $(basename $0) -c $(basename $0) -h $(basename $0) -i - $(basename $0) -g [nvidia|none] + $(basename $0) -g [nvidia|intel|none] Options: -c Define a COMPILATION configuration file, @@ -49,7 +49,7 @@ Options: -i INSTALL . Use at your OWN RISK: no support will be provided for the installation of dependencies. - -g [nvidia|none] Choose GPU acceleration (experimental) + -g [nvidia|intel|none] Choose GPU acceleration Example: ./$(basename $0) -c config/gfortran.cfg @@ -121,6 +121,10 @@ case "$GPU" in echo "Activating AMD GPU acceleration" ln -s ${QP_ROOT}/plugins/local/gpu_amd ${QP_ROOT}/src/gpu_arch ;; + intel) # Intel + echo "Activating Intel GPU acceleration" + ln -s ${QP_ROOT}/plugins/local/gpu_intel ${QP_ROOT}/src/gpu_arch + ;; nvidia) # Nvidia echo "Activating Nvidia GPU acceleration" ln -s ${QP_ROOT}/plugins/local/gpu_nvidia ${QP_ROOT}/src/gpu_arch diff --git a/plugins/local/gpu_intel/LIB b/plugins/local/gpu_intel/LIB new file mode 100644 index 00000000..027c35b0 --- /dev/null +++ b/plugins/local/gpu_intel/LIB @@ -0,0 +1 @@ +-lmkl_sycl -lsycl diff --git a/plugins/local/gpu_intel/NEED b/plugins/local/gpu_intel/NEED new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/plugins/local/gpu_intel/NEED @@ -0,0 +1 @@ + diff --git a/plugins/local/gpu_intel/README.rst b/plugins/local/gpu_intel/README.rst new file mode 100644 index 00000000..3a4653de --- /dev/null +++ b/plugins/local/gpu_intel/README.rst @@ -0,0 +1,8 @@ +========= +gpu_intel +========= + +Intel implementation of GPU routines. Uses MKL and SYCL. 
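The new backend only has to provide the same C entry points that the gpu_nvidia and gpu_amd plugins expose, so the Fortran CCSD code is untouched by the choice made at configure time. As an illustration of that contract, a hypothetical iso_c_binding interface for gpu_dgeam matching the C prototype added below in gpu.sycl (the actual binding lives in the gpu module, which is not part of this series):

```fortran
! Inside a module that does: use, intrinsic :: iso_c_binding
interface
  subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, &
                         beta, b, ldb, c, ldc) bind(C, name='gpu_dgeam')
    import :: c_ptr, c_char, c_int64_t, c_double
    type(c_ptr), value        :: handle       ! blasHandle_t*
    character(kind=c_char)    :: transa, transb
    integer(c_int64_t), value :: m, n, lda, ldb, ldc
    real(c_double)            :: alpha, beta  ! by reference, i.e. double*
    real(c_double)            :: a(*), b(*), c(*)
  end subroutine gpu_dgeam_c
end interface
```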
+```bash +dpcpp -O3 -c gpu.o gpu.sycl +``` diff --git a/plugins/local/gpu_intel/gpu.sycl b/plugins/local/gpu_intel/gpu.sycl new file mode 100644 index 00000000..7b589490 --- /dev/null +++ b/plugins/local/gpu_intel/gpu.sycl @@ -0,0 +1,266 @@ +#include +#include +#include +#include + +extern "C" { + +/* Generic functions */ + +int gpu_ndevices() { + return 1; +} + +void gpu_set_device(int32_t igpu) { +} + + +/* Allocation functions */ + +void gpu_allocate(void** ptr, int64_t size) { + auto queue = sycl::queue(sycl::default_selector{}); + + try { + *ptr = sycl::malloc_shared(size, queue); + assert(*ptr != nullptr); + } catch (const sycl::exception& e) { + std::cerr << "SYCL exception caught: " << e.what() << std::endl; + *ptr = nullptr; // If allocation fails, set pointer to nullptr + } +} + +void gpu_deallocate(void** ptr) { + assert(*ptr != nullptr); + sycl::free(*ptr, sycl::queue(sycl::default_selector{})); + *ptr = nullptr; +} + +/* Upload data from host to device */ +void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) { + sycl::queue queue(sycl::default_selector{}); + queue.memcpy(gpu_ptr, cpu_ptr, n).wait(); +} + +/* Download data from device to host */ +void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) { + sycl::queue queue(sycl::default_selector{}); + queue.memcpy(cpu_ptr, gpu_ptr, n).wait(); +} + +/* Copy data from one GPU memory location to another */ +void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) { + sycl::queue queue(sycl::default_selector{}); + queue.memcpy(gpu_ptr_dest, gpu_ptr_src, n).wait(); +} + +/* Queues */ + +/* SYCL queue as a replacement for CUDA stream */ +void gpu_stream_create(sycl::queue** ptr) { + *ptr = new sycl::queue(sycl::default_selector{}); +} + +void gpu_stream_destroy(sycl::queue** ptr) { + assert(*ptr != nullptr); + delete *ptr; + *ptr = nullptr; +} + +To translate the CUDA functions related to stream management to SYCL, you will need to adapt to SYCL's approach to command groups and queues. SYCL uses queues to manage execution order and parallelism, similar to CUDA streams but integrated within the SYCL ecosystem. + +### Original CUDA Code + +```c +/* Create a CUDA stream */ +void gpu_stream_create(cudaStream_t* ptr) { + cudaError_t rc = cudaStreamCreate(ptr); + assert(rc == cudaSuccess); +} + +/* Destroy a CUDA stream */ +void gpu_stream_destroy(cudaStream_t* ptr) { + assert(ptr != NULL); + cudaError_t rc = cudaStreamDestroy(*ptr); + assert(rc == cudaSuccess); + *ptr = NULL; +} + +/* Set a specific stream for cuBLAS operations */ +void gpu_set_stream(cublasHandle_t handle, cudaStream_t stream) { + cublasSetStream(handle, stream); +} + +/* Synchronize all streams */ +void gpu_synchronize() { + cudaDeviceSynchronize(); +} +``` + +### Translated SYCL Code + +```cpp +#include +#include + +/* SYCL queue as a replacement for CUDA stream */ +void gpu_stream_create(sycl::queue** ptr) { + *ptr = new sycl::queue(sycl::default_selector{}); +} + +void gpu_stream_destroy(sycl::queue** ptr) { + *ptr->wait_and_throw(); + assert(*ptr != nullptr); + delete *ptr; + *ptr = nullptr; +} + +/* SYCL does not need an equivalent for setting a stream on a cuBLAS handle, + because each SYCL queue acts independently and can be used directly. 
*/ + +void gpu_synchronize() { + sycl::queue queue(sycl::default_selector{}); + queue.wait_and_throw(); +} + +/* BLAS functions */ + +typedef struct { + sycl::queue* queue; +} blasHandle_t; + +void gpu_set_stream(blasHandle_t* handle, sycl::queue* ptr) { + handle->queue = ptr; +} + +void gpu_blas_create(blasHandle_t* ptr) { + *ptr = new blasHandle_t; + assert(*ptr != nullptr); + ptr->queue = new sycl::queue(sycl::default_selector{}); + assert(ptr->queue != nullptr); +} + +void gpu_blas_destroy(blasHandle_t* ptr) { + assert(*ptr != nullptr); + delete ptr->queue; + delete *ptr; + *ptr = nullptr; +} + + +void gpu_ddot(blasHandle_t* handle, const int64_t n, const double* x, const int64_t incx, + const double* y, const int64_t incy, double* result) { + // Ensure input parameters are valid + assert(handle != nullptr); + assert(handle->queue != nullptr); + assert(n > 0); + assert(incx > 0); + assert(incy > 0); + assert(x != nullptr); + assert(y != nullptr); + assert(result != nullptr); + + // SYCL buffer for the result + sycl::buffer result_buf(result, sycl::range<1>(1)); + + sycl::queue& queue = handle->queue; + + // Perform the dot product operation + queue.submit([&](sycl::handler& cgh) { + // Accessors for the buffers + auto result_acc = result_buf.get_access(cgh); + + // This is an asynchronous call to compute dot product + cgh.single_task([=]() { + result_acc[0] = oneapi::mkl::blas::dot(cgh, n, x, incx, y, incy); + }); + }); + +} + +void gpu_dgemv(blasHandle_t* handle, const char* transa, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) { + + assert(handle != nullptr); + assert(handle->queue != nullptr); + + // Validate matrix dimensions and increments to be positive + assert(m > 0 && n > 0 && lda > 0 && incx > 0 && incy > 0); + assert(a != nullptr && x != nullptr && y != nullptr && alpha != nullptr && beta != nullptr); + + // Determine the operation type + oneapi::mkl::transpose transa_ = oneapi::mkl::transpose::nontrans; + if (*transa == 'T' || *transa == 't') { + transa_ = oneapi::mkl::transpose::trans; + } + + // Perform DGEMV operation using oneMKL + handle->queue->submit([&](sycl::handler& cgh) { + // Use accessors to ensure data consistency and dependency resolution + auto a_acc = sycl::accessor(a, sycl::range(m * lda), sycl::read_only, cgh); + auto x_acc = sycl::accessor(x, sycl::range(n * incx), sycl::read_only, cgh); + auto y_acc = sycl::accessor(y, sycl::range(m * incy), sycl::read_write, cgh); + + cgh.parallel_for(sycl::range(1), [=](sycl::id<1>) { + oneapi::mkl::blas::gemv(*handle->queue, transa_, m, n, *alpha, a_acc.get_pointer(), lda, x_acc.get_pointer(), incx, *beta, y_acc.get_pointer(), incy); + }); + }); + +} + +void gpu_dgemm(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha, + const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) { + + assert(handle != nullptr && handle->queue != nullptr); + assert(m > 0 && n > 0 && k > 0 && lda > 0 && ldb > 0 && ldc > 0); + assert(a != nullptr && b != nullptr && c != nullptr && alpha != nullptr && beta != nullptr); + + // Transpose operations + auto transa_ = (*transa == 'T' || *transa == 't') ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans; + auto transb_ = (*transb == 'T' || *transb == 't') ? 
oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans; + + // Ensure queue is ready + handle->queue->submit([&](sycl::handler& cgh) { + // Accessors for matrices + auto a_acc = sycl::accessor(a, sycl::range<1>(m * lda), sycl::read_only, cgh); + auto b_acc = sycl::accessor(b, sycl::range<1>(k * ldb), sycl::read_only, cgh); + auto c_acc = sycl::accessor(c, sycl::range<1>(m * ldc), sycl::read_write, cgh); + + cgh.parallel_for(sycl::range(1), [=](sycl::id<1>) { + oneapi::mkl::blas::gemm(*handle->queue, transa_, transb_, m, n, k, + *alpha, a_acc.get_pointer(), lda, + b_acc.get_pointer(), ldb, + *beta, c_acc.get_pointer(), ldc); + }); + }); + +} + +void gpu_dgeam(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { + assert(handle != nullptr && handle->queue != nullptr); + assert(m > 0 && n > 0 && lda > 0 && ldb > 0 && ldc > 0); + assert(a != nullptr && b != nullptr && c != nullptr && alpha != nullptr && beta != nullptr); + + // Determine transpose operations + bool transA = (*transa == 'T' || *transa == 't'); + bool transB = (*transb == 'T' || *transb == 't'); + + handle->queue->submit([&](sycl::handler& cgh) { + auto a_acc = sycl::accessor(a, sycl::range(m * lda), sycl::read_only, cgh); + auto b_acc = sycl::accessor(b, sycl::range(n * ldb), sycl::read_only, cgh); + auto c_acc = sycl::accessor(c, sycl::range(m * ldc), sycl::read_write, cgh); + + cgh.parallel_for(sycl::range<2>(m, n), [=](sycl::id<2> idx) { + int i = idx[0]; + int j = idx[1]; + int ai = transA ? j * lda + i : i * lda + j; + int bi = transB ? j * ldb + i : i * ldb + j; + int ci = i * ldc + j; + + c_acc[ci] = (*alpha) * a_acc[ai] + (*beta) * b_acc[bi]; + }); + }); + +} + +} // extern C From 44b8e22e7aebf4dd89874549eac8bb8aef2fb16d Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 9 Jul 2024 22:02:13 +0200 Subject: [PATCH 36/38] Fixed sycl --- plugins/local/gpu_intel/README.rst | 2 +- plugins/local/gpu_intel/gpu.sycl | 139 ++++++----------------------- 2 files changed, 26 insertions(+), 115 deletions(-) diff --git a/plugins/local/gpu_intel/README.rst b/plugins/local/gpu_intel/README.rst index 3a4653de..d42e2557 100644 --- a/plugins/local/gpu_intel/README.rst +++ b/plugins/local/gpu_intel/README.rst @@ -4,5 +4,5 @@ gpu_intel Intel implementation of GPU routines. Uses MKL and SYCL. 
```bash -dpcpp -O3 -c gpu.o gpu.sycl +icpx -fsycl gpu.cxx -c -qmkl=sequential ``` diff --git a/plugins/local/gpu_intel/gpu.sycl b/plugins/local/gpu_intel/gpu.sycl index 7b589490..1f9f89ce 100644 --- a/plugins/local/gpu_intel/gpu.sycl +++ b/plugins/local/gpu_intel/gpu.sycl @@ -18,7 +18,7 @@ void gpu_set_device(int32_t igpu) { /* Allocation functions */ void gpu_allocate(void** ptr, int64_t size) { - auto queue = sycl::queue(sycl::default_selector{}); + auto queue = sycl::queue(sycl::default_selector_v); try { *ptr = sycl::malloc_shared(size, queue); @@ -31,25 +31,25 @@ void gpu_allocate(void** ptr, int64_t size) { void gpu_deallocate(void** ptr) { assert(*ptr != nullptr); - sycl::free(*ptr, sycl::queue(sycl::default_selector{})); + sycl::free(*ptr, sycl::queue(sycl::default_selector_v)); *ptr = nullptr; } /* Upload data from host to device */ void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) { - sycl::queue queue(sycl::default_selector{}); + sycl::queue queue(sycl::default_selector_v); queue.memcpy(gpu_ptr, cpu_ptr, n).wait(); } /* Download data from device to host */ void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) { - sycl::queue queue(sycl::default_selector{}); + sycl::queue queue(sycl::default_selector_v); queue.memcpy(cpu_ptr, gpu_ptr, n).wait(); } /* Copy data from one GPU memory location to another */ void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) { - sycl::queue queue(sycl::default_selector{}); + sycl::queue queue(sycl::default_selector_v); queue.memcpy(gpu_ptr_dest, gpu_ptr_src, n).wait(); } @@ -57,7 +57,7 @@ void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) { /* SYCL queue as a replacement for CUDA stream */ void gpu_stream_create(sycl::queue** ptr) { - *ptr = new sycl::queue(sycl::default_selector{}); + *ptr = new sycl::queue(sycl::default_selector_v); } void gpu_stream_destroy(sycl::queue** ptr) { @@ -66,59 +66,8 @@ void gpu_stream_destroy(sycl::queue** ptr) { *ptr = nullptr; } -To translate the CUDA functions related to stream management to SYCL, you will need to adapt to SYCL's approach to command groups and queues. SYCL uses queues to manage execution order and parallelism, similar to CUDA streams but integrated within the SYCL ecosystem. - -### Original CUDA Code - -```c -/* Create a CUDA stream */ -void gpu_stream_create(cudaStream_t* ptr) { - cudaError_t rc = cudaStreamCreate(ptr); - assert(rc == cudaSuccess); -} - -/* Destroy a CUDA stream */ -void gpu_stream_destroy(cudaStream_t* ptr) { - assert(ptr != NULL); - cudaError_t rc = cudaStreamDestroy(*ptr); - assert(rc == cudaSuccess); - *ptr = NULL; -} - -/* Set a specific stream for cuBLAS operations */ -void gpu_set_stream(cublasHandle_t handle, cudaStream_t stream) { - cublasSetStream(handle, stream); -} - -/* Synchronize all streams */ void gpu_synchronize() { - cudaDeviceSynchronize(); -} -``` - -### Translated SYCL Code - -```cpp -#include -#include - -/* SYCL queue as a replacement for CUDA stream */ -void gpu_stream_create(sycl::queue** ptr) { - *ptr = new sycl::queue(sycl::default_selector{}); -} - -void gpu_stream_destroy(sycl::queue** ptr) { - *ptr->wait_and_throw(); - assert(*ptr != nullptr); - delete *ptr; - *ptr = nullptr; -} - -/* SYCL does not need an equivalent for setting a stream on a cuBLAS handle, - because each SYCL queue acts independently and can be used directly. 
*/ - -void gpu_synchronize() { - sycl::queue queue(sycl::default_selector{}); + sycl::queue queue(sycl::default_selector_v); queue.wait_and_throw(); } @@ -132,17 +81,17 @@ void gpu_set_stream(blasHandle_t* handle, sycl::queue* ptr) { handle->queue = ptr; } -void gpu_blas_create(blasHandle_t* ptr) { - *ptr = new blasHandle_t; +void gpu_blas_create(blasHandle_t** ptr) { + *ptr = (blasHandle_t*) malloc(sizeof(blasHandle_t)); assert(*ptr != nullptr); - ptr->queue = new sycl::queue(sycl::default_selector{}); - assert(ptr->queue != nullptr); + (*ptr)->queue = new sycl::queue(sycl::default_selector_v); + assert((*ptr)->queue != nullptr); } -void gpu_blas_destroy(blasHandle_t* ptr) { +void gpu_blas_destroy(blasHandle_t** ptr) { assert(*ptr != nullptr); - delete ptr->queue; - delete *ptr; + delete (*ptr)->queue; + free(*ptr); *ptr = nullptr; } @@ -159,21 +108,7 @@ void gpu_ddot(blasHandle_t* handle, const int64_t n, const double* x, const int6 assert(y != nullptr); assert(result != nullptr); - // SYCL buffer for the result - sycl::buffer result_buf(result, sycl::range<1>(1)); - - sycl::queue& queue = handle->queue; - - // Perform the dot product operation - queue.submit([&](sycl::handler& cgh) { - // Accessors for the buffers - auto result_acc = result_buf.get_access(cgh); - - // This is an asynchronous call to compute dot product - cgh.single_task([=]() { - result_acc[0] = oneapi::mkl::blas::dot(cgh, n, x, incx, y, incy); - }); - }); + oneapi::mkl::blas::dot(*handle->queue, n, x, incx, y, incy, result); } @@ -194,16 +129,7 @@ void gpu_dgemv(blasHandle_t* handle, const char* transa, const int64_t m, const } // Perform DGEMV operation using oneMKL - handle->queue->submit([&](sycl::handler& cgh) { - // Use accessors to ensure data consistency and dependency resolution - auto a_acc = sycl::accessor(a, sycl::range(m * lda), sycl::read_only, cgh); - auto x_acc = sycl::accessor(x, sycl::range(n * incx), sycl::read_only, cgh); - auto y_acc = sycl::accessor(y, sycl::range(m * incy), sycl::read_write, cgh); - - cgh.parallel_for(sycl::range(1), [=](sycl::id<1>) { - oneapi::mkl::blas::gemv(*handle->queue, transa_, m, n, *alpha, a_acc.get_pointer(), lda, x_acc.get_pointer(), incx, *beta, y_acc.get_pointer(), incy); - }); - }); + oneapi::mkl::blas::column_major::gemv(*handle->queue, transa_, m, n, *alpha, a, lda, x, incx, *beta, y, incy); } @@ -218,23 +144,12 @@ void gpu_dgemm(blasHandle_t* handle, const char* transa, const char* transb, con auto transa_ = (*transa == 'T' || *transa == 't') ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans; auto transb_ = (*transb == 'T' || *transb == 't') ? 
oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans; - // Ensure queue is ready - handle->queue->submit([&](sycl::handler& cgh) { - // Accessors for matrices - auto a_acc = sycl::accessor(a, sycl::range<1>(m * lda), sycl::read_only, cgh); - auto b_acc = sycl::accessor(b, sycl::range<1>(k * ldb), sycl::read_only, cgh); - auto c_acc = sycl::accessor(c, sycl::range<1>(m * ldc), sycl::read_write, cgh); - - cgh.parallel_for(sycl::range(1), [=](sycl::id<1>) { - oneapi::mkl::blas::gemm(*handle->queue, transa_, transb_, m, n, k, - *alpha, a_acc.get_pointer(), lda, - b_acc.get_pointer(), ldb, - *beta, c_acc.get_pointer(), ldc); - }); - }); + oneapi::mkl::blas::column_major::gemm(*handle->queue, transa_, transb_, m, n, k, + *alpha, a, lda, b, ldb, *beta, c, ldc); } + void gpu_dgeam(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha, const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { assert(handle != nullptr && handle->queue != nullptr); @@ -246,18 +161,14 @@ void gpu_dgeam(blasHandle_t* handle, const char* transa, const char* transb, con bool transB = (*transb == 'T' || *transb == 't'); handle->queue->submit([&](sycl::handler& cgh) { - auto a_acc = sycl::accessor(a, sycl::range(m * lda), sycl::read_only, cgh); - auto b_acc = sycl::accessor(b, sycl::range(n * ldb), sycl::read_only, cgh); - auto c_acc = sycl::accessor(c, sycl::range(m * ldc), sycl::read_write, cgh); - cgh.parallel_for(sycl::range<2>(m, n), [=](sycl::id<2> idx) { - int i = idx[0]; - int j = idx[1]; - int ai = transA ? j * lda + i : i * lda + j; - int bi = transB ? j * ldb + i : i * ldb + j; - int ci = i * ldc + j; + const int i = idx[0]; + const int j = idx[1]; + const int ai = transA ? j * lda + i : i * lda + j; + const int bi = transB ? 
j * ldb + i : i * ldb + j; + const int ci = i * ldc + j; - c_acc[ci] = (*alpha) * a_acc[ai] + (*beta) * b_acc[bi]; + c[ci] = (*alpha) * a[ai] + (*beta) * b[bi]; }); }); From 6c275d54ef050ec8d210a35aa4bbb2c93d176f34 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 9 Jul 2024 22:14:19 +0200 Subject: [PATCH 37/38] Fix intent --- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index 24fcc5af..6f65ea79 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -996,8 +996,8 @@ subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvoo,d_cc_space_v_vo_chol integer, intent(in) :: nO,nV type(gpu_double2), intent(in) :: t1 type(gpu_double4), intent(in) :: t2, v_ovvo, v_ovoo, v_vvoo + type(gpu_double3), intent(in) :: d_cc_space_v_vo_chol,d_cc_space_v_vv_chol type(gpu_double4), intent(out) :: J1 - type(gpu_double3), intent(out) :: d_cc_space_v_vo_chol,d_cc_space_v_vv_chol integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam From f5cf674d7b4eb98637bde7eb07d1119cfeccc557 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 9 Jul 2024 23:04:22 +0200 Subject: [PATCH 38/38] Fix link stage for intel gpus --- configure | 4 ++-- plugins/local/gpu_intel/LIB | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/configure b/configure index 3e3390e1..43ca9f6d 100755 --- a/configure +++ b/configure @@ -117,12 +117,12 @@ done # Handle GPU acceleration rm -f ${QP_ROOT}/src/gpu_arch case "$GPU" in - amd) # Nvidia + amd) # AMD echo "Activating AMD GPU acceleration" ln -s ${QP_ROOT}/plugins/local/gpu_amd ${QP_ROOT}/src/gpu_arch ;; intel) # Intel - echo "Activating Intel GPU acceleration" + echo "Activating Intel GPU acceleration (EXPERIMENTAL)" ln -s ${QP_ROOT}/plugins/local/gpu_intel ${QP_ROOT}/src/gpu_arch ;; nvidia) # Nvidia diff --git a/plugins/local/gpu_intel/LIB b/plugins/local/gpu_intel/LIB index 027c35b0..199b0f1c 100644 --- a/plugins/local/gpu_intel/LIB +++ b/plugins/local/gpu_intel/LIB @@ -1 +1,2 @@ --lmkl_sycl -lsycl +-ltbb -lsycl -lmkl_sycl -lgpu -limf -lintlc -lstdc++ +
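Taken together, the series leaves CCSD on top of a small backend-agnostic device API: gpu_allocate/gpu_deallocate for containers, gpu_upload/gpu_download/gpu_copy for transfers, streams plus a BLAS handle for ordering, and ddot/dgemv/dgemm/dgeam for the linear algebra, with Nvidia, AMD, and (experimental) Intel implementations selected by `configure -g`. As a closing orientation aid, a hypothetical standalone sketch: the Fortran overloads of gpu_upload/gpu_download are not shown in this series, so their argument order here is inferred from the C prototypes in gpu.sycl (host pointer first on upload, device pointer first on download), and blas_handle is assumed to be initialized by the gpu module as in the IRP code above.

```fortran
program gpu_roundtrip
  use gpu
  implicit none
  integer, parameter :: n = 64
  double precision   :: A_h(n,n), C_h(n,n)
  type(gpu_double2)  :: A_d, C_d

  A_h = 1.d0
  call gpu_allocate(A_d, n, n)
  call gpu_allocate(C_d, n, n)
  call gpu_upload(A_h, A_d)            ! host -> device (assumed overload)

  ! C = A * A**T on the default stream
  call gpu_dgemm(blas_handle, 'N', 'T', n, n, n, &
      1.d0, A_d%f(1,1), n, A_d%f(1,1), n, 0.d0, C_d%f(1,1), n)

  call gpu_synchronize()
  call gpu_download(C_d, C_h)          ! device -> host (assumed overload)
  call gpu_deallocate(A_d)
  call gpu_deallocate(C_d)
end program gpu_roundtrip
```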