From 6e2f28b97fbf3c961c9461689a8db9adf81bd6f8 Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Sat, 11 May 2024 10:27:03 +0200 Subject: [PATCH 01/19] COLLAPSE(4) -> COLLAPSE(3) --- plugins/local/non_h_ints_mu/deb_aos.irp.f | 6 +++--- plugins/local/non_h_ints_mu/total_tc_int.irp.f | 9 ++++----- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/plugins/local/non_h_ints_mu/deb_aos.irp.f b/plugins/local/non_h_ints_mu/deb_aos.irp.f index 4012f47c..70604f54 100644 --- a/plugins/local/non_h_ints_mu/deb_aos.irp.f +++ b/plugins/local/non_h_ints_mu/deb_aos.irp.f @@ -31,6 +31,9 @@ subroutine print_aos() integer :: i, ipoint double precision :: r(3) double precision :: ao_val, ao_der(3), ao_lap + double precision :: accu_vgl(5) + double precision :: accu_vgl_nrm(5) + double precision :: mo_val, mo_der(3), mo_lap PROVIDE final_grid_points aos_in_r_array aos_grad_in_r_array aos_lapl_in_r_array @@ -40,9 +43,6 @@ subroutine print_aos() write(1000, '(3(f15.7, 3X))') r enddo -double precision :: accu_vgl(5) -double precision :: accu_vgl_nrm(5) - do ipoint = 1, n_points_final_grid do i = 1, ao_num ao_val = aos_in_r_array (i,ipoint) diff --git a/plugins/local/non_h_ints_mu/total_tc_int.irp.f b/plugins/local/non_h_ints_mu/total_tc_int.irp.f index a1bbd6e0..656f5f16 100644 --- a/plugins/local/non_h_ints_mu/total_tc_int.irp.f +++ b/plugins/local/non_h_ints_mu/total_tc_int.irp.f @@ -78,7 +78,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n !$OMP PRIVATE (i, j, k, l, ipoint, ao_i_r, ao_k_r, weight1) & !$OMP SHARED (ao_num, n_points_final_grid, ao_two_e_tc_tot, & !$OMP aos_in_r_array_transp, final_weight_at_r_vector, int2_grad1_u12_square_ao) - !$OMP DO COLLAPSE(4) + !$OMP DO COLLAPSE(3) do i = 1, ao_num do k = 1, ao_num do l = 1, ao_num @@ -188,7 +188,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n !$OMP SHARED (ao_num, n_points_final_grid, ao_two_e_tc_tot, & !$OMP aos_in_r_array_transp, final_weight_at_r_vector, & !$OMP int2_grad1_u12_ao, aos_grad_in_r_array_transp_bis) - !$OMP DO COLLAPSE(4) + !$OMP DO COLLAPSE(3) do i = 1, ao_num do k = 1, ao_num do l = 1, ao_num @@ -270,7 +270,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n !$OMP PARALLEL DEFAULT(NONE) & !$OMP PRIVATE(i, j, k, l, integ_zero, integ_val) & !$OMP SHARED(ao_num, ao_two_e_tc_tot) - !$OMP DO COLLAPSE(4) + !$OMP DO COLLAPSE(3) do j = 1, ao_num do l = 1, ao_num do i = 1, ao_num @@ -293,7 +293,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n !$OMP PARALLEL DEFAULT(NONE) & !$OMP SHARED(ao_num, ao_two_e_tc_tot, ao_integrals_map) & !$OMP PRIVATE(i, j, k, l) - !$OMP DO COLLAPSE(4) + !$OMP DO COLLAPSE(3) do j = 1, ao_num do l = 1, ao_num do i = 1, ao_num @@ -306,7 +306,6 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n enddo !$OMP END DO !$OMP END PARALLEL - !call clear_ao_map() FREE ao_integrals_map endif From 29da3b6542cdfeb52f9d1b7f8c23f3967018bf0f Mon Sep 17 00:00:00 2001 From: AbdAmmar Date: Thu, 23 May 2024 00:45:56 +0200 Subject: [PATCH 02/19] bypass one_e_tr_dm_mo for large mo_num & n_states --- .../multi_s_dipole_moment.irp.f | 154 ++++++++++++++++-- 1 file changed, 140 insertions(+), 14 deletions(-) diff --git a/src/mol_properties/multi_s_dipole_moment.irp.f b/src/mol_properties/multi_s_dipole_moment.irp.f index c7216a61..8aae3bf4 100644 --- a/src/mol_properties/multi_s_dipole_moment.irp.f +++ b/src/mol_properties/multi_s_dipole_moment.irp.f @@ -18,7 +18,7 @@ -BEGIN_PROVIDER [double precision, multi_s_dipole_moment, (N_states, N_states)] + BEGIN_PROVIDER [double precision, multi_s_dipole_moment , (N_states, N_states)] &BEGIN_PROVIDER [double precision, multi_s_x_dipole_moment, (N_states, N_states)] &BEGIN_PROVIDER [double precision, multi_s_y_dipole_moment, (N_states, N_states)] &BEGIN_PROVIDER [double precision, multi_s_z_dipole_moment, (N_states, N_states)] @@ -40,27 +40,153 @@ BEGIN_PROVIDER [double precision, multi_s_dipole_moment, (N_states, N_states)] ! gamma^{nm}: density matrix \bra{\Psi^n} a^{\dagger}_a a_i \ket{\Psi^m} END_DOC - integer :: istate,jstate ! States - integer :: i,j ! general spatial MOs + integer :: istate, jstate ! States + integer :: i, j ! general spatial MOs double precision :: nuclei_part_x, nuclei_part_y, nuclei_part_z multi_s_x_dipole_moment = 0.d0 multi_s_y_dipole_moment = 0.d0 multi_s_z_dipole_moment = 0.d0 + + if(8.d0*mo_num*mo_num*n_states*n_states*1d-9 .lt. 200.d0) then - do jstate = 1, N_states - do istate = 1, N_states - - do i = 1, mo_num - do j = 1, mo_num - multi_s_x_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_x(j,i) - multi_s_y_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_y(j,i) - multi_s_z_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_z(j,i) - enddo + do jstate = 1, N_states + do istate = 1, N_states + do i = 1, mo_num + do j = 1, mo_num + multi_s_x_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_x(j,i) + multi_s_y_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_y(j,i) + multi_s_z_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_z(j,i) + enddo + enddo enddo - enddo - enddo + + else + + ! no enouph memory + ! on the fly scheme + + PROVIDE psi_det_alpha_unique psi_det_beta_unique + + integer :: l, k_a, k_b + integer :: occ(N_int*bit_kind_size,2) + integer :: h1, h2, p1, p2, degree + integer :: exc(0:2,2), n_occ(2) + integer :: krow, kcol, lrow, lcol + integer(bit_kind) :: tmp_det(N_int,2), tmp_det2(N_int) + double precision :: ck, ckl, phase + + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP PRIVATE(j, l, k_a, k_b, istate, jstate, occ, ck, ckl, h1, h2, p1, p2, exc, & + !$OMP phase, degree, n_occ, krow, kcol, lrow, lcol, tmp_det, tmp_det2) & + !$OMP SHARED(N_int, N_states, elec_alpha_num, elec_beta_num, N_det, & + !$OMP psi_bilinear_matrix_rows, psi_bilinear_matrix_columns, & + !$OMP psi_bilinear_matrix_transp_rows, psi_bilinear_matrix_transp_columns, & + !$OMP psi_det_alpha_unique, psi_det_beta_unique, & + !$OMP psi_bilinear_matrix_values, psi_bilinear_matrix_transp_values, & + !$OMP mo_dipole_x, mo_dipole_y, mo_dipole_z, & + !$OMP multi_s_x_dipole_moment, multi_s_y_dipole_moment, multi_s_z_dipole_moment) + !$OMP DO COLLAPSE(2) + do istate = 1, N_states + do jstate = 1, N_states + + do k_a = 1, N_det + krow = psi_bilinear_matrix_rows (k_a) + kcol = psi_bilinear_matrix_columns(k_a) + + tmp_det(1:N_int,1) = psi_det_alpha_unique(1:N_int,krow) + tmp_det(1:N_int,2) = psi_det_beta_unique (1:N_int,kcol) + + ! Diagonal part + call bitstring_to_list_ab(tmp_det, occ, n_occ, N_int) + ck = psi_bilinear_matrix_values(k_a,istate)*psi_bilinear_matrix_values(k_a,jstate) + do l = 1, elec_alpha_num + j = occ(l,1) + multi_s_x_dipole_moment(istate,jstate) -= ck * mo_dipole_x(j,j) + multi_s_y_dipole_moment(istate,jstate) -= ck * mo_dipole_y(j,j) + multi_s_z_dipole_moment(istate,jstate) -= ck * mo_dipole_z(j,j) + enddo + + if (k_a == N_det) cycle + l = k_a + 1 + lrow = psi_bilinear_matrix_rows (l) + lcol = psi_bilinear_matrix_columns(l) + ! Fix beta determinant, loop over alphas + do while (lcol == kcol) + tmp_det2(:) = psi_det_alpha_unique(:,lrow) + call get_excitation_degree_spin(tmp_det(1,1), tmp_det2, degree, N_int) + if (degree == 1) then + exc = 0 + call get_single_excitation_spin(tmp_det(1,1), tmp_det2, exc, phase, N_int) + call decode_exc_spin(exc, h1, p1, h2, p2) + ckl = psi_bilinear_matrix_values(k_a,istate)*psi_bilinear_matrix_values(l,jstate) * phase + multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(h1,p1) + multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(h1,p1) + multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(h1,p1) + ckl = psi_bilinear_matrix_values(k_a,jstate)*psi_bilinear_matrix_values(l,istate) * phase + multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(p1,h1) + multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(p1,h1) + multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(p1,h1) + endif + l = l+1 + if (l > N_det) exit + lrow = psi_bilinear_matrix_rows (l) + lcol = psi_bilinear_matrix_columns(l) + enddo + enddo ! k_a + + do k_b = 1, N_det + krow = psi_bilinear_matrix_transp_rows (k_b) + kcol = psi_bilinear_matrix_transp_columns(k_b) + + tmp_det(1:N_int,1) = psi_det_alpha_unique(1:N_int,krow) + tmp_det(1:N_int,2) = psi_det_beta_unique (1:N_int,kcol) + + ! Diagonal part + call bitstring_to_list_ab(tmp_det, occ, n_occ, N_int) + ck = psi_bilinear_matrix_transp_values(k_b,istate)*psi_bilinear_matrix_transp_values(k_b,jstate) + do l = 1, elec_beta_num + j = occ(l,2) + multi_s_x_dipole_moment(istate,jstate) -= ck * mo_dipole_x(j,j) + multi_s_y_dipole_moment(istate,jstate) -= ck * mo_dipole_y(j,j) + multi_s_z_dipole_moment(istate,jstate) -= ck * mo_dipole_z(j,j) + enddo + + if (k_b == N_det) cycle + l = k_b+1 + lrow = psi_bilinear_matrix_transp_rows (l) + lcol = psi_bilinear_matrix_transp_columns(l) + ! Fix beta determinant, loop over alphas + do while (lrow == krow) + tmp_det2(:) = psi_det_beta_unique(:,lcol) + call get_excitation_degree_spin(tmp_det(1,2), tmp_det2, degree, N_int) + if (degree == 1) then + exc = 0 + call get_single_excitation_spin(tmp_det(1,2), tmp_det2, exc, phase, N_int) + call decode_exc_spin(exc, h1, p1, h2, p2) + ckl = psi_bilinear_matrix_transp_values(k_b,istate)*psi_bilinear_matrix_transp_values(l,jstate) * phase + multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(h1,p1) + multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(h1,p1) + multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(h1,p1) + ckl = psi_bilinear_matrix_transp_values(k_b,jstate)*psi_bilinear_matrix_transp_values(l,istate) * phase + multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(p1,h1) + multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(p1,h1) + multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(p1,h1) + endif + l = l+1 + if (l > N_det) exit + lrow = psi_bilinear_matrix_transp_rows (l) + lcol = psi_bilinear_matrix_transp_columns(l) + enddo + enddo ! k_b + + enddo ! istate + enddo ! jstate + !$OMP END DO + !$OMP END PARALLEL + + endif ! memory condition ! Nuclei part nuclei_part_x = 0.d0 From 7e45c517d981adcd77883a39a361be38b470ff20 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 25 Jun 2024 18:32:44 +0200 Subject: [PATCH 03/19] Introducing gpu_x86 --- src/gpu_x86/NEED | 1 + src/gpu_x86/README.rst | 5 + src/gpu_x86/gpu.c | 506 +++++++++++++++++++++++++++++++++++++ src/gpu_x86/gpu.h | 41 +++ src/gpu_x86/gpu_module.F90 | 141 +++++++++++ 5 files changed, 694 insertions(+) create mode 100644 src/gpu_x86/NEED create mode 100644 src/gpu_x86/README.rst create mode 100644 src/gpu_x86/gpu.c create mode 100644 src/gpu_x86/gpu.h create mode 100644 src/gpu_x86/gpu_module.F90 diff --git a/src/gpu_x86/NEED b/src/gpu_x86/NEED new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/src/gpu_x86/NEED @@ -0,0 +1 @@ + diff --git a/src/gpu_x86/README.rst b/src/gpu_x86/README.rst new file mode 100644 index 00000000..f530bf29 --- /dev/null +++ b/src/gpu_x86/README.rst @@ -0,0 +1,5 @@ +======= +gpu_x86 +======= + +x86 implementation of GPU routines. For use when GPUs are not available. diff --git a/src/gpu_x86/gpu.c b/src/gpu_x86/gpu.c new file mode 100644 index 00000000..71505dbe --- /dev/null +++ b/src/gpu_x86/gpu.c @@ -0,0 +1,506 @@ +#include +#include +#include +#include +#include + + +/* Generic functions */ + +int gpu_ndevices() { + return 1; +} + +void gpu_set_device(int32_t i) { + return; +} + + +/* Allocation functions */ + +void gpu_allocate(void** ptr, const int64_t n) { + *ptr = malloc((size_t) n); + if (*ptr == NULL) { + perror("Allocation failed"); + } +} + +void gpu_free(void** ptr) { + free(*ptr); + *ptr = NULL; +} + + +/* Memory transfer functions */ + +void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) { + memcpy(gpu_ptr, cpu_ptr, n); +} + +void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) { + memcpy(cpu_ptr, gpu_ptr, n); +} + +void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) { + memcpy(gpu_ptr_dest, gpu_ptr_src, n); +} + + +/* Streams */ + +void gpu_stream_create(void** ptr) { + *ptr = (void*) 2; +} + +void gpu_stream_destroy(void** ptr) { + *ptr = NULL; +} + +void gpu_set_stream(void* handle, void* stream) { + return; +} + +void gpu_synchronize() { + return; +} + + +/* BLAS functions */ + +void gpu_blas_create(void** handle) { + *handle = (void*) 1; +} + + +void gpu_blas_destroy(void** handle) { + *handle = NULL; +} + + +double ddot_(const int32_t* n, const double* x, const int32_t* incx, const double* y, const int32_t* incy); + +void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { + assert (handle != NULL); + + /* Convert to int32_t */ + int32_t n_, incx_, incy_; + + n_ = (int32_t) n; + incx_ = (int32_t) incx; + incy_ = (int32_t) incy; + + /* Check for integer overflows */ + assert ( (int64_t) n_ == n ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + *result = ddot_(&n_, x, &incx_, y, &incy_); +} + + +float sdot_(const int32_t* n, const float* x, const int32_t* incx, const float* y, const int32_t* incy); + +void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { + assert (handle != NULL); + + /* Convert to int32_t */ + int32_t n_, incx_, incy_; + + n_ = (int32_t) n; + incx_ = (int32_t) incx; + incy_ = (int32_t) incy; + + /* Check for integer overflows */ + assert ( (int64_t) n_ == n ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + *result = sdot_(&n_, x, &incx_, y, &incy_); +} + + +void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha, + const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy); + +void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double alpha, + const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) { + + assert (handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, lda_, incx_, incy_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + lda_ = (int32_t) lda; + incx_ = (int32_t) incx; + incy_ = (int32_t) incy; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + dgemv_(&transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_); +} + + +void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha, + const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy); + +void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float alpha, + const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) { + + assert (handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, lda_, incx_, incy_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + lda_ = (int32_t) lda; + incx_ = (int32_t) incx; + incy_ = (int32_t) incy; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + sgemv_(&transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_); +} + + +void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha, + const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc); + +void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, + const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) { + + assert (handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, k_, lda_, ldb_, ldc_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + k_ = (int32_t) k; + lda_ = (int32_t) lda; + ldb_ = (int32_t) ldb; + ldc_ = (int32_t) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) k_ == k ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + dgemm_(&transa, &transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_); +} + + + +void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha, + const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc); + +void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, + const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) { + + assert (handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, k_, lda_, ldb_, ldc_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + k_ = (int32_t) k; + lda_ = (int32_t) lda; + ldb_ = (int32_t) ldb; + ldc_ = (int32_t) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) k_ == k ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + sgemm_(&transa, &transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_); +} + + +void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, + const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { + if (handle == NULL) { + perror("NULL handle"); + exit(-1); + } + + if ( (transa == 'N' && transb == 'N') || + (transa == 'n' && transb == 'N') || + (transa == 'N' && transb == 'n') || + (transa == 'n' && transb == 'n') ) { + + if (alpha == 0.) { + + for (int64_t j=0 ; j + +int gpu_ndevices(); +void gpu_set_device(int32_t i); + +void gpu_allocate(void** ptr, const int64_t n); +void gpu_free(void** ptr); + +void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n); +void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n); +void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n); + +void gpu_stream_create(void** ptr); +void gpu_stream_destroy(void** ptr); +void gpu_set_stream(void* handle, void* stream); +void gpu_synchronize(); + +void gpu_blas_create(void** handle); +void gpu_blas_destroy(void** handle); + +void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result); + +void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result); + +void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double alpha, + const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy); + +void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float alpha, + const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy); + +void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, + const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc); + +void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, + const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc); + +void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, + const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc); + +void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, + const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc); diff --git a/src/gpu_x86/gpu_module.F90 b/src/gpu_x86/gpu_module.F90 new file mode 100644 index 00000000..86ba3926 --- /dev/null +++ b/src/gpu_x86/gpu_module.F90 @@ -0,0 +1,141 @@ +module gpu + use, intrinsic :: iso_c_binding, only : c_int32_t, c_int64_t, c_double, c_size_t, c_char + implicit none + + interface + integer function gpu_ndevices() bind(C) + end function + + subroutine gpu_set_device(id) bind(C) + import + integer(c_int32_t), value :: id + end subroutine + + subroutine gpu_allocate_c(ptr, n) bind(C, name='gpu_allocate') + import + type(c_ptr) :: ptr + integer(c_int64_t), value :: n + end subroutine + + subroutine gpu_free_c(ptr) bind(C, name='gpu_free') + import + type(c_ptr) :: ptr + end subroutine + + subroutine gpu_upload_c(cpu_ptr, gpu_ptr, n) bind(C, name='gpu_upload') + import + type(c_ptr), value :: cpu_ptr + type(c_ptr), value :: gpu_ptr + integer(c_int64_t), value :: n + end subroutine + + subroutine gpu_download_c(gpu_ptr, cpu_ptr, n) bind(C, name='gpu_download') + import + type(c_ptr), value :: gpu_ptr + type(c_ptr), value :: cpu_ptr + integer(c_int64_t), value :: n + end subroutine + + subroutine gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, n) bind(C, name='gpu_copy') + import + type(c_ptr), value :: gpu_ptr_src + type(c_ptr), value :: gpu_ptr_dest + integer(c_int64_t), value :: n + end subroutine + + subroutine gpu_stream_create(stream) bind(C) + import + type(c_ptr) :: stream + end subroutine + + subroutine gpu_stream_destroy(stream) bind(C) + import + type(c_ptr) :: stream + end subroutine + + subroutine gpu_set_stream(handle, stream) bind(C) + import + type(c_ptr) :: handle, stream + end subroutine + + subroutine gpu_synchronize() + end subroutine + + subroutine gpu_blas_create(handle) bind(C) + import + type(c_ptr) :: handle + end subroutine + + subroutine gpu_blas_destroy(handle) bind(C) + import + type(c_ptr) :: handle + end subroutine + + subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) bind(C) + import + type(c_ptr), intent(in) :: handle + integer(c_int64_t), value :: n, incx, incy + real(c_double), intent(in) :: dx(*), dy(*) + real(c_double), intent(out) :: res + end subroutine + + subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res) bind(C) + import + type(c_ptr), intent(in) :: handle + integer(c_int64_t), value :: n, incx, incy + real(c_float), intent(in) :: dx(*), dy(*) + real(c_float), intent(out) :: res + end subroutine + + end interface + +end module + +subroutine gpu_allocate_double(ptr, s) + use gpu + implicit none + double precision, pointer, intent(inout) :: ptr + integer*8, intent(in) :: s(*) + type(c_ptr) :: cptr + + call gpu_allocate_c(cptr, sum(s)*8_8) + call c_f_pointer(cptr, ptr, s) +end subroutine + +subroutine gpu_free_double(ptr) + use gpu + implicit none + double precision, pointer, intent(inout) :: ptr + type(c_ptr) :: cptr + cptr = cloc(ptr) + call gpu_free(cptr) + NULLIFY(ptr) +end subroutine + +subroutine gpu_upload_double(cpu_ptr, gpu_ptr, n) + use gpu + implicit none + double precision, intent(in) :: cpu_ptr(*) + double precision, intent(out) :: gpu_ptr(*) + integer(c_int64_t), intent(in) :: n + call gpu_upload_c(cpu_ptr, gpu_ptr, 8_8*n) +end subroutine + +subroutine gpu_download_double(gpu_ptr, cpu_ptr, n) + use gpu + implicit none + double precision, intent(in) :: gpu_ptr(*) + double precision, intent(out) :: cpu_ptr(*) + integer(c_int64_t), intent(in) :: n + call gpu_download_c(gpu_ptr, cpu_ptr, 8_8*n) +end subroutine + +subroutine gpu_copy_double(gpu_ptr_src, gpu_ptr_dest, n) + use gpu + implicit none + double precision, intent(in) :: gpu_ptr_src(*) + double precision, intent(out) :: gpu_ptr_dest(*) + integer(c_int64_t), intent(in) :: n + call gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, 8_8*n) +end subroutine + From 646607ada4a0a58c9cd5e0593c04bce7bc9bd02e Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Wed, 26 Jun 2024 11:15:30 +0200 Subject: [PATCH 04/19] 1st commit --- plugins/local/tc_int/NEED | 5 + plugins/local/tc_int/README.rst | 4 + plugins/local/tc_int/int2_grad1_u12.irp.f | 134 ++++++++++++++++++++++ plugins/local/tc_int/jast_grad_2e.irp.f | 102 ++++++++++++++++ plugins/local/tc_int/jast_grad_full.irp.f | 51 ++++++++ plugins/local/tc_int/jast_utils_bh.irp.f | 35 ++++++ plugins/local/tc_int/write_tc_int.irp.f | 58 ++++++++++ 7 files changed, 389 insertions(+) create mode 100644 plugins/local/tc_int/NEED create mode 100644 plugins/local/tc_int/README.rst create mode 100644 plugins/local/tc_int/int2_grad1_u12.irp.f create mode 100644 plugins/local/tc_int/jast_grad_2e.irp.f create mode 100644 plugins/local/tc_int/jast_grad_full.irp.f create mode 100644 plugins/local/tc_int/jast_utils_bh.irp.f create mode 100644 plugins/local/tc_int/write_tc_int.irp.f diff --git a/plugins/local/tc_int/NEED b/plugins/local/tc_int/NEED new file mode 100644 index 00000000..8a4caf5b --- /dev/null +++ b/plugins/local/tc_int/NEED @@ -0,0 +1,5 @@ +tc_keywords +jastrow +qmckl +becke_numerical_grid +dft_utils_in_r diff --git a/plugins/local/tc_int/README.rst b/plugins/local/tc_int/README.rst new file mode 100644 index 00000000..bc9e8483 --- /dev/null +++ b/plugins/local/tc_int/README.rst @@ -0,0 +1,4 @@ +====== +tc_int +====== + diff --git a/plugins/local/tc_int/int2_grad1_u12.irp.f b/plugins/local/tc_int/int2_grad1_u12.irp.f new file mode 100644 index 00000000..0cf0d775 --- /dev/null +++ b/plugins/local/tc_int/int2_grad1_u12.irp.f @@ -0,0 +1,134 @@ + +! --- + +subroutine provide_int2_grad1_u12_ao() + + implicit none + integer :: ipoint, i, j, m, jpoint + integer :: n_blocks, n_rest, n_pass + integer :: i_blocks, i_rest, i_pass, ii + double precision :: time0, time1 + double precision :: mem, n_double + double precision, allocatable :: tmp(:,:,:) + double precision, allocatable :: tmp_grad1_u12(:,:,:) + double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:) + + PROVIDE final_weight_at_r_vector_extra aos_in_r_array_extra + + print*, ' start provide_int2_grad1_u12_ao ...' + call wall_time(time0) + + + ! int2_grad1_u12_ao(i,j,ipoint,1) = \int dr2 [\grad1 u(r1,r2)]_x1 \chi_i(r2) \chi_j(r2) + ! int2_grad1_u12_ao(i,j,ipoint,2) = \int dr2 [\grad1 u(r1,r2)]_y1 \chi_i(r2) \chi_j(r2) + ! int2_grad1_u12_ao(i,j,ipoint,3) = \int dr2 [\grad1 u(r1,r2)]_z1 \chi_i(r2) \chi_j(r2) + ! int2_grad1_u12_ao(i,j,ipoint,4) = -(1/2) \int dr2 [\grad1 u(r1,r2)]^2 \chi_i(r2) \chi_j(r2) + allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,4)) + + + + call total_memory(mem) + mem = max(1.d0, qp_max_mem - mem) + n_double = mem * 1.d8 + n_blocks = int(min(n_double / (n_points_extra_final_grid * 4.d0), 1.d0*n_points_final_grid)) + n_rest = int(mod(n_points_final_grid, n_blocks)) + n_pass = int((n_points_final_grid - n_rest) / n_blocks) + + call write_int(6, n_pass, 'Number of passes') + call write_int(6, n_blocks, 'Size of the blocks') + call write_int(6, n_rest, 'Size of the last block') + + + allocate(tmp(n_points_extra_final_grid,ao_num,ao_num)) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (j, i, jpoint) & + !$OMP SHARED (tmp, ao_num, n_points_extra_final_grid, final_weight_at_r_vector_extra, aos_in_r_array_extra_transp) + !$OMP DO SCHEDULE (static) + do j = 1, ao_num + do i = 1, ao_num + do jpoint = 1, n_points_extra_final_grid + tmp(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + + allocate(tmp_grad1_u12(n_points_extra_final_grid,n_blocks,4)) + + do i_pass = 1, n_pass + ii = (i_pass-1)*n_blocks + 1 + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i_blocks, ipoint) & + !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12) + !$OMP DO + do i_blocks = 1, n_blocks + ipoint = ii - 1 + i_blocks ! r1 + call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_blocks,1), tmp_grad1_u12(1,i_blocks,2), tmp_grad1_u12(1,i_blocks,3), tmp_grad1_u12(1,i_blocks,4)) + enddo + !$OMP END DO + !$OMP END PARALLEL + + do m = 1, 4 + call dgemm( "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0 & + , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid & + , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) + enddo + enddo + + deallocate(tmp_grad1_u12) + + + if(n_rest .gt. 0) then + + allocate(tmp_grad1_u12(n_points_extra_final_grid,n_rest,4)) + + ii = n_pass*n_blocks + 1 + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i_rest, ipoint) & + !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12) + !$OMP DO + do i_rest = 1, n_rest + ipoint = ii - 1 + i_rest ! r1 + call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_rest,1), tmp_grad1_u12(1,i_rest,2), tmp_grad1_u12(1,i_rest,3), tmp_grad1_u12(1,i_rest,4)) + enddo + !$OMP END DO + !$OMP END PARALLEL + + do m = 1, 4 + call dgemm( "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0 & + , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid & + , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) + enddo + + deallocate(tmp_grad1_u12) + endif + + deallocate(tmp) + + + ! --- + + print*, ' Writing int2_grad1_u12_ao in ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao' + open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="write") + call ezfio_set_work_empty(.False.) + write(11) int2_grad1_u12_ao(:,:,:,1:3) + close(11) + + deallocate(int2_grad1_u12_ao) + + call wall_time(time1) + print*, ' wall time for provide_int2_grad1_u12_ao (min) = ', (time1-time0) / 60.d0 + call print_memory_usage() + +end + +! --- + + diff --git a/plugins/local/tc_int/jast_grad_2e.irp.f b/plugins/local/tc_int/jast_grad_2e.irp.f new file mode 100644 index 00000000..b18b9d62 --- /dev/null +++ b/plugins/local/tc_int/jast_grad_2e.irp.f @@ -0,0 +1,102 @@ + +! --- + +subroutine get_grad1_u12_r1_2e(r1, n_grid2, gradx, grady, gradz) + + BEGIN_DOC + ! + ! d/dx1 j_2e(1,2) + ! d/dy1 j_2e(1,2) + ! d/dz1 j_2e(1,2) + ! + END_DOC + + include 'constants.include.F' + + implicit none + integer , intent(in) :: n_grid2 + double precision, intent(in) :: r1(3) + double precision, intent(out) :: gradx(n_grid2) + double precision, intent(out) :: grady(n_grid2) + double precision, intent(out) :: gradz(n_grid2) + + integer :: jpoint + integer :: i_nucl, p, mpA, npA, opA + integer :: powmax1, powmax, powmax2 + double precision :: r2(3) + double precision :: tmp, tmp1, tmp2 + double precision :: rn(3), f1A, grad1_f1A(3), f2A, grad2_f2A(3), g12, grad1_g12(3) + double precision, allocatable :: f1A_power(:), f2A_power(:), double_p(:), g12_power(:) + + + powmax1 = max(maxval(jBH_m), maxval(jBH_n)) + powmax2 = maxval(jBH_o) + powmax = max(powmax1, powmax2) + + allocate(f1A_power(-1:powmax), f2A_power(-1:powmax), g12_power(-1:powmax), double_p(0:powmax)) + + do p = 0, powmax + double_p(p) = dble(p) + enddo + + f1A_power(-1) = 0.d0 + f2A_power(-1) = 0.d0 + g12_power(-1) = 0.d0 + + f1A_power(0) = 1.d0 + f2A_power(0) = 1.d0 + g12_power(0) = 1.d0 + + do jpoint = 1, n_points_extra_final_grid ! r2 + + r2(1) = final_grid_points_extra(1,jpoint) + r2(2) = final_grid_points_extra(2,jpoint) + r2(3) = final_grid_points_extra(3,jpoint) + + gradx(jpoint) = 0.d0 + grady(jpoint) = 0.d0 + gradz(jpoint) = 0.d0 + do i_nucl = 1, nucl_num + + rn(1) = nucl_coord(i_nucl,1) + rn(2) = nucl_coord(i_nucl,2) + rn(3) = nucl_coord(i_nucl,3) + + call jBH_elem_fct_grad(jBH_en(i_nucl), r1, rn, f1A, grad1_f1A) + call jBH_elem_fct_grad(jBH_en(i_nucl), r2, rn, f2A, grad2_f2A) + call jBH_elem_fct_grad(jBH_ee(i_nucl), r1, r2, g12, grad1_g12) + + ! Compute powers of f1A and f2A + do p = 1, powmax1 + f1A_power(p) = f1A_power(p-1) * f1A + f2A_power(p) = f2A_power(p-1) * f2A + enddo + do p = 1, powmax2 + g12_power(p) = g12_power(p-1) * g12 + enddo + + do p = 1, jBH_size + mpA = jBH_m(p,i_nucl) + npA = jBH_n(p,i_nucl) + opA = jBH_o(p,i_nucl) + tmp = jBH_c(p,i_nucl) + if(mpA .eq. npA) then + tmp = tmp * 0.5d0 + endif + + tmp1 = double_p(mpA) * f1A_power(mpA-1) * f2A_power(npA) + double_p(npA) * f1A_power(npA-1) * f2A_power(mpA) + tmp1 = tmp1 * g12_power(opA) * tmp + tmp2 = double_p(opA) * g12_power(opA-1) * (f1A_power(mpA) * f2A_power(npA) + f1A_power(npA) * f2A_power(mpA)) * tmp + + gradx(jpoint) = gradx(jpoint) + tmp1 * grad1_f1A(1) + tmp2 * grad1_g12(1) + grady(jpoint) = grady(jpoint) + tmp1 * grad1_f1A(2) + tmp2 * grad1_g12(2) + gradz(jpoint) = gradz(jpoint) + tmp1 * grad1_f1A(3) + tmp2 * grad1_g12(3) + enddo ! p + enddo ! i_nucl + enddo ! jpoint + + return +end + +! --- + diff --git a/plugins/local/tc_int/jast_grad_full.irp.f b/plugins/local/tc_int/jast_grad_full.irp.f new file mode 100644 index 00000000..f63ee3e4 --- /dev/null +++ b/plugins/local/tc_int/jast_grad_full.irp.f @@ -0,0 +1,51 @@ + +! --- + +subroutine get_grad1_u12_for_tc(ipoint, n_grid2, resx, resy, resz, res) + + BEGIN_DOC + ! + ! resx(ipoint) = [grad1 u(r1,r2)]_x1 + ! resy(ipoint) = [grad1 u(r1,r2)]_y1 + ! resz(ipoint) = [grad1 u(r1,r2)]_z1 + ! res (ipoint) = -0.5 [grad1 u(r1,r2)]^2 + ! + ! We use: + ! grid for r1 + ! extra_grid for r2 + ! + END_DOC + + implicit none + integer, intent(in) :: ipoint, n_grid2 + double precision, intent(out) :: resx(n_grid2), resy(n_grid2), resz(n_grid2), res(n_grid2) + + integer :: jpoint + double precision :: env_r1, tmp + double precision :: grad1_env(3), r1(3) + double precision, allocatable :: env_r2(:) + double precision, allocatable :: u2b_r12(:), gradx1_u2b(:), grady1_u2b(:), gradz1_u2b(:) + double precision, allocatable :: u2b_mu(:), gradx1_mu(:), grady1_mu(:), gradz1_mu(:) + double precision, allocatable :: u2b_nu(:), gradx1_nu(:), grady1_nu(:), gradz1_nu(:) + double precision, external :: env_nucl + + r1(1) = final_grid_points(1,ipoint) + r1(2) = final_grid_points(2,ipoint) + r1(3) = final_grid_points(3,ipoint) + + + ! j2e_type .eq. "Boys_Handy" + ! env_type .eq. "None" + ! j1e_type .eq "None" + + call get_grad1_u12_r1_2e(r1, n_grid2, resx(1), resy(1), resz(1)) + + do jpoint = 1, n_points_extra_final_grid + res(jpoint) = -0.5d0 * (resx(jpoint) * resx(jpoint) + resy(jpoint) * resy(jpoint) + resz(jpoint) * resz(jpoint)) + enddo + + return +end + +! --- + diff --git a/plugins/local/tc_int/jast_utils_bh.irp.f b/plugins/local/tc_int/jast_utils_bh.irp.f new file mode 100644 index 00000000..750ce90b --- /dev/null +++ b/plugins/local/tc_int/jast_utils_bh.irp.f @@ -0,0 +1,35 @@ + +! --- + +subroutine jBH_elem_fct_grad(alpha, r1, r2, fct, grad1_fct) + + implicit none + double precision, intent(in) :: alpha, r1(3), r2(3) + double precision, intent(out) :: fct, grad1_fct(3) + double precision :: dist, tmp1, tmp2 + + dist = dsqrt( (r1(1) - r2(1)) * (r1(1) - r2(1)) & + + (r1(2) - r2(2)) * (r1(2) - r2(2)) & + + (r1(3) - r2(3)) * (r1(3) - r2(3)) ) + + + if(dist .ge. 1d-10) then + tmp1 = 1.d0 / (1.d0 + alpha * dist) + + fct = alpha * dist * tmp1 + tmp2 = alpha * tmp1 * tmp1 / dist + grad1_fct(1) = tmp2 * (r1(1) - r2(1)) + grad1_fct(2) = tmp2 * (r1(2) - r2(2)) + grad1_fct(3) = tmp2 * (r1(3) - r2(3)) + else + grad1_fct(1) = 0.d0 + grad1_fct(2) = 0.d0 + grad1_fct(3) = 0.d0 + fct = 0.d0 + endif + + return +end + +! --- + diff --git a/plugins/local/tc_int/write_tc_int.irp.f b/plugins/local/tc_int/write_tc_int.irp.f new file mode 100644 index 00000000..ebdce6f2 --- /dev/null +++ b/plugins/local/tc_int/write_tc_int.irp.f @@ -0,0 +1,58 @@ +! --- + +program write_tc_int + + implicit none + + print *, ' j2e_type = ', j2e_type + print *, ' j1e_type = ', j1e_type + print *, ' env_type = ', env_type + + my_grid_becke = .True. + PROVIDE tc_grid1_a tc_grid1_r + my_n_pt_r_grid = tc_grid1_r + my_n_pt_a_grid = tc_grid1_a + touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid + + call write_int(6, my_n_pt_r_grid, 'radial external grid over') + call write_int(6, my_n_pt_a_grid, 'angular external grid over') + + if(tc_integ_type .eq. "numeric") then + my_extra_grid_becke = .True. + PROVIDE tc_grid2_a tc_grid2_r + my_n_pt_r_extra_grid = tc_grid2_r + my_n_pt_a_extra_grid = tc_grid2_a + touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid + + call write_int(6, my_n_pt_r_extra_grid, 'radial internal grid over') + call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over') + endif + + call main() + +end + +! --- + +subroutine main() + + implicit none + + PROVIDE io_tc_integ + + print*, 'io_tc_integ = ', io_tc_integ + + if(io_tc_integ .ne. "Write") then + print*, 'io_tc_integ != Write' + print*, io_tc_integ + stop + endif + + call provide_int2_grad1_u12_ao() + + call ezfio_set_tc_keywords_io_tc_integ('Read') + +end + +! --- + From a2f4bc218d207d4c588b3d0d1c1d4c5f7448b334 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Wed, 26 Jun 2024 13:44:45 +0200 Subject: [PATCH 05/19] GPU acceleration selection in configure --- configure | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/configure b/configure index 41c0123d..014275eb 100755 --- a/configure +++ b/configure @@ -40,14 +40,16 @@ Usage: $(basename $0) -c $(basename $0) -h $(basename $0) -i + $(basename $0) -g [nvidia|none] Options: - -c Define a COMPILATION configuration file, - in "${QP_ROOT}/config/". - -h Print the HELP message - -i INSTALL . Use at your OWN RISK: - no support will be provided for the installation of - dependencies. + -c Define a COMPILATION configuration file, + in "${QP_ROOT}/config/". + -h Print the HELP message + -i INSTALL . Use at your OWN RISK: + no support will be provided for the installation of + dependencies. + -g [nvidia|none] Choose GPU acceleration (experimental) Example: ./$(basename $0) -c config/gfortran.cfg @@ -83,7 +85,7 @@ function execute () { PACKAGES="" -while getopts "d:c:i:h" c ; do +while getopts "d:c:i:g:h" c ; do case "$c" in c) case "$OPTARG" in @@ -100,6 +102,9 @@ while getopts "d:c:i:h" c ; do "") help ; break;; *) PACKAGES="${PACKAGE} $OPTARG" esac;; + g) + GPU=$OPTARG; + break;; h) help exit 0;; @@ -109,6 +114,23 @@ while getopts "d:c:i:h" c ; do esac done +# Handle GPU acceleration +rm -f ${QP_ROOT}/src/gpu +case "$GPU" in + amd) # Nvidia + echo "Activating AMD GPU acceleration" + ln -s ${QP_ROOT}/src/gpu_amd ${QP_ROOT}/src/gpu + ;; + nvidia) # Nvidia + echo "Activating Nvidia GPU acceleration" + ln -s ${QP_ROOT}/src/gpu_nvidia ${QP_ROOT}/src/gpu + ;; + *) # No Acceleration + echo "Disabling GPU acceleration" + ln -s ${QP_ROOT}/src/gpu_x86 ${QP_ROOT}/src/gpu + ;; +esac + # Trim leading and trailing spaces PACKAGES=$(echo $PACKAGES | xargs) From 1d0bac25d081d76177c6efeba88117f397c5de3c Mon Sep 17 00:00:00 2001 From: Abdallah Ammar Date: Wed, 26 Jun 2024 15:31:44 +0200 Subject: [PATCH 06/19] v0 --- plugins/local/tc_int/compute_tc_int.irp.f | 295 ++++++++++++++++++++++ plugins/local/tc_int/int2_grad1_u12.irp.f | 134 ---------- plugins/local/tc_int/jast_grad_2e.irp.f | 102 -------- plugins/local/tc_int/jast_grad_full.irp.f | 113 +++++++-- plugins/local/tc_int/write_tc_int.irp.f | 18 +- 5 files changed, 401 insertions(+), 261 deletions(-) create mode 100644 plugins/local/tc_int/compute_tc_int.irp.f delete mode 100644 plugins/local/tc_int/int2_grad1_u12.irp.f delete mode 100644 plugins/local/tc_int/jast_grad_2e.irp.f diff --git a/plugins/local/tc_int/compute_tc_int.irp.f b/plugins/local/tc_int/compute_tc_int.irp.f new file mode 100644 index 00000000..02f21570 --- /dev/null +++ b/plugins/local/tc_int/compute_tc_int.irp.f @@ -0,0 +1,295 @@ + +! --- + +subroutine provide_int2_grad1_u12_ao() + + BEGIN_DOC + ! + ! int2_grad1_u12_ao(i,j,ipoint,1) = \int dr2 [\grad1 u(r1,r2)]_x1 \chi_i(r2) \chi_j(r2) + ! int2_grad1_u12_ao(i,j,ipoint,2) = \int dr2 [\grad1 u(r1,r2)]_y1 \chi_i(r2) \chi_j(r2) + ! int2_grad1_u12_ao(i,j,ipoint,3) = \int dr2 [\grad1 u(r1,r2)]_z1 \chi_i(r2) \chi_j(r2) + ! int2_grad1_u12_ao(i,j,ipoint,4) = \int dr2 [-(1/2) [\grad1 u(r1,r2)]^2] \chi_i(r2) \chi_j(r2) + ! + ! + ! tc_int_2e_ao(k,i,l,j) = (ki|V^TC(r_12)|lj) + ! = where V^TC(r_12) is the total TC operator + ! = tc_grad_and_lapl_ao(k,i,l,j) + tc_grad_square_ao(k,i,l,j) + ao_two_e_coul(k,i,l,j) + ! where: + ! + ! tc_grad_and_lapl_ao(k,i,l,j) = < k l | -1/2 \Delta_1 u(r1,r2) - \grad_1 u(r1,r2) . \grad_1 | ij > + ! = -1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2) + ! = 1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 (-1) \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2) + ! + ! tc_grad_square_ao(k,i,l,j) = -1/2 + ! + ! ao_two_e_coul(k,i,l,j) = < l k | 1/r12 | j i > = ( k i | 1/r12 | l j ) + ! + END_DOC + + implicit none + + integer :: i, j, k, l, m, ipoint, jpoint + integer :: n_blocks, n_rest, n_pass + integer :: i_blocks, i_rest, i_pass, ii + double precision :: mem, n_double + double precision :: weight1, ao_k_r, ao_i_r + double precision :: der_envsq_x, der_envsq_y, der_envsq_z, lap_envsq + double precision :: time0, time1, time2, tc1, tc2, tc + double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:), tc_int_2e_ao(:,:,:,:) + double precision, allocatable :: tmp(:,:,:), c_mat(:,:,:), tmp_grad1_u12(:,:,:) + + double precision, external :: get_ao_two_e_integral + + + PROVIDE final_weight_at_r_vector_extra aos_in_r_array_extra + PROVIDE final_weight_at_r_vector aos_grad_in_r_array_transp_bis final_weight_at_r_vector aos_in_r_array_transp + + + + print*, ' start provide_int2_grad1_u12_ao ...' + call wall_time(time0) + + call total_memory(mem) + mem = max(1.d0, qp_max_mem - mem) + n_double = mem * 1.d8 + n_blocks = int(min(n_double / (n_points_extra_final_grid * 4.d0), 1.d0*n_points_final_grid)) + n_rest = int(mod(n_points_final_grid, n_blocks)) + n_pass = int((n_points_final_grid - n_rest) / n_blocks) + + call write_int(6, n_pass, 'Number of passes') + call write_int(6, n_blocks, 'Size of the blocks') + call write_int(6, n_rest, 'Size of the last block') + + ! --- + ! --- + ! --- + + allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,4)) + + allocate(tmp(n_points_extra_final_grid,ao_num,ao_num)) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (j, i, jpoint) & + !$OMP SHARED (tmp, ao_num, n_points_extra_final_grid, final_weight_at_r_vector_extra, aos_in_r_array_extra_transp) + !$OMP DO SCHEDULE (static) + do j = 1, ao_num + do i = 1, ao_num + do jpoint = 1, n_points_extra_final_grid + tmp(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + allocate(tmp_grad1_u12(n_points_extra_final_grid,n_blocks,4)) + + tc = 0.d0 + + do i_pass = 1, n_pass + ii = (i_pass-1)*n_blocks + 1 + + call wall_time(tc1) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i_blocks, ipoint) & + !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12) + !$OMP DO + do i_blocks = 1, n_blocks + ipoint = ii - 1 + i_blocks ! r1 + call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_blocks,1), tmp_grad1_u12(1,i_blocks,2), tmp_grad1_u12(1,i_blocks,3), tmp_grad1_u12(1,i_blocks,4)) + enddo + !$OMP END DO + !$OMP END PARALLEL + call wall_time(tc2) + tc = tc + tc2 - tc1 + + do m = 1, 4 + call dgemm( "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0 & + , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid & + , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) + enddo + enddo + + deallocate(tmp_grad1_u12) + + + if(n_rest .gt. 0) then + + allocate(tmp_grad1_u12(n_points_extra_final_grid,n_rest,4)) + + ii = n_pass*n_blocks + 1 + + call wall_time(tc1) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i_rest, ipoint) & + !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12) + !$OMP DO + do i_rest = 1, n_rest + ipoint = ii - 1 + i_rest ! r1 + call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_rest,1), tmp_grad1_u12(1,i_rest,2), tmp_grad1_u12(1,i_rest,3), tmp_grad1_u12(1,i_rest,4)) + enddo + !$OMP END DO + !$OMP END PARALLEL + call wall_time(tc2) + tc = tc + tc2 - tc1 + + do m = 1, 4 + call dgemm( "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0 & + , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid & + , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) + enddo + + deallocate(tmp_grad1_u12) + endif + + deallocate(tmp) + + + call wall_time(time1) + print*, ' wall time for int2_grad1_u12_ao (min) = ', (time1-time0) / 60.d0 + print*, ' wall time Jastrow derivatives (min) = ', tc / 60.d0 + call print_memory_usage() + + ! --- + ! --- + ! --- + + + allocate(tc_int_2e_ao(ao_num,ao_num,ao_num,ao_num)) + + call wall_time(time1) + + allocate(c_mat(n_points_final_grid,ao_num,ao_num)) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, k, ipoint) & + !$OMP SHARED (aos_in_r_array_transp, c_mat, ao_num, n_points_final_grid, final_weight_at_r_vector) + !$OMP DO SCHEDULE (static) + do i = 1, ao_num + do k = 1, ao_num + do ipoint = 1, n_points_final_grid + c_mat(ipoint,k,i) = final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,k) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 & + , int2_grad1_u12_ao(1,1,1,4), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid & + , 0.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num) + deallocate(c_mat) + + call wall_time(time2) + print*, ' wall time of Hermitian part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0 + call print_memory_usage() + + ! --- + + call wall_time(time1) + + allocate(c_mat(n_points_final_grid,ao_num,ao_num)) + do m = 1, 3 + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r) & + !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, c_mat, & + !$OMP ao_num, n_points_final_grid, final_weight_at_r_vector, m) + !$OMP DO SCHEDULE (static) + do i = 1, ao_num + do k = 1, ao_num + do ipoint = 1, n_points_final_grid + + weight1 = 0.5d0 * final_weight_at_r_vector(ipoint) + ao_i_r = aos_in_r_array_transp(ipoint,i) + ao_k_r = aos_in_r_array_transp(ipoint,k) + + c_mat(ipoint,k,i) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,m) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,m)) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, -1.d0 & + , int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid & + , 1.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num) + enddo + deallocate(c_mat) + + call wall_time(time2) + print*, ' wall time of non-Hermitian part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0 + call print_memory_usage() + + ! --- + + call wall_time(time1) + + call sum_A_At(tc_int_2e_ao(1,1,1,1), ao_num*ao_num) + + call wall_time(time2) + print*, ' lower- and upper-triangle of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0 + call print_memory_usage() + + ! --- + + call wall_time(time1) + + PROVIDE ao_integrals_map + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP SHARED(ao_num, tc_int_2e_ao, ao_integrals_map) & + !$OMP PRIVATE(i, j, k, l) + !$OMP DO COLLAPSE(3) + do j = 1, ao_num + do l = 1, ao_num + do i = 1, ao_num + do k = 1, ao_num + ! < 1:i, 2:j | 1:k, 2:l > + tc_int_2e_ao(k,i,l,j) = tc_int_2e_ao(k,i,l,j) + get_ao_two_e_integral(i, j, k, l, ao_integrals_map) + enddo + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + call wall_time(time2) + print*, ' wall time of Coulomb part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0 + call print_memory_usage() + + ! --- + + print*, ' Writing int2_grad1_u12_ao in ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao' + open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="write") + call ezfio_set_work_empty(.False.) + write(11) int2_grad1_u12_ao(:,:,:,1:3) + close(11) + + print*, ' Saving tc_int_2e_ao in ', trim(ezfio_filename) // '/work/ao_two_e_tc_tot' + open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/ao_two_e_tc_tot', action="write") + call ezfio_set_work_empty(.False.) + do i = 1, ao_num + write(11) tc_int_2e_ao(:,:,:,i) + enddo + close(11) + + ! ---- + + deallocate(int2_grad1_u12_ao) + deallocate(tc_int_2e_ao) + + call wall_time(time2) + print*, ' wall time for tc_int_2e_ao (min) = ', (time2-time1) / 60.d0 + call print_memory_usage() + + ! --- + + call wall_time(time1) + print*, ' wall time for TC-integrals (min) = ', (time1-time0) / 60.d0 + + return +end + +! --- + diff --git a/plugins/local/tc_int/int2_grad1_u12.irp.f b/plugins/local/tc_int/int2_grad1_u12.irp.f deleted file mode 100644 index 0cf0d775..00000000 --- a/plugins/local/tc_int/int2_grad1_u12.irp.f +++ /dev/null @@ -1,134 +0,0 @@ - -! --- - -subroutine provide_int2_grad1_u12_ao() - - implicit none - integer :: ipoint, i, j, m, jpoint - integer :: n_blocks, n_rest, n_pass - integer :: i_blocks, i_rest, i_pass, ii - double precision :: time0, time1 - double precision :: mem, n_double - double precision, allocatable :: tmp(:,:,:) - double precision, allocatable :: tmp_grad1_u12(:,:,:) - double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:) - - PROVIDE final_weight_at_r_vector_extra aos_in_r_array_extra - - print*, ' start provide_int2_grad1_u12_ao ...' - call wall_time(time0) - - - ! int2_grad1_u12_ao(i,j,ipoint,1) = \int dr2 [\grad1 u(r1,r2)]_x1 \chi_i(r2) \chi_j(r2) - ! int2_grad1_u12_ao(i,j,ipoint,2) = \int dr2 [\grad1 u(r1,r2)]_y1 \chi_i(r2) \chi_j(r2) - ! int2_grad1_u12_ao(i,j,ipoint,3) = \int dr2 [\grad1 u(r1,r2)]_z1 \chi_i(r2) \chi_j(r2) - ! int2_grad1_u12_ao(i,j,ipoint,4) = -(1/2) \int dr2 [\grad1 u(r1,r2)]^2 \chi_i(r2) \chi_j(r2) - allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,4)) - - - - call total_memory(mem) - mem = max(1.d0, qp_max_mem - mem) - n_double = mem * 1.d8 - n_blocks = int(min(n_double / (n_points_extra_final_grid * 4.d0), 1.d0*n_points_final_grid)) - n_rest = int(mod(n_points_final_grid, n_blocks)) - n_pass = int((n_points_final_grid - n_rest) / n_blocks) - - call write_int(6, n_pass, 'Number of passes') - call write_int(6, n_blocks, 'Size of the blocks') - call write_int(6, n_rest, 'Size of the last block') - - - allocate(tmp(n_points_extra_final_grid,ao_num,ao_num)) - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (j, i, jpoint) & - !$OMP SHARED (tmp, ao_num, n_points_extra_final_grid, final_weight_at_r_vector_extra, aos_in_r_array_extra_transp) - !$OMP DO SCHEDULE (static) - do j = 1, ao_num - do i = 1, ao_num - do jpoint = 1, n_points_extra_final_grid - tmp(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j) - enddo - enddo - enddo - !$OMP END DO - !$OMP END PARALLEL - - - allocate(tmp_grad1_u12(n_points_extra_final_grid,n_blocks,4)) - - do i_pass = 1, n_pass - ii = (i_pass-1)*n_blocks + 1 - - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (i_blocks, ipoint) & - !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12) - !$OMP DO - do i_blocks = 1, n_blocks - ipoint = ii - 1 + i_blocks ! r1 - call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_blocks,1), tmp_grad1_u12(1,i_blocks,2), tmp_grad1_u12(1,i_blocks,3), tmp_grad1_u12(1,i_blocks,4)) - enddo - !$OMP END DO - !$OMP END PARALLEL - - do m = 1, 4 - call dgemm( "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0 & - , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid & - , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) - enddo - enddo - - deallocate(tmp_grad1_u12) - - - if(n_rest .gt. 0) then - - allocate(tmp_grad1_u12(n_points_extra_final_grid,n_rest,4)) - - ii = n_pass*n_blocks + 1 - - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (i_rest, ipoint) & - !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12) - !$OMP DO - do i_rest = 1, n_rest - ipoint = ii - 1 + i_rest ! r1 - call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_rest,1), tmp_grad1_u12(1,i_rest,2), tmp_grad1_u12(1,i_rest,3), tmp_grad1_u12(1,i_rest,4)) - enddo - !$OMP END DO - !$OMP END PARALLEL - - do m = 1, 4 - call dgemm( "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0 & - , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid & - , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) - enddo - - deallocate(tmp_grad1_u12) - endif - - deallocate(tmp) - - - ! --- - - print*, ' Writing int2_grad1_u12_ao in ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao' - open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="write") - call ezfio_set_work_empty(.False.) - write(11) int2_grad1_u12_ao(:,:,:,1:3) - close(11) - - deallocate(int2_grad1_u12_ao) - - call wall_time(time1) - print*, ' wall time for provide_int2_grad1_u12_ao (min) = ', (time1-time0) / 60.d0 - call print_memory_usage() - -end - -! --- - - diff --git a/plugins/local/tc_int/jast_grad_2e.irp.f b/plugins/local/tc_int/jast_grad_2e.irp.f deleted file mode 100644 index b18b9d62..00000000 --- a/plugins/local/tc_int/jast_grad_2e.irp.f +++ /dev/null @@ -1,102 +0,0 @@ - -! --- - -subroutine get_grad1_u12_r1_2e(r1, n_grid2, gradx, grady, gradz) - - BEGIN_DOC - ! - ! d/dx1 j_2e(1,2) - ! d/dy1 j_2e(1,2) - ! d/dz1 j_2e(1,2) - ! - END_DOC - - include 'constants.include.F' - - implicit none - integer , intent(in) :: n_grid2 - double precision, intent(in) :: r1(3) - double precision, intent(out) :: gradx(n_grid2) - double precision, intent(out) :: grady(n_grid2) - double precision, intent(out) :: gradz(n_grid2) - - integer :: jpoint - integer :: i_nucl, p, mpA, npA, opA - integer :: powmax1, powmax, powmax2 - double precision :: r2(3) - double precision :: tmp, tmp1, tmp2 - double precision :: rn(3), f1A, grad1_f1A(3), f2A, grad2_f2A(3), g12, grad1_g12(3) - double precision, allocatable :: f1A_power(:), f2A_power(:), double_p(:), g12_power(:) - - - powmax1 = max(maxval(jBH_m), maxval(jBH_n)) - powmax2 = maxval(jBH_o) - powmax = max(powmax1, powmax2) - - allocate(f1A_power(-1:powmax), f2A_power(-1:powmax), g12_power(-1:powmax), double_p(0:powmax)) - - do p = 0, powmax - double_p(p) = dble(p) - enddo - - f1A_power(-1) = 0.d0 - f2A_power(-1) = 0.d0 - g12_power(-1) = 0.d0 - - f1A_power(0) = 1.d0 - f2A_power(0) = 1.d0 - g12_power(0) = 1.d0 - - do jpoint = 1, n_points_extra_final_grid ! r2 - - r2(1) = final_grid_points_extra(1,jpoint) - r2(2) = final_grid_points_extra(2,jpoint) - r2(3) = final_grid_points_extra(3,jpoint) - - gradx(jpoint) = 0.d0 - grady(jpoint) = 0.d0 - gradz(jpoint) = 0.d0 - do i_nucl = 1, nucl_num - - rn(1) = nucl_coord(i_nucl,1) - rn(2) = nucl_coord(i_nucl,2) - rn(3) = nucl_coord(i_nucl,3) - - call jBH_elem_fct_grad(jBH_en(i_nucl), r1, rn, f1A, grad1_f1A) - call jBH_elem_fct_grad(jBH_en(i_nucl), r2, rn, f2A, grad2_f2A) - call jBH_elem_fct_grad(jBH_ee(i_nucl), r1, r2, g12, grad1_g12) - - ! Compute powers of f1A and f2A - do p = 1, powmax1 - f1A_power(p) = f1A_power(p-1) * f1A - f2A_power(p) = f2A_power(p-1) * f2A - enddo - do p = 1, powmax2 - g12_power(p) = g12_power(p-1) * g12 - enddo - - do p = 1, jBH_size - mpA = jBH_m(p,i_nucl) - npA = jBH_n(p,i_nucl) - opA = jBH_o(p,i_nucl) - tmp = jBH_c(p,i_nucl) - if(mpA .eq. npA) then - tmp = tmp * 0.5d0 - endif - - tmp1 = double_p(mpA) * f1A_power(mpA-1) * f2A_power(npA) + double_p(npA) * f1A_power(npA-1) * f2A_power(mpA) - tmp1 = tmp1 * g12_power(opA) * tmp - tmp2 = double_p(opA) * g12_power(opA-1) * (f1A_power(mpA) * f2A_power(npA) + f1A_power(npA) * f2A_power(mpA)) * tmp - - gradx(jpoint) = gradx(jpoint) + tmp1 * grad1_f1A(1) + tmp2 * grad1_g12(1) - grady(jpoint) = grady(jpoint) + tmp1 * grad1_f1A(2) + tmp2 * grad1_g12(2) - gradz(jpoint) = gradz(jpoint) + tmp1 * grad1_f1A(3) + tmp2 * grad1_g12(3) - enddo ! p - enddo ! i_nucl - enddo ! jpoint - - return -end - -! --- - diff --git a/plugins/local/tc_int/jast_grad_full.irp.f b/plugins/local/tc_int/jast_grad_full.irp.f index f63ee3e4..78ed1edf 100644 --- a/plugins/local/tc_int/jast_grad_full.irp.f +++ b/plugins/local/tc_int/jast_grad_full.irp.f @@ -16,31 +16,26 @@ subroutine get_grad1_u12_for_tc(ipoint, n_grid2, resx, resy, resz, res) ! END_DOC + include 'constants.include.F' + implicit none integer, intent(in) :: ipoint, n_grid2 double precision, intent(out) :: resx(n_grid2), resy(n_grid2), resz(n_grid2), res(n_grid2) - integer :: jpoint - double precision :: env_r1, tmp - double precision :: grad1_env(3), r1(3) - double precision, allocatable :: env_r2(:) - double precision, allocatable :: u2b_r12(:), gradx1_u2b(:), grady1_u2b(:), gradz1_u2b(:) - double precision, allocatable :: u2b_mu(:), gradx1_mu(:), grady1_mu(:), gradz1_mu(:) - double precision, allocatable :: u2b_nu(:), gradx1_nu(:), grady1_nu(:), gradz1_nu(:) - double precision, external :: env_nucl + integer :: jpoint, i_nucl, p, mpA, npA, opA, pp + integer :: powmax1, powmax, powmax2 + double precision :: r1(3), r2(3) + double precision :: tmp, tmp1, tmp2, tmp11, tmp22 + double precision :: rn(3), f1A, grad1_f1A(3), f2A, grad2_f2A(3), g12, grad1_g12(3) + double precision, allocatable :: f1A_power(:), f2A_power(:), double_p(:), g12_power(:) r1(1) = final_grid_points(1,ipoint) r1(2) = final_grid_points(2,ipoint) r1(3) = final_grid_points(3,ipoint) + call grad1_j12_r1_seq(r1, n_grid2, resx, resy, resz) - ! j2e_type .eq. "Boys_Handy" - ! env_type .eq. "None" - ! j1e_type .eq "None" - - call get_grad1_u12_r1_2e(r1, n_grid2, resx(1), resy(1), resz(1)) - - do jpoint = 1, n_points_extra_final_grid + do jpoint = 1, n_grid2 ! r2 res(jpoint) = -0.5d0 * (resx(jpoint) * resx(jpoint) + resy(jpoint) * resy(jpoint) + resz(jpoint) * resz(jpoint)) enddo @@ -49,3 +44,91 @@ end ! --- +subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz) + + include 'constants.include.F' + + implicit none + integer , intent(in) :: n_grid2 + double precision, intent(in) :: r1(3) + double precision, intent(out) :: gradx(n_grid2) + double precision, intent(out) :: grady(n_grid2) + double precision, intent(out) :: gradz(n_grid2) + + integer :: jpoint, i_nucl, p, mpA, npA, opA + double precision :: r2(3) + double precision :: dx, dy, dz, r12, tmp + double precision :: rn(3), f1A, grad1_f1A(3), f2A, grad2_f2A(3), g12, grad1_g12(3) + double precision :: tmp1, tmp2 + integer :: powmax1, powmax, powmax2 + double precision, allocatable :: f1A_power(:), f2A_power(:), double_p(:), g12_power(:) + + powmax1 = max(maxval(jBH_m), maxval(jBH_n)) + powmax2 = maxval(jBH_o) + powmax = max(powmax1, powmax2) + + allocate(f1A_power(-1:powmax), f2A_power(-1:powmax), g12_power(-1:powmax), double_p(0:powmax)) + + do p = 0, powmax + double_p(p) = dble(p) + enddo + + f1A_power(-1) = 0.d0 + f2A_power(-1) = 0.d0 + g12_power(-1) = 0.d0 + + f1A_power(0) = 1.d0 + f2A_power(0) = 1.d0 + g12_power(0) = 1.d0 + + do jpoint = 1, n_grid2 ! r2 + + r2(1) = final_grid_points_extra(1,jpoint) + r2(2) = final_grid_points_extra(2,jpoint) + r2(3) = final_grid_points_extra(3,jpoint) + + gradx(jpoint) = 0.d0 + grady(jpoint) = 0.d0 + gradz(jpoint) = 0.d0 + do i_nucl = 1, nucl_num + + rn(1) = nucl_coord(i_nucl,1) + rn(2) = nucl_coord(i_nucl,2) + rn(3) = nucl_coord(i_nucl,3) + + call jBH_elem_fct_grad(jBH_en(i_nucl), r1, rn, f1A, grad1_f1A) + call jBH_elem_fct_grad(jBH_en(i_nucl), r2, rn, f2A, grad2_f2A) + call jBH_elem_fct_grad(jBH_ee(i_nucl), r1, r2, g12, grad1_g12) + + ! Compute powers of f1A and f2A + do p = 1, powmax1 + f1A_power(p) = f1A_power(p-1) * f1A + f2A_power(p) = f2A_power(p-1) * f2A + enddo + do p = 1, powmax2 + g12_power(p) = g12_power(p-1) * g12 + enddo + + do p = 1, jBH_size + mpA = jBH_m(p,i_nucl) + npA = jBH_n(p,i_nucl) + opA = jBH_o(p,i_nucl) + tmp = jBH_c(p,i_nucl) + if(mpA .eq. npA) then + tmp = tmp * 0.5d0 + endif + + tmp1 = double_p(mpA) * f1A_power(mpA-1) * f2A_power(npA) + double_p(npA) * f1A_power(npA-1) * f2A_power(mpA) + tmp1 = tmp1 * g12_power(opA) * tmp + tmp2 = double_p(opA) * g12_power(opA-1) * (f1A_power(mpA) * f2A_power(npA) + f1A_power(npA) * f2A_power(mpA)) * tmp + + gradx(jpoint) = gradx(jpoint) + tmp1 * grad1_f1A(1) + tmp2 * grad1_g12(1) + grady(jpoint) = grady(jpoint) + tmp1 * grad1_f1A(2) + tmp2 * grad1_g12(2) + gradz(jpoint) = gradz(jpoint) + tmp1 * grad1_f1A(3) + tmp2 * grad1_g12(3) + enddo ! p + enddo ! i_nucl + enddo ! jpoint + + return +end + diff --git a/plugins/local/tc_int/write_tc_int.irp.f b/plugins/local/tc_int/write_tc_int.irp.f index ebdce6f2..9f25a6fd 100644 --- a/plugins/local/tc_int/write_tc_int.irp.f +++ b/plugins/local/tc_int/write_tc_int.irp.f @@ -14,19 +14,17 @@ program write_tc_int my_n_pt_a_grid = tc_grid1_a touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid + my_extra_grid_becke = .True. + PROVIDE tc_grid2_a tc_grid2_r + my_n_pt_r_extra_grid = tc_grid2_r + my_n_pt_a_extra_grid = tc_grid2_a + touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid + call write_int(6, my_n_pt_r_grid, 'radial external grid over') call write_int(6, my_n_pt_a_grid, 'angular external grid over') - if(tc_integ_type .eq. "numeric") then - my_extra_grid_becke = .True. - PROVIDE tc_grid2_a tc_grid2_r - my_n_pt_r_extra_grid = tc_grid2_r - my_n_pt_a_extra_grid = tc_grid2_a - touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid - - call write_int(6, my_n_pt_r_extra_grid, 'radial internal grid over') - call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over') - endif + call write_int(6, my_n_pt_r_extra_grid, 'radial internal grid over') + call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over') call main() From a9d2f0e188cdc88e1cfe1387de4f1c118bb17a5d Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Wed, 26 Jun 2024 17:55:56 +0200 Subject: [PATCH 07/19] Working on TC ints. Not well tested --- plugins/local/tc_int/jast_grad_full.irp.f | 127 ++++++++++++++++++++-- plugins/local/tc_int/jast_utils_bh.irp.f | 22 ++-- 2 files changed, 134 insertions(+), 15 deletions(-) diff --git a/plugins/local/tc_int/jast_grad_full.irp.f b/plugins/local/tc_int/jast_grad_full.irp.f index 78ed1edf..599d3779 100644 --- a/plugins/local/tc_int/jast_grad_full.irp.f +++ b/plugins/local/tc_int/jast_grad_full.irp.f @@ -4,7 +4,7 @@ subroutine get_grad1_u12_for_tc(ipoint, n_grid2, resx, resy, resz, res) BEGIN_DOC - ! + ! ! resx(ipoint) = [grad1 u(r1,r2)]_x1 ! resy(ipoint) = [grad1 u(r1,r2)]_y1 ! resz(ipoint) = [grad1 u(r1,r2)]_z1 @@ -59,7 +59,7 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz) double precision :: r2(3) double precision :: dx, dy, dz, r12, tmp double precision :: rn(3), f1A, grad1_f1A(3), f2A, grad2_f2A(3), g12, grad1_g12(3) - double precision :: tmp1, tmp2 + double precision :: tmp1, tmp2, dist integer :: powmax1, powmax, powmax2 double precision, allocatable :: f1A_power(:), f2A_power(:), double_p(:), g12_power(:) @@ -90,30 +90,105 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz) gradx(jpoint) = 0.d0 grady(jpoint) = 0.d0 gradz(jpoint) = 0.d0 + + call jBH_elem_fct_grad_alpha1(r1, r2, g12, grad1_g12) + +! dist = (r1(1) - r2(1)) * (r1(1) - r2(1)) & +! + (r1(2) - r2(2)) * (r1(2) - r2(2)) & +! + (r1(3) - r2(3)) * (r1(3) - r2(3)) +! +! if(dist .ge. 1d-15) then +! dist = dsqrt( dist ) +! +! tmp1 = 1.d0 / (1.d0 + dist) +! +! g12 = dist * tmp1 +! tmp2 = tmp1 * tmp1 / dist +! grad1_g12(1) = tmp2 * (r1(1) - r2(1)) +! grad1_g12(2) = tmp2 * (r1(2) - r2(2)) +! grad1_g12(3) = tmp2 * (r1(3) - r2(3)) +! +! else +! +! grad1_g12(1) = 0.d0 +! grad1_g12(2) = 0.d0 +! grad1_g12(3) = 0.d0 +! g12 = 0.d0 +! +! endif +! + do p = 1, powmax2 + g12_power(p) = g12_power(p-1) * g12 + enddo + do i_nucl = 1, nucl_num rn(1) = nucl_coord(i_nucl,1) rn(2) = nucl_coord(i_nucl,2) rn(3) = nucl_coord(i_nucl,3) - call jBH_elem_fct_grad(jBH_en(i_nucl), r1, rn, f1A, grad1_f1A) - call jBH_elem_fct_grad(jBH_en(i_nucl), r2, rn, f2A, grad2_f2A) - call jBH_elem_fct_grad(jBH_ee(i_nucl), r1, r2, g12, grad1_g12) + call jBH_elem_fct_grad_alpha1(r1, rn, f1A, grad1_f1A) +! dist = (r1(1) - rn(1)) * (r1(1) - rn(1)) & +! + (r1(2) - rn(2)) * (r1(2) - rn(2)) & +! + (r1(3) - rn(3)) * (r1(3) - rn(3)) +! if (dist > 1.d-15) then +! dist = dsqrt( dist ) +! +! tmp1 = 1.d0 / (1.d0 + dist) +! +! f1A = dist * tmp1 +! tmp2 = tmp1 * tmp1 / dist +! grad1_f1A(1) = tmp2 * (r1(1) - rn(1)) +! grad1_f1A(2) = tmp2 * (r1(2) - rn(2)) +! grad1_f1A(3) = tmp2 * (r1(3) - rn(3)) +! +! else +! +! grad1_f1A(1) = 0.d0 +! grad1_f1A(2) = 0.d0 +! grad1_f1A(3) = 0.d0 +! f1A = 0.d0 +! +! endif + + call jBH_elem_fct_grad_alpha1(r2, rn, f2A, grad2_f2A) +! dist = (r2(1) - rn(1)) * (r2(1) - rn(1)) & +! + (r2(2) - rn(2)) * (r2(2) - rn(2)) & +! + (r2(3) - rn(3)) * (r2(3) - rn(3)) +! +! if (dist > 1.d-15) then +! dist = dsqrt( dist ) +! +! tmp1 = 1.d0 / (1.d0 + dist) +! +! f2A = dist * tmp1 +! tmp2 = tmp1 * tmp1 / dist +! grad2_f2A(1) = tmp2 * (r2(1) - rn(1)) +! grad2_f2A(2) = tmp2 * (r2(2) - rn(2)) +! grad2_f2A(3) = tmp2 * (r2(3) - rn(3)) +! +! else +! +! grad2_f2A(1) = 0.d0 +! grad2_f2A(2) = 0.d0 +! grad2_f2A(3) = 0.d0 +! f2A = 0.d0 +! +! endif ! Compute powers of f1A and f2A do p = 1, powmax1 f1A_power(p) = f1A_power(p-1) * f1A f2A_power(p) = f2A_power(p-1) * f2A enddo - do p = 1, powmax2 - g12_power(p) = g12_power(p-1) * g12 - enddo do p = 1, jBH_size mpA = jBH_m(p,i_nucl) npA = jBH_n(p,i_nucl) opA = jBH_o(p,i_nucl) tmp = jBH_c(p,i_nucl) +! if (dabs(tmp) <= 1.d-10) cycle +! if(mpA .eq. npA) then tmp = tmp * 0.5d0 endif @@ -132,3 +207,39 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz) return end +subroutine jBH_elem_fct_grad_alpha1(r1, r2, fct, grad1_fct) + + implicit none + double precision, intent(in) :: r1(3), r2(3) + double precision, intent(out) :: fct, grad1_fct(3) + double precision :: dist, tmp1, tmp2 + + dist = (r1(1) - r2(1)) * (r1(1) - r2(1)) & + + (r1(2) - r2(2)) * (r1(2) - r2(2)) & + + (r1(3) - r2(3)) * (r1(3) - r2(3)) + + + if(dist .ge. 1d-15) then + dist = dsqrt( dist ) + + tmp1 = 1.d0 / (1.d0 + dist) + + fct = dist * tmp1 + tmp2 = tmp1 * tmp1 / dist + grad1_fct(1) = tmp2 * (r1(1) - r2(1)) + grad1_fct(2) = tmp2 * (r1(2) - r2(2)) + grad1_fct(3) = tmp2 * (r1(3) - r2(3)) + + else + + grad1_fct(1) = 0.d0 + grad1_fct(2) = 0.d0 + grad1_fct(3) = 0.d0 + fct = 0.d0 + + endif + + return +end + +! --- diff --git a/plugins/local/tc_int/jast_utils_bh.irp.f b/plugins/local/tc_int/jast_utils_bh.irp.f index 750ce90b..200bc5ff 100644 --- a/plugins/local/tc_int/jast_utils_bh.irp.f +++ b/plugins/local/tc_int/jast_utils_bh.irp.f @@ -1,35 +1,43 @@ ! --- + + subroutine jBH_elem_fct_grad(alpha, r1, r2, fct, grad1_fct) implicit none double precision, intent(in) :: alpha, r1(3), r2(3) double precision, intent(out) :: fct, grad1_fct(3) - double precision :: dist, tmp1, tmp2 + double precision :: dist, tmp1, tmp2, dist_inv - dist = dsqrt( (r1(1) - r2(1)) * (r1(1) - r2(1)) & - + (r1(2) - r2(2)) * (r1(2) - r2(2)) & - + (r1(3) - r2(3)) * (r1(3) - r2(3)) ) + dist = (r1(1) - r2(1)) * (r1(1) - r2(1)) & + + (r1(2) - r2(2)) * (r1(2) - r2(2)) & + + (r1(3) - r2(3)) * (r1(3) - r2(3)) - if(dist .ge. 1d-10) then + if(dist .ge. 1d-15) then + dist_inv = 1.d0/dsqrt( dist ) + dist = dist_inv * dist + tmp1 = 1.d0 / (1.d0 + alpha * dist) fct = alpha * dist * tmp1 - tmp2 = alpha * tmp1 * tmp1 / dist + tmp2 = alpha * tmp1 * tmp1 * dist_inv grad1_fct(1) = tmp2 * (r1(1) - r2(1)) grad1_fct(2) = tmp2 * (r1(2) - r2(2)) grad1_fct(3) = tmp2 * (r1(3) - r2(3)) + else + grad1_fct(1) = 0.d0 grad1_fct(2) = 0.d0 grad1_fct(3) = 0.d0 fct = 0.d0 + endif return -end +end ! --- From 5d80cb7b2dd53bdd9eb713e507912e6fce3cadd7 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Thu, 27 Jun 2024 12:06:06 +0200 Subject: [PATCH 08/19] Separated gpu and gpu_arch --- configure | 8 ++-- src/ccsd/NEED | 1 + src/ccsd/ccsd_space_orb_sub.irp.f | 14 +++++-- src/gpu/NEED | 1 + src/gpu/README.rst | 6 +++ src/{gpu_x86 => gpu}/gpu.h | 0 src/{gpu_x86 => gpu}/gpu_module.F90 | 59 +++++++++++++++-------------- src/gpu_x86/gpu.c | 2 +- 8 files changed, 54 insertions(+), 37 deletions(-) create mode 100644 src/gpu/NEED create mode 100644 src/gpu/README.rst rename src/{gpu_x86 => gpu}/gpu.h (100%) rename src/{gpu_x86 => gpu}/gpu_module.F90 (74%) diff --git a/configure b/configure index 014275eb..db158966 100755 --- a/configure +++ b/configure @@ -115,19 +115,19 @@ while getopts "d:c:i:g:h" c ; do done # Handle GPU acceleration -rm -f ${QP_ROOT}/src/gpu +rm -f ${QP_ROOT}/src/gpu_arch case "$GPU" in amd) # Nvidia echo "Activating AMD GPU acceleration" - ln -s ${QP_ROOT}/src/gpu_amd ${QP_ROOT}/src/gpu + ln -s ${QP_ROOT}/src/gpu_amd ${QP_ROOT}/src/gpu_arch ;; nvidia) # Nvidia echo "Activating Nvidia GPU acceleration" - ln -s ${QP_ROOT}/src/gpu_nvidia ${QP_ROOT}/src/gpu + ln -s ${QP_ROOT}/src/gpu_nvidia ${QP_ROOT}/src/gpu_arch ;; *) # No Acceleration echo "Disabling GPU acceleration" - ln -s ${QP_ROOT}/src/gpu_x86 ${QP_ROOT}/src/gpu + ln -s ${QP_ROOT}/src/gpu_x86 ${QP_ROOT}/src/gpu_arch ;; esac diff --git a/src/ccsd/NEED b/src/ccsd/NEED index e6e6bc59..8298f28e 100644 --- a/src/ccsd/NEED +++ b/src/ccsd/NEED @@ -1,2 +1,3 @@ +gpu hartree_fock utils_cc diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 9d4ae7f9..84aab08a 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -1,4 +1,5 @@ subroutine run_ccsd_space_orb + use gpu implicit none @@ -11,7 +12,7 @@ subroutine run_ccsd_space_orb double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:), tau(:,:,:,:), tau_x(:,:,:,:) double precision, allocatable :: t1(:,:), r1(:,:) - double precision, allocatable :: H_oo(:,:), H_vv(:,:), H_vo(:,:) + double precision, pointer :: H_oo, H_vv, H_vo double precision, allocatable :: all_err(:,:), all_t(:,:) integer, allocatable :: list_occ(:), list_vir(:) @@ -55,7 +56,10 @@ subroutine run_ccsd_space_orb allocate(tau(nO,nO,nV,nV)) allocate(tau_x(nO,nO,nV,nV)) allocate(t1(nO,nV), r1(nO,nV)) - allocate(H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO)) + + call gpu_allocate_double(H_oo, (/ nO, nO /) ) + call gpu_allocate_double(H_vv, (/ nV, nV /) ) + call gpu_allocate_double(H_vo, (/ nV, nO /) ) if (cc_update_method == 'diis') then double precision :: rss, diis_mem, extra_mem @@ -191,7 +195,11 @@ subroutine run_ccsd_space_orb deallocate(all_err,all_t) endif - deallocate(H_vv,H_oo,H_vo,r1,r2,tau) + call gpu_deallocate_double(H_oo) + call gpu_deallocate_double(H_vv) + call gpu_deallocate_double(H_vo) + + deallocate(r1,r2,tau) ! CCSD(T) double precision :: e_t, e_t_err diff --git a/src/gpu/NEED b/src/gpu/NEED new file mode 100644 index 00000000..c2af78d2 --- /dev/null +++ b/src/gpu/NEED @@ -0,0 +1 @@ +gpu_arch diff --git a/src/gpu/README.rst b/src/gpu/README.rst new file mode 100644 index 00000000..17ee28a0 --- /dev/null +++ b/src/gpu/README.rst @@ -0,0 +1,6 @@ +=== +gpu +=== + +Bindings for GPU routines (architecture independent). +Architecture-dependent files are in gpu_arch. diff --git a/src/gpu_x86/gpu.h b/src/gpu/gpu.h similarity index 100% rename from src/gpu_x86/gpu.h rename to src/gpu/gpu.h diff --git a/src/gpu_x86/gpu_module.F90 b/src/gpu/gpu_module.F90 similarity index 74% rename from src/gpu_x86/gpu_module.F90 rename to src/gpu/gpu_module.F90 index 86ba3926..f35ebc97 100644 --- a/src/gpu_x86/gpu_module.F90 +++ b/src/gpu/gpu_module.F90 @@ -1,5 +1,5 @@ module gpu - use, intrinsic :: iso_c_binding, only : c_int32_t, c_int64_t, c_double, c_size_t, c_char + use, intrinsic :: iso_c_binding implicit none interface @@ -17,7 +17,7 @@ module gpu integer(c_int64_t), value :: n end subroutine - subroutine gpu_free_c(ptr) bind(C, name='gpu_free') + subroutine gpu_deallocate_c(ptr) bind(C, name='gpu_deallocate') import type(c_ptr) :: ptr end subroutine @@ -89,53 +89,54 @@ module gpu end interface + contains + + + subroutine gpu_allocate_double(ptr, s) + implicit none + double precision, pointer, intent(inout) :: ptr + integer, intent(in) :: s(:) + type(c_ptr) :: cptr + + call gpu_allocate_c(cptr, sum(s*1_8)*8_8) + call c_f_pointer(cptr, ptr, s) + end subroutine + + subroutine gpu_deallocate_double(ptr) + implicit none + double precision, pointer, intent(inout) :: ptr + type(c_ptr) :: cptr + cptr = c_loc(ptr) + call gpu_deallocate(cptr) + NULLIFY(ptr) + end subroutine + end module -subroutine gpu_allocate_double(ptr, s) - use gpu - implicit none - double precision, pointer, intent(inout) :: ptr - integer*8, intent(in) :: s(*) - type(c_ptr) :: cptr - - call gpu_allocate_c(cptr, sum(s)*8_8) - call c_f_pointer(cptr, ptr, s) -end subroutine - -subroutine gpu_free_double(ptr) - use gpu - implicit none - double precision, pointer, intent(inout) :: ptr - type(c_ptr) :: cptr - cptr = cloc(ptr) - call gpu_free(cptr) - NULLIFY(ptr) -end subroutine - subroutine gpu_upload_double(cpu_ptr, gpu_ptr, n) use gpu implicit none double precision, intent(in) :: cpu_ptr(*) - double precision, intent(out) :: gpu_ptr(*) + double precision, intent(in) :: gpu_ptr(*) integer(c_int64_t), intent(in) :: n - call gpu_upload_c(cpu_ptr, gpu_ptr, 8_8*n) + call gpu_upload_c(c_loc(cpu_ptr), c_loc(gpu_ptr), 8_8*n) end subroutine subroutine gpu_download_double(gpu_ptr, cpu_ptr, n) use gpu implicit none double precision, intent(in) :: gpu_ptr(*) - double precision, intent(out) :: cpu_ptr(*) + double precision, intent(in) :: cpu_ptr(*) integer(c_int64_t), intent(in) :: n - call gpu_download_c(gpu_ptr, cpu_ptr, 8_8*n) + call gpu_download_c(c_loc(gpu_ptr), c_loc(cpu_ptr), 8_8*n) end subroutine subroutine gpu_copy_double(gpu_ptr_src, gpu_ptr_dest, n) use gpu implicit none double precision, intent(in) :: gpu_ptr_src(*) - double precision, intent(out) :: gpu_ptr_dest(*) + double precision, intent(in) :: gpu_ptr_dest(*) integer(c_int64_t), intent(in) :: n - call gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, 8_8*n) + call gpu_copy_c(c_loc(gpu_ptr_src), c_loc(gpu_ptr_dest), 8_8*n) end subroutine diff --git a/src/gpu_x86/gpu.c b/src/gpu_x86/gpu.c index 71505dbe..41ede396 100644 --- a/src/gpu_x86/gpu.c +++ b/src/gpu_x86/gpu.c @@ -25,7 +25,7 @@ void gpu_allocate(void** ptr, const int64_t n) { } } -void gpu_free(void** ptr) { +void gpu_deallocate(void** ptr) { free(*ptr); *ptr = NULL; } From 6c02ac0f0b05ea3cc16e0fde66e23c9a0de14246 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Thu, 27 Jun 2024 12:07:48 +0200 Subject: [PATCH 09/19] Separated gpu and gpu_arch --- src/gpu/gpu_module.F90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90 index f35ebc97..43754454 100644 --- a/src/gpu/gpu_module.F90 +++ b/src/gpu/gpu_module.F90 @@ -107,7 +107,7 @@ module gpu double precision, pointer, intent(inout) :: ptr type(c_ptr) :: cptr cptr = c_loc(ptr) - call gpu_deallocate(cptr) + call gpu_deallocate_c(cptr) NULLIFY(ptr) end subroutine From fa6d1419496d271a4715efc776790ce7fc152064 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Thu, 27 Jun 2024 15:45:52 +0200 Subject: [PATCH 10/19] Introducing GPU in CCSD --- src/ccsd/ccsd_space_orb_sub.irp.f | 224 +++++++----- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 12 +- src/gpu/gpu_module.F90 | 450 ++++++++++++++++++++++--- src/gpu_x86/gpu.c | 48 +-- 4 files changed, 570 insertions(+), 164 deletions(-) diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 84aab08a..455d62f7 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -10,9 +10,9 @@ subroutine run_ccsd_space_orb double precision :: uncorr_energy,energy, max_elem, max_r, max_r1, max_r2,ta,tb logical :: not_converged - double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:), tau(:,:,:,:), tau_x(:,:,:,:) - double precision, allocatable :: t1(:,:), r1(:,:) - double precision, pointer :: H_oo, H_vv, H_vo + type(gpu_double4) :: t2, r2, tau, tau_x + type(gpu_double2) :: t1, r1 + type(gpu_double2) :: H_oo, H_vv, H_vo double precision, allocatable :: all_err(:,:), all_t(:,:) integer, allocatable :: list_occ(:), list_vir(:) @@ -52,14 +52,15 @@ subroutine run_ccsd_space_orb !print*,'occ',list_occ !print*,'vir',list_vir - allocate(t2(nO,nO,nV,nV), r2(nO,nO,nV,nV)) - allocate(tau(nO,nO,nV,nV)) - allocate(tau_x(nO,nO,nV,nV)) - allocate(t1(nO,nV), r1(nO,nV)) - - call gpu_allocate_double(H_oo, (/ nO, nO /) ) - call gpu_allocate_double(H_vv, (/ nV, nV /) ) - call gpu_allocate_double(H_vo, (/ nV, nO /) ) + call gpu_allocate(t2, nO,nO,nV,nV) + call gpu_allocate(r2, nO,nO,nV,nV) + call gpu_allocate(tau, nO,nO,nV,nV) + call gpu_allocate(tau_x, nO,nO,nV,nV) + call gpu_allocate(t1, nO,nV) + call gpu_allocate(r1, nO,nV) + call gpu_allocate(H_oo, nO, nO) + call gpu_allocate(H_vo, nV, nO) + call gpu_allocate(H_vv, nV, nV) if (cc_update_method == 'diis') then double precision :: rss, diis_mem, extra_mem @@ -101,14 +102,21 @@ subroutine run_ccsd_space_orb endif ! Init - call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,t1) - call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,t2) - call update_tau_space(nO,nV,t1,t2,tau) + double precision, allocatable :: h_t1(:,:), h_t2(:,:,:,:) + allocate(h_t1(nO,nV), h_t2(nO,nO,nV,nV)) + + call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,h_t1) + call gpu_upload(h_t1, t1) + + call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,h_t2) + call gpu_upload(h_t2, t2) + + call update_tau_space(nO,nV,h_t1,t1,t2,tau) call update_tau_x_space(nO,nV,tau,tau_x) !print*,'hf_energy', hf_energy call det_energy(det,uncorr_energy) print*,'Det energy', uncorr_energy - call ccsd_energy_space_x(nO,nV,tau_x,t1,energy) + call ccsd_energy_space_x(nO,nV,tau_x%f,t1%f,energy) print*,'Guess energy', uncorr_energy+energy, energy nb_iter = 0 @@ -127,40 +135,38 @@ subroutine run_ccsd_space_orb if (do_ao_cholesky) then ! if (.False.) then call compute_H_oo_chol(nO,nV,tau_x,H_oo) - call compute_H_vv_chol(nO,nV,tau_x,H_vv) - call compute_H_vo_chol(nO,nV,t1,H_vo) + call compute_H_vv_chol(nO,nV,tau_x%f,H_vv%f) + call compute_H_vo_chol(nO,nV,t1%f,H_vo%f) - call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) - call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) + call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1) + call compute_r2_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r2%f,max_r2) else - call compute_H_oo(nO,nV,t1,t2,tau,H_oo) - call compute_H_vv(nO,nV,t1,t2,tau,H_vv) - call compute_H_vo(nO,nV,t1,t2,H_vo) + call compute_H_oo(nO,nV,t1%f,t2%f,tau%f,H_oo%f) + call compute_H_vv(nO,nV,t1%f,t2%f,tau%f,H_vv%f) + call compute_H_vo(nO,nV,t1%f,t2%f,H_vo%f) - call compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) - call compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) + call compute_r1_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r1%f,max_r1) + call compute_r2_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r2%f,max_r2) endif max_r = max(max_r1,max_r2) ! Update if (cc_update_method == 'diis') then - !call update_t_ccsd(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2) - !call update_t_ccsd_diis(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2) - call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1,r2,t1,t2,all_err,all_t) + call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1%f,r2%f,t1%f,t2%f,all_err,all_t) ! Standard update as T = T - Delta elseif (cc_update_method == 'none') then - call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1,t1) - call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2,t2) + call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1%f,t1%f) + call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2%f,t2%f) else print*,'Unkown cc_method_method: '//cc_update_method endif - call update_tau_space(nO,nV,t1,t2,tau) + call update_tau_space(nO,nV,t1%f,t1,t2,tau) call update_tau_x_space(nO,nV,tau,tau_x) ! Energy - call ccsd_energy_space_x(nO,nV,tau_x,t1,energy) + call ccsd_energy_space_x(nO,nV,tau_x%f,t1%f,energy) write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |' nb_iter = nb_iter + 1 @@ -185,8 +191,8 @@ subroutine run_ccsd_space_orb print*,'' if (write_amplitudes) then - call write_t1(nO,nV,t1) - call write_t2(nO,nV,t2) + call write_t1(nO,nV,t1%f) + call write_t2(nO,nV,t2%f) call ezfio_set_utils_cc_io_amplitudes('Read') endif @@ -195,11 +201,14 @@ subroutine run_ccsd_space_orb deallocate(all_err,all_t) endif - call gpu_deallocate_double(H_oo) - call gpu_deallocate_double(H_vv) - call gpu_deallocate_double(H_vo) + call gpu_deallocate(H_oo) + call gpu_deallocate(H_vv) + call gpu_deallocate(H_vo) - deallocate(r1,r2,tau) + call gpu_deallocate(r1) + call gpu_deallocate(r2) + call gpu_deallocate(tau) + call gpu_deallocate(tau_x) ! CCSD(T) double precision :: e_t, e_t_err @@ -207,28 +216,14 @@ subroutine run_ccsd_space_orb if (cc_par_t .and. elec_alpha_num + elec_beta_num > 2) then - ! Dumb way - !call wall_time(ta) - !call ccsd_par_t_space(nO,nV,t1,t2,e_t) - !call wall_time(tb) - !print*,'Time: ',tb-ta, ' s' - - !print*,'' - !write(*,'(A15,F18.12,A3)') ' E(CCSD(T)) = ', uncorr_energy + energy + e_t, ' Ha' - !write(*,'(A15,F18.12,A3)') ' E(T) = ', e_t, ' Ha' - !write(*,'(A15,F18.12,A3)') ' Correlation = ', energy + e_t, ' Ha' - !print*,'' - ! New e_t = uncorr_energy + energy ! For print in (T) call e_t_err = 0.d0 print*,'Computing (T) correction...' call wall_time(ta) -! call ccsd_par_t_space_v3(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v & -! ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t) - call ccsd_par_t_space_stoch(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v & + call ccsd_par_t_space_stoch(nO,nV,t1%f,t2%f,cc_space_f_o,cc_space_f_v & ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t, e_t_err) call wall_time(tb) @@ -243,7 +238,9 @@ subroutine run_ccsd_space_orb call save_energy(uncorr_energy + energy, e_t) - deallocate(t1,t2) + deallocate(h_t1, h_t2) + call gpu_deallocate(t1) + call gpu_deallocate(t2) end @@ -341,70 +338,139 @@ end ! Tau -subroutine update_tau_space(nO,nV,t1,t2,tau) - +subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau) + use gpu implicit none ! in integer, intent(in) :: nO, nV - double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV) + double precision, intent(in) :: h_t1(nO,nV) + type(gpu_double2), intent(in) :: t1 + type(gpu_double4), intent(in) :: t2 ! out - double precision, intent(out) :: tau(nO,nO,nV,nV) + type(gpu_double4) :: tau ! internal integer :: i,j,a,b +! !$OMP PARALLEL & +! !$OMP SHARED(nO,nV,tau,t2,t1,h_t1) & +! !$OMP PRIVATE(i,j,a,b) & +! !$OMP DEFAULT(NONE) +! !$OMP DO +! do b = 1, nV +! do a = 1, nV +! do j = 1, nO +! do i = 1, nO +! tau%f(i,j,a,b) = t2%f(i,j,a,b) + t1%f(i,a) * h_t1(j,b) +! enddo +! enddo +! enddo +! enddo +! !$OMP END DO +! !$OMP END PARALLEL + + + type(gpu_blas) :: blas + type(gpu_stream) :: stream(nV) + + call gpu_blas_create(blas) + do b=1,nV + call gpu_stream_create(stream(b)) + enddo + !$OMP PARALLEL & - !$OMP SHARED(nO,nV,tau,t2,t1) & + !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) - !$OMP DO - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b) - enddo - enddo + do j=1,nO + !$OMP DO + do b=1,nV + call gpu_set_stream(blas,stream(b)) + call gpu_dgeam_c(blas%c, 'N', 'N', nO*1_8, nV*1_8, & + 1.d0, c_loc(t2%f(1,j,1,b)), nO*nO*1_8, & + h_t1(j,b), t1%c, nO*1_8, & + c_loc(tau%f(1,j,1,b)), nO*nO*1_8) enddo + !$OMP END DO enddo - !$OMP END DO !$OMP END PARALLEL + call gpu_synchronize() + + do b=1,nV + call gpu_stream_destroy(stream(b)) + enddo + + call gpu_blas_destroy(blas) + end subroutine update_tau_x_space(nO,nV,tau,tau_x) - + use gpu implicit none ! in - integer, intent(in) :: nO, nV - double precision, intent(in) :: tau(nO,nO,nV,nV) + integer, intent(in) :: nO, nV + type(gpu_double4), intent(in) :: tau ! out - double precision, intent(out) :: tau_x(nO,nO,nV,nV) + type(gpu_double4) :: tau_x ! internal integer :: i,j,a,b +! !$OMP PARALLEL & +! !$OMP SHARED(nO,nV,tau,tau_x) & +! !$OMP PRIVATE(i,j,a,b) & +! !$OMP DEFAULT(NONE) +! !$OMP DO +! do b = 1, nV +! do a = 1, nV +! do j = 1, nO +! do i = 1, nO +! tau_x%f(i,j,a,b) = 2.d0*tau%f(i,j,a,b) - tau%f(i,j,b,a) +! enddo +! enddo +! enddo +! enddo +! !$OMP END DO +! !$OMP END PARALLEL + + type(gpu_blas) :: blas + type(gpu_stream) :: stream(nV) + + call gpu_blas_create(blas) + do a=1,nV + call gpu_stream_create(stream(a)) + enddo + !$OMP PARALLEL & - !$OMP SHARED(nO,nV,tau,tau_x) & + !$OMP SHARED(nO,nV,tau,tau_x,stream,blas) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) !$OMP DO - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - tau_x(i,j,a,b) = 2.d0*tau(i,j,a,b) - tau(i,j,b,a) - enddo - enddo + do b=1,nV + do a=1,nV + call gpu_set_stream(blas,stream(a)) + call gpu_dgeam_c(blas%c, 'N', 'N', nO*1_8, nO*1_8, & + 2.d0, c_loc(tau%f(1,1,a,b)), nO*1_8, & + -1.d0, c_loc(tau%f(1,1,b,a)), nO*1_8, & + c_loc(tau_x%f(1,1,a,b)), nO*1_8) enddo enddo !$OMP END DO !$OMP END PARALLEL + call gpu_synchronize() + + do b=1,nV + call gpu_stream_destroy(stream(b)) + enddo + + call gpu_blas_destroy(blas) + end ! R1 diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index b59dc0bb..9b161001 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -294,12 +294,12 @@ end ! H_oo subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo) - + use gpu implicit none integer, intent(in) :: nO,nV - double precision, intent(in) :: tau_x(nO, nO, nV, nV) - double precision, intent(out) :: H_oo(nO, nO) + type(gpu_double4), intent(in) :: tau_x + type(gpu_double2), intent(out) :: H_oo integer :: a,b,i,j,u,k @@ -315,7 +315,7 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo) do b=1,nV do j=1,nO do a=1,nV - tmp_vov(a,j,b) = tau_x(u,j,a,b) + tmp_vov(a,j,b) = tau_x%f(u,j,a,b) enddo enddo enddo @@ -328,7 +328,7 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo) !$omp do do i = 1, nO do u = 1, nO - H_oo(u,i) = cc_space_f_oo(u,i) + H_oo%f(u,i) = cc_space_f_oo(u,i) enddo enddo !$omp end do nowait @@ -336,7 +336,7 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo) !$omp end parallel call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & tau_kau, cholesky_mo_num*nV, cc_space_v_vo_chol, cholesky_mo_num*nV, & - 1.d0, H_oo, nO) + 1.d0, H_oo%f, nO) end diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90 index 43754454..51f80ac0 100644 --- a/src/gpu/gpu_module.F90 +++ b/src/gpu/gpu_module.F90 @@ -2,6 +2,52 @@ module gpu use, intrinsic :: iso_c_binding implicit none +! Data types +! ---------- + + type gpu_double1 + type(c_ptr) :: c + double precision, pointer :: f(:) + end type + + type gpu_double2 + type(c_ptr) :: c + double precision, pointer :: f(:,:) + end type + + type gpu_double3 + type(c_ptr) :: c + double precision, pointer :: f(:,:,:) + end type + + type gpu_double4 + type(c_ptr) :: c + double precision, pointer :: f(:,:,:,:) + end type + + type gpu_double5 + type(c_ptr) :: c + double precision, pointer :: f(:,:,:,:,:) + end type + + type gpu_double6 + type(c_ptr) :: c + double precision, pointer :: f(:,:,:,:,:,:) + end type + + + type gpu_blas + type(c_ptr) :: c + end type + + type gpu_stream + type(c_ptr) :: c + end type + + +! C interfaces +! ------------ + interface integer function gpu_ndevices() bind(C) end function @@ -43,100 +89,394 @@ module gpu integer(c_int64_t), value :: n end subroutine - subroutine gpu_stream_create(stream) bind(C) + subroutine gpu_stream_create_c(stream) bind(C, name='gpu_stream_create') import type(c_ptr) :: stream end subroutine - subroutine gpu_stream_destroy(stream) bind(C) + subroutine gpu_stream_destroy_c(stream) bind(C, name='gpu_stream_destroy') import type(c_ptr) :: stream end subroutine - subroutine gpu_set_stream(handle, stream) bind(C) + subroutine gpu_set_stream_c(handle, stream) bind(C, name='gpu_set_stream') import type(c_ptr) :: handle, stream end subroutine - subroutine gpu_synchronize() + subroutine gpu_synchronize() bind(C) + import end subroutine - subroutine gpu_blas_create(handle) bind(C) + subroutine gpu_blas_create_c(handle) bind(C, name='gpu_blas_create') import type(c_ptr) :: handle end subroutine - subroutine gpu_blas_destroy(handle) bind(C) + subroutine gpu_blas_destroy_c(handle) bind(C, name='gpu_blas_destroy') import type(c_ptr) :: handle end subroutine - subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) bind(C) + subroutine gpu_ddot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_ddot') import - type(c_ptr), intent(in) :: handle - integer(c_int64_t), value :: n, incx, incy - real(c_double), intent(in) :: dx(*), dy(*) - real(c_double), intent(out) :: res + type(c_ptr), intent(in), value :: handle + integer(c_int64_t), value :: n, incx, incy + type(c_ptr), intent(in), value :: dx, dy + real(c_double), intent(out) :: res end subroutine - subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res) bind(C) + subroutine gpu_sdot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_sdot') import - type(c_ptr), intent(in) :: handle - integer(c_int64_t), value :: n, incx, incy - real(c_float), intent(in) :: dx(*), dy(*) + type(c_ptr), intent(in), value :: handle + integer(c_int64_t), value :: n, incx, incy + type(c_ptr), intent(in), value :: dx, dy real(c_float), intent(out) :: res end subroutine + subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, & + b, ldb, c, ldc) bind(C, name='gpu_dgeam') + import + type(c_ptr), intent(in), value :: handle + character(c_char), intent(in), value :: transa, transb + integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc + real(c_double), intent(in), value :: alpha, beta + type(c_ptr), value :: a, b, c + end subroutine + end interface + +! Polymorphic interfaces +! ---------------------- + + interface gpu_allocate + procedure gpu_allocate_double1 & + ,gpu_allocate_double2 & + ,gpu_allocate_double3 & + ,gpu_allocate_double4 & + ,gpu_allocate_double5 & + ,gpu_allocate_double6 + end interface gpu_allocate + + interface gpu_deallocate + procedure gpu_deallocate_double1 & + ,gpu_deallocate_double2 & + ,gpu_deallocate_double3 & + ,gpu_deallocate_double4 & + ,gpu_deallocate_double5 & + ,gpu_deallocate_double6 + end interface gpu_deallocate + + interface gpu_upload + procedure gpu_upload_double1 & + ,gpu_upload_double2 & + ,gpu_upload_double3 & + ,gpu_upload_double4 & + ,gpu_upload_double5 & + ,gpu_upload_double6 + end interface gpu_upload + + interface gpu_download + procedure gpu_download_double1 & + ,gpu_download_double2 & + ,gpu_download_double3 & + ,gpu_download_double4 & + ,gpu_download_double5 & + ,gpu_download_double6 + end interface gpu_download + + interface gpu_copy + procedure gpu_copy_double1 & + ,gpu_copy_double2 & + ,gpu_copy_double3 & + ,gpu_copy_double4 & + ,gpu_copy_double5 & + ,gpu_copy_double6 + end interface gpu_copy + + contains - subroutine gpu_allocate_double(ptr, s) - implicit none - double precision, pointer, intent(inout) :: ptr - integer, intent(in) :: s(:) - type(c_ptr) :: cptr +! gpu_allocate +! ------------ - call gpu_allocate_c(cptr, sum(s*1_8)*8_8) - call c_f_pointer(cptr, ptr, s) + subroutine gpu_allocate_double1(ptr, s) + implicit none + type(gpu_double1), intent(inout) :: ptr + integer, intent(in) :: s + + call gpu_allocate_c(ptr%c, s*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s /)) end subroutine - subroutine gpu_deallocate_double(ptr) + subroutine gpu_allocate_double2(ptr, s1, s2) implicit none - double precision, pointer, intent(inout) :: ptr - type(c_ptr) :: cptr - cptr = c_loc(ptr) - call gpu_deallocate_c(cptr) - NULLIFY(ptr) + type(gpu_double2), intent(inout) :: ptr + integer, intent(in) :: s1, s2 + + call gpu_allocate_c(ptr%c, s1*s2*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /)) + end subroutine + + subroutine gpu_allocate_double3(ptr, s1, s2, s3) + implicit none + type(gpu_double3), intent(inout) :: ptr + integer, intent(in) :: s1, s2, s3 + + call gpu_allocate_c(ptr%c, s1*s2*s3*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /)) + end subroutine + + subroutine gpu_allocate_double4(ptr, s1, s2, s3, s4) + implicit none + type(gpu_double4), intent(inout) :: ptr + integer, intent(in) :: s1, s2, s3, s4 + + call gpu_allocate_c(ptr%c, s1*s2*s3*s4*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /)) + end subroutine + + subroutine gpu_allocate_double5(ptr, s1, s2, s3, s4, s5) + implicit none + type(gpu_double5), intent(inout) :: ptr + integer, intent(in) :: s1, s2, s3, s4, s5 + + call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /)) + end subroutine + + subroutine gpu_allocate_double6(ptr, s1, s2, s3, s4, s5, s6) + implicit none + type(gpu_double6), intent(inout) :: ptr + integer, intent(in) :: s1, s2, s3, s4, s5, s6 + + call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*s6*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /)) + end subroutine + + +! gpu_deallocate +! -------------- + + subroutine gpu_deallocate_double1(ptr) + implicit none + type(gpu_double1), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double2(ptr) + implicit none + type(gpu_double2), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double3(ptr) + implicit none + type(gpu_double3), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double4(ptr) + implicit none + type(gpu_double4), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double5(ptr) + implicit none + type(gpu_double5), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double6(ptr) + implicit none + type(gpu_double6), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + +! gpu_upload +! ---------- + + subroutine gpu_upload_double1(cpu_ptr, gpu_ptr) + implicit none + double precision, intent(in) :: cpu_ptr(:) + type(gpu_double1), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, 8_8*size(gpu_ptr%f)) + end subroutine + + subroutine gpu_upload_double2(cpu_ptr, gpu_ptr) + implicit none + double precision, intent(in) :: cpu_ptr(:,:) + type(gpu_double2), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + subroutine gpu_upload_double3(cpu_ptr, gpu_ptr) + implicit none + double precision, intent(in) :: cpu_ptr(:,:,:) + type(gpu_double3), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + subroutine gpu_upload_double4(cpu_ptr, gpu_ptr) + implicit none + double precision, intent(in) :: cpu_ptr(:,:,:,:) + type(gpu_double4), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + subroutine gpu_upload_double5(cpu_ptr, gpu_ptr) + implicit none + double precision, intent(in) :: cpu_ptr(:,:,:,:,:) + type(gpu_double5), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + subroutine gpu_upload_double6(cpu_ptr, gpu_ptr) + implicit none + double precision, intent(in) :: cpu_ptr(:,:,:,:,:,:) + type(gpu_double6), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + +! gpu_download +! ------------ + + subroutine gpu_download_double1(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double1), intent(in) :: gpu_ptr + double precision, intent(in) :: cpu_ptr(:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*size(gpu_ptr%f)) + end subroutine + + subroutine gpu_download_double2(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double2), intent(in) :: gpu_ptr + double precision, intent(in) :: cpu_ptr(:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + + subroutine gpu_download_double3(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double3), intent(in) :: gpu_ptr + double precision, intent(in) :: cpu_ptr(:,:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + + subroutine gpu_download_double4(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double4), intent(in) :: gpu_ptr + double precision, intent(in) :: cpu_ptr(:,:,:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + + subroutine gpu_download_double5(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double5), intent(in) :: gpu_ptr + double precision, intent(in) :: cpu_ptr(:,:,:,:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + + subroutine gpu_download_double6(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double6), intent(in) :: gpu_ptr + double precision, intent(in) :: cpu_ptr(:,:,:,:,:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + +! gpu_copy +! -------- + + subroutine gpu_copy_double1(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double1), intent(in) :: gpu_ptr_src + type(gpu_double1), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*size(gpu_ptr_dest%f)) + end subroutine + + subroutine gpu_copy_double2(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double2), intent(in) :: gpu_ptr_src + type(gpu_double2), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + subroutine gpu_copy_double3(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double3), intent(in) :: gpu_ptr_src + type(gpu_double3), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + subroutine gpu_copy_double4(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double4), intent(in) :: gpu_ptr_src + type(gpu_double4), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + subroutine gpu_copy_double5(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double5), intent(in) :: gpu_ptr_src + type(gpu_double5), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + subroutine gpu_copy_double6(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double6), intent(in) :: gpu_ptr_src + type(gpu_double6), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + +! gpu_stream +! ---------- + + subroutine gpu_stream_create(stream) + import + type(gpu_stream) :: stream + call gpu_stream_create_c(stream%c) + end subroutine + + subroutine gpu_stream_destroy(stream) + import + type(gpu_stream) :: stream + call gpu_stream_destroy_c(stream%c) + end subroutine + + subroutine gpu_set_stream(handle, stream) + import + type(gpu_blas) :: handle + type(gpu_stream) :: stream + call gpu_set_stream_c(handle%c, stream%c) + end subroutine + + +! gpu_blas +! -------- + + subroutine gpu_blas_create(handle) + import + type(gpu_blas) :: handle + call gpu_blas_create_c(handle%c) + end subroutine + + subroutine gpu_blas_destroy(handle) + import + type(gpu_blas) :: handle + call gpu_blas_destroy_c(handle%c) end subroutine end module -subroutine gpu_upload_double(cpu_ptr, gpu_ptr, n) - use gpu - implicit none - double precision, intent(in) :: cpu_ptr(*) - double precision, intent(in) :: gpu_ptr(*) - integer(c_int64_t), intent(in) :: n - call gpu_upload_c(c_loc(cpu_ptr), c_loc(gpu_ptr), 8_8*n) -end subroutine - -subroutine gpu_download_double(gpu_ptr, cpu_ptr, n) - use gpu - implicit none - double precision, intent(in) :: gpu_ptr(*) - double precision, intent(in) :: cpu_ptr(*) - integer(c_int64_t), intent(in) :: n - call gpu_download_c(c_loc(gpu_ptr), c_loc(cpu_ptr), 8_8*n) -end subroutine - -subroutine gpu_copy_double(gpu_ptr_src, gpu_ptr_dest, n) - use gpu - implicit none - double precision, intent(in) :: gpu_ptr_src(*) - double precision, intent(in) :: gpu_ptr_dest(*) - integer(c_int64_t), intent(in) :: n - call gpu_copy_c(c_loc(gpu_ptr_src), c_loc(gpu_ptr_dest), 8_8*n) -end subroutine - diff --git a/src/gpu_x86/gpu.c b/src/gpu_x86/gpu.c index 41ede396..5f42cb0d 100644 --- a/src/gpu_x86/gpu.c +++ b/src/gpu_x86/gpu.c @@ -251,7 +251,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i if (alpha == 0.) { for (int64_t j=0 ; j Date: Fri, 28 Jun 2024 11:00:58 +0200 Subject: [PATCH 11/19] Added Nvidia module --- src/ccsd/ccsd_space_orb_sub.irp.f | 10 +- src/gpu/gpu_module.F90 | 6 +- src/gpu_nvidia/LIB | 1 + src/gpu_nvidia/NEED | 1 + src/gpu_nvidia/README.rst | 5 + src/gpu_nvidia/gpu.c | 327 ++++++++++++++++++++++++++++++ src/gpu_x86/gpu.c | 40 ++-- 7 files changed, 359 insertions(+), 31 deletions(-) create mode 100644 src/gpu_nvidia/LIB create mode 100644 src/gpu_nvidia/NEED create mode 100644 src/gpu_nvidia/README.rst create mode 100644 src/gpu_nvidia/gpu.c diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 455d62f7..e7c9b1ab 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -384,17 +384,17 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau) !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) - do j=1,nO - !$OMP DO - do b=1,nV - call gpu_set_stream(blas,stream(b)) + !$OMP DO + do b=1,nV + call gpu_set_stream(blas,stream(b)) + do j=1,nO call gpu_dgeam_c(blas%c, 'N', 'N', nO*1_8, nV*1_8, & 1.d0, c_loc(t2%f(1,j,1,b)), nO*nO*1_8, & h_t1(j,b), t1%c, nO*1_8, & c_loc(tau%f(1,j,1,b)), nO*nO*1_8) enddo - !$OMP END DO enddo + !$OMP END DO !$OMP END PARALLEL call gpu_synchronize() diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90 index 51f80ac0..d1ddad4c 100644 --- a/src/gpu/gpu_module.F90 +++ b/src/gpu/gpu_module.F90 @@ -120,7 +120,7 @@ module gpu subroutine gpu_ddot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_ddot') import - type(c_ptr), intent(in), value :: handle + type(c_ptr), intent(in) :: handle integer(c_int64_t), value :: n, incx, incy type(c_ptr), intent(in), value :: dx, dy real(c_double), intent(out) :: res @@ -128,7 +128,7 @@ module gpu subroutine gpu_sdot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_sdot') import - type(c_ptr), intent(in), value :: handle + type(c_ptr), intent(in) :: handle integer(c_int64_t), value :: n, incx, incy type(c_ptr), intent(in), value :: dx, dy real(c_float), intent(out) :: res @@ -137,7 +137,7 @@ module gpu subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, & b, ldb, c, ldc) bind(C, name='gpu_dgeam') import - type(c_ptr), intent(in), value :: handle + type(c_ptr), intent(in) :: handle character(c_char), intent(in), value :: transa, transb integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc real(c_double), intent(in), value :: alpha, beta diff --git a/src/gpu_nvidia/LIB b/src/gpu_nvidia/LIB new file mode 100644 index 00000000..91f54e91 --- /dev/null +++ b/src/gpu_nvidia/LIB @@ -0,0 +1 @@ +-lcudart -lcublas -lcublasLt diff --git a/src/gpu_nvidia/NEED b/src/gpu_nvidia/NEED new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/src/gpu_nvidia/NEED @@ -0,0 +1 @@ + diff --git a/src/gpu_nvidia/README.rst b/src/gpu_nvidia/README.rst new file mode 100644 index 00000000..5dcfca92 --- /dev/null +++ b/src/gpu_nvidia/README.rst @@ -0,0 +1,5 @@ +========== +gpu_nvidia +========== + +Nvidia implementation of GPU routines. Uses CUDA and CUBLAS libraries. diff --git a/src/gpu_nvidia/gpu.c b/src/gpu_nvidia/gpu.c new file mode 100644 index 00000000..f0bd247a --- /dev/null +++ b/src/gpu_nvidia/gpu.c @@ -0,0 +1,327 @@ +#include +#include +#include +#include +#include + +#include +#include + + +/* Generic functions */ + +int gpu_ndevices() { + int ngpus; + cudaGetDeviceCount(&ngpus); + return ngpus; +} + +void gpu_set_device(int32_t igpu) { + cudaSetDevice(igpu); +} + + +/* Allocation functions */ + +void gpu_allocate(void** ptr, const int64_t size) { + size_t free, total; + cudaError_t rc = cudaMemGetInfo( &free, &total ); + if (rc != cudaSuccess) { + free = INT64_MAX; + } + + /* Use managed memory if it does not fit on the GPU */ + if (size < free && size < total/2) { +// rc= cudaMalloc(ptr, size); + rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal); + } else { + rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal); + } + assert (rc == cudaSuccess); +} + +void gpu_deallocate(void** ptr) { + assert (*ptr != NULL); + cudaFree(*ptr); + *ptr = NULL; +} + + +/* Memory transfer functions */ + +void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) { + cudaMemcpy (gpu_ptr, cpu_ptr, n, cudaMemcpyHostToDevice); +} + +void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) { + cudaMemcpy (cpu_ptr, gpu_ptr, n, cudaMemcpyDeviceToHost); +} + +void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) { + cudaMemcpy (gpu_ptr_dest, gpu_ptr_src, n, cudaMemcpyDeviceToDevice); +} + + +/* Streams */ + +void gpu_stream_create(void** ptr) { + cudaStream_t stream; + cudaError_t rc = cudaStreamCreate(&stream); + assert (rc == cudaSuccess); + *ptr = (void*) stream; +} + +void gpu_stream_destroy(void** ptr) { + assert (*ptr != NULL); + cudaError_t rc = cudaStreamDestroy( (cudaStream_t) *ptr); + assert (rc == cudaSuccess); + *ptr = NULL; +} + +void gpu_set_stream(void** handle, void** stream) { + cublasSetStream( (cublasHandle_t) *handle, (cudaStream_t) *stream); +} + +void gpu_synchronize() { + cudaDeviceSynchronize(); +} + + +/* BLAS functions */ + +void gpu_blas_create(void** handle) { + cublasHandle_t cublas_handle; + cublasStatus_t rc = cublasCreate(&cublas_handle); + assert (rc == CUBLAS_STATUS_SUCCESS); + *handle = (void*) cublas_handle; +} + + +void gpu_blas_destroy(void** handle) { + assert (*handle != NULL); + cublasStatus_t rc = cublasDestroy( (cublasHandle_t) *handle); + assert (rc == CUBLAS_STATUS_SUCCESS); + *handle = NULL; +} + + +void gpu_ddot(void** handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { + assert (*handle != NULL); + + /* Convert to int32_t */ + int32_t n_, incx_, incy_; + + n_ = (int32_t) n; + incx_ = (int32_t) incx; + incy_ = (int32_t) incy; + + /* Check for integer overflows */ + assert ( (int64_t) n_ == n ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + cublasDdot((cublasHandle_t) *handle, n_, x, incx_, y, incy_, result); +} + + + +void gpu_sdot(void** handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { + assert (*handle != NULL); + + /* Convert to int32_t */ + int32_t n_, incx_, incy_; + + n_ = (int32_t) n; + incx_ = (int32_t) incx; + incy_ = (int32_t) incy; + + /* Check for integer overflows */ + assert ( (int64_t) n_ == n ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + cublasSdot((cublasHandle_t) *handle, n_, x, incx_, y, incy_, result); +} + + + +void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t n, const double alpha, + const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) { + + assert (*handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, lda_, incx_, incy_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + lda_ = (int32_t) lda; + incx_ = (int32_t) incx; + incy_ = (int32_t) incy; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + cublasOperation_t transa_ = CUBLAS_OP_N; + if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; + + cublasDgemv((cublasHandle_t) *handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_); +} + + + +void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t n, const float alpha, + const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) { + + assert (*handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, lda_, incx_, incy_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + lda_ = (int32_t) lda; + incx_ = (int32_t) incx; + incy_ = (int32_t) incy; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + cublasOperation_t transa_ = CUBLAS_OP_N; + if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; + + cublasSgemv((cublasHandle_t) *handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_); +} + + +void gpu_dgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, + const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) { + + assert (*handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, k_, lda_, ldb_, ldc_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + k_ = (int32_t) k; + lda_ = (int32_t) lda; + ldb_ = (int32_t) ldb; + ldc_ = (int32_t) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) k_ == k ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + cublasOperation_t transa_ = CUBLAS_OP_N; + cublasOperation_t transb_ = CUBLAS_OP_N; + if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; + if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; + + cublasDgemm((cublasHandle_t) *handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_); +} + + + +void gpu_sgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, + const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) { + + assert (*handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, k_, lda_, ldb_, ldc_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + k_ = (int32_t) k; + lda_ = (int32_t) lda; + ldb_ = (int32_t) ldb; + ldc_ = (int32_t) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) k_ == k ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + cublasOperation_t transa_ = CUBLAS_OP_N; + cublasOperation_t transb_ = CUBLAS_OP_N; + if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; + if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; + + cublasSgemm((cublasHandle_t) *handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_); +} + + +void gpu_dgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, + const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { + assert (*handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, lda_, ldb_, ldc_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + lda_ = (int32_t) lda; + ldb_ = (int32_t) ldb; + ldc_ = (int32_t) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + cublasOperation_t transa_ = CUBLAS_OP_N; + cublasOperation_t transb_ = CUBLAS_OP_N; + if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; + if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; + + cublasDgeam((cublasHandle_t) *handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_); + +} + + +void gpu_sgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, + const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) { + assert (*handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, lda_, ldb_, ldc_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + lda_ = (int32_t) lda; + ldb_ = (int32_t) ldb; + ldc_ = (int32_t) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + cublasOperation_t transa_ = CUBLAS_OP_N; + cublasOperation_t transb_ = CUBLAS_OP_N; + if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; + if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; + + cublasSgeam((cublasHandle_t) *handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_); + +} diff --git a/src/gpu_x86/gpu.c b/src/gpu_x86/gpu.c index 5f42cb0d..ac7c3620 100644 --- a/src/gpu_x86/gpu.c +++ b/src/gpu_x86/gpu.c @@ -56,7 +56,7 @@ void gpu_stream_destroy(void** ptr) { *ptr = NULL; } -void gpu_set_stream(void* handle, void* stream) { +void gpu_set_stream(void** handle, void** stream) { return; } @@ -79,8 +79,8 @@ void gpu_blas_destroy(void** handle) { double ddot_(const int32_t* n, const double* x, const int32_t* incx, const double* y, const int32_t* incy); -void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { - assert (handle != NULL); +void gpu_ddot(void** handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { + assert (*handle != NULL); /* Convert to int32_t */ int32_t n_, incx_, incy_; @@ -100,8 +100,8 @@ void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_ float sdot_(const int32_t* n, const float* x, const int32_t* incx, const float* y, const int32_t* incy); -void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { - assert (handle != NULL); +void gpu_sdot(void** handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { + assert (*handle != NULL); /* Convert to int32_t */ int32_t n_, incx_, incy_; @@ -122,10 +122,10 @@ void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha, const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy); -void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double alpha, +void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t n, const double alpha, const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) { - assert (handle != NULL); + assert (*handle != NULL); /* Convert to int32_t */ int32_t m_, n_, lda_, incx_, incy_; @@ -150,10 +150,10 @@ void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha, const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy); -void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float alpha, +void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t n, const float alpha, const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) { - assert (handle != NULL); + assert (*handle != NULL); /* Convert to int32_t */ int32_t m_, n_, lda_, incx_, incy_; @@ -178,10 +178,10 @@ void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha, const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc); -void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, +void gpu_dgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) { - assert (handle != NULL); + assert (*handle != NULL); /* Convert to int32_t */ int32_t m_, n_, k_, lda_, ldb_, ldc_; @@ -209,10 +209,10 @@ void gpu_dgemm(const void* handle, const char transa, const char transb, const i void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha, const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc); -void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, +void gpu_sgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) { - assert (handle != NULL); + assert (*handle != NULL); /* Convert to int32_t */ int32_t m_, n_, k_, lda_, ldb_, ldc_; @@ -236,12 +236,9 @@ void gpu_sgemm(const void* handle, const char transa, const char transb, const i } -void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, +void gpu_dgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { - if (handle == NULL) { - perror("NULL handle"); - exit(-1); - } + assert (*handle != NULL); if ( (transa == 'N' && transb == 'N') || (transa == 'n' && transb == 'N') || @@ -371,12 +368,9 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i } -void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, +void gpu_sgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) { - if (handle == NULL) { - perror("NULL handle"); - exit(-1); - } + assert (*handle != NULL); if ( (transa == 'N' && transb == 'N') || (transa == 'n' && transb == 'N') || From d3d89022bc8092ab0c6131904f85475f160dfa53 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Fri, 28 Jun 2024 16:50:52 +0200 Subject: [PATCH 12/19] Move GPU modules to plugins --- configure | 6 +++--- {src => plugins/local}/gpu_nvidia/LIB | 0 {src => plugins/local}/gpu_nvidia/NEED | 0 {src => plugins/local}/gpu_nvidia/README.rst | 0 {src => plugins/local}/gpu_nvidia/gpu.c | 0 {src => plugins/local}/gpu_x86/NEED | 0 {src => plugins/local}/gpu_x86/README.rst | 0 {src => plugins/local}/gpu_x86/gpu.c | 0 8 files changed, 3 insertions(+), 3 deletions(-) rename {src => plugins/local}/gpu_nvidia/LIB (100%) rename {src => plugins/local}/gpu_nvidia/NEED (100%) rename {src => plugins/local}/gpu_nvidia/README.rst (100%) rename {src => plugins/local}/gpu_nvidia/gpu.c (100%) rename {src => plugins/local}/gpu_x86/NEED (100%) rename {src => plugins/local}/gpu_x86/README.rst (100%) rename {src => plugins/local}/gpu_x86/gpu.c (100%) diff --git a/configure b/configure index db158966..08dac444 100755 --- a/configure +++ b/configure @@ -119,15 +119,15 @@ rm -f ${QP_ROOT}/src/gpu_arch case "$GPU" in amd) # Nvidia echo "Activating AMD GPU acceleration" - ln -s ${QP_ROOT}/src/gpu_amd ${QP_ROOT}/src/gpu_arch + ln -s ${QP_ROOT}/plugins/local/gpu_amd ${QP_ROOT}/src/gpu_arch ;; nvidia) # Nvidia echo "Activating Nvidia GPU acceleration" - ln -s ${QP_ROOT}/src/gpu_nvidia ${QP_ROOT}/src/gpu_arch + ln -s ${QP_ROOT}/plugins/local/gpu_nvidia ${QP_ROOT}/src/gpu_arch ;; *) # No Acceleration echo "Disabling GPU acceleration" - ln -s ${QP_ROOT}/src/gpu_x86 ${QP_ROOT}/src/gpu_arch + ln -s ${QP_ROOT}/plugins/local/gpu_x86 ${QP_ROOT}/src/gpu_arch ;; esac diff --git a/src/gpu_nvidia/LIB b/plugins/local/gpu_nvidia/LIB similarity index 100% rename from src/gpu_nvidia/LIB rename to plugins/local/gpu_nvidia/LIB diff --git a/src/gpu_nvidia/NEED b/plugins/local/gpu_nvidia/NEED similarity index 100% rename from src/gpu_nvidia/NEED rename to plugins/local/gpu_nvidia/NEED diff --git a/src/gpu_nvidia/README.rst b/plugins/local/gpu_nvidia/README.rst similarity index 100% rename from src/gpu_nvidia/README.rst rename to plugins/local/gpu_nvidia/README.rst diff --git a/src/gpu_nvidia/gpu.c b/plugins/local/gpu_nvidia/gpu.c similarity index 100% rename from src/gpu_nvidia/gpu.c rename to plugins/local/gpu_nvidia/gpu.c diff --git a/src/gpu_x86/NEED b/plugins/local/gpu_x86/NEED similarity index 100% rename from src/gpu_x86/NEED rename to plugins/local/gpu_x86/NEED diff --git a/src/gpu_x86/README.rst b/plugins/local/gpu_x86/README.rst similarity index 100% rename from src/gpu_x86/README.rst rename to plugins/local/gpu_x86/README.rst diff --git a/src/gpu_x86/gpu.c b/plugins/local/gpu_x86/gpu.c similarity index 100% rename from src/gpu_x86/gpu.c rename to plugins/local/gpu_x86/gpu.c From 85b1035cfba778559e629045961cb542631841bd Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Fri, 28 Jun 2024 17:33:08 +0200 Subject: [PATCH 13/19] Working on CCSD --- src/ccsd/ccsd_space_orb_sub.irp.f | 117 +++++++++++------------------- src/gpu/gpu_module.F90 | 62 ++++++++++++++++ 2 files changed, 103 insertions(+), 76 deletions(-) diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index e7c9b1ab..1329f172 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -14,6 +14,9 @@ subroutine run_ccsd_space_orb type(gpu_double2) :: t1, r1 type(gpu_double2) :: H_oo, H_vv, H_vo + type(gpu_double2) :: d_cc_space_f_vo + type(gpu_double4) :: d_cc_space_v_oovv + double precision, allocatable :: all_err(:,:), all_t(:,:) integer, allocatable :: list_occ(:), list_vir(:) integer(bit_kind) :: det(N_int,2) @@ -52,6 +55,12 @@ subroutine run_ccsd_space_orb !print*,'occ',list_occ !print*,'vir',list_vir + call gpu_allocate(d_cc_space_f_vo, nV, nO) + call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV) + call gpu_upload(cc_space_f_vo, d_cc_space_f_vo) + call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv) + + call gpu_allocate(t2, nO,nO,nV,nV) call gpu_allocate(r2, nO,nO,nV,nV) call gpu_allocate(tau, nO,nO,nV,nV) @@ -116,7 +125,8 @@ subroutine run_ccsd_space_orb !print*,'hf_energy', hf_energy call det_energy(det,uncorr_energy) print*,'Det energy', uncorr_energy - call ccsd_energy_space_x(nO,nV,tau_x%f,t1%f,energy) + + call ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy) print*,'Guess energy', uncorr_energy+energy, energy nb_iter = 0 @@ -166,7 +176,7 @@ subroutine run_ccsd_space_orb call update_tau_x_space(nO,nV,tau,tau_x) ! Energy - call ccsd_energy_space_x(nO,nV,tau_x%f,t1%f,energy) + call ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy) write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |' nb_iter = nb_iter + 1 @@ -239,6 +249,8 @@ subroutine run_ccsd_space_orb call save_energy(uncorr_energy + energy, e_t) deallocate(h_t1, h_t2) + call gpu_deallocate(d_cc_space_f_vo) + call gpu_deallocate(d_cc_space_v_oovv) call gpu_deallocate(t1) call gpu_deallocate(t2) @@ -246,59 +258,14 @@ end ! Energy -subroutine ccsd_energy_space(nO,nV,tau,t1,energy) - +subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy) + use gpu implicit none - integer, intent(in) :: nO, nV - double precision, intent(in) :: tau(nO,nO,nV,nV) - double precision, intent(in) :: t1(nO,nV) - double precision, intent(out) :: energy - - ! internal - integer :: i,j,a,b - double precision :: e - - energy = 0d0 - !$omp parallel & - !$omp shared(nO,nV,energy,tau,t1,& - !$omp cc_space_f_vo,cc_space_w_oovv) & - !$omp private(i,j,a,b,e) & - !$omp default(none) - e = 0d0 - !$omp do - do a = 1, nV - do i = 1, nO - e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a) - enddo - enddo - !$omp end do nowait - !$omp do - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - e = e + tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b) - enddo - enddo - enddo - enddo - !$omp end do nowait - !$omp critical - energy = energy + e - !$omp end critical - !$omp end parallel - -end - -subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy) - - implicit none - - integer, intent(in) :: nO, nV - double precision, intent(in) :: tau_x(nO,nO,nV,nV) - double precision, intent(in) :: t1(nO,nV) - double precision, intent(out) :: energy + integer, intent(in) :: nO, nV + type(gpu_double4), intent(in) :: tau_x, d_cc_space_v_oovv + type(gpu_double2), intent(in) :: t1, d_cc_space_f_vo + double precision, intent(out) :: energy ! internal integer :: i,j,a,b @@ -307,14 +274,14 @@ subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy) energy = 0d0 !$omp parallel & !$omp shared(nO,nV,energy,tau_x,t1,& - !$omp cc_space_f_vo,cc_space_v_oovv) & + !$omp d_cc_space_f_vo,d_cc_space_v_oovv) & !$omp private(i,j,a,b,e) & !$omp default(none) e = 0d0 !$omp do do a = 1, nV do i = 1, nO - e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a) + e = e + 2d0 * d_cc_space_f_vo%f(a,i) * t1%f(i,a) enddo enddo !$omp end do nowait @@ -323,7 +290,7 @@ subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy) do a = 1, nV do j = 1, nO do i = 1, nO - e = e + tau_x(i,j,a,b) * cc_space_v_oovv(i,j,a,b) + e = e + tau_x%f(i,j,a,b) * d_cc_space_v_oovv%f(i,j,a,b) enddo enddo enddo @@ -333,6 +300,12 @@ subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy) energy = energy + e !$omp end critical !$omp end parallel +! +! +! call gpu_ddot(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1, d_cc_space_v_oovv, 1, energy) +! call gpu_ddot(blas_handle, nO*nV*1_8, d_cc_space_f_vo, 1, t1, 1, e) +! energy = energy + 2.d0*e + end @@ -372,26 +345,24 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau) ! !$OMP END PARALLEL - type(gpu_blas) :: blas type(gpu_stream) :: stream(nV) - call gpu_blas_create(blas) do b=1,nV call gpu_stream_create(stream(b)) enddo !$OMP PARALLEL & - !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas) & + !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) !$OMP DO do b=1,nV - call gpu_set_stream(blas,stream(b)) + call gpu_set_stream(blas_handle,stream(b)) do j=1,nO - call gpu_dgeam_c(blas%c, 'N', 'N', nO*1_8, nV*1_8, & - 1.d0, c_loc(t2%f(1,j,1,b)), nO*nO*1_8, & - h_t1(j,b), t1%c, nO*1_8, & - c_loc(tau%f(1,j,1,b)), nO*nO*1_8) + call gpu_dgeam(blas_handle, 'N', 'N', nO*1_8, nV*1_8, & + 1.d0, t2%f(1,j,1,b), nO*nO*1_8, & + h_t1(j,b), t1%f, nO*1_8, & + tau%f(1,j,1,b), nO*nO*1_8) enddo enddo !$OMP END DO @@ -403,8 +374,6 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau) call gpu_stream_destroy(stream(b)) enddo - call gpu_blas_destroy(blas) - end subroutine update_tau_x_space(nO,nV,tau,tau_x) @@ -438,26 +407,24 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x) ! !$OMP END DO ! !$OMP END PARALLEL - type(gpu_blas) :: blas type(gpu_stream) :: stream(nV) - call gpu_blas_create(blas) do a=1,nV call gpu_stream_create(stream(a)) enddo !$OMP PARALLEL & - !$OMP SHARED(nO,nV,tau,tau_x,stream,blas) & + !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) !$OMP DO do b=1,nV do a=1,nV - call gpu_set_stream(blas,stream(a)) - call gpu_dgeam_c(blas%c, 'N', 'N', nO*1_8, nO*1_8, & - 2.d0, c_loc(tau%f(1,1,a,b)), nO*1_8, & - -1.d0, c_loc(tau%f(1,1,b,a)), nO*1_8, & - c_loc(tau_x%f(1,1,a,b)), nO*1_8) + call gpu_set_stream(blas_handle,stream(a)) + call gpu_dgeam(blas_handle, 'N', 'N', nO*1_8, nO*1_8, & + 2.d0, tau%f(1,1,a,b), nO*1_8, & + -1.d0, tau%f(1,1,b,a), nO*1_8, & + tau_x%f(1,1,a,b), nO*1_8) enddo enddo !$OMP END DO @@ -469,8 +436,6 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x) call gpu_stream_destroy(stream(b)) enddo - call gpu_blas_destroy(blas) - end ! R1 diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90 index d1ddad4c..2057d1eb 100644 --- a/src/gpu/gpu_module.F90 +++ b/src/gpu/gpu_module.F90 @@ -144,6 +144,16 @@ module gpu type(c_ptr), value :: a, b, c end subroutine + subroutine gpu_sgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, & + b, ldb, c, ldc) bind(C, name='gpu_sgeam') + import + type(c_ptr), intent(in) :: handle + character(c_char), intent(in), value :: transa, transb + integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc + real(c_float), intent(in), value :: alpha, beta + type(c_ptr), value :: a, b, c + end subroutine + end interface @@ -478,5 +488,57 @@ module gpu call gpu_blas_destroy_c(handle%c) end subroutine + end module + + +! dot +! --- + +subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) + use gpu + type(gpu_blas), intent(in) :: handle + integer*8 :: n, incx, incy + double precision, intent(in) :: dx(*), dy(*) + double precision, intent(out) :: res + call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res) +end subroutine + +subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res) + use gpu + type(gpu_blas), intent(in) :: handle + integer*8 :: n, incx, incy + real, intent(in) :: dx(*), dy(*) + real, intent(out) :: res + call gpu_sdot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res) +end subroutine + + +! geam +! ---- + +subroutine gpu_dgeam(handle, transa, transb, m, n, alpha, a, lda, beta, & + b, ldb, c, ldc) + use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*8, intent(in) :: m, n, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision :: a(lda,*), b(ldb,*), c(ldc,*) + call gpu_dgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, & + c_loc(b), ldb, c_loc(c), ldc) +end subroutine + +subroutine gpu_sgeam(handle, transa, transb, m, n, alpha, a, lda, beta, & + b, ldb, c, ldc) + use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*8, intent(in) :: m, n, lda, ldb, ldc + real, intent(in) :: alpha, beta + real :: a(lda,*), b(ldb,*), c(ldc,*) + call gpu_sgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, & + c_loc(b), ldb, c_loc(c), ldc) +end subroutine + From a5f4f0516eec9f17438474529616368a6f9e5de4 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Fri, 28 Jun 2024 17:39:43 +0200 Subject: [PATCH 14/19] Fixing compile --- src/ccsd/ccsd_space_orb_sub.irp.f | 1 - src/gpu/gpu_module.F90 | 23 +++++++++-------------- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 1329f172..4e06e31d 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -306,7 +306,6 @@ subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1, ! call gpu_ddot(blas_handle, nO*nV*1_8, d_cc_space_f_vo, 1, t1, 1, e) ! energy = energy + 2.d0*e - end ! Tau diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90 index 2057d1eb..d7c26ba6 100644 --- a/src/gpu/gpu_module.F90 +++ b/src/gpu/gpu_module.F90 @@ -365,42 +365,42 @@ module gpu subroutine gpu_download_double1(gpu_ptr, cpu_ptr) implicit none type(gpu_double1), intent(in) :: gpu_ptr - double precision, intent(in) :: cpu_ptr(:) + double precision, target, intent(in) :: cpu_ptr(:) call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*size(gpu_ptr%f)) end subroutine subroutine gpu_download_double2(gpu_ptr, cpu_ptr) implicit none type(gpu_double2), intent(in) :: gpu_ptr - double precision, intent(in) :: cpu_ptr(:,:) + double precision, target, intent(in) :: cpu_ptr(:,:) call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) end subroutine subroutine gpu_download_double3(gpu_ptr, cpu_ptr) implicit none type(gpu_double3), intent(in) :: gpu_ptr - double precision, intent(in) :: cpu_ptr(:,:,:) + double precision, target, intent(in) :: cpu_ptr(:,:,:) call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) end subroutine subroutine gpu_download_double4(gpu_ptr, cpu_ptr) implicit none type(gpu_double4), intent(in) :: gpu_ptr - double precision, intent(in) :: cpu_ptr(:,:,:,:) + double precision, target, intent(in) :: cpu_ptr(:,:,:,:) call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) end subroutine subroutine gpu_download_double5(gpu_ptr, cpu_ptr) implicit none type(gpu_double5), intent(in) :: gpu_ptr - double precision, intent(in) :: cpu_ptr(:,:,:,:,:) + double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:) call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) end subroutine subroutine gpu_download_double6(gpu_ptr, cpu_ptr) implicit none type(gpu_double6), intent(in) :: gpu_ptr - double precision, intent(in) :: cpu_ptr(:,:,:,:,:,:) + double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:,:) call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) end subroutine @@ -454,19 +454,16 @@ module gpu ! ---------- subroutine gpu_stream_create(stream) - import type(gpu_stream) :: stream call gpu_stream_create_c(stream%c) end subroutine subroutine gpu_stream_destroy(stream) - import type(gpu_stream) :: stream call gpu_stream_destroy_c(stream%c) end subroutine subroutine gpu_set_stream(handle, stream) - import type(gpu_blas) :: handle type(gpu_stream) :: stream call gpu_set_stream_c(handle%c, stream%c) @@ -477,13 +474,11 @@ module gpu ! -------- subroutine gpu_blas_create(handle) - import type(gpu_blas) :: handle call gpu_blas_create_c(handle%c) end subroutine subroutine gpu_blas_destroy(handle) - import type(gpu_blas) :: handle call gpu_blas_destroy_c(handle%c) end subroutine @@ -500,7 +495,7 @@ subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) use gpu type(gpu_blas), intent(in) :: handle integer*8 :: n, incx, incy - double precision, intent(in) :: dx(*), dy(*) + double precision, target, intent(in) :: dx(*), dy(*) double precision, intent(out) :: res call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res) end subroutine @@ -525,7 +520,7 @@ subroutine gpu_dgeam(handle, transa, transb, m, n, alpha, a, lda, beta, & character, intent(in) :: transa, transb integer*8, intent(in) :: m, n, lda, ldb, ldc double precision, intent(in) :: alpha, beta - double precision :: a(lda,*), b(ldb,*), c(ldc,*) + double precision, target :: a(lda,*), b(ldb,*), c(ldc,*) call gpu_dgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, & c_loc(b), ldb, c_loc(c), ldc) end subroutine @@ -537,7 +532,7 @@ subroutine gpu_sgeam(handle, transa, transb, m, n, alpha, a, lda, beta, & character, intent(in) :: transa, transb integer*8, intent(in) :: m, n, lda, ldb, ldc real, intent(in) :: alpha, beta - real :: a(lda,*), b(ldb,*), c(ldc,*) + real, target :: a(lda,*), b(ldb,*), c(ldc,*) call gpu_sgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, & c_loc(b), ldb, c_loc(c), ldc) end subroutine From c7df9a72cc68a7f5dfded36aa94ac50d5188a5a1 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Fri, 28 Jun 2024 21:32:04 +0200 Subject: [PATCH 15/19] Fixing again actions --- src/gpu/gpu_module.F90 | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90 index d7c26ba6..ecf79c83 100644 --- a/src/gpu/gpu_module.F90 +++ b/src/gpu/gpu_module.F90 @@ -318,42 +318,42 @@ module gpu subroutine gpu_upload_double1(cpu_ptr, gpu_ptr) implicit none - double precision, intent(in) :: cpu_ptr(:) + double precision, target, intent(in) :: cpu_ptr(*) type(gpu_double1), intent(in) :: gpu_ptr call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, 8_8*size(gpu_ptr%f)) end subroutine subroutine gpu_upload_double2(cpu_ptr, gpu_ptr) implicit none - double precision, intent(in) :: cpu_ptr(:,:) + double precision, target, intent(in) :: cpu_ptr(:,:) type(gpu_double2), intent(in) :: gpu_ptr call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) end subroutine subroutine gpu_upload_double3(cpu_ptr, gpu_ptr) implicit none - double precision, intent(in) :: cpu_ptr(:,:,:) + double precision, target, intent(in) :: cpu_ptr(:,:,:) type(gpu_double3), intent(in) :: gpu_ptr call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) end subroutine subroutine gpu_upload_double4(cpu_ptr, gpu_ptr) implicit none - double precision, intent(in) :: cpu_ptr(:,:,:,:) + double precision, target, intent(in) :: cpu_ptr(:,:,:,:) type(gpu_double4), intent(in) :: gpu_ptr call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) end subroutine subroutine gpu_upload_double5(cpu_ptr, gpu_ptr) implicit none - double precision, intent(in) :: cpu_ptr(:,:,:,:,:) + double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:) type(gpu_double5), intent(in) :: gpu_ptr call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) end subroutine subroutine gpu_upload_double6(cpu_ptr, gpu_ptr) implicit none - double precision, intent(in) :: cpu_ptr(:,:,:,:,:,:) + double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:,:) type(gpu_double6), intent(in) :: gpu_ptr call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) end subroutine @@ -504,7 +504,7 @@ subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res) use gpu type(gpu_blas), intent(in) :: handle integer*8 :: n, incx, incy - real, intent(in) :: dx(*), dy(*) + real, target, intent(in) :: dx(*), dy(*) real, intent(out) :: res call gpu_sdot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res) end subroutine From b467bef6dd1e14c5914cc6508aa898d5f1665e3a Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Fri, 28 Jun 2024 21:37:14 +0200 Subject: [PATCH 16/19] Forgot file --- src/ccsd/ccsd_space_orb_sub.irp.f | 68 +++++++++++++++---------------- src/gpu/gpu.irp.f | 11 +++++ 2 files changed, 45 insertions(+), 34 deletions(-) create mode 100644 src/gpu/gpu.irp.f diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 4e06e31d..5c2daa05 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -271,40 +271,40 @@ subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1, integer :: i,j,a,b double precision :: e - energy = 0d0 - !$omp parallel & - !$omp shared(nO,nV,energy,tau_x,t1,& - !$omp d_cc_space_f_vo,d_cc_space_v_oovv) & - !$omp private(i,j,a,b,e) & - !$omp default(none) - e = 0d0 - !$omp do - do a = 1, nV - do i = 1, nO - e = e + 2d0 * d_cc_space_f_vo%f(a,i) * t1%f(i,a) - enddo - enddo - !$omp end do nowait - !$omp do - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - e = e + tau_x%f(i,j,a,b) * d_cc_space_v_oovv%f(i,j,a,b) - enddo - enddo - enddo - enddo - !$omp end do nowait - !$omp critical - energy = energy + e - !$omp end critical - !$omp end parallel -! -! -! call gpu_ddot(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1, d_cc_space_v_oovv, 1, energy) -! call gpu_ddot(blas_handle, nO*nV*1_8, d_cc_space_f_vo, 1, t1, 1, e) -! energy = energy + 2.d0*e +! energy = 0d0 +! !$omp parallel & +! !$omp shared(nO,nV,energy,tau_x,t1,& +! !$omp d_cc_space_f_vo,d_cc_space_v_oovv) & +! !$omp private(i,j,a,b,e) & +! !$omp default(none) +! e = 0d0 +! !$omp do +! do a = 1, nV +! do i = 1, nO +! e = e + 2d0 * d_cc_space_f_vo%f(a,i) * t1%f(i,a) +! enddo +! enddo +! !$omp end do nowait +! !$omp do +! do b = 1, nV +! do a = 1, nV +! do j = 1, nO +! do i = 1, nO +! e = e + tau_x%f(i,j,a,b) * d_cc_space_v_oovv%f(i,j,a,b) +! enddo +! enddo +! enddo +! enddo +! !$omp end do nowait +! !$omp critical +! energy = energy + e +! !$omp end critical +! !$omp end parallel + + + call gpu_ddot(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1, d_cc_space_v_oovv, 1, energy) + call gpu_ddot(blas_handle, nO*nV*1_8, d_cc_space_f_vo, 1, t1, 1, e) + energy = energy + 2.d0*e end diff --git a/src/gpu/gpu.irp.f b/src/gpu/gpu.irp.f new file mode 100644 index 00000000..e91d66f5 --- /dev/null +++ b/src/gpu/gpu.irp.f @@ -0,0 +1,11 @@ +use gpu + +BEGIN_PROVIDER [ type(gpu_blas), blas_handle ] + implicit none + BEGIN_DOC + ! Handle for cuBLAS or RocBLAS + END_DOC + call gpu_blas_create(blas_handle) +END_PROVIDER + + From 860121d404f7ae255790cd12136139103bdc48d0 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Sat, 29 Jun 2024 02:27:50 +0200 Subject: [PATCH 17/19] H_oo on GPU --- plugins/local/gpu_nvidia/gpu.c | 224 +++++++++++---------- plugins/local/gpu_x86/gpu.c | 38 ++-- src/ccsd/ccsd_space_orb_sub.irp.f | 112 ++++++++--- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 127 ++++++++---- src/gpu/gpu.irp.f | 7 + src/gpu/gpu_module.F90 | 260 +++++++++++++++++++++---- 6 files changed, 540 insertions(+), 228 deletions(-) diff --git a/plugins/local/gpu_nvidia/gpu.c b/plugins/local/gpu_nvidia/gpu.c index f0bd247a..189de64c 100644 --- a/plugins/local/gpu_nvidia/gpu.c +++ b/plugins/local/gpu_nvidia/gpu.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -10,6 +11,10 @@ /* Generic functions */ +bool no_gpu() { + return false; +} + int gpu_ndevices() { int ngpus; cudaGetDeviceCount(&ngpus); @@ -17,7 +22,7 @@ int gpu_ndevices() { } void gpu_set_device(int32_t igpu) { - cudaSetDevice(igpu); + cudaSetDevice((int) igpu); } @@ -64,22 +69,20 @@ void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) { /* Streams */ -void gpu_stream_create(void** ptr) { - cudaStream_t stream; - cudaError_t rc = cudaStreamCreate(&stream); +void gpu_stream_create(cudaStream_t* ptr) { + cudaError_t rc = cudaStreamCreate(ptr); assert (rc == cudaSuccess); - *ptr = (void*) stream; } -void gpu_stream_destroy(void** ptr) { - assert (*ptr != NULL); - cudaError_t rc = cudaStreamDestroy( (cudaStream_t) *ptr); +void gpu_stream_destroy(cudaStream_t* ptr) { + assert (ptr != NULL); + cudaError_t rc = cudaStreamDestroy(*ptr); assert (rc == cudaSuccess); *ptr = NULL; } -void gpu_set_stream(void** handle, void** stream) { - cublasSetStream( (cublasHandle_t) *handle, (cudaStream_t) *stream); +void gpu_set_stream(cublasHandle_t handle, cudaStream_t stream) { + cublasSetStream(handle, stream); } void gpu_synchronize() { @@ -89,75 +92,80 @@ void gpu_synchronize() { /* BLAS functions */ -void gpu_blas_create(void** handle) { - cublasHandle_t cublas_handle; - cublasStatus_t rc = cublasCreate(&cublas_handle); +void gpu_blas_create(cublasHandle_t* ptr) { + cublasStatus_t rc = cublasCreate(ptr); assert (rc == CUBLAS_STATUS_SUCCESS); - *handle = (void*) cublas_handle; } -void gpu_blas_destroy(void** handle) { - assert (*handle != NULL); - cublasStatus_t rc = cublasDestroy( (cublasHandle_t) *handle); +void gpu_blas_destroy(cublasHandle_t* ptr) { + assert (ptr != NULL); + cublasStatus_t rc = cublasDestroy(*ptr); assert (rc == CUBLAS_STATUS_SUCCESS); - *handle = NULL; + ptr = NULL; } -void gpu_ddot(void** handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { - assert (*handle != NULL); +void gpu_ddot(cublasHandle_t handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { + assert (handle != NULL); + /* Convert to int */ + int n_, incx_, incy_; - /* Convert to int32_t */ - int32_t n_, incx_, incy_; + n_ = (int) n; + incx_ = (int) incx; + incy_ = (int) incy; - n_ = (int32_t) n; - incx_ = (int32_t) incx; - incy_ = (int32_t) incy; + assert ( (int64_t) n_ == n ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + cublasStatus_t rc = cublasDdot(handle, n_, x, incx_, y, incy_, result); +/* + double alpha = 1.0; + double beta = 0.0; + cublasStatus_t rc = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, n_, &alpha, x, 1, y, n_, &beta, &result_, 1); +*/ + assert (rc == CUBLAS_STATUS_SUCCESS); +} + + + +void gpu_sdot(cublasHandle_t handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { + assert (handle != NULL); + + /* Convert to int */ + int n_, incx_, incy_; + + n_ = (int) n; + incx_ = (int) incx; + incy_ = (int) incy; /* Check for integer overflows */ assert ( (int64_t) n_ == n ); assert ( (int64_t) incx_ == incx); assert ( (int64_t) incy_ == incy); - cublasDdot((cublasHandle_t) *handle, n_, x, incx_, y, incy_, result); + float result_ = 0.; + cublasStatus_t rc = cublasSdot(handle, n_, x, incx_, y, incy_, &result_); + assert (rc == CUBLAS_STATUS_SUCCESS); + *result = result_; } -void gpu_sdot(void** handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { - assert (*handle != NULL); - - /* Convert to int32_t */ - int32_t n_, incx_, incy_; - - n_ = (int32_t) n; - incx_ = (int32_t) incx; - incy_ = (int32_t) incy; - - /* Check for integer overflows */ - assert ( (int64_t) n_ == n ); - assert ( (int64_t) incx_ == incx); - assert ( (int64_t) incy_ == incy); - - cublasSdot((cublasHandle_t) *handle, n_, x, incx_, y, incy_, result); -} - - - -void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t n, const double alpha, +void gpu_dgemv(cublasHandle_t handle, const char transa, const int64_t m, const int64_t n, const double alpha, const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) { - assert (*handle != NULL); + assert (handle != NULL); - /* Convert to int32_t */ - int32_t m_, n_, lda_, incx_, incy_; + /* Convert to int */ + int m_, n_, lda_, incx_, incy_; - m_ = (int32_t) m; - n_ = (int32_t) n; - lda_ = (int32_t) lda; - incx_ = (int32_t) incx; - incy_ = (int32_t) incy; + m_ = (int) m; + n_ = (int) n; + lda_ = (int) lda; + incx_ = (int) incx; + incy_ = (int) incy; /* Check for integer overflows */ assert ( (int64_t) m_ == m ); @@ -169,24 +177,24 @@ void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t cublasOperation_t transa_ = CUBLAS_OP_N; if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; - cublasDgemv((cublasHandle_t) *handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_); + cublasDgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_); } -void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t n, const float alpha, +void gpu_sgemv(cublasHandle_t handle, const char transa, const int64_t m, const int64_t n, const float alpha, const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) { - assert (*handle != NULL); + assert (handle != NULL); - /* Convert to int32_t */ - int32_t m_, n_, lda_, incx_, incy_; + /* Convert to int */ + int m_, n_, lda_, incx_, incy_; - m_ = (int32_t) m; - n_ = (int32_t) n; - lda_ = (int32_t) lda; - incx_ = (int32_t) incx; - incy_ = (int32_t) incy; + m_ = (int) m; + n_ = (int) n; + lda_ = (int) lda; + incx_ = (int) incx; + incy_ = (int) incy; /* Check for integer overflows */ assert ( (int64_t) m_ == m ); @@ -198,24 +206,24 @@ void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t cublasOperation_t transa_ = CUBLAS_OP_N; if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; - cublasSgemv((cublasHandle_t) *handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_); + cublasSgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_); } -void gpu_dgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, +void gpu_dgemm(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) { - assert (*handle != NULL); + assert (handle != NULL); - /* Convert to int32_t */ - int32_t m_, n_, k_, lda_, ldb_, ldc_; + /* Convert to int */ + int m_, n_, k_, lda_, ldb_, ldc_; - m_ = (int32_t) m; - n_ = (int32_t) n; - k_ = (int32_t) k; - lda_ = (int32_t) lda; - ldb_ = (int32_t) ldb; - ldc_ = (int32_t) ldc; + m_ = (int) m; + n_ = (int) n; + k_ = (int) k; + lda_ = (int) lda; + ldb_ = (int) ldb; + ldc_ = (int) ldc; /* Check for integer overflows */ assert ( (int64_t) m_ == m ); @@ -230,25 +238,25 @@ void gpu_dgemm(void** handle, const char transa, const char transb, const int64_ if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; - cublasDgemm((cublasHandle_t) *handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_); + cublasDgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_); } -void gpu_sgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, +void gpu_sgemm(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) { - assert (*handle != NULL); + assert (handle != NULL); - /* Convert to int32_t */ - int32_t m_, n_, k_, lda_, ldb_, ldc_; + /* Convert to int */ + int m_, n_, k_, lda_, ldb_, ldc_; - m_ = (int32_t) m; - n_ = (int32_t) n; - k_ = (int32_t) k; - lda_ = (int32_t) lda; - ldb_ = (int32_t) ldb; - ldc_ = (int32_t) ldc; + m_ = (int) m; + n_ = (int) n; + k_ = (int) k; + lda_ = (int) lda; + ldb_ = (int) ldb; + ldc_ = (int) ldc; /* Check for integer overflows */ assert ( (int64_t) m_ == m ); @@ -263,22 +271,22 @@ void gpu_sgemm(void** handle, const char transa, const char transb, const int64_ if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; - cublasSgemm((cublasHandle_t) *handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_); + cublasSgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_); } -void gpu_dgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, +void gpu_dgeam(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { - assert (*handle != NULL); + assert (handle != NULL); - /* Convert to int32_t */ - int32_t m_, n_, lda_, ldb_, ldc_; + /* Convert to int */ + int m_, n_, lda_, ldb_, ldc_; - m_ = (int32_t) m; - n_ = (int32_t) n; - lda_ = (int32_t) lda; - ldb_ = (int32_t) ldb; - ldc_ = (int32_t) ldc; + m_ = (int) m; + n_ = (int) n; + lda_ = (int) lda; + ldb_ = (int) ldb; + ldc_ = (int) ldc; /* Check for integer overflows */ assert ( (int64_t) m_ == m ); @@ -292,23 +300,23 @@ void gpu_dgeam(void** handle, const char transa, const char transb, const int64_ if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; - cublasDgeam((cublasHandle_t) *handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_); + cublasDgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_); } -void gpu_sgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, +void gpu_sgeam(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) { - assert (*handle != NULL); + assert (handle != NULL); - /* Convert to int32_t */ - int32_t m_, n_, lda_, ldb_, ldc_; + /* Convert to int */ + int m_, n_, lda_, ldb_, ldc_; - m_ = (int32_t) m; - n_ = (int32_t) n; - lda_ = (int32_t) lda; - ldb_ = (int32_t) ldb; - ldc_ = (int32_t) ldc; + m_ = (int) m; + n_ = (int) n; + lda_ = (int) lda; + ldb_ = (int) ldb; + ldc_ = (int) ldc; /* Check for integer overflows */ assert ( (int64_t) m_ == m ); @@ -322,6 +330,6 @@ void gpu_sgeam(void** handle, const char transa, const char transb, const int64_ if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T; if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T; - cublasSgeam((cublasHandle_t) *handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_); + cublasSgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_); } diff --git a/plugins/local/gpu_x86/gpu.c b/plugins/local/gpu_x86/gpu.c index ac7c3620..53267a7c 100644 --- a/plugins/local/gpu_x86/gpu.c +++ b/plugins/local/gpu_x86/gpu.c @@ -2,8 +2,12 @@ #include #include #include +#include #include +bool no_gpu() { + return true; +} /* Generic functions */ @@ -56,7 +60,7 @@ void gpu_stream_destroy(void** ptr) { *ptr = NULL; } -void gpu_set_stream(void** handle, void** stream) { +void gpu_set_stream(void* handle, void* stream) { return; } @@ -79,8 +83,8 @@ void gpu_blas_destroy(void** handle) { double ddot_(const int32_t* n, const double* x, const int32_t* incx, const double* y, const int32_t* incy); -void gpu_ddot(void** handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { - assert (*handle != NULL); +void gpu_ddot(void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { + assert (handle != NULL); /* Convert to int32_t */ int32_t n_, incx_, incy_; @@ -100,8 +104,8 @@ void gpu_ddot(void** handle, const int64_t n, const double* x, const int64_t inc float sdot_(const int32_t* n, const float* x, const int32_t* incx, const float* y, const int32_t* incy); -void gpu_sdot(void** handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { - assert (*handle != NULL); +void gpu_sdot(void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { + assert (handle != NULL); /* Convert to int32_t */ int32_t n_, incx_, incy_; @@ -122,10 +126,10 @@ void gpu_sdot(void** handle, const int64_t n, const float* x, const int64_t incx void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha, const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy); -void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t n, const double alpha, +void gpu_dgemv(void* handle, const char transa, const int64_t m, const int64_t n, const double alpha, const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) { - assert (*handle != NULL); + assert (handle != NULL); /* Convert to int32_t */ int32_t m_, n_, lda_, incx_, incy_; @@ -150,10 +154,10 @@ void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha, const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy); -void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t n, const float alpha, +void gpu_sgemv(void* handle, const char transa, const int64_t m, const int64_t n, const float alpha, const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) { - assert (*handle != NULL); + assert (handle != NULL); /* Convert to int32_t */ int32_t m_, n_, lda_, incx_, incy_; @@ -178,10 +182,10 @@ void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha, const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc); -void gpu_dgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, +void gpu_dgemm(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) { - assert (*handle != NULL); + assert (handle != NULL); /* Convert to int32_t */ int32_t m_, n_, k_, lda_, ldb_, ldc_; @@ -209,10 +213,10 @@ void gpu_dgemm(void** handle, const char transa, const char transb, const int64_ void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha, const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc); -void gpu_sgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, +void gpu_sgemm(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) { - assert (*handle != NULL); + assert (handle != NULL); /* Convert to int32_t */ int32_t m_, n_, k_, lda_, ldb_, ldc_; @@ -236,9 +240,9 @@ void gpu_sgemm(void** handle, const char transa, const char transb, const int64_ } -void gpu_dgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, +void gpu_dgeam(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { - assert (*handle != NULL); + assert (handle != NULL); if ( (transa == 'N' && transb == 'N') || (transa == 'n' && transb == 'N') || @@ -368,9 +372,9 @@ void gpu_dgeam(void** handle, const char transa, const char transb, const int64_ } -void gpu_sgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, +void gpu_sgeam(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) { - assert (*handle != NULL); + assert (handle != NULL); if ( (transa == 'N' && transb == 'N') || (transa == 'n' && transb == 'N') || diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 5c2daa05..5ee7366e 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -14,9 +14,15 @@ subroutine run_ccsd_space_orb type(gpu_double2) :: t1, r1 type(gpu_double2) :: H_oo, H_vv, H_vo - type(gpu_double2) :: d_cc_space_f_vo + type(gpu_double2) :: d_cc_space_f_oo, d_cc_space_f_vo + type(gpu_double2) :: d_cc_space_f_ov, d_cc_space_f_vv + + type(gpu_double3) :: d_cc_space_v_oo_chol, d_cc_space_v_vo_chol + type(gpu_double3) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol + type(gpu_double4) :: d_cc_space_v_oovv + double precision, allocatable :: all_err(:,:), all_t(:,:) integer, allocatable :: list_occ(:), list_vir(:) integer(bit_kind) :: det(N_int,2) @@ -24,7 +30,7 @@ subroutine run_ccsd_space_orb call set_multiple_levels_omp(.False.) - if (do_ao_cholesky) then + if (do_mo_cholesky) then PROVIDE cholesky_mo_transp FREE cholesky_ao else @@ -55,11 +61,36 @@ subroutine run_ccsd_space_orb !print*,'occ',list_occ !print*,'vir',list_vir + ! GPU arrays + call gpu_allocate(d_cc_space_f_oo, nO, nO) call gpu_allocate(d_cc_space_f_vo, nV, nO) - call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV) - call gpu_upload(cc_space_f_vo, d_cc_space_f_vo) - call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv) + call gpu_allocate(d_cc_space_f_ov, nO, nV) + call gpu_allocate(d_cc_space_f_vv, nV, nV) + call gpu_upload(cc_space_f_oo, d_cc_space_f_oo) + call gpu_upload(cc_space_f_vo, d_cc_space_f_vo) + call gpu_upload(cc_space_f_vv, d_cc_space_f_vv) + +! FREE cc_space_f_oo +! FREE cc_space_f_vo +! FREE cc_space_f_vv + + if (do_mo_cholesky) then + call gpu_allocate(d_cc_space_v_oo_chol, cholesky_mo_num, nO, nO) + call gpu_allocate(d_cc_space_v_ov_chol, cholesky_mo_num, nO, nV) + call gpu_allocate(d_cc_space_v_vo_chol, cholesky_mo_num, nV, nO) + call gpu_allocate(d_cc_space_v_vv_chol, cholesky_mo_num, nV, nV) + + call gpu_upload(cc_space_v_oo_chol, d_cc_space_v_oo_chol) + call gpu_upload(cc_space_v_ov_chol, d_cc_space_v_ov_chol) + call gpu_upload(cc_space_v_vo_chol, d_cc_space_v_vo_chol) + call gpu_upload(cc_space_v_vv_chol, d_cc_space_v_vv_chol) + +! FREE cc_space_v_oo_chol +! FREE cc_space_v_ov_chol +! FREE cc_space_v_vo_chol +! FREE cc_space_v_vv_chol + endif call gpu_allocate(t2, nO,nO,nV,nV) call gpu_allocate(r2, nO,nO,nV,nV) @@ -120,6 +151,13 @@ subroutine run_ccsd_space_orb call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,h_t2) call gpu_upload(h_t2, t2) + + call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV) + call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv) + +! FREE cc_space_v_oovv + + call update_tau_space(nO,nV,h_t1,t1,t2,tau) call update_tau_x_space(nO,nV,tau,tau_x) !print*,'hf_energy', hf_energy @@ -142,10 +180,10 @@ subroutine run_ccsd_space_orb do while (not_converged) ! Residue - if (do_ao_cholesky) then -! if (.False.) then - call compute_H_oo_chol(nO,nV,tau_x,H_oo) - call compute_H_vv_chol(nO,nV,tau_x%f,H_vv%f) + if (do_mo_cholesky) then + call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & + d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo) + call compute_H_vv_chol(nO,nV,tau_x,H_vv) call compute_H_vo_chol(nO,nV,t1%f,H_vo%f) call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1) @@ -249,6 +287,12 @@ subroutine run_ccsd_space_orb call save_energy(uncorr_energy + energy, e_t) deallocate(h_t1, h_t2) + if (do_mo_cholesky) then + call gpu_deallocate(d_cc_space_v_oo_chol) + call gpu_deallocate(d_cc_space_v_ov_chol) + call gpu_deallocate(d_cc_space_v_vo_chol) + call gpu_deallocate(d_cc_space_v_vv_chol) + endif call gpu_deallocate(d_cc_space_f_vo) call gpu_deallocate(d_cc_space_v_oovv) call gpu_deallocate(t1) @@ -302,8 +346,21 @@ subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1, ! !$omp end parallel - call gpu_ddot(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1, d_cc_space_v_oovv, 1, energy) - call gpu_ddot(blas_handle, nO*nV*1_8, d_cc_space_f_vo, 1, t1, 1, e) + type(gpu_stream) :: s1, s2 + call gpu_stream_create(s1) + call gpu_stream_create(s2) + + call gpu_set_stream(blas_handle,s1) + call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo, 1, t1, 1, e) + + call gpu_set_stream(blas_handle,s2) + call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1_8, d_cc_space_v_oovv, 1_8, energy) + call gpu_synchronize() + call gpu_set_stream(blas_handle,gpu_default_stream) + + call gpu_stream_destroy(s1) + call gpu_stream_destroy(s2) + energy = energy + 2.d0*e end @@ -346,32 +403,29 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau) type(gpu_stream) :: stream(nV) - do b=1,nV - call gpu_stream_create(stream(b)) - enddo - - !$OMP PARALLEL & + !$OMP PARALLEL if (no_gpu()) & !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) !$OMP DO do b=1,nV + call gpu_stream_create(stream(b)) call gpu_set_stream(blas_handle,stream(b)) do j=1,nO - call gpu_dgeam(blas_handle, 'N', 'N', nO*1_8, nV*1_8, & - 1.d0, t2%f(1,j,1,b), nO*nO*1_8, & - h_t1(j,b), t1%f, nO*1_8, & - tau%f(1,j,1,b), nO*nO*1_8) + call gpu_dgeam_f(blas_handle, 'N', 'N', nO, nV, & + 1.d0, t2%f(1,j,1,b), nO*nO, & + h_t1(j,b), t1%f, nO, & + tau%f(1,j,1,b), nO*nO) enddo enddo !$OMP END DO !$OMP END PARALLEL - call gpu_synchronize() - do b=1,nV call gpu_stream_destroy(stream(b)) enddo + call gpu_set_stream(blas_handle,gpu_default_stream) + end @@ -412,7 +466,7 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x) call gpu_stream_create(stream(a)) enddo - !$OMP PARALLEL & + !$OMP PARALLEL if (no_gpu()) & !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) @@ -420,20 +474,20 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x) do b=1,nV do a=1,nV call gpu_set_stream(blas_handle,stream(a)) - call gpu_dgeam(blas_handle, 'N', 'N', nO*1_8, nO*1_8, & - 2.d0, tau%f(1,1,a,b), nO*1_8, & - -1.d0, tau%f(1,1,b,a), nO*1_8, & - tau_x%f(1,1,a,b), nO*1_8) + call gpu_dgeam_f(blas_handle, 'N', 'N', nO, nO, & + 2.d0, tau%f(1,1,a,b), nO, & + -1.d0, tau%f(1,1,b,a), nO, & + tau_x%f(1,1,a,b), nO) enddo enddo !$OMP END DO !$OMP END PARALLEL - call gpu_synchronize() - do b=1,nV call gpu_stream_destroy(stream(b)) enddo + call gpu_set_stream(blas_handle,gpu_default_stream) + end diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index 9b161001..288724f3 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -293,62 +293,115 @@ end ! H_oo -subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo) +subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & + d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo) use gpu implicit none integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: d_cc_space_f_oo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol type(gpu_double4), intent(in) :: tau_x type(gpu_double2), intent(out) :: H_oo integer :: a,b,i,j,u,k - double precision, allocatable :: tau_kau(:,:,:), tmp_vov(:,:,:) + type(gpu_double3) :: tau_kau, tmp_vov, tmp_ovv - allocate(tau_kau(cholesky_mo_num,nV,nO)) - !$omp parallel & - !$omp default(shared) & - !$omp private(i,u,j,k,a,b,tmp_vov) - allocate(tmp_vov(nV,nO,nV) ) - !$omp do - do u = 1, nO + call gpu_allocate(tau_kau, cholesky_mo_num, nV, nO) + +! !$omp parallel & +! !$omp default(shared) & +! !$omp private(i,u,j,k,a,b,tmp_vov) +! call gpu_allocate(tmp_vov, nV, nO, nV) +! !$omp do +! do u = 1, nO +! do b=1,nV +! do j=1,nO +! do a=1,nV +! tmp_vov%f(a,j,b) = tau_x%f(u,j,a,b) +! enddo +! enddo +! enddo +! call dgemm('N','T',cholesky_mo_num,nV,nO*nV,1.d0, & +! d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, tmp_vov%f, nV, & +! 0.d0, tau_kau%f(1,1,u), cholesky_mo_num) +! enddo +! !$omp end do nowait +! call gpu_deallocate(tmp_vov) +! !$omp do +! do i = 1, nO +! do u = 1, nO +! H_oo%f(u,i) = d_cc_space_f_oo%f(u,i) +! enddo +! enddo +! !$omp end do nowait +! +! !$omp barrier +! !$omp end parallel +! call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & +! tau_kau%f(1,1,1), cholesky_mo_num*nV, d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num*nV, & +! 1.d0, H_oo%f(1,1), nO) +! + + type(gpu_stream) :: stream(nV) + + do b=1,nV + call gpu_stream_create(stream(b)) + enddo + + !$OMP PARALLEL if (no_gpu()) & + !$OMP DEFAULT(SHARED) & + !$OMP PRIVATE(u,b,tmp_vov,tmp_ovv) + + call gpu_allocate(tmp_vov, nV, nO, nV) + call gpu_allocate(tmp_ovv, nO, nV, nV) + + !$OMP DO + do u=1,nO + call gpu_dgeam_f(blas_handle, 'N', 'N', 1, nO*nV*nV, 1.d0, & + tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f, nO, tmp_ovv%f, 1) do b=1,nV - do j=1,nO - do a=1,nV - tmp_vov(a,j,b) = tau_x%f(u,j,a,b) - enddo - enddo + call gpu_set_stream(blas_handle,stream(b)) + call gpu_dgeam_f(blas_handle, 'T', 'T', nV, nO, 1.d0, & + tmp_ovv%f(1,1,b), nO, 0.d0, & + tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV) enddo - call dgemm('N','T',cholesky_mo_num,nV,nO*nV,1.d0, & - cc_space_v_ov_chol, cholesky_mo_num, tmp_vov, nV, & - 0.d0, tau_kau(1,1,u), cholesky_mo_num) + call gpu_dgemm_f(blas_handle, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, & + d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_vov%f, nV, & + 0.d0, tau_kau%f(1,1,u), cholesky_mo_num) + call gpu_synchronize() enddo - !$omp end do nowait - deallocate(tmp_vov) - !$omp do - do i = 1, nO - do u = 1, nO - H_oo%f(u,i) = cc_space_f_oo(u,i) - enddo - enddo - !$omp end do nowait - !$omp barrier - !$omp end parallel - call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & - tau_kau, cholesky_mo_num*nV, cc_space_v_vo_chol, cholesky_mo_num*nV, & - 1.d0, H_oo%f, nO) + !$OMP END DO + call gpu_deallocate(tmp_vov) + call gpu_deallocate(tmp_ovv) + !$OMP END PARALLEL + + do b=1,nV + call gpu_stream_destroy(stream(b)) + enddo + + call gpu_set_stream(blas_handle,gpu_default_stream) + + call gpu_copy(d_cc_space_f_oo, H_oo) + + call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & + tau_kau, cholesky_mo_num*nV, d_cc_space_v_vo_chol, cholesky_mo_num*nV, & + 1.d0, H_oo, nO) + + call gpu_deallocate(tau_kau) end ! H_vv subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv) - + use gpu implicit none integer, intent(in) :: nO,nV - double precision, intent(in) :: tau_x(nO, nO, nV, nV) - double precision, intent(out) :: H_vv(nV, nV) + type(gpu_double4), intent(in) :: tau_x + type(gpu_double2), intent(out) :: H_vv integer :: a,b,i,j,u,k, beta @@ -364,7 +417,7 @@ subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv) do b=1,nV do j=1,nO do i=1,nO - tmp_oov(i,j,b) = tau_x(i,j,a,b) + tmp_oov(i,j,b) = tau_x%f(i,j,a,b) enddo enddo enddo @@ -378,7 +431,7 @@ subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv) !$omp do do beta = 1, nV do a = 1, nV - H_vv(a,beta) = cc_space_f_vv(a,beta) + H_vv%f(a,beta) = cc_space_f_vv(a,beta) enddo enddo !$omp end do nowait @@ -386,7 +439,7 @@ subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv) !$omp end parallel call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & tau_kia, cholesky_mo_num*nO, cc_space_v_ov_chol, cholesky_mo_num*nO, & - 1.d0, H_vv, nV) + 1.d0, H_vv%f, nV) end diff --git a/src/gpu/gpu.irp.f b/src/gpu/gpu.irp.f index e91d66f5..6ad0a075 100644 --- a/src/gpu/gpu.irp.f +++ b/src/gpu/gpu.irp.f @@ -8,4 +8,11 @@ BEGIN_PROVIDER [ type(gpu_blas), blas_handle ] call gpu_blas_create(blas_handle) END_PROVIDER +BEGIN_PROVIDER [ type(gpu_stream), gpu_default_stream ] + implicit none + BEGIN_DOC + ! Default stream + END_DOC + gpu_default_stream%c = C_NULL_PTR +END_PROVIDER diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90 index ecf79c83..2676b339 100644 --- a/src/gpu/gpu_module.F90 +++ b/src/gpu/gpu_module.F90 @@ -49,7 +49,12 @@ module gpu ! ------------ interface + logical(c_bool) function no_gpu() bind(C) + import + end function + integer function gpu_ndevices() bind(C) + import end function subroutine gpu_set_device(id) bind(C) @@ -101,7 +106,7 @@ module gpu subroutine gpu_set_stream_c(handle, stream) bind(C, name='gpu_set_stream') import - type(c_ptr) :: handle, stream + type(c_ptr), value :: handle, stream end subroutine subroutine gpu_synchronize() bind(C) @@ -120,15 +125,15 @@ module gpu subroutine gpu_ddot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_ddot') import - type(c_ptr), intent(in) :: handle + type(c_ptr), value, intent(in) :: handle integer(c_int64_t), value :: n, incx, incy - type(c_ptr), intent(in), value :: dx, dy + type(c_ptr), value :: dx, dy real(c_double), intent(out) :: res end subroutine subroutine gpu_sdot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_sdot') import - type(c_ptr), intent(in) :: handle + type(c_ptr), value, intent(in) :: handle integer(c_int64_t), value :: n, incx, incy type(c_ptr), intent(in), value :: dx, dy real(c_float), intent(out) :: res @@ -137,8 +142,8 @@ module gpu subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, & b, ldb, c, ldc) bind(C, name='gpu_dgeam') import - type(c_ptr), intent(in) :: handle - character(c_char), intent(in), value :: transa, transb + type(c_ptr), value, intent(in) :: handle + character(c_char), intent(in), value :: transa, transb integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc real(c_double), intent(in), value :: alpha, beta type(c_ptr), value :: a, b, c @@ -147,13 +152,33 @@ module gpu subroutine gpu_sgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, & b, ldb, c, ldc) bind(C, name='gpu_sgeam') import - type(c_ptr), intent(in) :: handle - character(c_char), intent(in), value :: transa, transb + type(c_ptr), value, intent(in) :: handle + character(c_char), intent(in), value :: transa, transb integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc real(c_float), intent(in), value :: alpha, beta type(c_ptr), value :: a, b, c end subroutine + subroutine gpu_dgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) bind(C, name='gpu_dgemm') + import + type(c_ptr), value, intent(in) :: handle + character(c_char), intent(in), value :: transa, transb + integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc + real(c_double), intent(in), value :: alpha, beta + type(c_ptr), value :: a, b, c + end subroutine + + subroutine gpu_sgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) bind(C, name='gpu_sgemm') + import + type(c_ptr), value, intent(in) :: handle + character(c_char), intent(in), value :: transa, transb + integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc + real(c_float), intent(in), value :: alpha, beta + type(c_ptr), value :: a, b, c + end subroutine + end interface @@ -161,20 +186,26 @@ module gpu ! ---------------------- interface gpu_allocate - procedure gpu_allocate_double1 & - ,gpu_allocate_double2 & - ,gpu_allocate_double3 & - ,gpu_allocate_double4 & - ,gpu_allocate_double5 & - ,gpu_allocate_double6 + procedure gpu_allocate_double1 & + ,gpu_allocate_double2 & + ,gpu_allocate_double3 & + ,gpu_allocate_double4 & + ,gpu_allocate_double5 & + ,gpu_allocate_double6 & + ,gpu_allocate_double1_64 & + ,gpu_allocate_double2_64 & + ,gpu_allocate_double3_64 & + ,gpu_allocate_double4_64 & + ,gpu_allocate_double5_64 & + ,gpu_allocate_double6_64 end interface gpu_allocate interface gpu_deallocate - procedure gpu_deallocate_double1 & - ,gpu_deallocate_double2 & - ,gpu_deallocate_double3 & - ,gpu_deallocate_double4 & - ,gpu_deallocate_double5 & + procedure gpu_deallocate_double1 & + ,gpu_deallocate_double2 & + ,gpu_deallocate_double3 & + ,gpu_deallocate_double4 & + ,gpu_deallocate_double5 & ,gpu_deallocate_double6 end interface gpu_deallocate @@ -267,6 +298,61 @@ module gpu end subroutine + subroutine gpu_allocate_double1_64(ptr, s) + implicit none + type(gpu_double1), intent(inout) :: ptr + integer*8, intent(in) :: s + + call gpu_allocate_c(ptr%c, s) + call c_f_pointer(ptr%c, ptr%f, (/ s /)) + end subroutine + + subroutine gpu_allocate_double2_64(ptr, s1, s2) + implicit none + type(gpu_double2), intent(inout) :: ptr + integer*8, intent(in) :: s1, s2 + + call gpu_allocate_c(ptr%c, s1*s2*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /)) + end subroutine + + subroutine gpu_allocate_double3_64(ptr, s1, s2, s3) + implicit none + type(gpu_double3), intent(inout) :: ptr + integer*8, intent(in) :: s1, s2, s3 + + call gpu_allocate_c(ptr%c, s1*s2*s3*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /)) + end subroutine + + subroutine gpu_allocate_double4_64(ptr, s1, s2, s3, s4) + implicit none + type(gpu_double4), intent(inout) :: ptr + integer*8, intent(in) :: s1, s2, s3, s4 + + call gpu_allocate_c(ptr%c, s1*s2*s3*s4*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /)) + end subroutine + + subroutine gpu_allocate_double5_64(ptr, s1, s2, s3, s4, s5) + implicit none + type(gpu_double5), intent(inout) :: ptr + integer*8, intent(in) :: s1, s2, s3, s4, s5 + + call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /)) + end subroutine + + subroutine gpu_allocate_double6_64(ptr, s1, s2, s3, s4, s5, s6) + implicit none + type(gpu_double6), intent(inout) :: ptr + integer*8, intent(in) :: s1, s2, s3, s4, s5, s6 + + call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*s6*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /)) + end subroutine + + ! gpu_deallocate ! -------------- @@ -494,19 +580,38 @@ end module subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) use gpu type(gpu_blas), intent(in) :: handle - integer*8 :: n, incx, incy - double precision, target, intent(in) :: dx(*), dy(*) - double precision, intent(out) :: res - call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res) + integer*4 :: n, incx, incy + type(gpu_double1), intent(in) :: dx, dy + double precision, intent(out) :: res + call gpu_ddot_c(handle%c, int(n,c_int64_t), dx%c, int(incx,c_int64_t), dy%c, int(incy,c_int64_t), res) end subroutine -subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res) +subroutine gpu_ddot_f(handle, n, dx, incx, dy, incy, res) + use gpu + type(gpu_blas), intent(in) :: handle + integer*4 :: n, incx, incy + double precision, target :: dx(*), dy(*) + double precision, intent(out) :: res + call gpu_ddot_c(handle%c, int(n,c_int64_t), c_loc(dx), int(incx,c_int64_t), c_loc(dy), int(incy,c_int64_t), res) +end subroutine + + +subroutine gpu_ddot_64(handle, n, dx, incx, dy, incy, res) use gpu type(gpu_blas), intent(in) :: handle integer*8 :: n, incx, incy - real, target, intent(in) :: dx(*), dy(*) - real, intent(out) :: res - call gpu_sdot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res) + type(gpu_double1), intent(in) :: dx, dy + double precision, intent(out) :: res + call gpu_ddot_c(handle%c, n, dx%c, incx, dy%c, incy, res) +end subroutine + +subroutine gpu_ddot_f_64(handle, n, dx, incx, dy, incy, res) + use gpu + type(gpu_blas), intent(in) :: handle + integer*8 :: n, incx, incy + double precision, target :: dx(*), dy(*) + double precision, intent(out) :: res + call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res) end subroutine @@ -518,22 +623,103 @@ subroutine gpu_dgeam(handle, transa, transb, m, n, alpha, a, lda, beta, & use gpu type(gpu_blas), intent(in) :: handle character, intent(in) :: transa, transb - integer*8, intent(in) :: m, n, lda, ldb, ldc + integer*4, intent(in) :: m, n, lda, ldb, ldc double precision, intent(in) :: alpha, beta - double precision, target :: a(lda,*), b(ldb,*), c(ldc,*) - call gpu_dgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, & - c_loc(b), ldb, c_loc(c), ldc) + type(gpu_double2) :: a, b, c + call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, a%c, int(lda,c_int64_t), beta, & + b%c, int(ldb,c_int64_t), c%c, int(ldc,c_int64_t)) end subroutine -subroutine gpu_sgeam(handle, transa, transb, m, n, alpha, a, lda, beta, & + +subroutine gpu_dgeam_f(handle, transa, transb, m, n, alpha, a, lda, beta, & b, ldb, c, ldc) - use gpu + use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*4, intent(in) :: m, n, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision, target :: a(*), b(*), c(*) + call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, & + c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t)) +end subroutine + + +subroutine gpu_dgeam_64(handle, transa, transb, m, n, alpha, a, lda, beta, & + b, ldb, c, ldc) + use gpu type(gpu_blas), intent(in) :: handle character, intent(in) :: transa, transb integer*8, intent(in) :: m, n, lda, ldb, ldc - real, intent(in) :: alpha, beta - real, target :: a(lda,*), b(ldb,*), c(ldc,*) - call gpu_sgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, & - c_loc(b), ldb, c_loc(c), ldc) + double precision, intent(in) :: alpha, beta + type(gpu_double2) :: a, b, c + call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, a%c, int(lda,c_int64_t), beta, & + b%c, int(ldb,c_int64_t), c%c, int(ldc,c_int64_t)) +end subroutine + + +subroutine gpu_dgeam_f_64(handle, transa, transb, m, n, alpha, a, lda, beta, & + b, ldb, c, ldc) + use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*8, intent(in) :: m, n, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision, target :: a(*), b(*), c(*) + call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, & + c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t)) +end subroutine + + +! gemm +! ---- + +subroutine gpu_dgemm(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) + use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*4, intent(in) :: m, n, k, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + type(gpu_double2) :: a, b, c + call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), & + alpha, a%c, int(lda,c_int64_t), & + b%c, int(ldb,c_int64_t), beta, c%c, int(ldc,c_int64_t)) +end subroutine + +subroutine gpu_dgemm_64(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) + use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*8, intent(in) :: m, n, k, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + type(gpu_double2) :: a, b, c + call gpu_dgemm_c(handle%c, transa, transb, m, n, k, & + alpha, a%c, lda, b%c, ldb, beta, c%c, ldc) +end subroutine + +subroutine gpu_dgemm_f(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) + use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*4, intent(in) :: m, n, k, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision, target :: a(*), b(*), c(*) + call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), & + alpha, c_loc(a), int(lda,c_int64_t), & + c_loc(b), int(ldb,c_int64_t), beta, c_loc(c), int(ldc,c_int64_t)) +end subroutine + +subroutine gpu_dgemm_f_64(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) + use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*8, intent(in) :: m, n, k, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision, target :: a(*), b(*), c(*) + call gpu_dgemm_c(handle%c, transa, transb, m, n, k, & + alpha, c_loc(a), lda, c_loc(b), ldb, beta, c_loc(c), ldc) end subroutine From d3c1994c64ed9ae9914ce605a6b7c364ac518d9b Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Mon, 1 Jul 2024 18:04:48 +0200 Subject: [PATCH 18/19] H_vv --- plugins/local/gpu_nvidia/gpu.c | 16 +-- plugins/local/gpu_x86/gpu.c | 6 +- src/ccsd/ccsd_space_orb_sub.irp.f | 7 +- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 146 ++++++++++++++++--------- src/gpu/gpu.irp.f | 8 ++ 5 files changed, 114 insertions(+), 69 deletions(-) diff --git a/plugins/local/gpu_nvidia/gpu.c b/plugins/local/gpu_nvidia/gpu.c index 189de64c..39a82984 100644 --- a/plugins/local/gpu_nvidia/gpu.c +++ b/plugins/local/gpu_nvidia/gpu.c @@ -11,10 +11,6 @@ /* Generic functions */ -bool no_gpu() { - return false; -} - int gpu_ndevices() { int ngpus; cudaGetDeviceCount(&ngpus); @@ -35,13 +31,13 @@ void gpu_allocate(void** ptr, const int64_t size) { free = INT64_MAX; } - /* Use managed memory if it does not fit on the GPU */ - if (size < free && size < total/2) { + rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal); +// /* Use managed memory if it does not fit on the GPU */ +// if (size < free && size < total/2) { // rc= cudaMalloc(ptr, size); - rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal); - } else { - rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal); - } +// } else { +// rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal); +// } assert (rc == cudaSuccess); } diff --git a/plugins/local/gpu_x86/gpu.c b/plugins/local/gpu_x86/gpu.c index 53267a7c..dab23a25 100644 --- a/plugins/local/gpu_x86/gpu.c +++ b/plugins/local/gpu_x86/gpu.c @@ -5,14 +5,10 @@ #include #include -bool no_gpu() { - return true; -} - /* Generic functions */ int gpu_ndevices() { - return 1; + return 0; } void gpu_set_device(int32_t i) { diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 5ee7366e..0b3636ac 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -183,7 +183,8 @@ subroutine run_ccsd_space_orb if (do_mo_cholesky) then call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo) - call compute_H_vv_chol(nO,nV,tau_x,H_vv) + call compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, & + d_cc_space_v_ov_chol,H_vv) call compute_H_vo_chol(nO,nV,t1%f,H_vo%f) call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1) @@ -403,7 +404,7 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau) type(gpu_stream) :: stream(nV) - !$OMP PARALLEL if (no_gpu()) & + !$OMP PARALLEL if (gpu_num == 0) & !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) @@ -466,7 +467,7 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x) call gpu_stream_create(stream(a)) enddo - !$OMP PARALLEL if (no_gpu()) & + !$OMP PARALLEL if (gpu_num == 0) & !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index 288724f3..458016fb 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -344,48 +344,47 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & ! 1.d0, H_oo%f(1,1), nO) ! - type(gpu_stream) :: stream(nV) + type(gpu_blas) :: blas - do b=1,nV - call gpu_stream_create(stream(b)) - enddo - !$OMP PARALLEL if (no_gpu()) & + !$OMP PARALLEL & !$OMP DEFAULT(SHARED) & - !$OMP PRIVATE(u,b,tmp_vov,tmp_ovv) + !$OMP PRIVATE(blas,u,b,tmp_vov,tmp_ovv) + + !$OMP SINGLE + !$OMP TASK + call gpu_copy(d_cc_space_f_oo, H_oo) + !$OMP END TASK + !$OMP END SINGLE - call gpu_allocate(tmp_vov, nV, nO, nV) call gpu_allocate(tmp_ovv, nO, nV, nV) + call gpu_allocate(tmp_vov, nV, nO, nV) + + call gpu_blas_create(blas) !$OMP DO do u=1,nO - call gpu_dgeam_f(blas_handle, 'N', 'N', 1, nO*nV*nV, 1.d0, & + call gpu_dgeam_f(blas, 'N', 'N', 1, nO*nV*nV, 1.d0, & tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f, nO, tmp_ovv%f, 1) do b=1,nV - call gpu_set_stream(blas_handle,stream(b)) - call gpu_dgeam_f(blas_handle, 'T', 'T', nV, nO, 1.d0, & + call gpu_dgeam_f(blas, 'T', 'T', nV, nO, 1.d0, & tmp_ovv%f(1,1,b), nO, 0.d0, & tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV) enddo - call gpu_dgemm_f(blas_handle, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, & + call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, & d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_vov%f, nV, & 0.d0, tau_kau%f(1,1,u), cholesky_mo_num) - call gpu_synchronize() enddo !$OMP END DO + call gpu_blas_destroy(blas) + call gpu_deallocate(tmp_vov) call gpu_deallocate(tmp_ovv) + + !$OMP TASKWAIT !$OMP END PARALLEL - do b=1,nV - call gpu_stream_destroy(stream(b)) - enddo - - call gpu_set_stream(blas_handle,gpu_default_stream) - - call gpu_copy(d_cc_space_f_oo, H_oo) - call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & tau_kau, cholesky_mo_num*nV, d_cc_space_v_vo_chol, cholesky_mo_num*nV, & 1.d0, H_oo, nO) @@ -395,52 +394,97 @@ end ! H_vv -subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv) +subroutine compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, & + d_cc_space_v_ov_chol,H_vv) use gpu implicit none - integer, intent(in) :: nO,nV + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: d_cc_space_f_vv + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol type(gpu_double4), intent(in) :: tau_x type(gpu_double2), intent(out) :: H_vv integer :: a,b,i,j,u,k, beta - double precision, allocatable :: tau_kia(:,:,:), tmp_oov(:,:,:) + type(gpu_double3) :: tau_kia, tmp_oov - allocate(tau_kia(cholesky_mo_num,nO,nV)) - !$omp parallel & - !$omp default(shared) & - !$omp private(i,beta,j,k,a,b,tmp_oov) - allocate(tmp_oov(nO,nO,nV) ) - !$omp do + call gpu_allocate(tau_kia, cholesky_mo_num, nO, nV) + +! !$omp parallel & +! !$omp default(shared) & +! !$omp private(i,beta,j,k,a,b,tmp_oov) +! allocate(tmp_oov(nO,nO,nV) ) +! !$omp do +! do a = 1, nV +! do b=1,nV +! do j=1,nO +! do i=1,nO +! tmp_oov(i,j,b) = tau_x%f(i,j,a,b) +! enddo +! enddo +! enddo +! call dgemm('N','T',cholesky_mo_num,nO,nO*nV,1.d0, & +! d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov, nO, & +! 0.d0, tau_kia(1,1,a), cholesky_mo_num) +! enddo +! !$omp end do nowait +! deallocate(tmp_oov) + +! !$omp do +! do beta = 1, nV +! do a = 1, nV +! H_vv%f(a,beta) = cc_space_f_vv(a,beta) +! enddo +! enddo +! !$omp end do nowait +! !$omp barrier +! !$omp end parallel +! call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & +! tau_kia, cholesky_mo_num*nO, d_cc_space_v_ov_chol%f, cholesky_mo_num*nO, & +! 1.d0, H_vv%f, nV) + + type(gpu_blas) :: blas + + + PROVIDE gpu_num + !$OMP PARALLEL & + !$OMP DEFAULT(SHARED) & + !$OMP PRIVATE(a,b,tmp_oov,blas) + + !$OMP SINGLE + !$OMP TASK + call gpu_copy(d_cc_space_f_vv, H_vv) + !$OMP END TASK + !$OMP END SINGLE + + call gpu_blas_create(blas) + call gpu_allocate(tmp_oov, nO, nO, nV) + + !$OMP DO do a = 1, nV do b=1,nV - do j=1,nO - do i=1,nO - tmp_oov(i,j,b) = tau_x%f(i,j,a,b) - enddo - enddo + call gpu_dgeam_f(blas, 'N', 'N', nO, nO, 1.d0, & + tau_x%f(1,1,a,b), nO, 0.d0, & + tau_x%f(1,1,a,b), nO, tmp_oov%f(1,1,b), nO) enddo - call dgemm('N','T',cholesky_mo_num,nO,nO*nV,1.d0, & - cc_space_v_ov_chol, cholesky_mo_num, tmp_oov, nO, & - 0.d0, tau_kia(1,1,a), cholesky_mo_num) + call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nO,nO*nV,1.d0, & + d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov%f, nO, & + 0.d0, tau_kia%f(1,1,a), cholesky_mo_num) enddo - !$omp end do nowait - deallocate(tmp_oov) + !$OMP END DO - !$omp do - do beta = 1, nV - do a = 1, nV - H_vv%f(a,beta) = cc_space_f_vv(a,beta) - enddo - enddo - !$omp end do nowait - !$omp barrier - !$omp end parallel - call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & - tau_kia, cholesky_mo_num*nO, cc_space_v_ov_chol, cholesky_mo_num*nO, & - 1.d0, H_vv%f, nV) + call gpu_blas_destroy(blas) + call gpu_deallocate(tmp_oov) + !$OMP TASKWAIT + !$OMP END PARALLEL + + call gpu_dgemm(blas_handle,'T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & + tau_kia, cholesky_mo_num*nO, d_cc_space_v_ov_chol, cholesky_mo_num*nO, & + 1.d0, H_vv, nV) + + call gpu_deallocate(tau_kia) end ! H_vo diff --git a/src/gpu/gpu.irp.f b/src/gpu/gpu.irp.f index 6ad0a075..3b2feeb6 100644 --- a/src/gpu/gpu.irp.f +++ b/src/gpu/gpu.irp.f @@ -16,3 +16,11 @@ BEGIN_PROVIDER [ type(gpu_stream), gpu_default_stream ] gpu_default_stream%c = C_NULL_PTR END_PROVIDER +BEGIN_PROVIDER [ integer, gpu_num ] + implicit none + BEGIN_DOC + ! Number of usable GPUs + END_DOC + gpu_num = gpu_ndevices() +END_PROVIDER + From 44a7729f65a37cc3a7c35ae55f462bb1d61e411b Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Mon, 1 Jul 2024 19:00:27 +0200 Subject: [PATCH 19/19] H_ finished in CCSD --- src/ccsd/ccsd_space_orb_sub.irp.f | 108 ++---- src/ccsd/ccsd_space_orb_sub_chol.irp.f | 482 +++++++++---------------- 2 files changed, 200 insertions(+), 390 deletions(-) diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 0b3636ac..13b974be 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -181,11 +181,9 @@ subroutine run_ccsd_space_orb ! Residue if (do_mo_cholesky) then - call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & - d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo) - call compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, & - d_cc_space_v_ov_chol,H_vv) - call compute_H_vo_chol(nO,nV,t1%f,H_vo%f) + call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo) + call compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, d_cc_space_v_ov_chol,H_vv) + call compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol, H_vo) call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1) call compute_r2_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r2%f,max_r2) @@ -316,51 +314,20 @@ subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1, integer :: i,j,a,b double precision :: e -! energy = 0d0 -! !$omp parallel & -! !$omp shared(nO,nV,energy,tau_x,t1,& -! !$omp d_cc_space_f_vo,d_cc_space_v_oovv) & -! !$omp private(i,j,a,b,e) & -! !$omp default(none) -! e = 0d0 -! !$omp do -! do a = 1, nV -! do i = 1, nO -! e = e + 2d0 * d_cc_space_f_vo%f(a,i) * t1%f(i,a) -! enddo -! enddo -! !$omp end do nowait -! !$omp do -! do b = 1, nV -! do a = 1, nV -! do j = 1, nO -! do i = 1, nO -! e = e + tau_x%f(i,j,a,b) * d_cc_space_v_oovv%f(i,j,a,b) -! enddo -! enddo -! enddo -! enddo -! !$omp end do nowait -! !$omp critical -! energy = energy + e -! !$omp end critical -! !$omp end parallel + type(gpu_stream) :: s1, s2 + call gpu_stream_create(s1) + call gpu_stream_create(s2) + call gpu_set_stream(blas_handle,s1) + call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo, 1, t1, 1, e) - type(gpu_stream) :: s1, s2 - call gpu_stream_create(s1) - call gpu_stream_create(s2) + call gpu_set_stream(blas_handle,s2) + call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1_8, d_cc_space_v_oovv, 1_8, energy) + call gpu_set_stream(blas_handle,gpu_default_stream) - call gpu_set_stream(blas_handle,s1) - call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo, 1, t1, 1, e) - - call gpu_set_stream(blas_handle,s2) - call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1_8, d_cc_space_v_oovv, 1_8, energy) - call gpu_synchronize() - call gpu_set_stream(blas_handle,gpu_default_stream) - - call gpu_stream_destroy(s1) - call gpu_stream_destroy(s2) + call gpu_synchronize() + call gpu_stream_destroy(s1) + call gpu_stream_destroy(s2) energy = energy + 2.d0*e @@ -384,27 +351,9 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau) ! internal integer :: i,j,a,b -! !$OMP PARALLEL & -! !$OMP SHARED(nO,nV,tau,t2,t1,h_t1) & -! !$OMP PRIVATE(i,j,a,b) & -! !$OMP DEFAULT(NONE) -! !$OMP DO -! do b = 1, nV -! do a = 1, nV -! do j = 1, nO -! do i = 1, nO -! tau%f(i,j,a,b) = t2%f(i,j,a,b) + t1%f(i,a) * h_t1(j,b) -! enddo -! enddo -! enddo -! enddo -! !$OMP END DO -! !$OMP END PARALLEL - - type(gpu_stream) :: stream(nV) - !$OMP PARALLEL if (gpu_num == 0) & + !$OMP PARALLEL & !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) @@ -422,6 +371,8 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau) !$OMP END DO !$OMP END PARALLEL + call gpu_synchronize() + do b=1,nV call gpu_stream_destroy(stream(b)) enddo @@ -444,32 +395,15 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x) ! internal integer :: i,j,a,b -! !$OMP PARALLEL & -! !$OMP SHARED(nO,nV,tau,tau_x) & -! !$OMP PRIVATE(i,j,a,b) & -! !$OMP DEFAULT(NONE) -! !$OMP DO -! do b = 1, nV -! do a = 1, nV -! do j = 1, nO -! do i = 1, nO -! tau_x%f(i,j,a,b) = 2.d0*tau%f(i,j,a,b) - tau%f(i,j,b,a) -! enddo -! enddo -! enddo -! enddo -! !$OMP END DO -! !$OMP END PARALLEL - type(gpu_stream) :: stream(nV) do a=1,nV call gpu_stream_create(stream(a)) enddo - !$OMP PARALLEL if (gpu_num == 0) & + !$OMP PARALLEL & !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) & - !$OMP PRIVATE(i,j,a,b) & + !$OMP PRIVATE(a,b) & !$OMP DEFAULT(NONE) !$OMP DO do b=1,nV @@ -484,10 +418,12 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x) !$OMP END DO !$OMP END PARALLEL + call gpu_set_stream(blas_handle,gpu_default_stream) + call gpu_synchronize() + do b=1,nV call gpu_stream_destroy(stream(b)) enddo - call gpu_set_stream(blas_handle,gpu_default_stream) end diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index 458016fb..5eb95a06 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -1,81 +1,200 @@ -subroutine ccsd_energy_space_chol(nO,nV,tau,t1,energy) +! H_oo +subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & + d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo) + use gpu implicit none - integer, intent(in) :: nO, nV - double precision, intent(in) :: tau(nO,nO,nV,nV) - double precision, intent(in) :: t1(nO,nV) - double precision, intent(out) :: energy + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: d_cc_space_f_oo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol + type(gpu_double4), intent(in) :: tau_x + type(gpu_double2), intent(out) :: H_oo - ! internal - integer :: i,j,a,b - double precision :: e + integer :: a,b,i,j,u,k - energy = 0d0 - !$omp parallel & - !$omp shared(nO,nV,energy,tau,t1,& - !$omp cc_space_f_vo,cc_space_w_oovv) & - !$omp private(i,j,a,b,e) & - !$omp default(none) - e = 0d0 - !$omp do - do a = 1, nV - do i = 1, nO - e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a) - enddo - enddo - !$omp end do nowait - !$omp do - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - e = e + tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b) - enddo - enddo - enddo - enddo - !$omp end do nowait - !$omp critical - energy = energy + e - !$omp end critical - !$omp end parallel + type(gpu_double3) :: tau_kau, tmp_vov, tmp_ovv -end + call gpu_allocate(tau_kau, cholesky_mo_num, nV, nO) -! Tau + type(gpu_blas) :: blas -subroutine update_tau_space_chol(nO,nV,t1,t2,tau) - implicit none + !$OMP PARALLEL & + !$OMP DEFAULT(SHARED) & + !$OMP PRIVATE(blas,u,b,tmp_vov,tmp_ovv) - ! in - integer, intent(in) :: nO, nV - double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV) + !$OMP SINGLE + !$OMP TASK + call gpu_copy(d_cc_space_f_oo, H_oo) + !$OMP END TASK + !$OMP END SINGLE - ! out - double precision, intent(out) :: tau(nO,nO,nV,nV) + call gpu_allocate(tmp_ovv, nO, nV, nV) + call gpu_allocate(tmp_vov, nV, nO, nV) - ! internal - integer :: i,j,a,b + call gpu_blas_create(blas) - !$OMP PARALLEL & - !$OMP SHARED(nO,nV,tau,t2,t1) & - !$OMP PRIVATE(i,j,a,b) & - !$OMP DEFAULT(NONE) !$OMP DO - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b) - enddo - enddo + do u=1,nO + call gpu_dgeam_f(blas, 'N', 'N', 1, nO*nV*nV, 1.d0, & + tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f, nO, tmp_ovv%f, 1) + do b=1,nV + call gpu_dgeam_f(blas, 'T', 'T', nV, nO, 1.d0, & + tmp_ovv%f(1,1,b), nO, 0.d0, & + tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV) enddo + call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, & + d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_vov%f, nV, & + 0.d0, tau_kau%f(1,1,u), cholesky_mo_num) enddo !$OMP END DO + + call gpu_blas_destroy(blas) + + call gpu_deallocate(tmp_vov) + call gpu_deallocate(tmp_ovv) + + !$OMP TASKWAIT !$OMP END PARALLEL + call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & + tau_kau, cholesky_mo_num*nV, d_cc_space_v_vo_chol, cholesky_mo_num*nV, & + 1.d0, H_oo, nO) + + call gpu_synchronize() + call gpu_deallocate(tau_kau) +end + +! H_vv + +subroutine compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, & + d_cc_space_v_ov_chol,H_vv) + use gpu + implicit none + + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: d_cc_space_f_vv + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol + type(gpu_double4), intent(in) :: tau_x + type(gpu_double2), intent(out) :: H_vv + + integer :: a,b,i,j,u,k, beta + + type(gpu_double3) :: tau_kia, tmp_oov + + call gpu_allocate(tau_kia, cholesky_mo_num, nO, nV) + + type(gpu_blas) :: blas + + !$OMP PARALLEL & + !$OMP DEFAULT(SHARED) & + !$OMP PRIVATE(a,b,tmp_oov,blas) + + !$OMP SINGLE + !$OMP TASK + call gpu_copy(d_cc_space_f_vv, H_vv) + !$OMP END TASK + !$OMP END SINGLE + + call gpu_blas_create(blas) + call gpu_allocate(tmp_oov, nO, nO, nV) + + !$OMP DO + do a = 1, nV + do b=1,nV + call gpu_dgeam_f(blas, 'N', 'N', nO, nO, 1.d0, & + tau_x%f(1,1,a,b), nO, 0.d0, & + tau_x%f(1,1,a,b), nO, tmp_oov%f(1,1,b), nO) + enddo + call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nO,nO*nV,1.d0, & + d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov%f, nO, & + 0.d0, tau_kia%f(1,1,a), cholesky_mo_num) + enddo + !$OMP END DO + + call gpu_blas_destroy(blas) + + call gpu_deallocate(tmp_oov) + !$OMP TASKWAIT + !$OMP END PARALLEL + + call gpu_dgemm(blas_handle,'T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & + tau_kia, cholesky_mo_num*nO, d_cc_space_v_ov_chol, cholesky_mo_num*nO, & + 1.d0, H_vv, nV) + + call gpu_synchronize() + call gpu_deallocate(tau_kia) +end + +! H_vo +subroutine compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, & + d_cc_space_v_ov_chol,d_cc_space_v_vo_chol, H_vo) + use gpu + implicit none + + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: t1, d_cc_space_f_vo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol + type(gpu_double2), intent(out) :: H_vo + + integer :: a,b,i,j,u,k + + type(gpu_double1) :: tmp_k + type(gpu_double3) :: tmp, tmp2 + + call gpu_copy(d_cc_space_f_vo, H_vo) + + call gpu_allocate(tmp_k, cholesky_mo_num) + + call gpu_dgemm(blas_handle, 'N', 'N', cholesky_mo_num, 1, nO*nV, 2.d0, & + d_cc_space_v_ov_chol, cholesky_mo_num, & + t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num) + + call gpu_dgemm(blas_handle, 'T','N',nV*nO,1,cholesky_mo_num,1.d0, & + d_cc_space_v_vo_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, & + H_vo, nV*nO) + + call gpu_deallocate(tmp_k) + + + call gpu_allocate(tmp, cholesky_mo_num, nO, nO) + + call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nO, nO, nV, 1.d0, & + d_cc_space_v_ov_chol, cholesky_mo_num*nO, t1, nO, 0.d0, tmp, cholesky_mo_num*nO) + + call gpu_allocate(tmp2, cholesky_mo_num, nO, nO) + + type(gpu_stream) :: stream(nO) + do i=1,nO + call gpu_stream_create(stream(i)) + enddo + + !$OMP PARALLEL DO COLLAPSE(2) PRIVATE(i,j) + do i=1,nO + do j=1,nO + call gpu_set_stream(blas_handle,stream(j)) + call gpu_dgeam_f(blas_handle, 'N', 'N', cholesky_mo_num, 1, 1.d0, & + tmp%f(1,i,j), cholesky_mo_num, 0.d0, & + tmp%f(1,i,j), cholesky_mo_num, tmp2%f(1,j,i), cholesky_mo_num) + enddo + enddo + !$OMP END PARALLEL DO + + call gpu_set_stream(blas_handle,gpu_default_stream) + call gpu_synchronize() + + do i=1,nO + call gpu_stream_destroy(stream(i)) + enddo + call gpu_deallocate(tmp) + + call gpu_dgemm(blas_handle, 'T','N', nV, nO, cholesky_mo_num*nO, -1.d0, & + d_cc_space_v_ov_chol, cholesky_mo_num*nO, tmp2, cholesky_mo_num*nO, & + 1.d0, H_vo, nV) + + call gpu_synchronize() + call gpu_deallocate(tmp2) end ! R1 @@ -291,251 +410,6 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) end -! H_oo - -subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & - d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo) - use gpu - implicit none - - integer, intent(in) :: nO,nV - type(gpu_double2), intent(in) :: d_cc_space_f_oo - type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol - type(gpu_double4), intent(in) :: tau_x - type(gpu_double2), intent(out) :: H_oo - - integer :: a,b,i,j,u,k - - type(gpu_double3) :: tau_kau, tmp_vov, tmp_ovv - - call gpu_allocate(tau_kau, cholesky_mo_num, nV, nO) - -! !$omp parallel & -! !$omp default(shared) & -! !$omp private(i,u,j,k,a,b,tmp_vov) -! call gpu_allocate(tmp_vov, nV, nO, nV) -! !$omp do -! do u = 1, nO -! do b=1,nV -! do j=1,nO -! do a=1,nV -! tmp_vov%f(a,j,b) = tau_x%f(u,j,a,b) -! enddo -! enddo -! enddo -! call dgemm('N','T',cholesky_mo_num,nV,nO*nV,1.d0, & -! d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, tmp_vov%f, nV, & -! 0.d0, tau_kau%f(1,1,u), cholesky_mo_num) -! enddo -! !$omp end do nowait -! call gpu_deallocate(tmp_vov) -! !$omp do -! do i = 1, nO -! do u = 1, nO -! H_oo%f(u,i) = d_cc_space_f_oo%f(u,i) -! enddo -! enddo -! !$omp end do nowait -! -! !$omp barrier -! !$omp end parallel -! call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & -! tau_kau%f(1,1,1), cholesky_mo_num*nV, d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num*nV, & -! 1.d0, H_oo%f(1,1), nO) -! - - type(gpu_blas) :: blas - - - !$OMP PARALLEL & - !$OMP DEFAULT(SHARED) & - !$OMP PRIVATE(blas,u,b,tmp_vov,tmp_ovv) - - !$OMP SINGLE - !$OMP TASK - call gpu_copy(d_cc_space_f_oo, H_oo) - !$OMP END TASK - !$OMP END SINGLE - - call gpu_allocate(tmp_ovv, nO, nV, nV) - call gpu_allocate(tmp_vov, nV, nO, nV) - - call gpu_blas_create(blas) - - !$OMP DO - do u=1,nO - call gpu_dgeam_f(blas, 'N', 'N', 1, nO*nV*nV, 1.d0, & - tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f, nO, tmp_ovv%f, 1) - do b=1,nV - call gpu_dgeam_f(blas, 'T', 'T', nV, nO, 1.d0, & - tmp_ovv%f(1,1,b), nO, 0.d0, & - tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV) - enddo - call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, & - d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_vov%f, nV, & - 0.d0, tau_kau%f(1,1,u), cholesky_mo_num) - enddo - !$OMP END DO - - call gpu_blas_destroy(blas) - - call gpu_deallocate(tmp_vov) - call gpu_deallocate(tmp_ovv) - - !$OMP TASKWAIT - !$OMP END PARALLEL - - call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & - tau_kau, cholesky_mo_num*nV, d_cc_space_v_vo_chol, cholesky_mo_num*nV, & - 1.d0, H_oo, nO) - - call gpu_deallocate(tau_kau) -end - -! H_vv - -subroutine compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, & - d_cc_space_v_ov_chol,H_vv) - use gpu - implicit none - - integer, intent(in) :: nO,nV - type(gpu_double2), intent(in) :: d_cc_space_f_vv - type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol - type(gpu_double4), intent(in) :: tau_x - type(gpu_double2), intent(out) :: H_vv - - integer :: a,b,i,j,u,k, beta - - type(gpu_double3) :: tau_kia, tmp_oov - - call gpu_allocate(tau_kia, cholesky_mo_num, nO, nV) - -! !$omp parallel & -! !$omp default(shared) & -! !$omp private(i,beta,j,k,a,b,tmp_oov) -! allocate(tmp_oov(nO,nO,nV) ) -! !$omp do -! do a = 1, nV -! do b=1,nV -! do j=1,nO -! do i=1,nO -! tmp_oov(i,j,b) = tau_x%f(i,j,a,b) -! enddo -! enddo -! enddo -! call dgemm('N','T',cholesky_mo_num,nO,nO*nV,1.d0, & -! d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov, nO, & -! 0.d0, tau_kia(1,1,a), cholesky_mo_num) -! enddo -! !$omp end do nowait -! deallocate(tmp_oov) - -! !$omp do -! do beta = 1, nV -! do a = 1, nV -! H_vv%f(a,beta) = cc_space_f_vv(a,beta) -! enddo -! enddo -! !$omp end do nowait -! !$omp barrier -! !$omp end parallel -! call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & -! tau_kia, cholesky_mo_num*nO, d_cc_space_v_ov_chol%f, cholesky_mo_num*nO, & -! 1.d0, H_vv%f, nV) - - type(gpu_blas) :: blas - - - PROVIDE gpu_num - !$OMP PARALLEL & - !$OMP DEFAULT(SHARED) & - !$OMP PRIVATE(a,b,tmp_oov,blas) - - !$OMP SINGLE - !$OMP TASK - call gpu_copy(d_cc_space_f_vv, H_vv) - !$OMP END TASK - !$OMP END SINGLE - - call gpu_blas_create(blas) - call gpu_allocate(tmp_oov, nO, nO, nV) - - !$OMP DO - do a = 1, nV - do b=1,nV - call gpu_dgeam_f(blas, 'N', 'N', nO, nO, 1.d0, & - tau_x%f(1,1,a,b), nO, 0.d0, & - tau_x%f(1,1,a,b), nO, tmp_oov%f(1,1,b), nO) - enddo - call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nO,nO*nV,1.d0, & - d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov%f, nO, & - 0.d0, tau_kia%f(1,1,a), cholesky_mo_num) - enddo - !$OMP END DO - - call gpu_blas_destroy(blas) - - call gpu_deallocate(tmp_oov) - !$OMP TASKWAIT - !$OMP END PARALLEL - - call gpu_dgemm(blas_handle,'T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & - tau_kia, cholesky_mo_num*nO, d_cc_space_v_ov_chol, cholesky_mo_num*nO, & - 1.d0, H_vv, nV) - - call gpu_deallocate(tau_kia) -end - -! H_vo -subroutine compute_H_vo_chol(nO,nV,t1,H_vo) - - implicit none - - integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV) - double precision, intent(out) :: H_vo(nV, nO) - - integer :: a,b,i,j,u,k - - double precision, allocatable :: tmp_k(:), tmp(:,:,:), tmp2(:,:,:) - do i=1,nO - do a=1,nV - H_vo(a,i) = cc_space_f_vo(a,i) - enddo - enddo - - allocate(tmp_k(cholesky_mo_num)) - call dgemm('N', 'N', cholesky_mo_num, 1, nO*nV, 2.d0, & - cc_space_v_ov_chol, cholesky_mo_num, & - t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num) - - call dgemm('T','N',nV*nO,1,cholesky_mo_num,1.d0, & - cc_space_v_vo_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, & - H_vo, nV*nO) - deallocate(tmp_k) - - allocate(tmp(cholesky_mo_num,nO,nO)) - allocate(tmp2(cholesky_mo_num,nO,nO)) - - call dgemm('N','T', cholesky_mo_num*nO, nO, nV, 1.d0, & - cc_space_v_ov_chol, cholesky_mo_num*nO, t1, nO, 0.d0, tmp, cholesky_mo_num*nO) - - do i=1,nO - do j=1,nO - do k=1,cholesky_mo_num - tmp2(k,j,i) = tmp(k,i,j) - enddo - enddo - enddo - deallocate(tmp) - - call dgemm('T','N', nV, nO, cholesky_mo_num*nO, -1.d0, & - cc_space_v_ov_chol, cholesky_mo_num*nO, tmp2, cholesky_mo_num*nO, & - 1.d0, H_vo, nV) - -end - ! R2