9
1
mirror of https://github.com/QuantumPackage/qp2.git synced 2024-12-31 23:55:39 +01:00
This commit is contained in:
Anthony Scemama 2024-07-01 18:04:48 +02:00
parent 860121d404
commit d3c1994c64
5 changed files with 114 additions and 69 deletions

View File

@ -11,10 +11,6 @@
/* Generic functions */ /* Generic functions */
bool no_gpu() {
return false;
}
int gpu_ndevices() { int gpu_ndevices() {
int ngpus; int ngpus;
cudaGetDeviceCount(&ngpus); cudaGetDeviceCount(&ngpus);
@ -35,13 +31,13 @@ void gpu_allocate(void** ptr, const int64_t size) {
free = INT64_MAX; free = INT64_MAX;
} }
/* Use managed memory if it does not fit on the GPU */ rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
if (size < free && size < total/2) { // /* Use managed memory if it does not fit on the GPU */
// if (size < free && size < total/2) {
// rc= cudaMalloc(ptr, size); // rc= cudaMalloc(ptr, size);
rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal); // } else {
} else { // rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal); // }
}
assert (rc == cudaSuccess); assert (rc == cudaSuccess);
} }

View File

@ -5,14 +5,10 @@
#include <stdbool.h> #include <stdbool.h>
#include <assert.h> #include <assert.h>
bool no_gpu() {
return true;
}
/* Generic functions */ /* Generic functions */
int gpu_ndevices() { int gpu_ndevices() {
return 1; return 0;
} }
void gpu_set_device(int32_t i) { void gpu_set_device(int32_t i) {

View File

@ -183,7 +183,8 @@ subroutine run_ccsd_space_orb
if (do_mo_cholesky) then if (do_mo_cholesky) then
call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, &
d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo) d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo)
call compute_H_vv_chol(nO,nV,tau_x,H_vv) call compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, &
d_cc_space_v_ov_chol,H_vv)
call compute_H_vo_chol(nO,nV,t1%f,H_vo%f) call compute_H_vo_chol(nO,nV,t1%f,H_vo%f)
call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1) call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1)
@ -403,7 +404,7 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau)
type(gpu_stream) :: stream(nV) type(gpu_stream) :: stream(nV)
!$OMP PARALLEL if (no_gpu()) & !$OMP PARALLEL if (gpu_num == 0) &
!$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) & !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) &
!$OMP PRIVATE(i,j,a,b) & !$OMP PRIVATE(i,j,a,b) &
!$OMP DEFAULT(NONE) !$OMP DEFAULT(NONE)
@ -466,7 +467,7 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x)
call gpu_stream_create(stream(a)) call gpu_stream_create(stream(a))
enddo enddo
!$OMP PARALLEL if (no_gpu()) & !$OMP PARALLEL if (gpu_num == 0) &
!$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) & !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) &
!$OMP PRIVATE(i,j,a,b) & !$OMP PRIVATE(i,j,a,b) &
!$OMP DEFAULT(NONE) !$OMP DEFAULT(NONE)

View File

@ -344,48 +344,47 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, &
! 1.d0, H_oo%f(1,1), nO) ! 1.d0, H_oo%f(1,1), nO)
! !
type(gpu_stream) :: stream(nV) type(gpu_blas) :: blas
do b=1,nV
call gpu_stream_create(stream(b))
enddo
!$OMP PARALLEL if (no_gpu()) & !$OMP PARALLEL &
!$OMP DEFAULT(SHARED) & !$OMP DEFAULT(SHARED) &
!$OMP PRIVATE(u,b,tmp_vov,tmp_ovv) !$OMP PRIVATE(blas,u,b,tmp_vov,tmp_ovv)
!$OMP SINGLE
!$OMP TASK
call gpu_copy(d_cc_space_f_oo, H_oo)
!$OMP END TASK
!$OMP END SINGLE
call gpu_allocate(tmp_vov, nV, nO, nV)
call gpu_allocate(tmp_ovv, nO, nV, nV) call gpu_allocate(tmp_ovv, nO, nV, nV)
call gpu_allocate(tmp_vov, nV, nO, nV)
call gpu_blas_create(blas)
!$OMP DO !$OMP DO
do u=1,nO do u=1,nO
call gpu_dgeam_f(blas_handle, 'N', 'N', 1, nO*nV*nV, 1.d0, & call gpu_dgeam_f(blas, 'N', 'N', 1, nO*nV*nV, 1.d0, &
tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f, nO, tmp_ovv%f, 1) tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f, nO, tmp_ovv%f, 1)
do b=1,nV do b=1,nV
call gpu_set_stream(blas_handle,stream(b)) call gpu_dgeam_f(blas, 'T', 'T', nV, nO, 1.d0, &
call gpu_dgeam_f(blas_handle, 'T', 'T', nV, nO, 1.d0, &
tmp_ovv%f(1,1,b), nO, 0.d0, & tmp_ovv%f(1,1,b), nO, 0.d0, &
tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV) tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV)
enddo enddo
call gpu_dgemm_f(blas_handle, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, & call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_vov%f, nV, & d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_vov%f, nV, &
0.d0, tau_kau%f(1,1,u), cholesky_mo_num) 0.d0, tau_kau%f(1,1,u), cholesky_mo_num)
call gpu_synchronize()
enddo enddo
!$OMP END DO !$OMP END DO
call gpu_blas_destroy(blas)
call gpu_deallocate(tmp_vov) call gpu_deallocate(tmp_vov)
call gpu_deallocate(tmp_ovv) call gpu_deallocate(tmp_ovv)
!$OMP TASKWAIT
!$OMP END PARALLEL !$OMP END PARALLEL
do b=1,nV
call gpu_stream_destroy(stream(b))
enddo
call gpu_set_stream(blas_handle,gpu_default_stream)
call gpu_copy(d_cc_space_f_oo, H_oo)
call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
tau_kau, cholesky_mo_num*nV, d_cc_space_v_vo_chol, cholesky_mo_num*nV, & tau_kau, cholesky_mo_num*nV, d_cc_space_v_vo_chol, cholesky_mo_num*nV, &
1.d0, H_oo, nO) 1.d0, H_oo, nO)
@ -395,52 +394,97 @@ end
! H_vv ! H_vv
subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv) subroutine compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, &
d_cc_space_v_ov_chol,H_vv)
use gpu use gpu
implicit none implicit none
integer, intent(in) :: nO,nV integer, intent(in) :: nO,nV
type(gpu_double2), intent(in) :: d_cc_space_f_vv
type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol
type(gpu_double4), intent(in) :: tau_x type(gpu_double4), intent(in) :: tau_x
type(gpu_double2), intent(out) :: H_vv type(gpu_double2), intent(out) :: H_vv
integer :: a,b,i,j,u,k, beta integer :: a,b,i,j,u,k, beta
double precision, allocatable :: tau_kia(:,:,:), tmp_oov(:,:,:) type(gpu_double3) :: tau_kia, tmp_oov
allocate(tau_kia(cholesky_mo_num,nO,nV)) call gpu_allocate(tau_kia, cholesky_mo_num, nO, nV)
!$omp parallel &
!$omp default(shared) & ! !$omp parallel &
!$omp private(i,beta,j,k,a,b,tmp_oov) ! !$omp default(shared) &
allocate(tmp_oov(nO,nO,nV) ) ! !$omp private(i,beta,j,k,a,b,tmp_oov)
!$omp do ! allocate(tmp_oov(nO,nO,nV) )
! !$omp do
! do a = 1, nV
! do b=1,nV
! do j=1,nO
! do i=1,nO
! tmp_oov(i,j,b) = tau_x%f(i,j,a,b)
! enddo
! enddo
! enddo
! call dgemm('N','T',cholesky_mo_num,nO,nO*nV,1.d0, &
! d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov, nO, &
! 0.d0, tau_kia(1,1,a), cholesky_mo_num)
! enddo
! !$omp end do nowait
! deallocate(tmp_oov)
! !$omp do
! do beta = 1, nV
! do a = 1, nV
! H_vv%f(a,beta) = cc_space_f_vv(a,beta)
! enddo
! enddo
! !$omp end do nowait
! !$omp barrier
! !$omp end parallel
! call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
! tau_kia, cholesky_mo_num*nO, d_cc_space_v_ov_chol%f, cholesky_mo_num*nO, &
! 1.d0, H_vv%f, nV)
type(gpu_blas) :: blas
PROVIDE gpu_num
!$OMP PARALLEL &
!$OMP DEFAULT(SHARED) &
!$OMP PRIVATE(a,b,tmp_oov,blas)
!$OMP SINGLE
!$OMP TASK
call gpu_copy(d_cc_space_f_vv, H_vv)
!$OMP END TASK
!$OMP END SINGLE
call gpu_blas_create(blas)
call gpu_allocate(tmp_oov, nO, nO, nV)
!$OMP DO
do a = 1, nV do a = 1, nV
do b=1,nV do b=1,nV
do j=1,nO call gpu_dgeam_f(blas, 'N', 'N', nO, nO, 1.d0, &
do i=1,nO tau_x%f(1,1,a,b), nO, 0.d0, &
tmp_oov(i,j,b) = tau_x%f(i,j,a,b) tau_x%f(1,1,a,b), nO, tmp_oov%f(1,1,b), nO)
enddo
enddo
enddo enddo
call dgemm('N','T',cholesky_mo_num,nO,nO*nV,1.d0, & call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nO,nO*nV,1.d0, &
cc_space_v_ov_chol, cholesky_mo_num, tmp_oov, nO, & d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov%f, nO, &
0.d0, tau_kia(1,1,a), cholesky_mo_num) 0.d0, tau_kia%f(1,1,a), cholesky_mo_num)
enddo enddo
!$omp end do nowait !$OMP END DO
deallocate(tmp_oov)
!$omp do call gpu_blas_destroy(blas)
do beta = 1, nV
do a = 1, nV
H_vv%f(a,beta) = cc_space_f_vv(a,beta)
enddo
enddo
!$omp end do nowait
!$omp barrier
!$omp end parallel
call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
tau_kia, cholesky_mo_num*nO, cc_space_v_ov_chol, cholesky_mo_num*nO, &
1.d0, H_vv%f, nV)
call gpu_deallocate(tmp_oov)
!$OMP TASKWAIT
!$OMP END PARALLEL
call gpu_dgemm(blas_handle,'T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
tau_kia, cholesky_mo_num*nO, d_cc_space_v_ov_chol, cholesky_mo_num*nO, &
1.d0, H_vv, nV)
call gpu_deallocate(tau_kia)
end end
! H_vo ! H_vo

View File

@ -16,3 +16,11 @@ BEGIN_PROVIDER [ type(gpu_stream), gpu_default_stream ]
gpu_default_stream%c = C_NULL_PTR gpu_default_stream%c = C_NULL_PTR
END_PROVIDER END_PROVIDER
BEGIN_PROVIDER [ integer, gpu_num ]
implicit none
BEGIN_DOC
! Number of usable GPUs
END_DOC
gpu_num = gpu_ndevices()
END_PROVIDER