diff --git a/plugins/local/gpu_nvidia/gpu.c b/plugins/local/gpu_nvidia/gpu.c index 189de64c..39a82984 100644 --- a/plugins/local/gpu_nvidia/gpu.c +++ b/plugins/local/gpu_nvidia/gpu.c @@ -11,10 +11,6 @@ /* Generic functions */ -bool no_gpu() { - return false; -} - int gpu_ndevices() { int ngpus; cudaGetDeviceCount(&ngpus); @@ -35,13 +31,13 @@ void gpu_allocate(void** ptr, const int64_t size) { free = INT64_MAX; } - /* Use managed memory if it does not fit on the GPU */ - if (size < free && size < total/2) { + rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal); +// /* Use managed memory if it does not fit on the GPU */ +// if (size < free && size < total/2) { // rc= cudaMalloc(ptr, size); - rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal); - } else { - rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal); - } +// } else { +// rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal); +// } assert (rc == cudaSuccess); } diff --git a/plugins/local/gpu_x86/gpu.c b/plugins/local/gpu_x86/gpu.c index 53267a7c..dab23a25 100644 --- a/plugins/local/gpu_x86/gpu.c +++ b/plugins/local/gpu_x86/gpu.c @@ -5,14 +5,10 @@ #include #include -bool no_gpu() { - return true; -} - /* Generic functions */ int gpu_ndevices() { - return 1; + return 0; } void gpu_set_device(int32_t i) { diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 5ee7366e..0b3636ac 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -183,7 +183,8 @@ subroutine run_ccsd_space_orb if (do_mo_cholesky) then call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo) - call compute_H_vv_chol(nO,nV,tau_x,H_vv) + call compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, & + d_cc_space_v_ov_chol,H_vv) call compute_H_vo_chol(nO,nV,t1%f,H_vo%f) call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1) @@ -403,7 +404,7 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau) type(gpu_stream) :: stream(nV) - !$OMP PARALLEL if (no_gpu()) & + !$OMP PARALLEL if (gpu_num == 0) & !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) @@ -466,7 +467,7 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x) call gpu_stream_create(stream(a)) enddo - !$OMP PARALLEL if (no_gpu()) & + !$OMP PARALLEL if (gpu_num == 0) & !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index 288724f3..458016fb 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -344,48 +344,47 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & ! 1.d0, H_oo%f(1,1), nO) ! - type(gpu_stream) :: stream(nV) + type(gpu_blas) :: blas - do b=1,nV - call gpu_stream_create(stream(b)) - enddo - !$OMP PARALLEL if (no_gpu()) & + !$OMP PARALLEL & !$OMP DEFAULT(SHARED) & - !$OMP PRIVATE(u,b,tmp_vov,tmp_ovv) + !$OMP PRIVATE(blas,u,b,tmp_vov,tmp_ovv) + + !$OMP SINGLE + !$OMP TASK + call gpu_copy(d_cc_space_f_oo, H_oo) + !$OMP END TASK + !$OMP END SINGLE - call gpu_allocate(tmp_vov, nV, nO, nV) call gpu_allocate(tmp_ovv, nO, nV, nV) + call gpu_allocate(tmp_vov, nV, nO, nV) + + call gpu_blas_create(blas) !$OMP DO do u=1,nO - call gpu_dgeam_f(blas_handle, 'N', 'N', 1, nO*nV*nV, 1.d0, & + call gpu_dgeam_f(blas, 'N', 'N', 1, nO*nV*nV, 1.d0, & tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f, nO, tmp_ovv%f, 1) do b=1,nV - call gpu_set_stream(blas_handle,stream(b)) - call gpu_dgeam_f(blas_handle, 'T', 'T', nV, nO, 1.d0, & + call gpu_dgeam_f(blas, 'T', 'T', nV, nO, 1.d0, & tmp_ovv%f(1,1,b), nO, 0.d0, & tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV) enddo - call gpu_dgemm_f(blas_handle, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, & + call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, & d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_vov%f, nV, & 0.d0, tau_kau%f(1,1,u), cholesky_mo_num) - call gpu_synchronize() enddo !$OMP END DO + call gpu_blas_destroy(blas) + call gpu_deallocate(tmp_vov) call gpu_deallocate(tmp_ovv) + + !$OMP TASKWAIT !$OMP END PARALLEL - do b=1,nV - call gpu_stream_destroy(stream(b)) - enddo - - call gpu_set_stream(blas_handle,gpu_default_stream) - - call gpu_copy(d_cc_space_f_oo, H_oo) - call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & tau_kau, cholesky_mo_num*nV, d_cc_space_v_vo_chol, cholesky_mo_num*nV, & 1.d0, H_oo, nO) @@ -395,52 +394,97 @@ end ! H_vv -subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv) +subroutine compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, & + d_cc_space_v_ov_chol,H_vv) use gpu implicit none - integer, intent(in) :: nO,nV + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: d_cc_space_f_vv + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol type(gpu_double4), intent(in) :: tau_x type(gpu_double2), intent(out) :: H_vv integer :: a,b,i,j,u,k, beta - double precision, allocatable :: tau_kia(:,:,:), tmp_oov(:,:,:) + type(gpu_double3) :: tau_kia, tmp_oov - allocate(tau_kia(cholesky_mo_num,nO,nV)) - !$omp parallel & - !$omp default(shared) & - !$omp private(i,beta,j,k,a,b,tmp_oov) - allocate(tmp_oov(nO,nO,nV) ) - !$omp do + call gpu_allocate(tau_kia, cholesky_mo_num, nO, nV) + +! !$omp parallel & +! !$omp default(shared) & +! !$omp private(i,beta,j,k,a,b,tmp_oov) +! allocate(tmp_oov(nO,nO,nV) ) +! !$omp do +! do a = 1, nV +! do b=1,nV +! do j=1,nO +! do i=1,nO +! tmp_oov(i,j,b) = tau_x%f(i,j,a,b) +! enddo +! enddo +! enddo +! call dgemm('N','T',cholesky_mo_num,nO,nO*nV,1.d0, & +! d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov, nO, & +! 0.d0, tau_kia(1,1,a), cholesky_mo_num) +! enddo +! !$omp end do nowait +! deallocate(tmp_oov) + +! !$omp do +! do beta = 1, nV +! do a = 1, nV +! H_vv%f(a,beta) = cc_space_f_vv(a,beta) +! enddo +! enddo +! !$omp end do nowait +! !$omp barrier +! !$omp end parallel +! call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & +! tau_kia, cholesky_mo_num*nO, d_cc_space_v_ov_chol%f, cholesky_mo_num*nO, & +! 1.d0, H_vv%f, nV) + + type(gpu_blas) :: blas + + + PROVIDE gpu_num + !$OMP PARALLEL & + !$OMP DEFAULT(SHARED) & + !$OMP PRIVATE(a,b,tmp_oov,blas) + + !$OMP SINGLE + !$OMP TASK + call gpu_copy(d_cc_space_f_vv, H_vv) + !$OMP END TASK + !$OMP END SINGLE + + call gpu_blas_create(blas) + call gpu_allocate(tmp_oov, nO, nO, nV) + + !$OMP DO do a = 1, nV do b=1,nV - do j=1,nO - do i=1,nO - tmp_oov(i,j,b) = tau_x%f(i,j,a,b) - enddo - enddo + call gpu_dgeam_f(blas, 'N', 'N', nO, nO, 1.d0, & + tau_x%f(1,1,a,b), nO, 0.d0, & + tau_x%f(1,1,a,b), nO, tmp_oov%f(1,1,b), nO) enddo - call dgemm('N','T',cholesky_mo_num,nO,nO*nV,1.d0, & - cc_space_v_ov_chol, cholesky_mo_num, tmp_oov, nO, & - 0.d0, tau_kia(1,1,a), cholesky_mo_num) + call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nO,nO*nV,1.d0, & + d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov%f, nO, & + 0.d0, tau_kia%f(1,1,a), cholesky_mo_num) enddo - !$omp end do nowait - deallocate(tmp_oov) + !$OMP END DO - !$omp do - do beta = 1, nV - do a = 1, nV - H_vv%f(a,beta) = cc_space_f_vv(a,beta) - enddo - enddo - !$omp end do nowait - !$omp barrier - !$omp end parallel - call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & - tau_kia, cholesky_mo_num*nO, cc_space_v_ov_chol, cholesky_mo_num*nO, & - 1.d0, H_vv%f, nV) + call gpu_blas_destroy(blas) + call gpu_deallocate(tmp_oov) + !$OMP TASKWAIT + !$OMP END PARALLEL + + call gpu_dgemm(blas_handle,'T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & + tau_kia, cholesky_mo_num*nO, d_cc_space_v_ov_chol, cholesky_mo_num*nO, & + 1.d0, H_vv, nV) + + call gpu_deallocate(tau_kia) end ! H_vo diff --git a/src/gpu/gpu.irp.f b/src/gpu/gpu.irp.f index 6ad0a075..3b2feeb6 100644 --- a/src/gpu/gpu.irp.f +++ b/src/gpu/gpu.irp.f @@ -16,3 +16,11 @@ BEGIN_PROVIDER [ type(gpu_stream), gpu_default_stream ] gpu_default_stream%c = C_NULL_PTR END_PROVIDER +BEGIN_PROVIDER [ integer, gpu_num ] + implicit none + BEGIN_DOC + ! Number of usable GPUs + END_DOC + gpu_num = gpu_ndevices() +END_PROVIDER +