mirror of
https://github.com/QuantumPackage/qp2.git
synced 2025-01-05 02:48:49 +01:00
Introducing GPU in CCSD
This commit is contained in:
parent
6c02ac0f0b
commit
fa6d141949
@ -10,9 +10,9 @@ subroutine run_ccsd_space_orb
|
|||||||
double precision :: uncorr_energy,energy, max_elem, max_r, max_r1, max_r2,ta,tb
|
double precision :: uncorr_energy,energy, max_elem, max_r, max_r1, max_r2,ta,tb
|
||||||
logical :: not_converged
|
logical :: not_converged
|
||||||
|
|
||||||
double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:), tau(:,:,:,:), tau_x(:,:,:,:)
|
type(gpu_double4) :: t2, r2, tau, tau_x
|
||||||
double precision, allocatable :: t1(:,:), r1(:,:)
|
type(gpu_double2) :: t1, r1
|
||||||
double precision, pointer :: H_oo, H_vv, H_vo
|
type(gpu_double2) :: H_oo, H_vv, H_vo
|
||||||
|
|
||||||
double precision, allocatable :: all_err(:,:), all_t(:,:)
|
double precision, allocatable :: all_err(:,:), all_t(:,:)
|
||||||
integer, allocatable :: list_occ(:), list_vir(:)
|
integer, allocatable :: list_occ(:), list_vir(:)
|
||||||
@ -52,14 +52,15 @@ subroutine run_ccsd_space_orb
|
|||||||
!print*,'occ',list_occ
|
!print*,'occ',list_occ
|
||||||
!print*,'vir',list_vir
|
!print*,'vir',list_vir
|
||||||
|
|
||||||
allocate(t2(nO,nO,nV,nV), r2(nO,nO,nV,nV))
|
call gpu_allocate(t2, nO,nO,nV,nV)
|
||||||
allocate(tau(nO,nO,nV,nV))
|
call gpu_allocate(r2, nO,nO,nV,nV)
|
||||||
allocate(tau_x(nO,nO,nV,nV))
|
call gpu_allocate(tau, nO,nO,nV,nV)
|
||||||
allocate(t1(nO,nV), r1(nO,nV))
|
call gpu_allocate(tau_x, nO,nO,nV,nV)
|
||||||
|
call gpu_allocate(t1, nO,nV)
|
||||||
call gpu_allocate_double(H_oo, (/ nO, nO /) )
|
call gpu_allocate(r1, nO,nV)
|
||||||
call gpu_allocate_double(H_vv, (/ nV, nV /) )
|
call gpu_allocate(H_oo, nO, nO)
|
||||||
call gpu_allocate_double(H_vo, (/ nV, nO /) )
|
call gpu_allocate(H_vo, nV, nO)
|
||||||
|
call gpu_allocate(H_vv, nV, nV)
|
||||||
|
|
||||||
if (cc_update_method == 'diis') then
|
if (cc_update_method == 'diis') then
|
||||||
double precision :: rss, diis_mem, extra_mem
|
double precision :: rss, diis_mem, extra_mem
|
||||||
@ -101,14 +102,21 @@ subroutine run_ccsd_space_orb
|
|||||||
endif
|
endif
|
||||||
|
|
||||||
! Init
|
! Init
|
||||||
call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,t1)
|
double precision, allocatable :: h_t1(:,:), h_t2(:,:,:,:)
|
||||||
call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,t2)
|
allocate(h_t1(nO,nV), h_t2(nO,nO,nV,nV))
|
||||||
call update_tau_space(nO,nV,t1,t2,tau)
|
|
||||||
|
call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,h_t1)
|
||||||
|
call gpu_upload(h_t1, t1)
|
||||||
|
|
||||||
|
call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,h_t2)
|
||||||
|
call gpu_upload(h_t2, t2)
|
||||||
|
|
||||||
|
call update_tau_space(nO,nV,h_t1,t1,t2,tau)
|
||||||
call update_tau_x_space(nO,nV,tau,tau_x)
|
call update_tau_x_space(nO,nV,tau,tau_x)
|
||||||
!print*,'hf_energy', hf_energy
|
!print*,'hf_energy', hf_energy
|
||||||
call det_energy(det,uncorr_energy)
|
call det_energy(det,uncorr_energy)
|
||||||
print*,'Det energy', uncorr_energy
|
print*,'Det energy', uncorr_energy
|
||||||
call ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
|
call ccsd_energy_space_x(nO,nV,tau_x%f,t1%f,energy)
|
||||||
print*,'Guess energy', uncorr_energy+energy, energy
|
print*,'Guess energy', uncorr_energy+energy, energy
|
||||||
|
|
||||||
nb_iter = 0
|
nb_iter = 0
|
||||||
@ -127,40 +135,38 @@ subroutine run_ccsd_space_orb
|
|||||||
if (do_ao_cholesky) then
|
if (do_ao_cholesky) then
|
||||||
! if (.False.) then
|
! if (.False.) then
|
||||||
call compute_H_oo_chol(nO,nV,tau_x,H_oo)
|
call compute_H_oo_chol(nO,nV,tau_x,H_oo)
|
||||||
call compute_H_vv_chol(nO,nV,tau_x,H_vv)
|
call compute_H_vv_chol(nO,nV,tau_x%f,H_vv%f)
|
||||||
call compute_H_vo_chol(nO,nV,t1,H_vo)
|
call compute_H_vo_chol(nO,nV,t1%f,H_vo%f)
|
||||||
|
|
||||||
call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
|
call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1)
|
||||||
call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
|
call compute_r2_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r2%f,max_r2)
|
||||||
else
|
else
|
||||||
call compute_H_oo(nO,nV,t1,t2,tau,H_oo)
|
call compute_H_oo(nO,nV,t1%f,t2%f,tau%f,H_oo%f)
|
||||||
call compute_H_vv(nO,nV,t1,t2,tau,H_vv)
|
call compute_H_vv(nO,nV,t1%f,t2%f,tau%f,H_vv%f)
|
||||||
call compute_H_vo(nO,nV,t1,t2,H_vo)
|
call compute_H_vo(nO,nV,t1%f,t2%f,H_vo%f)
|
||||||
|
|
||||||
call compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
|
call compute_r1_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r1%f,max_r1)
|
||||||
call compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
|
call compute_r2_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r2%f,max_r2)
|
||||||
endif
|
endif
|
||||||
max_r = max(max_r1,max_r2)
|
max_r = max(max_r1,max_r2)
|
||||||
|
|
||||||
! Update
|
! Update
|
||||||
if (cc_update_method == 'diis') then
|
if (cc_update_method == 'diis') then
|
||||||
!call update_t_ccsd(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2)
|
call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1%f,r2%f,t1%f,t2%f,all_err,all_t)
|
||||||
!call update_t_ccsd_diis(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2)
|
|
||||||
call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1,r2,t1,t2,all_err,all_t)
|
|
||||||
|
|
||||||
! Standard update as T = T - Delta
|
! Standard update as T = T - Delta
|
||||||
elseif (cc_update_method == 'none') then
|
elseif (cc_update_method == 'none') then
|
||||||
call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1,t1)
|
call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1%f,t1%f)
|
||||||
call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2,t2)
|
call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2%f,t2%f)
|
||||||
else
|
else
|
||||||
print*,'Unkown cc_method_method: '//cc_update_method
|
print*,'Unkown cc_method_method: '//cc_update_method
|
||||||
endif
|
endif
|
||||||
|
|
||||||
call update_tau_space(nO,nV,t1,t2,tau)
|
call update_tau_space(nO,nV,t1%f,t1,t2,tau)
|
||||||
call update_tau_x_space(nO,nV,tau,tau_x)
|
call update_tau_x_space(nO,nV,tau,tau_x)
|
||||||
|
|
||||||
! Energy
|
! Energy
|
||||||
call ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
|
call ccsd_energy_space_x(nO,nV,tau_x%f,t1%f,energy)
|
||||||
write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |'
|
write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |'
|
||||||
|
|
||||||
nb_iter = nb_iter + 1
|
nb_iter = nb_iter + 1
|
||||||
@ -185,8 +191,8 @@ subroutine run_ccsd_space_orb
|
|||||||
print*,''
|
print*,''
|
||||||
|
|
||||||
if (write_amplitudes) then
|
if (write_amplitudes) then
|
||||||
call write_t1(nO,nV,t1)
|
call write_t1(nO,nV,t1%f)
|
||||||
call write_t2(nO,nV,t2)
|
call write_t2(nO,nV,t2%f)
|
||||||
call ezfio_set_utils_cc_io_amplitudes('Read')
|
call ezfio_set_utils_cc_io_amplitudes('Read')
|
||||||
endif
|
endif
|
||||||
|
|
||||||
@ -195,11 +201,14 @@ subroutine run_ccsd_space_orb
|
|||||||
deallocate(all_err,all_t)
|
deallocate(all_err,all_t)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
call gpu_deallocate_double(H_oo)
|
call gpu_deallocate(H_oo)
|
||||||
call gpu_deallocate_double(H_vv)
|
call gpu_deallocate(H_vv)
|
||||||
call gpu_deallocate_double(H_vo)
|
call gpu_deallocate(H_vo)
|
||||||
|
|
||||||
deallocate(r1,r2,tau)
|
call gpu_deallocate(r1)
|
||||||
|
call gpu_deallocate(r2)
|
||||||
|
call gpu_deallocate(tau)
|
||||||
|
call gpu_deallocate(tau_x)
|
||||||
|
|
||||||
! CCSD(T)
|
! CCSD(T)
|
||||||
double precision :: e_t, e_t_err
|
double precision :: e_t, e_t_err
|
||||||
@ -207,28 +216,14 @@ subroutine run_ccsd_space_orb
|
|||||||
|
|
||||||
if (cc_par_t .and. elec_alpha_num + elec_beta_num > 2) then
|
if (cc_par_t .and. elec_alpha_num + elec_beta_num > 2) then
|
||||||
|
|
||||||
! Dumb way
|
|
||||||
!call wall_time(ta)
|
|
||||||
!call ccsd_par_t_space(nO,nV,t1,t2,e_t)
|
|
||||||
!call wall_time(tb)
|
|
||||||
!print*,'Time: ',tb-ta, ' s'
|
|
||||||
|
|
||||||
!print*,''
|
|
||||||
!write(*,'(A15,F18.12,A3)') ' E(CCSD(T)) = ', uncorr_energy + energy + e_t, ' Ha'
|
|
||||||
!write(*,'(A15,F18.12,A3)') ' E(T) = ', e_t, ' Ha'
|
|
||||||
!write(*,'(A15,F18.12,A3)') ' Correlation = ', energy + e_t, ' Ha'
|
|
||||||
!print*,''
|
|
||||||
|
|
||||||
! New
|
! New
|
||||||
e_t = uncorr_energy + energy ! For print in (T) call
|
e_t = uncorr_energy + energy ! For print in (T) call
|
||||||
e_t_err = 0.d0
|
e_t_err = 0.d0
|
||||||
|
|
||||||
print*,'Computing (T) correction...'
|
print*,'Computing (T) correction...'
|
||||||
call wall_time(ta)
|
call wall_time(ta)
|
||||||
! call ccsd_par_t_space_v3(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v &
|
|
||||||
! ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t)
|
|
||||||
|
|
||||||
call ccsd_par_t_space_stoch(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v &
|
call ccsd_par_t_space_stoch(nO,nV,t1%f,t2%f,cc_space_f_o,cc_space_f_v &
|
||||||
,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t, e_t_err)
|
,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t, e_t_err)
|
||||||
|
|
||||||
call wall_time(tb)
|
call wall_time(tb)
|
||||||
@ -243,7 +238,9 @@ subroutine run_ccsd_space_orb
|
|||||||
|
|
||||||
call save_energy(uncorr_energy + energy, e_t)
|
call save_energy(uncorr_energy + energy, e_t)
|
||||||
|
|
||||||
deallocate(t1,t2)
|
deallocate(h_t1, h_t2)
|
||||||
|
call gpu_deallocate(t1)
|
||||||
|
call gpu_deallocate(t2)
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
@ -341,70 +338,139 @@ end
|
|||||||
|
|
||||||
! Tau
|
! Tau
|
||||||
|
|
||||||
subroutine update_tau_space(nO,nV,t1,t2,tau)
|
subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau)
|
||||||
|
use gpu
|
||||||
implicit none
|
implicit none
|
||||||
|
|
||||||
! in
|
! in
|
||||||
integer, intent(in) :: nO, nV
|
integer, intent(in) :: nO, nV
|
||||||
double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV)
|
double precision, intent(in) :: h_t1(nO,nV)
|
||||||
|
type(gpu_double2), intent(in) :: t1
|
||||||
|
type(gpu_double4), intent(in) :: t2
|
||||||
|
|
||||||
! out
|
! out
|
||||||
double precision, intent(out) :: tau(nO,nO,nV,nV)
|
type(gpu_double4) :: tau
|
||||||
|
|
||||||
! internal
|
! internal
|
||||||
integer :: i,j,a,b
|
integer :: i,j,a,b
|
||||||
|
|
||||||
|
! !$OMP PARALLEL &
|
||||||
|
! !$OMP SHARED(nO,nV,tau,t2,t1,h_t1) &
|
||||||
|
! !$OMP PRIVATE(i,j,a,b) &
|
||||||
|
! !$OMP DEFAULT(NONE)
|
||||||
|
! !$OMP DO
|
||||||
|
! do b = 1, nV
|
||||||
|
! do a = 1, nV
|
||||||
|
! do j = 1, nO
|
||||||
|
! do i = 1, nO
|
||||||
|
! tau%f(i,j,a,b) = t2%f(i,j,a,b) + t1%f(i,a) * h_t1(j,b)
|
||||||
|
! enddo
|
||||||
|
! enddo
|
||||||
|
! enddo
|
||||||
|
! enddo
|
||||||
|
! !$OMP END DO
|
||||||
|
! !$OMP END PARALLEL
|
||||||
|
|
||||||
|
|
||||||
|
type(gpu_blas) :: blas
|
||||||
|
type(gpu_stream) :: stream(nV)
|
||||||
|
|
||||||
|
call gpu_blas_create(blas)
|
||||||
|
do b=1,nV
|
||||||
|
call gpu_stream_create(stream(b))
|
||||||
|
enddo
|
||||||
|
|
||||||
!$OMP PARALLEL &
|
!$OMP PARALLEL &
|
||||||
!$OMP SHARED(nO,nV,tau,t2,t1) &
|
!$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas) &
|
||||||
!$OMP PRIVATE(i,j,a,b) &
|
!$OMP PRIVATE(i,j,a,b) &
|
||||||
!$OMP DEFAULT(NONE)
|
!$OMP DEFAULT(NONE)
|
||||||
|
do j=1,nO
|
||||||
!$OMP DO
|
!$OMP DO
|
||||||
do b=1,nV
|
do b=1,nV
|
||||||
do a = 1, nV
|
call gpu_set_stream(blas,stream(b))
|
||||||
do j = 1, nO
|
call gpu_dgeam_c(blas%c, 'N', 'N', nO*1_8, nV*1_8, &
|
||||||
do i = 1, nO
|
1.d0, c_loc(t2%f(1,j,1,b)), nO*nO*1_8, &
|
||||||
tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b)
|
h_t1(j,b), t1%c, nO*1_8, &
|
||||||
enddo
|
c_loc(tau%f(1,j,1,b)), nO*nO*1_8)
|
||||||
enddo
|
|
||||||
enddo
|
|
||||||
enddo
|
enddo
|
||||||
!$OMP END DO
|
!$OMP END DO
|
||||||
|
enddo
|
||||||
!$OMP END PARALLEL
|
!$OMP END PARALLEL
|
||||||
|
|
||||||
|
call gpu_synchronize()
|
||||||
|
|
||||||
|
do b=1,nV
|
||||||
|
call gpu_stream_destroy(stream(b))
|
||||||
|
enddo
|
||||||
|
|
||||||
|
call gpu_blas_destroy(blas)
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
subroutine update_tau_x_space(nO,nV,tau,tau_x)
|
subroutine update_tau_x_space(nO,nV,tau,tau_x)
|
||||||
|
use gpu
|
||||||
implicit none
|
implicit none
|
||||||
|
|
||||||
! in
|
! in
|
||||||
integer, intent(in) :: nO, nV
|
integer, intent(in) :: nO, nV
|
||||||
double precision, intent(in) :: tau(nO,nO,nV,nV)
|
type(gpu_double4), intent(in) :: tau
|
||||||
|
|
||||||
! out
|
! out
|
||||||
double precision, intent(out) :: tau_x(nO,nO,nV,nV)
|
type(gpu_double4) :: tau_x
|
||||||
|
|
||||||
! internal
|
! internal
|
||||||
integer :: i,j,a,b
|
integer :: i,j,a,b
|
||||||
|
|
||||||
|
! !$OMP PARALLEL &
|
||||||
|
! !$OMP SHARED(nO,nV,tau,tau_x) &
|
||||||
|
! !$OMP PRIVATE(i,j,a,b) &
|
||||||
|
! !$OMP DEFAULT(NONE)
|
||||||
|
! !$OMP DO
|
||||||
|
! do b = 1, nV
|
||||||
|
! do a = 1, nV
|
||||||
|
! do j = 1, nO
|
||||||
|
! do i = 1, nO
|
||||||
|
! tau_x%f(i,j,a,b) = 2.d0*tau%f(i,j,a,b) - tau%f(i,j,b,a)
|
||||||
|
! enddo
|
||||||
|
! enddo
|
||||||
|
! enddo
|
||||||
|
! enddo
|
||||||
|
! !$OMP END DO
|
||||||
|
! !$OMP END PARALLEL
|
||||||
|
|
||||||
|
type(gpu_blas) :: blas
|
||||||
|
type(gpu_stream) :: stream(nV)
|
||||||
|
|
||||||
|
call gpu_blas_create(blas)
|
||||||
|
do a=1,nV
|
||||||
|
call gpu_stream_create(stream(a))
|
||||||
|
enddo
|
||||||
|
|
||||||
!$OMP PARALLEL &
|
!$OMP PARALLEL &
|
||||||
!$OMP SHARED(nO,nV,tau,tau_x) &
|
!$OMP SHARED(nO,nV,tau,tau_x,stream,blas) &
|
||||||
!$OMP PRIVATE(i,j,a,b) &
|
!$OMP PRIVATE(i,j,a,b) &
|
||||||
!$OMP DEFAULT(NONE)
|
!$OMP DEFAULT(NONE)
|
||||||
!$OMP DO
|
!$OMP DO
|
||||||
do b=1,nV
|
do b=1,nV
|
||||||
do a=1,nV
|
do a=1,nV
|
||||||
do j = 1, nO
|
call gpu_set_stream(blas,stream(a))
|
||||||
do i = 1, nO
|
call gpu_dgeam_c(blas%c, 'N', 'N', nO*1_8, nO*1_8, &
|
||||||
tau_x(i,j,a,b) = 2.d0*tau(i,j,a,b) - tau(i,j,b,a)
|
2.d0, c_loc(tau%f(1,1,a,b)), nO*1_8, &
|
||||||
enddo
|
-1.d0, c_loc(tau%f(1,1,b,a)), nO*1_8, &
|
||||||
enddo
|
c_loc(tau_x%f(1,1,a,b)), nO*1_8)
|
||||||
enddo
|
enddo
|
||||||
enddo
|
enddo
|
||||||
!$OMP END DO
|
!$OMP END DO
|
||||||
!$OMP END PARALLEL
|
!$OMP END PARALLEL
|
||||||
|
|
||||||
|
call gpu_synchronize()
|
||||||
|
|
||||||
|
do b=1,nV
|
||||||
|
call gpu_stream_destroy(stream(b))
|
||||||
|
enddo
|
||||||
|
|
||||||
|
call gpu_blas_destroy(blas)
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
! R1
|
! R1
|
||||||
|
@ -294,12 +294,12 @@ end
|
|||||||
! H_oo
|
! H_oo
|
||||||
|
|
||||||
subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo)
|
subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo)
|
||||||
|
use gpu
|
||||||
implicit none
|
implicit none
|
||||||
|
|
||||||
integer, intent(in) :: nO,nV
|
integer, intent(in) :: nO,nV
|
||||||
double precision, intent(in) :: tau_x(nO, nO, nV, nV)
|
type(gpu_double4), intent(in) :: tau_x
|
||||||
double precision, intent(out) :: H_oo(nO, nO)
|
type(gpu_double2), intent(out) :: H_oo
|
||||||
|
|
||||||
integer :: a,b,i,j,u,k
|
integer :: a,b,i,j,u,k
|
||||||
|
|
||||||
@ -315,7 +315,7 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo)
|
|||||||
do b=1,nV
|
do b=1,nV
|
||||||
do j=1,nO
|
do j=1,nO
|
||||||
do a=1,nV
|
do a=1,nV
|
||||||
tmp_vov(a,j,b) = tau_x(u,j,a,b)
|
tmp_vov(a,j,b) = tau_x%f(u,j,a,b)
|
||||||
enddo
|
enddo
|
||||||
enddo
|
enddo
|
||||||
enddo
|
enddo
|
||||||
@ -328,7 +328,7 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo)
|
|||||||
!$omp do
|
!$omp do
|
||||||
do i = 1, nO
|
do i = 1, nO
|
||||||
do u = 1, nO
|
do u = 1, nO
|
||||||
H_oo(u,i) = cc_space_f_oo(u,i)
|
H_oo%f(u,i) = cc_space_f_oo(u,i)
|
||||||
enddo
|
enddo
|
||||||
enddo
|
enddo
|
||||||
!$omp end do nowait
|
!$omp end do nowait
|
||||||
@ -336,7 +336,7 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo)
|
|||||||
!$omp end parallel
|
!$omp end parallel
|
||||||
call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
|
call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
|
||||||
tau_kau, cholesky_mo_num*nV, cc_space_v_vo_chol, cholesky_mo_num*nV, &
|
tau_kau, cholesky_mo_num*nV, cc_space_v_vo_chol, cholesky_mo_num*nV, &
|
||||||
1.d0, H_oo, nO)
|
1.d0, H_oo%f, nO)
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -2,6 +2,52 @@ module gpu
|
|||||||
use, intrinsic :: iso_c_binding
|
use, intrinsic :: iso_c_binding
|
||||||
implicit none
|
implicit none
|
||||||
|
|
||||||
|
! Data types
|
||||||
|
! ----------
|
||||||
|
|
||||||
|
type gpu_double1
|
||||||
|
type(c_ptr) :: c
|
||||||
|
double precision, pointer :: f(:)
|
||||||
|
end type
|
||||||
|
|
||||||
|
type gpu_double2
|
||||||
|
type(c_ptr) :: c
|
||||||
|
double precision, pointer :: f(:,:)
|
||||||
|
end type
|
||||||
|
|
||||||
|
type gpu_double3
|
||||||
|
type(c_ptr) :: c
|
||||||
|
double precision, pointer :: f(:,:,:)
|
||||||
|
end type
|
||||||
|
|
||||||
|
type gpu_double4
|
||||||
|
type(c_ptr) :: c
|
||||||
|
double precision, pointer :: f(:,:,:,:)
|
||||||
|
end type
|
||||||
|
|
||||||
|
type gpu_double5
|
||||||
|
type(c_ptr) :: c
|
||||||
|
double precision, pointer :: f(:,:,:,:,:)
|
||||||
|
end type
|
||||||
|
|
||||||
|
type gpu_double6
|
||||||
|
type(c_ptr) :: c
|
||||||
|
double precision, pointer :: f(:,:,:,:,:,:)
|
||||||
|
end type
|
||||||
|
|
||||||
|
|
||||||
|
type gpu_blas
|
||||||
|
type(c_ptr) :: c
|
||||||
|
end type
|
||||||
|
|
||||||
|
type gpu_stream
|
||||||
|
type(c_ptr) :: c
|
||||||
|
end type
|
||||||
|
|
||||||
|
|
||||||
|
! C interfaces
|
||||||
|
! ------------
|
||||||
|
|
||||||
interface
|
interface
|
||||||
integer function gpu_ndevices() bind(C)
|
integer function gpu_ndevices() bind(C)
|
||||||
end function
|
end function
|
||||||
@ -43,100 +89,394 @@ module gpu
|
|||||||
integer(c_int64_t), value :: n
|
integer(c_int64_t), value :: n
|
||||||
end subroutine
|
end subroutine
|
||||||
|
|
||||||
subroutine gpu_stream_create(stream) bind(C)
|
subroutine gpu_stream_create_c(stream) bind(C, name='gpu_stream_create')
|
||||||
import
|
import
|
||||||
type(c_ptr) :: stream
|
type(c_ptr) :: stream
|
||||||
end subroutine
|
end subroutine
|
||||||
|
|
||||||
subroutine gpu_stream_destroy(stream) bind(C)
|
subroutine gpu_stream_destroy_c(stream) bind(C, name='gpu_stream_destroy')
|
||||||
import
|
import
|
||||||
type(c_ptr) :: stream
|
type(c_ptr) :: stream
|
||||||
end subroutine
|
end subroutine
|
||||||
|
|
||||||
subroutine gpu_set_stream(handle, stream) bind(C)
|
subroutine gpu_set_stream_c(handle, stream) bind(C, name='gpu_set_stream')
|
||||||
import
|
import
|
||||||
type(c_ptr) :: handle, stream
|
type(c_ptr) :: handle, stream
|
||||||
end subroutine
|
end subroutine
|
||||||
|
|
||||||
subroutine gpu_synchronize()
|
subroutine gpu_synchronize() bind(C)
|
||||||
|
import
|
||||||
end subroutine
|
end subroutine
|
||||||
|
|
||||||
subroutine gpu_blas_create(handle) bind(C)
|
subroutine gpu_blas_create_c(handle) bind(C, name='gpu_blas_create')
|
||||||
import
|
import
|
||||||
type(c_ptr) :: handle
|
type(c_ptr) :: handle
|
||||||
end subroutine
|
end subroutine
|
||||||
|
|
||||||
subroutine gpu_blas_destroy(handle) bind(C)
|
subroutine gpu_blas_destroy_c(handle) bind(C, name='gpu_blas_destroy')
|
||||||
import
|
import
|
||||||
type(c_ptr) :: handle
|
type(c_ptr) :: handle
|
||||||
end subroutine
|
end subroutine
|
||||||
|
|
||||||
subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) bind(C)
|
subroutine gpu_ddot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_ddot')
|
||||||
import
|
import
|
||||||
type(c_ptr), intent(in) :: handle
|
type(c_ptr), intent(in), value :: handle
|
||||||
integer(c_int64_t), value :: n, incx, incy
|
integer(c_int64_t), value :: n, incx, incy
|
||||||
real(c_double), intent(in) :: dx(*), dy(*)
|
type(c_ptr), intent(in), value :: dx, dy
|
||||||
real(c_double), intent(out) :: res
|
real(c_double), intent(out) :: res
|
||||||
end subroutine
|
end subroutine
|
||||||
|
|
||||||
subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res) bind(C)
|
subroutine gpu_sdot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_sdot')
|
||||||
import
|
import
|
||||||
type(c_ptr), intent(in) :: handle
|
type(c_ptr), intent(in), value :: handle
|
||||||
integer(c_int64_t), value :: n, incx, incy
|
integer(c_int64_t), value :: n, incx, incy
|
||||||
real(c_float), intent(in) :: dx(*), dy(*)
|
type(c_ptr), intent(in), value :: dx, dy
|
||||||
real(c_float), intent(out) :: res
|
real(c_float), intent(out) :: res
|
||||||
end subroutine
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
|
||||||
|
b, ldb, c, ldc) bind(C, name='gpu_dgeam')
|
||||||
|
import
|
||||||
|
type(c_ptr), intent(in), value :: handle
|
||||||
|
character(c_char), intent(in), value :: transa, transb
|
||||||
|
integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
|
||||||
|
real(c_double), intent(in), value :: alpha, beta
|
||||||
|
type(c_ptr), value :: a, b, c
|
||||||
|
end subroutine
|
||||||
|
|
||||||
end interface
|
end interface
|
||||||
|
|
||||||
|
|
||||||
|
! Polymorphic interfaces
|
||||||
|
! ----------------------
|
||||||
|
|
||||||
|
interface gpu_allocate
|
||||||
|
procedure gpu_allocate_double1 &
|
||||||
|
,gpu_allocate_double2 &
|
||||||
|
,gpu_allocate_double3 &
|
||||||
|
,gpu_allocate_double4 &
|
||||||
|
,gpu_allocate_double5 &
|
||||||
|
,gpu_allocate_double6
|
||||||
|
end interface gpu_allocate
|
||||||
|
|
||||||
|
interface gpu_deallocate
|
||||||
|
procedure gpu_deallocate_double1 &
|
||||||
|
,gpu_deallocate_double2 &
|
||||||
|
,gpu_deallocate_double3 &
|
||||||
|
,gpu_deallocate_double4 &
|
||||||
|
,gpu_deallocate_double5 &
|
||||||
|
,gpu_deallocate_double6
|
||||||
|
end interface gpu_deallocate
|
||||||
|
|
||||||
|
interface gpu_upload
|
||||||
|
procedure gpu_upload_double1 &
|
||||||
|
,gpu_upload_double2 &
|
||||||
|
,gpu_upload_double3 &
|
||||||
|
,gpu_upload_double4 &
|
||||||
|
,gpu_upload_double5 &
|
||||||
|
,gpu_upload_double6
|
||||||
|
end interface gpu_upload
|
||||||
|
|
||||||
|
interface gpu_download
|
||||||
|
procedure gpu_download_double1 &
|
||||||
|
,gpu_download_double2 &
|
||||||
|
,gpu_download_double3 &
|
||||||
|
,gpu_download_double4 &
|
||||||
|
,gpu_download_double5 &
|
||||||
|
,gpu_download_double6
|
||||||
|
end interface gpu_download
|
||||||
|
|
||||||
|
interface gpu_copy
|
||||||
|
procedure gpu_copy_double1 &
|
||||||
|
,gpu_copy_double2 &
|
||||||
|
,gpu_copy_double3 &
|
||||||
|
,gpu_copy_double4 &
|
||||||
|
,gpu_copy_double5 &
|
||||||
|
,gpu_copy_double6
|
||||||
|
end interface gpu_copy
|
||||||
|
|
||||||
|
|
||||||
contains
|
contains
|
||||||
|
|
||||||
|
|
||||||
subroutine gpu_allocate_double(ptr, s)
|
! gpu_allocate
|
||||||
implicit none
|
! ------------
|
||||||
double precision, pointer, intent(inout) :: ptr
|
|
||||||
integer, intent(in) :: s(:)
|
|
||||||
type(c_ptr) :: cptr
|
|
||||||
|
|
||||||
call gpu_allocate_c(cptr, sum(s*1_8)*8_8)
|
subroutine gpu_allocate_double1(ptr, s)
|
||||||
call c_f_pointer(cptr, ptr, s)
|
implicit none
|
||||||
|
type(gpu_double1), intent(inout) :: ptr
|
||||||
|
integer, intent(in) :: s
|
||||||
|
|
||||||
|
call gpu_allocate_c(ptr%c, s*8_8)
|
||||||
|
call c_f_pointer(ptr%c, ptr%f, (/ s /))
|
||||||
end subroutine
|
end subroutine
|
||||||
|
|
||||||
subroutine gpu_deallocate_double(ptr)
|
subroutine gpu_allocate_double2(ptr, s1, s2)
|
||||||
implicit none
|
implicit none
|
||||||
double precision, pointer, intent(inout) :: ptr
|
type(gpu_double2), intent(inout) :: ptr
|
||||||
type(c_ptr) :: cptr
|
integer, intent(in) :: s1, s2
|
||||||
cptr = c_loc(ptr)
|
|
||||||
call gpu_deallocate_c(cptr)
|
call gpu_allocate_c(ptr%c, s1*s2*8_8)
|
||||||
NULLIFY(ptr)
|
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /))
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_allocate_double3(ptr, s1, s2, s3)
|
||||||
|
implicit none
|
||||||
|
type(gpu_double3), intent(inout) :: ptr
|
||||||
|
integer, intent(in) :: s1, s2, s3
|
||||||
|
|
||||||
|
call gpu_allocate_c(ptr%c, s1*s2*s3*8_8)
|
||||||
|
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /))
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_allocate_double4(ptr, s1, s2, s3, s4)
|
||||||
|
implicit none
|
||||||
|
type(gpu_double4), intent(inout) :: ptr
|
||||||
|
integer, intent(in) :: s1, s2, s3, s4
|
||||||
|
|
||||||
|
call gpu_allocate_c(ptr%c, s1*s2*s3*s4*8_8)
|
||||||
|
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /))
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_allocate_double5(ptr, s1, s2, s3, s4, s5)
|
||||||
|
implicit none
|
||||||
|
type(gpu_double5), intent(inout) :: ptr
|
||||||
|
integer, intent(in) :: s1, s2, s3, s4, s5
|
||||||
|
|
||||||
|
call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*8_8)
|
||||||
|
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /))
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_allocate_double6(ptr, s1, s2, s3, s4, s5, s6)
|
||||||
|
implicit none
|
||||||
|
type(gpu_double6), intent(inout) :: ptr
|
||||||
|
integer, intent(in) :: s1, s2, s3, s4, s5, s6
|
||||||
|
|
||||||
|
call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*s6*8_8)
|
||||||
|
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /))
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
|
||||||
|
! gpu_deallocate
|
||||||
|
! --------------
|
||||||
|
|
||||||
|
subroutine gpu_deallocate_double1(ptr)
|
||||||
|
implicit none
|
||||||
|
type(gpu_double1), intent(inout) :: ptr
|
||||||
|
call gpu_deallocate_c(ptr%c)
|
||||||
|
NULLIFY(ptr%f)
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_deallocate_double2(ptr)
|
||||||
|
implicit none
|
||||||
|
type(gpu_double2), intent(inout) :: ptr
|
||||||
|
call gpu_deallocate_c(ptr%c)
|
||||||
|
NULLIFY(ptr%f)
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_deallocate_double3(ptr)
|
||||||
|
implicit none
|
||||||
|
type(gpu_double3), intent(inout) :: ptr
|
||||||
|
call gpu_deallocate_c(ptr%c)
|
||||||
|
NULLIFY(ptr%f)
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_deallocate_double4(ptr)
|
||||||
|
implicit none
|
||||||
|
type(gpu_double4), intent(inout) :: ptr
|
||||||
|
call gpu_deallocate_c(ptr%c)
|
||||||
|
NULLIFY(ptr%f)
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_deallocate_double5(ptr)
|
||||||
|
implicit none
|
||||||
|
type(gpu_double5), intent(inout) :: ptr
|
||||||
|
call gpu_deallocate_c(ptr%c)
|
||||||
|
NULLIFY(ptr%f)
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_deallocate_double6(ptr)
|
||||||
|
implicit none
|
||||||
|
type(gpu_double6), intent(inout) :: ptr
|
||||||
|
call gpu_deallocate_c(ptr%c)
|
||||||
|
NULLIFY(ptr%f)
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
|
||||||
|
! gpu_upload
|
||||||
|
! ----------
|
||||||
|
|
||||||
|
subroutine gpu_upload_double1(cpu_ptr, gpu_ptr)
|
||||||
|
implicit none
|
||||||
|
double precision, intent(in) :: cpu_ptr(:)
|
||||||
|
type(gpu_double1), intent(in) :: gpu_ptr
|
||||||
|
call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, 8_8*size(gpu_ptr%f))
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_upload_double2(cpu_ptr, gpu_ptr)
|
||||||
|
implicit none
|
||||||
|
double precision, intent(in) :: cpu_ptr(:,:)
|
||||||
|
type(gpu_double2), intent(in) :: gpu_ptr
|
||||||
|
call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_upload_double3(cpu_ptr, gpu_ptr)
|
||||||
|
implicit none
|
||||||
|
double precision, intent(in) :: cpu_ptr(:,:,:)
|
||||||
|
type(gpu_double3), intent(in) :: gpu_ptr
|
||||||
|
call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_upload_double4(cpu_ptr, gpu_ptr)
|
||||||
|
implicit none
|
||||||
|
double precision, intent(in) :: cpu_ptr(:,:,:,:)
|
||||||
|
type(gpu_double4), intent(in) :: gpu_ptr
|
||||||
|
call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_upload_double5(cpu_ptr, gpu_ptr)
|
||||||
|
implicit none
|
||||||
|
double precision, intent(in) :: cpu_ptr(:,:,:,:,:)
|
||||||
|
type(gpu_double5), intent(in) :: gpu_ptr
|
||||||
|
call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_upload_double6(cpu_ptr, gpu_ptr)
|
||||||
|
implicit none
|
||||||
|
double precision, intent(in) :: cpu_ptr(:,:,:,:,:,:)
|
||||||
|
type(gpu_double6), intent(in) :: gpu_ptr
|
||||||
|
call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
|
||||||
|
! gpu_download
|
||||||
|
! ------------
|
||||||
|
|
||||||
|
subroutine gpu_download_double1(gpu_ptr, cpu_ptr)
|
||||||
|
implicit none
|
||||||
|
type(gpu_double1), intent(in) :: gpu_ptr
|
||||||
|
double precision, intent(in) :: cpu_ptr(:)
|
||||||
|
call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*size(gpu_ptr%f))
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_download_double2(gpu_ptr, cpu_ptr)
|
||||||
|
implicit none
|
||||||
|
type(gpu_double2), intent(in) :: gpu_ptr
|
||||||
|
double precision, intent(in) :: cpu_ptr(:,:)
|
||||||
|
call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_download_double3(gpu_ptr, cpu_ptr)
|
||||||
|
implicit none
|
||||||
|
type(gpu_double3), intent(in) :: gpu_ptr
|
||||||
|
double precision, intent(in) :: cpu_ptr(:,:,:)
|
||||||
|
call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_download_double4(gpu_ptr, cpu_ptr)
|
||||||
|
implicit none
|
||||||
|
type(gpu_double4), intent(in) :: gpu_ptr
|
||||||
|
double precision, intent(in) :: cpu_ptr(:,:,:,:)
|
||||||
|
call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_download_double5(gpu_ptr, cpu_ptr)
|
||||||
|
implicit none
|
||||||
|
type(gpu_double5), intent(in) :: gpu_ptr
|
||||||
|
double precision, intent(in) :: cpu_ptr(:,:,:,:,:)
|
||||||
|
call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_download_double6(gpu_ptr, cpu_ptr)
|
||||||
|
implicit none
|
||||||
|
type(gpu_double6), intent(in) :: gpu_ptr
|
||||||
|
double precision, intent(in) :: cpu_ptr(:,:,:,:,:,:)
|
||||||
|
call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
! gpu_copy
|
||||||
|
! --------
|
||||||
|
|
||||||
|
subroutine gpu_copy_double1(gpu_ptr_src, gpu_ptr_dest)
|
||||||
|
implicit none
|
||||||
|
type(gpu_double1), intent(in) :: gpu_ptr_src
|
||||||
|
type(gpu_double1), intent(in) :: gpu_ptr_dest
|
||||||
|
call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*size(gpu_ptr_dest%f))
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_copy_double2(gpu_ptr_src, gpu_ptr_dest)
|
||||||
|
implicit none
|
||||||
|
type(gpu_double2), intent(in) :: gpu_ptr_src
|
||||||
|
type(gpu_double2), intent(in) :: gpu_ptr_dest
|
||||||
|
call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_copy_double3(gpu_ptr_src, gpu_ptr_dest)
|
||||||
|
implicit none
|
||||||
|
type(gpu_double3), intent(in) :: gpu_ptr_src
|
||||||
|
type(gpu_double3), intent(in) :: gpu_ptr_dest
|
||||||
|
call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_copy_double4(gpu_ptr_src, gpu_ptr_dest)
|
||||||
|
implicit none
|
||||||
|
type(gpu_double4), intent(in) :: gpu_ptr_src
|
||||||
|
type(gpu_double4), intent(in) :: gpu_ptr_dest
|
||||||
|
call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_copy_double5(gpu_ptr_src, gpu_ptr_dest)
|
||||||
|
implicit none
|
||||||
|
type(gpu_double5), intent(in) :: gpu_ptr_src
|
||||||
|
type(gpu_double5), intent(in) :: gpu_ptr_dest
|
||||||
|
call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_copy_double6(gpu_ptr_src, gpu_ptr_dest)
|
||||||
|
implicit none
|
||||||
|
type(gpu_double6), intent(in) :: gpu_ptr_src
|
||||||
|
type(gpu_double6), intent(in) :: gpu_ptr_dest
|
||||||
|
call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
|
||||||
|
! gpu_stream
|
||||||
|
! ----------
|
||||||
|
|
||||||
|
subroutine gpu_stream_create(stream)
|
||||||
|
import
|
||||||
|
type(gpu_stream) :: stream
|
||||||
|
call gpu_stream_create_c(stream%c)
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_stream_destroy(stream)
|
||||||
|
import
|
||||||
|
type(gpu_stream) :: stream
|
||||||
|
call gpu_stream_destroy_c(stream%c)
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_set_stream(handle, stream)
|
||||||
|
import
|
||||||
|
type(gpu_blas) :: handle
|
||||||
|
type(gpu_stream) :: stream
|
||||||
|
call gpu_set_stream_c(handle%c, stream%c)
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
|
||||||
|
! gpu_blas
|
||||||
|
! --------
|
||||||
|
|
||||||
|
subroutine gpu_blas_create(handle)
|
||||||
|
import
|
||||||
|
type(gpu_blas) :: handle
|
||||||
|
call gpu_blas_create_c(handle%c)
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_blas_destroy(handle)
|
||||||
|
import
|
||||||
|
type(gpu_blas) :: handle
|
||||||
|
call gpu_blas_destroy_c(handle%c)
|
||||||
end subroutine
|
end subroutine
|
||||||
|
|
||||||
end module
|
end module
|
||||||
|
|
||||||
subroutine gpu_upload_double(cpu_ptr, gpu_ptr, n)
|
|
||||||
use gpu
|
|
||||||
implicit none
|
|
||||||
double precision, intent(in) :: cpu_ptr(*)
|
|
||||||
double precision, intent(in) :: gpu_ptr(*)
|
|
||||||
integer(c_int64_t), intent(in) :: n
|
|
||||||
call gpu_upload_c(c_loc(cpu_ptr), c_loc(gpu_ptr), 8_8*n)
|
|
||||||
end subroutine
|
|
||||||
|
|
||||||
subroutine gpu_download_double(gpu_ptr, cpu_ptr, n)
|
|
||||||
use gpu
|
|
||||||
implicit none
|
|
||||||
double precision, intent(in) :: gpu_ptr(*)
|
|
||||||
double precision, intent(in) :: cpu_ptr(*)
|
|
||||||
integer(c_int64_t), intent(in) :: n
|
|
||||||
call gpu_download_c(c_loc(gpu_ptr), c_loc(cpu_ptr), 8_8*n)
|
|
||||||
end subroutine
|
|
||||||
|
|
||||||
subroutine gpu_copy_double(gpu_ptr_src, gpu_ptr_dest, n)
|
|
||||||
use gpu
|
|
||||||
implicit none
|
|
||||||
double precision, intent(in) :: gpu_ptr_src(*)
|
|
||||||
double precision, intent(in) :: gpu_ptr_dest(*)
|
|
||||||
integer(c_int64_t), intent(in) :: n
|
|
||||||
call gpu_copy_c(c_loc(gpu_ptr_src), c_loc(gpu_ptr_dest), 8_8*n)
|
|
||||||
end subroutine
|
|
||||||
|
|
||||||
|
@ -251,7 +251,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
|
|||||||
if (alpha == 0.) {
|
if (alpha == 0.) {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<n ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = beta * b[j*ldb+i];
|
c[j*ldc+i] = beta * b[j*ldb+i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -259,7 +259,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
|
|||||||
} else if (beta == 0.) {
|
} else if (beta == 0.) {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<n ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = alpha * a[j*lda+i];
|
c[j*ldc+i] = alpha * a[j*lda+i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -267,7 +267,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
|
|||||||
} else {
|
} else {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<n ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[j*ldb+i];
|
c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[j*ldb+i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -282,7 +282,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
|
|||||||
if (alpha == 0.) {
|
if (alpha == 0.) {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<n ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = beta * b[i*ldb+j];
|
c[j*ldc+i] = beta * b[i*ldb+j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -290,7 +290,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
|
|||||||
} else if (beta == 0.) {
|
} else if (beta == 0.) {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<n ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = alpha * a[j*lda+i];
|
c[j*ldc+i] = alpha * a[j*lda+i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -298,7 +298,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
|
|||||||
} else {
|
} else {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<n ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[i*ldb+j];
|
c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[i*ldb+j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -313,7 +313,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
|
|||||||
if (alpha == 0.) {
|
if (alpha == 0.) {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<n ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = beta * b[j*ldb+i];
|
c[j*ldc+i] = beta * b[j*ldb+i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -321,7 +321,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
|
|||||||
} else if (beta == 0.) {
|
} else if (beta == 0.) {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<n ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = alpha * a[i*lda+j];
|
c[j*ldc+i] = alpha * a[i*lda+j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -329,7 +329,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
|
|||||||
} else {
|
} else {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<n ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[j*ldb+i];
|
c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[j*ldb+i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -344,7 +344,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
|
|||||||
if (alpha == 0.) {
|
if (alpha == 0.) {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<n ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = beta * b[i*ldb+j];
|
c[j*ldc+i] = beta * b[i*ldb+j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -352,7 +352,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
|
|||||||
} else if (beta == 0.) {
|
} else if (beta == 0.) {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<n ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = alpha * a[i*lda+j];
|
c[j*ldc+i] = alpha * a[i*lda+j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -360,7 +360,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
|
|||||||
} else {
|
} else {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<n ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[i*ldb+j];
|
c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[i*ldb+j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -386,7 +386,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
|
|||||||
if (alpha == 0.) {
|
if (alpha == 0.) {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<n ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = beta * b[j*ldb+i];
|
c[j*ldc+i] = beta * b[j*ldb+i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -394,7 +394,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
|
|||||||
} else if (beta == 0.) {
|
} else if (beta == 0.) {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<n ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = alpha * a[j*lda+i];
|
c[j*ldc+i] = alpha * a[j*lda+i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -402,7 +402,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
|
|||||||
} else {
|
} else {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<n ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[j*ldb+i];
|
c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[j*ldb+i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -417,7 +417,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
|
|||||||
if (alpha == 0.) {
|
if (alpha == 0.) {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<n ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = beta * b[i*ldb+j];
|
c[j*ldc+i] = beta * b[i*ldb+j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -425,7 +425,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
|
|||||||
} else if (beta == 0.) {
|
} else if (beta == 0.) {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<n ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = alpha * a[j*lda+i];
|
c[j*ldc+i] = alpha * a[j*lda+i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -433,7 +433,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
|
|||||||
} else {
|
} else {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<n ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[i*ldb+j];
|
c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[i*ldb+j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -448,7 +448,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
|
|||||||
if (alpha == 0.) {
|
if (alpha == 0.) {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<n ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = beta * b[j*ldb+i];
|
c[j*ldc+i] = beta * b[j*ldb+i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -456,7 +456,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
|
|||||||
} else if (beta == 0.) {
|
} else if (beta == 0.) {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<n ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = alpha * a[i*lda+j];
|
c[j*ldc+i] = alpha * a[i*lda+j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -464,7 +464,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
|
|||||||
} else {
|
} else {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<n ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[j*ldb+i];
|
c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[j*ldb+i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -479,7 +479,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
|
|||||||
if (alpha == 0.) {
|
if (alpha == 0.) {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<n ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = beta * b[i*ldb+j];
|
c[j*ldc+i] = beta * b[i*ldb+j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -487,7 +487,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
|
|||||||
} else if (beta == 0.) {
|
} else if (beta == 0.) {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<n ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = alpha * a[i*lda+j];
|
c[j*ldc+i] = alpha * a[i*lda+j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -495,7 +495,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
|
|||||||
} else {
|
} else {
|
||||||
|
|
||||||
for (int64_t j=0 ; j<n ; ++j) {
|
for (int64_t j=0 ; j<n ; ++j) {
|
||||||
for (int64_t i=0 ; i<n ; ++i) {
|
for (int64_t i=0 ; i<m ; ++i) {
|
||||||
c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[i*ldb+j];
|
c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[i*ldb+j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user