mirror of
https://github.com/QuantumPackage/qp2.git
synced 2025-01-03 01:55:59 +01:00
Introducing GPU in CCSD
This commit is contained in:
parent
6c02ac0f0b
commit
fa6d141949
@ -10,9 +10,9 @@ subroutine run_ccsd_space_orb
|
||||
double precision :: uncorr_energy,energy, max_elem, max_r, max_r1, max_r2,ta,tb
|
||||
logical :: not_converged
|
||||
|
||||
double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:), tau(:,:,:,:), tau_x(:,:,:,:)
|
||||
double precision, allocatable :: t1(:,:), r1(:,:)
|
||||
double precision, pointer :: H_oo, H_vv, H_vo
|
||||
type(gpu_double4) :: t2, r2, tau, tau_x
|
||||
type(gpu_double2) :: t1, r1
|
||||
type(gpu_double2) :: H_oo, H_vv, H_vo
|
||||
|
||||
double precision, allocatable :: all_err(:,:), all_t(:,:)
|
||||
integer, allocatable :: list_occ(:), list_vir(:)
|
||||
@ -52,14 +52,15 @@ subroutine run_ccsd_space_orb
|
||||
!print*,'occ',list_occ
|
||||
!print*,'vir',list_vir
|
||||
|
||||
allocate(t2(nO,nO,nV,nV), r2(nO,nO,nV,nV))
|
||||
allocate(tau(nO,nO,nV,nV))
|
||||
allocate(tau_x(nO,nO,nV,nV))
|
||||
allocate(t1(nO,nV), r1(nO,nV))
|
||||
|
||||
call gpu_allocate_double(H_oo, (/ nO, nO /) )
|
||||
call gpu_allocate_double(H_vv, (/ nV, nV /) )
|
||||
call gpu_allocate_double(H_vo, (/ nV, nO /) )
|
||||
call gpu_allocate(t2, nO,nO,nV,nV)
|
||||
call gpu_allocate(r2, nO,nO,nV,nV)
|
||||
call gpu_allocate(tau, nO,nO,nV,nV)
|
||||
call gpu_allocate(tau_x, nO,nO,nV,nV)
|
||||
call gpu_allocate(t1, nO,nV)
|
||||
call gpu_allocate(r1, nO,nV)
|
||||
call gpu_allocate(H_oo, nO, nO)
|
||||
call gpu_allocate(H_vo, nV, nO)
|
||||
call gpu_allocate(H_vv, nV, nV)
|
||||
|
||||
if (cc_update_method == 'diis') then
|
||||
double precision :: rss, diis_mem, extra_mem
|
||||
@ -101,14 +102,21 @@ subroutine run_ccsd_space_orb
|
||||
endif
|
||||
|
||||
! Init
|
||||
call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,t1)
|
||||
call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,t2)
|
||||
call update_tau_space(nO,nV,t1,t2,tau)
|
||||
double precision, allocatable :: h_t1(:,:), h_t2(:,:,:,:)
|
||||
allocate(h_t1(nO,nV), h_t2(nO,nO,nV,nV))
|
||||
|
||||
call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,h_t1)
|
||||
call gpu_upload(h_t1, t1)
|
||||
|
||||
call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,h_t2)
|
||||
call gpu_upload(h_t2, t2)
|
||||
|
||||
call update_tau_space(nO,nV,h_t1,t1,t2,tau)
|
||||
call update_tau_x_space(nO,nV,tau,tau_x)
|
||||
!print*,'hf_energy', hf_energy
|
||||
call det_energy(det,uncorr_energy)
|
||||
print*,'Det energy', uncorr_energy
|
||||
call ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
|
||||
call ccsd_energy_space_x(nO,nV,tau_x%f,t1%f,energy)
|
||||
print*,'Guess energy', uncorr_energy+energy, energy
|
||||
|
||||
nb_iter = 0
|
||||
@ -127,40 +135,38 @@ subroutine run_ccsd_space_orb
|
||||
if (do_ao_cholesky) then
|
||||
! if (.False.) then
|
||||
call compute_H_oo_chol(nO,nV,tau_x,H_oo)
|
||||
call compute_H_vv_chol(nO,nV,tau_x,H_vv)
|
||||
call compute_H_vo_chol(nO,nV,t1,H_vo)
|
||||
call compute_H_vv_chol(nO,nV,tau_x%f,H_vv%f)
|
||||
call compute_H_vo_chol(nO,nV,t1%f,H_vo%f)
|
||||
|
||||
call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
|
||||
call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
|
||||
call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1)
|
||||
call compute_r2_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r2%f,max_r2)
|
||||
else
|
||||
call compute_H_oo(nO,nV,t1,t2,tau,H_oo)
|
||||
call compute_H_vv(nO,nV,t1,t2,tau,H_vv)
|
||||
call compute_H_vo(nO,nV,t1,t2,H_vo)
|
||||
call compute_H_oo(nO,nV,t1%f,t2%f,tau%f,H_oo%f)
|
||||
call compute_H_vv(nO,nV,t1%f,t2%f,tau%f,H_vv%f)
|
||||
call compute_H_vo(nO,nV,t1%f,t2%f,H_vo%f)
|
||||
|
||||
call compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
|
||||
call compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
|
||||
call compute_r1_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r1%f,max_r1)
|
||||
call compute_r2_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r2%f,max_r2)
|
||||
endif
|
||||
max_r = max(max_r1,max_r2)
|
||||
|
||||
! Update
|
||||
if (cc_update_method == 'diis') then
|
||||
!call update_t_ccsd(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2)
|
||||
!call update_t_ccsd_diis(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2)
|
||||
call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1,r2,t1,t2,all_err,all_t)
|
||||
call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1%f,r2%f,t1%f,t2%f,all_err,all_t)
|
||||
|
||||
! Standard update as T = T - Delta
|
||||
elseif (cc_update_method == 'none') then
|
||||
call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1,t1)
|
||||
call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2,t2)
|
||||
call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1%f,t1%f)
|
||||
call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2%f,t2%f)
|
||||
else
|
||||
print*,'Unkown cc_method_method: '//cc_update_method
|
||||
endif
|
||||
|
||||
call update_tau_space(nO,nV,t1,t2,tau)
|
||||
call update_tau_space(nO,nV,t1%f,t1,t2,tau)
|
||||
call update_tau_x_space(nO,nV,tau,tau_x)
|
||||
|
||||
! Energy
|
||||
call ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
|
||||
call ccsd_energy_space_x(nO,nV,tau_x%f,t1%f,energy)
|
||||
write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |'
|
||||
|
||||
nb_iter = nb_iter + 1
|
||||
@ -185,8 +191,8 @@ subroutine run_ccsd_space_orb
|
||||
print*,''
|
||||
|
||||
if (write_amplitudes) then
|
||||
call write_t1(nO,nV,t1)
|
||||
call write_t2(nO,nV,t2)
|
||||
call write_t1(nO,nV,t1%f)
|
||||
call write_t2(nO,nV,t2%f)
|
||||
call ezfio_set_utils_cc_io_amplitudes('Read')
|
||||
endif
|
||||
|
||||
@ -195,11 +201,14 @@ subroutine run_ccsd_space_orb
|
||||
deallocate(all_err,all_t)
|
||||
endif
|
||||
|
||||
call gpu_deallocate_double(H_oo)
|
||||
call gpu_deallocate_double(H_vv)
|
||||
call gpu_deallocate_double(H_vo)
|
||||
call gpu_deallocate(H_oo)
|
||||
call gpu_deallocate(H_vv)
|
||||
call gpu_deallocate(H_vo)
|
||||
|
||||
deallocate(r1,r2,tau)
|
||||
call gpu_deallocate(r1)
|
||||
call gpu_deallocate(r2)
|
||||
call gpu_deallocate(tau)
|
||||
call gpu_deallocate(tau_x)
|
||||
|
||||
! CCSD(T)
|
||||
double precision :: e_t, e_t_err
|
||||
@ -207,28 +216,14 @@ subroutine run_ccsd_space_orb
|
||||
|
||||
if (cc_par_t .and. elec_alpha_num + elec_beta_num > 2) then
|
||||
|
||||
! Dumb way
|
||||
!call wall_time(ta)
|
||||
!call ccsd_par_t_space(nO,nV,t1,t2,e_t)
|
||||
!call wall_time(tb)
|
||||
!print*,'Time: ',tb-ta, ' s'
|
||||
|
||||
!print*,''
|
||||
!write(*,'(A15,F18.12,A3)') ' E(CCSD(T)) = ', uncorr_energy + energy + e_t, ' Ha'
|
||||
!write(*,'(A15,F18.12,A3)') ' E(T) = ', e_t, ' Ha'
|
||||
!write(*,'(A15,F18.12,A3)') ' Correlation = ', energy + e_t, ' Ha'
|
||||
!print*,''
|
||||
|
||||
! New
|
||||
e_t = uncorr_energy + energy ! For print in (T) call
|
||||
e_t_err = 0.d0
|
||||
|
||||
print*,'Computing (T) correction...'
|
||||
call wall_time(ta)
|
||||
! call ccsd_par_t_space_v3(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v &
|
||||
! ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t)
|
||||
|
||||
call ccsd_par_t_space_stoch(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v &
|
||||
call ccsd_par_t_space_stoch(nO,nV,t1%f,t2%f,cc_space_f_o,cc_space_f_v &
|
||||
,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t, e_t_err)
|
||||
|
||||
call wall_time(tb)
|
||||
@ -243,7 +238,9 @@ subroutine run_ccsd_space_orb
|
||||
|
||||
call save_energy(uncorr_energy + energy, e_t)
|
||||
|
||||
deallocate(t1,t2)
|
||||
deallocate(h_t1, h_t2)
|
||||
call gpu_deallocate(t1)
|
||||
call gpu_deallocate(t2)
|
||||
|
||||
end
|
||||
|
||||
@ -341,70 +338,139 @@ end
|
||||
|
||||
! Tau
|
||||
|
||||
subroutine update_tau_space(nO,nV,t1,t2,tau)
|
||||
|
||||
subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau)
|
||||
use gpu
|
||||
implicit none
|
||||
|
||||
! in
|
||||
integer, intent(in) :: nO, nV
|
||||
double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV)
|
||||
double precision, intent(in) :: h_t1(nO,nV)
|
||||
type(gpu_double2), intent(in) :: t1
|
||||
type(gpu_double4), intent(in) :: t2
|
||||
|
||||
! out
|
||||
double precision, intent(out) :: tau(nO,nO,nV,nV)
|
||||
type(gpu_double4) :: tau
|
||||
|
||||
! internal
|
||||
integer :: i,j,a,b
|
||||
|
||||
! !$OMP PARALLEL &
|
||||
! !$OMP SHARED(nO,nV,tau,t2,t1,h_t1) &
|
||||
! !$OMP PRIVATE(i,j,a,b) &
|
||||
! !$OMP DEFAULT(NONE)
|
||||
! !$OMP DO
|
||||
! do b = 1, nV
|
||||
! do a = 1, nV
|
||||
! do j = 1, nO
|
||||
! do i = 1, nO
|
||||
! tau%f(i,j,a,b) = t2%f(i,j,a,b) + t1%f(i,a) * h_t1(j,b)
|
||||
! enddo
|
||||
! enddo
|
||||
! enddo
|
||||
! enddo
|
||||
! !$OMP END DO
|
||||
! !$OMP END PARALLEL
|
||||
|
||||
|
||||
type(gpu_blas) :: blas
|
||||
type(gpu_stream) :: stream(nV)
|
||||
|
||||
call gpu_blas_create(blas)
|
||||
do b=1,nV
|
||||
call gpu_stream_create(stream(b))
|
||||
enddo
|
||||
|
||||
!$OMP PARALLEL &
|
||||
!$OMP SHARED(nO,nV,tau,t2,t1) &
|
||||
!$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas) &
|
||||
!$OMP PRIVATE(i,j,a,b) &
|
||||
!$OMP DEFAULT(NONE)
|
||||
do j=1,nO
|
||||
!$OMP DO
|
||||
do b=1,nV
|
||||
do a = 1, nV
|
||||
do j = 1, nO
|
||||
do i = 1, nO
|
||||
tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b)
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
call gpu_set_stream(blas,stream(b))
|
||||
call gpu_dgeam_c(blas%c, 'N', 'N', nO*1_8, nV*1_8, &
|
||||
1.d0, c_loc(t2%f(1,j,1,b)), nO*nO*1_8, &
|
||||
h_t1(j,b), t1%c, nO*1_8, &
|
||||
c_loc(tau%f(1,j,1,b)), nO*nO*1_8)
|
||||
enddo
|
||||
!$OMP END DO
|
||||
enddo
|
||||
!$OMP END PARALLEL
|
||||
|
||||
call gpu_synchronize()
|
||||
|
||||
do b=1,nV
|
||||
call gpu_stream_destroy(stream(b))
|
||||
enddo
|
||||
|
||||
call gpu_blas_destroy(blas)
|
||||
|
||||
end
|
||||
|
||||
subroutine update_tau_x_space(nO,nV,tau,tau_x)
|
||||
|
||||
use gpu
|
||||
implicit none
|
||||
|
||||
! in
|
||||
integer, intent(in) :: nO, nV
|
||||
double precision, intent(in) :: tau(nO,nO,nV,nV)
|
||||
type(gpu_double4), intent(in) :: tau
|
||||
|
||||
! out
|
||||
double precision, intent(out) :: tau_x(nO,nO,nV,nV)
|
||||
type(gpu_double4) :: tau_x
|
||||
|
||||
! internal
|
||||
integer :: i,j,a,b
|
||||
|
||||
! !$OMP PARALLEL &
|
||||
! !$OMP SHARED(nO,nV,tau,tau_x) &
|
||||
! !$OMP PRIVATE(i,j,a,b) &
|
||||
! !$OMP DEFAULT(NONE)
|
||||
! !$OMP DO
|
||||
! do b = 1, nV
|
||||
! do a = 1, nV
|
||||
! do j = 1, nO
|
||||
! do i = 1, nO
|
||||
! tau_x%f(i,j,a,b) = 2.d0*tau%f(i,j,a,b) - tau%f(i,j,b,a)
|
||||
! enddo
|
||||
! enddo
|
||||
! enddo
|
||||
! enddo
|
||||
! !$OMP END DO
|
||||
! !$OMP END PARALLEL
|
||||
|
||||
type(gpu_blas) :: blas
|
||||
type(gpu_stream) :: stream(nV)
|
||||
|
||||
call gpu_blas_create(blas)
|
||||
do a=1,nV
|
||||
call gpu_stream_create(stream(a))
|
||||
enddo
|
||||
|
||||
!$OMP PARALLEL &
|
||||
!$OMP SHARED(nO,nV,tau,tau_x) &
|
||||
!$OMP SHARED(nO,nV,tau,tau_x,stream,blas) &
|
||||
!$OMP PRIVATE(i,j,a,b) &
|
||||
!$OMP DEFAULT(NONE)
|
||||
!$OMP DO
|
||||
do b=1,nV
|
||||
do a=1,nV
|
||||
do j = 1, nO
|
||||
do i = 1, nO
|
||||
tau_x(i,j,a,b) = 2.d0*tau(i,j,a,b) - tau(i,j,b,a)
|
||||
enddo
|
||||
enddo
|
||||
call gpu_set_stream(blas,stream(a))
|
||||
call gpu_dgeam_c(blas%c, 'N', 'N', nO*1_8, nO*1_8, &
|
||||
2.d0, c_loc(tau%f(1,1,a,b)), nO*1_8, &
|
||||
-1.d0, c_loc(tau%f(1,1,b,a)), nO*1_8, &
|
||||
c_loc(tau_x%f(1,1,a,b)), nO*1_8)
|
||||
enddo
|
||||
enddo
|
||||
!$OMP END DO
|
||||
!$OMP END PARALLEL
|
||||
|
||||
call gpu_synchronize()
|
||||
|
||||
do b=1,nV
|
||||
call gpu_stream_destroy(stream(b))
|
||||
enddo
|
||||
|
||||
call gpu_blas_destroy(blas)
|
||||
|
||||
end
|
||||
|
||||
! R1
|
||||
|
@ -294,12 +294,12 @@ end
|
||||
! H_oo
|
||||
|
||||
subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo)
|
||||
|
||||
use gpu
|
||||
implicit none
|
||||
|
||||
integer, intent(in) :: nO,nV
|
||||
double precision, intent(in) :: tau_x(nO, nO, nV, nV)
|
||||
double precision, intent(out) :: H_oo(nO, nO)
|
||||
type(gpu_double4), intent(in) :: tau_x
|
||||
type(gpu_double2), intent(out) :: H_oo
|
||||
|
||||
integer :: a,b,i,j,u,k
|
||||
|
||||
@ -315,7 +315,7 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo)
|
||||
do b=1,nV
|
||||
do j=1,nO
|
||||
do a=1,nV
|
||||
tmp_vov(a,j,b) = tau_x(u,j,a,b)
|
||||
tmp_vov(a,j,b) = tau_x%f(u,j,a,b)
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
@ -328,7 +328,7 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo)
|
||||
!$omp do
|
||||
do i = 1, nO
|
||||
do u = 1, nO
|
||||
H_oo(u,i) = cc_space_f_oo(u,i)
|
||||
H_oo%f(u,i) = cc_space_f_oo(u,i)
|
||||
enddo
|
||||
enddo
|
||||
!$omp end do nowait
|
||||
@ -336,7 +336,7 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo)
|
||||
!$omp end parallel
|
||||
call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
|
||||
tau_kau, cholesky_mo_num*nV, cc_space_v_vo_chol, cholesky_mo_num*nV, &
|
||||
1.d0, H_oo, nO)
|
||||
1.d0, H_oo%f, nO)
|
||||
|
||||
end
|
||||
|
||||
|
@ -2,6 +2,52 @@ module gpu
|
||||
use, intrinsic :: iso_c_binding
|
||||
implicit none
|
||||
|
||||
! Data types
|
||||
! ----------
|
||||
|
||||
type gpu_double1
|
||||
type(c_ptr) :: c
|
||||
double precision, pointer :: f(:)
|
||||
end type
|
||||
|
||||
type gpu_double2
|
||||
type(c_ptr) :: c
|
||||
double precision, pointer :: f(:,:)
|
||||
end type
|
||||
|
||||
type gpu_double3
|
||||
type(c_ptr) :: c
|
||||
double precision, pointer :: f(:,:,:)
|
||||
end type
|
||||
|
||||
type gpu_double4
|
||||
type(c_ptr) :: c
|
||||
double precision, pointer :: f(:,:,:,:)
|
||||
end type
|
||||
|
||||
type gpu_double5
|
||||
type(c_ptr) :: c
|
||||
double precision, pointer :: f(:,:,:,:,:)
|
||||
end type
|
||||
|
||||
type gpu_double6
|
||||
type(c_ptr) :: c
|
||||
double precision, pointer :: f(:,:,:,:,:,:)
|
||||
end type
|
||||
|
||||
|
||||
type gpu_blas
|
||||
type(c_ptr) :: c
|
||||
end type
|
||||
|
||||
type gpu_stream
|
||||
type(c_ptr) :: c
|
||||
end type
|
||||
|
||||
|
||||
! C interfaces
|
||||
! ------------
|
||||
|
||||
interface
|
||||
integer function gpu_ndevices() bind(C)
|
||||
end function
|
||||
@ -43,100 +89,394 @@ module gpu
|
||||
integer(c_int64_t), value :: n
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_stream_create(stream) bind(C)
|
||||
subroutine gpu_stream_create_c(stream) bind(C, name='gpu_stream_create')
|
||||
import
|
||||
type(c_ptr) :: stream
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_stream_destroy(stream) bind(C)
|
||||
subroutine gpu_stream_destroy_c(stream) bind(C, name='gpu_stream_destroy')
|
||||
import
|
||||
type(c_ptr) :: stream
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_set_stream(handle, stream) bind(C)
|
||||
subroutine gpu_set_stream_c(handle, stream) bind(C, name='gpu_set_stream')
|
||||
import
|
||||
type(c_ptr) :: handle, stream
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_synchronize()
|
||||
subroutine gpu_synchronize() bind(C)
|
||||
import
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_blas_create(handle) bind(C)
|
||||
subroutine gpu_blas_create_c(handle) bind(C, name='gpu_blas_create')
|
||||
import
|
||||
type(c_ptr) :: handle
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_blas_destroy(handle) bind(C)
|
||||
subroutine gpu_blas_destroy_c(handle) bind(C, name='gpu_blas_destroy')
|
||||
import
|
||||
type(c_ptr) :: handle
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) bind(C)
|
||||
subroutine gpu_ddot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_ddot')
|
||||
import
|
||||
type(c_ptr), intent(in) :: handle
|
||||
type(c_ptr), intent(in), value :: handle
|
||||
integer(c_int64_t), value :: n, incx, incy
|
||||
real(c_double), intent(in) :: dx(*), dy(*)
|
||||
type(c_ptr), intent(in), value :: dx, dy
|
||||
real(c_double), intent(out) :: res
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res) bind(C)
|
||||
subroutine gpu_sdot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_sdot')
|
||||
import
|
||||
type(c_ptr), intent(in) :: handle
|
||||
type(c_ptr), intent(in), value :: handle
|
||||
integer(c_int64_t), value :: n, incx, incy
|
||||
real(c_float), intent(in) :: dx(*), dy(*)
|
||||
type(c_ptr), intent(in), value :: dx, dy
|
||||
real(c_float), intent(out) :: res
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
|
||||
b, ldb, c, ldc) bind(C, name='gpu_dgeam')
|
||||
import
|
||||
type(c_ptr), intent(in), value :: handle
|
||||
character(c_char), intent(in), value :: transa, transb
|
||||
integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
|
||||
real(c_double), intent(in), value :: alpha, beta
|
||||
type(c_ptr), value :: a, b, c
|
||||
end subroutine
|
||||
|
||||
end interface
|
||||
|
||||
|
||||
! Polymorphic interfaces
|
||||
! ----------------------
|
||||
|
||||
interface gpu_allocate
|
||||
procedure gpu_allocate_double1 &
|
||||
,gpu_allocate_double2 &
|
||||
,gpu_allocate_double3 &
|
||||
,gpu_allocate_double4 &
|
||||
,gpu_allocate_double5 &
|
||||
,gpu_allocate_double6
|
||||
end interface gpu_allocate
|
||||
|
||||
interface gpu_deallocate
|
||||
procedure gpu_deallocate_double1 &
|
||||
,gpu_deallocate_double2 &
|
||||
,gpu_deallocate_double3 &
|
||||
,gpu_deallocate_double4 &
|
||||
,gpu_deallocate_double5 &
|
||||
,gpu_deallocate_double6
|
||||
end interface gpu_deallocate
|
||||
|
||||
interface gpu_upload
|
||||
procedure gpu_upload_double1 &
|
||||
,gpu_upload_double2 &
|
||||
,gpu_upload_double3 &
|
||||
,gpu_upload_double4 &
|
||||
,gpu_upload_double5 &
|
||||
,gpu_upload_double6
|
||||
end interface gpu_upload
|
||||
|
||||
interface gpu_download
|
||||
procedure gpu_download_double1 &
|
||||
,gpu_download_double2 &
|
||||
,gpu_download_double3 &
|
||||
,gpu_download_double4 &
|
||||
,gpu_download_double5 &
|
||||
,gpu_download_double6
|
||||
end interface gpu_download
|
||||
|
||||
interface gpu_copy
|
||||
procedure gpu_copy_double1 &
|
||||
,gpu_copy_double2 &
|
||||
,gpu_copy_double3 &
|
||||
,gpu_copy_double4 &
|
||||
,gpu_copy_double5 &
|
||||
,gpu_copy_double6
|
||||
end interface gpu_copy
|
||||
|
||||
|
||||
contains
|
||||
|
||||
|
||||
subroutine gpu_allocate_double(ptr, s)
|
||||
implicit none
|
||||
double precision, pointer, intent(inout) :: ptr
|
||||
integer, intent(in) :: s(:)
|
||||
type(c_ptr) :: cptr
|
||||
! gpu_allocate
|
||||
! ------------
|
||||
|
||||
call gpu_allocate_c(cptr, sum(s*1_8)*8_8)
|
||||
call c_f_pointer(cptr, ptr, s)
|
||||
subroutine gpu_allocate_double1(ptr, s)
|
||||
implicit none
|
||||
type(gpu_double1), intent(inout) :: ptr
|
||||
integer, intent(in) :: s
|
||||
|
||||
call gpu_allocate_c(ptr%c, s*8_8)
|
||||
call c_f_pointer(ptr%c, ptr%f, (/ s /))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_deallocate_double(ptr)
|
||||
subroutine gpu_allocate_double2(ptr, s1, s2)
|
||||
implicit none
|
||||
double precision, pointer, intent(inout) :: ptr
|
||||
type(c_ptr) :: cptr
|
||||
cptr = c_loc(ptr)
|
||||
call gpu_deallocate_c(cptr)
|
||||
NULLIFY(ptr)
|
||||
type(gpu_double2), intent(inout) :: ptr
|
||||
integer, intent(in) :: s1, s2
|
||||
|
||||
call gpu_allocate_c(ptr%c, s1*s2*8_8)
|
||||
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_allocate_double3(ptr, s1, s2, s3)
|
||||
implicit none
|
||||
type(gpu_double3), intent(inout) :: ptr
|
||||
integer, intent(in) :: s1, s2, s3
|
||||
|
||||
call gpu_allocate_c(ptr%c, s1*s2*s3*8_8)
|
||||
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_allocate_double4(ptr, s1, s2, s3, s4)
|
||||
implicit none
|
||||
type(gpu_double4), intent(inout) :: ptr
|
||||
integer, intent(in) :: s1, s2, s3, s4
|
||||
|
||||
call gpu_allocate_c(ptr%c, s1*s2*s3*s4*8_8)
|
||||
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_allocate_double5(ptr, s1, s2, s3, s4, s5)
|
||||
implicit none
|
||||
type(gpu_double5), intent(inout) :: ptr
|
||||
integer, intent(in) :: s1, s2, s3, s4, s5
|
||||
|
||||
call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*8_8)
|
||||
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_allocate_double6(ptr, s1, s2, s3, s4, s5, s6)
|
||||
implicit none
|
||||
type(gpu_double6), intent(inout) :: ptr
|
||||
integer, intent(in) :: s1, s2, s3, s4, s5, s6
|
||||
|
||||
call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*s6*8_8)
|
||||
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /))
|
||||
end subroutine
|
||||
|
||||
|
||||
! gpu_deallocate
|
||||
! --------------
|
||||
|
||||
subroutine gpu_deallocate_double1(ptr)
|
||||
implicit none
|
||||
type(gpu_double1), intent(inout) :: ptr
|
||||
call gpu_deallocate_c(ptr%c)
|
||||
NULLIFY(ptr%f)
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_deallocate_double2(ptr)
|
||||
implicit none
|
||||
type(gpu_double2), intent(inout) :: ptr
|
||||
call gpu_deallocate_c(ptr%c)
|
||||
NULLIFY(ptr%f)
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_deallocate_double3(ptr)
|
||||
implicit none
|
||||
type(gpu_double3), intent(inout) :: ptr
|
||||
call gpu_deallocate_c(ptr%c)
|
||||
NULLIFY(ptr%f)
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_deallocate_double4(ptr)
|
||||
implicit none
|
||||
type(gpu_double4), intent(inout) :: ptr
|
||||
call gpu_deallocate_c(ptr%c)
|
||||
NULLIFY(ptr%f)
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_deallocate_double5(ptr)
|
||||
implicit none
|
||||
type(gpu_double5), intent(inout) :: ptr
|
||||
call gpu_deallocate_c(ptr%c)
|
||||
NULLIFY(ptr%f)
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_deallocate_double6(ptr)
|
||||
implicit none
|
||||
type(gpu_double6), intent(inout) :: ptr
|
||||
call gpu_deallocate_c(ptr%c)
|
||||
NULLIFY(ptr%f)
|
||||
end subroutine
|
||||
|
||||
|
||||
! gpu_upload
|
||||
! ----------
|
||||
|
||||
subroutine gpu_upload_double1(cpu_ptr, gpu_ptr)
|
||||
implicit none
|
||||
double precision, intent(in) :: cpu_ptr(:)
|
||||
type(gpu_double1), intent(in) :: gpu_ptr
|
||||
call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, 8_8*size(gpu_ptr%f))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_upload_double2(cpu_ptr, gpu_ptr)
|
||||
implicit none
|
||||
double precision, intent(in) :: cpu_ptr(:,:)
|
||||
type(gpu_double2), intent(in) :: gpu_ptr
|
||||
call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_upload_double3(cpu_ptr, gpu_ptr)
|
||||
implicit none
|
||||
double precision, intent(in) :: cpu_ptr(:,:,:)
|
||||
type(gpu_double3), intent(in) :: gpu_ptr
|
||||
call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_upload_double4(cpu_ptr, gpu_ptr)
|
||||
implicit none
|
||||
double precision, intent(in) :: cpu_ptr(:,:,:,:)
|
||||
type(gpu_double4), intent(in) :: gpu_ptr
|
||||
call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_upload_double5(cpu_ptr, gpu_ptr)
|
||||
implicit none
|
||||
double precision, intent(in) :: cpu_ptr(:,:,:,:,:)
|
||||
type(gpu_double5), intent(in) :: gpu_ptr
|
||||
call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_upload_double6(cpu_ptr, gpu_ptr)
|
||||
implicit none
|
||||
double precision, intent(in) :: cpu_ptr(:,:,:,:,:,:)
|
||||
type(gpu_double6), intent(in) :: gpu_ptr
|
||||
call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
|
||||
end subroutine
|
||||
|
||||
|
||||
! gpu_download
|
||||
! ------------
|
||||
|
||||
subroutine gpu_download_double1(gpu_ptr, cpu_ptr)
|
||||
implicit none
|
||||
type(gpu_double1), intent(in) :: gpu_ptr
|
||||
double precision, intent(in) :: cpu_ptr(:)
|
||||
call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*size(gpu_ptr%f))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_download_double2(gpu_ptr, cpu_ptr)
|
||||
implicit none
|
||||
type(gpu_double2), intent(in) :: gpu_ptr
|
||||
double precision, intent(in) :: cpu_ptr(:,:)
|
||||
call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_download_double3(gpu_ptr, cpu_ptr)
|
||||
implicit none
|
||||
type(gpu_double3), intent(in) :: gpu_ptr
|
||||
double precision, intent(in) :: cpu_ptr(:,:,:)
|
||||
call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_download_double4(gpu_ptr, cpu_ptr)
|
||||
implicit none
|
||||
type(gpu_double4), intent(in) :: gpu_ptr
|
||||
double precision, intent(in) :: cpu_ptr(:,:,:,:)
|
||||
call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_download_double5(gpu_ptr, cpu_ptr)
|
||||
implicit none
|
||||
type(gpu_double5), intent(in) :: gpu_ptr
|
||||
double precision, intent(in) :: cpu_ptr(:,:,:,:,:)
|
||||
call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_download_double6(gpu_ptr, cpu_ptr)
|
||||
implicit none
|
||||
type(gpu_double6), intent(in) :: gpu_ptr
|
||||
double precision, intent(in) :: cpu_ptr(:,:,:,:,:,:)
|
||||
call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
|
||||
end subroutine
|
||||
|
||||
! gpu_copy
|
||||
! --------
|
||||
|
||||
subroutine gpu_copy_double1(gpu_ptr_src, gpu_ptr_dest)
|
||||
implicit none
|
||||
type(gpu_double1), intent(in) :: gpu_ptr_src
|
||||
type(gpu_double1), intent(in) :: gpu_ptr_dest
|
||||
call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*size(gpu_ptr_dest%f))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_copy_double2(gpu_ptr_src, gpu_ptr_dest)
|
||||
implicit none
|
||||
type(gpu_double2), intent(in) :: gpu_ptr_src
|
||||
type(gpu_double2), intent(in) :: gpu_ptr_dest
|
||||
call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_copy_double3(gpu_ptr_src, gpu_ptr_dest)
|
||||
implicit none
|
||||
type(gpu_double3), intent(in) :: gpu_ptr_src
|
||||
type(gpu_double3), intent(in) :: gpu_ptr_dest
|
||||
call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_copy_double4(gpu_ptr_src, gpu_ptr_dest)
|
||||
implicit none
|
||||
type(gpu_double4), intent(in) :: gpu_ptr_src
|
||||
type(gpu_double4), intent(in) :: gpu_ptr_dest
|
||||
call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_copy_double5(gpu_ptr_src, gpu_ptr_dest)
|
||||
implicit none
|
||||
type(gpu_double5), intent(in) :: gpu_ptr_src
|
||||
type(gpu_double5), intent(in) :: gpu_ptr_dest
|
||||
call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_copy_double6(gpu_ptr_src, gpu_ptr_dest)
|
||||
implicit none
|
||||
type(gpu_double6), intent(in) :: gpu_ptr_src
|
||||
type(gpu_double6), intent(in) :: gpu_ptr_dest
|
||||
call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
|
||||
end subroutine
|
||||
|
||||
|
||||
! gpu_stream
|
||||
! ----------
|
||||
|
||||
subroutine gpu_stream_create(stream)
|
||||
import
|
||||
type(gpu_stream) :: stream
|
||||
call gpu_stream_create_c(stream%c)
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_stream_destroy(stream)
|
||||
import
|
||||
type(gpu_stream) :: stream
|
||||
call gpu_stream_destroy_c(stream%c)
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_set_stream(handle, stream)
|
||||
import
|
||||
type(gpu_blas) :: handle
|
||||
type(gpu_stream) :: stream
|
||||
call gpu_set_stream_c(handle%c, stream%c)
|
||||
end subroutine
|
||||
|
||||
|
||||
! gpu_blas
|
||||
! --------
|
||||
|
||||
subroutine gpu_blas_create(handle)
|
||||
import
|
||||
type(gpu_blas) :: handle
|
||||
call gpu_blas_create_c(handle%c)
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_blas_destroy(handle)
|
||||
import
|
||||
type(gpu_blas) :: handle
|
||||
call gpu_blas_destroy_c(handle%c)
|
||||
end subroutine
|
||||
|
||||
end module
|
||||
|
||||
subroutine gpu_upload_double(cpu_ptr, gpu_ptr, n)
|
||||
use gpu
|
||||
implicit none
|
||||
double precision, intent(in) :: cpu_ptr(*)
|
||||
double precision, intent(in) :: gpu_ptr(*)
|
||||
integer(c_int64_t), intent(in) :: n
|
||||
call gpu_upload_c(c_loc(cpu_ptr), c_loc(gpu_ptr), 8_8*n)
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_download_double(gpu_ptr, cpu_ptr, n)
|
||||
use gpu
|
||||
implicit none
|
||||
double precision, intent(in) :: gpu_ptr(*)
|
||||
double precision, intent(in) :: cpu_ptr(*)
|
||||
integer(c_int64_t), intent(in) :: n
|
||||
call gpu_download_c(c_loc(gpu_ptr), c_loc(cpu_ptr), 8_8*n)
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_copy_double(gpu_ptr_src, gpu_ptr_dest, n)
|
||||
use gpu
|
||||
implicit none
|
||||
double precision, intent(in) :: gpu_ptr_src(*)
|
||||
double precision, intent(in) :: gpu_ptr_dest(*)
|
||||
integer(c_int64_t), intent(in) :: n
|
||||
call gpu_copy_c(c_loc(gpu_ptr_src), c_loc(gpu_ptr_dest), 8_8*n)
|
||||
end subroutine
|
||||
|
||||
|
@ -251,7 +251,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
|
||||
if (alpha == 0.) {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = beta * b[j*ldb+i];
|
||||
}
|
||||
}
|
||||
@ -259,7 +259,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
|
||||
} else if (beta == 0.) {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = alpha * a[j*lda+i];
|
||||
}
|
||||
}
|
||||
@ -267,7 +267,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
|
||||
} else {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[j*ldb+i];
|
||||
}
|
||||
}
|
||||
@ -282,7 +282,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
|
||||
if (alpha == 0.) {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = beta * b[i*ldb+j];
|
||||
}
|
||||
}
|
||||
@ -290,7 +290,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
|
||||
} else if (beta == 0.) {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = alpha * a[j*lda+i];
|
||||
}
|
||||
}
|
||||
@ -298,7 +298,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
|
||||
} else {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[i*ldb+j];
|
||||
}
|
||||
}
|
||||
@ -313,7 +313,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
|
||||
if (alpha == 0.) {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = beta * b[j*ldb+i];
|
||||
}
|
||||
}
|
||||
@ -321,7 +321,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
|
||||
} else if (beta == 0.) {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = alpha * a[i*lda+j];
|
||||
}
|
||||
}
|
||||
@ -329,7 +329,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
|
||||
} else {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[j*ldb+i];
|
||||
}
|
||||
}
|
||||
@ -344,7 +344,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
|
||||
if (alpha == 0.) {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = beta * b[i*ldb+j];
|
||||
}
|
||||
}
|
||||
@ -352,7 +352,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
|
||||
} else if (beta == 0.) {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = alpha * a[i*lda+j];
|
||||
}
|
||||
}
|
||||
@ -360,7 +360,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
|
||||
} else {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[i*ldb+j];
|
||||
}
|
||||
}
|
||||
@ -386,7 +386,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
|
||||
if (alpha == 0.) {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = beta * b[j*ldb+i];
|
||||
}
|
||||
}
|
||||
@ -394,7 +394,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
|
||||
} else if (beta == 0.) {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = alpha * a[j*lda+i];
|
||||
}
|
||||
}
|
||||
@ -402,7 +402,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
|
||||
} else {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[j*ldb+i];
|
||||
}
|
||||
}
|
||||
@ -417,7 +417,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
|
||||
if (alpha == 0.) {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = beta * b[i*ldb+j];
|
||||
}
|
||||
}
|
||||
@ -425,7 +425,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
|
||||
} else if (beta == 0.) {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = alpha * a[j*lda+i];
|
||||
}
|
||||
}
|
||||
@ -433,7 +433,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
|
||||
} else {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[i*ldb+j];
|
||||
}
|
||||
}
|
||||
@ -448,7 +448,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
|
||||
if (alpha == 0.) {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = beta * b[j*ldb+i];
|
||||
}
|
||||
}
|
||||
@ -456,7 +456,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
|
||||
} else if (beta == 0.) {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = alpha * a[i*lda+j];
|
||||
}
|
||||
}
|
||||
@ -464,7 +464,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
|
||||
} else {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[j*ldb+i];
|
||||
}
|
||||
}
|
||||
@ -479,7 +479,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
|
||||
if (alpha == 0.) {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = beta * b[i*ldb+j];
|
||||
}
|
||||
}
|
||||
@ -487,7 +487,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
|
||||
} else if (beta == 0.) {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = alpha * a[i*lda+j];
|
||||
}
|
||||
}
|
||||
@ -495,7 +495,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
|
||||
} else {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[i*ldb+j];
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user