10
0
mirror of https://github.com/QuantumPackage/qp2.git synced 2025-01-05 02:48:49 +01:00

Introducing GPU in CCSD

This commit is contained in:
Anthony Scemama 2024-06-27 15:45:52 +02:00
parent 6c02ac0f0b
commit fa6d141949
4 changed files with 570 additions and 164 deletions

View File

@ -10,9 +10,9 @@ subroutine run_ccsd_space_orb
double precision :: uncorr_energy,energy, max_elem, max_r, max_r1, max_r2,ta,tb double precision :: uncorr_energy,energy, max_elem, max_r, max_r1, max_r2,ta,tb
logical :: not_converged logical :: not_converged
double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:), tau(:,:,:,:), tau_x(:,:,:,:) type(gpu_double4) :: t2, r2, tau, tau_x
double precision, allocatable :: t1(:,:), r1(:,:) type(gpu_double2) :: t1, r1
double precision, pointer :: H_oo, H_vv, H_vo type(gpu_double2) :: H_oo, H_vv, H_vo
double precision, allocatable :: all_err(:,:), all_t(:,:) double precision, allocatable :: all_err(:,:), all_t(:,:)
integer, allocatable :: list_occ(:), list_vir(:) integer, allocatable :: list_occ(:), list_vir(:)
@ -52,14 +52,15 @@ subroutine run_ccsd_space_orb
!print*,'occ',list_occ !print*,'occ',list_occ
!print*,'vir',list_vir !print*,'vir',list_vir
allocate(t2(nO,nO,nV,nV), r2(nO,nO,nV,nV)) call gpu_allocate(t2, nO,nO,nV,nV)
allocate(tau(nO,nO,nV,nV)) call gpu_allocate(r2, nO,nO,nV,nV)
allocate(tau_x(nO,nO,nV,nV)) call gpu_allocate(tau, nO,nO,nV,nV)
allocate(t1(nO,nV), r1(nO,nV)) call gpu_allocate(tau_x, nO,nO,nV,nV)
call gpu_allocate(t1, nO,nV)
call gpu_allocate_double(H_oo, (/ nO, nO /) ) call gpu_allocate(r1, nO,nV)
call gpu_allocate_double(H_vv, (/ nV, nV /) ) call gpu_allocate(H_oo, nO, nO)
call gpu_allocate_double(H_vo, (/ nV, nO /) ) call gpu_allocate(H_vo, nV, nO)
call gpu_allocate(H_vv, nV, nV)
if (cc_update_method == 'diis') then if (cc_update_method == 'diis') then
double precision :: rss, diis_mem, extra_mem double precision :: rss, diis_mem, extra_mem
@ -101,14 +102,21 @@ subroutine run_ccsd_space_orb
endif endif
! Init ! Init
call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,t1) double precision, allocatable :: h_t1(:,:), h_t2(:,:,:,:)
call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,t2) allocate(h_t1(nO,nV), h_t2(nO,nO,nV,nV))
call update_tau_space(nO,nV,t1,t2,tau)
call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,h_t1)
call gpu_upload(h_t1, t1)
call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,h_t2)
call gpu_upload(h_t2, t2)
call update_tau_space(nO,nV,h_t1,t1,t2,tau)
call update_tau_x_space(nO,nV,tau,tau_x) call update_tau_x_space(nO,nV,tau,tau_x)
!print*,'hf_energy', hf_energy !print*,'hf_energy', hf_energy
call det_energy(det,uncorr_energy) call det_energy(det,uncorr_energy)
print*,'Det energy', uncorr_energy print*,'Det energy', uncorr_energy
call ccsd_energy_space_x(nO,nV,tau_x,t1,energy) call ccsd_energy_space_x(nO,nV,tau_x%f,t1%f,energy)
print*,'Guess energy', uncorr_energy+energy, energy print*,'Guess energy', uncorr_energy+energy, energy
nb_iter = 0 nb_iter = 0
@ -127,40 +135,38 @@ subroutine run_ccsd_space_orb
if (do_ao_cholesky) then if (do_ao_cholesky) then
! if (.False.) then ! if (.False.) then
call compute_H_oo_chol(nO,nV,tau_x,H_oo) call compute_H_oo_chol(nO,nV,tau_x,H_oo)
call compute_H_vv_chol(nO,nV,tau_x,H_vv) call compute_H_vv_chol(nO,nV,tau_x%f,H_vv%f)
call compute_H_vo_chol(nO,nV,t1,H_vo) call compute_H_vo_chol(nO,nV,t1%f,H_vo%f)
call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1)
call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) call compute_r2_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r2%f,max_r2)
else else
call compute_H_oo(nO,nV,t1,t2,tau,H_oo) call compute_H_oo(nO,nV,t1%f,t2%f,tau%f,H_oo%f)
call compute_H_vv(nO,nV,t1,t2,tau,H_vv) call compute_H_vv(nO,nV,t1%f,t2%f,tau%f,H_vv%f)
call compute_H_vo(nO,nV,t1,t2,H_vo) call compute_H_vo(nO,nV,t1%f,t2%f,H_vo%f)
call compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) call compute_r1_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r1%f,max_r1)
call compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) call compute_r2_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r2%f,max_r2)
endif endif
max_r = max(max_r1,max_r2) max_r = max(max_r1,max_r2)
! Update ! Update
if (cc_update_method == 'diis') then if (cc_update_method == 'diis') then
!call update_t_ccsd(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2) call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1%f,r2%f,t1%f,t2%f,all_err,all_t)
!call update_t_ccsd_diis(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2)
call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1,r2,t1,t2,all_err,all_t)
! Standard update as T = T - Delta ! Standard update as T = T - Delta
elseif (cc_update_method == 'none') then elseif (cc_update_method == 'none') then
call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1,t1) call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1%f,t1%f)
call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2,t2) call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2%f,t2%f)
else else
print*,'Unkown cc_method_method: '//cc_update_method print*,'Unkown cc_method_method: '//cc_update_method
endif endif
call update_tau_space(nO,nV,t1,t2,tau) call update_tau_space(nO,nV,t1%f,t1,t2,tau)
call update_tau_x_space(nO,nV,tau,tau_x) call update_tau_x_space(nO,nV,tau,tau_x)
! Energy ! Energy
call ccsd_energy_space_x(nO,nV,tau_x,t1,energy) call ccsd_energy_space_x(nO,nV,tau_x%f,t1%f,energy)
write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |' write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |'
nb_iter = nb_iter + 1 nb_iter = nb_iter + 1
@ -185,8 +191,8 @@ subroutine run_ccsd_space_orb
print*,'' print*,''
if (write_amplitudes) then if (write_amplitudes) then
call write_t1(nO,nV,t1) call write_t1(nO,nV,t1%f)
call write_t2(nO,nV,t2) call write_t2(nO,nV,t2%f)
call ezfio_set_utils_cc_io_amplitudes('Read') call ezfio_set_utils_cc_io_amplitudes('Read')
endif endif
@ -195,11 +201,14 @@ subroutine run_ccsd_space_orb
deallocate(all_err,all_t) deallocate(all_err,all_t)
endif endif
call gpu_deallocate_double(H_oo) call gpu_deallocate(H_oo)
call gpu_deallocate_double(H_vv) call gpu_deallocate(H_vv)
call gpu_deallocate_double(H_vo) call gpu_deallocate(H_vo)
deallocate(r1,r2,tau) call gpu_deallocate(r1)
call gpu_deallocate(r2)
call gpu_deallocate(tau)
call gpu_deallocate(tau_x)
! CCSD(T) ! CCSD(T)
double precision :: e_t, e_t_err double precision :: e_t, e_t_err
@ -207,28 +216,14 @@ subroutine run_ccsd_space_orb
if (cc_par_t .and. elec_alpha_num + elec_beta_num > 2) then if (cc_par_t .and. elec_alpha_num + elec_beta_num > 2) then
! Dumb way
!call wall_time(ta)
!call ccsd_par_t_space(nO,nV,t1,t2,e_t)
!call wall_time(tb)
!print*,'Time: ',tb-ta, ' s'
!print*,''
!write(*,'(A15,F18.12,A3)') ' E(CCSD(T)) = ', uncorr_energy + energy + e_t, ' Ha'
!write(*,'(A15,F18.12,A3)') ' E(T) = ', e_t, ' Ha'
!write(*,'(A15,F18.12,A3)') ' Correlation = ', energy + e_t, ' Ha'
!print*,''
! New ! New
e_t = uncorr_energy + energy ! For print in (T) call e_t = uncorr_energy + energy ! For print in (T) call
e_t_err = 0.d0 e_t_err = 0.d0
print*,'Computing (T) correction...' print*,'Computing (T) correction...'
call wall_time(ta) call wall_time(ta)
! call ccsd_par_t_space_v3(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v &
! ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t)
call ccsd_par_t_space_stoch(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v & call ccsd_par_t_space_stoch(nO,nV,t1%f,t2%f,cc_space_f_o,cc_space_f_v &
,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t, e_t_err) ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t, e_t_err)
call wall_time(tb) call wall_time(tb)
@ -243,7 +238,9 @@ subroutine run_ccsd_space_orb
call save_energy(uncorr_energy + energy, e_t) call save_energy(uncorr_energy + energy, e_t)
deallocate(t1,t2) deallocate(h_t1, h_t2)
call gpu_deallocate(t1)
call gpu_deallocate(t2)
end end
@ -341,70 +338,139 @@ end
! Tau ! Tau
subroutine update_tau_space(nO,nV,t1,t2,tau) subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau)
use gpu
implicit none implicit none
! in ! in
integer, intent(in) :: nO, nV integer, intent(in) :: nO, nV
double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV) double precision, intent(in) :: h_t1(nO,nV)
type(gpu_double2), intent(in) :: t1
type(gpu_double4), intent(in) :: t2
! out ! out
double precision, intent(out) :: tau(nO,nO,nV,nV) type(gpu_double4) :: tau
! internal ! internal
integer :: i,j,a,b integer :: i,j,a,b
! !$OMP PARALLEL &
! !$OMP SHARED(nO,nV,tau,t2,t1,h_t1) &
! !$OMP PRIVATE(i,j,a,b) &
! !$OMP DEFAULT(NONE)
! !$OMP DO
! do b = 1, nV
! do a = 1, nV
! do j = 1, nO
! do i = 1, nO
! tau%f(i,j,a,b) = t2%f(i,j,a,b) + t1%f(i,a) * h_t1(j,b)
! enddo
! enddo
! enddo
! enddo
! !$OMP END DO
! !$OMP END PARALLEL
type(gpu_blas) :: blas
type(gpu_stream) :: stream(nV)
call gpu_blas_create(blas)
do b=1,nV
call gpu_stream_create(stream(b))
enddo
!$OMP PARALLEL & !$OMP PARALLEL &
!$OMP SHARED(nO,nV,tau,t2,t1) & !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas) &
!$OMP PRIVATE(i,j,a,b) & !$OMP PRIVATE(i,j,a,b) &
!$OMP DEFAULT(NONE) !$OMP DEFAULT(NONE)
do j=1,nO
!$OMP DO !$OMP DO
do b=1,nV do b=1,nV
do a = 1, nV call gpu_set_stream(blas,stream(b))
do j = 1, nO call gpu_dgeam_c(blas%c, 'N', 'N', nO*1_8, nV*1_8, &
do i = 1, nO 1.d0, c_loc(t2%f(1,j,1,b)), nO*nO*1_8, &
tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b) h_t1(j,b), t1%c, nO*1_8, &
enddo c_loc(tau%f(1,j,1,b)), nO*nO*1_8)
enddo
enddo
enddo enddo
!$OMP END DO !$OMP END DO
enddo
!$OMP END PARALLEL !$OMP END PARALLEL
call gpu_synchronize()
do b=1,nV
call gpu_stream_destroy(stream(b))
enddo
call gpu_blas_destroy(blas)
end end
subroutine update_tau_x_space(nO,nV,tau,tau_x) subroutine update_tau_x_space(nO,nV,tau,tau_x)
use gpu
implicit none implicit none
! in ! in
integer, intent(in) :: nO, nV integer, intent(in) :: nO, nV
double precision, intent(in) :: tau(nO,nO,nV,nV) type(gpu_double4), intent(in) :: tau
! out ! out
double precision, intent(out) :: tau_x(nO,nO,nV,nV) type(gpu_double4) :: tau_x
! internal ! internal
integer :: i,j,a,b integer :: i,j,a,b
! !$OMP PARALLEL &
! !$OMP SHARED(nO,nV,tau,tau_x) &
! !$OMP PRIVATE(i,j,a,b) &
! !$OMP DEFAULT(NONE)
! !$OMP DO
! do b = 1, nV
! do a = 1, nV
! do j = 1, nO
! do i = 1, nO
! tau_x%f(i,j,a,b) = 2.d0*tau%f(i,j,a,b) - tau%f(i,j,b,a)
! enddo
! enddo
! enddo
! enddo
! !$OMP END DO
! !$OMP END PARALLEL
type(gpu_blas) :: blas
type(gpu_stream) :: stream(nV)
call gpu_blas_create(blas)
do a=1,nV
call gpu_stream_create(stream(a))
enddo
!$OMP PARALLEL & !$OMP PARALLEL &
!$OMP SHARED(nO,nV,tau,tau_x) & !$OMP SHARED(nO,nV,tau,tau_x,stream,blas) &
!$OMP PRIVATE(i,j,a,b) & !$OMP PRIVATE(i,j,a,b) &
!$OMP DEFAULT(NONE) !$OMP DEFAULT(NONE)
!$OMP DO !$OMP DO
do b=1,nV do b=1,nV
do a=1,nV do a=1,nV
do j = 1, nO call gpu_set_stream(blas,stream(a))
do i = 1, nO call gpu_dgeam_c(blas%c, 'N', 'N', nO*1_8, nO*1_8, &
tau_x(i,j,a,b) = 2.d0*tau(i,j,a,b) - tau(i,j,b,a) 2.d0, c_loc(tau%f(1,1,a,b)), nO*1_8, &
enddo -1.d0, c_loc(tau%f(1,1,b,a)), nO*1_8, &
enddo c_loc(tau_x%f(1,1,a,b)), nO*1_8)
enddo enddo
enddo enddo
!$OMP END DO !$OMP END DO
!$OMP END PARALLEL !$OMP END PARALLEL
call gpu_synchronize()
do b=1,nV
call gpu_stream_destroy(stream(b))
enddo
call gpu_blas_destroy(blas)
end end
! R1 ! R1

View File

@ -294,12 +294,12 @@ end
! H_oo ! H_oo
subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo) subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo)
use gpu
implicit none implicit none
integer, intent(in) :: nO,nV integer, intent(in) :: nO,nV
double precision, intent(in) :: tau_x(nO, nO, nV, nV) type(gpu_double4), intent(in) :: tau_x
double precision, intent(out) :: H_oo(nO, nO) type(gpu_double2), intent(out) :: H_oo
integer :: a,b,i,j,u,k integer :: a,b,i,j,u,k
@ -315,7 +315,7 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo)
do b=1,nV do b=1,nV
do j=1,nO do j=1,nO
do a=1,nV do a=1,nV
tmp_vov(a,j,b) = tau_x(u,j,a,b) tmp_vov(a,j,b) = tau_x%f(u,j,a,b)
enddo enddo
enddo enddo
enddo enddo
@ -328,7 +328,7 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo)
!$omp do !$omp do
do i = 1, nO do i = 1, nO
do u = 1, nO do u = 1, nO
H_oo(u,i) = cc_space_f_oo(u,i) H_oo%f(u,i) = cc_space_f_oo(u,i)
enddo enddo
enddo enddo
!$omp end do nowait !$omp end do nowait
@ -336,7 +336,7 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo)
!$omp end parallel !$omp end parallel
call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
tau_kau, cholesky_mo_num*nV, cc_space_v_vo_chol, cholesky_mo_num*nV, & tau_kau, cholesky_mo_num*nV, cc_space_v_vo_chol, cholesky_mo_num*nV, &
1.d0, H_oo, nO) 1.d0, H_oo%f, nO)
end end

View File

@ -2,6 +2,52 @@ module gpu
use, intrinsic :: iso_c_binding use, intrinsic :: iso_c_binding
implicit none implicit none
! Data types
! ----------
type gpu_double1
type(c_ptr) :: c
double precision, pointer :: f(:)
end type
type gpu_double2
type(c_ptr) :: c
double precision, pointer :: f(:,:)
end type
type gpu_double3
type(c_ptr) :: c
double precision, pointer :: f(:,:,:)
end type
type gpu_double4
type(c_ptr) :: c
double precision, pointer :: f(:,:,:,:)
end type
type gpu_double5
type(c_ptr) :: c
double precision, pointer :: f(:,:,:,:,:)
end type
type gpu_double6
type(c_ptr) :: c
double precision, pointer :: f(:,:,:,:,:,:)
end type
type gpu_blas
type(c_ptr) :: c
end type
type gpu_stream
type(c_ptr) :: c
end type
! C interfaces
! ------------
interface interface
integer function gpu_ndevices() bind(C) integer function gpu_ndevices() bind(C)
end function end function
@ -43,100 +89,394 @@ module gpu
integer(c_int64_t), value :: n integer(c_int64_t), value :: n
end subroutine end subroutine
subroutine gpu_stream_create(stream) bind(C) subroutine gpu_stream_create_c(stream) bind(C, name='gpu_stream_create')
import import
type(c_ptr) :: stream type(c_ptr) :: stream
end subroutine end subroutine
subroutine gpu_stream_destroy(stream) bind(C) subroutine gpu_stream_destroy_c(stream) bind(C, name='gpu_stream_destroy')
import import
type(c_ptr) :: stream type(c_ptr) :: stream
end subroutine end subroutine
subroutine gpu_set_stream(handle, stream) bind(C) subroutine gpu_set_stream_c(handle, stream) bind(C, name='gpu_set_stream')
import import
type(c_ptr) :: handle, stream type(c_ptr) :: handle, stream
end subroutine end subroutine
subroutine gpu_synchronize() subroutine gpu_synchronize() bind(C)
import
end subroutine end subroutine
subroutine gpu_blas_create(handle) bind(C) subroutine gpu_blas_create_c(handle) bind(C, name='gpu_blas_create')
import import
type(c_ptr) :: handle type(c_ptr) :: handle
end subroutine end subroutine
subroutine gpu_blas_destroy(handle) bind(C) subroutine gpu_blas_destroy_c(handle) bind(C, name='gpu_blas_destroy')
import import
type(c_ptr) :: handle type(c_ptr) :: handle
end subroutine end subroutine
subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) bind(C) subroutine gpu_ddot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_ddot')
import import
type(c_ptr), intent(in) :: handle type(c_ptr), intent(in), value :: handle
integer(c_int64_t), value :: n, incx, incy integer(c_int64_t), value :: n, incx, incy
real(c_double), intent(in) :: dx(*), dy(*) type(c_ptr), intent(in), value :: dx, dy
real(c_double), intent(out) :: res real(c_double), intent(out) :: res
end subroutine end subroutine
subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res) bind(C) subroutine gpu_sdot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_sdot')
import import
type(c_ptr), intent(in) :: handle type(c_ptr), intent(in), value :: handle
integer(c_int64_t), value :: n, incx, incy integer(c_int64_t), value :: n, incx, incy
real(c_float), intent(in) :: dx(*), dy(*) type(c_ptr), intent(in), value :: dx, dy
real(c_float), intent(out) :: res real(c_float), intent(out) :: res
end subroutine end subroutine
subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
b, ldb, c, ldc) bind(C, name='gpu_dgeam')
import
type(c_ptr), intent(in), value :: handle
character(c_char), intent(in), value :: transa, transb
integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
real(c_double), intent(in), value :: alpha, beta
type(c_ptr), value :: a, b, c
end subroutine
end interface end interface
! Polymorphic interfaces
! ----------------------
interface gpu_allocate
procedure gpu_allocate_double1 &
,gpu_allocate_double2 &
,gpu_allocate_double3 &
,gpu_allocate_double4 &
,gpu_allocate_double5 &
,gpu_allocate_double6
end interface gpu_allocate
interface gpu_deallocate
procedure gpu_deallocate_double1 &
,gpu_deallocate_double2 &
,gpu_deallocate_double3 &
,gpu_deallocate_double4 &
,gpu_deallocate_double5 &
,gpu_deallocate_double6
end interface gpu_deallocate
interface gpu_upload
procedure gpu_upload_double1 &
,gpu_upload_double2 &
,gpu_upload_double3 &
,gpu_upload_double4 &
,gpu_upload_double5 &
,gpu_upload_double6
end interface gpu_upload
interface gpu_download
procedure gpu_download_double1 &
,gpu_download_double2 &
,gpu_download_double3 &
,gpu_download_double4 &
,gpu_download_double5 &
,gpu_download_double6
end interface gpu_download
interface gpu_copy
procedure gpu_copy_double1 &
,gpu_copy_double2 &
,gpu_copy_double3 &
,gpu_copy_double4 &
,gpu_copy_double5 &
,gpu_copy_double6
end interface gpu_copy
contains contains
subroutine gpu_allocate_double(ptr, s) ! gpu_allocate
implicit none ! ------------
double precision, pointer, intent(inout) :: ptr
integer, intent(in) :: s(:)
type(c_ptr) :: cptr
call gpu_allocate_c(cptr, sum(s*1_8)*8_8) subroutine gpu_allocate_double1(ptr, s)
call c_f_pointer(cptr, ptr, s) implicit none
type(gpu_double1), intent(inout) :: ptr
integer, intent(in) :: s
call gpu_allocate_c(ptr%c, s*8_8)
call c_f_pointer(ptr%c, ptr%f, (/ s /))
end subroutine end subroutine
subroutine gpu_deallocate_double(ptr) subroutine gpu_allocate_double2(ptr, s1, s2)
implicit none implicit none
double precision, pointer, intent(inout) :: ptr type(gpu_double2), intent(inout) :: ptr
type(c_ptr) :: cptr integer, intent(in) :: s1, s2
cptr = c_loc(ptr)
call gpu_deallocate_c(cptr) call gpu_allocate_c(ptr%c, s1*s2*8_8)
NULLIFY(ptr) call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /))
end subroutine
subroutine gpu_allocate_double3(ptr, s1, s2, s3)
implicit none
type(gpu_double3), intent(inout) :: ptr
integer, intent(in) :: s1, s2, s3
call gpu_allocate_c(ptr%c, s1*s2*s3*8_8)
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /))
end subroutine
subroutine gpu_allocate_double4(ptr, s1, s2, s3, s4)
implicit none
type(gpu_double4), intent(inout) :: ptr
integer, intent(in) :: s1, s2, s3, s4
call gpu_allocate_c(ptr%c, s1*s2*s3*s4*8_8)
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /))
end subroutine
subroutine gpu_allocate_double5(ptr, s1, s2, s3, s4, s5)
implicit none
type(gpu_double5), intent(inout) :: ptr
integer, intent(in) :: s1, s2, s3, s4, s5
call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*8_8)
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /))
end subroutine
subroutine gpu_allocate_double6(ptr, s1, s2, s3, s4, s5, s6)
implicit none
type(gpu_double6), intent(inout) :: ptr
integer, intent(in) :: s1, s2, s3, s4, s5, s6
call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*s6*8_8)
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /))
end subroutine
! gpu_deallocate
! --------------
subroutine gpu_deallocate_double1(ptr)
implicit none
type(gpu_double1), intent(inout) :: ptr
call gpu_deallocate_c(ptr%c)
NULLIFY(ptr%f)
end subroutine
subroutine gpu_deallocate_double2(ptr)
implicit none
type(gpu_double2), intent(inout) :: ptr
call gpu_deallocate_c(ptr%c)
NULLIFY(ptr%f)
end subroutine
subroutine gpu_deallocate_double3(ptr)
implicit none
type(gpu_double3), intent(inout) :: ptr
call gpu_deallocate_c(ptr%c)
NULLIFY(ptr%f)
end subroutine
subroutine gpu_deallocate_double4(ptr)
implicit none
type(gpu_double4), intent(inout) :: ptr
call gpu_deallocate_c(ptr%c)
NULLIFY(ptr%f)
end subroutine
subroutine gpu_deallocate_double5(ptr)
implicit none
type(gpu_double5), intent(inout) :: ptr
call gpu_deallocate_c(ptr%c)
NULLIFY(ptr%f)
end subroutine
subroutine gpu_deallocate_double6(ptr)
implicit none
type(gpu_double6), intent(inout) :: ptr
call gpu_deallocate_c(ptr%c)
NULLIFY(ptr%f)
end subroutine
! gpu_upload
! ----------
subroutine gpu_upload_double1(cpu_ptr, gpu_ptr)
implicit none
double precision, intent(in) :: cpu_ptr(:)
type(gpu_double1), intent(in) :: gpu_ptr
call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, 8_8*size(gpu_ptr%f))
end subroutine
subroutine gpu_upload_double2(cpu_ptr, gpu_ptr)
implicit none
double precision, intent(in) :: cpu_ptr(:,:)
type(gpu_double2), intent(in) :: gpu_ptr
call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
end subroutine
subroutine gpu_upload_double3(cpu_ptr, gpu_ptr)
implicit none
double precision, intent(in) :: cpu_ptr(:,:,:)
type(gpu_double3), intent(in) :: gpu_ptr
call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
end subroutine
subroutine gpu_upload_double4(cpu_ptr, gpu_ptr)
implicit none
double precision, intent(in) :: cpu_ptr(:,:,:,:)
type(gpu_double4), intent(in) :: gpu_ptr
call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
end subroutine
subroutine gpu_upload_double5(cpu_ptr, gpu_ptr)
implicit none
double precision, intent(in) :: cpu_ptr(:,:,:,:,:)
type(gpu_double5), intent(in) :: gpu_ptr
call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
end subroutine
subroutine gpu_upload_double6(cpu_ptr, gpu_ptr)
implicit none
double precision, intent(in) :: cpu_ptr(:,:,:,:,:,:)
type(gpu_double6), intent(in) :: gpu_ptr
call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
end subroutine
! gpu_download
! ------------
subroutine gpu_download_double1(gpu_ptr, cpu_ptr)
implicit none
type(gpu_double1), intent(in) :: gpu_ptr
double precision, intent(in) :: cpu_ptr(:)
call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*size(gpu_ptr%f))
end subroutine
subroutine gpu_download_double2(gpu_ptr, cpu_ptr)
implicit none
type(gpu_double2), intent(in) :: gpu_ptr
double precision, intent(in) :: cpu_ptr(:,:)
call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
end subroutine
subroutine gpu_download_double3(gpu_ptr, cpu_ptr)
implicit none
type(gpu_double3), intent(in) :: gpu_ptr
double precision, intent(in) :: cpu_ptr(:,:,:)
call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
end subroutine
subroutine gpu_download_double4(gpu_ptr, cpu_ptr)
implicit none
type(gpu_double4), intent(in) :: gpu_ptr
double precision, intent(in) :: cpu_ptr(:,:,:,:)
call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
end subroutine
subroutine gpu_download_double5(gpu_ptr, cpu_ptr)
implicit none
type(gpu_double5), intent(in) :: gpu_ptr
double precision, intent(in) :: cpu_ptr(:,:,:,:,:)
call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
end subroutine
subroutine gpu_download_double6(gpu_ptr, cpu_ptr)
implicit none
type(gpu_double6), intent(in) :: gpu_ptr
double precision, intent(in) :: cpu_ptr(:,:,:,:,:,:)
call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
end subroutine
! gpu_copy
! --------
subroutine gpu_copy_double1(gpu_ptr_src, gpu_ptr_dest)
implicit none
type(gpu_double1), intent(in) :: gpu_ptr_src
type(gpu_double1), intent(in) :: gpu_ptr_dest
call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*size(gpu_ptr_dest%f))
end subroutine
subroutine gpu_copy_double2(gpu_ptr_src, gpu_ptr_dest)
implicit none
type(gpu_double2), intent(in) :: gpu_ptr_src
type(gpu_double2), intent(in) :: gpu_ptr_dest
call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
end subroutine
subroutine gpu_copy_double3(gpu_ptr_src, gpu_ptr_dest)
implicit none
type(gpu_double3), intent(in) :: gpu_ptr_src
type(gpu_double3), intent(in) :: gpu_ptr_dest
call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
end subroutine
subroutine gpu_copy_double4(gpu_ptr_src, gpu_ptr_dest)
implicit none
type(gpu_double4), intent(in) :: gpu_ptr_src
type(gpu_double4), intent(in) :: gpu_ptr_dest
call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
end subroutine
subroutine gpu_copy_double5(gpu_ptr_src, gpu_ptr_dest)
implicit none
type(gpu_double5), intent(in) :: gpu_ptr_src
type(gpu_double5), intent(in) :: gpu_ptr_dest
call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
end subroutine
subroutine gpu_copy_double6(gpu_ptr_src, gpu_ptr_dest)
implicit none
type(gpu_double6), intent(in) :: gpu_ptr_src
type(gpu_double6), intent(in) :: gpu_ptr_dest
call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
end subroutine
! gpu_stream
! ----------
subroutine gpu_stream_create(stream)
import
type(gpu_stream) :: stream
call gpu_stream_create_c(stream%c)
end subroutine
subroutine gpu_stream_destroy(stream)
import
type(gpu_stream) :: stream
call gpu_stream_destroy_c(stream%c)
end subroutine
subroutine gpu_set_stream(handle, stream)
import
type(gpu_blas) :: handle
type(gpu_stream) :: stream
call gpu_set_stream_c(handle%c, stream%c)
end subroutine
! gpu_blas
! --------
subroutine gpu_blas_create(handle)
import
type(gpu_blas) :: handle
call gpu_blas_create_c(handle%c)
end subroutine
subroutine gpu_blas_destroy(handle)
import
type(gpu_blas) :: handle
call gpu_blas_destroy_c(handle%c)
end subroutine end subroutine
end module end module
subroutine gpu_upload_double(cpu_ptr, gpu_ptr, n)
use gpu
implicit none
double precision, intent(in) :: cpu_ptr(*)
double precision, intent(in) :: gpu_ptr(*)
integer(c_int64_t), intent(in) :: n
call gpu_upload_c(c_loc(cpu_ptr), c_loc(gpu_ptr), 8_8*n)
end subroutine
subroutine gpu_download_double(gpu_ptr, cpu_ptr, n)
use gpu
implicit none
double precision, intent(in) :: gpu_ptr(*)
double precision, intent(in) :: cpu_ptr(*)
integer(c_int64_t), intent(in) :: n
call gpu_download_c(c_loc(gpu_ptr), c_loc(cpu_ptr), 8_8*n)
end subroutine
subroutine gpu_copy_double(gpu_ptr_src, gpu_ptr_dest, n)
use gpu
implicit none
double precision, intent(in) :: gpu_ptr_src(*)
double precision, intent(in) :: gpu_ptr_dest(*)
integer(c_int64_t), intent(in) :: n
call gpu_copy_c(c_loc(gpu_ptr_src), c_loc(gpu_ptr_dest), 8_8*n)
end subroutine

View File

@ -251,7 +251,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
if (alpha == 0.) { if (alpha == 0.) {
for (int64_t j=0 ; j<n ; ++j) { for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<n ; ++i) { for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = beta * b[j*ldb+i]; c[j*ldc+i] = beta * b[j*ldb+i];
} }
} }
@ -259,7 +259,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
} else if (beta == 0.) { } else if (beta == 0.) {
for (int64_t j=0 ; j<n ; ++j) { for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<n ; ++i) { for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = alpha * a[j*lda+i]; c[j*ldc+i] = alpha * a[j*lda+i];
} }
} }
@ -267,7 +267,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
} else { } else {
for (int64_t j=0 ; j<n ; ++j) { for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<n ; ++i) { for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[j*ldb+i]; c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[j*ldb+i];
} }
} }
@ -282,7 +282,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
if (alpha == 0.) { if (alpha == 0.) {
for (int64_t j=0 ; j<n ; ++j) { for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<n ; ++i) { for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = beta * b[i*ldb+j]; c[j*ldc+i] = beta * b[i*ldb+j];
} }
} }
@ -290,7 +290,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
} else if (beta == 0.) { } else if (beta == 0.) {
for (int64_t j=0 ; j<n ; ++j) { for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<n ; ++i) { for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = alpha * a[j*lda+i]; c[j*ldc+i] = alpha * a[j*lda+i];
} }
} }
@ -298,7 +298,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
} else { } else {
for (int64_t j=0 ; j<n ; ++j) { for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<n ; ++i) { for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[i*ldb+j]; c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[i*ldb+j];
} }
} }
@ -313,7 +313,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
if (alpha == 0.) { if (alpha == 0.) {
for (int64_t j=0 ; j<n ; ++j) { for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<n ; ++i) { for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = beta * b[j*ldb+i]; c[j*ldc+i] = beta * b[j*ldb+i];
} }
} }
@ -321,7 +321,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
} else if (beta == 0.) { } else if (beta == 0.) {
for (int64_t j=0 ; j<n ; ++j) { for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<n ; ++i) { for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = alpha * a[i*lda+j]; c[j*ldc+i] = alpha * a[i*lda+j];
} }
} }
@ -329,7 +329,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
} else { } else {
for (int64_t j=0 ; j<n ; ++j) { for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<n ; ++i) { for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[j*ldb+i]; c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[j*ldb+i];
} }
} }
@ -344,7 +344,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
if (alpha == 0.) { if (alpha == 0.) {
for (int64_t j=0 ; j<n ; ++j) { for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<n ; ++i) { for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = beta * b[i*ldb+j]; c[j*ldc+i] = beta * b[i*ldb+j];
} }
} }
@ -352,7 +352,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
} else if (beta == 0.) { } else if (beta == 0.) {
for (int64_t j=0 ; j<n ; ++j) { for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<n ; ++i) { for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = alpha * a[i*lda+j]; c[j*ldc+i] = alpha * a[i*lda+j];
} }
} }
@ -360,7 +360,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
} else { } else {
for (int64_t j=0 ; j<n ; ++j) { for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<n ; ++i) { for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[i*ldb+j]; c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[i*ldb+j];
} }
} }
@ -386,7 +386,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
if (alpha == 0.) { if (alpha == 0.) {
for (int64_t j=0 ; j<n ; ++j) { for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<n ; ++i) { for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = beta * b[j*ldb+i]; c[j*ldc+i] = beta * b[j*ldb+i];
} }
} }
@ -394,7 +394,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
} else if (beta == 0.) { } else if (beta == 0.) {
for (int64_t j=0 ; j<n ; ++j) { for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<n ; ++i) { for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = alpha * a[j*lda+i]; c[j*ldc+i] = alpha * a[j*lda+i];
} }
} }
@ -402,7 +402,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
} else { } else {
for (int64_t j=0 ; j<n ; ++j) { for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<n ; ++i) { for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[j*ldb+i]; c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[j*ldb+i];
} }
} }
@ -417,7 +417,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
if (alpha == 0.) { if (alpha == 0.) {
for (int64_t j=0 ; j<n ; ++j) { for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<n ; ++i) { for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = beta * b[i*ldb+j]; c[j*ldc+i] = beta * b[i*ldb+j];
} }
} }
@ -425,7 +425,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
} else if (beta == 0.) { } else if (beta == 0.) {
for (int64_t j=0 ; j<n ; ++j) { for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<n ; ++i) { for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = alpha * a[j*lda+i]; c[j*ldc+i] = alpha * a[j*lda+i];
} }
} }
@ -433,7 +433,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
} else { } else {
for (int64_t j=0 ; j<n ; ++j) { for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<n ; ++i) { for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[i*ldb+j]; c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[i*ldb+j];
} }
} }
@ -448,7 +448,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
if (alpha == 0.) { if (alpha == 0.) {
for (int64_t j=0 ; j<n ; ++j) { for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<n ; ++i) { for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = beta * b[j*ldb+i]; c[j*ldc+i] = beta * b[j*ldb+i];
} }
} }
@ -456,7 +456,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
} else if (beta == 0.) { } else if (beta == 0.) {
for (int64_t j=0 ; j<n ; ++j) { for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<n ; ++i) { for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = alpha * a[i*lda+j]; c[j*ldc+i] = alpha * a[i*lda+j];
} }
} }
@ -464,7 +464,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
} else { } else {
for (int64_t j=0 ; j<n ; ++j) { for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<n ; ++i) { for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[j*ldb+i]; c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[j*ldb+i];
} }
} }
@ -479,7 +479,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
if (alpha == 0.) { if (alpha == 0.) {
for (int64_t j=0 ; j<n ; ++j) { for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<n ; ++i) { for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = beta * b[i*ldb+j]; c[j*ldc+i] = beta * b[i*ldb+j];
} }
} }
@ -487,7 +487,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
} else if (beta == 0.) { } else if (beta == 0.) {
for (int64_t j=0 ; j<n ; ++j) { for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<n ; ++i) { for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = alpha * a[i*lda+j]; c[j*ldc+i] = alpha * a[i*lda+j];
} }
} }
@ -495,7 +495,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
} else { } else {
for (int64_t j=0 ; j<n ; ++j) { for (int64_t j=0 ; j<n ; ++j) {
for (int64_t i=0 ; i<n ; ++i) { for (int64_t i=0 ; i<m ; ++i) {
c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[i*ldb+j]; c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[i*ldb+j];
} }
} }