#include #include #include #include extern "C" { /* Generic functions */ int gpu_ndevices() { return 1; } void gpu_set_device(int32_t igpu) { } /* Allocation functions */ void gpu_allocate(void** ptr, int64_t size) { auto queue = sycl::queue(sycl::default_selector_v); try { *ptr = sycl::malloc_shared(size, queue); assert(*ptr != nullptr); } catch (const sycl::exception& e) { std::cerr << "SYCL exception caught: " << e.what() << std::endl; *ptr = nullptr; // If allocation fails, set pointer to nullptr } } void gpu_deallocate(void** ptr) { assert(*ptr != nullptr); sycl::free(*ptr, sycl::queue(sycl::default_selector_v)); *ptr = nullptr; } /* Upload data from host to device */ void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) { sycl::queue queue(sycl::default_selector_v); queue.memcpy(gpu_ptr, cpu_ptr, n).wait(); } /* Download data from device to host */ void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) { sycl::queue queue(sycl::default_selector_v); queue.memcpy(cpu_ptr, gpu_ptr, n).wait(); } /* Copy data from one GPU memory location to another */ void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) { sycl::queue queue(sycl::default_selector_v); queue.memcpy(gpu_ptr_dest, gpu_ptr_src, n).wait(); } /* Queues */ /* SYCL queue as a replacement for CUDA stream */ void gpu_stream_create(sycl::queue** ptr) { *ptr = new sycl::queue(sycl::default_selector_v); } void gpu_stream_destroy(sycl::queue** ptr) { assert(*ptr != nullptr); delete *ptr; *ptr = nullptr; } void gpu_synchronize() { sycl::queue queue(sycl::default_selector_v); queue.wait_and_throw(); } /* BLAS functions */ typedef struct { sycl::queue* queue; } blasHandle_t; void gpu_set_stream(blasHandle_t* handle, sycl::queue* ptr) { handle->queue = ptr; } void gpu_blas_create(blasHandle_t** ptr) { *ptr = (blasHandle_t*) malloc(sizeof(blasHandle_t)); assert(*ptr != nullptr); (*ptr)->queue = new sycl::queue(sycl::default_selector_v); assert((*ptr)->queue != nullptr); } void gpu_blas_destroy(blasHandle_t** ptr) { assert(*ptr != nullptr); delete (*ptr)->queue; free(*ptr); *ptr = nullptr; } void gpu_ddot(blasHandle_t* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { // Ensure input parameters are valid assert(handle != nullptr); assert(handle->queue != nullptr); assert(n > 0); assert(incx > 0); assert(incy > 0); assert(x != nullptr); assert(y != nullptr); assert(result != nullptr); oneapi::mkl::blas::dot(*handle->queue, n, x, incx, y, incy, result); } void gpu_dgemv(blasHandle_t* handle, const char* transa, const int64_t m, const int64_t n, const double* alpha, const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) { assert(handle != nullptr); assert(handle->queue != nullptr); // Validate matrix dimensions and increments to be positive assert(m > 0 && n > 0 && lda > 0 && incx > 0 && incy > 0); assert(a != nullptr && x != nullptr && y != nullptr && alpha != nullptr && beta != nullptr); // Determine the operation type oneapi::mkl::transpose transa_ = oneapi::mkl::transpose::nontrans; if (*transa == 'T' || *transa == 't') { transa_ = oneapi::mkl::transpose::trans; } // Perform DGEMV operation using oneMKL oneapi::mkl::blas::column_major::gemv(*handle->queue, transa_, m, n, *alpha, a, lda, x, incx, *beta, y, incy); } void gpu_dgemm(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha, const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) { assert(handle != nullptr && handle->queue != nullptr); assert(m > 0 && n > 0 && k > 0 && lda > 0 && ldb > 0 && ldc > 0); assert(a != nullptr && b != nullptr && c != nullptr && alpha != nullptr && beta != nullptr); // Transpose operations auto transa_ = (*transa == 'T' || *transa == 't') ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans; auto transb_ = (*transb == 'T' || *transb == 't') ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans; oneapi::mkl::blas::column_major::gemm(*handle->queue, transa_, transb_, m, n, k, *alpha, a, lda, b, ldb, *beta, c, ldc); } void gpu_dgeam(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha, const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { assert(handle != nullptr && handle->queue != nullptr); assert(m > 0 && n > 0 && lda > 0 && ldb > 0 && ldc > 0); assert(a != nullptr && b != nullptr && c != nullptr && alpha != nullptr && beta != nullptr); // Determine transpose operations bool transA = (*transa == 'T' || *transa == 't'); bool transB = (*transb == 'T' || *transb == 't'); handle->queue->submit([&](sycl::handler& cgh) { cgh.parallel_for(sycl::range<2>(m, n), [=](sycl::id<2> idx) { const int i = idx[0]; const int j = idx[1]; const int ai = transA ? j * lda + i : i * lda + j; const int bi = transB ? j * ldb + i : i * ldb + j; const int ci = i * ldc + j; c[ci] = (*alpha) * a[ai] + (*beta) * b[bi]; }); }); } } // extern C