mirror of
https://github.com/QuantumPackage/qp2.git
synced 2024-12-22 19:43:32 +01:00
Merge branch 'dev-stable' of https://github.com/QuantumPackage/qp2 into dev-stable
This commit is contained in:
commit
d1ef3e178f
10
configure
vendored
10
configure
vendored
@ -40,7 +40,7 @@ Usage:
|
|||||||
$(basename $0) -c <file>
|
$(basename $0) -c <file>
|
||||||
$(basename $0) -h
|
$(basename $0) -h
|
||||||
$(basename $0) -i <package>
|
$(basename $0) -i <package>
|
||||||
$(basename $0) -g [nvidia|none]
|
$(basename $0) -g [nvidia|intel|none]
|
||||||
|
|
||||||
Options:
|
Options:
|
||||||
-c <file> Define a COMPILATION configuration file,
|
-c <file> Define a COMPILATION configuration file,
|
||||||
@ -49,7 +49,7 @@ Options:
|
|||||||
-i <package> INSTALL <package>. Use at your OWN RISK:
|
-i <package> INSTALL <package>. Use at your OWN RISK:
|
||||||
no support will be provided for the installation of
|
no support will be provided for the installation of
|
||||||
dependencies.
|
dependencies.
|
||||||
-g [nvidia|none] Choose GPU acceleration (experimental)
|
-g [nvidia|intel|none] Choose GPU acceleration
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
./$(basename $0) -c config/gfortran.cfg
|
./$(basename $0) -c config/gfortran.cfg
|
||||||
@ -117,10 +117,14 @@ done
|
|||||||
# Handle GPU acceleration
|
# Handle GPU acceleration
|
||||||
rm -f ${QP_ROOT}/src/gpu_arch
|
rm -f ${QP_ROOT}/src/gpu_arch
|
||||||
case "$GPU" in
|
case "$GPU" in
|
||||||
amd) # Nvidia
|
amd) # AMD
|
||||||
echo "Activating AMD GPU acceleration"
|
echo "Activating AMD GPU acceleration"
|
||||||
ln -s ${QP_ROOT}/plugins/local/gpu_amd ${QP_ROOT}/src/gpu_arch
|
ln -s ${QP_ROOT}/plugins/local/gpu_amd ${QP_ROOT}/src/gpu_arch
|
||||||
;;
|
;;
|
||||||
|
intel) # Intel
|
||||||
|
echo "Activating Intel GPU acceleration (EXPERIMENTAL)"
|
||||||
|
ln -s ${QP_ROOT}/plugins/local/gpu_intel ${QP_ROOT}/src/gpu_arch
|
||||||
|
;;
|
||||||
nvidia) # Nvidia
|
nvidia) # Nvidia
|
||||||
echo "Activating Nvidia GPU acceleration"
|
echo "Activating Nvidia GPU acceleration"
|
||||||
ln -s ${QP_ROOT}/plugins/local/gpu_nvidia ${QP_ROOT}/src/gpu_arch
|
ln -s ${QP_ROOT}/plugins/local/gpu_nvidia ${QP_ROOT}/src/gpu_arch
|
||||||
|
2
plugins/local/gpu_intel/LIB
Normal file
2
plugins/local/gpu_intel/LIB
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
-ltbb -lsycl -lmkl_sycl -lgpu -limf -lintlc -lstdc++
|
||||||
|
|
1
plugins/local/gpu_intel/NEED
Normal file
1
plugins/local/gpu_intel/NEED
Normal file
@ -0,0 +1 @@
|
|||||||
|
|
8
plugins/local/gpu_intel/README.rst
Normal file
8
plugins/local/gpu_intel/README.rst
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
=========
|
||||||
|
gpu_intel
|
||||||
|
=========
|
||||||
|
|
||||||
|
Intel implementation of GPU routines. Uses MKL and SYCL.
|
||||||
|
```bash
|
||||||
|
icpx -fsycl gpu.cxx -c -qmkl=sequential
|
||||||
|
```
|
177
plugins/local/gpu_intel/gpu.sycl
Normal file
177
plugins/local/gpu_intel/gpu.sycl
Normal file
@ -0,0 +1,177 @@
|
|||||||
|
#include <CL/sycl.hpp>
|
||||||
|
#include <cassert>
|
||||||
|
#include <limits>
|
||||||
|
#include <oneapi/mkl/blas.hpp>
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
|
||||||
|
/* Generic functions */
|
||||||
|
|
||||||
|
int gpu_ndevices() {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
void gpu_set_device(int32_t igpu) {
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Allocation functions */
|
||||||
|
|
||||||
|
void gpu_allocate(void** ptr, int64_t size) {
|
||||||
|
auto queue = sycl::queue(sycl::default_selector_v);
|
||||||
|
|
||||||
|
try {
|
||||||
|
*ptr = sycl::malloc_shared(size, queue);
|
||||||
|
assert(*ptr != nullptr);
|
||||||
|
} catch (const sycl::exception& e) {
|
||||||
|
std::cerr << "SYCL exception caught: " << e.what() << std::endl;
|
||||||
|
*ptr = nullptr; // If allocation fails, set pointer to nullptr
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void gpu_deallocate(void** ptr) {
|
||||||
|
assert(*ptr != nullptr);
|
||||||
|
sycl::free(*ptr, sycl::queue(sycl::default_selector_v));
|
||||||
|
*ptr = nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Upload data from host to device */
|
||||||
|
void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
|
||||||
|
sycl::queue queue(sycl::default_selector_v);
|
||||||
|
queue.memcpy(gpu_ptr, cpu_ptr, n).wait();
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Download data from device to host */
|
||||||
|
void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
|
||||||
|
sycl::queue queue(sycl::default_selector_v);
|
||||||
|
queue.memcpy(cpu_ptr, gpu_ptr, n).wait();
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Copy data from one GPU memory location to another */
|
||||||
|
void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
|
||||||
|
sycl::queue queue(sycl::default_selector_v);
|
||||||
|
queue.memcpy(gpu_ptr_dest, gpu_ptr_src, n).wait();
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Queues */
|
||||||
|
|
||||||
|
/* SYCL queue as a replacement for CUDA stream */
|
||||||
|
void gpu_stream_create(sycl::queue** ptr) {
|
||||||
|
*ptr = new sycl::queue(sycl::default_selector_v);
|
||||||
|
}
|
||||||
|
|
||||||
|
void gpu_stream_destroy(sycl::queue** ptr) {
|
||||||
|
assert(*ptr != nullptr);
|
||||||
|
delete *ptr;
|
||||||
|
*ptr = nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
void gpu_synchronize() {
|
||||||
|
sycl::queue queue(sycl::default_selector_v);
|
||||||
|
queue.wait_and_throw();
|
||||||
|
}
|
||||||
|
|
||||||
|
/* BLAS functions */
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
sycl::queue* queue;
|
||||||
|
} blasHandle_t;
|
||||||
|
|
||||||
|
void gpu_set_stream(blasHandle_t* handle, sycl::queue* ptr) {
|
||||||
|
handle->queue = ptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
void gpu_blas_create(blasHandle_t** ptr) {
|
||||||
|
*ptr = (blasHandle_t*) malloc(sizeof(blasHandle_t));
|
||||||
|
assert(*ptr != nullptr);
|
||||||
|
(*ptr)->queue = new sycl::queue(sycl::default_selector_v);
|
||||||
|
assert((*ptr)->queue != nullptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
void gpu_blas_destroy(blasHandle_t** ptr) {
|
||||||
|
assert(*ptr != nullptr);
|
||||||
|
delete (*ptr)->queue;
|
||||||
|
free(*ptr);
|
||||||
|
*ptr = nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void gpu_ddot(blasHandle_t* handle, const int64_t n, const double* x, const int64_t incx,
|
||||||
|
const double* y, const int64_t incy, double* result) {
|
||||||
|
// Ensure input parameters are valid
|
||||||
|
assert(handle != nullptr);
|
||||||
|
assert(handle->queue != nullptr);
|
||||||
|
assert(n > 0);
|
||||||
|
assert(incx > 0);
|
||||||
|
assert(incy > 0);
|
||||||
|
assert(x != nullptr);
|
||||||
|
assert(y != nullptr);
|
||||||
|
assert(result != nullptr);
|
||||||
|
|
||||||
|
oneapi::mkl::blas::dot(*handle->queue, n, x, incx, y, incy, result);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
void gpu_dgemv(blasHandle_t* handle, const char* transa, const int64_t m, const int64_t n, const double* alpha,
|
||||||
|
const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) {
|
||||||
|
|
||||||
|
assert(handle != nullptr);
|
||||||
|
assert(handle->queue != nullptr);
|
||||||
|
|
||||||
|
// Validate matrix dimensions and increments to be positive
|
||||||
|
assert(m > 0 && n > 0 && lda > 0 && incx > 0 && incy > 0);
|
||||||
|
assert(a != nullptr && x != nullptr && y != nullptr && alpha != nullptr && beta != nullptr);
|
||||||
|
|
||||||
|
// Determine the operation type
|
||||||
|
oneapi::mkl::transpose transa_ = oneapi::mkl::transpose::nontrans;
|
||||||
|
if (*transa == 'T' || *transa == 't') {
|
||||||
|
transa_ = oneapi::mkl::transpose::trans;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Perform DGEMV operation using oneMKL
|
||||||
|
oneapi::mkl::blas::column_major::gemv(*handle->queue, transa_, m, n, *alpha, a, lda, x, incx, *beta, y, incy);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
void gpu_dgemm(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha,
|
||||||
|
const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) {
|
||||||
|
|
||||||
|
assert(handle != nullptr && handle->queue != nullptr);
|
||||||
|
assert(m > 0 && n > 0 && k > 0 && lda > 0 && ldb > 0 && ldc > 0);
|
||||||
|
assert(a != nullptr && b != nullptr && c != nullptr && alpha != nullptr && beta != nullptr);
|
||||||
|
|
||||||
|
// Transpose operations
|
||||||
|
auto transa_ = (*transa == 'T' || *transa == 't') ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans;
|
||||||
|
auto transb_ = (*transb == 'T' || *transb == 't') ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans;
|
||||||
|
|
||||||
|
oneapi::mkl::blas::column_major::gemm(*handle->queue, transa_, transb_, m, n, k,
|
||||||
|
*alpha, a, lda, b, ldb, *beta, c, ldc);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void gpu_dgeam(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha,
|
||||||
|
const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
|
||||||
|
assert(handle != nullptr && handle->queue != nullptr);
|
||||||
|
assert(m > 0 && n > 0 && lda > 0 && ldb > 0 && ldc > 0);
|
||||||
|
assert(a != nullptr && b != nullptr && c != nullptr && alpha != nullptr && beta != nullptr);
|
||||||
|
|
||||||
|
// Determine transpose operations
|
||||||
|
bool transA = (*transa == 'T' || *transa == 't');
|
||||||
|
bool transB = (*transb == 'T' || *transb == 't');
|
||||||
|
|
||||||
|
handle->queue->submit([&](sycl::handler& cgh) {
|
||||||
|
cgh.parallel_for(sycl::range<2>(m, n), [=](sycl::id<2> idx) {
|
||||||
|
const int i = idx[0];
|
||||||
|
const int j = idx[1];
|
||||||
|
const int ai = transA ? j * lda + i : i * lda + j;
|
||||||
|
const int bi = transB ? j * ldb + i : i * ldb + j;
|
||||||
|
const int ci = i * ldc + j;
|
||||||
|
|
||||||
|
c[ci] = (*alpha) * a[ai] + (*beta) * b[bi];
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
} // extern C
|
@ -996,8 +996,8 @@ subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvoo,d_cc_space_v_vo_chol
|
|||||||
integer, intent(in) :: nO,nV
|
integer, intent(in) :: nO,nV
|
||||||
type(gpu_double2), intent(in) :: t1
|
type(gpu_double2), intent(in) :: t1
|
||||||
type(gpu_double4), intent(in) :: t2, v_ovvo, v_ovoo, v_vvoo
|
type(gpu_double4), intent(in) :: t2, v_ovvo, v_ovoo, v_vvoo
|
||||||
|
type(gpu_double3), intent(in) :: d_cc_space_v_vo_chol,d_cc_space_v_vv_chol
|
||||||
type(gpu_double4), intent(out) :: J1
|
type(gpu_double4), intent(out) :: J1
|
||||||
type(gpu_double3), intent(out) :: d_cc_space_v_vo_chol,d_cc_space_v_vv_chol
|
|
||||||
|
|
||||||
integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam
|
integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user