From 3c8f8291bd061aa5e22ffb617aefb0d73cd4b3b0 Mon Sep 17 00:00:00 2001 From: AbdAmmar Date: Fri, 29 Nov 2024 13:58:52 +0100 Subject: [PATCH] working on dRPA (with no TDA) on GPU: saving --- src/GPU/cu_quack_module.f90 | 51 +++++------ src/RPA/phRRPA.f90 | 1 - src/RPA/phRRPA_GPU.f90 | 144 +++++++++++++++---------------- src/cuda/src/ph_drpa_amb_sing.cu | 88 +++++++++++++++++++ src/cuda/src/ph_drpa_apb_sing.cu | 88 +++++++++++++++++++ src/cuda/src/ph_drpa_sing.c | 114 ++++++++++++++++++++++++ src/cuda/src/ph_drpa_tda_sing.c | 12 ++- 7 files changed, 391 insertions(+), 107 deletions(-) create mode 100644 src/cuda/src/ph_drpa_amb_sing.cu create mode 100644 src/cuda/src/ph_drpa_apb_sing.cu create mode 100644 src/cuda/src/ph_drpa_sing.c diff --git a/src/GPU/cu_quack_module.f90 b/src/GPU/cu_quack_module.f90 index eba54a0..bb022c2 100644 --- a/src/GPU/cu_quack_module.f90 +++ b/src/GPU/cu_quack_module.f90 @@ -4,49 +4,40 @@ module cu_quack_module implicit none -!#ifdef USE_GPU -! interface -! subroutine ph_drpa_tda_sing(nO, nBas, nS, eps, ERI, & -! Omega, X) bind(C, name = "ph_drpa_tda_sing") -! -! import c_int, c_double -! integer(c_int), intent(in), value :: nO, nBas, nS -! real(c_double), intent(in) :: eps(nBas) -! real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas) -! real(c_double), intent(out) :: Omega(nS) -! real(c_double), intent(out) :: X(nS,nS) -! -! end subroutine ph_drpa_tda_sing -! end interface -!#else -! interface -! subroutine ph_drpa_tda_sing(nO, nBas, nS, eps, ERI, Omega, X) -! integer, intent(in) :: nO, nBas, nS -! double precision, intent(in) :: eps(nBas) -! double precision, intent(in) :: ERI(nBas,nBas,nBas,nBas) -! double precision, intent(out) :: Omega(nS) -! double precision, intent(out) :: X(nS,nS) -! end subroutine ph_drpa_tda_sing -! end interface -!#endif - interface + ! --- + subroutine ph_drpa_tda_sing(nO, nBas, nS, eps, ERI, & - Omega, X) bind(C, name = "ph_drpa_tda_sing") + Omega, XpY) bind(C, name = "ph_drpa_tda_sing") import c_int, c_double integer(c_int), intent(in), value :: nO, nBas, nS real(c_double), intent(in) :: eps(nBas) real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas) real(c_double), intent(out) :: Omega(nS) - real(c_double), intent(out) :: X(nS,nS) + real(c_double), intent(out) :: XpY(nS,nS) end subroutine ph_drpa_tda_sing - end interface + ! --- - ! --- + subroutine ph_drpa_sing(nO, nBas, nS, eps, ERI, & + Omega, XpY, XmY) bind(C, name = "ph_drpa_sing") + + import c_int, c_double + integer(c_int), intent(in), value :: nO, nBas, nS + real(c_double), intent(in) :: eps(nBas) + real(c_double), intent(in) :: ERI(nBas,nBas,nBas,nBas) + real(c_double), intent(out) :: Omega(nS) + real(c_double), intent(out) :: XpY(nS,nS) + real(c_double), intent(out) :: XmY(nS,nS) + + end subroutine ph_drpa_sing + + ! --- + + end interface end module cu_quack_module diff --git a/src/RPA/phRRPA.f90 b/src/RPA/phRRPA.f90 index 31e7854..3ddeb03 100644 --- a/src/RPA/phRRPA.f90 +++ b/src/RPA/phRRPA.f90 @@ -83,7 +83,6 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO, call phLR(TDA,nS,Aph,Bph,EcRPA(ispin),Om,XpY,XmY) !call wall_time(t2) !print *, "wall time diag A on CPU (sec) = ", t2 - t1 - !stop call print_excitation_energies('phRPA@RHF','singlet',nS,Om) call phLR_transition_vectors(.true.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY) diff --git a/src/RPA/phRRPA_GPU.f90 b/src/RPA/phRRPA_GPU.f90 index 839c7be..f5c27c0 100644 --- a/src/RPA/phRRPA_GPU.f90 +++ b/src/RPA/phRRPA_GPU.f90 @@ -2,62 +2,51 @@ subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF) - use cu_quack_module + use cu_quack_module -! Perform a direct random phase approximation calculation implicit none include 'parameters.h' include 'quadrature.h' -! Input variables - logical,intent(in) :: dotest + logical,intent(in) :: dotest + logical,intent(in) :: TDA + logical,intent(in) :: doACFDT + logical,intent(in) :: exchange_kernel + logical,intent(in) :: singlet + logical,intent(in) :: triplet + integer,intent(in) :: nBas + integer,intent(in) :: nC + integer,intent(in) :: nO + integer,intent(in) :: nV + integer,intent(in) :: nR + integer,intent(in) :: nS + double precision,intent(in) :: ENuc + double precision,intent(in) :: ERHF + double precision,intent(in) :: eHF(nBas) + double precision,intent(in) :: ERI(nBas,nBas,nBas,nBas) + double precision,intent(in) :: dipole_int(nBas,nBas,ncart) - logical,intent(in) :: TDA - logical,intent(in) :: doACFDT - logical,intent(in) :: exchange_kernel - logical,intent(in) :: singlet - logical,intent(in) :: triplet - integer,intent(in) :: nBas - integer,intent(in) :: nC - integer,intent(in) :: nO - integer,intent(in) :: nV - integer,intent(in) :: nR - integer,intent(in) :: nS - double precision,intent(in) :: ENuc - double precision,intent(in) :: ERHF - double precision,intent(in) :: eHF(nBas) - double precision,intent(in) :: ERI(nBas,nBas,nBas,nBas) - double precision,intent(in) :: dipole_int(nBas,nBas,ncart) -! Local variables + integer :: i + integer :: ispin + logical :: dRPA + double precision :: t1, t2 + integer, allocatable :: iorder(:) + double precision,allocatable :: Om(:) + double precision,allocatable :: XpY(:,:) + double precision,allocatable :: XmY(:,:) - integer :: i - integer :: ispin - logical :: dRPA - double precision :: t1, t2 - double precision :: lambda - double precision,allocatable :: Aph(:,:) - double precision,allocatable :: Bph(:,:) - double precision,allocatable :: Om(:) - double precision,allocatable :: XpY(:,:) - double precision,allocatable :: XmY(:,:) - ! DEBUG - !double precision, allocatable :: XpY_gpu(:,:), XmY_gpu(:,:), Om_gpu(:) + double precision :: EcRPA(nspin) - double precision :: EcRPA(nspin) - -! Hello world write(*,*) - write(*,*)'*********************************' - write(*,*)'* Restricted ph-RPA Calculation *' - write(*,*)'*********************************' + write(*,*)'******************************************' + write(*,*)'* Restricted ph-RPA Calculation (on GPU) *' + write(*,*)'******************************************' write(*,*) -! TDA - if(TDA) then write(*,*) 'Tamm-Dancoff approximation activated!' write(*,*) @@ -67,61 +56,71 @@ subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC dRPA = .true. EcRPA(:) = 0d0 - lambda = 1d0 -! Memory allocation - allocate(Om(nS),XpY(nS,nS),XmY(nS,nS)) - !allocate(Aph(nS,nS),Bph(nS,nS)) - -! Singlet manifold + allocate(Om(nS), XpY(nS,nS), XmY(nS,nS)) if(singlet) then if(TDA) then - print*, 'start diag on GPU:' - call wall_time(t1) + !print*, 'start diag on GPU:' + !call wall_time(t1) call ph_drpa_tda_sing(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om(1), XpY(1,1)) - call wall_time(t2) - print*, 'diag time on GPU (sec):', t2 - t1 - stop + !call wall_time(t2) + !print*, 'diag time on GPU (sec):', t2 - t1 XmY(:,:) = XpY(:,:) else - ! TODO - !call ph_drpa_sing(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om(1), XpY(1,1)) - !XmY(:,:) = XpY(:,:) + !print*, 'start diag on GPU:' + !call wall_time(t1) + call ph_drpa_sing(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om(1), XpY(1,1), XmY(1,1)) + !call wall_time(t2) + !print*, 'diag time on GPU (sec):', t2 - t1 endif + ! TODO + XpY(:,:) = transpose(XpY(:,:)) + XmY(:,:) = transpose(XmY(:,:)) + call print_excitation_energies('phRPA@RHF','singlet',nS,Om) call phLR_transition_vectors(.true.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY) - - end if - -! Triplet manifold + endif if(triplet) then - ispin = 2 + XpY(:,:) = 0.d0 + allocate(iorder(nS)) + ia = 0 + do i = nC+1, nO + do a = nO+1, nBas-nR + ia = ia + 1 + iorder(ia) = ia + Om(ia) = e(a) - e(i) + XpY(ia,ia) = 1.d0 + enddo + enddo - call phLR_A(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,eHF,ERI,Aph) - if(.not.TDA) call phLR_B(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,ERI,Bph) + call quick_sort(Om(1), iorder(1), nS) + deallocate(iorder) + + XmY(:,:) = XpY(:,:) - call phLR(TDA,nS,Aph,Bph,EcRPA(ispin),Om,XpY,XmY) call print_excitation_energies('phRPA@RHF','triplet',nS,Om) call phLR_transition_vectors(.false.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY) + endif - end if + deallocate(Om, XpY, XmY) + + ! TODO + ! init EcRPA if(exchange_kernel) then - EcRPA(1) = 0.5d0*EcRPA(1) EcRPA(2) = 1.5d0*EcRPA(2) - - end if + endif write(*,*) write(*,*)'-------------------------------------------------------------------------------' @@ -132,12 +131,11 @@ subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC write(*,*)'-------------------------------------------------------------------------------' write(*,*) - deallocate(Om,XpY,XmY,Aph,Bph) - ! Compute the correlation energy via the adiabatic connection if(doACFDT) then + ! TODO call phACFDT(exchange_kernel,dRPA,TDA,singlet,triplet,nBas,nC,nO,nV,nR,nS,ERI,eHF,EcRPA) write(*,*) @@ -149,13 +147,11 @@ subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC write(*,*)'-------------------------------------------------------------------------------' write(*,*) - end if + endif if(dotest) then - - call dump_test_value('R','phRPA correlation energy',sum(EcRPA)) - - end if + call dump_test_value('R','phRPA correlation energy (on GPU)',sum(EcRPA)) + endif end subroutine diff --git a/src/cuda/src/ph_drpa_amb_sing.cu b/src/cuda/src/ph_drpa_amb_sing.cu new file mode 100644 index 0000000..4f88b4b --- /dev/null +++ b/src/cuda/src/ph_drpa_amb_sing.cu @@ -0,0 +1,88 @@ +#include + +__global__ void ph_dRPA_AmB_sing_kernel(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *AmB) { + + + int i, j, a, b; + int aa, bb; + int nVS; + int nBas2, nBas3; + int i_A0, i_A1, i_A2; + int i_I0, i_I1, i_I2; + + bool a_eq_b; + + nVS = nV * nS; + + nBas2 = nBas * nBas; + nBas3 = nBas2 * nBas; + + aa = blockIdx.x * blockDim.x + threadIdx.x; + bb = blockIdx.y * blockDim.y + threadIdx.y; + + while(aa < nV) { + a = aa + nO; + + i_A0 = aa * nS; + i_I0 = a * nBas2; + + while(bb < nV) { + b = bb + nO; + + a_eq_b = a == b; + + i_A1 = i_A0 + bb; + i_I1 = i_I0 + b * nBas; + + i = 0; + while(i < nO) { + + i_A2 = i_A1 + i * nVS; + i_I2 = i_I1 + i; + + j = 0; + while(j < nO) { + + AmB[i_A2 + j * nV] = 2.0 * (ERI[i_I2 + j * nBas3] - ERI[i_I2 + j * nBas]); + if(a_eq_b && (i==j)) { + AmB[i_A2 + j * nV] += eps[a] - eps[i]; + } + + j ++; + } // j + + i ++; + } // i + + bb += blockDim.y * gridDim.y; + } // bb + + aa += blockDim.x * gridDim.x; + } // aa + +} + + + + + +extern "C" void ph_dRPA_AmB_sing(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *AmB) { + + + int sBlocks = 32; + int nBlocks = (nV + sBlocks - 1) / sBlocks; + + dim3 dimGrid(nBlocks, nBlocks, 1); + dim3 dimBlock(sBlocks, sBlocks, 1); + + printf("lunching ph_dRPA_AmB_sing_kernel with %dx%d blocks and %dx%d threads/block\n", + nBlocks, nBlocks, sBlocks, sBlocks); + + + ph_dRPA_AmB_sing_kernel<<>>(nO, nV, nBas, nS, eps, ERI, AmB); + +} + + + + diff --git a/src/cuda/src/ph_drpa_apb_sing.cu b/src/cuda/src/ph_drpa_apb_sing.cu new file mode 100644 index 0000000..d7de329 --- /dev/null +++ b/src/cuda/src/ph_drpa_apb_sing.cu @@ -0,0 +1,88 @@ +#include + +__global__ void ph_dRPA_ApB_sing_kernel(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *ApB) { + + + int i, j, a, b; + int aa, bb; + int nVS; + int nBas2, nBas3; + int i_A0, i_A1, i_A2; + int i_I0, i_I1, i_I2; + + bool a_eq_b; + + nVS = nV * nS; + + nBas2 = nBas * nBas; + nBas3 = nBas2 * nBas; + + aa = blockIdx.x * blockDim.x + threadIdx.x; + bb = blockIdx.y * blockDim.y + threadIdx.y; + + while(aa < nV) { + a = aa + nO; + + i_A0 = aa * nS; + i_I0 = a * nBas2; + + while(bb < nV) { + b = bb + nO; + + a_eq_b = a == b; + + i_A1 = i_A0 + bb; + i_I1 = i_I0 + b * nBas; + + i = 0; + while(i < nO) { + + i_A2 = i_A1 + i * nVS; + i_I2 = i_I1 + i; + + j = 0; + while(j < nO) { + + ApB[i_A2 + j * nV] = 2.0 * (ERI[i_I2 + j * nBas3] + ERI[i_I2 + j * nBas]); + if(a_eq_b && (i==j)) { + ApB[i_A2 + j * nV] += eps[a] - eps[i]; + } + + j ++; + } // j + + i ++; + } // i + + bb += blockDim.y * gridDim.y; + } // bb + + aa += blockDim.x * gridDim.x; + } // aa + +} + + + + + +extern "C" void ph_dRPA_ApB_sing(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *ApB) { + + + int sBlocks = 32; + int nBlocks = (nV + sBlocks - 1) / sBlocks; + + dim3 dimGrid(nBlocks, nBlocks, 1); + dim3 dimBlock(sBlocks, sBlocks, 1); + + printf("lunching ph_dRPA_ApB_sing_kernel with %dx%d blocks and %dx%d threads/block\n", + nBlocks, nBlocks, sBlocks, sBlocks); + + + ph_dRPA_ApB_sing_kernel<<>>(nO, nV, nBas, nS, eps, ERI, ApB); + +} + + + + diff --git a/src/cuda/src/ph_drpa_sing.c b/src/cuda/src/ph_drpa_sing.c new file mode 100644 index 0000000..f3f8244 --- /dev/null +++ b/src/cuda/src/ph_drpa_sing.c @@ -0,0 +1,114 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "utils.h" +#include "ph_rpa.h" + +void ph_drpa_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, + double *h_Omega, double *h_XpY, double *h_XmY) { + + double *d_eps = NULL; + double *d_ERI = NULL; + + int nV = nBas - nO; + + long long nBas_long = (long long) nBas; + long long nBas4 = nBas_long * nBas_long * nBas_long * nBas_long; + + long long nS_long = (long long) nS; + long long nS2 = nS_long * nS_long; + + float elapsedTime; + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + + check_Cuda_Errors(cudaMalloc((void**)&d_eps, nBas * sizeof(double)), + "cudaMalloc", __FILE__, __LINE__); + check_Cuda_Errors(cudaMalloc((void**)&d_ERI, nBas4 * sizeof(double)), + "cudaMalloc", __FILE__, __LINE__); + + printf("CPU->GPU transfer..\n"); + cudaEventRecord(start, 0); + check_Cuda_Errors(cudaMemcpy(d_eps, h_eps, nBas * sizeof(double), cudaMemcpyHostToDevice), + "cudaMemcpy", __FILE__, __LINE__); + check_Cuda_Errors(cudaMemcpy(d_ERI, h_ERI, nBas4 * sizeof(double), cudaMemcpyHostToDevice), + "cudaMemcpy", __FILE__, __LINE__); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsedTime, start, stop); + printf("Time elapsed on CPU->GPU transfer = %f msec\n", elapsedTime); + + // construct A+B & A-B + double *d_ApB = NULL; + double *d_AmB = NULL; + check_Cuda_Errors(cudaMalloc((void**)&d_ApB, nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); + check_Cuda_Errors(cudaMalloc((void**)&d_A-B, nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); + + cudaEventRecord(start, 0); + ph_dRPA_ApB_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_ApB); + ph_dRPA_AmB_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_AmB); + check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsedTime, start, stop); + printf("Time elapsed on A & B kernels = %f msec\n", elapsedTime); + + + // free memory + check_Cuda_Errors(cudaFree(d_eps), "cudaFree", __FILE__, __LINE__); + check_Cuda_Errors(cudaFree(d_ERI), "cudaFree", __FILE__, __LINE__); + + + // TODO + // diagonalize A+B and A-B + int *d_info = NULL; + double *d_Omega = NULL; + check_Cuda_Errors(cudaMalloc((void**)&d_info, sizeof(int)), + "cudaMalloc", __FILE__, __LINE__); + check_Cuda_Errors(cudaMalloc((void**)&d_Omega, nS * sizeof(double)), + "cudaMalloc", __FILE__, __LINE__); + + cudaEventRecord(start, 0); + diag_dn_dsyevd(nS, d_info, d_Omega, d_A); + check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsedTime, start, stop); + printf("Time elapsed on diagonalization = %f msec\n", elapsedTime); + + + // transfer data to CPU + cudaEventRecord(start, 0); + //int info_gpu = 0; + //check_Cuda_Errors(cudaMemcpy(&info_gpu, d_info, sizeof(int), cudaMemcpyDeviceToHost), + // "cudaMemcpy", __FILE__, __LINE__); + //if (info_gpu != 0) { + // printf("Error: diag_dn_dsyevd returned error code %d\n", info_gpu); + // exit(EXIT_FAILURE); + //} + check_Cuda_Errors(cudaMemcpy(h_XpY, d_, nS2 * sizeof(double), cudaMemcpyDeviceToHost), + "cudaMemcpy", __FILE__, __LINE__); + check_Cuda_Errors(cudaMemcpy(h_XmY, d_, nS2 * sizeof(double), cudaMemcpyDeviceToHost), + "cudaMemcpy", __FILE__, __LINE__); + check_Cuda_Errors(cudaMemcpy(h_Omega, d_Omega, nS * sizeof(double), cudaMemcpyDeviceToHost), + "cudaMemcpy", __FILE__, __LINE__); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsedTime, start, stop); + printf("Time elapsed on GPU -> CPU transfer = %f msec\n", elapsedTime); + + check_Cuda_Errors(cudaFree(d_info), "cudaFree", __FILE__, __LINE__); + check_Cuda_Errors(cudaFree(d_A), "cudaFree", __FILE__, __LINE__); + check_Cuda_Errors(cudaFree(d_B), "cudaFree", __FILE__, __LINE__); + check_Cuda_Errors(cudaFree(d_Omega), "cudaFree", __FILE__, __LINE__); + + +} + diff --git a/src/cuda/src/ph_drpa_tda_sing.c b/src/cuda/src/ph_drpa_tda_sing.c index 5cfa9ef..31d0725 100644 --- a/src/cuda/src/ph_drpa_tda_sing.c +++ b/src/cuda/src/ph_drpa_tda_sing.c @@ -9,6 +9,11 @@ #include "utils.h" #include "ph_rpa.h" +/* + * + * Y = 0 ==> X+Y = X-Y = X + * +*/ void ph_drpa_tda_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, double *h_Omega, double *h_X) { @@ -16,6 +21,9 @@ void ph_drpa_tda_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, double *d_ERI = NULL; int nV = nBas - nO; + + long long nS_long = (long long) nS; + long long nS2 = nS_long * nS_long; long long nBas_long = (long long) nBas; long long nBas4 = nBas_long * nBas_long * nBas_long * nBas_long; @@ -47,7 +55,7 @@ void ph_drpa_tda_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, // construct A double *d_A = NULL; - check_Cuda_Errors(cudaMalloc((void**)&d_A, nS * nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); + check_Cuda_Errors(cudaMalloc((void**)&d_A, nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); cudaEventRecord(start, 0); ph_dRPA_A_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_A); @@ -86,7 +94,7 @@ void ph_drpa_tda_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, // printf("Error: diag_dn_dsyevd returned error code %d\n", info_gpu); // exit(EXIT_FAILURE); //} - check_Cuda_Errors(cudaMemcpy(h_X, d_A, nS * nS * sizeof(double), cudaMemcpyDeviceToHost), + check_Cuda_Errors(cudaMemcpy(h_X, d_A, nS2 * sizeof(double), cudaMemcpyDeviceToHost), "cudaMemcpy", __FILE__, __LINE__); check_Cuda_Errors(cudaMemcpy(h_Omega, d_Omega, nS * sizeof(double), cudaMemcpyDeviceToHost), "cudaMemcpy", __FILE__, __LINE__);