#include #include #include #include #include #include #include #include "gpu.h" gpu_data_sp* gpu_init_sp( int nO, int nV, int cholesky_mo_num, double* cc_space_v_oo_chol_, double* cc_space_v_ov_chol_, double* cc_space_v_vo_chol_, double* cc_space_v_vv_chol_, double* cc_space_v_oooo_, double* cc_space_v_vooo_, double* cc_space_v_voov_, double* cc_space_v_oovv_, double* cc_space_v_vvoo_, double* cc_space_v_oovo_, double* cc_space_v_ovvo_, double* cc_space_v_ovov_, double* cc_space_v_ovoo_, double* cc_space_f_oo_, double* cc_space_f_ov_, double* cc_space_f_vo_, double* cc_space_f_vv_) { int ngpus = 1; if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus); gpu_data_sp* data = (gpu_data_sp*) malloc ((size_t) ngpus*sizeof(gpu_data_sp)); assert (data != NULL); #pragma omp parallel num_threads(ngpus) { cudaError_t cudaStat = cudaSuccess; size_t lda; int igpu = omp_get_thread_num(); cudaSetDevice(igpu); cublasHandle_t handle; cublasCreate(&handle); /* size_t limit; size_t mtotal, mfree; cudaStat = cudaMemGetInfo(&mfree, &mtotal); fprintf(stderr, "%s\nTotal:%lu\nFree :%lu\n", cudaGetErrorString(cudaStat), mtotal, mfree); */ float* d_cc_space_v_oo_chol = NULL; lda = cholesky_mo_num * nO; cudaStat = cudaMalloc((void **)&d_cc_space_v_oo_chol, lda * nO * sizeof(float)); assert (cudaStat == cudaSuccess); float* cc_space_v_oo_chol = malloc((size_t) lda * nO * sizeof(float)); assert (cc_space_v_oo_chol != NULL); for (size_t i=0 ; i<(size_t) lda * nO ; ++i) { cc_space_v_oo_chol[i] = cc_space_v_oo_chol_[i]; } cublasSetMatrix(cholesky_mo_num*nO, nO, sizeof(float), cc_space_v_oo_chol, lda, d_cc_space_v_oo_chol, lda); free(cc_space_v_oo_chol); float* d_cc_space_v_ov_chol = NULL; lda = cholesky_mo_num * nO; cudaStat = cudaMalloc((void **)&d_cc_space_v_ov_chol, lda * nV * sizeof(float)); assert (cudaStat == cudaSuccess); float* cc_space_v_ov_chol = malloc((size_t) lda * nV * sizeof(float)); assert (cc_space_v_ov_chol != NULL); for (size_t i=0 ; i<(size_t) lda * nV ; ++i) { cc_space_v_ov_chol[i] = cc_space_v_ov_chol_[i]; } cublasSetMatrix(cholesky_mo_num*nO, nV, sizeof(float), cc_space_v_ov_chol, lda, d_cc_space_v_ov_chol, lda); free(cc_space_v_ov_chol); float* d_cc_space_v_vo_chol = NULL; lda = cholesky_mo_num * nV; cudaStat = cudaMalloc((void **)&d_cc_space_v_vo_chol, lda * nO * sizeof(float)); assert (cudaStat == cudaSuccess); float* cc_space_v_vo_chol = malloc((size_t) lda * nO * sizeof(float)); assert (cc_space_v_vo_chol != NULL); for (size_t i=0 ; i<(size_t) lda * nO ; ++i) { cc_space_v_vo_chol[i] = cc_space_v_vo_chol_[i]; } cublasSetMatrix(cholesky_mo_num*nV, nO, sizeof(float), cc_space_v_vo_chol, lda, d_cc_space_v_vo_chol, lda); free(cc_space_v_vo_chol); float* d_cc_space_v_vv_chol = NULL; lda = cholesky_mo_num * nV; cudaStat = cudaMalloc((void **)&d_cc_space_v_vv_chol, lda * nV * sizeof(float)); assert (cudaStat == cudaSuccess); float* cc_space_v_vv_chol = malloc((size_t) lda * nV * sizeof(float)); assert (cc_space_v_vv_chol != NULL); for (size_t i=0 ; i< (size_t) lda * nV ; ++i) { cc_space_v_vv_chol[i] = cc_space_v_vv_chol_[i]; } cublasSetMatrix(cholesky_mo_num*nV, nV, sizeof(float), cc_space_v_vv_chol, lda, d_cc_space_v_vv_chol, lda); free(cc_space_v_vv_chol); float* d_cc_space_v_oooo = NULL; cudaStat = cudaMalloc((void**)&d_cc_space_v_oooo, nO*nO*nO*nO*sizeof(float)); assert (cudaStat == cudaSuccess); float* cc_space_v_oooo = malloc((size_t) nO*nO*nO*nO*sizeof(float)); assert (cc_space_v_oooo != NULL); for (size_t i=0 ; i<(size_t) nO*nO*nO*nO ; ++i) { cc_space_v_oooo[i] = cc_space_v_oooo_[i]; } cublasSetMatrix(nO*nO, nO*nO, sizeof(float), cc_space_v_oooo, nO*nO, d_cc_space_v_oooo, nO*nO); free(cc_space_v_oooo); float* d_cc_space_v_vooo = NULL; cudaStat = cudaMalloc((void**)&d_cc_space_v_vooo, nV*nO*nO*nO*sizeof(float)); assert (cudaStat == cudaSuccess); float* cc_space_v_vooo = malloc((size_t) nV*nO*nO*nO*sizeof(float)); assert (cc_space_v_vooo != NULL); for (size_t i=0 ; i<(size_t) nV*nO*nO*nO; ++i) { cc_space_v_vooo[i] = cc_space_v_vooo_[i]; } cublasSetMatrix(nV*nO, nO*nO, sizeof(float), cc_space_v_vooo, nV*nO, d_cc_space_v_vooo, nV*nO); free(cc_space_v_vooo); float* d_cc_space_v_voov = NULL; cudaStat = cudaMalloc((void**)&d_cc_space_v_voov, nV*nO*nO*nV*sizeof(float)); assert (cudaStat == cudaSuccess); float* cc_space_v_voov = malloc((size_t) nV*nO*nO*nV*sizeof(float)); assert (cc_space_v_voov != NULL); for (size_t i=0 ; i<(size_t) nV*nO*nO*nV; ++i) { cc_space_v_voov[i] = cc_space_v_voov_[i]; } cublasSetMatrix(nV*nO, nO*nV, sizeof(float), cc_space_v_voov, nV*nO, d_cc_space_v_voov, nV*nO); free(cc_space_v_voov); float* d_cc_space_v_oovv = NULL; cudaStat = cudaMalloc((void**)&d_cc_space_v_oovv, nO*nO*nV*nV*sizeof(float)); assert (cudaStat == cudaSuccess); float* cc_space_v_oovv = malloc((size_t) nO*nO*nV*nV*sizeof(float)); assert (cc_space_v_oovv != NULL); for (size_t i=0 ; i<(size_t) nO*nO*nV*nV; ++i) { cc_space_v_oovv[i] = cc_space_v_oovv_[i]; } cublasSetMatrix(nO*nO, nV*nV, sizeof(float), cc_space_v_oovv, nO*nO, d_cc_space_v_oovv, nO*nO); free(cc_space_v_oovv); float* d_cc_space_v_vvoo = NULL; cudaStat = cudaMalloc((void**)&d_cc_space_v_vvoo, nV*nV*nO*nO*sizeof(float)); assert (cudaStat == cudaSuccess); float* cc_space_v_vvoo = malloc((size_t) nV*nV*nO*nO*sizeof(float)); assert (cc_space_v_vvoo != NULL); for (size_t i=0 ; i<(size_t) nV*nV*nO*nO; ++i) { cc_space_v_vvoo[i] = cc_space_v_vvoo_[i]; } cublasSetMatrix(nV*nV, nO*nO, sizeof(float), cc_space_v_vvoo, nV*nV, d_cc_space_v_vvoo, nV*nV); free(cc_space_v_vvoo); float* d_cc_space_v_oovo = NULL; lda = nO*nO; cudaStat = cudaMalloc((void **)&d_cc_space_v_oovo, nO*nO*nV*nO * sizeof(float)); assert (cudaStat == cudaSuccess); float* cc_space_v_oovo = malloc((size_t) nO*nO*nV*nO * sizeof(float)); assert (cc_space_v_oovo != NULL); for (size_t i=0 ; i<(size_t) nO*nO*nV*nO ; ++i) { cc_space_v_oovo[i] = cc_space_v_oovo_[i]; } cublasSetMatrix(lda, nV*nO, sizeof(float), cc_space_v_oovo, lda, d_cc_space_v_oovo, lda); free(cc_space_v_oovo); float* d_cc_space_v_ovvo = NULL; lda = nO*nV; cudaStat = cudaMalloc((void **)&d_cc_space_v_ovvo, nO*nV*nV*nO * sizeof(float)); assert (cudaStat == cudaSuccess); float* cc_space_v_ovvo = malloc((size_t) nO*nV*nV*nO * sizeof(float)); assert (cc_space_v_ovvo != NULL); for (size_t i=0 ; i<(size_t) nO*nV*nV*nO ; ++i) { cc_space_v_ovvo[i] = cc_space_v_ovvo_[i]; } cublasSetMatrix(lda, nV*nO, sizeof(float), cc_space_v_ovvo, lda, d_cc_space_v_ovvo, lda); free(cc_space_v_ovvo); float* d_cc_space_v_ovov = NULL; lda = nO*nV; cudaStat = cudaMalloc((void **)&d_cc_space_v_ovov, nO*nV*nV*nO * sizeof(float)); assert (cudaStat == cudaSuccess); float* cc_space_v_ovov = malloc((size_t) nO*nV*nV*nO * sizeof(float)); assert (cc_space_v_ovov != NULL); for (size_t i=0 ; i<(size_t) nO*nV*nV*nO ; ++i) { cc_space_v_ovov[i] = cc_space_v_ovov_[i]; } cublasSetMatrix(lda, nV*nO, sizeof(float), cc_space_v_ovov, lda, d_cc_space_v_ovov, lda); free(cc_space_v_ovov); float* d_cc_space_v_ovoo = NULL; lda = nO*nV; cudaStat = cudaMalloc((void **)&d_cc_space_v_ovoo, nO*nV*nO*nO * sizeof(float)); assert (cudaStat == cudaSuccess); float* cc_space_v_ovoo = malloc((size_t) nO*nV*nO*nO * sizeof(float)); assert (cc_space_v_ovoo != NULL); for (size_t i=0 ; i<(size_t) nO*nV*nO*nO ; ++i) { cc_space_v_ovoo[i] = cc_space_v_ovoo_[i]; } cublasSetMatrix(lda, nO*nO, sizeof(float), cc_space_v_ovoo, lda, d_cc_space_v_ovoo, lda); free(cc_space_v_ovoo); float* d_cc_space_f_oo = NULL; cudaStat = cudaMalloc((void**)&d_cc_space_f_oo, nO*nO*sizeof(float)); assert (cudaStat == cudaSuccess); float* cc_space_f_oo = malloc((size_t) nO*nO*sizeof(float)); assert (cc_space_f_oo != NULL); for (size_t i=0 ; i<(size_t) nO*nO; ++i) { cc_space_f_oo[i] = cc_space_f_oo_[i]; } cublasSetMatrix(nO, nO, sizeof(float), cc_space_f_oo, nO, d_cc_space_f_oo, nO); free(cc_space_f_oo); float* d_cc_space_f_vo = NULL; cudaStat = cudaMalloc((void**)&d_cc_space_f_vo, nV*nO*sizeof(float)); assert (cudaStat == cudaSuccess); float* cc_space_f_vo = malloc((size_t) nV*nO*sizeof(float)); assert (cc_space_f_vo != NULL); for (size_t i=0 ; i<(size_t) nV*nO; ++i) { cc_space_f_vo[i] = cc_space_f_vo_[i]; } cublasSetMatrix(nV, nO, sizeof(float), cc_space_f_vo, nV, d_cc_space_f_vo, nV); free(cc_space_f_vo); float* d_cc_space_f_ov = NULL; cudaStat = cudaMalloc((void**)&d_cc_space_f_ov, nV*nO*sizeof(float)); assert (cudaStat == cudaSuccess); float* cc_space_f_ov = malloc((size_t) nV*nO*sizeof(float)); assert (cc_space_f_ov != NULL); for (size_t i=0 ; i<(size_t) nV*nO; ++i) { cc_space_f_ov[i] = cc_space_f_ov_[i]; } cublasSetMatrix(nO, nV, sizeof(float), cc_space_f_ov, nO, d_cc_space_f_ov, nO); free(cc_space_f_ov); float* d_cc_space_f_vv = NULL; cudaStat = cudaMalloc((void**)&d_cc_space_f_vv, nV*nV*sizeof(float)); assert (cudaStat == cudaSuccess); float* cc_space_f_vv = malloc((size_t) nV*nV*sizeof(float)); assert (cc_space_f_vv != NULL); for (size_t i=0 ; i<(size_t) nV*nV; ++i) { cc_space_f_vv[i] = cc_space_f_vv_[i]; } cublasSetMatrix(nV, nV, sizeof(float), cc_space_f_vv, nV, d_cc_space_f_vv, nV); free(cc_space_f_vv); float* d_tau = NULL; lda = nO * nO; cudaStat = cudaMalloc((void **)&d_tau, lda * nV * nV * sizeof(float)); assert (cudaStat == cudaSuccess); float* d_tau_x = NULL; lda = nO * nO; cudaStat = cudaMalloc((void **)&d_tau_x, lda * nV * nV * sizeof(float)); assert (cudaStat == cudaSuccess); float* d_t1 = NULL; cudaStat = cudaMalloc((void **)&d_t1, nO * nV * sizeof(float)); assert (cudaStat == cudaSuccess); float* d_t2 = NULL; cudaStat = cudaMalloc((void **)&d_t2, nO*nO*nV*nV * sizeof(float)); assert (cudaStat == cudaSuccess); float* d_H_oo = NULL; cudaStat = cudaMalloc((void **)&d_H_oo, nO * nO * sizeof(float)); assert (cudaStat == cudaSuccess); float* d_H_vo = NULL; cudaStat = cudaMalloc((void **)&d_H_vo, nV * nO * sizeof(float)); assert (cudaStat == cudaSuccess); float* d_H_vv = NULL; cudaStat = cudaMalloc((void **)&d_H_vv, nV * nV * sizeof(float)); assert (cudaStat == cudaSuccess); data[igpu].cc_space_v_oo_chol = d_cc_space_v_oo_chol; data[igpu].cc_space_v_ov_chol = d_cc_space_v_ov_chol; data[igpu].cc_space_v_vo_chol = d_cc_space_v_vo_chol; data[igpu].cc_space_v_vv_chol = d_cc_space_v_vv_chol; data[igpu].cc_space_v_oooo = d_cc_space_v_oooo; data[igpu].cc_space_v_vooo = d_cc_space_v_vooo; data[igpu].cc_space_v_voov = d_cc_space_v_voov; data[igpu].cc_space_v_oovv = d_cc_space_v_oovv; data[igpu].cc_space_v_vvoo = d_cc_space_v_vvoo; data[igpu].cc_space_v_oovo = d_cc_space_v_oovo; data[igpu].cc_space_v_ovvo = d_cc_space_v_ovvo; data[igpu].cc_space_v_ovov = d_cc_space_v_ovov; data[igpu].cc_space_v_ovoo = d_cc_space_v_ovoo; data[igpu].cc_space_f_oo = d_cc_space_f_oo; data[igpu].cc_space_f_ov = d_cc_space_f_ov; data[igpu].cc_space_f_vo = d_cc_space_f_vo; data[igpu].cc_space_f_vv = d_cc_space_f_vv; data[igpu].tau = d_tau; data[igpu].tau_x = d_tau_x; data[igpu].t1 = d_t1; data[igpu].t2 = d_t2; data[igpu].H_oo = d_H_oo; data[igpu].H_vo = d_H_vo; data[igpu].H_vv = d_H_vv; data[igpu].nO = nO; data[igpu].nV = nV; data[igpu].cholesky_mo_num = cholesky_mo_num; } return data; } void gpu_deinit_sp(gpu_data_sp* data) { int ngpus = 1; if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus); #pragma omp parallel num_threads(ngpus) { size_t lda; int igpu = omp_get_thread_num(); cudaSetDevice(igpu); free(data[igpu].cc_space_v_oo_chol); free(data[igpu].cc_space_v_ov_chol); free(data[igpu].cc_space_v_vo_chol); free(data[igpu].cc_space_v_vv_chol); free(data[igpu].cc_space_v_oooo); free(data[igpu].cc_space_v_vooo); free(data[igpu].cc_space_v_voov); free(data[igpu].cc_space_v_oovv); free(data[igpu].cc_space_v_vvoo); free(data[igpu].cc_space_v_oovo); free(data[igpu].cc_space_v_ovvo); free(data[igpu].cc_space_v_ovov); free(data[igpu].cc_space_v_ovoo); free(data[igpu].cc_space_f_oo); free(data[igpu].cc_space_f_ov); free(data[igpu].cc_space_f_vo); free(data[igpu].cc_space_f_vv); free(data[igpu].tau); free(data[igpu].tau_x); free(data[igpu].t1); free(data[igpu].t2); free(data[igpu].H_oo); free(data[igpu].H_vo); free(data[igpu].H_vv); } }