#include #include #include #include #include #include #include "gpu.h" #define BLOCK_SIZE 16 gpu_data* gpu_init( int nO, int nV, int cholesky_mo_num, double* cc_space_v_oo_chol, double* cc_space_v_ov_chol, double* cc_space_v_vo_chol, double* cc_space_v_vv_chol, double* cc_space_v_oooo, double* cc_space_v_vooo, double* cc_space_v_oovv, double* cc_space_v_vvoo, double* cc_space_v_oovo, double* cc_space_v_ovvo, double* cc_space_v_ovov, double* cc_space_v_ovoo, double* cc_space_f_vo) { int ngpus = 1; cudaGetDeviceCount(&ngpus); gpu_data* data = (gpu_data*) malloc (ngpus*sizeof(gpu_data)); #pragma omp parallel num_threads(ngpus) { int lda; int igpu = omp_get_thread_num(); cudaSetDevice(igpu); cublasHandle_t handle; cublasCreate(&handle); double* d_cc_space_v_oo_chol; lda = cholesky_mo_num * nO; cudaMalloc((void **)&d_cc_space_v_oo_chol, lda * nO * sizeof(double)); cublasSetMatrix(cholesky_mo_num*nO, nO, sizeof(double), cc_space_v_oo_chol, lda, d_cc_space_v_oo_chol, lda); double* d_cc_space_v_ov_chol; lda = cholesky_mo_num * nO; cudaMalloc((void **)&d_cc_space_v_ov_chol, lda * nV * sizeof(double)); cublasSetMatrix(cholesky_mo_num*nO, nV, sizeof(double), cc_space_v_ov_chol, lda, d_cc_space_v_ov_chol, lda); double* d_cc_space_v_vo_chol; lda = cholesky_mo_num * nV; cudaMalloc((void **)&d_cc_space_v_vo_chol, lda * nO * sizeof(double)); cublasSetMatrix(cholesky_mo_num*nV, nO, sizeof(double), cc_space_v_vo_chol, lda, d_cc_space_v_vo_chol, lda); double* d_cc_space_v_vv_chol; lda = cholesky_mo_num * nV; cudaMalloc((void **)&d_cc_space_v_vv_chol, lda * nV * sizeof(double)); cublasSetMatrix(cholesky_mo_num*nV, nV, sizeof(double), cc_space_v_vv_chol, lda, d_cc_space_v_vv_chol, lda); double* d_cc_space_v_oooo; cudaMalloc((void**)&d_cc_space_v_oooo, nO*nO*nO*nO*sizeof(double)); cublasSetMatrix(nO*nO, nO*nO, sizeof(double), cc_space_v_oooo, nO*nO, d_cc_space_v_oooo, nO*nO); double* d_cc_space_v_vooo; cudaMalloc((void**)&d_cc_space_v_vooo, nV*nO*nO*nO*sizeof(double)); cublasSetMatrix(nV*nO, nO*nO, sizeof(double), cc_space_v_vooo, nV*nO, d_cc_space_v_vooo, nV*nO); double* d_cc_space_v_oovv; cudaMalloc((void**)&d_cc_space_v_oovv, nO*nO*nV*nV*sizeof(double)); cublasSetMatrix(nO*nO, nV*nV, sizeof(double), cc_space_v_oovv, nO*nO, d_cc_space_v_oovv, nO*nO); double* d_cc_space_v_vvoo; cudaMalloc((void**)&d_cc_space_v_vvoo, nV*nV*nO*nO*sizeof(double)); cublasSetMatrix(nV*nV, nO*nO, sizeof(double), cc_space_v_vvoo, nV*nV, d_cc_space_v_vvoo, nV*nV); double* d_cc_space_v_oovo; lda = nO*nO; cudaMalloc((void **)&d_cc_space_v_oovo, nO*nO*nV*nO * sizeof(double)); cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_oovo, lda, d_cc_space_v_oovo, lda); double* d_cc_space_v_ovvo; lda = nO*nV; cudaMalloc((void **)&d_cc_space_v_ovvo, nO*nV*nV*nO * sizeof(double)); cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_ovvo, lda, d_cc_space_v_ovvo, lda); double* d_cc_space_v_ovov; lda = nO*nV; cudaMalloc((void **)&d_cc_space_v_ovov, nO*nV*nV*nO * sizeof(double)); cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_ovov, lda, d_cc_space_v_ovov, lda); double* d_cc_space_v_ovoo; lda = nO*nV; cudaMalloc((void **)&d_cc_space_v_ovoo, nO*nV*nO*nO * sizeof(double)); cublasSetMatrix(lda, nO*nO, sizeof(double), cc_space_v_ovoo, lda, d_cc_space_v_ovoo, lda); double* d_cc_space_f_vo; cudaMalloc((void**)&d_cc_space_f_vo, nV*nO*sizeof(double)); cublasSetMatrix(nV, nO, sizeof(double), cc_space_f_vo, nV, d_cc_space_f_vo, nV); data[igpu].cc_space_v_oo_chol = d_cc_space_v_oo_chol; data[igpu].cc_space_v_ov_chol = d_cc_space_v_ov_chol; data[igpu].cc_space_v_vo_chol = d_cc_space_v_vo_chol; data[igpu].cc_space_v_vv_chol = d_cc_space_v_vv_chol; data[igpu].cc_space_v_oooo = d_cc_space_v_oooo; data[igpu].cc_space_v_vooo = d_cc_space_v_vooo; data[igpu].cc_space_v_oovv = d_cc_space_v_oovv; data[igpu].cc_space_v_vvoo = d_cc_space_v_vvoo; data[igpu].cc_space_v_oovo = d_cc_space_v_oovo; data[igpu].cc_space_v_ovvo = d_cc_space_v_ovvo; data[igpu].cc_space_v_ovov = d_cc_space_v_ovov; data[igpu].cc_space_v_ovoo = d_cc_space_v_ovoo; data[igpu].cc_space_f_vo = d_cc_space_f_vo; } return data; }