mirror of
https://gitlab.com/scemama/qp_plugins_scemama.git
synced 2025-01-05 10:59:10 +01:00
232 lines
8.9 KiB
C
232 lines
8.9 KiB
C
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <omp.h>
|
|
#include <cublas_v2.h>
|
|
#include <cuda_runtime.h>
|
|
#include "gpu.h"
|
|
#include "assert.h"
|
|
|
|
gpu_data* gpu_init(
|
|
int nO, int nV, int cholesky_mo_num,
|
|
double* cc_space_v_oo_chol, double* cc_space_v_ov_chol,
|
|
double* cc_space_v_vo_chol, double* cc_space_v_vv_chol,
|
|
double* cc_space_v_oooo, double* cc_space_v_vooo, double* cc_space_v_voov,
|
|
double* cc_space_v_oovv, double* cc_space_v_vvoo,
|
|
double* cc_space_v_oovo, double* cc_space_v_ovvo,
|
|
double* cc_space_v_ovov, double* cc_space_v_ovoo,
|
|
double* cc_space_f_oo, double* cc_space_f_ov,
|
|
double* cc_space_f_vo, double* cc_space_f_vv)
|
|
{
|
|
int ngpus = 1;
|
|
if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
|
|
|
|
gpu_data* data = (gpu_data*) malloc (ngpus*sizeof(gpu_data));
|
|
assert (data != NULL);
|
|
|
|
#pragma omp parallel num_threads(ngpus)
|
|
{
|
|
cudaError_t cudaStat = cudaSuccess;
|
|
size_t lda;
|
|
|
|
int igpu = omp_get_thread_num();
|
|
cudaSetDevice(igpu);
|
|
cublasHandle_t handle;
|
|
cublasCreate(&handle);
|
|
|
|
double* d_cc_space_v_oo_chol;
|
|
lda = cholesky_mo_num * nO;
|
|
cudaStat = cudaMalloc((void **)&d_cc_space_v_oo_chol, lda * nO * sizeof(double));
|
|
assert (cudaStat == cudaSuccess);
|
|
cublasSetMatrix(cholesky_mo_num*nO, nO, sizeof(double), cc_space_v_oo_chol, lda, d_cc_space_v_oo_chol, lda);
|
|
|
|
double* d_cc_space_v_ov_chol;
|
|
lda = cholesky_mo_num * nO;
|
|
cudaStat = cudaMalloc((void **)&d_cc_space_v_ov_chol, lda * nV * sizeof(double));
|
|
assert (cudaStat == cudaSuccess);
|
|
cublasSetMatrix(cholesky_mo_num*nO, nV, sizeof(double), cc_space_v_ov_chol, lda, d_cc_space_v_ov_chol, lda);
|
|
|
|
double* d_cc_space_v_vo_chol;
|
|
lda = cholesky_mo_num * nV;
|
|
cudaStat = cudaMalloc((void **)&d_cc_space_v_vo_chol, lda * nO * sizeof(double));
|
|
assert (cudaStat == cudaSuccess);
|
|
cublasSetMatrix(cholesky_mo_num*nV, nO, sizeof(double), cc_space_v_vo_chol, lda, d_cc_space_v_vo_chol, lda);
|
|
|
|
double* d_cc_space_v_vv_chol;
|
|
lda = cholesky_mo_num * nV;
|
|
cudaStat = cudaMalloc((void **)&d_cc_space_v_vv_chol, lda * nV * sizeof(double));
|
|
assert (cudaStat == cudaSuccess);
|
|
cublasSetMatrix(cholesky_mo_num*nV, nV, sizeof(double), cc_space_v_vv_chol, lda, d_cc_space_v_vv_chol, lda);
|
|
|
|
double* d_cc_space_v_oooo;
|
|
cudaStat = cudaMalloc((void**)&d_cc_space_v_oooo, nO*nO*nO*nO*sizeof(double));
|
|
assert (cudaStat == cudaSuccess);
|
|
cublasSetMatrix(nO*nO, nO*nO, sizeof(double), cc_space_v_oooo, nO*nO, d_cc_space_v_oooo, nO*nO);
|
|
|
|
double* d_cc_space_v_vooo;
|
|
cudaStat = cudaMalloc((void**)&d_cc_space_v_vooo, nV*nO*nO*nO*sizeof(double));
|
|
assert (cudaStat == cudaSuccess);
|
|
cublasSetMatrix(nV*nO, nO*nO, sizeof(double), cc_space_v_vooo, nV*nO, d_cc_space_v_vooo, nV*nO);
|
|
|
|
double* d_cc_space_v_voov;
|
|
cudaStat = cudaMalloc((void**)&d_cc_space_v_voov, nV*nO*nO*nV*sizeof(double));
|
|
assert (cudaStat == cudaSuccess);
|
|
cublasSetMatrix(nV*nO, nO*nV, sizeof(double), cc_space_v_voov, nV*nO, d_cc_space_v_voov, nV*nO);
|
|
|
|
double* d_cc_space_v_oovv;
|
|
cudaStat = cudaMalloc((void**)&d_cc_space_v_oovv, nO*nO*nV*nV*sizeof(double));
|
|
assert (cudaStat == cudaSuccess);
|
|
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), cc_space_v_oovv, nO*nO, d_cc_space_v_oovv, nO*nO);
|
|
|
|
double* d_cc_space_v_vvoo;
|
|
cudaStat = cudaMalloc((void**)&d_cc_space_v_vvoo, nV*nV*nO*nO*sizeof(double));
|
|
assert (cudaStat == cudaSuccess);
|
|
cublasSetMatrix(nV*nV, nO*nO, sizeof(double), cc_space_v_vvoo, nV*nV, d_cc_space_v_vvoo, nV*nV);
|
|
|
|
double* d_cc_space_v_oovo;
|
|
lda = nO*nO;
|
|
cudaStat = cudaMalloc((void **)&d_cc_space_v_oovo, nO*nO*nV*nO * sizeof(double));
|
|
assert (cudaStat == cudaSuccess);
|
|
cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_oovo, lda, d_cc_space_v_oovo, lda);
|
|
|
|
double* d_cc_space_v_ovvo;
|
|
lda = nO*nV;
|
|
cudaStat = cudaMalloc((void **)&d_cc_space_v_ovvo, nO*nV*nV*nO * sizeof(double));
|
|
assert (cudaStat == cudaSuccess);
|
|
cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_ovvo, lda, d_cc_space_v_ovvo, lda);
|
|
|
|
double* d_cc_space_v_ovov;
|
|
lda = nO*nV;
|
|
cudaStat = cudaMalloc((void **)&d_cc_space_v_ovov, nO*nV*nV*nO * sizeof(double));
|
|
assert (cudaStat == cudaSuccess);
|
|
cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_ovov, lda, d_cc_space_v_ovov, lda);
|
|
|
|
double* d_cc_space_v_ovoo;
|
|
lda = nO*nV;
|
|
cudaStat = cudaMalloc((void **)&d_cc_space_v_ovoo, nO*nV*nO*nO * sizeof(double));
|
|
assert (cudaStat == cudaSuccess);
|
|
cublasSetMatrix(lda, nO*nO, sizeof(double), cc_space_v_ovoo, lda, d_cc_space_v_ovoo, lda);
|
|
|
|
double* d_cc_space_f_oo;
|
|
cudaStat = cudaMalloc((void**)&d_cc_space_f_oo, nO*nO*sizeof(double));
|
|
assert (cudaStat == cudaSuccess);
|
|
cublasSetMatrix(nO, nO, sizeof(double), cc_space_f_oo, nO, d_cc_space_f_oo, nO);
|
|
|
|
double* d_cc_space_f_vo;
|
|
cudaStat = cudaMalloc((void**)&d_cc_space_f_vo, nV*nO*sizeof(double));
|
|
assert (cudaStat == cudaSuccess);
|
|
cublasSetMatrix(nV, nO, sizeof(double), cc_space_f_vo, nV, d_cc_space_f_vo, nV);
|
|
|
|
double* d_cc_space_f_ov;
|
|
cudaStat = cudaMalloc((void**)&d_cc_space_f_ov, nV*nO*sizeof(double));
|
|
assert (cudaStat == cudaSuccess);
|
|
cublasSetMatrix(nO, nV, sizeof(double), cc_space_f_ov, nO, d_cc_space_f_ov, nO);
|
|
|
|
double* d_cc_space_f_vv;
|
|
cudaStat = cudaMalloc((void**)&d_cc_space_f_vv, nV*nV*sizeof(double));
|
|
assert (cudaStat == cudaSuccess);
|
|
cublasSetMatrix(nV, nV, sizeof(double), cc_space_f_vv, nV, d_cc_space_f_vv, nV);
|
|
|
|
double* d_tau;
|
|
lda = nO * nO;
|
|
cudaStat = cudaMalloc((void **)&d_tau, lda * nV * nV * sizeof(double));
|
|
assert (cudaStat == cudaSuccess);
|
|
|
|
double* d_tau_x;
|
|
lda = nO * nO;
|
|
cudaStat = cudaMalloc((void **)&d_tau_x, lda * nV * nV * sizeof(double));
|
|
assert (cudaStat == cudaSuccess);
|
|
|
|
double* d_t1;
|
|
cudaStat = cudaMalloc((void **)&d_t1, nO * nV * sizeof(double));
|
|
assert (cudaStat == cudaSuccess);
|
|
|
|
double* d_t2;
|
|
cudaStat = cudaMalloc((void **)&d_t2, nO*nO*nV*nV * sizeof(double));
|
|
assert (cudaStat == cudaSuccess);
|
|
|
|
double* d_H_oo;
|
|
cudaStat = cudaMalloc((void **)&d_H_oo, nO * nO * sizeof(double));
|
|
assert (cudaStat == cudaSuccess);
|
|
|
|
double* d_H_vo;
|
|
cudaStat = cudaMalloc((void **)&d_H_vo, nV * nO * sizeof(double));
|
|
assert (cudaStat == cudaSuccess);
|
|
|
|
double* d_H_vv;
|
|
cudaStat = cudaMalloc((void **)&d_H_vv, nV * nV * sizeof(double));
|
|
assert (cudaStat == cudaSuccess);
|
|
|
|
data[igpu].cc_space_v_oo_chol = d_cc_space_v_oo_chol;
|
|
data[igpu].cc_space_v_ov_chol = d_cc_space_v_ov_chol;
|
|
data[igpu].cc_space_v_vo_chol = d_cc_space_v_vo_chol;
|
|
data[igpu].cc_space_v_vv_chol = d_cc_space_v_vv_chol;
|
|
data[igpu].cc_space_v_oooo = d_cc_space_v_oooo;
|
|
data[igpu].cc_space_v_vooo = d_cc_space_v_vooo;
|
|
data[igpu].cc_space_v_voov = d_cc_space_v_voov;
|
|
data[igpu].cc_space_v_oovv = d_cc_space_v_oovv;
|
|
data[igpu].cc_space_v_vvoo = d_cc_space_v_vvoo;
|
|
data[igpu].cc_space_v_oovo = d_cc_space_v_oovo;
|
|
data[igpu].cc_space_v_ovvo = d_cc_space_v_ovvo;
|
|
data[igpu].cc_space_v_ovov = d_cc_space_v_ovov;
|
|
data[igpu].cc_space_v_ovoo = d_cc_space_v_ovoo;
|
|
data[igpu].cc_space_f_oo = d_cc_space_f_oo;
|
|
data[igpu].cc_space_f_ov = d_cc_space_f_ov;
|
|
data[igpu].cc_space_f_vo = d_cc_space_f_vo;
|
|
data[igpu].cc_space_f_vv = d_cc_space_f_vv;
|
|
data[igpu].tau = d_tau;
|
|
data[igpu].tau_x = d_tau_x;
|
|
data[igpu].t1 = d_t1;
|
|
data[igpu].t2 = d_t2;
|
|
data[igpu].H_oo = d_H_oo;
|
|
data[igpu].H_vo = d_H_vo;
|
|
data[igpu].H_vv = d_H_vv;
|
|
|
|
data[igpu].nO = nO;
|
|
data[igpu].nV = nV;
|
|
data[igpu].cholesky_mo_num = cholesky_mo_num;
|
|
|
|
}
|
|
return data;
|
|
}
|
|
|
|
|
|
void gpu_deinit(gpu_data* data)
|
|
{
|
|
int ngpus = 1;
|
|
if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
|
|
|
|
#pragma omp parallel num_threads(ngpus)
|
|
{
|
|
size_t lda;
|
|
int igpu = omp_get_thread_num();
|
|
cudaSetDevice(igpu);
|
|
|
|
free(data[igpu].cc_space_v_oo_chol);
|
|
free(data[igpu].cc_space_v_ov_chol);
|
|
free(data[igpu].cc_space_v_vo_chol);
|
|
free(data[igpu].cc_space_v_vv_chol);
|
|
free(data[igpu].cc_space_v_oooo);
|
|
free(data[igpu].cc_space_v_vooo);
|
|
free(data[igpu].cc_space_v_voov);
|
|
free(data[igpu].cc_space_v_oovv);
|
|
free(data[igpu].cc_space_v_vvoo);
|
|
free(data[igpu].cc_space_v_oovo);
|
|
free(data[igpu].cc_space_v_ovvo);
|
|
free(data[igpu].cc_space_v_ovov);
|
|
free(data[igpu].cc_space_v_ovoo);
|
|
free(data[igpu].cc_space_f_oo);
|
|
free(data[igpu].cc_space_f_ov);
|
|
free(data[igpu].cc_space_f_vo);
|
|
free(data[igpu].cc_space_f_vv);
|
|
free(data[igpu].tau);
|
|
free(data[igpu].tau_x);
|
|
free(data[igpu].t1);
|
|
free(data[igpu].t2);
|
|
free(data[igpu].H_oo);
|
|
free(data[igpu].H_vo);
|
|
free(data[igpu].H_vv);
|
|
}
|
|
}
|
|
|