1
0
mirror of https://gitlab.com/scemama/qp_plugins_scemama.git synced 2024-11-07 06:33:40 +01:00
qp_plugins_scemama/devel/ccsd_gpu/gpu_init_sp.c
2023-08-21 13:17:39 +02:00

345 lines
14 KiB
C

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <omp.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <assert.h>
#include "gpu.h"
gpu_data_sp* gpu_init_sp(
int nO, int nV, int cholesky_mo_num,
double* cc_space_v_oo_chol_, double* cc_space_v_ov_chol_,
double* cc_space_v_vo_chol_, double* cc_space_v_vv_chol_,
double* cc_space_v_oooo_, double* cc_space_v_vooo_, double* cc_space_v_voov_,
double* cc_space_v_oovv_, double* cc_space_v_vvoo_,
double* cc_space_v_oovo_, double* cc_space_v_ovvo_,
double* cc_space_v_ovov_, double* cc_space_v_ovoo_,
double* cc_space_f_oo_, double* cc_space_f_ov_,
double* cc_space_f_vo_, double* cc_space_f_vv_)
{
int ngpus = 1;
if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
gpu_data_sp* data = (gpu_data_sp*) malloc ((size_t) ngpus*sizeof(gpu_data_sp));
assert (data != NULL);
#pragma omp parallel num_threads(ngpus)
{
cudaError_t cudaStat = cudaSuccess;
size_t lda;
int igpu = omp_get_thread_num();
cudaSetDevice(igpu);
cublasHandle_t handle;
cublasCreate(&handle);
/*
size_t limit;
size_t mtotal, mfree;
cudaStat = cudaMemGetInfo(&mfree, &mtotal);
fprintf(stderr, "%s\nTotal:%lu\nFree :%lu\n", cudaGetErrorString(cudaStat), mtotal, mfree);
*/
float* d_cc_space_v_oo_chol = NULL;
lda = cholesky_mo_num * nO;
cudaStat = cudaMalloc((void **)&d_cc_space_v_oo_chol, lda * nO * sizeof(float));
assert (cudaStat == cudaSuccess);
float* cc_space_v_oo_chol = malloc((size_t) lda * nO * sizeof(float));
assert (cc_space_v_oo_chol != NULL);
for (size_t i=0 ; i<(size_t) lda * nO ; ++i) {
cc_space_v_oo_chol[i] = cc_space_v_oo_chol_[i];
}
cublasSetMatrix(cholesky_mo_num*nO, nO, sizeof(float), cc_space_v_oo_chol, lda, d_cc_space_v_oo_chol, lda);
free(cc_space_v_oo_chol);
float* d_cc_space_v_ov_chol = NULL;
lda = cholesky_mo_num * nO;
cudaStat = cudaMalloc((void **)&d_cc_space_v_ov_chol, lda * nV * sizeof(float));
assert (cudaStat == cudaSuccess);
float* cc_space_v_ov_chol = malloc((size_t) lda * nV * sizeof(float));
assert (cc_space_v_ov_chol != NULL);
for (size_t i=0 ; i<(size_t) lda * nV ; ++i) {
cc_space_v_ov_chol[i] = cc_space_v_ov_chol_[i];
}
cublasSetMatrix(cholesky_mo_num*nO, nV, sizeof(float), cc_space_v_ov_chol, lda, d_cc_space_v_ov_chol, lda);
free(cc_space_v_ov_chol);
float* d_cc_space_v_vo_chol = NULL;
lda = cholesky_mo_num * nV;
cudaStat = cudaMalloc((void **)&d_cc_space_v_vo_chol, lda * nO * sizeof(float));
assert (cudaStat == cudaSuccess);
float* cc_space_v_vo_chol = malloc((size_t) lda * nO * sizeof(float));
assert (cc_space_v_vo_chol != NULL);
for (size_t i=0 ; i<(size_t) lda * nO ; ++i) {
cc_space_v_vo_chol[i] = cc_space_v_vo_chol_[i];
}
cublasSetMatrix(cholesky_mo_num*nV, nO, sizeof(float), cc_space_v_vo_chol, lda, d_cc_space_v_vo_chol, lda);
free(cc_space_v_vo_chol);
float* d_cc_space_v_vv_chol = NULL;
lda = cholesky_mo_num * nV;
cudaStat = cudaMalloc((void **)&d_cc_space_v_vv_chol, lda * nV * sizeof(float));
assert (cudaStat == cudaSuccess);
float* cc_space_v_vv_chol = malloc((size_t) lda * nV * sizeof(float));
assert (cc_space_v_vv_chol != NULL);
for (size_t i=0 ; i< (size_t) lda * nV ; ++i) {
cc_space_v_vv_chol[i] = cc_space_v_vv_chol_[i];
}
cublasSetMatrix(cholesky_mo_num*nV, nV, sizeof(float), cc_space_v_vv_chol, lda, d_cc_space_v_vv_chol, lda);
free(cc_space_v_vv_chol);
float* d_cc_space_v_oooo = NULL;
cudaStat = cudaMalloc((void**)&d_cc_space_v_oooo, nO*nO*nO*nO*sizeof(float));
assert (cudaStat == cudaSuccess);
float* cc_space_v_oooo = malloc((size_t) nO*nO*nO*nO*sizeof(float));
assert (cc_space_v_oooo != NULL);
for (size_t i=0 ; i<(size_t) nO*nO*nO*nO ; ++i) {
cc_space_v_oooo[i] = cc_space_v_oooo_[i];
}
cublasSetMatrix(nO*nO, nO*nO, sizeof(float), cc_space_v_oooo, nO*nO, d_cc_space_v_oooo, nO*nO);
free(cc_space_v_oooo);
float* d_cc_space_v_vooo = NULL;
cudaStat = cudaMalloc((void**)&d_cc_space_v_vooo, nV*nO*nO*nO*sizeof(float));
assert (cudaStat == cudaSuccess);
float* cc_space_v_vooo = malloc((size_t) nV*nO*nO*nO*sizeof(float));
assert (cc_space_v_vooo != NULL);
for (size_t i=0 ; i<(size_t) nV*nO*nO*nO; ++i) {
cc_space_v_vooo[i] = cc_space_v_vooo_[i];
}
cublasSetMatrix(nV*nO, nO*nO, sizeof(float), cc_space_v_vooo, nV*nO, d_cc_space_v_vooo, nV*nO);
free(cc_space_v_vooo);
float* d_cc_space_v_voov = NULL;
cudaStat = cudaMalloc((void**)&d_cc_space_v_voov, nV*nO*nO*nV*sizeof(float));
assert (cudaStat == cudaSuccess);
float* cc_space_v_voov = malloc((size_t) nV*nO*nO*nV*sizeof(float));
assert (cc_space_v_voov != NULL);
for (size_t i=0 ; i<(size_t) nV*nO*nO*nV; ++i) {
cc_space_v_voov[i] = cc_space_v_voov_[i];
}
cublasSetMatrix(nV*nO, nO*nV, sizeof(float), cc_space_v_voov, nV*nO, d_cc_space_v_voov, nV*nO);
free(cc_space_v_voov);
float* d_cc_space_v_oovv = NULL;
cudaStat = cudaMalloc((void**)&d_cc_space_v_oovv, nO*nO*nV*nV*sizeof(float));
assert (cudaStat == cudaSuccess);
float* cc_space_v_oovv = malloc((size_t) nO*nO*nV*nV*sizeof(float));
assert (cc_space_v_oovv != NULL);
for (size_t i=0 ; i<(size_t) nO*nO*nV*nV; ++i) {
cc_space_v_oovv[i] = cc_space_v_oovv_[i];
}
cublasSetMatrix(nO*nO, nV*nV, sizeof(float), cc_space_v_oovv, nO*nO, d_cc_space_v_oovv, nO*nO);
free(cc_space_v_oovv);
float* d_cc_space_v_vvoo = NULL;
cudaStat = cudaMalloc((void**)&d_cc_space_v_vvoo, nV*nV*nO*nO*sizeof(float));
assert (cudaStat == cudaSuccess);
float* cc_space_v_vvoo = malloc((size_t) nV*nV*nO*nO*sizeof(float));
assert (cc_space_v_vvoo != NULL);
for (size_t i=0 ; i<(size_t) nV*nV*nO*nO; ++i) {
cc_space_v_vvoo[i] = cc_space_v_vvoo_[i];
}
cublasSetMatrix(nV*nV, nO*nO, sizeof(float), cc_space_v_vvoo, nV*nV, d_cc_space_v_vvoo, nV*nV);
free(cc_space_v_vvoo);
float* d_cc_space_v_oovo = NULL;
lda = nO*nO;
cudaStat = cudaMalloc((void **)&d_cc_space_v_oovo, nO*nO*nV*nO * sizeof(float));
assert (cudaStat == cudaSuccess);
float* cc_space_v_oovo = malloc((size_t) nO*nO*nV*nO * sizeof(float));
assert (cc_space_v_oovo != NULL);
for (size_t i=0 ; i<(size_t) nO*nO*nV*nO ; ++i) {
cc_space_v_oovo[i] = cc_space_v_oovo_[i];
}
cublasSetMatrix(lda, nV*nO, sizeof(float), cc_space_v_oovo, lda, d_cc_space_v_oovo, lda);
free(cc_space_v_oovo);
float* d_cc_space_v_ovvo = NULL;
lda = nO*nV;
cudaStat = cudaMalloc((void **)&d_cc_space_v_ovvo, nO*nV*nV*nO * sizeof(float));
assert (cudaStat == cudaSuccess);
float* cc_space_v_ovvo = malloc((size_t) nO*nV*nV*nO * sizeof(float));
assert (cc_space_v_ovvo != NULL);
for (size_t i=0 ; i<(size_t) nO*nV*nV*nO ; ++i) {
cc_space_v_ovvo[i] = cc_space_v_ovvo_[i];
}
cublasSetMatrix(lda, nV*nO, sizeof(float), cc_space_v_ovvo, lda, d_cc_space_v_ovvo, lda);
free(cc_space_v_ovvo);
float* d_cc_space_v_ovov = NULL;
lda = nO*nV;
cudaStat = cudaMalloc((void **)&d_cc_space_v_ovov, nO*nV*nV*nO * sizeof(float));
assert (cudaStat == cudaSuccess);
float* cc_space_v_ovov = malloc((size_t) nO*nV*nV*nO * sizeof(float));
assert (cc_space_v_ovov != NULL);
for (size_t i=0 ; i<(size_t) nO*nV*nV*nO ; ++i) {
cc_space_v_ovov[i] = cc_space_v_ovov_[i];
}
cublasSetMatrix(lda, nV*nO, sizeof(float), cc_space_v_ovov, lda, d_cc_space_v_ovov, lda);
free(cc_space_v_ovov);
float* d_cc_space_v_ovoo = NULL;
lda = nO*nV;
cudaStat = cudaMalloc((void **)&d_cc_space_v_ovoo, nO*nV*nO*nO * sizeof(float));
assert (cudaStat == cudaSuccess);
float* cc_space_v_ovoo = malloc((size_t) nO*nV*nO*nO * sizeof(float));
assert (cc_space_v_ovoo != NULL);
for (size_t i=0 ; i<(size_t) nO*nV*nO*nO ; ++i) {
cc_space_v_ovoo[i] = cc_space_v_ovoo_[i];
}
cublasSetMatrix(lda, nO*nO, sizeof(float), cc_space_v_ovoo, lda, d_cc_space_v_ovoo, lda);
free(cc_space_v_ovoo);
float* d_cc_space_f_oo = NULL;
cudaStat = cudaMalloc((void**)&d_cc_space_f_oo, nO*nO*sizeof(float));
assert (cudaStat == cudaSuccess);
float* cc_space_f_oo = malloc((size_t) nO*nO*sizeof(float));
assert (cc_space_f_oo != NULL);
for (size_t i=0 ; i<(size_t) nO*nO; ++i) {
cc_space_f_oo[i] = cc_space_f_oo_[i];
}
cublasSetMatrix(nO, nO, sizeof(float), cc_space_f_oo, nO, d_cc_space_f_oo, nO);
free(cc_space_f_oo);
float* d_cc_space_f_vo = NULL;
cudaStat = cudaMalloc((void**)&d_cc_space_f_vo, nV*nO*sizeof(float));
assert (cudaStat == cudaSuccess);
float* cc_space_f_vo = malloc((size_t) nV*nO*sizeof(float));
assert (cc_space_f_vo != NULL);
for (size_t i=0 ; i<(size_t) nV*nO; ++i) {
cc_space_f_vo[i] = cc_space_f_vo_[i];
}
cublasSetMatrix(nV, nO, sizeof(float), cc_space_f_vo, nV, d_cc_space_f_vo, nV);
free(cc_space_f_vo);
float* d_cc_space_f_ov = NULL;
cudaStat = cudaMalloc((void**)&d_cc_space_f_ov, nV*nO*sizeof(float));
assert (cudaStat == cudaSuccess);
float* cc_space_f_ov = malloc((size_t) nV*nO*sizeof(float));
assert (cc_space_f_ov != NULL);
for (size_t i=0 ; i<(size_t) nV*nO; ++i) {
cc_space_f_ov[i] = cc_space_f_ov_[i];
}
cublasSetMatrix(nO, nV, sizeof(float), cc_space_f_ov, nO, d_cc_space_f_ov, nO);
free(cc_space_f_ov);
float* d_cc_space_f_vv = NULL;
cudaStat = cudaMalloc((void**)&d_cc_space_f_vv, nV*nV*sizeof(float));
assert (cudaStat == cudaSuccess);
float* cc_space_f_vv = malloc((size_t) nV*nV*sizeof(float));
assert (cc_space_f_vv != NULL);
for (size_t i=0 ; i<(size_t) nV*nV; ++i) {
cc_space_f_vv[i] = cc_space_f_vv_[i];
}
cublasSetMatrix(nV, nV, sizeof(float), cc_space_f_vv, nV, d_cc_space_f_vv, nV);
free(cc_space_f_vv);
float* d_tau = NULL;
lda = nO * nO;
cudaStat = cudaMalloc((void **)&d_tau, lda * nV * nV * sizeof(float));
assert (cudaStat == cudaSuccess);
float* d_tau_x = NULL;
lda = nO * nO;
cudaStat = cudaMalloc((void **)&d_tau_x, lda * nV * nV * sizeof(float));
assert (cudaStat == cudaSuccess);
float* d_t1 = NULL;
cudaStat = cudaMalloc((void **)&d_t1, nO * nV * sizeof(float));
assert (cudaStat == cudaSuccess);
float* d_t2 = NULL;
cudaStat = cudaMalloc((void **)&d_t2, nO*nO*nV*nV * sizeof(float));
assert (cudaStat == cudaSuccess);
float* d_H_oo = NULL;
cudaStat = cudaMalloc((void **)&d_H_oo, nO * nO * sizeof(float));
assert (cudaStat == cudaSuccess);
float* d_H_vo = NULL;
cudaStat = cudaMalloc((void **)&d_H_vo, nV * nO * sizeof(float));
assert (cudaStat == cudaSuccess);
float* d_H_vv = NULL;
cudaStat = cudaMalloc((void **)&d_H_vv, nV * nV * sizeof(float));
assert (cudaStat == cudaSuccess);
data[igpu].cc_space_v_oo_chol = d_cc_space_v_oo_chol;
data[igpu].cc_space_v_ov_chol = d_cc_space_v_ov_chol;
data[igpu].cc_space_v_vo_chol = d_cc_space_v_vo_chol;
data[igpu].cc_space_v_vv_chol = d_cc_space_v_vv_chol;
data[igpu].cc_space_v_oooo = d_cc_space_v_oooo;
data[igpu].cc_space_v_vooo = d_cc_space_v_vooo;
data[igpu].cc_space_v_voov = d_cc_space_v_voov;
data[igpu].cc_space_v_oovv = d_cc_space_v_oovv;
data[igpu].cc_space_v_vvoo = d_cc_space_v_vvoo;
data[igpu].cc_space_v_oovo = d_cc_space_v_oovo;
data[igpu].cc_space_v_ovvo = d_cc_space_v_ovvo;
data[igpu].cc_space_v_ovov = d_cc_space_v_ovov;
data[igpu].cc_space_v_ovoo = d_cc_space_v_ovoo;
data[igpu].cc_space_f_oo = d_cc_space_f_oo;
data[igpu].cc_space_f_ov = d_cc_space_f_ov;
data[igpu].cc_space_f_vo = d_cc_space_f_vo;
data[igpu].cc_space_f_vv = d_cc_space_f_vv;
data[igpu].tau = d_tau;
data[igpu].tau_x = d_tau_x;
data[igpu].t1 = d_t1;
data[igpu].t2 = d_t2;
data[igpu].H_oo = d_H_oo;
data[igpu].H_vo = d_H_vo;
data[igpu].H_vv = d_H_vv;
data[igpu].nO = nO;
data[igpu].nV = nV;
data[igpu].cholesky_mo_num = cholesky_mo_num;
}
return data;
}
/* Release every device buffer referenced by the per-GPU entries of `data`.
 *
 * The GPU count is determined as in gpu_init_sp (1 unless MULTIGPU == 1), and
 * each OpenMP thread selects the device matching its thread number before
 * freeing that entry's buffers.  The buffers were obtained with cudaMalloc,
 * so they must be released with cudaFree; calling the host free() on them is
 * undefined behaviour and leaves the device memory allocated.
 *
 * A NULL `data` is accepted and ignored.  The `data` array itself is not
 * freed here (the original code did not free it either).
 * NOTE(review): confirm whether the caller is expected to free(data).
 */
void gpu_deinit_sp(gpu_data_sp* data)
{
  if (data == NULL) return;

  int ngpus = 1;
  if (MULTIGPU == 1) {
    if (cudaGetDeviceCount(&ngpus) != cudaSuccess || ngpus < 1) {
      fprintf(stderr, "gpu_deinit_sp: cudaGetDeviceCount failed\n");
      return;
    }
  }

  #pragma omp parallel num_threads(ngpus)
  {
    const int igpu = omp_get_thread_num();
    cudaError_t st = cudaSetDevice(igpu);
    if (st != cudaSuccess) {
      fprintf(stderr, "gpu_deinit_sp: cudaSetDevice(%d) failed: %s\n",
              igpu, cudaGetErrorString(st));
    }

    /* Every device pointer stored in this entry by gpu_init_sp. */
    void* const buffers[] = {
      data[igpu].cc_space_v_oo_chol,
      data[igpu].cc_space_v_ov_chol,
      data[igpu].cc_space_v_vo_chol,
      data[igpu].cc_space_v_vv_chol,
      data[igpu].cc_space_v_oooo,
      data[igpu].cc_space_v_vooo,
      data[igpu].cc_space_v_voov,
      data[igpu].cc_space_v_oovv,
      data[igpu].cc_space_v_vvoo,
      data[igpu].cc_space_v_oovo,
      data[igpu].cc_space_v_ovvo,
      data[igpu].cc_space_v_ovov,
      data[igpu].cc_space_v_ovoo,
      data[igpu].cc_space_f_oo,
      data[igpu].cc_space_f_ov,
      data[igpu].cc_space_f_vo,
      data[igpu].cc_space_f_vv,
      data[igpu].tau,
      data[igpu].tau_x,
      data[igpu].t1,
      data[igpu].t2,
      data[igpu].H_oo,
      data[igpu].H_vo,
      data[igpu].H_vv
    };
    const size_t nbuffers = sizeof(buffers) / sizeof(buffers[0]);

    for (size_t k = 0; k < nbuffers; ++k) {
      st = cudaFree(buffers[k]);
      if (st != cudaSuccess) {
        /* Keep going: one failed release should not leak the others. */
        fprintf(stderr, "gpu_deinit_sp: cudaFree failed on GPU %d: %s\n",
                igpu, cudaGetErrorString(st));
      }
    }
  }
}