1
0
mirror of https://gitlab.com/scemama/qp_plugins_scemama.git synced 2024-11-07 06:33:40 +01:00

Managed memory

This commit is contained in:
Anthony Scemama 2023-12-05 17:45:25 +01:00
parent 36fda1d1bb
commit 0c5b6834b1
3 changed files with 89 additions and 89 deletions

View File

@ -138,15 +138,15 @@ void compute_h_oo_chol_gpu(gpu_data* data, int igpu)
double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;
double* d_tau_kau;
cudaStat = cudaMalloc((void **)&d_tau_kau, cholesky_mo_num*nV*nO * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_tau_kau, cholesky_mo_num*nV*nO * sizeof(double), 1);
assert(cudaStat == cudaSuccess);
double* d_tmp_ovv;
cudaStat = cudaMalloc((void **)&d_tmp_ovv, nO*nV*nV * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_tmp_ovv, nO*nV*nV * sizeof(double), 1);
assert(cudaStat == cudaSuccess);
double* d_tmp_vov;
cudaStat = cudaMalloc((void **)&d_tmp_vov, nV*nO*nV * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_tmp_vov, nV*nO*nV * sizeof(double), 1);
assert(cudaStat == cudaSuccess);
for (size_t i=0 ; i<nV ; ++i) {
@ -239,7 +239,7 @@ void compute_h_vo_chol_gpu(gpu_data* data, int igpu)
cublasDcopy(handle, nV*nO, d_cc_space_f_vo, 1, d_H_vo, 1);
double* d_tmp_k;
cudaStat = cudaMalloc((void **)&d_tmp_k, cholesky_mo_num * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_tmp_k, cholesky_mo_num * sizeof(double), 1);
assert(cudaStat == cudaSuccess);
alpha = 2.0;
@ -261,7 +261,7 @@ void compute_h_vo_chol_gpu(gpu_data* data, int igpu)
cudaFree(d_tmp_k);
double* d_tmp;
cudaStat = cudaMalloc((void **)&d_tmp, cholesky_mo_num*nO*nO * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_tmp, cholesky_mo_num*nO*nO * sizeof(double), 1);
assert(cudaStat == cudaSuccess);
alpha = 1.0;
@ -273,7 +273,7 @@ void compute_h_vo_chol_gpu(gpu_data* data, int igpu)
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
double* d_tmp2;
cudaStat = cudaMalloc((void **)&d_tmp2, cholesky_mo_num*nO*nO * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_tmp2, cholesky_mo_num*nO*nO * sizeof(double), 1);
assert(cudaStat == cudaSuccess);
for (size_t i=0 ; i<nV ; ++i) {
@ -344,11 +344,11 @@ void compute_h_vv_chol_gpu(gpu_data* data, int igpu)
double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;
double* d_tau_kia;
cudaStat = cudaMalloc((void **)&d_tau_kia, cholesky_mo_num*nO*nV * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_tau_kia, cholesky_mo_num*nO*nV * sizeof(double), 1);
assert(cudaStat == cudaSuccess);
double* d_tmp_oov;
cudaStat = cudaMalloc((void **)&d_tmp_oov, nO*nO*nV * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_tmp_oov, nO*nO*nV * sizeof(double), 1);
assert(cudaStat == cudaSuccess);
alpha = 1.0;
@ -433,7 +433,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
double* d_r2;
lda = nO * nO;
cudaStat = cudaMalloc((void **)&d_r2, lda * nV * nV * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_r2, lda * nV * nV * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
cudaMemset(d_r2, 0, nO*nO*nV*nV*sizeof(double));
memset(r2, 0, nO*nO*nV*nV*sizeof(double));
@ -458,7 +458,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
double* d_H_vv = data[igpu].H_vv;
double* d_K1;
cudaStat = cudaMalloc((void **)&d_K1, nO*nV*nO*nV * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_K1, nO*nV*nO*nV * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
#pragma omp sections
@ -467,7 +467,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
#pragma omp section
{
double* d_J1;
cudaStat = cudaMalloc((void **)&d_J1, nO*nV*nV*nO * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_J1, nO*nV*nV*nO * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
alpha = 1.0;
@ -479,7 +479,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
double* d_X_ovoo;
cudaStat = cudaMalloc((void **)&d_X_ovoo, nO*nV*nO*nO * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_X_ovoo, nO*nV*nO*nO * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
alpha = 0.0;
beta = 1.0;
@ -502,7 +502,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
double* d_Y_ovov;
cudaStat = cudaMalloc((void **)&d_Y_ovov, nO*nV*nO*nV * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_Y_ovov, nO*nV*nO*nV * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
alpha = 1.0;
@ -534,7 +534,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
cublasSetStream(handle, NULL);
double* d_tmp_cc;
cudaStat = cudaMalloc((void **)&d_tmp_cc, cholesky_mo_num*nV*nO * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_tmp_cc, cholesky_mo_num*nV*nO * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
alpha = 1.0;
@ -546,7 +546,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
double* d_J1_tmp;
cudaStat = cudaMalloc((void **)&d_J1_tmp, nV*nO*nV*nO * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_J1_tmp, nV*nO*nV*nO * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
alpha = 1.0;
@ -578,7 +578,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
cudaFree(d_J1_tmp);
double* d_X_voov;
cudaStat = cudaMalloc((void **)&d_X_voov, nV*nO*nO*nV * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_X_voov, nV*nO*nO*nV * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
for (size_t i=0 ; i<nV ; ++i) {
@ -612,7 +612,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
cublasSetStream(handle, NULL);
double* d_Z_ovvo;
cudaStat = cudaMalloc((void **)&d_Z_ovvo, nO*nV*nV*nO * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_Z_ovvo, nO*nV*nV*nO * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
alpha = -1.0;
@ -641,7 +641,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
}
double* d_Y_vovo;
cudaStat = cudaMalloc((void **)&d_Y_vovo, nV*nO*nV*nO * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_Y_vovo, nV*nO*nV*nO * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
alpha = 1.0;
@ -657,7 +657,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
}
double* d_X_ovvo;
cudaStat = cudaMalloc((void **)&d_X_ovvo, nO*nV*nV*nO * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_X_ovvo, nO*nV*nV*nO * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
alpha = 1.0;
@ -732,11 +732,11 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
double* d_X;
cudaStat = cudaMalloc((void **)&d_X, nV*nO*nV*nO * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_X, nV*nO*nV*nO * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
double* d_Y;
cudaStat = cudaMalloc((void **)&d_Y, nO*nV*nV*nO * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_Y, nO*nV*nV*nO * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
for (size_t i=0 ; i<nV ; ++i) {
@ -771,7 +771,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
cublasSetStream(handle, NULL);
double* d_Z;
cudaStat = cudaMalloc((void **)&d_Z, nO*nV*nV*nO * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_Z, nO*nV*nV*nO * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
alpha = 1.0;
@ -787,7 +787,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
double* d_t1v;
cudaStat = cudaMalloc((void **)&d_t1v, cholesky_mo_num*nO*nO * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_t1v, cholesky_mo_num*nO*nO * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
alpha = 1.0;
@ -799,7 +799,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
double* d_K1tmp;
cudaStat = cudaMalloc((void **)&d_K1tmp, nO*nO*nV*nV * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_K1tmp, nO*nO*nV*nV * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
alpha = 1.0;
@ -852,7 +852,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
#pragma omp section
{
double* d_Y_oooo;
cudaStat = cudaMalloc((void**)&d_Y_oooo, nO*nO*nO*nO*sizeof(double));
cudaStat = cudaMallocManaged((void**)&d_Y_oooo, nO*nO*nO*nO*sizeof(double), 1);
assert (cudaStat == cudaSuccess);
alpha = 1.0;
@ -864,7 +864,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
double* d_A1;
cudaStat = cudaMalloc((void**)&d_A1, nO*nO*nO*nO*sizeof(double));
cudaStat = cudaMallocManaged((void**)&d_A1, nO*nO*nO*nO*sizeof(double), 1);
assert (cudaStat == cudaSuccess);
alpha = 1.0;
@ -916,7 +916,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
#pragma omp section
{
double* d_g_vir;
cudaStat = cudaMalloc((void**)&d_g_vir, nV*nV*sizeof(double));
cudaStat = cudaMallocManaged((void**)&d_g_vir, nV*nV*sizeof(double), 1);
assert (cudaStat == cudaSuccess);
cublasDcopy(handle, nV*nV, d_H_vv, 1, d_g_vir, 1);
@ -929,7 +929,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
double* d_tmp_k;
cudaStat = cudaMalloc((void**)&d_tmp_k, cholesky_mo_num*sizeof(double));
cudaStat = cudaMallocManaged((void**)&d_tmp_k, cholesky_mo_num*sizeof(double), 1);
assert (cudaStat == cudaSuccess);
alpha = 1.0;
beta = 0.0;
@ -949,7 +949,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
cudaFree(d_tmp_k);
double* d_tmp_vo;
cudaStat = cudaMalloc((void**)&d_tmp_vo, cholesky_mo_num*nV*nO*sizeof(double));
cudaStat = cudaMallocManaged((void**)&d_tmp_vo, cholesky_mo_num*nV*nO*sizeof(double), 1);
assert (cudaStat == cudaSuccess);
alpha = 1.0;
beta = 0.0;
@ -960,7 +960,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
double* d_tmp_vo2;
cudaStat = cudaMalloc((void**)&d_tmp_vo2, cholesky_mo_num*nV*nO*sizeof(double));
cudaStat = cudaMallocManaged((void**)&d_tmp_vo2, cholesky_mo_num*nV*nO*sizeof(double), 1);
assert (cudaStat == cudaSuccess);
for (size_t i=0 ; i<nO ; ++i) {
cudaStreamCreate(&(stream[i]));
@ -990,7 +990,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
cudaFree(d_tmp_vo2);
double* d_Y_oovv;
cudaStat = cudaMalloc((void**)&d_Y_oovv, nO*nO*nV*nV*sizeof(double));
cudaStat = cudaMallocManaged((void**)&d_Y_oovv, nO*nO*nV*nV*sizeof(double), 1);
assert (cudaStat == cudaSuccess);
alpha = 1.0;
beta = 0.0;
@ -1035,12 +1035,12 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
{
double* d_g_occ;
lda = nO;
cudaStat = cudaMalloc((void **)&d_g_occ, nO*nO * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_g_occ, nO*nO * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
cublasDcopy(handle, nO*nO, d_H_oo, 1, d_g_occ, 1);
double* d_X;
cudaStat = cudaMalloc((void **)&d_X, cholesky_mo_num*sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_X, cholesky_mo_num*sizeof(double), 1);
assert (cudaStat == cudaSuccess);
alpha = 2.0;
@ -1077,7 +1077,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
double* d_X_oovv;
cudaStat = cudaMalloc((void **)&d_X_oovv, nO*nO*nV*nV * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_X_oovv, nO*nO*nV*nV * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
alpha = 1.0;
@ -1130,11 +1130,11 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
#pragma omp section
{
double* d_X_vovv;
cudaStat = cudaMalloc((void **)&d_X_vovv, nV*nO*nV*BLOCK_SIZE * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_X_vovv, nV*nO*nV*BLOCK_SIZE * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
double* d_Y_oovv;
cudaStat = cudaMalloc((void **)&d_Y_oovv, nO*nO*nV*nV * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_Y_oovv, nO*nO*nV*nV * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
for (size_t iblock=0 ; iblock<nV ; iblock += BLOCK_SIZE) {
@ -1197,7 +1197,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
#pragma omp section
{
double* d_tcc2;
cudaStat = cudaMalloc((void **)&d_tcc2, cholesky_mo_num*nV*nO * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_tcc2, cholesky_mo_num*nV*nO * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
alpha = 1.0;
@ -1209,7 +1209,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
double* d_tcc;
cudaStat = cudaMalloc((void **)&d_tcc, cholesky_mo_num*nO*nV * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_tcc, cholesky_mo_num*nO*nV * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
alpha = 1.0;
@ -1221,7 +1221,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
double* d_X_ovvo;
cudaStat = cudaMalloc((void **)&d_X_ovvo, nO*nV*nV*nO * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_X_ovvo, nO*nV*nV*nO * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
alpha = 1.0;
@ -1265,7 +1265,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
#pragma omp section
{
double* d_X_oovv;
cudaStat = cudaMalloc((void **)&d_X_oovv, nO*nO*nV*nV * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_X_oovv, nO*nO*nV*nV * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
alpha = 1.0;
@ -1296,7 +1296,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
}
double* d_X_vovo;
cudaStat = cudaMalloc((void **)&d_X_vovo, nV*nO*nV*nO * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_X_vovo, nV*nO*nV*nO * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
alpha = 0.0;
@ -1316,7 +1316,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
cublasSetStream(handle, NULL);
double* d_Y_oovo;
cudaStat = cudaMalloc((void **)&d_Y_oovo, nO*nO*nV*nO * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_Y_oovo, nO*nO*nV*nO * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
alpha = 1.0;
@ -1382,12 +1382,12 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
{
double* d_J1;
lda = nO*nV;
cudaStat = cudaMalloc((void **)&d_J1, nO*nV*nV*nO * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_J1, nO*nV*nV*nO * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
cublasSetMatrix(lda, nV*nO, sizeof(double), J1, lda, d_J1, lda);
double* d_X_ovvo;
cudaStat = cudaMalloc((void **)&d_X_ovvo, nO*nV*nV*nO * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_X_ovvo, nO*nV*nV*nO * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
for (size_t i=0 ; i<nV ; ++i) {
@ -1411,7 +1411,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
cudaFree(d_J1);
double* d_Y_voov;
cudaStat = cudaMalloc((void **)&d_Y_voov, nV*nO*nO*nV * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_Y_voov, nV*nO*nO*nV * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
for (size_t i=0 ; i<nV ; ++i) {
@ -1434,7 +1434,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
cublasSetStream(handle, NULL);
double* d_Z_ovov;
cudaStat = cudaMalloc((void **)&d_Z_ovov, nO*nV*nO*nV * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_Z_ovov, nO*nV*nO*nV * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
alpha = 1.0;
@ -1479,11 +1479,11 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
#pragma omp section
{
double* d_X_ovov;
cudaStat = cudaMalloc((void **)&d_X_ovov, nO*nV*nO*nV * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_X_ovov, nO*nV*nO*nV * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
double* d_Y_ovov;
cudaStat = cudaMalloc((void **)&d_Y_ovov, nO*nV*nO*nV * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_Y_ovov, nO*nV*nO*nV * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
for (size_t i=0 ; i<nV ; ++i) {
@ -1516,7 +1516,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
cublasSetStream(handle, NULL);
double* d_Z_ovov;
cudaStat = cudaMalloc((void **)&d_Z_ovov, nO*nV*nO*nV * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_Z_ovov, nO*nV*nO*nV * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
alpha = 1.0;
@ -1562,11 +1562,11 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
#pragma omp section
{
double* d_X_ovov;
cudaStat = cudaMalloc((void **)&d_X_ovov, nO*nV*nO*nV * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_X_ovov, nO*nV*nO*nV * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
double* d_Y_ovov;
cudaStat = cudaMalloc((void **)&d_Y_ovov, nO*nV*nO*nV * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_Y_ovov, nO*nV*nO*nV * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
@ -1600,7 +1600,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
cublasSetStream(handle, NULL);
double* d_Z_ovov;
cudaStat = cudaMalloc((void **)&d_Z_ovov, nO*nV*nO*nV * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_Z_ovov, nO*nV*nO*nV * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
alpha = 1.0;
@ -1648,7 +1648,7 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
double* d_tmp_cc;
lda = cholesky_mo_num * nV;
cudaStat = cudaMalloc((void **)&d_tmp_cc, lda * nV * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_tmp_cc, lda * nV * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
alpha=1.0; beta=0.0;
@ -1657,15 +1657,15 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, m, B, k, &beta, C, m);
double* d_tmp_cc2;
cudaStat = cudaMalloc((void **)&d_tmp_cc2, cholesky_mo_num*nV*sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_tmp_cc2, cholesky_mo_num*nV*sizeof(double), 1);
assert (cudaStat == cudaSuccess);
double* d_B1;
cudaStat = cudaMalloc((void**)&d_B1, nV*nV*BLOCK_SIZE*sizeof(double));
cudaStat = cudaMallocManaged((void**)&d_B1, nV*nV*BLOCK_SIZE*sizeof(double), 1);
assert (cudaStat == cudaSuccess);
double* d_tmpB1;
cudaStat = cudaMalloc((void**)&d_tmpB1, nV*BLOCK_SIZE*nV*sizeof(double));
cudaStat = cudaMallocManaged((void**)&d_tmpB1, nV*BLOCK_SIZE*nV*sizeof(double), 1);
assert (cudaStat == cudaSuccess);
#pragma omp for
@ -1781,7 +1781,7 @@ void compute_r1_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
double* d_r1;
lda = nO ;
cudaStat = cudaMalloc((void **)&d_r1, lda * nV * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_r1, lda * nV * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
cudaMemset(d_r1, 0, nO*nV*sizeof(double));
memset(r1, 0, nO*nV*sizeof(double));
@ -1808,7 +1808,7 @@ void compute_r1_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
cublasDcopy(handle, nO*nV, d_cc_space_f_ov, 1, d_r1, 1);
double* d_X_oo;
cudaStat = cudaMalloc((void **)&d_X_oo, nO*nO * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_X_oo, nO*nO * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
alpha = -2.0;
@ -1855,7 +1855,7 @@ void compute_r1_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
#pragma omp section
{
double* d_X_voov;
cudaStat = cudaMalloc((void **)&d_X_voov, nV* nO* nO* nV * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_X_voov, nV* nO* nO* nV * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
for (size_t i=0 ; i<nV ; ++i) {
@ -1901,7 +1901,7 @@ void compute_r1_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
#pragma omp section
{
double* d_X_ovov;
cudaStat = cudaMalloc((void **)&d_X_ovov, nO* nV* nO* nV * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_X_ovov, nO* nV* nO* nV * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
cublasDcopy(handle, nO*nV*nO*nV, d_cc_space_v_ovov, 1, d_X_ovov, 1);
@ -1939,7 +1939,7 @@ void compute_r1_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
#pragma omp section
{
double* d_T_vvoo;
cudaStat = cudaMalloc((void **)&d_T_vvoo, nV*nV*nO*nO * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_T_vvoo, nV*nV*nO*nO * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
alpha = 0.0;
@ -1950,11 +1950,11 @@ void compute_r1_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nV*nV, nO*nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
double* d_W_vvov;
cudaStat = cudaMalloc((void **)&d_W_vvov, nV*nV*nO*BLOCK_SIZE * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_W_vvov, nV*nV*nO*BLOCK_SIZE * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
double* d_W_vvov_tmp;
cudaStat = cudaMalloc((void **)&d_W_vvov_tmp, nV*nO*nV*BLOCK_SIZE * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_W_vvov_tmp, nV*nO*nV*BLOCK_SIZE * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
@ -2008,7 +2008,7 @@ void compute_r1_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
#pragma omp section
{
double* d_W_oovo;
cudaStat = cudaMalloc((void **)&d_W_oovo, nO*nO*nV*nO * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_W_oovo, nO*nO*nV*nO * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
alpha = 2.0;

View File

@ -36,125 +36,125 @@ gpu_data* gpu_init(
double* d_cc_space_v_oo_chol;
lda = cholesky_mo_num * nO;
cudaStat = cudaMalloc((void **)&d_cc_space_v_oo_chol, lda * nO * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_cc_space_v_oo_chol, lda * nO * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
cublasSetMatrix(cholesky_mo_num*nO, nO, sizeof(double), cc_space_v_oo_chol, lda, d_cc_space_v_oo_chol, lda);
double* d_cc_space_v_ov_chol;
lda = cholesky_mo_num * nO;
cudaStat = cudaMalloc((void **)&d_cc_space_v_ov_chol, lda * nV * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_cc_space_v_ov_chol, lda * nV * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
cublasSetMatrix(cholesky_mo_num*nO, nV, sizeof(double), cc_space_v_ov_chol, lda, d_cc_space_v_ov_chol, lda);
double* d_cc_space_v_vo_chol;
lda = cholesky_mo_num * nV;
cudaStat = cudaMalloc((void **)&d_cc_space_v_vo_chol, lda * nO * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_cc_space_v_vo_chol, lda * nO * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
cublasSetMatrix(cholesky_mo_num*nV, nO, sizeof(double), cc_space_v_vo_chol, lda, d_cc_space_v_vo_chol, lda);
double* d_cc_space_v_vv_chol;
lda = cholesky_mo_num * nV;
cudaStat = cudaMalloc((void **)&d_cc_space_v_vv_chol, lda * nV * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_cc_space_v_vv_chol, lda * nV * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
cublasSetMatrix(cholesky_mo_num*nV, nV, sizeof(double), cc_space_v_vv_chol, lda, d_cc_space_v_vv_chol, lda);
double* d_cc_space_v_oooo;
cudaStat = cudaMalloc((void**)&d_cc_space_v_oooo, nO*nO*nO*nO*sizeof(double));
cudaStat = cudaMallocManaged((void**)&d_cc_space_v_oooo, nO*nO*nO*nO*sizeof(double), 1);
assert (cudaStat == cudaSuccess);
cublasSetMatrix(nO*nO, nO*nO, sizeof(double), cc_space_v_oooo, nO*nO, d_cc_space_v_oooo, nO*nO);
double* d_cc_space_v_vooo;
cudaStat = cudaMalloc((void**)&d_cc_space_v_vooo, nV*nO*nO*nO*sizeof(double));
cudaStat = cudaMallocManaged((void**)&d_cc_space_v_vooo, nV*nO*nO*nO*sizeof(double), 1);
assert (cudaStat == cudaSuccess);
cublasSetMatrix(nV*nO, nO*nO, sizeof(double), cc_space_v_vooo, nV*nO, d_cc_space_v_vooo, nV*nO);
double* d_cc_space_v_voov;
cudaStat = cudaMalloc((void**)&d_cc_space_v_voov, nV*nO*nO*nV*sizeof(double));
cudaStat = cudaMallocManaged((void**)&d_cc_space_v_voov, nV*nO*nO*nV*sizeof(double), 1);
assert (cudaStat == cudaSuccess);
cublasSetMatrix(nV*nO, nO*nV, sizeof(double), cc_space_v_voov, nV*nO, d_cc_space_v_voov, nV*nO);
double* d_cc_space_v_oovv;
cudaStat = cudaMalloc((void**)&d_cc_space_v_oovv, nO*nO*nV*nV*sizeof(double));
cudaStat = cudaMallocManaged((void**)&d_cc_space_v_oovv, nO*nO*nV*nV*sizeof(double), 1);
assert (cudaStat == cudaSuccess);
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), cc_space_v_oovv, nO*nO, d_cc_space_v_oovv, nO*nO);
double* d_cc_space_v_vvoo;
cudaStat = cudaMalloc((void**)&d_cc_space_v_vvoo, nV*nV*nO*nO*sizeof(double));
cudaStat = cudaMallocManaged((void**)&d_cc_space_v_vvoo, nV*nV*nO*nO*sizeof(double), 1);
assert (cudaStat == cudaSuccess);
cublasSetMatrix(nV*nV, nO*nO, sizeof(double), cc_space_v_vvoo, nV*nV, d_cc_space_v_vvoo, nV*nV);
double* d_cc_space_v_oovo;
lda = nO*nO;
cudaStat = cudaMalloc((void **)&d_cc_space_v_oovo, nO*nO*nV*nO * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_cc_space_v_oovo, nO*nO*nV*nO * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_oovo, lda, d_cc_space_v_oovo, lda);
double* d_cc_space_v_ovvo;
lda = nO*nV;
cudaStat = cudaMalloc((void **)&d_cc_space_v_ovvo, nO*nV*nV*nO * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_cc_space_v_ovvo, nO*nV*nV*nO * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_ovvo, lda, d_cc_space_v_ovvo, lda);
double* d_cc_space_v_ovov;
lda = nO*nV;
cudaStat = cudaMalloc((void **)&d_cc_space_v_ovov, nO*nV*nV*nO * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_cc_space_v_ovov, nO*nV*nV*nO * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_ovov, lda, d_cc_space_v_ovov, lda);
double* d_cc_space_v_ovoo;
lda = nO*nV;
cudaStat = cudaMalloc((void **)&d_cc_space_v_ovoo, nO*nV*nO*nO * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_cc_space_v_ovoo, nO*nV*nO*nO * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
cublasSetMatrix(lda, nO*nO, sizeof(double), cc_space_v_ovoo, lda, d_cc_space_v_ovoo, lda);
double* d_cc_space_f_oo;
cudaStat = cudaMalloc((void**)&d_cc_space_f_oo, nO*nO*sizeof(double));
cudaStat = cudaMallocManaged((void**)&d_cc_space_f_oo, nO*nO*sizeof(double), 1);
assert (cudaStat == cudaSuccess);
cublasSetMatrix(nO, nO, sizeof(double), cc_space_f_oo, nO, d_cc_space_f_oo, nO);
double* d_cc_space_f_vo;
cudaStat = cudaMalloc((void**)&d_cc_space_f_vo, nV*nO*sizeof(double));
cudaStat = cudaMallocManaged((void**)&d_cc_space_f_vo, nV*nO*sizeof(double), 1);
assert (cudaStat == cudaSuccess);
cublasSetMatrix(nV, nO, sizeof(double), cc_space_f_vo, nV, d_cc_space_f_vo, nV);
double* d_cc_space_f_ov;
cudaStat = cudaMalloc((void**)&d_cc_space_f_ov, nV*nO*sizeof(double));
cudaStat = cudaMallocManaged((void**)&d_cc_space_f_ov, nV*nO*sizeof(double), 1);
assert (cudaStat == cudaSuccess);
cublasSetMatrix(nO, nV, sizeof(double), cc_space_f_ov, nO, d_cc_space_f_ov, nO);
double* d_cc_space_f_vv;
cudaStat = cudaMalloc((void**)&d_cc_space_f_vv, nV*nV*sizeof(double));
cudaStat = cudaMallocManaged((void**)&d_cc_space_f_vv, nV*nV*sizeof(double), 1);
assert (cudaStat == cudaSuccess);
cublasSetMatrix(nV, nV, sizeof(double), cc_space_f_vv, nV, d_cc_space_f_vv, nV);
double* d_tau;
lda = nO * nO;
cudaStat = cudaMalloc((void **)&d_tau, lda * nV * nV * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_tau, lda * nV * nV * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
double* d_tau_x;
lda = nO * nO;
cudaStat = cudaMalloc((void **)&d_tau_x, lda * nV * nV * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_tau_x, lda * nV * nV * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
double* d_t1;
cudaStat = cudaMalloc((void **)&d_t1, nO * nV * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_t1, nO * nV * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
double* d_t2;
cudaStat = cudaMalloc((void **)&d_t2, nO*nO*nV*nV * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_t2, nO*nO*nV*nV * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
double* d_H_oo;
cudaStat = cudaMalloc((void **)&d_H_oo, nO * nO * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_H_oo, nO * nO * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
double* d_H_vo;
cudaStat = cudaMalloc((void **)&d_H_vo, nV * nO * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_H_vo, nV * nO * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
double* d_H_vv;
cudaStat = cudaMalloc((void **)&d_H_vv, nV * nV * sizeof(double));
cudaStat = cudaMallocManaged((void **)&d_H_vv, nV * nV * sizeof(double), 1);
assert (cudaStat == cudaSuccess);
data[igpu].cc_space_v_oo_chol = d_cc_space_v_oo_chol;

View File

@ -40,7 +40,7 @@ subroutine run
print *,''
print '(''|'',A6,''|'',5(A20,''|''))', 'E(mu)', '<W>', 'E(mu) + <W>', 'E(mu) + \alpha_0<W>', &
'E(mu) + \alpha_0<W>_s + \alpha_1<W>_t', 'E(mu) + \alpha_0_r<W>_s + \alpha_1_r<W>_t'
print '(''|'',F6.1,''|'',5(F20.15,''|''))', mu_erf, energy_mu(istate), energy_mu(istate) + correction_mu(istate), energy_mu(istate) + &
print '(''|'',F6.2,''|'',5(F20.15,''|''))', mu_erf, energy_mu(istate), energy_mu(istate) + correction_mu(istate), energy_mu(istate) + &
correction_alpha_0(istate), energy_mu(istate) + correction_alpha_1(istate), energy_mu(istate) + &
correction_alpha_1_r(istate)
enddo