Separate upload of t1, t2 and tau

This commit is contained in:
Anthony Scemama 2023-08-03 17:29:57 +02:00
parent d3c87d8181
commit 2ea84d4ffc
5 changed files with 124 additions and 51 deletions

View File

@ -126,14 +126,13 @@ subroutine run_ccsd_space_orb
! Residue
if (do_ao_cholesky) then
! if (.False.) then
call compute_H_oo_chol(nO,nV,tau_x,H_oo)
call compute_H_vv_chol(nO,nV,tau_x,H_vv)
call compute_H_vo_chol(nO,nV,t1,H_vo)
call gpu_upload(gpu_data, nO, nV, t1, t2, tau, tau_x, H_oo, H_vv);
call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
call compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num,gpu_data,t1,t2,tau, &
H_oo, H_vv, r2, max_r2)
call compute_r2_space_chol_gpu(gpu_data, nO, nV, t1, r2, max_r2)
else
call compute_H_oo(nO,nV,t1,t2,tau,H_oo)
call compute_H_vv(nO,nV,t1,t2,tau,H_vv)

View File

@ -6,22 +6,60 @@
#include <cuda_runtime.h>
#include "gpu.h"
#define BLOCK_SIZE 16
void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo_num,
gpu_data* data,
/*
 * Copy the current host arrays t1, t2, tau, tau_x, H_oo and H_vv into the
 * device buffers referenced by `data`.
 *
 * data   : array of per-device descriptors; entry data[igpu] is used after
 *          cudaSetDevice(igpu), and its tau, tau_x, t1, t2, H_oo and H_vv
 *          members are the destination device pointers.  They are assumed to
 *          be already allocated with at least the sizes transferred below
 *          (NOTE(review): presumably done by gpu_init -- confirm).
 * nO, nV : dimensions that determine the transfer sizes.
 * t1     : host array, transferred as an nO x nV matrix.
 * t2, tau, tau_x :
 *          host arrays, each transferred as an (nO*nO) x (nV*nV) matrix.
 * H_oo   : host array, transferred as an nO x nO matrix.
 * H_vv   : host array, transferred as an nV x nV matrix.
 *
 * One OpenMP thread is requested per device; the device count is queried
 * only when MULTIGPU == 1, otherwise a single device (index 0) is used.
 * Return codes of the CUDA/cuBLAS calls are not inspected here.
 */
void gpu_upload(gpu_data* data,
                int nO, int nV,
                double* t1,
                double* t2,
                double* tau,
                double* tau_x,
                double* H_oo,
                double* H_vv)
{
  int ngpus = 1;
  if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);

  #pragma omp parallel num_threads(ngpus)
  {
    /* The OpenMP thread number doubles as the CUDA device index. */
    const int igpu = omp_get_thread_num();
    cudaSetDevice(igpu);

    /* Leading dimensions are declared inside the parallel block so that each
     * thread owns its copy: a variable declared outside the block would be
     * shared, and concurrent writes from the per-device threads would race
     * with the reads made when calling cublasSetMatrix. */
    const int ld_oo = nO * nO;   /* rows of the (nO*nO) x (nV*nV) arrays */
    const int ld_o  = nO;        /* rows of t1 and H_oo                  */
    const int ld_v  = nV;        /* rows of H_vv                         */

    cublasSetMatrix(ld_oo, nV*nV, sizeof(double), tau,   ld_oo, data[igpu].tau,   ld_oo);
    cublasSetMatrix(ld_oo, nV*nV, sizeof(double), tau_x, ld_oo, data[igpu].tau_x, ld_oo);
    cublasSetMatrix(ld_o,  nV,    sizeof(double), t1,    ld_o,  data[igpu].t1,    ld_o);
    cublasSetMatrix(ld_oo, nV*nV, sizeof(double), t2,    ld_oo, data[igpu].t2,    ld_oo);
    cublasSetMatrix(ld_o,  nO,    sizeof(double), H_oo,  ld_o,  data[igpu].H_oo,  ld_o);
    cublasSetMatrix(ld_v,  nV,    sizeof(double), H_vv,  ld_v,  data[igpu].H_vv,  ld_v);
  }
}
void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, double* r2, double* max_r2)
{
const int cholesky_mo_num = data->cholesky_mo_num;
int ngpus = 1;
if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
double* J1 = malloc(nO*nV*nV*nO*sizeof(double));
double* K1 = malloc(nO*nV*nV*nO*sizeof(double));
@ -33,15 +71,19 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
double* A;
double* B;
double* C;
cudaStream_t stream[nV];
int igpu = omp_get_thread_num();
cudaSetDevice(igpu);
cublasHandle_t handle;
cublasCreate(&handle);
cudaStream_t stream[nV];
double* d_r2;
lda = nO * nO;
cudaMalloc((void **)&d_r2, lda * nV * nV * sizeof(double));
cudaMemset(d_r2, 0, nO*nO*nV*nV*sizeof(double));
memset(r2, 0, nO*nO*nV*nV*sizeof(double));
double* d_cc_space_v_oo_chol = data[igpu].cc_space_v_oo_chol;
double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;
@ -56,33 +98,15 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
double* d_cc_space_v_ovov = data[igpu].cc_space_v_ovov;
double* d_cc_space_v_ovoo = data[igpu].cc_space_v_ovoo;
double* d_cc_space_f_vo = data[igpu].cc_space_f_vo;
double* d_tau;
double* d_r2;
double* d_t1;
double* d_t2;
double* d_tmp_cc;
double* d_tau = data[igpu].tau;
double* d_t1 = data[igpu].t1;
double* d_t2 = data[igpu].t2;
double* d_H_oo = data[igpu].H_oo;
double* d_H_vv = data[igpu].H_vv;
double* d_K1;
cudaMalloc((void **)&d_K1, nO*nV*nO*nV * sizeof(double));
lda = nO * nO;
cudaMalloc((void **)&d_tau, lda * nV * nV * sizeof(double));
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau, lda, d_tau, lda);
lda = nO * nO;
cudaMalloc((void **)&d_r2, lda * nV * nV * sizeof(double));
memset(r2, 0, nO*nO*nV*nV*sizeof(double));
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), r2, lda, d_r2, lda);
lda = nO;
cudaMalloc((void **)&d_t1, nO * nV * sizeof(double));
cublasSetMatrix(nO, nV, sizeof(double), t1, lda, d_t1, lda);
lda = nO*nO;
cudaMalloc((void **)&d_t2, nO*nO*nV*nV * sizeof(double));
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), t2, lda, d_t2, lda);
#pragma omp sections
{
@ -523,7 +547,7 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
{
double* d_g_vir;
cudaMalloc((void**)&d_g_vir, nV*nV*sizeof(double));
cublasSetMatrix(nV, nV, sizeof(double), H_vv, nV, d_g_vir, nV);
cublasDcopy(handle, nV*nV, d_H_vv, 1, d_g_vir, 1);
alpha = -1.0;
beta = 1.0;
@ -637,7 +661,7 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
double* d_g_occ;
lda = nO;
cudaMalloc((void **)&d_g_occ, nO*nO * sizeof(double));
cublasSetMatrix(lda, nO, sizeof(double), H_oo, lda, d_g_occ, lda);
cublasDcopy(handle, nO*nO, d_H_oo, 1, d_g_occ, 1);
double* d_X;
cudaMalloc((void **)&d_X, cholesky_mo_num*sizeof(double));
@ -720,6 +744,8 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
lda = nO*nV;
cublasSetMatrix(lda, nO*nV, sizeof(double), K1, lda, d_K1, lda);
#define BLOCK_SIZE 16
#pragma omp sections
{
@ -1224,6 +1250,7 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
cudaFree(d_K1);
double* d_tmp_cc;
lda = cholesky_mo_num * nV;
cudaMalloc((void **)&d_tmp_cc, lda * nV * sizeof(double));
@ -1297,9 +1324,6 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
cudaFree(d_tmpB1);
cudaFree(d_B1);
cudaFree(d_tmp_cc2);
cudaFree(d_tau);
cudaFree(d_t1);
cudaFree(d_tmp_cc);
double * r2_tmp = malloc(nO*nO*nV*nV*sizeof(double));

View File

@ -12,5 +12,15 @@ typedef struct {
double* cc_space_v_ovov;
double* cc_space_v_ovoo;
double* cc_space_f_vo;
double* tau;
double* tau_x;
double* t1;
double* t2;
double* H_oo;
double* H_vv;
int nO;
int nV;
int cholesky_mo_num;
} gpu_data;
#define MULTIGPU 1

View File

@ -6,8 +6,6 @@
#include <cuda_runtime.h>
#include "gpu.h"
#define BLOCK_SIZE 16
gpu_data* gpu_init(
int nO, int nV, int cholesky_mo_num,
double* cc_space_v_oo_chol, double* cc_space_v_ov_chol,
@ -92,6 +90,30 @@ gpu_data* gpu_init(
cudaMalloc((void**)&d_cc_space_f_vo, nV*nO*sizeof(double));
cublasSetMatrix(nV, nO, sizeof(double), cc_space_f_vo, nV, d_cc_space_f_vo, nV);
double* d_tau;
lda = nO * nO;
cudaMalloc((void **)&d_tau, lda * nV * nV * sizeof(double));
double* d_tau_x;
lda = nO * nO;
cudaMalloc((void **)&d_tau_x, lda * nV * nV * sizeof(double));
double* d_t1;
lda = nO;
cudaMalloc((void **)&d_t1, nO * nV * sizeof(double));
double* d_t2;
lda = nO*nO;
cudaMalloc((void **)&d_t2, nO*nO*nV*nV * sizeof(double));
double* d_H_oo;
lda = nO;
cudaMalloc((void **)&d_H_oo, nO * nO * sizeof(double));
double* d_H_vv;
lda = nV;
cudaMalloc((void **)&d_H_vv, nV * nV * sizeof(double));
data[igpu].cc_space_v_oo_chol = d_cc_space_v_oo_chol;
data[igpu].cc_space_v_ov_chol = d_cc_space_v_ov_chol;
data[igpu].cc_space_v_vo_chol = d_cc_space_v_vo_chol;
@ -105,6 +127,17 @@ gpu_data* gpu_init(
data[igpu].cc_space_v_ovov = d_cc_space_v_ovov;
data[igpu].cc_space_v_ovoo = d_cc_space_v_ovoo;
data[igpu].cc_space_f_vo = d_cc_space_f_vo;
data[igpu].tau = d_tau;
data[igpu].tau_x = d_tau_x;
data[igpu].t1 = d_t1;
data[igpu].t2 = d_t2;
data[igpu].H_oo = d_H_oo;
data[igpu].H_vv = d_H_vv;
data[igpu].nO = nO;
data[igpu].nV = nV;
data[igpu].cholesky_mo_num = cholesky_mo_num;
}
return data;
}

View File

@ -26,16 +26,23 @@ module gpu_module
real(c_double), intent(in) :: cc_space_f_vo(nV,nO)
end function
subroutine compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num, gpu_data, t1, t2, tau,&
H_oo, H_vv, r2, max_r2) bind(C)
! Interface to the C routine gpu_upload (bind(C)).
!   gpu_data : opaque C pointer, passed by value.
!   nO, nV   : array dimensions, passed by value.
!   t1, t2, tau, tau_x, H_oo, H_vv : input arrays with the shapes declared
!              below, passed by reference.
! Only the nine dummy arguments of the subroutine statement are declared;
! each one is declared exactly once.
subroutine gpu_upload(gpu_data, nO, nV, t1, t2, tau, tau_x, H_oo, H_vv) bind(C)
  import c_int, c_double, c_ptr
  type(c_ptr), value :: gpu_data
  integer(c_int), intent(in), value :: nO, nV
  real(c_double), intent(in) :: t1(nO,nV)
  real(c_double), intent(in) :: t2(nO,nO,nV,nV)
  real(c_double), intent(in) :: tau(nO,nO,nV,nV)
  real(c_double), intent(in) :: tau_x(nO,nO,nV,nV)
  real(c_double), intent(in) :: H_oo(nO,nO)
  real(c_double), intent(in) :: H_vv(nV,nV)
end subroutine
! Interface to the C routine compute_r2_space_chol_gpu (bind(C)).
!   gpu_data : opaque C pointer, passed by value.
!   nO, nV   : array dimensions, passed by value.
!   t1       : input array of shape (nO,nV), passed by reference.
!   r2       : output array of shape (nO,nO,nV,nV).
!   max_r2   : output scalar, passed by reference.
! t2, tau, H_oo and H_vv are not arguments of this routine, so they are not
! declared here.
subroutine compute_r2_space_chol_gpu(gpu_data, nO, nV, t1, r2, max_r2) bind(C)
  import c_int, c_double, c_ptr
  type(c_ptr), value :: gpu_data
  integer(c_int), intent(in), value :: nO, nV
  real(c_double), intent(in) :: t1(nO,nV)
  real(c_double), intent(out) :: r2(nO,nO,nV,nV)
  real(c_double), intent(out) :: max_r2
end subroutine