mirror of
https://gitlab.com/scemama/qp_plugins_scemama.git
synced 2024-12-22 12:23:37 +01:00
Separate upload of t1, t2 and tau
This commit is contained in:
parent
d3c87d8181
commit
2ea84d4ffc
@ -126,14 +126,13 @@ subroutine run_ccsd_space_orb
|
|||||||
|
|
||||||
! Residue
|
! Residue
|
||||||
if (do_ao_cholesky) then
|
if (do_ao_cholesky) then
|
||||||
! if (.False.) then
|
|
||||||
call compute_H_oo_chol(nO,nV,tau_x,H_oo)
|
call compute_H_oo_chol(nO,nV,tau_x,H_oo)
|
||||||
call compute_H_vv_chol(nO,nV,tau_x,H_vv)
|
call compute_H_vv_chol(nO,nV,tau_x,H_vv)
|
||||||
call compute_H_vo_chol(nO,nV,t1,H_vo)
|
call compute_H_vo_chol(nO,nV,t1,H_vo)
|
||||||
|
|
||||||
|
call gpu_upload(gpu_data, nO, nV, t1, t2, tau, tau_x, H_oo, H_vv);
|
||||||
call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
|
call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
|
||||||
call compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num,gpu_data,t1,t2,tau, &
|
call compute_r2_space_chol_gpu(gpu_data, nO, nV, t1, r2, max_r2)
|
||||||
H_oo, H_vv, r2, max_r2)
|
|
||||||
else
|
else
|
||||||
call compute_H_oo(nO,nV,t1,t2,tau,H_oo)
|
call compute_H_oo(nO,nV,t1,t2,tau,H_oo)
|
||||||
call compute_H_vv(nO,nV,t1,t2,tau,H_vv)
|
call compute_H_vv(nO,nV,t1,t2,tau,H_vv)
|
||||||
|
@ -6,22 +6,60 @@
|
|||||||
#include <cuda_runtime.h>
|
#include <cuda_runtime.h>
|
||||||
#include "gpu.h"
|
#include "gpu.h"
|
||||||
|
|
||||||
#define BLOCK_SIZE 16
|
void gpu_upload(gpu_data* data,
|
||||||
|
int nO, int nV,
|
||||||
|
|
||||||
void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo_num,
|
|
||||||
gpu_data* data,
|
|
||||||
double* t1,
|
double* t1,
|
||||||
double* t2,
|
double* t2,
|
||||||
double* tau,
|
double* tau,
|
||||||
|
double* tau_x,
|
||||||
double* H_oo,
|
double* H_oo,
|
||||||
double* H_vv,
|
double* H_vv)
|
||||||
double* r2,
|
|
||||||
double* max_r2)
|
|
||||||
{
|
{
|
||||||
|
int lda;
|
||||||
|
const int cholesky_mo_num = data->cholesky_mo_num;
|
||||||
|
|
||||||
int ngpus = 1;
|
int ngpus = 1;
|
||||||
cudaGetDeviceCount(&ngpus);
|
if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
|
||||||
|
|
||||||
|
|
||||||
|
#pragma omp parallel num_threads(ngpus)
|
||||||
|
{
|
||||||
|
int igpu = omp_get_thread_num();
|
||||||
|
cudaSetDevice(igpu);
|
||||||
|
|
||||||
|
double* d_tau = data[igpu].tau;
|
||||||
|
lda = nO * nO;
|
||||||
|
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau, lda, d_tau, lda);
|
||||||
|
|
||||||
|
double* d_tau_x = data[igpu].tau_x;
|
||||||
|
lda = nO * nO;
|
||||||
|
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau_x, lda, d_tau_x, lda);
|
||||||
|
|
||||||
|
double* d_t1 = data[igpu].t1;
|
||||||
|
lda = nO;
|
||||||
|
cublasSetMatrix(nO, nV, sizeof(double), t1, lda, d_t1, lda);
|
||||||
|
|
||||||
|
double* d_t2 = data[igpu].t2;
|
||||||
|
lda = nO*nO;
|
||||||
|
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), t2, lda, d_t2, lda);
|
||||||
|
|
||||||
|
double* d_H_oo = data[igpu].H_oo;
|
||||||
|
lda = nO;
|
||||||
|
cublasSetMatrix(nO, nO, sizeof(double), H_oo, lda, d_H_oo, lda);
|
||||||
|
|
||||||
|
double* d_H_vv = data[igpu].H_vv;
|
||||||
|
lda = nV;
|
||||||
|
cublasSetMatrix(nV, nV, sizeof(double), H_vv, lda, d_H_vv, lda);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, double* r2, double* max_r2)
|
||||||
|
{
|
||||||
|
const int cholesky_mo_num = data->cholesky_mo_num;
|
||||||
|
|
||||||
|
int ngpus = 1;
|
||||||
|
if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
|
||||||
|
|
||||||
double* J1 = malloc(nO*nV*nV*nO*sizeof(double));
|
double* J1 = malloc(nO*nV*nV*nO*sizeof(double));
|
||||||
double* K1 = malloc(nO*nV*nV*nO*sizeof(double));
|
double* K1 = malloc(nO*nV*nV*nO*sizeof(double));
|
||||||
@ -33,15 +71,19 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
|
|||||||
double* A;
|
double* A;
|
||||||
double* B;
|
double* B;
|
||||||
double* C;
|
double* C;
|
||||||
|
cudaStream_t stream[nV];
|
||||||
|
|
||||||
int igpu = omp_get_thread_num();
|
int igpu = omp_get_thread_num();
|
||||||
|
|
||||||
cudaSetDevice(igpu);
|
cudaSetDevice(igpu);
|
||||||
cublasHandle_t handle;
|
cublasHandle_t handle;
|
||||||
|
|
||||||
cublasCreate(&handle);
|
cublasCreate(&handle);
|
||||||
|
|
||||||
cudaStream_t stream[nV];
|
double* d_r2;
|
||||||
|
lda = nO * nO;
|
||||||
|
cudaMalloc((void **)&d_r2, lda * nV * nV * sizeof(double));
|
||||||
|
cudaMemset(d_r2, 0, nO*nO*nV*nV*sizeof(double));
|
||||||
|
memset(r2, 0, nO*nO*nV*nV*sizeof(double));
|
||||||
|
|
||||||
double* d_cc_space_v_oo_chol = data[igpu].cc_space_v_oo_chol;
|
double* d_cc_space_v_oo_chol = data[igpu].cc_space_v_oo_chol;
|
||||||
double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;
|
double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;
|
||||||
@ -56,33 +98,15 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
|
|||||||
double* d_cc_space_v_ovov = data[igpu].cc_space_v_ovov;
|
double* d_cc_space_v_ovov = data[igpu].cc_space_v_ovov;
|
||||||
double* d_cc_space_v_ovoo = data[igpu].cc_space_v_ovoo;
|
double* d_cc_space_v_ovoo = data[igpu].cc_space_v_ovoo;
|
||||||
double* d_cc_space_f_vo = data[igpu].cc_space_f_vo;
|
double* d_cc_space_f_vo = data[igpu].cc_space_f_vo;
|
||||||
|
double* d_tau = data[igpu].tau;
|
||||||
double* d_tau;
|
double* d_t1 = data[igpu].t1;
|
||||||
double* d_r2;
|
double* d_t2 = data[igpu].t2;
|
||||||
double* d_t1;
|
double* d_H_oo = data[igpu].H_oo;
|
||||||
double* d_t2;
|
double* d_H_vv = data[igpu].H_vv;
|
||||||
double* d_tmp_cc;
|
|
||||||
|
|
||||||
double* d_K1;
|
double* d_K1;
|
||||||
cudaMalloc((void **)&d_K1, nO*nV*nO*nV * sizeof(double));
|
cudaMalloc((void **)&d_K1, nO*nV*nO*nV * sizeof(double));
|
||||||
|
|
||||||
lda = nO * nO;
|
|
||||||
cudaMalloc((void **)&d_tau, lda * nV * nV * sizeof(double));
|
|
||||||
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau, lda, d_tau, lda);
|
|
||||||
|
|
||||||
lda = nO * nO;
|
|
||||||
cudaMalloc((void **)&d_r2, lda * nV * nV * sizeof(double));
|
|
||||||
memset(r2, 0, nO*nO*nV*nV*sizeof(double));
|
|
||||||
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), r2, lda, d_r2, lda);
|
|
||||||
|
|
||||||
lda = nO;
|
|
||||||
cudaMalloc((void **)&d_t1, nO * nV * sizeof(double));
|
|
||||||
cublasSetMatrix(nO, nV, sizeof(double), t1, lda, d_t1, lda);
|
|
||||||
|
|
||||||
lda = nO*nO;
|
|
||||||
cudaMalloc((void **)&d_t2, nO*nO*nV*nV * sizeof(double));
|
|
||||||
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), t2, lda, d_t2, lda);
|
|
||||||
|
|
||||||
#pragma omp sections
|
#pragma omp sections
|
||||||
{
|
{
|
||||||
|
|
||||||
@ -523,7 +547,7 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
|
|||||||
{
|
{
|
||||||
double* d_g_vir;
|
double* d_g_vir;
|
||||||
cudaMalloc((void**)&d_g_vir, nV*nV*sizeof(double));
|
cudaMalloc((void**)&d_g_vir, nV*nV*sizeof(double));
|
||||||
cublasSetMatrix(nV, nV, sizeof(double), H_vv, nV, d_g_vir, nV);
|
cublasDcopy(handle, nV*nV, d_H_vv, 1, d_g_vir, 1);
|
||||||
|
|
||||||
alpha = -1.0;
|
alpha = -1.0;
|
||||||
beta = 1.0;
|
beta = 1.0;
|
||||||
@ -637,7 +661,7 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
|
|||||||
double* d_g_occ;
|
double* d_g_occ;
|
||||||
lda = nO;
|
lda = nO;
|
||||||
cudaMalloc((void **)&d_g_occ, nO*nO * sizeof(double));
|
cudaMalloc((void **)&d_g_occ, nO*nO * sizeof(double));
|
||||||
cublasSetMatrix(lda, nO, sizeof(double), H_oo, lda, d_g_occ, lda);
|
cublasDcopy(handle, nO*nO, d_H_oo, 1, d_g_occ, 1);
|
||||||
|
|
||||||
double* d_X;
|
double* d_X;
|
||||||
cudaMalloc((void **)&d_X, cholesky_mo_num*sizeof(double));
|
cudaMalloc((void **)&d_X, cholesky_mo_num*sizeof(double));
|
||||||
@ -720,6 +744,8 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
|
|||||||
lda = nO*nV;
|
lda = nO*nV;
|
||||||
cublasSetMatrix(lda, nO*nV, sizeof(double), K1, lda, d_K1, lda);
|
cublasSetMatrix(lda, nO*nV, sizeof(double), K1, lda, d_K1, lda);
|
||||||
|
|
||||||
|
#define BLOCK_SIZE 16
|
||||||
|
|
||||||
#pragma omp sections
|
#pragma omp sections
|
||||||
{
|
{
|
||||||
|
|
||||||
@ -1224,6 +1250,7 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
|
|||||||
|
|
||||||
cudaFree(d_K1);
|
cudaFree(d_K1);
|
||||||
|
|
||||||
|
double* d_tmp_cc;
|
||||||
lda = cholesky_mo_num * nV;
|
lda = cholesky_mo_num * nV;
|
||||||
cudaMalloc((void **)&d_tmp_cc, lda * nV * sizeof(double));
|
cudaMalloc((void **)&d_tmp_cc, lda * nV * sizeof(double));
|
||||||
|
|
||||||
@ -1297,9 +1324,6 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
|
|||||||
cudaFree(d_tmpB1);
|
cudaFree(d_tmpB1);
|
||||||
cudaFree(d_B1);
|
cudaFree(d_B1);
|
||||||
cudaFree(d_tmp_cc2);
|
cudaFree(d_tmp_cc2);
|
||||||
|
|
||||||
cudaFree(d_tau);
|
|
||||||
cudaFree(d_t1);
|
|
||||||
cudaFree(d_tmp_cc);
|
cudaFree(d_tmp_cc);
|
||||||
|
|
||||||
double * r2_tmp = malloc(nO*nO*nV*nV*sizeof(double));
|
double * r2_tmp = malloc(nO*nO*nV*nV*sizeof(double));
|
||||||
|
@ -12,5 +12,15 @@ typedef struct {
|
|||||||
double* cc_space_v_ovov;
|
double* cc_space_v_ovov;
|
||||||
double* cc_space_v_ovoo;
|
double* cc_space_v_ovoo;
|
||||||
double* cc_space_f_vo;
|
double* cc_space_f_vo;
|
||||||
|
double* tau;
|
||||||
|
double* tau_x;
|
||||||
|
double* t1;
|
||||||
|
double* t2;
|
||||||
|
double* H_oo;
|
||||||
|
double* H_vv;
|
||||||
|
int nO;
|
||||||
|
int nV;
|
||||||
|
int cholesky_mo_num;
|
||||||
} gpu_data;
|
} gpu_data;
|
||||||
|
|
||||||
|
#define MULTIGPU 1
|
||||||
|
@ -6,8 +6,6 @@
|
|||||||
#include <cuda_runtime.h>
|
#include <cuda_runtime.h>
|
||||||
#include "gpu.h"
|
#include "gpu.h"
|
||||||
|
|
||||||
#define BLOCK_SIZE 16
|
|
||||||
|
|
||||||
gpu_data* gpu_init(
|
gpu_data* gpu_init(
|
||||||
int nO, int nV, int cholesky_mo_num,
|
int nO, int nV, int cholesky_mo_num,
|
||||||
double* cc_space_v_oo_chol, double* cc_space_v_ov_chol,
|
double* cc_space_v_oo_chol, double* cc_space_v_ov_chol,
|
||||||
@ -92,6 +90,30 @@ gpu_data* gpu_init(
|
|||||||
cudaMalloc((void**)&d_cc_space_f_vo, nV*nO*sizeof(double));
|
cudaMalloc((void**)&d_cc_space_f_vo, nV*nO*sizeof(double));
|
||||||
cublasSetMatrix(nV, nO, sizeof(double), cc_space_f_vo, nV, d_cc_space_f_vo, nV);
|
cublasSetMatrix(nV, nO, sizeof(double), cc_space_f_vo, nV, d_cc_space_f_vo, nV);
|
||||||
|
|
||||||
|
double* d_tau;
|
||||||
|
lda = nO * nO;
|
||||||
|
cudaMalloc((void **)&d_tau, lda * nV * nV * sizeof(double));
|
||||||
|
|
||||||
|
double* d_tau_x;
|
||||||
|
lda = nO * nO;
|
||||||
|
cudaMalloc((void **)&d_tau_x, lda * nV * nV * sizeof(double));
|
||||||
|
|
||||||
|
double* d_t1;
|
||||||
|
lda = nO;
|
||||||
|
cudaMalloc((void **)&d_t1, nO * nV * sizeof(double));
|
||||||
|
|
||||||
|
double* d_t2;
|
||||||
|
lda = nO*nO;
|
||||||
|
cudaMalloc((void **)&d_t2, nO*nO*nV*nV * sizeof(double));
|
||||||
|
|
||||||
|
double* d_H_oo;
|
||||||
|
lda = nO;
|
||||||
|
cudaMalloc((void **)&d_H_oo, nO * nO * sizeof(double));
|
||||||
|
|
||||||
|
double* d_H_vv;
|
||||||
|
lda = nV;
|
||||||
|
cudaMalloc((void **)&d_H_vv, nV * nV * sizeof(double));
|
||||||
|
|
||||||
data[igpu].cc_space_v_oo_chol = d_cc_space_v_oo_chol;
|
data[igpu].cc_space_v_oo_chol = d_cc_space_v_oo_chol;
|
||||||
data[igpu].cc_space_v_ov_chol = d_cc_space_v_ov_chol;
|
data[igpu].cc_space_v_ov_chol = d_cc_space_v_ov_chol;
|
||||||
data[igpu].cc_space_v_vo_chol = d_cc_space_v_vo_chol;
|
data[igpu].cc_space_v_vo_chol = d_cc_space_v_vo_chol;
|
||||||
@ -105,6 +127,17 @@ gpu_data* gpu_init(
|
|||||||
data[igpu].cc_space_v_ovov = d_cc_space_v_ovov;
|
data[igpu].cc_space_v_ovov = d_cc_space_v_ovov;
|
||||||
data[igpu].cc_space_v_ovoo = d_cc_space_v_ovoo;
|
data[igpu].cc_space_v_ovoo = d_cc_space_v_ovoo;
|
||||||
data[igpu].cc_space_f_vo = d_cc_space_f_vo;
|
data[igpu].cc_space_f_vo = d_cc_space_f_vo;
|
||||||
|
data[igpu].tau = d_tau;
|
||||||
|
data[igpu].tau_x = d_tau_x;
|
||||||
|
data[igpu].t1 = d_t1;
|
||||||
|
data[igpu].t2 = d_t2;
|
||||||
|
data[igpu].H_oo = d_H_oo;
|
||||||
|
data[igpu].H_vv = d_H_vv;
|
||||||
|
|
||||||
|
data[igpu].nO = nO;
|
||||||
|
data[igpu].nV = nV;
|
||||||
|
data[igpu].cholesky_mo_num = cholesky_mo_num;
|
||||||
|
|
||||||
}
|
}
|
||||||
return data;
|
return data;
|
||||||
}
|
}
|
||||||
|
@ -26,16 +26,23 @@ module gpu_module
|
|||||||
real(c_double), intent(in) :: cc_space_f_vo(nV,nO)
|
real(c_double), intent(in) :: cc_space_f_vo(nV,nO)
|
||||||
end function
|
end function
|
||||||
|
|
||||||
subroutine compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num, gpu_data, t1, t2, tau,&
|
subroutine gpu_upload(gpu_data, nO, nV, t1, t2, tau, tau_x, H_oo, H_vv) bind(C)
|
||||||
H_oo, H_vv, r2, max_r2) bind(C)
|
|
||||||
import c_int, c_double, c_ptr
|
import c_int, c_double, c_ptr
|
||||||
integer(c_int), intent(in), value :: nO, nV, cholesky_mo_num
|
|
||||||
type(c_ptr), value :: gpu_data
|
type(c_ptr), value :: gpu_data
|
||||||
|
integer(c_int), intent(in), value :: nO, nV
|
||||||
|
real(c_double), intent(in) :: t1(nO,nV)
|
||||||
|
real(c_double), intent(in) :: t2(nO,nO,nV,nV)
|
||||||
|
real(c_double), intent(in) :: tau(nO,nO,nV,nV)
|
||||||
|
real(c_double), intent(in) :: tau_x(nO,nO,nV,nV)
|
||||||
|
real(c_double), intent(in) :: H_oo(nO,nO)
|
||||||
|
real(c_double), intent(in) :: H_vv(nV,nV)
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine compute_r2_space_chol_gpu(gpu_data, nO, nV, t1, r2, max_r2) bind(C)
|
||||||
|
import c_int, c_double, c_ptr
|
||||||
|
type(c_ptr), value :: gpu_data
|
||||||
|
integer(c_int), intent(in), value :: nO, nV
|
||||||
real(c_double), intent(in) :: t1(nO,nV)
|
real(c_double), intent(in) :: t1(nO,nV)
|
||||||
real(c_double), intent(in) :: t2(nO,nO,nV,nV)
|
|
||||||
real(c_double), intent(in) :: tau(nO,nO,nV,nV)
|
|
||||||
real(c_double), intent(in) :: H_oo(nO,nO)
|
|
||||||
real(c_double), intent(in) :: H_vv(nV,nV)
|
|
||||||
real(c_double), intent(out) :: r2(nO,nO,nV,nV)
|
real(c_double), intent(out) :: r2(nO,nO,nV,nV)
|
||||||
real(c_double), intent(out) :: max_r2
|
real(c_double), intent(out) :: max_r2
|
||||||
end subroutine
|
end subroutine
|
||||||
|
Loading…
Reference in New Issue
Block a user