r1 on GPU

This commit is contained in:
Anthony Scemama 2023-08-04 15:49:48 +02:00
parent a7e0832dae
commit ac2614a0f3
6 changed files with 197 additions and 2082 deletions

File diff suppressed because it is too large Load Diff

View File

@ -99,141 +99,37 @@ subroutine compute_r1_space_chol(gpu_data, nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max
call compute_r1_space_chol_gpu(gpu_data, nO, nV, t1, r1, max_r1)
! double precision, allocatable :: X_voov(:,:,:,:)
! allocate(X_voov(nV, nO, nO, nV))
!
! !$omp parallel &
! !$omp shared(nO,nV,X_voov,t2,t1) &
! !$omp private(u,beta,i,a) &
! !$omp default(none)
! !$omp do
! do beta = 1, nV
! do u = 1, nO
! do i = 1, nO
! do a = 1, nV
! X_voov(a,i,u,beta) = 2d0 * t2(i,u,a,beta) - t2(u,i,a,beta) + t1(u,a) * t1(i,beta)
! enddo
! enddo
! enddo
! enddo
! !$omp end do
! !$omp end parallel
!
! call dgemv('T', nV*nO, nO*nV, &
! 1d0, X_voov, nV * nO, &
! H_vo , 1, &
! 1d0, r1 , 1)
!
! deallocate(X_voov)
double precision, allocatable :: X_ovov(:,:,:,:)
allocate(X_ovov(nO, nV, nO, nV))
!$omp parallel &
!$omp shared(nO,nV,cc_space_v_ovov,cc_space_v_voov,X_ovov) &
!$omp private(u,beta,i,a) &
!$omp default(none)
!$omp do
do beta = 1, nV
do u = 1, nO
do a = 1, nv
do i = 1, nO
X_ovov(i,a,u,beta) = 2d0 * cc_space_v_voov(a,u,i,beta) - cc_space_v_ovov(u,a,i,beta)
enddo
enddo
enddo
enddo
!$omp end do
!$omp end parallel
call dgemv('T', nO*nV, nO*nV, &
1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), &
t1 , 1, &
1d0, r1 , 1)
deallocate(X_ovov)
integer :: iblock, block_size, nVmax
double precision, allocatable :: W_vvov(:,:,:,:), W_vvov_tmp(:,:,:,:), T_vvoo(:,:,:,:)
block_size = 16
allocate(W_vvov(nV,nV,nO,block_size), W_vvov_tmp(nV,nO,nV,block_size), T_vvoo(nV,nV,nO,nO))
!$omp parallel &
!$omp private(u,i,b,a) &
!$omp default(shared)
!$omp do
do u = 1, nO
do i = 1, nO
do b = 1, nV
do a = 1, nV
T_vvoo(a,b,i,u) = tau(i,u,a,b)
enddo
enddo
enddo
enddo
!$omp end do
!$omp end parallel
do iblock = 1, nV, block_size
nVmax = min(block_size,nV-iblock+1)
call dgemm('T','N', nV*nO, nV*nVmax, cholesky_mo_num, 1.d0, &
cc_space_v_vo_chol , cholesky_mo_num, &
cc_space_v_vv_chol(1,1,iblock), cholesky_mo_num, &
0.d0, W_vvov_tmp, nV*nO)
!$omp parallel &
!$omp private(b,i,a,beta) &
!$omp default(shared)
do beta = 1, nVmax
do i = 1, nO
!$omp do
do b = 1, nV
do a = 1, nV
W_vvov(a,b,i,beta) = 2d0 * W_vvov_tmp(a,i,b,beta) - W_vvov_tmp(b,i,a,beta)
enddo
enddo
!$omp end do nowait
enddo
enddo
!$omp barrier
!$omp end parallel
call dgemm('T','N',nO,nVmax,nO*nV*nV, &
1d0, T_vvoo, nV*nV*nO, &
W_vvov, nO*nV*nV, &
1d0, r1(1,iblock), nO)
enddo
deallocate(W_vvov,T_vvoo)
double precision, allocatable :: W_oovo(:,:,:,:)
allocate(W_oovo(nO,nO,nV,nO))
!$omp parallel &
!$omp shared(nO,nV,cc_space_v_vooo,W_oovo) &
!$omp private(u,a,i,j) &
!$omp default(none)
do u = 1, nO
!$omp do
do a = 1, nV
do j = 1, nO
do i = 1, nO
W_oovo(i,j,a,u) = 2d0 * cc_space_v_vooo(a,u,i,j) - cc_space_v_vooo(a,u,j,i)
enddo
enddo
enddo
!$omp end do nowait
enddo
!$omp end parallel
! !$omp parallel &
! !$omp shared(nO,nV,cc_space_v_oovo,W_oovo) &
! !$omp private(u,a,i,j) &
! !$omp default(none)
! do u = 1, nO
! !$omp do
! do a = 1, nV
! do j = 1, nO
! do i = 1, nO
! W_oovo(i,j,a,u) = 2d0 * cc_space_v_oovo(i,j,a,u) - cc_space_v_oovo(j,i,a,u)
! enddo
! enddo
! enddo
! !$omp end do nowait
! enddo
! !$omp end parallel
call dgemm('T','N', nO, nV, nO*nO*nV, &
-1d0, W_oovo, size(W_oovo,1) * size(W_oovo,2) * size(W_oovo,3), &
tau , size(tau,1) * size(tau,2) * size(tau,3), &
1d0, r1 , nO)
deallocate(W_oovo)
! call dgemm('T','N', nO, nV, nO*nO*nV, &
! -1d0, W_oovo, nO * nO * nV, &
! tau , nO * nO * nV, &
! 1d0, r1 , nO)
!
! deallocate(W_oovo)
max_r1 = 0d0
do a = 1, nV

View File

@ -44,13 +44,15 @@ void gpu_upload(gpu_data* data,
}
void compute_h_oo_chol_gpu(gpu_data* data, int nO, int nV, int igpu, double* H_oo)
void compute_h_oo_chol_gpu(gpu_data* data, int igpu)
{
int ngpus = 1;
if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
igpu = igpu % ngpus;
const int cholesky_mo_num = data[igpu].cholesky_mo_num;
const int nO = data[igpu].nO;
const int nV = data[igpu].nV;
cudaSetDevice(igpu);
int m,n,k, lda, ldb, ldc;
@ -120,7 +122,7 @@ void compute_h_oo_chol_gpu(gpu_data* data, int nO, int nV, int igpu, double* H_o
cudaFree(d_tau_kau);
// double* H_oo = malloc(nO*nO*sizeof(double));
double* H_oo = malloc(nO*nO*sizeof(double));
cublasGetMatrix(nO, nO, sizeof(double), d_H_oo, nO, H_oo, nO);
for (int i=0 ; i<ngpus ; ++i) {
if (i != igpu) {
@ -129,20 +131,22 @@ void compute_h_oo_chol_gpu(gpu_data* data, int nO, int nV, int igpu, double* H_o
cublasSetMatrix(nO, nO, sizeof(double), H_oo, nO, d_H_oo, nO);
}
}
// free(H_oo);
free(H_oo);
cublasDestroy(handle);
}
void compute_h_vo_chol_gpu(gpu_data* data, int nO, int nV, int igpu, double* H_vo)
void compute_h_vo_chol_gpu(gpu_data* data, int igpu)
{
int ngpus = 1;
if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
igpu = igpu % ngpus;
const int cholesky_mo_num = data[igpu].cholesky_mo_num;
const int nO = data[igpu].nO;
const int nV = data[igpu].nV;
cudaSetDevice(igpu);
int m,n,k, lda, ldb, ldc;
@ -222,7 +226,7 @@ void compute_h_vo_chol_gpu(gpu_data* data, int nO, int nV, int igpu, double* H_v
C=d_H_vo; ldc=nV;
cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
// double* H_vo = malloc(nO*nO*sizeof(double));
double* H_vo = malloc(nV*nO*sizeof(double));
cublasGetMatrix(nV, nO, sizeof(double), d_H_vo, nV, H_vo, nV);
for (int i=0 ; i<ngpus ; ++i) {
if (i != igpu) {
@ -231,7 +235,7 @@ void compute_h_vo_chol_gpu(gpu_data* data, int nO, int nV, int igpu, double* H_v
cublasSetMatrix(nV, nO, sizeof(double), H_vo, nV, d_H_vo, nV);
}
}
// free(H_vo);
free(H_vo);
cublasDestroy(handle);
@ -239,13 +243,15 @@ void compute_h_vo_chol_gpu(gpu_data* data, int nO, int nV, int igpu, double* H_v
void compute_h_vv_chol_gpu(gpu_data* data, int nO, int nV, int igpu, double* H_vv)
void compute_h_vv_chol_gpu(gpu_data* data, int igpu)
{
int ngpus = 1;
if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
igpu = igpu % ngpus;
const int cholesky_mo_num = data[igpu].cholesky_mo_num;
const int nO = data[igpu].nO;
const int nV = data[igpu].nV;
cudaSetDevice(igpu);
int m,n,k, lda, ldb, ldc;
@ -305,7 +311,7 @@ void compute_h_vv_chol_gpu(gpu_data* data, int nO, int nV, int igpu, double* H_v
cudaFree(d_tau_kia);
// double* H_vv = malloc(nO*nO*sizeof(double));
double* H_vv = malloc(nV*nV*sizeof(double));
cublasGetMatrix(nV, nV, sizeof(double), d_H_vv, nV, H_vv, nV);
for (int i=0 ; i<ngpus ; ++i) {
if (i != igpu) {
@ -314,7 +320,7 @@ void compute_h_vv_chol_gpu(gpu_data* data, int nO, int nV, int igpu, double* H_v
cublasSetMatrix(nV, nV, sizeof(double), H_vv, nV, d_H_vv, nV);
}
}
// free(H_vv);
free(H_vv);
cublasDestroy(handle);
@ -1655,23 +1661,14 @@ void compute_r1_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
cudaMemset(d_r1, 0, nO*nV*sizeof(double));
memset(r1, 0, nO*nV*sizeof(double));
// double* d_cc_space_v_oo_chol = data[igpu].cc_space_v_oo_chol;
// double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;
// double* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol;
// double* d_cc_space_v_vv_chol = data[igpu].cc_space_v_vv_chol;
// double* d_cc_space_v_oooo = data[igpu].cc_space_v_oooo;
// double* d_cc_space_v_vooo = data[igpu].cc_space_v_vooo;
// double* d_cc_space_v_oovv = data[igpu].cc_space_v_oovv;
// double* d_cc_space_v_vvoo = data[igpu].cc_space_v_vvoo;
// double* d_cc_space_v_oovo = data[igpu].cc_space_v_oovo;
// double* d_cc_space_v_ovvo = data[igpu].cc_space_v_ovvo;
// double* d_cc_space_v_ovov = data[igpu].cc_space_v_ovov;
// double* d_cc_space_v_ovoo = data[igpu].cc_space_v_ovoo;
// double* d_cc_space_f_oo = data[igpu].cc_space_f_oo;
double* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol;
double* d_cc_space_v_vv_chol = data[igpu].cc_space_v_vv_chol;
double* d_cc_space_v_oovo = data[igpu].cc_space_v_oovo;
double* d_cc_space_v_ovov = data[igpu].cc_space_v_ovov;
double* d_cc_space_v_voov = data[igpu].cc_space_v_voov;
double* d_cc_space_f_ov = data[igpu].cc_space_f_ov;
double* d_cc_space_f_vo = data[igpu].cc_space_f_vo;
// double* d_cc_space_f_vv = data[igpu].cc_space_f_vv;
// double* d_tau = data[igpu].tau;
double* d_tau = data[igpu].tau;
double* d_t1 = data[igpu].t1;
double* d_t2 = data[igpu].t2;
double* d_H_oo = data[igpu].H_oo;
@ -1776,8 +1773,140 @@ void compute_r1_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
#pragma omp section
{
double* d_X_ovov;
cudaMalloc((void **)&d_X_ovov, nO* nV* nO* nV * sizeof(double));
cublasDcopy(handle, nO*nV*nO*nV, d_cc_space_v_ovov, 1, d_X_ovov, 1);
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
alpha = -1.0;
beta = 2.0;
for (int u=0 ; u<nO ; ++u) {
for (int bet=0 ; bet<nV ; ++bet) {
cublasSetStream(handle, stream[bet]);
A = &(d_X_ovov[nO*nV*(u+nO*bet)]); lda = nO;
B = &(d_cc_space_v_voov[(nV*(u+nO*nO*bet))]); ldb = nV*nO;
C = A ; ldc = lda;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nO, nV, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
}
for (int i=0 ; i<nV ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
alpha = 1.0;
beta = 1.0;
m=nO*nV; n=nO*nV;
A=d_X_ovov; lda=nO * nV;
B=d_t1; ldb=1;
C=d_r1; ldc=1;
cublasDgemv(handle, CUBLAS_OP_T, m, n, &alpha, A, lda, B, ldb, &beta, C, ldc);
cudaFree(d_X_ovov);
}
#pragma omp section
{
double* d_T_vvoo;
cudaMalloc((void **)&d_T_vvoo, nV*nV*nO*nO * sizeof(double));
alpha = 0.0;
beta = 1.0;
A = d_T_vvoo; lda = nV*nV;
B = d_tau; ldb = nO*nO;
C = A ; ldc = lda;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nV*nV, nO*nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
double* d_W_vvov;
cudaMalloc((void **)&d_W_vvov, nV*nV*nO*BLOCK_SIZE * sizeof(double));
double* d_W_vvov_tmp;
cudaMalloc((void **)&d_W_vvov_tmp, nV*nO*nV*BLOCK_SIZE * sizeof(double));
for (int iblock=0 ; iblock<nV ; iblock += BLOCK_SIZE) {
const int mbs = BLOCK_SIZE < nV-iblock ? BLOCK_SIZE : nV-iblock;
alpha = 1.0;
beta = 0.0;
m=nV*nO; n=nV*mbs; k=cholesky_mo_num;
A=d_cc_space_v_vo_chol; lda=cholesky_mo_num;
B=&(d_cc_space_v_vv_chol[cholesky_mo_num*nV*iblock]); ldb=cholesky_mo_num;
C=d_W_vvov_tmp; ldc=nV*nO;
cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
alpha = 2.0;
beta = -1.0;
int kk=0;
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
for (int i=0 ; i<nO ; ++i) {
for (int bet=0 ; bet<mbs ; ++bet) {
cublasSetStream(handle, stream[kk]);
++kk;
if (kk >= nV) kk = 0;
A = &(d_W_vvov_tmp[nV*(i+nO*nV*bet)]); lda = nV*nO;
B = &(d_W_vvov_tmp[nV*(i+nO*nV*bet)]); ldb = nV*nO;
C = &(d_W_vvov[nV*nV*(i+nO*bet)]); ldc = nV;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nV, nV, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
}
for (int i=0 ; i<nV ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
alpha = 1.0;
beta = 1.0;
m=nO; n=mbs; k=nO*nV*nV;
A=d_T_vvoo; lda=nV*nV*nO;
B=d_W_vvov; ldb=nO*nV*nV;
C=&(d_r1[nO*iblock]); ldc=nO;
cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
}
cudaFree(d_W_vvov);
cudaFree(d_W_vvov_tmp);
cudaFree(d_T_vvoo);
}
#pragma omp section
{
double* d_W_oovo;
cudaMalloc((void **)&d_W_oovo, nO*nO*nV*nO * sizeof(double));
alpha = 2.0;
beta = -1.0;
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
for (int u=0 ; u<nO ; ++u) {
for (int a=0 ; a<nV ; ++a) {
cublasSetStream(handle, stream[a]);
A = &(d_cc_space_v_oovo[nO*nO*(a+nV*u)]); lda = nO;
B = &(d_cc_space_v_oovo[nO*nO*(a+nV*u)]); ldb = nO;
C = &(d_W_oovo[nO*nO*(a+nV*u)]); ldc = nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
}
for (int i=0 ; i<nV ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
alpha = -1.0;
beta = 1.0;
m=nO; n=nV; k=nO*nO*nV;
A=d_W_oovo; lda=nO * nO * nV;
B=d_tau; ldb=nO * nO * nV;
C=d_r1; ldc=nO;
cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
}
}
double * r1_tmp = malloc(nO*nV*sizeof(double));
@ -1786,7 +1915,7 @@ void compute_r1_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
#pragma omp critical
{
for (size_t i=0 ; i<(size_t) nO*nV ; ++i) {
r1[i] += r1_tmp[i];
r1[i] -= r1_tmp[i];
}
}
free(r1_tmp);

View File

@ -5,6 +5,7 @@ typedef struct {
double* cc_space_v_vv_chol;
double* cc_space_v_oooo;
double* cc_space_v_vooo;
double* cc_space_v_voov;
double* cc_space_v_oovv;
double* cc_space_v_vvoo;
double* cc_space_v_oovo;

View File

@ -10,7 +10,7 @@ gpu_data* gpu_init(
int nO, int nV, int cholesky_mo_num,
double* cc_space_v_oo_chol, double* cc_space_v_ov_chol,
double* cc_space_v_vo_chol, double* cc_space_v_vv_chol,
double* cc_space_v_oooo, double* cc_space_v_vooo,
double* cc_space_v_oooo, double* cc_space_v_vooo, double* cc_space_v_voov,
double* cc_space_v_oovv, double* cc_space_v_vvoo,
double* cc_space_v_oovo, double* cc_space_v_ovvo,
double* cc_space_v_ovov, double* cc_space_v_ovoo,
@ -59,6 +59,10 @@ gpu_data* gpu_init(
cudaMalloc((void**)&d_cc_space_v_vooo, nV*nO*nO*nO*sizeof(double));
cublasSetMatrix(nV*nO, nO*nO, sizeof(double), cc_space_v_vooo, nV*nO, d_cc_space_v_vooo, nV*nO);
double* d_cc_space_v_voov;
cudaMalloc((void**)&d_cc_space_v_voov, nV*nO*nO*nV*sizeof(double));
cublasSetMatrix(nV*nO, nO*nV, sizeof(double), cc_space_v_voov, nV*nO, d_cc_space_v_voov, nV*nO);
double* d_cc_space_v_oovv;
cudaMalloc((void**)&d_cc_space_v_oovv, nO*nO*nV*nV*sizeof(double));
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), cc_space_v_oovv, nO*nO, d_cc_space_v_oovv, nO*nO);
@ -132,6 +136,7 @@ gpu_data* gpu_init(
data[igpu].cc_space_v_vv_chol = d_cc_space_v_vv_chol;
data[igpu].cc_space_v_oooo = d_cc_space_v_oooo;
data[igpu].cc_space_v_vooo = d_cc_space_v_vooo;
data[igpu].cc_space_v_voov = d_cc_space_v_voov;
data[igpu].cc_space_v_oovv = d_cc_space_v_oovv;
data[igpu].cc_space_v_vvoo = d_cc_space_v_vvoo;
data[igpu].cc_space_v_oovo = d_cc_space_v_oovo;

View File

@ -6,7 +6,7 @@ module gpu_module
interface
type(c_ptr) function gpu_init(nO, nV, cholesky_mo_num, &
cc_space_v_oo_chol, cc_space_v_ov_chol, cc_space_v_vo_chol, cc_space_v_vv_chol, &
cc_space_v_oooo, cc_space_v_vooo, cc_space_v_oovv, cc_space_v_vvoo, &
cc_space_v_oooo, cc_space_v_vooo, cc_space_v_voov, cc_space_v_oovv, cc_space_v_vvoo, &
cc_space_v_oovo, cc_space_v_ovvo, cc_space_v_ovov, cc_space_v_ovoo, &
cc_space_f_oo, cc_space_f_ov, cc_space_f_vo, cc_space_f_vv) bind(C)
import c_int, c_double, c_ptr
@ -17,6 +17,7 @@ module gpu_module
real(c_double), intent(in) :: cc_space_v_vv_chol(cholesky_mo_num,nV,nV)
real(c_double), intent(in) :: cc_space_v_oooo(nO,nO,nO,nO)
real(c_double), intent(in) :: cc_space_v_vooo(nV,nO,nO,nO)
real(c_double), intent(in) :: cc_space_v_voov(nV,nO,nO,nV)
real(c_double), intent(in) :: cc_space_v_oovv(nO,nO,nV,nV)
real(c_double), intent(in) :: cc_space_v_vvoo(nV,nV,nO,nO)
real(c_double), intent(in) :: cc_space_v_oovo(nO,nO,nV,nO)
@ -39,25 +40,22 @@ module gpu_module
real(c_double), intent(in) :: tau_x(nO,nO,nV,nV)
end subroutine
subroutine compute_H_oo_chol_gpu(gpu_data, nO, nV, igpu, H_oo) bind(C)
subroutine compute_H_oo_chol_gpu(gpu_data, igpu) bind(C)
import c_int, c_double, c_ptr
type(c_ptr), value :: gpu_data
integer(c_int), intent(in), value :: nO, nV, igpu
real(c_double), intent(out) :: H_oo(nO,nO)
integer(c_int), intent(in), value :: igpu
end subroutine
subroutine compute_H_vo_chol_gpu(gpu_data, nO, nV, igpu, H_vo) bind(C)
subroutine compute_H_vo_chol_gpu(gpu_data, igpu) bind(C)
import c_int, c_double, c_ptr
type(c_ptr), value :: gpu_data
integer(c_int), intent(in), value :: nO, nV, igpu
real(c_double), intent(out) :: H_vo(nV,nO)
integer(c_int), intent(in), value :: igpu
end subroutine
subroutine compute_H_vv_chol_gpu(gpu_data, nO, nV, igpu, H_vv) bind(C)
subroutine compute_H_vv_chol_gpu(gpu_data, igpu) bind(C)
import c_int, c_double, c_ptr
type(c_ptr), value :: gpu_data
integer(c_int), intent(in), value :: nO, nV, igpu
real(c_double), intent(out) :: H_vv(nO,nO)
integer(c_int), intent(in), value :: igpu
end subroutine
subroutine compute_r1_space_chol_gpu(gpu_data, nO, nV, t1, r1, max_r1) bind(C)