1
0
mirror of https://gitlab.com/scemama/qp_plugins_scemama.git synced 2024-12-22 12:23:37 +01:00

Added gpu_dgemm

This commit is contained in:
Anthony Scemama 2023-07-16 15:39:37 +02:00
parent 0c6e6a1ca0
commit 5cec1b8a0c
3 changed files with 1747 additions and 58 deletions

View File

@ -11,6 +11,53 @@ void dgemm_(char*, char*, int*, int*, int*, double*, double*, int*, double*, int
void gpu_dgemm(char transa, char transb, int m, int n, int k, double alpha,
double* A, int lda, double* B, int ldb, double beta, double* C, int ldc)
{
cublasHandle_t handle;
cublasCreate(&handle);
double * d_A;
double * d_B;
double * d_C;
cublasOperation_t ta, tb;
if (transa == 'N') {
cudaMalloc((void**)&d_A, lda*k*sizeof(double));
cublasSetMatrix(m, k, sizeof(double), A, lda, d_A, lda);
ta = CUBLAS_OP_N;
} else {
cudaMalloc((void**)&d_A, lda*m*sizeof(double));
cublasSetMatrix(k, m, sizeof(double), A, lda, d_A, lda);
ta = CUBLAS_OP_T;
}
if (transb == 'N') {
cudaMalloc((void**)&d_B, ldb*n*sizeof(double));
cublasSetMatrix(k, n, sizeof(double), B, ldb, d_B, ldb);
tb = CUBLAS_OP_N;
} else {
cudaMalloc((void**)&d_B, ldb*k*sizeof(double));
cublasSetMatrix(n, k, sizeof(double), B, ldb, d_B, ldb);
tb = CUBLAS_OP_T;
}
cudaMalloc((void**)&d_C, ldc*n*sizeof(double));
if (beta != 0.) {
cublasSetMatrix(m, n, sizeof(double), C, ldc, d_C, ldc);
}
cublasDgemm(handle, ta, tb, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, ldc);
cublasGetMatrix(m, n, sizeof(double), d_C, ldc, C, ldc);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
cublasDestroy(handle);
}
void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo_num, void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo_num,
double* t1, double* t1,
double* tau, double* tau,
@ -35,7 +82,6 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
double* d_r2; double* d_r2;
lda = nO * nO; lda = nO * nO;
cudaMalloc((void **)&d_r2, lda * nV * nV * sizeof(double)); cudaMalloc((void **)&d_r2, lda * nV * nV * sizeof(double));
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), r2, lda, d_r2, lda);
double* d_cc_space_v_vv_chol; double* d_cc_space_v_vv_chol;
lda = cholesky_mo_num * nV; lda = cholesky_mo_num * nV;
@ -60,9 +106,13 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
m=cholesky_mo_num*nV; n=nV; k=nO; m=cholesky_mo_num*nV; n=nV; k=nO;
A = d_cc_space_v_vo_chol; B = d_t1; C = d_tmp_cc; A = d_cc_space_v_vo_chol; B = d_t1; C = d_tmp_cc;
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, m, B, k, &beta, C, m); cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, m, B, k, &beta, C, m);
cublasDestroy(handle);
#pragma omp parallel #pragma omp parallel
{ {
cublasHandle_t handle;
cublasCreate(&handle);
double* d_tmp_cc2; double* d_tmp_cc2;
cudaMalloc((void **)&d_tmp_cc2, cholesky_mo_num*nV*sizeof(double)); cudaMalloc((void **)&d_tmp_cc2, cholesky_mo_num*nV*sizeof(double));
@ -126,42 +176,26 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
} }
} }
cublasDestroy(handle);
} }
lda=nO*nO;
cublasGetMatrix(nO*nO, nV*nV, sizeof(double), d_r2, lda, r2, lda);
cudaFree(d_cc_space_v_vo_chol); cudaFree(d_cc_space_v_vo_chol);
cudaFree(d_cc_space_v_vv_chol); cudaFree(d_cc_space_v_vv_chol);
cudaFree(d_tau); cudaFree(d_tau);
cudaFree(d_t1); cudaFree(d_t1);
cudaFree(d_tmp_cc); cudaFree(d_tmp_cc);
cudaFree(d_r2);
cublasDestroy(handle);
double * r2_tmp = malloc(nO*nO*nV*nV*sizeof(double));
lda=nO*nO;
cublasGetMatrix(nO*nO, nV*nV, sizeof(double), d_r2, lda, r2_tmp, lda);
for (size_t i=0 ; i<nO*nO*nV*nV ; ++i) {
r2[i] += r2_tmp[i];
}
free(r2_tmp);
cudaFree(d_r2);
} }
/*
cublasHandle_t handle;
cublasCreate(&handle);
double *d_A, *d_B, *d_C;
cudaMalloc((void **)&d_A, m * k * sizeof(double));
cudaMalloc((void **)&d_B, k * n * sizeof(double));
cudaMalloc((void **)&d_C, m * n * sizeof(double));
cublasSetMatrix(m, k, sizeof(double), A, m, d_A, m);
cublasSetMatrix(k, n, sizeof(double), B, k, d_B, k);
cublasSetMatrix(m, n, sizeof(double), C, m, d_C, m);
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, d_A, m, d_B, k, &beta, d_C, m);
cublasGetMatrix(m, n, sizeof(double), d_C, m, C, m);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
cublasDestroy(handle);
*/

View File

@ -15,37 +15,13 @@ module gpu_module
real(c_double), intent(out) :: r2(nO,nO,nV,nV) real(c_double), intent(out) :: r2(nO,nO,nV,nV)
end subroutine end subroutine
subroutine gemm0(nO, nV, cholesky_mo_num, cc_space_v_vo_chol, t1, tmp_cc) bind(C, name="gemm0") subroutine gpu_dgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) bind(C)
import c_int, c_double import c_int, c_double, c_char
integer(c_int), value :: nO, nV, cholesky_mo_num character(c_char), value :: transa, transb
real(c_double) :: cc_space_v_vo_chol(cholesky_mo_num,nV,nO) integer(c_int), value :: m,n,k,lda,ldb,ldc
real(c_double) :: t1(nO,nV) real(c_double), value :: alpha, beta
real(c_double) :: tmp_cc(cholesky_mo_num,nV,nV) real(c_double) :: A(lda,*), B(ldb,*), C(ldc,*)
end subroutine gemm0 end subroutine
subroutine gemm1(iblock, nV, cholesky_mo_num, tmp_cc, cc_space_v_vv_chol_, tmpB1) bind(C, name="gemm1")
import c_int, c_double
integer(c_int), value :: iblock, nV, cholesky_mo_num
real(c_double) :: tmp_cc(cholesky_mo_num,nV,nV)
real(c_double) :: cc_space_v_vv_chol_(cholesky_mo_num,nV)
real(c_double) :: tmpB1(nV,16,nV)
end subroutine gemm1
subroutine gemm2(iblock, nV, cholesky_mo_num, tmp_cc2, cc_space_v_vv_chol, tmpB1) bind(C, name="gemm2")
import c_int, c_double
integer(c_int), value :: iblock, nV, cholesky_mo_num
real(c_double) :: tmp_cc2(cholesky_mo_num,nV)
real(c_double) :: cc_space_v_vv_chol(cholesky_mo_num,nV,nV)
real(c_double) :: tmpB1(nV,16,nV)
end subroutine gemm2
subroutine gemm3(iblock, nO, nV, gam, tau, B1, r2) bind(C, name="gemm3")
import c_int, c_double
integer(c_int), value :: iblock, nO, nV, gam
real(c_double) :: tau(nO,nO,nV,nV)
real(c_double) :: B1(nV,nV,*)
real(c_double) :: r2(nO,nO,nV,nV)
end subroutine gemm3
end interface end interface

File diff suppressed because it is too large Load Diff