mirror of
https://gitlab.com/scemama/qp_plugins_scemama.git
synced 2024-12-22 12:23:37 +01:00
Added gpu_dgemm
This commit is contained in:
parent
0c6e6a1ca0
commit
5cec1b8a0c
@ -11,6 +11,53 @@ void dgemm_(char*, char*, int*, int*, int*, double*, double*, int*, double*, int
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
void gpu_dgemm(char transa, char transb, int m, int n, int k, double alpha,
|
||||||
|
double* A, int lda, double* B, int ldb, double beta, double* C, int ldc)
|
||||||
|
{
|
||||||
|
cublasHandle_t handle;
|
||||||
|
cublasCreate(&handle);
|
||||||
|
|
||||||
|
double * d_A;
|
||||||
|
double * d_B;
|
||||||
|
double * d_C;
|
||||||
|
cublasOperation_t ta, tb;
|
||||||
|
|
||||||
|
if (transa == 'N') {
|
||||||
|
cudaMalloc((void**)&d_A, lda*k*sizeof(double));
|
||||||
|
cublasSetMatrix(m, k, sizeof(double), A, lda, d_A, lda);
|
||||||
|
ta = CUBLAS_OP_N;
|
||||||
|
} else {
|
||||||
|
cudaMalloc((void**)&d_A, lda*m*sizeof(double));
|
||||||
|
cublasSetMatrix(k, m, sizeof(double), A, lda, d_A, lda);
|
||||||
|
ta = CUBLAS_OP_T;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (transb == 'N') {
|
||||||
|
cudaMalloc((void**)&d_B, ldb*n*sizeof(double));
|
||||||
|
cublasSetMatrix(k, n, sizeof(double), B, ldb, d_B, ldb);
|
||||||
|
tb = CUBLAS_OP_N;
|
||||||
|
} else {
|
||||||
|
cudaMalloc((void**)&d_B, ldb*k*sizeof(double));
|
||||||
|
cublasSetMatrix(n, k, sizeof(double), B, ldb, d_B, ldb);
|
||||||
|
tb = CUBLAS_OP_T;
|
||||||
|
}
|
||||||
|
|
||||||
|
cudaMalloc((void**)&d_C, ldc*n*sizeof(double));
|
||||||
|
if (beta != 0.) {
|
||||||
|
cublasSetMatrix(m, n, sizeof(double), C, ldc, d_C, ldc);
|
||||||
|
}
|
||||||
|
|
||||||
|
cublasDgemm(handle, ta, tb, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, ldc);
|
||||||
|
|
||||||
|
cublasGetMatrix(m, n, sizeof(double), d_C, ldc, C, ldc);
|
||||||
|
|
||||||
|
cudaFree(d_A);
|
||||||
|
cudaFree(d_B);
|
||||||
|
cudaFree(d_C);
|
||||||
|
cublasDestroy(handle);
|
||||||
|
}
|
||||||
|
|
||||||
void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo_num,
|
void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo_num,
|
||||||
double* t1,
|
double* t1,
|
||||||
double* tau,
|
double* tau,
|
||||||
@ -35,7 +82,6 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
|
|||||||
double* d_r2;
|
double* d_r2;
|
||||||
lda = nO * nO;
|
lda = nO * nO;
|
||||||
cudaMalloc((void **)&d_r2, lda * nV * nV * sizeof(double));
|
cudaMalloc((void **)&d_r2, lda * nV * nV * sizeof(double));
|
||||||
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), r2, lda, d_r2, lda);
|
|
||||||
|
|
||||||
double* d_cc_space_v_vv_chol;
|
double* d_cc_space_v_vv_chol;
|
||||||
lda = cholesky_mo_num * nV;
|
lda = cholesky_mo_num * nV;
|
||||||
@ -60,9 +106,13 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
|
|||||||
m=cholesky_mo_num*nV; n=nV; k=nO;
|
m=cholesky_mo_num*nV; n=nV; k=nO;
|
||||||
A = d_cc_space_v_vo_chol; B = d_t1; C = d_tmp_cc;
|
A = d_cc_space_v_vo_chol; B = d_t1; C = d_tmp_cc;
|
||||||
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, m, B, k, &beta, C, m);
|
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, m, B, k, &beta, C, m);
|
||||||
|
cublasDestroy(handle);
|
||||||
|
|
||||||
#pragma omp parallel
|
#pragma omp parallel
|
||||||
{
|
{
|
||||||
|
cublasHandle_t handle;
|
||||||
|
cublasCreate(&handle);
|
||||||
|
|
||||||
double* d_tmp_cc2;
|
double* d_tmp_cc2;
|
||||||
cudaMalloc((void **)&d_tmp_cc2, cholesky_mo_num*nV*sizeof(double));
|
cudaMalloc((void **)&d_tmp_cc2, cholesky_mo_num*nV*sizeof(double));
|
||||||
|
|
||||||
@ -126,42 +176,26 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
|
|||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
cublasDestroy(handle);
|
||||||
}
|
}
|
||||||
lda=nO*nO;
|
|
||||||
cublasGetMatrix(nO*nO, nV*nV, sizeof(double), d_r2, lda, r2, lda);
|
|
||||||
|
|
||||||
cudaFree(d_cc_space_v_vo_chol);
|
cudaFree(d_cc_space_v_vo_chol);
|
||||||
cudaFree(d_cc_space_v_vv_chol);
|
cudaFree(d_cc_space_v_vv_chol);
|
||||||
cudaFree(d_tau);
|
cudaFree(d_tau);
|
||||||
cudaFree(d_t1);
|
cudaFree(d_t1);
|
||||||
cudaFree(d_tmp_cc);
|
cudaFree(d_tmp_cc);
|
||||||
cudaFree(d_r2);
|
|
||||||
cublasDestroy(handle);
|
|
||||||
|
|
||||||
|
double * r2_tmp = malloc(nO*nO*nV*nV*sizeof(double));
|
||||||
|
lda=nO*nO;
|
||||||
|
cublasGetMatrix(nO*nO, nV*nV, sizeof(double), d_r2, lda, r2_tmp, lda);
|
||||||
|
for (size_t i=0 ; i<nO*nO*nV*nV ; ++i) {
|
||||||
|
r2[i] += r2_tmp[i];
|
||||||
|
}
|
||||||
|
free(r2_tmp);
|
||||||
|
|
||||||
|
cudaFree(d_r2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
|
||||||
|
|
||||||
cublasHandle_t handle;
|
|
||||||
cublasCreate(&handle);
|
|
||||||
|
|
||||||
double *d_A, *d_B, *d_C;
|
|
||||||
cudaMalloc((void **)&d_A, m * k * sizeof(double));
|
|
||||||
cudaMalloc((void **)&d_B, k * n * sizeof(double));
|
|
||||||
cudaMalloc((void **)&d_C, m * n * sizeof(double));
|
|
||||||
|
|
||||||
cublasSetMatrix(m, k, sizeof(double), A, m, d_A, m);
|
|
||||||
cublasSetMatrix(k, n, sizeof(double), B, k, d_B, k);
|
|
||||||
cublasSetMatrix(m, n, sizeof(double), C, m, d_C, m);
|
|
||||||
|
|
||||||
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, d_A, m, d_B, k, &beta, d_C, m);
|
|
||||||
|
|
||||||
cublasGetMatrix(m, n, sizeof(double), d_C, m, C, m);
|
|
||||||
|
|
||||||
cudaFree(d_A);
|
|
||||||
cudaFree(d_B);
|
|
||||||
cudaFree(d_C);
|
|
||||||
cublasDestroy(handle);
|
|
||||||
|
|
||||||
*/
|
|
||||||
|
@ -15,37 +15,13 @@ module gpu_module
|
|||||||
real(c_double), intent(out) :: r2(nO,nO,nV,nV)
|
real(c_double), intent(out) :: r2(nO,nO,nV,nV)
|
||||||
end subroutine
|
end subroutine
|
||||||
|
|
||||||
subroutine gemm0(nO, nV, cholesky_mo_num, cc_space_v_vo_chol, t1, tmp_cc) bind(C, name="gemm0")
|
subroutine gpu_dgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) bind(C)
|
||||||
import c_int, c_double
|
import c_int, c_double, c_char
|
||||||
integer(c_int), value :: nO, nV, cholesky_mo_num
|
character(c_char), value :: transa, transb
|
||||||
real(c_double) :: cc_space_v_vo_chol(cholesky_mo_num,nV,nO)
|
integer(c_int), value :: m,n,k,lda,ldb,ldc
|
||||||
real(c_double) :: t1(nO,nV)
|
real(c_double), value :: alpha, beta
|
||||||
real(c_double) :: tmp_cc(cholesky_mo_num,nV,nV)
|
real(c_double) :: A(lda,*), B(ldb,*), C(ldc,*)
|
||||||
end subroutine gemm0
|
end subroutine
|
||||||
|
|
||||||
subroutine gemm1(iblock, nV, cholesky_mo_num, tmp_cc, cc_space_v_vv_chol_, tmpB1) bind(C, name="gemm1")
|
|
||||||
import c_int, c_double
|
|
||||||
integer(c_int), value :: iblock, nV, cholesky_mo_num
|
|
||||||
real(c_double) :: tmp_cc(cholesky_mo_num,nV,nV)
|
|
||||||
real(c_double) :: cc_space_v_vv_chol_(cholesky_mo_num,nV)
|
|
||||||
real(c_double) :: tmpB1(nV,16,nV)
|
|
||||||
end subroutine gemm1
|
|
||||||
|
|
||||||
subroutine gemm2(iblock, nV, cholesky_mo_num, tmp_cc2, cc_space_v_vv_chol, tmpB1) bind(C, name="gemm2")
|
|
||||||
import c_int, c_double
|
|
||||||
integer(c_int), value :: iblock, nV, cholesky_mo_num
|
|
||||||
real(c_double) :: tmp_cc2(cholesky_mo_num,nV)
|
|
||||||
real(c_double) :: cc_space_v_vv_chol(cholesky_mo_num,nV,nV)
|
|
||||||
real(c_double) :: tmpB1(nV,16,nV)
|
|
||||||
end subroutine gemm2
|
|
||||||
|
|
||||||
subroutine gemm3(iblock, nO, nV, gam, tau, B1, r2) bind(C, name="gemm3")
|
|
||||||
import c_int, c_double
|
|
||||||
integer(c_int), value :: iblock, nO, nV, gam
|
|
||||||
real(c_double) :: tau(nO,nO,nV,nV)
|
|
||||||
real(c_double) :: B1(nV,nV,*)
|
|
||||||
real(c_double) :: r2(nO,nO,nV,nV)
|
|
||||||
end subroutine gemm3
|
|
||||||
|
|
||||||
end interface
|
end interface
|
||||||
|
|
||||||
|
1679
devel/ccsd_gpu/mo_integrals_cc.irp.f
Normal file
1679
devel/ccsd_gpu/mo_integrals_cc.irp.f
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user