mirror of
https://gitlab.com/scemama/qp_plugins_scemama.git
synced 2025-01-03 01:55:52 +01:00
Upload integrals only once
This commit is contained in:
parent
cf980f0fae
commit
5da8e2cba4
@ -1,5 +1,5 @@
|
|||||||
subroutine run_ccsd_space_orb
|
subroutine run_ccsd_space_orb
|
||||||
|
use gpu_module
|
||||||
implicit none
|
implicit none
|
||||||
|
|
||||||
integer :: i,j,k,l,a,b,c,d,tmp_a,tmp_b,tmp_c,tmp_d
|
integer :: i,j,k,l,a,b,c,d,tmp_a,tmp_b,tmp_c,tmp_d
|
||||||
@ -115,6 +115,13 @@ subroutine run_ccsd_space_orb
|
|||||||
write(*,'(A77)') ' -----------------------------------------------------------------------------'
|
write(*,'(A77)') ' -----------------------------------------------------------------------------'
|
||||||
call wall_time(ta)
|
call wall_time(ta)
|
||||||
|
|
||||||
|
type(c_ptr) :: gpu_data
|
||||||
|
|
||||||
|
gpu_data = gpu_init(nO, nV, cholesky_mo_num, &
|
||||||
|
cc_space_v_oo_chol, cc_space_v_ov_chol, cc_space_v_vo_chol, cc_space_v_vv_chol, &
|
||||||
|
cc_space_v_oooo, cc_space_v_vooo, cc_space_v_oovv, cc_space_v_vvoo, &
|
||||||
|
cc_space_v_oovo, cc_space_v_ovvo, cc_space_v_ovoo, cc_space_f_vo)
|
||||||
|
|
||||||
do while (not_converged)
|
do while (not_converged)
|
||||||
|
|
||||||
! Residue
|
! Residue
|
||||||
@ -125,7 +132,7 @@ subroutine run_ccsd_space_orb
|
|||||||
call compute_H_vo_chol(nO,nV,t1,H_vo)
|
call compute_H_vo_chol(nO,nV,t1,H_vo)
|
||||||
|
|
||||||
call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
|
call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
|
||||||
call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
|
call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2,gpu_data)
|
||||||
else
|
else
|
||||||
call compute_H_oo(nO,nV,t1,t2,tau,H_oo)
|
call compute_H_oo(nO,nV,t1,t2,tau,H_oo)
|
||||||
call compute_H_vv(nO,nV,t1,t2,tau,H_vv)
|
call compute_H_vv(nO,nV,t1,t2,tau,H_vv)
|
||||||
|
@ -441,7 +441,7 @@ end
|
|||||||
|
|
||||||
! R2
|
! R2
|
||||||
|
|
||||||
subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
|
subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2,gpu_data)
|
||||||
use gpu_module
|
use gpu_module
|
||||||
implicit none
|
implicit none
|
||||||
|
|
||||||
@ -449,6 +449,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
|
|||||||
integer, intent(in) :: nO, nV
|
integer, intent(in) :: nO, nV
|
||||||
double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV), tau(nO,nO,nV,nV)
|
double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV), tau(nO,nO,nV,nV)
|
||||||
double precision, intent(in) :: H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO)
|
double precision, intent(in) :: H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO)
|
||||||
|
type(c_ptr), intent(in) :: gpu_data
|
||||||
|
|
||||||
! out
|
! out
|
||||||
double precision, intent(out) :: r2(nO,nO,nV,nV), max_r2
|
double precision, intent(out) :: r2(nO,nO,nV,nV), max_r2
|
||||||
@ -493,11 +494,8 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
|
|||||||
allocate(J1(nO,nV,nV,nO))
|
allocate(J1(nO,nV,nV,nO))
|
||||||
J1 = 0.d0
|
J1 = 0.d0
|
||||||
|
|
||||||
call compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num,t1,t2,tau, &
|
call compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num,gpu_data,t1,t2,tau, &
|
||||||
cc_space_v_oo_chol, cc_space_v_ov_chol, cc_space_v_vo_chol, cc_space_v_vv_chol, &
|
H_vv, g_occ, J1, r2)
|
||||||
cc_space_v_oooo, cc_space_v_vooo, cc_space_v_oovv, cc_space_v_vvoo, &
|
|
||||||
cc_space_v_oovo, cc_space_v_ovvo, cc_space_v_ovoo, &
|
|
||||||
cc_space_f_vo, H_vv, g_occ, J1, r2)
|
|
||||||
|
|
||||||
|
|
||||||
!---
|
!---
|
||||||
@ -747,6 +745,8 @@ subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvoo,J1)
|
|||||||
|
|
||||||
double precision, allocatable :: tmp_cc(:,:,:), J1_tmp(:,:,:,:)
|
double precision, allocatable :: tmp_cc(:,:,:), J1_tmp(:,:,:,:)
|
||||||
|
|
||||||
|
!TODO: I am here
|
||||||
|
|
||||||
!- cc_space_v_vvoo(a,b,i,j) * (0.5d0 * t2(u,j,b,beta) + t1(u,b) * t1(j,beta)) &
|
!- cc_space_v_vvoo(a,b,i,j) * (0.5d0 * t2(u,j,b,beta) + t1(u,b) * t1(j,beta)) &
|
||||||
double precision, allocatable :: X_voov(:,:,:,:), Z_ovvo(:,:,:,:)
|
double precision, allocatable :: X_voov(:,:,:,:), Z_ovvo(:,:,:,:)
|
||||||
allocate(X_voov(nV,nO,nO,nV), Z_ovvo(nO,nV,nV,nO))
|
allocate(X_voov(nV,nO,nO,nV), Z_ovvo(nO,nV,nV,nO))
|
||||||
|
@ -4,85 +4,25 @@
|
|||||||
#include <omp.h>
|
#include <omp.h>
|
||||||
#include <cublas_v2.h>
|
#include <cublas_v2.h>
|
||||||
#include <cuda_runtime.h>
|
#include <cuda_runtime.h>
|
||||||
|
#include "gpu.h"
|
||||||
|
|
||||||
#define BLOCK_SIZE 16
|
#define BLOCK_SIZE 16
|
||||||
|
|
||||||
void dgemm_(char*, char*, int*, int*, int*, double*, double*, int*, double*, int*,
|
|
||||||
double*, double*, int*);
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
void gpu_dgemm(char transa, char transb, int m, int n, int k, double alpha,
|
|
||||||
double* A, int lda, double* B, int ldb, double beta, double* C, int ldc)
|
|
||||||
{
|
|
||||||
cublasHandle_t handle;
|
|
||||||
cublasCreate(&handle);
|
|
||||||
|
|
||||||
double * d_A;
|
|
||||||
double * d_B;
|
|
||||||
double * d_C;
|
|
||||||
cublasOperation_t ta, tb;
|
|
||||||
|
|
||||||
if (transa == 'N') {
|
|
||||||
cudaMalloc((void**)&d_A, lda*k*sizeof(double));
|
|
||||||
cublasSetMatrix(m, k, sizeof(double), A, lda, d_A, lda);
|
|
||||||
ta = CUBLAS_OP_N;
|
|
||||||
} else {
|
|
||||||
cudaMalloc((void**)&d_A, lda*m*sizeof(double));
|
|
||||||
cublasSetMatrix(k, m, sizeof(double), A, lda, d_A, lda);
|
|
||||||
ta = CUBLAS_OP_T;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (transb == 'N') {
|
|
||||||
cudaMalloc((void**)&d_B, ldb*n*sizeof(double));
|
|
||||||
cublasSetMatrix(k, n, sizeof(double), B, ldb, d_B, ldb);
|
|
||||||
tb = CUBLAS_OP_N;
|
|
||||||
} else {
|
|
||||||
cudaMalloc((void**)&d_B, ldb*k*sizeof(double));
|
|
||||||
cublasSetMatrix(n, k, sizeof(double), B, ldb, d_B, ldb);
|
|
||||||
tb = CUBLAS_OP_T;
|
|
||||||
}
|
|
||||||
|
|
||||||
cudaMalloc((void**)&d_C, ldc*n*sizeof(double));
|
|
||||||
if (beta != 0.) {
|
|
||||||
cublasSetMatrix(m, n, sizeof(double), C, ldc, d_C, ldc);
|
|
||||||
}
|
|
||||||
|
|
||||||
cublasDgemm(handle, ta, tb, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, ldc);
|
|
||||||
|
|
||||||
cublasGetMatrix(m, n, sizeof(double), d_C, ldc, C, ldc);
|
|
||||||
|
|
||||||
cudaFree(d_A);
|
|
||||||
cudaFree(d_B);
|
|
||||||
cudaFree(d_C);
|
|
||||||
cublasDestroy(handle);
|
|
||||||
}
|
|
||||||
|
|
||||||
void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo_num,
|
void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo_num,
|
||||||
|
gpu_data* data,
|
||||||
double* t1,
|
double* t1,
|
||||||
double* t2,
|
double* t2,
|
||||||
double* tau,
|
double* tau,
|
||||||
double* cc_space_v_oo_chol,
|
|
||||||
double* cc_space_v_ov_chol,
|
|
||||||
double* cc_space_v_vo_chol,
|
|
||||||
double* cc_space_v_vv_chol,
|
|
||||||
double* cc_space_v_oooo,
|
|
||||||
double* cc_space_v_vooo,
|
|
||||||
double* cc_space_v_oovv,
|
|
||||||
double* cc_space_v_vvoo,
|
|
||||||
double* cc_space_v_oovo,
|
|
||||||
double* cc_space_v_ovvo,
|
|
||||||
double* cc_space_v_ovoo,
|
|
||||||
double* cc_space_f_vo,
|
|
||||||
double* H_vv,
|
double* H_vv,
|
||||||
double* g_occ,
|
double* g_occ,
|
||||||
double* J1,
|
double* J1,
|
||||||
double* r2)
|
double* r2)
|
||||||
{
|
{
|
||||||
|
|
||||||
int ngpus;
|
int ngpus = 1;
|
||||||
cudaGetDeviceCount(&ngpus);
|
cudaGetDeviceCount(&ngpus);
|
||||||
|
|
||||||
#pragma omp parallel num_threads(ngpus)
|
#pragma omp parallel num_threads(ngpus)
|
||||||
{
|
{
|
||||||
int m,n,k, lda, ldb, ldc;
|
int m,n,k, lda, ldb, ldc;
|
||||||
@ -91,20 +31,28 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
|
|||||||
double* B;
|
double* B;
|
||||||
double* C;
|
double* C;
|
||||||
|
|
||||||
int ithread = omp_get_thread_num();
|
int igpu = omp_get_thread_num();
|
||||||
int igpu = ithread ;
|
|
||||||
//igpu=1;
|
|
||||||
cudaSetDevice(igpu);
|
cudaSetDevice(igpu);
|
||||||
cublasHandle_t handle;
|
cublasHandle_t handle;
|
||||||
|
|
||||||
cublasCreate(&handle);
|
cublasCreate(&handle);
|
||||||
|
|
||||||
|
double* d_cc_space_v_oo_chol = data[igpu].cc_space_v_oo_chol;
|
||||||
|
double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;
|
||||||
|
double* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol;
|
||||||
|
double* d_cc_space_v_vv_chol = data[igpu].cc_space_v_vv_chol;
|
||||||
|
double* d_cc_space_v_oooo = data[igpu].cc_space_v_oooo;
|
||||||
|
double* d_cc_space_v_vooo = data[igpu].cc_space_v_vooo;
|
||||||
|
double* d_cc_space_v_oovv = data[igpu].cc_space_v_oovv;
|
||||||
|
double* d_cc_space_v_vvoo = data[igpu].cc_space_v_vvoo;
|
||||||
|
double* d_cc_space_v_oovo = data[igpu].cc_space_v_oovo;
|
||||||
|
double* d_cc_space_v_ovvo = data[igpu].cc_space_v_ovvo;
|
||||||
|
double* d_cc_space_v_ovoo = data[igpu].cc_space_v_ovoo;
|
||||||
|
double* d_cc_space_f_vo = data[igpu].cc_space_f_vo;
|
||||||
|
|
||||||
double* d_tau;
|
double* d_tau;
|
||||||
double* d_r2;
|
double* d_r2;
|
||||||
double* d_cc_space_v_oo_chol;
|
|
||||||
double* d_cc_space_v_ov_chol;
|
|
||||||
double* d_cc_space_v_vo_chol;
|
|
||||||
double* d_cc_space_v_vv_chol;
|
|
||||||
double* d_t1;
|
double* d_t1;
|
||||||
double* d_t2;
|
double* d_t2;
|
||||||
double* d_tmp_cc;
|
double* d_tmp_cc;
|
||||||
@ -118,27 +66,6 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
|
|||||||
memset(r2, 0, nO*nO*nV*nV*sizeof(double));
|
memset(r2, 0, nO*nO*nV*nV*sizeof(double));
|
||||||
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), r2, lda, d_r2, lda);
|
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), r2, lda, d_r2, lda);
|
||||||
|
|
||||||
lda = cholesky_mo_num * nV;
|
|
||||||
cudaMalloc((void **)&d_cc_space_v_vv_chol, lda * nV * sizeof(double));
|
|
||||||
cublasSetMatrix(cholesky_mo_num*nV, nV, sizeof(double), cc_space_v_vv_chol, lda, d_cc_space_v_vv_chol, lda);
|
|
||||||
|
|
||||||
lda = cholesky_mo_num * nO;
|
|
||||||
cudaMalloc((void **)&d_cc_space_v_oo_chol, lda * nO * sizeof(double));
|
|
||||||
cublasSetMatrix(cholesky_mo_num*nO, nO, sizeof(double), cc_space_v_oo_chol, lda, d_cc_space_v_oo_chol, lda);
|
|
||||||
|
|
||||||
lda = cholesky_mo_num * nO;
|
|
||||||
cudaMalloc((void **)&d_cc_space_v_ov_chol, lda * nV * sizeof(double));
|
|
||||||
cublasSetMatrix(cholesky_mo_num*nO, nV, sizeof(double), cc_space_v_ov_chol, lda, d_cc_space_v_ov_chol, lda);
|
|
||||||
|
|
||||||
lda = cholesky_mo_num * nV;
|
|
||||||
cudaMalloc((void **)&d_cc_space_v_vo_chol, lda * nO * sizeof(double));
|
|
||||||
cublasSetMatrix(cholesky_mo_num*nV, nO, sizeof(double), cc_space_v_vo_chol, lda, d_cc_space_v_vo_chol, lda);
|
|
||||||
|
|
||||||
double* d_cc_space_v_ovvo;
|
|
||||||
lda = nO*nV;
|
|
||||||
cudaMalloc((void **)&d_cc_space_v_ovvo, nO*nV*nV*nO * sizeof(double));
|
|
||||||
cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_ovvo, lda, d_cc_space_v_ovvo, lda);
|
|
||||||
|
|
||||||
lda = nO;
|
lda = nO;
|
||||||
cudaMalloc((void **)&d_t1, nO * nV * sizeof(double));
|
cudaMalloc((void **)&d_t1, nO * nV * sizeof(double));
|
||||||
cublasSetMatrix(nO, nV, sizeof(double), t1, lda, d_t1, lda);
|
cublasSetMatrix(nO, nV, sizeof(double), t1, lda, d_t1, lda);
|
||||||
@ -147,19 +74,11 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
|
|||||||
cudaMalloc((void **)&d_t2, nO*nO*nV*nV * sizeof(double));
|
cudaMalloc((void **)&d_t2, nO*nO*nV*nV * sizeof(double));
|
||||||
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), t2, lda, d_t2, lda);
|
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), t2, lda, d_t2, lda);
|
||||||
|
|
||||||
double* d_cc_space_f_vo;
|
|
||||||
cudaMalloc((void**)&d_cc_space_f_vo, nV*nO*sizeof(double));
|
|
||||||
cublasSetMatrix(nV, nO, sizeof(double), cc_space_f_vo, nV, d_cc_space_f_vo, nV);
|
|
||||||
|
|
||||||
#pragma omp sections
|
#pragma omp sections
|
||||||
{
|
{
|
||||||
|
|
||||||
#pragma omp section
|
#pragma omp section
|
||||||
{
|
{
|
||||||
double* d_cc_space_v_vooo;
|
|
||||||
cudaMalloc((void**)&d_cc_space_v_vooo, nV*nO*nO*nO*sizeof(double));
|
|
||||||
cublasSetMatrix(nV*nO, nO*nO, sizeof(double), cc_space_v_vooo, nV*nO, d_cc_space_v_vooo, nV*nO);
|
|
||||||
|
|
||||||
double* d_Y_oooo;
|
double* d_Y_oooo;
|
||||||
cudaMalloc((void**)&d_Y_oooo, nO*nO*nO*nO*sizeof(double));
|
cudaMalloc((void**)&d_Y_oooo, nO*nO*nO*nO*sizeof(double));
|
||||||
|
|
||||||
@ -170,15 +89,10 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
|
|||||||
B = d_cc_space_v_vooo ; ldb = nV;
|
B = d_cc_space_v_vooo ; ldb = nV;
|
||||||
C = d_Y_oooo; ldc = nO;
|
C = d_Y_oooo; ldc = nO;
|
||||||
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
|
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
|
||||||
cudaFree(d_cc_space_v_vooo);
|
|
||||||
|
|
||||||
double* d_A1;
|
double* d_A1;
|
||||||
cudaMalloc((void**)&d_A1, nO*nO*nO*nO*sizeof(double));
|
cudaMalloc((void**)&d_A1, nO*nO*nO*nO*sizeof(double));
|
||||||
|
|
||||||
double* d_cc_space_v_oooo;
|
|
||||||
cudaMalloc((void**)&d_cc_space_v_oooo, nO*nO*nO*nO*sizeof(double));
|
|
||||||
cublasSetMatrix(nO*nO, nO*nO, sizeof(double), cc_space_v_oooo, nO*nO, d_cc_space_v_oooo, nO*nO);
|
|
||||||
|
|
||||||
alpha = 1.0;
|
alpha = 1.0;
|
||||||
beta = 1.0;
|
beta = 1.0;
|
||||||
A = d_cc_space_v_oooo; lda = nO*nO;
|
A = d_cc_space_v_oooo; lda = nO*nO;
|
||||||
@ -198,10 +112,6 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
|
|||||||
}
|
}
|
||||||
cudaFree(d_Y_oooo);
|
cudaFree(d_Y_oooo);
|
||||||
|
|
||||||
double* d_cc_space_v_vvoo;
|
|
||||||
cudaMalloc((void**)&d_cc_space_v_vvoo, nV*nV*nO*nO*sizeof(double));
|
|
||||||
cublasSetMatrix(nV*nV, nO*nO, sizeof(double), cc_space_v_vvoo, nV*nV, d_cc_space_v_vvoo, nV*nV);
|
|
||||||
|
|
||||||
alpha = 1.0;
|
alpha = 1.0;
|
||||||
beta = 1.0;
|
beta = 1.0;
|
||||||
m=nO*nO ; n=nO*nO; k=nV*nV;
|
m=nO*nO ; n=nO*nO; k=nV*nV;
|
||||||
@ -209,7 +119,6 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
|
|||||||
B = d_cc_space_v_vvoo ; ldb = nV*nV;
|
B = d_cc_space_v_vvoo ; ldb = nV*nV;
|
||||||
C = d_A1; ldc = nO*nO;
|
C = d_A1; ldc = nO*nO;
|
||||||
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
|
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
|
||||||
cudaFree(d_cc_space_v_vvoo);
|
|
||||||
|
|
||||||
alpha = 1.0;
|
alpha = 1.0;
|
||||||
beta = 0.0;
|
beta = 0.0;
|
||||||
@ -529,11 +438,6 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
|
|||||||
|
|
||||||
#pragma omp section
|
#pragma omp section
|
||||||
{
|
{
|
||||||
double* d_cc_space_v_oovo;
|
|
||||||
lda = nO*nO;
|
|
||||||
cudaMalloc((void **)&d_cc_space_v_oovo, nO*nO*nV*nO * sizeof(double));
|
|
||||||
cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_oovo, lda, d_cc_space_v_oovo, lda);
|
|
||||||
|
|
||||||
double* d_X_oovv;
|
double* d_X_oovv;
|
||||||
lda = nO*nO;
|
lda = nO*nO;
|
||||||
cudaMalloc((void **)&d_X_oovv, nO*nO*nV*nV * sizeof(double));
|
cudaMalloc((void **)&d_X_oovv, nO*nO*nV*nV * sizeof(double));
|
||||||
@ -562,7 +466,6 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
|
|||||||
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
|
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
cudaFree(d_cc_space_v_oovo);
|
|
||||||
|
|
||||||
double* d_X_vovo;
|
double* d_X_vovo;
|
||||||
lda = nV*nO;
|
lda = nV*nO;
|
||||||
@ -618,6 +521,16 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
|
|||||||
cudaFree(d_X_oovv);
|
cudaFree(d_X_oovv);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#pragma omp section
|
||||||
|
{
|
||||||
|
alpha = 1.0;
|
||||||
|
beta = 1.0;
|
||||||
|
A = d_r2; lda = nO*nO;
|
||||||
|
B = d_cc_space_v_oovv; ldb = nO*nO;
|
||||||
|
C = d_r2; ldc = nO*nO;
|
||||||
|
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO*nO, nV*nV, &alpha, A, lda, &beta, B, ldb, C, ldc);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
#pragma omp section
|
#pragma omp section
|
||||||
{
|
{
|
||||||
@ -634,11 +547,6 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
|
|||||||
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO*nV, nV*nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
|
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO*nV, nV*nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
|
||||||
|
|
||||||
|
|
||||||
double* d_cc_space_v_ovoo;
|
|
||||||
lda = nO*nV;
|
|
||||||
cudaMalloc((void **)&d_cc_space_v_ovoo, nO*nV*nO*nO * sizeof(double));
|
|
||||||
cublasSetMatrix(lda, nO*nO, sizeof(double), cc_space_v_ovoo, lda, d_cc_space_v_ovoo, lda);
|
|
||||||
|
|
||||||
double* d_X_ovoo;
|
double* d_X_ovoo;
|
||||||
lda = nO*nV;
|
lda = nO*nV;
|
||||||
cudaMalloc((void **)&d_X_ovoo, nO*nV*nO*nO * sizeof(double));
|
cudaMalloc((void **)&d_X_ovoo, nO*nV*nO*nO * sizeof(double));
|
||||||
@ -795,8 +703,6 @@ cublasGetMatrix(nO*nV, nV*nO, sizeof(double), d_J1, lda, J1, lda);
|
|||||||
cudaFree(d_B1);
|
cudaFree(d_B1);
|
||||||
cudaFree(d_tmp_cc2);
|
cudaFree(d_tmp_cc2);
|
||||||
|
|
||||||
cudaFree(d_cc_space_v_vo_chol);
|
|
||||||
cudaFree(d_cc_space_v_vv_chol);
|
|
||||||
cudaFree(d_tau);
|
cudaFree(d_tau);
|
||||||
cudaFree(d_t1);
|
cudaFree(d_t1);
|
||||||
cudaFree(d_tmp_cc);
|
cudaFree(d_tmp_cc);
|
||||||
@ -815,10 +721,6 @@ cublasGetMatrix(nO*nV, nV*nO, sizeof(double), d_J1, lda, J1, lda);
|
|||||||
cublasDestroy(handle);
|
cublasDestroy(handle);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i=0 ; i<(size_t) nO*nO*nV*nV ; ++i)
|
|
||||||
{
|
|
||||||
r2[i] += cc_space_v_oovv[i];
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
15
devel/ccsd_gpu/gpu.h
Normal file
15
devel/ccsd_gpu/gpu.h
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
typedef struct {
|
||||||
|
double* cc_space_v_oo_chol;
|
||||||
|
double* cc_space_v_ov_chol;
|
||||||
|
double* cc_space_v_vo_chol;
|
||||||
|
double* cc_space_v_vv_chol;
|
||||||
|
double* cc_space_v_oooo;
|
||||||
|
double* cc_space_v_vooo;
|
||||||
|
double* cc_space_v_oovv;
|
||||||
|
double* cc_space_v_vvoo;
|
||||||
|
double* cc_space_v_oovo;
|
||||||
|
double* cc_space_v_ovvo;
|
||||||
|
double* cc_space_v_ovoo;
|
||||||
|
double* cc_space_f_vo;
|
||||||
|
} gpu_data;
|
||||||
|
|
61
devel/ccsd_gpu/gpu_dgemm.c
Normal file
61
devel/ccsd_gpu/gpu_dgemm.c
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <omp.h>
|
||||||
|
#include <cublas_v2.h>
|
||||||
|
#include <cuda_runtime.h>
|
||||||
|
|
||||||
|
#define BLOCK_SIZE 16
|
||||||
|
|
||||||
|
void dgemm_(char*, char*, int*, int*, int*, double*, double*, int*, double*, int*,
|
||||||
|
double*, double*, int*);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
void gpu_dgemm(char transa, char transb, int m, int n, int k, double alpha,
|
||||||
|
double* A, int lda, double* B, int ldb, double beta, double* C, int ldc)
|
||||||
|
{
|
||||||
|
cublasHandle_t handle;
|
||||||
|
cublasCreate(&handle);
|
||||||
|
|
||||||
|
double * d_A;
|
||||||
|
double * d_B;
|
||||||
|
double * d_C;
|
||||||
|
cublasOperation_t ta, tb;
|
||||||
|
|
||||||
|
if (transa == 'N') {
|
||||||
|
cudaMalloc((void**)&d_A, lda*k*sizeof(double));
|
||||||
|
cublasSetMatrix(m, k, sizeof(double), A, lda, d_A, lda);
|
||||||
|
ta = CUBLAS_OP_N;
|
||||||
|
} else {
|
||||||
|
cudaMalloc((void**)&d_A, lda*m*sizeof(double));
|
||||||
|
cublasSetMatrix(k, m, sizeof(double), A, lda, d_A, lda);
|
||||||
|
ta = CUBLAS_OP_T;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (transb == 'N') {
|
||||||
|
cudaMalloc((void**)&d_B, ldb*n*sizeof(double));
|
||||||
|
cublasSetMatrix(k, n, sizeof(double), B, ldb, d_B, ldb);
|
||||||
|
tb = CUBLAS_OP_N;
|
||||||
|
} else {
|
||||||
|
cudaMalloc((void**)&d_B, ldb*k*sizeof(double));
|
||||||
|
cublasSetMatrix(n, k, sizeof(double), B, ldb, d_B, ldb);
|
||||||
|
tb = CUBLAS_OP_T;
|
||||||
|
}
|
||||||
|
|
||||||
|
cudaMalloc((void**)&d_C, ldc*n*sizeof(double));
|
||||||
|
if (beta != 0.) {
|
||||||
|
cublasSetMatrix(m, n, sizeof(double), C, ldc, d_C, ldc);
|
||||||
|
}
|
||||||
|
|
||||||
|
cublasDgemm(handle, ta, tb, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, ldc);
|
||||||
|
|
||||||
|
cublasGetMatrix(m, n, sizeof(double), d_C, ldc, C, ldc);
|
||||||
|
|
||||||
|
cudaFree(d_A);
|
||||||
|
cudaFree(d_B);
|
||||||
|
cudaFree(d_C);
|
||||||
|
cublasDestroy(handle);
|
||||||
|
}
|
||||||
|
|
105
devel/ccsd_gpu/gpu_init.c
Normal file
105
devel/ccsd_gpu/gpu_init.c
Normal file
@ -0,0 +1,105 @@
|
|||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <omp.h>
|
||||||
|
#include <cublas_v2.h>
|
||||||
|
#include <cuda_runtime.h>
|
||||||
|
#include "gpu.h"
|
||||||
|
|
||||||
|
#define BLOCK_SIZE 16
|
||||||
|
|
||||||
|
gpu_data* gpu_init(
|
||||||
|
int nO, int nV, int cholesky_mo_num,
|
||||||
|
double* cc_space_v_oo_chol, double* cc_space_v_ov_chol,
|
||||||
|
double* cc_space_v_vo_chol, double* cc_space_v_vv_chol,
|
||||||
|
double* cc_space_v_oooo, double* cc_space_v_vooo,
|
||||||
|
double* cc_space_v_oovv, double* cc_space_v_vvoo,
|
||||||
|
double* cc_space_v_oovo, double* cc_space_v_ovvo,
|
||||||
|
double* cc_space_v_ovoo, double* cc_space_f_vo)
|
||||||
|
{
|
||||||
|
int ngpus = 1;
|
||||||
|
cudaGetDeviceCount(&ngpus);
|
||||||
|
|
||||||
|
gpu_data* data = (gpu_data*) malloc (ngpus*sizeof(gpu_data));
|
||||||
|
|
||||||
|
#pragma omp parallel num_threads(ngpus)
|
||||||
|
{
|
||||||
|
int lda;
|
||||||
|
int igpu = omp_get_thread_num();
|
||||||
|
cudaSetDevice(igpu);
|
||||||
|
cublasHandle_t handle;
|
||||||
|
|
||||||
|
cublasCreate(&handle);
|
||||||
|
|
||||||
|
double* d_cc_space_v_oo_chol;
|
||||||
|
lda = cholesky_mo_num * nO;
|
||||||
|
cudaMalloc((void **)&d_cc_space_v_oo_chol, lda * nO * sizeof(double));
|
||||||
|
cublasSetMatrix(cholesky_mo_num*nO, nO, sizeof(double), cc_space_v_oo_chol, lda, d_cc_space_v_oo_chol, lda);
|
||||||
|
|
||||||
|
double* d_cc_space_v_ov_chol;
|
||||||
|
lda = cholesky_mo_num * nO;
|
||||||
|
cudaMalloc((void **)&d_cc_space_v_ov_chol, lda * nV * sizeof(double));
|
||||||
|
cublasSetMatrix(cholesky_mo_num*nO, nV, sizeof(double), cc_space_v_ov_chol, lda, d_cc_space_v_ov_chol, lda);
|
||||||
|
|
||||||
|
double* d_cc_space_v_vo_chol;
|
||||||
|
lda = cholesky_mo_num * nV;
|
||||||
|
cudaMalloc((void **)&d_cc_space_v_vo_chol, lda * nO * sizeof(double));
|
||||||
|
cublasSetMatrix(cholesky_mo_num*nV, nO, sizeof(double), cc_space_v_vo_chol, lda, d_cc_space_v_vo_chol, lda);
|
||||||
|
|
||||||
|
double* d_cc_space_v_vv_chol;
|
||||||
|
lda = cholesky_mo_num * nV;
|
||||||
|
cudaMalloc((void **)&d_cc_space_v_vv_chol, lda * nV * sizeof(double));
|
||||||
|
cublasSetMatrix(cholesky_mo_num*nV, nV, sizeof(double), cc_space_v_vv_chol, lda, d_cc_space_v_vv_chol, lda);
|
||||||
|
|
||||||
|
double* d_cc_space_v_oooo;
|
||||||
|
cudaMalloc((void**)&d_cc_space_v_oooo, nO*nO*nO*nO*sizeof(double));
|
||||||
|
cublasSetMatrix(nO*nO, nO*nO, sizeof(double), cc_space_v_oooo, nO*nO, d_cc_space_v_oooo, nO*nO);
|
||||||
|
|
||||||
|
double* d_cc_space_v_vooo;
|
||||||
|
cudaMalloc((void**)&d_cc_space_v_vooo, nV*nO*nO*nO*sizeof(double));
|
||||||
|
cublasSetMatrix(nV*nO, nO*nO, sizeof(double), cc_space_v_vooo, nV*nO, d_cc_space_v_vooo, nV*nO);
|
||||||
|
|
||||||
|
double* d_cc_space_v_oovv;
|
||||||
|
cudaMalloc((void**)&d_cc_space_v_oovv, nO*nO*nV*nV*sizeof(double));
|
||||||
|
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), cc_space_v_oovv, nO*nO, d_cc_space_v_oovv, nO*nO);
|
||||||
|
|
||||||
|
double* d_cc_space_v_vvoo;
|
||||||
|
cudaMalloc((void**)&d_cc_space_v_vvoo, nV*nV*nO*nO*sizeof(double));
|
||||||
|
cublasSetMatrix(nV*nV, nO*nO, sizeof(double), cc_space_v_vvoo, nV*nV, d_cc_space_v_vvoo, nV*nV);
|
||||||
|
|
||||||
|
double* d_cc_space_v_oovo;
|
||||||
|
lda = nO*nO;
|
||||||
|
cudaMalloc((void **)&d_cc_space_v_oovo, nO*nO*nV*nO * sizeof(double));
|
||||||
|
cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_oovo, lda, d_cc_space_v_oovo, lda);
|
||||||
|
|
||||||
|
double* d_cc_space_v_ovvo;
|
||||||
|
lda = nO*nV;
|
||||||
|
cudaMalloc((void **)&d_cc_space_v_ovvo, nO*nV*nV*nO * sizeof(double));
|
||||||
|
cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_ovvo, lda, d_cc_space_v_ovvo, lda);
|
||||||
|
|
||||||
|
double* d_cc_space_v_ovoo;
|
||||||
|
lda = nO*nV;
|
||||||
|
cudaMalloc((void **)&d_cc_space_v_ovoo, nO*nV*nO*nO * sizeof(double));
|
||||||
|
cublasSetMatrix(lda, nO*nO, sizeof(double), cc_space_v_ovoo, lda, d_cc_space_v_ovoo, lda);
|
||||||
|
|
||||||
|
double* d_cc_space_f_vo;
|
||||||
|
cudaMalloc((void**)&d_cc_space_f_vo, nV*nO*sizeof(double));
|
||||||
|
cublasSetMatrix(nV, nO, sizeof(double), cc_space_f_vo, nV, d_cc_space_f_vo, nV);
|
||||||
|
|
||||||
|
data[igpu].cc_space_v_oo_chol = d_cc_space_v_oo_chol;
|
||||||
|
data[igpu].cc_space_v_ov_chol = d_cc_space_v_ov_chol;
|
||||||
|
data[igpu].cc_space_v_vo_chol = d_cc_space_v_vo_chol;
|
||||||
|
data[igpu].cc_space_v_vv_chol = d_cc_space_v_vv_chol;
|
||||||
|
data[igpu].cc_space_v_oooo = d_cc_space_v_oooo;
|
||||||
|
data[igpu].cc_space_v_vooo = d_cc_space_v_vooo;
|
||||||
|
data[igpu].cc_space_v_oovv = d_cc_space_v_oovv;
|
||||||
|
data[igpu].cc_space_v_vvoo = d_cc_space_v_vvoo;
|
||||||
|
data[igpu].cc_space_v_oovo = d_cc_space_v_oovo;
|
||||||
|
data[igpu].cc_space_v_ovvo = d_cc_space_v_ovvo;
|
||||||
|
data[igpu].cc_space_v_ovoo = d_cc_space_v_ovoo;
|
||||||
|
data[igpu].cc_space_f_vo = d_cc_space_f_vo;
|
||||||
|
}
|
||||||
|
return data;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
@ -4,16 +4,12 @@ module gpu_module
|
|||||||
implicit none
|
implicit none
|
||||||
|
|
||||||
interface
|
interface
|
||||||
subroutine compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num, t1, t2, tau,&
|
type(c_ptr) function gpu_init(nO, nV, cholesky_mo_num, &
|
||||||
cc_space_v_oo_chol, cc_space_v_ov_chol, cc_space_v_vo_chol, cc_space_v_vv_chol, &
|
cc_space_v_oo_chol, cc_space_v_ov_chol, cc_space_v_vo_chol, cc_space_v_vv_chol, &
|
||||||
cc_space_v_oooo, cc_space_v_vooo, cc_space_v_oovv, cc_space_v_vvoo, &
|
cc_space_v_oooo, cc_space_v_vooo, cc_space_v_oovv, cc_space_v_vvoo, &
|
||||||
cc_space_v_oovo, cc_space_v_ovvo, cc_space_v_ovoo, &
|
cc_space_v_oovo, cc_space_v_ovvo, cc_space_v_ovoo, cc_space_f_vo) bind(C)
|
||||||
cc_space_f_vo, H_vv, g_occ, J1, r2) bind(C)
|
import c_int, c_double, c_ptr
|
||||||
import c_int, c_double
|
integer(c_int), intent(in), value :: nO, nV, cholesky_mo_num
|
||||||
integer(c_int), value :: nO, nV, cholesky_mo_num
|
|
||||||
real(c_double), intent(in) :: t1(nO,nV)
|
|
||||||
real(c_double), intent(in) :: t2(nO,nO,nV,nV)
|
|
||||||
real(c_double), intent(in) :: tau(nO,nO,nV,nV)
|
|
||||||
real(c_double), intent(in) :: cc_space_v_oo_chol(cholesky_mo_num,nO,nO)
|
real(c_double), intent(in) :: cc_space_v_oo_chol(cholesky_mo_num,nO,nO)
|
||||||
real(c_double), intent(in) :: cc_space_v_ov_chol(cholesky_mo_num,nO,nV)
|
real(c_double), intent(in) :: cc_space_v_ov_chol(cholesky_mo_num,nO,nV)
|
||||||
real(c_double), intent(in) :: cc_space_v_vo_chol(cholesky_mo_num,nV,nO)
|
real(c_double), intent(in) :: cc_space_v_vo_chol(cholesky_mo_num,nV,nO)
|
||||||
@ -26,6 +22,16 @@ module gpu_module
|
|||||||
real(c_double), intent(in) :: cc_space_v_ovvo(nO,nV,nV,nO)
|
real(c_double), intent(in) :: cc_space_v_ovvo(nO,nV,nV,nO)
|
||||||
real(c_double), intent(in) :: cc_space_v_ovoo(nO,nV,nO,nO)
|
real(c_double), intent(in) :: cc_space_v_ovoo(nO,nV,nO,nO)
|
||||||
real(c_double), intent(in) :: cc_space_f_vo(nV,nO)
|
real(c_double), intent(in) :: cc_space_f_vo(nV,nO)
|
||||||
|
end function
|
||||||
|
|
||||||
|
subroutine compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num, gpu_data, t1, t2, tau,&
|
||||||
|
H_vv, g_occ, J1, r2) bind(C)
|
||||||
|
import c_int, c_double, c_ptr
|
||||||
|
integer(c_int), intent(in), value :: nO, nV, cholesky_mo_num
|
||||||
|
type(c_ptr), value :: gpu_data
|
||||||
|
real(c_double), intent(in) :: t1(nO,nV)
|
||||||
|
real(c_double), intent(in) :: t2(nO,nO,nV,nV)
|
||||||
|
real(c_double), intent(in) :: tau(nO,nO,nV,nV)
|
||||||
real(c_double), intent(in) :: H_vv(nV,nV)
|
real(c_double), intent(in) :: H_vv(nV,nV)
|
||||||
real(c_double), intent(in) :: g_occ(nO,nO)
|
real(c_double), intent(in) :: g_occ(nO,nO)
|
||||||
real(c_double) :: J1(nO,nV,nV,nO)
|
real(c_double) :: J1(nO,nV,nV,nO)
|
||||||
|
Loading…
Reference in New Issue
Block a user