1
0
mirror of https://gitlab.com/scemama/qp_plugins_scemama.git synced 2024-12-22 12:23:37 +01:00

Compute energy on GPU

This commit is contained in:
Anthony Scemama 2023-08-05 00:50:58 +02:00
parent 971a0ff160
commit 699c555633
3 changed files with 156 additions and 203 deletions

View File

@ -9,7 +9,7 @@ subroutine run_ccsd_space_orb
double precision :: uncorr_energy,energy, max_elem, max_r, max_r1, max_r2,ta,tb double precision :: uncorr_energy,energy, max_elem, max_r, max_r1, max_r2,ta,tb
logical :: not_converged logical :: not_converged
double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:), tau(:,:,:,:), tau_x(:,:,:,:) double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:)
double precision, allocatable :: t1(:,:), r1(:,:) double precision, allocatable :: t1(:,:), r1(:,:)
double precision, allocatable :: H_oo(:,:), H_vv(:,:), H_vo(:,:) double precision, allocatable :: H_oo(:,:), H_vv(:,:), H_vo(:,:)
@ -50,8 +50,6 @@ subroutine run_ccsd_space_orb
!print*,'vir',list_vir !print*,'vir',list_vir
allocate(t2(nO,nO,nV,nV), r2(nO,nO,nV,nV)) allocate(t2(nO,nO,nV,nV), r2(nO,nO,nV,nV))
allocate(tau(nO,nO,nV,nV))
allocate(tau_x(nO,nO,nV,nV))
allocate(t1(nO,nV), r1(nO,nV)) allocate(t1(nO,nV), r1(nO,nV))
allocate(H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO)) allocate(H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO))
@ -95,26 +93,6 @@ subroutine run_ccsd_space_orb
endif endif
! Init ! Init
call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,t1)
call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,t2)
call update_tau_space(nO,nV,t1,t2,tau)
call update_tau_x_space(nO,nV,tau,tau_x)
!print*,'hf_energy', hf_energy
call det_energy(det,uncorr_energy)
print*,'Det energy', uncorr_energy
call ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
print*,'Guess energy', uncorr_energy+energy, energy
nb_iter = 0
not_converged = .True.
max_r1 = 0d0
max_r2 = 0d0
write(*,'(A77)') ' -----------------------------------------------------------------------------'
write(*,'(A77)') ' | It. | E(CCSD) (Ha) | Correlation (Ha) | Conv. T1 | Conv. T2 |'
write(*,'(A77)') ' -----------------------------------------------------------------------------'
call wall_time(ta)
type(c_ptr) :: gpu_data type(c_ptr) :: gpu_data
gpu_data = gpu_init(nO, nV, cholesky_mo_num, & gpu_data = gpu_init(nO, nV, cholesky_mo_num, &
@ -128,10 +106,31 @@ subroutine run_ccsd_space_orb
stop -1 stop -1
endif endif
call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,t1)
call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,t2)
call gpu_upload(gpu_data, nO, nV, t1, t2);
!print*,'hf_energy', hf_energy
call det_energy(det,uncorr_energy)
print*,'Det energy', uncorr_energy
energy = ccsd_energy_space_gpu(gpu_data)
print*,'Guess energy', uncorr_energy+energy, energy
nb_iter = 0
not_converged = .True.
max_r1 = 0d0
max_r2 = 0d0
write(*,'(A77)') ' -----------------------------------------------------------------------------'
write(*,'(A77)') ' | It. | E(CCSD) (Ha) | Correlation (Ha) | Conv. T1 | Conv. T2 |'
write(*,'(A77)') ' -----------------------------------------------------------------------------'
call wall_time(ta)
do while (not_converged) do while (not_converged)
! Residue ! Residue
call gpu_upload(gpu_data, nO, nV, t1, t2, tau, tau_x);
!$OMP PARALLEL SECTIONS !$OMP PARALLEL SECTIONS
!$OMP SECTION !$OMP SECTION
call compute_H_oo_chol_gpu(gpu_data,0) call compute_H_oo_chol_gpu(gpu_data,0)
@ -163,11 +162,10 @@ subroutine run_ccsd_space_orb
print*,'Unkown cc_method_method: '//cc_update_method print*,'Unkown cc_method_method: '//cc_update_method
endif endif
call update_tau_space(nO,nV,t1,t2,tau) call gpu_upload(gpu_data, nO, nV, t1, t2);
call update_tau_x_space(nO,nV,tau,tau_x)
! Energy ! Energy
call ccsd_energy_space_x(nO,nV,tau_x,t1,energy) energy = ccsd_energy_space_gpu(gpu_data)
write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |' write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |'
nb_iter = nb_iter + 1 nb_iter = nb_iter + 1
@ -202,7 +200,7 @@ subroutine run_ccsd_space_orb
deallocate(all_err,all_t) deallocate(all_err,all_t)
endif endif
deallocate(r1,r2,tau) deallocate(r1,r2)
! CCSD(T) ! CCSD(T)
double precision :: e_t double precision :: e_t
@ -248,163 +246,3 @@ subroutine run_ccsd_space_orb
end end
! Energy
subroutine ccsd_energy_space(nO,nV,tau,t1,energy)
implicit none
integer, intent(in) :: nO, nV
double precision, intent(in) :: tau(nO,nO,nV,nV)
double precision, intent(in) :: t1(nO,nV)
double precision, intent(out) :: energy
! internal
integer :: i,j,a,b
double precision :: e
energy = 0d0
!$omp parallel &
!$omp shared(nO,nV,energy,tau,t1,&
!$omp cc_space_f_vo,cc_space_w_oovv) &
!$omp private(i,j,a,b,e) &
!$omp default(none)
e = 0d0
!$omp do
do a = 1, nV
do i = 1, nO
e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a)
enddo
enddo
!$omp end do nowait
!$omp do
do b = 1, nV
do a = 1, nV
do j = 1, nO
do i = 1, nO
e = e + tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b)
enddo
enddo
enddo
enddo
!$omp end do nowait
!$omp critical
energy = energy + e
!$omp end critical
!$omp end parallel
end
subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
implicit none
integer, intent(in) :: nO, nV
double precision, intent(in) :: tau_x(nO,nO,nV,nV)
double precision, intent(in) :: t1(nO,nV)
double precision, intent(out) :: energy
! internal
integer :: i,j,a,b
double precision :: e
energy = 0d0
!$omp parallel &
!$omp shared(nO,nV,energy,tau_x,t1,&
!$omp cc_space_f_vo,cc_space_v_oovv) &
!$omp private(i,j,a,b,e) &
!$omp default(none)
e = 0d0
!$omp do
do a = 1, nV
do i = 1, nO
e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a)
enddo
enddo
!$omp end do nowait
!$omp do
do b = 1, nV
do a = 1, nV
do j = 1, nO
do i = 1, nO
e = e + tau_x(i,j,a,b) * cc_space_v_oovv(i,j,a,b)
enddo
enddo
enddo
enddo
!$omp end do nowait
!$omp critical
energy = energy + e
!$omp end critical
!$omp end parallel
end
! Tau
subroutine update_tau_space(nO,nV,t1,t2,tau)
implicit none
! in
integer, intent(in) :: nO, nV
double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV)
! out
double precision, intent(out) :: tau(nO,nO,nV,nV)
! internal
integer :: i,j,a,b
!$OMP PARALLEL &
!$OMP SHARED(nO,nV,tau,t2,t1) &
!$OMP PRIVATE(i,j,a,b) &
!$OMP DEFAULT(NONE)
!$OMP DO
do b = 1, nV
do a = 1, nV
do j = 1, nO
do i = 1, nO
tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b)
enddo
enddo
enddo
enddo
!$OMP END DO
!$OMP END PARALLEL
end
subroutine update_tau_x_space(nO,nV,tau,tau_x)
implicit none
! in
integer, intent(in) :: nO, nV
double precision, intent(in) :: tau(nO,nO,nV,nV)
! out
double precision, intent(out) :: tau_x(nO,nO,nV,nV)
! internal
integer :: i,j,a,b
!$OMP PARALLEL &
!$OMP SHARED(nO,nV,tau,tau_x) &
!$OMP PRIVATE(i,j,a,b) &
!$OMP DEFAULT(NONE)
!$OMP DO
do b = 1, nV
do a = 1, nV
do j = 1, nO
do i = 1, nO
tau_x(i,j,a,b) = 2.d0*tau(i,j,a,b) - tau(i,j,b,a)
enddo
enddo
enddo
enddo
!$OMP END DO
!$OMP END PARALLEL
end

View File

@ -9,9 +9,7 @@
void gpu_upload(gpu_data* data, void gpu_upload(gpu_data* data,
int nO, int nV, int nO, int nV,
double* t1, double* t1,
double* t2, double* t2)
double* tau,
double* tau_x)
{ {
int lda; int lda;
const int cholesky_mo_num = data->cholesky_mo_num; const int cholesky_mo_num = data->cholesky_mo_num;
@ -19,19 +17,14 @@ void gpu_upload(gpu_data* data,
int ngpus = 1; int ngpus = 1;
if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus); if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
double * tau = malloc(nO*nO*nV*nV * sizeof(double));
double * tau_x = malloc(nO*nO*nV*nV * sizeof(double));
#pragma omp parallel num_threads(ngpus) #pragma omp parallel num_threads(ngpus)
{ {
int igpu = omp_get_thread_num(); int igpu = omp_get_thread_num();
cudaSetDevice(igpu); cudaSetDevice(igpu);
double* d_tau = data[igpu].tau;
lda = nO * nO;
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau, lda, d_tau, lda);
double* d_tau_x = data[igpu].tau_x;
lda = nO * nO;
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau_x, lda, d_tau_x, lda);
double* d_t1 = data[igpu].t1; double* d_t1 = data[igpu].t1;
lda = nO; lda = nO;
cublasSetMatrix(nO, nV, sizeof(double), t1, lda, d_t1, lda); cublasSetMatrix(nO, nV, sizeof(double), t1, lda, d_t1, lda);
@ -39,11 +32,76 @@ void gpu_upload(gpu_data* data,
double* d_t2 = data[igpu].t2; double* d_t2 = data[igpu].t2;
lda = nO*nO; lda = nO*nO;
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), t2, lda, d_t2, lda); cublasSetMatrix(nO*nO, nV*nV, sizeof(double), t2, lda, d_t2, lda);
}
int m,n,k, lda, ldb, ldc;
double alpha, beta;
double* A;
double* B;
double* C;
cublasHandle_t handle;
cublasCreate(&handle);
cudaStream_t stream[nV];
double* d_tau = data[igpu].tau;
double* d_tau_x = data[igpu].tau_x;
lda = nO * nO;
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau_x, lda, d_tau_x, lda);
if (igpu == 0) {
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
alpha = 1.0;
for (int j=0 ; j<nO ; ++j) {
for (int b=0 ; b<nV ; ++b) {
cublasSetStream(handle, stream[b]);
beta = t1[j+b*nO];
A = &(d_t2[nO*(j + nO*nV*b)]); lda = nO*nO;
B = d_t1; ldb = nO;
C = &(d_tau[nO*(j + nO*nV*b)]); ldc = nO*nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO, nV, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
}
cudaDeviceSynchronize();
alpha = 2.0;
beta = -1.0;
for (int b=0 ; b<nV ; ++b) {
for (int a=0 ; a<nV ; ++a) {
cublasSetStream(handle, stream[a]);
A = &(d_tau[nO*nO*(a + nV*b)]); lda = nO;
B = &(d_tau[nO*nO*(b + nV*a)]); ldb = nO;
C = &(d_tau_x[nO*nO*(a + nV*b)]); ldc = nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
}
for (int i=0 ; i<nV ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
lda = nO*nO;
cublasGetMatrix(nO*nO, nV*nV, sizeof(double), d_tau, lda, tau, lda);
cublasGetMatrix(nO*nO, nV*nV, sizeof(double), d_tau_x, lda, tau_x, lda);
}
#pragma omp barrier
if (igpu > 0) {
lda = nO * nO;
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau, lda, d_tau, lda);
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau_x, lda, d_tau_x, lda);
}
cublasDestroy(handle);
}
free(tau);
free(tau_x);
} }
void compute_h_oo_chol_gpu(gpu_data* data, int igpu) void compute_h_oo_chol_gpu(gpu_data* data, int igpu)
{ {
int ngpus = 1; int ngpus = 1;
@ -1932,3 +1990,57 @@ void compute_r1_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
} }
} }
double ccsd_energy_space_gpu(gpu_data* data)
{
double result = 0.0;
const int nO = data->nO;
const int nV = data->nV;
int ngpus = 1;
if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
#pragma omp parallel num_threads(ngpus)
{
int m,n,k, lda, ldb, ldc;
double alpha, beta;
double* A;
double* B;
double* C;
int igpu = omp_get_thread_num();
cudaSetDevice(igpu);
cublasHandle_t handle;
cublasCreate(&handle);
double result_local = 0.0;
#pragma omp sections
{
#pragma omp section
{
double* d_cc_space_f_ov = data[igpu].cc_space_f_ov;
double* d_t1 = data[igpu].t1;
double x;
cublasDdot(handle, nO*nV, d_cc_space_f_ov, 1, d_t1, 1, &x);
result_local += 2.0*x;
}
#pragma omp section
{
double* d_tau_x = data[igpu].tau_x;
double* d_cc_space_v_oovv = data[igpu].cc_space_v_oovv;
double x;
cublasDdot(handle, nO*nO*nV*nV, d_tau_x, 1, d_cc_space_v_oovv, 1, &x);
result_local += x;
}
}
cublasDestroy(handle);
#pragma omp critical
{
result += result_local;
}
}
return result;
}

View File

@ -30,16 +30,15 @@ module gpu_module
real(c_double), intent(in) :: cc_space_f_vv(nV,nV) real(c_double), intent(in) :: cc_space_f_vv(nV,nV)
end function end function
subroutine gpu_upload(gpu_data, nO, nV, t1, t2, tau, tau_x) bind(C) subroutine gpu_upload(gpu_data, nO, nV, t1, t2) bind(C)
import c_int, c_double, c_ptr import c_int, c_double, c_ptr
type(c_ptr), value :: gpu_data type(c_ptr), value :: gpu_data
integer(c_int), intent(in), value :: nO, nV integer(c_int), intent(in), value :: nO, nV
real(c_double), intent(in) :: t1(nO,nV) real(c_double), intent(in) :: t1(nO,nV)
real(c_double), intent(in) :: t2(nO,nO,nV,nV) real(c_double), intent(in) :: t2(nO,nO,nV,nV)
real(c_double), intent(in) :: tau(nO,nO,nV,nV)
real(c_double), intent(in) :: tau_x(nO,nO,nV,nV)
end subroutine end subroutine
subroutine compute_H_oo_chol_gpu(gpu_data, igpu) bind(C) subroutine compute_H_oo_chol_gpu(gpu_data, igpu) bind(C)
import c_int, c_double, c_ptr import c_int, c_double, c_ptr
type(c_ptr), value :: gpu_data type(c_ptr), value :: gpu_data
@ -76,6 +75,10 @@ module gpu_module
real(c_double), intent(out) :: max_r2 real(c_double), intent(out) :: max_r2
end subroutine end subroutine
double precision function ccsd_energy_space_gpu(gpu_data) bind(C)
import c_ptr
type(c_ptr), value :: gpu_data
end function
subroutine gpu_dgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) bind(C) subroutine gpu_dgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) bind(C)