1
0
mirror of https://gitlab.com/scemama/qp_plugins_scemama.git synced 2024-12-22 12:23:37 +01:00

Compare commits

...

5 Commits

Author SHA1 Message Date
971a0ff160 Refactoring 2023-08-04 16:54:48 +02:00
ec6e5fde68 Merge turpan:~/qp2/plugins/qp_plugins_scemama 2023-08-04 16:43:42 +02:00
ac2614a0f3 r1 on GPU 2023-08-04 16:42:46 +02:00
d61ecb35c4 r1 on GPU 2023-08-04 16:19:41 +02:00
a7e0832dae Starting r1 on GPU 2023-08-04 14:48:08 +02:00
14 changed files with 2006 additions and 2359 deletions

View File

@ -1,3 +1,75 @@
[cc_thresh_conv]
type: double precision
doc: Threshold for the convergence of the residual equations.
interface: ezfio,ocaml,provider
default: 1e-6
[cc_max_iter]
type: integer
doc: Maximum number of iterations.
interface: ezfio,ocaml,provider
default: 100
[cc_diis_depth]
type: integer
doc: Maximum depth of the DIIS, i.e., the maximum number of iterations that the DIIS keeps in memory. Warning: the matrices are allocated once at the beginning using the DIIS depth and are never resized. If there is not enough memory, the calculation will crash.
interface: ezfio,ocaml,provider
default: 8
[cc_level_shift]
type: double precision
doc: Level shift for the CC
interface: ezfio,ocaml,provider
default: 0.0
[cc_level_shift_guess]
type: double precision
doc: Level shift for the guess of the CC amplitudes
interface: ezfio,ocaml,provider
default: 0.0
[cc_update_method]
type: character*(32)
doc: Method used to update the CC amplitudes. none -> normal, diis -> with diis.
interface: ezfio,ocaml,provider
default: diis
[cc_guess_t1]
type: character*(32)
doc: Guess used to initialize the T1 amplitudes. none -> 0, MP -> perturbation theory, read -> read from disk.
interface: ezfio,ocaml,provider
default: MP
[cc_guess_t2]
type: character*(32)
doc: Guess used to initialize the T2 amplitudes. none -> 0, MP -> perturbation theory, read -> read from disk.
interface: ezfio,ocaml,provider
default: MP
[io_amplitudes]
type: Disk_access
doc: Read/Write |CCSD| amplitudes from/to disk [ Write | Read | None ]
interface: ezfio,provider,ocaml
default: None
[cc_par_t]
type: logical
doc: If true, the CCSD(T) will be computed.
interface: ezfio,ocaml,provider
default: False
[cc_dev]
type: logical
doc: Only for dev purposes.
interface: ezfio,ocaml,provider
default: False
[cc_ref]
type: integer
doc: Index of the reference determinant in psi_det for CC calculation.
interface: ezfio,ocaml,provider
default: 1
[energy] [energy]
type: double precision type: double precision
doc: CCSD energy doc: CCSD energy

View File

@ -1,2 +1,2 @@
hartree_fock hartree_fock
utils_cc_gpu determinants

File diff suppressed because it is too large Load Diff

View File

@ -80,11 +80,12 @@ end
! R1 ! R1
subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) subroutine compute_r1_space_chol(gpu_data, nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
use gpu_module
implicit none implicit none
! in ! in
type(c_ptr), intent(in) :: gpu_data
integer, intent(in) :: nO, nV integer, intent(in) :: nO, nV
double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV), tau(nO,nO,nV,nV) double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV), tau(nO,nO,nV,nV)
double precision, intent(in) :: H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO) double precision, intent(in) :: H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO)
@ -95,177 +96,40 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
! internal ! internal
integer :: u,i,j,beta,a,b integer :: u,i,j,beta,a,b
!$omp parallel &
!$omp shared(nO,nV,r1,cc_space_f_ov) &
!$omp private(u,beta) &
!$omp default(none)
!$omp do
do beta = 1, nV
do u = 1, nO
r1(u,beta) = cc_space_f_ov(u,beta)
enddo
enddo
!$omp end do
!$omp end parallel
double precision, allocatable :: X_oo(:,:) call compute_r1_space_chol_gpu(gpu_data, nO, nV, t1, r1, max_r1)
allocate(X_oo(nO,nO))
call dgemm('N','N', nO, nO, nV, &
-2d0, t1 , size(t1,1), &
cc_space_f_vo, size(cc_space_f_vo,1), &
0d0, X_oo , size(X_oo,1))
call dgemm('T','N', nO, nV, nO, &
1d0, X_oo, size(X_oo,2), &
t1 , size(t1,1), &
1d0, r1 , size(r1,1))
deallocate(X_oo)
call dgemm('N','N', nO, nV, nV, &
1d0, t1 , size(t1,1), &
H_vv, size(H_vv,1), &
1d0, r1 , size(r1,1))
call dgemm('N','N', nO, nV, nO, &
-1d0, H_oo, size(H_oo,1), &
t1 , size(t1,1), &
1d0, r1, size(r1,1))
double precision, allocatable :: X_voov(:,:,:,:)
allocate(X_voov(nV, nO, nO, nV))
!$omp parallel &
!$omp shared(nO,nV,X_voov,t2,t1) &
!$omp private(u,beta,i,a) &
!$omp default(none)
!$omp do
do beta = 1, nV
do u = 1, nO
do i = 1, nO
do a = 1, nV
X_voov(a,i,u,beta) = 2d0 * t2(i,u,a,beta) - t2(u,i,a,beta) + t1(u,a) * t1(i,beta)
enddo
enddo
enddo
enddo
!$omp end do
!$omp end parallel
call dgemv('T', nV*nO, nO*nV, &
1d0, X_voov, size(X_voov,1) * size(X_voov,2), &
H_vo , 1, &
1d0, r1 , 1)
deallocate(X_voov)
double precision, allocatable :: X_ovov(:,:,:,:) double precision, allocatable :: X_ovov(:,:,:,:)
allocate(X_ovov(nO, nV, nO, nV))
!$omp parallel &
!$omp shared(nO,nV,cc_space_v_ovov,cc_space_v_voov,X_ovov) &
!$omp private(u,beta,i,a) &
!$omp default(none)
!$omp do
do beta = 1, nV
do u = 1, nO
do a = 1, nv
do i = 1, nO
X_ovov(i,a,u,beta) = 2d0 * cc_space_v_voov(a,u,i,beta) - cc_space_v_ovov(u,a,i,beta)
enddo
enddo
enddo
enddo
!$omp end do
!$omp end parallel
call dgemv('T', nO*nV, nO*nV, &
1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), &
t1 , 1, &
1d0, r1 , 1)
deallocate(X_ovov)
integer :: iblock, block_size, nVmax integer :: iblock, block_size, nVmax
double precision, allocatable :: W_vvov(:,:,:,:), W_vvov_tmp(:,:,:,:), T_vvoo(:,:,:,:) double precision, allocatable :: W_vvov(:,:,:,:), W_vvov_tmp(:,:,:,:), T_vvoo(:,:,:,:)
block_size = 16
allocate(W_vvov(nV,nV,nO,block_size), W_vvov_tmp(nV,nO,nV,block_size), T_vvoo(nV,nV,nO,nO))
!$omp parallel &
!$omp private(u,i,b,a) &
!$omp default(shared)
!$omp do
do u = 1, nO
do i = 1, nO
do b = 1, nV
do a = 1, nV
T_vvoo(a,b,i,u) = tau(i,u,a,b)
enddo
enddo
enddo
enddo
!$omp end do
!$omp end parallel
do iblock = 1, nV, block_size
nVmax = min(block_size,nV-iblock+1)
call dgemm('T','N', nV*nO, nV*nVmax, cholesky_mo_num, 1.d0, &
cc_space_v_vo_chol , cholesky_mo_num, &
cc_space_v_vv_chol(1,1,iblock), cholesky_mo_num, &
0.d0, W_vvov_tmp, nV*nO)
!$omp parallel &
!$omp private(b,i,a,beta) &
!$omp default(shared)
do beta = 1, nVmax
do i = 1, nO
!$omp do
do b = 1, nV
do a = 1, nV
W_vvov(a,b,i,beta) = 2d0 * W_vvov_tmp(a,i,b,beta) - W_vvov_tmp(b,i,a,beta)
enddo
enddo
!$omp end do nowait
enddo
enddo
!$omp barrier
!$omp end parallel
call dgemm('T','N',nO,nVmax,nO*nV*nV, &
1d0, T_vvoo, nV*nV*nO, &
W_vvov, nO*nV*nV, &
1d0, r1(1,iblock), nO)
enddo
deallocate(W_vvov,T_vvoo)
double precision, allocatable :: W_oovo(:,:,:,:) double precision, allocatable :: W_oovo(:,:,:,:)
allocate(W_oovo(nO,nO,nV,nO)) allocate(W_oovo(nO,nO,nV,nO))
!$omp parallel & ! !$omp parallel &
!$omp shared(nO,nV,cc_space_v_vooo,W_oovo) & ! !$omp shared(nO,nV,cc_space_v_oovo,W_oovo) &
!$omp private(u,a,i,j) & ! !$omp private(u,a,i,j) &
!$omp default(none) ! !$omp default(none)
do u = 1, nO ! do u = 1, nO
!$omp do ! !$omp do
do a = 1, nV ! do a = 1, nV
do j = 1, nO ! do j = 1, nO
do i = 1, nO ! do i = 1, nO
W_oovo(i,j,a,u) = 2d0 * cc_space_v_vooo(a,u,i,j) - cc_space_v_vooo(a,u,j,i) ! W_oovo(i,j,a,u) = 2d0 * cc_space_v_oovo(i,j,a,u) - cc_space_v_oovo(j,i,a,u)
enddo ! enddo
enddo ! enddo
enddo ! enddo
!$omp end do nowait ! !$omp end do nowait
enddo ! enddo
!$omp end parallel ! !$omp end parallel
call dgemm('T','N', nO, nV, nO*nO*nV, & ! call dgemm('T','N', nO, nV, nO*nO*nV, &
-1d0, W_oovo, size(W_oovo,1) * size(W_oovo,2) * size(W_oovo,3), & ! -1d0, W_oovo, nO * nO * nV, &
tau , size(tau,1) * size(tau,2) * size(tau,3), & ! tau , nO * nO * nV, &
1d0, r1 , size(r1,1)) ! 1d0, r1 , nO)
!
deallocate(W_oovo) ! deallocate(W_oovo)
max_r1 = 0d0 max_r1 = 0d0
do a = 1, nV do a = 1, nV

529
devel/ccsd_gpu/diis.irp.f Normal file
View File

@ -0,0 +1,529 @@
! Code
subroutine diis_cc(all_err,all_t,sze,m,iter,t)

  implicit none

  BEGIN_DOC
  ! DIIS extrapolation of the amplitudes.
  !
  ! Only the min(m,iter) most recent vectors are used; the vector of the
  ! i-th most recent iteration is read from column m+1-i of all_err / all_t.
  !
  ! 1. B(i,j) = < err_i | err_j >, bordered by -1 in the last row and column
  !    (the bottom-right element stays 0).
  ! 2. Solve B.x = (0,...,0,-1) with dgesv; the first min(m,iter) components
  !    of x are the DIIS weights c.
  ! 3. t = sum_i c(i) * (all_t(:,m+1-i) + all_err(:,m+1-i))
  !
  ! The program is aborted if dgesv returns a non-zero info.
  END_DOC

  integer, intent(in)            :: m, iter, sze
  double precision, intent(in)   :: all_err(sze,m)
  double precision, intent(in)   :: all_t(sze,m)
  double precision, intent(out)  :: t(sze)

  double precision, allocatable  :: B(:,:), c(:), b_rhs(:)
  integer, allocatable           :: ipiv(:)
  double precision               :: dot_part
  integer                        :: n_hist, n_sys
  integer                        :: p, q, k
  integer                        :: info

  ! Number of stored iterations actually used, and size of the linear system
  n_hist = min(m,iter)
  n_sys  = n_hist + 1

  allocate(B(n_sys,n_sys), c(n_hist), b_rhs(n_sys))
  allocate(ipiv(m+1))

  B = 0d0

  ! Overlap of the error vectors. Every thread walks over all (p,q) pairs,
  ! accumulates its own share of the sum over k in dot_part and adds it to
  ! the shared matrix inside the critical section.
  !$OMP PARALLEL &
  !$OMP SHARED(B,m,n_hist,sze,all_err) &
  !$OMP PRIVATE(p,q,k,dot_part) &
  !$OMP DEFAULT(NONE)
  do q = 1, n_hist
    do p = 1, n_hist
      dot_part = 0d0
      !$OMP DO
      do k = 1, sze
        dot_part = dot_part + all_err(k,m+1-p) * all_err(k,m+1-q)
      enddo
      !$OMP END DO NOWAIT
      !$OMP CRITICAL
      B(p,q) = B(p,q) + dot_part
      !$OMP END CRITICAL
    enddo
  enddo
  !$OMP END PARALLEL

  ! Border of the DIIS matrix
  B(1:n_hist,n_sys) = -1d0
  B(n_sys,1:n_hist) = -1d0

  ! Right-hand side (0 0 .... 0 -1)
  b_rhs        = 0d0
  b_rhs(n_sys) = -1d0

  ! Solve B.x = b_rhs, the solution overwrites b_rhs
  call dgesv(n_sys, 1, B, n_sys, ipiv, b_rhs, n_sys, info)

  if (info /= 0) then
    print*,'DIIS error in dgesv:', info
    call abort
  endif

  ! The weights are the first n_hist components of the solution
  c = b_rhs(1:n_hist)

  ! New amplitudes
  !$OMP PARALLEL &
  !$OMP SHARED(t,c,m,all_err,all_t,sze,n_hist) &
  !$OMP PRIVATE(p,k) &
  !$OMP DEFAULT(NONE)
  !$OMP DO
  do k = 1, sze
    t(k) = 0d0
  enddo
  !$OMP END DO
  do p = 1, n_hist
    !$OMP DO
    do k = 1, sze
      t(k) = t(k) + c(p) * (all_t(k,m+1-p) + all_err(k,m+1-p))
    enddo
    !$OMP END DO
  enddo
  !$OMP END PARALLEL

  deallocate(ipiv,B,c,b_rhs)

end
! Update all err
subroutine update_all_err(err,all_err,sze,m,iter)

  implicit none

  BEGIN_DOC
  ! Push a new error vector into the history of error vectors.
  ! Each column of all_err is overwritten by the next one (the content of
  ! the first column is lost) and err is stored in the last column, m.
  ! iter is not used by this routine.
  END_DOC

  integer, intent(in)             :: m, iter, sze
  double precision, intent(in)    :: err(sze)
  double precision, intent(inout) :: all_err(sze,m)

  integer                         :: col, k

  !$OMP PARALLEL &
  !$OMP SHARED(m,all_err,err,sze) &
  !$OMP PRIVATE(col,k) &
  !$OMP DEFAULT(NONE)

  ! Shift: column col+1 -> column col. The implicit barrier at the end of
  ! each work-shared loop guarantees that a column is completely copied
  ! before it gets overwritten in the next pass.
  do col = 1, m-1
    !$OMP DO
    do k = 1, sze
      all_err(k,col) = all_err(k,col+1)
    enddo
    !$OMP END DO
  enddo

  ! Newest vector in the last column
  !$OMP DO
  do k = 1, sze
    all_err(k,m) = err(k)
  enddo
  !$OMP END DO

  !$OMP END PARALLEL

end
! Update all t
subroutine update_all_t(t,all_t,sze,m,iter)

  implicit none

  BEGIN_DOC
  ! Push a new amplitude vector into the history of amplitude vectors.
  ! Each column of all_t is overwritten by the next one (the content of
  ! the first column is lost) and t is stored in the last column, m.
  ! iter is not used by this routine.
  END_DOC

  integer, intent(in)             :: m, iter, sze
  double precision, intent(in)    :: t(sze)
  double precision, intent(inout) :: all_t(sze,m)

  integer                         :: col, k

  !$OMP PARALLEL &
  !$OMP SHARED(m,all_t,t,sze) &
  !$OMP PRIVATE(col,k) &
  !$OMP DEFAULT(NONE)

  ! Shift: column col+1 -> column col. The implicit barrier at the end of
  ! each work-shared loop guarantees that a column is completely copied
  ! before it gets overwritten in the next pass.
  do col = 1, m-1
    !$OMP DO
    do k = 1, sze
      all_t(k,col) = all_t(k,col+1)
    enddo
    !$OMP END DO
  enddo

  ! Newest vector in the last column
  !$OMP DO
  do k = 1, sze
    all_t(k,m) = t(k)
  enddo
  !$OMP END DO

  !$OMP END PARALLEL

end
! Err1
subroutine compute_err1(nO,nV,f_o,f_v,r1,err1)

  implicit none

  BEGIN_DOC
  ! Error vector for the t1 amplitudes:
  ! err1(i,a) = - r1(i,a) / (f_o(i) - f_v(a) - cc_level_shift)
  END_DOC

  integer, intent(in)           :: nO, nV
  double precision, intent(in)  :: f_o(nO), f_v(nV), r1(nO,nV)
  double precision, intent(out) :: err1(nO,nV)

  integer                       :: i, a
  double precision              :: e_denom

  !$OMP PARALLEL &
  !$OMP SHARED(err1,r1,f_o,f_v,nO,nV,cc_level_shift) &
  !$OMP PRIVATE(i,a,e_denom) &
  !$OMP DEFAULT(NONE)
  !$OMP DO
  do a = 1, nV
    do i = 1, nO
      ! Level-shifted denominator
      e_denom   = f_o(i) - f_v(a) - cc_level_shift
      err1(i,a) = - r1(i,a) / e_denom
    enddo
  enddo
  !$OMP END DO
  !$OMP END PARALLEL

end
! Err2
subroutine compute_err2(nO,nV,f_o,f_v,r2,err2)

  implicit none

  BEGIN_DOC
  ! Error vector for the t2 amplitudes:
  ! err2(i,j,a,b) = - r2(i,j,a,b) / (f_o(i) + f_o(j) - f_v(a) - f_v(b) - cc_level_shift)
  END_DOC

  integer, intent(in)           :: nO, nV
  double precision, intent(in)  :: f_o(nO), f_v(nV), r2(nO,nO,nV,nV)
  double precision, intent(out) :: err2(nO,nO,nV,nV)

  integer                       :: i, j, a, b
  double precision              :: e_denom

  ! The three outer loops are collapsed, so nothing may be placed between
  ! them: the denominator is evaluated in the innermost loop.
  !$OMP PARALLEL &
  !$OMP SHARED(err2,r2,f_o,f_v,nO,nV,cc_level_shift) &
  !$OMP PRIVATE(i,j,a,b,e_denom) &
  !$OMP DEFAULT(NONE)
  !$OMP DO collapse(3)
  do b = 1, nV
    do a = 1, nV
      do j = 1, nO
        do i = 1, nO
          e_denom       = f_o(i) + f_o(j) - f_v(a) - f_v(b) - cc_level_shift
          err2(i,j,a,b) = - r2(i,j,a,b) / e_denom
        enddo
      enddo
    enddo
  enddo
  !$OMP END DO
  !$OMP END PARALLEL

end
! Update t
subroutine update_t_ccsd(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2)

  implicit none

  BEGIN_DOC
  ! Update the t1 and t2 amplitudes from the residuals r1 and r2.
  !
  ! cc_update_method == 'diis' : t1 is updated by update_t1 and t2 by DIIS
  !                              (all_err2 and all_t2 hold the DIIS history
  !                              and are updated here).
  ! cc_update_method == 'none' : t1 and t2 are updated by update_t1 and
  !                              update_t2.
  ! Any other value            : an error message is printed and the
  !                              program is aborted, since the amplitudes
  !                              would otherwise never be updated.
  !
  ! all_err1 and all_t1 are only needed by the disabled DIIS on t1.
  END_DOC

  integer, intent(in)             :: nO,nV,nb_iter
  double precision, intent(in)    :: f_o(nO), f_v(nV)
  double precision, intent(in)    :: r1(nO,nV), r2(nO,nO,nV,nV)

  double precision, intent(inout) :: t1(nO,nV), t2(nO,nO,nV,nV)
  double precision, intent(inout) :: all_err1(nO*nV, cc_diis_depth), all_err2(nO*nO*nV*nV, cc_diis_depth)
  double precision, intent(inout) :: all_t1(nO*nV, cc_diis_depth), all_t2(nO*nO*nV*nV, cc_diis_depth)

  double precision, allocatable   :: err2(:,:,:,:)
  double precision, allocatable   :: tmp_err2(:), tmp_t2(:)

  if (cc_update_method == 'diis') then

    allocate(err2(nO,nO,nV,nV))
    allocate(tmp_err2(nO*nO*nV*nV), tmp_t2(nO*nO*nV*nV))

    ! DIIS T1, it is not always good since the t1 can be small
    ! That's why there is a call to update the t1 in the standard way.
    ! To re-enable it, err1(nO,nV), tmp_err1(nO*nV) and tmp_t1(nO*nV) have
    ! to be declared and allocated again.
    !call compute_err1(nO,nV,f_o,f_v,r1,err1)
    !tmp_err1 = reshape(err1,(/nO*nV/))
    !tmp_t1   = reshape(t1  ,(/nO*nV/))
    !call update_all_err(tmp_err1,all_err1,nO*nV,cc_diis_depth,nb_iter+1)
    !call update_all_t  (tmp_t1  ,all_t1  ,nO*nV,cc_diis_depth,nb_iter+1)
    !call diis_cc(all_err1,all_t1,nO*nV,cc_diis_depth,nb_iter+1,tmp_t1)
    !t1 = reshape(tmp_t1 ,(/nO,nV/))
    call update_t1(nO,nV,f_o,f_v,r1,t1)

    ! DIIS T2
    ! T2 error tensor
    call compute_err2(nO,nV,f_o,f_v,r2,err2)

    ! Errors and amplitudes as vectors
    tmp_err2 = reshape(err2,(/nO*nO*nV*nV/))
    tmp_t2   = reshape(t2  ,(/nO*nO*nV*nV/))

    ! Add the error and amplitude vectors to those of the previous iterations
    call update_all_err(tmp_err2,all_err2,nO*nO*nV*nV,cc_diis_depth,nb_iter+1)
    call update_all_t  (tmp_t2  ,all_t2  ,nO*nO*nV*nV,cc_diis_depth,nb_iter+1)

    ! DIIS and reshape T as a tensor
    call diis_cc(all_err2,all_t2,nO*nO*nV*nV,cc_diis_depth,nb_iter+1,tmp_t2)
    t2 = reshape(tmp_t2 ,(/nO,nO,nV,nV/))

    deallocate(tmp_t2,tmp_err2,err2)

  ! Standard update as T = T - Delta
  elseif (cc_update_method == 'none') then

    call update_t1(nO,nV,f_o,f_v,r1,t1)
    call update_t2(nO,nV,f_o,f_v,r2,t2)

  else

    ! Without an update the amplitudes stay frozen and the iterations can
    ! not converge: stop here, as diis_cc does on a failed dgesv.
    print*,'Unknown cc_update_method: '//trim(cc_update_method)
    call abort

  endif

end
! Update t v2
subroutine update_t_ccsd_diis(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2)

  implicit none

  BEGIN_DOC
  ! DIIS update of the t1 and t2 amplitudes taken together.
  !
  ! The error vectors of t1 and t2 are computed and pushed, with the current
  ! amplitudes, into the separate histories (all_err1, all_t1, all_err2,
  ! all_t2). The histories are then concatenated (t1 part first, t2 part
  ! after it) and a single DIIS extrapolation is done on the concatenated
  ! vectors. The result is split back into t1 and t2.
  END_DOC

  integer, intent(in)             :: nO,nV,nb_iter
  double precision, intent(in)    :: f_o(nO), f_v(nV)
  double precision, intent(in)    :: r1(nO,nV), r2(nO,nO,nV,nV)

  double precision, intent(inout) :: t1(nO,nV), t2(nO,nO,nV,nV)
  double precision, intent(inout) :: all_err1(nO*nV, cc_diis_depth), all_err2(nO*nO*nV*nV, cc_diis_depth)
  double precision, intent(inout) :: all_t1(nO*nV, cc_diis_depth), all_t2(nO*nO*nV*nV, cc_diis_depth)

  double precision, allocatable   :: all_t(:,:), all_err(:,:), tmp_t(:)
  double precision, allocatable   :: err1(:,:), err2(:,:,:,:)
  double precision, allocatable   :: tmp_err1(:), tmp_err2(:)
  double precision, allocatable   :: tmp_t1(:), tmp_t2(:)
  integer                         :: i, j
  integer                         :: len_t1, len_t2, len_all

  ! Lengths of the flattened t1, t2 and concatenated vectors
  len_t1  = nO*nV
  len_t2  = nO*nO*nV*nV
  len_all = len_t1 + len_t2

  allocate(all_err(len_all,cc_diis_depth), all_t(len_all,cc_diis_depth))
  allocate(tmp_t(len_all))
  allocate(err1(nO,nV), err2(nO,nO,nV,nV))
  allocate(tmp_err1(len_t1), tmp_err2(len_t2))
  allocate(tmp_t1(len_t1), tmp_t2(len_t2))

  ! Errors, then errors and amplitudes as vectors
  call compute_err1(nO,nV,f_o,f_v,r1,err1)
  call compute_err2(nO,nV,f_o,f_v,r2,err2)

  tmp_err1 = reshape(err1,(/len_t1/))
  tmp_err2 = reshape(err2,(/len_t2/))
  tmp_t1   = reshape(t1  ,(/len_t1/))
  tmp_t2   = reshape(t2  ,(/len_t2/))

  ! Push them into the separate histories
  call update_all_err(tmp_err1,all_err1,len_t1,cc_diis_depth,nb_iter+1)
  call update_all_t  (tmp_t1  ,all_t1  ,len_t1,cc_diis_depth,nb_iter+1)

  call update_all_err(tmp_err2,all_err2,len_t2,cc_diis_depth,nb_iter+1)
  call update_all_t  (tmp_t2  ,all_t2  ,len_t2,cc_diis_depth,nb_iter+1)

  ! Concatenate the histories. All the copies are independent from each
  ! other, so no barrier is needed before the end of the parallel region.
  !$OMP PARALLEL &
  !$OMP SHARED(len_t1,len_t2,all_err,all_err1,all_err2,cc_diis_depth,&
  !$OMP        all_t,all_t1,all_t2) &
  !$OMP PRIVATE(i,j) &
  !$OMP DEFAULT(NONE)
  do j = 1, cc_diis_depth
    !$OMP DO
    do i = 1, len_t1
      all_err(i,j) = all_err1(i,j)
      all_t(i,j)   = all_t1(i,j)
    enddo
    !$OMP END DO NOWAIT
    !$OMP DO
    do i = 1, len_t2
      all_err(i+len_t1,j) = all_err2(i,j)
      all_t(i+len_t1,j)   = all_t2(i,j)
    enddo
    !$OMP END DO NOWAIT
  enddo
  !$OMP END PARALLEL

  ! DIIS on the concatenated vectors
  call diis_cc(all_err,all_t,len_all,cc_diis_depth,nb_iter+1,tmp_t)

  ! Split the result
  !$OMP PARALLEL &
  !$OMP SHARED(len_t1,len_t2,tmp_t,tmp_t1,tmp_t2) &
  !$OMP PRIVATE(i) &
  !$OMP DEFAULT(NONE)
  !$OMP DO
  do i = 1, len_t1
    tmp_t1(i) = tmp_t(i)
  enddo
  !$OMP END DO NOWAIT
  !$OMP DO
  do i = 1, len_t2
    tmp_t2(i) = tmp_t(i+len_t1)
  enddo
  !$OMP END DO
  !$OMP END PARALLEL

  ! Back to tensors
  t1 = reshape(tmp_t1 ,(/nO,nV/))
  t2 = reshape(tmp_t2 ,(/nO,nO,nV,nV/))

  deallocate(tmp_t,tmp_t1,tmp_t2,tmp_err1,tmp_err2,err1,err2,all_t,all_err)

end
! Update t v3
subroutine update_t_ccsd_diis_v3(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err,all_t)

  implicit none

  BEGIN_DOC
  ! DIIS update of the t1 and t2 amplitudes taken together, working directly
  ! on concatenated histories: in each column of all_err / all_t the first
  ! nO*nV elements belong to t1 and the following nO*nO*nV*nV to t2.
  ! t1 and t2 are handled here as flat vectors.
  END_DOC

  integer, intent(in)             :: nO,nV,nb_iter
  double precision, intent(in)    :: f_o(nO), f_v(nV)
  double precision, intent(in)    :: r1(nO,nV), r2(nO,nO,nV,nV)

  double precision, intent(inout) :: t1(nO*nV), t2(nO*nO*nV*nV)
  double precision, intent(inout) :: all_err(nO*nV+nO*nO*nV*nV, cc_diis_depth)
  double precision, intent(inout) :: all_t(nO*nV+nO*nO*nV*nV, cc_diis_depth)

  double precision, allocatable   :: tmp(:)
  integer                         :: i
  integer                         :: len_t1, len_t2, len_all

  ! Lengths of the flattened t1, t2 and concatenated vectors
  len_t1  = nO*nV
  len_t2  = nO*nO*nV*nV
  len_all = len_t1 + len_t2

  allocate(tmp(len_all))

  ! Error vectors, written one after the other in tmp
  call compute_err1(nO,nV,f_o,f_v,r1,tmp(1:len_t1))
  call compute_err2(nO,nV,f_o,f_v,r2,tmp(len_t1+1:len_all))

  ! Push the errors into the history
  call update_all_err(tmp,all_err,len_all,cc_diis_depth,nb_iter+1)

  ! Current amplitudes, concatenated in tmp
  !$OMP PARALLEL &
  !$OMP SHARED(len_t1,len_t2,tmp,t1,t2) &
  !$OMP PRIVATE(i) &
  !$OMP DEFAULT(NONE)
  !$OMP DO
  do i = 1, len_t1
    tmp(i) = t1(i)
  enddo
  !$OMP END DO NOWAIT
  !$OMP DO
  do i = 1, len_t2
    tmp(i+len_t1) = t2(i)
  enddo
  !$OMP END DO
  !$OMP END PARALLEL

  ! Push the amplitudes into the history
  call update_all_t(tmp,all_t,len_all,cc_diis_depth,nb_iter+1)

  ! DIIS, the extrapolated amplitudes are returned in tmp
  call diis_cc(all_err,all_t,len_all,cc_diis_depth,nb_iter+1,tmp)

  ! Split the result
  !$OMP PARALLEL &
  !$OMP SHARED(len_t1,len_t2,tmp,t1,t2) &
  !$OMP PRIVATE(i) &
  !$OMP DEFAULT(NONE)
  !$OMP DO
  do i = 1, len_t1
    t1(i) = tmp(i)
  enddo
  !$OMP END DO NOWAIT
  !$OMP DO
  do i = 1, len_t2
    t2(i) = tmp(i+len_t1)
  enddo
  !$OMP END DO
  !$OMP END PARALLEL

  deallocate(tmp)

end

View File

@ -0,0 +1,12 @@
subroutine det_energy(det,energy)

  implicit none

  BEGIN_DOC
  ! Energy of a determinant: diag_H_mat_elem(det,N_int) + nuclear_repulsion
  END_DOC

  ! The determinant is forwarded to diag_H_mat_elem together with N_int, so
  ! it is declared with the (N_int,2) shape of a determinant instead of a
  ! scalar. Callers passing an array or an array element are unaffected.
  integer(bit_kind), intent(in) :: det(N_int,2)
  double precision, intent(out) :: energy

  double precision, external    :: diag_H_mat_elem

  energy = diag_H_mat_elem(det,N_int) + nuclear_repulsion

end

View File

@ -43,6 +43,292 @@ void gpu_upload(gpu_data* data,
} }
/*
 * Compute the H_oo intermediate on device (igpu % ngpus) and copy the result
 * into data[i].H_oo of every other device.
 *
 * With the index arithmetic used below (assuming column-major storage,
 * tau_x(nO,nO,nV,nV), cc_space_v_ov_chol(cholesky_mo_num,nO,nV) and
 * cc_space_v_vo_chol(cholesky_mo_num,nV,nO) -- TODO confirm against
 * gpu_upload):
 *
 *   tau_kau(k,a,u) = sum_{i,b} v_ov_chol(k,i,b) * tau_x(u,i,a,b)
 *   H_oo(u,i)      = f_oo(u,i) + sum_{k,a} tau_kau(k,a,u) * v_vo_chol(k,a,i)
 *
 * data : array of per-device data, indexed by the device number
 * igpu : device on which the computation is done (taken modulo the number
 *        of devices)
 *
 * The current device on return is the one the computation was run on.
 */
void compute_h_oo_chol_gpu(gpu_data* data, int igpu)
{
  int ngpus = 1;
  if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
  igpu = igpu % ngpus;

  const int cholesky_mo_num = data[igpu].cholesky_mo_num;
  const int nO = data[igpu].nO;
  const int nV = data[igpu].nV;

  cudaSetDevice(igpu);

  int m,n,k, lda, ldb, ldc;
  double alpha, beta;
  double* A;
  double* B;
  double* C;
  cudaStream_t stream[nV];

  cublasHandle_t handle;
  cublasCreate(&handle);

  double* d_H_oo = data[igpu].H_oo;
  double* d_tau_x = data[igpu].tau_x;
  double* d_cc_space_f_oo = data[igpu].cc_space_f_oo;
  double* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol;
  double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;

  /* Sizes and offsets are computed in size_t: products of three dimensions
     can exceed the range of int. */
  double* d_tau_kau;
  cudaMalloc((void **)&d_tau_kau, (size_t)cholesky_mo_num*nV*nO * sizeof(double));

  double* d_tmp_ovv;
  cudaMalloc((void **)&d_tmp_ovv, (size_t)nO*nV*nV * sizeof(double));

  double* d_tmp_vov;
  cudaMalloc((void **)&d_tmp_vov, (size_t)nV*nO*nV * sizeof(double));

  for (int i=0 ; i<nV ; ++i) {
    cudaStreamCreate(&(stream[i]));
  }

  alpha = 1.0;
  beta  = 0.0;
  for (int u=0 ; u<nO ; ++u) {

    /* Strided copy: tmp_ovv(i,a,b) = tau_x(u,i,a,b) */
    cublasDcopy(handle, nO*nV*nV, &(d_tau_x[u]), nO, d_tmp_ovv, 1);

    /* One transposition per b, each on its own stream:
       tmp_vov(a,i,b) = tmp_ovv(i,a,b) */
    for (int b=0 ; b<nV ; ++b) {
      cublasSetStream(handle, stream[b]);
      A = &(d_tmp_ovv[(size_t)nO*nV*b]); lda = nO;
      B = &(d_tmp_ovv[(size_t)nO*nV*b]); ldb = nO;
      C = &(d_tmp_vov[(size_t)nV*nO*b]); ldc = nV;
      cublasDgeam(handle, CUBLAS_OP_T, CUBLAS_OP_T, nV, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
    }
    cudaDeviceSynchronize();
    cublasSetStream(handle, NULL);

    /* tau_kau(:,:,u) = v_ov_chol . tmp_vov^T */
    alpha = 1.0;
    beta  = 0.0;
    m=cholesky_mo_num; n=nV; k=nO*nV;
    A=d_cc_space_v_ov_chol; lda=cholesky_mo_num;
    B=d_tmp_vov; ldb=nV;
    C=&(d_tau_kau[(size_t)cholesky_mo_num*nV*u]); ldc=cholesky_mo_num;
    cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
  }

  for (int i=0 ; i<nV ; ++i) {
    cudaStreamDestroy(stream[i]);
  }
  cudaFree(d_tmp_vov);
  cudaFree(d_tmp_ovv);

  /* H_oo = f_oo + tau_kau^T . v_vo_chol */
  cublasDcopy(handle, nO*nO, d_cc_space_f_oo, 1, d_H_oo, 1);

  alpha = 1.0;
  beta  = 1.0;
  m=nO; n=nO; k=cholesky_mo_num*nV;
  A=d_tau_kau; lda=cholesky_mo_num*nV;
  B=d_cc_space_v_vo_chol; ldb=cholesky_mo_num*nV;
  C=d_H_oo; ldc=nO;
  cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);

  cudaFree(d_tau_kau);

  /* Broadcast the result to the other devices through the host */
  double* H_oo = malloc((size_t)nO*nO*sizeof(double));
  cublasGetMatrix(nO, nO, sizeof(double), d_H_oo, nO, H_oo, nO);
  for (int i=0 ; i<ngpus ; ++i) {
    if (i != igpu) {
      double* d_H_oo_other = data[i].H_oo;
      cudaSetDevice(i);
      cublasSetMatrix(nO, nO, sizeof(double), H_oo, nO, d_H_oo_other, nO);
    }
  }
  free(H_oo);

  /* The broadcast changed the current device: come back to the device the
     handle was created on before destroying it, and leave that device
     current for the caller. */
  cudaSetDevice(igpu);
  cublasDestroy(handle);
}
/*
 * Compute the H_vo intermediate on device (igpu % ngpus) and copy the result
 * into data[i].H_vo of every other device.
 *
 * With the index arithmetic used below (assuming column-major storage,
 * t1(nO,nV), cc_space_v_ov_chol(cholesky_mo_num,nO,nV) and
 * cc_space_v_vo_chol(cholesky_mo_num,nV,nO) -- TODO confirm against
 * gpu_upload):
 *
 *   tmp_k(k)    = 2 sum_{i,a} v_ov_chol(k,i,a) * t1(i,a)
 *   tmp(k,i,j)  = sum_a v_ov_chol(k,i,a) * t1(j,a)
 *   H_vo(a,i)   = f_vo(a,i) + sum_k v_vo_chol(k,a,i) * tmp_k(k)
 *                           - sum_{k,j} v_ov_chol(k,j,a) * tmp(k,i,j)
 *
 * data : array of per-device data, indexed by the device number
 * igpu : device on which the computation is done (taken modulo the number
 *        of devices)
 *
 * The current device on return is the one the computation was run on.
 */
void compute_h_vo_chol_gpu(gpu_data* data, int igpu)
{
  int ngpus = 1;
  if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
  igpu = igpu % ngpus;

  const int cholesky_mo_num = data[igpu].cholesky_mo_num;
  const int nO = data[igpu].nO;
  const int nV = data[igpu].nV;

  cudaSetDevice(igpu);

  int m,n,k, lda, ldb, ldc;
  double alpha, beta;
  double* A;
  double* B;
  double* C;
  /* The streams are indexed by an occupied index below, so nO of them are
     needed (nV streams would be overrun whenever nO > nV). */
  cudaStream_t stream[nO];

  cublasHandle_t handle;
  cublasCreate(&handle);

  double* d_t1 = data[igpu].t1;
  double* d_H_vo = data[igpu].H_vo;
  double* d_cc_space_f_vo = data[igpu].cc_space_f_vo;
  double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;
  double* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol;

  /* H_vo = f_vo */
  cublasDcopy(handle, nV*nO, d_cc_space_f_vo, 1, d_H_vo, 1);

  /* tmp_k = 2 v_ov_chol . t1   (t1 seen as a vector of length nO*nV) */
  double* d_tmp_k;
  cudaMalloc((void **)&d_tmp_k, (size_t)cholesky_mo_num * sizeof(double));

  alpha = 2.0;
  beta  = 0.0;
  m=cholesky_mo_num; n=1; k=nO*nV;
  A=d_cc_space_v_ov_chol; lda=cholesky_mo_num;
  B=d_t1; ldb=nO*nV;
  C=d_tmp_k; ldc=cholesky_mo_num;
  cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);

  /* H_vo += v_vo_chol^T . tmp_k */
  alpha = 1.0;
  beta  = 1.0;
  m=nV*nO; n=1; k=cholesky_mo_num;
  A=d_cc_space_v_vo_chol; lda=cholesky_mo_num;
  B=d_tmp_k; ldb=cholesky_mo_num;
  C=d_H_vo; ldc=nV*nO;
  cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);

  cudaFree(d_tmp_k);

  /* tmp(k,i,j) = sum_a v_ov_chol(k,i,a) t1(j,a) */
  double* d_tmp;
  cudaMalloc((void **)&d_tmp, (size_t)cholesky_mo_num*nO*nO * sizeof(double));

  alpha = 1.0;
  beta  = 0.0;
  m=cholesky_mo_num*nO; n=nO; k=nV;
  A=d_cc_space_v_ov_chol; lda=cholesky_mo_num*nO;
  B=d_t1; ldb=nO;
  C=d_tmp; ldc=cholesky_mo_num*nO;
  cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);

  /* tmp2(k,j,i) = tmp(k,i,j): one copy of length cholesky_mo_num per (i,j) */
  double* d_tmp2;
  cudaMalloc((void **)&d_tmp2, (size_t)cholesky_mo_num*nO*nO * sizeof(double));

  for (int i=0 ; i<nO ; ++i) {
    cudaStreamCreate(&(stream[i]));
  }

  for (int i=0 ; i<nO ; ++i) {
    for (int j=0 ; j<nO ; ++j) {
      cublasSetStream(handle, stream[j]);
      cublasDcopy(handle, cholesky_mo_num, &(d_tmp [(size_t)cholesky_mo_num*(i+(size_t)nO*j)]), 1,
                                           &(d_tmp2[(size_t)cholesky_mo_num*(j+(size_t)nO*i)]), 1);
    }
  }

  /* Wait for all the copies before the streams and d_tmp are released and
     d_tmp2 is read. */
  cudaDeviceSynchronize();

  for (int i=0 ; i<nO ; ++i) {
    cudaStreamDestroy(stream[i]);
  }
  cublasSetStream(handle, NULL);

  cudaFree(d_tmp);

  /* H_vo -= v_ov_chol^T . tmp2 */
  alpha = -1.0;
  beta  =  1.0;
  m=nV; n=nO; k=cholesky_mo_num*nO;
  A=d_cc_space_v_ov_chol; lda=cholesky_mo_num*nO;
  B=d_tmp2; ldb=cholesky_mo_num*nO;
  C=d_H_vo; ldc=nV;
  cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);

  cudaFree(d_tmp2);

  /* Broadcast the result to the other devices through the host */
  double* H_vo = malloc((size_t)nV*nO*sizeof(double));
  cublasGetMatrix(nV, nO, sizeof(double), d_H_vo, nV, H_vo, nV);
  for (int i=0 ; i<ngpus ; ++i) {
    if (i != igpu) {
      double* d_H_vo_other = data[i].H_vo;
      cudaSetDevice(i);
      cublasSetMatrix(nV, nO, sizeof(double), H_vo, nV, d_H_vo_other, nV);
    }
  }
  free(H_vo);

  /* The broadcast changed the current device: come back to the device the
     handle was created on before destroying it, and leave that device
     current for the caller. */
  cudaSetDevice(igpu);
  cublasDestroy(handle);
}
/*
 * Compute the H_vv intermediate on device (igpu % ngpus) and copy the result
 * into data[i].H_vv of every other device.
 *
 * With the index arithmetic used below (assuming column-major storage,
 * tau_x(nO,nO,nV,nV) and cc_space_v_ov_chol(cholesky_mo_num,nO,nV) --
 * TODO confirm against gpu_upload):
 *
 *   tau_kia(k,i,a) = sum_{j,b} v_ov_chol(k,j,b) * tau_x(i,j,a,b)
 *   H_vv(a,b)      = f_vv(a,b) - sum_{k,i} tau_kia(k,i,a) * v_ov_chol(k,i,b)
 *
 * data : array of per-device data, indexed by the device number
 * igpu : device on which the computation is done (taken modulo the number
 *        of devices)
 *
 * The current device on return is the one the computation was run on.
 */
void compute_h_vv_chol_gpu(gpu_data* data, int igpu)
{
  int ngpus = 1;
  if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
  igpu = igpu % ngpus;

  const int cholesky_mo_num = data[igpu].cholesky_mo_num;
  const int nO = data[igpu].nO;
  const int nV = data[igpu].nV;

  cudaSetDevice(igpu);

  int m,n,k, lda, ldb, ldc;
  double alpha, beta;
  double* A;
  double* B;
  double* C;
  cudaStream_t stream[nV];

  cublasHandle_t handle;
  cublasCreate(&handle);

  double* d_H_vv = data[igpu].H_vv;
  double* d_tau_x = data[igpu].tau_x;
  double* d_cc_space_f_vv = data[igpu].cc_space_f_vv;
  double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;

  /* Sizes and offsets are computed in size_t: the offset into tau_x goes up
     to nO*nO*nV*nV, which can exceed the range of int. */
  double* d_tau_kia;
  cudaMalloc((void **)&d_tau_kia, (size_t)cholesky_mo_num*nO*nV * sizeof(double));

  double* d_tmp_oov;
  cudaMalloc((void **)&d_tmp_oov, (size_t)nO*nO*nV * sizeof(double));

  alpha = 1.0;
  beta  = 0.0;
  for (int i=0 ; i<nV ; ++i) {
    cudaStreamCreate(&(stream[i]));
  }

  for (int a=0 ; a<nV ; ++a) {

    /* tmp_oov(:,:,b) = tau_x(:,:,a,b), one copy per b on its own stream */
    for (int b=0 ; b<nV ; ++b) {
      cublasSetStream(handle, stream[b]);
      cublasDcopy(handle, nO*nO, &(d_tau_x[(size_t)nO*nO*(a+(size_t)nV*b)]), 1,
                                 &(d_tmp_oov[(size_t)nO*nO*b]), 1);
    }
    cudaDeviceSynchronize();
    cublasSetStream(handle, NULL);

    /* tau_kia(:,:,a) = v_ov_chol . tmp_oov^T */
    alpha = 1.0;
    beta  = 0.0;
    m=cholesky_mo_num; n=nO; k=nO*nV;
    A=d_cc_space_v_ov_chol; lda=cholesky_mo_num;
    B=d_tmp_oov; ldb=nO;
    C=&(d_tau_kia[(size_t)cholesky_mo_num*nO*a]); ldc=cholesky_mo_num;
    cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
  }

  for (int i=0 ; i<nV ; ++i) {
    cudaStreamDestroy(stream[i]);
  }
  cudaFree(d_tmp_oov);

  /* H_vv = f_vv - tau_kia^T . v_ov_chol */
  cublasDcopy(handle, nV*nV, d_cc_space_f_vv, 1, d_H_vv, 1);

  alpha = -1.0;
  beta  =  1.0;
  m=nV; n=nV; k=cholesky_mo_num*nO;
  A=d_tau_kia; lda=cholesky_mo_num*nO;
  B=d_cc_space_v_ov_chol; ldb=cholesky_mo_num*nO;
  C=d_H_vv; ldc=nV;
  cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);

  cudaFree(d_tau_kia);

  /* Broadcast the result to the other devices through the host */
  double* H_vv = malloc((size_t)nV*nV*sizeof(double));
  cublasGetMatrix(nV, nV, sizeof(double), d_H_vv, nV, H_vv, nV);
  for (int i=0 ; i<ngpus ; ++i) {
    if (i != igpu) {
      double* d_H_vv_other = data[i].H_vv;
      cudaSetDevice(i);
      cublasSetMatrix(nV, nV, sizeof(double), H_vv, nV, d_H_vv_other, nV);
    }
  }
  free(H_vv);

  /* The broadcast changed the current device: come back to the device the
     handle was created on before destroying it, and leave that device
     current for the caller. */
  cudaSetDevice(igpu);
  cublasDestroy(handle);
}
void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, double* r2, double* max_r2) void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, double* r2, double* max_r2)
{ {
const int cholesky_mo_num = data->cholesky_mo_num; const int cholesky_mo_num = data->cholesky_mo_num;
@ -1294,7 +1580,6 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
for (size_t bet=iblock ; bet<(nV < iblock+BLOCK_SIZE ? nV : iblock+BLOCK_SIZE) ; ++bet) for (size_t bet=iblock ; bet<(nV < iblock+BLOCK_SIZE ? nV : iblock+BLOCK_SIZE) ; ++bet)
{ {
alpha = 1.0; alpha = 1.0;
beta = 0.0; beta = 0.0;
A = &(d_tmpB1[nV*(bet-iblock)]); lda = nV*BLOCK_SIZE; A = &(d_tmpB1[nV*(bet-iblock)]); lda = nV*BLOCK_SIZE;
@ -1344,15 +1629,19 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
} }
void compute_h_oo_chol_gpu(gpu_data* data, int nO, int nV, int igpu, double* H_oo)
void compute_r1_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, double* r1, double* max_r1)
{ {
const int cholesky_mo_num = data->cholesky_mo_num;
int ngpus = 1; int ngpus = 1;
if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus); if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
igpu = igpu % ngpus;
const int cholesky_mo_num = data[igpu].cholesky_mo_num;
cudaSetDevice(igpu);
#pragma omp parallel num_threads(ngpus)
{
int m,n,k, lda, ldb, ldc; int m,n,k, lda, ldb, ldc;
double alpha, beta; double alpha, beta;
double* A; double* A;
@ -1360,238 +1649,248 @@ void compute_h_oo_chol_gpu(gpu_data* data, int nO, int nV, int igpu, double* H_o
double* C; double* C;
cudaStream_t stream[nV]; cudaStream_t stream[nV];
int igpu = omp_get_thread_num();
cudaSetDevice(igpu);
cublasHandle_t handle; cublasHandle_t handle;
cublasCreate(&handle); cublasCreate(&handle);
double* d_H_oo = data[igpu].H_oo; double* d_r1;
double* d_tau_x = data[igpu].tau_x; lda = nO ;
double* d_cc_space_f_oo = data[igpu].cc_space_f_oo; cudaMalloc((void **)&d_r1, lda * nV * sizeof(double));
cudaMemset(d_r1, 0, nO*nV*sizeof(double));
memset(r1, 0, nO*nV*sizeof(double));
double* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol; double* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol;
double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol; double* d_cc_space_v_vv_chol = data[igpu].cc_space_v_vv_chol;
double* d_cc_space_v_oovo = data[igpu].cc_space_v_oovo;
double* d_tau_kau; double* d_cc_space_v_ovov = data[igpu].cc_space_v_ovov;
cudaMalloc((void **)&d_tau_kau, cholesky_mo_num*nV*nO * sizeof(double)); double* d_cc_space_v_voov = data[igpu].cc_space_v_voov;
double* d_cc_space_f_ov = data[igpu].cc_space_f_ov;
double* d_tmp_ovv;
cudaMalloc((void **)&d_tmp_ovv, nO*nV*nV * sizeof(double));
double* d_tmp_vov;
cudaMalloc((void **)&d_tmp_vov, nV*nO*nV * sizeof(double));
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
alpha = 1.0;
beta = 0.0;
for (int u=0 ; u<nO ; ++u) {
cublasDcopy(handle, nO*nV*nV, &(d_tau_x[u]), nO, d_tmp_ovv, 1);
for (int b=0 ; b<nV ; ++b) {
cublasSetStream(handle, stream[b]);
A = &(d_tmp_ovv[nO*nV*b]); lda = nO;
B = &(d_tmp_ovv[nO*nV*b]); ldb = nO;
C = &(d_tmp_vov[nV*nO*b]); ldc = nV;
cublasDgeam(handle, CUBLAS_OP_T, CUBLAS_OP_T, nV, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
cudaDeviceSynchronize();
cublasSetStream(handle, NULL);
alpha = 1.0;
beta = 0.0;
m=cholesky_mo_num; n=nV; k=nO*nV;
A=d_cc_space_v_ov_chol; lda=cholesky_mo_num;
B=d_tmp_vov; ldb=nV;
C=&(d_tau_kau[cholesky_mo_num*nV*u]); ldc=cholesky_mo_num;
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
}
for (int i=0 ; i<nV ; ++i) {
cudaStreamDestroy(stream[i]);
}
cudaFree(d_tmp_vov);
cudaFree(d_tmp_ovv);
cublasDcopy(handle, nO*nO, d_cc_space_f_oo, 1, d_H_oo, 1);
alpha = 1.0;
beta = 1.0;
m=nO; n=nO; k=cholesky_mo_num*nV;
A=d_tau_kau; lda=cholesky_mo_num*nV;
B=d_cc_space_v_vo_chol; ldb=cholesky_mo_num*nV;
C=d_H_oo; ldc=nO;
cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
cudaFree(d_tau_kau);
// double* H_oo = malloc(nO*nO*sizeof(double));
cublasGetMatrix(nO, nO, sizeof(double), d_H_oo, nO, H_oo, nO);
for (int i=0 ; i<ngpus ; ++i) {
if (i != igpu) {
double* d_H_oo = data[i].H_oo;
cudaSetDevice(i);
cublasSetMatrix(nO, nO, sizeof(double), H_oo, nO, d_H_oo, nO);
}
}
// free(H_oo);
cublasDestroy(handle);
}
// Builds the virtual-virtual intermediate H_vv on one GPU from the Cholesky
// vectors and tau_x, copies it to the host buffer H_vv, and then uploads it
// to data[i].H_vv of every other GPU.
//
// In column-major index notation (as used by cuBLAS):
//   tau_kia(k,i,a) = sum_{j,b} cc_space_v_ov_chol(k,j,b) * tau_x(i,j,a,b)
//   H_vv(a,b)      = cc_space_f_vv(a,b)
//                    - sum_{k,i} tau_kia(k,i,a) * cc_space_v_ov_chol(k,i,b)
//
// data : per-GPU array of device pointers (indexed by GPU id)
// nO   : first two extents of tau_x / second extent of the ov Cholesky vectors
// nV   : last two extents of tau_x; H_vv is nV x nV
// igpu : requested GPU; reduced modulo the number of GPUs in use
// H_vv : host buffer of nV*nV doubles, supplied by the caller (output)
//
// No CUDA/cuBLAS return code is checked in this function.
void compute_h_vv_chol_gpu(gpu_data* data, int nO, int nV, int igpu, double* H_vv)
{
// Only one GPU is used unless MULTIGPU == 1
int ngpus = 1;
if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
igpu = igpu % ngpus;
const int cholesky_mo_num = data[igpu].cholesky_mo_num;
cudaSetDevice(igpu);
int m,n,k, lda, ldb, ldc;
double alpha, beta;
double* A;
double* B;
double* C;
// One stream per value of the index b (variable-length array of nV streams)
cudaStream_t stream[nV];
cublasHandle_t handle;
cublasCreate(&handle);
// Device pointers of the selected GPU
double* d_H_vv = data[igpu].H_vv;
double* d_tau_x = data[igpu].tau_x;
double* d_cc_space_f_vv = data[igpu].cc_space_f_vv;
double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;
// Scratch buffers: tau_kia (cholesky_mo_num x nO x nV) and one
// (nO x nO x nV) slice of tau_x at fixed a
double* d_tau_kia;
cudaMalloc((void **)&d_tau_kia, cholesky_mo_num*nO*nV * sizeof(double));
double* d_tmp_oov;
cudaMalloc((void **)&d_tmp_oov, nO*nO*nV * sizeof(double));
alpha = 1.0;
beta = 0.0;
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
for (int a=0 ; a<nV ; ++a) {
// Gather tmp_oov(:,:,b) = tau_x(:,:,a,b); each b is copied on its own stream
for (int b=0 ; b<nV ; ++b) {
cublasSetStream(handle, stream[b]);
cublasDcopy(handle, nO*nO, &(d_tau_x[nO*nO*(a+nV*b)]), 1, &(d_tmp_oov[nO*nO*b]), 1);
}
// Wait for all copies before the GEMM reads d_tmp_oov, then go back to
// the default stream
cudaDeviceSynchronize();
cublasSetStream(handle, NULL);
alpha = 1.0;
beta = 0.0;
// tau_kia(:,:,a) = cc_space_v_ov_chol [k x (j,b)] * tmp_oov^T [(j,b) x i]
m=cholesky_mo_num; n=nO; k=nO*nV;
A=d_cc_space_v_ov_chol; lda=cholesky_mo_num;
B=d_tmp_oov; ldb=nO;
C=&(d_tau_kia[cholesky_mo_num*nO*a]); ldc=cholesky_mo_num;
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
}
for (int i=0 ; i<nV ; ++i) {
cudaStreamDestroy(stream[i]);
}
cudaFree(d_tmp_oov);
// Start from H_vv = cc_space_f_vv ...
cublasDcopy(handle, nV*nV, d_cc_space_f_vv, 1, d_H_vv, 1);
// ... and subtract tau_kia^T [a x (k,i)] * cc_space_v_ov_chol [(k,i) x b]
alpha = -1.0;
beta = 1.0;
m=nV; n=nV; k=cholesky_mo_num*nO;
A=d_tau_kia; lda=cholesky_mo_num*nO;
B=d_cc_space_v_ov_chol; ldb=cholesky_mo_num*nO;
C=d_H_vv; ldc=nV;
cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
cudaFree(d_tau_kia);
// Disabled: the host buffer H_vv is provided by the caller instead
// double* H_vv = malloc(nO*nO*sizeof(double));
// Download the result to the host
cublasGetMatrix(nV, nV, sizeof(double), d_H_vv, nV, H_vv, nV);
// Upload the host copy to every other GPU
for (int i=0 ; i<ngpus ; ++i) {
if (i != igpu) {
double* d_H_vv = data[i].H_vv;
cudaSetDevice(i);
cublasSetMatrix(nV, nV, sizeof(double), H_vv, nV, d_H_vv, nV);
}
}
// free(H_vv);
// NOTE(review): when ngpus > 1 the current device is whatever cudaSetDevice
// selected last in the loop above; it is not reset to igpu before the handle
// is destroyed or before returning -- confirm this is intended.
cublasDestroy(handle);
}
void compute_h_vo_chol_gpu(gpu_data* data, int nO, int nV, int igpu, double* H_vo)
{
int ngpus = 1;
if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
igpu = igpu % ngpus;
const int cholesky_mo_num = data[igpu].cholesky_mo_num;
cudaSetDevice(igpu);
int m,n,k, lda, ldb, ldc;
double alpha, beta;
double* A;
double* B;
double* C;
cudaStream_t stream[nV];
cublasHandle_t handle;
cublasCreate(&handle);
double* d_t1 = data[igpu].t1;
double* d_H_vo = data[igpu].H_vo;
double* d_tau_x = data[igpu].tau_x;
double* d_cc_space_f_vo = data[igpu].cc_space_f_vo; double* d_cc_space_f_vo = data[igpu].cc_space_f_vo;
double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol; double* d_tau = data[igpu].tau;
double* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol; double* d_t1 = data[igpu].t1;
double* d_t2 = data[igpu].t2;
double* d_H_oo = data[igpu].H_oo;
double* d_H_vo = data[igpu].H_vo;
double* d_H_vv = data[igpu].H_vv;
cublasDcopy(handle, nV*nO, d_cc_space_f_vo, 1, d_H_vo, 1); #pragma omp sections
{
double* d_tmp_k; #pragma omp section
cudaMalloc((void **)&d_tmp_k, cholesky_mo_num * sizeof(double)); {
cublasDcopy(handle, nO*nV, d_cc_space_f_ov, 1, d_r1, 1);
alpha = 2.0; double* d_X_oo;
cudaMalloc((void **)&d_X_oo, nO*nO * sizeof(double));
alpha = -2.0;
beta = 0.0; beta = 0.0;
m=cholesky_mo_num; n=1; k=nO*nV; m=nO; n=nO; k=nV;
A=d_cc_space_v_ov_chol; lda=cholesky_mo_num; A=d_t1; lda=nO;
B=d_t1; ldb=nO*nV; B=d_cc_space_f_vo; ldb=nV;
C=d_tmp_k; ldc=cholesky_mo_num; C=d_X_oo; ldc=nO;
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc); cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
alpha = 1.0; alpha = 1.0;
beta = 1.0; beta = 1.0;
m=nV*nO; n=1; k=cholesky_mo_num; m=nO; n=nV; k=nO;
A=d_cc_space_v_vo_chol; lda=cholesky_mo_num; A=d_X_oo; lda=nO;
B=d_tmp_k; ldb=cholesky_mo_num; B=d_t1; ldb=nO;
C=d_H_vo; ldc=nV*nO; C=d_r1; ldc=nO;
cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc); cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
cudaFree(d_tmp_k); cudaFree(d_X_oo);
}
double* d_tmp;
cudaMalloc((void **)&d_tmp, cholesky_mo_num*nO*nO * sizeof(double));
#pragma omp section
{
alpha = 1.0; alpha = 1.0;
beta = 0.0; beta = 1.0;
m=cholesky_mo_num*nO; n=nO; k=nV; m=nO; n=nV; k=nV;
A=d_cc_space_v_ov_chol; lda=cholesky_mo_num*nO; A=d_t1; lda=nO;
B=d_t1; ldb=nO; B=d_H_vv; ldb=nV;
C=d_tmp; ldc=cholesky_mo_num*nO; C=d_r1; ldc=nO;
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc); cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
}
double* d_tmp2; #pragma omp section
cudaMalloc((void **)&d_tmp2, cholesky_mo_num*nO*nO * sizeof(double)); {
alpha = -1.0;
beta = 1.0;
m=nO; n=nV; k=nO;
A=d_H_oo; lda=nO;
B=d_t1; ldb=nO;
C=d_r1; ldc=nO;
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
}
#pragma omp section
{
double* d_X_voov;
cudaMalloc((void **)&d_X_voov, nV* nO* nO* nV * sizeof(double));
for (int i=0 ; i<nV ; ++i) { for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i])); cudaStreamCreate(&(stream[i]));
} }
alpha = -1.0;
for (int i=0 ; i<nO ; ++i) { for (int i=0 ; i<nO ; ++i) {
for (int j=0 ; j<nO ; ++j) { for (int bet=0 ; bet<nV ; ++bet) {
cublasSetStream(handle, stream[j]); cublasSetStream(handle, stream[bet]);
cublasDcopy(handle, cholesky_mo_num, &(d_tmp [cholesky_mo_num*(i+nO*j)]), 1, beta = t1[i+bet*nO];
&(d_tmp2[cholesky_mo_num*(j+nO*i)]), 1); A = &(d_t2[nO*(i+nO*nV*bet)]); lda = nO*nO;
B = &(d_t1[0]); ldb = nO;
C = &(d_X_voov[nV*(i+nO*nO*bet)]); ldc = nV*nO;
cublasDgeam(handle, CUBLAS_OP_T, CUBLAS_OP_T, nV, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
}
cudaDeviceSynchronize();
alpha = 1.0;
beta = 2.0;
for (int bet=0 ; bet<nV ; ++bet) {
cublasSetStream(handle, stream[bet]);
A = &(d_X_voov[nV*nO*nO*bet]); lda = nV;
B = &(d_t2[nO*nO*nV*bet]); ldb = nO*nO;
C = A ; ldc = lda;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nV, nO*nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
for (int i=0 ; i<nV ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
alpha = 1.0;
beta = 1.0;
m=nV*nO; n=nO*nV;
A=d_X_voov; lda=nV * nO;
B=d_H_vo; ldb=1;
C=d_r1; ldc=1;
cublasDgemv(handle, CUBLAS_OP_T, m, n, &alpha, A, lda, B, ldb, &beta, C, ldc);
cudaFree(d_X_voov);
}
#pragma omp section
{
double* d_X_ovov;
cudaMalloc((void **)&d_X_ovov, nO* nV* nO* nV * sizeof(double));
cublasDcopy(handle, nO*nV*nO*nV, d_cc_space_v_ovov, 1, d_X_ovov, 1);
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
alpha = -1.0;
beta = 2.0;
for (int u=0 ; u<nO ; ++u) {
for (int bet=0 ; bet<nV ; ++bet) {
cublasSetStream(handle, stream[bet]);
A = &(d_X_ovov[nO*nV*(u+nO*bet)]); lda = nO;
B = &(d_cc_space_v_voov[(nV*(u+nO*nO*bet))]); ldb = nV*nO;
C = A ; ldc = lda;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nO, nV, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
}
for (int i=0 ; i<nV ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
alpha = 1.0;
beta = 1.0;
m=nO*nV; n=nO*nV;
A=d_X_ovov; lda=nO * nV;
B=d_t1; ldb=1;
C=d_r1; ldc=1;
cublasDgemv(handle, CUBLAS_OP_T, m, n, &alpha, A, lda, B, ldb, &beta, C, ldc);
cudaFree(d_X_ovov);
}
#pragma omp section
{
double* d_T_vvoo;
cudaMalloc((void **)&d_T_vvoo, nV*nV*nO*nO * sizeof(double));
alpha = 0.0;
beta = 1.0;
A = d_T_vvoo; lda = nV*nV;
B = d_tau; ldb = nO*nO;
C = A ; ldc = lda;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nV*nV, nO*nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
double* d_W_vvov;
cudaMalloc((void **)&d_W_vvov, nV*nV*nO*BLOCK_SIZE * sizeof(double));
double* d_W_vvov_tmp;
cudaMalloc((void **)&d_W_vvov_tmp, nV*nO*nV*BLOCK_SIZE * sizeof(double));
for (int iblock=0 ; iblock<nV ; iblock += BLOCK_SIZE) {
const int mbs = BLOCK_SIZE < nV-iblock ? BLOCK_SIZE : nV-iblock;
alpha = 1.0;
beta = 0.0;
m=nV*nO; n=nV*mbs; k=cholesky_mo_num;
A=d_cc_space_v_vo_chol; lda=cholesky_mo_num;
B=&(d_cc_space_v_vv_chol[cholesky_mo_num*nV*iblock]); ldb=cholesky_mo_num;
C=d_W_vvov_tmp; ldc=nV*nO;
cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
alpha = 2.0;
beta = -1.0;
int kk=0;
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
for (int i=0 ; i<nO ; ++i) {
for (int bet=0 ; bet<mbs ; ++bet) {
cublasSetStream(handle, stream[kk]);
++kk;
if (kk >= nV) kk = 0;
A = &(d_W_vvov_tmp[nV*(i+nO*nV*bet)]); lda = nV*nO;
B = &(d_W_vvov_tmp[nV*(i+nO*nV*bet)]); ldb = nV*nO;
C = &(d_W_vvov[nV*nV*(i+nO*bet)]); ldc = nV;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nV, nV, &alpha, A, lda, &beta, B, ldb, C, ldc);
}
}
for (int i=0 ; i<nV ; ++i) {
cudaStreamDestroy(stream[i]);
}
cublasSetStream(handle, NULL);
alpha = 1.0;
beta = 1.0;
m=nO; n=mbs; k=nO*nV*nV;
A=d_T_vvoo; lda=nV*nV*nO;
B=d_W_vvov; ldb=nO*nV*nV;
C=&(d_r1[nO*iblock]); ldc=nO;
cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
}
cudaFree(d_W_vvov);
cudaFree(d_W_vvov_tmp);
cudaFree(d_T_vvoo);
}
#pragma omp section
{
double* d_W_oovo;
cudaMalloc((void **)&d_W_oovo, nO*nO*nV*nO * sizeof(double));
alpha = 2.0;
beta = -1.0;
for (int i=0 ; i<nV ; ++i) {
cudaStreamCreate(&(stream[i]));
}
for (int u=0 ; u<nO ; ++u) {
for (int a=0 ; a<nV ; ++a) {
cublasSetStream(handle, stream[a]);
A = &(d_cc_space_v_oovo[nO*nO*(a+nV*u)]); lda = nO;
B = &(d_cc_space_v_oovo[nO*nO*(a+nV*u)]); ldb = nO;
C = &(d_W_oovo[nO*nO*(a+nV*u)]); ldc = nO;
cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
} }
} }
for (int i=0 ; i<nV ; ++i) { for (int i=0 ; i<nV ; ++i) {
@ -1601,24 +1900,35 @@ void compute_h_vo_chol_gpu(gpu_data* data, int nO, int nV, int igpu, double* H_v
alpha = -1.0; alpha = -1.0;
beta = 1.0; beta = 1.0;
m=nV; n=nO; k=cholesky_mo_num*nO; m=nO; n=nV; k=nO*nO*nV;
A=d_cc_space_v_ov_chol; lda=cholesky_mo_num*nO; A=d_W_oovo; lda=nO * nO * nV;
B=d_tmp2; ldb=cholesky_mo_num*nO; B=d_tau; ldb=nO * nO * nV;
C=d_H_vo; ldc=nV; C=d_r1; ldc=nO;
cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc); cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
// double* H_vo = malloc(nO*nO*sizeof(double));
cublasGetMatrix(nV, nO, sizeof(double), d_H_vo, nV, H_vo, nV);
for (int i=0 ; i<ngpus ; ++i) {
if (i != igpu) {
double* d_H_vo = data[i].H_vo;
cudaSetDevice(i);
cublasSetMatrix(nV, nO, sizeof(double), H_vo, nV, d_H_vo, nV);
} }
} }
// free(H_vo);
double * r1_tmp = malloc(nO*nV*sizeof(double));
lda=nO;
cublasGetMatrix(nO, nV, sizeof(double), d_r1, lda, r1_tmp, lda);
#pragma omp critical
{
for (size_t i=0 ; i<(size_t) nO*nV ; ++i) {
r1[i] -= r1_tmp[i];
}
}
free(r1_tmp);
cudaFree(d_r1);
cublasDestroy(handle); cublasDestroy(handle);
}
*max_r1 = 0.;
for (size_t i=0 ; i<(size_t) nO*nV ; ++i) {
const double x = r1[i] > 0. ? r1[i] : -r1[i];
*max_r1 = *max_r1 > x ? *max_r1 : x;
}
} }

View File

@ -5,6 +5,7 @@ typedef struct {
double* cc_space_v_vv_chol; double* cc_space_v_vv_chol;
double* cc_space_v_oooo; double* cc_space_v_oooo;
double* cc_space_v_vooo; double* cc_space_v_vooo;
double* cc_space_v_voov;
double* cc_space_v_oovv; double* cc_space_v_oovv;
double* cc_space_v_vvoo; double* cc_space_v_vvoo;
double* cc_space_v_oovo; double* cc_space_v_oovo;
@ -12,6 +13,7 @@ typedef struct {
double* cc_space_v_ovov; double* cc_space_v_ovov;
double* cc_space_v_ovoo; double* cc_space_v_ovoo;
double* cc_space_f_oo; double* cc_space_f_oo;
double* cc_space_f_ov;
double* cc_space_f_vo; double* cc_space_f_vo;
double* cc_space_f_vv; double* cc_space_f_vv;
double* tau; double* tau;

View File

@ -10,12 +10,12 @@ gpu_data* gpu_init(
int nO, int nV, int cholesky_mo_num, int nO, int nV, int cholesky_mo_num,
double* cc_space_v_oo_chol, double* cc_space_v_ov_chol, double* cc_space_v_oo_chol, double* cc_space_v_ov_chol,
double* cc_space_v_vo_chol, double* cc_space_v_vv_chol, double* cc_space_v_vo_chol, double* cc_space_v_vv_chol,
double* cc_space_v_oooo, double* cc_space_v_vooo, double* cc_space_v_oooo, double* cc_space_v_vooo, double* cc_space_v_voov,
double* cc_space_v_oovv, double* cc_space_v_vvoo, double* cc_space_v_oovv, double* cc_space_v_vvoo,
double* cc_space_v_oovo, double* cc_space_v_ovvo, double* cc_space_v_oovo, double* cc_space_v_ovvo,
double* cc_space_v_ovov, double* cc_space_v_ovoo, double* cc_space_v_ovov, double* cc_space_v_ovoo,
double* cc_space_f_oo, double* cc_space_f_vo, double* cc_space_f_oo, double* cc_space_f_ov,
double* cc_space_f_vv) double* cc_space_f_vo, double* cc_space_f_vv)
{ {
int ngpus = 1; int ngpus = 1;
cudaGetDeviceCount(&ngpus); cudaGetDeviceCount(&ngpus);
@ -59,6 +59,10 @@ gpu_data* gpu_init(
cudaMalloc((void**)&d_cc_space_v_vooo, nV*nO*nO*nO*sizeof(double)); cudaMalloc((void**)&d_cc_space_v_vooo, nV*nO*nO*nO*sizeof(double));
cublasSetMatrix(nV*nO, nO*nO, sizeof(double), cc_space_v_vooo, nV*nO, d_cc_space_v_vooo, nV*nO); cublasSetMatrix(nV*nO, nO*nO, sizeof(double), cc_space_v_vooo, nV*nO, d_cc_space_v_vooo, nV*nO);
double* d_cc_space_v_voov;
cudaMalloc((void**)&d_cc_space_v_voov, nV*nO*nO*nV*sizeof(double));
cublasSetMatrix(nV*nO, nO*nV, sizeof(double), cc_space_v_voov, nV*nO, d_cc_space_v_voov, nV*nO);
double* d_cc_space_v_oovv; double* d_cc_space_v_oovv;
cudaMalloc((void**)&d_cc_space_v_oovv, nO*nO*nV*nV*sizeof(double)); cudaMalloc((void**)&d_cc_space_v_oovv, nO*nO*nV*nV*sizeof(double));
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), cc_space_v_oovv, nO*nO, d_cc_space_v_oovv, nO*nO); cublasSetMatrix(nO*nO, nV*nV, sizeof(double), cc_space_v_oovv, nO*nO, d_cc_space_v_oovv, nO*nO);
@ -95,6 +99,10 @@ gpu_data* gpu_init(
cudaMalloc((void**)&d_cc_space_f_vo, nV*nO*sizeof(double)); cudaMalloc((void**)&d_cc_space_f_vo, nV*nO*sizeof(double));
cublasSetMatrix(nV, nO, sizeof(double), cc_space_f_vo, nV, d_cc_space_f_vo, nV); cublasSetMatrix(nV, nO, sizeof(double), cc_space_f_vo, nV, d_cc_space_f_vo, nV);
double* d_cc_space_f_ov;
cudaMalloc((void**)&d_cc_space_f_ov, nV*nO*sizeof(double));
cublasSetMatrix(nO, nV, sizeof(double), cc_space_f_ov, nO, d_cc_space_f_ov, nO);
double* d_cc_space_f_vv; double* d_cc_space_f_vv;
cudaMalloc((void**)&d_cc_space_f_vv, nV*nV*sizeof(double)); cudaMalloc((void**)&d_cc_space_f_vv, nV*nV*sizeof(double));
cublasSetMatrix(nV, nV, sizeof(double), cc_space_f_vv, nV, d_cc_space_f_vv, nV); cublasSetMatrix(nV, nV, sizeof(double), cc_space_f_vv, nV, d_cc_space_f_vv, nV);
@ -128,6 +136,7 @@ gpu_data* gpu_init(
data[igpu].cc_space_v_vv_chol = d_cc_space_v_vv_chol; data[igpu].cc_space_v_vv_chol = d_cc_space_v_vv_chol;
data[igpu].cc_space_v_oooo = d_cc_space_v_oooo; data[igpu].cc_space_v_oooo = d_cc_space_v_oooo;
data[igpu].cc_space_v_vooo = d_cc_space_v_vooo; data[igpu].cc_space_v_vooo = d_cc_space_v_vooo;
data[igpu].cc_space_v_voov = d_cc_space_v_voov;
data[igpu].cc_space_v_oovv = d_cc_space_v_oovv; data[igpu].cc_space_v_oovv = d_cc_space_v_oovv;
data[igpu].cc_space_v_vvoo = d_cc_space_v_vvoo; data[igpu].cc_space_v_vvoo = d_cc_space_v_vvoo;
data[igpu].cc_space_v_oovo = d_cc_space_v_oovo; data[igpu].cc_space_v_oovo = d_cc_space_v_oovo;
@ -135,6 +144,7 @@ gpu_data* gpu_init(
data[igpu].cc_space_v_ovov = d_cc_space_v_ovov; data[igpu].cc_space_v_ovov = d_cc_space_v_ovov;
data[igpu].cc_space_v_ovoo = d_cc_space_v_ovoo; data[igpu].cc_space_v_ovoo = d_cc_space_v_ovoo;
data[igpu].cc_space_f_oo = d_cc_space_f_oo; data[igpu].cc_space_f_oo = d_cc_space_f_oo;
data[igpu].cc_space_f_ov = d_cc_space_f_ov;
data[igpu].cc_space_f_vo = d_cc_space_f_vo; data[igpu].cc_space_f_vo = d_cc_space_f_vo;
data[igpu].cc_space_f_vv = d_cc_space_f_vv; data[igpu].cc_space_f_vv = d_cc_space_f_vv;
data[igpu].tau = d_tau; data[igpu].tau = d_tau;

View File

@ -6,9 +6,9 @@ module gpu_module
interface interface
type(c_ptr) function gpu_init(nO, nV, cholesky_mo_num, & type(c_ptr) function gpu_init(nO, nV, cholesky_mo_num, &
cc_space_v_oo_chol, cc_space_v_ov_chol, cc_space_v_vo_chol, cc_space_v_vv_chol, & cc_space_v_oo_chol, cc_space_v_ov_chol, cc_space_v_vo_chol, cc_space_v_vv_chol, &
cc_space_v_oooo, cc_space_v_vooo, cc_space_v_oovv, cc_space_v_vvoo, & cc_space_v_oooo, cc_space_v_vooo, cc_space_v_voov, cc_space_v_oovv, cc_space_v_vvoo, &
cc_space_v_oovo, cc_space_v_ovvo, cc_space_v_ovov, cc_space_v_ovoo, & cc_space_v_oovo, cc_space_v_ovvo, cc_space_v_ovov, cc_space_v_ovoo, &
cc_space_f_oo, cc_space_f_vo, cc_space_f_vv) bind(C) cc_space_f_oo, cc_space_f_ov, cc_space_f_vo, cc_space_f_vv) bind(C)
import c_int, c_double, c_ptr import c_int, c_double, c_ptr
integer(c_int), intent(in), value :: nO, nV, cholesky_mo_num integer(c_int), intent(in), value :: nO, nV, cholesky_mo_num
real(c_double), intent(in) :: cc_space_v_oo_chol(cholesky_mo_num,nO,nO) real(c_double), intent(in) :: cc_space_v_oo_chol(cholesky_mo_num,nO,nO)
@ -17,6 +17,7 @@ module gpu_module
real(c_double), intent(in) :: cc_space_v_vv_chol(cholesky_mo_num,nV,nV) real(c_double), intent(in) :: cc_space_v_vv_chol(cholesky_mo_num,nV,nV)
real(c_double), intent(in) :: cc_space_v_oooo(nO,nO,nO,nO) real(c_double), intent(in) :: cc_space_v_oooo(nO,nO,nO,nO)
real(c_double), intent(in) :: cc_space_v_vooo(nV,nO,nO,nO) real(c_double), intent(in) :: cc_space_v_vooo(nV,nO,nO,nO)
real(c_double), intent(in) :: cc_space_v_voov(nV,nO,nO,nV)
real(c_double), intent(in) :: cc_space_v_oovv(nO,nO,nV,nV) real(c_double), intent(in) :: cc_space_v_oovv(nO,nO,nV,nV)
real(c_double), intent(in) :: cc_space_v_vvoo(nV,nV,nO,nO) real(c_double), intent(in) :: cc_space_v_vvoo(nV,nV,nO,nO)
real(c_double), intent(in) :: cc_space_v_oovo(nO,nO,nV,nO) real(c_double), intent(in) :: cc_space_v_oovo(nO,nO,nV,nO)
@ -24,6 +25,7 @@ module gpu_module
real(c_double), intent(in) :: cc_space_v_ovov(nO,nV,nO,nV) real(c_double), intent(in) :: cc_space_v_ovov(nO,nV,nO,nV)
real(c_double), intent(in) :: cc_space_v_ovoo(nO,nV,nO,nO) real(c_double), intent(in) :: cc_space_v_ovoo(nO,nV,nO,nO)
real(c_double), intent(in) :: cc_space_f_oo(nO,nO) real(c_double), intent(in) :: cc_space_f_oo(nO,nO)
real(c_double), intent(in) :: cc_space_f_ov(nO,nV)
real(c_double), intent(in) :: cc_space_f_vo(nV,nO) real(c_double), intent(in) :: cc_space_f_vo(nV,nO)
real(c_double), intent(in) :: cc_space_f_vv(nV,nV) real(c_double), intent(in) :: cc_space_f_vv(nV,nV)
end function end function
@ -38,25 +40,31 @@ module gpu_module
real(c_double), intent(in) :: tau_x(nO,nO,nV,nV) real(c_double), intent(in) :: tau_x(nO,nO,nV,nV)
end subroutine end subroutine
subroutine compute_H_oo_chol_gpu(gpu_data, nO, nV, igpu, H_oo) bind(C) subroutine compute_H_oo_chol_gpu(gpu_data, igpu) bind(C)
import c_int, c_double, c_ptr import c_int, c_double, c_ptr
type(c_ptr), value :: gpu_data type(c_ptr), value :: gpu_data
integer(c_int), intent(in), value :: nO, nV, igpu integer(c_int), intent(in), value :: igpu
real(c_double), intent(out) :: H_oo(nO,nO)
end subroutine end subroutine
subroutine compute_H_vo_chol_gpu(gpu_data, nO, nV, igpu, H_vo) bind(C) subroutine compute_H_vo_chol_gpu(gpu_data, igpu) bind(C)
import c_int, c_double, c_ptr import c_int, c_double, c_ptr
type(c_ptr), value :: gpu_data type(c_ptr), value :: gpu_data
integer(c_int), intent(in), value :: nO, nV, igpu integer(c_int), intent(in), value :: igpu
real(c_double), intent(out) :: H_vo(nV,nO)
end subroutine end subroutine
subroutine compute_H_vv_chol_gpu(gpu_data, nO, nV, igpu, H_vv) bind(C) subroutine compute_H_vv_chol_gpu(gpu_data, igpu) bind(C)
import c_int, c_double, c_ptr import c_int, c_double, c_ptr
type(c_ptr), value :: gpu_data type(c_ptr), value :: gpu_data
integer(c_int), intent(in), value :: nO, nV, igpu integer(c_int), intent(in), value :: igpu
real(c_double), intent(out) :: H_vv(nO,nO) end subroutine
subroutine compute_r1_space_chol_gpu(gpu_data, nO, nV, t1, r1, max_r1) bind(C)
import c_int, c_double, c_ptr
type(c_ptr), value :: gpu_data
integer(c_int), intent(in), value :: nO, nV
real(c_double), intent(in) :: t1(nO,nV)
real(c_double), intent(out) :: r1(nO,nO,nV,nV)
real(c_double), intent(out) :: max_r1
end subroutine end subroutine
subroutine compute_r2_space_chol_gpu(gpu_data, nO, nV, t1, r2, max_r2) bind(C) subroutine compute_r2_space_chol_gpu(gpu_data, nO, nV, t1, r2, max_r2) bind(C)

View File

@ -0,0 +1,208 @@
! T1
subroutine guess_t1(nO,nV,f_o,f_v,f_ov,t1)

  implicit none

  BEGIN_DOC
  ! Starting guess for the T1 amplitudes of CC, selected by cc_guess_t1:
  !   'none' -> all amplitudes set to zero
  !   'MP'   -> t1(i,a) = f_ov(i,a) / (f_o(i) - f_v(a) - cc_level_shift_guess)
  !   'read' -> delegated to read_t1
  ! Any other value prints an error and aborts.
  END_DOC

  ! in
  integer, intent(in)           :: nO, nV
  double precision, intent(in)  :: f_o(nO), f_v(nV), f_ov(nO,nV)

  ! out
  double precision, intent(out) :: t1(nO, nV)

  ! internal
  integer                       :: a

  select case (trim(cc_guess_t1))

  case ('none')
    t1 = 0d0

  case ('MP')
    ! One column (fixed a) at a time, all i at once
    do a = 1, nV
      t1(:,a) = f_ov(:,a) / (f_o(:) - f_v(a) - cc_level_shift_guess)
    enddo

  case ('read')
    call read_t1(nO,nV,t1)

  case default
    print*, 'Unknown cc_guess_t1 type: '//trim(cc_guess_t1)
    call abort

  end select

end
! T2
subroutine guess_t2(nO,nV,f_o,f_v,v_oovv,t2)

  implicit none

  BEGIN_DOC
  ! Starting guess for the T2 amplitudes of CC, selected by cc_guess_t2:
  !   'none' -> all amplitudes set to zero
  !   'MP'   -> t2(i,j,a,b) = v_oovv(i,j,a,b)
  !                           / (f_o(i) + f_o(j) - f_v(a) - f_v(b) - cc_level_shift_guess)
  !   'read' -> delegated to read_t2
  ! Any other value prints an error and aborts.
  END_DOC

  ! in
  integer, intent(in)           :: nO, nV
  double precision, intent(in)  :: f_o(nO), f_v(nV), v_oovv(nO, nO, nV, nV)

  ! out
  double precision, intent(out) :: t2(nO, nO, nV, nV)

  ! internal
  integer                       :: i,j,a,b

  if (trim(cc_guess_t2) == 'none') then
    t2 = 0d0
  else if (trim(cc_guess_t2) == 'MP') then
    do b = 1, nV
      do a = 1, nV
        do j = 1, nO
          do i = 1, nO
            t2(i,j,a,b) = v_oovv(i,j,a,b) / (f_o(i) + f_o(j) - f_v(a) - f_v(b) - cc_level_shift_guess)
          enddo
        enddo
      enddo
    enddo
  else if (trim(cc_guess_t2) == 'read') then
    call read_t2(nO,nV,t2)
  else
    ! The message must name the keyword that was actually tested (cc_guess_t2)
    print*, 'Unknown cc_guess_t2 type: '//trim(cc_guess_t2)
    call abort
  endif

end
! T1
subroutine write_t1(nO,nV,t1)

  implicit none

  BEGIN_DOC
  ! Dump the T1 amplitudes of CC to the file work/T1 inside the EZFIO
  ! directory. Nothing is written unless write_amplitudes is true.
  END_DOC

  ! in
  integer, intent(in)          :: nO, nV
  double precision, intent(in) :: t1(nO, nV)

  ! internal
  integer                      :: i, a, iunit
  integer, external            :: getunitandopen

  ! Writing was not requested: leave immediately
  if (.not. write_amplitudes) return

  iunit = getUnitAndOpen(trim(ezfio_filename)//'/work/T1','w')

  ! One amplitude per record, first index running fastest
  do a = 1, nV
    do i = 1, nO
      write(iunit,'(F20.12)') t1(i,a)
    enddo
  enddo

  close(iunit)

end
! T2
subroutine write_t2(nO,nV,t2)

  implicit none

  BEGIN_DOC
  ! Dump the T2 amplitudes of CC to the file work/T2 inside the EZFIO
  ! directory. Nothing is written unless write_amplitudes is true.
  END_DOC

  ! in
  integer, intent(in)          :: nO, nV
  double precision, intent(in) :: t2(nO, nO, nV, nV)

  ! internal
  integer                      :: i, j, a, b, iunit
  integer, external            :: getunitandopen

  ! Writing was not requested: leave immediately
  if (.not. write_amplitudes) return

  iunit = getUnitAndOpen(trim(ezfio_filename)//'/work/T2','w')

  ! One amplitude per record, first index running fastest
  do b = 1, nV
    do a = 1, nV
      do j = 1, nO
        do i = 1, nO
          write(iunit,'(F20.12)') t2(i,j,a,b)
        enddo
      enddo
    enddo
  enddo

  close(iunit)

end
! T1
subroutine read_t1(nO,nV,t1)

  implicit none

  BEGIN_DOC
  ! Read the T1 amplitudes of CC from the file work/T1 inside the EZFIO
  ! directory (one amplitude per record, first index running fastest).
  ! When read_amplitudes is false no file is accessed and t1 is returned
  ! filled with zeros, so that the intent(out) argument is always defined.
  END_DOC

  ! in
  integer, intent(in)           :: nO, nV

  ! out
  double precision, intent(out) :: t1(nO, nV)

  ! internal
  integer                       :: i, a, iunit
  integer, external             :: getunitandopen

  if (.not. read_amplitudes) then
    ! Reading is disabled: return a defined value instead of leaving t1 unset
    t1 = 0d0
    return
  endif

  iunit = getUnitAndOpen(trim(ezfio_filename)//'/work/T1','r')
  do a = 1, nV
    do i = 1, nO
      read(iunit,'(F20.12)') t1(i,a)
    enddo
  enddo
  close(iunit)

end
! T2
subroutine read_t2(nO,nV,t2)

  implicit none

  BEGIN_DOC
  ! Read the T2 amplitudes of CC from the file work/T2 inside the EZFIO
  ! directory (one amplitude per record, first index running fastest).
  ! When read_amplitudes is false no file is accessed and t2 is returned
  ! filled with zeros, so that the intent(out) argument is always defined.
  END_DOC

  ! in
  integer, intent(in)           :: nO, nV

  ! out
  double precision, intent(out) :: t2(nO, nO, nV, nV)

  ! internal
  integer                       :: i, j, a, b, iunit
  integer, external             :: getunitandopen

  if (.not. read_amplitudes) then
    ! Reading is disabled: return a defined value instead of leaving t2 unset
    t2 = 0d0
    return
  endif

  iunit = getUnitAndOpen(trim(ezfio_filename)//'/work/T2','r')
  do b = 1, nV
    do a = 1, nV
      do j = 1, nO
        do i = 1, nO
          read(iunit,'(F20.12)') t2(i,j,a,b)
        enddo
      enddo
    enddo
  enddo
  close(iunit)

end

View File

@ -0,0 +1,328 @@
! N spin orb
subroutine extract_n_spin(det,n)

  implicit none

  BEGIN_DOC
  ! Counts, for the determinant det, the occupied alpha, occupied beta,
  ! virtual alpha and virtual beta spin orbitals, ignoring the core and
  ! deleted orbitals. The result is stored as n(nOa,nOb,nVa,nVb).
  END_DOC

  integer(bit_kind), intent(in) :: det(N_int,2)
  integer, intent(out)          :: n(4)

  integer(bit_kind)             :: res(N_int,2)
  integer                       :: i, si
  logical                       :: ok, is_core, is_del

  n(:) = 0

  ! si = 1, 2 selects the spin part of det
  do si = 1, 2
    do i = 1, mo_num

      ! ok reports whether a hole could be applied on orbital i
      call apply_hole(det, si, i, res, ok, N_int)

      ! Core and deleted orbitals are not counted
      if (is_core(i) .or. is_del(i)) cycle

      if (.not. ok) then
        ! counted as virtual
        n(si+2) = n(si+2) + 1
      else
        ! counted as occupied
        n(si) = n(si) + 1
      endif

    enddo
  enddo

end
! Spin
subroutine extract_list_orb_spin(det,nO_m,nV_m,list_occ,list_vir)

  implicit none

  BEGIN_DOC
  ! Builds the lists of occupied and virtual spin orbitals of det,
  ! skipping the core and deleted orbitals:
  !   list_occ(:,1) / list_occ(:,2) : occupied alpha / beta
  !   list_vir(:,1) / list_vir(:,2) : virtual  alpha / beta
  ! nO_m must be max(nOa,nOb) and nV_m must be max(nVa,nVb).
  ! Entries that are not filled are left at 0.
  END_DOC

  integer, intent(in)           :: nO_m, nV_m
  integer(bit_kind), intent(in) :: det(N_int,2)
  integer, intent(out)          :: list_occ(nO_m,2), list_vir(nV_m,2)

  integer(bit_kind)             :: res(N_int,2)
  integer                       :: i, si, idx_o, idx_v
  logical                       :: ok, is_core, is_del

  list_occ(:,:) = 0
  list_vir(:,:) = 0

  do si = 1, 2

    ! Next free slot in each list for this spin
    idx_o = 1
    idx_v = 1

    do i = 1, mo_num

      ! ok reports whether a hole could be applied on orbital i
      call apply_hole(det, si, i, res, ok, N_int)

      ! Core and deleted orbitals are not listed
      if (is_core(i) .or. is_del(i)) cycle

      if (.not. ok) then
        ! stored as virtual
        list_vir(idx_v,si) = i
        idx_v = idx_v + 1
      else
        ! stored as occupied
        list_occ(idx_o,si) = i
        idx_o = idx_o + 1
      endif

    enddo
  enddo

end
! Space
subroutine extract_list_orb_space(det,nO,nV,list_occ,list_vir)

  implicit none

  BEGIN_DOC
  ! Builds the lists of occupied and virtual orbitals from the alpha part
  ! of det, skipping the core and deleted orbitals.
  ! Aborts when elec_alpha_num differs from elec_beta_num.
  ! Entries that are not filled are left at 0.
  END_DOC

  integer, intent(in)           :: nO, nV
  integer(bit_kind), intent(in) :: det(N_int,2)
  integer, intent(out)          :: list_occ(nO), list_vir(nV)

  integer(bit_kind)             :: res(N_int,2)
  integer                       :: i, idx_o, idx_v
  logical                       :: ok, is_core, is_del

  ! The alpha and beta electron counts must match
  if (elec_alpha_num /= elec_beta_num) then
    print*,'Error elec_alpha_num /= elec_beta_num, impossible to create cc_list_occ and cc_list_vir, abort'
    call abort
  endif

  list_occ(:) = 0
  list_vir(:) = 0

  ! Next free slot in each list
  idx_o = 1
  idx_v = 1

  do i = 1, mo_num

    ! Only the alpha part (spin index 1) is inspected
    call apply_hole(det, 1, i, res, ok, N_int)

    ! Core and deleted orbitals are not listed
    if (is_core(i) .or. is_del(i)) cycle

    if (.not. ok) then
      ! stored as virtual
      list_vir(idx_v) = i
      idx_v = idx_v + 1
    else
      ! stored as occupied
      list_occ(idx_o) = i
      idx_o = idx_o + 1
    endif

  enddo

end
! is_core
function is_core(i)

  implicit none

  BEGIN_DOC
  ! True if the orbital i appears among the first dim_list_core_orb
  ! entries of list_core, i.e. if it is a core orbital
  END_DOC

  integer, intent(in) :: i
  logical             :: is_core

  ! Membership test over the list of core orbitals
  is_core = any(list_core(1:dim_list_core_orb) == i)

end
! is_del
function is_del(i)

  implicit none

  BEGIN_DOC
  ! True if the orbital i appears among the first dim_list_del_orb
  ! entries of list_del, i.e. if it is a deleted orbital
  END_DOC

  integer, intent(in) :: i
  logical             :: is_del

  ! Membership test over the list of deleted orbitals
  is_del = any(list_del(1:dim_list_del_orb) == i)

end
! N orb
BEGIN_PROVIDER [integer, cc_nO_m]
&BEGIN_PROVIDER [integer, cc_nOa]
&BEGIN_PROVIDER [integer, cc_nOb]
&BEGIN_PROVIDER [integer, cc_nOab]
&BEGIN_PROVIDER [integer, cc_nV_m]
&BEGIN_PROVIDER [integer, cc_nVa]
&BEGIN_PROVIDER [integer, cc_nVb]
&BEGIN_PROVIDER [integer, cc_nVab]
&BEGIN_PROVIDER [integer, cc_n_mo]
&BEGIN_PROVIDER [integer, cc_nO_S, (2)]
&BEGIN_PROVIDER [integer, cc_nV_S, (2)]
implicit none
BEGIN_DOC
! Number of orbitals without core and deleted ones of the cc_ref det in psi_det
! a: alpha, b: beta
! nO_m: max(a,b) occupied
! nOa: nb a occupied
! nOb: nb b occupied
! nOab: nb a+b occupied
! nV_m: max(a,b) virtual
! nVa: nb a virtual
! nVb: nb b virtual
! nVab: nb a+b virtual
END_DOC
integer :: n_spin(4)
! Extract number of occ/vir alpha/beta spin orbitals
call extract_n_spin(psi_det(1,1,cc_ref),n_spin)
cc_nOa = n_spin(1)
cc_nOb = n_spin(2)
cc_nOab = cc_nOa + cc_nOb !n_spin(1) + n_spin(2)
cc_nO_m = max(cc_nOa,cc_nOb) !max(n_spin(1), n_spin(2))
cc_nVa = n_spin(3)
cc_nVb = n_spin(4)
cc_nVab = cc_nVa + cc_nVb !n_spin(3) + n_spin(4)
cc_nV_m = max(cc_nVa,cc_nVb) !max(n_spin(3), n_spin(4))
cc_n_mo = cc_nVa + cc_nVb !n_spin(1) + n_spin(3)
cc_nO_S = (/cc_nOa,cc_nOb/)
cc_nV_S = (/cc_nVa,cc_nVb/)
END_PROVIDER
! General
BEGIN_PROVIDER [integer, cc_list_gen, (cc_n_mo)]

  implicit none

  BEGIN_DOC
  ! Indices of the orbitals that are neither core nor deleted,
  ! stored in increasing order
  END_DOC

  integer :: i,j
  logical :: is_core, is_del

  ! j is the next free slot of cc_list_gen
  j = 1
  do i = 1, mo_num
    if (.not. (is_core(i) .or. is_del(i))) then
      cc_list_gen(j) = i
      j = j+1
    endif
  enddo

END_PROVIDER
! Space
BEGIN_PROVIDER [integer, cc_list_occ, (cc_nOa)]
&BEGIN_PROVIDER [integer, cc_list_vir, (cc_nVa)]
implicit none
BEGIN_DOC
! List of occupied and virtual spatial orbitals without core and deleted ones
END_DOC
call extract_list_orb_space(psi_det(1,1,cc_ref),cc_nOa,cc_nVa,cc_list_occ,cc_list_vir)
END_PROVIDER
! Spin
BEGIN_PROVIDER [integer, cc_list_occ_spin, (cc_nO_m,2)]
&BEGIN_PROVIDER [integer, cc_list_vir_spin, (cc_nV_m,2)]
&BEGIN_PROVIDER [logical, cc_ref_is_open_shell]
implicit none
BEGIN_DOC
! List of occupied and virtual spin orbitals without core and deleted ones
END_DOC
integer :: i
call extract_list_orb_spin(psi_det(1,1,cc_ref),cc_nO_m,cc_nV_m,cc_list_occ_spin,cc_list_vir_spin)
cc_ref_is_open_shell = .False.
do i = 1, cc_nO_m
if (cc_list_occ_spin(i,1) /= cc_list_occ_spin(i,2)) then
cc_ref_is_open_shell = .True.
endif
enddo
END_PROVIDER

137
devel/ccsd_gpu/phase.irp.f Normal file
View File

@ -0,0 +1,137 @@
! phase
subroutine get_phase_general(det1,det2,phase,degree,Nint)

  implicit none

  ! Phase and excitation degree between det1 and det2.
  ! Thin wrapper around get_excitation_general that discards the lists of
  ! annihilated and created orbitals.
  !
  ! in:  det1, det2 : determinants, dimensioned (Nint,2)
  !      Nint       : first extent of the determinants
  ! out: phase      : as returned by get_excitation_general
  !      degree     : as returned by get_excitation_general

  integer, intent(in)           :: Nint
  integer(bit_kind), intent(in) :: det1(Nint,2), det2(Nint,2)
  double precision, intent(out) :: phase
  integer, intent(out)          :: degree

  integer                       :: n(2)
  integer, allocatable          :: list_anni(:,:), list_crea(:,:)

  ! The callee declares both lists as (Nint*bit_kind_size,2): size them with
  ! the dummy argument Nint, not with the global N_int, so the actual and
  ! dummy shapes always agree.
  allocate(list_anni(Nint*bit_kind_size,2))
  allocate(list_crea(Nint*bit_kind_size,2))

  call get_excitation_general(det1,det2,degree,n,list_anni,list_crea,phase,Nint)

  deallocate(list_anni, list_crea)

end
! Get excitation general
subroutine get_excitation_general(det1,det2,degree,n,list_anni,list_crea,phase,Nint)

  use bitmasks

  implicit none

  ! Describes the excitation that transforms det1 into det2.
  !
  ! in:  det1, det2 : determinants, dimensioned (Nint,2)
  !      Nint       : first extent of the determinants
  ! out: degree     : total number of annihilated orbitals (both spins)
  !      n(2)       : number of annihilated orbitals for each spin
  !      list_anni  : orbitals set in det1 but not in det2, per spin
  !      list_crea  : orbitals set in det2 but not in det1, per spin
  !      phase      : (-1)**d, d being the summed distance between the
  !                   positions of the paired annihilated/created orbitals
  !
  ! Aborts if, for one spin, the numbers of created and annihilated
  ! orbitals differ.

  integer, intent(in)            :: Nint
  integer(bit_kind), intent(in)  :: det1(Nint,2), det2(Nint,2)
  double precision, intent(out)  :: phase
  integer, intent(out)           :: list_crea(Nint*bit_kind_size,2)
  integer, intent(out)           :: list_anni(Nint*bit_kind_size,2)
  integer, intent(out)           :: degree, n(2)

  integer, allocatable           :: l1(:,:), l2(:,:)
  integer(bit_kind), allocatable :: det_crea(:,:), det_anni(:,:)
  integer, allocatable           :: pos_anni(:,:), pos_crea(:,:)
  integer                        :: n1(2),n2(2),n_crea(2),n_anni(2),i,j,k,d

  allocate(l1(Nint*bit_kind_size,2), l2(Nint*bit_kind_size,2))
  allocate(det_crea(Nint,2), det_anni(Nint,2))

  ! Example on one bit string:
  !   det1              111010
  !   det2              110101
  !   not(det1)         000101
  !   and with det2     000101  -> created
  !   not(det2)         001010
  !   and with det1     001010  -> annihilated
  det_crea = iand(not(det1), det2)
  det_anni = iand(det1, not(det2))

  ! Bit strings -> lists of orbital indices, with their lengths per spin
  call bitstring_to_list_ab(det1,l1,n1,Nint)
  call bitstring_to_list_ab(det2,l2,n2,Nint)
  call bitstring_to_list_ab(det_crea,list_crea,n_crea,Nint)
  call bitstring_to_list_ab(det_anni,list_anni,n_anni,Nint)

  ! For each spin, as many orbitals must be created as annihilated
  if (any(n_crea /= n_anni)) then
    print*,'Well, it seems we have a problem here...'
    call abort
  endif

  ! Example:
  !   det1  11110011001   orbitals  1 2 3 4 7 8 11
  !                       positions 1 2 3 4 5 6 7
  !   det2  11100101011   orbitals  1 2 3 6 8 10 11
  !   anni  00010010000   orbitals 4 7,  positions in det1: 4 5
  !   crea  00000100010   orbitals 6 10, positions in det2: 4 6
  !   4 -> 6  : positions 4 -> 4
  !   7 -> 10 : positions 5 -> 6

  n = n_anni
  degree = n_anni(1) + n_anni(2)

  allocate(pos_anni(max(n(1),n(2)),2))
  allocate(pos_crea(max(n(1),n(2)),2))

  ! Position, inside the orbital list of det1, of each annihilated orbital
  do j = 1, 2
    k = 1
    do i = 1, n1(j)
      if (k > n_anni(j)) exit
      if (l1(i,j) == list_anni(k,j)) then
        pos_anni(k,j) = i
        k = k + 1
      endif
    enddo
  enddo

  ! Position, inside the orbital list of det2, of each created orbital
  do j = 1, 2
    k = 1
    do i = 1, n2(j)
      if (k > n_crea(j)) exit
      if (l2(i,j) == list_crea(k,j)) then
        pos_crea(k,j) = i
        k = k + 1
      endif
    enddo
  enddo

  ! The i-th annihilated orbital is paired with the i-th created one, so
  ! that the pairs do not cross; the phase is then given by the parity of
  ! the sum of the distances between the paired positions.
  d = 0
  do j = 1, 2
    d = d + sum(abs(pos_anni(1:n(j),j) - pos_crea(1:n(j),j)))
  enddo

  ! d >= 0, so (-1)**d only depends on the parity of d
  if (mod(d,2) == 0) then
    phase = 1d0
  else
    phase = -1d0
  endif

end

View File

@ -0,0 +1,73 @@
! T1
subroutine update_t1(nO,nV,f_o,f_v,r1,t1)

  implicit none

  BEGIN_DOC
  ! Update of the T1 amplitudes of CC from the residual r1:
  !   t1(i,a) <- t1(i,a) - r1(i,a) / (f_o(i) - f_v(a) - cc_level_shift)
  END_DOC

  ! in
  integer, intent(in)             :: nO, nV
  double precision, intent(in)    :: f_o(nO), f_v(nV), r1(nO, nV)

  ! inout
  double precision, intent(inout) :: t1(nO, nV)

  ! internal
  integer                         :: a

  ! The columns (fixed a) are distributed over the threads;
  ! each column is updated for all i at once.
  !$OMP PARALLEL DEFAULT(NONE) &
  !$OMP SHARED(nO,nV,t1,r1,cc_level_shift,f_o,f_v) &
  !$OMP PRIVATE(a)
  !$OMP DO
  do a = 1, nV
    t1(:,a) = t1(:,a) - r1(:,a) / (f_o(:) - f_v(a) - cc_level_shift)
  enddo
  !$OMP END DO
  !$OMP END PARALLEL

end
! T2
subroutine update_t2(nO,nV,f_o,f_v,r2,t2)

  implicit none

  BEGIN_DOC
  ! Update of the T2 amplitudes of CC from the residual r2:
  !   t2(i,j,a,b) <- t2(i,j,a,b)
  !                  - r2(i,j,a,b) / (f_o(i) + f_o(j) - f_v(a) - f_v(b) - cc_level_shift)
  END_DOC

  ! in
  integer, intent(in)             :: nO, nV
  double precision, intent(in)    :: f_o(nO), f_v(nV), r2(nO, nO, nV, nV)

  ! inout
  double precision, intent(inout) :: t2(nO, nO, nV, nV)

  ! internal
  integer                         :: j,a,b

  ! The outermost index b is distributed over the threads;
  ! the innermost index i is handled as an array section.
  !$OMP PARALLEL DEFAULT(NONE) &
  !$OMP SHARED(nO,nV,t2,r2,cc_level_shift,f_o,f_v) &
  !$OMP PRIVATE(j,a,b)
  !$OMP DO
  do b = 1, nV
    do a = 1, nV
      do j = 1, nO
        t2(:,j,a,b) = t2(:,j,a,b) - r2(:,j,a,b) / (f_o(:) + f_o(j) - f_v(a) - f_v(b) - cc_level_shift)
      enddo
    enddo
  enddo
  !$OMP END DO
  !$OMP END PARALLEL

end