mirror of
https://github.com/pfloos/quack
synced 2025-01-03 01:56:09 +01:00
fixed bug in CUDA implement of dRPA-sing
This commit is contained in:
parent
956ab28f58
commit
fcb11d662c
@ -29,7 +29,7 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,
|
||||
|
||||
! Local variables
|
||||
|
||||
integer :: i
|
||||
integer :: ia
|
||||
integer :: ispin
|
||||
logical :: dRPA
|
||||
double precision :: t1, t2
|
||||
@ -81,8 +81,12 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,
|
||||
|
||||
!call wall_time(t1)
|
||||
call phLR(TDA,nS,Aph,Bph,EcRPA(ispin),Om,XpY,XmY)
|
||||
!call wall_time(t2)
|
||||
call wall_time(t2)
|
||||
!print *, "wall time diag A on CPU (sec) = ", t2 - t1
|
||||
!do ia = 1, nS
|
||||
! write(112, *) Om(ia)
|
||||
!enddo
|
||||
!stop
|
||||
call print_excitation_energies('phRPA@RHF','singlet',nS,Om)
|
||||
call phLR_transition_vectors(.true.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY)
|
||||
|
||||
|
@ -29,7 +29,7 @@ subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC
|
||||
double precision,intent(in) :: dipole_int(nBas,nBas,ncart)
|
||||
|
||||
|
||||
integer :: i
|
||||
integer :: i, a, ia
|
||||
integer :: ispin
|
||||
logical :: dRPA
|
||||
double precision :: t1, t2
|
||||
@ -78,6 +78,10 @@ subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC
|
||||
call ph_drpa_sing(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om(1), XpY(1,1), XmY(1,1))
|
||||
!call wall_time(t2)
|
||||
!print*, 'diag time on GPU (sec):', t2 - t1
|
||||
!do ia = 1, nS
|
||||
! write(111, *) Om(ia)
|
||||
!enddo
|
||||
!stop
|
||||
|
||||
endif
|
||||
|
||||
@ -98,7 +102,7 @@ subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC
|
||||
do a = nO+1, nBas-nR
|
||||
ia = ia + 1
|
||||
iorder(ia) = ia
|
||||
Om(ia) = e(a) - e(i)
|
||||
Om(ia) = eHF(a) - eHF(i)
|
||||
XpY(ia,ia) = 1.d0
|
||||
enddo
|
||||
enddo
|
||||
|
@ -2,14 +2,14 @@
|
||||
#include <math.h>
|
||||
|
||||
|
||||
__global__ void elementwise_dsqrt_inplace_kernel(int nS, double *A) {
|
||||
__global__ void elementwise_dsqrt_inplace_kernel(int n, double *A) {
|
||||
|
||||
|
||||
int i;
|
||||
|
||||
i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
while(i < nS) {
|
||||
while(i < n) {
|
||||
|
||||
if(A[i] > 0.0) {
|
||||
|
||||
@ -30,10 +30,10 @@ __global__ void elementwise_dsqrt_inplace_kernel(int nS, double *A) {
|
||||
|
||||
|
||||
|
||||
extern "C" void elementwise_dsqrt_inplace(int nS, double *A) {
|
||||
extern "C" void elementwise_dsqrt_inplace(int n, double *A) {
|
||||
|
||||
int sBlocks = 32;
|
||||
int nBlocks = (nS + sBlocks - 1) / sBlocks;
|
||||
int nBlocks = (n + sBlocks - 1) / sBlocks;
|
||||
|
||||
dim3 dimGrid(nBlocks, 1, 1);
|
||||
dim3 dimBlock(sBlocks, 1, 1);
|
||||
@ -42,7 +42,7 @@ extern "C" void elementwise_dsqrt_inplace(int nS, double *A) {
|
||||
nBlocks, sBlocks);
|
||||
|
||||
|
||||
elementwise_dsqrt_inplace_kernel<<<dimGrid, dimBlock>>>(nS, A);
|
||||
elementwise_dsqrt_inplace_kernel<<<dimGrid, dimBlock>>>(n, A);
|
||||
|
||||
}
|
||||
|
||||
|
@ -9,6 +9,7 @@ __global__ void ph_dRPA_AmB_sing_kernel(int nO, int nV, int nBas, int nS, double
|
||||
int nBas2, nBas3;
|
||||
int i_A0, i_A1, i_A2;
|
||||
int i_I0, i_I1, i_I2;
|
||||
int i_J1, i_J2;
|
||||
|
||||
bool a_eq_b;
|
||||
|
||||
@ -33,17 +34,19 @@ __global__ void ph_dRPA_AmB_sing_kernel(int nO, int nV, int nBas, int nS, double
|
||||
|
||||
i_A1 = i_A0 + bb;
|
||||
i_I1 = i_I0 + b * nBas;
|
||||
i_J1 = i_I0 + b * nBas3;
|
||||
|
||||
i = 0;
|
||||
while(i < nO) {
|
||||
|
||||
i_A2 = i_A1 + i * nVS;
|
||||
i_I2 = i_I1 + i;
|
||||
i_J2 = i_J1 + i;
|
||||
|
||||
j = 0;
|
||||
while(j < nO) {
|
||||
|
||||
AmB[i_A2 + j * nV] = 2.0 * (ERI[i_I2 + j * nBas3] - ERI[i_I2 + j * nBas]);
|
||||
AmB[i_A2 + j * nV] = 2.0 * (ERI[i_I2 + j * nBas3] - ERI[i_J2 + j * nBas]);
|
||||
if(a_eq_b && (i==j)) {
|
||||
AmB[i_A2 + j * nV] += eps[a] - eps[i];
|
||||
}
|
||||
|
@ -9,6 +9,7 @@ __global__ void ph_dRPA_ApB_sing_kernel(int nO, int nV, int nBas, int nS, double
|
||||
int nBas2, nBas3;
|
||||
int i_A0, i_A1, i_A2;
|
||||
int i_I0, i_I1, i_I2;
|
||||
int i_J1, i_J2;
|
||||
|
||||
bool a_eq_b;
|
||||
|
||||
@ -33,17 +34,19 @@ __global__ void ph_dRPA_ApB_sing_kernel(int nO, int nV, int nBas, int nS, double
|
||||
|
||||
i_A1 = i_A0 + bb;
|
||||
i_I1 = i_I0 + b * nBas;
|
||||
i_J1 = i_I0 + b * nBas3;
|
||||
|
||||
i = 0;
|
||||
while(i < nO) {
|
||||
|
||||
i_A2 = i_A1 + i * nVS;
|
||||
i_I2 = i_I1 + i;
|
||||
i_J2 = i_J1 + i;
|
||||
|
||||
j = 0;
|
||||
while(j < nO) {
|
||||
|
||||
ApB[i_A2 + j * nV] = 2.0 * (ERI[i_I2 + j * nBas3] + ERI[i_I2 + j * nBas]);
|
||||
ApB[i_A2 + j * nV] = 2.0 * (ERI[i_I2 + j * nBas3] + ERI[i_J2 + j * nBas]);
|
||||
if(a_eq_b && (i==j)) {
|
||||
ApB[i_A2 + j * nV] += eps[a] - eps[i];
|
||||
}
|
||||
|
@ -80,8 +80,10 @@ void ph_drpa_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI,
|
||||
// diagonalize A-B
|
||||
int *d_info1 = NULL;
|
||||
check_Cuda_Errors(cudaMalloc((void**)&d_info1, sizeof(int)), "cudaMalloc", __FILE__, __LINE__);
|
||||
|
||||
double *d_Omega = NULL;
|
||||
check_Cuda_Errors(cudaMalloc((void**)&d_Omega, nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__);
|
||||
|
||||
cudaEventRecord(start, 0);
|
||||
diag_dn_dsyevd(nS, d_info1, d_Omega, d_AmB);
|
||||
check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__);
|
||||
@ -91,6 +93,7 @@ void ph_drpa_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI,
|
||||
printf("Time elapsed on diag AmB = %f msec\n", elapsedTime);
|
||||
|
||||
|
||||
|
||||
// d_Omega <-- d_Omega^{0.5}
|
||||
// TODO: nb of <= 0 elements
|
||||
cudaEventRecord(start, 0);
|
||||
@ -101,15 +104,17 @@ void ph_drpa_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI,
|
||||
printf("Time elapsed on elementwise_dsqrt_inplace %f msec\n", elapsedTime);
|
||||
|
||||
|
||||
// d_AmBSq = d_AmB (d_Omega)^{+0.5} (d_AmB)^T
|
||||
// d_AmBSqInv = d_AmB (d_Omega)^{-0.5} (d_AmB)^T
|
||||
cudaEventRecord(start, 0);
|
||||
// d_AmBSq = d_AmB (d_Omega)^{+0.5} (d_AmB)^T
|
||||
double *d_AmBSq = NULL;
|
||||
check_Cuda_Errors(cudaMalloc((void**)&d_AmBSq, nS * sizeof(double)),
|
||||
check_Cuda_Errors(cudaMalloc((void**)&d_AmBSq, nS2 * sizeof(double)),
|
||||
"cudaMalloc", __FILE__, __LINE__);
|
||||
|
||||
// d_AmBSqInv = d_AmB (d_Omega)^{-0.5} (d_AmB)^T
|
||||
double *d_AmBSqInv = NULL;
|
||||
check_Cuda_Errors(cudaMalloc((void**)&d_AmBSqInv, nS * sizeof(double)),
|
||||
check_Cuda_Errors(cudaMalloc((void**)&d_AmBSqInv, nS2 * sizeof(double)),
|
||||
"cudaMalloc", __FILE__, __LINE__);
|
||||
|
||||
cudaEventRecord(start, 0);
|
||||
A_D_At(nS, d_AmB, d_Omega, d_AmBSq);
|
||||
A_Dinv_At(nS, d_AmB, d_Omega, d_AmBSqInv);
|
||||
check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__);
|
||||
|
Loading…
Reference in New Issue
Block a user