From 6e634b07798ec1e577b77963d95a7244e2960d55 Mon Sep 17 00:00:00 2001 From: vijay gopal chilkuri Date: Mon, 15 Mar 2021 14:22:08 +0100 Subject: [PATCH] Improved memory access in sigma_vector_cfg. --- src/csf/sigma_vector.irp.f | 91 ++++++++++++------- .../diagonalization_hcsf_dressed.irp.f | 20 ++-- 2 files changed, 69 insertions(+), 42 deletions(-) diff --git a/src/csf/sigma_vector.irp.f b/src/csf/sigma_vector.irp.f index 2bcc7491..b3d43d08 100644 --- a/src/csf/sigma_vector.irp.f +++ b/src/csf/sigma_vector.irp.f @@ -445,7 +445,8 @@ end subroutine get_phase_qp_to_cfg print *,"Rowsmax=",rowsmax," Colsmax=",colsmax END_PROVIDER - BEGIN_PROVIDER [ real*8, AIJpqContainer, (NSOMOMin:NSOMOMax,4,NSOMOMax+1,NSOMOMax+1,NBFMax,NBFMax)] + !BEGIN_PROVIDER [ real*8, AIJpqContainer, (NSOMOMin:NSOMOMax,4,NSOMOMax+1,NSOMOMax+1,NBFMax,NBFMax)] + BEGIN_PROVIDER [ real*8, AIJpqContainer, (NBFMax,NBFmax,NSOMOMax+1,NSOMOMax+1,4,NSOMOMin:NSOMOMax)] use cfunctions implicit none BEGIN_DOC @@ -486,7 +487,8 @@ end subroutine get_phase_qp_to_cfg ! Type ! 1. SOMO -> SOMO !print *,"Doing SOMO -> SOMO" - AIJpqContainer(NSOMOMin,1,1,1,1,1) = 1.0d0 + !AIJpqContainer(NSOMOMin,1,1,1,1,1) = 1.0d0 + AIJpqContainer(1,1,1,1,1,NSOMOMin) = 1.0d0 do i = NSOMOMin+2, NSOMOMax, 2 Isomo = ISHFT(1_8,i)-1 do j = i-2,i-2, 2 @@ -512,7 +514,8 @@ end subroutine get_phase_qp_to_cfg !call debug_spindet(Jsomo,1) !call debug_spindet(Isomo,1) - AIJpqContainer(nsomoi,1,k,l,:,:) = 0.0d0 + !AIJpqContainer(nsomoi,1,k,l,:,:) = 0.0d0 + AIJpqContainer(:,:,k,l,1,nsomoi) = 0.0d0 call getApqIJMatrixDims(Isomo, & Jsomo, & MS, & @@ -538,7 +541,8 @@ end subroutine get_phase_qp_to_cfg ! i -> j do ri = 1,rows do ci = 1,cols - AIJpqContainer(nsomoi,1,k,l,ri,ci) = meMatrix(ri, ci) + !AIJpqContainer(nsomoi,1,k,l,ri,ci) = meMatrix(ri, ci) + AIJpqContainer(ri,ci,k,l,1,nsomoi) = meMatrix(ri, ci) end do end do deallocate(meMatrix) @@ -549,7 +553,8 @@ end subroutine get_phase_qp_to_cfg ! Type ! 2. DOMO -> VMO !print *,"Doing DOMO -> VMO" - AIJpqContainer(NSOMOMin,2,1,1,1,1) = 1.0d0 + !AIJpqContainer(NSOMOMin,2,1,1,1,1) = 1.0d0 + AIJpqContainer(1,1,1,1,2,NSOMOMin) = 1.0d0 do i = NSOMOMin, NSOMOMax, 2 Isomo = ISHFT(1_8,i)-1 tmpsomo = ISHFT(1_8,i+2)-1 @@ -579,7 +584,8 @@ end subroutine get_phase_qp_to_cfg !call debug_spindet(Jsomo,1) !call debug_spindet(Isomo,1) - AIJpqContainer(nsomoi,2,k,l,:,:) = 0.0d0 + !AIJpqContainer(nsomoi,2,k,l,:,:) = 0.0d0 + AIJpqContainer(:,:,k,l,2,nsomoi) = 0.0d0 call getApqIJMatrixDims(Isomo, & Jsomo, & MS, & @@ -605,7 +611,8 @@ end subroutine get_phase_qp_to_cfg ! i -> j do ri = 1,rows do ci = 1,cols - AIJpqContainer(nsomoi,2,k,l,ri,ci) = meMatrix(ri, ci) + !AIJpqContainer(nsomoi,2,k,l,ri,ci) = meMatrix(ri, ci) + AIJpqContainer(ri,ci,k,l,2,nsomoi) = meMatrix(ri, ci) end do end do deallocate(meMatrix) @@ -616,7 +623,8 @@ end subroutine get_phase_qp_to_cfg ! Type ! 3. SOMO -> VMO !print *,"Doing SOMO -> VMO" - AIJpqContainer(NSOMOMin,3,1,1,1,1) = 1.0d0 + !AIJpqContainer(NSOMOMin,3,1,1,1,1) = 1.0d0 + AIJpqContainer(1,1,1,1,3,NSOMOMin) = 1.0d0 do i = NSOMOMin, NSOMOMax, 2 Isomo = ISHFT(1_8,i)-1 do j = i,i, 2 @@ -638,7 +646,8 @@ end subroutine get_phase_qp_to_cfg !call debug_spindet(Jsomo,1) !call debug_spindet(Isomo,1) - AIJpqContainer(i,3,k,l,:,:) = 0.0d0 + !AIJpqContainer(i,3,k,l,:,:) = 0.0d0 + AIJpqContainer(:,:,k,l,3,i) = 0.0d0 call getApqIJMatrixDims(Isomo, & Jsomo, & MS, & @@ -664,7 +673,8 @@ end subroutine get_phase_qp_to_cfg ! i -> j do ri = 1,rows do ci = 1,cols - AIJpqContainer(i,3,k,l,ri,ci) = meMatrix(ri, ci) + !AIJpqContainer(i,3,k,l,ri,ci) = meMatrix(ri, ci) + AIJpqContainer(ri,ci,k,l,3,i) = meMatrix(ri, ci) end do end do deallocate(meMatrix) @@ -675,7 +685,8 @@ end subroutine get_phase_qp_to_cfg ! Type ! 4. DOMO -> SOMO !print *,"Doing DOMO -> SOMO" - AIJpqContainer(NSOMOMin,4,1,1,1,1) = 1.0d0 + !AIJpqContainer(NSOMOMin,4,1,1,1,1) = 1.0d0 + AIJpqContainer(1,1,1,1,4,NSOMOMin) = 1.0d0 do i = NSOMOMin+2, NSOMOMax, 2 Isomo = ISHFT(1_8,i)-1 do j = i,i, 2 @@ -693,7 +704,8 @@ end subroutine get_phase_qp_to_cfg Jsomo = ISHFT(1_8,j)-1 endif - AIJpqContainer(i,4,k,l,:,:) = 0.0d0 + !AIJpqContainer(i,4,k,l,:,:) = 0.0d0 + AIJpqContainer(:,:,k,l,4,i) = 0.0d0 call getApqIJMatrixDims(Isomo, & Jsomo, & MS, & @@ -720,7 +732,8 @@ end subroutine get_phase_qp_to_cfg ! i -> j do ri = 1,rows do ci = 1,cols - AIJpqContainer(i,4,k,l,ri,ci) = meMatrix(ri, ci) + !AIJpqContainer(i,4,k,l,ri,ci) = meMatrix(ri, ci) + AIJpqContainer(ri,ci,k,l,4,i) = meMatrix(ri, ci) end do end do deallocate(meMatrix) @@ -888,8 +901,8 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze ! returns : psi_coef_out_det : END_DOC integer,intent(in) :: sze, istart,iend, istep, ishift, n_st - real*8,intent(in) :: psi_in(sze,n_st) - real*8,intent(out) :: psi_out(sze,n_st) + real*8,intent(in) :: psi_in(n_st,sze) + real*8,intent(out) :: psi_out(n_st,sze) integer(bit_kind) :: Icfg(N_INT,2) integer :: i,j,k,l,p,q,noccp,noccq, ii, jj, m, n, idxI, kk, nocck,orbk integer(bit_kind),dimension(:,:,:),allocatable :: alphas_Icfg @@ -1054,8 +1067,9 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze cntj = 0 do jj = startj, endj cntj += 1 - meCC1 = AIJpqContainer(NSOMOI,extype,pmodel,qmodel,cnti,cntj) - psi_out(jj,kk) += meCC1 * psi_in(ii,kk) * h_core_ri(p,q) + !meCC1 = AIJpqContainer(NSOMOI,extype,pmodel,qmodel,cnti,cntj) + meCC1 = AIJpqContainer(cnti,cntj,pmodel,qmodel,extype,NSOMOI) + psi_out(kk,jj) += meCC1 * psi_in(kk,ii) * h_core_ri(p,q) enddo enddo enddo @@ -1072,6 +1086,9 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze !!! Double Excitations !!! + call omp_set_max_active_levels(0) + !$OMP parallel + !$OMP master ! Loop over all selected configurations do i = istart_cfg,iend_cfg @@ -1126,11 +1143,11 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze endif enddo - allocate(TKI(rowsTKI,n_st,totcolsTKI)) ! coefficients of CSF + allocate(TKI(n_st,rowsTKI,totcolsTKI)) ! coefficients of CSF ! Initialize the inegral container ! dims : (totcolsTKI, nconnectedI) allocate(GIJpqrs(totcolsTKI,nconnectedI)) ! gpqrs - allocate(TKIGIJ(rowsTKI,n_st,nconnectedI)) ! TKI * gpqrs + allocate(TKIGIJ(n_st,rowsTKI,nconnectedI)) ! TKI * gpqrs totcolsTKI = 0 do j = 1,nconnectedI @@ -1143,16 +1160,18 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze rowsikpq = AIJpqMatrixDimsList(NSOMOalpha,extype,pmodel,qmodel,1) colsikpq = AIJpqMatrixDimsList(NSOMOalpha,extype,pmodel,qmodel,2) allocate(CCmattmp(colsikpq,n_st)) - do kk = 1,n_st + !do kk = 1,n_st + !do m = 1,colsikpq + ! CCmattmp(m,kk) = psi_in(idxs_connectedI_alpha(j)+m-1,kk) + !enddo + !enddo do m = 1,colsikpq - CCmattmp(m,kk) = psi_in(idxs_connectedI_alpha(j)+m-1,kk) - enddo - enddo - do kk = 1,n_st - do m = 1,colsikpq - tmpvar = CCmattmp(m,kk) do l = 1,rowsTKI - TKI(l,kk,totcolsTKI+m) = AIJpqContainer(NSOMOalpha,extype,pmodel,qmodel,l,m) * tmpvar + do kk = 1,n_st + !tmpvar = CCmattmp(m,kk) + !TKI(kk,l,totcolsTKI+m) = AIJpqContainer(NSOMOalpha,extype,pmodel,qmodel,l,m) * tmpvar + !TKI(kk,l,totcolsTKI+m) = AIJpqContainer(l,m,pmodel,qmodel,extype,NSOMOalpha) * tmpvar + TKI(kk,l,totcolsTKI+m) = AIJpqContainer(l,m,pmodel,qmodel,extype,NSOMOalpha) * psi_in(kk,idxs_connectedI_alpha(j)+m-1) enddo enddo enddo @@ -1185,8 +1204,8 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze ! Do big BLAS ! TODO TKI, size(TKI,1)*size(TKI,2) call dgemm('N','N', rowsTKI*n_st, nconnectedI, totcolsTKI, 1.d0, & - TKI, size(TKI,1)*n_st, GIJpqrs, size(GIJpqrs,1), 0.d0, & - TKIGIJ , size(TKIGIJ,1)*n_st ) + TKI, size(TKI,1)*size(TKI,2), GIJpqrs, size(GIJpqrs,1), 0.d0, & + TKIGIJ , size(TKIGIJ,1)*size(TKIGIJ,2) ) !print *,"DIMs = ",rowsTKI,n_st,totcolsTKI,nconnectedI !print *,"TKI mat" @@ -1218,13 +1237,14 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze rowsikpq = AIJpqMatrixDimsList(NSOMOalpha,extype,pmodel,qmodel,1) colsikpq = AIJpqMatrixDimsList(NSOMOalpha,extype,pmodel,qmodel,2) !print *,">j=",j,rowsikpq,colsikpq, ">>",totcolsTKI,",",idxs_connectedI_alpha(j) - do kk = 1,n_st do m = 1,colsikpq - tmpvar = psi_out(idxs_connectedI_alpha(j)+m-1,kk) + !tmpvar = psi_out(kk,idxs_connectedI_alpha(j)+m-1) do l = 1,rowsTKI - tmpvar += AIJpqContainer(NSOMOalpha,extype,pmodel,qmodel,l,m) * TKIGIJ(l,kk,j) + do kk = 1,n_st + !tmpvar += AIJpqContainer(NSOMOalpha,extype,pmodel,qmodel,l,m) * TKIGIJ(kk,l,j) + psi_out(kk,idxs_connectedI_alpha(j)+m-1) += AIJpqContainer(l,m,pmodel,qmodel,extype,NSOMOalpha) * TKIGIJ(kk,l,j) enddo - psi_out(idxs_connectedI_alpha(j)+m-1,kk) = tmpvar + !psi_out(kk,idxs_connectedI_alpha(j)+m-1) = tmpvar enddo enddo totcolsTKI += colsikpq @@ -1238,12 +1258,15 @@ subroutine calculate_sigma_vector_cfg_nst_naive_store(psi_out, psi_in, n_st, sze enddo ! loop over alphas enddo ! loop over I + !$OMP end master + !$OMP end parallel + call omp_set_max_active_levels(4) ! Add the diagonal contribution do kk=1,n_st do i = 1,n_CSF - psi_out(i,kk) += 1.0d0*diag_energies(i)*psi_in(i,kk) + psi_out(kk,i) += diag_energies(i)*psi_in(kk,i) enddo enddo diff --git a/src/davidson/diagonalization_hcsf_dressed.irp.f b/src/davidson/diagonalization_hcsf_dressed.irp.f index ddfce72b..7ee73b0c 100644 --- a/src/davidson/diagonalization_hcsf_dressed.irp.f +++ b/src/davidson/diagonalization_hcsf_dressed.irp.f @@ -314,17 +314,19 @@ subroutine davidson_diag_csf_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,sze_csf,N !call convertWFfromDETtoCSF(N_st_diag,W,W_csf(1,shift+1)) ! call calculate_sigma_vector_cfg_nst(W_csf(1,shift+1),U_csf(1,shift+1),N_st_diag,sze_csf,1,sze_csf,0,1) ! ! TODO : psi_det_size ? for psi_det - allocate(tmpW(sze_csf,N_st_diag)) - allocate(tmpU(sze_csf,N_st_diag)) + !allocate(tmpW(sze_csf,N_st_diag)) + allocate(tmpW(N_st_diag,sze_csf)) + !allocate(tmpU(sze_csf,N_st_diag)) + allocate(tmpU(N_st_diag,sze_csf)) do kk=1,N_st_diag do ii=1,sze_csf - tmpU(ii,kk) = U_csf(ii,shift+kk) + tmpU(kk,ii) = U_csf(ii,shift+kk) enddo enddo call calculate_sigma_vector_cfg_nst_naive_store(tmpW,tmpU,N_st_diag,sze_csf,1,sze_csf,0,1) do kk=1,N_st_diag do ii=1,sze_csf - W_csf(ii,shift+kk)=tmpW(ii,kk) + W_csf(ii,shift+kk)=tmpW(kk,ii) enddo enddo deallocate(tmpW) @@ -339,17 +341,19 @@ subroutine davidson_diag_csf_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,sze_csf,N !call convertWFfromDETtoCSF(N_st_diag,U,U_csf(1,shift+1)) !call convertWFfromDETtoCSF(N_st_diag,W,W_csf(1,shift+1)) ! call calculate_sigma_vector_cfg_nst(W_csf(1,shift+1),U_csf(1,shift+1),N_st_diag,sze_csf,1,sze_csf,0,1) - allocate(tmpW(sze_csf,N_st_diag)) - allocate(tmpU(sze_csf,N_st_diag)) + !allocate(tmpW(sze_csf,N_st_diag)) + allocate(tmpW(N_st_diag,sze_csf)) + !allocate(tmpU(sze_csf,N_st_diag)) + allocate(tmpU(N_st_diag,sze_csf)) do kk=1,N_st_diag do ii=1,sze_csf - tmpU(ii,kk) = U_csf(ii,shift+kk) + tmpU(kk,ii) = U_csf(ii,shift+kk) enddo enddo call calculate_sigma_vector_cfg_nst_naive_store(tmpW,tmpU,N_st_diag,sze_csf,1,sze_csf,0,1) do kk=1,N_st_diag do ii=1,sze_csf - W_csf(ii,shift+kk)=tmpW(ii,kk) + W_csf(ii,shift+kk)=tmpW(kk,ii) enddo enddo deallocate(tmpW)