diff --git a/src/GT/GTpp_excitation_density.f90 b/src/GT/GTpp_excitation_density.f90 index a6ed548..05f9c2d 100644 --- a/src/GT/GTpp_excitation_density.f90 +++ b/src/GT/GTpp_excitation_density.f90 @@ -2,6 +2,12 @@ subroutine GTpp_excitation_density(ispin,nBas,nC,nO,nV,nR,nOO,nVV,ERI,X1,Y1,rho1 ! Compute excitation densities for T-matrix self-energy + ! TODO + ! debug DGEMM for nC != 0 + ! and nR != 0 + + + implicit none ! Input variables @@ -33,6 +39,10 @@ subroutine GTpp_excitation_density(ispin,nBas,nC,nO,nV,nR,nOO,nVV,ERI,X1,Y1,rho1 double precision,intent(out) :: rho1(nBas,nBas,nVV) double precision,intent(out) :: rho2(nBas,nBas,nOO) + integer :: dim_1, dim_2 + double precision, allocatable :: ERI_1(:,:,:) + double precision, allocatable :: ERI_2(:,:,:) + ! Initialization rho1(:,:,:) = 0d0 @@ -44,11 +54,10 @@ subroutine GTpp_excitation_density(ispin,nBas,nC,nO,nV,nR,nOO,nVV,ERI,X1,Y1,rho1 if(ispin == 1) then - !$OMP PARALLEL & - !$OMP SHARED(nC,nBas,nR,nO,nVV,nOO,rho1,rho2,ERI,X1,Y1,X2,Y2) & - !$OMP PRIVATE(q,p,ab,cd,kl,ij) & - !$OMP DEFAULT(NONE) - !$OMP DO + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP PRIVATE(p, q, a, b, ab, c, d, cd, i, j, ij, k, l, kl) & + !$OMP SHARED(nC, nBas, nR, nO, rho1, rho2, ERI, X1, Y1, X2, Y2) + !$OMP DO COLLAPSE(2) do q=nC+1,nBas-nR do p=nC+1,nBas-nR @@ -113,8 +122,8 @@ subroutine GTpp_excitation_density(ispin,nBas,nC,nO,nV,nR,nOO,nVV,ERI,X1,Y1,rho1 end do end do - !$OMP END DO - !$OMP END PARALLEL + !$OMP END DO + !$OMP END PARALLEL end if !---------------------------------------------- @@ -123,65 +132,132 @@ subroutine GTpp_excitation_density(ispin,nBas,nC,nO,nV,nR,nOO,nVV,ERI,X1,Y1,rho1 if(ispin == 2 .or. ispin == 4) then - do q=nC+1,nBas-nR - do p=nC+1,nBas-nR - -! do ab=1,nVV - ab = 0 - do a=nO+1,nBas-nR - do b=a+1,nBas-nR - ab = ab + 1 - - cd = 0 - do c=nO+1,nBas-nR - do d=c+1,nBas-nR - cd = cd + 1 - rho1(p,q,ab) = rho1(p,q,ab) & - + (ERI(p,q,c,d) - ERI(p,q,d,c))*X1(cd,ab) - end do - end do - - kl = 0 - do k=nC+1,nO - do l=k+1,nO - kl = kl + 1 - rho1(p,q,ab) = rho1(p,q,ab) & - + (ERI(p,q,k,l) - ERI(p,q,l,k))*Y1(kl,ab) - end do - end do - - end do - end do - -! do ij=1,nOO - ij = 0 - do i=nC+1,nO - do j=i+1,nO - ij = ij + 1 - - cd = 0 - do c=nO+1,nBas-nR - do d=c+1,nBas-nR - cd = cd + 1 - rho2(p,q,ij) = rho2(p,q,ij) & - + (ERI(p,q,c,d) - ERI(p,q,d,c))*X2(cd,ij) - end do - end do - - kl = 0 - do k=nC+1,nO - do l=k+1,nO - kl = kl + 1 - rho2(p,q,ij) = rho2(p,q,ij) & - + (ERI(p,q,k,l) - ERI(p,q,l,k))*Y2(kl,ij) - end do - end do - - end do - end do + dim_1 = (nBas - nO) * (nBas - nO - 1) / 2 + dim_2 = nO * (nO - 1) / 2 - end do - end do + allocate(ERI_1(nBas,nBas,dim_1), ERI_2(nBas,nBas,dim_2)) + ERI_1 = 0.d0 + ERI_2 = 0.d0 + + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP PRIVATE(p, q, c, d, cd, k, l, kl) & + !$OMP SHARED(nC, nBas, nR, nO, ERI_1, ERI_2, ERI) + !$OMP DO COLLAPSE(2) + do q = nC+1, nBas-nR + do p = nC+1, nBas-nR + cd = 0 + do c = nO+1, nBas-nR + do d = c+1, nBas-nR + cd = cd + 1 + ERI_1(p,q,cd) = ERI(p,q,c,d) - ERI(p,q,d,c) + enddo + enddo + kl = 0 + do k = nC+1, nO + do l = k+1, nO + kl = kl + 1 + ERI_2(p,q,kl) = ERI(p,q,k,l) - ERI(p,q,l,k) + end do + end do + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + call dgemm("N", "N", nBas*nBas, dim_1, dim_1, 1.d0, & + ERI_1(1,1,1), nBas*nBas, X1(1,1), dim_1, & + 0.d0, rho1(1,1,1), nBas*nBas) + + call dgemm("N", "N", nBas*nBas, dim_1, dim_2, 1.d0, & + ERI_2(1,1,1), nBas*nBas, Y1(1,1), dim_2, & + 1.d0, rho1(1,1,1), nBas*nBas) + + call dgemm("N", "N", nBas*nBas, dim_2, dim_1, 1.d0, & + ERI_1(1,1,1), nBas*nBas, X2(1,1), dim_1, & + 0.d0, rho2(1,1,1), nBas*nBas) + + call dgemm("N", "N", nBas*nBas, dim_2, dim_2, 1.d0, & + ERI_2(1,1,1), nBas*nBas, Y2(1,1), dim_2, & + 1.d0, rho2(1,1,1), nBas*nBas) + + deallocate(ERI_1, ERI_2) + + +! !$OMP PARALLEL DEFAULT(NONE) & +! !$OMP PRIVATE(p, q, a, b, ab, c, d, cd, i, j, ij, k, l, kl) & +! !$OMP SHARED(nC, nBas, nR, nO, rho1, rho2, ERI, X1, Y1, X2, Y2) +! !$OMP DO COLLAPSE(2) +! do q = nC+1, nBas-nR +! do p = nC+1, nBas-nR +! +! ab = 0 +! +! do a = nO+1, nBas-nR +! do b = a+1, nBas-nR +! +! ab = ab + 1 +! +! cd = 0 +! do c = nO+1, nBas-nR +! do d = c+1, nBas-nR +! +! cd = cd + 1 +! +! rho1(p,q,ab) = rho1(p,q,ab) & +! + (ERI(p,q,c,d) - ERI(p,q,d,c))*X1(cd,ab) +! end do ! d +! end do ! c +! +! kl = 0 +! do k = nC+1, nO +! do l = k+1, nO +! +! kl = kl + 1 +! +! rho1(p,q,ab) = rho1(p,q,ab) & +! + (ERI(p,q,k,l) - ERI(p,q,l,k))*Y1(kl,ab) +! end do ! l +! end do ! k +! +! end do ! b +! end do ! a +! +! ij = 0 +! do i = nC+1, nO +! do j = i+1, nO +! +! ij = ij + 1 +! +! cd = 0 +! +! do c = nO+1, nBas-nR +! do d = c+1, nBas-nR +! +! cd = cd + 1 +! +! rho2(p,q,ij) = rho2(p,q,ij) & +! + (ERI(p,q,c,d) - ERI(p,q,d,c))*X2(cd,ij) +! end do ! d +! end do ! c +! +! kl = 0 +! do k = nC+1, nO +! do l = k+1, nO +! +! kl = kl + 1 +! +! rho2(p,q,ij) = rho2(p,q,ij) & +! + (ERI(p,q,k,l) - ERI(p,q,l,k))*Y2(kl,ij) +! end do ! l +! end do ! k +! +! end do ! j +! end do ! i +! +! end do ! p +! end do ! q +! !$OMP END DO +! !$OMP END PARALLEL end if @@ -190,70 +266,128 @@ subroutine GTpp_excitation_density(ispin,nBas,nC,nO,nV,nR,nOO,nVV,ERI,X1,Y1,rho1 !---------------------------------------------- if(ispin == 3) then - - !$OMP PARALLEL & - !$OMP SHARED(nC,nBas,nR,nO,nVV,nOO,rho1,rho2,ERI,X1,Y1,X2,Y2) & - !$OMP PRIVATE(q,p,ab,cd,kl,ij,c,d,k,l) & - !$OMP DEFAULT(NONE) - !$OMP DO - - do q=nC+1,nBas-nR - do p=nC+1,nBas-nR - - ! do ab=1,nVV - ab = 0 - do a=nO+1,nBas-nR - do b=nO+1,nBas-nR - ab = ab + 1 - - cd = 0 - do c=nO+1,nBas-nR - do d=nO+1,nBas-nR - cd = cd + 1 - rho1(p,q,ab) = rho1(p,q,ab) + ERI(p,q,c,d)*X1(cd,ab) - end do - end do - - kl = 0 - do k=nC+1,nO - do l=nC+1,nO - kl = kl + 1 - rho1(p,q,ab) = rho1(p,q,ab) + ERI(p,q,k,l)*Y1(kl,ab) - end do - end do - - end do - end do - - ! do ij=1,nOO - ij = 0 - do i=nC+1,nO - do j=nC+1,nO - ij = ij + 1 - - cd = 0 - do c=nO+1,nBas-nR - do d=nO+1,nBas-nR - cd = cd + 1 - rho2(p,q,ij) = rho2(p,q,ij) + ERI(p,q,c,d)*X2(cd,ij) - end do - end do - - kl = 0 - do k=nC+1,nO - do l=nC+1,nO - kl = kl + 1 - rho2(p,q,ij) = rho2(p,q,ij) + ERI(p,q,k,l)*Y2(kl,ij) - end do - end do - - end do - end do - + + dim_1 = (nBas - nO) * (nBas - nO) + dim_2 = nO * nO + + allocate(ERI_1(nBas,nBas,dim_1), ERI_2(nBas,nBas,dim_2)) + ERI_1 = 0.d0 + ERI_2 = 0.d0 + + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP PRIVATE(p, q, c, d, cd, k, l, kl) & + !$OMP SHARED(nC, nBas, nR, nO, ERI_1, ERI_2, ERI) + !$OMP DO COLLAPSE(2) + do q = nC+1, nBas-nR + do p = nC+1, nBas-nR + cd = 0 + do c = nO+1, nBas-nR + do d = nO+1, nBas-nR + cd = cd + 1 + ERI_1(p,q,cd) = ERI(p,q,c,d) + enddo + enddo + kl = 0 + do k = nC+1, nO + do l = nC+1, nO + kl = kl + 1 + ERI_2(p,q,kl) = ERI(p,q,k,l) + end do end do - end do - !$OMP END DO - !$OMP END PARALLEL + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + call dgemm("N", "N", nBas*nBas, dim_1, dim_1, 1.d0, & + ERI_1(1,1,1), nBas*nBas, X1(1,1), dim_1, & + 0.d0, rho1(1,1,1), nBas*nBas) + + call dgemm("N", "N", nBas*nBas, dim_1, dim_2, 1.d0, & + ERI_2(1,1,1), nBas*nBas, Y1(1,1), dim_2, & + 1.d0, rho1(1,1,1), nBas*nBas) + + call dgemm("N", "N", nBas*nBas, dim_2, dim_1, 1.d0, & + ERI_1(1,1,1), nBas*nBas, X2(1,1), dim_1, & + 0.d0, rho2(1,1,1), nBas*nBas) + + call dgemm("N", "N", nBas*nBas, dim_2, dim_2, 1.d0, & + ERI_2(1,1,1), nBas*nBas, Y2(1,1), dim_2, & + 1.d0, rho2(1,1,1), nBas*nBas) + + deallocate(ERI_1, ERI_2) + + +! !$OMP PARALLEL DEFAULT(NONE) & +! !$OMP PRIVATE(p, q, a, b, ab, c, d, cd, i, j, ij, k, l, kl) & +! !$OMP SHARED(nC, nBas, nR, nO, rho1, rho2, ERI, X1, Y1, X2, Y2) +! !$OMP DO COLLAPSE(2) +! +! do q = nC+1, nBas-nR +! do p = nC+1, nBas-nR +! +! ab = 0 +! do a = nO+1, nBas-nR +! do b = nO+1, nBas-nR +! +! ab = ab + 1 +! +! cd = 0 +! do c = nO+1, nBas-nR +! do d = nO+1, nBas-nR +! +! cd = cd + 1 +! +! rho1(p,q,ab) = rho1(p,q,ab) + ERI(p,q,c,d)*X1(cd,ab) +! end do +! end do +! +! kl = 0 +! do k = nC+1, nO +! do l = nC+1, nO +! +! kl = kl + 1 +! +! rho1(p,q,ab) = rho1(p,q,ab) + ERI(p,q,k,l)*Y1(kl,ab) +! end do +! end do +! +! end do +! end do +! +! ij = 0 +! do i = nC+1, nO +! do j = nC+1, nO +! +! ij = ij + 1 +! +! cd = 0 +! do c = nO+1, nBas-nR +! do d = nO+1, nBas-nR +! +! cd = cd + 1 +! +! rho2(p,q,ij) = rho2(p,q,ij) + ERI(p,q,c,d)*X2(cd,ij) +! end do +! end do +! +! kl = 0 +! do k = nC+1, nO +! do l = nC+1, nO +! +! kl = kl + 1 +! +! rho2(p,q,ij) = rho2(p,q,ij) + ERI(p,q,k,l)*Y2(kl,ij) +! end do +! end do +! +! end do +! end do +! +! end do +! end do +! !$OMP END DO +! !$OMP END PARALLEL end if diff --git a/src/GT/RG0T0pp.f90 b/src/GT/RG0T0pp.f90 index e624bca..5954adf 100644 --- a/src/GT/RG0T0pp.f90 +++ b/src/GT/RG0T0pp.f90 @@ -64,6 +64,7 @@ subroutine RG0T0pp(dotest,doACFDT,exchange_kernel,doXBS,dophBSE,TDA_T,TDA,dBSE,d double precision,allocatable :: eGT(:) double precision,allocatable :: eGTlin(:) + ! Output variables ! Hello world @@ -122,9 +123,9 @@ subroutine RG0T0pp(dotest,doACFDT,exchange_kernel,doXBS,dophBSE,TDA_T,TDA,dBSE,d allocate(Bpp(nVVs,nOOs),Cpp(nVVs,nVVs),Dpp(nOOs,nOOs)) + call ppLR_C(iblock,nBas,nC,nO,nV,nR,nVVs,1d0,eHF,ERI,Cpp) + call ppLR_D(iblock,nBas,nC,nO,nV,nR,nOOs,1d0,eHF,ERI,Dpp) if(.not.TDA_T) call ppLR_B(iblock,nBas,nC,nO,nV,nR,nOOs,nVVs,1d0,ERI,Bpp) - call ppLR_C(iblock,nBas,nC,nO,nV,nR,nVVs,1d0,eHF,ERI,Cpp) - call ppLR_D(iblock,nBas,nC,nO,nV,nR,nOOs,1d0,eHF,ERI,Dpp) call ppLR(TDA_T,nOOs,nVVs,Bpp,Cpp,Dpp,Om1s,X1s,Y1s,Om2s,X2s,Y2s,EcRPA(ispin)) @@ -145,9 +146,9 @@ subroutine RG0T0pp(dotest,doACFDT,exchange_kernel,doXBS,dophBSE,TDA_T,TDA,dBSE,d allocate(Bpp(nVVt,nOOt),Cpp(nVVt,nVVt),Dpp(nOOt,nOOt)) + call ppLR_C(iblock,nBas,nC,nO,nV,nR,nVVt,1d0,eHF,ERI,Cpp) + call ppLR_D(iblock,nBas,nC,nO,nV,nR,nOOt,1d0,eHF,ERI,Dpp) if(.not.TDA_T) call ppLR_B(iblock,nBas,nC,nO,nV,nR,nOOt,nVVt,1d0,ERI,Bpp) - call ppLR_C(iblock,nBas,nC,nO,nV,nR,nVVt,1d0,eHF,ERI,Cpp) - call ppLR_D(iblock,nBas,nC,nO,nV,nR,nOOt,1d0,eHF,ERI,Dpp) call ppLR(TDA_T,nOOt,nVVt,Bpp,Cpp,Dpp,Om1t,X1t,Y1t,Om2t,X2t,Y2t,EcRPA(ispin)) @@ -162,10 +163,12 @@ subroutine RG0T0pp(dotest,doACFDT,exchange_kernel,doXBS,dophBSE,TDA_T,TDA,dBSE,d ! iblock = 1 iblock = 3 + call GTpp_excitation_density(iblock,nBas,nC,nO,nV,nR,nOOs,nVVs,ERI,X1s,Y1s,rho1s,X2s,Y2s,rho2s) ! iblock = 2 iblock = 4 + call GTpp_excitation_density(iblock,nBas,nC,nO,nV,nR,nOOt,nVVt,ERI,X1t,Y1t,rho1t,X2t,Y2t,rho2t) !---------------------------------------------- @@ -183,7 +186,7 @@ subroutine RG0T0pp(dotest,doACFDT,exchange_kernel,doXBS,dophBSE,TDA_T,TDA,dBSE,d !---------------------------------------------- ! Solve the quasi-particle equation !---------------------------------------------- - + eGTlin(:) = eHF(:) + Z(:)*Sig(:) if(linearize) then @@ -218,9 +221,9 @@ subroutine RG0T0pp(dotest,doACFDT,exchange_kernel,doXBS,dophBSE,TDA_T,TDA,dBSE,d allocate(Bpp(nVVs,nOOs),Cpp(nVVs,nVVs),Dpp(nOOs,nOOs)) + call ppLR_C(iblock,nBas,nC,nO,nV,nR,nVVs,1d0,eGT,ERI,Cpp) + call ppLR_D(iblock,nBas,nC,nO,nV,nR,nOOs,1d0,eGT,ERI,Dpp) if(.not.TDA_T) call ppLR_B(iblock,nBas,nC,nO,nV,nR,nOOs,nVVs,1d0,ERI,Bpp) - call ppLR_C(iblock,nBas,nC,nO,nV,nR,nVVs,1d0,eGT,ERI,Cpp) - call ppLR_D(iblock,nBas,nC,nO,nV,nR,nOOs,1d0,eGT,ERI,Dpp) call ppLR(TDA_T,nOOs,nVVs,Bpp,Cpp,Dpp,Om1s,X1s,Y1s,Om2s,X2s,Y2s,EcRPA(ispin)) @@ -232,9 +235,9 @@ subroutine RG0T0pp(dotest,doACFDT,exchange_kernel,doXBS,dophBSE,TDA_T,TDA,dBSE,d allocate(Bpp(nVVt,nOOt),Cpp(nVVt,nVVt),Dpp(nOOt,nOOt)) + call ppLR_C(iblock,nBas,nC,nO,nV,nR,nVVt,1d0,eGT,ERI,Cpp) + call ppLR_D(iblock,nBas,nC,nO,nV,nR,nOOt,1d0,eGT,ERI,Dpp) if(.not.TDA_T) call ppLR_B(iblock,nBas,nC,nO,nV,nR,nOOt,nVVt,1d0,ERI,Bpp) - call ppLR_C(iblock,nBas,nC,nO,nV,nR,nVVt,1d0,eGT,ERI,Cpp) - call ppLR_D(iblock,nBas,nC,nO,nV,nR,nOOt,1d0,eGT,ERI,Dpp) call ppLR(TDA_T,nOOt,nVVt,Bpp,Cpp,Dpp,Om1t,X1t,Y1t,Om2t,X2t,Y2t,EcRPA(ispin)) diff --git a/src/GW/GW_ppBSE.f90 b/src/GW/GW_ppBSE.f90 index c1d6daa..371836d 100644 --- a/src/GW/GW_ppBSE.f90 +++ b/src/GW/GW_ppBSE.f90 @@ -66,6 +66,7 @@ subroutine GW_ppBSE(TDA_W,TDA,dBSE,dTDA,singlet,triplet,eta,nBas,nC,nO,nV,nR,nS, double precision,intent(out) :: EcBSE(nspin) + !--------------------------------- ! Compute (singlet) RPA screening !--------------------------------- @@ -76,10 +77,12 @@ subroutine GW_ppBSE(TDA_W,TDA,dBSE,dTDA,singlet,triplet,eta,nBas,nC,nO,nV,nR,nS, allocate(OmRPA(nS),XpY_RPA(nS,nS),XmY_RPA(nS,nS),rho_RPA(nBas,nBas,nS), & Aph(nS,nS),Bph(nS,nS)) - call phLR_A(isp_W,dRPA_W,nBas,nC,nO,nV,nR,nS,1d0,eW,ERI,Aph) + call phLR_A(isp_W,dRPA_W,nBas,nC,nO,nV,nR,nS,1d0,eW,ERI,Aph) + if(.not.TDA_W) call phLR_B(isp_W,dRPA_W,nBas,nC,nO,nV,nR,nS,1d0,ERI,Bph) call phLR(TDA_W,nS,Aph,Bph,EcRPA,OmRPA,XpY_RPA,XmY_RPA) + call GW_excitation_density(nBas,nC,nO,nR,nS,ERI,XpY_RPA,rho_RPA) deallocate(XpY_RPA,XmY_RPA,Aph,Bph) @@ -108,13 +111,13 @@ subroutine GW_ppBSE(TDA_W,TDA,dBSE,dTDA,singlet,triplet,eta,nBas,nC,nO,nV,nR,nS, ! Compute BSE excitation energies + call GW_ppBSE_static_kernel_C(ispin,eta,nBas,nC,nO,nV,nR,nS,nVV,1d0,ERI,OmRPA,rho_RPA,KC_sta) + call GW_ppBSE_static_kernel_D(ispin,eta,nBas,nC,nO,nV,nR,nS,nOO,1d0,ERI,OmRPA,rho_RPA,KD_sta) if(.not.TDA) call GW_ppBSE_static_kernel_B(ispin,eta,nBas,nC,nO,nV,nR,nS,nOO,nVV,1d0,ERI,OmRPA,rho_RPA,KB_sta) - call GW_ppBSE_static_kernel_C(ispin,eta,nBas,nC,nO,nV,nR,nS,nVV,1d0,ERI,OmRPA,rho_RPA,KC_sta) - call GW_ppBSE_static_kernel_D(ispin,eta,nBas,nC,nO,nV,nR,nS,nOO,1d0,ERI,OmRPA,rho_RPA,KD_sta) + call ppLR_C(ispin,nBas,nC,nO,nV,nR,nVV,1d0,eGW,ERI,Cpp) + call ppLR_D(ispin,nBas,nC,nO,nV,nR,nOO,1d0,eGW,ERI,Dpp) if(.not.TDA) call ppLR_B(ispin,nBas,nC,nO,nV,nR,nOO,nVV,1d0,ERI,Bpp) - call ppLR_C(ispin,nBas,nC,nO,nV,nR,nVV,1d0,eGW,ERI,Cpp) - call ppLR_D(ispin,nBas,nC,nO,nV,nR,nOO,1d0,eGW,ERI,Dpp) Bpp(:,:) = Bpp(:,:) + KB_sta(:,:) Cpp(:,:) = Cpp(:,:) + KC_sta(:,:) @@ -131,9 +134,9 @@ subroutine GW_ppBSE(TDA_W,TDA,dBSE,dTDA,singlet,triplet,eta,nBas,nC,nO,nV,nR,nS, if(dBSE) & call GW_ppBSE_dynamic_perturbation(ispin,dTDA,eta,nBas,nC,nO,nV,nR,nS,nOO,nVV,eW,eGW,ERI,dipole_int,OmRPA,rho_RPA, & Om1,X1,Y1,Om2,X2,Y2,KB_sta,KC_sta,KD_sta) - write(*,*) "Deallocate not done" + + deallocate(Om1,X1,Y1,Om2,X2,Y2,Bpp,Cpp,Dpp,KB_sta,KC_sta,KD_sta) - write(*,*) "Deallocate done" end if !------------------- @@ -160,14 +163,13 @@ subroutine GW_ppBSE(TDA_W,TDA,dBSE,dTDA,singlet,triplet,eta,nBas,nC,nO,nV,nR,nS, ! Compute BSE excitation energies + call GW_ppBSE_static_kernel_C(ispin,eta,nBas,nC,nO,nV,nR,nS,nVV,1d0,ERI,OmRPA,rho_RPA,KC_sta) + call GW_ppBSE_static_kernel_D(ispin,eta,nBas,nC,nO,nV,nR,nS,nOO,1d0,ERI,OmRPA,rho_RPA,KD_sta) if(.not.TDA) call GW_ppBSE_static_kernel_B(ispin,eta,nBas,nC,nO,nV,nR,nS,nOO,nVV,1d0,ERI,OmRPA,rho_RPA,KB_sta) - call GW_ppBSE_static_kernel_C(ispin,eta,nBas,nC,nO,nV,nR,nS,nVV,1d0,ERI,OmRPA,rho_RPA,KC_sta) - call GW_ppBSE_static_kernel_D(ispin,eta,nBas,nC,nO,nV,nR,nS,nOO,1d0,ERI,OmRPA,rho_RPA,KD_sta) - + call ppLR_C(ispin,nBas,nC,nO,nV,nR,nVV,1d0,eGW,ERI,Cpp) + call ppLR_D(ispin,nBas,nC,nO,nV,nR,nOO,1d0,eGW,ERI,Dpp) if(.not.TDA) call ppLR_B(ispin,nBas,nC,nO,nV,nR,nOO,nVV,1d0,ERI,Bpp) - call ppLR_C(ispin,nBas,nC,nO,nV,nR,nVV,1d0,eGW,ERI,Cpp) - call ppLR_D(ispin,nBas,nC,nO,nV,nR,nOO,1d0,eGW,ERI,Dpp) Bpp(:,:) = Bpp(:,:) + KB_sta(:,:) Cpp(:,:) = Cpp(:,:) + KC_sta(:,:) diff --git a/src/GW/GW_ppBSE_static_kernel_C.f90 b/src/GW/GW_ppBSE_static_kernel_C.f90 index ef21825..eb1e912 100644 --- a/src/GW/GW_ppBSE_static_kernel_C.f90 +++ b/src/GW/GW_ppBSE_static_kernel_C.f90 @@ -26,44 +26,186 @@ subroutine GW_ppBSE_static_kernel_C(ispin,eta,nBas,nC,nO,nV,nR,nS,nVV,lambda,ERI double precision,external :: Kronecker_delta double precision :: chi double precision :: eps + double precision :: tmp_ab, lambda4, eta2 integer :: a,b,c,d,ab,cd,m + integer :: a0, aa + + double precision, allocatable :: Om_tmp(:) + double precision, allocatable :: tmp_m(:,:,:) + double precision, allocatable :: tmp(:,:,:,:) ! Output variables double precision,intent(out) :: KC(nVV,nVV) -! Initialization - - KC(:,:) = 0d0 - !---------------! ! Singlet block ! !---------------! if(ispin == 1) then - ab = 0 - do a=nO+1,nBas-nR - do b=a,nBas-nR - ab = ab + 1 + a0 = nBas - nR - nO + lambda4 = 4.d0 * lambda + eta2 = eta * eta + + allocate(tmp_m(nBas,nBas,nS)) + allocate(tmp(nBas,nBas,nBas,nBas)) + + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP PRIVATE(m, c, a, eps) & + !$OMP SHARED(nS, nBas, eta2, Om, rho, tmp_m) + !$OMP DO + do m = 1, nS + eps = Om(m) / (Om(m)*Om(m) + eta2) + do c = 1, nBas + do a = 1, nBas + tmp_m(a,c,m) = eps * rho(a,c,m) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + call dgemm("N", "T", nBas*nBas, nBas*nBas, nS, 1.d0, & + tmp_m(1,1,1), nBas*nBas, rho(1,1,1), nBas*nBas, & + 0.d0, tmp(1,1,1,1), nBas*nBas) + + deallocate(tmp_m) + + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP PRIVATE(a, b, aa, ab, c, d, cd, tmp_ab) & + !$OMP SHARED(nO, nBas, nR, nS, a0, lambda4, tmp, KC) + !$OMP DO + do a = nO+1, nBas-nR + aa = a0 * (a - nO - 1) - (a - nO - 1) * (a - nO) / 2 - nO + do b = a, nBas-nR + ab = aa + b + + tmp_ab = lambda4 + if(a .eq. b) then + tmp_ab = 0.7071067811865475d0 * lambda4 + endif + cd = 0 - do c=nO+1,nBas-nR - do d=c,nBas-nR + do c = nO+1, nBas-nR + do d = c, nBas-nR cd = cd + 1 - chi = 0d0 - do m=1,nS - eps = Om(m)**2 + eta**2 - chi = chi - rho(a,c,m)*rho(b,d,m)*Om(m)/eps & - - rho(a,d,m)*rho(b,c,m)*Om(m)/eps - end do + KC(ab,cd) = -tmp_ab * (tmp(a,c,b,d) + tmp(a,d,b,c)) + if(c .eq. d) then + KC(ab,cd) = 0.7071067811865475d0 * KC(ab,cd) + endif + enddo + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL - KC(ab,cd) = 4d0*lambda*chi/sqrt((1d0 + Kronecker_delta(a,b))*(1d0 + Kronecker_delta(c,d))) + deallocate(tmp) - end do - end do - end do - end do + +! do a=nO+1,nBas-nR +! do b=a,nBas-nR +! ab = ab + 1 +! cd = 0 +! do c=nO+1,nBas-nR +! do d=c,nBas-nR +! cd = cd + 1 +! +! chi = 0d0 +! do m=1,nS +! eps = Om(m)**2 + eta**2 +! chi = chi - rho(a,c,m)*rho(b,d,m)*Om(m)/eps & +! - rho(a,d,m)*rho(b,c,m)*Om(m)/eps +! end do + + +! --- --- --- +! OpenMP implementation +! --- --- --- +! +! a0 = nBas - nR - nO +! lambda4 = 4.d0 * lambda +! eta2 = eta * eta +! +! allocate(Om_tmp(nS)) +! +! !$OMP PARALLEL DEFAULT(NONE) PRIVATE(m) SHARED(nS, eta2, Om, Om_tmp) +! !$OMP DO +! do m = 1, nS +! Om_tmp(m) = Om(m) / (Om(m)*Om(m) + eta2) +! enddo +! !$OMP END DO +! !$OMP END PARALLEL +! +! !$OMP PARALLEL DEFAULT(NONE) & +! !$OMP PRIVATE(a, b, aa, ab, c, d, cd, m, tmp_ab) & +! !$OMP SHARED(nO, nBas, nR, nS, a0, lambda4, Om_tmp, rho, KC) +! !$OMP DO +! do a = nO+1, nBas-nR +! aa = a0 * (a - nO - 1) - (a - nO - 1) * (a - nO) / 2 - nO +! do b = a, nBas-nR +! ab = aa + b +! +! tmp_ab = lambda4 +! if(a .eq. b) then +! tmp_ab = 0.7071067811865475d0 * lambda4 +! endif +! +! cd = 0 +! do c = nO+1, nBas-nR +! do d = c, nBas-nR +! cd = cd + 1 +! +! KC(ab,cd) = 0d0 +! do m = 1, nS +! KC(ab,cd) = KC(ab,cd) - rho(a,c,m) * rho(b,d,m) * Om_tmp(m) & +! - rho(a,d,m) * rho(b,c,m) * Om_tmp(m) +! end do +! +! KC(ab,cd) = tmp_ab * KC(ab,cd) +! if(c .eq. d) then +! KC(ab,cd) = 0.7071067811865475d0 * KC(ab,cd) +! endif +! enddo +! enddo +! enddo +! enddo +! !$OMP END DO +! !$OMP END PARALLEL +! +! deallocate(Om_tmp) +! --- --- --- + + +! --- --- --- +! Naive implementation +! --- --- --- +! +! ab = 0 +! do a=nO+1,nBas-nR +! do b=a,nBas-nR +! ab = ab + 1 +! cd = 0 +! do c=nO+1,nBas-nR +! do d=c,nBas-nR +! cd = cd + 1 +! +! chi = 0d0 +! do m=1,nS +! eps = Om(m)**2 + eta**2 +! chi = chi - rho(a,c,m)*rho(b,d,m)*Om(m)/eps & +! - rho(a,d,m)*rho(b,c,m)*Om(m)/eps +! end do +! +! KC(ab,cd) = 4d0*lambda*chi/sqrt((1d0 + Kronecker_delta(a,b))*(1d0 + Kronecker_delta(c,d))) +! +! end do +! end do +! end do +! end do +! --- --- --- end if @@ -73,28 +215,136 @@ subroutine GW_ppBSE_static_kernel_C(ispin,eta,nBas,nC,nO,nV,nR,nS,nVV,lambda,ERI if(ispin == 2) then - ab = 0 - do a=nO+1,nBas-nR - do b=a+1,nBas-nR - ab = ab + 1 + a0 = nBas - nR - nO - 1 + lambda4 = 4.d0 * lambda + eta2 = eta * eta + + allocate(tmp_m(nBas,nBas,nS)) + allocate(tmp(nBas,nBas,nBas,nBas)) + + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP PRIVATE(m, c, a, eps) & + !$OMP SHARED(nS, nBas, eta2, Om, rho, tmp_m) + !$OMP DO + do m = 1, nS + eps = Om(m) / (Om(m)*Om(m) + eta2) + do c = 1, nBas + do a = 1, nBas + tmp_m(a,c,m) = eps * rho(a,c,m) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + call dgemm("N", "T", nBas*nBas, nBas*nBas, nS, 1.d0, & + tmp_m(1,1,1), nBas*nBas, rho(1,1,1), nBas*nBas, & + 0.d0, tmp(1,1,1,1), nBas*nBas) + + deallocate(tmp_m) + + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP PRIVATE(a, b, aa, ab, c, d, cd) & + !$OMP SHARED(nO, nBas, nR, nS, a0, lambda4, tmp, KC) + !$OMP DO + do a = nO+1, nBas-nR + aa = a0 * (a - nO - 1) - (a - nO - 1) * (a - nO) / 2 - nO - 1 + do b = a+1, nBas-nR + ab = aa + b + cd = 0 - do c=nO+1,nBas-nR - do d=c+1,nBas-nR + do c = nO+1, nBas-nR + do d = c+1, nBas-nR cd = cd + 1 - chi = 0d0 - do m=1,nS - eps = Om(m)**2 + eta**2 - chi = chi - rho(a,c,m)*rho(b,d,m)*Om(m)/eps & - + rho(a,d,m)*rho(b,c,m)*Om(m)/eps - end do - - KC(ab,cd) = 4d0*lambda*chi + KC(ab,cd) = lambda4 * (-tmp(a,c,b,d) + tmp(a,d,b,c)) + enddo + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL - end do - end do - end do - end do + deallocate(tmp) + + +! --- --- --- +! OpenMP implementation +! --- --- --- +! +! a0 = nBas - nR - nO - 1 +! lambda4 = 4.d0 * lambda +! eta2 = eta * eta +! +! allocate(Om_tmp(nS)) +! +! !$OMP PARALLEL DEFAULT(NONE) PRIVATE(m) SHARED(nS, eta2, Om, Om_tmp) +! !$OMP DO +! do m = 1, nS +! Om_tmp(m) = Om(m) / (Om(m)*Om(m) + eta2) +! enddo +! !$OMP END DO +! !$OMP END PARALLEL +! +! !$OMP PARALLEL DEFAULT(NONE) & +! !$OMP PRIVATE(a, b, aa, ab, c, d, cd, m) & +! !$OMP SHARED(nO, nBas, nR, nS, a0, lambda4, Om_tmp, rho, KC) +! !$OMP DO +! do a = nO+1, nBas-nR +! aa = a0 * (a - nO - 1) - (a - nO - 1) * (a - nO) / 2 - nO - 1 +! do b = a+1, nBas-nR +! ab = aa + b +! +! cd = 0 +! do c = nO+1, nBas-nR +! do d = c+1, nBas-nR +! cd = cd + 1 +! +! KC(ab,cd) = 0d0 +! do m = 1, nS +! KC(ab,cd) = KC(ab,cd) - rho(a,c,m) * rho(b,d,m) * Om_tmp(m) & +! + rho(a,d,m) * rho(b,c,m) * Om_tmp(m) +! end do +! +! KC(ab,cd) = lambda4 * KC(ab,cd) +! enddo +! enddo +! enddo +! enddo +! !$OMP END DO +! !$OMP END PARALLEL +! +! deallocate(Om_tmp) +! --- --- --- + + +! --- --- --- +! Naive implementation +! --- --- --- +! +! ab = 0 +! do a=nO+1,nBas-nR +! do b=a+1,nBas-nR +! ab = ab + 1 +! cd = 0 +! do c=nO+1,nBas-nR +! do d=c+1,nBas-nR +! cd = cd + 1 +! +! chi = 0d0 +! do m=1,nS +! eps = Om(m)**2 + eta**2 +! chi = chi - rho(a,c,m)*rho(b,d,m)*Om(m)/eps & +! + rho(a,d,m)*rho(b,c,m)*Om(m)/eps +! end do +! +! KC(ab,cd) = 4d0*lambda*chi +! +! end do +! end do +! end do +! end do +! --- --- --- end if diff --git a/src/GW/RG0W0.f90 b/src/GW/RG0W0.f90 index fc30b5d..65ffb4d 100644 --- a/src/GW/RG0W0.f90 +++ b/src/GW/RG0W0.f90 @@ -59,6 +59,7 @@ subroutine RG0W0(dotest,doACFDT,exchange_kernel,doXBS,dophBSE,dophBSE2,TDA_W,TDA double precision,allocatable :: eGWlin(:) double precision,allocatable :: eGW(:) + ! Output variables ! Hello world @@ -101,7 +102,8 @@ subroutine RG0W0(dotest,doACFDT,exchange_kernel,doXBS,dophBSE,dophBSE2,TDA_W,TDA ! Compute screening ! !-------------------! - call phLR_A(ispin,dRPA,nBas,nC,nO,nV,nR,nS,1d0,eHF,ERI,Aph) + call phLR_A(ispin,dRPA,nBas,nC,nO,nV,nR,nS,1d0,eHF,ERI,Aph) + if(.not.TDA_W) call phLR_B(ispin,dRPA,nBas,nC,nO,nV,nR,nS,1d0,ERI,Bph) call phLR(TDA_W,nS,Aph,Bph,EcRPA,Om,XpY,XmY) @@ -159,6 +161,7 @@ subroutine RG0W0(dotest,doACFDT,exchange_kernel,doXBS,dophBSE,dophBSE2,TDA_W,TDA ! Compute the RPA correlation energy call phLR_A(ispin,dRPA,nBas,nC,nO,nV,nR,nS,1d0,eGW,ERI,Aph) + if(.not.TDA_W) call phLR_B(ispin,dRPA,nBas,nC,nO,nV,nR,nS,1d0,ERI,Bph) call phLR(TDA_W,nS,Aph,Bph,EcRPA,Om,XpY,XmY) diff --git a/src/LR/ppLR_C.f90 b/src/LR/ppLR_C.f90 index c74b2dd..72f15f0 100644 --- a/src/LR/ppLR_C.f90 +++ b/src/LR/ppLR_C.f90 @@ -23,6 +23,8 @@ subroutine ppLR_C(ispin,nBas,nC,nO,nV,nR,nVV,lambda,e,ERI,Cpp) double precision,external :: Kronecker_delta integer :: a,b,c,d,ab,cd + integer :: a0, aa + double precision :: e_ab, tmp_ab, delta_ac, tmp_cd ! Output variables @@ -37,22 +39,70 @@ subroutine ppLR_C(ispin,nBas,nC,nO,nV,nR,nVV,lambda,e,ERI,Cpp) if(ispin == 1) then - ab = 0 - do a=nO+1,nBas-nR - do b=a,nBas-nR - ab = ab + 1 + a0 = nBas - nR - nO + + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP PRIVATE(a, b, aa, ab, c, d, cd, e_ab, tmp_ab, delta_ac, tmp_cd) & + !$OMP SHARED(nO, nBas, nR, a0, eF, lambda, e, ERI, Cpp) + !$OMP DO + do a = nO+1, nBas-nR + aa = a0 * (a - nO - 1) - (a - nO - 1) * (a - nO) / 2 - nO + do b = a, nBas-nR + ab = aa + b + + e_ab = e(a) + e(b) - eF + + tmp_ab = lambda + if(a .eq. b) then + tmp_ab = 0.7071067811865475d0 * lambda + endif + cd = 0 - do c=nO+1,nBas-nR - do d=c,nBas-nR + do c = nO+1, nBas-nR + + delta_ac = 0.d0 + if(a .eq. c) then + delta_ac = 1.d0 + endif + + do d = c, nBas-nR cd = cd + 1 + + tmp_cd = tmp_ab + if(c .eq. d) then + tmp_cd = 0.7071067811865475d0 * tmp_ab + endif + + Cpp(ab,cd) = 0.d0 + if(b .eq. d) then + Cpp(ab,cd) = e_ab * delta_ac + endif - Cpp(ab,cd) = + (e(a) + e(b) - eF)*Kronecker_delta(a,c)*Kronecker_delta(b,d) & - + lambda*(ERI(a,b,c,d) + ERI(a,b,d,c))/sqrt((1d0 + Kronecker_delta(a,b))*(1d0 + Kronecker_delta(c,d))) - - end do - end do - end do - end do + Cpp(ab,cd) = Cpp(ab,cd) + tmp_cd * (ERI(a,b,c,d) + ERI(a,b,d,c)) + enddo + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + +! ab = 0 +! do a=nO+1,nBas-nR +! do b=a,nBas-nR +! ab = ab + 1 +! cd = 0 +! do c=nO+1,nBas-nR +! do d=c,nBas-nR +! cd = cd + 1 +! +! Cpp(ab,cd) = + (e(a) + e(b) - eF)*Kronecker_delta(a,c)*Kronecker_delta(b,d) & +! + lambda*(ERI(a,b,c,d) + ERI(a,b,d,c))/sqrt((1d0 + Kronecker_delta(a,b))*(1d0 + Kronecker_delta(c,d))) +! +! end do +! end do +! end do +! end do end if diff --git a/src/RPA/sort_ppRPA.f90 b/src/RPA/sort_ppRPA.f90 index e7037ad..8518ecb 100644 --- a/src/RPA/sort_ppRPA.f90 +++ b/src/RPA/sort_ppRPA.f90 @@ -39,9 +39,10 @@ subroutine sort_ppRPA(nOO,nVV,Om,Z,Om1,X1,Y1,Om2,X2,Y2) double precision,intent(out) :: X2(nVV,nOO) double precision,intent(out) :: Y2(nOO,nOO) + ! Memory allocation - allocate(M(nOO+nVV,nOO+nVV),Z1(nOO+nVV,nVV),Z2(nOO+nVV,nOO),order1(nVV),order2(nOO)) + allocate(M(nOO+nVV,nOO+nVV),Z1(nOO+nVV,nVV),Z2(nOO+nVV,nOO),order1(nVV),order2(nOO)) ! Initializatiom @@ -86,7 +87,7 @@ subroutine sort_ppRPA(nOO,nVV,Om,Z,Om1,X1,Y1,Om2,X2,Y2) end if - end do + end do if(minval(Om1) < 0d0 .or. ab /= nVV) call print_warning('You may have instabilities in pp-RPA!!') if(maxval(Om2) > 0d0 .or. ij /= nOO) call print_warning('You may have instabilities in pp-RPA!!') @@ -111,7 +112,8 @@ subroutine sort_ppRPA(nOO,nVV,Om,Z,Om1,X1,Y1,Om2,X2,Y2) call quick_sort(Om2,order2,nOO) call set_order(Z2,order2,nOO+nVV,nOO) - end if + end if + ! Orthogonalize eigenvectors @@ -202,7 +204,6 @@ subroutine sort_ppRPA(nOO,nVV,Om,Z,Om1,X1,Y1,Om2,X2,Y2) if(nVV > 0) call dgemm ('N', 'N', nOO+nVV, nVV, nOO+nVV, 1d0, M, nOO+nVV, Z1, nOO+nVV, 0d0, tmp1, nOO+nVV) if(nVV > 0) call dgemm ('T', 'N', nVV , nVV, nOO+nVV, 1d0, Z1, nOO+nVV, tmp1, nOO+nVV, 0d0, S1, nVV) - if(nOO > 0) call dgemm ('N', 'N', nOO+nVV, nOO, nOO+nVV, 1d0, M, nOO+nVV, -1d0*Z2, nOO+nVV, 0d0, tmp2, nOO+nVV) if(nOO > 0) call dgemm ('T', 'N', nOO , nOO, nOO+nVV, 1d0, Z2, nOO+nVV, tmp2, nOO+nVV, 0d0, S2, nOO) diff --git a/src/make_ninja.py b/src/make_ninja.py index 1ac16ed..1e72639 100755 --- a/src/make_ninja.py +++ b/src/make_ninja.py @@ -76,10 +76,21 @@ STDCXX=-lstdc++ FIX_ORDER_OF_LIBS=-Wl,--start-group """ +compile_olympe = """ +FC = ifort -mkl=parallel -qopenmp +AR = ar crs +FFLAGS = -I$IDIR -Ofast -traceback -xCORE-AVX512 +CC = icc +CXX = icpc +LAPACK= +STDCXX=-lstdc++ +FIX_ORDER_OF_LIBS=-Wl,--start-group +""" if sys.platform in ["linux", "linux2"]: - compiler = compile_gfortran_linux +# compiler = compile_gfortran_linux # compiler = compile_ifort_linux + compiler = compile_olympe elif sys.platform == "darwin": compiler = compile_gfortran_mac else: diff --git a/src/utils/orthogonalization_matrix.f90 b/src/utils/orthogonalization_matrix.f90 index 176862a..d7e0089 100644 --- a/src/utils/orthogonalization_matrix.f90 +++ b/src/utils/orthogonalization_matrix.f90 @@ -54,7 +54,7 @@ subroutine orthogonalization_matrix(nBas,S,X) end do - call ADAt(nBas,Uvec,Uval,X) + call ADAt(nBas, Uvec(1,1), Uval(1), X(1,1)) elseif(ortho_type == 2) then diff --git a/src/utils/utils.f90 b/src/utils/utils.f90 index efd3087..80b8322 100644 --- a/src/utils/utils.f90 +++ b/src/utils/utils.f90 @@ -375,15 +375,29 @@ subroutine ADAt(N,A,D,B) double precision,intent(out) :: B(N,N) - B = 0d0 + double precision, allocatable :: tmp(:,:) - do i=1,N - do j=1,N - do k=1,N - B(i,k) = B(i,k) + A(i,j)*D(j)*A(k,j) - end do - end do - end do + allocate(tmp(N,N)) + !$OMP PARALLEL DEFAULT(NONE) PRIVATE(i, j) SHARED(N, A, D, tmp) + !$OMP DO + do i = 1, N + do j = 1, N + tmp(i,j) = D(i) * A(j,i) + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + call dgemm("N", "N", N, N, N, 1.d0, A(1,1), N, tmp(1,1), N, 0.d0, B(1,1), N) + deallocate(tmp) + +! B = 0d0 +! do i=1,N +! do j=1,N +! do k=1,N +! B(i,k) = B(i,k) + A(i,j)*D(j)*A(k,j) +! end do +! end do +! end do end subroutine !------------------------------------------------------------------------ diff --git a/src/utils/wrap_lapack.f90 b/src/utils/wrap_lapack.f90 index d63b722..86280fe 100644 --- a/src/utils/wrap_lapack.f90 +++ b/src/utils/wrap_lapack.f90 @@ -31,6 +31,8 @@ subroutine diagonalize_general_matrix(N,A,WR,VR) call dgeev('V','V',N,A,N,WR,WI,VL,N,VR,N,work,lwork,info) + deallocate(work, WI, VL) + if(info /= 0) then print*,'Problem in diagonalize_general_matrix (dgeev)!!' end if @@ -67,6 +69,8 @@ subroutine diagonalize_matrix(N,A,e) allocate(work(lwork)) call dsyev('V','U',N,A,N,e,work,lwork,info) + + deallocate(work) if(info /= 0) then print*,'Problem in diagonalize_matrix (dsyev)!!'