From 3cab869c2d7cb2e112d18e3612e3a1342f1eb227 Mon Sep 17 00:00:00 2001 From: AbdAmmar Date: Thu, 25 Jan 2024 22:12:26 +0100 Subject: [PATCH 1/4] optim in 1e-Jastrow --- plugins/local/jastrow/EZFIO.cfg | 2 +- plugins/local/non_h_ints_mu/jast_1e.irp.f | 16 +- .../local/non_h_ints_mu/jast_1e_utils.irp.f | 182 ++++++---- .../non_h_ints_mu/print_j1ecoef_info.irp.f | 94 +++++ .../local/non_h_ints_mu/test_non_h_ints.irp.f | 332 +++++++++++++++++- .../grid_becke_vector.irp.f | 9 +- 6 files changed, 557 insertions(+), 78 deletions(-) create mode 100644 plugins/local/non_h_ints_mu/print_j1ecoef_info.irp.f diff --git a/plugins/local/jastrow/EZFIO.cfg b/plugins/local/jastrow/EZFIO.cfg index 0d4141af..c3ed29a3 100644 --- a/plugins/local/jastrow/EZFIO.cfg +++ b/plugins/local/jastrow/EZFIO.cfg @@ -99,7 +99,7 @@ size: (ao_basis.ao_num) type: double precision doc: coefficients of the 1-electron Jastrow in AOsxAOs interface: ezfio -size: (ao_basis.ao_num*ao_basis.ao_num) +size: (ao_basis.ao_num,ao_basis.ao_num) [j1e_coef_ao3] type: double precision diff --git a/plugins/local/non_h_ints_mu/jast_1e.irp.f b/plugins/local/non_h_ints_mu/jast_1e.irp.f index fbd032ed..1fc2fd2b 100644 --- a/plugins/local/non_h_ints_mu/jast_1e.irp.f +++ b/plugins/local/non_h_ints_mu/jast_1e.irp.f @@ -78,7 +78,7 @@ END_PROVIDER double precision :: cx, cy, cz double precision :: time0, time1 double precision, allocatable :: Pa(:,:), Pb(:,:), Pt(:,:) - double precision, allocatable :: coef_fit(:), coef_fit2(:), coef_fit3(:,:) + double precision, allocatable :: coef_fit(:), coef_fit2(:,:), coef_fit3(:,:) PROVIDE j1e_type @@ -243,7 +243,7 @@ END_PROVIDER PROVIDE aos_grad_in_r_array - allocate(coef_fit2(ao_num*ao_num)) + allocate(coef_fit2(ao_num,ao_num)) if(mpi_master) then call ezfio_has_jastrow_j1e_coef_ao2(exists) @@ -254,7 +254,7 @@ END_PROVIDER IRP_ENDIF IRP_IF MPI include 'mpif.h' - call MPI_BCAST(coef_fit2, ao_num*ao_num, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr) + call MPI_BCAST(coef_fit2, (ao_num*ao_num), MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr) if (ierr /= MPI_SUCCESS) then stop 'Unable to read j1e_coef_ao2 with MPI' endif @@ -264,7 +264,7 @@ END_PROVIDER write(6,'(A)') '.. >>>>> [ IO READ: j1e_coef_ao2 ] <<<<< ..' call ezfio_get_jastrow_j1e_coef_ao2(coef_fit2) IRP_IF MPI - call MPI_BCAST(coef_fit2, ao_num*ao_num, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr) + call MPI_BCAST(coef_fit2, (ao_num*ao_num), MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr) if (ierr /= MPI_SUCCESS) then stop 'Unable to read j1e_coef_ao2 with MPI' endif @@ -272,14 +272,14 @@ END_PROVIDER endif else - call get_j1e_coef_fit_ao2(ao_num*ao_num, coef_fit2) + call get_j1e_coef_fit_ao2(ao_num, coef_fit2) call ezfio_set_jastrow_j1e_coef_ao2(coef_fit2) endif !$OMP PARALLEL & !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (i, j, ij, ipoint, c) & + !$OMP PRIVATE (i, j, ipoint, c) & !$OMP SHARED (n_points_final_grid, ao_num, & !$OMP aos_grad_in_r_array, coef_fit2, & !$OMP aos_in_r_array, j1e_gradx, j1e_grady, j1e_gradz) @@ -292,9 +292,7 @@ END_PROVIDER do i = 1, ao_num do j = 1, ao_num - ij = (i-1)*ao_num + j - - c = coef_fit2(ij) + c = coef_fit2(j,i) j1e_gradx(ipoint) += c * (aos_in_r_array(i,ipoint) * aos_grad_in_r_array(j,ipoint,1) + aos_grad_in_r_array(i,ipoint,1) * aos_in_r_array(j,ipoint)) j1e_grady(ipoint) += c * (aos_in_r_array(i,ipoint) * aos_grad_in_r_array(j,ipoint,2) + aos_grad_in_r_array(i,ipoint,2) * aos_in_r_array(j,ipoint)) diff --git a/plugins/local/non_h_ints_mu/jast_1e_utils.irp.f b/plugins/local/non_h_ints_mu/jast_1e_utils.irp.f index 842908a7..90fcb5bb 100644 --- a/plugins/local/non_h_ints_mu/jast_1e_utils.irp.f +++ b/plugins/local/non_h_ints_mu/jast_1e_utils.irp.f @@ -120,15 +120,18 @@ subroutine get_j1e_coef_fit_ao2(dim_fit, coef_fit) implicit none integer , intent(in) :: dim_fit - double precision, intent(out) :: coef_fit(dim_fit) + double precision, intent(out) :: coef_fit(dim_fit,dim_fit) integer :: i, j, k, l, ipoint - integer :: ij, kl + integer :: ij, kl, mn + integer :: info, n_svd, LWORK double precision :: g double precision :: t0, t1 - double precision, allocatable :: A(:,:), b(:), A_inv(:,:) + double precision :: cutoff_svd + double precision, allocatable :: A(:,:,:,:), b(:,:) double precision, allocatable :: Pa(:,:), Pb(:,:), Pt(:,:) - double precision, allocatable :: u1e_tmp(:) + double precision, allocatable :: u1e_tmp(:), tmp(:,:,:) + double precision, allocatable :: U(:,:), D(:), Vt(:,:), work(:) PROVIDE j1e_type @@ -136,6 +139,9 @@ subroutine get_j1e_coef_fit_ao2(dim_fit, coef_fit) PROVIDE elec_alpha_num elec_beta_num elec_num PROVIDE mo_coef + + cutoff_svd = 5d-8 + call wall_time(t0) print*, ' PROVIDING the representation of 1e-Jastrow in AOs x AOs ... ' @@ -169,57 +175,70 @@ subroutine get_j1e_coef_fit_ao2(dim_fit, coef_fit) ! --- --- --- ! get A - allocate(A(ao_num*ao_num,ao_num*ao_num)) + !!$OMP PARALLEL & + !!$OMP DEFAULT (NONE) & + !!$OMP PRIVATE (i, j, k, l, ij, kl, ipoint) & + !!$OMP SHARED (n_points_final_grid, ao_num, & + !!$OMP final_weight_at_r_vector, aos_in_r_array_transp, A) + !!$OMP DO COLLAPSE(2) + !do k = 1, ao_num + ! do l = 1, ao_num + ! kl = (k-1)*ao_num + l + ! do i = 1, ao_num + ! do j = 1, ao_num + ! ij = (i-1)*ao_num + j + ! A(ij,kl) = 0.d0 + ! do ipoint = 1, n_points_final_grid + ! A(ij,kl) += final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j) & + ! * aos_in_r_array_transp(ipoint,k) * aos_in_r_array_transp(ipoint,l) + ! enddo + ! enddo + ! enddo + ! enddo + !enddo + !!$OMP END DO + !!$OMP END PARALLEL - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (i, j, k, l, ij, kl, ipoint) & - !$OMP SHARED (n_points_final_grid, ao_num, & - !$OMP final_weight_at_r_vector, aos_in_r_array_transp, A) + allocate(tmp(ao_num,ao_num,n_points_final_grid)) + allocate(A(ao_num,ao_num,ao_num,ao_num)) + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, j, ipoint) & + !$OMP SHARED (n_points_final_grid, ao_num, final_weight_at_r_vector, aos_in_r_array_transp, tmp) !$OMP DO COLLAPSE(2) - do k = 1, ao_num - do l = 1, ao_num - kl = (k-1)*ao_num + l - - do i = 1, ao_num - do j = 1, ao_num - ij = (i-1)*ao_num + j - - A(ij,kl) = 0.d0 - do ipoint = 1, n_points_final_grid - A(ij,kl) += final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j) & - * aos_in_r_array_transp(ipoint,k) * aos_in_r_array_transp(ipoint,l) - enddo - enddo + do j = 1, ao_num + do i = 1, ao_num + do ipoint = 1, n_points_final_grid + tmp(i,j,ipoint) = dsqrt(final_weight_at_r_vector(ipoint)) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j) enddo enddo enddo !$OMP END DO !$OMP END PARALLEL -! print *, ' A' -! do ij = 1, ao_num*ao_num -! write(*, '(100000(f15.7))') (A(ij,kl), kl = 1, ao_num*ao_num) -! enddo + call dgemm( "N", "T", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 & + , tmp(1,1,1), ao_num*ao_num, tmp(1,1,1), ao_num*ao_num & + , 0.d0, A(1,1,1,1), ao_num*ao_num) + + deallocate(tmp) + ! --- --- --- ! get b - allocate(b(ao_num*ao_num)) + allocate(b(ao_num,ao_num)) - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (i, j, ij, ipoint) & - !$OMP SHARED (n_points_final_grid, ao_num, & - !$OMP final_weight_at_r_vector, aos_in_r_array_transp, u1e_tmp, b) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, j, ipoint) & + !$OMP SHARED (n_points_final_grid, ao_num, final_weight_at_r_vector, aos_in_r_array_transp, u1e_tmp, b) !$OMP DO COLLAPSE(2) do i = 1, ao_num do j = 1, ao_num - ij = (i-1)*ao_num + j - - b(ij) = 0.d0 + b(j,i) = 0.d0 do ipoint = 1, n_points_final_grid - b(ij) = b(ij) + final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j) * u1e_tmp(ipoint) + b(j,i) = b(j,i) + final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j) * u1e_tmp(ipoint) enddo enddo enddo @@ -231,36 +250,69 @@ subroutine get_j1e_coef_fit_ao2(dim_fit, coef_fit) ! --- --- --- ! solve Ax = b - allocate(A_inv(ao_num*ao_num,ao_num*ao_num)) - !call get_inverse(A, ao_num*ao_num, ao_num*ao_num, A_inv, ao_num*ao_num) - call get_pseudo_inverse(A, ao_num*ao_num, ao_num*ao_num, ao_num*ao_num, A_inv, ao_num*ao_num, 5d-8) + !call get_pseudo_inverse(A, ao_num*ao_num, ao_num*ao_num, ao_num*ao_num, A_inv, ao_num*ao_num, cutoff_svd) + + allocate(D(ao_num*ao_num), U(ao_num*ao_num,ao_num*ao_num), Vt(ao_num*ao_num,ao_num*ao_num)) + + allocate(work(1)) + lwork = -1 + call dgesvd( 'S', 'A', ao_num*ao_num, ao_num*ao_num, A(1,1,1,1), ao_num*ao_num & + , D(1), U(1,1), ao_num*ao_num, Vt(1,1), ao_num*ao_num, work, lwork, info) + if(info /= 0) then + print *, info, ': SVD failed' + stop + endif + + LWORK = max(5*ao_num*ao_num, int(WORK(1))) + deallocate(work) + allocate(work(lwork)) + call dgesvd( 'S', 'A', ao_num*ao_num, ao_num*ao_num, A(1,1,1,1), ao_num*ao_num & + , D(1), U(1,1), ao_num*ao_num, Vt(1,1), ao_num*ao_num, work, lwork, info) + if(info /= 0) then + print *, info, ':: SVD failed' + stop 1 + endif + + deallocate(work) + + n_svd = 0 + do ij = 1, ao_num*ao_num + if(D(ij)/D(1) > cutoff_svd) then + D(ij) = 1.d0 / D(ij) + n_svd = n_svd + 1 + else + D(ij) = 0.d0 + endif + enddo + print*, ' n_svd = ', n_svd + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ij, kl) & + !$OMP SHARED (ao_num, n_svd, D, Vt) + !$OMP DO + do kl = 1, ao_num*ao_num + do ij = 1, n_svd + Vt(ij,kl) = Vt(ij,kl) * D(ij) + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + ! A = A_inv + call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_svd, 1.d0 & + , U(1,1), ao_num*ao_num, Vt(1,1), ao_num*ao_num & + , 0.d0, A(1,1,1,1), ao_num*ao_num) + + deallocate(D, U, Vt) + + + ! --- ! coef_fit = A_inv x b - call dgemv("N", ao_num*ao_num, ao_num*ao_num, 1.d0, A_inv, ao_num*ao_num, b, 1, 0.d0, coef_fit, 1) + call dgemv("N", ao_num*ao_num, ao_num*ao_num, 1.d0, A(1,1,1,1), ao_num*ao_num, b(1,1), 1, 0.d0, coef_fit(1,1), 1) - integer :: mn - double precision :: tmp, acc, nrm - - acc = 0.d0 - nrm = 0.d0 - do ij = 1, ao_num*ao_num - tmp = 0.d0 - do kl = 1, ao_num*ao_num - tmp += A(ij,kl) * coef_fit(kl) - enddo - tmp = tmp - b(ij) - if(dabs(tmp) .gt. 1d-7) then - print*, ' problem found in fitting 1e-Jastrow' - print*, ij, tmp - endif - - acc += dabs(tmp) - nrm += dabs(b(ij)) - enddo - print *, ' Relative Error (%) =', 100.d0*acc/nrm - - - deallocate(A, A_inv, b) + deallocate(A, b) call wall_time(t1) print*, ' END after (min) ', (t1-t0)/60.d0 diff --git a/plugins/local/non_h_ints_mu/print_j1ecoef_info.irp.f b/plugins/local/non_h_ints_mu/print_j1ecoef_info.irp.f new file mode 100644 index 00000000..feb2685a --- /dev/null +++ b/plugins/local/non_h_ints_mu/print_j1ecoef_info.irp.f @@ -0,0 +1,94 @@ + +! --- + +program print_j1ecoef_info + + implicit none + + my_grid_becke = .True. + PROVIDE tc_grid1_a tc_grid1_r + my_n_pt_r_grid = tc_grid1_r + my_n_pt_a_grid = tc_grid1_a + touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid + + if(tc_integ_type .eq. "numeric") then + my_extra_grid_becke = .True. + PROVIDE tc_grid2_a tc_grid2_r + my_n_pt_r_extra_grid = tc_grid2_r + my_n_pt_a_extra_grid = tc_grid2_a + touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid + endif + + call print_j1ecoef() + +end + +! --- + +subroutine print_j1ecoef() + + implicit none + integer :: i, j, ij + integer :: ierr + logical :: exists + character(len=10) :: ni, nj + double precision, allocatable :: coef_fit2(:) + + PROVIDE ao_l_char_space + + allocate(coef_fit2(ao_num*ao_num)) + + if(mpi_master) then + call ezfio_has_jastrow_j1e_coef_ao2(exists) + endif + IRP_IF MPI_DEBUG + print *, irp_here, mpi_rank + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + IRP_ENDIF + IRP_IF MPI + include 'mpif.h' + call MPI_BCAST(coef_fit2, ao_num*ao_num, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr) + if (ierr /= MPI_SUCCESS) then + stop 'Unable to read j1e_coef_ao2 with MPI' + endif + IRP_ENDIF + if(exists) then + if(mpi_master) then + write(6,'(A)') '.. >>>>> [ IO READ: j1e_coef_ao2 ] <<<<< ..' + call ezfio_get_jastrow_j1e_coef_ao2(coef_fit2) + IRP_IF MPI + call MPI_BCAST(coef_fit2, ao_num*ao_num, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr) + if (ierr /= MPI_SUCCESS) then + stop 'Unable to read j1e_coef_ao2 with MPI' + endif + IRP_ENDIF + endif + else + + call get_j1e_coef_fit_ao2(ao_num*ao_num, coef_fit2) + call ezfio_set_jastrow_j1e_coef_ao2(coef_fit2) + + endif + + + do i = 1, ao_num + write(ni, '(I0)') ao_l(i)+1 + do j = 1, ao_num + write(nj, '(I0)') ao_l(j)+1 + ij = (i-1)*ao_num + j + print *, trim(adjustl(ni)) // trim(adjustl(ao_l_char_space(i))), " " & + , trim(adjustl(nj)) // trim(adjustl(ao_l_char_space(j))), " " & + , dabs(coef_fit2(ij)) + enddo +! print *, ' ' + enddo + + + deallocate(coef_fit2) + + return +end + +! --- + + diff --git a/plugins/local/non_h_ints_mu/test_non_h_ints.irp.f b/plugins/local/non_h_ints_mu/test_non_h_ints.irp.f index 90e5a7b3..2b96591b 100644 --- a/plugins/local/non_h_ints_mu/test_non_h_ints.irp.f +++ b/plugins/local/non_h_ints_mu/test_non_h_ints.irp.f @@ -39,8 +39,11 @@ program test_non_h !call test_j1e_fit_ao() - call test_tc_grad_and_lapl_ao_new() - call test_tc_grad_square_ao_new() + !call test_tc_grad_and_lapl_ao_new() + !call test_tc_grad_square_ao_new() + + !call test_fit_coef_A1() + call test_fit_coef_inv() end ! --- @@ -1112,3 +1115,328 @@ END_PROVIDER ! --- +subroutine test_fit_coef_A1() + + implicit none + integer :: i, j, k, l, ij, kl, ipoint + double precision :: t1, t2 + double precision :: accu, norm, diff + double precision, allocatable :: A1(:,:) + double precision, allocatable :: A2(:,:,:,:), tmp(:,:,:) + + ! --- + + allocate(A1(ao_num*ao_num,ao_num*ao_num)) + + call wall_time(t1) + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, j, k, l, ij, kl, ipoint) & + !$OMP SHARED (n_points_final_grid, ao_num, & + !$OMP final_weight_at_r_vector, aos_in_r_array_transp, A1) + !$OMP DO COLLAPSE(2) + do k = 1, ao_num + do l = 1, ao_num + kl = (k-1)*ao_num + l + + do i = 1, ao_num + do j = 1, ao_num + ij = (i-1)*ao_num + j + + A1(ij,kl) = 0.d0 + do ipoint = 1, n_points_final_grid + A1(ij,kl) += final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j) & + * aos_in_r_array_transp(ipoint,k) * aos_in_r_array_transp(ipoint,l) + enddo + enddo + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + call wall_time(t2) + print*, ' WALL TIME FOR A1 (min) =', (t2-t1)/60.d0 + + ! --- + + call wall_time(t1) + + allocate(tmp(ao_num,ao_num,n_points_final_grid)) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, j, ipoint) & + !$OMP SHARED (n_points_final_grid, ao_num, final_weight_at_r_vector, aos_in_r_array_transp, tmp) + !$OMP DO COLLAPSE(2) + do j = 1, ao_num + do i = 1, ao_num + do ipoint = 1, n_points_final_grid + tmp(i,j,ipoint) = dsqrt(final_weight_at_r_vector(ipoint)) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + allocate(A2(ao_num,ao_num,ao_num,ao_num)) + + call dgemm( "N", "T", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 & + , tmp(1,1,1), ao_num*ao_num, tmp(1,1,1), ao_num*ao_num & + , 0.d0, A2(1,1,1,1), ao_num*ao_num) + deallocate(tmp) + + call wall_time(t2) + print*, ' WALL TIME FOR A2 (min) =', (t2-t1)/60.d0 + + ! --- + + accu = 0.d0 + norm = 0.d0 + do k = 1, ao_num + do l = 1, ao_num + kl = (k-1)*ao_num + l + + do i = 1, ao_num + do j = 1, ao_num + ij = (i-1)*ao_num + j + + diff = dabs(A2(j,i,l,k) - A1(ij,kl)) + if(diff .gt. 1d-10) then + print *, ' problem in A2 on:', i, i, l, k + print *, ' A1 :', A1(ij,kl) + print *, ' A2 :', A2(j,i,l,k) + stop + endif + + accu += diff + norm += dabs(A1(ij,kl)) + enddo + enddo + enddo + enddo + + deallocate(A1, A2) + + print*, ' accuracy (%) = ', 100.d0 * accu / norm + + return +end + +! --- + +subroutine test_fit_coef_inv() + + implicit none + integer :: i, j, k, l, ij, kl, ipoint + integer :: n_svd, info, lwork, mn + double precision :: t1, t2 + double precision :: accu, norm, diff + double precision :: cutoff_svd + double precision, allocatable :: A1(:,:), A1_inv(:,:) + double precision, allocatable :: A2(:,:,:,:), tmp(:,:,:), A2_inv(:,:,:,:) + double precision, allocatable :: U(:,:), D(:), Vt(:,:), work(:), A2_tmp(:,:,:,:) + + + cutoff_svd = 5d-8 + + ! --- + + call wall_time(t1) + + allocate(A1(ao_num*ao_num,ao_num*ao_num)) + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, j, k, l, ij, kl, ipoint) & + !$OMP SHARED (n_points_final_grid, ao_num, & + !$OMP final_weight_at_r_vector, aos_in_r_array_transp, A1) + !$OMP DO COLLAPSE(2) + do k = 1, ao_num + do l = 1, ao_num + kl = (k-1)*ao_num + l + + do i = 1, ao_num + do j = 1, ao_num + ij = (i-1)*ao_num + j + + A1(ij,kl) = 0.d0 + do ipoint = 1, n_points_final_grid + A1(ij,kl) += final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j) & + * aos_in_r_array_transp(ipoint,k) * aos_in_r_array_transp(ipoint,l) + enddo + enddo + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + call wall_time(t2) + print*, ' WALL TIME FOR A1 (min) =', (t2-t1)/60.d0 + + allocate(A1_inv(ao_num*ao_num,ao_num*ao_num)) + call get_pseudo_inverse(A1, ao_num*ao_num, ao_num*ao_num, ao_num*ao_num, A1_inv, ao_num*ao_num, cutoff_svd) + + call wall_time(t1) + print*, ' WALL TIME FOR A1_inv (min) =', (t1-t2)/60.d0 + + ! --- + + call wall_time(t1) + + allocate(tmp(ao_num,ao_num,n_points_final_grid)) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, j, ipoint) & + !$OMP SHARED (n_points_final_grid, ao_num, final_weight_at_r_vector, aos_in_r_array_transp, tmp) + !$OMP DO COLLAPSE(2) + do j = 1, ao_num + do i = 1, ao_num + do ipoint = 1, n_points_final_grid + tmp(i,j,ipoint) = dsqrt(final_weight_at_r_vector(ipoint)) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + allocate(A2(ao_num,ao_num,ao_num,ao_num)) + + call dgemm( "N", "T", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 & + , tmp(1,1,1), ao_num*ao_num, tmp(1,1,1), ao_num*ao_num & + , 0.d0, A2(1,1,1,1), ao_num*ao_num) + + deallocate(tmp) + + call wall_time(t2) + print*, ' WALL TIME FOR A2 (min) =', (t2-t1)/60.d0 + + allocate(A2_tmp(ao_num,ao_num,ao_num,ao_num)) + A2_tmp = A2 + + allocate(A2_inv(ao_num,ao_num,ao_num,ao_num)) + + allocate(D(ao_num*ao_num), U(ao_num*ao_num,ao_num*ao_num), Vt(ao_num*ao_num,ao_num*ao_num)) + + allocate(work(1)) + lwork = -1 + + call dgesvd( 'S', 'A', ao_num*ao_num, ao_num*ao_num, A2_tmp(1,1,1,1), ao_num*ao_num & + , D(1), U(1,1), ao_num*ao_num, Vt(1,1), ao_num*ao_num, work, lwork, info) + if(info /= 0) then + print *, info, ': SVD failed' + stop + endif + + LWORK = max(5*ao_num*ao_num, int(WORK(1))) + deallocate(work) + allocate(work(lwork)) + + call dgesvd( 'S', 'A', ao_num*ao_num, ao_num*ao_num, A2_tmp(1,1,1,1), ao_num*ao_num & + , D(1), U(1,1), ao_num*ao_num, Vt(1,1), ao_num*ao_num, work, lwork, info) + if(info /= 0) then + print *, info, ':: SVD failed' + stop 1 + endif + + deallocate(A2_tmp) + deallocate(work) + + n_svd = 0 + do ij = 1, ao_num*ao_num + if(D(ij)/D(1) > cutoff_svd) then + D(ij) = 1.d0 / D(ij) + n_svd = n_svd + 1 + else + D(ij) = 0.d0 + endif + enddo + print*, ' n_svd = ', n_svd + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ij, kl) & + !$OMP SHARED (ao_num, n_svd, D, Vt) + !$OMP DO + do kl = 1, ao_num*ao_num + do ij = 1, n_svd + Vt(ij,kl) = Vt(ij,kl) * D(ij) + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_svd, 1.d0 & + , U(1,1), ao_num*ao_num, Vt(1,1), ao_num*ao_num & + , 0.d0, A2_inv(1,1,1,1), ao_num*ao_num) + + deallocate(D, U, Vt) + + call wall_time(t1) + print*, ' WALL TIME FOR A2_inv (min) =', (t1-t2)/60.d0 + + ! --- + + accu = 0.d0 + norm = 0.d0 + do k = 1, ao_num + do l = 1, ao_num + kl = (k-1)*ao_num + l + + do i = 1, ao_num + do j = 1, ao_num + ij = (i-1)*ao_num + j + + diff = dabs(A2(j,i,l,k) - A1(ij,kl)) + if(diff .gt. 1d-10) then + print *, ' problem in A2 on:', i, i, l, k + print *, ' A1 :', A1(ij,kl) + print *, ' A2 :', A2(j,i,l,k) + stop + endif + + accu += diff + norm += dabs(A1(ij,kl)) + enddo + enddo + enddo + enddo + + print*, ' accuracy on A (%) = ', 100.d0 * accu / norm + + accu = 0.d0 + norm = 0.d0 + do k = 1, ao_num + do l = 1, ao_num + kl = (k-1)*ao_num + l + + do i = 1, ao_num + do j = 1, ao_num + ij = (i-1)*ao_num + j + + diff = dabs(A2_inv(j,i,l,k) - A1_inv(ij,kl)) + !if(diff .gt. cutoff_svd) then + ! print *, ' problem in A2_inv on:', i, i, l, k + ! print *, ' A1_inv :', A1_inv(ij,kl) + ! print *, ' A2_inv :', A2_inv(j,i,l,k) + ! stop + !endif + + accu += diff + norm += dabs(A1_inv(ij,kl)) + enddo + enddo + enddo + enddo + + deallocate(A1_inv, A2_inv) + deallocate(A1, A2) + + print*, ' accuracy on A_inv (%) = ', 100.d0 * accu / norm + + return +end + +! --- + diff --git a/src/becke_numerical_grid/grid_becke_vector.irp.f b/src/becke_numerical_grid/grid_becke_vector.irp.f index 0386f3c6..473096d0 100644 --- a/src/becke_numerical_grid/grid_becke_vector.irp.f +++ b/src/becke_numerical_grid/grid_becke_vector.irp.f @@ -55,7 +55,7 @@ END_PROVIDER do j = 1, nucl_num do i = 1, n_points_radial_grid -1 do k = 1, n_points_integration_angular - if(dabs(final_weight_at_r(k,i,j)) < thresh_grid)then + if(dabs(final_weight_at_r(k,i,j)) < thresh_grid) then cycle endif i_count += 1 @@ -67,6 +67,13 @@ END_PROVIDER index_final_points(2,i_count) = i index_final_points(3,i_count) = j index_final_points_reverse(k,i,j) = i_count + + if(final_weight_at_r_vector(i_count) .lt. 0.d0) then + print *, ' !!! WARNING !!!' + print *, ' negative weight !!!!' + print *, i_count, final_weight_at_r_vector(i_count) + stop + endif enddo enddo enddo From 8018440410fac858f9a5ed2fb9f2c4ec4963c4b3 Mon Sep 17 00:00:00 2001 From: AbdAmmar Date: Thu, 25 Jan 2024 22:13:13 +0100 Subject: [PATCH 2/4] OPENMP & DGEMM in pseudo_inv --- src/utils/linear_algebra.irp.f | 57 +++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 18 deletions(-) diff --git a/src/utils/linear_algebra.irp.f b/src/utils/linear_algebra.irp.f index 314ad4f6..a67a219c 100644 --- a/src/utils/linear_algebra.irp.f +++ b/src/utils/linear_algebra.irp.f @@ -1321,19 +1321,22 @@ subroutine get_inverse(A,LDA,m,C,LDC) deallocate(ipiv,work) end -subroutine get_pseudo_inverse(A,LDA,m,n,C,LDC,cutoff) - implicit none +subroutine get_pseudo_inverse(A, LDA, m, n, C, LDC, cutoff) + BEGIN_DOC ! Find C = A^-1 END_DOC - integer, intent(in) :: m,n, LDA, LDC - double precision, intent(in) :: A(LDA,n) - double precision, intent(in) :: cutoff - double precision, intent(out) :: C(LDC,m) - double precision, allocatable :: U(:,:), D(:), Vt(:,:), work(:), A_tmp(:,:) - integer :: info, lwork - integer :: i,j,k + implicit none + integer, intent(in) :: m, n, LDA, LDC + double precision, intent(in) :: A(LDA,n) + double precision, intent(in) :: cutoff + double precision, intent(out) :: C(LDC,m) + + integer :: info, lwork + integer :: i, j, k, n_svd + double precision, allocatable :: U(:,:), D(:), Vt(:,:), work(:), A_tmp(:,:) + allocate (D(n),U(m,n),Vt(n,n),work(1),A_tmp(m,n)) do j=1,n do i=1,m @@ -1355,22 +1358,40 @@ subroutine get_pseudo_inverse(A,LDA,m,n,C,LDC,cutoff) stop 1 endif - do i=1,n - if (D(i)/D(1) > cutoff) then - D(i) = 1.d0/D(i) + n_svd = 0 + do i = 1, n + if(D(i)/D(1) > cutoff) then + D(i) = 1.d0 / D(i) + n_svd = n_svd + 1 else D(i) = 0.d0 endif enddo + print*, ' n_svd = ', n_svd - C = 0.d0 - do i=1,m - do j=1,n - do k=1,n - C(j,i) = C(j,i) + U(i,k) * D(k) * Vt(k,j) - enddo + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, j) & + !$OMP SHARED (n, n_svd, D, Vt) + !$OMP DO + do j = 1, n + do i = 1, n_svd + Vt(i,j) = D(i) * Vt(i,j) enddo enddo + !$OMP END DO + !$OMP END PARALLEL + + call dgemm("N", "N", m, n, n_svd, 1.d0, U, m, Vt, n, 0.d0, C, LDC) + + !C = 0.d0 + !do i=1,m + ! do j=1,n + ! do k=1,n + ! C(j,i) = C(j,i) + U(i,k) * D(k) * Vt(k,j) + ! enddo + ! enddo + !enddo deallocate(U,D,Vt,work,A_tmp) From 0b83c1ab8b34bd303142f0a7352b0775510ee874 Mon Sep 17 00:00:00 2001 From: ydamour Date: Fri, 26 Jan 2024 17:34:16 +0100 Subject: [PATCH 3/4] mkl with gfortran --- config/gfortran_mkl.cfg | 62 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 config/gfortran_mkl.cfg diff --git a/config/gfortran_mkl.cfg b/config/gfortran_mkl.cfg new file mode 100644 index 00000000..f2787d63 --- /dev/null +++ b/config/gfortran_mkl.cfg @@ -0,0 +1,62 @@ +# Common flags +############## +# +# -ffree-line-length-none : Needed for IRPF90 which produces long lines +# -lblas -llapack : Link with libblas and liblapack libraries provided by the system +# -I . : Include the curent directory (Mandatory) +# +# --ninja : Allow the utilisation of ninja. (Mandatory) +# --align=32 : Align all provided arrays on a 32-byte boundary +# +# +[COMMON] +FC : gfortran -ffree-line-length-none -I . -mavx -g -fPIC -std=legacy +LAPACK_LIB : -I${MKLROOT}/include -L${MKLROOT}/lib/intel64 -Wl,--no-as-needed -lmkl_gf_lp64 -lmkl_core -lpthread -lm -ldl -lmkl_gnu_thread -lgomp -fopenmp +IRPF90 : irpf90 +IRPF90_FLAGS : --ninja --align=32 -DSET_NESTED + +# Global options +################ +# +# 1 : Activate +# 0 : Deactivate +# +[OPTION] +MODE : OPT ; [ OPT | PROFILE | DEBUG ] : Chooses the section below +CACHE : 0 ; Enable cache_compile.py +OPENMP : 1 ; Append OpenMP flags + +# Optimization flags +#################### +# +# -Ofast : Disregard strict standards compliance. Enables all -O3 optimizations. +# It also enables optimizations that are not valid +# for all standard-compliant programs. It turns on +# -ffast-math and the Fortran-specific +# -fno-protect-parens and -fstack-arrays. +[OPT] +FCFLAGS : -Ofast -mavx + +# Profiling flags +################# +# +[PROFILE] +FC : -p -g +FCFLAGS : -Ofast + +# Debugging flags +################# +# +# -fcheck=all : Checks uninitialized variables, array subscripts, etc... +# -g : Extra debugging information +# +[DEBUG] +FCFLAGS : -fcheck=all -g + +# OpenMP flags +################# +# +[OPENMP] +FC : -fopenmp +IRPF90_FLAGS : --openmp + From cc334b34b736af8a9ec2aa31a714f8a5d201956f Mon Sep 17 00:00:00 2001 From: AbdAmmar Date: Fri, 26 Jan 2024 19:50:18 +0100 Subject: [PATCH 4/4] opt in 1e-Jast & fixed bug in pseudo_inv --- .../local/non_h_ints_mu/jast_1e_utils.irp.f | 99 ++++++++----------- .../local/non_h_ints_mu/test_non_h_ints.irp.f | 37 ++++--- src/utils/linear_algebra.irp.f | 42 ++++---- 3 files changed, 85 insertions(+), 93 deletions(-) diff --git a/plugins/local/non_h_ints_mu/jast_1e_utils.irp.f b/plugins/local/non_h_ints_mu/jast_1e_utils.irp.f index 90fcb5bb..79f780b1 100644 --- a/plugins/local/non_h_ints_mu/jast_1e_utils.irp.f +++ b/plugins/local/non_h_ints_mu/jast_1e_utils.irp.f @@ -127,8 +127,8 @@ subroutine get_j1e_coef_fit_ao2(dim_fit, coef_fit) integer :: info, n_svd, LWORK double precision :: g double precision :: t0, t1 - double precision :: cutoff_svd - double precision, allocatable :: A(:,:,:,:), b(:,:) + double precision :: cutoff_svd, D1_inv + double precision, allocatable :: A(:,:,:,:), b(:) double precision, allocatable :: Pa(:,:), Pb(:,:), Pt(:,:) double precision, allocatable :: u1e_tmp(:), tmp(:,:,:) double precision, allocatable :: U(:,:), D(:), Vt(:,:), work(:) @@ -140,7 +140,7 @@ subroutine get_j1e_coef_fit_ao2(dim_fit, coef_fit) PROVIDE mo_coef - cutoff_svd = 5d-8 + cutoff_svd = 1d-10 call wall_time(t0) print*, ' PROVIDING the representation of 1e-Jastrow in AOs x AOs ... ' @@ -175,31 +175,7 @@ subroutine get_j1e_coef_fit_ao2(dim_fit, coef_fit) ! --- --- --- ! get A - !!$OMP PARALLEL & - !!$OMP DEFAULT (NONE) & - !!$OMP PRIVATE (i, j, k, l, ij, kl, ipoint) & - !!$OMP SHARED (n_points_final_grid, ao_num, & - !!$OMP final_weight_at_r_vector, aos_in_r_array_transp, A) - !!$OMP DO COLLAPSE(2) - !do k = 1, ao_num - ! do l = 1, ao_num - ! kl = (k-1)*ao_num + l - ! do i = 1, ao_num - ! do j = 1, ao_num - ! ij = (i-1)*ao_num + j - ! A(ij,kl) = 0.d0 - ! do ipoint = 1, n_points_final_grid - ! A(ij,kl) += final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j) & - ! * aos_in_r_array_transp(ipoint,k) * aos_in_r_array_transp(ipoint,l) - ! enddo - ! enddo - ! enddo - ! enddo - !enddo - !!$OMP END DO - !!$OMP END PARALLEL - - allocate(tmp(ao_num,ao_num,n_points_final_grid)) + allocate(tmp(n_points_final_grid,ao_num,ao_num)) allocate(A(ao_num,ao_num,ao_num,ao_num)) !$OMP PARALLEL & @@ -210,47 +186,41 @@ subroutine get_j1e_coef_fit_ao2(dim_fit, coef_fit) do j = 1, ao_num do i = 1, ao_num do ipoint = 1, n_points_final_grid - tmp(i,j,ipoint) = dsqrt(final_weight_at_r_vector(ipoint)) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j) + tmp(ipoint,i,j) = dsqrt(final_weight_at_r_vector(ipoint)) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j) enddo enddo enddo !$OMP END DO !$OMP END PARALLEL - call dgemm( "N", "T", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 & - , tmp(1,1,1), ao_num*ao_num, tmp(1,1,1), ao_num*ao_num & + call dgemm( "T", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 & + , tmp(1,1,1), n_points_final_grid, tmp(1,1,1), n_points_final_grid & , 0.d0, A(1,1,1,1), ao_num*ao_num) - deallocate(tmp) - - ! --- --- --- ! get b - allocate(b(ao_num,ao_num)) + allocate(b(ao_num*ao_num)) - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (i, j, ipoint) & - !$OMP SHARED (n_points_final_grid, ao_num, final_weight_at_r_vector, aos_in_r_array_transp, u1e_tmp, b) - !$OMP DO COLLAPSE(2) - do i = 1, ao_num - do j = 1, ao_num - b(j,i) = 0.d0 - do ipoint = 1, n_points_final_grid - b(j,i) = b(j,i) + final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j) * u1e_tmp(ipoint) - enddo - enddo + do ipoint = 1, n_points_final_grid + u1e_tmp(ipoint) = dsqrt(final_weight_at_r_vector(ipoint)) * u1e_tmp(ipoint) enddo - !$OMP END DO - !$OMP END PARALLEL + + call dgemv("T", n_points_final_grid, ao_num*ao_num, 1.d0, tmp(1,1,1), n_points_final_grid, u1e_tmp(1), 1, 0.d0, b(1), 1) + !call dgemm( "T", "N", ao_num*ao_num, 1, n_points_final_grid, 1.d0 & + ! , tmp(1,1,1), n_points_final_grid, u1e_tmp(1), n_points_final_grid & + ! , 0.d0, b(1), ao_num*ao_num) deallocate(u1e_tmp) + deallocate(tmp) ! --- --- --- ! solve Ax = b - !call get_pseudo_inverse(A, ao_num*ao_num, ao_num*ao_num, ao_num*ao_num, A_inv, ao_num*ao_num, cutoff_svd) +! double precision, allocatable :: A_inv(:,:,:,:) +! allocate(A_inv(ao_num,ao_num,ao_num,ao_num)) +! call get_pseudo_inverse(A(1,1,1,1), ao_num*ao_num, ao_num*ao_num, ao_num*ao_num, A_inv(1,1,1,1), ao_num*ao_num, cutoff_svd) +! A = A_inv allocate(D(ao_num*ao_num), U(ao_num*ao_num,ao_num*ao_num), Vt(ao_num*ao_num,ao_num*ao_num)) @@ -275,15 +245,21 @@ subroutine get_j1e_coef_fit_ao2(dim_fit, coef_fit) deallocate(work) - n_svd = 0 - do ij = 1, ao_num*ao_num - if(D(ij)/D(1) > cutoff_svd) then - D(ij) = 1.d0 / D(ij) - n_svd = n_svd + 1 - else - D(ij) = 0.d0 - endif - enddo + if(D(1) .lt. 1d-14) then + print*, ' largest singular value is very small:', D(1) + n_svd = 1 + else + n_svd = 0 + D1_inv = 1.d0 / D(1) + do ij = 1, ao_num*ao_num + if(D(ij)*D1_inv > cutoff_svd) then + D(ij) = 1.d0 / D(ij) + n_svd = n_svd + 1 + else + D(ij) = 0.d0 + endif + enddo + endif print*, ' n_svd = ', n_svd !$OMP PARALLEL & @@ -310,7 +286,10 @@ subroutine get_j1e_coef_fit_ao2(dim_fit, coef_fit) ! --- ! coef_fit = A_inv x b - call dgemv("N", ao_num*ao_num, ao_num*ao_num, 1.d0, A(1,1,1,1), ao_num*ao_num, b(1,1), 1, 0.d0, coef_fit(1,1), 1) + call dgemv("N", ao_num*ao_num, ao_num*ao_num, 1.d0, A(1,1,1,1), ao_num*ao_num, b(1), 1, 0.d0, coef_fit(1,1), 1) + !call dgemm( "N", "N", ao_num*ao_num, 1, ao_num*ao_num, 1.d0 & + ! , A(1,1,1,1), ao_num*ao_num, b(1), ao_num*ao_num & + ! , 0.d0, coef_fit(1,1), ao_num*ao_num) deallocate(A, b) diff --git a/plugins/local/non_h_ints_mu/test_non_h_ints.irp.f b/plugins/local/non_h_ints_mu/test_non_h_ints.irp.f index 2b96591b..c3fde334 100644 --- a/plugins/local/non_h_ints_mu/test_non_h_ints.irp.f +++ b/plugins/local/non_h_ints_mu/test_non_h_ints.irp.f @@ -1232,8 +1232,8 @@ subroutine test_fit_coef_inv() integer :: n_svd, info, lwork, mn double precision :: t1, t2 double precision :: accu, norm, diff - double precision :: cutoff_svd - double precision, allocatable :: A1(:,:), A1_inv(:,:) + double precision :: cutoff_svd, D1_inv + double precision, allocatable :: A1(:,:), A1_inv(:,:), A1_tmp(:,:) double precision, allocatable :: A2(:,:,:,:), tmp(:,:,:), A2_inv(:,:,:,:) double precision, allocatable :: U(:,:), D(:), Vt(:,:), work(:), A2_tmp(:,:,:,:) @@ -1285,7 +1285,7 @@ subroutine test_fit_coef_inv() call wall_time(t1) - allocate(tmp(ao_num,ao_num,n_points_final_grid)) + allocate(tmp(n_points_final_grid,ao_num,ao_num)) !$OMP PARALLEL & !$OMP DEFAULT (NONE) & !$OMP PRIVATE (i, j, ipoint) & @@ -1294,7 +1294,7 @@ subroutine test_fit_coef_inv() do j = 1, ao_num do i = 1, ao_num do ipoint = 1, n_points_final_grid - tmp(i,j,ipoint) = dsqrt(final_weight_at_r_vector(ipoint)) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j) + tmp(ipoint,i,j) = dsqrt(final_weight_at_r_vector(ipoint)) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j) enddo enddo enddo @@ -1303,8 +1303,8 @@ subroutine test_fit_coef_inv() allocate(A2(ao_num,ao_num,ao_num,ao_num)) - call dgemm( "N", "T", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 & - , tmp(1,1,1), ao_num*ao_num, tmp(1,1,1), ao_num*ao_num & + call dgemm( "T", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 & + , tmp(1,1,1), n_points_final_grid, tmp(1,1,1), n_points_final_grid & , 0.d0, A2(1,1,1,1), ao_num*ao_num) deallocate(tmp) @@ -1312,6 +1312,8 @@ subroutine test_fit_coef_inv() call wall_time(t2) print*, ' WALL TIME FOR A2 (min) =', (t2-t1)/60.d0 + allocate(A1_tmp(ao_num*ao_num,ao_num*ao_num)) + A1_tmp = A1 allocate(A2_tmp(ao_num,ao_num,ao_num,ao_num)) A2_tmp = A2 @@ -1322,7 +1324,8 @@ subroutine test_fit_coef_inv() allocate(work(1)) lwork = -1 - call dgesvd( 'S', 'A', ao_num*ao_num, ao_num*ao_num, A2_tmp(1,1,1,1), ao_num*ao_num & + call dgesvd( 'S', 'A', ao_num*ao_num, ao_num*ao_num, A1_tmp(1,1), ao_num*ao_num & + !call dgesvd( 'S', 'A', ao_num*ao_num, ao_num*ao_num, A2_tmp(1,1,1,1), ao_num*ao_num & , D(1), U(1,1), ao_num*ao_num, Vt(1,1), ao_num*ao_num, work, lwork, info) if(info /= 0) then print *, info, ': SVD failed' @@ -1333,7 +1336,8 @@ subroutine test_fit_coef_inv() deallocate(work) allocate(work(lwork)) - call dgesvd( 'S', 'A', ao_num*ao_num, ao_num*ao_num, A2_tmp(1,1,1,1), ao_num*ao_num & + call dgesvd( 'S', 'A', ao_num*ao_num, ao_num*ao_num, A1_tmp(1,1), ao_num*ao_num & + !call dgesvd( 'S', 'A', ao_num*ao_num, ao_num*ao_num, A2_tmp(1,1,1,1), ao_num*ao_num & , D(1), U(1,1), ao_num*ao_num, Vt(1,1), ao_num*ao_num, work, lwork, info) if(info /= 0) then print *, info, ':: SVD failed' @@ -1343,9 +1347,10 @@ subroutine test_fit_coef_inv() deallocate(A2_tmp) deallocate(work) - n_svd = 0 + n_svd = 0 + D1_inv = 1.d0 / D(1) do ij = 1, ao_num*ao_num - if(D(ij)/D(1) > cutoff_svd) then + if(D(ij)*D1_inv > cutoff_svd) then D(ij) = 1.d0 / D(ij) n_svd = n_svd + 1 else @@ -1416,12 +1421,12 @@ subroutine test_fit_coef_inv() ij = (i-1)*ao_num + j diff = dabs(A2_inv(j,i,l,k) - A1_inv(ij,kl)) - !if(diff .gt. cutoff_svd) then - ! print *, ' problem in A2_inv on:', i, i, l, k - ! print *, ' A1_inv :', A1_inv(ij,kl) - ! print *, ' A2_inv :', A2_inv(j,i,l,k) - ! stop - !endif + if(diff .gt. cutoff_svd) then + print *, ' problem in A2_inv on:', i, i, l, k + print *, ' A1_inv :', A1_inv(ij,kl) + print *, ' A2_inv :', A2_inv(j,i,l,k) + stop + endif accu += diff norm += dabs(A1_inv(ij,kl)) diff --git a/src/utils/linear_algebra.irp.f b/src/utils/linear_algebra.irp.f index a67a219c..c897140e 100644 --- a/src/utils/linear_algebra.irp.f +++ b/src/utils/linear_algebra.irp.f @@ -1335,6 +1335,7 @@ subroutine get_pseudo_inverse(A, LDA, m, n, C, LDC, cutoff) integer :: info, lwork integer :: i, j, k, n_svd + double precision :: D1_inv double precision, allocatable :: U(:,:), D(:), Vt(:,:), work(:), A_tmp(:,:) allocate (D(n),U(m,n),Vt(n,n),work(1),A_tmp(m,n)) @@ -1358,15 +1359,22 @@ subroutine get_pseudo_inverse(A, LDA, m, n, C, LDC, cutoff) stop 1 endif - n_svd = 0 - do i = 1, n - if(D(i)/D(1) > cutoff) then - D(i) = 1.d0 / D(i) - n_svd = n_svd + 1 - else - D(i) = 0.d0 - endif - enddo + if(D(1) .lt. 1d-14) then + print*, ' largest singular value is very small:', D(1) + n_svd = 1 + else + n_svd = 0 + D1_inv = 1.d0 / D(1) + do i = 1, n + if(D(i)*D1_inv > cutoff) then + D(i) = 1.d0 / D(i) + n_svd = n_svd + 1 + else + D(i) = 0.d0 + endif + enddo + endif + print*, ' n_svd = ', n_svd !$OMP PARALLEL & @@ -1384,14 +1392,14 @@ subroutine get_pseudo_inverse(A, LDA, m, n, C, LDC, cutoff) call dgemm("N", "N", m, n, n_svd, 1.d0, U, m, Vt, n, 0.d0, C, LDC) - !C = 0.d0 - !do i=1,m - ! do j=1,n - ! do k=1,n - ! C(j,i) = C(j,i) + U(i,k) * D(k) * Vt(k,j) - ! enddo - ! enddo - !enddo +! C = 0.d0 +! do i=1,m +! do j=1,n +! do k=1,n +! C(j,i) = C(j,i) + U(i,k) * D(k) * Vt(k,j) +! enddo +! enddo +! enddo deallocate(U,D,Vt,work,A_tmp)