From 5ca864b7b08ca29453cb00dc482ffed77e20b840 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Wed, 30 Aug 2023 18:49:09 +0200
Subject: [PATCH] DGEMM for fock_3e_uhf_mo_cs: OK

---
 src/tc_scf/fock_3e_bi_ortho_uhf.irp.f | 364 +++++++++++++++++++++++++-
 src/tc_scf/test_int.irp.f             |  46 +++-
 2 files changed, 402 insertions(+), 8 deletions(-)

diff --git a/src/tc_scf/fock_3e_bi_ortho_uhf.irp.f b/src/tc_scf/fock_3e_bi_ortho_uhf.irp.f
index 3e624941..5d663480 100644
--- a/src/tc_scf/fock_3e_bi_ortho_uhf.irp.f
+++ b/src/tc_scf/fock_3e_bi_ortho_uhf.irp.f
@@ -3,6 +3,356 @@
 
 BEGIN_PROVIDER [double precision, fock_3e_uhf_mo_cs, (mo_num, mo_num)]
 
+  implicit none
+  integer                       :: a, b, i, j, ipoint
+  double precision              :: ti, tf
+  double precision              :: loc_1, loc_2
+  double precision, allocatable :: tmpval_1(:), tmpvec_1(:,:)
+  double precision, allocatable :: tmpval_omp(:), tmpvec_omp(:,:), tmpten_omp(:,:,:)
+  double precision, allocatable :: tmp_1(:,:), tmp_2(:,:,:,:)
+  double precision, allocatable :: tmp_3(:,:,:), tmp_4(:,:,:)
+
+  PROVIDE mo_l_coef mo_r_coef
+
+  print *, ' PROVIDING fock_3e_uhf_mo_cs ...'
+  call wall_time(ti)
+
+  ! ---
+
+  allocate(tmpvec_1(n_points_final_grid,3), tmpval_1(n_points_final_grid))
+  tmpvec_1 = 0.d0
+  tmpval_1 = 0.d0
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (ipoint, i, tmpval_omp, tmpvec_omp)               &
+  !$OMP SHARED (n_points_final_grid, elec_beta_num,               &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         int2_grad1_u12_bimo_t, tmpval_1, tmpvec_1)
+
+  allocate(tmpvec_omp(n_points_final_grid,3), tmpval_omp(n_points_final_grid))
+  tmpvec_omp = 0.d0
+  tmpval_omp = 0.d0
+
+  !$OMP DO
+  do i = 1, elec_beta_num
+    do ipoint = 1, n_points_final_grid
+      tmpvec_omp(ipoint,1) += int2_grad1_u12_bimo_t(ipoint,1,i,i)
+      tmpvec_omp(ipoint,2) += int2_grad1_u12_bimo_t(ipoint,2,i,i)
+      tmpvec_omp(ipoint,3) += int2_grad1_u12_bimo_t(ipoint,3,i,i)
+      tmpval_omp(ipoint)   += mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+    enddo
+  enddo
+  !$OMP END DO NOWAIT
+
+  !$OMP CRITICAL
+  do ipoint = 1, n_points_final_grid
+    tmpvec_1(ipoint,1) += tmpvec_omp(ipoint,1) 
+    tmpvec_1(ipoint,2) += tmpvec_omp(ipoint,2) 
+    tmpvec_1(ipoint,3) += tmpvec_omp(ipoint,3) 
+    tmpval_1(ipoint)   += tmpval_omp(ipoint)   
+  enddo
+  !$OMP END CRITICAL
+
+  deallocate(tmpvec_omp, tmpval_omp)
+
+  !$OMP END PARALLEL
+
+  ! ---
+
+  allocate(tmp_1(n_points_final_grid,5))
+  tmp_1 = 0.d0
+
+  !$OMP PARALLEL                &
+  !$OMP DEFAULT (NONE)          &
+  !$OMP PRIVATE (ipoint, loc_1) &
+  !$OMP SHARED (n_points_final_grid, tmpval_1, tmpvec_1, tmp_1)
+  !$OMP DO
+  do ipoint = 1, n_points_final_grid
+
+    loc_1 = -4.d0 * tmpval_1(ipoint) 
+
+    tmp_1(ipoint,1) = loc_1 * tmpvec_1(ipoint,1)
+    tmp_1(ipoint,2) = loc_1 * tmpvec_1(ipoint,2)
+    tmp_1(ipoint,3) = loc_1 * tmpvec_1(ipoint,3)
+
+    tmp_1(ipoint,4) = -2.d0 * ( tmpvec_1(ipoint,1) * tmpvec_1(ipoint,1) &
+                              + tmpvec_1(ipoint,2) * tmpvec_1(ipoint,2) &
+                              + tmpvec_1(ipoint,3) * tmpvec_1(ipoint,3) )
+
+    tmp_1(ipoint,5) = tmpval_1(ipoint)
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (ipoint, i, j, loc_1, tmpvec_omp)                 &
+  !$OMP SHARED (n_points_final_grid, elec_beta_num,               &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         int2_grad1_u12_bimo_t, tmp_1)
+
+  allocate(tmpvec_omp(n_points_final_grid,4))
+  tmpvec_omp = 0.d0
+
+  !$OMP DO
+  do i = 1, elec_beta_num
+    do j = 1, elec_beta_num
+      do ipoint = 1, n_points_final_grid
+
+        loc_1 = mos_l_in_r_array_transp(ipoint,j) * mos_r_in_r_array_transp(ipoint,i) 
+
+        tmpvec_omp(ipoint,1) += 2.d0 * loc_1 * int2_grad1_u12_bimo_t(ipoint,1,i,j) 
+        tmpvec_omp(ipoint,2) += 2.d0 * loc_1 * int2_grad1_u12_bimo_t(ipoint,2,i,j) 
+        tmpvec_omp(ipoint,3) += 2.d0 * loc_1 * int2_grad1_u12_bimo_t(ipoint,3,i,j) 
+        tmpvec_omp(ipoint,4) += ( int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) &
+                                + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) &
+                                + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i) )
+      enddo
+    enddo
+  enddo
+  !$OMP END DO NOWAIT
+
+  !$OMP CRITICAL
+  do ipoint = 1, n_points_final_grid
+    tmp_1(ipoint,1) += tmpvec_omp(ipoint,1) 
+    tmp_1(ipoint,2) += tmpvec_omp(ipoint,2) 
+    tmp_1(ipoint,3) += tmpvec_omp(ipoint,3) 
+    tmp_1(ipoint,4) += tmpvec_omp(ipoint,4) 
+  enddo
+  !$OMP END CRITICAL
+
+  deallocate(tmpvec_omp)
+  !$OMP END PARALLEL
+
+  ! ---
+
+  allocate(tmp_2(n_points_final_grid,5,mo_num,mo_num))
+  tmp_2 = 0.d0
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (ipoint, a, b)                                    &
+  !$OMP SHARED (n_points_final_grid, mo_num,                      &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+  !$OMP         tmp_2)
+  !$OMP DO
+  do a = 1, mo_num
+    do b = 1, mo_num
+      do ipoint = 1, n_points_final_grid
+        tmp_2(ipoint,1,b,a) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,b,a)
+        tmp_2(ipoint,2,b,a) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,b,a)
+        tmp_2(ipoint,3,b,a) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,b,a)
+        tmp_2(ipoint,4,b,a) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,b) * mos_r_in_r_array_transp(ipoint,a)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  !$OMP PARALLEL                                                 &
+  !$OMP DEFAULT (NONE)                                           &
+  !$OMP PRIVATE (ipoint, a, b, i, tmpten_omp)                    &
+  !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num,      &
+  !$OMP         final_weight_at_r_vector, int2_grad1_u12_bimo_t, &
+  !$OMP         tmp_2)
+
+  allocate(tmpten_omp(n_points_final_grid,mo_num,mo_num))
+  tmpten_omp = 0.d0
+
+  !$OMP DO
+  do a = 1, mo_num
+    do b = 1, mo_num
+      do i = 1, elec_beta_num
+        do ipoint = 1, n_points_final_grid
+          tmpten_omp(ipoint,b,a) += 2.d0 * final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1,b,i) * int2_grad1_u12_bimo_t(ipoint,1,i,a) &
+                                                                              + int2_grad1_u12_bimo_t(ipoint,2,b,i) * int2_grad1_u12_bimo_t(ipoint,2,i,a) &
+                                                                              + int2_grad1_u12_bimo_t(ipoint,3,b,i) * int2_grad1_u12_bimo_t(ipoint,3,i,a) )
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO NOWAIT
+
+  !$OMP CRITICAL
+  do a = 1, mo_num
+    do b = 1, mo_num
+      do ipoint = 1, n_points_final_grid
+        tmp_2(ipoint,5,b,a) += tmpten_omp(ipoint,b,a)
+      enddo
+    enddo
+  enddo
+  !$OMP END CRITICAL
+
+  deallocate(tmpten_omp)
+
+  !$OMP END PARALLEL
+
+  ! ---
+ 
+  call dgemv( 'T', 5*n_points_final_grid, mo_num*mo_num, 1.d0 &
+            , tmp_2(1,1,1,1), size(tmp_2, 1) * size(tmp_2, 2) &
+            , tmp_1(1,1), 1                                   &
+            , 0.d0, fock_3e_uhf_mo_cs(1,1), 1)
+
+  deallocate(tmp_1, tmp_2)
+
+  ! ---
+
+  allocate(tmp_3(n_points_final_grid,7,mo_num), tmp_4(n_points_final_grid,7,mo_num))
+  tmp_3 = 0.d0
+  tmp_4 = 0.d0
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (ipoint, b, loc_1, loc_2)                         &
+  !$OMP SHARED (n_points_final_grid, mo_num,                      &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         final_weight_at_r_vector, tmp_3, tmp_4)
+  !$OMP DO
+  do b = 1, mo_num
+    do ipoint = 1, n_points_final_grid
+
+      loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,b)
+      loc_2 = mos_r_in_r_array_transp(ipoint,b)
+
+      tmp_3(ipoint,2,b) = loc_1
+      tmp_3(ipoint,7,b) = loc_1
+
+      tmp_4(ipoint,1,b) = loc_2
+      tmp_4(ipoint,6,b) = loc_2
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (ipoint, b, i, loc_1, loc_2, tmpten_omp)          &
+  !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num,       &
+  !$OMP         final_weight_at_r_vector, int2_grad1_u12_bimo_t,  &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         tmpvec_1, tmp_3, tmp_4)
+
+  allocate(tmpten_omp(n_points_final_grid,8,mo_num))
+  tmpten_omp = 0.d0
+
+  !$OMP DO
+  do b = 1, mo_num
+    do i = 1, elec_beta_num
+      do ipoint = 1, n_points_final_grid
+
+        loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i)
+        loc_2 = mos_r_in_r_array_transp(ipoint,i)
+
+        tmpten_omp(ipoint,1,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,1,b,i)
+        tmpten_omp(ipoint,2,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,2,b,i)
+        tmpten_omp(ipoint,3,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,3,b,i)
+        tmpten_omp(ipoint,4,b) += 2.d0 * loc_1 * ( tmpvec_1(ipoint,1) * int2_grad1_u12_bimo_t(ipoint,1,b,i) &
+                                                 + tmpvec_1(ipoint,2) * int2_grad1_u12_bimo_t(ipoint,2,b,i) &
+                                                 + tmpvec_1(ipoint,3) * int2_grad1_u12_bimo_t(ipoint,3,b,i) )
+
+        tmpten_omp(ipoint,5,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,1,i,b)
+        tmpten_omp(ipoint,6,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,2,i,b)
+        tmpten_omp(ipoint,7,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,3,i,b)
+        tmpten_omp(ipoint,8,b) += 2.d0 * loc_2 * ( tmpvec_1(ipoint,1) * int2_grad1_u12_bimo_t(ipoint,1,i,b) &
+                                                 + tmpvec_1(ipoint,2) * int2_grad1_u12_bimo_t(ipoint,2,i,b) &
+                                                 + tmpvec_1(ipoint,3) * int2_grad1_u12_bimo_t(ipoint,3,i,b) )
+      enddo
+    enddo
+  enddo
+  !$OMP END DO NOWAIT
+
+  !$OMP CRITICAL
+  do b = 1, mo_num
+    do ipoint = 1, n_points_final_grid
+      tmp_3(ipoint,3,b) += tmpten_omp(ipoint,1,b)
+      tmp_3(ipoint,4,b) += tmpten_omp(ipoint,2,b)
+      tmp_3(ipoint,5,b) += tmpten_omp(ipoint,3,b)
+      tmp_3(ipoint,6,b) += tmpten_omp(ipoint,4,b)
+
+      tmp_4(ipoint,3,b) += tmpten_omp(ipoint,5,b)
+      tmp_4(ipoint,4,b) += tmpten_omp(ipoint,6,b)
+      tmp_4(ipoint,5,b) += tmpten_omp(ipoint,7,b)
+      tmp_4(ipoint,7,b) += tmpten_omp(ipoint,8,b)
+    enddo
+  enddo
+  !$OMP END CRITICAL
+
+  deallocate(tmpten_omp)
+
+  !$OMP END PARALLEL
+
+
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (ipoint, b, i, j, loc_1, loc_2, tmpten_omp)       &
+  !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num,       &
+  !$OMP         final_weight_at_r_vector, int2_grad1_u12_bimo_t,  &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         tmp_3, tmp_4)
+
+  allocate(tmpten_omp(n_points_final_grid,2,mo_num))
+  tmpten_omp = 0.d0
+
+  !$OMP DO
+  do b = 1, mo_num
+    do i = 1, elec_beta_num
+      do j = 1, elec_beta_num
+        do ipoint = 1, n_points_final_grid
+
+          loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,j)
+          loc_2 = mos_r_in_r_array_transp(ipoint,i)
+
+          tmpten_omp(ipoint,1,b) -= loc_1 * ( int2_grad1_u12_bimo_t(ipoint,1,b,i) * int2_grad1_u12_bimo_t(ipoint,1,i,j) &
+                                            + int2_grad1_u12_bimo_t(ipoint,2,b,i) * int2_grad1_u12_bimo_t(ipoint,2,i,j) &
+                                            + int2_grad1_u12_bimo_t(ipoint,3,b,i) * int2_grad1_u12_bimo_t(ipoint,3,i,j) )
+
+          tmpten_omp(ipoint,2,b) -= loc_2 * ( int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,b) &
+                                            + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,b) &
+                                            + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,b) )
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO NOWAIT
+
+  !$OMP CRITICAL
+  do b = 1, mo_num
+    do ipoint = 1, n_points_final_grid
+      tmp_3(ipoint,1,b) += tmpten_omp(ipoint,1,b)
+      tmp_4(ipoint,2,b) += tmpten_omp(ipoint,2,b)
+    enddo
+  enddo
+  !$OMP END CRITICAL
+
+  deallocate(tmpten_omp)
+
+  !$OMP END PARALLEL
+
+  ! ---
+
+  call dgemm( 'T', 'N', mo_num, mo_num, 7*n_points_final_grid, 1.d0 &
+            , tmp_3(1,1,1), 7*n_points_final_grid                   &
+            , tmp_4(1,1,1), 7*n_points_final_grid                   &
+            , 1.d0, fock_3e_uhf_mo_cs(1,1), mo_num)
+
+  deallocate(tmp_3, tmp_4)
+
+  ! ---
+
+  call wall_time(tf)
+  print *, ' total Wall time for fock_3e_uhf_mo_cs =', tf - ti
+
+END_PROVIDER 
+
+! ---
+
+BEGIN_PROVIDER [double precision, fock_3e_uhf_mo_cs_old, (mo_num, mo_num)]
+
   implicit none
   integer                       :: a, b, i, j
   double precision              :: I_bij_aij, I_bij_ija, I_bij_jai, I_bij_aji, I_bij_iaj, I_bij_jia
@@ -12,14 +362,14 @@ BEGIN_PROVIDER [double precision, fock_3e_uhf_mo_cs, (mo_num, mo_num)]
   PROVIDE mo_l_coef mo_r_coef
   call give_integrals_3_body_bi_ort(1, 1, 1, 1, 1, 1, I_bij_aij)
 
-  !print *, ' PROVIDING fock_3e_uhf_mo_cs ...'
-  !call wall_time(ti)
+  print *, ' PROVIDING fock_3e_uhf_mo_cs_old ...'
+  call wall_time(ti)
 
-  fock_3e_uhf_mo_cs = 0.d0
+  fock_3e_uhf_mo_cs_old = 0.d0
 
   !$OMP PARALLEL DEFAULT (NONE)                                                                     &
   !$OMP PRIVATE (a, b, i, j, I_bij_aij, I_bij_ija, I_bij_jai, I_bij_aji, I_bij_iaj, I_bij_jia, tmp) &
-  !$OMP SHARED  (mo_num, elec_beta_num, fock_3e_uhf_mo_cs)
+  !$OMP SHARED  (mo_num, elec_beta_num, fock_3e_uhf_mo_cs_old)
 
   allocate(tmp(mo_num,mo_num))
   tmp = 0.d0
@@ -54,7 +404,7 @@ BEGIN_PROVIDER [double precision, fock_3e_uhf_mo_cs, (mo_num, mo_num)]
   !$OMP CRITICAL
   do a = 1, mo_num
     do b = 1, mo_num
-      fock_3e_uhf_mo_cs(b,a) += tmp(b,a)
+      fock_3e_uhf_mo_cs_old(b,a) += tmp(b,a)
     enddo
   enddo
   !$OMP END CRITICAL
@@ -62,8 +412,8 @@ BEGIN_PROVIDER [double precision, fock_3e_uhf_mo_cs, (mo_num, mo_num)]
   deallocate(tmp)
   !$OMP END PARALLEL
 
-  !call wall_time(tf)
-  !print *, ' total Wall time for fock_3e_uhf_mo_cs =', tf - ti
+  call wall_time(tf)
+  print *, ' total Wall time for fock_3e_uhf_mo_cs_old =', tf - ti
 
 END_PROVIDER 
 
diff --git a/src/tc_scf/test_int.irp.f b/src/tc_scf/test_int.irp.f
index 649d0f3e..d7780497 100644
--- a/src/tc_scf/test_int.irp.f
+++ b/src/tc_scf/test_int.irp.f
@@ -54,7 +54,10 @@ program test_ints
 !!PROVIDE TC_HF_energy VARTC_HF_energy
 !!print *, '    TC_HF_energy = ',    TC_HF_energy
 !!print *, ' VARTC_HF_energy = ', VARTC_HF_energy
- call test_old_ints
+! call test_old_ints
+
+  call test_fock_3e_uhf_mo_cs()
+
 end
 
 ! ---
@@ -1096,3 +1099,44 @@ subroutine test_int2_grad1_u12_ao_test
  print*,'accu_abs   = ',accu_abs/dble(ao_num)**4
  print*,'accu_relat = ',accu_relat/dble(ao_num)**4
 end
+
+! ---
+
+subroutine test_fock_3e_uhf_mo_cs()
+
+  implicit none
+  integer          :: i, j
+  double precision :: I_old, I_new
+  double precision :: diff_tot, diff, thr_ih, norm
+
+  PROVIDE fock_3e_uhf_mo_cs fock_3e_uhf_mo_cs_old
+
+  thr_ih   = 1d-10
+  norm     = 0.d0
+  diff_tot = 0.d0
+
+  do i = 1, mo_num
+    do j = 1, mo_num
+
+      I_old = fock_3e_uhf_mo_cs_old(j,i)
+      I_new = fock_3e_uhf_mo_cs    (j,i)
+
+      diff = dabs(I_old - I_new)
+      if(diff .gt. thr_ih) then
+        print *, ' problem on ', j, i
+        print *, ' old value = ', I_old
+        print *, ' new value = ', I_new
+        stop
+      endif
+
+      norm     += dabs(I_old)
+      diff_tot += diff
+    enddo
+  enddo
+
+  print *, ' diff tot (%) = ', 100.d0 * diff_tot / norm
+
+  return
+end subroutine test_fock_3e_uhf_mo_cs
+
+