Accelerated 4idx transformation

2025-02-05 00:10:44 +01:00 · 2023-02-16 18:34:47 +01:00 · 2023-02-16 18:34:47 +01:00 · 43704f8fc7
commit 43704f8fc7
parent edb1b43563
3 changed files with 108 additions and 19 deletions
--- a/RELEASE_NOTES.org
+++ b/RELEASE_NOTES.org
@ -8,6 +8,7 @@
  - Use OpamPack for OCaml
  - Configure adapted for ARM
  - Added many types of integrals
+  - Accelerated four-index transformation

 *** TODO: take from dev
  - [ ] Added GTOs with complex exponent
--- a/src/mo_two_e_ints/core_quantities.irp.f
+++ b/src/mo_two_e_ints/core_quantities.irp.f
@ -53,7 +53,7 @@ BEGIN_PROVIDER [ double precision, h_core_ri, (mo_num, mo_num) ]
   enddo
   do k=1,mo_num
     do i=1,mo_num
-       h_core_ri(i,j) = h_core_ri(i,j) - 0.5 * big_array_exchange_integrals(k,i,j)
+       h_core_ri(i,j) = h_core_ri(i,j) - 0.5d0 * big_array_exchange_integrals(k,i,j)
     enddo
   enddo
 enddo
--- a/src/mo_two_e_ints/mo_bi_integrals.irp.f
+++ b/src/mo_two_e_ints/mo_bi_integrals.irp.f
@ -53,7 +53,11 @@ BEGIN_PROVIDER [ logical, mo_two_e_integrals_in_map ]
 !    call four_idx_novvvv
    call four_idx_novvvv_old
  else
-    call add_integrals_to_map(full_ijkl_bitmask_4)
+    if (dble(ao_num)**4 * 32.d-9 < dble(qp_max_mem)) then
+      call four_idx_dgemm
+    else
+      call add_integrals_to_map(full_ijkl_bitmask_4)
+    endif
  endif

  call wall_time(wall_2)
@ -77,6 +81,94 @@ BEGIN_PROVIDER [ logical, mo_two_e_integrals_in_map ]

 END_PROVIDER

+subroutine four_idx_dgemm
+  implicit none
+  integer :: p,q,r,s,i,j,k,l
+  double precision, allocatable :: a1(:,:,:,:)
+  double precision, allocatable :: a2(:,:,:,:)
+
+  allocate (a1(ao_num,ao_num,ao_num,ao_num))
+
+  print *, 'Getting AOs'
+  !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q,r,s)
+  do s=1,ao_num
+    do r=1,ao_num
+      do q=1,ao_num
+        call get_ao_two_e_integrals(q,r,s,ao_num,a1(1,q,r,s))
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+  print *, '1st transformation'
+  ! 1st transformation
+  allocate (a2(ao_num,ao_num,ao_num,mo_num))
+  call dgemm('T','N', (ao_num*ao_num*ao_num), mo_num, ao_num, 1.d0, a1, ao_num, mo_coef, ao_num, 0.d0, a2, (ao_num*ao_num*ao_num))
+
+  ! 2nd transformation
+  print *, '2nd transformation'
+  deallocate (a1)
+  allocate (a1(ao_num,ao_num,mo_num,mo_num))
+  call dgemm('T','N', (ao_num*ao_num*mo_num), mo_num, ao_num, 1.d0, a2, ao_num, mo_coef, ao_num, 0.d0, a1, (ao_num*ao_num*mo_num))
+
+  ! 3rd transformation
+  print *, '3rd transformation'
+  deallocate (a2)
+  allocate (a2(ao_num,mo_num,mo_num,mo_num))
+  call dgemm('T','N', (ao_num*mo_num*mo_num), mo_num, ao_num, 1.d0, a1, ao_num, mo_coef, ao_num, 0.d0, a2, (ao_num*mo_num*mo_num))
+
+  ! 4th transformation
+  print *, '4th transformation'
+  deallocate (a1)
+  allocate (a1(mo_num,mo_num,mo_num,mo_num))
+  call dgemm('T','N', (mo_num*mo_num*mo_num), mo_num, ao_num, 1.d0, a2, ao_num, mo_coef, ao_num, 0.d0, a1, (mo_num*mo_num*mo_num))
+
+  deallocate (a2)
+
+  integer :: n_integrals, size_buffer
+  integer(key_kind)  , allocatable :: buffer_i(:)
+  real(integral_kind), allocatable :: buffer_value(:)
+  size_buffer = min(ao_num*ao_num*ao_num,16000000)
+
+  !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,j,k,l,buffer_value,buffer_i,n_integrals)
+  allocate ( buffer_i(size_buffer), buffer_value(size_buffer) )
+
+  n_integrals = 0
+  !$OMP DO
+  do l=1,mo_num
+    do k=1,mo_num
+      do j=1,l
+        do i=1,k
+            if (abs(a1(i,j,k,l)) < mo_integrals_threshold) then
+              cycle
+            endif
+            n_integrals += 1
+            buffer_value(n_integrals) = a1(i,j,k,l)
+            !DIR$ FORCEINLINE
+            call mo_two_e_integrals_index(i,j,k,l,buffer_i(n_integrals))
+            if (n_integrals == size_buffer) then
+              call map_append(mo_integrals_map, buffer_i, buffer_value, n_integrals)
+              n_integrals = 0
+            endif
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+
+  call map_append(mo_integrals_map, buffer_i, buffer_value, n_integrals)
+
+  deallocate(buffer_i, buffer_value)
+  !$OMP END PARALLEL
+
+  deallocate (a1)
+
+  call map_unique(mo_integrals_map)
+
+  integer*8                      :: get_mo_map_size, mo_map_size
+  mo_map_size = get_mo_map_size()
+
+end subroutine

 subroutine add_integrals_to_map(mask_ijkl)
  use bitmasks
@ -153,24 +245,26 @@ subroutine add_integrals_to_map(mask_ijkl)
    return
  endif

-  size_buffer = min(ao_num*ao_num*ao_num,16000000)
+  call wall_time(wall_1)
+
+  size_buffer = min(ao_num*ao_num*ao_num,8000000)
  print*, 'Buffers : ', 8.*(mo_num*(n_j)*(n_k+1) + mo_num+&
      ao_num+ao_num*ao_num+ size_buffer*3)/(1024*1024), 'MB / core'

-  double precision               :: accu_bis
-  accu_bis = 0.d0
-  call wall_time(wall_1)
-
  !$OMP PARALLEL PRIVATE(l1,k1,j1,i1,i2,i3,i4,i,j,k,l,c, ii1,kmax,   &
      !$OMP  two_e_tmp_0_idx, two_e_tmp_0, two_e_tmp_1,two_e_tmp_2,two_e_tmp_3,&
      !$OMP  buffer_i,buffer_value,n_integrals,wall_2,i0,j0,k0,l0,   &
-      !$OMP  wall_0,thread_num,accu_bis)                             &
+      !$OMP  wall_0,thread_num)                             &
      !$OMP  DEFAULT(NONE)                                           &
      !$OMP  SHARED(size_buffer,ao_num,mo_num,n_i,n_j,n_k,n_l,   &
      !$OMP  mo_coef_transp,                                         &
      !$OMP  mo_coef_transp_is_built, list_ijkl,                     &
      !$OMP  mo_coef_is_built, wall_1,                               &
      !$OMP  mo_coef,mo_integrals_threshold,mo_integrals_map)
+
+  thread_num = 0
+  !$  thread_num = omp_get_thread_num()
+
  n_integrals = 0
  wall_0 = wall_1
  allocate(two_e_tmp_3(mo_num, n_j, n_k),                 &
@ -181,8 +275,6 @@ subroutine add_integrals_to_map(mask_ijkl)
      buffer_i(size_buffer),                                         &
      buffer_value(size_buffer) )

-  thread_num = 0
-  !$  thread_num = omp_get_thread_num()
  !$OMP DO SCHEDULE(guided)
  do l1 = 1,ao_num
    two_e_tmp_3 = 0.d0
@ -340,10 +432,10 @@ subroutine add_integrals_to_map(mask_ijkl)
  !$OMP END DO NOWAIT
  deallocate (two_e_tmp_1,two_e_tmp_2,two_e_tmp_3)

-  integer                        :: index_needed
-
-  call insert_into_mo_integrals_map(n_integrals,buffer_i,buffer_value,&
-      real(mo_integrals_threshold,integral_kind))
+  if (n_integrals > 0) then
+    call insert_into_mo_integrals_map(n_integrals,buffer_i,buffer_value,&
+        real(mo_integrals_threshold,integral_kind))
+  endif
  deallocate(buffer_i, buffer_value)
  !$OMP END PARALLEL
  call map_merge(mo_integrals_map)
@ -433,12 +525,10 @@ subroutine add_integrals_to_map_three_indices(mask_ijk)

  call wall_time(wall_1)
  call cpu_time(cpu_1)
-  double precision               :: accu_bis
-  accu_bis = 0.d0
  !$OMP PARALLEL PRIVATE(m,l1,k1,j1,i1,i2,i3,i4,i,j,k,l,c, ii1,kmax, &
      !$OMP  two_e_tmp_0_idx, two_e_tmp_0, two_e_tmp_1,two_e_tmp_2,two_e_tmp_3,&
      !$OMP  buffer_i,buffer_value,n_integrals,wall_2,i0,j0,k0,l0,   &
-      !$OMP  wall_0,thread_num,accu_bis)                             &
+      !$OMP  wall_0,thread_num)                             &
      !$OMP  DEFAULT(NONE)                                           &
      !$OMP  SHARED(size_buffer,ao_num,mo_num,n_i,n_j,n_k,       &
      !$OMP  mo_coef_transp,                                         &
@ -636,8 +726,6 @@ subroutine add_integrals_to_map_three_indices(mask_ijk)
  !$OMP END DO NOWAIT
  deallocate (two_e_tmp_1,two_e_tmp_2,two_e_tmp_3)

-  integer                        :: index_needed
-
  call insert_into_mo_integrals_map(n_integrals,buffer_i,buffer_value,&
      real(mo_integrals_threshold,integral_kind))
  deallocate(buffer_i, buffer_value)