diff --git a/RELEASE_NOTES.org b/RELEASE_NOTES.org index 7b9483bf..9b579146 100644 --- a/RELEASE_NOTES.org +++ b/RELEASE_NOTES.org @@ -8,6 +8,7 @@ - Use OpamPack for OCaml - Configure adapted for ARM - Added many types of integrals + - Accelerated four-index transformation *** TODO: take from dev - [ ] Added GTOs with complex exponent diff --git a/src/mo_two_e_ints/core_quantities.irp.f b/src/mo_two_e_ints/core_quantities.irp.f index b764a1a6..3642365e 100644 --- a/src/mo_two_e_ints/core_quantities.irp.f +++ b/src/mo_two_e_ints/core_quantities.irp.f @@ -53,7 +53,7 @@ BEGIN_PROVIDER [ double precision, h_core_ri, (mo_num, mo_num) ] enddo do k=1,mo_num do i=1,mo_num - h_core_ri(i,j) = h_core_ri(i,j) - 0.5 * big_array_exchange_integrals(k,i,j) + h_core_ri(i,j) = h_core_ri(i,j) - 0.5d0 * big_array_exchange_integrals(k,i,j) enddo enddo enddo diff --git a/src/mo_two_e_ints/mo_bi_integrals.irp.f b/src/mo_two_e_ints/mo_bi_integrals.irp.f index d58932ce..ae299e9f 100644 --- a/src/mo_two_e_ints/mo_bi_integrals.irp.f +++ b/src/mo_two_e_ints/mo_bi_integrals.irp.f @@ -53,7 +53,11 @@ BEGIN_PROVIDER [ logical, mo_two_e_integrals_in_map ] ! call four_idx_novvvv call four_idx_novvvv_old else - call add_integrals_to_map(full_ijkl_bitmask_4) + if (dble(ao_num)**4 * 32.d-9 < dble(qp_max_mem)) then + call four_idx_dgemm + else + call add_integrals_to_map(full_ijkl_bitmask_4) + endif endif call wall_time(wall_2) @@ -77,6 +81,94 @@ BEGIN_PROVIDER [ logical, mo_two_e_integrals_in_map ] END_PROVIDER +subroutine four_idx_dgemm + implicit none + integer :: p,q,r,s,i,j,k,l + double precision, allocatable :: a1(:,:,:,:) + double precision, allocatable :: a2(:,:,:,:) + + allocate (a1(ao_num,ao_num,ao_num,ao_num)) + + print *, 'Getting AOs' + !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q,r,s) + do s=1,ao_num + do r=1,ao_num + do q=1,ao_num + call get_ao_two_e_integrals(q,r,s,ao_num,a1(1,q,r,s)) + enddo + enddo + enddo + !$OMP END PARALLEL DO + + print *, '1st transformation' + ! 1st transformation + allocate (a2(ao_num,ao_num,ao_num,mo_num)) + call dgemm('T','N', (ao_num*ao_num*ao_num), mo_num, ao_num, 1.d0, a1, ao_num, mo_coef, ao_num, 0.d0, a2, (ao_num*ao_num*ao_num)) + + ! 2nd transformation + print *, '2nd transformation' + deallocate (a1) + allocate (a1(ao_num,ao_num,mo_num,mo_num)) + call dgemm('T','N', (ao_num*ao_num*mo_num), mo_num, ao_num, 1.d0, a2, ao_num, mo_coef, ao_num, 0.d0, a1, (ao_num*ao_num*mo_num)) + + ! 3rd transformation + print *, '3rd transformation' + deallocate (a2) + allocate (a2(ao_num,mo_num,mo_num,mo_num)) + call dgemm('T','N', (ao_num*mo_num*mo_num), mo_num, ao_num, 1.d0, a1, ao_num, mo_coef, ao_num, 0.d0, a2, (ao_num*mo_num*mo_num)) + + ! 4th transformation + print *, '4th transformation' + deallocate (a1) + allocate (a1(mo_num,mo_num,mo_num,mo_num)) + call dgemm('T','N', (mo_num*mo_num*mo_num), mo_num, ao_num, 1.d0, a2, ao_num, mo_coef, ao_num, 0.d0, a1, (mo_num*mo_num*mo_num)) + + deallocate (a2) + + integer :: n_integrals, size_buffer + integer(key_kind) , allocatable :: buffer_i(:) + real(integral_kind), allocatable :: buffer_value(:) + size_buffer = min(ao_num*ao_num*ao_num,16000000) + + !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,j,k,l,buffer_value,buffer_i,n_integrals) + allocate ( buffer_i(size_buffer), buffer_value(size_buffer) ) + + n_integrals = 0 + !$OMP DO + do l=1,mo_num + do k=1,mo_num + do j=1,l + do i=1,k + if (abs(a1(i,j,k,l)) < mo_integrals_threshold) then + cycle + endif + n_integrals += 1 + buffer_value(n_integrals) = a1(i,j,k,l) + !DIR$ FORCEINLINE + call mo_two_e_integrals_index(i,j,k,l,buffer_i(n_integrals)) + if (n_integrals == size_buffer) then + call map_append(mo_integrals_map, buffer_i, buffer_value, n_integrals) + n_integrals = 0 + endif + enddo + enddo + enddo + enddo + !$OMP END DO + + call map_append(mo_integrals_map, buffer_i, buffer_value, n_integrals) + + deallocate(buffer_i, buffer_value) + !$OMP END PARALLEL + + deallocate (a1) + + call map_unique(mo_integrals_map) + + integer*8 :: get_mo_map_size, mo_map_size + mo_map_size = get_mo_map_size() + +end subroutine subroutine add_integrals_to_map(mask_ijkl) use bitmasks @@ -153,24 +245,26 @@ subroutine add_integrals_to_map(mask_ijkl) return endif - size_buffer = min(ao_num*ao_num*ao_num,16000000) + call wall_time(wall_1) + + size_buffer = min(ao_num*ao_num*ao_num,8000000) print*, 'Buffers : ', 8.*(mo_num*(n_j)*(n_k+1) + mo_num+& ao_num+ao_num*ao_num+ size_buffer*3)/(1024*1024), 'MB / core' - double precision :: accu_bis - accu_bis = 0.d0 - call wall_time(wall_1) - !$OMP PARALLEL PRIVATE(l1,k1,j1,i1,i2,i3,i4,i,j,k,l,c, ii1,kmax, & !$OMP two_e_tmp_0_idx, two_e_tmp_0, two_e_tmp_1,two_e_tmp_2,two_e_tmp_3,& !$OMP buffer_i,buffer_value,n_integrals,wall_2,i0,j0,k0,l0, & - !$OMP wall_0,thread_num,accu_bis) & + !$OMP wall_0,thread_num) & !$OMP DEFAULT(NONE) & !$OMP SHARED(size_buffer,ao_num,mo_num,n_i,n_j,n_k,n_l, & !$OMP mo_coef_transp, & !$OMP mo_coef_transp_is_built, list_ijkl, & !$OMP mo_coef_is_built, wall_1, & !$OMP mo_coef,mo_integrals_threshold,mo_integrals_map) + + thread_num = 0 + !$ thread_num = omp_get_thread_num() + n_integrals = 0 wall_0 = wall_1 allocate(two_e_tmp_3(mo_num, n_j, n_k), & @@ -181,8 +275,6 @@ subroutine add_integrals_to_map(mask_ijkl) buffer_i(size_buffer), & buffer_value(size_buffer) ) - thread_num = 0 - !$ thread_num = omp_get_thread_num() !$OMP DO SCHEDULE(guided) do l1 = 1,ao_num two_e_tmp_3 = 0.d0 @@ -340,10 +432,10 @@ subroutine add_integrals_to_map(mask_ijkl) !$OMP END DO NOWAIT deallocate (two_e_tmp_1,two_e_tmp_2,two_e_tmp_3) - integer :: index_needed - - call insert_into_mo_integrals_map(n_integrals,buffer_i,buffer_value,& - real(mo_integrals_threshold,integral_kind)) + if (n_integrals > 0) then + call insert_into_mo_integrals_map(n_integrals,buffer_i,buffer_value,& + real(mo_integrals_threshold,integral_kind)) + endif deallocate(buffer_i, buffer_value) !$OMP END PARALLEL call map_merge(mo_integrals_map) @@ -433,12 +525,10 @@ subroutine add_integrals_to_map_three_indices(mask_ijk) call wall_time(wall_1) call cpu_time(cpu_1) - double precision :: accu_bis - accu_bis = 0.d0 !$OMP PARALLEL PRIVATE(m,l1,k1,j1,i1,i2,i3,i4,i,j,k,l,c, ii1,kmax, & !$OMP two_e_tmp_0_idx, two_e_tmp_0, two_e_tmp_1,two_e_tmp_2,two_e_tmp_3,& !$OMP buffer_i,buffer_value,n_integrals,wall_2,i0,j0,k0,l0, & - !$OMP wall_0,thread_num,accu_bis) & + !$OMP wall_0,thread_num) & !$OMP DEFAULT(NONE) & !$OMP SHARED(size_buffer,ao_num,mo_num,n_i,n_j,n_k, & !$OMP mo_coef_transp, & @@ -636,8 +726,6 @@ subroutine add_integrals_to_map_three_indices(mask_ijk) !$OMP END DO NOWAIT deallocate (two_e_tmp_1,two_e_tmp_2,two_e_tmp_3) - integer :: index_needed - call insert_into_mo_integrals_map(n_integrals,buffer_i,buffer_value,& real(mo_integrals_threshold,integral_kind)) deallocate(buffer_i, buffer_value)