dav_double_dress_ext_rout.irp.f

2025-04-25 01:34:58 +02:00 · 2021-10-26 19:11:04 +02:00 · 2021-10-26 19:11:04 +02:00 · 1f0c48023d
commit 1f0c48023d
parent 699f655b89
3 changed files with 557 additions and 174 deletions
--- a/src/dav_general_mat/dav_double_dress_ext_rout.irp.f
+++ b/src/dav_general_mat/dav_double_dress_ext_rout.irp.f
@ -0,0 +1,520 @@
 subroutine dav_double_dressed(u_in,H_jj,Dress_jj,Dressing_vec,idx_dress,energies,sze,N_st,N_st_diag,converged,hcalc)
  use mmap_module
  BEGIN_DOC
  ! Generic Davidson diagonalization with TWO DRESSING VECTORS 
  !
  ! Dress_jj : DIAGONAL DRESSING of the Hamiltonian 
  !
  ! Dressing_vec : COLUMN / LINE DRESSING VECTOR 
  !
  ! idx_dress : position of the basis function used to use the Dressing_vec (usually the largest coeff)
  !
  ! H_jj : specific diagonal H matrix elements to diagonalize de Davidson
  !
  ! u_in : guess coefficients on the various states. Overwritten on exit
  !
  ! sze : leftmost dimension of u_in
  !
  ! sze : Number of determinants
  !
  ! N_st : Number of eigenstates
  !
  ! N_st_diag : Number of states in which H is diagonalized. Assumed > sze
  !
  ! Initial guess vectors are not necessarily orthonormal
  !
  ! hcalc subroutine to compute W = H U (see routine hcalc_template for template of input/output)
  END_DOC
  implicit none
  integer, intent(in)             :: sze, N_st, N_st_diag, idx_dress
  double precision,  intent(in)   :: H_jj(sze),Dress_jj(sze),Dressing_vec(sze,N_st)
  double precision, intent(inout) :: u_in(sze,N_st_diag)
  double precision, intent(out)   :: energies(N_st_diag)
  logical, intent(out)            :: converged
  external hcalc
  double precision, allocatable   :: H_jj_tmp(:)
  ASSERT (N_st > 0)
  ASSERT (sze > 0)
  allocate(H_jj_tmp(sze))
  do i=1,sze
    H_jj_tmp(i)  = H_jj(i) + Dress_jj(i)
  enddo
  do k=1,N_st
    do i=1,sze
      H_jj_tmp(i)  += u_in(i,k) * Dressing_vec(i,k)
    enddo
  enddo
  integer                        :: iter
  integer                        :: i,j,k,l,m
  double precision, external     :: u_dot_v, u_dot_u
  integer                        :: k_pairs, kl
  integer                        :: iter2, itertot
  double precision, allocatable  :: y(:,:), h(:,:), lambda(:)
  double precision, allocatable  :: s_tmp(:,:)
  double precision, allocatable  :: residual_norm(:),inv_c_idx_dress_vec(:)
  character*(16384)              :: write_buffer
  double precision               :: to_print(2,N_st),inv_c_idx_dress
  double precision               :: cpu, wall
  integer                        :: shift, shift2, itermax, istate
  double precision               :: r1, r2, alpha
  logical                        :: state_ok(N_st_diag*davidson_sze_max)
  integer                        :: nproc_target
  integer                        :: order(N_st_diag)
  double precision               :: cmax
  double precision, allocatable  :: U(:,:), overlap(:,:)
  double precision, pointer      :: W(:,:)
  logical                        :: disk_based
  double precision               :: energy_shift(N_st_diag*davidson_sze_max)
  allocate(inv_c_idx_dress_vec(N_st))
  inv_c_idx_dress = 1.d0/u_in(idx_dress,1)
  do i = 1, N_st
   inv_c_idx_dress_vec(i) = 1.d0/u_in(idx_dress,i)
  enddo
  include 'constants.include.F'
  integer :: N_st_diag_in 
  N_st_diag_in = N_st_diag
  !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: U, W, y, h, lambda
  if (N_st_diag_in*3 > sze) then
    print *,  'error in Davidson :'
    print *,  'Increase n_det_max_full to ', N_st_diag_in*3
    stop -1
  endif
  itermax = max(2,min(davidson_sze_max, sze/N_st_diag_in))+1
  itertot = 0
  if (state_following) then
    allocate(overlap(N_st_diag_in*itermax, N_st_diag_in*itermax))
  else
    allocate(overlap(1,1))  ! avoid 'if' for deallocate
  endif
  overlap = 0.d0
  call write_time(6)
  write(6,'(A)') ''
  write(6,'(A)') 'Davidson Diagonalization'
  write(6,'(A)') '------------------------'
  write(6,'(A)') ''
  ! Find max number of cores to fit in memory
  ! -----------------------------------------
  nproc_target = nproc
  double precision :: rss
  integer :: maxab
  maxab = max(N_det_alpha_unique, N_det_beta_unique)+1
  m=1
  disk_based = .False.
  call resident_memory(rss)
  do
    r1 = 8.d0 *                                   &! bytes
         ( dble(sze)*(N_st_diag_in*itermax)          &! U
         + 1.0d0*dble(sze*m)*(N_st_diag_in*itermax)  &! W
         + 3.0d0*(N_st_diag_in*itermax)**2           &! h,y,s_tmp
         + 1.d0*(N_st_diag_in*itermax)               &! lambda
         + 1.d0*(N_st_diag_in)                       &! residual_norm
                                                   ! In H_u_0_nstates_zmq
         + 2.d0*(N_st_diag_in*N_det)                 &! u_t, v_t, on collector
         + 2.d0*(N_st_diag_in*N_det)                 &! u_t, v_t, on slave
         + 0.5d0*maxab                            &! idx0 in H_u_0_nstates_openmp_work_*
         + nproc_target *                         &! In OMP section
           ( 1.d0*(N_int*maxab)                   &! buffer
           + 3.5d0*(maxab) )                      &! singles_a, singles_b, doubles, idx
         ) / 1024.d0**3
    if (nproc_target == 0) then
      call check_mem(r1,irp_here)
      nproc_target = 1
      exit
    endif
    if (r1+rss < qp_max_mem) then
      exit
    endif
    if (itermax > 4) then
      itermax = itermax - 1
    else if (m==1.and.disk_based_davidson) then
      m=0
      disk_based = .True.
      itermax = 6
    else
      nproc_target = nproc_target - 1
    endif
  enddo
  nthreads_davidson = nproc_target
  TOUCH nthreads_davidson
  call write_int(6,N_st,'Number of states')
  call write_int(6,N_st_diag_in,'Number of states in diagonalization')
  call write_int(6,sze,'Number of basis functions ')
  call write_int(6,nproc_target,'Number of threads for diagonalization')
  call write_double(6, r1, 'Memory(Gb)')
  if (disk_based) then
    print *, 'Using swap space to reduce RAM'
  endif
  !---------------
  write(6,'(A)') ''
  write_buffer = '====='
  do i=1,N_st
    write_buffer = trim(write_buffer)//' ================ ==========='
  enddo
  write(6,'(A)') write_buffer(1:6+41*N_st)
  write_buffer = 'Iter'
  do i=1,N_st
    write_buffer = trim(write_buffer)//'       Energy        Residual '
  enddo
  write(6,'(A)') write_buffer(1:6+41*N_st)
  write_buffer = '====='
  do i=1,N_st
    write_buffer = trim(write_buffer)//' ================ ==========='
  enddo
  write(6,'(A)') write_buffer(1:6+41*N_st)
  allocate(W(sze,N_st_diag_in*itermax))
  allocate(                                                          &
      ! Large
      U(sze,N_st_diag_in*itermax),                                      &
      ! Small
      h(N_st_diag_in*itermax,N_st_diag_in*itermax),                        &
      y(N_st_diag_in*itermax,N_st_diag_in*itermax),                        &
      s_tmp(N_st_diag_in*itermax,N_st_diag_in*itermax),                    &
      residual_norm(N_st_diag_in),                                      &
      lambda(N_st_diag_in*itermax))
  h = 0.d0
  U = 0.d0
  y = 0.d0
  s_tmp = 0.d0
  ASSERT (N_st > 0)
  ASSERT (N_st_diag_in >= N_st)
  ASSERT (sze > 0)
  ASSERT (Nint > 0)
  ASSERT (Nint == N_int)
  ! Davidson iterations
  ! ===================
  converged = .False.
  do k=N_st+1,N_st_diag_in
    do i=1,sze
        call random_number(r1)
        call random_number(r2)
        r1 = dsqrt(-2.d0*dlog(r1))
        r2 = dtwo_pi*r2
        u_in(i,k) = r1*dcos(r2) * u_in(i,k-N_st)
    enddo
    u_in(k,k) = u_in(k,k) + 10.d0
  enddo
  do k=1,N_st_diag_in
    call normalize(u_in(1,k),sze)
  enddo
  do k=1,N_st_diag_in
    do i=1,sze
      U(i,k) = u_in(i,k)
    enddo
  enddo
  do while (.not.converged)
    itertot = itertot+1
    if (itertot == 8) then
      exit
    endif
    do iter=1,itermax-1
      shift  = N_st_diag_in*(iter-1)
      shift2 = N_st_diag_in*iter
      if ((iter > 1).or.(itertot == 1)) then
        ! Compute |W_k> = \sum_i |i><i|H|u_k>
        ! -----------------------------------
        call hcalc(W(1,shift+1),U(1,shift+1),N_st_diag_in,sze)
        ! Compute then the DIAGONAL PART OF THE DRESSING 
        ! <i|W_k> += Dress_jj(i) * <i|U>
        call dressing_diag_uv(W(1,shift+1),U(1,shift+1),Dress_jj,N_st_diag_in,sze)
      else
         ! Already computed in update below
         continue
      endif
        if (N_st == 1) then
          l = idx_dress
          double precision :: f
          f = inv_c_idx_dress
          do istate=1,N_st_diag_in
            do i=1,sze
              W(i,shift+istate) += Dressing_vec(i,1) *f * U(l,shift+istate)
              W(l,shift+istate) += Dressing_vec(i,1) *f * U(i,shift+istate)
            enddo
          enddo
        else
         print*,'dav_double_dressed routine not yet implemented for N_st > 1'
 !
 !          call dgemm('T','N', N_st, N_st_diag_in, sze, 1.d0, &
 !            psi_coef, size(psi_coef,1), &
 !            U(1,shift+1), size(U,1), 0.d0, s_tmp, size(s_tmp,1))
 !
 !          call dgemm('N','N', sze, N_st_diag_in, N_st, 1.0d0, &
 !            Dressing_vec, size(Dressing_vec,1), s_tmp, size(s_tmp,1), &
 !            1.d0, W(1,shift+1), size(W,1))
 !
 !
 !          call dgemm('T','N', N_st, N_st_diag_in, sze, 1.d0, &
 !            Dressing_vec, size(Dressing_vec,1), &
 !            U(1,shift+1), size(U,1), 0.d0, s_tmp, size(s_tmp,1))
 !
 !          call dgemm('N','N', sze, N_st_diag_in, N_st, 1.0d0, &
 !            psi_coef, size(psi_coef,1), s_tmp, size(s_tmp,1), &
 !            1.d0, W(1,shift+1), size(W,1))
 !
        endif
      ! Compute h_kl = <u_k | W_l> = <u_k| H |u_l>
      ! -------------------------------------------
      call dgemm('T','N', shift2, shift2, sze,                       &
          1.d0, U, size(U,1), W, size(W,1),                          &
          0.d0, h, size(h,1))
      call dgemm('T','N', shift2, shift2, sze,                       &
          1.d0, U, size(U,1), U, size(U,1),                          &
          0.d0, s_tmp, size(s_tmp,1))
      ! Diagonalize h
      ! ---------------
       integer :: lwork, info
       double precision, allocatable :: work(:)
       y = h
       lwork = -1
       allocate(work(1))
       call dsygv(1,'V','U',shift2,y,size(y,1), &
          s_tmp,size(s_tmp,1), lambda, work,lwork,info)
       lwork = int(work(1))
       deallocate(work)
       allocate(work(lwork))
       call dsygv(1,'V','U',shift2,y,size(y,1), &
          s_tmp,size(s_tmp,1), lambda, work,lwork,info)
       deallocate(work)
       if (info /= 0) then
         stop 'DSYGV Diagonalization failed'
       endif
      ! Compute Energy for each eigenvector
      ! -----------------------------------
      call dgemm('N','N',shift2,shift2,shift2,                       &
          1.d0, h, size(h,1), y, size(y,1),                          &
          0.d0, s_tmp, size(s_tmp,1))
      call dgemm('T','N',shift2,shift2,shift2,                       &
          1.d0, y, size(y,1), s_tmp, size(s_tmp,1),                  &
          0.d0, h, size(h,1))
      do k=1,shift2
        lambda(k) = h(k,k)
      enddo
      if (state_following) then
        overlap = -1.d0
        do k=1,shift2
          do i=1,shift2
            overlap(k,i) = dabs(y(k,i))
          enddo
        enddo
        do k=1,N_st
          cmax = -1.d0
          do i=1,N_st
            if (overlap(i,k) > cmax) then
              cmax = overlap(i,k)
              order(k) = i
            endif
          enddo
          do i=1,N_st_diag_in
            overlap(order(k),i) = -1.d0
          enddo
        enddo
        overlap = y
        do k=1,N_st
          l = order(k)
          if (k /= l) then
            y(1:shift2,k) = overlap(1:shift2,l)
          endif
        enddo
        do k=1,N_st
          overlap(k,1) = lambda(k)
        enddo
      endif
      ! Express eigenvectors of h in the determinant basis
      ! --------------------------------------------------
      call dgemm('N','N', sze, N_st_diag_in, shift2,                    &
          1.d0, U, size(U,1), y, size(y,1), 0.d0, U(1,shift2+1), size(U,1))
      call dgemm('N','N', sze, N_st_diag_in, shift2,                    &
          1.d0, W, size(W,1), y, size(y,1), 0.d0, W(1,shift2+1), size(W,1))
      ! Compute residual vector and davidson step
      ! -----------------------------------------
      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i,k)
      do k=1,N_st_diag_in
        do i=1,sze
          U(i,shift2+k) =  &
            (lambda(k) * U(i,shift2+k) - W(i,shift2+k) )      &
              /max(H_jj_tmp(i) - lambda (k),1.d-2)
        enddo
        if (k <= N_st) then
          residual_norm(k) = u_dot_u(U(1,shift2+k),sze)
          to_print(1,k) = lambda(k) 
          to_print(2,k) = residual_norm(k)
        endif
      enddo
      !$OMP END PARALLEL DO
      if ((itertot>1).and.(iter == 1)) then
        !don't print
        continue
      else
        write(*,'(1X,I3,1X,100(1X,F16.10,1X,E11.3))') iter-1, to_print(1:2,1:N_st)
      endif
      ! Check convergence
      if (iter > 1) then
        converged = dabs(maxval(residual_norm(1:N_st))) < threshold_davidson
      endif
      do k=1,N_st
        if (residual_norm(k) > 1.d8) then
          print *, 'Davidson failed'
          stop -1
        endif
      enddo
      if (converged) then
        exit
      endif
      logical, external :: qp_stop
      if (qp_stop()) then
        converged = .True.
        exit
      endif
    enddo
    ! Re-contract U and update W
    ! --------------------------------
    call dgemm('N','N', sze, N_st_diag_in, shift2, 1.d0,      &
        W, size(W,1), y, size(y,1), 0.d0, u_in, size(u_in,1))
    do k=1,N_st_diag_in
      do i=1,sze
        W(i,k) = u_in(i,k)
      enddo
    enddo
    call dgemm('N','N', sze, N_st_diag_in, shift2, 1.d0,      &
        U, size(U,1), y, size(y,1), 0.d0, u_in, size(u_in,1))
    do k=1,N_st_diag_in
      do i=1,sze
        U(i,k) = u_in(i,k)
      enddo
    enddo
  enddo
  call nullify_small_elements(sze,N_st_diag_in,U,size(U,1),threshold_davidson_pt2)
  do k=1,N_st_diag_in
    do i=1,sze
      u_in(i,k) = U(i,k)
    enddo
  enddo
  do k=1,N_st_diag_in
    energies(k) = lambda(k)
  enddo
  write_buffer = '======'
  do i=1,N_st
    write_buffer = trim(write_buffer)//' ================ ==========='
  enddo
  write(6,'(A)') trim(write_buffer)
  write(6,'(A)') ''
  call write_time(6)
  deallocate(W)
  deallocate (                                                       &
      residual_norm,                                                 &
      U, overlap,                                                    &
      h, y, s_tmp,                                                   &
      lambda                                                         &
      )
  FREE nthreads_davidson
 end
 subroutine dressing_diag_uv(v,u,dress_diag,N_st,sze)
  implicit none
  BEGIN_DOC
  ! Routine that computes the diagonal part of the dressing 
  !
  ! v(i) += u(i) * dress_diag(i)
  !
  ! !!!!!!!! WARNING !!!!!!!! the vector v is not initialized 
  !
  ! !!!!!!!! SO MAKE SURE THERE ARE SOME MEANINGFUL VALUES IN THERE
  END_DOC
  integer, intent(in)              :: N_st,sze
  double precision, intent(in)     :: u(sze,N_st),dress_diag(sze)
  double precision, intent(inout)  :: v(sze,N_st)
  integer :: i,istate
  do istate = 1, N_st
   do i = 1, sze
     v(i,istate) += dress_diag(i) * u(i,istate)
   enddo
  enddo
 end
--- a/src/dav_general_mat/dav_ext_rout.irp.f
+++ b/src/dav_general_mat/dav_ext_rout.irp.f
@ -3,7 +3,7 @@ subroutine davidson_general_ext_rout(u_in,H_jj,energies,sze,N_st,N_st_diag_in,co
  use mmap_module
  implicit none
  BEGIN_DOC
-  ! Davidson diagonalization with specific diagonal elements of the H matrix
+  ! Generic Davidson diagonalization 
  !
  ! H_jj : specific diagonal H matrix elements to diagonalize de Davidson
  !
--- a/src/utils/sort.irp.f
+++ b/src/utils/sort.irp.f
@ -38,7 +38,15 @@ BEGIN_TEMPLATE
  $type,intent(inout)            :: x(isize)
  integer,intent(inout)          :: iorder(isize)
  integer, external              :: omp_get_num_threads
  if (omp_get_num_threads() == 1) then
    !$OMP PARALLEL DEFAULT(SHARED)
    !$OMP SINGLE
    call rec_$X_quicksort(x,iorder,isize,1,isize,nproc)
    !$OMP END SINGLE
    !$OMP END PARALLEL
  else
    call rec_$X_quicksort(x,iorder,isize,1,isize,nproc)
  endif
 end
 recursive subroutine rec_$X_quicksort(x, iorder, isize, first, last, level)
@ -81,11 +89,16 @@ BEGIN_TEMPLATE
    endif
  else
    if (first < i-1) then
      !$OMP TASK DEFAULT(SHARED) FIRSTPRIVATE(isize,first,i,level)
      call rec_$X_quicksort(x, iorder, isize, first, i-1,level/2)
      !$OMP END TASK
    endif
    if (j+1 < last) then
      !$OMP TASK DEFAULT(SHARED) FIRSTPRIVATE(isize,last,j,level)
      call rec_$X_quicksort(x, iorder, isize, j+1, last,level/2)
      !$OMP END TASK
    endif
    !$OMP TASKWAIT
  endif
 end
@ -249,60 +262,7 @@ SUBST [ X, type ]
 i2 ; integer*2 ;;
 END_TEMPLATE
 !---------------------- INTEL
 IRP_IF INTEL
 BEGIN_TEMPLATE
 subroutine $Xsort(x,iorder,isize)
  use intel
  implicit none
  BEGIN_DOC
  ! Sort array x(isize).
  ! iorder in input should be (1,2,3,...,isize), and in output
  ! contains the new order of the elements.
  END_DOC
  integer,intent(in)             :: isize
  $type,intent(inout)            :: x(isize)
  integer,intent(inout)          :: iorder(isize)
  integer                        :: n
  character, allocatable         :: tmp(:)
  if (isize < 2) return
  call ippsSortRadixIndexGetBufferSize(isize, $ippsz, n)
  allocate(tmp(n))
  call ippsSortRadixIndexAscend_$ityp(x, $n, iorder, isize, tmp)
  deallocate(tmp)
  iorder(1:isize) = iorder(1:isize)+1
  call $Xset_order(x,iorder,isize)
 end
 subroutine $Xsort_noidx(x,isize)
  use intel
  implicit none
  BEGIN_DOC
  ! Sort array x(isize).
  ! iorder in input should be (1,2,3,...,isize), and in output
  ! contains the new order of the elements.
  END_DOC
  integer,intent(in)             :: isize
  $type,intent(inout)            :: x(isize)
  integer                        :: n
  character, allocatable         :: tmp(:)
  if (isize < 2) return
  call ippsSortRadixIndexGetBufferSize(isize, $ippsz, n)
  allocate(tmp(n))
  call ippsSortRadixAscend_$ityp_I(x, isize, tmp)
  deallocate(tmp)
 end
 SUBST [ X, type, ityp, n, ippsz ]
   ; real ; 32f ; 4 ; 13 ;;
 i ; integer ; 32s ; 4 ; 11 ;;
 i2 ; integer*2 ; 16s ; 2 ; 7 ;;
 END_TEMPLATE
 BEGIN_TEMPLATE
 subroutine $Xsort(x,iorder,isize)
  implicit none
  BEGIN_DOC
@ -329,12 +289,12 @@ BEGIN_TEMPLATE
  endif
 end subroutine $Xsort
-SUBST [ X, type ]
+SUBST [ X, type, Y ]
- d ; double precision ;;
+   ; real ; i ;;
 d ; double precision ; i8 ;;
 END_TEMPLATE
 BEGIN_TEMPLATE
 subroutine $Xsort(x,iorder,isize)
  implicit none
  BEGIN_DOC
@ -346,112 +306,8 @@ BEGIN_TEMPLATE
  $type,intent(inout)            :: x(isize)
  integer,intent(inout)          :: iorder(isize)
  integer                        :: n
-  if (isize < 2) then
+!  call $Xradix_sort(x,iorder,isize,-1)
    return
  endif
  call sorted_$Xnumber(x,isize,n)
  if (isize == n) then
    return
  endif
  if ( isize < 32) then
    call insertion_$Xsort(x,iorder,isize)
  else
    call $Xradix_sort(x,iorder,isize,-1)
  endif
 end subroutine $Xsort
 SUBST [ X, type ]
 i8 ; integer*8 ;;
 END_TEMPLATE
 !---------------------- END INTEL
 IRP_ELSE
 !---------------------- NON-INTEL
 BEGIN_TEMPLATE
 subroutine $Xsort_noidx(x,isize)
  implicit none
  BEGIN_DOC
  ! Sort array x(isize).
  END_DOC
  integer,intent(in)             :: isize
  $type,intent(inout)            :: x(isize)
  integer, allocatable           :: iorder(:)
  integer                        :: i
  allocate(iorder(isize))
  do i=1,isize
   iorder(i)=i
  enddo
  call $Xsort(x,iorder,isize)
  deallocate(iorder)
 end subroutine $Xsort_noidx
 SUBST [ X, type ]
   ; real ;;
 d ; double precision ;;
 i ; integer ;;
 i8 ; integer*8 ;;
 i2 ; integer*2 ;;
 END_TEMPLATE
 BEGIN_TEMPLATE
 subroutine $Xsort(x,iorder,isize)
  implicit none
  BEGIN_DOC
  ! Sort array x(isize).
  ! iorder in input should be (1,2,3,...,isize), and in output
  ! contains the new order of the elements.
  END_DOC
  integer,intent(in)             :: isize
  $type,intent(inout)            :: x(isize)
  integer,intent(inout)          :: iorder(isize)
  integer                        :: n
  if (isize < 2) then
    return
  endif
 !  call sorted_$Xnumber(x,isize,n)
 !  if (isize == n) then
 !    return
 !  endif
  if ( isize < 32) then
    call insertion_$Xsort(x,iorder,isize)
  else
 !    call heap_$Xsort(x,iorder,isize)
  call quick_$Xsort(x,iorder,isize)
  endif
 end subroutine $Xsort
 SUBST [ X, type ]
   ; real ;;
 d ; double precision ;;
 END_TEMPLATE
 BEGIN_TEMPLATE
 subroutine $Xsort(x,iorder,isize)
  implicit none
  BEGIN_DOC
  ! Sort array x(isize).
  ! iorder in input should be (1,2,3,...,isize), and in output
  ! contains the new order of the elements.
  END_DOC
  integer,intent(in)             :: isize
  $type,intent(inout)            :: x(isize)
  integer,intent(inout)          :: iorder(isize)
  integer                        :: n
  if (isize < 2) then
    return
  endif
  call sorted_$Xnumber(x,isize,n)
  if (isize == n) then
    return
  endif
  if ( isize < 32) then
    call insertion_$Xsort(x,iorder,isize)
  else
    call $Xradix_sort(x,iorder,isize,-1)
  endif
 end subroutine $Xsort
 SUBST [ X, type ]
@ -460,11 +316,6 @@ SUBST [ X, type ]
 i2 ; integer*2 ;;
 END_TEMPLATE
 IRP_ENDIF
 !---------------------- END NON-INTEL
 BEGIN_TEMPLATE
 subroutine $Xset_order(x,iorder,isize)
  implicit none
@ -562,12 +413,10 @@ SUBST [ X, type ]
 i2; integer*2 ;;
 END_TEMPLATE
 BEGIN_TEMPLATE
 recursive subroutine $Xradix_sort$big(x,iorder,isize,iradix)
  implicit none
  BEGIN_DOC
  ! Sort integer array x(isize) using the radix sort algorithm.
  ! iorder in input should be (1,2,3,...,isize), and in output
@ -703,14 +552,24 @@ recursive subroutine $Xradix_sort$big(x,iorder,isize,iradix)
    endif
 !   !$OMP PARALLEL DEFAULT(SHARED) if (isize > 1000000)
 !   !$OMP SINGLE
    if (i3>1_$int_type) then
 !     !$OMP TASK FIRSTPRIVATE(iradix_new,i3) SHARED(x,iorder) if(i3 > 1000000)
      call $Xradix_sort$big(x,iorder,i3,iradix_new-1)
 !     !$OMP END TASK
    endif
    if (isize-i3>1_$int_type) then
 !     !$OMP TASK FIRSTPRIVATE(iradix_new,i3) SHARED(x,iorder) if(isize-i3 > 1000000)
      call $Xradix_sort$big(x(i3+1_$int_type),iorder(i3+1_$int_type),isize-i3,iradix_new-1)
 !     !$OMP END TASK
    endif
 !   !$OMP TASKWAIT
 !   !$OMP END SINGLE
 !   !$OMP END PARALLEL
    return
  endif
@ -765,11 +624,16 @@ recursive subroutine $Xradix_sort$big(x,iorder,isize,iradix)
  if (i1>1_$int_type) then
    !$OMP TASK FIRSTPRIVATE(i0,iradix,i1) SHARED(x,iorder) if(i1 >1000000)
    call $Xradix_sort$big(x(i0+1_$int_type),iorder(i0+1_$int_type),i1,iradix-1)
    !$OMP END TASK
  endif
  if (i0>1) then
    !$OMP TASK FIRSTPRIVATE(i0,iradix) SHARED(x,iorder) if(i0 >1000000)
    call $Xradix_sort$big(x,iorder,i0,iradix-1)
    !$OMP END TASK
  endif
  !$OMP TASKWAIT
 end
@ -782,4 +646,3 @@ SUBST [ X, type, integer_size, is_big, big, int_type ]
 END_TEMPLATE