dav_double_dress_ext_rout.irp.f

2025-04-27 18:54:42 +02:00 · 2021-10-26 19:11:04 +02:00 · 2021-10-26 19:11:04 +02:00 · 1f0c48023d
commit 1f0c48023d
parent 699f655b89
3 changed files with 557 additions and 174 deletions
--- a/src/dav_general_mat/dav_double_dress_ext_rout.irp.f
+++ b/src/dav_general_mat/dav_double_dress_ext_rout.irp.f
@ -0,0 +1,520 @@
+subroutine dav_double_dressed(u_in,H_jj,Dress_jj,Dressing_vec,idx_dress,energies,sze,N_st,N_st_diag,converged,hcalc)
+  use mmap_module
+  BEGIN_DOC
+  ! Generic Davidson diagonalization with TWO DRESSING VECTORS 
+  !
+  ! Dress_jj : DIAGONAL DRESSING of the Hamiltonian 
+  !
+  ! Dressing_vec : COLUMN / LINE DRESSING VECTOR 
+  !
+  ! idx_dress : position of the basis function used to use the Dressing_vec (usually the largest coeff)
+  !
+  ! H_jj : specific diagonal H matrix elements to diagonalize de Davidson
+  !
+  ! u_in : guess coefficients on the various states. Overwritten on exit
+  !
+  ! sze : leftmost dimension of u_in
+  !
+  ! sze : Number of determinants
+  !
+  ! N_st : Number of eigenstates
+  !
+  ! N_st_diag : Number of states in which H is diagonalized. Assumed > sze
+  !
+  ! Initial guess vectors are not necessarily orthonormal
+  !
+  ! hcalc subroutine to compute W = H U (see routine hcalc_template for template of input/output)
+  END_DOC
+  implicit none
+  integer, intent(in)             :: sze, N_st, N_st_diag, idx_dress
+  double precision,  intent(in)   :: H_jj(sze),Dress_jj(sze),Dressing_vec(sze,N_st)
+  double precision, intent(inout) :: u_in(sze,N_st_diag)
+  double precision, intent(out)   :: energies(N_st_diag)
+  logical, intent(out)            :: converged
+  external hcalc
+
+  double precision, allocatable   :: H_jj_tmp(:)
+  ASSERT (N_st > 0)
+  ASSERT (sze > 0)
+  allocate(H_jj_tmp(sze))
+
+  do i=1,sze
+    H_jj_tmp(i)  = H_jj(i) + Dress_jj(i)
+  enddo
+  do k=1,N_st
+    do i=1,sze
+      H_jj_tmp(i)  += u_in(i,k) * Dressing_vec(i,k)
+    enddo
+  enddo
+
+  integer                        :: iter
+  integer                        :: i,j,k,l,m
+
+  double precision, external     :: u_dot_v, u_dot_u
+
+  integer                        :: k_pairs, kl
+
+  integer                        :: iter2, itertot
+  double precision, allocatable  :: y(:,:), h(:,:), lambda(:)
+  double precision, allocatable  :: s_tmp(:,:)
+  double precision, allocatable  :: residual_norm(:),inv_c_idx_dress_vec(:)
+  character*(16384)              :: write_buffer
+  double precision               :: to_print(2,N_st),inv_c_idx_dress
+  double precision               :: cpu, wall
+  integer                        :: shift, shift2, itermax, istate
+  double precision               :: r1, r2, alpha
+  logical                        :: state_ok(N_st_diag*davidson_sze_max)
+  integer                        :: nproc_target
+  integer                        :: order(N_st_diag)
+  double precision               :: cmax
+  double precision, allocatable  :: U(:,:), overlap(:,:)
+  double precision, pointer      :: W(:,:)
+  logical                        :: disk_based
+  double precision               :: energy_shift(N_st_diag*davidson_sze_max)
+
+
+  allocate(inv_c_idx_dress_vec(N_st))
+  inv_c_idx_dress = 1.d0/u_in(idx_dress,1)
+  do i = 1, N_st
+   inv_c_idx_dress_vec(i) = 1.d0/u_in(idx_dress,i)
+  enddo
+  include 'constants.include.F'
+
+  integer :: N_st_diag_in 
+  N_st_diag_in = N_st_diag
+
+  !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: U, W, y, h, lambda
+  if (N_st_diag_in*3 > sze) then
+    print *,  'error in Davidson :'
+    print *,  'Increase n_det_max_full to ', N_st_diag_in*3
+    stop -1
+  endif
+
+  itermax = max(2,min(davidson_sze_max, sze/N_st_diag_in))+1
+  itertot = 0
+
+  if (state_following) then
+    allocate(overlap(N_st_diag_in*itermax, N_st_diag_in*itermax))
+  else
+    allocate(overlap(1,1))  ! avoid 'if' for deallocate
+  endif
+  overlap = 0.d0
+
+  call write_time(6)
+  write(6,'(A)') ''
+  write(6,'(A)') 'Davidson Diagonalization'
+  write(6,'(A)') '------------------------'
+  write(6,'(A)') ''
+
+  ! Find max number of cores to fit in memory
+  ! -----------------------------------------
+
+  nproc_target = nproc
+  double precision :: rss
+  integer :: maxab
+  maxab = max(N_det_alpha_unique, N_det_beta_unique)+1
+
+  m=1
+  disk_based = .False.
+  call resident_memory(rss)
+  do
+    r1 = 8.d0 *                                   &! bytes
+         ( dble(sze)*(N_st_diag_in*itermax)          &! U
+         + 1.0d0*dble(sze*m)*(N_st_diag_in*itermax)  &! W
+         + 3.0d0*(N_st_diag_in*itermax)**2           &! h,y,s_tmp
+         + 1.d0*(N_st_diag_in*itermax)               &! lambda
+         + 1.d0*(N_st_diag_in)                       &! residual_norm
+                                                   ! In H_u_0_nstates_zmq
+         + 2.d0*(N_st_diag_in*N_det)                 &! u_t, v_t, on collector
+         + 2.d0*(N_st_diag_in*N_det)                 &! u_t, v_t, on slave
+         + 0.5d0*maxab                            &! idx0 in H_u_0_nstates_openmp_work_*
+         + nproc_target *                         &! In OMP section
+           ( 1.d0*(N_int*maxab)                   &! buffer
+           + 3.5d0*(maxab) )                      &! singles_a, singles_b, doubles, idx
+         ) / 1024.d0**3
+
+    if (nproc_target == 0) then
+      call check_mem(r1,irp_here)
+      nproc_target = 1
+      exit
+    endif
+
+    if (r1+rss < qp_max_mem) then
+      exit
+    endif
+
+    if (itermax > 4) then
+      itermax = itermax - 1
+    else if (m==1.and.disk_based_davidson) then
+      m=0
+      disk_based = .True.
+      itermax = 6
+    else
+      nproc_target = nproc_target - 1
+    endif
+
+  enddo
+  nthreads_davidson = nproc_target
+  TOUCH nthreads_davidson
+  call write_int(6,N_st,'Number of states')
+  call write_int(6,N_st_diag_in,'Number of states in diagonalization')
+  call write_int(6,sze,'Number of basis functions ')
+  call write_int(6,nproc_target,'Number of threads for diagonalization')
+  call write_double(6, r1, 'Memory(Gb)')
+  if (disk_based) then
+    print *, 'Using swap space to reduce RAM'
+  endif
+
+  !---------------
+
+  write(6,'(A)') ''
+  write_buffer = '====='
+  do i=1,N_st
+    write_buffer = trim(write_buffer)//' ================ ==========='
+  enddo
+  write(6,'(A)') write_buffer(1:6+41*N_st)
+  write_buffer = 'Iter'
+  do i=1,N_st
+    write_buffer = trim(write_buffer)//'       Energy        Residual '
+  enddo
+  write(6,'(A)') write_buffer(1:6+41*N_st)
+  write_buffer = '====='
+  do i=1,N_st
+    write_buffer = trim(write_buffer)//' ================ ==========='
+  enddo
+  write(6,'(A)') write_buffer(1:6+41*N_st)
+
+
+  allocate(W(sze,N_st_diag_in*itermax))
+
+  allocate(                                                          &
+      ! Large
+      U(sze,N_st_diag_in*itermax),                                      &
+
+      ! Small
+      h(N_st_diag_in*itermax,N_st_diag_in*itermax),                        &
+      y(N_st_diag_in*itermax,N_st_diag_in*itermax),                        &
+      s_tmp(N_st_diag_in*itermax,N_st_diag_in*itermax),                    &
+      residual_norm(N_st_diag_in),                                      &
+      lambda(N_st_diag_in*itermax))
+
+  h = 0.d0
+  U = 0.d0
+  y = 0.d0
+  s_tmp = 0.d0
+
+
+  ASSERT (N_st > 0)
+  ASSERT (N_st_diag_in >= N_st)
+  ASSERT (sze > 0)
+  ASSERT (Nint > 0)
+  ASSERT (Nint == N_int)
+
+  ! Davidson iterations
+  ! ===================
+
+  converged = .False.
+
+  do k=N_st+1,N_st_diag_in
+    do i=1,sze
+        call random_number(r1)
+        call random_number(r2)
+        r1 = dsqrt(-2.d0*dlog(r1))
+        r2 = dtwo_pi*r2
+        u_in(i,k) = r1*dcos(r2) * u_in(i,k-N_st)
+    enddo
+    u_in(k,k) = u_in(k,k) + 10.d0
+  enddo
+  do k=1,N_st_diag_in
+    call normalize(u_in(1,k),sze)
+  enddo
+
+  do k=1,N_st_diag_in
+    do i=1,sze
+      U(i,k) = u_in(i,k)
+    enddo
+  enddo
+
+
+  do while (.not.converged)
+    itertot = itertot+1
+    if (itertot == 8) then
+      exit
+    endif
+
+    do iter=1,itermax-1
+
+      shift  = N_st_diag_in*(iter-1)
+      shift2 = N_st_diag_in*iter
+
+      if ((iter > 1).or.(itertot == 1)) then
+        ! Compute |W_k> = \sum_i |i><i|H|u_k>
+        ! -----------------------------------
+        call hcalc(W(1,shift+1),U(1,shift+1),N_st_diag_in,sze)
+        ! Compute then the DIAGONAL PART OF THE DRESSING 
+        ! <i|W_k> += Dress_jj(i) * <i|U>
+        call dressing_diag_uv(W(1,shift+1),U(1,shift+1),Dress_jj,N_st_diag_in,sze)
+      else
+         ! Already computed in update below
+         continue
+      endif
+
+
+        if (N_st == 1) then
+
+          l = idx_dress
+          double precision :: f
+          f = inv_c_idx_dress
+          do istate=1,N_st_diag_in
+            do i=1,sze
+              W(i,shift+istate) += Dressing_vec(i,1) *f * U(l,shift+istate)
+              W(l,shift+istate) += Dressing_vec(i,1) *f * U(i,shift+istate)
+            enddo
+          enddo
+
+        else
+         print*,'dav_double_dressed routine not yet implemented for N_st > 1'
+!
+!          call dgemm('T','N', N_st, N_st_diag_in, sze, 1.d0, &
+!            psi_coef, size(psi_coef,1), &
+!            U(1,shift+1), size(U,1), 0.d0, s_tmp, size(s_tmp,1))
+!
+!          call dgemm('N','N', sze, N_st_diag_in, N_st, 1.0d0, &
+!            Dressing_vec, size(Dressing_vec,1), s_tmp, size(s_tmp,1), &
+!            1.d0, W(1,shift+1), size(W,1))
+!
+!
+!          call dgemm('T','N', N_st, N_st_diag_in, sze, 1.d0, &
+!            Dressing_vec, size(Dressing_vec,1), &
+!            U(1,shift+1), size(U,1), 0.d0, s_tmp, size(s_tmp,1))
+!
+!          call dgemm('N','N', sze, N_st_diag_in, N_st, 1.0d0, &
+!            psi_coef, size(psi_coef,1), s_tmp, size(s_tmp,1), &
+!            1.d0, W(1,shift+1), size(W,1))
+!
+        endif
+
+      ! Compute h_kl = <u_k | W_l> = <u_k| H |u_l>
+      ! -------------------------------------------
+
+      call dgemm('T','N', shift2, shift2, sze,                       &
+          1.d0, U, size(U,1), W, size(W,1),                          &
+          0.d0, h, size(h,1))
+      call dgemm('T','N', shift2, shift2, sze,                       &
+          1.d0, U, size(U,1), U, size(U,1),                          &
+          0.d0, s_tmp, size(s_tmp,1))
+
+      ! Diagonalize h
+      ! ---------------
+
+       integer :: lwork, info
+       double precision, allocatable :: work(:)
+
+       y = h
+       lwork = -1
+       allocate(work(1))
+       call dsygv(1,'V','U',shift2,y,size(y,1), &
+          s_tmp,size(s_tmp,1), lambda, work,lwork,info)
+       lwork = int(work(1))
+       deallocate(work)
+       allocate(work(lwork))
+       call dsygv(1,'V','U',shift2,y,size(y,1), &
+          s_tmp,size(s_tmp,1), lambda, work,lwork,info)
+       deallocate(work)
+       if (info /= 0) then
+         stop 'DSYGV Diagonalization failed'
+       endif
+
+      ! Compute Energy for each eigenvector
+      ! -----------------------------------
+
+      call dgemm('N','N',shift2,shift2,shift2,                       &
+          1.d0, h, size(h,1), y, size(y,1),                          &
+          0.d0, s_tmp, size(s_tmp,1))
+
+      call dgemm('T','N',shift2,shift2,shift2,                       &
+          1.d0, y, size(y,1), s_tmp, size(s_tmp,1),                  &
+          0.d0, h, size(h,1))
+
+      do k=1,shift2
+        lambda(k) = h(k,k)
+      enddo
+
+      if (state_following) then
+
+        overlap = -1.d0
+        do k=1,shift2
+          do i=1,shift2
+            overlap(k,i) = dabs(y(k,i))
+          enddo
+        enddo
+        do k=1,N_st
+          cmax = -1.d0
+          do i=1,N_st
+            if (overlap(i,k) > cmax) then
+              cmax = overlap(i,k)
+              order(k) = i
+            endif
+          enddo
+          do i=1,N_st_diag_in
+            overlap(order(k),i) = -1.d0
+          enddo
+        enddo
+        overlap = y
+        do k=1,N_st
+          l = order(k)
+          if (k /= l) then
+            y(1:shift2,k) = overlap(1:shift2,l)
+          endif
+        enddo
+        do k=1,N_st
+          overlap(k,1) = lambda(k)
+        enddo
+
+      endif
+
+
+      ! Express eigenvectors of h in the determinant basis
+      ! --------------------------------------------------
+
+      call dgemm('N','N', sze, N_st_diag_in, shift2,                    &
+          1.d0, U, size(U,1), y, size(y,1), 0.d0, U(1,shift2+1), size(U,1))
+      call dgemm('N','N', sze, N_st_diag_in, shift2,                    &
+          1.d0, W, size(W,1), y, size(y,1), 0.d0, W(1,shift2+1), size(W,1))
+
+      ! Compute residual vector and davidson step
+      ! -----------------------------------------
+
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i,k)
+      do k=1,N_st_diag_in
+        do i=1,sze
+          U(i,shift2+k) =  &
+            (lambda(k) * U(i,shift2+k) - W(i,shift2+k) )      &
+              /max(H_jj_tmp(i) - lambda (k),1.d-2)
+        enddo
+
+        if (k <= N_st) then
+          residual_norm(k) = u_dot_u(U(1,shift2+k),sze)
+          to_print(1,k) = lambda(k) 
+          to_print(2,k) = residual_norm(k)
+        endif
+      enddo
+      !$OMP END PARALLEL DO
+
+
+      if ((itertot>1).and.(iter == 1)) then
+        !don't print
+        continue
+      else
+        write(*,'(1X,I3,1X,100(1X,F16.10,1X,E11.3))') iter-1, to_print(1:2,1:N_st)
+      endif
+
+      ! Check convergence
+      if (iter > 1) then
+        converged = dabs(maxval(residual_norm(1:N_st))) < threshold_davidson
+      endif
+
+      do k=1,N_st
+        if (residual_norm(k) > 1.d8) then
+          print *, 'Davidson failed'
+          stop -1
+        endif
+      enddo
+      if (converged) then
+        exit
+      endif
+
+      logical, external :: qp_stop
+      if (qp_stop()) then
+        converged = .True.
+        exit
+      endif
+
+
+    enddo
+
+    ! Re-contract U and update W
+    ! --------------------------------
+
+    call dgemm('N','N', sze, N_st_diag_in, shift2, 1.d0,      &
+        W, size(W,1), y, size(y,1), 0.d0, u_in, size(u_in,1))
+    do k=1,N_st_diag_in
+      do i=1,sze
+        W(i,k) = u_in(i,k)
+      enddo
+    enddo
+
+    call dgemm('N','N', sze, N_st_diag_in, shift2, 1.d0,      &
+        U, size(U,1), y, size(y,1), 0.d0, u_in, size(u_in,1))
+
+    do k=1,N_st_diag_in
+      do i=1,sze
+        U(i,k) = u_in(i,k)
+      enddo
+    enddo
+
+  enddo
+
+
+  call nullify_small_elements(sze,N_st_diag_in,U,size(U,1),threshold_davidson_pt2)
+  do k=1,N_st_diag_in
+    do i=1,sze
+      u_in(i,k) = U(i,k)
+    enddo
+  enddo
+
+  do k=1,N_st_diag_in
+    energies(k) = lambda(k)
+  enddo
+  write_buffer = '======'
+  do i=1,N_st
+    write_buffer = trim(write_buffer)//' ================ ==========='
+  enddo
+  write(6,'(A)') trim(write_buffer)
+  write(6,'(A)') ''
+  call write_time(6)
+
+  deallocate(W)
+
+  deallocate (                                                       &
+      residual_norm,                                                 &
+      U, overlap,                                                    &
+      h, y, s_tmp,                                                   &
+      lambda                                                         &
+      )
+  FREE nthreads_davidson
+end
+
+
+subroutine dressing_diag_uv(v,u,dress_diag,N_st,sze)
+  implicit none
+  BEGIN_DOC
+  ! Routine that computes the diagonal part of the dressing 
+  !
+  ! v(i) += u(i) * dress_diag(i)
+  !
+  ! !!!!!!!! WARNING !!!!!!!! the vector v is not initialized 
+  !
+  ! !!!!!!!! SO MAKE SURE THERE ARE SOME MEANINGFUL VALUES IN THERE
+  END_DOC
+  integer, intent(in)              :: N_st,sze
+  double precision, intent(in)     :: u(sze,N_st),dress_diag(sze)
+  double precision, intent(inout)  :: v(sze,N_st)
+  integer :: i,istate
+  do istate = 1, N_st
+   do i = 1, sze
+     v(i,istate) += dress_diag(i) * u(i,istate)
+   enddo
+  enddo
+end
+
+
+
+
+
+
+
+
+
+
+
--- a/src/dav_general_mat/dav_ext_rout.irp.f
+++ b/src/dav_general_mat/dav_ext_rout.irp.f
@ -3,7 +3,7 @@ subroutine davidson_general_ext_rout(u_in,H_jj,energies,sze,N_st,N_st_diag_in,co
  use mmap_module
  implicit none
  BEGIN_DOC
-  ! Davidson diagonalization with specific diagonal elements of the H matrix
+  ! Generic Davidson diagonalization 
  !
  ! H_jj : specific diagonal H matrix elements to diagonalize de Davidson
  !
--- a/src/utils/sort.irp.f
+++ b/src/utils/sort.irp.f
@ -38,7 +38,15 @@ BEGIN_TEMPLATE
  $type,intent(inout)            :: x(isize)
  integer,intent(inout)          :: iorder(isize)
  integer, external              :: omp_get_num_threads
-  call rec_$X_quicksort(x,iorder,isize,1,isize,nproc)
+  if (omp_get_num_threads() == 1) then
+    !$OMP PARALLEL DEFAULT(SHARED)
+    !$OMP SINGLE
+    call rec_$X_quicksort(x,iorder,isize,1,isize,nproc)
+    !$OMP END SINGLE
+    !$OMP END PARALLEL
+  else
+    call rec_$X_quicksort(x,iorder,isize,1,isize,nproc)
+  endif
 end

 recursive subroutine rec_$X_quicksort(x, iorder, isize, first, last, level)
@ -81,11 +89,16 @@ BEGIN_TEMPLATE
    endif
  else
    if (first < i-1) then
+      !$OMP TASK DEFAULT(SHARED) FIRSTPRIVATE(isize,first,i,level)
      call rec_$X_quicksort(x, iorder, isize, first, i-1,level/2)
+      !$OMP END TASK
    endif
    if (j+1 < last) then
+      !$OMP TASK DEFAULT(SHARED) FIRSTPRIVATE(isize,last,j,level)
      call rec_$X_quicksort(x, iorder, isize, j+1, last,level/2)
+      !$OMP END TASK
    endif
+    !$OMP TASKWAIT
  endif
 end

@ -249,60 +262,7 @@ SUBST [ X, type ]
 i2 ; integer*2 ;;
 END_TEMPLATE

-
-!---------------------- INTEL
-IRP_IF INTEL
-
 BEGIN_TEMPLATE
- subroutine $Xsort(x,iorder,isize)
-  use intel
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize).
-  ! iorder in input should be (1,2,3,...,isize), and in output
-  ! contains the new order of the elements.
-  END_DOC
-  integer,intent(in)             :: isize
-  $type,intent(inout)            :: x(isize)
-  integer,intent(inout)          :: iorder(isize)
-  integer                        :: n
-  character, allocatable         :: tmp(:)
-  if (isize < 2) return
-  call ippsSortRadixIndexGetBufferSize(isize, $ippsz, n)
-  allocate(tmp(n))
-  call ippsSortRadixIndexAscend_$ityp(x, $n, iorder, isize, tmp)
-  deallocate(tmp)
-  iorder(1:isize) = iorder(1:isize)+1
-  call $Xset_order(x,iorder,isize)
- end
-
- subroutine $Xsort_noidx(x,isize)
-  use intel
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize).
-  ! iorder in input should be (1,2,3,...,isize), and in output
-  ! contains the new order of the elements.
-  END_DOC
-  integer,intent(in)             :: isize
-  $type,intent(inout)            :: x(isize)
-  integer                        :: n
-  character, allocatable         :: tmp(:)
-  if (isize < 2) return
-  call ippsSortRadixIndexGetBufferSize(isize, $ippsz, n)
-  allocate(tmp(n))
-  call ippsSortRadixAscend_$ityp_I(x, isize, tmp)
-  deallocate(tmp)
- end
-
-SUBST [ X, type, ityp, n, ippsz ]
-   ; real ; 32f ; 4 ; 13 ;;
- i ; integer ; 32s ; 4 ; 11 ;;
- i2 ; integer*2 ; 16s ; 2 ; 7 ;;
-END_TEMPLATE
-
-BEGIN_TEMPLATE
-
 subroutine $Xsort(x,iorder,isize)
  implicit none
  BEGIN_DOC
@ -329,12 +289,12 @@ BEGIN_TEMPLATE
  endif
 end subroutine $Xsort

-SUBST [ X, type ]
- d ; double precision ;;
+SUBST [ X, type, Y ]
+   ; real ; i ;;
+ d ; double precision ; i8 ;;
 END_TEMPLATE

 BEGIN_TEMPLATE
-
 subroutine $Xsort(x,iorder,isize)
  implicit none
  BEGIN_DOC
@ -346,112 +306,8 @@ BEGIN_TEMPLATE
  $type,intent(inout)            :: x(isize)
  integer,intent(inout)          :: iorder(isize)
  integer                        :: n
-  if (isize < 2) then
-    return
-  endif
-  call sorted_$Xnumber(x,isize,n)
-  if (isize == n) then
-    return
-  endif
-  if ( isize < 32) then
-    call insertion_$Xsort(x,iorder,isize)
-  else
-    call $Xradix_sort(x,iorder,isize,-1)
-  endif
- end subroutine $Xsort
-
-SUBST [ X, type ]
- i8 ; integer*8 ;;
-END_TEMPLATE
-
-!---------------------- END INTEL
-IRP_ELSE
-!---------------------- NON-INTEL
-BEGIN_TEMPLATE
-
- subroutine $Xsort_noidx(x,isize)
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize).
-  END_DOC
-  integer,intent(in)             :: isize
-  $type,intent(inout)            :: x(isize)
-  integer, allocatable           :: iorder(:)
-  integer                        :: i
-  allocate(iorder(isize))
-  do i=1,isize
-   iorder(i)=i
-  enddo
-  call $Xsort(x,iorder,isize)
-  deallocate(iorder)
- end subroutine $Xsort_noidx
-
-SUBST [ X, type ]
-   ; real ;;
- d ; double precision ;;
- i ; integer ;;
- i8 ; integer*8 ;;
- i2 ; integer*2 ;;
-END_TEMPLATE
-
-BEGIN_TEMPLATE
-
- subroutine $Xsort(x,iorder,isize)
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize).
-  ! iorder in input should be (1,2,3,...,isize), and in output
-  ! contains the new order of the elements.
-  END_DOC
-  integer,intent(in)             :: isize
-  $type,intent(inout)            :: x(isize)
-  integer,intent(inout)          :: iorder(isize)
-  integer                        :: n
-  if (isize < 2) then
-    return
-  endif
-!  call sorted_$Xnumber(x,isize,n)
-!  if (isize == n) then
-!    return
-!  endif
-  if ( isize < 32) then
-    call insertion_$Xsort(x,iorder,isize)
-  else
-!    call heap_$Xsort(x,iorder,isize)
-    call quick_$Xsort(x,iorder,isize)
-  endif
- end subroutine $Xsort
-
-SUBST [ X, type ]
-   ; real ;;
- d ; double precision ;;
-END_TEMPLATE
-
-BEGIN_TEMPLATE
-
- subroutine $Xsort(x,iorder,isize)
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize).
-  ! iorder in input should be (1,2,3,...,isize), and in output
-  ! contains the new order of the elements.
-  END_DOC
-  integer,intent(in)             :: isize
-  $type,intent(inout)            :: x(isize)
-  integer,intent(inout)          :: iorder(isize)
-  integer                        :: n
-  if (isize < 2) then
-    return
-  endif
-  call sorted_$Xnumber(x,isize,n)
-  if (isize == n) then
-    return
-  endif
-  if ( isize < 32) then
-    call insertion_$Xsort(x,iorder,isize)
-  else
-    call $Xradix_sort(x,iorder,isize,-1)
-  endif
+!  call $Xradix_sort(x,iorder,isize,-1)
+  call quick_$Xsort(x,iorder,isize)
 end subroutine $Xsort

 SUBST [ X, type ]
@ -460,11 +316,6 @@ SUBST [ X, type ]
 i2 ; integer*2 ;;
 END_TEMPLATE

-IRP_ENDIF
-!---------------------- END NON-INTEL
-
-
-
 BEGIN_TEMPLATE
 subroutine $Xset_order(x,iorder,isize)
  implicit none
@ -562,12 +413,10 @@ SUBST [ X, type ]
 i2; integer*2 ;;
 END_TEMPLATE

-
 BEGIN_TEMPLATE

-recursive subroutine $Xradix_sort$big(x,iorder,isize,iradix)
+ recursive subroutine $Xradix_sort$big(x,iorder,isize,iradix)
  implicit none
-
  BEGIN_DOC
  ! Sort integer array x(isize) using the radix sort algorithm.
  ! iorder in input should be (1,2,3,...,isize), and in output
@ -703,14 +552,24 @@ recursive subroutine $Xradix_sort$big(x,iorder,isize,iradix)
    endif


+!   !$OMP PARALLEL DEFAULT(SHARED) if (isize > 1000000)
+!   !$OMP SINGLE
    if (i3>1_$int_type) then
+!     !$OMP TASK FIRSTPRIVATE(iradix_new,i3) SHARED(x,iorder) if(i3 > 1000000)
      call $Xradix_sort$big(x,iorder,i3,iradix_new-1)
+!     !$OMP END TASK
    endif

    if (isize-i3>1_$int_type) then
+!     !$OMP TASK FIRSTPRIVATE(iradix_new,i3) SHARED(x,iorder) if(isize-i3 > 1000000)
      call $Xradix_sort$big(x(i3+1_$int_type),iorder(i3+1_$int_type),isize-i3,iradix_new-1)
+!     !$OMP END TASK
    endif

+!   !$OMP TASKWAIT
+!   !$OMP END SINGLE
+!   !$OMP END PARALLEL
+
    return
  endif

@ -765,11 +624,16 @@ recursive subroutine $Xradix_sort$big(x,iorder,isize,iradix)


  if (i1>1_$int_type) then
+    !$OMP TASK FIRSTPRIVATE(i0,iradix,i1) SHARED(x,iorder) if(i1 >1000000)
    call $Xradix_sort$big(x(i0+1_$int_type),iorder(i0+1_$int_type),i1,iradix-1)
+    !$OMP END TASK
  endif
  if (i0>1) then
+    !$OMP TASK FIRSTPRIVATE(i0,iradix) SHARED(x,iorder) if(i0 >1000000)
    call $Xradix_sort$big(x,iorder,i0,iradix-1)
+    !$OMP END TASK
  endif
+  !$OMP TASKWAIT

 end

@ -782,4 +646,3 @@ SUBST [ X, type, integer_size, is_big, big, int_type ]
 END_TEMPLATE


-