From 94a069c2d5be74f2af87f49105a22a6460dc3e3c Mon Sep 17 00:00:00 2001
From: vijay gopal chilkuri <vijay.gopal.c@gmail.com>
Date: Wed, 26 May 2021 20:11:39 +0530
Subject: [PATCH 1/9] Simplified calculation of n_CSF. #158

---
 src/csf/sigma_vector.irp.f | 33 +++++++++++++++------------------
 1 file changed, 15 insertions(+), 18 deletions(-)

diff --git a/src/csf/sigma_vector.irp.f b/src/csf/sigma_vector.irp.f
index 265d2384..1d6e663a 100644
--- a/src/csf/sigma_vector.irp.f
+++ b/src/csf/sigma_vector.irp.f
@@ -28,25 +28,22 @@
   detDimperBF = 0
   MS = elec_alpha_num-elec_beta_num
   ! number of cfgs = number of dets for 0 somos
-  n_CSF = cfg_seniority_index(0)-1
+  n_CSF = 0
   ncfgprev = cfg_seniority_index(0)
-  do i = 0-iand(MS,1)+2, NSOMOMax,2
-     if(cfg_seniority_index(i) .EQ. -1)then
-        ncfgpersomo = N_configuration + 1
-     else
-        ncfgpersomo = cfg_seniority_index(i)
-     endif
-  ncfg = ncfgpersomo - ncfgprev
-  !detDimperBF = max(1,nint((binom(i,(i+1)/2))))
-  if (i > 2) then
-    dimcsfpercfg = max(1,nint((binom(i-2,(i-2+1)/2)-binom(i-2,((i-2+1)/2)+1))))
-  else
-    dimcsfpercfg = 1
-  endif
-  n_CSF += ncfg * dimcsfpercfg
-  !if(cfg_seniority_index(i+2) == -1) EXIT
-  !if(detDimperBF > maxDetDimPerBF) maxDetDimPerBF = detDimperBF
-  ncfgprev = cfg_seniority_index(i)
+  do i = iand(MS,1), NSOMOMax-2,2
+    if(cfg_seniority_index(i+2) .EQ. -1) then
+      ncfgpersomo = N_configuration + 1
+    else
+      ncfgpersomo = cfg_seniority_index(i+2)
+    endif
+    ncfg = ncfgpersomo - ncfgprev
+    if(iand(MS,1) .EQ. 0) then
+      dimcsfpercfg = max(1,nint((binom(i,i/2)-binom(i,i/2+1))))
+    else
+      dimcsfpercfg = max(1,nint((binom(i,(i+1)/2)-binom(i,(i+3)/2))))
+    endif
+    n_CSF += ncfg * dimcsfpercfg
+    ncfgprev = cfg_seniority_index(i+2)
   enddo
   END_PROVIDER
 

From 32e2afca90dac6d12a6614fdbec39a03ad4c8f8a Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Mon, 31 May 2021 01:48:34 +0200
Subject: [PATCH 2/9] Using Intel IPP for sorting

---
 config/ifort.cfg                              |   4 +-
 config/ifort_avx.cfg                          |   4 +-
 config/ifort_avx_mpi.cfg                      |   4 +-
 config/ifort_debug.cfg                        |   4 +-
 config/ifort_mpi.cfg                          |   4 +-
 config/ifort_rome.cfg                         |   4 +-
 config/ifort_xHost.cfg                        |   6 +-
 src/ao_one_e_ints/pseudopot.f90               |  19 ++-
 src/cipsi/selection.irp.f                     |   7 +-
 src/csf/sigma_vector.irp.f                    | 106 ++++++------
 .../diagonalization_hcsf_dressed.irp.f        |   1 +
 src/utils/intel.f90                           |  70 ++++++++
 src/utils/sort.irp.f                          | 151 ++++++++++++++++--
 13 files changed, 297 insertions(+), 87 deletions(-)
 create mode 100644 src/utils/intel.f90

diff --git a/config/ifort.cfg b/config/ifort.cfg
index 866aae3d..714c4b10 100644
--- a/config/ifort.cfg
+++ b/config/ifort.cfg
@@ -7,9 +7,9 @@
 #
 [COMMON]
 FC           : ifort -fpic
-LAPACK_LIB   : -mkl=parallel
+LAPACK_LIB   : -mkl=parallel -lirc -lsvml -limf -lipps
 IRPF90       : irpf90
-IRPF90_FLAGS : --ninja --align=32 
+IRPF90_FLAGS : --ninja --align=32  -DINTEL
 
 # Global options
 ################
diff --git a/config/ifort_avx.cfg b/config/ifort_avx.cfg
index d689050e..a2cb4c8a 100644
--- a/config/ifort_avx.cfg
+++ b/config/ifort_avx.cfg
@@ -7,9 +7,9 @@
 #
 [COMMON]
 FC           : ifort -fpic
-LAPACK_LIB   : -mkl=parallel
+LAPACK_LIB   : -mkl=parallel -lirc -lsvml -limf -lipps
 IRPF90       : irpf90
-IRPF90_FLAGS : --ninja --align=32
+IRPF90_FLAGS : --ninja --align=32 -DINTEL
 
 # Global options
 ################
diff --git a/config/ifort_avx_mpi.cfg b/config/ifort_avx_mpi.cfg
index 9a839e66..f2bb8889 100644
--- a/config/ifort_avx_mpi.cfg
+++ b/config/ifort_avx_mpi.cfg
@@ -7,9 +7,9 @@
 #
 [COMMON]
 FC           : mpiifort -fpic
-LAPACK_LIB   : -mkl=parallel
+LAPACK_LIB   : -mkl=parallel -lirc -lsvml -limf -lipps
 IRPF90       : irpf90
-IRPF90_FLAGS : --ninja --align=32 -DMPI
+IRPF90_FLAGS : --ninja --align=32 -DMPI -DINTEL
 
 # Global options
 ################
diff --git a/config/ifort_debug.cfg b/config/ifort_debug.cfg
index f2fbf8ce..9b718380 100644
--- a/config/ifort_debug.cfg
+++ b/config/ifort_debug.cfg
@@ -7,9 +7,9 @@
 #
 [COMMON]
 FC           : ifort -fpic
-LAPACK_LIB   : -mkl=parallel
+LAPACK_LIB   : -mkl=parallel -lirc -lsvml -limf -lipps
 IRPF90       : irpf90
-IRPF90_FLAGS : --ninja --align=32  --assert
+IRPF90_FLAGS : --ninja --align=32  --assert -DINTEL
 
 # Global options
 ################
diff --git a/config/ifort_mpi.cfg b/config/ifort_mpi.cfg
index 57087847..e0d489a0 100644
--- a/config/ifort_mpi.cfg
+++ b/config/ifort_mpi.cfg
@@ -7,9 +7,9 @@
 #
 [COMMON]
 FC           : mpiifort -fpic
-LAPACK_LIB   : -mkl=parallel
+LAPACK_LIB   : -mkl=parallel -lirc -lsvml -limf -lipps
 IRPF90       : irpf90
-IRPF90_FLAGS : --ninja --align=32 -DMPI
+IRPF90_FLAGS : --ninja --align=32 -DMPI -DINTEL
 
 # Global options
 ################
diff --git a/config/ifort_rome.cfg b/config/ifort_rome.cfg
index a4bce680..3b70f98b 100644
--- a/config/ifort_rome.cfg
+++ b/config/ifort_rome.cfg
@@ -7,9 +7,9 @@
 #
 [COMMON]
 FC           : ifort -fpic
-LAPACK_LIB   : -mkl=parallel
+LAPACK_LIB   : -mkl=parallel -lirc -lsvml -limf -lipps
 IRPF90       : irpf90
-IRPF90_FLAGS : --ninja --align=32 
+IRPF90_FLAGS : --ninja --align=32  -DINTEL
 
 # Global options
 ################
diff --git a/config/ifort_xHost.cfg b/config/ifort_xHost.cfg
index 5d952e54..ddb4aa2d 100644
--- a/config/ifort_xHost.cfg
+++ b/config/ifort_xHost.cfg
@@ -6,10 +6,10 @@
 # --align=32                 : Align all provided arrays on a 32-byte boundary
 #
 [COMMON]
-FC           : ifort -fpic 
-LAPACK_LIB   : -mkl=parallel
+FC           : ifort -fpic
+LAPACK_LIB   : -mkl=parallel -lirc -lsvml -limf -lipps
 IRPF90       : irpf90
-IRPF90_FLAGS : --ninja --align=64
+IRPF90_FLAGS : --ninja --align=64 -DINTEL
 
 # Global options
 ################
diff --git a/src/ao_one_e_ints/pseudopot.f90 b/src/ao_one_e_ints/pseudopot.f90
index 6c8e5c83..48e3803e 100644
--- a/src/ao_one_e_ints/pseudopot.f90
+++ b/src/ao_one_e_ints/pseudopot.f90
@@ -96,8 +96,12 @@ end
 ! x=cos(theta)
 
         double precision function ylm_real(l,m,x,phi)
-        implicit double precision (a-h,o-z)
-        DIMENSION PM(0:100,0:100)
+        implicit none
+        integer :: MM, iabs_m, m, l
+        double precision :: pi, fourpi, factor, x, phi, coef
+        double precision :: xchap, ychap, zchap
+        double precision, external :: fact
+        double precision :: PM(0:100,0:100), plm
         MM=100
         pi=dacos(-1.d0)
         fourpi=4.d0*pi
@@ -1150,8 +1154,10 @@ end
 !       Output:  PM(m,n) --- Pmn(x)
 !       =====================================================
 !
-        IMPLICIT DOUBLE PRECISION (P,X)
-        DIMENSION PM(0:MM,0:(N+1))
+        implicit none
+!        IMPLICIT DOUBLE PRECISION (P,X)
+        integer :: MM, N, I, J, M
+        double precision :: PM(0:MM,0:(N+1)), X, XQ, XS
         DOUBLE PRECISION, SAVE ::  INVERSE(100) = 0.D0
         DOUBLE PRECISION       :: LS, II, JJ
         IF (INVERSE(1) == 0.d0) THEN
@@ -1202,8 +1208,9 @@ end
 !  P_l^|m|(cos(theta))  exp(i m phi)
 
       subroutine erreur(x,n,rmoy,error)
-      implicit double precision(a-h,o-z)
-      dimension x(n)
+      implicit none
+      integer :: i, n
+      double precision :: x(n), rn, rn1, error, rmoy
 ! calcul de la moyenne
       rmoy=0.d0
       do i=1,n
diff --git a/src/cipsi/selection.irp.f b/src/cipsi/selection.irp.f
index 11492652..eda9642c 100644
--- a/src/cipsi/selection.irp.f
+++ b/src/cipsi/selection.irp.f
@@ -253,12 +253,7 @@ subroutine select_singles_and_doubles(i_generator,hole_mask,particle_mask,fock_d
   deallocate(exc_degree)
   nmax=k-1
 
-  allocate(iorder(nmax))
-  do i=1,nmax
-    iorder(i) = i
-  enddo
-  call isort(indices,iorder,nmax)
-  deallocate(iorder)
+  call isort_noidx(indices,nmax)
 
   ! Start with 32 elements. Size will double along with the filtering.
   allocate(preinteresting(0:32), prefullinteresting(0:32),     &
diff --git a/src/csf/sigma_vector.irp.f b/src/csf/sigma_vector.irp.f
index 265d2384..0f3f093c 100644
--- a/src/csf/sigma_vector.irp.f
+++ b/src/csf/sigma_vector.irp.f
@@ -1,54 +1,62 @@
-  BEGIN_PROVIDER [ integer, NSOMOMax]
- &BEGIN_PROVIDER [ integer, NCSFMax]
- &BEGIN_PROVIDER [ integer*8, NMO]
- &BEGIN_PROVIDER [ integer, NBFMax]
- &BEGIN_PROVIDER [ integer, n_CSF]
- &BEGIN_PROVIDER [ integer, maxDetDimPerBF]
-  implicit none
-  BEGIN_DOC
-  ! Documentation for NSOMOMax
-  ! The maximum number of SOMOs for the current calculation.
-  ! required for the calculation of prototype arrays.
-  END_DOC
-  NSOMOMax = min(elec_num, cfg_nsomo_max + 2)
-  ! Note that here we need NSOMOMax + 2 sizes
-  NCSFMax = max(1,nint((binom(NSOMOMax,(NSOMOMax+1)/2)-binom(NSOMOMax,((NSOMOMax+1)/2)+1)))) ! TODO: NCSFs for MS=0
-  NBFMax = NCSFMax
-  maxDetDimPerBF = max(1,nint((binom(NSOMOMax,(NSOMOMax+1)/2))))
-  NMO = n_act_orb
-  integer i,j,k,l
-  integer startdet,enddet
-  integer ncfg,ncfgprev
-  integer NSOMO
-  integer dimcsfpercfg
-  integer detDimperBF
-  real*8 :: coeff
-  integer MS
-  integer ncfgpersomo
-  detDimperBF = 0
-  MS = elec_alpha_num-elec_beta_num
-  ! number of cfgs = number of dets for 0 somos
-  n_CSF = cfg_seniority_index(0)-1
-  ncfgprev = cfg_seniority_index(0)
-  do i = 0-iand(MS,1)+2, NSOMOMax,2
-     if(cfg_seniority_index(i) .EQ. -1)then
-        ncfgpersomo = N_configuration + 1
+ BEGIN_PROVIDER [ integer, NSOMOMax]
+&BEGIN_PROVIDER [ integer, NCSFMax]
+&BEGIN_PROVIDER [ integer*8, NMO]
+&BEGIN_PROVIDER [ integer, NBFMax]
+&BEGIN_PROVIDER [ integer, n_CSF]
+&BEGIN_PROVIDER [ integer, maxDetDimPerBF]
+   implicit none
+
+   BEGIN_DOC
+ ! The maximum number of SOMOs for the current calculation.
+ ! required for the calculation of prototype arrays.
+   END_DOC
+
+   integer                        :: i,j,k,l
+   integer                        :: startdet,enddet
+   integer                        :: ncfg,ncfgprev
+   integer                        :: NSOMO
+   integer                        :: dimcsfpercfg
+   integer                        :: detDimperBF
+   real*8                         :: coeff
+   integer                        :: MS
+   integer                        :: ncfgpersomo
+   double precision               :: bin_1, bin_2
+
+   NSOMOMax = min(elec_num, cfg_nsomo_max + 2)
+
+   bin_1 = binom(NSOMOMax, (NSOMOMax+1)/2)
+   bin_2 = binom(NSOMOMax,((NSOMOMax+1)/2)+1)
+
+   ! Note that here we need NSOMOMax + 2 sizes
+   NCSFMax = max(1,int(bin_1-bin_2))
+   NBFMax = NCSFMax
+   maxDetDimPerBF = max(1,nint((binom(NSOMOMax,(NSOMOMax+1)/2))))
+   NMO = n_act_orb
+
+   detDimperBF = 0
+   MS = elec_alpha_num-elec_beta_num
+
+   ! number of cfgs = number of dets for 0 somos
+   n_CSF = cfg_seniority_index(0)-1
+   ncfgprev = cfg_seniority_index(0)
+   do i = 0-iand(MS,1)+2, cfg_nsomo_max,2
+     if(cfg_seniority_index(i) == -1)then
+       ncfgpersomo = N_configuration + 1
      else
-        ncfgpersomo = cfg_seniority_index(i)
+       ncfgpersomo = cfg_seniority_index(i)
      endif
-  ncfg = ncfgpersomo - ncfgprev
-  !detDimperBF = max(1,nint((binom(i,(i+1)/2))))
-  if (i > 2) then
-    dimcsfpercfg = max(1,nint((binom(i-2,(i-2+1)/2)-binom(i-2,((i-2+1)/2)+1))))
-  else
-    dimcsfpercfg = 1
-  endif
-  n_CSF += ncfg * dimcsfpercfg
-  !if(cfg_seniority_index(i+2) == -1) EXIT
-  !if(detDimperBF > maxDetDimPerBF) maxDetDimPerBF = detDimperBF
-  ncfgprev = cfg_seniority_index(i)
-  enddo
-  END_PROVIDER
+     ncfg = ncfgpersomo - ncfgprev
+     if (i > 2) then
+       bin_1 = binom(i-2, (i-2+1)/2)
+       bin_2 = binom(i-2,((i-2+1)/2)+1)
+       dimcsfpercfg = max(1,int(bin_1-bin_2))
+     else
+       dimcsfpercfg = 1
+     endif
+     n_CSF += ncfg * dimcsfpercfg
+     ncfgprev = cfg_seniority_index(i)
+   enddo
+END_PROVIDER
 
 
 subroutine get_phase_qp_to_cfg(Ialpha, Ibeta, phaseout)
diff --git a/src/davidson/diagonalization_hcsf_dressed.irp.f b/src/davidson/diagonalization_hcsf_dressed.irp.f
index 2a83cc28..da23b919 100644
--- a/src/davidson/diagonalization_hcsf_dressed.irp.f
+++ b/src/davidson/diagonalization_hcsf_dressed.irp.f
@@ -197,6 +197,7 @@ subroutine davidson_diag_csf_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,sze_csf,N
   call write_int(6,N_st,'Number of states')
   call write_int(6,N_st_diag,'Number of states in diagonalization')
   call write_int(6,sze,'Number of determinants')
+  call write_int(6,sze_csf,'Number of CSFs')
   call write_int(6,nproc_target,'Number of threads for diagonalization')
   call write_double(6, r1, 'Memory(Gb)')
   if (disk_based) then
diff --git a/src/utils/intel.f90 b/src/utils/intel.f90
new file mode 100644
index 00000000..681adde9
--- /dev/null
+++ b/src/utils/intel.f90
@@ -0,0 +1,70 @@
+module intel
+  use, intrinsic :: iso_c_binding
+  interface
+    subroutine ippsSortRadixIndexGetBufferSize(len, dataType, pBufSize) bind(C, name='ippsSortRadixIndexGetBufferSize')
+     use  iso_c_binding
+     integer, intent(in), value :: len
+     integer, intent(in), value :: dataType
+     integer, intent(out) :: pBufSize
+    end
+  end interface
+  interface
+    subroutine ippsSortAscend_32s_I(pSrc, len) bind(C, name='ippsSortAscend_32s_I')
+     use  iso_c_binding
+     integer, intent(in), value :: len
+     integer, intent(inout) :: pSrc(len)
+    end
+  end interface
+  interface
+    subroutine ippsSortRadixAscend_32s_I(pSrc, len, pTmp) bind(C, name='ippsSortRadixAscend_32s_I')
+     use  iso_c_binding
+     integer, intent(in), value :: len
+     integer, intent(inout) :: pSrc(len)
+     integer, intent(inout) :: pTmp(len)
+    end
+  end interface
+  interface
+    subroutine ippsSortRadixIndexAscend_32s(pSrc, srcStrideBytes, pDstIndx, len, pTmpIndx) bind(C, name='ippsSortRadixIndexAscend_32s')
+     use  iso_c_binding
+     integer, intent(in), value :: len
+     integer, intent(inout) :: pSrc(len)
+     integer, intent(in), value :: srcStrideBytes
+     integer, intent(inout) :: pDstIndx(len)
+     integer, intent(inout) :: pTmpIndx(len)
+    end
+  end interface
+  interface
+    subroutine ippsSortRadixIndexAscend_32f(pSrc, srcStrideBytes, pDstIndx, len, pTmpIndx) bind(C,name='ippsSortRadixIndexAscend_32f')
+     use  iso_c_binding
+     integer, intent(in), value :: len
+     real   , intent(inout) :: pSrc(len)
+     integer, intent(in), value :: srcStrideBytes
+     integer, intent(inout) :: pDstIndx(len)
+     integer, intent(inout) :: pTmpIndx(len)
+    end
+  end interface
+  interface
+    subroutine ippsSortIndexAscend_32f_I(pSrcDst, pDstIndx, len) bind(C,name='ippsSortIndexAscend_32f_I')
+     use  iso_c_binding
+     real(4), intent(in) :: pSrcDst(*)
+     integer(4), intent(inout) :: pDstIndx(*)
+     integer(4), intent(in), value :: len
+    end
+  end interface
+  interface
+    subroutine ippsSortIndexAscend_32s_I(pSrcDst, pDstIndx, len) bind(C,name='ippsSortIndexAscend_32s_I')
+     use  iso_c_binding
+     integer(4), intent(in) :: pSrcDst(*)
+     integer(4), intent(inout) :: pDstIndx(*)
+     integer(4), intent(in), value :: len
+    end
+  end interface
+  interface
+    subroutine ippsSortIndexAscend_64f_I(pSrcDst, pDstIndx, len) bind(C,name='ippsSortIndexAscend_64f_I')
+     use  iso_c_binding
+     real(8), intent(in) :: pSrcDst(*)
+     integer(4), intent(inout) :: pDstIndx(*)
+     integer(4), intent(in), value :: len
+    end
+  end interface
+end module
diff --git a/src/utils/sort.irp.f b/src/utils/sort.irp.f
index 2a655eed..be58e7a5 100644
--- a/src/utils/sort.irp.f
+++ b/src/utils/sort.irp.f
@@ -57,7 +57,7 @@ BEGIN_TEMPLATE
   $type                          :: c, tmp
   integer                        :: itmp
   integer                        :: i, j
- 
+
   if(isize<2)return
 
   c = x( shiftr(first+last,1) )
@@ -262,6 +262,104 @@ SUBST [ X, type ]
  i2 ; integer*2 ;;
 END_TEMPLATE
 
+IRP_IF INTEL
+
+ subroutine sort(x,iorder,isize)
+  use intel
+  implicit none
+  BEGIN_DOC
+  ! Sort array x(isize).
+  ! iorder in input should be (1,2,3,...,isize), and in output
+  ! contains the new order of the elements.
+  END_DOC
+  integer,intent(in)             :: isize
+  real ,intent(inout)            :: x(isize)
+  integer,intent(inout)          :: iorder(isize)
+  integer                        :: n
+  call ippsSortIndexAscend_32f_I(x, iorder, isize)
+  iorder(:) = iorder(:)+1
+ end subroutine sort
+
+ subroutine dsort(x,iorder,isize)
+  use intel
+  implicit none
+  BEGIN_DOC
+  ! Sort array x(isize).
+  ! iorder in input should be (1,2,3,...,isize), and in output
+  ! contains the new order of the elements.
+  END_DOC
+  integer,intent(in)             :: isize
+  real(8) ,intent(inout)         :: x(isize)
+  integer,intent(inout)          :: iorder(isize)
+  integer                        :: n
+  call ippsSortIndexAscend_64f_I(x, iorder, isize)
+  iorder(:) = iorder(:)+1
+ end subroutine dsort
+
+ subroutine isort(x,iorder,isize)
+  use intel
+  implicit none
+  BEGIN_DOC
+  ! Sort array x(isize).
+  ! iorder in input should be (1,2,3,...,isize), and in output
+  ! contains the new order of the elements.
+  END_DOC
+  integer,intent(in)             :: isize
+  integer ,intent(inout)         :: x(isize)
+  integer,intent(inout)          :: iorder(isize)
+  integer                        :: n
+  integer, allocatable :: iorder1(:)
+  allocate(iorder1(isize*2))
+  n=4
+  call ippsSortRadixIndexAscend_32s(x, n, iorder, isize, iorder1)
+  iorder(1:isize) = iorder(1:isize)+1
+  deallocate(iorder1)
+  call iset_order(x,iorder,isize)
+ end subroutine isort
+
+ subroutine isort_noidx(x,isize)
+  use intel
+  implicit none
+  BEGIN_DOC
+  ! Sort array x(isize).
+  ! iorder in input should be (1,2,3,...,isize), and in output
+  ! contains the new order of the elements.
+  END_DOC
+  integer,intent(in)             :: isize
+  integer ,intent(inout)         :: x(isize)
+  integer, allocatable :: iorder1(:)
+  integer :: n
+  call ippsSortRadixIndexGetBufferSize(isize, 11, n)
+  n = n/4
+  allocate(iorder1(n))
+  call ippsSortRadixAscend_32s_I(x, isize, iorder1)
+  deallocate(iorder1)
+ end subroutine isort_noidx
+
+
+BEGIN_TEMPLATE
+ subroutine $Xsort(x,iorder,isize)
+  implicit none
+  BEGIN_DOC
+  ! Sort array x(isize).
+  ! iorder in input should be (1,2,3,...,isize), and in output
+  ! contains the new order of the elements.
+  END_DOC
+  integer,intent(in)             :: isize
+  $type,intent(inout)            :: x(isize)
+  integer,intent(inout)          :: iorder(isize)
+  integer                        :: n
+!  call $Xradix_sort(x,iorder,isize,-1)
+  call quick_$Xsort(x,iorder,isize)
+ end subroutine $Xsort
+
+SUBST [ X, type ]
+ i8 ; integer*8 ;;
+ i2 ; integer*2 ;;
+END_TEMPLATE
+
+IRP_ELSE
+
 BEGIN_TEMPLATE
  subroutine $Xsort(x,iorder,isize)
   implicit none
@@ -289,9 +387,9 @@ BEGIN_TEMPLATE
   endif
  end subroutine $Xsort
 
-SUBST [ X, type, Y ]
-   ; real ; i ;;
- d ; double precision ; i8 ;;
+SUBST [ X, type ]
+   ; real ;;
+ d ; double precision ;;
 END_TEMPLATE
 
 BEGIN_TEMPLATE
@@ -316,6 +414,22 @@ SUBST [ X, type ]
  i2 ; integer*2 ;;
 END_TEMPLATE
 
+ subroutine isort_noidx(x,isize)
+  implicit none
+  BEGIN_DOC
+  ! Sort array x(isize).
+  END_DOC
+  integer,intent(in)             :: isize
+  $type,intent(inout)            :: x(isize)
+  integer, allocatable :: iorder
+  allocate(iorder)
+  iorder=0
+  call $Xradix_sort(x,iorder,isize,-1)
+  deallocate(iorder)
+ end subroutine $Xsort
+IRP_ENDIF
+
+
 BEGIN_TEMPLATE
  subroutine $Xset_order(x,iorder,isize)
   implicit none
@@ -413,10 +527,15 @@ SUBST [ X, type ]
  i2; integer*2 ;;
 END_TEMPLATE
 
+
 BEGIN_TEMPLATE
 
- recursive subroutine $Xradix_sort$big(x,iorder,isize,iradix)
+recursive subroutine $Xradix_sort$big(x,iorder,isize,iradix)
+IRP_IF INTEL
+  use intel
+IRP_ENDIF
   implicit none
+
   BEGIN_DOC
   ! Sort integer array x(isize) using the radix sort algorithm.
   ! iorder in input should be (1,2,3,...,isize), and in output
@@ -448,6 +567,15 @@ BEGIN_TEMPLATE
       stop
     endif
 
+    IRP_IF INTEL
+    if ( ($type == 4).and.($integer_size == 32).and.($is_big == .False.) ) then
+        $intel
+        iorder(:) = iorder(:)+1
+        return
+    endif
+    IRP_ENDIF
+
+
     i1=1_$int_type
     i2=1_$int_type
     do i=1_$int_type,isize
@@ -637,12 +765,13 @@ BEGIN_TEMPLATE
 
  end
 
-SUBST [ X, type, integer_size, is_big, big, int_type ]
- i  ; 4 ; 32 ; .False. ;      ; 4 ;;
- i8 ; 8 ; 64 ; .False. ;      ; 4 ;;
- i2 ; 2 ; 16 ; .False. ;      ; 4 ;;
- i  ; 4 ; 32 ; .True.  ; _big ; 8 ;;
- i8 ; 8 ; 64 ; .True.  ; _big ; 8 ;;
+SUBST [ X, type, integer_size, is_big, big, int_type, intel ]
+ i  ; 4 ; 32 ; .False. ;      ; 4 ; call ippsSortRadixIndexAscend_32s(x, 4, iorder, isize, iorder1) ;;
+ i8 ; 8 ; 64 ; .False. ;      ; 4 ; ;;
+ i2 ; 2 ; 16 ; .False. ;      ; 4 ; ;;
+ i  ; 4 ; 32 ; .True.  ; _big ; 8 ; ;;
+ i8 ; 8 ; 64 ; .True.  ; _big ; 8 ; ;;
 END_TEMPLATE
 
 
+

From c4a91c78c9011de18ae5d866b70693981b5485ed Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Mon, 31 May 2021 08:28:00 +0200
Subject: [PATCH 3/9] Fix gfortrab compilation

---
 src/utils/sort.irp.f | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/utils/sort.irp.f b/src/utils/sort.irp.f
index be58e7a5..b4e30db4 100644
--- a/src/utils/sort.irp.f
+++ b/src/utils/sort.irp.f
@@ -420,13 +420,16 @@ END_TEMPLATE
   ! Sort array x(isize).
   END_DOC
   integer,intent(in)             :: isize
-  $type,intent(inout)            :: x(isize)
-  integer, allocatable :: iorder
-  allocate(iorder)
-  iorder=0
-  call $Xradix_sort(x,iorder,isize,-1)
+  integer,intent(inout)            :: x(isize)
+  integer, allocatable :: iorder(:)
+  integer :: i
+  allocate(iorder(isize))
+  do i=1,isize
+   iorder(i)=i
+  enddo
+  call iradix_sort(x,iorder,isize,-1)
   deallocate(iorder)
- end subroutine $Xsort
+ end subroutine isort_noidx
 IRP_ENDIF
 
 

From 2c3a25e20a6e924de0ac63ba6a353c1db01455b9 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Mon, 31 May 2021 10:45:37 +0200
Subject: [PATCH 4/9] Cleaned sorting

---
 src/utils/intel.f90  | 125 ++++++++++++++++++++++++++---
 src/utils/sort.irp.f | 187 ++++++++++++++-----------------------------
 2 files changed, 173 insertions(+), 139 deletions(-)

diff --git a/src/utils/intel.f90 b/src/utils/intel.f90
index 681adde9..4c18af8d 100644
--- a/src/utils/intel.f90
+++ b/src/utils/intel.f90
@@ -1,13 +1,5 @@
 module intel
   use, intrinsic :: iso_c_binding
-  interface
-    subroutine ippsSortRadixIndexGetBufferSize(len, dataType, pBufSize) bind(C, name='ippsSortRadixIndexGetBufferSize')
-     use  iso_c_binding
-     integer, intent(in), value :: len
-     integer, intent(in), value :: dataType
-     integer, intent(out) :: pBufSize
-    end
-  end interface
   interface
     subroutine ippsSortAscend_32s_I(pSrc, len) bind(C, name='ippsSortAscend_32s_I')
      use  iso_c_binding
@@ -15,12 +7,86 @@ module intel
      integer, intent(inout) :: pSrc(len)
     end
   end interface
+  interface
+    subroutine ippsSortAscend_32f_I(pSrc, len) bind(C, name='ippsSortAscend_32f_I')
+     use  iso_c_binding
+     integer, intent(in), value :: len
+     real, intent(inout) :: pSrc(len)
+    end
+  end interface
+  interface
+    subroutine ippsSortAscend_64s_I(pSrc, len) bind(C, name='ippsSortAscend_64s_I')
+     use  iso_c_binding
+     integer*8, intent(in), value :: len
+     integer, intent(inout) :: pSrc(len)
+    end
+  end interface
+  interface
+    subroutine ippsSortAscend_64f_I(pSrc, len) bind(C, name='ippsSortAscend_64f_I')
+     use  iso_c_binding
+     double precision, intent(in), value :: len
+     real, intent(inout) :: pSrc(len)
+    end
+  end interface
+
+  interface
+    subroutine ippsSortRadixIndexGetBufferSize(len, dataType, pBufSize) bind(C, name='ippsSortRadixIndexGetBufferSize')
+     use  iso_c_binding
+     integer, intent(in), value :: len
+     integer, intent(in), value :: dataType
+     integer, intent(out) :: pBufSize
+    end
+  end interface
+
+  interface
+    subroutine ippsSortRadixAscend_16s_I(pSrc, len, pTmp) bind(C, name='ippsSortRadixAscend_16s_I')
+     use  iso_c_binding
+     integer, intent(in), value :: len
+     integer*2, intent(inout) :: pSrc(len)
+     character, intent(inout) :: pTmp(len)
+    end
+  end interface
   interface
     subroutine ippsSortRadixAscend_32s_I(pSrc, len, pTmp) bind(C, name='ippsSortRadixAscend_32s_I')
      use  iso_c_binding
      integer, intent(in), value :: len
      integer, intent(inout) :: pSrc(len)
-     integer, intent(inout) :: pTmp(len)
+     character, intent(inout) :: pTmp(len)
+    end
+  end interface
+  interface
+    subroutine ippsSortRadixAscend_32f_I(pSrc, len, pTmp) bind(C, name='ippsSortRadixAscend_32f_I')
+     use  iso_c_binding
+     integer, intent(in), value :: len
+     real, intent(inout) :: pSrc(len)
+     character, intent(inout) :: pTmp(len)
+    end
+  end interface
+  interface
+    subroutine ippsSortRadixAscend_64s_I(pSrc, len, pTmp) bind(C, name='ippsSortRadixAscend_64s_I')
+     use  iso_c_binding
+     integer, intent(in), value :: len
+     integer*8, intent(inout) :: pSrc(len)
+     character, intent(inout) :: pTmp(len)
+    end
+  end interface
+  interface
+    subroutine ippsSortRadixAscend_64f_I(pSrc, len, pTmp) bind(C, name='ippsSortRadixAscend_64f_I')
+     use  iso_c_binding
+     integer, intent(in), value :: len
+     double precision, intent(inout) :: pSrc(len)
+     character, intent(inout) :: pTmp(len)
+    end
+  end interface
+
+  interface
+    subroutine ippsSortRadixIndexAscend_16s(pSrc, srcStrideBytes, pDstIndx, len, pTmpIndx) bind(C, name='ippsSortRadixIndexAscend_16s')
+     use  iso_c_binding
+     integer, intent(in), value :: len
+     integer*2, intent(inout) :: pSrc(len)
+     integer, intent(in), value :: srcStrideBytes
+     integer, intent(inout) :: pDstIndx(len)
+     character, intent(inout) :: pTmpIndx(len)
     end
   end interface
   interface
@@ -30,7 +96,7 @@ module intel
      integer, intent(inout) :: pSrc(len)
      integer, intent(in), value :: srcStrideBytes
      integer, intent(inout) :: pDstIndx(len)
-     integer, intent(inout) :: pTmpIndx(len)
+     character, intent(inout) :: pTmpIndx(len)
     end
   end interface
   interface
@@ -40,9 +106,30 @@ module intel
      real   , intent(inout) :: pSrc(len)
      integer, intent(in), value :: srcStrideBytes
      integer, intent(inout) :: pDstIndx(len)
-     integer, intent(inout) :: pTmpIndx(len)
+     character, intent(inout) :: pTmpIndx(len)
     end
   end interface
+  interface
+    subroutine ippsSortRadixIndexAscend_64s(pSrc, srcStrideBytes, pDstIndx, len, pTmpIndx) bind(C, name='ippsSortRadixIndexAscend_64s')
+     use  iso_c_binding
+     integer, intent(in), value :: len
+     integer*8, intent(inout) :: pSrc(len)
+     integer, intent(in), value :: srcStrideBytes
+     integer, intent(inout) :: pDstIndx(len)
+     character, intent(inout) :: pTmpIndx(len)
+    end
+  end interface
+  interface
+    subroutine ippsSortRadixIndexAscend_64f(pSrc, srcStrideBytes, pDstIndx, len, pTmpIndx) bind(C,name='ippsSortRadixIndexAscend_64f')
+     use  iso_c_binding
+     integer, intent(in), value :: len
+     real*8 , intent(inout) :: pSrc(len)
+     integer, intent(in), value :: srcStrideBytes
+     integer, intent(inout) :: pDstIndx(len)
+     character, intent(inout) :: pTmpIndx(len)
+    end
+  end interface
+
   interface
     subroutine ippsSortIndexAscend_32f_I(pSrcDst, pDstIndx, len) bind(C,name='ippsSortIndexAscend_32f_I')
      use  iso_c_binding
@@ -67,4 +154,20 @@ module intel
      integer(4), intent(in), value :: len
     end
   end interface
+  interface
+    subroutine ippsSortIndexAscend_64s_I(pSrcDst, pDstIndx, len) bind(C,name='ippsSortIndexAscend_64s_I')
+     use  iso_c_binding
+     integer(8), intent(in) :: pSrcDst(*)
+     integer(4), intent(inout) :: pDstIndx(*)
+     integer(4), intent(in), value :: len
+    end
+  end interface
+  interface
+    subroutine ippsSortIndexAscend_16s_I(pSrcDst, pDstIndx, len) bind(C,name='ippsSortIndexAscend_16s_I')
+     use  iso_c_binding
+     integer(2), intent(in) :: pSrcDst(*)
+     integer(4), intent(inout) :: pDstIndx(*)
+     integer(4), intent(in), value :: len
+    end
+  end interface
 end module
diff --git a/src/utils/sort.irp.f b/src/utils/sort.irp.f
index b4e30db4..d40b7d1d 100644
--- a/src/utils/sort.irp.f
+++ b/src/utils/sort.irp.f
@@ -262,83 +262,13 @@ SUBST [ X, type ]
  i2 ; integer*2 ;;
 END_TEMPLATE
 
+
+!---------------------- INTEL
 IRP_IF INTEL
 
- subroutine sort(x,iorder,isize)
-  use intel
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize).
-  ! iorder in input should be (1,2,3,...,isize), and in output
-  ! contains the new order of the elements.
-  END_DOC
-  integer,intent(in)             :: isize
-  real ,intent(inout)            :: x(isize)
-  integer,intent(inout)          :: iorder(isize)
-  integer                        :: n
-  call ippsSortIndexAscend_32f_I(x, iorder, isize)
-  iorder(:) = iorder(:)+1
- end subroutine sort
-
- subroutine dsort(x,iorder,isize)
-  use intel
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize).
-  ! iorder in input should be (1,2,3,...,isize), and in output
-  ! contains the new order of the elements.
-  END_DOC
-  integer,intent(in)             :: isize
-  real(8) ,intent(inout)         :: x(isize)
-  integer,intent(inout)          :: iorder(isize)
-  integer                        :: n
-  call ippsSortIndexAscend_64f_I(x, iorder, isize)
-  iorder(:) = iorder(:)+1
- end subroutine dsort
-
- subroutine isort(x,iorder,isize)
-  use intel
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize).
-  ! iorder in input should be (1,2,3,...,isize), and in output
-  ! contains the new order of the elements.
-  END_DOC
-  integer,intent(in)             :: isize
-  integer ,intent(inout)         :: x(isize)
-  integer,intent(inout)          :: iorder(isize)
-  integer                        :: n
-  integer, allocatable :: iorder1(:)
-  allocate(iorder1(isize*2))
-  n=4
-  call ippsSortRadixIndexAscend_32s(x, n, iorder, isize, iorder1)
-  iorder(1:isize) = iorder(1:isize)+1
-  deallocate(iorder1)
-  call iset_order(x,iorder,isize)
- end subroutine isort
-
- subroutine isort_noidx(x,isize)
-  use intel
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize).
-  ! iorder in input should be (1,2,3,...,isize), and in output
-  ! contains the new order of the elements.
-  END_DOC
-  integer,intent(in)             :: isize
-  integer ,intent(inout)         :: x(isize)
-  integer, allocatable :: iorder1(:)
-  integer :: n
-  call ippsSortRadixIndexGetBufferSize(isize, 11, n)
-  n = n/4
-  allocate(iorder1(n))
-  call ippsSortRadixAscend_32s_I(x, isize, iorder1)
-  deallocate(iorder1)
- end subroutine isort_noidx
-
-
 BEGIN_TEMPLATE
  subroutine $Xsort(x,iorder,isize)
+  use intel
   implicit none
   BEGIN_DOC
   ! Sort array x(isize).
@@ -349,17 +279,46 @@ BEGIN_TEMPLATE
   $type,intent(inout)            :: x(isize)
   integer,intent(inout)          :: iorder(isize)
   integer                        :: n
-!  call $Xradix_sort(x,iorder,isize,-1)
-  call quick_$Xsort(x,iorder,isize)
- end subroutine $Xsort
+  character, allocatable         :: tmp(:)
+  if (isize < 2) return
+  call ippsSortRadixIndexGetBufferSize(isize, $ippsz, n)
+  allocate(tmp(n))
+  call ippsSortRadixIndexAscend_$ityp(x, $n, iorder, isize, tmp)
+  deallocate(tmp)
+  iorder(1:isize) = iorder(1:isize)+1
+  call $Xset_order(x,iorder,isize)
+ end
 
-SUBST [ X, type ]
- i8 ; integer*8 ;;
- i2 ; integer*2 ;;
+ subroutine $Xsort_noidx(x,isize)
+  use intel
+  implicit none
+  BEGIN_DOC
+  ! Sort array x(isize).
+  ! iorder in input should be (1,2,3,...,isize), and in output
+  ! contains the new order of the elements.
+  END_DOC
+  integer,intent(in)             :: isize
+  $type,intent(inout)            :: x(isize)
+  integer                        :: n
+  character, allocatable         :: tmp(:)
+  if (isize < 2) return
+  call ippsSortRadixIndexGetBufferSize(isize, $ippsz, n)
+  allocate(tmp(n))
+  call ippsSortRadixAscend_$ityp_I(x, isize, tmp)
+  deallocate(tmp)
+ end
+
+SUBST [ X, type, ityp, n, ippsz ]
+   ; real ; 32f ; 4 ; 13 ;;
+ d ; double precision ; 64f ; 8 ; 19;;
+ i ; integer ; 32s ; 4 ; 11 ;;
+ i8 ; integer*8 ; 64s ; 8 ; 17;;
+ i2 ; integer*2 ; 16s ; 2 ; 7 ;;
 END_TEMPLATE
 
+!---------------------- END INTEL
 IRP_ELSE
-
+!---------------------- NON-INTEL
 BEGIN_TEMPLATE
  subroutine $Xsort(x,iorder,isize)
   implicit none
@@ -387,50 +346,34 @@ BEGIN_TEMPLATE
   endif
  end subroutine $Xsort
 
-SUBST [ X, type ]
-   ; real ;;
- d ; double precision ;;
-END_TEMPLATE
-
-BEGIN_TEMPLATE
- subroutine $Xsort(x,iorder,isize)
+ subroutine $Xsort_noidx(x,isize)
   implicit none
   BEGIN_DOC
   ! Sort array x(isize).
-  ! iorder in input should be (1,2,3,...,isize), and in output
-  ! contains the new order of the elements.
   END_DOC
   integer,intent(in)             :: isize
   $type,intent(inout)            :: x(isize)
-  integer,intent(inout)          :: iorder(isize)
-  integer                        :: n
-!  call $Xradix_sort(x,iorder,isize,-1)
-  call quick_$Xsort(x,iorder,isize)
- end subroutine $Xsort
+  integer, allocatable           :: iorder(:)
+  integer                        :: i
+  allocate(iorder(isize))
+  do i=1,isize
+   iorder(i)=i
+  enddo
+  call $Xsort(x,iorder,isize)
+  deallocate(iorder)
+ end subroutine $Xsort_noidx
 
 SUBST [ X, type ]
+   ; real ;;
+ d ; double precision ;;
  i ; integer ;;
  i8 ; integer*8 ;;
  i2 ; integer*2 ;;
 END_TEMPLATE
 
- subroutine isort_noidx(x,isize)
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize).
-  END_DOC
-  integer,intent(in)             :: isize
-  integer,intent(inout)            :: x(isize)
-  integer, allocatable :: iorder(:)
-  integer :: i
-  allocate(iorder(isize))
-  do i=1,isize
-   iorder(i)=i
-  enddo
-  call iradix_sort(x,iorder,isize,-1)
-  deallocate(iorder)
- end subroutine isort_noidx
 IRP_ENDIF
+!---------------------- END NON-INTEL
+
 
 
 BEGIN_TEMPLATE
@@ -534,9 +477,6 @@ END_TEMPLATE
 BEGIN_TEMPLATE
 
 recursive subroutine $Xradix_sort$big(x,iorder,isize,iradix)
-IRP_IF INTEL
-  use intel
-IRP_ENDIF
   implicit none
 
   BEGIN_DOC
@@ -570,15 +510,6 @@ IRP_ENDIF
       stop
     endif
 
-    IRP_IF INTEL
-    if ( ($type == 4).and.($integer_size == 32).and.($is_big == .False.) ) then
-        $intel
-        iorder(:) = iorder(:)+1
-        return
-    endif
-    IRP_ENDIF
-
-
     i1=1_$int_type
     i2=1_$int_type
     do i=1_$int_type,isize
@@ -768,12 +699,12 @@ IRP_ENDIF
 
  end
 
-SUBST [ X, type, integer_size, is_big, big, int_type, intel ]
- i  ; 4 ; 32 ; .False. ;      ; 4 ; call ippsSortRadixIndexAscend_32s(x, 4, iorder, isize, iorder1) ;;
- i8 ; 8 ; 64 ; .False. ;      ; 4 ; ;;
- i2 ; 2 ; 16 ; .False. ;      ; 4 ; ;;
- i  ; 4 ; 32 ; .True.  ; _big ; 8 ; ;;
- i8 ; 8 ; 64 ; .True.  ; _big ; 8 ; ;;
+SUBST [ X, type, integer_size, is_big, big, int_type ]
+ i  ; 4 ; 32 ; .False. ;      ; 4 ;;
+ i8 ; 8 ; 64 ; .False. ;      ; 4 ;;
+ i2 ; 2 ; 16 ; .False. ;      ; 4 ;;
+ i  ; 4 ; 32 ; .True.  ; _big ; 8 ;;
+ i8 ; 8 ; 64 ; .True.  ; _big ; 8 ;;
 END_TEMPLATE
 
 

From 3837dea58c0a5cc30996b6fec3ecc4824d774db7 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Mon, 31 May 2021 11:47:34 +0200
Subject: [PATCH 5/9] Improving sort

---
 src/utils/sort.irp.f | 75 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 58 insertions(+), 17 deletions(-)

diff --git a/src/utils/sort.irp.f b/src/utils/sort.irp.f
index d40b7d1d..f3879d4a 100644
--- a/src/utils/sort.irp.f
+++ b/src/utils/sort.irp.f
@@ -320,6 +320,34 @@ END_TEMPLATE
 IRP_ELSE
 !---------------------- NON-INTEL
 BEGIN_TEMPLATE
+
+ subroutine $Xsort_noidx(x,isize)
+  implicit none
+  BEGIN_DOC
+  ! Sort array x(isize).
+  END_DOC
+  integer,intent(in)             :: isize
+  $type,intent(inout)            :: x(isize)
+  integer, allocatable           :: iorder(:)
+  integer                        :: i
+  allocate(iorder(isize))
+  do i=1,isize
+   iorder(i)=i
+  enddo
+  call $Xsort(x,iorder,isize)
+  deallocate(iorder)
+ end subroutine $Xsort_noidx
+
+SUBST [ X, type ]
+   ; real ;;
+ d ; double precision ;;
+ i ; integer ;;
+ i8 ; integer*8 ;;
+ i2 ; integer*2 ;;
+END_TEMPLATE
+
+BEGIN_TEMPLATE
+
  subroutine $Xsort(x,iorder,isize)
   implicit none
   BEGIN_DOC
@@ -346,26 +374,39 @@ BEGIN_TEMPLATE
   endif
  end subroutine $Xsort
 
- subroutine $Xsort_noidx(x,isize)
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize).
-  END_DOC
-  integer,intent(in)             :: isize
-  $type,intent(inout)            :: x(isize)
-  integer, allocatable           :: iorder(:)
-  integer                        :: i
-  allocate(iorder(isize))
-  do i=1,isize
-   iorder(i)=i
-  enddo
-  call $Xsort(x,iorder,isize)
-  deallocate(iorder)
- end subroutine $Xsort_noidx
-
 SUBST [ X, type ]
    ; real ;;
  d ; double precision ;;
+END_TEMPLATE
+
+BEGIN_TEMPLATE
+
+ subroutine $Xsort(x,iorder,isize)
+  implicit none
+  BEGIN_DOC
+  ! Sort array x(isize).
+  ! iorder in input should be (1,2,3,...,isize), and in output
+  ! contains the new order of the elements.
+  END_DOC
+  integer,intent(in)             :: isize
+  $type,intent(inout)            :: x(isize)
+  integer,intent(inout)          :: iorder(isize)
+  integer                        :: n
+  if (isize < 2) then
+    return
+  endif
+  call sorted_$Xnumber(x,isize,n)
+  if (isize == n) then
+    return
+  endif
+  if ( isize < 32) then
+    call insertion_$Xsort(x,iorder,isize)
+  else
+    call $Xradix_sort(x,iorder,isize,-1)
+  endif
+ end subroutine $Xsort
+
+SUBST [ X, type ]
  i ; integer ;;
  i8 ; integer*8 ;;
  i2 ; integer*2 ;;

From d45f6091daf107a2a18925c9b75d9a8ed4963eee Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Mon, 31 May 2021 12:16:00 +0200
Subject: [PATCH 6/9] Sorting compatible with old IPP

---
 config/ifort_rome.cfg |  4 +--
 src/utils/intel.f90   |  8 +++---
 src/utils/sort.irp.f  | 65 +++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 69 insertions(+), 8 deletions(-)

diff --git a/config/ifort_rome.cfg b/config/ifort_rome.cfg
index 3b70f98b..5ed01227 100644
--- a/config/ifort_rome.cfg
+++ b/config/ifort_rome.cfg
@@ -31,8 +31,8 @@ OPENMP  : 1          ; Append OpenMP flags
 # -ftz                       : Flushes denormal results to zero
 #
 [OPT]
-FC       : -traceback
-FCFLAGS  : -O2 -ip -g  -march=core-avx2 -align array64byte -fma -ftz -fomit-frame-pointer
+FC       : -traceback -shared-intel
+FCFLAGS  : -O2 -ip -g  -march=core-avx2 -align array64byte -fma -ftz -fomit-frame-pointer 
 
 # Profiling flags
 #################
diff --git a/src/utils/intel.f90 b/src/utils/intel.f90
index 4c18af8d..2b61cb38 100644
--- a/src/utils/intel.f90
+++ b/src/utils/intel.f90
@@ -17,15 +17,15 @@ module intel
   interface
     subroutine ippsSortAscend_64s_I(pSrc, len) bind(C, name='ippsSortAscend_64s_I')
      use  iso_c_binding
-     integer*8, intent(in), value :: len
-     integer, intent(inout) :: pSrc(len)
+     integer, intent(in), value :: len
+     integer*8, intent(inout) :: pSrc(len)
     end
   end interface
   interface
     subroutine ippsSortAscend_64f_I(pSrc, len) bind(C, name='ippsSortAscend_64f_I')
      use  iso_c_binding
-     double precision, intent(in), value :: len
-     real, intent(inout) :: pSrc(len)
+     integer, intent(in), value :: len
+     double precision, intent(inout) :: pSrc(len)
     end
   end interface
 
diff --git a/src/utils/sort.irp.f b/src/utils/sort.irp.f
index f3879d4a..21eb8b67 100644
--- a/src/utils/sort.irp.f
+++ b/src/utils/sort.irp.f
@@ -310,12 +310,73 @@ BEGIN_TEMPLATE
 
 SUBST [ X, type, ityp, n, ippsz ]
    ; real ; 32f ; 4 ; 13 ;;
- d ; double precision ; 64f ; 8 ; 19;;
  i ; integer ; 32s ; 4 ; 11 ;;
- i8 ; integer*8 ; 64s ; 8 ; 17;;
  i2 ; integer*2 ; 16s ; 2 ; 7 ;;
 END_TEMPLATE
 
+BEGIN_TEMPLATE
+
+ subroutine $Xsort(x,iorder,isize)
+  implicit none
+  BEGIN_DOC
+  ! Sort array x(isize).
+  ! iorder in input should be (1,2,3,...,isize), and in output
+  ! contains the new order of the elements.
+  END_DOC
+  integer,intent(in)             :: isize
+  $type,intent(inout)            :: x(isize)
+  integer,intent(inout)          :: iorder(isize)
+  integer                        :: n
+  if (isize < 2) then
+    return
+  endif
+!  call sorted_$Xnumber(x,isize,n)
+!  if (isize == n) then
+!    return
+!  endif
+  if ( isize < 32) then
+    call insertion_$Xsort(x,iorder,isize)
+  else
+!    call heap_$Xsort(x,iorder,isize)
+    call quick_$Xsort(x,iorder,isize)
+  endif
+ end subroutine $Xsort
+
+SUBST [ X, type ]
+ d ; double precision ;;
+END_TEMPLATE
+
+BEGIN_TEMPLATE
+
+ subroutine $Xsort(x,iorder,isize)
+  implicit none
+  BEGIN_DOC
+  ! Sort array x(isize).
+  ! iorder in input should be (1,2,3,...,isize), and in output
+  ! contains the new order of the elements.
+  END_DOC
+  integer,intent(in)             :: isize
+  $type,intent(inout)            :: x(isize)
+  integer,intent(inout)          :: iorder(isize)
+  integer                        :: n
+  if (isize < 2) then
+    return
+  endif
+  call sorted_$Xnumber(x,isize,n)
+  if (isize == n) then
+    return
+  endif
+  if ( isize < 32) then
+    call insertion_$Xsort(x,iorder,isize)
+  else
+    call $Xradix_sort(x,iorder,isize,-1)
+  endif
+ end subroutine $Xsort
+
+SUBST [ X, type ]
+ i8 ; integer*8 ;;
+END_TEMPLATE
+
 !---------------------- END INTEL
 IRP_ELSE
 !---------------------- NON-INTEL

From 00dfa11f4f013f91bf49d99d7300bbe98cbe3da5 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Mon, 31 May 2021 19:02:25 +0200
Subject: [PATCH 7/9] Removed OMP in sorting

---
 src/utils/sort.irp.f | 30 +-----------------------------
 1 file changed, 1 insertion(+), 29 deletions(-)

diff --git a/src/utils/sort.irp.f b/src/utils/sort.irp.f
index 21eb8b67..a63eb4a3 100644
--- a/src/utils/sort.irp.f
+++ b/src/utils/sort.irp.f
@@ -38,15 +38,7 @@ BEGIN_TEMPLATE
   $type,intent(inout)            :: x(isize)
   integer,intent(inout)          :: iorder(isize)
   integer, external              :: omp_get_num_threads
-  if (omp_get_num_threads() == 1) then
-    !$OMP PARALLEL DEFAULT(SHARED)
-    !$OMP SINGLE
-    call rec_$X_quicksort(x,iorder,isize,1,isize,nproc)
-    !$OMP END SINGLE
-    !$OMP END PARALLEL
-  else
-    call rec_$X_quicksort(x,iorder,isize,1,isize,nproc)
-  endif
+  call rec_$X_quicksort(x,iorder,isize,1,isize,nproc)
  end
 
  recursive subroutine rec_$X_quicksort(x, iorder, isize, first, last, level)
@@ -89,16 +81,11 @@ BEGIN_TEMPLATE
     endif
   else
     if (first < i-1) then
-      !$OMP TASK DEFAULT(SHARED) FIRSTPRIVATE(isize,first,i,level)
       call rec_$X_quicksort(x, iorder, isize, first, i-1,level/2)
-      !$OMP END TASK
     endif
     if (j+1 < last) then
-      !$OMP TASK DEFAULT(SHARED) FIRSTPRIVATE(isize,last,j,level)
       call rec_$X_quicksort(x, iorder, isize, j+1, last,level/2)
-      !$OMP END TASK
     endif
-    !$OMP TASKWAIT
   endif
  end
 
@@ -716,24 +703,14 @@ recursive subroutine $Xradix_sort$big(x,iorder,isize,iradix)
     endif
 
 
-!   !$OMP PARALLEL DEFAULT(SHARED) if (isize > 1000000)
-!   !$OMP SINGLE
     if (i3>1_$int_type) then
-!     !$OMP TASK FIRSTPRIVATE(iradix_new,i3) SHARED(x,iorder) if(i3 > 1000000)
       call $Xradix_sort$big(x,iorder,i3,iradix_new-1)
-!     !$OMP END TASK
     endif
 
     if (isize-i3>1_$int_type) then
-!     !$OMP TASK FIRSTPRIVATE(iradix_new,i3) SHARED(x,iorder) if(isize-i3 > 1000000)
       call $Xradix_sort$big(x(i3+1_$int_type),iorder(i3+1_$int_type),isize-i3,iradix_new-1)
-!     !$OMP END TASK
     endif
 
-!   !$OMP TASKWAIT
-!   !$OMP END SINGLE
-!   !$OMP END PARALLEL
-
     return
   endif
 
@@ -788,16 +765,11 @@ recursive subroutine $Xradix_sort$big(x,iorder,isize,iradix)
 
 
   if (i1>1_$int_type) then
-    !$OMP TASK FIRSTPRIVATE(i0,iradix,i1) SHARED(x,iorder) if(i1 >1000000)
     call $Xradix_sort$big(x(i0+1_$int_type),iorder(i0+1_$int_type),i1,iradix-1)
-    !$OMP END TASK
   endif
   if (i0>1) then
-    !$OMP TASK FIRSTPRIVATE(i0,iradix) SHARED(x,iorder) if(i0 >1000000)
     call $Xradix_sort$big(x,iorder,i0,iradix-1)
-    !$OMP END TASK
   endif
-  !$OMP TASKWAIT
 
  end
 

From d174a1f7cc0a4dda35d7556f9a3f7d5b2b29711a Mon Sep 17 00:00:00 2001
From: vijay gopal chilkuri <vijay.gopal.c@gmail.com>
Date: Mon, 31 May 2021 23:45:05 +0530
Subject: [PATCH 8/9] Fixed and verfied bug in n_CSF for Benzene singlet. #158

---
 src/csf/sigma_vector.irp.f | 67 +++++++++++++++++++++++++++++++-------
 1 file changed, 56 insertions(+), 11 deletions(-)

diff --git a/src/csf/sigma_vector.irp.f b/src/csf/sigma_vector.irp.f
index 581a6a6b..75d9f1f7 100644
--- a/src/csf/sigma_vector.irp.f
+++ b/src/csf/sigma_vector.irp.f
@@ -1,9 +1,15 @@
- BEGIN_PROVIDER [ integer, NSOMOMax]
-&BEGIN_PROVIDER [ integer, NCSFMax]
-&BEGIN_PROVIDER [ integer*8, NMO]
-&BEGIN_PROVIDER [ integer, NBFMax]
-&BEGIN_PROVIDER [ integer, n_CSF]
-&BEGIN_PROVIDER [ integer, maxDetDimPerBF]
+ real*8 function lgamma(x)
+ implicit none
+ real*8, intent(in) :: x
+ lgamma = log(abs(gamma(x)))
+ end function lgamma
+  
+  BEGIN_PROVIDER [ integer, NSOMOMax]
+ &BEGIN_PROVIDER [ integer, NCSFMax]
+ &BEGIN_PROVIDER [ integer*8, NMO]
+ &BEGIN_PROVIDER [ integer, NBFMax]
+ &BEGIN_PROVIDER [ integer, n_CSF]
+ &BEGIN_PROVIDER [ integer, maxDetDimPerBF]
   implicit none
   BEGIN_DOC
   ! Documentation for NSOMOMax
@@ -22,28 +28,67 @@
   integer NSOMO
   integer dimcsfpercfg
   integer detDimperBF
-  real*8 :: coeff
+  real*8 :: coeff, binom1, binom2
   integer MS
   integer ncfgpersomo
+  real*8, external :: lgamma
   detDimperBF = 0
   MS = elec_alpha_num-elec_beta_num
   ! number of cfgs = number of dets for 0 somos
   n_CSF = 0
   ncfgprev = cfg_seniority_index(0)
+  ncfgpersomo = ncfgprev
+  do i = 1, elec_num
+    print *,"i=",i," Ncfg= ",cfg_seniority_index(i)
+  enddo
   do i = iand(MS,1), NSOMOMax-2,2
+    if(cfg_seniority_index(i) .EQ. -1) then
+      cycle
+    endif
     if(cfg_seniority_index(i+2) .EQ. -1) then
       ncfgpersomo = N_configuration + 1
     else
-      ncfgpersomo = cfg_seniority_index(i+2)
+      if(cfg_seniority_index(i+2) > ncfgpersomo) then
+          ncfgpersomo = cfg_seniority_index(i+2)
+      else
+        k = 0
+        do while(cfg_seniority_index(i+2+k) < ncfgpersomo)
+          k = k + 2
+          ncfgpersomo = cfg_seniority_index(i+2+k)
+        enddo
+      endif
     endif
     ncfg = ncfgpersomo - ncfgprev
     if(iand(MS,1) .EQ. 0) then
-      dimcsfpercfg = max(1,nint((binom(i,i/2)-binom(i,i/2+1))))
+      !dimcsfpercfg = max(1,nint((binom(i,i/2)-binom(i,i/2+1))))
+      binom1 = dexp(lgamma(1.0d0*(i+1))                            &
+                  - lgamma(1.0d0*((i/2)+1))                        &
+                  - lgamma(1.0d0*(i-((i/2))+1)));
+      binom2 = dexp(lgamma(1.0d0*(i+1))                            &
+                  - lgamma(1.0d0*(((i/2)+1)+1))                    &
+                  - lgamma(1.0d0*(i-((i/2)+1)+1)));
+      dimcsfpercfg = max(1,nint(binom1 - binom2))
     else
-      dimcsfpercfg = max(1,nint((binom(i,(i+1)/2)-binom(i,(i+3)/2))))
+      !dimcsfpercfg = max(1,nint((binom(i,(i+1)/2)-binom(i,(i+3)/2))))
+      binom1 = dexp(lgamma(1.0d0*(i+1))                            &
+                  - lgamma(1.0d0*(((i+1)/2)+1))                    &
+                  - lgamma(1.0d0*(i-(((i+1)/2))+1)));
+      binom2 = dexp(lgamma(1.0d0*(i+1))                            &
+                  - lgamma(1.0d0*((((i+3)/2)+1)+1))                &
+                  - lgamma(1.0d0*(i-(((i+3)/2)+1)+1)));
+      dimcsfpercfg = max(1,nint(binom1 - binom2))
     endif
     n_CSF += ncfg * dimcsfpercfg
-    ncfgprev = cfg_seniority_index(i+2)
+    print *,"i=",i," ncfg= ", ncfg, " dims=", dimcsfpercfg, " n_csf=", n_CSF, ncfgpersomo, ncfgprev
+    if(cfg_seniority_index(i+2) > ncfgprev) then
+      ncfgprev = cfg_seniority_index(i+2)
+    else
+      k = 0
+      do while(cfg_seniority_index(i+2+k) < ncfgprev)
+        k = k + 2
+        ncfgprev = cfg_seniority_index(i+2+k)
+      enddo
+    endif
   enddo
 END_PROVIDER
 

From d584862f09462058457f01529f20e2906e650752 Mon Sep 17 00:00:00 2001
From: vijay gopal chilkuri <vijay.gopal.c@gmail.com>
Date: Tue, 1 Jun 2021 10:05:18 +0530
Subject: [PATCH 9/9] Remove debug print.

---
 src/csf/sigma_vector.irp.f | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/csf/sigma_vector.irp.f b/src/csf/sigma_vector.irp.f
index 75d9f1f7..c1b0b228 100644
--- a/src/csf/sigma_vector.irp.f
+++ b/src/csf/sigma_vector.irp.f
@@ -38,9 +38,6 @@
   n_CSF = 0
   ncfgprev = cfg_seniority_index(0)
   ncfgpersomo = ncfgprev
-  do i = 1, elec_num
-    print *,"i=",i," Ncfg= ",cfg_seniority_index(i)
-  enddo
   do i = iand(MS,1), NSOMOMax-2,2
     if(cfg_seniority_index(i) .EQ. -1) then
       cycle
@@ -79,7 +76,6 @@
       dimcsfpercfg = max(1,nint(binom1 - binom2))
     endif
     n_CSF += ncfg * dimcsfpercfg
-    print *,"i=",i," ncfg= ", ncfg, " dims=", dimcsfpercfg, " n_csf=", n_CSF, ncfgpersomo, ncfgprev
     if(cfg_seniority_index(i+2) > ncfgprev) then
       ncfgprev = cfg_seniority_index(i+2)
     else