diff --git a/src/ao_many_one_e_ints/ao_gaus_gauss.irp.f b/src/ao_many_one_e_ints/ao_gaus_gauss.irp.f
index d515b0c1..19cfe4cd 100644
--- a/src/ao_many_one_e_ints/ao_gaus_gauss.irp.f
+++ b/src/ao_many_one_e_ints/ao_gaus_gauss.irp.f
@@ -22,15 +22,15 @@ subroutine overlap_gauss_xyz_r12_ao(D_center,delta,i,j,gauss_ints)
  power_B(1:3)= ao_power(j,1:3)
  B_center(1:3) = nucl_coord(num_B,1:3)
  do l=1,ao_prim_num(i)
-  alpha = ao_expo_ordered_transp(l,i)     
+  alpha = ao_expo_ordered_transp(l,i)
   do k=1,ao_prim_num(j)
-   beta = ao_expo_ordered_transp(k,j)     
+   beta = ao_expo_ordered_transp(k,j)
    call overlap_gauss_xyz_r12(D_center,delta,A_center,B_center,power_A,power_B,alpha,beta,gauss_ints_tmp)
    do m = 1, 3
     gauss_ints(m) += gauss_ints_tmp(m) *  ao_coef_normalized_ordered_transp(l,i)             &
-                                       *  ao_coef_normalized_ordered_transp(k,j) 
+                                       *  ao_coef_normalized_ordered_transp(k,j)
    enddo
-  enddo 
+  enddo
  enddo
 end
 
@@ -61,13 +61,13 @@ double precision function overlap_gauss_xyz_r12_ao_specific(D_center,delta,i,j,m
  power_B(1:3)= ao_power(j,1:3)
  B_center(1:3) = nucl_coord(num_B,1:3)
  do l=1,ao_prim_num(i)
-  alpha = ao_expo_ordered_transp(l,i)     
+  alpha = ao_expo_ordered_transp(l,i)
   do k=1,ao_prim_num(j)
-   beta = ao_expo_ordered_transp(k,j)     
+   beta = ao_expo_ordered_transp(k,j)
    gauss_int = overlap_gauss_xyz_r12_specific(D_center,delta,A_center,B_center,power_A,power_B,alpha,beta,mx)
    overlap_gauss_xyz_r12_ao_specific = gauss_int *  ao_coef_normalized_ordered_transp(l,i)             &
-                                                 *  ao_coef_normalized_ordered_transp(k,j) 
-  enddo 
+                                                 *  ao_coef_normalized_ordered_transp(k,j)
+  enddo
  enddo
 end
 
@@ -90,13 +90,13 @@ subroutine overlap_gauss_r12_all_ao(D_center,delta,aos_ints)
    power_B(1:3)= ao_power(j,1:3)
    B_center(1:3) = nucl_coord(num_B,1:3)
    do l=1,ao_prim_num(i)
-    alpha = ao_expo_ordered_transp(l,i)     
+    alpha = ao_expo_ordered_transp(l,i)
     do k=1,ao_prim_num(j)
-     beta = ao_expo_ordered_transp(k,j)     
+     beta = ao_expo_ordered_transp(k,j)
      analytical_j = overlap_gauss_r12(D_center,delta,A_center,B_center,power_A,power_B,alpha,beta)
      aos_ints(j,i) += analytical_j *  ao_coef_normalized_ordered_transp(l,i)             &
-                                   *  ao_coef_normalized_ordered_transp(k,j) 
-    enddo 
+                                   *  ao_coef_normalized_ordered_transp(k,j)
+    enddo
    enddo
   enddo
  enddo
@@ -114,7 +114,7 @@ double precision function overlap_gauss_r12_ao(D_center, delta, i, j)
   implicit none
   integer,          intent(in) :: i, j
   double precision, intent(in) :: D_center(3), delta
- 
+
   integer                      :: power_A(3), power_B(3), l, k
   double precision             :: A_center(3), B_center(3), alpha, beta, coef, coef1, analytical_j
 
@@ -133,19 +133,19 @@ double precision function overlap_gauss_r12_ao(D_center, delta, i, j)
   B_center(1:3) = nucl_coord(ao_nucl(j),1:3)
 
   do l = 1, ao_prim_num(i)
-    alpha = ao_expo_ordered_transp           (l,i)     
-    coef1 = ao_coef_normalized_ordered_transp(l,i) 
+    alpha = ao_expo_ordered_transp           (l,i)
+    coef1 = ao_coef_normalized_ordered_transp(l,i)
 
     do k = 1, ao_prim_num(j)
       beta = ao_expo_ordered_transp(k,j)
-      coef = coef1 * ao_coef_normalized_ordered_transp(k,j) 
+      coef = coef1 * ao_coef_normalized_ordered_transp(k,j)
 
       if(dabs(coef) .lt. 1d-12) cycle
 
       analytical_j = overlap_gauss_r12(D_center, delta, A_center, B_center, power_A, power_B, alpha, beta)
 
       overlap_gauss_r12_ao += coef * analytical_j
-    enddo 
+    enddo
   enddo
 
 end function overlap_gauss_r12_ao
@@ -163,14 +163,13 @@ double precision function overlap_gauss_r12_ao_with1s(B_center, beta, D_center,
   implicit none
   integer,          intent(in) :: i, j
   double precision, intent(in) :: B_center(3), beta, D_center(3), delta
- 
+
   integer                      :: power_A1(3), power_A2(3), l, k
   double precision             :: A1_center(3), A2_center(3), alpha1, alpha2, coef1, coef12, analytical_j
   double precision             :: G_center(3), gama, fact_g, gama_inv
 
   double precision, external   :: overlap_gauss_r12, overlap_gauss_r12_ao
 
-  ASSERT(beta .gt. 0.d0)
   if(beta .lt. 1d-10) then
     overlap_gauss_r12_ao_with1s = overlap_gauss_r12_ao(D_center, delta, i, j)
     return
@@ -204,22 +203,120 @@ double precision function overlap_gauss_r12_ao_with1s(B_center, beta, D_center,
   A2_center(1:3) = nucl_coord(ao_nucl(j),1:3)
 
   do l = 1, ao_prim_num(i)
-    alpha1 = ao_expo_ordered_transp                    (l,i)     
+    alpha1 = ao_expo_ordered_transp                    (l,i)
     coef1  = fact_g * ao_coef_normalized_ordered_transp(l,i)
     if(dabs(coef1) .lt. 1d-12) cycle
 
     do k = 1, ao_prim_num(j)
       alpha2 = ao_expo_ordered_transp                   (k,j)
-      coef12 = coef1 * ao_coef_normalized_ordered_transp(k,j) 
+      coef12 = coef1 * ao_coef_normalized_ordered_transp(k,j)
       if(dabs(coef12) .lt. 1d-12) cycle
 
       analytical_j = overlap_gauss_r12(G_center, gama, A1_center, A2_center, power_A1, power_A2, alpha1, alpha2)
 
       overlap_gauss_r12_ao_with1s += coef12 * analytical_j
-    enddo 
+    enddo
   enddo
 
 end function overlap_gauss_r12_ao_with1s
 
 ! ---
 
+subroutine overlap_gauss_r12_ao_with1s_v(B_center, beta, D_center, delta, i, j, resv, n_points)
+  BEGIN_DOC
+  !
+  ! \int dr AO_i(r) AO_j(r) e^{-beta |r-B_center^2|} e^{-delta |r-D_center|^2}
+  ! using an array of D_centers.
+  !
+  END_DOC
+
+  implicit none
+  integer,          intent(in)  :: i, j, n_points
+  double precision, intent(in)  :: B_center(3), beta, D_center(3,n_points), delta
+  double precision, intent(out) :: resv(n_points)
+
+  integer                      :: power_A1(3), power_A2(3), l, k
+  double precision             :: A1_center(3), A2_center(3), alpha1, alpha2, coef1
+  double precision             :: coef12, coef12f
+  double precision             :: gama, gama_inv
+  double precision             :: bg, dg, bdg
+
+  double precision, external   :: overlap_gauss_r12, overlap_gauss_r12_ao
+
+  double precision, external   :: overlap_gauss_r12_ao_with1s
+  integer :: ipoint
+
+  double precision, allocatable :: fact_g(:), G_center(:,:), analytical_j(:)
+
+  if(ao_overlap_abs(j,i) .lt. 1.d-12) then
+    return
+  endif
+
+  ASSERT(beta .gt. 0.d0)
+
+  if(beta .lt. 1d-10) then
+    do ipoint=1,n_points
+      resv(ipoint) = overlap_gauss_r12_ao(D_center(1,ipoint), delta, i, j)
+    enddo
+    return
+  endif
+
+  resv(:) = 0.d0
+
+  ! e^{-beta |r-B_center^2|} e^{-delta |r-D_center|^2} = fact_g e^{-gama |r - G|^2}
+
+  gama        = beta + delta
+  gama_inv    = 1.d0 / gama
+
+  power_A1(1:3) = ao_power(i,1:3)
+  power_A2(1:3) = ao_power(j,1:3)
+
+  A1_center(1:3) = nucl_coord(ao_nucl(i),1:3)
+  A2_center(1:3) = nucl_coord(ao_nucl(j),1:3)
+
+  allocate (fact_g(n_points), G_center(3,n_points), analytical_j(n_points) )
+
+  bg = beta * gama_inv
+  dg = delta * gama_inv
+  bdg = bg * delta 
+  do ipoint=1,n_points
+    G_center(1,ipoint) = bg * B_center(1) + dg * D_center(1,ipoint)
+    G_center(2,ipoint) = bg * B_center(2) + dg * D_center(2,ipoint)
+    G_center(3,ipoint) = bg * B_center(3) + dg * D_center(3,ipoint)
+    fact_g(ipoint) = bdg * ( &
+          (B_center(1) - D_center(1,ipoint)) * (B_center(1) - D_center(1,ipoint))  &
+        + (B_center(2) - D_center(2,ipoint)) * (B_center(2) - D_center(2,ipoint))  &
+        + (B_center(3) - D_center(3,ipoint)) * (B_center(3) - D_center(3,ipoint)) )
+
+    if(fact_g(ipoint) < 10d0) then
+      fact_g(ipoint) = dexp(-fact_g(ipoint))
+    else
+      fact_g(ipoint) = 0.d0
+    endif
+
+  enddo
+
+    ! ---
+
+  do l = 1, ao_prim_num(i)
+    alpha1 = ao_expo_ordered_transp                    (l,i)
+    coef1  = ao_coef_normalized_ordered_transp(l,i)
+
+    do k = 1, ao_prim_num(j)
+      alpha2 = ao_expo_ordered_transp                   (k,j)
+      coef12 = coef1 * ao_coef_normalized_ordered_transp(k,j)
+      if(dabs(coef12) .lt. 1d-12) cycle
+
+      call overlap_gauss_r12_v(G_center, gama, A1_center,&
+            A2_center, power_A1, power_A2, alpha1, alpha2, analytical_j, n_points)
+
+      do ipoint=1,n_points
+        coef12f = coef12 * fact_g(ipoint)
+        resv(ipoint) += coef12f * analytical_j(ipoint)
+      end do
+
+    enddo
+  enddo
+
+
+end
diff --git a/src/ao_many_one_e_ints/grad2_jmu_modif.irp.f b/src/ao_many_one_e_ints/grad2_jmu_modif.irp.f
index 50b4bf96..6875bc7a 100644
--- a/src/ao_many_one_e_ints/grad2_jmu_modif.irp.f
+++ b/src/ao_many_one_e_ints/grad2_jmu_modif.irp.f
@@ -16,54 +16,55 @@ BEGIN_PROVIDER [ double precision, int2_grad1u2_grad2u2_j1b2, (ao_num, ao_num, n
   double precision              :: tmp
   double precision              :: wall0, wall1
 
+  double precision, allocatable :: int_fit_v(:)
   double precision, external    :: overlap_gauss_r12_ao_with1s
 
   provide mu_erf final_grid_points j1b_pen
   call wall_time(wall0)
 
-  int2_grad1u2_grad2u2_j1b2 = 0.d0
+ allocate(int_fit_v(n_points_final_grid))
+ int2_grad1u2_grad2u2_j1b2(:,:,:) = 0.d0
 
- !$OMP PARALLEL DEFAULT (NONE)                                      &
- !$OMP PRIVATE (ipoint, i, j, i_1s, i_fit, r, coef, beta, B_center, &
- !$OMP          coef_fit, expo_fit, int_fit, tmp)                   & 
- !$OMP SHARED  (n_points_final_grid, ao_num, List_all_comb_b3_size, & 
- !$OMP          final_grid_points, n_max_fit_slat,                  &
- !$OMP          expo_gauss_1_erf_x_2, coef_gauss_1_erf_x_2,         &
- !$OMP          List_all_comb_b3_coef, List_all_comb_b3_expo,       & 
- !$OMP          List_all_comb_b3_cent, int2_grad1u2_grad2u2_j1b2)
- !$OMP DO
-  !do ipoint = 1, 10
-  do ipoint = 1, n_points_final_grid
-    r(1) = final_grid_points(1,ipoint)
-    r(2) = final_grid_points(2,ipoint)
-    r(3) = final_grid_points(3,ipoint)
+ !$OMP PARALLEL DEFAULT (NONE)                                       &
+     !$OMP PRIVATE (ipoint, i, j, i_1s, i_fit, r, coef, beta, B_center,&
+     !$OMP          coef_fit, expo_fit, int_fit_v, tmp)                &
+     !$OMP SHARED  (n_points_final_grid, ao_num, List_all_comb_b3_size,&
+     !$OMP          final_grid_points, n_max_fit_slat,               &
+     !$OMP          expo_gauss_1_erf_x_2, coef_gauss_1_erf_x_2,      &
+     !$OMP          List_all_comb_b3_coef, List_all_comb_b3_expo,    &
+     !$OMP          List_all_comb_b3_cent, int2_grad1u2_grad2u2_j1b2,&
+     !$OMP          ao_overlap_abs)
+ !$OMP DO SCHEDULE(dynamic)
+ do i = 1, ao_num
+   do j = i, ao_num
 
-    do i = 1, ao_num
-      do j = i, ao_num
+     if(ao_overlap_abs(j,i) .lt. 1.d-12) then
+       cycle
+     endif
 
-        tmp = 0.d0
-        do i_1s = 1, List_all_comb_b3_size
+     do i_1s = 1, List_all_comb_b3_size
 
-          coef        = List_all_comb_b3_coef  (i_1s)
-          beta        = List_all_comb_b3_expo  (i_1s)
-          B_center(1) = List_all_comb_b3_cent(1,i_1s)
-          B_center(2) = List_all_comb_b3_cent(2,i_1s)
-          B_center(3) = List_all_comb_b3_cent(3,i_1s)
+       coef        = List_all_comb_b3_coef  (i_1s)
+       beta        = List_all_comb_b3_expo  (i_1s)
+       B_center(1) = List_all_comb_b3_cent(1,i_1s)
+       B_center(2) = List_all_comb_b3_cent(2,i_1s)
+       B_center(3) = List_all_comb_b3_cent(3,i_1s)
 
-          do i_fit = 1, n_max_fit_slat
+       do i_fit = 1, n_max_fit_slat
 
-            expo_fit = expo_gauss_1_erf_x_2(i_fit)
-            coef_fit = coef_gauss_1_erf_x_2(i_fit)
-            int_fit  = overlap_gauss_r12_ao_with1s(B_center, beta, r, expo_fit, i, j)
+         expo_fit = expo_gauss_1_erf_x_2(i_fit)
+         coef_fit = -0.25d0 *  coef_gauss_1_erf_x_2(i_fit) * coef
 
-            tmp += -0.25d0 * coef * coef_fit * int_fit
-          enddo
-        enddo
+         call overlap_gauss_r12_ao_with1s_v(B_center, beta, final_grid_points, expo_fit, i, j, int_fit_v, n_points_final_grid)
 
-        int2_grad1u2_grad2u2_j1b2(j,i,ipoint) = tmp
-      enddo
-    enddo
-  enddo
+         do ipoint = 1, n_points_final_grid
+           int2_grad1u2_grad2u2_j1b2(j,i,ipoint) += coef_fit * int_fit_v(ipoint)
+         enddo
+       enddo
+
+     enddo
+   enddo
+ enddo
  !$OMP END DO
  !$OMP END PARALLEL
 
@@ -78,7 +79,7 @@ BEGIN_PROVIDER [ double precision, int2_grad1u2_grad2u2_j1b2, (ao_num, ao_num, n
   call wall_time(wall1)
   print*, ' wall time for int2_grad1u2_grad2u2_j1b2', wall1 - wall0
 
-END_PROVIDER 
+END_PROVIDER
 
 ! ---
 
@@ -105,11 +106,11 @@ BEGIN_PROVIDER [ double precision, int2_u2_j1b2, (ao_num, ao_num, n_points_final
 
  !$OMP PARALLEL DEFAULT (NONE)                                      &
  !$OMP PRIVATE (ipoint, i, j, i_1s, i_fit, r, coef, beta, B_center, &
- !$OMP          coef_fit, expo_fit, int_fit, tmp)                   & 
- !$OMP SHARED  (n_points_final_grid, ao_num, List_all_comb_b3_size, & 
+ !$OMP          coef_fit, expo_fit, int_fit, tmp)                   &
+ !$OMP SHARED  (n_points_final_grid, ao_num, List_all_comb_b3_size, &
  !$OMP          final_grid_points, n_max_fit_slat,                  &
  !$OMP          expo_gauss_j_mu_x_2, coef_gauss_j_mu_x_2,           &
- !$OMP          List_all_comb_b3_coef, List_all_comb_b3_expo,       & 
+ !$OMP          List_all_comb_b3_coef, List_all_comb_b3_expo,       &
  !$OMP          List_all_comb_b3_cent, int2_u2_j1b2)
  !$OMP DO
   !do ipoint = 1, 10
@@ -158,7 +159,7 @@ BEGIN_PROVIDER [ double precision, int2_u2_j1b2, (ao_num, ao_num, n_points_final
   call wall_time(wall1)
   print*, ' wall time for int2_u2_j1b2', wall1 - wall0
 
-END_PROVIDER 
+END_PROVIDER
 
 ! ---
 
@@ -186,12 +187,12 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2, (3, ao_num, ao_num, n_p
  !$OMP PARALLEL DEFAULT (NONE)                                      &
  !$OMP PRIVATE (ipoint, i, j, i_1s, i_fit, r, coef, beta, B_center, &
  !$OMP          coef_fit, expo_fit, int_fit, alpha_1s, dist,        &
- !$OMP          alpha_1s_inv, centr_1s, expo_coef_1s, coef_tmp,     & 
- !$OMP          tmp_x, tmp_y, tmp_z)                                & 
- !$OMP SHARED  (n_points_final_grid, ao_num, List_all_comb_b3_size, & 
+ !$OMP          alpha_1s_inv, centr_1s, expo_coef_1s, coef_tmp,     &
+ !$OMP          tmp_x, tmp_y, tmp_z)                                &
+ !$OMP SHARED  (n_points_final_grid, ao_num, List_all_comb_b3_size, &
  !$OMP          final_grid_points, n_max_fit_slat,                  &
  !$OMP          expo_gauss_j_mu_1_erf, coef_gauss_j_mu_1_erf,       &
- !$OMP          List_all_comb_b3_coef, List_all_comb_b3_expo,       & 
+ !$OMP          List_all_comb_b3_coef, List_all_comb_b3_expo,       &
  !$OMP          List_all_comb_b3_cent, int2_u_grad1u_x_j1b2)
  !$OMP DO
   do ipoint = 1, n_points_final_grid
@@ -214,7 +215,7 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2, (3, ao_num, ao_num, n_p
           B_center(3) = List_all_comb_b3_cent(3,i_1s)
           dist        = (B_center(1) - r(1)) * (B_center(1) - r(1)) &
                       + (B_center(2) - r(2)) * (B_center(2) - r(2)) &
-                      + (B_center(3) - r(3)) * (B_center(3) - r(3)) 
+                      + (B_center(3) - r(3)) * (B_center(3) - r(3))
 
           do i_fit = 1, n_max_fit_slat
 
@@ -222,17 +223,17 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2, (3, ao_num, ao_num, n_p
             coef_fit = coef_gauss_j_mu_1_erf(i_fit)
 
             alpha_1s     = beta + expo_fit
-            alpha_1s_inv = 1.d0 / alpha_1s 
+            alpha_1s_inv = 1.d0 / alpha_1s
 
             centr_1s(1)  = alpha_1s_inv * (beta * B_center(1) + expo_fit * r(1))
             centr_1s(2)  = alpha_1s_inv * (beta * B_center(2) + expo_fit * r(2))
             centr_1s(3)  = alpha_1s_inv * (beta * B_center(3) + expo_fit * r(3))
 
-            expo_coef_1s = beta * expo_fit * alpha_1s_inv * dist 
+            expo_coef_1s = beta * expo_fit * alpha_1s_inv * dist
             !if(expo_coef_1s .gt. 80.d0) cycle
             coef_tmp = coef * coef_fit * dexp(-expo_coef_1s)
             !if(dabs(coef_tmp) .lt. 1d-10) cycle
-            
+
             call NAI_pol_x_mult_erf_ao_with1s(i, j, alpha_1s, centr_1s, 1.d+9, r, int_fit)
 
             tmp_x += coef_tmp * int_fit(1)
@@ -263,7 +264,7 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2, (3, ao_num, ao_num, n_p
   call wall_time(wall1)
   print*, ' wall time for int2_u_grad1u_x_j1b2', wall1 - wall0
 
-END_PROVIDER 
+END_PROVIDER
 
 ! ---
 
@@ -291,11 +292,11 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_j1b2, (ao_num, ao_num, n_points
  !$OMP PARALLEL DEFAULT (NONE)                                      &
  !$OMP PRIVATE (ipoint, i, j, i_1s, i_fit, r, coef, beta, B_center, &
  !$OMP          coef_fit, expo_fit, int_fit, tmp, alpha_1s, dist,   &
- !$OMP          alpha_1s_inv, centr_1s, expo_coef_1s, coef_tmp)     & 
- !$OMP SHARED  (n_points_final_grid, ao_num, List_all_comb_b3_size, & 
+ !$OMP          alpha_1s_inv, centr_1s, expo_coef_1s, coef_tmp)     &
+ !$OMP SHARED  (n_points_final_grid, ao_num, List_all_comb_b3_size, &
  !$OMP          final_grid_points, n_max_fit_slat,                  &
  !$OMP          expo_gauss_j_mu_1_erf, coef_gauss_j_mu_1_erf,       &
- !$OMP          List_all_comb_b3_coef, List_all_comb_b3_expo,       & 
+ !$OMP          List_all_comb_b3_coef, List_all_comb_b3_expo,       &
  !$OMP          List_all_comb_b3_cent, int2_u_grad1u_j1b2)
  !$OMP DO
   do ipoint = 1, n_points_final_grid
@@ -323,7 +324,7 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_j1b2, (ao_num, ao_num, n_points
             coef_fit = coef_gauss_j_mu_1_erf(i_fit)
 
             alpha_1s     = beta + expo_fit
-            alpha_1s_inv = 1.d0 / alpha_1s 
+            alpha_1s_inv = 1.d0 / alpha_1s
             centr_1s(1)  = alpha_1s_inv * (beta * B_center(1) + expo_fit * r(1))
             centr_1s(2)  = alpha_1s_inv * (beta * B_center(2) + expo_fit * r(2))
             centr_1s(3)  = alpha_1s_inv * (beta * B_center(3) + expo_fit * r(3))
@@ -332,7 +333,7 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_j1b2, (ao_num, ao_num, n_points
             !if(expo_coef_1s .gt. 80.d0) cycle
             coef_tmp = coef * coef_fit * dexp(-expo_coef_1s)
             !if(dabs(coef_tmp) .lt. 1d-10) cycle
-            
+
             int_fit = NAI_pol_mult_erf_ao_with1s(i, j, alpha_1s, centr_1s,  1.d+9, r)
 
             tmp += coef_tmp * int_fit
@@ -357,7 +358,7 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_j1b2, (ao_num, ao_num, n_points
   call wall_time(wall1)
   print*, ' wall time for int2_u_grad1u_j1b2', wall1 - wall0
 
-END_PROVIDER 
+END_PROVIDER
 
 ! ---
 
diff --git a/src/ao_many_one_e_ints/listj1b.irp.f b/src/ao_many_one_e_ints/listj1b.irp.f
index 42d37069..ced6f4e9 100644
--- a/src/ao_many_one_e_ints/listj1b.irp.f
+++ b/src/ao_many_one_e_ints/listj1b.irp.f
@@ -63,7 +63,6 @@ END_PROVIDER
       tmp_cent_z               += tmp_alphaj * nucl_coord(j,3)
     enddo
 
-    ASSERT(List_all_comb_b2_expo(i) .gt. 0d0)
     if(List_all_comb_b2_expo(i) .lt. 1d-10) cycle
 
     List_all_comb_b2_cent(1,i) = tmp_cent_x / List_all_comb_b2_expo(i) 
diff --git a/src/ao_many_one_e_ints/prim_int_gauss_gauss.irp.f b/src/ao_many_one_e_ints/prim_int_gauss_gauss.irp.f
index 749227ea..0da39561 100644
--- a/src/ao_many_one_e_ints/prim_int_gauss_gauss.irp.f
+++ b/src/ao_many_one_e_ints/prim_int_gauss_gauss.irp.f
@@ -1,67 +1,139 @@
-
 double precision function overlap_gauss_r12(D_center,delta,A_center,B_center,power_A,power_B,alpha,beta)
   BEGIN_DOC
   ! Computes the following integral :
   !
-  ! .. math::
-  ! 
+  ! .. math                      ::
+  !
   !   \int dr exp(-delta (r - D)^2 ) (x-A_x)^a (x-B_x)^b \exp(-\alpha (x-A_x)^2 - \beta (x-B_x)^2 )
   !
   END_DOC
 
- implicit none
+  implicit none
   include 'constants.include.F'
- double precision, intent(in)    :: D_center(3), delta  ! pure gaussian "D" 
- double precision, intent(in)    :: A_center(3),B_center(3),alpha,beta ! gaussian/polynoms "A" and "B"
- integer, intent(in)             :: power_A(3),power_B(3)
+  double precision, intent(in)   :: D_center(3), delta  ! pure gaussian "D"
+  double precision, intent(in)   :: A_center(3),B_center(3),alpha,beta ! gaussian/polynoms "A" and "B"
+  integer, intent(in)            :: power_A(3),power_B(3)
 
- double precision  :: overlap_x,overlap_y,overlap_z,overlap
- ! First you multiply the usual gaussian "A" with the gaussian exp(-delta (r - D)^2 )
- double precision  :: A_new(0:max_dim,3)! new polynom 
- double precision  :: A_center_new(3)   ! new center
- integer           :: iorder_a_new(3)   ! i_order(i) = order of the new polynom ==> should be equal to power_A
- double precision  :: alpha_new         ! new exponent
- double precision  :: fact_a_new        ! constant factor
- double precision  :: accu,coefx,coefy,coefz,coefxy,coefxyz,thr
- integer           :: d(3),i,lx,ly,lz,iorder_tmp(3),dim1
- dim1=100
- thr = 1.d-10
- d = 0 ! order of the polynom for the gaussian exp(-delta (r - D)^2 )  == 0
+  double precision               :: overlap_x,overlap_y,overlap_z,overlap
+  ! First you multiply the usual gaussian "A" with the gaussian exp(-delta (r - D)^2 )
+  double precision               :: A_new(0:max_dim,3)! new polynom
+  double precision               :: A_center_new(3)   ! new center
+  integer                        :: iorder_a_new(3)   ! i_order(i) = order of the new polynom ==> should be equal to power_A
+  double precision               :: alpha_new         ! new exponent
+  double precision               :: fact_a_new        ! constant factor
+  double precision               :: accu,coefx,coefy,coefz,coefxy,coefxyz,thr
+  integer                        :: d(3),i,lx,ly,lz,iorder_tmp(3),dim1
+  dim1=100
+  thr = 1.d-10
+  d(:) = 0 ! order of the polynom for the gaussian exp(-delta (r - D)^2 )  == 0
 
- ! New gaussian/polynom defined by :: new pol new center new expo   cst fact new order                                
- call give_explicit_poly_and_gaussian(A_new , A_center_new , alpha_new, fact_a_new , iorder_a_new , & 
-                                      delta,alpha,d,power_A,D_center,A_center,n_pt_max_integrals)
- ! The new gaussian exp(-delta (r - D)^2 ) (x-A_x)^a \exp(-\alpha (x-A_x)^2
- accu = 0.d0
- do lx = 0, iorder_a_new(1)
-  coefx = A_new(lx,1)
-  if(dabs(coefx).lt.thr)cycle
-  iorder_tmp(1) = lx
-  do ly = 0, iorder_a_new(2)
-   coefy = A_new(ly,2)
-   coefxy = coefx * coefy 
-   if(dabs(coefxy).lt.thr)cycle
-   iorder_tmp(2) = ly
-   do lz = 0, iorder_a_new(3)
-    coefz = A_new(lz,3)
-    coefxyz = coefxy * coefz 
-    if(dabs(coefxyz).lt.thr)cycle
-    iorder_tmp(3) = lz
-    call overlap_gaussian_xyz(A_center_new,B_center,alpha_new,beta,iorder_tmp,power_B,overlap_x,overlap_y,overlap_z,overlap,dim1)
-    accu += coefxyz * overlap
-   enddo
+  ! New gaussian/polynom defined by :: new pol new center new expo   cst fact new order
+  call give_explicit_poly_and_gaussian(A_new , A_center_new , alpha_new, fact_a_new , iorder_a_new ,&
+      delta,alpha,d,power_A,D_center,A_center,n_pt_max_integrals)
+  ! The new gaussian exp(-delta (r - D)^2 ) (x-A_x)^a \exp(-\alpha (x-A_x)^2
+  accu = 0.d0
+  do lx = 0, iorder_a_new(1)
+    coefx = A_new(lx,1)
+    if(dabs(coefx).lt.thr)cycle
+    iorder_tmp(1) = lx
+    do ly = 0, iorder_a_new(2)
+      coefy = A_new(ly,2)
+      coefxy = coefx * coefy
+      if(dabs(coefxy).lt.thr)cycle
+      iorder_tmp(2) = ly
+      do lz = 0, iorder_a_new(3)
+        coefz = A_new(lz,3)
+        coefxyz = coefxy * coefz
+        if(dabs(coefxyz).lt.thr)cycle
+        iorder_tmp(3) = lz
+        call overlap_gaussian_xyz(A_center_new,B_center,alpha_new,beta,iorder_tmp,power_B,overlap_x,overlap_y,overlap_z,overlap,dim1)
+        accu += coefxyz * overlap
+      enddo
+    enddo
   enddo
- enddo
- overlap_gauss_r12 = fact_a_new * accu 
+  overlap_gauss_r12 = fact_a_new * accu
 end
 
+!---
+
+subroutine overlap_gauss_r12_v(D_center,delta,A_center,B_center,power_A,power_B,alpha,beta,rvec,n_points)
+  BEGIN_DOC
+  ! Computes the following integral :
+  !
+  ! .. math                      ::
+  !
+  !   \int dr exp(-delta (r - D)^2 ) (x-A_x)^a (x-B_x)^b \exp(-\alpha (x-A_x)^2 - \beta (x-B_x)^2 )
+  !   using an array of D_centers
+  !
+  END_DOC
+
+  implicit none
+  include 'constants.include.F'
+  integer, intent(in)            :: n_points
+  double precision, intent(in)   :: D_center(3,n_points), delta  ! pure gaussian "D"
+  double precision, intent(in)   :: A_center(3),B_center(3),alpha,beta ! gaussian/polynoms "A" and "B"
+  integer, intent(in)            :: power_A(3),power_B(3)
+  double precision, intent(out)  :: rvec(n_points)
+
+  double precision               :: overlap_x,overlap_y,overlap_z,overlap
+
+  integer                        :: maxab
+  integer, allocatable           :: iorder_a_new(:)
+  double precision, allocatable  :: A_new(:,:,:), A_center_new(:,:)
+  double precision, allocatable  :: fact_a_new(:)
+  double precision               :: alpha_new
+  double precision               :: accu,coefx,coefy,coefz,coefxy,coefxyz,thr
+  integer                        :: d(3),i,lx,ly,lz,iorder_tmp(3),dim1, ipoint
+
+  dim1=100
+  thr = 1.d-10
+  d(:) = 0
+
+!  maxab = maxval(d(1:3))
+  maxab = max_dim
+  allocate (A_new(0:maxab, 3, n_points), A_center_new(3, n_points), &
+            fact_a_new(n_points), iorder_a_new(3))
+
+  call give_explicit_poly_and_gaussian_v(A_new, maxab, A_center_new, &
+        alpha_new, fact_a_new, iorder_a_new , delta, alpha, d, power_A, &
+        D_center, A_center, n_points)
+
+  do ipoint=1,n_points
+
+    ! The new gaussian exp(-delta (r - D)^2 ) (x-A_x)^a \exp(-\alpha (x-A_x)^2
+    accu = 0.d0
+    do lx = 0, iorder_a_new(1)
+      coefx = A_new(lx,1,ipoint)
+      if(dabs(coefx).lt.thr)cycle
+      iorder_tmp(1) = lx
+      do ly = 0, iorder_a_new(2)
+        coefy = A_new(ly,2,ipoint)
+        coefxy = coefx * coefy
+        if(dabs(coefxy).lt.thr)cycle
+        iorder_tmp(2) = ly
+        do lz = 0, iorder_a_new(3)
+          coefz = A_new(lz,3,ipoint)
+          coefxyz = coefxy * coefz
+          if(dabs(coefxyz).lt.thr)cycle
+          iorder_tmp(3) = lz
+          call overlap_gaussian_xyz(A_center_new(1,ipoint),B_center,alpha_new,beta,iorder_tmp,power_B,overlap_x,overlap_y,overlap_z,overlap,dim1)
+          accu += coefxyz * overlap
+        enddo
+      enddo
+    enddo
+    rvec(ipoint) = fact_a_new(ipoint) * accu
+  end do
+end
+
+!---
+!---
 
 subroutine overlap_gauss_xyz_r12(D_center,delta,A_center,B_center,power_A,power_B,alpha,beta,gauss_ints)
   BEGIN_DOC
   ! Computes the following integral :
   !
   ! .. math::
-  ! 
+  !
   !   gauss_ints(m) = \int dr exp(-delta (r - D)^2 ) * x/y/z (x-A_x)^a (x-B_x)^b \exp(-\alpha (x-A_x)^2 - \beta (x-B_x)^2 )
   !
   ! with m == 1 ==> x, m == 2 ==> y, m == 3 ==> z
@@ -69,14 +141,14 @@ subroutine overlap_gauss_xyz_r12(D_center,delta,A_center,B_center,power_A,power_
 
  implicit none
   include 'constants.include.F'
- double precision, intent(in)    :: D_center(3), delta  ! pure gaussian "D" 
+ double precision, intent(in)    :: D_center(3), delta  ! pure gaussian "D"
  double precision, intent(in)    :: A_center(3),B_center(3),alpha,beta ! gaussian/polynoms "A" and "B"
  integer, intent(in)             :: power_A(3),power_B(3)
  double precision, intent(out)   :: gauss_ints(3)
 
  double precision  :: overlap_x,overlap_y,overlap_z,overlap
  ! First you multiply the usual gaussian "A" with the gaussian exp(-delta (r - D)^2 )
- double precision  :: A_new(0:max_dim,3)! new polynom 
+ double precision  :: A_new(0:max_dim,3)! new polynom
  double precision  :: A_center_new(3)   ! new center
  integer           :: iorder_a_new(3)   ! i_order(i) = order of the new polynom ==> should be equal to power_A
  integer           :: power_B_new(3)
@@ -88,8 +160,8 @@ subroutine overlap_gauss_xyz_r12(D_center,delta,A_center,B_center,power_A,power_
  thr = 1.d-10
  d = 0 ! order of the polynom for the gaussian exp(-delta (r - D)^2 )  == 0
 
- ! New gaussian/polynom defined by :: new pol new center new expo   cst fact new order                                
- call give_explicit_poly_and_gaussian(A_new , A_center_new , alpha_new, fact_a_new , iorder_a_new , & 
+ ! New gaussian/polynom defined by :: new pol new center new expo   cst fact new order
+ call give_explicit_poly_and_gaussian(A_new , A_center_new , alpha_new, fact_a_new , iorder_a_new , &
                                       delta,alpha,d,power_A,D_center,A_center,n_pt_max_integrals)
  ! The new gaussian exp(-delta (r - D)^2 ) (x-A_x)^a \exp(-\alpha (x-A_x)^2
  gauss_ints = 0.d0
@@ -99,12 +171,12 @@ subroutine overlap_gauss_xyz_r12(D_center,delta,A_center,B_center,power_A,power_
   iorder_tmp(1) = lx
   do ly = 0, iorder_a_new(2)
    coefy = A_new(ly,2)
-   coefxy = coefx * coefy 
+   coefxy = coefx * coefy
    if(dabs(coefxy).lt.thr)cycle
    iorder_tmp(2) = ly
    do lz = 0, iorder_a_new(3)
     coefz = A_new(lz,3)
-    coefxyz = coefxy * coefz 
+    coefxyz = coefxy * coefz
     if(dabs(coefxyz).lt.thr)cycle
     iorder_tmp(3) = lz
     do m = 1, 3
@@ -129,7 +201,7 @@ double precision function overlap_gauss_xyz_r12_specific(D_center,delta,A_center
   ! Computes the following integral :
   !
   ! .. math::
-  ! 
+  !
   !    \int dr exp(-delta (r - D)^2 ) * x/y/z (x-A_x)^a (x-B_x)^b \exp(-\alpha (x-A_x)^2 - \beta (x-B_x)^2 )
   !
   ! with mx == 1 ==> x, mx == 2 ==> y, mx == 3 ==> z
@@ -137,13 +209,13 @@ double precision function overlap_gauss_xyz_r12_specific(D_center,delta,A_center
 
  implicit none
   include 'constants.include.F'
- double precision, intent(in)    :: D_center(3), delta  ! pure gaussian "D" 
+ double precision, intent(in)    :: D_center(3), delta  ! pure gaussian "D"
  double precision, intent(in)    :: A_center(3),B_center(3),alpha,beta ! gaussian/polynoms "A" and "B"
  integer, intent(in)             :: power_A(3),power_B(3),mx
 
  double precision  :: overlap_x,overlap_y,overlap_z,overlap
  ! First you multiply the usual gaussian "A" with the gaussian exp(-delta (r - D)^2 )
- double precision  :: A_new(0:max_dim,3)! new polynom 
+ double precision  :: A_new(0:max_dim,3)! new polynom
  double precision  :: A_center_new(3)   ! new center
  integer           :: iorder_a_new(3)   ! i_order(i) = order of the new polynom ==> should be equal to power_A
  integer           :: power_B_new(3)
@@ -155,8 +227,8 @@ double precision function overlap_gauss_xyz_r12_specific(D_center,delta,A_center
  thr = 1.d-10
  d = 0 ! order of the polynom for the gaussian exp(-delta (r - D)^2 )  == 0
 
- ! New gaussian/polynom defined by :: new pol new center new expo   cst fact new order                                
- call give_explicit_poly_and_gaussian(A_new , A_center_new , alpha_new, fact_a_new , iorder_a_new , & 
+ ! New gaussian/polynom defined by :: new pol new center new expo   cst fact new order
+ call give_explicit_poly_and_gaussian(A_new , A_center_new , alpha_new, fact_a_new , iorder_a_new , &
                                       delta,alpha,d,power_A,D_center,A_center,n_pt_max_integrals)
  ! The new gaussian exp(-delta (r - D)^2 ) (x-A_x)^a \exp(-\alpha (x-A_x)^2
  overlap_gauss_xyz_r12_specific = 0.d0
@@ -166,12 +238,12 @@ double precision function overlap_gauss_xyz_r12_specific(D_center,delta,A_center
   iorder_tmp(1) = lx
   do ly = 0, iorder_a_new(2)
    coefy = A_new(ly,2)
-   coefxy = coefx * coefy 
+   coefxy = coefx * coefy
    if(dabs(coefxy).lt.thr)cycle
    iorder_tmp(2) = ly
    do lz = 0, iorder_a_new(3)
     coefz = A_new(lz,3)
-    coefxyz = coefxy * coefz 
+    coefxyz = coefxy * coefz
     if(dabs(coefxyz).lt.thr)cycle
     iorder_tmp(3) = lz
     m = mx
diff --git a/src/utils/integration.irp.f b/src/utils/integration.irp.f
index c2bff2e8..fd9e233b 100644
--- a/src/utils/integration.irp.f
+++ b/src/utils/integration.irp.f
@@ -56,7 +56,7 @@ subroutine give_explicit_poly_and_gaussian(P_new,P_center,p,fact_k,iorder,alpha,
   !               * [ sum (l_y = 0,i_order(2)) P_new(l_y,2) * (y-P_center(2))^l_y ] exp (- p (y-P_center(2))^2 )
   !               * [ sum (l_z = 0,i_order(3)) P_new(l_z,3) * (z-P_center(3))^l_z ] exp (- p (z-P_center(3))^2 )
   !
-  ! WARNING ::: IF fact_k is too smal then: 
+  ! WARNING ::: IF fact_k is too smal then:
   ! returns a "s" function centered in zero
   ! with an inifinite exponent and a zero polynom coef
   END_DOC
@@ -86,13 +86,11 @@ subroutine give_explicit_poly_and_gaussian(P_new,P_center,p,fact_k,iorder,alpha,
   !DIR$ FORCEINLINE
   call gaussian_product(alpha,A_center,beta,B_center,fact_k,p,P_center)
   if (fact_k < thresh) then
-    ! IF fact_k is too smal then: 
+    ! IF fact_k is too smal then:
     ! returns a "s" function centered in zero
     ! with an inifinite exponent and a zero polynom coef
     P_center = 0.d0
     p = 1.d+15
-    P_new = 0.d0
-    iorder = 0
     fact_k = 0.d0
     return
   endif
@@ -129,6 +127,89 @@ subroutine give_explicit_poly_and_gaussian(P_new,P_center,p,fact_k,iorder,alpha,
 
 end
 
+!---
+
+subroutine give_explicit_poly_and_gaussian_v(P_new, ldp, P_center,p,fact_k,iorder,alpha,beta,a,b,A_center,B_center,n_points)
+  BEGIN_DOC
+  ! Transforms the product of
+  !          (x-x_A)^a(1) (x-x_B)^b(1) (x-x_A)^a(2) (y-y_B)^b(2) (z-z_A)^a(3) (z-z_B)^b(3) exp(-(r-A)^2 alpha) exp(-(r-B)^2 beta)
+  ! into
+  !        fact_k * [ sum (l_x = 0,i_order(1)) P_new(l_x,1) * (x-P_center(1))^l_x ] exp (- p (x-P_center(1))^2 )
+  !               * [ sum (l_y = 0,i_order(2)) P_new(l_y,2) * (y-P_center(2))^l_y ] exp (- p (y-P_center(2))^2 )
+  !               * [ sum (l_z = 0,i_order(3)) P_new(l_z,3) * (z-P_center(3))^l_z ] exp (- p (z-P_center(3))^2 )
+  !
+  ! WARNING                      :: : IF fact_k is too smal then:
+  ! returns a "s" function centered in zero
+  ! with an inifinite exponent and a zero polynom coef
+  END_DOC
+  implicit none
+  include 'constants.include.F'
+  integer, intent(in)            :: n_points, ldp
+  integer, intent(in)            :: a(3),b(3)         ! powers : (x-xa)**a_x = (x-A(1))**a(1)
+  double precision, intent(in)   :: alpha, beta       ! exponents
+  double precision, intent(in)   :: A_center(3,n_points) ! A center
+  double precision, intent(in)   :: B_center (3)         ! B center
+  double precision, intent(out)  :: P_center(3,n_points) ! new center
+  double precision, intent(out)  :: p                 ! new exponent
+  double precision, intent(out)  :: fact_k(n_points)            ! constant factor
+  double precision, intent(out)  :: P_new(0:ldp,3,n_points)! polynomial
+  integer, intent(out)           :: iorder(3)         ! i_order(i) = order of the polynomials
+
+  double precision, allocatable  :: P_a(:,:,:), P_b(:,:,:)
+
+  integer                        :: n_new,i,j, ipoint, lda, ldb, xyz
+
+  call gaussian_product_v(alpha,A_center,beta,B_center,fact_k,p,P_center,n_points)
+
+  if ( ior(ior(b(1),b(2)),b(3)) == 0 ) then  ! b == (0,0,0)
+
+    lda = maxval(a)
+    ldb = 0
+    allocate(P_a(0:lda,3,n_points),P_b(0:0,3,n_points))
+
+    call recentered_poly2_v0(P_a,lda,A_center,P_center,a,P_b,B_center,P_center,n_points)
+
+    iorder(1:3) = a(1:3)
+    do ipoint=1,n_points
+      do xyz=1,3
+        P_new(0,xyz,ipoint) = P_b(0,xyz,ipoint) * P_a(0,xyz,ipoint)
+        do i=1,a(xyz)
+          P_new(i,xyz,ipoint) = P_new(i,xyz,ipoint) + P_b(0,xyz,ipoint) * P_a(i,xyz,ipoint)
+        enddo
+      enddo
+    enddo
+    return
+
+  endif
+
+  lda = maxval(a)
+  ldb = maxval(b)
+  allocate(P_a(0:lda,3,n_points),P_b(0:ldb,3,n_points))
+  call recentered_poly2_v(P_a,lda,A_center,P_center,a,P_b,ldb,B_center,P_center,b,n_points)
+
+  iorder(1:3) = a(1:3) + b(1:3)
+  do xyz=1,3
+    if (b(xyz) == 0) then
+      do ipoint=1,n_points
+        P_new(0,xyz,ipoint) = P_b(0,xyz,ipoint) * P_a(0,xyz,ipoint)
+        do i=1,a(xyz)
+          P_new(i,xyz,ipoint) = P_new(i,xyz,ipoint) + P_b(0,xyz,ipoint) * P_a(i,xyz,ipoint)
+        enddo
+      enddo
+    else
+      do ipoint=1,n_points
+        do i=0,iorder(xyz)
+          P_new(i,xyz,ipoint) = 0.d0
+        enddo
+        n_new=0
+        call multiply_poly(P_a(0,xyz,ipoint),a(xyz),P_b(0,xyz,ipoint),b(xyz),P_new(0,xyz,ipoint),n_new)
+      enddo
+    endif
+  enddo
+
+end
+
+!-
 
 subroutine give_explicit_poly_and_gaussian_double(P_new,P_center,p,fact_k,iorder,alpha,beta,gama,a,b,A_center,B_center,Nucl_center,dim)
   BEGIN_DOC
@@ -231,6 +312,59 @@ subroutine gaussian_product(a,xa,b,xb,k,p,xp)
   xp(3) = (a*xa(3)+b*xb(3))*p_inv
 end subroutine
 
+!---
+subroutine gaussian_product_v(a,xa,b,xb,k,p,xp,n_points)
+  implicit none
+  BEGIN_DOC
+  ! Gaussian product in 1D.
+  ! e^{-a (x-x_A)^2} e^{-b (x-x_B)^2} = K_{ab}^x e^{-p (x-x_P)^2}
+  ! Using multiple A centers
+  END_DOC
+
+  integer, intent(in)            :: n_points
+  double precision, intent(in)   :: a,b         ! Exponents
+  double precision, intent(in)   :: xa(3,n_points),xb(3) ! Centers
+  double precision, intent(out)  :: p ! New exponent
+  double precision, intent(out)  :: xp(3,n_points)       ! New center
+  double precision, intent(out)  :: k(n_points) ! Constant
+
+  double precision               :: p_inv
+
+  integer :: ipoint
+  ASSERT (a>0.)
+  ASSERT (b>0.)
+
+  double precision               :: xab(3), ab, ap, bp, bpxb(3)
+  !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: xab
+
+  p = a+b
+  p_inv = 1.d0/(a+b)
+  ab = a*b*p_inv
+  ap = a*p_inv
+  bp = b*p_inv
+  bpxb(1) = bp*xb(1)
+  bpxb(2) = bp*xb(2)
+  bpxb(3) = bp*xb(3)
+
+  do ipoint=1,n_points
+    xab(1) = xa(1,ipoint)-xb(1)
+    xab(2) = xa(2,ipoint)-xb(2)
+    xab(3) = xa(3,ipoint)-xb(3)
+    k(ipoint) = ab*(xab(1)*xab(1)+xab(2)*xab(2)+xab(3)*xab(3))
+    if (k(ipoint) > 40.d0) then
+      k(ipoint)=0.d0
+      xp(1,ipoint) = 0.d0
+      xp(2,ipoint) = 0.d0
+      xp(3,ipoint) = 0.d0
+    else
+      k(ipoint) = dexp(-k(ipoint))
+      xp(1,ipoint) = ap*xa(1,ipoint)+bpxb(1)
+      xp(2,ipoint) = ap*xa(2,ipoint)+bpxb(2)
+      xp(3,ipoint) = ap*xa(3,ipoint)+bpxb(3)
+    endif
+  enddo
+end subroutine
+
 
 
 
@@ -404,22 +538,152 @@ subroutine recentered_poly2(P_new,x_A,x_P,a,P_new2,x_B,x_Q,b)
   do i =  minab+1,min(b,20)
     P_new2(i) =  binom_transp(b-i,b) * pows_b(b-i)
   enddo
-  do i =  101,a
+  do i =  21,a
     P_new(i) =  binom_func(a,a-i) * pows_a(a-i)
   enddo
-  do i =  101,b
+  do i =  21,b
     P_new2(i) =  binom_func(b,b-i) * pows_b(b-i)
   enddo
 end
 
-subroutine pol_modif_center(A_center, B_center, iorder, A_pol, B_pol) 
+!-
+subroutine recentered_poly2_v(P_new,lda,x_A,x_P,a,P_new2,ldb,x_B,x_Q,b,n_points)
+  implicit none
+  BEGIN_DOC
+  ! Recenter two polynomials
+  END_DOC
+  integer, intent(in)            :: a(3),b(3), n_points, lda, ldb
+  double precision, intent(in)   :: x_A(3,n_points),x_P(3,n_points),x_B(3),x_Q(3,n_points)
+  double precision, intent(out)  :: P_new(0:lda,3,n_points),P_new2(0:ldb,3,n_points)
+  double precision               :: binom_func
+  integer                        :: i,j,k,l, minab(3), maxab(3),ipoint, xyz
+  double precision, allocatable  :: pows_a(:,:), pows_b(:,:)
+  double precision :: fa, fb
+
+  maxab(1:3) = max(a(1:3),b(1:3))
+  minab(1:3) = max(min(a(1:3),b(1:3)),(/0,0,0/))
+
+  allocate( pows_a(n_points,-2:maxval(maxab)+4), pows_b(n_points,-2:maxval(maxab)+4) )
+
+
+  do xyz=1,3
+    if ((a(xyz)<0).or.(b(xyz)<0) ) cycle
+    do ipoint=1,n_points
+      pows_a(ipoint,0) = 1.d0
+      pows_a(ipoint,1) = (x_P(xyz,ipoint) - x_A(xyz,ipoint))
+      pows_b(ipoint,0) = 1.d0
+      pows_b(ipoint,1) = (x_Q(xyz,ipoint) - x_B(xyz))
+    enddo
+    do i =  2,maxab(xyz)
+      do ipoint=1,n_points
+        pows_a(ipoint,i) = pows_a(ipoint,i-1)*pows_a(ipoint,1)
+        pows_b(ipoint,i) = pows_b(ipoint,i-1)*pows_b(ipoint,1)
+      enddo
+    enddo
+    do ipoint=1,n_points
+      P_new (0,xyz,ipoint) =  pows_a(ipoint,a(xyz))
+      P_new2(0,xyz,ipoint) =  pows_b(ipoint,b(xyz))
+    enddo
+    do i =  1,min(minab(xyz),20)
+      fa =  binom_transp(a(xyz)-i,a(xyz))
+      fb =  binom_transp(b(xyz)-i,b(xyz))
+      do ipoint=1,n_points
+        P_new (i,xyz,ipoint) =  fa * pows_a(ipoint,a(xyz)-i)
+        P_new2(i,xyz,ipoint) =  fb * pows_b(ipoint,b(xyz)-i)
+      enddo
+    enddo
+    do i =  minab(xyz)+1,min(a(xyz),20)
+      fa =  binom_transp(a(xyz)-i,a(xyz))
+      do ipoint=1,n_points
+        P_new (i,xyz,ipoint) =  fa * pows_a(ipoint,a(xyz)-i)
+      enddo
+    enddo
+    do i =  minab(xyz)+1,min(b(xyz),20)
+      fb =  binom_transp(b(xyz)-i,b(xyz))
+      do ipoint=1,n_points
+        P_new2(i,xyz,ipoint) =  fb * pows_b(ipoint,b(xyz)-i)
+      enddo
+    enddo
+    do i =  21,a(xyz)
+      fa =  binom_func(a(xyz),a(xyz)-i)
+      do ipoint=1,n_points
+        P_new (i,xyz,ipoint) =  fa * pows_a(ipoint,a(xyz)-i)
+      enddo
+    enddo
+    do i =  21,b(xyz)
+      fb = binom_func(b(xyz),b(xyz)-i)
+      do ipoint=1,n_points
+        P_new2(i,xyz,ipoint) =  fb * pows_b(ipoint,b(xyz)-i)
+      enddo
+    enddo
+  enddo
+end
+
+
+subroutine recentered_poly2_v0(P_new,lda,x_A,x_P,a,P_new2,x_B,x_Q,n_points)
+  implicit none
+  BEGIN_DOC
+  ! Recenter two polynomials. Special case for b=(0,0,0)
+  END_DOC
+  integer, intent(in)            :: a(3), n_points, lda
+  double precision, intent(in)   :: x_A(3,n_points),x_P(3,n_points),x_B(3),x_Q(3,n_points)
+  double precision, intent(out)  :: P_new(0:lda,3,n_points),P_new2(3,n_points)
+  double precision               :: binom_func
+  integer                        :: i,j,k,l, xyz, ipoint, maxab(3)
+  double precision, allocatable  :: pows_a(:,:), pows_b(:,:)
+  double precision               :: fa
+
+  maxab(1:3) = max(a(1:3),(/0,0,0/))
+
+  allocate( pows_a(n_points,-2:maxval(maxab)+4), pows_b(n_points,-2:maxval(maxab)+4) )
+
+  do xyz=1,3
+    if (a(xyz)<0) cycle
+    do ipoint=1,n_points
+      pows_a(ipoint,0) = 1.d0
+      pows_a(ipoint,1) = (x_P(xyz,ipoint) - x_A(xyz,ipoint))
+      pows_b(ipoint,0) = 1.d0
+      pows_b(ipoint,1) = (x_Q(xyz,ipoint) - x_B(xyz))
+    enddo
+    do i = 2,maxab(xyz)
+      do ipoint=1,n_points
+        pows_a(ipoint,i) = pows_a(ipoint,i-1)*pows_a(ipoint,1)
+        pows_b(ipoint,i) = pows_b(ipoint,i-1)*pows_b(ipoint,1)
+      enddo
+    enddo
+    do ipoint=1,n_points
+      P_new (0,xyz,ipoint) =  pows_a(ipoint,a(xyz))
+      P_new2(xyz,ipoint) =  pows_b(ipoint,0)
+    enddo
+    do i =  1,min(a(xyz),20)
+      fa = binom_transp(a(xyz)-i,a(xyz))
+      do ipoint=1,n_points
+        P_new (i,xyz,ipoint) =  fa * pows_a(ipoint,a(xyz)-i)
+      enddo
+    enddo
+    do i = 21,a(xyz)
+      fa = binom_func(a(xyz),a(xyz)-i)
+      do ipoint=1,n_points
+        P_new (i,xyz,ipoint) =  fa * pows_a(ipoint,a(xyz)-i)
+      enddo
+    enddo
+
+  enddo !xyz
+
+  deallocate(pows_a, pows_b)
+end
+
+!--
+!--
+
+subroutine pol_modif_center(A_center, B_center, iorder, A_pol, B_pol)
 
   BEGIN_DOC
-  ! 
+  !
   ! Transform the pol centerd on A:
-  !       [ \sum_i ax_i (x-x_A)^i ] [ \sum_j ay_j (y-y_A)^j ] [ \sum_k az_k (z-z_A)^k ] 
+  !       [ \sum_i ax_i (x-x_A)^i ] [ \sum_j ay_j (y-y_A)^j ] [ \sum_k az_k (z-z_A)^k ]
   ! to a pol centered on B
-  !       [ \sum_i bx_i (x-x_B)^i ] [ \sum_j by_j (y-y_B)^j ] [ \sum_k bz_k (z-z_B)^k ] 
+  !       [ \sum_i bx_i (x-x_B)^i ] [ \sum_j by_j (y-y_B)^j ] [ \sum_k bz_k (z-z_B)^k ]
   !
   END_DOC
 
@@ -437,7 +701,7 @@ subroutine pol_modif_center(A_center, B_center, iorder, A_pol, B_pol)
 
   do i = 1, 3
     Lmax = iorder(i)
-    call pol_modif_center_x( A_center(i), B_center(i), Lmax, A_pol(0:Lmax, i), B_pol(0:Lmax, i) ) 
+    call pol_modif_center_x( A_center(i), B_center(i), Lmax, A_pol(0:Lmax, i), B_pol(0:Lmax, i) )
   enddo
 
   return
@@ -445,14 +709,14 @@ end subroutine pol_modif_center
 
 
 
-subroutine pol_modif_center_x(A_center, B_center, iorder, A_pol, B_pol) 
+subroutine pol_modif_center_x(A_center, B_center, iorder, A_pol, B_pol)
 
   BEGIN_DOC
-  ! 
+  !
   ! Transform the pol centerd on A:
-  !       [ \sum_i ax_i (x-x_A)^i ] 
+  !       [ \sum_i ax_i (x-x_A)^i ]
   ! to a pol centered on B
-  !       [ \sum_i bx_i (x-x_B)^i ] 
+  !       [ \sum_i bx_i (x-x_B)^i ]
   !
   ! bx_i = \sum_{j=i}^{iorder} ax_j (x_B - x_A)^(j-i) j! / [ i! (j-i)! ]
   !      = \sum_{j=i}^{iorder} ax_j (x_B - x_A)^(j-i) binom_func(j,i)
@@ -591,7 +855,7 @@ double precision function rint_sum(n_pt_out,rho,d1)
       u_inv=1.d0/dsqrt(rho)
       u=rho*u_inv
       rint_sum=0.5d0*u_inv*sqpi*derf(u) *d1(0)
-!      print *, 0, d1(0), 0.5d0*u_inv*sqpi*derf(u) 
+!      print *, 0, d1(0), 0.5d0*u_inv*sqpi*derf(u)
     endif
 
     do i=2,n_pt_out,2
diff --git a/src/utils/map_module.f90 b/src/utils/map_module.f90
index 98e73470..ceaec874 100644
--- a/src/utils/map_module.f90
+++ b/src/utils/map_module.f90
@@ -238,11 +238,11 @@ subroutine cache_map_sort(map)
       iorder(i) = i
     enddo
     if (cache_key_kind == 2) then
-      call i2radix_sort(map%key,iorder,map%n_elements,-1)
+      call i2sort(map%key,iorder,map%n_elements,-1)
     else if (cache_key_kind == 4) then
-      call iradix_sort(map%key,iorder,map%n_elements,-1)
+      call isort(map%key,iorder,map%n_elements,-1)
     else if (cache_key_kind == 8) then
-      call i8radix_sort(map%key,iorder,map%n_elements,-1)
+      call i8sort(map%key,iorder,map%n_elements,-1)
     endif
     if (integral_kind == 4) then
       call set_order(map%value,iorder,map%n_elements)
diff --git a/src/utils/qsort.c b/src/utils/qsort.c
new file mode 100644
index 00000000..c011b35a
--- /dev/null
+++ b/src/utils/qsort.c
@@ -0,0 +1,373 @@
+/* [[file:~/qp2/src/utils/qsort.org::*Generated%20C%20file][Generated C file:1]] */
+#include <stdlib.h>
+#include <stdint.h>
+
+struct int16_t_comp {
+  int16_t    x;
+  int32_t i;
+};
+
+int compare_int16_t( const void * l, const void * r )
+{
+  const int16_t * restrict _l= l;
+  const int16_t * restrict _r= r;
+  if( *_l > *_r ) return 1;
+  if( *_l < *_r ) return -1;
+  return 0;
+}
+
+void qsort_int16_t(int16_t* restrict A_in, int32_t* restrict iorder, int32_t isize) {
+  struct int16_t_comp* A = malloc(isize * sizeof(struct int16_t_comp));
+  if (A == NULL) return;
+
+  for (int i=0 ; i<isize ; ++i) {
+    A[i].x = A_in[i];
+    A[i].i = iorder[i];
+  }
+
+  qsort( (void*) A, (size_t) isize, sizeof(struct int16_t_comp), compare_int16_t);
+
+  for (int i=0 ; i<isize ; ++i) {
+    A_in[i] = A[i].x;
+    iorder[i] = A[i].i;
+  }
+  free(A);
+}
+
+void qsort_int16_t_noidx(int16_t* A, int32_t isize) {
+  qsort( (void*) A, (size_t) isize, sizeof(int16_t), compare_int16_t);
+}
+
+
+struct int16_t_comp_big {
+  int16_t    x;
+  int64_t i;
+};
+
+int compare_int16_t_big( const void * l, const void * r )
+{
+  const int16_t * restrict _l= l;
+  const int16_t * restrict _r= r;
+  if( *_l > *_r ) return 1;
+  if( *_l < *_r ) return -1;
+  return 0;
+}
+
+void qsort_int16_t_big(int16_t* restrict A_in, int64_t* restrict iorder, int64_t isize) {
+  struct int16_t_comp_big* A = malloc(isize * sizeof(struct int16_t_comp_big));
+  if (A == NULL) return;
+
+  for (int i=0 ; i<isize ; ++i) {
+    A[i].x = A_in[i];
+    A[i].i = iorder[i];
+  }
+
+  qsort( (void*) A, (size_t) isize, sizeof(struct int16_t_comp_big), compare_int16_t_big);
+
+  for (int i=0 ; i<isize ; ++i) {
+    A_in[i] = A[i].x;
+    iorder[i] = A[i].i;
+  }
+  free(A);
+}
+
+void qsort_int16_t_noidx_big(int16_t* A, int64_t isize) {
+  qsort( (void*) A, (size_t) isize, sizeof(int16_t), compare_int16_t_big);
+}
+
+
+struct int32_t_comp {
+  int32_t    x;
+  int32_t i;
+};
+
+int compare_int32_t( const void * l, const void * r )
+{
+  const int32_t * restrict _l= l;
+  const int32_t * restrict _r= r;
+  if( *_l > *_r ) return 1;
+  if( *_l < *_r ) return -1;
+  return 0;
+}
+
+void qsort_int32_t(int32_t* restrict A_in, int32_t* restrict iorder, int32_t isize) {
+  struct int32_t_comp* A = malloc(isize * sizeof(struct int32_t_comp));
+  if (A == NULL) return;
+
+  for (int i=0 ; i<isize ; ++i) {
+    A[i].x = A_in[i];
+    A[i].i = iorder[i];
+  }
+
+  qsort( (void*) A, (size_t) isize, sizeof(struct int32_t_comp), compare_int32_t);
+
+  for (int i=0 ; i<isize ; ++i) {
+    A_in[i] = A[i].x;
+    iorder[i] = A[i].i;
+  }
+  free(A);
+}
+
+void qsort_int32_t_noidx(int32_t* A, int32_t isize) {
+  qsort( (void*) A, (size_t) isize, sizeof(int32_t), compare_int32_t);
+}
+
+
+struct int32_t_comp_big {
+  int32_t    x;
+  int64_t i;
+};
+
+int compare_int32_t_big( const void * l, const void * r )
+{
+  const int32_t * restrict _l= l;
+  const int32_t * restrict _r= r;
+  if( *_l > *_r ) return 1;
+  if( *_l < *_r ) return -1;
+  return 0;
+}
+
+void qsort_int32_t_big(int32_t* restrict A_in, int64_t* restrict iorder, int64_t isize) {
+  struct int32_t_comp_big* A = malloc(isize * sizeof(struct int32_t_comp_big));
+  if (A == NULL) return;
+
+  for (int i=0 ; i<isize ; ++i) {
+    A[i].x = A_in[i];
+    A[i].i = iorder[i];
+  }
+
+  qsort( (void*) A, (size_t) isize, sizeof(struct int32_t_comp_big), compare_int32_t_big);
+
+  for (int i=0 ; i<isize ; ++i) {
+    A_in[i] = A[i].x;
+    iorder[i] = A[i].i;
+  }
+  free(A);
+}
+
+void qsort_int32_t_noidx_big(int32_t* A, int64_t isize) {
+  qsort( (void*) A, (size_t) isize, sizeof(int32_t), compare_int32_t_big);
+}
+
+
+struct int64_t_comp {
+  int64_t    x;
+  int32_t i;
+};
+
+int compare_int64_t( const void * l, const void * r )
+{
+  const int64_t * restrict _l= l;
+  const int64_t * restrict _r= r;
+  if( *_l > *_r ) return 1;
+  if( *_l < *_r ) return -1;
+  return 0;
+}
+
+void qsort_int64_t(int64_t* restrict A_in, int32_t* restrict iorder, int32_t isize) {
+  struct int64_t_comp* A = malloc(isize * sizeof(struct int64_t_comp));
+  if (A == NULL) return;
+
+  for (int i=0 ; i<isize ; ++i) {
+    A[i].x = A_in[i];
+    A[i].i = iorder[i];
+  }
+
+  qsort( (void*) A, (size_t) isize, sizeof(struct int64_t_comp), compare_int64_t);
+
+  for (int i=0 ; i<isize ; ++i) {
+    A_in[i] = A[i].x;
+    iorder[i] = A[i].i;
+  }
+  free(A);
+}
+
+void qsort_int64_t_noidx(int64_t* A, int32_t isize) {
+  qsort( (void*) A, (size_t) isize, sizeof(int64_t), compare_int64_t);
+}
+
+
+struct int64_t_comp_big {
+  int64_t    x;
+  int64_t i;
+};
+
+int compare_int64_t_big( const void * l, const void * r )
+{
+  const int64_t * restrict _l= l;
+  const int64_t * restrict _r= r;
+  if( *_l > *_r ) return 1;
+  if( *_l < *_r ) return -1;
+  return 0;
+}
+
+void qsort_int64_t_big(int64_t* restrict A_in, int64_t* restrict iorder, int64_t isize) {
+  struct int64_t_comp_big* A = malloc(isize * sizeof(struct int64_t_comp_big));
+  if (A == NULL) return;
+
+  for (int i=0 ; i<isize ; ++i) {
+    A[i].x = A_in[i];
+    A[i].i = iorder[i];
+  }
+
+  qsort( (void*) A, (size_t) isize, sizeof(struct int64_t_comp_big), compare_int64_t_big);
+
+  for (int i=0 ; i<isize ; ++i) {
+    A_in[i] = A[i].x;
+    iorder[i] = A[i].i;
+  }
+  free(A);
+}
+
+void qsort_int64_t_noidx_big(int64_t* A, int64_t isize) {
+  qsort( (void*) A, (size_t) isize, sizeof(int64_t), compare_int64_t_big);
+}
+
+
+struct double_comp {
+  double    x;
+  int32_t i;
+};
+
+int compare_double( const void * l, const void * r )
+{
+  const double * restrict _l= l;
+  const double * restrict _r= r;
+  if( *_l > *_r ) return 1;
+  if( *_l < *_r ) return -1;
+  return 0;
+}
+
+void qsort_double(double* restrict A_in, int32_t* restrict iorder, int32_t isize) {
+  struct double_comp* A = malloc(isize * sizeof(struct double_comp));
+  if (A == NULL) return;
+
+  for (int i=0 ; i<isize ; ++i) {
+    A[i].x = A_in[i];
+    A[i].i = iorder[i];
+  }
+
+  qsort( (void*) A, (size_t) isize, sizeof(struct double_comp), compare_double);
+
+  for (int i=0 ; i<isize ; ++i) {
+    A_in[i] = A[i].x;
+    iorder[i] = A[i].i;
+  }
+  free(A);
+}
+
+void qsort_double_noidx(double* A, int32_t isize) {
+  qsort( (void*) A, (size_t) isize, sizeof(double), compare_double);
+}
+
+
+struct double_comp_big {
+  double    x;
+  int64_t i;
+};
+
+int compare_double_big( const void * l, const void * r )
+{
+  const double * restrict _l= l;
+  const double * restrict _r= r;
+  if( *_l > *_r ) return 1;
+  if( *_l < *_r ) return -1;
+  return 0;
+}
+
+void qsort_double_big(double* restrict A_in, int64_t* restrict iorder, int64_t isize) {
+  struct double_comp_big* A = malloc(isize * sizeof(struct double_comp_big));
+  if (A == NULL) return;
+
+  for (int i=0 ; i<isize ; ++i) {
+    A[i].x = A_in[i];
+    A[i].i = iorder[i];
+  }
+
+  qsort( (void*) A, (size_t) isize, sizeof(struct double_comp_big), compare_double_big);
+
+  for (int i=0 ; i<isize ; ++i) {
+    A_in[i] = A[i].x;
+    iorder[i] = A[i].i;
+  }
+  free(A);
+}
+
+void qsort_double_noidx_big(double* A, int64_t isize) {
+  qsort( (void*) A, (size_t) isize, sizeof(double), compare_double_big);
+}
+
+
+struct float_comp {
+  float    x;
+  int32_t i;
+};
+
+int compare_float( const void * l, const void * r )
+{
+  const float * restrict _l= l;
+  const float * restrict _r= r;
+  if( *_l > *_r ) return 1;
+  if( *_l < *_r ) return -1;
+  return 0;
+}
+
+void qsort_float(float* restrict A_in, int32_t* restrict iorder, int32_t isize) {
+  struct float_comp* A = malloc(isize * sizeof(struct float_comp));
+  if (A == NULL) return;
+
+  for (int i=0 ; i<isize ; ++i) {
+    A[i].x = A_in[i];
+    A[i].i = iorder[i];
+  }
+
+  qsort( (void*) A, (size_t) isize, sizeof(struct float_comp), compare_float);
+
+  for (int i=0 ; i<isize ; ++i) {
+    A_in[i] = A[i].x;
+    iorder[i] = A[i].i;
+  }
+  free(A);
+}
+
+void qsort_float_noidx(float* A, int32_t isize) {
+  qsort( (void*) A, (size_t) isize, sizeof(float), compare_float);
+}
+
+
+struct float_comp_big {
+  float    x;
+  int64_t i;
+};
+
+int compare_float_big( const void * l, const void * r )
+{
+  const float * restrict _l= l;
+  const float * restrict _r= r;
+  if( *_l > *_r ) return 1;
+  if( *_l < *_r ) return -1;
+  return 0;
+}
+
+void qsort_float_big(float* restrict A_in, int64_t* restrict iorder, int64_t isize) {
+  struct float_comp_big* A = malloc(isize * sizeof(struct float_comp_big));
+  if (A == NULL) return;
+
+  for (int i=0 ; i<isize ; ++i) {
+    A[i].x = A_in[i];
+    A[i].i = iorder[i];
+  }
+
+  qsort( (void*) A, (size_t) isize, sizeof(struct float_comp_big), compare_float_big);
+
+  for (int i=0 ; i<isize ; ++i) {
+    A_in[i] = A[i].x;
+    iorder[i] = A[i].i;
+  }
+  free(A);
+}
+
+void qsort_float_noidx_big(float* A, int64_t isize) {
+  qsort( (void*) A, (size_t) isize, sizeof(float), compare_float_big);
+}
+/* Generated C file:1 ends here */
diff --git a/src/utils/qsort.org b/src/utils/qsort.org
new file mode 100644
index 00000000..abf0d54a
--- /dev/null
+++ b/src/utils/qsort.org
@@ -0,0 +1,169 @@
+#+TITLE: Quick sort binding for Fortran
+
+* C template
+
+#+NAME: c_template
+#+BEGIN_SRC c
+struct TYPE_comp_big {
+  TYPE    x;
+  int32_t i;
+};
+
+int compare_TYPE_big( const void * l, const void * r )
+{
+  const TYPE * restrict _l= l;
+  const TYPE * restrict _r= r;
+  if( *_l > *_r ) return 1;
+  if( *_l < *_r ) return -1;
+  return 0;
+}
+
+void qsort_TYPE_big(TYPE* restrict A_in, int32_t* restrict iorder, int32_t isize) {
+  struct TYPE_comp_big* A = malloc(isize * sizeof(struct TYPE_comp_big));
+  if (A == NULL) return;
+
+  for (int i=0 ; i<isize ; ++i) {
+    A[i].x = A_in[i];
+    A[i].i = iorder[i];
+  }
+
+  qsort( (void*) A, (size_t) isize, sizeof(struct TYPE_comp_big), compare_TYPE_big);
+
+  for (int i=0 ; i<isize ; ++i) {
+    A_in[i] = A[i].x;
+    iorder[i] = A[i].i;
+  }
+  free(A);
+}
+
+void qsort_TYPE_noidx_big(TYPE* A, int32_t isize) {
+  qsort( (void*) A, (size_t) isize, sizeof(TYPE), compare_TYPE_big);
+}
+#+END_SRC
+
+* Fortran template
+
+#+NAME:f_template
+#+BEGIN_SRC f90
+subroutine Lsort_big_c(A, iorder, isize) bind(C, name="qsort_TYPE_big")
+  use iso_c_binding
+  integer(c_int32_t), value :: isize
+  integer(c_int32_t)        :: iorder(isize)
+  real   (c_TYPE)         :: A(isize)
+end subroutine Lsort_big_c
+
+subroutine Lsort_noidx_big_c(A, isize) bind(C, name="qsort_TYPE_noidx_big")
+  use iso_c_binding
+  integer(c_int32_t), value :: isize
+  real   (c_TYPE)         :: A(isize)
+end subroutine Lsort_noidx_big_c
+
+#+END_SRC
+
+#+NAME:f_template2
+#+BEGIN_SRC f90
+subroutine Lsort_big(A, iorder, isize) 
+  use qsort_module
+  use iso_c_binding
+  integer(c_int32_t)        :: isize
+  integer(c_int32_t)        :: iorder(isize)
+  real   (c_TYPE)         :: A(isize)
+  call Lsort_big_c(A, iorder, isize)
+end subroutine Lsort_big
+
+subroutine Lsort_noidx_big(A, isize)
+  use iso_c_binding
+  use qsort_module
+  integer(c_int32_t) :: isize
+  real   (c_TYPE)    :: A(isize)
+  call Lsort_noidx_big_c(A, isize)
+end subroutine Lsort_noidx_big
+
+#+END_SRC
+
+* Python scripts for type replacements
+
+#+NAME: replaced
+#+begin_src python :results output :noweb yes
+data = """
+<<c_template>>
+"""
+for typ in ["int16_t", "int32_t", "int64_t", "double", "float"]:
+    print( data.replace("TYPE", typ).replace("_big", "") )
+    print( data.replace("int32_t", "int64_t").replace("TYPE", typ) )
+#+end_src
+
+#+NAME: replaced_f
+#+begin_src python :results output :noweb yes
+data = """
+<<f_template>>
+"""
+c1 = {
+    "int16_t": "i2",
+    "int32_t": "i",
+    "int64_t": "i8",
+    "double": "d",
+    "float": ""
+}
+c2 = {
+    "int16_t": "integer",
+    "int32_t": "integer",
+    "int64_t": "integer",
+    "double": "real",
+    "float": "real"
+}
+
+for typ in ["int16_t", "int32_t", "int64_t", "double", "float"]:
+    print( data.replace("real",c2[typ]).replace("L",c1[typ]).replace("TYPE", typ).replace("_big", "") )
+    print( data.replace("real",c2[typ]).replace("L",c1[typ]).replace("int32_t", "int64_t").replace("TYPE", typ) )
+#+end_src
+
+#+NAME: replaced_f2
+#+begin_src python :results output :noweb yes
+data = """
+<<f_template2>>
+"""
+c1 = {
+    "int16_t": "i2",
+    "int32_t": "i",
+    "int64_t": "i8",
+    "double": "d",
+    "float": ""
+}
+c2 = {
+    "int16_t": "integer",
+    "int32_t": "integer",
+    "int64_t": "integer",
+    "double": "real",
+    "float": "real"
+}
+
+for typ in ["int16_t", "int32_t", "int64_t", "double", "float"]:
+    print( data.replace("real",c2[typ]).replace("L",c1[typ]).replace("TYPE", typ).replace("_big", "") )
+    print( data.replace("real",c2[typ]).replace("L",c1[typ]).replace("int32_t", "int64_t").replace("TYPE", typ) )
+#+end_src
+
+* Generated C file
+  
+#+BEGIN_SRC c :comments link :tangle qsort.c :noweb yes
+#include <stdlib.h>
+#include <stdint.h>
+<<replaced()>>
+#+END_SRC
+
+* Generated Fortran file
+
+#+BEGIN_SRC f90 :tangle qsort_module.f90 :noweb yes
+module qsort_module
+  use iso_c_binding
+  
+  interface
+     <<replaced_f()>>
+  end interface
+
+end module qsort_module
+
+<<replaced_f2()>>
+
+#+END_SRC
+
diff --git a/src/utils/qsort_module.f90 b/src/utils/qsort_module.f90
new file mode 100644
index 00000000..a72a4f9e
--- /dev/null
+++ b/src/utils/qsort_module.f90
@@ -0,0 +1,347 @@
+module qsort_module
+  use iso_c_binding
+  
+  interface
+     
+     subroutine i2sort_c(A, iorder, isize) bind(C, name="qsort_int16_t")
+       use iso_c_binding
+       integer(c_int32_t), value :: isize
+       integer(c_int32_t)        :: iorder(isize)
+       integer   (c_int16_t)         :: A(isize)
+     end subroutine i2sort_c
+     
+     subroutine i2sort_noidx_c(A, isize) bind(C, name="qsort_int16_t_noidx")
+       use iso_c_binding
+       integer(c_int32_t), value :: isize
+       integer   (c_int16_t)         :: A(isize)
+     end subroutine i2sort_noidx_c
+     
+     
+     
+     subroutine i2sort_big_c(A, iorder, isize) bind(C, name="qsort_int16_t_big")
+       use iso_c_binding
+       integer(c_int64_t), value :: isize
+       integer(c_int64_t)        :: iorder(isize)
+       integer   (c_int16_t)         :: A(isize)
+     end subroutine i2sort_big_c
+     
+     subroutine i2sort_noidx_big_c(A, isize) bind(C, name="qsort_int16_t_noidx_big")
+       use iso_c_binding
+       integer(c_int64_t), value :: isize
+       integer   (c_int16_t)         :: A(isize)
+     end subroutine i2sort_noidx_big_c
+     
+     
+     
+     subroutine isort_c(A, iorder, isize) bind(C, name="qsort_int32_t")
+       use iso_c_binding
+       integer(c_int32_t), value :: isize
+       integer(c_int32_t)        :: iorder(isize)
+       integer   (c_int32_t)         :: A(isize)
+     end subroutine isort_c
+     
+     subroutine isort_noidx_c(A, isize) bind(C, name="qsort_int32_t_noidx")
+       use iso_c_binding
+       integer(c_int32_t), value :: isize
+       integer   (c_int32_t)         :: A(isize)
+     end subroutine isort_noidx_c
+     
+     
+     
+     subroutine isort_big_c(A, iorder, isize) bind(C, name="qsort_int32_t_big")
+       use iso_c_binding
+       integer(c_int64_t), value :: isize
+       integer(c_int64_t)        :: iorder(isize)
+       integer   (c_int32_t)         :: A(isize)
+     end subroutine isort_big_c
+     
+     subroutine isort_noidx_big_c(A, isize) bind(C, name="qsort_int32_t_noidx_big")
+       use iso_c_binding
+       integer(c_int64_t), value :: isize
+       integer   (c_int32_t)         :: A(isize)
+     end subroutine isort_noidx_big_c
+     
+     
+     
+     subroutine i8sort_c(A, iorder, isize) bind(C, name="qsort_int64_t")
+       use iso_c_binding
+       integer(c_int32_t), value :: isize
+       integer(c_int32_t)        :: iorder(isize)
+       integer   (c_int64_t)         :: A(isize)
+     end subroutine i8sort_c
+     
+     subroutine i8sort_noidx_c(A, isize) bind(C, name="qsort_int64_t_noidx")
+       use iso_c_binding
+       integer(c_int32_t), value :: isize
+       integer   (c_int64_t)         :: A(isize)
+     end subroutine i8sort_noidx_c
+     
+     
+     
+     subroutine i8sort_big_c(A, iorder, isize) bind(C, name="qsort_int64_t_big")
+       use iso_c_binding
+       integer(c_int64_t), value :: isize
+       integer(c_int64_t)        :: iorder(isize)
+       integer   (c_int64_t)         :: A(isize)
+     end subroutine i8sort_big_c
+     
+     subroutine i8sort_noidx_big_c(A, isize) bind(C, name="qsort_int64_t_noidx_big")
+       use iso_c_binding
+       integer(c_int64_t), value :: isize
+       integer   (c_int64_t)         :: A(isize)
+     end subroutine i8sort_noidx_big_c
+     
+     
+     
+     subroutine dsort_c(A, iorder, isize) bind(C, name="qsort_double")
+       use iso_c_binding
+       integer(c_int32_t), value :: isize
+       integer(c_int32_t)        :: iorder(isize)
+       real   (c_double)         :: A(isize)
+     end subroutine dsort_c
+     
+     subroutine dsort_noidx_c(A, isize) bind(C, name="qsort_double_noidx")
+       use iso_c_binding
+       integer(c_int32_t), value :: isize
+       real   (c_double)         :: A(isize)
+     end subroutine dsort_noidx_c
+     
+     
+     
+     subroutine dsort_big_c(A, iorder, isize) bind(C, name="qsort_double_big")
+       use iso_c_binding
+       integer(c_int64_t), value :: isize
+       integer(c_int64_t)        :: iorder(isize)
+       real   (c_double)         :: A(isize)
+     end subroutine dsort_big_c
+     
+     subroutine dsort_noidx_big_c(A, isize) bind(C, name="qsort_double_noidx_big")
+       use iso_c_binding
+       integer(c_int64_t), value :: isize
+       real   (c_double)         :: A(isize)
+     end subroutine dsort_noidx_big_c
+     
+     
+     
+     subroutine sort_c(A, iorder, isize) bind(C, name="qsort_float")
+       use iso_c_binding
+       integer(c_int32_t), value :: isize
+       integer(c_int32_t)        :: iorder(isize)
+       real   (c_float)         :: A(isize)
+     end subroutine sort_c
+     
+     subroutine sort_noidx_c(A, isize) bind(C, name="qsort_float_noidx")
+       use iso_c_binding
+       integer(c_int32_t), value :: isize
+       real   (c_float)         :: A(isize)
+     end subroutine sort_noidx_c
+     
+     
+     
+     subroutine sort_big_c(A, iorder, isize) bind(C, name="qsort_float_big")
+       use iso_c_binding
+       integer(c_int64_t), value :: isize
+       integer(c_int64_t)        :: iorder(isize)
+       real   (c_float)         :: A(isize)
+     end subroutine sort_big_c
+     
+     subroutine sort_noidx_big_c(A, isize) bind(C, name="qsort_float_noidx_big")
+       use iso_c_binding
+       integer(c_int64_t), value :: isize
+       real   (c_float)         :: A(isize)
+     end subroutine sort_noidx_big_c
+     
+     
+     
+  end interface
+
+end module qsort_module
+
+
+subroutine i2sort(A, iorder, isize) 
+  use qsort_module
+  use iso_c_binding
+  integer(c_int32_t)        :: isize
+  integer(c_int32_t)        :: iorder(isize)
+  integer   (c_int16_t)         :: A(isize)
+  call i2sort_c(A, iorder, isize)
+end subroutine i2sort
+
+subroutine i2sort_noidx(A, isize)
+  use iso_c_binding
+  use qsort_module
+  integer(c_int32_t) :: isize
+  integer   (c_int16_t)    :: A(isize)
+  call i2sort_noidx_c(A, isize)
+end subroutine i2sort_noidx
+
+
+
+subroutine i2sort_big(A, iorder, isize) 
+  use qsort_module
+  use iso_c_binding
+  integer(c_int64_t)        :: isize
+  integer(c_int64_t)        :: iorder(isize)
+  integer   (c_int16_t)         :: A(isize)
+  call i2sort_big_c(A, iorder, isize)
+end subroutine i2sort_big
+
+subroutine i2sort_noidx_big(A, isize)
+  use iso_c_binding
+  use qsort_module
+  integer(c_int64_t) :: isize
+  integer   (c_int16_t)    :: A(isize)
+  call i2sort_noidx_big_c(A, isize)
+end subroutine i2sort_noidx_big
+
+
+
+subroutine isort(A, iorder, isize) 
+  use qsort_module
+  use iso_c_binding
+  integer(c_int32_t)        :: isize
+  integer(c_int32_t)        :: iorder(isize)
+  integer   (c_int32_t)         :: A(isize)
+  call isort_c(A, iorder, isize)
+end subroutine isort
+
+subroutine isort_noidx(A, isize)
+  use iso_c_binding
+  use qsort_module
+  integer(c_int32_t) :: isize
+  integer   (c_int32_t)    :: A(isize)
+  call isort_noidx_c(A, isize)
+end subroutine isort_noidx
+
+
+
+subroutine isort_big(A, iorder, isize) 
+  use qsort_module
+  use iso_c_binding
+  integer(c_int64_t)        :: isize
+  integer(c_int64_t)        :: iorder(isize)
+  integer   (c_int32_t)         :: A(isize)
+  call isort_big_c(A, iorder, isize)
+end subroutine isort_big
+
+subroutine isort_noidx_big(A, isize)
+  use iso_c_binding
+  use qsort_module
+  integer(c_int64_t) :: isize
+  integer   (c_int32_t)    :: A(isize)
+  call isort_noidx_big_c(A, isize)
+end subroutine isort_noidx_big
+
+
+
+subroutine i8sort(A, iorder, isize) 
+  use qsort_module
+  use iso_c_binding
+  integer(c_int32_t)        :: isize
+  integer(c_int32_t)        :: iorder(isize)
+  integer   (c_int64_t)         :: A(isize)
+  call i8sort_c(A, iorder, isize)
+end subroutine i8sort
+
+subroutine i8sort_noidx(A, isize)
+  use iso_c_binding
+  use qsort_module
+  integer(c_int32_t) :: isize
+  integer   (c_int64_t)    :: A(isize)
+  call i8sort_noidx_c(A, isize)
+end subroutine i8sort_noidx
+
+
+
+subroutine i8sort_big(A, iorder, isize) 
+  use qsort_module
+  use iso_c_binding
+  integer(c_int64_t)        :: isize
+  integer(c_int64_t)        :: iorder(isize)
+  integer   (c_int64_t)         :: A(isize)
+  call i8sort_big_c(A, iorder, isize)
+end subroutine i8sort_big
+
+subroutine i8sort_noidx_big(A, isize)
+  use iso_c_binding
+  use qsort_module
+  integer(c_int64_t) :: isize
+  integer   (c_int64_t)    :: A(isize)
+  call i8sort_noidx_big_c(A, isize)
+end subroutine i8sort_noidx_big
+
+
+
+subroutine dsort(A, iorder, isize) 
+  use qsort_module
+  use iso_c_binding
+  integer(c_int32_t)        :: isize
+  integer(c_int32_t)        :: iorder(isize)
+  real   (c_double)         :: A(isize)
+  call dsort_c(A, iorder, isize)
+end subroutine dsort
+
+subroutine dsort_noidx(A, isize)
+  use iso_c_binding
+  use qsort_module
+  integer(c_int32_t) :: isize
+  real   (c_double)    :: A(isize)
+  call dsort_noidx_c(A, isize)
+end subroutine dsort_noidx
+
+
+
+subroutine dsort_big(A, iorder, isize) 
+  use qsort_module
+  use iso_c_binding
+  integer(c_int64_t)        :: isize
+  integer(c_int64_t)        :: iorder(isize)
+  real   (c_double)         :: A(isize)
+  call dsort_big_c(A, iorder, isize)
+end subroutine dsort_big
+
+subroutine dsort_noidx_big(A, isize)
+  use iso_c_binding
+  use qsort_module
+  integer(c_int64_t) :: isize
+  real   (c_double)    :: A(isize)
+  call dsort_noidx_big_c(A, isize)
+end subroutine dsort_noidx_big
+
+
+
+subroutine sort(A, iorder, isize) 
+  use qsort_module
+  use iso_c_binding
+  integer(c_int32_t)        :: isize
+  integer(c_int32_t)        :: iorder(isize)
+  real   (c_float)         :: A(isize)
+  call sort_c(A, iorder, isize)
+end subroutine sort
+
+subroutine sort_noidx(A, isize)
+  use iso_c_binding
+  use qsort_module
+  integer(c_int32_t) :: isize
+  real   (c_float)    :: A(isize)
+  call sort_noidx_c(A, isize)
+end subroutine sort_noidx
+
+
+
+subroutine sort_big(A, iorder, isize) 
+  use qsort_module
+  use iso_c_binding
+  integer(c_int64_t)        :: isize
+  integer(c_int64_t)        :: iorder(isize)
+  real   (c_float)         :: A(isize)
+  call sort_big_c(A, iorder, isize)
+end subroutine sort_big
+
+subroutine sort_noidx_big(A, isize)
+  use iso_c_binding
+  use qsort_module
+  integer(c_int64_t) :: isize
+  real   (c_float)    :: A(isize)
+  call sort_noidx_big_c(A, isize)
+end subroutine sort_noidx_big
diff --git a/src/utils/sort.irp.f b/src/utils/sort.irp.f
index a63eb4a3..089c3871 100644
--- a/src/utils/sort.irp.f
+++ b/src/utils/sort.irp.f
@@ -1,222 +1,4 @@
 BEGIN_TEMPLATE
- subroutine insertion_$Xsort (x,iorder,isize)
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize) using the insertion sort algorithm.
-  ! iorder in input should be (1,2,3,...,isize), and in output
-  ! contains the new order of the elements.
-  END_DOC
-  integer,intent(in)             :: isize
-  $type,intent(inout)            :: x(isize)
-  integer,intent(inout)          :: iorder(isize)
-  $type                          :: xtmp
-  integer                        :: i, i0, j, jmax
-
-  do i=2,isize
-    xtmp = x(i)
-    i0 = iorder(i)
-    j=i-1
-    do while (j>0)
-      if ((x(j) <= xtmp)) exit
-      x(j+1) = x(j)
-      iorder(j+1) = iorder(j)
-      j=j-1
-    enddo
-    x(j+1) = xtmp
-    iorder(j+1) = i0
-  enddo
- end subroutine insertion_$Xsort
-
- subroutine quick_$Xsort(x, iorder, isize)
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize) using the quicksort algorithm.
-  ! iorder in input should be (1,2,3,...,isize), and in output
-  ! contains the new order of the elements.
-  END_DOC
-  integer,intent(in)             :: isize
-  $type,intent(inout)            :: x(isize)
-  integer,intent(inout)          :: iorder(isize)
-  integer, external              :: omp_get_num_threads
-  call rec_$X_quicksort(x,iorder,isize,1,isize,nproc)
- end
-
- recursive subroutine rec_$X_quicksort(x, iorder, isize, first, last, level)
-  implicit none
-  integer, intent(in)            :: isize, first, last, level
-  integer,intent(inout)          :: iorder(isize)
-  $type, intent(inout)           :: x(isize)
-  $type                          :: c, tmp
-  integer                        :: itmp
-  integer                        :: i, j
-
-  if(isize<2)return
-
-  c = x( shiftr(first+last,1) )
-  i = first
-  j = last
-  do
-    do while (x(i) < c)
-      i=i+1
-    end do
-    do while (c < x(j))
-      j=j-1
-    end do
-    if (i >= j) exit
-    tmp  = x(i)
-    x(i) = x(j)
-    x(j) = tmp
-    itmp      = iorder(i)
-    iorder(i) = iorder(j)
-    iorder(j) = itmp
-    i=i+1
-    j=j-1
-  enddo
-  if ( ((i-first <= 10000).and.(last-j <= 10000)).or.(level<=0) ) then
-    if (first < i-1) then
-      call rec_$X_quicksort(x, iorder, isize, first, i-1,level/2)
-    endif
-    if (j+1 < last) then
-      call rec_$X_quicksort(x, iorder, isize, j+1, last,level/2)
-    endif
-  else
-    if (first < i-1) then
-      call rec_$X_quicksort(x, iorder, isize, first, i-1,level/2)
-    endif
-    if (j+1 < last) then
-      call rec_$X_quicksort(x, iorder, isize, j+1, last,level/2)
-    endif
-  endif
- end
-
- subroutine heap_$Xsort(x,iorder,isize)
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize) using the heap sort algorithm.
-  ! iorder in input should be (1,2,3,...,isize), and in output
-  ! contains the new order of the elements.
-  END_DOC
-  integer,intent(in)             :: isize
-  $type,intent(inout)            :: x(isize)
-  integer,intent(inout)          :: iorder(isize)
-
-  integer                        :: i, k, j, l, i0
-  $type                          :: xtemp
-
-  l = isize/2+1
-  k = isize
-  do while (.True.)
-    if (l>1) then
-      l=l-1
-      xtemp = x(l)
-      i0 = iorder(l)
-    else
-      xtemp = x(k)
-      i0 = iorder(k)
-      x(k) = x(1)
-      iorder(k) = iorder(1)
-      k = k-1
-      if (k == 1) then
-        x(1) = xtemp
-        iorder(1) = i0
-        exit
-      endif
-    endif
-    i=l
-    j = shiftl(l,1)
-    do while (j<k)
-      if ( x(j) < x(j+1) ) then
-        j=j+1
-      endif
-      if (xtemp < x(j)) then
-        x(i) = x(j)
-        iorder(i) = iorder(j)
-        i = j
-        j = shiftl(j,1)
-      else
-        j = k+1
-      endif
-    enddo
-    if (j==k) then
-      if (xtemp < x(j)) then
-        x(i) = x(j)
-        iorder(i) = iorder(j)
-        i = j
-        j = shiftl(j,1)
-      else
-        j = k+1
-      endif
-    endif
-    x(i) = xtemp
-    iorder(i) = i0
-  enddo
- end subroutine heap_$Xsort
-
- subroutine heap_$Xsort_big(x,iorder,isize)
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize) using the heap sort algorithm.
-  ! iorder in input should be (1,2,3,...,isize), and in output
-  ! contains the new order of the elements.
-  ! This is a version for very large arrays where the indices need
-  ! to be in integer*8 format
-  END_DOC
-  integer*8,intent(in)           :: isize
-  $type,intent(inout)            :: x(isize)
-  integer*8,intent(inout)        :: iorder(isize)
-
-  integer*8                      :: i, k, j, l, i0
-  $type                          :: xtemp
-
-  l = isize/2+1
-  k = isize
-  do while (.True.)
-    if (l>1) then
-      l=l-1
-      xtemp = x(l)
-      i0 = iorder(l)
-    else
-      xtemp = x(k)
-      i0 = iorder(k)
-      x(k) = x(1)
-      iorder(k) = iorder(1)
-      k = k-1
-      if (k == 1) then
-        x(1) = xtemp
-        iorder(1) = i0
-        exit
-      endif
-    endif
-    i=l
-    j = shiftl(l,1)
-    do while (j<k)
-      if ( x(j) < x(j+1) ) then
-        j=j+1
-      endif
-      if (xtemp < x(j)) then
-        x(i) = x(j)
-        iorder(i) = iorder(j)
-        i = j
-        j = shiftl(j,1)
-      else
-        j = k+1
-      endif
-    enddo
-    if (j==k) then
-      if (xtemp < x(j)) then
-        x(i) = x(j)
-        iorder(i) = iorder(j)
-        i = j
-        j = shiftl(j,1)
-      else
-        j = k+1
-      endif
-    endif
-    x(i) = xtemp
-    iorder(i) = i0
-  enddo
-
- end subroutine heap_$Xsort_big
 
  subroutine sorted_$Xnumber(x,isize,n)
   implicit none
@@ -250,220 +32,6 @@ SUBST [ X, type ]
 END_TEMPLATE
 
 
-!---------------------- INTEL
-IRP_IF INTEL
-
-BEGIN_TEMPLATE
- subroutine $Xsort(x,iorder,isize)
-  use intel
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize).
-  ! iorder in input should be (1,2,3,...,isize), and in output
-  ! contains the new order of the elements.
-  END_DOC
-  integer,intent(in)             :: isize
-  $type,intent(inout)            :: x(isize)
-  integer,intent(inout)          :: iorder(isize)
-  integer                        :: n
-  character, allocatable         :: tmp(:)
-  if (isize < 2) return
-  call ippsSortRadixIndexGetBufferSize(isize, $ippsz, n)
-  allocate(tmp(n))
-  call ippsSortRadixIndexAscend_$ityp(x, $n, iorder, isize, tmp)
-  deallocate(tmp)
-  iorder(1:isize) = iorder(1:isize)+1
-  call $Xset_order(x,iorder,isize)
- end
-
- subroutine $Xsort_noidx(x,isize)
-  use intel
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize).
-  ! iorder in input should be (1,2,3,...,isize), and in output
-  ! contains the new order of the elements.
-  END_DOC
-  integer,intent(in)             :: isize
-  $type,intent(inout)            :: x(isize)
-  integer                        :: n
-  character, allocatable         :: tmp(:)
-  if (isize < 2) return
-  call ippsSortRadixIndexGetBufferSize(isize, $ippsz, n)
-  allocate(tmp(n))
-  call ippsSortRadixAscend_$ityp_I(x, isize, tmp)
-  deallocate(tmp)
- end
-
-SUBST [ X, type, ityp, n, ippsz ]
-   ; real ; 32f ; 4 ; 13 ;;
- i ; integer ; 32s ; 4 ; 11 ;;
- i2 ; integer*2 ; 16s ; 2 ; 7 ;;
-END_TEMPLATE
-
-BEGIN_TEMPLATE
-
- subroutine $Xsort(x,iorder,isize)
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize).
-  ! iorder in input should be (1,2,3,...,isize), and in output
-  ! contains the new order of the elements.
-  END_DOC
-  integer,intent(in)             :: isize
-  $type,intent(inout)            :: x(isize)
-  integer,intent(inout)          :: iorder(isize)
-  integer                        :: n
-  if (isize < 2) then
-    return
-  endif
-!  call sorted_$Xnumber(x,isize,n)
-!  if (isize == n) then
-!    return
-!  endif
-  if ( isize < 32) then
-    call insertion_$Xsort(x,iorder,isize)
-  else
-!    call heap_$Xsort(x,iorder,isize)
-    call quick_$Xsort(x,iorder,isize)
-  endif
- end subroutine $Xsort
-
-SUBST [ X, type ]
- d ; double precision ;;
-END_TEMPLATE
-
-BEGIN_TEMPLATE
-
- subroutine $Xsort(x,iorder,isize)
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize).
-  ! iorder in input should be (1,2,3,...,isize), and in output
-  ! contains the new order of the elements.
-  END_DOC
-  integer,intent(in)             :: isize
-  $type,intent(inout)            :: x(isize)
-  integer,intent(inout)          :: iorder(isize)
-  integer                        :: n
-  if (isize < 2) then
-    return
-  endif
-  call sorted_$Xnumber(x,isize,n)
-  if (isize == n) then
-    return
-  endif
-  if ( isize < 32) then
-    call insertion_$Xsort(x,iorder,isize)
-  else
-    call $Xradix_sort(x,iorder,isize,-1)
-  endif
- end subroutine $Xsort
-
-SUBST [ X, type ]
- i8 ; integer*8 ;;
-END_TEMPLATE
-
-!---------------------- END INTEL
-IRP_ELSE
-!---------------------- NON-INTEL
-BEGIN_TEMPLATE
-
- subroutine $Xsort_noidx(x,isize)
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize).
-  END_DOC
-  integer,intent(in)             :: isize
-  $type,intent(inout)            :: x(isize)
-  integer, allocatable           :: iorder(:)
-  integer                        :: i
-  allocate(iorder(isize))
-  do i=1,isize
-   iorder(i)=i
-  enddo
-  call $Xsort(x,iorder,isize)
-  deallocate(iorder)
- end subroutine $Xsort_noidx
-
-SUBST [ X, type ]
-   ; real ;;
- d ; double precision ;;
- i ; integer ;;
- i8 ; integer*8 ;;
- i2 ; integer*2 ;;
-END_TEMPLATE
-
-BEGIN_TEMPLATE
-
- subroutine $Xsort(x,iorder,isize)
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize).
-  ! iorder in input should be (1,2,3,...,isize), and in output
-  ! contains the new order of the elements.
-  END_DOC
-  integer,intent(in)             :: isize
-  $type,intent(inout)            :: x(isize)
-  integer,intent(inout)          :: iorder(isize)
-  integer                        :: n
-  if (isize < 2) then
-    return
-  endif
-!  call sorted_$Xnumber(x,isize,n)
-!  if (isize == n) then
-!    return
-!  endif
-  if ( isize < 32) then
-    call insertion_$Xsort(x,iorder,isize)
-  else
-!    call heap_$Xsort(x,iorder,isize)
-    call quick_$Xsort(x,iorder,isize)
-  endif
- end subroutine $Xsort
-
-SUBST [ X, type ]
-   ; real ;;
- d ; double precision ;;
-END_TEMPLATE
-
-BEGIN_TEMPLATE
-
- subroutine $Xsort(x,iorder,isize)
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize).
-  ! iorder in input should be (1,2,3,...,isize), and in output
-  ! contains the new order of the elements.
-  END_DOC
-  integer,intent(in)             :: isize
-  $type,intent(inout)            :: x(isize)
-  integer,intent(inout)          :: iorder(isize)
-  integer                        :: n
-  if (isize < 2) then
-    return
-  endif
-  call sorted_$Xnumber(x,isize,n)
-  if (isize == n) then
-    return
-  endif
-  if ( isize < 32) then
-    call insertion_$Xsort(x,iorder,isize)
-  else
-    call $Xradix_sort(x,iorder,isize,-1)
-  endif
- end subroutine $Xsort
-
-SUBST [ X, type ]
- i ; integer ;;
- i8 ; integer*8 ;;
- i2 ; integer*2 ;;
-END_TEMPLATE
-
-IRP_ENDIF
-!---------------------- END NON-INTEL
-
-
 
 BEGIN_TEMPLATE
  subroutine $Xset_order(x,iorder,isize)
@@ -489,47 +57,6 @@ BEGIN_TEMPLATE
   deallocate(xtmp)
  end
 
-SUBST [ X, type ]
-   ; real ;;
- d ; double precision ;;
- i ; integer ;;
- i8; integer*8 ;;
- i2; integer*2 ;;
-END_TEMPLATE
-
-
-BEGIN_TEMPLATE
- subroutine insertion_$Xsort_big (x,iorder,isize)
-  implicit none
-  BEGIN_DOC
-  ! Sort array x(isize) using the insertion sort algorithm.
-  ! iorder in input should be (1,2,3,...,isize), and in output
-  ! contains the new order of the elements.
-  ! This is a version for very large arrays where the indices need
-  ! to be in integer*8 format
-  END_DOC
-  integer*8,intent(in)           :: isize
-  $type,intent(inout)            :: x(isize)
-  integer*8,intent(inout)        :: iorder(isize)
-  $type                          :: xtmp
-  integer*8                      :: i, i0, j, jmax
-
-  do i=2_8,isize
-    xtmp = x(i)
-    i0 = iorder(i)
-    j = i-1_8
-    do while (j>0_8)
-      if (x(j)<=xtmp) exit
-      x(j+1_8) = x(j)
-      iorder(j+1_8) = iorder(j)
-      j = j-1_8
-    enddo
-    x(j+1_8) = xtmp
-    iorder(j+1_8) = i0
-  enddo
-
- end subroutine insertion_$Xsort_big
-
  subroutine $Xset_order_big(x,iorder,isize)
   implicit none
   BEGIN_DOC
@@ -563,223 +90,3 @@ SUBST [ X, type ]
 END_TEMPLATE
 
 
-BEGIN_TEMPLATE
-
-recursive subroutine $Xradix_sort$big(x,iorder,isize,iradix)
-  implicit none
-
-  BEGIN_DOC
-  ! Sort integer array x(isize) using the radix sort algorithm.
-  ! iorder in input should be (1,2,3,...,isize), and in output
-  ! contains the new order of the elements.
-  ! iradix should be -1 in input.
-  END_DOC
-  integer*$int_type, intent(in)  :: isize
-  integer*$int_type, intent(inout) :: iorder(isize)
-  integer*$type, intent(inout)   :: x(isize)
-  integer, intent(in)            :: iradix
-  integer                        :: iradix_new
-  integer*$type, allocatable     :: x2(:), x1(:)
-  integer*$type                  :: i4               ! data type
-  integer*$int_type, allocatable :: iorder1(:),iorder2(:)
-  integer*$int_type              :: i0, i1, i2, i3, i ! index type
-  integer*$type                  :: mask
-  integer                        :: err
-  !DIR$ ATTRIBUTES ALIGN : 128   :: iorder1,iorder2, x2, x1
-
-  if (isize < 2) then
-    return
-  endif
-
-  if (iradix == -1) then ! Sort Positive and negative
-
-    allocate(x1(isize),iorder1(isize), x2(isize),iorder2(isize),stat=err)
-    if (err /= 0) then
-      print *,  irp_here, ': Unable to allocate arrays'
-      stop
-    endif
-
-    i1=1_$int_type
-    i2=1_$int_type
-    do i=1_$int_type,isize
-      if (x(i) < 0_$type) then
-        iorder1(i1) = iorder(i)
-        x1(i1) = -x(i)
-        i1 = i1+1_$int_type
-      else
-        iorder2(i2) = iorder(i)
-        x2(i2) = x(i)
-        i2 = i2+1_$int_type
-      endif
-    enddo
-    i1=i1-1_$int_type
-    i2=i2-1_$int_type
-
-    do i=1_$int_type,i2
-      iorder(i1+i) = iorder2(i)
-      x(i1+i) = x2(i)
-    enddo
-    deallocate(x2,iorder2,stat=err)
-    if (err /= 0) then
-      print *,  irp_here, ': Unable to deallocate arrays x2, iorder2'
-      stop
-    endif
-
-
-    if (i1 > 1_$int_type) then
-      call $Xradix_sort$big(x1,iorder1,i1,-2)
-      do i=1_$int_type,i1
-        x(i) = -x1(1_$int_type+i1-i)
-        iorder(i) = iorder1(1_$int_type+i1-i)
-      enddo
-    endif
-
-    if (i2>1_$int_type) then
-      call $Xradix_sort$big(x(i1+1_$int_type),iorder(i1+1_$int_type),i2,-2)
-    endif
-
-    deallocate(x1,iorder1,stat=err)
-    if (err /= 0) then
-      print *,  irp_here, ': Unable to deallocate arrays x1, iorder1'
-      stop
-    endif
-    return
-
-  else if (iradix == -2) then ! Positive
-
-    ! Find most significant bit
-
-    i0 = 0_$int_type
-    i4 = maxval(x)
-
-    iradix_new = max($integer_size-1-leadz(i4),1)
-    mask = ibset(0_$type,iradix_new)
-
-    allocate(x1(isize),iorder1(isize), x2(isize),iorder2(isize),stat=err)
-    if (err /= 0) then
-      print *,  irp_here, ': Unable to allocate arrays'
-      stop
-    endif
-
-    i1=1_$int_type
-    i2=1_$int_type
-
-    do i=1_$int_type,isize
-      if (iand(mask,x(i)) == 0_$type) then
-        iorder1(i1) = iorder(i)
-        x1(i1) = x(i)
-        i1 = i1+1_$int_type
-      else
-        iorder2(i2) = iorder(i)
-        x2(i2) = x(i)
-        i2 = i2+1_$int_type
-      endif
-    enddo
-    i1=i1-1_$int_type
-    i2=i2-1_$int_type
-
-    do i=1_$int_type,i1
-      iorder(i0+i) = iorder1(i)
-      x(i0+i) = x1(i)
-    enddo
-    i0 = i0+i1
-    i3 = i0
-    deallocate(x1,iorder1,stat=err)
-    if (err /= 0) then
-      print *,  irp_here, ': Unable to deallocate arrays x1, iorder1'
-      stop
-    endif
-
-
-    do i=1_$int_type,i2
-      iorder(i0+i) = iorder2(i)
-      x(i0+i) = x2(i)
-    enddo
-    i0 = i0+i2
-    deallocate(x2,iorder2,stat=err)
-    if (err /= 0) then
-      print *,  irp_here, ': Unable to deallocate arrays x2, iorder2'
-      stop
-    endif
-
-
-    if (i3>1_$int_type) then
-      call $Xradix_sort$big(x,iorder,i3,iradix_new-1)
-    endif
-
-    if (isize-i3>1_$int_type) then
-      call $Xradix_sort$big(x(i3+1_$int_type),iorder(i3+1_$int_type),isize-i3,iradix_new-1)
-    endif
-
-    return
-  endif
-
-  ASSERT (iradix >= 0)
-
-  if (isize < 48) then
-    call insertion_$Xsort$big(x,iorder,isize)
-    return
-  endif
-
-
-  allocate(x2(isize),iorder2(isize),stat=err)
-  if (err /= 0) then
-    print *,  irp_here, ': Unable to allocate arrays x1, iorder1'
-    stop
-  endif
-
-
-  mask = ibset(0_$type,iradix)
-  i0=1_$int_type
-  i1=1_$int_type
-
-  do i=1_$int_type,isize
-    if (iand(mask,x(i)) == 0_$type) then
-      iorder(i0) = iorder(i)
-      x(i0) = x(i)
-      i0 = i0+1_$int_type
-    else
-      iorder2(i1) = iorder(i)
-      x2(i1) = x(i)
-      i1 = i1+1_$int_type
-    endif
-  enddo
-  i0=i0-1_$int_type
-  i1=i1-1_$int_type
-
-  do i=1_$int_type,i1
-    iorder(i0+i) = iorder2(i)
-    x(i0+i) = x2(i)
-  enddo
-
-  deallocate(x2,iorder2,stat=err)
-  if (err /= 0) then
-    print *,  irp_here, ': Unable to allocate arrays x2, iorder2'
-    stop
-  endif
-
-
-  if (iradix == 0) then
-    return
-  endif
-
-
-  if (i1>1_$int_type) then
-    call $Xradix_sort$big(x(i0+1_$int_type),iorder(i0+1_$int_type),i1,iradix-1)
-  endif
-  if (i0>1) then
-    call $Xradix_sort$big(x,iorder,i0,iradix-1)
-  endif
-
- end
-
-SUBST [ X, type, integer_size, is_big, big, int_type ]
- i  ; 4 ; 32 ; .False. ;      ; 4 ;;
- i8 ; 8 ; 64 ; .False. ;      ; 4 ;;
- i2 ; 2 ; 16 ; .False. ;      ; 4 ;;
- i  ; 4 ; 32 ; .True.  ; _big ; 8 ;;
- i8 ; 8 ; 64 ; .True.  ; _big ; 8 ;;
-END_TEMPLATE
-
-
-