Numerical precision

1. Control of the numerical precision
2. Precision
3. Range
4. Helper functions
5. Approximate functions
- 5.1. Exponential

1 Control of the numerical precision

Controlling numerical precision enables optimizations. Here, the default parameters determining the target numerical precision and range are defined. Following the IEEE Standard for Floating-Point Arithmetic (IEEE 754), precision refers to the number of significand bits (including the sign bit) and range refers to the number of exponent bits.

`QMCKL_DEFAULT_PRECISION`	53
`QMCKL_DEFAULT_RANGE`	11

typedef struct qmckl_numprec_struct {
  uint32_t  precision;
  uint32_t  range;
} qmckl_numprec_struct;

The following functions set and get the required precision and range. precision is an integer between 2 and 53, and range is an integer between 2 and 11.

The setter functions functions return a new context as a 64-bit integer. The getter functions return the value, as a 32-bit integer. The update functions return QMCKL_SUCCESS or QMCKL_FAILURE.

2 Precision

qmckl_context_set_numprec_precision modifies the parameter for the numerical precision in the context.

qmckl_exit_code qmckl_set_numprec_precision(const qmckl_context context, const int precision) {

  if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT)
    return QMCKL_INVALID_CONTEXT;

  if (precision <  2) {
    return qmckl_failwith(context,
                          QMCKL_INVALID_ARG_2,
                          "qmckl_update_numprec_precision",
                          "precision < 2");
  }

  if (precision > 53) {
    return qmckl_failwith(context,
                          QMCKL_INVALID_ARG_2,
                          "qmckl_update_numprec_precision",
                          "precision > 53");
  }

  qmckl_context_struct* const ctx = (qmckl_context_struct*) context;

  /* This should be always true because the context is valid */
  assert (ctx != NULL);

  qmckl_lock(context);
  {
    ctx->numprec.precision = (uint32_t) precision;
  }
  qmckl_unlock(context);

  return QMCKL_SUCCESS;
}

interface
   integer (qmckl_exit_code) function qmckl_set_numprec_precision(context, precision) bind(C)
     use, intrinsic :: iso_c_binding
     import
     integer (qmckl_context), intent(in), value :: context
     integer (c_int32_t), intent(in), value :: precision
   end function qmckl_set_numprec_precision
end interface

qmckl_get_numprec_precision returns the value of the numerical precision in the context.

int qmckl_get_numprec_precision(const qmckl_context context) {
  if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) {
      return qmckl_failwith(context,
                      QMCKL_INVALID_CONTEXT,
                      "qmckl_get_numprec_precision",
                      "");
  }

  const qmckl_context_struct* const ctx = (qmckl_context_struct*) context;
  return ctx->numprec.precision;
}

interface
   integer (qmckl_exit_code) function qmckl_get_numprec_precision(context) bind(C)
     use, intrinsic :: iso_c_binding
     import
     integer (qmckl_context), intent(in), value :: context
   end function qmckl_get_numprec_precision
end interface

3 Range

qmckl_set_numprec_range modifies the parameter for the numerical range in a given context.

qmckl_exit_code qmckl_set_numprec_range(const qmckl_context context, const int range) {

  if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT)
    return QMCKL_INVALID_CONTEXT;

  if (range <  2) {
    return qmckl_failwith(context,
                    QMCKL_INVALID_ARG_2,
                    "qmckl_set_numprec_range",
                    "range < 2");
  }

  if (range > 11) {
    return qmckl_failwith(context,
                    QMCKL_INVALID_ARG_2,
                    "qmckl_set_numprec_range",
                    "range > 11");
  }

  qmckl_context_struct* const ctx = (qmckl_context_struct*) context;

  /* This should be always true because the context is valid */
  assert (ctx != NULL);

  qmckl_lock(context);
  {
    ctx->numprec.range = (uint32_t) range;
  }
  qmckl_unlock(context);

  return QMCKL_SUCCESS;
}

interface
   integer (qmckl_exit_code) function qmckl_set_numprec_range(context, range) bind(C)
     use, intrinsic :: iso_c_binding
     import
     integer (qmckl_context), intent(in), value :: context
     integer (c_int32_t), intent(in), value :: range
   end function qmckl_set_numprec_range
end interface

qmckl_get_numprec_range returns the value of the numerical range in the context.

int qmckl_get_numprec_range(const qmckl_context context) {
  if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) {
      return qmckl_failwith(context,
                      QMCKL_INVALID_CONTEXT,
                      "qmckl_get_numprec_range",
                      "");
  }

  const qmckl_context_struct* const ctx = (qmckl_context_struct*) context;
  return ctx->numprec.range;
}

4 Helper functions

qmckl_get_numprec_epsilon returns \(\epsilon = 2^{1-n}\) where n is the precision. We need to remove the sign bit from the precision.

double qmckl_get_numprec_epsilon(const qmckl_context context) {
  const int precision = qmckl_get_numprec_precision(context);
  return 1. /  (double) ( ((uint64_t) 1) << (precision-2));
}

5 Approximate functions

5.1 Exponential

Fast exponential function, adapted from Johan Rade's implementation (https://gist.github.com/jrade/293a73f89dfef51da6522428c857802d). It is based on Schraudolph's paper:

N. Schraudolph, "A Fast, Compact Approximation of the Exponential Function", Neural Computation 11, 853–862 (1999). (available at https://nic.schraudolph.org/pubs/Schraudolph99.pdf)

float fastExpf(float x)
{
  const float a = 12102203.0;
  const float b = 1064986816.0;
  x = a * x + b;

  const float c = 8388608.0;
  const float d = 2139095040.0;
  if (x < c || x > d)
    x = (x < c) ? 0.0f : d;

  uint32_t n = (uint32_t) x;
  memcpy(&x, &n, 4);
  return x;
}


double fastExp(double x)
{
  const double a = 6497320848556798.0;
  const double b = 4606985713057410560.0;
  x = a * x + b;

  const double c = 4503599627370496.0;
  const double d = 9218868437227405312.0;
  if (x < c || x > d)
    x = (x < c) ? 0.0 : d;

  uint64_t n = (uint64_t) x;
  memcpy(&x, &n, 8);
  return x;
}