14 KiB
3+TITLE: Numerical precision
Control of the numerical precision
Controlling numerical precision enables optimizations. Here, the default parameters determining the target numerical precision and range are defined. Following the IEEE Standard for Floating-Point Arithmetic (IEEE 754), precision refers to the number of significand bits (including the sign bit) and range refers to the number of exponent bits.
QMCKL_DEFAULT_PRECISION |
53 |
QMCKL_DEFAULT_RANGE |
11 |
#define QMCKL_DEFAULT_PRECISION 53
#define QMCKL_DEFAULT_RANGE 11
typedef struct qmckl_numprec_struct {
uint32_t precision;
uint32_t range;
} qmckl_numprec_struct;
The following functions set and get the required precision and
range. precision
is an integer between 2 and 53, and range
is an
integer between 2 and 11.
The setter functions functions return a new context as a 64-bit
integer. The getter functions return the value, as a 32-bit
integer. The update functions return QMCKL_SUCCESS
or
QMCKL_FAILURE
.
Precision
qmckl_context_set_numprec_precision
modifies the parameter for the
numerical precision in the context.
qmckl_exit_code qmckl_set_numprec_precision(const qmckl_context context, const int precision) {
if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT)
return QMCKL_INVALID_CONTEXT;
if (precision < 2) {
return qmckl_failwith(context,
QMCKL_INVALID_ARG_2,
"qmckl_update_numprec_precision",
"precision < 2");
}
if (precision > 53) {
return qmckl_failwith(context,
QMCKL_INVALID_ARG_2,
"qmckl_update_numprec_precision",
"precision > 53");
}
qmckl_context_struct* const ctx = (qmckl_context_struct*) context;
/* This should be always true because the context is valid */
assert (ctx != NULL);
qmckl_lock(context);
{
ctx->numprec.precision = (uint32_t) precision;
}
qmckl_unlock(context);
return QMCKL_SUCCESS;
}
interface
integer (qmckl_exit_code) function qmckl_set_numprec_precision(context, precision) bind(C)
use, intrinsic :: iso_c_binding
import
integer (qmckl_context), intent(in), value :: context
integer (c_int32_t), intent(in), value :: precision
end function qmckl_set_numprec_precision
end interface
qmckl_get_numprec_precision
returns the value of the numerical precision in the context.
int qmckl_get_numprec_precision(const qmckl_context context) {
if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) {
return qmckl_failwith(context,
QMCKL_INVALID_CONTEXT,
"qmckl_get_numprec_precision",
"");
}
const qmckl_context_struct* const ctx = (qmckl_context_struct*) context;
return ctx->numprec.precision;
}
interface
integer (qmckl_exit_code) function qmckl_get_numprec_precision(context) bind(C)
use, intrinsic :: iso_c_binding
import
integer (qmckl_context), intent(in), value :: context
end function qmckl_get_numprec_precision
end interface
Range
qmckl_set_numprec_range
modifies the parameter for the numerical
range in a given context.
qmckl_exit_code qmckl_set_numprec_range(const qmckl_context context, const int range) {
if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT)
return QMCKL_INVALID_CONTEXT;
if (range < 2) {
return qmckl_failwith(context,
QMCKL_INVALID_ARG_2,
"qmckl_set_numprec_range",
"range < 2");
}
if (range > 11) {
return qmckl_failwith(context,
QMCKL_INVALID_ARG_2,
"qmckl_set_numprec_range",
"range > 11");
}
qmckl_context_struct* const ctx = (qmckl_context_struct*) context;
/* This should be always true because the context is valid */
assert (ctx != NULL);
qmckl_lock(context);
{
ctx->numprec.range = (uint32_t) range;
}
qmckl_unlock(context);
return QMCKL_SUCCESS;
}
interface
integer (qmckl_exit_code) function qmckl_set_numprec_range(context, range) bind(C)
use, intrinsic :: iso_c_binding
import
integer (qmckl_context), intent(in), value :: context
integer (c_int32_t), intent(in), value :: range
end function qmckl_set_numprec_range
end interface
qmckl_get_numprec_range
returns the value of the numerical range in the context.
int qmckl_get_numprec_range(const qmckl_context context) {
if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) {
return qmckl_failwith(context,
QMCKL_INVALID_CONTEXT,
"qmckl_get_numprec_range",
"");
}
const qmckl_context_struct* const ctx = (qmckl_context_struct*) context;
return ctx->numprec.range;
}
Helper functions
Epsilon
qmckl_get_numprec_epsilon
returns $\epsilon = 2^{1-n}$ where n
is the precision.
We need to remove the sign bit from the precision.
double qmckl_get_numprec_epsilon(const qmckl_context context) {
if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT)
return QMCKL_INVALID_CONTEXT;
const qmckl_context_struct* const ctx = (qmckl_context_struct*) context;
const int precision = ctx->numprec.precision;
return 1. / (double) ( ((uint64_t) 1) << (precision-2));
}
Testing the number of unchanged bits
To test that a given approximation keeps a given number of bits unchanged, we need a function that returns the number of unchanged bits in the range, and in the precision.
For this, we first count by how many units in the last place (ulps) two numbers differ.
int64_t countUlpDifference_64(double a, double b) {
union int_or_float {
int64_t i;
double f;
} x, y;
x.f = a;
y.f = b;
// Handle sign bit discontinuity: if the signs are different and either value is not zero
if ((x.i < 0) != (y.i < 0) && (x.f != 0.0) && (y.f != 0.0)) {
// Use the absolute values and add the distance to zero for both numbers
int64_t distanceToZeroForX = x.i < 0 ? INT64_MAX + x.i : INT64_MAX - x.i;
int64_t distanceToZeroForY = y.i < 0 ? INT64_MAX + y.i : INT64_MAX - y.i;
return distanceToZeroForX + distanceToZeroForY;
}
// Calculate the difference in their binary representations
int64_t result = x.i - y.i;
result = result > 0 ? result : -result;
return result;
}
int32_t qmckl_test_precision_64(double a, double b) {
int64_t diff = countUlpDifference_64(a,b);
if (diff == 0) return 53;
int32_t result = 53;
for (int i=0 ; i<53 && diff != 0 ; ++i) {
diff >>= 1;
result--;
}
return result;
}
int32_t qmckl_test_precision_32(float a, float b) {
return qmckl_test_precision_64( (double) a, (double) b );
}
Approximate functions
Exponential
Fast exponential function, adapted from Johan Rade's implementation (https://gist.github.com/jrade/293a73f89dfef51da6522428c857802d). It is based on Schraudolph's paper:
- Schraudolph, "A Fast, Compact Approximation of the Exponential Function",
Neural Computation 11, 853–862 (1999). (available at https://nic.schraudolph.org/pubs/Schraudolph99.pdf)
float fastExpf(float x)
{
const float a = 12102203.0;
const float b = 1064986816.0;
x = a * x + b;
const float c = 8388608.0;
const float d = 2139095040.0;
if (x < c || x > d)
x = (x < c) ? 0.0f : d;
uint32_t n = (uint32_t) x;
memcpy(&x, &n, 4);
return x;
}
double fastExp(double x)
{
const double a = 6497320848556798.0;
const double b = 4606985713057410560.0;
x = a * x + b;
const double c = 4503599627370496.0;
const double d = 9218868437227405312.0;
if (x < c || x > d)
x = (x < c) ? 0.0 : d;
uint64_t n = (uint64_t) x;
memcpy(&x, &n, 8);
return x;
}