| //! An implementation of Clinger's Bellerophon algorithm. |
| //! |
| //! This is a moderate path algorithm that uses an extended-precision |
| //! float, represented in 80 bits, by calculating the bits of slop |
| //! and determining if those bits could prevent unambiguous rounding. |
| //! |
//! This algorithm requires less static storage than the Lemire algorithm
//! and has decent performance, so it is used when non-decimal,
//! non-power-of-two strings need to be parsed. Clinger's algorithm
//! is described in depth in "How to Read Floating Point Numbers Accurately",
//! available online [here](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.45.4152&rep=rep1&type=pdf).
| //! |
//! This implementation is loosely based on the Golang implementation,
| //! found [here](https://github.com/golang/go/blob/b10849fbb97a2244c086991b4623ae9f32c212d0/src/strconv/extfloat.go). |
| //! This code is therefore subject to a 3-clause BSD license. |
| |
| #![cfg(feature = "compact")] |
| #![doc(hidden)] |
| |
| use crate::extended_float::ExtendedFloat; |
| use crate::mask::{lower_n_halfway, lower_n_mask}; |
| use crate::num::Float; |
| use crate::number::Number; |
| use crate::rounding::{round, round_nearest_tie_even}; |
| use crate::table::BASE10_POWERS; |
| |
| // ALGORITHM |
| // --------- |
| |
| /// Core implementation of the Bellerophon algorithm. |
| /// |
| /// Create an extended-precision float, scale it to the proper radix power, |
| /// calculate the bits of slop, and return the representation. The value |
| /// will always be guaranteed to be within 1 bit, rounded-down, of the real |
/// value. If a negative exponent is returned, it indicates that we were
/// unable to unambiguously round the significant digits.
| /// |
| /// This has been modified to return a biased, rather than unbiased exponent. |
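///
/// A minimal usage sketch (hedged: it assumes `Number`'s fields can be
/// constructed directly and that `f64` implements this crate's `Float`
/// trait; the parser normally builds `Number` itself):
///
/// ```rust,ignore
/// // A parsed decimal such as "123.45": mantissa 12345, exponent -2.
/// let num = Number { mantissa: 12345, exponent: -2, many_digits: false };
/// let fp = bellerophon::<f64>(&num);
/// if fp.exp < 0 {
///     // The exponent was biased by `F::INVALID_FP`: rounding was
///     // ambiguous, so fall back to a slower, arbitrary-precision path.
/// }
/// ```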
| pub fn bellerophon<F: Float>(num: &Number) -> ExtendedFloat { |
| let fp_zero = ExtendedFloat { |
| mant: 0, |
| exp: 0, |
| }; |
| let fp_inf = ExtendedFloat { |
| mant: 0, |
| exp: F::INFINITE_POWER, |
| }; |
| |
| // Early short-circuit, in case of literal 0 or infinity. |
| // This allows us to avoid narrow casts causing numeric overflow, |
| // and is a quick check for any radix. |
| if num.mantissa == 0 || num.exponent <= -0x1000 { |
| return fp_zero; |
| } else if num.exponent >= 0x1000 { |
| return fp_inf; |
| } |
| |
| // Calculate our indexes for our extended-precision multiplication. |
| // This narrowing cast is safe, since exponent must be in a valid range. |
| let exponent = num.exponent as i32 + BASE10_POWERS.bias; |
| let small_index = exponent % BASE10_POWERS.step; |
| let large_index = exponent / BASE10_POWERS.step; |
| |
| if exponent < 0 { |
| // Guaranteed underflow (assign 0). |
| return fp_zero; |
| } |
| if large_index as usize >= BASE10_POWERS.large.len() { |
| // Overflow (assign infinity) |
| return fp_inf; |
| } |
| |
| // Within the valid exponent range, multiply by the large and small |
| // exponents and return the resulting value. |
| |
    // Track errors as a factor of the unit in the last place (ULP).
| let mut errors: u32 = 0; |
| if num.many_digits { |
| errors += error_halfscale(); |
| } |
| |
    // Multiply by the small power.
    // Check if we can directly multiply by an integer; if not,
    // use extended-precision multiplication.
| let mut fp = ExtendedFloat { |
| mant: num.mantissa, |
| exp: 0, |
| }; |
| match fp.mant.overflowing_mul(BASE10_POWERS.get_small_int(small_index as usize)) { |
| // Overflow, multiplication unsuccessful, go slow path. |
| (_, true) => { |
| normalize(&mut fp); |
| fp = mul(&fp, &BASE10_POWERS.get_small(small_index as usize)); |
| errors += error_halfscale(); |
| }, |
| // No overflow, multiplication successful. |
| (mant, false) => { |
| fp.mant = mant; |
| normalize(&mut fp); |
| }, |
| } |
| |
| // Multiply by the large power. |
| fp = mul(&fp, &BASE10_POWERS.get_large(large_index as usize)); |
| if errors > 0 { |
| errors += 1; |
| } |
| errors += error_halfscale(); |
| |
| // Normalize the floating point (and the errors). |
| let shift = normalize(&mut fp); |
| errors <<= shift; |
| fp.exp += F::EXPONENT_BIAS; |
| |
    // Check for literal underflow, even with halfway cases.
| if -fp.exp + 1 > 65 { |
| return fp_zero; |
| } |
| |
| // Too many errors accumulated, return an error. |
| if !error_is_accurate::<F>(errors, &fp) { |
| // Bias the exponent so we know it's invalid. |
| fp.exp += F::INVALID_FP; |
| return fp; |
| } |
| |
    // Check if we have a literal 0 here.
    // If we have an exponent of -63, we can still have a valid shift,
    // giving a case where we have too many errors and need to round up.
    if -fp.exp + 1 == 65 {
        // More than 64 bits below the minimum exponent: the value must be 0.
| return fp_zero; |
| } |
| |
| round::<F, _>(&mut fp, |f, s| { |
| round_nearest_tie_even(f, s, |is_odd, is_halfway, is_above| { |
| is_above || (is_odd && is_halfway) |
| }); |
| }); |
| fp |
| } |
| |
| // ERRORS |
| // ------ |
| |
// Determine if the errors in calculating the extended-precision float
// prevent us from unambiguously rounding to the nearest native float.
//
// Specifically, we want to know if we are close to a halfway
// representation, that is, halfway between `b` and `b+1` (also written
// `b+h`). The halfway representation has the form:
| // SEEEEEEEHMMMMMMMMMMMMMMMMMMMMMMM100... |
| // where: |
| // S = Sign Bit |
| // E = Exponent Bits |
| // H = Hidden Bit |
| // M = Mantissa Bits |
| // |
| // The halfway representation has a bit set 1-after the mantissa digits, |
| // and no bits set immediately afterward, making it impossible to |
| // round between `b` and `b+1` with this representation. |
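//
// For example, the decimal string "16777217" (2^24 + 1) lies exactly
// halfway between the adjacent `f32` values 16777216.0 and 16777218.0:
// its significand is the 24-bit mantissa of `b` followed by the
// truncated halfway pattern `1000...`.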
| |
| /// Get the full error scale. |
| #[inline(always)] |
| const fn error_scale() -> u32 { |
| 8 |
| } |
| |
| /// Get the half error scale. |
| #[inline(always)] |
| const fn error_halfscale() -> u32 { |
| error_scale() / 2 |
| } |
| |
| /// Determine if the number of errors is tolerable for float precision. |
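///
/// A hedged sketch for `f64` (assuming `f64` implements `Float` with a
/// 52-bit `MANTISSA_SIZE`, so the truncated region is the low 11 bits
/// and the halfway point is `0x400`):
///
/// ```rust,ignore
/// // Normalized mantissa whose low 11 bits sit exactly at the halfway point.
/// let fp = ExtendedFloat { mant: (1 << 63) | 0x400, exp: -4 };
/// // With no accumulated error, the exact halfway case rounds
/// // unambiguously (ties-to-even handles it).
/// assert!(error_is_accurate::<f64>(0, &fp));
/// // With 4 units of accumulated error, the true value could fall on
/// // either side of halfway, so the representation is rejected.
/// assert!(!error_is_accurate::<f64>(4, &fp));
/// ```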
| fn error_is_accurate<F: Float>(errors: u32, fp: &ExtendedFloat) -> bool { |
| // Check we can't have a literal 0 denormal float. |
| debug_assert!(fp.exp >= -64); |
| |
    // Determine if the extended-precision float is a good approximation.
    // If the error has affected too many units, or the representation is
    // so close to halfway that the accumulated error could flip the
    // rounding decision, the float will be inaccurate.
    // See the documentation for dtoa for more information.
| |
    // This is always a valid u32, since `64 - F::MANTISSA_SIZE - 1`
    // is always positive, as the significand size is 23 or 52 bits.
| let mantissa_shift = 64 - F::MANTISSA_SIZE - 1; |
| |
    // The unbiased exponent check is `unbiased_exp <= F::MANTISSA_SIZE
    // - F::EXPONENT_BIAS - 64 + 1`, that is, `biased_exp <= F::MANTISSA_SIZE - 63`,
    // or `biased_exp <= -mantissa_shift`.
| let extrabits = match fp.exp <= -mantissa_shift { |
| // Denormal, since shifting to the hidden bit still has a negative exponent. |
| // The unbiased check calculation for bits is `1 - F::EXPONENT_BIAS - unbiased_exp`, |
| // or `1 - biased_exp`. |
| true => 1 - fp.exp, |
| false => 64 - F::MANTISSA_SIZE - 1, |
| }; |
| |
    // Our logic is as follows: we want to determine if the actual
    // mantissa and the errors during calculation differ significantly
    // from the rounding point. The rounding point for round-nearest
    // is the halfway point, i.e., when the truncated bits start
    // with b1000..., while the rounding point for round-toward-zero
    // is when the truncated bits are all 0.
    // To do so, we check whether the rounding point +/- the error
    // is above/below the actual lower n bits.
| // |
| // For whether we need to use signed or unsigned types for this |
| // analysis, see this example, using u8 rather than u64 to simplify |
| // things. |
| // |
    // # Comparisons
    // cmp1 = (halfway - errors) < extra
    // cmp2 = extra < (halfway + errors)
| // |
| // # Large Extrabits, Low Errors |
| // |
| // extrabits = 8 |
| // halfway = 0b10000000 |
| // extra = 0b10000010 |
| // errors = 0b00000100 |
| // halfway - errors = 0b01111100 |
| // halfway + errors = 0b10000100 |
| // |
| // Unsigned: |
| // halfway - errors = 124 |
| // halfway + errors = 132 |
| // extra = 130 |
| // cmp1 = true |
| // cmp2 = true |
| // Signed: |
| // halfway - errors = 124 |
| // halfway + errors = -124 |
| // extra = -126 |
| // cmp1 = false |
| // cmp2 = true |
| // |
| // # Conclusion |
| // |
| // Since errors will always be small, and since we want to detect |
| // if the representation is accurate, we need to use an **unsigned** |
| // type for comparisons. |
| let maskbits = extrabits as u64; |
| let errors = errors as u64; |
| |
| // Round-to-nearest, need to use the halfway point. |
| if extrabits > 64 { |
        // Underflow: we have a shift larger than the mantissa.
        // The representation is valid **only** if adding the errors
        // cannot overflow into the next bit. If the addition overflows,
        // the representation is **not** valid.
| !fp.mant.overflowing_add(errors).1 |
| } else { |
| let mask = lower_n_mask(maskbits); |
| let extra = fp.mant & mask; |
| |
        // Round-to-nearest, need to check if we're close to halfway,
        // i.e., `b10100 | 100000`, where `|` marks the truncation point.
| let halfway = lower_n_halfway(maskbits); |
| let cmp1 = halfway.wrapping_sub(errors) < extra; |
| let cmp2 = extra < halfway.wrapping_add(errors); |
| |
| // If both comparisons are true, we have significant rounding error, |
| // and the value cannot be exactly represented. Otherwise, the |
| // representation is valid. |
| !(cmp1 && cmp2) |
| } |
| } |
| |
| // MATH |
| // ---- |
| |
/// Normalize a floating-point number.
///
/// Shift the mantissa so the number of leading zeros is 0, or leave the
/// value unchanged if it is 0.
///
/// Returns the number of bits shifted.
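///
/// A small worked example (illustrative values only):
///
/// ```rust,ignore
/// let mut fp = ExtendedFloat { mant: 1, exp: 0 };
/// let shift = normalize(&mut fp);
/// assert_eq!(shift, 63);
/// assert_eq!(fp.mant, 1 << 63);
/// assert_eq!(fp.exp, -63); // value preserved: 1 * 2^0 == 2^63 * 2^-63
/// ```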
| pub fn normalize(fp: &mut ExtendedFloat) -> i32 { |
| // Note: |
    // Using the ctlz intrinsic via leading_zeros is way faster (~10x)
    // than shifting 1 bit at a time via a while loop, and also way
    // faster (~2x) than an unrolled loop that checks at 32, 16, 8, 4,
    // 2, and 1 bit.
| // |
| // Using a modulus of pow2 (which will get optimized to a bitwise |
| // and with 0x3F or faster) is slightly slower than an if/then, |
| // however, removing the if/then will likely optimize more branched |
| // code as it removes conditional logic. |
| |
    // Calculate the number of leading zeros and shift them out, guarding
    // against `fp.mant == 0` to avoid an overflowing left shift.
| if fp.mant != 0 { |
| let shift = fp.mant.leading_zeros() as i32; |
| fp.mant <<= shift; |
| fp.exp -= shift; |
| shift |
| } else { |
| 0 |
| } |
| } |
| |
| /// Multiply two normalized extended-precision floats, as if by `a*b`. |
| /// |
/// The precision is maximal when the numbers are normalized; however,
/// decent precision will occur as long as both values have their high
/// bits set. The result is not normalized.
| /// |
| /// Algorithm: |
| /// 1. Non-signed multiplication of mantissas (requires 2x as many bits as input). |
| /// 2. Normalization of the result (not done here). |
| /// 3. Addition of exponents. |
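///
/// A hedged sketch of the result shape (illustrative values only):
///
/// ```rust,ignore
/// // 1.0 * 1.0, with both mantissas normalized to the high bit.
/// let x = ExtendedFloat { mant: 1 << 63, exp: -63 };
/// let z = mul(&x, &x);
/// assert_eq!(z.mant, 1 << 62); // high bit at position 62: not normalized
/// assert_eq!(z.exp, -62);      // 2^62 * 2^-62 == 1.0
/// ```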
| pub fn mul(x: &ExtendedFloat, y: &ExtendedFloat) -> ExtendedFloat { |
| // Logic check, values must be decently normalized prior to multiplication. |
| debug_assert!(x.mant >> 32 != 0); |
| debug_assert!(y.mant >> 32 != 0); |
| |
| // Extract high-and-low masks. |
    // The mask is `u32::MAX`, written as a literal for older Rustc versions.
| const LOMASK: u64 = 0xffff_ffff; |
| let x1 = x.mant >> 32; |
| let x0 = x.mant & LOMASK; |
| let y1 = y.mant >> 32; |
| let y0 = y.mant & LOMASK; |
| |
| // Get our products |
| let x1_y0 = x1 * y0; |
| let x0_y1 = x0 * y1; |
| let x0_y0 = x0 * y0; |
| let x1_y1 = x1 * y1; |
| |
| let mut tmp = (x1_y0 & LOMASK) + (x0_y1 & LOMASK) + (x0_y0 >> 32); |
    // Round up: add the halfway point of the discarded low 32 bits.
| tmp += 1 << (32 - 1); |
| |
| ExtendedFloat { |
| mant: x1_y1 + (x1_y0 >> 32) + (x0_y1 >> 32) + (tmp >> 32), |
| exp: x.exp + y.exp + 64, |
| } |
| } |
| |
| // POWERS |
| // ------ |
| |
| /// Precalculated powers of base N for the Bellerophon algorithm. |
pub struct BellerophonPowers {
    /// Pre-calculated small powers.
    pub small: &'static [u64],
    /// Pre-calculated large powers.
    pub large: &'static [u64],
    /// Pre-calculated small powers as 64-bit integers.
    pub small_int: &'static [u64],
    /// Step between large powers and number of small powers.
    pub step: i32,
    /// Exponent bias for the large powers.
    pub bias: i32,
    /// ceil(log2(radix)) scaled as a multiplier.
    pub log2: i64,
    /// Bitshift for the log2 multiplier.
    pub log2_shift: i32,
}
| |
/// Allow indexing of the precalculated powers; callers must ensure the
/// index is within bounds.
| impl BellerophonPowers { |
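    /// Get the small power at `index` as an extended-precision float.
    ///
    /// A worked sketch of the exponent arithmetic below, assuming decimal
    /// powers (so `log2 / 2^log2_shift ≈ log2(10) ≈ 3.3219`):
    ///
    /// ```rust,ignore
    /// // index 2 → 10^2: exp = (1 - 64) + floor(2 * log2(10)) = -63 + 6 = -57,
    /// // and the stored mant is 100 << 57, so mant * 2^exp == 100.
    /// ```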
| #[inline] |
| pub fn get_small(&self, index: usize) -> ExtendedFloat { |
| let mant = self.small[index]; |
| let exp = (1 - 64) + ((self.log2 * index as i64) >> self.log2_shift); |
| ExtendedFloat { |
| mant, |
| exp: exp as i32, |
| } |
| } |
| |
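    /// Get the large power at `index` as an extended-precision float,
    /// removing the table's exponent bias.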
| #[inline] |
| pub fn get_large(&self, index: usize) -> ExtendedFloat { |
| let mant = self.large[index]; |
| let biased_e = index as i64 * self.step as i64 - self.bias as i64; |
| let exp = (1 - 64) + ((self.log2 * biased_e) >> self.log2_shift); |
| ExtendedFloat { |
| mant, |
| exp: exp as i32, |
| } |
| } |
| |
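    /// Get the small power at `index` as a 64-bit integer.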
| #[inline] |
| pub fn get_small_int(&self, index: usize) -> u64 { |
| self.small_int[index] |
| } |
| } |