| //! Defines rounding schemes for floating-point numbers. |
| |
| #![doc(hidden)] |
| |
| use crate::extended_float::ExtendedFloat; |
| use crate::mask::{lower_n_halfway, lower_n_mask}; |
| use crate::num::Float; |
| |
| // ROUNDING |
| // -------- |
| |
| /// Round an extended-precision float to the nearest machine float. |
| /// |
| /// Shifts the significant digits into place, adjusts the exponent, |
| /// so it can be easily converted to a native float. |
| #[cfg_attr(not(feature = "compact"), inline)] |
| pub fn round<F, Cb>(fp: &mut ExtendedFloat, cb: Cb) |
| where |
| F: Float, |
| Cb: Fn(&mut ExtendedFloat, i32), |
| { |
| let fp_inf = ExtendedFloat { |
| mant: 0, |
| exp: F::INFINITE_POWER, |
| }; |
| |
| // Calculate our shift in significant digits. |
| let mantissa_shift = 64 - F::MANTISSA_SIZE - 1; |
| |
| // Check for a denormal float, if after the shift the exponent is negative. |
| if -fp.exp >= mantissa_shift { |
| // Have a denormal float that isn't a literal 0. |
| // The extra 1 is to adjust for the denormal float, which is |
| // `1 - F::EXPONENT_BIAS`. This works as before, because our |
| // old logic rounded to `F::DENORMAL_EXPONENT` (now 1), and then |
| // checked if `exp == F::DENORMAL_EXPONENT` and no hidden mask |
| // bit was set. Here, we handle that here, rather than later. |
| // |
| // This might round-down to 0, but shift will be at **max** 65, |
| // for halfway cases rounding towards 0. |
| let shift = -fp.exp + 1; |
| debug_assert!(shift <= 65); |
| cb(fp, shift.min(64)); |
| // Check for round-up: if rounding-nearest carried us to the hidden bit. |
| fp.exp = (fp.mant >= F::HIDDEN_BIT_MASK) as i32; |
| return; |
| } |
| |
| // The float is normal, round to the hidden bit. |
| cb(fp, mantissa_shift); |
| |
| // Check if we carried, and if so, shift the bit to the hidden bit. |
| let carry_mask = F::CARRY_MASK; |
| if fp.mant & carry_mask == carry_mask { |
| fp.mant >>= 1; |
| fp.exp += 1; |
| } |
| |
| // Handle if we carried and check for overflow again. |
| if fp.exp >= F::INFINITE_POWER { |
| // Exponent is above largest normal value, must be infinite. |
| *fp = fp_inf; |
| return; |
| } |
| |
| // Remove the hidden bit. |
| fp.mant &= F::MANTISSA_MASK; |
| } |
| |
| /// Shift right N-bytes and round towards a direction. |
| /// |
| /// Callback should take the following parameters: |
| /// 1. is_odd |
| /// 1. is_halfway |
| /// 1. is_above |
| #[cfg_attr(not(feature = "compact"), inline)] |
| pub fn round_nearest_tie_even<Cb>(fp: &mut ExtendedFloat, shift: i32, cb: Cb) |
| where |
| // is_odd, is_halfway, is_above |
| Cb: Fn(bool, bool, bool) -> bool, |
| { |
| // Ensure we've already handled denormal values that underflow. |
| debug_assert!(shift <= 64); |
| |
| // Extract the truncated bits using mask. |
| // Calculate if the value of the truncated bits are either above |
| // the mid-way point, or equal to it. |
| // |
| // For example, for 4 truncated bytes, the mask would be 0b1111 |
| // and the midway point would be 0b1000. |
| let mask = lower_n_mask(shift as u64); |
| let halfway = lower_n_halfway(shift as u64); |
| let truncated_bits = fp.mant & mask; |
| let is_above = truncated_bits > halfway; |
| let is_halfway = truncated_bits == halfway; |
| |
| // Bit shift so the leading bit is in the hidden bit. |
| // This optimixes pretty well: |
| // ```text |
| // mov ecx, esi |
| // shr rdi, cl |
| // xor eax, eax |
| // cmp esi, 64 |
| // cmovne rax, rdi |
| // ret |
| // ``` |
| fp.mant = match shift == 64 { |
| true => 0, |
| false => fp.mant >> shift, |
| }; |
| fp.exp += shift; |
| |
| // Extract the last bit after shifting (and determine if it is odd). |
| let is_odd = fp.mant & 1 == 1; |
| |
| // Calculate if we need to roundup. |
| // We need to roundup if we are above halfway, or if we are odd |
| // and at half-way (need to tie-to-even). Avoid the branch here. |
| fp.mant += cb(is_odd, is_halfway, is_above) as u64; |
| } |
| |
| /// Round our significant digits into place, truncating them. |
| #[cfg_attr(not(feature = "compact"), inline)] |
| pub fn round_down(fp: &mut ExtendedFloat, shift: i32) { |
| // Might have a shift greater than 64 if we have an error. |
| fp.mant = match shift == 64 { |
| true => 0, |
| false => fp.mant >> shift, |
| }; |
| fp.exp += shift; |
| } |