//! A simple big-integer type for slow-path algorithms.
//!
//! This includes a minimal stack vector for use in big-integer arithmetic.
| |
| #![doc(hidden)] |
| |
| #[cfg(feature = "alloc")] |
| use crate::heapvec::HeapVec; |
| use crate::num::Float; |
| #[cfg(not(feature = "alloc"))] |
| use crate::stackvec::StackVec; |
| #[cfg(not(feature = "compact"))] |
| use crate::table::{LARGE_POW5, LARGE_POW5_STEP}; |
| use core::{cmp, ops, ptr}; |
| |
| /// Number of bits in a Bigint. |
| /// |
/// This needs to be at least the number of bits required to store
/// the largest value a `Bigint` can hold, which is `log2(radix**digits)`,
/// or ≅ 3600 for base-10, rounded up.
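///
/// A quick sanity check of that bound, as a sketch using `f64` logarithms
/// (the digit count `1091` comes from the discussion on `Bigint::data`
/// below):
///
/// ```rust
/// // log2(10^1091) = 1091 * log2(10) ≈ 3624 bits, which 4000 covers.
/// let bits = (1091.0_f64 * 10.0_f64.log2()).ceil() as u32;
/// assert!(bits <= 4000);
/// ```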
| pub const BIGINT_BITS: usize = 4000; |
| |
| /// The number of limbs for the bigint. |
| pub const BIGINT_LIMBS: usize = BIGINT_BITS / LIMB_BITS; |
| |
/// Vector type for the big integer, heap-allocated when `alloc` is enabled.
#[cfg(feature = "alloc")]
pub type VecType = HeapVec;

/// Vector type for the big integer, stack-allocated without `alloc`.
#[cfg(not(feature = "alloc"))]
pub type VecType = StackVec;
| |
| /// Storage for a big integer type. |
| /// |
| /// This is used for algorithms when we have a finite number of digits. |
| /// Specifically, it stores all the significant digits scaled to the |
| /// proper exponent, as an integral type, and then directly compares |
| /// these digits. |
| /// |
| /// This requires us to store the number of significant bits, plus the |
| /// number of exponent bits (required) since we scale everything |
| /// to the same exponent. |
| #[derive(Clone, PartialEq, Eq)] |
| pub struct Bigint { |
| /// Significant digits for the float, stored in a big integer in LE order. |
| /// |
/// This is pretty much the same number of digits for any radix, since the
/// significant digits balance out the zeros from the exponent:
| /// 1. Decimal is 1091 digits, 767 mantissa digits + 324 exponent zeros. |
| /// 2. Base 6 is 1097 digits, or 680 mantissa digits + 417 exponent zeros. |
| /// 3. Base 36 is 1086 digits, or 877 mantissa digits + 209 exponent zeros. |
| /// |
| /// However, the number of bytes required is larger for large radixes: |
| /// for decimal, we need `log2(10**1091) ≅ 3600`, while for base 36 |
| /// we need `log2(36**1086) ≅ 5600`. Since we use uninitialized data, |
| /// we avoid a major performance hit from the large buffer size. |
| pub data: VecType, |
| } |
| |
| #[allow(clippy::new_without_default)] |
| impl Bigint { |
| /// Construct a bigint representing 0. |
| #[inline(always)] |
| pub fn new() -> Self { |
| Self { |
| data: VecType::new(), |
| } |
| } |
| |
| /// Construct a bigint from an integer. |
| #[inline(always)] |
| pub fn from_u64(value: u64) -> Self { |
| Self { |
| data: VecType::from_u64(value), |
| } |
| } |
| |
/// Get the high 64 bits from the big integer, and if any lower bits are
/// truncated.
#[inline(always)]
pub fn hi64(&self) -> (u64, bool) {
| self.data.hi64() |
| } |
| |
/// Multiply the big integer by `base^exp`, where `base` is 2, 5, or 10.
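///
/// Since `10^exp == 5^exp * 2^exp`, the power-of-two part reduces to a
/// left shift. A sketch of the decomposition with plain integers:
///
/// ```rust
/// let via_factors = 5u128.pow(7) << 7; // 5^7 * 2^7
/// assert_eq!(10u128.pow(7), via_factors);
/// ```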
| #[inline] |
| pub fn pow(&mut self, base: u32, exp: u32) -> Option<()> { |
| debug_assert!(base == 2 || base == 5 || base == 10); |
| if base % 5 == 0 { |
| pow(&mut self.data, exp)?; |
| } |
| if base % 2 == 0 { |
| shl(&mut self.data, exp as usize)?; |
| } |
| Some(()) |
| } |
| |
| /// Calculate the bit-length of the big-integer. |
| #[inline] |
| pub fn bit_length(&self) -> u32 { |
| bit_length(&self.data) |
| } |
| } |
| |
| impl ops::MulAssign<&Bigint> for Bigint { |
| fn mul_assign(&mut self, rhs: &Bigint) { |
| self.data *= &rhs.data; |
| } |
| } |
| |
// REVERSE VIEW
// ------------
| |
| /// Reverse, immutable view of a sequence. |
| pub struct ReverseView<'a, T: 'a> { |
| inner: &'a [T], |
| } |
| |
| impl<'a, T> ops::Index<usize> for ReverseView<'a, T> { |
| type Output = T; |
| |
| #[inline] |
| fn index(&self, index: usize) -> &T { |
| let len = self.inner.len(); |
| &(*self.inner)[len - index - 1] |
| } |
| } |
| |
| /// Create a reverse view of the vector for indexing. |
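///
/// A sketch of the indexing rule with a plain slice:
///
/// ```rust
/// let x = [1u64, 2, 3]; // little-endian limbs
/// let len = x.len();
/// let rget = |i: usize| x[len - i - 1];
/// assert_eq!(rget(0), 3); // most-significant limb first
/// ```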
| #[inline] |
| pub fn rview(x: &[Limb]) -> ReverseView<Limb> { |
| ReverseView { |
| inner: x, |
| } |
| } |
| |
| // COMPARE |
| // ------- |
| |
| /// Compare `x` to `y`, in little-endian order. |
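///
/// Since limbs are little-endian, equal-length values compare from the
/// most-significant limb down. A sketch with plain `u64` limbs:
///
/// ```rust
/// use core::cmp::Ordering;
/// let x = [1u64, 1]; // 2^64 + 1
/// let y = [0u64, 1]; // 2^64
/// assert_eq!(x.iter().rev().cmp(y.iter().rev()), Ordering::Greater);
/// ```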
| #[inline] |
| pub fn compare(x: &[Limb], y: &[Limb]) -> cmp::Ordering { |
| match x.len().cmp(&y.len()) { |
| cmp::Ordering::Equal => { |
| let iter = x.iter().rev().zip(y.iter().rev()); |
| for (&xi, yi) in iter { |
| match xi.cmp(yi) { |
| cmp::Ordering::Equal => (), |
| ord => return ord, |
| } |
| } |
| // Equal case. |
| cmp::Ordering::Equal |
| }, |
| ord => ord, |
| } |
| } |
| |
| // NORMALIZE |
| // --------- |
| |
| /// Normalize the integer, so any leading zero values are removed. |
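///
/// A sketch of the trimming logic, using a plain `Vec` for the storage:
///
/// ```rust
/// let mut x = vec![5u64, 0, 0]; // 5, with redundant high limbs
/// while x.last() == Some(&0) {
///     x.pop();
/// }
/// assert_eq!(x, [5]);
/// ```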
| #[inline] |
| pub fn normalize(x: &mut VecType) { |
| // We don't care if this wraps: the index is bounds-checked. |
| while let Some(&value) = x.get(x.len().wrapping_sub(1)) { |
| if value == 0 { |
// SAFETY: safe, since `x.len() > 0`: `get` just returned a value.
unsafe { x.set_len(x.len() - 1) };
| } else { |
| break; |
| } |
| } |
| } |
| |
| /// Get if the big integer is normalized. |
| #[inline] |
| #[allow(clippy::match_like_matches_macro)] |
| pub fn is_normalized(x: &[Limb]) -> bool { |
| // We don't care if this wraps: the index is bounds-checked. |
| match x.get(x.len().wrapping_sub(1)) { |
| Some(&0) => false, |
| _ => true, |
| } |
| } |
| |
| // FROM |
| // ---- |
| |
/// Create a `VecType` from a `u64` value.
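///
/// With 32-bit limbs, the `u64` is split into low and high halves, stored
/// in little-endian order. A sketch of the split:
///
/// ```rust
/// let x: u64 = 0x1_0000_0002;
/// let (lo, hi) = (x as u32, (x >> 32) as u32);
/// assert_eq!((lo, hi), (2, 1)); // limbs: [2, 1]
/// ```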
| #[inline(always)] |
| #[allow(clippy::branches_sharing_code)] |
| pub fn from_u64(x: u64) -> VecType { |
| let mut vec = VecType::new(); |
| debug_assert!(vec.capacity() >= 2); |
| if LIMB_BITS == 32 { |
| vec.try_push(x as Limb).unwrap(); |
| vec.try_push((x >> 32) as Limb).unwrap(); |
| } else { |
| vec.try_push(x as Limb).unwrap(); |
| } |
| vec.normalize(); |
| vec |
| } |
| |
| // HI |
| // -- |
| |
/// Check if any of the limbs below the top `rindex` limbs are non-zero.
///
/// # Panics
///
/// Panics if `rindex > x.len()`.
| #[inline] |
| pub fn nonzero(x: &[Limb], rindex: usize) -> bool { |
| debug_assert!(rindex <= x.len()); |
| |
| let len = x.len(); |
| let slc = &x[..len - rindex]; |
| slc.iter().rev().any(|&x| x != 0) |
| } |
| |
| // These return the high X bits and if the bits were truncated. |
| |
| /// Shift 32-bit integer to high 64-bits. |
| #[inline] |
| pub fn u32_to_hi64_1(r0: u32) -> (u64, bool) { |
| u64_to_hi64_1(r0 as u64) |
| } |
| |
| /// Shift 2 32-bit integers to high 64-bits. |
| #[inline] |
| pub fn u32_to_hi64_2(r0: u32, r1: u32) -> (u64, bool) { |
| let r0 = (r0 as u64) << 32; |
| let r1 = r1 as u64; |
| u64_to_hi64_1(r0 | r1) |
| } |
| |
| /// Shift 3 32-bit integers to high 64-bits. |
| #[inline] |
| pub fn u32_to_hi64_3(r0: u32, r1: u32, r2: u32) -> (u64, bool) { |
| let r0 = r0 as u64; |
| let r1 = (r1 as u64) << 32; |
| let r2 = r2 as u64; |
| u64_to_hi64_2(r0, r1 | r2) |
| } |
| |
| /// Shift 64-bit integer to high 64-bits. |
| #[inline] |
| pub fn u64_to_hi64_1(r0: u64) -> (u64, bool) { |
| let ls = r0.leading_zeros(); |
| (r0 << ls, false) |
| } |
| |
| /// Shift 2 64-bit integers to high 64-bits. |
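///
/// The result is normalized so its most-significant bit is set, pulling in
/// bits from the low word; any low bits shifted out mark truncation. A
/// sketch of the logic:
///
/// ```rust
/// let (r0, r1) = (1u64, 1u64);
/// let ls = r0.leading_zeros(); // 63
/// let v = (r0 << ls) | (r1 >> (64 - ls));
/// assert_eq!(v, 1 << 63);
/// assert!(r1 << ls != 0); // the low word's bottom bit was truncated
/// ```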
| #[inline] |
| pub fn u64_to_hi64_2(r0: u64, r1: u64) -> (u64, bool) { |
| let ls = r0.leading_zeros(); |
| let rs = 64 - ls; |
| let v = match ls { |
| 0 => r0, |
| _ => (r0 << ls) | (r1 >> rs), |
| }; |
| let n = r1 << ls != 0; |
| (v, n) |
| } |
| |
| /// Extract the hi bits from the buffer. |
| macro_rules! hi { |
| // # Safety |
| // |
    // Safe as long as `stackvec.len() >= 1`.
| (@1 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{ |
| $fn($rview[0] as $t) |
| }}; |
| |
| // # Safety |
| // |
    // Safe as long as `stackvec.len() >= 2`.
| (@2 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{ |
| let r0 = $rview[0] as $t; |
| let r1 = $rview[1] as $t; |
| $fn(r0, r1) |
| }}; |
| |
| // # Safety |
| // |
    // Safe as long as `stackvec.len() >= 2`.
| (@nonzero2 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{ |
| let (v, n) = hi!(@2 $self, $rview, $t, $fn); |
(v, n || nonzero($self, 2))
| }}; |
| |
| // # Safety |
| // |
    // Safe as long as `stackvec.len() >= 3`.
| (@3 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{ |
| let r0 = $rview[0] as $t; |
| let r1 = $rview[1] as $t; |
| let r2 = $rview[2] as $t; |
| $fn(r0, r1, r2) |
| }}; |
| |
| // # Safety |
| // |
    // Safe as long as `stackvec.len() >= 3`.
| (@nonzero3 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{ |
| let (v, n) = hi!(@3 $self, $rview, $t, $fn); |
| (v, n || nonzero($self, 3)) |
| }}; |
| } |
| |
| /// Get the high 64 bits from the vector. |
| #[inline(always)] |
| pub fn hi64(x: &[Limb]) -> (u64, bool) { |
| let rslc = rview(x); |
// The match on the length guarantees each arm only indexes limbs
// that exist, so the bounds-checked indexing cannot panic.
| match x.len() { |
| 0 => (0, false), |
| 1 if LIMB_BITS == 32 => hi!(@1 x, rslc, u32, u32_to_hi64_1), |
| 1 => hi!(@1 x, rslc, u64, u64_to_hi64_1), |
| 2 if LIMB_BITS == 32 => hi!(@2 x, rslc, u32, u32_to_hi64_2), |
| 2 => hi!(@2 x, rslc, u64, u64_to_hi64_2), |
| _ if LIMB_BITS == 32 => hi!(@nonzero3 x, rslc, u32, u32_to_hi64_3), |
| _ => hi!(@nonzero2 x, rslc, u64, u64_to_hi64_2), |
| } |
| } |
| |
| // POWERS |
| // ------ |
| |
| /// MulAssign by a power of 5. |
| /// |
| /// Theoretically... |
| /// |
| /// Use an exponentiation by squaring method, since it reduces the time |
| /// complexity of the multiplication to ~`O(log(n))` for the squaring, |
| /// and `O(n*m)` for the result. Since `m` is typically a lower-order |
| /// factor, this significantly reduces the number of multiplications |
| /// we need to do. Iteratively multiplying by small powers follows |
| /// the nth triangular number series, which scales as `O(p^2)`, but |
| /// where `p` is `n+m`. In short, it scales very poorly. |
| /// |
/// Practically...
///
/// Exponentiation by Squaring:
///
/// ```text
/// running 2 tests
/// test bigcomp_f32_lexical ... bench: 1,018 ns/iter (+/- 78)
/// test bigcomp_f64_lexical ... bench: 3,639 ns/iter (+/- 1,007)
/// ```
///
/// Exponentiation by Iterative Small Powers:
///
/// ```text
/// running 2 tests
/// test bigcomp_f32_lexical ... bench: 518 ns/iter (+/- 31)
/// test bigcomp_f64_lexical ... bench: 583 ns/iter (+/- 47)
/// ```
///
/// Exponentiation by Iterative Large Powers (of 2):
///
/// ```text
/// running 2 tests
/// test bigcomp_f32_lexical ... bench: 671 ns/iter (+/- 31)
/// test bigcomp_f64_lexical ... bench: 1,394 ns/iter (+/- 47)
/// ```
///
/// The following benchmarks were run on `1 * 5^300`, using native `pow`,
/// a version with only small powers, and one with pre-computed powers
/// of `5^(3 * max_exp)`, rather than `5^(5 * max_exp)`. Using large
/// powers is crucial for good performance at higher exponents:
///
/// ```text
/// pow/default time: [426.20 ns 427.96 ns 429.89 ns]
/// pow/small time: [2.9270 us 2.9411 us 2.9565 us]
/// pow/large:3 time: [838.51 ns 842.21 ns 846.27 ns]
/// ```
| /// |
| /// Even using worst-case scenarios, exponentiation by squaring is |
| /// significantly slower for our workloads. Just multiply by small powers, |
| /// in simple cases, and use precalculated large powers in other cases. |
| /// |
/// Furthermore, using sufficiently large powers is also crucial for
/// performance. This is a tradeoff of binary size and performance, and
/// using a single value at ~`5^(5 * max_exp)` seems optimal.
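///
/// A sketch of the step decomposition with plain integers (the step sizes
/// here are illustrative, not the crate's actual table values):
///
/// ```rust
/// // Multiply by 5^23 using one large step of 5^10, then small steps.
/// let (mut acc, mut exp) = (1u128, 23u32);
/// let (large, large_step) = (5u128.pow(10), 10u32);
/// while exp >= large_step {
///     acc *= large;
///     exp -= large_step;
/// }
/// if exp != 0 {
///     acc *= 5u128.pow(exp);
/// }
/// assert_eq!(acc, 5u128.pow(23));
/// ```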
| pub fn pow(x: &mut VecType, mut exp: u32) -> Option<()> { |
// Minimize the number of iterations for large exponents: just
// do a few steps with large powers.
| #[cfg(not(feature = "compact"))] |
| { |
| while exp >= LARGE_POW5_STEP { |
| large_mul(x, &LARGE_POW5)?; |
| exp -= LARGE_POW5_STEP; |
| } |
| } |
| |
// Now use our pre-computed small powers iteratively.
// The step is `⌊log5(2^LIMB_BITS - 1)⌋`: the largest power of 5
// whose value still fits in a limb.
let small_step = if LIMB_BITS == 32 {
13
} else {
27
};
| let max_native = (5 as Limb).pow(small_step); |
| while exp >= small_step { |
| small_mul(x, max_native)?; |
| exp -= small_step; |
| } |
| if exp != 0 { |
| // SAFETY: safe, since `exp < small_step`. |
| let small_power = unsafe { f64::int_pow_fast_path(exp as usize, 5) }; |
| small_mul(x, small_power as Limb)?; |
| } |
| Some(()) |
| } |
| |
| // SCALAR |
| // ------ |
| |
/// Add two small integers, returning the result and whether overflow
/// occurred.
| #[inline(always)] |
| pub fn scalar_add(x: Limb, y: Limb) -> (Limb, bool) { |
| x.overflowing_add(y) |
| } |
| |
/// Multiply two small integers with carry, returning the result as
/// `(low, high)` components, where `high` is the overflow contribution.
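///
/// A sketch of the widening arithmetic with `u64` limbs and a `u128` wide
/// type, showing the worst case cannot overflow:
///
/// ```rust
/// let (x, y, carry) = (u64::MAX, u64::MAX, u64::MAX);
/// let z = (x as u128) * (y as u128) + (carry as u128);
/// let (lo, hi) = (z as u64, (z >> 64) as u64);
/// assert_eq!((lo, hi), (0, u64::MAX)); // z == 2^128 - 2^64 fits in u128
/// ```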
| #[inline(always)] |
| pub fn scalar_mul(x: Limb, y: Limb, carry: Limb) -> (Limb, Limb) { |
// Cannot overflow, as long as `Wide` is twice the width of `Limb`.
// This is because the following is always true:
// `Wide::MAX - (Narrow::MAX * Narrow::MAX) >= Narrow::MAX`
| let z: Wide = (x as Wide) * (y as Wide) + (carry as Wide); |
| (z as Limb, (z >> LIMB_BITS) as Limb) |
| } |
| |
| // SMALL |
| // ----- |
| |
| /// Add small integer to bigint starting from offset. |
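///
/// Carries propagate until absorbed, possibly growing the buffer by one
/// limb. A sketch with a plain `Vec`:
///
/// ```rust
/// // Adding 1 to [u64::MAX, u64::MAX] carries through to a new limb.
/// let mut x = vec![u64::MAX, u64::MAX];
/// let (mut carry, mut i) = (1u64, 0);
/// while carry != 0 && i < x.len() {
///     let (v, c) = x[i].overflowing_add(carry);
///     x[i] = v;
///     carry = c as u64;
///     i += 1;
/// }
/// if carry != 0 {
///     x.push(carry);
/// }
/// assert_eq!(x, [0, 0, 1]);
/// ```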
| #[inline] |
| pub fn small_add_from(x: &mut VecType, y: Limb, start: usize) -> Option<()> { |
| let mut index = start; |
| let mut carry = y; |
| while carry != 0 && index < x.len() { |
| let result = scalar_add(x[index], carry); |
| x[index] = result.0; |
| carry = result.1 as Limb; |
| index += 1; |
| } |
| // If we carried past all the elements, add to the end of the buffer. |
| if carry != 0 { |
| x.try_push(carry)?; |
| } |
| Some(()) |
| } |
| |
| /// Add small integer to bigint. |
| #[inline(always)] |
| pub fn small_add(x: &mut VecType, y: Limb) -> Option<()> { |
| small_add_from(x, y, 0) |
| } |
| |
| /// Multiply bigint by small integer. |
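///
/// Each limb gets a widening multiply, and the high half carries into the
/// next limb. A sketch with `u64` limbs via `u128`:
///
/// ```rust
/// let mut x = vec![u64::MAX, 1u64]; // 2^65 - 1
/// let y = 3u64;
/// let mut carry = 0u64;
/// for xi in x.iter_mut() {
///     let z = (*xi as u128) * (y as u128) + (carry as u128);
///     *xi = z as u64;
///     carry = (z >> 64) as u64;
/// }
/// if carry != 0 {
///     x.push(carry);
/// }
/// assert_eq!(x, [u64::MAX - 2, 5]); // 3 * (2^65 - 1)
/// ```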
| #[inline] |
| pub fn small_mul(x: &mut VecType, y: Limb) -> Option<()> { |
| let mut carry = 0; |
| for xi in x.iter_mut() { |
| let result = scalar_mul(*xi, y, carry); |
| *xi = result.0; |
| carry = result.1; |
| } |
| // If we carried past all the elements, add to the end of the buffer. |
| if carry != 0 { |
| x.try_push(carry)?; |
| } |
| Some(()) |
| } |
| |
| // LARGE |
| // ----- |
| |
| /// Add bigint to bigint starting from offset. |
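///
/// Adding `y` at limb offset `start` computes
/// `x + y * 2^(LIMB_BITS * start)`. A sketch of the offset semantics with
/// `u64` limbs packed into a `u128`:
///
/// ```rust
/// let x = 1u128 | (2u128 << 64); // limbs [1, 2]
/// let y = 3u128;                 // limbs [3], added at offset 1
/// assert_eq!(x + (y << 64), 1u128 | (5u128 << 64)); // limbs [1, 5]
/// ```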
| pub fn large_add_from(x: &mut VecType, y: &[Limb], start: usize) -> Option<()> { |
// The effective x buffer is from `start..x.len()`, so we need to treat
// that as the current range. If the effective y buffer is longer, we
// need to resize to that, plus the start index.
if y.len() > x.len().saturating_sub(start) {
// Bail out (return `None`) if we can't extend the buffer.
// This avoids any out-of-bounds access afterwards.
x.try_resize(y.len() + start, 0)?;
}
| } |
| |
| // Iteratively add elements from y to x. |
| let mut carry = false; |
| for (index, &yi) in y.iter().enumerate() { |
// We already returned `None` in `try_resize` if this index didn't exist.
| let xi = x.get_mut(start + index).unwrap(); |
| |
// Only one of the two adds can overflow, since we add at most
// `Limb::MAX + Limb::MAX`. Add the previous carry, and store
// the current carry for the next iteration.
| let result = scalar_add(*xi, yi); |
| *xi = result.0; |
| let mut tmp = result.1; |
| if carry { |
| let result = scalar_add(*xi, 1); |
| *xi = result.0; |
| tmp |= result.1; |
| } |
| carry = tmp; |
| } |
| |
| // Handle overflow. |
| if carry { |
| small_add_from(x, 1, y.len() + start)?; |
| } |
| Some(()) |
| } |
| |
| /// Add bigint to bigint. |
| #[inline(always)] |
| pub fn large_add(x: &mut VecType, y: &[Limb]) -> Option<()> { |
| large_add_from(x, y, 0) |
| } |
| |
| /// Grade-school multiplication algorithm. |
| /// |
/// Slow, naive algorithm, using limb-bit bases and just shifting left for
/// each iteration. This could be optimized with numerous other algorithms,
/// but it's extremely simple, and works in `O(n*m)` time, which is fine
/// by me. Each iteration, of which there are `m` iterations, requires
/// `n` multiplications, and `n` additions, or grade-school multiplication.
///
/// Don't use Karatsuba multiplication, since our implementation is
/// slower in practice, likely just due to the small sizes we deal
/// with here. For example, running on the following data:
| /// |
| /// ```text |
| /// const SMALL_X: &[u32] = &[ |
| /// 766857581, 3588187092, 1583923090, 2204542082, 1564708913, 2695310100, 3676050286, |
| /// 1022770393, 468044626, 446028186 |
| /// ]; |
| /// const SMALL_Y: &[u32] = &[ |
| /// 3945492125, 3250752032, 1282554898, 1708742809, 1131807209, 3171663979, 1353276095, |
| /// 1678845844, 2373924447, 3640713171 |
| /// ]; |
| /// const LARGE_X: &[u32] = &[ |
| /// 3647536243, 2836434412, 2154401029, 1297917894, 137240595, 790694805, 2260404854, |
| /// 3872698172, 690585094, 99641546, 3510774932, 1672049983, 2313458559, 2017623719, |
| /// 638180197, 1140936565, 1787190494, 1797420655, 14113450, 2350476485, 3052941684, |
| /// 1993594787, 2901001571, 4156930025, 1248016552, 848099908, 2660577483, 4030871206, |
| /// 692169593, 2835966319, 1781364505, 4266390061, 1813581655, 4210899844, 2137005290, |
| /// 2346701569, 3715571980, 3386325356, 1251725092, 2267270902, 474686922, 2712200426, |
| /// 197581715, 3087636290, 1379224439, 1258285015, 3230794403, 2759309199, 1494932094, |
| /// 326310242 |
| /// ]; |
| /// const LARGE_Y: &[u32] = &[ |
| /// 1574249566, 868970575, 76716509, 3198027972, 1541766986, 1095120699, 3891610505, |
| /// 2322545818, 1677345138, 865101357, 2650232883, 2831881215, 3985005565, 2294283760, |
| /// 3468161605, 393539559, 3665153349, 1494067812, 106699483, 2596454134, 797235106, |
| /// 705031740, 1209732933, 2732145769, 4122429072, 141002534, 790195010, 4014829800, |
| /// 1303930792, 3649568494, 308065964, 1233648836, 2807326116, 79326486, 1262500691, |
| /// 621809229, 2258109428, 3819258501, 171115668, 1139491184, 2979680603, 1333372297, |
| /// 1657496603, 2790845317, 4090236532, 4220374789, 601876604, 1828177209, 2372228171, |
| /// 2247372529 |
| /// ]; |
| /// ``` |
| /// |
/// We get the following results:
///
/// ```text
| /// mul/small:long time: [220.23 ns 221.47 ns 222.81 ns] |
| /// Found 4 outliers among 100 measurements (4.00%) |
| /// 2 (2.00%) high mild |
| /// 2 (2.00%) high severe |
| /// mul/small:karatsuba time: [233.88 ns 234.63 ns 235.44 ns] |
| /// Found 11 outliers among 100 measurements (11.00%) |
| /// 8 (8.00%) high mild |
| /// 3 (3.00%) high severe |
| /// mul/large:long time: [1.9365 us 1.9455 us 1.9558 us] |
| /// Found 12 outliers among 100 measurements (12.00%) |
| /// 7 (7.00%) high mild |
| /// 5 (5.00%) high severe |
| /// mul/large:karatsuba time: [4.4250 us 4.4515 us 4.4812 us] |
| /// ``` |
| /// |
/// In short, Karatsuba multiplication is never worthwhile for our use-case.
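///
/// A self-contained sketch of the classic nested-loop formulation, using
/// `u32` limbs and `u64` accumulators (the function below instead reuses
/// `small_mul` and `large_add_from`):
///
/// ```rust
/// fn long_mul_sketch(x: &[u32], y: &[u32]) -> Vec<u32> {
///     let mut z = vec![0u32; x.len() + y.len()];
///     for (i, &yi) in y.iter().enumerate() {
///         let mut carry = 0u64;
///         for (j, &xj) in x.iter().enumerate() {
///             let t = (xj as u64) * (yi as u64) + (z[i + j] as u64) + carry;
///             z[i + j] = t as u32;
///             carry = t >> 32;
///         }
///         z[i + x.len()] = carry as u32; // this slot is still untouched
///     }
///     while z.last() == Some(&0) {
///         z.pop(); // normalize
///     }
///     z
/// }
/// // (2^33 - 1) * 2 == [u32::MAX - 1, 3] in little-endian limbs.
/// assert_eq!(long_mul_sketch(&[u32::MAX, 1], &[2]), vec![u32::MAX - 1, 3]);
/// ```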
| pub fn long_mul(x: &[Limb], y: &[Limb]) -> Option<VecType> { |
// Using the immutable value, multiply by all the scalars in y, using
// the algorithm defined above. Use a single buffer to avoid
// frequent reallocations. Handle the first element separately to
// avoid a redundant addition.
| let mut z = VecType::try_from(x)?; |
| if !y.is_empty() { |
| let y0 = y[0]; |
| small_mul(&mut z, y0)?; |
| |
| for (index, &yi) in y.iter().enumerate().skip(1) { |
| if yi != 0 { |
| let mut zi = VecType::try_from(x)?; |
| small_mul(&mut zi, yi)?; |
| large_add_from(&mut z, &zi, index)?; |
| } |
| } |
| } |
| |
| z.normalize(); |
| Some(z) |
| } |
| |
| /// Multiply bigint by bigint using grade-school multiplication algorithm. |
| #[inline(always)] |
| pub fn large_mul(x: &mut VecType, y: &[Limb]) -> Option<()> { |
| // Karatsuba multiplication never makes sense, so just use grade school |
| // multiplication. |
if y.len() == 1 {
// Scalar multiplication is much cheaper for a single limb.
small_mul(x, y[0])?;
| } else { |
| *x = long_mul(y, x)?; |
| } |
| Some(()) |
| } |
| |
| // SHIFT |
| // ----- |
| |
| /// Shift-left `n` bits inside a buffer. |
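///
/// A sketch of the per-limb shift with `u8` "limbs", matching the example
/// in the comments below:
///
/// ```rust
/// let mut x = vec![0b0100_0010u8, 0b1010_0100]; // little-endian limbs
/// let n = 2;
/// let mut prev = 0u8;
/// for xi in x.iter_mut() {
///     let tmp = *xi;
///     *xi = (*xi << n) | (prev >> (8 - n));
///     prev = tmp;
/// }
/// let carry = prev >> (8 - n);
/// if carry != 0 {
///     x.push(carry);
/// }
/// assert_eq!(x, [0b0000_1000, 0b1001_0001, 0b10]);
/// ```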
| #[inline] |
| pub fn shl_bits(x: &mut VecType, n: usize) -> Option<()> { |
| debug_assert!(n != 0); |
| |
| // Internally, for each item, we shift left by n, and add the previous |
| // right shifted limb-bits. |
| // For example, we transform (for u8) shifted left 2, to: |
| // b10100100 b01000010 |
| // b10 b10010001 b00001000 |
| debug_assert!(n < LIMB_BITS); |
| let rshift = LIMB_BITS - n; |
| let lshift = n; |
| let mut prev: Limb = 0; |
| for xi in x.iter_mut() { |
| let tmp = *xi; |
| *xi <<= lshift; |
| *xi |= prev >> rshift; |
| prev = tmp; |
| } |
| |
// Push any bits shifted out of the top limb as a new limb.
| let carry = prev >> rshift; |
| if carry != 0 { |
| x.try_push(carry)?; |
| } |
| |
| Some(()) |
| } |
| |
/// Shift-left `n` limbs inside a buffer: in little-endian order, this
/// just prepends `n` zero limbs.
| #[inline] |
| pub fn shl_limbs(x: &mut VecType, n: usize) -> Option<()> { |
| debug_assert!(n != 0); |
| if n + x.len() > x.capacity() { |
| None |
| } else if !x.is_empty() { |
| let len = n + x.len(); |
// SAFETY: safe since x is not empty, and `x.len() + n <= x.capacity()`.
| unsafe { |
| // Move the elements. |
| let src = x.as_ptr(); |
| let dst = x.as_mut_ptr().add(n); |
| ptr::copy(src, dst, x.len()); |
| // Write our 0s. |
| ptr::write_bytes(x.as_mut_ptr(), 0, n); |
| x.set_len(len); |
| } |
| Some(()) |
| } else { |
| Some(()) |
| } |
| } |
| |
| /// Shift-left buffer by n bits. |
| #[inline] |
| pub fn shl(x: &mut VecType, n: usize) -> Option<()> { |
| let rem = n % LIMB_BITS; |
| let div = n / LIMB_BITS; |
| if rem != 0 { |
| shl_bits(x, rem)?; |
| } |
| if div != 0 { |
| shl_limbs(x, div)?; |
| } |
| Some(()) |
| } |
| |
| /// Get number of leading zero bits in the storage. |
| #[inline] |
| pub fn leading_zeros(x: &[Limb]) -> u32 { |
| let length = x.len(); |
// wrapping_sub is fine, since `get` just returns `None` out of bounds.
| if let Some(&value) = x.get(length.wrapping_sub(1)) { |
| value.leading_zeros() |
| } else { |
| 0 |
| } |
| } |
| |
| /// Calculate the bit-length of the big-integer. |
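///
/// A sketch of the formula with `u64` limbs:
///
/// ```rust
/// // Bit length = limbs * LIMB_BITS - leading zeros of the top limb.
/// let x = [0u64, 0b1000]; // 8 * 2^64 == 2^67
/// let nlz = x.last().unwrap().leading_zeros();
/// assert_eq!(64 * x.len() as u32 - nlz, 68);
/// ```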
| #[inline] |
| pub fn bit_length(x: &[Limb]) -> u32 { |
| let nlz = leading_zeros(x); |
| LIMB_BITS as u32 * x.len() as u32 - nlz |
| } |
| |
| // LIMB |
| // ---- |
| |
| // Type for a single limb of the big integer. |
| // |
| // A limb is analogous to a digit in base10, except, it stores 32-bit |
| // or 64-bit numbers instead. We want types where 64-bit multiplication |
| // is well-supported by the architecture, rather than emulated in 3 |
| // instructions. The quickest way to check this support is using a |
| // cross-compiler for numerous architectures, along with the following |
| // source file and command: |
| // |
| // Compile with `gcc main.c -c -S -O3 -masm=intel` |
| // |
| // And the source code is: |
| // ```text |
| // #include <stdint.h> |
| // |
| // struct i128 { |
| // uint64_t hi; |
| // uint64_t lo; |
| // }; |
| // |
| // // Type your code here, or load an example. |
| // struct i128 square(uint64_t x, uint64_t y) { |
| // __int128 prod = (__int128)x * (__int128)y; |
| // struct i128 z; |
| // z.hi = (uint64_t)(prod >> 64); |
| // z.lo = (uint64_t)prod; |
| // return z; |
| // } |
| // ``` |
| // |
| // If the result contains `call __multi3`, then the multiplication |
| // is emulated by the compiler. Otherwise, it's natively supported. |
| // |
| // This should be all-known 64-bit platforms supported by Rust. |
| // https://forge.rust-lang.org/platform-support.html |
| // |
| // # Supported |
| // |
| // Platforms where native 128-bit multiplication is explicitly supported: |
| // - x86_64 (Supported via `MUL`). |
| // - mips64 (Supported via `DMULTU`, which `HI` and `LO` can be read-from). |
| // - s390x (Supported via `MLGR`). |
| // |
| // # Efficient |
| // |
| // Platforms where native 64-bit multiplication is supported and |
| // you can extract hi-lo for 64-bit multiplications. |
| // - aarch64 (Requires `UMULH` and `MUL` to capture high and low bits). |
| // - powerpc64 (Requires `MULHDU` and `MULLD` to capture high and low bits). |
| // - riscv64 (Requires `MUL` and `MULH` to capture high and low bits). |
| // |
| // # Unsupported |
| // |
| // Platforms where native 128-bit multiplication is not supported, |
| // requiring software emulation. |
// - sparc64 (`UMUL` only supports double-word arguments).
// - sparcv9 (Same as sparc64).
| // |
| // These tests are run via `xcross`, my own library for C cross-compiling, |
| // which supports numerous targets (far in excess of Rust's tier 1 support, |
| // or rust-embedded/cross's list). xcross may be found here: |
| // https://github.com/Alexhuszagh/xcross |
| // |
| // To compile for the given target, run: |
| // `xcross gcc main.c -c -S -O3 --target $target` |
| // |
| // All 32-bit architectures inherently do not have support. That means |
| // we can essentially look for 64-bit architectures that are not SPARC. |
| |
| #[cfg(all(target_pointer_width = "64", not(target_arch = "sparc")))] |
| pub type Limb = u64; |
| #[cfg(all(target_pointer_width = "64", not(target_arch = "sparc")))] |
| pub type Wide = u128; |
| #[cfg(all(target_pointer_width = "64", not(target_arch = "sparc")))] |
| pub const LIMB_BITS: usize = 64; |
| |
| #[cfg(not(all(target_pointer_width = "64", not(target_arch = "sparc"))))] |
| pub type Limb = u32; |
| #[cfg(not(all(target_pointer_width = "64", not(target_arch = "sparc"))))] |
| pub type Wide = u64; |
| #[cfg(not(all(target_pointer_width = "64", not(target_arch = "sparc"))))] |
| pub const LIMB_BITS: usize = 32; |