From dbd22cfdd804d592c15759ba4f266dd5a99535f8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 20 Nov 2015 01:03:24 -0900 Subject: [PATCH 1/6] fix some annoying build warnings --- benches/shootout-pidigits.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benches/shootout-pidigits.rs b/benches/shootout-pidigits.rs index 7ddb3d1..2cdcbda 100644 --- a/benches/shootout-pidigits.rs +++ b/benches/shootout-pidigits.rs @@ -93,7 +93,7 @@ fn pidigits(n: isize, out: &mut io::Write) -> io::Result<()> { let mut k = 0; let mut context = Context::new(); - for i in (1..(n+1)) { + for i in 1..(n+1) { let mut d; loop { k += 1; @@ -110,7 +110,7 @@ fn pidigits(n: isize, out: &mut io::Write) -> io::Result<()> { let m = n % 10; if m != 0 { - for _ in (m..10) { try!(write!(out, " ")); } + for _ in m..10 { try!(write!(out, " ")); } try!(write!(out, "\t:{}\n", n)); } Ok(()) From a1e57a48b2f46e6c8bcb94cbb8a90c4afeb7cf6f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 20 Nov 2015 01:05:36 -0900 Subject: [PATCH 2/6] Add multiplication/division benchmarks Add benchmarks that test multiplies/divides of different sizes --- benches/bigint.rs | 53 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/benches/bigint.rs b/benches/bigint.rs index 52b8590..79b5a1e 100644 --- a/benches/bigint.rs +++ b/benches/bigint.rs @@ -2,10 +2,33 @@ extern crate test; extern crate num; +extern crate rand; use std::mem::replace; use test::Bencher; use num::{BigUint, Zero, One, FromPrimitive}; +use num::bigint::RandBigInt; +use rand::{SeedableRng, StdRng}; + +fn multiply_bench(b: &mut Bencher, xbits: usize, ybits: usize) { + let seed: &[_] = &[1, 2, 3, 4]; + let mut rng: StdRng = SeedableRng::from_seed(seed); + + let x = rng.gen_bigint(xbits); + let y = rng.gen_bigint(ybits); + + b.iter(|| &x * &y); +} + +fn divide_bench(b: &mut Bencher, xbits: usize, ybits: usize) { + let seed: &[_] = &[1, 2, 3, 4]; + let mut rng: StdRng = SeedableRng::from_seed(seed); + + let x = rng.gen_bigint(xbits); + let y = rng.gen_bigint(ybits); + + b.iter(|| &x / &y); +} fn factorial(n: usize) -> BigUint { let mut f: BigUint = One::one(); @@ -26,6 +49,36 @@ fn fib(n: usize) -> BigUint { f0 } +#[bench] +fn multiply_0(b: &mut Bencher) { + multiply_bench(b, 1 << 8, 1 << 8); +} + +#[bench] +fn multiply_1(b: &mut Bencher) { + multiply_bench(b, 1 << 8, 1 << 16); +} + +#[bench] +fn multiply_2(b: &mut Bencher) { + multiply_bench(b, 1 << 16, 1 << 16); +} + +#[bench] +fn divide_0(b: &mut Bencher) { + divide_bench(b, 1 << 8, 1 << 6); +} + +#[bench] +fn divide_1(b: &mut Bencher) { + divide_bench(b, 1 << 12, 1 << 8); +} + +#[bench] +fn divide_2(b: &mut Bencher) { + divide_bench(b, 1 << 16, 1 << 12); +} + #[bench] fn factorial_100(b: &mut Bencher) { b.iter(|| { From 496ae0337c00715d5ab4507fdd1202d1a8f94e0a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 10 Dec 2015 12:33:28 -0900 Subject: [PATCH 3/6] Add add/subtract functions that work in place, on slices This is needed for the multiply optimizations in the next patch. --- src/bigint.rs | 193 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 137 insertions(+), 56 deletions(-) diff --git a/src/bigint.rs b/src/bigint.rs index b905294..ad0c2ed 100644 --- a/src/bigint.rs +++ b/src/bigint.rs @@ -116,6 +116,38 @@ pub mod big_digit { } } +/* + * Generic functions for add/subtract/multiply with carry/borrow: + */ + +// Add with carry: +#[inline] +fn adc(a: BigDigit, b: BigDigit, carry: &mut BigDigit) -> BigDigit { + let (hi, lo) = big_digit::from_doublebigdigit( + (a as DoubleBigDigit) + + (b as DoubleBigDigit) + + (*carry as DoubleBigDigit)); + + *carry = hi; + lo +} + +// Subtract with borrow: +#[inline] +fn sbb(a: BigDigit, b: BigDigit, borrow: &mut BigDigit) -> BigDigit { + let (hi, lo) = big_digit::from_doublebigdigit( + big_digit::BASE + + (a as DoubleBigDigit) + - (b as DoubleBigDigit) + - (*borrow as DoubleBigDigit)); + /* + hi * (base) + lo == 1*(base) + ai - bi - borrow + => ai - bi - borrow < 0 <=> hi == 0 + */ + *borrow = if hi == 0 { 1 } else { 0 }; + lo +} + /// A big unsigned integer type. /// /// A `BigUint`-typed value `BigUint { data: vec!(a, b, c) }` represents a number @@ -467,69 +499,112 @@ impl Unsigned for BigUint {} forward_all_binop_to_val_ref_commutative!(impl Add for BigUint, add); +// Only for the Add impl: +#[must_use] +#[inline] +fn __add2(a: &mut [BigDigit], b: &[BigDigit]) -> BigDigit { + let mut b_iter = b.iter(); + let mut carry = 0; + + for ai in a.iter_mut() { + if let Some(bi) = b_iter.next() { + *ai = adc(*ai, *bi, &mut carry); + } else if carry != 0 { + *ai = adc(*ai, 0, &mut carry); + } else { + break; + } + } + + debug_assert!(b_iter.next() == None); + carry +} + +/// /Two argument addition of raw slices: +/// a += b +/// +/// The caller _must_ ensure that a is big enough to store the result - typically this means +/// resizing a to max(a.len(), b.len()) + 1, to fit a possible carry. +fn add2(a: &mut [BigDigit], b: &[BigDigit]) { + let carry = __add2(a, b); + + debug_assert!(carry == 0); +} + +/* + * We'd really prefer to avoid using add2/sub2 directly as much as possible - since they make the + * caller entirely responsible for ensuring a's vector is big enough, and that the result is + * normalized, they're rather error prone and verbose: + * + * We could implement the Add and Sub traits for BigUint + BigDigit slices, like below - this works + * great, except that then it becomes the module's public interface, which we probably don't want: + * + * I'm keeping the code commented out, because I think this is worth revisiting: + +impl<'a> Add<&'a [BigDigit]> for BigUint { + type Output = BigUint; + + fn add(mut self, other: &[BigDigit]) -> BigUint { + if self.data.len() < other.len() { + let extra = other.len() - self.data.len(); + self.data.extend(repeat(0).take(extra)); + } + + let carry = __add2(&mut self.data[..], other); + if carry != 0 { + self.data.push(carry); + } + + self + } +} + */ + impl<'a> Add<&'a BigUint> for BigUint { type Output = BigUint; - fn add(self, other: &BigUint) -> BigUint { - let mut sum = self.data; - - if other.data.len() > sum.len() { - let additional = other.data.len() - sum.len(); - sum.reserve(additional); - sum.extend(repeat(ZERO_BIG_DIGIT).take(additional)); - } - let other_iter = other.data.iter().cloned().chain(repeat(ZERO_BIG_DIGIT)); - - let mut carry = 0; - for (a, b) in sum.iter_mut().zip(other_iter) { - let d = (*a as DoubleBigDigit) - + (b as DoubleBigDigit) - + (carry as DoubleBigDigit); - let (hi, lo) = big_digit::from_doublebigdigit(d); - carry = hi; - *a = lo; + fn add(mut self, other: &BigUint) -> BigUint { + if self.data.len() < other.data.len() { + let extra = other.data.len() - self.data.len(); + self.data.extend(repeat(0).take(extra)); } - if carry != 0 { sum.push(carry); } - BigUint::new(sum) + let carry = __add2(&mut self.data[..], &other.data[..]); + if carry != 0 { + self.data.push(carry); + } + + self } } forward_all_binop_to_val_ref!(impl Sub for BigUint, sub); +fn sub2(a: &mut [BigDigit], b: &[BigDigit]) { + let mut b_iter = b.iter(); + let mut borrow = 0; + + for ai in a.iter_mut() { + if let Some(bi) = b_iter.next() { + *ai = sbb(*ai, *bi, &mut borrow); + } else if borrow != 0 { + *ai = sbb(*ai, 0, &mut borrow); + } else { + break; + } + } + + /* note: we're _required_ to fail on underflow */ + assert!(borrow == 0 && b_iter.all(|x| *x == 0), + "Cannot subtract b from a because b is larger than a."); +} + impl<'a> Sub<&'a BigUint> for BigUint { type Output = BigUint; - fn sub(self, other: &BigUint) -> BigUint { - let mut diff = self.data; - let other = &other.data; - assert!(diff.len() >= other.len(), "arithmetic operation overflowed"); - - let mut borrow: DoubleBigDigit = 0; - for (a, &b) in diff.iter_mut().zip(other.iter()) { - let d = big_digit::BASE - borrow - + (*a as DoubleBigDigit) - - (b as DoubleBigDigit); - let (hi, lo) = big_digit::from_doublebigdigit(d); - /* - hi * (base) + lo == 1*(base) + ai - bi - borrow - => ai - bi - borrow < 0 <=> hi == 0 - */ - borrow = if hi == 0 { 1 } else { 0 }; - *a = lo; - } - - for a in &mut diff[other.len()..] { - if borrow == 0 { break } - let d = big_digit::BASE - borrow - + (*a as DoubleBigDigit); - let (hi, lo) = big_digit::from_doublebigdigit(d); - borrow = if hi == 0 { 1 } else { 0 }; - *a = lo; - } - - assert!(borrow == 0, "arithmetic operation overflowed"); - BigUint::new(diff) + fn sub(mut self, other: &BigUint) -> BigUint { + sub2(&mut self.data[..], &other.data[..]); + self.normalize() } } @@ -982,12 +1057,8 @@ impl BigUint { /// /// The digits are in little-endian base 2^32. #[inline] - pub fn new(mut digits: Vec) -> BigUint { - // omit trailing zeros - while let Some(&0) = digits.last() { - digits.pop(); - } - BigUint { data: digits } + pub fn new(digits: Vec) -> BigUint { + BigUint { data: digits }.normalize() } /// Creates and initializes a `BigUint`. @@ -1176,6 +1247,16 @@ impl BigUint { let zeros = self.data.last().unwrap().leading_zeros(); return self.data.len()*big_digit::BITS - zeros as usize; } + + /// Strips off trailing zero bigdigits - comparisons require the last element in the vector to + /// be nonzero. + #[inline] + fn normalize(mut self) -> BigUint { + while let Some(&0) = self.data.last() { + self.data.pop(); + } + self + } } // `DoubleBigDigit` size dependent From 08b0022aab249d16f1b63fef51b89b2f5bfff931 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 10 Dec 2015 15:25:42 -0900 Subject: [PATCH 4/6] Improve multiply performance The main idea here is to do as much as possible with slices, instead of allocating new BigUints (= heap allocations). Current performance: multiply_0: 10,507 ns/iter (+/- 987) multiply_1: 2,788,734 ns/iter (+/- 100,079) multiply_2: 69,923,515 ns/iter (+/- 4,550,902) After this patch, we get: multiply_0: 364 ns/iter (+/- 62) multiply_1: 34,085 ns/iter (+/- 1,179) multiply_2: 3,753,883 ns/iter (+/- 46,876) --- src/bigint.rs | 289 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 219 insertions(+), 70 deletions(-) diff --git a/src/bigint.rs b/src/bigint.rs index ad0c2ed..c47f99c 100644 --- a/src/bigint.rs +++ b/src/bigint.rs @@ -148,6 +148,16 @@ fn sbb(a: BigDigit, b: BigDigit, borrow: &mut BigDigit) -> BigDigit { lo } +#[inline] +fn mac_with_carry(a: BigDigit, b: BigDigit, c: BigDigit, carry: &mut BigDigit) -> BigDigit { + let (hi, lo) = big_digit::from_doublebigdigit( + (a as DoubleBigDigit) + + (b as DoubleBigDigit) * (c as DoubleBigDigit) + + (*carry as DoubleBigDigit)); + *carry = hi; + lo +} + /// A big unsigned integer type. /// /// A `BigUint`-typed value `BigUint { data: vec!(a, b, c) }` represents a number @@ -172,18 +182,25 @@ impl PartialOrd for BigUint { } } +fn cmp_slice(a: &[BigDigit], b: &[BigDigit]) -> Ordering { + debug_assert!(a.last() != Some(&0)); + debug_assert!(b.last() != Some(&0)); + + let (a_len, b_len) = (a.len(), b.len()); + if a_len < b_len { return Less; } + if a_len > b_len { return Greater; } + + for (&ai, &bi) in a.iter().rev().zip(b.iter().rev()) { + if ai < bi { return Less; } + if ai > bi { return Greater; } + } + return Equal; +} + impl Ord for BigUint { #[inline] fn cmp(&self, other: &BigUint) -> Ordering { - let (s_len, o_len) = (self.data.len(), other.data.len()); - if s_len < o_len { return Less; } - if s_len > o_len { return Greater; } - - for (&self_i, &other_i) in self.data.iter().rev().zip(other.data.iter().rev()) { - if self_i < other_i { return Less; } - if self_i > other_i { return Greater; } - } - return Equal; + cmp_slice(&self.data[..], &other.data[..]) } } @@ -608,80 +625,202 @@ impl<'a> Sub<&'a BigUint> for BigUint { } } +fn sub_sign(a: &[BigDigit], b: &[BigDigit]) -> BigInt { + // Normalize: + let a = &a[..a.iter().rposition(|&x| x != 0).map_or(0, |i| i + 1)]; + let b = &b[..b.iter().rposition(|&x| x != 0).map_or(0, |i| i + 1)]; -forward_all_binop_to_val_ref_commutative!(impl Mul for BigUint, mul); + match cmp_slice(a, b) { + Greater => { + let mut ret = BigUint::from_slice(a); + sub2(&mut ret.data[..], b); + BigInt::from_biguint(Plus, ret.normalize()) + }, + Less => { + let mut ret = BigUint::from_slice(b); + sub2(&mut ret.data[..], a); + BigInt::from_biguint(Minus, ret.normalize()) + }, + _ => Zero::zero(), + } +} -impl<'a> Mul<&'a BigUint> for BigUint { - type Output = BigUint; +forward_all_binop_to_ref_ref!(impl Mul for BigUint, mul); - fn mul(self, other: &BigUint) -> BigUint { - if self.is_zero() || other.is_zero() { return Zero::zero(); } +/// Three argument multiply accumulate: +/// acc += b * c +fn mac_digit(acc: &mut [BigDigit], b: &[BigDigit], c: BigDigit) { + if c == 0 { return; } - let (s_len, o_len) = (self.data.len(), other.data.len()); - if s_len == 1 { return mul_digit(other.clone(), self.data[0]); } - if o_len == 1 { return mul_digit(self, other.data[0]); } + let mut b_iter = b.iter(); + let mut carry = 0; - // Using Karatsuba multiplication - // (a1 * base + a0) * (b1 * base + b0) - // = a1*b1 * base^2 + - // (a1*b1 + a0*b0 - (a1-b0)*(b1-a0)) * base + - // a0*b0 - let half_len = cmp::max(s_len, o_len) / 2; - let (s_hi, s_lo) = cut_at(self, half_len); - let (o_hi, o_lo) = cut_at(other.clone(), half_len); - - let ll = &s_lo * &o_lo; - let hh = &s_hi * &o_hi; - let mm = { - let (s1, n1) = sub_sign(s_hi, s_lo); - let (s2, n2) = sub_sign(o_hi, o_lo); - match (s1, s2) { - (Equal, _) | (_, Equal) => &hh + &ll, - (Less, Greater) | (Greater, Less) => &hh + &ll + (n1 * n2), - (Less, Less) | (Greater, Greater) => &hh + &ll - (n1 * n2) - } - }; - - return ll + mm.shl_unit(half_len) + hh.shl_unit(half_len * 2); - - - fn mul_digit(a: BigUint, n: BigDigit) -> BigUint { - if n == 0 { return Zero::zero(); } - if n == 1 { return a; } - - let mut carry = 0; - let mut prod = a.data; - for a in &mut prod { - let d = (*a as DoubleBigDigit) - * (n as DoubleBigDigit) - + (carry as DoubleBigDigit); - let (hi, lo) = big_digit::from_doublebigdigit(d); - carry = hi; - *a = lo; - } - if carry != 0 { prod.push(carry); } - BigUint::new(prod) + for ai in acc.iter_mut() { + if let Some(bi) = b_iter.next() { + *ai = mac_with_carry(*ai, *bi, c, &mut carry); + } else if carry != 0 { + *ai = mac_with_carry(*ai, 0, c, &mut carry); + } else { + break; } + } - #[inline] - fn cut_at(mut a: BigUint, n: usize) -> (BigUint, BigUint) { - let mid = cmp::min(a.data.len(), n); - let hi = BigUint::from_slice(&a.data[mid ..]); - a.data.truncate(mid); - (hi, BigUint::new(a.data)) + assert!(carry == 0); +} + +/// Three argument multiply accumulate: +/// acc += b * c +fn mac3(acc: &mut [BigDigit], b: &[BigDigit], c: &[BigDigit]) { + let (x, y) = if b.len() < c.len() { (b, c) } else { (c, b) }; + + /* + * Karatsuba multiplication is slower than long multiplication for small x and y: + */ + if x.len() <= 4 { + for (i, xi) in x.iter().enumerate() { + mac_digit(&mut acc[i..], y, *xi); } + } else { + /* + * Karatsuba multiplication: + * + * The idea is that we break x and y up into two smaller numbers that each have about half + * as many digits, like so (note that multiplying by b is just a shift): + * + * x = x0 + x1 * b + * y = y0 + y1 * b + * + * With some algebra, we can compute x * y with three smaller products, where the inputs to + * each of the smaller products have only about half as many digits as x and y: + * + * x * y = (x0 + x1 * b) * (y0 + y1 * b) + * + * x * y = x0 * y0 + * + x0 * y1 * b + * + x1 * y0 * b + * + x1 * y1 * b^2 + * + * Let p0 = x0 * y0 and p2 = x1 * y1: + * + * x * y = p0 + * + (x0 * y1 + x1 * p0) * b + * + p2 * b^2 + * + * The real trick is that middle term: + * + * x0 * y1 + x1 * y0 + * + * = x0 * y1 + x1 * y0 - p0 + p0 - p2 + p2 + * + * = x0 * y1 + x1 * y0 - x0 * y0 - x1 * y1 + p0 + p2 + * + * Now we complete the square: + * + * = -(x0 * y0 - x0 * y1 - x1 * y0 + x1 * y1) + p0 + p2 + * + * = -((x1 - x0) * (y1 - y0)) + p0 + p2 + * + * Let p1 = (x1 - x0) * (y1 - y0), and substitute back into our original formula: + * + * x * y = p0 + * + (p0 + p2 - p1) * b + * + p2 * b^2 + * + * Where the three intermediate products are: + * + * p0 = x0 * y0 + * p1 = (x1 - x0) * (y1 - y0) + * p2 = x1 * y1 + * + * In doing the computation, we take great care to avoid unnecessary temporary variables + * (since creating a BigUint requires a heap allocation): thus, we rearrange the formula a + * bit so we can use the same temporary variable for all the intermediate products: + * + * x * y = p2 * b^2 + p2 * b + * + p0 * b + p0 + * - p1 * b + * + * The other trick we use is instead of doing explicit shifts, we slice acc at the + * appropriate offset when doing the add. + */ - #[inline] - fn sub_sign(a: BigUint, b: BigUint) -> (Ordering, BigUint) { - match a.cmp(&b) { - Less => (Less, b - a), - Greater => (Greater, a - b), - _ => (Equal, Zero::zero()) - } + /* + * When x is smaller than y, it's significantly faster to pick b such that x is split in + * half, not y: + */ + let b = x.len() / 2; + let (x0, x1) = x.split_at(b); + let (y0, y1) = y.split_at(b); + + /* We reuse the same BigUint for all the intermediate multiplies: */ + + let len = y.len() + 1; + let mut p: BigUint = BigUint { data: Vec::with_capacity(len) }; + p.data.extend(repeat(0).take(len)); + + // p2 = x1 * y1 + mac3(&mut p.data[..], x1, y1); + + // Not required, but the adds go faster if we drop any unneeded 0s from the end: + p = p.normalize(); + + add2(&mut acc[b..], &p.data[..]); + add2(&mut acc[b * 2..], &p.data[..]); + + // Zero out p before the next multiply: + p.data.truncate(0); + p.data.extend(repeat(0).take(len)); + + // p0 = x0 * y0 + mac3(&mut p.data[..], x0, y0); + p = p.normalize(); + + add2(&mut acc[..], &p.data[..]); + add2(&mut acc[b..], &p.data[..]); + + // p1 = (x1 - x0) * (y1 - y0) + // We do this one last, since it may be negative and acc can't ever be negative: + let j0 = sub_sign(x1, x0); + let j1 = sub_sign(y1, y0); + + match j0.sign * j1.sign { + Plus => { + p.data.truncate(0); + p.data.extend(repeat(0).take(len)); + + mac3(&mut p.data[..], &j0.data.data[..], &j1.data.data[..]); + p = p.normalize(); + + sub2(&mut acc[b..], &p.data[..]); + }, + Minus => { + mac3(&mut acc[b..], &j0.data.data[..], &j1.data.data[..]); + }, + NoSign => (), } } } +fn mul3(x: &[BigDigit], y: &[BigDigit]) -> BigUint { + let len = x.len() + y.len() + 1; + let mut prod: BigUint = BigUint { data: Vec::with_capacity(len) }; + + // resize isn't stable yet: + //prod.data.resize(len, 0); + prod.data.extend(repeat(0).take(len)); + + mac3(&mut prod.data[..], x, y); + prod.normalize() +} + +impl<'a, 'b> Mul<&'b BigUint> for &'a BigUint { + type Output = BigUint; + + #[inline] + fn mul(self, other: &BigUint) -> BigUint { + mul3(&self.data[..], &other.data[..]) + } +} forward_all_binop_to_ref_ref!(impl Div for BigUint, div); @@ -3131,6 +3270,16 @@ mod biguint_tests { // Switching u and l should fail: let _n: BigUint = rng.gen_biguint_range(&u, &l); } + + #[test] + fn test_sub_sign() { + use super::sub_sign; + let a = BigInt::from_str_radix("265252859812191058636308480000000", 10).unwrap(); + let b = BigInt::from_str_radix("26525285981219105863630848000000", 10).unwrap(); + + assert_eq!(sub_sign(&a.data.data[..], &b.data.data[..]), &a - &b); + assert_eq!(sub_sign(&b.data.data[..], &a.data.data[..]), &b - &a); + } } #[cfg(test)] From 79928b3185eb4b153745f152a9fcf0d9c67a15fd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 10 Dec 2015 12:40:45 -0900 Subject: [PATCH 5/6] Improve division performance Before: test divide_0 ... bench: 4,058 ns/iter (+/- 255) test divide_1 ... bench: 304,507 ns/iter (+/- 28,063) test divide_2 ... bench: 668,293,969 ns/iter (+/- 25,383,239) After: test divide_0 ... bench: 874 ns/iter (+/- 71) test divide_1 ... bench: 16,641 ns/iter (+/- 1,205) test divide_2 ... bench: 1,336,888 ns/iter (+/- 77,450) --- src/bigint.rs | 202 ++++++++++++++++++++++++++------------------------ 1 file changed, 107 insertions(+), 95 deletions(-) diff --git a/src/bigint.rs b/src/bigint.rs index c47f99c..382caab 100644 --- a/src/bigint.rs +++ b/src/bigint.rs @@ -158,6 +158,20 @@ fn mac_with_carry(a: BigDigit, b: BigDigit, c: BigDigit, carry: &mut BigDigit) - lo } +/// Divide a two digit numerator by a one digit divisor, returns quotient and remainder: +/// +/// Note: the caller must ensure that both the quotient and remainder will fit into a single digit. +/// This is _not_ true for an arbitrary numerator/denominator. +/// +/// (This function also matches what the x86 divide instruction does). +#[inline] +fn div_wide(hi: BigDigit, lo: BigDigit, divisor: BigDigit) -> (BigDigit, BigDigit) { + debug_assert!(hi < divisor); + + let lhs = big_digit::to_doublebigdigit(hi, lo); + let rhs = divisor as DoubleBigDigit; + ((lhs / rhs) as BigDigit, (lhs % rhs) as BigDigit) +} /// A big unsigned integer type. /// /// A `BigUint`-typed value `BigUint { data: vec!(a, b, c) }` represents a number @@ -822,6 +836,18 @@ impl<'a, 'b> Mul<&'b BigUint> for &'a BigUint { } } +fn div_rem_digit(mut a: BigUint, b: BigDigit) -> (BigUint, BigDigit) { + let mut rem = 0; + + for d in a.data.iter_mut().rev() { + let (q, r) = div_wide(rem, *d, b); + *d = q; + rem = r; + } + + (a.normalize(), rem) +} + forward_all_binop_to_ref_ref!(impl Div for BigUint, div); impl<'a, 'b> Div<&'b BigUint> for &'a BigUint { @@ -917,83 +943,94 @@ impl Integer for BigUint { if self.is_zero() { return (Zero::zero(), Zero::zero()); } if *other == One::one() { return (self.clone(), Zero::zero()); } + /* Required or the q_len calculation below can underflow: */ match self.cmp(other) { Less => return (Zero::zero(), self.clone()), Equal => return (One::one(), Zero::zero()), Greater => {} // Do nothing } - let mut shift = 0; - let mut n = *other.data.last().unwrap(); - while n < (1 << big_digit::BITS - 2) { - n <<= 1; - shift += 1; - } - assert!(shift < big_digit::BITS); - let (d, m) = div_mod_floor_inner(self << shift, other << shift); - return (d, m >> shift); + /* + * This algorithm is from Knuth, TAOCP vol 2 section 4.3, algorithm D: + * + * First, normalize the arguments so the highest bit in the highest digit of the divisor is + * set: the main loop uses the highest digit of the divisor for generating guesses, so we + * want it to be the largest number we can efficiently divide by. + */ + let shift = other.data.last().unwrap().leading_zeros() as usize; + let mut a = self << shift; + let b = other << shift; + /* + * The algorithm works by incrementally calculating "guesses", q0, for part of the + * remainder. Once we have any number q0 such that q0 * b <= a, we can set + * + * q += q0 + * a -= q0 * b + * + * and then iterate until a < b. Then, (q, a) will be our desired quotient and remainder. + * + * q0, our guess, is calculated by dividing the last few digits of a by the last digit of b + * - this should give us a guess that is "close" to the actual quotient, but is possibly + * greater than the actual quotient. If q0 * b > a, we simply use iterated subtraction + * until we have a guess such that q0 & b <= a. + */ - fn div_mod_floor_inner(a: BigUint, b: BigUint) -> (BigUint, BigUint) { - let mut m = a; - let mut d: BigUint = Zero::zero(); - let mut n = 1; - while m >= b { - let (d0, d_unit, b_unit) = div_estimate(&m, &b, n); - let mut d0 = d0; - let mut prod = &b * &d0; - while prod > m { - // FIXME(#5992): assignment operator overloads - // d0 -= &d_unit - d0 = d0 - &d_unit; - // FIXME(#5992): assignment operator overloads - // prod -= &b_unit; - prod = prod - &b_unit - } - if d0.is_zero() { - n = 2; - continue; - } - n = 1; - // FIXME(#5992): assignment operator overloads - // d += d0; - d = d + d0; - // FIXME(#5992): assignment operator overloads - // m -= prod; - m = m - prod; + let bn = *b.data.last().unwrap(); + let q_len = a.data.len() - b.data.len() + 1; + let mut q: BigUint = BigUint { data: Vec::with_capacity(q_len) }; + + q.data.extend(repeat(0).take(q_len)); + + /* + * We reuse the same temporary to avoid hitting the allocator in our inner loop - this is + * sized to hold a0 (in the common case; if a particular digit of the quotient is zero a0 + * can be bigger). + */ + let mut tmp: BigUint = BigUint { data: Vec::with_capacity(2) }; + + for j in (0..q_len).rev() { + /* + * When calculating our next guess q0, we don't need to consider the digits below j + * + b.data.len() - 1: we're guessing digit j of the quotient (i.e. q0 << j) from + * digit bn of the divisor (i.e. bn << (b.data.len() - 1) - so the product of those + * two numbers will be zero in all digits up to (j + b.data.len() - 1). + */ + let offset = j + b.data.len() - 1; + if offset >= a.data.len() { + continue; } - return (d, m); + + /* just avoiding a heap allocation: */ + let mut a0 = tmp; + a0.data.truncate(0); + a0.data.extend(a.data[offset..].iter().cloned()); + + /* + * q0 << j * big_digit::BITS is our actual quotient estimate - we do the shifts + * implicitly at the end, when adding and subtracting to a and q. Not only do we + * save the cost of the shifts, the rest of the arithmetic gets to work with + * smaller numbers. + */ + let (mut q0, _) = div_rem_digit(a0, bn); + let mut prod = &b * &q0; + + while cmp_slice(&prod.data[..], &a.data[j..]) == Greater { + let one: BigUint = One::one(); + q0 = q0 - one; + prod = prod - &b; + } + + add2(&mut q.data[j..], &q0.data[..]); + sub2(&mut a.data[j..], &prod.data[..]); + a = a.normalize(); + + tmp = q0; } + debug_assert!(a < b); - fn div_estimate(a: &BigUint, b: &BigUint, n: usize) - -> (BigUint, BigUint, BigUint) { - if a.data.len() < n { - return (Zero::zero(), Zero::zero(), (*a).clone()); - } - - let an = &a.data[a.data.len() - n ..]; - let bn = *b.data.last().unwrap(); - let mut d = Vec::with_capacity(an.len()); - let mut carry = 0; - for elt in an.iter().rev() { - let ai = big_digit::to_doublebigdigit(carry, *elt); - let di = ai / (bn as DoubleBigDigit); - assert!(di < big_digit::BASE); - carry = (ai % (bn as DoubleBigDigit)) as BigDigit; - d.push(di as BigDigit) - } - d.reverse(); - - let shift = (a.data.len() - an.len()) - (b.data.len() - 1); - if shift == 0 { - return (BigUint::new(d), One::one(), (*b).clone()); - } - let one: BigUint = One::one(); - return (BigUint::new(d).shl_unit(shift), - one.shl_unit(shift), - b.shl_unit(shift)); - } + (q.normalize(), a >> shift) } /// Calculates the Greatest Common Divisor (GCD) of the number and `other`. @@ -1146,43 +1183,18 @@ fn to_str_radix_reversed(u: &BigUint, radix: u32) -> Vec { vec![b'0'] } else { let mut res = Vec::new(); - let mut digits = u.data.to_vec(); + let mut digits = u.clone(); - while !digits.is_empty() { - let rem = div_rem_in_place(&mut digits, radix); - res.push(to_digit(rem as u8)); - - // If we finished the most significant digit, drop it - if let Some(&0) = digits.last() { - digits.pop(); - } + while digits != Zero::zero() { + let (q, r) = div_rem_digit(digits, radix as BigDigit); + res.push(to_digit(r as u8)); + digits = q; } res } } -fn div_rem_in_place(digits: &mut [BigDigit], divisor: BigDigit) -> BigDigit { - let mut rem = 0; - - for d in digits.iter_mut().rev() { - let (q, r) = full_div_rem(*d, divisor, rem); - *d = q; - rem = r; - } - - rem -} - -fn full_div_rem(a: BigDigit, b: BigDigit, borrow: BigDigit) -> (BigDigit, BigDigit) { - let lo = a as DoubleBigDigit; - let hi = borrow as DoubleBigDigit; - - let lhs = lo | (hi << big_digit::BITS); - let rhs = b as DoubleBigDigit; - ((lhs / rhs) as BigDigit, (lhs % rhs) as BigDigit) -} - fn to_digit(b: u8) -> u8 { match b { 0 ... 9 => b'0' + b, From fa372e230bd1d087ea38d4a954c899d28d4847ff Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 1 Dec 2015 05:25:44 -0900 Subject: [PATCH 6/6] Multiply/divide torture test --- src/bigint.rs | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/bigint.rs b/src/bigint.rs index 382caab..ebf4576 100644 --- a/src/bigint.rs +++ b/src/bigint.rs @@ -3292,6 +3292,42 @@ mod biguint_tests { assert_eq!(sub_sign(&a.data.data[..], &b.data.data[..]), &a - &b); assert_eq!(sub_sign(&b.data.data[..], &a.data.data[..]), &b - &a); } + + fn test_mul_divide_torture_count(count: usize) { + use rand::{SeedableRng, StdRng, Rng}; + + let bits_max = 1 << 12; + let seed: &[_] = &[1, 2, 3, 4]; + let mut rng: StdRng = SeedableRng::from_seed(seed); + + for _ in 0..count { + /* Test with numbers of random sizes: */ + let xbits = rng.gen_range(0, bits_max); + let ybits = rng.gen_range(0, bits_max); + + let x = rng.gen_biguint(xbits); + let y = rng.gen_biguint(ybits); + + if x.is_zero() || y.is_zero() { + continue; + } + + let prod = &x * &y; + assert_eq!(&prod / &x, y); + assert_eq!(&prod / &y, x); + } + } + + #[test] + fn test_mul_divide_torture() { + test_mul_divide_torture_count(1000); + } + + #[test] + #[ignore] + fn test_mul_divide_torture_long() { + test_mul_divide_torture_count(1000000); + } } #[cfg(test)]