Auto merge of #133 - koverstreet:master, r=cuviper

[RFC] Some performance improvements I added add and subtract methods that take an operand by value, instead of by reference, so they can work in place instead of always allocating new BigInts for the result. I'd like to hear if anyone knows a better way of doing this, though - with the various wrappers/forwardings doing this for all the operations is going to be a mess and add a ton of boilerplate. I haven't even tried to do a complete job of it yet. I'm starting to think the sanest thing to do would be to only have in place implements of most of the operations - and have the wrappers clone() if necessary. That way, the only extra work we're doing is a memcpy(), which is a hell of a lot faster than a heap allocation.
2015-12-11 10:13:09 +09:00 · 2015-12-11 10:13:09 +09:00 · f2ea30d9f4
parent a228385394 fa372e230b
commit f2ea30d9f4
3 changed files with 559 additions and 228 deletions
--- a/benches/bigint.rs
+++ b/benches/bigint.rs
@ -2,10 +2,33 @@

 extern crate test;
 extern crate num;
+extern crate rand;

 use std::mem::replace;
 use test::Bencher;
 use num::{BigUint, Zero, One, FromPrimitive};
+use num::bigint::RandBigInt;
+use rand::{SeedableRng, StdRng};
+
+fn multiply_bench(b: &mut Bencher, xbits: usize, ybits: usize) {
+    let seed: &[_] = &[1, 2, 3, 4];
+    let mut rng: StdRng = SeedableRng::from_seed(seed);
+
+    let x = rng.gen_bigint(xbits);
+    let y = rng.gen_bigint(ybits);
+
+    b.iter(|| &x * &y);
+}
+
+fn divide_bench(b: &mut Bencher, xbits: usize, ybits: usize) {
+    let seed: &[_] = &[1, 2, 3, 4];
+    let mut rng: StdRng = SeedableRng::from_seed(seed);
+
+    let x = rng.gen_bigint(xbits);
+    let y = rng.gen_bigint(ybits);
+
+    b.iter(|| &x / &y);
+}

 fn factorial(n: usize) -> BigUint {
    let mut f: BigUint = One::one();
@ -26,6 +49,36 @@ fn fib(n: usize) -> BigUint {
    f0
 }

+#[bench]
+fn multiply_0(b: &mut Bencher) {
+    multiply_bench(b, 1 << 8, 1 << 8);
+}
+
+#[bench]
+fn multiply_1(b: &mut Bencher) {
+    multiply_bench(b, 1 << 8, 1 << 16);
+}
+
+#[bench]
+fn multiply_2(b: &mut Bencher) {
+    multiply_bench(b, 1 << 16, 1 << 16);
+}
+
+#[bench]
+fn divide_0(b: &mut Bencher) {
+    divide_bench(b, 1 << 8, 1 << 6);
+}
+
+#[bench]
+fn divide_1(b: &mut Bencher) {
+    divide_bench(b, 1 << 12, 1 << 8);
+}
+
+#[bench]
+fn divide_2(b: &mut Bencher) {
+    divide_bench(b, 1 << 16, 1 << 12);
+}
+
 #[bench]
 fn factorial_100(b: &mut Bencher) {
    b.iter(|| {
--- a/benches/shootout-pidigits.rs
+++ b/benches/shootout-pidigits.rs
@ -93,7 +93,7 @@ fn pidigits(n: isize, out: &mut io::Write) -> io::Result<()> {
    let mut k = 0;
    let mut context = Context::new();

-    for i in (1..(n+1)) {
+    for i in 1..(n+1) {
        let mut d;
        loop {
            k += 1;
@ -110,7 +110,7 @@ fn pidigits(n: isize, out: &mut io::Write) -> io::Result<()> {

    let m = n % 10;
    if m != 0 {
-        for _ in (m..10) { try!(write!(out, " ")); }
+        for _ in m..10 { try!(write!(out, " ")); }
        try!(write!(out, "\t:{}\n", n));
    }
    Ok(())
--- a/src/bigint.rs
+++ b/src/bigint.rs
@ -116,6 +116,62 @@ pub mod big_digit {
    }
 }

+/*
+ * Generic functions for add/subtract/multiply with carry/borrow:
+ */
+
+// Add with carry:
+#[inline]
+fn adc(a: BigDigit, b: BigDigit, carry: &mut BigDigit) -> BigDigit {
+    let (hi, lo) = big_digit::from_doublebigdigit(
+        (a as DoubleBigDigit) +
+        (b as DoubleBigDigit) +
+        (*carry as DoubleBigDigit));
+
+    *carry = hi;
+    lo
+}
+
+// Subtract with borrow:
+#[inline]
+fn sbb(a: BigDigit, b: BigDigit, borrow: &mut BigDigit) -> BigDigit {
+    let (hi, lo) = big_digit::from_doublebigdigit(
+        big_digit::BASE
+        + (a as DoubleBigDigit)
+        - (b as DoubleBigDigit)
+        - (*borrow as DoubleBigDigit));
+    /*
+       hi * (base) + lo == 1*(base) + ai - bi - borrow
+       => ai - bi - borrow < 0 <=> hi == 0
+       */
+    *borrow = if hi == 0 { 1 } else { 0 };
+    lo
+}
+
+#[inline]
+fn mac_with_carry(a: BigDigit, b: BigDigit, c: BigDigit, carry: &mut BigDigit) -> BigDigit {
+    let (hi, lo) = big_digit::from_doublebigdigit(
+        (a as DoubleBigDigit) +
+        (b as DoubleBigDigit) * (c as DoubleBigDigit) +
+        (*carry as DoubleBigDigit));
+    *carry = hi;
+    lo
+}
+
+/// Divide a two digit numerator by a one digit divisor, returns quotient and remainder:
+///
+/// Note: the caller must ensure that both the quotient and remainder will fit into a single digit.
+/// This is _not_ true for an arbitrary numerator/denominator.
+///
+/// (This function also matches what the x86 divide instruction does).
+#[inline]
+fn div_wide(hi: BigDigit, lo: BigDigit, divisor: BigDigit) -> (BigDigit, BigDigit) {
+    debug_assert!(hi < divisor);
+
+    let lhs = big_digit::to_doublebigdigit(hi, lo);
+    let rhs = divisor as DoubleBigDigit;
+    ((lhs / rhs) as BigDigit, (lhs % rhs) as BigDigit)
+}
 /// A big unsigned integer type.
 ///
 /// A `BigUint`-typed value `BigUint { data: vec!(a, b, c) }` represents a number
@ -140,18 +196,25 @@ impl PartialOrd for BigUint {
    }
 }

+fn cmp_slice(a: &[BigDigit], b: &[BigDigit]) -> Ordering {
+    debug_assert!(a.last() != Some(&0));
+    debug_assert!(b.last() != Some(&0));
+
+    let (a_len, b_len) = (a.len(), b.len());
+    if a_len < b_len { return Less; }
+    if a_len > b_len { return Greater;  }
+
+    for (&ai, &bi) in a.iter().rev().zip(b.iter().rev()) {
+        if ai < bi { return Less; }
+        if ai > bi { return Greater; }
+    }
+    return Equal;
+}
+
 impl Ord for BigUint {
    #[inline]
    fn cmp(&self, other: &BigUint) -> Ordering {
-        let (s_len, o_len) = (self.data.len(), other.data.len());
-        if s_len < o_len { return Less; }
-        if s_len > o_len { return Greater;  }
-
-        for (&self_i, &other_i) in self.data.iter().rev().zip(other.data.iter().rev()) {
-            if self_i < other_i { return Less; }
-            if self_i > other_i { return Greater; }
-        }
-        return Equal;
+        cmp_slice(&self.data[..], &other.data[..])
    }
 }

@ -467,146 +530,323 @@ impl Unsigned for BigUint {}

 forward_all_binop_to_val_ref_commutative!(impl Add for BigUint, add);

+// Only for the Add impl:
+#[must_use]
+#[inline]
+fn __add2(a: &mut [BigDigit], b: &[BigDigit]) -> BigDigit {
+    let mut b_iter = b.iter();
+    let mut carry = 0;
+
+    for ai in a.iter_mut() {
+        if let Some(bi) = b_iter.next() {
+            *ai = adc(*ai, *bi, &mut carry);
+        } else if carry != 0 {
+            *ai = adc(*ai, 0, &mut carry);
+        } else {
+            break;
+        }
+    }
+
+    debug_assert!(b_iter.next() == None);
+    carry
+}
+
+/// /Two argument addition of raw slices:
+/// a += b
+///
+/// The caller _must_ ensure that a is big enough to store the result - typically this means
+/// resizing a to max(a.len(), b.len()) + 1, to fit a possible carry.
+fn add2(a: &mut [BigDigit], b: &[BigDigit]) {
+    let carry = __add2(a, b);
+
+    debug_assert!(carry == 0);
+}
+
+/*
+ * We'd really prefer to avoid using add2/sub2 directly as much as possible - since they make the
+ * caller entirely responsible for ensuring a's vector is big enough, and that the result is
+ * normalized, they're rather error prone and verbose:
+ *
+ * We could implement the Add and Sub traits for BigUint + BigDigit slices, like below - this works
+ * great, except that then it becomes the module's public interface, which we probably don't want:
+ *
+ * I'm keeping the code commented out, because I think this is worth revisiting:
+
+impl<'a> Add<&'a [BigDigit]> for BigUint {
+    type Output = BigUint;
+
+    fn add(mut self, other: &[BigDigit]) -> BigUint {
+        if self.data.len() < other.len() {
+            let extra = other.len() - self.data.len();
+            self.data.extend(repeat(0).take(extra));
+        }
+
+        let carry = __add2(&mut self.data[..], other);
+        if carry != 0 {
+            self.data.push(carry);
+        }
+
+        self
+    }
+}
+ */
+
 impl<'a> Add<&'a BigUint> for BigUint {
    type Output = BigUint;

-    fn add(self, other: &BigUint) -> BigUint {
-        let mut sum = self.data;
-
-        if other.data.len() > sum.len() {
-            let additional = other.data.len() - sum.len();
-            sum.reserve(additional);
-            sum.extend(repeat(ZERO_BIG_DIGIT).take(additional));
-        }
-        let other_iter = other.data.iter().cloned().chain(repeat(ZERO_BIG_DIGIT));
-
-        let mut carry = 0;
-        for (a, b) in sum.iter_mut().zip(other_iter) {
-            let d = (*a as DoubleBigDigit)
-                + (b as DoubleBigDigit)
-                + (carry as DoubleBigDigit);
-            let (hi, lo) = big_digit::from_doublebigdigit(d);
-            carry = hi;
-            *a = lo;
+    fn add(mut self, other: &BigUint) -> BigUint {
+        if self.data.len() < other.data.len() {
+            let extra = other.data.len() - self.data.len();
+            self.data.extend(repeat(0).take(extra));
        }

-        if carry != 0 { sum.push(carry); }
-        BigUint::new(sum)
+        let carry = __add2(&mut self.data[..], &other.data[..]);
+        if carry != 0 {
+            self.data.push(carry);
+        }
+
+        self
    }
 }

 forward_all_binop_to_val_ref!(impl Sub for BigUint, sub);

+fn sub2(a: &mut [BigDigit], b: &[BigDigit]) {
+    let mut b_iter = b.iter();
+    let mut borrow = 0;
+
+    for ai in a.iter_mut() {
+        if let Some(bi) = b_iter.next() {
+            *ai = sbb(*ai, *bi, &mut borrow);
+        } else if borrow != 0 {
+            *ai = sbb(*ai, 0, &mut borrow);
+        } else {
+            break;
+        }
+    }
+
+    /* note: we're _required_ to fail on underflow */
+    assert!(borrow == 0 && b_iter.all(|x| *x == 0),
+            "Cannot subtract b from a because b is larger than a.");
+}
+
 impl<'a> Sub<&'a BigUint> for BigUint {
    type Output = BigUint;

-    fn sub(self, other: &BigUint) -> BigUint {
-        let mut diff = self.data;
-        let other = &other.data;
-        assert!(diff.len() >= other.len(), "arithmetic operation overflowed");
+    fn sub(mut self, other: &BigUint) -> BigUint {
+        sub2(&mut self.data[..], &other.data[..]);
+        self.normalize()
+    }
+}
+
+fn sub_sign(a: &[BigDigit], b: &[BigDigit]) -> BigInt {
+    // Normalize:
+    let a = &a[..a.iter().rposition(|&x| x != 0).map_or(0, |i| i + 1)];
+    let b = &b[..b.iter().rposition(|&x| x != 0).map_or(0, |i| i + 1)];
+
+    match cmp_slice(a, b) {
+        Greater => {
+            let mut ret = BigUint::from_slice(a);
+            sub2(&mut ret.data[..], b);
+            BigInt::from_biguint(Plus, ret.normalize())
+        },
+        Less    => {
+            let mut ret = BigUint::from_slice(b);
+            sub2(&mut ret.data[..], a);
+            BigInt::from_biguint(Minus, ret.normalize())
+        },
+        _       => Zero::zero(),
+    }
+}
+
+forward_all_binop_to_ref_ref!(impl Mul for BigUint, mul);
+
+/// Three argument multiply accumulate:
+/// acc += b * c
+fn mac_digit(acc: &mut [BigDigit], b: &[BigDigit], c: BigDigit) {
+    if c == 0 { return; }
+
+    let mut b_iter = b.iter();
+    let mut carry = 0;
+
+    for ai in acc.iter_mut() {
+        if let Some(bi) = b_iter.next() {
+            *ai = mac_with_carry(*ai, *bi, c, &mut carry);
+        } else if carry != 0 {
+            *ai = mac_with_carry(*ai, 0, c, &mut carry);
+        } else {
+            break;
+        }
+    }
+
+    assert!(carry == 0);
+}
+
+/// Three argument multiply accumulate:
+/// acc += b * c
+fn mac3(acc: &mut [BigDigit], b: &[BigDigit], c: &[BigDigit]) {
+    let (x, y) = if b.len() < c.len() { (b, c) } else { (c, b) };

-        let mut borrow: DoubleBigDigit = 0;
-        for (a, &b) in diff.iter_mut().zip(other.iter()) {
-            let d = big_digit::BASE - borrow
-                + (*a as DoubleBigDigit)
-                - (b as DoubleBigDigit);
-            let (hi, lo) = big_digit::from_doublebigdigit(d);
    /*
-            hi * (base) + lo == 1*(base) + ai - bi - borrow
-            => ai - bi - borrow < 0 <=> hi == 0
+     * Karatsuba multiplication is slower than long multiplication for small x and y:
     */
-            borrow = if hi == 0 { 1 } else { 0 };
-            *a = lo;
+    if x.len() <= 4 {
+        for (i, xi) in x.iter().enumerate() {
+            mac_digit(&mut acc[i..], y, *xi);
        }
+    } else {
+        /*
+         * Karatsuba multiplication:
+         *
+         * The idea is that we break x and y up into two smaller numbers that each have about half
+         * as many digits, like so (note that multiplying by b is just a shift):
+         *
+         * x = x0 + x1 * b
+         * y = y0 + y1 * b
+         *
+         * With some algebra, we can compute x * y with three smaller products, where the inputs to
+         * each of the smaller products have only about half as many digits as x and y:
+         *
+         * x * y = (x0 + x1 * b) * (y0 + y1 * b)
+         *
+         * x * y = x0 * y0
+         *       + x0 * y1 * b
+         *       + x1 * y0 * b
+         *       + x1 * y1 * b^2
+         *
+         * Let p0 = x0 * y0 and p2 = x1 * y1:
+         *
+         * x * y = p0
+         *       + (x0 * y1 + x1 * p0) * b
+         *       + p2 * b^2
+         *
+         * The real trick is that middle term:
+         *
+         *         x0 * y1 + x1 * y0
+         *
+         *       = x0 * y1 + x1 * y0 - p0 + p0 - p2 + p2
+         *
+         *       = x0 * y1 + x1 * y0 - x0 * y0 - x1 * y1 + p0 + p2
+         *
+         * Now we complete the square:
+         *
+         *       = -(x0 * y0 - x0 * y1 - x1 * y0 + x1 * y1) + p0 + p2
+         *
+         *       = -((x1 - x0) * (y1 - y0)) + p0 + p2
+         *
+         * Let p1 = (x1 - x0) * (y1 - y0), and substitute back into our original formula:
+         *
+         * x * y = p0
+         *       + (p0 + p2 - p1) * b
+         *       + p2 * b^2
+         *
+         * Where the three intermediate products are:
+         *
+         * p0 = x0 * y0
+         * p1 = (x1 - x0) * (y1 - y0)
+         * p2 = x1 * y1
+         *
+         * In doing the computation, we take great care to avoid unnecessary temporary variables
+         * (since creating a BigUint requires a heap allocation): thus, we rearrange the formula a
+         * bit so we can use the same temporary variable for all the intermediate products:
+         *
+         * x * y = p2 * b^2 + p2 * b
+         *       + p0 * b + p0
+         *       - p1 * b
+         *
+         * The other trick we use is instead of doing explicit shifts, we slice acc at the
+         * appropriate offset when doing the add.
+         */

-        for a in &mut diff[other.len()..] {
-            if borrow == 0 { break }
-            let d = big_digit::BASE - borrow
-                + (*a as DoubleBigDigit);
-            let (hi, lo) = big_digit::from_doublebigdigit(d);
-            borrow = if hi == 0 { 1 } else { 0 };
-            *a = lo;
+        /*
+         * When x is smaller than y, it's significantly faster to pick b such that x is split in
+         * half, not y:
+         */
+        let b = x.len() / 2;
+        let (x0, x1) = x.split_at(b);
+        let (y0, y1) = y.split_at(b);
+
+        /* We reuse the same BigUint for all the intermediate multiplies: */
+
+        let len = y.len() + 1;
+        let mut p: BigUint = BigUint { data: Vec::with_capacity(len) };
+        p.data.extend(repeat(0).take(len));
+
+        // p2 = x1 * y1
+        mac3(&mut p.data[..], x1, y1);
+
+        // Not required, but the adds go faster if we drop any unneeded 0s from the end:
+        p = p.normalize();
+
+        add2(&mut acc[b..],        &p.data[..]);
+        add2(&mut acc[b * 2..],    &p.data[..]);
+
+        // Zero out p before the next multiply:
+        p.data.truncate(0);
+        p.data.extend(repeat(0).take(len));
+
+        // p0 = x0 * y0
+        mac3(&mut p.data[..], x0, y0);
+        p = p.normalize();
+
+        add2(&mut acc[..],                &p.data[..]);
+        add2(&mut acc[b..],        &p.data[..]);
+
+        // p1 = (x1 - x0) * (y1 - y0)
+        // We do this one last, since it may be negative and acc can't ever be negative:
+        let j0 = sub_sign(x1, x0);
+        let j1 = sub_sign(y1, y0);
+
+        match j0.sign * j1.sign {
+            Plus    => {
+                p.data.truncate(0);
+                p.data.extend(repeat(0).take(len));
+
+                mac3(&mut p.data[..], &j0.data.data[..], &j1.data.data[..]);
+                p = p.normalize();
+
+                sub2(&mut acc[b..], &p.data[..]);
+            },
+            Minus   => {
+                mac3(&mut acc[b..], &j0.data.data[..], &j1.data.data[..]);
+            },
+            NoSign  => (),
        }
-
-        assert!(borrow == 0, "arithmetic operation overflowed");
-        BigUint::new(diff)
    }
 }

+fn mul3(x: &[BigDigit], y: &[BigDigit]) -> BigUint {
+    let len = x.len() + y.len() + 1;
+    let mut prod: BigUint = BigUint { data: Vec::with_capacity(len) };

-forward_all_binop_to_val_ref_commutative!(impl Mul for BigUint, mul);
+    // resize isn't stable yet:
+    //prod.data.resize(len, 0);
+    prod.data.extend(repeat(0).take(len));

-impl<'a> Mul<&'a BigUint> for BigUint {
+    mac3(&mut prod.data[..], x, y);
+    prod.normalize()
+}
+
+impl<'a, 'b> Mul<&'b BigUint> for &'a BigUint {
    type Output = BigUint;

+    #[inline]
    fn mul(self, other: &BigUint) -> BigUint {
-        if self.is_zero() || other.is_zero() { return Zero::zero(); }
-
-        let (s_len, o_len) = (self.data.len(), other.data.len());
-        if s_len == 1 { return mul_digit(other.clone(), self.data[0]);  }
-        if o_len == 1 { return mul_digit(self,  other.data[0]); }
-
-        // Using Karatsuba multiplication
-        // (a1 * base + a0) * (b1 * base + b0)
-        // = a1*b1 * base^2 +
-        //   (a1*b1 + a0*b0 - (a1-b0)*(b1-a0)) * base +
-        //   a0*b0
-        let half_len = cmp::max(s_len, o_len) / 2;
-        let (s_hi, s_lo) = cut_at(self,  half_len);
-        let (o_hi, o_lo) = cut_at(other.clone(), half_len);
-
-        let ll = &s_lo * &o_lo;
-        let hh = &s_hi * &o_hi;
-        let mm = {
-            let (s1, n1) = sub_sign(s_hi, s_lo);
-            let (s2, n2) = sub_sign(o_hi, o_lo);
-            match (s1, s2) {
-                (Equal, _) | (_, Equal) => &hh + &ll,
-                (Less, Greater) | (Greater, Less) => &hh + &ll + (n1 * n2),
-                (Less, Less) | (Greater, Greater) => &hh + &ll - (n1 * n2)
-            }
-        };
-
-        return ll + mm.shl_unit(half_len) + hh.shl_unit(half_len * 2);
-
-
-        fn mul_digit(a: BigUint, n: BigDigit) -> BigUint {
-            if n == 0 { return Zero::zero(); }
-            if n == 1 { return a; }
-
-            let mut carry = 0;
-            let mut prod = a.data;
-            for a in &mut prod {
-                let d = (*a as DoubleBigDigit)
-                    * (n as DoubleBigDigit)
-                    + (carry as DoubleBigDigit);
-                let (hi, lo) = big_digit::from_doublebigdigit(d);
-                carry = hi;
-                *a = lo;
-            }
-            if carry != 0 { prod.push(carry); }
-            BigUint::new(prod)
-        }
-
-        #[inline]
-        fn cut_at(mut a: BigUint, n: usize) -> (BigUint, BigUint) {
-            let mid = cmp::min(a.data.len(), n);
-            let hi = BigUint::from_slice(&a.data[mid ..]);
-            a.data.truncate(mid);
-            (hi, BigUint::new(a.data))
-        }
-
-        #[inline]
-        fn sub_sign(a: BigUint, b: BigUint) -> (Ordering, BigUint) {
-            match a.cmp(&b) {
-                Less    => (Less,    b - a),
-                Greater => (Greater, a - b),
-                _       => (Equal,   Zero::zero())
-            }
-        }
+        mul3(&self.data[..], &other.data[..])
    }
 }

+fn div_rem_digit(mut a: BigUint, b: BigDigit) -> (BigUint, BigDigit) {
+    let mut rem = 0;
+
+    for d in a.data.iter_mut().rev() {
+        let (q, r) = div_wide(rem, *d, b);
+        *d = q;
+        rem = r;
+    }
+
+    (a.normalize(), rem)
+}

 forward_all_binop_to_ref_ref!(impl Div for BigUint, div);

@ -703,83 +943,94 @@ impl Integer for BigUint {
        if self.is_zero() { return (Zero::zero(), Zero::zero()); }
        if *other == One::one() { return (self.clone(), Zero::zero()); }

+        /* Required or the q_len calculation below can underflow: */
        match self.cmp(other) {
            Less    => return (Zero::zero(), self.clone()),
            Equal   => return (One::one(), Zero::zero()),
            Greater => {} // Do nothing
        }

-        let mut shift = 0;
-        let mut n = *other.data.last().unwrap();
-        while n < (1 << big_digit::BITS - 2) {
-            n <<= 1;
-            shift += 1;
-        }
-        assert!(shift < big_digit::BITS);
-        let (d, m) = div_mod_floor_inner(self << shift, other << shift);
-        return (d, m >> shift);
+        /*
+         * This algorithm is from Knuth, TAOCP vol 2 section 4.3, algorithm D:
+         *
+         * First, normalize the arguments so the highest bit in the highest digit of the divisor is
+         * set: the main loop uses the highest digit of the divisor for generating guesses, so we
+         * want it to be the largest number we can efficiently divide by.
+         */
+        let shift = other.data.last().unwrap().leading_zeros() as usize;
+        let mut a = self << shift;
+        let b     = other << shift;

+        /*
+         * The algorithm works by incrementally calculating "guesses", q0, for part of the
+         * remainder. Once we have any number q0 such that q0 * b <= a, we can set
+         *
+         *     q += q0
+         *     a -= q0 * b
+         *
+         * and then iterate until a < b. Then, (q, a) will be our desired quotient and remainder.
+         *
+         * q0, our guess, is calculated by dividing the last few digits of a by the last digit of b
+         * - this should give us a guess that is "close" to the actual quotient, but is possibly
+         * greater than the actual quotient. If q0 * b > a, we simply use iterated subtraction
+         * until we have a guess such that q0 & b <= a.
+         */

-        fn div_mod_floor_inner(a: BigUint, b: BigUint) -> (BigUint, BigUint) {
-            let mut m = a;
-            let mut d: BigUint = Zero::zero();
-            let mut n = 1;
-            while m >= b {
-                let (d0, d_unit, b_unit) = div_estimate(&m, &b, n);
-                let mut d0 = d0;
-                let mut prod = &b * &d0;
-                while prod > m {
-                    // FIXME(#5992): assignment operator overloads
-                    // d0 -= &d_unit
-                    d0   = d0 - &d_unit;
-                    // FIXME(#5992): assignment operator overloads
-                    // prod -= &b_unit;
-                    prod = prod - &b_unit
-                }
-                if d0.is_zero() {
-                    n = 2;
+        let bn = *b.data.last().unwrap();
+        let q_len = a.data.len() - b.data.len() + 1;
+        let mut q: BigUint = BigUint { data: Vec::with_capacity(q_len) };
+
+        q.data.extend(repeat(0).take(q_len));
+
+        /*
+         * We reuse the same temporary to avoid hitting the allocator in our inner loop - this is
+         * sized to hold a0 (in the common case; if a particular digit of the quotient is zero a0
+         * can be bigger).
+         */
+        let mut tmp: BigUint = BigUint { data: Vec::with_capacity(2) };
+
+        for j in (0..q_len).rev() {
+            /*
+             * When calculating our next guess q0, we don't need to consider the digits below j
+             * + b.data.len() - 1: we're guessing digit j of the quotient (i.e. q0 << j) from
+             * digit bn of the divisor (i.e. bn << (b.data.len() - 1) - so the product of those
+             * two numbers will be zero in all digits up to (j + b.data.len() - 1).
+             */
+            let offset = j + b.data.len() - 1;
+            if offset >= a.data.len() {
                continue;
            }
-                n = 1;
-                // FIXME(#5992): assignment operator overloads
-                // d += d0;
-                d = d + d0;
-                // FIXME(#5992): assignment operator overloads
-                // m -= prod;
-                m = m - prod;
-            }
-            return (d, m);
-        }

+            /* just avoiding a heap allocation: */
+            let mut a0 = tmp;
+            a0.data.truncate(0);
+            a0.data.extend(a.data[offset..].iter().cloned());

-        fn div_estimate(a: &BigUint, b: &BigUint, n: usize)
-                        -> (BigUint, BigUint, BigUint) {
-                            if a.data.len() < n {
-                                return (Zero::zero(), Zero::zero(), (*a).clone());
-                            }
+            /*
+             * q0 << j * big_digit::BITS is our actual quotient estimate - we do the shifts
+             * implicitly at the end, when adding and subtracting to a and q. Not only do we
+             * save the cost of the shifts, the rest of the arithmetic gets to work with
+             * smaller numbers.
+             */
+            let (mut q0, _) = div_rem_digit(a0, bn);
+            let mut prod = &b * &q0;

-                            let an = &a.data[a.data.len() - n ..];
-                            let bn = *b.data.last().unwrap();
-                            let mut d = Vec::with_capacity(an.len());
-                            let mut carry = 0;
-                            for elt in an.iter().rev() {
-                                let ai = big_digit::to_doublebigdigit(carry, *elt);
-                                let di = ai / (bn as DoubleBigDigit);
-                                assert!(di < big_digit::BASE);
-                                carry = (ai % (bn as DoubleBigDigit)) as BigDigit;
-                                d.push(di as BigDigit)
-                            }
-                            d.reverse();
-
-                            let shift = (a.data.len() - an.len()) - (b.data.len() - 1);
-                            if shift == 0 {
-                                return (BigUint::new(d), One::one(), (*b).clone());
-                            }
+            while cmp_slice(&prod.data[..], &a.data[j..]) == Greater {
                let one: BigUint = One::one();
-                            return (BigUint::new(d).shl_unit(shift),
-                                    one.shl_unit(shift),
-                                    b.shl_unit(shift));
+                q0 = q0 - one;
+                prod = prod - &b;
            }
+
+            add2(&mut q.data[j..], &q0.data[..]);
+            sub2(&mut a.data[j..], &prod.data[..]);
+            a = a.normalize();
+
+            tmp = q0;
+        }
+
+        debug_assert!(a < b);
+
+        (q.normalize(), a >> shift)
    }

    /// Calculates the Greatest Common Divisor (GCD) of the number and `other`.
@ -932,43 +1183,18 @@ fn to_str_radix_reversed(u: &BigUint, radix: u32) -> Vec<u8> {
        vec![b'0']
    } else {
        let mut res = Vec::new();
-        let mut digits = u.data.to_vec();
+        let mut digits = u.clone();

-        while !digits.is_empty() {
-            let rem = div_rem_in_place(&mut digits, radix);
-            res.push(to_digit(rem as u8));
-
-            // If we finished the most significant digit, drop it
-            if let Some(&0) = digits.last() {
-                digits.pop();
-            }
+        while digits != Zero::zero() {
+            let (q, r) = div_rem_digit(digits, radix as BigDigit);
+            res.push(to_digit(r as u8));
+            digits = q;
        }

        res
    }
 }

-fn div_rem_in_place(digits: &mut [BigDigit], divisor: BigDigit) -> BigDigit {
-    let mut rem = 0;
-
-    for d in digits.iter_mut().rev() {
-        let (q, r) = full_div_rem(*d, divisor, rem);
-        *d = q;
-        rem = r;
-    }
-
-    rem
-}
-
-fn full_div_rem(a: BigDigit, b: BigDigit, borrow: BigDigit) -> (BigDigit, BigDigit) {
-    let lo = a as DoubleBigDigit;
-    let hi = borrow as DoubleBigDigit;
-
-    let lhs = lo | (hi << big_digit::BITS);
-    let rhs = b as DoubleBigDigit;
-    ((lhs / rhs) as BigDigit, (lhs % rhs) as BigDigit)
-}
-
 fn to_digit(b: u8) -> u8 {
    match b {
        0 ... 9 => b'0' + b,
@ -982,12 +1208,8 @@ impl BigUint {
    ///
    /// The digits are in little-endian base 2^32.
    #[inline]
-    pub fn new(mut digits: Vec<BigDigit>) -> BigUint {
-        // omit trailing zeros
-        while let Some(&0) = digits.last() {
-            digits.pop();
-        }
-        BigUint { data: digits }
+    pub fn new(digits: Vec<BigDigit>) -> BigUint {
+        BigUint { data: digits }.normalize()
    }

    /// Creates and initializes a `BigUint`.
@ -1176,6 +1398,16 @@ impl BigUint {
        let zeros = self.data.last().unwrap().leading_zeros();
        return self.data.len()*big_digit::BITS - zeros as usize;
    }
+
+    /// Strips off trailing zero bigdigits - comparisons require the last element in the vector to
+    /// be nonzero.
+    #[inline]
+    fn normalize(mut self) -> BigUint {
+        while let Some(&0) = self.data.last() {
+            self.data.pop();
+        }
+        self
+    }
 }

 // `DoubleBigDigit` size dependent
@ -3050,6 +3282,52 @@ mod biguint_tests {
        // Switching u and l should fail:
        let _n: BigUint = rng.gen_biguint_range(&u, &l);
    }
+
+    #[test]
+    fn test_sub_sign() {
+        use super::sub_sign;
+        let a = BigInt::from_str_radix("265252859812191058636308480000000", 10).unwrap();
+        let b = BigInt::from_str_radix("26525285981219105863630848000000", 10).unwrap();
+
+        assert_eq!(sub_sign(&a.data.data[..], &b.data.data[..]), &a - &b);
+        assert_eq!(sub_sign(&b.data.data[..], &a.data.data[..]), &b - &a);
+    }
+
+    fn test_mul_divide_torture_count(count: usize) {
+        use rand::{SeedableRng, StdRng, Rng};
+
+        let bits_max = 1 << 12;
+        let seed: &[_] = &[1, 2, 3, 4];
+        let mut rng: StdRng = SeedableRng::from_seed(seed);
+
+        for _ in 0..count {
+            /* Test with numbers of random sizes: */
+            let xbits = rng.gen_range(0, bits_max);
+            let ybits = rng.gen_range(0, bits_max);
+
+            let x = rng.gen_biguint(xbits);
+            let y = rng.gen_biguint(ybits);
+
+            if x.is_zero() || y.is_zero() {
+                continue;
+            }
+
+            let prod = &x * &y;
+            assert_eq!(&prod / &x, y);
+            assert_eq!(&prod / &y, x);
+        }
+    }
+
+    #[test]
+    fn test_mul_divide_torture() {
+        test_mul_divide_torture_count(1000);
+    }
+
+    #[test]
+    #[ignore]
+    fn test_mul_divide_torture_long() {
+        test_mul_divide_torture_count(1000000);
+    }
 }

 #[cfg(test)]