diff --git a/benches/bigint.rs b/benches/bigint.rs index 30522d5..0eafb93 100644 --- a/benches/bigint.rs +++ b/benches/bigint.rs @@ -78,6 +78,11 @@ fn multiply_2(b: &mut Bencher) { multiply_bench(b, 1 << 16, 1 << 16); } +#[bench] +fn multiply_3(b: &mut Bencher) { + multiply_bench(b, 1 << 16, 1 << 17); +} + #[bench] fn divide_0(b: &mut Bencher) { divide_bench(b, 1 << 8, 1 << 6); diff --git a/bigint/src/algorithms.rs b/bigint/src/algorithms.rs index 3560103..d93bdd5 100644 --- a/bigint/src/algorithms.rs +++ b/bigint/src/algorithms.rs @@ -8,6 +8,7 @@ use traits::{Zero, One}; use biguint::BigUint; +use bigint::BigInt; use bigint::Sign; use bigint::Sign::{Minus, NoSign, Plus}; @@ -225,20 +226,18 @@ fn mac_digit(acc: &mut [BigDigit], b: &[BigDigit], c: BigDigit) { return; } - let mut b_iter = b.iter(); let mut carry = 0; + let (a_lo, a_hi) = acc.split_at_mut(b.len()); - for ai in acc.iter_mut() { - if let Some(bi) = b_iter.next() { - *ai = mac_with_carry(*ai, *bi, c, &mut carry); - } else if carry != 0 { - *ai = mac_with_carry(*ai, 0, c, &mut carry); - } else { - break; - } + for (a, &b) in a_lo.iter_mut().zip(b) { + *a = mac_with_carry(*a, b, c, &mut carry); } - assert!(carry == 0); + let mut a = a_hi.iter_mut(); + while carry != 0 { + let a = a.next().expect("carry overflow during multiplication!"); + *a = adc(*a, 0, &mut carry); + } } /// Three argument multiply accumulate: @@ -250,13 +249,23 @@ fn mac3(acc: &mut [BigDigit], b: &[BigDigit], c: &[BigDigit]) { (c, b) }; - // Karatsuba multiplication is slower than long multiplication for small x and y: + // We use three algorithms for different input sizes. // - if x.len() <= 4 { + // - For small inputs, long multiplication is fastest. + // - Next we use Karatsuba multiplication (Toom-2), which we have optimized + // to avoid unnecessary allocations for intermediate values. + // - For the largest inputs we use Toom-3, which better optimizes the + // number of operations, but uses more temporary allocations. + // + // The thresholds are somewhat arbitrary, chosen by evaluating the results + // of `cargo bench --bench bigint multiply`. + + if x.len() <= 32 { + // Long multiplication: for (i, xi) in x.iter().enumerate() { mac_digit(&mut acc[i..], y, *xi); } - } else { + } else if x.len() <= 256 { /* * Karatsuba multiplication: * @@ -375,6 +384,53 @@ fn mac3(acc: &mut [BigDigit], b: &[BigDigit], c: &[BigDigit]) { }, NoSign => (), } + + } else { + // Toom-3 multiplication: + // + // Toom-3 is like Karatsuba above, but dividing the inputs into three parts. + // Both are instances of Toom-Cook, using `k=3` and `k=2` respectively. + // + // FIXME: It would be nice to have comments breaking down the operations below. + + let i = y.len()/3 + 1; + + let x0_len = cmp::min(x.len(), i); + let x1_len = cmp::min(x.len() - x0_len, i); + + let y0_len = i; + let y1_len = cmp::min(y.len() - y0_len, i); + + let x0 = BigInt::from_slice(Plus, &x[..x0_len]); + let x1 = BigInt::from_slice(Plus, &x[x0_len..x0_len + x1_len]); + let x2 = BigInt::from_slice(Plus, &x[x0_len + x1_len..]); + + let y0 = BigInt::from_slice(Plus, &y[..y0_len]); + let y1 = BigInt::from_slice(Plus, &y[y0_len..y0_len + y1_len]); + let y2 = BigInt::from_slice(Plus, &y[y0_len + y1_len..]); + + let p = &x0 + &x2; + let q = &y0 + &y2; + + let p2 = &p - &x1; + let q2 = &q - &y1; + + let r0 = &x0 * &y0; + let r4 = &x2 * &y2; + let r1 = (p + x1) * (q + y1); + let r2 = &p2 * &q2; + let r3 = ((p2 + x2)*2 - x0) * ((q2 + y2)*2 - y0); + + let mut comp3: BigInt = (r3 - &r1) / 3; + let mut comp1: BigInt = (r1 - &r2) / 2; + let mut comp2: BigInt = r2 - &r0; + comp3 = (&comp2 - comp3)/2 + &r4*2; + comp2 = comp2 + &comp1 - &r4; + comp1 = comp1 - &comp3; + + let result = r0 + (comp1 << 32*i) + (comp2 << 2*32*i) + (comp3 << 3*32*i) + (r4 << 4*32*i); + let result_pos = result.to_biguint().unwrap(); + add2(&mut acc[..], &result_pos.data); } }