From 10127907f589bee5ffc5b9f433a6c6c54e337130 Mon Sep 17 00:00:00 2001 From: Nicolas Kirchner Date: Thu, 3 Aug 2017 18:30:09 +0200 Subject: [PATCH 01/10] Add a bench for bigint multiply This bench allows to see the increase of time if we double the size of one of the operands. --- benches/bigint.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/benches/bigint.rs b/benches/bigint.rs index 30522d5..0eafb93 100644 --- a/benches/bigint.rs +++ b/benches/bigint.rs @@ -78,6 +78,11 @@ fn multiply_2(b: &mut Bencher) { multiply_bench(b, 1 << 16, 1 << 16); } +#[bench] +fn multiply_3(b: &mut Bencher) { + multiply_bench(b, 1 << 16, 1 << 17); +} + #[bench] fn divide_0(b: &mut Bencher) { divide_bench(b, 1 << 8, 1 << 6); From d7554ad931290aea4ba123cfa1b699a3ad4c8b3b Mon Sep 17 00:00:00 2001 From: Nicolas Kirchner Date: Sun, 13 Aug 2017 23:32:03 +0200 Subject: [PATCH 02/10] Naive implementation of the Toom-3 algorithm The Toom-2 algorithm is bypassed for tests. --- bigint/src/algorithms.rs | 222 ++++++++++++++++++++++++++++++++++++++- bigint/src/bigint.rs | 4 +- 2 files changed, 223 insertions(+), 3 deletions(-) diff --git a/bigint/src/algorithms.rs b/bigint/src/algorithms.rs index 3560103..a5cb34d 100644 --- a/bigint/src/algorithms.rs +++ b/bigint/src/algorithms.rs @@ -8,6 +8,7 @@ use traits::{Zero, One}; use biguint::BigUint; +use bigint::BigInt; use bigint::Sign; use bigint::Sign::{Minus, NoSign, Plus}; @@ -256,7 +257,7 @@ fn mac3(acc: &mut [BigDigit], b: &[BigDigit], c: &[BigDigit]) { for (i, xi) in x.iter().enumerate() { mac_digit(&mut acc[i..], y, *xi); } - } else { + } else if y.len() <= 0 { /* * Karatsuba multiplication: * @@ -375,6 +376,225 @@ fn mac3(acc: &mut [BigDigit], b: &[BigDigit], c: &[BigDigit]) { }, NoSign => (), } + + } else { + let i = y.len()/3 + 1; + + let x0_len = cmp::min(x.len(), i); + let x1_len = cmp::min(x.len() - x0_len, i); + + let y0_len = i; + let y1_len = cmp::min(y.len() - y0_len, i); + + let x0 = BigInt::from_slice(Plus, &x[..x0_len]); + let x1 = BigInt::from_slice(Plus, &x[x0_len..x0_len + x1_len]); + let x2 = BigInt::from_slice(Plus, &x[x0_len + x1_len..]); + + let y0 = BigInt::from_slice(Plus, &y[..y0_len]); + let y1 = BigInt::from_slice(Plus, &y[y0_len..y0_len + y1_len]); + let y2 = BigInt::from_slice(Plus, &y[y0_len + y1_len..]); + + let r0 = &x0 * &y0; + let r1 = (&x0 + &x1 + &x2) * (&y0 + &y1 + &y2); + let r2 = (&x0 - &x1 + &x2) * (&y0 - &y1 + &y2); + let r3 = (&x0 - &x1*2 + &x2*4) * (&y0 - &y1*2 + &y2*4); + let r4 = &x2 * &y2; + + let comp0: BigInt = r0.clone(); + let comp4: BigInt = r4.clone(); + let mut comp3: BigInt = (&r3 - &r1) / 3; + let mut comp1: BigInt = (&r1 - &r2) / 2; + let mut comp2: BigInt = &r2 - &r0; + comp3 = (&comp2 - &comp3)/2 + &r4*2; + comp2 = comp2 + &comp1 - &comp4; + comp1 = comp1 - &comp3; + + let result = comp0 + (comp1 << 32*i) + (comp2 << 2*32*i) + (comp3 << 3*32*i) + (comp4 << 4*32*i); + assert!(result.sign != Minus); + add2(&mut acc[..], &result.data.data); + + // add2(&mut acc[..], &comp0.data.data); + // add2(&mut acc[4*i..], &comp4.data.data); + // + // match (comp1.sign, comp2.sign, comp3.sign) { + // (Minus, Minus, Minus) => { + // for _ in 0..i { + // comp1.data.data.insert(0, 0); + // } + // sub2(&mut acc[..], &comp1.data.data); + // + // for _ in 0..2*i { + // comp2.data.data.insert(0, 0); + // } + // sub2(&mut acc[..], &comp2.data.data); + // + // for _ in 0..3*i { + // comp3.data.data.insert(0, 0); + // } + // sub2(&mut acc[..], &comp3.data.data); + // }, + // + // (Minus, Minus, _) => { + // add2(&mut acc[3*i..], &comp3.data.data); + // for _ in 0..i { + // comp1.data.data.insert(0, 0); + // } + // sub2(&mut acc[..], &comp1.data.data); + // for _ in 0..2*i { + // comp2.data.data.insert(0, 0); + // } + // sub2(&mut acc[..], &comp2.data.data); + // }, + // + // (Minus, _, Minus) => { + // add2(&mut acc[2*i..], &comp2.data.data); + // for _ in 0..i { + // comp1.data.data.insert(0, 0); + // } + // sub2(&mut acc[..], &comp1.data.data); + // for _ in 0..3*i { + // comp3.data.data.insert(0, 0); + // } + // sub2(&mut acc[..], &comp3.data.data); + // }, + // + // (_, Minus, Minus) => { + // add2(&mut acc[i..], &comp1.data.data); + // for _ in 0..2*i { + // comp2.data.data.insert(0, 0); + // } + // sub2(&mut acc[..], &comp2.data.data); + // for _ in 0..3*i { + // comp3.data.data.insert(0, 0); + // } + // sub2(&mut acc[..], &comp3.data.data); + // }, + // + // (Minus, _, _) => { + // add2(&mut acc[2*i..], &comp2.data.data); + // add2(&mut acc[3*i..], &comp3.data.data); + // for _ in 0..i { + // comp1.data.data.insert(0, 0); + // } + // sub2(&mut acc[..], &comp1.data.data); + // }, + // + // (_, Minus, _) => { + // add2(&mut acc[i..], &comp1.data.data); + // add2(&mut acc[3*i..], &comp3.data.data); + // for _ in 0..2*i { + // comp2.data.data.insert(0, 0); + // } + // sub2(&mut acc[..], &comp2.data.data); + // }, + // + // (_, _, Minus) => { + // add2(&mut acc[i..], &comp1.data.data); + // add2(&mut acc[2*i..], &comp2.data.data); + // for _ in 0..3*i { + // comp3.data.data.insert(0, 0); + // } + // sub2(&mut acc[..], &comp3.data.data); + // }, + // + // _ => { + // add2(&mut acc[i..], &comp1.data.data); + // add2(&mut acc[2*i..], &comp2.data.data); + // add2(&mut acc[3*i..], &comp3.data.data); + // } + // } + + // let mut r0 = BigUint{ data: vec![0; 2*i + 1] }; + // mac3(&mut r0.data, &x0, &y0); + // r0.normalize(); + // + // let mut r1 = BigUint{ data: vec![0; 2*(i+2) + 1] }; + // p.data.assign_from_slice(&x0); + // p.data.data.extend_from_slice(&[0, 0]); + // add2(&mut p.data.data, &x1); + // add2(&mut p.data.data, &x2); + // q.data.assign_from_slice(&y0); + // q.data.data.extend_from_slice(&[0, 0]); + // add2(&mut q.data.data, &y1); + // add2(&mut q.data.data, &y2); + // mac3(&mut r1.data, &p.data.data, &q.data.data); + // r1.normalize(); + // + // let mut r2 = BigUint{ data: vec![0; 2*(i+1) + 1] }; + // p.data.assign_from_slice(&x0); + // p.data.data.extend_from_slice(&[0, 0]); + // add2(&mut p.data, &x2); + // p = p - BigInt::from_biguint(Plus, BigUint::from_slice(&x1)); + // q.data.assign_from_slice(&y0); + // q.data.data.extend_from_slice(&[0, 0]); + // add2(&mut q.data, &y2); + // q = q - BigInt::from_biguint(Plus, BigUint::from_slice(&y1)); + // mac3(&mut r2.data, &p.data, &q.data); + // r2.normalize(); + // let r2_sign = p_sign * q_sign; + // + // let mut r3 = BigUint{ data: vec![0; 2*(i+2) + 1] }; + // p.assign_from_slice(&x0); + // p.data.extend_from_slice(&[0, 0]); + // let mut temp = BigUint { data: Vec::with_capacity(i + 1) }; + // temp.assign_from_slice(&x2); + // temp = temp * 4_u32; + // add2(&mut p.data, &temp.data); + // temp.assign_from_slice(&x1); + // temp = temp * 2_u32; + // if p.data[i] == 0 && p.data[..i].iter().rev().lt(temp.data.iter().rev()) { + // p_sign = Minus; + // let p_temp = p.clone(); + // p.assign_from_slice(&temp.data); + // sub2(&mut p.data, &p_temp.data); + // } else { + // p_sign = Plus; + // sub2(&mut p.data, &temp.data); + // } + // q.assign_from_slice(&y0); + // q.data.extend_from_slice(&[0, 0]); + // temp.assign_from_slice(&y2); + // temp = temp * 4_u32; + // add2(&mut q.data, &temp.data); + // temp.assign_from_slice(&y1); + // temp = temp * 2_u32; + // if q.data[i] == 0 && q.data[..i].iter().rev().lt(temp.data.iter().rev()) { + // q_sign = Minus; + // let q_temp = q.clone(); + // q.assign_from_slice(&temp.data); + // sub2(&mut q.data, &q_temp.data); + // } else { + // q_sign = Plus; + // sub2(&mut q.data, &temp.data); + // } + // mac3(&mut r3.data, &p.data, &q.data); + // r3.normalize(); + // let r3_sign = p_sign * q_sign; + // + // let mut r4 = BigUint{ data: vec![0; 2*i + 1] }; + // mac3(&mut r4.data, &x2, &y2); + // r4.normalize(); + // + // let signed_r0 = BigInt::from_biguint(Plus, r0); + // let signed_r1 = BigInt::from_biguint(Plus, r1); + // let signed_r2 = BigInt::from_biguint(r2_sign, r2); + // let signed_r3 = BigInt::from_biguint(r3_sign, r3); + // let signed_r4 = BigInt::from_biguint(Plus, r4); + // + // let comp0 = signed_r0.clone(); + // let comp4 = signed_r4.clone(); + // let mut comp3: BigInt = (&signed_r3 - &signed_r1) / 3; + // let mut comp1: BigInt = (&signed_r1 - &signed_r2) / 2; + // let mut comp2 = &signed_r2 - &signed_r0; + // comp3 = ((&comp2 - &comp3) / 2) + (&signed_r4 * 2); + // comp2 = comp2 + &comp1 - &comp4; + // comp1 = comp1 - &comp3; + // + // add2(&mut acc[..], &comp0.data.data); + // add2(&mut acc[i..], &comp1.data.data); + // add2(&mut acc[2*i..], &comp2.data.data); + // add2(&mut acc[3*i..], &comp3.data.data); + // add2(&mut acc[4*i..], &comp4.data.data); } } diff --git a/bigint/src/bigint.rs b/bigint/src/bigint.rs index bae78d7..f3cd4ec 100644 --- a/bigint/src/bigint.rs +++ b/bigint/src/bigint.rs @@ -105,8 +105,8 @@ impl serde::Deserialize for Sign { #[derive(Clone, Debug, Hash)] #[cfg_attr(feature = "rustc-serialize", derive(RustcEncodable, RustcDecodable))] pub struct BigInt { - sign: Sign, - data: BigUint, + pub(super) sign: Sign, + pub(super) data: BigUint, } impl PartialEq for BigInt { From c9c40b9402fdcd2ff027b2df5424753609bee1ad Mon Sep 17 00:00:00 2001 From: Nicolas Kirchner Date: Mon, 14 Aug 2017 20:07:35 +0200 Subject: [PATCH 03/10] Optimize and clean the Toom-3 algorithm and choose better thresholds --- bigint/src/algorithms.rs | 197 ++------------------------------------- 1 file changed, 6 insertions(+), 191 deletions(-) diff --git a/bigint/src/algorithms.rs b/bigint/src/algorithms.rs index a5cb34d..2082d04 100644 --- a/bigint/src/algorithms.rs +++ b/bigint/src/algorithms.rs @@ -253,11 +253,11 @@ fn mac3(acc: &mut [BigDigit], b: &[BigDigit], c: &[BigDigit]) { // Karatsuba multiplication is slower than long multiplication for small x and y: // - if x.len() <= 4 { + if x.len() <= 16 { for (i, xi) in x.iter().enumerate() { mac_digit(&mut acc[i..], y, *xi); } - } else if y.len() <= 0 { + } else if x.len() <= 300 { /* * Karatsuba multiplication: * @@ -395,206 +395,21 @@ fn mac3(acc: &mut [BigDigit], b: &[BigDigit], c: &[BigDigit]) { let y2 = BigInt::from_slice(Plus, &y[y0_len + y1_len..]); let r0 = &x0 * &y0; + let r4 = &x2 * &y2; let r1 = (&x0 + &x1 + &x2) * (&y0 + &y1 + &y2); let r2 = (&x0 - &x1 + &x2) * (&y0 - &y1 + &y2); - let r3 = (&x0 - &x1*2 + &x2*4) * (&y0 - &y1*2 + &y2*4); - let r4 = &x2 * &y2; + let r3 = (x0 - x1*2 + x2*4) * (y0 - y1*2 + y2*4); - let comp0: BigInt = r0.clone(); - let comp4: BigInt = r4.clone(); let mut comp3: BigInt = (&r3 - &r1) / 3; let mut comp1: BigInt = (&r1 - &r2) / 2; let mut comp2: BigInt = &r2 - &r0; comp3 = (&comp2 - &comp3)/2 + &r4*2; - comp2 = comp2 + &comp1 - &comp4; + comp2 = comp2 + &comp1 - &r4; comp1 = comp1 - &comp3; - let result = comp0 + (comp1 << 32*i) + (comp2 << 2*32*i) + (comp3 << 3*32*i) + (comp4 << 4*32*i); + let result = r0 + (comp1 << 32*i) + (comp2 << 2*32*i) + (comp3 << 3*32*i) + (r4 << 4*32*i); assert!(result.sign != Minus); add2(&mut acc[..], &result.data.data); - - // add2(&mut acc[..], &comp0.data.data); - // add2(&mut acc[4*i..], &comp4.data.data); - // - // match (comp1.sign, comp2.sign, comp3.sign) { - // (Minus, Minus, Minus) => { - // for _ in 0..i { - // comp1.data.data.insert(0, 0); - // } - // sub2(&mut acc[..], &comp1.data.data); - // - // for _ in 0..2*i { - // comp2.data.data.insert(0, 0); - // } - // sub2(&mut acc[..], &comp2.data.data); - // - // for _ in 0..3*i { - // comp3.data.data.insert(0, 0); - // } - // sub2(&mut acc[..], &comp3.data.data); - // }, - // - // (Minus, Minus, _) => { - // add2(&mut acc[3*i..], &comp3.data.data); - // for _ in 0..i { - // comp1.data.data.insert(0, 0); - // } - // sub2(&mut acc[..], &comp1.data.data); - // for _ in 0..2*i { - // comp2.data.data.insert(0, 0); - // } - // sub2(&mut acc[..], &comp2.data.data); - // }, - // - // (Minus, _, Minus) => { - // add2(&mut acc[2*i..], &comp2.data.data); - // for _ in 0..i { - // comp1.data.data.insert(0, 0); - // } - // sub2(&mut acc[..], &comp1.data.data); - // for _ in 0..3*i { - // comp3.data.data.insert(0, 0); - // } - // sub2(&mut acc[..], &comp3.data.data); - // }, - // - // (_, Minus, Minus) => { - // add2(&mut acc[i..], &comp1.data.data); - // for _ in 0..2*i { - // comp2.data.data.insert(0, 0); - // } - // sub2(&mut acc[..], &comp2.data.data); - // for _ in 0..3*i { - // comp3.data.data.insert(0, 0); - // } - // sub2(&mut acc[..], &comp3.data.data); - // }, - // - // (Minus, _, _) => { - // add2(&mut acc[2*i..], &comp2.data.data); - // add2(&mut acc[3*i..], &comp3.data.data); - // for _ in 0..i { - // comp1.data.data.insert(0, 0); - // } - // sub2(&mut acc[..], &comp1.data.data); - // }, - // - // (_, Minus, _) => { - // add2(&mut acc[i..], &comp1.data.data); - // add2(&mut acc[3*i..], &comp3.data.data); - // for _ in 0..2*i { - // comp2.data.data.insert(0, 0); - // } - // sub2(&mut acc[..], &comp2.data.data); - // }, - // - // (_, _, Minus) => { - // add2(&mut acc[i..], &comp1.data.data); - // add2(&mut acc[2*i..], &comp2.data.data); - // for _ in 0..3*i { - // comp3.data.data.insert(0, 0); - // } - // sub2(&mut acc[..], &comp3.data.data); - // }, - // - // _ => { - // add2(&mut acc[i..], &comp1.data.data); - // add2(&mut acc[2*i..], &comp2.data.data); - // add2(&mut acc[3*i..], &comp3.data.data); - // } - // } - - // let mut r0 = BigUint{ data: vec![0; 2*i + 1] }; - // mac3(&mut r0.data, &x0, &y0); - // r0.normalize(); - // - // let mut r1 = BigUint{ data: vec![0; 2*(i+2) + 1] }; - // p.data.assign_from_slice(&x0); - // p.data.data.extend_from_slice(&[0, 0]); - // add2(&mut p.data.data, &x1); - // add2(&mut p.data.data, &x2); - // q.data.assign_from_slice(&y0); - // q.data.data.extend_from_slice(&[0, 0]); - // add2(&mut q.data.data, &y1); - // add2(&mut q.data.data, &y2); - // mac3(&mut r1.data, &p.data.data, &q.data.data); - // r1.normalize(); - // - // let mut r2 = BigUint{ data: vec![0; 2*(i+1) + 1] }; - // p.data.assign_from_slice(&x0); - // p.data.data.extend_from_slice(&[0, 0]); - // add2(&mut p.data, &x2); - // p = p - BigInt::from_biguint(Plus, BigUint::from_slice(&x1)); - // q.data.assign_from_slice(&y0); - // q.data.data.extend_from_slice(&[0, 0]); - // add2(&mut q.data, &y2); - // q = q - BigInt::from_biguint(Plus, BigUint::from_slice(&y1)); - // mac3(&mut r2.data, &p.data, &q.data); - // r2.normalize(); - // let r2_sign = p_sign * q_sign; - // - // let mut r3 = BigUint{ data: vec![0; 2*(i+2) + 1] }; - // p.assign_from_slice(&x0); - // p.data.extend_from_slice(&[0, 0]); - // let mut temp = BigUint { data: Vec::with_capacity(i + 1) }; - // temp.assign_from_slice(&x2); - // temp = temp * 4_u32; - // add2(&mut p.data, &temp.data); - // temp.assign_from_slice(&x1); - // temp = temp * 2_u32; - // if p.data[i] == 0 && p.data[..i].iter().rev().lt(temp.data.iter().rev()) { - // p_sign = Minus; - // let p_temp = p.clone(); - // p.assign_from_slice(&temp.data); - // sub2(&mut p.data, &p_temp.data); - // } else { - // p_sign = Plus; - // sub2(&mut p.data, &temp.data); - // } - // q.assign_from_slice(&y0); - // q.data.extend_from_slice(&[0, 0]); - // temp.assign_from_slice(&y2); - // temp = temp * 4_u32; - // add2(&mut q.data, &temp.data); - // temp.assign_from_slice(&y1); - // temp = temp * 2_u32; - // if q.data[i] == 0 && q.data[..i].iter().rev().lt(temp.data.iter().rev()) { - // q_sign = Minus; - // let q_temp = q.clone(); - // q.assign_from_slice(&temp.data); - // sub2(&mut q.data, &q_temp.data); - // } else { - // q_sign = Plus; - // sub2(&mut q.data, &temp.data); - // } - // mac3(&mut r3.data, &p.data, &q.data); - // r3.normalize(); - // let r3_sign = p_sign * q_sign; - // - // let mut r4 = BigUint{ data: vec![0; 2*i + 1] }; - // mac3(&mut r4.data, &x2, &y2); - // r4.normalize(); - // - // let signed_r0 = BigInt::from_biguint(Plus, r0); - // let signed_r1 = BigInt::from_biguint(Plus, r1); - // let signed_r2 = BigInt::from_biguint(r2_sign, r2); - // let signed_r3 = BigInt::from_biguint(r3_sign, r3); - // let signed_r4 = BigInt::from_biguint(Plus, r4); - // - // let comp0 = signed_r0.clone(); - // let comp4 = signed_r4.clone(); - // let mut comp3: BigInt = (&signed_r3 - &signed_r1) / 3; - // let mut comp1: BigInt = (&signed_r1 - &signed_r2) / 2; - // let mut comp2 = &signed_r2 - &signed_r0; - // comp3 = ((&comp2 - &comp3) / 2) + (&signed_r4 * 2); - // comp2 = comp2 + &comp1 - &comp4; - // comp1 = comp1 - &comp3; - // - // add2(&mut acc[..], &comp0.data.data); - // add2(&mut acc[i..], &comp1.data.data); - // add2(&mut acc[2*i..], &comp2.data.data); - // add2(&mut acc[3*i..], &comp3.data.data); - // add2(&mut acc[4*i..], &comp4.data.data); } } From b43c1ab258520c0b731d8cf8ca1aa80f161c978b Mon Sep 17 00:00:00 2001 From: Nicolas Kirchner Date: Tue, 15 Aug 2017 01:39:43 +0200 Subject: [PATCH 04/10] Replace the use of a feature not yet implemented in rust 1.8 --- bigint/src/algorithms.rs | 4 ++-- bigint/src/bigint.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bigint/src/algorithms.rs b/bigint/src/algorithms.rs index 2082d04..2c28113 100644 --- a/bigint/src/algorithms.rs +++ b/bigint/src/algorithms.rs @@ -408,8 +408,8 @@ fn mac3(acc: &mut [BigDigit], b: &[BigDigit], c: &[BigDigit]) { comp1 = comp1 - &comp3; let result = r0 + (comp1 << 32*i) + (comp2 << 2*32*i) + (comp3 << 3*32*i) + (r4 << 4*32*i); - assert!(result.sign != Minus); - add2(&mut acc[..], &result.data.data); + let result_pos = result.to_biguint().unwrap(); + add2(&mut acc[..], &result_pos.data); } } diff --git a/bigint/src/bigint.rs b/bigint/src/bigint.rs index f3cd4ec..bae78d7 100644 --- a/bigint/src/bigint.rs +++ b/bigint/src/bigint.rs @@ -105,8 +105,8 @@ impl serde::Deserialize for Sign { #[derive(Clone, Debug, Hash)] #[cfg_attr(feature = "rustc-serialize", derive(RustcEncodable, RustcDecodable))] pub struct BigInt { - pub(super) sign: Sign, - pub(super) data: BigUint, + sign: Sign, + data: BigUint, } impl PartialEq for BigInt { From 243bc6fe4c3183e63fcd3f1672a07f13c9568f05 Mon Sep 17 00:00:00 2001 From: Nicolas Kirchner Date: Tue, 15 Aug 2017 19:27:22 +0200 Subject: [PATCH 05/10] Optimize Toom-3 algorithm --- bigint/src/algorithms.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/bigint/src/algorithms.rs b/bigint/src/algorithms.rs index 2c28113..33df786 100644 --- a/bigint/src/algorithms.rs +++ b/bigint/src/algorithms.rs @@ -394,11 +394,17 @@ fn mac3(acc: &mut [BigDigit], b: &[BigDigit], c: &[BigDigit]) { let y1 = BigInt::from_slice(Plus, &y[y0_len..y0_len + y1_len]); let y2 = BigInt::from_slice(Plus, &y[y0_len + y1_len..]); + let p = &x0 + &x2; + let q = &y0 + &y2; + + let p2 = &p - &x1; + let q2 = &q - &y1; + let r0 = &x0 * &y0; let r4 = &x2 * &y2; - let r1 = (&x0 + &x1 + &x2) * (&y0 + &y1 + &y2); - let r2 = (&x0 - &x1 + &x2) * (&y0 - &y1 + &y2); - let r3 = (x0 - x1*2 + x2*4) * (y0 - y1*2 + y2*4); + let r1 = (&p + &x1) * (&q + &y1); + let r2 = &p2 * &q2; + let r3 = ((p2 + x2)*2 - x0) * ((q2 + y2)*2 - y0); let mut comp3: BigInt = (&r3 - &r1) / 3; let mut comp1: BigInt = (&r1 - &r2) / 2; From bcd76c55e88a7efc718ad9b3303b5251aaaad142 Mon Sep 17 00:00:00 2001 From: Nicolas Kirchner Date: Tue, 15 Aug 2017 21:14:15 +0200 Subject: [PATCH 06/10] Optimize mac_digit --- bigint/src/algorithms.rs | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/bigint/src/algorithms.rs b/bigint/src/algorithms.rs index 33df786..dc3c426 100644 --- a/bigint/src/algorithms.rs +++ b/bigint/src/algorithms.rs @@ -226,20 +226,18 @@ fn mac_digit(acc: &mut [BigDigit], b: &[BigDigit], c: BigDigit) { return; } - let mut b_iter = b.iter(); let mut carry = 0; - for ai in acc.iter_mut() { - if let Some(bi) = b_iter.next() { - *ai = mac_with_carry(*ai, *bi, c, &mut carry); - } else if carry != 0 { - *ai = mac_with_carry(*ai, 0, c, &mut carry); - } else { - break; - } + for (i, bi) in b.iter().enumerate() { + acc[i] = mac_with_carry(acc[i], *bi, c, &mut carry); } - assert!(carry == 0); + let mut i = b.len(); + + while carry != 0 { + acc[i] = adc(acc[i], 0, &mut carry); + i += 1; + } } /// Three argument multiply accumulate: From 05dc87c04188635851fbd14acde997eb37856aee Mon Sep 17 00:00:00 2001 From: Josh Stone Date: Wed, 20 Sep 2017 11:41:59 -0700 Subject: [PATCH 07/10] Improve mac_digit bounds checking By starting with `split_at_mut`, the hot multiplication loop runs with no bounds checking at all! The remaining carry loop has a slightly simpler check for when the remaining iterator runs dry. --- bigint/src/algorithms.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bigint/src/algorithms.rs b/bigint/src/algorithms.rs index dc3c426..d46f7e7 100644 --- a/bigint/src/algorithms.rs +++ b/bigint/src/algorithms.rs @@ -227,16 +227,16 @@ fn mac_digit(acc: &mut [BigDigit], b: &[BigDigit], c: BigDigit) { } let mut carry = 0; + let (a_lo, a_hi) = acc.split_at_mut(b.len()); - for (i, bi) in b.iter().enumerate() { - acc[i] = mac_with_carry(acc[i], *bi, c, &mut carry); + for (a, &b) in a_lo.iter_mut().zip(b) { + *a = mac_with_carry(*a, b, c, &mut carry); } - let mut i = b.len(); - + let mut a = a_hi.iter_mut(); while carry != 0 { - acc[i] = adc(acc[i], 0, &mut carry); - i += 1; + let a = a.next().expect("carry overflow during multiplication!"); + *a = adc(*a, 0, &mut carry); } } From 2c2e46c8dfd5b7db432e45a3e4bca8a78029496f Mon Sep 17 00:00:00 2001 From: Josh Stone Date: Wed, 20 Sep 2017 13:15:44 -0700 Subject: [PATCH 08/10] Add comments about multiplication strategy --- bigint/src/algorithms.rs | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/bigint/src/algorithms.rs b/bigint/src/algorithms.rs index d46f7e7..1f0bc5a 100644 --- a/bigint/src/algorithms.rs +++ b/bigint/src/algorithms.rs @@ -249,9 +249,19 @@ fn mac3(acc: &mut [BigDigit], b: &[BigDigit], c: &[BigDigit]) { (c, b) }; - // Karatsuba multiplication is slower than long multiplication for small x and y: + // We use three algorithms for different input sizes. // + // - For small inputs, long multiplication is fastest. + // - Next we use Karatsuba multiplication (Toom-2), which we have optimized + // to avoid unnecessary allocations for intermediate values. + // - For the largest inputs we use Toom-3, which better optimizes the + // number of operations, but uses more temporary allocations. + // + // The thresholds are somewhat arbitrary, chosen by evaluating the results + // of `cargo bench --bench bigint multiply`. + if x.len() <= 16 { + // Long multiplication: for (i, xi) in x.iter().enumerate() { mac_digit(&mut acc[i..], y, *xi); } @@ -376,6 +386,13 @@ fn mac3(acc: &mut [BigDigit], b: &[BigDigit], c: &[BigDigit]) { } } else { + // Toom-3 multiplication: + // + // Toom-3 is like Karatsuba above, but dividing the inputs into three parts. + // Both are instances of Toom-Cook, using `k=3` and `k=2` respectively. + // + // FIXME: It would be nice to have comments breaking down the operations below. + let i = y.len()/3 + 1; let x0_len = cmp::min(x.len(), i); From 28d84ca3acdfce3b35bf2b1be9fed7a59fb355ff Mon Sep 17 00:00:00 2001 From: Josh Stone Date: Wed, 20 Sep 2017 13:17:06 -0700 Subject: [PATCH 09/10] Toom-3: operate more on values where possible --- bigint/src/algorithms.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bigint/src/algorithms.rs b/bigint/src/algorithms.rs index 1f0bc5a..8a133b2 100644 --- a/bigint/src/algorithms.rs +++ b/bigint/src/algorithms.rs @@ -417,14 +417,14 @@ fn mac3(acc: &mut [BigDigit], b: &[BigDigit], c: &[BigDigit]) { let r0 = &x0 * &y0; let r4 = &x2 * &y2; - let r1 = (&p + &x1) * (&q + &y1); + let r1 = (p + x1) * (q + y1); let r2 = &p2 * &q2; let r3 = ((p2 + x2)*2 - x0) * ((q2 + y2)*2 - y0); - let mut comp3: BigInt = (&r3 - &r1) / 3; - let mut comp1: BigInt = (&r1 - &r2) / 2; - let mut comp2: BigInt = &r2 - &r0; - comp3 = (&comp2 - &comp3)/2 + &r4*2; + let mut comp3: BigInt = (r3 - &r1) / 3; + let mut comp1: BigInt = (r1 - &r2) / 2; + let mut comp2: BigInt = r2 - &r0; + comp3 = (&comp2 - comp3)/2 + &r4*2; comp2 = comp2 + &comp1 - &r4; comp1 = comp1 - &comp3; From 1ddbee7f37c6d2e9f8eae5742e948909a3569bd6 Mon Sep 17 00:00:00 2001 From: Josh Stone Date: Wed, 20 Sep 2017 13:19:00 -0700 Subject: [PATCH 10/10] bigint mac3: tweak thresholds between algorithms It's not too rigorous, but thresholds 32 and 256 give me better results. Before: test multiply_0 ... bench: 87 ns/iter (+/- 0) test multiply_1 ... bench: 11,926 ns/iter (+/- 19) test multiply_2 ... bench: 772,178 ns/iter (+/- 3,068) test multiply_3 ... bench: 2,034,237 ns/iter (+/- 9,618) After: test multiply_0 ... bench: 87 ns/iter (+/- 0) test multiply_1 ... bench: 11,927 ns/iter (+/- 64) test multiply_2 ... bench: 672,440 ns/iter (+/- 3,570) test multiply_3 ... bench: 1,577,065 ns/iter (+/- 11,137) --- bigint/src/algorithms.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigint/src/algorithms.rs b/bigint/src/algorithms.rs index 8a133b2..d93bdd5 100644 --- a/bigint/src/algorithms.rs +++ b/bigint/src/algorithms.rs @@ -260,12 +260,12 @@ fn mac3(acc: &mut [BigDigit], b: &[BigDigit], c: &[BigDigit]) { // The thresholds are somewhat arbitrary, chosen by evaluating the results // of `cargo bench --bench bigint multiply`. - if x.len() <= 16 { + if x.len() <= 32 { // Long multiplication: for (i, xi) in x.iter().enumerate() { mac_digit(&mut acc[i..], y, *xi); } - } else if x.len() <= 300 { + } else if x.len() <= 256 { /* * Karatsuba multiplication: *