From 10127907f589bee5ffc5b9f433a6c6c54e337130 Mon Sep 17 00:00:00 2001
From: Nicolas Kirchner <kirchner.nic@gmail.com>
Date: Thu, 3 Aug 2017 18:30:09 +0200
Subject: [PATCH 01/10] Add a bench for bigint multiply

This bench allows to see the increase of time
if we double the size of one of the operands.
---
 benches/bigint.rs | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/benches/bigint.rs b/benches/bigint.rs
index 30522d5..0eafb93 100644
--- a/benches/bigint.rs
+++ b/benches/bigint.rs
@@ -78,6 +78,11 @@ fn multiply_2(b: &mut Bencher) {
     multiply_bench(b, 1 << 16, 1 << 16);
 }
 
+#[bench]
+fn multiply_3(b: &mut Bencher) {
+    multiply_bench(b, 1 << 16, 1 << 17);
+}
+
 #[bench]
 fn divide_0(b: &mut Bencher) {
     divide_bench(b, 1 << 8, 1 << 6);

From d7554ad931290aea4ba123cfa1b699a3ad4c8b3b Mon Sep 17 00:00:00 2001
From: Nicolas Kirchner <kirchner.nic@gmail.com>
Date: Sun, 13 Aug 2017 23:32:03 +0200
Subject: [PATCH 02/10] Naive implementation of the Toom-3 algorithm

The Toom-2 algorithm is bypassed for tests.
---
 bigint/src/algorithms.rs | 222 ++++++++++++++++++++++++++++++++++++++-
 bigint/src/bigint.rs     |   4 +-
 2 files changed, 223 insertions(+), 3 deletions(-)

diff --git a/bigint/src/algorithms.rs b/bigint/src/algorithms.rs
index 3560103..a5cb34d 100644
--- a/bigint/src/algorithms.rs
+++ b/bigint/src/algorithms.rs
@@ -8,6 +8,7 @@ use traits::{Zero, One};
 
 use biguint::BigUint;
 
+use bigint::BigInt;
 use bigint::Sign;
 use bigint::Sign::{Minus, NoSign, Plus};
 
@@ -256,7 +257,7 @@ fn mac3(acc: &mut [BigDigit], b: &[BigDigit], c: &[BigDigit]) {
         for (i, xi) in x.iter().enumerate() {
             mac_digit(&mut acc[i..], y, *xi);
         }
-    } else {
+    } else if y.len() <= 0 {
         /*
          * Karatsuba multiplication:
          *
@@ -375,6 +376,225 @@ fn mac3(acc: &mut [BigDigit], b: &[BigDigit], c: &[BigDigit]) {
             },
             NoSign  => (),
         }
+
+    } else {
+        let i = y.len()/3 + 1;
+
+        let x0_len = cmp::min(x.len(), i);
+        let x1_len = cmp::min(x.len() - x0_len, i);
+
+        let y0_len = i;
+        let y1_len = cmp::min(y.len() - y0_len, i);
+
+        let x0 = BigInt::from_slice(Plus, &x[..x0_len]);
+        let x1 = BigInt::from_slice(Plus, &x[x0_len..x0_len + x1_len]);
+        let x2 = BigInt::from_slice(Plus, &x[x0_len + x1_len..]);
+
+        let y0 = BigInt::from_slice(Plus, &y[..y0_len]);
+        let y1 = BigInt::from_slice(Plus, &y[y0_len..y0_len + y1_len]);
+        let y2 = BigInt::from_slice(Plus, &y[y0_len + y1_len..]);
+
+        let r0 = &x0 * &y0;
+        let r1 = (&x0 + &x1 + &x2) * (&y0 + &y1 + &y2);
+        let r2 = (&x0 - &x1 + &x2) * (&y0 - &y1 + &y2);
+        let r3 = (&x0 - &x1*2 + &x2*4) * (&y0 - &y1*2 + &y2*4);
+        let r4 = &x2 * &y2;
+
+        let comp0: BigInt = r0.clone();
+        let comp4: BigInt = r4.clone();
+        let mut comp3: BigInt = (&r3 - &r1) / 3;
+        let mut comp1: BigInt = (&r1 - &r2) / 2;
+        let mut comp2: BigInt = &r2 - &r0;
+        comp3 = (&comp2 - &comp3)/2 + &r4*2;
+        comp2 = comp2 + &comp1 - &comp4;
+        comp1 = comp1 - &comp3;
+
+        let result = comp0 + (comp1 << 32*i) + (comp2 << 2*32*i) + (comp3 << 3*32*i) + (comp4 << 4*32*i);
+        assert!(result.sign != Minus);
+        add2(&mut acc[..], &result.data.data);
+
+        // add2(&mut acc[..], &comp0.data.data);
+        // add2(&mut acc[4*i..], &comp4.data.data);
+        //
+        // match (comp1.sign, comp2.sign, comp3.sign) {
+        //     (Minus, Minus, Minus) => {
+        //         for _ in 0..i {
+        //             comp1.data.data.insert(0, 0);
+        //         }
+        //         sub2(&mut acc[..], &comp1.data.data);
+        //
+        //         for _ in 0..2*i {
+        //             comp2.data.data.insert(0, 0);
+        //         }
+        //         sub2(&mut acc[..], &comp2.data.data);
+        //
+        //         for _ in 0..3*i {
+        //             comp3.data.data.insert(0, 0);
+        //         }
+        //         sub2(&mut acc[..], &comp3.data.data);
+        //     },
+        //
+        //     (Minus, Minus, _) => {
+        //         add2(&mut acc[3*i..], &comp3.data.data);
+        //         for _ in 0..i {
+        //             comp1.data.data.insert(0, 0);
+        //         }
+        //         sub2(&mut acc[..], &comp1.data.data);
+        //         for _ in 0..2*i {
+        //             comp2.data.data.insert(0, 0);
+        //         }
+        //         sub2(&mut acc[..], &comp2.data.data);
+        //     },
+        //
+        //     (Minus, _, Minus) => {
+        //         add2(&mut acc[2*i..], &comp2.data.data);
+        //         for _ in 0..i {
+        //             comp1.data.data.insert(0, 0);
+        //         }
+        //         sub2(&mut acc[..], &comp1.data.data);
+        //         for _ in 0..3*i {
+        //             comp3.data.data.insert(0, 0);
+        //         }
+        //         sub2(&mut acc[..], &comp3.data.data);
+        //     },
+        //
+        //     (_, Minus, Minus) => {
+        //         add2(&mut acc[i..], &comp1.data.data);
+        //         for _ in 0..2*i {
+        //             comp2.data.data.insert(0, 0);
+        //         }
+        //         sub2(&mut acc[..], &comp2.data.data);
+        //         for _ in 0..3*i {
+        //             comp3.data.data.insert(0, 0);
+        //         }
+        //         sub2(&mut acc[..], &comp3.data.data);
+        //     },
+        //
+        //     (Minus, _, _) => {
+        //         add2(&mut acc[2*i..], &comp2.data.data);
+        //         add2(&mut acc[3*i..], &comp3.data.data);
+        //         for _ in 0..i {
+        //             comp1.data.data.insert(0, 0);
+        //         }
+        //         sub2(&mut acc[..], &comp1.data.data);
+        //     },
+        //
+        //     (_, Minus, _) => {
+        //         add2(&mut acc[i..], &comp1.data.data);
+        //         add2(&mut acc[3*i..], &comp3.data.data);
+        //         for _ in 0..2*i {
+        //             comp2.data.data.insert(0, 0);
+        //         }
+        //         sub2(&mut acc[..], &comp2.data.data);
+        //     },
+        //
+        //     (_, _, Minus) => {
+        //         add2(&mut acc[i..], &comp1.data.data);
+        //         add2(&mut acc[2*i..], &comp2.data.data);
+        //         for _ in 0..3*i {
+        //             comp3.data.data.insert(0, 0);
+        //         }
+        //         sub2(&mut acc[..], &comp3.data.data);
+        //     },
+        //
+        //     _ => {
+        //         add2(&mut acc[i..], &comp1.data.data);
+        //         add2(&mut acc[2*i..], &comp2.data.data);
+        //         add2(&mut acc[3*i..], &comp3.data.data);
+        //     }
+        // }
+
+        // let mut r0 = BigUint{ data: vec![0; 2*i + 1] };
+        // mac3(&mut r0.data, &x0, &y0);
+        // r0.normalize();
+        //
+        // let mut r1 = BigUint{ data: vec![0; 2*(i+2) + 1] };
+        // p.data.assign_from_slice(&x0);
+        // p.data.data.extend_from_slice(&[0, 0]);
+        // add2(&mut p.data.data, &x1);
+        // add2(&mut p.data.data, &x2);
+        // q.data.assign_from_slice(&y0);
+        // q.data.data.extend_from_slice(&[0, 0]);
+        // add2(&mut q.data.data, &y1);
+        // add2(&mut q.data.data, &y2);
+        // mac3(&mut r1.data, &p.data.data, &q.data.data);
+        // r1.normalize();
+        //
+        // let mut r2 = BigUint{ data: vec![0; 2*(i+1) + 1] };
+        // p.data.assign_from_slice(&x0);
+        // p.data.data.extend_from_slice(&[0, 0]);
+        // add2(&mut p.data, &x2);
+        // p = p - BigInt::from_biguint(Plus, BigUint::from_slice(&x1));
+        // q.data.assign_from_slice(&y0);
+        // q.data.data.extend_from_slice(&[0, 0]);
+        // add2(&mut q.data, &y2);
+        // q = q - BigInt::from_biguint(Plus, BigUint::from_slice(&y1));
+        // mac3(&mut r2.data, &p.data, &q.data);
+        // r2.normalize();
+        // let r2_sign = p_sign * q_sign;
+        //
+        // let mut r3 = BigUint{ data: vec![0; 2*(i+2) + 1] };
+        // p.assign_from_slice(&x0);
+        // p.data.extend_from_slice(&[0, 0]);
+        // let mut temp = BigUint { data: Vec::with_capacity(i + 1) };
+        // temp.assign_from_slice(&x2);
+        // temp = temp * 4_u32;
+        // add2(&mut p.data, &temp.data);
+        // temp.assign_from_slice(&x1);
+        // temp = temp * 2_u32;
+        // if p.data[i] == 0 && p.data[..i].iter().rev().lt(temp.data.iter().rev()) {
+        //     p_sign = Minus;
+        //     let p_temp = p.clone();
+        //     p.assign_from_slice(&temp.data);
+        //     sub2(&mut p.data, &p_temp.data);
+        // } else {
+        //     p_sign = Plus;
+        //     sub2(&mut p.data, &temp.data);
+        // }
+        // q.assign_from_slice(&y0);
+        // q.data.extend_from_slice(&[0, 0]);
+        // temp.assign_from_slice(&y2);
+        // temp = temp * 4_u32;
+        // add2(&mut q.data, &temp.data);
+        // temp.assign_from_slice(&y1);
+        // temp = temp * 2_u32;
+        // if q.data[i] == 0 && q.data[..i].iter().rev().lt(temp.data.iter().rev()) {
+        //     q_sign = Minus;
+        //     let q_temp = q.clone();
+        //     q.assign_from_slice(&temp.data);
+        //     sub2(&mut q.data, &q_temp.data);
+        // } else {
+        //     q_sign = Plus;
+        //     sub2(&mut q.data, &temp.data);
+        // }
+        // mac3(&mut r3.data, &p.data, &q.data);
+        // r3.normalize();
+        // let r3_sign = p_sign * q_sign;
+        //
+        // let mut r4 = BigUint{ data: vec![0; 2*i + 1] };
+        // mac3(&mut r4.data, &x2, &y2);
+        // r4.normalize();
+        //
+        // let signed_r0 = BigInt::from_biguint(Plus, r0);
+        // let signed_r1 = BigInt::from_biguint(Plus, r1);
+        // let signed_r2 = BigInt::from_biguint(r2_sign, r2);
+        // let signed_r3 = BigInt::from_biguint(r3_sign, r3);
+        // let signed_r4 = BigInt::from_biguint(Plus, r4);
+        //
+        // let comp0 = signed_r0.clone();
+        // let comp4 = signed_r4.clone();
+        // let mut comp3: BigInt = (&signed_r3 - &signed_r1) / 3;
+        // let mut comp1: BigInt = (&signed_r1 - &signed_r2) / 2;
+        // let mut comp2 = &signed_r2 - &signed_r0;
+        // comp3 = ((&comp2 - &comp3) / 2) + (&signed_r4 * 2);
+        // comp2 = comp2 + &comp1 - &comp4;
+        // comp1 = comp1 - &comp3;
+        //
+        // add2(&mut acc[..], &comp0.data.data);
+        // add2(&mut acc[i..], &comp1.data.data);
+        // add2(&mut acc[2*i..], &comp2.data.data);
+        // add2(&mut acc[3*i..], &comp3.data.data);
+        // add2(&mut acc[4*i..], &comp4.data.data);
     }
 }
 
diff --git a/bigint/src/bigint.rs b/bigint/src/bigint.rs
index bae78d7..f3cd4ec 100644
--- a/bigint/src/bigint.rs
+++ b/bigint/src/bigint.rs
@@ -105,8 +105,8 @@ impl serde::Deserialize for Sign {
 #[derive(Clone, Debug, Hash)]
 #[cfg_attr(feature = "rustc-serialize", derive(RustcEncodable, RustcDecodable))]
 pub struct BigInt {
-    sign: Sign,
-    data: BigUint,
+    pub(super) sign: Sign,
+    pub(super) data: BigUint,
 }
 
 impl PartialEq for BigInt {

From c9c40b9402fdcd2ff027b2df5424753609bee1ad Mon Sep 17 00:00:00 2001
From: Nicolas Kirchner <kirchner.nic@gmail.com>
Date: Mon, 14 Aug 2017 20:07:35 +0200
Subject: [PATCH 03/10] Optimize and clean the Toom-3 algorithm and choose
 better thresholds

---
 bigint/src/algorithms.rs | 197 ++-------------------------------------
 1 file changed, 6 insertions(+), 191 deletions(-)

diff --git a/bigint/src/algorithms.rs b/bigint/src/algorithms.rs
index a5cb34d..2082d04 100644
--- a/bigint/src/algorithms.rs
+++ b/bigint/src/algorithms.rs
@@ -253,11 +253,11 @@ fn mac3(acc: &mut [BigDigit], b: &[BigDigit], c: &[BigDigit]) {
 
     // Karatsuba multiplication is slower than long multiplication for small x and y:
     //
-    if x.len() <= 4 {
+    if x.len() <= 16 {
         for (i, xi) in x.iter().enumerate() {
             mac_digit(&mut acc[i..], y, *xi);
         }
-    } else if y.len() <= 0 {
+    } else if x.len() <= 300 {
         /*
          * Karatsuba multiplication:
          *
@@ -395,206 +395,21 @@ fn mac3(acc: &mut [BigDigit], b: &[BigDigit], c: &[BigDigit]) {
         let y2 = BigInt::from_slice(Plus, &y[y0_len + y1_len..]);
 
         let r0 = &x0 * &y0;
+        let r4 = &x2 * &y2;
         let r1 = (&x0 + &x1 + &x2) * (&y0 + &y1 + &y2);
         let r2 = (&x0 - &x1 + &x2) * (&y0 - &y1 + &y2);
-        let r3 = (&x0 - &x1*2 + &x2*4) * (&y0 - &y1*2 + &y2*4);
-        let r4 = &x2 * &y2;
+        let r3 = (x0 - x1*2 + x2*4) * (y0 - y1*2 + y2*4);
 
-        let comp0: BigInt = r0.clone();
-        let comp4: BigInt = r4.clone();
         let mut comp3: BigInt = (&r3 - &r1) / 3;
         let mut comp1: BigInt = (&r1 - &r2) / 2;
         let mut comp2: BigInt = &r2 - &r0;
         comp3 = (&comp2 - &comp3)/2 + &r4*2;
-        comp2 = comp2 + &comp1 - &comp4;
+        comp2 = comp2 + &comp1 - &r4;
         comp1 = comp1 - &comp3;
 
-        let result = comp0 + (comp1 << 32*i) + (comp2 << 2*32*i) + (comp3 << 3*32*i) + (comp4 << 4*32*i);
+        let result = r0 + (comp1 << 32*i) + (comp2 << 2*32*i) + (comp3 << 3*32*i) + (r4 << 4*32*i);
         assert!(result.sign != Minus);
         add2(&mut acc[..], &result.data.data);
-
-        // add2(&mut acc[..], &comp0.data.data);
-        // add2(&mut acc[4*i..], &comp4.data.data);
-        //
-        // match (comp1.sign, comp2.sign, comp3.sign) {
-        //     (Minus, Minus, Minus) => {
-        //         for _ in 0..i {
-        //             comp1.data.data.insert(0, 0);
-        //         }
-        //         sub2(&mut acc[..], &comp1.data.data);
-        //
-        //         for _ in 0..2*i {
-        //             comp2.data.data.insert(0, 0);
-        //         }
-        //         sub2(&mut acc[..], &comp2.data.data);
-        //
-        //         for _ in 0..3*i {
-        //             comp3.data.data.insert(0, 0);
-        //         }
-        //         sub2(&mut acc[..], &comp3.data.data);
-        //     },
-        //
-        //     (Minus, Minus, _) => {
-        //         add2(&mut acc[3*i..], &comp3.data.data);
-        //         for _ in 0..i {
-        //             comp1.data.data.insert(0, 0);
-        //         }
-        //         sub2(&mut acc[..], &comp1.data.data);
-        //         for _ in 0..2*i {
-        //             comp2.data.data.insert(0, 0);
-        //         }
-        //         sub2(&mut acc[..], &comp2.data.data);
-        //     },
-        //
-        //     (Minus, _, Minus) => {
-        //         add2(&mut acc[2*i..], &comp2.data.data);
-        //         for _ in 0..i {
-        //             comp1.data.data.insert(0, 0);
-        //         }
-        //         sub2(&mut acc[..], &comp1.data.data);
-        //         for _ in 0..3*i {
-        //             comp3.data.data.insert(0, 0);
-        //         }
-        //         sub2(&mut acc[..], &comp3.data.data);
-        //     },
-        //
-        //     (_, Minus, Minus) => {
-        //         add2(&mut acc[i..], &comp1.data.data);
-        //         for _ in 0..2*i {
-        //             comp2.data.data.insert(0, 0);
-        //         }
-        //         sub2(&mut acc[..], &comp2.data.data);
-        //         for _ in 0..3*i {
-        //             comp3.data.data.insert(0, 0);
-        //         }
-        //         sub2(&mut acc[..], &comp3.data.data);
-        //     },
-        //
-        //     (Minus, _, _) => {
-        //         add2(&mut acc[2*i..], &comp2.data.data);
-        //         add2(&mut acc[3*i..], &comp3.data.data);
-        //         for _ in 0..i {
-        //             comp1.data.data.insert(0, 0);
-        //         }
-        //         sub2(&mut acc[..], &comp1.data.data);
-        //     },
-        //
-        //     (_, Minus, _) => {
-        //         add2(&mut acc[i..], &comp1.data.data);
-        //         add2(&mut acc[3*i..], &comp3.data.data);
-        //         for _ in 0..2*i {
-        //             comp2.data.data.insert(0, 0);
-        //         }
-        //         sub2(&mut acc[..], &comp2.data.data);
-        //     },
-        //
-        //     (_, _, Minus) => {
-        //         add2(&mut acc[i..], &comp1.data.data);
-        //         add2(&mut acc[2*i..], &comp2.data.data);
-        //         for _ in 0..3*i {
-        //             comp3.data.data.insert(0, 0);
-        //         }
-        //         sub2(&mut acc[..], &comp3.data.data);
-        //     },
-        //
-        //     _ => {
-        //         add2(&mut acc[i..], &comp1.data.data);
-        //         add2(&mut acc[2*i..], &comp2.data.data);
-        //         add2(&mut acc[3*i..], &comp3.data.data);
-        //     }
-        // }
-
-        // let mut r0 = BigUint{ data: vec![0; 2*i + 1] };
-        // mac3(&mut r0.data, &x0, &y0);
-        // r0.normalize();
-        //
-        // let mut r1 = BigUint{ data: vec![0; 2*(i+2) + 1] };
-        // p.data.assign_from_slice(&x0);
-        // p.data.data.extend_from_slice(&[0, 0]);
-        // add2(&mut p.data.data, &x1);
-        // add2(&mut p.data.data, &x2);
-        // q.data.assign_from_slice(&y0);
-        // q.data.data.extend_from_slice(&[0, 0]);
-        // add2(&mut q.data.data, &y1);
-        // add2(&mut q.data.data, &y2);
-        // mac3(&mut r1.data, &p.data.data, &q.data.data);
-        // r1.normalize();
-        //
-        // let mut r2 = BigUint{ data: vec![0; 2*(i+1) + 1] };
-        // p.data.assign_from_slice(&x0);
-        // p.data.data.extend_from_slice(&[0, 0]);
-        // add2(&mut p.data, &x2);
-        // p = p - BigInt::from_biguint(Plus, BigUint::from_slice(&x1));
-        // q.data.assign_from_slice(&y0);
-        // q.data.data.extend_from_slice(&[0, 0]);
-        // add2(&mut q.data, &y2);
-        // q = q - BigInt::from_biguint(Plus, BigUint::from_slice(&y1));
-        // mac3(&mut r2.data, &p.data, &q.data);
-        // r2.normalize();
-        // let r2_sign = p_sign * q_sign;
-        //
-        // let mut r3 = BigUint{ data: vec![0; 2*(i+2) + 1] };
-        // p.assign_from_slice(&x0);
-        // p.data.extend_from_slice(&[0, 0]);
-        // let mut temp = BigUint { data: Vec::with_capacity(i + 1) };
-        // temp.assign_from_slice(&x2);
-        // temp = temp * 4_u32;
-        // add2(&mut p.data, &temp.data);
-        // temp.assign_from_slice(&x1);
-        // temp = temp * 2_u32;
-        // if p.data[i] == 0 && p.data[..i].iter().rev().lt(temp.data.iter().rev()) {
-        //     p_sign = Minus;
-        //     let p_temp = p.clone();
-        //     p.assign_from_slice(&temp.data);
-        //     sub2(&mut p.data, &p_temp.data);
-        // } else {
-        //     p_sign = Plus;
-        //     sub2(&mut p.data, &temp.data);
-        // }
-        // q.assign_from_slice(&y0);
-        // q.data.extend_from_slice(&[0, 0]);
-        // temp.assign_from_slice(&y2);
-        // temp = temp * 4_u32;
-        // add2(&mut q.data, &temp.data);
-        // temp.assign_from_slice(&y1);
-        // temp = temp * 2_u32;
-        // if q.data[i] == 0 && q.data[..i].iter().rev().lt(temp.data.iter().rev()) {
-        //     q_sign = Minus;
-        //     let q_temp = q.clone();
-        //     q.assign_from_slice(&temp.data);
-        //     sub2(&mut q.data, &q_temp.data);
-        // } else {
-        //     q_sign = Plus;
-        //     sub2(&mut q.data, &temp.data);
-        // }
-        // mac3(&mut r3.data, &p.data, &q.data);
-        // r3.normalize();
-        // let r3_sign = p_sign * q_sign;
-        //
-        // let mut r4 = BigUint{ data: vec![0; 2*i + 1] };
-        // mac3(&mut r4.data, &x2, &y2);
-        // r4.normalize();
-        //
-        // let signed_r0 = BigInt::from_biguint(Plus, r0);
-        // let signed_r1 = BigInt::from_biguint(Plus, r1);
-        // let signed_r2 = BigInt::from_biguint(r2_sign, r2);
-        // let signed_r3 = BigInt::from_biguint(r3_sign, r3);
-        // let signed_r4 = BigInt::from_biguint(Plus, r4);
-        //
-        // let comp0 = signed_r0.clone();
-        // let comp4 = signed_r4.clone();
-        // let mut comp3: BigInt = (&signed_r3 - &signed_r1) / 3;
-        // let mut comp1: BigInt = (&signed_r1 - &signed_r2) / 2;
-        // let mut comp2 = &signed_r2 - &signed_r0;
-        // comp3 = ((&comp2 - &comp3) / 2) + (&signed_r4 * 2);
-        // comp2 = comp2 + &comp1 - &comp4;
-        // comp1 = comp1 - &comp3;
-        //
-        // add2(&mut acc[..], &comp0.data.data);
-        // add2(&mut acc[i..], &comp1.data.data);
-        // add2(&mut acc[2*i..], &comp2.data.data);
-        // add2(&mut acc[3*i..], &comp3.data.data);
-        // add2(&mut acc[4*i..], &comp4.data.data);
     }
 }
 

From b43c1ab258520c0b731d8cf8ca1aa80f161c978b Mon Sep 17 00:00:00 2001
From: Nicolas Kirchner <kirchner.nic@gmail.com>
Date: Tue, 15 Aug 2017 01:39:43 +0200
Subject: [PATCH 04/10] Replace the use of a feature not yet implemented in
 rust 1.8

---
 bigint/src/algorithms.rs | 4 ++--
 bigint/src/bigint.rs     | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/bigint/src/algorithms.rs b/bigint/src/algorithms.rs
index 2082d04..2c28113 100644
--- a/bigint/src/algorithms.rs
+++ b/bigint/src/algorithms.rs
@@ -408,8 +408,8 @@ fn mac3(acc: &mut [BigDigit], b: &[BigDigit], c: &[BigDigit]) {
         comp1 = comp1 - &comp3;
 
         let result = r0 + (comp1 << 32*i) + (comp2 << 2*32*i) + (comp3 << 3*32*i) + (r4 << 4*32*i);
-        assert!(result.sign != Minus);
-        add2(&mut acc[..], &result.data.data);
+        let result_pos = result.to_biguint().unwrap();
+        add2(&mut acc[..], &result_pos.data);
     }
 }
 
diff --git a/bigint/src/bigint.rs b/bigint/src/bigint.rs
index f3cd4ec..bae78d7 100644
--- a/bigint/src/bigint.rs
+++ b/bigint/src/bigint.rs
@@ -105,8 +105,8 @@ impl serde::Deserialize for Sign {
 #[derive(Clone, Debug, Hash)]
 #[cfg_attr(feature = "rustc-serialize", derive(RustcEncodable, RustcDecodable))]
 pub struct BigInt {
-    pub(super) sign: Sign,
-    pub(super) data: BigUint,
+    sign: Sign,
+    data: BigUint,
 }
 
 impl PartialEq for BigInt {

From 243bc6fe4c3183e63fcd3f1672a07f13c9568f05 Mon Sep 17 00:00:00 2001
From: Nicolas Kirchner <kirchner.nic@gmail.com>
Date: Tue, 15 Aug 2017 19:27:22 +0200
Subject: [PATCH 05/10] Optimize Toom-3 algorithm

---
 bigint/src/algorithms.rs | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/bigint/src/algorithms.rs b/bigint/src/algorithms.rs
index 2c28113..33df786 100644
--- a/bigint/src/algorithms.rs
+++ b/bigint/src/algorithms.rs
@@ -394,11 +394,17 @@ fn mac3(acc: &mut [BigDigit], b: &[BigDigit], c: &[BigDigit]) {
         let y1 = BigInt::from_slice(Plus, &y[y0_len..y0_len + y1_len]);
         let y2 = BigInt::from_slice(Plus, &y[y0_len + y1_len..]);
 
+        let p = &x0 + &x2;
+        let q = &y0 + &y2;
+
+        let p2 = &p - &x1;
+        let q2 = &q - &y1;
+
         let r0 = &x0 * &y0;
         let r4 = &x2 * &y2;
-        let r1 = (&x0 + &x1 + &x2) * (&y0 + &y1 + &y2);
-        let r2 = (&x0 - &x1 + &x2) * (&y0 - &y1 + &y2);
-        let r3 = (x0 - x1*2 + x2*4) * (y0 - y1*2 + y2*4);
+        let r1 = (&p + &x1) * (&q + &y1);
+        let r2 = &p2 * &q2;
+        let r3 = ((p2 + x2)*2 - x0) * ((q2 + y2)*2 - y0);
 
         let mut comp3: BigInt = (&r3 - &r1) / 3;
         let mut comp1: BigInt = (&r1 - &r2) / 2;

From bcd76c55e88a7efc718ad9b3303b5251aaaad142 Mon Sep 17 00:00:00 2001
From: Nicolas Kirchner <kirchner.nic@gmail.com>
Date: Tue, 15 Aug 2017 21:14:15 +0200
Subject: [PATCH 06/10] Optimize mac_digit

---
 bigint/src/algorithms.rs | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/bigint/src/algorithms.rs b/bigint/src/algorithms.rs
index 33df786..dc3c426 100644
--- a/bigint/src/algorithms.rs
+++ b/bigint/src/algorithms.rs
@@ -226,20 +226,18 @@ fn mac_digit(acc: &mut [BigDigit], b: &[BigDigit], c: BigDigit) {
         return;
     }
 
-    let mut b_iter = b.iter();
     let mut carry = 0;
 
-    for ai in acc.iter_mut() {
-        if let Some(bi) = b_iter.next() {
-            *ai = mac_with_carry(*ai, *bi, c, &mut carry);
-        } else if carry != 0 {
-            *ai = mac_with_carry(*ai, 0, c, &mut carry);
-        } else {
-            break;
-        }
+    for (i, bi) in b.iter().enumerate() {
+        acc[i] = mac_with_carry(acc[i], *bi, c, &mut carry);
     }
 
-    assert!(carry == 0);
+    let mut i = b.len();
+
+    while carry != 0 {
+        acc[i] = adc(acc[i], 0, &mut carry);
+        i += 1;
+    }
 }
 
 /// Three argument multiply accumulate:

From 05dc87c04188635851fbd14acde997eb37856aee Mon Sep 17 00:00:00 2001
From: Josh Stone <cuviper@gmail.com>
Date: Wed, 20 Sep 2017 11:41:59 -0700
Subject: [PATCH 07/10] Improve mac_digit bounds checking

By starting with `split_at_mut`, the hot multiplication loop runs with
no bounds checking at all!  The remaining carry loop has a slightly
simpler check for when the remaining iterator runs dry.
---
 bigint/src/algorithms.rs | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/bigint/src/algorithms.rs b/bigint/src/algorithms.rs
index dc3c426..d46f7e7 100644
--- a/bigint/src/algorithms.rs
+++ b/bigint/src/algorithms.rs
@@ -227,16 +227,16 @@ fn mac_digit(acc: &mut [BigDigit], b: &[BigDigit], c: BigDigit) {
     }
 
     let mut carry = 0;
+    let (a_lo, a_hi) = acc.split_at_mut(b.len());
 
-    for (i, bi) in b.iter().enumerate() {
-        acc[i] = mac_with_carry(acc[i], *bi, c, &mut carry);
+    for (a, &b) in a_lo.iter_mut().zip(b) {
+        *a = mac_with_carry(*a, b, c, &mut carry);
     }
 
-    let mut i = b.len();
-
+    let mut a = a_hi.iter_mut();
     while carry != 0 {
-        acc[i] = adc(acc[i], 0, &mut carry);
-        i += 1;
+        let a = a.next().expect("carry overflow during multiplication!");
+        *a = adc(*a, 0, &mut carry);
     }
 }
 

From 2c2e46c8dfd5b7db432e45a3e4bca8a78029496f Mon Sep 17 00:00:00 2001
From: Josh Stone <cuviper@gmail.com>
Date: Wed, 20 Sep 2017 13:15:44 -0700
Subject: [PATCH 08/10] Add comments about multiplication strategy

---
 bigint/src/algorithms.rs | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/bigint/src/algorithms.rs b/bigint/src/algorithms.rs
index d46f7e7..1f0bc5a 100644
--- a/bigint/src/algorithms.rs
+++ b/bigint/src/algorithms.rs
@@ -249,9 +249,19 @@ fn mac3(acc: &mut [BigDigit], b: &[BigDigit], c: &[BigDigit]) {
         (c, b)
     };
 
-    // Karatsuba multiplication is slower than long multiplication for small x and y:
+    // We use three algorithms for different input sizes.
     //
+    // - For small inputs, long multiplication is fastest.
+    // - Next we use Karatsuba multiplication (Toom-2), which we have optimized
+    //   to avoid unnecessary allocations for intermediate values.
+    // - For the largest inputs we use Toom-3, which better optimizes the
+    //   number of operations, but uses more temporary allocations.
+    //
+    // The thresholds are somewhat arbitrary, chosen by evaluating the results
+    // of `cargo bench --bench bigint multiply`.
+
     if x.len() <= 16 {
+        // Long multiplication:
         for (i, xi) in x.iter().enumerate() {
             mac_digit(&mut acc[i..], y, *xi);
         }
@@ -376,6 +386,13 @@ fn mac3(acc: &mut [BigDigit], b: &[BigDigit], c: &[BigDigit]) {
         }
 
     } else {
+        // Toom-3 multiplication:
+        //
+        // Toom-3 is like Karatsuba above, but dividing the inputs into three parts.
+        // Both are instances of Toom-Cook, using `k=3` and `k=2` respectively.
+        //
+        // FIXME: It would be nice to have comments breaking down the operations below.
+
         let i = y.len()/3 + 1;
 
         let x0_len = cmp::min(x.len(), i);

From 28d84ca3acdfce3b35bf2b1be9fed7a59fb355ff Mon Sep 17 00:00:00 2001
From: Josh Stone <cuviper@gmail.com>
Date: Wed, 20 Sep 2017 13:17:06 -0700
Subject: [PATCH 09/10] Toom-3: operate more on values where possible

---
 bigint/src/algorithms.rs | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/bigint/src/algorithms.rs b/bigint/src/algorithms.rs
index 1f0bc5a..8a133b2 100644
--- a/bigint/src/algorithms.rs
+++ b/bigint/src/algorithms.rs
@@ -417,14 +417,14 @@ fn mac3(acc: &mut [BigDigit], b: &[BigDigit], c: &[BigDigit]) {
 
         let r0 = &x0 * &y0;
         let r4 = &x2 * &y2;
-        let r1 = (&p + &x1) * (&q + &y1);
+        let r1 = (p + x1) * (q + y1);
         let r2 = &p2 * &q2;
         let r3 = ((p2 + x2)*2 - x0) * ((q2 + y2)*2 - y0);
 
-        let mut comp3: BigInt = (&r3 - &r1) / 3;
-        let mut comp1: BigInt = (&r1 - &r2) / 2;
-        let mut comp2: BigInt = &r2 - &r0;
-        comp3 = (&comp2 - &comp3)/2 + &r4*2;
+        let mut comp3: BigInt = (r3 - &r1) / 3;
+        let mut comp1: BigInt = (r1 - &r2) / 2;
+        let mut comp2: BigInt = r2 - &r0;
+        comp3 = (&comp2 - comp3)/2 + &r4*2;
         comp2 = comp2 + &comp1 - &r4;
         comp1 = comp1 - &comp3;
 

From 1ddbee7f37c6d2e9f8eae5742e948909a3569bd6 Mon Sep 17 00:00:00 2001
From: Josh Stone <cuviper@gmail.com>
Date: Wed, 20 Sep 2017 13:19:00 -0700
Subject: [PATCH 10/10] bigint mac3: tweak thresholds between algorithms

It's not too rigorous, but thresholds 32 and 256 give me better results.

Before:

     test multiply_0        ... bench:          87 ns/iter (+/- 0)
     test multiply_1        ... bench:      11,926 ns/iter (+/- 19)
     test multiply_2        ... bench:     772,178 ns/iter (+/- 3,068)
     test multiply_3        ... bench:   2,034,237 ns/iter (+/- 9,618)

After:

     test multiply_0        ... bench:          87 ns/iter (+/- 0)
     test multiply_1        ... bench:      11,927 ns/iter (+/- 64)
     test multiply_2        ... bench:     672,440 ns/iter (+/- 3,570)
     test multiply_3        ... bench:   1,577,065 ns/iter (+/- 11,137)
---
 bigint/src/algorithms.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bigint/src/algorithms.rs b/bigint/src/algorithms.rs
index 8a133b2..d93bdd5 100644
--- a/bigint/src/algorithms.rs
+++ b/bigint/src/algorithms.rs
@@ -260,12 +260,12 @@ fn mac3(acc: &mut [BigDigit], b: &[BigDigit], c: &[BigDigit]) {
     // The thresholds are somewhat arbitrary, chosen by evaluating the results
     // of `cargo bench --bench bigint multiply`.
 
-    if x.len() <= 16 {
+    if x.len() <= 32 {
         // Long multiplication:
         for (i, xi) in x.iter().enumerate() {
             mac_digit(&mut acc[i..], y, *xi);
         }
-    } else if x.len() <= 300 {
+    } else if x.len() <= 256 {
         /*
          * Karatsuba multiplication:
          *