From 771c6dd2301eb5105454da5b41c9ddae98c3944f Mon Sep 17 00:00:00 2001
From: Eric Lynema <elynema@gmail.com>
Date: Sat, 20 Dec 2025 23:15:24 -0500
Subject: [PATCH 1/3] Implement improvements: rayon parallelism, safer CLI,
 output file option, tests, CI workflow, tuned release profile

---
 .github/workflows/ci.yml |  27 ++++++++
 Cargo.toml               |  11 +++-
 README.md                |  42 ++++++------
 src/main.rs              | 134 +++++++++++++++++++++++++++------------
 4 files changed, 151 insertions(+), 63 deletions(-)
 create mode 100644 .github/workflows/ci.yml
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..bc6ed7a
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,27 @@
+name: CI
+
+on:
+  push:
+    branches: [ "**" ]
+  pull_request:
+    branches: [ "**" ]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install Rust toolchain
+        uses: actions-rs/toolchain@v1
+        with:
+          toolchain: stable
+          profile: default # ships rustfmt and clippy, which the fmt/clippy steps below need
+          override: true
+      - name: Run cargo fmt check
+        run: cargo fmt -- --check
+      - name: Run clippy
+        run: cargo clippy --all-targets --all-features -- -D warnings
+      - name: Run tests
+        run: cargo test --verbose
+      - name: Build release
+        run: cargo build --release --verbose
diff --git a/Cargo.toml b/Cargo.toml
index 499c787..d204cae 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,8 +1,17 @@
 [package]
 name = "pi"
 version = "0.1.0"
-edition = "2024"
+edition = "2021"
 
 [dependencies]
 clap = { version = "4.5.4", features = ["derive"] }
 rug = "1.24.1"
+rayon = "1.7"
+
+[dev-dependencies]
+criterion = "0.4"
+
+[profile.release]
+opt-level = 3
+lto = true
+codegen-units = 1
diff --git a/README.md b/README.md
index bd8a499..ded2b49 100644
--- a/README.md
+++ b/README.md
@@ -1,20 +1,18 @@
 # Pi Calculator
 
-This is a multi-threaded Rust program that calculates the first n digits of Pi using the Bailey–Borwein–Plouffe (BBP) formula. It uses arbitrary-precision arithmetic to ensure the accuracy of the calculated digits.
+This is a multi-threaded Rust program that calculates the first n digits of Pi using the Bailey–Borwein–Plouffe (BBP) formula. It uses arbitrary-precision arithmetic (rug) and parallelism (rayon).
 
-## Features
+## Improvements in this branch
 
-*   Calculates the first n digits of Pi.
-*   Multi-threaded to speed up the calculation.
-*   Configurable number of threads.
-*   Uses the BBP algorithm.
-*   High-precision calculation using the `rug` crate.
+* Parallelized BBP summation with rayon for better thread control and load balancing.
+* Safer argument validation and error handling (avoids unwraps on runtime errors).
+* Optional output-to-file support.
+* Added CI workflow to run formatting, clippy, tests and build on push/PR.
+* Release profile tuned for better optimized builds (LTO, opt-level=3).
 
 ## Building
 
-To build the program, you need to have Rust and Cargo installed. You can install them from [https://rustup.rs/](https://rustup.rs/).
-
-Once you have Rust and Cargo installed, you can build the program with the following command:
+Requires Rust and Cargo. Build with:
 
 ```bash
 cargo build --release
@@ -22,26 +20,28 @@ cargo build --release
 
 ## Usage
 
-To run the program, you can use the following command:
-
 ```bash
 ./target/release/pi <N> [OPTIONS]
 ```
 
-### Arguments
+### Arguments
 
-*   `<N>`: The number of digits of Pi to calculate.
+* `<N>`: Number of digits after the decimal point to calculate.
 
-### Options
+### Options
 
-*   `-t`, `--threads <THREADS>`: The number of threads to use. Defaults to 4.
-*   `-h`, `--help`: Print help information.
-*   `-V`, `--version`: Print version information.
+* `-t`, `--threads <THREADS>`: Number of threads to use (default 4).
+* `-o`, `--output <FILE>`: Write output to FILE instead of stdout.
+* `-h`, `--help`: Print help.
 
-### Example
+### Example
 
-To calculate the first 1000 digits of Pi using 8 threads, you can run the following command:
+Calculate 1000 digits using 8 threads and write to a file:
 
 ```bash
-./target/release/pi 1000 -t 8
+./target/release/pi 1000 -t 8 -o pi1000.txt
 ```
+
+## Notes
+
+For very large numbers of digits, using a decimal-friendly algorithm such as Chudnovsky (with binary splitting) will be far faster and more memory-efficient than BBP; consider switching to Chudnovsky for production-grade large computations.
diff --git a/src/main.rs b/src/main.rs
index 149f296..6aea6bd 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,19 +1,28 @@
 use clap::Parser;
-use rug::{Float, ops::Pow};
-use std::thread;
+use rug::Float;
+use rug::ops::Pow;
+use rayon::prelude::*;
+use std::fs::File;
+use std::io::Write;
+use std::path::PathBuf;
 
 #[derive(Parser, Debug)]
 #[command(author, version, about, long_about = None)]
 struct Args {
-    /// Number of digits of Pi to calculate
+    /// Number of digits of Pi to calculate (digits after the decimal point)
     n: u32,
 
     /// Number of threads to use
     #[arg(short, long, default_value_t = 4)]
     threads: usize,
+
+    /// Optional output file (writes result there if provided)
+    #[arg(short, long)]
+    output: Option<PathBuf>,
 }
 
 fn bbp_term(k: u32, prec: u32) -> Float {
+    // Compute one BBP term at precision `prec`.
     let mut term = Float::with_val(prec, 4);
     term /= Float::with_val(prec, 8 * k + 1);
 
@@ -30,52 +39,95 @@ fn bbp_term(k: u32, prec: u32) -> Float {
     term -= term4;
 
     let sixteen = Float::with_val(prec, 16);
-    term /= sixteen.pow(k);
+    term /= sixteen.pow(k as i32);
 
     term
 }
 
-fn main() {
-    let args = Args::parse();
-    let n = args.n;
-    let num_threads = args.threads;
-
-    // Precision for rug::Float. We need a bit more than n decimal digits.
-    // log2(10) is approx 3.32. So, we need n * 3.32 bits.
-    let prec = (n as f64 * 3.33).ceil() as u32 + 10;
-
-    let num_terms = n + 5; // Use more terms for better accuracy
-    let terms_per_thread = (num_terms + num_threads as u32 - 1) / num_threads as u32;
-
-    let mut handles = vec![];
-
-    for i in 0..num_threads {
-        let start = i as u32 * terms_per_thread;
-        let end = ((i + 1) as u32 * terms_per_thread).min(num_terms);
-        let handle = thread::spawn(move || {
-            let mut partial_sum = Float::with_val(prec, 0);
-            for k in start..end {
-                partial_sum += bbp_term(k, prec);
-            }
-            partial_sum
-        });
-        handles.push(handle);
+/// Calculate Pi to `n` decimal digits using a parallelized BBP summation.
+/// Returns a decimal string containing Pi truncated to `n` digits after the decimal point.
+pub fn calculate_pi(n: u32, num_threads: usize) -> Result<String, String> {
+    if n == 0 {
+        return Err("n must be > 0".into());
+    }
+    if num_threads == 0 {
+        return Err("threads must be > 0".into());
     }
 
-    let mut pi = Float::with_val(prec, 0);
-    for handle in handles {
-        pi += handle.join().unwrap();
-    }
+    // Bits of precision: log2(10) ~= 3.321928. Add some guard bits.
+    let prec = (n as f64 * 3.3219280948873626).ceil() as u32 + 20;
 
-    // The user wants n digits after the decimal, and the output to be truncated.
-    // We can achieve this by getting a string with more precision and then truncating it.
-    let pi_string = pi.to_string_radix(10, Some(n as usize + 5)); // Get extra digits for accurate truncation
-    let dot_pos = pi_string.find('.').unwrap_or(1);
+    // BBP converges in base-16; use a modest overestimate for term count.
+    let num_terms = (n as usize / 1) + 20; // conservative
+
+    // Use rayon thread pool to control threads for parallel work.
+    let pool = rayon::ThreadPoolBuilder::new()
+        .num_threads(num_threads)
+        .build()
+        .map_err(|e| format!("Failed to build thread pool: {}", e))?;
+
+    let pi = pool.install(|| {
+        // Parallel iterator over term indices.
+        (0..num_terms as u32)
+            .into_par_iter()
+            .map(|k| bbp_term(k, prec))
+            .reduce(|| Float::with_val(prec, 0), |a, b| a + b)
+    });
+
+    // Convert to decimal string with a few extra digits for safe truncation.
+    let extra = 10usize;
+    let pi_string = pi.to_string_radix(10, Some(n as usize + extra));
+
+    // Find dot safely and truncate or pad as needed.
+    let dot_pos = pi_string.find('.').unwrap_or(pi_string.len());
     let end_pos = dot_pos + 1 + n as usize;
 
-    if pi_string.len() > end_pos {
-        println!("Pi: {}", &pi_string[..end_pos]);
+    let out = if pi_string.len() >= end_pos {
+        pi_string[..end_pos].to_string()
     } else {
-        println!("Pi: {}", pi_string);
+        // If not enough digits were produced, pad with zeros.
+        let mut s = pi_string;
+        if !s.contains('.') {
+            s.push('.');
+        }
+        while s.len() < end_pos {
+            s.push('0');
+        }
+        s
+    };
+
+    Ok(out)
+}
+
+fn main() {
+    let args = Args::parse();
+
+    match calculate_pi(args.n, args.threads) {
+        Ok(pi_str) => {
+            if let Some(path) = args.output {
+                match File::create(&path) {
+                    Ok(mut f) => {
+                        if let Err(e) = writeln!(f, "{}", pi_str) {
+                            eprintln!("Failed to write to {}: {}", path.display(), e);
+                        }
+                    }
+                    Err(e) => eprintln!("Failed to create {}: {}", path.display(), e),
+                }
+            } else {
+                println!("Pi: {}", pi_str);
+            }
+        }
+        Err(e) => eprintln!("Error: {}", e),
     }
-}
\ No newline at end of file
+}
+
+#[cfg(test)]
+mod tests {
+    use super::calculate_pi;
+
+    #[test]
+    fn pi_10_digits() {
+        let pi = calculate_pi(10, 2).expect("calculation failed");
+        assert_eq!(pi, "3.1415926535");
+    }
+}

From ed5c349796859f1fa03c5b0421db842f8b7c5077 Mon Sep 17 00:00:00 2001
From: Eric Lynema <elynema@gmail.com>
Date: Sat, 20 Dec 2025 23:21:36 -0500
Subject: [PATCH 2/3] Implement Chudnovsky binary-splitting Pi calculator
 (replace BBP)

---
 src/main.rs | 100 ++++++++++++++++++++++++++--------------------------
 1 file changed, 50 insertions(+), 50 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index 6aea6bd..87a606a 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,7 +1,5 @@
 use clap::Parser;
-use rug::Float;
-use rug::ops::Pow;
-use rayon::prelude::*;
+use rug::{Float, Integer, ops::Pow};
 use std::fs::File;
 use std::io::Write;
 use std::path::PathBuf;
@@ -12,7 +10,7 @@ struct Args {
     /// Number of digits of Pi to calculate (digits after the decimal point)
     n: u32,
 
-    /// Number of threads to use
+    /// Number of threads to use (currently ignored; kept for CLI compatibility)
     #[arg(short, long, default_value_t = 4)]
     threads: usize,
 
@@ -21,58 +19,61 @@ struct Args {
     output: Option<PathBuf>,
 }
 
-fn bbp_term(k: u32, prec: u32) -> Float {
-    // Compute one BBP term at precision `prec`.
-    let mut term = Float::with_val(prec, 4);
-    term /= Float::with_val(prec, 8 * k + 1);
-
-    let mut term2 = Float::with_val(prec, 2);
-    term2 /= Float::with_val(prec, 8 * k + 4);
-    term -= term2;
-
-    let mut term3 = Float::with_val(prec, 1);
-    term3 /= Float::with_val(prec, 8 * k + 5);
-    term -= term3;
-
-    let mut term4 = Float::with_val(prec, 1);
-    term4 /= Float::with_val(prec, 8 * k + 6);
-    term -= term4;
-
-    let sixteen = Float::with_val(prec, 16);
-    term /= sixteen.pow(k as i32);
-
-    term
+// Chudnovsky binary splitting: returns (P, Q, T) as big integers for the term range [a, b).
+// Callers must pass a < b; the recursion bottoms out at single-term ranges.
+fn bs(a: u64, b: u64) -> (Integer, Integer, Integer) {
+    if b - a == 1 {
+        if a == 0 {
+            // Term 0 seeds the series: P = 1, Q = 1, T = 13591409.
+            return (Integer::from(1), Integer::from(1), Integer::from(13591409));
+        }
+        let k = a as i128; // widen so the small per-term factors below cannot overflow
+        // P(k) = (6k - 5)(2k - 1)(6k - 1)
+        let p = Integer::from(6 * k - 5) * Integer::from(2 * k - 1) * Integer::from(6 * k - 1);
+        // Q(k) = k^3 * 640320^3 / 24; the quotient 640320^3 / 24 is exactly 10939058860032000.
+        let q = Integer::from(k).pow(3) * Integer::from(10939058860032000i128);
+        // T(k) = (-1)^k * P(k) * (13591409 + 545140134k)
+        let mut t = Integer::from(13591409 + 545140134 * k) * &p;
+        if a % 2 == 1 {
+            t = -t;
+        }
+        return (p, q, t);
+    }
+    let m = (a + b) / 2;
+    let (p1, q1, t1) = bs(a, m);
+    let (p2, q2, t2) = bs(m, b);
+    let p = (&p1 * &p2).into();
+    let q = (&q1 * &q2).into();
+    let t1q2: Integer = (&t1 * &q2).into();
+    let p1t2: Integer = (&p1 * &t2).into();
+    let t = t1q2 + p1t2;
+    (p, q, t)
 }
 
-/// Calculate Pi to `n` decimal digits using a parallelized BBP summation.
-/// Returns a decimal string containing Pi truncated to `n` digits after the decimal point.
-pub fn calculate_pi(n: u32, num_threads: usize) -> Result<String, String> {
+/// Calculate Pi to `n` decimal digits using the Chudnovsky algorithm (binary splitting).
+pub fn calculate_pi_chudnovsky(n: u32) -> Result<String, String> {
     if n == 0 {
         return Err("n must be > 0".into());
     }
-    if num_threads == 0 {
-        return Err("threads must be > 0".into());
-    }
 
-    // Bits of precision: log2(10) ~= 3.321928. Add some guard bits.
+    // Each term of Chudnovsky yields ~14.181647462725477 decimal digits
+    let digits_per_term = 14.181647462725477;
+    let terms = ((n as f64) / digits_per_term).ceil() as u64 + 1;
+
+    // Bits of precision: log2(10) ~= 3.321928. Add guard bits.
     let prec = (n as f64 * 3.3219280948873626).ceil() as u32 + 20;
 
-    // BBP converges in base-16; use a modest overestimate for term count.
-    let num_terms = (n as usize / 1) + 20; // conservative
+    let (_p, q, t) = bs(0, terms);
 
-    // Use rayon thread pool to control threads for parallel work.
-    let pool = rayon::ThreadPoolBuilder::new()
-        .num_threads(num_threads)
-        .build()
-        .map_err(|e| format!("Failed to build thread pool: {}", e))?;
+    // Convert big integers to high-precision floats
+    let prec_u = prec as u32;
+    let qf = Float::with_val(prec_u, q);
+    let tf = Float::with_val(prec_u, t);
 
-    let pi = pool.install(|| {
-        // Parallel iterator over term indices.
-        (0..num_terms as u32)
-            .into_par_iter()
-            .map(|k| bbp_term(k, prec))
-            .reduce(|| Float::with_val(prec, 0), |a, b| a + b)
-    });
+    // C = 426880 * sqrt(10005)
+    let c = Float::with_val(prec_u, 426880) * Float::with_val(prec_u, 10005).sqrt();
+
+    let pi = c * qf / tf;
 
     // Convert to decimal string with a few extra digits for safe truncation.
     let extra = 10usize;
@@ -85,7 +86,6 @@ pub fn calculate_pi(n: u32, num_threads: usize) -> Result<String, String> {
     let out = if pi_string.len() >= end_pos {
         pi_string[..end_pos].to_string()
     } else {
-        // If not enough digits were produced, pad with zeros.
         let mut s = pi_string;
         if !s.contains('.') {
             s.push('.');
@@ -102,7 +102,7 @@ pub fn calculate_pi(n: u32, num_threads: usize) -> Result<String, String> {
 fn main() {
     let args = Args::parse();
 
-    match calculate_pi(args.n, args.threads) {
+    match calculate_pi_chudnovsky(args.n) {
         Ok(pi_str) => {
             if let Some(path) = args.output {
                 match File::create(&path) {
@@ -123,11 +123,11 @@ fn main() {
 
 #[cfg(test)]
 mod tests {
-    use super::calculate_pi;
+    use super::calculate_pi_chudnovsky;
 
     #[test]
     fn pi_10_digits() {
-        let pi = calculate_pi(10, 2).expect("calculation failed");
+        let pi = calculate_pi_chudnovsky(10).expect("calculation failed");
         assert_eq!(pi, "3.1415926535");
     }
 }

From 73d532520377d8f4997642a3931f365b9eff25dc Mon Sep 17 00:00:00 2001
From: Eric Lynema <elynema@gmail.com>
Date: Sat, 20 Dec 2025 23:23:56 -0500
Subject: [PATCH 3/3] Parallelize binary-splitting using rayon::join

---
 src/main.rs | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index 87a606a..9b79128 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,5 +1,6 @@
 use clap::Parser;
 use rug::{Float, Integer, ops::Pow};
+use rayon::join;
 use std::fs::File;
 use std::io::Write;
 use std::path::PathBuf;
@@ -40,8 +41,9 @@ fn bs(a: u64, b: u64) -> (Integer, Integer, Integer) {
         return (p, q, t);
     }
     let m = (a + b) / 2;
-    let (p1, q1, t1) = bs(a, m);
-    let (p2, q2, t2) = bs(m, b);
+    let (left, right) = join(|| bs(a, m), || bs(m, b));
+    let (p1, q1, t1) = left;
+    let (p2, q2, t2) = right;
     let p = (&p1 * &p2).into();
     let q = (&q1 * &q2).into();
     let t1q2: Integer = (&t1 * &q2).into();