Parallel Iterators with Rayon

use rayon::prelude::*;
use std::time::Instant;

// Sequential implementation
fn sum_of_squares_sequential(data: &[i32]) -> i64 {
    data.iter()
        .map(|&x| (x as i64).pow(2))
        .sum()
}

// Parallel implementation
fn sum_of_squares_parallel(data: &[i32]) -> i64 {
    data.par_iter()
        .map(|&x| (x as i64).pow(2))
        .sum()
}

fn main() {
    // Generate test data
    let data: Vec<i32> = (0..10_000_000).collect();
    
    // Measure sequential implementation
    let start = Instant::now();
    let result1 = sum_of_squares_sequential(&data);
    let duration1 = start.elapsed();
    
    // Measure parallel implementation
    let start = Instant::now();
    let result2 = sum_of_squares_parallel(&data);
    let duration2 = start.elapsed();
    
    assert_eq!(result1, result2);
    
    println!("Sequential: {:?}", duration1);
    println!("Parallel: {:?}", duration2);
    println!("Speedup: {:.2}x", duration1.as_secs_f64() / duration2.as_secs_f64());
}
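
To run this example, add Rayon as a dependency; any recent 1.x release works (the exact version pin below is illustrative):

# Cargo.toml
[dependencies]
rayon = "1"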

Thread Pools

use std::sync::mpsc;
use std::thread;
use threadpool::ThreadPool;

fn process_data_sequential(data: &[i32]) -> Vec<i32> {
    data.iter().map(|&x| {
        // Simulate expensive computation
        thread::sleep(std::time::Duration::from_millis(1));
        x * x
    }).collect()
}

fn process_data_parallel(data: &[i32], num_threads: usize) -> Vec<i32> {
    let pool = ThreadPool::new(num_threads);
    let (tx, rx) = mpsc::channel();
    
    for (i, &item) in data.iter().enumerate() {
        let tx = tx.clone();
        pool.execute(move || {
            // Simulate expensive computation
            thread::sleep(std::time::Duration::from_millis(1));
            tx.send((i, item * item)).expect("Channel send failed");
        });
    }
    
    // Drop the original sender so rx.iter() ends once every worker's clone is dropped
    drop(tx);
    
    // Results arrive out of order; place each at its original index
    let mut results = vec![0; data.len()];
    for (i, result) in rx.iter() {
        results[i] = result;
    }
    
    results
}
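
A minimal driver for the two functions above, assuming the threadpool crate (any 1.x release) is declared in Cargo.toml; it checks that the out-of-order parallel path matches the sequential one:

fn main() {
    let data: Vec<i32> = (0..100).collect();

    let sequential = process_data_sequential(&data);
    let parallel = process_data_parallel(&data, 8);

    // Results were reordered by index, so the two must agree exactly
    assert_eq!(sequential, parallel);
    println!("Processed {} items", parallel.len());
}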

Async/Await for I/O-Bound Tasks

use tokio::fs::File;
use tokio::io::{AsyncReadExt, AsyncWriteExt};
use futures::stream::{self, StreamExt};

async fn process_file(path: &str) -> Result<usize, std::io::Error> {
    let mut file = File::open(path).await?;
    let mut contents = Vec::new();
    file.read_to_end(&mut contents).await?;
    
    // Process the contents
    let processed = contents.iter().map(|&b| b.wrapping_add(1)).collect::<Vec<_>>();
    
    // Write the processed contents
    let mut output = File::create(format!("{}.processed", path)).await?;
    output.write_all(&processed).await?;
    
    Ok(processed.len())
}

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let files = vec!["file1.txt", "file2.txt", "file3.txt"];
    
    // Process files concurrently; the closure prints each outcome, so the
    // collected Vec<()> only drives the stream to completion
    stream::iter(files)
        .map(|path| async move {
            match process_file(path).await {
                Ok(size) => println!("Processed {} ({} bytes)", path, size),
                Err(e) => eprintln!("Error processing {}: {}", path, e),
            }
        })
        .buffer_unordered(10) // Process up to 10 files concurrently
        .collect::<Vec<_>>()
        .await;
    
    Ok(())
}
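
This example pulls in Tokio and the futures crate. The feature list below is one workable configuration; "full" is a convenient superset of the fs, io-util, macros, and rt-multi-thread features this code actually uses (exact versions are illustrative):

# Cargo.toml
[dependencies]
tokio = { version = "1", features = ["full"] }
futures = "0.3"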

Compiler and Build Optimizations

Rust’s compiler exposes a range of optimization settings, most conveniently configured through Cargo build profiles:

Optimization Levels

# Cargo.toml

# Debug profile with some optimizations
[profile.dev]
opt-level = 1  # Basic optimizations
debug = true   # Include debug info

# Release profile with maximum optimizations
[profile.release]
opt-level = 3      # Maximum optimizations
lto = "fat"        # Link-time optimization
codegen-units = 1  # Optimize across the entire codebase
panic = "abort"    # Smaller binary size by not unwinding on panic
strip = true       # Strip symbols from binary

Link-Time Optimization (LTO)

# Cargo.toml

[profile.release]
# Enable LTO for better cross-module optimizations
lto = true      # Equivalent to "fat": whole-program LTO
# lto = "thin"  # Faster compilation, slightly less optimization
# lto = "fat"   # Maximum optimization, slower compilation

Profile-Guided Optimization (PGO)

# Step 1: Compile with instrumentation
RUSTFLAGS="-Cprofile-generate=/tmp/pgo-data" cargo build --release

# Step 2: Run the program to collect profile data
./target/release/my_program --typical-workload

# Step 3: Merge the profile data
llvm-profdata merge -o /tmp/pgo-data/merged.profdata /tmp/pgo-data

# Step 4: Compile with the profile data
RUSTFLAGS="-Cprofile-use=/tmp/pgo-data/merged.profdata" cargo build --release