Low-Level Optimizations
These optimizations focus on making the most efficient use of CPU and memory:
SIMD (Single Instruction, Multiple Data)
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
// Scalar implementation
/// Adds `a` and `b` element-wise into `result`.
///
/// Iterating with `zip` instead of indexing lets the compiler elide the
/// per-element bounds checks the original indexed loop paid (and usually
/// auto-vectorize), and it cannot panic on mismatched lengths: iteration
/// stops at the shortest of the three slices. The safe wrapper below
/// asserts equal lengths before calling, so valid callers see no change.
fn add_arrays_scalar(a: &[f32], b: &[f32], result: &mut [f32]) {
    for ((out, &x), &y) in result.iter_mut().zip(a).zip(b) {
        *out = x + y;
    }
}
// SIMD implementation
/// Adds `a` and `b` element-wise into `result` using 256-bit AVX2 lanes,
/// eight `f32` additions per instruction, with a scalar tail loop.
///
/// # Safety
///
/// - The caller must verify AVX2 support (e.g. with
///   `is_x86_feature_detected!("avx2")`) before calling; this function is
///   compiled with `#[target_feature(enable = "avx2")]`.
/// - `b` and `result` must be at least `a.len()` elements long. The loop
///   bound is derived from `a.len()` only, and each `loadu`/`storeu`
///   touches 8 elements past the checked index — a shorter `b` or
///   `result` would be read/written out of bounds. (The safe wrapper
///   below asserts equal lengths; NOTE(review): confirm any other
///   callers do the same.)
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn add_arrays_simd(a: &[f32], b: &[f32], result: &mut [f32]) {
    let n = a.len();
    let mut i = 0;
    // Process 8 elements at a time using AVX2.
    // `loadu`/`storeu` are the *unaligned* variants, so no alignment
    // requirement is imposed on the slices.
    while i + 8 <= n {
        let a_vec = _mm256_loadu_ps(&a[i]);
        let b_vec = _mm256_loadu_ps(&b[i]);
        let sum = _mm256_add_ps(a_vec, b_vec);
        _mm256_storeu_ps(&mut result[i], sum);
        i += 8;
    }
    // Process remaining elements (scalar tail when n is not a multiple of 8)
    for j in i..n {
        result[j] = a[j] + b[j];
    }
}
// Safe wrapper
/// Element-wise addition of `a` and `b` into `result`.
///
/// All three slices must have the same length (asserted). Dispatches to
/// the AVX2 implementation when the running CPU reports support for it;
/// otherwise — or on non-x86_64 targets — falls back to the portable
/// scalar loop.
fn add_arrays(a: &[f32], b: &[f32], result: &mut [f32]) {
    assert_eq!(a.len(), b.len());
    assert_eq!(a.len(), result.len());

    // Runtime feature detection: the SIMD path is compiled only on
    // x86_64 and taken only when the CPU actually has AVX2.
    #[cfg(target_arch = "x86_64")]
    if is_x86_feature_detected!("avx2") {
        // SAFETY: AVX2 support was just verified, and the asserts above
        // establish the equal-length invariant the SIMD routine relies on.
        unsafe { add_arrays_simd(a, b, result) };
        return;
    }

    // Fall back to scalar implementation
    add_arrays_scalar(a, b, result);
}
Memory Layout Optimization
// Cache-unfriendly struct (fields accessed at different times)
// Each field is exactly 64 bytes — one full cache line on typical x86
// hardware — so an operation that touches `a`, `b`, and `c` together
// must pull three separate cache lines.
struct CacheUnfriendly {
    a: [u8; 64], // 64 bytes (one cache line)
    b: [u8; 64], // 64 bytes (another cache line)
    c: [u8; 64], // 64 bytes (yet another cache line)
}
// Cache-friendly struct (fields accessed together are stored together)
// Grouping co-accessed fields into sub-structs keeps each access pattern
// confined to as few cache lines as possible.
struct CacheFriendly {
    // Fields accessed together in one operation
    header: Header,
    // Fields accessed together in another operation
    data: Data,
}
// Small metadata fields (8 + 4 + 4 = 16 bytes) that the example treats
// as read/written together.
struct Header {
    id: u64,
    flags: u32,
    type_code: u32,
}
// Payload buffer plus a length field, treated as one access unit.
struct Data {
    buffer: [u8; 1024],
    length: usize, // presumably the number of valid bytes in `buffer` — confirm with users
}
// Structure of Arrays vs Array of Structures
// Array of Structures (AoS) - Cache-unfriendly for some operations
// A `Vec<Particle>` interleaves all attributes in memory, so a pass that
// reads only one attribute (e.g. every `mass`) strides over the unused
// fields and wastes cache-line bandwidth.
struct Particle {
    position: [f32; 3],
    velocity: [f32; 3],
    acceleration: [f32; 3],
    mass: f32,
}
// Structure of Arrays (SoA) - Cache-friendly for operations on specific attributes
// Each attribute lives in its own contiguous Vec, so a pass over a single
// attribute (e.g. all masses) reads memory sequentially with no gaps —
// also the layout SIMD loops like the one above want.
struct ParticleSystem {
    positions: Vec<[f32; 3]>,
    velocities: Vec<[f32; 3]>,
    accelerations: Vec<[f32; 3]>,
    masses: Vec<f32>,
}
Alignment and Padding
// Unaligned struct (potentially inefficient memory access)
// `#[repr(C)]` is required to force declaration order: the default Rust
// repr is free to reorder fields (and in practice does, to minimize
// padding), which would silently defeat the point of this example.
// C layout here: a (1 byte) + 3 bytes padding so `b` is 4-aligned +
// b (4 bytes) + c (2 bytes) + 2 bytes tail padding (align = 4) = 12 bytes,
// versus 8 bytes for the reordered `Aligned` version below.
#[repr(C)]
struct Unaligned {
    a: u8,  // 1 byte, followed by 3 bytes of compiler-inserted padding
    b: u32, // 4 bytes
    c: u16, // 2 bytes, followed by 2 bytes of tail padding
}
// Aligned struct (more efficient memory access)
// Fields ordered largest-first so every field is naturally aligned under
// C layout: b at offset 0, c at 4, a at 6, _pad at 7 — 8 bytes total
// with no compiler-inserted padding.
#[repr(C)]
struct Aligned {
    b: u32,   // 4 bytes (aligned)
    c: u16,   // 2 bytes (aligned)
    a: u8,    // 1 byte
    _pad: u8, // 1 byte of explicit padding to maintain alignment
}
// Explicitly aligned struct
// `#[repr(align(64))]` forces every instance to start on a 64-byte
// boundary (a common cache line size), so `data` never straddles two
// cache lines — useful e.g. to avoid false sharing between threads.
#[repr(align(64))]
struct CacheLineAligned {
    data: [u8; 64],
}
Branch Prediction Optimization
// Branch-heavy code (potentially slow due to branch mispredictions)
/// Sums the strictly positive values in `data`.
///
/// The `filter` predicate is a conditional evaluated per element, so an
/// unpredictable sign pattern can still cause branch mispredictions —
/// same behavior as an explicit `if` inside a loop.
fn sum_if_positive(data: &[i32]) -> i32 {
    data.iter().filter(|&&x| x > 0).sum()
}
// Branchless version (potentially faster if branches are unpredictable)
/// Same result as `sum_if_positive`, but the per-element select is pure
/// arithmetic: `(x > 0) as i32` is 0 or 1, so `x * mask` contributes
/// either `x` or 0 without a data-dependent branch.
fn sum_if_positive_branchless(data: &[i32]) -> i32 {
    data.iter().fold(0, |acc, &x| acc + x * (x > 0) as i32)
}
Memory Management Optimizations
Efficient memory management can significantly improve performance:
Custom Allocators
use std::alloc::{GlobalAlloc, Layout, System};
// A simple allocator that tracks allocations by logging every call and
// delegating the real work to the system allocator.
struct TracingAllocator;

unsafe impl GlobalAlloc for TracingAllocator {
    // Log the request size, then forward to the default system allocator.
    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
        // WARNING(review): `println!` can itself allocate (formatting and
        // stdout buffering), which re-enters this allocator once it is
        // registered as `#[global_allocator]` — risking unbounded
        // recursion or deadlock. Fine as an illustration; do not use
        // as-is in production code.
        println!("Allocating {} bytes", layout.size());
        System.alloc(layout)
    }

    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
        println!("Deallocating {} bytes", layout.size());
        System.dealloc(ptr, layout)
    }
}

// Register the custom allocator: from here on, *every* heap allocation in
// the program (Vec, String, Box, ...) goes through TracingAllocator.
#[global_allocator]
static ALLOCATOR: TracingAllocator = TracingAllocator;
// Using a bump allocator for temporary allocations
use bumpalo::Bump;

/// For every input byte `x`, emits `x + y + z` for each (y, z) pair drawn
/// from two small temporary tables — 9 output bytes per input byte.
///
/// The temporaries live in a bump arena: each allocation is a pointer
/// bump, and everything is freed at once when `bump` is dropped.
///
/// Note: `x + y + z` is plain `u8` addition, so inputs above 246 overflow
/// (panic in debug builds, wrap in release) — unchanged from the original.
fn process_data(data: &[u8]) -> Vec<u8> {
    // Create a bump allocator for temporary allocations
    let bump = Bump::new();

    // Allocate temporary buffers from the bump allocator.
    // `alloc_slice_copy` returns `&mut [u8]`, which is NOT `Copy`; the
    // original `for &y in temp1` moved the slices into the loops, which
    // fails to compile on the second outer iteration. `.iter()` reborrows
    // instead, so the slices can be traversed repeatedly.
    let temp1 = bump.alloc_slice_copy(&[1, 2, 3]);
    let temp2 = bump.alloc_slice_copy(&[4, 5, 6]);

    // Preallocate the exact output size (9 results per input byte) so the
    // Vec never has to grow-and-copy while being filled.
    let mut result = Vec::with_capacity(data.len() * temp1.len() * temp2.len());
    for &x in data {
        for &y in temp1.iter() {
            for &z in temp2.iter() {
                result.push(x + y + z);
            }
        }
    }

    // The bump allocator and all its allocations are freed when `bump`
    // goes out of scope here.
    result
}