Advanced Debugging - Profiling, Memory Analysis, and Production Debugging
Performance bugs are among the sneakiest problems you’ll encounter. Your code works correctly but runs too slowly, uses too much memory, or mysteriously degrades over time. I’ve seen applications that worked fine in development but slowed to a crawl in production because nobody profiled them under realistic load.
Advanced debugging goes beyond finding logical errors to understanding how your code behaves under stress, where it spends time, and how it uses system resources. These skills become essential as your applications scale and performance becomes critical.
CPU Profiling with cProfile
Python’s built-in cProfile module shows you exactly where your program spends its time. This data is invaluable for identifying performance bottlenecks:
import cProfile
import pstats
from io import StringIO

def expensive_calculation(n):
    """Simulate CPU-intensive work."""
    total = 0
    for i in range(n):
        for j in range(100):
            total += i * j
    return total

def inefficient_string_building(items):
    """Demonstrate inefficient string concatenation."""
    result = ""
    for item in items:
        result += str(item) + ", "  # This creates a new string each time
    return result.rstrip(", ")

def profile_performance():
    """Profile code to identify bottlenecks."""
    pr = cProfile.Profile()
    pr.enable()

    # Code to profile
    result1 = expensive_calculation(1000)
    result2 = inefficient_string_building(range(10000))

    pr.disable()

    # Analyze results
    s = StringIO()
    ps = pstats.Stats(pr, stream=s).sort_stats('cumulative')
    ps.print_stats()
    print(s.getvalue())

if __name__ == "__main__":
    profile_performance()
The profiler output shows function call counts, total time, and time per call, helping you identify which functions consume the most resources.
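If you only need a quick look, you can also run cProfile from the command line with python -m cProfile -s cumulative your_script.py, or limit the report to the slowest entries. Here is a minimal, self-contained sketch of the latter; slow_sum is just a toy workload for illustration, not part of the code above:

import cProfile
import pstats

def slow_sum(n):
    # Toy workload so the profile has something to show
    return sum(i * i for i in range(n))

profiler = cProfile.Profile()
profiler.enable()
slow_sum(1_000_000)
profiler.disable()

# sort_stats accepts keys such as 'cumulative', 'tottime', and 'calls';
# print_stats(10) limits the report to the ten costliest entries.
stats = pstats.Stats(profiler).sort_stats('tottime')
stats.print_stats(10)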
Line-by-Line Profiling
For detailed analysis of specific functions, use line_profiler to see exactly which lines are slow:
# Install: pip install line_profiler
# Run: kernprof -l -v script.py

@profile  # This decorator is injected by line_profiler's kernprof at runtime
def analyze_data(data):
    """Function to profile line by line."""
    # Step 1: Fast operation
    filtered = [x for x in data if x > 0]

    # Step 2: Potentially slow operation
    sorted_data = sorted(filtered, reverse=True)

    # Step 3: Another potentially slow operation
    result = sum(x ** 2 for x in sorted_data[:100])

    return result

def main():
    import random
    data = [random.randint(-100, 100) for _ in range(100000)]
    result = analyze_data(data)
    print(f"Result: {result}")

if __name__ == "__main__":
    main()
Line profiler shows the execution time for each line, making it easy to spot the exact operations that need optimization.
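If you prefer not to run your script through kernprof, line_profiler can also be driven programmatically via its LineProfiler class. The sketch below wraps a trimmed-down copy of analyze_data by hand; the API shown is the commonly documented one, but check the version you have installed:

from line_profiler import LineProfiler
import random

def analyze_data(data):
    filtered = [x for x in data if x > 0]
    sorted_data = sorted(filtered, reverse=True)
    return sum(x ** 2 for x in sorted_data[:100])

data = [random.randint(-100, 100) for _ in range(100000)]

lp = LineProfiler()
profiled_analyze = lp(analyze_data)  # wrap the function instead of using @profile
profiled_analyze(data)
lp.print_stats()  # per-line timings, similar to kernprof -v output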
Memory Profiling and Leak Detection
Memory issues can be harder to debug than CPU performance problems. Use memory_profiler to track memory usage over time:
# Install: pip install memory_profiler psutil
from memory_profiler import profile
import gc

@profile
def memory_intensive_function():
    """Function that demonstrates memory usage patterns."""
    # Create large data structures
    large_list = list(range(1000000))  # ~40MB

    # Create nested structures
    nested_data = {i: list(range(100)) for i in range(10000)}  # More memory

    # Process data (memory usage should stay stable)
    processed = [x * 2 for x in large_list if x % 2 == 0]

    # Clean up explicitly
    del large_list
    del nested_data
    gc.collect()  # Force garbage collection

    return len(processed)

def detect_memory_leaks():
    """Run the function multiple times to detect memory leaks."""
    import tracemalloc

    tracemalloc.start()

    for i in range(5):
        result = memory_intensive_function()

        # Take a memory snapshot
        snapshot = tracemalloc.take_snapshot()
        top_stats = snapshot.statistics('lineno')

        print(f"Iteration {i+1}: {len(top_stats)} memory allocations")
        for stat in top_stats[:3]:
            print(f"  {stat}")

if __name__ == "__main__":
    detect_memory_leaks()
Memory profiling helps identify memory leaks, excessive allocations, and opportunities for optimization.
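For leak hunting specifically, the standard library's tracemalloc can also diff two snapshots and report which lines grew between them. A minimal sketch, with a deliberately leaky cache standing in for a real bug:

import tracemalloc

tracemalloc.start()

leaky_cache = []  # deliberately grows to simulate a leak

def handle_request(i):
    leaky_cache.append(bytearray(10_000))  # reference is never released

before = tracemalloc.take_snapshot()
for i in range(1000):
    handle_request(i)
after = tracemalloc.take_snapshot()

# Lines with the largest net growth are the leak suspects
for stat in after.compare_to(before, 'lineno')[:5]:
    print(stat)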
Production Debugging Strategies
Debugging production issues requires different techniques because you can’t stop the application or add breakpoints. Instead, you rely on logging, monitoring, and non-intrusive debugging tools:
import logging
import sys
import traceback
from functools import wraps

# Configure structured logging for production
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('app.log'),
        logging.StreamHandler(sys.stdout)
    ]
)

logger = logging.getLogger(__name__)
def debug_on_error(func):
    """Decorator to capture detailed error information."""
    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            # Capture detailed error context. Keys passed via extra= must not
            # collide with built-in LogRecord attributes (e.g. 'args'), so the
            # argument fields use prefixed names.
            error_info = {
                'function': func.__name__,
                'call_args': str(args)[:200],  # Limit size
                'call_kwargs': str(kwargs)[:200],
                'exception_type': type(e).__name__,
                'exception_message': str(e),
                'traceback': traceback.format_exc()
            }
            logger.error(f"Function {func.__name__} failed", extra=error_info)
            raise
    return wrapper
@debug_on_error
def process_user_request(user_id, action, data):
    """Example function with production debugging."""
    logger.info(f"Processing request for user {user_id}: {action}")

    # Simulate processing
    if action == "invalid_action":
        raise ValueError(f"Unknown action: {action}")

    # Log performance metrics
    import time
    start_time = time.time()

    # Simulate work
    time.sleep(0.1)

    duration = time.time() - start_time
    logger.info(f"Request processed in {duration:.3f}s", extra={
        'user_id': user_id,
        'action': action,
        'duration': duration
    })

    return {"status": "success", "duration": duration}
This approach captures detailed error information without impacting normal operation performance.
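A quick way to see the decorator at work is to exercise both paths of the function above; the user ID, action names, and payload here are just illustrative values:

# Normal request: logs the processing time and returns a result
print(process_user_request(42, "update_profile", {"name": "Ada"}))

# Failing request: the decorator logs the full context and traceback,
# then re-raises so the caller still sees the error
try:
    process_user_request(42, "invalid_action", {})
except ValueError:
    pass  # already logged by debug_on_error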
Debugging Distributed Systems
When debugging applications that span multiple services, correlation IDs help track requests across system boundaries:
import uuid
import logging
from contextvars import ContextVar

# Context variable to store the correlation ID across async calls
correlation_id: ContextVar[str] = ContextVar('correlation_id', default='')

class CorrelationFilter(logging.Filter):
    """Add the correlation ID to every log record."""
    def filter(self, record):
        record.correlation_id = correlation_id.get('')
        return True

# Configure logging with correlation IDs. For the ID to appear in output,
# the handler's format string must include %(correlation_id)s.
logger = logging.getLogger(__name__)
logger.addFilter(CorrelationFilter())
def set_correlation_id(cid=None):
    """Set the correlation ID for the current context."""
    if cid is None:
        cid = str(uuid.uuid4())
    correlation_id.set(cid)
    return cid

async def service_a_function(data):
    """Function in service A."""
    cid = set_correlation_id()
    logger.info(f"Service A processing data: {data}")

    # Call service B
    result = await call_service_b(data, cid)

    logger.info("Service A completed processing")
    return result

async def call_service_b(data, cid):
    """Simulate calling another service."""
    correlation_id.set(cid)  # Propagate correlation ID
    logger.info(f"Service B received data: {data}")

    # Simulate processing
    processed_data = {"processed": data, "service": "B"}

    logger.info("Service B completed processing")
    return processed_data
Correlation IDs let you trace a single request through multiple services, making distributed debugging much easier.
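To see this end to end, run service A under asyncio with a handler whose format string includes the correlation ID. The handler configuration below is illustrative and builds on the logger, filter, and functions defined above:

import asyncio
import logging
import sys

# Illustrative handler: %(correlation_id)s is supplied by CorrelationFilter
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(logging.Formatter(
    '%(asctime)s [%(correlation_id)s] %(levelname)s %(message)s'))
logger.addHandler(handler)
logger.setLevel(logging.INFO)

# Both Service A and Service B log lines now carry the same correlation ID,
# so a single request can be grepped out of interleaved logs.
asyncio.run(service_a_function({"order_id": 123}))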
Performance Regression Detection
Set up automated performance monitoring to catch regressions before they reach production:
import time
import statistics
from functools import wraps

class PerformanceMonitor:
    """Monitor function performance over time."""

    def __init__(self):
        self.metrics = {}

    def monitor(self, func_name=None):
        """Decorator to monitor function performance."""
        def decorator(func):
            name = func_name or func.__name__

            @wraps(func)
            def wrapper(*args, **kwargs):
                start_time = time.perf_counter()
                try:
                    result = func(*args, **kwargs)
                    success = True
                except Exception:
                    success = False
                    raise
                finally:
                    duration = time.perf_counter() - start_time
                    self._record_metric(name, duration, success)
                return result

            return wrapper
        return decorator

    def _record_metric(self, func_name, duration, success):
        """Record a performance metric."""
        if func_name not in self.metrics:
            self.metrics[func_name] = {
                'durations': [],
                'success_count': 0,
                'error_count': 0
            }

        self.metrics[func_name]['durations'].append(duration)
        if success:
            self.metrics[func_name]['success_count'] += 1
        else:
            self.metrics[func_name]['error_count'] += 1

        # Keep only recent measurements
        if len(self.metrics[func_name]['durations']) > 1000:
            self.metrics[func_name]['durations'] = \
                self.metrics[func_name]['durations'][-1000:]

    def get_stats(self, func_name):
        """Get performance statistics for a function."""
        if func_name not in self.metrics:
            return None

        durations = self.metrics[func_name]['durations']
        if not durations:
            return None

        return {
            'mean': statistics.mean(durations),
            'median': statistics.median(durations),
            'p95': statistics.quantiles(durations, n=20)[18],  # 95th percentile
            'success_rate': self.metrics[func_name]['success_count'] /
                            (self.metrics[func_name]['success_count'] +
                             self.metrics[func_name]['error_count'])
        }
# Usage example
monitor = PerformanceMonitor()

@monitor.monitor()
def database_query(query):
    """Simulate a database query."""
    time.sleep(0.01)  # Simulate query time
    return f"Results for: {query}"

# Run some queries so there is data to analyze, then check performance
for i in range(50):
    database_query(f"SELECT * FROM users WHERE id = {i}")

stats = monitor.get_stats('database_query')
if stats and stats['p95'] > 0.05:  # Alert if 95th percentile > 50ms
    print(f"Performance regression detected: {stats}")
This monitoring system helps you catch performance regressions early and understand how your application performs under different conditions.
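One way to turn these statistics into an automated check is to compare them against a stored baseline, for example in a CI job. The file name and threshold below are assumptions for illustration, not part of the monitor above:

import json

BASELINE_FILE = "perf_baseline.json"   # hypothetical location for stored baselines
ALLOWED_SLOWDOWN = 1.2                 # fail if p95 grows by more than 20%

def check_regression(monitor, func_name):
    stats = monitor.get_stats(func_name)
    if stats is None:
        return True  # nothing measured yet

    try:
        with open(BASELINE_FILE) as f:
            baseline = json.load(f)
    except FileNotFoundError:
        baseline = {}

    previous_p95 = baseline.get(func_name)
    ok = previous_p95 is None or stats['p95'] <= previous_p95 * ALLOWED_SLOWDOWN

    # Record the new measurement as the baseline for the next run
    baseline[func_name] = stats['p95']
    with open(BASELINE_FILE, "w") as f:
        json.dump(baseline, f)

    return ok

# e.g. at the end of a CI performance job:
if not check_regression(monitor, 'database_query'):
    print("Performance regression: p95 exceeded baseline by more than 20%")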
In our next part, we’ll explore test-driven development (TDD) and behavior-driven development (BDD) methodologies. We’ll learn how writing tests first can improve code design, reduce bugs, and create better documentation for your applications.