What is PyTorch?
PyTorch is Meta's (formerly Facebook) open-source deep learning framework that has revolutionized ML research and production. Born from the Torch library, PyTorch provides dynamic computation graphs, making it incredibly intuitive for researchers while offering production-ready deployment tools. It powers critical systems at Meta, Tesla, Microsoft, and countless startups.
What sets PyTorch apart is its eager execution model - operations execute immediately as they're called, making debugging as simple as using Python's debugger. Combined with automatic differentiation and seamless GPU acceleration, PyTorch enables rapid prototyping that scales to production without rewriting code.
🧮 Training Cost Calculator
Estimate GPU costs, memory requirements, and training time for your PyTorch models.
📊 Performance Estimates
💡 Optimization Tips: Memory usage is low. You can increase batch size or model size for better GPU utilization.
🌍 PyTorch in Production
🚗 Tesla Autopilot
Scale: Processes 8 cameras at 36fps in real-time
Architecture: Custom PyTorch models on Tesla's FSD chip
Challenge: Sub-100ms inference with safety-critical accuracy
🧠 Meta's Language Models
Scale: Training models with 100B+ parameters
Infrastructure: Distributed across 1000s of GPUs
Innovation: FSDP (Fully Sharded Data Parallel)
🏥 Medical AI at Stanford
Application: Skin cancer detection from photos
Performance: Matches dermatologist accuracy
Deployment: Mobile app with PyTorch Mobile
🎬 Disney's Visual Effects
Use Case: Real-time facial capture and animation
Technology: Custom PyTorch models for motion capture
Innovation: Real-time processing for live film production
PyTorch Fundamentals
Core concepts: tensors, autograd, and GPU acceleration
# PyTorch Fundamentals - Production Ready
import logging
import os
import time
from contextlib import contextmanager

import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader, DistributedSampler
class OptimizedTensorOps:
    """Demonstrations of tensor creation, in-place math, vectorization and AMP.

    Attributes:
        device: Device specifier handed to every tensor factory, stored exactly
            as given (a string such as ``'cuda'``/``'cpu'`` or a ``torch.device``).
        dtype: Dtype used for the tensors created by
            ``efficient_tensor_creation``.
    """

    def __init__(self, device='cuda', dtype=torch.float32):
        """Remember the target device/dtype and enable cuDNN autotuning on CUDA.

        Args:
            device: Device specifier accepted by ``torch.device``.
            dtype: Dtype for tensors created by this helper.
        """
        self.device = device
        self.dtype = dtype
        # Compare the parsed device *type* so that 'cuda:0' and
        # torch.device('cuda') are treated the same as the plain 'cuda' string.
        if self._is_cuda():
            # Autotuning pays off when input shapes stay the same between calls.
            torch.backends.cudnn.benchmark = True
            torch.backends.cudnn.enabled = True

    def _is_cuda(self):
        """Return True when ``self.device`` names a CUDA device."""
        return torch.device(self.device).type == 'cuda'

    def _sync(self):
        """Wait for queued CUDA work so wall-clock timings cover real execution."""
        if self._is_cuda() and torch.cuda.is_available():
            torch.cuda.synchronize()

    @contextmanager
    def timer(self, name):
        """Time the enclosed block and print ``"<name>: <elapsed>ms"``.

        CUDA kernels are launched asynchronously, so the device is synchronized
        before the clock starts and again before it stops. The timing line is
        printed even when the block raises; the exception then propagates.

        Args:
            name: Label printed in front of the elapsed time.
        """
        self._sync()
        start = time.perf_counter()
        try:
            yield
        finally:
            self._sync()
            elapsed_ms = (time.perf_counter() - start) * 1000
            print(f"{name}: {elapsed_ms:.2f}ms")

    def efficient_tensor_creation(self, batch_size=1024):
        """Create demo tensors directly on the target device and mutate one in place.

        Args:
            batch_size: Number of rows in the ``x`` and ``y`` tensors.

        Returns:
            Tuple ``(x, y, weights, bias)``: ``x`` is ``(batch_size, 1024)`` with
            values clamped to ``[-1, 1]``, ``y`` is zeros shaped like ``x``,
            ``weights`` is ``(1024, 512)`` drawn from N(0, 0.02) and ``bias`` is
            ``(512,)`` zeros. All four use ``self.device`` and ``self.dtype``.
        """
        with self.timer("Tensor Creation"):
            # Allocate on the target device up front instead of creating on the
            # CPU and copying afterwards.
            x = torch.randn(batch_size, 1024, device=self.device, dtype=self.dtype)
            y = torch.zeros_like(x)  # inherits shape, dtype and device from x
            weights = torch.empty(
                1024, 512, device=self.device, dtype=self.dtype
            ).normal_(0, 0.02)
            bias = torch.zeros(512, device=self.device, dtype=self.dtype)

        with self.timer("In-place Operations"):
            # Trailing-underscore ops overwrite x rather than allocating a result.
            x.add_(0.1)          # instead of x = x + 0.1
            x.mul_(2.0)          # instead of x = x * 2.0
            x.clamp_(-1.0, 1.0)  # clamps the *values* of x to [-1, 1]
        return x, y, weights, bias

    def vectorized_operations(self, x, y):
        """Compute ``x * y + rowsum(x)`` with a Python loop and with one expression.

        The loop is a timing demonstration only: it covers at most the first
        100 rows and its result is discarded, so the two printed timings are
        not a like-for-like comparison for larger batches.

        Args:
            x: Tensor whose dim 0 is iterated and whose dim 1 is summed.
            y: Tensor multiplied elementwise with ``x``.

        Returns:
            The vectorized result ``x * y + x.sum(dim=1, keepdim=True)``.
        """
        batch_size = x.size(0)
        # BAD: one Python iteration (and several kernel launches) per row.
        with self.timer("Python Loop (Slow)"):
            result_slow = torch.zeros_like(x)
            for i in range(min(100, batch_size)):  # capped to keep the demo short
                result_slow[i] = x[i] * y[i] + x[i].sum()
        # GOOD: a single broadcasted expression over the whole batch.
        with self.timer("Vectorized (Fast)"):
            result_fast = x * y + x.sum(dim=1, keepdim=True)
        return result_fast

    def mixed_precision_example(self):
        """Run one training step of a small linear model under autocast.

        Autocast and gradient scaling are enabled only on CUDA devices; on any
        other device the same step runs in full precision without emitting
        "CUDA is not available" warnings.

        Returns:
            The scalar MSE loss of the step as a Python float.
        """
        use_amp = self._is_cuda()

        # Prefer the device-agnostic scaler when this torch build provides it;
        # older releases only ship the CUDA-specific class.
        scaler_cls = getattr(getattr(torch, "amp", None), "GradScaler", None)
        if scaler_cls is not None:
            scaler = scaler_cls("cuda", enabled=use_amp)
        else:
            from torch.cuda.amp import GradScaler
            scaler = GradScaler(enabled=use_amp)

        model = nn.Linear(1024, 512).to(self.device)
        optimizer = optim.AdamW(model.parameters(), lr=1e-3)
        x = torch.randn(32, 1024, device=self.device)
        target = torch.randn(32, 512, device=self.device)

        with self.timer("Mixed Precision Training Step"):
            with torch.autocast(device_type='cuda' if use_amp else 'cpu',
                                enabled=use_amp):
                output = model(x)
                loss = nn.MSELoss()(output, target)
            # Scale the loss so small half-precision gradients do not underflow;
            # step() unscales before the optimizer update.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
        return loss.item()
# Advanced GPU Memory Management
class GPUMemoryManager:
    """Static helpers illustrating GPU memory inspection and memory-saving tricks."""

    @staticmethod
    def get_memory_stats():
        """Describe current CUDA memory usage.

        Returns:
            ``"Allocated: X.XXGB, Cached: Y.YYGB"`` (values in GiB) when CUDA is
            available, otherwise ``"CUDA not available"``.
        """
        if not torch.cuda.is_available():
            return "CUDA not available"
        gib = 1024 ** 3
        allocated = torch.cuda.memory_allocated() / gib
        cached = torch.cuda.memory_reserved() / gib
        return f"Allocated: {allocated:.2f}GB, Cached: {cached:.2f}GB"

    @staticmethod
    def gradient_checkpointing_example():
        """Build a 10-layer MLP whose layers are individually checkpointed.

        Returns:
            A new ``nn.Module`` mapping ``(..., 1024)`` inputs to ``(..., 1024)``
            outputs through ten ``Linear(1024, 1024) + ReLU`` blocks.
        """
        from torch.utils.checkpoint import checkpoint

        class MemoryEfficientModel(nn.Module):
            """Stack of Linear+ReLU blocks recomputed during backward."""

            def __init__(self):
                super().__init__()
                self.layers = nn.ModuleList([
                    nn.Linear(1024, 1024) for _ in range(10)
                ])
                self.activation = nn.ReLU()

            def forward(self, x):
                # Trade compute for memory: activations inside each block are
                # dropped after the forward pass and recomputed in backward.
                for layer in self.layers:
                    # `layer=layer` binds the current module to the closure.
                    # use_reentrant=False keeps gradients flowing to the
                    # parameters even when the input does not require grad.
                    x = checkpoint(
                        lambda inp, layer=layer: self.activation(layer(inp)),
                        x,
                        use_reentrant=False,
                    )
                return x

        return MemoryEfficientModel()

    @staticmethod
    def memory_efficient_attention(q, k, v, chunk_size=1024):
        """Scaled dot-product attention evaluated one query chunk at a time.

        Only a ``(batch, chunk, key_len)`` score matrix is alive at any moment
        instead of the full ``(batch, seq_len, key_len)`` one. Each query row
        still attends to every key, so the result equals unchunked attention.

        Args:
            q: Query tensor of shape ``(batch, seq_len, hidden_dim)``.
            k: Key tensor whose last two dims are ``(key_len, hidden_dim)``.
            v: Value tensor whose last two dims are ``(key_len, value_dim)``.
            chunk_size: Number of query positions processed per step.

        Returns:
            Tensor of shape ``(batch, seq_len, value_dim)``.

        Raises:
            ValueError: If ``chunk_size`` is not positive.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        batch_size, seq_len, hidden_dim = q.shape

        # Loop-invariant pieces: the transposed keys and the softmax scale.
        k_t = k.transpose(-2, -1)
        scale = hidden_dim ** 0.5

        outputs = []
        for start in range(0, seq_len, chunk_size):
            q_chunk = q[:, start:start + chunk_size]  # slicing clamps at seq_len
            scores = torch.matmul(q_chunk, k_t) / scale
            attn_weights = torch.softmax(scores, dim=-1)
            outputs.append(torch.matmul(attn_weights, v))

        if not outputs:
            # Zero-length query: torch.cat rejects an empty list.
            return q.new_empty(batch_size, 0, v.size(-1))
        return torch.cat(outputs, dim=1)
# Distributed Training Setup
class DistributedTrainingManager:
    """Helpers for launcher-driven (env-var based) distributed training.

    Attributes:
        backend: Process-group backend name passed to ``init_process_group``.
        world_size: Total number of processes, from ``WORLD_SIZE`` (default 1).
        rank: Global rank of this process, from ``RANK`` (default 0).
        local_rank: Rank on the local node, from ``LOCAL_RANK`` (default 0);
            used as the CUDA device index.
    """

    def __init__(self, backend='nccl'):
        """Read the process topology from the environment.

        Args:
            backend: Backend name for ``torch.distributed`` (e.g. ``'nccl'``
                or ``'gloo'``).
        """
        self.backend = backend
        self.world_size = int(os.environ.get('WORLD_SIZE', 1))
        self.rank = int(os.environ.get('RANK', 0))
        self.local_rank = int(os.environ.get('LOCAL_RANK', 0))

    def setup_distributed(self):
        """Initialise the process group; a no-op for single-process runs.

        The CUDA device is selected only when CUDA is available, so CPU-only
        (e.g. gloo) jobs work, and it is selected before the group is created
        so the backend binds to the right device. An already initialised
        process group is left untouched.
        """
        if self.world_size <= 1:
            return
        if torch.cuda.is_available():
            torch.cuda.set_device(self.local_rank)
        if not dist.is_initialized():
            dist.init_process_group(
                backend=self.backend,
                rank=self.rank,
                world_size=self.world_size,
            )

    def wrap_model_for_ddp(self, model):
        """Wrap ``model`` in DistributedDataParallel when running multi-process.

        Args:
            model: The module to wrap.

        Returns:
            ``model`` unchanged for single-process runs. Otherwise a DDP
            wrapper, placed on ``cuda:<local_rank>`` when CUDA is available
            and left on its current device when it is not.
        """
        if self.world_size <= 1:
            return model
        if torch.cuda.is_available():
            model = model.cuda(self.local_rank)
            return DDP(model, device_ids=[self.local_rank])
        # CPU training: DDP must not be given device_ids.
        return DDP(model)

    def create_distributed_dataloader(self, dataset, batch_size, shuffle=True,
                                      num_workers=4, pin_memory=None):
        """Create a DataLoader that shards ``dataset`` across ranks when needed.

        Args:
            dataset: Dataset to load from.
            batch_size: Per-process batch size.
            shuffle: Whether samples are shuffled. In multi-process runs the
                shuffling is delegated to the ``DistributedSampler``.
            num_workers: Number of loader worker processes.
            pin_memory: Whether to use pinned host memory. ``None`` enables it
                only when CUDA is available.

        Returns:
            The configured ``DataLoader``.

        Note:
            With a ``DistributedSampler``, call
            ``loader.sampler.set_epoch(epoch)`` at the start of each epoch,
            otherwise every epoch iterates in the same order.
        """
        if pin_memory is None:
            pin_memory = torch.cuda.is_available()

        sampler = None
        if self.world_size > 1:
            sampler = DistributedSampler(
                dataset,
                num_replicas=self.world_size,
                rank=self.rank,
                shuffle=shuffle,
            )
            # DataLoader rejects shuffle=True together with a sampler.
            shuffle = False

        return DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            sampler=sampler,
            num_workers=num_workers,
            pin_memory=pin_memory,
        )
# Example usage and benchmarking
if __name__ == "__main__":
    # Surface INFO-level (and higher) log records through the root logger.
    logging.basicConfig(level=logging.INFO)

    # Run the demo on the GPU when one is visible, otherwise on the CPU.
    if torch.cuda.is_available():
        demo_device = 'cuda'
    else:
        demo_device = 'cpu'
    ops = OptimizedTensorOps(device=demo_device)

    print(f"Using device: {ops.device}")
    print(f"Memory before: {GPUMemoryManager.get_memory_stats()}")

    # Build the demo tensors, then time the loop-vs-vectorized comparison.
    created = ops.efficient_tensor_creation(batch_size=2048)
    x, y, weights, bias = created
    result = ops.vectorized_operations(x, y)
    print(f"Memory after tensor ops: {GPUMemoryManager.get_memory_stats()}")

    # One mixed-precision optimisation step; report its loss.
    loss = ops.mixed_precision_example()
    print(f"Training loss: {loss:.4f}")
    print(f"Final memory: {GPUMemoryManager.get_memory_stats()}")

Key Features
- ✓ Optimized tensor operations for production
- ✓ Mixed precision training with autocast
- ✓ Advanced memory management techniques
- ✓ Distributed training setup patterns
- ✓ Performance monitoring and profiling
🌟 PyTorch Ecosystem
Core Libraries
- PyTorch Core
- TorchVision
- TorchAudio
- TorchText
Deployment
- TorchServe
- TorchScript
- ONNX Export
- Mobile (iOS/Android)
Specialized
- Lightning
- Ignite
- Captum
- Fairscale
⚖️ PyTorch vs TensorFlow
| Aspect | PyTorch | TensorFlow |
|---|---|---|
| Learning Curve | Easier, Pythonic | Steeper, more concepts |
| Computation Graph | Dynamic (define-by-run) | Static (define-then-run) in TF 1.x; eager by default since TF 2.x, with graphs via tf.function |
| Debugging | Native Python debugging | TensorBoard, more complex |
| Production Deployment | TorchServe, growing | TF Serving, mature |
| Research Adoption | Dominant in research | Strong but declining |
| Mobile/Edge | PyTorch Mobile | TensorFlow Lite |
💡 PyTorch Best Practices
Development
- ✓ Use DataLoader for efficient data loading
- ✓ Implement custom Dataset classes
- ✓ Move tensors to GPU with .to(device)
- ✓ Use torch.no_grad() for inference
- ✓ Set model.eval() during evaluation
- ✓ Clear gradients with optimizer.zero_grad()
Production
- ✓ Use TorchScript for deployment
- ✓ Apply model quantization for speed
- ✓ Implement batch inference
- ✓ Monitor GPU memory usage
- ✓ Save model state_dict, not entire model
- ✓ Use mixed precision training (AMP)