--- name: pytorch-patterns description: PyTorch deep learning patterns and best practices for building robust, efficient, and reproducible training pipelines, model architectures, and data loading. origin: ECC --- # PyTorch Development Patterns Idiomatic PyTorch patterns and best practices for building robust, efficient, and reproducible deep learning applications. ## When to Activate - Writing new PyTorch models or training scripts - Reviewing deep learning code - Debugging training loops or data pipelines - Optimizing GPU memory usage or training speed - Setting up reproducible experiments ## Core Principles ### 1. Device-Agnostic Code Always write code that works on both CPU and GPU without hardcoding devices. ```python # Good: Device-agnostic device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = MyModel().to(device) data = data.to(device) # Bad: Hardcoded device model = MyModel().cuda() # Crashes if no GPU data = data.cuda() ``` ### 2. Reproducibility First Set all random seeds for reproducible results. ```python # Good: Full reproducibility setup def set_seed(seed: int = 42) -> None: torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) np.random.seed(seed) random.seed(seed) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False # Bad: No seed control model = MyModel() # Different weights every run ``` ### 3. Explicit Shape Management Always document and verify tensor shapes. ```python # Good: Shape-annotated forward pass def forward(self, x: torch.Tensor) -> torch.Tensor: # x: (batch_size, channels, height, width) x = self.conv1(x) # -> (batch_size, 32, H, W) x = self.pool(x) # -> (batch_size, 32, H//2, W//2) x = x.view(x.size(0), -1) # -> (batch_size, 32*H//2*W//2) return self.fc(x) # -> (batch_size, num_classes) # Bad: No shape tracking def forward(self, x): x = self.conv1(x) x = self.pool(x) x = x.view(x.size(0), -1) # What size is this? return self.fc(x) # Will this even work? ``` ## Model Architecture Patterns ### Clean nn.Module Structure ```python # Good: Well-organized module class ImageClassifier(nn.Module): def __init__(self, num_classes: int, dropout: float = 0.5) -> None: super().__init__() self.features = nn.Sequential( nn.Conv2d(3, 64, kernel_size=3, padding=1), nn.BatchNorm2d(64), nn.ReLU(inplace=True), nn.MaxPool2d(2), ) self.classifier = nn.Sequential( nn.Dropout(dropout), nn.Linear(64 * 16 * 16, num_classes), ) def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.features(x) x = x.view(x.size(0), -1) return self.classifier(x) # Bad: Everything in forward class ImageClassifier(nn.Module): def __init__(self): super().__init__() def forward(self, x): x = F.conv2d(x, weight=self.make_weight()) # Creates weight each call! return x ``` ### Proper Weight Initialization ```python # Good: Explicit initialization def _init_weights(self, module: nn.Module) -> None: if isinstance(module, nn.Linear): nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu") if module.bias is not None: nn.init.zeros_(module.bias) elif isinstance(module, nn.Conv2d): nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu") elif isinstance(module, nn.BatchNorm2d): nn.init.ones_(module.weight) nn.init.zeros_(module.bias) model = MyModel() model.apply(model._init_weights) ``` ## Training Loop Patterns ### Standard Training Loop ```python # Good: Complete training loop with best practices def train_one_epoch( model: nn.Module, dataloader: DataLoader, optimizer: torch.optim.Optimizer, criterion: nn.Module, device: torch.device, scaler: torch.amp.GradScaler | None = None, ) -> float: model.train() # Always set train mode total_loss = 0.0 for batch_idx, (data, target) in enumerate(dataloader): data, target = data.to(device), target.to(device) optimizer.zero_grad(set_to_none=True) # More efficient than zero_grad() # Mixed precision training with torch.amp.autocast("cuda", enabled=scaler is not None): output = model(data) loss = criterion(output, target) if scaler is not None: scaler.scale(loss).backward() scaler.unscale_(optimizer) torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) scaler.step(optimizer) scaler.update() else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) optimizer.step() total_loss += loss.item() return total_loss / len(dataloader) ``` ### Validation Loop ```python # Good: Proper evaluation @torch.no_grad() # More efficient than wrapping in torch.no_grad() block def evaluate( model: nn.Module, dataloader: DataLoader, criterion: nn.Module, device: torch.device, ) -> tuple[float, float]: model.eval() # Always set eval mode — disables dropout, uses running BN stats total_loss = 0.0 correct = 0 total = 0 for data, target in dataloader: data, target = data.to(device), target.to(device) output = model(data) total_loss += criterion(output, target).item() correct += (output.argmax(1) == target).sum().item() total += target.size(0) return total_loss / len(dataloader), correct / total ``` ## Data Pipeline Patterns ### Custom Dataset ```python # Good: Clean Dataset with type hints class ImageDataset(Dataset): def __init__( self, image_dir: str, labels: dict[str, int], transform: transforms.Compose | None = None, ) -> None: self.image_paths = list(Path(image_dir).glob("*.jpg")) self.labels = labels self.transform = transform def __len__(self) -> int: return len(self.image_paths) def __getitem__(self, idx: int) -> tuple[torch.Tensor, int]: img = Image.open(self.image_paths[idx]).convert("RGB") label = self.labels[self.image_paths[idx].stem] if self.transform: img = self.transform(img) return img, label ``` ### Efficient DataLoader Configuration ```python # Good: Optimized DataLoader dataloader = DataLoader( dataset, batch_size=32, shuffle=True, # Shuffle for training num_workers=4, # Parallel data loading pin_memory=True, # Faster CPU->GPU transfer persistent_workers=True, # Keep workers alive between epochs drop_last=True, # Consistent batch sizes for BatchNorm ) # Bad: Slow defaults dataloader = DataLoader(dataset, batch_size=32) # num_workers=0, no pin_memory ``` ### Custom Collate for Variable-Length Data ```python # Good: Pad sequences in collate_fn def collate_fn(batch: list[tuple[torch.Tensor, int]]) -> tuple[torch.Tensor, torch.Tensor]: sequences, labels = zip(*batch) # Pad to max length in batch padded = nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=0) return padded, torch.tensor(labels) dataloader = DataLoader(dataset, batch_size=32, collate_fn=collate_fn) ``` ## Checkpointing Patterns ### Save and Load Checkpoints ```python # Good: Complete checkpoint with all training state def save_checkpoint( model: nn.Module, optimizer: torch.optim.Optimizer, epoch: int, loss: float, path: str, ) -> None: torch.save({ "epoch": epoch, "model_state_dict": model.state_dict(), "optimizer_state_dict": optimizer.state_dict(), "loss": loss, }, path) def load_checkpoint( path: str, model: nn.Module, optimizer: torch.optim.Optimizer | None = None, ) -> dict: checkpoint = torch.load(path, map_location="cpu", weights_only=True) model.load_state_dict(checkpoint["model_state_dict"]) if optimizer: optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) return checkpoint # Bad: Only saving model weights (can't resume training) torch.save(model.state_dict(), "model.pt") ``` ## Performance Optimization ### Mixed Precision Training ```python # Good: AMP with GradScaler scaler = torch.amp.GradScaler("cuda") for data, target in dataloader: with torch.amp.autocast("cuda"): output = model(data) loss = criterion(output, target) scaler.scale(loss).backward() scaler.step(optimizer) scaler.update() optimizer.zero_grad(set_to_none=True) ``` ### Gradient Checkpointing for Large Models ```python # Good: Trade compute for memory from torch.utils.checkpoint import checkpoint class LargeModel(nn.Module): def forward(self, x: torch.Tensor) -> torch.Tensor: # Recompute activations during backward to save memory x = checkpoint(self.block1, x, use_reentrant=False) x = checkpoint(self.block2, x, use_reentrant=False) return self.head(x) ``` ### torch.compile for Speed ```python # Good: Compile the model for faster execution (PyTorch 2.0+) model = MyModel().to(device) model = torch.compile(model, mode="reduce-overhead") # Modes: "default" (safe), "reduce-overhead" (faster), "max-autotune" (fastest) ``` ## Quick Reference: PyTorch Idioms | Idiom | Description | |-------|-------------| | `model.train()` / `model.eval()` | Always set mode before train/eval | | `torch.no_grad()` | Disable gradients for inference | | `optimizer.zero_grad(set_to_none=True)` | More efficient gradient clearing | | `.to(device)` | Device-agnostic tensor/model placement | | `torch.amp.autocast` | Mixed precision for 2x speed | | `pin_memory=True` | Faster CPU→GPU data transfer | | `torch.compile` | JIT compilation for speed (2.0+) | | `weights_only=True` | Secure model loading | | `torch.manual_seed` | Reproducible experiments | | `gradient_checkpointing` | Trade compute for memory | ## Anti-Patterns to Avoid ```python # Bad: Forgetting model.eval() during validation model.train() with torch.no_grad(): output = model(val_data) # Dropout still active! BatchNorm uses batch stats! # Good: Always set eval mode model.eval() with torch.no_grad(): output = model(val_data) # Bad: In-place operations breaking autograd x = F.relu(x, inplace=True) # Can break gradient computation x += residual # In-place add breaks autograd graph # Good: Out-of-place operations x = F.relu(x) x = x + residual # Bad: Moving data to GPU inside the training loop repeatedly for data, target in dataloader: model = model.cuda() # Moves model EVERY iteration! # Good: Move model once before the loop model = model.to(device) for data, target in dataloader: data, target = data.to(device), target.to(device) # Bad: Using .item() before backward loss = criterion(output, target).item() # Detaches from graph! loss.backward() # Error: can't backprop through .item() # Good: Call .item() only for logging loss = criterion(output, target) loss.backward() print(f"Loss: {loss.item():.4f}") # .item() after backward is fine # Bad: Not using torch.save properly torch.save(model, "model.pt") # Saves entire model (fragile, not portable) # Good: Save state_dict torch.save(model.state_dict(), "model.pt") ``` __Remember__: PyTorch code should be device-agnostic, reproducible, and memory-conscious. When in doubt, profile with `torch.profiler` and check GPU memory with `torch.cuda.memory_summary()`.