Image Captioning Systems

Building AI systems that automatically generate natural language descriptions from images

Captioning Architectures

Vision Transformer + Text Decoder Implementation (BLIP)

import torch
import torch.nn as nn
from transformers import BlipProcessor, BlipForConditionalGeneration

class ImageCaptioningPipeline:
    def __init__(self, model_name="Salesforce/blip-image-captioning-base"):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        # Load BLIP model and processor
        self.processor = BlipProcessor.from_pretrained(model_name)
        self.model = BlipForConditionalGeneration.from_pretrained(model_name)
        self.model.to(self.device)
        self.model.eval()
        
    def generate_caption(self, image, prompt=None):
        """Generate a caption, optionally conditioned on a text prompt."""
        if prompt:
            # Conditional generation: the processor tokenizes the prompt together with the image
            inputs = self.processor(image, prompt, return_tensors="pt").to(self.device)
        else:
            # Unconditional generation from the image alone
            inputs = self.processor(image, return_tensors="pt").to(self.device)
        
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_length=50,
                num_beams=5,
                early_stopping=True
            )
        
        caption = self.processor.decode(outputs[0], skip_special_tokens=True)
        return caption
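
A minimal usage sketch (the image filename below is hypothetical; any RGB image works):

from PIL import Image

image = Image.open("example.jpg").convert("RGB")  # hypothetical local image

pipeline = ImageCaptioningPipeline()
print(pipeline.generate_caption(image))                        # unconditional caption
print(pipeline.generate_caption(image, prompt="a photo of"))   # prompt-conditioned caption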

Evaluation Metrics
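
Generated captions are usually scored against human reference captions with overlap-based metrics such as BLEU, METEOR, ROUGE-L, CIDEr, and SPICE; CLIPScore is a common reference-free alternative. A minimal sketch of sentence-level BLEU using NLTK (assumes nltk is installed; the example captions are hypothetical):

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Tokenized human reference captions and one model-generated candidate
references = [["a", "dog", "runs", "across", "the", "grass"]]
candidate = ["a", "dog", "running", "on", "grass"]

score = sentence_bleu(
    references, candidate,
    smoothing_function=SmoothingFunction().method1  # smooth zero n-gram counts
)
print(f"BLEU: {score:.3f}")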

Production Service

import asyncio
from PIL import Image
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
import redis
import json
from datetime import datetime

class ImageCaptioningService:
    def __init__(self, config):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        # Load model
        self.processor = BlipProcessor.from_pretrained(
            "Salesforce/blip-image-captioning-base"
        )
        self.model = BlipForConditionalGeneration.from_pretrained(
            "Salesforce/blip-image-captioning-base"
        )
        self.model.to(self.device)
        self.model.eval()
        
        # Redis client for caching generated captions
        self.redis_client = redis.Redis(host='localhost', port=6379)
        
    async def generate_caption(self, image_data, style="descriptive"):
        """Generate caption for an image with specified style"""
        
        # Process image
        image = Image.open(image_data).convert('RGB')
        
        # Style-specific prompt engineering
        prompts = {
            "descriptive": "A photo of",
            "creative": "In this artistic image,",
            "technical": "This image shows"
        }
        prompt = prompts.get(style, prompts["descriptive"])
        
        # Generate caption conditioned on the style prompt
        inputs = self.processor(image, prompt, return_tensors="pt").to(self.device)
        
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_length=50,
                num_beams=5,
                temperature=0.7 if style == "creative" else 0.3,
                do_sample=True,
                early_stopping=True
            )
        
        caption = self.processor.decode(outputs[0], skip_special_tokens=True)
        
        # Apply style post-processing
        if style == "creative":
            caption = self._enhance_creativity(caption)
        elif style == "technical":
            caption = self._add_technical_details(caption, image)
        
        return {
            'caption': caption,
            'style': style,
            'confidence': self._calculate_confidence(outputs),
            'timestamp': datetime.now().isoformat()
        }
    
    def _calculate_confidence(self, outputs):
        """Calculate confidence score for generated caption"""
        # Simplified confidence calculation
        return 0.85  # In practice, use model logits
    
    def _enhance_creativity(self, caption):
        """Add creative flair to caption"""
        creative_prefixes = [
            "In this captivating scene, ",
            "This image beautifully captures ",
            "A striking composition showing "
        ]
        import random
        return random.choice(creative_prefixes) + caption.lower()
    
    def _add_technical_details(self, caption, image):
        """Add technical details to caption"""
        width, height = image.size
        aspect_ratio = width / height
        
        if aspect_ratio > 1.5:
            tech_detail = "wide-angle composition"
        elif aspect_ratio < 0.7:
            tech_detail = "portrait orientation"
        else:
            tech_detail = "balanced aspect ratio"
        
        return f"{caption}. Technical analysis: {tech_detail}"

📝 Test Your Understanding


What is the key innovation of BLIP-2 over previous image captioning models?