Image Captioning Systems
Building AI systems that automatically generate natural language descriptions from images
Captioning Architectures
Modern captioning systems pair a vision encoder with a language decoder: the encoder maps the image to a sequence of visual embeddings, and the decoder generates the caption token by token while attending to them. The implementation below uses BLIP, which couples a Vision Transformer (ViT) image encoder with a transformer text decoder.
Vision Transformer Encoder-Decoder Implementation (BLIP)
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration

class ImageCaptioningPipeline:
    def __init__(self, model_name="Salesforce/blip-image-captioning-base"):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Load BLIP model and processor
        self.processor = BlipProcessor.from_pretrained(model_name)
        self.model = BlipForConditionalGeneration.from_pretrained(model_name)
        self.model.to(self.device)
        self.model.eval()

    def generate_caption(self, image, prompt=None):
        """Generate a caption for an image, optionally conditioned on a text prompt."""
        if prompt:
            # Conditional generation: the processor tokenizes the prompt
            # alongside the image, and the decoder continues from it
            inputs = self.processor(image, prompt, return_tensors="pt").to(self.device)
        else:
            # Unconditional generation from the image alone
            inputs = self.processor(image, return_tensors="pt").to(self.device)
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_length=50,
                num_beams=5,
                early_stopping=True
            )
        caption = self.processor.decode(outputs[0], skip_special_tokens=True)
        return caption
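A quick usage sketch (the image path here is a placeholder; any local image works):

from PIL import Image

pipeline = ImageCaptioningPipeline()
image = Image.open("example.jpg").convert("RGB")  # hypothetical image path

print(pipeline.generate_caption(image))                       # unconditional
print(pipeline.generate_caption(image, prompt="a photo of"))  # prompt-conditioned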
Evaluation Metrics
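Generated captions are scored against human-written reference captions. Standard choices are n-gram overlap metrics (BLEU, METEOR, ROUGE-L) and captioning-specific metrics (CIDEr, SPICE), reported as averages over a test set. As a minimal sketch, sentence-level BLEU can be computed with NLTK; the candidate and reference strings below are purely illustrative:

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Illustrative example; real evaluation averages scores over a whole test set
references = [
    "a dog runs across a grassy field".split(),
    "a brown dog running on grass".split(),
]
candidate = "a dog running through the grass".split()

# Smoothing prevents zero scores when higher-order n-grams have no overlap
score = sentence_bleu(references, candidate,
                      smoothing_function=SmoothingFunction().method1)
print(f"BLEU: {score:.3f}")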
Production Service
The service below wraps the same BLIP model for serving: it adds Redis-backed caching, style-controlled prompts, and per-request metadata (confidence and timestamp).
import hashlib
import io
import json
import random
from datetime import datetime

import redis
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

class ImageCaptioningService:
    def __init__(self, config):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Load model
        self.processor = BlipProcessor.from_pretrained(
            "Salesforce/blip-image-captioning-base"
        )
        self.model = BlipForConditionalGeneration.from_pretrained(
            "Salesforce/blip-image-captioning-base"
        )
        self.model.to(self.device)
        self.model.eval()
        # Caching
        self.redis_client = redis.Redis(host='localhost', port=6379)

    async def generate_caption(self, image_data, style="descriptive"):
        """Generate a caption for an image in the specified style."""
        # Read raw bytes so they can be hashed into a cache key
        if hasattr(image_data, "read"):
            image_bytes = image_data.read()
        elif isinstance(image_data, (bytes, bytearray)):
            image_bytes = bytes(image_data)
        else:  # assume a filesystem path
            with open(image_data, "rb") as f:
                image_bytes = f.read()

        # Return a cached result if this image/style pair was seen before
        cache_key = f"caption:{hashlib.sha256(image_bytes).hexdigest()}:{style}"
        cached = self.redis_client.get(cache_key)
        if cached:
            return json.loads(cached)

        # Process image
        image = Image.open(io.BytesIO(image_bytes)).convert('RGB')

        # Style-specific prompt engineering
        prompts = {
            "descriptive": "A photo of",
            "creative": "In this artistic image,",
            "technical": "This image shows"
        }
        prompt = prompts.get(style, prompts["descriptive"])

        # Generate caption conditioned on the style prompt
        inputs = self.processor(image, prompt, return_tensors="pt").to(self.device)
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_length=50,
                num_beams=5,
                # Sample more freely for creative captions, conservatively otherwise
                temperature=0.7 if style == "creative" else 0.3,
                do_sample=True,
                early_stopping=True
            )
        caption = self.processor.decode(outputs[0], skip_special_tokens=True)

        # Apply style post-processing
        if style == "creative":
            caption = self._enhance_creativity(caption)
        elif style == "technical":
            caption = self._add_technical_details(caption, image)

        result = {
            'caption': caption,
            'style': style,
            'confidence': self._calculate_confidence(outputs),
            'timestamp': datetime.now().isoformat()
        }
        # Cache the result for one hour
        self.redis_client.set(cache_key, json.dumps(result), ex=3600)
        return result

    def _calculate_confidence(self, outputs):
        """Calculate a confidence score for the generated caption."""
        # Simplified placeholder; in practice, derive this from token
        # log-probabilities (e.g. generate with output_scores=True)
        return 0.85

    def _enhance_creativity(self, caption):
        """Add creative flair to the caption."""
        creative_prefixes = [
            "In this captivating scene, ",
            "This image beautifully captures ",
            "A striking composition showing "
        ]
        return random.choice(creative_prefixes) + caption.lower()

    def _add_technical_details(self, caption, image):
        """Append technical details derived from the image geometry."""
        width, height = image.size
        aspect_ratio = width / height
        if aspect_ratio > 1.5:
            tech_detail = "wide-angle composition"
        elif aspect_ratio < 0.7:
            tech_detail = "portrait orientation"
        else:
            tech_detail = "balanced aspect ratio"
        return f"{caption}. Technical analysis: {tech_detail}"
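A brief usage sketch (the file name is a placeholder, and a Redis instance on localhost is assumed to be running):

import asyncio

async def main():
    service = ImageCaptioningService(config={})  # config is an unused placeholder
    with open("example.jpg", "rb") as f:  # hypothetical image path
        result = await service.generate_caption(f, style="creative")
    print(result)

asyncio.run(main())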