🚀 From Prototype to Production
Moving LangChain applications from development to production requires addressing scalability, reliability, monitoring, and cost optimization. This lesson covers enterprise-grade patterns.
❌ Development Mindset
- Single-threaded execution
- No error recovery
- Blocking operations
- Manual testing only
- No monitoring or logging
✅ Production Mindset
- Async, concurrent processing (sketched below)
- Robust error handling & retries
- Non-blocking, streaming responses
- Automated testing & validation
- Comprehensive observability
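Concretely, the shift from blocking, single-shot calls to concurrent, retry-aware execution looks something like the sketch below. It assumes any LangChain chain or runnable exposing ainvoke, uses the third-party tenacity library for exponential backoff, and treats the retry settings as illustrative defaults rather than recommendations.
import asyncio
from tenacity import retry, stop_after_attempt, wait_exponential

# Assumes `chain` is any LangChain chain/runnable that exposes an async `ainvoke` method.

@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=10))
async def process_one(chain, document: str):
    # Retry with exponential backoff on transient failures (rate limits, timeouts)
    return await chain.ainvoke({"document": document})

async def process_batch(chain, documents: list[str]):
    # Fan the documents out concurrently instead of looping synchronously
    return await asyncio.gather(
        *(process_one(chain, doc) for doc in documents),
        return_exceptions=True,  # one failure shouldn't sink the whole batch
    )

# results = asyncio.run(process_batch(chain, ["doc 1", "doc 2"]))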
🏗️ Production Patterns
Chain Composition
Building complex workflows from simple, reusable chain components
✅ Benefits
- Modularity
- Reusability
- Maintainability
- Testing isolation
⚠️ Challenges
- State management
- Error propagation
- Debugging complexity
Best for: Multi-step reasoning, document processing, complex Q&A
Production Implementation
from langchain.chains import LLMChain, SequentialChain
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory

class ProductionChainManager:
    def __init__(self, llm):
        self.llm = llm
        self.chains = {}
        self.memory = ConversationBufferMemory()

    def create_extraction_chain(self):
        """Extract key information from documents"""
        prompt = PromptTemplate(
            input_variables=["document"],
            template="""
            Extract key information from this document:
            {document}

            Return JSON format:
            {{"entities": [], "key_points": [], "sentiment": ""}}
            """
        )
        return LLMChain(llm=self.llm, prompt=prompt, output_key="extracted_info")

    def create_analysis_chain(self):
        """Analyze extracted information"""
        prompt = PromptTemplate(
            input_variables=["extracted_info"],
            template="""
            Analyze this extracted information:
            {extracted_info}

            Provide insights and recommendations:
            """
        )
        return LLMChain(llm=self.llm, prompt=prompt, output_key="analysis")

    def create_summary_chain(self):
        """Generate final summary"""
        prompt = PromptTemplate(
            input_variables=["analysis", "extracted_info"],
            template="""
            Create a comprehensive summary:

            Extracted Information: {extracted_info}
            Analysis: {analysis}

            Summary:
            """
        )
        return LLMChain(llm=self.llm, prompt=prompt, output_key="summary")

    def build_pipeline(self):
        """Compose chains into a pipeline"""
        extraction_chain = self.create_extraction_chain()
        analysis_chain = self.create_analysis_chain()
        summary_chain = self.create_summary_chain()

        return SequentialChain(
            chains=[extraction_chain, analysis_chain, summary_chain],
            input_variables=["document"],
            output_variables=["summary"],
            verbose=True
        )
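A quick usage sketch for the pipeline above; the OpenAI LLM and the document string are placeholders, and any LangChain-compatible LLM could be substituted.
from langchain.llms import OpenAI

llm = OpenAI(temperature=0)  # placeholder model; swap in your production LLM
manager = ProductionChainManager(llm)
pipeline = manager.build_pipeline()

# SequentialChain returns a dict containing the declared output variables
result = pipeline({"document": "...your document text..."})
print(result["summary"])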
⚡ Performance Optimization
Response Caching
Cache complete LLM responses based on input similarity
from langchain.cache import InMemoryCache, RedisCache  # RedisCache suits distributed deployments
from langchain.globals import set_llm_cache
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

class SemanticCache:
    def __init__(self, similarity_threshold=0.95):
        self.cache = {}
        self.embeddings_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.threshold = similarity_threshold

    def _get_embedding(self, text: str):
        return self.embeddings_model.encode([text])[0]

    def _calculate_similarity(self, emb1, emb2):
        return cosine_similarity([emb1], [emb2])[0][0]

    def get(self, prompt: str):
        # Return a cached response for any sufficiently similar earlier prompt
        prompt_embedding = self._get_embedding(prompt)
        for cached_prompt, (cached_embedding, response) in self.cache.items():
            similarity = self._calculate_similarity(prompt_embedding, cached_embedding)
            if similarity > self.threshold:
                return response
        return None

    def set(self, prompt: str, response: str):
        prompt_embedding = self._get_embedding(prompt)
        self.cache[prompt] = (prompt_embedding, response)

# Set up global exact-match caching; the semantic cache above does not implement
# LangChain's BaseCache interface, so it is applied manually around LLM calls.
set_llm_cache(InMemoryCache())
semantic_cache = SemanticCache()
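Because the semantic cache is not a drop-in LangChain BaseCache, it is consulted explicitly around calls. A minimal sketch, assuming an llm object with a standard invoke method:
def cached_generate(llm, prompt: str) -> str:
    # Serve semantically similar prompts from cache before paying for an API call
    cached = semantic_cache.get(prompt)
    if cached is not None:
        return cached
    response = llm.invoke(prompt)
    semantic_cache.set(prompt, response)
    return response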
Chain Result Caching
Cache intermediate results in complex chains
from functools import wraps
import pickle
import hashlib

class ChainCache:
    def __init__(self, cache_backend='memory'):
        self.cache = {}

    def cache_step(self, step_name: str, cache_key_fn=None):
        def decorator(func):
            @wraps(func)
            async def wrapper(*args, **kwargs):
                # Generate cache key
                if cache_key_fn:
                    cache_key = cache_key_fn(*args, **kwargs)
                else:
                    cache_key = hashlib.md5(
                        pickle.dumps((args, kwargs))
                    ).hexdigest()

                full_key = f"{step_name}:{cache_key}"

                # Check cache
                if full_key in self.cache:
                    return self.cache[full_key]

                # Execute and cache
                result = await func(*args, **kwargs)
                self.cache[full_key] = result
                return result
            return wrapper
        return decorator

# Usage in chains
cache = ChainCache()

@cache.cache_step('document_processing')
async def process_document(document_text):
    # Expensive document processing
    return processed_result
🌐 Deployment Strategies
Containerized Deployment
Deploy LangChain applications using Docker and Kubernetes
Benefits
- Scalability
- Isolation
- Easy rollbacks
Considerations
- Resource management
- State handling
- Networking
Example
# Dockerfile for LangChain App
FROM python:3.11-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install -r requirements.txt
COPY . .
EXPOSE 8000
CMD ["gunicorn", "--bind", "0.0.0.0:8000", "--workers", "4", "--worker-class", "uvicorn.workers.UvicornWorker", "main:app"]
Serverless Functions
Deploy individual chains as serverless functions
Benefits
- Cost efficiency
- Auto-scaling
- No infrastructure management
Considerations
- Cold starts
- Timeout limits
- State management
Example
# AWS Lambda deployment
import json
from langchain.llms import OpenAI
from langchain.chains import LLMChain

def lambda_handler(event, context):
    try:
        # Initialize chain (consider caching across invocations to reduce cold-start cost)
        llm = OpenAI(temperature=0)
        chain = LLMChain(llm=llm, prompt=your_prompt)

        # Process request
        result = chain.run(event['input'])

        return {
            'statusCode': 200,
            'body': json.dumps({'result': result})
        }
    except Exception as e:
        return {
            'statusCode': 500,
            'body': json.dumps({'error': str(e)})
        }
Microservices Architecture
Split complex applications into specialized microservices
Benefits
- Service isolation
- Independent scaling
- Technology diversity
Considerations
- Service communication
- Data consistency
- Monitoring complexity
Example
# FastAPI microservice
from fastapi import FastAPI, HTTPException
from langchain.chains import LLMChain

app = FastAPI()

class ChainService:
    def __init__(self):
        self.chains = self._initialize_chains()

    def _initialize_chains(self):
        # Application-specific: build and return a mapping of request_type -> chain
        return {}

    async def process_request(self, request_type: str, data: dict):
        if request_type not in self.chains:
            raise HTTPException(status_code=400, detail="Unknown request type")

        chain = self.chains[request_type]
        return await chain.arun(data)

service = ChainService()

@app.post("/process/{request_type}")
async def process_request(request_type: str, data: dict):
    try:
        result = await service.process_request(request_type, data)
        return {"result": result}
    except HTTPException:
        raise  # preserve intended status codes (e.g. the 400 above)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
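Health checks and readiness probes (see the best practices below) are straightforward to bolt onto a service like this. A minimal sketch, assuming readiness only needs to confirm that the chain registry was populated:
@app.get("/health")
async def health():
    # Liveness: the process is up and able to serve traffic
    return {"status": "ok"}

@app.get("/ready")
async def ready():
    # Readiness: required dependencies (here, the configured chains) are available
    if not service.chains:
        raise HTTPException(status_code=503, detail="Chains not initialized")
    return {"status": "ready"}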
📊 Production Monitoring
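One lightweight way to get visibility into chain behavior is LangChain's callback system. The sketch below records per-call latency and token usage with a custom handler; the handler name and the metrics it collects are illustrative assumptions, and in production these values would normally be shipped to a metrics backend (Prometheus, CloudWatch, etc.) rather than kept in memory.
import time
from langchain.callbacks.base import BaseCallbackHandler

class MetricsCallbackHandler(BaseCallbackHandler):
    """Illustrative handler tracking latency and token usage (not safe for concurrent runs)."""

    def __init__(self):
        self.calls = []
        self._started_at = None

    def on_llm_start(self, serialized, prompts, **kwargs):
        self._started_at = time.monotonic()

    def on_llm_end(self, response, **kwargs):
        latency = time.monotonic() - (self._started_at or time.monotonic())
        token_usage = (response.llm_output or {}).get("token_usage", {})
        self.calls.append({"latency_s": latency, "token_usage": token_usage})

    def on_llm_error(self, error, **kwargs):
        self.calls.append({"error": str(error)})

# metrics = MetricsCallbackHandler()
# chain.run(inputs, callbacks=[metrics])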
💡 Production Best Practices
Development
- ✓ Use async/await for all I/O operations
- ✓ Implement comprehensive error handling
- ✓ Add circuit breakers for external dependencies (sketched after these lists)
- ✓ Use structured logging with correlation IDs
- ✓ Implement health checks and readiness probes
Operations
- ✓ Set up comprehensive monitoring and alerting
- ✓ Implement gradual rollouts and canary deployments
- ✓ Use semantic caching to reduce API costs
- ✓ Implement rate limiting and request queuing
- ✓ Plan for disaster recovery and data backup
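The circuit-breaker item above is sketched here as a small standalone wrapper; the threshold, cooldown, and fallback values are illustrative assumptions, and in a real deployment a maintained library would usually be preferable.
import time

class CircuitBreaker:
    """Open after N consecutive failures, then allow a trial call once the cooldown elapses."""

    def __init__(self, failure_threshold=5, cooldown_seconds=30):
        self.failure_threshold = failure_threshold
        self.cooldown_seconds = cooldown_seconds
        self.failures = 0
        self.opened_at = None

    def _is_open(self):
        if self.opened_at is None:
            return False
        if time.monotonic() - self.opened_at >= self.cooldown_seconds:
            self.opened_at = None  # half-open: permit one trial call
            self.failures = 0
            return False
        return True

    async def call(self, func, *args, fallback=None, **kwargs):
        if self._is_open():
            return fallback
        try:
            result = await func(*args, **kwargs)
            self.failures = 0
            return result
        except Exception:
            self.failures += 1
            if self.failures >= self.failure_threshold:
                self.opened_at = time.monotonic()
            if fallback is not None:
                return fallback
            raise

# breaker = CircuitBreaker()
# result = await breaker.call(chain.ainvoke, {"document": doc}, fallback="Service temporarily unavailable.")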
🎯 Key Takeaways
Async First: Design all LangChain applications with async patterns from the start for production scalability
Error Recovery: Implement exponential backoff, circuit breakers, and fallback mechanisms for reliability
Cost Optimization: Use semantic caching, request batching, and monitoring to control LLM API costs
Observability: Comprehensive monitoring, logging, and metrics are essential for production debugging
Deployment Strategy: Choose containerized, serverless, or microservices based on scale and complexity needs