🗃️ Vector Databases
Vector databases are specialized databases optimized for storing and querying high-dimensional vector embeddings. They are essential infrastructure for AI applications such as retrieval-augmented generation (RAG), recommendation systems, and semantic search.
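Before comparing specific products, it helps to see the core operation every vector database performs. The sketch below (a minimal example, assuming the sentence-transformers package that the Pinecone example later in this section also uses) embeds a few documents and ranks them against a query by cosine similarity; a vector database does the same nearest-neighbor lookup, but over millions of vectors with approximate indexes.

```python
# Minimal sketch of the core vector-search operation, assuming the
# sentence-transformers package (also used in the Pinecone example below).
from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer('all-MiniLM-L6-v2')  # 384-dim embeddings

documents = [
    "Machine learning is a subset of artificial intelligence.",
    "Deep learning uses neural networks with multiple layers.",
]
doc_vecs = model.encode(documents, normalize_embeddings=True)
query_vec = model.encode(["What is AI?"], normalize_embeddings=True)[0]

# With normalized vectors, cosine similarity reduces to a dot product
scores = doc_vecs @ query_vec
best = int(np.argmax(scores))
print(documents[best], scores[best])
```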
Database Comparison
| Database | Type | Best For | Key Strength |
|---|---|---|---|
| Pinecone | Cloud-managed | Production RAG applications with high availability requirements | Managed service |
| ChromaDB | Open-source | Rapid prototyping and local development environments | Easy setup |
| Weaviate | Open-source/Cloud | Complex data relationships and real-time applications | GraphQL interface |
| pgvector (PostgreSQL) | PostgreSQL extension | Applications already using PostgreSQL with vector search needs | SQL integration |
Pinecone
Fully managed vector database optimized for machine learning applications
Type
Cloud-managed
Key Strengths
- ✓ Managed service
- ✓ High performance
- ✓ Easy scaling
- ✓ Real-time updates
Best Use Case
Production RAG applications with high availability requirements
Implementation Example
```python
# Pinecone vector database integration.
# Note: this example targets the classic `pinecone-client` (v2) API;
# newer SDK releases (v3+) differ (see the note after this example).
import pinecone
from sentence_transformers import SentenceTransformer
from typing import List, Dict


class PineconeVectorDB:
    def __init__(self, api_key: str, environment: str, index_name: str):
        # Initialize Pinecone
        pinecone.init(api_key=api_key, environment=environment)
        self.index_name = index_name
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.dimension = 384  # Output dimension of all-MiniLM-L6-v2

        # Create the index if it does not already exist
        if index_name not in pinecone.list_indexes():
            pinecone.create_index(
                name=index_name,
                dimension=self.dimension,
                metric='cosine',
                pods=1,
                replicas=1,
                pod_type='p1.x1'
            )
        self.index = pinecone.Index(index_name)

    def add_documents(self, documents: List[str], ids: List[str] = None,
                      metadata: List[Dict] = None) -> None:
        """Add documents to the vector database."""
        if ids is None:
            ids = [f"doc_{i}" for i in range(len(documents))]
        if metadata is None:
            metadata = [{} for _ in documents]

        # Generate embeddings
        embeddings = self.embedding_model.encode(documents).tolist()

        # Prepare vectors for upsert; store the raw text in metadata
        vectors = []
        for i, (doc_id, embedding, meta) in enumerate(zip(ids, embeddings, metadata)):
            vectors.append({
                'id': doc_id,
                'values': embedding,
                'metadata': {**meta, 'text': documents[i]}
            })

        # Upsert vectors in batches
        batch_size = 100
        for i in range(0, len(vectors), batch_size):
            batch = vectors[i:i + batch_size]
            self.index.upsert(vectors=batch)

        print(f"Added {len(documents)} documents to Pinecone index")

    def search(self, query: str, top_k: int = 5,
               filter_dict: Dict = None) -> List[Dict]:
        """Search for similar documents."""
        # Generate the query embedding
        query_embedding = self.embedding_model.encode([query]).tolist()[0]

        # Search in Pinecone
        search_kwargs = {
            'vector': query_embedding,
            'top_k': top_k,
            'include_metadata': True
        }
        if filter_dict:
            search_kwargs['filter'] = filter_dict

        results = self.index.query(**search_kwargs)

        # Format results
        formatted_results = []
        for match in results['matches']:
            formatted_results.append({
                'id': match['id'],
                'score': match['score'],
                'text': match['metadata'].get('text', ''),
                'metadata': match['metadata']
            })
        return formatted_results

    def hybrid_search(self, query: str, filters: Dict = None,
                      top_k: int = 5) -> List[Dict]:
        """Perform a hybrid search with metadata filtering."""
        # Over-fetch with vector search, then re-rank
        results = self.search(query, top_k=top_k * 2, filter_dict=filters)

        # Re-rank by keyword overlap. This is a simplified example; real
        # hybrid search typically combines dense and sparse (e.g. BM25) scores.
        query_words = query.lower().split()
        for result in results:
            text_words = result['text'].lower().split()
            keyword_matches = sum(1 for word in query_words if word in text_words)
            # Boost the score for each matched keyword
            result['score'] = result['score'] * (1 + keyword_matches * 0.1)

        # Sort by adjusted score and return the top_k
        results.sort(key=lambda x: x['score'], reverse=True)
        return results[:top_k]

    def delete_documents(self, ids: List[str]) -> None:
        """Delete documents from the index."""
        self.index.delete(ids=ids)
        print(f"Deleted {len(ids)} documents from index")

    def get_index_stats(self) -> Dict:
        """Get statistics about the index."""
        stats = self.index.describe_index_stats()
        return {
            'total_vectors': stats['total_vector_count'],
            'dimension': stats['dimension'],
            'index_fullness': stats['index_fullness']
        }


# Example usage
pinecone_db = PineconeVectorDB(
    api_key="your-api-key",
    environment="your-environment",
    index_name="knowledge-base"
)

# Add documents with metadata
documents = [
    "Machine learning is a subset of artificial intelligence.",
    "Deep learning uses neural networks with multiple layers.",
    "Natural language processing helps computers understand text."
]
metadata = [
    {"category": "ml", "source": "textbook", "date": "2024"},
    {"category": "dl", "source": "paper", "date": "2024"},
    {"category": "nlp", "source": "article", "date": "2024"}
]
pinecone_db.add_documents(documents, metadata=metadata)

# Plain semantic search
results = pinecone_db.search(
    query="What is artificial intelligence?",
    top_k=3
)

# Search with a metadata filter
filtered_results = pinecone_db.search(
    query="neural networks",
    filter_dict={"category": "dl"},
    top_k=2
)
```
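The example above uses the classic `pinecone-client` package, which has since been deprecated in favor of the `pinecone` SDK (v3+). As a hedged sketch of the newer API (the index name, cloud, and region here are illustrative assumptions):

```python
# Minimal sketch of the same workflow on the newer Pinecone SDK (v3+);
# the index name, cloud, and region are illustrative assumptions.
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key="your-api-key")  # no init()/environment step anymore

if "knowledge-base" not in pc.list_indexes().names():
    pc.create_index(
        name="knowledge-base",
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

index = pc.Index("knowledge-base")
index.upsert(vectors=[{"id": "doc_0", "values": [0.1] * 384,
                       "metadata": {"text": "example"}}])
matches = index.query(vector=[0.1] * 384, top_k=3, include_metadata=True)
```

The main differences: there is no `init`/environment step, and capacity is declared through a `ServerlessSpec` (or pod spec) rather than `pods`/`pod_type` arguments.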
🎯 How to Choose a Vector Database
Consider These Factors
- Scale requirements (millions vs. billions of vectors)
- Latency requirements (real-time vs. batch)
- Existing infrastructure (cloud vs. on-premise)
- Development complexity tolerance
- Budget constraints
- Team expertise (SQL vs. new APIs)
Quick Recommendations
For Beginners
Start with ChromaDB for local development, then move to Pinecone for production
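As a minimal local sketch (assuming the `chromadb` package; the collection name and documents are illustrative):

```python
# Minimal local ChromaDB sketch; names here are illustrative assumptions.
import chromadb

client = chromadb.Client()  # in-memory; use PersistentClient(path=...) to persist
collection = client.get_or_create_collection("knowledge-base")

# ChromaDB embeds documents with a built-in model by default
collection.add(
    ids=["doc_0", "doc_1"],
    documents=[
        "Machine learning is a subset of artificial intelligence.",
        "Deep learning uses neural networks with multiple layers.",
    ],
)

results = collection.query(query_texts=["What is AI?"], n_results=1)
print(results["documents"])
```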
For SQL Teams
Use pgvector if you're already on PostgreSQL
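As a hedged sketch (assuming PostgreSQL with the pgvector extension available, plus the `psycopg` and `pgvector` Python packages; the connection string and table name are illustrative assumptions):

```python
# Minimal pgvector sketch; the connection string and table name are
# illustrative assumptions.
import numpy as np
import psycopg
from pgvector.psycopg import register_vector

conn = psycopg.connect("dbname=mydb", autocommit=True)
conn.execute("CREATE EXTENSION IF NOT EXISTS vector")
register_vector(conn)  # lets psycopg send/receive numpy arrays as vectors

conn.execute("""CREATE TABLE IF NOT EXISTS items (
    id bigserial PRIMARY KEY, text text, embedding vector(384))""")

# HNSW index for fast approximate cosine search
conn.execute("CREATE INDEX IF NOT EXISTS items_embedding_idx "
             "ON items USING hnsw (embedding vector_cosine_ops)")

# Placeholder embedding; in practice this comes from your embedding model
query_embedding = np.zeros(384, dtype=np.float32)

# `<=>` is pgvector's cosine-distance operator
rows = conn.execute(
    "SELECT text FROM items ORDER BY embedding <=> %s LIMIT 5",
    (query_embedding,),
).fetchall()
```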
For Complex Apps
Consider Weaviate for GraphQL and advanced relationships
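As a rough sketch of Weaviate's query style (assuming a local instance with a text-vectorizer module enabled and the v3 `weaviate-client` package; the `Article` class is an illustrative assumption):

```python
# Rough Weaviate sketch using the v3 `weaviate-client` API; the "Article"
# class and local endpoint are illustrative assumptions, and near_text
# requires a text vectorizer module on the server.
import weaviate

client = weaviate.Client("http://localhost:8080")

result = (
    client.query
    .get("Article", ["title", "content"])
    .with_near_text({"concepts": ["neural networks"]})
    .with_limit(3)
    .do()
)
print(result["data"]["Get"]["Article"])
```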
⚡ Performance Considerations
Indexing Strategy
- HNSW for a balanced speed/recall trade-off
- IVF for memory efficiency
- LSH for very high-dimensional data
- Consider index build time (HNSW and IVF are sketched below)
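As a minimal illustration of the HNSW/IVF trade-off (assuming the `faiss` library; dataset size and parameters are illustrative):

```python
# Minimal faiss sketch of two index types; sizes are illustrative.
import numpy as np
import faiss

d = 384
xb = np.random.rand(10_000, d).astype('float32')  # database vectors

# HNSW: graph-based, no training step, good speed/recall balance
hnsw = faiss.IndexHNSWFlat(d, 32)  # 32 neighbors per node
hnsw.add(xb)

# IVF: clusters vectors into inverted lists; requires a training pass
quantizer = faiss.IndexFlatL2(d)
ivf = faiss.IndexIVFFlat(quantizer, d, 100)  # 100 clusters
ivf.train(xb)
ivf.add(xb)
ivf.nprobe = 8  # clusters probed per query: the speed/recall knob

query = np.random.rand(1, d).astype('float32')
D, I = hnsw.search(query, 5)
D, I = ivf.search(query, 5)
```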
Query Optimization
- Batch queries when possible (see the sketch below)
- Use the similarity metric your embedding model was trained with
- Filter on metadata before the vector search to shrink the candidate set
- Cache frequent queries
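As a small sketch of batching and caching (building on the `PineconeVectorDB` instance from the example above; the cache size and batching strategy are illustrative):

```python
# Small sketch of query batching and caching on top of the PineconeVectorDB
# example above; cache size and batching strategy are illustrative.
from functools import lru_cache

@lru_cache(maxsize=1024)
def cached_search(query: str, top_k: int = 5):
    # Repeated identical query strings hit the cache, not the database
    return tuple(pinecone_db.search(query, top_k=top_k))

def search_many(queries, top_k=5):
    # Embed all queries in one model call (the expensive local step),
    # then issue the index lookups
    embeddings = pinecone_db.embedding_model.encode(list(queries))
    return [
        pinecone_db.index.query(vector=e.tolist(), top_k=top_k,
                                include_metadata=True)
        for e in embeddings
    ]
```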
Scaling Patterns
- Shard by metadata
- Read replicas for queries
- Separate hot/cold data
- Monitor query patterns