Traditional keyword search fails when users don’t know the exact terms. Semantic search understands meaning—“how to fix a slow database” matches “optimizing PostgreSQL query performance.” With embeddings readily available, semantic search is now practical to implement.
Here’s a practical guide to building semantic search.
How Semantic Search Works
Traditional vs. Semantic
keyword_search:
approach: Match exact words
query: "kubernetes pod networking"
matches: Documents containing "kubernetes", "pod", "networking"
misses: "k8s container network configuration"
semantic_search:
approach: Match meaning
query: "kubernetes pod networking"
matches: Documents about container networking concepts
finds: "k8s container network configuration" (similar meaning)
The Pipeline
Query time: Query → Embed → Search Vector DB → Rank → Return
Ingestion (done first, offline): Documents → Chunk → Embed → Store in Vector DB
Building the System
Document Ingestion
import hashlib
import json
import os
from dataclasses import dataclass

from sentence_transformers import SentenceTransformer
@dataclass
class Document:
    """A source document to be chunked, embedded, and indexed for search."""

    id: str         # stable unique identifier; used to key chunks back to their source
    content: str    # full text; split into overlapping chunks before embedding
    metadata: dict  # arbitrary attributes (category, date, ...) copied onto every chunk
class SemanticSearchIndex:
    """In-memory semantic search index backed by sentence-transformer embeddings."""

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)
        self.chunks = []      # chunk texts; list position is the global chunk id
        self.embeddings = []  # one embedding per chunk, same order as self.chunks
        self.metadata = []    # one metadata dict per chunk, same order

    def add_documents(self, documents: list[Document]):
        """Chunk, embed, and store each document's text."""
        for doc in documents:
            for chunk in self._chunk_document(doc):
                self.chunks.append(chunk['text'])
                self.embeddings.append(self.model.encode(chunk['text']))
                self.metadata.append({
                    'document_id': doc.id,
                    # Index within this document (not global) — every doc restarts at 0.
                    'chunk_index': chunk['index'],
                    **doc.metadata
                })

    def _chunk_document(self, doc: Document, chunk_size=500, overlap=50):
        """Split doc.content into overlapping chunks, preferring sentence breaks.

        Returns a list of dicts with 'text', 'index', 'start_char', 'end_char'.
        """
        text = doc.content
        chunks = []
        start = 0
        index = 0
        while start < len(text):
            end = min(start + chunk_size, len(text))
            # Prefer to break on a sentence boundary in the second half of the window.
            if end < len(text):
                for sep in ['. ', '.\n', '\n\n']:
                    last_sep = text.rfind(sep, start + chunk_size // 2, end)
                    if last_sep != -1:
                        end = last_sep + len(sep)
                        break
            chunks.append({
                'text': text[start:end].strip(),
                'index': index,
                'start_char': start,
                'end_char': end
            })
            # BUG FIX: stop once the final chunk reaches the end of the text.
            # Previously `start = end - overlap` rewound below len(text) on the
            # last iteration and re-appended the tail chunk in an infinite loop.
            if end >= len(text):
                break
            # max(...) also guarantees forward progress even if a sentence-break
            # adjustment ever makes `end - overlap` <= start.
            start = max(end - overlap, start + 1)
            index += 1
        return chunks
Search Implementation
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
class SemanticSearchIndex:
    # ... previous code ...

    def search(self, query: str, top_k: int = 10) -> list[dict]:
        """Return the top_k chunks most similar to the query.

        Each result dict has 'text', 'score' (cosine similarity), and
        'metadata'. Returns [] for an empty index — the original crashed
        trying to compute similarities against a 0-row embedding matrix.
        """
        if not self.embeddings:
            return []
        # Embed query and compute cosine similarity against all chunks with
        # plain numpy (matches sklearn's cosine_similarity for the non-zero
        # vectors an embedding model produces; epsilon guards zero norms).
        query_embedding = np.asarray(self.model.encode(query), dtype=float)
        embeddings_array = np.asarray(self.embeddings, dtype=float)
        norms = np.linalg.norm(embeddings_array, axis=1) * np.linalg.norm(query_embedding)
        similarities = embeddings_array @ query_embedding / (norms + 1e-12)
        # Rank descending and keep the best top_k.
        top_indices = np.argsort(similarities)[::-1][:top_k]
        return [
            {
                'text': self.chunks[idx],
                'score': float(similarities[idx]),
                'metadata': self.metadata[idx],
            }
            for idx in top_indices
        ]
Hybrid Search
from rank_bm25 import BM25Okapi
import re
class HybridSearchIndex:
    """Hybrid retrieval combining semantic (embedding) and lexical (BM25) scores.

    Final score = semantic_weight * cosine + (1 - semantic_weight) * normalized BM25.
    """

    def __init__(self, semantic_weight=0.7):
        self.semantic = SemanticSearchIndex()
        self.bm25 = None               # built in add_documents
        self.tokenized_chunks = []
        self.semantic_weight = semantic_weight

    def add_documents(self, documents):
        """Index documents in both the semantic index and a BM25 index."""
        self.semantic.add_documents(documents)
        # BM25 corpus is in the same global order as self.semantic.chunks.
        self.tokenized_chunks = [
            self._tokenize(chunk) for chunk in self.semantic.chunks
        ]
        self.bm25 = BM25Okapi(self.tokenized_chunks)

    def _tokenize(self, text):
        # Lowercased word tokens; simple but adequate for BM25 scoring.
        return re.findall(r'\w+', text.lower())

    def search(self, query: str, top_k: int = 10) -> list[dict]:
        """Return top_k chunks ranked by the weighted hybrid score.

        BUG FIX: the original keyed semantic scores by the per-document
        'chunk_index', which repeats across documents (every document has a
        chunk 0), so scores from different documents collided and were paired
        with the wrong BM25 rows. Both score arrays are now aligned on the
        chunk's global position in self.semantic.chunks.
        """
        if not self.semantic.chunks:
            return []
        # Semantic cosine scores for every chunk, in global chunk order.
        query_embedding = np.asarray(self.semantic.model.encode(query), dtype=float)
        embeddings_array = np.asarray(self.semantic.embeddings, dtype=float)
        norms = np.linalg.norm(embeddings_array, axis=1) * np.linalg.norm(query_embedding)
        semantic_scores = embeddings_array @ query_embedding / (norms + 1e-12)
        # BM25 scores, same global order; scaled into [0, 1].
        bm25_scores = self.bm25.get_scores(self._tokenize(query))
        bm25_max = max(bm25_scores) if max(bm25_scores) > 0 else 1
        combined_scores = [
            self.semantic_weight * float(sem) +
            (1 - self.semantic_weight) * (b / bm25_max)
            for sem, b in zip(semantic_scores, bm25_scores)
        ]
        top = sorted(range(len(combined_scores)),
                     key=combined_scores.__getitem__, reverse=True)[:top_k]
        return [{
            'text': self.semantic.chunks[idx],
            'score': combined_scores[idx],
            'metadata': self.semantic.metadata[idx]
        } for idx in top]
Production Considerations
Using Vector Databases
import pinecone
from sentence_transformers import SentenceTransformer
class ProductionSemanticSearch:
    """Semantic search backed by a Pinecone index instead of in-memory arrays."""

    def __init__(self, index_name: str):
        # NOTE: requires `import os` at file top and PINECONE_API_KEY in the env.
        pinecone.init(
            api_key=os.environ['PINECONE_API_KEY'],
            environment='us-east1-gcp'
        )
        self.index = pinecone.Index(index_name)
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def add_documents(self, documents: list[Document], batch_size=100):
        """Chunk, embed, and upsert documents in batches of batch_size vectors."""
        vectors = []
        for doc in documents:
            for chunk in self._chunk_document(doc):
                embedding = self.model.encode(chunk['text']).tolist()
                vectors.append({
                    # Globally unique vector id: document id + per-doc chunk index.
                    'id': f"{doc.id}_{chunk['index']}",
                    'values': embedding,
                    'metadata': {
                        'text': chunk['text'][:1000],  # Pinecone metadata size limit
                        'document_id': doc.id,
                        **doc.metadata
                    }
                })
                if len(vectors) >= batch_size:
                    self.index.upsert(vectors=vectors)
                    vectors = []
        if vectors:  # flush the final partial batch
            self.index.upsert(vectors=vectors)

    def _chunk_document(self, doc: Document, chunk_size=500, overlap=50):
        """Split doc.content into fixed-size overlapping chunks.

        BUG FIX: add_documents called this method but it was never defined,
        so ingestion raised AttributeError on the first document.
        """
        text = doc.content
        chunks = []
        start = 0
        index = 0
        while start < len(text):
            end = min(start + chunk_size, len(text))
            chunks.append({'text': text[start:end].strip(), 'index': index})
            if end >= len(text):  # final chunk reached the end — stop
                break
            start = max(end - overlap, start + 1)  # overlap, but always advance
            index += 1
        return chunks

    def search(self, query: str, top_k: int = 10, filter: dict = None):
        """Query Pinecone; optional `filter` uses Pinecone's metadata filter syntax."""
        query_embedding = self.model.encode(query).tolist()
        results = self.index.query(
            vector=query_embedding,
            top_k=top_k,
            include_metadata=True,
            filter=filter
        )
        return [{
            'text': match.metadata.get('text', ''),
            'score': match.score,
            'metadata': match.metadata
        } for match in results.matches]
Filtering and Faceting
# Example: scoped search with Pinecone-style metadata filter operators
# ($in = membership, $gte = greater-or-equal, $eq = equality).
# Assumes `search_index` is a ProductionSemanticSearch-like object.
results = search_index.search(
query="kubernetes deployment best practices",
top_k=10,
filter={
"category": {"$in": ["devops", "kubernetes"]},
"date": {"$gte": "2023-01-01"},
"author": {"$eq": "engineering-team"}
}
)
# Faceted search
def search_with_facets(query: str, facet_fields: list[str]):
    """Search, then tally facet value counts over a wide result set.

    Retrieves the top 100 hits, counts occurrences of each requested
    metadata field's values across all of them, and returns the top 10
    results together with the facet counts.
    """
    results = search_index.search(query, top_k=100)
    facets = {field: {} for field in facet_fields}
    for field in facet_fields:
        counts = facets[field]
        for hit in results:
            value = hit['metadata'].get(field)
            if value:  # skip missing/empty facet values
                counts[value] = counts.get(value, 0) + 1
    return {
        'results': results[:10],
        'facets': facets
    }
Query Understanding
class QueryProcessor:
    """Uses an LLM to rewrite and decompose user queries before retrieval."""

    def __init__(self, llm):
        self.llm = llm  # any object exposing .generate(prompt, temperature=...)

    def expand_query(self, query: str) -> list[str]:
        """Generate query variations for better recall.

        Returns the original query first, followed by the LLM's alternatives
        (blank lines in the response are dropped).
        """
        prompt = f"""Generate 3 alternative search queries for: "{query}"
Return only the queries, one per line."""
        response = self.llm.generate(prompt)
        queries = [q.strip() for q in response.split('\n') if q.strip()]
        return [query] + queries

    def extract_filters(self, query: str) -> tuple[str, dict]:
        """Extract explicit filters from the query via the LLM.

        Returns (search_text, filters). Falls back to (query, {}) when the
        LLM response cannot be parsed as a JSON object.
        """
        prompt = f"""Extract search filters from this query.
Return JSON with "query" (the search text) and "filters" (extracted constraints).
Query: "{query}"
Example:
Query: "kubernetes tutorials from 2023"
{{"query": "kubernetes tutorials", "filters": {{"year": 2023}}}}
JSON:"""
        response = self.llm.generate(prompt, temperature=0)
        try:
            result = json.loads(response)
            return result.get('query', query), result.get('filters', {})
        # BUG FIX: was a bare `except:` that swallowed everything — including
        # KeyboardInterrupt and the NameError from the missing `json` import.
        # AttributeError covers valid JSON that isn't an object (e.g. a list).
        except (json.JSONDecodeError, AttributeError):
            return query, {}
Evaluation
Measuring Quality
def evaluate_search(search_index, test_queries):
    """Evaluate search quality against labeled test data.

    Each item in test_queries must have a `.query` string and a
    `.relevant_docs` collection of relevant document ids. Returns the mean
    of each metric: 'mrr', 'precision_at_5', 'recall_at_10'.
    """
    metrics = {
        'mrr': [],  # Mean Reciprocal Rank
        'precision_at_5': [],
        'recall_at_10': []
    }
    for test in test_queries:
        results = search_index.search(test.query, top_k=10)
        result_ids = [r['metadata']['document_id'] for r in results]
        relevant = set(test.relevant_docs)
        # MRR: reciprocal rank of the first relevant hit, 0 if none in top 10.
        for rank, doc_id in enumerate(result_ids, 1):
            if doc_id in relevant:
                metrics['mrr'].append(1.0 / rank)
                break
        else:
            metrics['mrr'].append(0.0)
        # Precision@5: fraction of the top-5 slots holding relevant docs.
        relevant_in_top5 = len(set(result_ids[:5]) & relevant)
        metrics['precision_at_5'].append(relevant_in_top5 / 5)
        # Recall@10 — BUG FIX: skip queries with no labeled relevant docs
        # instead of raising ZeroDivisionError on len(relevant) == 0.
        if relevant:
            relevant_found = len(set(result_ids) & relevant)
            metrics['recall_at_10'].append(relevant_found / len(relevant))
    # Average each metric; report 0.0 when a metric collected no samples
    # (e.g. empty test_queries) instead of dividing by zero.
    return {k: (sum(v) / len(v) if v else 0.0) for k, v in metrics.items()}
Key Takeaways
- Semantic search matches meaning, not just keywords
- Use embedding models (sentence-transformers) for encoding
- Chunk documents appropriately for your use case
- Hybrid search combines semantic and keyword for best results
- Use vector databases for production scale
- Metadata filtering enables scoped search
- Query expansion improves recall
- Evaluate with real queries and labeled relevance
- Test different embedding models for your domain
- Monitor and iterate based on user behavior
Semantic search is foundational for modern search and AI applications.