Implementing Semantic Search: A Practical Guide

June 26, 2023

Traditional keyword search fails when users don’t know the exact terms. Semantic search understands meaning—“how to fix a slow database” matches “optimizing PostgreSQL query performance.” With embeddings readily available, semantic search is now practical to implement.

Here’s a practical guide to building semantic search.

How Semantic Search Works

Traditional vs. Semantic

keyword_search:
  approach: Match exact words
  query: "kubernetes pod networking"
  matches: Documents containing "kubernetes", "pod", "networking"
  misses: "k8s container network configuration"

semantic_search:
  approach: Match meaning
  query: "kubernetes pod networking"
  matches: Documents about container networking concepts
  finds: "k8s container network configuration" (similar meaning)

The Pipeline

Indexing: Documents → Chunk → Embed → Store in Vector DB

Query: Query → Embed → Search Vector DB → Rank → Return

Building the System

Document Ingestion

from sentence_transformers import SentenceTransformer
from dataclasses import dataclass

@dataclass
class Document:
    id: str
    content: str
    metadata: dict

class SemanticSearchIndex:
    def __init__(self, model_name='all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)
        self.chunks = []
        self.embeddings = []
        self.metadata = []

    def add_documents(self, documents: list[Document]):
        for doc in documents:
            chunks = self._chunk_document(doc)
            for chunk in chunks:
                embedding = self.model.encode(chunk['text'])
                self.chunks.append(chunk['text'])
                self.embeddings.append(embedding)
                self.metadata.append({
                    'document_id': doc.id,
                    'chunk_index': chunk['index'],
                    **doc.metadata
                })

    def _chunk_document(self, doc: Document, chunk_size=500, overlap=50):
        text = doc.content
        chunks = []
        start = 0
        index = 0

        while start < len(text):
            end = min(start + chunk_size, len(text))

            # Try to break at sentence boundary
            if end < len(text):
                for sep in ['. ', '.\n', '\n\n']:
                    last_sep = text.rfind(sep, start + chunk_size//2, end)
                    if last_sep != -1:
                        end = last_sep + len(sep)
                        break

            chunks.append({
                'text': text[start:end].strip(),
                'index': index,
                'start_char': start,
                'end_char': end
            })

            # Stop once we've reached the end of the text; otherwise
            # start = end - overlap would re-process the final chunk forever
            if end >= len(text):
                break

            start = end - overlap
            index += 1

        return chunks
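
One note on the loop above: model.encode is called once per chunk, which gets slow for large corpora. SentenceTransformer.encode also accepts a list of texts, so a batched variant (a sketch with the same interface as add_documents above) can cut indexing time considerably:

class SemanticSearchIndex:
    # ... previous code ...

    def add_documents_batched(self, documents: list[Document], batch_size=64):
        texts, metas = [], []
        for doc in documents:
            for chunk in self._chunk_document(doc):
                texts.append(chunk['text'])
                metas.append({
                    'document_id': doc.id,
                    'chunk_index': chunk['index'],
                    **doc.metadata
                })
        # One batched encode call instead of one call per chunk
        embeddings = self.model.encode(texts, batch_size=batch_size)
        self.chunks.extend(texts)
        self.embeddings.extend(embeddings)
        self.metadata.extend(metas)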

Search Implementation

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class SemanticSearchIndex:
    # ... previous code ...

    def search(self, query: str, top_k: int = 10) -> list[dict]:
        # Embed query
        query_embedding = self.model.encode(query)

        # Calculate similarities
        embeddings_array = np.array(self.embeddings)
        similarities = cosine_similarity(
            [query_embedding],
            embeddings_array
        )[0]

        # Get top k indices
        top_indices = np.argsort(similarities)[::-1][:top_k]

        # Build results
        results = []
        for idx in top_indices:
            results.append({
                'text': self.chunks[idx],
                'score': float(similarities[idx]),
                'chunk_id': int(idx),  # global position in the index (used by hybrid search below)
                'metadata': self.metadata[idx]
            })

        return results
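
Putting ingestion and search together (a small usage sketch; the documents and IDs here are made up):

docs = [
    Document(id='pg-tuning',
             content='Optimizing PostgreSQL query performance starts with EXPLAIN...',
             metadata={'category': 'databases'}),
    Document(id='k8s-net',
             content='k8s container network configuration for pods and services...',
             metadata={'category': 'devops'}),
]

index = SemanticSearchIndex()
index.add_documents(docs)

for result in index.search("how to fix a slow database", top_k=3):
    print(f"{result['score']:.3f}  {result['metadata']['document_id']}")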

Hybrid Search

Semantic search alone can miss exact keyword matches such as error codes and product names, while BM25 misses paraphrases. A hybrid index runs both and blends the scores.

from rank_bm25 import BM25Okapi
import re

class HybridSearchIndex:
    def __init__(self, semantic_weight=0.7):
        self.semantic = SemanticSearchIndex()
        self.bm25 = None
        self.tokenized_chunks = []
        self.semantic_weight = semantic_weight

    def add_documents(self, documents):
        self.semantic.add_documents(documents)

        # Build BM25 index
        self.tokenized_chunks = [
            self._tokenize(chunk)
            for chunk in self.semantic.chunks
        ]
        self.bm25 = BM25Okapi(self.tokenized_chunks)

    def _tokenize(self, text):
        # Simple tokenization
        return re.findall(r'\w+', text.lower())

    def search(self, query: str, top_k: int = 10) -> list[dict]:
        # Semantic search
        semantic_results = self.semantic.search(query, top_k=top_k * 2)

        # BM25 search
        tokenized_query = self._tokenize(query)
        bm25_scores = self.bm25.get_scores(tokenized_query)

        # Normalize scores
        # Keys are global chunk positions, aligned with bm25_scores below
        semantic_scores = {r['chunk_id']: r['score']
                           for r in semantic_results}
        bm25_max = max(bm25_scores) if max(bm25_scores) > 0 else 1
        bm25_normalized = {i: s/bm25_max for i, s in enumerate(bm25_scores)}

        # Combine scores
        combined_scores = {}
        all_indices = set(semantic_scores.keys()) | set(range(len(bm25_scores)))

        for idx in all_indices:
            sem_score = semantic_scores.get(idx, 0)
            bm25_score = bm25_normalized.get(idx, 0)
            combined_scores[idx] = (
                self.semantic_weight * sem_score +
                (1 - self.semantic_weight) * bm25_score
            )

        # Sort and return top k
        sorted_indices = sorted(
            combined_scores.keys(),
            key=lambda x: combined_scores[x],
            reverse=True
        )[:top_k]

        return [{
            'text': self.semantic.chunks[idx],
            'score': combined_scores[idx],
            'metadata': self.semantic.metadata[idx]
        } for idx in sorted_indices]
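
The default semantic_weight of 0.7 is a starting point, not a rule; corpora heavy on exact identifiers (error codes, API names) often deserve more BM25 weight. Usage mirrors the semantic-only index (reusing the docs from the earlier sketch):

hybrid = HybridSearchIndex(semantic_weight=0.7)
hybrid.add_documents(docs)

# Exact tokens like "k8s" benefit from the BM25 half of the score
for r in hybrid.search("k8s pod networking", top_k=5):
    print(f"{r['score']:.3f}  {r['text'][:60]}")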

Production Considerations

Using Vector Databases

import os

import pinecone
from sentence_transformers import SentenceTransformer

class ProductionSemanticSearch:
    def __init__(self, index_name: str):
        pinecone.init(
            api_key=os.environ['PINECONE_API_KEY'],
            environment='us-east1-gcp'
        )
        self.index = pinecone.Index(index_name)
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def add_documents(self, documents: list[Document], batch_size=100):
        vectors = []

        for doc in documents:
            # Reuses the chunking helper defined on SemanticSearchIndex earlier
            chunks = self._chunk_document(doc)
            for chunk in chunks:
                embedding = self.model.encode(chunk['text']).tolist()
                vectors.append({
                    'id': f"{doc.id}_{chunk['index']}",
                    'values': embedding,
                    'metadata': {
                        'text': chunk['text'][:1000],  # Pinecone metadata limit
                        'document_id': doc.id,
                        **doc.metadata
                    }
                })

                if len(vectors) >= batch_size:
                    self.index.upsert(vectors=vectors)
                    vectors = []

        if vectors:
            self.index.upsert(vectors=vectors)

    def search(self, query: str, top_k: int = 10, filter: dict = None):
        query_embedding = self.model.encode(query).tolist()

        results = self.index.query(
            vector=query_embedding,
            top_k=top_k,
            include_metadata=True,
            filter=filter
        )

        return [{
            'text': match.metadata.get('text', ''),
            'score': match.score,
            'metadata': match.metadata
        } for match in results.matches]
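
One gotcha: the Pinecone index must be created with a dimension that matches the embedding model, and all-MiniLM-L6-v2 produces 384-dimensional vectors. With the v2 client used above, index creation looks roughly like this (a one-time setup sketch):

import os
import pinecone

pinecone.init(
    api_key=os.environ['PINECONE_API_KEY'],
    environment='us-east1-gcp'
)

if 'semantic-search' not in pinecone.list_indexes():
    pinecone.create_index(
        'semantic-search',
        dimension=384,   # must match the embedding model's output size
        metric='cosine'
    )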

Filtering and Faceting

# Search with filters
results = search_index.search(
    query="kubernetes deployment best practices",
    top_k=10,
    filter={
        "category": {"$in": ["devops", "kubernetes"]},
        "year": {"$gte": 2023},  # Pinecone range operators apply to numeric fields
        "author": {"$eq": "engineering-team"}
    }
)

# Faceted search
def search_with_facets(query: str, facet_fields: list[str]):
    results = search_index.search(query, top_k=100)

    facets = {}
    for field in facet_fields:
        facets[field] = {}
        for result in results:
            value = result['metadata'].get(field)
            if value:
                facets[field][value] = facets[field].get(value, 0) + 1

    return {
        'results': results[:10],
        'facets': facets
    }
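
Note that these facet counts are computed client-side over only the top 100 hits, so they are approximate; exact corpus-wide counts would need aggregation in the database. A call looks like:

response = search_with_facets(
    "kubernetes deployment best practices",
    facet_fields=['category', 'author']
)
print(response['facets'])  # counts per category and author across the top 100 hits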

Query Understanding

import json

class QueryProcessor:
    def __init__(self, llm):
        self.llm = llm

    def expand_query(self, query: str) -> list[str]:
        """Generate query variations for better recall."""
        prompt = f"""Generate 3 alternative search queries for: "{query}"
Return only the queries, one per line."""

        response = self.llm.generate(prompt)
        queries = [q.strip() for q in response.split('\n') if q.strip()]
        return [query] + queries

    def extract_filters(self, query: str) -> tuple[str, dict]:
        """Extract explicit filters from query."""
        prompt = f"""Extract search filters from this query.
Return JSON with "query" (the search text) and "filters" (extracted constraints).

Query: "{query}"

Example:
Query: "kubernetes tutorials from 2023"
{{"query": "kubernetes tutorials", "filters": {{"year": 2023}}}}

JSON:"""

        response = self.llm.generate(prompt, temperature=0)
        try:
            result = json.loads(response)
            return result.get('query', query), result.get('filters', {})
        except json.JSONDecodeError:
            # Fall back to a plain search if the LLM output isn't valid JSON
            return query, {}

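expand_query only helps if the variants are actually searched. A simple fusion sketch (assuming the SemanticSearchIndex from earlier, and keeping each chunk's best score across variants):

def multi_query_search(index, processor: QueryProcessor, query: str, top_k: int = 10):
    best = {}  # chunk_id -> best-scoring result across all query variants
    for variant in processor.expand_query(query):
        for r in index.search(variant, top_k=top_k):
            cid = r['chunk_id']
            if cid not in best or r['score'] > best[cid]['score']:
                best[cid] = r
    # Re-rank the merged pool by best score
    return sorted(best.values(), key=lambda r: r['score'], reverse=True)[:top_k]
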
Evaluation

Measuring Quality

from dataclasses import dataclass

@dataclass
class TestQuery:
    query: str
    relevant_docs: set[str]  # IDs of documents judged relevant for this query

def evaluate_search(search_index, test_queries: list[TestQuery]):
    """Evaluate search quality with labeled test data."""
    metrics = {
        'mrr': [],  # Mean Reciprocal Rank
        'precision_at_5': [],
        'recall_at_10': []
    }

    for test in test_queries:
        results = search_index.search(test.query, top_k=10)
        result_ids = [r['metadata']['document_id'] for r in results]

        # MRR
        for rank, doc_id in enumerate(result_ids, 1):
            if doc_id in test.relevant_docs:
                metrics['mrr'].append(1.0 / rank)
                break
        else:
            metrics['mrr'].append(0.0)

        # Precision@5
        relevant_in_top5 = len(set(result_ids[:5]) & set(test.relevant_docs))
        metrics['precision_at_5'].append(relevant_in_top5 / 5)

        # Recall@10
        relevant_found = len(set(result_ids) & set(test.relevant_docs))
        metrics['recall_at_10'].append(
            relevant_found / len(test.relevant_docs)
        )

    return {k: sum(v)/len(v) for k, v in metrics.items()}
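
A small, hypothetical test set to run it against (document IDs are placeholders matching the earlier usage sketch):

test_queries = [
    TestQuery(query="how to fix a slow database",
              relevant_docs={'pg-tuning'}),
    TestQuery(query="kubernetes pod networking",
              relevant_docs={'k8s-net'}),
]

print(evaluate_search(index, test_queries))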

Key Takeaways

Semantic search is foundational for modern search and AI applications. The pipeline itself is simple: chunk documents at sensible boundaries, embed the chunks, and rank by vector similarity. Hybrid scoring with BM25 recovers the exact-match queries that embeddings miss, a vector database handles scale and metadata filtering in production, and metrics like MRR, precision@5, and recall@10 tell you whether a change actually helped.