Implementing Semantic Search: A Practical Guide

June 26, 2023

Traditional keyword search fails when users don’t know the exact terms. Semantic search understands meaning—“how to fix a slow database” matches “optimizing PostgreSQL query performance.” With embeddings readily available, semantic search is now practical to implement.

Here’s a practical guide to building semantic search.

How Semantic Search Works

Traditional vs. Semantic

keyword_search:
  approach: Match exact words
  query: "kubernetes pod networking"
  matches: Documents containing "kubernetes", "pod", "networking"
  misses: "k8s container network configuration"

semantic_search:
  approach: Match meaning
  query: "kubernetes pod networking"
  matches: Documents about container networking concepts
  finds: "k8s container network configuration" (similar meaning)

The Pipeline

Indexing: Documents → Chunk → Embed → Store in Vector DB

Query: Query → Embed → Search Vector DB → Rank → Return

Building the System

Document Ingestion

from sentence_transformers import SentenceTransformer
from dataclasses import dataclass

@dataclass
class Document:
    id: str
    content: str
    metadata: dict

class SemanticSearchIndex:
    def __init__(self, model_name='all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)
        self.chunks = []
        self.embeddings = []
        self.metadata = []

    def add_documents(self, documents: list[Document]):
        for doc in documents:
            chunks = self._chunk_document(doc)
            for chunk in chunks:
                embedding = self.model.encode(chunk['text'])
                self.chunks.append(chunk['text'])
                self.embeddings.append(embedding)
                self.metadata.append({
                    'document_id': doc.id,
                    'chunk_index': chunk['index'],
                    **doc.metadata
                })

    def _chunk_document(self, doc: Document, chunk_size=500, overlap=50):
        text = doc.content
        chunks = []
        start = 0
        index = 0

        while start < len(text):
            end = min(start + chunk_size, len(text))

            # Try to break at sentence boundary
            if end < len(text):
                for sep in ['. ', '.\n', '\n\n']:
                    last_sep = text.rfind(sep, start + chunk_size//2, end)
                    if last_sep != -1:
                        end = last_sep + len(sep)
                        break

            chunks.append({
                'text': text[start:end].strip(),
                'index': index,
                'start_char': start,
                'end_char': end
            })

            # Stop once we've reached the end of the text; otherwise
            # start = end - overlap would re-process the final chunk forever
            if end >= len(text):
                break

            start = end - overlap
            index += 1

        return chunks
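
One note on the loop above: model.encode is called once per chunk, which gets slow for large corpora. SentenceTransformer.encode also accepts a list of texts, so a batched variant (a sketch with the same interface as add_documents above) can cut indexing time considerably:

class SemanticSearchIndex:
    # ... previous code ...

    def add_documents_batched(self, documents: list[Document], batch_size=64):
        texts, metas = [], []
        for doc in documents:
            for chunk in self._chunk_document(doc):
                texts.append(chunk['text'])
                metas.append({
                    'document_id': doc.id,
                    'chunk_index': chunk['index'],
                    **doc.metadata
                })
        # One batched encode call instead of one call per chunk
        embeddings = self.model.encode(texts, batch_size=batch_size)
        self.chunks.extend(texts)
        self.embeddings.extend(embeddings)
        self.metadata.extend(metas)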

Search Implementation

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class SemanticSearchIndex:
    # ... previous code ...

    def search(self, query: str, top_k: int = 10) -> list[dict]:
        # Embed query
        query_embedding = self.model.encode(query)

        # Calculate similarities
        embeddings_array = np.array(self.embeddings)
        similarities = cosine_similarity(
            [query_embedding],
            embeddings_array
        )[0]

        # Get top k indices
        top_indices = np.argsort(similarities)[::-1][:top_k]

        # Build results
        results = []
        for idx in top_indices:
            results.append({
                'text': self.chunks[idx],
                'score': float(similarities[idx]),
                'chunk_id': int(idx),  # global position in the index (used by hybrid search below)
                'metadata': self.metadata[idx]
            })

        return results
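
Putting ingestion and search together (a small usage sketch; the documents and IDs here are made up):

docs = [
    Document(id='pg-tuning',
             content='Optimizing PostgreSQL query performance starts with EXPLAIN...',
             metadata={'category': 'databases'}),
    Document(id='k8s-net',
             content='k8s container network configuration for pods and services...',
             metadata={'category': 'devops'}),
]

index = SemanticSearchIndex()
index.add_documents(docs)

for result in index.search("how to fix a slow database", top_k=3):
    print(f"{result['score']:.3f}  {result['metadata']['document_id']}")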

Hybrid Search

Semantic search alone can miss exact keyword matches such as error codes and product names, while BM25 misses paraphrases. A hybrid index runs both and blends the scores.

from rank_bm25 import BM25Okapi
import re

class HybridSearchIndex:
    def __init__(self, semantic_weight=0.7):
        self.semantic = SemanticSearchIndex()
        self.bm25 = None
        self.tokenized_chunks = []
        self.semantic_weight = semantic_weight

    def add_documents(self, documents):
        self.semantic.add_documents(documents)

        # Build BM25 index
        self.tokenized_chunks = [
            self._tokenize(chunk)
            for chunk in self.semantic.chunks
        ]
        self.bm25 = BM25Okapi(self.tokenized_chunks)

    def _tokenize(self, text):
        # Simple tokenization
        return re.findall(r'\w+', text.lower())

    def search(self, query: str, top_k: int = 10) -> list[dict]:
        # Semantic search
        semantic_results = self.semantic.search(query, top_k=top_k * 2)

        # BM25 search
        tokenized_query = self._tokenize(query)
        bm25_scores = self.bm25.get_scores(tokenized_query)

        # Normalize scores
        # Keys are global chunk positions, aligned with bm25_scores below
        semantic_scores = {r['chunk_id']: r['score']
                           for r in semantic_results}
        bm25_max = max(bm25_scores) if max(bm25_scores) > 0 else 1
        bm25_normalized = {i: s/bm25_max for i, s in enumerate(bm25_scores)}

        # Combine scores
        combined_scores = {}
        all_indices = set(semantic_scores.keys()) | set(range(len(bm25_scores)))

        for idx in all_indices:
            sem_score = semantic_scores.get(idx, 0)
            bm25_score = bm25_normalized.get(idx, 0)
            combined_scores[idx] = (
                self.semantic_weight * sem_score +
                (1 - self.semantic_weight) * bm25_score
            )

        # Sort and return top k
        sorted_indices = sorted(
            combined_scores.keys(),
            key=lambda x: combined_scores[x],
            reverse=True
        )[:top_k]

        return [{
            'text': self.semantic.chunks[idx],
            'score': combined_scores[idx],
            'metadata': self.semantic.metadata[idx]
        } for idx in sorted_indices]
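
The default semantic_weight of 0.7 is a starting point, not a rule; corpora heavy on exact identifiers (error codes, API names) often deserve more BM25 weight. Usage mirrors the semantic-only index (reusing the docs from the earlier sketch):

hybrid = HybridSearchIndex(semantic_weight=0.7)
hybrid.add_documents(docs)

# Exact tokens like "k8s" benefit from the BM25 half of the score
for r in hybrid.search("k8s pod networking", top_k=5):
    print(f"{r['score']:.3f}  {r['text'][:60]}")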

Production Considerations

Using Vector Databases

import os

import pinecone
from sentence_transformers import SentenceTransformer

class ProductionSemanticSearch:
    def __init__(self, index_name: str):
        pinecone.init(
            api_key=os.environ['PINECONE_API_KEY'],
            environment='us-east1-gcp'
        )
        self.index = pinecone.Index(index_name)
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def add_documents(self, documents: list[Document], batch_size=100):
        vectors = []

        for doc in documents:
            # Reuses the chunking helper defined on SemanticSearchIndex earlier
            chunks = self._chunk_document(doc)
            for chunk in chunks:
                embedding = self.model.encode(chunk['text']).tolist()
                vectors.append({
                    'id': f"{doc.id}_{chunk['index']}",
                    'values': embedding,
                    'metadata': {
                        'text': chunk['text'][:1000],  # Pinecone metadata limit
                        'document_id': doc.id,
                        **doc.metadata
                    }
                })

                if len(vectors) >= batch_size:
                    self.index.upsert(vectors=vectors)
                    vectors = []

        if vectors:
            self.index.upsert(vectors=vectors)

    def search(self, query: str, top_k: int = 10, filter: dict = None):
        query_embedding = self.model.encode(query).tolist()

        results = self.index.query(
            vector=query_embedding,
            top_k=top_k,
            include_metadata=True,
            filter=filter
        )

        return [{
            'text': match.metadata.get('text', ''),
            'score': match.score,
            'metadata': match.metadata
        } for match in results.matches]
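
One gotcha: the Pinecone index must be created with a dimension that matches the embedding model, and all-MiniLM-L6-v2 produces 384-dimensional vectors. With the v2 client used above, index creation looks roughly like this (a one-time setup sketch):

import os
import pinecone

pinecone.init(
    api_key=os.environ['PINECONE_API_KEY'],
    environment='us-east1-gcp'
)

if 'semantic-search' not in pinecone.list_indexes():
    pinecone.create_index(
        'semantic-search',
        dimension=384,   # must match the embedding model's output size
        metric='cosine'
    )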

Filtering and Faceting

# Search with filters
results = search_index.search(
    query="kubernetes deployment best practices",
    top_k=10,
    filter={
        "category": {"$in": ["devops", "kubernetes"]},
        "year": {"$gte": 2023},  # Pinecone range operators apply to numeric fields
        "author": {"$eq": "engineering-team"}
    }
)

# Faceted search
def search_with_facets(query: str, facet_fields: list[str]):
    results = search_index.search(query, top_k=100)

    facets = {}
    for field in facet_fields:
        facets[field] = {}
        for result in results:
            value = result['metadata'].get(field)
            if value:
                facets[field][value] = facets[field].get(value, 0) + 1

    return {
        'results': results[:10],
        'facets': facets
    }
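
Note that these facet counts are computed client-side over only the top 100 hits, so they are approximate; exact corpus-wide counts would need aggregation in the database. A call looks like:

response = search_with_facets(
    "kubernetes deployment best practices",
    facet_fields=['category', 'author']
)
print(response['facets'])  # counts per category and author across the top 100 hits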

Query Understanding

import json

class QueryProcessor:
    def __init__(self, llm):
        self.llm = llm

    def expand_query(self, query: str) -> list[str]:
        """Generate query variations for better recall."""
        prompt = f"""Generate 3 alternative search queries for: "{query}"
Return only the queries, one per line."""

        response = self.llm.generate(prompt)
        queries = [q.strip() for q in response.split('\n') if q.strip()]
        return [query] + queries

    def extract_filters(self, query: str) -> tuple[str, dict]:
        """Extract explicit filters from query."""
        prompt = f"""Extract search filters from this query.
Return JSON with "query" (the search text) and "filters" (extracted constraints).

Query: "{query}"

Example:
Query: "kubernetes tutorials from 2023"
{{"query": "kubernetes tutorials", "filters": {{"year": 2023}}}}

JSON:"""

        response = self.llm.generate(prompt, temperature=0)
        try:
            result = json.loads(response)
            return result.get('query', query), result.get('filters', {})
        except json.JSONDecodeError:
            # Fall back to a plain search if the LLM output isn't valid JSON
            return query, {}

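expand_query only helps if the variants are actually searched. A simple fusion sketch (assuming the SemanticSearchIndex from earlier, and keeping each chunk's best score across variants):

def multi_query_search(index, processor: QueryProcessor, query: str, top_k: int = 10):
    best = {}  # chunk_id -> best-scoring result across all query variants
    for variant in processor.expand_query(query):
        for r in index.search(variant, top_k=top_k):
            cid = r['chunk_id']
            if cid not in best or r['score'] > best[cid]['score']:
                best[cid] = r
    # Re-rank the merged pool by best score
    return sorted(best.values(), key=lambda r: r['score'], reverse=True)[:top_k]
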
Evaluation

Measuring Quality

from dataclasses import dataclass

@dataclass
class TestQuery:
    query: str
    relevant_docs: set[str]  # IDs of documents judged relevant for this query

def evaluate_search(search_index, test_queries: list[TestQuery]):
    """Evaluate search quality with labeled test data."""
    metrics = {
        'mrr': [],  # Mean Reciprocal Rank
        'precision_at_5': [],
        'recall_at_10': []
    }

    for test in test_queries:
        results = search_index.search(test.query, top_k=10)
        result_ids = [r['metadata']['document_id'] for r in results]

        # MRR
        for rank, doc_id in enumerate(result_ids, 1):
            if doc_id in test.relevant_docs:
                metrics['mrr'].append(1.0 / rank)
                break
        else:
            metrics['mrr'].append(0.0)

        # Precision@5
        relevant_in_top5 = len(set(result_ids[:5]) & set(test.relevant_docs))
        metrics['precision_at_5'].append(relevant_in_top5 / 5)

        # Recall@10
        relevant_found = len(set(result_ids) & set(test.relevant_docs))
        metrics['recall_at_10'].append(
            relevant_found / len(test.relevant_docs)
        )

    return {k: sum(v)/len(v) for k, v in metrics.items()}
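
A small, hypothetical test set to run it against (document IDs are placeholders matching the earlier usage sketch):

test_queries = [
    TestQuery(query="how to fix a slow database",
              relevant_docs={'pg-tuning'}),
    TestQuery(query="kubernetes pod networking",
              relevant_docs={'k8s-net'}),
]

print(evaluate_search(index, test_queries))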

Key Takeaways

Semantic search is foundational for modern search and AI applications. The pipeline itself is simple: chunk documents at sensible boundaries, embed the chunks, and rank by vector similarity. Hybrid scoring with BM25 recovers the exact-match queries that embeddings miss, a vector database handles scale and metadata filtering in production, and metrics like MRR, precision@5, and recall@10 tell you whether a change actually helped.