Basic RAG (embed the query, find similar chunks, stuff them into the context) often disappoints. Retrieval quality sets a ceiling on generation quality: the model cannot answer well from chunks it never retrieved. Advanced retrieval strategies can dramatically improve results.
Below is the baseline for reference, followed by the retrieval strategies that make RAG actually work.
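For reference, the pipeline the rest of this post improves on looks roughly like this (a minimal sketch; vector_store and llm are placeholder objects with the same interfaces assumed by the later code, not a specific library):

    async def basic_rag(query: str, vector_store, llm, top_k: int = 5) -> str:
        """Naive pipeline: retrieve the nearest chunks and stuff them into the prompt."""
        chunks = await vector_store.search(query=query, top_k=top_k)
        context = "\n\n".join(chunk.text for chunk in chunks)
        return await llm.generate(
            prompt=f"""Answer the question using only the context below.

Context:
{context}

Question: {query}

Answer:"""
        )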
Why Basic RAG Falls Short
Common Failure Modes
basic_rag_problems:
  semantic_gap:
    issue: "Query terms don't match document terms"
    example: "User asks 'vacation policy', docs say 'PTO guidelines'"
  chunking_issues:
    issue: "Important context split across chunks"
    example: "Question spans multiple paragraphs"
  wrong_granularity:
    issue: "Retrieved too much or too little"
    example: "Need sentence, got whole section"
  recency_blindness:
    issue: "Old and new docs treated equally"
    example: "Outdated policy retrieved"
  multi_hop_failure:
    issue: "Answer requires connecting multiple facts"
    example: "Who is the CEO of the company that makes iPhones?"
Advanced Retrieval Strategies
Hybrid Search
class HybridRetriever:
    """Combine semantic and keyword search."""

    def __init__(self, vector_store, keyword_index, alpha: float = 0.7):
        self.vector_store = vector_store
        self.keyword_index = keyword_index
        self.alpha = alpha  # Weight for semantic search; (1 - alpha) goes to keyword search

    async def search(
        self,
        query: str,
        top_k: int = 10
    ) -> list[Document]:
        # Semantic search (over-fetch so fusion has candidates to work with)
        semantic_results = await self.vector_store.search(
            query=query,
            top_k=top_k * 2
        )

        # Keyword search (BM25)
        keyword_results = await self.keyword_index.search(
            query=query,
            top_k=top_k * 2
        )

        # Reciprocal Rank Fusion
        combined = self._rrf_combine(
            semantic_results,
            keyword_results,
            alpha=self.alpha
        )
        return combined[:top_k]

    def _rrf_combine(
        self,
        semantic: list[Document],
        keyword: list[Document],
        alpha: float,
        k: int = 60
    ) -> list[Document]:
        """Reciprocal Rank Fusion scoring, weighted by alpha."""
        scores: dict[str, float] = {}
        for rank, doc in enumerate(semantic, start=1):  # RRF ranks start at 1
            scores[doc.id] = scores.get(doc.id, 0) + alpha / (k + rank)
        for rank, doc in enumerate(keyword, start=1):
            scores[doc.id] = scores.get(doc.id, 0) + (1 - alpha) / (k + rank)

        # Sort by combined score, best first
        all_docs = {d.id: d for d in semantic + keyword}
        sorted_ids = sorted(scores, key=lambda doc_id: scores[doc_id], reverse=True)
        return [all_docs[doc_id] for doc_id in sorted_ids]
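Usage is straightforward; a sketch, with my_vector_store and my_bm25_index standing in for whatever semantic and keyword backends you actually run:

    # Hypothetical backends; any pair exposing an async search(query, top_k) works.
    retriever = HybridRetriever(
        vector_store=my_vector_store,
        keyword_index=my_bm25_index,
        alpha=0.7,  # 70% weight on the semantic ranking, 30% on BM25
    )
    docs = await retriever.search("vacation policy", top_k=10)

Because RRF combines ranks rather than raw scores, the semantic and BM25 scores never need to be normalized against each other.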
Query Expansion
class QueryExpander:
    """Expand a query to improve recall."""

    def __init__(self, llm):
        self.llm = llm

    async def expand_query(self, query: str) -> list[str]:
        expanded = await self.llm.generate(
            prompt=f"""Generate 3 alternative phrasings of this search query.
Include synonyms and related terms.

Original query: {query}

Alternative queries (one per line):"""
        )
        # Keep only non-empty lines from the model output
        alternatives = [line.strip() for line in expanded.strip().split("\n") if line.strip()]
        return [query] + alternatives[:3]

    async def hypothetical_document(self, query: str) -> str:
        """HyDE: generate a hypothetical answer to use as the query."""
        return await self.llm.generate(
            prompt=f"""Write a short paragraph that would be a perfect answer to this question.
Don't say you don't know - write what an ideal answer would look like.

Question: {query}

Ideal answer paragraph:"""
        )
class ExpandedRetriever:
    def __init__(self, expander: QueryExpander, vector_store):
        self.expander = expander
        self.vector_store = vector_store

    async def search(self, query: str, top_k: int = 10) -> list[Document]:
        # Expand the query into alternative phrasings
        queries = await self.expander.expand_query(query)

        # Generate a HyDE document and search with it too
        hyde_doc = await self.expander.hypothetical_document(query)
        queries.append(hyde_doc)

        # Search with all queries
        all_results = []
        for q in queries:
            results = await self.vector_store.search(q, top_k=top_k)
            all_results.extend(results)

        # Deduplicate and rerank
        return self._dedupe_and_rerank(all_results, query)[:top_k]
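_dedupe_and_rerank is referenced but not shown. A minimal, purely heuristic version keeps one copy of each document and ranks by how many of the expanded queries retrieved it; an LLM or cross-encoder reranker (covered below) is a stronger drop-in:

    def _dedupe_and_rerank(self, results: list[Document], query: str) -> list[Document]:
        """Deduplicate by document id and rank by retrieval frequency across queries."""
        # query is unused here, but kept for signature parity with smarter rerankers
        hits: dict[str, int] = {}
        by_id: dict[str, Document] = {}
        for doc in results:
            hits[doc.id] = hits.get(doc.id, 0) + 1
            by_id.setdefault(doc.id, doc)
        ranked = sorted(hits, key=lambda doc_id: hits[doc_id], reverse=True)
        return [by_id[doc_id] for doc_id in ranked]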
Contextual Chunking
class ContextualChunker:
    """Chunk documents while preserving context."""

    def chunk_with_context(
        self,
        document: str,
        chunk_size: int = 500,
        overlap: int = 100
    ) -> list[Chunk]:
        # Parse document structure
        sections = self._parse_sections(document)

        chunks = []
        for section in sections:
            section_chunks = self._chunk_section(
                section,
                chunk_size=chunk_size,
                overlap=overlap
            )
            for chunk in section_chunks:
                # Prepend a contextual header so the chunk is searchable on its own
                chunk.text = f"""Document: {section.document_title}
Section: {section.title}

{chunk.text}"""
                chunk.metadata["section"] = section.title
                chunk.metadata["document"] = section.document_title
                chunks.append(chunk)
        return chunks

    async def generate_chunk_summary(self, chunk: Chunk) -> str:
        """Generate a searchable summary for a chunk."""
        return await self.llm.generate(
            prompt=f"""Write a brief summary of this text chunk that captures
its key searchable concepts.

Text:
{chunk.text}

Summary (2-3 sentences):"""
        )
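_parse_sections and _chunk_section are left to the reader. As one concrete interpretation, the parsing half might split on Markdown headings into a simple Section record (hypothetical; adapt it to whatever structure your documents actually have, and wrap it as the class's _parse_sections method):

    from dataclasses import dataclass

    @dataclass
    class Section:
        document_title: str
        title: str
        text: str

    def parse_sections(document: str, document_title: str = "Untitled") -> list[Section]:
        """Split a Markdown document into sections at heading lines."""
        sections: list[Section] = []
        title, lines = "Introduction", []
        for line in document.splitlines():
            if line.startswith("#"):
                if lines:
                    sections.append(Section(document_title, title, "\n".join(lines)))
                title, lines = line.lstrip("#").strip(), []
            else:
                lines.append(line)
        if lines:
            sections.append(Section(document_title, title, "\n".join(lines)))
        return sections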
Multi-Vector Retrieval
class MultiVectorRetriever:
    """Store multiple representations per document."""

    async def index_document(self, doc: Document):
        # Original text embedding
        original_embedding = await self.embed(doc.text)

        # Summary embedding
        summary = await self.summarize(doc.text)
        summary_embedding = await self.embed(summary)

        # Question embeddings (what questions does this text answer?)
        questions = await self.generate_questions(doc.text)
        question_embeddings = [await self.embed(q) for q in questions]

        # Store all vectors pointing to the same document
        vectors = [
            {"embedding": original_embedding, "type": "original"},
            {"embedding": summary_embedding, "type": "summary"},
        ] + [
            {"embedding": emb, "type": "question"}
            for emb in question_embeddings
        ]
        for vec in vectors:
            await self.vector_store.insert(
                embedding=vec["embedding"],
                metadata={
                    "doc_id": doc.id,
                    "vector_type": vec["type"]
                }
            )

    async def generate_questions(self, text: str) -> list[str]:
        """Generate questions this text could answer."""
        result = await self.llm.generate(
            prompt=f"""What questions would this text answer?
Generate 3-5 specific questions.

Text:
{text}

Questions:"""
        )
        # Drop blank lines so we never embed empty strings
        return [q.strip() for q in result.strip().split("\n") if q.strip()]
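The indexing half is only useful with a matching query side: search across all vector types, then collapse hits back to their parent documents. A sketch, assuming the vector store returns matches carrying the metadata stored above and that self.doc_store can fetch full documents by id (both assumptions, not part of the original class):

    async def search(self, query: str, top_k: int = 10) -> list[Document]:
        query_embedding = await self.embed(query)
        # Over-fetch: several vectors (original, summary, questions) can hit the same parent doc
        matches = await self.vector_store.search(embedding=query_embedding, top_k=top_k * 3)
        seen: set[str] = set()
        docs: list[Document] = []
        for match in matches:  # assumed ordered by similarity, best first
            doc_id = match.metadata["doc_id"]
            if doc_id in seen:
                continue
            seen.add(doc_id)
            docs.append(await self.doc_store.get(doc_id))
            if len(docs) == top_k:
                break
        return docs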
Reranking
class Reranker:
    """Rerank retrieved documents for relevance."""

    async def rerank(
        self,
        query: str,
        documents: list[Document],
        top_k: int = 5
    ) -> list[Document]:
        # Score each document against the query
        scored_docs = []
        for doc in documents:
            score = await self._score_relevance(query, doc.text)
            scored_docs.append((doc, score))

        # Sort by score, best first
        scored_docs.sort(key=lambda x: x[1], reverse=True)
        return [doc for doc, _ in scored_docs[:top_k]]

    async def _score_relevance(self, query: str, text: str) -> float:
        result = await self.llm.generate(
            prompt=f"""Rate how relevant this text is to the query.
Score from 0-10 where 10 is perfectly relevant.

Query: {query}
Text: {text[:1000]}

Score (just the number):"""
        )
        try:
            return float(result.strip())
        except ValueError:
            # Model returned something other than a number; treat as irrelevant
            return 0.0
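One practical note: the loop above awaits one LLM call per document, so latency grows linearly with the candidate count. If your LLM client tolerates concurrent requests, the scoring can be fanned out with asyncio.gather (a sketch, same logic otherwise):

    import asyncio

    async def rerank_concurrent(
        self, query: str, documents: list[Document], top_k: int = 5
    ) -> list[Document]:
        """Same as rerank(), but scores all candidates concurrently."""
        scores = await asyncio.gather(
            *(self._score_relevance(query, doc.text) for doc in documents)
        )
        ranked = sorted(zip(documents, scores), key=lambda pair: pair[1], reverse=True)
        return [doc for doc, _ in ranked[:top_k]]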
Evaluation
retrieval_metrics:
  recall_at_k:
    what: "Percentage of relevant docs that appear in the top K"
    target: ">0.8 at k=10"
  precision_at_k:
    what: "Percentage of the top K that are relevant"
    target: ">0.6 at k=5"
  mrr:
    what: "Mean Reciprocal Rank of the first relevant result"
    target: ">0.7"
  ndcg:
    what: "Ranking quality, weighted by position"
    target: ">0.75"
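All of these are easy to compute per query once you have labeled (query, relevant document ids) pairs. For example, recall@k and reciprocal rank (average the latter over your query set to get MRR):

    def recall_at_k(retrieved_ids: list[str], relevant_ids: set[str], k: int) -> float:
        """Fraction of the relevant documents that appear in the top k results."""
        if not relevant_ids:
            return 0.0
        hits = len(set(retrieved_ids[:k]) & relevant_ids)
        return hits / len(relevant_ids)

    def reciprocal_rank(retrieved_ids: list[str], relevant_ids: set[str]) -> float:
        """1 / rank of the first relevant document, or 0.0 if none was retrieved."""
        for rank, doc_id in enumerate(retrieved_ids, start=1):
            if doc_id in relevant_ids:
                return 1.0 / rank
        return 0.0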
Key Takeaways
- Basic RAG is just the starting point
- Hybrid search (semantic + keyword) beats either alone
- Query expansion improves recall significantly
- HyDE generates better search representations
- Contextual chunking preserves document structure
- Multi-vector indexing captures different aspects
- Reranking improves precision after recall
- Measure retrieval quality separately from generation
- The best retrieval strategy depends on your data
- Iterate on retrieval before tuning prompts
Better retrieval means better answers. Invest in it.