Basic RAG (embed the query, find similar chunks, stuff them into the context) often disappoints. Retrieval quality sets a ceiling on generation quality: the model cannot answer well from chunks it never retrieved. Advanced retrieval strategies can dramatically improve results.
Below is the baseline for reference, followed by the retrieval strategies that make RAG actually work.
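For reference, the pipeline the rest of this post improves on looks roughly like this (a minimal sketch; vector_store and llm are placeholder objects with the same interfaces assumed by the later code, not a specific library):

    async def basic_rag(query: str, vector_store, llm, top_k: int = 5) -> str:
        """Naive pipeline: retrieve the nearest chunks and stuff them into the prompt."""
        chunks = await vector_store.search(query=query, top_k=top_k)
        context = "\n\n".join(chunk.text for chunk in chunks)
        return await llm.generate(
            prompt=f"""Answer the question using only the context below.

Context:
{context}

Question: {query}

Answer:"""
        )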
Why Basic RAG Falls Short
Common Failure Modes
basic_rag_problems:
  semantic_gap:
    issue: "Query terms don't match document terms"
    example: "User asks 'vacation policy', docs say 'PTO guidelines'"
  chunking_issues:
    issue: "Important context split across chunks"
    example: "Question spans multiple paragraphs"
  wrong_granularity:
    issue: "Retrieved too much or too little"
    example: "Need sentence, got whole section"
  recency_blindness:
    issue: "Old and new docs treated equally"
    example: "Outdated policy retrieved"
  multi_hop_failure:
    issue: "Answer requires connecting multiple facts"
    example: "Who is the CEO of the company that makes iPhones?"
Advanced Retrieval Strategies
Hybrid Search
class HybridRetriever:
    """Combine semantic and keyword search."""

    def __init__(self, vector_store, keyword_index, alpha: float = 0.7):
        self.vector_store = vector_store
        self.keyword_index = keyword_index
        self.alpha = alpha  # Weight for semantic search; (1 - alpha) goes to keyword search

    async def search(
        self,
        query: str,
        top_k: int = 10
    ) -> list[Document]:
        # Semantic search (over-fetch so fusion has candidates to work with)
        semantic_results = await self.vector_store.search(
            query=query,
            top_k=top_k * 2
        )

        # Keyword search (BM25)
        keyword_results = await self.keyword_index.search(
            query=query,
            top_k=top_k * 2
        )

        # Reciprocal Rank Fusion
        combined = self._rrf_combine(
            semantic_results,
            keyword_results,
            alpha=self.alpha
        )
        return combined[:top_k]

    def _rrf_combine(
        self,
        semantic: list[Document],
        keyword: list[Document],
        alpha: float,
        k: int = 60
    ) -> list[Document]:
        """Reciprocal Rank Fusion scoring, weighted by alpha."""
        scores: dict[str, float] = {}
        for rank, doc in enumerate(semantic, start=1):  # RRF ranks start at 1
            scores[doc.id] = scores.get(doc.id, 0) + alpha / (k + rank)
        for rank, doc in enumerate(keyword, start=1):
            scores[doc.id] = scores.get(doc.id, 0) + (1 - alpha) / (k + rank)

        # Sort by combined score, best first
        all_docs = {d.id: d for d in semantic + keyword}
        sorted_ids = sorted(scores, key=lambda doc_id: scores[doc_id], reverse=True)
        return [all_docs[doc_id] for doc_id in sorted_ids]
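Usage is straightforward; a sketch, with my_vector_store and my_bm25_index standing in for whatever semantic and keyword backends you actually run:

    # Hypothetical backends; any pair exposing an async search(query, top_k) works.
    retriever = HybridRetriever(
        vector_store=my_vector_store,
        keyword_index=my_bm25_index,
        alpha=0.7,  # 70% weight on the semantic ranking, 30% on BM25
    )
    docs = await retriever.search("vacation policy", top_k=10)

Because RRF combines ranks rather than raw scores, the semantic and BM25 scores never need to be normalized against each other.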
Query Expansion
class QueryExpander:
    """Expand a query to improve recall."""

    def __init__(self, llm):
        self.llm = llm

    async def expand_query(self, query: str) -> list[str]:
        expanded = await self.llm.generate(
            prompt=f"""Generate 3 alternative phrasings of this search query.
Include synonyms and related terms.

Original query: {query}

Alternative queries (one per line):"""
        )
        # Keep only non-empty lines from the model output
        alternatives = [line.strip() for line in expanded.strip().split("\n") if line.strip()]
        return [query] + alternatives[:3]

    async def hypothetical_document(self, query: str) -> str:
        """HyDE: generate a hypothetical answer to use as the query."""
        return await self.llm.generate(
            prompt=f"""Write a short paragraph that would be a perfect answer to this question.
Don't say you don't know - write what an ideal answer would look like.

Question: {query}

Ideal answer paragraph:"""
        )
class ExpandedRetriever:
    def __init__(self, expander: QueryExpander, vector_store):
        self.expander = expander
        self.vector_store = vector_store

    async def search(self, query: str, top_k: int = 10) -> list[Document]:
        # Expand the query into alternative phrasings
        queries = await self.expander.expand_query(query)

        # Generate a HyDE document and search with it too
        hyde_doc = await self.expander.hypothetical_document(query)
        queries.append(hyde_doc)

        # Search with all queries
        all_results = []
        for q in queries:
            results = await self.vector_store.search(q, top_k=top_k)
            all_results.extend(results)

        # Deduplicate and rerank
        return self._dedupe_and_rerank(all_results, query)[:top_k]
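_dedupe_and_rerank is referenced but not shown. A minimal, purely heuristic version keeps one copy of each document and ranks by how many of the expanded queries retrieved it; an LLM or cross-encoder reranker (covered below) is a stronger drop-in:

    def _dedupe_and_rerank(self, results: list[Document], query: str) -> list[Document]:
        """Deduplicate by document id and rank by retrieval frequency across queries."""
        # query is unused here, but kept for signature parity with smarter rerankers
        hits: dict[str, int] = {}
        by_id: dict[str, Document] = {}
        for doc in results:
            hits[doc.id] = hits.get(doc.id, 0) + 1
            by_id.setdefault(doc.id, doc)
        ranked = sorted(hits, key=lambda doc_id: hits[doc_id], reverse=True)
        return [by_id[doc_id] for doc_id in ranked]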
Contextual Chunking
class ContextualChunker:
    """Chunk documents while preserving context."""

    def chunk_with_context(
        self,
        document: str,
        chunk_size: int = 500,
        overlap: int = 100
    ) -> list[Chunk]:
        # Parse document structure
        sections = self._parse_sections(document)

        chunks = []
        for section in sections:
            section_chunks = self._chunk_section(
                section,
                chunk_size=chunk_size,
                overlap=overlap
            )
            for chunk in section_chunks:
                # Prepend a contextual header so the chunk is searchable on its own
                chunk.text = f"""Document: {section.document_title}
Section: {section.title}

{chunk.text}"""
                chunk.metadata["section"] = section.title
                chunk.metadata["document"] = section.document_title
                chunks.append(chunk)
        return chunks

    async def generate_chunk_summary(self, chunk: Chunk) -> str:
        """Generate a searchable summary for a chunk."""
        return await self.llm.generate(
            prompt=f"""Write a brief summary of this text chunk that captures
its key searchable concepts.

Text:
{chunk.text}

Summary (2-3 sentences):"""
        )
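_parse_sections and _chunk_section are left to the reader. As one concrete interpretation, the parsing half might split on Markdown headings into a simple Section record (hypothetical; adapt it to whatever structure your documents actually have, and wrap it as the class's _parse_sections method):

    from dataclasses import dataclass

    @dataclass
    class Section:
        document_title: str
        title: str
        text: str

    def parse_sections(document: str, document_title: str = "Untitled") -> list[Section]:
        """Split a Markdown document into sections at heading lines."""
        sections: list[Section] = []
        title, lines = "Introduction", []
        for line in document.splitlines():
            if line.startswith("#"):
                if lines:
                    sections.append(Section(document_title, title, "\n".join(lines)))
                title, lines = line.lstrip("#").strip(), []
            else:
                lines.append(line)
        if lines:
            sections.append(Section(document_title, title, "\n".join(lines)))
        return sections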
Multi-Vector Retrieval
class MultiVectorRetriever:
    """Store multiple representations per document."""

    async def index_document(self, doc: Document):
        # Original text embedding
        original_embedding = await self.embed(doc.text)

        # Summary embedding
        summary = await self.summarize(doc.text)
        summary_embedding = await self.embed(summary)

        # Question embeddings (what questions does this text answer?)
        questions = await self.generate_questions(doc.text)
        question_embeddings = [await self.embed(q) for q in questions]

        # Store all vectors pointing to the same document
        vectors = [
            {"embedding": original_embedding, "type": "original"},
            {"embedding": summary_embedding, "type": "summary"},
        ] + [
            {"embedding": emb, "type": "question"}
            for emb in question_embeddings
        ]
        for vec in vectors:
            await self.vector_store.insert(
                embedding=vec["embedding"],
                metadata={
                    "doc_id": doc.id,
                    "vector_type": vec["type"]
                }
            )

    async def generate_questions(self, text: str) -> list[str]:
        """Generate questions this text could answer."""
        result = await self.llm.generate(
            prompt=f"""What questions would this text answer?
Generate 3-5 specific questions.

Text:
{text}

Questions:"""
        )
        # Drop blank lines so we never embed empty strings
        return [q.strip() for q in result.strip().split("\n") if q.strip()]
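The indexing half is only useful with a matching query side: search across all vector types, then collapse hits back to their parent documents. A sketch, assuming the vector store returns matches carrying the metadata stored above and that self.doc_store can fetch full documents by id (both assumptions, not part of the original class):

    async def search(self, query: str, top_k: int = 10) -> list[Document]:
        query_embedding = await self.embed(query)
        # Over-fetch: several vectors (original, summary, questions) can hit the same parent doc
        matches = await self.vector_store.search(embedding=query_embedding, top_k=top_k * 3)
        seen: set[str] = set()
        docs: list[Document] = []
        for match in matches:  # assumed ordered by similarity, best first
            doc_id = match.metadata["doc_id"]
            if doc_id in seen:
                continue
            seen.add(doc_id)
            docs.append(await self.doc_store.get(doc_id))
            if len(docs) == top_k:
                break
        return docs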
Reranking
class Reranker:
    """Rerank retrieved documents for relevance."""

    async def rerank(
        self,
        query: str,
        documents: list[Document],
        top_k: int = 5
    ) -> list[Document]:
        # Score each document against the query
        scored_docs = []
        for doc in documents:
            score = await self._score_relevance(query, doc.text)
            scored_docs.append((doc, score))

        # Sort by score, best first
        scored_docs.sort(key=lambda x: x[1], reverse=True)
        return [doc for doc, _ in scored_docs[:top_k]]

    async def _score_relevance(self, query: str, text: str) -> float:
        result = await self.llm.generate(
            prompt=f"""Rate how relevant this text is to the query.
Score from 0-10 where 10 is perfectly relevant.

Query: {query}
Text: {text[:1000]}

Score (just the number):"""
        )
        try:
            return float(result.strip())
        except ValueError:
            # Model returned something other than a number; treat as irrelevant
            return 0.0
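One practical note: the loop above awaits one LLM call per document, so latency grows linearly with the candidate count. If your LLM client tolerates concurrent requests, the scoring can be fanned out with asyncio.gather (a sketch, same logic otherwise):

    import asyncio

    async def rerank_concurrent(
        self, query: str, documents: list[Document], top_k: int = 5
    ) -> list[Document]:
        """Same as rerank(), but scores all candidates concurrently."""
        scores = await asyncio.gather(
            *(self._score_relevance(query, doc.text) for doc in documents)
        )
        ranked = sorted(zip(documents, scores), key=lambda pair: pair[1], reverse=True)
        return [doc for doc, _ in ranked[:top_k]]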
Evaluation
retrieval_metrics:
  recall_at_k:
    what: "Percentage of relevant docs that appear in the top K"
    target: ">0.8 at k=10"
  precision_at_k:
    what: "Percentage of the top K that are relevant"
    target: ">0.6 at k=5"
  mrr:
    what: "Mean Reciprocal Rank of the first relevant result"
    target: ">0.7"
  ndcg:
    what: "Ranking quality, weighted by position"
    target: ">0.75"
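All of these are easy to compute per query once you have labeled (query, relevant document ids) pairs. For example, recall@k and reciprocal rank (average the latter over your query set to get MRR):

    def recall_at_k(retrieved_ids: list[str], relevant_ids: set[str], k: int) -> float:
        """Fraction of the relevant documents that appear in the top k results."""
        if not relevant_ids:
            return 0.0
        hits = len(set(retrieved_ids[:k]) & relevant_ids)
        return hits / len(relevant_ids)

    def reciprocal_rank(retrieved_ids: list[str], relevant_ids: set[str]) -> float:
        """1 / rank of the first relevant document, or 0.0 if none was retrieved."""
        for rank, doc_id in enumerate(retrieved_ids, start=1):
            if doc_id in relevant_ids:
                return 1.0 / rank
        return 0.0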
Key Takeaways
- Basic RAG is just the starting point
- Hybrid search (semantic + keyword) beats either alone
- Query expansion improves recall significantly
- HyDE generates better search representations
- Contextual chunking preserves document structure
- Multi-vector indexing captures different aspects
- Reranking improves precision after recall
- Measure retrieval quality separately from generation
- The best retrieval strategy depends on your data
- Iterate on retrieval before tuning prompts
Better retrieval means better answers. Invest in it.