Tracing RAG Pipelines
Best practices for observing Retrieval-Augmented Generation applications.
Overview
RAG (Retrieval-Augmented Generation) combines document retrieval with LLM generation. Proper tracing helps you understand retrieval quality, debug generation issues, and optimize performance.
RAG Pipeline Structure
Query
↓
[Query Processing]
↓
[Embedding Generation]
↓
[Vector Search]
↓
[Document Retrieval]
↓
[Context Formatting]
↓
[LLM Generation]
↓
Response
Complete RAG Trace
import OpenAI from 'openai';
import { getObservability } from '@transactional/observability';

const openai = new OpenAI();

// Assumes `vectorStore` (e.g. a Pinecone index) and an app-specific
// `processQuery` are defined elsewhere; sketches of the small helpers
// (`computeAverage`, `estimateTokens`, `buildSystemPrompt`) follow this example.
async function ragQuery(query: string): Promise<string> {
  const obs = getObservability();

  const trace = obs.trace({
    name: 'rag-query',
    input: { query },
    metadata: {
      pipelineVersion: '2.0',
      vectorStore: 'pinecone',
      model: 'gpt-4o',
    },
  });

  try {
    // Step 1: Query processing
    const processSpan = obs.observation({
      type: 'SPAN',
      name: 'process-query',
      input: { query },
    });
    const processedQuery = await processQuery(query);
    await processSpan.end({
      output: {
        processedQuery,
        modifications: processedQuery !== query,
      },
    });

    // Step 2: Generate embedding
    const embedGeneration = obs.generation({
      name: 'generate-embedding',
      modelName: 'text-embedding-3-small',
      input: { text: processedQuery },
    });
    const embedding = await openai.embeddings.create({
      model: 'text-embedding-3-small',
      input: processedQuery,
    });
    await embedGeneration.end({
      output: { dimensions: embedding.data[0].embedding.length },
      promptTokens: embedding.usage.total_tokens,
    });

    // Step 3: Vector search
    const searchSpan = obs.observation({
      type: 'SPAN',
      name: 'vector-search',
      input: { topK: 5, threshold: 0.7 },
      metadata: { index: 'production-docs' },
    });
    const searchResults = await vectorStore.query({
      vector: embedding.data[0].embedding,
      topK: 5,
      includeMetadata: true,
    });
    await searchSpan.end({
      output: {
        resultCount: searchResults.matches.length,
        topScore: searchResults.matches[0]?.score,
        avgScore: computeAverage(searchResults.matches.map(m => m.score)),
        scores: searchResults.matches.map(m => m.score),
      },
    });

    // Step 4: Format context
    const formatSpan = obs.observation({
      type: 'SPAN',
      name: 'format-context',
      input: { documentCount: searchResults.matches.length },
    });
    const context = searchResults.matches
      .map(m => `## ${m.metadata.title}\n${m.metadata.content}`)
      .join('\n\n');
    await formatSpan.end({
      output: {
        contextLength: context.length,
        contextTokens: estimateTokens(context),
      },
    });

    // Step 5: Generate response
    const generation = obs.generation({
      name: 'generate-response',
      modelName: 'gpt-4o',
      input: {
        systemPrompt: buildSystemPrompt(context),
        userQuery: query,
      },
    });
    const response = await openai.chat.completions.create({
      model: 'gpt-4o',
      messages: [
        { role: 'system', content: buildSystemPrompt(context) },
        { role: 'user', content: query },
      ],
    });
    await generation.end({
      output: response.choices[0].message,
      promptTokens: response.usage?.prompt_tokens,
      completionTokens: response.usage?.completion_tokens,
    });

    const answer = response.choices[0].message.content ?? '';
    await trace.end({
      output: {
        answer,
        sourcesUsed: searchResults.matches.length,
        totalTokens: response.usage?.total_tokens,
      },
    });
    return answer;
  } catch (error) {
    await trace.error(error as Error);
    throw error;
  }
}
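The example above leans on a few small helpers. Minimal sketches, assuming nothing beyond the calls made above (the ~4-characters-per-token heuristic is a rough approximation; use a real tokenizer such as tiktoken when accuracy matters, and adapt the prompt template to your application):

// Illustrative helper sketches; names match the calls in the example above.
function computeAverage(values: number[]): number {
  if (values.length === 0) return 0;
  return values.reduce((sum, v) => sum + v, 0) / values.length;
}

// Rough heuristic: ~4 characters per token for English text.
function estimateTokens(text: string): number {
  return Math.ceil(text.length / 4);
}

function buildSystemPrompt(context: string): string {
  return `Answer the user's question using only the context below.\n\nContext:\n${context}`;
}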
Retrieval Quality Metrics
Tracking Search Results
Monitor retrieval quality:
const searchSpan = obs.observation({
  type: 'SPAN',
  name: 'vector-search',
  input: { query, topK },
});

const results = await vectorStore.query({...});

await searchSpan.end({
  output: {
    // Basic metrics
    resultCount: results.matches.length,
    topScore: results.matches[0]?.score,
    avgScore: computeAverage(results.matches.map(m => m.score)),
    // Score distribution
    aboveThreshold: results.matches.filter(m => m.score > 0.7).length,
    // Diversity
    uniqueSources: new Set(results.matches.map(m => m.metadata.source)).size,
    // Document IDs for analysis
    documentIds: results.matches.map(m => m.id),
  },
});

Retrieval Assessment
Assess retrieval relevance:
async function assessRetrieval(query: string, documents: Document[]) {
  const obs = getObservability();

  const assessment = await obs.assess({
    type: 'llm-judge',
    criteria: ['relevance', 'coverage'],
    customPrompt: `
      Query: ${query}

      Retrieved Documents:
      ${documents.map((d, i) => `${i + 1}. ${d.content.substring(0, 200)}...`).join('\n')}

      Assess:
      1. Are the documents relevant to the query? (1-5)
      2. Do they cover the topic comprehensively? (1-5)
    `,
  });

  return assessment.scores;
}

Chunking and Indexing
Tracking Indexing
Monitor document processing:
async function indexDocument(document: Document) {
  const obs = getObservability();

  const trace = obs.trace({
    name: 'index-document',
    input: {
      documentId: document.id,
      size: document.content.length,
    },
  });

  // Chunking
  const chunkSpan = obs.observation({
    type: 'SPAN',
    name: 'chunk-document',
    input: { size: document.content.length },
  });
  const chunks = chunkDocument(document);
  await chunkSpan.end({
    output: {
      chunkCount: chunks.length,
      avgChunkSize: computeAverage(chunks.map(c => c.content.length)),
    },
  });

  // Embedding
  const embedGeneration = obs.generation({
    name: 'embed-chunks',
    modelName: 'text-embedding-3-small',
    input: { chunkCount: chunks.length },
  });
  const embeddings = await openai.embeddings.create({
    model: 'text-embedding-3-small',
    input: chunks.map(c => c.content),
  });
  await embedGeneration.end({
    output: { embeddingsGenerated: embeddings.data.length },
    promptTokens: embeddings.usage.total_tokens,
  });

  // Indexing
  const indexSpan = obs.observation({
    type: 'SPAN',
    name: 'store-vectors',
    input: { vectorCount: embeddings.data.length },
  });
  await vectorStore.upsert(embeddings.data.map((e, i) => ({
    id: `${document.id}-${i}`,
    values: e.embedding,
    metadata: { documentId: document.id, chunkIndex: i },
  })));
  await indexSpan.end({ output: { success: true } });

  await trace.end({
    output: {
      chunksIndexed: chunks.length,
    },
  });
}
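The example assumes a `chunkDocument` helper. A minimal fixed-size chunker with overlap, matching the `{ content }` shape used above, might look like this (the 1000-character size and 200-character overlap are illustrative defaults, not recommendations):

interface Chunk {
  content: string;
}

// Hypothetical fixed-size chunker with overlap. Sizes are illustrative;
// tune chunk size and overlap against your own retrieval metrics.
function chunkDocument(
  document: { content: string },
  chunkSize = 1000,
  overlap = 200,
): Chunk[] {
  const chunks: Chunk[] = [];
  for (let start = 0; start < document.content.length; start += chunkSize - overlap) {
    chunks.push({ content: document.content.slice(start, start + chunkSize) });
  }
  return chunks;
}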
Context Window Management
Tracking Context Usage
Monitor context utilization:
const contextTokens = estimateTokens(context);
const systemTokens = estimateTokens(systemPrompt);
const queryTokens = estimateTokens(query);
const maxContext = 128000; // gpt-4o context window

const generation = obs.generation({
  name: 'generate-response',
  modelName: 'gpt-4o',
  metadata: {
    contextTokens,
    systemTokens,
    queryTokens,
    totalInputTokens: contextTokens + systemTokens + queryTokens,
    contextUtilization: (contextTokens + systemTokens + queryTokens) / maxContext,
  },
});

Context Truncation
Track when context is truncated:
async function buildContext(documents: Document[], maxTokens: number) {
  const obs = getObservability();

  const span = obs.observation({
    type: 'SPAN',
    name: 'build-context',
    input: {
      documentCount: documents.length,
      maxTokens,
    },
  });

  let context = '';
  let includedDocs = 0;
  for (const doc of documents) {
    const docTokens = estimateTokens(doc.content);
    if (estimateTokens(context) + docTokens > maxTokens) break;
    context += doc.content + '\n\n';
    includedDocs++;
  }

  await span.end({
    output: {
      includedDocs,
      excludedDocs: documents.length - includedDocs,
      contextTokens: estimateTokens(context),
      truncated: includedDocs < documents.length,
    },
  });

  return context;
}

Hybrid Search
Tracking Multiple Retrieval Methods
async function hybridSearch(query: string) {
  const obs = getObservability();

  const trace = obs.trace({
    name: 'hybrid-search',
    input: { query },
  });

  // Vector search
  const vectorSpan = obs.observation({
    type: 'SPAN',
    name: 'vector-search',
  });
  const vectorResults = await vectorSearch(query);
  await vectorSpan.end({
    output: {
      count: vectorResults.length,
      topScore: vectorResults[0]?.score,
    },
  });

  // Keyword search
  const keywordSpan = obs.observation({
    type: 'SPAN',
    name: 'keyword-search',
  });
  const keywordResults = await keywordSearch(query);
  await keywordSpan.end({
    output: {
      count: keywordResults.length,
      topScore: keywordResults[0]?.score,
    },
  });

  // Merge results
  const mergeSpan = obs.observation({
    type: 'SPAN',
    name: 'merge-results',
    input: {
      vectorCount: vectorResults.length,
      keywordCount: keywordResults.length,
    },
  });
  const merged = reciprocalRankFusion(vectorResults, keywordResults);
  await mergeSpan.end({
    output: {
      finalCount: merged.length,
      vectorOnly: merged.filter(r => !r.keywordRank).length,
      keywordOnly: merged.filter(r => !r.vectorRank).length,
      both: merged.filter(r => r.vectorRank && r.keywordRank).length,
    },
  });

  await trace.end({
    output: { results: merged.length },
  });

  return merged;
}
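The `reciprocalRankFusion` implementation is assumed here. If you need a starting point, a minimal sketch of the standard RRF formula (each result scores the sum of 1 / (k + rank) over the lists it appears in, with k conventionally set to 60) that also records the per-method ranks read by the merge span above:

interface RankedResult {
  id: string;
  score: number;
}

interface FusedResult {
  id: string;
  score: number;        // fused RRF score
  vectorRank?: number;  // 1-based rank in the vector result list
  keywordRank?: number; // 1-based rank in the keyword result list
}

// Minimal Reciprocal Rank Fusion: each appearance contributes 1 / (k + rank).
function reciprocalRankFusion(
  vectorResults: RankedResult[],
  keywordResults: RankedResult[],
  k = 60,
): FusedResult[] {
  const fused = new Map<string, FusedResult>();

  const accumulate = (
    results: RankedResult[],
    rankKey: 'vectorRank' | 'keywordRank',
  ) => {
    results.forEach((result, index) => {
      const rank = index + 1;
      const entry = fused.get(result.id) ?? { id: result.id, score: 0 };
      entry.score += 1 / (k + rank);
      entry[rankKey] = rank;
      fused.set(result.id, entry);
    });
  };

  accumulate(vectorResults, 'vectorRank');
  accumulate(keywordResults, 'keywordRank');

  return [...fused.values()].sort((a, b) => b.score - a.score);
}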
Best Practices
1. Track Source Attribution
await trace.end({
  output: {
    answer,
    sources: searchResults.matches.map(m => ({
      id: m.id,
      title: m.metadata.title,
      score: m.score,
    })),
  },
});

2. Monitor Retrieval Metrics
Key metrics to track (a computation sketch follows the list):
- Top score (above ~0.7 usually indicates a strong match; calibrate the threshold for your embedding model)
- Results above threshold
- Unique sources
- Query-to-result latency
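A hypothetical helper that derives these metrics from a search response (the `summarizeRetrieval` name and `Match` shape are illustrative, not part of the SDK); pass its result to `searchSpan.end({ output: ... })` as in the earlier examples:

interface Match {
  id: string;
  score: number;
  metadata: { source: string };
}

// Illustrative helper: derive the retrieval metrics listed above from a
// search response plus the measured search latency.
function summarizeRetrieval(matches: Match[], latencyMs: number, threshold = 0.7) {
  return {
    topScore: matches[0]?.score ?? 0,
    aboveThreshold: matches.filter(m => m.score > threshold).length,
    uniqueSources: new Set(matches.map(m => m.metadata.source)).size,
    latencyMs,
  };
}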
3. Assess Groundedness
Check if responses are grounded in retrieved context:
const groundedness = await obs.assess({
  type: 'llm-judge',
  customPrompt: `
    Is every claim in the response supported by the context?

    Context: ${context}
    Response: ${response}

    Score 1-5: 1 = unsupported claims, 5 = fully grounded
  `,
});

4. A/B Test Retrieval Strategies
const strategy = Math.random() > 0.5 ? 'hybrid' : 'vector-only';

const trace = obs.trace({
  name: 'rag-query',
  metadata: {
    experiment: 'retrieval-strategy',
    variant: strategy,
  },
});
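Random assignment gives each request an independent coin flip. If you want a given user to see a consistent variant across requests, a simple sketch of deterministic assignment by hashing a stable identifier (the hash function here is illustrative, not a recommendation):

// Illustrative sticky assignment: hash a stable identifier so the same
// user always lands in the same variant.
function assignVariant(userId: string, variants: string[]): string {
  let hash = 0;
  for (const char of userId) {
    hash = (hash * 31 + char.charCodeAt(0)) >>> 0;
  }
  return variants[hash % variants.length];
}

const strategy = assignVariant(userId, ['hybrid', 'vector-only']);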
Next Steps
- Production Guide - Production best practices
- Evaluation - Assess RAG quality
- Performance - Optimize latency