<?xml version="1.0" encoding="UTF-8"?><rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	>

<channel>
	<title>PyImageSearch</title>
	<atom:link href="https://pyimagesearch.com/feed/" rel="self" type="application/rss+xml" />
	<link>https://pyimagesearch.com/</link>
	<description>You can master Computer Vision, Deep Learning, and OpenCV - PyImageSearch</description>
	<lastBuildDate>Mon, 22 Jun 2026 09:53:56 +0000</lastBuildDate>
	<language>en-US</language>
	<sy:updatePeriod>
	hourly	</sy:updatePeriod>
	<sy:updateFrequency>
	1	</sy:updateFrequency>
	<generator>https://wordpress.org/?v=6.8.5</generator>
	<item>
		<title>Google DeepMind&#8217;s Gemma 4: MoE, Efficiency Tricks, and Benchmarks</title>
		<link>https://pyimagesearch.com/2026/06/22/google-deepminds-gemma-4-moe-efficiency-tricks-and-benchmarks/</link>
		
		<dc:creator><![CDATA[Piyush Thakur]]></dc:creator>
		<pubDate>Mon, 22 Jun 2026 12:45:00 +0000</pubDate>
				<category><![CDATA[Artificial Intelligence]]></category>
		<category><![CDATA[Generative AI]]></category>
		<category><![CDATA[Large Language Models]]></category>
		<category><![CDATA[Multimodal AI]]></category>
		<category><![CDATA[Tutorial]]></category>
		<category><![CDATA[agentic ai]]></category>
		<category><![CDATA[audio encoder]]></category>
		<category><![CDATA[function calling]]></category>
		<category><![CDATA[gemma 4]]></category>
		<category><![CDATA[google deepmind]]></category>
		<category><![CDATA[grouped query attention]]></category>
		<category><![CDATA[hugging face transformers]]></category>
		<category><![CDATA[kv cache]]></category>
		<category><![CDATA[large language models]]></category>
		<category><![CDATA[llm architecture]]></category>
		<category><![CDATA[long context]]></category>
		<category><![CDATA[mixture of experts]]></category>
		<category><![CDATA[model benchmarks]]></category>
		<category><![CDATA[model optimization]]></category>
		<category><![CDATA[moe model]]></category>
		<category><![CDATA[multimodal ai]]></category>
		<category><![CDATA[open weight models]]></category>
		<category><![CDATA[RoPE]]></category>
		<category><![CDATA[tutorial]]></category>
		<category><![CDATA[vision transformer]]></category>
		<guid isPermaLink="false">https://pyimagesearch.com/?p=54330</guid>

					<description><![CDATA[<p>Table of Contents Google DeepMind’s Gemma 4: MoE, Efficiency Tricks, and Benchmarks Gemma 4 Model Family Overview: E2B, E4B, 31B, and MoE 26B A4B Gemma 4 Capabilities: Reasoning, Multimodal AI, and Thinking Mode Gemma 4 Thinking Mode: Chain-of-Thought Reasoning Explained&#8230;</p>
<p>The post <a rel="nofollow" href="https://pyimagesearch.com/2026/06/22/google-deepminds-gemma-4-moe-efficiency-tricks-and-benchmarks/">Google DeepMind&#8217;s Gemma 4: MoE, Efficiency Tricks, and Benchmarks</a> appeared first on <a rel="nofollow" href="https://pyimagesearch.com">PyImageSearch</a>.</p>
]]></description>
										<content:encoded><![CDATA[
<hr class="wp-block-separator has-alpha-channel-opacity" id="TOC"/>


<div class="yoast-breadcrumbs"><span><span><a href="https://pyimagesearch.com/">Home</a></span></span></div>


<div class="toc">
<hr class="TOC"/>
<p class="has-large-font-size"><strong>Table of Contents</strong></p>
<ul>
    <li id="TOC-h1-Google-DeepMind-Gemma-4-MoE-Efficiency-Tricks-Benchmarks"><a rel="noopener" target="_blank" href="#h1-Google-DeepMind-Gemma-4-MoE-Efficiency-Tricks-Benchmarks">Google DeepMind’s Gemma 4: MoE, Efficiency Tricks, and Benchmarks</a></li>

    <li id="TOC-h2-Gemma-4-Model-Family-Overview-E2B-E4B-31B-MoE-26B-A4B"><a rel="noopener" target="_blank" href="#h2-Gemma-4-Model-Family-Overview-E2B-E4B-31B-MoE-26B-A4B">Gemma 4 Model Family Overview: E2B, E4B, 31B, and MoE 26B A4B</a></li>

    <li id="TOC-h2-Gemma-4-Capabilities-Reasoning-Multimodal-AI-Thinking-Mode"><a rel="noopener" target="_blank" href="#h2-Gemma-4-Capabilities-Reasoning-Multimodal-AI-Thinking-Mode">Gemma 4 Capabilities: Reasoning, Multimodal AI, and Thinking Mode</a></li>
    <ul>
        <li id="TOC-h3-Gemma-4-Thinking-Mode-Chain-of-Thought-Reasoning-Explained"><a rel="noopener" target="_blank" href="#h3-Gemma-4-Thinking-Mode-Chain-of-Thought-Reasoning-Explained">Gemma 4 Thinking Mode: Chain-of-Thought Reasoning Explained</a></li>
        <li id="TOC-h3-Image-Understanding-Object-Detection-OCR-GUI-Navigation"><a rel="noopener" target="_blank" href="#h3-Image-Understanding-Object-Detection-OCR-GUI-Navigation">Image Understanding: Object Detection, OCR, and GUI Navigation</a></li>
        <li id="TOC-h3-Gemma-4-Code-Generation-Images-UI-Reconstruction-Vision-to-Code"><a rel="noopener" target="_blank" href="#h3-Gemma-4-Code-Generation-Images-UI-Reconstruction-Vision-to-Code">Gemma 4 Code Generation from Images: UI Reconstruction and Vision-to-Code</a></li>
        <li id="TOC-h3-Gemma-4-Video-Understanding-Multimodal-Temporal-Reasoning"><a rel="noopener" target="_blank" href="#h3-Gemma-4-Video-Understanding-Multimodal-Temporal-Reasoning">Gemma 4 Video Understanding: Multimodal Temporal Reasoning</a></li>
        <li id="TOC-h3-Gemma-4-Audio-AI-Speech-Recognition-Translation-Audio-QA"><a rel="noopener" target="_blank" href="#h3-Gemma-4-Audio-AI-Speech-Recognition-Translation-Audio-QA">Gemma 4 Audio AI: Speech Recognition, Translation, and Audio Q&amp;A</a></li>
        <li id="TOC-h3-Gemma-4-Function-Calling-Tool-Use-Agentic-AI-Workflows"><a rel="noopener" target="_blank" href="#h3-Gemma-4-Function-Calling-Tool-Use-Agentic-AI-Workflows">Gemma 4 Function Calling: Tool Use and Agentic AI Workflows</a></li>
        <li id="TOC-h3-Gemma-4-System-Prompts-Instruction-Control-Chat-Behavior"><a rel="noopener" target="_blank" href="#h3-Gemma-4-System-Prompts-Instruction-Control-Chat-Behavior">Gemma 4 System Prompts: Instruction Control and Chat Behavior</a></li>
    </ul>

    <li id="TOC-h2-Gemma-4-Architecture-Overview-Shared-Transformer-Design-Principles"><a rel="noopener" target="_blank" href="#h2-Gemma-4-Architecture-Overview-Shared-Transformer-Design-Principles">Gemma 4 Architecture Overview: Shared Transformer Design Principles</a></li>
    <ul>
        <li id="TOC-h3-Gemma-4-Attention-Mechanism-Local-Global-Interleaved-Attention-Explained"><a rel="noopener" target="_blank" href="#h3-Gemma-4-Attention-Mechanism-Local-Global-Interleaved-Attention-Explained">Gemma 4 Attention Mechanism: Local + Global Interleaved Attention Explained</a></li>
        <li id="TOC-h3-Gemma-4-Efficiency-Tricks-GQA-K-V-Caching-Memory-Optimization"><a rel="noopener" target="_blank" href="#h3-Gemma-4-Efficiency-Tricks-GQA-K-V-Caching-Memory-Optimization">Gemma 4 Efficiency Tricks: GQA, K=V Caching, and Memory Optimization</a></li>
        <li id="TOC-h3-Gemma-4-Vision-Encoder-ViT-Based-Image-Processing-Architecture"><a rel="noopener" target="_blank" href="#h3-Gemma-4-Vision-Encoder-ViT-Based-Image-Processing-Architecture">Gemma 4 Vision Encoder: ViT-Based Image Processing Architecture</a></li>
    </ul>

    <li id="TOC-h2-Gemma-4-Architecture-Variants-Dense-vs-MoE-vs-On-Device-Models"><a rel="noopener" target="_blank" href="#h2-Gemma-4-Architecture-Variants-Dense-vs-MoE-vs-On-Device-Models">Gemma 4 Architecture Variants: Dense vs MoE vs On-Device Models</a></li>
    <ul>
        <li id="TOC-h3-Gemma-4-31B-Dense-Baseline"><a rel="noopener" target="_blank" href="#h3-Gemma-4-31B-Dense-Baseline">Gemma 4 31B: The Dense Baseline</a></li>
        <li id="TOC-h3-Gemma-4-26B-A4B-MoE-Sparse-Experts-Efficient-Inference-Explained"><a rel="noopener" target="_blank" href="#h3-Gemma-4-26B-A4B-MoE-Sparse-Experts-Efficient-Inference-Explained">Gemma 4 26B A4B MoE: Sparse Experts and Efficient Inference Explained</a></li>
        <li id="TOC-h3-Gemma-4-E2B-E4B-On-Device-Multimodal-AI-Models-Edge-Deployment"><a rel="noopener" target="_blank" href="#h3-Gemma-4-E2B-E4B-On-Device-Multimodal-AI-Models-Edge-Deployment">Gemma 4 E2B and E4B: On-Device Multimodal AI Models for Edge Deployment</a></li>
    </ul>

    <li id="TOC-h2-Gemma-4-Hardware-Requirements-GPU-VRAM-Inference-Cost-Breakdown"><a rel="noopener" target="_blank" href="#h2-Gemma-4-Hardware-Requirements-GPU-VRAM-Inference-Cost-Breakdown">Gemma 4 Hardware Requirements: GPU VRAM and Inference Cost Breakdown</a></li>

    <li id="TOC-h2-Gemma-4-Benchmarks-LMArena-Elo-Scores-Multimodal-Performance-Results"><a rel="noopener" target="_blank" href="#h2-Gemma-4-Benchmarks-LMArena-Elo-Scores-Multimodal-Performance-Results">Gemma 4 Benchmarks: LMArena Elo Scores and Multimodal Performance Results</a></li>

    <li id="TOC-h2-How-Run-Gemma-4-Transformers-llama-cpp-MLX-Cloud-Deployment-Options"><a rel="noopener" target="_blank" href="#h2-How-Run-Gemma-4-Transformers-llama-cpp-MLX-Cloud-Deployment-Options">How to Run Gemma 4: Transformers, llama.cpp, MLX, and Cloud Deployment Options</a></li>

    <li id="TOC-h2-Fine-Tuning-Gemma-4-LoRA-QLoRA-TRL-Training-Pipeline-Guide"><a rel="noopener" target="_blank" href="#h2-Fine-Tuning-Gemma-4-LoRA-QLoRA-TRL-Training-Pipeline-Guide">Fine-Tuning Gemma 4: LoRA, QLoRA, and TRL Training Pipeline Guide</a></li>

    <li id="TOC-h2-Gemma-4-Prompt-Formatting-Chat-Templates-Multimodal-Input-Structure"><a rel="noopener" target="_blank" href="#h2-Gemma-4-Prompt-Formatting-Chat-Templates-Multimodal-Input-Structure">Gemma 4 Prompt Formatting: Chat Templates and Multimodal Input Structure</a></li>

    <li id="TOC-h2-Which-Gemma-4-Model-Use-E2B-vs-E4B-vs-26B-MoE-vs-31B-Comparison"><a rel="noopener" target="_blank" href="#h2-Which-Gemma-4-Model-Use-E2B-vs-E4B-vs-26B-MoE-vs-31B-Comparison">Which Gemma 4 Model to Use: E2B vs E4B vs 26B MoE vs 31B Comparison</a></li>

    <li id="TOC-h2-Summary"><a rel="noopener" target="_blank" href="#h2-Summary">Summary</a></li>
    <ul>
        <li id="TOC-h3-Citation-Information"><a rel="noopener" target="_blank" href="#h3-Citation-Information">Citation Information</a></li>
    </ul>
</ul>
</div>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h1-Google-DeepMind-Gemma-4-MoE-Efficiency-Tricks-Benchmarks"/>



<h2 class="wp-block-heading"><a href="#TOC-h1-Google-DeepMind-Gemma-4-MoE-Efficiency-Tricks-Benchmarks">Google DeepMind’s Gemma 4: MoE, Efficiency Tricks, and Benchmarks</a></h2>



<p>Google DeepMind&#8217;s <strong><a href="https://deepmind.google/models/gemma/gemma-4/" target="_blank" rel="noreferrer noopener">Gemma 4</a></strong> is one of the most compelling open-weight model releases in recent memory. It&#8217;s not just one model; it is a carefully designed family spanning from tiny on-device variants to a 31-billion-parameter powerhouse, all built with multimodal reasoning, long context, and real deployment constraints in mind. And crucially, these models are released under an Apache 2.0 license, meaning you can use, modify, and deploy them commercially without restriction.</p>


<div class="wp-block-image">
<figure class="aligncenter size-full"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/google-deepminds-gemma-4-moe-efficiency-tricks-benchmarks-featured.png" target="_blank" rel=" noreferrer noopener"><img fetchpriority="high" decoding="async" width="940" height="780" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/google-deepminds-gemma-4-moe-efficiency-tricks-benchmarks-featured.png?lossy=2&strip=1&webp=1" alt="google-deepminds-gemma-4-moe-efficiency-tricks-benchmarks-featured.png" class="wp-image-54362" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/google-deepminds-gemma-4-moe-efficiency-tricks-benchmarks-featured.png?size=126x105&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/google-deepminds-gemma-4-moe-efficiency-tricks-benchmarks-featured-300x249.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/google-deepminds-gemma-4-moe-efficiency-tricks-benchmarks-featured.png?size=378x314&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/google-deepminds-gemma-4-moe-efficiency-tricks-benchmarks-featured.png?size=504x418&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/google-deepminds-gemma-4-moe-efficiency-tricks-benchmarks-featured.png?size=630x523&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/google-deepminds-gemma-4-moe-efficiency-tricks-benchmarks-featured-768x637.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/google-deepminds-gemma-4-moe-efficiency-tricks-benchmarks-featured.png?lossy=2&amp;strip=1&amp;webp=1 940w" sizes="(max-width: 630px) 100vw, 630px" /></a></figure></div>


<p>In this post, we will look under the hood and explain what makes Gemma 4 tick, including the architecture, the clever efficiency tricks, the multimodal capabilities, what hardware you actually need to run these models, and how to get started in code. No deep prior knowledge of transformers is required, though some familiarity will help.</p>



<p>Whether you are evaluating Gemma 4 for a production use case, curious about the architecture, or just want to know which variant to reach for, this post has you covered.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-75-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="839" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-75-1024x839.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-54364" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-75.png?size=126x103&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-75-300x246.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-75.png?size=378x310&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-75.png?size=504x413&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-75.png?size=630x516&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-75-768x629.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-75-1024x839.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-75-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 1:</strong> Gemma 4 Architecture (source: <a href="https://newsletter.maartengrootendorst.com/p/a-visual-guide-to-gemma-4" target="_blank" rel="noreferrer noopener">Grootendorst, 2026</a>)</figcaption></figure></div>


<p>This lesson is the 1st in a 5-part series on <strong>Google DeepMind&#8217;s Gemma 4</strong>:</p>



<ol class="wp-block-list">
<li><em><strong><a href="https://pyimg.co/uqxzw" target="_blank" rel="noreferrer noopener">Google DeepMind’s Gemma 4: MoE, Efficiency Tricks, and Benchmarks</a></strong></em><strong> (this tutorial)</strong></li>



<li><em>Lesson 2</em></li>



<li><em>Lesson 3</em></li>



<li><em>Lesson 4</em></li>



<li><em>Lesson 5</em></li>
</ol>



<p><strong>To learn how Gemma 4&#8217;s architecture, Mixture-of-Experts design, multimodal capabilities, and efficiency optimizations work, <em>just keep reading.</em></strong></p>



<hr class="wp-block-separator has-alpha-channel-opacity"/>



<p>Would you like immediate access to 3,457 images curated and labeled with hand gestures to train, explore, and experiment with &#8230; for free? Head over to <a href="https://universe.roboflow.com/isl/az-6mqow?ref=pyimagesearch" target="_blank" rel="noreferrer noopener">Roboflow</a> and get a free account to grab these hand gesture images. </p>



<hr class="wp-block-separator has-alpha-channel-opacity"/>



<!-- wp:paragraph -->
<h3>Need Help Configuring Your Development Environment?</h3>
<!-- /wp:paragraph -->

<!-- wp:image {"align":"center","id":18137,"sizeSlug":"large","linkDestination":"custom"} -->
<figure class="wp-block-image aligncenter size-large"><a href="https://pyimagesearch.com/pyimagesearch-university/" target="_blank" rel="noreferrer noopener"><img decoding="async" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-18137" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?lossy=2&strip=1&webp=1 500w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?size=126x84&lossy=2&strip=1&webp=1 126w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?size=252x168&lossy=2&strip=1&webp=1 252w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?size=378x253&lossy=2&strip=1&webp=1 378w" sizes="(max-width: 500px) 100vw, 500px" /></a><figcaption>Having trouble configuring your development environment? Want access to pre-configured Jupyter Notebooks running on Google Colab? Be sure to join <a href="https://pyimagesearch.com/pyimagesearch-university/" target="_blank" rel="noreferrer noopener" aria-label=" (opens in a new tab)">PyImageSearch University</a> — you will be up and running with this tutorial in a matter of minutes. </figcaption></figure>
<!-- /wp:image -->

<!-- wp:paragraph -->
<p>All that said, are you:</p>
<!-- /wp:paragraph -->

<!-- wp:list -->
<ul><li>Short on time?</li><li>Learning on your employer’s administratively locked system?</li><li>Wanting to skip the hassle of fighting with the command line, package managers, and virtual environments?</li><li><strong>Ready to run the code immediately on your Windows, macOS, or Linux system?</strong></li></ul>
<!-- /wp:list -->

<!-- wp:paragraph -->
<p>Then join <a href="https://pyimagesearch.com/pyimagesearch-university/" target="_blank">PyImageSearch University</a> today!</p>
<!-- /wp:paragraph -->

<!-- wp:paragraph -->
<p><strong>Gain access to Jupyter Notebooks for this tutorial and other PyImageSearch guides pre-configured to run on Google Colab’s ecosystem right in your web browser!</strong> No installation required.</p>
<!-- /wp:paragraph -->

<!-- wp:paragraph -->
<p>And best of all, these Jupyter Notebooks will run on Windows, macOS, and Linux!</p>
<!-- /wp:paragraph -->



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Gemma-4-Model-Family-Overview-E2B-E4B-31B-MoE-26B-A4B"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Gemma-4-Model-Family-Overview-E2B-E4B-31B-MoE-26B-A4B">Gemma 4 Model Family Overview: E2B, E4B, 31B, and MoE 26B A4B</a></h2>



<p>Before diving into how these models work, let us first look at the lineup. There are 4 models:</p>



<p><strong>Gemma 4 E2B and E4B:</strong> The smallest models in the family, designed to run efficiently on-device (think: your phone). The &#8220;E&#8221; stands for <em>effective parameters</em>, a concept we&#8217;ll unpack below. They support text, images, and even audio.</p>



<p><strong>Gemma 4 31B:</strong> A dense 31-billion parameter model. Dense means every parameter participates in every inference pass. Think of it as the &#8220;traditional&#8221; heavyweight.</p>



<p><strong>Gemma 4 26B A4B:</strong> A Mixture-of-Experts model with 26 billion total parameters, but only 4 billion &#8220;active&#8221; during any given computation (inference). The &#8220;A&#8221; stands for <em>active parameters</em>. It runs with the speed of a 4B model despite its much larger knowledge capacity.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-76-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="423" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-76-1024x423.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-54366" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-76.png?size=126x52&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-76-300x124.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-76.png?size=378x156&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-76.png?size=504x208&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-76.png?size=630x260&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-76-768x317.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-76-1024x423.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-76-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Table 1:</strong> Gemma 4 Model Family Overview (source: <a href="https://deepmind.google/models/gemma/gemma-4/" target="_blank" rel="noreferrer noopener">Google DeepMind</a>)</figcaption></figure></div>


<p>The lineup spans from phone-friendly to server-grade, so you can pick the right model for your constraints. All 4 models are <strong>multimodal</strong>; they can reason over images alongside text. The 2 smaller models (E2B and E4B) go a step further and also handle <strong>audio</strong>.</p>



<p>Every model ships in both a base (pre-trained) and instruction-tuned (IT) version. The instruction-tuned versions are what most practitioners will want to use for tasks like chat, reasoning, and function-calling.</p>



<p>All 4 models are available on <a href="https://huggingface.co/collections/google/gemma-4" target="_blank" rel="noreferrer noopener">Hugging Face</a>, <a href="https://www.kaggle.com/models/google/gemma-4" target="_blank" rel="noreferrer noopener">Kaggle</a>, <a href="https://ollama.com/library/gemma4" target="_blank" rel="noreferrer noopener">Ollama</a>, <a href="https://lmstudio.ai/models/gemma-4" target="_blank" rel="noreferrer noopener">LM Studio</a>, and <a href="https://hub.docker.com/r/ai/gemma4" target="_blank" rel="noreferrer noopener">Docker</a>. They can also be run via Transformers, llama.cpp, MLX, and several other popular inference stacks.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Gemma-4-Capabilities-Reasoning-Multimodal-AI-Thinking-Mode"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Gemma-4-Capabilities-Reasoning-Multimodal-AI-Thinking-Mode">Gemma 4 Capabilities: Reasoning, Multimodal AI, and Thinking Mode</a></h2>



<p>Before getting into architecture, it is worth understanding the capabilities these models were trained and evaluated for. The design choices only make sense in that context.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Gemma-4-Thinking-Mode-Chain-of-Thought-Reasoning-Explained"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Gemma-4-Thinking-Mode-Chain-of-Thought-Reasoning-Explained">Gemma 4 Thinking Mode: Chain-of-Thought Reasoning Explained</a></h3>



<p>All Gemma 4 models are designed as capable reasoners with a configurable &#8220;thinking mode.&#8221; When enabled, the model produces an internal chain-of-thought before arriving at its final answer, similar in spirit to what you would see with OpenAI&#8217;s o-series or Anthropic&#8217;s extended thinking. This is particularly valuable for math, logic, and multi-step planning tasks.</p>



<p>Thinking can be toggled per-request. In the Transformers API, you enable it by passing <code data-enlighter-language="python" class="EnlighterJSRAW">enable_thinking=True</code> to the <code data-enlighter-language="python" class="EnlighterJSRAW">apply_chat_template</code> call:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Google DeepMind’s Gemma 4: MoE, Efficiency Tricks, and Benchmarks" data-enlighter-group="1">inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
    add_generation_prompt=True,
    enable_thinking=True,  # activates chain-of-thought mode
).to(model.device)
</pre>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Image-Understanding-Object-Detection-OCR-GUI-Navigation"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Image-Understanding-Object-Detection-OCR-GUI-Navigation">Image Understanding: Object Detection, OCR, and GUI Navigation</a></h3>



<p>The vision capabilities in Gemma 4 are genuinely impressive, especially for an open-weight model. All 4 model sizes could reliably perform bounding-box detection, returning results natively as structured JSON without any special grammar constraints or prompting tricks.</p>



<p>For example, given a UI screenshot and the prompt &#8220;What&#8217;s the bounding box for the &#8216;submit&#8217; button?&#8221;, the model returns something like:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="json" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Google DeepMind’s Gemma 4: MoE, Efficiency Tricks, and Benchmarks" data-enlighter-group="2">[{"box_2d": [171, 75, 245, 308], "label": "view recipe element"}]
</pre>



<p>The coordinates are normalized to a 1000×1000 grid regardless of the original image dimensions, which makes post-processing straightforward. This makes Gemma 4 a strong candidate for tasks like automated UI testing, document parsing, and robotic process automation.</p>



<p>Image captioning was tested across all 4 sizes and all performed well, accurately capturing details such as the type of bird, the architectural style of background buildings, and whether the scene was indoors or outdoors. Even the tiny E2B model produced detailed and accurate captions.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Gemma-4-Code-Generation-Images-UI-Reconstruction-Vision-to-Code"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Gemma-4-Code-Generation-Images-UI-Reconstruction-Vision-to-Code">Gemma 4 Code Generation from Images: UI Reconstruction and Vision-to-Code</a></h3>



<p>One standout test: we gave each model a screenshot of a webpage and asked it to write the HTML to recreate it. With thinking mode enabled and a token budget of 4,000 output tokens, the larger models (26B A4B and 31B) produced near-faithful reproductions. The smaller E4B model held its own remarkably well, while E2B showed the expected drop-off in fidelity.</p>



<p>This capability to understand a visual layout and translate it into working code has real applications for prototyping, design-to-code workflows, and accessibility tooling.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Gemma-4-Video-Understanding-Multimodal-Temporal-Reasoning"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Gemma-4-Video-Understanding-Multimodal-Temporal-Reasoning">Gemma 4 Video Understanding: Multimodal Temporal Reasoning</a></h3>



<p>Gemma 4 can process video input, though capabilities differ by size. The smaller E2B and E4B models accept video <em>with</em> audio, treating it as a combined audio-visual signal. The larger 31B and 26B A4B models accept video <em>without</em> audio because they lack an audio encoder, which we will discuss below.</p>



<p>In informal testing with a live concert video, E4B correctly identified the genre of music, the mood of the song lyrics, and the stage setup and crowd. The 31B model gave a detailed description of the visual elements and even identified a brand visible on a large screen, despite not having access to audio. Neither model had been explicitly fine-tuned on video data; this capability emerged from the multimodal training.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Gemma-4-Audio-AI-Speech-Recognition-Translation-Audio-QA"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Gemma-4-Audio-AI-Speech-Recognition-Translation-Audio-QA">Gemma 4 Audio AI: Speech Recognition, Translation, and Audio Q&amp;A</a></h3>



<p>The E2B and E4B models include a dedicated audio encoder, enabling end-to-end speech understanding. This is novel for an open-weight model at this scale. Practically, it means you can send raw audio (as an MP4 or audio file) and ask the model questions about the audio, with no separate transcription step required.</p>



<p>This is particularly useful for:</p>



<ul class="wp-block-list">
<li>Automatic speech recognition (ASR) in a single-model pipeline</li>



<li>Multilingual audio translation</li>



<li>Video Q&amp;A where both the speech and visuals matter</li>
</ul>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Gemma-4-Function-Calling-Tool-Use-Agentic-AI-Workflows"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Gemma-4-Function-Calling-Tool-Use-Agentic-AI-Workflows">Gemma 4 Function Calling: Tool Use and Agentic AI Workflows</a></h3>



<p>Gemma 4 has built-in support for structured function/tool calling, both in text-only and multimodal contexts. This is essential for building agents: systems in which the model needs to decide which tool to invoke, with what arguments, in response to a user request. The fact that this is natively supported (rather than requiring prompt-engineering workarounds) makes Gemma 4 a serious option for agentic workflows running locally or in constrained environments.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Gemma-4-System-Prompts-Instruction-Control-Chat-Behavior"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Gemma-4-System-Prompts-Instruction-Control-Chat-Behavior">Gemma 4 System Prompts: Instruction Control and Chat Behavior</a></h3>



<p>Gemma 4 introduces first-class support for the <code data-enlighter-language="python" class="EnlighterJSRAW">system</code> role in conversations. In prior Gemma versions, system-level instructions had to be blended into the user turn in ad hoc ways. Now the model is trained to recognize and respect a proper system prompt, which makes deploying it inside structured applications (where you want to set tone, persona, or capabilities) significantly cleaner.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Gemma-4-Architecture-Overview-Shared-Transformer-Design-Principles"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Gemma-4-Architecture-Overview-Shared-Transformer-Design-Principles">Gemma 4 Architecture Overview: Shared Transformer Design Principles</a></h2>



<p>Despite their size differences, all Gemma 4 models share the same core architectural DNA. Let us go through each shared component one by one.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Gemma-4-Attention-Mechanism-Local-Global-Interleaved-Attention-Explained"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Gemma-4-Attention-Mechanism-Local-Global-Interleaved-Attention-Explained">Gemma 4 Attention Mechanism: Local + Global Interleaved Attention Explained</a></h3>



<p>To appreciate what Gemma 4 does here, you first need to understand what &#8220;attention&#8221; means in a transformer model.</p>



<p><strong>The classic attention problem:</strong> In a standard transformer, every word in your input looks at every other word to figure out context. This is called <em>full</em> or <em>global</em> attention. It is powerful but brutally expensive because the computation grows with the <em>square</em> of the input length. Double your input length, and you quadruple the cost.</p>



<p><strong>Sliding window attention (local attention):</strong> Imagine reading a book, but instead of remembering every page you&#8217;ve ever read, you can only reference the last 5 pages. That&#8217;s sliding window attention. Each token only attends to the N most recent tokens (a &#8220;window&#8221;), not the entire sequence. This is dramatically cheaper to compute.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-77-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="529" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-77-1024x529.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-54370" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-77.png?size=126x65&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-77-300x155.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-77.png?size=378x195&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-77.png?size=504x260&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-77.png?size=630x325&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-77-768x397.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-77-1024x529.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-77-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 2:</strong> Global Attention vs Sliding Window Attention (source: <a href="https://newsletter.maartengrootendorst.com/p/a-visual-guide-to-gemma-4" target="_blank" rel="noreferrer noopener">Grootendorst, 2026</a>)</figcaption></figure></div>


<p>Here is the tradeoff made tangible: say you are generating a response to a long legal document. With a sliding window of 512 tokens, any given token looks only at the 512 tokens before it, rather than the entire 10,000-token document. That saves enormous compute, but risks losing context from early in the document.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-78-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="263" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-78-1024x263.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-54373" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-78.png?size=126x32&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-78-300x77.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-78.png?size=378x97&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-78.png?size=504x129&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-78.png?size=630x162&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-78-768x197.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-78-1024x263.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-78-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 3:</strong> Global Attention vs Sliding Window Attention (source: <a href="https://newsletter.maartengrootendorst.com/p/a-visual-guide-to-gemma-4" target="_blank" rel="noreferrer noopener">Grootendorst, 2026</a>)</figcaption></figure></div>


<p><strong>The interleaving solution:</strong> Gemma 4 does not pick one strategy; it alternates between them across layers. Most layers use the efficient sliding window, but every few layers, a full global attention layer kicks in and &#8220;resets&#8221; the context by attending to everything. Think of it like a student who mostly skims through dense reading, but every few chapters pauses to re-read everything they have covered.</p>



<p>In practice, the E2B model uses a 4-local-to-1-global pattern. All other models use a 5:1 ratio. Crucially, Gemma 4 ensures the <em>final</em> layer is always a global attention layer, so the model&#8217;s last word on any sequence is fully informed, a deliberate fix from Gemma 3 where the last layer could end up being local.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-79-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="813" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-79-1024x813.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-54375" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-79.png?size=126x100&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-79-300x238.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-79.png?size=378x300&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-79.png?size=504x400&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-79.png?size=630x500&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-79-768x610.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-79-1024x813.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-79-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 4:</strong> Gemma 3 vs Gemma 4 attention mechanism (source: <a href="https://newsletter.maartengrootendorst.com/p/a-visual-guide-to-gemma-4" target="_blank" rel="noreferrer noopener">Grootendorst, 2026</a>)</figcaption></figure></div>


<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Gemma-4-Efficiency-Tricks-GQA-K-V-Caching-Memory-Optimization"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Gemma-4-Efficiency-Tricks-GQA-K-V-Caching-Memory-Optimization">Gemma 4 Efficiency Tricks: GQA, K=V Caching, and Memory Optimization</a></h3>



<p>Even with interleaving, global attention layers are still the most expensive part of the model. Gemma 4 adds three further tricks to tame that cost.</p>



<h4 class="wp-block-heading">Grouped Query Attention (GQA)</h4>



<p>In standard multi-head attention, every &#8220;head&#8221; maintains its own set of Key and Value matrices. This creates a large memory footprint because all of these have to be cached during generation (this is called the KV-cache).</p>



<p>GQA is the idea that multiple Query heads can <em>share</em> the same set of Keys and Values. Imagine 8 students all reading from the same textbook instead of each having their own: the same knowledge, with much less paper.</p>



<p>In Gemma 4&#8217;s global attention layers, 8 Query heads share a single KV pair. This dramatically reduces what needs to be stored in the cache, which is especially significant because global attention has to cache the <em>entire</em> context (versus the local attention layers, which only cache a small window).</p>



<p>To compensate for any quality loss from fewer KV heads, Gemma 4 doubles the dimensionality of the Keys, giving each shared Key more expressive capacity.</p>



<h4 class="wp-block-heading">Keys Equal Values (K=V)</h4>



<p>Here&#8217;s an even bolder efficiency move: in global attention layers, Gemma 4 sets the Key and Value matrices to be identical. Instead of storing both K and V separately in cache, you only need to store one. The KV-cache effectively becomes a K-cache for those layers, cutting memory requirements in half at that level.</p>



<p>This sounds like it might hurt quality significantly, but in practice the performance impact turns out to be modest, a good trade for the memory savings.</p>



<h4 class="wp-block-heading">p-RoPE: Smarter Positional Encoding</h4>



<p>To understand this trick, you need to know how transformers track word order. Because attention has no built-in sense of sequence (unlike an RNN), position is injected into embeddings explicitly. The popular method for this is <strong>Rotary Positional Encoding (RoPE)</strong>.</p>



<p><strong>How RoPE works:</strong> Each embedding vector is split into pairs of values. Each pair is thought of as a 2D vector pointing in some direction. RoPE <em>rotates</em> each pair by a position-dependent angle, so earlier words get one rotation, later words get another. By comparing how much two vectors have been rotated, the model can infer their relative distance.</p>



<p>The rotation speeds vary: the first pairs rotate quickly (high frequency) and the last pairs rotate very slowly (low frequency). The high-frequency pairs are great for tracking <em>where</em> a word is. The low-frequency pairs rotate so little that they barely carry positional information at all, making them closer to the raw semantic meaning of the word.</p>



<p>Here is the problem Gemma 4 solves: over very long sequences, even those tiny low-frequency rotations accumulate and start to introduce misleading positional noise into what should be a semantic signal. Think of it like a clock&#8217;s hour hand being used to measure seconds, where the movement is technically there but too small to be meaningful and can cause errors.</p>



<p><strong>p-RoPE</strong> (pruned RoPE) solves this elegantly: apply rotational encoding only to the first <em>p</em> fraction of pairs, and leave the rest unrotated (it is their rotation that is zeroed out, not their values). If p = 0.25, only the first 25% of pairs (the high-frequency, positional ones) get rotation. The low-frequency pairs are left clean, with pure semantic content and no positional noise. This is especially important in global attention, where the context can span tens of thousands of tokens.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Gemma-4-Vision-Encoder-ViT-Based-Image-Processing-Architecture"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Gemma-4-Vision-Encoder-ViT-Based-Image-Processing-Architecture">Gemma 4 Vision Encoder: ViT-Based Image Processing Architecture</a></h3>



<p>All four Gemma 4 models are multimodal, meaning they can reason about images as well as text. To make this work, images need to be converted into a format the language model can process. The component responsible for this is the <strong>Vision Encoder</strong>, built on a Vision Transformer (ViT).</p>



<p><strong>The core idea of a ViT:</strong> Rather than treating an image as a grid of pixels, a ViT slices the image into fixed-size patches (typically 16×16 pixels each) and treats each patch like a &#8220;word token.&#8221; The sequence of patches goes through a transformer, which produces an embedding for each patch capturing its visual content and context.</p>



<h4 class="wp-block-heading">Handling Variable Aspect Ratios with 2D RoPE</h4>



<p>Standard ViTs assume a square input image with a fixed grid of patches. But real-world images come in all shapes (e.g., wide panoramas, tall portraits, and square thumbnails). Forcing every image into a square distorts content and destroys spatial relationships.</p>



<p>Gemma 4 addresses this by using <strong>2D RoPE</strong> for its vision encoder. Instead of encoding patches with a single 1D position (patch 1, patch 2, patch 3, etc.), each patch is given a 2D position: its (row, column) coordinates in the image grid. The patch embedding is split into two halves where one half encodes the horizontal position, and the other encodes the vertical position. This way, a patch in the upper-left corner of a wide landscape and a patch in the upper-left corner of a tall portrait both correctly identify themselves as &#8220;top-left,&#8221; regardless of the total number of patches.</p>



<p>Images are also adaptively resized to maintain the original aspect ratio while ensuring the dimensions are multiples of 16 (the patch size), with padding added where needed.</p>



<h4 class="wp-block-heading">Soft Token Budget: Controlling Variable Resolution</h4>



<p>More patches mean more tokens fed into the language model, which increases computational cost. To give developers control over this, Gemma 4 introduces a <strong>soft token budget</strong>: a configurable cap on how many visual tokens are processed by the LLM.</p>



<p>Here&#8217;s a concrete example. Suppose you set a budget of 280 tokens. The model will resize your image so that the total number of resulting tokens, after pooling every 3×3 block of patches into a single embedding, stays within 280. A budget of 1120 tokens lets high-resolution images through with much more visual detail; a budget of 70 tokens dramatically downsamples the image. The right budget depends on your task:</p>



<ul class="wp-block-list">
<li>Describing a photo? 70–140 tokens is probably fine.</li>



<li>Reading a scanned invoice with fine print? You&#8217;d want 560–1120 tokens.</li>



<li>Analyzing consecutive video frames quickly? Lower budgets keep things fast.</li>
</ul>



<h4 class="wp-block-heading">Linear Projection: Bridging Vision and Language</h4>



<p>The patch embeddings produced by the ViT live in a different dimensional space than the word embeddings Gemma 4 was trained on. Feeding mismatched embeddings into the language model would be like asking someone to add meters and kilograms, which makes no sense.</p>



<p>To solve this, a small neural network called a <strong>linear projection</strong> learns to map vision embeddings into the exact dimensional space Gemma 4 expects. This projection is trained alongside the language model so it perfectly aligns the two embedding spaces. A normalization step (RMSNorm) follows the projection to ensure the scale of visual embeddings matches what the transformer layers anticipate.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Gemma-4-Architecture-Variants-Dense-vs-MoE-vs-On-Device-Models"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Gemma-4-Architecture-Variants-Dense-vs-MoE-vs-On-Device-Models">Gemma 4 Architecture Variants: Dense vs MoE vs On-Device Models</a></h2>



<p>Now that you understand what all Gemma 4 models share, let us look at what makes each variant distinctive.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Gemma-4-31B-Dense-Baseline"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Gemma-4-31B-Dense-Baseline">Gemma 4 31B: The Dense Baseline</a></h3>



<p>The 31B model is the most architecturally conventional in the family. It is a <strong>dense transformer</strong>, meaning every parameter is used on every forward pass. Think of it as a large, all-purpose Swiss Army knife: every tool is always there, every tool can always be used.</p>



<p>Its architecture closely follows Gemma 3&#8217;s 27B model in spirit, but applies all the global attention improvements we&#8217;ve described: K=V, 8-query GQA, doubled Key dimensions, and p-RoPE. It has 60 layers (slightly fewer than Gemma 3&#8217;s 27B model with 62 layers) but compensates with a wider hidden dimension, meaning more parameters per layer rather than more layers.</p>



<p>For most inference scenarios that require a powerful, capable model without the complexity of MoE routing, this is the model to reach for.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Gemma-4-26B-A4B-MoE-Sparse-Experts-Efficient-Inference-Explained"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Gemma-4-26B-A4B-MoE-Sparse-Experts-Efficient-Inference-Explained">Gemma 4 26B A4B MoE: Sparse Experts and Efficient Inference Explained</a></h3>



<p>This is where things get architecturally interesting. The 26B A4B model uses a design called <strong>Mixture of Experts (MoE)</strong> to achieve something remarkable: the knowledge capacity of a 26-billion-parameter model at roughly the inference cost of a 4-billion-parameter model.</p>



<h4 class="wp-block-heading">How Mixture of Experts Works</h4>



<p>In a standard (dense) transformer, every layer contains a single large feedforward neural network (FFNN) that processes every token. In a MoE layer, that single FFNN is replaced by a <em>collection</em> of smaller FFNNs called <strong>experts</strong>, plus a lightweight <strong>router</strong> network.</p>



<p>When a token arrives at a MoE layer, here&#8217;s what happens step by step:</p>



<ul class="wp-block-list">
<li>The router examines the token&#8217;s embedding and assigns a probability score to each expert.</li>



<li>The top-scoring experts are selected (in Gemma 4, 8 out of 128 experts are chosen).</li>



<li>Each selected expert processes the token independently and produces an output.</li>



<li>The outputs are weighted by the router&#8217;s probability scores and summed together.</li>
</ul>



<p>This means for any given token, only 8 experts are doing work, while the other 120 are idle. The total number of parameters that get loaded into memory (the &#8220;sparse&#8221; parameters) is 26B. But the number doing active computation (the &#8220;active&#8221; parameters) is only <img src='https://b2633864.smushcdn.com/2633864/wp-content/latex/658/6588c95074f2609674f5fe10ab63f88f-ffffff-000000-0.png?lossy=2&strip=1&webp=1' alt='\sim' title='\sim' class='latex' />4B. Hence: 26B A4B.</p>



<p>A good analogy: imagine a hospital with 128 specialist doctors, but any given patient only sees 8 of them during their visit. The hospital has the collective knowledge of all 128 doctors, but each consultation only draws on a relevant subset.</p>



<h4 class="wp-block-heading">The Shared Expert</h4>



<p>Gemma 4&#8217;s MoE adds one more element: a <strong>shared expert</strong> that is always activated for every single token, regardless of what the router decides. This expert is three times larger than the other experts.</p>



<p>The intuition is compelling. Some knowledge is universally useful (e.g., grammar, common-sense reasoning, and factual recall) and should always be applied. The shared expert holds this general knowledge. The routed experts hold more specialized knowledge that is selectively engaged depending on the content. This is similar to how you would always use your native language&#8217;s grammar rules (shared expert), but only pull out domain-specific vocabulary when discussing, say, molecular biology (a selected expert).</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Gemma-4-E2B-E4B-On-Device-Multimodal-AI-Models-Edge-Deployment"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Gemma-4-E2B-E4B-On-Device-Multimodal-AI-Models-Edge-Deployment">Gemma 4 E2B and E4B: On-Device Multimodal AI Models for Edge Deployment</a></h3>



<p>These are the smallest and most novel models in the family. They are designed to run on devices with severely limited RAM, with smartphones being the primary target. Two key innovations set them apart: <strong>Per-Layer Embeddings</strong>, which keep the RAM footprint small, and an <strong>Audio Encoder</strong>, which adds audio input.</p>



<h4 class="wp-block-heading">Per-Layer Embeddings (PLE): Teaching Each Layer Its Own Vocabulary</h4>



<p>In a standard transformer, each token is looked up in a single embedding table at the very start. That means one embedding per token, used everywhere. A richer context comes from stacking many transformer layers on top.</p>



<p>Per-Layer Embeddings take a different approach. Each token has not one embedding, but a <em>separate</em> embedding for every layer in the model. As an analogy: instead of greeting a visitor with one name badge, you give them a different badge for each room they will enter, with each badge describing their role in the context of that room&#8217;s purpose.</p>



<p>For the E2B model, this means 262,144 vocabulary tokens × 35 layers × 256 dimensions per layer-embedding. That&#8217;s a large table, but here&#8217;s the key insight: this table lives in <strong>flash storage</strong> (like your phone&#8217;s SSD), not in RAM. RAM is precious and fast; flash is abundant and cheap. During inference, the needed embeddings are fetched from flash memory once at the start, then used at each layer.</p>



<p>At each layer, a gating function decides how to weight the values in the fetched embedding, effectively letting the model emphasize different aspects of a token&#8217;s meaning at different depths. The resulting embedding is projected up to the full model dimension and added into the main processing stream, functioning as a kind of continuous &#8220;reminder&#8221; to each layer of what the original token meant, preventing that meaning from getting diluted as context accumulates.</p>



<p>The &#8220;E&#8221; in E2B means <em>effective parameters</em>, referring to the parameters that actually reside in RAM and do computation. The large layer-embedding table is intentionally excluded from this count because it sits in flash, not in working memory.</p>



<h4 class="wp-block-heading">The Audio Encoder</h4>



<p>The E2B and E4B models go one step further: they accept raw audio as input, enabling tasks like speech recognition, audio translation, and voice-based Q&amp;A.</p>



<p>Audio processing follows a three-stage pipeline before the language model ever sees it:</p>



<p><strong>Stage 1. Feature Extraction:</strong> The raw audio waveform is converted into a <strong>mel-spectrogram</strong>, which is a 2D image-like representation where the horizontal axis represents time and the vertical axis represents frequency. This is similar to how sheet music represents music: time flows left to right, and the vertical position tells you the pitch. The mel scale emphasizes frequency ranges the human ear is most sensitive to.</p>



<p><strong>Stage 2. Chunking:</strong> The mel-spectrogram is divided into overlapping chunks, turning the continuous audio signal into a structured sequence of frames ready for processing.</p>



<p><strong>Stage 3. Downsampling with Convolutions:</strong> Two 2D convolutional layers process and compress these chunks, reducing the sequence length into a manageable number of &#8220;soft tokens&#8221; (continuous, dense embeddings rather than discrete word tokens). This is the audio equivalent of the ViT&#8217;s patch pooling: it reduces a large number of raw signals into a compact, information-rich sequence.</p>



<p>The resulting audio embeddings pass through a <strong>Conformer encoder</strong>, a transformer-style architecture augmented with convolutional modules, which is well-suited for sequential signal data such as audio. The Conformer&#8217;s output is then linearly projected into Gemma 4&#8217;s embedding space, exactly as we saw with the vision encoder.</p>



<p>The beauty of this design is that it&#8217;s modality-agnostic in spirit: whether it&#8217;s a word, an image patch, or an audio chunk, the final product is always a sequence of aligned embeddings that the language model can reason over uniformly.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Gemma-4-Hardware-Requirements-GPU-VRAM-Inference-Cost-Breakdown"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Gemma-4-Hardware-Requirements-GPU-VRAM-Inference-Cost-Breakdown">Gemma 4 Hardware Requirements: GPU VRAM and Inference Cost Breakdown</a></h2>



<p>Understanding memory requirements is critical before committing to a deployment setup. Here are the approximate GPU or TPU memory requirements for running inference at different precision levels.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-80-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="290" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-80-1024x290.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-54378" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-80.png?size=126x36&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-80-300x85.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-80.png?size=378x107&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-80.png?size=504x143&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-80.png?size=630x178&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-80-768x218.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-80-1024x290.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-80-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Table 2:</strong> Gemma 4 Hardware Requirements (source: <a href="https://deepmind.google/models/gemma/gemma-4/" target="_blank" rel="noreferrer noopener">Google DeepMind</a>)</figcaption></figure></div>


<p>At full 16-bit precision, the 31B model needs roughly 60 GB of VRAM, which fits on a single 80 GB GPU (an A100 80GB or an H100) or across two 40 GB A100s. But at 4-bit quantization, the same model fits in about 17 GB, which means a single RTX 4090 or A10G becomes viable.</p>



<p>The 26B A4B model is interesting: its full-precision footprint of 48 GB looks large, but because only 4B parameters are active during inference, it runs significantly faster than the 31B while also needing less memory. At 4-bit, it drops to 15.6 GB.</p>



<p>The E2B and E4B models, at 4-bit quantization, fit in 3–5 GB of VRAM, placing them in genuinely on-device territory for modern phones and edge hardware. The E-prefixed models are designed specifically for this: their PLE (Per-Layer Embeddings) tables live in flash storage, so the actual RAM footprint during inference on mobile devices is even smaller than these numbers suggest.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Gemma-4-Benchmarks-LMArena-Elo-Scores-Multimodal-Performance-Results"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Gemma-4-Benchmarks-LMArena-Elo-Scores-Multimodal-Performance-Results">Gemma 4 Benchmarks: LMArena Elo Scores and Multimodal Performance Results</a></h2>



<p>Gemma 4&#8217;s large models set a new bar for what&#8217;s achievable in the open-weight space at this parameter count.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-81-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="952" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-81-1024x952.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-54380" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-81.png?size=126x117&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-81-300x279.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-81.png?size=378x351&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-81.png?size=504x469&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-81.png?size=630x586&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-81-768x714.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-81-1024x952.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-81-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 5:</strong> Model Performance vs Size (source: <a href="https://deepmind.google/models/gemma/gemma-4/" target="_blank" rel="noreferrer noopener">Google DeepMind</a>)</figcaption></figure></div>


<p>The 31B dense model achieves an estimated LMArena Elo score of <strong>1,452</strong> on text-only evaluations, placing it competitively with models that are significantly larger. The 26B A4B MoE model reaches <strong>1,441</strong>, which is remarkable given that it uses only 4 billion active parameters. To put that in context: these scores are competitive with several closed-source models from mid-2024.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-82-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="819" height="1024" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-82-819x1024.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-54381" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-82.png?size=126x158&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-82-240x300.png?lossy=2&amp;strip=1&amp;webp=1 240w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-82.png?size=378x473&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-82.png?size=504x630&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-82.png?size=630x788&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-82-768x960.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-82-819x1024.png?lossy=2&amp;strip=1&amp;webp=1 819w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-82-scaled.png?lossy=2&amp;strip=1&amp;webp=1 864w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 6:</strong> Arena Elo Score (source: <a href="https://deepmind.google/models/gemma/gemma-4/" target="_blank" rel="noreferrer noopener">Google DeepMind</a>)</figcaption></figure></div>


<p>Multimodal performance follows a similar pattern. The vision and audio capabilities are comparable in quality to the text performance and are not degraded by the multimodal conditioning. All model sizes demonstrate strong OCR, object detection, and scene description, and the audio-capable E2B and E4B models also show strong audio understanding.</p>



<p>On coding and agentic benchmarks, Gemma 4 shows notable improvements over Gemma 3, partly due to the expanded context window (128K for small models, 256K for large ones), the native function-calling support, and the thinking-mode capability.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-83-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="834" height="1024" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-83-834x1024.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-54384" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-83.png?size=126x155&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-83-244x300.png?lossy=2&amp;strip=1&amp;webp=1 244w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-83.png?size=378x464&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-83.png?size=504x619&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-83.png?size=630x774&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-83-768x943.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-83-834x1024.png?lossy=2&amp;strip=1&amp;webp=1 834w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-83-scaled.png?lossy=2&amp;strip=1&amp;webp=1 880w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Table 3:</strong> Benchmark Performance (source: <a href="https://huggingface.co/blog/gemma4" target="_blank" rel="noreferrer noopener">Gemma 4 Hugging Face blog</a>)</figcaption></figure></div>


<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-How-Run-Gemma-4-Transformers-llama-cpp-MLX-Cloud-Deployment-Options"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-How-Run-Gemma-4-Transformers-llama-cpp-MLX-Cloud-Deployment-Options">How to Run Gemma 4: Transformers, llama.cpp, MLX, and Cloud Deployment Options</a></h2>



<p>Google and the community have built Gemma 4 support into virtually every major inference stack. Here&#8217;s a quick summary to help you choose.</p>



<ul class="wp-block-list">
<li><strong><a href="https://github.com/huggingface/transformers" target="_blank" rel="noreferrer noopener">Hugging Face Transformers</a></strong><strong>:</strong> The most fully featured option for Python users. It supports all modalities, thinking mode, function calling, and the full Processor API for handling mixed text/image/audio inputs. It is the best choice for research, fine-tuning, and flexible experimentation.</li>



<li><strong><a href="https://github.com/ggml-org/llama.cpp" target="_blank" rel="noreferrer noopener">llama.cpp</a></strong><strong>:</strong> Offers highly optimized CPU and GPU inference, particularly valuable if you&#8217;re running on Apple Silicon or hardware without NVIDIA GPUs. Gemma 4 is supported in recent builds, with GGUF quantization enabling the small models to run on consumer hardware.</li>



<li><strong><a href="https://github.com/ml-explore/mlx" target="_blank" rel="noreferrer noopener">MLX</a></strong><strong>:</strong> The framework of choice for Apple Silicon, offering native Metal GPU acceleration. The E2B and E4B models run surprisingly fast on M-series chips via MLX, making on-Mac deployment practical.</li>



<li><strong><a href="https://github.com/huggingface/transformers.js" target="_blank" rel="noreferrer noopener">transformers.js</a></strong>: Enables in-browser inference via WebGPU. Gemma 4&#8217;s small models can run directly in a web browser (no server required), which opens up genuinely private, fully offline applications.</li>



<li><strong><a href="https://github.com/EricLBuehler/mistral.rs" target="_blank" rel="noreferrer noopener">Mistral.rs</a></strong><strong>:</strong> A Rust-based inference engine with strong performance characteristics for production deployments.</li>
</ul>



<p>For cloud production environments, Gemma 4 is available via the Gemini API, Google Cloud&#8217;s Vertex AI, Cloud Run, and GKE with GPU nodes. The Gemini API option is the lowest-friction path for managed serving without infrastructure work.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Fine-Tuning-Gemma-4-LoRA-QLoRA-TRL-Training-Pipeline-Guide"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Fine-Tuning-Gemma-4-LoRA-QLoRA-TRL-Training-Pipeline-Guide">Fine-Tuning Gemma 4: LoRA, QLoRA, and TRL Training Pipeline Guide</a></h2>



<p>One interesting observation from the Hugging Face team: Gemma 4 was difficult to demonstrate through fine-tuning examples <em>because the base instruction-tuned models are already so capable</em>. That said, fine-tuning is well-supported for domain specialization, style adaptation, or building task-specific versions.</p>



<p><strong><a href="https://github.com/huggingface/trl" target="_blank" rel="noreferrer noopener">TRL (Transformer Reinforcement Learning)</a></strong><strong>:</strong> The primary recommended library for supervised fine-tuning. It supports QLoRA (quantized LoRA), which dramatically reduces the memory requirements for fine-tuning, making it possible to fine-tune the 31B model on a machine with two consumer-grade GPUs if combined with 4-bit quantization. Fine-tuning is also supported on Vertex AI via TRL if you&#8217;d prefer a managed training environment.</p>



<p><strong><a href="https://unsloth.ai/docs/new/studio" target="_blank" rel="noreferrer noopener">Unsloth Studio</a></strong><strong>:</strong> A no-code fine-tuning interface for users who want to adapt Gemma 4 without writing training code. It supports Gemma 4 with memory optimizations baked in.</p>



<p>For a full fine-tuning pipeline in code, the key is using QLoRA via Hugging Face&#8217;s <a href="https://github.com/huggingface/peft" target="_blank" rel="noreferrer noopener">peft</a> and <a href="https://github.com/huggingface/trl" target="_blank" rel="noreferrer noopener">trl</a> libraries, targeting the attention and feedforward projection layers. Google also provides official guides for LoRA fine-tuning via Keras, PyTorch, and the Gemma library itself.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Gemma-4-Prompt-Formatting-Chat-Templates-Multimodal-Input-Structure"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Gemma-4-Prompt-Formatting-Chat-Templates-Multimodal-Input-Structure">Gemma 4 Prompt Formatting: Chat Templates and Multimodal Input Structure</a></h2>



<p>Gemma 4 follows a specific chat template that you should be aware of when building applications. The instruction-tuned models expect input in a structured multi-turn format. When using Hugging Face Transformers, always use <code data-enlighter-language="python" class="EnlighterJSRAW">processor.apply_chat_template()</code> rather than constructing prompts manually. This ensures special tokens are correctly inserted and the model receives input in the format it was trained on.</p>



<p>For multimodal inputs, images and audio are passed as dictionary entries alongside text in the message content list:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Google DeepMind’s Gemma 4: MoE, Efficiency Tricks, and Benchmarks" data-enlighter-group="3">messages = [
    {
        "role": "user",
        "content": [
            # For image input:
            {"type": "image", "url": "https://example.com/image.png"},
            # Or for local audio:
            {"type": "audio", "path": "/path/to/audio.mp3"},
            # Text always accompanies the media:
            {"type": "text", "text": "Describe what you see/hear."},
        ],
    }
]
</pre>



<p>For video with audio (E2B and E4B only), pass <code data-enlighter-language="python" class="EnlighterJSRAW">load_audio_from_video=True</code> in the <code data-enlighter-language="python" class="EnlighterJSRAW">apply_chat_template</code> call. For larger models, omit this flag since they do not have an audio encoder.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Which-Gemma-4-Model-Use-E2B-vs-E4B-vs-26B-MoE-vs-31B-Comparison"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Which-Gemma-4-Model-Use-E2B-vs-E4B-vs-26B-MoE-vs-31B-Comparison">Which Gemma 4 Model to Use: E2B vs E4B vs 26B MoE vs 31B Comparison</a></h2>



<p>With 4 variants available, the choice comes down to a few key questions.</p>



<p>If you are <strong>building something that runs on a phone or edge device</strong> with less than 6–8 GB of RAM available for the model, E2B and E4B are your options, and they are genuinely capable. E4B is worth the extra memory if you are doing audio-visual tasks. At 4-bit quantization, E2B runs in about 3 GB, which fits on most modern Android and iOS devices.</p>



<p>If you are <strong>running on a single GPU</strong> in the 16–24 GB range (RTX 3090, 4090, A10G), the 26B A4B at 4-bit quantization (<img src='https://b2633864.smushcdn.com/2633864/wp-content/latex/fb4/fb4f353ef9a72c24566678c957a5ae9f-ffffff-000000-0.png?lossy=2&strip=1&webp=1' alt='\approx' title='\approx' class='latex' />15.6 GB) gives you the best intelligence-per-dollar, running at 4B-speed throughput.</p>



<p>If you need <strong>maximum capability</strong> and have the hardware for it (2× A100 or H100), the 31B dense model at BF16 or the 26B A4B at 16-bit precision are both strong choices. The 31B is architecturally simpler; the 26B A4B provides better throughput if you&#8217;re processing high request volumes.</p>



<p>If you are <strong>doing audio tasks</strong> at all, you must use E2B or E4B, since the larger models do not have an audio encoder.</p>



<hr class="wp-block-separator has-alpha-channel-opacity"/>



<div id="pitch" style="padding: 40px; width: 100%; background-color: #F4F6FA;">
	<h3>What's next? We recommend <a target="_blank" href="https://pyimagesearch.com/pyimagesearch-university/?utm_source=blogPost&utm_medium=bottomBanner&utm_campaign=What%27s%20next%3F%20I%20recommend">PyImageSearch University</a>.</h3>

	<script src="https://fast.wistia.com/embed/medias/kno0cmko2z.jsonp" async></script><script src="https://fast.wistia.com/assets/external/E-v1.js" async></script><div class="wistia_responsive_padding" style="padding:56.25% 0 0 0;position:relative;"><div class="wistia_responsive_wrapper" style="height:100%;left:0;position:absolute;top:0;width:100%;"><div class="wistia_embed wistia_async_kno0cmko2z videoFoam=true" style="height:100%;position:relative;width:100%"><div class="wistia_swatch" style="height:100%;left:0;opacity:0;overflow:hidden;position:absolute;top:0;transition:opacity 200ms;width:100%;"><img decoding="async" src="https://fast.wistia.com/embed/medias/kno0cmko2z/swatch" style="filter:blur(5px);height:100%;object-fit:contain;width:100%;" alt="" aria-hidden="true" onload="this.parentNode.style.opacity=1;" /></div></div></div></div>

	<div style="margin-top: 32px; margin-bottom: 32px; ">
		<strong>Course information:</strong><br/>
		86+ total classes • 115+ hours of on-demand code walkthrough videos • Last updated: June 2026<br/>
		<span style="color: #169FE6;">★★★★★</span> 4.84 (128 Ratings) • 16,000+ Students Enrolled
	</div>

	<p><strong>I strongly believe that if you had the right teacher you could <em>master</em> computer vision and deep learning.</strong></p>

	<p>Do you think learning computer vision and deep learning has to be time-consuming, overwhelming, and complicated? Or has to involve complex mathematics and equations? Or requires a degree in computer science?</p>

	<p>That’s <em>not</em> the case.</p>

	<p>All you need to master computer vision and deep learning is for someone to explain things to you in <em>simple, intuitive</em> terms. <em>And that’s exactly what I do</em>. My mission is to change education and how complex Artificial Intelligence topics are taught.</p>

	<p>If you're serious about learning computer vision, your next stop should be PyImageSearch University, the most comprehensive computer vision, deep learning, and OpenCV course online today. Here you’ll learn how to <em>successfully</em> and <em>confidently</em> apply computer vision to your work, research, and projects. Join me in computer vision mastery.</p>

	<p><strong>Inside PyImageSearch University you'll find:</strong></p>

	<ul style="margin-left: 0px;">
		<li style="list-style: none;">&check; <strong>86+ courses</strong> on essential computer vision, deep learning, and OpenCV topics</li>
		<li style="list-style: none;">&check; <strong>86 Certificates</strong> of Completion</li>
		<li style="list-style: none;">&check; <strong>115+ hours</strong> of on-demand video</li>
		<li style="list-style: none;">&check; <strong>Brand new courses released <em>regularly</em></strong>, ensuring you can keep up with state-of-the-art techniques</li>
		<li style="list-style: none;">&check; <strong>Pre-configured Jupyter Notebooks in Google Colab</strong></li>
		<li style="list-style: none;">&check; Run all code examples in your web browser — works on Windows, macOS, and Linux (no dev environment configuration required!)</li>
		<li style="list-style: none;">&check; Access to <strong>centralized code repos for <em>all</em> 540+ tutorials</strong> on PyImageSearch</li>
		<li style="list-style: none;">&check; <strong>Easy one-click downloads</strong> for code, datasets, pre-trained models, etc.</li>
		<li style="list-style: none;">&check; <strong>Access</strong> on mobile, laptop, desktop, etc.</li>
	</ul>

	<p style="text-align: center;">
		<a target="_blank" class="button link" href="https://pyimagesearch.com/pyimagesearch-university/?utm_source=blogPost&utm_medium=bottomBanner&utm_campaign=What%27s%20next%3F%20I%20recommend" style="background-color: #6DC713; border-bottom: none;">Click here to join PyImageSearch University</a>
	</p>
</div>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Summary"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Summary">Summary</a></h2>



<p>Gemma 4 is best understood not as a single model but as a thoughtfully tiered family, each member engineered for a specific place in the hardware spectrum, from a smartphone to a data center GPU cluster.</p>



<p>The two small models (E2B and E4B) push the frontier of what is possible on-device by storing large embedding tables in flash memory rather than RAM, and by packing audio understanding alongside vision and text in a package that fits in just a few gigabytes.</p>



<p>The 26B A4B MoE model achieves something that still feels almost counterintuitive: the knowledge depth of a 26-billion-parameter model running at roughly the speed and cost of a 4-billion-parameter model, thanks to sparse expert routing.</p>



<p>The 31B dense model serves as the reliable, architecturally simple heavyweight for applications that need maximum capability without the added complexity of MoE.</p>



<p>Across all variants, Gemma 4 shares a core set of architectural decisions that compound in value: interleaved local-and-global attention tames the cost of long contexts; grouped query attention and the K=V cache trick shrink the memory footprint of those global layers; and pruned positional encoding keeps semantic meaning clean even across hundreds of thousands of tokens.</p>



<p>These are not isolated optimizations; they are a coherent strategy for squeezing frontier-level intelligence into constrained environments.</p>



<p>On the capability side, what sets Gemma 4 apart from prior open-weight releases is the breadth of what works <em>out of the box</em>. Native structured output for object detection, code generation from screenshots, audio Q&amp;A, configurable thinking mode, and function-calling support all come without special prompting tricks or external scaffolding.</p>



<p>The Apache 2.0 license is a major advantage for commercial use, allowing you to deploy, modify, and build on these models without restriction.</p>



<p>If you take one thing away from this post, let it be this: the right way to approach Gemma 4 is not to ask &#8220;which is the best model?&#8221; but rather &#8220;what are my actual constraints — memory, latency, modality, hardware — and which variant is engineered for exactly that?&#8221;</p>



<p>The answer is almost certainly one of these four. The rest of this series will help you put whichever one you choose to work.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Citation-Information"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Citation-Information">Citation Information</a></h3>



<p><strong>Thakur, P. </strong>“Google DeepMind&#8217;s Gemma 4: MoE, Efficiency Tricks, and Benchmarks,” <em>PyImageSearch</em>, S. Huot and A. Sharma, eds., 2026, <a href="https://pyimg.co/uqxzw" target="_blank" rel="noreferrer noopener">https://pyimg.co/uqxzw</a></p>



<pre class="EnlighterJSRAW" data-enlighter-language="raw" data-enlighter-theme="classic" data-enlighter-highlight="" data-enlighter-linenumbers="false" data-enlighter-lineoffset="" data-enlighter-title="Google DeepMind’s Gemma 4: MoE, Efficiency Tricks, and Benchmarks" data-enlighter-group="4">@incollection{Thakur_2026_google-deepminds-gemma-4-moe-efficiency-tricks-benchmarks,
  author = {Piyush Thakur},
  title = {{Google DeepMind's Gemma 4: MoE, Efficiency Tricks, and Benchmarks}},
  booktitle = {PyImageSearch},
  editor = {Susan Huot and Aditya Sharma},
  year = {2026},
  url = {https://pyimg.co/uqxzw},
}
</pre>



<hr class="wp-block-separator has-alpha-channel-opacity"/>



<div id="download-the-code" class="post-cta-wrap">
<div class="gpd-post-cta">
	<div class="gpd-post-cta-content">
		

			<div class="gpd-post-cta-top">
				<div class="gpd-post-cta-top-image"><img decoding="async" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?lossy=2&strip=1&webp=1" alt="" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?lossy=2&strip=1&webp=1 410w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?size=126x174&lossy=2&strip=1&webp=1 126w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?size=252x348&lossy=2&strip=1&webp=1 252w" sizes="(max-width: 410px) 100vw, 410px" /></div>
				
				<div class="gpd-post-cta-top-title"><h4>Join the PyImageSearch Newsletter and Grab My FREE 17-page Resource Guide PDF</h4></div>
				<div class="gpd-post-cta-top-desc"><p>Enter your email address below to <strong>join the PyImageSearch Newsletter</strong> and <strong>download my FREE 17-page Resource Guide PDF</strong> on Computer Vision, OpenCV, and Deep Learning.</p></div>


			</div>

			<div class="gpd-post-cta-bottom">
				<form class="footer-cta" action="https://www.getdrip.com/forms/657075648/submissions" method="post" target="_blank" data-drip-embedded-form="657075648">
					<input name="fields[email]" type="email" value="" placeholder="Your email address" class="form-control" />

					<button type="submit">Join the Newsletter!</button>

					<div style="display: none;" aria-hidden="true"><label for="website">Website</label><br /><input type="text" id="website" name="website" tabindex="-1" autocomplete="off" value="" /></div>
				</form>
			</div>


		
	</div>

</div>
</div>
<p>The post <a rel="nofollow" href="https://pyimagesearch.com/2026/06/22/google-deepminds-gemma-4-moe-efficiency-tricks-and-benchmarks/">Google DeepMind&#8217;s Gemma 4: MoE, Efficiency Tricks, and Benchmarks</a> appeared first on <a rel="nofollow" href="https://pyimagesearch.com">PyImageSearch</a>.</p>
]]></content:encoded>
					
		
		
			</item>
		<item>
		<title>RAG Observability with Langfuse, vLLM, and FAISS</title>
		<link>https://pyimagesearch.com/2026/06/15/rag-observability-with-langfuse-vllm-and-faiss/</link>
		
		<dc:creator><![CDATA[Vikram Singh]]></dc:creator>
		<pubDate>Mon, 15 Jun 2026 12:45:00 +0000</pubDate>
				<category><![CDATA[Langfuse]]></category>
		<category><![CDATA[LLMOps]]></category>
		<category><![CDATA[MLOps]]></category>
		<category><![CDATA[Observability]]></category>
		<category><![CDATA[Retrieval-Augmented Generation]]></category>
		<category><![CDATA[Tutorial]]></category>
		<category><![CDATA[Vector Databases]]></category>
		<category><![CDATA[ai observability]]></category>
		<category><![CDATA[faiss]]></category>
		<category><![CDATA[hallucination detection]]></category>
		<category><![CDATA[langfuse]]></category>
		<category><![CDATA[langfuse tracing]]></category>
		<category><![CDATA[llm monitoring]]></category>
		<category><![CDATA[llm observability]]></category>
		<category><![CDATA[local llm]]></category>
		<category><![CDATA[production rag]]></category>
		<category><![CDATA[prompt engineering]]></category>
		<category><![CDATA[rag evaluation]]></category>
		<category><![CDATA[rag metrics]]></category>
		<category><![CDATA[rag observability]]></category>
		<category><![CDATA[rag pipeline]]></category>
		<category><![CDATA[relevancy scoring]]></category>
		<category><![CDATA[retrieval augmented generation]]></category>
		<category><![CDATA[retrieval scoring]]></category>
		<category><![CDATA[sentence transformers]]></category>
		<category><![CDATA[token usage tracking]]></category>
		<category><![CDATA[traced llm]]></category>
		<category><![CDATA[traced retriever]]></category>
		<category><![CDATA[tutorial]]></category>
		<category><![CDATA[vector database]]></category>
		<category><![CDATA[vector search]]></category>
		<category><![CDATA[vllm]]></category>
		<guid isPermaLink="false">https://pyimagesearch.com/?p=54252</guid>

					<description><![CDATA[<p>Table of Contents RAG Observability with Langfuse, vLLM, and FAISS Introduction to Production-Grade RAG and LLM Observability RAG Observability Architecture with Langfuse, vLLM, and FAISS Project Setup Building a Langfuse-Traced Retriever with FAISS Building a Traced LLM Wrapper for vLLM&#8230;</p>
<p>The post <a rel="nofollow" href="https://pyimagesearch.com/2026/06/15/rag-observability-with-langfuse-vllm-and-faiss/">RAG Observability with Langfuse, vLLM, and FAISS</a> appeared first on <a rel="nofollow" href="https://pyimagesearch.com">PyImageSearch</a>.</p>
]]></description>
										<content:encoded><![CDATA[<div class="yoast-breadcrumbs"><span><span><a href="https://pyimagesearch.com/">Home</a></span></span></div>


<div class="toc">
<hr class="TOC"/>
<p class="has-large-font-size"><strong>Table of Contents</strong></p>
<ul>
    <li id="TOC-h1-RAG-Observability-Langfuse-vLLM-FAISS"><a rel="noopener" target="_blank" href="#h1-RAG-Observability-Langfuse-vLLM-FAISS">RAG Observability with Langfuse, vLLM, and FAISS</a></li>

    <li id="TOC-h2-Introduction-Production-Grade-RAG-LLM-Observability"><a rel="noopener" target="_blank" href="#h2-Introduction-Production-Grade-RAG-LLM-Observability">Introduction to Production-Grade RAG and LLM Observability</a></li>

    <li id="TOC-h2-RAG-Observability-Architecture-Langfuse-vLLM-FAISS"><a rel="noopener" target="_blank" href="#h2-RAG-Observability-Architecture-Langfuse-vLLM-FAISS">RAG Observability Architecture with Langfuse, vLLM, and FAISS</a></li>

    <li id="TOC-h2-Project-Setup"><a rel="noopener" target="_blank" href="#h2-Project-Setup">Project Setup</a></li>

    <li id="TOC-h2-Building-Langfuse-Traced-Retriever-FAISS"><a rel="noopener" target="_blank" href="#h2-Building-Langfuse-Traced-Retriever-FAISS">Building a Langfuse-Traced Retriever with FAISS</a></li>

    <li id="TOC-h2-Building-Traced-LLM-Wrapper-vLLM-Langfuse"><a rel="noopener" target="_blank" href="#h2-Building-Traced-LLM-Wrapper-vLLM-Langfuse">Building a Traced LLM Wrapper for vLLM and Langfuse</a></li>

    <li id="TOC-h2-Building-Fully-Traced-RAG-Pipeline-Langfuse"><a rel="noopener" target="_blank" href="#h2-Building-Fully-Traced-RAG-Pipeline-Langfuse">Building a Fully Traced RAG Pipeline with Langfuse</a></li>

    <li id="TOC-h2-Implementing-LLM-Evaluation-Metrics-Relevancy-Hallucination-Risk"><a rel="noopener" target="_blank" href="#h2-Implementing-LLM-Evaluation-Metrics-Relevancy-Hallucination-Risk">Implementing LLM Evaluation Metrics for RAG: Relevancy and Hallucination Risk</a></li>

    <li id="TOC-h2-Running-Inspecting-RAG-Pipeline-End-to-End"><a rel="noopener" target="_blank" href="#h2-Running-Inspecting-RAG-Pipeline-End-to-End">Running and Inspecting the RAG Pipeline End-to-End</a></li>

    <li id="TOC-h2-Viewing-RAG-Traces-Spans-Scores-Langfuse"><a rel="noopener" target="_blank" href="#h2-Viewing-RAG-Traces-Spans-Scores-Langfuse">Viewing RAG Traces, Spans, and Scores in Langfuse</a></li>

    <li id="TOC-h2-Summary"><a rel="noopener" target="_blank" href="#h2-Summary">Summary</a></li>
</ul>
</div>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h1-RAG-Observability-Langfuse-vLLM-FAISS"/>



<h2 class="wp-block-heading"><a href="#TOC-h1-RAG-Observability-Langfuse-vLLM-FAISS">RAG Observability with Langfuse, vLLM, and FAISS</a></h2>



<p>In this lesson, you will learn how to instrument every step of your Retrieval-Augmented Generation (RAG) pipeline using Langfuse, capture traces across ingestion, retrieval, and generation, and understand exactly how your system behaves under the hood.</p>


<div class="wp-block-image">
<figure class="aligncenter size-full"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/rag-observability-langfuse-vllm-faiss-featured.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="940" height="780" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/rag-observability-langfuse-vllm-faiss-featured.png?lossy=2&strip=1&webp=1" alt="rag-observability-langfuse-vllm-faiss-featured.png" class="wp-image-54313" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/rag-observability-langfuse-vllm-faiss-featured.png?size=126x105&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/rag-observability-langfuse-vllm-faiss-featured-300x249.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/rag-observability-langfuse-vllm-faiss-featured.png?size=378x314&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/rag-observability-langfuse-vllm-faiss-featured.png?size=504x418&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/rag-observability-langfuse-vllm-faiss-featured.png?size=630x523&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/rag-observability-langfuse-vllm-faiss-featured-768x637.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/rag-observability-langfuse-vllm-faiss-featured.png?lossy=2&amp;strip=1&amp;webp=1 940w" sizes="(max-width: 630px) 100vw, 630px" /></a></figure></div>


<p>You will wire tracing into your retriever and generator, monitor latency and token usage, evaluate quality scores, and run the entire stack with vLLM and FAISS locally so you can experiment freely without any cloud dependencies.</p>



<p>By the end, you will have a fully transparent RAG workflow that you can debug, optimize, and scale with confidence.</p>



<p>This lesson is the last in a 3-part series on <strong>LLM observability with Langfuse</strong>:</p>



<ol class="wp-block-list">
<li><em><strong><a href="https://pyimg.co/tadoh" target="_blank" rel="noreferrer noopener">LLM Observability with Self-Hosted Langfuse and vLLM</a></strong></em></li>



<li><em><strong><a href="https://pyimg.co/24p06" target="_blank" rel="noreferrer noopener">Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)</a></strong></em></li>



<li><em><strong><a href="https://pyimg.co/g20yk" target="_blank" rel="noreferrer noopener">RAG Observability with Langfuse, vLLM, and FAISS</a></strong></em><strong> (this tutorial)</strong></li>
</ol>



<p><strong>To learn how to make your RAG pipeline fully observable with Langfuse, vLLM, and FAISS, </strong><em><strong>just keep reading.</strong></em></p>



<div id="pyi-source-code-block" class="source-code-wrap"><div class="gpd-source-code">
    <div class="gpd-source-code-content">
        <img decoding="async" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/source-code-icon.png?lossy=2&strip=1&webp=1" alt="">
        <h4>Looking for the source code to this post?</h4>
                    <a href="#download-the-code" class="pyis-cta-modal-open-modal">Jump Right To The Downloads Section <svg class="svg-icon arrow-right" width="12" height="12" aria-hidden="true" role="img" focusable="false" viewBox="0 0 14 14" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M6.8125 0.1875C6.875 0.125 6.96875 0.09375 7.09375 0.09375C7.1875 0.09375 7.28125 0.125 7.34375 0.1875L13.875 6.75C13.9375 6.8125 14 6.90625 14 7C14 7.125 13.9375 7.1875 13.875 7.25L7.34375 13.8125C7.28125 13.875 7.1875 13.9062 7.09375 13.9062C6.96875 13.9062 6.875 13.875 6.8125 13.8125L6.1875 13.1875C6.125 13.125 6.09375 13.0625 6.09375 12.9375C6.09375 12.8438 6.125 12.75 6.1875 12.6562L11.0312 7.8125H0.375C0.25 7.8125 0.15625 7.78125 0.09375 7.71875C0.03125 7.65625 0 7.5625 0 7.4375V6.5625C0 6.46875 0.03125 6.375 0.09375 6.3125C0.15625 6.25 0.25 6.1875 0.375 6.1875H11.0312L6.1875 1.34375C6.125 1.28125 6.09375 1.1875 6.09375 1.0625C6.09375 0.96875 6.125 0.875 6.1875 0.8125L6.8125 0.1875Z" fill="#169FE6"></path></svg></a>
            </div>
</div>
</div>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Introduction-Production-Grade-RAG-LLM-Observability"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Introduction-Production-Grade-RAG-LLM-Observability">Introduction to Production-Grade RAG and LLM Observability</a></h2>



<h3 class="wp-block-heading">What Makes a RAG Pipeline Production-Grade</h3>



<p>A RAG pipeline becomes “production-grade” only when it consistently delivers correct, stable, and explainable outputs under real-world constraints. In development, it is easy to get an LLM to answer questions using retrieved context. In production, the challenges multiply: retrieval quality varies, embeddings may shift over time, documents evolve, and latency budgets tighten. A production RAG pipeline must remain robust even when the input data is noisy, queries are unpredictable, and traffic is high.</p>



<p>A production-ready RAG system must treat <strong>retrieval as a first-class subsystem</strong>, not a background detail. That means surfacing similarity scores, exposing ranking decisions, understanding how vector search behaves at scale, and ensuring retriever recall stays high across diverse query types. It also requires that the <strong>prompt construction step is deterministic, inspectable, and traceable</strong>, because subtle variations in formatting often change model behavior dramatically.</p>



<p>Beyond these retrieval and prompt concerns, the LLM is also a production component. That means <strong>retry logic, token accounting, consistent latency, predictable throughput, and graceful failure modes</strong>. Production pipelines need clear boundaries between retrieval failures, prompt-generation bugs, and LLM invocation issues. If these concerns remain invisible, debugging becomes guesswork and reliability collapses under load. Production-grade RAG means engineered behavior, not accidental correctness.</p>



<h3 class="wp-block-heading">Why Observability Is Essential for Retrieval-Augmented Systems</h3>



<p>RAG pipelines fail silently. Retrieval may return irrelevant documents, prompting may omit essential context, and the LLM may hallucinate confidently even when grounded context exists. Without observability, it is impossible to diagnose <em>why</em> a particular answer was wrong. Was the embedding model inconsistent? Did FAISS, the vector search library used to retrieve similar documents, return poor matches? Did the prompt formatting break a system instruction? Did the LLM drift or degrade?</p>



<p>Observability solves this by turning the RAG pipeline into a transparent execution graph. Tools like Langfuse give you <strong>hierarchical traces</strong>: one trace for the whole request, and nested spans for retrieval, LLM calls, evaluation, and supporting steps. Each span captures inputs, outputs, metadata, latencies, token usage, and even scoring metrics. With this information, problems become diagnosable:</p>



<ul class="wp-block-list">
<li>Retrieval returned low-relevance documents</li>



<li>Prompt formatting changed unexpectedly</li>



<li>LLM call degraded or hit retry logic</li>



<li>Evaluation metrics began trending downward</li>
</ul>



<p>In other words, observability provides <strong>ground truth for system behavior</strong>. Production RAG must be accountable: decisions should be explainable, errors traceable, and failures measurable. Without observability, shipping RAG to production is equivalent to flying an airplane without instruments; the system might work, but you will not know <em>when</em> or <em>why</em> it stops working.</p>



<h3 class="wp-block-heading">What We Will Build: Traced Retriever, Traced LLM, Full RAG Pipeline, and Evaluation</h3>



<p>In this lesson, you will construct a fully observable, component-wise traced RAG system using <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">FAISS</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">SentenceTransformers</code>, and <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code>. Each part of the pipeline is instrumented for visibility: you will build a <strong>traced retriever</strong> that logs embeddings, index sizes, similarity scores, and ranking. You will build a <strong>traced LLM wrapper</strong> that records prompts, responses, retry attempts, and token usage. These components power a <strong>fully traced RAG pipeline</strong> that captures retrieval, prompt construction, generation, and final evaluation as a single hierarchical execution tree.</p>



<p>You will also implement <strong>automatic RAG output evaluation</strong>, computing relevancy, hallucination risk, and an overall quality score, with each metric logged back to Langfuse dashboards with scoring nodes. This gives you a complete introspection loop: every answer is measured, every metric is recorded, and every decision is traceable through structured spans.</p>



<p>By the end, you will have a <strong>production-grade RAG observability stack</strong>, running locally with:</p>



<ul class="wp-block-list">
<li>A traced retriever</li>



<li>A traced LLM client</li>



<li>A fully instrumented RAG pipeline</li>



<li>Automatic scoring and diagnostics</li>



<li>Local dashboards for analyzing behavior</li>
</ul>



<p>This foundation prepares you for upcoming lessons, where we extend these ideas into multi-step agents and more complex reasoning workflows.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-RAG-Observability-Architecture-Langfuse-vLLM-FAISS"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-RAG-Observability-Architecture-Langfuse-vLLM-FAISS">RAG Observability Architecture with Langfuse, vLLM, and FAISS</a></h2>



<p>A production-grade RAG pipeline is not a single model call. It is an orchestrated system composed of independent but cooperating components: retrieval, prompt assembly, LLM inference, and evaluation. In this section, we break down each subsystem and explain how they interact, why they are separated, and how Langfuse stitches everything together into a fully observable execution graph.</p>



<h3 class="wp-block-heading">Retrieval → Prompt Construction → LLM → Scoring (The Core RAG Loop)</h3>



<p>A well-designed RAG architecture follows a clean, linear flow where each stage has a single responsibility:</p>



<h4 class="wp-block-heading">Retrieval</h4>



<p>The system begins by embedding the user query and searching for relevant documents in a vector index. The retriever returns <em>ranked, scored</em> context items that will guide the LLM. In production, retrieval quality is often the primary bottleneck; if retrieval fails, generation cannot be correct. Therefore, retrieval spans log:</p>



<ul class="wp-block-list">
<li>embeddings used</li>



<li>search distances and converted relevance scores</li>



<li>number of documents returned</li>



<li>FAISS query latencies</li>
</ul>



<p>Your <code data-enlighter-language="python" class="EnlighterJSRAW">TracedRetriever</code> does exactly this in the codebase, generating embeddings, searching the FAISS index, and tracing each step.</p>



<h4 class="wp-block-heading">Prompt Construction</h4>



<p>Next, the system converts retrieved documents into a structured context block and assembles a prompt that the LLM can reliably parse. Prompt construction must be deterministic to avoid instability across runs. The code in <code data-enlighter-language="python" class="EnlighterJSRAW">rag_pipeline.py</code> builds a system message, a user message, and contextual references (<code data-enlighter-language="python" class="EnlighterJSRAW">[1] doc1</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">[2] doc2</code>, etc.). This ensures:</p>



<ul class="wp-block-list">
<li>deterministic ordering</li>



<li>visible context structure</li>



<li>stable interface for downstream evaluation</li>
</ul>



<h4 class="wp-block-heading">LLM Generation</h4>



<p>The prompt is sent to the LLM via an <strong>OpenAI-compatible Chat Completions API</strong>, served by <strong>vLLM</strong> locally. The <code data-enlighter-language="python" class="EnlighterJSRAW">TracedLLMClient</code> wraps this call with:</p>



<ul class="wp-block-list">
<li>retry logic</li>



<li>token usage reporting</li>



<li>error logging</li>



<li>prompt and response capture</li>



<li>metadata annotations</li>
</ul>



<p>This is critical for production reliability because LLM latency, token usage, and intermittent failures must all be observable.</p>



<h4 class="wp-block-heading">Scoring and Evaluation</h4>



<p>Finally, the answer is passed through a lightweight evaluation module (<code data-enlighter-language="python" class="EnlighterJSRAW">evaluation.py</code>). It computes:</p>



<ul class="wp-block-list">
<li>a relevancy score</li>



<li>a hallucination risk score</li>



<li>an overall quality score</li>
</ul>



<p>These metrics are reported back into Langfuse as scoring nodes. Production RAG systems need this because correctness is subjective; evaluation makes correctness measurable.</p>



<p>This 4-step pipeline forms the backbone of every modern retrieval-augmented system.</p>



<h3 class="wp-block-heading">Local Vector Store Using FAISS and SentenceTransformers</h3>



<p>RAG pipelines must remain fast, private, and cost-efficient. This system uses <strong>FAISS</strong> as the vector index and <strong>SentenceTransformers</strong> for embedding models, giving you:</p>



<ul class="wp-block-list">
<li><strong>Zero API cost</strong> (everything is local)</li>



<li><strong>GPU acceleration optional</strong> (FAISS works on CPU just fine)</li>



<li><strong>Deterministic embeddings</strong> (critical for reproducibility)</li>



<li><strong>Config-driven control</strong> over the embedding model and dimensionality</li>
</ul>



<p>The retrieval pipeline is built around the following 3 core mechanisms:</p>



<h4 class="wp-block-heading">Document Embeddings</h4>



<p>Each document is encoded using the local model defined in <code data-enlighter-language="python" class="EnlighterJSRAW">config.yaml</code>:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="yaml" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="1">embeddings:
  model: "sentence-transformers/all-MiniLM-L6-v2"
  dimension: 384
</pre>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">TracedRetriever</code> loads this model and produces <strong>normalized embeddings</strong> for better retrieval precision.</p>



<h4 class="wp-block-heading">FAISS Index</h4>



<p>FAISS stores all document embeddings in a vector index created via:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="2">self.index = faiss.IndexFlatL2(self.dimension)
</pre>



<p><code data-enlighter-language="python" class="EnlighterJSRAW">IndexFlatL2</code> is simple, fast, and perfect for local development, while still appropriate for many production environments.</p>



<h4 class="wp-block-heading">Similarity Search</h4>



<p>Retrieval happens by computing L2 distance and converting those distances into relevance scores, ensuring consistency and interpretability.</p>



<p>You end up with a fully local, high-performance vector store without touching external cloud APIs.</p>



<h3 class="wp-block-heading">vLLM as an OpenAI-Compatible Inference Server</h3>



<p>Instead of relying on OpenAI or Anthropic APIs, your lesson uses <strong>vLLM</strong>, a high-throughput inference engine built for serving LLMs at scale.</p>



<p>Your Docker Compose file runs vLLM either on <strong>GPU (recommended)</strong> or <strong>CPU fallback</strong>, exposing it at:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="3">http://localhost:8000/v1
</pre>



<p>This allows you to call vLLM with the exact same interface as OpenAI:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="4">response = client.chat.completions.create(
    model=self.model,
    messages=messages,
    temperature=temperature,
    max_tokens=max_tokens
)
</pre>



<p>Benefits for production-grade RAG:</p>



<ul class="wp-block-list">
<li><strong>Predictable latency</strong></li>



<li><strong>Full control over model versioning</strong></li>



<li><strong>No external dependencies</strong></li>



<li><strong>High-throughput serving (paged attention)</strong></li>



<li><strong>OpenAI API compatibility</strong> (no code rewrite needed)</li>
</ul>



<p>Your <code data-enlighter-language="python" class="EnlighterJSRAW">TracedLLMClient</code> wraps all of this with Langfuse observability, giving you:</p>



<ul class="wp-block-list">
<li>latency metrics</li>



<li>retry attempts</li>



<li>token usage breakdown</li>



<li>full input/output transparency</li>



<li>error-level spans when inference fails</li>
</ul>



<p>This is how modern enterprises run private LLMs with production reliability.</p>



<h3 class="wp-block-heading">Langfuse for Tracing, Metrics, Evaluation, and Span Hierarchies</h3>



<p>Langfuse is the backbone of observability in this system. Every major component (i.e., embedding, retrieval, generation, and evaluation) becomes a <strong>span</strong> inside a single <strong>root trace</strong>.</p>



<p>A typical trace hierarchy looks like:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="5">rag_pipeline (root)
│
├── retrieve_documents
│   ├── embed_text
│   ├── index_documents (only once)
│   └── retrieve_documents
│
├── llm_completion
│
└── evaluate_rag_output
    ├── evaluate_relevancy
    └── evaluate_hallucination
</pre>



<p>This structure gives you:</p>



<h4 class="wp-block-heading">Full-System Visibility</h4>



<p>Every question generates a complete execution tree revealing:</p>



<ul class="wp-block-list">
<li>what happened</li>



<li>where it happened</li>



<li>how long it took</li>



<li>what went wrong</li>
</ul>



<h4 class="wp-block-heading">End-to-End Metrics</h4>



<ul class="wp-block-list">
<li>token usage</li>



<li>retrieval scores</li>



<li>latency per component</li>



<li>evaluation metrics</li>
</ul>



<h4 class="wp-block-heading">Rich Debugging Context</h4>



<p>Each span stores:</p>



<ul class="wp-block-list">
<li>input messages</li>



<li>embeddings preview</li>



<li>retrieved context</li>



<li>generated outputs</li>



<li>error details</li>
</ul>



<h4 class="wp-block-heading">Continuous Quality Monitoring</h4>



<p>Your evaluation step logs:</p>



<ul class="wp-block-list">
<li>a relevancy score</li>



<li>a hallucination risk</li>



<li>a final pass-or-fail quality metric</li>
</ul>



<p>Langfuse becomes the <em>single pane of glass</em> for understanding your RAG pipeline’s behavior, serving as the missing observability layer that transforms a working prototype into a production-ready system.</p>



<hr class="wp-block-separator has-alpha-channel-opacity"/>



<p>Would you like immediate access to 3,457 images curated and labeled with hand gestures to train, explore, and experiment with &#8230; for free? Head over to <a href="https://universe.roboflow.com/isl/az-6mqow?ref=pyimagesearch" target="_blank" rel="noreferrer noopener">Roboflow</a> and get a free account to grab these hand gesture images. </p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Project-Setup"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Project-Setup">Project Setup</a></h2>



<p>Before we write a single line of RAG logic, the foundation must be solid: a clean folder structure, repeatable environment setup, deterministic configuration, and a reliable inference and observability stack. This section walks you through the project layout, how to launch vLLM and Langfuse via Docker Compose, how to install retrieval dependencies (FAISS and SentenceTransformers), and how to configure all components using a single <code data-enlighter-language="python" class="EnlighterJSRAW">config.yaml</code> file.</p>



<h3 class="wp-block-heading">Folder Structure Walkthrough</h3>



<p>Your project is organized for <strong>production clarity</strong>, where each subsystem (RAG, LLM, agent, evaluation, and infrastructure) is isolated in its own module.</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="6">project-root/
│
├── configs/
│   └── config.yaml                # Central config: LLM, embeddings, RAG, agent, eval, Langfuse
│
├── data/
│   └── sample_docs.txt            # Example inputs for retrieval
│
├── src/
│   ├── config.py                  # Config loader utilities
│   ├── llm_utils.py               # OpenAI-compatible client initialization
│   ├── llm_client.py              # Traced LLM wrapper (retry + token usage + spans)
│   ├── retriever.py               # FAISS retriever with traced indexing + search
│   ├── evaluation.py              # RAG quality scoring (relevancy + hallucination)
│   ├── rag_pipeline.py            # Full retrieval → prompt → generation → evaluation pipeline
│   ├── agent_orchestration.py     # 3-step traced agent workflow
│   └── langfuse_instrumentation.py # Bootstraps Langfuse + flush utilities
│
├── docker-compose.yml             # vLLM + Langfuse + Postgres (self-hosted observability)
│
├── requirements.txt               # Python dependencies
│
└── check_rag_health.py            # Full system health check (env, docker, dependencies, files)
</pre>



<p>This layout ensures:</p>



<ul class="wp-block-list">
<li><strong>Decoupled components:</strong> easy for testing and future replacement</li>



<li><strong>Reproducible environment:</strong> config-driven behavior</li>



<li><strong>Portable observability:</strong> one command launches everything</li>



<li><strong>Scalable structure:</strong> supports RAG, agents, and future tools</li>
</ul>



<p>Every file in the <code data-enlighter-language="python" class="EnlighterJSRAW">src/</code> directory corresponds to a runnable pipeline step, and each is instrumented with Langfuse decorators so all activity becomes visible in the dashboard.</p>



<h3 class="wp-block-heading">Starting vLLM and Langfuse Using Docker Compose</h3>



<p>For production-like observability, the system relies on <strong>2 running services</strong>:</p>



<ul class="wp-block-list">
<li><strong>Langfuse:</strong> tracing, metrics, and span visualization</li>



<li><strong>vLLM:</strong> inference engine serving the LLM</li>
</ul>



<p>Both are provided in your <code data-enlighter-language="python" class="EnlighterJSRAW">docker-compose.yml</code>, and both run <strong>locally</strong>, meaning:</p>



<ul class="wp-block-list">
<li>zero cloud dependency</li>



<li>zero per-token cost</li>



<li>repeatable development environment</li>
</ul>



<h4 class="wp-block-heading">Start the entire stack (GPU version)</h4>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="7">docker-compose --profile gpu up -d
</pre>



<h4 class="wp-block-heading">Or start CPU mode</h4>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="8">docker-compose --profile cpu up -d
</pre>



<h4 class="wp-block-heading">Confirm services are running</h4>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="9">docker-compose ps
</pre>



<p>You should see something like:</p>


<div class="wp-block-image">
<figure class="aligncenter size-full"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-56.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="734" height="233" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-56.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-54283" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-56.png?size=126x40&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-56-300x95.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-56.png?size=378x120&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-56.png?size=504x160&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-56.png?size=630x200&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-56.png?lossy=2&amp;strip=1&amp;webp=1 734w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Table 1: </strong>Core services with port mappings and health status</figcaption></figure></div>


<h4 class="wp-block-heading">UI access</h4>



<ul class="wp-block-list">
<li><strong>Langfuse dashboard:</strong> <code data-enlighter-language="python" class="EnlighterJSRAW">http://localhost:3000</code></li>



<li><strong>vLLM API:</strong> <code data-enlighter-language="python" class="EnlighterJSRAW">http://localhost:8000/v1</code></li>
</ul>



<h4 class="wp-block-heading">What these services do internally</h4>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse server</code>: stores traces, spans, scoring, and metadata</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse worker</code>: processes asynchronous scoring and analytics</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">PostgreSQL</code>: stores trace data</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code>: serves the Llama 2 model loaded at runtime</li>
</ul>



<p>This cluster forms your local, production-grade observability and inference backbone.</p>



<h3 class="wp-block-heading">Installing FAISS and SentenceTransformers</h3>



<p>The retrieval layer requires:</p>



<ul class="wp-block-list">
<li><strong>FAISS:</strong> similarity search</li>



<li><strong>SentenceTransformers:</strong> embedding model</li>
</ul>



<p>These are already declared in your <code data-enlighter-language="python" class="EnlighterJSRAW">requirements.txt</code>:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="10">sentence-transformers>=2.2.0
faiss-cpu>=1.7.4
numpy>=1.24.0
</pre>



<p>Install dependencies:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="11">pip install -r requirements.txt
</pre>



<p>After installation, verify FAISS is working:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="12">python -c "import faiss; print(f'FAISS version: {faiss.__version__}')"
</pre>



<p>Verify embedding model loads:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="13">python -c "from sentence_transformers import SentenceTransformer; print(SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2'))"
</pre>



<p>These 2 libraries form the <strong>core retrieval engine</strong>:</p>



<ul class="wp-block-list">
<li><strong>Embeddings:</strong> produced locally (MiniLM)</li>



<li><strong>Retrieval:</strong> performed locally (FAISS <code data-enlighter-language="python" class="EnlighterJSRAW">IndexFlatL2</code>)</li>
</ul>



<p>No external API latency.</p>



<p>No cost.</p>



<p>No vendor lock-in.</p>



<h3 class="wp-block-heading">Configuring config.yaml (LLM, Embeddings, RAG, and Evaluation)</h3>



<p>The <strong>entire</strong> RAG and agent system is configurable from a single <code data-enlighter-language="python" class="EnlighterJSRAW">YAML</code> file:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="yaml" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="14">langfuse:
  host: "http://localhost:3000"
  project_name: "rag-selfhosted"

llm:
  base_url: "http://localhost:8000/v1"
  model: "meta-llama/Llama-2-7b-chat-hf"
  temperature: 0.7
  max_tokens: 300
  max_retries: 2

embeddings:
  model: "sentence-transformers/all-MiniLM-L6-v2"
  dimension: 384

rag:
  top_k: 3

agent:
  max_steps: 3

evaluation:
  enable_scoring: true
  min_quality_score: 0.6
</pre>



<h4 class="wp-block-heading">Key configuration sections</h4>



<h5 class="wp-block-heading">LLM Configuration</h5>



<p>Controls inference behavior:</p>



<ul class="wp-block-list">
<li>model name</li>



<li>sampling temperature</li>



<li>max tokens</li>



<li>retry count</li>



<li>endpoint (<code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> server URL)</li>
</ul>



<p>Your <code data-enlighter-language="python" class="EnlighterJSRAW">TracedLLMClient</code> loads these automatically via:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="15">from config import get_llm_config
</pre>



<h5 class="wp-block-heading">Embeddings Configuration</h5>



<p>Controls vector dimension and embedding model, and is consumed by:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="16">from config import get_embeddings_config
</pre>



<h5 class="wp-block-heading">RAG Settings</h5>



<p>Controls retrieval behavior:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">top_k</code>: results returned from FAISS</li>



<li>used inside: <code data-enlighter-language="python" class="EnlighterJSRAW">TracedRetriever.retrieve()</code></li>
</ul>



<h5 class="wp-block-heading">Agent Settings</h5>



<p>Agent workflows build on top of RAG, controlling:</p>



<ul class="wp-block-list">
<li>max agent steps</li>



<li>model used for intent detection</li>
</ul>



<h5 class="wp-block-heading">Evaluation Configuration</h5>



<p>Defines quality control thresholds:</p>



<ul class="wp-block-list">
<li>relevancy</li>



<li>hallucination risk</li>



<li>minimum acceptable quality</li>
</ul>



<p>Used in:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="17">from config import get_evaluation_config
</pre>



<p>This config-driven system makes your pipeline:</p>



<ul class="wp-block-list">
<li>reproducible</li>



<li>tunable</li>



<li>production-friendly</li>



<li>environment-agnostic</li>
</ul>



<p>Change models or thresholds, and no code changes are required.</p>



<hr class="wp-block-separator has-alpha-channel-opacity"/>



<!-- wp:paragraph -->
<h3>Need Help Configuring Your Development Environment?</h3>
<!-- /wp:paragraph -->

<!-- wp:image {"align":"center","id":18137,"sizeSlug":"large","linkDestination":"custom"} -->
<figure class="wp-block-image aligncenter size-large"><a href="https://pyimagesearch.com/pyimagesearch-university/" target="_blank" rel="noreferrer noopener"><img decoding="async" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-18137" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?lossy=2&strip=1&webp=1 500w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?size=126x84&lossy=2&strip=1&webp=1 126w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?size=252x168&lossy=2&strip=1&webp=1 252w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?size=378x253&lossy=2&strip=1&webp=1 378w" sizes="(max-width: 500px) 100vw, 500px" /></a><figcaption>Having trouble configuring your development environment? Want access to pre-configured Jupyter Notebooks running on Google Colab? Be sure to join <a href="https://pyimagesearch.com/pyimagesearch-university/" target="_blank" rel="noreferrer noopener" aria-label=" (opens in a new tab)">PyImageSearch University</a> — you will be up and running with this tutorial in a matter of minutes. </figcaption></figure>
<!-- /wp:image -->

<!-- wp:paragraph -->
<p>All that said, are you:</p>
<!-- /wp:paragraph -->

<!-- wp:list -->
<ul><li>Short on time?</li><li>Learning on your employer’s administratively locked system?</li><li>Wanting to skip the hassle of fighting with the command line, package managers, and virtual environments?</li><li><strong>Ready to run the code immediately on your Windows, macOS, or Linux system?</strong></li></ul>
<!-- /wp:list -->

<!-- wp:paragraph -->
<p>Then join <a href="https://pyimagesearch.com/pyimagesearch-university/" target="_blank" rel="noreferrer noopener">PyImageSearch University</a> today!</p>
<!-- /wp:paragraph -->

<!-- wp:paragraph -->
<p><strong>Gain access to Jupyter Notebooks for this tutorial and other PyImageSearch guides pre-configured to run on Google Colab’s ecosystem right in your web browser!</strong> No installation required.</p>
<!-- /wp:paragraph -->

<!-- wp:paragraph -->
<p>And best of all, these Jupyter Notebooks will run on Windows, macOS, and Linux!</p>
<!-- /wp:paragraph -->



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Building-Langfuse-Traced-Retriever-FAISS"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Building-Langfuse-Traced-Retriever-FAISS">Building a Langfuse-Traced Retriever with FAISS</a></h2>



<p>The retriever is the beating heart of any RAG pipeline. If retrieval is weak, every downstream component (i.e., prompting, LLM generation, and evaluation) will degrade. In this section, we construct a <em>production-grade retriever</em> built on three pillars: <strong>local embeddings</strong>, <strong>FAISS vector search</strong>, and <strong>Langfuse instrumentation</strong>. The result is a component that is fast, reproducible, fully observable, and cheap to run because it never leaves your machine or calls a cloud API.</p>



<p>Your <code data-enlighter-language="python" class="EnlighterJSRAW">TracedRetriever</code> class in <code data-enlighter-language="python" class="EnlighterJSRAW">src/retriever.py</code> handles 4 responsibilities:</p>



<ul class="wp-block-list">
<li>Load an embedding model</li>



<li>Embed and index documents</li>



<li>Perform similarity-based search</li>



<li>Emit <strong>Langfuse spans</strong> for every step (embedding, indexing, and retrieval)</li>
</ul>



<h3 class="wp-block-heading">Loading and Embedding Documents</h3>



<p>The retriever begins by loading a <strong>local SentenceTransformers model</strong>, which provides dense vector embeddings without any external API calls.</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="18">embeddings_config = get_embeddings_config()
model_name = embeddings_config.get("model", "sentence-transformers/all-MiniLM-L6-v2")
self.model = SentenceTransformer(model_name)
</pre>



<h4 class="wp-block-heading">Why local embeddings?</h4>



<ul class="wp-block-list">
<li><strong>No rate limits or API costs</strong> after the local environment is configured</li>



<li><strong>Fast inference</strong> through optimized ONNX or Torch acceleration</li>



<li><strong>Privacy-safe</strong> since no data leaves the environment</li>



<li><strong>Predictable latency</strong>, which is critical in production</li>
</ul>



<h4 class="wp-block-heading">Embedding a document (with tracing)</h4>



<p>Your <code data-enlighter-language="python" class="EnlighterJSRAW">embed()</code> method is wrapped with the Langfuse <code data-enlighter-language="python" class="EnlighterJSRAW">@observe</code> decorator:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="19">@observe(name="embed_text")
def embed(self, text: str) -> np.ndarray:
</pre>



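<p>This automatically creates a span called <code data-enlighter-language="python" class="EnlighterJSRAW">embed_text</code> in Langfuse: a <em>top-level span</em> when <code data-enlighter-language="python" class="EnlighterJSRAW">embed()</code> is called on its own, or a nested span when it is called from another observed method such as <code data-enlighter-language="python" class="EnlighterJSRAW">retrieve()</code>.</p>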



<p>Inside the span, you record:</p>



<ul class="wp-block-list">
<li>first 100 characters of the text</li>



<li>embedding dimension</li>
</ul>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="20">langfuse_context.update_current_observation(
    input={"text_preview": text[:100]}
)
</pre>



<p>The embedding call itself:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="21">embedding = self.model.encode([text], normalize_embeddings=True)[0]
</pre>



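<p>This normalization step ensures the embeddings have unit length. For unit-length vectors, the squared L2 distance equals 2 − 2·cos(θ), so ranking by L2 distance in FAISS produces the same order as ranking by cosine similarity. This stabilizes similarity scoring and produces better retrieval in FAISS L2 spaces.</p>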



<p>The span finishes by storing metadata:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="22">langfuse_context.update_current_observation(
    output={"embedding_dim": len(embedding)}
)
</pre>



<p>This is extremely useful later for debugging:</p>



<ul class="wp-block-list">
<li>Did documents produce embeddings with inconsistent lengths?</li>



<li>Are embeddings accidentally empty?</li>



<li>Are overly long texts being passed in?</li>
</ul>



<p>Langfuse gives you full visibility.</p>



<h3 class="wp-block-heading">Creating and Populating a FAISS Index</h3>



<p>After loading the embedding model, the retriever constructs a <strong>FAISS</strong> <code data-enlighter-language="python" class="EnlighterJSRAW">IndexFlatL2</code> index:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="23">self.index = faiss.IndexFlatL2(self.dimension)
</pre>



<p>This index:</p>



<ul class="wp-block-list">
<li>Stores vectors in RAM</li>



<li>Uses <strong>Euclidean distance</strong> (L2) for similarity</li>



<li>Has <em>no training step</em>, making it ideal for small and medium-sized datasets</li>
</ul>



<p>Your retriever keeps an in-memory list of source documents:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="24">self.documents = []
</pre>



<h4 class="wp-block-heading">Indexing documents (with tracing)</h4>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="25">@observe(name="index_documents")
def index_documents(self, documents: List[str]):
</pre>



<p>This span tracks:</p>



<ul class="wp-block-list">
<li>how many documents are being indexed</li>



<li>how many embeddings were added</li>



<li>previews of content for debugging</li>
</ul>



<p>Under the hood:</p>



<ul class="wp-block-list">
<li><strong>Store the raw documents</strong></li>



<li><strong>Embed them in a batch</strong></li>



<li><strong>Add the vectors to FAISS</strong></li>
</ul>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="26">embeddings = self.model.encode(documents, normalize_embeddings=True)
self.index.add(embeddings.astype(np.float32))
</pre>



<p>Because FAISS expects <code data-enlighter-language="python" class="EnlighterJSRAW">float32</code>, the cast is mandatory.</p>



<h4 class="wp-block-heading">Why IndexFlatL2?</h4>



<ul class="wp-block-list">
<li>Simple</li>



<li>Deterministic</li>



<li>Fast for small–medium corpora (&lt; 200k docs)</li>



<li>Plays well with normalized embeddings (MiniLM, BERT, etc.)</li>
</ul>



<p>Your pipeline achieves high throughput without additional libraries or GPUs.</p>



<h3 class="wp-block-heading">Retrieving with Similarity Ranking</h3>



<p>The retrieval process begins with the <code data-enlighter-language="python" class="EnlighterJSRAW">retrieve()</code> method:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="27">@observe(name="retrieve_documents")
def retrieve(self, query: str, top_k: int = None):
</pre>



<p>Langfuse creates a tracing span named <code data-enlighter-language="python" class="EnlighterJSRAW">retrieve_documents</code> for every search operation.</p>



<h4 class="wp-block-heading">Step 1. Embed the query</h4>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="28">query_embedding = self.embed(query).reshape(1, -1)
</pre>



<p>Notice that calling <code data-enlighter-language="python" class="EnlighterJSRAW">self.embed()</code> creates a <em>nested span</em> under the retrieval span in Langfuse.</p>



<p>This nesting hierarchy:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="29">retrieve_documents
    ├── embed_text
</pre>



<p>gives you a complete view of:</p>



<ul class="wp-block-list">
<li>how long embedding took</li>



<li>token count (if embedding model changes)</li>



<li>exact query text</li>
</ul>



<h4 class="wp-block-heading">Step 2. Search the FAISS index</h4>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="30">distances, indices = self.index.search(query_embedding, top_k)
</pre>



<p>FAISS returns:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">indices</code>: the closest documents</li>



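<li><code data-enlighter-language="python" class="EnlighterJSRAW">distances</code>: squared L2 distances to each doc (<code data-enlighter-language="python" class="EnlighterJSRAW">IndexFlatL2</code> does not take the square root)</li>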
</ul>



<p>You convert distances into similarity scores:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="31">relevance_score = 1.0 / (1.0 + float(distance))
</pre>



<p>This transforms smaller distances into higher scores.</p>



<h4 class="wp-block-heading">Step 3. Format ranked results</h4>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="32">results.append({
    "content": self.documents[idx],
    "score": relevance_score,
    "rank": rank + 1,
    "distance": float(distance)
})
</pre>



<h4 class="wp-block-heading">Step 4. Log retrieval metadata to Langfuse</h4>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="33">langfuse_context.update_current_observation(
    output={
        "result_count": len(results),
        "scores": [r["score"] for r in results],
        "results": [...]
    }
)
</pre>



<p>You even send content previews (200 characters), which appear in the Langfuse UI and make debugging dramatically easier.</p>



<h3 class="wp-block-heading">Adding Langfuse Spans to Indexing and Retrieval Steps</h3>



<p>Langfuse observability is woven into every retrieval path using the <code data-enlighter-language="python" class="EnlighterJSRAW">@observe</code> decorator and metadata updates.</p>



<h4 class="wp-block-heading">Spans you automatically get from your retriever</h4>


<div class="wp-block-image">
<figure class="aligncenter size-full"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-57.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="729" height="179" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-57.png?lossy=2&amp;strip=1&amp;webp=1" alt="" class="wp-image-54286" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-57.png?size=126x31&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-57-300x74.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-57.png?size=378x93&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-57.png?size=504x124&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-57.png?size=630x155&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-57.png?lossy=2&amp;strip=1&amp;webp=1 729w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Table 2: </strong>Key instrumented methods and their corresponding Langfuse tracing spans</figcaption></figure></div>


<p>These spans appear under your RAG pipeline trace like:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="34">rag_pipeline
    ├── retrieve_documents
    │       ├── embed_text
    │       └── result metadata
    ├── llm_completion
    ├── evaluate_rag_output
    └── final scoring
</pre>



<h4 class="wp-block-heading">Why this matters in production</h4>



<ul class="wp-block-list">
<li>You can identify whether latency is coming from <strong>embedding</strong>, <strong>FAISS search</strong>, or <strong>LLM inference</strong>.</li>



<li>You can detect mismatches like:
<ul class="wp-block-list">
<li>wrong embedding dimension</li>



<li>missing documents</li>



<li>unnormalized vectors</li>



<li>misconfigured top-k</li>
</ul>
</li>



<li>You get complete end-to-end lineage for every query.</li>



<li>You can monitor retriever performance across time.</li>
</ul>



<p>This is the observability layer that most open-source RAG tutorials <em>never</em> include, but you now have it baked into the core of your retriever.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Building-Traced-LLM-Wrapper-vLLM-Langfuse"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Building-Traced-LLM-Wrapper-vLLM-Langfuse">Building a Traced LLM Wrapper for vLLM and Langfuse</a></h2>



<h3 class="wp-block-heading">OpenAI-Compatible Chat Completions via vLLM</h3>



<p>Your LLM wrapper is split into 2 layers:</p>



<ul class="wp-block-list">
<li>a <strong>low-level OpenAI-compatible client</strong> in <code data-enlighter-language="python" class="EnlighterJSRAW">llm_utils.py</code>, and</li>



<li>a <strong>high-level, Langfuse-traced wrapper</strong> in <code data-enlighter-language="python" class="EnlighterJSRAW">llm_client.py</code>.</li>
</ul>



<p>The low-level client is created in <code data-enlighter-language="python" class="EnlighterJSRAW">get_llm_client()</code>:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="35">from openai import OpenAI

def get_llm_client(timeout: int = 60, load_model_from_config: bool = False):
    if os.getenv("OPENAI_BASE_URL") is None:
        print("⚠️  OPENAI_BASE_URL not found in environment. Using default http://localhost:8000/v1")
   
    if os.getenv("OPENAI_API_KEY") is None:
        print("⚠️  OPENAI_API_KEY not set. Using dummy key.")
   
    client = OpenAI(
        base_url=os.getenv("OPENAI_BASE_URL", "http://localhost:8000/v1"),
        api_key=os.getenv("OPENAI_API_KEY", "dummy"),
        timeout=timeout,
    )
    ...
    return client
</pre>



<p>This means that <strong>as long as vLLM is running</strong> behind an OpenAI-compatible server (from <code data-enlighter-language="python" class="EnlighterJSRAW">docker-compose.yml</code> on <code data-enlighter-language="python" class="EnlighterJSRAW">http://localhost:8000/v1</code>), the rest of your code simply calls <code data-enlighter-language="python" class="EnlighterJSRAW">client.chat.completions.create(...)</code> exactly like it would against OpenAI, without any vendor-specific changes.</p>



<p>At the higher level, <code data-enlighter-language="python" class="EnlighterJSRAW">TracedLLMClient</code> in <code data-enlighter-language="python" class="EnlighterJSRAW">src/llm_client.py</code> wraps this client and pulls model configuration from <code data-enlighter-language="python" class="EnlighterJSRAW">configs/config.yaml</code> via <code data-enlighter-language="python" class="EnlighterJSRAW">get_llm_config()</code>:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="yaml" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="36">llm:
  base_url: "http://localhost:8000/v1"
  model: "meta-llama/Llama-2-7b-chat-hf"
  temperature: 0.7
  max_tokens: 300
  max_retries: 2
</pre>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="37">class TracedLLMClient:
    def __init__(self, model: str = None, max_retries: int = 2, timeout: int = 60):
        self.client = get_llm_client(timeout=timeout)
        if model is None:
            llm_config = get_llm_config()
            model = llm_config.get("model", "meta-llama/Llama-2-7b-chat-hf")
        self.model = model
        self.max_retries = max_retries
</pre>



<p>The end result: your <strong>RAG and agent code never talks to vLLM directly</strong>; it always goes through <code data-enlighter-language="python" class="EnlighterJSRAW">TracedLLMClient</code>, which is OpenAI-compatible, config-driven, and ready for tracing.</p>



<h3 class="wp-block-heading">Retry Logic and Error Handling</h3>



<p>The core of the wrapper is the <code data-enlighter-language="python" class="EnlighterJSRAW">complete()</code> method:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="38">from langfuse.decorators import observe, langfuse_context

class TracedLLMClient:
    @observe(name="llm_completion")
    def complete(self, messages: List[Dict[str, str]], **kwargs) -> Dict:
        llm_config = get_llm_config()
        temperature = kwargs.get("temperature", llm_config.get("temperature", 0.7))
        max_tokens = kwargs.get("max_tokens", llm_config.get("max_tokens", 300))

        langfuse_context.update_current_observation(
            input={"messages": messages, "model": self.model}
        )

        last_error = None
        for attempt in range(self.max_retries):
            try:
                response = self.client.chat.completions.create(
                    model=self.model,
                    messages=messages,
                    temperature=temperature,
                    max_tokens=max_tokens
                )
                ...
                return {..., "success": True}
            except Exception as e:
                last_error = e
                if attempt &lt; self.max_retries - 1:
                    time.sleep(1)
                    continue

        error_msg = f"LLM call failed after {self.max_retries} attempts: {last_error}"
        langfuse_context.update_current_observation(
            level="ERROR",
            output={"error": error_msg}
        )
        return {"content": None, "error": error_msg, "success": False}
</pre>



<p>A few production-grade details are baked in here:</p>



<ul class="wp-block-list">
<li><strong>Config-driven defaults:</strong> <code data-enlighter-language="python" class="EnlighterJSRAW">temperature</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">max_tokens</code> come from <code data-enlighter-language="python" class="EnlighterJSRAW">config.yaml</code> but can be overridden per-call via kwargs.</li>



<li><strong>Retry loop:</strong> the method makes up to <code data-enlighter-language="python" class="EnlighterJSRAW">self.max_retries</code> attempts in total (default 2, i.e., one initial call plus one retry), with a fixed <code data-enlighter-language="python" class="EnlighterJSRAW">time.sleep(1)</code> pause between attempts.</li>



<li><strong>Graceful failure:</strong> if all attempts fail, you get a structured response <code data-enlighter-language="python" class="EnlighterJSRAW">{"content": None, "error": "...", "success": False}</code> instead of a hard crash, and the Langfuse span is explicitly marked as <code data-enlighter-language="python" class="EnlighterJSRAW">"ERROR"</code>.</li>
</ul>



<p>When you call this from <code data-enlighter-language="python" class="EnlighterJSRAW">rag_pipeline.py</code> or <code data-enlighter-language="python" class="EnlighterJSRAW">agent_orchestration.py</code>, you can safely check <code data-enlighter-language="python" class="EnlighterJSRAW">result["success"]</code> and decide whether to return a fallback answer, propagate the error, or short-circuit the pipeline.</p>



<h3 class="wp-block-heading">Logging Request and Response Payloads</h3>



<p>Because <code data-enlighter-language="python" class="EnlighterJSRAW">TracedLLMClient.complete()</code> is decorated with <code data-enlighter-language="python" class="EnlighterJSRAW">@observe(name="llm_completion")</code>, every call automatically becomes a <strong>Langfuse span</strong>, and you manually enrich that span with <strong>inputs and outputs</strong> via <code data-enlighter-language="python" class="EnlighterJSRAW">langfuse_context</code>.</p>



<p>At the start of the call, you log the <strong>request payload</strong>:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="39">langfuse_context.update_current_observation(
    input={"messages": messages, "model": self.model}
)
</pre>



<p>This means that in the Langfuse UI you will see:</p>



<ul class="wp-block-list">
<li>the full chat history (messages) you sent to the model</li>



<li>which <strong>model</strong> was used (e.g., <code data-enlighter-language="python" class="EnlighterJSRAW">"meta-llama/Llama-2-7b-chat-hf"</code>)</li>
</ul>



<p>After a successful LLM call, you log the <strong>response content</strong>:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="40">content = response.choices[0].message.content

langfuse_context.update_current_observation(
    output={"content": content},
    usage={
        "input": response.usage.prompt_tokens,
        "output": response.usage.completion_tokens,
        "total": response.usage.total_tokens
    },
    metadata={"attempt": attempt + 1}
)
</pre>



<p>So every Langfuse span for <code data-enlighter-language="python" class="EnlighterJSRAW">llm_completion</code> will show:</p>



<ul class="wp-block-list">
<li>the <strong>raw answer text</strong> the model generated</li>



<li>which <strong>attempt</strong> succeeded (first try or retry)</li>



<li>the <strong>token usage</strong> for that call</li>
</ul>



<p>On failure, the wrapper logs the error message instead of content:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="41">langfuse_context.update_current_observation(
    level="ERROR",
    output={"error": error_msg}
)
</pre>



<p>This gives you <strong>debuggable traces</strong> when vLLM is down, you hit timeouts, or your model name is misconfigured.</p>



<h3 class="wp-block-heading">Capturing Token Usage and Metadata in Langfuse</h3>



<p><code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> exposes <code data-enlighter-language="python" class="EnlighterJSRAW">response.usage</code> in an OpenAI-like shape, and you forward that directly into Langfuse as part of the span:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="42">langfuse_context.update_current_observation(
    output={"content": content},
    usage={
        "input": response.usage.prompt_tokens,
        "output": response.usage.completion_tokens,
        "total": response.usage.total_tokens
    },
    metadata={"attempt": attempt + 1}
)

return {
    "content": content,
    "usage": response.usage.model_dump(),
    "success": True
}
</pre>



<p>This gives you <strong>2 layers</strong> of observability:</p>



<ul class="wp-block-list">
<li><strong>Inside Langfuse</strong>
<ul class="wp-block-list">
<li>You can filter and inspect spans by <code data-enlighter-language="python" class="EnlighterJSRAW">usage.total</code>, see which prompts are expensive, and spot unusually long generations.</li>



<li>You can correlate token usage with overall RAG or agent traces because <code data-enlighter-language="python" class="EnlighterJSRAW">llm_completion</code> spans sit inside higher-level pipeline spans such as <code data-enlighter-language="python" class="EnlighterJSRAW">rag_pipeline</code> or <code data-enlighter-language="python" class="EnlighterJSRAW">agent_workflow</code>.</li>
</ul>
</li>



<li><strong>Inside your Python code</strong>
<ul class="wp-block-list">
<li>Callers receive <code data-enlighter-language="python" class="EnlighterJSRAW">result["usage"]</code> and can log or aggregate it themselves (e.g., cost dashboards, quotas, or alerting in future lessons).</li>



<li>Because usage is returned as <code data-enlighter-language="python" class="EnlighterJSRAW">response.usage.model_dump()</code>, it is just a normal Python dict you can serialize or send elsewhere.</li>
</ul>
</li>
</ul>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">metadata={"attempt": attempt + 1}</code> block gives you a clean way to see <strong>how often retries are needed</strong>; if you start seeing a lot of second or third attempts in Langfuse, you know <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> or your infra is becoming unreliable and needs attention.</p>



<h3 class="wp-block-heading">Example: Using the Traced LLM Client</h3>



<p>Your <code data-enlighter-language="python" class="EnlighterJSRAW">__main__</code> block in <code data-enlighter-language="python" class="EnlighterJSRAW">llm_client.py</code> shows a minimal end-to-end example:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="43">if __name__ == "__main__":
    client = TracedLLMClient()
   
    result = client.complete(
        messages=[
            {"role": "user", "content": "What is RAG in AI?"}
        ]
    )
   
    print(f"Response: {result['content']}")
    print(f"Tokens: {result['usage']['total_tokens']}")
   
    trace_id = langfuse_context.get_current_trace_id()
    langfuse_host = os.getenv("LANGFUSE_HOST", "http://localhost:3000")
    print(f"🔍 View trace: {langfuse_host}/trace/{trace_id}")
</pre>



<p>This script:</p>



<ul class="wp-block-list">
<li>verifies that <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> is responding correctly</li>



<li>verifies that <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> keys and host are properly configured</li>



<li>gives you a direct URL to the <strong>exact trace</strong> for this LLM call in the <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> UI</li>
</ul>



<p>In the next sections, you will see this same <code data-enlighter-language="python" class="EnlighterJSRAW">TracedLLMClient</code> reused inside the <strong>RAG pipeline</strong> and <strong>RAG evaluation</strong>, where it becomes just one span in a larger, nested trace tree.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Building-Fully-Traced-RAG-Pipeline-Langfuse"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Building-Fully-Traced-RAG-Pipeline-Langfuse">Building a Fully Traced RAG Pipeline with Langfuse</a></h2>



<h3 class="wp-block-heading">The run_rag_pipeline Orchestrator</h3>



<p>Your full RAG flow is implemented in <code data-enlighter-language="python" class="EnlighterJSRAW">src/rag_pipeline.py</code> as a single orchestrator function:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="44">@observe(name="rag_pipeline")
def run_rag_pipeline(
    question: str,
    retriever: TracedRetriever,
    llm_client: TracedLLMClient,
    top_k: int = 3
) -> Dict:
    ...
</pre>



<p>This one function wires together everything you have built so far: it takes a <strong>user question</strong>, uses the <strong>traced retriever</strong> to find context, calls the <strong>traced LLM client</strong> to generate an answer, and then runs <strong>RAG evaluation</strong> to compute relevancy and hallucination scores. Because it is decorated with <code data-enlighter-language="python" class="EnlighterJSRAW">@observe(name="rag_pipeline")</code>, the entire run shows up in Langfuse as a <strong>top-level trace</strong>, with all retrieval, LLM, and evaluation spans nested underneath.</p>



<h3 class="wp-block-heading">Step 1: Retrieve</h3>



<p>The first step is retrieving documents with your <code data-enlighter-language="python" class="EnlighterJSRAW">TracedRetriever</code>:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="45">langfuse_context.update_current_observation(
    input={"question": question, "top_k": top_k}
)

print("Step 1: Retrieving documents...")
docs = retriever.retrieve(query=question, top_k=top_k)

if not docs:
    print("❌ No documents found")
    return {"answer": "No relevant information found.", "success": False}
</pre>



<p>Here is what happens in this step:</p>



<ul class="wp-block-list">
<li>The pipeline span is enriched with the <strong>incoming question</strong> and the <strong>top_k</strong> parameter via <code data-enlighter-language="python" class="EnlighterJSRAW">langfuse_context.update_current_observation</code>.</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">retriever.retrieve(...)</code> is itself decorated with <code data-enlighter-language="python" class="EnlighterJSRAW">@observe(name="retrieve_documents")</code>, so Langfuse automatically creates a <strong>child span</strong> under <code data-enlighter-language="python" class="EnlighterJSRAW">rag_pipeline</code>. Inside that span, you log the query, scores, and content previews.</li>



<li>If the index is empty or nothing is returned, you fail fast with a friendly message and <code data-enlighter-language="python" class="EnlighterJSRAW">success=False</code> instead of trying to prompt the LLM with no context.</li>
</ul>



<p>By the end of Step 1, you have a ranked list of documents such as:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="46">[
    {"content": "...", "score": 0.93, "rank": 1, "distance": 0.12},
    {"content": "...", "score": 0.88, "rank": 2, "distance": 0.18},
    ...
]
</pre>



<p>and their retrieval details are already captured in Langfuse.</p>



<h3 class="wp-block-heading">Step 2: Build Prompt from Retrieved Docs</h3>



<p>Next, you turn those retrieved documents into a single, structured prompt:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="47">print("Step 2: Building prompt...")
context = "\n\n".join([f"[{i+1}] {d['content']}" for i, d in enumerate(docs)])
messages = [
    {"role": "system", "content": "Answer based on the provided context."},
    {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {question}\n\nAnswer:"}
]
</pre>



<p>A few important details:</p>



<ul class="wp-block-list">
<li>Each document is tagged with an index (<code data-enlighter-language="python" class="EnlighterJSRAW">[1]</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">[2]</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">[3]</code>) so it is easy to map parts of the final answer back to specific sources, both as a human and when you are debugging traces.</li>



<li>The <strong>system message</strong> explicitly constrains the model: <em>“Answer based on the provided context.”</em> This is a simple but effective guardrail against hallucinations.</li>



<li>The <strong>user message</strong> includes both the stitched context and the original question, finishing with &#8220;Answer:&#8221; to bias the model toward a direct response.</li>
</ul>



<p>Because messages are later passed into the traced LLM client, the <strong>entire prompt (including context)</strong> is visible inside the <code data-enlighter-language="python" class="EnlighterJSRAW">llm_completion</code> span in Langfuse.</p>



<h3 class="wp-block-heading">Step 3: Generate with vLLM</h3>



<p>You then hand off the prompt to your <code data-enlighter-language="python" class="EnlighterJSRAW">TracedLLMClient</code>:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="48">print("Step 3: Generating answer...")
result = llm_client.complete(messages)

if not result["success"]:
    print(f"❌ Generation failed: {result.get('error')}")
    return {"answer": None, "error": result.get("error"), "success": False}

answer = result["content"]
print(f"✅ Answer generated\n")
</pre>



<p>Under the hood:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">TracedLLMClient.complete()</code> calls <code data-enlighter-language="python" class="EnlighterJSRAW">client.chat.completions.create(...)</code> against the vLLM OpenAI-compatible server (configured via <code data-enlighter-language="python" class="EnlighterJSRAW">OPENAI_BASE_URL</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">OPENAI_API_KEY</code>, with model and temperature from <code data-enlighter-language="python" class="EnlighterJSRAW">config.yaml</code>).</li>



<li>The method is decorated with <code data-enlighter-language="python" class="EnlighterJSRAW">@observe(name="llm_completion")</code>, so a <strong>child span</strong> is created inside the <code data-enlighter-language="python" class="EnlighterJSRAW">rag_pipeline</code> trace.</li>



<li>Inside that span, you log:
<ul class="wp-block-list">
<li>the <strong>messages</strong> and <strong>model</strong> as input</li>



<li>the <strong>generated content</strong> as output</li>



<li>detailed <strong>token usage</strong> (<code data-enlighter-language="python" class="EnlighterJSRAW">prompt_tokens</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">completion_tokens</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">total_tokens</code>) as usage, plus <code data-enlighter-language="python" class="EnlighterJSRAW">metadata={"attempt": ...}</code> indicating which retry succeeded</li>
</ul>
</li>
</ul>



<p>If <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> is down, misconfigured, or times out, the wrapper returns <code data-enlighter-language="python" class="EnlighterJSRAW">{"success": False, "error": ...}</code> and updates the Langfuse span with <code data-enlighter-language="python" class="EnlighterJSRAW">level="ERROR"</code>, so you get a clear red node in the trace instead of a mysterious failure.</p>



<h3 class="wp-block-heading">Step 4: Evaluate Response Quality</h3>



<p>Once you have an answer, you pass everything into the evaluation layer in <code data-enlighter-language="python" class="EnlighterJSRAW">src/evaluation.py</code>:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="49">print("Step 4: Evaluating quality...")
evaluation_results = evaluate_rag_output(question, docs, answer)
</pre>



<p><code data-enlighter-language="python" class="EnlighterJSRAW">evaluate_rag_output</code> is itself annotated with <code data-enlighter-language="python" class="EnlighterJSRAW">@observe(name="evaluate_rag_output")</code>, and it calls 2 more traced helpers under the hood:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">evaluate_relevancy(query, retrieved_docs, answer)</code></li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">evaluate_hallucination_risk(retrieved_docs, answer)</code></li>
</ul>



<p>The process inside <code data-enlighter-language="python" class="EnlighterJSRAW">evaluate_rag_output</code> looks like this:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="50">langfuse_context.update_current_observation(
    input={
        "query": query,
        "doc_count": len(retrieved_docs),
        "answer_length": len(answer)
    }
)

relevancy_score = evaluate_relevancy(query, retrieved_docs, answer)
hallucination_risk = evaluate_hallucination_risk(retrieved_docs, answer)
overall_quality = (relevancy_score + (1.0 - hallucination_risk)) / 2.0

eval_config = get_evaluation_config()
min_quality = eval_config.get("min_quality_score", 0.6)

results = {
    "relevancy_score": relevancy_score,
    "hallucination_risk": hallucination_risk,
    "overall_quality": overall_quality,
    "passed": overall_quality >= min_quality
}
</pre>



<p>In more detail:</p>



<ul class="wp-block-list">
<li><strong>Relevancy scoring</strong> (<code data-enlighter-language="python" class="EnlighterJSRAW">evaluate_relevancy</code>): computes how well the answer overlaps with both the query and the retrieved documents using simple word-level heuristics.</li>



<li><strong>Hallucination risk</strong> (<code data-enlighter-language="python" class="EnlighterJSRAW">evaluate_hallucination_risk</code>): estimates how many of the answer’s content words are grounded in the retrieved documents; low grounding means higher risk.</li>



<li><strong>Overall quality:</strong> a simple average of <code data-enlighter-language="python" class="EnlighterJSRAW">relevancy</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">1 − hallucination_risk</code>, giving a single number between 0 and 1.</li>



<li>A minimum quality threshold (<code data-enlighter-language="python" class="EnlighterJSRAW">min_quality_score</code>) comes from the <code data-enlighter-language="python" class="EnlighterJSRAW">evaluation</code> section of <code data-enlighter-language="python" class="EnlighterJSRAW">config.yaml</code> and is used to set a <code data-enlighter-language="python" class="EnlighterJSRAW">passed</code> boolean.</li>
</ul>



<p>The function then:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="51">langfuse_context.score_current_observation(
    name="relevancy",
    value=relevancy_score,
    comment="Keyword and document relevance"
)

langfuse_context.score_current_observation(
    name="hallucination_risk",
    value=hallucination_risk,
    comment="Risk of ungrounded claims"
)

langfuse_context.score_current_observation(
    name="overall_quality",
    value=overall_quality,
    comment=f"Combined quality score (threshold: {min_quality})"
)

langfuse_context.update_current_observation(output=results)
</pre>



<p>So you get <strong>3 named scores</strong> on the evaluation span inside Langfuse: <code data-enlighter-language="python" class="EnlighterJSRAW">relevancy</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">hallucination_risk</code>, and <code data-enlighter-language="python" class="EnlighterJSRAW">overall_quality</code>, each with a numeric value and a human-readable comment.</p>



<h3 class="wp-block-heading">Tracing the Entire RAG Pipeline with Nested Spans</h3>



<p>Back in <code data-enlighter-language="python" class="EnlighterJSRAW">run_rag_pipeline</code>, you finalize the top-level observation and return a structured result:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="52">langfuse_context.update_current_observation(
    output={
        "answer": answer,
        "sources_count": len(docs),
        "evaluation": evaluation_results
    }
)

print(f"✅ Evaluation complete")
print(f"  Relevancy: {evaluation_results['relevancy_score']:.2f}")
print(f"  Hallucination Risk: {evaluation_results['hallucination_risk']:.2f}")
print(f"  Overall Quality: {evaluation_results['overall_quality']:.2f}")
print(f"  Passed: {'✅' if evaluation_results['passed'] else '❌'}\n")
</pre>



<p>Then you expose the trace URL:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="53">trace_id = langfuse_context.get_current_trace_id()
langfuse_host = os.getenv("LANGFUSE_HOST", "http://localhost:3000")

print(f"{'='*50}")
print(f"✅ Pipeline Complete")
print(f"🔍 View trace: {langfuse_host}/trace/{trace_id}")
print(f"{'='*50}\n")
</pre>



<p>At this point, a single pipeline run creates a <strong>hierarchy of spans</strong> roughly like this in Langfuse:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">rag_pipeline</code> (top-level)
<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">retrieve_documents</code> (from <code data-enlighter-language="python" class="EnlighterJSRAW">TracedRetriever.retrieve</code>)
<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">embed_text</code> (from <code data-enlighter-language="python" class="EnlighterJSRAW">TracedRetriever.embed</code>)</li>
</ul>
</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">llm_completion</code> (from <code data-enlighter-language="python" class="EnlighterJSRAW">TracedLLMClient.complete</code>)</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">evaluate_rag_output</code>
<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">evaluate_relevancy</code></li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">evaluate_hallucination</code></li>
</ul>
</li>
</ul>
</li>
</ul>



<p>Each node contains its own <strong>inputs</strong>, <strong>outputs</strong>, <strong>usage</strong>, and <strong>scores</strong>, giving you a complete picture of <strong>where time is spent</strong>, <strong>how the model behaved</strong>, and <strong>whether the final answer passed your quality threshold</strong>.</p>



<h3 class="wp-block-heading">Returned Structure and Downstream Use</h3>



<p>Finally, the function returns a rich Python dictionary:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="54">return {
    "answer": answer,
    "sources": docs,
    "evaluation": evaluation_results,
    "success": True
}
</pre>



<p>This shape is deliberate:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">answer</code>: can be rendered in a UI, CLI, or logged for later inspection.</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">sources</code>: lets you show which documents backed the answer (e.g., for “source citations” in a frontend).</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">evaluation</code>: gives your downstream systems a simple way to <strong>gate</strong> responses (e.g., only show answers where <code data-enlighter-language="python" class="EnlighterJSRAW">overall_quality &gt;= 0.7</code>).</li>



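<li><code data-enlighter-language="python" class="EnlighterJSRAW">success</code>: makes it easy to separate a “normal completion” (<code data-enlighter-language="python" class="EnlighterJSRAW">True</code>) from a failure (<code data-enlighter-language="python" class="EnlighterJSRAW">False</code>); the two failure cases, “no documents” and “LLM error”, can be told apart by the <code data-enlighter-language="python" class="EnlighterJSRAW">error</code> key, which is only present when generation fails.</li>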
</ul>



<p>Together, this section gives you not just a RAG pipeline, but a <strong>fully traced, quality-scored RAG system</strong> that is ready to plug into dashboards, UIs, or further production hardening.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Implementing-LLM-Evaluation-Metrics-Relevancy-Hallucination-Risk"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Implementing-LLM-Evaluation-Metrics-Relevancy-Hallucination-Risk">Implementing LLM Evaluation Metrics for RAG: Relevancy and Hallucination Risk</a></h2>



<h3 class="wp-block-heading">Relevancy Scoring</h3>



<p>Relevancy is implemented in <code data-enlighter-language="python" class="EnlighterJSRAW">evaluate_relevancy()</code> and answers a simple but crucial question: <strong>“How well does the model’s answer align with the retrieved documents and the user’s query?”</strong></p>



<p>Your scoring function uses a lightweight, keyword-overlap heuristic, which is ideal for debugging and observability without introducing another model dependency. The implementation:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="55">@observe(name="evaluate_relevancy")
def evaluate_relevancy(query: str, retrieved_docs: List[Dict], answer: str) -> float:
    langfuse_context.update_current_observation(
        input={"query": query, "doc_count": len(retrieved_docs), "answer_length": len(answer)}
    )

    query_words = set(query.lower().split())
    answer_words = set(answer.lower().split())

    overlap_with_query = len(answer_words &amp; query_words) / max(len(query_words), 1)

    doc_words = set()
    for doc in retrieved_docs:
        doc_words |= set(doc["content"].lower().split())

    overlap_with_docs = len(answer_words &amp; doc_words) / max(len(answer_words), 1)

    relevancy_score = (overlap_with_query + overlap_with_docs) / 2.0

    langfuse_context.score_current_observation(
        name="relevancy",
        value=relevancy_score,
        comment="Keyword and doc overlap relevance"
    )

    langfuse_context.update_current_observation(output={"relevancy": relevancy_score})
    return relevancy_score
</pre>



<p><strong>What the algorithm evaluates:</strong></p>



<ul class="wp-block-list">
<li><strong>Query–Answer overlap:</strong> Ensures the model is addressing the question.</li>



<li><strong>Document–Answer overlap:</strong> Checks that the model grounds its answer in retrieved context.</li>



<li>The final score is the average of both signals.</li>
</ul>



<p>While simple, this gives you an interpretable, production-friendly metric that appears directly in Langfuse traces.</p>



<h3 class="wp-block-heading">Hallucination Risk Estimation</h3>



<p>Hallucination risk is implemented in <code data-enlighter-language="python" class="EnlighterJSRAW">evaluate_hallucination_risk()</code> and estimates <strong>how much of the answer is unsupported by the retrieved documents</strong>.</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="56">@observe(name="evaluate_hallucination")
def evaluate_hallucination_risk(retrieved_docs: List[Dict], answer: str) -> float:
    all_doc_words = set()
    for doc in retrieved_docs:
        all_doc_words |= set(doc["content"].lower().split())

    answer_words = set(answer.lower().split())

    grounding_ratio = len(answer_words &amp; all_doc_words) / max(len(answer_words), 1)
    hallucination_risk = 1.0 - grounding_ratio
</pre>



<p><strong>Interpretation:</strong></p>



<ul class="wp-block-list">
<li>If every word in the answer also appears in the retrieved context, the <strong>hallucination risk is low</strong>.</li>



<li>If the answer relies heavily on tokens not present in any source document, the <strong>hallucination risk is high</strong>.</li>
</ul>



<p>Langfuse logs this as:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="57">langfuse_context.score_current_observation(
    name="hallucination_risk",
    value=hallucination_risk,
    comment="Ungrounded token ratio"
)
</pre>



<p>This trace node helps you immediately visualize how close an answer is to going “off the rails.”</p>



<h3 class="wp-block-heading">Overall Quality Metric</h3>



<p>Your master scoring function <code data-enlighter-language="python" class="EnlighterJSRAW">evaluate_rag_output()</code> combines the 2 metrics:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="58">overall_quality = (relevancy_score + (1.0 - hallucination_risk)) / 2.0
</pre>



<p>This means:</p>



<ul class="wp-block-list">
<li>high relevancy and low hallucination risk indicate <strong>high quality</strong></li>



<li>low relevancy and high hallucination risk indicate <strong>low quality</strong></li>
</ul>



<p><code data-enlighter-language="python" class="EnlighterJSRAW">config.yaml</code> defines the minimum acceptable score:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="yaml" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="59">evaluation:
  min_quality_score: 0.6
</pre>



<p>Then:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="60">passed = overall_quality >= min_quality
</pre>



<p>This allows your downstream systems to treat RAG evaluation like a <strong>gatekeeper</strong>:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">passed=True</code>: show the answer to the user, store it, or send it downstream</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">passed=False</code>: trigger fallback mode, self-reflection, or agentic repair workflows</li>
</ul>



<p>All 3 metrics (<code data-enlighter-language="python" class="EnlighterJSRAW">relevancy</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">hallucination_risk</code>, and <code data-enlighter-language="python" class="EnlighterJSRAW">overall_quality</code>) are scored and attached to the current Langfuse span.</p>



<h3 class="wp-block-heading">How Langfuse Displays Evaluation and Scoring Nodes</h3>



<p>The evaluation subsystem produces one of the most informative trace segments in Langfuse. A typical structure:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="61">rag_pipeline
 ├── retrieve_documents
 ├── llm_completion
 └── evaluate_rag_output
       ├── relevancy (score)
       ├── hallucination_risk (score)
       └── overall_quality (score)
</pre>



<p>Each node includes:</p>



<h4 class="wp-block-heading">Inputs</h4>



<ul class="wp-block-list">
<li>user query</li>



<li>document count</li>



<li>answer length</li>
</ul>



<h4 class="wp-block-heading">Outputs</h4>



<ul class="wp-block-list">
<li>numeric scores</li>



<li>pass-or-fail status</li>



<li>evaluation metadata</li>
</ul>



<h4 class="wp-block-heading">Visual Benefits Inside Langfuse</h4>



<ul class="wp-block-list">
<li><strong>Color-coded score nodes</strong> help you spot failing RAG runs instantly.</li>



<li><strong>Timeline alignment</strong> shows you evaluation overhead and where bottlenecks appear.</li>



<li><strong>Nested spans</strong> reveal exactly which part of the pipeline caused a failure.</li>



<li><strong>JSON detail view</strong> allows exporting evaluation metrics for dashboards or analytics.</li>
</ul>



<p>With these evaluation spans, your Langfuse trace evolves from a simple log viewer into a <strong>quality monitoring dashboard for your RAG system</strong>.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Running-Inspecting-RAG-Pipeline-End-to-End"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Running-Inspecting-RAG-Pipeline-End-to-End">Running and Inspecting the RAG Pipeline End-to-End</a></h2>



<h3 class="wp-block-heading">Running rag_pipeline.py End-to-End</h3>



<p>With all components in place (the retriever, the traced LLM wrapper, and the evaluation module), you can now run the complete production-grade RAG pipeline. The script <code data-enlighter-language="python" class="EnlighterJSRAW">rag_pipeline.py</code> orchestrates the entire flow:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="62">python src/rag_pipeline.py
</pre>



<p>This script loads documents, indexes them, retrieves the <code data-enlighter-language="python" class="EnlighterJSRAW">top_k</code> matches, builds a contextual prompt, generates an answer using <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code>, evaluates the output quality, and logs every step into <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code>. If all services are running (<code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> UI on port <code data-enlighter-language="python" class="EnlighterJSRAW">3000</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> on port <code data-enlighter-language="python" class="EnlighterJSRAW">8000</code>), the run completes with a final console message showing the trace URL:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="63">🔍 View trace: http://localhost:3000/trace/&lt;trace_id>
</pre>



<p>This makes it trivial to jump directly into the corresponding trace in your observability dashboard and inspect the entire RAG execution, including nested spans and evaluation scores.</p>



<h3 class="wp-block-heading">Example Trace Outputs</h3>



<p>A successful run produces a hierarchical trace structure in Langfuse that mirrors your pipeline architecture. A typical RAG trace looks like this:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="64">rag_pipeline
 ├── retrieve_documents
 │     ├── embed_text
 │     └── FAISS search metadata
 ├── llm_completion
 │     ├── request payload
 │     ├── response payload
 │     └── token usage
 └── evaluate_rag_output
        ├── relevancy (score)
        ├── hallucination_risk (score)
        └── overall_quality (score)
</pre>



<p>What you will see in the trace:</p>



<h4 class="wp-block-heading">Retrieval metadata</h4>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">top_k</code> value</li>



<li>query text</li>



<li>relevance scores</li>



<li>FAISS distances</li>



<li>document preview snippets</li>
</ul>



<h4 class="wp-block-heading">LLM generation metadata</h4>



<ul class="wp-block-list">
<li>system and user messages used for prompting</li>



<li>token usage breakdown</li>



<li>retry attempts</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> latency and response time</li>
</ul>



<h4 class="wp-block-heading">Evaluation metrics</h4>



<ul class="wp-block-list">
<li>numeric relevancy score</li>



<li>hallucination risk estimation</li>



<li>overall quality score</li>



<li>pass-or-fail decision using the threshold in <code data-enlighter-language="python" class="EnlighterJSRAW">config.yaml</code></li>
</ul>



<p>Together, these give you a full audit trail for each RAG run, which is perfect for debugging, monitoring, or offline analysis.</p>



<h3 class="wp-block-heading">Debugging with the Langfuse UI (Span Trees, Scores, and Metadata)</h3>



<p>Langfuse is not just a logger; it acts as a <strong>visual debugger for your entire RAG system</strong>. When you open the trace URL, you will see several powerful debugging tools:</p>



<h4 class="wp-block-heading">Span Tree View</h4>



<p>This hierarchical tree shows the exact execution order and timing of:</p>



<ul class="wp-block-list">
<li>retrieval</li>



<li>embedding</li>



<li>indexing</li>



<li>LLM generation</li>



<li>evaluation steps</li>
</ul>



<p>It helps you detect:</p>



<ul class="wp-block-list">
<li>slow spans (bottlenecks)</li>



<li>failed or retried LLM calls</li>



<li>missing or empty retrieval results</li>
</ul>



<h4 class="wp-block-heading">Scoring Nodes</h4>



<p>Evaluation scores appear as structured nodes:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">relevancy</code></li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">hallucination_risk</code></li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">overall_quality</code></li>
</ul>



<p>Langfuse color-codes these (green, yellow, and red), making it instantly clear when a RAG answer is degrading in quality.</p>



<h4 class="wp-block-heading">Metadata Panels</h4>



<p>Each span contains:</p>



<ul class="wp-block-list">
<li>input and output payloads</li>



<li>token counts</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">FAISS</code> distances</li>



<li>processed document counts</li>



<li>retry counts</li>



<li>trace-level summaries</li>
</ul>



<p>This makes debugging extremely fast:</p>



<ul class="wp-block-list">
<li>Wrong documents retrieved? Inspect retrieval span input and output.</li>



<li>Unexpected LLM answer? Check the exact prompt in the generation span.</li>



<li>Poor evaluation scores? Expand the scoring spans to see the raw metrics.</li>
</ul>



<p>Because traces are stored locally in your self-hosted Langfuse instance, you get complete transparency without relying on cloud telemetry.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Viewing-RAG-Traces-Spans-Scores-Langfuse"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Viewing-RAG-Traces-Spans-Scores-Langfuse">Viewing RAG Traces, Spans, and Scores in Langfuse</a></h2>



<p>Once your RAG pipeline is running end-to-end, the real magic happens inside Langfuse. This is where retrieval steps, LLM calls, evaluation metrics, token usage, and pipeline-level metadata condense into a single, navigable trace. In this section, you will learn how to interpret that trace, span by span, so you can debug, understand, and improve RAG behavior with production-grade visibility.</p>



<h3 class="wp-block-heading">Understanding Hierarchical Spans (Retrieve → Prompt → Generate → Evaluate)</h3>



<p>Langfuse automatically groups each step of your <code data-enlighter-language="python" class="EnlighterJSRAW">rag_pipeline</code> into a nested hierarchy of spans. A typical RAG trace looks like this:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">rag_pipeline</code> (root trace)
<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">retrieve_documents</code>
<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">embed_text</code></li>
</ul>
</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">llm_completion</code></li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">evaluate_rag_output</code>
<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">evaluate_relevancy</code></li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">evaluate_hallucination</code></li>
</ul>
</li>



<li>scoring nodes (<code data-enlighter-language="python" class="EnlighterJSRAW">overall_quality</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">relevancy</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">hallucination_risk</code>)</li>
</ul>
</li>
</ul>



<p>This hierarchy corresponds directly to your source code:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">TracedRetriever.retrieve()</code>: retrieval span</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">TracedLLMClient.complete()</code>: generation span</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">evaluate_rag_output()</code>: evaluation span</li>
</ul>


<div class="wp-block-image">
<figure class="aligncenter size-full"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-58.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="888" height="696" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-58.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-54288" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-58.png?size=126x99&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-58-300x235.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-58.png?size=378x296&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-58.png?size=504x395&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-58.png?size=630x494&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-58-768x602.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-58.png?lossy=2&amp;strip=1&amp;webp=1 888w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 1:</strong> Hierarchical spans created automatically by the RAG pipeline. Notice Retrieve → Generate → Evaluate structure.</figcaption></figure></div>


<h4 class="wp-block-heading">How to Navigate the Span Tree</h4>



<p>Each span reveals:</p>



<ul class="wp-block-list">
<li>execution time (critical for latency bottlenecks)</li>



<li>inputs and outputs captured via <code data-enlighter-language="python" class="EnlighterJSRAW">langfuse_context.update_current_observation()</code></li>



<li>whether nested operations (e.g., embedding calls) executed successfully</li>



<li>metadata from FAISS search, document previews, and query text</li>
</ul>



<p>Langfuse becomes a timeline and debugger for your RAG system.</p>



<h3 class="wp-block-heading">Inspecting Retrieval: Document Scores and Previews</h3>



<p>The retrieval stage is your first major insight point. The <code data-enlighter-language="python" class="EnlighterJSRAW">retrieve_documents</code> span logs:</p>



<ul class="wp-block-list">
<li>the <strong>query</strong> that was embedded</li>



<li>the <code data-enlighter-language="python" class="EnlighterJSRAW">top_k</code> used for <code data-enlighter-language="python" class="EnlighterJSRAW">FAISS</code> search</li>



<li><strong>distance scores</strong> returned</li>



<li><strong>converted relevancy scores</strong> (your <code data-enlighter-language="python" class="EnlighterJSRAW">1/(1+d)</code> heuristic)</li>



<li><strong>ranked documents</strong> with text previews</li>
</ul>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-59-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="535" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-59-1024x535.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-54290" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-59.png?size=126x66&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-59-300x157.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-59.png?size=378x197&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-59.png?size=504x263&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-59.png?size=630x329&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-59-768x401.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-59-1024x535.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-59-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-59-1536x803.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 2:</strong> Retrieval span showing FAISS scores, document previews, and ranked results.</figcaption></figure></div>


<h4 class="wp-block-heading">What to Look For</h4>



<ul class="wp-block-list">
<li><strong>High distances and low scores:</strong> embedding mismatch or poor docs</li>



<li><strong>Same document repeatedly ranking #1:</strong> indexing error</li>



<li><strong>Empty results:</strong> index not built or FAISS dimension mismatch</li>
</ul>



<h4 class="wp-block-heading">Embedded Text Span</h4>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">embed_text</code> span shows a preview of the text that was embedded. Use it to:</p>



<ul class="wp-block-list">
<li>inspect embeddings length</li>



<li>detect empty or malformed documents</li>



<li>verify embeddings model configuration</li>
</ul>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-60-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="231" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-60-1024x231.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-54293" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-60.png?size=126x28&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-60-300x68.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-60.png?size=378x85&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-60.png?size=504x114&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-60.png?size=630x142&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-60-768x173.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-60-1024x231.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-60-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-60-1536x346.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 3:</strong> Embedding span showing text preview and output vector dimension.</figcaption></figure></div>


<h3 class="wp-block-heading">Inspecting Prompt Construction (Optional View)</h3>



<p>Prompt creation happens between retrieval and generation. Although you do not create a separate Langfuse span for this step, the <strong>constructed prompt appears inside the LLM span input</strong>.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-61-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="568" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-61-1024x568.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-54295" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-61.png?size=126x70&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-61-300x166.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-61.png?size=378x210&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-61.png?size=504x280&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-61.png?size=630x349&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-61-768x426.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-61-1024x568.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-61-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-61-1536x851.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 4:</strong> Prompt passed to vLLM including context numbered <code>[1]</code>, <code>[2]</code>, and <code>[3]</code>.</figcaption></figure></div>


<p>What you verify here:</p>



<ul class="wp-block-list">
<li>context formatting</li>



<li>numbering</li>



<li>whitespace</li>



<li>hallucination-reducing system instructions</li>
</ul>



<p>This becomes essential when debugging wrong answers.</p>



<h3 class="wp-block-heading">Token Usage and Generation Metadata</h3>



<p>Inside the <code data-enlighter-language="python" class="EnlighterJSRAW">llm_completion</code> span, Langfuse records:</p>



<ul class="wp-block-list">
<li><strong>input tokens</strong></li>



<li><strong>output tokens</strong></li>



<li><strong>total tokens</strong></li>



<li><strong>retry count</strong></li>



<li><strong>model name</strong></li>



<li><strong>latency breakdown</strong></li>



<li><strong>response content</strong></li>
</ul>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-62-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="555" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-62-1024x555.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-54298" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-62.png?size=126x68&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-62-300x163.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-62.png?size=378x205&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-62.png?size=504x273&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-62.png?size=630x341&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-62-768x416.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-62-1024x555.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-62-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-62-1536x833.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 5:</strong> LLM span showing the request payload, response payload, token usage, retry-attempt metadata, and timing.</figcaption></figure></div>


<h4 class="wp-block-heading">What to Look For</h4>



<ul class="wp-block-list">
<li><strong>Unusually high input tokens:</strong> prompt too large</li>



<li><strong>High output tokens:</strong> model drifting or verbose</li>



<li><strong>Repeated retries:</strong> <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> throughput issue</li>



<li><strong>Very long latency:</strong> GPU under-provisioned or CPU fallback</li>
</ul>



<h3 class="wp-block-heading">Evaluation Scoring Nodes (Relevancy, Hallucination, and Overall Quality)</h3>



<p>Your evaluation functions (<code data-enlighter-language="python" class="EnlighterJSRAW">evaluate_relevancy</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">evaluate_hallucination_risk</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">evaluate_rag_output</code>) create <strong>3 scoring nodes</strong> inside Langfuse:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">relevancy</code></li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">hallucination_risk</code></li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">overall_quality</code></li>
</ul>



<p>These appear alongside the evaluation span.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-63-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="369" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-63-1024x369.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-54300" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-63.png?size=126x45&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-63-300x108.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-63.png?size=378x136&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-63.png?size=504x182&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-63.png?size=630x227&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-63-768x276.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-63-1024x369.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-63-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-63-1536x553.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 6:</strong> Langfuse scoring nodes: relevancy, hallucination risk, and overall quality.</figcaption></figure></div>


<h4 class="wp-block-heading">How to Interpret Them</h4>



<ul class="wp-block-list">
<li><strong>High relevancy and low hallucination risk:</strong> high <code data-enlighter-language="python" class="EnlighterJSRAW">overall_quality</code></li>



<li><strong>Low relevancy and high hallucination risk:</strong> RAG failure</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">passed=True</code> means the response met the <code data-enlighter-language="python" class="EnlighterJSRAW">min_quality_score</code> threshold in <code data-enlighter-language="python" class="EnlighterJSRAW">config.yaml</code></li>
</ul>



<h4 class="wp-block-heading">Debugging Failures</h4>



<ul class="wp-block-list">
<li><strong>Relevancy low:</strong> retrieval needs improvement</li>



<li><strong>Hallucination high:</strong> prompt needs grounding</li>



<li><strong>Both poor (low relevancy and high hallucination risk):</strong> LLM ignoring context, bad retrieval, or noisy docs</li>
</ul>



<h3 class="wp-block-heading">Visual Timeline and Performance Profiling</h3>



<p>The timeline view shows exact timings:</p>



<ul class="wp-block-list">
<li>embedding</li>



<li>retrieval</li>



<li>prompt construction</li>



<li>LLM generation</li>



<li>evaluation</li>
</ul>



<p>This allows profiling end-to-end latency.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-64-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="182" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-64-1024x182.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-54302" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-64.png?size=126x22&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-64-300x53.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-64.png?size=378x67&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-64.png?size=504x90&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-64.png?size=630x112&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-64-768x137.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-64-1024x182.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-64-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-64-1536x273.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 7:</strong> Timeline visualization showing latency distribution across RAG stages, including embedding, retrieval, LLM generation, and evaluation.</figcaption></figure></div>


<h3 class="wp-block-heading">How Langfuse Helps Production Debugging</h3>



<p>Langfuse tracing helps answer real production questions:</p>



<h4 class="wp-block-heading">“Why was this answer wrong?”</h4>



<p>Open the evaluation spans, review the hallucination score, inspect the prompt, and then inspect the retrieved documents.</p>



<h4 class="wp-block-heading">“Which part is slowing down?”</h4>



<p>Open the timeline and locate the bottleneck, which is often embeddings or the LLM.</p>



<h4 class="wp-block-heading">“Did the LLM actually use the retrieved documents?”</h4>



<p>Compare:</p>



<ul class="wp-block-list">
<li>retrieval previews</li>



<li>answer keywords</li>



<li>relevancy score</li>
</ul>



<h4 class="wp-block-heading">“Why did this query fail?”</h4>



<p>The trace will show:</p>



<ul class="wp-block-list">
<li>empty index</li>



<li>retries</li>



<li>exceptions</li>



<li>malformed inputs</li>



<li>missing environment variables</li>
</ul>



<p>In production, this becomes indispensable.</p>



<hr class="wp-block-separator has-alpha-channel-opacity"/>



<div id="pitch" style="padding: 40px; width: 100%; background-color: #F4F6FA;">
	<h3>What's next? We recommend <a target="_blank" href="https://pyimagesearch.com/pyimagesearch-university/?utm_source=blogPost&utm_medium=bottomBanner&utm_campaign=What%27s%20next%3F%20I%20recommend">PyImageSearch University</a>.</h3>

	<script src="https://fast.wistia.com/embed/medias/kno0cmko2z.jsonp" async></script><script src="https://fast.wistia.com/assets/external/E-v1.js" async></script><div class="wistia_responsive_padding" style="padding:56.25% 0 0 0;position:relative;"><div class="wistia_responsive_wrapper" style="height:100%;left:0;position:absolute;top:0;width:100%;"><div class="wistia_embed wistia_async_kno0cmko2z videoFoam=true" style="height:100%;position:relative;width:100%"><div class="wistia_swatch" style="height:100%;left:0;opacity:0;overflow:hidden;position:absolute;top:0;transition:opacity 200ms;width:100%;"><img decoding="async" src="https://fast.wistia.com/embed/medias/kno0cmko2z/swatch" style="filter:blur(5px);height:100%;object-fit:contain;width:100%;" alt="" aria-hidden="true" onload="this.parentNode.style.opacity=1;" /></div></div></div></div>

	<div style="margin-top: 32px; margin-bottom: 32px; ">
		<strong>Course information:</strong><br/>
		86+ total classes • 115+ hours of on-demand code walkthrough videos • Last updated: June 2026<br/>
		<span style="color: #169FE6;">★★★★★</span> 4.84 (128 Ratings) • 16,000+ Students Enrolled
	</div>

	<p><strong>I strongly believe that if you had the right teacher you could <em>master</em> computer vision and deep learning.</strong></p>

	<p>Do you think learning computer vision and deep learning has to be time-consuming, overwhelming, and complicated? Or has to involve complex mathematics and equations? Or requires a degree in computer science?</p>

	<p>That’s <em>not</em> the case.</p>

	<p>All you need to master computer vision and deep learning is for someone to explain things to you in <em>simple, intuitive</em> terms. <em>And that’s exactly what I do</em>. My mission is to change education and how complex Artificial Intelligence topics are taught.</p>

	<p>If you're serious about learning computer vision, your next stop should be PyImageSearch University, the most comprehensive computer vision, deep learning, and OpenCV course online today. Here you’ll learn how to <em>successfully</em> and <em>confidently</em> apply computer vision to your work, research, and projects. Join me in computer vision mastery.</p>

	<p><strong>Inside PyImageSearch University you'll find:</strong></p>

	<ul style="margin-left: 0px;">
		<li style="list-style: none;">&check; <strong>86+ courses</strong> on essential computer vision, deep learning, and OpenCV topics</li>
		<li style="list-style: none;">&check; <strong>86 Certificates</strong> of Completion</li>
		<li style="list-style: none;">&check; <strong>115+ hours</strong> of on-demand video</li>
		<li style="list-style: none;">&check; <strong>Brand new courses released <em>regularly</em></strong>, ensuring you can keep up with state-of-the-art techniques</li>
		<li style="list-style: none;">&check; <strong>Pre-configured Jupyter Notebooks in Google Colab</strong></li>
		<li style="list-style: none;">&check; Run all code examples in your web browser — works on Windows, macOS, and Linux (no dev environment configuration required!)</li>
		<li style="list-style: none;">&check; Access to <strong>centralized code repos for <em>all</em> 540+ tutorials</strong> on PyImageSearch</li>
		<li style="list-style: none;">&check; <strong>Easy one-click downloads</strong> for code, datasets, pre-trained models, etc.</li>
		<li style="list-style: none;">&check; <strong>Access</strong> on mobile, laptop, desktop, etc.</li>
	</ul>

	<p style="text-align: center;">
		<a target="_blank" class="button link" href="https://pyimagesearch.com/pyimagesearch-university/?utm_source=blogPost&utm_medium=bottomBanner&utm_campaign=What%27s%20next%3F%20I%20recommend" style="background-color: #6DC713; border-bottom: none;">Click here to join PyImageSearch University</a>
	</p>
</div>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Summary"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Summary">Summary</a></h2>



<p>In this lesson, you built a fully instrumented, production-grade RAG pipeline and learned how observability transforms retrieval-augmented systems from “black boxes” into transparent, debuggable, measurable workflows. You started by setting up the core infrastructure (self-hosted Langfuse, vLLM for fast local inference, and FAISS and SentenceTransformers for efficient retrieval) and then wired all these components together using a clean, traceable architecture.</p>



<p>With tracing enabled end-to-end, every stage of your RAG pipeline became inspectable: document embedding, FAISS indexing, retrieval scoring, prompt construction, LLM generation, and quality evaluation. You saw how Langfuse automatically visualizes these steps as nested spans, how it captures token usage and metadata for LLM calls, and how your evaluation functions produce relevancy, hallucination risk, and overall-quality scores directly inside the trace.</p>



<p>By running the pipeline and examining the traces, you learned how to debug retrieval quality, diagnose prompt-related issues, inspect model behavior, and identify performance bottlenecks using Langfuse’s hierarchical tree view and timeline profiler. The final result is an observability-first RAG stack: fully local, fast, and transparent, designed exactly the way production systems must operate.</p>



<p>This foundation prepares you for upcoming lessons, where we extend the same tracing principles to <strong>multi-step agents</strong>, adding reasoning chains, intent analysis, and multi-span agent workflows on top of the RAG engine you constructed here.</p>



<h3 class="wp-block-heading">Citation Information</h3>



<p><strong>Singh, V.</strong> “RAG Observability with Langfuse, vLLM, and FAISS,” <em>PyImageSearch</em>, S. Huot, A. Sharma, and P. Thakur, eds., 2026, <a href="https://pyimg.co/g20yk" target="_blank" rel="noreferrer noopener">https://pyimg.co/g20yk</a></p>



<pre class="EnlighterJSRAW" data-enlighter-language="raw" data-enlighter-theme="classic" data-enlighter-highlight="" data-enlighter-linenumbers="false" data-enlighter-lineoffset="" data-enlighter-title="RAG Observability with Langfuse, vLLM, and FAISS" data-enlighter-group="65">@incollection{Singh_2026_rag-observability-langfuse-vllm-faiss,
  author = {Vikram Singh},
  title = {{RAG Observability with Langfuse, vLLM, and FAISS}},
  booktitle = {PyImageSearch},
  editor = {Susan Huot and Aditya Sharma and Piyush Thakur},
  year = {2026},
  url = {https://pyimg.co/g20yk},
}
</pre>



<hr class="wp-block-separator has-alpha-channel-opacity"/>



<p><strong>To download the source code to this post (and be notified when future tutorials are published here on PyImageSearch), </strong><em><strong>simply enter your email address in the form below!</strong></em></p>



<div id="download-the-code" class="post-cta-wrap">
<div class="gpd-post-cta">
	<div class="gpd-post-cta-content">
		

			<div class="gpd-post-cta-top">
				<div class="gpd-post-cta-top-image"><img decoding="async" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?lossy=2&strip=1&webp=1" alt="" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?lossy=2&strip=1&webp=1 410w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?size=126x174&lossy=2&strip=1&webp=1 126w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?size=252x348&lossy=2&strip=1&webp=1 252w" sizes="(max-width: 410px) 100vw, 410px" /></div>
				
				<div class="gpd-post-cta-top-title"><h4>Download the Source Code and FREE 17-page Resource Guide</h4></div>
				<div class="gpd-post-cta-top-desc"><p>Enter your email address below to get a .zip of the code and a <strong>FREE 17-page Resource Guide on Computer Vision, OpenCV, and Deep Learning.</strong> Inside you'll find my hand-picked tutorials, books, courses, and libraries to help you master CV and DL!</p></div>


			</div>

			<div class="gpd-post-cta-bottom">
				<form id="footer-cta-code" class="footer-cta" action="https://www.getdrip.com/forms/4130035/submissions" method="post" target="_blank" data-drip-embedded-form="4130035">
					<input name="fields[email]" type="email" value="" placeholder="Your email address" class="form-control" />

					<button type="submit">Download the code!</button>

					<div style="display: none;" aria-hidden="true"><label for="website">Website</label><br /><input type="text" id="website" name="website" tabindex="-1" autocomplete="false" value="" /></div>
				</form>
			</div>


		
	</div>

</div>
</div>
<p>The post <a rel="nofollow" href="https://pyimagesearch.com/2026/06/15/rag-observability-with-langfuse-vllm-and-faiss/">RAG Observability with Langfuse, vLLM, and FAISS</a> appeared first on <a rel="nofollow" href="https://pyimagesearch.com">PyImageSearch</a>.</p>
]]></content:encoded>
					
		
		
			</item>
		<item>
		<title>Run an Apache Airflow DAG with Docker Compose and PostgreSQL</title>
		<link>https://pyimagesearch.com/2026/06/08/run-an-apache-airflow-dag-with-docker-compose-and-postgresql/</link>
		
		<dc:creator><![CDATA[Vikram Singh]]></dc:creator>
		<pubDate>Mon, 08 Jun 2026 12:45:00 +0000</pubDate>
				<category><![CDATA[Apache Airflow]]></category>
		<category><![CDATA[Docker]]></category>
		<category><![CDATA[MLOps]]></category>
		<category><![CDATA[PostgreSQL]]></category>
		<category><![CDATA[Tutorial]]></category>
		<category><![CDATA[airflow dag]]></category>
		<category><![CDATA[airflow scheduler]]></category>
		<category><![CDATA[airflow webserver]]></category>
		<category><![CDATA[apache airflow]]></category>
		<category><![CDATA[containerization]]></category>
		<category><![CDATA[data engineering]]></category>
		<category><![CDATA[data pipelines]]></category>
		<category><![CDATA[docker]]></category>
		<category><![CDATA[docker compose]]></category>
		<category><![CDATA[docker volumes]]></category>
		<category><![CDATA[document ingestion]]></category>
		<category><![CDATA[fastapi]]></category>
		<category><![CDATA[mlops]]></category>
		<category><![CDATA[postgresql]]></category>
		<category><![CDATA[pypdf]]></category>
		<category><![CDATA[rag]]></category>
		<category><![CDATA[retrieval augmented generation]]></category>
		<category><![CDATA[sqlalchemy]]></category>
		<category><![CDATA[tutorial]]></category>
		<category><![CDATA[workflow orchestration]]></category>
		<guid isPermaLink="false">https://pyimagesearch.com/?p=54094</guid>

					<description><![CDATA[<p>Table of Contents Run an Apache Airflow DAG with Docker Compose and PostgreSQL Project Structure PDF Parsing and Text Chunking Logic for Airflow DAG Docker Orchestration and Runtime for Airflow and FastAPI Running the Apache Airflow and FastAPI Document Ingestion&#8230;</p>
<p>The post <a rel="nofollow" href="https://pyimagesearch.com/2026/06/08/run-an-apache-airflow-dag-with-docker-compose-and-postgresql/">Run an Apache Airflow DAG with Docker Compose and PostgreSQL</a> appeared first on <a rel="nofollow" href="https://pyimagesearch.com">PyImageSearch</a>.</p>
]]></description>
										<content:encoded><![CDATA[
<hr class="wp-block-separator has-alpha-channel-opacity" id="TOC"/>


<div class="yoast-breadcrumbs"><span><span><a href="https://pyimagesearch.com/">Home</a></span></span></div>


<div class="toc">
<hr class="TOC"/>
<p class="has-large-font-size"><strong>Table of Contents</strong></p>
<ul>
    <li id="TOC-h1-Run-Apache-Airflow-DAG-Docker-Compose-PostgreSQL"><a rel="noopener" target="_blank" href="#h1-Run-Apache-Airflow-DAG-Docker-Compose-PostgreSQL">Run an Apache Airflow DAG with Docker Compose and PostgreSQL</a></li>

    <li id="TOC-h2-Project-Structure"><a rel="noopener" target="_blank" href="#h2-Project-Structure">Project Structure</a></li>

    <li id="TOC-h2-PDF-Parsing-Text-Chunking-Logic-Airflow-DAG"><a rel="noopener" target="_blank" href="#h2-PDF-Parsing-Text-Chunking-Logic-Airflow-DAG">PDF Parsing and Text Chunking Logic for Airflow DAG</a></li>

    <li id="TOC-h2-Docker-Orchestration-Runtime-Airflow-FastAPI"><a rel="noopener" target="_blank" href="#h2-Docker-Orchestration-Runtime-Airflow-FastAPI">Docker Orchestration and Runtime for Airflow and FastAPI</a></li>

    <li id="TOC-h2-Running-Apache-Airflow-FastAPI-Document-Ingestion-Pipeline"><a rel="noopener" target="_blank" href="#h2-Running-Apache-Airflow-FastAPI-Document-Ingestion-Pipeline">Running the Apache Airflow and FastAPI Document Ingestion Pipeline</a></li>

    <li id="TOC-h2-Triggering-Monitoring-Apache-Airflow-DAGs"><a rel="noopener" target="_blank" href="#h2-Triggering-Monitoring-Apache-Airflow-DAGs">Triggering and Monitoring Apache Airflow DAGs</a></li>

    <li id="TOC-h2-Verifying-Document-Chunk-Data-PostgreSQL"><a rel="noopener" target="_blank" href="#h2-Verifying-Document-Chunk-Data-PostgreSQL">Verifying Document and Chunk Data in PostgreSQL</a></li>

    <li id="TOC-h2-Error-Handling-Failure-Recovery-Airflow-Pipeline"><a rel="noopener" target="_blank" href="#h2-Error-Handling-Failure-Recovery-Airflow-Pipeline">Error Handling and Failure Recovery in Airflow Pipeline</a></li>

    <li id="TOC-h2-Design-Principles-Idempotency-Observability-Reproducibility-Airflow-Pipelines"><a rel="noopener" target="_blank" href="#h2-Design-Principles-Idempotency-Observability-Reproducibility-Airflow-Pipelines">Design Principles: Idempotency, Observability, and Reproducibility in Airflow Pipelines</a></li>

    <li id="TOC-h2-Limits-Apache-Airflow-Machine-Learning-Workloads"><a rel="noopener" target="_blank" href="#h2-Limits-Apache-Airflow-Machine-Learning-Workloads">Limits of Apache Airflow for Machine Learning Workloads</a></li>

    <li id="TOC-h2-Summary"><a rel="noopener" target="_blank" href="#h2-Summary">Summary</a></li>
</ul>
</div>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h1-Run-Apache-Airflow-DAG-Docker-Compose-PostgreSQL"/>



<h2 class="wp-block-heading"><a href="#TOC-h1-Run-Apache-Airflow-DAG-Docker-Compose-PostgreSQL">Run an Apache Airflow DAG with Docker Compose and PostgreSQL</a></h2>



<p>In Lesson 1, we designed a production-grade document ingestion pipeline using FastAPI, Apache Airflow, and PostgreSQL. We defined the data model, built an idempotent 5-task DAG, structured our project for separation of concerns, and established the architectural principles that make the system reliable.</p>


<div class="wp-block-image">
<figure class="aligncenter size-full"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/run-apache-airflow-dag-docker-compose-postgresql-featured.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="940" height="780" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/run-apache-airflow-dag-docker-compose-postgresql-featured.png?lossy=2&strip=1&webp=1" alt="run-apache-airflow-dag-docker-compose-postgresql-featured.png" class="wp-image-54217" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/run-apache-airflow-dag-docker-compose-postgresql-featured.png?size=126x105&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/run-apache-airflow-dag-docker-compose-postgresql-featured-300x249.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/run-apache-airflow-dag-docker-compose-postgresql-featured.png?size=378x314&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/run-apache-airflow-dag-docker-compose-postgresql-featured.png?size=504x418&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/run-apache-airflow-dag-docker-compose-postgresql-featured.png?size=630x523&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/run-apache-airflow-dag-docker-compose-postgresql-featured-768x637.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/run-apache-airflow-dag-docker-compose-postgresql-featured.png?lossy=2&amp;strip=1&amp;webp=1 940w" sizes="(max-width: 630px) 100vw, 630px" /></a></figure></div>


<p>Now, in Lesson 2, we bring that architecture to life.</p>



<p>In this lesson, you will operationalize the ingestion pipeline by running it inside Docker, uploading real documents, triggering DAG runs, inspecting logs, and validating results directly in PostgreSQL. You will also explore failure scenarios, observability patterns, and the practical limits of using Airflow for ML-style workloads.</p>



<p>By the end of this lesson, you will not only understand how the pipeline works, but how to run it, debug it, monitor it, and reason about its behavior in production.</p>



<p>This lesson is the last in a 2-part series on <strong>building production-grade document ingestion pipelines for RAG systems</strong>:</p>



<ol class="wp-block-list">
<li><em><strong><a href="https://pyimg.co/8b2ey" target="_blank" rel="noreferrer noopener">Apache Airflow Document Ingestion Pipeline for RAG Systems</a></strong></em></li>



<li><em><strong><a href="https://pyimg.co/kxc7e" target="_blank" rel="noreferrer noopener">Run an Apache Airflow DAG with Docker Compose and PostgreSQL</a></strong></em><em><strong>  (this tutorial)</strong></em></li>
</ol>



<p><strong>To learn how to deploy, execute, monitor, and validate a production-ready ingestion pipeline with Apache Airflow, </strong><em><strong>just keep reading.</strong></em></p>



<div id="pyi-source-code-block" class="source-code-wrap"><div class="gpd-source-code">
    <div class="gpd-source-code-content">
        <img decoding="async" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/source-code-icon.png?lossy=2&strip=1&webp=1" alt="">
        <h4>Looking for the source code to this post?</h4>
                    <a href="#download-the-code" class="pyis-cta-modal-open-modal">Jump Right To The Downloads Section <svg class="svg-icon arrow-right" width="12" height="12" aria-hidden="true" role="img" focusable="false" viewBox="0 0 14 14" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M6.8125 0.1875C6.875 0.125 6.96875 0.09375 7.09375 0.09375C7.1875 0.09375 7.28125 0.125 7.34375 0.1875L13.875 6.75C13.9375 6.8125 14 6.90625 14 7C14 7.125 13.9375 7.1875 13.875 7.25L7.34375 13.8125C7.28125 13.875 7.1875 13.9062 7.09375 13.9062C6.96875 13.9062 6.875 13.875 6.8125 13.8125L6.1875 13.1875C6.125 13.125 6.09375 13.0625 6.09375 12.9375C6.09375 12.8438 6.125 12.75 6.1875 12.6562L11.0312 7.8125H0.375C0.25 7.8125 0.15625 7.78125 0.09375 7.71875C0.03125 7.65625 0 7.5625 0 7.4375V6.5625C0 6.46875 0.03125 6.375 0.09375 6.3125C0.15625 6.25 0.25 6.1875 0.375 6.1875H11.0312L6.1875 1.34375C6.125 1.28125 6.09375 1.1875 6.09375 1.0625C6.09375 0.96875 6.125 0.875 6.1875 0.8125L6.8125 0.1875Z" fill="#169FE6"></path></svg></a>
            </div>
</div>
</div>



<hr class="wp-block-separator has-alpha-channel-opacity"/>



<p>Would you like immediate access to 3,457 images curated and labeled with hand gestures to train, explore, and experiment with &#8230; for free? Head over to <a href="https://universe.roboflow.com/isl/az-6mqow?ref=pyimagesearch" target="_blank" rel="noreferrer noopener">Roboflow</a> and get a free account to grab these hand gesture images. </p>



<hr class="wp-block-separator has-alpha-channel-opacity"/>



<!-- wp:paragraph -->
<h3>Need Help Configuring Your Development Environment?</h3>
<!-- /wp:paragraph -->

<!-- wp:image {"align":"center","id":18137,"sizeSlug":"large","linkDestination":"custom"} -->
<figure class="wp-block-image aligncenter size-large"><a href="https://pyimagesearch.com/pyimagesearch-university/" target="_blank" rel="noreferrer noopener"><img decoding="async" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-18137" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?lossy=2&strip=1&webp=1 500w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?size=126x84&lossy=2&strip=1&webp=1 126w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?size=252x168&lossy=2&strip=1&webp=1 252w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?size=378x253&lossy=2&strip=1&webp=1 378w" sizes="(max-width: 500px) 100vw, 500px" /></a><figcaption>Having trouble configuring your development environment? Want access to pre-configured Jupyter Notebooks running on Google Colab? Be sure to join <a href="https://pyimagesearch.com/pyimagesearch-university/" target="_blank" rel="noreferrer noopener" aria-label=" (opens in a new tab)">PyImageSearch University</a> — you will be up and running with this tutorial in a matter of minutes. </figcaption></figure>
<!-- /wp:image -->

<!-- wp:paragraph -->
<p>All that said, are you:</p>
<!-- /wp:paragraph -->

<!-- wp:list -->
<ul><li>Short on time?</li><li>Learning on your employer’s administratively locked system?</li><li>Wanting to skip the hassle of fighting with the command line, package managers, and virtual environments?</li><li><strong>Ready to run the code immediately on your Windows, macOS, or Linux system?</strong></li></ul>
<!-- /wp:list -->

<!-- wp:paragraph -->
<p>Then join <a href="https://pyimagesearch.com/pyimagesearch-university/" target="_blank">PyImageSearch University</a> today!</p>
<!-- /wp:paragraph -->

<!-- wp:paragraph -->
<p><strong>Gain access to Jupyter Notebooks for this tutorial and other PyImageSearch guides pre-configured to run on Google Colab’s ecosystem right in your web browser!</strong> No installation required.</p>
<!-- /wp:paragraph -->

<!-- wp:paragraph -->
<p>And best of all, these Jupyter Notebooks will run on Windows, macOS, and Linux!</p>
<!-- /wp:paragraph -->



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Project-Structure"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Project-Structure">Project Structure</a></h2>



<p>In Lesson 1, we focused on architectural boundaries. Now we look at the same structure through a runtime lens: what runs where, and what each container actually executes.</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Run an Apache Airflow DAG with Docker Compose and PostgreSQL" data-enlighter-group="2">├── airflow_project/        # Orchestration + runtime layer
│   ├── dags/              # Airflow DAG definitions
│   │   └── ingest_documents_dag.py  # Main DAG: PDF → chunks pipeline
│   │
│   ├── ingestion_service/ # FastAPI REST API for file uploads
│   │   ├── __init__.py
│   │   ├── main.py      # Upload endpoint with deduplication
│   │   └── requirements.txt  # FastAPI + Uvicorn dependencies
│   │
│   ├── docker-compose.yml  # Defines Postgres, Airflow, API services
│   ├── Dockerfile               # Airflow container image
│   ├── Dockerfile.service   # FastAPI service container image
│   └── init-db.sh              # PostgreSQL database initialization script
│
├── shared/      # Reusable business logic (no Airflow deps)
│   ├── data_models/ # Pydantic schemas (API validation layer)
│   │   ├── __init__.py
│   │   └── models.py  # Document, Chunk, PipelineRun schemas
│   │
│   ├── parsing/                # Document processing logic
│   │   ├── __init__.py
│   │   ├── pdf_parser.py # PyPDF text extraction (page-level)
│   │   ├── chunker.py    # Sliding window text chunking
│   │   └── deduplication.py # Content hash comparison utilities
│   │
│   ├── storage/            # Database interaction layer
│   │   ├── __init__.py
│   │   ├── database.py     # SQLAlchemy session management
│   │   └── models.py       # ORM models (tables definition)
│   │
│   ├── utils/               # Low-level helpers
│   │   ├── __init__.py
│   │   ├── hashing.py      # SHA-256 file &amp; text hashing
│   │   └── logging.py      # Structured logging utilities
│   │
│   ├── __init__.py
│   └── requirements.txt  # Shared dependencies (SQLAlchemy, PyPDF, etc.)
</pre>



<p>When you run:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Run an Apache Airflow DAG with Docker Compose and PostgreSQL" data-enlighter-group="3">docker compose up --build
</pre>



<p>This structure turns into 5 running services.</p>



<h3 class="wp-block-heading">What Runs Inside Each Container</h3>



<p><strong>Airflow Webserver </strong><strong>and</strong><strong> Scheduler</strong></p>



<ul class="wp-block-list">
<li>Load DAGs from <code data-enlighter-language="python" class="EnlighterJSRAW">dags/</code></li>



<li>Import parsing and storage logic from <code data-enlighter-language="python" class="EnlighterJSRAW">shared/</code></li>



<li>Connect to PostgreSQL</li>



<li>Read uploaded files from the shared volume</li>
</ul>



<p><strong>Ingestion Service</strong></p>



<ul class="wp-block-list">
<li>Runs <code data-enlighter-language="python" class="EnlighterJSRAW">main.py</code></li>



<li>Accepts PDF uploads</li>



<li>Writes files into <code data-enlighter-language="python" class="EnlighterJSRAW">/tmp/ml_orchestration/uploads</code></li>



<li>Inserts rows into PostgreSQL</li>
</ul>



<p><strong>PostgreSQL</strong></p>



<ul class="wp-block-list">
<li>Stores Airflow metadata</li>



<li>Stores <code data-enlighter-language="python" class="EnlighterJSRAW">documents</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">chunks</code>, and <code data-enlighter-language="python" class="EnlighterJSRAW">pipeline_runs</code></li>
</ul>



<h3 class="wp-block-heading">Why shared/ Is Mounted Everywhere</h3>



<p>Both the Airflow containers and the ingestion service mount the <code data-enlighter-language="python" class="EnlighterJSRAW">shared/</code> directory.</p>



<p>This guarantees:</p>



<ul class="wp-block-list">
<li>The API and DAG use identical parsing logic</li>



<li>Hashing behaves consistently</li>



<li>Database models stay synchronized</li>
</ul>



<p>No duplicated code.</p>



<p>No version drift.</p>



<h3 class="wp-block-heading">The Shared Upload Volume</h3>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">/tmp/ml_orchestration/uploads</code> volume is mounted into both:</p>



<ul class="wp-block-list">
<li>ingestion-service</li>



<li>airflow containers</li>
</ul>



<p>This is critical.</p>



<p>The API writes files.</p>



<p>Airflow reads the same files directly from disk.</p>



<p>There is no file transfer, no API polling, and no network hop.</p>



<p>This design eliminates an entire class of distributed file consistency problems.</p>



<h3 class="wp-block-heading">Why This Matters in Practice</h3>



<p>At runtime:</p>



<ul class="wp-block-list">
<li>The API handles fast, user-facing work.</li>



<li>Airflow handles scheduled, asynchronous work.</li>



<li>PostgreSQL persists state.</li>



<li>The shared module ensures consistency.</li>



<li>Docker isolates everything into reproducible containers.</li>
</ul>



<p>Lesson 1 taught you how the system is designed.</p>



<p>Lesson 2 shows you how that design behaves when executed.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-PDF-Parsing-Text-Chunking-Logic-Airflow-DAG"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-PDF-Parsing-Text-Chunking-Logic-Airflow-DAG">PDF Parsing and Text Chunking Logic for Airflow DAG</a></h2>



<p>The DAG calls functions like <code data-enlighter-language="python" class="EnlighterJSRAW">parse_pdf()</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">chunk_text()</code>, but these are not defined in the DAG file. They live in <code data-enlighter-language="python" class="EnlighterJSRAW">shared/parsing/</code> where both Airflow and future Argo Workflows can use them. <strong>Figure 1</strong> shows the chunking pipeline.</p>


<div class="wp-block-image">
<figure class="aligncenter size-full"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-34.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="624" height="340" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-34.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-54140" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-34.png?size=126x69&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-34-300x163.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-34.png?size=378x206&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-34.png?size=504x275&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-34.png?lossy=2&amp;strip=1&amp;webp=1 624w" sizes="(max-width: 624px) 100vw, 624px" /></a><figcaption class="wp-element-caption"><strong>Figure 1:</strong> The text chunking pipeline showing PDF extraction, page concatenation, chunking with overlap, and hash-based deduplication.</figcaption></figure></div>


<p>Here is the PDF parsing code from <code data-enlighter-language="python" class="EnlighterJSRAW">shared/parsing/pdf_parser.py</code>:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Run an Apache Airflow DAG with Docker Compose and PostgreSQL" data-enlighter-group="4">def parse_pdf(file_path: str) -> List[dict]:
    """
    Parse a PDF file and extract text page by page.
    
    Args:
        file_path: Path to PDF file
        
    Returns:
        List of dictionaries with page_number and text
    """
    logger.info(f"Parsing PDF: {file_path}")
    
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"PDF file not found: {file_path}")
    
    try:
        reader = PdfReader(file_path)
        pages = []
        
        for i, page in enumerate(reader.pages):
            text = page.extract_text()
            metadata = reader.metadata or {}
            
            pages.append({
                'page_number': i + 1,
                'text': text,
                'metadata': {
                    'title': metadata.get('/Title', ''),
                    'author': metadata.get('/Author', ''),
                    'total_pages': len(reader.pages)
                }
            })
        
        logger.info(f"Extracted {len(pages)} pages from PDF")
        return pages
        
    except Exception as e:
        logger.error(f"Failed to parse PDF {file_path}: {str(e)}")
        raise</pre>



<p>This function uses PyPDF&#8217;s <code data-enlighter-language="python" class="EnlighterJSRAW">PdfReader</code> to load the PDF. It iterates over each page, calls <code data-enlighter-language="python" class="EnlighterJSRAW">extract_text()</code>, and bundles the text with page metadata. The result is a list of dictionaries, one per page.</p>



<p>Notice we extract metadata like title and author if available. This can be useful for citations or provenance tracking later. We also include the total page count so downstream tasks know if they have partial data.</p>



<p>The error handling re-raises exceptions after logging. This ensures failures propagate to Airflow where they can trigger retries or mark the document as failed.</p>



<p>Here is the chunking code from <code data-enlighter-language="python" class="EnlighterJSRAW">shared/parsing/chunker.py</code>:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Run an Apache Airflow DAG with Docker Compose and PostgreSQL" data-enlighter-group="6">def chunk_text(
    text: str,
    chunk_size: int = 512,
    overlap: int = 50,
    min_chunk_size: int = 100
) -> List[str]:
    """
    Split text into overlapping chunks.
    
    Args:
        text: Input text to chunk
        chunk_size: Target size in words (approximate token proxy)
        overlap: Number of overlapping words between chunks
        min_chunk_size: Minimum chunk size in words (discard smaller)
        
    Returns:
        List of text chunks
    """
    logger.info(f"Chunking text: {len(text)} chars, chunk_size={chunk_size}, overlap={overlap}")
    
    words = text.split()  # Simple whitespace splitting (production would use proper tokenizers)
    chunks = []
    start = 0
    
    while start &lt; len(words):
        end = min(start + chunk_size, len(words))
        chunk_words = words[start:end]
        chunk = ' '.join(chunk_words)
        
        if len(chunk_words) >= min_chunk_size:
            chunks.append(chunk)
        
        start += (chunk_size - overlap)
    
    logger.info(f"Created {len(chunks)} chunks")
    return chunks</pre>



<p>This function uses a sliding window approach — imagine sliding a frame across a long document, capturing a portion of text, then sliding forward while keeping some overlap with the previous frame.</p>



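<p>We split the text into words using whitespace (production systems would use proper tokenizers like tiktoken, but whitespace splitting works as a reasonable approximation). We extract a window of <code data-enlighter-language="python" class="EnlighterJSRAW">chunk_size</code> words, join them back into a string, and add it to the chunks list. Then we slide the window forward by <code data-enlighter-language="python" class="EnlighterJSRAW">chunk_size - overlap</code> words to create the next chunk. Note that <code data-enlighter-language="python" class="EnlighterJSRAW">overlap</code> must be smaller than <code data-enlighter-language="python" class="EnlighterJSRAW">chunk_size</code>; otherwise, the window never advances and the loop cannot terminate.</p>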



<p>The overlap is critical for retrieval quality. If an important sentence or concept spans a chunk boundary, the overlap means it can still appear in full within the next chunk, as long as it is no longer than the overlap itself (50 words by default). Without overlap, you might split a key phrase across two chunks and miss it during semantic search.</p>



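<p>We also have a <code data-enlighter-language="python" class="EnlighterJSRAW">min_chunk_size</code> parameter. Very small chunks usually lack enough context to be semantically meaningful, so we discard any chunk with fewer words than this threshold (100 by default). With the default settings, only the final chunk of a text can fall below the threshold. Keep in mind that a text shorter than <code data-enlighter-language="python" class="EnlighterJSRAW">min_chunk_size</code> words therefore produces no chunks at all.</p>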



<p>Here is the content hashing code from <code data-enlighter-language="python" class="EnlighterJSRAW">shared/utils/hashing.py</code>:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Run an Apache Airflow DAG with Docker Compose and PostgreSQL" data-enlighter-group="8">def hash_content(content: str) -> str:
    """
    Compute SHA-256 hash of text content.
    
    Args:
        content: Text to hash
        
    Returns:
        Hexadecimal hash string
    """
    return hashlib.sha256(content.encode('utf-8')).hexdigest()


def hash_file(file_path: str) -> str:
    """
    Compute SHA-256 hash of file content.
    
    Args:
        file_path: Path to file
        
    Returns:
        Hexadecimal hash string
    """
    hash_obj = hashlib.sha256()
    
    with open(file_path, 'rb') as f:
        for chunk in iter(lambda: f.read(4096), b''):
            hash_obj.update(chunk)
    
    return hash_obj.hexdigest()</pre>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">hash_content()</code> function encodes text as UTF-8 bytes and computes its SHA-256 hash. The <code data-enlighter-language="python" class="EnlighterJSRAW">hash_file()</code> function reads a file in 4KB chunks (to handle large files without loading them entirely into memory) and computes the cumulative hash.</p>



<p>These hashes are what enable deduplication. Two identical chunks will always produce the same hash, so we can check for existence in the database before inserting.</p>



<p><strong>Why Sharing This Logic Matters</strong></p>



<p>By putting parsing and chunking in <code data-enlighter-language="python" class="EnlighterJSRAW">shared/</code>, we ensure consistency across different orchestration systems. When we introduce Argo Workflows in a future lesson, it will use the exact same <code data-enlighter-language="python" class="EnlighterJSRAW">parse_pdf()</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">chunk_text()</code> functions. This means documents processed by Airflow and documents processed by Argo will have identical chunk boundaries, identical hashes, and identical behavior.</p>



<p>This is critical for systems where you might migrate from one orchestrator to another or run hybrid setups.</p>



<p>Now let&#8217;s see how this all runs in Docker.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Docker-Orchestration-Runtime-Airflow-FastAPI"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Docker-Orchestration-Runtime-Airflow-FastAPI">Docker Orchestration and Runtime for Airflow and FastAPI</a></h2>



<p>We have examined the code. Now let&#8217;s see how Docker brings it all together. <strong>Figure 2</strong> shows the container topology.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-scaled.jpeg" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="612" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-1024x612.jpeg?lossy=2&strip=1&webp=1" alt="" class="wp-image-54233" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image.jpeg?size=126x75&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-300x179.jpeg?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image.jpeg?size=378x226&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image.jpeg?size=504x301&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image.jpeg?size=630x377&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-768x459.jpeg?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-1024x612.jpeg?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-scaled.jpeg?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-1536x919.jpeg?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 2:</strong> Docker Compose container topology showing all services, volumes, networks, and inter-container communication paths.</figcaption></figure></div>


<p>Here is the complete <code data-enlighter-language="python" class="EnlighterJSRAW">docker-compose.yml</code>:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="yaml" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Run an Apache Airflow DAG with Docker Compose and PostgreSQL" data-enlighter-group="10">version: '3.8'

x-airflow-common:
  &amp;airflow-common
  image: apache/airflow:2.7.3-python3.11
  environment:
    &amp;airflow-common-env
    AIRFLOW__CORE__EXECUTOR: LocalExecutor
    AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
    AIRFLOW__CORE__FERNET_KEY: ''
    AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'false'
    AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
    AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session'
    PYTHONPATH: /opt/airflow/shared
    ML_ORCHESTRATION_DB_URI: postgresql://mlops:mlops_password@postgres:5432/ml_orchestration
  volumes:
    - ./dags:/opt/airflow/dags
    - ./logs:/opt/airflow/logs
    - ./plugins:/opt/airflow/plugins
    - ../shared:/opt/airflow/shared
    - /tmp/ml_orchestration/uploads:/tmp/ml_orchestration/uploads
  user: "${AIRFLOW_UID:-50000}:0"
  depends_on:
    &amp;airflow-common-depends-on
    postgres:
      condition: service_healthy

services:
  postgres:
    image: postgres:15
    environment:
      POSTGRES_USER: airflow
      POSTGRES_PASSWORD: airflow
      POSTGRES_DB: airflow
    volumes:
      - postgres-db-volume:/var/lib/postgresql/data
      - ./init-db.sh:/docker-entrypoint-initdb.d/init-db.sh
    healthcheck:
      test: ["CMD", "pg_isready", "-U", "airflow"]
      interval: 10s
      retries: 5
      start_period: 5s
    ports:
      - "5432:5432"

  airflow-webserver:
    &lt;&lt;: *airflow-common
    command: webserver
    ports:
      - "8080:8080"
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s
    restart: always

  airflow-scheduler:
    &lt;&lt;: *airflow-common
    command: scheduler
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:8974/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s
    restart: always

  airflow-init:
    &lt;&lt;: *airflow-common
    entrypoint: /bin/bash
    command:
      - -c
      - |
        airflow db init
        airflow users create \
          --username admin \
          --firstname Admin \
          --lastname User \
          --role Admin \
          --email admin@example.com \
          --password admin
    environment:
      &lt;&lt;: *airflow-common-env
      _AIRFLOW_DB_UPGRADE: 'true'
      _AIRFLOW_WWW_USER_CREATE: 'true'
    user: "0:0"

  ingestion-service:
    build:
      context: .
      dockerfile: Dockerfile.service
    ports:
      - "8000:8000"
    environment:
      ML_ORCHESTRATION_DB_URI: postgresql://mlops:mlops_password@postgres:5432/ml_orchestration
      UPLOAD_DIR: /tmp/ml_orchestration/uploads
    volumes:
      - ./ingestion_service:/app/ingestion_service
      - ../shared:/app/shared
      - /tmp/ml_orchestration/uploads:/tmp/ml_orchestration/uploads
    depends_on:
      postgres:
        condition: service_healthy
    restart: always

volumes:
  postgres-db-volume:
  /tmp/ml_orchestration/uploads:</pre>



<p>Let us break down the key parts.</p>



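<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">x-airflow-common</code> block is a Compose extension field (top-level keys beginning with <code data-enlighter-language="python" class="EnlighterJSRAW">x-</code> are ignored by Compose itself), and the <code data-enlighter-language="python" class="EnlighterJSRAW">&amp;airflow-common</code> anchor attached to it names the configuration shared by all Airflow containers. Anchors are a YAML feature that lets us avoid repeating the same settings. The <code data-enlighter-language="python" class="EnlighterJSRAW">&lt;&lt;: *airflow-common</code> syntax merges that configuration into each service.</p>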



<h3 class="wp-block-heading">Environment Variables</h3>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">AIRFLOW__CORE__EXECUTOR: LocalExecutor</code> runs tasks in separate processes on the same machine (unlike <code data-enlighter-language="python" class="EnlighterJSRAW">SequentialExecutor</code> which runs one task at a time, or <code data-enlighter-language="python" class="EnlighterJSRAW">CeleryExecutor</code> which distributes across workers). Perfect for development and moderate workloads.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">AIRFLOW__DATABASE__SQL_ALCHEMY_CONN</code> points to the PostgreSQL container using the service name <code data-enlighter-language="python" class="EnlighterJSRAW">postgres</code> as the hostname — Docker Compose automatically creates a network where services can reach each other by name.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">PYTHONPATH: /opt/airflow/shared</code> adds the shared module to Python&#8217;s import path, allowing the DAG to import from <code data-enlighter-language="python" class="EnlighterJSRAW">shared.storage</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">shared.parsing</code>, etc. Without this, imports would fail.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">ML_ORCHESTRATION_DB_URI</code> connects to our application database (<code data-enlighter-language="python" class="EnlighterJSRAW">ml_orchestration</code>), which is separate from Airflow&#8217;s metadata database.</p>



<h3 class="wp-block-heading">Volume Configuration</h3>



<p>Volumes mount local directories into containers. The <code data-enlighter-language="python" class="EnlighterJSRAW">./dags:/opt/airflow/dags</code> mapping makes your local DAG files immediately visible to Airflow — edit locally, see changes in the container.</p>



<p>The critical volume is <code data-enlighter-language="python" class="EnlighterJSRAW">/tmp/ml_orchestration/uploads:/tmp/ml_orchestration/uploads</code>, which mounts the same directory into both Airflow and the ingestion service. This shared storage lets Airflow read files the ingestion service writes, eliminating the &#8220;file not found&#8221; errors you would get with isolated containers.</p>



<h3 class="wp-block-heading">Service Definitions</h3>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">postgres</code> service runs PostgreSQL 15. It uses a volume (<code data-enlighter-language="python" class="EnlighterJSRAW">postgres-db-volume</code>) for persistent storage. Without this volume, restarting the container would erase all data. The <code data-enlighter-language="python" class="EnlighterJSRAW">init-db.sh</code> script is mounted into <code data-enlighter-language="python" class="EnlighterJSRAW">/docker-entrypoint-initdb.d/</code>, which PostgreSQL runs automatically on first startup. This creates the <code data-enlighter-language="python" class="EnlighterJSRAW">ml_orchestration</code> database and <code data-enlighter-language="python" class="EnlighterJSRAW">mlops</code> user.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">healthcheck</code> tests if PostgreSQL is ready. Other services wait for this health check to pass before starting.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">airflow-webserver</code> service runs the Airflow UI. It inherits all settings from <code data-enlighter-language="python" class="EnlighterJSRAW">airflow-common</code> and adds a <code data-enlighter-language="python" class="EnlighterJSRAW">webserver</code> command. It exposes port <code data-enlighter-language="python" class="EnlighterJSRAW">8080</code>, which you access in your browser.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">airflow-scheduler</code> service runs the scheduler, which triggers DAG runs according to the schedule interval. This is the component that continuously checks whether a DAG run is due and, for our DAG, starts a new run every minute.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">airflow-init</code> service is a one-time initialization container. It runs <code data-enlighter-language="python" class="EnlighterJSRAW">airflow db init</code> to create Airflow&#8217;s metadata tables and <code data-enlighter-language="python" class="EnlighterJSRAW">airflow users create</code> to set up the admin user. After this completes, the container exits. You do not interact with it after the first startup.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">ingestion-service</code> service runs the FastAPI app. It builds from <code data-enlighter-language="python" class="EnlighterJSRAW">Dockerfile.service</code>, which installs dependencies and runs <code data-enlighter-language="python" class="EnlighterJSRAW">uvicorn</code>. It mounts the ingestion service code and shared module. It also mounts the uploads volume so it can write files.</p>



<h3 class="wp-block-heading">Building the Ingestion Service Container</h3>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">Dockerfile.service</code> defines how the FastAPI container is built. Here is the complete file:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="dockerfile" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Run an Apache Airflow DAG with Docker Compose and PostgreSQL" data-enlighter-group="12"># airflow_project/Dockerfile.service
FROM python:3.11-slim

WORKDIR /app

# Copy shared module first
COPY shared/ /app/shared/
RUN pip install --no-cache-dir -r /app/shared/requirements.txt

# Copy service dependencies
COPY airflow_project/ingestion_service/requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r /app/requirements.txt

# Copy service code
COPY airflow_project/ingestion_service/ /app/ingestion_service/

# Set Python path to include shared module
ENV PYTHONPATH="/app:/app/shared"

EXPOSE 8000

CMD ["uvicorn", "ingestion_service.main:app", "--host", "0.0.0.0", "--port", "8000"]</pre>



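<p>This Dockerfile uses a multi-layer approach. First, we copy the shared module and install its dependencies. Because the whole <code data-enlighter-language="python" class="EnlighterJSRAW">shared/</code> directory is copied in this step, the layer is cached and only rebuilt when a file in the shared module changes. Then we install the service-specific dependencies. Finally, we copy the application code.</p>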



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">PYTHONPATH</code> environment variable is critical. It tells Python where to find modules. By including both <code data-enlighter-language="python" class="EnlighterJSRAW">/app</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">/app/shared</code>, we can import from both <code data-enlighter-language="python" class="EnlighterJSRAW">ingestion_service.main</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">shared.storage.models</code> without issues.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">CMD</code> runs <code data-enlighter-language="python" class="EnlighterJSRAW">uvicorn</code>, the Asynchronous Server Gateway Interface (<code data-enlighter-language="python" class="EnlighterJSRAW">ASGI</code>) server for <code data-enlighter-language="python" class="EnlighterJSRAW">FastAPI</code>. The <code data-enlighter-language="python" class="EnlighterJSRAW">--host 0.0.0.0</code> makes the server accessible from outside the container (necessary for Docker port mapping). The <code data-enlighter-language="python" class="EnlighterJSRAW">--port 8000</code> matches the port we expose in <code data-enlighter-language="python" class="EnlighterJSRAW">docker-compose.yml</code>.</p>



<h3 class="wp-block-heading">Database Initialization Script</h3>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">init-db.sh</code> script creates the <code data-enlighter-language="python" class="EnlighterJSRAW">ml_orchestration</code> database and <code data-enlighter-language="python" class="EnlighterJSRAW">mlops</code> user when PostgreSQL starts for the first time. Here is the complete script:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Run an Apache Airflow DAG with Docker Compose and PostgreSQL" data-enlighter-group="14">#!/bin/bash
# airflow_project/init-db.sh
set -e

# Create additional database and user for ML orchestration
# This script is idempotent - safe to run multiple times
psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" --dbname "$POSTGRES_DB" &lt;&lt;-EOSQL
    -- Create database only if it doesn't exist
    SELECT 'CREATE DATABASE ml_orchestration'
    WHERE NOT EXISTS (SELECT FROM pg_database WHERE datname = 'ml_orchestration')\gexec
    
    -- Create user only if it doesn't exist
    DO
    \$\$
    BEGIN
        IF NOT EXISTS (SELECT FROM pg_catalog.pg_user WHERE usename = 'mlops') THEN
            CREATE USER mlops WITH PASSWORD 'mlops_password';
        END IF;
    END
    \$\$;
    
    -- Grant privileges (idempotent operation)
    GRANT ALL PRIVILEGES ON DATABASE ml_orchestration TO mlops;
    
    -- Connect to ml_orchestration and grant schema privileges
    \c ml_orchestration
    GRANT ALL ON SCHEMA public TO mlops;
    ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO mlops;
    ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO mlops;
EOSQL

echo "ML orchestration database setup completed successfully"</pre>



<p>This script demonstrates several important PostgreSQL patterns. The <code data-enlighter-language="python" class="EnlighterJSRAW">set -e</code> command makes the script exit immediately if any command fails, preventing partial setup.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">SELECT 'CREATE DATABASE ...' WHERE NOT EXISTS ... \gexec</code> pattern creates the database only if it does not already exist. This is idempotent — running the script multiple times produces the same result.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">DO $$ ... END $$</code> block is a PL/pgSQL anonymous code block that checks if the user exists before creating it. Again, this makes the script safe to rerun.</p>



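<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">GRANT ALL PRIVILEGES</code> statements give the <code data-enlighter-language="python" class="EnlighterJSRAW">mlops</code> user full access to the <code data-enlighter-language="python" class="EnlighterJSRAW">ml_orchestration</code> database. The <code data-enlighter-language="python" class="EnlighterJSRAW">ALTER DEFAULT PRIVILEGES</code> commands ensure that future tables and sequences created in the <code data-enlighter-language="python" class="EnlighterJSRAW">public</code> schema by the role that runs this script (the <code data-enlighter-language="python" class="EnlighterJSRAW">airflow</code> superuser) are automatically accessible to <code data-enlighter-language="python" class="EnlighterJSRAW">mlops</code>. Objects created by other roles are not covered unless you add a <code data-enlighter-language="python" class="EnlighterJSRAW">FOR ROLE</code> clause. This prevents permission errors when tables are later created through the superuser account rather than by <code data-enlighter-language="python" class="EnlighterJSRAW">mlops</code> itself.</p>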



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">\c ml_orchestration</code> command switches the connection to the newly created database so we can grant schema-level privileges.</p>



<p>This initialization script runs automatically because docker-compose mounts it into <code data-enlighter-language="python" class="EnlighterJSRAW">/docker-entrypoint-initdb.d/</code>. PostgreSQL executes all scripts in that directory on first startup.</p>



<p><strong>Why This Structure Works</strong></p>



<p>This setup gives us isolated, reproducible environments. Everything runs in containers. There is no &#8220;works on my machine&#8221; problem. The same <code data-enlighter-language="python" class="EnlighterJSRAW">docker-compose.yml</code> runs identically on macOS, Linux, and Windows (with Docker Desktop).</p>



<p>The shared uploads volume is the key to inter-service communication. Both Airflow and the ingestion service see the same files. Airflow does not need to call the ingestion API to get files; it just reads from the shared directory.</p>



<p>The dependency management (<code data-enlighter-language="python" class="EnlighterJSRAW">depends_on</code> with health checks) ensures services start in the correct order. PostgreSQL must be ready before Airflow tries to connect.</p>



<p>Now let&#8217;s see how to run this system from scratch.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Running-Apache-Airflow-FastAPI-Document-Ingestion-Pipeline"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Running-Apache-Airflow-FastAPI-Document-Ingestion-Pipeline">Running the Apache Airflow and FastAPI Document Ingestion Pipeline</a></h2>



<p>To start the system, open a terminal, navigate to <code data-enlighter-language="python" class="EnlighterJSRAW">airflow_project/</code>, and run:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Run an Apache Airflow DAG with Docker Compose and PostgreSQL" data-enlighter-group="16">docker compose up --build</pre>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">--build</code> flag ensures Docker rebuilds images if you changed any code. Docker will pull images, build containers, and start all services. You will see logs streaming from all 5 containers.</p>



<p>After about 30 seconds, you should see messages such as the following:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Run an Apache Airflow DAG with Docker Compose and PostgreSQL" data-enlighter-group="18">airflow-init-1         | Admin user admin created
airflow-webserver-1    | Listening at http://0.0.0.0:8080
airflow-scheduler-1    | Starting the scheduler
ingestion-service-1    | INFO:     Uvicorn running on http://0.0.0.0:8000
postgres-1             | database system is ready to accept connections</pre>



<p>This means all services are up and healthy.</p>



<h3 class="wp-block-heading">Verify the Setup</h3>



<p>Open your browser and go to:</p>



<ul class="wp-block-list">
<li><strong>Airflow UI:</strong> <code data-enlighter-language="python" class="EnlighterJSRAW">http://localhost:8080</code></li>



<li><strong>Ingestion API Docs:</strong> <code data-enlighter-language="python" class="EnlighterJSRAW">http://localhost:8000/docs</code></li>
</ul>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-36-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="254" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-36-1024x254.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-54157" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-36.png?size=126x31&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-36-300x74.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-36.png?size=378x94&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-36.png?size=504x125&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-36.png?size=630x156&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-36-768x190.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-36-1024x254.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-36-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-36-1536x380.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 3:</strong> The Airflow login screen. Use username <code>admin</code> and password <code>admin</code>.</figcaption></figure></div>


<p>Log into Airflow with username <code data-enlighter-language="python" class="EnlighterJSRAW">admin</code> and password <code data-enlighter-language="python" class="EnlighterJSRAW">admin</code>. You should see the DAGs page.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-37-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="207" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-37-1024x207.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-54159" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-37.png?size=126x25&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-37-300x61.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-37.png?size=378x76&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-37.png?size=504x102&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-37.png?size=630x127&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-37-768x155.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-37-1024x207.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-37-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-37-1536x311.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 4:</strong> The Airflow DAGs page showing <code>ingest_documents_dag</code> enabled and successfully scheduled every minute.</figcaption></figure></div>


<p>Find the <code data-enlighter-language="python" class="EnlighterJSRAW">ingest_documents_dag</code> row. Make sure the toggle on the left is enabled (blue or green). This allows the scheduler to run the DAG.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-38-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="437" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-38-1024x437.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-54161" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-38.png?size=126x54&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-38-300x128.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-38.png?size=378x161&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-38.png?size=504x215&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-38.png?size=630x269&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-38-768x327.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-38-1024x437.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-38-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-38-1536x655.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 5:</strong> The FastAPI Swagger documentation at <code>/docs</code> showing the POST <code>/documents</code> endpoint.</figcaption></figure></div>


<p>Go to <code data-enlighter-language="python" class="EnlighterJSRAW">http://localhost:8000/docs</code>. This is the interactive API documentation powered by Swagger. You should see a green POST box labeled <code data-enlighter-language="python" class="EnlighterJSRAW">/documents</code>.</p>



<p>If both URLs load successfully, your system is running.</p>



<h3 class="wp-block-heading">Stopping the System</h3>



<p>To stop all containers, press <code data-enlighter-language="python" class="EnlighterJSRAW">Ctrl+C</code> in the terminal where <code data-enlighter-language="python" class="EnlighterJSRAW">docker compose up</code> is running. Then run:</p>



<p><code data-enlighter-language="python" class="EnlighterJSRAW">docker compose down</code></p>



<p>This stops and removes containers but preserves volumes (your data persists). If you want to completely reset everything (delete all data), run:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Run an Apache Airflow DAG with Docker Compose and PostgreSQL" data-enlighter-group="20">docker compose down -v</pre>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">-v</code> flag removes volumes, giving you a fresh start.</p>



<p>Now let us upload a document and trigger the pipeline.</p>



<h3 class="wp-block-heading">Uploading Documents via the API</h3>



<p>The ingestion service exposes a single endpoint: <code data-enlighter-language="python" class="EnlighterJSRAW">POST /documents</code>. You can call it with curl or use the Swagger UI for convenience. Let us use Swagger for this demo.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-39-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="932" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-39-1024x932.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-54167" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-39.png?size=126x115&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-39-300x273.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-39.png?size=378x344&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-39.png?size=504x459&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-39.png?size=630x573&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-39-768x699.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-39-1024x932.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-39-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-39-1536x1398.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 6:</strong> The Swagger UI with the <code>POST /documents</code> endpoint expanded, showing the <code>"Try it out"</code> button and file upload field.</figcaption></figure></div>


<p>In the Swagger UI (<code>http://localhost:8000/docs</code>), click the green <code data-enlighter-language="python" class="EnlighterJSRAW">POST /documents</code> box to expand it. Click <code>"Try it out"</code> in the top right corner. You should see a file upload field.</p>



<p>Click <code>"Choose File"</code> and select any PDF from your computer. For testing, use a small PDF (a few pages). Click <code>"Execute"</code> to send the request.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-40-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="836" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-40-1024x836.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-54172" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-40.png?size=126x103&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-40-300x245.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-40.png?size=378x309&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-40.png?size=504x411&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-40.png?size=630x514&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-40-768x627.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-40-1024x836.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-40-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-40-1536x1253.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 7:</strong> The Swagger UI response showing status <code>201 Created</code> with the returned document <code>JSON</code>, including <code>id</code>, <code>filename</code>, <code>status: PENDING</code>, and <code>content_hash</code>.</figcaption></figure></div>


<p>You should receive a <code>201 Created</code> response with <code>JSON</code> similar to the following:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="json" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Run an Apache Airflow DAG with Docker Compose and PostgreSQL" data-enlighter-group="22">{
  "id": 1,
  "filename": "sample.pdf",
  "file_path": "/tmp/ml_orchestration/uploads/1769421678.801241_sample.pdf",
  "content_hash": "a7f3b2c8d9e1f2a3b4c5d6e7f8a9b0c1d2e3f4a5b6c7d8e9f0a1b2c3d4e5f6a7",
  "file_size": 23456,
  "mime_type": "application/pdf",
  "status": "PENDING",
  "created_at": "2026-01-25T10:30:00",
  "updated_at": "2026-01-25T10:30:00"
}</pre>



<p>The important fields are <code data-enlighter-language="python" class="EnlighterJSRAW">id</code> (<code data-enlighter-language="python" class="EnlighterJSRAW">1</code> in this case) and <code data-enlighter-language="python" class="EnlighterJSRAW">status</code> (<code data-enlighter-language="python" class="EnlighterJSRAW">PENDING</code>). This document is now in the database waiting for Airflow to process it.</p>



<p>If you upload the same file again, you will get a <code data-enlighter-language="python" class="EnlighterJSRAW">409 Conflict</code> error:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="json" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Run an Apache Airflow DAG with Docker Compose and PostgreSQL" data-enlighter-group="24">{
  "detail": "Document already exists with ID 1"
}</pre>



<p>This is the deduplication logic working. The system detected the identical content hash and rejected the duplicate.</p>



<p><strong>Using curl</strong></p>



<p>If you prefer the command line, use <code data-enlighter-language="python" class="EnlighterJSRAW">curl</code>:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Run an Apache Airflow DAG with Docker Compose and PostgreSQL" data-enlighter-group="26">curl -X POST "http://localhost:8000/documents" \
  -H "Content-Type: multipart/form-data" \
  -F "file=@/path/to/your/document.pdf"
</pre>



<p>Replace <code data-enlighter-language="python" class="EnlighterJSRAW">/path/to/your/document.pdf</code> with the actual path to your PDF.</p>



<p>Now that we have a pending document, let&#8217;s see Airflow process it.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Triggering-Monitoring-Apache-Airflow-DAGs"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Triggering-Monitoring-Apache-Airflow-DAGs">Triggering and Monitoring Apache Airflow DAGs</a></h2>



<p>The DAG runs automatically every minute. If you just uploaded a document, wait up to 60 seconds and the scheduler will trigger a run.</p>



<h3 class="wp-block-heading">Manual Trigger (Optional)</h3>



<p>If you do not want to wait, you can manually trigger the DAG. In the Airflow UI, click on <code data-enlighter-language="python" class="EnlighterJSRAW">ingest_documents_dag</code> to open the DAG detail page.</p>



<p>You should see a graph view with your five tasks connected by arrows. On the top right, click the play button (triangle icon) and select &#8220;Trigger DAG&#8221;. This starts an immediate run.</p>



<h3 class="wp-block-heading">Monitoring Execution</h3>



<p>The DAG runs page shows all executions. Click on the latest run (top of the list). The status will progress from &#8220;running&#8221; (yellow) to &#8220;success&#8221; (green) or &#8220;failed&#8221; (red).</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-41-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="422" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-41-1024x422.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-54182" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-41.png?size=126x52&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-41-300x123.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-41.png?size=378x156&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-41.png?size=504x208&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-41.png?size=630x260&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-41-768x316.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-41-1024x422.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-41-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-41-1536x632.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 8:</strong> A successful DAG run showing all 5 tasks with green checkmarks.</figcaption></figure></div>


<p>Each task should turn green as it completes. The entire run takes about 10-30 seconds, depending on your <code>PDF</code> size.</p>



<h3 class="wp-block-heading">Viewing Logs</h3>



<p>Click on any task box (e.g., <code data-enlighter-language="python" class="EnlighterJSRAW">parse_documents</code>). A panel appears on the right. Click &#8220;Log&#8221; to see the task output.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-42-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="401" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-42-1024x401.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-54185" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-42.png?size=126x49&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-42-300x117.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-42.png?size=378x148&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-42.png?size=504x197&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-42.png?size=630x247&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-42-768x301.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-42-1024x401.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-42-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-42-1536x602.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 9:</strong> Task log for <code>parse_documents</code>, showing PDF parsing, page extraction warnings, and successful completion.</figcaption></figure></div>


<p>You should see log messages similar to the following:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Run an Apache Airflow DAG with Docker Compose and PostgreSQL" data-enlighter-group="27">[2026-01-25, 10:31:00 UTC] {ingest_documents_dag.py:234} INFO - Parsing documents...
[2026-01-25, 10:31:00 UTC] {pdf_parser.py:15} INFO - Parsing PDF: /tmp/ml_orchestration/uploads/1769421678.801241_sample.pdf
[2026-01-25, 10:31:01 UTC] {pdf_parser.py:35} INFO - Extracted 5 pages from PDF
[2026-01-25, 10:31:01 UTC] {ingest_documents_dag.py:250} INFO - Parsed 5 pages from sample.pdf
</pre>



<p>These logs confirm the DAG is working. If a task fails, the log will show the exception and stack trace.</p>



<p>Now let us verify the data was written to the database.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Verifying-Document-Chunk-Data-PostgreSQL"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Verifying-Document-Chunk-Data-PostgreSQL">Verifying Document and Chunk Data in PostgreSQL</a></h2>



<p>The DAG writes to 3 tables: <code data-enlighter-language="python" class="EnlighterJSRAW">documents</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">chunks</code>, and <code data-enlighter-language="python" class="EnlighterJSRAW">pipeline_runs</code>. Let us query them to confirm everything worked.</p>



<h3 class="wp-block-heading">Connect to PostgreSQL</h3>



<p>Open a new terminal and run:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Run an Apache Airflow DAG with Docker Compose and PostgreSQL" data-enlighter-group="28">docker exec -it airflow_project-postgres-1 psql -U mlops -d ml_orchestration
</pre>



<p>This opens a PostgreSQL shell inside the <code data-enlighter-language="python" class="EnlighterJSRAW">postgres</code> container. You are now connected as the <code data-enlighter-language="python" class="EnlighterJSRAW">mlops</code> user to the <code data-enlighter-language="python" class="EnlighterJSRAW">ml_orchestration</code> database.</p>



<h3 class="wp-block-heading">Query Documents</h3>



<pre class="EnlighterJSRAW" data-enlighter-language="sql" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Run an Apache Airflow DAG with Docker Compose and PostgreSQL" data-enlighter-group="29">SELECT id, filename, status, created_at FROM documents;
</pre>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-43-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="92" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-43-1024x92.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-54189" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-43.png?size=126x11&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-43-300x27.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-43.png?size=378x34&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-43.png?size=504x45&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-43.png?size=630x57&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-43-768x69.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-43-1024x92.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-43-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-43-1536x138.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 10:</strong> PostgreSQL query results showing the uploaded document with status <code>COMPLETED</code>.</figcaption></figure></div>


<p>You should see your document with <code data-enlighter-language="python" class="EnlighterJSRAW">status = COMPLETED</code>.</p>



<h3 class="wp-block-heading">Query Chunks</h3>



<pre class="EnlighterJSRAW" data-enlighter-language="sql" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Run an Apache Airflow DAG with Docker Compose and PostgreSQL" data-enlighter-group="30">SELECT id, document_id, chunk_index, LENGTH(text) as text_length FROM chunks LIMIT 10;
</pre>



<p>You should see rows similar to the following:</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-44-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="272" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-44-1024x272.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-54193" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-44.png?size=126x33&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-44-300x80.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-44.png?size=378x100&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-44.png?size=504x134&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-44.png?size=630x167&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-44-768x204.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-44-1024x272.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-44-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-44-1536x407.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 11:</strong> Query results from the <code>chunks</code> table showing generated text chunks with their <code>document_id</code>, <code>chunk_index</code>, and <code>text_length</code>.</figcaption></figure></div>


<p>Each row represents a chunk extracted from the document. The <code data-enlighter-language="python" class="EnlighterJSRAW">chunk_index</code> indicates the order of chunks within the document. The <code data-enlighter-language="python" class="EnlighterJSRAW">text_length</code> column shows the character length of each chunk. Since chunking is performed using a target word count (e.g., <code data-enlighter-language="python" class="EnlighterJSRAW">512</code> words with overlap), the character length varies depending on word size and content structure.</p>



<h3 class="wp-block-heading">Query Pipeline Runs</h3>



<pre class="EnlighterJSRAW" data-enlighter-language="sql" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Run an Apache Airflow DAG with Docker Compose and PostgreSQL" data-enlighter-group="31">SELECT run_id, status, documents_processed, chunks_created FROM pipeline_runs;
</pre>



<p>You should see:</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-45-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="173" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-45-1024x173.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-54199" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-45.png?size=126x21&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-45-300x51.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-45.png?size=378x64&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-45.png?size=504x85&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-45.png?size=630x106&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-45-768x130.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-45-1024x173.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-45-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-45-1536x260.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 12:</strong> Query results from the <code>pipeline_runs</code> table showing DAG run IDs, status, documents processed, and total chunks created.</figcaption></figure></div>


<p>This shows one DAG run processed one document and created <code data-enlighter-language="python" class="EnlighterJSRAW">23</code> chunks. The <code data-enlighter-language="python" class="EnlighterJSRAW">run_id</code> includes the execution timestamp.</p>



<h3 class="wp-block-heading">Exit PostgreSQL</h3>



<p>Type <code data-enlighter-language="python" class="EnlighterJSRAW">\q</code> and press Enter to exit the PostgreSQL shell.</p>



<p>You have now verified end-to-end functionality. A document was uploaded, parsed, chunked, validated, and marked complete. The data is in the database and ready for the next stage (embeddings).</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Error-Handling-Failure-Recovery-Airflow-Pipeline"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Error-Handling-Failure-Recovery-Airflow-Pipeline">Error Handling and Failure Recovery in Airflow Pipeline</a></h2>



<p>Let us walk through a real failure scenario to understand how the system handles errors.</p>



<p><strong>Scenario: A user uploads a corrupted PDF</strong></p>



<ul class="wp-block-list">
<li><strong>Upload Phase:</strong> The ingestion service receives the file, computes its hash, and saves it to <code data-enlighter-language="python" class="EnlighterJSRAW">/tmp/ml_orchestration/uploads/</code>. The file is inserted into the database with <code data-enlighter-language="python" class="EnlighterJSRAW">status=PENDING</code>. The API returns <code data-enlighter-language="python" class="EnlighterJSRAW">201 Created</code>. Everything looks normal.</li>



<li><strong>Processing Phase:</strong> One minute later, the Airflow scheduler triggers the DAG. The <code data-enlighter-language="python" class="EnlighterJSRAW">fetch_documents</code> task finds the corrupted PDF in the <code data-enlighter-language="python" class="EnlighterJSRAW">PENDING</code> state and adds its ID to the processing list.</li>



<li><strong>Parse Failure:</strong> The <code data-enlighter-language="python" class="EnlighterJSRAW">parse_documents</code> task retrieves the file path and calls <code data-enlighter-language="python" class="EnlighterJSRAW">parse_pdf()</code>. PyPDF attempts to read the file, encounters corrupted data, and raises a <code data-enlighter-language="python" class="EnlighterJSRAW">PdfReadError</code>. The exception is caught in the try-except block.</li>



<li><strong>Error Handling:</strong> Instead of letting the exception crash the entire task, we log the error, update the document&#8217;s status to <code data-enlighter-language="python" class="EnlighterJSRAW">FAILED</code>, commit the transaction, and continue processing the next document in the batch. Other valid PDFs in the same batch proceed normally.</li>



<li><strong>Observability:</strong> The task completes successfully (from Airflow&#8217;s perspective), but logs show which specific document failed and why. The failed document remains in the database with <code data-enlighter-language="python" class="EnlighterJSRAW">status=FAILED</code> and no chunks. You can query for failed documents and investigate.</li>



<li><strong>Recovery:</strong> Once you fix the corrupted PDF (or replace it), you can manually reset its status back to <code data-enlighter-language="python" class="EnlighterJSRAW">PENDING</code>. The next DAG run will pick it up and try again.</li>
</ul>



<p>This granular failure handling is why production systems use orchestrators. One bad file does not bring down the entire pipeline. You get observability into what failed, why it failed, and can fix issues incrementally.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Design-Principles-Idempotency-Observability-Reproducibility-Airflow-Pipelines"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Design-Principles-Idempotency-Observability-Reproducibility-Airflow-Pipelines">Design Principles: Idempotency, Observability, and Reproducibility in Airflow Pipelines</a></h2>



<p>This architecture demonstrates several important principles for production ML systems.</p>



<h3 class="wp-block-heading">Separation of Concerns</h3>



<p>We separated ingestion from processing. The FastAPI service accepts uploads quickly and returns. It does not block while parsing PDFs or chunking text. Users get fast feedback. Heavy processing happens asynchronously in Airflow.</p>



<p>This separation also means we can scale components independently. If upload traffic spikes, we can run more ingestion service replicas. If processing lags, we can add more Airflow workers.</p>



<h3 class="wp-block-heading">Idempotency</h3>



<p>Every task in the DAG is idempotent. You can rerun the DAG multiple times on the same documents without creating duplicates or corrupting data. This is essential for retries and recovery.</p>



<p>We achieve idempotency through:</p>



<ul class="wp-block-list">
<li><strong>Content hashing:</strong> Documents with the same content hash are recognized as duplicates.</li>



<li><strong>Existence checks:</strong> Before creating chunks, we check if they already exist.</li>



<li><strong>File-based XCom:</strong> Parsing results are written to files. If a task reruns, it checks for existing files first.</li>



<li><strong>Status tracking:</strong> Documents have a <code data-enlighter-language="python" class="EnlighterJSRAW">status</code> field that prevents reprocessing completed items.</li>
</ul>



<h3 class="wp-block-heading">Observability</h3>



<p>Every operation logs its progress. You can see exactly what the system is doing at any moment. Task logs show which documents were parsed, how many chunks were created, and which operations failed.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">pipeline_runs</code> table provides metrics over time. You can query it to answer questions like &#8220;How many documents did we process last week?&#8221; or &#8220;What is our average chunk count per document?&#8221;</p>



<h3 class="wp-block-heading">Reproducibility</h3>



<p>The entire system is defined in code and configuration files. There are no manual setup steps beyond <code data-enlighter-language="python" class="EnlighterJSRAW">docker compose up</code>. Anyone can clone the repository and run the same system.</p>



<p>The shared module ensures parsing and chunking logic is consistent. Documents processed today will have the same chunk boundaries as documents processed next month.</p>



<h3 class="wp-block-heading">Failure Handling</h3>



<p>The DAG does not abort if one document fails. It marks that document as <code data-enlighter-language="python" class="EnlighterJSRAW">FAILED</code> and continues with others. This prevents one corrupted PDF from blocking an entire batch.</p>



<p>Airflow&#8217;s retry mechanism automatically retries failed tasks. If a transient error (network timeout, database deadlock) causes a failure, the task will retry up to 3 times before giving up.</p>



<h3 class="wp-block-heading">Data Provenance</h3>



<p>We track where every chunk came from. The <code data-enlighter-language="python" class="EnlighterJSRAW">chunk.document_id</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">chunk.chunk_index</code> fields let us trace any chunk back to its source document and position. The <code data-enlighter-language="python" class="EnlighterJSRAW">pipeline_runs</code> table records when and how documents were processed.</p>



<p>This is critical for debugging, auditing, and explaining model behavior. If your RAG system returns an incorrect answer, you can trace it back to the source chunk, the document it came from, and the DAG run that processed it.</p>



<p>Now let&#8217;s discuss the limits of this approach.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Limits-Apache-Airflow-Machine-Learning-Workloads"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Limits-Apache-Airflow-Machine-Learning-Workloads">Limits of Apache Airflow for Machine Learning Workloads</a></h2>



<p>Airflow excels at orchestrating data pipelines, but it has limits for ML workloads.</p>



<h3 class="wp-block-heading">No GPU Support</h3>



<p>Airflow tasks run in Python processes on CPU. If you want to generate embeddings using a transformer model, you need GPU acceleration. Airflow does not provide native GPU scheduling.</p>



<p>You could work around this by calling an external service (like a separate embedding API), but that adds complexity and latency.</p>



<h3 class="wp-block-heading">Heavy Computation</h3>



<p>Embedding generation for thousands of chunks can take minutes or hours. Airflow is not designed for long-running, compute-intensive tasks. Tasks that run for hours can clog the scheduler and make the UI sluggish.</p>



<h3 class="wp-block-heading">Parallelism</h3>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">LocalExecutor</code> runs tasks in parallel on a single machine. If you have 100 documents to process, Airflow will process them sequentially or in small batches limited by your CPU cores.</p>



<p>For true parallelism across multiple machines, you need the <code data-enlighter-language="python" class="EnlighterJSRAW">CeleryExecutor</code> or <code data-enlighter-language="python" class="EnlighterJSRAW">KubernetesExecutor</code>, which adds complexity.</p>



<h3 class="wp-block-heading">Kubernetes Native Workloads</h3>



<p>If you are running on Kubernetes, Airflow is an additional system to manage. It needs its own deployment, scheduler, database, and monitoring.</p>



<p>Argo Workflows, which we will cover in the next lesson, is Kubernetes-native. It runs as Kubernetes Jobs and integrates directly with Kubernetes features like resource limits, autoscaling, and GPU scheduling.</p>



<h3 class="wp-block-heading">When to Use Apache Airflow</h3>



<p>Airflow is perfect for:</p>



<ul class="wp-block-list">
<li>Data ingestion and ETL (Extract, Transform, Load)</li>



<li>Scheduled batch processing</li>



<li>Orchestrating API calls and database operations</li>



<li>Tasks that run on CPU and do not require GPU</li>



<li>Workflows with moderate parallelism</li>
</ul>



<p>Airflow is not ideal for:</p>



<ul class="wp-block-list">
<li>GPU-accelerated ML training or inference</li>



<li>Massive parallel processing (thousands of tasks)</li>



<li>Real-time or low-latency workloads</li>



<li>Kubernetes-native deployments where you want to avoid managing a separate orchestrator</li>
</ul>



<p>For our RAG pipeline, Airflow handles document ingestion perfectly. It parses PDFs, chunks text, and validates quality. But for the next stage (generating embeddings), we will switch to Argo Workflows, which can run GPU-accelerated tasks on Kubernetes.</p>


<div class="wp-block-image">
<figure class="aligncenter size-full"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-46.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="624" height="273" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-46.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-54203" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-46.png?size=126x55&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-46-300x131.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-46.png?size=378x165&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-46.png?size=504x221&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-46.png?lossy=2&amp;strip=1&amp;webp=1 624w" sizes="(max-width: 624px) 100vw, 624px" /></a><figcaption class="wp-element-caption"><strong>Figure 13:</strong> A comparison chart showing <code>Airflow</code> strengths (left column: ETL, scheduled jobs, API orchestration) and <code>Argo</code> strengths (right column: GPU tasks, massive parallelism, Kubernetes-native, DAGs with complex dependencies).</figcaption></figure></div>


<hr class="wp-block-separator has-alpha-channel-opacity"/>



<div id="pitch" style="padding: 40px; width: 100%; background-color: #F4F6FA;">
	<h3>What's next? We recommend <a target="_blank" href="https://pyimagesearch.com/pyimagesearch-university/?utm_source=blogPost&utm_medium=bottomBanner&utm_campaign=What%27s%20next%3F%20I%20recommend">PyImageSearch University</a>.</h3>

	<script src="https://fast.wistia.com/embed/medias/kno0cmko2z.jsonp" async></script><script src="https://fast.wistia.com/assets/external/E-v1.js" async></script><div class="wistia_responsive_padding" style="padding:56.25% 0 0 0;position:relative;"><div class="wistia_responsive_wrapper" style="height:100%;left:0;position:absolute;top:0;width:100%;"><div class="wistia_embed wistia_async_kno0cmko2z videoFoam=true" style="height:100%;position:relative;width:100%"><div class="wistia_swatch" style="height:100%;left:0;opacity:0;overflow:hidden;position:absolute;top:0;transition:opacity 200ms;width:100%;"><img decoding="async" src="https://fast.wistia.com/embed/medias/kno0cmko2z/swatch" style="filter:blur(5px);height:100%;object-fit:contain;width:100%;" alt="" aria-hidden="true" onload="this.parentNode.style.opacity=1;" /></div></div></div></div>

	<div style="margin-top: 32px; margin-bottom: 32px; ">
		<strong>Course information:</strong><br/>
		86+ total classes • 115+ hours of on-demand code walkthrough videos • Last updated: June 2026<br/>
		<span style="color: #169FE6;">★★★★★</span> 4.84 (128 Ratings) • 16,000+ Students Enrolled
	</div>

	<p><strong>I strongly believe that if you had the right teacher you could <em>master</em> computer vision and deep learning.</strong></p>

	<p>Do you think learning computer vision and deep learning has to be time-consuming, overwhelming, and complicated? Or has to involve complex mathematics and equations? Or requires a degree in computer science?</p>

	<p>That’s <em>not</em> the case.</p>

	<p>All you need to master computer vision and deep learning is for someone to explain things to you in <em>simple, intuitive</em> terms. <em>And that’s exactly what I do</em>. My mission is to change education and how complex Artificial Intelligence topics are taught.</p>

	<p>If you're serious about learning computer vision, your next stop should be PyImageSearch University, the most comprehensive computer vision, deep learning, and OpenCV course online today. Here you’ll learn how to <em>successfully</em> and <em>confidently</em> apply computer vision to your work, research, and projects. Join me in computer vision mastery.</p>

	<p><strong>Inside PyImageSearch University you'll find:</strong></p>

	<ul style="margin-left: 0px;">
		<li style="list-style: none;">&check; <strong>86+ courses</strong> on essential computer vision, deep learning, and OpenCV topics</li>
		<li style="list-style: none;">&check; <strong>86 Certificates</strong> of Completion</li>
		<li style="list-style: none;">&check; <strong>115+ hours hours</strong> of on-demand video</li>
		<li style="list-style: none;">&check; <strong>Brand new courses released <em>regularly</em></strong>, ensuring you can keep up with state-of-the-art techniques</li>
		<li style="list-style: none;">&check; <strong>Pre-configured Jupyter Notebooks in Google Colab</strong></li>
		<li style="list-style: none;">&check; Run all code examples in your web browser — works on Windows, macOS, and Linux (no dev environment configuration required!)</li>
		<li style="list-style: none;">&check; Access to <strong>centralized code repos for <em>all</em> 540+ tutorials</strong> on PyImageSearch</li>
		<li style="list-style: none;">&check; <strong> Easy one-click downloads</strong> for code, datasets, pre-trained models, etc.</li>
		<li style="list-style: none;">&check; <strong>Access</strong> on mobile, laptop, desktop, etc.</li>
	</ul>

	<p style="text-align: center;">
		<a target="_blank" class="button link" href="https://pyimagesearch.com/pyimagesearch-university/?utm_source=blogPost&utm_medium=bottomBanner&utm_campaign=What%27s%20next%3F%20I%20recommend" style="background-color: #6DC713; border-bottom: none;">Click here to join PyImageSearch University</a>
	</p>
</div>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Summary"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Summary">Summary</a></h2>



<p>In this lesson, you operationalized the document ingestion architecture built in Lesson 1. You ran the complete system inside Docker, configured environment variables and shared volumes, and verified inter-container communication between FastAPI, Airflow, and PostgreSQL.</p>



<p>You uploaded real PDF documents through the API, triggered DAG executions, monitored task progress in the Airflow UI, inspected logs, and validated results directly in PostgreSQL. You confirmed that documents moved from <code data-enlighter-language="python" class="EnlighterJSRAW">PENDING</code> to <code data-enlighter-language="python" class="EnlighterJSRAW">COMPLETED</code>, chunks were created correctly, and pipeline runs were recorded for observability and auditing.</p>



<p>You also explored real-world failure scenarios, including corrupted PDFs and partial processing failures, and saw how the system isolates errors without blocking the entire pipeline. Through this, you reinforced key production principles: separation of concerns, idempotency, observability, reproducibility, and data provenance.</p>



<p>Finally, you examined the practical limits of Apache Airflow for ML workloads and identified where GPU-based execution and Kubernetes-native systems may be more appropriate.</p>



<p>You now have a fully runnable, production-style ingestion system that you can extend with embedding generation, vector databases, and Retrieval-Augmented Generation workflows.</p>



<h3 class="wp-block-heading">Citation Information</h3>



<p><strong>Singh, V</strong><strong>. </strong>“Run an Apache Airflow DAG with Docker Compose and PostgreSQL,” <em>PyImageSearch</em>, S. Huot, A. Sharma, and P. Thakur, eds., 2026, <a href="https://pyimg.co/kxc7e" target="_blank" rel="noreferrer noopener">https://pyimg.co/kxc7e</a> </p>



<pre class="EnlighterJSRAW" data-enlighter-language="raw" data-enlighter-theme="classic" data-enlighter-highlight="" data-enlighter-linenumbers="false" data-enlighter-lineoffset="" data-enlighter-title="Run an Apache Airflow DAG with Docker Compose and PostgreSQL" data-enlighter-group="32">@incollection{Singh_2026_run-apache-airflow-dag-docker-compose-postgresql,
  author = {Vikram Singh},
  title = {{Run an Apache Airflow DAG with Docker Compose and PostgreSQL}},
  booktitle = {PyImageSearch},
  editor = {Susan Huot and Aditya Sharma and Piyush Thakur},
  year = {2026},
  url = {https://pyimg.co/kxc7e},
}
</pre>



<hr class="wp-block-separator has-alpha-channel-opacity"/>



<p><strong>To download the source code to this post (and be notified when future tutorials are published here on PyImageSearch), </strong><em><strong>simply enter your email address in the form below!</strong></em></p>



<div id="download-the-code" class="post-cta-wrap">
<div class="gpd-post-cta">
	<div class="gpd-post-cta-content">
		

			<div class="gpd-post-cta-top">
				<div class="gpd-post-cta-top-image"><img decoding="async" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?lossy=2&strip=1&webp=1" alt="" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?lossy=2&strip=1&webp=1 410w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?size=126x174&lossy=2&strip=1&webp=1 126w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?size=252x348&lossy=2&strip=1&webp=1 252w" sizes="(max-width: 410px) 100vw, 410px" /></div>
				
				<div class="gpd-post-cta-top-title"><h4>Download the Source Code and FREE 17-page Resource Guide</h4></div>
				<div class="gpd-post-cta-top-desc"><p>Enter your email address below to get a .zip of the code and a <strong>FREE 17-page Resource Guide on Computer Vision, OpenCV, and Deep Learning.</strong> Inside you'll find my hand-picked tutorials, books, courses, and libraries to help you master CV and DL!</p></div>


			</div>

			<div class="gpd-post-cta-bottom">
				<form id="footer-cta-code" class="footer-cta" action="https://www.getdrip.com/forms/4130035/submissions" method="post" target="blank" data-drip-embedded-form="4130035">
					<input name="fields[email]" type="email" value="" placeholder="Your email address" class="form-control" />

					<button type="submit">Download the code!</button>

					<div style="display: none;" aria-hidden="true"><label for="website">Website</label><br /><input type="text" id="website" name="website" tabindex="-1" autocomplete="false" value="" /></div>
				</form>
			</div>


		
	</div>

</div>
</div>



<p></p>
<p>The post <a rel="nofollow" href="https://pyimagesearch.com/2026/06/08/run-an-apache-airflow-dag-with-docker-compose-and-postgresql/">Run an Apache Airflow DAG with Docker Compose and PostgreSQL</a> appeared first on <a rel="nofollow" href="https://pyimagesearch.com">PyImageSearch</a>.</p>
]]></content:encoded>
					
		
		
			</item>
		<item>
		<title>Apache Airflow Document Ingestion Pipeline for RAG Systems</title>
		<link>https://pyimagesearch.com/2026/06/01/apache-airflow-document-ingestion-pipeline-for-rag-systems/</link>
		
		<dc:creator><![CDATA[Vikram Singh]]></dc:creator>
		<pubDate>Mon, 01 Jun 2026 12:45:00 +0000</pubDate>
				<category><![CDATA[Apache Airflow]]></category>
		<category><![CDATA[FastAPI]]></category>
		<category><![CDATA[MLOps]]></category>
		<category><![CDATA[Retrieval-Augmented Generation (RAG)]]></category>
		<category><![CDATA[Tutorial]]></category>
		<category><![CDATA[airflow dag]]></category>
		<category><![CDATA[airflow pipeline]]></category>
		<category><![CDATA[airflow tutorial]]></category>
		<category><![CDATA[apache airflow]]></category>
		<category><![CDATA[chunking]]></category>
		<category><![CDATA[data engineering]]></category>
		<category><![CDATA[data pipeline]]></category>
		<category><![CDATA[document ingestion]]></category>
		<category><![CDATA[document processing]]></category>
		<category><![CDATA[fastapi]]></category>
		<category><![CDATA[fastapi tutorial]]></category>
		<category><![CDATA[idempotency]]></category>
		<category><![CDATA[machine learning operations]]></category>
		<category><![CDATA[mlops]]></category>
		<category><![CDATA[orchestration]]></category>
		<category><![CDATA[pdf processing]]></category>
		<category><![CDATA[postgresql]]></category>
		<category><![CDATA[rag]]></category>
		<category><![CDATA[retrieval augmented generation]]></category>
		<category><![CDATA[tutorial]]></category>
		<category><![CDATA[workflow orchestration]]></category>
		<guid isPermaLink="false">https://pyimagesearch.com/?p=54017</guid>

					<description><![CDATA[<p>Table of Contents Apache Airflow Document Ingestion Pipeline for RAG Systems Introduction to Production-Grade Document Ingestion Pipelines Why Airflow Instead of Cron Jobs or Celery? Apache Airflow Document Ingestion Pipeline Architecture Component 1: FastAPI Ingestion Service Component 2: Apache Airflow&#8230;</p>
<p>The post <a rel="nofollow" href="https://pyimagesearch.com/2026/06/01/apache-airflow-document-ingestion-pipeline-for-rag-systems/">Apache Airflow Document Ingestion Pipeline for RAG Systems</a> appeared first on <a rel="nofollow" href="https://pyimagesearch.com">PyImageSearch</a>.</p>
]]></description>
										<content:encoded><![CDATA[
<hr class="wp-block-separator has-alpha-channel-opacity" id="TOC"/>


<div class="yoast-breadcrumbs"><span><span><a href="https://pyimagesearch.com/">Home</a></span></span></div>


<div class="toc">
<hr class="TOC"/>
<p class="has-large-font-size"><strong>Table of Contents</strong></p>
<ul>
    <li id="TOC-h1-Apache-Airflow-Document-Ingestion-Pipeline-RAG-Systems"><a rel="noopener" target="_blank" href="#h1-Apache-Airflow-Document-Ingestion-Pipeline-RAG-Systems">Apache Airflow Document Ingestion Pipeline for RAG Systems</a></li>

    <li id="TOC-h2-Introduction-Production-Grade-Document-Ingestion-Pipelines"><a rel="noopener" target="_blank" href="#h2-Introduction-Production-Grade-Document-Ingestion-Pipelines">Introduction to Production-Grade Document Ingestion Pipelines</a></li>
    <ul>
        <li id="TOC-h3-Why-Airflow-Instead-Cron-Jobs-Celery"><a rel="noopener" target="_blank" href="#h3-Why-Airflow-Instead-Cron-Jobs-Celery">Why Airflow Instead of Cron Jobs or Celery?</a></li>
    </ul>

    <li id="TOC-h2-Apache-Airflow-Document-Ingestion-Pipeline-Architecture"><a rel="noopener" target="_blank" href="#h2-Apache-Airflow-Document-Ingestion-Pipeline-Architecture">Apache Airflow Document Ingestion Pipeline Architecture</a></li>
    <ul>
        <li id="TOC-h3-Component-1-FastAPI-Ingestion-Service"><a rel="noopener" target="_blank" href="#h3-Component-1-FastAPI-Ingestion-Service">Component 1: FastAPI Ingestion Service</a></li>
        <li id="TOC-h3-Component-2-Apache-Airflow"><a rel="noopener" target="_blank" href="#h3-Component-2-Apache-Airflow">Component 2: Apache Airflow</a></li>
        <li id="TOC-h3-Component-3-PostgreSQL-Database"><a rel="noopener" target="_blank" href="#h3-Component-3-PostgreSQL-Database">Component 3: PostgreSQL Database</a></li>
        <li id="TOC-h3-Component-4-Shared-Volume"><a rel="noopener" target="_blank" href="#h3-Component-4-Shared-Volume">Component 4: Shared Volume</a></li>
    </ul>

    <li id="TOC-h2-Project-Structure"><a rel="noopener" target="_blank" href="#h2-Project-Structure">Project Structure</a></li>
    <ul>
        <li id="TOC-h3-Understanding-Structure"><a rel="noopener" target="_blank" href="#h3-Understanding-Structure">Understanding the Structure</a></li>
    </ul>

    <li id="TOC-h2-Database-Schema-Design-Document-Ingestion-Pipelines"><a rel="noopener" target="_blank" href="#h2-Database-Schema-Design-Document-Ingestion-Pipelines">Database Schema Design for Document Ingestion Pipelines</a></li>
    <ul>
        <li id="TOC-h3-documents-Table"><a rel="noopener" target="_blank" href="#h3-documents-Table">The documents Table</a></li>
        <li id="TOC-h3-chunks-Table"><a rel="noopener" target="_blank" href="#h3-chunks-Table">The chunks Table</a></li>
        <li id="TOC-h3-pipeline-runs-Table"><a rel="noopener" target="_blank" href="#h3-pipeline-runs-Table">The pipeline_runs Table</a></li>
        <li id="TOC-h3-Why-Hashes-Matter"><a rel="noopener" target="_blank" href="#h3-Why-Hashes-Matter">Why Hashes Matter</a></li>
        <li id="TOC-h3-Why-Idempotency-Matters"><a rel="noopener" target="_blank" href="#h3-Why-Idempotency-Matters">Why Idempotency Matters</a></li>
        <li id="TOC-h3-Database-Session-Management"><a rel="noopener" target="_blank" href="#h3-Database-Session-Management">Database Session Management</a></li>
    </ul>

    <li id="TOC-h2-Building-FastAPI-Document-Ingestion-Service"><a rel="noopener" target="_blank" href="#h2-Building-FastAPI-Document-Ingestion-Service">Building a FastAPI Document Ingestion Service</a></li>

    <li id="TOC-h2-Designing-Apache-Airflow-DAG"><a rel="noopener" target="_blank" href="#h2-Designing-Apache-Airflow-DAG">Designing an Apache Airflow DAG</a></li>
    <ul>
        <li id="TOC-h3-Task-1-Fetch-Pending-Documents"><a rel="noopener" target="_blank" href="#h3-Task-1-Fetch-Pending-Documents">Task 1: Fetch Pending Documents</a></li>
        <li id="TOC-h3-Task-2-Parse-Documents"><a rel="noopener" target="_blank" href="#h3-Task-2-Parse-Documents">Task 2: Parse Documents</a></li>
        <li id="TOC-h3-Task-3-Chunk-Documents"><a rel="noopener" target="_blank" href="#h3-Task-3-Chunk-Documents">Task 3: Chunk Documents</a></li>
        <li id="TOC-h3-Task-4-Validate-Chunks"><a rel="noopener" target="_blank" href="#h3-Task-4-Validate-Chunks">Task 4: Validate Chunks</a></li>
        <li id="TOC-h3-Task-5-Mark-Complete"><a rel="noopener" target="_blank" href="#h3-Task-5-Mark-Complete">Task 5: Mark Complete</a></li>
        <li id="TOC-h3-Why-This-DAG-Structure-Works"><a rel="noopener" target="_blank" href="#h3-Why-This-DAG-Structure-Works">Why This DAG Structure Works</a></li>
    </ul>

    <li id="TOC-h2-Summary"><a rel="noopener" target="_blank" href="#h2-Summary">Summary</a></li>
    <ul>
        <li id="TOC-h3-Citation-Information"><a rel="noopener" target="_blank" href="#h3-Citation-Information">Citation Information</a></li>
    </ul>
</ul>
</div>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h1-Apache-Airflow-Document-Ingestion-Pipeline-RAG-Systems"/>



<h2 class="wp-block-heading"><a href="#TOC-h1-Apache-Airflow-Document-Ingestion-Pipeline-RAG-Systems">Apache Airflow Document Ingestion Pipeline for RAG Systems</a></h2>



<p>In this lesson, you will learn how to design a production-grade document ingestion pipeline using Apache Airflow. We will build a system that accepts PDF uploads via FastAPI and orchestrates their processing using an Airflow DAG (Directed Acyclic Graph). You will see how to structure ingestion pipelines with idempotency, status tracking, and PostgreSQL-backed metadata. By the end of this lesson, you will understand how Airflow fits into modern RAG-style document ingestion workflows.</p>


<div class="wp-block-image">
<figure class="aligncenter size-full"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/apache-airflow-document-ingestion-pipeline-rag-systems-featured.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="940" height="780" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/apache-airflow-document-ingestion-pipeline-rag-systems-featured.png?lossy=2&strip=1&webp=1" alt="apache-airflow-document-ingestion-pipeline-rag-systems-featured.png" class="wp-image-54031" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/apache-airflow-document-ingestion-pipeline-rag-systems-featured.png?size=126x105&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/apache-airflow-document-ingestion-pipeline-rag-systems-featured-300x249.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/apache-airflow-document-ingestion-pipeline-rag-systems-featured.png?size=378x314&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/apache-airflow-document-ingestion-pipeline-rag-systems-featured.png?size=504x418&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/apache-airflow-document-ingestion-pipeline-rag-systems-featured.png?size=630x523&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/apache-airflow-document-ingestion-pipeline-rag-systems-featured-768x637.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/apache-airflow-document-ingestion-pipeline-rag-systems-featured.png?lossy=2&amp;strip=1&amp;webp=1 940w" sizes="(max-width: 630px) 100vw, 630px" /></a></figure></div>


<p>This lesson is the 1st in a 2-part series on <strong>Document Ingestion with Airflow</strong>:</p>



<ol class="wp-block-list">
<li><em><strong><a href="https://pyimg.co/8b2ey" target="_blank" rel="noreferrer noopener">Apache Airflow Document Ingestion Pipeline for RAG Systems</a></strong></em> <strong>(this tutorial)</strong></li>



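<li><em><a href="https://pyimg.co/kxc7e" target="_blank" rel="noreferrer noopener">Run an Apache Airflow DAG with Docker Compose and PostgreSQL</a></em></li>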
</ol>



<p><strong>To learn how to design and orchestrate a production-ready ingestion pipeline with Apache Airflow, FastAPI, and PostgreSQL, </strong><em><strong>just keep reading</strong></em><strong>.</strong></p>



<div id="pyi-source-code-block" class="source-code-wrap"><div class="gpd-source-code">
    <div class="gpd-source-code-content">
        <img decoding="async" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/source-code-icon.png?lossy=2&strip=1&webp=1" alt="">
        <h4>Looking for the source code to this post?</h4>
                    <a href="#download-the-code" class="pyis-cta-modal-open-modal">Jump Right To The Downloads Section <svg class="svg-icon arrow-right" width="12" height="12" aria-hidden="true" role="img" focusable="false" viewBox="0 0 14 14" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M6.8125 0.1875C6.875 0.125 6.96875 0.09375 7.09375 0.09375C7.1875 0.09375 7.28125 0.125 7.34375 0.1875L13.875 6.75C13.9375 6.8125 14 6.90625 14 7C14 7.125 13.9375 7.1875 13.875 7.25L7.34375 13.8125C7.28125 13.875 7.1875 13.9062 7.09375 13.9062C6.96875 13.9062 6.875 13.875 6.8125 13.8125L6.1875 13.1875C6.125 13.125 6.09375 13.0625 6.09375 12.9375C6.09375 12.8438 6.125 12.75 6.1875 12.6562L11.0312 7.8125H0.375C0.25 7.8125 0.15625 7.78125 0.09375 7.71875C0.03125 7.65625 0 7.5625 0 7.4375V6.5625C0 6.46875 0.03125 6.375 0.09375 6.3125C0.15625 6.25 0.25 6.1875 0.375 6.1875H11.0312L6.1875 1.34375C6.125 1.28125 6.09375 1.1875 6.09375 1.0625C6.09375 0.96875 6.125 0.875 6.1875 0.8125L6.8125 0.1875Z" fill="#169FE6"></path></svg></a>
            </div>
</div>
</div>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Introduction-Production-Grade-Document-Ingestion-Pipelines"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Introduction-Production-Grade-Document-Ingestion-Pipelines">Introduction to Production-Grade Document Ingestion Pipelines</a></h2>



<p>If you have ever built a Retrieval-Augmented Generation (RAG) system, you know that ingestion is the hardest part. Not the embeddings. Not the vector search. Not even the prompt engineering. The hardest part is reliably getting documents into your system, parsing them correctly, chunking them intelligently, and tracking every step along the way.</p>



<p>Why? Because ingestion is where the real world meets your clean ML architecture. PDFs are corrupted. Files are massive. Network requests fail halfway through. And when something breaks, you need to know exactly which document failed, why it failed, and how to restart processing without duplicating work or losing data.</p>



<p>This is where orchestration becomes critical. You need a system that can schedule work, retry failures, track progress, and give you observability into every stage of your pipeline. For ML ingestion pipelines, Apache Airflow is one of the best tools for this job.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Why-Airflow-Instead-Cron-Jobs-Celery"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Why-Airflow-Instead-Cron-Jobs-Celery">Why Airflow Instead of Cron Jobs or Celery?</a></h3>



<p>You might ask: why not just use cron jobs to trigger a Python script every minute? Or why not use Celery for task queueing? The answer is observability and resilience.</p>



<p>Cron jobs give you scheduling, but no visibility into what failed or why. When a cron job fails at 3 a.m., you find out when users complain. You have no task history, no retry logic, and no dependency management. Celery gives you distributed task execution, but it does not provide workflow orchestration. You have to manually chain tasks, handle retries, and build your own monitoring.</p>



<p>Airflow gives you all of this out of the box. Think of it as a conveyor belt with inspection stations. Every document moves through the same sequence of steps (parse, chunk, validate), and at each station, Airflow records what happened. If a step fails, Airflow retries it automatically. If the entire system crashes, Airflow resumes from where it left off. The web UI shows you exactly which documents are stuck and why.</p>



<p>For production ML systems, this observability is not optional. It is the difference between debugging for hours and knowing immediately which PDF caused the parser to crash.</p>



<p>In this lesson, you will learn how to build a production-grade document ingestion pipeline using Apache Airflow. We will design a complete system that accepts PDF uploads via a REST (Representational State Transfer) API and orchestrates their processing using an Airflow DAG, with full deduplication and idempotency guarantees backed by PostgreSQL.</p>



<p>More importantly, you will understand why Airflow fits ingestion better than training or inference, and where its limitations begin. This foundation prepares you for the next lesson, where we implement the shared parsing and chunking logic and later transition to Argo Workflows for GPU-based ML compute.</p>



<p>By the end of this part, you will have a working control plane for your ingestion pipeline that you can extend for your own RAG systems and document processing workflows.</p>



<p>Let’s get started.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Apache-Airflow-Document-Ingestion-Pipeline-Architecture"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Apache-Airflow-Document-Ingestion-Pipeline-Architecture">Apache Airflow Document Ingestion Pipeline Architecture</a></h2>



<p>Before we dive into code, let&#8217;s understand what we are building. <strong>Figure 1</strong> shows the high-level architecture of our ingestion pipeline.</p>


<div class="wp-block-image">
<figure class="aligncenter size-full"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-4.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="624" height="203" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-4.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-54033" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-4.png?size=126x41&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-4-300x98.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-4.png?size=378x123&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-4.png?size=504x164&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-4.png?lossy=2&amp;strip=1&amp;webp=1 624w" sizes="(max-width: 624px) 100vw, 624px" /></a><figcaption class="wp-element-caption"><strong>Figure 1:</strong> High-level architecture of the Airflow-based ML ingestion pipeline. Documents flow from the FastAPI service through Airflow tasks and into PostgreSQL.</figcaption></figure></div>


<p>Our system consists of the following 4 main components.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Component-1-FastAPI-Ingestion-Service"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Component-1-FastAPI-Ingestion-Service">Component 1: FastAPI Ingestion Service</a></h3>



<p>This is the entry point for documents. It exposes a REST API on port 8000 that accepts PDF uploads. When a document arrives, the service performs three critical operations. First, it computes a SHA-256 hash of the file content to detect duplicates. Second, it saves the file to a shared volume that Airflow can access. Third, it inserts a record into the documents table in PostgreSQL with status set to <code data-enlighter-language="python" class="EnlighterJSRAW">PENDING</code>.</p>



<p>The service does not process the document. It only accepts it and marks it for processing. This separation of concerns is intentional. Ingestion and processing are different responsibilities with different scaling characteristics.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Component-2-Apache-Airflow"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Component-2-Apache-Airflow">Component 2: Apache Airflow</a></h3>



<p>Airflow is the orchestration layer. It runs two main processes: the scheduler and the webserver. The scheduler monitors our DAG (Directed Acyclic Graph) and triggers it on a schedule. In our case, the DAG runs every minute and looks for documents with status <code data-enlighter-language="python" class="EnlighterJSRAW">PENDING</code>.</p>



<p>When the DAG runs, it executes a series of tasks in order: fetch pending documents, parse PDFs into pages, chunk the text, validate chunk quality, and mark documents as complete. Each task is idempotent, meaning you can run it multiple times safely. Each task also has retry logic, so transient failures do not require manual intervention.</p>



<p>The webserver provides a UI on port 8080 where you can monitor DAG runs, inspect task logs, and manually trigger runs when needed.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Component-3-PostgreSQL-Database"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Component-3-PostgreSQL-Database">Component 3: PostgreSQL Database</a></h3>



<p>PostgreSQL serves 2 purposes in our system. First, it stores Airflow&#8217;s own metadata (DAG runs, task instances, logs). Second, it stores our application data in a separate database called <code data-enlighter-language="python" class="EnlighterJSRAW">ml_orchestration</code>.</p>



<p>Our application database has 3 main tables. The documents table tracks every uploaded file with its hash, path, and processing status. The chunks table stores the parsed and chunked text with deduplication via content hashes. The <code data-enlighter-language="python" class="EnlighterJSRAW">pipeline_runs</code> table records every DAG execution with metrics like how many documents were processed and how many chunks were created.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Component-4-Shared-Volume"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Component-4-Shared-Volume">Component 4: Shared Volume</a></h3>



<p>The fourth component is not visible in the diagram, but it is critical. All containers share a Docker volume mounted at <code data-enlighter-language="python" class="EnlighterJSRAW">/tmp/ml_orchestration/uploads</code>. When the FastAPI service saves a file, Airflow tasks can read it directly without network transfers or complex file synchronization.</p>



<p>This architecture gives us several important properties. First, we have a clear separation between ingestion (FastAPI) and processing (Airflow). Second, we have observability through Airflow&#8217;s UI and PostgreSQL queries. Third, we have idempotency through content hashing and status tracking. Fourth, we have reliability through Airflow&#8217;s retry mechanisms.</p>



<p>Now let us see how this maps to the actual codebase.</p>



<hr class="wp-block-separator has-alpha-channel-opacity"/>



<p>Would you like immediate access to 3,457 images curated and labeled with hand gestures to train, explore, and experiment with &#8230; for free? Head over to <a href="https://universe.roboflow.com/isl/az-6mqow?ref=pyimagesearch" target="_blank" rel="noreferrer noopener">Roboflow</a> and get a free account to grab these hand gesture images. </p>



<hr class="wp-block-separator has-alpha-channel-opacity"/>



<!-- wp:paragraph -->
<h3>Need Help Configuring Your Development Environment?</h3>
<!-- /wp:paragraph -->

<!-- wp:image {"align":"center","id":18137,"sizeSlug":"large","linkDestination":"custom"} -->
<figure class="wp-block-image aligncenter size-large"><a href="https://pyimagesearch.com/pyimagesearch-university/" target="_blank" rel="noreferrer noopener"><img decoding="async" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-18137" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?lossy=2&strip=1&webp=1 500w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?size=126x84&lossy=2&strip=1&webp=1 126w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?size=252x168&lossy=2&strip=1&webp=1 252w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?size=378x253&lossy=2&strip=1&webp=1 378w" sizes="(max-width: 500px) 100vw, 500px" /></a><figcaption>Having trouble configuring your development environment? Want access to pre-configured Jupyter Notebooks running on Google Colab? Be sure to join <a href="https://pyimagesearch.com/pyimagesearch-university/" target="_blank" rel="noreferrer noopener" aria-label=" (opens in a new tab)">PyImageSearch University</a> — you will be up and running with this tutorial in a matter of minutes. </figcaption></figure>
<!-- /wp:image -->

<!-- wp:paragraph -->
<p>All that said, are you:</p>
<!-- /wp:paragraph -->

<!-- wp:list -->
<ul><li>Short on time?</li><li>Learning on your employer’s administratively locked system?</li><li>Wanting to skip the hassle of fighting with the command line, package managers, and virtual environments?</li><li><strong>Ready to run the code immediately on your Windows, macOS, or Linux system?</strong></li></ul>
<!-- /wp:list -->

<!-- wp:paragraph -->
<p>Then join <a href="https://pyimagesearch.com/pyimagesearch-university/" target="_blank" rel="noreferrer noopener">PyImageSearch University</a> today!</p>
<!-- /wp:paragraph -->

<!-- wp:paragraph -->
<p><strong>Gain access to Jupyter Notebooks for this tutorial and other PyImageSearch guides pre-configured to run on Google Colab’s ecosystem right in your web browser!</strong> No installation required.</p>
<!-- /wp:paragraph -->

<!-- wp:paragraph -->
<p>And best of all, these Jupyter Notebooks will run on Windows, macOS, and Linux!</p>
<!-- /wp:paragraph -->



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Project-Structure"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Project-Structure">Project Structure</a></h2>



<p>We first need to review our project directory structure.</p>



<p>Start by accessing this tutorial’s <em><strong>“Downloads”</strong></em> section to retrieve the source code and example images.</p>



<p>From there, take a look at the directory structure:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Apache Airflow Document Ingestion Pipeline for RAG Systems" data-enlighter-group="1">├── airflow_project/ # Airflow orchestration system
│ ├── dags/
│ │ └── ingest_documents_dag.py # Main DAG: orchestrates PDF→chunks pipeline
│ │
│ ├── ingestion_service/ # FastAPI REST API for file uploads
│ │ ├── __init__.py
│ │ ├── main.py # Upload endpoint with deduplication
│ │ └── requirements.txt # FastAPI, Uvicorn dependencies
│ │
│ ├── docker-compose.yml # Orchestrates 5 services (Postgres, Airflow, API)
│ ├── Dockerfile # Airflow container image
│ ├── Dockerfile.service # FastAPI service container image
│ └── init-db.sh # PostgreSQL database initialization script
│
└── shared/ # Shared utilities (used by Airflow)
  ├── data_models/
  │ ├── __init__.py
  │ └── models.py # Pydantic schemas (Document, Chunk, PipelineRun)
  │
  ├── parsing/
  │ ├── __init__.py
  │ ├── pdf_parser.py # PyPDF extraction logic
  │ ├── chunker.py # Sliding window text chunking
  │ └── deduplication.py # Content hashing utilities
  │
  ├── storage/
  │ ├── __init__.py
  │ ├── database.py # SQLAlchemy session management (session_scope, get_session)
  │ └── models.py # ORM models (DocumentModel, ChunkModel, PipelineRunModel)
  │
  ├── utils/
  │ ├── __init__.py
  │ ├── hashing.py # SHA-256 file/content hashing
  │ └── logging.py # Structured logging (get_logger)
  │
  ├── __init__.py
  └── requirements.txt # Shared dependencies (SQLAlchemy, Pydantic, PyPDF)
</pre>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Understanding-Structure"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Understanding-Structure">Understanding the Structure</a></h3>



<p>This project consists of 2 main directories that work together to create a production-grade document ingestion pipeline.</p>



<h4 class="wp-block-heading">The airflow_project/ Directory</h4>



<p>This folder contains everything for document ingestion using Apache Airflow. Think of it as your document processing factory &#8211; where raw PDFs enter the system and emerge as structured, searchable chunks.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">dags/ingest_documents_dag.py</code> file defines our workflow with five sequential tasks: fetch pending documents from the database, parse PDFs with PyPDF, split text into overlapping chunks, validate chunk quality, and mark documents complete. Each task is idempotent (safe to retry) and includes granular error handling so one corrupted PDF does not block an entire batch.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">ingestion_service/</code> subdirectory runs a FastAPI REST API on port 8000. Users upload PDFs via HTTP POST. The service computes a SHA-256 hash, checks for duplicates, saves the file to a shared volume, and inserts a database record with <code data-enlighter-language="python" class="EnlighterJSRAW">status=PENDING</code>. It deliberately does not process the file — that separation keeps uploads fast (users get immediate feedback) while heavy processing happens asynchronously in Airflow.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">docker-compose.yml</code> file orchestrates five containers: PostgreSQL (dual purpose: stores Airflow&#8217;s metadata and our application data in separate databases), Airflow webserver (UI on port 8080), Airflow scheduler (triggers the DAG every minute), init container (one-time database setup), and the ingestion service (API on port 8000). The critical piece is the shared <code data-enlighter-language="python" class="EnlighterJSRAW">/tmp/ml_orchestration/uploads</code> volume mounted into both Airflow containers and the API service &#8211; this lets Airflow read files the API writes without network transfers.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">Dockerfile</code> builds the Airflow container with necessary Python dependencies. The <code data-enlighter-language="python" class="EnlighterJSRAW">Dockerfile.service</code> builds the FastAPI container. The <code data-enlighter-language="python" class="EnlighterJSRAW">init-db.sh</code> script runs automatically when PostgreSQL starts, creating the <code data-enlighter-language="python" class="EnlighterJSRAW">ml_orchestration</code> database and <code data-enlighter-language="python" class="EnlighterJSRAW">mlops</code> user with proper permissions.</p>



<h4 class="wp-block-heading">The shared/ Directory</h4>



<p>This is your reusable logic layer. Everything here is pure Python business logic with zero Airflow dependencies.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">data_models/models.py</code> file contains Pydantic schemas that enforce data structure. Every document has a filename, file path, content hash, and status. Every chunk has text, a content hash, and a document reference. These schemas validate data at the API boundary and prevent type mismatches.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">parsing/</code> subdirectory implements document processing. The <code data-enlighter-language="python" class="EnlighterJSRAW">pdf_parser.py</code> module uses PyPDF to extract text page by page, preserving metadata like title and author. The <code data-enlighter-language="python" class="EnlighterJSRAW">chunker.py</code> module implements sliding window chunking (512 words with 50-word overlap) to split long documents while maintaining context across boundaries. The <code data-enlighter-language="python" class="EnlighterJSRAW">deduplication.py</code> module computes SHA-256 hashes to detect identical content both at the document and chunk level.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">storage/</code> subdirectory manages all database interaction. The <code data-enlighter-language="python" class="EnlighterJSRAW">database.py</code> file provides 2 session management utilities: <code data-enlighter-language="python" class="EnlighterJSRAW">session_scope()</code> (context manager for Airflow tasks with automatic commit/rollback) and <code data-enlighter-language="python" class="EnlighterJSRAW">get_session()</code> (generator for FastAPI dependency injection). The <code data-enlighter-language="python" class="EnlighterJSRAW">models.py</code> file defines SQLAlchemy ORM classes that map Python objects to PostgreSQL tables &#8211; <code data-enlighter-language="python" class="EnlighterJSRAW">DocumentModel</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">ChunkModel</code>, and <code data-enlighter-language="python" class="EnlighterJSRAW">PipelineRunModel</code>.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">utils/</code> subdirectory contains 2 essential helpers. The <code data-enlighter-language="python" class="EnlighterJSRAW">hashing.py</code> module computes SHA-256 hashes for both files (read in chunks to handle large PDFs) and strings (for chunk deduplication). The <code data-enlighter-language="python" class="EnlighterJSRAW">logging.py</code> module provides the <code data-enlighter-language="python" class="EnlighterJSRAW">get_logger()</code> function that returns a configured logger with consistent formatting across the entire system.</p>



<p>Now that you understand where everything lives and why, let&#8217;s dive into building the system.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Database-Schema-Design-Document-Ingestion-Pipelines"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Database-Schema-Design-Document-Ingestion-Pipelines">Database Schema Design for Document Ingestion Pipelines</a></h2>



<p>The database schema is the backbone of our ingestion pipeline. <strong>Figure 2</strong> shows the 3 main tables and their relationships.</p>


<div class="wp-block-image">
<figure class="aligncenter size-full"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-5.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="624" height="335" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-5.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-54038" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-5.png?size=126x68&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-5-300x161.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-5.png?size=378x203&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-5.png?size=504x271&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-5.png?lossy=2&amp;strip=1&amp;webp=1 624w" sizes="(max-width: 624px) 100vw, 624px" /></a><figcaption class="wp-element-caption"><strong>Figure 2:</strong> Database schema showing the <code>documents</code>, <code>chunks</code>, and <code>pipeline_runs</code> tables with their relationships and key columns.</figcaption></figure></div>


<p>Let&#8217;s examine each table and understand the design decisions.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-documents-Table"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-documents-Table">The documents Table</a></h3>



<p>This table tracks every uploaded file. The <code data-enlighter-language="python" class="EnlighterJSRAW">id</code> column is an auto-incrementing primary key. The <code data-enlighter-language="python" class="EnlighterJSRAW">filename</code> stores the original name (e.g., <code data-enlighter-language="python" class="EnlighterJSRAW">research_paper.pdf</code>). The <code data-enlighter-language="python" class="EnlighterJSRAW">file_path</code> stores the absolute path where the file is saved on disk.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">content_hash</code> column is critical. It stores the SHA-256 hash of the entire file content. This hash serves 2 purposes. First, it detects duplicate uploads. If 2 users upload the same PDF with different filenames, we catch it immediately. Second, it enables idempotency. If we need to reprocess a document, we can verify the file content has not changed.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">status</code> column uses a PostgreSQL ENUM with 4 values: <code data-enlighter-language="python" class="EnlighterJSRAW">PENDING</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">PROCESSING</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">COMPLETED</code>, and <code data-enlighter-language="python" class="EnlighterJSRAW">FAILED</code>. This drives the entire workflow. The FastAPI service sets status to <code data-enlighter-language="python" class="EnlighterJSRAW">PENDING</code>. When the DAG completes successfully, Airflow updates it to <code data-enlighter-language="python" class="EnlighterJSRAW">COMPLETED</code>. If any task fails, it becomes <code data-enlighter-language="python" class="EnlighterJSRAW">FAILED</code>. (The <code data-enlighter-language="python" class="EnlighterJSRAW">PROCESSING</code> state is available for systems that want to mark documents as in-progress, though our implementation goes directly from <code data-enlighter-language="python" class="EnlighterJSRAW">PENDING</code> to <code data-enlighter-language="python" class="EnlighterJSRAW">COMPLETED</code> or <code data-enlighter-language="python" class="EnlighterJSRAW">FAILED</code>.)</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">created_at</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">updated_at</code> columns provide audit trails. We know exactly when each document entered the system and when it was last modified.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-chunks-Table"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-chunks-Table">The chunks Table</a></h3>



<p>This table stores the processed text chunks. The <code data-enlighter-language="python" class="EnlighterJSRAW">document_id</code> foreign key creates a one-to-many relationship with documents. One document produces many chunks.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">chunk_index</code> tracks the order of chunks within a document. Chunk 0 is the first chunk, chunk 1 is the second, and so on. This ordering is important for maintaining context.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">text</code> column holds the actual chunk content. The <code data-enlighter-language="python" class="EnlighterJSRAW">content_hash</code> is the SHA-256 of this text. Just like with documents, this prevents duplicate chunks. If the same text appears in multiple places (common in academic papers with repeated abstracts), we store it once.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">page_number</code> tracks which PDF page the chunk came from. This is useful for providing citations back to users. The <code data-enlighter-language="python" class="EnlighterJSRAW">token_count</code> provides a rough estimate of length (we use word count as a proxy for tokens), which helps with embedding model limits later.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-pipeline-runs-Table"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-pipeline-runs-Table">The pipeline_runs Table</a></h3>



<p>This table tracks every DAG execution. The <code data-enlighter-language="python" class="EnlighterJSRAW">pipeline_type</code> column will eventually distinguish between <code data-enlighter-language="python" class="EnlighterJSRAW">airflow</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">argo</code> runs. For now, it is always <code data-enlighter-language="python" class="EnlighterJSRAW">airflow</code>.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">run_id</code> is Airflow&#8217;s unique execution identifier. It looks like <code data-enlighter-language="python" class="EnlighterJSRAW">manual__2026-01-26T09:56:12.565856+00:00</code>. This connects our table to Airflow&#8217;s internal metadata.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">status</code> column tracks whether the entire pipeline run succeeded or failed. The <code data-enlighter-language="python" class="EnlighterJSRAW">started_at</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">completed_at</code> timestamps measure execution time.</p>



<p>The metrics columns (<code data-enlighter-language="python" class="EnlighterJSRAW">documents_processed</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">chunks_created</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">embeddings_created</code>) provide observability. You can query this table to see how many documents you have processed over time or track your processing rate.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">run_metadata</code> column is a JSON field for flexible additional data. We store the DAG ID and execution date here.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Why-Hashes-Matter"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Why-Hashes-Matter">Why Hashes Matter</a></h3>



<p>Content hashing is not optional in production ML systems. Without hashes, you cannot detect duplicates. Users will upload the same research paper five times, creating 5 sets of chunks and wasting embedding compute and storage.</p>



<p>Without hashes, you cannot implement idempotency. If Airflow retries a task, you might create duplicate chunks or corrupt existing data. With hashes, every operation checks &#8220;does this hash already exist?&#8221; before creating new records.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Why-Idempotency-Matters"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Why-Idempotency-Matters">Why Idempotency Matters</a></h3>



<p>Idempotency means you can run an operation multiple times and get the same result. This is essential in distributed systems where failures are normal. If your DAG fails halfway through, you should be able to restart it safely.</p>



<p>Our design achieves idempotency through 3 mechanisms. First, content hashes prevent duplicate records. Second, status tracking prevents reprocessing completed documents. Third, task-level checks (e.g., &#8220;does this chunk hash already exist?&#8221;) ensure partial failures are recoverable.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Database-Session-Management"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Database-Session-Management">Database Session Management</a></h3>



<p>Before we dive into the ingestion service and DAG code, we need to understand how we connect to the database. All our code uses 2 key utilities from <code data-enlighter-language="python" class="EnlighterJSRAW">shared/storage/database.py</code>: <code data-enlighter-language="python" class="EnlighterJSRAW">session_scope()</code> for Airflow tasks and <code data-enlighter-language="python" class="EnlighterJSRAW">get_session()</code> for FastAPI.</p>



<p>Here is the complete database connection code:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Apache Airflow Document Ingestion Pipeline for RAG Systems" data-enlighter-group="3"># shared/storage/database.py
import logging
import os
from contextlib import contextmanager
from typing import Generator
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, Session

logger = logging.getLogger(__name__)

# Base class for all ORM models
Base = declarative_base()

# Database connection string from environment
DATABASE_URL = os.getenv(
    "ML_ORCHESTRATION_DB_URI",
    "postgresql://mlops:mlops_password@localhost:5432/ml_orchestration"
)

# Process-wide engine, created lazily by get_engine() on first use.
_engine = None

def get_engine():
    """
    Return the shared SQLAlchemy engine, creating it on the first call.

    The engine owns the connection pool, so it is built once per process
    and reused by every session. Building a new engine for each session
    would give each session its own pool, so pool_size and max_overflow
    would no longer cap the number of connections opened to PostgreSQL.

    Returns:
        The cached engine bound to DATABASE_URL.
    """
    global _engine
    # Lazy creation: nothing connects at import time, only when a caller
    # first needs a session.
    if _engine is None:
        _engine = create_engine(
            DATABASE_URL,
            pool_pre_ping=True,  # Verify connections before using
            pool_size=5,
            max_overflow=10,
            echo=False  # Set to True for SQL query logging
        )
    return _engine

@contextmanager
def session_scope():
    """
    Wrap a unit of database work in a single transaction.

    Example (inside an Airflow task):
        with session_scope() as session:
            documents = session.query(DocumentModel).all()

    Behavior:
    - Commits when the with-block finishes normally
    - Rolls back and re-raises when an exception is raised
    - Closes the session in every case
    """
    make_session = sessionmaker(bind=get_engine())
    db = make_session()
    try:
        yield db
        # Commit stays inside the try so a failed commit is rolled back too.
        db.commit()
    except Exception:
        db.rollback()
        raise
    finally:
        db.close()

def get_session() -> Generator[Session, None, None]:
    """
    FastAPI dependency for database sessions.
    
    Usage:
        @app.post("/documents")
        async def upload(session: Session = Depends(get_session)):
            # Use session here
    
    FastAPI calls this function for each request and handles cleanup.
    """
    engine = get_engine()
    SessionLocal = sessionmaker(bind=engine)
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()</pre>



<p>Let us break down these utilities.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">get_engine()</code> function creates a SQLAlchemy engine, which manages the connection pool to PostgreSQL. The <code data-enlighter-language="python" class="EnlighterJSRAW">pool_pre_ping=True</code> parameter tells SQLAlchemy to test each connection before using it. This handles cases where the database was restarted or connections went stale.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">pool_size=5</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">max_overflow=10</code> settings control connection pooling. We maintain 5 persistent connections and can create up to 10 additional temporary connections under load. This prevents overwhelming the database with thousands of connections.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">session_scope()</code> context manager is used throughout our Airflow DAG tasks. It provides a transactional scope with automatic cleanup. When you use <code data-enlighter-language="python" class="EnlighterJSRAW">with session_scope() as session:</code>, the context manager creates a session, executes your code, commits the transaction if successful, or rolls back if an exception occurs. The <code data-enlighter-language="python" class="EnlighterJSRAW">finally</code> block ensures the connection is always closed.</p>



<p>This pattern prevents common bugs like forgetting to commit, leaking connections, or leaving transactions open after errors.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">get_session()</code> generator is designed for FastAPI&#8217;s dependency injection system. FastAPI calls this function for each HTTP request and automatically handles cleanup when the request completes. You never need to manually close the session in your endpoint code.</p>



<p>These 2 utilities ensure database operations are safe, consistent, and clean across both Airflow and FastAPI. Now let us see how the ingestion service uses <code data-enlighter-language="python" class="EnlighterJSRAW">get_session()</code>.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Building-FastAPI-Document-Ingestion-Service"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Building-FastAPI-Document-Ingestion-Service">Building a FastAPI Document Ingestion Service</a></h2>



<p>The FastAPI service is the entry point for documents. Let us walk through the code line by line to understand how it works.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-6-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="994" height="1024" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-6-994x1024.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-54042" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-6.png?size=126x130&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-6-291x300.png?lossy=2&amp;strip=1&amp;webp=1 291w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-6.png?size=378x389&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-6.png?size=504x519&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-6.png?size=630x649&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-6-768x791.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-6-994x1024.png?lossy=2&amp;strip=1&amp;webp=1 994w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-6-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1049w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 3:</strong> Request flow through the FastAPI ingestion service showing validation, storage, hashing, duplication check, and database insertion.</figcaption></figure></div>


<p>Here is the complete upload endpoint from <code data-enlighter-language="python" class="EnlighterJSRAW">airflow_project/ingestion_service/main.py</code>:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Apache Airflow Document Ingestion Pipeline for RAG Systems" data-enlighter-group="5">@app.post("/documents", response_model=DocumentResponse, status_code=201)
async def upload_document(
    file: UploadFile = File(...),
    session: Session = Depends(lambda: next(get_session()))
):
    """
    Upload a new document.
    
    The document will be stored and marked as PENDING for processing.
    """
    logger.info(f"Uploading document: {file.filename}")
    
    # Validate file type
    if not file.filename.lower().endswith('.pdf'):
        raise HTTPException(
            status_code=400,
            detail="Only PDF files are supported"
        )
    
    try:
        # Save file to disk
        file_path = UPLOAD_DIR / f"{datetime.utcnow().timestamp()}_{file.filename}"
        
        with open(file_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)
        
        # Compute file hash and size
        content_hash = hash_file(str(file_path))
        file_size = file_path.stat().st_size
        
        # Check for duplicates
        existing_doc = session.query(DocumentModel).filter(
            DocumentModel.content_hash == content_hash
        ).first()
        
        if existing_doc:
            logger.warning(f"Duplicate document detected: {content_hash}")
            file_path.unlink()
            raise HTTPException(
                status_code=409,
                detail=f"Document already exists with ID {existing_doc.id}"
            )
        
        # Create document record
        document = DocumentModel(
            filename=file.filename,
            file_path=str(file_path),
            content_hash=content_hash,
            file_size=file_size,
            mime_type="application/pdf",
            status=DocumentStatus.PENDING
        )
        
        session.add(document)
        session.commit()
        session.refresh(document)
        
        logger.info(f"Document uploaded successfully: ID {document.id}")
        
        return DocumentResponse.from_orm(document)
        
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Failed to upload document: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))</pre>



<p>Let us break this down step by step.</p>



<p>The function signature uses FastAPI’s dependency injection. The <code data-enlighter-language="python" class="EnlighterJSRAW">file</code> parameter comes from the HTTP request as multipart form data. The <code data-enlighter-language="python" class="EnlighterJSRAW">session</code> parameter is injected by FastAPI using <code data-enlighter-language="python" class="EnlighterJSRAW">Depends()</code>. This gives us a database session without manual connection management.</p>



<p>The first operation is file type validation. We only accept PDFs for this lesson, so we check the filename extension. If it is not a PDF, we raise an HTTP 400 error immediately. Production systems might also validate file size, scan for malware, or check MIME types, but we keep it simple here.</p>



<p>Next, we save the file to disk. The <code data-enlighter-language="python" class="EnlighterJSRAW">UPLOAD_DIR</code> is <code data-enlighter-language="python" class="EnlighterJSRAW">/tmp/ml_orchestration/uploads</code>. This directory is mounted as a Docker volume, which means all containers can access it. We prefix the filename with a UTC timestamp to avoid collisions. If 2 users upload files named <code data-enlighter-language="python" class="EnlighterJSRAW">paper.pdf</code>, they become <code data-enlighter-language="python" class="EnlighterJSRAW">1769421678.801241_paper.pdf</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">1769421690.123456_paper.pdf</code>.</p>



<p>We use <code data-enlighter-language="python" class="EnlighterJSRAW">shutil.copyfileobj()</code> to stream the file content from the upload to disk. This is memory-efficient because it processes the file in chunks rather than loading the entire file into RAM.</p>



<p>After saving, we compute 2 important values. The <code data-enlighter-language="python" class="EnlighterJSRAW">hash_file()</code> function reads the entire file and computes its SHA-256 hash. This is a cryptographic hash function that produces a unique 64-character hexadecimal string for the file content. Even a single byte change produces a completely different hash. We also get the file size in bytes using <code data-enlighter-language="python" class="EnlighterJSRAW">file_path.stat().st_size</code>.</p>



<p>The next step is critical: duplicate detection. We query the database for any existing document with the same content hash. If we find one, we know this exact file has been uploaded before, even if it has a different filename. We delete the newly uploaded file with <code data-enlighter-language="python" class="EnlighterJSRAW">file_path.unlink()</code> and return an HTTP 409 Conflict error with the ID of the existing document. This prevents duplicate processing.</p>



<p>If the document is unique, we create a new <code data-enlighter-language="python" class="EnlighterJSRAW">DocumentModel</code> instance. Notice the status field is set to <code data-enlighter-language="python" class="EnlighterJSRAW">DocumentStatus.PENDING</code>. This tells Airflow that the document needs processing. We do not set it to <code data-enlighter-language="python" class="EnlighterJSRAW">PROCESSING</code> or <code data-enlighter-language="python" class="EnlighterJSRAW">COMPLETED</code> because the upload service does not process documents. It only accepts them.</p>



<p>We add the model to the session, commit the transaction, and refresh the model to get the auto-generated ID. Finally, we return a <code data-enlighter-language="python" class="EnlighterJSRAW">DocumentResponse</code> with all the document details. The HTTP status code is 201 Created, which is the correct status for successful resource creation.</p>



<p>The error handling is worth noting. We re-raise <code data-enlighter-language="python" class="EnlighterJSRAW">HTTPException</code> instances without modification because FastAPI knows how to convert them to HTTP responses. For all other exceptions, we log the error and return an HTTP 500 with the error message. In production, you would want more sophisticated error handling (do not expose internal errors to clients), but this is sufficient for a lesson.</p>



<p><strong>What This Service Does Not Do</strong></p>



<p>Notice what is missing from this code. There is no PDF parsing. No text chunking. No embedding generation. The service has one responsibility: accept files and mark them for processing. This separation is intentional.</p>



<p>Ingestion and processing are different concerns. Ingestion must be fast and available. Users should be able to upload files without waiting for heavyweight processing. Processing can happen asynchronously, can retry on failure, and can take as long as needed.</p>



<p>This is where Airflow enters the picture. Let us see how the DAG processes these pending documents.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Designing-Apache-Airflow-DAG"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Designing-Apache-Airflow-DAG">Designing an Apache Airflow DAG</a></h2>



<p>The DAG is the heart of our orchestration logic. <strong>Figure </strong><strong>4</strong> shows the task graph and execution order.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/06/image-7-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="786" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-7-1024x786.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-54046" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-7.png?size=126x97&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-7-300x230.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-7.png?size=378x290&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-7.png?size=504x387&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-7.png?size=630x484&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-7-768x589.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-7-1024x786.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/06/image-7-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 4:</strong> The Airflow DAG execution graph showing task dependencies and data flow between tasks.</figcaption></figure></div>


<p>Here is how the DAG is defined in <code data-enlighter-language="python" class="EnlighterJSRAW">airflow_project/dags/ingest_documents_dag.py</code>:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Apache Airflow Document Ingestion Pipeline for RAG Systems" data-enlighter-group="7">with DAG(
    dag_id='ingest_documents_dag',
    default_args=default_args,
    description='Ingest and process documents for ML pipeline',
    # Trigger a run every minute
    schedule_interval=timedelta(minutes=1),
    start_date=days_ago(1),
    # Do not create runs for intervals missed since start_date
    catchup=False,
    tags=['ingestion', 'documents', 'ml-pipeline'],
) as dag:
    
    # One PythonOperator per pipeline stage; each wraps a single callable.
    # NOTE(review): provide_context looks like an Airflow 1.x argument;
    # newer releases pass the context automatically - confirm the target version.
    fetch_documents_task = PythonOperator(
        task_id='fetch_documents',
        python_callable=fetch_pending_documents,
        provide_context=True,
    )
    
    parse_documents_task = PythonOperator(
        task_id='parse_documents',
        python_callable=parse_documents,
        provide_context=True,
    )
    
    chunk_documents_task = PythonOperator(
        task_id='chunk_documents',
        python_callable=chunk_documents,
        provide_context=True,
    )
    
    validate_chunks_task = PythonOperator(
        task_id='validate_chunks',
        python_callable=validate_chunks,
        provide_context=True,
    )
    
    mark_complete_task = PythonOperator(
        task_id='mark_complete',
        python_callable=mark_documents_complete,
        provide_context=True,
    )
    
    # Define task dependencies
    # The two lines share chunk_documents_task, so together they form one
    # linear chain: fetch, parse, chunk, validate, mark complete
    fetch_documents_task >> parse_documents_task >> chunk_documents_task
    chunk_documents_task >> validate_chunks_task >> mark_complete_task</pre>



<p>Let us understand each configuration parameter.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">dag_id</code> is the unique identifier for this workflow. It appears in the Airflow UI and logs. The <code data-enlighter-language="python" class="EnlighterJSRAW">default_args</code> dictionary contains settings that apply to all tasks. This includes retry behavior, execution timeout, and owner information.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">schedule_interval</code> is set to <code data-enlighter-language="python" class="EnlighterJSRAW">timedelta(minutes=1)</code>. This means Airflow runs this DAG every minute. In production, you might use hourly or daily schedules, but for demos and development, 1 minute lets you see results quickly.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">start_date</code> is set to <code data-enlighter-language="python" class="EnlighterJSRAW">days_ago(1)</code>, which means the DAG is eligible to run starting yesterday. The <code data-enlighter-language="python" class="EnlighterJSRAW">catchup=False</code> parameter is important. Without this, Airflow would try to run the DAG for every missed interval since the start date. We do not want that. We only care about processing current pending documents, not creating historical backfill runs.</p>



<p>The tags list helps organize DAGs in the UI. You can filter by tag to find related workflows.</p>



<p>Each task uses a <code data-enlighter-language="python" class="EnlighterJSRAW">PythonOperator</code>, which executes a Python function. The <code data-enlighter-language="python" class="EnlighterJSRAW">task_id</code> must be unique within the DAG. The <code data-enlighter-language="python" class="EnlighterJSRAW">python_callable</code> is the function to execute. The <code data-enlighter-language="python" class="EnlighterJSRAW">provide_context=True</code> parameter gives the function access to Airflow’s execution context.</p>



<p>Why does context matter? Because it provides critical runtime information: the unique <code data-enlighter-language="python" class="EnlighterJSRAW">run_id</code> (for creating file names that do not collide across runs), the execution timestamp (for audit trails), and XCom access (for passing data between tasks). Without context, your task functions would be isolated and unable to coordinate or share state.</p>



<p>The task dependencies are defined using the <code data-enlighter-language="python" class="EnlighterJSRAW">&gt;&gt;</code> operator. This creates a directed graph. <code data-enlighter-language="python" class="EnlighterJSRAW">fetch_documents_task &gt;&gt; parse_documents_task</code> means parse documents cannot start until fetch documents completes. The second dependency line picks up from <code data-enlighter-language="python" class="EnlighterJSRAW">chunk_documents_task</code>, so together the 2 lines form a single chain: fetch, then parse, then chunk, then validate, then mark complete. This ensures strict ordering.</p>



<p>Now let us examine what each task function does.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Task-1-Fetch-Pending-Documents"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Task-1-Fetch-Pending-Documents">Task 1: Fetch Pending Documents</a></h3>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Apache Airflow Document Ingestion Pipeline for RAG Systems" data-enlighter-group="9">def fetch_pending_documents(**context) -> List[int]:
    """
    Task 1: Fetch documents that need processing.
    
    Queries for documents whose status is PENDING, writes their IDs to a
    JSON file named after the current run ID, and pushes that file's path
    to XCom under the key 'document_ids_file'.
    
    Args:
        **context: Airflow execution context; 'dag_run' and 'task_instance'
            are read.
    
    Returns list of document IDs to process.
    """
    logger.info("Fetching pending documents...")
    
    with session_scope() as session:
        pending_docs = session.query(DocumentModel).filter(
            DocumentModel.status == DocumentStatus.PENDING
        ).all()
        
        # Keep only the primary keys; later tasks reload the rows they need
        doc_ids = [doc.id for doc in pending_docs]
        logger.info(f"Found {len(doc_ids)} pending documents: {doc_ids}")
        
        # Prefix the file name with the run ID to keep each run's data separate
        run_id = context['dag_run'].run_id
        filepath = write_data_to_file(doc_ids, f'{run_id}_document_ids.json')
        
        # Downstream tasks pull this path instead of the ID list itself
        context['task_instance'].xcom_push(key='document_ids_file', value=filepath)
        
        # NOTE(review): PythonOperator presumably pushes this return value to
        # XCom as well - confirm that is intended for large batches
        return doc_ids</pre>



<p>This function queries the database for all documents where <code data-enlighter-language="python" class="EnlighterJSRAW">status = PENDING</code>. It extracts just the IDs into a list. If there are no pending documents, the list is empty and subsequent tasks have no work to do.</p>



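<p>The interesting part is how we pass data to the next task. Downstream tasks do not read the document IDs from Airflow’s XCom. Instead, we write them to a JSON file and pass only the file path through XCom. Why? Because XCom stores data in the Airflow metadata database. Large payloads slow down the database and can hit size limits. By using files, we keep XCom small and handle arbitrary data sizes. One caveat: the function also returns <code data-enlighter-language="python" class="EnlighterJSRAW">doc_ids</code>, and <code data-enlighter-language="python" class="EnlighterJSRAW">PythonOperator</code> pushes return values to XCom by default, so for very large batches you would drop the return value or set <code data-enlighter-language="python" class="EnlighterJSRAW">do_xcom_push=False</code> on the task.</p>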



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">write_data_to_file()</code> helper function writes JSON to <code data-enlighter-language="python" class="EnlighterJSRAW">/tmp/***_dag_data/</code> and returns the full path. The next task reads from this path.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Task-2-Parse-Documents"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Task-2-Parse-Documents">Task 2: Parse Documents</a></h3>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Apache Airflow Document Ingestion Pipeline for RAG Systems" data-enlighter-group="11">def parse_documents(**context) -> Dict[str, int]:
    """
    Task 2: Parse PDF documents into pages.
    
    Reads document IDs from previous task and parses each PDF.
    """
    logger.info("Parsing documents...")
    
    doc_ids_file = context['task_instance'].xcom_pull(
        key='document_ids_file',
        task_ids='fetch_documents'
    )
    doc_ids = read_data_from_file(doc_ids_file)
    
    parsed_count = 0
    
    with session_scope() as session:
        for doc_id in doc_ids:
            pages_file = TEMP_DIR / f'{run_id}_doc_{doc_id}_pages.json'
            if pages_file.exists():
                logger.info(f"Document {doc_id} already parsed, skipping")
                parsed_count += 1
                continue
            
            doc = session.query(DocumentModel).filter(
                DocumentModel.id == doc_id
            ).first()
            
            if not doc:
                logger.warning(f"Document {doc_id} not found")
                continue
            
            try:
                pages = parse_pdf(doc.file_path)
                logger.info(f"Parsed {len(pages)} pages from {doc.filename}")
                
                pages_file = write_data_to_file(pages, f'{run_id}_doc_{doc_id}_pages.json')
                parsed_count += 1
                
            except Exception as e:
                logger.error(f"Failed to parse document {doc_id}: {str(e)}")
                doc.status = DocumentStatus.FAILED
                session.commit()
    
    logger.info(f"Successfully parsed {parsed_count} documents")
    return {'parsed': parsed_count}</pre>



<p>This task pulls the document IDs from the previous task, loads the document record from the database, and calls <code data-enlighter-language="python" class="EnlighterJSRAW">parse_pdf()</code> on the file path. The <code data-enlighter-language="python" class="EnlighterJSRAW">parse_pdf()</code> function (from <code data-enlighter-language="python" class="EnlighterJSRAW">shared/parsing/pdf_parser.py</code>) uses PyPDF to extract text page by page.</p>



<p>Notice the idempotency check at the top of the loop. If a file named <code data-enlighter-language="python" class="EnlighterJSRAW">{run_id}_doc_{doc_id}_pages.json</code> already exists, we skip parsing. This means if the task retries or reruns, it does not waste time reparsing documents that succeeded before.</p>



<p>The error handling is important. If parsing fails for any reason (corrupted PDF, missing file, permission error), we catch the exception, mark that document as <code data-enlighter-language="python" class="EnlighterJSRAW">FAILED</code>, and continue with the next one. This prevents one bad document from blocking the entire batch.</p>



<p>The parsed pages are written to a file, one file per document. The next task will read these files.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Task-3-Chunk-Documents"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Task-3-Chunk-Documents">Task 3: Chunk Documents</a></h3>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Apache Airflow Document Ingestion Pipeline for RAG Systems" data-enlighter-group="13">def chunk_documents(**context) -> Dict[str, int]:
    """
    Task 3: Chunk parsed pages into text segments.
    
    Reads pages from previous task and creates chunks.
    """
    logger.info("Chunking documents...")
    
    doc_ids_file = context['task_instance'].xcom_pull(
        key='document_ids_file',
        task_ids='fetch_documents'
    )
    doc_ids = read_data_from_file(doc_ids_file)
    
    total_chunks = 0
    
    with session_scope() as session:
        for doc_id in doc_ids:
            run_id = context['dag_run'].run_id
            pages_file = TEMP_DIR / f'{run_id}_doc_{doc_id}_pages.json'
            
            if not pages_file.exists():
                logger.warning(f"No pages file found for document {doc_id}")
                continue
            
            pages = read_data_from_file(str(pages_file))
            
            existing_chunks = session.query(ChunkModel).filter(
                ChunkModel.document_id == doc_id
            ).count()
            
            if existing_chunks > 0:
                logger.info(f"Document {doc_id} already has {existing_chunks} chunks, skipping")
                total_chunks += existing_chunks
                continue
            
            try:
                full_text = "\n\n".join(page['text'] for page in pages)
                chunks = chunk_text(full_text, chunk_size=512, overlap=50)
                
                chunk_index = 0
                for chunk in chunks:
                    chunk_hash = hash_content(chunk)
                    
                    existing_chunk = session.query(ChunkModel).filter(
                        ChunkModel.content_hash == chunk_hash
                    ).first()
                    
                    if existing_chunk:
                        continue
                    
                    chunk_model = ChunkModel(
                        document_id=doc_id,
                        chunk_index=chunk_index,
                        text=chunk,
                        content_hash=chunk_hash,
                        page_number=None,
                        token_count=len(chunk.split())
                    )
                    session.add(chunk_model)
                    chunk_index += 1
                
                session.commit()
                logger.info(f"Created {chunk_index} chunks for document {doc_id}")
                total_chunks += chunk_index
                
            except Exception as e:
                logger.error(f"Failed to chunk document {doc_id}: {str(e)}")
    
    logger.info(f"Total chunks created: {total_chunks}")
    return {'chunks': total_chunks}</pre>



<p>This task joins all pages into a single text string, then calls <code data-enlighter-language="python" class="EnlighterJSRAW">chunk_text()</code> to split it into overlapping segments. The default chunk size is 512 words (we use whitespace-separated words as an approximate proxy for tokens) with 50-word overlap. Think of this like cutting a long rope into segments with intentional overlap at the ends — if an important concept spans a boundary, the overlap ensures it appears fully in at least one segment.</p>



<p>For each chunk, we compute a content hash and check if that exact text already exists in the database. This is duplicate detection at the chunk level. If the exact same chunk text appears in multiple documents, we store it once. This saves storage and embedding compute later.</p>



<p>Notice we track <code data-enlighter-language="python" class="EnlighterJSRAW">chunk_index</code> to maintain ordering within a document. This is important for reconstruction or citation purposes.</p>



<p>The task again has idempotency checks. If the document already has chunks in the database, we skip it. This lets us safely retry the task.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Task-4-Validate-Chunks"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Task-4-Validate-Chunks">Task 4: Validate Chunks</a></h3>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Apache Airflow Document Ingestion Pipeline for RAG Systems" data-enlighter-group="15">def validate_chunks(**context) -> Dict[str, int]:
    """
    Task 4: Validate chunk quality.
    
    Checks for empty chunks, excessive length, etc.
    """
    logger.info("Validating chunks...")
    
    doc_ids_file = context['task_instance'].xcom_pull(
        key='document_ids_file',
        task_ids='fetch_documents'
    )
    doc_ids = read_data_from_file(doc_ids_file)
    
    valid_count = 0
    invalid_count = 0
    
    with session_scope() as session:
        for doc_id in doc_ids:
            chunks = session.query(ChunkModel).filter(
                ChunkModel.document_id == doc_id
            ).all()
            
            for chunk in chunks:
                # Too short
                if len(chunk.text) &lt; 50:
                    logger.warning(f"Chunk {chunk.id} too short: {len(chunk.text)} chars")
                    invalid_count += 1
                    continue
                
                # Too long
                if len(chunk.text) > 2000:
                    logger.warning(f"Chunk {chunk.id} too long: {len(chunk.text)} chars")
                    invalid_count += 1
                    continue
                
                # Empty or whitespace only
                if not chunk.text.strip():
                    logger.warning(f"Chunk {chunk.id} is empty or whitespace only")
                    invalid_count += 1
                    continue
                
                valid_count += 1
    
    logger.info(f"Validation complete: {valid_count} valid, {invalid_count} invalid chunks")
    return {'valid': valid_count, 'invalid': invalid_count}</pre>



<p>This task performs quality checks on chunks. It checks for chunks that are too short (less than <code data-enlighter-language="python" class="EnlighterJSRAW">50</code> characters), too long (more than <code data-enlighter-language="python" class="EnlighterJSRAW">2000</code> characters), or empty. In production, you might delete invalid chunks or mark them in a separate table. Here, we just log warnings.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Task-5-Mark-Complete"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Task-5-Mark-Complete">Task 5: Mark Complete</a></h3>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Apache Airflow Document Ingestion Pipeline for RAG Systems" data-enlighter-group="17">def mark_documents_complete(**context) -> Dict[str, int]:
    """
    Task 5: Mark documents as complete.
    
    Updates document status and creates pipeline run record.
    
    Args:
        **context: Airflow execution context; 'task_instance', 'dag_run',
            'dag' and 'execution_date' are read.
    
    Returns:
        {'documents_completed': n}, where n is the number of document IDs
        in the batch, whether or not their status was changed.
    """
    logger.info("Marking documents complete...")
    
    doc_ids_file = context['task_instance'].xcom_pull(
        key='document_ids_file',
        task_ids='fetch_documents'
    )
    doc_ids = read_data_from_file(doc_ids_file)
    
    # Pull the value returned by the chunking task to get its chunk total
    chunks_result = context['task_instance'].xcom_pull(task_ids='chunk_documents')
    total_chunks = chunks_result.get('chunks', 0)
    
    with session_scope() as session:
        for doc_id in doc_ids:
            doc = session.query(DocumentModel).filter(
                DocumentModel.id == doc_id
            ).first()
            
            # NOTE(review): only PROCESSING documents are promoted, and none
            # of the task functions shown with this one sets that status -
            # confirm where documents move from PENDING to PROCESSING
            if doc and doc.status == DocumentStatus.PROCESSING:
                doc.status = DocumentStatus.COMPLETED
        
        # Record one summary row for this DAG run
        run_id = context['dag_run'].run_id
        pipeline_run = PipelineRunModel(
            pipeline_type='airflow',
            run_id=run_id,
            status=PipelineRunStatus.COMPLETED,
            started_at=context['dag_run'].start_date,
            completed_at=datetime.utcnow(),
            # Counts every ID in the batch, including skipped or failed ones
            documents_processed=len(doc_ids),
            chunks_created=total_chunks,
            embeddings_created=0,
            run_metadata={
                'dag_id': context['dag'].dag_id,
                'execution_date': str(context['execution_date'])
            }
        )
        
        # The status updates above and the run record share one commit
        session.add(pipeline_run)
        session.commit()
        
        logger.info(f"Pipeline run {run_id} completed: {len(doc_ids)} docs, {total_chunks} chunks")
    
    return {'documents_completed': len(doc_ids)}</pre>



<p>The final task updates document status to <code data-enlighter-language="python" class="EnlighterJSRAW">COMPLETED</code> and creates a <code data-enlighter-language="python" class="EnlighterJSRAW">PipelineRunModel</code> record. This record captures metrics about the entire DAG run. Later, you can query this table to track throughput, find bottlenecks, or generate reports.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Why-This-DAG-Structure-Works"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Why-This-DAG-Structure-Works">Why This DAG Structure Works</a></h3>



<p>This 5-task structure enforces 4 critical principles. </p>



<p>First, each task has a single responsibility. Fetch finds work. Parse extracts text. Chunk splits text. Validate checks quality. Mark complete updates status. This makes debugging easier. If chunking fails, you know exactly which task to inspect.</p>



<p>Second, each task is idempotent. You can retry tasks without creating duplicate data or corrupting state. This is essential for reliability.</p>



<p>Third, we have observability at every step. Each task logs its progress. You can see exactly how many documents were parsed, how many chunks were created, and which documents failed.</p>



<p>Fourth, failure handling is granular. The pipeline is designed to continue processing other documents when individual documents fail, rather than aborting the entire batch. We catch exceptions at the document level, mark failed documents with <code data-enlighter-language="python" class="EnlighterJSRAW">FAILED</code> status, and let the task continue with the remaining documents.</p>



<p>In an upcoming lesson, we will implement the shared parsing and chunking logic and see how these tasks operate on real documents end to end.</p>



<hr class="wp-block-separator has-alpha-channel-opacity"/>



<div id="pitch" style="padding: 40px; width: 100%; background-color: #F4F6FA;">
	<h3>What's next? We recommend <a target="_blank" href="https://pyimagesearch.com/pyimagesearch-university/?utm_source=blogPost&utm_medium=bottomBanner&utm_campaign=What%27s%20next%3F%20I%20recommend">PyImageSearch University</a>.</h3>

	<script src="https://fast.wistia.com/embed/medias/kno0cmko2z.jsonp" async></script><script src="https://fast.wistia.com/assets/external/E-v1.js" async></script><div class="wistia_responsive_padding" style="padding:56.25% 0 0 0;position:relative;"><div class="wistia_responsive_wrapper" style="height:100%;left:0;position:absolute;top:0;width:100%;"><div class="wistia_embed wistia_async_kno0cmko2z videoFoam=true" style="height:100%;position:relative;width:100%"><div class="wistia_swatch" style="height:100%;left:0;opacity:0;overflow:hidden;position:absolute;top:0;transition:opacity 200ms;width:100%;"><img decoding="async" src="https://fast.wistia.com/embed/medias/kno0cmko2z/swatch" style="filter:blur(5px);height:100%;object-fit:contain;width:100%;" alt="" aria-hidden="true" onload="this.parentNode.style.opacity=1;" /></div></div></div></div>

	<div style="margin-top: 32px; margin-bottom: 32px; ">
		<strong>Course information:</strong><br/>
		86+ total classes • 115+ hours of on-demand code walkthrough videos • Last updated: June 2026<br/>
		<span style="color: #169FE6;">★★★★★</span> 4.84 (128 Ratings) • 16,000+ Students Enrolled
	</div>

	<p><strong>I strongly believe that if you had the right teacher you could <em>master</em> computer vision and deep learning.</strong></p>

	<p>Do you think learning computer vision and deep learning has to be time-consuming, overwhelming, and complicated? Or has to involve complex mathematics and equations? Or requires a degree in computer science?</p>

	<p>That’s <em>not</em> the case.</p>

	<p>All you need to master computer vision and deep learning is for someone to explain things to you in <em>simple, intuitive</em> terms. <em>And that’s exactly what I do</em>. My mission is to change education and how complex Artificial Intelligence topics are taught.</p>

	<p>If you're serious about learning computer vision, your next stop should be PyImageSearch University, the most comprehensive computer vision, deep learning, and OpenCV course online today. Here you’ll learn how to <em>successfully</em> and <em>confidently</em> apply computer vision to your work, research, and projects. Join me in computer vision mastery.</p>

	<p><strong>Inside PyImageSearch University you'll find:</strong></p>

	<ul style="margin-left: 0px;">
		<li style="list-style: none;">&check; <strong>86+ courses</strong> on essential computer vision, deep learning, and OpenCV topics</li>
		<li style="list-style: none;">&check; <strong>86 Certificates</strong> of Completion</li>
		<li style="list-style: none;">&check; <strong>115+ hours hours</strong> of on-demand video</li>
		<li style="list-style: none;">&check; <strong>Brand new courses released <em>regularly</em></strong>, ensuring you can keep up with state-of-the-art techniques</li>
		<li style="list-style: none;">&check; <strong>Pre-configured Jupyter Notebooks in Google Colab</strong></li>
		<li style="list-style: none;">&check; Run all code examples in your web browser — works on Windows, macOS, and Linux (no dev environment configuration required!)</li>
		<li style="list-style: none;">&check; Access to <strong>centralized code repos for <em>all</em> 540+ tutorials</strong> on PyImageSearch</li>
		<li style="list-style: none;">&check; <strong> Easy one-click downloads</strong> for code, datasets, pre-trained models, etc.</li>
		<li style="list-style: none;">&check; <strong>Access</strong> on mobile, laptop, desktop, etc.</li>
	</ul>

	<p style="text-align: center;">
		<a target="_blank" class="button link" href="https://pyimagesearch.com/pyimagesearch-university/?utm_source=blogPost&utm_medium=bottomBanner&utm_campaign=What%27s%20next%3F%20I%20recommend" style="background-color: #6DC713; border-bottom: none;">Click here to join PyImageSearch University</a>
	</p>
</div>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Summary"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Summary">Summary</a></h2>



<p>In this lesson, you built the foundation of a production-grade document ingestion pipeline using Apache Airflow. You learned how to design a FastAPI service for uploading PDF documents with built-in deduplication, how to model ingestion state in PostgreSQL, and how to define a reliable Airflow DAG to orchestrate document processing.</p>



<p>You saw how to separate ingestion from processing, use content hashing for idempotency, and construct a task graph that represents each stage of the pipeline. By the end of this part, you had a complete orchestration design for moving documents from raw uploads into a scheduled workflow.</p>



<p>This architecture forms the control plane of your ingestion pipeline and prepares you to implement the parsing and chunking logic in the next part.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Citation-Information"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Citation-Information">Citation Information</a></h3>



<p><strong>Singh, V. </strong>“Apache Airflow Document Ingestion Pipeline for RAG Systems,” <em>PyImageSearch</em>, S. Huot, A. Sharma, and P. Thakur, eds., 2026, <a href="https://pyimg.co/8b2ey" target="_blank" rel="noreferrer noopener">https://pyimg.co/8b2ey</a></p>



<pre class="EnlighterJSRAW" data-enlighter-language="raw" data-enlighter-theme="classic" data-enlighter-highlight="" data-enlighter-linenumbers="false" data-enlighter-lineoffset="" data-enlighter-title="Apache Airflow Document Ingestion Pipeline for RAG Systems" data-enlighter-group="19">@incollection{Singh_2026_apache-airflow-document-ingestion-pipeline-rag-systems,
  author = {Vikram Singh},
  title = {{Apache Airflow Document Ingestion Pipeline for RAG Systems}},
  booktitle = {PyImageSearch},
  editor = {Susan Huot and Aditya Sharma and Piyush Thakur},
  year = {2026},
  url = {https://pyimg.co/8b2ey},
}
</pre>



<hr class="wp-block-separator has-alpha-channel-opacity"/>



<p><strong>To download the source code to this post (and be notified when future tutorials are published here on PyImageSearch), </strong><em><strong>simply enter your email address in the form below!</strong></em></p>



<div id="download-the-code" class="post-cta-wrap">
<div class="gpd-post-cta">
	<div class="gpd-post-cta-content">
		

			<div class="gpd-post-cta-top">
				<div class="gpd-post-cta-top-image"><img decoding="async" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?lossy=2&strip=1&webp=1" alt="" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?lossy=2&strip=1&webp=1 410w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?size=126x174&lossy=2&strip=1&webp=1 126w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?size=252x348&lossy=2&strip=1&webp=1 252w" sizes="(max-width: 410px) 100vw, 410px" /></div>
				
				<div class="gpd-post-cta-top-title"><h4>Download the Source Code and FREE 17-page Resource Guide</h4></div>
				<div class="gpd-post-cta-top-desc"><p>Enter your email address below to get a .zip of the code and a <strong>FREE 17-page Resource Guide on Computer Vision, OpenCV, and Deep Learning.</strong> Inside you'll find my hand-picked tutorials, books, courses, and libraries to help you master CV and DL!</p></div>


			</div>

			<div class="gpd-post-cta-bottom">
				<form id="footer-cta-code" class="footer-cta" action="https://www.getdrip.com/forms/4130035/submissions" method="post" target="_blank" data-drip-embedded-form="4130035">
					<input name="fields[email]" type="email" value="" placeholder="Your email address" class="form-control" />

					<button type="submit">Download the code!</button>

					<div style="display: none;" aria-hidden="true"><label for="website">Website</label><br /><input type="text" id="website" name="website" tabindex="-1" autocomplete="false" value="" /></div>
				</form>
			</div>


		
	</div>

</div>
</div>
<p>The post <a rel="nofollow" href="https://pyimagesearch.com/2026/06/01/apache-airflow-document-ingestion-pipeline-for-rag-systems/">Apache Airflow Document Ingestion Pipeline for RAG Systems</a> appeared first on <a rel="nofollow" href="https://pyimagesearch.com">PyImageSearch</a>.</p>
]]></content:encoded>
					
		
		
			</item>
		<item>
		<title>Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)</title>
		<link>https://pyimagesearch.com/2026/05/25/manual-tracing-scores-and-evaluation-with-langfuse-self-hosted/</link>
		
		<dc:creator><![CDATA[Vikram Singh]]></dc:creator>
		<pubDate>Mon, 25 May 2026 12:45:00 +0000</pubDate>
				<category><![CDATA[Generative AI]]></category>
		<category><![CDATA[Langfuse]]></category>
		<category><![CDATA[LLMOps]]></category>
		<category><![CDATA[MLOps]]></category>
		<category><![CDATA[Monitoring]]></category>
		<category><![CDATA[Observability]]></category>
		<category><![CDATA[Tutorial]]></category>
		<category><![CDATA[agent observability]]></category>
		<category><![CDATA[evaluation metrics]]></category>
		<category><![CDATA[langfuse]]></category>
		<category><![CDATA[langfuse tracing]]></category>
		<category><![CDATA[latency monitoring]]></category>
		<category><![CDATA[llm diagnostics]]></category>
		<category><![CDATA[llm evaluation]]></category>
		<category><![CDATA[llm evaluation metrics]]></category>
		<category><![CDATA[llm monitoring]]></category>
		<category><![CDATA[llm observability]]></category>
		<category><![CDATA[manual tracing]]></category>
		<category><![CDATA[observability dashboard]]></category>
		<category><![CDATA[openai compatible api]]></category>
		<category><![CDATA[quality scoring]]></category>
		<category><![CDATA[rag observability]]></category>
		<category><![CDATA[self-hosted langfuse]]></category>
		<category><![CDATA[token usage tracking]]></category>
		<category><![CDATA[tracing pipelines]]></category>
		<category><![CDATA[tracing spans]]></category>
		<category><![CDATA[tutorial]]></category>
		<category><![CDATA[vllm]]></category>
		<guid isPermaLink="false">https://pyimagesearch.com/?p=53941</guid>

					<description><![CDATA[<p>Table of Contents Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted) Why Manual Tracing Matters for LLM Observability Decorator vs Manual Tracing: When to Use Which Manual Tracing with the Langfuse Low-Level API Why Manual Tracing Matters (Even If You&#8230;</p>
<p>The post <a rel="nofollow" href="https://pyimagesearch.com/2026/05/25/manual-tracing-scores-and-evaluation-with-langfuse-self-hosted/">Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)</a> appeared first on <a rel="nofollow" href="https://pyimagesearch.com">PyImageSearch</a>.</p>
]]></description>
										<content:encoded><![CDATA[<div class="yoast-breadcrumbs"><span><span><a href="https://pyimagesearch.com/">Home</a></span></span></div>


<div class="toc">
<hr class="TOC"/>
<p class="has-large-font-size"><strong>Table of Contents</strong></p>
<ul>
    <li id="TOC-h1-Manual-Tracing-Scores-Evaluation-Langfuse-Self-Hosted"><a rel="noopener" target="_blank" href="#h1-Manual-Tracing-Scores-Evaluation-Langfuse-Self-Hosted">Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)</a></li>

    <li id="TOC-h2-Why-Manual-Tracing-Matters-LLM-Observability"><a rel="noopener" target="_blank" href="#h2-Why-Manual-Tracing-Matters-LLM-Observability">Why Manual Tracing Matters for LLM Observability</a></li>

    <li id="TOC-h2-Decorator-vs-Manual-Tracing-When-Use-Which"><a rel="noopener" target="_blank" href="#h2-Decorator-vs-Manual-Tracing-When-Use-Which">Decorator vs Manual Tracing: When to Use Which</a></li>

    <li id="TOC-h2-Manual-Tracing-Langfuse-Low-Level-API"><a rel="noopener" target="_blank" href="#h2-Manual-Tracing-Langfuse-Low-Level-API">Manual Tracing with the Langfuse Low-Level API</a></li>
    <ul>
        <li id="TOC-h3-Why-Manual-Tracing-Matters-Even-If-Use-Decorators"><a rel="noopener" target="_blank" href="#h3-Why-Manual-Tracing-Matters-Even-If-Use-Decorators">Why Manual Tracing Matters (Even If You Use Decorators)</a></li>
        <li id="TOC-h3-Full-Manual-Tracing-Implementation-Langfuse"><a rel="noopener" target="_blank" href="#h3-Full-Manual-Tracing-Implementation-Langfuse">Full Manual Tracing Implementation with Langfuse</a></li>
        <li id="TOC-h3-Code-Walkthrough-Langfuse-Manual-Tracing-Pipeline"><a rel="noopener" target="_blank" href="#h3-Code-Walkthrough-Langfuse-Manual-Tracing-Pipeline">Code Walkthrough: Langfuse Manual Tracing Pipeline</a></li>
        <li id="TOC-h3-Creating-Manual-Traces-Langfuse"><a rel="noopener" target="_blank" href="#h3-Creating-Manual-Traces-Langfuse">Creating Manual Traces in Langfuse</a></li>
        <li id="TOC-h3-Running-Langfuse-Manual-Tracing-Script"><a rel="noopener" target="_blank" href="#h3-Running-Langfuse-Manual-Tracing-Script">Running the Langfuse Manual Tracing Script</a></li>
        <li id="TOC-h3-Viewing-Manual-Traces-Langfuse-Dashboard"><a rel="noopener" target="_blank" href="#h3-Viewing-Manual-Traces-Langfuse-Dashboard">Viewing Manual Traces in the Langfuse Dashboard</a></li>
        <li id="TOC-h3-Manual-vs-Decorator-Tracing-Langfuse"><a rel="noopener" target="_blank" href="#h3-Manual-vs-Decorator-Tracing-Langfuse">Manual vs Decorator Tracing in Langfuse</a></li>
    </ul>

    <li id="TOC-h2-LLM-Evaluation-Metrics-Quality-Scoring-Langfuse"><a rel="noopener" target="_blank" href="#h2-LLM-Evaluation-Metrics-Quality-Scoring-Langfuse">LLM Evaluation Metrics and Quality Scoring with Langfuse</a></li>
    <ul>
        <li id="TOC-h3-Adding-LLM-Evaluation-Metrics-Beyond-Manual-Tracing"><a rel="noopener" target="_blank" href="#h3-Adding-LLM-Evaluation-Metrics-Beyond-Manual-Tracing">Adding LLM Evaluation Metrics Beyond Manual Tracing</a></li>
        <li id="TOC-h3-Code-Walkthrough-evaluation-metrics-py"><a rel="noopener" target="_blank" href="#h3-Code-Walkthrough-evaluation-metrics-py">Code Walkthrough: evaluation_metrics.py</a></li>
        <li id="TOC-h3-Running-LLM-Evaluation-Metrics-Pipeline"><a rel="noopener" target="_blank" href="#h3-Running-LLM-Evaluation-Metrics-Pipeline">Running the LLM Evaluation Metrics Pipeline</a></li>
        <li id="TOC-h3-Conceptual-Mockup-Evaluation-Trace-Langfuse"><a rel="noopener" target="_blank" href="#h3-Conceptual-Mockup-Evaluation-Trace-Langfuse">Conceptual Mockup: Evaluation Trace in Langfuse</a></li>
        <li id="TOC-h3-Real-Trace-Self-Hosted-Langfuse-Dashboard"><a rel="noopener" target="_blank" href="#h3-Real-Trace-Self-Hosted-Langfuse-Dashboard">Real Trace from Our Self-Hosted Langfuse Dashboard</a></li>
        <li id="TOC-h3-Why-LLM-Evaluation-Metrics-Matter"><a rel="noopener" target="_blank" href="#h3-Why-LLM-Evaluation-Metrics-Matter">Why LLM Evaluation Metrics Matter</a></li>
    </ul>

    <li id="TOC-h2-vLLM-Diagnostics-Health-Checks-LLM-Observability"><a rel="noopener" target="_blank" href="#h2-vLLM-Diagnostics-Health-Checks-LLM-Observability">vLLM Diagnostics and Health Checks for LLM Observability</a></li>
    <ul>
        <li id="TOC-h3-What-vLLM-Health-Check-Script-Validates"><a rel="noopener" target="_blank" href="#h3-What-vLLM-Health-Check-Script-Validates">What the vLLM Health Check Script Validates</a></li>
        <li id="TOC-h3-Code-Walkthrough-health-check-py"><a rel="noopener" target="_blank" href="#h3-Code-Walkthrough-health-check-py">Code Walkthrough: health_check.py</a></li>
        <li id="TOC-h3-Why-vLLM-Health-Checks-Matter-LLM-Observability"><a rel="noopener" target="_blank" href="#h3-Why-vLLM-Health-Checks-Matter-LLM-Observability">Why vLLM Health Checks Matter for LLM Observability</a></li>
    </ul>

    <li id="TOC-h2-Summary"><a rel="noopener" target="_blank" href="#h2-Summary">Summary</a></li>
    <ul>
        <li id="TOC-h3-Citation-Information"><a rel="noopener" target="_blank" href="#h3-Citation-Information">Citation Information</a></li>
    </ul>
</ul>
</div>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h1-Manual-Tracing-Scores-Evaluation-Langfuse-Self-Hosted"/>



<h2 class="wp-block-heading"><a href="#TOC-h1-Manual-Tracing-Scores-Evaluation-Langfuse-Self-Hosted">Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)</a></h2>



<p>In this lesson, you will learn how to take full control of LLM observability using the <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> manual tracing API. While Lesson 1 demonstrated the benefits of decorator-based tracing, real-world LLM systems often require deeper visibility. This includes custom spans, step-level metadata, evaluation scores, and multi-stage inspection for RAG pipelines and agent workflows. </p>


<div class="wp-block-image">
<figure class="aligncenter size-full"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/manual-tracing-scores-evaluation-langfuse-self-hosted-featured.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="940" height="780" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/manual-tracing-scores-evaluation-langfuse-self-hosted-featured.png?lossy=2&strip=1&webp=1" alt="manual-tracing-scores-evaluation-langfuse-self-hosted-featured.png" class="wp-image-53961" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/manual-tracing-scores-evaluation-langfuse-self-hosted-featured.png?size=126x105&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/manual-tracing-scores-evaluation-langfuse-self-hosted-featured-300x249.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/manual-tracing-scores-evaluation-langfuse-self-hosted-featured.png?size=378x314&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/manual-tracing-scores-evaluation-langfuse-self-hosted-featured.png?size=504x418&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/manual-tracing-scores-evaluation-langfuse-self-hosted-featured.png?size=630x523&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/manual-tracing-scores-evaluation-langfuse-self-hosted-featured-768x637.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/manual-tracing-scores-evaluation-langfuse-self-hosted-featured.png?lossy=2&amp;strip=1&amp;webp=1 940w" sizes="(max-width: 630px) 100vw, 630px" /></a></figure></div>


<p>In this lesson, you will build a fully instrumented pipeline where every step, every decision, and every model output is recorded with precision inside your self-hosted <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> dashboard.</p>



<p>This lesson is the 2nd in a 3-part series on <strong>LLM observability with Langfuse</strong>:</p>



<ol class="wp-block-list">
<li><em><strong><a href="https://pyimg.co/tadoh" target="_blank" rel="noreferrer noopener">LLM Observability with Self-Hosted Langfuse and vLLM</a></strong></em></li>



<li><em><strong><a href="https://pyimg.co/24p06" target="_blank" rel="noreferrer noopener">Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)</a></strong></em><strong> (this tutorial)</strong></li>



<li><em>Lesson 3</em></li>
</ol>



<p><strong>To learn how to build manual traces, attach custom spans, and evaluate LLM outputs with scoring metadata, </strong><em><strong>just keep reading.</strong></em></p>



<div id="pyi-source-code-block" class="source-code-wrap"><div class="gpd-source-code">
    <div class="gpd-source-code-content">
        <img decoding="async" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/source-code-icon.png?lossy=2&strip=1&webp=1" alt="">
        <h4>Looking for the source code to this post?</h4>
                    <a href="#download-the-code" class="pyis-cta-modal-open-modal">Jump Right To The Downloads Section <svg class="svg-icon arrow-right" width="12" height="12" aria-hidden="true" role="img" focusable="false" viewBox="0 0 14 14" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M6.8125 0.1875C6.875 0.125 6.96875 0.09375 7.09375 0.09375C7.1875 0.09375 7.28125 0.125 7.34375 0.1875L13.875 6.75C13.9375 6.8125 14 6.90625 14 7C14 7.125 13.9375 7.1875 13.875 7.25L7.34375 13.8125C7.28125 13.875 7.1875 13.9062 7.09375 13.9062C6.96875 13.9062 6.875 13.875 6.8125 13.8125L6.1875 13.1875C6.125 13.125 6.09375 13.0625 6.09375 12.9375C6.09375 12.8438 6.125 12.75 6.1875 12.6562L11.0312 7.8125H0.375C0.25 7.8125 0.15625 7.78125 0.09375 7.71875C0.03125 7.65625 0 7.5625 0 7.4375V6.5625C0 6.46875 0.03125 6.375 0.09375 6.3125C0.15625 6.25 0.25 6.1875 0.375 6.1875H11.0312L6.1875 1.34375C6.125 1.28125 6.09375 1.1875 6.09375 1.0625C6.09375 0.96875 6.125 0.875 6.1875 0.8125L6.8125 0.1875Z" fill="#169FE6"></path></svg></a>
            </div>
</div>
</div>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Why-Manual-Tracing-Matters-LLM-Observability"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Why-Manual-Tracing-Matters-LLM-Observability">Why Manual Tracing Matters for LLM Observability</a></h2>



<p>In Lesson 1, we built the foundations of LLM observability with a fully self-hosted <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> stack, a local <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> server, and a complete decorator-based tracing pipeline. With just a few <code data-enlighter-language="python" class="EnlighterJSRAW">@observe</code> decorators, we captured prompts, outputs, latency, token usage, and nested spans, all visualized instantly in the <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> dashboard. That approach was simple, powerful, and ideal for most LLM applications.</p>



<p>However, real production systems require more control than a decorator can provide.</p>



<p>Decorator-based tracing works well when function boundaries align with observability boundaries. Once a pipeline becomes dynamic, for example by involving multiple retrieval steps, conditional branches, tool calls, retries, validations, re-ranking, scoring, or multi-agent planning, you must explicitly decide what gets traced, how traces are grouped, and what metadata is recorded at each stage. In these scenarios, manual tracing becomes essential.</p>



<p>Manual tracing allows you to open and close spans at will, attach arbitrary metadata, log intermediate states, record evaluation scores, and capture execution steps that do not live inside a function, including loops, conditionals, streaming tokens, or retry logic. In short, decorator tracing provides automation, while manual tracing provides precision.</p>



<p>This lesson shows you how to construct traces explicitly, starting from creating the root trace and continuing through building child spans and attaching fine-grained metadata and custom evaluation signals. You will also integrate <code data-enlighter-language="python" class="EnlighterJSRAW">evaluation_metrics.py</code>, which introduces lightweight scoring for model generations. This makes it possible to track correctness, response length, latency thresholds, or any domain-specific metric directly inside <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> as structured metadata.</p>



<p>By the end of this section, you will understand not only why manual tracing matters, but also when it becomes indispensable. Common use cases include debugging RAG pipelines, analyzing retrieval failures, tracking hallucination hotspots, validating agent actions, and building complex multi-step LLM systems where you need complete visibility into what happened and why.</p>



<p>If you are ready to take full control of your observability pipeline, including manual spans and rich evaluation metadata, the following sections will guide you through the process step by step.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Decorator-vs-Manual-Tracing-When-Use-Which"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Decorator-vs-Manual-Tracing-When-Use-Which">Decorator vs Manual Tracing: When to Use Which</a></h2>



<p>In Lesson 1, the <code data-enlighter-language="python" class="EnlighterJSRAW">@observe</code> decorator gave us an elegant and almost magical tracing experience. You wrapped a function, ran your pipeline, and <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> automatically produced a structured trace with child spans, latency, token usage, and full metadata. This approach works well when your application is composed of clean, well-defined functions, such as a simple “generate answer” pipeline or a single LLM call with minimal branching.</p>



<p>However, decorators have an important limitation. They observe function boundaries, not logic boundaries.</p>



<p>If your real pipeline involves conditional flows, loops, retries, branching, retrieval, ranking, tool invocation, or agent-style decision-making, tracing only the outer function hides much of the interesting behavior. The decorator cannot see inside reasoning steps, iterative refinements, or internal calls unless those steps are wrapped in separate functions. As systems become more dynamic and non-linear, decorator-based tracing begins to fall short.</p>



<p>This is where manual tracing becomes essential.</p>



<p>Manual spans allow you to mark exactly where a step begins and ends, even when that step is not a function. You can record intermediate artifacts such as retrieved documents, scoring signals, latency thresholds, or model reasoning stages. You can attach custom metadata to any span and build a detailed step-by-step view of how your LLM pipeline behaves, rather than only seeing which functions were invoked.</p>



<p>In practice, the most effective approach is hybrid. Use decorators for high-level structure, and use manual spans when precision is required.</p>



<p>This lesson focuses on building that precision.</p>


<div class="wp-block-image">
<figure class="aligncenter size-full"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-69.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="624" height="340" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-69.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53963" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-69.png?size=126x69&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-69-300x163.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-69.png?size=378x206&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-69.png?size=504x275&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-69.png?lossy=2&amp;strip=1&amp;webp=1 624w" sizes="(max-width: 624px) 100vw, 624px" /></a><figcaption class="wp-element-caption"><strong>Figure 1:</strong> Decorators trace function calls; manual spans trace logic. Together, they give you complete control over LLM observability.</figcaption></figure></div>


<hr class="wp-block-separator has-alpha-channel-opacity"/>



<p>Would you like immediate access to 3,457 images curated and labeled with hand gestures to train, explore, and experiment with &#8230; for free? Head over to <a href="https://universe.roboflow.com/isl/az-6mqow?ref=pyimagesearch" target="_blank" rel="noreferrer noopener">Roboflow</a> and get a free account to grab these hand gesture images. </p>



<hr class="wp-block-separator has-alpha-channel-opacity"/>



<!-- wp:paragraph -->
<h3>Need Help Configuring Your Development Environment?</h3>
<!-- /wp:paragraph -->

<!-- wp:image {"align":"center","id":18137,"sizeSlug":"large","linkDestination":"custom"} -->
<figure class="wp-block-image aligncenter size-large"><a href="https://pyimagesearch.com/pyimagesearch-university/" target="_blank" rel="noreferrer noopener"><img decoding="async" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-18137" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?lossy=2&strip=1&webp=1 500w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?size=126x84&lossy=2&strip=1&webp=1 126w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?size=252x168&lossy=2&strip=1&webp=1 252w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?size=378x253&lossy=2&strip=1&webp=1 378w" sizes="(max-width: 500px) 100vw, 500px" /></a><figcaption>Having trouble configuring your development environment? Want access to pre-configured Jupyter Notebooks running on Google Colab? Be sure to join <a href="https://pyimagesearch.com/pyimagesearch-university/" target="_blank" rel="noreferrer noopener" aria-label=" (opens in a new tab)">PyImageSearch University</a> — you will be up and running with this tutorial in a matter of minutes. </figcaption></figure>
<!-- /wp:image -->

<!-- wp:paragraph -->
<p>All that said, are you:</p>
<!-- /wp:paragraph -->

<!-- wp:list -->
<ul><li>Short on time?</li><li>Learning on your employer’s administratively locked system?</li><li>Wanting to skip the hassle of fighting with the command line, package managers, and virtual environments?</li><li><strong>Ready to run the code immediately on your Windows, macOS, or Linux system?</strong></li></ul>
<!-- /wp:list -->

<!-- wp:paragraph -->
<p>Then join <a href="https://pyimagesearch.com/pyimagesearch-university/" target="_blank">PyImageSearch University</a> today!</p>
<!-- /wp:paragraph -->

<!-- wp:paragraph -->
<p><strong>Gain access to Jupyter Notebooks for this tutorial and other PyImageSearch guides pre-configured to run on Google Colab’s ecosystem right in your web browser!</strong> No installation required.</p>
<!-- /wp:paragraph -->

<!-- wp:paragraph -->
<p>And best of all, these Jupyter Notebooks will run on Windows, macOS, and Linux!</p>
<!-- /wp:paragraph -->



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Manual-Tracing-Langfuse-Low-Level-API"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Manual-Tracing-Langfuse-Low-Level-API">Manual Tracing with the Langfuse Low-Level API</a></h2>



<p>In Lesson 1, you used <code data-enlighter-language="python" class="EnlighterJSRAW">@observe</code> decorators to add observability with almost no effort: just annotate your functions and <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> automatically created traces, spans, usage metadata, and latency metrics.</p>



<p>In this lesson, we take the opposite approach: full manual control.</p>



<p>Manual tracing exposes the entire underlying API used by <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> itself. You decide:</p>



<ul class="wp-block-list">
<li>when traces are created</li>



<li>how spans relate to each other</li>



<li>what metadata you attach</li>



<li>how token usage is recorded</li>



<li>how latencies are measured</li>



<li>how deeply nested your pipeline becomes</li>
</ul>



<p>This approach is critical for advanced LLM workflows where decorators are either too restrictive or too magical.</p>



<p>You will see exactly how <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> stores a trace internally, and why this skill becomes essential when building complex RAG, evaluation, or multi-agent systems.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Why-Manual-Tracing-Matters-Even-If-Use-Decorators"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Why-Manual-Tracing-Matters-Even-If-Use-Decorators">Why Manual Tracing Matters (Even If You Use Decorators)</a></h3>



<p>The decorator API is elegant but sometimes too simple.</p>



<p>Manual tracing is required when you need:</p>



<ul class="wp-block-list">
<li><strong>Full control over trace structure:</strong> Define parent → child → subchild relationships explicitly.</li>



<li><strong>Dynamic spans:</strong> When you do not know upfront how many steps your pipeline will generate.</li>



<li><strong>Conditional traces:</strong> For example, log only the LLM calls whose latency exceeds 2 seconds.</li>



<li><strong>Custom metadata injection:</strong> Dynamic context, retrieval sources, ranking scores, chain-of-thought summaries, etc.</li>



<li><strong>Advanced RAG + agent observability:</strong> Where each tool call needs explicit naming and structure.</li>
</ul>



<p>In short:</p>



<p>The decorator API is the convenience layer.</p>



<p>Manual tracing is the power-user layer.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Full-Manual-Tracing-Implementation-Langfuse"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Full-Manual-Tracing-Implementation-Langfuse">Full Manual Tracing Implementation with Langfuse</a></h3>



<p>Below is your complete script, <code data-enlighter-language="python" class="EnlighterJSRAW">src/tracing_manual.py</code>, unmodified and shown entirely so readers can reference it line-by-line.</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)" data-enlighter-group="1">"""
Manual Tracing with Low-Level Langfuse API

Shows explicit trace creation and management using Langfuse SDK directly.
This gives you full control but requires more code compared to decorators.
"""

from langfuse import Langfuse
from llm_utils import get_llm_client
from config import get_llm_config
import time

# Initialize Langfuse client
langfuse = Langfuse()

# Initialize vLLM client
client, model = get_llm_client(load_model_from_config=True)

# Get configuration
llm_config = get_llm_config()
temperature = llm_config.get("temperature", 0.7)
max_tokens = llm_config.get("max_tokens", 300)


def generate_with_manual_tracing(question: str) -> str:
    """
    Generate answer WITH manual trace creation.
   
    This gives you full control over every trace property:
    - Custom trace names and IDs
    - Granular span creation
    - Manual token counting
    - Custom metadata
    """
   
    print("Calling LLM with manual tracing...")
   
    # 1. Create trace manually
    trace = langfuse.trace(
        name="manual_llm_call",
        metadata={"method": "manual", "question": question}
    )
   
    # 2. Create span for LLM generation
    start_time = time.time()
   
    generation = trace.generation(
        name="llm_generation",
        model=model,
        input=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": question}
        ],
        metadata={
            "temperature": temperature,
            "max_tokens": max_tokens
        }
    )
   
    # 3. Make the actual LLM call
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": question}
        ],
        temperature=temperature,
        max_tokens=max_tokens
    )
   
    latency_ms = (time.time() - start_time) * 1000
    answer = response.choices[0].message.content
   
    # 4. Update generation with results
    generation.update(
        output=answer,
        usage={
            "input": response.usage.prompt_tokens,
            "output": response.usage.completion_tokens,
            "total": response.usage.total_tokens
        },
        metadata={
            "latency_ms": round(latency_ms, 2)
        }
    )
   
    print(f"   Tokens used: {response.usage.total_tokens}")
    print(f"   Latency: {latency_ms:.2f}ms")
    print(f"   ✅ Manually logged to Langfuse")
    print(f"   🔍 Trace ID: {trace.id}\n")
   
    return answer


if __name__ == "__main__":
    print("\n" + "="*70)
    print("Manual Tracing Demo")
    print("="*70 + "\n")
   
    question = "What is deep learning?"
    print(f"Question: {question}\n")
    print("-" * 70 + "\n")
   
    # Generate with manual tracing
    answer = generate_with_manual_tracing(question)
    print(f"Answer: {answer}\n")
   
    print("=" * 70)
    print("\n📊 Manual Tracing vs Decorators:")
    print("   Manual (this file):")
    print("   • Full control over trace structure")
    print("   • More verbose code")
    print("   • Good for complex custom logging")
    print()
    print("   Decorators (recommended):")
    print("   • Clean @observe annotation")
    print("   • Less boilerplate")
    print("   • Automatic nesting")
    print("   • See: src/tracing_decorator.py")
    print("\n🔍 Check your dashboard: https://cloud.langfuse.com")
    print("=" * 70 + "\n")
   
    # Flush traces
    langfuse.flush()
</pre>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Code-Walkthrough-Langfuse-Manual-Tracing-Pipeline"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Code-Walkthrough-Langfuse-Manual-Tracing-Pipeline">Code Walkthrough: Langfuse Manual Tracing Pipeline</a></h3>



<p>Let us break this down into meaningful building blocks.</p>



<h4 class="wp-block-heading">Initializing Langfuse + vLLM</h4>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)" data-enlighter-group="2">langfuse = Langfuse()
client, model = get_llm_client(load_model_from_config=True)
llm_config = get_llm_config()
</pre>



<p>Here, we:</p>



<ul class="wp-block-list">
<li>connect to the self-hosted <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse Server</code></li>



<li>initialize a <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> OpenAI-compatible client</li>



<li>load generation parameters such as <code data-enlighter-language="python" class="EnlighterJSRAW">temperature</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">max_tokens</code></li>
</ul>



<p>Nothing happens yet. This is just configuration.</p>



<p>The real magic begins once we create a trace.</p>



<p><em><strong>Important: </strong></em><em>Manual tracing gives you full control over the trace lifecycle.</em></p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Creating-Manual-Traces-Langfuse"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Creating-Manual-Traces-Langfuse">Creating Manual Traces in Langfuse</a></h3>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)" data-enlighter-group="3">trace = langfuse.trace(
    name="manual_llm_call",
    metadata={"method": "manual", "question": question}
)
</pre>



<p>A <strong>trace</strong> is the root object that represents the entire request.</p>



<p>You define:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">trace</code> name</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">metadata</code></li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">context</code></li>
</ul>



<p>This is equivalent to <code data-enlighter-language="python" class="EnlighterJSRAW">@observe(name="llm_pipeline")</code>, but explicit.</p>


<div class="wp-block-image">
<figure class="aligncenter size-full"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-70.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="624" height="305" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-70.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53966" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-70.png?size=126x62&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-70-300x147.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-70.png?size=378x185&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-70.png?size=504x246&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-70.png?lossy=2&amp;strip=1&amp;webp=1 624w" sizes="(max-width: 624px) 100vw, 624px" /></a><figcaption class="wp-element-caption"><strong>Figure 2:</strong> A manual trace begins with an explicit call to <code>langfuse.trace()</code>, giving you full control over naming, IDs, metadata, and context.</figcaption></figure></div>


<h4 class="wp-block-heading">Creating a Generation Span</h4>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)" data-enlighter-group="4">generation = trace.generation(
    name="llm_generation",
    model=model,
    input=[ ... ],
    metadata={ ... }
)
</pre>



<p>This is the part decorators automatically create.</p>



<p>A <strong>generation span</strong>:</p>



<ul class="wp-block-list">
<li>represents a single LLM model call</li>



<li>stores the prompt</li>



<li>stores parameters (<code data-enlighter-language="python" class="EnlighterJSRAW">temperature</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">max_tokens</code>)</li>



<li>links itself as a <em>child</em> of the main trace</li>
</ul>



<p>This is a foundational building block for RAG and agent pipelines.</p>



<h4 class="wp-block-heading">Making the Actual LLM Call</h4>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)" data-enlighter-group="5">response = client.chat.completions.create(...)
</pre>



<p>Here, the raw LLM execution happens.</p>



<p>No tracing occurs automatically; the span must be updated manually afterward.</p>



<h4 class="wp-block-heading">Recording Results (Tokens, Latency, Outputs)</h4>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)" data-enlighter-group="6">generation.update(
    output=answer,
    usage={...},
    metadata={ "latency_ms": round(latency_ms, 2) }
)
</pre>



<p>In manual mode, <strong>you choose what to log</strong>.</p>



<p>This is how you capture:</p>



<ul class="wp-block-list">
<li>latency</li>



<li>token usage</li>



<li>answer text</li>



<li>any additional metadata</li>



<li>final span status</li>
</ul>



<p>This is where evaluators, reward functions, safety signals, etc., get attached.</p>



<h4 class="wp-block-heading">Flushing Traces</h4>



<p>Short-lived scripts can exit before <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> finishes sending data, so flush explicitly before the script ends:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)" data-enlighter-group="7">langfuse.flush()
</pre>



<p>This sends any pending events before the script exits, so the trace is not lost and shows up in the <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> dashboard shortly afterward.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Running-Langfuse-Manual-Tracing-Script"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Running-Langfuse-Manual-Tracing-Script">Running the Langfuse Manual Tracing Script</a></h3>



<p>Run the script from the project root:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)" data-enlighter-group="8">$ python src/tracing_manual.py
</pre>



<p>You should see the output, as shown in <strong>Figure 3</strong>:</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-71-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="846" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-71-1024x846.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53969" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-71.png?size=126x104&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-71-300x248.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-71.png?size=378x312&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-71.png?size=504x416&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-71.png?size=630x520&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-71-768x635.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-71-1024x846.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-71-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-71-1536x1269.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 3:</strong> Actual terminal output from running <code>tracing_manual.py</code>, showing manual <code>trace</code> creation, token usage, <code>latency</code>, and the generated <code>answer</code>.</figcaption></figure></div>


<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Viewing-Manual-Traces-Langfuse-Dashboard"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Viewing-Manual-Traces-Langfuse-Dashboard">Viewing Manual Traces in the Langfuse Dashboard</a></h3>



<p>After running the manual tracing script, open your Langfuse dashboard in your browser and look up the trace using the printed Trace ID.</p>



<p>You should see a page similar to the screenshot below, showing the full structure of your manually created trace.</p>



<p>This view includes:</p>



<ul class="wp-block-list">
<li><strong>Root trace:</strong> <code data-enlighter-language="python" class="EnlighterJSRAW">manual_llm_call</code></li>



<li><strong>Child span:</strong> <code data-enlighter-language="python" class="EnlighterJSRAW">llm_generation</code></li>



<li><strong>Token usage summary:</strong> 32 → 300 (332 total)</li>



<li><strong>Metadata:</strong>
<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">method: "manual"</code></li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">question: "What is deep learning?"</code></li>
</ul>
</li>



<li><strong>Input and output placeholders:</strong>
<ul class="wp-block-list">
<li>(These appear as <code data-enlighter-language="python" class="EnlighterJSRAW">null</code> on the root trace because the script never sets trace-level input or output; the child generation span holds the actual LLM data.)</li>
</ul>
</li>
</ul>



<p>This is the clearest demonstration of what manual tracing gives you: explicit control over the <code data-enlighter-language="python" class="EnlighterJSRAW">structure</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">metadata</code>, and <code data-enlighter-language="python" class="EnlighterJSRAW">nesting</code> of your trace.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-72-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="384" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-72-1024x384.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53972" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-72.png?size=126x47&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-72-300x112.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-72.png?size=378x142&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-72.png?size=504x189&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-72.png?size=630x236&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-72-768x288.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-72-1024x384.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-72-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-72-1536x575.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 4:</strong> Manual trace in <code>Langfuse</code> showing a custom root trace, a generation span, metadata, and token usage logged via explicit API calls.</figcaption></figure></div>


<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Manual-vs-Decorator-Tracing-Langfuse"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Manual-vs-Decorator-Tracing-Langfuse">Manual vs Decorator Tracing in Langfuse</a></h3>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-73.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="423" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-73-1024x423.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53974" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-73.png?size=126x52&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-73-300x124.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-73.png?size=378x156&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-73.png?size=504x208&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-73.png?size=630x260&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-73-768x317.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-73-1024x423.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-73.png?lossy=2&amp;strip=1&amp;webp=1 1039w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Table 1:</strong> Comparison of decorator-based tracing versus manual instrumentation across usability, control, and pipeline complexity.</figcaption></figure></div>


<p>In this section, you learned how to build an entire trace manually:</p>



<ul class="wp-block-list">
<li>creating a root <code data-enlighter-language="python" class="EnlighterJSRAW">trace</code></li>



<li>adding a generation <code data-enlighter-language="python" class="EnlighterJSRAW">span</code></li>



<li>logging <code data-enlighter-language="python" class="EnlighterJSRAW">prompts</code></li>



<li>recording <code data-enlighter-language="python" class="EnlighterJSRAW">latency</code></li>



<li>logging token usage</li>



<li>updating <code data-enlighter-language="python" class="EnlighterJSRAW">metadata</code></li>



<li>flushing <code data-enlighter-language="python" class="EnlighterJSRAW">results</code></li>
</ul>



<p>Manual tracing is verbose, but incredibly powerful for custom workflows, evaluation, and multi-step LLM applications.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-LLM-Evaluation-Metrics-Quality-Scoring-Langfuse"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-LLM-Evaluation-Metrics-Quality-Scoring-Langfuse">LLM Evaluation Metrics and Quality Scoring with Langfuse</a></h2>



<p>Observability is more than <code data-enlighter-language="python" class="EnlighterJSRAW">latency</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">tokens</code>. In real LLM systems, you also need to evaluate:</p>



<ul class="wp-block-list">
<li><strong>“Was the answer good?”</strong></li>



<li><strong>“Was it long enough?”</strong></li>



<li><strong>“Was it too slow?”</strong></li>



<li><strong>“Did model quality silently degrade?”</strong></li>
</ul>



<p>This section introduces evaluation metrics, custom scoring, and decorator-based tracing for quality analysis. You will learn how to attach accuracy/quality metadata to traces, visualize scores inside <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code>, and detect degraded model outputs in real time.</p>



<p>We will do this using the file <code data-enlighter-language="python" class="EnlighterJSRAW">evaluation_metrics.py</code>, which combines:</p>



<ul class="wp-block-list">
<li>the <code data-enlighter-language="python" class="EnlighterJSRAW">@observe</code> decorator</li>



<li>custom <code data-enlighter-language="python" class="EnlighterJSRAW">scoring</code> logic</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">latency</code> checks</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">trace</code> scoring</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">evaluation</code> pipeline wrapper</li>
</ul>



<p>By the end, you will have a complete <code data-enlighter-language="python" class="EnlighterJSRAW">scoring pipeline</code> with metrics displayed inside the <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> dashboard.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Adding-LLM-Evaluation-Metrics-Beyond-Manual-Tracing"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Adding-LLM-Evaluation-Metrics-Beyond-Manual-Tracing">Adding LLM Evaluation Metrics Beyond Manual Tracing</a></h3>



<p>This file builds on everything from Sections 2 and 3.</p>



<p>This script adds 4 major improvements:</p>



<ul class="wp-block-list">
<li><strong>Automated tracing</strong> using <code data-enlighter-language="python" class="EnlighterJSRAW">@observe</code></li>



<li><strong>Custom </strong><code data-enlighter-language="python" class="EnlighterJSRAW">quality</code><strong> metric</strong> (using <code data-enlighter-language="python" class="EnlighterJSRAW">answer_length</code> as a proxy)</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">latency</code><strong> threshold warnings</strong></li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">score</code><strong> logging</strong> inside <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> (visible as a numerical “<code data-enlighter-language="python" class="EnlighterJSRAW">quality</code>” score)</li>
</ul>



<p>This turns your traces from “LLM diagnostics” into <strong>LLM evaluation and monitoring</strong>.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Code-Walkthrough-evaluation-metrics-py"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Code-Walkthrough-evaluation-metrics-py">Code Walkthrough: evaluation_metrics.py</a></h3>



<p>Below is the full annotated walkthrough.</p>



<h4 class="wp-block-heading">Initialize Langfuse + LLM Client</h4>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)" data-enlighter-group="9">langfuse = Langfuse()
client, model = get_llm_client(load_model_from_config=True)
</pre>



<p>We initialize 2 systems:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> (manual scoring only): decorators handle <code data-enlighter-language="python" class="EnlighterJSRAW">tracing</code>, but <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse()</code> is needed for scoring.</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> client: same <code data-enlighter-language="python" class="EnlighterJSRAW">OpenAI</code>-compatible API as Lesson 1.</li>
</ul>



<h4 class="wp-block-heading">The Main Function: generate_and_score()</h4>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)" data-enlighter-group="10">@observe(name="generate_and_score")
def generate_and_score(question: str) -> tuple[str, float]:
</pre>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">@observe</code> decorator automatically creates a trace and an associated observation.</p>



<p>The rest of the function focuses on:</p>



<ul class="wp-block-list">
<li>LLM call</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">latency</code> measurement</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">quality</code> scoring</li>



<li>updating the observation</li>



<li>recording a <code data-enlighter-language="python" class="EnlighterJSRAW">score</code></li>
</ul>



<h4 class="wp-block-heading">Load Configurations</h4>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)" data-enlighter-group="11">llm_config = get_llm_config()
eval_config = get_evaluation_config()
temperature = llm_config.get("temperature", 0.7)
max_tokens = llm_config.get("max_tokens", 300)
   
min_length = eval_config.get("min_length", 20)
good_length_threshold = eval_config.get("good_length_threshold", 100)
max_latency_ms = eval_config.get("max_latency_ms", 5000)
</pre>



<p>From <code data-enlighter-language="python" class="EnlighterJSRAW">config.yaml</code>, we load:</p>



<p><strong>LLM Parameters</strong></p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">temperature</code></li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">max_tokens</code></li>
</ul>



<p><strong>Evaluation Parameters</strong></p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">min_length</code></li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">good_length_threshold</code></li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">max_latency_ms</code></li>
</ul>



<p>This means your scoring logic is <strong>configurable</strong> without touching Python code.</p>



<h4 class="wp-block-heading">Log Input</h4>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)" data-enlighter-group="12">langfuse_context.update_current_observation(
    input={"question": question, "model": model}
)
</pre>



<p><code data-enlighter-language="python" class="EnlighterJSRAW">langfuse_context.update_current_observation(...)</code> is used to attach new information to the current observation in a <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> trace.</p>



<p>Think of a <code data-enlighter-language="python" class="EnlighterJSRAW">trace</code> as one full request, and an <code data-enlighter-language="python" class="EnlighterJSRAW">observation</code> as one step inside that request (e.g., LLM call, embedding call, retrieval step).</p>



<h4 class="wp-block-heading">Perform the LLM Call + Measure Latency</h4>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)" data-enlighter-group="13">start_time = time.time()
# Make LLM call
response = client.chat.completions.create(
    model=model,
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": question}
    ],
    temperature=temperature,
    max_tokens=max_tokens
)
# Calculate latency
latency_ms = (time.time() - start_time) * 1000
</pre>



<p>This gives us:</p>



<ul class="wp-block-list">
<li>Real wall-clock <code data-enlighter-language="python" class="EnlighterJSRAW">latency</code></li>



<li>First-token + completion <code data-enlighter-language="python" class="EnlighterJSRAW">latency</code> combined</li>



<li>Values used for <code data-enlighter-language="python" class="EnlighterJSRAW">threshold</code> checking</li>
</ul>



<h4 class="wp-block-heading">Compute Answer Length + Quality Score</h4>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)" data-enlighter-group="14">answer_length = len(answer)
# Calculate quality score
if answer_length &lt; min_length:
    quality_score = 0.3
elif answer_length >= good_length_threshold:
    quality_score = 1.0
else:
    quality_score = 0.3 + (
        0.7 * (answer_length - min_length) /
        (good_length_threshold - min_length)
    )
</pre>



<p>This snippet measures the length (in characters) of the generated <code data-enlighter-language="python" class="EnlighterJSRAW">answer</code> and uses it to compute a simple <code data-enlighter-language="python" class="EnlighterJSRAW">quality</code> score: if the answer is too short (below <code data-enlighter-language="python" class="EnlighterJSRAW">min_length</code>), it assigns a low score of <code data-enlighter-language="python" class="EnlighterJSRAW">0.3</code>; if it meets or exceeds the <code data-enlighter-language="python" class="EnlighterJSRAW">good_length_threshold</code>, it gives a perfect score of <code data-enlighter-language="python" class="EnlighterJSRAW">1.0</code>. Otherwise, it linearly scales the score between <code data-enlighter-language="python" class="EnlighterJSRAW">0.3</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">1.0</code> based on where the <code data-enlighter-language="python" class="EnlighterJSRAW">answer_length</code> falls between the two thresholds. This provides a lightweight heuristic for judging response completeness without requiring complex evaluation logic.</p>



<h4 class="wp-block-heading">Update the Observation (Output + Usage + Metadata)</h4>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)" data-enlighter-group="15"># Update observation with results and custom metrics
langfuse_context.update_current_observation(
    output={"answer": answer, "quality_score": quality_score},
    usage={
        "input": response.usage.prompt_tokens,
        "output": response.usage.completion_tokens,
        "total": response.usage.total_tokens
    },
    metadata={
        "latency_ms": round(latency_ms, 2),
        "answer_length": answer_length
    }
)
</pre>



<p>This block updates the current <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> observation with everything needed to record the model’s performance: it logs the generated <code data-enlighter-language="python" class="EnlighterJSRAW">answer</code> and its quality score, tracks token usage from the model response (input, output, and total), and attaches custom metadata such as request <code data-enlighter-language="python" class="EnlighterJSRAW">latency</code> and the length of the returned answer. Together, these fields give you a complete view of each evaluation run, including what the model produced, how much it cost, and how efficiently it responded, making it easier to analyze and compare results across experiments.</p>



<p>What this adds to <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code>:</p>



<ul class="wp-block-list">
<li>Answer text</li>



<li>Quality score</li>



<li>Token usage</li>



<li>Latency</li>



<li>Derived <code data-enlighter-language="python" class="EnlighterJSRAW">metrics</code> (<code data-enlighter-language="python" class="EnlighterJSRAW">answer_length</code>)</li>
</ul>



<p>This gives you the same view you would see in enterprise-grade observability tools.</p>



<h4 class="wp-block-heading">Attach a Score to the Trace</h4>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)" data-enlighter-group="16"># Score the trace
langfuse_context.score_current_observation(
    name="quality",
    value=quality_score,
    comment=f"Based on answer length ({answer_length} chars)"
)
</pre>



<p>This call scores the current <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> observation by attaching a custom score named &#8220;quality&#8221; to it. It records the numerical <code data-enlighter-language="python" class="EnlighterJSRAW">quality_score</code>, your own metric for evaluating the model’s answer, and adds a short comment explaining the basis of that score, in this case referencing the <code data-enlighter-language="python" class="EnlighterJSRAW">answer_length</code>. Scoring observations like this makes it easy to compare model responses, analyze performance over time, and visualize quality trends directly in the <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> dashboard.</p>



<p>In short, this creates a <em>visible, numeric score</em> inside the <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> dashboard.</p>



<p>This is extremely powerful for:</p>



<ul class="wp-block-list">
<li>model comparisons</li>



<li>regression testing</li>



<li>degradation alerts</li>



<li>ranking model performance</li>
</ul>



<h4 class="wp-block-heading">Running the Evaluation Pipeline</h4>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)" data-enlighter-group="17">@observe(name="evaluation_pipeline")
def run_evaluation(question: str):
    """Wrapper to create a trace context for the evaluation."""
    from datetime import datetime
   
    # Add timestamp to make each run unique
    langfuse_context.update_current_trace(
        metadata={"run_time": datetime.now().isoformat()}
    )
   
    answer, score = generate_and_score(question)
   
    print(f"\n✅ Answer: {answer}\n")
    print(f"📊 Quality Score: {score:.2f}\n")
   
    trace_id = langfuse_context.get_current_trace_id()
    if trace_id:
        print(f"🔍 View trace with scores: https://cloud.langfuse.com/trace/{trace_id}")
        print(f"📋 Trace ID: {trace_id}")
    print("="*50 + "\n")
   
    return answer, score
</pre>



<p>This function defines an evaluation pipeline using the <code data-enlighter-language="python" class="EnlighterJSRAW">@observe</code> decorator, which tells <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> to treat every call as a traced, observable run. When the function starts, it imports datetime and immediately updates the active <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> trace with a timestamp so each evaluation run is uniquely identifiable. This metadata is helpful when you are comparing multiple experiments, debugging behavior, or tracking quality trends over time.</p>



<p>The core of the function calls <code data-enlighter-language="python" class="EnlighterJSRAW">generate_and_score(question)</code>, which returns an AI-generated <code data-enlighter-language="python" class="EnlighterJSRAW">answer</code> along with a numerical quality score. Both values are printed in a human-friendly format, and the function then retrieves the current <code data-enlighter-language="python" class="EnlighterJSRAW">trace_id</code> from <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code>. If a trace exists, it prints a direct link to view the full run, including <code data-enlighter-language="python" class="EnlighterJSRAW">metrics</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">scores</code>, in the <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> dashboard. Note that the listing prints a <code data-enlighter-language="python" class="EnlighterJSRAW">cloud.langfuse.com</code> URL; for the self-hosted setup used in this lesson, replace the host with your own Langfuse address (e.g., <code data-enlighter-language="python" class="EnlighterJSRAW">http://localhost:3000</code>).</p>



<p>Finally, the function returns the answer and score so they can be used downstream, while also visually marking the end of the run in the terminal output.</p>



<p><strong>It adds:</strong></p>



<ul class="wp-block-list">
<li>timestamp <code data-enlighter-language="python" class="EnlighterJSRAW">metadata</code></li>



<li>parent-level <code data-enlighter-language="python" class="EnlighterJSRAW">trace</code> context</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">output</code> printing</li>



<li>a link to view the <code data-enlighter-language="python" class="EnlighterJSRAW">trace</code></li>
</ul>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Running-LLM-Evaluation-Metrics-Pipeline"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Running-LLM-Evaluation-Metrics-Pipeline">Running the LLM Evaluation Metrics Pipeline</a></h3>



<p>A typical terminal run will show:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="raw" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)" data-enlighter-group="18">==================================================
Evaluation with Custom Scoring
==================================================

Question: What are neural networks?

📊 Quality Score: 0.82 (answer length: 112 chars)
📊 Latency: 212.45ms
📊 Tokens: 14 → 72

🔍 View trace with scores: http://localhost:3000/trace/01HY3SJQH9...
==================================================
⏳ Flushing traces to Langfuse...
✅ Traces sent!
</pre>



<p>Use this output to confirm that your run completed correctly: you should see the quality score, latency, token counts, and a trace link before the traces are flushed to Langfuse.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Conceptual-Mockup-Evaluation-Trace-Langfuse"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Conceptual-Mockup-Evaluation-Trace-Langfuse">Conceptual Mockup: Evaluation Trace in Langfuse</a></h3>



<p>Before looking at the real dashboard output, here is a clean conceptual view of what an evaluation trace looks like inside <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code>.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-74-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="1021" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-74-1024x1021.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53976" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-74-150x150.png?lossy=2&amp;strip=1&amp;webp=1 150w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-74-300x300.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-74.png?size=378x377&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-74.png?size=504x503&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-74.png?size=630x628&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-74-768x766.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-74-1024x1021.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-74-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 5:</strong> <code>Langfuse UI</code> mockup showing the evaluation pipeline, complete with the parent trace (<code>evaluation_pipeline</code>), child span (<code>generate_and_score</code>), token usage, <code>latency</code>, model <code>metadata</code>, <code>answer</code> output, and the computed <code>quality</code> score.</figcaption></figure></div>


<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Real-Trace-Self-Hosted-Langfuse-Dashboard"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Real-Trace-Self-Hosted-Langfuse-Dashboard">Real Trace from Our Self-Hosted Langfuse Dashboard</a></h3>



<p>Now, let us look at the <strong>actual trace</strong> generated by our evaluation script.</p>



<p>This is exactly what you should see when running:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)" data-enlighter-group="19">$ python src/evaluation_metrics.py
</pre>



<p>Your <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> dashboard will show:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">evaluation_pipeline</code>: as the parent trace</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">generate_and_score</code>: as the nested span</li>



<li>full <strong>inputs</strong> (question, system message, model config)</li>



<li>full <strong>outputs</strong> (LLM answer + quality score)</li>



<li><strong>token usage</strong> (input, output, total)</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">latency</code> measured manually</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">metadata</code> from <code data-enlighter-language="python" class="EnlighterJSRAW">config.yaml</code></li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">score</code> <strong>badge</strong> showing the computed <code data-enlighter-language="python" class="EnlighterJSRAW">quality</code> metric</li>
</ul>



<p>While <strong>Figure 6</strong> shows the actual <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> trace captured during execution, the diagram in <strong>Figure 7</strong> abstracts the same process into a clear evaluation pipeline. It highlights how the <code data-enlighter-language="python" class="EnlighterJSRAW">LLM</code> response is generated, how <code data-enlighter-language="python" class="EnlighterJSRAW">evaluation</code> metrics are computed, and how both the raw <code data-enlighter-language="python" class="EnlighterJSRAW">outputs</code> and derived <code data-enlighter-language="python" class="EnlighterJSRAW">quality</code> scores are attached to a single <code data-enlighter-language="python" class="EnlighterJSRAW">trace</code> before being logged to <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code>.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-75-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="490" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-75-1024x490.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53978" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-75.png?size=126x60&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-75-300x143.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-75.png?size=378x181&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-75.png?size=504x241&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-75.png?size=630x301&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-75-768x367.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-75-1024x490.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-75-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-75-1536x734.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 6:</strong> Real evaluation trace from the self-hosted <code>Langfuse</code> dashboard showing <code>metadata</code>, full <code>answer</code> output, <code>latency</code> breakdown, token usage, and the custom <code>quality</code> score registered by our <code>evaluation_metrics.py</code> script.</figcaption></figure></div>

<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-76-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="128" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-76-1024x128.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53980" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-76.png?size=126x16&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-76-300x38.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-76.png?size=378x47&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-76.png?size=504x63&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-76.png?size=630x79&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-76-768x96.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-76-1024x128.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-76-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-76-1536x192.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 7:</strong> The <code>evaluation_pipeline</code> generates an LLM answer, computes <code>metrics</code>, attaches a <code>quality</code> score, and logs everything into <code>Langfuse</code>.</figcaption></figure></div>


<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Why-LLM-Evaluation-Metrics-Matter"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Why-LLM-Evaluation-Metrics-Matter">Why LLM Evaluation Metrics Matter</a></h3>



<p>By adding <code data-enlighter-language="python" class="EnlighterJSRAW">evaluation</code> metrics:</p>



<ul class="wp-block-list">
<li>You detect <code data-enlighter-language="python" class="EnlighterJSRAW">model</code> degradation</li>



<li>You compare <code data-enlighter-language="python" class="EnlighterJSRAW">models</code> or <code data-enlighter-language="python" class="EnlighterJSRAW">prompts</code></li>



<li>You measure <code data-enlighter-language="python" class="EnlighterJSRAW">latency</code> regressions</li>



<li>You track <code data-enlighter-language="python" class="EnlighterJSRAW">token</code> cost spikes</li>



<li>You get quality insights per <code data-enlighter-language="python" class="EnlighterJSRAW">request</code></li>
</ul>



<p>This pushes your system beyond “debuggable” into <strong>evaluated</strong>, which is critical for anything involving RAG, agents, or multi-step pipelines.</p>



<p>In this section, you learned how to:</p>



<ul class="wp-block-list">
<li>Instrument <code data-enlighter-language="python" class="EnlighterJSRAW">LLM</code> calls with decorators</li>



<li>Compute custom <code data-enlighter-language="python" class="EnlighterJSRAW">evaluation</code> metrics</li>



<li>Attach <code data-enlighter-language="python" class="EnlighterJSRAW">quality</code> scores to <code data-enlighter-language="python" class="EnlighterJSRAW">traces</code></li>



<li>Visualize <code data-enlighter-language="python" class="EnlighterJSRAW">scores</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">latency</code>, and <code data-enlighter-language="python" class="EnlighterJSRAW">tokens</code> inside <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code></li>



<li>Wrap everything inside an <code data-enlighter-language="python" class="EnlighterJSRAW">evaluation_pipeline</code></li>
</ul>



<p>With this, tracing evolves from simple diagnostics into actual <code data-enlighter-language="python" class="EnlighterJSRAW">LLM</code> evaluation.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-vLLM-Diagnostics-Health-Checks-LLM-Observability"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-vLLM-Diagnostics-Health-Checks-LLM-Observability">vLLM Diagnostics and Health Checks for LLM Observability</a></h2>



<p>Before we evaluate model outputs or analyze <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> traces, we need to make sure the underlying engine, <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code>, is alive, reachable, and responding correctly. If <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> is down, every script in this lesson fails. If the model is still loading, requests time out. If ports are wrong, you will get cryptic errors that look like <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> problems but are actually <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> issues.</p>



<p>To prevent all of that, we use <code data-enlighter-language="python" class="EnlighterJSRAW">health_check.py</code>, a dedicated diagnostic tool that validates your entire local <code data-enlighter-language="python" class="EnlighterJSRAW">LLM</code> runtime before you run any <code data-enlighter-language="python" class="EnlighterJSRAW">tracing</code> or <code data-enlighter-language="python" class="EnlighterJSRAW">scoring</code> scripts.</p>



<p>This script confirms 3 things:</p>



<ul class="wp-block-list">
<li>Is the <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> server running and responding?</li>



<li>Are models actually loaded?</li>



<li>Can the model generate text?</li>
</ul>



<p>If all 3 pass, your observability stack is ready.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-What-vLLM-Health-Check-Script-Validates"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-What-vLLM-Health-Check-Script-Validates">What the vLLM Health Check Script Validates</a></h3>



<p><code data-enlighter-language="python" class="EnlighterJSRAW">health_check.py</code> performs 3 layers of validation:</p>



<h4 class="wp-block-heading">Layer 1: Infrastructure health</h4>



<ul class="wp-block-list">
<li>Calls <code data-enlighter-language="python" class="EnlighterJSRAW">/health</code> endpoint</li>



<li>Checks whether the <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> server is reachable</li>



<li>Confirms that the port and base URL match your config</li>
</ul>



<h4 class="wp-block-heading">Layer 2: Model readiness</h4>



<ul class="wp-block-list">
<li>Calls <code data-enlighter-language="python" class="EnlighterJSRAW">/v1/models</code></li>



<li>Ensures at least one model is loaded</li>



<li>Detects if <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> is still downloading or initializing the model</li>
</ul>



<h4 class="wp-block-heading">Layer 3: LLM generation test</h4>



<ul class="wp-block-list">
<li>Sends a simple prompt: <em>“Say ‘OK’ if you’re working.”</em></li>



<li>Ensures the model produces an actual <code data-enlighter-language="python" class="EnlighterJSRAW">response</code></li>
</ul>



<p>This prevents 95% of “It’s not working” confusion.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Code-Walkthrough-health-check-py"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Code-Walkthrough-health-check-py">Code Walkthrough: health_check.py</a></h3>



<p>We now walk through the entire script, grouped logically rather than line by line, following typical PyImageSearch style.</p>



<h4 class="wp-block-heading">Configuration and Imports</h4>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)" data-enlighter-group="20">import sys
import httpx
from llm_utils import get_llm_client
from config import get_llm_config
</pre>



<p>The script uses:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">httpx</code>: for fast <code data-enlighter-language="python" class="EnlighterJSRAW">HTTP</code> checks</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">get_llm_client()</code>: to issue a test generation</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">get_llm_config()</code>: to load the base URL from your YAML config</li>
</ul>



<p>No hard-coded URLs, which keeps the system in sync with <code data-enlighter-language="python" class="EnlighterJSRAW">config.yaml</code>.</p>



<h4 class="wp-block-heading">Checking vLLM Health</h4>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)" data-enlighter-group="21">def check_vllm_health(base_url: str = None, timeout: int = 5) -> bool:
    """
    Check if vLLM server is healthy.
   
    Args:
        base_url: vLLM server base URL (defaults to config.yaml)
        timeout: Request timeout in seconds
       
    Returns:
        True if server is healthy, False otherwise
    """
    # Load base_url from config if not provided
    if base_url is None:
        llm_config = get_llm_config()
        base_url = llm_config.get("base_url", "http://localhost:8000/v1")
        base_url = base_url.rstrip("/v1")
   
    health_url = f"{base_url}/health"
    models_url = f"{base_url}/v1/models"
   
    print(f"🔍 Checking vLLM server at {base_url}...")
</pre>



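<p>If <code data-enlighter-language="python" class="EnlighterJSRAW">base_url</code> is not provided, the <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> URL is loaded from <code data-enlighter-language="python" class="EnlighterJSRAW">config.yaml</code>, and its trailing <code data-enlighter-language="python" class="EnlighterJSRAW">/v1</code> is removed so that <code data-enlighter-language="python" class="EnlighterJSRAW">/health</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">/v1/models</code> can be appended to the server root. Be aware that <code data-enlighter-language="python" class="EnlighterJSRAW">rstrip("/v1")</code> strips any trailing <code data-enlighter-language="python" class="EnlighterJSRAW">/</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">v</code>, and <code data-enlighter-language="python" class="EnlighterJSRAW">1</code> characters rather than the literal suffix, so a URL such as <code data-enlighter-language="python" class="EnlighterJSRAW">http://localhost:8001/v1</code> would be truncated to <code data-enlighter-language="python" class="EnlighterJSRAW">http://localhost:800</code>. If your port ends in 1, use <code data-enlighter-language="python" class="EnlighterJSRAW">base_url.removesuffix("/v1")</code> (Python 3.9+) instead.</p>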



<p>Next:</p>



<h4 class="wp-block-heading">Health endpoint check</h4>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)" data-enlighter-group="22">try:
        # Check health endpoint
        with httpx.Client(timeout=timeout) as client:
            response = client.get(health_url)
            if response.status_code == 200:
                print(f"  ✅ Health check passed")
            else:
                print(f"  ❌ Health check failed (status: {response.status_code})")
                return False
</pre>



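<p>A healthy <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> server responds with HTTP status 200. The script checks only the status code, not the response body, so treat the body below as illustrative:</p>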



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)" data-enlighter-group="23">{"status": "ok"}
</pre>



<p>If this fails, <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> is down, so no tracing or scoring will work.</p>



<h4 class="wp-block-heading">Models endpoint check</h4>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)" data-enlighter-group="24">       # Check models endpoint
        with httpx.Client(timeout=timeout) as client:
            response = client.get(models_url)
            if response.status_code == 200:
                models = response.json().get("data", [])
                if models:
                    print(f"  ✅ Models available: {[m['id'] for m in models]}")
                else:
                    print(f"  ⚠️  No models loaded yet (still initializing?)")
                    return False
            else:
                print(f"  ❌ Models endpoint failed (status: {response.status_code})")
                return False
       
        return True
</pre>



<p>A healthy <code data-enlighter-language="python" class="EnlighterJSRAW">response</code> contains:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="json" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)" data-enlighter-group="25">{
  "data": [
     {"id": "meta-llama/Llama-2-7b-chat-hf"}
   ]
}
</pre>



<p>If this list is empty, the model is still loading.</p>



<h4 class="wp-block-heading">Error handling</h4>



<p>The script gracefully handles:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">connection</code> failure</li>



<li>timeouts</li>



<li>unexpected <code data-enlighter-language="python" class="EnlighterJSRAW">JSON</code></li>



<li>wrong <code data-enlighter-language="python" class="EnlighterJSRAW">ports</code></li>



<li>wrong <code data-enlighter-language="python" class="EnlighterJSRAW">base_url</code></li>
</ul>



<p>And prints clear, actionable fixes.</p>



<h4 class="wp-block-heading">Testing LLM Generation</h4>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)" data-enlighter-group="26">def test_llm_generation() -> bool:
    """Test simple LLM generation."""
    print("\n🔍 Testing LLM generation...")
   
    try:
        client = get_llm_client(timeout=30)
        response = client.chat.completions.create(
            model="meta-llama/Llama-2-7b-chat-hf",
            messages=[{"role": "user", "content": "Say 'OK' if you're working."}],
            max_tokens=10
        )
       
        answer = response.choices[0].message.content
        print(f"  ✅ Generation successful: {answer[:50]}...")
        return True
       
    except Exception as e:
        print(f"  ❌ Generation failed: {e}")
        return False
</pre>



<p>This test:</p>



<ul class="wp-block-list">
<li>Instantiates the OpenAI client</li>



<li>Sends a tiny one-line <code data-enlighter-language="python" class="EnlighterJSRAW">prompt</code></li>



<li>Validates the model <code data-enlighter-language="python" class="EnlighterJSRAW">answers</code> with at least <em>something</em></li>
</ul>



<p>If the model cannot generate, your entire <code data-enlighter-language="python" class="EnlighterJSRAW">tracing</code> pipeline will also fail.</p>



<h4 class="wp-block-heading">The Entry Point</h4>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)" data-enlighter-group="27">if __name__ == "__main__":
     main()
</pre>



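<p>This entry point calls <code data-enlighter-language="python" class="EnlighterJSRAW">main()</code> when the file is executed directly. Run this script before every other script in the lesson.</p>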



<p>It prints:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> health</li>



<li>Model <code data-enlighter-language="python" class="EnlighterJSRAW">availability</code></li>



<li>Generation <code data-enlighter-language="python" class="EnlighterJSRAW">test</code></li>
</ul>



<p>And guides you through failures with friendly hints:</p>



<p>“Start vLLM: docker-compose up -d”</p>



<p>“Wait 2-3 minutes for model download”</p>



<p>“Check docker logs”</p>



<p>This makes beginner troubleshooting seamless.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-77-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="682" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-77-1024x682.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53983" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-77.png?size=126x84&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-77-300x200.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-77.png?size=378x252&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-77.png?size=504x336&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-77.png?size=630x420&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-77-768x511.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-77-1024x682.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-77-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 8:</strong> The <code>vLLM</code> health check verifies that the server is running, the model is loaded, and generation works end-to-end.</figcaption></figure></div>


<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Why-vLLM-Health-Checks-Matter-LLM-Observability"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Why-vLLM-Health-Checks-Matter-LLM-Observability">Why vLLM Health Checks Matter for LLM Observability</a></h3>



<p>If <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> is unhealthy, <strong>every</strong> tracing script fails.</p>



<p>This script prevents:</p>



<ul class="wp-block-list">
<li>Running manual tracing while <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> is down</li>



<li>Chasing <code data-enlighter-language="python" class="EnlighterJSRAW">decorator</code> errors that are actually <code data-enlighter-language="python" class="EnlighterJSRAW">connection</code> errors</li>



<li>Confusing <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> ingestion errors with model-loading delays</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">token</code> errors caused by uninitialized models</li>



<li>Timeouts that look like <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> bugs</li>
</ul>



<p>It gives readers a clean, deterministic start before diving into observability.</p>



<p>In this section, you learned:</p>



<ul class="wp-block-list">
<li>How to verify <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> health</li>



<li>Why the <code data-enlighter-language="python" class="EnlighterJSRAW">/health</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">/v1/models</code> endpoints matter</li>



<li>How to test real <code data-enlighter-language="python" class="EnlighterJSRAW">generation</code></li>



<li>How to diagnose common <code data-enlighter-language="python" class="EnlighterJSRAW">startup</code> issues</li>



<li>How to ensure the entire <code data-enlighter-language="python" class="EnlighterJSRAW">tracing</code> pipeline will work</li>
</ul>



<p>With your environment confirmed healthy, you are ready to score model <code data-enlighter-language="python" class="EnlighterJSRAW">outputs</code> and analyze <code data-enlighter-language="python" class="EnlighterJSRAW">evaluation</code> traces in <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code>.</p>



<hr class="wp-block-separator has-alpha-channel-opacity"/>



<div id="pitch" style="padding: 40px; width: 100%; background-color: #F4F6FA;">
	<h3>What's next? We recommend <a target="_blank" href="https://pyimagesearch.com/pyimagesearch-university/?utm_source=blogPost&utm_medium=bottomBanner&utm_campaign=What%27s%20next%3F%20I%20recommend">PyImageSearch University</a>.</h3>

	<script src="https://fast.wistia.com/embed/medias/kno0cmko2z.jsonp" async></script><script src="https://fast.wistia.com/assets/external/E-v1.js" async></script><div class="wistia_responsive_padding" style="padding:56.25% 0 0 0;position:relative;"><div class="wistia_responsive_wrapper" style="height:100%;left:0;position:absolute;top:0;width:100%;"><div class="wistia_embed wistia_async_kno0cmko2z videoFoam=true" style="height:100%;position:relative;width:100%"><div class="wistia_swatch" style="height:100%;left:0;opacity:0;overflow:hidden;position:absolute;top:0;transition:opacity 200ms;width:100%;"><img decoding="async" src="https://fast.wistia.com/embed/medias/kno0cmko2z/swatch" style="filter:blur(5px);height:100%;object-fit:contain;width:100%;" alt="" aria-hidden="true" onload="this.parentNode.style.opacity=1;" /></div></div></div></div>

	<div style="margin-top: 32px; margin-bottom: 32px; ">
		<strong>Course information:</strong><br/>
		86+ total classes • 115+ hours of on-demand code walkthrough videos • Last updated: June 2026<br/>
		<span style="color: #169FE6;">★★★★★</span> 4.84 (128 Ratings) • 16,000+ Students Enrolled
	</div>

	<p><strong>I strongly believe that if you had the right teacher you could <em>master</em> computer vision and deep learning.</strong></p>

	<p>Do you think learning computer vision and deep learning has to be time-consuming, overwhelming, and complicated? Or has to involve complex mathematics and equations? Or requires a degree in computer science?</p>

	<p>That’s <em>not</em> the case.</p>

	<p>All you need to master computer vision and deep learning is for someone to explain things to you in <em>simple, intuitive</em> terms. <em>And that’s exactly what I do</em>. My mission is to change education and how complex Artificial Intelligence topics are taught.</p>

	<p>If you're serious about learning computer vision, your next stop should be PyImageSearch University, the most comprehensive computer vision, deep learning, and OpenCV course online today. Here you’ll learn how to <em>successfully</em> and <em>confidently</em> apply computer vision to your work, research, and projects. Join me in computer vision mastery.</p>

	<p><strong>Inside PyImageSearch University you'll find:</strong></p>

	<ul style="margin-left: 0px;">
		<li style="list-style: none;">&check; <strong>86+ courses</strong> on essential computer vision, deep learning, and OpenCV topics</li>
		<li style="list-style: none;">&check; <strong>86 Certificates</strong> of Completion</li>
		<li style="list-style: none;">&check; <strong>115+ hours</strong> of on-demand video</li>
		<li style="list-style: none;">&check; <strong>Brand new courses released <em>regularly</em></strong>, ensuring you can keep up with state-of-the-art techniques</li>
		<li style="list-style: none;">&check; <strong>Pre-configured Jupyter Notebooks in Google Colab</strong></li>
		<li style="list-style: none;">&check; Run all code examples in your web browser — works on Windows, macOS, and Linux (no dev environment configuration required!)</li>
		<li style="list-style: none;">&check; Access to <strong>centralized code repos for <em>all</em> 540+ tutorials</strong> on PyImageSearch</li>
		<li style="list-style: none;">&check; <strong> Easy one-click downloads</strong> for code, datasets, pre-trained models, etc.</li>
		<li style="list-style: none;">&check; <strong>Access</strong> on mobile, laptop, desktop, etc.</li>
	</ul>

	<p style="text-align: center;">
		<a target="_blank" class="button link" href="https://pyimagesearch.com/pyimagesearch-university/?utm_source=blogPost&utm_medium=bottomBanner&utm_campaign=What%27s%20next%3F%20I%20recommend" style="background-color: #6DC713; border-bottom: none;">Click here to join PyImageSearch University</a>
	</p>
</div>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Summary"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Summary">Summary</a></h2>



<p>In this lesson, you moved beyond simply capturing <code data-enlighter-language="python" class="EnlighterJSRAW">traces</code> and learned how to measure, <code data-enlighter-language="python" class="EnlighterJSRAW">score</code>, and diagnose the quality of your <code data-enlighter-language="python" class="EnlighterJSRAW">LLM</code> pipeline. Lesson 1 gave you observability; Lesson 2 gave you interpretation.</p>



<p>You began by understanding why manual tracing still matters even when decorators exist. Manual <code data-enlighter-language="python" class="EnlighterJSRAW">spans</code> give you full control over <code data-enlighter-language="python" class="EnlighterJSRAW">trace</code> structure, <code data-enlighter-language="python" class="EnlighterJSRAW">metadata</code>, and custom <code data-enlighter-language="python" class="EnlighterJSRAW">logging</code>, making them essential for debugging <code data-enlighter-language="python" class="EnlighterJSRAW">agent</code> loops, multi-step pipelines, and retrieval-heavy systems. You then revisited the <code data-enlighter-language="python" class="EnlighterJSRAW">decorator</code> pattern and learned when to use each approach so your real-world projects can choose the right <code data-enlighter-language="python" class="EnlighterJSRAW">instrumentation</code> strategy.</p>



<p>Next, you implemented true <code data-enlighter-language="python" class="EnlighterJSRAW">evaluation-driven</code> observability using the <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> scoring interface. You wrapped LLM calls with <code data-enlighter-language="python" class="EnlighterJSRAW">@observe</code>, computed a custom “<code data-enlighter-language="python" class="EnlighterJSRAW">quality</code> score,” tracked <code data-enlighter-language="python" class="EnlighterJSRAW">latency</code> and token usage, and attached structured <code data-enlighter-language="python" class="EnlighterJSRAW">metrics</code> directly to your traces. This transformed your dashboard from a simple <code data-enlighter-language="python" class="EnlighterJSRAW">trace</code> viewer into a performance <code data-enlighter-language="python" class="EnlighterJSRAW">analytics</code> console.</p>



<p>Finally, you validated your infrastructure using a robust <code data-enlighter-language="python" class="EnlighterJSRAW">health-check</code> system. Before any tracing or scoring happens, <code data-enlighter-language="python" class="EnlighterJSRAW">health_check.py</code> ensures <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> is running, the model is loaded, and real <code data-enlighter-language="python" class="EnlighterJSRAW">generation</code> works end-to-end. This eliminates guesswork and gives you a reliable foundation for more advanced workflows.</p>



<p>By the end of this lesson, your observability pipeline now supports:</p>



<ul class="wp-block-list">
<li>manual low-level <code data-enlighter-language="python" class="EnlighterJSRAW">traces</code></li>



<li>decorator-based nested <code data-enlighter-language="python" class="EnlighterJSRAW">traces</code></li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">latency</code> instrumentation</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">token</code> usage insights</li>



<li>custom <code data-enlighter-language="python" class="EnlighterJSRAW">evaluation</code> scores</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">metadata</code>-rich pipeline summaries</li>



<li>infrastructure-level <code data-enlighter-language="python" class="EnlighterJSRAW">diagnostics</code></li>
</ul>



<p>Together, these upgrades elevate your system from “traced” to measured, from “visible” to actionable.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Citation-Information"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Citation-Information">Citation Information</a></h3>



<p><strong>Singh, V.</strong> “Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted),” <em>PyImageSearch</em>, S. Huot, A. Sharma, and P. Thakur, eds., 2026, <a href="https://pyimg.co/24p06" target="_blank" rel="noreferrer noopener">https://pyimg.co/24p06</a></p>



<pre class="EnlighterJSRAW" data-enlighter-language="raw" data-enlighter-theme="classic" data-enlighter-highlight="" data-enlighter-linenumbers="false" data-enlighter-lineoffset="" data-enlighter-title="Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)" data-enlighter-group="28">@incollection{Singh_2026_manual-tracing-scores-evaluation-langfuse-self-hosted,
  author = {Vikram Singh},
  title = {{Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)}},
  booktitle = {PyImageSearch},
  editor = {Susan Huot and Aditya Sharma and Piyush Thakur},
  year = {2026},
  url = {https://pyimg.co/24p06},
}
</pre>



<p><strong>To download the source code to this post (and be notified when future tutorials are published here on PyImageSearch), </strong><em><strong>simply enter your email address in the form below!</strong></em></p>



<div id="download-the-code" class="post-cta-wrap">
<div class="gpd-post-cta">
	<div class="gpd-post-cta-content">
		

			<div class="gpd-post-cta-top">
				<div class="gpd-post-cta-top-image"><img decoding="async" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?lossy=2&strip=1&webp=1" alt="" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?lossy=2&strip=1&webp=1 410w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?size=126x174&lossy=2&strip=1&webp=1 126w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?size=252x348&lossy=2&strip=1&webp=1 252w" sizes="(max-width: 410px) 100vw, 410px" /></div>
				
				<div class="gpd-post-cta-top-title"><h4>Download the Source Code and FREE 17-page Resource Guide</h4></div>
				<div class="gpd-post-cta-top-desc"><p>Enter your email address below to get a .zip of the code and a <strong>FREE 17-page Resource Guide on Computer Vision, OpenCV, and Deep Learning.</strong> Inside you'll find my hand-picked tutorials, books, courses, and libraries to help you master CV and DL!</p></div>


			</div>

			<div class="gpd-post-cta-bottom">
				<form id="footer-cta-code" class="footer-cta" action="https://www.getdrip.com/forms/4130035/submissions" method="post" target="_blank" data-drip-embedded-form="4130035">
					<input name="fields[email]" type="email" value="" placeholder="Your email address" class="form-control" />

					<button type="submit">Download the code!</button>

					<div style="display: none;" aria-hidden="true"><label for="website">Website</label><br /><input type="text" id="website" name="website" tabindex="-1" autocomplete="false" value="" /></div>
				</form>
			</div>


		
	</div>

</div>
</div>
<p>The post <a rel="nofollow" href="https://pyimagesearch.com/2026/05/25/manual-tracing-scores-and-evaluation-with-langfuse-self-hosted/">Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)</a> appeared first on <a rel="nofollow" href="https://pyimagesearch.com">PyImageSearch</a>.</p>
]]></content:encoded>
					
		
		
			</item>
		<item>
		<title>LLM Observability with Self-Hosted Langfuse and vLLM</title>
		<link>https://pyimagesearch.com/2026/05/18/llm-observability-with-self-hosted-langfuse-and-vllm/</link>
		
		<dc:creator><![CDATA[Vikram Singh]]></dc:creator>
		<pubDate>Mon, 18 May 2026 12:45:00 +0000</pubDate>
				<category><![CDATA[Docker]]></category>
		<category><![CDATA[Generative AI]]></category>
		<category><![CDATA[LLMOps]]></category>
		<category><![CDATA[MLOps]]></category>
		<category><![CDATA[Observability]]></category>
		<category><![CDATA[Tutorial]]></category>
		<category><![CDATA[docker compose]]></category>
		<category><![CDATA[generative ai]]></category>
		<category><![CDATA[langfuse]]></category>
		<category><![CDATA[langfuse dashboard]]></category>
		<category><![CDATA[latency monitoring]]></category>
		<category><![CDATA[llm monitoring]]></category>
		<category><![CDATA[llm observability]]></category>
		<category><![CDATA[llm pipeline]]></category>
		<category><![CDATA[llm tracing]]></category>
		<category><![CDATA[llmops]]></category>
		<category><![CDATA[local llm inference]]></category>
		<category><![CDATA[mlops]]></category>
		<category><![CDATA[observability stack]]></category>
		<category><![CDATA[openai compatible api]]></category>
		<category><![CDATA[postgresql]]></category>
		<category><![CDATA[prompt tracing]]></category>
		<category><![CDATA[self-hosted llm]]></category>
		<category><![CDATA[token usage]]></category>
		<category><![CDATA[trace visualization]]></category>
		<category><![CDATA[tutorial]]></category>
		<category><![CDATA[vllm]]></category>
		<category><![CDATA[vllm docker]]></category>
		<guid isPermaLink="false">https://pyimagesearch.com/?p=53755</guid>

					<description><![CDATA[<p>Table of Contents LLM Observability with Self-Hosted Langfuse and vLLM Introduction to LLM Observability with Langfuse How Langfuse Fits into an LLM Observability Stack Langfuse Architecture for LLM Observability Why Understanding LLM Observability Architecture Matters Setting Up a Self-Hosted Langfuse&#8230;</p>
<p>The post <a rel="nofollow" href="https://pyimagesearch.com/2026/05/18/llm-observability-with-self-hosted-langfuse-and-vllm/">LLM Observability with Self-Hosted Langfuse and vLLM</a> appeared first on <a rel="nofollow" href="https://pyimagesearch.com">PyImageSearch</a>.</p>
]]></description>
										<content:encoded><![CDATA[<div class="yoast-breadcrumbs"><span><span><a href="https://pyimagesearch.com/">Home</a></span></span></div>


<div class="toc">
<hr class="TOC"/>
<p class="has-large-font-size"><strong>Table of Contents</strong></p>
<ul>
    <li id="TOC-h1-LLM-Observability-Self-Hosted-Langfuse-vLLM"><a rel="noopener" target="_blank" href="#h1-LLM-Observability-Self-Hosted-Langfuse-vLLM">LLM Observability with Self-Hosted Langfuse and vLLM</a></li>

  <li id="TOC-h2-Introduction-LLM-Observability-Langfuse"><a rel="noopener" target="_blank" href="#h2-Introduction-LLM-Observability-Langfuse">Introduction to LLM Observability with Langfuse</a></li>

    <li id="TOC-h2-How-Langfuse-Fits-LLM-Observability-Stack"><a rel="noopener" target="_blank" href="#h2-How-Langfuse-Fits-LLM-Observability-Stack">How Langfuse Fits into an LLM Observability Stack</a></li>

    <li id="TOC-h2-Langfuse-Architecture-LLM-Observability"><a rel="noopener" target="_blank" href="#h2-Langfuse-Architecture-LLM-Observability">Langfuse Architecture for LLM Observability</a></li>

    <li id="TOC-h2-Why-Understanding-LLM-Observability-Architecture-Matters"><a rel="noopener" target="_blank" href="#h2-Why-Understanding-LLM-Observability-Architecture-Matters">Why Understanding LLM Observability Architecture Matters</a></li>

    <li id="TOC-h2-Setting-Up-Self-Hosted-Langfuse-vLLM-Stack"><a rel="noopener" target="_blank" href="#h2-Setting-Up-Self-Hosted-Langfuse-vLLM-Stack">Setting Up a Self-Hosted Langfuse and vLLM Stack</a></li>

    <li id="TOC-h2-Baseline-LLM-Application-Before-Observability"><a rel="noopener" target="_blank" href="#h2-Baseline-LLM-Application-Before-Observability">Baseline LLM Application (Before Observability)</a></li>

    <li id="TOC-h2-Adding-LLM-Observability-Langfuse-observe-Decorator"><a rel="noopener" target="_blank" href="#h2-Adding-LLM-Observability-Langfuse-observe-Decorator">Adding LLM Observability with the Langfuse @observe Decorator</a></li>

    <li id="TOC-h2-Running-Verifying-Self-Hosted-Langfuse-Observability-Stack"><a rel="noopener" target="_blank" href="#h2-Running-Verifying-Self-Hosted-Langfuse-Observability-Stack">Running and Verifying a Self-Hosted Langfuse Observability Stack</a></li>

    <li id="TOC-h2-Summary"><a rel="noopener" target="_blank" href="#h2-Summary">Summary</a></li>
</ul>
</div>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h1-LLM-Observability-Self-Hosted-Langfuse-vLLM"/>



<h2 class="wp-block-heading"><a href="#TOC-h1-LLM-Observability-Self-Hosted-Langfuse-vLLM">LLM Observability with Self-Hosted Langfuse and vLLM</a></h2>



<p>In this lesson, you will finally demystify what Large Language Model (LLM) observability actually is. It is not just logs or print statements. It is a full, end-to-end view of how your AI system behaves in real-world conditions.</p>


<div class="wp-block-image">
<figure class="aligncenter size-full"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/llm-observability-self-hosted-langfuse-vllm-featured.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="940" height="780" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/llm-observability-self-hosted-langfuse-vllm-featured.png?lossy=2&strip=1&webp=1" alt="llm-observability-self-hosted-langfuse-vllm-featured.png" class="wp-image-53802" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/llm-observability-self-hosted-langfuse-vllm-featured.png?size=126x105&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/llm-observability-self-hosted-langfuse-vllm-featured-300x249.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/llm-observability-self-hosted-langfuse-vllm-featured.png?size=378x314&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/llm-observability-self-hosted-langfuse-vllm-featured.png?size=504x418&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/llm-observability-self-hosted-langfuse-vllm-featured.png?size=630x523&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/llm-observability-self-hosted-langfuse-vllm-featured-768x637.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/llm-observability-self-hosted-langfuse-vllm-featured.png?lossy=2&amp;strip=1&amp;webp=1 940w" sizes="(max-width: 630px) 100vw, 630px" /></a></figure></div>


<p>You will learn why modern LLM apps need more than “it works on my machine,” and how traces, token usage, latency, and model interactions become powerful tools for debugging and optimization.</p>



<p>Next, you will roll up your sleeves and self-host <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> locally, connect it to a <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM server</code>, and run your first fully instrumented LLM pipeline from prompt to response.</p>



<p>By the end, you will be exploring live traces in the <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse UI</code>, inspecting individual requests, understanding where time is spent, and building a solid foundation for debugging, improving, and scaling every LLM workflow you create.</p>



<p>This lesson is the 1st in a 3-part series on <strong>LLM observability with Langfuse</strong>:</p>



<ol class="wp-block-list">
<li><em><strong><a href="https://pyimg.co/tadoh" target="_blank" rel="noreferrer noopener">LLM Observability with Self-Hosted Langfuse and vLLM</a></strong></em><strong> (this tutorial)</strong></li>



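<li><em><a href="https://pyimg.co/24p06" target="_blank" rel="noreferrer noopener">Manual Tracing, Scores, and Evaluation with Langfuse (Self-Hosted)</a></em></li>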



<li><em>Lesson 3</em></li>
</ol>



<p><strong>To learn how to self-host Langfuse, connect it to vLLM, and build end-to-end LLM observability from the ground up,</strong><em><strong> just keep reading.</strong></em></p>



<div id="pyi-source-code-block" class="source-code-wrap"><div class="gpd-source-code">
    <div class="gpd-source-code-content">
        <img decoding="async" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/source-code-icon.png?lossy=2&strip=1&webp=1" alt="">
        <h4>Looking for the source code to this post?</h4>
                    <a href="#download-the-code" class="pyis-cta-modal-open-modal">Jump Right To The Downloads Section <svg class="svg-icon arrow-right" width="12" height="12" aria-hidden="true" role="img" focusable="false" viewBox="0 0 14 14" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M6.8125 0.1875C6.875 0.125 6.96875 0.09375 7.09375 0.09375C7.1875 0.09375 7.28125 0.125 7.34375 0.1875L13.875 6.75C13.9375 6.8125 14 6.90625 14 7C14 7.125 13.9375 7.1875 13.875 7.25L7.34375 13.8125C7.28125 13.875 7.1875 13.9062 7.09375 13.9062C6.96875 13.9062 6.875 13.875 6.8125 13.8125L6.1875 13.1875C6.125 13.125 6.09375 13.0625 6.09375 12.9375C6.09375 12.8438 6.125 12.75 6.1875 12.6562L11.0312 7.8125H0.375C0.25 7.8125 0.15625 7.78125 0.09375 7.71875C0.03125 7.65625 0 7.5625 0 7.4375V6.5625C0 6.46875 0.03125 6.375 0.09375 6.3125C0.15625 6.25 0.25 6.1875 0.375 6.1875H11.0312L6.1875 1.34375C6.125 1.28125 6.09375 1.1875 6.09375 1.0625C6.09375 0.96875 6.125 0.875 6.1875 0.8125L6.8125 0.1875Z" fill="#169FE6"></path></svg></a>
            </div>
</div>
</div>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Introduction-LLM-Observability-Langfuse"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Introduction-LLM-Observability-Langfuse">Introduction to LLM Observability with Langfuse</a></h2>



<p>Modern LLM applications behave very differently from traditional software. They are probabilistic, non-deterministic, sensitive to prompt phrasing, and often expensive to run. Debugging them requires far more than print statements or simple application logs — <em>you need visibility into how your entire LLM pipeline behaves at runtime.</em></p>



<p>This section introduces the foundations of LLM observability, explains why classical ML monitoring tools fall short, and sets the stage for building a complete <strong>self-hosted </strong><code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code><strong> + </strong><code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code><strong> observability stack</strong>.</p>



<h3 class="wp-block-heading">What Problem Does LLM Observability Solve?</h3>



<p>LLMs fail in ways ordinary software doesn’t:</p>



<ul class="wp-block-list">
<li>They <strong>hallucinate</strong> confidently.</li>



<li>They produce different answers for the same input.</li>



<li>They slow down under load due to tokenizer/model server issues.</li>



<li>They cost real money per token.</li>



<li>They silently degrade when context windows overflow.</li>



<li>They chain multiple steps, making errors hard to pinpoint.</li>
</ul>



<p>Without observability, you are essentially debugging blind.</p>



<p>LLM observability gives you visibility into:</p>



<ul class="wp-block-list">
<li>What prompt was sent?</li>



<li>What did the LLM actually output?</li>



<li>How long did it take?</li>



<li>How many tokens did it use?</li>



<li>Where did a pipeline fail?</li>



<li>Was this output good or bad?</li>



<li>What downstream component was impacted?</li>
</ul>



<p>In short: Observability turns your LLM pipeline from a black box into a glass box.</p>



<h3 class="wp-block-heading">Logs vs Metrics vs Traces (Why Logs Alone Fail)</h3>



<p>Modern systems use 3 observability pillars:</p>



<h4 class="wp-block-heading">Logs</h4>



<p>Unstructured text messages. Good for errors; terrible for understanding multi-step LLM pipelines.</p>



<h4 class="wp-block-heading">Metrics</h4>



<p>Numerical time-series (e.g., latency, tokens/sec). Good for dashboards and alerts.</p>



<h4 class="wp-block-heading">Traces</h4>



<p>End-to-end structured records of what happened across a pipeline.</p>



<p>Traces are <strong>THE critical component for LLM apps</strong> because a single request may produce:</p>



<ul class="wp-block-list">
<li>multiple sub-steps</li>



<li>multiple model calls</li>



<li>embeddings</li>



<li>retrieval calls</li>



<li>tool invocations</li>



<li>agent planning</li>



<li>scoring</li>



<li>post-processing</li>
</ul>



<p><strong>Logs tell you what happened. Metrics tell you how often. Traces tell you why.</strong></p>


<div class="wp-block-image">
<figure class="aligncenter size-full"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-36.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="624" height="332" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-36.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53804" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-36.png?size=126x67&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-36-300x160.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-36.png?size=378x201&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-36.png?size=504x268&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-36.png?lossy=2&amp;strip=1&amp;webp=1 624w" sizes="(max-width: 624px) 100vw, 624px" /></a><figcaption class="wp-element-caption"><strong>Figure 1:</strong> Logs tell you what happened, metrics tell you how your system behaves over time, but traces show you the entire LLM pipeline step by step.</figcaption></figure></div>


<h3 class="wp-block-heading">Why LLM Apps Require Traces, Not Just Logs</h3>



<p>LLM-specific debugging demands visibility into things you cannot get from logging alone:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">Prompt tracking</code>: See every prompt, <code data-enlighter-language="python" class="EnlighterJSRAW">system</code> message, and <code data-enlighter-language="python" class="EnlighterJSRAW">user</code> message.</li>



<li><strong>Chain-of-thought structure:</strong> Even if the reasoning itself is hidden, you can capture the high-level execution steps.</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">Latency breakdown</code>: Where time is spent: tokenization? forward pass? retrieval?</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">Token usage visibility</code>: Cost control + throughput estimation.</li>



<li><strong>Hallucination hotspots: </strong>Which prompts or contexts fail most?</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">Pipeline correctness</code>: Observations from retrieval → reasoning → generation.</li>
</ul>


<div class="wp-block-image">
<figure class="aligncenter size-full"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-37.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="624" height="332" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-37.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53806" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-37.png?size=126x67&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-37-300x160.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-37.png?size=378x201&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-37.png?size=504x268&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-37.png?lossy=2&amp;strip=1&amp;webp=1 624w" sizes="(max-width: 624px) 100vw, 624px" /></a><figcaption class="wp-element-caption"><strong>Figure 2:</strong> LLM pipelines fail in subtle ways, including hallucinations, slowdowns, bad retrievals, and token spikes. Observability exposes these problems before users do.</figcaption></figure></div>


<h3 class="wp-block-heading">What Is Langfuse? (And Why It Is the Right Tool)</h3>



<p><strong><a href="https://langfuse.com" target="_blank" rel="noreferrer noopener">Langfuse</a></strong> is an open-source observability platform designed specifically for LLM apps. It captures:</p>



<ul class="wp-block-list">
<li>Traces</li>



<li>Spans</li>



<li>Prompt metadata</li>



<li>Inputs and outputs</li>



<li>Token usage</li>



<li>Latencies</li>



<li>Scores (quality, correctness, safety)</li>
</ul>



<p>…and displays them in a clean, production-grade UI.</p>


<div class="wp-block-image">
<figure class="aligncenter size-full"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-38.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="624" height="249" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-38.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53807" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-38.png?size=126x50&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-38-300x120.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-38.png?size=378x151&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-38.png?size=504x201&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-38.png?lossy=2&amp;strip=1&amp;webp=1 624w" sizes="(max-width: 624px) 100vw, 624px" /></a><figcaption class="wp-element-caption"><strong>Figure 3:</strong> <code>Langfuse</code> Preview</figcaption></figure></div>


<p>You can think of it as:</p>



<p>“<code data-enlighter-language="python" class="EnlighterJSRAW">Prometheus</code> + <code data-enlighter-language="python" class="EnlighterJSRAW">Grafana</code> + <code data-enlighter-language="python" class="EnlighterJSRAW">MLflow</code>, but specifically for LLM pipelines.”</p>



<h4 class="wp-block-heading">Why Not MLflow or Weights &amp; Biases?</h4>


<div class="wp-block-image">
<figure class="aligncenter size-full"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-39.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="624" height="340" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-39.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53809" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-39.png?size=126x69&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-39-300x163.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-39.png?size=378x206&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-39.png?size=504x275&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-39.png?lossy=2&amp;strip=1&amp;webp=1 624w" sizes="(max-width: 624px) 100vw, 624px" /></a><figcaption class="wp-element-caption"><strong>Figure 4:</strong> LLM applications require observability during inference rather than training, which is where <code>Langfuse</code> provides the most value.</figcaption></figure></div>


<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-How-Langfuse-Fits-LLM-Observability-Stack"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-How-Langfuse-Fits-LLM-Observability-Stack">How Langfuse Fits into an LLM Observability Stack</a></h2>



<p>Before building anything, consider the mental model:</p>



<ul class="wp-block-list">
<li><strong>Your Python LLM app: </strong>Sends prompts and metadata</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse SDK</code>: Records traces locally inside your code</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">vLLM Server</code> (port <code data-enlighter-language="python" class="EnlighterJSRAW">8000</code>): Handles the actual model inference</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse Server</code> (port <code data-enlighter-language="python" class="EnlighterJSRAW">3000</code>): Receives trace data from the SDK</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse Worker</code>: Aggregates, transforms, and prepares data for the dashboard</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">PostgreSQL</code> database: Stores all traces, spans, scores, and token counts</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse UI</code> dashboard: Displays everything in real time</li>
</ul>



<p>This flow is the backbone of LLM observability.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-40-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="492" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-40-1024x492.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53811" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-40.png?size=126x61&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-40-300x144.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-40.png?size=378x182&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-40.png?size=504x242&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-40.png?size=630x303&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-40-768x369.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-40-1024x492.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-40-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-40-1536x738.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 5:</strong> The <code>Langfuse SDK</code> logs trace data inside your Python app, the <code>Langfuse Server</code> stores it in <code>PostgreSQL</code>, and the <code>Worker</code> powers the real-time dashboard.</figcaption></figure></div>


<h3 class="wp-block-heading">Why Self-Hosted Langfuse Instead of Cloud?</h3>



<p>When we first integrated <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse Cloud</code> during development, we immediately ran into:</p>



<ul class="wp-block-list">
<li>trace delivery delays</li>



<li>out-of-order spans</li>



<li>slow UI updates</li>



<li>unreliable real-time feedback</li>
</ul>



<p>This is a <em>problem</em> when you are developing an agent or <code data-enlighter-language="python" class="EnlighterJSRAW">RAG</code> system and need to see:</p>



<ul class="wp-block-list">
<li>the exact prompt</li>



<li>the exact context</li>



<li>the exact output</li>



<li>the exact cost</li>



<li><strong>immediately</strong> after running your script.</li>
</ul>



<p>So we switched to:</p>



<h4 class="wp-block-heading">Self-Hosted Langfuse + Local vLLM</h4>



<p>Benefits:</p>



<ul class="wp-block-list">
<li>Real-time, near-instant traces</li>



<li>Fully local development</li>



<li>No Internet dependency</li>



<li>Faster iteration loops</li>



<li>Full control of database and dashboard</li>



<li>Ideal for agent debugging and RAG evaluation</li>
</ul>



<p>📌 <strong>NOTE</strong></p>



<p><em>We still show the Cloud API flow briefly, but everything you build in this module uses the self-hosted setup for real-time performance.</em></p>



<h3 class="wp-block-heading">What You Will Build in This Lesson</h3>



<p>By the end of Lesson 1, you will have a complete local observability foundation:</p>



<h3 class="wp-block-heading">Infrastructure</h3>



<ul class="wp-block-list">
<li>Self-hosted <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse Server</code></li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse Worker</code> (required for dashboards)</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">PostgreSQL</code> database</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> model server (<code data-enlighter-language="python" class="EnlighterJSRAW">OpenAI</code>-compatible API)</li>
</ul>



<h3 class="wp-block-heading">Tracing Skills</h3>



<ul class="wp-block-list">
<li>How to instrument an LLM call</li>



<li>How to build hierarchical traces (pipeline → model call)</li>



<li>How to log prompts, outputs, latencies, and token usage</li>



<li>How to visualize traces instantly in the <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> User Interface (UI) </li>
</ul>



<h3 class="wp-block-heading">What You Will Actually Run</h3>



<ul class="wp-block-list">
<li>Decorator-based tracing (<code data-enlighter-language="python" class="EnlighterJSRAW">tracing_decorator.py</code>)</li>



<li>Baseline app with <strong>no tracing</strong> (<code data-enlighter-language="python" class="EnlighterJSRAW">basic_llm_app.py</code>)</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code>-connected LLM client (<code data-enlighter-language="python" class="EnlighterJSRAW">llm_utils.py</code>)</li>



<li>Config loaders (<code data-enlighter-language="python" class="EnlighterJSRAW">config.py</code>)</li>
</ul>



<figure class="wp-block-image aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-41-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="462" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-41-1024x462.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53814" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-41.png?size=126x57&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-41-300x135.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-41.png?size=378x171&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-41.png?size=504x227&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-41.png?size=630x284&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-41-768x346.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-41-1024x462.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-41-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-41-1536x693.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 6:</strong> Our self-hosted stack: <code>vLLM</code> handles inference, the <code>Langfuse</code> Software Development Kit (SDK) records traces, and the <code>Langfuse Server</code> + <code>Langfuse Worker</code> + <code>PostgreSQL</code> power the observability dashboard.</figcaption></figure>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Langfuse-Architecture-LLM-Observability"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Langfuse-Architecture-LLM-Observability">Langfuse Architecture for LLM Observability</a></h2>



<p>Before we start installing anything, let us zoom out and understand the architecture of the observability stack you are about to build. <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> is not just a dashboard. It is a coordinated system of services that receives traces, stores them, aggregates them, and displays them in real time. Your LLM app, <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code>, and <code data-enlighter-language="python" class="EnlighterJSRAW">PostgreSQL</code> all work together to form a complete observability pipeline.</p>



<p>Think of this section as building your mental model. Once you understand these flows, all the <code data-enlighter-language="python" class="EnlighterJSRAW">Docker</code> configuration, YAML files, keys, and scripts will make perfect sense.</p>



<h3 class="wp-block-heading">The High-Level Architecture</h3>



<p>At the core, your pipeline is simple:</p>



<ul class="wp-block-list">
<li>Your Python LLM app: executes inference and logs traces</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> Python SDK: captures all observability data</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">vLLM Server</code>: handles the actual LLM generation</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse Server</code>: receives trace, span, and token data</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">PostgreSQL</code>: stores all traces, metadata, and scores</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse Worker</code>: aggregates data for dashboards</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse UI</code>: visualizes everything instantly</li>
</ul>



<p>This architecture ensures that every LLM call becomes a structured trace that you can drill into, including latencies, inputs, outputs, steps, errors, and token details.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-42-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="367" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-42-1024x367.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53816" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-42.png?size=126x45&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-42-300x107.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-42.png?size=378x135&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-42.png?size=504x181&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-42.png?size=630x226&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-42-768x275.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-42-1024x367.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-42-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-42-1536x550.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 7:</strong> Your Python app calls <code>vLLM</code> for inference and the <code>Langfuse SDK</code> for tracing. The <code>Langfuse Server</code> stores data in <code>PostgreSQL</code>, the <code>Langfuse Worker</code> processes it, and the UI displays it.</figcaption></figure></div>


<h3 class="wp-block-heading">How a Single LLM Request Turns Into a Trace</h3>



<p>Every time your code calls <code data-enlighter-language="python" class="EnlighterJSRAW">client.chat.completions.create(...)</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> performs 3 major steps behind the scenes:</p>



<ul class="wp-block-list">
<li><strong>Observe the call:</strong> capture input, parameters, metadata.</li>



<li><strong>Record the output:</strong> LLM response, tokens, shapes, errors.</li>



<li><strong>Create and update a trace hierarchy:</strong> pipeline spans, child spans, nested steps.</li>
</ul>



<p>For example:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="1">llm_pipeline (trace)
    ├── retrieve_context (span)
    ├── rerank_candidates (span)
    └── generate_answer (span)
</pre>



<p>Even in Lesson 1 (where we only use decorators), you will already produce parent → child traces automatically.</p>



<p>Without this structure, debugging multi-step LLM pipelines becomes guesswork.</p>


<div class="wp-block-image">
<figure class="aligncenter size-full"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-43.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="624" height="337" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-43.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53818" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-43.png?size=126x68&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-43-300x162.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-43.png?size=378x204&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-43.png?size=504x272&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-43.png?lossy=2&amp;strip=1&amp;webp=1 624w" sizes="(max-width: 624px) 100vw, 624px" /></a><figcaption class="wp-element-caption"><strong>Figure 8:</strong> Every LLM request becomes a structured trace: your app → <code>Langfuse SDK</code> → <code>Langfuse Server</code> → stored in <code>PostgreSQL</code> → visualized in real time.</figcaption></figure></div>


<h3 class="wp-block-heading">The Four Core Components You Will Deploy</h3>



<p>You will deploy <strong>4 services</strong> using <code data-enlighter-language="python" class="EnlighterJSRAW">Docker Compose</code>:</p>



<h3 class="wp-block-heading">1. vLLM Server (Port 8000)</h3>



<p>Your local LLM inference engine.</p>



<p>It exposes an <code data-enlighter-language="python" class="EnlighterJSRAW">OpenAI</code>-compatible Application Programming Interface (API) endpoint:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="2">http://localhost:8000/v1
</pre>



<p>Your Python scripts send prompts here.</p>



<h3 class="wp-block-heading">2. Langfuse Server (Port 3000)</h3>



<p>The brains of the observability system.</p>



<p>It receives traces from the Python SDK, stores them, and exposes the dashboard.</p>



<h3 class="wp-block-heading">3. Langfuse Worker</h3>



<p>Most tutorials miss this, but you cannot get dashboards without the <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse Worker</code>.</p>



<p>It processes:</p>



<ul class="wp-block-list">
<li>aggregations</li>



<li>analytics</li>



<li>score updates</li>



<li>background tasks</li>
</ul>



<p>Without the <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse Worker</code>:</p>



<p><strong>you will see traces, but your dashboard will be empty.</strong></p>



<h3 class="wp-block-heading">4. PostgreSQL (Port 5433 → 5432)</h3>



<p>Stores everything:</p>



<ul class="wp-block-list">
<li>traces</li>



<li>spans</li>



<li>metadata</li>



<li>scores</li>



<li>projects</li>



<li>settings</li>
</ul>



<p>It provides the persistence layer that the <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse Server</code> depends on.</p>


<div class="wp-block-image">
<figure class="aligncenter size-full is-resized"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-44.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="619" height="317" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-44.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53821" style="width:639px;height:auto" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-44.png?size=126x65&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-44-300x154.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-44.png?size=378x194&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-44.png?size=504x258&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-44.png?lossy=2&amp;strip=1&amp;webp=1 619w" sizes="(max-width: 619px) 100vw, 619px" /></a><figcaption class="wp-element-caption"><strong>Figure 9:</strong> The self-hosted <code>Langfuse</code> stack includes <code>vLLM</code> for inference, <code>Langfuse Server</code> for ingestion, <code>Langfuse Worker</code> for dashboards, and <code>PostgreSQL</code> for storage.</figcaption></figure></div>


<h3 class="wp-block-heading">How These Components Communicate (Data Flow)</h3>



<p>Let us make the full pipeline explicit:</p>



<ul class="wp-block-list">
<li><strong>Your script</strong> sends an inference request to <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code>.</li>



<li>The <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse SDK</code> in your script sends trace info to <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse Server</code>.</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse Server</code> writes raw trace data into <code data-enlighter-language="python" class="EnlighterJSRAW">PostgreSQL</code>.</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse Worker</code> processes raw data to generate:
<ul class="wp-block-list">
<li>analytics</li>



<li>histograms</li>



<li>span trees</li>



<li>scores</li>
</ul>
</li>



<li>The <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse Web UI</code> reads processed data and displays:
<ul class="wp-block-list">
<li>full trace trees</li>



<li>input/output pairs</li>



<li>token usage</li>



<li>latency heatmaps</li>



<li>error stacks</li>
</ul>
</li>
</ul>



<p>This is the “observability heartbeat” that runs for every request.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-45-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="461" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-45-1024x461.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53823" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-45.png?size=126x57&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-45-300x135.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-45.png?size=378x170&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-45.png?size=504x227&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-45.png?size=630x284&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-45-768x346.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-45-1024x461.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-45-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-45-1536x691.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 10:</strong> A complete view of how inference and tracing flow through your stack, from your Python script to the final <code>Langfuse</code> dashboard.</figcaption></figure></div>


<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Why-Understanding-LLM-Observability-Architecture-Matters"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Why-Understanding-LLM-Observability-Architecture-Matters">Why Understanding LLM Observability Architecture Matters</a></h2>



<p>Before diving into code, it is important to visualize this system because:</p>



<ul class="wp-block-list">
<li>It prevents confusion when running <code data-enlighter-language="python" class="EnlighterJSRAW">Docker</code> for the first time.</li>



<li>You will instantly understand errors like “Worker not running” or “Database unavailable”.</li>



<li>You will know exactly where to look when traces do not appear.</li>



<li>You will develop intuition about how requests become saved spans.</li>
</ul>



<p>Once this architectural layer clicks, every service in <code data-enlighter-language="python" class="EnlighterJSRAW">docker-compose.yml</code>, every script in <code data-enlighter-language="python" class="EnlighterJSRAW">src/</code>, and every dashboard panel will feel obvious.</p>



<hr class="wp-block-separator has-alpha-channel-opacity"/>



<p>Would you like immediate access to 3,457 images curated and labeled with hand gestures to train, explore, and experiment with &#8230; for free? Head over to <a href="https://universe.roboflow.com/isl/az-6mqow?ref=pyimagesearch" target="_blank" rel="noreferrer noopener">Roboflow</a> and get a free account to grab these hand gesture images. </p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Setting-Up-Self-Hosted-Langfuse-vLLM-Stack"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Setting-Up-Self-Hosted-Langfuse-vLLM-Stack">Setting Up a Self-Hosted Langfuse and vLLM Stack</a></h2>



<p>Before we can trace a single LLM call, we need to set up a clean project skeleton and a fully functioning self-hosted observability stack. In this section, you will configure the environment, install dependencies, review the project layout, understand each configuration file, and bring up the <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> + <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> infrastructure using <code data-enlighter-language="python" class="EnlighterJSRAW">Docker Compose</code>.</p>



<p>Everything that comes later (tracing, scoring, evaluation, debugging) depends on getting this foundation right.</p>



<h3 class="wp-block-heading">Project Structure Overview</h3>



<p>Here is the complete repository structure we will use throughout this lesson:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="3">├── configs
│   └── config.yaml
├── docker-compose.yml
├── README.md
├── requirements.txt
└── src
    ├── basic_llm_app.py
    ├── config.py
    ├── evaluation_metrics.py
    ├── health_check.py
    ├── llm_utils.py
    ├── run_all_examples.py
    ├── tracing_decorator.py
    └── tracing_manual.py
</pre>



<p>At a high level:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">configs/</code>: stores global configuration used by every example.</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">src/</code>: contains the LLM application scripts, utilities, and tracing examples.</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">docker-compose.yml</code>: defines the entire <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> + <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> infrastructure.</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">requirements.txt</code>: defines Python dependencies.</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">.env.example</code>: defines required environment variables (it lives in the project root and is not shown in the tree above).</li>
</ul>



<p>We will walk through each piece, focusing not on the logic inside every file, but on <em>how the system is designed</em> and <em>how everything connects</em>.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-46-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="544" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-46-1024x544.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53827" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-46.png?size=126x67&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-46-300x159.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-46.png?size=378x201&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-46.png?size=504x268&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-46.png?size=630x335&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-46-768x408.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-46-1024x544.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-46-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-46-1536x816.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 11:</strong> The project separates configuration, infrastructure, and application modules to keep <code>Langfuse</code> observability reusable across different LLM workflows.</figcaption></figure></div>


<h3 class="wp-block-heading">Installing Dependencies</h3>



<p>Install the required Python packages:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="4">pip install -r requirements.txt
</pre>



<p>The key dependencies in this project are intentionally minimal. We use the following packages:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">langfuse&gt;=2.0.0</code>: provides the observability SDK and the <code data-enlighter-language="python" class="EnlighterJSRAW">@observe</code> decorator</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">openai&gt;=1.0.0</code>: provides the client we use to call <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code>, which exposes an <code data-enlighter-language="python" class="EnlighterJSRAW">OpenAI</code>-compatible API endpoint</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">python-dotenv</code>: loads <code data-enlighter-language="python" class="EnlighterJSRAW">.env</code> environment variables</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">pyyaml</code>: reads configuration values from <code data-enlighter-language="python" class="EnlighterJSRAW">config.yaml</code></li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">httpx</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">requests</code>: handle health checks and HTTP communication</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">numpy</code>: supports scoring and numeric utilities</li>
</ul>



<p>Together, these packages form the lightweight foundation for our self-hosted observability stack.</p>



<h3 class="wp-block-heading">Configuring Environment Variables</h3>



<p>Copy <code data-enlighter-language="python" class="EnlighterJSRAW">.env.example</code> into <code data-enlighter-language="python" class="EnlighterJSRAW">.env</code>:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="5">cp .env.example .env
</pre>



<p>Then update the following values after starting <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code>:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="6">LANGFUSE_PUBLIC_KEY="pk-lf-xxxx"
LANGFUSE_SECRET_KEY="sk-lf-xxxx"
LANGFUSE_HOST=http://localhost:3000

OPENAI_BASE_URL=http://localhost:8000/v1
OPENAI_API_KEY=dummy
</pre>



<p>A few key points:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> <strong>keys</strong> come from your local dashboard once you create a project.</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> <strong>does not require authentication</strong>, but the <code data-enlighter-language="python" class="EnlighterJSRAW">OpenAI</code> client still requires an API key value, so <code data-enlighter-language="python" class="EnlighterJSRAW">"dummy"</code> works.</li>



<li>If you use <code data-enlighter-language="python" class="EnlighterJSRAW">Hugging Face</code> models that are not cached, you may need a token.</li>
</ul>



<p>This <code data-enlighter-language="python" class="EnlighterJSRAW">.env</code> file becomes the backbone for all examples.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-47-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="317" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-47-1024x317.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53830" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-47.png?size=126x39&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-47-300x93.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-47.png?size=378x117&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-47.png?size=504x156&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-47.png?size=630x195&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-47-768x237.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-47-1024x317.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-47-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-47-1536x475.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 12:</strong> <code>Langfuse</code> keys come from the local dashboard, while <code>vLLM</code> uses an <code>OpenAI</code>-compatible endpoint, with everything funneling into the <code>.env</code> file read by your Python scripts.</figcaption></figure></div>


<h3 class="wp-block-heading">Centralized Configuration (configs/config.yaml)</h3>



<p>Instead of scattering options across scripts, everything is configured through one <code data-enlighter-language="python" class="EnlighterJSRAW">YAML</code> file:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="yaml" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="7">llm:
  base_url: "http://localhost:8000/v1"
  model: "meta-llama/Llama-2-7b-chat-hf"
  temperature: 0.7
  max_tokens: 300

langfuse:
  host: "http://localhost:3000"
  project_name: "llm-observability-selfhosted"

evaluation:
  enable_scoring: true
  max_latency_ms: 5000
  min_length: 20
  good_length_threshold: 100
</pre>



<p>This allows you to:</p>



<ul class="wp-block-list">
<li>Switch models without changing code</li>



<li>Tune evaluation logic centrally</li>



<li>Redirect LLM traffic to remote endpoints if needed</li>



<li>Adjust <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse Server</code> location</li>
</ul>



<p>Every script loads from this file automatically.</p>



<h3 class="wp-block-heading">Utility Modules (src/config.py and src/llm_utils.py)</h3>



<p>These two utilities prevent duplication across all examples.</p>



<h4 class="wp-block-heading">config.py: Central Configuration Loader</h4>



<p>This module provides:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">load_config()</code>: returns parsed <code data-enlighter-language="python" class="EnlighterJSRAW">YAML</code> config</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">get_llm_config()</code>: returns <code data-enlighter-language="python" class="EnlighterJSRAW">model</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">temperature</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">max_tokens</code></li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">get_langfuse_config()</code>: returns <code data-enlighter-language="python" class="EnlighterJSRAW">host</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">project_name</code></li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">get_evaluation_config()</code>: returns scoring <code data-enlighter-language="python" class="EnlighterJSRAW">thresholds</code></li>
</ul>



<p>This keeps every script flexible and model-agnostic.</p>



<h4 class="wp-block-heading">llm_utils.py: Consistent vLLM Client Factory</h4>



<p><code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> supports the <code data-enlighter-language="python" class="EnlighterJSRAW">OpenAI</code> Python client natively:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="8">client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
</pre>



<p>This module wraps it into a reusable function:</p>



<ul class="wp-block-list">
<li>Validates environment variables</li>



<li>Loads model name from <code data-enlighter-language="python" class="EnlighterJSRAW">config.yaml</code></li>



<li>Handles default <code data-enlighter-language="python" class="EnlighterJSRAW">base_url</code></li>



<li>Sets request timeouts</li>



<li>Returns the <code data-enlighter-language="python" class="EnlighterJSRAW">(client, model)</code> tuple when requested</li>
</ul>



<p>Every tracing example uses this function.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-48-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="239" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-48-1024x239.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53833" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-48.png?size=126x29&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-48-300x70.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-48.png?size=378x88&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-48.png?size=504x118&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-48.png?size=630x147&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-48-768x179.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-48-1024x239.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-48-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-48-1536x358.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 13:</strong> <code>config.py</code> reads <code>YAML</code> → <code>llm_utils.py</code> builds a <code>vLLM</code> client → example scripts use both modules for consistent behavior.</figcaption></figure></div>


<h3 class="wp-block-heading">The Self-Hosted Stack (docker-compose.yml)</h3>



<p>This is the heart of the system.</p>



<p><code data-enlighter-language="python" class="EnlighterJSRAW">docker-compose.yml</code> defines:</p>



<h4 class="wp-block-heading">Langfuse Server</h4>



<ul class="wp-block-list">
<li>Runs the frontend + API</li>



<li>Exposes port <code data-enlighter-language="python" class="EnlighterJSRAW">3000</code></li>



<li>Performs authentication, API key creation, and trace storage</li>
</ul>



<h4 class="wp-block-heading">Langfuse Worker</h4>



<ul class="wp-block-list">
<li>Mandatory for dashboards</li>



<li>Processes traces</li>



<li>Updates analytics, charts, latency heatmaps</li>
</ul>



<h4 class="wp-block-heading">PostgreSQL</h4>



<ul class="wp-block-list">
<li>Persistence layer for traces, spans, scores</li>



<li>Exposed on port <code data-enlighter-language="python" class="EnlighterJSRAW">5433</code> (to avoid conflicts)</li>
</ul>



<h4 class="wp-block-heading">vLLM Model Server (GPU or CPU)</h4>



<ul class="wp-block-list">
<li>Exposes <code data-enlighter-language="python" class="EnlighterJSRAW">OpenAI</code>-compatible API at <code data-enlighter-language="python" class="EnlighterJSRAW">http://localhost:8000/v1</code></li>



<li>Runs <code data-enlighter-language="python" class="EnlighterJSRAW">Llama 2</code> by default</li>



<li>Enables fast, local inference for testing</li>
</ul>



<p>You can start everything with:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="9">docker-compose --profile gpu up -d
</pre>



<p>Or if you don’t have a GPU:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="10">docker-compose --profile cpu up -d
</pre>



<p>Verify services:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="11">docker-compose ps
</pre>



<p>Visit the <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> dashboard: <code data-enlighter-language="python" class="EnlighterJSRAW">http://localhost:3000</code></p>



<p><em><strong>Note:</strong> If you are running the server on a remote machine, do not forget to use SSH port forwarding. Otherwise, you will not be able to access the Langfuse UI dashboard from your local machine.</em></p>


<div class="wp-block-image">
<figure class="aligncenter size-full"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-49.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="624" height="340" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-49.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53835" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-49.png?size=126x69&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-49-300x163.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-49.png?size=378x206&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-49.png?size=504x275&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-49.png?lossy=2&amp;strip=1&amp;webp=1 624w" sizes="(max-width: 624px) 100vw, 624px" /></a><figcaption class="wp-element-caption"><strong>Figure 14:</strong> The <code>docker-compose</code> setup includes <code>Langfuse Server</code>, <code>Langfuse Worker</code>, <code>PostgreSQL</code>, and <code>vLLM</code>, with each container handling a distinct responsibility within the observability stack.</figcaption></figure></div>


<h3 class="wp-block-heading">Bringing Up the Entire Observability Stack</h3>



<p>Once configuration is in place:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="12">docker-compose --profile gpu up -d
</pre>



<p>Then:</p>



<ul class="wp-block-list">
<li>Go to <code data-enlighter-language="python" class="EnlighterJSRAW">http://localhost:3000</code></li>



<li>Create a project</li>



<li>Copy your <strong>public</strong> and <strong>secret</strong> keys</li>



<li>Paste them into <code data-enlighter-language="python" class="EnlighterJSRAW">.env</code></li>



<li>Restart your Python script</li>
</ul>



<p>You now have:</p>



<ul class="wp-block-list">
<li>A live model server (<code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code>)</li>



<li>A local observability platform (<code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code>)</li>



<li>A database storing every trace</li>



<li>Real-time dashboards</li>



<li>A clean Python project ready for tracing</li>
</ul>



<p>The foundation is complete. Next, we will write and trace our first LLM call.</p>



<hr class="wp-block-separator has-alpha-channel-opacity"/>



<!-- wp:paragraph -->
<h3>Need Help Configuring Your Development Environment?</h3>
<!-- /wp:paragraph -->

<!-- wp:image {"align":"center","id":18137,"sizeSlug":"large","linkDestination":"custom"} -->
<figure class="wp-block-image aligncenter size-large"><a href="https://pyimagesearch.com/pyimagesearch-university/" target="_blank" rel="noreferrer noopener"><img decoding="async" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-18137" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?lossy=2&strip=1&webp=1 500w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?size=126x84&lossy=2&strip=1&webp=1 126w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?size=252x168&lossy=2&strip=1&webp=1 252w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?size=378x253&lossy=2&strip=1&webp=1 378w" sizes="(max-width: 500px) 100vw, 500px" /></a><figcaption>Having trouble configuring your development environment? Want access to pre-configured Jupyter Notebooks running on Google Colab? Be sure to join <a href="https://pyimagesearch.com/pyimagesearch-university/" target="_blank" rel="noreferrer noopener" aria-label=" (opens in a new tab)">PyImageSearch University</a> — you will be up and running with this tutorial in a matter of minutes. </figcaption></figure>
<!-- /wp:image -->

<!-- wp:paragraph -->
<p>All that said, are you:</p>
<!-- /wp:paragraph -->

<!-- wp:list -->
<ul><li>Short on time?</li><li>Learning on your employer’s administratively locked system?</li><li>Wanting to skip the hassle of fighting with the command line, package managers, and virtual environments?</li><li><strong>Ready to run the code immediately on your Windows, macOS, or Linux system?</strong></li></ul>
<!-- /wp:list -->

<!-- wp:paragraph -->
<p>Then join <a href="https://pyimagesearch.com/pyimagesearch-university/" target="_blank" rel="noreferrer noopener">PyImageSearch University</a> today!</p>
<!-- /wp:paragraph -->

<!-- wp:paragraph -->
<p><strong>Gain access to Jupyter Notebooks for this tutorial and other PyImageSearch guides pre-configured to run on Google Colab’s ecosystem right in your web browser!</strong> No installation required.</p>
<!-- /wp:paragraph -->

<!-- wp:paragraph -->
<p>And best of all, these Jupyter Notebooks will run on Windows, macOS, and Linux!</p>
<!-- /wp:paragraph -->



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Baseline-LLM-Application-Before-Observability"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Baseline-LLM-Application-Before-Observability">Baseline LLM Application (Before Observability)</a></h2>



<p>Before we wire in <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code>, we need a clean baseline: a tiny LLM app that talks to <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code>, prints an answer, and knows nothing about traces, latency, or tokens.</p>



<p>This section walks through <code data-enlighter-language="python" class="EnlighterJSRAW">src/basic_llm_app.py</code> end-to-end so we have a clear “before” picture of life <strong>without </strong>observability.</p>



<h3 class="wp-block-heading">The Full Baseline Script</h3>



<p>Here is the full file we will dissect:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="13">"""
Basic LLM Application (No Tracing Baseline)

Simple pipeline using local vLLM server.
This version has NO tracing - compare with tracing_decorator.py
"""

from llm_utils import get_llm_client
from config import get_llm_config

# Initialize vLLM client with model from config
client, model = get_llm_client(load_model_from_config=True)
</pre>



<p>The docstring sets the tone very clearly:</p>



<p>This is a <strong>“no tracing”</strong> baseline that we will later compare against a traced version.</p>



<p>We import:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">get_llm_client</code>: a reusable helper that knows how to connect to <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> using <code data-enlighter-language="python" class="EnlighterJSRAW">OPENAI_BASE_URL</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">OPENAI_API_KEY</code>.</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">get_llm_config</code>: a small wrapper around <code data-enlighter-language="python" class="EnlighterJSRAW">config.yaml</code> so we don’t hardcode model parameters in the code.</li>
</ul>



<p><code data-enlighter-language="python" class="EnlighterJSRAW">client, model = get_llm_client(load_model_from_config=True)</code> gives us:</p>



<ul class="wp-block-list">
<li>an <code data-enlighter-language="python" class="EnlighterJSRAW">OpenAI</code>-compatible client already pointed at <code data-enlighter-language="python" class="EnlighterJSRAW">http://localhost:8000/v1</code></li>



<li>the model name loaded from <code data-enlighter-language="python" class="EnlighterJSRAW">configs/config.yaml</code>.</li>
</ul>



<p>At this point, the app can already talk to <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code>, but we still have <em>zero</em> observability.</p>



<h3 class="wp-block-heading">Generating an Answer (With No Tracing at All)</h3>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="14">def generate_answer(question: str) -> str:
    """Generate answer using vLLM - NO tracing."""
    # Load config
    llm_config = get_llm_config()
    temperature = llm_config.get("temperature", 0.7)
    max_tokens = llm_config.get("max_tokens", 300)
   
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": question}
            ],
            temperature=temperature,
            max_tokens=max_tokens
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error: {e}")
        print("Tip: Make sure vLLM is running (docker-compose up -d)")
        raise
</pre>



<h4 class="wp-block-heading">Loading config per call</h4>



<p>Inside <code data-enlighter-language="python" class="EnlighterJSRAW">generate_answer</code>, we first pull generation settings from <code data-enlighter-language="python" class="EnlighterJSRAW">YAML</code>:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">llm_config = get_llm_config()</code> loads the <code data-enlighter-language="python" class="EnlighterJSRAW">llm:</code> section from <code data-enlighter-language="python" class="EnlighterJSRAW">configs/config.yaml</code>.</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">temperature</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">max_tokens</code> are read with sensible defaults (<code data-enlighter-language="python" class="EnlighterJSRAW">0.7</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">300</code>) in case the config is missing keys.</li>
</ul>



<p>This keeps your generation parameters <strong>config-driven</strong>, not hardcoded, which is great for experiments, but still does not give you any tracing.</p>



<h4 class="wp-block-heading">Making the chat completion request</h4>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">try</code> block does a standard <code data-enlighter-language="python" class="EnlighterJSRAW">OpenAI</code>-style chat completion call:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">model=model</code> uses the <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code>-hosted Llama model from your config.</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">messages=[...]</code> constructs a simple conversation with:
<ul class="wp-block-list">
<li>a <code data-enlighter-language="python" class="EnlighterJSRAW">system</code> message: <code data-enlighter-language="python" class="EnlighterJSRAW">"You are a helpful assistant."</code></li>



<li>a <code data-enlighter-language="python" class="EnlighterJSRAW">user</code> message: the question string passed into the function.</li>
</ul>
</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">temperature</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">max_tokens</code> control creativity and output length.</li>
</ul>



<p><code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> behaves like the <code data-enlighter-language="python" class="EnlighterJSRAW">OpenAI</code> API here, so <code data-enlighter-language="python" class="EnlighterJSRAW">response.choices[0].message.content</code> gives us the generated answer, which is then returned.</p>



<h4 class="wp-block-heading">Error handling (still without observability)</h4>



<p>If anything goes wrong (<code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> not running, bad network, misconfiguration), the <code data-enlighter-language="python" class="EnlighterJSRAW">except</code> block:</p>



<ul class="wp-block-list">
<li>Prints the raw error message.</li>



<li>Prints a helpful hint: <code data-enlighter-language="python" class="EnlighterJSRAW">Make sure vLLM is running (docker-compose up -d)</code>.</li>



<li>Re-raises the exception so the script fails loudly.</li>
</ul>



<p>This is <strong>basic error handling</strong>, but notice what is still missing:</p>



<ul class="wp-block-list">
<li>No trace of which prompt failed.</li>



<li>No structured record of latency or context.</li>



<li>No way to inspect this error later in a dashboard.</li>
</ul>



<p>Even errors are invisible beyond your terminal scrollback.</p>



<h3 class="wp-block-heading">Running the “Invisible” Pipeline</h3>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="15">def run_simple_pipeline(question: str):
    """Simple pipeline without tracing - baseline example."""
    print(f"\n{'='*50}")
    print(f"Question: {question}")
    print(f"{'='*50}\n")
   
    print("Generating answer (no tracing)...")
    answer = generate_answer(question)
   
    print(f"✅ Answer:\n{answer}\n")
    print(f"{'='*50}\n")
</pre>



<p><code data-enlighter-language="python" class="EnlighterJSRAW">run_simple_pipeline</code> is deliberately small and linear:</p>



<ul class="wp-block-list">
<li>It prints a visual separator and echoes the question.</li>



<li>It calls <code data-enlighter-language="python" class="EnlighterJSRAW">generate_answer(question)</code>, the black-box LLM call.</li>



<li>It prints the answer and another separator.</li>
</ul>



<p>This gives you a <strong>nice terminal UX</strong>, but again, it is only surface-level:</p>



<ul class="wp-block-list">
<li>You see the <em>question</em> and <em>final answer</em>.</li>



<li>You do not see any internal steps.</li>



<li>You do not know how long it took.</li>



<li>You do not know how many tokens it used or how much it cost.</li>



<li>You cannot compare this run with previous ones.</li>
</ul>



<p>For anything beyond a toy demo, this is not enough.</p>



<h3 class="wp-block-heading">The __main__ Block</h3>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="16">if __name__ == "__main__":
    question = "What is machine learning?"
    run_simple_pipeline(question)
</pre>



<p>The entry point is intentionally as minimal as possible:</p>



<ul class="wp-block-list">
<li>It defines a simple default question: <code data-enlighter-language="python" class="EnlighterJSRAW">"What is machine learning?"</code></li>



<li>It calls <code data-enlighter-language="python" class="EnlighterJSRAW">run_simple_pipeline(question)</code></li>
</ul>



<p>This makes <code data-enlighter-language="python" class="EnlighterJSRAW">basic_llm_app.py</code> runnable as a one-shot script:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="17">python src/basic_llm_app.py
</pre>



<p>It is perfect for quick manual testing and serves as a <strong>control group</strong> when we later add <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> tracing and see how much more we can observe.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-50-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="339" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-50-1024x339.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53837" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-50.png?size=126x42&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-50-300x99.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-50.png?size=378x125&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-50.png?size=504x167&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-50.png?size=630x209&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-50-768x254.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-50-1024x339.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-50-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-50-1536x508.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 15:</strong> The baseline <code>vLLM</code> pipeline returns answers but offers zero insight into prompts, latency, token usage, or internal steps.</figcaption></figure></div>


<h3 class="wp-block-heading">Why This Baseline Is Not Enough</h3>



<p>With this script, your entire view of the system is:</p>



<ul class="wp-block-list">
<li>one printed question</li>



<li>one printed answer</li>



<li>and maybe an error line if something crashes</li>
</ul>



<p>You cannot answer:</p>



<ul class="wp-block-list">
<li>“Why was this slow?”</li>



<li>“What exact prompt + params did we send?”</li>



<li>“How many tokens did we consume?”</li>



<li>“Where did the pipeline fail?”</li>



<li>“Why is today’s behavior different from yesterday’s?”</li>
</ul>



<p>For serious LLM work involving <code data-enlighter-language="python" class="EnlighterJSRAW">RAG</code> systems, agents, evaluation runs, and A/B testing, this is <strong>debugging in the dark</strong>.</p>



<p>That is exactly what <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> is going to fix.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Adding-LLM-Observability-Langfuse-observe-Decorator"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Adding-LLM-Observability-Langfuse-observe-Decorator">Adding LLM Observability with the Langfuse @observe Decorator</a></h2>



<p>At this point, you have seen how an uninstrumented LLM pipeline behaves: it works, but it hides everything that matters. Now it is time to unlock <strong>real observability</strong> using the <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> <code data-enlighter-language="python" class="EnlighterJSRAW">@observe</code> decorator, the cleanest and most powerful way to add tracing in <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> 2.x.</p>



<p>In this section, we will transform the baseline pipeline into a fully observable workflow, capturing:</p>



<ul class="wp-block-list">
<li>prompts</li>



<li>outputs</li>



<li>latency</li>



<li>token usage</li>



<li>metadata</li>



<li>hierarchy of steps (pipeline → model call)</li>



<li>trace IDs you can click and inspect instantly in <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code></li>
</ul>



<p>This is where everything finally becomes visible.</p>



<h3 class="wp-block-heading">Imports, Initialization, and Configuration Logging</h3>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="18">import os
from langfuse.decorators import observe, langfuse_context
from llm_utils import get_llm_client
from config import get_llm_config
</pre>



<p>We import:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">observe</code> → adds tracing automatically</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">langfuse_context</code> → lets us update spans programmatically</li>



<li>our reusable LLM client and config loaders</li>
</ul>



<p>Before anything happens, the script prints the <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> configuration:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="19">print("\n" + "="*70)
print("🔧 LANGFUSE CONFIGURATION")
print("="*70)
print(f"📍 LANGFUSE_HOST: {os.getenv('LANGFUSE_HOST', 'NOT SET')}")
print(f"🔑 LANGFUSE_PUBLIC_KEY: {os.getenv('LANGFUSE_PUBLIC_KEY', 'NOT SET')[:20]}...")
print(f"🔐 LANGFUSE_SECRET_KEY: {os.getenv('LANGFUSE_SECRET_KEY', 'NOT SET')[:20]}...")
print("="*70 + "\n")
</pre>



<p>This is extremely practical.</p>



<p>It confirms:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> host</li>



<li>truncated keys</li>



<li>environment setup correctness</li>
</ul>



<p>If anything is misconfigured, this block saves you debugging time before you even send a single request.</p>



<p>Finally, we initialize the LLM client:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="20">client, model = get_llm_client(load_model_from_config=True)
</pre>



<p>The model name and base URL automatically load from the <code data-enlighter-language="python" class="EnlighterJSRAW">YAML</code> config.</p>



<h3 class="wp-block-heading">Tracing a Single LLM Call with @observe</h3>



<p>Here is the traced model-call function:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="21">@observe(name="generate_answer")
def generate_answer(question: str) -> str:
</pre>



<p>This single decorator:</p>



<ul class="wp-block-list">
<li>creates a <strong>new observation</strong></li>



<li>wraps the function execution</li>



<li>automatically timestamps execution</li>



<li>links child spans to parent spans</li>
</ul>



<h4 class="wp-block-heading">Step 1: Recording Inputs</h4>



<p>Inside the function, the first thing we do is explicitly log the input:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="22">langfuse_context.update_current_observation(
    input={"question": question, "model": model}
)
</pre>



<p>This ensures <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> displays:</p>



<ul class="wp-block-list">
<li>full question</li>



<li>selected model</li>



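<li><code data-enlighter-language="python" class="EnlighterJSRAW">temperature</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">max_tokens</code>, if you also add them to the <code data-enlighter-language="python" class="EnlighterJSRAW">input</code> dictionary (outputs are attached later)</li>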
</ul>



<h4 class="wp-block-heading">Step 2: Tracking Latency Manually</h4>



<p>Although <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> timestamps spans automatically, we want explicit latency measurement:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="23">import time
start_time = time.time()
</pre>



<p>Then we perform the <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> call:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="24">response = client.chat.completions.create(
    model=model,
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": question}
    ],
    temperature=temperature,
    max_tokens=max_tokens
)
</pre>



<h4 class="wp-block-heading">Step 3: Computing Latency + Extracting Answer</h4>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="25">latency_ms = (time.time() - start_time) * 1000
answer = response.choices[0].message.content
</pre>



<h3 class="wp-block-heading">Adding Outputs, Token Usage, and Metadata</h3>



<p>This is the heart of observability:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="26">langfuse_context.update_current_observation(
    output={"answer": answer},
    usage={
        "input": response.usage.prompt_tokens,
        "output": response.usage.completion_tokens,
        "total": response.usage.total_tokens
    },
    metadata={"latency_ms": round(latency_ms, 2)}
)
</pre>



<p>With a single update call, you give <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code>:</p>



<h4 class="wp-block-heading">Outputs</h4>



<ul class="wp-block-list">
<li>final LLM response</li>
</ul>



<h4 class="wp-block-heading">Usage</h4>



<ul class="wp-block-list">
<li>prompt tokens</li>



<li>completion tokens</li>



<li>total tokens</li>
</ul>



<p>These token counts are essential for:</p>



<ul class="wp-block-list">
<li>cost analysis</li>



<li>throughput understanding</li>



<li>debugging prompt inflation</li>
</ul>



<h4 class="wp-block-heading">Metadata</h4>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">latency_ms</code> (explicit + human-readable)</li>
</ul>



<p>This is exactly what the baseline pipeline could <em>not</em> show.</p>



<p>Print statements reinforce visibility:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="27">print(f"📊 Latency: {latency_ms:.2f}ms")
print(f"📊 Tokens: {response.usage.prompt_tokens} → {response.usage.completion_tokens} (total: {response.usage.total_tokens})")
</pre>



<h3 class="wp-block-heading">Building Nested Traces with run_pipeline()</h3>



<p>The pipeline function also uses <code data-enlighter-language="python" class="EnlighterJSRAW">@observe</code>:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="28">@observe(name="llm_pipeline")
def run_pipeline(question: str):
</pre>



<p>This creates a <em>parent span</em>.</p>



<p>Any traced function called inside <code data-enlighter-language="python" class="EnlighterJSRAW">run_pipeline()</code> automatically becomes a <em>child span</em>.</p>



<h4 class="wp-block-heading">Updating the Trace Metadata</h4>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="29">langfuse_context.update_current_trace(
    name="decorator_pipeline",
    metadata={"method": "decorator"}
)
</pre>



<p>This changes the trace title in the <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse UI</code> and adds custom metadata so you always know which instrumentation method you used.</p>



<h4 class="wp-block-heading">Calling the Nested Span</h4>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="30">answer = generate_answer(question)
</pre>



<p>This produces:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="raw" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="31">llm_pipeline (parent)
└── generate_answer (child)
</pre>



<p>The tree structure appears instantly in <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code>.</p>



<h4 class="wp-block-heading">Linking Back to the UI</h4>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="32">trace_id = langfuse_context.get_current_trace_id()
print(f"🔍 View trace: {langfuse_host}/trace/{trace_id}")
</pre>



<p>This clickable URL directly opens the exact trace and is extremely useful while iterating locally.</p>



<h3 class="wp-block-heading">Flushing Traces Before Exit</h3>



<p>Short-lived scripts often exit before <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> sends data.</p>



<p>This line ensures nothing is lost:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="33">langfuse_context.flush()
print("✅ Traces sent!\n")
</pre>



<p>Without flushing, traces may appear incomplete or missing entirely.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-51-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="555" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-51-1024x555.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53839" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-51.png?size=126x68&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-51-300x163.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-51.png?size=378x205&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-51.png?size=504x273&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-51.png?size=630x341&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-51-768x417.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-51-1024x555.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-51-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-51-1536x833.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 16:</strong> The <code>@observe</code> decorator automatically builds a hierarchical trace. The pipeline becomes the parent span, and the model call becomes a child span with full visibility into latency, tokens, and outputs.</figcaption></figure></div>


<h3 class="wp-block-heading">Why the Decorator Approach Is the Best Default</h3>



<figure class="wp-block-image aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-53.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="424" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-53-1024x424.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53845" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-53.png?size=126x52&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-53-300x124.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-53.png?size=378x157&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-53.png?size=504x209&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-53.png?size=630x261&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-53-768x318.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-53-1024x424.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-53.png?lossy=2&amp;strip=1&amp;webp=1 1033w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Table 1:</strong> Comparison of manual tracing implementation versus <code>Langfuse</code>’s <code>@observe</code> decorator for automatic observability and trace management in LLM pipelines.</figcaption></figure>



<p>This is why nearly every modern <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> tutorial and production workflow recommends decorators as the <strong>first instrumentation layer</strong>.</p>



<h3 class="wp-block-heading">What You Just Built</h3>



<p>Your LLM pipeline now has:</p>



<ul class="wp-block-list">
<li>Clickable traces</li>



<li>Per-step metadata</li>



<li>Latency and token breakdown</li>



<li>Nested trace hierarchy</li>



<li>Real-time <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse UI</code> updates</li>



<li>Automatic error propagation</li>
</ul>



<p>This completes the transformation from:</p>



<p><strong>a blind LLM script → a fully observable workflow.</strong></p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Running-Verifying-Self-Hosted-Langfuse-Observability-Stack"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Running-Verifying-Self-Hosted-Langfuse-Observability-Stack">Running and Verifying a Self-Hosted Langfuse Observability Stack</a></h2>



<p>By now, we have all the moving parts ready:</p>



<ul class="wp-block-list">
<li>the <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse Server</code> + <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse Worker</code> + <code data-enlighter-language="python" class="EnlighterJSRAW">PostgreSQL</code></li>



<li>the <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> model server</li>



<li>our traced LLM pipeline using the <code data-enlighter-language="python" class="EnlighterJSRAW">@observe</code> decorator</li>
</ul>



<p>In this section, we will bring everything online, verify the system health, and run the traced pipeline end-to-end. By the end, you will see your <em>first real traces</em> appear instantly inside the <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> dashboard.</p>



<h3 class="wp-block-heading">Start the Self-Hosted Stack</h3>



<p>All core services, including <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse Server</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse Worker</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">PostgreSQL</code>, and <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code>, run through your project’s <code data-enlighter-language="python" class="EnlighterJSRAW">docker-compose.yml</code>.</p>



<p>To start everything with GPU acceleration:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="34">docker compose --profile gpu up -d
</pre>



<p>Or, if you don’t have a GPU:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="35">docker compose --profile cpu up -d
</pre>



<p>This launches:</p>



<figure class="wp-block-image aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-54.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="306" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-54-1024x306.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53847" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-54.png?size=126x38&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-54-300x90.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-54.png?size=378x113&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-54.png?size=504x151&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-54.png?size=630x188&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-54-768x229.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-54-1024x306.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-54.png?lossy=2&amp;strip=1&amp;webp=1 1038w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Table 2:</strong> Core <code>Langfuse</code> deployment services and their roles in trace collection, metric computation, storage, and local LLM inference.</figcaption></figure>



<p>You can check that everything is healthy using:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="36">docker compose ps
</pre>



<p><strong>Expected output (sample):</strong></p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="37">NAME                 STATUS              PORTS
langfuse-server      healthy             0.0.0.0:3000->3000/tcp
langfuse-worker      running            
langfuse-postgres    healthy             0.0.0.0:5433->5432/tcp
vllm-server          healthy             host:8000->8000/tcp
</pre>



<p><strong>Tip:</strong></p>



<p>If <code data-enlighter-language="python" class="EnlighterJSRAW">langfuse-worker</code> is not running, your dashboard will be empty.</p>



<p>If <code data-enlighter-language="python" class="EnlighterJSRAW">vllm-server</code> is not healthy, your LLM calls will fail.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-55-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="254" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-55-1024x254.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53850" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-55.png?size=126x31&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-55-300x74.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-55.png?size=378x94&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-55.png?size=504x125&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-55.png?size=630x156&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-55-768x190.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-55-1024x254.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-55-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 17:</strong> The full observability stack running locally using <code>Docker Compose</code>.</figcaption></figure></div>

<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-56-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="239" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-56-1024x239.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53851" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-56.png?size=126x29&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-56-300x70.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-56.png?size=378x88&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-56.png?size=504x118&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-56.png?size=630x147&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-56-768x179.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-56-1024x239.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-56-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-56-1536x359.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 18:</strong> <code>Docker</code> containers running the local <code>Langfuse</code> observability stack, including the <code>Langfuse Server</code>, <code>Langfuse Worker</code>, <code>PostgreSQL</code> database, and <code>vLLM</code> inference service.</figcaption></figure></div>


<h3 class="wp-block-heading">Verify Each Component Individually</h3>



<h4 class="wp-block-heading">Langfuse Server (UI)</h4>



<p>Open:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="38">http://localhost:3000
</pre>



<p>You should see:</p>



<ul class="wp-block-list">
<li>The <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> login screen</li>



<li>The dashboard panel</li>



<li>Empty traces (for now)</li>
</ul>



<h4 class="wp-block-heading">vLLM Health</h4>



<p>Visit:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="39">http://localhost:8000/health
</pre>



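<p>Expected response: HTTP <code data-enlighter-language="python" class="EnlighterJSRAW">200 OK</code>. Note that the <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> health endpoint may return an empty body; if your setup returns JSON, it will look like:</p>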



<pre class="EnlighterJSRAW" data-enlighter-language="json" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="40">{"status": "ok"}
</pre>



<p>If this endpoint fails, no LLM calls will work.</p>



<h4 class="wp-block-heading">PostgreSQL Health (optional)</h4>



<p>From your host terminal, inspect the <code data-enlighter-language="python" class="EnlighterJSRAW">Docker</code> container logs:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="41">docker compose logs langfuse-postgres
</pre>



<p>Look for:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="42">database system is ready to accept connections
</pre>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-57-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="236" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-57-1024x236.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53855" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-57.png?size=126x29&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-57-300x69.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-57.png?size=378x87&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-57.png?size=504x116&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-57.png?size=630x145&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-57-768x177.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-57-1024x236.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-57-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-57-1536x353.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 19:</strong> <code>Langfuse UI</code> Home Page</figcaption></figure></div>


<h3 class="wp-block-heading">Run Your First Traced Pipeline</h3>



<p>Now run the decorator-instrumented script:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="43">python src/tracing_decorator.py
</pre>



<p>You should see terminal output like:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="raw" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="44">==================================================
Question: Explain neural networks briefly
==================================================

Generating answer with tracing...
📊 Latency: 312.45ms
📊 Tokens: 12 → 88 (total: 100)
🔍 View trace: http://localhost:3000/trace/01HXF...

⏳ Flushing traces to Langfuse...
✅ Traces sent!
</pre>



<p>This confirms:</p>



<ul class="wp-block-list">
<li>the decorator worked</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> received the trace</li>



<li>the worker processed it</li>
</ul>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-58-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="794" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-58-1024x794.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53858" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-58.png?size=126x98&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-58-300x232.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-58.png?size=378x293&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-58.png?size=504x391&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-58.png?size=630x488&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-58-768x595.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-58-1024x794.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-58-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-58-1536x1190.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 20:</strong> Running the traced pipeline prints latency, token usage, and a direct link to the trace.</figcaption></figure></div>


<h3 class="wp-block-heading">View the Trace in Langfuse</h3>



<p>Open the printed URL, for example:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="45">http://localhost:3000/trace/01HXFG23P9...
</pre>



<p>You will see:</p>



<h4 class="wp-block-heading">The parent trace</h4>



<p><code data-enlighter-language="python" class="EnlighterJSRAW">decorator_pipeline</code></p>



<h4 class="wp-block-heading">A nested span</h4>



<p><code data-enlighter-language="python" class="EnlighterJSRAW">generate_answer</code></p>



<h4 class="wp-block-heading">Full metadata</h4>



<ul class="wp-block-list">
<li>prompt</li>



<li>output</li>



<li>latency</li>



<li>token usage</li>



<li>model</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">system</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">user</code> messages</li>
</ul>



<p>This is the moment where the entire pipeline becomes visible.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-59-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="520" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-59-1024x520.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53860" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-59.png?size=126x64&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-59-300x152.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-59.png?size=378x192&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-59.png?size=504x256&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-59.png?size=630x320&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-59-768x390.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-59-1024x520.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-59-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-59-1536x779.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 21: </strong>The <code>Langfuse</code> trace view showing the full <code>decorator_pipeline</code> execution, including the parent trace, nested <code>generate_answer</code> span, inputs, outputs, and metadata captured automatically via the <code>@observe</code> decorator.</figcaption></figure></div>


<h3 class="wp-block-heading">Your Observability Stack Is Live</h3>



<p>At this point, you have:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse Server</code> + <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse Worker</code> + <code data-enlighter-language="python" class="EnlighterJSRAW">PostgreSQL</code> running locally</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> inference server healthy at port <code data-enlighter-language="python" class="EnlighterJSRAW">8000</code></li>



<li>traced LLM requests flowing into the dashboard</li>



<li>real-time visibility into latency, prompts, outputs, and token usage</li>
</ul>



<p>This forms the foundation for everything in Lesson 2:</p>



<ul class="wp-block-list">
<li>scores</li>



<li>evaluations</li>



<li>diagnostics</li>



<li>advanced tracing patterns</li>
</ul>



<hr class="wp-block-separator has-alpha-channel-opacity"/>



<div id="pitch" style="padding: 40px; width: 100%; background-color: #F4F6FA;">
	<h3>What's next? We recommend <a target="_blank" href="https://pyimagesearch.com/pyimagesearch-university/?utm_source=blogPost&utm_medium=bottomBanner&utm_campaign=What%27s%20next%3F%20I%20recommend">PyImageSearch University</a>.</h3>

	<script src="https://fast.wistia.com/embed/medias/kno0cmko2z.jsonp" async></script><script src="https://fast.wistia.com/assets/external/E-v1.js" async></script><div class="wistia_responsive_padding" style="padding:56.25% 0 0 0;position:relative;"><div class="wistia_responsive_wrapper" style="height:100%;left:0;position:absolute;top:0;width:100%;"><div class="wistia_embed wistia_async_kno0cmko2z videoFoam=true" style="height:100%;position:relative;width:100%"><div class="wistia_swatch" style="height:100%;left:0;opacity:0;overflow:hidden;position:absolute;top:0;transition:opacity 200ms;width:100%;"><img decoding="async" src="https://fast.wistia.com/embed/medias/kno0cmko2z/swatch" style="filter:blur(5px);height:100%;object-fit:contain;width:100%;" alt="" aria-hidden="true" onload="this.parentNode.style.opacity=1;" /></div></div></div></div>

	<div style="margin-top: 32px; margin-bottom: 32px; ">
		<strong>Course information:</strong><br/>
		86+ total classes • 115+ hours of on-demand code walkthrough videos • Last updated: June 2026<br/>
		<span style="color: #169FE6;">★★★★★</span> 4.84 (128 Ratings) • 16,000+ Students Enrolled
	</div>

	<p><strong>I strongly believe that if you had the right teacher you could <em>master</em> computer vision and deep learning.</strong></p>

	<p>Do you think learning computer vision and deep learning has to be time-consuming, overwhelming, and complicated? Or has to involve complex mathematics and equations? Or requires a degree in computer science?</p>

	<p>That’s <em>not</em> the case.</p>

	<p>All you need to master computer vision and deep learning is for someone to explain things to you in <em>simple, intuitive</em> terms. <em>And that’s exactly what I do</em>. My mission is to change education and how complex Artificial Intelligence topics are taught.</p>

	<p>If you're serious about learning computer vision, your next stop should be PyImageSearch University, the most comprehensive computer vision, deep learning, and OpenCV course online today. Here you’ll learn how to <em>successfully</em> and <em>confidently</em> apply computer vision to your work, research, and projects. Join me in computer vision mastery.</p>

	<p><strong>Inside PyImageSearch University you'll find:</strong></p>

	<ul style="margin-left: 0px;">
		<li style="list-style: none;">&check; <strong>86+ courses</strong> on essential computer vision, deep learning, and OpenCV topics</li>
		<li style="list-style: none;">&check; <strong>86 Certificates</strong> of Completion</li>
		<li style="list-style: none;">&check; <strong>115+ hours</strong> of on-demand video</li>
		<li style="list-style: none;">&check; <strong>Brand new courses released <em>regularly</em></strong>, ensuring you can keep up with state-of-the-art techniques</li>
		<li style="list-style: none;">&check; <strong>Pre-configured Jupyter Notebooks in Google Colab</strong></li>
		<li style="list-style: none;">&check; Run all code examples in your web browser — works on Windows, macOS, and Linux (no dev environment configuration required!)</li>
		<li style="list-style: none;">&check; Access to <strong>centralized code repos for <em>all</em> 540+ tutorials</strong> on PyImageSearch</li>
		<li style="list-style: none;">&check; <strong> Easy one-click downloads</strong> for code, datasets, pre-trained models, etc.</li>
		<li style="list-style: none;">&check; <strong>Access</strong> on mobile, laptop, desktop, etc.</li>
	</ul>

	<p style="text-align: center;">
		<a target="_blank" class="button link" href="https://pyimagesearch.com/pyimagesearch-university/?utm_source=blogPost&utm_medium=bottomBanner&utm_campaign=What%27s%20next%3F%20I%20recommend" style="background-color: #6DC713; border-bottom: none;">Click here to join PyImageSearch University</a>
	</p>
</div>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Summary"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Summary">Summary</a></h2>



<p>In this lesson, you built the core foundation for modern LLM observability. You began by understanding why LLM applications need far more than traditional logs or metrics. They require visibility into prompts, responses, latency, token usage, and multi-step pipelines. This led naturally to <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code>, a tool purpose-built for tracing and monitoring LLM workloads.</p>



<p>You then deployed a fully self-hosted observability stack using <code data-enlighter-language="python" class="EnlighterJSRAW">Docker Compose</code>: <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse Server</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse Worker</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">PostgreSQL</code>, and a local <code data-enlighter-language="python" class="EnlighterJSRAW">vLLM</code> model server. With the project structure, configuration files, and environment variables in place, your development environment became capable of real-time local trace analysis.</p>



<p>Next, you examined your baseline LLM script, a simple “send a question, print an answer” pipeline that works but offers zero visibility. No prompts, no timing, no token counts, and no traceability. This served as the perfect starting point to highlight why observability is essential.</p>



<p>With the <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> <code data-enlighter-language="python" class="EnlighterJSRAW">@observe</code> decorator, you then transformed that invisible pipeline into a fully instrumented one. Every request now captures structured traces: inputs, outputs, latency, token usage, and parent-child spans. Running the script produced your first real trace inside the <code data-enlighter-language="python" class="EnlighterJSRAW">Langfuse</code> dashboard, revealing exactly what the model did and how the pipeline behaved.</p>



<p>By the end of the lesson, your LLM application evolved from a black box into a transparent, debuggable system running locally with self-hosted components.</p>



<p>In the next lesson, you will go deeper by adding manual tracing, scoring, evaluation logic, latency checks, and health diagnostics, building on the foundation you created today.</p>



<h3 class="wp-block-heading">Citation Information</h3>



<p><strong>Singh, V. </strong>“LLM Observability with Self-Hosted Langfuse and vLLM,” <em>PyImageSearch</em>, S. Huot, A. Sharma, and P. Thakur, eds., 2026, <a href="https://pyimg.co/tadoh" target="_blank" rel="noreferrer noopener">https://pyimg.co/tadoh</a> </p>



<pre class="EnlighterJSRAW" data-enlighter-language="raw" data-enlighter-theme="classic" data-enlighter-highlight="" data-enlighter-linenumbers="false" data-enlighter-lineoffset="" data-enlighter-title="LLM Observability with Self-Hosted Langfuse and vLLM" data-enlighter-group="46">@incollection{Singh_2026_llm-observability-self-hosted-langfuse-vllm,
  author = {Vikram Singh},
  title = {{LLM Observability with Self-Hosted Langfuse and vLLM}},
  booktitle = {PyImageSearch},
  editor = {Susan Huot and Aditya Sharma and Piyush Thakur},
  year = {2026},
  url = {https://pyimg.co/tadoh},
}
</pre>



<p><strong>To download the source code to this post (and be notified when future tutorials are published here on PyImageSearch), </strong><em><strong>simply enter your email address in the form below!</strong></em></p>



<div id="download-the-code" class="post-cta-wrap">
<div class="gpd-post-cta">
	<div class="gpd-post-cta-content">
		

			<div class="gpd-post-cta-top">
				<div class="gpd-post-cta-top-image"><img decoding="async" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?lossy=2&strip=1&webp=1" alt="" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?lossy=2&strip=1&webp=1 410w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?size=126x174&lossy=2&strip=1&webp=1 126w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?size=252x348&lossy=2&strip=1&webp=1 252w" sizes="(max-width: 410px) 100vw, 410px" /></div>
				
				<div class="gpd-post-cta-top-title"><h4>Download the Source Code and FREE 17-page Resource Guide</h4></div>
				<div class="gpd-post-cta-top-desc"><p>Enter your email address below to get a .zip of the code and a <strong>FREE 17-page Resource Guide on Computer Vision, OpenCV, and Deep Learning.</strong> Inside you'll find my hand-picked tutorials, books, courses, and libraries to help you master CV and DL!</p></div>


			</div>

			<div class="gpd-post-cta-bottom">
				<form id="footer-cta-code" class="footer-cta" action="https://www.getdrip.com/forms/4130035/submissions" method="post" target="_blank" data-drip-embedded-form="4130035">
					<input name="fields[email]" type="email" value="" placeholder="Your email address" class="form-control" />

					<button type="submit">Download the code!</button>

					<div style="display: none;" aria-hidden="true"><label for="website">Website</label><br /><input type="text" id="website" name="website" tabindex="-1" autocomplete="off" value="" /></div>
				</form>
			</div>


		
	</div>

</div>
</div>
<p>The post <a rel="nofollow" href="https://pyimagesearch.com/2026/05/18/llm-observability-with-self-hosted-langfuse-and-vllm/">LLM Observability with Self-Hosted Langfuse and vLLM</a> appeared first on <a rel="nofollow" href="https://pyimagesearch.com">PyImageSearch</a>.</p>
]]></content:encoded>
					
		
		
			</item>
		<item>
		<title>Building and Training a Kimi-K2 Model Using DeepSeek-V3 Components</title>
		<link>https://pyimagesearch.com/2026/05/11/building-and-training-a-kimi-k2-model-using-deepseek-v3-components/</link>
		
		<dc:creator><![CDATA[Puneet Mangla]]></dc:creator>
		<pubDate>Mon, 11 May 2026 12:45:00 +0000</pubDate>
				<category><![CDATA[Artificial Intelligence]]></category>
		<category><![CDATA[Deep Learning]]></category>
		<category><![CDATA[Generative AI]]></category>
		<category><![CDATA[LLMs]]></category>
		<category><![CDATA[Tutorial]]></category>
		<category><![CDATA[agentic ai]]></category>
		<category><![CDATA[attention logits]]></category>
		<category><![CDATA[deepseek v3]]></category>
		<category><![CDATA[deepseek-v3]]></category>
		<category><![CDATA[hugging face transformers]]></category>
		<category><![CDATA[kimi k2]]></category>
		<category><![CDATA[kimi-k2]]></category>
		<category><![CDATA[llm training]]></category>
		<category><![CDATA[mixture of experts]]></category>
		<category><![CDATA[mla]]></category>
		<category><![CDATA[moe]]></category>
		<category><![CDATA[multi-head latent attention]]></category>
		<category><![CDATA[muonclip]]></category>
		<category><![CDATA[open source llm]]></category>
		<category><![CDATA[pytorch]]></category>
		<category><![CDATA[qk-clip]]></category>
		<category><![CDATA[synthetic data generation]]></category>
		<category><![CDATA[token efficiency]]></category>
		<category><![CDATA[transformer architecture]]></category>
		<category><![CDATA[tutorial]]></category>
		<guid isPermaLink="false">https://pyimagesearch.com/?p=53671</guid>

					<description><![CDATA[<p>Table of Contents Building and Training a Kimi-K2 Model Using DeepSeek-V3 Components Kimi-K2 vs DeepSeek-V3: Key Architecture Differences in LLM Design Mixture of Experts Scaling in Kimi-K2: Model Size, Sparsity, and Efficiency Attention Head Optimization in Kimi-K2 for Efficient Long-Context&#8230;</p>
<p>The post <a rel="nofollow" href="https://pyimagesearch.com/2026/05/11/building-and-training-a-kimi-k2-model-using-deepseek-v3-components/">Building and Training a Kimi-K2 Model Using DeepSeek-V3 Components</a> appeared first on <a rel="nofollow" href="https://pyimagesearch.com">PyImageSearch</a>.</p>
]]></description>
										<content:encoded><![CDATA[<div class="yoast-breadcrumbs"><span><span><a href="https://pyimagesearch.com/">Home</a></span></span></div>


<div class="toc">
<hr class="TOC"/>
<p class="has-large-font-size"><strong>Table of Contents</strong></p>
<ul>
    <li id="TOC-h1-Building-Training-Kimi-K2-Model-Using-DeepSeek-V3-Components"><a rel="noopener" target="_blank" href="#h1-Building-Training-Kimi-K2-Model-Using-DeepSeek-V3-Components">Building and Training a Kimi-K2 Model Using DeepSeek-V3 Components</a></li>

    <li id="TOC-h2-Kimi-K2-vs-DeepSeek-V3-Key-Architecture-Differences-LLM-Design"><a rel="noopener" target="_blank" href="#h2-Kimi-K2-vs-DeepSeek-V3-Key-Architecture-Differences-LLM-Design">Kimi-K2 vs DeepSeek-V3: Key Architecture Differences in LLM Design</a></li>
    <ul>
        <li id="TOC-h3-Mixture-Experts-Scaling-Kimi-K2-Model-Size-Sparsity-Efficiency"><a rel="noopener" target="_blank" href="#h3-Mixture-Experts-Scaling-Kimi-K2-Model-Size-Sparsity-Efficiency">Mixture of Experts Scaling in Kimi-K2: Model Size, Sparsity, and Efficiency</a></li>
        <li id="TOC-h3-Attention-Head-Optimization-Kimi-K2-Efficient-Long-Context-LLMs"><a rel="noopener" target="_blank" href="#h3-Attention-Head-Optimization-Kimi-K2-Efficient-Long-Context-LLMs">Attention Head Optimization in Kimi-K2 for Efficient Long-Context LLMs</a></li>
    </ul>

    <li id="TOC-h2-MuonClip-Optimizer-Stabilizing-Large-Scale-LLM-Training-Kimi-K2"><a rel="noopener" target="_blank" href="#h2-MuonClip-Optimizer-Stabilizing-Large-Scale-LLM-Training-Kimi-K2">MuonClip Optimizer: Stabilizing Large-Scale LLM Training in Kimi-K2</a></li>
    <ul>
        <li id="TOC-h3-Token-Efficiency-LLM-Training-Why-It-Matters-Kimi-K2"><a rel="noopener" target="_blank" href="#h3-Token-Efficiency-LLM-Training-Why-It-Matters-Kimi-K2">Token Efficiency in LLM Training: Why It Matters for Kimi-K2</a></li>
        <li id="TOC-h3-Attention-Logit-Explosion-LLMs-Training-Instability-Challenges"><a rel="noopener" target="_blank" href="#h3-Attention-Logit-Explosion-LLMs-Training-Instability-Challenges">Attention Logit Explosion in LLMs: Training Instability and Challenges</a></li>
        <li id="TOC-h3-QK-Clip-Preventing-Attention-Logit-Explosion-Kimi-K2-Training"><a rel="noopener" target="_blank" href="#h3-QK-Clip-Preventing-Attention-Logit-Explosion-Kimi-K2-Training">QK-Clip: Preventing Attention Logit Explosion in Kimi-K2 Training</a></li>
    </ul>

    <li id="TOC-h2-Training-Data-Optimization-Kimi-K2-Improving-Token-Utility-LLMs"><a rel="noopener" target="_blank" href="#h2-Training-Data-Optimization-Kimi-K2-Improving-Token-Utility-LLMs">Training Data Optimization for Kimi-K2: Improving Token Utility in LLMs</a></li>
    <ul>
        <li id="TOC-h3-Token-Utility-LLM-Training-Maximizing-Learning-per-Token"><a rel="noopener" target="_blank" href="#h3-Token-Utility-LLM-Training-Maximizing-Learning-per-Token">Token Utility in LLM Training: Maximizing Learning per Token</a></li>
        <li id="TOC-h3-Knowledge-Data-Rephrasing-LLMs-Improving-Training-Data-Quality"><a rel="noopener" target="_blank" href="#h3-Knowledge-Data-Rephrasing-LLMs-Improving-Training-Data-Quality">Knowledge Data Rephrasing for LLMs: Improving Training Data Quality</a></li>
    </ul>

    <li id="TOC-h2-Kimi-K2-Implementation-Training-Open-Source-LLM-DeepSeek-V3"><a rel="noopener" target="_blank" href="#h2-Kimi-K2-Implementation-Training-Open-Source-LLM-DeepSeek-V3">Kimi-K2 Implementation: Training an Open-Source LLM with DeepSeek-V3</a></li>
    <ul>
        <li id="TOC-h3-Multi-Head-Latent-Attention-MLA-Max-Logit-Tracking-Kimi-K2"><a rel="noopener" target="_blank" href="#h3-Multi-Head-Latent-Attention-MLA-Max-Logit-Tracking-Kimi-K2">Multi-Head Latent Attention (MLA) with Max Logit Tracking in Kimi-K2</a></li>
        <li id="TOC-h3-Implementing-MuonClip-Optimizer-Stable-LLM-Training"><a rel="noopener" target="_blank" href="#h3-Implementing-MuonClip-Optimizer-Stable-LLM-Training">Implementing the MuonClip Optimizer for Stable LLM Training</a></li>
        <li id="TOC-h3-Complete-Kimi-K2-Training-Pipeline-Setup-Config-Optimization"><a rel="noopener" target="_blank" href="#h3-Complete-Kimi-K2-Training-Pipeline-Setup-Config-Optimization">Complete Kimi-K2 Training Pipeline: Setup, Config, and Optimization</a></li>
    </ul>

    <li id="TOC-h2-Summary"><a rel="noopener" target="_blank" href="#h2-Summary">Summary</a></li>
    <ul>
        <li id="TOC-h3-Citation-Information"><a rel="noopener" target="_blank" href="#h3-Citation-Information">Citation Information</a></li>
    </ul>
</ul>
</div>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h1-Building-Training-Kimi-K2-Model-Using-DeepSeek-V3-Components"/>



<h2 class="wp-block-heading"><a href="#TOC-h1-Building-Training-Kimi-K2-Model-Using-DeepSeek-V3-Components">Building and Training a Kimi-K2 Model Using DeepSeek-V3 Components</a></h2>



<p>The landscape of large language models (LLMs) is undergoing a fundamental transformation toward <strong>agentic intelligence</strong>, where models can autonomously perceive, plan, reason, and act within complex and dynamic environments. This paradigm shift moves beyond traditional static imitation learning toward models that actively learn through interaction, acquire skills beyond their training distribution, and adapt their behavior based on experience. Agentic intelligence represents a critical capability for the next generation of foundation models, with transformative implications for tool use, software development, and real-world autonomy.</p>


<div class="wp-block-image">
<figure class="aligncenter size-full"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/building-training-kimi-k2-model-using-deepseek-v3-featured.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="940" height="780" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/building-training-kimi-k2-model-using-deepseek-v3-featured.png?lossy=2&strip=1&webp=1" alt="building-training-kimi-k2-model-using-deepseek-v3-featured.png" class="wp-image-53723" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/building-training-kimi-k2-model-using-deepseek-v3-featured.png?size=126x105&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/building-training-kimi-k2-model-using-deepseek-v3-featured-300x249.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/building-training-kimi-k2-model-using-deepseek-v3-featured.png?size=378x314&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/building-training-kimi-k2-model-using-deepseek-v3-featured.png?size=504x418&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/building-training-kimi-k2-model-using-deepseek-v3-featured.png?size=630x523&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/building-training-kimi-k2-model-using-deepseek-v3-featured-768x637.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/building-training-kimi-k2-model-using-deepseek-v3-featured.png?lossy=2&amp;strip=1&amp;webp=1 940w" sizes="(max-width: 630px) 100vw, 630px" /></a></figure></div>


<p>Kimi-K2 stands at the forefront of this revolution. As a 1.04 trillion-parameter Mixture-of-Experts (MoE) language model with 32 billion activated parameters, Kimi-K2 was purposefully designed to address the core challenges of agentic capability development. The model achieves remarkable performance across diverse benchmarks:</p>



<ul class="wp-block-list">
<li>66.1 on Tau2-bench</li>



<li>76.5 on ACEBench (en)</li>



<li>65.8 on SWE-bench Verified</li>



<li>53.7 on LiveCodeBench v6</li>



<li>75.1 on GPQA-Diamond</li>
</ul>



<p>On the LMSYS (Large Model Systems Organization) Arena leaderboard, Kimi-K2 ranks as the top open-source model and 5th overall, competing closely with Claude 4 Opus and Claude 4 Sonnet.</p>



<p>In this lesson, we dive deep into the technical innovations behind Kimi-K2, focusing on its architectural differences from DeepSeek-V3, the revolutionary MuonClip optimizer, and training data improvements. We also provide a complete implementation guide using DeepSeek-V3 components as building blocks.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Kimi-K2-vs-DeepSeek-V3-Key-Architecture-Differences-LLM-Design"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Kimi-K2-vs-DeepSeek-V3-Key-Architecture-Differences-LLM-Design">Kimi-K2 vs DeepSeek-V3: Key Architecture Differences in LLM Design</a></h2>



<p>While Kimi-K2 builds on DeepSeek-V3&#8217;s architecture, several strategic modifications were made to optimize agentic capabilities and inference efficiency. Understanding these architectural differences is crucial for implementing the model effectively (<strong>Table 1</strong>).</p>


<div class="wp-block-image">
<figure class="aligncenter size-full"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-8.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="930" height="416" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-8.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53695" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-8.png?size=126x56&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-8-300x134.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-8.png?size=378x169&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-8.png?size=504x225&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-8.png?size=630x282&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-8-768x344.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-8.png?lossy=2&amp;strip=1&amp;webp=1 930w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Table 1:</strong> Kimi-K2 vs DeepSeek-V3 Configurations (source: <a href="https://arxiv.org/pdf/2507.20534" target="_blank" rel="noreferrer noopener">Kimi Team, 2025</a>).</figcaption></figure></div>


<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Mixture-Experts-Scaling-Kimi-K2-Model-Size-Sparsity-Efficiency"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Mixture-Experts-Scaling-Kimi-K2-Model-Size-Sparsity-Efficiency">Mixture of Experts Scaling in Kimi-K2: Model Size, Sparsity, and Efficiency</a></h3>



<p>The most significant architectural departure lies in Kimi-K2&#8217;s aggressive sparsity scaling. Through carefully controlled small-scale experiments, the Kimi team developed a <strong>sparsity scaling law</strong> that demonstrated a clear relationship: with the number of activated parameters held constant (i.e., constant FLOPs), increasing the total number of experts consistently lowers both training and validation loss. This finding led to a dramatic increase in model sparsity.</p>



<p>Kimi-K2 employs <strong>384 experts</strong> compared to DeepSeek-V3&#8217;s 256 experts, representing a 50% increase. Despite this, the model maintains 8 active experts per token, resulting in a sparsity ratio of 48 (384/8) versus DeepSeek-V3&#8217;s 32 (256/8). This increased sparsity comes with a trade-off: while total parameters grow to 1.04 trillion (54% more than DeepSeek-V3&#8217;s 671B), the number of activated parameters actually <em>decreases</em> to 32.6B (13% less than DeepSeek-V3&#8217;s 37B). This design choice optimizes the compute-performance frontier, achieving superior model quality while maintaining efficient inference.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Attention-Head-Optimization-Kimi-K2-Efficient-Long-Context-LLMs"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Attention-Head-Optimization-Kimi-K2-Efficient-Long-Context-LLMs">Attention Head Optimization in Kimi-K2 for Efficient Long-Context LLMs</a></h3>



<p>A critical optimization for agentic applications involves the number of attention heads. DeepSeek-V3 sets the number of attention heads to roughly twice the number of model layers (128 heads for 61 layers) to better utilize memory bandwidth. However, as context length increases, this design incurs significant inference overhead.</p>



<p>For agentic applications requiring efficient long-context processing, this becomes prohibitive. With a 128k sequence length, increasing attention heads from 64 to 128 (while keeping 384 total experts) leads to an <strong>83% increase in inference FLOPs</strong>. Through controlled experiments, the Kimi team found that doubling the number of attention heads yields only modest improvements in validation loss (0.5% to 1.2%) under iso-token training conditions.</p>



<p>Given that sparsity 48 already provides strong performance, the marginal gains from doubling attention heads do not justify the inference cost. Kimi-K2 therefore uses <strong>64 attention heads</strong> (half of DeepSeek-V3&#8217;s 128), dramatically reducing inference costs for long-context agentic workloads while maintaining competitive performance.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-MuonClip-Optimizer-Stabilizing-Large-Scale-LLM-Training-Kimi-K2"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-MuonClip-Optimizer-Stabilizing-Large-Scale-LLM-Training-Kimi-K2">MuonClip Optimizer: Stabilizing Large-Scale LLM Training in Kimi-K2</a></h2>



<p>The MuonClip optimizer represents one of the most significant innovations in Kimi-K2&#8217;s development, addressing the fundamental challenge of training stability at trillion-parameter scale while maintaining token efficiency. Understanding MuonClip requires examining both the underlying Muon optimizer and the novel QK-Clip mechanism that makes it stable for large-scale training.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Token-Efficiency-LLM-Training-Why-It-Matters-Kimi-K2"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Token-Efficiency-LLM-Training-Why-It-Matters-Kimi-K2">Token Efficiency in LLM Training: Why It Matters for Kimi-K2</a></h3>



<p>Given the increasingly limited availability of high-quality human data, <strong>token efficiency</strong> has emerged as a critical factor in LLM scaling. Token efficiency refers to how much performance improvement is achieved per token consumed during training. The Muon optimizer, introduced by <a href="https://kellerjordan.github.io/posts/muon/" target="_blank" rel="noreferrer noopener">Jordan et al. (2024)</a>, substantially outperforms AdamW under the same compute budget, model size, and training data volume.</p>



<p>Previous work in Moonlight demonstrated that Muon&#8217;s token efficiency gains make it an ideal choice for maximizing the intelligence extracted from limited high-quality tokens. However, scaling Muon to trillion-parameter models revealed a critical challenge: <strong>training instability due to exploding attention logits</strong>.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Attention-Logit-Explosion-LLMs-Training-Instability-Challenges"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Attention-Logit-Explosion-LLMs-Training-Instability-Challenges">Attention Logit Explosion in LLMs: Training Instability and Challenges</a></h3>



<p>During medium-scale training runs using vanilla Muon, attention logits rapidly exceeded magnitudes of 1000, leading to numerical instabilities and occasional training divergence (<strong>Figure 1</strong>). This phenomenon occurred more frequently with Muon than with AdamW, suggesting that Muon&#8217;s aggressive optimization dynamics amplify instabilities in the attention mechanism.</p>


<div class="wp-block-image">
<figure class="aligncenter size-full"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-9.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="889" height="545" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-9.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53698" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-9.png?size=126x77&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-9-300x184.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-9.png?size=378x232&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-9.png?size=504x309&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-9.png?size=630x386&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-9-768x471.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-9.png?lossy=2&amp;strip=1&amp;webp=1 889w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 1:</strong> Attention logits rapidly exceed 1000, which could lead to potential numerical instabilities and even training divergence (source: <a href="https://arxiv.org/pdf/2507.20534" target="_blank" rel="noreferrer noopener">Kimi Team, 2025</a>).</figcaption></figure></div>


<p>Existing mitigation strategies proved insufficient:</p>



<ul class="wp-block-list">
<li><strong>Logit soft-capping</strong> (used in Gemma) directly clips attention logits, but the dot products between queries and keys can still grow excessively <em>before</em> capping is applied</li>



<li><strong>Query-Key Normalization (QK-Norm)</strong> (<a href="https://arxiv.org/abs/2302.05442" target="_blank" rel="noreferrer noopener">Dehghani et al., 2023</a>) is incompatible with Multi-head Latent Attention (MLA) because full key matrices are not explicitly materialized during inference</li>
</ul>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-QK-Clip-Preventing-Attention-Logit-Explosion-Kimi-K2-Training"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-QK-Clip-Preventing-Attention-Logit-Explosion-Kimi-K2-Training">QK-Clip: Preventing Attention Logit Explosion in Kimi-K2 Training</a></h3>



<p>To address this fundamental challenge, the Kimi team proposed <strong>QK-Clip</strong>, a novel weight-clipping mechanism that explicitly constrains attention logits by rescaling the query and key projection weights post-update. The elegance of QK-Clip lies in its simplicity: it does not alter forward and backward computation in the current step but instead uses maximum logits as a guiding signal to control weight growth (<strong>Figure 2</strong>).</p>


<div class="wp-block-image">
<figure class="aligncenter size-full"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-2.jpeg" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="900" height="553" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-2.jpeg?lossy=2&strip=1&webp=1" alt="" class="wp-image-53700" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-2.jpeg?size=126x77&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-2-300x184.jpeg?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-2.jpeg?size=378x232&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-2.jpeg?size=504x310&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-2.jpeg?size=630x387&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-2-768x472.jpeg?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-2.jpeg?lossy=2&amp;strip=1&amp;webp=1 900w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 2:</strong> Maximum logits for Kimi-K2 with MuonClip and τ = 100 over the entire training run. The max logits rapidly increase to the capped value of 100 before decaying to a stable range (source: <a href="https://arxiv.org/pdf/2507.20534" target="_blank" rel="noreferrer noopener">Kimi Team, 2026</a>).</figcaption></figure></div>


<p>For each attention head <img src='https://b2633864.smushcdn.com/2633864/wp-content/latex/251/2510c39011c5be704182423e3a695e91-ffffff-000000-0.png?lossy=2&strip=1&webp=1' alt='h' title='h' class='latex' />, the attention mechanism computes:</p>



<p class="has-text-align-center"><img src='https://b2633864.smushcdn.com/2633864/wp-content/latex/257/25724548027ce7dd7fabfff6a26a14cb-ffffff-000000-0.png?lossy=2&strip=1&webp=1' alt='Q^h = X W_q^h, \quad K^h = X W_k^h, \quad V^h = X W_v^h' title='Q^h = X W_q^h, \quad K^h = X W_k^h, \quad V^h = X W_v^h' class='latex' srcset='https://b2633864.smushcdn.com/2633864/wp-content/latex/257/25724548027ce7dd7fabfff6a26a14cb-ffffff-000000-0.png?lossy=2&strip=1&webp=1 295w,https://b2633864.smushcdn.com/2633864/wp-content/latex/257/25724548027ce7dd7fabfff6a26a14cb-ffffff-000000-0.png?size=126x9&lossy=2&strip=1&webp=1 126w' sizes='(max-width: 295px) 100vw, 295px' /></p>



<p>The attention output is:</p>



<p class="has-text-align-center"><img src='https://b2633864.smushcdn.com/2633864/wp-content/latex/514/51409ecdc9e1c7546bbb02d0c5f46616-ffffff-000000-0.png?lossy=2&strip=1&webp=1' alt='O^h = \text{softmax}\left(\dfrac{1}{\sqrt{d}} Q^h (K^h)^\top\right) V^h' title='O^h = \text{softmax}\left(\dfrac{1}{\sqrt{d}} Q^h (K^h)^\top\right) V^h' class='latex' srcset='https://b2633864.smushcdn.com/2633864/wp-content/latex/514/51409ecdc9e1c7546bbb02d0c5f46616-ffffff-000000-0.png?lossy=2&strip=1&webp=1 239w,https://b2633864.smushcdn.com/2633864/wp-content/latex/514/51409ecdc9e1c7546bbb02d0c5f46616-ffffff-000000-0.png?size=126x22&lossy=2&strip=1&webp=1 126w' sizes='(max-width: 239px) 100vw, 239px' /></p>



<p>QK-Clip defines the <strong>max logit</strong> per head as:</p>



<p class="has-text-align-center"><img src='https://b2633864.smushcdn.com/2633864/wp-content/latex/41b/41b103dd14298456087cb885c9b0ea34-ffffff-000000-0.png?lossy=2&strip=1&webp=1' alt='S_{\max}^h = \dfrac{1}{\sqrt{d}} \max_{X \in B} \max_{i,j} Q_i^h (K_j^h)^\top' title='S_{\max}^h = \dfrac{1}{\sqrt{d}} \max_{X \in B} \max_{i,j} Q_i^h (K_j^h)^\top' class='latex' srcset='https://b2633864.smushcdn.com/2633864/wp-content/latex/41b/41b103dd14298456087cb885c9b0ea34-ffffff-000000-0.png?lossy=2&strip=1&webp=1 255w,https://b2633864.smushcdn.com/2633864/wp-content/latex/41b/41b103dd14298456087cb885c9b0ea34-ffffff-000000-0.png?size=126x19&lossy=2&strip=1&webp=1 126w' sizes='(max-width: 255px) 100vw, 255px' /></p>



<p>where <img src='https://b2633864.smushcdn.com/2633864/wp-content/latex/9d5/9d5ed678fe57bcca610140957afab571-ffffff-000000-0.png?lossy=2&strip=1&webp=1' alt='B' title='B' class='latex' /> is the current batch and <img src='https://b2633864.smushcdn.com/2633864/wp-content/latex/f54/f540942e195ca3ac12148363180a7912-ffffff-000000-0.png?lossy=2&strip=1&webp=1' alt='i, j' title='i, j' class='latex' /> index different tokens.</p>



<p>When <img src='https://b2633864.smushcdn.com/2633864/wp-content/latex/fbb/fbbfe0d629823f9635abc37a80d44390-ffffff-000000-0.png?lossy=2&strip=1&webp=1' alt='S_{\max}^h' title='S_{\max}^h' class='latex' /> exceeds a threshold <img src='https://b2633864.smushcdn.com/2633864/wp-content/latex/a6f/a6f317b268ae825d94f832f970af607c-ffffff-000000-0.png?lossy=2&strip=1&webp=1' alt='\tau' title='\tau' class='latex' /> (set to 100 for Kimi-K2), QK-Clip rescales the weights. Critically, the rescaling is applied <strong>per-head</strong> rather than globally, minimizing intervention on heads that remain stable:</p>



<p class="has-text-align-center"><img src='https://b2633864.smushcdn.com/2633864/wp-content/latex/dbb/dbba3baf5b2edebc882c9a597b2fce7b-ffffff-000000-0.png?lossy=2&strip=1&webp=1' alt='\gamma_h = \min(1, \tau / S_{\max}^h)' title='\gamma_h = \min(1, \tau / S_{\max}^h)' class='latex' />.</p>



<p>This per-head, component-aware clipping represents a substantial refinement over naive global clipping strategies.</p>



<p><strong>Figure 3</strong> describes the complete algorithm for the MuonClip optimizer.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-3-scaled.jpeg" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="502" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-3-1024x502.jpeg?lossy=2&strip=1&webp=1" alt="" class="wp-image-53702" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-3.jpeg?size=126x62&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-3-300x147.jpeg?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-3.jpeg?size=378x185&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-3.jpeg?size=504x247&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-3.jpeg?size=630x309&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-3-768x377.jpeg?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-3-1024x502.jpeg?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-3-scaled.jpeg?lossy=2&amp;strip=1&amp;webp=1 1080w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 3:</strong> MuonClip Algorithm (source: <a href="https://arxiv.org/pdf/2507.20534" target="_blank" rel="noreferrer noopener">Kimi Team, 2026</a>).</figcaption></figure></div>


<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Training-Data-Optimization-Kimi-K2-Improving-Token-Utility-LLMs"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Training-Data-Optimization-Kimi-K2-Improving-Token-Utility-LLMs">Training Data Optimization for Kimi-K2: Improving Token Utility in LLMs</a></h2>



<p>Beyond architectural and optimizer innovations, Kimi-K2&#8217;s superior performance stems significantly from strategic improvements in training data. With high-quality human-generated data becoming increasingly scarce, the focus shifts to <strong>increasing token utility</strong>, defined as the effective learning signal each token contributes to model updates.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Token-Utility-LLM-Training-Maximizing-Learning-per-Token"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Token-Utility-LLM-Training-Maximizing-Learning-per-Token">Token Utility in LLM Training: Maximizing Learning per Token</a></h3>



<p>Token efficiency in pre-training encompasses 2 related but distinct concepts:</p>



<ul class="wp-block-list">
<li><strong>Optimizer efficiency:</strong> How effectively the optimizer extracts signal from each gradient update (addressed by MuonClip)</li>



<li><strong>Token utility:</strong> The inherent information density and learning signal in each token</li>
</ul>



<p>Increasing token utility directly improves token efficiency. A naive approach involves repeated exposure to the same tokens across multiple epochs, but this leads to overfitting and reduced generalization. The key innovation in Kimi-K2 lies in a sophisticated <strong>synthetic data generation strategy</strong> that amplifies high-quality tokens without inducing overfitting.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Knowledge-Data-Rephrasing-LLMs-Improving-Training-Data-Quality"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Knowledge-Data-Rephrasing-LLMs-Improving-Training-Data-Quality">Knowledge Data Rephrasing for LLMs: Improving Training Data Quality</a></h3>



<p>Pre-training on knowledge-intensive text presents a fundamental trade-off: a single epoch is insufficient for comprehensive knowledge absorption, while multi-epoch repetition yields diminishing returns. To resolve this tension, Kimi-K2 employs a synthetic rephrasing framework with the following 3 key components.</p>



<h4 class="wp-block-heading">Style- and Perspective-Diverse Prompting</h4>



<p>To enhance linguistic diversity while maintaining factual integrity, carefully engineered prompts guide a large language model to generate faithful rephrasings in varied styles and perspectives. This approach ensures that while surface-level linguistic features change, the underlying factual content remains consistent. The diversity of expressions forces the model to learn robust representations of the same knowledge across multiple linguistic realizations.</p>



<h4 class="wp-block-heading">Chunk-wise Autoregressive Generation</h4>



<p>Long documents pose a challenge for standard LLM-based rewriting due to implicit output length limitations. Kimi-K2 addresses this through a chunk-based autoregressive strategy: documents are segmented, each segment is rephrased individually with preserved context, and segments are stitched back together to form complete passages. This methodology prevents information loss and maintains global coherence across extended texts (<strong>Figure 4</strong>).</p>



<h4 class="wp-block-heading">Fidelity Verification</h4>



<p>To ensure consistency between original and rewritten content, fidelity checks compare the semantic alignment of each rephrased passage with its source. This quality control step prevents the introduction of hallucinations or factual errors during the rephrasing process.</p>


<div class="wp-block-image">
<figure class="aligncenter size-full"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-10.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="913" height="410" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-10.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53704" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-10.png?size=126x57&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-10-300x135.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-10.png?size=378x170&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-10.png?size=504x226&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-10.png?size=630x283&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-10-768x345.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-10.png?lossy=2&amp;strip=1&amp;webp=1 913w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 4:</strong> Auto-regressive chunk-wise rephrasing pipeline for long input excerpts (source: <a href="https://arxiv.org/pdf/2507.20534" target="_blank" rel="noreferrer noopener">Kimi Team, 2026</a>).</figcaption></figure></div>


<h4 class="wp-block-heading">Mathematics Data Rephrasing</h4>



<p>To enhance mathematical reasoning capabilities, high-quality mathematical documents are rewritten into a &#8220;learning-note&#8221; style following the SwallowMath methodology (<strong>Figure 5</strong>). This transformation converts dense mathematical exposition into more pedagogical formats that better support learning. Additionally, data diversity is increased through the translation of high-quality mathematical materials from other languages into English, effectively multiplying the available high-quality mathematical training data.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-11-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="298" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-11-1024x298.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53706" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-11.png?size=126x37&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-11-300x87.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-11.png?size=378x110&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-11.png?size=504x147&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-11.png?size=630x183&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-11-768x223.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-11-1024x298.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-11-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 5:</strong> Four-stage pipeline for constructing SwallowMath (source: <a href="https://arxiv.org/pdf/2505.02881" target="_blank" rel="noreferrer noopener">Fujii et al., 2026</a>).</figcaption></figure></div>


<h4 class="wp-block-heading">Overall Pre-training Corpus</h4>



<p>The complete Kimi-K2 pre-training corpus comprises <strong>15.5 trillion tokens</strong> of curated, high-quality data spanning 4 primary domains:</p>



<ul class="wp-block-list">
<li><strong>Web Text:</strong> General knowledge and natural language understanding</li>



<li><strong>Code:</strong> Programming and structured reasoning</li>



<li><strong>Mathematics:</strong> Quantitative reasoning and formal problem-solving</li>



<li><strong>Knowledge:</strong> Domain-specific expertise and factual information</li>
</ul>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Kimi-K2-Implementation-Training-Open-Source-LLM-DeepSeek-V3"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Kimi-K2-Implementation-Training-Open-Source-LLM-DeepSeek-V3">Kimi-K2 Implementation: Training an Open-Source LLM with DeepSeek-V3</a></h2>



<p>In this section, we walk through the key implementation details for training Kimi-K2, focusing specifically on the components that differ from the standard DeepSeek-V3 implementation. We&#8217;ll examine the enhanced Multi-head Latent Attention with max logit tracking, the MuonClip optimizer implementation, and the custom training setup.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Multi-Head-Latent-Attention-MLA-Max-Logit-Tracking-Kimi-K2"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Multi-Head-Latent-Attention-MLA-Max-Logit-Tracking-Kimi-K2">Multi-Head Latent Attention (MLA) with Max Logit Tracking in Kimi-K2</a></h3>



<p>The Multi-head Latent Attention (MLA) mechanism in Kimi-K2 extends DeepSeek-V3&#8217;s implementation with critical modifications to support QK-Clip. The key enhancement is <strong>per-head max-logit tracking</strong> during the forward pass, which provides the signal needed for weight clipping by the optimizer.</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Building and Training a Kimi-K2 Model Using DeepSeek-V3 Components" data-enlighter-group="1">class MultiheadLatentAttention(nn.Module):
    
    def __init__(self, config: DeepSeekConfig):  # Build the MLA projections, RoPE module, causal-mask buffer, and max-logit state
        super().__init__()
        self.config = config
        self.n_embd = config.n_embd
        self.n_head = config.n_head
        self.head_dim = config.n_embd // config.n_head  # floor division: per-head content width

        # Compression dimensions: low-rank widths for the KV and query paths, plus the per-head RoPE width
        self.kv_lora_rank = config.kv_lora_rank
        self.q_lora_rank = config.q_lora_rank
        self.rope_dim = config.rope_dim

        # KV compression: n_embd -> kv_lora_rank (no bias), with an RMSNorm sized to the compressed width
        self.kv_proj = nn.Linear(self.n_embd, self.kv_lora_rank, bias=False)
        self.kv_norm = RMSNorm(self.kv_lora_rank)

        # KV decompression: kv_lora_rank -> n_head * head_dim, with separate layers for keys and values
        self.k_decompress = nn.Linear(self.kv_lora_rank, self.n_head * self.head_dim, bias=False)
        self.v_decompress = nn.Linear(self.kv_lora_rank, self.n_head * self.head_dim, bias=False)

        # Query compression (n_embd -> q_lora_rank) and decompression (q_lora_rank -> n_head * head_dim)
        self.q_proj = nn.Linear(self.n_embd, self.q_lora_rank, bias=False)
        self.q_decompress = nn.Linear(self.q_lora_rank, self.n_head * self.head_dim, bias=False)

        # RoPE projections: keys take n_embd-wide input, queries take q_lora_rank-wide input; both emit n_head * rope_dim
        self.k_rope_proj = nn.Linear(self.n_embd, self.n_head * self.rope_dim, bias=False)
        self.q_rope_proj = nn.Linear(self.q_lora_rank, self.n_head * self.rope_dim, bias=False)

        # Output projection: n_head * head_dim -> n_embd; bias presence follows config.bias
        self.o_proj = nn.Linear(self.n_head * self.head_dim, self.n_embd, bias=config.bias)

        # Dropout: two modules that both use config.dropout as their rate
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)

        # RoPE: constructed from the rope width and config.block_size
        self.rope = RotaryEmbedding(self.rope_dim, config.block_size)

        # Causal mask: lower-triangular ones registered as a [1, 1, block_size, block_size] buffer
        self.register_buffer(
            "causal_mask",
            torch.tril(torch.ones(config.block_size, config.block_size)).view(
                1, 1, config.block_size, config.block_size
            )
        )

        self.max_logits = 0.0  # Track maximum attention logits (float placeholder until the first forward pass overwrites it)

</pre>



<p><strong>On Lines 1-47</strong>, we define the MLA architecture following DeepSeek-V3&#8217;s design with compression and decompression of queries and key-values through low-rank projections. The key innovation appears on <strong>Line 49</strong>, where we initialize <code data-enlighter-language="python" class="EnlighterJSRAW">self.max_logits = 0.0</code>, a critical state variable that tracks the maximum attention logits across heads. This tracking mechanism is essential for QK-Clip to function properly.</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="52" data-enlighter-title="Building and Training a Kimi-K2 Model Using DeepSeek-V3 Components" data-enlighter-group="2">    def forward(self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None):
        B, T, C = x.size()  # x must be 3-D; presumably (batch, seq_len, n_embd) - confirm against the caller

        # Compression phase: project x into the normalized low-rank KV latent and the low-rank query latent
        kv_compressed = self.kv_norm(self.kv_proj(x))
        q_compressed = self.q_proj(x)

        # Decompression phase: expand the latents into content keys, values, and content queries
        k_content = self.k_decompress(kv_compressed)
        v = self.v_decompress(kv_compressed)
        q_content = self.q_decompress(q_compressed)

        # RoPE components: keys are projected straight from x, queries from the compressed query
        k_rope = self.k_rope_proj(x)
        q_rope = self.q_rope_proj(q_compressed)

        # Reshape for multi-head attention: split the last axis into (n_head, width), then move heads to axis 1
        k_content = k_content.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        v = v.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        q_content = q_content.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        k_rope = k_rope.view(B, T, self.n_head, self.rope_dim).transpose(1, 2)
        q_rope = q_rope.view(B, T, self.n_head, self.rope_dim).transpose(1, 2)

        # Apply RoPE: only the rope slices go through apply_rope; the content slices are left as they are
        cos, sin = self.rope(x, T)
        q_rope = apply_rope(q_rope, cos, sin)
        k_rope = apply_rope(k_rope, cos, sin)

        # Concatenate content and rope parts on the last axis, so q and k are head_dim + rope_dim wide per head
        q = torch.cat([q_content, q_rope], dim=-1)
        k = torch.cat([k_content, k_rope], dim=-1)
     
</pre>



<p><strong>On Lines 52-82</strong>, we implement the standard forward pass through the compression-decompression pipeline. The input undergoes compression via <code data-enlighter-language="python" class="EnlighterJSRAW">kv_proj</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">q_proj</code>, followed by decompression through dedicated linear layers. We then reshape tensors for multi-head processing and apply Rotary Position Embeddings (RoPE) only to the dedicated positional components (<code data-enlighter-language="python" class="EnlighterJSRAW">q_rope</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">k_rope</code>); the content components are left unrotated and are then concatenated with the rotated positional components. This separation allows per-head QK-Clip to target only the appropriate components without affecting shared rotary embeddings.</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="84" data-enlighter-title="Building and Training a Kimi-K2 Model Using DeepSeek-V3 Components" data-enlighter-group="3">        # Concatenate content and rope parts
        q = torch.cat([q_content, q_rope], dim=-1)
        k = torch.cat([k_content, k_rope], dim=-1)

        # Attention computation
        scale = 1.0 / math.sqrt(q.size(-1))
        scores = torch.matmul(q, k.transpose(-2, -1)) * scale

        with torch.no_grad():
            # self.max_logits = torch.max(scores, dim=1).item()
            self.max_logits = list(torch.max(scores.transpose(1, 0).contiguous().view(scores.shape[1], -1), dim=-1)[0])

        # Apply causal mask
        scores = scores.masked_fill(self.causal_mask[:, :, :T, :T] == 0, float('-inf'))

        # Apply padding mask if provided
        if attention_mask is not None:
            padding_mask_additive = (1 - attention_mask).unsqueeze(1).unsqueeze(2) * float('-inf')
            scores = scores + padding_mask_additive

        # Softmax and dropout
        attn_weights = F.softmax(scores, dim=-1)
        attn_weights = self.attn_dropout(attn_weights)

        # Apply attention to values
        out = torch.matmul(attn_weights, v)

        # Reshape and project
        out = out.transpose(1, 2).contiguous().view(B, T, self.n_head * self.head_dim)
        out = self.resid_dropout(self.o_proj(out))

        return out
</pre>



<p>On <strong>Lines 89-94</strong>, we compute attention scores and implement the crucial max logit tracking. The score computation follows standard scaled dot-product attention. However, <strong>Lines 92-94</strong> represent a key departure from vanilla DeepSeek-V3: we track the maximum attention logit <strong>per head</strong> using <code data-enlighter-language="python" class="EnlighterJSRAW">torch.no_grad()</code> to avoid affecting gradients. The <code data-enlighter-language="python" class="EnlighterJSRAW">scores</code> tensor has shape <code data-enlighter-language="python" class="EnlighterJSRAW">[batch, num_heads, seq_len, seq_len]</code>, and we transpose and reshape to extract per-head maximum values. This per-head granularity enables targeted intervention only on heads exhibiting logit explosion, minimizing disruption to stable heads.</p>



<p>On <strong>Lines 97-113</strong>, we complete the attention mechanism with causal masking, optional padding masks, softmax normalization, and dropout. The final output projection maintains the standard MLA architecture. The elegance of this implementation lies in its non-invasiveness: max logit tracking adds minimal computational overhead (a single max operation under <code data-enlighter-language="python" class="EnlighterJSRAW">torch.no_grad</code>) while providing the critical signal for optimizer-level weight clipping.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Implementing-MuonClip-Optimizer-Stable-LLM-Training"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Implementing-MuonClip-Optimizer-Stable-LLM-Training">Implementing the MuonClip Optimizer for Stable LLM Training</a></h3>



<p>The MuonClip optimizer represents the core innovation enabling stable trillion-parameter training. Our implementation integrates Newton-Schulz orthogonalization, RMS matching, weight decay, and per-head QK-Clip into a unified optimizer.</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Building and Training a Kimi-K2 Model Using DeepSeek-V3 Components" data-enlighter-group="4">def apply_qk_clip_per_head(
    query_weights: torch.Tensor,
    key_weights: torch.Tensor,
    max_logits_per_head: Union[List[float], torch.Tensor],
    tau: float = 100.0
) -> None:
        if isinstance(max_logits_per_head, list):
        max_logits_per_head = torch.tensor(
            max_logits_per_head,
            device=query_weights.device,
            dtype=query_weights.dtype
        )
    apply_qk_clip_vectorized(query_weights, key_weights, max_logits_per_head, tau)

</pre>



<p>On <strong>Lines 1-13</strong>, we define the entry point for the QK-Clip application. The function accepts query and key projection weights along with per-head max logits and a threshold <img src='https://b2633864.smushcdn.com/2633864/wp-content/latex/a6f/a6f317b268ae825d94f832f970af607c-ffffff-000000-0.png?lossy=2&strip=1&webp=1' alt='\tau' title='\tau' class='latex' /> (defaulting to <code data-enlighter-language="python" class="EnlighterJSRAW">100</code>). We handle both list and tensor inputs for flexibility, converting lists to tensors on the appropriate device with matching <code data-enlighter-language="python" class="EnlighterJSRAW">dtype</code>. The critical design choice here is <strong>in-place modification</strong>: we directly modify weight tensors to avoid memory allocation overhead during optimization.</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="15" data-enlighter-title="Building and Training a Kimi-K2 Model Using DeepSeek-V3 Components" data-enlighter-group="5">def apply_qk_clip_per_head(
    query_weights: torch.Tensor,
    key_weights: torch.Tensor,
    max_logits_per_head: Union[List[float], torch.Tensor],
    tau: float = 100.0
) -> None:
        if isinstance(max_logits_per_head, list):
        max_logits_per_head = torch.tensor(
            max_logits_per_head,
            device=query_weights.device,
            dtype=query_weights.dtype
        )
    apply_qk_clip_vectorized(query_weights, key_weights, max_logits_per_head, tau)

@torch.no_grad()
def apply_qk_clip_vectorized(
    query_weights: torch.Tensor,
    key_weights: torch.Tensor,
    max_logits_per_head: torch.Tensor,
    tau: float = 100.0
) -> None:
    """Listing excerpt: read the weight shapes, coerce the max logits to a tensor, and flag heads above tau."""
    q_out, q_in = query_weights.shape[0], query_weights.shape[1]
    k_out, k_in = key_weights.shape[0], key_weights.shape[1]
    num_heads = len(max_logits_per_head)  # one max-logit entry per head
    d_k = q_out // num_heads  # floor division: query rows per head

    # Ensure tensor type: non-tensor input is converted on the weights' device and dtype
    if not isinstance(max_logits_per_head, torch.Tensor):
        max_logits_per_head = torch.tensor(
            max_logits_per_head,
            device=query_weights.device,
            dtype=query_weights.dtype
        )

    # Boolean mask of the heads whose max logit exceeds tau (gamma itself is not computed in this excerpt)
    needs_clip = max_logits_per_head > tau
</pre>



<p>On <strong>Lines 15-48</strong>, we extract dimensions and ensure tensor type compatibility. We then flag the heads that need clipping, so that the per-head scaling factor <img src='https://b2633864.smushcdn.com/2633864/wp-content/latex/dbb/dbba3baf5b2edebc882c9a597b2fce7b-ffffff-000000-0.png?lossy=2&strip=1&webp=1' alt='\gamma_h = \min(1, \tau / S_{\max}^h)' title='\gamma_h = \min(1, \tau / S_{\max}^h)' class='latex' /> is computed only for heads where <img src='https://b2633864.smushcdn.com/2633864/wp-content/latex/f0a/f0a17224c7274f174055214500c5c70e-ffffff-000000-0.png?lossy=2&strip=1&webp=1' alt='S_{\max}^h &gt; \tau' title='S_{\max}^h &gt; \tau' class='latex' />.</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="52" data-enlighter-title="Building and Training a Kimi-K2 Model Using DeepSeek-V3 Components" data-enlighter-group="6">@torch.no_grad()
def apply_qk_clip_vectorized(
    query_weights: torch.Tensor,
    key_weights: torch.Tensor,
    max_logits_per_head: torch.Tensor,
    tau: float = 100.0
) -> None:
    """Scale each head's Q/K weight rows in place by sqrt(tau / max_logit) when max_logit > tau."""
    q_out, q_in = query_weights.shape[0], query_weights.shape[1]
    k_out, k_in = key_weights.shape[0], key_weights.shape[1]
    num_heads = len(max_logits_per_head)
    q_dk, k_dk = q_out // num_heads, k_out // num_heads  # rows owned by each head

    # Ensure tensor type
    if not isinstance(max_logits_per_head, torch.Tensor):
        max_logits_per_head = torch.tensor(
            max_logits_per_head,
            device=query_weights.device,
            dtype=query_weights.dtype
        )

    # Compute scaling factors: gamma = tau / max_logit where max_logit > tau
    needs_clip = max_logits_per_head > tau

    # If no clipping needed, return early
    if not needs_clip.any():
        return

    gamma = torch.where(
        needs_clip,
        tau / max_logits_per_head.clamp(min=1e-8),
        torch.ones_like(max_logits_per_head)
    )
    sqrt_gamma = torch.sqrt(gamma).to(device=query_weights.device, dtype=query_weights.dtype)

    # View weights as [num_heads, d_k, in_features]: nn.Linear weights are [out_features, in_features]
    # and, with the usual .view(..., num_heads, d_k) head split, head h owns rows h*d_k:(h+1)*d_k
    q_by_head = query_weights.view(num_heads, q_dk, q_in)
    k_by_head = key_weights.view(num_heads, k_dk, k_in)

    # Apply per-head scaling IN-PLACE: broadcast sqrt_gamma [num_heads] over [num_heads, d_k, in_features]
    q_by_head.mul_(sqrt_gamma.view(num_heads, 1, 1))
    k_by_head.mul_(sqrt_gamma.view(num_heads, 1, 1))

    # The views alias query_weights / key_weights, so the original 2-D matrices
    # already hold the clipped values; no reshape back is needed.
</pre>



<p>On <strong>Lines 80-97</strong>, we perform the actual weight clipping through careful tensor reshaping and in-place multiplication. The weights are reshaped from <code data-enlighter-language="python" class="EnlighterJSRAW">[d_model, d_model]</code> to <code data-enlighter-language="python" class="EnlighterJSRAW">[d_model/num_heads, num_heads, d_k]</code> to expose the head dimension. We then apply <img src='https://b2633864.smushcdn.com/2633864/wp-content/latex/1b9/1b93d4fbff6b33f401722350670a419d-ffffff-000000-0.png?lossy=2&strip=1&webp=1' alt='\sqrt{\gamma_h}' title='\sqrt{\gamma_h}' class='latex' /> scaling using in-place multiplication (<code data-enlighter-language="python" class="EnlighterJSRAW">mul_</code>) with broadcasting. The square root scaling ensures that when query and key both receive <img src='https://b2633864.smushcdn.com/2633864/wp-content/latex/68e/68e92bd7d9878c99406d6f534f99f10a-ffffff-000000-0.png?lossy=2&strip=1&webp=1' alt='\sqrt{\gamma}' title='\sqrt{\gamma}' class='latex' />, their dot product receives the full <img src='https://b2633864.smushcdn.com/2633864/wp-content/latex/ae5/ae539dfcc999c28e25a0f3ae65c1de79-ffffff-000000-0.png?lossy=2&strip=1&webp=1' alt='\gamma' title='\gamma' class='latex' /> scaling. This elegant mathematical property allows us to clip attention logits by rescaling the weights that produce them, rather than clipping logits directly after they&#8217;re computed.</p>



<p><strong>Lines 77 and 78</strong> implement an early exit if no head requires clipping, which becomes a common case later in training when attention logits stabilize. This optimization avoids unnecessary computation when the model is well-behaved.</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Building and Training a Kimi-K2 Model Using DeepSeek-V3 Components" data-enlighter-group="7">class MuonClip(torch.optim.Optimizer):
    def __init__(
        self,
        params,  # parameters (or param groups) forwarded to torch.optim.Optimizer
        lr: float = 1e-3,  # step size; must be non-negative
        momentum: float = 0.95,  # momentum coefficient; must lie in [0, 1]
        weight_decay: float = 0.01,  # weight-decay coefficient; must be non-negative
        tau: float = 100.0,  # QK-Clip threshold on the max attention logit; must be positive
        ns_steps: int = 5,  # handed to newton_schulz() in step(); not validated here
        eps: float = 1e-7  # handed to newton_schulz() in step(); not validated here
    ):
        if lr &lt; 0.0:
            raise ValueError(f"Invalid learning rate: {lr}")
        if not 0.0 &lt;= momentum &lt;= 1.0:
            raise ValueError(f"Invalid momentum value: {momentum}")
        if weight_decay &lt; 0.0:
            raise ValueError(f"Invalid weight_decay value: {weight_decay}")
        if tau &lt;= 0.0:
            raise ValueError(f"Invalid tau value: {tau}")

        defaults = dict(
            lr=lr,
            momentum=momentum,
            weight_decay=weight_decay,
            tau=tau,
            ns_steps=ns_steps,
            eps=eps
        )
        super().__init__(params, defaults)

        # QK-Clip state, filled in by set_model(); an empty list makes _apply_qk_clip() return immediately
        self.model = None
        self.attention_layers = []

    def set_model(self, model: nn.Module):
        """Register ``model`` and cache its attention layers for QK-Clip; a model without ``get_attention_layers`` clears them."""
        self.model = model
        self.attention_layers = model.get_attention_layers() if hasattr(model, 'get_attention_layers') else []

</pre>



<p>On <strong>Lines 1-33</strong>, we define the MuonClip optimizer class, inheriting from PyTorch&#8217;s base <code data-enlighter-language="python" class="EnlighterJSRAW">Optimizer</code>. The constructor accepts standard hyperparameters (learning rate, momentum, weight decay), the QK-Clip threshold <img src='https://b2633864.smushcdn.com/2633864/wp-content/latex/a6f/a6f317b268ae825d94f832f970af607c-ffffff-000000-0.png?lossy=2&strip=1&webp=1' alt='\tau' title='\tau' class='latex' />, and the Newton-Schulz settings used by Muon (number of steps and epsilon). We validate the learning rate, momentum, weight decay, and clipping threshold, and initialize the QK-Clip state. Critically, <strong>Lines 35-38</strong> implement model registration through <code data-enlighter-language="python" class="EnlighterJSRAW">set_model()</code>, which extracts attention layers for later QK-Clip application. This design separates optimizer logic from model architecture, allowing the optimizer to operate on any model exposing a <code data-enlighter-language="python" class="EnlighterJSRAW">get_attention_layers()</code> method.</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="40" data-enlighter-title="Building and Training a Kimi-K2 Model Using DeepSeek-V3 Components" data-enlighter-group="8">    @torch.no_grad()
    def step(self, closure: Optional[Callable] = None) -> Optional[float]:
        """Apply one Muon/momentum update to every parameter that has a gradient, then QK-Clip; returns closure() or None."""
        loss = None
        if closure is not None:
            with torch.enable_grad():  # step() runs under no_grad, but the closure needs gradients
                loss = closure()
        for group in self.param_groups:
            lr = group['lr']
            momentum = group['momentum']
            weight_decay = group['weight_decay']
            ns_steps = group['ns_steps']
            eps = group['eps']

            for p in group['params']:
                if p.grad is None:
                    continue

                grad = p.grad
                state = self.state[p]

                # Initialize momentum buffer
                if len(state) == 0:
                    state['momentum_buffer'] = torch.zeros_like(p)

                buf = state['momentum_buffer']

                # Apply momentum: Mt = μMt−1 + Gt
                buf.mul_(momentum).add_(grad)

                if p.ndim >= 2:  # 2D+ parameters - use Muon
                    # Apply Newton-Schulz orthogonalization
                    if p.ndim > 2:
                        original_shape = buf.shape
                        buf_2d = buf.view(buf.shape[0], -1)
                        orthogonal_update = newton_schulz(buf_2d, ns_steps, eps)
                        orthogonal_update = orthogonal_update.view(original_shape)
                    else:
                        orthogonal_update = newton_schulz(buf, ns_steps, eps)

                    # RMS matching factor: 0.2 × √max(n, m), with n × m the matrix passed to Newton-Schulz
                    n, m = p.shape[0], math.prod(p.shape[1:])
                    rms_factor = 0.2 * math.sqrt(max(n, m))
                    orthogonal_update = orthogonal_update * rms_factor

                    # Update: Wt = Wt−1 − η(Ot + λWt−1)
                    p.add_(orthogonal_update + weight_decay * p, alpha=-lr)
                else:
                    # 1D parameters - standard momentum
                    p.add_(buf + weight_decay * p, alpha=-lr)

        # Apply QK-Clip
        self._apply_qk_clip()

        return loss

</pre>



<p>On <strong>Lines 41-94</strong>, we implement the core optimization step integrating Muon updates with QK-Clip. The step begins with standard closure handling and parameter group iteration. <strong>Lines 41-68</strong> implement momentum accumulation (<img src='https://b2633864.smushcdn.com/2633864/wp-content/latex/a94/a94eaf68f4dd0ecd984fe2a564e63f2f-ffffff-000000-0.png?lossy=2&strip=1&webp=1' alt='M_t = \mu M_{t-1} + G_t' title='M_t = \mu M_{t-1} + G_t' class='latex' />) using in-place operations for memory efficiency. The critical branching occurs at <strong>Line 70</strong>: parameters with 2+ dimensions receive Muon treatment. </p>



<p>On <strong>Lines 72-83</strong>, we apply the Muon update for matrix parameters. Newton-Schulz orthogonalization produces an orthogonal approximation of the momentum buffer, which we then scale by <img src='https://b2633864.smushcdn.com/2633864/wp-content/latex/1ed/1ed57c6055afe2435bbee42ba89e80be-ffffff-000000-0.png?lossy=2&strip=1&webp=1' alt='\sqrt{\max(n,m)} \times 0.2' title='\sqrt{\max(n,m)} \times 0.2' class='latex' /> to match AdamW&#8217;s RMS characteristics. This scaling ensures Muon&#8217;s updates have similar magnitudes to AdamW, enabling easier hyperparameter transfer. Finally, <strong>Line 86</strong> applies the update with weight decay: <img src='https://b2633864.smushcdn.com/2633864/wp-content/latex/265/265fbabff429aa2668545a63e371a4c7-ffffff-000000-0.png?lossy=2&strip=1&webp=1' alt='W_t = W_{t-1} - \eta(O_t + \lambda W_{t-1})' title='W_t = W_{t-1} - \eta(O_t + \lambda W_{t-1})' class='latex' srcset='https://b2633864.smushcdn.com/2633864/wp-content/latex/265/265fbabff429aa2668545a63e371a4c7-ffffff-000000-0.png?lossy=2&strip=1&webp=1 200w,https://b2633864.smushcdn.com/2633864/wp-content/latex/265/265fbabff429aa2668545a63e371a4c7-ffffff-000000-0.png?size=126x11&lossy=2&strip=1&webp=1 126w' sizes='(max-width: 200px) 100vw, 200px' />. <strong>Line 89</strong> applies standard momentum updates to 1D parameters such as biases and normalization layers.</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="96" data-enlighter-title="Building and Training a Kimi-K2 Model Using DeepSeek-V3 Components" data-enlighter-group="9">    def _apply_qk_clip(self):
        """Apply QK-Clip to attention layers to prevent logit explosion."""
        if not self.attention_layers:  # nothing registered to clip
            return

        tau = self.param_groups[0]['tau']  # the first param group's tau is used for every attention layer

        for attention_layer in self.attention_layers:
            if not hasattr(attention_layer, 'max_logits'):
                continue

            max_logits = attention_layer.max_logits
            if not max_logits:  # skips None, 0, and empty containers; NOTE(review): ambiguous for a multi-element tensor - confirm the stored type
                continue


            # Handle both scalar and per-head max logits
            if isinstance(max_logits, (int, float)):
                max_logits = [max_logits]

            # NOTE(review): key weights are passed before query weights - confirm against apply_qk_clip_per_head's parameter order
            apply_qk_clip_per_head(
                    attention_layer.k_decompress.weight.data,
                    attention_layer.q_decompress.weight.data,
                    max_logits,
                    tau
            )
</pre>



<p>On <strong>Lines 96-122</strong>, we apply QK-Clip after all weight updates. The <code data-enlighter-language="python" class="EnlighterJSRAW">_apply_qk_clip()</code> method iterates through all registered attention layers, extracts their <code data-enlighter-language="python" class="EnlighterJSRAW">max_logits</code> attribute (populated during forward pass), and applies per-head clipping to the query and key decompression weights. This post-update clipping ensures weights don&#8217;t grow unboundedly across training steps while preserving gradient information within each step.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Complete-Kimi-K2-Training-Pipeline-Setup-Config-Optimization"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Complete-Kimi-K2-Training-Pipeline-Setup-Config-Optimization">Complete Kimi-K2 Training Pipeline: Setup, Config, and Optimization</a></h3>



<p>Finally, we bring everything together in a complete training configuration:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Building and Training a Kimi-K2 Model Using DeepSeek-V3 Components" data-enlighter-group="10">config = DeepSeekConfig()
config.multi_token_predict = 0  # turn multi-token prediction off
config.n_experts = 8  # small expert count for this educational run
config.n_head = 4  # attention heads

training_args = TrainingArguments(
    output_dir="./kimik2_checkpoints",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    learning_rate=5e-4,  # NOTE(review): the MuonClip optimizer is later built with lr=5e-3 - confirm which value is intended
    warmup_steps=10,
    weight_decay=0.01,
    logging_dir="./kimik2_checkpoints/logs",
    logging_steps=50,  # log, checkpoint, and evaluate on the same 50-step cadence
    save_steps=50,
    save_total_limit=3,
    eval_steps=50,
    eval_strategy="steps",
    save_strategy="steps",
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss is better
    gradient_accumulation_steps=4,  # accumulate 4 micro-batches per optimizer step
    fp16=True,  # mixed-precision training
    dataloader_num_workers=2,
    remove_unused_columns=False,
    report_to="none",
    push_to_hub=False,
    save_safetensors=False,
)
</pre>



<p>On <strong>Lines 1-4</strong>, we configure the model architecture. Kimi-K2 does not use Multi-Token Prediction, so we disable multi-token prediction (<code data-enlighter-language="python" class="EnlighterJSRAW">multi_token_predict=0</code>) to simplify training and focus on core capabilities. We use <code data-enlighter-language="python" class="EnlighterJSRAW">8</code> experts for this educational implementation rather than the hundreds used in production-scale Kimi-K2 and DeepSeek-V3 models. We also use <code data-enlighter-language="python" class="EnlighterJSRAW">4</code> attention heads for this small-scale educational implementation, compared to the production-scale configurations used in DeepSeek-V3 and Kimi-K2.</p>



<p>On <strong>Lines 6-30</strong>, we define training arguments following best practices for small-scale experiments. We use gradient accumulation (<code data-enlighter-language="python" class="EnlighterJSRAW">4</code> steps) to simulate larger batch sizes with limited GPU memory, enable mixed-precision training (<code data-enlighter-language="python" class="EnlighterJSRAW">fp16=True</code>) for speed and memory efficiency, and configure regular evaluation and checkpointing every <code data-enlighter-language="python" class="EnlighterJSRAW">50</code> steps. The learning rate of <code data-enlighter-language="python" class="EnlighterJSRAW">5e-4</code> is conservative for stable training, with a brief <code data-enlighter-language="python" class="EnlighterJSRAW">10</code>-step warmup.</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="31" data-enlighter-title="Building and Training a Kimi-K2 Model Using DeepSeek-V3 Components" data-enlighter-group="11">model = DeepSeek(config)

data_collator = DeepSeekDataCollator(tokenizer)

optimizer = MuonClip(model.parameters(), lr=5e-3)  # NOTE(review): TrainingArguments sets learning_rate=5e-4 - confirm which is intended
optimizer.set_model(model)  # hands the optimizer the attention layers that QK-Clip rescales

# Create trainer
trainer = DeepSeekTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    optimizers=(optimizer, None)  # (optimizer, lr_scheduler): our MuonClip instance and no explicit scheduler
)

print("✓ Trainer created. Starting training...")
print("=" * 80)

# Train!
trainer.train()

print("=" * 80)
print("✓ Training complete!")

# Save final model
trainer.save_model("./kimik2_final")
tokenizer.save_pretrained("./kimik2_final")  # keep the tokenizer next to the weights for later inference
print("✓ Model saved to ./kimik2_final")
</pre>



<p>On <strong>Lines 31-36</strong>, we initialize the model and create a MuonClip <code data-enlighter-language="python" class="EnlighterJSRAW">optimizer</code>. Critically, <strong>Line 36</strong> registers the model with the optimizer using <code data-enlighter-language="python" class="EnlighterJSRAW">set_model()</code>, enabling QK-Clip to access attention layers. This registration must occur before training begins.</p>



<p>On <strong>Lines 39-60</strong>, we instantiate the custom <code data-enlighter-language="python" class="EnlighterJSRAW">trainer</code> with all components and launch training. The <code data-enlighter-language="python" class="EnlighterJSRAW">optimizers=(optimizer, None)</code> argument provides our custom optimizer to Hugging Face Trainer, overriding its default optimizer creation. After training completes, we save both the model weights and tokenizer for later inference.</p>



<hr class="wp-block-separator has-alpha-channel-opacity"/>



<div id="pitch" style="padding: 40px; width: 100%; background-color: #F4F6FA;">
	<h3>What's next? We recommend <a target="_blank" href="https://pyimagesearch.com/pyimagesearch-university/?utm_source=blogPost&utm_medium=bottomBanner&utm_campaign=What%27s%20next%3F%20I%20recommend">PyImageSearch University</a>.</h3>

	<script src="https://fast.wistia.com/embed/medias/kno0cmko2z.jsonp" async></script><script src="https://fast.wistia.com/assets/external/E-v1.js" async></script><div class="wistia_responsive_padding" style="padding:56.25% 0 0 0;position:relative;"><div class="wistia_responsive_wrapper" style="height:100%;left:0;position:absolute;top:0;width:100%;"><div class="wistia_embed wistia_async_kno0cmko2z videoFoam=true" style="height:100%;position:relative;width:100%"><div class="wistia_swatch" style="height:100%;left:0;opacity:0;overflow:hidden;position:absolute;top:0;transition:opacity 200ms;width:100%;"><img decoding="async" src="https://fast.wistia.com/embed/medias/kno0cmko2z/swatch" style="filter:blur(5px);height:100%;object-fit:contain;width:100%;" alt="" aria-hidden="true" onload="this.parentNode.style.opacity=1;" /></div></div></div></div>

	<div style="margin-top: 32px; margin-bottom: 32px; ">
		<strong>Course information:</strong><br/>
		86+ total classes • 115+ hours of on-demand code walkthrough videos • Last updated: June 2026<br/>
		<span style="color: #169FE6;">★★★★★</span> 4.84 (128 Ratings) • 16,000+ Students Enrolled
	</div>

	<p><strong>I strongly believe that if you had the right teacher you could <em>master</em> computer vision and deep learning.</strong></p>

	<p>Do you think learning computer vision and deep learning has to be time-consuming, overwhelming, and complicated? Or has to involve complex mathematics and equations? Or requires a degree in computer science?</p>

	<p>That’s <em>not</em> the case.</p>

	<p>All you need to master computer vision and deep learning is for someone to explain things to you in <em>simple, intuitive</em> terms. <em>And that’s exactly what I do</em>. My mission is to change education and how complex Artificial Intelligence topics are taught.</p>

	<p>If you're serious about learning computer vision, your next stop should be PyImageSearch University, the most comprehensive computer vision, deep learning, and OpenCV course online today. Here you’ll learn how to <em>successfully</em> and <em>confidently</em> apply computer vision to your work, research, and projects. Join me in computer vision mastery.</p>

	<p><strong>Inside PyImageSearch University you'll find:</strong></p>

	<ul style="margin-left: 0px;">
		<li style="list-style: none;">&check; <strong>86+ courses</strong> on essential computer vision, deep learning, and OpenCV topics</li>
		<li style="list-style: none;">&check; <strong>86 Certificates</strong> of Completion</li>
		<li style="list-style: none;">&check; <strong>115+ hours</strong> of on-demand video</li>
		<li style="list-style: none;">&check; <strong>Brand new courses released <em>regularly</em></strong>, ensuring you can keep up with state-of-the-art techniques</li>
		<li style="list-style: none;">&check; <strong>Pre-configured Jupyter Notebooks in Google Colab</strong></li>
		<li style="list-style: none;">&check; Run all code examples in your web browser — works on Windows, macOS, and Linux (no dev environment configuration required!)</li>
		<li style="list-style: none;">&check; Access to <strong>centralized code repos for <em>all</em> 540+ tutorials</strong> on PyImageSearch</li>
		<li style="list-style: none;">&check; <strong>Easy one-click downloads</strong> for code, datasets, pre-trained models, etc.</li>
		<li style="list-style: none;">&check; <strong>Access</strong> on mobile, laptop, desktop, etc.</li>
	</ul>

	<p style="text-align: center;">
		<a target="_blank" class="button link" href="https://pyimagesearch.com/pyimagesearch-university/?utm_source=blogPost&utm_medium=bottomBanner&utm_campaign=What%27s%20next%3F%20I%20recommend" style="background-color: #6DC713; border-bottom: none;">Click here to join PyImageSearch University</a>
	</p>
</div>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Summary"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Summary">Summary</a></h2>



<p>We began by detailing how to train Kimi-K2 from scratch using DeepSeek-V3 components, emphasizing the architectural differences that set Kimi-K2 apart. We explored the model’s scale and sparsity, showing that reducing the number of attention heads allowed us to balance efficiency and performance. A key part of this journey was the introduction of the MuonClip optimizer, which stabilizes training while pushing the limits of large-scale language modeling.</p>



<p>We then turned to the challenges of token efficiency and the attention logit explosion problem. To address these, we introduced the QK-Clip innovation, which helped us control runaway logits and improve overall stability. Alongside this, we refined our training data pipeline, focusing on token utility and knowledge data rephrasing to ensure that every token contributed meaningfully to the model’s learning process. These improvements allowed us to maximize the value of the data while keeping training efficient.</p>



<p>Finally, we described the implementation details, including enhanced multi-head latent attention with max logit tracking and the practical integration of the MuonClip optimizer. We concluded with a complete training setup, showing how all these innovations came together to make Kimi-K2 a robust, efficient, and scalable model. By combining architectural refinements, optimizer breakthroughs, and data improvements, this lesson demonstrated how these techniques push the boundaries of what’s possible in modern language model training.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h3-Citation-Information"/>



<h3 class="wp-block-heading"><a href="#TOC-h3-Citation-Information">Citation Information</a></h3>



<p><strong>Mangla, P</strong><strong>. </strong>“Building and Training a Kimi-K2 Model Using DeepSeek-V3 Components,” <em>PyImageSearch</em>, S. Huot, A. Sharma, and P. Thakur, eds., 2026, <a href="https://pyimg.co/d3tge" target="_blank" rel="noreferrer noopener">https://pyimg.co/d3tge</a> </p>



<pre class="EnlighterJSRAW" data-enlighter-language="raw" data-enlighter-theme="classic" data-enlighter-highlight="" data-enlighter-linenumbers="false" data-enlighter-lineoffset="" data-enlighter-title="Building and Training a Kimi-K2 Model Using DeepSeek-V3 Components" data-enlighter-group="12">@incollection{Mangla_2026_building-training-kimi-k2-model-using-deepseek-v3,
  author = {Puneet Mangla},
  title = {{Building and Training a Kimi-K2 Model Using DeepSeek-V3 Components}},
  booktitle = {PyImageSearch},
  editor = {Susan Huot and Aditya Sharma and Piyush Thakur},
  year = {2026},
  url = {https://pyimg.co/d3tge},
}
</pre>



<p><strong>To download the source code to this post (and be notified when future tutorials are published here on PyImageSearch), </strong><em><strong>simply enter your email address in the form below!</strong></em></p>



<div id="download-the-code" class="post-cta-wrap">
<div class="gpd-post-cta">
	<div class="gpd-post-cta-content">
		

			<div class="gpd-post-cta-top">
				<div class="gpd-post-cta-top-image"><img decoding="async" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?lossy=2&strip=1&webp=1" alt="" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?lossy=2&strip=1&webp=1 410w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?size=126x174&lossy=2&strip=1&webp=1 126w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?size=252x348&lossy=2&strip=1&webp=1 252w" sizes="(max-width: 410px) 100vw, 410px" /></div>
				
				<div class="gpd-post-cta-top-title"><h4>Download the Source Code and FREE 17-page Resource Guide</h4></div>
				<div class="gpd-post-cta-top-desc"><p>Enter your email address below to get a .zip of the code and a <strong>FREE 17-page Resource Guide on Computer Vision, OpenCV, and Deep Learning.</strong> Inside you'll find my hand-picked tutorials, books, courses, and libraries to help you master CV and DL!</p></div>


			</div>

			<div class="gpd-post-cta-bottom">
				<form id="footer-cta-code" class="footer-cta" action="https://www.getdrip.com/forms/4130035/submissions" method="post" target="blank" data-drip-embedded-form="4130035">
					<input name="fields[email]" type="email" value="" placeholder="Your email address" class="form-control" />

					<button type="submit">Download the code!</button>

					<div style="display: none;" aria-hidden="true"><label for="website">Website</label><br /><input type="text" id="website" name="website" tabindex="-1" autocomplete="off" value="" /></div>
				</form>
			</div>


		
	</div>

</div>
</div>
<p>The post <a rel="nofollow" href="https://pyimagesearch.com/2026/05/11/building-and-training-a-kimi-k2-model-using-deepseek-v3-components/">Building and Training a Kimi-K2 Model Using DeepSeek-V3 Components</a> appeared first on <a rel="nofollow" href="https://pyimagesearch.com">PyImageSearch</a>.</p>
]]></content:encoded>
					
		
		
			</item>
		<item>
		<title>Semantic Caching for LLMs: TTLs, Confidence, and Cache Safety</title>
		<link>https://pyimagesearch.com/2026/05/04/semantic-caching-for-llms-ttls-confidence-and-cache-safety/</link>
		
		<dc:creator><![CDATA[Vikram Singh]]></dc:creator>
		<pubDate>Mon, 04 May 2026 12:45:00 +0000</pubDate>
				<category><![CDATA[Artificial Intelligence]]></category>
		<category><![CDATA[LLMOps]]></category>
		<category><![CDATA[Machine Learning]]></category>
		<category><![CDATA[MLOps]]></category>
		<category><![CDATA[Tutorial]]></category>
		<category><![CDATA[cache poisoning]]></category>
		<category><![CDATA[cache ttl]]></category>
		<category><![CDATA[confidence scoring]]></category>
		<category><![CDATA[deduplication]]></category>
		<category><![CDATA[fastapi]]></category>
		<category><![CDATA[llm caching]]></category>
		<category><![CDATA[llm optimization]]></category>
		<category><![CDATA[llmops]]></category>
		<category><![CDATA[production llm]]></category>
		<category><![CDATA[python]]></category>
		<category><![CDATA[redis]]></category>
		<category><![CDATA[semantic caching]]></category>
		<category><![CDATA[tutorial]]></category>
		<guid isPermaLink="false">https://pyimagesearch.com/?p=53619</guid>

					<description><![CDATA[<p>Table of Contents Semantic Caching for LLMs: TTLs, Confidence, and Cache Safety Why Semantic Caching for LLMs Requires Production Hardening Cache TTL in Semantic Caching: Preventing Stale LLM Responses MLOps Project Structure for Semantic Caching with FastAPI and Redis How&#8230;</p>
<p>The post <a rel="nofollow" href="https://pyimagesearch.com/2026/05/04/semantic-caching-for-llms-ttls-confidence-and-cache-safety/">Semantic Caching for LLMs: TTLs, Confidence, and Cache Safety</a> appeared first on <a rel="nofollow" href="https://pyimagesearch.com">PyImageSearch</a>.</p>
]]></description>
										<content:encoded><![CDATA[<div class="yoast-breadcrumbs"><span><span><a href="https://pyimagesearch.com/">Home</a></span></span></div>


<div class="toc">
<hr class="TOC"/>
<p class="has-large-font-size"><strong>Table of Contents</strong></p>
<ul>
    <li id="TOC-h1-Semantic-Caching-LLMs-TTLs-Confidence-Cache-Safety"><a rel="noopener" target="_blank" href="#h1-Semantic-Caching-LLMs-TTLs-Confidence-Cache-Safety">Semantic Caching for LLMs: TTLs, Confidence, and Cache Safety</a></li>

    <li id="TOC-h2-Why-Semantic-Caching-LLMs-Requires-Production-Hardening"><a rel="noopener" target="_blank" href="#h2-Why-Semantic-Caching-LLMs-Requires-Production-Hardening">Why Semantic Caching for LLMs Requires Production Hardening</a></li>

    <li id="TOC-h2-Cache-TTL-Semantic-Caching-Preventing-Stale-LLM-Responses"><a rel="noopener" target="_blank" href="#h2-Cache-TTL-Semantic-Caching-Preventing-Stale-LLM-Responses">Cache TTL in Semantic Caching: Preventing Stale LLM Responses</a></li>

    <li id="TOC-h2-MLOps-Project-Structure-Semantic-Caching-FastAPI-Redis"><a rel="noopener" target="_blank" href="#h2-MLOps-Project-Structure-Semantic-Caching-FastAPI-Redis">MLOps Project Structure for Semantic Caching with FastAPI and Redis</a></li>

    <li id="TOC-h2-How-Implement-Cache-TTL-Validation-Python-Redis"><a rel="noopener" target="_blank" href="#h2-How-Implement-Cache-TTL-Validation-Python-Redis">How to Implement Cache TTL Validation in Python and Redis</a></li>

    <li id="TOC-h2-Confidence-Scoring-Semantic-Caching-Beyond-Similarity-LLMs"><a rel="noopener" target="_blank" href="#h2-Confidence-Scoring-Semantic-Caching-Beyond-Similarity-LLMs">Confidence Scoring in Semantic Caching: Beyond Similarity for LLMs</a></li>

    <li id="TOC-h2-Implementing-Confidence-Scoring-LLM-Cache-Optimization-Code-Walkthrough"><a rel="noopener" target="_blank" href="#h2-Implementing-Confidence-Scoring-LLM-Cache-Optimization-Code-Walkthrough">Implementing Confidence Scoring for LLM Cache Optimization (Code Walkthrough)</a></li>

    <li id="TOC-h2-Query-Normalization-Deduplication-Efficient-Semantic-Caching"><a rel="noopener" target="_blank" href="#h2-Query-Normalization-Deduplication-Efficient-Semantic-Caching">Query Normalization and Deduplication for Efficient Semantic Caching</a></li>

    <li id="TOC-h2-Preventing-Cache-Poisoning-Semantic-Caching-LLM-Systems"><a rel="noopener" target="_blank" href="#h2-Preventing-Cache-Poisoning-Semantic-Caching-LLM-Systems">Preventing Cache Poisoning in Semantic Caching for LLM Systems</a></li>

    <li id="TOC-h2-End-to-End-Semantic-Cache-Hardening-TTL-Confidence-Safety-Demos"><a rel="noopener" target="_blank" href="#h2-End-to-End-Semantic-Cache-Hardening-TTL-Confidence-Safety-Demos">End-to-End Semantic Cache Hardening: TTL, Confidence, and Safety Demos</a></li>

    <li id="TOC-h2-Semantic-Caching-Limitations-Trade-Offs-LLM-Optimization-Systems"><a rel="noopener" target="_blank" href="#h2-Semantic-Caching-Limitations-Trade-Offs-LLM-Optimization-Systems">Semantic Caching Limitations: Trade-Offs in LLM Optimization Systems</a></li>

    <li id="TOC-h2-Summary"><a rel="noopener" target="_blank" href="#h2-Summary">Summary</a></li>
</ul>
</div>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h1-Semantic-Caching-LLMs-TTLs-Confidence-Cache-Safety"/>



<h2 class="wp-block-heading"><a href="#TOC-h1-Semantic-Caching-LLMs-TTLs-Confidence-Cache-Safety">Semantic Caching for LLMs: TTLs, Confidence, and Cache Safety</a></h2>



<p>In this lesson, you will learn how to harden a semantic cache for LLMs, one of the most important LLMOps patterns for reducing redundant inference costs, and move from a working semantic caching prototype to a system that can survive real-world usage with TTL validation, confidence scoring, deduplication, and cache poisoning prevention.</p>


<div class="wp-block-image">
<figure class="aligncenter size-full"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/semantic-caching-llms-ttls-confidence-cache-safety-feature.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="940" height="780" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/semantic-caching-llms-ttls-confidence-cache-safety-feature.png?lossy=2&strip=1&webp=1" alt="semantic-caching-llms-ttls-confidence-cache-safety-feature.png" class="wp-image-53650" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/semantic-caching-llms-ttls-confidence-cache-safety-feature.png?size=126x105&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/semantic-caching-llms-ttls-confidence-cache-safety-feature-300x249.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/semantic-caching-llms-ttls-confidence-cache-safety-feature.png?size=378x314&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/semantic-caching-llms-ttls-confidence-cache-safety-feature.png?size=504x418&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/semantic-caching-llms-ttls-confidence-cache-safety-feature.png?size=630x523&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/semantic-caching-llms-ttls-confidence-cache-safety-feature-768x637.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/semantic-caching-llms-ttls-confidence-cache-safety-feature.png?lossy=2&amp;strip=1&amp;webp=1 940w" sizes="(max-width: 630px) 100vw, 630px" /></a></figure></div>


<p>This lesson is the last in a 2-part series on <strong>Semantic Caching for LLMs</strong>:</p>



<ol class="wp-block-list">
<li><em><strong><a href="https://pyimg.co/yso6f" target="_blank" rel="noreferrer noopener">Semantic Caching for LLMs: FastAPI, Redis, and Embeddings</a></strong></em></li>



<li><strong><em><a href="https://pyimg.co/ahr3p" target="_blank" rel="noreferrer noopener">Semantic Caching for LLMs: TTLs, Confidence, and Cache Safety</a></em></strong><strong> (this tutorial)</strong></li>
</ol>



<p><strong>To learn how to harden a semantic cache for LLMs and make it safe, reliable, and production-ready, </strong><em><strong>just keep reading.</strong></em></p>



<div id="pyi-source-code-block" class="source-code-wrap"><div class="gpd-source-code">
    <div class="gpd-source-code-content">
        <img decoding="async" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/source-code-icon.png?lossy=2&strip=1&webp=1" alt="">
        <h4>Looking for the source code to this post?</h4>
                    <a href="#download-the-code" class="pyis-cta-modal-open-modal">Jump Right To The Downloads Section <svg class="svg-icon arrow-right" width="12" height="12" aria-hidden="true" role="img" focusable="false" viewBox="0 0 14 14" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M6.8125 0.1875C6.875 0.125 6.96875 0.09375 7.09375 0.09375C7.1875 0.09375 7.28125 0.125 7.34375 0.1875L13.875 6.75C13.9375 6.8125 14 6.90625 14 7C14 7.125 13.9375 7.1875 13.875 7.25L7.34375 13.8125C7.28125 13.875 7.1875 13.9062 7.09375 13.9062C6.96875 13.9062 6.875 13.875 6.8125 13.8125L6.1875 13.1875C6.125 13.125 6.09375 13.0625 6.09375 12.9375C6.09375 12.8438 6.125 12.75 6.1875 12.6562L11.0312 7.8125H0.375C0.25 7.8125 0.15625 7.78125 0.09375 7.71875C0.03125 7.65625 0 7.5625 0 7.4375V6.5625C0 6.46875 0.03125 6.375 0.09375 6.3125C0.15625 6.25 0.25 6.1875 0.375 6.1875H11.0312L6.1875 1.34375C6.125 1.28125 6.09375 1.1875 6.09375 1.0625C6.09375 0.96875 6.125 0.875 6.1875 0.8125L6.8125 0.1875Z" fill="#169FE6"></path></svg></a>
            </div>
</div>
</div>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Why-Semantic-Caching-LLMs-Requires-Production-Hardening"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Why-Semantic-Caching-LLMs-Requires-Production-Hardening">Why Semantic Caching for LLMs Requires Production Hardening</a></h2>



<p>In Lesson 1, we built a semantic cache that works end-to-end. It correctly avoids redundant LLM calls, reuses responses for identical queries, and even handles paraphrased inputs via semantic similarity. For many tutorials, that would be the end of the story.</p>



<p>In real systems, however, working is only the starting point.</p>



<p>A semantic cache that works under ideal conditions can still fail in subtle and dangerous ways when exposed to real users, long-running processes, and evolving information. These failures do not usually appear as crashes or explicit errors. Instead, they show up as <strong>silent correctness issues</strong>, degraded user trust, and unpredictable behavior over time.</p>



<h3 class="wp-block-heading">What Lesson 1 Solved — and What It Didn’t</h3>



<p>Lesson 1 focused on the <strong>correctness of flow</strong>:</p>



<ul class="wp-block-list">
<li>Requests move through exact match → semantic match → LLM fallback (generation)</li>



<li>Cached responses are reused when appropriate</li>



<li>The system is observable and debuggable</li>



<li>Nothing is hidden behind abstractions</li>
</ul>



<p>What it intentionally did not address was <strong>long-term safety</strong>.</p>



<p>We did not ask:</p>



<ul class="wp-block-list">
<li><em>How old is this cached response, and should we still trust it?</em></li>



<li><em>What happens if the LLM returns an error or partial output?</em></li>



<li><em>What if the cache slowly fills with duplicates?</em></li>



<li><em>What if similarity is high but the answer is no longer valid?</em></li>
</ul>



<p>Those questions only matter once the system runs for days or weeks, not minutes.</p>



<h3 class="wp-block-heading">Real-World Failure Modes in Semantic Caching</h3>



<p>Semantic caching introduces failure modes that rarely exist in traditional exact-match caches.</p>



<p>For example:</p>



<ul class="wp-block-list">
<li>A cached answer with very high similarity may still be <strong>stale</strong></li>



<li>An error response may be accidentally cached and reused</li>



<li>Slight variations of the same query may create <strong>duplicate entries</strong></li>



<li>Old but similar answers may appear correct while being subtly wrong</li>
</ul>



<p>None of these issues breaks the system outright. Instead, they quietly degrade correctness and user trust over time.</p>



<p>These are the hardest bugs to detect because the system continues to respond quickly and confidently.</p>



<h3 class="wp-block-heading">Why “It Works” Does Not Mean “It’s Safe”</h3>



<p>A semantic cache sits directly in the decision path of an LLM system. When it makes a mistake, that mistake is amplified through reuse.</p>



<p>If an unsafe response enters the cache:</p>



<ul class="wp-block-list">
<li>It can be served repeatedly</li>



<li>It can outlive the conditions that made it valid</li>



<li>It can be returned with high confidence</li>
</ul>



<p>This is why semantic caching requires <strong>more discipline</strong>, not less, than direct LLM calls.</p>



<p>In this lesson, we will take the working system from Lesson 1 and begin hardening it. We will introduce explicit safeguards for staleness, confidence, duplication, and safety — without changing the core architecture.</p>



<p>The goal is not to make the system perfect, but to make its failures <strong>controlled, visible, and predictable</strong>.</p>



<p>That is the difference between a demo and a system you can trust.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Cache-TTL-Semantic-Caching-Preventing-Stale-LLM-Responses"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Cache-TTL-Semantic-Caching-Preventing-Stale-LLM-Responses">Cache TTL in Semantic Caching: Preventing Stale LLM Responses</a></h2>



<p>Once a semantic cache is deployed and begins reusing LLM responses, a new question immediately arises:</p>



<p><em>How long should a cached response be trusted?</em></p>



<p>Unlike traditional caches that store deterministic outputs, semantic caches store model-generated answers. These answers are only valid within a certain window of time and context. Without explicit controls, a semantic cache can continue serving responses that are technically valid but practically wrong.</p>



<p>This section explains <strong>why cached LLM responses become stale</strong>, <strong>how TTLs help</strong>, and <strong>what it means for a cache entry to be unsafe</strong>.</p>



<h3 class="wp-block-heading">Why Cached LLM Responses Become Stale</h3>



<p>LLM responses are not timeless.</p>



<p>They are influenced by:</p>



<ul class="wp-block-list">
<li>evolving APIs and libraries</li>



<li>changing business logic or documentation</li>



<li>updated prompts or system behavior</li>



<li>newly introduced edge cases</li>
</ul>



<p>A cached answer that was correct an hour ago may no longer reflect the current state of the world.</p>



<p>Semantic caching amplifies this risk because:</p>



<ul class="wp-block-list">
<li>responses are reused aggressively</li>



<li>high similarity can mask outdated content</li>



<li>cached answers are returned with confidence</li>
</ul>



<p>Without staleness controls, the cache slowly becomes a <strong>museum of old truths</strong>.</p>



<h3 class="wp-block-heading">TTL as a Safety Mechanism</h3>



<p>A <strong>time-to-live (TTL)</strong> specifies how long a cache entry remains valid.</p>



<p>Once the TTL expires:</p>



<ul class="wp-block-list">
<li>the entry is treated as unsafe</li>



<li>it should no longer be reused</li>



<li>a fresh LLM response must be generated</li>
</ul>



<p>TTL does not guarantee correctness, but it <strong>limits the blast radius of staleness</strong>.</p>



<p>In semantic caching, TTL is not an optimization. It is a <strong>correctness safeguard</strong>.</p>



<h3 class="wp-block-heading">Application-Level TTL vs. Redis EXPIRE</h3>



<p>There are 2 common ways to implement TTLs when using Redis:</p>



<h4 class="wp-block-heading">Redis EXPIRE</h4>



<ul class="wp-block-list">
<li>Redis automatically deletes keys after a fixed duration</li>



<li>Expired entries are removed entirely</li>



<li>The application has no visibility into expired data</li>
</ul>



<h4 class="wp-block-heading">Application-Level TTL (Used Here)</h4>



<ul class="wp-block-list">
<li>Entries remain stored in Redis</li>



<li>Expiration is checked at read time by the application</li>



<li>The application decides whether an entry is safe to reuse</li>
</ul>



<p>In this system, TTL is enforced at the application layer rather than through Redis’s native EXPIRE command — a deliberate choice that prioritizes observability over automation.</p>



<p>This choice allows us to:</p>



<ul class="wp-block-list">
<li>inspect expired entries during debugging</li>



<li>apply custom expiration logic</li>



<li>combine TTL with other safety signals (such as confidence)</li>
</ul>



<p>We trade automatic deletion for <strong>control and observability</strong>.</p>



<h3 class="wp-block-heading">When a Cache Entry Becomes Unsafe</h3>



<p>In this system, a cache entry is considered unsafe when <strong>any</strong> of the following is true:</p>



<ul class="wp-block-list">
<li>its TTL has expired</li>



<li>its content is malformed or erroneous</li>



<li>its confidence score falls below an acceptable threshold</li>
</ul>



<p>TTL is the first and most basic of these checks.</p>



<p>If an entry fails the TTL check, semantic similarity is irrelevant.</p>



<p>Reusing it would prioritize speed over correctness.</p>



<h3 class="wp-block-heading">Designing TTLs for LLM Workloads</h3>



<p>There is no universal “correct” TTL for LLM responses.</p>



<p>Instead, TTLs should be chosen based on:</p>



<ul class="wp-block-list">
<li>how fast the underlying information changes</li>



<li>how costly incorrect answers are</li>



<li>how frequently similar queries appear</li>
</ul>



<p>Short TTLs:</p>



<ul class="wp-block-list">
<li>reduce staleness risk</li>



<li>increase LLM calls</li>
</ul>



<p>Long TTLs:</p>



<ul class="wp-block-list">
<li>improve cache hit rate</li>



<li>increase risk of outdated responses</li>
</ul>



<p>In Lesson 1, we used a conservative default TTL to keep behavior predictable. In this lesson, we will focus on <strong>how TTLs are enforced</strong> rather than on tuning them for a specific domain.</p>



<p>TTL design is a policy decision. TTL enforcement is a correctness requirement.</p>



<hr class="wp-block-separator has-alpha-channel-opacity"/>



<p>Would you like immediate access to 3,457 images curated and labeled with hand gestures to train, explore, and experiment with &#8230; for free? Head over to <a href="https://universe.roboflow.com/isl/az-6mqow?ref=pyimagesearch" target="_blank" rel="noreferrer noopener">Roboflow</a> and get a free account to grab these hand gesture images. </p>



<hr class="wp-block-separator has-alpha-channel-opacity"/>



<!-- wp:paragraph -->
<h3>Need Help Configuring Your Development Environment?</h3>
<!-- /wp:paragraph -->

<!-- wp:image {"align":"center","id":18137,"sizeSlug":"large","linkDestination":"custom"} -->
<figure class="wp-block-image aligncenter size-large"><a href="https://pyimagesearch.com/pyimagesearch-university/" target="_blank" rel="noreferrer noopener"><img decoding="async" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-18137" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?lossy=2&strip=1&webp=1 500w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?size=126x84&lossy=2&strip=1&webp=1 126w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?size=252x168&lossy=2&strip=1&webp=1 252w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?size=378x253&lossy=2&strip=1&webp=1 378w" sizes="(max-width: 500px) 100vw, 500px" /></a><figcaption>Having trouble configuring your development environment? Want access to pre-configured Jupyter Notebooks running on Google Colab? Be sure to join <a href="https://pyimagesearch.com/pyimagesearch-university/" target="_blank" rel="noreferrer noopener" aria-label=" (opens in a new tab)">PyImageSearch University</a> — you will be up and running with this tutorial in a matter of minutes. </figcaption></figure>
<!-- /wp:image -->

<!-- wp:paragraph -->
<p>All that said, are you:</p>
<!-- /wp:paragraph -->

<!-- wp:list -->
<ul><li>Short on time?</li><li>Learning on your employer’s administratively locked system?</li><li>Wanting to skip the hassle of fighting with the command line, package managers, and virtual environments?</li><li><strong>Ready to run the code immediately on your Windows, macOS, or Linux system?</strong></li></ul>
<!-- /wp:list -->

<!-- wp:paragraph -->
<p>Then join <a href="https://pyimagesearch.com/pyimagesearch-university/" target="_blank" rel="noreferrer noopener">PyImageSearch University</a> today!</p>
<!-- /wp:paragraph -->

<!-- wp:paragraph -->
<p><strong>Gain access to Jupyter Notebooks for this tutorial and other PyImageSearch guides pre-configured to run on Google Colab’s ecosystem right in your web browser!</strong> No installation required.</p>
<!-- /wp:paragraph -->

<!-- wp:paragraph -->
<p>And best of all, these Jupyter Notebooks will run on Windows, macOS, and Linux!</p>
<!-- /wp:paragraph -->



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-MLOps-Project-Structure-Semantic-Caching-FastAPI-Redis"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-MLOps-Project-Structure-Semantic-Caching-FastAPI-Redis">MLOps Project Structure for Semantic Caching with FastAPI and Redis</a></h2>



<p>Before diving into individual components, let’s take a moment to understand how the project is organized.</p>



<p>A clear directory structure is especially important in LLM-backed systems, where responsibilities span API orchestration, caching, embeddings, model calls, and observability. In this project, each concern is isolated into its own module so the request flow remains easy to trace and reason about.</p>



<p>After downloading the source code from the “Downloads” section, your directory structure should look like this:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: TTLs, Confidence, and Cache Safety" data-enlighter-group="1">.
├── app
│   ├── api
│   │   ├── __init__.py
│   │   └── ask.py
│   ├── cache
│   │   ├── __init__.py
│   │   ├── poisoning.py
│   │   ├── schemas.py
│   │   ├── semantic_cache.py
│   │   └── ttl.py
│   ├── config
│   │   ├── __init__.py
│   │   └── settings.py
│   ├── embeddings
│   │   ├── __init__.py
│   │   └── embedder.py
│   ├── llm
│   │   ├── __init__.py
│   │   └── ollama_client.py
│   ├── main.py
│   └── observability
│       └── metrics.py
├── complete-codebase.txt
├── docker-compose.yml
├── Dockerfile
├── README.md
└── requirements.txt
</pre>



<p>Let’s break this down at a high level.</p>



<h3 class="wp-block-heading">The app/ Package</h3>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">app/</code> directory contains all runtime application code. Nothing outside this folder is imported at runtime.</p>



<p>This keeps the service self-contained and makes it easy to reason about deployment and dependencies.</p>



<h3 class="wp-block-heading">app/main.py: Application Entry Point</h3>



<p>This file defines the FastAPI application and registers all routers.</p>



<p>It contains <strong>no business logic</strong> — only service wiring. Every request to the system enters through this file.</p>



<h3 class="wp-block-heading">app/api/: API Layer</h3>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">api/</code> package defines HTTP-facing endpoints.</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">ask.py</code>: Implements the <code data-enlighter-language="python" class="EnlighterJSRAW">/ask</code> endpoint and acts as the orchestration layer for the entire semantic caching pipeline.</li>
</ul>



<p>The API layer is responsible for:</p>



<ul class="wp-block-list">
<li>validating input</li>



<li>enforcing cache ordering</li>



<li>coordinating cache, embeddings, and LLM calls</li>



<li>returning structured debug information</li>
</ul>



<p>It does not implement caching or similarity logic directly.</p>



<h3 class="wp-block-heading">app/cache/: Caching Logic</h3>



<p>This package contains all cache-related functionality.</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">semantic_cache.py</code>: Core semantic cache implementation (exact match, semantic match, Redis storage, similarity search).</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">schemas.py</code>: Defines the cache entry schema used for Redis storage.</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">ttl.py</code>: Application-level TTL configuration and expiration checks.</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">poisoning.py</code>: Safety checks to prevent invalid or error responses from being reused.</li>
</ul>



<p>By isolating caching logic here, the API layer stays clean and reusable.</p>



<h3 class="wp-block-heading">app/embeddings/: Embedding Generation</h3>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">embedder.py</code>: Handles embedding generation via Ollama’s embedding endpoint.</li>
</ul>



<p>This module has a single responsibility: converting text into semantic vectors.</p>



<p>It does not cache, rank, or validate embeddings.</p>



<h3 class="wp-block-heading">app/llm/: LLM Client</h3>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">ollama_client.py</code>: Wraps calls to the Ollama text-generation endpoint.</li>
</ul>



<p>Isolating LLM interaction allows the rest of the system to remain model-agnostic.</p>



<h3 class="wp-block-heading">app/observability/: Metrics</h3>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">metrics.py</code>: Implements simple in-memory counters for cache hits, misses, and LLM calls.</li>
</ul>



<p>These metrics are intentionally lightweight and meant for learning and debugging, not production monitoring.</p>



<h3 class="wp-block-heading">Configuration and Infrastructure</h3>



<p>Configuration lives inside the package, while the infrastructure files sit at the repository root, outside the <code data-enlighter-language="python" class="EnlighterJSRAW">app/</code> directory:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">app/config/settings.py</code>: Centralizes environment-based configuration (Redis host, TTLs, model names).</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">Dockerfile</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">docker-compose.yml</code>: Define a reproducible runtime environment for the API and Redis.</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">requirements.txt</code>: Lists all Python dependencies required to run the service.</li>
</ul>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-How-Implement-Cache-TTL-Validation-Python-Redis"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-How-Implement-Cache-TTL-Validation-Python-Redis">How to Implement Cache TTL Validation in Python and Redis</a></h2>



<p>In the previous section, we discussed <em>why</em> cached LLM responses become stale and <em>why</em> TTLs are necessary. In this section, we move from concept to code and look at <strong>how TTL validation is enforced in practice</strong>.</p>



<p>The key idea is simple but important:</p>



<p><strong>Cache entries are not deleted automatically. They are validated at read time.</strong></p>



<p>This design choice keeps cache behavior explicit, observable, and safe.</p>



<h3 class="wp-block-heading">The Default TTL Configuration</h3>



<p>TTL configuration is centralized in a single helper function:</p>



<p><strong>File:</strong> <code data-enlighter-language="python" class="EnlighterJSRAW">app/cache/ttl.py</code></p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: TTLs, Confidence, and Cache Safety" data-enlighter-group="2">def default_ttl():
    return settings.CACHE_TTL_SECONDS
</pre>



<p>Rather than hardcoding a value, the function loads the TTL from configuration. This allows different environments to use different TTLs without changing the code.</p>



<p>At this stage, the specific TTL value is not important. What matters is that:</p>



<ul class="wp-block-list">
<li>every cache entry receives a TTL at creation time</li>



<li>TTL is treated as metadata, not as a Redis feature</li>
</ul>



<h3 class="wp-block-heading">Checking Whether an Entry Has Expired</h3>



<p>TTL enforcement happens through a dedicated validation function:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: TTLs, Confidence, and Cache Safety" data-enlighter-group="3">def is_expired(entry):
    try:
        created_at = int(entry["created_at"])
        ttl = int(entry["ttl"])
        now = int(time.time())
        return now > (created_at + ttl)
    except (KeyError, ValueError, TypeError):
        return True
</pre>



<p>This function answers 1 question:</p>



<p><strong>Is this cache entry still safe to reuse?</strong></p>



<p>If the current time exceeds <code data-enlighter-language="python" class="EnlighterJSRAW">created_at + ttl</code>, the entry is considered expired and must not be reused.</p>



<h3 class="wp-block-heading">Fail-Safe Expiration Behavior</h3>



<p>Notice the exception handling at the end of <code data-enlighter-language="python" class="EnlighterJSRAW">is_expired()</code>.</p>



<p>If the entry:</p>



<ul class="wp-block-list">
<li>is missing required fields</li>



<li>contains malformed values</li>



<li>cannot be parsed safely</li>
</ul>



<p>…it is treated as <strong>expired by default</strong>.</p>



<p>This is a deliberate fail-safe design.</p>



<p>When dealing with cached LLM responses, <strong>silently trusting malformed data is more dangerous than recomputing a response</strong>. If the system is unsure, it expires the entry and falls back to the LLM.</p>



<p>Correctness always wins over reuse.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-2-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="439" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-2-1024x439.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53631" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-2.png?size=126x54&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-2-300x129.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-2.png?size=378x162&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-2.png?size=504x216&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-2.png?size=630x270&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-2-768x329.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-2-1024x439.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-2-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-2-1536x659.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 1:</strong> Application-level TTL validation for semantic cache entries. Cached responses are reused only within their TTL window and are rejected at read time once expired (source: image by the author).</figcaption></figure></div>


<h3 class="wp-block-heading">Best-Effort Cleanup During Cache Reads</h3>



<p>TTL validation does more than reject expired entries — it also performs <strong>opportunistic cleanup</strong> during cache searches.</p>



<p>Inside the semantic cache search logic:</p>



<ul class="wp-block-list">
<li>expired entries are detected</li>



<li>expired keys are removed from Redis</li>



<li>the cache continues scanning remaining entries</li>
</ul>



<p>This cleanup happens:</p>



<ul class="wp-block-list">
<li>without background workers</li>



<li>without scheduled jobs</li>



<li>without blocking the request</li>
</ul>



<p>This is not a full garbage collector. It is a <strong>best-effort hygiene mechanism</strong> that keeps the cache from accumulating junk over time.</p>



<h3 class="wp-block-heading">Why We Validate on Read, Not Delete on Write</h3>



<p>At this point, a natural question arises:</p>



<p><em>Why not just use Redis EXPIRE and let Redis delete entries automatically?</em></p>



<p>There are 3 reasons this system validates TTLs <strong>on read</strong> instead:</p>



<ul class="wp-block-list">
<li><strong>Visibility: </strong>Expired entries stay in Redis until a later cache read cleans them up, so they remain inspectable during debugging.</li>



<li><strong>Control: </strong>The application decides what “expired” means, not Redis.</li>



<li><strong>Composability: </strong>TTL checks can be combined with confidence scoring, poisoning detection, and other safety signals.</li>
</ul>



<p>By validating at read time, TTL becomes part of the decision-making pipeline rather than an invisible background mechanism.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Confidence-Scoring-Semantic-Caching-Beyond-Similarity-LLMs"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Confidence-Scoring-Semantic-Caching-Beyond-Similarity-LLMs">Confidence Scoring in Semantic Caching: Beyond Similarity for LLMs</a></h2>



<p>Up to this point, semantic caching decisions have relied heavily on <strong>semantic similarity</strong>. If a cached response is similar enough to a new query, it feels reasonable to reuse it.</p>



<p>In practice, this assumption breaks down.</p>



<p>High similarity answers an important question — <em>“Is this response about the same thing?” </em>— but it does <strong>not</strong> answer an equally important one:</p>



<p><em>“Is this response still safe to reuse right now?”</em></p>



<p>Confidence scoring exists to bridge that gap.</p>



<h3 class="wp-block-heading">Why High Similarity Can Still Be Wrong</h3>



<p>Semantic similarity measures closeness in meaning, not correctness over time.</p>



<p>Consider a cached response that:</p>



<ul class="wp-block-list">
<li>has very high embedding similarity to the current query</li>



<li>was generated hours or days ago</li>



<li>refers to information that has since changed</li>
</ul>



<p>From a vector perspective, the response still appears “correct.”</p>



<p>From a system perspective, it may no longer be trustworthy.</p>



<p>This problem is subtle because:</p>



<ul class="wp-block-list">
<li>similarity scores remain high</li>



<li>responses look fluent and confident</li>



<li>failures are silent rather than catastrophic</li>
</ul>



<p>Without an additional signal, the cache has no way to distinguish <em>relevant but stale</em> from <em>relevant and safe</em>.</p>



<h3 class="wp-block-heading">Combining Semantic Similarity with Freshness</h3>



<p>Confidence scoring introduces a second dimension: <strong>freshness</strong>.</p>



<p>Rather than deciding reuse based on similarity alone, the cache evaluates a combined signal that reflects:</p>



<ul class="wp-block-list">
<li>how semantically close the response is</li>



<li>how recently the response was generated</li>
</ul>



<p>At a high level, confidence answers the question:</p>



<p><em>“How comfortable are we reusing this response right now?”</em></p>



<p>Fresh responses with high similarity receive a high confidence score.</p>



<p>Old responses, even with high similarity, gradually lose confidence as they age.</p>



<p>This ensures that time acts as a natural decay mechanism.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/05/image-3-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="553" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-3-1024x553.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53633" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-3.png?size=126x68&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-3-300x162.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-3.png?size=378x204&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-3.png?size=504x272&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-3.png?size=630x340&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-3-768x415.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-3-1024x553.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-3-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/05/image-3-1536x830.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 2:</strong> Confidence scoring combines semantic similarity with freshness. Even highly similar cached responses lose confidence over time and are eventually rejected (source: image by the author).</figcaption></figure></div>


<h3 class="wp-block-heading">Understanding the Confidence Score (High-Level)</h3>



<p>In this system, confidence is a <strong>weighted combination</strong> of:</p>



<ul class="wp-block-list">
<li>semantic similarity</li>



<li>freshness relative to TTL</li>
</ul>



<p>You do not need to think about exact formulas at this stage. What matters is the behavior:</p>



<ul class="wp-block-list">
<li>Confidence starts high when an entry is created</li>



<li>Confidence decreases as the entry ages</li>



<li>Confidence is driven primarily by semantic similarity</li>



<li>Expired entries always fail confidence checks</li>
</ul>



<p>Confidence is not a probability. It is a <strong>reuse heuristic</strong> designed to favor correctness over speed.</p>



<h3 class="wp-block-heading">How Confidence Affects Cache Reuse Decisions</h3>



<p>Confidence scoring acts as a <strong>gatekeeper</strong> in the cache pipeline.</p>



<p>Even if:</p>



<ul class="wp-block-list">
<li>the entry is not expired</li>



<li>the semantic similarity is above threshold</li>
</ul>



<p>…the cache will <strong>reject reuse</strong> if confidence falls below an acceptable level.</p>



<p>When this happens:</p>



<ul class="wp-block-list">
<li>the cache treats the entry as unsafe</li>



<li>the request falls back to the LLM</li>



<li>a fresh response is generated and stored</li>
</ul>



<p>This behavior ensures that the cache degrades gracefully.</p>



<p>As uncertainty increases, the system automatically shifts work back to the LLM rather than returning questionable results.</p>



<h3 class="wp-block-heading">Why Confidence Belongs in the Cache (Not the LLM)</h3>



<p>It’s tempting to push this logic downstream and let the LLM “fix” stale responses.</p>



<p>That approach fails for two reasons:</p>



<ul class="wp-block-list">
<li>the LLM has no context about cache age</li>



<li>the LLM cannot distinguish reused content from fresh inference</li>
</ul>



<p>Confidence must be enforced <strong>before reuse</strong>, not after generation.</p>



<p>By embedding confidence checks directly into the cache, we ensure that reuse decisions are explicit, explainable, and controllable.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Implementing-Confidence-Scoring-LLM-Cache-Optimization-Code-Walkthrough"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Implementing-Confidence-Scoring-LLM-Cache-Optimization-Code-Walkthrough">Implementing Confidence Scoring for LLM Cache Optimization (Code Walkthrough)</a></h2>



<p>In the previous section, we introduced confidence scoring as a conceptual safeguard: a way to prevent semantically similar but stale responses from being reused.</p>



<p>In this section, we make that idea concrete by implementing it.</p>



<p>We will walk through <strong>where confidence is computed</strong>, <strong>where it is enforced</strong>, and <strong>what happens when a cached entry is rejected</strong>.</p>



<h3 class="wp-block-heading">Where Confidence Is Computed</h3>



<p>Confidence is computed inside the semantic cache, alongside similarity scoring.</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: TTLs, Confidence, and Cache Safety" data-enlighter-group="4">def compute_confidence(similarity: float, created_at: int, ttl: int) -> float:
    age = time.time() - created_at

    if ttl &lt;= 0:
        freshness = 1.0
    else:
        freshness = max(0.0, 1.0 - (age / ttl))

    confidence = (0.7 * similarity) + (0.3 * freshness)
    return round(confidence, 3)
</pre>



<p>This function combines 2 signals:</p>



<ul class="wp-block-list">
<li><strong>Semantic similarity:</strong> how close the meanings are</li>



<li><strong>Freshness:</strong> how recent the response is relative to its TTL</li>
</ul>



<p>The exact weights are not important here. What matters is the behavior:</p>



<ul class="wp-block-list">
<li>Fresh, similar responses score high confidence</li>



<li>Old responses lose confidence over time</li>



<li>Expired entries collapse to low confidence</li>
</ul>



<p>Confidence is therefore <strong>bounded</strong>, <strong>decaying</strong>, and <strong>explicitly defined</strong>.</p>



<h3 class="wp-block-heading">Why Confidence Is Computed in the Cache</h3>



<p>Notice that confidence is computed <strong>inside the cache layer</strong>, not in the API.</p>



<p>This ensures:</p>



<ul class="wp-block-list">
<li>all reuse decisions are centralized</li>



<li>confidence logic is applied consistently</li>



<li>the API remains an orchestration layer, not a policy engine</li>
</ul>



<p>The API does not need to understand <em>how</em> confidence is computed — only <em>whether</em> it is acceptable.</p>



<h3 class="wp-block-heading">Where Confidence Is Enforced</h3>



<p>Confidence enforcement happens in the request pipeline in <code data-enlighter-language="python" class="EnlighterJSRAW">ask.py</code>.</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: TTLs, Confidence, and Cache Safety" data-enlighter-group="5">elif cached.get("confidence", 0.0) &lt; 0.7:
    miss_reason = "low_confidence"
</pre>



<p>This check occurs <strong>after</strong>:</p>



<ul class="wp-block-list">
<li>exact or semantic matching</li>



<li>TTL validation</li>



<li>poisoning checks</li>
</ul>



<p>And <strong>before</strong> a cached response is returned.</p>



<p>If confidence is below the threshold:</p>



<ul class="wp-block-list">
<li>the cache entry is rejected</li>



<li>the request is treated as a cache miss</li>



<li>the pipeline falls back to the LLM</li>
</ul>



<p>This ensures that reuse happens only when confidence meets an acceptable threshold.</p>



<h3 class="wp-block-heading">Why Rejection Is Safer Than Reuse</h3>



<p>When confidence is low, the system has 2 choices:</p>



<ul class="wp-block-list">
<li>reuse a response it does not fully trust</li>



<li>generate a fresh response</li>
</ul>



<p>This implementation always chooses the second option.</p>



<p>The cost of an extra LLM call is predictable.</p>



<p>The cost of serving an incorrect response is not.</p>



<p>By rejecting low-confidence entries, the cache degrades <strong>gracefully</strong> instead of failing silently.</p>



<h3 class="wp-block-heading">What Happens After Rejection</h3>



<p>Once a cached entry is rejected:</p>



<ul class="wp-block-list">
<li>the request proceeds to the LLM</li>



<li>a new response is generated</li>



<li>the new response is stored with a fresh timestamp and TTL</li>
</ul>



<p>Over time, this naturally refreshes the cache without requiring explicit invalidation logic.</p>



<h3 class="wp-block-heading">Making Rejections Observable</h3>



<p>Confidence-based rejections are not hidden.</p>



<p>They are surfaced via:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">miss_reason = "low_confidence"</code></li>



<li>debug metadata returned to the client</li>



<li>cache miss metrics</li>
</ul>



<p>This makes it possible to understand <em>why</em> the cache did not reuse a response — a critical property when tuning thresholds later.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Query-Normalization-Deduplication-Efficient-Semantic-Caching"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Query-Normalization-Deduplication-Efficient-Semantic-Caching">Query Normalization and Deduplication for Efficient Semantic Caching</a></h2>



<p>At this point, our semantic cache is safe against stale and low-confidence responses. However, there is another failure mode that appears once the system runs for longer periods of time:</p>



<p><strong>The cache slowly fills with duplicate entries representing the same query.</strong></p>



<p>This problem does not break correctness, but it can silently degrade cache quality and efficiency.</p>



<h3 class="wp-block-heading">Why Duplicate Cache Entries Are a Problem</h3>



<p>In natural language systems, users rarely type queries the same way twice.</p>



<p>Consider the following inputs:</p>



<ul class="wp-block-list">
<li>What is semantic caching?</li>



<li>What is semantic caching</li>



<li>What   is   semantic   caching?</li>
</ul>



<p>From a human perspective, these queries are identical.</p>



<p>From a naïve cache’s perspective, they are completely different strings.</p>



<p>If we store each variation separately:</p>



<ul class="wp-block-list">
<li>cache size grows unnecessarily</li>



<li>similarity scans become slower</li>



<li>cache hit rate decreases</li>



<li>identical LLM work is repeated</li>
</ul>



<p>This is not a semantic problem — it is a <strong>normalization problem</strong>.</p>



<h3 class="wp-block-heading">Normalizing Queries Before Caching</h3>



<p>To prevent this, the cache normalizes queries before storing them.</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: TTLs, Confidence, and Cache Safety" data-enlighter-group="6">def _hash_query(query: str) -> str:
    normalized = " ".join(query.lower().split())
    return hashlib.sha256(normalized.encode()).hexdigest()
</pre>



<p>This function performs 3 important steps:</p>



<ul class="wp-block-list">
<li><strong>Lowercasing: </strong>Ensures case-insensitive matching</li>



<li><strong>Whitespace normalization: </strong>Collapses extra spaces and removes leading/trailing whitespace</li>



<li><strong>Hashing: </strong>Produces a fixed-length identifier for fast comparison</li>
</ul>



<p>The result is a stable representation of the query’s <em>content</em>, not its formatting.</p>



<h3 class="wp-block-heading">Deduplication at Store Time</h3>



<p>Deduplication happens when a new cache entry is about to be written.</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: TTLs, Confidence, and Cache Safety" data-enlighter-group="7">query_hash = self._hash_query(query)

for key in self.r.smembers(f"{self.namespace}:keys"):
    data = self.r.hgetall(key)
    if data and data.get("query_hash") == query_hash:
        return
</pre>



<p>Before storing a new entry, the cache checks whether an entry with the same normalized hash already exists in the cache.</p>



<p>If it does:</p>



<ul class="wp-block-list">
<li>the new entry is <strong>not stored</strong></li>



<li>the cache avoids creating a duplicate</li>



<li>storage space and future scans are preserved</li>
</ul>



<p>This approach ensures that <strong>identical queries map to a single cache entry</strong>, regardless of how they were formatted.</p>



<h3 class="wp-block-heading">Why Deduplication Happens in the Cache Layer</h3>



<p>Deduplication is enforced inside the cache rather than in the API layer.</p>



<p>This design ensures:</p>



<ul class="wp-block-list">
<li>all cache writes are normalized consistently</li>



<li>deduplication logic lives next to storage logic</li>



<li>API code remains simple and declarative</li>
</ul>



<p>The API does not need to care <em>how</em> deduplication works — only that the cache remains clean.</p>



<h3 class="wp-block-heading">Why Hash-Based Deduplication Works Well Here</h3>



<p>Using a hash instead of raw strings provides several advantages:</p>



<ul class="wp-block-list">
<li>fixed-length comparisons</li>



<li>efficient storage</li>



<li>no dependency on query length</li>



<li>practical collision resistance</li>
</ul>



<p>For this system, SHA-256 is more than sufficient. The goal is stability and simplicity, not cryptographic security.</p>



<h3 class="wp-block-heading">What Deduplication Does Not Solve</h3>



<p>It’s important to understand the limits of this approach.</p>



<p>Hash-based deduplication:</p>



<ul class="wp-block-list">
<li>prevents exact duplicates after normalization</li>



<li>does <strong>not</strong> merge semantically similar queries</li>



<li>does <strong>not</strong> replace semantic caching</li>
</ul>



<p>In other words:</p>



<ul class="wp-block-list">
<li>deduplication keeps the cache clean</li>



<li>semantic similarity keeps the cache useful</li>
</ul>



<p>They solve different problems and complement each other.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Preventing-Cache-Poisoning-Semantic-Caching-LLM-Systems"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Preventing-Cache-Poisoning-Semantic-Caching-LLM-Systems">Preventing Cache Poisoning in Semantic Caching for LLM Systems</a></h2>



<p>So far, we’ve protected the semantic cache against <em>staleness</em>, <em>low confidence</em>, and <em>duplicate entries</em>. There is one more failure mode that can silently undermine the entire system if left unchecked:</p>



<p><strong>Cache poisoning — storing responses that should never be reused.</strong></p>



<p>Cache poisoning does not usually crash the system. Instead, it causes the cache to confidently serve <strong>bad answers repeatedly</strong>, amplifying a single failure into many incorrect responses.</p>



<h3 class="wp-block-heading">What Cache Poisoning Looks Like in LLM Systems</h3>



<p>In the context of LLM-backed systems, cache poisoning typically happens when:</p>



<ul class="wp-block-list">
<li>the LLM returns an error message</li>



<li>the response is empty or incomplete</li>



<li>the output is malformed due to a timeout or partial generation</li>
</ul>



<p>If these responses are cached, every future “hit” returns the same failure instantly — fast, but incorrect.</p>



<p>This is especially dangerous because:</p>



<ul class="wp-block-list">
<li>the cache appears to be working</li>



<li>responses are returned quickly</li>



<li>the system looks healthy from the outside</li>
</ul>



<h3 class="wp-block-heading">Poisoning Prevention Strategy</h3>



<p>Rather than trying to detect every possible bad response, this system uses a <strong>simple, conservative heuristic</strong>:</p>



<p><em>If a response looks unsafe, do not cache it.</em></p>



<p>This keeps the logic easy to reason about and avoids false positives.</p>



<h3 class="wp-block-heading">Detecting Poisoned Entries</h3>



<p>Poisoning detection is implemented in a dedicated helper function.</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: TTLs, Confidence, and Cache Safety" data-enlighter-group="8">def is_poisoned(entry):
    resp = entry.get("response", "")
    if not resp or resp.startswith("[LLM Error]"):
        return True
    return False
</pre>



<p>This function flags an entry as poisoned if:</p>



<ul class="wp-block-list">
<li>the response is empty, or</li>



<li>the response is an explicit LLM error</li>
</ul>



<p>These conditions are intentionally strict. When in doubt, the entry is treated as unsafe.</p>



<h3 class="wp-block-heading">Where Poisoning Is Enforced</h3>



<p>Poisoning checks are applied <strong>before</strong> any cached response is reused in <code data-enlighter-language="python" class="EnlighterJSRAW">ask.py</code>.</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: TTLs, Confidence, and Cache Safety" data-enlighter-group="9">elif is_poisoned(cached):
    miss_reason = "poisoned"
</pre>



<p>If a cached entry is poisoned:</p>



<ul class="wp-block-list">
<li>it is rejected immediately</li>



<li>the request is treated as a cache miss</li>



<li>the pipeline falls back to the LLM</li>
</ul>



<p>This ensures that invalid responses are never reused, even if they have high similarity or appear fresh.</p>



<h3 class="wp-block-heading">Why Poisoned Entries Are Rejected, Not Repaired</h3>



<p>The cache does not attempt to “fix” poisoned entries.</p>



<p>Trying to repair cached LLM output introduces:</p>



<ul class="wp-block-list">
<li>ambiguity</li>



<li>hidden transformations</li>



<li>unpredictable behavior</li>
</ul>



<p>Instead, the system takes the safest possible action:</p>



<ul class="wp-block-list">
<li>reject the entry</li>



<li>generate a fresh response</li>



<li>overwrite with a clean result</li>
</ul>



<p>This keeps the cache behavior explicit and predictable.</p>



<h3 class="wp-block-heading">Making Poisoning Visible</h3>



<p>Just like low-confidence rejections, poisoning is not silent.</p>



<p>The reason is surfaced via:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">miss_reason = "poisoned"</code></li>



<li>debug metadata returned to the client</li>



<li>cache miss metrics</li>
</ul>



<p>This makes it possible to distinguish between:</p>



<ul class="wp-block-list">
<li>semantic misses</li>



<li>safety rejections</li>



<li>forced fallbacks</li>
</ul>



<p>Visibility is a critical part of safety.</p>



<h3 class="wp-block-heading">What This Approach Does Not Cover</h3>



<p>This poisoning strategy is intentionally simple.</p>



<p>It does not attempt to:</p>



<ul class="wp-block-list">
<li>analyze response quality</li>



<li>validate structured output</li>



<li>detect hallucinations</li>



<li>score semantic correctness</li>
</ul>



<p>Those checks are domain-specific and belong outside the cache.</p>



<p>The cache’s responsibility is narrow:</p>



<p><strong>Do not reuse responses that are obviously unsafe.</strong></p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-End-to-End-Semantic-Cache-Hardening-TTL-Confidence-Safety-Demos"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-End-to-End-Semantic-Cache-Hardening-TTL-Confidence-Safety-Demos">End-to-End Semantic Cache Hardening: TTL, Confidence, and Safety Demos</a></h2>



<p>In Lesson 1, we verified that semantic caching works.</p>



<p>In this lesson, we harden that system by watching each <strong>safety mechanism activate in practice</strong>.</p>



<p>The goal of these demos is not performance testing.</p>



<p>The goal is <strong>behavioral verification</strong>.</p>



<p>Each demo isolates one hardening feature and makes its effect visible through the response payload.</p>



<h3 class="wp-block-heading">Demo Case 1: TTL Expiration Forces a Cache Miss</h3>



<p>Start by sending a query and populating the cache:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: TTLs, Confidence, and Cache Safety" data-enlighter-group="10">curl -X POST http://localhost:8000/ask \
  -H "Content-Type: application/json" \
  -d '{"query": "Explain semantic caching for LLMs"}'
</pre>



<p>This first request falls back to the LLM and stores a new cache entry.</p>



<p>After waiting <strong>longer than the configured TTL</strong>, send the same request again:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: TTLs, Confidence, and Cache Safety" data-enlighter-group="11">sleep 61
curl -X POST http://localhost:8000/ask \
  -H "Content-Type: application/json" \
  -d '{"query": "Explain semantic caching for LLMs"}'
</pre>



<p><strong>Expected Behavior</strong></p>



<ul class="wp-block-list">
<li>Exact-match lookup finds an entry</li>



<li>TTL validation fails</li>



<li>Entry is rejected</li>



<li>LLM is called again</li>
</ul>



<p><strong>Example response</strong></p>



<pre class="EnlighterJSRAW" data-enlighter-language="json" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: TTLs, Confidence, and Cache Safety" data-enlighter-group="12">{
  "from_cache": false,
  "debug": {
    "hit": false,
    "miss_reason": "no_match"
  }
}
</pre>



<p>This confirms that stale responses are not reused.</p>



<h3 class="wp-block-heading">Demo Case 2: Semantic Reuse When Confidence Remains High</h3>



<p>Now consider a cached response that is still within TTL and retains sufficient confidence.</p>



<p>Send a semantically similar query:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: TTLs, Confidence, and Cache Safety" data-enlighter-group="13">curl -X POST http://localhost:8000/ask \
  -H "Content-Type: application/json" \
  -d '{"query": "How does semantic caching reduce LLM calls?"}'
</pre>



<p><strong>Expected Behavior</strong></p>



<ul class="wp-block-list">
<li>Semantic similarity match found</li>



<li>Confidence computed</li>



<li>Confidence above threshold</li>



<li>Cached response reused</li>
</ul>



<p><strong>Example response</strong></p>



<pre class="EnlighterJSRAW" data-enlighter-language="json" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: TTLs, Confidence, and Cache Safety" data-enlighter-group="14">{
  "from_cache": true,
  "debug": {
    "hit": true,
    "cache_path": "semantic_match",
    "confidence": 0.81
  }
}
</pre>



<p>This demonstrates that semantic reuse is allowed when both relevance and freshness remain acceptable.</p>



<h3 class="wp-block-heading">Demo Case 3: Failed LLM Responses Are Never Cached</h3>



<p>A safe semantic cache must ensure that failed LLM responses are never reused. This demo shows <em>write-time</em> cache poisoning prevention in action.</p>



<p>This system enforces that rule at <strong>write time</strong>.</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: TTLs, Confidence, and Cache Safety" data-enlighter-group="15">if not response.startswith("[LLM Error]"):
    cache.store(...)
</pre>



<p>Only valid responses are ever written to Redis.</p>



<h4 class="wp-block-heading">How We Demonstrate This</h4>



<p>We <strong>do not</strong> shut down Ollama or the embedding service.</p>



<p>Network failures abort the request before caching logic runs and are not suitable demos.</p>



<p>Instead, we simulate an LLM failure.</p>



<h4 class="wp-block-heading">Step 1: Temporarily Simulate an LLM Error</h4>



<p>In <code data-enlighter-language="python" class="EnlighterJSRAW">generate_llm_response()</code>:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: TTLs, Confidence, and Cache Safety" data-enlighter-group="16">if "simulate_error" in prompt.lower():
    return "[LLM Error] Simulated failure"
</pre>



<h4 class="wp-block-heading">Step 2: Send a Query</h4>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: TTLs, Confidence, and Cache Safety" data-enlighter-group="17">curl -X POST http://localhost:8000/ask \
  -H "Content-Type: application/json" \
  -d '{"query": "simulate_error in semantic caching"}'
</pre>



<p><strong>Expected Behavior</strong></p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">from_cache = false</code></li>



<li>Cache miss</li>



<li>Error response returned</li>
</ul>



<h4 class="wp-block-heading">Step 3: Send the Same Query Again</h4>



<p><strong>Expected Result</strong></p>



<ul class="wp-block-list">
<li>Cache miss again</li>



<li>LLM called again</li>



<li>No cached response reused</li>
</ul>



<h4 class="wp-block-heading">Why the Miss Reason Is no_match</h4>



<ul class="wp-block-list">
<li>Failed responses are <strong>never stored</strong></li>



<li>No cache entry exists to reject or evaluate</li>



<li>Cache poisoning checks apply only to existing entries</li>
</ul>



<p>This is intentional and correct.</p>



<h3 class="wp-block-heading">Demo Case 4: Deduplication Under Query Variations</h3>



<p>Send a query with unusual spacing:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: TTLs, Confidence, and Cache Safety" data-enlighter-group="18">curl -X POST http://localhost:8000/ask \
  -H "Content-Type: application/json" \
  -d '{"query": "   What   is   semantic   caching?   "}'
</pre>



<p>Then send the normalized version:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: TTLs, Confidence, and Cache Safety" data-enlighter-group="19">curl -X POST http://localhost:8000/ask \
  -H "Content-Type: application/json" \
  -d '{"query": "What is semantic caching?"}'
</pre>



<p><strong>Expected Behavior</strong></p>



<ul class="wp-block-list">
<li>Both queries map to the same normalized hash</li>



<li>Only one cache entry exists</li>



<li>Exact-match reuse occurs</li>
</ul>



<p><strong>Example response</strong></p>



<pre class="EnlighterJSRAW" data-enlighter-language="json" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: TTLs, Confidence, and Cache Safety" data-enlighter-group="20">{
  "from_cache": true,
  "debug": {
    "hit": true,
    "cache_path": "exact_match"
  }
}
</pre>



<p>This confirms deduplication is working correctly.</p>



<h3 class="wp-block-heading">Demo Case 5: Observing Metrics After Hardening</h3>



<p>After running several demos, inspect the metrics endpoint:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: TTLs, Confidence, and Cache Safety" data-enlighter-group="21">curl http://localhost:8000/internal/metrics
</pre>



<p><strong>Example response</strong></p>



<pre class="EnlighterJSRAW" data-enlighter-language="json" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: TTLs, Confidence, and Cache Safety" data-enlighter-group="22">{
  "hits": 3,
  "misses": 4,
  "llm_calls": 4,
  "_note": "In-memory metrics. Reset on restart. Not production-ready."
}
</pre>



<p>Metrics help you verify that:</p>



<ul class="wp-block-list">
<li>safety rejections increase misses</li>



<li>LLM calls rise when reuse is unsafe</li>



<li>the system degrades gracefully</li>
</ul>



<h3 class="wp-block-heading">What These Demos Prove</h3>



<p>Across these scenarios, we verified that:</p>



<ul class="wp-block-list">
<li>Stale entries are rejected</li>



<li>Low-confidence reuse is prevented</li>



<li>Poisoned responses are never cached</li>



<li>Duplicate entries are avoided</li>



<li>Cache behavior is observable and explainable</li>
</ul>



<p>The cache no longer optimizes for speed alone.</p>



<p>It optimizes for <strong>safe reuse</strong>.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Semantic-Caching-Limitations-Trade-Offs-LLM-Optimization-Systems"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Semantic-Caching-Limitations-Trade-Offs-LLM-Optimization-Systems">Semantic Caching Limitations: Trade-Offs in LLM Optimization Systems</a></h2>



<p>By this point, we’ve built a semantic cache that is not only functional, but also hardened against common failure modes: staleness, low confidence, poisoning, duplication, and silent reuse.</p>



<p>However, no system design is complete without clearly stating <strong>what it does not attempt to solve</strong>.</p>



<p>This section makes those boundaries explicit.</p>



<h3 class="wp-block-heading">Why This Cache Still Uses O(N) Scans</h3>



<p>All semantic lookups in this implementation perform a <strong>linear scan</strong> over cached entries.</p>



<p>That means:</p>



<ul class="wp-block-list">
<li>every semantic search compares the query embedding against all stored embeddings</li>



<li>time complexity grows linearly with cache size</li>
</ul>



<p>This is not an oversight.</p>



<p>It is a <strong>deliberate design choice</strong> made for:</p>



<ul class="wp-block-list">
<li>teaching clarity</li>



<li>transparency</li>



<li>small-to-medium cache sizes</li>
</ul>



<p>By avoiding ANN indexes or vector databases, every decision remains visible and debuggable. You can trace exactly why a match was selected or rejected.</p>



<p>For educational systems and low-volume services, this trade-off is acceptable — and often desirable.</p>



<h3 class="wp-block-heading">What We Intentionally Did Not Implement</h3>



<p>To keep the system focused and understandable, several production features were intentionally left out:</p>



<ul class="wp-block-list">
<li>Approximate nearest neighbor (ANN) indexing</li>



<li>Redis Vector Search or RediSearch</li>



<li>Background garbage collection workers</li>



<li>Distributed locks for thundering herd prevention</li>



<li>Request coalescing or single-flight patterns</li>



<li>Multi-process or persistent metrics</li>



<li>Cache warming strategies</li>
</ul>



<p>Each of these adds complexity that would obscure the core ideas being taught.</p>



<p>This cache is designed to <strong>explain semantic caching</strong>, not to compete with specialized retrieval infrastructure.</p>



<h3 class="wp-block-heading">When This Design Is “Good Enough”</h3>



<p>This architecture works well when:</p>



<ul class="wp-block-list">
<li>cache size is modest (hundreds to low thousands of entries)</li>



<li>traffic is low to moderate</li>



<li>correctness and explainability matter more than raw throughput</li>



<li>you are experimenting with semantic reuse behavior</li>



<li>you want to understand cache dynamics before scaling</li>
</ul>



<p>Typical examples include:</p>



<ul class="wp-block-list">
<li>internal tools</li>



<li>developer-facing APIs</li>



<li>research prototypes</li>



<li>educational systems</li>



<li>early-stage LLM applications</li>
</ul>



<p>In these contexts, the simplicity of the design is a strength, not a weakness.</p>



<h3 class="wp-block-heading">When You Need a Vector Database or ANN Index</h3>



<p>As usage grows, linear scans eventually become the bottleneck.</p>



<p>You should consider a dedicated vector search solution when:</p>



<ul class="wp-block-list">
<li>cache size grows into tens or hundreds of thousands of entries</li>



<li>latency requirements become strict</li>



<li>multiple workers or services share the same cache</li>



<li>semantic search dominates request time</li>
</ul>



<p>At that point, dedicated vector search technologies become appropriate, such as:</p>



<ul class="wp-block-list">
<li>FAISS (Facebook AI Similarity Search)</li>



<li>Milvus</li>



<li>Pinecone</li>



<li>Redis Vector Search</li>
</ul>



<p>Importantly, the <strong>hardening concepts from this lesson still apply</strong>. TTLs, confidence scoring, poisoning prevention, and observability remain relevant even when the storage backend changes.</p>



<h3 class="wp-block-heading">The Core Trade-Off, Revisited</h3>



<p>This lesson deliberately favors:</p>



<ul class="wp-block-list">
<li>clarity over cleverness</li>



<li>explicit decisions over hidden automation</li>



<li>safety over aggressive reuse</li>
</ul>



<p>That makes it an ideal foundation, not a final destination.</p>



<hr class="wp-block-separator has-alpha-channel-opacity"/>



<div id="pitch" style="padding: 40px; width: 100%; background-color: #F4F6FA;">
	<h3>What's next? We recommend <a target="_blank" href="https://pyimagesearch.com/pyimagesearch-university/?utm_source=blogPost&utm_medium=bottomBanner&utm_campaign=What%27s%20next%3F%20I%20recommend">PyImageSearch University</a>.</h3>

	<script src="https://fast.wistia.com/embed/medias/kno0cmko2z.jsonp" async></script><script src="https://fast.wistia.com/assets/external/E-v1.js" async></script><div class="wistia_responsive_padding" style="padding:56.25% 0 0 0;position:relative;"><div class="wistia_responsive_wrapper" style="height:100%;left:0;position:absolute;top:0;width:100%;"><div class="wistia_embed wistia_async_kno0cmko2z videoFoam=true" style="height:100%;position:relative;width:100%"><div class="wistia_swatch" style="height:100%;left:0;opacity:0;overflow:hidden;position:absolute;top:0;transition:opacity 200ms;width:100%;"><img decoding="async" src="https://fast.wistia.com/embed/medias/kno0cmko2z/swatch" style="filter:blur(5px);height:100%;object-fit:contain;width:100%;" alt="" aria-hidden="true" onload="this.parentNode.style.opacity=1;" /></div></div></div></div>

	<div style="margin-top: 32px; margin-bottom: 32px; ">
		<strong>Course information:</strong><br/>
		86+ total classes • 115+ hours of on-demand code walkthrough videos • Last updated: June 2026<br/>
		<span style="color: #169FE6;">★★★★★</span> 4.84 (128 Ratings) • 16,000+ Students Enrolled
	</div>

	<p><strong>I strongly believe that if you had the right teacher you could <em>master</em> computer vision and deep learning.</strong></p>

	<p>Do you think learning computer vision and deep learning has to be time-consuming, overwhelming, and complicated? Or has to involve complex mathematics and equations? Or requires a degree in computer science?</p>

	<p>That’s <em>not</em> the case.</p>

	<p>All you need to master computer vision and deep learning is for someone to explain things to you in <em>simple, intuitive</em> terms. <em>And that’s exactly what I do</em>. My mission is to change education and how complex Artificial Intelligence topics are taught.</p>

	<p>If you're serious about learning computer vision, your next stop should be PyImageSearch University, the most comprehensive computer vision, deep learning, and OpenCV course online today. Here you’ll learn how to <em>successfully</em> and <em>confidently</em> apply computer vision to your work, research, and projects. Join me in computer vision mastery.</p>

	<p><strong>Inside PyImageSearch University you'll find:</strong></p>

	<ul style="margin-left: 0px;">
		<li style="list-style: none;">&check; <strong>86+ courses</strong> on essential computer vision, deep learning, and OpenCV topics</li>
		<li style="list-style: none;">&check; <strong>86 Certificates</strong> of Completion</li>
		<li style="list-style: none;">&check; <strong>115+ hours</strong> of on-demand video</li>
		<li style="list-style: none;">&check; <strong>Brand new courses released <em>regularly</em></strong>, ensuring you can keep up with state-of-the-art techniques</li>
		<li style="list-style: none;">&check; <strong>Pre-configured Jupyter Notebooks in Google Colab</strong></li>
		<li style="list-style: none;">&check; Run all code examples in your web browser — works on Windows, macOS, and Linux (no dev environment configuration required!)</li>
		<li style="list-style: none;">&check; Access to <strong>centralized code repos for <em>all</em> 540+ tutorials</strong> on PyImageSearch</li>
		<li style="list-style: none;">&check; <strong> Easy one-click downloads</strong> for code, datasets, pre-trained models, etc.</li>
		<li style="list-style: none;">&check; <strong>Access</strong> on mobile, laptop, desktop, etc.</li>
	</ul>

	<p style="text-align: center;">
		<a target="_blank" class="button link" href="https://pyimagesearch.com/pyimagesearch-university/?utm_source=blogPost&utm_medium=bottomBanner&utm_campaign=What%27s%20next%3F%20I%20recommend" style="background-color: #6DC713; border-bottom: none;">Click here to join PyImageSearch University</a>
	</p>
</div>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Summary"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Summary">Summary</a></h2>



<p>In this lesson, we took a working semantic cache and made it safe, bounded, and explainable.</p>



<p>Rather than focusing on improving cache hit rates at all costs, we introduced guardrails to ensure cached LLM responses are reused only when they are trustworthy. </p>



<p>We added application-level TTL validation to prevent stale responses from persisting indefinitely, combined semantic similarity with freshness through confidence scoring, and enforced explicit rejection paths for low-confidence and expired entries.</p>



<p>We also addressed subtle but dangerous failure modes that appear in real systems over time. Query normalization and deduplication prevent silent cache bloat, and poisoning checks ensure that error responses are never reused. </p>



<p>Observability signals make every cache decision inspectable rather than implicit. Together, these changes transform the cache from a performance optimization into a reliability component.</p>



<p>Finally, we made the system’s limitations explicit. This design favors clarity, correctness, and debuggability over raw scalability. It deliberately avoids ANN indexes, vector databases, and distributed coordination, making it suitable for small-to-medium systems and educational use cases.</p>



<p>As workloads grow, the same hardening principles apply even when the underlying storage or retrieval strategy changes.</p>



<p>With this lesson, semantic caching is no longer just fast. It is defensive, explainable, and production-aware.</p>



<h3 class="wp-block-heading">Citation Information</h3>



<p><strong>Singh, V</strong><strong>. </strong>“Semantic Caching for LLMs: TTLs, Confidence, and Cache Safety,” <em>PyImageSearch</em>, S. Huot, A. Sharma, and P. Thakur, eds., 2026, <a href="https://pyimg.co/ahr3p" target="_blank" rel="noreferrer noopener">https://pyimg.co/ahr3p</a> </p>



<pre class="EnlighterJSRAW" data-enlighter-language="raw" data-enlighter-theme="classic" data-enlighter-highlight="" data-enlighter-linenumbers="false" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: TTLs, Confidence, and Cache Safety" data-enlighter-group="23">@incollection{Singh_2026_semantic-caching-llms-ttls-confidence-cache-safety,
  author = {Vikram Singh},
  title = {{Semantic Caching for LLMs: TTLs, Confidence, and Cache Safety}},
  booktitle = {PyImageSearch},
  editor = {Susan Huot and Aditya Sharma and Piyush Thakur},
  year = {2026},
  url = {https://pyimg.co/ahr3p},
}
</pre>



<p><strong>To download the source code to this post (and be notified when future tutorials are published here on PyImageSearch), </strong><em><strong>simply enter your email address in the form below!</strong></em></p>



<div id="download-the-code" class="post-cta-wrap">
<div class="gpd-post-cta">
	<div class="gpd-post-cta-content">
		

			<div class="gpd-post-cta-top">
				<div class="gpd-post-cta-top-image"><img decoding="async" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?lossy=2&strip=1&webp=1" alt="" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?lossy=2&strip=1&webp=1 410w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?size=126x174&lossy=2&strip=1&webp=1 126w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?size=252x348&lossy=2&strip=1&webp=1 252w" sizes="(max-width: 410px) 100vw, 410px" /></div>
				
				<div class="gpd-post-cta-top-title"><h4>Download the Source Code and FREE 17-page Resource Guide</h4></div>
				<div class="gpd-post-cta-top-desc"><p>Enter your email address below to get a .zip of the code and a <strong>FREE 17-page Resource Guide on Computer Vision, OpenCV, and Deep Learning.</strong> Inside you'll find my hand-picked tutorials, books, courses, and libraries to help you master CV and DL!</p></div>


			</div>

			<div class="gpd-post-cta-bottom">
				<form id="footer-cta-code" class="footer-cta" action="https://www.getdrip.com/forms/4130035/submissions" method="post" target="blank" data-drip-embedded-form="4130035">
					<input name="fields[email]" type="email" value="" placeholder="Your email address" class="form-control" />

					<button type="submit">Download the code!</button>

					<div style="display: none;" aria-hidden="true"><label for="website">Website</label><br /><input type="text" id="website" name="website" tabindex="-1" autocomplete="false" value="" /></div>
				</form>
			</div>


		
	</div>

</div>
</div>
<p>The post <a rel="nofollow" href="https://pyimagesearch.com/2026/05/04/semantic-caching-for-llms-ttls-confidence-and-cache-safety/">Semantic Caching for LLMs: TTLs, Confidence, and Cache Safety</a> appeared first on <a rel="nofollow" href="https://pyimagesearch.com">PyImageSearch</a>.</p>
]]></content:encoded>
					
		
		
			</item>
		<item>
		<title>Semantic Caching for LLMs: FastAPI, Redis, and Embeddings</title>
		<link>https://pyimagesearch.com/2026/04/27/semantic-caching-for-llms-fastapi-redis-and-embeddings/</link>
		
		<dc:creator><![CDATA[Vikram Singh]]></dc:creator>
		<pubDate>Mon, 27 Apr 2026 12:45:00 +0000</pubDate>
				<category><![CDATA[LLMOps]]></category>
		<category><![CDATA[MLOps]]></category>
		<category><![CDATA[Tutorial]]></category>
		<category><![CDATA[caching]]></category>
		<category><![CDATA[cosine similarity]]></category>
		<category><![CDATA[embeddings]]></category>
		<category><![CDATA[fastapi]]></category>
		<category><![CDATA[llm]]></category>
		<category><![CDATA[llm optimization]]></category>
		<category><![CDATA[ollama]]></category>
		<category><![CDATA[python]]></category>
		<category><![CDATA[redis]]></category>
		<category><![CDATA[semantic caching]]></category>
		<category><![CDATA[tutorial]]></category>
		<category><![CDATA[vector search]]></category>
		<guid isPermaLink="false">https://pyimagesearch.com/?p=53546</guid>

					<description><![CDATA[<p>Table of Contents Semantic Caching for LLMs: FastAPI, Redis, and Embeddings Introduction: Why Semantic Caching Matters for LLM Systems How Semantic Caching Works for LLMs: Embeddings and Similarity Search Explained Semantic Caching Architecture and Request Flow Configuring Your Environment for&#8230;</p>
<p>The post <a rel="nofollow" href="https://pyimagesearch.com/2026/04/27/semantic-caching-for-llms-fastapi-redis-and-embeddings/">Semantic Caching for LLMs: FastAPI, Redis, and Embeddings</a> appeared first on <a rel="nofollow" href="https://pyimagesearch.com">PyImageSearch</a>.</p>
]]></description>
										<content:encoded><![CDATA[<div class="yoast-breadcrumbs"><span><span><a href="https://pyimagesearch.com/">Home</a></span></span></div>


<div class="toc">
<hr class="TOC"/>
<p class="has-large-font-size"><strong>Table of Contents</strong></p>
<ul>
    <li id="TOC-h1-Semantic-Caching-LLMs-FastAPI-Redis-Embeddings"><a rel="noopener" target="_blank" href="#h1-Semantic-Caching-LLMs-FastAPI-Redis-Embeddings">Semantic Caching for LLMs: FastAPI, Redis, and Embeddings</a></li>

    <li id="TOC-h2-Introduction-Why-Semantic-Caching-Matters-LLM-Systems"><a rel="noopener" target="_blank" href="#h2-Introduction-Why-Semantic-Caching-Matters-LLM-Systems">Introduction: Why Semantic Caching Matters for LLM Systems</a></li>

    <li id="TOC-h2-How-Semantic-Caching-Works-LLMs-Embeddings-Similarity-Search-Explained"><a rel="noopener" target="_blank" href="#h2-How-Semantic-Caching-Works-LLMs-Embeddings-Similarity-Search-Explained">How Semantic Caching Works for LLMs: Embeddings and Similarity Search Explained</a></li>

    <li id="TOC-h2-Semantic-Caching-Architecture-Request-Flow"><a rel="noopener" target="_blank" href="#h2-Semantic-Caching-Architecture-Request-Flow">Semantic Caching Architecture and Request Flow</a></li>

    <li id="TOC-h2-Configuring-Your-Environment-Semantic-Caching-FastAPI-Redis-Ollama-Setup"><a rel="noopener" target="_blank" href="#h2-Configuring-Your-Environment-Semantic-Caching-FastAPI-Redis-Ollama-Setup">Configuring Your Environment for Semantic Caching: FastAPI, Redis, and Ollama Setup</a></li>

    <li id="TOC-h2-Project-Structure"><a rel="noopener" target="_blank" href="#h2-Project-Structure">Project Structure</a></li>

    <li id="TOC-h2-FastAPI-Entry-Point-Semantic-Caching-Wiring-API-Service"><a rel="noopener" target="_blank" href="#h2-FastAPI-Entry-Point-Semantic-Caching-Wiring-API-Service">FastAPI Entry Point for Semantic Caching: Wiring the API Service</a></li>

    <li id="TOC-h2-FastAPI-Ask-Endpoint-End-to-End-Semantic-Caching-Request-Flow"><a rel="noopener" target="_blank" href="#h2-FastAPI-Ask-Endpoint-End-to-End-Semantic-Caching-Request-Flow">FastAPI Ask Endpoint: End-to-End Semantic Caching Request Flow</a></li>

    <li id="TOC-h2-Embeddings-Turning-Text-into-Semantic-Vectors"><a rel="noopener" target="_blank" href="#h2-Embeddings-Turning-Text-into-Semantic-Vectors">Embeddings: Turning Text into Semantic Vectors</a></li>

    <li id="TOC-h2-Semantic-Cache-Cosine-Similarity-Redis-Storage-Reusing-Meaning"><a rel="noopener" target="_blank" href="#h2-Semantic-Cache-Cosine-Similarity-Redis-Storage-Reusing-Meaning">The Semantic Cache: Cosine Similarity, Redis Storage, and Reusing Meaning</a></li>

    <li id="TOC-h2-Cache-Entries-What-Exactly-Gets-Stored"><a rel="noopener" target="_blank" href="#h2-Cache-Entries-What-Exactly-Gets-Stored">Cache Entries: What Exactly Gets Stored?</a></li>

    <li id="TOC-h2-End-to-End-Demo-Verifying-Core-Cache-Behavior"><a rel="noopener" target="_blank" href="#h2-End-to-End-Demo-Verifying-Core-Cache-Behavior">End-to-End Demo: Verifying Core Cache Behavior</a></li>

    <li id="TOC-h2-Summary"><a rel="noopener" target="_blank" href="#h2-Summary">Summary</a></li>
</ul>
</div>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h1-Semantic-Caching-LLMs-FastAPI-Redis-Embeddings"/>



<h2 class="wp-block-heading"><a href="#TOC-h1-Semantic-Caching-LLMs-FastAPI-Redis-Embeddings">Semantic Caching for LLMs: FastAPI, Redis, and Embeddings</a></h2>



<p>In this lesson, you will learn how to build a semantic cache for LLM applications using FastAPI, Redis, and embedding-based similarity search, and how requests flow from exact matches to semantic matches before falling back to the LLM.</p>


<div class="wp-block-image">
<figure class="aligncenter size-full"><a href="https://pyimagesearch.com/wp-content/uploads/2026/04/semantic-caching-for-llms-fastapi-redis-and-embeddings-featured.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="940" height="780" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/semantic-caching-for-llms-fastapi-redis-and-embeddings-featured.png?lossy=2&strip=1&webp=1" alt="semantic-caching-for-llms-fastapi-redis-and-embeddings-featured.png" class="wp-image-53571" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/semantic-caching-for-llms-fastapi-redis-and-embeddings-featured.png?size=126x105&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/semantic-caching-for-llms-fastapi-redis-and-embeddings-featured-300x249.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/semantic-caching-for-llms-fastapi-redis-and-embeddings-featured.png?size=378x314&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/semantic-caching-for-llms-fastapi-redis-and-embeddings-featured.png?size=504x418&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/semantic-caching-for-llms-fastapi-redis-and-embeddings-featured.png?size=630x523&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/semantic-caching-for-llms-fastapi-redis-and-embeddings-featured-768x637.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/semantic-caching-for-llms-fastapi-redis-and-embeddings-featured.png?lossy=2&amp;strip=1&amp;webp=1 940w" sizes="(max-width: 630px) 100vw, 630px" /></a></figure></div>


<p>This lesson is the 1st in a 2-part series on <strong>Semantic Caching for LLMs</strong>:</p>



<ol class="wp-block-list">
<li><em><strong><a href="https://pyimg.co/yso6f" target="_blank" rel="noreferrer noopener">Semantic Caching for LLMs: FastAPI, Redis, and Embeddings</a></strong></em><strong> (this tutorial)</strong></li>



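<li><em><a href="https://pyimg.co/ahr3p" target="_blank" rel="noreferrer noopener">Semantic Caching for LLMs: TTLs, Confidence, and Cache Safety</a></em></li>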
</ol>



<p><strong>To learn how to build a semantic cache for LLM applications using embeddings and Redis, </strong><em><strong>just keep reading.</strong></em></p>



<div id="pyi-source-code-block" class="source-code-wrap"><div class="gpd-source-code">
    <div class="gpd-source-code-content">
        <img decoding="async" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/source-code-icon.png?lossy=2&strip=1&webp=1" alt="">
        <h4>Looking for the source code to this post?</h4>
                    <a href="#download-the-code" class="pyis-cta-modal-open-modal">Jump Right To The Downloads Section <svg class="svg-icon arrow-right" width="12" height="12" aria-hidden="true" role="img" focusable="false" viewBox="0 0 14 14" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M6.8125 0.1875C6.875 0.125 6.96875 0.09375 7.09375 0.09375C7.1875 0.09375 7.28125 0.125 7.34375 0.1875L13.875 6.75C13.9375 6.8125 14 6.90625 14 7C14 7.125 13.9375 7.1875 13.875 7.25L7.34375 13.8125C7.28125 13.875 7.1875 13.9062 7.09375 13.9062C6.96875 13.9062 6.875 13.875 6.8125 13.8125L6.1875 13.1875C6.125 13.125 6.09375 13.0625 6.09375 12.9375C6.09375 12.8438 6.125 12.75 6.1875 12.6562L11.0312 7.8125H0.375C0.25 7.8125 0.15625 7.78125 0.09375 7.71875C0.03125 7.65625 0 7.5625 0 7.4375V6.5625C0 6.46875 0.03125 6.375 0.09375 6.3125C0.15625 6.25 0.25 6.1875 0.375 6.1875H11.0312L6.1875 1.34375C6.125 1.28125 6.09375 1.1875 6.09375 1.0625C6.09375 0.96875 6.125 0.875 6.1875 0.8125L6.8125 0.1875Z" fill="#169FE6"></path></svg></a>
            </div>
</div>
</div>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Introduction-Why-Semantic-Caching-Matters-LLM-Systems"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Introduction-Why-Semantic-Caching-Matters-LLM-Systems">Introduction: Why Semantic Caching Matters for LLM Systems</a></h2>



<h3 class="wp-block-heading">Cost, Latency, and Redundant LLM Calls</h3>



<p>Large language models are powerful, but they are not cheap. Every request to an LLM involves tokenization, inference, decoding, and network overhead. Even when models are hosted locally, response times are measured in hundreds of milliseconds or seconds rather than microseconds.</p>



<p>In real applications, this cost compounds quickly. Users often ask similar questions repeatedly, either across sessions or within the same workflow. Each request is treated as a fresh LLM invocation, even when the underlying intent has already been handled before.</p>



<p>This leads to 3 systemic problems:</p>



<ul class="wp-block-list">
<li><strong>High latency:</strong> Users wait for responses that could have been reused instantly</li>



<li><strong>Increased cost:</strong> Identical reasoning is paid for multiple times</li>



<li><strong>Wasted capacity:</strong> LLM throughput is consumed by redundant requests</li>
</ul>



<p>These issues become especially visible under load, where repeated paraphrased queries can overwhelm an otherwise well-sized system.</p>



<h3 class="wp-block-heading">Why Exact-Match Caching Breaks Down for Natural Language</h3>



<p>Traditional caching assumes that identical inputs produce identical outputs. This works well for APIs, database queries, and deterministic functions. It fails for natural language.</p>



<p>From a string-matching perspective, the following queries are completely unrelated:</p>



<ul class="wp-block-list">
<li>“What is semantic caching?”</li>



<li>“Can you explain how semantic caching works?”</li>



<li>“How does caching based on embeddings work for LLMs?”</li>
</ul>



<p>A traditional cache keyed on raw strings will miss all three. As a result, the system calls the LLM three times, even though a human would expect the same answer.</p>



<p>This brittleness causes exact-match caches to have extremely low hit rates in LLM-backed systems. Worse, it gives a false sense of optimization. The cache exists, but it almost never helps in practice.</p>



<h3 class="wp-block-heading">Where Semantic Caching Fits in Real Systems</h3>



<p>Semantic caching addresses this mismatch by caching <em>meaning</em> instead of exact text.</p>



<p>Rather than asking, “Have I seen this string before?” a semantic cache asks, “Have I answered something <strong>semantically similar</strong> before?” It does this by converting queries into embeddings and comparing them using a similarity metric such as cosine similarity.</p>



<p>In a real system, semantic caching sits between the application layer and the LLM:</p>



<ul class="wp-block-list">
<li>The application sends a query</li>



<li>The cache evaluates whether a prior response is reusable</li>



<li>Only true cache misses reach the LLM</li>
</ul>



<p>When designed correctly, this layer is invisible to the user. Responses feel faster, costs drop, and the system scales more gracefully without changing the frontend or prompt logic.</p>



<p>This lesson focuses on building that layer explicitly and transparently, using FastAPI, Redis, and embeddings, without hiding the mechanics behind heavy abstractions.</p>


<div class="wp-block-image">
<figure class="aligncenter size-full"><a href="https://pyimagesearch.com/wp-content/uploads/2026/04/semantic-caching-fig1.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="512" height="224" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/semantic-caching-fig1.png?lossy=2&strip=1&webp=1" alt="Figure 1: Why semantic caching matters for LLM systems. Exact-match caching treats paraphrased queries as unique requests, resulting in repeated LLM calls. Semantic caching groups queries by meaning, reducing latency and redundant inference." class="wp-image-53552" style="object-fit:cover" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/semantic-caching-fig1.png?size=126x55&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/semantic-caching-fig1-300x131.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/semantic-caching-fig1.png?size=378x165&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/semantic-caching-fig1.png?lossy=2&amp;strip=1&amp;webp=1 512w" sizes="(max-width: 512px) 100vw, 512px" /></a><figcaption class="wp-element-caption"><strong>Figure 1: </strong>Why semantic caching matters for LLM systems. Exact-match caching treats paraphrased queries as unique requests, resulting in repeated LLM calls. Semantic caching groups queries by meaning, reducing latency and redundant inference (source: image by the author).</figcaption></figure></div>


<p>Exact-match caching treats paraphrased queries as unique requests, resulting in repeated LLM calls. Semantic caching groups similar queries by meaning, allowing responses to be reused and reducing both latency and cost.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-How-Semantic-Caching-Works-LLMs-Embeddings-Similarity-Search-Explained"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-How-Semantic-Caching-Works-LLMs-Embeddings-Similarity-Search-Explained">How Semantic Caching Works for LLMs: Embeddings and Similarity Search Explained</a></h2>



<p><a href="#h2-Introduction-Why-Semantic-Caching-Matters-LLM-Systems" target="_blank" rel="noreferrer noopener">Section 1</a> explained <em>why</em> semantic caching exists.</p>



<p>This section explains <strong>how it works</strong>, conceptually, before we touch any FastAPI, Redis, or code.</p>



<p>The goal here is to give you a <strong>mental execution model</strong> you can keep in your head while reading the implementation.</p>



<h3 class="wp-block-heading">From Text to Meaning: Embeddings as the Cache Key</h3>



<p>Semantic caching replaces raw text comparison with <strong>vector similarity</strong>.</p>



<p>Instead of caching responses under the literal query string, the system converts each query into an <strong>embedding</strong>: a high-dimensional numeric vector that captures semantic meaning. Queries that are worded differently but mean the same thing produce embeddings that are close together in vector space.</p>



<p>This is what allows the cache to recognize paraphrases as equivalent:</p>



<ul class="wp-block-list">
<li>“How do I reset my password?”</li>



<li>“I forgot my password, what should I do?”</li>



<li>“Guide me through password recovery”</li>
</ul>



<p>Exact strings differ. Embeddings do not.</p>



<p>At a high level, semantic caching works by:</p>



<ul class="wp-block-list">
<li>Generating an embedding for the incoming query</li>



<li>Comparing it against embeddings stored in the cache</li>



<li>Reusing a cached response if similarity is high enough</li>
</ul>



<p>The similarity metric used in this lesson is <strong>cosine similarity</strong>, which measures the angle between two vectors rather than their raw magnitude.</p>



<h3 class="wp-block-heading">Why a Layered Cache Beats Semantic-Only Caching</h3>



<p>While semantic matching is powerful, it is also <strong>computationally expensive</strong>.</p>



<p>Embedding generation requires a model call. Similarity search requires vector math. Doing this for every request, even when the exact same query has already been seen, would be wasteful.</p>



<p>That is why this lesson uses a <strong>layered caching strategy</strong>.</p>



<h4 class="wp-block-heading">Layer 1: Exact Match (Fast Path)</h4>



<p>The query is normalized and hashed.</p>



<p>If the same query has already been answered, the response is returned immediately.</p>



<ul class="wp-block-list">
<li>No embedding generation</li>



<li>No similarity computation</li>



<li>Minimal latency</li>
</ul>



<p>This handles repeated identical queries efficiently.</p>



<h4 class="wp-block-heading">Layer 2: Semantic Match (Flexible Path)</h4>



<p>If no exact match exists, the query is embedded and compared against cached embeddings.</p>



<p>This layer catches:</p>



<ul class="wp-block-list">
<li>paraphrases</li>



<li>minor wording differences</li>



<li>reordered phrases</li>
</ul>



<p>Semantic matches trade compute cost for much higher cache hit rates.</p>



<h4 class="wp-block-heading">Layer 3: LLM Fallback (Slow Path)</h4>



<p>If neither exact nor semantic matches succeed, the request is forwarded to the LLM.</p>



<p>The response is then stored in the cache so future requests can reuse it.</p>



<p>This layered approach ensures:</p>



<ul class="wp-block-list">
<li>the cheapest checks happen first</li>



<li>expensive operations are only used when necessary</li>
</ul>



<h3 class="wp-block-heading">Confidence, Freshness, and Cache Safety</h3>



<p>Semantic similarity alone is not enough to decide whether a cached response should be reused.</p>



<p>This lesson introduces the idea of <strong>confidence scoring</strong>, which combines:</p>



<ul class="wp-block-list">
<li><strong>Similarity:</strong> how close the embeddings are</li>



<li><strong>Freshness:</strong> how old the cached entry is</li>
</ul>



<p>A highly similar but stale response should not necessarily be trusted. Likewise, a fresh response with low similarity should be rejected.</p>



<p>In addition, cached entries are validated to prevent:</p>



<ul class="wp-block-list">
<li>expired responses</li>



<li>poisoned entries (errors, empty outputs)</li>
</ul>



<p>These checks ensure the cache improves correctness and performance rather than degrading them.</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/04/image-22-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="554" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-22-1024x554.png?lossy=2&strip=1&webp=1" alt="Figure 2: Layered semantic caching request flow." class="wp-image-53576" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-22.png?size=126x68&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-22-300x162.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-22.png?size=378x205&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-22.png?size=504x273&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-22.png?size=630x341&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-22-768x415.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-22-1024x554.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-22-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-22-1536x830.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 2: </strong>Layered semantic caching request flow (source: image by the author).</figcaption></figure></div>


<p>Incoming queries first attempt an exact-match lookup, then fall back to semantic similarity search using embeddings, and finally call the LLM only on cache miss. This ordering minimizes latency and unnecessary model calls.</p>



<p><em><strong>Note:</strong></em><em> In this lesson, we implement this flow using Redis as a simple embedding store with linear similarity scans, rather than a dedicated vector database.</em></p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Semantic-Caching-Architecture-Request-Flow"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Semantic-Caching-Architecture-Request-Flow">Semantic Caching Architecture and Request Flow</a></h2>



<p>In <a href="#h2-How-Semantic-Caching-Works-LLMs-Embeddings-Similarity-Search-Explained">Section 2</a>, you learned how semantic caching works conceptually.</p>



<p>In this section, we map that mental model to a <strong>real request flow</strong> in an LLM-backed service.</p>



<p>The goal is to answer one question clearly:</p>



<p><em>What happens, step by step, when a user sends a request to this system?</em></p>



<p>We will stay implementation-aware, but not code-specific yet. That comes next.</p>



<h3 class="wp-block-heading">High-Level System Components</h3>



<p>At a high level, the system consists of 5 logical components:</p>



<ul class="wp-block-list">
<li><strong>API layer: </strong>Receives user requests and orchestrates the caching pipeline.</li>



<li><strong>Exact-match cache: </strong>Performs fast hash-based lookups for identical queries.</li>



<li><strong>Embedding model: </strong>Converts text queries into semantic vectors when needed.</li>



<li><strong>Semantic cache: </strong>Stores embeddings and responses and performs similarity matching.</li>



<li><strong>LLM: </strong>Acts as the final fallback when no cache entry is suitable.</li>
</ul>



<p>Each component has a narrowly defined responsibility. This separation is intentional and keeps the system easy to reason about and extend.</p>



<p>In this implementation:</p>



<ul class="wp-block-list">
<li>The API layer is built using FastAPI and acts as the orchestration point.</li>



<li>Redis is used as the backing store for both exact-match and semantic cache layers.</li>



<li>Ollama provides both embedding generation and LLM inference locally.</li>
</ul>



<p>These choices keep the system lightweight, self-contained, and easy to reason about while still reflecting real production patterns.</p>



<h3 class="wp-block-heading">End-to-End Request Flow</h3>



<p>When a user sends a query, the system processes it in the following order.</p>



<h4 class="wp-block-heading">Step 1: Request enters the API</h4>



<p>The API receives a text query along with optional flags, such as <code data-enlighter-language="python" class="EnlighterJSRAW">bypass_cache</code>, which forces the request to skip the cache. Input validation happens immediately to prevent meaningless or malformed queries from entering the pipeline.</p>



<p>This ensures the cache is not polluted with empty or invalid entries.</p>



<h4 class="wp-block-heading">Step 2: Exact-match cache lookup</h4>



<p>The query is normalized and hashed.</p>



<p>The system checks whether an identical query has already been answered.</p>



<ul class="wp-block-list">
<li>If an exact match exists and is valid, the response is returned immediately.</li>



<li>No embeddings are generated.</li>



<li>The LLM is not touched.</li>
</ul>



<p>This is the fastest possible path through the system.</p>



<h4 class="wp-block-heading">Step 3: Embedding generation</h4>



<p>If the exact-match lookup fails, the query is passed to the embedding model.</p>



<p>The model converts the text into a numeric vector that captures semantic meaning. This vector becomes the key for semantic comparison.</p>



<p>This step is intentionally skipped when an exact match succeeds.</p>



<h4 class="wp-block-heading">Step 4: Semantic cache lookup</h4>



<p>The embedding is compared against cached embeddings using a similarity metric.</p>



<p>A cached response is reused only if:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">similarity</code> exceeds a defined threshold</li>



<li>the entry has not expired</li>



<li>the entry is not poisoned</li>



<li>the computed <code data-enlighter-language="python" class="EnlighterJSRAW">confidence</code> is high enough</li>
</ul>



<p>If a suitable match is found, the response is returned to the user without calling the LLM.</p>



<h4 class="wp-block-heading">Step 5: LLM fallback and cache population</h4>



<p>If both cache layers miss, the request is forwarded to the LLM.</p>



<p>Once a response is generated:</p>



<ul class="wp-block-list">
<li>it is returned to the user</li>



<li>it is stored in the cache with metadata, timestamps, and TTL (Time To Live)</li>
</ul>



<p>This ensures future requests can reuse the result.</p>



<h3 class="wp-block-heading">Why This Architecture Works Well</h3>



<p>This architecture is intentionally conservative and explicit.</p>



<ul class="wp-block-list">
<li>Cheap operations happen first.</li>



<li>Expensive operations are deferred.</li>



<li>Every step is observable and debuggable.</li>



<li>No component hides complexity behind opaque abstractions.</li>
</ul>



<p>Most importantly, the system degrades gracefully. Even when the cache provides no benefit, the request still succeeds via the LLM.</p>


<div class="wp-block-image">
<figure class="aligncenter size-full"><a href="https://pyimagesearch.com/wp-content/uploads/2026/04/semantic-caching-fig3.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="512" height="248" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/semantic-caching-fig3.png?lossy=2&strip=1&webp=1" alt="Figure 3: Architecture and request flow for a layered semantic caching system." class="wp-image-53556" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/semantic-caching-fig3.png?size=126x61&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/semantic-caching-fig3-300x145.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/semantic-caching-fig3.png?size=378x183&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/semantic-caching-fig3.png?lossy=2&amp;strip=1&amp;webp=1 512w" sizes="(max-width: 512px) 100vw, 512px" /></a><figcaption class="wp-element-caption"><strong>Figure 3: </strong>Architecture and request flow for a layered semantic caching system (source: image by the author).</figcaption></figure></div>


<p>User queries enter the API, attempt an exact-match lookup, fall back to semantic similarity search using embeddings, and call the LLM only when both cache layers miss. Successful LLM responses are stored for future reuse.</p>



<hr class="wp-block-separator has-alpha-channel-opacity"/>



<p>Would you like immediate access to 3,457 images curated and labeled with hand gestures to train, explore, and experiment with &#8230; for free? Head over to <a href="https://universe.roboflow.com/isl/az-6mqow?ref=pyimagesearch" target="_blank" rel="noreferrer noopener">Roboflow</a> and get a free account to grab these hand gesture images. </p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Configuring-Your-Environment-Semantic-Caching-FastAPI-Redis-Ollama-Setup"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Configuring-Your-Environment-Semantic-Caching-FastAPI-Redis-Ollama-Setup">Configuring Your Environment for Semantic Caching: FastAPI, Redis, and Ollama Setup</a></h2>



<p>To follow this guide, you need a small set of Python libraries and system services that support API orchestration, vector similarity, and LLM interaction. The goal is to keep the environment lightweight, reproducible, and easy to reason about.</p>



<p>At a minimum, you will need:</p>



<ul class="wp-block-list">
<li>Python 3.10 or newer</li>



<li>Redis (used as the cache backing store)</li>



<li>An LLM + embedding provider (Ollama in this tutorial)</li>
</ul>



<p>All required Python dependencies are <code data-enlighter-language="python" class="EnlighterJSRAW">pip</code>-installable.</p>



<h3 class="wp-block-heading">Installing Python Dependencies</h3>



<p>Create and activate a virtual environment (recommended), then install the required packages:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="1">$ pip install fastapi uvicorn redis httpx python-dotenv numpy
</pre>



<p>These libraries provide the following functionality:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">fastapi</code>: API layer and request orchestration</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">uvicorn</code>: ASGI server for running the service</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">redis</code>: Client for communicating with the cache store</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">httpx</code>: HTTP client for embedding and LLM calls</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">numpy</code>: Vector math for cosine similarity</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">python-dotenv</code>: Environment-based configuration</li>
</ul>



<h3 class="wp-block-heading">Verifying Redis</h3>



<p>This lesson assumes Redis is running locally on the default port.</p>



<p>You can verify Redis is available with:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="2">$ redis-cli ping
PONG
</pre>



<p>If Redis is not installed, you can start it quickly using Docker (you can also spin it up using the <code data-enlighter-language="python" class="EnlighterJSRAW">docker-compose.yml</code> we provide in the code zip):</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="3">$ docker run -p 6379:6379 redis:7
</pre>



<h3 class="wp-block-heading">Setting Up Ollama</h3>



<p>This system uses <strong>Ollama</strong> for both embedding generation and LLM inference. Make sure Ollama is installed and running, and that the required models are available.</p>



<p>For example:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="4">$ ollama pull nomic-embed-text
$ ollama pull llama3.2
</pre>



<p>Once running, Ollama exposes local HTTP endpoints that the application will call directly for embeddings and text generation.</p>



<!-- wp:paragraph -->
<h3>Need Help Configuring Your Development Environment?</h3>
<!-- /wp:paragraph -->

<!-- wp:image {"align":"center","id":18137,"sizeSlug":"large","linkDestination":"custom"} -->
<figure class="wp-block-image aligncenter size-large"><a href="https://pyimagesearch.com/pyimagesearch-university/" target="_blank" rel="noreferrer noopener"><img decoding="async" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-18137" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?lossy=2&strip=1&webp=1 500w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?size=126x84&lossy=2&strip=1&webp=1 126w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?size=252x168&lossy=2&strip=1&webp=1 252w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2021/01/pyimagesearch_plus_jupyter.png?size=378x253&lossy=2&strip=1&webp=1 378w" sizes="(max-width: 500px) 100vw, 500px" /></a><figcaption>Having trouble configuring your development environment? Want access to pre-configured Jupyter Notebooks running on Google Colab? Be sure to join <a href="https://pyimagesearch.com/pyimagesearch-university/" target="_blank" rel="noreferrer noopener" aria-label=" (opens in a new tab)">PyImageSearch University</a> — you will be up and running with this tutorial in a matter of minutes. </figcaption></figure>
<!-- /wp:image -->

<!-- wp:paragraph -->
<p>All that said, are you:</p>
<!-- /wp:paragraph -->

<!-- wp:list -->
<ul><li>Short on time?</li><li>Learning on your employer’s administratively locked system?</li><li>Wanting to skip the hassle of fighting with the command line, package managers, and virtual environments?</li><li><strong>Ready to run the code immediately on your Windows, macOS, or Linux system?</strong></li></ul>
<!-- /wp:list -->

<!-- wp:paragraph -->
<p>Then join <a href="https://pyimagesearch.com/pyimagesearch-university/" target="_blank">PyImageSearch University</a> today!</p>
<!-- /wp:paragraph -->

<!-- wp:paragraph -->
<p><strong>Gain access to Jupyter Notebooks for this tutorial and other PyImageSearch guides pre-configured to run on Google Colab’s ecosystem right in your web browser!</strong> No installation required.</p>
<!-- /wp:paragraph -->

<!-- wp:paragraph -->
<p>And best of all, these Jupyter Notebooks will run on Windows, macOS, and Linux!</p>
<!-- /wp:paragraph -->



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Project-Structure"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Project-Structure">Project Structure</a></h2>



<p>Before diving into individual components, let’s take a moment to understand how the project is organized.</p>



<p>A clear directory structure is especially important in LLM-backed systems, where responsibilities span API orchestration, caching, embeddings, model calls, and observability. In this project, each concern is isolated into its own module so the request flow remains easy to trace and reason about.</p>



<p>After downloading the source code from the “Downloads” section, your directory structure should look like this:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="5">.
├── app
│   ├── api
│   │   ├── __init__.py
│   │   └── ask.py
│   ├── cache
│   │   ├── __init__.py
│   │   ├── poisoning.py
│   │   ├── schemas.py
│   │   ├── semantic_cache.py
│   │   └── ttl.py
│   ├── config
│   │   ├── __init__.py
│   │   └── settings.py
│   ├── embeddings
│   │   ├── __init__.py
│   │   └── embedder.py
│   ├── llm
│   │   ├── __init__.py
│   │   └── ollama_client.py
│   ├── main.py
│   └── observability
│       └── metrics.py
├── complete-codebase.txt
├── docker-compose.yml
├── Dockerfile
├── README.md
└── requirements.txt
</pre>



<p>Let’s break this down at a high level.</p>



<h3 class="wp-block-heading">The app/ Package</h3>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">app/</code> directory contains all runtime application code. Nothing outside this folder is imported at execution time.</p>



<p>This keeps the service self-contained and makes it easy to reason about deployment and dependencies.</p>



<h3 class="wp-block-heading">app/main.py: Application Entry Point</h3>



<p>This file defines the FastAPI application and registers all routers.</p>



<p>It contains <strong>no business logic</strong> — only service wiring. Every request into the system enters through this file.</p>



<h3 class="wp-block-heading">app/api/: API Layer</h3>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">api/</code> package defines HTTP-facing endpoints.</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">ask.py</code>: Implements the <code data-enlighter-language="python" class="EnlighterJSRAW">/ask</code> endpoint and acts as the orchestration layer for the entire semantic caching pipeline.</li>
</ul>



<p>The API layer is responsible for:</p>



<ul class="wp-block-list">
<li>input validation</li>



<li>enforcing cache ordering</li>



<li>coordinating cache, embeddings, and LLM calls</li>



<li>returning structured debug information</li>
</ul>



<p>It does <em>not</em> implement caching or similarity logic directly.</p>



<h3 class="wp-block-heading">app/cache/: Caching Logic</h3>



<p>This package contains all cache-related functionality.</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">semantic_cache.py</code>: Core semantic cache implementation (exact match, semantic match, Redis storage, similarity search).</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">schemas.py</code>: Defines the cache entry schema used for Redis storage.</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">ttl.py</code>: Application-level TTL configuration and expiration checks.</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">poisoning.py</code>: Safety checks to prevent invalid or error responses from being reused.</li>
</ul>



<p>By isolating caching logic here, the API layer stays clean and reusable.</p>



<h3 class="wp-block-heading">app/embeddings/: Embedding Generation</h3>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">embedder.py</code>: Handles embedding generation via Ollama’s embedding endpoint.</li>
</ul>



<p>This module has a single responsibility: convert text into semantic vectors.</p>



<p>It does not cache, rank, or validate embeddings.</p>



<h3 class="wp-block-heading">app/llm/: LLM Client</h3>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">ollama_client.py</code>: Wraps calls to the Ollama text-generation endpoint.</li>
</ul>



<p>Keeping LLM interaction isolated allows the rest of the system to remain model-agnostic.</p>



<h3 class="wp-block-heading">app/observability/: Metrics</h3>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">metrics.py</code>: Implements simple in-memory counters for cache hits, misses, and LLM calls.</li>
</ul>



<p>These metrics are intentionally lightweight and meant for learning and debugging, not production monitoring.</p>



<h3 class="wp-block-heading">Configuration and Infrastructure</h3>



<p>The remaining pieces handle configuration and deployment. The settings module lives inside <code data-enlighter-language="python" class="EnlighterJSRAW">app/</code>, while the other files sit at the project root:</p>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">app/config/settings.py</code>: Centralizes environment-based configuration (Redis host, TTLs, model names).</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">Dockerfile</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">docker-compose.yml</code>: Define a reproducible runtime environment for the API and Redis.</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">requirements.txt</code>: Lists all Python dependencies required to run the service.</li>
</ul>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-FastAPI-Entry-Point-Semantic-Caching-Wiring-API-Service"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-FastAPI-Entry-Point-Semantic-Caching-Wiring-API-Service">FastAPI Entry Point for Semantic Caching: Wiring the API Service</a></h2>



<p>Before we look at caching logic, embeddings, or Redis, it’s important to understand how the service itself is wired together. Every request to the semantic cache enters the system through a single FastAPI application, defined in <code data-enlighter-language="python" class="EnlighterJSRAW">app/main.py</code>.</p>



<p>This file acts as the <strong>entry point</strong> of the service. Its responsibility is not to implement business logic, but to connect the application components and expose HTTP routes.</p>



<h3 class="wp-block-heading">Application Entry Point (app/main.py)</h3>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="6">from fastapi import FastAPI
from api.ask import router as ask_router

app = FastAPI(title="Semantic Cache Basics")
app.include_router(ask_router)
</pre>



<p>Let’s break this down.</p>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">FastAPI()</code> call creates the application object. This object represents the entire web service and is what the ASGI (Asynchronous Server Gateway Interface) server (<code data-enlighter-language="python" class="EnlighterJSRAW">uvicorn</code>) runs when the container starts.</p>



<p>The application itself contains no knowledge of caching, embeddings, or LLMs. It simply defines a runtime container that will host those capabilities.</p>



<h3 class="wp-block-heading">Router Registration</h3>



<p>Instead of defining endpoints directly in <code data-enlighter-language="python" class="EnlighterJSRAW">main.py</code>, the application imports a router from <code data-enlighter-language="python" class="EnlighterJSRAW">api/ask.py</code> and registers it using <code data-enlighter-language="python" class="EnlighterJSRAW">include_router()</code>.</p>



<p>This pattern serves several purposes:</p>



<ul class="wp-block-list">
<li><strong>Separation of concerns: </strong>Routing and request handling live outside the application entry point.</li>



<li><strong>Scalability: </strong>As the system grows, additional routers (for health checks, metrics, or admin endpoints) can be added without modifying core application wiring.</li>



<li><strong>Readability: </strong><code data-enlighter-language="python" class="EnlighterJSRAW">main.py</code> remains easy to understand at a glance, even as the codebase expands.</li>
</ul>



<p>At runtime, FastAPI merges the routes defined in <code data-enlighter-language="python" class="EnlighterJSRAW">ask_router</code> into the main application. When a request arrives at the <code data-enlighter-language="python" class="EnlighterJSRAW">/ask</code> endpoint, FastAPI resolves it through the registered router and forwards it to the appropriate handler function.</p>



<h3 class="wp-block-heading">Why This Matters</h3>



<p>Keeping the entry point minimal is intentional. It ensures that:</p>



<ul class="wp-block-list">
<li>The application startup process is predictable</li>



<li>Routing logic is easy to trace</li>



<li>Core functionality can evolve independently of service wiring</li>
</ul>



<p>With the application structure in place, we can now focus on what actually happens when a request reaches the system.</p>



<p>In the next section, we will walk through the <code data-enlighter-language="python" class="EnlighterJSRAW">/ask</code> endpoint and see how it orchestrates exact-match caching, semantic search, and LLM fallback step by step.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-FastAPI-Ask-Endpoint-End-to-End-Semantic-Caching-Request-Flow"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-FastAPI-Ask-Endpoint-End-to-End-Semantic-Caching-Request-Flow">FastAPI Ask Endpoint: End-to-End Semantic Caching Request Flow</a></h2>



<p>This section makes the architecture concrete. We now walk through the <code data-enlighter-language="python" class="EnlighterJSRAW">/ask</code> endpoint, which orchestrates the entire semantic caching pipeline from request arrival to response delivery.</p>



<p>The goal here is not to memorize code, but to understand <strong>why each step exists</strong>, <strong>where it lives</strong>, and <strong>how it protects performance, cost, and correctness</strong>.</p>



<h3 class="wp-block-heading">The Role of the Ask Endpoint</h3>



<p>The Ask endpoint is the <strong>control plane</strong> of the system.</p>



<p>It does <strong>not</strong>:</p>



<ul class="wp-block-list">
<li>Compute similarity</li>



<li>Store embeddings</li>



<li>Talk directly to Redis internals</li>
</ul>



<p>Instead, it:</p>



<ul class="wp-block-list">
<li>Validates input</li>



<li>Decides which cache layers to consult</li>



<li>Enforces ordering between cheap and expensive operations</li>



<li>Collects observability signals</li>



<li>Guarantees a response even on cache failure</li>
</ul>



<p>This separation is intentional. Cache logic remains reusable and testable, while orchestration logic stays explicit at the API boundary.</p>



<h3 class="wp-block-heading">Defining the API Contract</h3>



<p>We begin by defining the request and response models.</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="7">class AskRequest(BaseModel):
    query: str
    bypass_cache: bool = False
</pre>



<p>The request consists of a user <code data-enlighter-language="python" class="EnlighterJSRAW">query</code> and an optional <code data-enlighter-language="python" class="EnlighterJSRAW">bypass_cache</code> flag. This flag allows us to force a cache miss during debugging or testing, ensuring that the LLM and embedding pipeline still function correctly.</p>



<p>Before the request ever reaches the cache, the <code data-enlighter-language="python" class="EnlighterJSRAW">query</code> field is validated.</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="8">@field_validator('query')
@classmethod
def validate_query(cls, v: str) -> str:
    if not v or not v.strip():
        raise ValueError("Query cannot be empty or whitespace-only")
    return v.strip()
</pre>



<p>This validation step protects the system at the boundary. Rejecting empty or whitespace-only queries prevents:</p>



<ul class="wp-block-list">
<li>wasted embedding computation</li>



<li>cache pollution with meaningless entries</li>



<li>unnecessary LLM calls</li>
</ul>



<p>This is a recurring pattern in production systems: <strong>fail fast, before expensive operations are triggered</strong>.</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="9">class AskResponse(BaseModel):
    response: str
    from_cache: bool
    similarity: float
    debug: dict
</pre>



<p>The response model intentionally exposes diagnostic information through fields such as <code data-enlighter-language="python" class="EnlighterJSRAW">from_cache</code>, <code data-enlighter-language="python" class="EnlighterJSRAW">similarity</code>, and <code data-enlighter-language="python" class="EnlighterJSRAW">debug</code>. During development, this makes cache behavior transparent rather than opaque.</p>



<h3 class="wp-block-heading">Initializing the Cache</h3>



<p>Before handling requests, we create a <code data-enlighter-language="python" class="EnlighterJSRAW">SemanticCache</code> instance:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="10">cache = SemanticCache()
</pre>



<p>The endpoint itself remains stateless. All persistence and reuse live inside the cache layer.</p>



<h3 class="wp-block-heading">Step 1: Entering the Endpoint</h3>



<p>The endpoint is registered using FastAPI’s routing mechanism:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="11">@router.post("/ask", response_model=AskResponse)
def ask_endpoint(request: AskRequest):
</pre>



<p>FastAPI automatically validates incoming requests and outgoing responses using the schemas defined earlier. If invalid data enters or exits the system, FastAPI raises an error instead of silently failing.</p>



<p>Inside the handler, we extract the query and initialize tracking state:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="12">query = request.query
miss_reason = None
</pre>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">miss_reason</code> variable exists purely for observability. Rather than treating cache misses as a black box, we explicitly track <em>why</em> a miss occurred.</p>



<h3 class="wp-block-heading">Step 2: Exact-Match Cache Lookup (Fast Path)</h3>



<p>The first decision point is the exact-match cache lookup:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="13">if not request.bypass_cache:
    cached = cache.search(None, exact_query=query)
</pre>



<p>This is the <strong>cheapest path</strong> through the system.</p>



<p>If the same query has already been answered, the response can be returned immediately:</p>



<ul class="wp-block-list">
<li>no embeddings are generated</li>



<li>no similarity computation occurs</li>



<li>the LLM is not touched</li>
</ul>



<p>If a cached entry is found, it is validated:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="14">if is_expired(cached):
    miss_reason = "expired"
elif is_poisoned(cached):
    miss_reason = "poisoned"
elif cached.get("confidence", 0.0) &lt; 0.7:
    miss_reason = "low_confidence"
</pre>



<p>Only entries that are fresh, valid, and confident are allowed to short-circuit the pipeline.</p>



<p>When all checks pass, the endpoint returns immediately:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="15">metrics.cache_hit()
return AskResponse(...)
</pre>



<p>This path typically completes in milliseconds and handles repeated identical queries efficiently.</p>



<h3 class="wp-block-heading">Step 3: Embedding Generation (Escalation Point)</h3>



<p>If the exact-match lookup fails or is bypassed, the endpoint escalates:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="16">embedding = embed_text(query)
</pre>



<p>Embedding generation is expensive, even when running locally. For this reason, it is intentionally delayed until all cheaper options have been exhausted.</p>



<p>This single design choice has a significant impact on system efficiency.</p>



<h3 class="wp-block-heading">Step 4: Semantic Cache Lookup</h3>



<p>With the embedding available, the endpoint attempts a semantic search:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="17">cached = cache.search(embedding)
</pre>



<p>This path catches paraphrased and reworded queries. As before, cached entries are validated to ensure they are safe to reuse.</p>



<p>If a suitable match is found, the response is returned without calling the LLM.</p>



<h3 class="wp-block-heading">Step 5: Explicit Cache Bypass</h3>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">bypass_cache</code> flag is handled explicitly:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="18">if request.bypass_cache:
    miss_reason = "bypass"
</pre>



<p>This allows controlled testing and debugging without modifying code or disabling cache logic globally.</p>



<h3 class="wp-block-heading">Step 6: LLM Fallback and Cache Population</h3>



<p>If both cache layers miss, or the cache is bypassed, the request is forwarded to the LLM:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="19">metrics.cache_miss()
response = generate_llm_response(query)
metrics.llm_call()
</pre>



<p>This is the slowest path through the system, but it guarantees the response is freshly generated for this exact query rather than reused.</p>



<p>Successful responses are stored in the cache:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="20">if not response.startswith("[LLM Error]"):
    cache.store(query, embedding, response, metadata=metadata)
</pre>



<p>Responses beginning with <code data-enlighter-language="python" class="EnlighterJSRAW">[LLM Error]</code> are intentionally not cached, preventing cache poisoning and ensuring failures do not propagate to future requests.</p>



<h3 class="wp-block-heading">Control Flow Summary</h3>



<p>The endpoint follows a simple, explicit sequence:</p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/04/image-23-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="738" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-23-1024x738.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53580" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-23.png?size=126x91&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-23-300x216.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-23.png?size=378x272&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-23.png?size=504x363&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-23.png?size=630x454&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-23-768x554.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-23-1024x738.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-23-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 4:</strong> LLM API Control Flow with Layered Semantic Caching (source: image by the author).</figcaption></figure></div>


<p>Every expensive operation is deferred until absolutely necessary.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Embeddings-Turning-Text-into-Semantic-Vectors"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Embeddings-Turning-Text-into-Semantic-Vectors">Embeddings: Turning Text into Semantic Vectors</a></h2>



<p>Up to this point, we have treated embeddings as a black box: something expensive that we try to avoid unless absolutely necessary.</p>



<p>In this section, we will open that box just enough to understand <strong>what embeddings are</strong>, <strong>when they are generated</strong>, and <strong>why they enable semantic caching</strong> — without diving into vector math or model internals.</p>



<h3 class="wp-block-heading">Why Embeddings Exist in This System</h3>



<p>Exact-match caching works only when queries are identical at the string level. As soon as wording changes, exact matching breaks down.</p>



<p>Embeddings solve this problem by converting text into a numeric representation that captures <strong>meaning rather than surface form</strong>.</p>



<p>Queries that mean the same thing tend to produce vectors that are close together in vector space, even if their wording differs significantly.</p>



<p>This is the foundation that makes semantic caching possible.</p>



<h3 class="wp-block-heading">Embedding Generation Happens on Demand</h3>



<p>In our implementation, embeddings are generated <strong>only after</strong> the exact-match cache fails.</p>



<p>This decision is intentional.</p>



<p>Embedding generation involves:</p>



<ul class="wp-block-list">
<li>a model invocation</li>



<li>network overhead</li>



<li>serialization and deserialization</li>



<li>non-trivial latency</li>
</ul>



<p>Because of this cost, embeddings are treated as an <strong>escalation step</strong>, not a default operation.</p>



<p>This is why the <code data-enlighter-language="python" class="EnlighterJSRAW">/ask</code> endpoint first attempts an exact-match lookup before calling <code data-enlighter-language="python" class="EnlighterJSRAW">embed_text()</code>.</p>



<h3 class="wp-block-heading">The embed_text Function</h3>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="21">def embed_text(text: str):
</pre>



<p>This function has one responsibility: <strong>Convert input text into a semantic vector representation.</strong></p>



<p>It does not perform caching, similarity search, or validation. Those concerns live elsewhere.</p>



<h3 class="wp-block-heading">Calling the Embedding Model</h3>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="22">url = f"http://{settings.OLLAMA_HOST}:{settings.OLLAMA_PORT}/api/embeddings"
</pre>



<p>Here, we construct the Ollama embedding endpoint URL from 2 configuration values: <code data-enlighter-language="python" class="EnlighterJSRAW">settings.OLLAMA_HOST</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">settings.OLLAMA_PORT</code>.</p>



<p>This allows the embedding service to run locally, inside Docker, or on a remote host without changing code.</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="23">resp = httpx.post(
    url,
    json={"model": settings.EMBEDDING_MODEL, "prompt": text},
    timeout=10.0
)
</pre>



<p>This request sends 2 key pieces of information to the embedding service:</p>



<ul class="wp-block-list">
<li>the <strong>embedding model name</strong> (e.g., <code data-enlighter-language="python" class="EnlighterJSRAW">nomic-embed-text</code>)</li>



<li>the <strong>input text</strong> to embed</li>
</ul>



<p>The timeout ensures the request does not hang indefinitely. Embedding generation is expensive, but it should still fail fast if something goes wrong.</p>



<h3 class="wp-block-heading">Handling the Response</h3>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="24">resp.raise_for_status()
return resp.json().get("embedding", [])
</pre>



<p>If the request succeeds, the embedding model returns a numeric vector — typically a list of floating-point values.</p>



<p>This vector represents the <strong>semantic meaning</strong> of the input text and becomes the key used for similarity comparison in the cache.</p>



<p>At this stage, we treat the vector as an opaque object. We do not inspect its dimensionality or normalize it here.</p>



<h3 class="wp-block-heading">Error Handling Strategy</h3>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="25">except Exception as e:
    raise RuntimeError(f"Failed to generate embedding: {e}")
</pre>



<p>If embedding generation fails for any reason (network issues, model errors, timeouts), the function raises an exception.</p>



<p>This is intentional.</p>



<p>If embeddings cannot be generated, the system cannot safely perform semantic matching. Silently continuing would lead to unpredictable behavior, so we fail loudly instead.</p>



<h3 class="wp-block-heading">Why the Embedder Is Intentionally Simple</h3>



<p>Notice what this function <strong>does not do</strong>:</p>



<ul class="wp-block-list">
<li>it does not store embeddings</li>



<li>it does not perform similarity search</li>



<li>it does not retry failed requests</li>



<li>it does not fall back to alternative models</li>
</ul>



<p>Those decisions are deliberate.</p>



<p>For Lesson 1, the embedder exists purely to convert text into vectors. Keeping it small and focused makes the system easier to understand and test.</p>



<h3 class="wp-block-heading">How the Embedder Is Used in the Pipeline</h3>



<p>At runtime, the embedder is called only when necessary:</p>



<ul class="wp-block-list">
<li>Exact-match cache fails</li>



<li>The query is passed to <code data-enlighter-language="python" class="EnlighterJSRAW">embed_text()</code></li>



<li>The returned vector is sent to the semantic cache</li>



<li>Similarity is computed against stored embeddings</li>
</ul>



<p>This ensures embeddings are generated <strong>only when cheaper paths have already failed</strong>.</p>



<h3 class="wp-block-heading">Key Takeaways</h3>



<ul class="wp-block-list">
<li>Embeddings are generated via a simple HTTP call to a local model</li>



<li>The embedder has a single responsibility</li>



<li>Errors are surfaced immediately</li>



<li>Embeddings act as semantic keys for cache lookup</li>
</ul>



<p>With embedding generation understood, we are now ready to look at the <strong>semantic cache itself</strong>: how embeddings and responses are stored, scanned, and matched.</p>



<p>In the next section, we will walk through the semantic cache implementation, starting with a deliberately naive but correct linear scan approach.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Semantic-Cache-Cosine-Similarity-Redis-Storage-Reusing-Meaning"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Semantic-Cache-Cosine-Similarity-Redis-Storage-Reusing-Meaning">The Semantic Cache: Cosine Similarity, Redis Storage, and Reusing Meaning</a></h2>



<p>At this point, we understand how queries enter the system and how text is converted into embeddings. What remains is the component that ties everything together: the semantic cache itself.</p>



<p>The semantic cache is responsible for 2 things:</p>



<ul class="wp-block-list">
<li><strong>Storing</strong> past queries, embeddings, and responses</li>



<li><strong>Retrieving</strong> the best reusable response for a new query</li>
</ul>



<p>In Lesson 1, we intentionally implement the cache in the simplest correct way possible: a <strong>linear scan over cached entries</strong>. This keeps the implementation easy to reason about and makes the request flow fully transparent.</p>



<h3 class="wp-block-heading">The Semantic Cache Module</h3>



<p>The cache logic lives in <code data-enlighter-language="python" class="EnlighterJSRAW">semantic_cache.py</code>:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="26">class SemanticCache:
</pre>



<p>This class encapsulates all Redis interaction and similarity logic. The API layer never talks to Redis directly.</p>



<h3 class="wp-block-heading">Initializing the Cache</h3>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="27">def __init__(self):
    self.r = redis.Redis(
        host=settings.REDIS_HOST,
        port=settings.REDIS_PORT,
        decode_responses=True
    )
    self.similarity_threshold = 0.85
    self.namespace = "semantic_cache:v1"
</pre>



<p>Here we establish a Redis connection and configure 2 important parameters:</p>



<ul class="wp-block-list">
<li><strong>Similarity threshold: </strong>Only responses with sufficiently high semantic similarity are eligible for reuse.</li>



<li><strong>Namespace prefix: </strong>All Redis keys are namespaced to avoid collisions and allow future versioning.</li>
</ul>



<p>For Lesson 1, the exact threshold value is not important. What matters is that a threshold exists and is applied consistently.</p>



<h3 class="wp-block-heading">Storing Cache Entries</h3>



<p>The first core operation is storing new entries.</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="28">def store(self, query, embedding, response, metadata=None):
</pre>



<p>This method is called only after a successful LLM response.</p>



<h3 class="wp-block-heading">Creating a Cache Entry</h3>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="29">entry = CacheEntry(
    id=entry_uuid,
    query=query,
    query_hash=query_hash,
    embedding=json.dumps(embedding),
    response=response,
    created_at=int(time.time()),
    ttl=default_ttl(),
    metadata=metadata or {}
)
</pre>



<p>Each cache entry stores:</p>



<ul class="wp-block-list">
<li>the original query</li>



<li>a normalized query hash (used for exact matching)</li>



<li>the embedding (serialized for Redis storage)</li>



<li>the LLM response</li>



<li>the creation timestamp and TTL</li>



<li>optional metadata for observability</li>
</ul>



<p>This structure allows the cache to support both exact-match and semantic lookups.</p>



<h3 class="wp-block-heading">Writing to Redis</h3>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="30">self.r.hset(redis_key, mapping=entry.dict())
self.r.sadd(f"{self.namespace}:keys", redis_key)
</pre>



<p>Each cache entry is stored as a Redis hash, and all entry keys are tracked in a Redis set.</p>



<p>This allows the cache to iterate over all entries during search operations.</p>



<p>For Lesson 1, this approach is intentionally simple and explicit.</p>



<h3 class="wp-block-heading">Searching the Cache</h3>



<p>The second core operation is lookup.</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="31">def search(self, embedding, exact_query=None):
</pre>



<p>This method supports <strong>2 search modes</strong>, which map directly to the layered cache strategy used in the API.</p>



<h3 class="wp-block-heading">Exact-Match Lookup (Fast Path)</h3>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="32">if exact_query:
    query_hash = self._hash_query(exact_query)
</pre>



<p>When an exact query is provided, the cache first attempts a hash-based lookup.</p>



<p>Each cached entry is scanned until a matching hash is found. If found, the entry is returned immediately with a similarity score of 1.0.</p>



<p>No embeddings are involved in this path.</p>



<h3 class="wp-block-heading">Semantic Lookup (Flexible Path)</h3>



<p>If no exact match is found and an embedding is provided, the cache performs a semantic search:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="33">sim = self.cosine_similarity(query_embedding, cached_embedding)
</pre>



<p>Each cached embedding is compared against the query embedding using cosine similarity.</p>



<p>Only entries that exceed the configured similarity threshold are considered candidates.</p>



<h3 class="wp-block-heading">Selecting the Best Match</h3>



<p>During the scan, the cache tracks the highest similarity score and returns the best matching entry.</p>



<p>This ensures that even when multiple entries are similar, the most relevant response is reused.</p>



<h3 class="wp-block-heading">Why This Implementation Is O(N)</h3>



<p>Every semantic search scans all cached entries, and an exact-match lookup does the same whenever no matching hash exists.</p>



<p>This is not an accident.</p>



<p>For Lesson 1, a linear scan has 3 advantages:</p>



<ul class="wp-block-list">
<li>the behavior is easy to understand</li>



<li>the logic is fully visible</li>



<li>debugging is straightforward</li>
</ul>



<p>More advanced indexing strategies belong in later lessons.</p>



<h3 class="wp-block-heading">Why Expired Entries Are Cleaned During Search</h3>



<p>While scanning entries, expired items are removed opportunistically.</p>



<p>This prevents stale data from accumulating indefinitely without introducing background workers or schedulers.</p>



<h3 class="wp-block-heading">Key Takeaways</h3>



<ul class="wp-block-list">
<li>The semantic cache owns all <code data-enlighter-language="python" class="EnlighterJSRAW">Redis</code> interactions</li>



<li>Exact-match lookup is attempted before semantic matching</li>



<li>Semantic similarity is computed using embeddings</li>



<li>A linear scan trades performance for clarity</li>



<li>The cache returns the <em>best</em> reusable response, not just the first match</li>
</ul>



<p>At this stage, the system is fully functional: queries can be answered, cached, and reused.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Cache-Entries-What-Exactly-Gets-Stored"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Cache-Entries-What-Exactly-Gets-Stored">Cache Entries: What Exactly Gets Stored?</a></h2>



<p>So far, we’ve treated the cache as a logical concept: something that stores queries, embeddings, and responses.</p>



<p>In this section, we’ll make that concrete by looking at <strong>the structure of a cache entry</strong>. Understanding this structure is important because it explains <em>why</em> the cache can support both exact-match and semantic lookup — without duplicating data or logic.</p>



<h3 class="wp-block-heading">The Cache Entry Schema</h3>



<p>Cache entries are defined using a Pydantic model:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="34">class CacheEntry(BaseModel):
    id: str
    query: str
    query_hash: str
    embedding: str
    response: str
    created_at: int
    ttl: int
    metadata: Optional[Dict] = Field(default_factory=dict)
</pre>



<p>Each field exists for a specific reason. Let’s walk through them one by one.</p>



<h3 class="wp-block-heading">Identity and Query Fields</h3>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="35">id: str
query: str
query_hash: str
</pre>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">id</code>: uniquely identifies the cache entry and is used to construct the Redis key.</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">query</code>: stores the original user input. This is useful for debugging and inspection.</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">query_hash</code>: stores a normalized hash of the query and enables <strong>exact-match lookup</strong>.</li>
</ul>



<p>At this stage, it’s enough to know that the hash ensures identical queries can be matched quickly. We’ll revisit <em>how</em> and <em>why</em> this normalization matters in a later lesson.</p>



<h3 class="wp-block-heading">Embedding Storage</h3>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="36">embedding: str
</pre>



<p>Embeddings are stored as a <strong>JSON-serialized string</strong>, not as a raw Python list.</p>



<p>This choice is deliberate:</p>



<ul class="wp-block-list">
<li>Redis stores strings efficiently</li>



<li>Serialization keeps the schema simple</li>



<li>Deserialization happens only when similarity needs to be computed</li>
</ul>



<p>For Lesson 1, the important takeaway is that embeddings are stored <strong>once</strong>, alongside the response generated for the same query.</p>



<h3 class="wp-block-heading">Response and Timing Information</h3>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="37">response: str
created_at: int
ttl: int
</pre>



<ul class="wp-block-list">
<li><code data-enlighter-language="python" class="EnlighterJSRAW">response</code>: stores the text returned by the LLM.</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">created_at</code>: records when the entry was generated.</li>



<li><code data-enlighter-language="python" class="EnlighterJSRAW">ttl</code>: defines how long the entry is considered valid.</li>
</ul>



<p>The cache does not rely on Redis expiration here. Instead, validity is checked at read time. This gives the application full control over when an entry should be reused or rejected.</p>



<p>We intentionally avoid deeper TTL semantics in this lesson.</p>



<h3 class="wp-block-heading">Metadata and Safety</h3>



<pre class="EnlighterJSRAW" data-enlighter-language="python" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="38">metadata: Optional[Dict] = Field(default_factory=dict)
</pre>



<p>Metadata allows the cache to store contextual information such as:</p>



<ul class="wp-block-list">
<li>pipeline name</li>



<li>model identifier</li>



<li>request origin</li>
</ul>



<p>The use of <code data-enlighter-language="python" class="EnlighterJSRAW">default_factory=dict</code> avoids shared mutable state across cache entries — a subtle but important correctness detail.</p>



<p>At this stage, metadata is informational rather than functional.</p>



<h3 class="wp-block-heading">Why This Schema Works Well</h3>



<p>This schema supports the layered caching strategy naturally:</p>



<ul class="wp-block-list">
<li><strong>Exact match</strong> uses <code data-enlighter-language="python" class="EnlighterJSRAW">query_hash</code></li>



<li><strong>Semantic match</strong> uses <code data-enlighter-language="python" class="EnlighterJSRAW">embedding</code></li>



<li><strong>Freshness checks</strong> use <code data-enlighter-language="python" class="EnlighterJSRAW">created_at</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">ttl</code></li>



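<li><strong>Safety checks</strong> use <code data-enlighter-language="python" class="EnlighterJSRAW">response</code> and <code data-enlighter-language="python" class="EnlighterJSRAW">metadata</code></li>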
</ul>



<p>All required information is co-located in a single cache entry, making lookup and validation straightforward.</p>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-End-to-End-Demo-Verifying-Core-Cache-Behavior"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-End-to-End-Demo-Verifying-Core-Cache-Behavior">End-to-End Demo: Verifying Core Cache Behavior</a></h2>



<p>In this section, we will verify that the semantic cache behaves as expected under a small set of controlled scenarios.</p>



<p>These examples are meant to be <strong>run locally by the reader</strong>. The responses shown below are <strong>representative</strong> and may vary slightly depending on the model and configuration.</p>



<h3 class="wp-block-heading">Demo Case 1: Cold Request (LLM Fallback)</h3>



<p>We begin with a query that has not been seen before.</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="39">curl -X POST http://localhost:8000/ask \
  -H "Content-Type: application/json" \
  -d '{"query": "What is semantic caching?"}'
</pre>



<p><strong>Expected behavior</strong></p>



<ul class="wp-block-list">
<li>Exact-match cache miss</li>



<li>Semantic cache miss</li>



<li>LLM call</li>



<li>Cache population</li>
</ul>



<p><strong>Response</strong></p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/04/image-24-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="463" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-24-1024x463.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53582" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-24.png?size=126x57&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-24-300x135.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-24.png?size=378x171&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-24.png?size=504x228&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-24.png?size=630x285&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-24-768x347.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-24-1024x463.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-24-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-24-1536x694.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 5:</strong> Cold request flow showing a cache miss at both the exact-match and semantic cache layers, triggering an LLM fallback. The response is generated by the model and stored for future reuse (source: image by the author).</figcaption></figure></div>


<p>The key signal here is <code data-enlighter-language="python" class="EnlighterJSRAW">"from_cache": false</code>, confirming the request fell back to the LLM.</p>



<h3 class="wp-block-heading">Demo Case 2: Exact-Match Cache Hit</h3>



<p>Now we send the <strong>same query again</strong>.</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="40">curl -X POST http://localhost:8000/ask \
  -H "Content-Type: application/json" \
  -d '{"query": "What is semantic caching?"}'
</pre>



<p><strong>Expected behavior</strong></p>



<ul class="wp-block-list">
<li>Exact-match cache hit</li>



<li>No embedding generation</li>



<li>No LLM call</li>
</ul>



<p><strong>Example response</strong></p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/04/image-25-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="494" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-25-1024x494.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53584" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-25.png?size=126x61&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-25-300x145.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-25.png?size=378x182&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-25.png?size=504x243&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-25.png?size=630x304&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-25-768x371.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-25-1024x494.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-25-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-25-1536x741.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 6:</strong> Exact-match cache behavior. The repeated query is served directly from the cache via an exact string match, bypassing embedding generation and the LLM entirely (source: image by the author).</figcaption></figure></div>


<p>Here, the cache reused the response immediately using an exact-match lookup.</p>



<h3 class="wp-block-heading">Optional Demo: Whitespace Normalization</h3>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="41">curl -X POST http://localhost:8000/ask \
  -H "Content-Type: application/json" \
  -d '{"query": "   What   is   semantic   caching?   "}'
</pre>



<p>This will hit the exact-match cache due to query normalization.</p>



<h3 class="wp-block-heading">Demo Case 3: Semantic Cache Hit (Paraphrased Query)</h3>



<p>Next, we send a paraphrased version of the original query.</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="42">curl -X POST http://localhost:8000/ask \
  -H "Content-Type: application/json" \
  -d '{"query": "Can you explain how semantic caching works?"}'
</pre>



<p><strong>Expected behavior</strong></p>



<ul class="wp-block-list">
<li>Exact-match cache miss</li>



<li>Embedding generation</li>



<li>Semantic cache hit</li>



<li>No LLM call</li>
</ul>



<p><strong>Example response</strong></p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/04/image-26-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="480" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-26-1024x480.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53586" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-26.png?size=126x59&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-26-300x141.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-26.png?size=378x177&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-26.png?size=504x236&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-26.png?size=630x295&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-26-768x360.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-26-1024x480.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-26-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-26-1536x720.png?lossy=2&amp;strip=1&amp;webp=1 1536w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 7:</strong> Semantic cache hit for a paraphrased query. Although the input text differs, the cached response is reused based on embedding similarity, avoiding a new LLM call (source: image by the author).</figcaption></figure></div>


<p>Even though the query text is different, the cache successfully reused the response based on semantic similarity.</p>



<h3 class="wp-block-heading">Demo Case 4: Forcing a Cache Miss with bypass_cache</h3>



<p>The <code data-enlighter-language="python" class="EnlighterJSRAW">bypass_cache</code> flag allows us to force the system to skip both cache layers.</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="43">curl -X POST http://localhost:8000/ask \
  -H "Content-Type: application/json" \
  -d '{"query": "What is semantic caching?", "bypass_cache": true}'
</pre>



<p><strong>Expected behavior</strong></p>



<ul class="wp-block-list">
<li>Exact-match cache skipped</li>



<li>Semantic cache skipped</li>



<li>LLM called unconditionally</li>
</ul>



<p><strong>Example response</strong></p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/04/image-27-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="488" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-27-1024x488.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53587" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-27.png?size=126x60&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-27-300x143.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-27.png?size=378x180&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-27.png?size=504x240&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-27.png?size=630x300&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-27-768x366.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-27-1024x488.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-27-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 8: </strong>Cache bypass behavior. The request explicitly skips all cache layers via <code>bypass_cache</code>, ensuring the LLM pipeline executes independently of cached responses (source: image by the author).</figcaption></figure></div>


<p>This is useful for debugging and validating that the LLM pipeline still works independently of the cache.</p>



<h3 class="wp-block-heading">Observing Cache Metrics (Optional)</h3>



<p>You can inspect basic cache statistics using the <code data-enlighter-language="python" class="EnlighterJSRAW">/internal/metrics</code> endpoint:</p>



<pre class="EnlighterJSRAW" data-enlighter-language="shell" data-enlighter-theme="" data-enlighter-highlight="" data-enlighter-linenumbers="true" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="44">curl http://localhost:8000/internal/metrics
</pre>



<p><strong>Example response</strong></p>


<div class="wp-block-image">
<figure class="aligncenter size-large"><a href="https://pyimagesearch.com/wp-content/uploads/2026/04/image-28-scaled.png" target="_blank" rel=" noreferrer noopener"><img decoding="async" width="1024" height="262" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-28-1024x262.png?lossy=2&strip=1&webp=1" alt="" class="wp-image-53589" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-28.png?size=126x32&amp;lossy=2&amp;strip=1&amp;webp=1 126w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-28-300x77.png?lossy=2&amp;strip=1&amp;webp=1 300w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-28.png?size=378x97&amp;lossy=2&amp;strip=1&amp;webp=1 378w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-28.png?size=504x129&amp;lossy=2&amp;strip=1&amp;webp=1 504w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-28.png?size=630x161&amp;lossy=2&amp;strip=1&amp;webp=1 630w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-28-768x196.png?lossy=2&amp;strip=1&amp;webp=1 768w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-28-1024x262.png?lossy=2&amp;strip=1&amp;webp=1 1024w, https://b2633864.smushcdn.com/2633864/wp-content/uploads/2026/04/image-28-scaled.png?lossy=2&amp;strip=1&amp;webp=1 1080w" sizes="(max-width: 630px) 100vw, 630px" /></a><figcaption class="wp-element-caption"><strong>Figure 9:</strong> Internal cache metrics showing hit, miss, and bypass counters, enabling lightweight observability of cache behavior during development and debugging (source: image by the author).</figcaption></figure></div>


<p>These metrics make cache behavior observable without requiring external tooling.</p>



<p>If you can reproduce these behaviors locally, you’ve successfully implemented a working semantic cache.</p>



<p>In the next lesson, we will take this system and begin hardening it for real-world use.</p>



<div id="pitch" style="padding: 40px; width: 100%; background-color: #F4F6FA;">
	<h3>What's next? We recommend <a target="_blank" href="https://pyimagesearch.com/pyimagesearch-university/?utm_source=blogPost&utm_medium=bottomBanner&utm_campaign=What%27s%20next%3F%20I%20recommend">PyImageSearch University</a>.</h3>

	<script src="https://fast.wistia.com/embed/medias/kno0cmko2z.jsonp" async></script><script src="https://fast.wistia.com/assets/external/E-v1.js" async></script><div class="wistia_responsive_padding" style="padding:56.25% 0 0 0;position:relative;"><div class="wistia_responsive_wrapper" style="height:100%;left:0;position:absolute;top:0;width:100%;"><div class="wistia_embed wistia_async_kno0cmko2z videoFoam=true" style="height:100%;position:relative;width:100%"><div class="wistia_swatch" style="height:100%;left:0;opacity:0;overflow:hidden;position:absolute;top:0;transition:opacity 200ms;width:100%;"><img decoding="async" src="https://fast.wistia.com/embed/medias/kno0cmko2z/swatch" style="filter:blur(5px);height:100%;object-fit:contain;width:100%;" alt="" aria-hidden="true" onload="this.parentNode.style.opacity=1;" /></div></div></div></div>

	<div style="margin-top: 32px; margin-bottom: 32px; ">
		<strong>Course information:</strong><br/>
		86+ total classes • 115+ hours of on-demand code walkthrough videos • Last updated: June 2026<br/>
		<span style="color: #169FE6;">★★★★★</span> 4.84 (128 Ratings) • 16,000+ Students Enrolled
	</div>

	<p><strong>I strongly believe that if you had the right teacher you could <em>master</em> computer vision and deep learning.</strong></p>

	<p>Do you think learning computer vision and deep learning has to be time-consuming, overwhelming, and complicated? Or has to involve complex mathematics and equations? Or requires a degree in computer science?</p>

	<p>That’s <em>not</em> the case.</p>

	<p>All you need to master computer vision and deep learning is for someone to explain things to you in <em>simple, intuitive</em> terms. <em>And that’s exactly what I do</em>. My mission is to change education and how complex Artificial Intelligence topics are taught.</p>

	<p>If you're serious about learning computer vision, your next stop should be PyImageSearch University, the most comprehensive computer vision, deep learning, and OpenCV course online today. Here you’ll learn how to <em>successfully</em> and <em>confidently</em> apply computer vision to your work, research, and projects. Join me in computer vision mastery.</p>

	<p><strong>Inside PyImageSearch University you'll find:</strong></p>

	<ul style="margin-left: 0px;">
		<li style="list-style: none;">&check; <strong>86+ courses</strong> on essential computer vision, deep learning, and OpenCV topics</li>
		<li style="list-style: none;">&check; <strong>86 Certificates</strong> of Completion</li>
		<li style="list-style: none;">&check; <strong>115+ hours</strong> of on-demand video</li>
		<li style="list-style: none;">&check; <strong>Brand new courses released <em>regularly</em></strong>, ensuring you can keep up with state-of-the-art techniques</li>
		<li style="list-style: none;">&check; <strong>Pre-configured Jupyter Notebooks in Google Colab</strong></li>
		<li style="list-style: none;">&check; Run all code examples in your web browser — works on Windows, macOS, and Linux (no dev environment configuration required!)</li>
		<li style="list-style: none;">&check; Access to <strong>centralized code repos for <em>all</em> 540+ tutorials</strong> on PyImageSearch</li>
		<li style="list-style: none;">&check; <strong>Easy one-click downloads</strong> for code, datasets, pre-trained models, etc.</li>
		<li style="list-style: none;">&check; <strong>Access</strong> on mobile, laptop, desktop, etc.</li>
	</ul>

	<p style="text-align: center;">
		<a target="_blank" class="button link" href="https://pyimagesearch.com/pyimagesearch-university/?utm_source=blogPost&utm_medium=bottomBanner&utm_campaign=What%27s%20next%3F%20I%20recommend" style="background-color: #6DC713; border-bottom: none;">Click here to join PyImageSearch University</a>
	</p>
</div>



<hr class="wp-block-separator has-alpha-channel-opacity" id="h2-Summary"/>



<h2 class="wp-block-heading"><a href="#TOC-h2-Summary">Summary</a></h2>



<p>In this lesson, we built a complete semantic caching system for LLM applications from the ground up. We started by wiring a FastAPI service and defining a clean request–response contract, then implemented a layered caching strategy that prioritizes cheap exact-match lookups before escalating to semantic similarity and, finally, LLM inference.</p>



<p>We walked through how text queries are converted into embeddings on demand, how cached responses and embeddings are stored in Redis, and how the cache decides whether a prior response can be safely reused. By keeping the implementation intentionally simple and explicit, every step in the request flow remains observable and easy to reason about.</p>



<p>Finally, we verified the system end-to-end by running controlled demos: a cold request falling back to the LLM, an exact-match cache hit, a semantic cache hit for a paraphrased query, and an explicit cache bypass. At this point, you have a working semantic cache that behaves correctly, makes its decisions visible, and serves as a solid foundation for further hardening and optimization.</p>



<h3 class="wp-block-heading">Citation Information</h3>



<p><strong>Singh, V.</strong> “Semantic Caching for LLMs: FastAPI, Redis, and Embeddings,” <em>PyImageSearch</em>, S. Huot, A. Sharma, and P. Thakur, eds., 2026, <a href="https://pyimg.co/yso6f" target="_blank" rel="noreferrer noopener">https://pyimg.co/yso6f</a></p>



<pre class="EnlighterJSRAW" data-enlighter-language="raw" data-enlighter-theme="classic" data-enlighter-highlight="" data-enlighter-linenumbers="false" data-enlighter-lineoffset="" data-enlighter-title="Semantic Caching for LLMs: FastAPI, Redis, and Embeddings" data-enlighter-group="45">@incollection{Singh_2026_semantic-caching-for-llms-fastapi-redis-and-embeddings,
  author = {Vikram Singh},
  title = {{Semantic Caching for LLMs: FastAPI, Redis, and Embeddings}},
  booktitle = {PyImageSearch},
  editor = {Susan Huot and Aditya Sharma and Piyush Thakur},
  year = {2026},
  url = {https://pyimg.co/yso6f},
}
</pre>



<p><strong>To download the source code to this post (and be notified when future tutorials are published here on PyImageSearch), </strong><em><strong>simply enter your email address in the form below!</strong></em></p>



<div id="download-the-code" class="post-cta-wrap">
<div class="gpd-post-cta">
	<div class="gpd-post-cta-content">
		

			<div class="gpd-post-cta-top">
				<div class="gpd-post-cta-top-image"><img decoding="async" src="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?lossy=2&strip=1&webp=1" alt="" srcset="https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?lossy=2&strip=1&webp=1 410w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?size=126x174&lossy=2&strip=1&webp=1 126w,https://b2633864.smushcdn.com/2633864/wp-content/uploads/2020/01/cta-source-guide-1.png?size=252x348&lossy=2&strip=1&webp=1 252w" sizes="(max-width: 410px) 100vw, 410px" /></div>
				
				<div class="gpd-post-cta-top-title"><h4>Download the Source Code and FREE 17-page Resource Guide</h4></div>
				<div class="gpd-post-cta-top-desc"><p>Enter your email address below to get a .zip of the code and a <strong>FREE 17-page Resource Guide on Computer Vision, OpenCV, and Deep Learning.</strong> Inside you'll find my hand-picked tutorials, books, courses, and libraries to help you master CV and DL!</p></div>


			</div>

			<div class="gpd-post-cta-bottom">
				<form id="footer-cta-code" class="footer-cta" action="https://www.getdrip.com/forms/4130035/submissions" method="post" target="_blank" data-drip-embedded-form="4130035">
					<input name="fields[email]" type="email" value="" placeholder="Your email address" class="form-control" />

					<button type="submit">Download the code!</button>

					<div style="display: none;" aria-hidden="true"><label for="website">Website</label><br /><input type="text" id="website" name="website" tabindex="-1" autocomplete="false" value="" /></div>
				</form>
			</div>


		
	</div>

</div>
</div>
<p>The post <a rel="nofollow" href="https://pyimagesearch.com/2026/04/27/semantic-caching-for-llms-fastapi-redis-and-embeddings/">Semantic Caching for LLMs: FastAPI, Redis, and Embeddings</a> appeared first on <a rel="nofollow" href="https://pyimagesearch.com">PyImageSearch</a>.</p>
]]></content:encoded>
					
		
		
			</item>
	</channel>
</rss>
