Learn Hugging Face šŸ¤—
  • Home
  • About
  • Setup
  • Glossary
    • Natural Language Processing (NLP)
      • Fully fine-tune an LLM to do structured data extraction
      • Build a custom text classification model and demo
    • Computer Vision
      • Build a custom object detection model and demo
      • Fine-tune a Small VLM to do structured data extraction
    • RAG - Retrieval-Augmented Generation
      • Build a multimodal RAG pipeline with NVIDIA Nemotron models

    On this page

    • Work in Progress: Performing accelerated inference with vLLM
    • Report an issue

    Work in Progress: Performing accelerated inference with vLLM

    Note: This notebook is a work in progress.

    Goal: 10x our inference speed on our LLM.

    • TK - this notebook builds off of the LLM fine-tuning tutorial - https://www.learnhuggingface.com/notebooks/hugging_face_llm_full_fine_tune_tutorial
    • TK - reference link to our fine-tuned model
    • TK - make an if/else statement to check if running in Google Colab and then install required dependencies
    import time
    
    print(f"Last updated: {time.ctime()}")
    import torch
    
    assert torch.cuda.is_available()
    # =============================================================================
    # vLLM vs HuggingFace Pipeline — Throughput Comparison (Google Colab)
    # =============================================================================
    # Run this entire cell in a Google Colab notebook with a GPU runtime.
    # (Runtime > Change runtime type > T4 GPU or better)
    #
    # This demo:
    #   1. Installs vllm
    #   2. Loads the FoodExtract dataset
    #   3. Runs inference with HuggingFace pipeline (baseline)
    #   4. Runs inference with vLLM offline batch mode (fast)
    #   5. Compares throughput and shows sample outputs
    # =============================================================================
    
    # --- Step 0: Install dependencies ---
    # print("=" * 70)
    # print("  STEP 0: Installing dependencies (this takes a few minutes)...")
    # print("=" * 70)
    
    # import subprocess, sys
    
    # subprocess.check_call([sys.executable, "-m", "pip", "install", "-q",
    #                        "vllm", "datasets", "matplotlib"])
    
    # print("\nāœ… Dependencies installed!\n")
    
    # --- Step 1: Load dataset ---
    print("=" * 70)
    print("  STEP 1: Loading FoodExtract-1k dataset")
    print("=" * 70)
    
    from datasets import load_dataset
    
    dataset = load_dataset("mrdbourke/FoodExtract-1k")
    
    def sample_to_conversation(sample):
        return {
            "messages": [
                {"role": "user", "content": sample["sequence"]},
                {"role": "assistant", "content": sample["gpt-oss-120b-label-condensed"]},
            ]
        }
    
    dataset = dataset.map(sample_to_conversation, batched=False)
    print(f"āœ… Loaded {len(dataset['train'])} samples\n")
    
    # --- Step 2: Prepare prompts (using Gemma 3 chat template) ---
    print("=" * 70)
    print("  STEP 2: Preparing prompts")
    print("=" * 70)
    
    from transformers import AutoTokenizer
    
    MODEL_PATH = "mrdbourke/FoodExtract-gemma-3-270m-fine-tune-v2"
    
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    
    # Format prompts using the chat template (same as the original notebook)
    test_prompts = [
        tokenizer.apply_chat_template(
            item["messages"][:1],  # user message only
            tokenize=False,
            add_generation_prompt=True,
        )
        for item in dataset["train"]
    ]
    
    print(f"āœ… Prepared {len(test_prompts)} prompts")
    print(f"   Example prompt (first 120 chars): {test_prompts[0][:120]}...\n")
    
    # --- Step 3: HuggingFace Pipeline baseline ---
    print("=" * 70)
    print("  STEP 3: Running HuggingFace Pipeline (baseline)")
    print("=" * 70)
    
    import time
    from transformers import AutoModelForCausalLM, pipeline, GenerationConfig
    
    # Load model with HF
    hf_model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        dtype="auto",
        device_map="cuda",
        attn_implementation="eager",
    )
    
    hf_pipeline = pipeline(
        "text-generation",
        model=hf_model,
        tokenizer=tokenizer,
    )
    
    gen_config = GenerationConfig(
        max_new_tokens=512,
        do_sample=False,
        disable_compile=True,
    )
    
    # Run HF pipeline with batch_size=32 (a reasonable batch size for comparison)
    HF_BATCH_SIZE = 32
    NUM_SAMPLES = len(test_prompts)
    
    print(f"   Running {NUM_SAMPLES} samples with batch_size={HF_BATCH_SIZE}...")
    
    hf_outputs = []
    hf_start = time.time()
    
    for i in range(0, NUM_SAMPLES, HF_BATCH_SIZE):
        batch = test_prompts[i : i + HF_BATCH_SIZE]
        batch_out = hf_pipeline(
            text_inputs=batch,
            batch_size=HF_BATCH_SIZE,
            generation_config=gen_config,
        )
        hf_outputs.extend(batch_out)
        # Progress update every 5 batches
        if ((i // HF_BATCH_SIZE) + 1) % 5 == 0:
            elapsed = time.time() - hf_start
            done = min(i + HF_BATCH_SIZE, NUM_SAMPLES)
            rate = done / elapsed
            print(f"   ... {done}/{NUM_SAMPLES} samples ({rate:.1f} samples/s)")
    
    hf_time = time.time() - hf_start
    hf_samples_per_sec = NUM_SAMPLES / hf_time
    
    print(f"\n   āœ… HuggingFace Pipeline done!")
    print(f"   Time: {hf_time:.2f}s | Samples/s: {hf_samples_per_sec:.2f}\n")
    
    # Free HF model memory
    del hf_model, hf_pipeline
    import torch, gc
    gc.collect()
    torch.cuda.empty_cache()
    
    # --- Step 4: vLLM offline batch inference ---
    print("=" * 70)
    print("  STEP 4: Running vLLM offline batch inference")
    print("=" * 70)
    
    from vllm import LLM, SamplingParams
    
    # Load model with vLLM
    # vLLM handles batching automatically — you just pass ALL prompts at once
    vllm_model = LLM(
        model=MODEL_PATH,
        max_model_len=4096,       # keep it modest for Colab GPU memory
        gpu_memory_utilization=0.85,
        dtype="auto",
    )
    
    sampling_params = SamplingParams(
        max_tokens=512,
        temperature=0,  # deterministic (greedy)
    )
    
    print(f"   Running {NUM_SAMPLES} samples in a single vLLM batch call...")
    
    vllm_start = time.time()
    
    # This is the key difference: ONE call with ALL prompts
    # vLLM internally handles optimal batching, scheduling, and KV cache management
    vllm_outputs = vllm_model.generate(test_prompts, sampling_params)
    
    vllm_time = time.time() - vllm_start
    vllm_samples_per_sec = NUM_SAMPLES / vllm_time
    
    print(f"\n   āœ… vLLM batch inference done!")
    print(f"   Time: {vllm_time:.2f}s | Samples/s: {vllm_samples_per_sec:.2f}\n")
    
# --- Step 5: Results comparison ---
print("=" * 70)
print("  RESULTS COMPARISON")
print("=" * 70)

# Wall-clock ratio of the two runs; > 1 means vLLM finished faster.
speedup = hf_time / vllm_time

# One multi-line f-string renders the whole comparison table. The nested
# format specs ({'Metric':<30}, {value:>15.2f}) left-align labels and
# right-align numbers; the final lines extrapolate the measured throughput
# to a hypothetical 80M-sample job (86400 seconds per day).
print(f"""
  {'Metric':<30} {'HuggingFace':>15} {'vLLM':>15}
  {'-' * 60}
  {'Total time (s)':<30} {hf_time:>15.2f} {vllm_time:>15.2f}
  {'Samples/second':<30} {hf_samples_per_sec:>15.2f} {vllm_samples_per_sec:>15.2f}
  {'Batch strategy':<30} {'chunk=' + str(HF_BATCH_SIZE):>15} {'auto (all)':>15}

  ⚔ vLLM speedup: {speedup:.1f}x faster

  At HF speed: {NUM_SAMPLES / hf_samples_per_sec / 3600:.2f} hours for {NUM_SAMPLES} samples
  At vLLM speed: {NUM_SAMPLES / vllm_samples_per_sec / 3600:.4f} hours for {NUM_SAMPLES} samples

  --- Extrapolation to 80M samples ---
  HF pipeline:  {80_000_000 / hf_samples_per_sec / 86400:,.1f} days
  vLLM batch:   {80_000_000 / vllm_samples_per_sec / 86400:,.1f} days
""")
    
    # --- Step 6: Show sample outputs side by side ---
    print("=" * 70)
    print("  SAMPLE OUTPUTS (first 5)")
    print("=" * 70)
    
    for i in range(5):
        prompt_text = dataset["train"][i]["messages"][0]["content"]
        expected = dataset["train"][i]["messages"][1]["content"]
    
        # Extract HF output (strip the prompt prefix)
        hf_text = hf_outputs[i][0]["generated_text"][len(test_prompts[i]):]
    
        # Extract vLLM output
        vllm_text = vllm_outputs[i].outputs[0].text
    
        print(f"\n  --- Sample {i} ---")
        print(f"  Prompt:   {prompt_text[:80]}{'...' if len(prompt_text) > 80 else ''}")
        print(f"  Expected: {expected[:80]}{'...' if len(expected) > 80 else ''}")
        print(f"  HF out:   {hf_text.strip()[:80]}{'...' if len(hf_text.strip()) > 80 else ''}")
        print(f"  vLLM out: {vllm_text.strip()[:80]}{'...' if len(vllm_text.strip()) > 80 else ''}")
        match = "āœ… match" if hf_text.strip() == vllm_text.strip() else "āš ļø  differ"
        print(f"  HF vs vLLM: {match}")
    
    # --- Step 7: Plot ---
    print(f"\n{'=' * 70}")
    print("  PLOT")
    print("=" * 70)
    
    import matplotlib.pyplot as plt
    
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
    
    # Bar chart: time
    methods = ["HuggingFace\nPipeline", "vLLM\nBatch"]
    times = [hf_time, vllm_time]
    colors = ["steelblue", "coral"]
    
    ax1.bar(methods, times, color=colors)
    ax1.set_ylabel("Total Time (seconds)")
    ax1.set_title(f"Inference Time — {NUM_SAMPLES} samples")
    for i, v in enumerate(times):
        ax1.text(i, v + max(times) * 0.02, f"{v:.1f}s", ha="center", fontweight="bold")
    
    # Add speedup arrow
    ax1.annotate(
        f"{speedup:.1f}x faster",
        xy=(1, vllm_time + max(times) * 0.05),
        xytext=(0, hf_time + max(times) * 0.1),
        arrowprops=dict(arrowstyle="->", color="green", lw=2),
        fontsize=12, fontweight="bold", color="green", ha="center",
    )
    ax1.set_ylim(0, max(times) * 1.4)
    
    # Bar chart: samples/s
    rates = [hf_samples_per_sec, vllm_samples_per_sec]
    ax2.bar(methods, rates, color=colors)
    ax2.set_ylabel("Samples per Second")
    ax2.set_title("Throughput Comparison")
    for i, v in enumerate(rates):
        ax2.text(i, v + max(rates) * 0.02, f"{v:.1f}", ha="center", fontweight="bold")
    ax2.set_ylim(0, max(rates) * 1.3)
    
    plt.tight_layout()
    plt.savefig("vllm_vs_hf_comparison.png", dpi=150, bbox_inches="tight")
    plt.show()
    
    print("\nāœ… Plot saved to vllm_vs_hf_comparison.png")
    print("=" * 70)
    print("  DONE!")
    print("=" * 70)
    Back to top
     
     
    • Report an issue