Learn Hugging Face πŸ€—
Work in Progress: Performing accelerated inference with vLLM

Note: This notebook is a work in progress.

Goal: 10x our inference speed on our LLM.

  • TK - this notebook builds on the LLM fine-tuning tutorial - https://www.learnhuggingface.com/notebooks/hugging_face_llm_full_fine_tune_tutorial
  • TK - reference link to our fine-tuned model
  • TK - add an if/else statement to check whether we're running in Google Colab and, if so, install the required dependencies
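The Colab-detection TK above could be sketched like this. This is a minimal sketch, not the tutorial's published code: `in_colab` is a hypothetical helper that checks for the `google.colab` package, which is only present inside Colab runtimes.

```python
# Sketch of the TK item above: install dependencies only when running
# inside Google Colab. `in_colab` is a hypothetical helper.
import importlib.util
import subprocess
import sys

def in_colab() -> bool:
    # The `google.colab` package ships with Colab runtimes only
    return importlib.util.find_spec("google.colab") is not None

if in_colab():
    # Quietly install the packages this notebook needs
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "-q",
         "vllm", "datasets", "matplotlib"]
    )
```

Outside of Colab (e.g. a local GPU machine), the check is falsy and the install is skipped, so the cell is safe to run anywhere.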
In [1]:
import time

print(f"Last updated: {time.ctime()}")
Out [1]:
Last updated: Mon Mar 23 05:29:05 2026
In [1]:
import torch

assert torch.cuda.is_available()
In [2]:
# =============================================================================
# vLLM vs HuggingFace Pipeline β€” Throughput Comparison (Google Colab)
# =============================================================================
# Run this entire cell in a Google Colab notebook with a GPU runtime.
# (Runtime > Change runtime type > T4 GPU or better)
#
# This demo:
#   1. Installs vllm
#   2. Loads the FoodExtract dataset
#   3. Runs inference with HuggingFace pipeline (baseline)
#   4. Runs inference with vLLM offline batch mode (fast)
#   5. Compares throughput and shows sample outputs
# =============================================================================

# --- Step 0: Install dependencies ---
# print("=" * 70)
# print("  STEP 0: Installing dependencies (this takes a few minutes)...")
# print("=" * 70)

# import subprocess, sys

# subprocess.check_call([sys.executable, "-m", "pip", "install", "-q",
#                        "vllm", "datasets", "matplotlib"])

# print("\nβœ… Dependencies installed!\n")

# --- Step 1: Load dataset ---
print("=" * 70)
print("  STEP 1: Loading FoodExtract-1k dataset")
print("=" * 70)

from datasets import load_dataset

dataset = load_dataset("mrdbourke/FoodExtract-1k")

def sample_to_conversation(sample):
    return {
        "messages": [
            {"role": "user", "content": sample["sequence"]},
            {"role": "assistant", "content": sample["gpt-oss-120b-label-condensed"]},
        ]
    }

dataset = dataset.map(sample_to_conversation, batched=False)
print(f"βœ… Loaded {len(dataset['train'])} samples\n")

# --- Step 2: Prepare prompts (using Gemma 3 chat template) ---
print("=" * 70)
print("  STEP 2: Preparing prompts")
print("=" * 70)

from transformers import AutoTokenizer

MODEL_PATH = "mrdbourke/FoodExtract-gemma-3-270m-fine-tune-v2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Format prompts using the chat template (same as the original notebook)
test_prompts = [
    tokenizer.apply_chat_template(
        item["messages"][:1],  # user message only
        tokenize=False,
        add_generation_prompt=True,
    )
    for item in dataset["train"]
]

print(f"βœ… Prepared {len(test_prompts)} prompts")
print(f"   Example prompt (first 120 chars): {test_prompts[0][:120]}...\n")

# --- Step 3: HuggingFace Pipeline baseline ---
print("=" * 70)
print("  STEP 3: Running HuggingFace Pipeline (baseline)")
print("=" * 70)

import time
from transformers import AutoModelForCausalLM, pipeline, GenerationConfig

# Load model with HF
hf_model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    dtype="auto",
    device_map="cuda",
    attn_implementation="eager",
)

hf_pipeline = pipeline(
    "text-generation",
    model=hf_model,
    tokenizer=tokenizer,
)

gen_config = GenerationConfig(
    max_new_tokens=512,
    do_sample=False,
    disable_compile=True,
)

# Run HF pipeline with batch_size=32 (a reasonable batch size for comparison)
HF_BATCH_SIZE = 32
NUM_SAMPLES = len(test_prompts)

print(f"   Running {NUM_SAMPLES} samples with batch_size={HF_BATCH_SIZE}...")

hf_outputs = []
hf_start = time.time()

for i in range(0, NUM_SAMPLES, HF_BATCH_SIZE):
    batch = test_prompts[i : i + HF_BATCH_SIZE]
    batch_out = hf_pipeline(
        text_inputs=batch,
        batch_size=HF_BATCH_SIZE,
        generation_config=gen_config,
    )
    hf_outputs.extend(batch_out)
    # Progress update every 5 batches
    if ((i // HF_BATCH_SIZE) + 1) % 5 == 0:
        elapsed = time.time() - hf_start
        done = min(i + HF_BATCH_SIZE, NUM_SAMPLES)
        rate = done / elapsed
        print(f"   ... {done}/{NUM_SAMPLES} samples ({rate:.1f} samples/s)")

hf_time = time.time() - hf_start
hf_samples_per_sec = NUM_SAMPLES / hf_time

print(f"\n   βœ… HuggingFace Pipeline done!")
print(f"   Time: {hf_time:.2f}s | Samples/s: {hf_samples_per_sec:.2f}\n")

# Free HF model memory
del hf_model, hf_pipeline
import torch, gc
gc.collect()
torch.cuda.empty_cache()

# --- Step 4: vLLM offline batch inference ---
print("=" * 70)
print("  STEP 4: Running vLLM offline batch inference")
print("=" * 70)

from vllm import LLM, SamplingParams

# Load model with vLLM
# vLLM handles batching automatically β€” you just pass ALL prompts at once
vllm_model = LLM(
    model=MODEL_PATH,
    max_model_len=4096,       # keep it modest for Colab GPU memory
    gpu_memory_utilization=0.85,
    dtype="auto",
)

sampling_params = SamplingParams(
    max_tokens=512,
    temperature=0,  # deterministic (greedy)
)

print(f"   Running {NUM_SAMPLES} samples in a single vLLM batch call...")

vllm_start = time.time()

# This is the key difference: ONE call with ALL prompts
# vLLM internally handles optimal batching, scheduling, and KV cache management
vllm_outputs = vllm_model.generate(test_prompts, sampling_params)

vllm_time = time.time() - vllm_start
vllm_samples_per_sec = NUM_SAMPLES / vllm_time

print(f"\n   βœ… vLLM batch inference done!")
print(f"   Time: {vllm_time:.2f}s | Samples/s: {vllm_samples_per_sec:.2f}\n")

# --- Step 5: Results comparison ---
print("=" * 70)
print("  RESULTS COMPARISON")
print("=" * 70)

speedup = hf_time / vllm_time

print(f"""
  {'Metric':<30} {'HuggingFace':>15} {'vLLM':>15}
  {'-' * 60}
  {'Total time (s)':<30} {hf_time:>15.2f} {vllm_time:>15.2f}
  {'Samples/second':<30} {hf_samples_per_sec:>15.2f} {vllm_samples_per_sec:>15.2f}
  {'Batch strategy':<30} {'chunk=' + str(HF_BATCH_SIZE):>15} {'auto (all)':>15}

  ⚑ vLLM speedup: {speedup:.1f}x faster

  At HF speed: {NUM_SAMPLES / hf_samples_per_sec / 3600:.2f} hours for {NUM_SAMPLES} samples
  At vLLM speed: {NUM_SAMPLES / vllm_samples_per_sec / 3600:.4f} hours for {NUM_SAMPLES} samples

  --- Extrapolation to 80M samples ---
  HF pipeline:  {80_000_000 / hf_samples_per_sec / 86400:,.1f} days
  vLLM batch:   {80_000_000 / vllm_samples_per_sec / 86400:,.1f} days
""")

# --- Step 6: Show sample outputs side by side ---
print("=" * 70)
print("  SAMPLE OUTPUTS (first 5)")
print("=" * 70)

for i in range(5):
    prompt_text = dataset["train"][i]["messages"][0]["content"]
    expected = dataset["train"][i]["messages"][1]["content"]

    # Extract HF output (strip the prompt prefix; alternatively, pass
    # return_full_text=False to the pipeline to get only the completion)
    hf_text = hf_outputs[i][0]["generated_text"][len(test_prompts[i]):]

    # Extract vLLM output
    vllm_text = vllm_outputs[i].outputs[0].text

    print(f"\n  --- Sample {i} ---")
    print(f"  Prompt:   {prompt_text[:80]}{'...' if len(prompt_text) > 80 else ''}")
    print(f"  Expected: {expected[:80]}{'...' if len(expected) > 80 else ''}")
    print(f"  HF out:   {hf_text.strip()[:80]}{'...' if len(hf_text.strip()) > 80 else ''}")
    print(f"  vLLM out: {vllm_text.strip()[:80]}{'...' if len(vllm_text.strip()) > 80 else ''}")
    match = "βœ… match" if hf_text.strip() == vllm_text.strip() else "⚠️  differ"
    print(f"  HF vs vLLM: {match}")

# --- Step 7: Plot ---
print(f"\n{'=' * 70}")
print("  PLOT")
print("=" * 70)

# Note: matplotlib is required here β€” make sure it is installed
# (see the Step 0 install cell above)
import matplotlib.pyplot as plt

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Bar chart: time
methods = ["HuggingFace\nPipeline", "vLLM\nBatch"]
times = [hf_time, vllm_time]
colors = ["steelblue", "coral"]

ax1.bar(methods, times, color=colors)
ax1.set_ylabel("Total Time (seconds)")
ax1.set_title(f"Inference Time β€” {NUM_SAMPLES} samples")
for i, v in enumerate(times):
    ax1.text(i, v + max(times) * 0.02, f"{v:.1f}s", ha="center", fontweight="bold")

# Add speedup arrow
ax1.annotate(
    f"{speedup:.1f}x faster",
    xy=(1, vllm_time + max(times) * 0.05),
    xytext=(0, hf_time + max(times) * 0.1),
    arrowprops=dict(arrowstyle="->", color="green", lw=2),
    fontsize=12, fontweight="bold", color="green", ha="center",
)
ax1.set_ylim(0, max(times) * 1.4)

# Bar chart: samples/s
rates = [hf_samples_per_sec, vllm_samples_per_sec]
ax2.bar(methods, rates, color=colors)
ax2.set_ylabel("Samples per Second")
ax2.set_title("Throughput Comparison")
for i, v in enumerate(rates):
    ax2.text(i, v + max(rates) * 0.02, f"{v:.1f}", ha="center", fontweight="bold")
ax2.set_ylim(0, max(rates) * 1.3)

plt.tight_layout()
plt.savefig("vllm_vs_hf_comparison.png", dpi=150, bbox_inches="tight")
plt.show()

print("\nβœ… Plot saved to vllm_vs_hf_comparison.png")
print("=" * 70)
print("  DONE!")
print("=" * 70)
Out [2]:
======================================================================
  STEP 1: Loading FoodExtract-1k dataset
======================================================================
/home/mrdbourke/miniforge3/envs/vllm_bench/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
βœ… Loaded 1420 samples

======================================================================
  STEP 2: Preparing prompts
======================================================================
βœ… Prepared 1420 prompts
   Example prompt (first 120 chars): <bos><start_of_turn>user
A mouth-watering photograph captures a delectable dish centered on a rectangular white porcelai...

======================================================================
  STEP 3: Running HuggingFace Pipeline (baseline)
======================================================================
/home/mrdbourke/miniforge3/envs/vllm_bench/lib/python3.12/site-packages/torch/cuda/__init__.py:435: UserWarning: 
    Found GPU0 NVIDIA GB10 which is of cuda capability 12.1.
    Minimum and Maximum cuda capability supported by this version of PyTorch is
    (8.0) - (12.0)
    
  queued_call()
Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/home/mrdbourke/.cache/huggingface/hub/models--mrdbourke--FoodExtract-gemma-3-270m-fine-tune-v2/.no_exist/9c09257060fe5e8d50e8181d9d344668bd9279f8/custom_generate'
Device set to use cuda
`generation_config` default values have been modified to match model-specific defaults: {'do_sample': True, 'top_k': 64, 'top_p': 0.95, 'pad_token_id': 0, 'bos_token_id': 2, 'eos_token_id': [1, 106]}. If this is not desired, please set these values explicitly.
   Running 1420 samples with batch_size=32...
   ... 160/1420 samples (12.0 samples/s)
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
   ... 320/1420 samples (11.3 samples/s)
   ... 480/1420 samples (9.1 samples/s)
   ... 640/1420 samples (7.8 samples/s)
   ... 800/1420 samples (7.7 samples/s)
   ... 960/1420 samples (7.8 samples/s)
   ... 1120/1420 samples (7.3 samples/s)
   ... 1280/1420 samples (6.3 samples/s)
   ... 1420/1420 samples (6.8 samples/s)

   βœ… HuggingFace Pipeline done!
   Time: 208.44s | Samples/s: 6.81

======================================================================
  STEP 4: Running vLLM offline batch inference
======================================================================
INFO 03-18 04:18:42 [utils.py:233] non-default args: {'max_model_len': 4096, 'gpu_memory_utilization': 0.85, 'disable_log_stats': True, 'model': 'mrdbourke/FoodExtract-gemma-3-270m-fine-tune-v2'}
Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/home/mrdbourke/.cache/huggingface/hub/models--mrdbourke--FoodExtract-gemma-3-270m-fine-tune-v2/.no_exist/9c09257060fe5e8d50e8181d9d344668bd9279f8/preprocessor_config.json'
INFO 03-18 04:18:44 [model.py:533] Resolved architecture: Gemma3ForCausalLM
INFO 03-18 04:18:44 [model.py:1582] Using max model len 4096
INFO 03-18 04:18:44 [scheduler.py:231] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 03-18 04:18:44 [vllm.py:754] Asynchronous scheduling is enabled.
WARNING 03-18 04:18:47 [system_utils.py:152] We must use the `spawn` multiprocessing start method. Overriding VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See https://docs.vllm.ai/en/latest/usage/troubleshooting.html#python-multiprocessing for more information. Reasons: CUDA is initialized
(EngineCore pid=851428) INFO 03-18 04:18:49 [core.py:103] Initializing a V1 LLM engine (v0.17.2rc1.dev45+g761e0aa7a) with config: model='mrdbourke/FoodExtract-gemma-3-270m-fine-tune-v2', speculative_config=None, tokenizer='mrdbourke/FoodExtract-gemma-3-270m-fine-tune-v2', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=mrdbourke/FoodExtract-gemma-3-270m-fine-tune-v2, enable_prefix_caching=True, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': <CompilationMode.VLLM_COMPILE: 3>, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 
'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': <CUDAGraphMode.FULL_AND_PIECEWISE: (2, 1)>, 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': <DynamicShapesType.BACKED: 'backed'>, 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []}
(EngineCore pid=851428) /home/mrdbourke/miniforge3/envs/vllm_bench/lib/python3.12/site-packages/torch/cuda/__init__.py:435: UserWarning: 
(EngineCore pid=851428)     Found GPU0 NVIDIA GB10 which is of cuda capability 12.1.
(EngineCore pid=851428)     Minimum and Maximum cuda capability supported by this version of PyTorch is
(EngineCore pid=851428)     (8.0) - (12.0)
(EngineCore pid=851428)     
(EngineCore pid=851428)   queued_call()
(EngineCore pid=851428) INFO 03-18 04:18:50 [parallel_state.py:1395] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://192.168.1.6:39495 backend=nccl
(EngineCore pid=851428) INFO 03-18 04:18:50 [parallel_state.py:1717] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A
(EngineCore pid=851428) INFO 03-18 04:18:50 [gpu_model_runner.py:4506] Starting to load model mrdbourke/FoodExtract-gemma-3-270m-fine-tune-v2...
(EngineCore pid=851428) INFO 03-18 04:18:50 [cuda.py:333] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION'].
(EngineCore pid=851428) INFO 03-18 04:18:50 [flash_attn.py:598] Using FlashAttention version 2
(EngineCore pid=851428) Ignored error while writing commit hash to /home/mrdbourke/.cache/huggingface/hub/models--mrdbourke--FoodExtract-gemma-3-270m-fine-tune-v2/refs/main: [Errno 13] Permission denied: '/home/mrdbourke/.cache/huggingface/hub/models--mrdbourke--FoodExtract-gemma-3-270m-fine-tune-v2/refs/main'.
(EngineCore pid=851428) Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/home/mrdbourke/.cache/huggingface/hub/models--mrdbourke--FoodExtract-gemma-3-270m-fine-tune-v2/.no_exist/9c09257060fe5e8d50e8181d9d344668bd9279f8/model.safetensors.index.json'
Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
(EngineCore pid=851428) INFO 03-18 04:18:52 [weight_utils.py:618] No model.safetensors.index.json found in remote.
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:03<00:00,  3.09s/it]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:03<00:00,  3.09s/it]
(EngineCore pid=851428) 
(EngineCore pid=851428) INFO 03-18 04:18:55 [default_loader.py:377] Loading weights took 3.11 seconds
(EngineCore pid=851428) INFO 03-18 04:18:55 [gpu_model_runner.py:4591] Model loading took 0.53 GiB memory and 4.880466 seconds
(EngineCore pid=851428) INFO 03-18 04:18:56 [backends.py:988] Using cache directory: /home/mrdbourke/.cache/vllm/torch_compile_cache/21aa8e5800/rank_0_0/backbone for vLLM's torch.compile
(EngineCore pid=851428) INFO 03-18 04:18:56 [backends.py:1048] Dynamo bytecode transform time: 0.82 s
(EngineCore pid=851428) [rank0]:W0318 04:18:57.420000 851428 site-packages/torch/_inductor/utils.py:1679] Not enough SMs to use max_autotune_gemm mode
(EngineCore pid=851428) INFO 03-18 04:18:58 [backends.py:284] Directly load the compiled graph(s) for compile range (1, 8192) from the cache, took 1.490 s
(EngineCore pid=851428) INFO 03-18 04:18:58 [monitor.py:48] torch.compile took 2.41 s in total
(EngineCore pid=851428) INFO 03-18 04:18:58 [decorators.py:296] Directly load AOT compilation from path /home/mrdbourke/.cache/vllm/torch_compile_cache/torch_aot_compile/0cc0fab39e8169dcb63bbf9e524e9cb1f63a6cb828ce868820ff2ae7346cd674/rank_0_0/model
(EngineCore pid=851428) INFO 03-18 04:18:58 [monitor.py:76] Initial profiling/warmup run took 0.21 s
(EngineCore pid=851428) INFO 03-18 04:18:58 [kv_cache_utils.py:826] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512
(EngineCore pid=851428) INFO 03-18 04:18:58 [gpu_model_runner.py:5632] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=35 (largest=256)
(EngineCore pid=851428) INFO 03-18 04:18:59 [gpu_model_runner.py:5711] Estimated CUDA graph memory: 0.25 GiB total
(EngineCore pid=851428) INFO 03-18 04:18:59 [gpu_worker.py:456] Available KV cache memory: 99.47 GiB
(EngineCore pid=851428) INFO 03-18 04:18:59 [gpu_worker.py:490] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.8500 to 0.8521 to maintain the same effective KV cache size.
(EngineCore pid=851428) INFO 03-18 04:18:59 [kv_cache_utils.py:1316] GPU KV cache size: 5,794,416 tokens
(EngineCore pid=851428) INFO 03-18 04:18:59 [kv_cache_utils.py:1321] Maximum concurrency for 4,096 tokens per request: 1410.06x
(EngineCore pid=851428) 2026-03-18 04:19:06,553 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ...
(EngineCore pid=851428) 2026-03-18 04:19:06,835 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 51/51 [00:00<00:00, 69.06it/s]
Capturing CUDA graphs (decode, FULL): 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 35/35 [00:00<00:00, 86.07it/s]
(EngineCore pid=851428) INFO 03-18 04:19:08 [gpu_model_runner.py:5771] Graph capturing finished in 1 secs, took 0.46 GiB
(EngineCore pid=851428) INFO 03-18 04:19:08 [gpu_worker.py:617] CUDA graph pool memory: 0.46 GiB (actual), 0.25 GiB (estimated), difference: 0.21 GiB (46.1%).
(EngineCore pid=851428) INFO 03-18 04:19:08 [core.py:281] init engine (profile, create kv cache, warmup model) took 13.01 seconds
(EngineCore pid=851428) INFO 03-18 04:19:12 [vllm.py:754] Asynchronous scheduling is enabled.
INFO 03-18 04:19:12 [llm.py:391] Supported tasks: ['generate']
   Running 1420 samples in a single vLLM batch call...
Rendering prompts: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1420/1420 [00:00<00:00, 3968.93it/s]
Processed prompts: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1420/1420 [00:05<00:00, 241.95it/s, est. speed input: 29275.11 toks/s, output: 7791.33 toks/s]

   βœ… vLLM batch inference done!
   Time: 6.25s | Samples/s: 227.17

======================================================================
  RESULTS COMPARISON
======================================================================

  Metric                             HuggingFace            vLLM
  ------------------------------------------------------------
  Total time (s)                          208.44            6.25
  Samples/second                            6.81          227.17
  Batch strategy                        chunk=32      auto (all)

  ⚑ vLLM speedup: 33.3x faster

  At HF speed: 0.06 hours for 1420 samples
  At vLLM speed: 0.0017 hours for 1420 samples

  --- Extrapolation to 80M samples ---
  HF pipeline:  135.9 days
  vLLM batch:   4.1 days

======================================================================
  SAMPLE OUTPUTS (first 5)
======================================================================

  --- Sample 0 ---
  Prompt:   A mouth-watering photograph captures a delectable dish centered on a rectangular...
  Expected: food_or_drink: 1
tags: fi, fa
foods: cheese-stuffed peppers, cherry tomato halve...
  HF out:   food_or_drink: 1
tags: fi
foods: cheese-stuffed peppers, cherry tomato halves, g...
  vLLM out: food_or_drink: 1
tags: fi
foods: cheese-stuffed peppers, cherry tomato halves, g...
  HF vs vLLM: ⚠️  differ

  --- Sample 1 ---
  Prompt:   Bake Your Way To Happiness by Lisa de Nikolits
  Expected: food_or_drink: 1
tags: fa
foods: 
drinks:
  HF out:   food_or_drink: 1
tags: fa, fi
foods: 
drinks:
  vLLM out: food_or_drink: 1
tags: fi
foods: 
drinks:
  HF vs vLLM: ⚠️  differ

  --- Sample 2 ---
  Prompt:   A top‑down view of a pristine white plate showcases each item separately: a neat...
  Expected: food_or_drink: 1
tags: fi, di
foods: elbow macaroni pasta, grilled chicken skewe...
  HF out:   food_or_drink: 1
tags: fi, di
foods: elbow macaroni pasta, grilled chicken skewe...
  vLLM out: food_or_drink: 1
tags: fi, di
foods: elbow macaroni pasta, grilled chicken skewe...
  HF vs vLLM: ⚠️  differ

  --- Sample 3 ---
  Prompt:   Educational science activity kit with solar print paper to create solar print ar...
  Expected: food_or_drink: 0
tags: 
foods: 
drinks:
  HF out:   food_or_drink: 0
tags: 
foods: 
drinks:
  vLLM out: food_or_drink: 0
tags: 
foods: 
drinks:
  HF vs vLLM: βœ… match

  --- Sample 4 ---
  Prompt:   This image showcases a promotional photograph of a five-piece set of pots and pa...
  Expected: food_or_drink: 0
tags: 
foods: 
drinks:
  HF out:   food_or_drink: 0
tags: 
foods: 
drinks:
  vLLM out: food_or_drink: 0
tags: 
foods: 
drinks:
  HF vs vLLM: βœ… match

======================================================================
  PLOT
======================================================================
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[2], line 222
    219 print("  PLOT")
    220 print("=" * 70)
--> 222 import matplotlib.pyplot as plt
    224 fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
    226 # Bar chart: time

ModuleNotFoundError: No module named 'matplotlib'