Work in Progress: Performing accelerated inference with vLLM
Note: This notebook is a work in progress.
Goal: 10x our inference speed on our LLM.
- TK - this notebook builds off of the LLM fine-tuning tutorial - https://www.learnhuggingface.com/notebooks/hugging_face_llm_full_fine_tune_tutorial
- TK - reference link to our fine-tuned model
- TK - make an if/else statement to check whether we are running in Google Colab, and if so, install the required dependencies
# --- Setup: record run time and verify a GPU is available ---
import time

print(f"Last updated: {time.ctime()}")

import torch

# Both the HuggingFace baseline and vLLM require a CUDA GPU.
assert torch.cuda.is_available()

# =============================================================================
# vLLM vs HuggingFace Pipeline — Throughput Comparison (Google Colab)
# =============================================================================
# Run this entire cell in a Google Colab notebook with a GPU runtime.
# (Runtime > Change runtime type > T4 GPU or better)
#
# This demo:
# 1. Installs vllm
# 2. Loads the FoodExtract dataset
# 3. Runs inference with HuggingFace pipeline (baseline)
# 4. Runs inference with vLLM offline batch mode (fast)
# 5. Compares throughput and shows sample outputs
# =============================================================================

# --- Step 0: Install dependencies ---
# print("=" * 70)
# print(" STEP 0: Installing dependencies (this takes a few minutes)...")
# print("=" * 70)
# import subprocess, sys
# subprocess.check_call([sys.executable, "-m", "pip", "install", "-q",
#                        "vllm", "datasets", "matplotlib"])
# print("\n✅ Dependencies installed!\n")
# --- Step 1: Load dataset ---
print("=" * 70)
print(" STEP 1: Loading FoodExtract-1k dataset")
print("=" * 70)

from datasets import load_dataset

dataset = load_dataset("mrdbourke/FoodExtract-1k")


def sample_to_conversation(sample):
    """Convert one raw dataset row into a chat-style conversation.

    The input text comes from ``sample["sequence"]`` and the target label
    from ``sample["gpt-oss-120b-label-condensed"]``; they become the user
    and assistant turns of a ``messages`` list.
    """
    return {
        "messages": [
            {"role": "user", "content": sample["sequence"]},
            {"role": "assistant", "content": sample["gpt-oss-120b-label-condensed"]},
        ]
    }


dataset = dataset.map(sample_to_conversation, batched=False)
print(f"✅ Loaded {len(dataset['train'])} samples\n")
# --- Step 2: Prepare prompts (using Gemma 3 chat template) ---
print("=" * 70)
print(" STEP 2: Preparing prompts")
print("=" * 70)

from transformers import AutoTokenizer

MODEL_PATH = "mrdbourke/FoodExtract-gemma-3-270m-fine-tune-v2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Format prompts using the chat template (same as the original notebook).
# Only the user turn is kept; add_generation_prompt=True appends the
# assistant-turn marker so the model knows to start generating.
test_prompts = [
    tokenizer.apply_chat_template(
        item["messages"][:1],  # user message only
        tokenize=False,
        add_generation_prompt=True,
    )
    for item in dataset["train"]
]

print(f"✅ Prepared {len(test_prompts)} prompts")
print(f" Example prompt (first 120 chars): {test_prompts[0][:120]}...\n")
# --- Step 3: HuggingFace Pipeline baseline ---
print("=" * 70)
print(" STEP 3: Running HuggingFace Pipeline (baseline)")
print("=" * 70)

import time
from transformers import AutoModelForCausalLM, pipeline, GenerationConfig

# Load model with HF
hf_model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    dtype="auto",
    device_map="cuda",
    attn_implementation="eager",
)
hf_pipeline = pipeline(
    "text-generation",
    model=hf_model,
    tokenizer=tokenizer,
)
gen_config = GenerationConfig(
    max_new_tokens=512,
    do_sample=False,  # greedy decoding, so outputs are comparable to vLLM's
    disable_compile=True,
)

# Run HF pipeline with batch_size=32 (a reasonable batch size for comparison)
HF_BATCH_SIZE = 32
NUM_SAMPLES = len(test_prompts)
print(f" Running {NUM_SAMPLES} samples with batch_size={HF_BATCH_SIZE}...")

hf_outputs = []
hf_start = time.time()
for i in range(0, NUM_SAMPLES, HF_BATCH_SIZE):
    batch = test_prompts[i : i + HF_BATCH_SIZE]
    batch_out = hf_pipeline(
        text_inputs=batch,
        batch_size=HF_BATCH_SIZE,
        generation_config=gen_config,
    )
    hf_outputs.extend(batch_out)
    # Progress update every 5 batches
    if ((i // HF_BATCH_SIZE) + 1) % 5 == 0:
        elapsed = time.time() - hf_start
        done = min(i + HF_BATCH_SIZE, NUM_SAMPLES)
        rate = done / elapsed
        print(f" ... {done}/{NUM_SAMPLES} samples ({rate:.1f} samples/s)")

hf_time = time.time() - hf_start
hf_samples_per_sec = NUM_SAMPLES / hf_time
print("\n ✅ HuggingFace Pipeline done!")
print(f" Time: {hf_time:.2f}s | Samples/s: {hf_samples_per_sec:.2f}\n")

# Free HF model memory so vLLM can use the same GPU in the next step
del hf_model, hf_pipeline
import torch, gc
gc.collect()
torch.cuda.empty_cache()
# --- Step 4: vLLM offline batch inference ---
print("=" * 70)
print(" STEP 4: Running vLLM offline batch inference")
print("=" * 70)

from vllm import LLM, SamplingParams

# Load model with vLLM.
# vLLM handles batching automatically — you just pass ALL prompts at once.
vllm_model = LLM(
    model=MODEL_PATH,
    max_model_len=4096,  # keep it modest for Colab GPU memory
    gpu_memory_utilization=0.85,
    dtype="auto",
)
sampling_params = SamplingParams(
    max_tokens=512,
    temperature=0,  # deterministic (greedy)
)

print(f" Running {NUM_SAMPLES} samples in a single vLLM batch call...")
vllm_start = time.time()

# This is the key difference: ONE call with ALL prompts.
# vLLM internally handles optimal batching, scheduling, and KV cache management.
vllm_outputs = vllm_model.generate(test_prompts, sampling_params)

vllm_time = time.time() - vllm_start
vllm_samples_per_sec = NUM_SAMPLES / vllm_time
print("\n ✅ vLLM batch inference done!")
print(f" Time: {vllm_time:.2f}s | Samples/s: {vllm_samples_per_sec:.2f}\n")
# --- Step 5: Results comparison ---
print("=" * 70)
print(" RESULTS COMPARISON")
print("=" * 70)

speedup = hf_time / vllm_time
print(f"""
{'Metric':<30} {'HuggingFace':>15} {'vLLM':>15}
{'-' * 60}
{'Total time (s)':<30} {hf_time:>15.2f} {vllm_time:>15.2f}
{'Samples/second':<30} {hf_samples_per_sec:>15.2f} {vllm_samples_per_sec:>15.2f}
{'Batch strategy':<30} {'chunk=' + str(HF_BATCH_SIZE):>15} {'auto (all)':>15}

⚡ vLLM speedup: {speedup:.1f}x faster

At HF speed: {NUM_SAMPLES / hf_samples_per_sec / 3600:.2f} hours for {NUM_SAMPLES} samples
At vLLM speed: {NUM_SAMPLES / vllm_samples_per_sec / 3600:.4f} hours for {NUM_SAMPLES} samples

--- Extrapolation to 80M samples ---
HF pipeline: {80_000_000 / hf_samples_per_sec / 86400:,.1f} days
vLLM batch: {80_000_000 / vllm_samples_per_sec / 86400:,.1f} days
""")
# --- Step 6: Show sample outputs side by side ---
print("=" * 70)
print(" SAMPLE OUTPUTS (first 5)")
print("=" * 70)

for i in range(5):
    prompt_text = dataset["train"][i]["messages"][0]["content"]
    expected = dataset["train"][i]["messages"][1]["content"]
    # Extract HF output (strip the prompt prefix)
    hf_text = hf_outputs[i][0]["generated_text"][len(test_prompts[i]):]
    # Extract vLLM output
    vllm_text = vllm_outputs[i].outputs[0].text
    print(f"\n --- Sample {i} ---")
    print(f" Prompt: {prompt_text[:80]}{'...' if len(prompt_text) > 80 else ''}")
    print(f" Expected: {expected[:80]}{'...' if len(expected) > 80 else ''}")
    print(f" HF out: {hf_text.strip()[:80]}{'...' if len(hf_text.strip()) > 80 else ''}")
    print(f" vLLM out: {vllm_text.strip()[:80]}{'...' if len(vllm_text.strip()) > 80 else ''}")
    # With greedy decoding, the two backends should usually agree exactly.
    match = "✅ match" if hf_text.strip() == vllm_text.strip() else "⚠️ differ"
    print(f" HF vs vLLM: {match}")
# --- Step 7: Plot ---
print(f"\n{'=' * 70}")
print(" PLOT")
print("=" * 70)

import matplotlib.pyplot as plt

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Bar chart: time
methods = ["HuggingFace\nPipeline", "vLLM\nBatch"]
times = [hf_time, vllm_time]
colors = ["steelblue", "coral"]
ax1.bar(methods, times, color=colors)
ax1.set_ylabel("Total Time (seconds)")
ax1.set_title(f"Inference Time — {NUM_SAMPLES} samples")
for i, v in enumerate(times):
    ax1.text(i, v + max(times) * 0.02, f"{v:.1f}s", ha="center", fontweight="bold")

# Add speedup arrow
ax1.annotate(
    f"{speedup:.1f}x faster",
    xy=(1, vllm_time + max(times) * 0.05),
    xytext=(0, hf_time + max(times) * 0.1),
    arrowprops=dict(arrowstyle="->", color="green", lw=2),
    fontsize=12, fontweight="bold", color="green", ha="center",
)
ax1.set_ylim(0, max(times) * 1.4)

# Bar chart: samples/s
rates = [hf_samples_per_sec, vllm_samples_per_sec]
ax2.bar(methods, rates, color=colors)
ax2.set_ylabel("Samples per Second")
ax2.set_title("Throughput Comparison")
for i, v in enumerate(rates):
    ax2.text(i, v + max(rates) * 0.02, f"{v:.1f}", ha="center", fontweight="bold")
ax2.set_ylim(0, max(rates) * 1.3)

plt.tight_layout()
plt.savefig("vllm_vs_hf_comparison.png", dpi=150, bbox_inches="tight")
plt.show()

print("\n✅ Plot saved to vllm_vs_hf_comparison.png")
print("=" * 70)
print(" DONE!")
print("=" * 70)