In [1]:
import time
print(f"Last updated: {time.ctime()}")Out [1]:
Last updated: Mon Mar 23 05:29:05 2026
Note: This notebook is a work in progress.
Goal: 10x our inference speed on our LLM.
Last updated: Mon Mar 23 05:29:05 2026
# =============================================================================
# vLLM vs HuggingFace Pipeline β Throughput Comparison (Google Colab)
# =============================================================================
# Run this entire cell in a Google Colab notebook with a GPU runtime.
# (Runtime > Change runtime type > T4 GPU or better)
#
# This demo:
# 1. Installs vllm
# 2. Loads the FoodExtract dataset
# 3. Runs inference with HuggingFace pipeline (baseline)
# 4. Runs inference with vLLM offline batch mode (fast)
# 5. Compares throughput and shows sample outputs
# =============================================================================
# --- Step 0: Install dependencies ---
# print("=" * 70)
# print(" STEP 0: Installing dependencies (this takes a few minutes)...")
# print("=" * 70)
# import subprocess, sys
# subprocess.check_call([sys.executable, "-m", "pip", "install", "-q",
# "vllm", "datasets", "matplotlib"])
# print("\nβ
Dependencies installed!\n")
# --- Step 1: Load dataset ---
print("=" * 70)
print(" STEP 1: Loading FoodExtract-1k dataset")
print("=" * 70)
from datasets import load_dataset
dataset = load_dataset("mrdbourke/FoodExtract-1k")
def sample_to_conversation(sample):
return {
"messages": [
{"role": "user", "content": sample["sequence"]},
{"role": "assistant", "content": sample["gpt-oss-120b-label-condensed"]},
]
}
dataset = dataset.map(sample_to_conversation, batched=False)
print(f"β
Loaded {len(dataset['train'])} samples\n")
# --- Step 2: Prepare prompts (using Gemma 3 chat template) ---
print("=" * 70)
print(" STEP 2: Preparing prompts")
print("=" * 70)
from transformers import AutoTokenizer
MODEL_PATH = "mrdbourke/FoodExtract-gemma-3-270m-fine-tune-v2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# Format prompts using the chat template (same as the original notebook)
test_prompts = [
tokenizer.apply_chat_template(
item["messages"][:1], # user message only
tokenize=False,
add_generation_prompt=True,
)
for item in dataset["train"]
]
print(f"β
Prepared {len(test_prompts)} prompts")
print(f" Example prompt (first 120 chars): {test_prompts[0][:120]}...\n")
# --- Step 3: HuggingFace Pipeline baseline ---
print("=" * 70)
print(" STEP 3: Running HuggingFace Pipeline (baseline)")
print("=" * 70)
import time
from transformers import AutoModelForCausalLM, pipeline, GenerationConfig
# Load model with HF
hf_model = AutoModelForCausalLM.from_pretrained(
MODEL_PATH,
dtype="auto",
device_map="cuda",
attn_implementation="eager",
)
hf_pipeline = pipeline(
"text-generation",
model=hf_model,
tokenizer=tokenizer,
)
gen_config = GenerationConfig(
max_new_tokens=512,
do_sample=False,
disable_compile=True,
)
# Run HF pipeline with batch_size=32 (a reasonable batch size for comparison)
HF_BATCH_SIZE = 32
NUM_SAMPLES = len(test_prompts)
print(f" Running {NUM_SAMPLES} samples with batch_size={HF_BATCH_SIZE}...")
hf_outputs = []
hf_start = time.time()
for i in range(0, NUM_SAMPLES, HF_BATCH_SIZE):
batch = test_prompts[i : i + HF_BATCH_SIZE]
batch_out = hf_pipeline(
text_inputs=batch,
batch_size=HF_BATCH_SIZE,
generation_config=gen_config,
)
hf_outputs.extend(batch_out)
# Progress update every 5 batches
if ((i // HF_BATCH_SIZE) + 1) % 5 == 0:
elapsed = time.time() - hf_start
done = min(i + HF_BATCH_SIZE, NUM_SAMPLES)
rate = done / elapsed
print(f" ... {done}/{NUM_SAMPLES} samples ({rate:.1f} samples/s)")
hf_time = time.time() - hf_start
hf_samples_per_sec = NUM_SAMPLES / hf_time
print(f"\n β
HuggingFace Pipeline done!")
print(f" Time: {hf_time:.2f}s | Samples/s: {hf_samples_per_sec:.2f}\n")
# Free HF model memory
del hf_model, hf_pipeline
import torch, gc
gc.collect()
torch.cuda.empty_cache()
# --- Step 4: vLLM offline batch inference ---
print("=" * 70)
print(" STEP 4: Running vLLM offline batch inference")
print("=" * 70)
from vllm import LLM, SamplingParams
# Load model with vLLM
# vLLM handles batching automatically β you just pass ALL prompts at once
vllm_model = LLM(
model=MODEL_PATH,
max_model_len=4096, # keep it modest for Colab GPU memory
gpu_memory_utilization=0.85,
dtype="auto",
)
sampling_params = SamplingParams(
max_tokens=512,
temperature=0, # deterministic (greedy)
)
print(f" Running {NUM_SAMPLES} samples in a single vLLM batch call...")
vllm_start = time.time()
# This is the key difference: ONE call with ALL prompts
# vLLM internally handles optimal batching, scheduling, and KV cache management
vllm_outputs = vllm_model.generate(test_prompts, sampling_params)
vllm_time = time.time() - vllm_start
vllm_samples_per_sec = NUM_SAMPLES / vllm_time
print(f"\n β
vLLM batch inference done!")
print(f" Time: {vllm_time:.2f}s | Samples/s: {vllm_samples_per_sec:.2f}\n")
# --- Step 5: Results comparison ---
print("=" * 70)
print(" RESULTS COMPARISON")
print("=" * 70)
speedup = hf_time / vllm_time
print(f"""
{'Metric':<30} {'HuggingFace':>15} {'vLLM':>15}
{'-' * 60}
{'Total time (s)':<30} {hf_time:>15.2f} {vllm_time:>15.2f}
{'Samples/second':<30} {hf_samples_per_sec:>15.2f} {vllm_samples_per_sec:>15.2f}
{'Batch strategy':<30} {'chunk=' + str(HF_BATCH_SIZE):>15} {'auto (all)':>15}
β‘ vLLM speedup: {speedup:.1f}x faster
At HF speed: {NUM_SAMPLES / hf_samples_per_sec / 3600:.2f} hours for {NUM_SAMPLES} samples
At vLLM speed: {NUM_SAMPLES / vllm_samples_per_sec / 3600:.4f} hours for {NUM_SAMPLES} samples
--- Extrapolation to 80M samples ---
HF pipeline: {80_000_000 / hf_samples_per_sec / 86400:,.1f} days
vLLM batch: {80_000_000 / vllm_samples_per_sec / 86400:,.1f} days
""")
# --- Step 6: Show sample outputs side by side ---
print("=" * 70)
print(" SAMPLE OUTPUTS (first 5)")
print("=" * 70)
for i in range(5):
prompt_text = dataset["train"][i]["messages"][0]["content"]
expected = dataset["train"][i]["messages"][1]["content"]
# Extract HF output (strip the prompt prefix)
hf_text = hf_outputs[i][0]["generated_text"][len(test_prompts[i]):]
# Extract vLLM output
vllm_text = vllm_outputs[i].outputs[0].text
print(f"\n --- Sample {i} ---")
print(f" Prompt: {prompt_text[:80]}{'...' if len(prompt_text) > 80 else ''}")
print(f" Expected: {expected[:80]}{'...' if len(expected) > 80 else ''}")
print(f" HF out: {hf_text.strip()[:80]}{'...' if len(hf_text.strip()) > 80 else ''}")
print(f" vLLM out: {vllm_text.strip()[:80]}{'...' if len(vllm_text.strip()) > 80 else ''}")
match = "β
match" if hf_text.strip() == vllm_text.strip() else "β οΈ differ"
print(f" HF vs vLLM: {match}")
# --- Step 7: Plot ---
print(f"\n{'=' * 70}")
print(" PLOT")
print("=" * 70)
import matplotlib.pyplot as plt
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
# Bar chart: time
methods = ["HuggingFace\nPipeline", "vLLM\nBatch"]
times = [hf_time, vllm_time]
colors = ["steelblue", "coral"]
ax1.bar(methods, times, color=colors)
ax1.set_ylabel("Total Time (seconds)")
ax1.set_title(f"Inference Time β {NUM_SAMPLES} samples")
for i, v in enumerate(times):
ax1.text(i, v + max(times) * 0.02, f"{v:.1f}s", ha="center", fontweight="bold")
# Add speedup arrow
ax1.annotate(
f"{speedup:.1f}x faster",
xy=(1, vllm_time + max(times) * 0.05),
xytext=(0, hf_time + max(times) * 0.1),
arrowprops=dict(arrowstyle="->", color="green", lw=2),
fontsize=12, fontweight="bold", color="green", ha="center",
)
ax1.set_ylim(0, max(times) * 1.4)
# Bar chart: samples/s
rates = [hf_samples_per_sec, vllm_samples_per_sec]
ax2.bar(methods, rates, color=colors)
ax2.set_ylabel("Samples per Second")
ax2.set_title("Throughput Comparison")
for i, v in enumerate(rates):
ax2.text(i, v + max(rates) * 0.02, f"{v:.1f}", ha="center", fontweight="bold")
ax2.set_ylim(0, max(rates) * 1.3)
plt.tight_layout()
plt.savefig("vllm_vs_hf_comparison.png", dpi=150, bbox_inches="tight")
plt.show()
print("\nβ
Plot saved to vllm_vs_hf_comparison.png")
print("=" * 70)
print(" DONE!")
print("=" * 70)======================================================================
STEP 1: Loading FoodExtract-1k dataset
======================================================================
/home/mrdbourke/miniforge3/envs/vllm_bench/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
from .autonotebook import tqdm as notebook_tqdm
β
Loaded 1420 samples
======================================================================
STEP 2: Preparing prompts
======================================================================
β
Prepared 1420 prompts
Example prompt (first 120 chars): <bos><start_of_turn>user
A mouth-watering photograph captures a delectable dish centered on a rectangular white porcelai...
======================================================================
STEP 3: Running HuggingFace Pipeline (baseline)
======================================================================
/home/mrdbourke/miniforge3/envs/vllm_bench/lib/python3.12/site-packages/torch/cuda/__init__.py:435: UserWarning:
Found GPU0 NVIDIA GB10 which is of cuda capability 12.1.
Minimum and Maximum cuda capability supported by this version of PyTorch is
(8.0) - (12.0)
queued_call()
Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/home/mrdbourke/.cache/huggingface/hub/models--mrdbourke--FoodExtract-gemma-3-270m-fine-tune-v2/.no_exist/9c09257060fe5e8d50e8181d9d344668bd9279f8/custom_generate'
Device set to use cuda
`generation_config` default values have been modified to match model-specific defaults: {'do_sample': True, 'top_k': 64, 'top_p': 0.95, 'pad_token_id': 0, 'bos_token_id': 2, 'eos_token_id': [1, 106]}. If this is not desired, please set these values explicitly.
Running 1420 samples with batch_size=32...
... 160/1420 samples (12.0 samples/s)
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
... 320/1420 samples (11.3 samples/s)
... 480/1420 samples (9.1 samples/s)
... 640/1420 samples (7.8 samples/s)
... 800/1420 samples (7.7 samples/s)
... 960/1420 samples (7.8 samples/s)
... 1120/1420 samples (7.3 samples/s)
... 1280/1420 samples (6.3 samples/s)
... 1420/1420 samples (6.8 samples/s)
β
HuggingFace Pipeline done!
Time: 208.44s | Samples/s: 6.81
======================================================================
STEP 4: Running vLLM offline batch inference
======================================================================
INFO 03-18 04:18:42 [utils.py:233] non-default args: {'max_model_len': 4096, 'gpu_memory_utilization': 0.85, 'disable_log_stats': True, 'model': 'mrdbourke/FoodExtract-gemma-3-270m-fine-tune-v2'}
Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/home/mrdbourke/.cache/huggingface/hub/models--mrdbourke--FoodExtract-gemma-3-270m-fine-tune-v2/.no_exist/9c09257060fe5e8d50e8181d9d344668bd9279f8/preprocessor_config.json'
INFO 03-18 04:18:44 [model.py:533] Resolved architecture: Gemma3ForCausalLM
INFO 03-18 04:18:44 [model.py:1582] Using max model len 4096
INFO 03-18 04:18:44 [scheduler.py:231] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 03-18 04:18:44 [vllm.py:754] Asynchronous scheduling is enabled.
WARNING 03-18 04:18:47 [system_utils.py:152] We must use the `spawn` multiprocessing start method. Overriding VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See https://docs.vllm.ai/en/latest/usage/troubleshooting.html#python-multiprocessing for more information. Reasons: CUDA is initialized
(EngineCore pid=851428) INFO 03-18 04:18:49 [core.py:103] Initializing a V1 LLM engine (v0.17.2rc1.dev45+g761e0aa7a) with config: model='mrdbourke/FoodExtract-gemma-3-270m-fine-tune-v2', speculative_config=None, tokenizer='mrdbourke/FoodExtract-gemma-3-270m-fine-tune-v2', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=mrdbourke/FoodExtract-gemma-3-270m-fine-tune-v2, enable_prefix_caching=True, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': <CompilationMode.VLLM_COMPILE: 3>, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': <CUDAGraphMode.FULL_AND_PIECEWISE: (2, 1)>, 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': <DynamicShapesType.BACKED: 'backed'>, 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []}
(EngineCore pid=851428) /home/mrdbourke/miniforge3/envs/vllm_bench/lib/python3.12/site-packages/torch/cuda/__init__.py:435: UserWarning:
(EngineCore pid=851428) Found GPU0 NVIDIA GB10 which is of cuda capability 12.1.
(EngineCore pid=851428) Minimum and Maximum cuda capability supported by this version of PyTorch is
(EngineCore pid=851428) (8.0) - (12.0)
(EngineCore pid=851428)
(EngineCore pid=851428) queued_call()
(EngineCore pid=851428) INFO 03-18 04:18:50 [parallel_state.py:1395] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://192.168.1.6:39495 backend=nccl
(EngineCore pid=851428) INFO 03-18 04:18:50 [parallel_state.py:1717] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A
(EngineCore pid=851428) INFO 03-18 04:18:50 [gpu_model_runner.py:4506] Starting to load model mrdbourke/FoodExtract-gemma-3-270m-fine-tune-v2...
(EngineCore pid=851428) INFO 03-18 04:18:50 [cuda.py:333] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION'].
(EngineCore pid=851428) INFO 03-18 04:18:50 [flash_attn.py:598] Using FlashAttention version 2
(EngineCore pid=851428) Ignored error while writing commit hash to /home/mrdbourke/.cache/huggingface/hub/models--mrdbourke--FoodExtract-gemma-3-270m-fine-tune-v2/refs/main: [Errno 13] Permission denied: '/home/mrdbourke/.cache/huggingface/hub/models--mrdbourke--FoodExtract-gemma-3-270m-fine-tune-v2/refs/main'.
(EngineCore pid=851428) Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/home/mrdbourke/.cache/huggingface/hub/models--mrdbourke--FoodExtract-gemma-3-270m-fine-tune-v2/.no_exist/9c09257060fe5e8d50e8181d9d344668bd9279f8/model.safetensors.index.json'
Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00<?, ?it/s]
(EngineCore pid=851428) INFO 03-18 04:18:52 [weight_utils.py:618] No model.safetensors.index.json found in remote.
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:03<00:00, 3.09s/it]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:03<00:00, 3.09s/it]
(EngineCore pid=851428)
(EngineCore pid=851428) INFO 03-18 04:18:55 [default_loader.py:377] Loading weights took 3.11 seconds
(EngineCore pid=851428) INFO 03-18 04:18:55 [gpu_model_runner.py:4591] Model loading took 0.53 GiB memory and 4.880466 seconds
(EngineCore pid=851428) INFO 03-18 04:18:56 [backends.py:988] Using cache directory: /home/mrdbourke/.cache/vllm/torch_compile_cache/21aa8e5800/rank_0_0/backbone for vLLM's torch.compile
(EngineCore pid=851428) INFO 03-18 04:18:56 [backends.py:1048] Dynamo bytecode transform time: 0.82 s
(EngineCore pid=851428) [rank0]:W0318 04:18:57.420000 851428 site-packages/torch/_inductor/utils.py:1679] Not enough SMs to use max_autotune_gemm mode
(EngineCore pid=851428) INFO 03-18 04:18:58 [backends.py:284] Directly load the compiled graph(s) for compile range (1, 8192) from the cache, took 1.490 s
(EngineCore pid=851428) INFO 03-18 04:18:58 [monitor.py:48] torch.compile took 2.41 s in total
(EngineCore pid=851428) INFO 03-18 04:18:58 [decorators.py:296] Directly load AOT compilation from path /home/mrdbourke/.cache/vllm/torch_compile_cache/torch_aot_compile/0cc0fab39e8169dcb63bbf9e524e9cb1f63a6cb828ce868820ff2ae7346cd674/rank_0_0/model
(EngineCore pid=851428) INFO 03-18 04:18:58 [monitor.py:76] Initial profiling/warmup run took 0.21 s
(EngineCore pid=851428) INFO 03-18 04:18:58 [kv_cache_utils.py:826] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512
(EngineCore pid=851428) INFO 03-18 04:18:58 [gpu_model_runner.py:5632] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=35 (largest=256)
(EngineCore pid=851428) INFO 03-18 04:18:59 [gpu_model_runner.py:5711] Estimated CUDA graph memory: 0.25 GiB total
(EngineCore pid=851428) INFO 03-18 04:18:59 [gpu_worker.py:456] Available KV cache memory: 99.47 GiB
(EngineCore pid=851428) INFO 03-18 04:18:59 [gpu_worker.py:490] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.8500 to 0.8521 to maintain the same effective KV cache size.
(EngineCore pid=851428) INFO 03-18 04:18:59 [kv_cache_utils.py:1316] GPU KV cache size: 5,794,416 tokens
(EngineCore pid=851428) INFO 03-18 04:18:59 [kv_cache_utils.py:1321] Maximum concurrency for 4,096 tokens per request: 1410.06x
(EngineCore pid=851428) 2026-03-18 04:19:06,553 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ...
(EngineCore pid=851428) 2026-03-18 04:19:06,835 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|ββββββββββ| 51/51 [00:00<00:00, 69.06it/s]
Capturing CUDA graphs (decode, FULL): 100%|ββββββββββ| 35/35 [00:00<00:00, 86.07it/s]
(EngineCore pid=851428) INFO 03-18 04:19:08 [gpu_model_runner.py:5771] Graph capturing finished in 1 secs, took 0.46 GiB
(EngineCore pid=851428) INFO 03-18 04:19:08 [gpu_worker.py:617] CUDA graph pool memory: 0.46 GiB (actual), 0.25 GiB (estimated), difference: 0.21 GiB (46.1%).
(EngineCore pid=851428) INFO 03-18 04:19:08 [core.py:281] init engine (profile, create kv cache, warmup model) took 13.01 seconds
(EngineCore pid=851428) INFO 03-18 04:19:12 [vllm.py:754] Asynchronous scheduling is enabled.
INFO 03-18 04:19:12 [llm.py:391] Supported tasks: ['generate']
Running 1420 samples in a single vLLM batch call...
Rendering prompts: 100%|ββββββββββ| 1420/1420 [00:00<00:00, 3968.93it/s]
Processed prompts: 100%|ββββββββββ| 1420/1420 [00:05<00:00, 241.95it/s, est. speed input: 29275.11 toks/s, output: 7791.33 toks/s]
β
vLLM batch inference done!
Time: 6.25s | Samples/s: 227.17
======================================================================
RESULTS COMPARISON
======================================================================
Metric HuggingFace vLLM
------------------------------------------------------------
Total time (s) 208.44 6.25
Samples/second 6.81 227.17
Batch strategy chunk=32 auto (all)
β‘ vLLM speedup: 33.3x faster
At HF speed: 0.06 hours for 1420 samples
At vLLM speed: 0.0017 hours for 1420 samples
--- Extrapolation to 80M samples ---
HF pipeline: 135.9 days
vLLM batch: 4.1 days
======================================================================
SAMPLE OUTPUTS (first 5)
======================================================================
--- Sample 0 ---
Prompt: A mouth-watering photograph captures a delectable dish centered on a rectangular...
Expected: food_or_drink: 1
tags: fi, fa
foods: cheese-stuffed peppers, cherry tomato halve...
HF out: food_or_drink: 1
tags: fi
foods: cheese-stuffed peppers, cherry tomato halves, g...
vLLM out: food_or_drink: 1
tags: fi
foods: cheese-stuffed peppers, cherry tomato halves, g...
HF vs vLLM: β οΈ differ
--- Sample 1 ---
Prompt: Bake Your Way To Happiness by Lisa de Nikolits
Expected: food_or_drink: 1
tags: fa
foods:
drinks:
HF out: food_or_drink: 1
tags: fa, fi
foods:
drinks:
vLLM out: food_or_drink: 1
tags: fi
foods:
drinks:
HF vs vLLM: β οΈ differ
--- Sample 2 ---
Prompt: A topβdown view of a pristine white plate showcases each item separately: a neat...
Expected: food_or_drink: 1
tags: fi, di
foods: elbow macaroni pasta, grilled chicken skewe...
HF out: food_or_drink: 1
tags: fi, di
foods: elbow macaroni pasta, grilled chicken skewe...
vLLM out: food_or_drink: 1
tags: fi, di
foods: elbow macaroni pasta, grilled chicken skewe...
HF vs vLLM: β οΈ differ
--- Sample 3 ---
Prompt: Educational science activity kit with solar print paper to create solar print ar...
Expected: food_or_drink: 0
tags:
foods:
drinks:
HF out: food_or_drink: 0
tags:
foods:
drinks:
vLLM out: food_or_drink: 0
tags:
foods:
drinks:
HF vs vLLM: β
match
--- Sample 4 ---
Prompt: This image showcases a promotional photograph of a five-piece set of pots and pa...
Expected: food_or_drink: 0
tags:
foods:
drinks:
HF out: food_or_drink: 0
tags:
foods:
drinks:
vLLM out: food_or_drink: 0
tags:
foods:
drinks:
HF vs vLLM: β
match
======================================================================
PLOT
======================================================================
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) Cell In[2], line 222 219 print(" PLOT") 220 print("=" * 70) --> 222 import matplotlib.pyplot as plt 224 fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5)) 226 # Bar chart: time ModuleNotFoundError: No module named 'matplotlib'
False