Learn Hugging Face 🤗
  • Home
  • About
  • Setup
  • Glossary
  1. Work in progress: Batched LLM inference
  • Natural Language Processing (NLP)
    • Fully fine-tune an LLM to do structured data extraction
    • Build a custom text classification model and demo
  • Computer Vision
    • Build a custom object detection model and demo
    • Fine-tune a Small VLM to do structured data extraction
  • RAG - Retrieval-Augmented Generation
    • Build a multimodal RAG pipeline with NVIDIA Nemotron models

On this page

  • TK - Bonus: Performing evaluations on our model
    • Helper functions for condensing and uncondensing items
    • More specific evals
    • Calculate precision and recall metrics across all fields
  • Report an issue

Work in progress: Batched LLM inference

Note: This notebook is a work in progress.

Goal: Batched inference with Hugging Face transformers.

TK - Resources * TK - this notebook builds off of the LLM fine-tuning tutorial - https://www.learnhuggingface.com/notebooks/hugging_face_llm_full_fine_tune_tutorial

In [1]:
import time

# Stamp the notebook with the time of the last run.
timestamp = time.ctime()
print(f"Last updated: {timestamp}")
Out [1]:
Last updated: Mon Mar 23 05:29:48 2026
  • TK - use JSON here to make sure our model outputs the right output
  • Code:
import json

def validate_response(response_text: str, required_fields: dict) -> tuple[bool, str]:
    """Validate that a model response is JSON containing required typed fields.

    Args:
        response_text: Raw text returned by the model.
        required_fields: Mapping of field name -> expected Python type,
            e.g. {"answer": str, "confidence": float}.

    Returns:
        (True, "valid") when every field is present with the right type,
        otherwise (False, reason).
    """
    try:
        data = json.loads(response_text)
    except json.JSONDecodeError as e:
        return False, f"Invalid JSON: {e}"

    for field, expected_type in required_fields.items():
        if field not in data:
            return False, f"Missing field: {field}"
        value = data[field]
        # bool is a subclass of int in Python, so isinstance(True, int) is
        # True; reject bools where a number is required.
        if expected_type in (int, float) and isinstance(value, bool):
            return False, f"{field} expected {expected_type.__name__}, got bool"
        # JSON has no int/float distinction: accept 1 where 1.0 is expected.
        if expected_type is float and isinstance(value, int):
            continue
        if not isinstance(value, expected_type):
            return False, f"{field} expected {expected_type.__name__}, got {type(value).__name__}"

    return True, "valid"

# Usage
raw = '{"answer": "Paris", "confidence": 0.95}'
ok, msg = validate_response(raw, {"answer": str, "confidence": float})
  • dataclass?
# models.py
from dataclasses import dataclass, field, asdict
from typing import TypedDict

# --- dataclass: mutable objects you build up during a run ---
@dataclass
class EvalResult:
    """One evaluation record, filled in during a run and serialized with asdict()."""

    prompt: str               # input sent to the model
    expected: str             # ground-truth answer
    actual: str = ""          # model output, assigned after generation
    latency_s: float = 0.0    # wall-clock generation time in seconds
    passed: bool = False      # whether actual matched expected (set by the caller)
    error: str | None = None  # populated when the call failed

# Build an EvalResult incrementally, then dump it as a plain dict.
result = EvalResult(prompt="Capital of France?", expected="Paris")
result.actual = "Paris"
result.latency_s = 0.432
result.passed = result.expected.lower() == result.actual.lower()

print(asdict(result))  # clean dict for JSON output


# --- TypedDict: typing for dicts you're reading (e.g. from a JSON file) ---
class TestCase(TypedDict):
    """Schema of one test case read from data; a plain dict at runtime."""

    id: int               # test-case identifier
    prompt: str           # question to send to the model
    expected_answer: str  # ground-truth answer to compare against

# This is just a type hint — it's still a plain dict at runtime
case: TestCase = {"id": 1, "prompt": "Capital of France?", "expected_answer": "Paris"}

TK - Bonus: Performing evaluations on our model

TK - split this into another notebook, introduce batching to make the predictions faster

We can evaluate our model directly against the original labels from gpt-oss-120b and see how it stacks up.

For example, we could compare the following:

  • F1 Score
  • Precision
  • Recall
  • Pure accuracy

Having these metrics, as well as samples that differ from the ground truth, would allow us to further explore where our model needs improvement.

# Build prompts from the test split: chat-formatted for the pipeline and a
# plain copy of the raw sequences for reference.
test_sequences = dataset["test"]["sequence"]
len(test_sequences)
input_test_prompts_formatted = [{"role": "user", "content": item} for item in test_sequences]
# A plain copy needs list(...), not a pass-through comprehension.
input_test_prompts_no_format = list(test_sequences)
len(input_test_prompts_no_format)
# Get preds on the test set
# Run the fine-tuned model over every test example and keep the raw pipeline
# outputs alongside the original item for later comparison.
test_outputs = []

for item in tqdm(dataset["test"]):
  # Chat-format only the first message (the user turn); the assistant turn is
  # what we want the model to generate.
  prompt = pipe.tokenizer.apply_chat_template(item["messages"][:1], tokenize=False, add_generation_prompt=True)
  outputs = pipe(prompt, max_new_tokens=256, disable_compile=True)

  # Attach the raw generation to the item so labels and predictions travel together.
  item["test_outputs"] = outputs
  test_outputs.append(item)

Helper functions for condensing and uncondensing items

%%writefile checkpoint_models/helper_functions.py
def condense_output(original_output):
    """Helper function to condense a given FoodExtract dict into a compact string.

    Example input: {'is_food_or_drink': True, 'tags': ['fi'], 'food_items': ['cape gooseberries', 'mulberry'], 'drink_items': []}

    Example output: food_or_drink: 1\ntags: fi\nfoods: cape gooseberries, mulberry\ndrinks:
    """
    # "True"/True both map to "1"; anything else maps to "0".
    is_food_or_drink = "1" if str(original_output["is_food_or_drink"]).lower() == "true" else "0"

    # ", ".join already returns "" for an empty list, so no length guard needed.
    tags = ", ".join(original_output["tags"])
    foods = ", ".join(original_output["food_items"])
    drinks = ", ".join(original_output["drink_items"])

    # Build the lines explicitly so no stray indentation leaks into the output.
    # The original used an indented triple-quoted template whose continuation
    # lines carried four leading spaces, so the produced string did not match
    # the documented example (the final .strip() only trims the string's ends).
    lines = [
        f"food_or_drink: {is_food_or_drink}",
        f"tags: {tags}",
        f"foods: {foods}",
        f"drinks: {drinks}",
    ]
    return "\n".join(lines).strip()

def uncondense_output(condensed_output):
    """Helper to go from condensed output to uncondensed output.

    Example input: food_or_drink: 1\ntags: fi\nfoods: cape gooseberries, mulberry\ndrinks:

    Example output: {'is_food_or_drink': True, 'tags': ['fi'], 'food_items': ['cape gooseberries', 'mulberry'], 'drink_items': []}
    """

    def _last_line_containing(lines, marker):
        # Mirrors the original scan: the last line containing the marker wins.
        found = None
        for line in lines:
            if marker in line:
                found = line
        return found

    def _split_items(line, prefixes):
        # Split the line on ", ", strip each prefix (in order) from every
        # token, then drop tokens that end up empty.
        cleaned = []
        for token in line.split(", "):
            for prefix in prefixes:
                token = token.replace(prefix, "")
            token = token.strip()
            if token:
                cleaned.append(token)
        return cleaned

    lines = condensed_output.split("\n")

    bool_line = _last_line_containing(lines, "food_or_drink:")
    tags_line = _last_line_containing(lines, "tags:")
    foods_line = _last_line_containing(lines, "foods:")
    drinks_line = _last_line_containing(lines, "drinks:")

    # A missing food_or_drink line yields None rather than a guessed bool.
    if bool_line is None:
        is_food_or_drink = None
    else:
        is_food_or_drink = bool_line.replace("food_or_drink: ", "").strip() == "1"

    return {
        "is_food_or_drink": is_food_or_drink,
        "tags": _split_items(tags_line, ("tags: ", "tags:")) if tags_line else [],
        "food_items": _split_items(foods_line, ("foods:", "foods: ")) if foods_line else [],
        "drink_items": _split_items(drinks_line, ("drinks:", "drinks: ")) if drinks_line else [],
    }
# Test them out
# Spot-check one random test item: show the input sequence, the original
# GPT-OSS-120B label, and the model prediction both parsed and raw.
import random

random_test_sample = random.choice(test_outputs)
random_test_sequence = random_test_sample["sequence"]
random_test_original_label = random_test_sample["gpt-oss-120b-label"]
# Everything after the final "<start_of_turn>model" marker is taken as the
# model's answer (assumes the Gemma chat template -- TODO confirm the marker
# matches the tokenizer's template).
random_test_predicted_label_condensed = random_test_sample["test_outputs"][0]["generated_text"].split("<start_of_turn>model")[-1].strip()

print(f"[INFO] Original sequence:")
print(random_test_sequence)
print()
print(f"[INFO] GPT-OSS-120B label:")
print(random_test_original_label)
print()
print(f"[INFO] Uncondensed output:")
print(uncondense_output(random_test_predicted_label_condensed))
print()
print(f"[INFO] Condensed output:")
print(random_test_predicted_label_condensed)
# Parse every raw prediction into dict form for the evals below.
# NOTE(review): despite the key name, "condensed_output" stores the
# *uncondensed* dict returned by uncondense_output — verify before renaming,
# since later cells index this key.
for item in test_outputs:
  item["condensed_output"] = uncondense_output(item["test_outputs"][0]["generated_text"].split("<start_of_turn>model")[-1].strip())
# Spot-check another random sample, this time using the parsed prediction.
random_test_sample = random.choice(test_outputs)
random_test_sequence = random_test_sample["sequence"]
random_test_original_label = random_test_sample["gpt-oss-120b-label"]
random_test_predicted_label = random_test_sample["condensed_output"]

print(f"[INFO] Original sequence:")
print(random_test_sequence)
print()
print(f"[INFO] GPT-OSS-120B label:")
print(random_test_original_label)
print()
print(f"[INFO] Uncondensed output:")
print(random_test_predicted_label)
print()
# Quick evals

# Strict field-to-field equality between the GPT-OSS labels and our model's
# parsed predictions.
import ast

num_bool_matches = 0
num_tags_matches = 0
num_food_items_matches = 0
num_drink_items_matches = 0

for item in test_outputs:
  # The stored label is a Python dict-literal string; literal_eval parses it
  # without the arbitrary-code-execution risk of eval().
  gpt_labels = ast.literal_eval(item["gpt-oss-120b-label"])
  gemma_labels = item["condensed_output"]

  for key, value in gpt_labels.items():
    is_match = 1 if gemma_labels[key] == value else 0

    # The keys are mutually exclusive, so an elif chain is sufficient.
    if key == "is_food_or_drink":
      num_bool_matches += is_match
    elif key == "tags":
      num_tags_matches += is_match
    elif key == "food_items":
      num_food_items_matches += is_match
    elif key == "drink_items":
      num_drink_items_matches += is_match

print(f"[INFO] Number of direct bool matches: {num_bool_matches}/{len(test_outputs)}")
print(f"[INFO] Number of direct tag matches: {num_tags_matches}/{len(test_outputs)}")
print(f"[INFO] Number of direct food items matches: {num_food_items_matches}/{len(test_outputs)}")
print(f"[INFO] Number of direct drink items matches: {num_drink_items_matches}/{len(test_outputs)}")

More specific evals

Going to expand on the quick evals and see where we get to.

Details:

  • For precision and recall we require true positives, false positives and false negatives:
    • True positives: Items in both the target set as well as the predicted set.
    • False positives: Items in the predicted set but not in the target set.
    • False negatives: Items in the target set but not in the predicted set.
# Get pure equality matching (e.g. strict field to field)
# Counters for exact-equality matches; the *_filter_* variants count matches
# after lowercasing and de-duplicating the item lists.
num_bool_matches = 0
num_tags_matches = 0
num_food_items_matches = 0
num_food_items_filter_matches = 0
num_drink_items_matches = 0
num_drink_items_filter_matches = 0

def make_sure_bool_is_str(input):
  """Normalize a bool-ish value to the string "true" or "false".

  Any value is stringified and lowercased; it maps to "true" iff the result
  contains the substring "true". (The parameter keeps its original name
  because callers pass it by keyword; note it shadows the builtin input.)
  """
  # The original's isinstance(input, bool) check was dead code: both branches
  # of the if/else were byte-identical, so it collapses to one expression.
  return "true" if "true" in str(input).lower() else "false"

# Detailed per-item comparison: strict matches plus set-based (lowercased,
# de-duplicated) matches and false positive / false negative item sets.
import ast

result_dicts = []
for item in test_outputs:
  result_dict = {}

  # literal_eval parses the stored dict-literal label without the arbitrary
  # code execution risk of eval().
  gpt_labels = ast.literal_eval(item["gpt-oss-120b-label"])
  gemma_labels = item["condensed_output"]

  result_dict["gpt_labels"] = gpt_labels
  result_dict["gemma_labels"] = gemma_labels
  result_dict["sequence"] = item["sequence"]
  result_dict["uuid"] = item["uuid"]

  # Make sure the types of the bools are the same (we just want all strings)
  gpt_labels["is_food_or_drink"] = make_sure_bool_is_str(input=gpt_labels["is_food_or_drink"])
  gemma_labels["is_food_or_drink"] = make_sure_bool_is_str(input=gemma_labels["is_food_or_drink"])

  for key, value in gpt_labels.items():
    # Get truth labels
    gpt_truth = value
    gemma_truth = gemma_labels[key]

    # Find direct matches (== already yields a bool for the result dict)
    is_match = 1 if gpt_truth == gemma_truth else 0

    # Go through individual stats and see if there are matches
    if key == "is_food_or_drink":
      result_dict["results_is_food_or_drink_match"] = gpt_truth == gemma_truth
      num_bool_matches += is_match

    if key == "tags":
      result_dict["results_tags_match"] = gpt_truth == gemma_truth
      num_tags_matches += is_match

    if key == "food_items":
      result_dict["results_food_items_match"] = gpt_truth == gemma_truth
      num_food_items_matches += is_match

      # Compare samples with lowering and filtering for uniques.
      # (Comprehension variable renamed: the original reused `item`,
      # shadowing the outer loop variable.)
      gpt_food_list_lower = sorted({food.lower() for food in gpt_truth})
      gemma_food_list_lower = sorted({food.lower() for food in gemma_truth})

      # Match filtered and set labels (removes duplicates)
      if gpt_food_list_lower == gemma_food_list_lower:
        num_food_items_filter_matches += 1
        result_dict["result_food_items_filter_match"] = True
      else:
        result_dict["result_food_items_filter_match"] = False

      # Items predicted by Gemma but not in the labels (False Positives)
      food_list_false_positive = set(gemma_food_list_lower) - set(gpt_food_list_lower)
      result_dict["result_food_list_false_positive"] = food_list_false_positive

      # Items in the GPT labels but not predicted by Gemma (False Negatives).
      # The original bound this to a local misleadingly named
      # food_list_true_negative; the stored dict key was already correct.
      food_list_false_negative = set(gpt_food_list_lower) - set(gemma_food_list_lower)
      result_dict["result_food_list_false_negative"] = food_list_false_negative

    if key == "drink_items":
      result_dict["results_drink_items_match"] = gpt_truth == gemma_truth
      num_drink_items_matches += is_match

      # Compare samples with lowering and filtering for uniques
      gpt_drink_list_lower = sorted({drink.lower() for drink in gpt_truth})
      gemma_drink_list_lower = sorted({drink.lower() for drink in gemma_truth})

      # Match filtered and set labels (removes duplicates)
      if gpt_drink_list_lower == gemma_drink_list_lower:
        num_drink_items_filter_matches += 1
        result_dict["result_drink_items_filter_match"] = True
      else:
        result_dict["result_drink_items_filter_match"] = False

      # Items predicted by Gemma but not in the labels (False Positives)
      drink_list_false_positive = set(gemma_drink_list_lower) - set(gpt_drink_list_lower)
      result_dict["result_drink_list_false_positive"] = drink_list_false_positive

      # Items in the GPT labels but not predicted by Gemma (False Negatives)
      drink_list_false_negative = set(gpt_drink_list_lower) - set(gemma_drink_list_lower)
      result_dict["result_drink_list_false_negative"] = drink_list_false_negative

  result_dicts.append(result_dict)

print(f"[INFO] Number of direct bool matches: {num_bool_matches}/{len(test_outputs)} ({num_bool_matches / len(test_outputs) * 100:.2f}%)")
print(f"[INFO] Number of direct tag matches: {num_tags_matches}/{len(test_outputs)} ({num_tags_matches / len(test_outputs) * 100:.2f}%)")
print(f"[INFO] Number of direct food items matches: {num_food_items_matches}/{len(test_outputs)} ({num_food_items_matches / len(test_outputs) * 100:.2f}%)")
print(f"[INFO] Number of filtered food items matches: {num_food_items_filter_matches}/{len(test_outputs)} ({num_food_items_filter_matches / len(test_outputs) * 100:.2f}%)")
print(f"[INFO] Number of direct drink items matches: {num_drink_items_matches}/{len(test_outputs)} ({num_drink_items_matches / len(test_outputs) * 100:.2f}%)")
# Examples
gpt_food = ['wheat flour', 'vegetable oil', 'salt', 'sugar', 'water']
gemma_food = ['wheat flour', 'vegetable oil', 'salt', 'milk', 'vanilla']

def filter_and_lower_list(input_list):
  """Lowercase all items, drop duplicates and return them sorted.

  For an empty input this returns [] — value-equivalent to the original,
  which returned the input object itself (sorted(set([])) is already []).
  The str() wrapper around item.lower() was redundant: .lower() on a str
  already yields a str.
  """
  return sorted({item.lower() for item in input_list})

def calculate_precision_recall_f1(reference, predicted):
  """Calculates precision, recall and F1 scores for two given lists.

  Args:
      reference: ground-truth items (the label set).
      predicted: items produced by the model.

  Returns:
      Dict with float precision/recall/f1_score, plus the intersection and
      the items unique to either side (all lowercased and de-duplicated).
  """

  # Filter items for best comparison (lowercase, de-duplicate, sort)
  reference = filter_and_lower_list(input_list=reference)
  predicted = filter_and_lower_list(input_list=predicted)

  # Ensure they're both sets for easy comparison
  ref_set = set(reference)
  pred_set = set(predicted)

  # Handle the empty/empty case: predicting nothing when nothing was
  # expected counts as a perfect score.
  if (len(ref_set) == 0) and (len(pred_set) == 0):
    precision = 1.0
    recall = 1.0
    f1_score = 1.0

  else:
    tp = len(ref_set & pred_set)  # True Positives: items in both sets
    fp = len(pred_set - ref_set)  # False Positives: only in predicted
    fn = len(ref_set - pred_set)  # False Negatives: only in reference

    # Precision = tp / (tp + fp) — higher precision = fewer false positives.
    # (The original comment wrongly read "fp / (tp + fp)"; the code was right.)
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0

    # Recall = tp / (tp + fn) — higher recall = fewer false negatives.
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0

    # F1 = harmonic mean of precision and recall; 0.0 when both are zero so
    # every branch returns floats consistently (the original mixed int 0 in).
    f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

  # Return a dict of metrics
  return {
      "precision": precision,
      "recall": recall,
      "f1_score": f1_score,
      "intersection": list(ref_set.intersection(pred_set)),
      "only_in_reference": list(ref_set - pred_set),
      "only_in_pred": list(pred_set - ref_set)
  }

calculate_precision_recall_f1(reference=gpt_food,
                              predicted=gemma_food)

Calculate precision and recall metrics across all fields

tags_precision = 0
tags_recall = 0
tags_f1 = 0

food_precision = 0
food_recall = 0
food_f1 = 0

drink_recall = 0
drink_precision = 0
drink_f1 = 0

# NOTE(review): no upstream cell ever attached "tag_metrics"/"food_metrics"/
# "drink_metrics" to the result_dicts entries, so the original loop raised
# KeyError on the first item. Compute the metrics here from the stored label
# dicts instead, and cache them on each item for later inspection.
for item in result_dicts:
  tags_metrics = calculate_precision_recall_f1(reference=item["gpt_labels"]["tags"],
                                               predicted=item["gemma_labels"]["tags"])
  item["tag_metrics"] = tags_metrics
  tags_precision += tags_metrics["precision"]
  tags_recall += tags_metrics["recall"]
  tags_f1 += tags_metrics["f1_score"]

  food_metrics = calculate_precision_recall_f1(reference=item["gpt_labels"]["food_items"],
                                               predicted=item["gemma_labels"]["food_items"])
  item["food_metrics"] = food_metrics
  food_precision += food_metrics["precision"]
  food_recall += food_metrics["recall"]
  food_f1 += food_metrics["f1_score"]

  drink_metrics = calculate_precision_recall_f1(reference=item["gpt_labels"]["drink_items"],
                                                predicted=item["gemma_labels"]["drink_items"])
  item["drink_metrics"] = drink_metrics
  drink_precision += drink_metrics["precision"]
  drink_recall += drink_metrics["recall"]
  drink_f1 += drink_metrics["f1_score"]

# Report macro-averaged metrics (mean of the per-item scores).
print(f"[INFO] Tags metrics:")
print(f"Precision: {tags_precision / len(result_dicts):.3f}")
print(f"Recall: {tags_recall / len(result_dicts):.3f}")
print(f"F1: {tags_f1 / len(result_dicts):.3f}")
print()

print(f"[INFO] Food metrics:")
print(f"Precision: {food_precision / len(result_dicts):.3f}")
print(f"Recall: {food_recall / len(result_dicts):.3f}")
print(f"F1: {food_f1 / len(result_dicts):.3f}")
print()

print(f"[INFO] Drink metrics:")
print(f"Precision: {drink_precision / len(result_dicts):.3f}")
print(f"Recall: {drink_recall / len(result_dicts):.3f}")
print(f"F1: {drink_f1 / len(result_dicts):.3f}")
print()
Back to top
 
 
  • Report an issue