Source code for afnio.utils.datasets.facility_support

"""Meta's Facility Support Analyzer dataset."""

import json
import os
import random
from pathlib import Path
from typing import Dict, List, Tuple, Union
from urllib.error import URLError

from afnio._variable import Variable
from afnio.utils.data.dataset import Dataset
from afnio.utils.datasets.utils import check_integrity, download


class FacilitySupport(Dataset):
    """The Meta Facility Support Analyzer dataset consists of 200 real-world emails
    or messages sent in enterprise settings related to facility maintenance or
    support requests. Each example is annotated with:

    - urgency (low, medium, high)
    - sentiment (negative, neutral, positive)
    - relevant service request categories (e.g., cleaning, IT support, maintenance)

    The dataset is split into train, validation, and test sets with a 33%/33%/34%
    ratio. The split is deterministic, ensuring reproducibility across different
    runs.

    Args:
        split (str): The dataset split to load. Must be either "train", "val",
            or "test".
        root (Union[str, Path], optional): The root directory where JSON files
            are stored. Defaults to None.
    """

    mirrors = [
        "https://raw.githubusercontent.com/meta-llama/llama-prompt-ops/refs/heads/main/use-cases/facility-support-analyzer/"  # noqa: E501
    ]

    resources = [
        ("dataset.json", "530dc66b1b07c9b15b19f08891e9bfa0"),
    ]

    _repr_indent = 4

    def __init__(self, split: str, root: Union[str, Path] = None) -> None:
        if split not in {"train", "val", "test"}:
            raise ValueError(
                f"FacilitySupport Dataset: expected split in ['train', 'val', 'test'], "
                f"but got split={split}"
            )

        if isinstance(root, str):
            root = os.path.expanduser(root)
        self.split = split
        self.root = root

        self.download()

        # Load dataset from JSON
        file_path = os.path.join(self.raw_folder, self.resources[0][0])
        with open(file_path, "r", encoding="utf-8") as f:
            dataset: List[Dict] = json.load(f)

        # Shuffle deterministically
        random.Random(0).shuffle(dataset)

        n = len(dataset)
        n_train = int(n * 0.33)
        n_val = int(n * 0.33)

        if split == "train":
            self.data = dataset[:n_train]
        elif split == "val":
            self.data = dataset[n_train : n_train + n_val]  # noqa: E203
        else:  # test
            self.data = dataset[n_train + n_val :]  # noqa: E203

    def __getitem__(self, index: int) -> Tuple[Variable, Tuple[Variable, Variable, Variable]]:
        """
        Fetches a data sample (message, (urgency, sentiment, categories)) for a
        given index.
        """
        if not (0 <= index < len(self.data)):
            raise IndexError("Index out of range.")

        item = self.data[index]
        answer: dict = json.loads(item["answer"])

        urgency = answer.get("urgency", None)
        sentiment = answer.get("sentiment", None)
        categories = answer.get("categories", None)

        message = Variable(
            data=item["fields"]["input"],
            role="input email or message",
        )
        urgency = Variable(data=urgency, role="output urgency")
        sentiment = Variable(data=sentiment, role="output sentiment")
        categories = Variable(data=json.dumps(categories), role="output categories")

        return message, (urgency, sentiment, categories)

    def __len__(self) -> int:
        """Returns the number of samples in the dataset."""
        return len(self.data)
    def extra_repr(self) -> str:
        """Returns additional information about the dataset."""
        split_map = {"train": "Train", "val": "Validation", "test": "Test"}
        try:
            split = split_map[self.split]
        except KeyError:
            raise ValueError(
                f"Invalid split value: {self.split}. "
                f"Expected one of ['train', 'val', 'test']."
            )
        return f"Split: {split}"
    def __repr__(self) -> str:
        """Returns a string representation of the dataset."""
        head = "Dataset " + self.__class__.__name__
        body = [f"Number of datapoints: {self.__len__()}"]
        if self.root is not None:
            body.append(f"Root location: {self.root}")
        body += self.extra_repr().splitlines()
        lines = [head] + [" " * self._repr_indent + line for line in body]
        return "\n".join(lines)

    @property
    def raw_folder(self) -> str:
        return os.path.join(self.root, self.__class__.__name__, "raw")

    def _check_exists(self) -> bool:
        return all(
            check_integrity(
                os.path.join(
                    self.raw_folder, os.path.splitext(os.path.basename(url))[0]
                )
            )
            for url, _ in self.resources
        )
    def download(self) -> None:
        """Download the Facility Support data if it doesn't exist already."""
        if self._check_exists():
            return

        os.makedirs(self.raw_folder, exist_ok=True)

        # download files
        for filename, md5 in self.resources:
            for mirror in self.mirrors:
                url = f"{mirror}{filename}"
                try:
                    download(
                        url, download_root=self.raw_folder, filename=filename, md5=md5
                    )
                except URLError as error:
                    print(f"Failed to download (trying next):\n{error}")
                    continue
                finally:
                    print()
                break
            else:
                raise RuntimeError(f"Error downloading {filename}")
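
A minimal usage sketch, not part of the module source: it assumes a writable cache directory such as "./data" and that Variable exposes its payload through a .data attribute, as the constructor calls above suggest.

    from afnio.utils.datasets.facility_support import FacilitySupport

    # Download the JSON file on first use and load the deterministic validation split.
    val_set = FacilitySupport(split="val", root="./data")  # "./data" is an arbitrary cache dir
    print(val_set)       # Dataset name, number of datapoints, root location, split
    print(len(val_set))  # roughly 33% of the 200 examples

    # Each sample is (message, (urgency, sentiment, categories)), all wrapped in Variable objects.
    message, (urgency, sentiment, categories) = val_set[0]
    print(message.data)  # assumes Variable stores its payload on a `.data` attribute
    print(urgency.data, sentiment.data, categories.data)  # categories is a JSON-encoded string

Because the shuffle is seeded with random.Random(0), repeated runs with the same split argument yield the same examples in the same order.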