from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from afnio._utils import MultiTurnMessages
from afnio._variable import Variable
from afnio.autodiff.evaluator import LMJudgeEvaluator as LMJudgeEvaluatorOp
from afnio.models import ChatCompletionModel
from .module import Module
class LMJudgeEvaluator(Module):
"""
Evaluates predictions using a language model (LM) as the judge.
This module leverages the `LMJudgeEvaluator` operation from
`afnio.autodiff.evaluator` to perform model-based evaluations. The `forward`
method accepts a list of `messages` that construct the evaluation prompt, with
optional `inputs` to dynamically fill placeholders within message templates.
A `prediction` is compared against a `target` (optional) to generate a `score`
and an `explanation`.
When processing a batch of predictions and targets, the `reduction_fn` callable
aggregates the individual scores (e.g., `sum` to compute a total score). The
`reduction_fn_purpose` parameter is a brief description of the aggregation's
purpose (e.g., `"summation"`). If aggregation is not desired, set both
`reduction_fn` and `reduction_fn_purpose` to `None`. The `success_fn` callable
checks whether all evaluations are successful, allowing the `backward` pass to
skip unnecessary gradient computations.
This module supports both evaluation (`eval_mode=True`) and optimization
(`eval_mode=False`) modes.
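Using the names from the Example below, switching the same module into
optimization mode is a one-argument change (a sketch; all other arguments stay
as in the Example):

>>> score, explanation = model(
...     fwd_model_client,
...     messages,
...     prediction,
...     target,
...     inputs={"metric": metric},
...     eval_mode=False,
...     **fwd_model_args
... )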
The `forward_model_client` specifies the LM responsible for evaluation, while
`completion_args` allows customization of generation parameters like temperature,
max tokens, and seed.
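Custom callables can replace the aggregation and success-check defaults. The
sketch below is illustrative only: per the signatures above, each callable
receives the per-sample scores as a plain list; `mean_reduction` averages them
instead of summing, and `all_passed` treats the batch as successful when every
score is truthy. Pass them to `forward` as `reduction_fn=mean_reduction`,
`reduction_fn_purpose="averaging"`, and `success_fn=all_passed`.

>>> def mean_reduction(scores):
...     return sum(scores) / len(scores)
>>> def all_passed(scores):
...     return all(scores)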
Example:
>>> import afnio as hf
>>> from afnio import cognitive as cog
>>> from afnio.models.openai import OpenAI
>>> from afnio import set_backward_model_client
>>> fwd_model_client = OpenAI()
>>> fwd_model_args = {"model": "gpt-4o", "temperature": 0.5}
>>> set_backward_model_client("openai/gpt-4o")
>>> class Evaluator(cog.Module):
...     def __init__(self):
...         super().__init__()
...         self.judge = cog.LMJudgeEvaluator()
...     def forward(self, fwd_model, messages, prediction, target, inputs, **completion_args):
...         return self.judge(fwd_model, messages, prediction, target, inputs, **completion_args)
>>> task = hf.Variable(
...     "Evaluate if the translation is {metric}.",
...     role="evaluation task",
...     requires_grad=True
... )
>>> format = hf.Variable(
...     "Provide 'score' (true/false) and 'explanation' in JSON.",
...     role="output format"
... )
>>> metric = hf.Variable(["accurate", "accurate"], role="metric")
>>> user = hf.Variable(
...     "<PREDICTION>{prediction}</PREDICTION><TARGET>{target}</TARGET>",
...     role="user query"
... )
>>> prediction = hf.Variable(
...     ["Hola Mundo", "Salve a tutti"],
...     role="translated text",
...     requires_grad=True
... )
>>> target = ["Ciao Mondo", "Salve a tutti"]
>>> messages = [
...     {"role": "system", "content": [task, format]},
...     {"role": "user", "content": [user]},
... ]
>>> model = Evaluator()
>>> score, explanation = model(
...     fwd_model_client,
...     messages,
...     prediction,
...     target,
...     inputs={"metric": metric},
...     reduction_fn=sum,
...     reduction_fn_purpose="summation",
...     **fwd_model_args
... )
>>> print(score.data)
1
>>> print(explanation.data)
The evaluation function, designed using an LM as the judge, compared the <DATA> fields of the predicted variable and the target variable across all samples in the batch. These scores were then aggregated using the reduction function 'summation', resulting in a final aggregated score: 1.
>>> explanation.backward()
>>> prediction.grad[0].data
'The translated text should be in Italian.'
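If per-sample results are preferred, aggregation can be disabled as described
above. A sketch reusing the same setup (the exact shape of the returned values
depends on the underlying operation):

>>> scores, explanations = model(
...     fwd_model_client,
...     messages,
...     prediction,
...     target,
...     inputs={"metric": metric},
...     reduction_fn=None,
...     reduction_fn_purpose=None,
...     **fwd_model_args
... )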
See Also:
:class:`afnio.autodiff.evaluator.LMJudgeEvaluator` for the underlying operation.
""" # noqa: E501
    forward_model_client: Optional[ChatCompletionModel]
    messages: MultiTurnMessages
    success_fn: Optional[Callable[[List[Any]], bool]]
    reduction_fn: Optional[Callable[[List[Any]], Any]]
    reduction_fn_purpose: Optional[Union[str, Variable]]
    eval_mode: Union[bool, Variable]
    completion_args: Dict[str, Any]
    def __init__(self):
        super().__init__()
        # All state is registered empty here; it is populated on each
        # `forward` call so the module can be reused across evaluations.
        self.register_model("forward_model_client", None)
        self.register_chat("messages", None)
        self.register_function("success_fn", None)
        self.register_function("reduction_fn", None)
        self.register_buffer("reduction_fn_purpose", None)
        self.register_buffer("eval_mode", None)
        self.register_completion_config("completion_args", None)
    def forward(
        self,
        forward_model_client: Optional[ChatCompletionModel],
        messages: MultiTurnMessages,
        prediction: Variable,
        target: Optional[Union[str, List[str], Variable]] = None,
        inputs: Optional[Dict[str, Union[str, Variable]]] = None,
        success_fn: Optional[Callable[[List[Any]], bool]] = None,
        reduction_fn: Optional[Callable[[List[Any]], Any]] = sum,
        reduction_fn_purpose: Optional[Union[str, Variable]] = "summation",
        eval_mode: Union[bool, Variable] = True,
        **completion_args,
    ) -> Tuple[Variable, Variable]:
        self.forward_model_client = forward_model_client
        self.messages = messages
        self.success_fn = success_fn
        self.reduction_fn = reduction_fn
        # Wrap a plain-string purpose in a `Variable` so the autodiff op
        # receives a uniform type; `None` leaves aggregation disabled.
        self.reduction_fn_purpose = (
            None
            if reduction_fn_purpose is None
            else (
                reduction_fn_purpose
                if isinstance(reduction_fn_purpose, Variable)
                else Variable(reduction_fn_purpose)
            )
        )
        # Likewise normalize `eval_mode` to a `Variable`.
        self.eval_mode = (
            eval_mode if isinstance(eval_mode, Variable) else Variable(eval_mode)
        )
        self.completion_args = completion_args
        return LMJudgeEvaluatorOp.apply(
            self.forward_model_client,
            self.messages,
            prediction,
            target,
            inputs,
            self.success_fn,
            self.reduction_fn,
            self.reduction_fn_purpose,
            self.eval_mode,
            **self.completion_args,
        )