Source code for flexeval.schema.eval_schema

# originally generated by datamodel-codegen:
#   filename:  src/flexeval/eval_schema.json
#   timestamp: 2025-05-19T21:42:39+00:00

from __future__ import annotations

import sys
from typing import Any, Dict, List, Literal, Optional

from pydantic import BaseModel, Field

from flexeval.schema import schema_utils

VALID_METRIC_LEVELS = ["Message", "Turn", "Thread", "ToolCall"]
MetricLevel = Literal["Message", "Turn", "Thread", "ToolCall"]


class DependsOnItem(BaseModel):
    class Config:
        extra = "forbid"

    name: Optional[str] = Field(
        None, description="Name of the dependency function or rubric."
    )
    type: Optional[Literal["function", "rubric"]] = Field(
        None,
        description="One of 'function' or 'rubric' indicating the type of the dependency.",
    )
    kwargs: Optional[Dict[str, Any]] = Field(
        None,
        description="The keyword arguments for the dependency. If provided, used to match which evaluation this dependency is for, so must match the keyword args given for some evaluation.",
    )
    metric_name: Optional[str] = Field(
        None,
        description="Name of the metric dependency. This may be different than function_name if the metric function returns a key/value pair - in which case, this will match the key.",
    )
    metric_level: Optional[MetricLevel] = Field(
        None,
        description="The level of the metric to depend on, which must be equal to or 'greater' than the dependent metric's level, e.g. a Turn can depend on a Thread metric, but not the reverse.",
    )
    relative_object_position: int = Field(
        0,
        le=0,
        strict=True,
        description="The position of the object within the Thread. If 0 (default), this is the metric value for the current object. If -1, this is the metric value for the most recent object before this one.",
    )
    # TODO we could consider adding an absolute_object_position
    metric_min_value: Optional[float] = Field(
        -sys.float_info.max,
        description="Minimum value of the dependency to consider it as satisfied.",
    )
    metric_max_value: Optional[float] = Field(
        sys.float_info.max,
        description="Maximum value of the dependency to consider it as satisfied.",
    )
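
# Illustrative usage (not part of the generated schema): a dependency that is
# satisfied only when a hypothetical Thread-level metric "is_on_topic", produced
# by a function named "topic_check", has a value of at least 1 for the current
# object. The field names come from DependsOnItem above; the metric and function
# names are placeholders.
#
#     dep = DependsOnItem(
#         name="topic_check",
#         type="function",
#         metric_name="is_on_topic",
#         metric_level="Thread",
#         metric_min_value=1.0,
#     )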
class MetricItem(BaseModel):
    name: str = Field(
        ...,
        description="The function to call or name of rubric to use to compute this metric.",
    )
    depends_on: Optional[List[DependsOnItem]] = Field(
        default_factory=list,
        description="List of dependencies that must be satisfied for this metric to be computed.",
    )
    # TODO why is metric_level optional? Should likely make it required
    metric_level: Optional[MetricLevel] = Field(
        "Turn",
        description="What level of granularity (ToolCall, Message, Turn, or Thread) this rubric should be applied to",
    )
class FunctionItem(MetricItem):
    kwargs: schema_utils.OptionalDict = Field(
        default_factory=dict,
        description="Keyword arguments for the function. Each key must correspond to an argument in the function. Extra keys will cause an error.",
    )
    # TODO add the ability to provide a function source: Path | FunctionsCollection | schema_utils.ModuleType
class RubricItem(MetricItem):
    # TODO is RubricItem.kwargs actually used?
    kwargs: Optional[Dict[str, Any]] = Field(
        default_factory=dict,
        description="Keyword arguments for the rubric evaluation.",
    )
    # TODO add the ability to provide a rubric source: Path | RubricsCollection
class Metrics(BaseModel):
    """Defines the metrics to be evaluated."""

    function: Optional[List[FunctionItem]] = Field(
        None, description="List of function-based metrics to be evaluated."
    )
    rubric: Optional[List[RubricItem]] = Field(
        None, description="List of rubrics to be evaluated."
    )
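
# Illustrative usage (not part of the generated schema): a Metrics block with one
# function-based metric and one rubric. "string_length" and "helpfulness" are
# placeholder names; real names must resolve to an available metric function or
# rubric.
#
#     metrics = Metrics(
#         function=[FunctionItem(name="string_length", metric_level="Message")],
#         rubric=[RubricItem(name="helpfulness", metric_level="Turn")],
#     )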
class CompletionLlm(BaseModel):
    class Config:
        extra = "forbid"

    function_name: str = Field(
        ...,
        description="Completion function defined in `completion_functions.py` or available in the global namespace.",
    )
    include_system_prompt: bool = True
    kwargs: Dict[str, Any] = Field(
        default_factory=dict,
        description="Additional arguments that will be passed to the completion function. Must correspond to arguments in the named function.",
    )
class GraderLlm(BaseModel):
    class Config:
        extra = "forbid"

    function_name: str = Field(
        ...,
        description="Function defined in `completion_functions.py`. We're not really completing a conversation, but we ARE asking an LLM to provide a response to an input - in this case, the rubric.",
    )
    kwargs: Dict[str, Any] = Field(
        default_factory=dict,
        description="Additional arguments that will be passed to the completion function. Must correspond to arguments in the named function.",
    )
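
# Illustrative usage (not part of the generated schema): CompletionLlm and
# GraderLlm are configured the same way. The function name and kwargs below are
# placeholders and must match a function defined in `completion_functions.py`
# (or, for CompletionLlm, one available in the global namespace).
#
#     completion_llm = CompletionLlm(
#         function_name="open_ai_completion",  # hypothetical name
#         kwargs={"model": "gpt-4o-mini"},
#     )
#     grader_llm = GraderLlm(
#         function_name="open_ai_completion",  # hypothetical name
#         kwargs={"model": "gpt-4o-mini"},
#     )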
class Eval(BaseModel):
    """Defines the evaluation that should be executed.

    The key fields are :attr:`metrics` and :attr:`grader_llm`.
    """

    class Config:
        # TODO don't permit additional fields in Eval
        extra = "allow"

    do_completion: bool = Field(
        False,
        description="Flag to determine if completions should be done in each thread. Set to 'true' if you are testing a new API and want to evaluate the API responses. Set to 'false' (default) if you are evaluating past conversations and do not need to generate new completions.",
    )
    name: Optional[str] = Field(
        None,
        description="Name of the test suite. Used as metadata only. Does not need to match the key of the entry in the evals.yaml file.",
    )
    notes: str = Field(
        "",
        description="Additional notes regarding the configuration. Used as metadata only.",
    )
    metrics: Metrics = Field(
        default_factory=Metrics, description="Metrics to use in the evaluation."
    )
    completion_llm: Optional[CompletionLlm] = Field(
        None,
        description="Specification of the LLM or API used to perform new completions. Must be defined if `do_completion: true` is set.",
    )
    grader_llm: Optional[GraderLlm] = Field(
        None,
        description="Specification of the LLM or API used to grade rubrics. Must be defined if any rubric metrics are specified.",
    )
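
# Illustrative usage (not part of the generated schema): a minimal Eval that
# grades past conversations against a single rubric, so no completion_llm is
# needed. All names are placeholders; in practice this configuration would come
# from an entry in the evals.yaml file.
#
#     eval_config = Eval(
#         name="example-eval",
#         notes="Sketch only; rubric and function names are hypothetical.",
#         do_completion=False,
#         metrics=Metrics(rubric=[RubricItem(name="helpfulness")]),
#         grader_llm=GraderLlm(
#             function_name="open_ai_completion",  # hypothetical name
#             kwargs={"model": "gpt-4o-mini"},
#         ),
#     )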