# originally generated by datamodel-codegen:
# filename: src/flexeval/eval_schema.json
# timestamp: 2025-05-19T21:42:39+00:00
from __future__ import annotations
import sys
from typing import Any, Dict, List, Literal, Optional
from pydantic import BaseModel, Field
from flexeval.schema import schema_utils
VALID_METRIC_LEVELS = ["Message", "Turn", "Thread", "ToolCall"]
MetricLevel = Literal["Message", "Turn", "Thread", "ToolCall"]
class DependsOnItem(BaseModel):
    class Config:
        extra = "forbid"

    name: Optional[str] = Field(
        None, description="Name of the dependency function or rubric."
    )
    type: Optional[Literal["function", "rubric"]] = Field(
        None,
        description="One of 'function' or 'rubric' indicating the type of the dependency.",
    )
    kwargs: Optional[Dict[str, Any]] = Field(
        None,
        description="The keyword arguments for the dependency. If provided, these are used to match which evaluation this dependency refers to, so they must match the keyword arguments given for some evaluation.",
    )
    metric_name: Optional[str] = Field(
        None,
        description="Name of the metric dependency. This may differ from the function name if the metric function returns a key/value pair, in which case this should match the key.",
    )
    metric_level: Optional[MetricLevel] = Field(
        None,
        description="The level of the metric to depend on, which must be equal to or 'greater' than the dependent metric's level, e.g. a Turn can depend on a Thread metric, but not the reverse.",
    )
    relative_object_position: int = Field(
        0,
        le=0,
        strict=True,
        description="The position of the object within the Thread. If 0 (default), this is the metric value for the current object. If -1, this is the metric value for the most recent object before this one.",
    )
    # TODO we could consider adding an absolute_object_position
    metric_min_value: Optional[float] = Field(
        -sys.float_info.max,
        description="Minimum value of the dependency for it to be considered satisfied.",
    )
    metric_max_value: Optional[float] = Field(
        sys.float_info.max,
        description="Maximum value of the dependency for it to be considered satisfied.",
    )
class MetricItem(BaseModel):
    name: str = Field(
        ...,
        description="The function to call or the name of the rubric to use to compute this metric.",
    )
    depends_on: Optional[List[DependsOnItem]] = Field(
        default_factory=list,
        description="List of dependencies that must be satisfied for this metric to be computed.",
    )
    # TODO why is metric_level optional? Should likely make it required
    metric_level: Optional[MetricLevel] = Field(
        "Turn",
        description="The level of granularity (ToolCall, Message, Turn, or Thread) at which this metric should be applied.",
    )
class FunctionItem(MetricItem):
    kwargs: schema_utils.OptionalDict = Field(
        default_factory=dict,
        description="Keyword arguments for the function. Each key must correspond to an argument in the function. Extra keys will cause an error.",
    )
    # TODO add the ability to provide a function source: Path | FunctionsCollection | schema_utils.ModuleType
class RubricItem(MetricItem):
    # TODO is RubricItem.kwargs actually used?
    kwargs: Optional[Dict[str, Any]] = Field(
        default_factory=dict,
        description="Keyword arguments for the rubric evaluation.",
    )
    # TODO add the ability to provide a rubric source: Path | RubricsCollection
class Metrics(BaseModel):
    """Defines the metrics to be evaluated."""

    function: Optional[List[FunctionItem]] = Field(
        None, description="List of function-based metrics to be evaluated."
    )
    rubric: Optional[List[RubricItem]] = Field(
        None, description="List of rubrics to be evaluated."
    )
class CompletionLlm(BaseModel):
    class Config:
        extra = "forbid"

    function_name: str = Field(
        ...,
        description="Completion function defined in `completion_functions.py` or available in the global namespace.",
    )
    include_system_prompt: bool = True
    kwargs: Dict[str, Any] = Field(
        default_factory=dict,
        description="Additional arguments that will be passed to the completion function. Must correspond to arguments in the named function.",
    )
class GraderLlm(BaseModel):
    class Config:
        extra = "forbid"

    function_name: str = Field(
        ...,
        description="Function defined in `completion_functions.py`. We're not really completing a conversation, but we ARE asking an LLM to provide a response to an input - in this case, the rubric.",
    )
    kwargs: Dict[str, Any] = Field(
        default_factory=dict,
        description="Additional arguments that will be passed to the completion function. Must correspond to arguments in the named function.",
    )
class Eval(BaseModel):
    """Defines the evaluation that should be executed.

    The key fields are :attr:`metrics` and :attr:`grader_llm`.
    """

    class Config:
        # TODO don't permit additional fields in Eval
        extra = "allow"

    do_completion: bool = Field(
        False,
        description="Flag to determine if completions should be done in each thread. Set to 'true' if you are testing a new API and want to evaluate the API responses. Set to 'false' (default) if you are evaluating past conversations and do not need to generate new completions.",
    )
    name: Optional[str] = Field(
        None,
        description="Name of the test suite. Used as metadata only. Does not need to match the key of the entry in the evals.yaml file.",
    )
    notes: str = Field(
        "",
        description="Additional notes regarding the configuration. Used as metadata only.",
    )
    metrics: Metrics = Field(
        default_factory=Metrics, description="Metrics to use in the evaluation."
    )
    completion_llm: Optional[CompletionLlm] = Field(
        None,
        description="Specification of the LLM or API used to perform new completions. Must be defined if `do_completion: true` is set.",
    )
    grader_llm: Optional[GraderLlm] = Field(
        None,
        description="Specification of the LLM or API used to grade rubrics. Must be defined if any rubric metrics are specified.",
    )