Source code for flexeval.schema.eval_schema

# originally generated by datamodel-codegen:
#   filename:  src/flexeval/eval_schema.json
#   timestamp: 2025-05-19T21:42:39+00:00

from __future__ import annotations

import sys
from typing import Any, Dict, List, Literal, Optional

from pydantic import BaseModel, Field

from flexeval.schema import schema_utils

VALID_METRIC_LEVELS = ["Message", "Turn", "Thread", "ToolCall"]
MetricLevel = Literal["Message", "Turn", "Thread", "ToolCall"]


class DependsOnItem(BaseModel):
    class Config:
        extra = "forbid"

    name: Optional[str] = Field(
        None, description="Name of the dependency function or rubric."
    )
    type: Optional[Literal["function", "rubric"]] = Field(
        None,
        description="One of 'function' or 'rubric' indicating the type of the dependency.",
    )
    kwargs: Optional[Dict[str, Any]] = Field(
        None,
        description="The keyword arguments for the dependency. If provided, used to match which evaluation this dependency is for, so must match the keyword args given for some evaluation.",
    )
    metric_name: Optional[str] = Field(
        None,
        description="Name of the metric dependency. This may be different than function_name if the metric function returns a key/value pair - in which case, this will match the key.",
    )
    metric_level: Optional[MetricLevel] = Field(
        None,
        description="The level of the metric to depend on, which must be equal to or 'greater' than the dependent metric's level, e.g. a Turn can depend on a Thread metric, but not the reverse.",
    )
    relative_object_position: int = Field(
        0,
        le=0,
        strict=True,
        description="The position of the object within the Thread. If 0 (default), this is the metric value for the current object. If -1, this is the metric value for the most recent object before this one.",
    )
    # TODO we could consider adding an absolute_object_position
    metric_min_value: Optional[float] = Field(
        -sys.float_info.max,
        description="Minimum value of the dependency to consider it as satisfied.",
    )
    metric_max_value: Optional[float] = Field(
        sys.float_info.max,
        description="Maximum value of the dependency to consider it as satisfied.",
    )
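
# Illustrative usage (not part of the generated schema): a dependency that is
# satisfied only when a hypothetical Thread-level metric "is_on_topic", produced
# by a function named "topic_check", has a value of at least 1 for the current
# object. The field names come from DependsOnItem above; the metric and function
# names are placeholders.
#
#     dep = DependsOnItem(
#         name="topic_check",
#         type="function",
#         metric_name="is_on_topic",
#         metric_level="Thread",
#         metric_min_value=1.0,
#     )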
class MetricItem(BaseModel):
    name: str = Field(
        ...,
        description="The function to call or name of rubric to use to compute this metric.",
    )
    depends_on: Optional[List[DependsOnItem]] = Field(
        default_factory=list,
        description="List of dependencies that must be satisfied for this metric to be computed.",
    )
    # TODO why is metric_level optional? Should likely make it required
    metric_level: Optional[MetricLevel] = Field(
        "Turn",
        description="What level of granularity (ToolCall, Message, Turn, or Thread) this rubric should be applied to",
    )
class FunctionItem(MetricItem):
    kwargs: schema_utils.OptionalDict = Field(
        default_factory=dict,
        description="Keyword arguments for the function. Each key must correspond to an argument in the function. Extra keys will cause an error.",
    )
    # TODO add the ability to provide a function source: Path | FunctionsCollection | schema_utils.ModuleType
class RubricItem(MetricItem):
    # TODO is RubricItem.kwargs actually used?
    kwargs: Optional[Dict[str, Any]] = Field(
        default_factory=dict,
        description="Keyword arguments for the rubric evaluation.",
    )
    # TODO add the ability to provide a rubric source: Path | RubricsCollection
class Metrics(BaseModel):
    """Defines the metrics to be evaluated."""

    function: Optional[List[FunctionItem]] = Field(
        None, description="List of function-based metrics to be evaluated."
    )
    rubric: Optional[List[RubricItem]] = Field(
        None, description="List of rubrics to be evaluated."
    )
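
# Illustrative usage (not part of the generated schema): a Metrics block with one
# function-based metric and one rubric. "string_length" and "helpfulness" are
# placeholder names; real names must resolve to an available metric function or
# rubric.
#
#     metrics = Metrics(
#         function=[FunctionItem(name="string_length", metric_level="Message")],
#         rubric=[RubricItem(name="helpfulness", metric_level="Turn")],
#     )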
class CompletionLlm(BaseModel):
    class Config:
        extra = "forbid"

    function_name: str = Field(
        ...,
        description="Completion function defined in `completion_functions.py` or available in the global namespace.",
    )
    include_system_prompt: bool = True
    kwargs: Dict[str, Any] = Field(
        default_factory=dict,
        description="Additional arguments that will be passed to the completion function. Must correspond to arguments in the named function.",
    )
class GraderLlm(BaseModel):
    class Config:
        extra = "forbid"

    function_name: str = Field(
        ...,
        description="Function defined in `completion_functions.py`. We're not really completing a conversation, but we ARE asking an LLM to provide a response to an input - in this case, the rubric.",
    )
    kwargs: Dict[str, Any] = Field(
        default_factory=dict,
        description="Additional arguments that will be passed to the completion function. Must correspond to arguments in the named function.",
    )
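
# Illustrative usage (not part of the generated schema): CompletionLlm and
# GraderLlm are configured the same way. The function name and kwargs below are
# placeholders and must match a function defined in `completion_functions.py`
# (or, for CompletionLlm, one available in the global namespace).
#
#     completion_llm = CompletionLlm(
#         function_name="open_ai_completion",  # hypothetical name
#         kwargs={"model": "gpt-4o-mini"},
#     )
#     grader_llm = GraderLlm(
#         function_name="open_ai_completion",  # hypothetical name
#         kwargs={"model": "gpt-4o-mini"},
#     )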
class Eval(BaseModel):
    """Defines the evaluation that should be executed.

    The key fields are :attr:`metrics` and :attr:`grader_llm`.
    """

    class Config:
        # TODO don't permit additional fields in Eval
        extra = "allow"

    do_completion: bool = Field(
        False,
        description="Flag to determine if completions should be done in each thread. Set to 'true' if you are testing a new API and want to evaluate the API responses. Set to 'false' (default) if you are evaluating past conversations and do not need to generate new completions.",
    )
    name: Optional[str] = Field(
        None,
        description="Name of the test suite. Used as metadata only. Does not need to match the key of the entry in the evals.yaml file.",
    )
    notes: str = Field(
        "",
        description="Additional notes regarding the configuration. Used as metadata only.",
    )
    metrics: Metrics = Field(
        default_factory=Metrics, description="Metrics to use in the evaluation."
    )
    completion_llm: Optional[CompletionLlm] = Field(
        None,
        description="Specification of the LLM or API used to perform new completions. Must be defined if `do_completion: true` is set.",
    )
    grader_llm: Optional[GraderLlm] = Field(
        None,
        description="Specification of the LLM or API used to grade rubrics. Must be defined if any rubric metrics are specified.",
    )
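
# Illustrative usage (not part of the generated schema): a minimal Eval that
# grades past conversations against a single rubric, so no completion_llm is
# needed. All names are placeholders; in practice this configuration would come
# from an entry in the evals.yaml file.
#
#     eval_config = Eval(
#         name="example-eval",
#         notes="Sketch only; rubric and function names are hypothetical.",
#         do_completion=False,
#         metrics=Metrics(rubric=[RubricItem(name="helpfulness")]),
#         grader_llm=GraderLlm(
#             function_name="open_ai_completion",  # hypothetical name
#             kwargs={"model": "gpt-4o-mini"},
#         ),
#     )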