Source code for flexeval.classes.metric
import peewee as pw
from flexeval.classes.base import BaseModel
from flexeval.classes.dataset import Dataset
from flexeval.classes.eval_set_run import EvalSetRun
from flexeval.classes.message import Message
from flexeval.classes.thread import Thread
from flexeval.classes.tool_call import ToolCall
from flexeval.classes.turn import Turn
class Metric(BaseModel):
"""Holds a single metric/property computed based one just ONE turn"""
    id = pw.IntegerField(primary_key=True)
    evalsetrun = pw.ForeignKeyField(EvalSetRun, backref="metrics_list")
    dataset = pw.ForeignKeyField(Dataset, backref="metrics_list")
    thread = pw.ForeignKeyField(Thread, backref="metrics_list")
    turn = pw.ForeignKeyField(
        Turn, null=True, backref="metrics_list"
    )  # Only defined for Turn metrics
    message = pw.ForeignKeyField(
        Message, null=True, backref="metrics_list"
    )  # Only defined for Message metrics
    toolcall = pw.ForeignKeyField(
        ToolCall, null=True, backref="metrics_list"
    )  # Only defined for ToolCall metrics
    evaluation_name = pw.TextField()
    evaluation_type = pw.TextField()
    metric_name = pw.TextField()
    # metric_type = pw.TextField()  # TODO: Some parts of the code use "metric_type" and others use "evaluation_type" - choose one for consistency
    metric_level = pw.TextField()
    # TODO: we may want to consider adding a secondary metric_nonnumeric_value field to support non-numeric functions and rubrics
    metric_value = pw.FloatField(
        null=True
    )  # nullable because e.g. a rubric result can be INVALID, or latency doesn't apply to the very first message
    kwargs = pw.TextField()
    # context_only allows us to create another kind of dependency,
    # where we can quantify something about the previous conversation
    # and then use that quantity in a downstream analysis.
    # e.g. 'would a plot be pedagogically appropriate here?' is really a question about the PAST of the conversation,
    # and 'was the conversation ever flagged by the moderation api?' is a question about the previous turns
    # that might give us better context for the properties of this turn.
    # NOTE: we have gotten rid of context_only for rubrics, where only {context} is used, so technically 'context_only' is False there.
    # context_only = pw.BooleanField(default=False)
    source = pw.TextField()  # TODO: make another table for this? But maybe not, because this also contains filled-in rubrics
    depends_on = pw.TextField()
    rubric_prompt = pw.TextField(null=True)
    rubric_completion = pw.TextField(null=True)
    rubric_model = pw.TextField(null=True)
    rubric_completion_tokens = pw.IntegerField(null=True)
    rubric_prompt_tokens = pw.IntegerField(null=True)
    rubric_score = pw.TextField(null=True)
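
Example usage (an illustrative sketch, not part of the flexeval source): assuming the peewee database bound via BaseModel has been initialized and that an EvalSetRun, Dataset, Thread, and Turn row already exist, a turn-level metric could be recorded and aggregated roughly as below. The ids, names, and values are hypothetical.

from peewee import fn

run = EvalSetRun.get_by_id(1)
dataset = Dataset.get_by_id(1)
thread = Thread.get_by_id(1)
turn = Turn.get_by_id(1)

# Record one turn-level metric value. kwargs, source, and depends_on are
# non-null TextFields, so they must be supplied (typically as JSON/text).
metric = Metric.create(
    evalsetrun=run,
    dataset=dataset,
    thread=thread,
    turn=turn,
    evaluation_name="toxicity_check",   # hypothetical evaluation name
    evaluation_type="function",
    metric_name="is_toxic",
    metric_level="Turn",
    metric_value=0.0,
    kwargs="{}",
    source="def is_toxic(turn): ...",   # hypothetical source text
    depends_on="[]",
)

# Average each metric's value across the run.
query = (
    Metric.select(Metric.metric_name, fn.AVG(Metric.metric_value).alias("avg_value"))
    .where(Metric.evalsetrun == run)
    .group_by(Metric.metric_name)
)
for row in query:
    print(row.metric_name, row.avg_value)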