# Source code for flexeval.classes.metric

import peewee as pw

from flexeval.classes.base import BaseModel
from flexeval.classes.dataset import Dataset
from flexeval.classes.eval_set_run import EvalSetRun
from flexeval.classes.message import Message
from flexeval.classes.thread import Thread
from flexeval.classes.tool_call import ToolCall
from flexeval.classes.turn import Turn


class Metric(BaseModel):
    """Holds a single metric/property computed based on just ONE turn.

    Each row records one computed metric value together with foreign keys
    back to the evaluation run, dataset, and thread it came from. Exactly
    one of ``turn``, ``message``, or ``toolcall`` is populated, depending
    on the granularity (``metric_level``) the metric was computed at.
    """

    id = pw.IntegerField(primary_key=True)
    evalsetrun = pw.ForeignKeyField(EvalSetRun, backref="metrics_list")
    dataset = pw.ForeignKeyField(Dataset, backref="metrics_list")
    thread = pw.ForeignKeyField(Thread, backref="metrics_list")
    turn = pw.ForeignKeyField(
        Turn, null=True, backref="metrics_list"
    )  # Only defined for Turn metrics
    message = pw.ForeignKeyField(
        Message, null=True, backref="metrics_list"
    )  # Only defined for Message metrics
    toolcall = pw.ForeignKeyField(
        ToolCall, null=True, backref="metrics_list"
    )  # Only defined for ToolCall metrics
    evaluation_name = pw.TextField()
    evaluation_type = pw.TextField()
    metric_name = pw.TextField()
    # metric_type = pw.TextField()
    # TODO: Some parts of the code use "metric_type" and others use "evaluation_type" - choose one for consistency
    metric_level = pw.TextField()
    # TODO: we may want to consider adding a secondary metric_nonnumeric_value
    # field to support non-numeric functions and rubrics
    metric_value = pw.FloatField(
        null=True
    )  # necessary if rubric result is INVALID or e.g. latency doesn't apply to the very first message
    kwargs = pw.TextField()
    # context_only allows us to create another kind of dependency
    # where we can quantify something about the previous conversation
    # and then use that quantity in a downstream analysis
    # e.g. 'would a plot be pedagogically appropriate here' is really a question about the PAST of the conversation
    # NOTE: but we have gotten rid of context_only for rubrics, where only {context} is used so technically here 'context_only' is False
    # or 'was the conversation ever flagged by the moderation api' would be a question about the previous turns that might
    # allow to have better context for the properties of this turn
    # context_only = pw.BooleanField(default=False)
    source = pw.TextField()
    # TODO - make another table for this? But maybe not, because this also contains filled-in rubrics
    depends_on = pw.TextField()
    # Rubric-specific bookkeeping; all nullable because non-rubric metrics
    # (e.g. plain functions) never populate them.
    rubric_prompt = pw.TextField(null=True)
    rubric_completion = pw.TextField(null=True)
    rubric_model = pw.TextField(null=True)
    rubric_completion_tokens = pw.IntegerField(null=True)
    rubric_prompt_tokens = pw.IntegerField(null=True)
    rubric_score = pw.TextField(null=True)