Source code for flexeval.run_utils

"""Utilities for :mod:`~flexeval.runner`."""

import json
import logging

from flexeval import rubric
from flexeval.classes.dataset import Dataset
from flexeval.classes.eval_runner import EvalRunner
from flexeval.classes.eval_set_run import EvalSetRun

logger = logging.getLogger(__name__)


def build_eval_set_run(runner: EvalRunner) -> EvalSetRun:
    """Create the :class:`EvalSetRun` record describing this evaluation run."""
    rubrics = rubric.load_rubric_metrics(runner.evalrun.rubric_paths)
    # TODO: this code uses a model_name that does not appear in the Eval schema;
    # should look into this
    model_name = json.dumps(None)
    # model_name = json.dumps(
    #     runner.eval.get("completion_llm", {}).get("model_name", None)
    # )
    evalsetrun = EvalSetRun.create(
        name=runner.evalrun.eval.name,
        notes=runner.evalrun.eval.notes,
        metrics=runner.evalrun.eval.metrics.model_dump_json(),
        metrics_graph_ordered_list=json.dumps(runner.metrics_graph_ordered_list),
        dataset_files=json.dumps(
            [str(data_source.path) for data_source in runner.evalrun.data_sources]
        ),
        do_completion=runner.evalrun.eval.do_completion,
        completion_llm=(
            runner.evalrun.eval.completion_llm.model_dump_json()
            if runner.evalrun.eval.completion_llm is not None
            else json.dumps(None)
        ),
        model_name=model_name,
        grader_llm=(
            runner.evalrun.eval.grader_llm.model_dump_json()
            if runner.evalrun.eval.grader_llm is not None
            else json.dumps(None)
        ),
        # only save rubrics that will actually be used
        rubrics=json.dumps(
            {
                i["evaluation_name"]: rubrics[i["evaluation_name"]].model_dump()
                for i in runner.metrics_graph_ordered_list
                if i["evaluation_type"] == "rubric"
            }
        ),
    )
    return evalsetrun
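
# Illustrative note (not part of the original module): each entry of
# ``runner.metrics_graph_ordered_list`` is assumed to carry at least the
# ``evaluation_name`` and ``evaluation_type`` keys used above, so with e.g.
#
#     metrics_graph_ordered_list = [
#         {"evaluation_name": "helpfulness", "evaluation_type": "rubric"},
#         {"evaluation_name": "turn_count", "evaluation_type": "function"},
#     ]
#
# only the "helpfulness" rubric definition would be serialized into the
# ``rubrics`` field; the non-rubric entry name and type here are hypothetical.
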
def build_datasets(runner: EvalRunner, evalsetrun: EvalSetRun):
    """Create a :class:`Dataset` record for each dataset file of ``evalsetrun``."""
    for filename in evalsetrun.get_datasets():
        # these will automatically be saved as a property of evalsetrun
        Dataset.create(
            evalsetrun=evalsetrun,
            filename=filename,
            max_n_conversation_threads=runner.evalrun.config.max_n_conversation_threads,
            nb_evaluations_per_thread=runner.evalrun.config.nb_evaluations_per_thread,
        )
        runner.logger.info(
            f"Created dataset from {filename}. "
            f"Max number of conversation threads: {runner.evalrun.config.max_n_conversation_threads} - "
            f"Nb of evaluations per thread: {runner.evalrun.config.nb_evaluations_per_thread}"
        )
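
# Example usage (illustrative sketch only; how the EvalRunner is constructed is
# an assumption and not shown in this module):
#
#     runner = EvalRunner(...)                 # configured/loaded elsewhere
#     evalsetrun = build_eval_set_run(runner)  # persist run-level metadata
#     build_datasets(runner, evalsetrun)       # register one Dataset per file
#
# build_eval_set_run records the run-level configuration, and build_datasets
# then creates one Dataset row per file returned by evalsetrun.get_datasets().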