Source code for flexeval.runner
"""Convenience functions for running an Eval Run."""
import logging
import random as rd
from pathlib import Path
import flexeval.metrics
from flexeval import completions, compute_metrics, run_utils
from flexeval.classes.eval_runner import EvalRunner
from flexeval.io.parsers import yaml_parser
from flexeval.schema import EvalRun, FileDataSource
logger = logging.getLogger(__name__)
# Levels of abstraction:
#   Dataset
#   Thread
#   Turn
#   Message
#   ToolCall
#   Metric

# Features to add:
# - allow comparison with 'ideal' responses


def run_from_name_args(
    input_data: list[Path],
    database_path: Path,
    eval_name: str,
    config_path: str,
    evals_path: str,
    **kwargs,
):
    """Build an EvalRun for the named eval and run it.

    Loads the config and eval definitions from YAML, selects the eval by
    name, applies any keyword arguments as config overrides, and delegates
    to run().
    """
    data_sources = [FileDataSource(path=input_path) for input_path in input_data]
    config = yaml_parser.load_config_from_yaml(config_path)
    evals = yaml_parser.load_evals_from_yaml(evals_path)
    if eval_name not in evals:
        raise ValueError(
            f"Eval name {eval_name} not in defined evals: {list(evals.keys())}"
        )
    selected_eval = evals[eval_name]
    if selected_eval.name is None or selected_eval.name.strip() == "":
        selected_eval.name = eval_name
    for key, value in kwargs.items():
        setattr(config, key, value)
    eval_run = EvalRun(
        data_sources=data_sources,
        database_path=database_path,
        eval=selected_eval,
        config=config,
    )
    return run(eval_run)
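
# Usage sketch (not part of the module): one plausible way to call
# run_from_name_args. The file paths and eval name below are hypothetical
# placeholders, not files that ship with flexeval.
#
#     run_from_name_args(
#         input_data=[Path("data/conversations.jsonl")],
#         database_path=Path("results/eval.db"),
#         eval_name="my_eval",
#         config_path="config.yaml",
#         evals_path="evals.yaml",
#     )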


def run(eval_run: EvalRun) -> EvalRunner:
    """Runs the evaluations."""
    runner = EvalRunner(eval_run)

    #######################################################
    ############ Create Eval Set Run ######################
    #######################################################
    try:
        runner.logger.info("Creating EvalSetRun")
        # TODO instead of raw 'metrics', pass in graph created when setting up the runner
        evalsetrun = run_utils.build_eval_set_run(runner)
        runner.logger.info(f"Metric graph: {evalsetrun.metrics_graph_ordered_list}")
    except Exception:
        runner.logger.exception(
            "An error occurred creating the EvalSetRun.", exc_info=True
        )
        runner.shutdown_logging()
        raise

    #######################################################
    ############ Load and Parse Data ######################
    #######################################################
    try:
        runner.logger.debug("Loading data")
        # set random seed
        rd_seed = runner.evalrun.config.random_seed_conversation_sampling
        rd.seed(rd_seed)
        runner.logger.info(f"Set random seed to {rd_seed}")
        run_utils.build_datasets(runner, evalsetrun)
    except Exception:
        runner.logger.exception(
            "An error occurred creating dataset metadata.", exc_info=True
        )

    try:
        runner.logger.info("Parsing data files")
        for dataset in evalsetrun.datasets:
            runner.logger.debug(f"Loading data from {dataset.filename}")
            dataset.load_data()
    except Exception:
        runner.logger.exception("An error occurred loading data.", exc_info=True)

    # Do completions, if necessary
    try:
        if evalsetrun.do_completion:
            # We do this by creating new turns
            runner.logger.info("Generating completions")
            completions.get_completions(eval_run, evalsetrun)
    except Exception:
        runner.logger.exception(
            "An error occurred generating completions.", exc_info=True
        )
        if eval_run.config.raise_on_completion_error:
            runner.shutdown_logging()
            raise

    #######################################################
    ############ Compute Metrics ##########################
    #######################################################
    try:
        metrics = compute_metrics.compute_metrics(eval_run, evalsetrun)
        runner.logger.info(f"Saving {len(metrics)} metrics to database.")
        flexeval.metrics.save.save_metrics(metrics)
    except Exception:
        runner.logger.exception("An error occurred computing metrics.", exc_info=True)
        if eval_run.config.raise_on_metric_error:
            runner.shutdown_logging()
            raise

    runner.logger.info("Evaluation run complete.")
    runner.shutdown_logging()
    return runner
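
# Usage sketch (not part of the module): constructing an EvalRun directly and
# passing it to run(). Paths and names are hypothetical placeholders; only the
# EvalRun fields already used in run_from_name_args above are shown.
#
#     config = yaml_parser.load_config_from_yaml("config.yaml")
#     evals = yaml_parser.load_evals_from_yaml("evals.yaml")
#     eval_run = EvalRun(
#         data_sources=[FileDataSource(path=Path("data/conversations.jsonl"))],
#         database_path=Path("results/eval.db"),
#         eval=evals["my_eval"],
#         config=config,
#     )
#     runner = run(eval_run)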