Basic rubric usage
This vignette adds a rubric metric and a grader LLM. A rubric is defined by a prompt template and a map from the grader's string outputs (its choices) to numeric scores.
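For example, a rubric is not limited to a yes/no choice; a graded scale simply maps more choice strings to scores. The sketch below is illustrative only (the three-point scale and its wording are not part of this vignette), using the Rubric schema from the listing that follows:

from flexeval.schema import Rubric

# Hypothetical three-point scale: the grader must answer with one of the keys,
# and the matching value is recorded as the metric score.
quality_rubric = Rubric(
    prompt="Rate the response as EXCELLENT, OK, or POOR.",
    choice_scores={"EXCELLENT": 2, "OK": 1, "POOR": 0},
)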
Python source: basic_rubric.py
import flexeval
from flexeval.metrics import access
from flexeval.schema import (
    Config,
    DependsOnItem,
    Eval,
    EvalRun,
    FileDataSource,
    FunctionItem,
    GraderLlm,
    Metrics,
    Rubric,
    RubricItem,
    RubricsCollection,
)

data_sources = [FileDataSource(path="vignettes/conversations.jsonl")]

# The grader answers YES or NO; the matching value becomes the metric score.
rubric = Rubric(
    prompt="Answer YES if the response is helpful, NO otherwise.",
    choice_scores={"YES": 1, "NO": 0},
)
# using a placeholder grader, but you can use any supported completion function
grader_llm = GraderLlm(function_name="echo_completion", kwargs={"response": "YES"})

# Gate dependent metrics so they only run on assistant turns
# (the is_role metric must score at least 1).
is_assistant_dependency = DependsOnItem(
    name="is_role", kwargs={"role": "assistant"}, metric_min_value=1
)

eval = Eval(
    name="basic_eval",
    metrics=Metrics(
        function=[
            FunctionItem(name="is_role", kwargs={"role": "assistant"}),
            FunctionItem(
                name="flesch_reading_ease",
                depends_on=[is_assistant_dependency],
            ),
        ],
        rubric=[RubricItem(name="is_helpful", depends_on=[is_assistant_dependency])],
    ),
    grader_llm=grader_llm,
)

config = Config(clear_tables=True, logs_path="tmp")
eval_run = EvalRun(
    data_sources=data_sources,
    database_path="eval_results.db",
    eval=eval,
    config=config,
    rubric_paths=[RubricsCollection(rubrics={"is_helpful": rubric})],
)

flexeval.run(eval_run)

# Print one row per computed metric value.
for metric in access.get_all_metrics():
    print(
        f"{metric['thread']} {metric['turn']} {metric['evaluation_name']} {metric['metric_value']:.1f}"
    )
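If you only want the rubric scores, the records returned by access.get_all_metrics() can be filtered by evaluation name. This is a minimal sketch assuming the same record keys used in the print loop above:

rubric_scores = [
    m for m in access.get_all_metrics() if m["evaluation_name"] == "is_helpful"
]
for m in rubric_scores:
    print(f"thread {m['thread']}, turn {m['turn']}: {m['metric_value']:.1f}")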
conversations.jsonl contents:
1{"input": [{"role": "system", "content": "Be friendly and helpful."}, {"role": "user", "content": "I need help."}, {"role": "assistant", "content": "Help with what?"}, {"role": "user", "content": "My homework."}]}