Multiple configurations
If you name a dataset, even an in-memory IterableDataSource, you can reuse it in multiple runs with different configurations.
Python source: multiple_configs.py
"""Evaluate one named, in-memory dataset in two runs with different metric configurations."""

import flexeval
from flexeval.metrics import access
from flexeval.schema import (
    Config,
    Eval,
    EvalRun,
    FunctionItem,
    IterableDataSource,
    NamedDataSource,
    Metrics,
)

# You can load conversations directly from Python using IterableDataSource.
# Each thread is a dict with an "input" key containing a list of messages.
conversations = [
    {
        "input": [
            {"role": "user", "content": "What is 2+2?"},
            {"role": "assistant", "content": "The answer is 4."},
            {"role": "user", "content": "Thanks!"},
            {
                "role": "assistant",
                "content": "You're welcome! Let me know if you have more questions.",
            },
        ]
    },
    {
        "input": [
            {"role": "user", "content": "Can you help me write a regex?"},
            {
                "role": "assistant",
                "content": "Sure! What pattern do you need to match?",
            },
        ]
    },
]

# Name the data source so it can be reused across eval runs.
# The data is loaded into the database on the first run and reused on the second.
data_sources = [IterableDataSource(name="test_conversations", contents=conversations)]

# --- Run 1: index_in_thread ---
# Computes the position of each turn within a thread.
eval_run_1 = EvalRun(
    data_sources=data_sources,
    database_path="eval_results.db",
    eval=Eval(metrics=Metrics(function=[FunctionItem(name="index_in_thread")])),
    config=Config(clear_tables=True),
)
flexeval.run(eval_run_1)
print("=== Run 1: index_in_thread ===")
for metric in access.get_all_metrics():
    print(
        f" thread={metric['thread']} turn={metric['turn']}"
        f" {metric['metric_name']}={metric['metric_value']}"
    )

# --- Run 2: message_matches_regex ---
# Counts question marks in each message.
# The dataset "test_conversations" is reused from Run 1 by specifying a
# NamedDataSource with the same name.
# (You could also reuse the same IterableDataSource object created earlier.)
eval_run_2 = EvalRun(
    data_sources=[NamedDataSource(name="test_conversations")],
    # Same database file as Run 1, which is where the named dataset was loaded.
    database_path="eval_results.db",
    eval=Eval(
        metrics=Metrics(
            function=[
                FunctionItem(
                    name="message_matches_regex",
                    metric_level="Message",
                    kwargs={"expression": r"\?"},
                )
            ]
        )
    ),
    # NOTE(review): clear_tables=False presumably preserves the tables written
    # by Run 1 so the named dataset is still available -- confirm against the
    # flexeval Config documentation.
    config=Config(clear_tables=False),
)
flexeval.run(eval_run_2)
print("\n=== Run 2: message_matches_regex (question marks) ===")
# Only the rows whose evaluation_name is this run's function are printed.
for metric in access.get_all_metrics():
    if metric["evaluation_name"] == "message_matches_regex":
        print(
            f" thread={metric['thread']} message={metric['message']}"
            f" {metric['metric_name']}={metric['metric_value']}"
        )