Multiple configurations#

If you name a dataset, even an in-memory IterableDataSource, you can reuse it in multiple runs with different configurations.

Python source: multiple_configs.py

 1import flexeval
 2from flexeval.metrics import access
 3from flexeval.schema import (
 4    Config,
 5    Eval,
 6    EvalRun,
 7    FunctionItem,
 8    IterableDataSource,
 9    NamedDataSource,
10    Metrics,
11)
12
13# You can load conversations directly from Python using IterableDataSource
14# Each thread is a dict with an "input" key containing a list of messages
15conversations = [
16    {
17        "input": [
18            {"role": "user", "content": "What is 2+2?"},
19            {"role": "assistant", "content": "The answer is 4."},
20            {"role": "user", "content": "Thanks!"},
21            {
22                "role": "assistant",
23                "content": "You're welcome! Let me know if you have more questions.",
24            },
25        ]
26    },
27    {
28        "input": [
29            {"role": "user", "content": "Can you help me write a regex?"},
30            {
31                "role": "assistant",
32                "content": "Sure! What pattern do you need to match?",
33            },
34        ]
35    },
36]
37
38# Name the data source so it can be reused across eval runs.
39# The data is loaded into the database on the first run and reused on the second.
40data_sources = [IterableDataSource(name="test_conversations", contents=conversations)]
41
42# --- Run 1: index_in_thread ---
43# Computes the position of each turn within a thread.
44eval_run_1 = EvalRun(
45    data_sources=data_sources,
46    database_path="eval_results.db",
47    eval=Eval(metrics=Metrics(function=[FunctionItem(name="index_in_thread")])),
48    config=Config(clear_tables=True),
49)
50flexeval.run(eval_run_1)
51print("=== Run 1: index_in_thread ===")
52for metric in access.get_all_metrics():
53    print(
54        f"  thread={metric['thread']} turn={metric['turn']}"
55        f" {metric['metric_name']}={metric['metric_value']}"
56    )
57
58# --- Run 2: message_matches_regex ---
59# Counts question marks in each message.
60# The dataset "test_conversations" is reused from Run 1 by specifying a NamedDataSource with the same name.
61# (You could also reuse the same IterableDataSource object created earlier.)
62eval_run_2 = EvalRun(
63    data_sources=[NamedDataSource(name="test_conversations")],
64    database_path="eval_results.db",
65    eval=Eval(
66        metrics=Metrics(
67            function=[
68                FunctionItem(
69                    name="message_matches_regex",
70                    metric_level="Message",
71                    kwargs={"expression": r"\?"},
72                )
73            ]
74        )
75    ),
76    config=Config(clear_tables=False),
77)
78flexeval.run(eval_run_2)
79print("\n=== Run 2: message_matches_regex (question marks) ===")
80for metric in access.get_all_metrics():
81    if metric["evaluation_name"] == "message_matches_regex":
82        print(
83            f"  thread={metric['thread']} message={metric['message']}"
84            f" {metric['metric_name']}={metric['metric_value']}"
85        )