Source code for flexeval.classes.dataset
import os.path
import peewee as pw
from flexeval.classes.base import BaseModel
from flexeval.classes.eval_set_run import EvalSetRun
class Dataset(BaseModel):
"""Holds a dataset, e.g. a jsonl file"""
id = pw.IntegerField(primary_key=True)
evalsetrun = pw.ForeignKeyField(EvalSetRun, backref="datasets")
filename = pw.TextField()
datatype = pw.TextField(null=True)
contents = pw.TextField(null=True) # raw contents
max_n_conversation_threads = pw.IntegerField(null=True)
nb_evaluations_per_thread = pw.IntegerField(null=True, default=1)
    # In line with LangGraph expectations, we assume n=1 for all outputs of LLMs.
    # However, each node can append a list of length 2+ to the message queue.
    #
    # Thread - a conversation
    # Turn - adjacent messages from the same agent
    # Message
    #     role - human or ai, user or assistant
    #     text - empty string or non-empty
    #     list of 0+ tool calls
    #     post-processing - add a turn_id
    #     additional_kwargs JSON
    # ToolUse
    #     foreign keys to the "invoker" message and the "function output" message
    #     message that invoked it - foreign key
    #     parameters of the input
    #     result of the tool call
    # Metric
    #     granularity type
    #     foreign key to the object
    #
    # **Each entry from LangGraph is a LIST of completions - usually of length 1.
    # Completion - has one piece of text content and 0+ ToolCalls
    # ToolCall - tool call (and response!) associated with the completion
    #     completion_id
    #     message_id
    #     turn_id
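
    # Illustrative sketch (not from the source) of how one conversation maps onto
    # the hierarchy above; the field names shown are placeholders, not the actual
    # schema of the Message/ToolUse classes:
    #
    #   Thread (one conversation)
    #     Turn 1 (user):       Message(role="user", text="What's the weather?")
    #     Turn 2 (assistant):  Message(role="assistant", text="", tool_calls=[ToolUse(...)])
    #                          Message(role="assistant", text="It's sunny.")
    #
    #   A Metric then points at a thread, turn, message, or tool call via a
    #   foreign key, with its granularity type recorded alongside.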
    def load_data(self):
        from flexeval import (
            data_loader,
        )  # Local import; this needs to happen after the module is fully loaded

        if self.filename.endswith(".jsonl"):
            self.datatype = "json"
            data_loader.load_jsonl(
                dataset=self,
                filename=self.filename,
                max_n_conversation_threads=self.max_n_conversation_threads,
                nb_evaluations_per_thread=self.nb_evaluations_per_thread,
            )
        elif is_sqlite_file(self.filename):
            self.datatype = "sqlite"
            data_loader.load_langgraph_sqlite(
                dataset=self,
                filename=self.filename,
                max_n_conversation_threads=self.max_n_conversation_threads,
                nb_evaluations_per_thread=self.nb_evaluations_per_thread,
            )
        else:
            raise ValueError(
                f"Unsupported format '{os.path.splitext(self.filename)[-1]}'. "
                "Each data file must be either a .jsonl or SQLite file. "
                f"You provided the file: {self.filename}"
            )
def is_sqlite_file(filepath):
    # Open the file in binary mode and read the 16-byte header
    with open(filepath, "rb") as file:
        header = file.read(16)
    # Check whether the header matches the SQLite format header
    return header == b"SQLite format 3\x00"
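

# --- Usage sketch (illustrative, not part of the module) ---
# A minimal example of how load_data() dispatches on file type. It assumes the
# peewee database behind BaseModel is already initialized, that a parent
# EvalSetRun row (my_eval_set_run) exists, and that "conversations.jsonl" is a
# placeholder path.
#
# dataset = Dataset.create(
#     evalsetrun=my_eval_set_run,        # existing EvalSetRun instance (assumed)
#     filename="conversations.jsonl",    # placeholder path
#     max_n_conversation_threads=100,
#     nb_evaluations_per_thread=1,
# )
# dataset.load_data()  # ends with ".jsonl", so datatype is set to "json" and
#                      # data_loader.load_jsonl() is called; a file starting with
#                      # the SQLite header would route to
#                      # data_loader.load_langgraph_sqlite() instead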