Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion backend/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,8 @@ EVALS_OPENAI_KEY=<your_openai_api_key>
EVALS_WANDB_KEY=<your_wandb_api_key>
```

The evaluation results will be logged to [Weights & Biases](https://wandb.ai/aipotheosis-labs/function-search-evaluation) where you can track metrics, view experiment configurations, and analyze the results.

Then, seed the database with all apps and mock credentials:

```bash
Expand All @@ -414,7 +416,10 @@ Additional flags:

```bash
# Specify a custom dataset artifact name (default: "synthetic_intent_dataset")
docker compose exec runner python -m evals.evaluation_pipeline --mode evaluate-only --dataset my_custom_dataset
docker compose exec runner python -m evals.evaluation_pipeline --mode evaluate-only --dataset-artifact my_custom_artifact

# Specify the filename of the dataset file stored in the artifact
docker compose exec runner python -m evals.evaluation_pipeline --mode evaluate-only --dataset-filename my_custom_dataset.csv

# Limit the number of samples to generate
docker compose exec runner python -m evals.evaluation_pipeline --mode generate-only --generation-limit 50
Expand Down
47 changes: 37 additions & 10 deletions backend/evals/evaluation_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
logger = logging.getLogger(__name__)

DEFAULT_DATASET_ARTIFACT = "synthetic_intent_dataset"
DEFAULT_DATASET_FILENAME = "synthetic_intents.csv"
DEFAULT_EVALUATION_MODEL = "dual-encoder-text-embedding-1024"


Expand Down Expand Up @@ -60,26 +61,32 @@ def __init__(
api_key=search_api_key,
)

def _load_dataset_from_wandb(self, artifact_name: str, dataset_filename: str) -> pd.DataFrame:
    """
    Load a dataset from a W&B artifact.

    Downloads the latest version of the named artifact and reads the
    given CSV file from the downloaded directory.

    Args:
        artifact_name: Name of the W&B artifact (the ``:latest`` alias
            is appended automatically).
        dataset_filename: Name of the CSV file stored inside the artifact.

    Returns:
        DataFrame containing the dataset.
    """
    # ":latest" resolves to the most recent version of the artifact.
    artifact = wandb.use_artifact(f"{artifact_name}:latest")
    # download() returns the local directory the artifact files were fetched into.
    artifact_dir = artifact.download()
    return pd.read_csv(os.path.join(artifact_dir, dataset_filename))

def _generate(self, dataset_artifact: str, generation_limit: int | None = None) -> pd.DataFrame:
def _generate(
self,
dataset_artifact: str,
dataset_filename: str,
generation_limit: int | None = None,
) -> pd.DataFrame:
"""
Generate synthetic intents.

Args:
dataset_artifact: Name of the artifact to save the dataset to
dataset_filename: Filename to save the dataset to
generation_limit: Optional limit on number of samples to generate

Returns:
Expand All @@ -88,6 +95,7 @@ def _generate(self, dataset_artifact: str, generation_limit: int | None = None)
logger.info("Generating synthetic intents...")
df = self.generator.generate(
dataset_artifact=dataset_artifact,
dataset_filename=dataset_filename,
limit=generation_limit,
)

Expand All @@ -97,6 +105,7 @@ def _generate(self, dataset_artifact: str, generation_limit: int | None = None)
def _evaluate(
self,
dataset_artifact: str,
dataset_filename: str,
evaluation_samples: int | None = None,
df: pd.DataFrame | None = None,
) -> dict:
Expand All @@ -105,6 +114,7 @@ def _evaluate(

Args:
dataset_artifact: Name of the dataset artifact to evaluate
dataset_filename: Filename of the dataset in the artifact
evaluation_samples: Optional limit on number of samples to evaluate
df: Optional DataFrame containing the dataset. If None, load from dataset_artifact

Expand All @@ -113,7 +123,7 @@ def _evaluate(
"""
if df is None:
logger.info(f"Loading dataset from artifact: {dataset_artifact}")
df = self._load_dataset_from_wandb(dataset_artifact)
df = self._load_dataset_from_wandb(dataset_artifact, dataset_filename)

# Evaluate search performance
logger.info("Evaluating search performance...")
Expand All @@ -138,19 +148,21 @@ def _evaluate(

def run(
self,
dataset_artifact: str,
dataset_filename: str,
generate_data: bool = False,
evaluate_data: bool = True,
dataset_artifact: str = DEFAULT_DATASET_ARTIFACT,
generation_limit: int | None = None,
evaluation_samples: int | None = None,
) -> None:
"""
Run the evaluation pipeline.

Args:
dataset_artifact: Name of dataset artifact to use
dataset_filename: Filename to save/load the dataset to/from
generate_data: Whether to generate new data
evaluate_data: Whether to evaluate data
dataset_artifact: Name of dataset artifact to use
generation_limit: Optional limit on number of samples to generate
evaluation_samples: Optional limit on number of samples to evaluate

Expand All @@ -169,6 +181,7 @@ def run(
"evaluation_model": DEFAULT_EVALUATION_MODEL,
"evaluation_samples": evaluation_samples,
"dataset_artifact": dataset_artifact,
"dataset_filename": dataset_filename,
},
)

Expand All @@ -177,12 +190,14 @@ def run(
if generate_data:
df = self._generate(
dataset_artifact=dataset_artifact,
dataset_filename=dataset_filename,
generation_limit=generation_limit,
)

if evaluate_data:
self._evaluate(
dataset_artifact=dataset_artifact,
dataset_filename=dataset_filename,
evaluation_samples=evaluation_samples,
df=df,
)
Expand All @@ -199,15 +214,26 @@ def run(
required=True,
)
@click.option(
"--dataset",
"--dataset-artifact",
default=DEFAULT_DATASET_ARTIFACT,
help="Name of the W&B dataset artifact to use",
show_default=True,
)
@click.option(
"--dataset-filename",
default=DEFAULT_DATASET_FILENAME,
type=str,
help="Filename to save the generated dataset to",
show_default=True,
)
@click.option("--generation-limit", type=int, help="Limit number of samples to generate")
@click.option("--evaluation-samples", type=int, help="Limit number of samples to evaluate")
def main(
mode: str, dataset: str, generation_limit: int | None, evaluation_samples: int | None
mode: str,
dataset_artifact: str,
generation_limit: int | None,
evaluation_samples: int | None,
dataset_filename: str,
) -> None:
"""Main entry point for the evaluation pipeline."""
# Get API keys from environment
Expand Down Expand Up @@ -235,9 +261,10 @@ def main(

# Run pipeline
pipeline.run(
dataset_artifact=dataset_artifact,
dataset_filename=dataset_filename,
generate_data=generate_data,
evaluate_data=evaluate_data,
dataset_artifact=dataset,
generation_limit=generation_limit,
evaluation_samples=evaluation_samples,
)
Expand Down
36 changes: 14 additions & 22 deletions backend/evals/synthetic_intent_generator.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
import os
import tempfile

import openai
import pandas as pd
import wandb
Expand Down Expand Up @@ -112,13 +109,14 @@ def _log_dataset_stats(self, df: pd.DataFrame) -> None:
}
)

def _save_to_wandb(self, df: pd.DataFrame, dataset_artifact: str) -> str:
def _save_to_wandb(self, df: pd.DataFrame, dataset_artifact: str, dataset_filename: str) -> str:
"""
Save the dataset as a wandb artifact.

Args:
df: DataFrame containing the generated dataset
dataset_artifact: Name for the artifact
dataset_filename: Filename to save the dataset to

Returns:
The artifact name for reference
Expand All @@ -142,37 +140,31 @@ def _save_to_wandb(self, df: pd.DataFrame, dataset_artifact: str) -> str:
},
)

# Use tempfile to create and manage a temporary file
with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as temp_file:
temp_filename = temp_file.name

try:
# Write dataframe to the temporary file
df.to_csv(temp_filename, index=False)
# Add the file to the artifact
artifact.add_file(temp_filename)
# Log the artifact
wandb.log_artifact(artifact)
# Write dataframe to the temporary file
df.to_csv(dataset_filename, index=False)
# Add the file to the artifact
artifact.add_file(dataset_filename)
# Log the artifact
wandb.log_artifact(artifact)

return artifact.name
finally:
# Ensure temp file is removed even if any operation fails
if os.path.exists(temp_filename):
os.unlink(temp_filename)
return artifact.name

def generate(
self,
dataset_artifact: str,
dataset_filename: str,
limit: int | None = None,
) -> pd.DataFrame:
"""
Generate synthetic intents and save them.

Args:
dataset_artifact: Name of the artifact to save the dataset to
dataset_filename: Filename to save the dataset to
limit: Optional limit on number of samples to generate

Returns:
The name of the saved artifact
DataFrame containing the generated dataset
"""
# Fetch data
df = self._fetch_app_function_data()
Expand All @@ -191,7 +183,7 @@ def generate(

# Log and save
self._log_dataset_stats(df)
artifact_name = self._save_to_wandb(df, dataset_artifact)
artifact_name = self._save_to_wandb(df, dataset_artifact, dataset_filename)

print(f"Dataset saved as W&B artifact: {artifact_name}")
return df