MixedVoices evaluates agents through simulated text-to-text conversations, letting you test your agent's behavior across different scenarios before deploying to production.
Quick Start
import mixedvoices as mv
from mixedvoices.metrics import empathy, Metric
# Create project with metrics
hangup_metric = Metric(
    name="call_hangup",
    definition="FAILS if the bot faces problems in ending the call",
    scoring="binary"
)
project = mv.create_project("dental_clinic", metrics=[empathy, hangup_metric])
# Create version
v1 = project.create_version(
    "v1",
    prompt="You are a friendly dental receptionist...",
    metadata={"model": "gpt-4", "deployment_date": "2024-01-15"}
)
# Generate test cases
test_generator = mv.TestCaseGenerator(v1.prompt)
test_cases = (
    test_generator.add_from_transcripts([existing_conversation])
    .add_edge_cases(2)
    .add_from_descriptions(["An elderly patient", "A rushed parent"])
    .generate()
)
# Create and run evaluator
evaluator = project.create_evaluator(test_cases, metric_names=["empathy", "call_hangup"])
evaluator.run(v1, MyAgent, agent_starts=False)  # MyAgent: see "Implementing Your Agent" below
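The existing_conversation passed to add_from_transcripts is a plain-text transcript. Its exact expected format isn't covered here, so the speaker labels below are an assumption; a minimal sketch:

existing_conversation = (
    "user: Hi, I'd like to book a cleaning for next week.\n"  # assumed "speaker: text" lines
    "bot: Of course! We have openings on Tuesday and Thursday.\n"
    "user: Thursday morning works.\n"
    "bot: Great, you're booked for Thursday at 9am."
)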
Test Case Generation
The TestCaseGenerator class provides multiple methods to create diverse test cases:
test_generator = mv.TestCaseGenerator(agent_prompt)
# Add from existing conversations
test_generator.add_from_transcripts([transcript1, transcript2])
# Add from audio recordings
test_generator.add_from_recordings(["call1.wav", "call2.wav"], user_channel="left")
# Add edge cases
test_generator.add_edge_cases(count=2)
# Add from user descriptions
test_generator.add_from_descriptions([
    "An old lady from India with hearing difficulties",
    "A young man from New York in a hurry"
])
# Add test cases drawn from an existing project or version
test_generator.add_from_project(project)
test_generator.add_from_version(version)
# Generate all test cases
test_cases = test_generator.generate()
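Each add_* method returns the generator itself, which is why the calls chain in the Quick Start. To sanity-check the output before running an evaluation (treating test cases as printable is an assumption about the library's objects):

print(f"Generated {len(test_cases)} test cases")
for case in test_cases:
    print(case)  # the exact test-case fields are library-defined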
Implementing Your Agent
Create a class that inherits from BaseAgent:
from typing import Tuple

import mixedvoices as mv

class MyDentalAgent(mv.BaseAgent):
    def __init__(self, model="gpt-4", temperature=0.7):
        # Placeholder for your own LLM/agent stack
        self.agent = YourAgentImplementation(
            model=model,
            temperature=temperature
        )

    def respond(self, input_text: str) -> Tuple[str, bool]:
        """Generate agent response for each conversation turn.

        Args:
            input_text (str): User input (empty string if agent starts)

        Returns:
            Tuple[str, bool]: (response_text, has_conversation_ended)
        """
        response = self.agent.get_response(input_text)
        has_ended = check_conversation_ended(response)  # your own end-of-call detection
        return response, has_ended
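For concreteness, here is a minimal sketch of the same agent backed by the OpenAI chat API. The system prompt, history handling, and end-of-call heuristic are illustrative assumptions, not part of MixedVoices:

from typing import Tuple

from openai import OpenAI

import mixedvoices as mv

class OpenAIDentalAgent(mv.BaseAgent):
    def __init__(self, model="gpt-4", temperature=0.7):
        self.client = OpenAI()  # reads OPENAI_API_KEY from the environment
        self.model = model
        self.temperature = temperature
        # Keep the full conversation so each turn has context
        self.messages = [
            {"role": "system", "content": "You are a friendly dental receptionist..."}
        ]

    def respond(self, input_text: str) -> Tuple[str, bool]:
        if input_text:  # empty string means the agent speaks first
            self.messages.append({"role": "user", "content": input_text})
        completion = self.client.chat.completions.create(
            model=self.model,
            temperature=self.temperature,
            messages=self.messages,
        )
        response = completion.choices[0].message.content
        self.messages.append({"role": "assistant", "content": response})
        # Crude end-of-call heuristic; substitute your own signal
        has_ended = "goodbye" in response.lower()
        return response, has_ended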
Running Evaluations
# Create evaluator with specific metrics
evaluator = project.create_evaluator(
    test_cases,
    metric_names=["empathy", "call_hangup"]  # or omit for all project metrics
)
# Run evaluation
eval_run = evaluator.run(
    version=v1,
    agent_class=MyDentalAgent,
    agent_starts=False,  # False: user starts, True: agent starts, None: random
    verbose=True,  # Print conversations and scores
    model="gpt-4",  # Additional args passed to agent
    temperature=0.7
)
# Access results
results = eval_run.results # List of results per test case
status = eval_run.status() # IN PROGRESS/COMPLETED/FAILED/INTERRUPTED
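To act on a finished run, a short sketch; the status strings are taken from the comment above, and the per-result schema is library-defined, so the printout is illustrative:

if eval_run.status() == "COMPLETED":
    for i, result in enumerate(eval_run.results):
        print(f"Test case {i}: {result}")  # inspect transcript and metric scores
else:
    print(f"Run did not complete: {eval_run.status()}")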