MixedVoices lets you evaluate agents through simulated text-to-text conversations, so you can test an agent's behavior across different scenarios before deploying it to production.
Quick Start
```python
import mixedvoices as mv
from mixedvoices.metrics import empathy, Metric

# Create project with metrics
hangup_metric = Metric(
    name="call_hangup",
    definition="FAILS if the bot faces problems in ending the call",
    scoring="binary"
)
project = mv.create_project("dental_clinic", metrics=[empathy, hangup_metric])

# Create version
v1 = project.create_version(
    "v1",
    prompt="You are a friendly dental receptionist...",
    metadata={"model": "gpt-4", "deployment_date": "2024-01-15"}
)

# Generate test cases
test_generator = mv.TestCaseGenerator(v1.prompt)
test_cases = (
    test_generator
    .add_from_transcripts([existing_conversation])
    .add_edge_cases(2)
    .add_from_descriptions(["An elderly patient", "A rushed parent"])
    .generate()
)

# Create and run evaluator
evaluator = project.create_evaluator(test_cases, metric_names=["empathy", "call_hangup"])
evaluator.run(v1, MyAgent, agent_starts=False)
```
Test Case Generation
The TestCaseGenerator class provides multiple methods to create diverse test cases:
```python
test_generator = mv.TestCaseGenerator(agent_prompt)

# Add from existing conversations
test_generator.add_from_transcripts([transcript1, transcript2])

# Add from audio recordings
test_generator.add_from_recordings(["call1.wav", "call2.wav"], user_channel="left")

# Add edge cases
test_generator.add_edge_cases(count=2)

# Add from user descriptions
test_generator.add_from_descriptions([
    "An old lady from India with hearing difficulties",
    "A young man from New York in a hurry"
])

# Add from existing project/version paths
test_generator.add_from_project(project)
test_generator.add_from_version(version)

# Generate all test cases
test_cases = test_generator.generate()
```
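As the Quick Start above shows, the `add_*` methods can also be chained before a final `generate()` call. A fluent version of the same builder (using the same placeholder variables) looks like this:

```python
# Same builder written as a single chained expression,
# mirroring the Quick Start example above.
test_cases = (
    mv.TestCaseGenerator(agent_prompt)
    .add_from_transcripts([transcript1, transcript2])
    .add_edge_cases(count=2)
    .add_from_descriptions(["A young man from New York in a hurry"])
    .generate()
)
```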
Implementing Your Agent
Create a class that inherits from BaseAgent:
```python
from typing import Tuple

import mixedvoices as mv


class MyDentalAgent(mv.BaseAgent):
    def __init__(self, model="gpt-4", temperature=0.7):
        self.agent = YourAgentImplementation(
            model=model,
            temperature=temperature
        )

    def respond(self, input_text: str) -> Tuple[str, bool]:
        """Generate agent response for each conversation turn.

        Args:
            input_text (str): User input (empty string if agent starts)

        Returns:
            Tuple[str, bool]: (response_text, has_conversation_ended)
        """
        response = self.agent.get_response(input_text)
        has_ended = check_conversation_ended(response)
        return response, has_ended
```
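`YourAgentImplementation` and `check_conversation_ended` above are placeholders for your own code. For illustration only, here is a minimal sketch of one way to fill them in, assuming the agent wraps the OpenAI chat completions API and detects the end of the call with a naive keyword heuristic; the class name, system prompt, and end-of-call check are assumptions, not part of MixedVoices.

```python
# Illustrative agent sketch (assumptions: OpenAI chat API backend,
# hard-coded system prompt, naive keyword check for call endings).
from typing import Tuple

from openai import OpenAI

import mixedvoices as mv


class SimpleDentalAgent(mv.BaseAgent):
    def __init__(self, model="gpt-4", temperature=0.7):
        self.client = OpenAI()
        self.model = model
        self.temperature = temperature
        # Keep the running conversation so every turn has full context
        self.messages = [
            {"role": "system", "content": "You are a friendly dental receptionist..."}
        ]

    def respond(self, input_text: str) -> Tuple[str, bool]:
        # input_text is empty when the agent speaks first
        if input_text:
            self.messages.append({"role": "user", "content": input_text})
        completion = self.client.chat.completions.create(
            model=self.model,
            temperature=self.temperature,
            messages=self.messages,
        )
        response = completion.choices[0].message.content
        self.messages.append({"role": "assistant", "content": response})
        # Naive end-of-call heuristic; replace with your own logic
        has_ended = any(phrase in response.lower() for phrase in ("goodbye", "have a great day"))
        return response, has_ended
```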
Running Evaluations
```python
# Create evaluator with specific metrics
evaluator = project.create_evaluator(
    test_cases,
    metric_names=["empathy", "call_hangup"]  # or omit for all project metrics
)

# Run evaluation
eval_run = evaluator.run(
    version=v1,
    agent_class=MyDentalAgent,
    agent_starts=False,  # False: user starts, True: agent starts, None: random
    verbose=True,        # Print conversations and scores
    model="gpt-4",       # Additional args passed to agent
    temperature=0.7
)

# Access results
results = eval_run.results   # List of results per test case
status = eval_run.status()   # IN PROGRESS / COMPLETED / FAILED / INTERRUPTED
```
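The structure of individual result entries is not shown here. As a minimal sketch, assuming `status()` returns the strings listed above verbatim, you can inspect a finished run like this:

```python
# Inspect a finished run (assumes status() returns the strings above;
# printing each entry is just a quick first look at the results list).
if eval_run.status() == "COMPLETED":
    for i, result in enumerate(eval_run.results):
        print(f"Test case {i}: {result}")
else:
    print(f"Run did not complete cleanly: {eval_run.status()}")
```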