Evaluating AI Models Using pytest-evals

pytest-evals is a pytest plugin designed specifically for AI model evaluation. In this walkthrough we'll build a mock LLM system so the examples run without requiring external API keys. As AI models become integral to production systems, proper evaluation is crucial for ensuring they meet quality standards before deployment. Evaluation helps you:

  • Identify model strengths and weaknesses
  • Track performance over time
  • Establish quality gates for deployment
  • Compare different model versions
  • Create reproducible test suites for regression testing
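
pytest-evals structures this around two phases, which the rest of this post builds up step by step. In miniature, the workflow looks like this (simplified; full versions of these tests appear below):

import pytest

# Phase 1 (--run-eval): evaluation tests record whatever they need into eval_bag.
@pytest.mark.eval(name="my_classifier")
def test_one_case(eval_bag, classifier):
    eval_bag.prediction = classifier("some input")
    eval_bag.expected = "some_category"

# Phase 2 (--run-eval-analysis): an analysis test receives every recorded result.
@pytest.mark.eval_analysis(name="my_classifier")
def test_analysis(eval_results):
    ...  # aggregate the results and assert a quality threshold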

Project Setup

mkdir llm-evaluation
cd llm-evaluation
python3 -m venv venv
source venv/bin/activate
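
On Windows, the activation command differs; use:

venv\Scripts\activate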

Dependencies

pip install pytest==7.4.0 pandas==2.0.3 matplotlib==3.7.2 numpy==1.24.3 pytest-evals==0.3.4
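
If you prefer pinning dependencies in a file, the same versions can go into a requirements.txt and be installed with pip install -r requirements.txt:

pytest==7.4.0
pandas==2.0.3
matplotlib==3.7.2
numpy==1.24.3
pytest-evals==0.3.4

Next, lay out the package and test directories: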
mkdir -p mock_llm
mkdir -p tests/data
touch tests/__init__.py
touch tests/conftest.py
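
At this point the project should look roughly like this (the mock LLM source files and a couple of __init__.py files are added in the next steps):

llm-evaluation/
├── mock_llm/
├── tests/
│   ├── data/
│   ├── __init__.py
│   └── conftest.py
└── venv/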

pytest --help

If pytest-evals installed correctly, the help output lists its custom options, including --run-eval and --run-eval-analysis, which drive the two evaluation phases used throughout this post.

Creating the Mock LLM

mkdir -p mock_llm tests
touch mock_llm/__init__.py
touch tests/__init__.py
cat > mock_llm/simple_llm.py <<'EOF'
import random

class MockLLM:
    def __init__(self, quality_level=0.8):
        # Quality level affects how 'good' the model is (0-1)
        self.quality_level = quality_level

        # Response templates by category
        self.response_templates = {
            "account_access": [
                "You can access your account by clicking the profile icon.",
                "To reset your password, visit the forgot password page."
            ],
            "billing": [
                "Our refund policy allows returns within 30 days.",
                "Payment processing typically takes 2-3 business days."
            ],
            "general_inquiry": [
                "Our business hours are 9am-5pm, Monday through Friday.",
                "You can reach customer service at support@example.com."
            ]
        }

    def classify_text(self, text):
        # Simple keyword-based classification
        text = text.lower()
        if "account" in text or "password" in text or "login" in text:
            return "account_access"
        elif "payment" in text or "refund" in text or "charge" in text:
            return "billing"
        else:
            return "general_inquiry"
EOF
cat > test_llm.py <<'EOF'
from mock_llm.simple_llm import MockLLM

# Create the LLM
llm = MockLLM()

# Test some examples
examples = [
    "How do I reset my password?",
    "When will my payment be processed?",
    "What are your opening hours?"
]

# Run classification
for example in examples:
    category = llm.classify_text(example)
    print(f"Text: '{example}'")
    print(f"Classification: '{category}'")
    print("---")
EOF
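
Run the script to sanity-check the classifier before wiring it into pytest. Given the keyword rules above, the output should look like this:

python test_llm.py

Text: 'How do I reset my password?'
Classification: 'account_access'
---
Text: 'When will my payment be processed?'
Classification: 'billing'
---
Text: 'What are your opening hours?'
Classification: 'general_inquiry'
---

With the mock model behaving as expected, the next step is wiring it into pytest with a fixture and a first evaluation test.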

cat > tests/conftest.py <<'EOF'
import pytest
import sys
import os

# Add the parent directory to path for imports
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Import our LLM
from mock_llm.simple_llm import MockLLM

# Create a fixture for our classifier
@pytest.fixture
def classifier():
    llm = MockLLM()
    return llm.classify_text
EOF
cat > tests/test_simple_eval.py <<'EOF'
import pytest

@pytest.mark.eval(name="my_classifier")
def test_customer_inquiry(eval_bag, classifier):
    # Input text to classify
    input_text = "I need to change my account password"

    # Get prediction using our classifier fixture
    prediction = classifier(input_text)

    # Store results in eval_bag
    eval_bag.input = input_text
    eval_bag.prediction = prediction

    # Incorrectly expect "billing" instead of "account_access"
    eval_bag.expected = "billing"  # This is wrong!

    # Calculate accuracy (1 if correct, 0 if incorrect)
    eval_bag.accuracy = 1 if prediction == eval_bag.expected else 0
    eval_bag.confidence = 0.95
EOF
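
A note on the classifier fixture before running anything: the evaluation tests only depend on a callable that maps text to a category name, so swapping the mock for a real model later means editing conftest.py alone. A minimal sketch, assuming a hypothetical client you would supply yourself:

import pytest

# Hypothetical drop-in replacement for the fixture in tests/conftest.py;
# the test files themselves stay unchanged.
@pytest.fixture
def classifier():
    def classify_with_real_model(text):
        # Call your real model or API client here (placeholder), then map its
        # answer to "account_access", "billing", or "general_inquiry".
        raise NotImplementedError("wire up a real model here")
    return classify_with_real_model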

Run the evaluation phase, which records each test's eval_bag contents for later analysis. The test passes even though the expectation is deliberately wrong, because the evaluation phase itself contains no assertions; the mistake will only surface during analysis:

pytest --run-eval -v

Debugging and Fixing Evaluation Tests

Creating an Analysis Test

cat > tests/test_analysis.py <<'EOF'
import pytest

@pytest.mark.eval_analysis(name="my_classifier")
def test_analysis(eval_results):
    # Count total test cases
    total_cases = len(eval_results)

    # Skip if no results found
    if total_cases == 0:
        pytest.skip("No evaluation results found. Run with --run-eval first.")

    # Count correct predictions
    correct_predictions = sum(1 for result in eval_results 
                            if hasattr(result, 'result') 
                            and 'accuracy' in result.result 
                            and result.result['accuracy'])

    # Calculate overall accuracy
    accuracy = correct_predictions / total_cases

    # Print results
    print(f"Total test cases: {total_cases}")
    print(f"Correct predictions: {correct_predictions}")
    print(f"Accuracy: {accuracy:.2%}")

    # Assert minimum quality threshold
    assert accuracy >= 0.7, f"Accuracy {accuracy:.2%} below threshold of 70%"
EOF
pytest --run-eval-analysis -v

With its single recorded case marked incorrect, the analysis run fails the 70% accuracy threshold, which is the quality gate doing its job.

Fixing the Evaluation Test

cat > tests/test_simple_eval.py <<'EOF'
import pytest

@pytest.mark.eval(name="my_classifier")
def test_customer_inquiry(eval_bag, classifier):
    # Input text to classify
    input_text = "I need to change my account password"

    # Get prediction using our classifier fixture
    prediction = classifier(input_text)

    # Store results in eval_bag
    eval_bag.input = input_text
    eval_bag.prediction = prediction

    # Use the correct expectation
    eval_bag.expected = "account_access"  # Fixed!

    # Calculate accuracy (1 if correct, 0 if incorrect)
    eval_bag.accuracy = 1 if prediction == eval_bag.expected else 0
    eval_bag.confidence = 0.95
EOF
pytest --run-eval --run-eval-analysis -v
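
With the expectation corrected, the recorded prediction matches and the analysis assertion passes. The analysis phase is also a convenient place for richer reporting. Since pandas is already installed, a per-category accuracy breakdown is a small extension; here is a sketch that mirrors the field access used in test_analysis above (if your pytest-evals version does not support a second analysis test for the same name, fold this into test_analysis instead):

import pandas as pd
import pytest

@pytest.mark.eval_analysis(name="my_classifier")
def test_per_category_report(eval_results):
    # Collect the dicts recorded via eval_bag during the evaluation phase.
    rows = [r.result for r in eval_results if hasattr(r, 'result')]
    if not rows:
        pytest.skip("No evaluation results found. Run with --run-eval first.")

    df = pd.DataFrame(rows)
    # Mean accuracy per expected category helps spot a systematically weak label.
    print(df.groupby("expected")["accuracy"].mean())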

Adding More Test Cases

cat > tests/test_multiple_cases.py <<'EOF'
import pytest

@pytest.mark.parametrize("input_text,expected", [
    ("I need to reset my password", "account_access"),
    ("How do I update my account details?", "account_access"),
    ("When will my payment be processed?", "billing"),
    ("What's your refund policy?", "billing"),
    ("What are your hours of operation?", "general_inquiry"),
])
@pytest.mark.eval(name="my_classifier")
def test_classifier_cases(eval_bag, classifier, input_text, expected):
    # Get prediction
    prediction = classifier(input_text)

    # Store results
    eval_bag.input = input_text
    eval_bag.prediction = prediction
    eval_bag.expected = expected
    eval_bag.accuracy = 1 if prediction == expected else 0
EOF
pytest --run-eval --run-eval-analysis -v
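
The keyword rules check for "account" before any billing terms, so a message such as "I was charged twice on my account" is classified as account_access even though billing is the sensible label. Adding it as a sixth case is a quick way to exercise the "identify strengths and weaknesses" goal from the introduction: five of six cases are correct, overall accuracy drops to roughly 83%, and the analysis phase still clears its 70% gate while the recorded results show exactly where the classifier falls short.

@pytest.mark.parametrize("input_text,expected", [
    ("I need to reset my password", "account_access"),
    ("How do I update my account details?", "account_access"),
    ("When will my payment be processed?", "billing"),
    ("What's your refund policy?", "billing"),
    ("What are your hours of operation?", "general_inquiry"),
    # Known-hard case: "account" matches before "charge" in the keyword rules.
    ("I was charged twice on my account", "billing"),
])

Replace the parametrize list in tests/test_multiple_cases.py with the one above and re-run both phases to see the effect.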