JBON_DATA

Testing Strategies for Data Applications

Data applications present unique testing challenges. Unlike traditional software, where outputs are deterministic, data systems must handle variability, drift, and large-scale validation.

The Testing Pyramid for Data

  • Unit tests: Individual functions and transformations
  • Integration tests: Pipeline components working together
  • Data validation: Schema and quality checks
  • End-to-end tests: Complete pipeline execution

Unit Testing Transformations

import pytest
import pandas as pd
from src.transforms import calculate_metrics

def test_calculate_metrics_basic():
    """Check that ``calculate_metrics`` adds ``avg_order_value`` for simple input."""
    orders = pd.DataFrame(
        {
            'revenue': [100, 200, 300],
            'orders': [10, 20, 30],
        }
    )

    metrics = calculate_metrics(orders)

    # The derived column must exist before its values are inspected.
    assert 'avg_order_value' in metrics.columns
    # Row 0: revenue 100 with 10 orders is expected to give 10.0.
    first_value = metrics['avg_order_value'].iloc[0]
    assert first_value == 10.0

def test_calculate_metrics_handles_zeros():
    """Check the zero-orders and zero-revenue rows of ``calculate_metrics``."""
    frame = pd.DataFrame(
        {
            'revenue': [100, 0],
            'orders': [0, 10],
        }
    )

    metrics = calculate_metrics(frame)
    avg_order_value = metrics['avg_order_value']

    # Row 0 has zero orders: the test expects a missing value there.
    assert pd.isna(avg_order_value.iloc[0])
    # Row 1 has zero revenue across ten orders: the expected average is 0.0.
    assert avg_order_value.iloc[1] == 0.0

Schema Validation

import pandera as pa
from pandera import Column, Check, DataFrameSchema

# Expected columns of the orders table, each with its dtype and value check.
order_schema = DataFrameSchema(
    columns={
        # Identifiers must be integers greater than zero; order_id is unique.
        "order_id": Column(int, checks=Check.greater_than(0), unique=True),
        "customer_id": Column(int, checks=Check.greater_than(0)),
        # Amounts are floats that may be zero but never negative.
        "amount": Column(float, checks=Check.greater_than_or_equal_to(0)),
        # Status is restricted to the three listed values.
        "status": Column(
            str,
            checks=Check.isin(['pending', 'complete', 'cancelled']),
        ),
        # Only the dtype is checked for the creation timestamp.
        "created_at": Column(pa.DateTime),
    },
)

# Validate data
def test_orders_schema():
    """Validate the loaded orders data against ``order_schema``.

    Validation runs lazily so that a failing run reports every schema
    violation found in the data instead of stopping at the first one.
    """
    orders = load_orders()
    # lazy=True collects all failures and raises them together.
    order_schema.validate(orders, lazy=True)

Data Quality Checks

from great_expectations import expect

def validate_order_data(df):
    """Run data quality expectations against an orders DataFrame.

    Checks, in order: completeness of ``customer_id``, uniqueness of
    ``order_id``, range of ``amount``, freshness of ``created_at`` and
    stability of the mean ``amount`` relative to ``baseline_mean``.

    NOTE(review): how ``expect`` reports a failed expectation (raising vs.
    returning a result object) is not visible here -- confirm against the
    helper before relying on this function to fail a test run.
    NOTE(review): ``baseline_mean`` is not defined inside this function; it
    must be provided by the enclosing module.
    """
    
    # Completeness: more than 99% of rows must have a non-null customer_id.
    expect(df['customer_id'].notna().mean()).to_be_greater_than(0.99)
    
    # Uniqueness: no order_id may appear more than once.
    expect(df['order_id'].is_unique).to_be_true()
    
    # Validity: amounts are expected to lie between 0 and 1,000,000
    # (presumably inclusive -- TODO confirm the helper's bound semantics).
    expect(df['amount']).to_be_between(0, 1000000)
    
    # Freshness: the newest record must be less than 24 hours old.
    # NOTE(review): pd.Timestamp.now() without a tz is timezone-naive local
    # time; this assumes created_at is naive and in the same zone -- confirm.
    expect(df['created_at'].max()).to_be_greater_than(
        pd.Timestamp.now() - pd.Timedelta(hours=24)
    )
    
    # Distribution stability: mean amount must stay within 80%-120% of the
    # baseline mean.
    expect(df['amount'].mean()).to_be_between(
        baseline_mean * 0.8, 
        baseline_mean * 1.2
    )

Testing ML Models

def test_model_performance():
    """Fail unless the loaded model's test-set accuracy is above 0.85."""
    features, labels = load_test_data()
    classifier = load_model()

    predicted = classifier.predict(features)
    accuracy = accuracy_score(labels, predicted)

    # Strictly greater than: an accuracy of exactly 0.85 also fails.
    assert accuracy > 0.85, f"Model accuracy {accuracy} below threshold"

def test_model_invariants():
    """Check that predicting twice on the same input gives equal outputs."""
    predictor = load_model()
    sample = create_test_input()

    # Two separate calls on the very same input object.
    first_run = predictor.predict(sample)
    second_run = predictor.predict(sample)

    assert np.array_equal(first_run, second_run)

def test_no_feature_leakage():
    """Ensure no feature column carries dates later than the training data.

    The latest value of ``train_df['date']`` is the cutoff. For every column
    in ``feature_columns``, the dates returned by
    ``extract_dates_from_feature`` must not exceed that cutoff. The failure
    message names the offending column so a leak can be traced directly.
    """
    # Computed once: the cutoff is the same for every feature column.
    latest_train_date = train_df['date'].max()
    for col in feature_columns:
        derived_dates = extract_dates_from_feature(train_df[col])
        latest_derived = derived_dates.max()
        assert latest_derived <= latest_train_date, (
            f"Feature {col!r} references {latest_derived}, "
            f"after the training cutoff {latest_train_date}"
        )

Integration Testing Pipelines

def test_pipeline_end_to_end():
    """Feed a test dataset through ``run_pipeline`` and sanity-check the output."""
    # Arrange: build the input fixture.
    fixture_data = create_test_dataset()

    # Act: execute the whole pipeline on it.
    output = run_pipeline(fixture_data)

    # Assert: the output is non-empty, ...
    assert len(output) > 0
    # ... matches the expected schema, ...
    assert output.schema == expected_schema
    # ... and has a positive mean for the key metric.
    assert output['key_metric'].mean() > 0

Testing data applications requires thinking about data quality as a first-class concern.

← Back to Blog