Testing Strategies for Data Applications
Data applications present unique testing challenges. Unlike traditional software where outputs are deterministic, data systems must handle variability, drift, and large-scale validation.
The Testing Pyramid for Data
- Unit tests: Individual functions and transformations
- Integration tests: Pipeline components working together
- Data validation: Schema and quality checks
- End-to-end tests: Complete pipeline execution
Unit Testing Transformations
import pytest
import pandas as pd
from src.transforms import calculate_metrics
def test_calculate_metrics_basic():
    """Metric calculation on known inputs yields the expected average order value."""
    sample = pd.DataFrame({
        'revenue': [100, 200, 300],
        'orders': [10, 20, 30],
    })
    metrics = calculate_metrics(sample)
    # The derived column must exist and equal revenue / orders for row 0.
    assert 'avg_order_value' in metrics.columns
    assert metrics['avg_order_value'].iloc[0] == 10.0
def test_calculate_metrics_handles_zeros():
    """Zero order counts must not blow up the average-order-value calculation."""
    sample = pd.DataFrame({
        'revenue': [100, 0],
        'orders': [0, 10],
    })
    metrics = calculate_metrics(sample)
    # Division by zero is expected to surface as NaN, not raise or yield inf.
    assert pd.isna(metrics['avg_order_value'].iloc[0])
    # Zero revenue with nonzero orders is a plain zero average.
    assert metrics['avg_order_value'].iloc[1] == 0.0
Schema Validation
import pandera as pa
from pandera import Column, Check, DataFrameSchema
# Define expected schema
# Contract for the orders table: column types plus value-level constraints.
order_schema = DataFrameSchema(
    {
        # Positive identifier; must be unique across the frame.
        "order_id": pa.Column(int, pa.Check.greater_than(0), unique=True),
        "customer_id": pa.Column(int, pa.Check.greater_than(0)),
        # Monetary amounts may be zero (e.g. fully discounted orders).
        "amount": pa.Column(float, pa.Check.greater_than_or_equal_to(0)),
        # Closed set of order lifecycle states.
        "status": pa.Column(str, pa.Check.isin(['pending', 'complete', 'cancelled'])),
        "created_at": pa.Column(pa.DateTime),
    }
)
# Validate data
def test_orders_schema():
    """Loaded order data must satisfy the declared schema contract."""
    orders = load_orders()
    # Raises pandera.errors.SchemaError on any violation, failing the test.
    order_schema.validate(orders)
Data Quality Checks
from great_expectations import expect
def validate_order_data(df, *, baseline_mean=None):
    """Run data quality expectations against an orders DataFrame.

    Args:
        df: Orders frame with at least ``customer_id``, ``order_id``,
            ``amount`` and ``created_at`` columns.
        baseline_mean: Historical mean of ``amount`` used for the
            distribution-stability check. When ``None`` (default) that
            check is skipped.

    NOTE(review): the original body referenced an undefined global
    ``baseline_mean`` (NameError at runtime); it is now an explicit
    keyword-only parameter.
    """
    # Completeness: at least 99% of rows must carry a customer_id.
    expect(df['customer_id'].notna().mean()).to_be_greater_than(0.99)
    # Uniqueness: order_id is the natural key.
    expect(df['order_id'].is_unique).to_be_true()
    # Validity: amounts within a plausible business range.
    expect(df['amount']).to_be_between(0, 1000000)
    # Freshness: the newest order must be less than 24 hours old.
    expect(df['created_at'].max()).to_be_greater_than(
        pd.Timestamp.now() - pd.Timedelta(hours=24)
    )
    # Distribution stability: current mean within ±20% of the baseline.
    if baseline_mean is not None:
        expect(df['amount'].mean()).to_be_between(
            baseline_mean * 0.8,
            baseline_mean * 1.2,
        )
Testing ML Models
def test_model_performance():
    """The persisted model must clear the minimum accuracy bar on held-out data."""
    features, labels = load_test_data()
    model = load_model()
    preds = model.predict(features)
    accuracy = accuracy_score(labels, preds)
    # Hard floor; failure here should block deployment.
    assert accuracy > 0.85, f"Model accuracy {accuracy} below threshold"
def test_model_invariants():
    """Identical inputs must produce identical predictions (determinism check)."""
    model = load_model()
    sample = create_test_input()
    # Two back-to-back predictions on the same payload must match exactly.
    first = model.predict(sample)
    second = model.predict(sample)
    assert np.array_equal(first, second)
def test_no_feature_leakage():
    """No feature may encode dates later than the training cutoff (future leakage)."""
    cutoff = train_df['date'].max()
    for feature in feature_columns:
        # Any date derivable from a feature must not exceed the cutoff.
        feature_dates = extract_dates_from_feature(train_df[feature])
        assert feature_dates.max() <= cutoff
Integration Testing Pipelines
def test_pipeline_end_to_end():
    """Exercise the whole pipeline on fixture data and sanity-check its output."""
    fixture = create_test_dataset()
    output = run_pipeline(fixture)
    # Non-empty result conforming to the expected schema, with a sane metric.
    assert len(output) > 0
    assert output.schema == expected_schema
    assert output['key_metric'].mean() > 0
Testing data applications requires thinking about data quality as a first-class concern.