You are viewing a free preview of this lesson.
Subscribe to unlock all 10 lessons in this course and every other course on LearningBro.
Data pipelines are notoriously hard to test. The data changes, external systems are unreliable, and transformations can be complex. But untested pipelines break silently — producing wrong data that spreads downstream before anyone notices. This lesson covers testing strategies, fixtures, snapshot testing, and CI/CD for data applications.
   Untested Pipeline            Tested Pipeline
┌─────────────────────┐     ┌─────────────────────┐
│ "It worked last     │     │ Unit tests catch    │
│  time I ran it"     │     │ logic bugs early    │
│                     │     │                     │
│ Fails silently      │     │ Integration tests   │
│ Wrong data loaded   │     │ catch system issues │
│ Nobody notices for  │     │                     │
│ days or weeks       │     │ CI/CD prevents bad  │
│                     │     │ code from deploying │
└─────────────────────┘     └─────────────────────┘
| Layer | What It Tests | Speed | Coverage |
|---|---|---|---|
| Unit tests | Individual functions / transforms | Fast | Narrow |
| Integration tests | Database, API, file interactions | Medium | Moderate |
| Contract tests | Schema and quality expectations | Fast | Structural |
| End-to-end tests | Full pipeline from source to sink | Slow | Broad |
import pandas as pd
import pytest
from src.transform.clean import clean_customer_data
class TestCleanCustomerData:
    """Unit tests that pin individual expectations of ``clean_customer_data``."""

    def test_lowercases_email(self):
        """An upper-case email is expected back in lower case."""
        raw = pd.DataFrame(
            {
                "name": ["Alice"],
                "email": ["ALICE@COMPANY.COM"],
                "age": [30],
            }
        )

        cleaned = clean_customer_data(raw)

        first_email = cleaned["email"].iloc[0]
        assert first_email == "alice@company.com"

    def test_drops_rows_without_email(self):
        """Of two input rows, only one is expected when one email is None."""
        raw = pd.DataFrame(
            {
                "name": ["Alice", "Bob"],
                "email": ["alice@co.com", None],
                "age": [30, 25],
            }
        )

        cleaned = clean_customer_data(raw)

        assert len(cleaned) == 1

    def test_title_cases_name(self):
        """A lower-case full name is expected back in title case."""
        raw = pd.DataFrame(
            {
                "name": ["alice smith"],
                "email": ["alice@co.com"],
                "age": [30],
            }
        )

        cleaned = clean_customer_data(raw)

        first_name = cleaned["name"].iloc[0]
        assert first_name == "Alice Smith"

    def test_fills_missing_age_with_median(self):
        """The missing middle age is expected to become the median of the rest."""
        raw = pd.DataFrame(
            {
                "name": ["Alice", "Bob", "Charlie"],
                "email": ["a@co.com", "b@co.com", "c@co.com"],
                "age": [20, None, 40],
            }
        )

        cleaned = clean_customer_data(raw)

        # median of 20 and 40
        filled_age = cleaned["age"].iloc[1]
        assert filled_age == 30.0
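Fixtures provide reusable test data: pytest builds the data once per test and passes it to any test function that names the fixture as an argument.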
import pytest
import pandas as pd
@pytest.fixture
def sample_customers() -> pd.DataFrame:
    """Build a five-row customer frame with deliberately messy values.

    The rows mix name casings, include one upper-case email, and leave
    one email and one age missing.
    """
    columns = {
        "id": [1, 2, 3, 4, 5],
        "name": [
            "Alice Smith",
            "bob jones",
            "CHARLIE BROWN",
            "Diana Prince",
            "Eve",
        ],
        "email": [
            "alice@co.com",
            "BOB@CO.COM",
            "charlie@co.com",
            None,
            "eve@co.com",
        ],
        "age": [30, 25, None, 35, 28],
        "signup_date": [
            "2024-01-15",
            "2024-02-20",
            "2024-03-10",
            "2024-04-05",
            "2024-01-15",
        ],
    }
    return pd.DataFrame(columns)
@pytest.fixture
def empty_dataframe() -> pd.DataFrame:
    """Return a zero-row frame that carries the customer column names."""
    schema = ["id", "name", "email", "age", "signup_date"]
    return pd.DataFrame(columns=schema)
@pytest.fixture
def sample_orders() -> pd.DataFrame:
    """Build three orders for join tests; customer 1 appears twice."""
    columns = {
        "order_id": [101, 102, 103],
        "customer_id": [1, 2, 1],
        "amount": [50.0, 75.0, 30.0],
        "order_date": ["2024-06-01", "2024-06-02", "2024-06-03"],
    }
    return pd.DataFrame(columns)
# Usage in tests
def test_cleaning(sample_customers):
    """Cleaning the five sample customers is expected to leave four rows."""
    cleaned = clean_customer_data(sample_customers)
    # One row dropped (Diana has no email)
    assert len(cleaned) == 4
import json
@pytest.fixture
def sample_api_response():
    """Load a sample API response from a fixture file.

    Returns:
        The parsed JSON content of ``tests/fixtures/api_response.json``.
        The path is relative, so it is resolved against the current
        working directory.
    """
    # Decode as UTF-8 explicitly: without it, open() uses the platform's
    # locale encoding, so a fixture with non-ASCII text could load
    # differently (or fail) on another machine.
    with open("tests/fixtures/api_response.json", encoding="utf-8") as f:
        return json.load(f)
@pytest.fixture
def sample_csv_data():
    """Read the sample customers CSV fixture file with pandas."""
    csv_path = "tests/fixtures/sample_customers.csv"
    return pd.read_csv(csv_path)
Subscribe to continue reading
Get full access to this lesson and all 10 lessons in this course.