Adding Shapers

Guide to creating custom data transformation shapers.

Overview

Shapers transform DataFrames in the analysis pipeline. They follow the Strategy Pattern and are composable (pipelines).

Key Concepts:

All shapers inherit from base interface
Immutable transformations (return new DataFrame)
Registered in ShaperFactory
Chainable in pipelines

Step 1: Create Shaper Class

File Location

Create in src/web/services/shapers/:

src/web/services/shapers/
├── my_shaper.py       # Your new shaper
├── column_selector.py
├── sort_shaper.py
└── ...

Class Template

from typing import Dict, Any
import pandas as pd


class MyShaper:
    """
    Custom shaper that [describe transformation].

    Configuration:
        param1 (type): Description
        param2 (type): Description

    Example:
        config = {"param1": "value", "param2": 123}
        shaper = MyShaper(config)
        result = shaper(data)
    """

    def __init__(self, config: Dict[str, Any]) -> None:
        """
        Initialize shaper with configuration.

        Args:
            config: Configuration dictionary
        """
        self.config = config
        self.param1 = config.get("param1")
        self.param2 = config.get("param2", default_value)

    def __call__(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Apply transformation to data.

        Args:
            data: Input DataFrame

        Returns:
            Transformed DataFrame (new instance)

        Raises:
            ValueError: If configuration invalid
            KeyError: If required columns missing
        """
        # Validate
        self._validate(data)

        # Transform (immutable)
        result = data.copy()

        # Apply your transformation
        result = self._transform(result)

        return result

    def _validate(self, data: pd.DataFrame) -> None:
        """Validate data and configuration."""
        if self.param1 not in data.columns:
            raise KeyError(f"Column {self.param1} not found")

        if not isinstance(self.param2, int):
            raise ValueError("param2 must be integer")

    def _transform(self, data: pd.DataFrame) -> pd.DataFrame:
        """Core transformation logic."""
        # Your transformation here
        return data

Step 2: Register in Factory

Edit src/web/services/shapers/shaper_factory.py:

from src.web.services.shapers.my_shaper import MyShaper


class ShaperFactory:
    @staticmethod
    def create_shaper(shaper_type: str, config: Dict[str, Any]):
        shapers = {
            "column_selector": ColumnSelector,
            "sort": SortShaper,
            "my_shaper": MyShaper,  # Add here
            # ...
        }

        if shaper_type not in shapers:
            raise ValueError(f"Unknown shaper: {shaper_type}")

        return shapers[shaper_type](config)

Step 3: Write Tests

Create tests/unit/test_my_shaper.py:

import pytest
import pandas as pd
from src.web.services.shapers.my_shaper import MyShaper


@pytest.fixture
def sample_data():
    """Sample data for testing."""
    return pd.DataFrame({
        "col1": [1, 2, 3, 4, 5],
        "col2": ["a", "b", "c", "d", "e"]
    })


class TestMyShaper:
    """Test suite for MyShaper."""

    def test_basic_transformation(self, sample_data):
        """Test basic shaper operation."""
        config = {"param1": "col1", "param2": 10}
        shaper = MyShaper(config)

        result = shaper(sample_data)

        assert isinstance(result, pd.DataFrame)
        assert len(result) > 0
        # Add specific assertions

    def test_immutability(self, sample_data):
        """Test data immutability."""
        original = sample_data.copy()
        config = {"param1": "col1", "param2": 10}
        shaper = MyShaper(config)

        result = shaper(sample_data)

        # Original unchanged
        pd.testing.assert_frame_equal(sample_data, original)
        # Result is different object
        assert result is not sample_data

    def test_missing_column(self, sample_data):
        """Test error on missing column."""
        config = {"param1": "missing", "param2": 10}
        shaper = MyShaper(config)

        with pytest.raises(KeyError):
            shaper(sample_data)

    def test_invalid_config(self, sample_data):
        """Test error on invalid config."""
        config = {"param1": "col1", "param2": "invalid"}
        shaper = MyShaper(config)

        with pytest.raises(ValueError):
            shaper(sample_data)

Run tests:

pytest tests/unit/test_my_shaper.py -v

Step 4: Add UI Component

Create src/web/ui/components/shapers/my_shaper_ui.py:

import streamlit as st
from typing import Dict, Any


def render_my_shaper_config() -> Dict[str, Any]:
    """
    Render UI for MyShaper configuration.

    Returns:
        Configuration dictionary
    """
    st.subheader("My Shaper")

    # Get available columns
    from src.web.state_manager import StateManager
    columns = StateManager.get_columns()

    # Configuration inputs
    param1 = st.selectbox(
        "Parameter 1",
        options=columns,
        help="Select column for param1"
    )

    param2 = st.number_input(
        "Parameter 2",
        min_value=0,
        value=10,
        help="Integer parameter"
    )

    # Return config
    return {
        "type": "my_shaper",
        "param1": param1,
        "param2": param2
    }

from src.web.ui.components.shapers.my_shaper_ui import render_my_shaper_config


def render_shaper_selector():
    shaper_type = st.selectbox(
        "Shaper Type",
        options=["column_selector", "sort", "my_shaper", ...]
    )

    if shaper_type == "my_shaper":
        return render_my_shaper_config()
    # ... other shapers

Example: Filter Shaper

Complete example of a filtering shaper:

class FilterShaper:
    """
    Filter rows based on column values.

    Configuration:
        column (str): Column to filter
        operator (str): Comparison operator ('>', '<', '==', '!=')
        value (float|str): Value to compare against

    Example:
        config = {"column": "ipc", "operator": ">", "value": 1.5}
        shaper = FilterShaper(config)
        filtered = shaper(data)  # Keep only rows where ipc > 1.5
    """

    OPERATORS = {
        ">": lambda x, v: x > v,
        "<": lambda x, v: x < v,
        "==": lambda x, v: x == v,
        "!=": lambda x, v: x != v,
        ">=": lambda x, v: x >= v,
        "<=": lambda x, v: x <= v
    }

    def __init__(self, config: Dict[str, Any]) -> None:
        self.config = config
        self.column = config["column"]
        self.operator = config["operator"]
        self.value = config["value"]

        if self.operator not in self.OPERATORS:
            raise ValueError(f"Invalid operator: {self.operator}")

    def __call__(self, data: pd.DataFrame) -> pd.DataFrame:
        if self.column not in data.columns:
            raise KeyError(f"Column {self.column} not found")

        # Apply filter
        op_func = self.OPERATORS[self.operator]
        mask = op_func(data[self.column], self.value)

        return data[mask].copy()

Best Practices

Immutability: Always return new DataFrame, never modify input
Validation: Check config and data before transformation
Type Hints: Full annotations on all methods
Error Handling: Clear error messages with context
Documentation: Docstrings with examples
Testing: Test basic operation, immutability, errors, edge cases

Common Patterns

Column Operations

def __call__(self, data: pd.DataFrame) -> pd.DataFrame:
    result = data.copy()
    result["new_col"] = result["col1"] + result["col2"]
    return result

Conditional Transformation

def __call__(self, data: pd.DataFrame) -> pd.DataFrame:
    result = data.copy()
    mask = result["col"] > threshold
    result.loc[mask, "col"] = new_value
    return result

Aggregation

def __call__(self, data: pd.DataFrame) -> pd.DataFrame:
    return data.groupby("category").agg({"value": "mean"}).reset_index()

Integration Testing

Test shaper in pipeline:

def test_shaper_in_pipeline():
    data = pd.DataFrame({"col": [1, 2, 3, 4, 5]})

    pipeline = [
        {"type": "my_shaper", "param1": "col", "param2": 10},
        {"type": "sort", "column": "col", "ascending": False}
    ]

    result = data.copy()
    for shaper_config in pipeline:
        shaper = ShaperFactory.create_shaper(
            shaper_config["type"],
            shaper_config
        )
        result = shaper(result)

    assert len(result) > 0

Next Steps

Pipeline Building: Data-Transformations.md
Testing: Testing-Guide.md
Pandas Reference: https://pandas.pydata.org/docs/