Plugin System Usage Examples¶

The simplified plugin system allows you to easily register and use custom data loaders, especially in Jupyter notebooks and interactive environments.

Basic Usage¶

1. Creating a Custom Data Loader¶

import polars as pl
from tnp_statistic_library.api import register_named_loader, reset_plugin_manager

# Reset plugin manager for clean slate (useful in notebooks)
reset_plugin_manager()

def excel_loader(location: str) -> pl.LazyFrame:
    """Example loader for Excel files.

    Args:
        location: Path to a ``.xlsx`` or ``.xls`` file. The extension check
            is case-insensitive, so ``REPORT.XLSX`` is accepted as well.

    Returns:
        The data read by ``pl.read_excel``, wrapped in a LazyFrame.

    Raises:
        ValueError: If ``location`` does not end in ``.xlsx`` or ``.xls``.
    """
    # Guard clause: reject unsupported files before attempting to read them.
    # endswith() accepts a tuple, so one call covers both extensions.
    if not location.lower().endswith((".xlsx", ".xls")):
        raise ValueError(f"Excel loader can only handle .xlsx/.xls files, got: {location}")
    # Use polars to read Excel (requires additional dependencies)
    return pl.read_excel(location).lazy()

# Register the loader
register_named_loader("excel", excel_loader)

2. Using the Registered Loader¶

from tnp_statistic_library._internal.datasets.datasets import Dataset

# Create dataset using the registered loader, selected explicitly via loader="excel"
dataset = Dataset(location="data/sales.xlsx", loader="excel")

# The data will be loaded when accessed; .collect() then materialises the lazy frame
df = dataset.lazyframe.collect()
print(df)

3. Automatic Extension-based Loader (register_data_loader)¶

For loaders that should automatically handle files based on extensions:

from tnp_statistic_library.api import register_data_loader

def csv_special_loader(location: str) -> pl.LazyFrame | None:
    """Handle CSV files with special processing.

    Args:
        location: Path of the file to load.

    Returns:
        A lazy scan of the file (semicolon-separated, no header row) when
        ``location`` ends in ``.csv``; ``None`` for any other location.
    """
    if not location.endswith(".csv"):
        return None
    # Custom CSV processing logic. scan_csv already returns a LazyFrame,
    # so no additional .lazy() call is needed.
    return pl.scan_csv(location, separator=";", has_header=False)

# Register for automatic extension handling under the name "csv_special"
register_data_loader("csv_special", csv_special_loader)

# This will automatically use our loader for .csv files
# (Dataset is the class imported in step 2)
dataset = Dataset(location="data/special.csv")  # No loader= needed
data = dataset.lazyframe.collect()

4. API Data Loader Example¶

import requests
import polars as pl
from tnp_statistic_library.api import register_named_loader

def api_loader(location: str) -> pl.LazyFrame:
    """Load data from REST APIs addressed as ``api://host/path``.

    Args:
        location: Endpoint with an ``api://`` scheme; it is fetched over HTTPS.

    Returns:
        The decoded JSON payload as a LazyFrame.

    Raises:
        ValueError: If ``location`` does not start with ``api://``.
        requests.RequestException: If the request times out or the server
            answers with an error status.
    """
    if not location.startswith("api://"):
        raise ValueError(f"Expected api:// URL, got {location}")

    # Swap only the leading scheme; str.replace would also rewrite any later
    # "api://" that happens to occur inside the path or query string.
    url = "https://" + location.removeprefix("api://")
    # Without a timeout, requests can block indefinitely on a silent server.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error body as data.
    response.raise_for_status()
    data = response.json()
    return pl.from_records(data).lazy()

# Register the named loader
register_named_loader("api", api_loader)

5. Using the API Loader¶

# Use the named loader with explicit loader specification.
# api_loader maps the api:// scheme onto https:// before issuing the request.
dataset = Dataset(location="api://jsonplaceholder.typicode.com/posts", loader="api")
data = dataset.lazyframe.collect()
print(data)

Advanced Features¶

Overwriting Loaders¶

# Register initial loader
def loader1(location: str) -> pl.LazyFrame | None:
    """Return the "old" demo frame for the location "test", otherwise None."""
    if location != "test":
        return None
    frame = pl.DataFrame({"old": [1, 2, 3]})
    return frame.lazy()

def loader2(location: str) -> pl.LazyFrame | None:
    """Return the "new" demo frame for the location "test", otherwise None."""
    if location != "test":
        return None
    frame = pl.DataFrame({"new": [4, 5, 6]})
    return frame.lazy()

# First registration under the name "custom"
register_data_loader("custom", loader1)

# This will fail: the name "custom" is already taken and overwrite is not set,
# so the ValueError is caught and its message printed
try:
    register_data_loader("custom", loader2)
except ValueError as e:
    print(f"Error: {e}")

# This will succeed: overwrite=True replaces the existing "custom" loader
register_data_loader("custom", loader2, overwrite=True)

Managing Loaders¶

from tnp_statistic_library.api import (
    list_data_loaders,
    unregister_data_loader,
    clear_data_loaders
)

# List all registered loaders
print("Registered loaders:", list_data_loaders())

# Remove a loader; the message is printed only when the call returns a truthy value
# NOTE(review): "excel" was registered with register_named_loader in the first
# example — confirm unregister_data_loader also covers named loaders.
if unregister_data_loader("excel"):
    print("Excel loader removed")

# Clear all loaders
clear_data_loaders()
print("All loaders cleared")

Database Loader Example¶

import polars as pl
from sqlalchemy import create_engine
from tnp_statistic_library.api import register_named_loader

def database_loader(location: str) -> pl.LazyFrame:
    """Load data from SQL databases, addressed as ``sql://<table>``.

    Args:
        location: Table reference with a ``sql://`` prefix, for example
            ``sql://customers`` or ``sql://sales.orders``.

    Returns:
        The result of ``SELECT * FROM <table>`` as a LazyFrame.

    Raises:
        ValueError: If the ``sql://`` prefix is missing, or if the table name
            is not a plain (optionally dot-qualified) identifier.
    """
    if not location.startswith("sql://"):
        raise ValueError("DatabaseLoader requires sql:// prefix")

    table_name = location.removeprefix("sql://")
    # The table name is interpolated into the SQL text (identifiers cannot be
    # bound as query parameters), so accept only plain identifiers to keep an
    # untrusted location from injecting SQL.
    if not all(part.isidentifier() for part in table_name.split(".")):
        raise ValueError(f"Invalid table name in location: {location}")

    engine = create_engine("postgresql://user:pass@localhost/db")
    query = f"SELECT * FROM {table_name}"
    # NOTE(review): pl.defer (an unstable API) expects an explicit schema=,
    # which is unknown for an arbitrary table, so the query is read eagerly
    # and handed back as a lazy view instead.
    return pl.read_database(query, engine).lazy()

# Register database loader under the name "database"
register_named_loader("database", database_loader)

# Use in dataset: the sql:// prefix is required by database_loader,
# and "customers" is the table to read
dataset = Dataset(location="sql://customers", loader="database")

Integration with Metrics¶

The plugin system integrates seamlessly with the existing metrics system:

from tnp_statistic_library.metrics.summary import Mean

# Register your custom loader
# (my_custom_loader is a placeholder — substitute a loader you have defined)
register_data_loader("custom", my_custom_loader)

# Use in metric computation: the dataset names the "custom" loader explicitly
metric = Mean.build(
    dataset=Dataset(location="custom://data", loader="custom"),
    name="average_value",
    variable="value_column"
)

# run_metric() returns a lazy result; .collect() materialises it
result = metric.run_metric().collect()

Jupyter Notebook Workflow¶

The plugin system is particularly useful in Jupyter notebooks:

# Cell 1: Define and register loader
class MyLoader:
    """Demo loader object that returns the same small frame for any location."""

    def load(self, location: str) -> pl.LazyFrame:
        """Return a LazyFrame with a single column ``data`` holding 1, 2, 3."""
        # Custom loading logic
        sample = pl.DataFrame({"data": [1, 2, 3]})
        return sample.lazy()

# Register an instance of the loader under the name "notebook_loader"
register_data_loader("notebook_loader", MyLoader())

# Cell 2: Use in different cells
dataset1 = Dataset(location="source1", loader="notebook_loader")
dataset2 = Dataset(location="source2", loader="notebook_loader")

# Cell 3: Experiment with different loaders
# (ImprovedLoader is a placeholder — substitute your revised loader class;
# overwrite=True is needed because "notebook_loader" is already registered)
register_data_loader("notebook_loader", ImprovedLoader(), overwrite=True)
# Now all previous datasets will use the new loader!

Error Handling¶

The plugin system provides clear error messages:

# Unknown loader: no loader has been registered under the name "unknown"
try:
    dataset = Dataset(location="data.txt", loader="unknown")
    df = dataset.lazyframe
except ValueError as e:
    print(f"Loader error: {e}")
    # Output: "Unknown data loader 'unknown'"

# Loader registration error: a plain string is passed where a loader is expected
try:
    register_data_loader("test", "not_a_loader")
except TypeError as e:
    print(f"Registration error: {e}")
    # Output: "Loader must implement DataLoaderProtocol"

Migration from Hook-based Plugins¶

If you have existing hook-based plugins, they will continue to work. The new system provides an additional, simpler way to register loaders:

# Old way (still works)
from tnp_statistic_library.api import hookimpl

class OldStyleLoader:
    """Hook-based loader that handles locations ending in ``.custom``."""

    @hookimpl
    def data_loader(self, location: str) -> pl.LazyFrame | None:
        """Scan ``.custom`` locations as CSV; return None for anything else."""
        handled = location.endswith(".custom")
        return pl.scan_csv(location) if handled else None

# New way (recommended for interactive use)
class NewStyleLoader:
    """Loader object for explicit registration; scans any location as CSV."""

    def load(self, location: str) -> pl.LazyFrame:
        """Return a lazy CSV scan of ``location``."""
        lazy_frame = pl.scan_csv(location)
        return lazy_frame

register_data_loader("new_style", NewStyleLoader())

The new system takes priority for named loaders, while hook-based plugins are still used for automatic format detection.