Plugin System Usage Examples
The dataset loader plugin system lets you register custom loaders for YAML-defined datasets and for programmatic use.
Basic Usage
1. Creating a Custom Dataset Loader
import polars as pl
from tnp_statistic_library.plugins import DatasetSpec, register_dataset_loader, reset_plugin_manager
# Reset the plugin manager to a clean slate before registering anything
# (useful in notebooks, where cells are often re-run).
reset_plugin_manager()
def excel_loader(spec: DatasetSpec) -> pl.LazyFrame:
    """Load an Excel workbook as a lazy frame.

    Args:
        spec: Dataset specification; ``spec.source`` is converted to ``str``
            and must end in ``.xlsx`` or ``.xls``.

    Returns:
        The result of ``pl.read_excel`` on the source, converted to a
        ``LazyFrame``.

    Raises:
        ValueError: If the source does not end in ``.xlsx`` or ``.xls``.
    """
    source = str(spec.source)
    # Guard clause: reject anything that is not an Excel path up front.
    if not source.endswith((".xlsx", ".xls")):
        raise ValueError(f"Excel loader can only handle .xlsx/.xls files, got: {source}")
    return pl.read_excel(source).lazy()
# Register the loader under the type name "excel"; datasets select it with type="excel"
register_dataset_loader("excel", excel_loader)
2. Using the Registered Loader
from tnp_statistic_library._internal.datasets.datasets import Dataset
# Create a dataset that is handled by the loader registered as "excel"
dataset = Dataset(type="excel", source="data/sales.xlsx")
# The data will be loaded when accessed: `.lazyframe` yields the lazy frame,
# and `.collect()` materializes it
df = dataset.lazyframe.collect()
print(df)
3. API Data Loader Example
import requests
import polars as pl
from tnp_statistic_library.plugins import DatasetSpec, register_dataset_loader
def api_loader(spec: DatasetSpec) -> pl.LazyFrame:
    """Load JSON records from a REST API as a lazy frame.

    The source uses a pseudo-scheme: ``api://host/path`` is fetched from
    ``https://host/path``.

    Args:
        spec: Dataset specification whose ``source`` starts with ``api://``.

    Returns:
        A ``LazyFrame`` built from the decoded JSON response body.

    Raises:
        ValueError: If the source does not start with ``api://``.
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    source = str(spec.source)
    if not source.startswith("api://"):
        raise ValueError(f"Expected api:// URL, got {source}")
    # Swap only the leading scheme; a later "api://" inside the URL stays intact.
    url = "https://" + source.removeprefix("api://")
    # A timeout keeps a stalled server from hanging the load indefinitely.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    data = response.json()
    return pl.from_records(data).lazy()
# Register the loader under the type name "api"
register_dataset_loader("api", api_loader)
4. Using the API Loader
# Assumes `Dataset` is already imported and a loader is registered as "api"
dataset = Dataset(type="api", source="api://jsonplaceholder.typicode.com/posts")
# Collecting the lazy frame materializes the data
data = dataset.lazyframe.collect()
print(data)
Advanced Features
Overwriting Loaders
def loader1(spec: DatasetSpec) -> pl.LazyFrame:
    """Return a fixed lazy frame with a single "old" column; ``spec`` is unused."""
    return pl.DataFrame({"old": [1, 2, 3]}).lazy()
def loader2(spec: DatasetSpec) -> pl.LazyFrame:
    """Return a fixed lazy frame with a single "new" column; ``spec`` is unused."""
    return pl.DataFrame({"new": [4, 5, 6]}).lazy()
register_dataset_loader("custom", loader1)

# This will fail: "custom" is already registered and overwrite was not requested
try:
    register_dataset_loader("custom", loader2)
except ValueError as e:
    print(f"Error: {e}")

# This will succeed: overwrite=True replaces the existing registration
register_dataset_loader("custom", loader2, overwrite=True)
Managing Loaders
from tnp_statistic_library.plugins import (
list_dataset_loaders,
unregister_dataset_loader,
clear_dataset_loaders,
)
# List all registered loaders
print("Registered loaders:", list_dataset_loaders())

# Remove a loader; the message is printed only when the call returns a truthy value
if unregister_dataset_loader("excel"):
    print("Excel loader removed")

# Clear all user-registered loaders
clear_dataset_loaders()
print("All user loaders cleared")
Database Loader Example
import polars as pl
from sqlalchemy import create_engine
from tnp_statistic_library.plugins import DatasetSpec, register_dataset_loader
def database_loader(spec: DatasetSpec) -> pl.LazyFrame:
    """Load a whole SQL table as a deferred lazy frame.

    Args:
        spec: Dataset specification whose ``source`` has the form
            ``sql://<table>`` (optionally schema-qualified, e.g.
            ``sql://public.customers``).

    Returns:
        The ``LazyFrame`` returned by ``pl.defer`` for the query function.

    Raises:
        ValueError: If the ``sql://`` prefix is missing, or the table name is
            not a plain (optionally dotted) identifier.
    """
    source = str(spec.source)
    if not source.startswith("sql://"):
        raise ValueError("Database loader requires sql:// prefix")
    table_name = source.removeprefix("sql://")
    # The table name is interpolated into the SQL text below (identifiers
    # cannot be bound as query parameters), so accept only dotted identifiers.
    if not all(part.isidentifier() for part in table_name.split(".")):
        raise ValueError(f"Invalid table name for database loader: {table_name!r}")

    def query_database() -> pl.DataFrame:
        # The connection string is hard-coded for this example.
        engine = create_engine("postgresql://user:pass@localhost/db")
        query = f"SELECT * FROM {table_name}"
        return pl.read_database(query, engine)

    # NOTE(review): depending on the Polars version, pl.defer may also require
    # a schema= argument -- confirm against the installed release.
    return pl.defer(query_database)
# Register the loader under the type name "database"
register_dataset_loader("database", database_loader)
# The table name follows the sql:// prefix; assumes `Dataset` is already imported
dataset = Dataset(type="database", source="sql://customers")
Integration with Metrics
from tnp_statistic_library.metrics.summary import mean
# `my_custom_loader` is a placeholder for a loader function you have defined yourself.
register_dataset_loader("custom", my_custom_loader)

# Hand the dataset's lazy frame straight to the metric.
data = Dataset(type="custom", source="custom://data").lazyframe
result = mean(data=data, variable="value_column")
Jupyter Notebook Recipe
def loader_v1(spec: DatasetSpec) -> pl.LazyFrame:
    """First version of the notebook loader: a fixed "data" column of 1, 2, 3."""
    return pl.DataFrame({"data": [1, 2, 3]}).lazy()
# Register the first loader version under the type name "notebook_loader"
register_dataset_loader("notebook_loader", loader_v1)
# Two datasets of the same loader type, differing only in their source
dataset1 = Dataset(type="notebook_loader", source="source1")
dataset2 = Dataset(type="notebook_loader", source="source2")
def loader_v2(spec: DatasetSpec) -> pl.LazyFrame:
    """Revised notebook loader: a fixed "data" column of 4, 5, 6."""
    return pl.DataFrame({"data": [4, 5, 6]}).lazy()
# Re-register under the same name; overwrite=True is needed because
# "notebook_loader" is already registered
register_dataset_loader("notebook_loader", loader_v2, overwrite=True)
Error Handling
# Unknown loader type: no loader is registered under the name "unknown"
try:
    dataset = Dataset(type="unknown", source="data.txt")
    df = dataset.lazyframe
except ValueError as e:
    print(f"Loader error: {e}")
    # Output: "Unknown dataset loader type 'unknown'"
# Loader registration error: "csv" is already registered and overwrite was not requested
try:
    register_dataset_loader("csv", loader_v1)
except ValueError as e:
    print(f"Registration error: {e}")
    # Output: "Dataset loader 'csv' already registered. Use overwrite=True to replace."
Hook-based Plugins (Distributed Packages)
import polars as pl
from pluggy import HookimplMarker
from tnp_statistic_library.plugins import DatasetSpec, dataset_loader
# pluggy marker for hook implementations, created for the "tnp_statistic_library" project name
hookimpl = HookimplMarker("tnp_statistic_library")
# NOTE(review): the decorator presumably tags the function with the loader
# type name "custom" -- confirm against the plugins module.
@dataset_loader("custom")
def scan_custom(spec: DatasetSpec) -> pl.LazyFrame:
    """Lazily scan ``spec.source`` as a CSV file via ``pl.scan_csv``."""
    return pl.scan_csv(str(spec.source))
class CustomPlugin:
    """Plugin object that exposes loaders through the ``dataset_loaders`` hook."""

    @hookimpl
    def dataset_loaders(self) -> list[object]:
        """Return the loader functions this plugin contributes."""
        return [scan_custom]