Examples

This page provides comprehensive examples for all metric types and common usage patterns in the TNP Statistic Library.

Sample Dataset

All examples use the following synthetic, realistically distributed financial portfolio dataset:

import polars as pl
import numpy as np

# Create a comprehensive sample dataset
np.random.seed(42)  # For reproducible examples

# Generate 1000 customer records
n_customers = 1000
customer_ids = [f"CUST_{i:05d}" for i in range(1, n_customers + 1)]

# Generate realistic financial data
regions = ["North", "South", "East", "West", "Central"]
products = ["Personal_Loan", "Credit_Card", "Mortgage", "Auto_Loan", "Business_Loan"]
risk_grades = ["A", "B", "C", "D", "E"]

# Create the dataset
predicted_ead_values = np.random.lognormal(9, 0.8, n_customers).astype(int)
lgd_prediction_values = np.random.uniform(0.3, 0.7, n_customers)

df = pl.DataFrame({
    "customer_id": customer_ids,
    "probability": np.random.beta(2, 8, n_customers),  # Realistic PD distribution
    "default_flag": np.random.binomial(1, 0.12, n_customers),  # ~12% default rate
    "exposure_amount": np.random.lognormal(10, 1, n_customers).astype(int),
    "predicted_ead": predicted_ead_values,
    "actual_ead": predicted_ead_values * np.random.normal(1, 0.15, n_customers),
    "lgd_prediction": lgd_prediction_values,
    "actual_lgd": lgd_prediction_values * np.random.normal(1, 0.1, n_customers),
    "region": np.random.choice(regions, n_customers),
    "product": np.random.choice(products, n_customers),
    "risk_grade": np.random.choice(risk_grades, n_customers),
    "origination_year": np.random.choice([2020, 2021, 2022, 2023, 2024], n_customers),
    "account_age_months": np.random.randint(1, 60, n_customers)
})

print(f"Dataset shape: {df.shape}")
print(df.head())

Accuracy Metrics

Default Accuracy

Basic Usage

from tnp_statistic_library.metrics import default_accuracy

# Overall model accuracy
overall_accuracy = default_accuracy(
    name="model_accuracy",
    dataset=df,
    data_format="record_level",
    prob_def="probability",
    default="default_flag"
)
print(f"Overall Model Accuracy: {overall_accuracy}")

Segmented Analysis

# Accuracy by region
regional_accuracy = default_accuracy(
    name="regional_accuracy",
    dataset=df,
    data_format="record_level",
    prob_def="probability",
    default="default_flag",
    segment=["region"]
)

# Accuracy by product and risk grade
product_risk_accuracy = default_accuracy(
    name="product_risk_accuracy",
    dataset=df,
    data_format="record_level",
    prob_def="probability",
    default="default_flag",
    segment=["product", "risk_grade"]
)

print(f"Regional Accuracy: {regional_accuracy}")
print(f"Product-Risk Accuracy: {product_risk_accuracy}")

Time-Based Analysis

# Accuracy by origination year
vintage_accuracy = default_accuracy(
    name="vintage_accuracy",
    dataset=df,
    data_format="record_level",
    prob_def="probability",
    default="default_flag",
    segment=["origination_year"]
)
print(f"Vintage Analysis: {vintage_accuracy}")

EAD Accuracy

Basic EAD Validation

from tnp_statistic_library.metrics import ead_accuracy

# Overall EAD accuracy
ead_result = ead_accuracy(
    name="ead_validation",
    dataset=df,
    data_format="record_level",
    predicted_ead="predicted_ead",
    actual_ead="actual_ead",
    default="default_flag"
)
print(f"EAD Accuracy: {ead_result}")

Segmented EAD Analysis

# EAD accuracy by product
product_ead = ead_accuracy(
    name="product_ead_accuracy",
    dataset=df,
    data_format="record_level",
    predicted_ead="predicted_ead",
    actual_ead="actual_ead",
    default="default_flag",
    segment=["product"]
)

# EAD accuracy for defaulted accounts only
defaulted_df = df.filter(pl.col("default_flag") == 1)
defaulted_ead = ead_accuracy(
    name="defaulted_ead_accuracy",
    dataset=defaulted_df,
    data_format="record_level",
    predicted_ead="predicted_ead",
    actual_ead="actual_ead",
    default="default_flag",
    segment=["region"]
)

print(f"Product EAD Accuracy: {product_ead}")
print(f"Defaulted Accounts EAD Accuracy: {defaulted_ead}")

MAPE (Mean Absolute Percentage Error)

Basic MAPE Calculation

from tnp_statistic_library.metrics import mape

# Overall MAPE for EAD predictions
ead_mape = mape(
    name="ead_mape",
    dataset=df,
    data_format="record_level",
    observed="actual_ead",
    predicted="predicted_ead"
)
print(f"EAD MAPE: {ead_mape}")

# MAPE for LGD predictions
lgd_mape = mape(
    name="lgd_mape",
    dataset=df,
    data_format="record_level",
    observed="actual_lgd",
    predicted="lgd_prediction"
)
print(f"LGD MAPE: {lgd_mape}")

Segmented MAPE Analysis

# MAPE by region for EAD predictions
regional_ead_mape = mape(
    name="regional_ead_mape",
    dataset=df,
    data_format="record_level",
    observed="actual_ead",
    predicted="predicted_ead",
    segment=["region"]
)

# MAPE by product type for LGD predictions
product_lgd_mape = mape(
    name="product_lgd_mape",
    dataset=df,
    data_format="record_level",
    observed="actual_lgd",
    predicted="lgd_prediction",
    segment=["product"]
)

# MAPE by risk grade for both EAD and LGD
risk_grade_ead_mape = mape(
    name="risk_grade_ead_mape",
    dataset=df,
    data_format="record_level",
    observed="actual_ead",
    predicted="predicted_ead",
    segment=["risk_grade"]
)

print(f"Regional EAD MAPE: {regional_ead_mape}")
print(f"Product LGD MAPE: {product_lgd_mape}")
print(f"Risk Grade EAD MAPE: {risk_grade_ead_mape}")

MAPE for Defaulted Accounts Only

# Filter to defaulted accounts for more relevant analysis
defaulted_df = df.filter(pl.col("default_flag") == 1)

# MAPE on defaulted accounts (where EAD/LGD is actually realized)
defaulted_ead_mape = mape(
    name="defaulted_ead_mape",
    dataset=defaulted_df,
    data_format="record_level",
    observed="actual_ead",
    predicted="predicted_ead",
    segment=["product"]
)

defaulted_lgd_mape = mape(
    name="defaulted_lgd_mape",
    dataset=defaulted_df,
    data_format="record_level",
    observed="actual_lgd",
    predicted="lgd_prediction",
    segment=["region"]
)

print(f"Defaulted EAD MAPE: {defaulted_ead_mape}")
print(f"Defaulted LGD MAPE: {defaulted_lgd_mape}")

Comparing MAPE with RMSE

from tnp_statistic_library.metrics import mape, rmse

# Calculate both MAPE and RMSE for EAD predictions
ead_mape_result = mape(
    name="ead_comparison_mape",
    dataset=df,
    data_format="record_level",
    observed="actual_ead",
    predicted="predicted_ead"
)

ead_rmse_result = rmse(
    name="ead_comparison_rmse",
    dataset=df,
    data_format="record_level",
    observed="actual_ead",
    predicted="predicted_ead"
)

print(f"EAD MAPE (scale-independent): {ead_mape_result}")
print(f"EAD RMSE (scale-dependent): {ead_rmse_result}")
print("\nMAPE provides percentage-based error interpretation")
print("RMSE provides absolute error in original units")

F1 and F2 Score Classification Metrics

Basic F-Score Calculation

from tnp_statistic_library.metrics import f1_score, f2_score

# Overall F1 score (balanced precision and recall)
f1_result = f1_score(
    name="balanced_classification",
    dataset=df,
    data_format="record_level",
    prob_def="probability",
    default="default_flag",
    threshold=0.5
)
print(f"F1 Score: {f1_result}")

# F2 score emphasizing recall over precision
f2_result = f2_score(
    name="recall_focused_classification",
    dataset=df,
    data_format="record_level",
    prob_def="probability",
    default="default_flag",
    threshold=0.3  # Lower threshold for better recall
)
print(f"F2 Score: {f2_result}")

Segmented F-Score Analysis

# F1 score by region
regional_f1 = f1_score(
    name="regional_f1",
    dataset=df,
    data_format="record_level",
    prob_def="probability",
    default="default_flag",
    threshold=0.5,
    segment=["region"]
)

# F2 score by product type (emphasizing recall)
product_f2 = f2_score(
    name="product_f2",
    dataset=df,
    data_format="record_level",
    prob_def="probability",
    default="default_flag",
    threshold=0.3,
    segment=["product"]
)

# F1 score by risk grade
risk_grade_f1 = f1_score(
    name="risk_grade_f1",
    dataset=df,
    data_format="record_level",
    prob_def="probability",
    default="default_flag",
    threshold=0.5,
    segment=["risk_grade"]
)

print(f"Regional F1: {regional_f1}")
print(f"Product F2: {product_f2}")
print(f"Risk Grade F1: {risk_grade_f1}")

Summary-Level F-Score Analysis

⚠️ Important Note: When using F-scores with summary-level data, results may be skewed because the classification threshold is applied at the risk-bucket level rather than to individual records. This can lead to different performance assessments than record-level calculations.

# Create summary-level data for F-score analysis
summary_f_df = df.group_by(["region", "risk_grade"]).agg([
    pl.count().alias("volume"),
    pl.col("default_flag").sum().alias("defaults"),
    pl.col("probability").mean().alias("mean_pd")
])

# Summary-level F1 score
summary_f1 = f1_score(
    name="summary_f1",
    dataset=summary_f_df,
    data_format="summary_level",
    mean_pd="mean_pd",
    defaults="defaults",
    volume="volume",
    threshold=0.5,
    segment=["region"]
)

# Summary-level F2 score with lower threshold
summary_f2 = f2_score(
    name="summary_f2",
    dataset=summary_f_df,
    data_format="summary_level",
    mean_pd="mean_pd",
    defaults="defaults",
    volume="volume",
    threshold=0.3,
    segment=["region"]
)

print(f"Summary F1: {summary_f1}")
print(f"Summary F2: {summary_f2}")
print("\nNote: Summary-level F-scores may differ from record-level due to bucket-level thresholding")

Comparing F1 and F2 Performance

# Calculate both F1 and F2 for comparison
comparison_f1 = f1_score(
    name="comparison_f1",
    dataset=df,
    data_format="record_level",
    prob_def="probability",
    default="default_flag",
    threshold=0.5
)

comparison_f2 = f2_score(
    name="comparison_f2",
    dataset=df,
    data_format="record_level",
    prob_def="probability",
    default="default_flag",
    threshold=0.5  # Same threshold for fair comparison
)

# Extract metrics for comparison
f1_value = comparison_f1["f_score"][0]
f2_value = comparison_f2["f_score"][0]
f1_precision = comparison_f1["precision"][0]
f1_recall = comparison_f1["recall"][0]
f2_precision = comparison_f2["precision"][0]
f2_recall = comparison_f2["recall"][0]

print(f"F1 Score: {f1_value:.4f} (Precision: {f1_precision:.4f}, Recall: {f1_recall:.4f})")
print(f"F2 Score: {f2_value:.4f} (Precision: {f2_precision:.4f}, Recall: {f2_recall:.4f})")

if f1_recall > f1_precision:
    print(f"\nF2 > F1 since recall ({f1_recall:.4f}) > precision ({f1_precision:.4f})")
    print("F2 emphasizes the stronger recall performance")
else:
    print(f"\nF1 > F2 since precision ({f1_precision:.4f}) > recall ({f1_recall:.4f})")
    print("F2 penalizes the weaker recall performance")

Threshold Optimization for F-Scores

import numpy as np

# Test different thresholds for optimal F1 and F2 scores
thresholds = np.arange(0.1, 0.9, 0.1)
f1_scores = []
f2_scores = []

for threshold in thresholds:
    f1_result = f1_score(
        name=f"f1_threshold_{threshold:.1f}",
        dataset=df,
        data_format="record_level",
        prob_def="probability",
        default="default_flag",
        threshold=threshold
    )

    f2_result = f2_score(
        name=f"f2_threshold_{threshold:.1f}",
        dataset=df,
        data_format="record_level",
        prob_def="probability",
        default="default_flag",
        threshold=threshold
    )

    f1_scores.append((threshold, f1_result["f_score"][0]))
    f2_scores.append((threshold, f2_result["f_score"][0]))

# Find optimal thresholds
optimal_f1_threshold, optimal_f1_score = max(f1_scores, key=lambda x: x[1])
optimal_f2_threshold, optimal_f2_score = max(f2_scores, key=lambda x: x[1])

print(f"Optimal F1: {optimal_f1_score:.4f} at threshold {optimal_f1_threshold:.1f}")
print(f"Optimal F2: {optimal_f2_score:.4f} at threshold {optimal_f2_threshold:.1f}")
print("\nF2 often favors lower thresholds to maximize recall")

T-Test Statistical Analysis

Basic T-Test for Model Bias Detection

from tnp_statistic_library.metrics import ttest

# Test if predictions are systematically biased (null hypothesis: mean difference = 0)
bias_test = ttest(
    name="model_bias_test",
    dataset=df,
    data_format="record_level",
    observed="actual_ead",
    predicted="predicted_ead"
)
print(f"Model Bias Test: {bias_test}")

# Interpret results
t_stat = bias_test["t_statistic"][0]
p_value = bias_test["p_value"][0]
mean_diff = bias_test["mean_difference"][0]

print(f"T-statistic: {t_stat:.4f}")
print(f"P-value: {p_value:.4f}")
print(f"Mean difference: {mean_diff:.2f}")

if p_value < 0.05:
    if mean_diff > 0:
        print("Significant positive bias: Model under-predicts on average")
    else:
        print("Significant negative bias: Model over-predicts on average")
else:
    print("No significant bias detected")

Segmented Bias Analysis

# Test for bias by product type
product_bias = ttest(
    name="product_bias_test",
    dataset=df,
    data_format="record_level",
    observed="actual_ead",
    predicted="predicted_ead",
    segment=["product"]
)
print(f"Product Bias Analysis: {product_bias}")

# Test for bias by region
regional_bias = ttest(
    name="regional_bias_test",
    dataset=df,
    data_format="record_level",
    observed="actual_lgd",
    predicted="lgd_prediction",
    segment=["region"]
)
print(f"Regional LGD Bias Analysis: {regional_bias}")

Model Comparison Using T-Test

# Compare two models by testing if their errors differ significantly
# First, create error columns for both models
comparison_df = df.with_columns([
    (pl.col("actual_ead") - pl.col("predicted_ead")).abs().alias("model_a_error"),
    (pl.col("actual_ead") - pl.col("predicted_ead") * 1.1).abs().alias("model_b_error")  # Simulate model B
])

# Test if Model A errors are significantly different from Model B errors
model_comparison = ttest(
    name="model_comparison_test",
    dataset=comparison_df,
    data_format="record_level",
    observed="model_a_error",
    predicted="model_b_error",
    null_hypothesis_mean=0.0  # Test if mean difference between errors equals 0
)
print(f"Model Comparison: {model_comparison}")

# Interpretation
if model_comparison["p_value"][0] < 0.05:
    if model_comparison["mean_difference"][0] < 0:
        print("Model A has significantly lower errors than Model B")
    else:
        print("Model A has significantly higher errors than Model B")
else:
    print("No significant difference between model performances")

Custom Hypothesis Testing

# Test if model predictions meet business target (e.g., mean difference should be within 1000)
target_test = ttest(
    name="business_target_test",
    dataset=df,
    data_format="record_level",
    observed="actual_ead",
    predicted="predicted_ead",
    null_hypothesis_mean=1000.0  # Business expects 1000 unit average difference
)
print(f"Business Target Test: {target_test}")

# Seasonal bias testing
seasonal_df = df.with_columns([
    pl.when(pl.col("origination_year") % 2 == 0).then("Even_Year").otherwise("Odd_Year").alias("season")
])

seasonal_bias = ttest(
    name="seasonal_bias_test",
    dataset=seasonal_df,
    data_format="record_level",
    observed="actual_ead",
    predicted="predicted_ead",
    segment=["season"]
)
print(f"Seasonal Bias Analysis: {seasonal_bias}")

Summary-Level T-Test for Pre-Aggregated Data

# Create summary-level data with pre-calculated difference statistics
summary_ttest_df = df.with_columns([
    (pl.col("actual_ead") - pl.col("predicted_ead")).alias("difference")
]).group_by(["region", "product"]).agg([
    pl.count().alias("volume"),
    pl.col("difference").sum().alias("sum_differences"),
    (pl.col("difference") ** 2).sum().alias("sum_squared_differences")
])

# Perform T-test on summary data
summary_ttest = ttest(
    name="summary_level_test",
    dataset=summary_ttest_df,
    data_format="summary_level",
    volume="volume",
    sum_differences="sum_differences",
    sum_squared_differences="sum_squared_differences",
    segment=["region"]
)
print(f"Summary-Level T-Test: {summary_ttest}")

Power Analysis and Sample Size Considerations

# Analyze statistical power for different segment sizes
import polars as pl

# Check sample sizes by segment
segment_sizes = df.group_by(["region"]).agg([
    pl.count().alias("sample_size")
]).sort("sample_size")

print("Sample sizes by region:")
print(segment_sizes)

# Only perform T-tests on segments with adequate sample size (n >= 30 recommended)
adequate_regions = segment_sizes.filter(pl.col("sample_size") >= 30)["region"].to_list()

adequate_sample_bias = ttest(
    name="adequate_sample_bias",
    dataset=df.filter(pl.col("region").is_in(adequate_regions)),
    data_format="record_level",
    observed="actual_ead",
    predicted="predicted_ead",
    segment=["region"]
)
print(f"T-Test with Adequate Samples: {adequate_sample_bias}")

T-Test Results Interpretation Guide

def interpret_ttest_results(result_df, alpha=0.05):
    """Helper function to interpret T-test results"""
    for i in range(len(result_df)):
        row = result_df.row(i)
        group = row[0]  # group_key
        volume = row[1]
        t_stat = row[2]
        p_value = row[3]
        mean_diff = row[4]

        print(f"\n--- Group: {group} (n={volume}) ---")
        print(f"Mean Difference: {mean_diff:.4f}")
        print(f"T-statistic: {t_stat:.4f}")
        print(f"P-value: {p_value:.4f}")

        if volume < 30:
            print("⚠️ Small sample size - results may be unreliable")

        if p_value is None:
            print("❌ Cannot compute p-value (insufficient variance or sample size)")
        elif p_value < alpha:
            direction = "positive" if mean_diff > 0 else "negative"
            print(f"✅ Significant {direction} bias detected (p < {alpha})")
        else:
            print(f"❌ No significant bias detected (p ≥ {alpha})")

        # Effect size interpretation
        if abs(mean_diff) > 1000:  # Example threshold
            print("📊 Large practical effect size")
        elif abs(mean_diff) > 500:
            print("📊 Medium practical effect size")
        else:
            print("📊 Small practical effect size")

# Apply interpretation to our bias test results
interpret_ttest_results(bias_test)

Statistical Tests

Hosmer-Lemeshow Test

from tnp_statistic_library.metrics import hosmer_lemeshow

# Test model calibration
hl_test = hosmer_lemeshow(
    name="calibration_test",
    dataset=df,
    data_format="record_level",
    prob_def="probability",
    default="default_flag",
    buckets=10
)
print(f"Hosmer-Lemeshow Test: {hl_test}")

# Test by product
hl_by_product = hosmer_lemeshow(
    name="calibration_by_product",
    dataset=df,
    data_format="record_level",
    prob_def="probability",
    default="default_flag",
    buckets=8,
    segment=["product"]
)
print(f"Calibration by Product: {hl_by_product}")

Jeffreys Test

from tnp_statistic_library.metrics import jeffreys_test

# Bayesian calibration test
jeffreys_result = jeffreys_test(
    name="bayesian_calibration",
    dataset=df,
    data_format="record_level",
    prob_def="probability",
    default="default_flag"
)
print(f"Jeffreys Test: {jeffreys_result}")

Discrimination Metrics

AUC (Area Under Curve)

Basic AUC Calculation

from tnp_statistic_library.metrics import auc

# Overall model discrimination
overall_auc = auc(
    name="model_discrimination",
    dataset=df,
    data_format="record_level",
    prob_def="probability",
    default="default_flag"
)
print(f"Overall AUC: {overall_auc}")

Segmented Discrimination Analysis

# AUC by region
regional_auc = auc(
    name="regional_discrimination",
    dataset=df,
    data_format="record_level",
    prob_def="probability",
    default="default_flag",
    segment=["region"]
)

# AUC by risk grade
risk_auc = auc(
    name="risk_grade_discrimination",
    dataset=df,
    data_format="record_level",
    prob_def="probability",
    default="default_flag",
    segment=["risk_grade"]
)

print(f"Regional AUC: {regional_auc}")
print(f"Risk Grade AUC: {risk_auc}")

Time-Based Discrimination

# AUC by vintage and product
vintage_product_auc = auc(
    name="vintage_product_auc",
    dataset=df,
    data_format="record_level",
    prob_def="probability",
    default="default_flag",
    segment=["origination_year", "product"]
)
print(f"Vintage-Product AUC: {vintage_product_auc}")

Using Boolean Default Indicators

# Discrimination metrics also support boolean default columns
boolean_df = df.with_columns(pl.col("default_flag").cast(pl.Boolean).alias("is_default"))

boolean_auc = auc(
    name="boolean_discrimination",
    dataset=boolean_df,
    data_format="record_level",
    prob_def="probability",
    default="is_default"  # Boolean column: True for defaults, False for non-defaults
)
print(f"Boolean AUC: {boolean_auc}")

Gini Coefficient

Basic Gini Calculation

from tnp_statistic_library.metrics import gini

# Overall model discrimination using Gini coefficient
overall_gini = gini(
    name="model_gini",
    dataset=df,
    data_format="record_level",
    prob_def="probability",
    default="default_flag"
)
print(f"Overall Gini: {overall_gini}")

Segmented Gini Analysis

# Gini by region
regional_gini = gini(
    name="regional_gini",
    dataset=df,
    data_format="record_level",
    prob_def="probability",
    default="default_flag",
    segment=["region"]
)

# Gini by product type
product_gini = gini(
    name="product_gini",
    dataset=df,
    data_format="record_level",
    prob_def="probability",
    default="default_flag",
    segment=["product"]
)

print(f"Regional Gini: {regional_gini}")
print(f"Product Gini: {product_gini}")

Comparing AUC and Gini

from tnp_statistic_library.metrics import auc, gini

# Calculate both metrics for comparison
auc_result = auc(
    name="model_auc",
    dataset=df,
    data_format="record_level",
    prob_def="probability",
    default="default_flag"
)

gini_result = gini(
    name="model_gini",
    dataset=df,
    data_format="record_level",
    prob_def="probability",
    default="default_flag"
)

# Verify the relationship: Gini = 2*AUC - 1
auc_value = auc_result["AUC"][0]
gini_value = gini_result["Gini"][0]
expected_gini = 2 * auc_value - 1

print(f"AUC: {auc_value:.4f}")
print(f"Gini: {gini_value:.4f}")
print(f"Expected Gini (2*AUC - 1): {expected_gini:.4f}")
print(f"Relationship verified: {abs(gini_value - expected_gini) < 1e-10}")

Stability Metrics

Population Stability Index (PSI)

Basic PSI Calculation

from tnp_statistic_library.metrics import population_stability_index
import polars as pl

# Create baseline and current period indicators
# Assume first 500 records are baseline, next 500 are current
baseline_current_df = df.with_columns([
    pl.when(pl.int_range(pl.len()) < 500)
    .then(1)
    .otherwise(0)
    .alias("is_baseline"),
    pl.when(pl.int_range(pl.len()) >= 500)
    .then(1)
    .otherwise(0)
    .alias("is_current")
])

# Overall population stability
overall_psi = population_stability_index(
    name="model_stability",
    dataset=baseline_current_df,
    data_format="record_level",
    band_column="risk_grade",
    baseline_column="is_baseline",
    current_column="is_current"
)
print(f"Overall PSI: {overall_psi}")

Segmented Stability Analysis

# PSI by region
regional_psi = population_stability_index(
    name="regional_stability",
    dataset=baseline_current_df,
    data_format="record_level",
    band_column="risk_grade",
    baseline_column="is_baseline",
    current_column="is_current",
    segment=["region"]
)

# PSI by product
product_psi = population_stability_index(
    name="product_stability",
    dataset=baseline_current_df,
    data_format="record_level",
    band_column="risk_grade",
    baseline_column="is_baseline",
    current_column="is_current",
    segment=["product"]
)

print(f"Regional PSI: {regional_psi}")
print(f"Product PSI: {product_psi}")

Score Distribution Stability

# Create score bands for distribution monitoring
score_bands_df = baseline_current_df.with_columns([
    pl.when(pl.col("probability") <= 0.05).then(pl.lit("Low"))
    .when(pl.col("probability") <= 0.15).then(pl.lit("Medium"))
    .when(pl.col("probability") <= 0.30).then(pl.lit("High"))
    .otherwise(pl.lit("Very_High"))
    .alias("score_band")
])

# Monitor score distribution stability
score_psi = population_stability_index(
    name="score_distribution_stability",
    dataset=score_bands_df,
    data_format="record_level",
    band_column="score_band",
    baseline_column="is_baseline",
    current_column="is_current"
)

# Score stability by product line
score_product_psi = population_stability_index(
    name="score_product_stability",
    dataset=score_bands_df,
    data_format="record_level",
    band_column="score_band",
    baseline_column="is_baseline",
    current_column="is_current",
    segment=["product"]
)

print(f"Score Distribution PSI: {score_psi}")
print(f"Score-Product PSI: {score_product_psi}")

Time-Based Stability Monitoring

# Create monthly cohorts for stability tracking
monthly_cohorts_df = df.with_columns([
    # Simulate monthly data by grouping account age
    pl.when(pl.col("account_age_months") <= 12).then(pl.lit("Month_1-12"))
    .when(pl.col("account_age_months") <= 24).then(pl.lit("Month_13-24"))
    .when(pl.col("account_age_months") <= 36).then(pl.lit("Month_25-36"))
    .otherwise(pl.lit("Month_37+"))
    .alias("age_band"),

    # Create baseline/current indicators based on origination year
    pl.when(pl.col("origination_year") <= 2022).then(1).otherwise(0).alias("is_baseline"),
    pl.when(pl.col("origination_year") >= 2023).then(1).otherwise(0).alias("is_current")
])

# Monitor age band stability over time
age_stability_psi = population_stability_index(
    name="age_stability",
    dataset=monthly_cohorts_df,
    data_format="record_level",
    band_column="age_band",
    baseline_column="is_baseline",
    current_column="is_current",
    segment=["product"]
)

print(f"Age Band Stability PSI: {age_stability_psi}")

Summary-Level PSI for Pre-Aggregated Data

# Create summary-level data for PSI calculation
summary_psi_df = pl.DataFrame({
    "risk_grade": ["A", "B", "C", "D", "E"],
    "baseline_count": [120, 150, 100, 80, 50],
    "current_count": [100, 140, 120, 90, 50],
    "region": ["North"] * 5
})

# Calculate PSI from summary data
summary_psi = population_stability_index(
    name="summary_stability",
    dataset=summary_psi_df,
    data_format="summary_level",
    band_column="risk_grade",
    baseline_volume="baseline_count",
    current_volume="current_count",
    segment=["region"]
)

print(f"Summary-Level PSI: {summary_psi}")

PSI with Plotting Data for Visualization

# Calculate PSI with curve data for BI tool visualization
psi_with_plotting = population_stability_index(
    name="stability_with_plots",
    dataset=baseline_current_df,
    data_format="record_level",
    band_column="risk_grade",
    baseline_column="is_baseline",
    current_column="is_current"
)

# Extract plotting data for visualization
curve_data = psi_with_plotting["curve_data"][0]
print(f"PSI Value: {psi_with_plotting['psi'][0]:.4f}")

# Display band-level breakdown for visualization
if curve_data is not None:
    print("\nBand-level PSI Analysis:")
    print("Band | Baseline% | Current% | PSI Component")
    print("-" * 45)

    for band_data in curve_data:
        band = band_data["band"]
        baseline_pct = band_data["baseline_pct"]
        current_pct = band_data["current_pct"]
        psi_component = band_data["psi_component"]

        print(f"{band:4} | {baseline_pct:8.2f} | {current_pct:7.2f} | {psi_component:12.6f}")

    # This data can be used in BI tools for:
    # - Bar charts comparing baseline vs current percentages
    # - Heat maps showing distribution shifts
    # - Waterfall charts showing PSI component contributions

Summary Statistics

Mean Calculations

Basic Statistics

from tnp_statistic_library.metrics import mean

# Mean exposure by region
regional_exposure = mean(
    name="regional_exposure",
    dataset=df,
    variable="exposure_amount",
    segment=["region"]
)

# Mean probability by product
product_pd = mean(
    name="product_pd",
    dataset=df,
    variable="probability",
    segment=["product"]
)

print(f"Regional Exposure: {regional_exposure}")
print(f"Product PD: {product_pd}")

Advanced Segmentation

# Mean exposure for high-risk accounts
high_risk_exposure = mean(
    name="high_risk_exposure",
    dataset=df.filter(pl.col("risk_grade").is_in(["D", "E"])),
    variable="exposure_amount",
    segment=["product"]
)

# Account age analysis
age_analysis = mean(
    name="account_age_analysis",
    dataset=df,
    variable="account_age_months",
    segment=["region", "product"]
)

print(f"High Risk Exposure: {high_risk_exposure}")
print(f"Account Age Analysis: {age_analysis}")

Median Calculations

from tnp_statistic_library.metrics import median

# Median exposure analysis
median_exposure = median(
    name="median_regional_exposure",
    dataset=df,
    variable="exposure_amount",
    segment=["region"]
)

# Median PD by risk grade
median_pd = median(
    name="median_pd_by_grade",
    dataset=df,
    variable="probability",
    segment=["risk_grade"]
)

print(f"Median Exposure: {median_exposure}")
print(f"Median PD by Grade: {median_pd}")

Advanced Usage Patterns

Working with Summary-Level Data

# Create summary-level data
summary_df = df.group_by(["region", "product"]).agg([
    pl.count().alias("volume"),
    pl.col("default_flag").sum().alias("defaults"),
    pl.col("probability").mean().alias("mean_pd"),
    pl.col("exposure_amount").sum().alias("total_exposure")
])

# Calculate accuracy from summary data
summary_accuracy = default_accuracy(
    name="summary_accuracy",
    dataset=summary_df,
    data_format="summary_level",
    mean_pd="mean_pd",
    defaults="defaults",
    volume="volume",
    segment=["region"]
)
print(f"Summary-Level Accuracy: {summary_accuracy}")

YAML Configuration Examples

For production use, combine multiple metrics in YAML workflows:

# comprehensive_analysis.yaml
datasets:
  portfolio:
    location: "portfolio_data.csv"

  high_risk:
    location: "portfolio_data.csv"
    filters:
      - "risk_grade IN ['D', 'E']"

metrics:
  model_validation:
    metric_type: default_accuracy
    config:
      name: ["overall", "by_region", "by_product", "by_risk"]
      segment: [null, ["region"], ["product"], ["risk_grade"]]
      dataset: "portfolio"
      data_format: "record_level"
      prob_def: "probability"
      default: "default_flag"

  discrimination_analysis:
    metric_type: auc
    config:
      name: ["overall_auc", "regional_auc", "product_auc"]
      segment: [null, ["region"], ["product"]]
      dataset: "portfolio"
      data_format: "record_level"
      prob_def: "probability"
      default: "default_flag"

  calibration_tests:
    metric_type: hosmer_lemeshow
    config:
      name: ["overall_hl", "product_hl"]
      segment: [null, ["product"]]
      dataset: "portfolio"
      data_format: "record_level"
      prob_def: "probability"
      default: "default_flag"
      buckets: 10

  portfolio_summary:
    metric_type: mean
    config:
      name: ["regional_exposure", "product_exposure", "risk_exposure"]
      segment: [["region"], ["product"], ["risk_grade"]]
      dataset: "portfolio"
      variable: "exposure_amount"

  high_risk_analysis:
    metric_type: default_accuracy
    config:
      name: ["high_risk_accuracy"]
      dataset: "high_risk"
      data_format: "record_level"
      prob_def: "probability"
      default: "default_flag"
      segment: [["product"]]

  population_stability:
    metric_type: population_stability_index
    config:
      name: ["overall_stability", "regional_stability", "product_stability"]
      segment: [null, ["region"], ["product"]]
      dataset: "portfolio"
      data_format: "record_level"
      band_column: "risk_grade"
      baseline_column: "is_baseline"
      current_column: "is_current"

  score_distribution_stability:
    metric_type: population_stability_index
    config:
      name: ["score_stability", "score_product_stability"]
      segment: [null, ["product"]]
      dataset: "portfolio"
      data_format: "record_level"
      band_column: "score_band"
      baseline_column: "is_baseline"
      current_column: "is_current"

# Note: Summary-level F-scores may be skewed due to bucket-level thresholding
# Consider using record-level data for more accurate F-score assessments

Execute the comprehensive analysis:

from tnp_statistic_library.workflows import load_configuration_from_yaml

# Save the dataset, including the derived score_band and baseline/current indicator
# columns referenced by the stability metrics in the YAML configuration
score_bands_df.write_csv("portfolio_data.csv")

# Run the comprehensive analysis
config = load_configuration_from_yaml("comprehensive_analysis.yaml")
all_results = config.metrics.collect_all()

# Convert to DataFrame for easy analysis
results_df = all_results.to_dataframe()
print("Comprehensive Analysis Results:")
print(results_df)

# Export results
results_df.write_csv("model_validation_results.csv")

Next Steps