Skip to content

glassbox.inspector.statistics

Statistical profiling and pairwise association analysis (Pearson, Cramér's V).


StatProfiler

Calculates summary statistics for dataset columns.

calculate_numeric_stats

calculate_numeric_stats(data, cols)

Compute statistics for numerical columns.

Parameters:

Name Type Description Default
data Dataset

The dataset containing the inputs.

required
cols List[str]

List of column names to analyze.

required

Returns:

Type Description
Dict

Mapping from column names to NumericStats objects.

Source code in glassbox/inspector/statistics.py
def calculate_numeric_stats(
    self, data: Dataset, cols: List[str]
) -> Dict[str, NumericStats]:
    """
    Summarise each requested numerical column.

    Every column is cast to float and its NaN entries are discarded
    before the mean, median, standard deviation, skewness and kurtosis
    helpers are applied. A column with no non-NaN entries yields a
    NumericStats whose fields are all NaN.

    Parameters
    ----------
    data : Dataset
        The dataset containing the inputs.
    cols : List[str]
        List of column names to analyze.

    Returns
    -------
    Dict
        Mapping from column names to NumericStats objects.
    """
    stats_by_column = {}
    for name in cols:
        values = data.get_columns(name).data[:, 0].astype(float)
        # Keep only the entries that actually carry a number.
        observed = values[~np.isnan(values)]

        if len(observed) > 0:
            column_stats = NumericStats(
                mean=self._calc_mean(observed),
                median=self._calc_median(observed),
                std=self._calc_std(observed),
                skew=self._calc_skew(observed),
                kurt=self._calc_kurtosis(observed),
            )
        else:
            # Nothing to measure: report every statistic as NaN.
            column_stats = NumericStats(
                mean=float("nan"),
                median=float("nan"),
                std=float("nan"),
                skew=float("nan"),
                kurt=float("nan"),
            )
        stats_by_column[name] = column_stats
    return stats_by_column

calculate_categorical_stats

calculate_categorical_stats(data, cols)

Compute statistics for categorical columns.

Parameters:

Name Type Description Default
data Dataset

The dataset containing the inputs.

required
cols List[str]

List of column names to analyze.

required

Returns:

Type Description
Dict

Mapping from column names to CategoricalStats objects.

Source code in glassbox/inspector/statistics.py
def calculate_categorical_stats(
    self, data: Dataset, cols: List[str]
) -> Dict[str, CategoricalStats]:
    """
    Compute statistics for categorical columns.

    Entries that are ``None`` or a float NaN are treated as missing and
    excluded. For the remaining entries the mode and the number of
    distinct values (cardinality) are reported. A column with no valid
    entries, including a column with zero rows, yields
    ``CategoricalStats(mode=nan, cardinality=0)``.

    Parameters
    ----------
    data : Dataset
        The dataset containing the inputs.
    cols : List[str]
        List of column names to analyze.

    Returns
    -------
    Dict
        Mapping from column names to CategoricalStats objects.
    """
    results = {}
    for col_name in cols:
        col_data = data.get_columns(col_name).data[:, 0]
        # dtype=bool is required: for a zero-row column the list is empty
        # and NumPy would otherwise infer float64, which is not a valid
        # index array and makes the masking below raise IndexError.
        valid_mask = np.array(
            [
                v is not None and not (isinstance(v, float) and np.isnan(v))
                for v in col_data
            ],
            dtype=bool,
        )
        col_valid = col_data[valid_mask]

        if len(col_valid) == 0:
            results[col_name] = CategoricalStats(mode=float("nan"), cardinality=0)
            continue

        unique_vals = np.unique(col_valid)
        results[col_name] = CategoricalStats(
            mode=self._calc_mode(col_valid), cardinality=len(unique_vals)
        )
    return results

AssociationAnalyzer

Analyzes pairwise correlations and associations between features.

build_associations

build_associations(data, num_cols, cat_cols)

Compute pairwise correlation and associations across specified columns.

Parameters:

Name Type Description Default
data Dataset

Input dataset.

required
num_cols List[str]

Numerical columns to compare pairwise using Pearson correlation.

required
cat_cols List[str]

Categorical columns to compare pairwise using Cramér's V.

required

Returns:

Type Description
List

A list of CollinearityPair objects containing scores.

Source code in glassbox/inspector/statistics.py
def build_associations(
    self, data: Dataset, num_cols: List[str], cat_cols: List[str]
) -> List[CollinearityPair]:
    """
    Compute pairwise correlation and associations across specified columns.

    Every unordered pair of numerical columns is scored with Pearson
    correlation, and every unordered pair of categorical columns with
    Cramér's V. Before scoring, rows in which either member of the pair
    is missing are dropped (NaN for numerical columns; ``None`` or float
    NaN for categorical columns). Numerical pairs come first in the
    result, followed by categorical pairs.

    Parameters
    ----------
    data : Dataset
        Input dataset.
    num_cols : List[str]
        Numerical columns to compare pairwise with Pearson correlation.
    cat_cols : List[str]
        Categorical columns to compare pairwise with Cramér's V.

    Returns
    -------
    List
        A list of CollinearityPair objects containing scores.
    """

    def _is_present(value) -> bool:
        # A categorical entry counts as missing when it is None or float NaN.
        return value is not None and not (
            isinstance(value, float) and np.isnan(value)
        )

    pairs = []

    # Numerical columns: Pearson correlation over rows complete in both.
    n_num = len(num_cols)
    for i in range(n_num):
        for j in range(i + 1, n_num):
            col_x_name = num_cols[i]
            col_y_name = num_cols[j]
            col_x = data.get_columns(col_x_name).data[:, 0].astype(float)
            col_y = data.get_columns(col_y_name).data[:, 0].astype(float)

            valid_mask = ~(np.isnan(col_x) | np.isnan(col_y))
            x_val = col_x[valid_mask]
            y_val = col_y[valid_mask]

            score = self._calc_pearson(x_val, y_val)
            pairs.append(
                CollinearityPair(
                    feature_a=col_x_name,
                    feature_b=col_y_name,
                    score=score,
                    metric="pearson",
                )
            )

    # Categorical columns: Cramér's V over rows complete in both.
    n_cat = len(cat_cols)
    for i in range(n_cat):
        for j in range(i + 1, n_cat):
            col_x_name = cat_cols[i]
            col_y_name = cat_cols[j]
            col_x = data.get_columns(col_x_name).data[:, 0]
            col_y = data.get_columns(col_y_name).data[:, 0]

            # dtype=bool is required: with zero rows the list is empty and
            # NumPy would infer float64, which cannot be used as an index
            # array and makes the masking below raise IndexError.
            valid_mask = np.array(
                [
                    _is_present(v_x) and _is_present(v_y)
                    for v_x, v_y in zip(col_x, col_y)
                ],
                dtype=bool,
            )
            x_val = col_x[valid_mask]
            y_val = col_y[valid_mask]

            score = self._calc_cramers_v(x_val, y_val)
            pairs.append(
                CollinearityPair(
                    feature_a=col_x_name,
                    feature_b=col_y_name,
                    score=score,
                    metric="cramers_v",
                )
            )

    return pairs