Skip to content

glassbox.core.math

Low-level statistical functions, distance metrics, and tree split utilities — all implemented with NumPy.


calc_mean

calc_mean(arr)

Calculate the mean of a 1D array.

Parameters:

Name Type Description Default
arr ndarray

Numeric array of shape (n_samples,).

required

Returns:

Type Description
float

The calculated mean.

Source code in glassbox/core/math.py
def calc_mean(arr: np.ndarray) -> float:
    """
    Calculate the arithmetic mean of a 1D array.

    Parameters
    ----------
    arr : np.ndarray
        Numeric array of shape (n_samples,).

    Returns
    -------
    float
        The calculated mean, or 0.0 when ``arr`` is empty.
    """
    n = len(arr)
    # Dividing by a zero length would produce NaN plus a RuntimeWarning;
    # return 0.0 instead, the same result calc_variance gives for empty input.
    if n == 0:
        return 0.0
    return float(np.sum(arr) / n)

calc_median

calc_median(arr)

Calculate the median of a 1D array.

Parameters:

Name Type Description Default
arr ndarray

Numeric array of shape (n_samples,).

required

Returns:

Type Description
float

The calculated median.

Source code in glassbox/core/math.py
def calc_median(arr: np.ndarray) -> float:
    """
    Calculate the median of a 1D array.

    Parameters
    ----------
    arr : np.ndarray
        Numeric array of shape (n_samples,).

    Returns
    -------
    float
        The calculated median: the middle value for an odd number of
        samples, the average of the two middle values for an even number,
        and 0.0 when ``arr`` is empty.
    """
    sorted_arr = np.sort(arr)
    n = len(sorted_arr)
    # Empty input has no middle element; return 0.0, the same value
    # calc_percentile yields for an empty array.
    if n == 0:
        return 0.0
    mid = n // 2
    if n % 2:
        return float(sorted_arr[mid])
    # Convert to Python floats before adding so narrow integer dtypes
    # (e.g. uint8) cannot wrap around when the two middle values are summed.
    return (float(sorted_arr[mid - 1]) + float(sorted_arr[mid])) / 2.0

calc_mode

calc_mode(arr)

Calculate the mode of a 1D array.

Parameters:

Name Type Description Default
arr ndarray

Array of shape (n_samples,).

required

Returns:

Type Description
float | str

The calculated mode.

Source code in glassbox/core/math.py
def calc_mode(arr: np.ndarray) -> float | str:
    """
    Calculate the mode of a 1D array.

    Parameters
    ----------
    arr : np.ndarray
        Array of shape (n_samples,).

    Returns
    -------
    float | str
        The calculated mode.
    """
    vals, counts = np.unique(arr, return_counts=True)
    max_idx = np.argmax(counts)
    val = vals[max_idx]
    if hasattr(val, "dtype") and np.issubdtype(val.dtype, np.number):
        return float(val)
    if isinstance(val, (int, float)):
        return float(val)
    return str(val)

calc_std

calc_std(arr)

Calculate the sample standard deviation (n − 1 denominator) of a 1D array.

Parameters:

Name Type Description Default
arr ndarray

Numeric array of shape (n_samples,).

required

Returns:

Type Description
float

Standard deviation.

Source code in glassbox/core/math.py
def calc_std(arr: np.ndarray) -> float:
    """
    Calculate the sample standard deviation of a 1D array.

    Parameters
    ----------
    arr : np.ndarray
        Numeric array of shape (n_samples,).

    Returns
    -------
    float
        Standard deviation computed with an ``n - 1`` denominator, or 0.0
        when the array has one element or fewer.
    """
    count = len(arr)
    if count > 1:
        centre = calc_mean(arr)
        squared_dev = (arr - centre) ** 2
        # Sum of squared deviations divided by (n - 1).
        return float(np.sqrt(np.sum(squared_dev) / (count - 1)))
    return 0.0

calc_variance

calc_variance(arr)

Calculate the population variance (mean squared deviation from the mean) of a 1D array.

Parameters:

Name Type Description Default
arr ndarray

Array of continuous values, shape (n_samples,).

required

Returns:

Type Description
float

Calculated variance.

Source code in glassbox/core/math.py
def calc_variance(arr: np.ndarray) -> float:
    """
    Calculate the population variance of a 1D array.

    Parameters
    ----------
    arr : np.ndarray
        Array of continuous values, shape (n_samples,).

    Returns
    -------
    float
        Mean squared deviation from the mean (``n`` denominator), or 0.0
        when the array is empty.
    """
    count = len(arr)
    if not count:
        return 0.0
    centre = float(np.sum(arr) / count)
    deviations = arr - centre
    return float(np.sum(deviations**2) / count)

generate_bootstrap_indices

generate_bootstrap_indices(n_samples)

Generate random indices for a bootstrap sample.

Parameters:

Name Type Description Default
n_samples int

Number of samples in the original dataset.

required

Returns:

Type Description
ndarray

Array of bootstrapped indices of shape (n_samples,).

Source code in glassbox/core/math.py
def generate_bootstrap_indices(n_samples: int) -> np.ndarray:
    """
    Draw row indices for a bootstrap sample (sampling with replacement).

    Parameters
    ----------
    n_samples : int
        Number of samples in the original dataset.

    Returns
    -------
    np.ndarray
        Array of shape (n_samples,) holding indices drawn from
        ``range(n_samples)`` with replacement; an empty integer array when
        ``n_samples`` is 0.
    """
    if n_samples != 0:
        # Uses NumPy's global random state.
        return np.random.choice(n_samples, size=n_samples, replace=True)
    return np.array([], dtype=int)

generate_feature_subset_indices

generate_feature_subset_indices(n_features)

Generate random indices for a feature subset (sqrt of total features).

Parameters:

Name Type Description Default
n_features int

Number of total features.

required

Returns:

Type Description
ndarray

Array of subset feature indices.

Source code in glassbox/core/math.py
def generate_feature_subset_indices(n_features: int) -> np.ndarray:
    """
    Draw a random subset of feature indices of size floor(sqrt(n_features)).

    Parameters
    ----------
    n_features : int
        Number of total features.

    Returns
    -------
    np.ndarray
        Distinct feature indices sampled without replacement; at least one
        index is returned unless ``n_features`` is 0, in which case the
        result is an empty integer array.
    """
    if n_features == 0:
        return np.array([], dtype=int)

    # Never sample fewer than one feature.
    subset_size = max(1, int(np.sqrt(n_features)))
    # Uses NumPy's global random state.
    return np.random.choice(n_features, size=subset_size, replace=False)

calc_skew

calc_skew(arr)

Calculate the skewness of a 1D array.

Parameters:

Name Type Description Default
arr ndarray

Numeric array of shape (n_samples,).

required

Returns:

Type Description
float

Skewness value.

Source code in glassbox/core/math.py
def calc_skew(arr: np.ndarray) -> float:
    """
    Calculate the bias-adjusted sample skewness of a 1D array.

    Parameters
    ----------
    arr : np.ndarray
        Numeric array of shape (n_samples,).

    Returns
    -------
    float
        Skewness value; 0.0 when the array has two elements or fewer, or
        when its standard deviation is zero.
    """
    n = len(arr)
    if n <= 2:
        return 0.0

    spread = calc_std(arr)
    if spread == 0:
        # All values identical: the standardised deviations are undefined.
        return 0.0

    centre = calc_mean(arr)
    standardised = (arr - centre) / spread
    # Small-sample correction factor n / ((n - 1)(n - 2)).
    correction = n / ((n - 1) * (n - 2))
    return float(np.sum(standardised**3) * correction)

calc_kurtosis

calc_kurtosis(arr)

Calculate the kurtosis of a 1D array.

Parameters:

Name Type Description Default
arr ndarray

Numeric array of shape (n_samples,).

required

Returns:

Type Description
float

Kurtosis value.

Source code in glassbox/core/math.py
def calc_kurtosis(arr: np.ndarray) -> float:
    """
    Calculate the bias-adjusted sample excess kurtosis of a 1D array.

    Parameters
    ----------
    arr : np.ndarray
        Numeric array of shape (n_samples,).

    Returns
    -------
    float
        Kurtosis value; 0.0 when the array has three elements or fewer, or
        when its standard deviation is zero.
    """
    n = len(arr)
    if n <= 3:
        return 0.0

    spread = calc_std(arr)
    if spread == 0:
        return 0.0

    centre = calc_mean(arr)
    fourth_power_sum = np.sum((arr - centre) ** 4)

    # Small-sample scaling of the fourth-moment term and the offset that is
    # subtracted from it.
    scale = (n * (n + 1)) / ((n - 1) * (n - 2) * (n - 3))
    offset = (3 * ((n - 1) ** 2)) / ((n - 2) * (n - 3))
    return float(scale * (fourth_power_sum / (spread**4)) - offset)

calc_pearson

calc_pearson(arr_x, arr_y)

Calculate Pearson correlation coefficient between two numerical arrays.

Parameters:

Name Type Description Default
arr_x ndarray

First numeric array of shape (n_samples,).

required
arr_y ndarray

Second numeric array of shape (n_samples,).

required

Returns:

Type Description
float

Pearson correlation coefficient.

Source code in glassbox/core/math.py
def calc_pearson(arr_x: np.ndarray, arr_y: np.ndarray) -> float:
    """
    Calculate the Pearson correlation coefficient between two numeric arrays.

    Parameters
    ----------
    arr_x : np.ndarray
        First numeric array of shape (n_samples,).
    arr_y : np.ndarray
        Second numeric array of shape (n_samples,).

    Returns
    -------
    float
        Pearson correlation coefficient; 0.0 when ``arr_x`` has one element
        or fewer, or when either array has zero spread.
    """
    n = len(arr_x)
    if n <= 1:
        return 0.0

    # Deviations of each series from its own mean.
    dev_x = arr_x - np.sum(arr_x) / n
    dev_y = arr_y - np.sum(arr_y) / n

    co_deviation = np.sum(dev_x * dev_y)
    scale = np.sqrt(np.sum(dev_x**2) * np.sum(dev_y**2))

    # A constant series has no spread, so the ratio is undefined; report 0.0.
    if scale == 0:
        return 0.0
    return float(co_deviation / scale)

calc_cramers_v

calc_cramers_v(arr_x, arr_y)

Calculate Cramer's V statistic for categorical-categorical association between 2 arrays.

Parameters:

Name Type Description Default
arr_x ndarray

First nominal array of shape (n_samples,).

required
arr_y ndarray

Second nominal array of shape (n_samples,).

required

Returns:

Type Description
float

Cramer's V score between 0.0 and 1.0.

Source code in glassbox/core/math.py
def calc_cramers_v(arr_x: np.ndarray, arr_y: np.ndarray) -> float:
    """
    Calculate Cramer's V, a measure of association between two categorical arrays.

    Parameters
    ----------
    arr_x : np.ndarray
        First nominal array of shape (n_samples,).
    arr_y : np.ndarray
        Second nominal array of shape (n_samples,).

    Returns
    -------
    float
        Cramer's V score between 0.0 and 1.0; 0.0 when the input is empty or
        when either array has fewer than two distinct categories.
    """
    n = len(arr_x)
    if n == 0:
        return 0.0

    # Encode each array as integer category codes.
    x_levels, x_codes = np.unique(arr_x, return_inverse=True)
    y_levels, y_codes = np.unique(arr_y, return_inverse=True)

    n_cols, n_rows = len(x_levels), len(y_levels)
    if n_cols <= 1 or n_rows <= 1:
        return 0.0

    # Contingency table: rows follow arr_y categories, columns arr_x categories.
    observed = np.zeros((n_rows, n_cols))
    np.add.at(observed, (y_codes, x_codes), 1)

    # Expected counts under independence, from the row and column totals.
    expected = np.outer(observed.sum(axis=1), observed.sum(axis=0)) / n

    with np.errstate(divide="ignore", invalid="ignore"):
        cell_terms = ((observed - expected) ** 2) / expected
        cell_terms[expected == 0] = 0

    chi2 = np.sum(cell_terms)

    # Both dimensions are at least 2 here, so the divisor is at least 1.
    smaller_dim = min(n_cols - 1, n_rows - 1)
    return float(np.sqrt((chi2 / n) / smaller_dim))

calc_percentile

calc_percentile(arr, p)

Calculate the p-th percentile of an array using linear interpolation between the two nearest sorted values.

Parameters:

Name Type Description Default
arr ndarray

Numeric array to compute the percentile of.

required
p float

Percentile to compute, in the range 0–100.

required

Returns:

Type Description
float

Calculated percentile.

Source code in glassbox/core/math.py
def calc_percentile(arr: np.ndarray, p: float) -> float:
    """
    Calculate the p-th percentile of an array using linear interpolation.

    Parameters
    ----------
    arr : np.ndarray
        Numeric array to compute the percentile of.
    p : float
        Percentile to compute, in the range 0-100.

    Returns
    -------
    float
        Calculated percentile, or 0.0 when ``arr`` is empty.

    Raises
    ------
    ValueError
        If ``p`` is outside the range [0, 100] (including NaN).
    """
    # Without this check a negative p yields a negative index that silently
    # wraps to the other end of the array, and p > 100 can index past the end.
    if not 0.0 <= p <= 100.0:
        raise ValueError(f"p must be in the range [0, 100], got {p!r}")

    sorted_col = np.sort(arr)
    n = len(sorted_col)
    if n == 0:
        return 0.0

    # Fractional position of the percentile within the sorted values.
    idx = (n - 1) * p / 100.0
    idx_int = int(idx)
    if idx_int == n - 1:
        # p == 100: there is no upper neighbour to interpolate towards.
        return float(sorted_col[idx_int])

    fraction = idx - idx_int
    return float(
        sorted_col[idx_int] + fraction * (sorted_col[idx_int + 1] - sorted_col[idx_int])
    )

calc_iqr

calc_iqr(arr)

Calculate outlier bounds from the Interquartile Range (IQR): Q1 − 1.5 × IQR and Q3 + 1.5 × IQR.

Parameters:

Name Type Description Default
arr ndarray

Array to bound.

required

Returns:

Type Description
Tuple

Tuple (lower, upper) containing the lower bound Q1 − 1.5 × IQR and the upper bound Q3 + 1.5 × IQR.

Source code in glassbox/core/math.py
def calc_iqr(arr: np.ndarray) -> Tuple[float, float]:
    """
    Calculate outlier bounds derived from the interquartile range (IQR).

    Parameters
    ----------
    arr : np.ndarray
        Array to bound.

    Returns
    -------
    Tuple
        ``(lower, upper)`` where ``lower = Q1 - 1.5 * IQR`` and
        ``upper = Q3 + 1.5 * IQR``; ``(0.0, 0.0)`` when the array is empty.
    """
    if len(arr) == 0:
        return 0.0, 0.0

    lower_quartile = calc_percentile(arr, 25.0)
    upper_quartile = calc_percentile(arr, 75.0)
    # Margin of 1.5 * IQR on either side of the quartiles.
    margin = 1.5 * (upper_quartile - lower_quartile)
    return float(lower_quartile - margin), float(upper_quartile + margin)

calc_split_gain

calc_split_gain(
    parent_cost,
    left_cost,
    right_cost,
    n_parent,
    n_left,
    n_right,
)

Calculate the information gain or variance reduction of a split.

Parameters:

Name Type Description Default
parent_cost float

Cost of the parent node.

required
left_cost float

Cost of the left child node.

required
right_cost float

Cost of the right child node.

required
n_parent int

Number of samples in the parent node.

required
n_left int

Number of samples in the left child node.

required
n_right int

Number of samples in the right child node.

required

Returns:

Type Description
float

The calculated gain.

Source code in glassbox/core/math.py
def calc_split_gain(
    parent_cost: float,
    left_cost: float,
    right_cost: float,
    n_parent: int,
    n_left: int,
    n_right: int,
) -> float:
    """
    Calculate the information gain or variance reduction of a split.

    The gain is the parent cost minus the sample-weighted average of the
    two child costs.

    Parameters
    ----------
    parent_cost : float
        Cost of the parent node.
    left_cost : float
        Cost of the left child node.
    right_cost : float
        Cost of the right child node.
    n_parent : int
        Number of samples in the parent node.
    n_left : int
        Number of samples in the left child node.
    n_right : int
        Number of samples in the right child node.

    Returns
    -------
    float
        The calculated gain, or 0.0 when the parent node has no samples.
    """
    # The child weights are undefined for an empty parent; report no gain
    # rather than raising ZeroDivisionError.
    if n_parent == 0:
        return 0.0

    weight_left = n_left / n_parent
    weight_right = n_right / n_parent
    child_cost = (weight_left * left_cost) + (weight_right * right_cost)
    return float(parent_cost - child_cost)

calc_gini_impurity

calc_gini_impurity(arr)

Calculate the Gini impurity of an array of categorical labels.

Parameters:

Name Type Description Default
arr ndarray

Array of categorical labels, shape (n_samples,).

required

Returns:

Type Description
float

Calculated Gini impurity.

Source code in glassbox/core/math.py
def calc_gini_impurity(arr: np.ndarray) -> float:
    """
    Calculate the Gini impurity of an array of categorical labels.

    Parameters
    ----------
    arr : np.ndarray
        Array of categorical labels, shape (n_samples,).

    Returns
    -------
    float
        One minus the sum of squared class proportions, or 0.0 when the
        array is empty.
    """
    total = len(arr)
    if total == 0:
        return 0.0
    class_counts = np.unique(arr, return_counts=True)[1]
    proportions = class_counts / total
    return float(1.0 - np.sum(proportions**2))

calc_euclidean

calc_euclidean(x, y)

Calculate the Euclidean distance between two vectors.

Parameters:

Name Type Description Default
x ndarray

First numeric array.

required
y ndarray

Second numeric array.

required

Returns:

Type Description
float

Euclidean distance.

Source code in glassbox/core/math.py
def calc_euclidean(x: np.ndarray, y: np.ndarray) -> float:
    """
    Calculate the Euclidean (straight-line) distance between two vectors.

    Parameters
    ----------
    x : np.ndarray
        First numeric array.
    y : np.ndarray
        Second numeric array.

    Returns
    -------
    float
        Square root of the sum of squared element-wise differences.
    """
    delta = x - y
    squared_length = np.sum(delta**2)
    return float(np.sqrt(squared_length))

calc_manhattan

calc_manhattan(x, y)

Calculate the Manhattan distance between two vectors.

Parameters:

Name Type Description Default
x ndarray

First numeric array.

required
y ndarray

Second numeric array.

required

Returns:

Type Description
float

Manhattan distance.

Source code in glassbox/core/math.py
def calc_manhattan(x: np.ndarray, y: np.ndarray) -> float:
    """
    Calculate the Manhattan (city-block) distance between two vectors.

    Parameters
    ----------
    x : np.ndarray
        First numeric array.
    y : np.ndarray
        Second numeric array.

    Returns
    -------
    float
        Sum of the absolute element-wise differences.
    """
    delta = x - y
    absolute_steps = np.abs(delta)
    return float(np.sum(absolute_steps))