Skip to content

glassbox.cleaner

Scikit-learn-style data cleaning transformers.


LabelEncoder

LabelEncoder()

Bases: BaseTransformer

Encode target labels with values between 0 and n_classes-1. Missing labels (None or NaN) stay NaN, and labels not seen during fit are encoded as -1.

Source code in glassbox/cleaner/encoders.py
def __init__(self):
    """Create an unfitted encoder whose label-to-code mapping is empty."""
    # Keys are str(label); values are the integer codes assigned by fit().
    self._mapping: Dict[str, int] = dict()

fit

fit(X)

Learn the vocabulary of the labels.

Parameters:

Name Type Description Default
X ndarray

Input array of shape (n_samples, n_features).

required

Returns:

Type Description
Self

Fitted encoder instance.

Source code in glassbox/cleaner/encoders.py
def fit(self, X: np.ndarray) -> Self:
    """
    Build the label-to-code mapping from the distinct values in ``X``.

    Parameters
    ----------
    X : np.ndarray
        Array of labels. It is flattened first, so any shape is accepted.

    Returns
    -------
    Self
        This encoder, with its mapping replaced.

    Notes
    -----
    Missing entries (``None`` or NaN) receive no code. The remaining distinct
    values are ordered by ``np.unique`` and keyed by their ``str()`` form.
    """
    labels = X.ravel()

    if np.issubdtype(labels.dtype, np.number):
        # Numeric input: NaN is the only missing marker to drop.
        present = labels[np.logical_not(np.isnan(labels))]
    else:
        kept = []
        for item in labels:
            if item is None:
                continue
            if isinstance(item, float) and np.isnan(item):
                continue
            kept.append(item)
        present = np.array(kept)

    mapping = {}
    for code, label in enumerate(np.unique(present)):
        mapping[str(label)] = code
    self._mapping = mapping
    return self

transform

transform(X)

Transform labels to normalized encoding.

Parameters:

Name Type Description Default
X ndarray

Input array of shape (n_samples, n_features).

required

Returns:

Type Description
ndarray

Transformed array properly encoded.

Source code in glassbox/cleaner/encoders.py
def transform(self, X: np.ndarray) -> np.ndarray:
    """
    Replace each label in ``X`` with the integer code learned by ``fit``.

    Parameters
    ----------
    X : np.ndarray
        Array of labels of any shape.

    Returns
    -------
    np.ndarray
        Float array with the same shape as ``X``. Missing entries (``None``
        or NaN) stay NaN, and labels absent from the mapping become ``-1``.
    """
    X_flat = X.ravel()
    # Float output so that NaN can mark missing entries; every slot is
    # written by the loop below.
    res = np.empty(X_flat.shape, dtype=float)
    lookup = self._mapping.get

    for i, val in enumerate(X_flat):
        # np.floating also covers NumPy scalars such as float32 that do not
        # subclass Python's float, so their NaNs are not mistaken for labels.
        is_nan = isinstance(val, (float, np.floating)) and np.isnan(val)
        if val is None or is_nan:
            res[i] = np.nan
        else:
            res[i] = lookup(str(val), -1)

    return res.reshape(X.shape)

OneHotEncoder

OneHotEncoder()

Bases: BaseTransformer

Encode categorical features as a one-hot numeric array.

Source code in glassbox/cleaner/encoders.py
def __init__(self):
    """Create an unfitted encoder with no category levels recorded."""
    # Column index -> category levels stored for that column by fit().
    self._categories: Dict[int, List[str]] = dict()

fit

fit(X)

Learn the categorical levels for encoding.

Parameters:

Name Type Description Default
X ndarray

Input array of shape (n_samples, n_features).

required

Returns:

Type Description
Self

Fitted encoder instance.

Source code in glassbox/cleaner/encoders.py
def fit(self, X: np.ndarray) -> Self:
    """
    Learn the distinct category levels of every column of ``X``.

    Parameters
    ----------
    X : np.ndarray
        Input array of shape (n_samples, n_features).

    Returns
    -------
    Self
        Fitted encoder instance.

    Notes
    -----
    Missing entries (``None`` or NaN) are never recorded as a level. Levels
    are stored per column index in the order produced by ``np.unique``.
    """
    # Build a fresh table so that refitting on a narrower array does not keep
    # levels learned for columns that no longer exist.
    categories = {}

    for col_idx in range(X.shape[1]):
        col = X[:, col_idx]
        if np.issubdtype(col.dtype, np.number):
            col_clean = col[~np.isnan(col)]
        else:
            col_clean = np.array(
                [
                    x
                    for x in col
                    if x is not None and not (isinstance(x, float) and np.isnan(x))
                ]
            )
        categories[col_idx] = list(np.unique(col_clean))

    self._categories = categories
    return self

transform

transform(X)

Transform the dataset into a one-hot encoded representation.

Parameters:

Name Type Description Default
X ndarray

Input array of shape (n_samples, n_features).

required

Returns:

Type Description
ndarray

Transformed array properly encoded.

Source code in glassbox/cleaner/encoders.py
def transform(self, X: np.ndarray) -> np.ndarray:
    """
    Expand every column of ``X`` into one 0/1 indicator column per level.

    Parameters
    ----------
    X : np.ndarray
        Input array of shape (n_samples, n_features).

    Returns
    -------
    np.ndarray
        Float array with one column per stored level, ordered by input
        column and then by level. It has zero columns when no levels are
        stored for any input column.
    """
    indicators = []

    for col_idx in range(X.shape[1]):
        levels = self._categories.get(col_idx, [])
        if not levels:
            # No levels stored for this column: it contributes nothing.
            continue

        column = X[:, col_idx]
        # Element-wise equality yields a boolean mask per level; a NaN level
        # is skipped and never gets an indicator column.
        indicators.extend(
            (column == level).astype(float)
            for level in levels
            if not (isinstance(level, float) and np.isnan(level))
        )

    if indicators:
        return np.column_stack(indicators)
    # Nothing to encode: keep the row count and emit zero columns.
    return np.empty((X.shape[0], 0))

ImputationStrategy

Bases: Enum

Strategies available for imputing missing values.

SimpleImputer

SimpleImputer(strategy=MEAN, constant_value=0.0)

Bases: BaseTransformer

Replaces missing values using a specified statistical strategy.

Notes

This imputer supports basic strategies like mean, median, mode, or a constant value.

Parameters:

Name Type Description Default
strategy ImputationStrategy

The strategy used for missing value imputation.

ImputationStrategy.MEAN
constant_value Union[float, str, None]

The value to use when strategy is CONSTANT.

0.0
Source code in glassbox/cleaner/imputers.py
def __init__(
    self,
    strategy: ImputationStrategy = ImputationStrategy.MEAN,
    constant_value: Union[float, str, None] = 0.0,
):
    """
    Configure the imputer; no fill values exist until ``fit`` is called.

    Parameters
    ----------
    strategy : ImputationStrategy, default=ImputationStrategy.MEAN
        The strategy used for missing value imputation.
    constant_value : Union[float, str, None], default=0.0
        The value to use when strategy is CONSTANT.
    """
    # Per-column fill values keyed by str(column index); populated by fit().
    self._fill_values: Dict[str, Union[float, str]] = dict()
    self._constant_value = constant_value
    self._strategy = strategy

fit

fit(X)

Learn the imputation values from the training data.

Parameters:

Name Type Description Default
X ndarray

Input array of shape (n_samples, n_features).

required

Returns:

Type Description
Self

Fitted imputer instance.

Source code in glassbox/cleaner/imputers.py
def fit(self, X: np.ndarray) -> Self:
    """
    Learn one fill value per column from the training data.

    Parameters
    ----------
    X : np.ndarray
        Input array of shape (n_samples, n_features).

    Returns
    -------
    Self
        Fitted imputer instance.

    Raises
    ------
    ValueError
        If the configured strategy is not one of MEAN, MEDIAN, MODE or
        CONSTANT and a column with observed values is encountered.

    Notes
    -----
    Missing entries (``None`` or NaN) are ignored when computing a fill
    value. With the CONSTANT strategy every column, including one that is
    entirely missing, is filled with ``constant_value``. For the other
    strategies a column with no observed values falls back to ``0.0``.
    """
    # Start from an empty table so a refit never keeps values for columns
    # that the new training data does not have.
    fill_values = {}

    for col_idx in range(X.shape[1]):
        key = str(col_idx)

        # A constant fill does not depend on the data, so it must also apply
        # to a column whose entries are all missing.
        if self._strategy == ImputationStrategy.CONSTANT:
            fill_values[key] = self._constant_value
            continue

        col = X[:, col_idx]
        if np.issubdtype(col.dtype, np.number):
            present = ~np.isnan(col)
        else:
            # dtype=bool keeps the mask a valid index even with zero rows.
            present = np.array(
                [
                    v is not None and not (isinstance(v, float) and np.isnan(v))
                    for v in col
                ],
                dtype=bool,
            )
        col_clean = col[present]

        if len(col_clean) == 0:
            # Nothing observed to summarise; fall back to 0.0.
            fill_values[key] = 0.0
        elif self._strategy == ImputationStrategy.MEAN:
            fill_values[key] = calc_mean(col_clean)
        elif self._strategy == ImputationStrategy.MEDIAN:
            fill_values[key] = calc_median(col_clean)
        elif self._strategy == ImputationStrategy.MODE:
            fill_values[key] = calc_mode(col_clean)
        else:
            raise ValueError(f"Unknown strategy: {self._strategy}")

    self._fill_values = fill_values
    return self

transform

transform(X)

Impute missing values in the given dataset.

Parameters:

Name Type Description Default
X ndarray

Input array of shape (n_samples, n_features).

required

Returns:

Type Description
ndarray

Transformed array with missing values imputed.

Source code in glassbox/cleaner/imputers.py
def transform(self, X: np.ndarray) -> np.ndarray:
    """
    Return a copy of ``X`` with missing entries replaced per column.

    Parameters
    ----------
    X : np.ndarray
        Input array of shape (n_samples, n_features).

    Returns
    -------
    np.ndarray
        Copy of ``X`` in which every missing entry (NaN, or ``None`` for
        non-numeric data) holds its column's fill value. ``X`` itself is
        left untouched.
    """
    filled = X.copy()

    for col_idx in range(X.shape[1]):
        column = filled[:, col_idx]
        # Columns without a stored fill value fall back to 0.0.
        replacement = self._fill_values.get(str(col_idx), 0.0)

        if np.issubdtype(column.dtype, np.number):
            missing = np.isnan(column)
        else:
            missing = np.array(
                [
                    item is None or (isinstance(item, float) and np.isnan(item))
                    for item in column
                ]
            )
        filled[missing, col_idx] = replacement

    return filled

OutlierCapper

OutlierCapper()

Bases: BaseTransformer

Identifies and caps numerical outliers using per-column lower and upper bounds learned during fit (computed with calc_iqr).

Source code in glassbox/cleaner/outliers.py
def __init__(self):
    """Create an unfitted capper with no bounds recorded."""
    # str(column index) -> {"lower": ..., "upper": ...}, populated by fit().
    self._bounds: Dict[str, Dict[str, float]] = dict()

fit

fit(X)

Detect boundaries for outlier capping from the training data.

Parameters:

Name Type Description Default
X ndarray

Input array of shape (n_samples, n_features).

required

Returns:

Type Description
Self

Fitted outlier capper instance.

Source code in glassbox/cleaner/outliers.py
def fit(self, X: np.ndarray) -> Self:
    """
    Learn a lower and an upper capping bound for every column of ``X``.

    Parameters
    ----------
    X : np.ndarray
        Input array of shape (n_samples, n_features).

    Returns
    -------
    Self
        Fitted outlier capper instance.

    Notes
    -----
    Missing entries (``None`` or NaN) are excluded before the bounds are
    computed. A column with no observed values gets the bounds
    ``lower=0.0`` and ``upper=1.0``.
    """
    # Build a fresh table so that refitting on a narrower array does not keep
    # bounds learned for columns that no longer exist.
    bounds = {}

    for col_idx in range(X.shape[1]):
        col = X[:, col_idx]

        if np.issubdtype(col.dtype, np.number):
            col_clean = col[~np.isnan(col)]
        else:
            col_clean = np.array(
                [
                    x
                    for x in col
                    if x is not None and not (isinstance(x, float) and np.isnan(x))
                ]
            )

        if len(col_clean) == 0:
            lower, upper = 0.0, 1.0
        else:
            # calc_iqr is defined elsewhere; it is used here as returning a
            # (lower, upper) pair -- presumably IQR-based fences, TODO confirm.
            lower, upper = calc_iqr(col_clean)
        bounds[str(col_idx)] = {"lower": lower, "upper": upper}

    self._bounds = bounds
    return self

transform

transform(X)

Cap outliers in the input dataset.

Parameters:

Name Type Description Default
X ndarray

Input array of shape (n_samples, n_features).

required

Returns:

Type Description
ndarray

Transformed array with outliers capped.

Source code in glassbox/cleaner/outliers.py
def transform(self, X: np.ndarray) -> np.ndarray:
    """
    Return a copy of ``X`` with every column clipped to its stored bounds.

    Parameters
    ----------
    X : np.ndarray
        Input array of shape (n_samples, n_features).

    Returns
    -------
    np.ndarray
        Copy of ``X`` with outliers capped. NaN entries (numeric data) and
        ``None`` entries (non-numeric data) are left as they are, and a
        non-numeric column that cannot be converted to float is returned
        unchanged.
    """
    capped = X.copy()
    # Bounds applied to columns that have no stored entry.
    fallback = {"lower": 0.0, "upper": 1.0}

    for col_idx in range(X.shape[1]):
        limits = self._bounds.get(str(col_idx), fallback)
        lo, hi = limits["lower"], limits["upper"]
        column = capped[:, col_idx]

        if np.issubdtype(column.dtype, np.number):
            # Clip only the observed values so NaN stays NaN.
            keep = np.logical_not(np.isnan(column))
            capped[keep, col_idx] = np.clip(column[keep], lo, hi)
            continue

        keep = np.array([item is not None for item in column])
        try:
            capped[keep, col_idx] = np.clip(column[keep].astype(float), lo, hi)
        except ValueError:
            # The column holds values that cannot be read as floats;
            # deliberately leave it untouched.
            pass

    return capped

MinMaxScaler

MinMaxScaler()

Bases: BaseTransformer

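Transforms features by rescaling each one with the minimum and maximum learned during fit, so that this span maps to [0, 1]. A feature whose minimum equals its maximum is only shifted by that minimum.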

Source code in glassbox/cleaner/scalers.py
def __init__(self):
    """Create an unfitted scaler; both statistics start as empty arrays."""
    # Per-feature maximum and minimum, replaced by fit().
    self._max: np.ndarray = np.array([])
    self._min: np.ndarray = np.array([])

fit

fit(X)

Compute the minimum and maximum to be used for later scaling.

Parameters:

Name Type Description Default
X ndarray

Input array of shape (n_samples, n_features).

required

Returns:

Type Description
Self

Fitted scaler instance.

Source code in glassbox/cleaner/scalers.py
def fit(self, X: np.ndarray) -> Self:
    """
    Record the smallest and largest observed value of every column.

    Parameters
    ----------
    X : np.ndarray
        Input array of shape (n_samples, n_features).

    Returns
    -------
    Self
        Fitted scaler instance.

    Notes
    -----
    Missing entries (``None`` or NaN) are ignored. A column with no observed
    values is recorded with minimum ``0.0`` and maximum ``1.0``.
    """
    lows, highs = [], []

    for col_idx in range(X.shape[1]):
        column = X[:, col_idx]

        if np.issubdtype(column.dtype, np.number):
            observed = column[np.logical_not(np.isnan(column))]
        else:
            observed = np.array(
                [
                    item
                    for item in column
                    if not (
                        item is None
                        or (isinstance(item, float) and np.isnan(item))
                    )
                ]
            )

        if len(observed) == 0:
            lo, hi = 0.0, 1.0
        else:
            lo = float(np.min(observed))
            hi = float(np.max(observed))
        lows.append(lo)
        highs.append(hi)

    self._min = np.array(lows)
    self._max = np.array(highs)
    return self

transform

transform(X)

Scale features of X according to feature range.

Parameters:

Name Type Description Default
X ndarray

Input array of shape (n_samples, n_features).

required

Returns:

Type Description
ndarray

Transformed array properly scaled.

Source code in glassbox/cleaner/scalers.py
def transform(self, X: np.ndarray) -> np.ndarray:
    """
    Rescale every column of ``X`` with the fitted minimum and maximum.

    Parameters
    ----------
    X : np.ndarray
        Input array of shape (n_samples, n_features).

    Returns
    -------
    np.ndarray
        New array holding ``(X - min) / (max - min)`` per column. ``X``
        itself is not modified. For non-numeric input, ``None`` entries are
        left in place.
    """
    span = self._max - self._min
    # A feature with max == min has zero span; divide by 1 instead.
    span[span == 0.0] = 1.0

    if np.issubdtype(X.dtype, np.number):
        # Broadcasts the per-feature statistics across all rows.
        return (X - self._min) / span

    scaled = X.copy()
    for col_idx in range(scaled.shape[1]):
        column = scaled[:, col_idx]
        present = np.array([item is not None for item in column])
        shifted = column[present] - self._min[col_idx]
        scaled[present, col_idx] = shifted / span[col_idx]
    return scaled

StandardScaler

StandardScaler()

Bases: BaseTransformer

Standardizes features by removing the mean and scaling to unit variance.

Source code in glassbox/cleaner/scalers.py
def __init__(self):
    """Create an unfitted scaler; both statistics start as empty arrays."""
    # Per-feature standard deviation and mean, replaced by fit().
    self._std: np.ndarray = np.array([])
    self._mean: np.ndarray = np.array([])

fit

fit(X)

Compute the mean and standard deviation to be used for later scaling.

Parameters:

Name Type Description Default
X ndarray

Input array of shape (n_samples, n_features).

required

Returns:

Type Description
Self

Fitted scaler instance.

Source code in glassbox/cleaner/scalers.py
def fit(self, X: np.ndarray) -> Self:
    """
    Record a centre and a spread for every column of ``X``.

    Parameters
    ----------
    X : np.ndarray
        Input array of shape (n_samples, n_features).

    Returns
    -------
    Self
        Fitted scaler instance.

    Notes
    -----
    Missing entries (``None`` or NaN) are ignored. A column with no observed
    values is recorded with mean ``0.0`` and standard deviation ``1.0``. A
    spread that is not strictly positive is stored as ``1.0``.
    """
    centers, spreads = [], []

    for col_idx in range(X.shape[1]):
        column = X[:, col_idx]

        if np.issubdtype(column.dtype, np.number):
            observed = column[np.logical_not(np.isnan(column))]
        else:
            observed = np.array(
                [
                    item
                    for item in column
                    if not (
                        item is None
                        or (isinstance(item, float) and np.isnan(item))
                    )
                ]
            )

        if len(observed) == 0:
            centers.append(0.0)
            spreads.append(1.0)
            continue

        # calc_mean / calc_std are defined elsewhere; presumably the
        # arithmetic mean and standard deviation -- TODO confirm.
        centers.append(calc_mean(observed))
        spread = calc_std(observed)
        # Guard against dividing by zero in transform().
        spreads.append(spread if spread > 0 else 1.0)

    self._mean = np.array(centers)
    self._std = np.array(spreads)
    return self

transform

transform(X)

Perform standardization by centering and scaling.

Parameters:

Name Type Description Default
X ndarray

Input array of shape (n_samples, n_features).

required

Returns:

Type Description
ndarray

Transformed array properly scaled.

Source code in glassbox/cleaner/scalers.py
def transform(self, X: np.ndarray) -> np.ndarray:
    """
    Centre and scale every column of ``X`` with the fitted statistics.

    Parameters
    ----------
    X : np.ndarray
        Input array of shape (n_samples, n_features).

    Returns
    -------
    np.ndarray
        New array holding ``(X - mean) / std`` per column. ``X`` itself is
        not modified. NaN entries remain NaN, and for non-numeric input
        ``None`` entries are left in place.
    """
    if np.issubdtype(X.dtype, np.number):
        # Broadcasts the per-feature statistics across all rows.
        return (X - self._mean) / self._std

    # Non-numeric input may contain None, which cannot take part in
    # arithmetic, so each column is processed through a mask.
    standardized = X.copy()
    for col_idx in range(standardized.shape[1]):
        column = standardized[:, col_idx]
        present = np.array([item is not None for item in column])
        centered = column[present] - self._mean[col_idx]
        standardized[present, col_idx] = centered / self._std[col_idx]
    return standardized