Skip to content

glassbox.cleaner.imputers

SimpleImputer — replaces missing values using mean, median, mode, or a constant.


ImputationStrategy

Bases: Enum

Strategies available for imputing missing values.

SimpleImputer

SimpleImputer(strategy=MEAN, constant_value=0.0)

Bases: BaseTransformer

Replaces missing values using a specified statistical strategy.

Notes

This imputer supports basic strategies such as mean, median, mode, and a constant value.

Parameters:

Name Type Description Default
strategy ImputationStrategy

The strategy used for missing value imputation.

ImputationStrategy.MEAN
constant_value Union[float, str, None]

The value to use when strategy is CONSTANT.

0.0
Source code in glassbox/cleaner/imputers.py
def __init__(
    self,
    strategy: ImputationStrategy = ImputationStrategy.MEAN,
    constant_value: Union[float, str, None] = 0.0,
):
    """
    Store the imputation configuration; nothing is computed here.

    Parameters
    ----------
    strategy : ImputationStrategy, default=ImputationStrategy.MEAN
        How the replacement for missing entries is chosen.
    constant_value : Union[float, str, None], default=0.0
        Replacement used when ``strategy`` is CONSTANT.
    """
    # Learned replacements, keyed by the stringified column index.
    # Starts empty and is filled in by ``fit``.
    self._fill_values: Dict[str, Union[float, str]] = {}
    self._constant_value = constant_value
    self._strategy = strategy

fit

fit(X)

Learn the imputation values from the training data.

Parameters:

Name Type Description Default
X ndarray

Input array of shape (n_samples, n_features).

required

Returns:

Type Description
Self

Fitted imputer instance.

Source code in glassbox/cleaner/imputers.py
def fit(self, X: np.ndarray) -> Self:
    """
    Learn the imputation values from the training data.

    Values learned by a previous call are discarded, so the fitted state
    always describes exactly the columns of ``X``. The new values are
    only stored once every column has been processed; a failing fit
    leaves the previous state untouched.

    Parameters
    ----------
    X : np.ndarray
        Input array of shape (n_samples, n_features).

    Returns
    -------
    Self
        Fitted imputer instance.

    Raises
    ------
    ValueError
        If the configured strategy is not MEAN, MEDIAN, MODE or CONSTANT.

    Notes
    -----
    With CONSTANT, every column receives the configured constant, even a
    column that has no observed values. With the statistical strategies,
    a column without any observed value falls back to a fill value of 0.0.
    """
    n_features = X.shape[1]
    strategy = self._strategy

    # The strategy does not change per column, so resolve it once.
    if strategy == ImputationStrategy.CONSTANT:
        # The constant does not depend on the data at all.
        self._fill_values = {
            str(col_idx): self._constant_value for col_idx in range(n_features)
        }
        return self
    if strategy == ImputationStrategy.MEAN:
        statistic = calc_mean
    elif strategy == ImputationStrategy.MEDIAN:
        statistic = calc_median
    elif strategy == ImputationStrategy.MODE:
        statistic = calc_mode
    else:
        raise ValueError(f"Unknown strategy: {self._strategy}")

    fill_values: Dict[str, Union[float, str]] = {}
    for col_idx in range(n_features):
        col = X[:, col_idx]

        if np.issubdtype(col.dtype, np.number):
            observed = ~np.isnan(col)
        else:
            # Non-numeric columns may hold None or float NaN as missing
            # markers. dtype=bool keeps the mask a valid index even when
            # the column has zero rows (an empty list would otherwise
            # become a float64 array).
            observed = np.array(
                [
                    v is not None and not (isinstance(v, float) and np.isnan(v))
                    for v in col
                ],
                dtype=bool,
            )
        col_clean = col[observed]

        if len(col_clean) == 0:
            # Nothing to compute a statistic from.
            fill_values[str(col_idx)] = 0.0
        else:
            fill_values[str(col_idx)] = statistic(col_clean)

    self._fill_values = fill_values
    return self

transform

transform(X)

Impute missing values in the given dataset.

Parameters:

Name Type Description Default
X ndarray

Input array of shape (n_samples, n_features).

required

Returns:

Type Description
ndarray

Transformed array with missing values imputed.

Source code in glassbox/cleaner/imputers.py
def transform(self, X: np.ndarray) -> np.ndarray:
    """
    Impute missing values in the given dataset.

    Parameters
    ----------
    X : np.ndarray
        Input array of shape (n_samples, n_features).

    Returns
    -------
    np.ndarray
        Transformed array with missing values imputed.
    """
    X_out = X.copy()
    n_features = X.shape[1]

    for col_idx in range(n_features):
        col = X_out[:, col_idx]
        fill_val = self._fill_values.get(str(col_idx), 0.0)

        if np.issubdtype(col.dtype, np.number):
            mask = np.isnan(col)
            X_out[mask, col_idx] = fill_val
        else:
            mask = np.array(
                [v is None or (isinstance(v, float) and np.isnan(v)) for v in col]
            )
            X_out[mask, col_idx] = fill_val

    return X_out