Skip to content

glassbox.frame.dataset

The Dataset class — a lightweight, named-column wrapper around a 2-D NumPy array.


Dataset

Dataset(data, columns)

Container for data matrices with multiple helper functions.

Attributes:

Name Type Description
data ndarray

Data arranged as a 2D matrix. To access columns, take the transpose.

columns List[str]

Names of the columns stored in a list.

shape Tuple[int, int]

Shape of the dataset (# of rows, # of columns).

Parameters:

Name Type Description Default
data ndarray

Data arranged as a 2D matrix - (n_rows, n_cols)

required
columns List[str]

Column names; the number of names must equal data.shape[1].

required
Source code in glassbox/frame/dataset.py
def __init__(self, data: np.ndarray, columns: List[str]):
    """
    Build a dataset from a 2-D array and one name per column.

    Parameters
    ----------
    data: np.ndarray
        Data arranged as a 2D matrix - (n_rows, n_cols)
    columns: List[str]
        column names - must match data.shape[1]

    Raises
    ------
    ValueError
        If ``data`` is not two-dimensional, or if the number of column
        names differs from ``data.shape[1]``.
    """
    if data.ndim != 2:
        # Report the actual shape; the label says "shape", so print the shape.
        raise ValueError(f"data must be a 2D matrix, got shape {data.shape}")
    if len(columns) != data.shape[1]:
        # Adjacent f-strings are concatenated verbatim, so the separating
        # space has to be written explicitly.
        raise ValueError(
            f"Number of columns ({len(columns)}) does not match "
            f"data width ({data.shape[1]})"
        )
    self._data = data
    # Keep a private copy of the names so that later in-place edits of this
    # dataset's column list can never modify the list the caller passed in.
    self._columns = list(columns)

get_columns

get_columns(names)

Retrieve data for specific columns by name.

Parameters:

Name Type Description Default
names str | List[str]

A single column name or a list of column names.

required

Returns:

Type Description
Dataset

A new Dataset containing a copy of the requested columns.

Source code in glassbox/frame/dataset.py
def get_columns(self, names: str | List[str]) -> "Dataset":
    """
    Retrieve data for specific columns by name.

    Parameters
    ----------
    names : str | List[str]
        A single column name or a list of column names.

    Returns
    -------
    Dataset
        A new Dataset holding a copy of the requested columns.
    """
    names = self._normalize_names(names)
    indices = [self._get_column_index(name) for name in names]
    # Give the new Dataset its own list of names so it cannot alias a list
    # that the caller still holds.
    return Dataset(self._data[:, indices].copy(), list(names))

get_rows

get_rows(indices)

Get specific rows based on indices and return as a new dataset.

Parameters:

Name Type Description Default
indices ndarray

Integer array of row coordinates.

required

Returns:

Type Description
Dataset

A new Dataset instance containing the selected rows.

Source code in glassbox/frame/dataset.py
def get_rows(self, indices: np.ndarray) -> "Dataset":
    """
    Get specific rows based on indices and return as a new dataset.

    Parameters
    ----------
    indices : np.ndarray
        Integer array of row coordinates.

    Returns
    -------
    Dataset
        A new Dataset instance containing a copy of the selected rows and
        its own copy of the column names.
    """
    # Copy the column list along with the data: if both datasets shared one
    # list object, an in-place change to one dataset's columns would leave
    # the other with names that no longer match its data width.
    return Dataset(self._data[indices].copy(), list(self._columns))

update_column

update_column(name, new_data)

Update the array content for an existing column.

Parameters:

Name Type Description Default
name str

Target column to update.

required
new_data ndarray

Array values to overwrite the column.

required

Returns:

Type Description
None
Source code in glassbox/frame/dataset.py
def update_column(self, name: str, new_data: np.ndarray) -> None:
    """
    Overwrite the values of an existing column in place.

    Parameters
    ----------
    name : str
        Name of the column whose values are replaced.
    new_data : np.ndarray
        Replacement values; flattened to 1-D before assignment.

    Returns
    -------
    None
    """
    column_position = self._get_column_index(name)
    # Flatten first so that an (n, 1) column vector is accepted as well as
    # a plain 1-D array.
    flat_values = new_data.ravel()
    self._data[:, column_position] = flat_values

drop_columns

drop_columns(names)

Remove columns by name from the dataset.

Parameters:

Name Type Description Default
names str | List[str]

Target column or list of columns to remove.

required

Returns:

Type Description
None

Raises:

Type Description
KeyError

if one of the columns to drop doesn't exist in the dataset

Source code in glassbox/frame/dataset.py
def drop_columns(self, names: str | List[str]) -> None:
    """
    Remove columns by name from the dataset.

    Parameters
    ----------
    names : str | List[str]
        Target column or list of columns to remove.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        if one of the columns to drop doesn't exist in the dataset
    """
    to_drop = set(self._normalize_names(names))
    keep_mask = [i for i, col in enumerate(self._columns) if col not in to_drop]
    missing = to_drop - set(self._columns)
    if missing:
        raise KeyError(f"columns not found: {missing}")
    self._data = self._data[:, keep_mask]
    self._columns = [self._columns[i] for i in keep_mask]

add_columns

add_columns(new_dataset)

Add new columns alongside the dataset arrays.

Parameters:

Name Type Description Default
new_dataset Dataset

New data to append.

required

Returns:

Type Description
None

Raises:

Type Description
ValueError

If a column to add already exists, or if the new data has a different number of rows than the dataset.

Source code in glassbox/frame/dataset.py
def add_columns(self, new_dataset: "Dataset") -> None:
    """
    Add new columns alongside the dataset arrays.

    Parameters
    ----------
    new_dataset : Dataset
        New data to append. Its columns are placed to the right of the
        existing ones.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If column to add already exists, or if ``new_dataset`` has a
        different number of rows than this dataset.
    """
    # Set membership keeps the duplicate check linear in the column count.
    existing = set(self._columns)
    duplicates = [n for n in new_dataset.columns if n in existing]
    if duplicates:
        raise ValueError(f"Columns already exist: {duplicates}")

    if new_dataset.shape[0] != self._data.shape[0]:
        raise ValueError(
            f"Row count mismatch: dataset has {self._data.shape[0]} rows, "
            f"new data has {new_dataset.shape[0]}"
        )

    self._data = np.hstack([self._data, new_dataset.data])
    # Rebind to a fresh list instead of extending in place: the current
    # list object may also be referenced by the caller or by another
    # dataset, and must not change underneath them.
    self._columns = [*self._columns, *new_dataset.columns]