`glassbox.frame.dataset`¶

The Dataset class — a lightweight, named-column wrapper around a 2-D NumPy array.

Dataset ¶

Dataset(data, columns)

Container for data matrices with multiple helper functions.

Attributes:

Name	Type	Description
`data`	`ndarray`	Data arranged as a 2D matrix. To access columns, take the transpose.
`columns`	`List[str]`	Names of the columns stored in a list.
`shape`	`Tuple[int, int]`	Shape of the dataset (# of rows, # of columns).

Parameters:

Name	Type	Description	Default
`data`	`ndarray`	Data arranged as a 2D matrix - (n_rows, n_cols)	required
`columns`	`List[str]`	column names - must match data.shape[1]	required

Source code in glassbox/frame/dataset.py

def __init__(self, data: np.ndarray, columns: List[str]):
    """
    Build a dataset from a 2-D array and the names of its columns.

    Parameters
    ----------
    data: np.ndarray
        Data arranged as a 2D matrix - (n_rows, n_cols)
    columns: List[str]
        column names - must match data.shape[1]

    Raises
    ------
    ValueError
        If ``data`` is not two-dimensional, or if the number of names
        differs from ``data.shape[1]``.
    """
    if data.ndim != 2:
        # Report the actual shape; the message promises a shape, not a rank.
        raise ValueError(f"data must be a 2D matrix, got shape {data.shape}")
    if len(columns) != data.shape[1]:
        raise ValueError(
            f"Number of columns ({len(columns)}) does not match "
            f"data width ({data.shape[1]})"
        )
    self._data = data
    # Store a private copy of the names so that later in-place edits of the
    # caller's list (or of this instance's list) cannot affect the other.
    self._columns = list(columns)

get_columns ¶

get_columns(names)

Retrieve data for specific columns by name.

Parameters:

Name	Type	Description	Default
`names`	`str \| List[str]`	A single column name or a list of column names.	required

Returns:

Type	Description
`Dataset`	A new Dataset holding a copy of the requested columns.

Source code in glassbox/frame/dataset.py

def get_columns(self, names: str | List[str]) -> "Dataset":
    """
    Retrieve data for specific columns by name.

    Parameters
    ----------
    names : str | List[str]
        A single column name or a list of column names.

    Returns
    -------
    Dataset
        A new Dataset holding a copy of the requested columns, in the
        order the names were given.
    """
    names = self._normalize_names(names)
    indices = [self._get_column_index(name) for name in names]
    # Hand the new Dataset its own list of names so it never shares a list
    # object with the caller's argument.
    return Dataset(self._data[:, indices].copy(), list(names))

get_rows ¶

get_rows(indices)

Get specific rows based on indices and return as a new dataset.

Parameters:

Name	Type	Description	Default
`indices`	`ndarray`	Integer array of row coordinates.	required

Returns:

Type	Description
`Dataset`	A new Dataset instance containing the selected rows.

Source code in glassbox/frame/dataset.py

def get_rows(self, indices: np.ndarray) -> "Dataset":
    """
    Get specific rows based on indices and return as a new dataset.

    Parameters
    ----------
    indices : np.ndarray
        Integer array of row coordinates.

    Returns
    -------
    Dataset
        A new Dataset instance containing a copy of the selected rows and
        its own copy of the column names.
    """
    # Copy the name list as well as the data: sharing the list object would
    # let an in-place column change on one dataset alter the other's names.
    return Dataset(self._data[indices].copy(), list(self._columns))

update_column ¶

update_column(name, new_data)

Update the array content for an existing column.

Parameters:

Name	Type	Description	Default
`name`	`str`	Target column to update.	required
`new_data`	`ndarray`	Array values to overwrite the column.	required

Returns:

Type	Description
`None`

Source code in glassbox/frame/dataset.py

def update_column(self, name: str, new_data: np.ndarray) -> None:
    """
    Overwrite the values of an existing column in place.

    Parameters
    ----------
    name : str
        Name of the column whose values are replaced.
    new_data : np.ndarray
        Replacement values; the array is flattened before assignment.

    Returns
    -------
    None
    """
    column_position = self._get_column_index(name)
    # Flatten first so the values are assigned as a 1-D sequence.
    flattened = new_data.ravel()
    self._data[:, column_position] = flattened

drop_columns ¶

drop_columns(names)

Remove columns by name from the dataset.

Parameters:

Name	Type	Description	Default
`names`	`str \| List[str]`	Target column or list of columns to remove.	required

Returns:

Type	Description
`None`

Raises:

Type	Description
`KeyError`	if one of the columns to drop doesn't exist in the dataset

Source code in glassbox/frame/dataset.py

def drop_columns(self, names: str | List[str]) -> None:
    """
    Remove columns by name from the dataset.

    Parameters
    ----------
    names : str | List[str]
        Target column or list of columns to remove.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        if one of the columns to drop doesn't exist in the dataset
    """
    to_drop = set(self._normalize_names(names))
    keep_mask = [i for i, col in enumerate(self._columns) if col not in to_drop]
    missing = to_drop - set(self._columns)
    if missing:
        raise KeyError(f"columns not found: {missing}")
    self._data = self._data[:, keep_mask]
    self._columns = [self._columns[i] for i in keep_mask]

add_columns ¶

add_columns(new_dataset)

Add new columns alongside the dataset arrays.

Parameters:

Name	Type	Description	Default
`new_dataset`	`Dataset`	New data to append.	required

Returns:

Type	Description
`None`

Raises:

Type	Description
`ValueError`	If a column to add already exists, or if the new data has a different number of rows

Source code in glassbox/frame/dataset.py

def add_columns(self, new_dataset: "Dataset") -> None:
    """
    Add new columns alongside the dataset arrays.

    Parameters
    ----------
    new_dataset : Dataset
        New data to append.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If a column to add already exists, or if ``new_dataset`` has a
        different number of rows than this dataset.
    """
    duplicates = [n for n in new_dataset.columns if n in self._columns]
    if duplicates:
        raise ValueError(f"Columns already exist: {duplicates}")

    if new_dataset.shape[0] != self._data.shape[0]:
        raise ValueError(
            f"Row count mismatch: dataset has {self._data.shape[0]} rows, "
            f"new data has {new_dataset.shape[0]}"
        )

    self._data = np.hstack([self._data, new_dataset.data])
    # Rebind to a fresh list instead of extending in place: the current list
    # object may be shared with the caller or with another dataset, and an
    # in-place extend would silently change their column names too.
    self._columns = list(self._columns) + list(new_dataset.columns)

glassbox.frame.dataset¶

Dataset ¶

get_columns ¶

get_rows ¶

update_column ¶

drop_columns ¶

add_columns ¶

`glassbox.frame.dataset`¶