Skip to content

glassbox.frame

Data container and CSV I/O utilities.


Dataset

Dataset(data, columns)

Container for data matrices with multiple helper functions.

Attributes:

Name Type Description
data ndarray

Data arranged as a 2D matrix. To access columns, take the transpose.

columns List[str]

Names of the columns stored in a list.

shape Tuple[int, int]

Shape of the dataset (# of rows, # of columns).

Parameters:

Name Type Description Default
data ndarray

Data arranged as a 2D matrix - (n_rows, n_cols)

required
columns List[str]

column names - must match data.shape[1]

required
Source code in glassbox/frame/dataset.py
def __init__(self, data: np.ndarray, columns: List[str]):
    """
    Container initializer: validates and stores the matrix and its column names.

    Parameters
    ----------
    data: np.ndarray
        Data arranged as a 2D matrix - (n_rows, n_cols)
    columns: List[str]
        column names - must match data.shape[1]

    Raises
    ------
    ValueError
        If ``data`` is not 2-dimensional, or the number of column names
        does not match ``data.shape[1]``.
    """
    if data.ndim != 2:
        # Report the actual shape — the original message printed ndim
        # while calling it "shape", which is misleading in tracebacks.
        raise ValueError(f"data must be a 2D matrix, got shape {data.shape}")
    if len(columns) != data.shape[1]:
        # Trailing space added: adjacent f-strings previously rendered
        # as "...does not matchdata width...".
        raise ValueError(
            f"Number of columns ({len(columns)}) does not match "
            f"data width ({data.shape[1]})"
        )
    self._data = data
    self._columns = columns

get_columns

get_columns(names)

Retrieve data for specific columns by name.

Parameters:

Name Type Description Default
names str | List[str]

A single column name or a list of column names.

required

Returns:

Type Description
Dataset

A new Dataset containing a copy of the requested columns.

Source code in glassbox/frame/dataset.py
def get_columns(self, names: str | List[str]) -> "Dataset":
    """
    Retrieve data for specific columns by name.

    Parameters
    ----------
    names : str | List[str]
        A single column name or a list of column names.

    Returns
    -------
    Dataset
        A new Dataset holding a copy of the requested columns, in the
        order given.
    """
    # Accept a bare string or a list; the helper resolves each name to a
    # positional index (presumably raising on unknown names, mirroring
    # drop_columns — confirm in _get_column_index).
    names = self._normalize_names(names)
    indices = [self._get_column_index(name) for name in names]
    # .copy() so mutating the returned Dataset does not alias this one's data.
    return Dataset(self._data[:, indices].copy(), names)

get_rows

get_rows(indices)

Get specific rows based on indices and return as a new dataset.

Parameters:

Name Type Description Default
indices ndarray

Integer array of row coordinates.

required

Returns:

Type Description
Dataset

A new Dataset instance containing the selected rows.

Source code in glassbox/frame/dataset.py
def get_rows(self, indices: np.ndarray) -> "Dataset":
    """
    Select rows by integer index and package them in a fresh dataset.

    Parameters
    ----------
    indices : np.ndarray
        Integer array of row coordinates.

    Returns
    -------
    Dataset
        A new Dataset instance containing the selected rows.
    """
    # Copy the slice so the new dataset owns its memory independently.
    selected = self._data[indices]
    return Dataset(selected.copy(), self._columns)

update_column

update_column(name, new_data)

Update the array content for an existing column.

Parameters:

Name Type Description Default
name str

Target column to update.

required
new_data ndarray

Array values to overwrite the column.

required

Returns:

Type Description
None
Source code in glassbox/frame/dataset.py
def update_column(self, name: str, new_data: np.ndarray) -> None:
    """
    Overwrite an existing column's values in place.

    Parameters
    ----------
    name : str
        Target column to update.
    new_data : np.ndarray
        Array values to overwrite the column.

    Returns
    -------
    None
    """
    # ravel() flattens any (n, 1)-shaped input to fit the column slot.
    column_index = self._get_column_index(name)
    self._data[:, column_index] = new_data.ravel()

drop_columns

drop_columns(names)

Remove columns by name from the dataset.

Parameters:

Name Type Description Default
names str | List[str]

Target column or list of columns to remove.

required

Returns:

Type Description
None

Raises:

Type Description
KeyError

if one of the columns to drop doesn't exist in the dataset

Source code in glassbox/frame/dataset.py
def drop_columns(self, names: str | List[str]) -> None:
    """
    Remove columns by name from the dataset.

    Parameters
    ----------
    names : str | List[str]
        Target column or list of columns to remove.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        if one of the columns to drop doesn't exist in the dataset
    """
    to_drop = set(self._normalize_names(names))
    keep_mask = [i for i, col in enumerate(self._columns) if col not in to_drop]
    missing = to_drop - set(self._columns)
    if missing:
        raise KeyError(f"columns not found: {missing}")
    self._data = self._data[:, keep_mask]
    self._columns = [self._columns[i] for i in keep_mask]

add_columns

add_columns(new_dataset)

Add new columns alongside the dataset arrays.

Parameters:

Name Type Description Default
new_dataset Dataset

New data to append.

required

Returns:

Type Description
None

Raises:

Type Description
ValueError

If column to add already exists

Source code in glassbox/frame/dataset.py
def add_columns(self, new_dataset: "Dataset") -> None:
    """
    Add new columns alongside the dataset arrays.

    Parameters
    ----------
    new_dataset : Dataset
        New data to append.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If column to add already exists
    """
    # Reject name clashes first so nothing is mutated on failure.
    clashes = [name for name in new_dataset.columns if name in self._columns]
    if clashes:
        raise ValueError(f"Columns already exist: {clashes}")

    expected_rows = self._data.shape[0]
    if new_dataset.shape[0] != expected_rows:
        raise ValueError(
            f"Row count mismatch: dataset has {expected_rows} rows, "
            f"new data has {new_dataset.shape[0]}"
        )

    # Widen the matrix and register the new names in place.
    self._data = np.hstack([self._data, new_dataset.data])
    self._columns.extend(new_dataset.columns)

read_csv

read_csv(filepath)

Load a CSV file into a Dataset.

Parameters:

Name Type Description Default
filepath str

Path to the CSV file.

required

Returns:

Type Description
Dataset

Loaded dataset object.

Source code in glassbox/frame/io.py
def read_csv(filepath: str) -> Dataset:
    """
    Load a CSV file into a Dataset.

    Parameters
    ----------
    filepath : str
        Path to the CSV file.

    Returns
    -------
    Dataset
        Loaded dataset object.
    """
    with open(filepath, newline="", encoding="utf-8") as data_source:
        reader = csv.reader(data_source)
        # Filter empty rows
        entries = [row for row in reader if row]

    if not entries:
        raise ValueError(f"CSV file is empty: {filepath}")

    # Parse header
    columns = [col.strip() for col in entries[0]]

    # Parse rows
    rows = []
    for _row in entries[1:]:
        cells = [cell.strip() for cell in _row]
        rows.append(cells)

    if not rows:
        # Header-only file — return empty dataset with float64 dtype
        data = np.empty((0, len(columns)), dtype=object)
        return Dataset(data, columns)

    # Build a column-at-a-time array, trying float conversion per column
    n_rows = len(rows)
    n_cols = len(columns)
    data = np.empty((n_rows, n_cols), dtype=object)

    for r, row in enumerate(rows):
        for c, cell in enumerate(row):
            if cell == "" or cell.upper() == "NA":
                data[r, c] = np.nan
            else:
                data[r, c] = cell

    # Try to cast each column to float; leave as object if it fails
    float_data = np.empty((n_rows, n_cols), dtype=float)
    col_is_float = np.ones(n_cols, dtype=bool)

    for c in range(n_cols):
        try:
            float_data[:, c] = data[:, c].astype(float)
        except (ValueError, TypeError):
            col_is_float[c] = False

    if col_is_float.all():
        return Dataset(float_data, columns)

    # Mixed types — keep object array as-is; promote pure-float cols
    for c in range(n_cols):
        if col_is_float[c]:
            data[:, c] = float_data[:, c]

    return Dataset(data, columns)