Skip to content

glassbox.frame

Data container and CSV I/O utilities.


Dataset

Dataset(data, columns)

Container for data matrices with multiple helper functions.

Attributes:

Name Type Description
data ndarray

Data arranged as a 2D matrix. To access columns, take the transpose.

columns List[str]

Names of the columns stored in a list.

shape Tuple[int, int]

Shape of the dataset (# of rows, # of columns).

Parameters:

Name Type Description Default
data ndarray

Data arranged as a 2D matrix - (n_rows, n_cols)

required
columns List[str]

column names - must match data.shape[1]

required
Source code in glassbox/frame/dataset.py
def __init__(self, data: np.ndarray, columns: List[str]):
    """
    Container initializer: validates and stores the matrix and its column names.

    Parameters
    ----------
    data: np.ndarray
        Data arranged as a 2D matrix - (n_rows, n_cols)
    columns: List[str]
        column names - must match data.shape[1]

    Raises
    ------
    ValueError
        If ``data`` is not 2-dimensional, or the number of column names
        does not match ``data.shape[1]``.
    """
    if data.ndim != 2:
        # Report the actual shape — the original message printed ndim
        # while calling it "shape", which is misleading in tracebacks.
        raise ValueError(f"data must be a 2D matrix, got shape {data.shape}")
    if len(columns) != data.shape[1]:
        # Trailing space added: adjacent f-strings previously rendered
        # as "...does not matchdata width...".
        raise ValueError(
            f"Number of columns ({len(columns)}) does not match "
            f"data width ({data.shape[1]})"
        )
    self._data = data
    self._columns = columns

get_columns

get_columns(names)

Retrieve data for specific columns by name.

Parameters:

Name Type Description Default
names str | List[str]

A single column name or a list of column names.

required

Returns:

Type Description
Dataset

A new Dataset containing a copy of the requested columns.

Source code in glassbox/frame/dataset.py
def get_columns(self, names: str | List[str]) -> "Dataset":
    """
    Retrieve data for specific columns by name.

    Parameters
    ----------
    names : str | List[str]
        A single column name or a list of column names.

    Returns
    -------
    Dataset
        A new Dataset holding a copy of the requested columns, in the
        order given.
    """
    # Accept a bare string or a list; the helper resolves each name to a
    # positional index (presumably raising on unknown names, mirroring
    # drop_columns — confirm in _get_column_index).
    names = self._normalize_names(names)
    indices = [self._get_column_index(name) for name in names]
    # .copy() so mutating the returned Dataset does not alias this one's data.
    return Dataset(self._data[:, indices].copy(), names)

get_rows

get_rows(indices)

Get specific rows based on indices and return as a new dataset.

Parameters:

Name Type Description Default
indices ndarray

Integer array of row coordinates.

required

Returns:

Type Description
Dataset

A new Dataset instance containing the selected rows.

Source code in glassbox/frame/dataset.py
def get_rows(self, indices: np.ndarray) -> "Dataset":
    """
    Select rows by integer index and package them in a fresh dataset.

    Parameters
    ----------
    indices : np.ndarray
        Integer array of row coordinates.

    Returns
    -------
    Dataset
        A new Dataset instance containing the selected rows.
    """
    # Copy the slice so the new dataset owns its memory independently.
    selected = self._data[indices]
    return Dataset(selected.copy(), self._columns)

update_column

update_column(name, new_data)

Update the array content for an existing column.

Parameters:

Name Type Description Default
name str

Target column to update.

required
new_data ndarray

Array values to overwrite the column.

required

Returns:

Type Description
None
Source code in glassbox/frame/dataset.py
def update_column(self, name: str, new_data: np.ndarray) -> None:
    """
    Overwrite an existing column's values in place.

    Parameters
    ----------
    name : str
        Target column to update.
    new_data : np.ndarray
        Array values to overwrite the column.

    Returns
    -------
    None
    """
    # ravel() flattens any (n, 1)-shaped input to fit the column slot.
    column_index = self._get_column_index(name)
    self._data[:, column_index] = new_data.ravel()

drop_columns

drop_columns(names)

Remove columns by name from the dataset.

Parameters:

Name Type Description Default
names str | List[str]

Target column or list of columns to remove.

required

Returns:

Type Description
None

Raises:

Type Description
KeyError

if one of the columns to drop doesn't exist in the dataset

Source code in glassbox/frame/dataset.py
def drop_columns(self, names: str | List[str]) -> None:
    """
    Remove columns by name from the dataset.

    Parameters
    ----------
    names : str | List[str]
        Target column or list of columns to remove.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        if one of the columns to drop doesn't exist in the dataset
    """
    to_drop = set(self._normalize_names(names))
    keep_mask = [i for i, col in enumerate(self._columns) if col not in to_drop]
    missing = to_drop - set(self._columns)
    if missing:
        raise KeyError(f"columns not found: {missing}")
    self._data = self._data[:, keep_mask]
    self._columns = [self._columns[i] for i in keep_mask]

add_columns

add_columns(new_dataset)

Add new columns alongside the dataset arrays.

Parameters:

Name Type Description Default
new_dataset Dataset

New data to append.

required

Returns:

Type Description
None

Raises:

Type Description
ValueError

If column to add already exists

Source code in glassbox/frame/dataset.py
def add_columns(self, new_dataset: "Dataset") -> None:
    """
    Add new columns alongside the dataset arrays.

    Parameters
    ----------
    new_dataset : Dataset
        New data to append.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If column to add already exists
    """
    # Reject name clashes first so nothing is mutated on failure.
    clashes = [name for name in new_dataset.columns if name in self._columns]
    if clashes:
        raise ValueError(f"Columns already exist: {clashes}")

    expected_rows = self._data.shape[0]
    if new_dataset.shape[0] != expected_rows:
        raise ValueError(
            f"Row count mismatch: dataset has {expected_rows} rows, "
            f"new data has {new_dataset.shape[0]}"
        )

    # Widen the matrix and register the new names in place.
    self._data = np.hstack([self._data, new_dataset.data])
    self._columns.extend(new_dataset.columns)

read_csv

read_csv(filepath)

Load a CSV file into a Dataset.

Parameters:

Name Type Description Default
filepath str

Path to the CSV file.

required

Returns:

Type Description
Dataset

Loaded dataset object.

Source code in glassbox/frame/io.py
def read_csv(filepath: str) -> Dataset:
    """
    Load a CSV file into a Dataset.

    Parameters
    ----------
    filepath : str
        Path to the CSV file.

    Returns
    -------
    Dataset
        Loaded dataset object.
    """
    with open(filepath, newline="", encoding="utf-8") as data_source:
        reader = csv.reader(data_source)
        # Filter empty rows
        entries = [row for row in reader if row]

    if not entries:
        raise ValueError(f"CSV file is empty: {filepath}")

    # Parse header
    columns = [col.strip() for col in entries[0]]

    # Parse rows
    rows = []
    for _row in entries[1:]:
        cells = [cell.strip() for cell in _row]
        rows.append(cells)

    if not rows:
        # Header-only file — return empty dataset with float64 dtype
        data = np.empty((0, len(columns)), dtype=object)
        return Dataset(data, columns)

    # Build a column-at-a-time array, trying float conversion per column
    n_rows = len(rows)
    n_cols = len(columns)
    data = np.empty((n_rows, n_cols), dtype=object)

    for r, row in enumerate(rows):
        for c, cell in enumerate(row):
            if cell == "" or cell.upper() == "NA":
                data[r, c] = np.nan
            else:
                data[r, c] = cell

    # Try to cast each column to float; leave as object if it fails
    float_data = np.empty((n_rows, n_cols), dtype=float)
    col_is_float = np.ones(n_cols, dtype=bool)

    for c in range(n_cols):
        try:
            float_data[:, c] = data[:, c].astype(float)
        except (ValueError, TypeError):
            col_is_float[c] = False

    if col_is_float.all():
        return Dataset(float_data, columns)

    # Mixed types — keep object array as-is; promote pure-float cols
    for c in range(n_cols):
        if col_is_float[c]:
            data[:, c] = float_data[:, c]

    return Dataset(data, columns)