Skip to content

Frame

The glassbox.frame module provides a lightweight data container and CSV I/O utilities. All data is stored internally as np.ndarray — no pandas dependency.


Loading Data

Use read_csv to load a CSV file into a Dataset:

from glassbox.frame import read_csv

ds = read_csv("students.csv")
print(ds)
# Dataset(shape=(1000, 12), columns=['Age', 'Gender', 'Score', ...])
Type inference

Columns that can be fully cast to float are stored as float64. Mixed or string columns remain as object dtype. Empty cells and "NA" values are converted to np.nan.


The Dataset Class

(diagram placeholder — rendered by a diagram plugin in the published docs, not shown in this text export)

Dataset wraps a 2-D NumPy array with named columns.

Properties

Property Type Description
data np.ndarray The underlying 2-D array.
columns List[str] Column names.
shape Tuple[int, int] (n_rows, n_cols).

Selecting Columns

# Single column → Dataset with 1 column
ages = ds.get_columns("Age")

# Multiple columns → Dataset subset
subset = ds.get_columns(["Age", "Score"])

Selecting Rows

import numpy as np

indices = np.array([0, 5, 10])
sample = ds.get_rows(indices)

Modifying Data

# Update an existing column
ds.update_column("Score", new_scores)

# Drop columns
ds.drop_columns(["Unused_1", "Unused_2"])

# Add new columns from another Dataset
ds.add_columns(extra_ds)

Saving Data

from glassbox.frame.io import to_csv

to_csv(ds, columns=["Age", "Score"], filepath="output.csv")
Tip

to_csv automatically formats whole-number floats without a decimal point and properly escapes commas and quotes in string values.


API Reference

Dataset

Dataset(data, columns)

Container for data matrices with multiple helper functions.

Attributes:

Name Type Description
data ndarray

Data arranged as a 2D matrix. To access columns, take the transpose.

columns List[str]

Names of the columns stored in a list.

shape Tuple[int, int]

Shape of the dataset (# of rows, # of columns).

Parameters:

Name Type Description Default
data ndarray

Data arranged as a 2D matrix - (n_rows, n_cols)

required
columns List[str]

column names - must match data.shape[1]

required
Source code in glassbox/frame/dataset.py
def __init__(self, data: np.ndarray, columns: List[str]):
    """
    Container for a 2-D data matrix with named columns.

    Parameters
    ----------
    data: np.ndarray
        Data arranged as a 2D matrix - (n_rows, n_cols)
    columns: List[str]
        column names - must match data.shape[1]

    Raises
    ------
    ValueError
        If ``data`` is not 2-D, or if ``len(columns)`` differs from
        ``data.shape[1]``.
    """
    if data.ndim != 2:
        # Report the actual shape (the original printed ndim but
        # labeled it "shape", which made the error misleading).
        raise ValueError(f"data must be a 2D matrix, got shape {data.shape}")
    if len(columns) != data.shape[1]:
        # Trailing space added: the two f-string fragments previously
        # concatenated into "...does not matchdata width...".
        raise ValueError(
            f"Number of columns ({len(columns)}) does not match "
            f"data width ({data.shape[1]})"
        )
    self._data = data
    # Defensive copy: in-place edits (e.g. add_columns extends this
    # list) must not mutate the caller's list object.
    self._columns = list(columns)

get_columns

get_columns(names)

Retrieve data for specific columns by name.

Parameters:

Name Type Description Default
names str | List[str]

A single column name or a list of column names.

required

Returns:

Type Description
Dataset

A new Dataset containing a copy of the requested columns, in the given order.

Source code in glassbox/frame/dataset.py
def get_columns(self, names: str | List[str]) -> "Dataset":
    """
    Return a new Dataset holding only the requested columns.

    Parameters
    ----------
    names : str | List[str]
        A single column name or a list of column names.

    Returns
    -------
    Dataset
        A new Dataset containing a copy of the selected column data.
    """
    requested = self._normalize_names(names)
    positions = [self._get_column_index(column) for column in requested]
    selected = self._data[:, positions].copy()
    return Dataset(selected, requested)

get_rows

get_rows(indices)

Get specific rows based on indices and return as a new dataset.

Parameters:

Name Type Description Default
indices ndarray

Integer array of row coordinates.

required

Returns:

Type Description
Dataset

A new Dataset instance containing the selected rows.

Source code in glassbox/frame/dataset.py
def get_rows(self, indices: np.ndarray) -> "Dataset":
    """
    Get specific rows based on indices and return as a new dataset.

    Parameters
    ----------
    indices : np.ndarray
        Integer array of row coordinates.

    Returns
    -------
    Dataset
        A new Dataset instance containing the selected rows.
    """
    # Copy the column list as well: previously the new Dataset shared
    # self._columns (the same list object), so an in-place edit such
    # as add_columns (which extends the list) on either dataset
    # silently corrupted the other's column names.
    return Dataset(self._data[indices].copy(), list(self._columns))

update_column

update_column(name, new_data)

Update the array content for an existing column.

Parameters:

Name Type Description Default
name str

Target column to update.

required
new_data ndarray

Array values to overwrite the column.

required

Returns:

Type Description
None
Source code in glassbox/frame/dataset.py
def update_column(self, name: str, new_data: np.ndarray) -> None:
    """
    Overwrite an existing column's values in place.

    Parameters
    ----------
    name : str
        Target column to update.
    new_data : np.ndarray
        Array values to overwrite the column.

    Returns
    -------
    None
    """
    column_index = self._get_column_index(name)
    # ravel() flattens the input, so both (n,) and (n, 1) shapes work.
    self._data[:, column_index] = new_data.ravel()

drop_columns

drop_columns(names)

Remove columns by name from the dataset.

Parameters:

Name Type Description Default
names str | List[str]

Target column or list of columns to remove.

required

Returns:

Type Description
None

Raises:

Type Description
KeyError

if one of the columns to drop doesn't exist in the dataset

Source code in glassbox/frame/dataset.py
def drop_columns(self, names: str | List[str]) -> None:
    """
    Remove columns by name from the dataset, in place.

    Parameters
    ----------
    names : str | List[str]
        Target column or list of columns to remove.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        if one of the columns to drop doesn't exist in the dataset
    """
    requested = set(self._normalize_names(names))
    # Validate before touching any state so a bad name leaves the
    # dataset unchanged (the original also raised prior to mutation).
    unknown = requested - set(self._columns)
    if unknown:
        raise KeyError(f"columns not found: {unknown}")
    survivors = [
        idx for idx, col in enumerate(self._columns) if col not in requested
    ]
    self._data = self._data[:, survivors]
    self._columns = [self._columns[idx] for idx in survivors]

add_columns

add_columns(new_dataset)

Add new columns alongside the dataset arrays.

Parameters:

Name Type Description Default
new_dataset Dataset

New data to append.

required

Returns:

Type Description
None

Raises:

Type Description
ValueError

If column to add already exists

Source code in glassbox/frame/dataset.py
def add_columns(self, new_dataset: "Dataset") -> None:
    """
    Append every column of *new_dataset* to this dataset, in place.

    Parameters
    ----------
    new_dataset : Dataset
        New data to append.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If column to add already exists
    """
    existing = set(self._columns)
    clashes = [name for name in new_dataset.columns if name in existing]
    if clashes:
        raise ValueError(f"Columns already exist: {clashes}")

    n_rows = self._data.shape[0]
    if new_dataset.shape[0] != n_rows:
        raise ValueError(
            f"Row count mismatch: dataset has {n_rows} rows, "
            f"new data has {new_dataset.shape[0]}"
        )

    # Column-wise concatenation; extend the name list to match.
    self._data = np.hstack([self._data, new_dataset.data])
    self._columns.extend(new_dataset.columns)

read_csv

read_csv(filepath)

Load a CSV file into a Dataset.

Parameters:

Name Type Description Default
filepath str

Path to the CSV file.

required

Returns:

Type Description
Dataset

Loaded dataset object.

Source code in glassbox/frame/io.py
def read_csv(filepath: str) -> Dataset:
    """
    Load a CSV file into a Dataset.

    Columns whose every cell parses as a float become float64; any
    other column keeps object dtype.  Empty cells and "NA" (any
    casing) become np.nan.

    Parameters
    ----------
    filepath : str
        Path to the CSV file.

    Returns
    -------
    Dataset
        Loaded dataset object.

    Raises
    ------
    ValueError
        If the file is empty, or if a data row has a different number
        of cells than the header.
    """
    with open(filepath, newline="", encoding="utf-8") as data_source:
        reader = csv.reader(data_source)
        # Filter empty rows
        entries = [row for row in reader if row]

    if not entries:
        raise ValueError(f"CSV file is empty: {filepath}")

    # Parse header
    columns = [col.strip() for col in entries[0]]
    n_cols = len(columns)

    # Parse rows, validating width.  The original silently left None
    # cells for short rows and raised IndexError for long ones; a
    # ragged file now fails loudly with a clear message.
    rows = []
    for row_number, _row in enumerate(entries[1:], start=1):
        cells = [cell.strip() for cell in _row]
        if len(cells) != n_cols:
            raise ValueError(
                f"Data row {row_number} has {len(cells)} cells, "
                f"expected {n_cols} (blank rows are skipped)"
            )
        rows.append(cells)

    if not rows:
        # Header-only file — return an empty object-dtype dataset.
        data = np.empty((0, n_cols), dtype=object)
        return Dataset(data, columns)

    n_rows = len(rows)
    data = np.empty((n_rows, n_cols), dtype=object)

    for r, row in enumerate(rows):
        for c, cell in enumerate(row):
            # Blanks and any casing of "NA" count as missing values.
            if cell == "" or cell.upper() == "NA":
                data[r, c] = np.nan
            else:
                data[r, c] = cell

    # Try to cast each column to float; leave as object if it fails
    float_data = np.empty((n_rows, n_cols), dtype=float)
    col_is_float = np.ones(n_cols, dtype=bool)

    for c in range(n_cols):
        try:
            float_data[:, c] = data[:, c].astype(float)
        except (ValueError, TypeError):
            col_is_float[c] = False

    if col_is_float.all():
        return Dataset(float_data, columns)

    # Mixed types — keep object array as-is; promote pure-float cols
    for c in range(n_cols):
        if col_is_float[c]:
            data[:, c] = float_data[:, c]

    return Dataset(data, columns)