Skip to content

Frame

The glassbox.frame module provides a lightweight data container and CSV I/O utilities. All data is stored internally as np.ndarray — no pandas dependency.


Loading Data

Use read_csv to load a CSV file into a Dataset:

from glassbox.frame import read_csv

ds = read_csv("students.csv")
print(ds)
# Dataset(shape=(1000, 12), columns=['Age', 'Gender', 'Score', ...])
Type inference

Columns that can be fully cast to float are stored as float64. Mixed or string columns remain as object dtype. Empty cells and "NA" values are converted to np.nan.


The Dataset Class

(diagram placeholder — rendered by a diagram plugin in the published docs, not shown in this text export)

Dataset wraps a 2-D NumPy array with named columns.

Properties

Property Type Description
data np.ndarray The underlying 2-D array.
columns List[str] Column names.
shape Tuple[int, int] (n_rows, n_cols).

Selecting Columns

# Single column → Dataset with 1 column
ages = ds.get_columns("Age")

# Multiple columns → Dataset subset
subset = ds.get_columns(["Age", "Score"])

Selecting Rows

import numpy as np

indices = np.array([0, 5, 10])
sample = ds.get_rows(indices)

Modifying Data

# Update an existing column
ds.update_column("Score", new_scores)

# Drop columns
ds.drop_columns(["Unused_1", "Unused_2"])

# Add new columns from another Dataset
ds.add_columns(extra_ds)

Saving Data

from glassbox.frame.io import to_csv

to_csv(ds, columns=["Age", "Score"], filepath="output.csv")
Tip

to_csv automatically formats whole-number floats without a decimal point and properly escapes commas and quotes in string values.


API Reference

Dataset

Dataset(data, columns)

Container for data matrices with multiple helper functions.

Attributes:

Name Type Description
data ndarray

Data arranged as a 2D matrix. To access columns, take the transpose.

columns List[str]

Names of the columns stored in a list.

shape Tuple[int, int]

Shape of the dataset (# of rows, # of columns).

Parameters:

Name Type Description Default
data ndarray

Data arranged as a 2D matrix - (n_rows, n_cols)

required
columns List[str]

column names - must match data.shape[1]

required
Source code in glassbox/frame/dataset.py
def __init__(self, data: np.ndarray, columns: List[str]):
    """
    Container for a 2-D data matrix with named columns.

    Parameters
    ----------
    data: np.ndarray
        Data arranged as a 2D matrix - (n_rows, n_cols)
    columns: List[str]
        column names - must match data.shape[1]

    Raises
    ------
    ValueError
        If ``data`` is not 2-D, or if ``len(columns)`` differs from
        ``data.shape[1]``.
    """
    if data.ndim != 2:
        # Report the actual shape (the original printed ndim but
        # labeled it "shape", which made the error misleading).
        raise ValueError(f"data must be a 2D matrix, got shape {data.shape}")
    if len(columns) != data.shape[1]:
        # Trailing space added: the two f-string fragments previously
        # concatenated into "...does not matchdata width...".
        raise ValueError(
            f"Number of columns ({len(columns)}) does not match "
            f"data width ({data.shape[1]})"
        )
    self._data = data
    # Defensive copy: in-place edits (e.g. add_columns extends this
    # list) must not mutate the caller's list object.
    self._columns = list(columns)

get_columns

get_columns(names)

Retrieve data for specific columns by name.

Parameters:

Name Type Description Default
names str | List[str]

A single column name or a list of column names.

required

Returns:

Type Description
Dataset

A new Dataset containing a copy of the requested columns, in the given order.

Source code in glassbox/frame/dataset.py
def get_columns(self, names: str | List[str]) -> "Dataset":
    """
    Return a new Dataset holding only the requested columns.

    Parameters
    ----------
    names : str | List[str]
        A single column name or a list of column names.

    Returns
    -------
    Dataset
        A new Dataset containing a copy of the selected column data.
    """
    requested = self._normalize_names(names)
    positions = [self._get_column_index(column) for column in requested]
    selected = self._data[:, positions].copy()
    return Dataset(selected, requested)

get_rows

get_rows(indices)

Get specific rows based on indices and return as a new dataset.

Parameters:

Name Type Description Default
indices ndarray

Integer array of row coordinates.

required

Returns:

Type Description
Dataset

A new Dataset instance containing the selected rows.

Source code in glassbox/frame/dataset.py
def get_rows(self, indices: np.ndarray) -> "Dataset":
    """
    Get specific rows based on indices and return as a new dataset.

    Parameters
    ----------
    indices : np.ndarray
        Integer array of row coordinates.

    Returns
    -------
    Dataset
        A new Dataset instance containing the selected rows.
    """
    # Copy the column list as well: previously the new Dataset shared
    # self._columns (the same list object), so an in-place edit such
    # as add_columns (which extends the list) on either dataset
    # silently corrupted the other's column names.
    return Dataset(self._data[indices].copy(), list(self._columns))

update_column

update_column(name, new_data)

Update the array content for an existing column.

Parameters:

Name Type Description Default
name str

Target column to update.

required
new_data ndarray

Array values to overwrite the column.

required

Returns:

Type Description
None
Source code in glassbox/frame/dataset.py
def update_column(self, name: str, new_data: np.ndarray) -> None:
    """
    Overwrite an existing column's values in place.

    Parameters
    ----------
    name : str
        Target column to update.
    new_data : np.ndarray
        Array values to overwrite the column.

    Returns
    -------
    None
    """
    column_index = self._get_column_index(name)
    # ravel() flattens the input, so both (n,) and (n, 1) shapes work.
    self._data[:, column_index] = new_data.ravel()

drop_columns

drop_columns(names)

Remove columns by name from the dataset.

Parameters:

Name Type Description Default
names str | List[str]

Target column or list of columns to remove.

required

Returns:

Type Description
None

Raises:

Type Description
KeyError

if one of the columns to drop doesn't exist in the dataset

Source code in glassbox/frame/dataset.py
def drop_columns(self, names: str | List[str]) -> None:
    """
    Remove columns by name from the dataset, in place.

    Parameters
    ----------
    names : str | List[str]
        Target column or list of columns to remove.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        if one of the columns to drop doesn't exist in the dataset
    """
    requested = set(self._normalize_names(names))
    # Validate before touching any state so a bad name leaves the
    # dataset unchanged (the original also raised prior to mutation).
    unknown = requested - set(self._columns)
    if unknown:
        raise KeyError(f"columns not found: {unknown}")
    survivors = [
        idx for idx, col in enumerate(self._columns) if col not in requested
    ]
    self._data = self._data[:, survivors]
    self._columns = [self._columns[idx] for idx in survivors]

add_columns

add_columns(new_dataset)

Add new columns alongside the dataset arrays.

Parameters:

Name Type Description Default
new_dataset Dataset

New data to append.

required

Returns:

Type Description
None

Raises:

Type Description
ValueError

If column to add already exists

Source code in glassbox/frame/dataset.py
def add_columns(self, new_dataset: "Dataset") -> None:
    """
    Append every column of *new_dataset* to this dataset, in place.

    Parameters
    ----------
    new_dataset : Dataset
        New data to append.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If column to add already exists
    """
    existing = set(self._columns)
    clashes = [name for name in new_dataset.columns if name in existing]
    if clashes:
        raise ValueError(f"Columns already exist: {clashes}")

    n_rows = self._data.shape[0]
    if new_dataset.shape[0] != n_rows:
        raise ValueError(
            f"Row count mismatch: dataset has {n_rows} rows, "
            f"new data has {new_dataset.shape[0]}"
        )

    # Column-wise concatenation; extend the name list to match.
    self._data = np.hstack([self._data, new_dataset.data])
    self._columns.extend(new_dataset.columns)

read_csv

read_csv(filepath)

Load a CSV file into a Dataset.

Parameters:

Name Type Description Default
filepath str

Path to the CSV file.

required

Returns:

Type Description
Dataset

Loaded dataset object.

Source code in glassbox/frame/io.py
def read_csv(filepath: str) -> Dataset:
    """
    Load a CSV file into a Dataset.

    Columns whose every cell parses as a float become float64; any
    other column keeps object dtype.  Empty cells and "NA" (any
    casing) become np.nan.

    Parameters
    ----------
    filepath : str
        Path to the CSV file.

    Returns
    -------
    Dataset
        Loaded dataset object.

    Raises
    ------
    ValueError
        If the file is empty, or if a data row has a different number
        of cells than the header.
    """
    with open(filepath, newline="", encoding="utf-8") as data_source:
        reader = csv.reader(data_source)
        # Filter empty rows
        entries = [row for row in reader if row]

    if not entries:
        raise ValueError(f"CSV file is empty: {filepath}")

    # Parse header
    columns = [col.strip() for col in entries[0]]
    n_cols = len(columns)

    # Parse rows, validating width.  The original silently left None
    # cells for short rows and raised IndexError for long ones; a
    # ragged file now fails loudly with a clear message.
    rows = []
    for row_number, _row in enumerate(entries[1:], start=1):
        cells = [cell.strip() for cell in _row]
        if len(cells) != n_cols:
            raise ValueError(
                f"Data row {row_number} has {len(cells)} cells, "
                f"expected {n_cols} (blank rows are skipped)"
            )
        rows.append(cells)

    if not rows:
        # Header-only file — return an empty object-dtype dataset.
        data = np.empty((0, n_cols), dtype=object)
        return Dataset(data, columns)

    n_rows = len(rows)
    data = np.empty((n_rows, n_cols), dtype=object)

    for r, row in enumerate(rows):
        for c, cell in enumerate(row):
            # Blanks and any casing of "NA" count as missing values.
            if cell == "" or cell.upper() == "NA":
                data[r, c] = np.nan
            else:
                data[r, c] = cell

    # Try to cast each column to float; leave as object if it fails
    float_data = np.empty((n_rows, n_cols), dtype=float)
    col_is_float = np.ones(n_cols, dtype=bool)

    for c in range(n_cols):
        try:
            float_data[:, c] = data[:, c].astype(float)
        except (ValueError, TypeError):
            col_is_float[c] = False

    if col_is_float.all():
        return Dataset(float_data, columns)

    # Mixed types — keep object array as-is; promote pure-float cols
    for c in range(n_cols):
        if col_is_float[c]:
            data[:, c] = float_data[:, c]

    return Dataset(data, columns)