Skip to content

Orchestrator

The glassbox.orchestrator module provides tools for model selection and hyperparameter tuning through cross-validation.

Hyperparameter Search


Automate the process of finding the optimal hyperparameters for your models.

GridSearchCV

Exhaustive search over specified parameter values for an estimator. Each combination is evaluated using cross-validation.

from glassbox.orchestrator import GridSearchCV, KFoldSplitter
from glassbox.models import DecisionTreeClassifier

param_grid = {
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5]
}

# Any callable with signature (y_true, y_pred) -> float works as the
# scoring function.
def accuracy(y_true, y_pred):
    return (y_true == y_pred).mean()

model = DecisionTreeClassifier()
# GridSearchCV takes a splitter instance and a scoring function
# (see the BaseSearch API reference below).
search = GridSearchCV(
    model,
    param_grid,
    cv_engine=KFoldSplitter(n_splits=5),
    scoring_func=accuracy,
)
search.fit(X_train, y_train)

# The best model is automatically refitted on the entire training set
print(search.best_params_)
preds = search.predict(X_test)

RandomizedSearchCV

Randomized search over a hyperparameter space. Trades exhaustiveness for computational speed.

from glassbox.orchestrator import RandomizedSearchCV, KFoldSplitter
from glassbox.models import RandomForestClassifier

param_space = {
    'max_depth': [5, 10, 15, 20],
    'n_estimators': [10, 50, 100]
}

# Any callable with signature (y_true, y_pred) -> float works as the
# scoring function.
def accuracy(y_true, y_pred):
    return (y_true == y_pred).mean()

model = RandomForestClassifier()
# RandomizedSearchCV takes a splitter instance and a scoring function;
# n_iter bounds how many random candidates are evaluated.
search = RandomizedSearchCV(
    model,
    param_space,
    cv_engine=KFoldSplitter(n_splits=5),
    scoring_func=accuracy,
    n_iter=10,
)
search.fit(X_train, y_train)

print(search.best_params_)
preds = search.predict(X_test)

Cross-Validation Splitters

Strategies to split data into training and validation sets.

KFoldSplitter

Splits the dataset into n_splits consecutive folds, preserving the underlying order if not shuffled.

from glassbox.orchestrator import KFoldSplitter

splitter = KFoldSplitter(n_splits=5, shuffle=True, random_seed=42)
# split() takes both X and y; y is not used for fold assignment by
# plain K-fold but keeps the interface uniform with stratified splitters.
for train_idx, val_idx in splitter.split(X, y):
    X_fold_train, X_fold_val = X[train_idx], X[val_idx]

StratifiedKFoldSplitter

Splits the dataset into folds while preserving the percentage of samples for each class in y. Ideally suited for classification problems with imbalanced labels.

from glassbox.orchestrator import StratifiedKFoldSplitter

# Stratified splitting keeps each class's share of samples roughly
# constant across folds (samples are dealt round-robin per class).
splitter = StratifiedKFoldSplitter(n_splits=5, shuffle=True, random_seed=42)
for train_idx, val_idx in splitter.split(X, y):
    y_fold_train, y_fold_val = y[train_idx], y[val_idx]

API Reference

BaseSearch

BaseSearch(estimator, param_space, cv_engine, scoring_func)

Bases: ABC

Abstract base class for search-based model selection.

Parameters:

Name Type Description Default
estimator BaseModel

The model to optimize.

required
param_space Dict

Parameter search space.

required
cv_engine BaseSplitter

Cross-validation splitter.

required
scoring_func Callable

Scoring function used to evaluate candidates.

required

Attributes:

Name Type Description
best_params_ Dict

Best found parameter set.

best_score_ float

Best scoring value.

best_estimator_ BaseModel

Best estimator instance.

Source code in glassbox/orchestrator/base_search.py
def __init__(
    self,
    estimator: BaseModel,
    param_space: Dict,
    cv_engine: BaseSplitter,
    scoring_func: Callable,
) -> None:
    """
    Store the search configuration and initialize result slots.

    Parameters
    ----------
    estimator : BaseModel
        The model to optimize.
    param_space : Dict
        Parameter search space.
    cv_engine : BaseSplitter
        Cross-validation splitter.
    scoring_func : Callable
        Scoring function used to evaluate candidates.
    """
    # Result slots; fit() overwrites all three. best_estimator_ starts
    # as an alias of the (unfitted) input estimator.
    self.best_params_: Dict = {}
    self.best_score_: float = 0.0
    self.best_estimator_: BaseModel = estimator
    # Search configuration.
    self.estimator: BaseModel = estimator
    self.param_space: Dict = param_space
    self.cv_engine: BaseSplitter = cv_engine
    self.scoring_func: Callable = scoring_func

fit

fit(X, y)

Fit the search object and select the best estimator.

Parameters:

Name Type Description Default
X ndarray

Training data of shape (n_samples, n_features).

required
y ndarray

Target values of shape (n_samples,).

required

Returns:

Type Description
Self

The fitted search object.

Source code in glassbox/orchestrator/base_search.py
def fit(self, X: np.ndarray, y: np.ndarray) -> Self:
    """
    Fit the search object and select the best estimator.

    Parameters
    ----------
    X : np.ndarray
        Training data of shape (n_samples, n_features).
    y : np.ndarray
        Target values of shape (n_samples,).

    Returns
    -------
    Self
        The fitted search object.
    """
    if X.ndim != 2:
        raise ValueError("X must be a 2D array")
    if y.ndim != 1:
        raise ValueError("y must be a 1D array")
    if X.shape[0] != y.shape[0]:
        raise ValueError(
            "X and y must have the same number of samples"
        )

    self.best_score_ = float("-inf")
    self.best_params_ = {}
    self.best_estimator_ = copy.deepcopy(self.estimator)

    for candidate_params in self._generate_candidates():
        candidate_estimator = copy.deepcopy(self.estimator)
        for key, value in candidate_params.items():
            setattr(candidate_estimator, key, value)

        fold_scores = []
        for train_idx, val_idx in self.cv_engine.split(X, y):
            clone_estimator = copy.deepcopy(candidate_estimator)
            clone_estimator.fit(X[train_idx], y[train_idx])
            predictions = clone_estimator.predict(X[val_idx])
            fold_score = self.scoring_func(y[val_idx], predictions)
            fold_scores.append(fold_score)

        if len(fold_scores) == 0:
            continue

        mean_score = float(np.mean(fold_scores))
        if mean_score > self.best_score_:
            self.best_score_ = mean_score
            self.best_params_ = candidate_params
            self.best_estimator_ = copy.deepcopy(candidate_estimator)

    return self

BaseSplitter

BaseSplitter(n_splits=5, shuffle=False)

Bases: ABC

Abstract base class for cross-validation splitters.

Parameters:

Name Type Description Default
n_splits int

Number of splits.

5
shuffle bool

Whether to shuffle data before splitting.

False
Source code in glassbox/orchestrator/base_splitter.py
def __init__(
    self,
    n_splits: int = 5,
    shuffle: bool = False,
    random_seed: int | None = None,
) -> None:
    """
    Initialize the splitter configuration.

    Parameters
    ----------
    n_splits : int, default=5
        Number of splits. Validated in ``split``.
    shuffle : bool, default=False
        Whether to shuffle data before splitting.
    random_seed : int | None, default=None
        Seed for reproducible shuffling; ``None`` leaves the shuffle
        unseeded. Added so the documented usage
        ``KFoldSplitter(..., random_seed=42)`` does not raise TypeError.
    """
    self.n_splits: int = n_splits
    self.shuffle: bool = shuffle
    self.random_seed: int | None = random_seed

split abstractmethod

split(X, y)

Generate train/test indices for cross-validation.

Parameters:

Name Type Description Default
X ndarray

Data array of shape (n_samples, n_features).

required
y ndarray

Target values of shape (n_samples,).

required

Returns:

Type Description
Generator[Tuple[ndarray, ndarray], None, None]

Generator yielding training and validation index tuples.

Source code in glassbox/orchestrator/base_splitter.py
@abstractmethod
def split(
    self, X: np.ndarray, y: np.ndarray
) -> Generator[Tuple[np.ndarray, np.ndarray], None, None]:
    """
    Yield (train_idx, val_idx) index pairs for cross-validation.

    Concrete subclasses implement the fold-assignment strategy.

    Parameters
    ----------
    X : np.ndarray
        Data array of shape (n_samples, n_features).
    y : np.ndarray
        Target values of shape (n_samples,).

    Returns
    -------
    Generator[Tuple[np.ndarray, np.ndarray], None, None]
        Generator yielding training and validation index tuples.
    """
    raise NotImplementedError

GridSearchCV

GridSearchCV(
    estimator, param_space, cv_engine, scoring_func
)

Bases: BaseSearch

Exhaustive grid search over a parameter space.

Parameters:

Name Type Description Default
estimator BaseModel

The model to optimize.

required
param_space Dict

Parameter grid for exhaustive search.

required
cv_engine BaseSplitter

Cross-validation splitter.

required
scoring_func Callable

Scoring function used to evaluate candidates.

required
Source code in glassbox/orchestrator/base_search.py
def __init__(
    self,
    estimator: BaseModel,
    param_space: Dict,
    cv_engine: BaseSplitter,
    scoring_func: Callable,
) -> None:
    """
    Configure an exhaustive grid search.

    Parameters
    ----------
    estimator : BaseModel
        The model to optimize.
    param_space : Dict
        Parameter grid for exhaustive search.
    cv_engine : BaseSplitter
        Cross-validation splitter.
    scoring_func : Callable
        Scoring function used to evaluate candidates.
    """
    # Outputs; populated by fit(). The best estimator initially aliases
    # the unfitted input model.
    self.best_estimator_: BaseModel = estimator
    self.best_score_: float = 0.0
    self.best_params_: Dict = {}
    # Inputs.
    self.scoring_func: Callable = scoring_func
    self.cv_engine: BaseSplitter = cv_engine
    self.param_space: Dict = param_space
    self.estimator: BaseModel = estimator

RandomizedSearchCV

RandomizedSearchCV(
    estimator,
    param_space,
    cv_engine,
    scoring_func,
    n_iter=10,
    time_budget=0.0,
)

Bases: BaseSearch

Randomized search over a parameter space.

Parameters:

Name Type Description Default
estimator BaseModel

The model to optimize.

required
param_space Dict

Distribution for random search.

required
cv_engine BaseSplitter

Cross-validation splitter.

required
scoring_func Callable

Scoring function used to evaluate candidates.

required
n_iter int

Number of random parameter candidates to evaluate.

10
time_budget float

Maximum time budget for the search.

0.0
Source code in glassbox/orchestrator/randomized_search.py
def __init__(
    self,
    estimator: "BaseModel",
    param_space: Dict,
    cv_engine: "BaseSplitter",
    scoring_func: "Callable",
    n_iter: int = 10,
    time_budget: float = 0.0,
) -> None:
    """
    Configure a randomized hyperparameter search.

    Parameters
    ----------
    estimator : BaseModel
        The model to optimize.
    param_space : Dict
        Distribution for random search.
    cv_engine : BaseSplitter
        Cross-validation splitter.
    scoring_func : Callable
        Scoring function used to evaluate candidates.
    n_iter : int, default=10
        Number of random parameter candidates to evaluate.
    time_budget : float, default=0.0
        Maximum time budget for the search.
        NOTE(review): units and the meaning of 0.0 (presumably
        "no limit") are not visible here — confirm in the search loop.
    """
    # Shared configuration lives on BaseSearch.
    super().__init__(estimator, param_space, cv_engine, scoring_func)
    self.time_budget: float = time_budget
    self.n_iter: int = n_iter

KFoldSplitter

KFoldSplitter(n_splits=5, shuffle=False)

Bases: BaseSplitter

K-fold cross-validation splitter.

Parameters:

Name Type Description Default
n_splits int

Number of folds.

5
shuffle bool

Whether to shuffle data before splitting.

False
Source code in glassbox/orchestrator/base_splitter.py
def __init__(
    self,
    n_splits: int = 5,
    shuffle: bool = False,
    random_seed: int | None = None,
) -> None:
    """
    Initialize the K-fold splitter.

    Parameters
    ----------
    n_splits : int, default=5
        Number of folds. Validated in ``split``.
    shuffle : bool, default=False
        Whether to shuffle data before splitting.
    random_seed : int | None, default=None
        Seed for reproducible shuffling; ``None`` leaves the shuffle
        unseeded. Added so the documented usage
        ``KFoldSplitter(..., random_seed=42)`` does not raise TypeError.
    """
    self.n_splits: int = n_splits
    self.shuffle: bool = shuffle
    self.random_seed: int | None = random_seed

split

split(X, y)

Generate train/test splits for K-fold cross-validation.

Parameters:

Name Type Description Default
X ndarray

Data array of shape (n_samples, n_features).

required
y ndarray

Target values of shape (n_samples,).

required

Returns:

Type Description
Generator[Tuple[ndarray, ndarray], None, None]

Generator yielding training and validation index tuples.

Source code in glassbox/orchestrator/splitters.py
def split(
    self, X: np.ndarray, y: np.ndarray
) -> Generator[Tuple[np.ndarray, np.ndarray], None, None]:
    """
    Generate train/test splits for K-fold cross-validation.

    Folds are contiguous runs of (optionally shuffled) sample indices;
    the first ``n_samples % n_splits`` folds get one extra sample.

    Parameters
    ----------
    X : np.ndarray
        Data array of shape (n_samples, n_features).
    y : np.ndarray
        Target values of shape (n_samples,). Not used for fold
        assignment; accepted for interface parity with stratified
        splitters.

    Returns
    -------
    Generator[Tuple[np.ndarray, np.ndarray], None, None]
        Generator yielding training and validation index tuples.

    Raises
    ------
    ValueError
        If ``n_splits`` is not in the range [2, n_samples].
    """
    n_samples = X.shape[0]
    if self.n_splits <= 1 or self.n_splits > n_samples:
        raise ValueError("n_splits must be between 2 and n_samples")

    indices = np.arange(n_samples)
    if self.shuffle:
        # Honor the documented ``random_seed`` option for reproducible
        # shuffles (read with getattr so older instances without the
        # attribute still work and remain unseeded).
        rng = np.random.default_rng(getattr(self, "random_seed", None))
        rng.shuffle(indices)

    # Base fold size, with the remainder spread over the leading folds.
    fold_sizes = np.full(self.n_splits, n_samples // self.n_splits, dtype=int)
    fold_sizes[: n_samples % self.n_splits] += 1

    start = 0
    for fold_size in fold_sizes:
        stop = start + fold_size
        val_idx = indices[start:stop]
        train_idx = np.concatenate([indices[:start], indices[stop:]])
        yield train_idx, val_idx
        start = stop

StratifiedKFoldSplitter

StratifiedKFoldSplitter(n_splits=5, shuffle=False)

Bases: BaseSplitter

Stratified K-fold cross-validation splitter.

Parameters:

Name Type Description Default
n_splits int

Number of folds.

5
shuffle bool

Whether to shuffle data before splitting.

False
Source code in glassbox/orchestrator/base_splitter.py
def __init__(
    self,
    n_splits: int = 5,
    shuffle: bool = False,
    random_seed: int | None = None,
) -> None:
    """
    Initialize the stratified K-fold splitter.

    Parameters
    ----------
    n_splits : int, default=5
        Number of folds. Validated in ``split``.
    shuffle : bool, default=False
        Whether to shuffle data before splitting.
    random_seed : int | None, default=None
        Seed for reproducible shuffling; ``None`` leaves the shuffle
        unseeded. Added so the documented usage
        ``StratifiedKFoldSplitter(..., random_seed=42)`` does not raise
        TypeError.
    """
    self.n_splits: int = n_splits
    self.shuffle: bool = shuffle
    self.random_seed: int | None = random_seed

split

split(X, y)

Generate stratified train/test splits for K-fold cross-validation.

Parameters:

Name Type Description Default
X ndarray

Data array of shape (n_samples, n_features).

required
y ndarray

Target values of shape (n_samples,).

required

Returns:

Type Description
Generator[Tuple[ndarray, ndarray], None, None]

Generator yielding training and validation index tuples.

Source code in glassbox/orchestrator/splitters.py
def split(
    self, X: np.ndarray, y: np.ndarray
) -> Generator[Tuple[np.ndarray, np.ndarray], None, None]:
    """
    Generate stratified train/test splits for K-fold cross-validation.

    Samples of each class are dealt round-robin across the folds, so
    every fold keeps approximately the class proportions of ``y``.

    Parameters
    ----------
    X : np.ndarray
        Data array of shape (n_samples, n_features).
    y : np.ndarray
        Target values of shape (n_samples,).

    Returns
    -------
    Generator[Tuple[np.ndarray, np.ndarray], None, None]
        Generator yielding training and validation index tuples.

    Raises
    ------
    ValueError
        If ``n_splits`` is not in the range [2, n_samples].
    """
    n_samples = X.shape[0]
    if self.n_splits <= 1 or self.n_splits > n_samples:
        raise ValueError("n_splits must be between 2 and n_samples")

    # Honor the documented ``random_seed`` option for reproducible
    # shuffles (getattr keeps instances without the attribute working).
    rng = (
        np.random.default_rng(getattr(self, "random_seed", None))
        if self.shuffle
        else None
    )

    folds = [[] for _ in range(self.n_splits)]
    for cls in np.unique(y):
        # One shuffle per class is sufficient; shuffling the global
        # index order first as well would be redundant.
        class_indices = np.flatnonzero(y == cls)
        if rng is not None:
            rng.shuffle(class_indices)
        for position, sample_idx in enumerate(class_indices):
            folds[position % self.n_splits].append(int(sample_idx))

    for fold_index in range(self.n_splits):
        val_idx = np.array(folds[fold_index], dtype=int)
        train_idx = np.array(
            [i for j, fold in enumerate(folds) if j != fold_index for i in fold],
            dtype=int,
        )
        yield train_idx, val_idx