From 6d17e6a42eb36b4404466afcdbfbe0ac893ef579 Mon Sep 17 00:00:00 2001 From: Ysobel Date: Wed, 21 Jan 2026 22:23:58 +1100 Subject: [PATCH] add random --- configs/penguins_config.yaml | 5 +++++ configs/penguins_multilabel.yaml | 5 +++++ configs/possum_config.yaml | 5 +++++ configs/possum_multilabel.yaml | 5 +++++ ecosci/models.py | 23 +++++++++++++++++++++++ 5 files changed, 43 insertions(+) diff --git a/configs/penguins_config.yaml b/configs/penguins_config.yaml index ca9250b..1ae288a 100644 --- a/configs/penguins_config.yaml +++ b/configs/penguins_config.yaml @@ -75,6 +75,11 @@ models: solver: adam learning_rate: adaptive + # Random baseline for comparison + - name: random + params: + strategy: stratified # Predicts based on class distribution + training: repetitions: 10 # Train 10 times with different random seeds random_seed: 42 # Base seed for reproducibility diff --git a/configs/penguins_multilabel.yaml b/configs/penguins_multilabel.yaml index 5452b6f..062bae2 100644 --- a/configs/penguins_multilabel.yaml +++ b/configs/penguins_multilabel.yaml @@ -54,6 +54,11 @@ models: max_iter: 1000 class_weight: balanced + # Random baseline for comparison + - name: random + params: + strategy: stratified # Predicts based on class distribution + # Training configuration training: repetitions: 5 # Train 5 times with different seeds diff --git a/configs/possum_config.yaml b/configs/possum_config.yaml index d5565be..37216fc 100644 --- a/configs/possum_config.yaml +++ b/configs/possum_config.yaml @@ -81,6 +81,11 @@ models: params: fit_intercept: true + # Random baseline for comparison + - name: random + params: + strategy: mean # Predicts the mean of training targets + # Training configuration training: repetitions: 10 # Train each model 10 times with different seeds diff --git a/configs/possum_multilabel.yaml b/configs/possum_multilabel.yaml index b293047..43ea570 100644 --- a/configs/possum_multilabel.yaml +++ b/configs/possum_multilabel.yaml @@ -51,6 +51,11 @@ models: - name: linear params: {} + # Random baseline for comparison + - name: random + params: + strategy: mean # Predicts the mean of training targets + # Training configuration training: repetitions: 5 # Train 5 times with different seeds diff --git a/ecosci/models.py b/ecosci/models.py index c59632d..fcf7491 100644 --- a/ecosci/models.py +++ b/ecosci/models.py @@ -7,6 +7,7 @@ - xgboost : gradient boosting trees (requires `xgboost` package) - logistic : logistic regression (classification baseline) - linear : linear regression (regression only) +- random : random baseline (uses sklearn DummyClassifier/DummyRegressor) All model hyperparameters come from `models[].params` in the YAML and are passed through to the underlying scikit-learn/xgboost classes. This keeps the code @@ -186,6 +187,28 @@ def get_model( return MultiOutputRegressor(base_model) return base_model + if name.lower() == "random": + from sklearn.dummy import DummyClassifier, DummyRegressor + + # DummyClassifier/Regressor serve as random baselines + # Strategies for classification: "most_frequent", "prior", "stratified", "uniform" + # Strategies for regression: "mean", "median", "quantile", "constant" + if problem_type == "classification": + strategy = params.get("strategy", "stratified") + base_model = DummyClassifier( + strategy=strategy, + random_state=params.get("random_state", 0), + **{k: v for k, v in params.items() if k not in ["strategy", "random_state"]}, + ) + return ModelZoo.wrap_for_multioutput(base_model, problem_type, n_outputs) + else: + strategy = params.get("strategy", "mean") + base_model = DummyRegressor( + strategy=strategy, + **{k: v for k, v in params.items() if k not in ["strategy", "random_state"]}, + ) + return ModelZoo.wrap_for_multioutput(base_model, problem_type, n_outputs) + raise ValueError(f"Unknown model name: {name}") @staticmethod