From 6d17e6a42eb36b4404466afcdbfbe0ac893ef579 Mon Sep 17 00:00:00 2001
From: Ysobel <ysobelsims@gmail.com>
Date: Wed, 21 Jan 2026 22:23:58 +1100
Subject: [PATCH] Add "random" baseline model (sklearn DummyClassifier/DummyRegressor)

---
 configs/penguins_config.yaml     |  5 +++++
 configs/penguins_multilabel.yaml |  5 +++++
 configs/possum_config.yaml       |  5 +++++
 configs/possum_multilabel.yaml   |  5 +++++
 ecosci/models.py                 | 23 +++++++++++++++++++++++
 5 files changed, 43 insertions(+)

diff --git a/configs/penguins_config.yaml b/configs/penguins_config.yaml
index ca9250b..1ae288a 100644
--- a/configs/penguins_config.yaml
+++ b/configs/penguins_config.yaml
@@ -75,6 +75,11 @@ models:
       solver: adam
       learning_rate: adaptive
 
+  # Random baseline for comparison
+  - name: random
+    params:
+      strategy: stratified   # Samples predictions at random from the training class distribution
+
 training:
   repetitions: 10          # Train 10 times with different random seeds
   random_seed: 42         # Base seed for reproducibility
diff --git a/configs/penguins_multilabel.yaml b/configs/penguins_multilabel.yaml
index 5452b6f..062bae2 100644
--- a/configs/penguins_multilabel.yaml
+++ b/configs/penguins_multilabel.yaml
@@ -54,6 +54,11 @@ models:
       max_iter: 1000
       class_weight: balanced
 
+  # Random baseline for comparison
+  - name: random
+    params:
+      strategy: stratified   # Samples predictions at random from the training class distribution
+
 # Training configuration
 training:
   repetitions: 5           # Train 5 times with different seeds
diff --git a/configs/possum_config.yaml b/configs/possum_config.yaml
index d5565be..37216fc 100644
--- a/configs/possum_config.yaml
+++ b/configs/possum_config.yaml
@@ -81,6 +81,11 @@ models:
     params:
       fit_intercept: true
 
+  # Naive baseline for comparison (the "random" model; with `mean` it is a constant predictor)
+  - name: random
+    params:
+      strategy: mean         # Predicts the mean of training targets
+
 # Training configuration
 training:
   repetitions: 10            # Train each model 10 times with different seeds
diff --git a/configs/possum_multilabel.yaml b/configs/possum_multilabel.yaml
index b293047..43ea570 100644
--- a/configs/possum_multilabel.yaml
+++ b/configs/possum_multilabel.yaml
@@ -51,6 +51,11 @@ models:
   - name: linear
     params: {}
 
+  # Naive baseline for comparison (the "random" model; with `mean` it is a constant predictor)
+  - name: random
+    params:
+      strategy: mean         # Predicts the mean of training targets
+
 # Training configuration
 training:
   repetitions: 5           # Train 5 times with different seeds
diff --git a/ecosci/models.py b/ecosci/models.py
index c59632d..fcf7491 100644
--- a/ecosci/models.py
+++ b/ecosci/models.py
@@ -7,6 +7,7 @@
 - xgboost      : gradient boosting trees (requires `xgboost` package)
 - logistic     : logistic regression (classification baseline)
 - linear       : linear regression (regression only)
+- random       : naive baseline (sklearn DummyClassifier for classification, DummyRegressor for regression)
 
 All model hyperparameters come from `models[].params` in the YAML and are passed
 through to the underlying scikit-learn/xgboost classes. This keeps the code
@@ -186,6 +187,28 @@ def get_model(
                 return MultiOutputRegressor(base_model)
             return base_model
 
+        if name.lower() == "random":
+            from sklearn.dummy import DummyClassifier, DummyRegressor
+
+            # DummyClassifier/DummyRegressor serve as naive baselines (only some strategies are random)
+            # Strategies for classification: "most_frequent", "prior", "stratified", "uniform", "constant"
+            # Strategies for regression: "mean", "median", "quantile", "constant"
+            if problem_type == "classification":
+                strategy = params.get("strategy", "stratified")
+                base_model = DummyClassifier(
+                    strategy=strategy,
+                    random_state=params.get("random_state", 0),
+                    **{k: v for k, v in params.items() if k not in ["strategy", "random_state"]},
+                )
+                return ModelZoo.wrap_for_multioutput(base_model, problem_type, n_outputs)
+            else:
+                strategy = params.get("strategy", "mean")
+                base_model = DummyRegressor(
+                    strategy=strategy,
+                    **{k: v for k, v in params.items() if k not in ["strategy", "random_state"]},
+                )
+                return ModelZoo.wrap_for_multioutput(base_model, problem_type, n_outputs)
+
         raise ValueError(f"Unknown model name: {name}")
     
     @staticmethod