From 20e37a2d30f182695a10fea50a887cd67e24cd0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Wed, 31 Dec 2025 02:06:06 +0100 Subject: [PATCH 1/8] experimental model framework --- openml/__init__.py | 2 + openml/_get.py | 9 ++ openml/base/__init__.py | 6 + openml/{base.py => base/_base.py} | 3 +- openml/base/_base_pkg.py | 120 +++++++++++++++++++ openml/models/__init__.py | 5 + openml/models/_get.py | 63 ++++++++++ openml/models/apis/__init__.py | 5 + openml/models/apis/_classifier.py | 24 ++++ openml/models/base/__init__.py | 5 + openml/models/base/_base.py | 41 +++++++ openml/models/classification/__init__.py | 1 + openml/models/classification/auto_sklearn.py | 14 +++ openml/models/classification/xgboost.py | 14 +++ pyproject.toml | 1 + 15 files changed, 311 insertions(+), 2 deletions(-) create mode 100644 openml/_get.py create mode 100644 openml/base/__init__.py rename openml/{base.py => base/_base.py} (98%) create mode 100644 openml/base/_base_pkg.py create mode 100644 openml/models/__init__.py create mode 100644 openml/models/_get.py create mode 100644 openml/models/apis/__init__.py create mode 100644 openml/models/apis/_classifier.py create mode 100644 openml/models/base/__init__.py create mode 100644 openml/models/base/_base.py create mode 100644 openml/models/classification/__init__.py create mode 100644 openml/models/classification/auto_sklearn.py create mode 100644 openml/models/classification/xgboost.py diff --git a/openml/__init__.py b/openml/__init__.py index c49505eb9..f93cbb5d3 100644 --- a/openml/__init__.py +++ b/openml/__init__.py @@ -48,6 +48,7 @@ OpenMLSupervisedTask, OpenMLTask, ) +from openml._get import get def populate_cache( @@ -120,4 +121,5 @@ def populate_cache( "utils", "_api_calls", "__version__", + "get", ] diff --git a/openml/_get.py b/openml/_get.py new file mode 100644 index 000000000..b576668db --- /dev/null +++ b/openml/_get.py @@ -0,0 +1,9 @@ +"""Global get dispatch utility.""" + +# currently just a forward to models +# to discuss and possibly +# todo: add global get utility here +# in general, e.g., datasets will not have same name as models etc +from openml.models import get + +__all__ = ["get"] diff --git a/openml/base/__init__.py b/openml/base/__init__.py new file mode 100644 index 000000000..76a88c42b --- /dev/null +++ b/openml/base/__init__.py @@ -0,0 +1,6 @@ +"""Module of base classes.""" + +from openml.base._base import OpenMLBase +from openml.base._base_pkg import _BasePkg + +__all__ = ["_BasePkg", "OpenMLBase"] diff --git a/openml/base.py b/openml/base/_base.py similarity index 98% rename from openml/base.py rename to openml/base/_base.py index fbfb9dfc8..de2b387bf 100644 --- a/openml/base.py +++ b/openml/base/_base.py @@ -10,8 +10,7 @@ import openml._api_calls import openml.config - -from .utils import _get_rest_api_type_alias, _tag_openml_base +from openml.utils import _get_rest_api_type_alias, _tag_openml_base class OpenMLBase(ABC): diff --git a/openml/base/_base_pkg.py b/openml/base/_base_pkg.py new file mode 100644 index 000000000..9f5d6005e --- /dev/null +++ b/openml/base/_base_pkg.py @@ -0,0 +1,120 @@ +"""Base Packager class.""" + +import inspect +from pathlib import Path +import sys +import textwrap + +from skbase.base import BaseObject +from skbase.utils.dependencies import _check_estimator_deps + + +class _BasePkg(BaseObject): + + _tags = { + "python_dependencies": None, + "python_version": None, + # package register and manifest + "pkg_id": None, # object id contained, "__multiple" if multiple + "pkg_obj": 
"reference", # or "code" + "pkg_obj_type": None, # openml API type + "pkg_compression": "zlib", # compression + } + + def __init__(self): + super().__init__() + + def materialize(self): + try: + _check_estimator_deps(obj=self) + except ModuleNotFoundError as e: + # prettier message, so the reference is to the pkg_id + # currently, we cannot simply pass the object name to skbase + # in the error message, so this is a hack + # todo: fix this in scikit-base + msg = str(e) + if len(msg) > 11: + msg = msg[11:] + raise ModuleNotFoundError(msg) from e + + return self._materialize() + + def _materialize(self): + raise RuntimeError("abstract method") + + def serialize(self): + cls_str = class_to_source(type(self)) + compress_method = self.get_tag("pkg_compression") + if compress_method in [None, "None"]: + return cls_str + + cls_str = cls_str.encode("utf-8") + exec(f"import {compress_method}") + compressed_str = eval(f"{compress_method}.compress(cls_str)") + + return compressed_str + + +def _has_source(obj) -> bool: + """ + Return True if inspect.getsource(obj) should succeed. + """ + module_name = getattr(obj, "__module__", None) + if not module_name or module_name not in sys.modules: + return False + + module = sys.modules[module_name] + file = getattr(module, "__file__", None) + if not file: + return False + + return Path(file).suffix == ".py" + + +def class_to_source(cls) -> str: + """Return full source definition of python class as string. + + Parameters + ---------- + cls : class to serialize + + Returns + ------- + str : complete definition of cls, as str. + Imports are not contained or serialized. + """"" + + # Fast path: class has retrievable source + if _has_source(cls): + source = inspect.getsource(cls) + return textwrap.dedent(source) + + # Fallback for dynamically created classes + lines = [] + + bases = [base.__name__ for base in cls.__bases__ if base is not object] + base_str = f"({', '.join(bases)})" if bases else "" + lines.append(f"class {cls.__name__}{base_str}:") + + body_added = False + + for name, value in cls.__dict__.items(): + if name.startswith("__") and name.endswith("__"): + continue + + if inspect.isfunction(value): + if _has_source(value): + method_src = inspect.getsource(value) + method_src = textwrap.indent(textwrap.dedent(method_src), " ") + lines.append(method_src) + else: + lines.append(f" def {name}(self): ...") + body_added = True + else: + lines.append(f" {name} = {repr(value)}") + body_added = True + + if not body_added: + lines.append(" pass") + + return "\n".join(lines) diff --git a/openml/models/__init__.py b/openml/models/__init__.py new file mode 100644 index 000000000..ae833fc63 --- /dev/null +++ b/openml/models/__init__.py @@ -0,0 +1,5 @@ +"""Module with packaging adapters.""" + +from openml.models._get import get + +__all__ = ["get"] diff --git a/openml/models/_get.py b/openml/models/_get.py new file mode 100644 index 000000000..b270ec0b6 --- /dev/null +++ b/openml/models/_get.py @@ -0,0 +1,63 @@ + +"""Model retrieval utility.""" + +from functools import lru_cache + + +def get(id: str): + """Retrieve model object with unique identifier. + + Parameter + --------- + id : str + unique identifier of object to retrieve + + Returns + ------- + class + retrieved object + + Raises + ------ + ModuleNotFoundError + if dependencies of object to retrieve are not satisfied + """ + + id_lookup = _id_lookup() + obj = id_lookup.get(id) + if obj is None: + raise ValueError( + f"Error in openml.get, object with package id {id} " + "does not exist." 
+ ) + return obj().materialize() + + +# todo: need to generalize this later to more types +# currently intentionally retrieves only classifiers +# todo: replace this, optionally, by database backend +def _id_lookup(obj_type=None): + return _id_lookup_cached(obj_type=obj_type).copy() + + +@lru_cache +def _id_lookup_cached(obj_type=None): + all_objs = _all_objects(obj_type=obj_type) + + # todo: generalize that pkg can contain more than one object + lookup_dict = {obj.get_class_tag("pkg_id"): obj for obj in all_objs} + + return lookup_dict + + +@lru_cache +def _all_objects(obj_type=None): + from skbase.lookup import all_objects + + from openml.models.apis._classifier import _ModelPkgClassifier + + clses = all_objects( + object_types=_ModelPkgClassifier, package_name="openml", return_names=False + ) + + return clses diff --git a/openml/models/apis/__init__.py b/openml/models/apis/__init__.py new file mode 100644 index 000000000..f560dcf6f --- /dev/null +++ b/openml/models/apis/__init__.py @@ -0,0 +1,5 @@ +"""Module with packaging adapters.""" + +from openml.models.apis._classifier import _ModelPkgClassifier + +__all__ = ["_ModelPkgClassifier"] diff --git a/openml/models/apis/_classifier.py b/openml/models/apis/_classifier.py new file mode 100644 index 000000000..a6d75b967 --- /dev/null +++ b/openml/models/apis/_classifier.py @@ -0,0 +1,24 @@ +"""Base package for sklearn classifiers.""" + +from openml.models.base import _OpenmlModelPkg + + +class _ModelPkgClassifier(_OpenmlModelPkg): + + _tags = { + # tags specific to API type + "pkg_obj_type": "classifier", + } + + def get_obj_tags(self): + """Return tags of the object as a dictionary.""" + return {} # this needs to be implemented + + def get_obj_param_names(self): + """Return parameter names of the object as a list. + + Returns + ------- + list: names of object parameters + """ + return list(self.materialize()().get_params().keys()) diff --git a/openml/models/base/__init__.py b/openml/models/base/__init__.py new file mode 100644 index 000000000..a60e1e404 --- /dev/null +++ b/openml/models/base/__init__.py @@ -0,0 +1,5 @@ +"""Module with packaging adapters.""" + +from openml.models.base._base import _OpenmlModelPkg + +__all__ = ["_OpenmlModelPkg"] diff --git a/openml/models/base/_base.py b/openml/models/base/_base.py new file mode 100644 index 000000000..4384e754c --- /dev/null +++ b/openml/models/base/_base.py @@ -0,0 +1,41 @@ +"""Base model package class.""" + +from openml.base import _BasePkg + + +class _OpenmlModelPkg(_BasePkg): + + _obj = None + + def _materialize(self): + pkg_obj = self.get_tag("pkg_obj") + + _obj = self._obj + + if _obj is None: + raise ValueError( + "Error in materialize." + "Either _materialize must be implemented, or" + "the _obj attribute must be not None." 
+ ) + + if pkg_obj == "reference": + from skbase.utils.dependencies import _safe_import + + obj = _safe_import(self._obj) + return obj + + elif pkg_obj == "code": + exec(self._obj) + + return obj + + # elif pkg_obj == "craft": + # identify and call appropriate craft method + + else: + raise ValueError( + 'Error in package tag "pkg_obj", ' + 'must be one of "reference", "code", "craft", ' + f'but found value {pkg_obj}, of type {type(pkg_obj)}' + ) diff --git a/openml/models/classification/__init__.py b/openml/models/classification/__init__.py new file mode 100644 index 000000000..e547a50cf --- /dev/null +++ b/openml/models/classification/__init__.py @@ -0,0 +1 @@ +"""Sklearn classification models.""" diff --git a/openml/models/classification/auto_sklearn.py b/openml/models/classification/auto_sklearn.py new file mode 100644 index 000000000..0be641394 --- /dev/null +++ b/openml/models/classification/auto_sklearn.py @@ -0,0 +1,14 @@ +"""Auto-sklearn classifier.""" + + +from openml.models.apis import _ModelPkgClassifier + + +class OpenmlPkg__AutoSklearnClassifier(_ModelPkgClassifier): + + _tags = { + "pkg_id": "AutoSklearnClassifier", + "python_dependencies": "auto-sklearn", + } + + _obj = "autosklearn.classification.AutoSklearnClassifier" diff --git a/openml/models/classification/xgboost.py b/openml/models/classification/xgboost.py new file mode 100644 index 000000000..44f3173fe --- /dev/null +++ b/openml/models/classification/xgboost.py @@ -0,0 +1,14 @@ +"""Xgboost classifier.""" + + +from openml.models.apis import _ModelPkgClassifier + + +class OpenmlPkg__XGBClassifier(_ModelPkgClassifier): + + _tags = { + "pkg_id": "XGBClassifier", + "python_dependencies": "xgboost", + } + + _obj = "xgboost.XGBClassifier" diff --git a/pyproject.toml b/pyproject.toml index 2bf762b09..83b62554d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,7 @@ dependencies = [ "minio", "pyarrow", "tqdm", # For MinIO download progress bars + "scikit-base", ] requires-python = ">=3.8" maintainers = [ From d79fbe52dceca77b70b1c46a7b32aceecef2aa71 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 31 Dec 2025 01:22:05 +0000 Subject: [PATCH 2/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- openml/__init__.py | 3 ++- openml/_get.py | 2 ++ openml/base/_base_pkg.py | 17 +++++++---------- openml/models/_get.py | 19 +++++-------------- openml/models/apis/_classifier.py | 3 ++- openml/models/base/_base.py | 19 +++++++++---------- openml/models/classification/auto_sklearn.py | 2 +- openml/models/classification/xgboost.py | 2 +- 8 files changed, 29 insertions(+), 38 deletions(-) diff --git a/openml/__init__.py b/openml/__init__.py index f93cbb5d3..7eb077057 100644 --- a/openml/__init__.py +++ b/openml/__init__.py @@ -18,6 +18,8 @@ # License: BSD 3-Clause from __future__ import annotations +from openml._get import get + from . 
import ( _api_calls, config, @@ -48,7 +50,6 @@ OpenMLSupervisedTask, OpenMLTask, ) -from openml._get import get def populate_cache( diff --git a/openml/_get.py b/openml/_get.py index b576668db..0c5e9739e 100644 --- a/openml/_get.py +++ b/openml/_get.py @@ -4,6 +4,8 @@ # to discuss and possibly # todo: add global get utility here # in general, e.g., datasets will not have same name as models etc +from __future__ import annotations + from openml.models import get __all__ = ["get"] diff --git a/openml/base/_base_pkg.py b/openml/base/_base_pkg.py index 9f5d6005e..690b93a86 100644 --- a/openml/base/_base_pkg.py +++ b/openml/base/_base_pkg.py @@ -1,16 +1,17 @@ """Base Packager class.""" +from __future__ import annotations + import inspect -from pathlib import Path import sys import textwrap +from pathlib import Path from skbase.base import BaseObject from skbase.utils.dependencies import _check_estimator_deps class _BasePkg(BaseObject): - _tags = { "python_dependencies": None, "python_version": None, @@ -50,15 +51,11 @@ def serialize(self): cls_str = cls_str.encode("utf-8") exec(f"import {compress_method}") - compressed_str = eval(f"{compress_method}.compress(cls_str)") - - return compressed_str + return eval(f"{compress_method}.compress(cls_str)") def _has_source(obj) -> bool: - """ - Return True if inspect.getsource(obj) should succeed. - """ + """Return True if inspect.getsource(obj) should succeed.""" module_name = getattr(obj, "__module__", None) if not module_name or module_name not in sys.modules: return False @@ -82,7 +79,7 @@ def class_to_source(cls) -> str: ------- str : complete definition of cls, as str. Imports are not contained or serialized. - """"" + """ "" # Fast path: class has retrievable source if _has_source(cls): @@ -111,7 +108,7 @@ def class_to_source(cls) -> str: lines.append(f" def {name}(self): ...") body_added = True else: - lines.append(f" {name} = {repr(value)}") + lines.append(f" {name} = {value!r}") body_added = True if not body_added: diff --git a/openml/models/_get.py b/openml/models/_get.py index b270ec0b6..75b807ca7 100644 --- a/openml/models/_get.py +++ b/openml/models/_get.py @@ -1,6 +1,7 @@ - """Model retrieval utility.""" +from __future__ import annotations + from functools import lru_cache @@ -22,14 +23,10 @@ def get(id: str): ModuleNotFoundError if dependencies of object to retrieve are not satisfied """ - id_lookup = _id_lookup() obj = id_lookup.get(id) if obj is None: - raise ValueError( - f"Error in openml.get, object with package id {id} " - "does not exist." 
- ) + raise ValueError(f"Error in openml.get, object with package id {id} " "does not exist.") return obj().materialize() @@ -45,9 +42,7 @@ def _id_lookup_cached(obj_type=None): all_objs = _all_objects(obj_type=obj_type) # todo: generalize that pkg can contain more than one object - lookup_dict = {obj.get_class_tag("pkg_id"): obj for obj in all_objs} - - return lookup_dict + return {obj.get_class_tag("pkg_id"): obj for obj in all_objs} @lru_cache @@ -56,8 +51,4 @@ def _all_objects(obj_type=None): from openml.models.apis._classifier import _ModelPkgClassifier - clses = all_objects( - object_types=_ModelPkgClassifier, package_name="openml", return_names=False - ) - - return clses + return all_objects(object_types=_ModelPkgClassifier, package_name="openml", return_names=False) diff --git a/openml/models/apis/_classifier.py b/openml/models/apis/_classifier.py index a6d75b967..c1198ee32 100644 --- a/openml/models/apis/_classifier.py +++ b/openml/models/apis/_classifier.py @@ -1,10 +1,11 @@ """Base package for sklearn classifiers.""" +from __future__ import annotations + from openml.models.base import _OpenmlModelPkg class _ModelPkgClassifier(_OpenmlModelPkg): - _tags = { # tags specific to API type "pkg_obj_type": "classifier", diff --git a/openml/models/base/_base.py b/openml/models/base/_base.py index 4384e754c..6b3fa2a92 100644 --- a/openml/models/base/_base.py +++ b/openml/models/base/_base.py @@ -1,10 +1,11 @@ """Base model package class.""" +from __future__ import annotations + from openml.base import _BasePkg class _OpenmlModelPkg(_BasePkg): - _obj = None def _materialize(self): @@ -22,10 +23,9 @@ def _materialize(self): if pkg_obj == "reference": from skbase.utils.dependencies import _safe_import - obj = _safe_import(self._obj) - return obj + return _safe_import(self._obj) - elif pkg_obj == "code": + if pkg_obj == "code": exec(self._obj) return obj @@ -33,9 +33,8 @@ def _materialize(self): # elif pkg_obj == "craft": # identify and call appropriate craft method - else: - raise ValueError( - 'Error in package tag "pkg_obj", ' - 'must be one of "reference", "code", "craft", ' - f'but found value {pkg_obj}, of type {type(pkg_obj)}' - ) + raise ValueError( + 'Error in package tag "pkg_obj", ' + 'must be one of "reference", "code", "craft", ' + f"but found value {pkg_obj}, of type {type(pkg_obj)}" + ) diff --git a/openml/models/classification/auto_sklearn.py b/openml/models/classification/auto_sklearn.py index 0be641394..1d29044da 100644 --- a/openml/models/classification/auto_sklearn.py +++ b/openml/models/classification/auto_sklearn.py @@ -1,11 +1,11 @@ """Auto-sklearn classifier.""" +from __future__ import annotations from openml.models.apis import _ModelPkgClassifier class OpenmlPkg__AutoSklearnClassifier(_ModelPkgClassifier): - _tags = { "pkg_id": "AutoSklearnClassifier", "python_dependencies": "auto-sklearn", diff --git a/openml/models/classification/xgboost.py b/openml/models/classification/xgboost.py index 44f3173fe..5b91e647c 100644 --- a/openml/models/classification/xgboost.py +++ b/openml/models/classification/xgboost.py @@ -1,11 +1,11 @@ """Xgboost classifier.""" +from __future__ import annotations from openml.models.apis import _ModelPkgClassifier class OpenmlPkg__XGBClassifier(_ModelPkgClassifier): - _tags = { "pkg_id": "XGBClassifier", "python_dependencies": "xgboost", From dc15c669213f71e17cda16d8b91c3045f71143d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Fri, 9 Jan 2026 21:26:15 +0100 Subject: [PATCH 3/8] move utils --- openml/utils/__init__.py | 35 
+++ openml/utils/_openml.py | 471 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 506 insertions(+) create mode 100644 openml/utils/__init__.py create mode 100644 openml/utils/_openml.py diff --git a/openml/utils/__init__.py b/openml/utils/__init__.py new file mode 100644 index 000000000..83e379222 --- /dev/null +++ b/openml/utils/__init__.py @@ -0,0 +1,35 @@ +"""Utilities module.""" + +from openml.utils._openml import ( + ProgressBar, + _create_cache_directory, + _create_cache_directory_for_id, + _create_lockfiles_dir, + _delete_entity, + _get_cache_dir_for_id, + _get_cache_dir_for_key, + _get_rest_api_type_alias, + _list_all, + _remove_cache_dir_for_id, + _tag_entity, + _tag_openml_base, + extract_xml_tags, + thread_safe_if_oslo_installed, +) + +__all__ = [ + "ProgressBar", + "_create_cache_directory", + "_create_cache_directory_for_id", + "_create_lockfiles_dir", + "_delete_entity", + "_get_cache_dir_for_id", + "_get_cache_dir_for_key", + "_get_rest_api_type_alias", + "_list_all", + "_remove_cache_dir_for_id", + "_tag_entity", + "_tag_openml_base", + "extract_xml_tags", + "thread_safe_if_oslo_installed", +] diff --git a/openml/utils/_openml.py b/openml/utils/_openml.py new file mode 100644 index 000000000..7e72e7aee --- /dev/null +++ b/openml/utils/_openml.py @@ -0,0 +1,471 @@ +# License: BSD 3-Clause +from __future__ import annotations + +import contextlib +import shutil +import warnings +from functools import wraps +from pathlib import Path +from typing import TYPE_CHECKING, Any, Callable, Mapping, Sized, TypeVar, overload +from typing_extensions import Literal, ParamSpec + +import numpy as np +import xmltodict +from minio.helpers import ProgressType +from tqdm import tqdm + +import openml +import openml._api_calls +import openml.exceptions + +from . import config + +# Avoid import cycles: https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles +if TYPE_CHECKING: + from openml.base import OpenMLBase + + P = ParamSpec("P") + R = TypeVar("R") + _SizedT = TypeVar("_SizedT", bound=Sized) + + +@overload +def extract_xml_tags( + xml_tag_name: str, + node: Mapping[str, Any], + *, + allow_none: Literal[True] = ..., +) -> Any | None: ... + + +@overload +def extract_xml_tags( + xml_tag_name: str, + node: Mapping[str, Any], + *, + allow_none: Literal[False], +) -> Any: ... + + +def extract_xml_tags( + xml_tag_name: str, + node: Mapping[str, Any], + *, + allow_none: bool = True, +) -> Any | None: + """Helper to extract xml tags from xmltodict. + + Parameters + ---------- + xml_tag_name : str + Name of the xml tag to extract from the node. + + node : Mapping[str, Any] + Node object returned by ``xmltodict`` from which ``xml_tag_name`` + should be extracted. + + allow_none : bool + If ``False``, the tag needs to exist in the node. Will raise a + ``ValueError`` if it does not. 
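+
+    Example
+    -------
+    A minimal sketch with a hypothetical node; single values are wrapped
+    in a one-element list:
+
+    >>> extract_xml_tags("oml:tag", {"oml:tag": "weather"})
+    ['weather']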
+ + Returns + ------- + object + """ + if xml_tag_name in node and node[xml_tag_name] is not None: + if isinstance(node[xml_tag_name], (dict, str)): + return [node[xml_tag_name]] + if isinstance(node[xml_tag_name], list): + return node[xml_tag_name] + + raise ValueError("Received not string and non list as tag item") + + if allow_none: + return None + + raise ValueError(f"Could not find tag '{xml_tag_name}' in node '{node!s}'") + + +def _get_rest_api_type_alias(oml_object: OpenMLBase) -> str: + """Return the alias of the openml entity as it is defined for the REST API.""" + rest_api_mapping: list[tuple[type | tuple, str]] = [ + (openml.datasets.OpenMLDataset, "data"), + (openml.flows.OpenMLFlow, "flow"), + (openml.tasks.OpenMLTask, "task"), + (openml.runs.OpenMLRun, "run"), + ((openml.study.OpenMLStudy, openml.study.OpenMLBenchmarkSuite), "study"), + ] + _, api_type_alias = next( + (python_type, api_alias) + for (python_type, api_alias) in rest_api_mapping + if isinstance(oml_object, python_type) + ) + return api_type_alias + + +def _tag_openml_base(oml_object: OpenMLBase, tag: str, untag: bool = False) -> None: # noqa: FBT001, FBT002 + api_type_alias = _get_rest_api_type_alias(oml_object) + if oml_object.id is None: + raise openml.exceptions.ObjectNotPublishedError( + f"Cannot tag an {api_type_alias} that has not been published yet." + "Please publish the object first before being able to tag it." + f"\n{oml_object}", + ) + _tag_entity(entity_type=api_type_alias, entity_id=oml_object.id, tag=tag, untag=untag) + + +def _tag_entity(entity_type: str, entity_id: int, tag: str, *, untag: bool = False) -> list[str]: + """ + Function that tags or untags a given entity on OpenML. As the OpenML + API tag functions all consist of the same format, this function covers + all entity types (currently: dataset, task, flow, setup, run). Could + be used in a partial to provide dataset_tag, dataset_untag, etc. + + Parameters + ---------- + entity_type : str + Name of the entity to tag (e.g., run, flow, data) + + entity_id : int + OpenML id of the entity + + tag : str + The tag + + untag : bool + Set to true if needed to untag, rather than tag + + Returns + ------- + tags : list + List of tags that the entity is (still) tagged with + """ + legal_entities = {"data", "task", "flow", "setup", "run"} + if entity_type not in legal_entities: + raise ValueError(f"Can't tag a {entity_type}") + + if untag: + uri = f"{entity_type}/untag" + main_tag = f"oml:{entity_type}_untag" + else: + uri = f"{entity_type}/tag" + main_tag = f"oml:{entity_type}_tag" + + result_xml = openml._api_calls._perform_api_call( + uri, + "post", + {f"{entity_type}_id": entity_id, "tag": tag}, + ) + + result = xmltodict.parse(result_xml, force_list={"oml:tag"})[main_tag] + + if "oml:tag" in result: + return result["oml:tag"] # type: ignore + + # no tags, return empty list + return [] + + +# TODO(eddiebergman): Maybe this can be made more specific with a Literal +def _delete_entity(entity_type: str, entity_id: int) -> bool: + """ + Function that deletes a given entity on OpenML. As the OpenML + API tag functions all consist of the same format, this function covers + all entity types that can be deleted (currently: dataset, task, flow, + run, study and user). + + Parameters + ---------- + entity_type : str + Name of the entity to tag (e.g., run, flow, data) + + entity_id : int + OpenML id of the entity + + Returns + ------- + bool + True iff the deletion was successful. 
False otherwse + """ + legal_entities = { + "data", + "flow", + "task", + "run", + "study", + "user", + } + if entity_type not in legal_entities: + raise ValueError(f"Can't delete a {entity_type}") + + url_suffix = "%s/%d" % (entity_type, entity_id) + try: + result_xml = openml._api_calls._perform_api_call(url_suffix, "delete") + result = xmltodict.parse(result_xml) + return f"oml:{entity_type}_delete" in result + except openml.exceptions.OpenMLServerException as e: + # https://github.com/openml/OpenML/blob/21f6188d08ac24fcd2df06ab94cf421c946971b0/openml_OS/views/pages/api_new/v1/xml/pre.php + # Most exceptions are descriptive enough to be raised as their standard + # OpenMLServerException, however there are two cases where we add information: + # - a generic "failed" message, we direct them to the right issue board + # - when the user successfully authenticates with the server, + # but user is not allowed to take the requested action, + # in which case we specify a OpenMLNotAuthorizedError. + by_other_user = [323, 353, 393, 453, 594] + has_dependent_entities = [324, 326, 327, 328, 354, 454, 464, 595] + unknown_reason = [325, 355, 394, 455, 593] + if e.code in by_other_user: + raise openml.exceptions.OpenMLNotAuthorizedError( + message=( + f"The {entity_type} can not be deleted because it was not uploaded by you." + ), + ) from e + if e.code in has_dependent_entities: + raise openml.exceptions.OpenMLNotAuthorizedError( + message=( + f"The {entity_type} can not be deleted because " + f"it still has associated entities: {e.message}" + ), + ) from e + if e.code in unknown_reason: + raise openml.exceptions.OpenMLServerError( + message=( + f"The {entity_type} can not be deleted for unknown reason," + " please open an issue at: https://github.com/openml/openml/issues/new" + ), + ) from e + raise e + + +def _list_all( # noqa: C901 + listing_call: Callable[[int, int], _SizedT], + *, + limit: int | None = None, + offset: int | None = None, + batch_size: int | None = 10_000, +) -> list[_SizedT]: + """Helper to handle paged listing requests. + + Example usage: + + ``evaluations = list_all(list_evaluations, "predictive_accuracy", task=mytask)`` + + Parameters + ---------- + listing_call : callable + Call listing, e.g. list_evaluations. Takes two positional + arguments: batch_size and offset. + batch_size : int, optional + The batch size to use for the listing call. + offset : int, optional + The initial offset to use for the listing call. + limit : int, optional + The total size of the listing. If not provided, the function will + request the first batch and then continue until no more results are + returned + + Returns + ------- + List of types returned from type of the listing call + """ + page = 0 + results: list[_SizedT] = [] + + offset = offset if offset is not None else 0 + batch_size = batch_size if batch_size is not None else 10_000 + + LIMIT = limit + BATCH_SIZE_ORIG = batch_size + + # Default batch size per paging. + # This one can be set in filters (batch_size), but should not be + # changed afterwards. The derived batch_size can be changed. 
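+    # For example (hypothetical sizes): limit=25_000 with batch_size=10_000
+    # results in requests of 10_000, 10_000 and 5_000 items, since the
+    # last batch is trimmed to the remaining limit below.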
+ if not isinstance(BATCH_SIZE_ORIG, int): + raise ValueError(f"'batch_size' should be an integer but got {BATCH_SIZE_ORIG}") + + if (LIMIT is not None) and (not isinstance(LIMIT, int)) and (not np.isinf(LIMIT)): + raise ValueError(f"'limit' should be an integer or inf but got {LIMIT}") + + # If our batch size is larger than the limit, we should only + # request one batch of size of LIMIT + if LIMIT is not None and BATCH_SIZE_ORIG > LIMIT: + BATCH_SIZE_ORIG = LIMIT + + if not isinstance(offset, int): + raise ValueError(f"'offset' should be an integer but got {offset}") + + batch_size = BATCH_SIZE_ORIG + while True: + try: + current_offset = offset + BATCH_SIZE_ORIG * page + new_batch = listing_call(batch_size, current_offset) + except openml.exceptions.OpenMLServerNoResult: + # NOTE: This above statement may not actually happen, but we could just return here + # to enforce it... + break + + results.append(new_batch) + + # If the batch is less than our requested batch_size, that's the last batch + # and we can bail out. + if len(new_batch) < batch_size: + break + + page += 1 + if LIMIT is not None: + # check if the number of required results has been achieved + # always do a 'bigger than' check, + # in case of bugs to prevent infinite loops + n_received = sum(len(result) for result in results) + if n_received >= LIMIT: + break + + # check if there are enough results to fulfill a batch + if LIMIT - n_received < BATCH_SIZE_ORIG: + batch_size = LIMIT - n_received + + return results + + +def _get_cache_dir_for_key(key: str) -> Path: + return Path(config.get_cache_directory()) / key + + +def _create_cache_directory(key: str) -> Path: + cache_dir = _get_cache_dir_for_key(key) + + try: + cache_dir.mkdir(exist_ok=True, parents=True) + except Exception as e: + raise openml.exceptions.OpenMLCacheException( + f"Cannot create cache directory {cache_dir}." + ) from e + + return cache_dir + + +def _get_cache_dir_for_id(key: str, id_: int, create: bool = False) -> Path: # noqa: FBT001, FBT002 + cache_dir = _create_cache_directory(key) if create else _get_cache_dir_for_key(key) + return Path(cache_dir) / str(id_) + + +def _create_cache_directory_for_id(key: str, id_: int) -> Path: + """Create the cache directory for a specific ID + + In order to have a clearer cache structure and because every task + is cached in several files (description, split), there + is a directory for each task witch the task ID being the directory + name. This function creates this cache directory. + + This function is NOT thread/multiprocessing safe. + + Parameters + ---------- + key : str + + id_ : int + + Returns + ------- + cache_dir : Path + Path of the created dataset cache directory. + """ + cache_dir = _get_cache_dir_for_id(key, id_, create=True) + if cache_dir.exists() and not cache_dir.is_dir(): + raise ValueError(f"{key} cache dir exists but is not a directory!") + + cache_dir.mkdir(exist_ok=True, parents=True) + return cache_dir + + +def _remove_cache_dir_for_id(key: str, cache_dir: Path) -> None: + """Remove the task cache directory + + This function is NOT thread/multiprocessing safe. + + Parameters + ---------- + key : str + + cache_dir : str + """ + try: + shutil.rmtree(cache_dir) + except OSError as e: + raise ValueError( + f"Cannot remove faulty {key} cache directory {cache_dir}. 
Please do this manually!", + ) from e + + +def thread_safe_if_oslo_installed(func: Callable[P, R]) -> Callable[P, R]: + try: + # Currently, importing oslo raises a lot of warning that it will stop working + # under python3.8; remove this once they disappear + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + from oslo_concurrency import lockutils + + @wraps(func) + def safe_func(*args: P.args, **kwargs: P.kwargs) -> R: + # Lock directories use the id that is passed as either positional or keyword argument. + id_parameters = [parameter_name for parameter_name in kwargs if "_id" in parameter_name] + if len(id_parameters) == 1: + id_ = kwargs[id_parameters[0]] + elif len(args) > 0: + id_ = args[0] + else: + raise RuntimeError( + f"An id must be specified for {func.__name__}, was passed: ({args}, {kwargs}).", + ) + # The [7:] gets rid of the 'openml.' prefix + lock_name = f"{func.__module__[7:]}.{func.__name__}:{id_}" + with lockutils.external_lock(name=lock_name, lock_path=_create_lockfiles_dir()): + return func(*args, **kwargs) + + return safe_func + except ImportError: + return func + + +def _create_lockfiles_dir() -> Path: + path = Path(config.get_cache_directory()) / "locks" + # TODO(eddiebergman): Not sure why this is allowed to error and ignore??? + with contextlib.suppress(OSError): + path.mkdir(exist_ok=True, parents=True) + return path + + +class ProgressBar(ProgressType): + """Progressbar for MinIO function's `progress` parameter.""" + + def __init__(self) -> None: + self._object_name = "" + self._progress_bar: tqdm | None = None + + def set_meta(self, object_name: str, total_length: int) -> None: + """Initializes the progress bar. + + Parameters + ---------- + object_name: str + Not used. + + total_length: int + File size of the object in bytes. + """ + self._object_name = object_name + self._progress_bar = tqdm(total=total_length, unit_scale=True, unit="B") + + def update(self, length: int) -> None: + """Updates the progress bar. + + Parameters + ---------- + length: int + Number of bytes downloaded since last `update` call. + """ + if not self._progress_bar: + raise RuntimeError("Call `set_meta` before calling `update`.") + self._progress_bar.update(length) + if self._progress_bar.total <= self._progress_bar.n: + self._progress_bar.close() From 91c138cfc54badcfd0be81f8de704b6e38fcb4cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Fri, 9 Jan 2026 21:27:45 +0100 Subject: [PATCH 4/8] Delete utils.py --- openml/utils.py | 471 ------------------------------------------------ 1 file changed, 471 deletions(-) delete mode 100644 openml/utils.py diff --git a/openml/utils.py b/openml/utils.py deleted file mode 100644 index 7e72e7aee..000000000 --- a/openml/utils.py +++ /dev/null @@ -1,471 +0,0 @@ -# License: BSD 3-Clause -from __future__ import annotations - -import contextlib -import shutil -import warnings -from functools import wraps -from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, Mapping, Sized, TypeVar, overload -from typing_extensions import Literal, ParamSpec - -import numpy as np -import xmltodict -from minio.helpers import ProgressType -from tqdm import tqdm - -import openml -import openml._api_calls -import openml.exceptions - -from . 
import config - -# Avoid import cycles: https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles -if TYPE_CHECKING: - from openml.base import OpenMLBase - - P = ParamSpec("P") - R = TypeVar("R") - _SizedT = TypeVar("_SizedT", bound=Sized) - - -@overload -def extract_xml_tags( - xml_tag_name: str, - node: Mapping[str, Any], - *, - allow_none: Literal[True] = ..., -) -> Any | None: ... - - -@overload -def extract_xml_tags( - xml_tag_name: str, - node: Mapping[str, Any], - *, - allow_none: Literal[False], -) -> Any: ... - - -def extract_xml_tags( - xml_tag_name: str, - node: Mapping[str, Any], - *, - allow_none: bool = True, -) -> Any | None: - """Helper to extract xml tags from xmltodict. - - Parameters - ---------- - xml_tag_name : str - Name of the xml tag to extract from the node. - - node : Mapping[str, Any] - Node object returned by ``xmltodict`` from which ``xml_tag_name`` - should be extracted. - - allow_none : bool - If ``False``, the tag needs to exist in the node. Will raise a - ``ValueError`` if it does not. - - Returns - ------- - object - """ - if xml_tag_name in node and node[xml_tag_name] is not None: - if isinstance(node[xml_tag_name], (dict, str)): - return [node[xml_tag_name]] - if isinstance(node[xml_tag_name], list): - return node[xml_tag_name] - - raise ValueError("Received not string and non list as tag item") - - if allow_none: - return None - - raise ValueError(f"Could not find tag '{xml_tag_name}' in node '{node!s}'") - - -def _get_rest_api_type_alias(oml_object: OpenMLBase) -> str: - """Return the alias of the openml entity as it is defined for the REST API.""" - rest_api_mapping: list[tuple[type | tuple, str]] = [ - (openml.datasets.OpenMLDataset, "data"), - (openml.flows.OpenMLFlow, "flow"), - (openml.tasks.OpenMLTask, "task"), - (openml.runs.OpenMLRun, "run"), - ((openml.study.OpenMLStudy, openml.study.OpenMLBenchmarkSuite), "study"), - ] - _, api_type_alias = next( - (python_type, api_alias) - for (python_type, api_alias) in rest_api_mapping - if isinstance(oml_object, python_type) - ) - return api_type_alias - - -def _tag_openml_base(oml_object: OpenMLBase, tag: str, untag: bool = False) -> None: # noqa: FBT001, FBT002 - api_type_alias = _get_rest_api_type_alias(oml_object) - if oml_object.id is None: - raise openml.exceptions.ObjectNotPublishedError( - f"Cannot tag an {api_type_alias} that has not been published yet." - "Please publish the object first before being able to tag it." - f"\n{oml_object}", - ) - _tag_entity(entity_type=api_type_alias, entity_id=oml_object.id, tag=tag, untag=untag) - - -def _tag_entity(entity_type: str, entity_id: int, tag: str, *, untag: bool = False) -> list[str]: - """ - Function that tags or untags a given entity on OpenML. As the OpenML - API tag functions all consist of the same format, this function covers - all entity types (currently: dataset, task, flow, setup, run). Could - be used in a partial to provide dataset_tag, dataset_untag, etc. 
- - Parameters - ---------- - entity_type : str - Name of the entity to tag (e.g., run, flow, data) - - entity_id : int - OpenML id of the entity - - tag : str - The tag - - untag : bool - Set to true if needed to untag, rather than tag - - Returns - ------- - tags : list - List of tags that the entity is (still) tagged with - """ - legal_entities = {"data", "task", "flow", "setup", "run"} - if entity_type not in legal_entities: - raise ValueError(f"Can't tag a {entity_type}") - - if untag: - uri = f"{entity_type}/untag" - main_tag = f"oml:{entity_type}_untag" - else: - uri = f"{entity_type}/tag" - main_tag = f"oml:{entity_type}_tag" - - result_xml = openml._api_calls._perform_api_call( - uri, - "post", - {f"{entity_type}_id": entity_id, "tag": tag}, - ) - - result = xmltodict.parse(result_xml, force_list={"oml:tag"})[main_tag] - - if "oml:tag" in result: - return result["oml:tag"] # type: ignore - - # no tags, return empty list - return [] - - -# TODO(eddiebergman): Maybe this can be made more specific with a Literal -def _delete_entity(entity_type: str, entity_id: int) -> bool: - """ - Function that deletes a given entity on OpenML. As the OpenML - API tag functions all consist of the same format, this function covers - all entity types that can be deleted (currently: dataset, task, flow, - run, study and user). - - Parameters - ---------- - entity_type : str - Name of the entity to tag (e.g., run, flow, data) - - entity_id : int - OpenML id of the entity - - Returns - ------- - bool - True iff the deletion was successful. False otherwse - """ - legal_entities = { - "data", - "flow", - "task", - "run", - "study", - "user", - } - if entity_type not in legal_entities: - raise ValueError(f"Can't delete a {entity_type}") - - url_suffix = "%s/%d" % (entity_type, entity_id) - try: - result_xml = openml._api_calls._perform_api_call(url_suffix, "delete") - result = xmltodict.parse(result_xml) - return f"oml:{entity_type}_delete" in result - except openml.exceptions.OpenMLServerException as e: - # https://github.com/openml/OpenML/blob/21f6188d08ac24fcd2df06ab94cf421c946971b0/openml_OS/views/pages/api_new/v1/xml/pre.php - # Most exceptions are descriptive enough to be raised as their standard - # OpenMLServerException, however there are two cases where we add information: - # - a generic "failed" message, we direct them to the right issue board - # - when the user successfully authenticates with the server, - # but user is not allowed to take the requested action, - # in which case we specify a OpenMLNotAuthorizedError. - by_other_user = [323, 353, 393, 453, 594] - has_dependent_entities = [324, 326, 327, 328, 354, 454, 464, 595] - unknown_reason = [325, 355, 394, 455, 593] - if e.code in by_other_user: - raise openml.exceptions.OpenMLNotAuthorizedError( - message=( - f"The {entity_type} can not be deleted because it was not uploaded by you." 
- ), - ) from e - if e.code in has_dependent_entities: - raise openml.exceptions.OpenMLNotAuthorizedError( - message=( - f"The {entity_type} can not be deleted because " - f"it still has associated entities: {e.message}" - ), - ) from e - if e.code in unknown_reason: - raise openml.exceptions.OpenMLServerError( - message=( - f"The {entity_type} can not be deleted for unknown reason," - " please open an issue at: https://github.com/openml/openml/issues/new" - ), - ) from e - raise e - - -def _list_all( # noqa: C901 - listing_call: Callable[[int, int], _SizedT], - *, - limit: int | None = None, - offset: int | None = None, - batch_size: int | None = 10_000, -) -> list[_SizedT]: - """Helper to handle paged listing requests. - - Example usage: - - ``evaluations = list_all(list_evaluations, "predictive_accuracy", task=mytask)`` - - Parameters - ---------- - listing_call : callable - Call listing, e.g. list_evaluations. Takes two positional - arguments: batch_size and offset. - batch_size : int, optional - The batch size to use for the listing call. - offset : int, optional - The initial offset to use for the listing call. - limit : int, optional - The total size of the listing. If not provided, the function will - request the first batch and then continue until no more results are - returned - - Returns - ------- - List of types returned from type of the listing call - """ - page = 0 - results: list[_SizedT] = [] - - offset = offset if offset is not None else 0 - batch_size = batch_size if batch_size is not None else 10_000 - - LIMIT = limit - BATCH_SIZE_ORIG = batch_size - - # Default batch size per paging. - # This one can be set in filters (batch_size), but should not be - # changed afterwards. The derived batch_size can be changed. - if not isinstance(BATCH_SIZE_ORIG, int): - raise ValueError(f"'batch_size' should be an integer but got {BATCH_SIZE_ORIG}") - - if (LIMIT is not None) and (not isinstance(LIMIT, int)) and (not np.isinf(LIMIT)): - raise ValueError(f"'limit' should be an integer or inf but got {LIMIT}") - - # If our batch size is larger than the limit, we should only - # request one batch of size of LIMIT - if LIMIT is not None and BATCH_SIZE_ORIG > LIMIT: - BATCH_SIZE_ORIG = LIMIT - - if not isinstance(offset, int): - raise ValueError(f"'offset' should be an integer but got {offset}") - - batch_size = BATCH_SIZE_ORIG - while True: - try: - current_offset = offset + BATCH_SIZE_ORIG * page - new_batch = listing_call(batch_size, current_offset) - except openml.exceptions.OpenMLServerNoResult: - # NOTE: This above statement may not actually happen, but we could just return here - # to enforce it... - break - - results.append(new_batch) - - # If the batch is less than our requested batch_size, that's the last batch - # and we can bail out. 
- if len(new_batch) < batch_size: - break - - page += 1 - if LIMIT is not None: - # check if the number of required results has been achieved - # always do a 'bigger than' check, - # in case of bugs to prevent infinite loops - n_received = sum(len(result) for result in results) - if n_received >= LIMIT: - break - - # check if there are enough results to fulfill a batch - if LIMIT - n_received < BATCH_SIZE_ORIG: - batch_size = LIMIT - n_received - - return results - - -def _get_cache_dir_for_key(key: str) -> Path: - return Path(config.get_cache_directory()) / key - - -def _create_cache_directory(key: str) -> Path: - cache_dir = _get_cache_dir_for_key(key) - - try: - cache_dir.mkdir(exist_ok=True, parents=True) - except Exception as e: - raise openml.exceptions.OpenMLCacheException( - f"Cannot create cache directory {cache_dir}." - ) from e - - return cache_dir - - -def _get_cache_dir_for_id(key: str, id_: int, create: bool = False) -> Path: # noqa: FBT001, FBT002 - cache_dir = _create_cache_directory(key) if create else _get_cache_dir_for_key(key) - return Path(cache_dir) / str(id_) - - -def _create_cache_directory_for_id(key: str, id_: int) -> Path: - """Create the cache directory for a specific ID - - In order to have a clearer cache structure and because every task - is cached in several files (description, split), there - is a directory for each task witch the task ID being the directory - name. This function creates this cache directory. - - This function is NOT thread/multiprocessing safe. - - Parameters - ---------- - key : str - - id_ : int - - Returns - ------- - cache_dir : Path - Path of the created dataset cache directory. - """ - cache_dir = _get_cache_dir_for_id(key, id_, create=True) - if cache_dir.exists() and not cache_dir.is_dir(): - raise ValueError(f"{key} cache dir exists but is not a directory!") - - cache_dir.mkdir(exist_ok=True, parents=True) - return cache_dir - - -def _remove_cache_dir_for_id(key: str, cache_dir: Path) -> None: - """Remove the task cache directory - - This function is NOT thread/multiprocessing safe. - - Parameters - ---------- - key : str - - cache_dir : str - """ - try: - shutil.rmtree(cache_dir) - except OSError as e: - raise ValueError( - f"Cannot remove faulty {key} cache directory {cache_dir}. Please do this manually!", - ) from e - - -def thread_safe_if_oslo_installed(func: Callable[P, R]) -> Callable[P, R]: - try: - # Currently, importing oslo raises a lot of warning that it will stop working - # under python3.8; remove this once they disappear - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - from oslo_concurrency import lockutils - - @wraps(func) - def safe_func(*args: P.args, **kwargs: P.kwargs) -> R: - # Lock directories use the id that is passed as either positional or keyword argument. - id_parameters = [parameter_name for parameter_name in kwargs if "_id" in parameter_name] - if len(id_parameters) == 1: - id_ = kwargs[id_parameters[0]] - elif len(args) > 0: - id_ = args[0] - else: - raise RuntimeError( - f"An id must be specified for {func.__name__}, was passed: ({args}, {kwargs}).", - ) - # The [7:] gets rid of the 'openml.' 
prefix - lock_name = f"{func.__module__[7:]}.{func.__name__}:{id_}" - with lockutils.external_lock(name=lock_name, lock_path=_create_lockfiles_dir()): - return func(*args, **kwargs) - - return safe_func - except ImportError: - return func - - -def _create_lockfiles_dir() -> Path: - path = Path(config.get_cache_directory()) / "locks" - # TODO(eddiebergman): Not sure why this is allowed to error and ignore??? - with contextlib.suppress(OSError): - path.mkdir(exist_ok=True, parents=True) - return path - - -class ProgressBar(ProgressType): - """Progressbar for MinIO function's `progress` parameter.""" - - def __init__(self) -> None: - self._object_name = "" - self._progress_bar: tqdm | None = None - - def set_meta(self, object_name: str, total_length: int) -> None: - """Initializes the progress bar. - - Parameters - ---------- - object_name: str - Not used. - - total_length: int - File size of the object in bytes. - """ - self._object_name = object_name - self._progress_bar = tqdm(total=total_length, unit_scale=True, unit="B") - - def update(self, length: int) -> None: - """Updates the progress bar. - - Parameters - ---------- - length: int - Number of bytes downloaded since last `update` call. - """ - if not self._progress_bar: - raise RuntimeError("Call `set_meta` before calling `update`.") - self._progress_bar.update(length) - if self._progress_bar.total <= self._progress_bar.n: - self._progress_bar.close() From 778b87e5372ee27a50713cf6d2dfc0ab0308aab3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Fri, 9 Jan 2026 21:28:37 +0100 Subject: [PATCH 5/8] indexer sklearn --- openml/utils/_indexing/__init__.py | 1 + openml/utils/_indexing/_preindex_sklearn.py | 137 ++++++++++++++++++++ openml/utils/_inmemory/__init__.py | 1 + openml/utils/_inmemory/_dict.py | 54 ++++++++ 4 files changed, 193 insertions(+) create mode 100644 openml/utils/_indexing/__init__.py create mode 100644 openml/utils/_indexing/_preindex_sklearn.py create mode 100644 openml/utils/_inmemory/__init__.py create mode 100644 openml/utils/_inmemory/_dict.py diff --git a/openml/utils/_indexing/__init__.py b/openml/utils/_indexing/__init__.py new file mode 100644 index 000000000..80b82550d --- /dev/null +++ b/openml/utils/_indexing/__init__.py @@ -0,0 +1 @@ +"""Utilities module for indexing third party libraries.""" diff --git a/openml/utils/_indexing/_preindex_sklearn.py b/openml/utils/_indexing/_preindex_sklearn.py new file mode 100644 index 000000000..5291f3adb --- /dev/null +++ b/openml/utils/_indexing/_preindex_sklearn.py @@ -0,0 +1,137 @@ +"""Registry lookup methods - scikit-learn estimators.""" +# adapted from the sktime utility of the same name +# copyright: sktime developers, BSD-3-Clause License (see LICENSE file) + +__author__ = ["fkiraly"] +# all_estimators is also based on the sklearn utility of the same name + +from skbase.lookup import all_objects + + +def _all_sklearn_estimators_locdict(package_name="sklearn", serialized=False): + """Dictionary of all scikit-learn estimators in sktime and sklearn. + + Parameters + ---------- + package_name : str, optional (default="sklearn") + The package from which to retrieve the sklearn estimators. + This is an import name, e.g., ``"sklearn"``, not a PEP 440 package identifier, + e.g., ``"scikit-learn"``. + + serialized : bool, optional (default=False) + If True, returns a serialized version of the dict, via + ``openml.utils._inmemory._dict.serialize_dict``. + If False, returns the dict directly. 
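+        The serialized form is an executable code snippet; evaluating it
+        with ``eval`` recreates the dict.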
+ + Returns + ------- + loc_dict : dict + A dictionary with: + + * keys: str, estimator class name, e.g., ``RandomForestClassifier`` + * values: str, public import path of the estimator, e.g., + ``sklearn.ensemble.RandomForestClassifier`` + """ + all_ests = _all_sklearn_estimators( + package_name=package_name, + return_names=False, + ) + + loc_dict = {est.__name__: f"{est.__module__}.{est.__name__}" for est in all_ests} + + if serialized: + from openml.utils._inmemory._dict import serialize_dict + + loc_dict = serialize_dict(loc_dict, name="sklearn_estimators_loc_dict") + + return loc_dict + + +def _all_sklearn_estimators( + package_name="sklearn", + return_names=True, + as_dataframe=False, + suppress_import_stdout=True, +): + """List all scikit-learn objects in a given package. + + This function retrieves all sklearn objects inheriting from ``BaseEstimator``, + from the import location given by ``package_name``. + + Not included are: the base classes themselves, classes defined in test modules. + + Parameters + ---------- + package_name : str, optional (default="sklearn") + The package from which to retrieve the sklearn estimators. + This is an import name, e.g., ``"sklearn"``, not a PEP 440 package identifier, + e.g., ``"scikit-learn"``. + + return_names: bool, optional (default=True) + + if True, estimator class name is included in the ``all_estimators`` + return in the order: name, estimator class, optional tags, either as + a tuple or as pandas.DataFrame columns + + if False, estimator class name is removed from the ``all_estimators`` return. + + as_dataframe: bool, optional (default=False) + + True: ``all_estimators`` will return a ``pandas.DataFrame`` with named + columns for all of the attributes being returned. + + False: ``all_estimators`` will return a list (either a list of + estimators or a list of tuples, see Returns) + + suppress_import_stdout : bool, optional. Default=True + whether to suppress stdout printout upon import. + + Returns + ------- + all_estimators will return one of the following: + + 1. list of estimators, if ``return_names=False``, and ``return_tags`` is None + + 2. list of tuples (optional estimator name, class, ~ptional estimator + tags), if ``return_names=True`` or ``return_tags`` is not ``None``. + + 3. ``pandas.DataFrame`` if ``as_dataframe = True`` + + if list of estimators: + entries are estimators matching the query, + in alphabetical order of estimator name + if list of tuples: + list of (optional estimator name, estimator, optional estimator + tags) matching the query, in alphabetical order of estimator name, + where + ``name`` is the estimator name as string, and is an + optional return + ``estimator`` is the actual estimator + ``tags`` are the estimator's values for each tag in return_tags + and is an optional return. + if ``DataFrame``: + column names represent the attributes contained in each column. + "estimators" will be the name of the column of estimators, "names" + will be the name of the column of estimator class names and the string(s) + passed in return_tags will serve as column names for all columns of + tags that were optionally requested. 
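+
+    Example
+    -------
+    A minimal sketch, assuming ``scikit-learn`` is installed:
+
+    >>> ests = _all_sklearn_estimators()  # doctest: +SKIP
+    >>> any(name == "RandomForestClassifier" for name, _ in ests)  # doctest: +SKIP
+    True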
+ """ # noqa: E501 + from sklearn.base import BaseEstimator + + MODULES_TO_IGNORE_SKLEARN = [ + "array_api_compat", + "tests", + "experimental", + "conftest", + ] + + result = all_objects( + object_types=BaseEstimator, + package_name=package_name, + modules_to_ignore=MODULES_TO_IGNORE_SKLEARN, + as_dataframe=as_dataframe, + return_names=return_names, + suppress_import_stdout=suppress_import_stdout, + ) + + return result diff --git a/openml/utils/_inmemory/__init__.py b/openml/utils/_inmemory/__init__.py new file mode 100644 index 000000000..07bdfba5a --- /dev/null +++ b/openml/utils/_inmemory/__init__.py @@ -0,0 +1 @@ +"""Utilities module for serializing and deserializing in-memory objects.""" diff --git a/openml/utils/_inmemory/_dict.py b/openml/utils/_inmemory/_dict.py new file mode 100644 index 000000000..09d59285e --- /dev/null +++ b/openml/utils/_inmemory/_dict.py @@ -0,0 +1,54 @@ +"""Utilities module for serializing and deserializing dicts.""" + + +def serialize_dict(d, mode="eval", name="d"): + """Serialize a dict as an executable Python code snippet. + + To deserialize, simply execute the code snippet in a Python environment. + + Command for deserialization: + + * if ``mode == "eval"``, use ``deserialized = exec(code_snippet)`` + * if ``mode == "exec"``, use ``exec(code_snippet)`` and then access the dict + + Parameters + ---------- + d : dict + The dictionary to serialize. + + mode : str, "eval" or "exec", default="eval" + The mode of serialization. + + * If ``"eval"``, the returned code snippet is an expression that evaluates to the dict. + * If ``"exec"``, the returned code snippet is a series of statements that assign the dict + to a variable named ``name``. + + name : str, default="d" + The variable name to assign the dict to. + Only used if mode is ``"exec"``. + + Returns + ------- + code_snippet : str + A string containing the Python code snippet that recreates the dict ``d``, + assigned to the specified variable name ``name``. + + Example + ------- + >>> my_dict = {'a': 'apple', 'b': 'banana'} + >>> serialized_dict = serialize_dict(my_dict, name="my_dict") + >>> deserialized_dict = eval(serialized_dict) + >>> assert deserialized_dict == my_dict + """ + def dq(s): + # Escape backslashes and double quotes for valid Python strings + return s.replace("\\", "\\\\").replace('"', '\\"') + + if mode == "eval": + lines = ["{"] + else: # mode == "exec" + lines = [f"{name} = {{"] + for k, v in d.items(): + lines.append(f' "{dq(k)}": "{dq(v)}",') + lines.append("}") + return "\n".join(lines) From abc0f4929285422427bdd55dd93cc81322ae2bc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Fri, 9 Jan 2026 21:29:44 +0100 Subject: [PATCH 6/8] Update _openml.py --- openml/utils/_openml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openml/utils/_openml.py b/openml/utils/_openml.py index 7e72e7aee..d86a62ce7 100644 --- a/openml/utils/_openml.py +++ b/openml/utils/_openml.py @@ -18,7 +18,7 @@ import openml._api_calls import openml.exceptions -from . 
import config +from openml import config # Avoid import cycles: https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles if TYPE_CHECKING: From 5a6161ff7ccb86f893d13bcd7b196b65e5b13313 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Fri, 9 Jan 2026 22:26:42 +0100 Subject: [PATCH 7/8] multi-import package --- openml/__init__.py | 7 + openml/base/_base_pkg.py | 1 + openml/models/_get.py | 14 +- openml/models/base/_base.py | 27 ++- openml/models/classification/auto_sklearn.py | 1 + openml/models/classification/scikit_learn.py | 227 +++++++++++++++++++ openml/models/classification/xgboost.py | 1 + 7 files changed, 274 insertions(+), 4 deletions(-) create mode 100644 openml/models/classification/scikit_learn.py diff --git a/openml/__init__.py b/openml/__init__.py index 7eb077057..a9a732fc9 100644 --- a/openml/__init__.py +++ b/openml/__init__.py @@ -124,3 +124,10 @@ def populate_cache( "__version__", "get", ] + + +def __getattr__(name: str): + if name in __all__: + return globals()[name] + if name not in __all__: + return get(name) diff --git a/openml/base/_base_pkg.py b/openml/base/_base_pkg.py index 690b93a86..729619fa9 100644 --- a/openml/base/_base_pkg.py +++ b/openml/base/_base_pkg.py @@ -20,6 +20,7 @@ class _BasePkg(BaseObject): "pkg_obj": "reference", # or "code" "pkg_obj_type": None, # openml API type "pkg_compression": "zlib", # compression + "pkg_pypi_name": None, # PyPI package name of objects } def __init__(self): diff --git a/openml/models/_get.py b/openml/models/_get.py index 75b807ca7..7762b8013 100644 --- a/openml/models/_get.py +++ b/openml/models/_get.py @@ -27,7 +27,7 @@ def get(id: str): obj = id_lookup.get(id) if obj is None: raise ValueError(f"Error in openml.get, object with package id {id} " "does not exist.") - return obj().materialize() + return obj(id).materialize() # todo: need to generalize this later to more types @@ -41,8 +41,16 @@ def _id_lookup(obj_type=None): def _id_lookup_cached(obj_type=None): all_objs = _all_objects(obj_type=obj_type) - # todo: generalize that pkg can contain more than one object - return {obj.get_class_tag("pkg_id"): obj for obj in all_objs} + lookup_dict = {} + for obj in all_objs: + obj_index = obj.get_class_tag("pkg_id") + if obj_index != "__multiple": + lookup_dict[obj_index] = obj + else: + obj_all_ids = obj.contained_ids() + lookup_dict.update({obj_id: obj for obj_id in obj_all_ids}) + + return lookup_dict @lru_cache diff --git a/openml/models/base/_base.py b/openml/models/base/_base.py index 6b3fa2a92..13166cfe2 100644 --- a/openml/models/base/_base.py +++ b/openml/models/base/_base.py @@ -7,6 +7,28 @@ class _OpenmlModelPkg(_BasePkg): _obj = None + _obj_dict = {} + + def __init__(self, id=None): + super().__init__() + + pkg_id = self.get_tag("pkg_id") + if pkg_id == "__multiple": + self._obj = self._obj_dict.get(id, None) + + @classmethod + def contained_ids(cls): + """Return list of ids of objects contained in this package. 
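+
+        For a package holding a single object, this is the one-element list
+        containing the ``pkg_id`` tag; for a multi-object package
+        (``pkg_id == "__multiple"``), the list of keys of ``_obj_dict``.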
+
+        Returns
+        -------
+        ids : list of str
+            list of unique identifiers of objects contained in this package
+        """
+        pkg_id = cls.get_class_tag("pkg_id")
+        if pkg_id != "__multiple":
+            return [cls.get_class_tag("pkg_id")]
+        return list(cls._obj_dict.keys())
 
     def _materialize(self):
         pkg_obj = self.get_tag("pkg_obj")
@@ -23,7 +45,10 @@ def _materialize(self):
         if pkg_obj == "reference":
             from skbase.utils.dependencies import _safe_import
 
-            return _safe_import(self._obj)
+            obj_loc = self._obj
+            pkg_name = self.get_tag("pkg_pypi_name")
+
+            return _safe_import(obj_loc, pkg_name=pkg_name)
 
         if pkg_obj == "code":
             exec(self._obj)
diff --git a/openml/models/classification/auto_sklearn.py b/openml/models/classification/auto_sklearn.py
index 1d29044da..c4d926e72 100644
--- a/openml/models/classification/auto_sklearn.py
+++ b/openml/models/classification/auto_sklearn.py
@@ -9,6 +9,7 @@ class OpenmlPkg__AutoSklearnClassifier(_ModelPkgClassifier):
     _tags = {
         "pkg_id": "AutoSklearnClassifier",
         "python_dependencies": "auto-sklearn",
+        "pkg_pypi_name": "auto-sklearn",
     }
 
     _obj = "autosklearn.classification.AutoSklearnClassifier"
diff --git a/openml/models/classification/scikit_learn.py b/openml/models/classification/scikit_learn.py
new file mode 100644
index 000000000..dd05d3d46
--- /dev/null
+++ b/openml/models/classification/scikit_learn.py
@@ -0,0 +1,227 @@
+"""scikit-learn estimators."""
+
+from __future__ import annotations
+
+from openml.models.apis import _ModelPkgClassifier
+
+
+class OpenmlPkg__Sklearn(_ModelPkgClassifier):
+    _tags = {
+        "pkg_id": "__multiple",
+        "python_dependencies": "scikit-learn",
+        "pkg_pypi_name": "scikit-learn",
+    }
+
+    # obtained via utils._indexing._preindex_sklearn
+    # todo: automate generation
+    # todo: include version bounds for availability
+    # todo: test generated index against actual index
+    _obj_dict = {
+        "ARDRegression": "sklearn.linear_model._bayes.ARDRegression",
+        "AdaBoostClassifier": "sklearn.ensemble._weight_boosting.AdaBoostClassifier",
+        "AdaBoostRegressor": "sklearn.ensemble._weight_boosting.AdaBoostRegressor",
+        "AdditiveChi2Sampler": "sklearn.kernel_approximation.AdditiveChi2Sampler",
+        "AffinityPropagation": "sklearn.cluster._affinity_propagation.AffinityPropagation",
+        "AgglomerativeClustering": "sklearn.cluster._agglomerative.AgglomerativeClustering",
+        "BaggingClassifier": "sklearn.ensemble._bagging.BaggingClassifier",
+        "BaggingRegressor": "sklearn.ensemble._bagging.BaggingRegressor",
+        "BayesianGaussianMixture": "sklearn.mixture._bayesian_mixture.BayesianGaussianMixture",
+        "BayesianRidge": "sklearn.linear_model._bayes.BayesianRidge",
+        "BernoulliNB": "sklearn.naive_bayes.BernoulliNB",
+        "BernoulliRBM": "sklearn.neural_network._rbm.BernoulliRBM",
+        "Binarizer": "sklearn.preprocessing._data.Binarizer",
+        "Birch": "sklearn.cluster._birch.Birch",
+        "BisectingKMeans": "sklearn.cluster._bisect_k_means.BisectingKMeans",
+        "CCA": "sklearn.cross_decomposition._pls.CCA",
+        "CalibratedClassifierCV": "sklearn.calibration.CalibratedClassifierCV",
+        "CategoricalNB": "sklearn.naive_bayes.CategoricalNB",
+        "ClassifierChain": "sklearn.multioutput.ClassifierChain",
+        "ColumnTransformer": "sklearn.compose._column_transformer.ColumnTransformer",
+        "ComplementNB": "sklearn.naive_bayes.ComplementNB",
+        "CountVectorizer": "sklearn.feature_extraction.text.CountVectorizer",
+        "DBSCAN": "sklearn.cluster._dbscan.DBSCAN",
+        "DecisionTreeClassifier": "sklearn.tree._classes.DecisionTreeClassifier",
+        "DecisionTreeRegressor": "sklearn.tree._classes.DecisionTreeRegressor",
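+        # note: at materialization time, a value string is resolved lazily via
+        # _safe_import(value, pkg_name="scikit-learn"); the values are private
+        # sklearn module paths and may move between scikit-learn versions
+        # (see the version-bounds todo above)
+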
"DictVectorizer": "sklearn.feature_extraction._dict_vectorizer.DictVectorizer", + "DictionaryLearning": "sklearn.decomposition._dict_learning.DictionaryLearning", + "DummyClassifier": "sklearn.dummy.DummyClassifier", + "DummyRegressor": "sklearn.dummy.DummyRegressor", + "ElasticNet": "sklearn.linear_model._coordinate_descent.ElasticNet", + "ElasticNetCV": "sklearn.linear_model._coordinate_descent.ElasticNetCV", + "EllipticEnvelope": "sklearn.covariance._elliptic_envelope.EllipticEnvelope", + "EmpiricalCovariance": "sklearn.covariance._empirical_covariance.EmpiricalCovariance", + "ExtraTreeClassifier": "sklearn.tree._classes.ExtraTreeClassifier", + "ExtraTreeRegressor": "sklearn.tree._classes.ExtraTreeRegressor", + "ExtraTreesClassifier": "sklearn.ensemble._forest.ExtraTreesClassifier", + "ExtraTreesRegressor": "sklearn.ensemble._forest.ExtraTreesRegressor", + "FactorAnalysis": "sklearn.decomposition._factor_analysis.FactorAnalysis", + "FastICA": "sklearn.decomposition._fastica.FastICA", + "FeatureAgglomeration": "sklearn.cluster._agglomerative.FeatureAgglomeration", + "FeatureHasher": "sklearn.feature_extraction._hash.FeatureHasher", + "FeatureUnion": "sklearn.pipeline.FeatureUnion", + "FixedThresholdClassifier": "sklearn.model_selection._classification_threshold.FixedThresholdClassifier", + "FrozenEstimator": "sklearn.frozen._frozen.FrozenEstimator", + "FunctionTransformer": "sklearn.preprocessing._function_transformer.FunctionTransformer", + "GammaRegressor": "sklearn.linear_model._glm.glm.GammaRegressor", + "GaussianMixture": "sklearn.mixture._gaussian_mixture.GaussianMixture", + "GaussianNB": "sklearn.naive_bayes.GaussianNB", + "GaussianProcessClassifier": "sklearn.gaussian_process._gpc.GaussianProcessClassifier", + "GaussianProcessRegressor": "sklearn.gaussian_process._gpr.GaussianProcessRegressor", + "GaussianRandomProjection": "sklearn.random_projection.GaussianRandomProjection", + "GenericUnivariateSelect": "sklearn.feature_selection._univariate_selection.GenericUnivariateSelect", + "GradientBoostingClassifier": "sklearn.ensemble._gb.GradientBoostingClassifier", + "GradientBoostingRegressor": "sklearn.ensemble._gb.GradientBoostingRegressor", + "GraphicalLasso": "sklearn.covariance._graph_lasso.GraphicalLasso", + "GraphicalLassoCV": "sklearn.covariance._graph_lasso.GraphicalLassoCV", + "GridSearchCV": "sklearn.model_selection._search.GridSearchCV", + "HDBSCAN": "sklearn.cluster._hdbscan.hdbscan.HDBSCAN", + "HashingVectorizer": "sklearn.feature_extraction.text.HashingVectorizer", + "HistGradientBoostingClassifier": "sklearn.ensemble._hist_gradient_boosting.gradient_boosting.HistGradientBoostingClassifier", + "HistGradientBoostingRegressor": "sklearn.ensemble._hist_gradient_boosting.gradient_boosting.HistGradientBoostingRegressor", + "HuberRegressor": "sklearn.linear_model._huber.HuberRegressor", + "IncrementalPCA": "sklearn.decomposition._incremental_pca.IncrementalPCA", + "IsolationForest": "sklearn.ensemble._iforest.IsolationForest", + "Isomap": "sklearn.manifold._isomap.Isomap", + "IsotonicRegression": "sklearn.isotonic.IsotonicRegression", + "KBinsDiscretizer": "sklearn.preprocessing._discretization.KBinsDiscretizer", + "KMeans": "sklearn.cluster._kmeans.KMeans", + "KNNImputer": "sklearn.impute._knn.KNNImputer", + "KNeighborsClassifier": "sklearn.neighbors._classification.KNeighborsClassifier", + "KNeighborsRegressor": "sklearn.neighbors._regression.KNeighborsRegressor", + "KNeighborsTransformer": "sklearn.neighbors._graph.KNeighborsTransformer", + "KernelCenterer": 
"sklearn.preprocessing._data.KernelCenterer", + "KernelDensity": "sklearn.neighbors._kde.KernelDensity", + "KernelPCA": "sklearn.decomposition._kernel_pca.KernelPCA", + "KernelRidge": "sklearn.kernel_ridge.KernelRidge", + "LabelBinarizer": "sklearn.preprocessing._label.LabelBinarizer", + "LabelEncoder": "sklearn.preprocessing._label.LabelEncoder", + "LabelPropagation": "sklearn.semi_supervised._label_propagation.LabelPropagation", + "LabelSpreading": "sklearn.semi_supervised._label_propagation.LabelSpreading", + "Lars": "sklearn.linear_model._least_angle.Lars", + "LarsCV": "sklearn.linear_model._least_angle.LarsCV", + "Lasso": "sklearn.linear_model._coordinate_descent.Lasso", + "LassoCV": "sklearn.linear_model._coordinate_descent.LassoCV", + "LassoLars": "sklearn.linear_model._least_angle.LassoLars", + "LassoLarsCV": "sklearn.linear_model._least_angle.LassoLarsCV", + "LassoLarsIC": "sklearn.linear_model._least_angle.LassoLarsIC", + "LatentDirichletAllocation": "sklearn.decomposition._lda.LatentDirichletAllocation", + "LedoitWolf": "sklearn.covariance._shrunk_covariance.LedoitWolf", + "LinearDiscriminantAnalysis": "sklearn.discriminant_analysis.LinearDiscriminantAnalysis", + "LinearRegression": "sklearn.linear_model._base.LinearRegression", + "LinearSVC": "sklearn.svm._classes.LinearSVC", + "LinearSVR": "sklearn.svm._classes.LinearSVR", + "LocalOutlierFactor": "sklearn.neighbors._lof.LocalOutlierFactor", + "LocallyLinearEmbedding": "sklearn.manifold._locally_linear.LocallyLinearEmbedding", + "LogisticRegression": "sklearn.linear_model._logistic.LogisticRegression", + "LogisticRegressionCV": "sklearn.linear_model._logistic.LogisticRegressionCV", + "MDS": "sklearn.manifold._mds.MDS", + "MLPClassifier": "sklearn.neural_network._multilayer_perceptron.MLPClassifier", + "MLPRegressor": "sklearn.neural_network._multilayer_perceptron.MLPRegressor", + "MaxAbsScaler": "sklearn.preprocessing._data.MaxAbsScaler", + "MeanShift": "sklearn.cluster._mean_shift.MeanShift", + "MinCovDet": "sklearn.covariance._robust_covariance.MinCovDet", + "MinMaxScaler": "sklearn.preprocessing._data.MinMaxScaler", + "MiniBatchDictionaryLearning": "sklearn.decomposition._dict_learning.MiniBatchDictionaryLearning", + "MiniBatchKMeans": "sklearn.cluster._kmeans.MiniBatchKMeans", + "MiniBatchNMF": "sklearn.decomposition._nmf.MiniBatchNMF", + "MiniBatchSparsePCA": "sklearn.decomposition._sparse_pca.MiniBatchSparsePCA", + "MissingIndicator": "sklearn.impute._base.MissingIndicator", + "MultiLabelBinarizer": "sklearn.preprocessing._label.MultiLabelBinarizer", + "MultiOutputClassifier": "sklearn.multioutput.MultiOutputClassifier", + "MultiOutputRegressor": "sklearn.multioutput.MultiOutputRegressor", + "MultiTaskElasticNet": "sklearn.linear_model._coordinate_descent.MultiTaskElasticNet", + "MultiTaskElasticNetCV": "sklearn.linear_model._coordinate_descent.MultiTaskElasticNetCV", + "MultiTaskLasso": "sklearn.linear_model._coordinate_descent.MultiTaskLasso", + "MultiTaskLassoCV": "sklearn.linear_model._coordinate_descent.MultiTaskLassoCV", + "MultinomialNB": "sklearn.naive_bayes.MultinomialNB", + "NMF": "sklearn.decomposition._nmf.NMF", + "NearestCentroid": "sklearn.neighbors._nearest_centroid.NearestCentroid", + "NearestNeighbors": "sklearn.neighbors._unsupervised.NearestNeighbors", + "NeighborhoodComponentsAnalysis": "sklearn.neighbors._nca.NeighborhoodComponentsAnalysis", + "Normalizer": "sklearn.preprocessing._data.Normalizer", + "NuSVC": "sklearn.svm._classes.NuSVC", + "NuSVR": "sklearn.svm._classes.NuSVR", + "Nystroem": 
"sklearn.kernel_approximation.Nystroem", + "OAS": "sklearn.covariance._shrunk_covariance.OAS", + "OPTICS": "sklearn.cluster._optics.OPTICS", + "OneClassSVM": "sklearn.svm._classes.OneClassSVM", + "OneHotEncoder": "sklearn.preprocessing._encoders.OneHotEncoder", + "OneVsOneClassifier": "sklearn.multiclass.OneVsOneClassifier", + "OneVsRestClassifier": "sklearn.multiclass.OneVsRestClassifier", + "OrdinalEncoder": "sklearn.preprocessing._encoders.OrdinalEncoder", + "OrthogonalMatchingPursuit": "sklearn.linear_model._omp.OrthogonalMatchingPursuit", + "OrthogonalMatchingPursuitCV": "sklearn.linear_model._omp.OrthogonalMatchingPursuitCV", + "OutputCodeClassifier": "sklearn.multiclass.OutputCodeClassifier", + "PCA": "sklearn.decomposition._pca.PCA", + "PLSCanonical": "sklearn.cross_decomposition._pls.PLSCanonical", + "PLSRegression": "sklearn.cross_decomposition._pls.PLSRegression", + "PLSSVD": "sklearn.cross_decomposition._pls.PLSSVD", + "PassiveAggressiveClassifier": "sklearn.linear_model._passive_aggressive.PassiveAggressiveClassifier", + "PassiveAggressiveRegressor": "sklearn.linear_model._passive_aggressive.PassiveAggressiveRegressor", + "PatchExtractor": "sklearn.feature_extraction.image.PatchExtractor", + "Perceptron": "sklearn.linear_model._perceptron.Perceptron", + "Pipeline": "sklearn.pipeline.Pipeline", + "PoissonRegressor": "sklearn.linear_model._glm.glm.PoissonRegressor", + "PolynomialCountSketch": "sklearn.kernel_approximation.PolynomialCountSketch", + "PolynomialFeatures": "sklearn.preprocessing._polynomial.PolynomialFeatures", + "PowerTransformer": "sklearn.preprocessing._data.PowerTransformer", + "QuadraticDiscriminantAnalysis": "sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis", + "QuantileRegressor": "sklearn.linear_model._quantile.QuantileRegressor", + "QuantileTransformer": "sklearn.preprocessing._data.QuantileTransformer", + "RANSACRegressor": "sklearn.linear_model._ransac.RANSACRegressor", + "RBFSampler": "sklearn.kernel_approximation.RBFSampler", + "RFE": "sklearn.feature_selection._rfe.RFE", + "RFECV": "sklearn.feature_selection._rfe.RFECV", + "RadiusNeighborsClassifier": "sklearn.neighbors._classification.RadiusNeighborsClassifier", + "RadiusNeighborsRegressor": "sklearn.neighbors._regression.RadiusNeighborsRegressor", + "RadiusNeighborsTransformer": "sklearn.neighbors._graph.RadiusNeighborsTransformer", + "RandomForestClassifier": "sklearn.ensemble._forest.RandomForestClassifier", + "RandomForestRegressor": "sklearn.ensemble._forest.RandomForestRegressor", + "RandomTreesEmbedding": "sklearn.ensemble._forest.RandomTreesEmbedding", + "RandomizedSearchCV": "sklearn.model_selection._search.RandomizedSearchCV", + "RegressorChain": "sklearn.multioutput.RegressorChain", + "Ridge": "sklearn.linear_model._ridge.Ridge", + "RidgeCV": "sklearn.linear_model._ridge.RidgeCV", + "RidgeClassifier": "sklearn.linear_model._ridge.RidgeClassifier", + "RidgeClassifierCV": "sklearn.linear_model._ridge.RidgeClassifierCV", + "RobustScaler": "sklearn.preprocessing._data.RobustScaler", + "SGDClassifier": "sklearn.linear_model._stochastic_gradient.SGDClassifier", + "SGDOneClassSVM": "sklearn.linear_model._stochastic_gradient.SGDOneClassSVM", + "SGDRegressor": "sklearn.linear_model._stochastic_gradient.SGDRegressor", + "SVC": "sklearn.svm._classes.SVC", + "SVR": "sklearn.svm._classes.SVR", + "SelectFdr": "sklearn.feature_selection._univariate_selection.SelectFdr", + "SelectFpr": "sklearn.feature_selection._univariate_selection.SelectFpr", + "SelectFromModel": 
"sklearn.feature_selection._from_model.SelectFromModel", + "SelectFwe": "sklearn.feature_selection._univariate_selection.SelectFwe", + "SelectKBest": "sklearn.feature_selection._univariate_selection.SelectKBest", + "SelectPercentile": "sklearn.feature_selection._univariate_selection.SelectPercentile", + "SelfTrainingClassifier": "sklearn.semi_supervised._self_training.SelfTrainingClassifier", + "SequentialFeatureSelector": "sklearn.feature_selection._sequential.SequentialFeatureSelector", + "ShrunkCovariance": "sklearn.covariance._shrunk_covariance.ShrunkCovariance", + "SimpleImputer": "sklearn.impute._base.SimpleImputer", + "SkewedChi2Sampler": "sklearn.kernel_approximation.SkewedChi2Sampler", + "SparseCoder": "sklearn.decomposition._dict_learning.SparseCoder", + "SparsePCA": "sklearn.decomposition._sparse_pca.SparsePCA", + "SparseRandomProjection": "sklearn.random_projection.SparseRandomProjection", + "SpectralBiclustering": "sklearn.cluster._bicluster.SpectralBiclustering", + "SpectralClustering": "sklearn.cluster._spectral.SpectralClustering", + "SpectralCoclustering": "sklearn.cluster._bicluster.SpectralCoclustering", + "SpectralEmbedding": "sklearn.manifold._spectral_embedding.SpectralEmbedding", + "SplineTransformer": "sklearn.preprocessing._polynomial.SplineTransformer", + "StackingClassifier": "sklearn.ensemble._stacking.StackingClassifier", + "StackingRegressor": "sklearn.ensemble._stacking.StackingRegressor", + "StandardScaler": "sklearn.preprocessing._data.StandardScaler", + "TSNE": "sklearn.manifold._t_sne.TSNE", + "TargetEncoder": "sklearn.preprocessing._target_encoder.TargetEncoder", + "TfidfTransformer": "sklearn.feature_extraction.text.TfidfTransformer", + "TfidfVectorizer": "sklearn.feature_extraction.text.TfidfVectorizer", + "TheilSenRegressor": "sklearn.linear_model._theil_sen.TheilSenRegressor", + "TransformedTargetRegressor": "sklearn.compose._target.TransformedTargetRegressor", + "TruncatedSVD": "sklearn.decomposition._truncated_svd.TruncatedSVD", + "TunedThresholdClassifierCV": "sklearn.model_selection._classification_threshold.TunedThresholdClassifierCV", + "TweedieRegressor": "sklearn.linear_model._glm.glm.TweedieRegressor", + "VarianceThreshold": "sklearn.feature_selection._variance_threshold.VarianceThreshold", + "VotingClassifier": "sklearn.ensemble._voting.VotingClassifier", + "VotingRegressor": "sklearn.ensemble._voting.VotingRegressor", + } diff --git a/openml/models/classification/xgboost.py b/openml/models/classification/xgboost.py index 5b91e647c..b320fcabf 100644 --- a/openml/models/classification/xgboost.py +++ b/openml/models/classification/xgboost.py @@ -9,6 +9,7 @@ class OpenmlPkg__XGBClassifier(_ModelPkgClassifier): _tags = { "pkg_id": "XGBClassifier", "python_dependencies": "xgboost", + "pkg_pypi_name": "xgboost", } _obj = "xgboost.XGBClassifier" From f6f050acc786ba196468ff7b51cf3a80ce98ffa3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 9 Jan 2026 21:26:56 +0000 Subject: [PATCH 8/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- openml/__init__.py | 1 + openml/utils/_indexing/_preindex_sklearn.py | 8 ++++---- openml/utils/_inmemory/_dict.py | 5 ++++- openml/utils/_openml.py | 1 - 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/openml/__init__.py b/openml/__init__.py index a9a732fc9..c9a90e45a 100644 --- a/openml/__init__.py +++ b/openml/__init__.py @@ -131,3 +131,4 @@ def __getattr__(name: str): return 
globals()[name] if name not in __all__: return get(name) + return None diff --git a/openml/utils/_indexing/_preindex_sklearn.py b/openml/utils/_indexing/_preindex_sklearn.py index 5291f3adb..bf4f8130b 100644 --- a/openml/utils/_indexing/_preindex_sklearn.py +++ b/openml/utils/_indexing/_preindex_sklearn.py @@ -1,6 +1,8 @@ """Registry lookup methods - scikit-learn estimators.""" + # adapted from the sktime utility of the same name # copyright: sktime developers, BSD-3-Clause License (see LICENSE file) +from __future__ import annotations __author__ = ["fkiraly"] # all_estimators is also based on the sklearn utility of the same name @@ -115,7 +117,7 @@ def _all_sklearn_estimators( will be the name of the column of estimator class names and the string(s) passed in return_tags will serve as column names for all columns of tags that were optionally requested. - """ # noqa: E501 + """ from sklearn.base import BaseEstimator MODULES_TO_IGNORE_SKLEARN = [ @@ -125,7 +127,7 @@ def _all_sklearn_estimators( "conftest", ] - result = all_objects( + return all_objects( object_types=BaseEstimator, package_name=package_name, modules_to_ignore=MODULES_TO_IGNORE_SKLEARN, @@ -133,5 +135,3 @@ def _all_sklearn_estimators( return_names=return_names, suppress_import_stdout=suppress_import_stdout, ) - - return result diff --git a/openml/utils/_inmemory/_dict.py b/openml/utils/_inmemory/_dict.py index 09d59285e..c27e78dd7 100644 --- a/openml/utils/_inmemory/_dict.py +++ b/openml/utils/_inmemory/_dict.py @@ -1,5 +1,7 @@ """Utilities module for serializing and deserializing dicts.""" +from __future__ import annotations + def serialize_dict(d, mode="eval", name="d"): """Serialize a dict as an executable Python code snippet. @@ -40,13 +42,14 @@ def serialize_dict(d, mode="eval", name="d"): >>> deserialized_dict = eval(serialized_dict) >>> assert deserialized_dict == my_dict """ + def dq(s): # Escape backslashes and double quotes for valid Python strings return s.replace("\\", "\\\\").replace('"', '\\"') if mode == "eval": lines = ["{"] - else: # mode == "exec" + else: # mode == "exec" lines = [f"{name} = {{"] for k, v in d.items(): lines.append(f' "{dq(k)}": "{dq(v)}",') diff --git a/openml/utils/_openml.py b/openml/utils/_openml.py index d86a62ce7..f20aedcca 100644 --- a/openml/utils/_openml.py +++ b/openml/utils/_openml.py @@ -17,7 +17,6 @@ import openml import openml._api_calls import openml.exceptions - from openml import config # Avoid import cycles: https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles
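
---

Usage sketch for the series above (illustrative; assumes all eight patches
are applied and scikit-learn is installed, and is not itself part of the
diffs):

    import openml

    # id lookup assembled from the pkg_id tags and contained_ids()
    RandomForestClassifier = openml.get("RandomForestClassifier")
    clf = RandomForestClassifier(n_estimators=10)

    # equivalent, via the module-level __getattr__ fallback from PATCH 7/8
    clf = openml.RandomForestClassifier(n_estimators=10)

If the dependency is missing, get raises ModuleNotFoundError from the check
in _BasePkg.materialize; unknown ids raise ValueError from models._get.get.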