From 20e37a2d30f182695a10fea50a887cd67e24cd0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Wed, 31 Dec 2025 02:06:06 +0100 Subject: [PATCH 1/8] experimental model framework --- openml/__init__.py | 2 + openml/_get.py | 9 ++ openml/base/__init__.py | 6 + openml/{base.py => base/_base.py} | 3 +- openml/base/_base_pkg.py | 120 +++++++++++++++++++ openml/models/__init__.py | 5 + openml/models/_get.py | 63 ++++++++++ openml/models/apis/__init__.py | 5 + openml/models/apis/_classifier.py | 24 ++++ openml/models/base/__init__.py | 5 + openml/models/base/_base.py | 41 +++++++ openml/models/classification/__init__.py | 1 + openml/models/classification/auto_sklearn.py | 14 +++ openml/models/classification/xgboost.py | 14 +++ pyproject.toml | 1 + 15 files changed, 311 insertions(+), 2 deletions(-) create mode 100644 openml/_get.py create mode 100644 openml/base/__init__.py rename openml/{base.py => base/_base.py} (98%) create mode 100644 openml/base/_base_pkg.py create mode 100644 openml/models/__init__.py create mode 100644 openml/models/_get.py create mode 100644 openml/models/apis/__init__.py create mode 100644 openml/models/apis/_classifier.py create mode 100644 openml/models/base/__init__.py create mode 100644 openml/models/base/_base.py create mode 100644 openml/models/classification/__init__.py create mode 100644 openml/models/classification/auto_sklearn.py create mode 100644 openml/models/classification/xgboost.py diff --git a/openml/__init__.py b/openml/__init__.py index c49505eb9..f93cbb5d3 100644 --- a/openml/__init__.py +++ b/openml/__init__.py @@ -48,6 +48,7 @@ OpenMLSupervisedTask, OpenMLTask, ) +from openml._get import get def populate_cache( @@ -120,4 +121,5 @@ def populate_cache( "utils", "_api_calls", "__version__", + "get", ] diff --git a/openml/_get.py b/openml/_get.py new file mode 100644 index 000000000..b576668db --- /dev/null +++ b/openml/_get.py @@ -0,0 +1,9 @@ +"""Global get dispatch utility.""" + +# currently just a forward to models +# to discuss and possibly +# todo: add global get utility here +# in general, e.g., datasets will not have same name as models etc +from openml.models import get + +__all__ = ["get"] diff --git a/openml/base/__init__.py b/openml/base/__init__.py new file mode 100644 index 000000000..76a88c42b --- /dev/null +++ b/openml/base/__init__.py @@ -0,0 +1,6 @@ +"""Module of base classes.""" + +from openml.base._base import OpenMLBase +from openml.base._base_pkg import _BasePkg + +__all__ = ["_BasePkg", "OpenMLBase"] diff --git a/openml/base.py b/openml/base/_base.py similarity index 98% rename from openml/base.py rename to openml/base/_base.py index fbfb9dfc8..de2b387bf 100644 --- a/openml/base.py +++ b/openml/base/_base.py @@ -10,8 +10,7 @@ import openml._api_calls import openml.config - -from .utils import _get_rest_api_type_alias, _tag_openml_base +from openml.utils import _get_rest_api_type_alias, _tag_openml_base class OpenMLBase(ABC): diff --git a/openml/base/_base_pkg.py b/openml/base/_base_pkg.py new file mode 100644 index 000000000..9f5d6005e --- /dev/null +++ b/openml/base/_base_pkg.py @@ -0,0 +1,120 @@ +"""Base Packager class.""" + +import inspect +from pathlib import Path +import sys +import textwrap + +from skbase.base import BaseObject +from skbase.utils.dependencies import _check_estimator_deps + + +class _BasePkg(BaseObject): + + _tags = { + "python_dependencies": None, + "python_version": None, + # package register and manifest + "pkg_id": None, # object id contained, "__multiple" if multiple + "pkg_obj": 
"reference", # or "code" + "pkg_obj_type": None, # openml API type + "pkg_compression": "zlib", # compression + } + + def __init__(self): + super().__init__() + + def materialize(self): + try: + _check_estimator_deps(obj=self) + except ModuleNotFoundError as e: + # prettier message, so the reference is to the pkg_id + # currently, we cannot simply pass the object name to skbase + # in the error message, so this is a hack + # todo: fix this in scikit-base + msg = str(e) + if len(msg) > 11: + msg = msg[11:] + raise ModuleNotFoundError(msg) from e + + return self._materialize() + + def _materialize(self): + raise RuntimeError("abstract method") + + def serialize(self): + cls_str = class_to_source(type(self)) + compress_method = self.get_tag("pkg_compression") + if compress_method in [None, "None"]: + return cls_str + + cls_str = cls_str.encode("utf-8") + exec(f"import {compress_method}") + compressed_str = eval(f"{compress_method}.compress(cls_str)") + + return compressed_str + + +def _has_source(obj) -> bool: + """ + Return True if inspect.getsource(obj) should succeed. + """ + module_name = getattr(obj, "__module__", None) + if not module_name or module_name not in sys.modules: + return False + + module = sys.modules[module_name] + file = getattr(module, "__file__", None) + if not file: + return False + + return Path(file).suffix == ".py" + + +def class_to_source(cls) -> str: + """Return full source definition of python class as string. + + Parameters + ---------- + cls : class to serialize + + Returns + ------- + str : complete definition of cls, as str. + Imports are not contained or serialized. + """"" + + # Fast path: class has retrievable source + if _has_source(cls): + source = inspect.getsource(cls) + return textwrap.dedent(source) + + # Fallback for dynamically created classes + lines = [] + + bases = [base.__name__ for base in cls.__bases__ if base is not object] + base_str = f"({', '.join(bases)})" if bases else "" + lines.append(f"class {cls.__name__}{base_str}:") + + body_added = False + + for name, value in cls.__dict__.items(): + if name.startswith("__") and name.endswith("__"): + continue + + if inspect.isfunction(value): + if _has_source(value): + method_src = inspect.getsource(value) + method_src = textwrap.indent(textwrap.dedent(method_src), " ") + lines.append(method_src) + else: + lines.append(f" def {name}(self): ...") + body_added = True + else: + lines.append(f" {name} = {repr(value)}") + body_added = True + + if not body_added: + lines.append(" pass") + + return "\n".join(lines) diff --git a/openml/models/__init__.py b/openml/models/__init__.py new file mode 100644 index 000000000..ae833fc63 --- /dev/null +++ b/openml/models/__init__.py @@ -0,0 +1,5 @@ +"""Module with packaging adapters.""" + +from openml.models._get import get + +__all__ = ["get"] diff --git a/openml/models/_get.py b/openml/models/_get.py new file mode 100644 index 000000000..b270ec0b6 --- /dev/null +++ b/openml/models/_get.py @@ -0,0 +1,63 @@ + +"""Model retrieval utility.""" + +from functools import lru_cache + + +def get(id: str): + """Retrieve model object with unique identifier. + + Parameter + --------- + id : str + unique identifier of object to retrieve + + Returns + ------- + class + retrieved object + + Raises + ------ + ModuleNotFoundError + if dependencies of object to retrieve are not satisfied + """ + + id_lookup = _id_lookup() + obj = id_lookup.get(id) + if obj is None: + raise ValueError( + f"Error in openml.get, object with package id {id} " + "does not exist." 
+ ) + return obj().materialize() + + +# todo: need to generalize this later to more types +# currently intentionally retrieves only classifiers +# todo: replace this, optionally, by database backend +def _id_lookup(obj_type=None): + return _id_lookup_cached(obj_type=obj_type).copy() + + +@lru_cache +def _id_lookup_cached(obj_type=None): + all_objs = _all_objects(obj_type=obj_type) + + # todo: generalize that pkg can contain more than one object + lookup_dict = {obj.get_class_tag("pkg_id"): obj for obj in all_objs} + + return lookup_dict + + +@lru_cache +def _all_objects(obj_type=None): + from skbase.lookup import all_objects + + from openml.models.apis._classifier import _ModelPkgClassifier + + clses = all_objects( + object_types=_ModelPkgClassifier, package_name="openml", return_names=False + ) + + return clses diff --git a/openml/models/apis/__init__.py b/openml/models/apis/__init__.py new file mode 100644 index 000000000..f560dcf6f --- /dev/null +++ b/openml/models/apis/__init__.py @@ -0,0 +1,5 @@ +"""Module with packaging adapters.""" + +from openml.models.apis._classifier import _ModelPkgClassifier + +__all__ = ["_ModelPkgClassifier"] diff --git a/openml/models/apis/_classifier.py b/openml/models/apis/_classifier.py new file mode 100644 index 000000000..a6d75b967 --- /dev/null +++ b/openml/models/apis/_classifier.py @@ -0,0 +1,24 @@ +"""Base package for sklearn classifiers.""" + +from openml.models.base import _OpenmlModelPkg + + +class _ModelPkgClassifier(_OpenmlModelPkg): + + _tags = { + # tags specific to API type + "pkg_obj_type": "classifier", + } + + def get_obj_tags(self): + """Return tags of the object as a dictionary.""" + return {} # this needs to be implemented + + def get_obj_param_names(self): + """Return parameter names of the object as a list. + + Returns + ------- + list: names of object parameters + """ + return list(self.materialize()().get_params().keys()) diff --git a/openml/models/base/__init__.py b/openml/models/base/__init__.py new file mode 100644 index 000000000..a60e1e404 --- /dev/null +++ b/openml/models/base/__init__.py @@ -0,0 +1,5 @@ +"""Module with packaging adapters.""" + +from openml.models.base._base import _OpenmlModelPkg + +__all__ = ["_OpenmlModelPkg"] diff --git a/openml/models/base/_base.py b/openml/models/base/_base.py new file mode 100644 index 000000000..4384e754c --- /dev/null +++ b/openml/models/base/_base.py @@ -0,0 +1,41 @@ +"""Base model package class.""" + +from openml.base import _BasePkg + + +class _OpenmlModelPkg(_BasePkg): + + _obj = None + + def _materialize(self): + pkg_obj = self.get_tag("pkg_obj") + + _obj = self._obj + + if _obj is None: + raise ValueError( + "Error in materialize." + "Either _materialize must be implemented, or" + "the _obj attribute must be not None." 
+ ) + + if pkg_obj == "reference": + from skbase.utils.dependencies import _safe_import + + obj = _safe_import(self._obj) + return obj + + elif pkg_obj == "code": + exec(self._obj) + + return obj + + # elif pkg_obj == "craft": + # identify and call appropriate craft method + + else: + raise ValueError( + 'Error in package tag "pkg_obj", ' + 'must be one of "reference", "code", "craft", ' + f'but found value {pkg_obj}, of type {type(pkg_obj)}' + ) diff --git a/openml/models/classification/__init__.py b/openml/models/classification/__init__.py new file mode 100644 index 000000000..e547a50cf --- /dev/null +++ b/openml/models/classification/__init__.py @@ -0,0 +1 @@ +"""Sklearn classification models.""" diff --git a/openml/models/classification/auto_sklearn.py b/openml/models/classification/auto_sklearn.py new file mode 100644 index 000000000..0be641394 --- /dev/null +++ b/openml/models/classification/auto_sklearn.py @@ -0,0 +1,14 @@ +"""Auto-sklearn classifier.""" + + +from openml.models.apis import _ModelPkgClassifier + + +class OpenmlPkg__AutoSklearnClassifier(_ModelPkgClassifier): + + _tags = { + "pkg_id": "AutoSklearnClassifier", + "python_dependencies": "auto-sklearn", + } + + _obj = "autosklearn.classification.AutoSklearnClassifier" diff --git a/openml/models/classification/xgboost.py b/openml/models/classification/xgboost.py new file mode 100644 index 000000000..44f3173fe --- /dev/null +++ b/openml/models/classification/xgboost.py @@ -0,0 +1,14 @@ +"""Xgboost classifier.""" + + +from openml.models.apis import _ModelPkgClassifier + + +class OpenmlPkg__XGBClassifier(_ModelPkgClassifier): + + _tags = { + "pkg_id": "XGBClassifier", + "python_dependencies": "xgboost", + } + + _obj = "xgboost.XGBClassifier" diff --git a/pyproject.toml b/pyproject.toml index 2bf762b09..83b62554d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,7 @@ dependencies = [ "minio", "pyarrow", "tqdm", # For MinIO download progress bars + "scikit-base", ] requires-python = ">=3.8" maintainers = [ From d79fbe52dceca77b70b1c46a7b32aceecef2aa71 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 31 Dec 2025 01:22:05 +0000 Subject: [PATCH 2/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- openml/__init__.py | 3 ++- openml/_get.py | 2 ++ openml/base/_base_pkg.py | 17 +++++++---------- openml/models/_get.py | 19 +++++-------------- openml/models/apis/_classifier.py | 3 ++- openml/models/base/_base.py | 19 +++++++++---------- openml/models/classification/auto_sklearn.py | 2 +- openml/models/classification/xgboost.py | 2 +- 8 files changed, 29 insertions(+), 38 deletions(-) diff --git a/openml/__init__.py b/openml/__init__.py index f93cbb5d3..7eb077057 100644 --- a/openml/__init__.py +++ b/openml/__init__.py @@ -18,6 +18,8 @@ # License: BSD 3-Clause from __future__ import annotations +from openml._get import get + from . 
import ( _api_calls, config, @@ -48,7 +50,6 @@ OpenMLSupervisedTask, OpenMLTask, ) -from openml._get import get def populate_cache( diff --git a/openml/_get.py b/openml/_get.py index b576668db..0c5e9739e 100644 --- a/openml/_get.py +++ b/openml/_get.py @@ -4,6 +4,8 @@ # to discuss and possibly # todo: add global get utility here # in general, e.g., datasets will not have same name as models etc +from __future__ import annotations + from openml.models import get __all__ = ["get"] diff --git a/openml/base/_base_pkg.py b/openml/base/_base_pkg.py index 9f5d6005e..690b93a86 100644 --- a/openml/base/_base_pkg.py +++ b/openml/base/_base_pkg.py @@ -1,16 +1,17 @@ """Base Packager class.""" +from __future__ import annotations + import inspect -from pathlib import Path import sys import textwrap +from pathlib import Path from skbase.base import BaseObject from skbase.utils.dependencies import _check_estimator_deps class _BasePkg(BaseObject): - _tags = { "python_dependencies": None, "python_version": None, @@ -50,15 +51,11 @@ def serialize(self): cls_str = cls_str.encode("utf-8") exec(f"import {compress_method}") - compressed_str = eval(f"{compress_method}.compress(cls_str)") - - return compressed_str + return eval(f"{compress_method}.compress(cls_str)") def _has_source(obj) -> bool: - """ - Return True if inspect.getsource(obj) should succeed. - """ + """Return True if inspect.getsource(obj) should succeed.""" module_name = getattr(obj, "__module__", None) if not module_name or module_name not in sys.modules: return False @@ -82,7 +79,7 @@ def class_to_source(cls) -> str: ------- str : complete definition of cls, as str. Imports are not contained or serialized. - """"" + """ "" # Fast path: class has retrievable source if _has_source(cls): @@ -111,7 +108,7 @@ def class_to_source(cls) -> str: lines.append(f" def {name}(self): ...") body_added = True else: - lines.append(f" {name} = {repr(value)}") + lines.append(f" {name} = {value!r}") body_added = True if not body_added: diff --git a/openml/models/_get.py b/openml/models/_get.py index b270ec0b6..75b807ca7 100644 --- a/openml/models/_get.py +++ b/openml/models/_get.py @@ -1,6 +1,7 @@ - """Model retrieval utility.""" +from __future__ import annotations + from functools import lru_cache @@ -22,14 +23,10 @@ def get(id: str): ModuleNotFoundError if dependencies of object to retrieve are not satisfied """ - id_lookup = _id_lookup() obj = id_lookup.get(id) if obj is None: - raise ValueError( - f"Error in openml.get, object with package id {id} " - "does not exist." 
- ) + raise ValueError(f"Error in openml.get, object with package id {id} " "does not exist.") return obj().materialize() @@ -45,9 +42,7 @@ def _id_lookup_cached(obj_type=None): all_objs = _all_objects(obj_type=obj_type) # todo: generalize that pkg can contain more than one object - lookup_dict = {obj.get_class_tag("pkg_id"): obj for obj in all_objs} - - return lookup_dict + return {obj.get_class_tag("pkg_id"): obj for obj in all_objs} @lru_cache @@ -56,8 +51,4 @@ def _all_objects(obj_type=None): from openml.models.apis._classifier import _ModelPkgClassifier - clses = all_objects( - object_types=_ModelPkgClassifier, package_name="openml", return_names=False - ) - - return clses + return all_objects(object_types=_ModelPkgClassifier, package_name="openml", return_names=False) diff --git a/openml/models/apis/_classifier.py b/openml/models/apis/_classifier.py index a6d75b967..c1198ee32 100644 --- a/openml/models/apis/_classifier.py +++ b/openml/models/apis/_classifier.py @@ -1,10 +1,11 @@ """Base package for sklearn classifiers.""" +from __future__ import annotations + from openml.models.base import _OpenmlModelPkg class _ModelPkgClassifier(_OpenmlModelPkg): - _tags = { # tags specific to API type "pkg_obj_type": "classifier", diff --git a/openml/models/base/_base.py b/openml/models/base/_base.py index 4384e754c..6b3fa2a92 100644 --- a/openml/models/base/_base.py +++ b/openml/models/base/_base.py @@ -1,10 +1,11 @@ """Base model package class.""" +from __future__ import annotations + from openml.base import _BasePkg class _OpenmlModelPkg(_BasePkg): - _obj = None def _materialize(self): @@ -22,10 +23,9 @@ def _materialize(self): if pkg_obj == "reference": from skbase.utils.dependencies import _safe_import - obj = _safe_import(self._obj) - return obj + return _safe_import(self._obj) - elif pkg_obj == "code": + if pkg_obj == "code": exec(self._obj) return obj @@ -33,9 +33,8 @@ def _materialize(self): # elif pkg_obj == "craft": # identify and call appropriate craft method - else: - raise ValueError( - 'Error in package tag "pkg_obj", ' - 'must be one of "reference", "code", "craft", ' - f'but found value {pkg_obj}, of type {type(pkg_obj)}' - ) + raise ValueError( + 'Error in package tag "pkg_obj", ' + 'must be one of "reference", "code", "craft", ' + f"but found value {pkg_obj}, of type {type(pkg_obj)}" + ) diff --git a/openml/models/classification/auto_sklearn.py b/openml/models/classification/auto_sklearn.py index 0be641394..1d29044da 100644 --- a/openml/models/classification/auto_sklearn.py +++ b/openml/models/classification/auto_sklearn.py @@ -1,11 +1,11 @@ """Auto-sklearn classifier.""" +from __future__ import annotations from openml.models.apis import _ModelPkgClassifier class OpenmlPkg__AutoSklearnClassifier(_ModelPkgClassifier): - _tags = { "pkg_id": "AutoSklearnClassifier", "python_dependencies": "auto-sklearn", diff --git a/openml/models/classification/xgboost.py b/openml/models/classification/xgboost.py index 44f3173fe..5b91e647c 100644 --- a/openml/models/classification/xgboost.py +++ b/openml/models/classification/xgboost.py @@ -1,11 +1,11 @@ """Xgboost classifier.""" +from __future__ import annotations from openml.models.apis import _ModelPkgClassifier class OpenmlPkg__XGBClassifier(_ModelPkgClassifier): - _tags = { "pkg_id": "XGBClassifier", "python_dependencies": "xgboost", From dc15c669213f71e17cda16d8b91c3045f71143d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Fri, 9 Jan 2026 21:26:15 +0100 Subject: [PATCH 3/8] move utils --- openml/utils/__init__.py | 35 
+++ openml/utils/_openml.py | 471 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 506 insertions(+) create mode 100644 openml/utils/__init__.py create mode 100644 openml/utils/_openml.py diff --git a/openml/utils/__init__.py b/openml/utils/__init__.py new file mode 100644 index 000000000..83e379222 --- /dev/null +++ b/openml/utils/__init__.py @@ -0,0 +1,35 @@ +"""Utilities module.""" + +from openml.utils._openml import ( + ProgressBar, + _create_cache_directory, + _create_cache_directory_for_id, + _create_lockfiles_dir, + _delete_entity, + _get_cache_dir_for_id, + _get_cache_dir_for_key, + _get_rest_api_type_alias, + _list_all, + _remove_cache_dir_for_id, + _tag_entity, + _tag_openml_base, + extract_xml_tags, + thread_safe_if_oslo_installed, +) + +__all__ = [ + "ProgressBar", + "_create_cache_directory", + "_create_cache_directory_for_id", + "_create_lockfiles_dir", + "_delete_entity", + "_get_cache_dir_for_id", + "_get_cache_dir_for_key", + "_get_rest_api_type_alias", + "_list_all", + "_remove_cache_dir_for_id", + "_tag_entity", + "_tag_openml_base", + "extract_xml_tags", + "thread_safe_if_oslo_installed", +] diff --git a/openml/utils/_openml.py b/openml/utils/_openml.py new file mode 100644 index 000000000..7e72e7aee --- /dev/null +++ b/openml/utils/_openml.py @@ -0,0 +1,471 @@ +# License: BSD 3-Clause +from __future__ import annotations + +import contextlib +import shutil +import warnings +from functools import wraps +from pathlib import Path +from typing import TYPE_CHECKING, Any, Callable, Mapping, Sized, TypeVar, overload +from typing_extensions import Literal, ParamSpec + +import numpy as np +import xmltodict +from minio.helpers import ProgressType +from tqdm import tqdm + +import openml +import openml._api_calls +import openml.exceptions + +from . import config + +# Avoid import cycles: https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles +if TYPE_CHECKING: + from openml.base import OpenMLBase + + P = ParamSpec("P") + R = TypeVar("R") + _SizedT = TypeVar("_SizedT", bound=Sized) + + +@overload +def extract_xml_tags( + xml_tag_name: str, + node: Mapping[str, Any], + *, + allow_none: Literal[True] = ..., +) -> Any | None: ... + + +@overload +def extract_xml_tags( + xml_tag_name: str, + node: Mapping[str, Any], + *, + allow_none: Literal[False], +) -> Any: ... + + +def extract_xml_tags( + xml_tag_name: str, + node: Mapping[str, Any], + *, + allow_none: bool = True, +) -> Any | None: + """Helper to extract xml tags from xmltodict. + + Parameters + ---------- + xml_tag_name : str + Name of the xml tag to extract from the node. + + node : Mapping[str, Any] + Node object returned by ``xmltodict`` from which ``xml_tag_name`` + should be extracted. + + allow_none : bool + If ``False``, the tag needs to exist in the node. Will raise a + ``ValueError`` if it does not. 
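+
+    Example
+    -------
+    A minimal sketch with a hypothetical node; single values are wrapped
+    in a one-element list:
+
+    >>> extract_xml_tags("oml:tag", {"oml:tag": "weather"})
+    ['weather']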
+ + Returns + ------- + object + """ + if xml_tag_name in node and node[xml_tag_name] is not None: + if isinstance(node[xml_tag_name], (dict, str)): + return [node[xml_tag_name]] + if isinstance(node[xml_tag_name], list): + return node[xml_tag_name] + + raise ValueError("Received not string and non list as tag item") + + if allow_none: + return None + + raise ValueError(f"Could not find tag '{xml_tag_name}' in node '{node!s}'") + + +def _get_rest_api_type_alias(oml_object: OpenMLBase) -> str: + """Return the alias of the openml entity as it is defined for the REST API.""" + rest_api_mapping: list[tuple[type | tuple, str]] = [ + (openml.datasets.OpenMLDataset, "data"), + (openml.flows.OpenMLFlow, "flow"), + (openml.tasks.OpenMLTask, "task"), + (openml.runs.OpenMLRun, "run"), + ((openml.study.OpenMLStudy, openml.study.OpenMLBenchmarkSuite), "study"), + ] + _, api_type_alias = next( + (python_type, api_alias) + for (python_type, api_alias) in rest_api_mapping + if isinstance(oml_object, python_type) + ) + return api_type_alias + + +def _tag_openml_base(oml_object: OpenMLBase, tag: str, untag: bool = False) -> None: # noqa: FBT001, FBT002 + api_type_alias = _get_rest_api_type_alias(oml_object) + if oml_object.id is None: + raise openml.exceptions.ObjectNotPublishedError( + f"Cannot tag an {api_type_alias} that has not been published yet." + "Please publish the object first before being able to tag it." + f"\n{oml_object}", + ) + _tag_entity(entity_type=api_type_alias, entity_id=oml_object.id, tag=tag, untag=untag) + + +def _tag_entity(entity_type: str, entity_id: int, tag: str, *, untag: bool = False) -> list[str]: + """ + Function that tags or untags a given entity on OpenML. As the OpenML + API tag functions all consist of the same format, this function covers + all entity types (currently: dataset, task, flow, setup, run). Could + be used in a partial to provide dataset_tag, dataset_untag, etc. + + Parameters + ---------- + entity_type : str + Name of the entity to tag (e.g., run, flow, data) + + entity_id : int + OpenML id of the entity + + tag : str + The tag + + untag : bool + Set to true if needed to untag, rather than tag + + Returns + ------- + tags : list + List of tags that the entity is (still) tagged with + """ + legal_entities = {"data", "task", "flow", "setup", "run"} + if entity_type not in legal_entities: + raise ValueError(f"Can't tag a {entity_type}") + + if untag: + uri = f"{entity_type}/untag" + main_tag = f"oml:{entity_type}_untag" + else: + uri = f"{entity_type}/tag" + main_tag = f"oml:{entity_type}_tag" + + result_xml = openml._api_calls._perform_api_call( + uri, + "post", + {f"{entity_type}_id": entity_id, "tag": tag}, + ) + + result = xmltodict.parse(result_xml, force_list={"oml:tag"})[main_tag] + + if "oml:tag" in result: + return result["oml:tag"] # type: ignore + + # no tags, return empty list + return [] + + +# TODO(eddiebergman): Maybe this can be made more specific with a Literal +def _delete_entity(entity_type: str, entity_id: int) -> bool: + """ + Function that deletes a given entity on OpenML. As the OpenML + API tag functions all consist of the same format, this function covers + all entity types that can be deleted (currently: dataset, task, flow, + run, study and user). + + Parameters + ---------- + entity_type : str + Name of the entity to tag (e.g., run, flow, data) + + entity_id : int + OpenML id of the entity + + Returns + ------- + bool + True iff the deletion was successful. 
False otherwse + """ + legal_entities = { + "data", + "flow", + "task", + "run", + "study", + "user", + } + if entity_type not in legal_entities: + raise ValueError(f"Can't delete a {entity_type}") + + url_suffix = "%s/%d" % (entity_type, entity_id) + try: + result_xml = openml._api_calls._perform_api_call(url_suffix, "delete") + result = xmltodict.parse(result_xml) + return f"oml:{entity_type}_delete" in result + except openml.exceptions.OpenMLServerException as e: + # https://github.com/openml/OpenML/blob/21f6188d08ac24fcd2df06ab94cf421c946971b0/openml_OS/views/pages/api_new/v1/xml/pre.php + # Most exceptions are descriptive enough to be raised as their standard + # OpenMLServerException, however there are two cases where we add information: + # - a generic "failed" message, we direct them to the right issue board + # - when the user successfully authenticates with the server, + # but user is not allowed to take the requested action, + # in which case we specify a OpenMLNotAuthorizedError. + by_other_user = [323, 353, 393, 453, 594] + has_dependent_entities = [324, 326, 327, 328, 354, 454, 464, 595] + unknown_reason = [325, 355, 394, 455, 593] + if e.code in by_other_user: + raise openml.exceptions.OpenMLNotAuthorizedError( + message=( + f"The {entity_type} can not be deleted because it was not uploaded by you." + ), + ) from e + if e.code in has_dependent_entities: + raise openml.exceptions.OpenMLNotAuthorizedError( + message=( + f"The {entity_type} can not be deleted because " + f"it still has associated entities: {e.message}" + ), + ) from e + if e.code in unknown_reason: + raise openml.exceptions.OpenMLServerError( + message=( + f"The {entity_type} can not be deleted for unknown reason," + " please open an issue at: https://github.com/openml/openml/issues/new" + ), + ) from e + raise e + + +def _list_all( # noqa: C901 + listing_call: Callable[[int, int], _SizedT], + *, + limit: int | None = None, + offset: int | None = None, + batch_size: int | None = 10_000, +) -> list[_SizedT]: + """Helper to handle paged listing requests. + + Example usage: + + ``evaluations = list_all(list_evaluations, "predictive_accuracy", task=mytask)`` + + Parameters + ---------- + listing_call : callable + Call listing, e.g. list_evaluations. Takes two positional + arguments: batch_size and offset. + batch_size : int, optional + The batch size to use for the listing call. + offset : int, optional + The initial offset to use for the listing call. + limit : int, optional + The total size of the listing. If not provided, the function will + request the first batch and then continue until no more results are + returned + + Returns + ------- + List of types returned from type of the listing call + """ + page = 0 + results: list[_SizedT] = [] + + offset = offset if offset is not None else 0 + batch_size = batch_size if batch_size is not None else 10_000 + + LIMIT = limit + BATCH_SIZE_ORIG = batch_size + + # Default batch size per paging. + # This one can be set in filters (batch_size), but should not be + # changed afterwards. The derived batch_size can be changed. 
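+    # For example (hypothetical sizes): limit=25_000 with batch_size=10_000
+    # results in requests of 10_000, 10_000 and 5_000 items, since the
+    # last batch is trimmed to the remaining limit below.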
+ if not isinstance(BATCH_SIZE_ORIG, int): + raise ValueError(f"'batch_size' should be an integer but got {BATCH_SIZE_ORIG}") + + if (LIMIT is not None) and (not isinstance(LIMIT, int)) and (not np.isinf(LIMIT)): + raise ValueError(f"'limit' should be an integer or inf but got {LIMIT}") + + # If our batch size is larger than the limit, we should only + # request one batch of size of LIMIT + if LIMIT is not None and BATCH_SIZE_ORIG > LIMIT: + BATCH_SIZE_ORIG = LIMIT + + if not isinstance(offset, int): + raise ValueError(f"'offset' should be an integer but got {offset}") + + batch_size = BATCH_SIZE_ORIG + while True: + try: + current_offset = offset + BATCH_SIZE_ORIG * page + new_batch = listing_call(batch_size, current_offset) + except openml.exceptions.OpenMLServerNoResult: + # NOTE: This above statement may not actually happen, but we could just return here + # to enforce it... + break + + results.append(new_batch) + + # If the batch is less than our requested batch_size, that's the last batch + # and we can bail out. + if len(new_batch) < batch_size: + break + + page += 1 + if LIMIT is not None: + # check if the number of required results has been achieved + # always do a 'bigger than' check, + # in case of bugs to prevent infinite loops + n_received = sum(len(result) for result in results) + if n_received >= LIMIT: + break + + # check if there are enough results to fulfill a batch + if LIMIT - n_received < BATCH_SIZE_ORIG: + batch_size = LIMIT - n_received + + return results + + +def _get_cache_dir_for_key(key: str) -> Path: + return Path(config.get_cache_directory()) / key + + +def _create_cache_directory(key: str) -> Path: + cache_dir = _get_cache_dir_for_key(key) + + try: + cache_dir.mkdir(exist_ok=True, parents=True) + except Exception as e: + raise openml.exceptions.OpenMLCacheException( + f"Cannot create cache directory {cache_dir}." + ) from e + + return cache_dir + + +def _get_cache_dir_for_id(key: str, id_: int, create: bool = False) -> Path: # noqa: FBT001, FBT002 + cache_dir = _create_cache_directory(key) if create else _get_cache_dir_for_key(key) + return Path(cache_dir) / str(id_) + + +def _create_cache_directory_for_id(key: str, id_: int) -> Path: + """Create the cache directory for a specific ID + + In order to have a clearer cache structure and because every task + is cached in several files (description, split), there + is a directory for each task witch the task ID being the directory + name. This function creates this cache directory. + + This function is NOT thread/multiprocessing safe. + + Parameters + ---------- + key : str + + id_ : int + + Returns + ------- + cache_dir : Path + Path of the created dataset cache directory. + """ + cache_dir = _get_cache_dir_for_id(key, id_, create=True) + if cache_dir.exists() and not cache_dir.is_dir(): + raise ValueError(f"{key} cache dir exists but is not a directory!") + + cache_dir.mkdir(exist_ok=True, parents=True) + return cache_dir + + +def _remove_cache_dir_for_id(key: str, cache_dir: Path) -> None: + """Remove the task cache directory + + This function is NOT thread/multiprocessing safe. + + Parameters + ---------- + key : str + + cache_dir : str + """ + try: + shutil.rmtree(cache_dir) + except OSError as e: + raise ValueError( + f"Cannot remove faulty {key} cache directory {cache_dir}. 
Please do this manually!", + ) from e + + +def thread_safe_if_oslo_installed(func: Callable[P, R]) -> Callable[P, R]: + try: + # Currently, importing oslo raises a lot of warning that it will stop working + # under python3.8; remove this once they disappear + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + from oslo_concurrency import lockutils + + @wraps(func) + def safe_func(*args: P.args, **kwargs: P.kwargs) -> R: + # Lock directories use the id that is passed as either positional or keyword argument. + id_parameters = [parameter_name for parameter_name in kwargs if "_id" in parameter_name] + if len(id_parameters) == 1: + id_ = kwargs[id_parameters[0]] + elif len(args) > 0: + id_ = args[0] + else: + raise RuntimeError( + f"An id must be specified for {func.__name__}, was passed: ({args}, {kwargs}).", + ) + # The [7:] gets rid of the 'openml.' prefix + lock_name = f"{func.__module__[7:]}.{func.__name__}:{id_}" + with lockutils.external_lock(name=lock_name, lock_path=_create_lockfiles_dir()): + return func(*args, **kwargs) + + return safe_func + except ImportError: + return func + + +def _create_lockfiles_dir() -> Path: + path = Path(config.get_cache_directory()) / "locks" + # TODO(eddiebergman): Not sure why this is allowed to error and ignore??? + with contextlib.suppress(OSError): + path.mkdir(exist_ok=True, parents=True) + return path + + +class ProgressBar(ProgressType): + """Progressbar for MinIO function's `progress` parameter.""" + + def __init__(self) -> None: + self._object_name = "" + self._progress_bar: tqdm | None = None + + def set_meta(self, object_name: str, total_length: int) -> None: + """Initializes the progress bar. + + Parameters + ---------- + object_name: str + Not used. + + total_length: int + File size of the object in bytes. + """ + self._object_name = object_name + self._progress_bar = tqdm(total=total_length, unit_scale=True, unit="B") + + def update(self, length: int) -> None: + """Updates the progress bar. + + Parameters + ---------- + length: int + Number of bytes downloaded since last `update` call. + """ + if not self._progress_bar: + raise RuntimeError("Call `set_meta` before calling `update`.") + self._progress_bar.update(length) + if self._progress_bar.total <= self._progress_bar.n: + self._progress_bar.close() From 91c138cfc54badcfd0be81f8de704b6e38fcb4cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Fri, 9 Jan 2026 21:27:45 +0100 Subject: [PATCH 4/8] Delete utils.py --- openml/utils.py | 471 ------------------------------------------------ 1 file changed, 471 deletions(-) delete mode 100644 openml/utils.py diff --git a/openml/utils.py b/openml/utils.py deleted file mode 100644 index 7e72e7aee..000000000 --- a/openml/utils.py +++ /dev/null @@ -1,471 +0,0 @@ -# License: BSD 3-Clause -from __future__ import annotations - -import contextlib -import shutil -import warnings -from functools import wraps -from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, Mapping, Sized, TypeVar, overload -from typing_extensions import Literal, ParamSpec - -import numpy as np -import xmltodict -from minio.helpers import ProgressType -from tqdm import tqdm - -import openml -import openml._api_calls -import openml.exceptions - -from . 
import config - -# Avoid import cycles: https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles -if TYPE_CHECKING: - from openml.base import OpenMLBase - - P = ParamSpec("P") - R = TypeVar("R") - _SizedT = TypeVar("_SizedT", bound=Sized) - - -@overload -def extract_xml_tags( - xml_tag_name: str, - node: Mapping[str, Any], - *, - allow_none: Literal[True] = ..., -) -> Any | None: ... - - -@overload -def extract_xml_tags( - xml_tag_name: str, - node: Mapping[str, Any], - *, - allow_none: Literal[False], -) -> Any: ... - - -def extract_xml_tags( - xml_tag_name: str, - node: Mapping[str, Any], - *, - allow_none: bool = True, -) -> Any | None: - """Helper to extract xml tags from xmltodict. - - Parameters - ---------- - xml_tag_name : str - Name of the xml tag to extract from the node. - - node : Mapping[str, Any] - Node object returned by ``xmltodict`` from which ``xml_tag_name`` - should be extracted. - - allow_none : bool - If ``False``, the tag needs to exist in the node. Will raise a - ``ValueError`` if it does not. - - Returns - ------- - object - """ - if xml_tag_name in node and node[xml_tag_name] is not None: - if isinstance(node[xml_tag_name], (dict, str)): - return [node[xml_tag_name]] - if isinstance(node[xml_tag_name], list): - return node[xml_tag_name] - - raise ValueError("Received not string and non list as tag item") - - if allow_none: - return None - - raise ValueError(f"Could not find tag '{xml_tag_name}' in node '{node!s}'") - - -def _get_rest_api_type_alias(oml_object: OpenMLBase) -> str: - """Return the alias of the openml entity as it is defined for the REST API.""" - rest_api_mapping: list[tuple[type | tuple, str]] = [ - (openml.datasets.OpenMLDataset, "data"), - (openml.flows.OpenMLFlow, "flow"), - (openml.tasks.OpenMLTask, "task"), - (openml.runs.OpenMLRun, "run"), - ((openml.study.OpenMLStudy, openml.study.OpenMLBenchmarkSuite), "study"), - ] - _, api_type_alias = next( - (python_type, api_alias) - for (python_type, api_alias) in rest_api_mapping - if isinstance(oml_object, python_type) - ) - return api_type_alias - - -def _tag_openml_base(oml_object: OpenMLBase, tag: str, untag: bool = False) -> None: # noqa: FBT001, FBT002 - api_type_alias = _get_rest_api_type_alias(oml_object) - if oml_object.id is None: - raise openml.exceptions.ObjectNotPublishedError( - f"Cannot tag an {api_type_alias} that has not been published yet." - "Please publish the object first before being able to tag it." - f"\n{oml_object}", - ) - _tag_entity(entity_type=api_type_alias, entity_id=oml_object.id, tag=tag, untag=untag) - - -def _tag_entity(entity_type: str, entity_id: int, tag: str, *, untag: bool = False) -> list[str]: - """ - Function that tags or untags a given entity on OpenML. As the OpenML - API tag functions all consist of the same format, this function covers - all entity types (currently: dataset, task, flow, setup, run). Could - be used in a partial to provide dataset_tag, dataset_untag, etc. 
- - Parameters - ---------- - entity_type : str - Name of the entity to tag (e.g., run, flow, data) - - entity_id : int - OpenML id of the entity - - tag : str - The tag - - untag : bool - Set to true if needed to untag, rather than tag - - Returns - ------- - tags : list - List of tags that the entity is (still) tagged with - """ - legal_entities = {"data", "task", "flow", "setup", "run"} - if entity_type not in legal_entities: - raise ValueError(f"Can't tag a {entity_type}") - - if untag: - uri = f"{entity_type}/untag" - main_tag = f"oml:{entity_type}_untag" - else: - uri = f"{entity_type}/tag" - main_tag = f"oml:{entity_type}_tag" - - result_xml = openml._api_calls._perform_api_call( - uri, - "post", - {f"{entity_type}_id": entity_id, "tag": tag}, - ) - - result = xmltodict.parse(result_xml, force_list={"oml:tag"})[main_tag] - - if "oml:tag" in result: - return result["oml:tag"] # type: ignore - - # no tags, return empty list - return [] - - -# TODO(eddiebergman): Maybe this can be made more specific with a Literal -def _delete_entity(entity_type: str, entity_id: int) -> bool: - """ - Function that deletes a given entity on OpenML. As the OpenML - API tag functions all consist of the same format, this function covers - all entity types that can be deleted (currently: dataset, task, flow, - run, study and user). - - Parameters - ---------- - entity_type : str - Name of the entity to tag (e.g., run, flow, data) - - entity_id : int - OpenML id of the entity - - Returns - ------- - bool - True iff the deletion was successful. False otherwse - """ - legal_entities = { - "data", - "flow", - "task", - "run", - "study", - "user", - } - if entity_type not in legal_entities: - raise ValueError(f"Can't delete a {entity_type}") - - url_suffix = "%s/%d" % (entity_type, entity_id) - try: - result_xml = openml._api_calls._perform_api_call(url_suffix, "delete") - result = xmltodict.parse(result_xml) - return f"oml:{entity_type}_delete" in result - except openml.exceptions.OpenMLServerException as e: - # https://github.com/openml/OpenML/blob/21f6188d08ac24fcd2df06ab94cf421c946971b0/openml_OS/views/pages/api_new/v1/xml/pre.php - # Most exceptions are descriptive enough to be raised as their standard - # OpenMLServerException, however there are two cases where we add information: - # - a generic "failed" message, we direct them to the right issue board - # - when the user successfully authenticates with the server, - # but user is not allowed to take the requested action, - # in which case we specify a OpenMLNotAuthorizedError. - by_other_user = [323, 353, 393, 453, 594] - has_dependent_entities = [324, 326, 327, 328, 354, 454, 464, 595] - unknown_reason = [325, 355, 394, 455, 593] - if e.code in by_other_user: - raise openml.exceptions.OpenMLNotAuthorizedError( - message=( - f"The {entity_type} can not be deleted because it was not uploaded by you." 
- ), - ) from e - if e.code in has_dependent_entities: - raise openml.exceptions.OpenMLNotAuthorizedError( - message=( - f"The {entity_type} can not be deleted because " - f"it still has associated entities: {e.message}" - ), - ) from e - if e.code in unknown_reason: - raise openml.exceptions.OpenMLServerError( - message=( - f"The {entity_type} can not be deleted for unknown reason," - " please open an issue at: https://github.com/openml/openml/issues/new" - ), - ) from e - raise e - - -def _list_all( # noqa: C901 - listing_call: Callable[[int, int], _SizedT], - *, - limit: int | None = None, - offset: int | None = None, - batch_size: int | None = 10_000, -) -> list[_SizedT]: - """Helper to handle paged listing requests. - - Example usage: - - ``evaluations = list_all(list_evaluations, "predictive_accuracy", task=mytask)`` - - Parameters - ---------- - listing_call : callable - Call listing, e.g. list_evaluations. Takes two positional - arguments: batch_size and offset. - batch_size : int, optional - The batch size to use for the listing call. - offset : int, optional - The initial offset to use for the listing call. - limit : int, optional - The total size of the listing. If not provided, the function will - request the first batch and then continue until no more results are - returned - - Returns - ------- - List of types returned from type of the listing call - """ - page = 0 - results: list[_SizedT] = [] - - offset = offset if offset is not None else 0 - batch_size = batch_size if batch_size is not None else 10_000 - - LIMIT = limit - BATCH_SIZE_ORIG = batch_size - - # Default batch size per paging. - # This one can be set in filters (batch_size), but should not be - # changed afterwards. The derived batch_size can be changed. - if not isinstance(BATCH_SIZE_ORIG, int): - raise ValueError(f"'batch_size' should be an integer but got {BATCH_SIZE_ORIG}") - - if (LIMIT is not None) and (not isinstance(LIMIT, int)) and (not np.isinf(LIMIT)): - raise ValueError(f"'limit' should be an integer or inf but got {LIMIT}") - - # If our batch size is larger than the limit, we should only - # request one batch of size of LIMIT - if LIMIT is not None and BATCH_SIZE_ORIG > LIMIT: - BATCH_SIZE_ORIG = LIMIT - - if not isinstance(offset, int): - raise ValueError(f"'offset' should be an integer but got {offset}") - - batch_size = BATCH_SIZE_ORIG - while True: - try: - current_offset = offset + BATCH_SIZE_ORIG * page - new_batch = listing_call(batch_size, current_offset) - except openml.exceptions.OpenMLServerNoResult: - # NOTE: This above statement may not actually happen, but we could just return here - # to enforce it... - break - - results.append(new_batch) - - # If the batch is less than our requested batch_size, that's the last batch - # and we can bail out. 
- if len(new_batch) < batch_size: - break - - page += 1 - if LIMIT is not None: - # check if the number of required results has been achieved - # always do a 'bigger than' check, - # in case of bugs to prevent infinite loops - n_received = sum(len(result) for result in results) - if n_received >= LIMIT: - break - - # check if there are enough results to fulfill a batch - if LIMIT - n_received < BATCH_SIZE_ORIG: - batch_size = LIMIT - n_received - - return results - - -def _get_cache_dir_for_key(key: str) -> Path: - return Path(config.get_cache_directory()) / key - - -def _create_cache_directory(key: str) -> Path: - cache_dir = _get_cache_dir_for_key(key) - - try: - cache_dir.mkdir(exist_ok=True, parents=True) - except Exception as e: - raise openml.exceptions.OpenMLCacheException( - f"Cannot create cache directory {cache_dir}." - ) from e - - return cache_dir - - -def _get_cache_dir_for_id(key: str, id_: int, create: bool = False) -> Path: # noqa: FBT001, FBT002 - cache_dir = _create_cache_directory(key) if create else _get_cache_dir_for_key(key) - return Path(cache_dir) / str(id_) - - -def _create_cache_directory_for_id(key: str, id_: int) -> Path: - """Create the cache directory for a specific ID - - In order to have a clearer cache structure and because every task - is cached in several files (description, split), there - is a directory for each task witch the task ID being the directory - name. This function creates this cache directory. - - This function is NOT thread/multiprocessing safe. - - Parameters - ---------- - key : str - - id_ : int - - Returns - ------- - cache_dir : Path - Path of the created dataset cache directory. - """ - cache_dir = _get_cache_dir_for_id(key, id_, create=True) - if cache_dir.exists() and not cache_dir.is_dir(): - raise ValueError(f"{key} cache dir exists but is not a directory!") - - cache_dir.mkdir(exist_ok=True, parents=True) - return cache_dir - - -def _remove_cache_dir_for_id(key: str, cache_dir: Path) -> None: - """Remove the task cache directory - - This function is NOT thread/multiprocessing safe. - - Parameters - ---------- - key : str - - cache_dir : str - """ - try: - shutil.rmtree(cache_dir) - except OSError as e: - raise ValueError( - f"Cannot remove faulty {key} cache directory {cache_dir}. Please do this manually!", - ) from e - - -def thread_safe_if_oslo_installed(func: Callable[P, R]) -> Callable[P, R]: - try: - # Currently, importing oslo raises a lot of warning that it will stop working - # under python3.8; remove this once they disappear - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - from oslo_concurrency import lockutils - - @wraps(func) - def safe_func(*args: P.args, **kwargs: P.kwargs) -> R: - # Lock directories use the id that is passed as either positional or keyword argument. - id_parameters = [parameter_name for parameter_name in kwargs if "_id" in parameter_name] - if len(id_parameters) == 1: - id_ = kwargs[id_parameters[0]] - elif len(args) > 0: - id_ = args[0] - else: - raise RuntimeError( - f"An id must be specified for {func.__name__}, was passed: ({args}, {kwargs}).", - ) - # The [7:] gets rid of the 'openml.' 
prefix - lock_name = f"{func.__module__[7:]}.{func.__name__}:{id_}" - with lockutils.external_lock(name=lock_name, lock_path=_create_lockfiles_dir()): - return func(*args, **kwargs) - - return safe_func - except ImportError: - return func - - -def _create_lockfiles_dir() -> Path: - path = Path(config.get_cache_directory()) / "locks" - # TODO(eddiebergman): Not sure why this is allowed to error and ignore??? - with contextlib.suppress(OSError): - path.mkdir(exist_ok=True, parents=True) - return path - - -class ProgressBar(ProgressType): - """Progressbar for MinIO function's `progress` parameter.""" - - def __init__(self) -> None: - self._object_name = "" - self._progress_bar: tqdm | None = None - - def set_meta(self, object_name: str, total_length: int) -> None: - """Initializes the progress bar. - - Parameters - ---------- - object_name: str - Not used. - - total_length: int - File size of the object in bytes. - """ - self._object_name = object_name - self._progress_bar = tqdm(total=total_length, unit_scale=True, unit="B") - - def update(self, length: int) -> None: - """Updates the progress bar. - - Parameters - ---------- - length: int - Number of bytes downloaded since last `update` call. - """ - if not self._progress_bar: - raise RuntimeError("Call `set_meta` before calling `update`.") - self._progress_bar.update(length) - if self._progress_bar.total <= self._progress_bar.n: - self._progress_bar.close() From 778b87e5372ee27a50713cf6d2dfc0ab0308aab3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Fri, 9 Jan 2026 21:28:37 +0100 Subject: [PATCH 5/8] indexer sklearn --- openml/utils/_indexing/__init__.py | 1 + openml/utils/_indexing/_preindex_sklearn.py | 137 ++++++++++++++++++++ openml/utils/_inmemory/__init__.py | 1 + openml/utils/_inmemory/_dict.py | 54 ++++++++ 4 files changed, 193 insertions(+) create mode 100644 openml/utils/_indexing/__init__.py create mode 100644 openml/utils/_indexing/_preindex_sklearn.py create mode 100644 openml/utils/_inmemory/__init__.py create mode 100644 openml/utils/_inmemory/_dict.py diff --git a/openml/utils/_indexing/__init__.py b/openml/utils/_indexing/__init__.py new file mode 100644 index 000000000..80b82550d --- /dev/null +++ b/openml/utils/_indexing/__init__.py @@ -0,0 +1 @@ +"""Utilities module for indexing third party libraries.""" diff --git a/openml/utils/_indexing/_preindex_sklearn.py b/openml/utils/_indexing/_preindex_sklearn.py new file mode 100644 index 000000000..5291f3adb --- /dev/null +++ b/openml/utils/_indexing/_preindex_sklearn.py @@ -0,0 +1,137 @@ +"""Registry lookup methods - scikit-learn estimators.""" +# adapted from the sktime utility of the same name +# copyright: sktime developers, BSD-3-Clause License (see LICENSE file) + +__author__ = ["fkiraly"] +# all_estimators is also based on the sklearn utility of the same name + +from skbase.lookup import all_objects + + +def _all_sklearn_estimators_locdict(package_name="sklearn", serialized=False): + """Dictionary of all scikit-learn estimators in sktime and sklearn. + + Parameters + ---------- + package_name : str, optional (default="sklearn") + The package from which to retrieve the sklearn estimators. + This is an import name, e.g., ``"sklearn"``, not a PEP 440 package identifier, + e.g., ``"scikit-learn"``. + + serialized : bool, optional (default=False) + If True, returns a serialized version of the dict, via + ``openml.utils._inmemory._dict.serialize_dict``. + If False, returns the dict directly. 
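+        The serialized form is an executable code snippet; evaluating it
+        with ``eval`` recreates the dict.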
+ + Returns + ------- + loc_dict : dict + A dictionary with: + + * keys: str, estimator class name, e.g., ``RandomForestClassifier`` + * values: str, public import path of the estimator, e.g., + ``sklearn.ensemble.RandomForestClassifier`` + """ + all_ests = _all_sklearn_estimators( + package_name=package_name, + return_names=False, + ) + + loc_dict = {est.__name__: f"{est.__module__}.{est.__name__}" for est in all_ests} + + if serialized: + from openml.utils._inmemory._dict import serialize_dict + + loc_dict = serialize_dict(loc_dict, name="sklearn_estimators_loc_dict") + + return loc_dict + + +def _all_sklearn_estimators( + package_name="sklearn", + return_names=True, + as_dataframe=False, + suppress_import_stdout=True, +): + """List all scikit-learn objects in a given package. + + This function retrieves all sklearn objects inheriting from ``BaseEstimator``, + from the import location given by ``package_name``. + + Not included are: the base classes themselves, classes defined in test modules. + + Parameters + ---------- + package_name : str, optional (default="sklearn") + The package from which to retrieve the sklearn estimators. + This is an import name, e.g., ``"sklearn"``, not a PEP 440 package identifier, + e.g., ``"scikit-learn"``. + + return_names: bool, optional (default=True) + + if True, estimator class name is included in the ``all_estimators`` + return in the order: name, estimator class, optional tags, either as + a tuple or as pandas.DataFrame columns + + if False, estimator class name is removed from the ``all_estimators`` return. + + as_dataframe: bool, optional (default=False) + + True: ``all_estimators`` will return a ``pandas.DataFrame`` with named + columns for all of the attributes being returned. + + False: ``all_estimators`` will return a list (either a list of + estimators or a list of tuples, see Returns) + + suppress_import_stdout : bool, optional. Default=True + whether to suppress stdout printout upon import. + + Returns + ------- + all_estimators will return one of the following: + + 1. list of estimators, if ``return_names=False``, and ``return_tags`` is None + + 2. list of tuples (optional estimator name, class, ~ptional estimator + tags), if ``return_names=True`` or ``return_tags`` is not ``None``. + + 3. ``pandas.DataFrame`` if ``as_dataframe = True`` + + if list of estimators: + entries are estimators matching the query, + in alphabetical order of estimator name + if list of tuples: + list of (optional estimator name, estimator, optional estimator + tags) matching the query, in alphabetical order of estimator name, + where + ``name`` is the estimator name as string, and is an + optional return + ``estimator`` is the actual estimator + ``tags`` are the estimator's values for each tag in return_tags + and is an optional return. + if ``DataFrame``: + column names represent the attributes contained in each column. + "estimators" will be the name of the column of estimators, "names" + will be the name of the column of estimator class names and the string(s) + passed in return_tags will serve as column names for all columns of + tags that were optionally requested. 
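+
+    Example
+    -------
+    A minimal sketch, assuming ``scikit-learn`` is installed:
+
+    >>> ests = _all_sklearn_estimators()  # doctest: +SKIP
+    >>> any(name == "RandomForestClassifier" for name, _ in ests)  # doctest: +SKIP
+    True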
+ """ # noqa: E501 + from sklearn.base import BaseEstimator + + MODULES_TO_IGNORE_SKLEARN = [ + "array_api_compat", + "tests", + "experimental", + "conftest", + ] + + result = all_objects( + object_types=BaseEstimator, + package_name=package_name, + modules_to_ignore=MODULES_TO_IGNORE_SKLEARN, + as_dataframe=as_dataframe, + return_names=return_names, + suppress_import_stdout=suppress_import_stdout, + ) + + return result diff --git a/openml/utils/_inmemory/__init__.py b/openml/utils/_inmemory/__init__.py new file mode 100644 index 000000000..07bdfba5a --- /dev/null +++ b/openml/utils/_inmemory/__init__.py @@ -0,0 +1 @@ +"""Utilities module for serializing and deserializing in-memory objects.""" diff --git a/openml/utils/_inmemory/_dict.py b/openml/utils/_inmemory/_dict.py new file mode 100644 index 000000000..09d59285e --- /dev/null +++ b/openml/utils/_inmemory/_dict.py @@ -0,0 +1,54 @@ +"""Utilities module for serializing and deserializing dicts.""" + + +def serialize_dict(d, mode="eval", name="d"): + """Serialize a dict as an executable Python code snippet. + + To deserialize, simply execute the code snippet in a Python environment. + + Command for deserialization: + + * if ``mode == "eval"``, use ``deserialized = exec(code_snippet)`` + * if ``mode == "exec"``, use ``exec(code_snippet)`` and then access the dict + + Parameters + ---------- + d : dict + The dictionary to serialize. + + mode : str, "eval" or "exec", default="eval" + The mode of serialization. + + * If ``"eval"``, the returned code snippet is an expression that evaluates to the dict. + * If ``"exec"``, the returned code snippet is a series of statements that assign the dict + to a variable named ``name``. + + name : str, default="d" + The variable name to assign the dict to. + Only used if mode is ``"exec"``. + + Returns + ------- + code_snippet : str + A string containing the Python code snippet that recreates the dict ``d``, + assigned to the specified variable name ``name``. + + Example + ------- + >>> my_dict = {'a': 'apple', 'b': 'banana'} + >>> serialized_dict = serialize_dict(my_dict, name="my_dict") + >>> deserialized_dict = eval(serialized_dict) + >>> assert deserialized_dict == my_dict + """ + def dq(s): + # Escape backslashes and double quotes for valid Python strings + return s.replace("\\", "\\\\").replace('"', '\\"') + + if mode == "eval": + lines = ["{"] + else: # mode == "exec" + lines = [f"{name} = {{"] + for k, v in d.items(): + lines.append(f' "{dq(k)}": "{dq(v)}",') + lines.append("}") + return "\n".join(lines) From abc0f4929285422427bdd55dd93cc81322ae2bc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Fri, 9 Jan 2026 21:29:44 +0100 Subject: [PATCH 6/8] Update _openml.py --- openml/utils/_openml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openml/utils/_openml.py b/openml/utils/_openml.py index 7e72e7aee..d86a62ce7 100644 --- a/openml/utils/_openml.py +++ b/openml/utils/_openml.py @@ -18,7 +18,7 @@ import openml._api_calls import openml.exceptions -from . 
import config +from openml import config # Avoid import cycles: https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles if TYPE_CHECKING: From 5a6161ff7ccb86f893d13bcd7b196b65e5b13313 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Fri, 9 Jan 2026 22:26:42 +0100 Subject: [PATCH 7/8] multi-import package --- openml/__init__.py | 7 + openml/base/_base_pkg.py | 1 + openml/models/_get.py | 14 +- openml/models/base/_base.py | 27 ++- openml/models/classification/auto_sklearn.py | 1 + openml/models/classification/scikit_learn.py | 227 +++++++++++++++++++ openml/models/classification/xgboost.py | 1 + 7 files changed, 274 insertions(+), 4 deletions(-) create mode 100644 openml/models/classification/scikit_learn.py diff --git a/openml/__init__.py b/openml/__init__.py index 7eb077057..a9a732fc9 100644 --- a/openml/__init__.py +++ b/openml/__init__.py @@ -124,3 +124,10 @@ def populate_cache( "__version__", "get", ] + + +def __getattr__(name: str): + if name in __all__: + return globals()[name] + if name not in __all__: + return get(name) diff --git a/openml/base/_base_pkg.py b/openml/base/_base_pkg.py index 690b93a86..729619fa9 100644 --- a/openml/base/_base_pkg.py +++ b/openml/base/_base_pkg.py @@ -20,6 +20,7 @@ class _BasePkg(BaseObject): "pkg_obj": "reference", # or "code" "pkg_obj_type": None, # openml API type "pkg_compression": "zlib", # compression + "pkg_pypi_name": None, # PyPI package name of objects } def __init__(self): diff --git a/openml/models/_get.py b/openml/models/_get.py index 75b807ca7..7762b8013 100644 --- a/openml/models/_get.py +++ b/openml/models/_get.py @@ -27,7 +27,7 @@ def get(id: str): obj = id_lookup.get(id) if obj is None: raise ValueError(f"Error in openml.get, object with package id {id} " "does not exist.") - return obj().materialize() + return obj(id).materialize() # todo: need to generalize this later to more types @@ -41,8 +41,16 @@ def _id_lookup(obj_type=None): def _id_lookup_cached(obj_type=None): all_objs = _all_objects(obj_type=obj_type) - # todo: generalize that pkg can contain more than one object - return {obj.get_class_tag("pkg_id"): obj for obj in all_objs} + lookup_dict = {} + for obj in all_objs: + obj_index = obj.get_class_tag("pkg_id") + if obj_index != "__multiple": + lookup_dict[obj_index] = obj + else: + obj_all_ids = obj.contained_ids() + lookup_dict.update({obj_id: obj for obj_id in obj_all_ids}) + + return lookup_dict @lru_cache diff --git a/openml/models/base/_base.py b/openml/models/base/_base.py index 6b3fa2a92..13166cfe2 100644 --- a/openml/models/base/_base.py +++ b/openml/models/base/_base.py @@ -7,6 +7,28 @@ class _OpenmlModelPkg(_BasePkg): _obj = None + _obj_dict = {} + + def __init__(self, id=None): + super().__init__() + + pkg_id = self.get_tag("pkg_id") + if pkg_id == "__multiple": + self._obj = self._obj_dict.get(id, None) + + @classmethod + def contained_ids(cls): + """Return list of ids of objects contained in this package. 
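+
+        For a package holding a single object, this is the one-element list
+        containing the ``pkg_id`` tag; for a multi-object package
+        (``pkg_id == "__multiple"``), the list of keys of ``_obj_dict``.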
+
+        Returns
+        -------
+        ids : list of str
+            list of unique identifiers of objects contained in this package
+        """
+        pkg_id = cls.get_class_tag("pkg_id")
+        if pkg_id != "__multiple":
+            return [cls.get_class_tag("pkg_id")]
+        return list(cls._obj_dict.keys())
 
     def _materialize(self):
         pkg_obj = self.get_tag("pkg_obj")
@@ -23,7 +45,10 @@ def _materialize(self):
         if pkg_obj == "reference":
             from skbase.utils.dependencies import _safe_import
 
-            return _safe_import(self._obj)
+            obj_loc = self._obj
+            pkg_name = self.get_tag("pkg_pypi_name")
+
+            return _safe_import(obj_loc, pkg_name=pkg_name)
 
         if pkg_obj == "code":
             exec(self._obj)
diff --git a/openml/models/classification/auto_sklearn.py b/openml/models/classification/auto_sklearn.py
index 1d29044da..c4d926e72 100644
--- a/openml/models/classification/auto_sklearn.py
+++ b/openml/models/classification/auto_sklearn.py
@@ -9,6 +9,7 @@ class OpenmlPkg__AutoSklearnClassifier(_ModelPkgClassifier):
     _tags = {
         "pkg_id": "AutoSklearnClassifier",
         "python_dependencies": "auto-sklearn",
+        "pkg_pypi_name": "auto-sklearn",
     }
 
     _obj = "autosklearn.classification.AutoSklearnClassifier"
diff --git a/openml/models/classification/scikit_learn.py b/openml/models/classification/scikit_learn.py
new file mode 100644
index 000000000..dd05d3d46
--- /dev/null
+++ b/openml/models/classification/scikit_learn.py
@@ -0,0 +1,227 @@
+"""scikit-learn estimators."""
+
+from __future__ import annotations
+
+from openml.models.apis import _ModelPkgClassifier
+
+
+class OpenmlPkg__Sklearn(_ModelPkgClassifier):
+    _tags = {
+        "pkg_id": "__multiple",
+        "python_dependencies": "scikit-learn",
+        "pkg_pypi_name": "scikit-learn",
+    }
+
+    # obtained via utils._indexing._preindex_sklearn
+    # todo: automate generation
+    # todo: include version bounds for availability
+    # todo: test generated index against actual index
+    _obj_dict = {
+        "ARDRegression": "sklearn.linear_model._bayes.ARDRegression",
+        "AdaBoostClassifier": "sklearn.ensemble._weight_boosting.AdaBoostClassifier",
+        "AdaBoostRegressor": "sklearn.ensemble._weight_boosting.AdaBoostRegressor",
+        "AdditiveChi2Sampler": "sklearn.kernel_approximation.AdditiveChi2Sampler",
+        "AffinityPropagation": "sklearn.cluster._affinity_propagation.AffinityPropagation",
+        "AgglomerativeClustering": "sklearn.cluster._agglomerative.AgglomerativeClustering",
+        "BaggingClassifier": "sklearn.ensemble._bagging.BaggingClassifier",
+        "BaggingRegressor": "sklearn.ensemble._bagging.BaggingRegressor",
+        "BayesianGaussianMixture": "sklearn.mixture._bayesian_mixture.BayesianGaussianMixture",
+        "BayesianRidge": "sklearn.linear_model._bayes.BayesianRidge",
+        "BernoulliNB": "sklearn.naive_bayes.BernoulliNB",
+        "BernoulliRBM": "sklearn.neural_network._rbm.BernoulliRBM",
+        "Binarizer": "sklearn.preprocessing._data.Binarizer",
+        "Birch": "sklearn.cluster._birch.Birch",
+        "BisectingKMeans": "sklearn.cluster._bisect_k_means.BisectingKMeans",
+        "CCA": "sklearn.cross_decomposition._pls.CCA",
+        "CalibratedClassifierCV": "sklearn.calibration.CalibratedClassifierCV",
+        "CategoricalNB": "sklearn.naive_bayes.CategoricalNB",
+        "ClassifierChain": "sklearn.multioutput.ClassifierChain",
+        "ColumnTransformer": "sklearn.compose._column_transformer.ColumnTransformer",
+        "ComplementNB": "sklearn.naive_bayes.ComplementNB",
+        "CountVectorizer": "sklearn.feature_extraction.text.CountVectorizer",
+        "DBSCAN": "sklearn.cluster._dbscan.DBSCAN",
+        "DecisionTreeClassifier": "sklearn.tree._classes.DecisionTreeClassifier",
+        "DecisionTreeRegressor": "sklearn.tree._classes.DecisionTreeRegressor",
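+        # note: at materialization time, a value string is resolved lazily via
+        # _safe_import(value, pkg_name="scikit-learn"); the values are private
+        # sklearn module paths and may move between scikit-learn versions
+        # (see the version-bounds todo above)
+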
"DictVectorizer": "sklearn.feature_extraction._dict_vectorizer.DictVectorizer", + "DictionaryLearning": "sklearn.decomposition._dict_learning.DictionaryLearning", + "DummyClassifier": "sklearn.dummy.DummyClassifier", + "DummyRegressor": "sklearn.dummy.DummyRegressor", + "ElasticNet": "sklearn.linear_model._coordinate_descent.ElasticNet", + "ElasticNetCV": "sklearn.linear_model._coordinate_descent.ElasticNetCV", + "EllipticEnvelope": "sklearn.covariance._elliptic_envelope.EllipticEnvelope", + "EmpiricalCovariance": "sklearn.covariance._empirical_covariance.EmpiricalCovariance", + "ExtraTreeClassifier": "sklearn.tree._classes.ExtraTreeClassifier", + "ExtraTreeRegressor": "sklearn.tree._classes.ExtraTreeRegressor", + "ExtraTreesClassifier": "sklearn.ensemble._forest.ExtraTreesClassifier", + "ExtraTreesRegressor": "sklearn.ensemble._forest.ExtraTreesRegressor", + "FactorAnalysis": "sklearn.decomposition._factor_analysis.FactorAnalysis", + "FastICA": "sklearn.decomposition._fastica.FastICA", + "FeatureAgglomeration": "sklearn.cluster._agglomerative.FeatureAgglomeration", + "FeatureHasher": "sklearn.feature_extraction._hash.FeatureHasher", + "FeatureUnion": "sklearn.pipeline.FeatureUnion", + "FixedThresholdClassifier": "sklearn.model_selection._classification_threshold.FixedThresholdClassifier", + "FrozenEstimator": "sklearn.frozen._frozen.FrozenEstimator", + "FunctionTransformer": "sklearn.preprocessing._function_transformer.FunctionTransformer", + "GammaRegressor": "sklearn.linear_model._glm.glm.GammaRegressor", + "GaussianMixture": "sklearn.mixture._gaussian_mixture.GaussianMixture", + "GaussianNB": "sklearn.naive_bayes.GaussianNB", + "GaussianProcessClassifier": "sklearn.gaussian_process._gpc.GaussianProcessClassifier", + "GaussianProcessRegressor": "sklearn.gaussian_process._gpr.GaussianProcessRegressor", + "GaussianRandomProjection": "sklearn.random_projection.GaussianRandomProjection", + "GenericUnivariateSelect": "sklearn.feature_selection._univariate_selection.GenericUnivariateSelect", + "GradientBoostingClassifier": "sklearn.ensemble._gb.GradientBoostingClassifier", + "GradientBoostingRegressor": "sklearn.ensemble._gb.GradientBoostingRegressor", + "GraphicalLasso": "sklearn.covariance._graph_lasso.GraphicalLasso", + "GraphicalLassoCV": "sklearn.covariance._graph_lasso.GraphicalLassoCV", + "GridSearchCV": "sklearn.model_selection._search.GridSearchCV", + "HDBSCAN": "sklearn.cluster._hdbscan.hdbscan.HDBSCAN", + "HashingVectorizer": "sklearn.feature_extraction.text.HashingVectorizer", + "HistGradientBoostingClassifier": "sklearn.ensemble._hist_gradient_boosting.gradient_boosting.HistGradientBoostingClassifier", + "HistGradientBoostingRegressor": "sklearn.ensemble._hist_gradient_boosting.gradient_boosting.HistGradientBoostingRegressor", + "HuberRegressor": "sklearn.linear_model._huber.HuberRegressor", + "IncrementalPCA": "sklearn.decomposition._incremental_pca.IncrementalPCA", + "IsolationForest": "sklearn.ensemble._iforest.IsolationForest", + "Isomap": "sklearn.manifold._isomap.Isomap", + "IsotonicRegression": "sklearn.isotonic.IsotonicRegression", + "KBinsDiscretizer": "sklearn.preprocessing._discretization.KBinsDiscretizer", + "KMeans": "sklearn.cluster._kmeans.KMeans", + "KNNImputer": "sklearn.impute._knn.KNNImputer", + "KNeighborsClassifier": "sklearn.neighbors._classification.KNeighborsClassifier", + "KNeighborsRegressor": "sklearn.neighbors._regression.KNeighborsRegressor", + "KNeighborsTransformer": "sklearn.neighbors._graph.KNeighborsTransformer", + "KernelCenterer": 
"sklearn.preprocessing._data.KernelCenterer", + "KernelDensity": "sklearn.neighbors._kde.KernelDensity", + "KernelPCA": "sklearn.decomposition._kernel_pca.KernelPCA", + "KernelRidge": "sklearn.kernel_ridge.KernelRidge", + "LabelBinarizer": "sklearn.preprocessing._label.LabelBinarizer", + "LabelEncoder": "sklearn.preprocessing._label.LabelEncoder", + "LabelPropagation": "sklearn.semi_supervised._label_propagation.LabelPropagation", + "LabelSpreading": "sklearn.semi_supervised._label_propagation.LabelSpreading", + "Lars": "sklearn.linear_model._least_angle.Lars", + "LarsCV": "sklearn.linear_model._least_angle.LarsCV", + "Lasso": "sklearn.linear_model._coordinate_descent.Lasso", + "LassoCV": "sklearn.linear_model._coordinate_descent.LassoCV", + "LassoLars": "sklearn.linear_model._least_angle.LassoLars", + "LassoLarsCV": "sklearn.linear_model._least_angle.LassoLarsCV", + "LassoLarsIC": "sklearn.linear_model._least_angle.LassoLarsIC", + "LatentDirichletAllocation": "sklearn.decomposition._lda.LatentDirichletAllocation", + "LedoitWolf": "sklearn.covariance._shrunk_covariance.LedoitWolf", + "LinearDiscriminantAnalysis": "sklearn.discriminant_analysis.LinearDiscriminantAnalysis", + "LinearRegression": "sklearn.linear_model._base.LinearRegression", + "LinearSVC": "sklearn.svm._classes.LinearSVC", + "LinearSVR": "sklearn.svm._classes.LinearSVR", + "LocalOutlierFactor": "sklearn.neighbors._lof.LocalOutlierFactor", + "LocallyLinearEmbedding": "sklearn.manifold._locally_linear.LocallyLinearEmbedding", + "LogisticRegression": "sklearn.linear_model._logistic.LogisticRegression", + "LogisticRegressionCV": "sklearn.linear_model._logistic.LogisticRegressionCV", + "MDS": "sklearn.manifold._mds.MDS", + "MLPClassifier": "sklearn.neural_network._multilayer_perceptron.MLPClassifier", + "MLPRegressor": "sklearn.neural_network._multilayer_perceptron.MLPRegressor", + "MaxAbsScaler": "sklearn.preprocessing._data.MaxAbsScaler", + "MeanShift": "sklearn.cluster._mean_shift.MeanShift", + "MinCovDet": "sklearn.covariance._robust_covariance.MinCovDet", + "MinMaxScaler": "sklearn.preprocessing._data.MinMaxScaler", + "MiniBatchDictionaryLearning": "sklearn.decomposition._dict_learning.MiniBatchDictionaryLearning", + "MiniBatchKMeans": "sklearn.cluster._kmeans.MiniBatchKMeans", + "MiniBatchNMF": "sklearn.decomposition._nmf.MiniBatchNMF", + "MiniBatchSparsePCA": "sklearn.decomposition._sparse_pca.MiniBatchSparsePCA", + "MissingIndicator": "sklearn.impute._base.MissingIndicator", + "MultiLabelBinarizer": "sklearn.preprocessing._label.MultiLabelBinarizer", + "MultiOutputClassifier": "sklearn.multioutput.MultiOutputClassifier", + "MultiOutputRegressor": "sklearn.multioutput.MultiOutputRegressor", + "MultiTaskElasticNet": "sklearn.linear_model._coordinate_descent.MultiTaskElasticNet", + "MultiTaskElasticNetCV": "sklearn.linear_model._coordinate_descent.MultiTaskElasticNetCV", + "MultiTaskLasso": "sklearn.linear_model._coordinate_descent.MultiTaskLasso", + "MultiTaskLassoCV": "sklearn.linear_model._coordinate_descent.MultiTaskLassoCV", + "MultinomialNB": "sklearn.naive_bayes.MultinomialNB", + "NMF": "sklearn.decomposition._nmf.NMF", + "NearestCentroid": "sklearn.neighbors._nearest_centroid.NearestCentroid", + "NearestNeighbors": "sklearn.neighbors._unsupervised.NearestNeighbors", + "NeighborhoodComponentsAnalysis": "sklearn.neighbors._nca.NeighborhoodComponentsAnalysis", + "Normalizer": "sklearn.preprocessing._data.Normalizer", + "NuSVC": "sklearn.svm._classes.NuSVC", + "NuSVR": "sklearn.svm._classes.NuSVR", + "Nystroem": 
"sklearn.kernel_approximation.Nystroem", + "OAS": "sklearn.covariance._shrunk_covariance.OAS", + "OPTICS": "sklearn.cluster._optics.OPTICS", + "OneClassSVM": "sklearn.svm._classes.OneClassSVM", + "OneHotEncoder": "sklearn.preprocessing._encoders.OneHotEncoder", + "OneVsOneClassifier": "sklearn.multiclass.OneVsOneClassifier", + "OneVsRestClassifier": "sklearn.multiclass.OneVsRestClassifier", + "OrdinalEncoder": "sklearn.preprocessing._encoders.OrdinalEncoder", + "OrthogonalMatchingPursuit": "sklearn.linear_model._omp.OrthogonalMatchingPursuit", + "OrthogonalMatchingPursuitCV": "sklearn.linear_model._omp.OrthogonalMatchingPursuitCV", + "OutputCodeClassifier": "sklearn.multiclass.OutputCodeClassifier", + "PCA": "sklearn.decomposition._pca.PCA", + "PLSCanonical": "sklearn.cross_decomposition._pls.PLSCanonical", + "PLSRegression": "sklearn.cross_decomposition._pls.PLSRegression", + "PLSSVD": "sklearn.cross_decomposition._pls.PLSSVD", + "PassiveAggressiveClassifier": "sklearn.linear_model._passive_aggressive.PassiveAggressiveClassifier", + "PassiveAggressiveRegressor": "sklearn.linear_model._passive_aggressive.PassiveAggressiveRegressor", + "PatchExtractor": "sklearn.feature_extraction.image.PatchExtractor", + "Perceptron": "sklearn.linear_model._perceptron.Perceptron", + "Pipeline": "sklearn.pipeline.Pipeline", + "PoissonRegressor": "sklearn.linear_model._glm.glm.PoissonRegressor", + "PolynomialCountSketch": "sklearn.kernel_approximation.PolynomialCountSketch", + "PolynomialFeatures": "sklearn.preprocessing._polynomial.PolynomialFeatures", + "PowerTransformer": "sklearn.preprocessing._data.PowerTransformer", + "QuadraticDiscriminantAnalysis": "sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis", + "QuantileRegressor": "sklearn.linear_model._quantile.QuantileRegressor", + "QuantileTransformer": "sklearn.preprocessing._data.QuantileTransformer", + "RANSACRegressor": "sklearn.linear_model._ransac.RANSACRegressor", + "RBFSampler": "sklearn.kernel_approximation.RBFSampler", + "RFE": "sklearn.feature_selection._rfe.RFE", + "RFECV": "sklearn.feature_selection._rfe.RFECV", + "RadiusNeighborsClassifier": "sklearn.neighbors._classification.RadiusNeighborsClassifier", + "RadiusNeighborsRegressor": "sklearn.neighbors._regression.RadiusNeighborsRegressor", + "RadiusNeighborsTransformer": "sklearn.neighbors._graph.RadiusNeighborsTransformer", + "RandomForestClassifier": "sklearn.ensemble._forest.RandomForestClassifier", + "RandomForestRegressor": "sklearn.ensemble._forest.RandomForestRegressor", + "RandomTreesEmbedding": "sklearn.ensemble._forest.RandomTreesEmbedding", + "RandomizedSearchCV": "sklearn.model_selection._search.RandomizedSearchCV", + "RegressorChain": "sklearn.multioutput.RegressorChain", + "Ridge": "sklearn.linear_model._ridge.Ridge", + "RidgeCV": "sklearn.linear_model._ridge.RidgeCV", + "RidgeClassifier": "sklearn.linear_model._ridge.RidgeClassifier", + "RidgeClassifierCV": "sklearn.linear_model._ridge.RidgeClassifierCV", + "RobustScaler": "sklearn.preprocessing._data.RobustScaler", + "SGDClassifier": "sklearn.linear_model._stochastic_gradient.SGDClassifier", + "SGDOneClassSVM": "sklearn.linear_model._stochastic_gradient.SGDOneClassSVM", + "SGDRegressor": "sklearn.linear_model._stochastic_gradient.SGDRegressor", + "SVC": "sklearn.svm._classes.SVC", + "SVR": "sklearn.svm._classes.SVR", + "SelectFdr": "sklearn.feature_selection._univariate_selection.SelectFdr", + "SelectFpr": "sklearn.feature_selection._univariate_selection.SelectFpr", + "SelectFromModel": 
"sklearn.feature_selection._from_model.SelectFromModel", + "SelectFwe": "sklearn.feature_selection._univariate_selection.SelectFwe", + "SelectKBest": "sklearn.feature_selection._univariate_selection.SelectKBest", + "SelectPercentile": "sklearn.feature_selection._univariate_selection.SelectPercentile", + "SelfTrainingClassifier": "sklearn.semi_supervised._self_training.SelfTrainingClassifier", + "SequentialFeatureSelector": "sklearn.feature_selection._sequential.SequentialFeatureSelector", + "ShrunkCovariance": "sklearn.covariance._shrunk_covariance.ShrunkCovariance", + "SimpleImputer": "sklearn.impute._base.SimpleImputer", + "SkewedChi2Sampler": "sklearn.kernel_approximation.SkewedChi2Sampler", + "SparseCoder": "sklearn.decomposition._dict_learning.SparseCoder", + "SparsePCA": "sklearn.decomposition._sparse_pca.SparsePCA", + "SparseRandomProjection": "sklearn.random_projection.SparseRandomProjection", + "SpectralBiclustering": "sklearn.cluster._bicluster.SpectralBiclustering", + "SpectralClustering": "sklearn.cluster._spectral.SpectralClustering", + "SpectralCoclustering": "sklearn.cluster._bicluster.SpectralCoclustering", + "SpectralEmbedding": "sklearn.manifold._spectral_embedding.SpectralEmbedding", + "SplineTransformer": "sklearn.preprocessing._polynomial.SplineTransformer", + "StackingClassifier": "sklearn.ensemble._stacking.StackingClassifier", + "StackingRegressor": "sklearn.ensemble._stacking.StackingRegressor", + "StandardScaler": "sklearn.preprocessing._data.StandardScaler", + "TSNE": "sklearn.manifold._t_sne.TSNE", + "TargetEncoder": "sklearn.preprocessing._target_encoder.TargetEncoder", + "TfidfTransformer": "sklearn.feature_extraction.text.TfidfTransformer", + "TfidfVectorizer": "sklearn.feature_extraction.text.TfidfVectorizer", + "TheilSenRegressor": "sklearn.linear_model._theil_sen.TheilSenRegressor", + "TransformedTargetRegressor": "sklearn.compose._target.TransformedTargetRegressor", + "TruncatedSVD": "sklearn.decomposition._truncated_svd.TruncatedSVD", + "TunedThresholdClassifierCV": "sklearn.model_selection._classification_threshold.TunedThresholdClassifierCV", + "TweedieRegressor": "sklearn.linear_model._glm.glm.TweedieRegressor", + "VarianceThreshold": "sklearn.feature_selection._variance_threshold.VarianceThreshold", + "VotingClassifier": "sklearn.ensemble._voting.VotingClassifier", + "VotingRegressor": "sklearn.ensemble._voting.VotingRegressor", + } diff --git a/openml/models/classification/xgboost.py b/openml/models/classification/xgboost.py index 5b91e647c..b320fcabf 100644 --- a/openml/models/classification/xgboost.py +++ b/openml/models/classification/xgboost.py @@ -9,6 +9,7 @@ class OpenmlPkg__XGBClassifier(_ModelPkgClassifier): _tags = { "pkg_id": "XGBClassifier", "python_dependencies": "xgboost", + "pkg_pypi_name": "xgboost", } _obj = "xgboost.XGBClassifier" From f6f050acc786ba196468ff7b51cf3a80ce98ffa3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 9 Jan 2026 21:26:56 +0000 Subject: [PATCH 8/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- openml/__init__.py | 1 + openml/utils/_indexing/_preindex_sklearn.py | 8 ++++---- openml/utils/_inmemory/_dict.py | 5 ++++- openml/utils/_openml.py | 1 - 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/openml/__init__.py b/openml/__init__.py index a9a732fc9..c9a90e45a 100644 --- a/openml/__init__.py +++ b/openml/__init__.py @@ -131,3 +131,4 @@ def __getattr__(name: str): return 
globals()[name] if name not in __all__: return get(name) + return None diff --git a/openml/utils/_indexing/_preindex_sklearn.py b/openml/utils/_indexing/_preindex_sklearn.py index 5291f3adb..bf4f8130b 100644 --- a/openml/utils/_indexing/_preindex_sklearn.py +++ b/openml/utils/_indexing/_preindex_sklearn.py @@ -1,6 +1,8 @@ """Registry lookup methods - scikit-learn estimators.""" + # adapted from the sktime utility of the same name # copyright: sktime developers, BSD-3-Clause License (see LICENSE file) +from __future__ import annotations __author__ = ["fkiraly"] # all_estimators is also based on the sklearn utility of the same name @@ -115,7 +117,7 @@ def _all_sklearn_estimators( will be the name of the column of estimator class names and the string(s) passed in return_tags will serve as column names for all columns of tags that were optionally requested. - """ # noqa: E501 + """ from sklearn.base import BaseEstimator MODULES_TO_IGNORE_SKLEARN = [ @@ -125,7 +127,7 @@ def _all_sklearn_estimators( "conftest", ] - result = all_objects( + return all_objects( object_types=BaseEstimator, package_name=package_name, modules_to_ignore=MODULES_TO_IGNORE_SKLEARN, @@ -133,5 +135,3 @@ def _all_sklearn_estimators( return_names=return_names, suppress_import_stdout=suppress_import_stdout, ) - - return result diff --git a/openml/utils/_inmemory/_dict.py b/openml/utils/_inmemory/_dict.py index 09d59285e..c27e78dd7 100644 --- a/openml/utils/_inmemory/_dict.py +++ b/openml/utils/_inmemory/_dict.py @@ -1,5 +1,7 @@ """Utilities module for serializing and deserializing dicts.""" +from __future__ import annotations + def serialize_dict(d, mode="eval", name="d"): """Serialize a dict as an executable Python code snippet. @@ -40,13 +42,14 @@ def serialize_dict(d, mode="eval", name="d"): >>> deserialized_dict = eval(serialized_dict) >>> assert deserialized_dict == my_dict """ + def dq(s): # Escape backslashes and double quotes for valid Python strings return s.replace("\\", "\\\\").replace('"', '\\"') if mode == "eval": lines = ["{"] - else: # mode == "exec" + else: # mode == "exec" lines = [f"{name} = {{"] for k, v in d.items(): lines.append(f' "{dq(k)}": "{dq(v)}",') diff --git a/openml/utils/_openml.py b/openml/utils/_openml.py index d86a62ce7..f20aedcca 100644 --- a/openml/utils/_openml.py +++ b/openml/utils/_openml.py @@ -17,7 +17,6 @@ import openml import openml._api_calls import openml.exceptions - from openml import config # Avoid import cycles: https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles
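
---

Usage sketch for the series above (illustrative; assumes all eight patches
are applied and scikit-learn is installed, and is not itself part of the
diffs):

    import openml

    # id lookup assembled from the pkg_id tags and contained_ids()
    RandomForestClassifier = openml.get("RandomForestClassifier")
    clf = RandomForestClassifier(n_estimators=10)

    # equivalent, via the module-level __getattr__ fallback from PATCH 7/8
    clf = openml.RandomForestClassifier(n_estimators=10)

If the dependency is missing, get raises ModuleNotFoundError from the check
in _BasePkg.materialize; unknown ids raise ValueError from models._get.get.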