diff --git a/openml/_api/__init__.py b/openml/_api/__init__.py
new file mode 100644
index 000000000..881f40671
--- /dev/null
+++ b/openml/_api/__init__.py
@@ -0,0 +1,13 @@
+from openml._api.runtime.core import APIContext
+
+
+def set_api_version(version: str, *, strict: bool = False) -> None:
+    """Rebuild the backend held by the module-level ``api_context``.
+
+    ``version`` and ``strict`` are forwarded unchanged to ``APIContext.set_version``.
+    """
+    api_context.set_version(version=version, strict=strict)
+
+
+# Shared context instance, created at import time.
+api_context = APIContext()
diff --git a/openml/_api/config.py b/openml/_api/config.py
new file mode 100644
index 000000000..848fe8da1
--- /dev/null
+++ b/openml/_api/config.py
@@ -0,0 +1,74 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Literal
+
+DelayMethod = Literal["human", "robot"]
+
+
+@dataclass
+class APIConfig:
+    """Server address, base path, API key and timeout for one API version."""
+
+    server: str
+    base_url: str
+    key: str
+    timeout: int = 10  # seconds
+
+
+@dataclass
+class APISettings:
+    """Pairs the V1 and V2 connection configurations."""
+
+    v1: APIConfig
+    v2: APIConfig
+
+
+@dataclass
+class ConnectionConfig:
+    """Retry and delay settings; ``delay_method`` is validated on construction."""
+
+    retries: int = 3
+    delay_method: DelayMethod = "human"
+    delay_time: int = 1  # seconds
+
+    def __post_init__(self) -> None:
+        # ``Literal`` annotations are not enforced at runtime, so check explicitly.
+        if self.delay_method not in ("human", "robot"):
+            raise ValueError(f"delay_method must be 'human' or 'robot', got {self.delay_method}")
+
+
+@dataclass
+class CacheConfig:
+    """Cache directory and time-to-live."""
+
+    dir: str = "~/.openml/cache"
+    ttl: int = 60 * 60 * 24 * 7  # one week
+
+
+@dataclass
+class Settings:
+    """Top-level container grouping the API, connection and cache settings."""
+
+    api: APISettings
+    connection: ConnectionConfig
+    cache: CacheConfig
+
+
+# Module-level defaults; the "..." keys look like placeholders (TODO confirm).
+settings = Settings(
+    api=APISettings(
+        v1=APIConfig(
+            server="https://www.openml.org/",
+            base_url="api/v1/xml/",
+            key="...",
+        ),
+        v2=APIConfig(
+            server="http://127.0.0.1:8001/",
+            base_url="",
+            key="...",
+        ),
+    ),
+    connection=ConnectionConfig(),
+    cache=CacheConfig(),
+)
diff --git a/openml/_api/http/__init__.py b/openml/_api/http/__init__.py
new file mode 100644
index 000000000..8e6d1e4ce
--- /dev/null
+++ b/openml/_api/http/__init__.py
@@ -0,0 +1,3 @@
+from openml._api.http.client import HTTPClient
+
+__all__ = ["HTTPClient"]
diff --git a/openml/_api/http/client.py
b/openml/_api/http/client.py
new file mode 100644
index 000000000..a90e93933
--- /dev/null
+++ b/openml/_api/http/client.py
@@ -0,0 +1,176 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+from urllib.parse import urlencode, urljoin, urlparse
+
+import requests
+from requests import Response
+
+from openml.__version__ import __version__
+from openml._api.config import settings
+
+if TYPE_CHECKING:
+    from openml._api.config import APIConfig
+
+
+class CacheMixin:
+    """Cache-location helpers; reading and writing responses are still stubs."""
+
+    @property
+    def dir(self) -> str:
+        return settings.cache.dir
+
+    @property
+    def ttl(self) -> int:
+        return settings.cache.ttl
+
+    def _get_cache_dir(self, url: str, params: dict[str, Any]) -> Path:
+        """Map a request onto a directory below the configured cache root.
+
+        The path is built from the reversed dot-separated parts of the network
+        location, the URL path segments and, when present, the url-encoded query
+        parameters without ``api_key``.
+        """
+        parsed_url = urlparse(url)
+        netloc_parts = parsed_url.netloc.split(".")[::-1]  # reverse domain
+        path_parts = parsed_url.path.strip("/").split("/")
+
+        # remove api_key and serialize params if any
+        filtered_params = {k: v for k, v in params.items() if k != "api_key"}
+        params_part = [urlencode(filtered_params)] if filtered_params else []
+
+        # The configured dir may start with "~", which pathlib does not expand implicitly.
+        cache_root = Path(self.dir).expanduser()
+        return cache_root.joinpath(*netloc_parts, *path_parts, *params_part)
+
+    def _get_cache_response(self, cache_dir: Path) -> Response:  # noqa: ARG002
+        # Stub: always returns an empty ``Response``.
+        return Response()
+
+    def _set_cache_response(self, cache_dir: Path, response: Response) -> None:  # noqa: ARG002
+        # Stub: nothing is stored yet.
+        return None
+
+
+class HTTPClient(CacheMixin):
+    """Thin ``requests`` wrapper bound to one ``APIConfig``."""
+
+    def __init__(self, config: APIConfig) -> None:
+        self.config = config
+        self.headers: dict[str, str] = {"user-agent": f"openml-python/{__version__}"}
+
+    @property
+    def server(self) -> str:
+        return self.config.server
+
+    @property
+    def base_url(self) -> str:
+        return self.config.base_url
+
+    @property
+    def key(self) -> str:
+        return self.config.key
+
+    @property
+    def timeout(self) -> int:
+        return self.config.timeout
+
+    def request(
+        self,
+        method: str,
+        path: str,
+        *,
+        use_cache: bool = False,
+        use_api_key: bool = False,
+        **request_kwargs: Any,
+    ) -> Response:
+        """Send ``method`` to the URL joined from ``server``, ``base_url`` and ``path``.
+
+        ``params``, ``headers`` and ``timeout`` are popped from ``request_kwargs``;
+        the remaining keyword arguments go to ``requests.request`` unchanged. With
+        ``use_api_key`` the configured key is sent as the ``api_key`` query
+        parameter. The client's default headers override caller-supplied ones.
+        """
+        url = urljoin(self.server, urljoin(self.base_url, path))
+
+        # ``or {}`` also covers an explicit ``params=None`` / ``headers=None``;
+        # building new dicts leaves the caller's mappings untouched.
+        params = dict(request_kwargs.pop("params", None) or {})
+        if use_api_key:
+            params["api_key"] = self.key
+
+        headers = dict(request_kwargs.pop("headers", None) or {})
+        headers.update(self.headers)
+
+        timeout = request_kwargs.pop("timeout", self.timeout)
+        cache_dir = self._get_cache_dir(url, params)
+
+        if use_cache:
+            # NOTE(review): the lookup is a stub returning an empty ``Response`` and
+            # the handler only re-raises, so this branch never reaches the network.
+            try:
+                return self._get_cache_response(cache_dir)
+            # TODO: handle ttl expired error
+            except Exception:
+                raise
+
+        response = requests.request(
+            method=method,
+            url=url,
+            params=params,
+            headers=headers,
+            timeout=timeout,
+            **request_kwargs,
+        )
+
+        if use_cache:
+            self._set_cache_response(cache_dir, response)
+
+        return response
+
+    def get(
+        self,
+        path: str,
+        *,
+        use_cache: bool = False,
+        use_api_key: bool = False,
+        **request_kwargs: Any,
+    ) -> Response:
+        """Issue a GET request; ``use_cache`` is currently forced to ``False``."""
+        # TODO: remove override when cache is implemented
+        use_cache = False
+        return self.request(
+            method="GET",
+            path=path,
+            use_cache=use_cache,
+            use_api_key=use_api_key,
+            **request_kwargs,
+        )
+
+    def post(
+        self,
+        path: str,
+        **request_kwargs: Any,
+    ) -> Response:
+        """Issue an uncached POST request with the API key attached."""
+        return self.request(
+            method="POST",
+            path=path,
+            use_cache=False,
+            use_api_key=True,
+            **request_kwargs,
+        )
+
+    def delete(
+        self,
+        path: str,
+        **request_kwargs: Any,
+    ) -> Response:
+        """Issue an uncached DELETE request with the API key attached."""
+        return self.request(
+            method="DELETE",
+            path=path,
+            use_cache=False,
+            use_api_key=True,
+            **request_kwargs,
+        )
diff --git a/openml/_api/http/utils.py b/openml/_api/http/utils.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/openml/_api/resources/__init__.py b/openml/_api/resources/__init__.py
new file mode 100644
index 000000000..93db5eb2b
--- /dev/null
+++ b/openml/_api/resources/__init__.py
@@ -0,0 +1,5 @@
+from openml._api.resources.datasets import DatasetsV1, DatasetsV2
+from openml._api.resources.studies import StudiesV1, StudiesV2
+from openml._api.resources.tasks import TasksV1, TasksV2
+
+__all__ = ["DatasetsV1", "DatasetsV2", "StudiesV1", "StudiesV2", "TasksV1", "TasksV2"]
diff --git a/openml/_api/resources/base.py b/openml/_api/resources/base.py
new file mode 100644
index 000000000..d48517d58
--- /dev/null
+++ b/openml/_api/resources/base.py
@@ -0,0 +1,54 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from requests import Response
+
+    from openml._api.http import HTTPClient
+    from openml.datasets.dataset import OpenMLDataset
+    from openml.tasks.task import OpenMLTask
+
+
+class ResourceAPI:
+    """Base class that stores the HTTP client a resource endpoint talks through."""
+
+    def __init__(self, http: HTTPClient) -> None:
+        self._http = http
+
+
+class DatasetsAPI(ResourceAPI, ABC):
+    """Abstract interface for the dataset endpoints."""
+
+    # NOTE(review): unlike ``TasksAPI.get`` there is no ``return_response`` flag here,
+    # although the return annotation also allows a ``(dataset, response)`` tuple.
+    @abstractmethod
+    def get(self, dataset_id: int) -> OpenMLDataset | tuple[OpenMLDataset, Response]: ...
+
+
+class TasksAPI(ResourceAPI, ABC):
+    """Abstract interface for the task endpoints."""
+
+    @abstractmethod
+    def get(
+        self,
+        task_id: int,
+        *,
+        return_response: bool = False,
+    ) -> OpenMLTask | tuple[OpenMLTask, Response]: ...
+
+
+class StudiesAPI(ResourceAPI, ABC):
+    """Abstract interface for the study endpoints."""
+
+    @abstractmethod
+    def list(  # noqa: PLR0913
+        self,
+        limit: int | None = None,
+        offset: int | None = None,
+        status: str | None = None,
+        main_entity_type: str | None = None,
+        uploader: list[int] | None = None,
+        benchmark_suite: int | None = None,
+    ) -> Any: ...
diff --git a/openml/_api/resources/datasets.py b/openml/_api/resources/datasets.py
new file mode 100644
index 000000000..9ff1ec278
--- /dev/null
+++ b/openml/_api/resources/datasets.py
@@ -0,0 +1,24 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from openml._api.resources.base import DatasetsAPI
+
+if TYPE_CHECKING:
+    from requests import Response
+
+    from openml.datasets.dataset import OpenMLDataset
+
+
+class DatasetsV1(DatasetsAPI):
+    """V1 dataset endpoints (not implemented yet)."""
+
+    def get(self, dataset_id: int) -> OpenMLDataset | tuple[OpenMLDataset, Response]:
+        raise NotImplementedError
+
+
+class DatasetsV2(DatasetsAPI):
+    """V2 dataset endpoints (not implemented yet)."""
+
+    def get(self, dataset_id: int) -> OpenMLDataset | tuple[OpenMLDataset, Response]:
+        raise NotImplementedError
diff --git a/openml/_api/resources/studies.py b/openml/_api/resources/studies.py
new file mode 100644
index 000000000..cf05a878c
--- /dev/null
+++ b/openml/_api/resources/studies.py
@@ -0,0 +1,57 @@
+from __future__ import annotations
+
+from typing import Any
+
+from openml._api.resources.base import StudiesAPI
+
+
+class StudiesV1(StudiesAPI):
+    """Study listing through the V1 path-style API."""
+
+    def list(  # noqa: PLR0913
+        self,
+        limit: int | None = None,
+        offset: int | None = None,
+        status: str | None = None,
+        main_entity_type: str | None = None,
+        uploader: list[int] | None = None,
+        benchmark_suite: int | None = None,
+    ) -> Any:
+        """Return the response text of the ``study/list`` call.
+
+        Every argument that is not ``None`` is appended as a ``/name/value``
+        path segment; ``uploader`` ids are joined with commas.
+        """
+        api_call = "study/list"
+
+        if limit is not None:
+            api_call += f"/limit/{limit}"
+        if offset is not None:
+            api_call += f"/offset/{offset}"
+        if status is not None:
+            api_call += f"/status/{status}"
+        if main_entity_type is not None:
+            api_call += f"/main_entity_type/{main_entity_type}"
+        if uploader is not None:
+            api_call += f"/uploader/{','.join(str(u) for u in uploader)}"
+        if benchmark_suite is not None:
+            api_call += f"/benchmark_suite/{benchmark_suite}"
+
+        # Make the GET request and return the XML text
+        response = self._http.get(api_call)
+        return response.text
+
+
+class StudiesV2(StudiesAPI):
+    """Study listing through the V2 API (not implemented yet)."""
+
+    def list(  # noqa: PLR0913
+        self,
+        limit: int | None = None,
+        offset: int | None = None,
+        status: str | None = None,
+        main_entity_type: str | None = None,
+        uploader: list[int] | None = None,
+        benchmark_suite: int | None = None,
+    ) -> Any:
+        raise NotImplementedError("V2 API implementation is not yet available")
diff --git a/openml/_api/resources/tasks.py b/openml/_api/resources/tasks.py
new file mode 100644
index 000000000..f494fb9a3
--- /dev/null
+++ b/openml/_api/resources/tasks.py
@@ -0,0 +1,141 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import xmltodict
+
+from openml._api.resources.base import TasksAPI
+from openml.tasks.task import (
+    OpenMLClassificationTask,
+    OpenMLClusteringTask,
+    OpenMLLearningCurveTask,
+    OpenMLRegressionTask,
+    OpenMLTask,
+    TaskType,
+)
+
+if TYPE_CHECKING:
+    from requests import Response
+
+
+class TasksV1(TasksAPI):
+    """Task retrieval through the V1 XML API."""
+
+    def get(
+        self,
+        task_id: int,
+        *,
+        return_response: bool = False,
+    ) -> OpenMLTask | tuple[OpenMLTask, Response]:
+        """Fetch ``task/{task_id}`` and parse the XML body into a task.
+
+        With ``return_response=True`` the HTTP response is returned alongside
+        the task as a ``(task, response)`` tuple.
+        """
+        path = f"task/{task_id}"
+        response = self._http.get(path)
+        xml_content = response.text
+        task = self._create_task_from_xml(xml_content)
+
+        if return_response:
+            return task, response
+
+        return task
+
+    def _create_task_from_xml(self, xml: str) -> OpenMLTask:
+        """Create a task given an XML string.
+
+        Parameters
+        ----------
+        xml : string
+            Task xml representation.
+
+        Returns
+        -------
+        OpenMLTask
+        """
+        dic = xmltodict.parse(xml)["oml:task"]
+        estimation_parameters = {}
+        inputs = {}
+        # Due to the unordered structure we obtain, we first have to extract
+        # the possible keys of oml:input; dic["oml:input"] is a list of
+        # OrderedDicts
+
+        # Check if there is a list of inputs
+        if isinstance(dic["oml:input"], list):
+            for input_ in dic["oml:input"]:
+                name = input_["@name"]
+                inputs[name] = input_
+        # Single input case
+        elif isinstance(dic["oml:input"], dict):
+            name = dic["oml:input"]["@name"]
+            inputs[name] = dic["oml:input"]
+
+        evaluation_measures = None
+        if "evaluation_measures" in inputs:
+            evaluation_measures = inputs["evaluation_measures"]["oml:evaluation_measures"][
+                "oml:evaluation_measure"
+            ]
+
+        task_type = TaskType(int(dic["oml:task_type_id"]))
+        common_kwargs = {
+            "task_id": dic["oml:task_id"],
+            "task_type": dic["oml:task_type"],
+            "task_type_id": task_type,
+            "data_set_id": inputs["source_data"]["oml:data_set"]["oml:data_set_id"],
+            "evaluation_measure": evaluation_measures,
+        }
+        # TODO: add OpenMLClusteringTask?
+        if task_type in (
+            TaskType.SUPERVISED_CLASSIFICATION,
+            TaskType.SUPERVISED_REGRESSION,
+            TaskType.LEARNING_CURVE,
+        ):
+            # Convert some more parameters
+            parameters = inputs["estimation_procedure"]["oml:estimation_procedure"][
+                "oml:parameter"
+            ]
+            # xmltodict returns a single child element as a dict, not a one-item list.
+            if isinstance(parameters, dict):
+                parameters = [parameters]
+            for parameter in parameters:
+                name = parameter["@name"]
+                text = parameter.get("#text", "")
+                estimation_parameters[name] = text
+
+            common_kwargs["estimation_procedure_type"] = inputs["estimation_procedure"][
+                "oml:estimation_procedure"
+            ]["oml:type"]
+            common_kwargs["estimation_procedure_id"] = int(
+                inputs["estimation_procedure"]["oml:estimation_procedure"]["oml:id"]
+            )
+
+            common_kwargs["estimation_parameters"] = estimation_parameters
+            common_kwargs["target_name"] = inputs["source_data"]["oml:data_set"][
+                "oml:target_feature"
+            ]
+            common_kwargs["data_splits_url"] = inputs["estimation_procedure"][
+                "oml:estimation_procedure"
+            ]["oml:data_splits_url"]
+
+        cls = {
+            TaskType.SUPERVISED_CLASSIFICATION: OpenMLClassificationTask,
+            TaskType.SUPERVISED_REGRESSION: OpenMLRegressionTask,
+            TaskType.CLUSTERING: OpenMLClusteringTask,
+            TaskType.LEARNING_CURVE: OpenMLLearningCurveTask,
+        }.get(task_type)
+        if cls is None:
+            raise NotImplementedError(f"Task type {common_kwargs['task_type']} not supported.")
+        return cls(**common_kwargs)  # type: ignore
+
+
+class TasksV2(TasksAPI):
+    """Task retrieval through the V2 API (not implemented yet)."""
+
+    def get(
+        self,
+        task_id: int,
+        *,
+        return_response: bool = False,
+    ) -> OpenMLTask | tuple[OpenMLTask, Response]:
+        raise NotImplementedError
diff --git a/openml/_api/runtime/__init__.py b/openml/_api/runtime/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/openml/_api/runtime/core.py b/openml/_api/runtime/core.py
new file mode 100644
index 000000000..066be552b
--- /dev/null
+++ b/openml/_api/runtime/core.py
@@ -0,0 +1,73 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from openml._api.config import settings
+from openml._api.http.client import HTTPClient
+from openml._api.resources import (
+    DatasetsV1,
+    DatasetsV2,
+    StudiesV1,
+    StudiesV2,
+    TasksV1,
+    TasksV2,
+)
+
+if TYPE_CHECKING:
+    from openml._api.resources.base import DatasetsAPI, StudiesAPI, TasksAPI
+
+
+class APIBackend:
+    """Bundle of the resource clients that make up one API backend."""
+
+    def __init__(self, *, datasets: DatasetsAPI, tasks: TasksAPI, studies: StudiesAPI) -> None:
+        self.datasets = datasets
+        self.tasks = tasks
+        self.studies = studies
+
+
+def build_backend(version: str, *, strict: bool) -> APIBackend:
+    """Build the backend for ``version`` (``"v1"`` or ``"v2"``).
+
+    ``"v2"`` returns the V2 resources only when ``strict`` is true; otherwise
+    the V1 backend is returned.
+
+    Raises
+    ------
+    ValueError
+        If ``version`` is neither ``"v1"`` nor ``"v2"``.
+    """
+    # Reject typos instead of silently serving them from V1.
+    if version not in ("v1", "v2"):
+        raise ValueError(f"version must be 'v1' or 'v2', got {version!r}")
+
+    if version == "v2" and strict:
+        v2_http = HTTPClient(config=settings.api.v2)
+        return APIBackend(
+            datasets=DatasetsV2(v2_http),
+            tasks=TasksV2(v2_http),
+            studies=StudiesV2(v2_http),
+        )
+
+    # NOTE(review): non-strict "v2" still resolves to plain V1; no fallback
+    # wiring is done here yet.
+    v1_http = HTTPClient(config=settings.api.v1)
+    return APIBackend(
+        datasets=DatasetsV1(v1_http),
+        tasks=TasksV1(v1_http),
+        studies=StudiesV1(v1_http),
+    )
+
+
+class APIContext:
+    """Holds the currently selected backend; starts on V1."""
+
+    def __init__(self) -> None:
+        self._backend = build_backend("v1", strict=False)
+
+    def set_version(self, version: str, *, strict: bool = False) -> None:
+        self._backend = build_backend(version=version, strict=strict)
+
+    @property
+    def backend(self) -> APIBackend:
+        return self._backend
diff --git a/openml/_api/runtime/fallback.py b/openml/_api/runtime/fallback.py
new file mode 100644
index 000000000..1bc99d270
--- /dev/null
+++ b/openml/_api/runtime/fallback.py
@@ -0,0 +1,14 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from openml._api.resources.base import ResourceAPI
+
+
+class FallbackProxy:
+    """Holds a primary and a fallback resource; no delegation is implemented yet."""
+
+    def __init__(self, primary: ResourceAPI, fallback: ResourceAPI) -> None:
+        self._primary = primary
+        self._fallback = fallback
diff --git a/openml/study/functions.py b/openml/study/functions.py index 4e16879d7..b4bce8257 100644 --- a/openml/study/functions.py +++ b/openml/study/functions.py @@ -4,7 +4,7 @@ import warnings from functools import partial -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING import pandas as pd import xmltodict @@ -12,6 +12,7 @@ import 
openml._api_calls import openml.config import openml.utils +from openml._api import api_context from openml.study.study import OpenMLBenchmarkSuite, OpenMLStudy if TYPE_CHECKING: @@ -483,7 +484,7 @@ def list_studies( offset: int | None = None, size: int | None = None, status: str | None = None, - uploader: list[str] | None = None, + uploader: list[int] | None = None, benchmark_suite: int | None = None, ) -> pd.DataFrame: """ @@ -531,7 +532,15 @@ def list_studies( return pd.concat(batches) -def _list_studies(limit: int, offset: int, **kwargs: Any) -> pd.DataFrame: +def _list_studies( + limit: int, + offset: int, + *, + status: str | None = None, + main_entity_type: str | None = None, + uploader: list[int] | None = None, + benchmark_suite: int | None = None, +) -> pd.DataFrame: """Perform api call to return a list of studies. Parameters @@ -548,33 +557,59 @@ def _list_studies(limit: int, offset: int, **kwargs: Any) -> pd.DataFrame: ------- studies : dataframe """ - api_call = "study/list" - if limit is not None: - api_call += f"/limit/{limit}" - if offset is not None: - api_call += f"/offset/{offset}" - if kwargs is not None: - for operator, value in kwargs.items(): - if value is not None: - api_call += f"/{operator}/{value}" - return __list_studies(api_call=api_call) - - -def __list_studies(api_call: str) -> pd.DataFrame: + return __list_studies( + limit=limit, + offset=offset, + status=status, + main_entity_type=main_entity_type, + uploader=uploader, + benchmark_suite=benchmark_suite, + ) + + +def __list_studies( + limit: int | None, + offset: int | None, + *, + status: str | None = None, + main_entity_type: str | None = None, + uploader: list[int] | None = None, + benchmark_suite: int | None = None, +) -> pd.DataFrame: """Retrieves the list of OpenML studies and returns it in a dictionary or a Pandas DataFrame. + This function constructs the API call from parameters, making it + ready for both V1 (URL-based) and future V2 (JSON-based) APIs. 
+ Parameters ---------- - api_call : str - The API call for retrieving the list of OpenML studies. + limit : int, optional + The maximum number of studies to return. + offset : int, optional + The number of studies to skip, starting from the first. + status : str, optional + Filter by status (active, in_preparation, deactivated, all) + main_entity_type : str, optional + Filter by main entity type (run, task) + uploader : list[int], optional + Filter by uploader IDs + benchmark_suite : int, optional + Filter by benchmark suite ID Returns ------- pd.DataFrame A Pandas DataFrame of OpenML studies """ - xml_string = openml._api_calls._perform_api_call(api_call, "get") + xml_string = api_context.backend.studies.list( + limit=limit, + offset=offset, + status=status, + main_entity_type=main_entity_type, + uploader=uploader, + benchmark_suite=benchmark_suite, + ) study_dict = xmltodict.parse(xml_string, force_list=("oml:study",)) # Minimalistic check if the XML is useful diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index e9b879ae4..a794ad56d 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -12,6 +12,7 @@ import openml._api_calls import openml.utils +from openml._api import api_context from openml.datasets import get_dataset from openml.exceptions import OpenMLCacheException @@ -444,11 +445,16 @@ def _get_task_description(task_id: int) -> OpenMLTask: except OpenMLCacheException: _cache_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id) xml_file = _cache_dir / "task.xml" - task_xml = openml._api_calls._perform_api_call("task/%d" % task_id, "get") + result = api_context.backend.tasks.get(task_id, return_response=True) - with xml_file.open("w", encoding="utf8") as fh: - fh.write(task_xml) - return _create_task_from_xml(task_xml) + if isinstance(result, tuple): + task, response = result + with xml_file.open("w", encoding="utf8") as fh: + fh.write(response.text) + else: + task = result + + return task 
def _create_task_from_xml(xml: str) -> OpenMLTask: